geoai-py 0.24.0__py2.py3-none-any.whl → 0.26.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geoai/__init__.py +16 -1
- geoai/auto.py +0 -1
- geoai/change_detection.py +1 -1
- geoai/moondream.py +748 -1
- geoai/prithvi.py +1253 -0
- geoai/utils.py +1877 -327
- {geoai_py-0.24.0.dist-info → geoai_py-0.26.0.dist-info}/METADATA +3 -2
- {geoai_py-0.24.0.dist-info → geoai_py-0.26.0.dist-info}/RECORD +12 -11
- {geoai_py-0.24.0.dist-info → geoai_py-0.26.0.dist-info}/WHEEL +1 -1
- {geoai_py-0.24.0.dist-info → geoai_py-0.26.0.dist-info}/entry_points.txt +0 -0
- {geoai_py-0.24.0.dist-info → geoai_py-0.26.0.dist-info}/licenses/LICENSE +0 -0
- {geoai_py-0.24.0.dist-info → geoai_py-0.26.0.dist-info}/top_level.txt +0 -0
geoai/moondream.py
CHANGED
|
@@ -18,11 +18,11 @@ import rasterio
|
|
|
18
18
|
import torch
|
|
19
19
|
from PIL import Image
|
|
20
20
|
from shapely.geometry import Point, box
|
|
21
|
+
from tqdm import tqdm
|
|
21
22
|
from transformers.utils import logging as hf_logging
|
|
22
23
|
|
|
23
24
|
from .utils import get_device
|
|
24
25
|
|
|
25
|
-
|
|
26
26
|
hf_logging.set_verbosity_error() # silence HF load reports
|
|
27
27
|
|
|
28
28
|
|
|
@@ -867,6 +867,558 @@ class MoondreamGeo:
|
|
|
867
867
|
else:
|
|
868
868
|
return None
|
|
869
869
|
|
|
870
|
+
def _create_sliding_windows(
|
|
871
|
+
self,
|
|
872
|
+
image_width: int,
|
|
873
|
+
image_height: int,
|
|
874
|
+
window_size: int = 512,
|
|
875
|
+
overlap: int = 64,
|
|
876
|
+
) -> List[Tuple[int, int, int, int]]:
|
|
877
|
+
"""Create sliding window coordinates for tiled processing.
|
|
878
|
+
|
|
879
|
+
Args:
|
|
880
|
+
image_width: Width of the full image.
|
|
881
|
+
image_height: Height of the full image.
|
|
882
|
+
window_size: Size of each window/tile.
|
|
883
|
+
overlap: Overlap between adjacent windows.
|
|
884
|
+
|
|
885
|
+
Returns:
|
|
886
|
+
List of tuples (x_start, y_start, x_end, y_end) for each window.
|
|
887
|
+
"""
|
|
888
|
+
windows = []
|
|
889
|
+
stride = window_size - overlap
|
|
890
|
+
|
|
891
|
+
for y in range(0, image_height, stride):
|
|
892
|
+
for x in range(0, image_width, stride):
|
|
893
|
+
x_start = x
|
|
894
|
+
y_start = y
|
|
895
|
+
x_end = min(x + window_size, image_width)
|
|
896
|
+
y_end = min(y + window_size, image_height)
|
|
897
|
+
|
|
898
|
+
# Only add windows that have sufficient size
|
|
899
|
+
if (x_end - x_start) >= window_size // 2 and (
|
|
900
|
+
y_end - y_start
|
|
901
|
+
) >= window_size // 2:
|
|
902
|
+
windows.append((x_start, y_start, x_end, y_end))
|
|
903
|
+
|
|
904
|
+
return windows
|
|
905
|
+
|
|
906
|
+
def _apply_nms(
|
|
907
|
+
self,
|
|
908
|
+
detections: List[Dict[str, Any]],
|
|
909
|
+
iou_threshold: float = 0.5,
|
|
910
|
+
) -> List[Dict[str, Any]]:
|
|
911
|
+
"""Apply Non-Maximum Suppression to remove overlapping detections.
|
|
912
|
+
|
|
913
|
+
Args:
|
|
914
|
+
detections: List of detection dictionaries with bounding boxes.
|
|
915
|
+
iou_threshold: IoU threshold for considering boxes as overlapping.
|
|
916
|
+
|
|
917
|
+
Returns:
|
|
918
|
+
Filtered list of detections after NMS.
|
|
919
|
+
"""
|
|
920
|
+
if not detections:
|
|
921
|
+
return []
|
|
922
|
+
|
|
923
|
+
# Sort by confidence/score if available
|
|
924
|
+
if "score" in detections[0]:
|
|
925
|
+
detections = sorted(
|
|
926
|
+
detections, key=lambda x: x.get("score", 1.0), reverse=True
|
|
927
|
+
)
|
|
928
|
+
|
|
929
|
+
# Convert to arrays for efficient computation
|
|
930
|
+
boxes = np.array(
|
|
931
|
+
[[d["x_min"], d["y_min"], d["x_max"], d["y_max"]] for d in detections]
|
|
932
|
+
)
|
|
933
|
+
|
|
934
|
+
# Calculate areas
|
|
935
|
+
x1 = boxes[:, 0]
|
|
936
|
+
y1 = boxes[:, 1]
|
|
937
|
+
x2 = boxes[:, 2]
|
|
938
|
+
y2 = boxes[:, 3]
|
|
939
|
+
areas = (x2 - x1) * (y2 - y1)
|
|
940
|
+
|
|
941
|
+
# Sort by y2 coordinate (bottom of box)
|
|
942
|
+
order = y2.argsort()
|
|
943
|
+
|
|
944
|
+
keep = []
|
|
945
|
+
while order.size > 0:
|
|
946
|
+
i = order[-1]
|
|
947
|
+
keep.append(i)
|
|
948
|
+
|
|
949
|
+
# Calculate IoU with remaining boxes
|
|
950
|
+
xx1 = np.maximum(x1[i], x1[order[:-1]])
|
|
951
|
+
yy1 = np.maximum(y1[i], y1[order[:-1]])
|
|
952
|
+
xx2 = np.minimum(x2[i], x2[order[:-1]])
|
|
953
|
+
yy2 = np.minimum(y2[i], y2[order[:-1]])
|
|
954
|
+
|
|
955
|
+
w = np.maximum(0, xx2 - xx1)
|
|
956
|
+
h = np.maximum(0, yy2 - yy1)
|
|
957
|
+
intersection = w * h
|
|
958
|
+
|
|
959
|
+
iou = intersection / (areas[i] + areas[order[:-1]] - intersection)
|
|
960
|
+
|
|
961
|
+
# Keep only boxes with IoU less than threshold
|
|
962
|
+
inds = np.where(iou <= iou_threshold)[0]
|
|
963
|
+
order = order[inds]
|
|
964
|
+
|
|
965
|
+
return [detections[i] for i in keep]
|
|
966
|
+
|
|
967
|
+
def detect_sliding_window(
|
|
968
|
+
self,
|
|
969
|
+
source: Union[str, Image.Image, np.ndarray],
|
|
970
|
+
object_type: str,
|
|
971
|
+
window_size: int = 512,
|
|
972
|
+
overlap: int = 64,
|
|
973
|
+
iou_threshold: float = 0.5,
|
|
974
|
+
bands: Optional[List[int]] = None,
|
|
975
|
+
output_path: Optional[str] = None,
|
|
976
|
+
settings: Optional[Dict] = None,
|
|
977
|
+
show_progress: bool = True,
|
|
978
|
+
**kwargs: Any,
|
|
979
|
+
) -> Dict[str, Any]:
|
|
980
|
+
"""Detect objects using sliding window for large images.
|
|
981
|
+
|
|
982
|
+
This method processes large images by dividing them into overlapping
|
|
983
|
+
windows/tiles, running detection on each tile, and merging results
|
|
984
|
+
using Non-Maximum Suppression (NMS) to handle overlapping detections.
|
|
985
|
+
|
|
986
|
+
Args:
|
|
987
|
+
source: Image source or pre-encoded image.
|
|
988
|
+
object_type: Type of object to detect (e.g., "car", "building").
|
|
989
|
+
window_size: Size of each processing window/tile. Default 512.
|
|
990
|
+
overlap: Overlap between adjacent windows. Default 64.
|
|
991
|
+
iou_threshold: IoU threshold for NMS to merge overlapping detections.
|
|
992
|
+
bands: Band indices for GeoTIFF.
|
|
993
|
+
output_path: Path to save results as GeoJSON/Shapefile/GeoPackage.
|
|
994
|
+
settings: Additional settings for the model.
|
|
995
|
+
show_progress: Whether to show progress bar.
|
|
996
|
+
**kwargs: Additional arguments for the model.
|
|
997
|
+
|
|
998
|
+
Returns:
|
|
999
|
+
Dictionary with "objects" key containing list of bounding boxes
|
|
1000
|
+
with normalized coordinates. If georeferenced, also includes
|
|
1001
|
+
"gdf" (GeoDataFrame).
|
|
1002
|
+
"""
|
|
1003
|
+
# Load image
|
|
1004
|
+
if isinstance(source, (str, Image.Image, np.ndarray)):
|
|
1005
|
+
image, metadata = self.load_image(source, bands)
|
|
1006
|
+
else:
|
|
1007
|
+
image = source
|
|
1008
|
+
metadata = self._metadata
|
|
1009
|
+
|
|
1010
|
+
width, height = image.size
|
|
1011
|
+
|
|
1012
|
+
# If image is smaller than window size, use regular detection
|
|
1013
|
+
if width <= window_size and height <= window_size:
|
|
1014
|
+
return self.detect(
|
|
1015
|
+
image,
|
|
1016
|
+
object_type,
|
|
1017
|
+
bands=bands,
|
|
1018
|
+
output_path=output_path,
|
|
1019
|
+
settings=settings,
|
|
1020
|
+
**kwargs,
|
|
1021
|
+
)
|
|
1022
|
+
|
|
1023
|
+
# Create sliding windows
|
|
1024
|
+
windows = self._create_sliding_windows(width, height, window_size, overlap)
|
|
1025
|
+
|
|
1026
|
+
all_detections = []
|
|
1027
|
+
|
|
1028
|
+
# Progress bar setup
|
|
1029
|
+
iterator = (
|
|
1030
|
+
tqdm(windows, desc=f"Detecting {object_type}") if show_progress else windows
|
|
1031
|
+
)
|
|
1032
|
+
|
|
1033
|
+
# Process each window
|
|
1034
|
+
for x_start, y_start, x_end, y_end in iterator:
|
|
1035
|
+
# Crop window from image
|
|
1036
|
+
window_img = image.crop((x_start, y_start, x_end, y_end))
|
|
1037
|
+
|
|
1038
|
+
# Detect in window
|
|
1039
|
+
call_kwargs = {}
|
|
1040
|
+
if settings:
|
|
1041
|
+
call_kwargs["settings"] = settings
|
|
1042
|
+
call_kwargs.update(kwargs)
|
|
1043
|
+
|
|
1044
|
+
try:
|
|
1045
|
+
result = self.model.detect(window_img, object_type, **call_kwargs)
|
|
1046
|
+
|
|
1047
|
+
# Adjust coordinates to full image space
|
|
1048
|
+
window_width = x_end - x_start
|
|
1049
|
+
window_height = y_end - y_start
|
|
1050
|
+
|
|
1051
|
+
for obj in result.get("objects", []):
|
|
1052
|
+
# Convert from window-relative normalized coords to full image normalized coords
|
|
1053
|
+
full_x_min = (x_start + obj["x_min"] * window_width) / width
|
|
1054
|
+
full_y_min = (y_start + obj["y_min"] * window_height) / height
|
|
1055
|
+
full_x_max = (x_start + obj["x_max"] * window_width) / width
|
|
1056
|
+
full_y_max = (y_start + obj["y_max"] * window_height) / height
|
|
1057
|
+
|
|
1058
|
+
detection = {
|
|
1059
|
+
"x_min": full_x_min,
|
|
1060
|
+
"y_min": full_y_min,
|
|
1061
|
+
"x_max": full_x_max,
|
|
1062
|
+
"y_max": full_y_max,
|
|
1063
|
+
}
|
|
1064
|
+
|
|
1065
|
+
# Preserve additional fields if present
|
|
1066
|
+
for key in obj:
|
|
1067
|
+
if key not in ["x_min", "y_min", "x_max", "y_max"]:
|
|
1068
|
+
detection[key] = obj[key]
|
|
1069
|
+
|
|
1070
|
+
all_detections.append(detection)
|
|
1071
|
+
|
|
1072
|
+
except Exception as e:
|
|
1073
|
+
if show_progress:
|
|
1074
|
+
print(
|
|
1075
|
+
f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
|
|
1076
|
+
)
|
|
1077
|
+
|
|
1078
|
+
# Apply NMS to merge overlapping detections
|
|
1079
|
+
merged_detections = self._apply_nms(all_detections, iou_threshold)
|
|
1080
|
+
|
|
1081
|
+
result = {"objects": merged_detections}
|
|
1082
|
+
|
|
1083
|
+
# Convert to georeferenced if possible
|
|
1084
|
+
if metadata and metadata.get("crs") and metadata.get("transform"):
|
|
1085
|
+
result = self._georef_detections(result, metadata)
|
|
1086
|
+
|
|
1087
|
+
if output_path:
|
|
1088
|
+
self._save_vector(result["gdf"], output_path)
|
|
1089
|
+
|
|
1090
|
+
return result
|
|
1091
|
+
|
|
1092
|
+
def point_sliding_window(
|
|
1093
|
+
self,
|
|
1094
|
+
source: Union[str, Image.Image, np.ndarray],
|
|
1095
|
+
object_description: str,
|
|
1096
|
+
window_size: int = 512,
|
|
1097
|
+
overlap: int = 64,
|
|
1098
|
+
bands: Optional[List[int]] = None,
|
|
1099
|
+
output_path: Optional[str] = None,
|
|
1100
|
+
show_progress: bool = True,
|
|
1101
|
+
**kwargs: Any,
|
|
1102
|
+
) -> Dict[str, Any]:
|
|
1103
|
+
"""Find points using sliding window for large images.
|
|
1104
|
+
|
|
1105
|
+
This method processes large images by dividing them into overlapping
|
|
1106
|
+
windows/tiles and finding points in each tile.
|
|
1107
|
+
|
|
1108
|
+
Args:
|
|
1109
|
+
source: Image source or pre-encoded image.
|
|
1110
|
+
object_description: Description of objects to find.
|
|
1111
|
+
window_size: Size of each processing window/tile. Default 512.
|
|
1112
|
+
overlap: Overlap between adjacent windows. Default 64.
|
|
1113
|
+
bands: Band indices for GeoTIFF.
|
|
1114
|
+
output_path: Path to save results as GeoJSON/Shapefile/GeoPackage.
|
|
1115
|
+
show_progress: Whether to show progress bar.
|
|
1116
|
+
**kwargs: Additional arguments for the model.
|
|
1117
|
+
|
|
1118
|
+
Returns:
|
|
1119
|
+
Dictionary with "points" key containing list of points
|
|
1120
|
+
with normalized coordinates. If georeferenced, also includes
|
|
1121
|
+
"gdf" (GeoDataFrame).
|
|
1122
|
+
"""
|
|
1123
|
+
# Load image
|
|
1124
|
+
if isinstance(source, (str, Image.Image, np.ndarray)):
|
|
1125
|
+
image, metadata = self.load_image(source, bands)
|
|
1126
|
+
else:
|
|
1127
|
+
image = source
|
|
1128
|
+
metadata = self._metadata
|
|
1129
|
+
|
|
1130
|
+
width, height = image.size
|
|
1131
|
+
|
|
1132
|
+
# If image is smaller than window size, use regular point detection
|
|
1133
|
+
if width <= window_size and height <= window_size:
|
|
1134
|
+
return self.point(
|
|
1135
|
+
image,
|
|
1136
|
+
object_description,
|
|
1137
|
+
bands=bands,
|
|
1138
|
+
output_path=output_path,
|
|
1139
|
+
**kwargs,
|
|
1140
|
+
)
|
|
1141
|
+
|
|
1142
|
+
# Create sliding windows
|
|
1143
|
+
windows = self._create_sliding_windows(width, height, window_size, overlap)
|
|
1144
|
+
|
|
1145
|
+
all_points = []
|
|
1146
|
+
|
|
1147
|
+
# Progress bar setup
|
|
1148
|
+
iterator = (
|
|
1149
|
+
tqdm(windows, desc=f"Finding {object_description}")
|
|
1150
|
+
if show_progress
|
|
1151
|
+
else windows
|
|
1152
|
+
)
|
|
1153
|
+
|
|
1154
|
+
# Process each window
|
|
1155
|
+
for x_start, y_start, x_end, y_end in iterator:
|
|
1156
|
+
# Crop window from image
|
|
1157
|
+
window_img = image.crop((x_start, y_start, x_end, y_end))
|
|
1158
|
+
|
|
1159
|
+
# Find points in window
|
|
1160
|
+
try:
|
|
1161
|
+
result = self.model.point(window_img, object_description, **kwargs)
|
|
1162
|
+
|
|
1163
|
+
# Adjust coordinates to full image space
|
|
1164
|
+
window_width = x_end - x_start
|
|
1165
|
+
window_height = y_end - y_start
|
|
1166
|
+
|
|
1167
|
+
for pt in result.get("points", []):
|
|
1168
|
+
# Convert from window-relative normalized coords to full image normalized coords
|
|
1169
|
+
full_x = (x_start + pt["x"] * window_width) / width
|
|
1170
|
+
full_y = (y_start + pt["y"] * window_height) / height
|
|
1171
|
+
|
|
1172
|
+
point = {"x": full_x, "y": full_y}
|
|
1173
|
+
|
|
1174
|
+
# Preserve additional fields if present
|
|
1175
|
+
for key in pt:
|
|
1176
|
+
if key not in ["x", "y"]:
|
|
1177
|
+
point[key] = pt[key]
|
|
1178
|
+
|
|
1179
|
+
all_points.append(point)
|
|
1180
|
+
|
|
1181
|
+
except Exception as e:
|
|
1182
|
+
if show_progress:
|
|
1183
|
+
print(
|
|
1184
|
+
f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
|
|
1185
|
+
)
|
|
1186
|
+
|
|
1187
|
+
result = {"points": all_points}
|
|
1188
|
+
|
|
1189
|
+
# Convert to georeferenced if possible
|
|
1190
|
+
if metadata and metadata.get("crs") and metadata.get("transform"):
|
|
1191
|
+
result = self._georef_points(result, metadata)
|
|
1192
|
+
|
|
1193
|
+
if output_path:
|
|
1194
|
+
self._save_vector(result["gdf"], output_path)
|
|
1195
|
+
|
|
1196
|
+
return result
|
|
1197
|
+
|
|
1198
|
+
def query_sliding_window(
|
|
1199
|
+
self,
|
|
1200
|
+
question: str,
|
|
1201
|
+
source: Union[str, Image.Image, np.ndarray],
|
|
1202
|
+
window_size: int = 512,
|
|
1203
|
+
overlap: int = 64,
|
|
1204
|
+
reasoning: Optional[bool] = None,
|
|
1205
|
+
bands: Optional[List[int]] = None,
|
|
1206
|
+
settings: Optional[Dict] = None,
|
|
1207
|
+
show_progress: bool = True,
|
|
1208
|
+
combine_strategy: str = "concatenate",
|
|
1209
|
+
**kwargs: Any,
|
|
1210
|
+
) -> Dict[str, Any]:
|
|
1211
|
+
"""Query image using sliding window for large images.
|
|
1212
|
+
|
|
1213
|
+
This method processes large images by dividing them into overlapping
|
|
1214
|
+
windows/tiles, querying each tile, and combining the responses.
|
|
1215
|
+
|
|
1216
|
+
Args:
|
|
1217
|
+
question: The question to ask about each window.
|
|
1218
|
+
source: Image source or pre-encoded image.
|
|
1219
|
+
window_size: Size of each processing window/tile. Default 512.
|
|
1220
|
+
overlap: Overlap between adjacent windows. Default 64.
|
|
1221
|
+
reasoning: Enable reasoning mode (moondream3 only).
|
|
1222
|
+
bands: Band indices for GeoTIFF.
|
|
1223
|
+
settings: Additional settings for the model.
|
|
1224
|
+
show_progress: Whether to show progress bar.
|
|
1225
|
+
combine_strategy: How to combine answers from different windows.
|
|
1226
|
+
Options: "concatenate", "summarize". Default "concatenate".
|
|
1227
|
+
**kwargs: Additional arguments for the model.
|
|
1228
|
+
|
|
1229
|
+
Returns:
|
|
1230
|
+
Dictionary with "answer" key containing the combined response,
|
|
1231
|
+
and "tile_answers" with individual tile responses.
|
|
1232
|
+
"""
|
|
1233
|
+
# Load image
|
|
1234
|
+
if isinstance(source, (str, Image.Image, np.ndarray)):
|
|
1235
|
+
image, _ = self.load_image(source, bands)
|
|
1236
|
+
else:
|
|
1237
|
+
image = source
|
|
1238
|
+
|
|
1239
|
+
width, height = image.size
|
|
1240
|
+
|
|
1241
|
+
# If image is smaller than window size, use regular query
|
|
1242
|
+
if width <= window_size and height <= window_size:
|
|
1243
|
+
return self.query(
|
|
1244
|
+
question,
|
|
1245
|
+
image,
|
|
1246
|
+
reasoning=reasoning,
|
|
1247
|
+
bands=bands,
|
|
1248
|
+
settings=settings,
|
|
1249
|
+
**kwargs,
|
|
1250
|
+
)
|
|
1251
|
+
|
|
1252
|
+
# Create sliding windows
|
|
1253
|
+
windows = self._create_sliding_windows(width, height, window_size, overlap)
|
|
1254
|
+
|
|
1255
|
+
tile_answers = []
|
|
1256
|
+
|
|
1257
|
+
# Progress bar setup
|
|
1258
|
+
iterator = tqdm(windows, desc="Querying tiles") if show_progress else windows
|
|
1259
|
+
|
|
1260
|
+
# Process each window
|
|
1261
|
+
for idx, (x_start, y_start, x_end, y_end) in enumerate(iterator):
|
|
1262
|
+
# Crop window from image
|
|
1263
|
+
window_img = image.crop((x_start, y_start, x_end, y_end))
|
|
1264
|
+
|
|
1265
|
+
# Query window
|
|
1266
|
+
call_kwargs = {"question": question, "image": window_img}
|
|
1267
|
+
if reasoning is not None and self.model_version == "moondream3":
|
|
1268
|
+
call_kwargs["reasoning"] = reasoning
|
|
1269
|
+
if settings:
|
|
1270
|
+
call_kwargs["settings"] = settings
|
|
1271
|
+
call_kwargs.update(kwargs)
|
|
1272
|
+
|
|
1273
|
+
try:
|
|
1274
|
+
result = self.model.query(**call_kwargs)
|
|
1275
|
+
tile_answers.append(
|
|
1276
|
+
{
|
|
1277
|
+
"tile_id": idx,
|
|
1278
|
+
"bounds": (x_start, y_start, x_end, y_end),
|
|
1279
|
+
"answer": result.get("answer", ""),
|
|
1280
|
+
}
|
|
1281
|
+
)
|
|
1282
|
+
except Exception as e:
|
|
1283
|
+
if show_progress:
|
|
1284
|
+
print(
|
|
1285
|
+
f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
|
|
1286
|
+
)
|
|
1287
|
+
|
|
1288
|
+
# Combine answers
|
|
1289
|
+
if combine_strategy == "concatenate":
|
|
1290
|
+
combined_answer = "\n\n".join(
|
|
1291
|
+
[
|
|
1292
|
+
f"Tile {ta['tile_id']} (region {ta['bounds']}): {ta['answer']}"
|
|
1293
|
+
for ta in tile_answers
|
|
1294
|
+
]
|
|
1295
|
+
)
|
|
1296
|
+
elif combine_strategy == "summarize":
|
|
1297
|
+
# Use the model to summarize the tile answers
|
|
1298
|
+
summary_prompt = (
|
|
1299
|
+
f"Based on these regional observations about '{question}', "
|
|
1300
|
+
f"provide a comprehensive summary:\n\n"
|
|
1301
|
+
)
|
|
1302
|
+
for ta in tile_answers:
|
|
1303
|
+
summary_prompt += f"Region {ta['tile_id']}: {ta['answer']}\n"
|
|
1304
|
+
|
|
1305
|
+
try:
|
|
1306
|
+
summary_result = self.model.query(question=summary_prompt)
|
|
1307
|
+
combined_answer = summary_result.get("answer", "")
|
|
1308
|
+
except:
|
|
1309
|
+
# Fall back to concatenation if summarization fails
|
|
1310
|
+
combined_answer = " ".join([ta["answer"] for ta in tile_answers])
|
|
1311
|
+
else:
|
|
1312
|
+
combined_answer = " ".join([ta["answer"] for ta in tile_answers])
|
|
1313
|
+
|
|
1314
|
+
return {"answer": combined_answer, "tile_answers": tile_answers}
|
|
1315
|
+
|
|
1316
|
+
def caption_sliding_window(
|
|
1317
|
+
self,
|
|
1318
|
+
source: Union[str, Image.Image, np.ndarray],
|
|
1319
|
+
window_size: int = 512,
|
|
1320
|
+
overlap: int = 64,
|
|
1321
|
+
length: str = "normal",
|
|
1322
|
+
bands: Optional[List[int]] = None,
|
|
1323
|
+
settings: Optional[Dict] = None,
|
|
1324
|
+
show_progress: bool = True,
|
|
1325
|
+
combine_strategy: str = "concatenate",
|
|
1326
|
+
**kwargs: Any,
|
|
1327
|
+
) -> Dict[str, Any]:
|
|
1328
|
+
"""Generate caption using sliding window for large images.
|
|
1329
|
+
|
|
1330
|
+
This method processes large images by dividing them into overlapping
|
|
1331
|
+
windows/tiles, generating captions for each tile, and combining them.
|
|
1332
|
+
|
|
1333
|
+
Args:
|
|
1334
|
+
source: Image source or pre-encoded image.
|
|
1335
|
+
window_size: Size of each processing window/tile. Default 512.
|
|
1336
|
+
overlap: Overlap between adjacent windows. Default 64.
|
|
1337
|
+
length: Caption length - "short", "normal", or "long".
|
|
1338
|
+
bands: Band indices for GeoTIFF.
|
|
1339
|
+
settings: Additional settings for the model.
|
|
1340
|
+
show_progress: Whether to show progress bar.
|
|
1341
|
+
combine_strategy: How to combine captions from different windows.
|
|
1342
|
+
Options: "concatenate", "summarize". Default "concatenate".
|
|
1343
|
+
**kwargs: Additional arguments for the model.
|
|
1344
|
+
|
|
1345
|
+
Returns:
|
|
1346
|
+
Dictionary with "caption" key containing the combined caption,
|
|
1347
|
+
and "tile_captions" with individual tile captions.
|
|
1348
|
+
"""
|
|
1349
|
+
# Load image
|
|
1350
|
+
if isinstance(source, (str, Image.Image, np.ndarray)):
|
|
1351
|
+
image, _ = self.load_image(source, bands)
|
|
1352
|
+
else:
|
|
1353
|
+
image = source
|
|
1354
|
+
|
|
1355
|
+
width, height = image.size
|
|
1356
|
+
|
|
1357
|
+
# If image is smaller than window size, use regular caption
|
|
1358
|
+
if width <= window_size and height <= window_size:
|
|
1359
|
+
return self.caption(
|
|
1360
|
+
image, length=length, bands=bands, settings=settings, **kwargs
|
|
1361
|
+
)
|
|
1362
|
+
|
|
1363
|
+
# Create sliding windows
|
|
1364
|
+
windows = self._create_sliding_windows(width, height, window_size, overlap)
|
|
1365
|
+
|
|
1366
|
+
tile_captions = []
|
|
1367
|
+
|
|
1368
|
+
# Progress bar setup
|
|
1369
|
+
iterator = (
|
|
1370
|
+
tqdm(windows, desc="Generating captions") if show_progress else windows
|
|
1371
|
+
)
|
|
1372
|
+
|
|
1373
|
+
# Process each window
|
|
1374
|
+
for idx, (x_start, y_start, x_end, y_end) in enumerate(iterator):
|
|
1375
|
+
# Crop window from image
|
|
1376
|
+
window_img = image.crop((x_start, y_start, x_end, y_end))
|
|
1377
|
+
|
|
1378
|
+
# Caption window
|
|
1379
|
+
call_kwargs = {"length": length}
|
|
1380
|
+
if settings:
|
|
1381
|
+
call_kwargs["settings"] = settings
|
|
1382
|
+
call_kwargs.update(kwargs)
|
|
1383
|
+
|
|
1384
|
+
try:
|
|
1385
|
+
result = self.model.caption(window_img, **call_kwargs)
|
|
1386
|
+
tile_captions.append(
|
|
1387
|
+
{
|
|
1388
|
+
"tile_id": idx,
|
|
1389
|
+
"bounds": (x_start, y_start, x_end, y_end),
|
|
1390
|
+
"caption": result.get("caption", ""),
|
|
1391
|
+
}
|
|
1392
|
+
)
|
|
1393
|
+
except Exception as e:
|
|
1394
|
+
if show_progress:
|
|
1395
|
+
print(
|
|
1396
|
+
f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
|
|
1397
|
+
)
|
|
1398
|
+
|
|
1399
|
+
# Combine captions
|
|
1400
|
+
if combine_strategy == "concatenate":
|
|
1401
|
+
combined_caption = " ".join([tc["caption"] for tc in tile_captions])
|
|
1402
|
+
elif combine_strategy == "summarize":
|
|
1403
|
+
# Use the model to create a cohesive summary caption
|
|
1404
|
+
summary_prompt = (
|
|
1405
|
+
"Based on these descriptions of different regions of an image, "
|
|
1406
|
+
"create a single comprehensive caption for the entire image:\n\n"
|
|
1407
|
+
)
|
|
1408
|
+
for tc in tile_captions:
|
|
1409
|
+
summary_prompt += f"Region {tc['tile_id']}: {tc['caption']}\n"
|
|
1410
|
+
|
|
1411
|
+
try:
|
|
1412
|
+
summary_result = self.model.query(question=summary_prompt)
|
|
1413
|
+
combined_caption = summary_result.get("answer", "")
|
|
1414
|
+
except:
|
|
1415
|
+
# Fall back to concatenation if summarization fails
|
|
1416
|
+
combined_caption = " ".join([tc["caption"] for tc in tile_captions])
|
|
1417
|
+
else:
|
|
1418
|
+
combined_caption = " ".join([tc["caption"] for tc in tile_captions])
|
|
1419
|
+
|
|
1420
|
+
return {"caption": combined_caption, "tile_captions": tile_captions}
|
|
1421
|
+
|
|
870
1422
|
|
|
871
1423
|
def moondream_caption(
|
|
872
1424
|
source: Union[str, Image.Image, np.ndarray],
|
|
@@ -988,3 +1540,198 @@ def moondream_point(
|
|
|
988
1540
|
return processor.point(
|
|
989
1541
|
source, object_description, output_path=output_path, bands=bands, **kwargs
|
|
990
1542
|
)
|
|
1543
|
+
|
|
1544
|
+
|
|
1545
|
+
def moondream_detect_sliding_window(
    source: Union[str, Image.Image, np.ndarray],
    object_type: str,
    window_size: int = 512,
    overlap: int = 64,
    iou_threshold: float = 0.5,
    model_name: str = "vikhyatk/moondream2",
    revision: Optional[str] = None,
    output_path: Optional[str] = None,
    bands: Optional[List[int]] = None,
    device: Optional[str] = None,
    show_progress: bool = True,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Detect objects in a large image with a tiled (sliding-window) pass.

    Thin convenience wrapper: builds a ``MoondreamGeo`` processor and
    delegates to its ``detect_sliding_window`` method, which tiles the image
    into overlapping windows, detects ``object_type`` per tile, and merges
    overlapping boxes via non-maximum suppression.

    Args:
        source: Image source.
        object_type: Type of object to detect.
        window_size: Size of each processing window. Default 512.
        overlap: Overlap between windows. Default 64.
        iou_threshold: IoU threshold for NMS. Default 0.5.
        model_name: Moondream model name.
        revision: Model revision.
        output_path: Path to save results as vector file.
        bands: Band indices for GeoTIFF.
        device: Device for inference.
        show_progress: Whether to show progress bar.
        **kwargs: Additional arguments.

    Returns:
        Detection results dictionary with "objects" and optionally "gdf".
    """
    return MoondreamGeo(
        model_name=model_name, revision=revision, device=device
    ).detect_sliding_window(
        source,
        object_type,
        window_size=window_size,
        overlap=overlap,
        iou_threshold=iou_threshold,
        output_path=output_path,
        bands=bands,
        show_progress=show_progress,
        **kwargs,
    )
1594
|
+
|
|
1595
|
+
|
|
1596
|
+
def moondream_point_sliding_window(
    source: Union[str, Image.Image, np.ndarray],
    object_description: str,
    window_size: int = 512,
    overlap: int = 64,
    model_name: str = "vikhyatk/moondream2",
    revision: Optional[str] = None,
    output_path: Optional[str] = None,
    bands: Optional[List[int]] = None,
    device: Optional[str] = None,
    show_progress: bool = True,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Locate point features in a large image with a tiled pass.

    Thin convenience wrapper: builds a ``MoondreamGeo`` processor and
    delegates to its ``point_sliding_window`` method, which tiles the image
    into overlapping windows and aggregates the points found in every tile.

    Args:
        source: Image source.
        object_description: Description of objects to find.
        window_size: Size of each processing window. Default 512.
        overlap: Overlap between windows. Default 64.
        model_name: Moondream model name.
        revision: Model revision.
        output_path: Path to save results as vector file.
        bands: Band indices for GeoTIFF.
        device: Device for inference.
        show_progress: Whether to show progress bar.
        **kwargs: Additional arguments.

    Returns:
        Point results dictionary with "points" and optionally "gdf".
    """
    return MoondreamGeo(
        model_name=model_name, revision=revision, device=device
    ).point_sliding_window(
        source,
        object_description,
        window_size=window_size,
        overlap=overlap,
        output_path=output_path,
        bands=bands,
        show_progress=show_progress,
        **kwargs,
    )
|
+
|
|
1642
|
+
|
|
1643
|
+
def moondream_query_sliding_window(
    question: str,
    source: Union[str, Image.Image, np.ndarray],
    window_size: int = 512,
    overlap: int = 64,
    model_name: str = "vikhyatk/moondream2",
    revision: Optional[str] = None,
    reasoning: Optional[bool] = None,
    bands: Optional[List[int]] = None,
    device: Optional[str] = None,
    show_progress: bool = True,
    combine_strategy: str = "concatenate",
    **kwargs: Any,
) -> Dict[str, Any]:
    """Answer a question about a large image with a tiled pass.

    Thin convenience wrapper: builds a ``MoondreamGeo`` processor and
    delegates to its ``query_sliding_window`` method, which tiles the image
    into overlapping windows, asks ``question`` of each tile, and combines
    the per-tile answers according to ``combine_strategy``.

    Args:
        question: Question to ask about the image.
        source: Image source.
        window_size: Size of each processing window. Default 512.
        overlap: Overlap between windows. Default 64.
        model_name: Moondream model name.
        revision: Model revision.
        reasoning: Enable reasoning mode (moondream3 only).
        bands: Band indices for GeoTIFF.
        device: Device for inference.
        show_progress: Whether to show progress bar.
        combine_strategy: How to combine answers ("concatenate" or "summarize").
        **kwargs: Additional arguments.

    Returns:
        Dictionary with "answer" and "tile_answers" keys.
    """
    return MoondreamGeo(
        model_name=model_name, revision=revision, device=device
    ).query_sliding_window(
        question,
        source,
        window_size=window_size,
        overlap=overlap,
        reasoning=reasoning,
        bands=bands,
        show_progress=show_progress,
        combine_strategy=combine_strategy,
        **kwargs,
    )
|
+
|
|
1692
|
+
|
|
1693
|
+
def moondream_caption_sliding_window(
    source: Union[str, Image.Image, np.ndarray],
    window_size: int = 512,
    overlap: int = 64,
    length: str = "normal",
    model_name: str = "vikhyatk/moondream2",
    revision: Optional[str] = None,
    bands: Optional[List[int]] = None,
    device: Optional[str] = None,
    show_progress: bool = True,
    combine_strategy: str = "concatenate",
    **kwargs: Any,
) -> Dict[str, Any]:
    """Caption a large image with a tiled (sliding-window) pass.

    Thin convenience wrapper: builds a ``MoondreamGeo`` processor and
    delegates to its ``caption_sliding_window`` method, which tiles the image
    into overlapping windows, captions each tile, and combines the per-tile
    captions according to ``combine_strategy``.

    Args:
        source: Image source.
        window_size: Size of each processing window. Default 512.
        overlap: Overlap between windows. Default 64.
        length: Caption length ("short", "normal", "long").
        model_name: Moondream model name.
        revision: Model revision.
        bands: Band indices for GeoTIFF.
        device: Device for inference.
        show_progress: Whether to show progress bar.
        combine_strategy: How to combine captions ("concatenate" or "summarize").
        **kwargs: Additional arguments.

    Returns:
        Dictionary with "caption" and "tile_captions" keys.
    """
    return MoondreamGeo(
        model_name=model_name, revision=revision, device=device
    ).caption_sliding_window(
        source,
        window_size=window_size,
        overlap=overlap,
        length=length,
        bands=bands,
        show_progress=show_progress,
        combine_strategy=combine_strategy,
        **kwargs,
    )