geoai-py 0.24.0__py2.py3-none-any.whl → 0.26.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geoai/moondream.py CHANGED
@@ -18,11 +18,11 @@ import rasterio
 import torch
 from PIL import Image
 from shapely.geometry import Point, box
+from tqdm import tqdm
 from transformers.utils import logging as hf_logging

 from .utils import get_device

-
 hf_logging.set_verbosity_error()  # silence HF load reports


@@ -867,6 +867,558 @@ class MoondreamGeo:
         else:
             return None

+    def _create_sliding_windows(
+        self,
+        image_width: int,
+        image_height: int,
+        window_size: int = 512,
+        overlap: int = 64,
+    ) -> List[Tuple[int, int, int, int]]:
+        """Create sliding window coordinates for tiled processing.
+
+        Args:
+            image_width: Width of the full image.
+            image_height: Height of the full image.
+            window_size: Size of each window/tile.
+            overlap: Overlap between adjacent windows.
+
+        Returns:
+            List of tuples (x_start, y_start, x_end, y_end) for each window.
+        """
+        windows = []
+        stride = window_size - overlap
+
+        for y in range(0, image_height, stride):
+            for x in range(0, image_width, stride):
+                x_start = x
+                y_start = y
+                x_end = min(x + window_size, image_width)
+                y_end = min(y + window_size, image_height)
+
+                # Only add windows that have sufficient size
+                if (x_end - x_start) >= window_size // 2 and (
+                    y_end - y_start
+                ) >= window_size // 2:
+                    windows.append((x_start, y_start, x_end, y_end))
+
+        return windows
+
+    def _apply_nms(
+        self,
+        detections: List[Dict[str, Any]],
+        iou_threshold: float = 0.5,
+    ) -> List[Dict[str, Any]]:
+        """Apply Non-Maximum Suppression to remove overlapping detections.
+
+        Args:
+            detections: List of detection dictionaries with bounding boxes.
+            iou_threshold: IoU threshold for considering boxes as overlapping.
+
+        Returns:
+            Filtered list of detections after NMS.
+        """
+        if not detections:
+            return []
+
+        # Sort by confidence/score if available
+        if "score" in detections[0]:
+            detections = sorted(
+                detections, key=lambda x: x.get("score", 1.0), reverse=True
+            )
+
+        # Convert to arrays for efficient computation
+        boxes = np.array(
+            [[d["x_min"], d["y_min"], d["x_max"], d["y_max"]] for d in detections]
+        )
+
+        # Calculate areas
+        x1 = boxes[:, 0]
+        y1 = boxes[:, 1]
+        x2 = boxes[:, 2]
+        y2 = boxes[:, 3]
+        areas = (x2 - x1) * (y2 - y1)
+
+        # Sort by y2 coordinate (bottom of box)
+        order = y2.argsort()
+
+        keep = []
+        while order.size > 0:
+            i = order[-1]
+            keep.append(i)
+
+            # Calculate IoU with remaining boxes
+            xx1 = np.maximum(x1[i], x1[order[:-1]])
+            yy1 = np.maximum(y1[i], y1[order[:-1]])
+            xx2 = np.minimum(x2[i], x2[order[:-1]])
+            yy2 = np.minimum(y2[i], y2[order[:-1]])
+
+            w = np.maximum(0, xx2 - xx1)
+            h = np.maximum(0, yy2 - yy1)
+            intersection = w * h
+
+            iou = intersection / (areas[i] + areas[order[:-1]] - intersection)
+
+            # Keep only boxes with IoU less than threshold
+            inds = np.where(iou <= iou_threshold)[0]
+            order = order[inds]
+
+        return [detections[i] for i in keep]
+
+    def detect_sliding_window(
+        self,
+        source: Union[str, Image.Image, np.ndarray],
+        object_type: str,
+        window_size: int = 512,
+        overlap: int = 64,
+        iou_threshold: float = 0.5,
+        bands: Optional[List[int]] = None,
+        output_path: Optional[str] = None,
+        settings: Optional[Dict] = None,
+        show_progress: bool = True,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """Detect objects using sliding window for large images.
+
+        This method processes large images by dividing them into overlapping
+        windows/tiles, running detection on each tile, and merging results
+        using Non-Maximum Suppression (NMS) to handle overlapping detections.
+
+        Args:
+            source: Image source or pre-encoded image.
+            object_type: Type of object to detect (e.g., "car", "building").
+            window_size: Size of each processing window/tile. Default 512.
+            overlap: Overlap between adjacent windows. Default 64.
+            iou_threshold: IoU threshold for NMS to merge overlapping detections.
+            bands: Band indices for GeoTIFF.
+            output_path: Path to save results as GeoJSON/Shapefile/GeoPackage.
+            settings: Additional settings for the model.
+            show_progress: Whether to show progress bar.
+            **kwargs: Additional arguments for the model.
+
+        Returns:
+            Dictionary with "objects" key containing list of bounding boxes
+            with normalized coordinates. If georeferenced, also includes
+            "gdf" (GeoDataFrame).
+        """
+        # Load image
+        if isinstance(source, (str, Image.Image, np.ndarray)):
+            image, metadata = self.load_image(source, bands)
+        else:
+            image = source
+            metadata = self._metadata
+
+        width, height = image.size
+
+        # If image is smaller than window size, use regular detection
+        if width <= window_size and height <= window_size:
+            return self.detect(
+                image,
+                object_type,
+                bands=bands,
+                output_path=output_path,
+                settings=settings,
+                **kwargs,
+            )
+
+        # Create sliding windows
+        windows = self._create_sliding_windows(width, height, window_size, overlap)
+
+        all_detections = []
+
+        # Progress bar setup
+        iterator = (
+            tqdm(windows, desc=f"Detecting {object_type}") if show_progress else windows
+        )
+
+        # Process each window
+        for x_start, y_start, x_end, y_end in iterator:
+            # Crop window from image
+            window_img = image.crop((x_start, y_start, x_end, y_end))
+
+            # Detect in window
+            call_kwargs = {}
+            if settings:
+                call_kwargs["settings"] = settings
+            call_kwargs.update(kwargs)
+
+            try:
+                result = self.model.detect(window_img, object_type, **call_kwargs)
+
+                # Adjust coordinates to full image space
+                window_width = x_end - x_start
+                window_height = y_end - y_start
+
+                for obj in result.get("objects", []):
+                    # Convert from window-relative normalized coords to full image normalized coords
+                    full_x_min = (x_start + obj["x_min"] * window_width) / width
+                    full_y_min = (y_start + obj["y_min"] * window_height) / height
+                    full_x_max = (x_start + obj["x_max"] * window_width) / width
+                    full_y_max = (y_start + obj["y_max"] * window_height) / height
+
+                    detection = {
+                        "x_min": full_x_min,
+                        "y_min": full_y_min,
+                        "x_max": full_x_max,
+                        "y_max": full_y_max,
+                    }
+
+                    # Preserve additional fields if present
+                    for key in obj:
+                        if key not in ["x_min", "y_min", "x_max", "y_max"]:
+                            detection[key] = obj[key]
+
+                    all_detections.append(detection)
+
+            except Exception as e:
+                if show_progress:
+                    print(
+                        f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
+                    )
+
+        # Apply NMS to merge overlapping detections
+        merged_detections = self._apply_nms(all_detections, iou_threshold)
+
+        result = {"objects": merged_detections}
+
+        # Convert to georeferenced if possible
+        if metadata and metadata.get("crs") and metadata.get("transform"):
+            result = self._georef_detections(result, metadata)
+
+        if output_path:
+            self._save_vector(result["gdf"], output_path)
+
+        return result
+
+    def point_sliding_window(
+        self,
+        source: Union[str, Image.Image, np.ndarray],
+        object_description: str,
+        window_size: int = 512,
+        overlap: int = 64,
+        bands: Optional[List[int]] = None,
+        output_path: Optional[str] = None,
+        show_progress: bool = True,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """Find points using sliding window for large images.
+
+        This method processes large images by dividing them into overlapping
+        windows/tiles and finding points in each tile.
+
+        Args:
+            source: Image source or pre-encoded image.
+            object_description: Description of objects to find.
+            window_size: Size of each processing window/tile. Default 512.
+            overlap: Overlap between adjacent windows. Default 64.
+            bands: Band indices for GeoTIFF.
+            output_path: Path to save results as GeoJSON/Shapefile/GeoPackage.
+            show_progress: Whether to show progress bar.
+            **kwargs: Additional arguments for the model.
+
+        Returns:
+            Dictionary with "points" key containing list of points
+            with normalized coordinates. If georeferenced, also includes
+            "gdf" (GeoDataFrame).
+        """
+        # Load image
+        if isinstance(source, (str, Image.Image, np.ndarray)):
+            image, metadata = self.load_image(source, bands)
+        else:
+            image = source
+            metadata = self._metadata
+
+        width, height = image.size
+
+        # If image is smaller than window size, use regular point detection
+        if width <= window_size and height <= window_size:
+            return self.point(
+                image,
+                object_description,
+                bands=bands,
+                output_path=output_path,
+                **kwargs,
+            )
+
+        # Create sliding windows
+        windows = self._create_sliding_windows(width, height, window_size, overlap)
+
+        all_points = []
+
+        # Progress bar setup
+        iterator = (
+            tqdm(windows, desc=f"Finding {object_description}")
+            if show_progress
+            else windows
+        )
+
+        # Process each window
+        for x_start, y_start, x_end, y_end in iterator:
+            # Crop window from image
+            window_img = image.crop((x_start, y_start, x_end, y_end))
+
+            # Find points in window
+            try:
+                result = self.model.point(window_img, object_description, **kwargs)
+
+                # Adjust coordinates to full image space
+                window_width = x_end - x_start
+                window_height = y_end - y_start
+
+                for pt in result.get("points", []):
+                    # Convert from window-relative normalized coords to full image normalized coords
+                    full_x = (x_start + pt["x"] * window_width) / width
+                    full_y = (y_start + pt["y"] * window_height) / height
+
+                    point = {"x": full_x, "y": full_y}
+
+                    # Preserve additional fields if present
+                    for key in pt:
+                        if key not in ["x", "y"]:
+                            point[key] = pt[key]
+
+                    all_points.append(point)
+
+            except Exception as e:
+                if show_progress:
+                    print(
+                        f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
+                    )
+
+        result = {"points": all_points}
+
+        # Convert to georeferenced if possible
+        if metadata and metadata.get("crs") and metadata.get("transform"):
+            result = self._georef_points(result, metadata)
+
+        if output_path:
+            self._save_vector(result["gdf"], output_path)
+
+        return result
+
+    def query_sliding_window(
+        self,
+        question: str,
+        source: Union[str, Image.Image, np.ndarray],
+        window_size: int = 512,
+        overlap: int = 64,
+        reasoning: Optional[bool] = None,
+        bands: Optional[List[int]] = None,
+        settings: Optional[Dict] = None,
+        show_progress: bool = True,
+        combine_strategy: str = "concatenate",
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """Query image using sliding window for large images.
+
+        This method processes large images by dividing them into overlapping
+        windows/tiles, querying each tile, and combining the responses.
+
+        Args:
+            question: The question to ask about each window.
+            source: Image source or pre-encoded image.
+            window_size: Size of each processing window/tile. Default 512.
+            overlap: Overlap between adjacent windows. Default 64.
+            reasoning: Enable reasoning mode (moondream3 only).
+            bands: Band indices for GeoTIFF.
+            settings: Additional settings for the model.
+            show_progress: Whether to show progress bar.
+            combine_strategy: How to combine answers from different windows.
+                Options: "concatenate", "summarize". Default "concatenate".
+            **kwargs: Additional arguments for the model.
+
+        Returns:
+            Dictionary with "answer" key containing the combined response,
+            and "tile_answers" with individual tile responses.
+        """
+        # Load image
+        if isinstance(source, (str, Image.Image, np.ndarray)):
+            image, _ = self.load_image(source, bands)
+        else:
+            image = source
+
+        width, height = image.size
+
+        # If image is smaller than window size, use regular query
+        if width <= window_size and height <= window_size:
+            return self.query(
+                question,
+                image,
+                reasoning=reasoning,
+                bands=bands,
+                settings=settings,
+                **kwargs,
+            )
+
+        # Create sliding windows
+        windows = self._create_sliding_windows(width, height, window_size, overlap)
+
+        tile_answers = []
+
+        # Progress bar setup
+        iterator = tqdm(windows, desc="Querying tiles") if show_progress else windows
+
+        # Process each window
+        for idx, (x_start, y_start, x_end, y_end) in enumerate(iterator):
+            # Crop window from image
+            window_img = image.crop((x_start, y_start, x_end, y_end))
+
+            # Query window
+            call_kwargs = {"question": question, "image": window_img}
+            if reasoning is not None and self.model_version == "moondream3":
+                call_kwargs["reasoning"] = reasoning
+            if settings:
+                call_kwargs["settings"] = settings
+            call_kwargs.update(kwargs)
+
+            try:
+                result = self.model.query(**call_kwargs)
+                tile_answers.append(
+                    {
+                        "tile_id": idx,
+                        "bounds": (x_start, y_start, x_end, y_end),
+                        "answer": result.get("answer", ""),
+                    }
+                )
+            except Exception as e:
+                if show_progress:
+                    print(
+                        f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
+                    )
+
+        # Combine answers
+        if combine_strategy == "concatenate":
+            combined_answer = "\n\n".join(
+                [
+                    f"Tile {ta['tile_id']} (region {ta['bounds']}): {ta['answer']}"
+                    for ta in tile_answers
+                ]
+            )
+        elif combine_strategy == "summarize":
+            # Use the model to summarize the tile answers
+            summary_prompt = (
+                f"Based on these regional observations about '{question}', "
+                f"provide a comprehensive summary:\n\n"
+            )
+            for ta in tile_answers:
+                summary_prompt += f"Region {ta['tile_id']}: {ta['answer']}\n"
+
+            try:
+                summary_result = self.model.query(question=summary_prompt)
+                combined_answer = summary_result.get("answer", "")
+            except:
+                # Fall back to concatenation if summarization fails
+                combined_answer = " ".join([ta["answer"] for ta in tile_answers])
+        else:
+            combined_answer = " ".join([ta["answer"] for ta in tile_answers])
+
+        return {"answer": combined_answer, "tile_answers": tile_answers}
+
+    def caption_sliding_window(
+        self,
+        source: Union[str, Image.Image, np.ndarray],
+        window_size: int = 512,
+        overlap: int = 64,
+        length: str = "normal",
+        bands: Optional[List[int]] = None,
+        settings: Optional[Dict] = None,
+        show_progress: bool = True,
+        combine_strategy: str = "concatenate",
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """Generate caption using sliding window for large images.
+
+        This method processes large images by dividing them into overlapping
+        windows/tiles, generating captions for each tile, and combining them.
+
+        Args:
+            source: Image source or pre-encoded image.
+            window_size: Size of each processing window/tile. Default 512.
+            overlap: Overlap between adjacent windows. Default 64.
+            length: Caption length - "short", "normal", or "long".
+            bands: Band indices for GeoTIFF.
+            settings: Additional settings for the model.
+            show_progress: Whether to show progress bar.
+            combine_strategy: How to combine captions from different windows.
+                Options: "concatenate", "summarize". Default "concatenate".
+            **kwargs: Additional arguments for the model.
+
+        Returns:
+            Dictionary with "caption" key containing the combined caption,
+            and "tile_captions" with individual tile captions.
+        """
+        # Load image
+        if isinstance(source, (str, Image.Image, np.ndarray)):
+            image, _ = self.load_image(source, bands)
+        else:
+            image = source
+
+        width, height = image.size
+
+        # If image is smaller than window size, use regular caption
+        if width <= window_size and height <= window_size:
+            return self.caption(
+                image, length=length, bands=bands, settings=settings, **kwargs
+            )
+
+        # Create sliding windows
+        windows = self._create_sliding_windows(width, height, window_size, overlap)
+
+        tile_captions = []
+
+        # Progress bar setup
+        iterator = (
+            tqdm(windows, desc="Generating captions") if show_progress else windows
+        )
+
+        # Process each window
+        for idx, (x_start, y_start, x_end, y_end) in enumerate(iterator):
+            # Crop window from image
+            window_img = image.crop((x_start, y_start, x_end, y_end))
+
+            # Caption window
+            call_kwargs = {"length": length}
+            if settings:
+                call_kwargs["settings"] = settings
+            call_kwargs.update(kwargs)
+
+            try:
+                result = self.model.caption(window_img, **call_kwargs)
+                tile_captions.append(
+                    {
+                        "tile_id": idx,
+                        "bounds": (x_start, y_start, x_end, y_end),
+                        "caption": result.get("caption", ""),
+                    }
+                )
+            except Exception as e:
+                if show_progress:
+                    print(
+                        f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
+                    )
+
+        # Combine captions
+        if combine_strategy == "concatenate":
+            combined_caption = " ".join([tc["caption"] for tc in tile_captions])
+        elif combine_strategy == "summarize":
+            # Use the model to create a cohesive summary caption
+            summary_prompt = (
+                "Based on these descriptions of different regions of an image, "
+                "create a single comprehensive caption for the entire image:\n\n"
+            )
+            for tc in tile_captions:
+                summary_prompt += f"Region {tc['tile_id']}: {tc['caption']}\n"
+
+            try:
+                summary_result = self.model.query(question=summary_prompt)
+                combined_caption = summary_result.get("answer", "")
+            except:
+                # Fall back to concatenation if summarization fails
+                combined_caption = " ".join([tc["caption"] for tc in tile_captions])
+        else:
+            combined_caption = " ".join([tc["caption"] for tc in tile_captions])
+
+        return {"caption": combined_caption, "tile_captions": tile_captions}
+

 def moondream_caption(
     source: Union[str, Image.Image, np.ndarray],
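
Note: the heart of the new tiled path above is (1) the overlapping window grid and (2) the remapping of window-normalized detections back to full-image normalized coordinates before NMS. A minimal standalone sketch of both steps, mirroring the logic in the hunk (helper names here are illustrative, not part of the package):

def sliding_windows(width, height, window=512, overlap=64):
    # Same rule as MoondreamGeo._create_sliding_windows: stride = window - overlap;
    # tiles narrower than window // 2 on either axis are dropped, so a thin
    # strip at the right/bottom edge of the image may go unprocessed.
    stride = window - overlap
    tiles = []
    for y in range(0, height, stride):
        for x in range(0, width, stride):
            x_end, y_end = min(x + window, width), min(y + window, height)
            if (x_end - x) >= window // 2 and (y_end - y) >= window // 2:
                tiles.append((x, y, x_end, y_end))
    return tiles

def window_to_full(nx, ny, tile, width, height):
    # Same remapping as detect_sliding_window: scale by the window size,
    # offset by the window origin, renormalize by the full image size.
    x0, y0, x1, y1 = tile
    return (x0 + nx * (x1 - x0)) / width, (y0 + ny * (y1 - y0)) / height

print(len(sliding_windows(1024, 1024)))
# 4 tiles: starts at 0 and 448 per axis; the tile starting at 896 is only
# 128 px wide and is filtered out by the size check.
print(window_to_full(0.5, 0.5, (448, 448, 960, 960), 1024, 1024))
# (0.6875, 0.6875)

Because adjacent windows overlap, an object straddling a seam is typically detected in two tiles; after remapping, the duplicates occupy nearly the same full-image box, which is exactly the case _apply_nms is there to collapse.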
@@ -988,3 +1540,198 @@ def moondream_point
     return processor.point(
         source, object_description, output_path=output_path, bands=bands, **kwargs
     )
+
+
+def moondream_detect_sliding_window(
+    source: Union[str, Image.Image, np.ndarray],
+    object_type: str,
+    window_size: int = 512,
+    overlap: int = 64,
+    iou_threshold: float = 0.5,
+    model_name: str = "vikhyatk/moondream2",
+    revision: Optional[str] = None,
+    output_path: Optional[str] = None,
+    bands: Optional[List[int]] = None,
+    device: Optional[str] = None,
+    show_progress: bool = True,
+    **kwargs: Any,
+) -> Dict[str, Any]:
+    """Convenience function to detect objects using sliding window.
+
+    This function is designed for large images where the standard detection
+    may not work well. It divides the image into overlapping windows and
+    merges detections using NMS.
+
+    Args:
+        source: Image source.
+        object_type: Type of object to detect.
+        window_size: Size of each processing window. Default 512.
+        overlap: Overlap between windows. Default 64.
+        iou_threshold: IoU threshold for NMS. Default 0.5.
+        model_name: Moondream model name.
+        revision: Model revision.
+        output_path: Path to save results as vector file.
+        bands: Band indices for GeoTIFF.
+        device: Device for inference.
+        show_progress: Whether to show progress bar.
+        **kwargs: Additional arguments.
+
+    Returns:
+        Detection results dictionary with "objects" and optionally "gdf".
+    """
+    processor = MoondreamGeo(model_name=model_name, revision=revision, device=device)
+    return processor.detect_sliding_window(
+        source,
+        object_type,
+        window_size=window_size,
+        overlap=overlap,
+        iou_threshold=iou_threshold,
+        output_path=output_path,
+        bands=bands,
+        show_progress=show_progress,
+        **kwargs,
+    )
+
+
+def moondream_point_sliding_window(
+    source: Union[str, Image.Image, np.ndarray],
+    object_description: str,
+    window_size: int = 512,
+    overlap: int = 64,
+    model_name: str = "vikhyatk/moondream2",
+    revision: Optional[str] = None,
+    output_path: Optional[str] = None,
+    bands: Optional[List[int]] = None,
+    device: Optional[str] = None,
+    show_progress: bool = True,
+    **kwargs: Any,
+) -> Dict[str, Any]:
+    """Convenience function to find points using sliding window.
+
+    This function is designed for large images. It divides the image
+    into overlapping windows and aggregates all detected points.
+
+    Args:
+        source: Image source.
+        object_description: Description of objects to find.
+        window_size: Size of each processing window. Default 512.
+        overlap: Overlap between windows. Default 64.
+        model_name: Moondream model name.
+        revision: Model revision.
+        output_path: Path to save results as vector file.
+        bands: Band indices for GeoTIFF.
+        device: Device for inference.
+        show_progress: Whether to show progress bar.
+        **kwargs: Additional arguments.
+
+    Returns:
+        Point results dictionary with "points" and optionally "gdf".
+    """
+    processor = MoondreamGeo(model_name=model_name, revision=revision, device=device)
+    return processor.point_sliding_window(
+        source,
+        object_description,
+        window_size=window_size,
+        overlap=overlap,
+        output_path=output_path,
+        bands=bands,
+        show_progress=show_progress,
+        **kwargs,
+    )
+
+
+def moondream_query_sliding_window(
+    question: str,
+    source: Union[str, Image.Image, np.ndarray],
+    window_size: int = 512,
+    overlap: int = 64,
+    model_name: str = "vikhyatk/moondream2",
+    revision: Optional[str] = None,
+    reasoning: Optional[bool] = None,
+    bands: Optional[List[int]] = None,
+    device: Optional[str] = None,
+    show_progress: bool = True,
+    combine_strategy: str = "concatenate",
+    **kwargs: Any,
+) -> Dict[str, Any]:
+    """Convenience function to query large images using sliding window.
+
+    This function divides the image into overlapping windows, queries each,
+    and combines the answers.
+
+    Args:
+        question: Question to ask about the image.
+        source: Image source.
+        window_size: Size of each processing window. Default 512.
+        overlap: Overlap between windows. Default 64.
+        model_name: Moondream model name.
+        revision: Model revision.
+        reasoning: Enable reasoning mode (moondream3 only).
+        bands: Band indices for GeoTIFF.
+        device: Device for inference.
+        show_progress: Whether to show progress bar.
+        combine_strategy: How to combine answers ("concatenate" or "summarize").
+        **kwargs: Additional arguments.
+
+    Returns:
+        Dictionary with "answer" and "tile_answers" keys.
+    """
+    processor = MoondreamGeo(model_name=model_name, revision=revision, device=device)
+    return processor.query_sliding_window(
+        question,
+        source,
+        window_size=window_size,
+        overlap=overlap,
+        reasoning=reasoning,
+        bands=bands,
+        show_progress=show_progress,
+        combine_strategy=combine_strategy,
+        **kwargs,
+    )
+
+
+def moondream_caption_sliding_window(
+    source: Union[str, Image.Image, np.ndarray],
+    window_size: int = 512,
+    overlap: int = 64,
+    length: str = "normal",
+    model_name: str = "vikhyatk/moondream2",
+    revision: Optional[str] = None,
+    bands: Optional[List[int]] = None,
+    device: Optional[str] = None,
+    show_progress: bool = True,
+    combine_strategy: str = "concatenate",
+    **kwargs: Any,
+) -> Dict[str, Any]:
+    """Convenience function to caption large images using sliding window.
+
+    This function divides the image into overlapping windows, captions each,
+    and combines the results.
+
+    Args:
+        source: Image source.
+        window_size: Size of each processing window. Default 512.
+        overlap: Overlap between windows. Default 64.
+        length: Caption length ("short", "normal", "long").
+        model_name: Moondream model name.
+        revision: Model revision.
+        bands: Band indices for GeoTIFF.
+        device: Device for inference.
+        show_progress: Whether to show progress bar.
+        combine_strategy: How to combine captions ("concatenate" or "summarize").
+        **kwargs: Additional arguments.
+
+    Returns:
+        Dictionary with "caption" and "tile_captions" keys.
+    """
+    processor = MoondreamGeo(model_name=model_name, revision=revision, device=device)
+    return processor.caption_sliding_window(
+        source,
+        window_size=window_size,
+        overlap=overlap,
+        length=length,
+        bands=bands,
+        show_progress=show_progress,
+        combine_strategy=combine_strategy,
+        **kwargs,
+    )