geoai-py 0.23.0__py2.py3-none-any.whl → 0.25.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geoai/moondream.py CHANGED
@@ -18,6 +18,7 @@ import rasterio
18
18
  import torch
19
19
  from PIL import Image
20
20
  from shapely.geometry import Point, box
21
+ from tqdm import tqdm
21
22
  from transformers.utils import logging as hf_logging
22
23
 
23
24
  from .utils import get_device
@@ -867,6 +868,558 @@ class MoondreamGeo:
867
868
  else:
868
869
  return None
869
870
 
871
+ def _create_sliding_windows(
872
+ self,
873
+ image_width: int,
874
+ image_height: int,
875
+ window_size: int = 512,
876
+ overlap: int = 64,
877
+ ) -> List[Tuple[int, int, int, int]]:
878
+ """Create sliding window coordinates for tiled processing.
879
+
880
+ Args:
881
+ image_width: Width of the full image.
882
+ image_height: Height of the full image.
883
+ window_size: Size of each window/tile.
884
+ overlap: Overlap between adjacent windows.
885
+
886
+ Returns:
887
+ List of tuples (x_start, y_start, x_end, y_end) for each window.
888
+ """
889
+ windows = []
890
+ stride = window_size - overlap
891
+
892
+ for y in range(0, image_height, stride):
893
+ for x in range(0, image_width, stride):
894
+ x_start = x
895
+ y_start = y
896
+ x_end = min(x + window_size, image_width)
897
+ y_end = min(y + window_size, image_height)
898
+
899
+ # Only add windows that have sufficient size
900
+ if (x_end - x_start) >= window_size // 2 and (
901
+ y_end - y_start
902
+ ) >= window_size // 2:
903
+ windows.append((x_start, y_start, x_end, y_end))
904
+
905
+ return windows
906
+
907
+ def _apply_nms(
908
+ self,
909
+ detections: List[Dict[str, Any]],
910
+ iou_threshold: float = 0.5,
911
+ ) -> List[Dict[str, Any]]:
912
+ """Apply Non-Maximum Suppression to remove overlapping detections.
913
+
914
+ Args:
915
+ detections: List of detection dictionaries with bounding boxes.
916
+ iou_threshold: IoU threshold for considering boxes as overlapping.
917
+
918
+ Returns:
919
+ Filtered list of detections after NMS.
920
+ """
921
+ if not detections:
922
+ return []
923
+
924
+ # Sort by confidence/score if available
925
+ if "score" in detections[0]:
926
+ detections = sorted(
927
+ detections, key=lambda x: x.get("score", 1.0), reverse=True
928
+ )
929
+
930
+ # Convert to arrays for efficient computation
931
+ boxes = np.array(
932
+ [[d["x_min"], d["y_min"], d["x_max"], d["y_max"]] for d in detections]
933
+ )
934
+
935
+ # Calculate areas
936
+ x1 = boxes[:, 0]
937
+ y1 = boxes[:, 1]
938
+ x2 = boxes[:, 2]
939
+ y2 = boxes[:, 3]
940
+ areas = (x2 - x1) * (y2 - y1)
941
+
942
+ # Sort by y2 coordinate (bottom of box)
943
+ order = y2.argsort()
944
+
945
+ keep = []
946
+ while order.size > 0:
947
+ i = order[-1]
948
+ keep.append(i)
949
+
950
+ # Calculate IoU with remaining boxes
951
+ xx1 = np.maximum(x1[i], x1[order[:-1]])
952
+ yy1 = np.maximum(y1[i], y1[order[:-1]])
953
+ xx2 = np.minimum(x2[i], x2[order[:-1]])
954
+ yy2 = np.minimum(y2[i], y2[order[:-1]])
955
+
956
+ w = np.maximum(0, xx2 - xx1)
957
+ h = np.maximum(0, yy2 - yy1)
958
+ intersection = w * h
959
+
960
+ iou = intersection / (areas[i] + areas[order[:-1]] - intersection)
961
+
962
+ # Keep only boxes with IoU less than threshold
963
+ inds = np.where(iou <= iou_threshold)[0]
964
+ order = order[inds]
965
+
966
+ return [detections[i] for i in keep]
967
+
968
+ def detect_sliding_window(
969
+ self,
970
+ source: Union[str, Image.Image, np.ndarray],
971
+ object_type: str,
972
+ window_size: int = 512,
973
+ overlap: int = 64,
974
+ iou_threshold: float = 0.5,
975
+ bands: Optional[List[int]] = None,
976
+ output_path: Optional[str] = None,
977
+ settings: Optional[Dict] = None,
978
+ show_progress: bool = True,
979
+ **kwargs: Any,
980
+ ) -> Dict[str, Any]:
981
+ """Detect objects using sliding window for large images.
982
+
983
+ This method processes large images by dividing them into overlapping
984
+ windows/tiles, running detection on each tile, and merging results
985
+ using Non-Maximum Suppression (NMS) to handle overlapping detections.
986
+
987
+ Args:
988
+ source: Image source or pre-encoded image.
989
+ object_type: Type of object to detect (e.g., "car", "building").
990
+ window_size: Size of each processing window/tile. Default 512.
991
+ overlap: Overlap between adjacent windows. Default 64.
992
+ iou_threshold: IoU threshold for NMS to merge overlapping detections.
993
+ bands: Band indices for GeoTIFF.
994
+ output_path: Path to save results as GeoJSON/Shapefile/GeoPackage.
995
+ settings: Additional settings for the model.
996
+ show_progress: Whether to show progress bar.
997
+ **kwargs: Additional arguments for the model.
998
+
999
+ Returns:
1000
+ Dictionary with "objects" key containing list of bounding boxes
1001
+ with normalized coordinates. If georeferenced, also includes
1002
+ "gdf" (GeoDataFrame).
1003
+ """
1004
+ # Load image
1005
+ if isinstance(source, (str, Image.Image, np.ndarray)):
1006
+ image, metadata = self.load_image(source, bands)
1007
+ else:
1008
+ image = source
1009
+ metadata = self._metadata
1010
+
1011
+ width, height = image.size
1012
+
1013
+ # If image is smaller than window size, use regular detection
1014
+ if width <= window_size and height <= window_size:
1015
+ return self.detect(
1016
+ image,
1017
+ object_type,
1018
+ bands=bands,
1019
+ output_path=output_path,
1020
+ settings=settings,
1021
+ **kwargs,
1022
+ )
1023
+
1024
+ # Create sliding windows
1025
+ windows = self._create_sliding_windows(width, height, window_size, overlap)
1026
+
1027
+ all_detections = []
1028
+
1029
+ # Progress bar setup
1030
+ iterator = (
1031
+ tqdm(windows, desc=f"Detecting {object_type}") if show_progress else windows
1032
+ )
1033
+
1034
+ # Process each window
1035
+ for x_start, y_start, x_end, y_end in iterator:
1036
+ # Crop window from image
1037
+ window_img = image.crop((x_start, y_start, x_end, y_end))
1038
+
1039
+ # Detect in window
1040
+ call_kwargs = {}
1041
+ if settings:
1042
+ call_kwargs["settings"] = settings
1043
+ call_kwargs.update(kwargs)
1044
+
1045
+ try:
1046
+ result = self.model.detect(window_img, object_type, **call_kwargs)
1047
+
1048
+ # Adjust coordinates to full image space
1049
+ window_width = x_end - x_start
1050
+ window_height = y_end - y_start
1051
+
1052
+ for obj in result.get("objects", []):
1053
+ # Convert from window-relative normalized coords to full image normalized coords
1054
+ full_x_min = (x_start + obj["x_min"] * window_width) / width
1055
+ full_y_min = (y_start + obj["y_min"] * window_height) / height
1056
+ full_x_max = (x_start + obj["x_max"] * window_width) / width
1057
+ full_y_max = (y_start + obj["y_max"] * window_height) / height
1058
+
1059
+ detection = {
1060
+ "x_min": full_x_min,
1061
+ "y_min": full_y_min,
1062
+ "x_max": full_x_max,
1063
+ "y_max": full_y_max,
1064
+ }
1065
+
1066
+ # Preserve additional fields if present
1067
+ for key in obj:
1068
+ if key not in ["x_min", "y_min", "x_max", "y_max"]:
1069
+ detection[key] = obj[key]
1070
+
1071
+ all_detections.append(detection)
1072
+
1073
+ except Exception as e:
1074
+ if show_progress:
1075
+ print(
1076
+ f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
1077
+ )
1078
+
1079
+ # Apply NMS to merge overlapping detections
1080
+ merged_detections = self._apply_nms(all_detections, iou_threshold)
1081
+
1082
+ result = {"objects": merged_detections}
1083
+
1084
+ # Convert to georeferenced if possible
1085
+ if metadata and metadata.get("crs") and metadata.get("transform"):
1086
+ result = self._georef_detections(result, metadata)
1087
+
1088
+ if output_path:
1089
+ self._save_vector(result["gdf"], output_path)
1090
+
1091
+ return result
1092
+
1093
+ def point_sliding_window(
1094
+ self,
1095
+ source: Union[str, Image.Image, np.ndarray],
1096
+ object_description: str,
1097
+ window_size: int = 512,
1098
+ overlap: int = 64,
1099
+ bands: Optional[List[int]] = None,
1100
+ output_path: Optional[str] = None,
1101
+ show_progress: bool = True,
1102
+ **kwargs: Any,
1103
+ ) -> Dict[str, Any]:
1104
+ """Find points using sliding window for large images.
1105
+
1106
+ This method processes large images by dividing them into overlapping
1107
+ windows/tiles and finding points in each tile.
1108
+
1109
+ Args:
1110
+ source: Image source or pre-encoded image.
1111
+ object_description: Description of objects to find.
1112
+ window_size: Size of each processing window/tile. Default 512.
1113
+ overlap: Overlap between adjacent windows. Default 64.
1114
+ bands: Band indices for GeoTIFF.
1115
+ output_path: Path to save results as GeoJSON/Shapefile/GeoPackage.
1116
+ show_progress: Whether to show progress bar.
1117
+ **kwargs: Additional arguments for the model.
1118
+
1119
+ Returns:
1120
+ Dictionary with "points" key containing list of points
1121
+ with normalized coordinates. If georeferenced, also includes
1122
+ "gdf" (GeoDataFrame).
1123
+ """
1124
+ # Load image
1125
+ if isinstance(source, (str, Image.Image, np.ndarray)):
1126
+ image, metadata = self.load_image(source, bands)
1127
+ else:
1128
+ image = source
1129
+ metadata = self._metadata
1130
+
1131
+ width, height = image.size
1132
+
1133
+ # If image is smaller than window size, use regular point detection
1134
+ if width <= window_size and height <= window_size:
1135
+ return self.point(
1136
+ image,
1137
+ object_description,
1138
+ bands=bands,
1139
+ output_path=output_path,
1140
+ **kwargs,
1141
+ )
1142
+
1143
+ # Create sliding windows
1144
+ windows = self._create_sliding_windows(width, height, window_size, overlap)
1145
+
1146
+ all_points = []
1147
+
1148
+ # Progress bar setup
1149
+ iterator = (
1150
+ tqdm(windows, desc=f"Finding {object_description}")
1151
+ if show_progress
1152
+ else windows
1153
+ )
1154
+
1155
+ # Process each window
1156
+ for x_start, y_start, x_end, y_end in iterator:
1157
+ # Crop window from image
1158
+ window_img = image.crop((x_start, y_start, x_end, y_end))
1159
+
1160
+ # Find points in window
1161
+ try:
1162
+ result = self.model.point(window_img, object_description, **kwargs)
1163
+
1164
+ # Adjust coordinates to full image space
1165
+ window_width = x_end - x_start
1166
+ window_height = y_end - y_start
1167
+
1168
+ for pt in result.get("points", []):
1169
+ # Convert from window-relative normalized coords to full image normalized coords
1170
+ full_x = (x_start + pt["x"] * window_width) / width
1171
+ full_y = (y_start + pt["y"] * window_height) / height
1172
+
1173
+ point = {"x": full_x, "y": full_y}
1174
+
1175
+ # Preserve additional fields if present
1176
+ for key in pt:
1177
+ if key not in ["x", "y"]:
1178
+ point[key] = pt[key]
1179
+
1180
+ all_points.append(point)
1181
+
1182
+ except Exception as e:
1183
+ if show_progress:
1184
+ print(
1185
+ f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
1186
+ )
1187
+
1188
+ result = {"points": all_points}
1189
+
1190
+ # Convert to georeferenced if possible
1191
+ if metadata and metadata.get("crs") and metadata.get("transform"):
1192
+ result = self._georef_points(result, metadata)
1193
+
1194
+ if output_path:
1195
+ self._save_vector(result["gdf"], output_path)
1196
+
1197
+ return result
1198
+
1199
+ def query_sliding_window(
1200
+ self,
1201
+ question: str,
1202
+ source: Union[str, Image.Image, np.ndarray],
1203
+ window_size: int = 512,
1204
+ overlap: int = 64,
1205
+ reasoning: Optional[bool] = None,
1206
+ bands: Optional[List[int]] = None,
1207
+ settings: Optional[Dict] = None,
1208
+ show_progress: bool = True,
1209
+ combine_strategy: str = "concatenate",
1210
+ **kwargs: Any,
1211
+ ) -> Dict[str, Any]:
1212
+ """Query image using sliding window for large images.
1213
+
1214
+ This method processes large images by dividing them into overlapping
1215
+ windows/tiles, querying each tile, and combining the responses.
1216
+
1217
+ Args:
1218
+ question: The question to ask about each window.
1219
+ source: Image source or pre-encoded image.
1220
+ window_size: Size of each processing window/tile. Default 512.
1221
+ overlap: Overlap between adjacent windows. Default 64.
1222
+ reasoning: Enable reasoning mode (moondream3 only).
1223
+ bands: Band indices for GeoTIFF.
1224
+ settings: Additional settings for the model.
1225
+ show_progress: Whether to show progress bar.
1226
+ combine_strategy: How to combine answers from different windows.
1227
+ Options: "concatenate", "summarize". Default "concatenate".
1228
+ **kwargs: Additional arguments for the model.
1229
+
1230
+ Returns:
1231
+ Dictionary with "answer" key containing the combined response,
1232
+ and "tile_answers" with individual tile responses.
1233
+ """
1234
+ # Load image
1235
+ if isinstance(source, (str, Image.Image, np.ndarray)):
1236
+ image, _ = self.load_image(source, bands)
1237
+ else:
1238
+ image = source
1239
+
1240
+ width, height = image.size
1241
+
1242
+ # If image is smaller than window size, use regular query
1243
+ if width <= window_size and height <= window_size:
1244
+ return self.query(
1245
+ question,
1246
+ image,
1247
+ reasoning=reasoning,
1248
+ bands=bands,
1249
+ settings=settings,
1250
+ **kwargs,
1251
+ )
1252
+
1253
+ # Create sliding windows
1254
+ windows = self._create_sliding_windows(width, height, window_size, overlap)
1255
+
1256
+ tile_answers = []
1257
+
1258
+ # Progress bar setup
1259
+ iterator = tqdm(windows, desc="Querying tiles") if show_progress else windows
1260
+
1261
+ # Process each window
1262
+ for idx, (x_start, y_start, x_end, y_end) in enumerate(iterator):
1263
+ # Crop window from image
1264
+ window_img = image.crop((x_start, y_start, x_end, y_end))
1265
+
1266
+ # Query window
1267
+ call_kwargs = {"question": question, "image": window_img}
1268
+ if reasoning is not None and self.model_version == "moondream3":
1269
+ call_kwargs["reasoning"] = reasoning
1270
+ if settings:
1271
+ call_kwargs["settings"] = settings
1272
+ call_kwargs.update(kwargs)
1273
+
1274
+ try:
1275
+ result = self.model.query(**call_kwargs)
1276
+ tile_answers.append(
1277
+ {
1278
+ "tile_id": idx,
1279
+ "bounds": (x_start, y_start, x_end, y_end),
1280
+ "answer": result.get("answer", ""),
1281
+ }
1282
+ )
1283
+ except Exception as e:
1284
+ if show_progress:
1285
+ print(
1286
+ f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
1287
+ )
1288
+
1289
+ # Combine answers
1290
+ if combine_strategy == "concatenate":
1291
+ combined_answer = "\n\n".join(
1292
+ [
1293
+ f"Tile {ta['tile_id']} (region {ta['bounds']}): {ta['answer']}"
1294
+ for ta in tile_answers
1295
+ ]
1296
+ )
1297
+ elif combine_strategy == "summarize":
1298
+ # Use the model to summarize the tile answers
1299
+ summary_prompt = (
1300
+ f"Based on these regional observations about '{question}', "
1301
+ f"provide a comprehensive summary:\n\n"
1302
+ )
1303
+ for ta in tile_answers:
1304
+ summary_prompt += f"Region {ta['tile_id']}: {ta['answer']}\n"
1305
+
1306
+ try:
1307
+ summary_result = self.model.query(question=summary_prompt)
1308
+ combined_answer = summary_result.get("answer", "")
1309
+ except:
1310
+ # Fall back to concatenation if summarization fails
1311
+ combined_answer = " ".join([ta["answer"] for ta in tile_answers])
1312
+ else:
1313
+ combined_answer = " ".join([ta["answer"] for ta in tile_answers])
1314
+
1315
+ return {"answer": combined_answer, "tile_answers": tile_answers}
1316
+
1317
+ def caption_sliding_window(
1318
+ self,
1319
+ source: Union[str, Image.Image, np.ndarray],
1320
+ window_size: int = 512,
1321
+ overlap: int = 64,
1322
+ length: str = "normal",
1323
+ bands: Optional[List[int]] = None,
1324
+ settings: Optional[Dict] = None,
1325
+ show_progress: bool = True,
1326
+ combine_strategy: str = "concatenate",
1327
+ **kwargs: Any,
1328
+ ) -> Dict[str, Any]:
1329
+ """Generate caption using sliding window for large images.
1330
+
1331
+ This method processes large images by dividing them into overlapping
1332
+ windows/tiles, generating captions for each tile, and combining them.
1333
+
1334
+ Args:
1335
+ source: Image source or pre-encoded image.
1336
+ window_size: Size of each processing window/tile. Default 512.
1337
+ overlap: Overlap between adjacent windows. Default 64.
1338
+ length: Caption length - "short", "normal", or "long".
1339
+ bands: Band indices for GeoTIFF.
1340
+ settings: Additional settings for the model.
1341
+ show_progress: Whether to show progress bar.
1342
+ combine_strategy: How to combine captions from different windows.
1343
+ Options: "concatenate", "summarize". Default "concatenate".
1344
+ **kwargs: Additional arguments for the model.
1345
+
1346
+ Returns:
1347
+ Dictionary with "caption" key containing the combined caption,
1348
+ and "tile_captions" with individual tile captions.
1349
+ """
1350
+ # Load image
1351
+ if isinstance(source, (str, Image.Image, np.ndarray)):
1352
+ image, _ = self.load_image(source, bands)
1353
+ else:
1354
+ image = source
1355
+
1356
+ width, height = image.size
1357
+
1358
+ # If image is smaller than window size, use regular caption
1359
+ if width <= window_size and height <= window_size:
1360
+ return self.caption(
1361
+ image, length=length, bands=bands, settings=settings, **kwargs
1362
+ )
1363
+
1364
+ # Create sliding windows
1365
+ windows = self._create_sliding_windows(width, height, window_size, overlap)
1366
+
1367
+ tile_captions = []
1368
+
1369
+ # Progress bar setup
1370
+ iterator = (
1371
+ tqdm(windows, desc="Generating captions") if show_progress else windows
1372
+ )
1373
+
1374
+ # Process each window
1375
+ for idx, (x_start, y_start, x_end, y_end) in enumerate(iterator):
1376
+ # Crop window from image
1377
+ window_img = image.crop((x_start, y_start, x_end, y_end))
1378
+
1379
+ # Caption window
1380
+ call_kwargs = {"length": length}
1381
+ if settings:
1382
+ call_kwargs["settings"] = settings
1383
+ call_kwargs.update(kwargs)
1384
+
1385
+ try:
1386
+ result = self.model.caption(window_img, **call_kwargs)
1387
+ tile_captions.append(
1388
+ {
1389
+ "tile_id": idx,
1390
+ "bounds": (x_start, y_start, x_end, y_end),
1391
+ "caption": result.get("caption", ""),
1392
+ }
1393
+ )
1394
+ except Exception as e:
1395
+ if show_progress:
1396
+ print(
1397
+ f"Warning: Failed to process window ({x_start},{y_start})-({x_end},{y_end}): {e}"
1398
+ )
1399
+
1400
+ # Combine captions
1401
+ if combine_strategy == "concatenate":
1402
+ combined_caption = " ".join([tc["caption"] for tc in tile_captions])
1403
+ elif combine_strategy == "summarize":
1404
+ # Use the model to create a cohesive summary caption
1405
+ summary_prompt = (
1406
+ "Based on these descriptions of different regions of an image, "
1407
+ "create a single comprehensive caption for the entire image:\n\n"
1408
+ )
1409
+ for tc in tile_captions:
1410
+ summary_prompt += f"Region {tc['tile_id']}: {tc['caption']}\n"
1411
+
1412
+ try:
1413
+ summary_result = self.model.query(question=summary_prompt)
1414
+ combined_caption = summary_result.get("answer", "")
1415
+ except:
1416
+ # Fall back to concatenation if summarization fails
1417
+ combined_caption = " ".join([tc["caption"] for tc in tile_captions])
1418
+ else:
1419
+ combined_caption = " ".join([tc["caption"] for tc in tile_captions])
1420
+
1421
+ return {"caption": combined_caption, "tile_captions": tile_captions}
1422
+
870
1423
 
871
1424
  def moondream_caption(
872
1425
  source: Union[str, Image.Image, np.ndarray],
@@ -988,3 +1541,198 @@ def moondream_point(
988
1541
  return processor.point(
989
1542
  source, object_description, output_path=output_path, bands=bands, **kwargs
990
1543
  )
1544
+
1545
+
1546
def moondream_detect_sliding_window(
    source: Union[str, Image.Image, np.ndarray],
    object_type: str,
    window_size: int = 512,
    overlap: int = 64,
    iou_threshold: float = 0.5,
    model_name: str = "vikhyatk/moondream2",
    revision: Optional[str] = None,
    output_path: Optional[str] = None,
    bands: Optional[List[int]] = None,
    device: Optional[str] = None,
    show_progress: bool = True,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Detect objects in a large image via tiled (sliding-window) inference.

    Thin convenience wrapper: instantiates a :class:`MoondreamGeo` processor
    and delegates to :meth:`MoondreamGeo.detect_sliding_window`, which tiles
    the image, detects per tile, and merges overlapping boxes with NMS.

    Args:
        source: Image source.
        object_type: Type of object to detect.
        window_size: Size of each processing window. Default 512.
        overlap: Overlap between windows. Default 64.
        iou_threshold: IoU threshold for NMS. Default 0.5.
        model_name: Moondream model name.
        revision: Model revision.
        output_path: Path to save results as vector file.
        bands: Band indices for GeoTIFF.
        device: Device for inference.
        show_progress: Whether to show progress bar.
        **kwargs: Additional arguments forwarded to the model.

    Returns:
        Detection results dictionary with "objects" and optionally "gdf".
    """
    geo = MoondreamGeo(model_name=model_name, revision=revision, device=device)
    options = dict(
        window_size=window_size,
        overlap=overlap,
        iou_threshold=iou_threshold,
        output_path=output_path,
        bands=bands,
        show_progress=show_progress,
    )
    options.update(kwargs)
    return geo.detect_sliding_window(source, object_type, **options)
1595
+
1596
+
1597
def moondream_point_sliding_window(
    source: Union[str, Image.Image, np.ndarray],
    object_description: str,
    window_size: int = 512,
    overlap: int = 64,
    model_name: str = "vikhyatk/moondream2",
    revision: Optional[str] = None,
    output_path: Optional[str] = None,
    bands: Optional[List[int]] = None,
    device: Optional[str] = None,
    show_progress: bool = True,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Locate objects as points in a large image via tiled inference.

    Thin convenience wrapper: instantiates a :class:`MoondreamGeo` processor
    and delegates to :meth:`MoondreamGeo.point_sliding_window`, which tiles
    the image and aggregates the points found in every tile.

    Args:
        source: Image source.
        object_description: Description of objects to find.
        window_size: Size of each processing window. Default 512.
        overlap: Overlap between windows. Default 64.
        model_name: Moondream model name.
        revision: Model revision.
        output_path: Path to save results as vector file.
        bands: Band indices for GeoTIFF.
        device: Device for inference.
        show_progress: Whether to show progress bar.
        **kwargs: Additional arguments forwarded to the model.

    Returns:
        Point results dictionary with "points" and optionally "gdf".
    """
    geo = MoondreamGeo(model_name=model_name, revision=revision, device=device)
    options = dict(
        window_size=window_size,
        overlap=overlap,
        output_path=output_path,
        bands=bands,
        show_progress=show_progress,
    )
    options.update(kwargs)
    return geo.point_sliding_window(source, object_description, **options)
1642
+
1643
+
1644
def moondream_query_sliding_window(
    question: str,
    source: Union[str, Image.Image, np.ndarray],
    window_size: int = 512,
    overlap: int = 64,
    model_name: str = "vikhyatk/moondream2",
    revision: Optional[str] = None,
    reasoning: Optional[bool] = None,
    bands: Optional[List[int]] = None,
    device: Optional[str] = None,
    show_progress: bool = True,
    combine_strategy: str = "concatenate",
    **kwargs: Any,
) -> Dict[str, Any]:
    """Ask a question about a large image via tiled inference.

    Thin convenience wrapper: instantiates a :class:`MoondreamGeo` processor
    and delegates to :meth:`MoondreamGeo.query_sliding_window`, which queries
    every tile and combines the per-tile answers.

    Args:
        question: Question to ask about the image.
        source: Image source.
        window_size: Size of each processing window. Default 512.
        overlap: Overlap between windows. Default 64.
        model_name: Moondream model name.
        revision: Model revision.
        reasoning: Enable reasoning mode (moondream3 only).
        bands: Band indices for GeoTIFF.
        device: Device for inference.
        show_progress: Whether to show progress bar.
        combine_strategy: How to combine answers ("concatenate" or "summarize").
        **kwargs: Additional arguments forwarded to the model.

    Returns:
        Dictionary with "answer" and "tile_answers" keys.
    """
    geo = MoondreamGeo(model_name=model_name, revision=revision, device=device)
    options = dict(
        window_size=window_size,
        overlap=overlap,
        reasoning=reasoning,
        bands=bands,
        show_progress=show_progress,
        combine_strategy=combine_strategy,
    )
    options.update(kwargs)
    return geo.query_sliding_window(question, source, **options)
1692
+
1693
+
1694
def moondream_caption_sliding_window(
    source: Union[str, Image.Image, np.ndarray],
    window_size: int = 512,
    overlap: int = 64,
    length: str = "normal",
    model_name: str = "vikhyatk/moondream2",
    revision: Optional[str] = None,
    bands: Optional[List[int]] = None,
    device: Optional[str] = None,
    show_progress: bool = True,
    combine_strategy: str = "concatenate",
    **kwargs: Any,
) -> Dict[str, Any]:
    """Caption a large image via tiled (sliding-window) inference.

    Thin convenience wrapper: instantiates a :class:`MoondreamGeo` processor
    and delegates to :meth:`MoondreamGeo.caption_sliding_window`, which
    captions every tile and combines the per-tile captions.

    Args:
        source: Image source.
        window_size: Size of each processing window. Default 512.
        overlap: Overlap between windows. Default 64.
        length: Caption length ("short", "normal", "long").
        model_name: Moondream model name.
        revision: Model revision.
        bands: Band indices for GeoTIFF.
        device: Device for inference.
        show_progress: Whether to show progress bar.
        combine_strategy: How to combine captions ("concatenate" or "summarize").
        **kwargs: Additional arguments forwarded to the model.

    Returns:
        Dictionary with "caption" and "tile_captions" keys.
    """
    geo = MoondreamGeo(model_name=model_name, revision=revision, device=device)
    options = dict(
        window_size=window_size,
        overlap=overlap,
        length=length,
        bands=bands,
        show_progress=show_progress,
        combine_strategy=combine_strategy,
    )
    options.update(kwargs)
    return geo.caption_sliding_window(source, **options)