vision-agent 0.2.226.tar.gz → 0.2.228.tar.gz

Files changed (47)
  1. {vision_agent-0.2.226 → vision_agent-0.2.228}/PKG-INFO +1 -1
  2. {vision_agent-0.2.226 → vision_agent-0.2.228}/pyproject.toml +1 -1
  3. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/tools/__init__.py +4 -0
  4. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/tools/tools.py +384 -0
  5. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/utils/video_tracking.py +2 -0
  6. {vision_agent-0.2.226 → vision_agent-0.2.228}/LICENSE +0 -0
  7. {vision_agent-0.2.226 → vision_agent-0.2.228}/README.md +0 -0
  8. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/.sim_tools/df.csv +0 -0
  9. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/.sim_tools/embs.npy +0 -0
  10. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/__init__.py +0 -0
  11. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/README.md +0 -0
  12. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/__init__.py +0 -0
  13. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/agent.py +0 -0
  14. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/agent_utils.py +0 -0
  15. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/types.py +0 -0
  16. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent.py +0 -0
  17. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder.py +0 -0
  18. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  19. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  20. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  21. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner.py +0 -0
  22. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  23. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  24. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  25. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_prompts.py +0 -0
  26. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  27. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/agent/vision_agent_v2.py +0 -0
  28. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/clients/__init__.py +0 -0
  29. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/clients/http.py +0 -0
  30. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/clients/landing_public_api.py +0 -0
  31. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/fonts/__init__.py +0 -0
  32. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  33. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/lmm/__init__.py +0 -0
  34. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/lmm/lmm.py +0 -0
  35. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/lmm/types.py +0 -0
  36. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/tools/meta_tools.py +0 -0
  37. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/tools/planner_tools.py +0 -0
  38. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/tools/prompts.py +0 -0
  39. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/tools/tool_utils.py +0 -0
  40. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/tools/tools_types.py +0 -0
  41. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/utils/__init__.py +0 -0
  42. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/utils/exceptions.py +0 -0
  43. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/utils/execute.py +0 -0
  44. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/utils/image_utils.py +0 -0
  45. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/utils/sim.py +0 -0
  46. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/utils/type_defs.py +0 -0
  47. {vision_agent-0.2.226 → vision_agent-0.2.228}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.226
+Version: 0.2.228
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "vision-agent"
-version = "0.2.226"
+version = "0.2.228"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -63,6 +63,10 @@ from .tools import (
     video_temporal_localization,
     vit_image_classification,
     vit_nsfw_classification,
+    custom_object_detection,
+    agentic_object_detection,
+    agentic_sam2_instance_segmentation,
+    agentic_sam2_video_tracking,
 )

 __new_tools__ = [
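
The four new exports make the agentic and custom-model tools importable directly from vision_agent.tools. A minimal usage sketch (the zeroed image is a placeholder; real detections require a real photo and a configured LandingAI API key):

    import numpy as np
    from vision_agent.tools import (
        agentic_object_detection,
        agentic_sam2_instance_segmentation,
    )

    # Placeholder frame; substitute an image loaded with e.g. cv2 or PIL.
    image = np.zeros((480, 640, 3), dtype=np.uint8)

    detections = agentic_object_detection("car, person", image)
    for det in detections:
        print(det["label"], det["score"], det["bbox"])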
@@ -290,6 +290,21 @@ def od_sam2_video_tracking(
         )
         function_name = "florence2_object_detection"

+    elif od_model == ODModels.AGENTIC:
+        segment_results = agentic_object_detection(
+            prompt=prompt,
+            image=segment_frames[frame_number],
+            fine_tune_id=fine_tune_id,
+        )
+        function_name = "agentic_object_detection"
+
+    elif od_model == ODModels.CUSTOM:
+        segment_results = custom_object_detection(
+            deployment_id=fine_tune_id,
+            image=segment_frames[frame_number],
+        )
+        function_name = "custom_object_detection"
+
     else:
         raise NotImplementedError(
             f"Object detection model '{od_model}' is not implemented."
@@ -1217,6 +1232,139 @@ def countgd_visual_prompt_object_detection(
     return bboxes_formatted


+def custom_object_detection(
+    deployment_id: str,
+    image: np.ndarray,
+    box_threshold: float = 0.1,
+) -> List[Dict[str, Any]]:
+    """'custom_object_detection' is a tool that can detect instances of an
+    object given the deployment_id of a previously fine-tuned object detection
+    model. It is particularly useful for detecting objects that generalist
+    models do not detect well. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated confidence scores.
+
+    Parameters:
+        deployment_id (str): The id of the fine-tuned model.
+        image (np.ndarray): The image that contains instances of the object.
+        box_threshold (float, optional): The threshold for detection. Defaults
+            to 0.1.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+        >>> custom_object_detection("abcd1234-5678efg", image)
+        [
+            {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+            {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
+            {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
+        ]
+    """
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+
+    files = [("image", numpy_to_bytes(image))]
+    payload = {
+        "deployment_id": deployment_id,
+        "confidence": box_threshold,
+    }
+    detections: List[List[Dict[str, Any]]] = send_inference_request(
+        payload, "custom-object-detection", files=files, v2=True
+    )
+
+    bboxes = detections[0]
+    bboxes_formatted = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    display_data = [
+        {
+            "label": bbox["label"],
+            "bbox": bbox["bounding_box"],
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+
+    _display_tool_trace(
+        custom_object_detection.__name__,
+        payload,
+        display_data,
+        files,
+    )
+    return bboxes_formatted
+
+
+def custom_od_sam2_video_tracking(
+    deployment_id: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+) -> List[List[Dict[str, Any]]]:
+    """'custom_od_sam2_video_tracking' is a tool that can segment multiple
+    objects given a custom model with predefined category names. It returns a
+    list of bounding boxes, label names, mask file names and associated
+    probability scores.
+
+    Parameters:
+        deployment_id (str): The id of the deployed custom model.
+        frames (List[np.ndarray]): The list of frames to run tracking on.
+        chunk_length (Optional[int]): The number of frames after which the
+            custom model is re-run to find new objects.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is a binary 2D numpy array where 1 indicates the object and 0
+            indicates the background.
+
+    Example
+    -------
+        >>> custom_od_sam2_video_tracking("abcd1234-5678efg", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.CUSTOM,
+        prompt="",
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=deployment_id,
+    )
+    _display_tool_trace(
+        custom_od_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
     """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
     images including regular images or images of documents or presentations. It can be
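
For reference, a hypothetical call to the new custom-model tool. The deployment id is the placeholder from the docstring example, and the zeroed test image would simply yield no detections without a real deployed model and API key:

    import numpy as np
    from vision_agent.tools import custom_object_detection

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real frame
    detections = custom_object_detection(
        deployment_id="abcd1234-5678efg",  # placeholder id, not a real deployment
        image=image,
        box_threshold=0.25,  # raise the default 0.1 to suppress weak detections
    )
    for det in detections:
        print(f"{det['label']}: {det['score']:.2f} at {det['bbox']}")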
@@ -2000,6 +2148,242 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
     return response


+# agentic od tools
+
+
+def _agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    image_size: Tuple[int, ...],
+    image_bytes: Optional[bytes] = None,
+    fine_tune_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    if image_bytes is None:
+        image_bytes = numpy_to_bytes(image)
+
+    files = [("image", image_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "agentic",
+    }
+    metadata = {"function_name": "agentic_object_detection"}
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        # we can only execute fine-tuned models with florence2
+        payload = {
+            "prompts": payload["prompts"],
+            "jobId": fine_tune_id,
+            "model": "florence2",
+        }
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-object-detection",
+        files=files,
+        metadata=metadata,
+    )
+
+    # get the first frame
+    bboxes = detections[0]
+    bboxes_formatted = [
+        {
+            "label": bbox["label"],
+            "bbox": normalize_bbox(bbox["bounding_box"], image_size),
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    display_data = [
+        {
+            "label": bbox["label"],
+            "bbox": bbox["bounding_box"],
+            "score": bbox["score"],
+        }
+        for bbox in bboxes
+    ]
+    return {
+        "files": files,
+        "return_data": bboxes_formatted,
+        "display_data": display_data,
+    }
+
+
+def agentic_object_detection(
+    prompt: str,
+    image: np.ndarray,
+    fine_tune_id: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """'agentic_object_detection' is a tool that can detect and count multiple objects
+    given a text prompt such as category names or referring expressions on images. The
+    categories in the text prompt are separated by commas. It returns a list of bounding
+    boxes with normalized coordinates, label names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+        >>> agentic_object_detection("car", image)
+        [
+            {'score': 0.99, 'label': 'car', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+        ]
+    """
+
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
+
+    ret = _agentic_object_detection(
+        prompt, image, image_size, fine_tune_id=fine_tune_id
+    )
+
+    _display_tool_trace(
+        agentic_object_detection.__name__,
+        {"prompts": prompt},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_instance_segmentation(
+    prompt: str, image: np.ndarray
+) -> List[Dict[str, Any]]:
+    """'agentic_sam2_instance_segmentation' is a tool that can detect and count multiple
+    instances of objects given a text prompt such as category names or referring
+    expressions on images. The categories in the text prompt are separated by commas. It
+    returns a list of bounding boxes with normalized coordinates, label names, masks
+    and associated probability scores.
+
+    Parameters:
+        prompt (str): The object that needs to be counted.
+        image (np.ndarray): The image that contains multiple instances of the object.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is a binary 2D numpy array where 1 indicates the object and 0
+            indicates the background.
+
+    Example
+    -------
+        >>> agentic_sam2_instance_segmentation("flower", image)
+        [
+            {
+                'score': 0.49,
+                'label': 'flower',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+
+    od_ret = _agentic_object_detection(prompt, image, image.shape[:2])
+    seg_ret = _sam2(
+        image, od_ret["return_data"], image.shape[:2], image_bytes=od_ret["files"][0][1]
+    )
+
+    _display_tool_trace(
+        agentic_sam2_instance_segmentation.__name__,
+        {
+            "prompts": prompt,
+        },
+        seg_ret["display_data"],
+        seg_ret["files"],
+    )
+
+    return seg_ret["return_data"]  # type: ignore
+
+
+def agentic_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
+    objects in a video given a text prompt such as category names or referring
+    expressions. The categories in the text prompt are separated by commas. It returns
+    a list of bounding boxes, label names, masks and associated probability scores and
+    is useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames after which agentic object
+            detection is re-run to find new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
+            label, segmentation mask and bounding boxes. The outer list represents each
+            frame and the inner list is the entities per frame. The detected objects
+            have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
+            and ymin are the coordinates of the top-left and xmax and ymax are the
+            coordinates of the bottom-right of the bounding box. The mask is a binary 2D
+            numpy array where 1 indicates the object and 0 indicates the background.
+            The label names are prefixed with an ID, so the total count can be read
+            from the number of distinct IDs.
+
+    Example
+    -------
+        >>> agentic_sam2_video_tracking("dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'bbox': [0.1, 0.11, 0.35, 0.4],
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+            ...
+        ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.AGENTIC,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+    _display_tool_trace(
+        agentic_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def minimum_distance(
     det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
 ) -> float:
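
An end-to-end sketch of the new agentic video tracker. It assumes frame loading via extract_frames_and_timestamps, an existing tool in this package; the video path is a placeholder:

    from vision_agent.tools import (
        agentic_sam2_video_tracking,
        extract_frames_and_timestamps,
    )

    frames_and_ts = extract_frames_and_timestamps("video.mp4", fps=1)
    frames = [d["frame"] for d in frames_and_ts]

    tracks = agentic_sam2_video_tracking("dinosaur", frames, chunk_length=10)
    # Outer list = frames, inner list = tracked objects; labels carry stable
    # IDs (e.g. '0: dinosaur'), so distinct IDs give the object count.
    ids = {obj["label"] for frame_objs in tracks for obj in frame_objs}
    print(f"{len(ids)} distinct objects tracked across {len(tracks)} frames")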
@@ -17,6 +17,8 @@ class ODModels(str, Enum):
     COUNTGD = "countgd"
     FLORENCE2 = "florence2"
    OWLV2 = "owlv2"
+    AGENTIC = "agentic"
+    CUSTOM = "custom"


 def split_frames_into_segments(
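
The two new enum members are what the public wrappers pass into od_sam2_video_tracking, which returns a dict of return_data, display_data and files. A hypothetical direct call (import paths inferred from this diff; normally you would use the agentic_sam2_video_tracking or custom_od_sam2_video_tracking wrappers instead):

    from vision_agent.tools.tools import od_sam2_video_tracking
    from vision_agent.utils.video_tracking import ODModels

    ret = od_sam2_video_tracking(
        ODModels.AGENTIC,
        prompt="person",
        frames=frames,  # a List[np.ndarray] prepared beforehand
        chunk_length=10,
    )
    per_frame_objects = ret["return_data"]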