vision-agent 0.2.214__py3-none-any.whl → 0.2.216__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -444,6 +444,35 @@ desc,doc,name
444
444
  >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
445
445
  'Lionel Messi'
446
446
  ",qwen2_vl_video_vqa
447
+ "'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
448
+ 'document_extraction' is a tool that can extract structured information out of
449
+ documents with different layouts. It returns the extracted data in a structured
450
+ hierarchical format containing text, tables, pictures, charts, and other
451
+ information.
452
+
453
+ Parameters:
454
+ image (np.ndarray): The document image to analyze
455
+
456
+ Returns:
457
+ Dict[str, Any]: A dictionary containing the extracted information.
458
+
459
+ Example
460
+ -------
461
+ >>> document_extraction(image)
462
+ {'pages':
463
+ [{'bbox': [0, 0, 1700, 2200],
464
+ 'chunks': [{'bbox': [1371, 75, 1503, 112],
465
+ 'label': 'page_header',
466
+ 'order': 75,
467
+ 'caption': 'Annual Report 2024',
468
+ 'summary': 'This annual report summarizes ...' },
469
+ {'bbox': [201, 1119, 1497, 1647],
470
+ 'label': 'table',
471
+ 'order': 1119,
472
+ 'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'}],
473
+ 'summary': 'This table illustrates a trend of ...'},
474
+ ],
475
+ ",document_extraction
447
476
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
448
477
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames
449
478
  value selected for the video. It can detect multiple objects independently per
@@ -513,6 +542,78 @@ desc,doc,name
513
542
  >>> siglip_classification(image, ['dog', 'cat', 'bird'])
514
543
  {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
515
544
  ",siglip_classification
545
+ "'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
546
+ 'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
547
+ prompt such as category names or referring expressions. The categories in the text
548
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
549
+ mask file names and associated probability scores.
550
+
551
+ Parameters:
552
+ prompt (str): The prompt to ground to the image.
553
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
554
+
555
+ Returns:
556
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
557
+ bounding box, and mask of the detected objects with normalized coordinates
558
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
559
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
560
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
561
+ the background.
562
+
563
+ Example
564
+ -------
565
+ >>> owlv2_sam2_video_tracking(""car, dinosaur"", frames)
566
+ [
567
+ [
568
+ {
569
+ 'label': '0: dinosaur',
570
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
571
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
572
+ [0, 0, 0, ..., 0, 0, 0],
573
+ ...,
574
+ [0, 0, 0, ..., 0, 0, 0],
575
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
576
+ },
577
+ ],
578
+ ...
579
+ ]
580
+ ",owlv2_sam2_video_tracking
581
+ "'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
582
+ 'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
583
+ prompt such as category names or referring expressions. The categories in the text
584
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
585
+ mask file names and associated probability scores.
586
+
587
+ Parameters:
588
+ prompt (str): The prompt to ground to the image.
589
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
590
+
591
+ Returns:
592
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
593
+ bounding box, and mask of the detected objects with normalized coordinates
594
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
595
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
596
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
597
+ the background.
598
+
599
+ Example
600
+ -------
601
+ >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
602
+ [
603
+ [
604
+ {
605
+ 'label': '0: dinosaur',
606
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
607
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
608
+ [0, 0, 0, ..., 0, 0, 0],
609
+ ...,
610
+ [0, 0, 0, ..., 0, 0, 0],
611
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
612
+ },
613
+ ],
614
+ ...
615
+ ]
616
+ ",countgd_sam2_video_tracking
516
617
  "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
517
618
  'extract_frames_and_timestamps' extracts frames and timestamps from a video
518
619
  which can be a file path, url or youtube link, returns a list of dictionaries
Binary file
@@ -32,6 +32,7 @@ from .tools import (
32
32
  countgd_sam2_video_tracking,
33
33
  depth_anything_v2,
34
34
  detr_segmentation,
35
+ document_extraction,
35
36
  extract_frames_and_timestamps,
36
37
  florence2_ocr,
37
38
  florence2_phrase_grounding,
@@ -143,7 +143,14 @@ def run_tool_testing(
143
143
  code = extract_tag(response, "code") # type: ignore
144
144
  if code is None:
145
145
  raise ValueError(f"Could not extract code from response: {response}")
146
- code = process_code(code)
146
+
147
+ # If there's a syntax error with the code, process_code can crash. Executing the
148
+ # code and then sending the error to the LLM should correct it.
149
+ try:
150
+ code = process_code(code)
151
+ except Exception as e:
152
+ _LOGGER.error(f"Error processing code: {e}")
153
+
147
154
  tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
148
155
  tool_output_str = tool_output.text(include_results=False).strip()
149
156
 
@@ -167,6 +174,7 @@ def run_tool_testing(
167
174
  DefaultImports.prepend_imports(code)
168
175
  )
169
176
  tool_output_str = tool_output.text(include_results=False).strip()
177
+ count += 1
170
178
 
171
179
  return code, tool_docs_str, tool_output
172
180
 
@@ -119,6 +119,120 @@ def _display_tool_trace(
119
119
  display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)
120
120
 
121
121
 
122
class ODModels(str, Enum):
    """Object-detection backends that can seed SAM2 video tracking."""

    COUNTGD = "countgd"
    FLORENCE2 = "florence2"
    OWLV2 = "owlv2"


def od_sam2_video_tracking(
    od_model: ODModels,
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
    fine_tune_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Run an object detector on sampled frames and pass the boxes to the 'sam2' task.

    The selected detector is run on every `chunk_length`-th frame (every frame when
    `chunk_length` is None). The resulting labels and denormalized boxes are sent,
    together with the encoded video, to the "sam2" task endpoint. The returned RLE
    masks are decoded, bounding boxes are derived from the masks, and NMS with an
    IoU threshold of 0.95 is applied per the `nms` helper.

    Parameters:
        od_model (ODModels): The detector used to produce the seed boxes.
        prompt (str): The text prompt forwarded to the detector.
        frames (List[np.ndarray]): The video frames. The size of the first frame is
            used to denormalize every bounding box.
        chunk_length (Optional[int]): Stride, in frames, between detector runs. It is
            also forwarded to the endpoint unchanged. None means every frame.
        fine_tune_id (Optional[str]): Forwarded to the OWLv2 and Florence2 detectors;
            not used for CountGD.

    Returns:
        Dict[str, Any]: A dictionary with three keys:
            "files": the uploaded video as a list of (name, bytes) tuples.
            "return_data": per-frame lists of detections with "label", "mask",
                "score" and the "bbox" added by `add_bboxes_from_masks`.
            "display_data": per-frame lists of detections with "label", "score",
                a denormalized "bbox" and the RLE "mask", intended for tracing.

    Raises:
        ValueError: If `chunk_length` is not positive, or if `frames` is empty.
        NotImplementedError: If `od_model` is not a supported detector.
    """
    if chunk_length is None:
        step = 1  # Process every frame
    elif chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer or None.")
    else:
        step = chunk_length  # Process frames with the specified step size

    # The first frame defines the image size below, so an empty video cannot be
    # processed; fail with a clear message instead of an IndexError later on.
    if len(frames) == 0:
        raise ValueError("frames must contain at least one frame.")

    # Resolve the detector once; it does not depend on the frame being processed.
    if od_model == ODModels.COUNTGD:
        function_name = "countgd_object_detection"
    elif od_model == ODModels.OWLV2:
        function_name = "owl_v2_image"
    elif od_model == ODModels.FLORENCE2:
        function_name = "florence2_sam2_image"
    else:
        raise NotImplementedError(
            f"Object detection model '{od_model}' is not implemented."
        )

    def _detect(frame: np.ndarray) -> List[Dict[str, Any]]:
        """Run the selected detector on a single frame."""
        if od_model == ODModels.COUNTGD:
            return countgd_object_detection(prompt=prompt, image=frame)
        if od_model == ODModels.OWLV2:
            return owl_v2_image(
                prompt=prompt, image=frame, fine_tune_id=fine_tune_id
            )
        return florence2_sam2_image(
            prompt=prompt, image=frame, fine_tune_id=fine_tune_id
        )

    # Frames that are not sampled keep None so positions still line up with `frames`.
    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
    for idx in range(0, len(frames), step):
        results[idx] = _detect(frames[idx])

    image_size = frames[0].shape[:2]

    def _transform_detections(
        input_list: List[Optional[List[Dict[str, Any]]]]
    ) -> List[Optional[Dict[str, Any]]]:
        """Convert per-frame detections into the labels/bboxes payload format."""
        output_list: List[Optional[Dict[str, Any]]] = []
        for frame_detections in input_list:
            if frame_detections is None:
                output_list.append(None)
                continue
            output_list.append(
                {
                    "labels": [det["label"] for det in frame_detections],
                    "bboxes": [
                        denormalize_bbox(det["bbox"], image_size)
                        for det in frame_detections
                    ],
                }
            )
        return output_list

    output = _transform_detections(results)

    buffer_bytes = frames_to_bytes(frames)
    files = [("video", buffer_bytes)]
    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
    metadata = {"function_name": function_name}

    detections = send_task_inference_request(
        payload,
        "sam2",
        files=files,
        metadata=metadata,
    )

    return_data = []
    for frame in detections:
        return_frame_data = []
        for detection in frame:
            mask = rle_decode_array(detection["mask"])
            label = str(detection["id"]) + ": " + detection["label"]
            return_frame_data.append(
                {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
            )
        return_data.append(return_frame_data)
    return_data = add_bboxes_from_masks(return_data)
    return_data = nms(return_data, iou_threshold=0.95)

    # We save the RLE for display purposes, re-calculating RLE can get very expensive.
    # It is deleted from the returned objects because those carry the numpy masks.
    display_data = []
    for frame in return_data:
        display_frame_data = []
        for obj in frame:
            display_frame_data.append(
                {
                    "label": obj["label"],
                    "score": obj["score"],
                    "bbox": denormalize_bbox(obj["bbox"], image_size),
                    "mask": obj["rle"],
                }
            )
            del obj["rle"]
        display_data.append(display_frame_data)

    # Return the display payload built above rather than the raw endpoint response.
    return {"files": files, "return_data": return_data, "display_data": display_data}
234
+
235
+
122
236
  def owl_v2_image(
123
237
  prompt: str,
124
238
  image: np.ndarray,
@@ -302,6 +416,64 @@ def owl_v2_video(
302
416
  return bboxes_formatted
303
417
 
304
418
 
419
def owlv2_sam2_video_tracking(
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
    fine_tune_id: Optional[str] = None,
) -> List[List[Dict[str, Any]]]:
    """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects in a
    video given a text prompt such as category names or referring expressions. The
    categories in the text prompt are separated by commas. It returns, for each frame,
    a list of bounding boxes, label names, masks and associated scores.

    Parameters:
        prompt (str): The prompt to ground to the video.
        frames (List[np.ndarray]): The list of frames to ground the prompt to.
        chunk_length (Optional[int]): The number of frames between two runs of the
            object detector. If None, the detector is run on every frame.
        fine_tune_id (Optional[str]): Forwarded to the OWLv2 detector; presumably
            selects a fine-tuned model.

    Returns:
        List[List[Dict[str, Any]]]: One list per frame, each containing dictionaries
            with the score, label, bounding box, and mask of the detected objects
            with normalized coordinates (xmin, ymin, xmax, ymax). xmin and ymin are
            the coordinates of the top-left and xmax and ymax are the coordinates of
            the bottom-right of the bounding box. The mask is binary 2D numpy array
            where 1 indicates the object and 0 indicates the background. The label
            is prefixed with the id of the tracked object.

    Example
    -------
        >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'score': 1.0,
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    tracking = od_sam2_video_tracking(
        ODModels.OWLV2,
        prompt=prompt,
        frames=frames,
        chunk_length=chunk_length,
        fine_tune_id=fine_tune_id,
    )

    # Record the call for the tool trace before handing the results back.
    tool_name = owlv2_sam2_video_tracking.__name__
    _display_tool_trace(tool_name, {}, tracking["display_data"], tracking["files"])

    return tracking["return_data"]  # type: ignore
475
+
476
+
305
477
  def florence2_sam2_image(
306
478
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
307
479
  ) -> List[Dict[str, Any]]:
@@ -834,6 +1006,59 @@ def countgd_sam2_object_detection(
834
1006
  return seg_ret["return_data"] # type: ignore
835
1007
 
836
1008
 
1009
def countgd_sam2_video_tracking(
    prompt: str,
    frames: List[np.ndarray],
    chunk_length: Optional[int] = 10,
) -> List[List[Dict[str, Any]]]:
    """'countgd_sam2_video_tracking' is a tool that can segment multiple objects in a
    video given a text prompt such as category names or referring expressions. The
    categories in the text prompt are separated by commas. It returns, for each frame,
    a list of bounding boxes, label names, masks and associated scores.

    Parameters:
        prompt (str): The prompt to ground to the video.
        frames (List[np.ndarray]): The list of frames to ground the prompt to.
        chunk_length (Optional[int]): The number of frames between two runs of the
            object detector. If None, the detector is run on every frame.

    Returns:
        List[List[Dict[str, Any]]]: One list per frame, each containing dictionaries
            with the score, label, bounding box, and mask of the detected objects
            with normalized coordinates (xmin, ymin, xmax, ymax). xmin and ymin are
            the coordinates of the top-left and xmax and ymax are the coordinates of
            the bottom-right of the bounding box. The mask is binary 2D numpy array
            where 1 indicates the object and 0 indicates the background. The label
            is prefixed with the id of the tracked object.

    Example
    -------
        >>> countgd_sam2_video_tracking("car, dinosaur", frames)
        [
            [
                {
                    'label': '0: dinosaur',
                    'score': 1.0,
                    'bbox': [0.1, 0.11, 0.35, 0.4],
                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0],
                        ...,
                        [0, 0, 0, ..., 0, 0, 0],
                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                },
            ],
            ...
        ]
    """

    tracking = od_sam2_video_tracking(
        ODModels.COUNTGD,
        prompt=prompt,
        frames=frames,
        chunk_length=chunk_length,
    )

    # Record the call for the tool trace before handing the results back.
    tool_name = countgd_sam2_video_tracking.__name__
    _display_tool_trace(tool_name, {}, tracking["display_data"], tracking["files"])

    return tracking["return_data"]  # type: ignore
1060
+
1061
+
837
1062
  def countgd_example_based_counting(
838
1063
  visual_prompts: List[List[float]],
839
1064
  image: np.ndarray,
@@ -1879,6 +2104,76 @@ def closest_box_distance(
1879
2104
  return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1880
2105
 
1881
2106
 
2107
def document_extraction(image: np.ndarray) -> Dict[str, Any]:
    """'document_extraction' is a tool that can extract structured information out of
    documents with different layouts. It returns the extracted data in a structured
    hierarchical format containing text, tables, pictures, charts, and other
    information.

    Parameters:
        image (np.ndarray): The document image to analyze

    Returns:
        Dict[str, Any]: A dictionary containing the extracted information. Every
            'bbox' entry found in the response is normalized against the image size.

    Example
    -------
        >>> document_extraction(image)
        {'pages':
            [{'bbox': [0, 0, 1.0, 1.0],
              'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
                          'label': 'page_header',
                          'order': 75,
                          'caption': 'Annual Report 2024',
                          'summary': 'This annual report summarizes ...'},
                         {'bbox': [0.2, 0.9, 0.9, 1.0],
                          'label': 'table',
                          'order': 1119,
                          'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'}],
                          'summary': 'This table illustrates a trend of ...'},
                         ],
              }]
        }
    """

    files = [("image", numpy_to_bytes(image))]
    payload = {"model": "document-analysis"}

    data: Dict[str, Any] = send_inference_request(
        payload=payload,
        endpoint_name="document-analysis",
        files=files,
        v2=True,
        metadata_payload={"function_name": "document_analysis"},
    )

    # Trace the response before the in-place normalization below, so the trace
    # does not show normalized bboxes.
    _display_tool_trace(document_extraction.__name__, payload, data, files)

    def normalize(node: Any) -> Any:
        """Recursively normalize every 'bbox' value in place and return the node."""
        if isinstance(node, dict):
            if "bbox" in node:
                node["bbox"] = normalize_bbox(node["bbox"], image.shape[:2])
            # Only existing keys are reassigned, so the dict size never changes
            # while it is being iterated.
            for key in node:
                node[key] = normalize(node[key])
        elif isinstance(node, list):
            for position, item in enumerate(node):
                node[position] = normalize(item)
        return node

    data = normalize(data)
    return data
2175
+
2176
+
1882
2177
  # Utility and visualization functions
1883
2178
 
1884
2179
 
@@ -2395,197 +2690,6 @@ def _plot_counting(
2395
2690
  return image
2396
2691
 
2397
2692
 
2398
- class ODModels(str, Enum):
2399
- COUNTGD = "countgd"
2400
- FLORENCE2 = "florence2"
2401
- OWLV2 = "owlv2"
2402
-
2403
-
2404
- def od_sam2_video_tracking(
2405
- od_model: ODModels,
2406
- prompt: str,
2407
- frames: List[np.ndarray],
2408
- chunk_length: Optional[int] = 10,
2409
- fine_tune_id: Optional[str] = None,
2410
- ) -> List[List[Dict[str, Any]]]:
2411
-
2412
- results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
2413
-
2414
- if chunk_length is None:
2415
- step = 1 # Process every frame
2416
- elif chunk_length <= 0:
2417
- raise ValueError("chunk_length must be a positive integer or None.")
2418
- else:
2419
- step = chunk_length # Process frames with the specified step size
2420
-
2421
- for idx in range(0, len(frames), step):
2422
- if od_model == ODModels.COUNTGD:
2423
- results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
2424
- function_name = "countgd_object_detection"
2425
- elif od_model == ODModels.OWLV2:
2426
- results[idx] = owl_v2_image(
2427
- prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
2428
- )
2429
- function_name = "owl_v2_image"
2430
- elif od_model == ODModels.FLORENCE2:
2431
- results[idx] = florence2_sam2_image(
2432
- prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
2433
- )
2434
- function_name = "florence2_sam2_image"
2435
- else:
2436
- raise NotImplementedError(
2437
- f"Object detection model '{od_model}' is not implemented."
2438
- )
2439
-
2440
- image_size = frames[0].shape[:2]
2441
-
2442
- def _transform_detections(
2443
- input_list: List[Optional[List[Dict[str, Any]]]]
2444
- ) -> List[Optional[Dict[str, Any]]]:
2445
- output_list: List[Optional[Dict[str, Any]]] = []
2446
-
2447
- for idx, frame in enumerate(input_list):
2448
- if frame is not None:
2449
- labels = [detection["label"] for detection in frame]
2450
- bboxes = [
2451
- denormalize_bbox(detection["bbox"], image_size)
2452
- for detection in frame
2453
- ]
2454
-
2455
- output_list.append(
2456
- {
2457
- "labels": labels,
2458
- "bboxes": bboxes,
2459
- }
2460
- )
2461
- else:
2462
- output_list.append(None)
2463
-
2464
- return output_list
2465
-
2466
- output = _transform_detections(results)
2467
-
2468
- buffer_bytes = frames_to_bytes(frames)
2469
- files = [("video", buffer_bytes)]
2470
- payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
2471
- metadata = {"function_name": function_name}
2472
-
2473
- detections = send_task_inference_request(
2474
- payload,
2475
- "sam2",
2476
- files=files,
2477
- metadata=metadata,
2478
- )
2479
-
2480
- return_data = []
2481
- for frame in detections:
2482
- return_frame_data = []
2483
- for detection in frame:
2484
- mask = rle_decode_array(detection["mask"])
2485
- label = str(detection["id"]) + ": " + detection["label"]
2486
- return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
2487
- return_data.append(return_frame_data)
2488
- return_data = add_bboxes_from_masks(return_data)
2489
- return nms(return_data, iou_threshold=0.95)
2490
-
2491
-
2492
- def countgd_sam2_video_tracking(
2493
- prompt: str,
2494
- frames: List[np.ndarray],
2495
- chunk_length: Optional[int] = 10,
2496
- ) -> List[List[Dict[str, Any]]]:
2497
- """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
2498
- prompt such as category names or referring expressions. The categories in the text
2499
- prompt are separated by commas. It returns a list of bounding boxes, label names,
2500
- mask file names and associated probability scores.
2501
-
2502
- Parameters:
2503
- prompt (str): The prompt to ground to the image.
2504
- image (np.ndarray): The image to ground the prompt to.
2505
-
2506
- Returns:
2507
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
2508
- bounding box, and mask of the detected objects with normalized coordinates
2509
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
2510
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
2511
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
2512
- the background.
2513
-
2514
- Example
2515
- -------
2516
- >>> countgd_sam2_video_tracking("car, dinosaur", frames)
2517
- [
2518
- [
2519
- {
2520
- 'label': '0: dinosaur',
2521
- 'bbox': [0.1, 0.11, 0.35, 0.4],
2522
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2523
- [0, 0, 0, ..., 0, 0, 0],
2524
- ...,
2525
- [0, 0, 0, ..., 0, 0, 0],
2526
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2527
- },
2528
- ],
2529
- ...
2530
- ]
2531
- """
2532
-
2533
- return od_sam2_video_tracking(
2534
- ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
2535
- )
2536
-
2537
-
2538
- def owlv2_sam2_video_tracking(
2539
- prompt: str,
2540
- frames: List[np.ndarray],
2541
- chunk_length: Optional[int] = 10,
2542
- fine_tune_id: Optional[str] = None,
2543
- ) -> List[List[Dict[str, Any]]]:
2544
- """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
2545
- prompt such as category names or referring expressions. The categories in the text
2546
- prompt are separated by commas. It returns a list of bounding boxes, label names,
2547
- mask file names and associated probability scores.
2548
-
2549
- Parameters:
2550
- prompt (str): The prompt to ground to the image.
2551
- image (np.ndarray): The image to ground the prompt to.
2552
-
2553
- Returns:
2554
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
2555
- bounding box, and mask of the detected objects with normalized coordinates
2556
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
2557
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
2558
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
2559
- the background.
2560
-
2561
- Example
2562
- -------
2563
- >>> countgd_sam2_video_tracking("car, dinosaur", frames)
2564
- [
2565
- [
2566
- {
2567
- 'label': '0: dinosaur',
2568
- 'bbox': [0.1, 0.11, 0.35, 0.4],
2569
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2570
- [0, 0, 0, ..., 0, 0, 0],
2571
- ...,
2572
- [0, 0, 0, ..., 0, 0, 0],
2573
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2574
- },
2575
- ],
2576
- ...
2577
- ]
2578
- """
2579
-
2580
- return od_sam2_video_tracking(
2581
- ODModels.OWLV2,
2582
- prompt=prompt,
2583
- frames=frames,
2584
- chunk_length=chunk_length,
2585
- fine_tune_id=fine_tune_id,
2586
- )
2587
-
2588
-
2589
2693
  FUNCTION_TOOLS = [
2590
2694
  owl_v2_image,
2591
2695
  owl_v2_video,
@@ -2605,6 +2709,7 @@ FUNCTION_TOOLS = [
2605
2709
  minimum_distance,
2606
2710
  qwen2_vl_images_vqa,
2607
2711
  qwen2_vl_video_vqa,
2712
+ document_extraction,
2608
2713
  video_temporal_localization,
2609
2714
  flux_image_inpainting,
2610
2715
  siglip_classification,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.214
3
+ Version: 0.2.216
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -1,5 +1,5 @@
1
- vision_agent/.sim_tools/df.csv,sha256=la1TzS9GK-oaI4rbJsj2UArr-eiz5qjNc01CEs98acQ,36494
2
- vision_agent/.sim_tools/embs.npy,sha256=N73_ritehDS9S6JnOd7qIR_htu4qvyuq7VLMjlW4N4c,356480
1
+ vision_agent/.sim_tools/df.csv,sha256=nHhcCD55RO9XTiWq_uQ8pHKkVxLXciCHH-SbGPAQEy0,41969
2
+ vision_agent/.sim_tools/embs.npy,sha256=UmnXd2Zv1xBu4a7pxHHf4wOhTLKub629rVX9fAusTxY,393344
3
3
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
4
4
  vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
5
5
  vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=InL8zUTRN8i_9J6r2wAtYdtNrVkElqdO_p-e2OA8q5A,2770
29
+ vision_agent/tools/__init__.py,sha256=qzAqY2WnRLoClz3qiNtupkLtvpPlcGa5ZUCIs21WS7k,2795
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
- vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
31
+ vision_agent/tools/planner_tools.py,sha256=tU1qz_VIQM_yPKDmuxjMWu68ZlAZ7ePWI1g7zswyWhI,13540
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
34
- vision_agent/tools/tools.py,sha256=ZcXEI0Pb54OGXnLWi690SFx22k7JlEmQ-N16LzRLHlk,90627
34
+ vision_agent/tools/tools.py,sha256=n6-UPaZ4XjF29_7EF5GRgx74GjiZ7HqZn4a1Aw-e4P0,94059
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
40
40
  vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
- vision_agent-0.2.214.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-0.2.214.dist-info/METADATA,sha256=H-NMLaCs8bVHlxWxDQzqixEQjbqMOwYk4aYGkE13BqM,19071
45
- vision_agent-0.2.214.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
- vision_agent-0.2.214.dist-info/RECORD,,
43
+ vision_agent-0.2.216.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-0.2.216.dist-info/METADATA,sha256=B88HzV_M0A12EmhiC-968LcdospsiOUUR-aTcZFTH8A,19071
45
+ vision_agent-0.2.216.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
+ vision_agent-0.2.216.dist-info/RECORD,,