vision-agent 0.2.215__py3-none-any.whl → 0.2.217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -444,6 +444,35 @@ desc,doc,name
444
444
  >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
445
445
  'Lionel Messi'
446
446
  ",qwen2_vl_video_vqa
447
+ "'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
448
+ 'document_extraction' is a tool that can extract structured information out of
449
+ documents with different layouts. It returns the extracted data in a structured
450
+ hierarchical format containing text, tables, pictures, charts, and other
451
+ information.
452
+
453
+ Parameters:
454
+ image (np.ndarray): The document image to analyze
455
+
456
+ Returns:
457
+ Dict[str, Any]: A dictionary containing the extracted information.
458
+
459
+ Example
460
+ -------
461
+ >>> document_extraction(image)
462
+ {'pages':
463
+ [{'bbox': [0, 0, 1700, 2200],
464
+ 'chunks': [{'bbox': [1371, 75, 1503, 112],
465
+ 'label': 'page_header',
466
+ 'order': 75,
467
+ 'caption': 'Annual Report 2024',
468
+ 'summary': 'This annual report summarizes ...' },
469
+ {'bbox': [201, 1119, 1497, 1647],
470
+ 'label': 'table',
471
+ 'order': 1119,
472
+ 'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'}],
473
+ 'summary': 'This table illustrates a trend of ...'},
474
+ ],
475
+ ",document_extraction
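A rough sketch of consuming this structure (field names are taken from the example above; the filtering itself is illustrative, not part of the tool):

```python
# Sketch: pull table chunks and their summaries out of an extraction result
result = document_extraction(image)
for page in result["pages"]:
    for chunk in page["chunks"]:
        if chunk["label"] == "table":
            print(chunk["bbox"], chunk.get("summary", ""))
```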
447
476
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
448
477
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames
449
478
  value selected for the video. It can detect multiple objects independently per
@@ -513,6 +542,78 @@ desc,doc,name
513
542
  >>> siglip_classification(image, ['dog', 'cat', 'bird'])
514
543
  {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
515
544
  ",siglip_classification
545
+ "'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
546
+ 'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
547
+ prompt such as category names or referring expressions. The categories in the text
548
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
549
+ masks and associated probability scores.
550
+
551
+ Parameters:
552
+ prompt (str): The prompt to ground to the frames.
553
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
554
+
555
+ Returns:
556
+ List[List[Dict[str, Any]]]: A list, per frame, of dictionaries containing the score, label,
557
+ bounding box, and mask of the detected objects with normalized coordinates
558
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
559
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
560
+ The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
561
+ the background.
562
+
563
+ Example
564
+ -------
565
+ >>> owlv2_sam2_video_tracking(""car, dinosaur"", frames)
566
+ [
567
+ [
568
+ {
569
+ 'label': '0: dinosaur',
570
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
571
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
572
+ [0, 0, 0, ..., 0, 0, 0],
573
+ ...,
574
+ [0, 0, 0, ..., 0, 0, 0],
575
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
576
+ },
577
+ ],
578
+ ...
579
+ ]
580
+ ",owlv2_sam2_video_tracking
581
+ "'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
582
+ 'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
583
+ prompt such as category names or referring expressions. The categories in the text
584
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
585
+ masks and associated probability scores.
586
+
587
+ Parameters:
588
+ prompt (str): The prompt to ground to the frames.
589
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
590
+
591
+ Returns:
592
+ List[List[Dict[str, Any]]]: A list, per frame, of dictionaries containing the score, label,
593
+ bounding box, and mask of the detected objects with normalized coordinates
594
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
595
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
596
+ The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
597
+ the background.
598
+
599
+ Example
600
+ -------
601
+ >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
602
+ [
603
+ [
604
+ {
605
+ 'label': '0: dinosaur',
606
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
607
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
608
+ [0, 0, 0, ..., 0, 0, 0],
609
+ ...,
610
+ [0, 0, 0, ..., 0, 0, 0],
611
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
612
+ },
613
+ ],
614
+ ...
615
+ ]
616
+ ",countgd_sam2_video_tracking
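An end-to-end sketch combining this tool with `extract_frames_and_timestamps` (documented below); the video path and prompt are placeholders:

```python
# Sketch: sample frames from a clip, then track the prompted categories
frames_and_ts = extract_frames_and_timestamps("traffic.mp4", fps=1)
frames = [d["frame"] for d in frames_and_ts]
tracks = countgd_sam2_video_tracking("car, truck", frames, chunk_length=10)
for frame_idx, detections in enumerate(tracks):
    print(frame_idx, [d["label"] for d in detections])
```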
516
617
  "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
517
618
  'extract_frames_and_timestamps' extracts frames and timestamps from a video
518
619
  which can be a file path, url or youtube link, returns a list of dictionaries
Binary file
@@ -32,7 +32,8 @@ from .tools import (
32
32
  countgd_sam2_video_tracking,
33
33
  depth_anything_v2,
34
34
  detr_segmentation,
35
- document_analysis,
35
+ document_extraction,
36
+ document_qa,
36
37
  extract_frames_and_timestamps,
37
38
  florence2_ocr,
38
39
  florence2_phrase_grounding,
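In practice this means callers switch from the old `document_analysis` import to the new pair of tools. A minimal sketch of the updated usage, assuming these names are re-exported from `vision_agent.tools` as this import list suggests (image loading is a placeholder, not part of the diff):

```python
import numpy as np
from PIL import Image

from vision_agent.tools import document_extraction, document_qa  # added in this version range

image = np.array(Image.open("report.png"))            # placeholder document image
layout = document_extraction(image)                    # replaces document_analysis(image)
answer = document_qa("What year does the report cover?", image)
```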
@@ -143,7 +143,14 @@ def run_tool_testing(
143
143
  code = extract_tag(response, "code") # type: ignore
144
144
  if code is None:
145
145
  raise ValueError(f"Could not extract code from response: {response}")
146
- code = process_code(code)
146
+
147
+ # If there's a syntax error with the code, process_code can crash. Executing the
148
+ # code and then sending the error to the LLM should correct it.
149
+ try:
150
+ code = process_code(code)
151
+ except Exception as e:
152
+ _LOGGER.error(f"Error processing code: {e}")
153
+
147
154
  tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
148
155
  tool_output_str = tool_output.text(include_results=False).strip()
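The comment above describes the recovery strategy: if `process_code` raises (for example on malformed code), the raw code is executed anyway so the resulting error text can be handed back to the LLM for correction on a later attempt. A condensed sketch of that flow, using only names visible in this hunk:

```python
# Sketch: tolerate a process_code failure and rely on execution feedback instead
try:
    code = process_code(code)
except Exception as e:
    _LOGGER.error(f"Error processing code: {e}")      # keep the unprocessed code as-is

tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
tool_output_str = tool_output.text(include_results=False).strip()
# Any traceback in tool_output_str is given back to the LLM, which rewrites the
# code; the surrounding retry loop (see the count += 1 hunk below) tries again.
```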
149
156
 
@@ -167,6 +174,7 @@ def run_tool_testing(
167
174
  DefaultImports.prepend_imports(code)
168
175
  )
169
176
  tool_output_str = tool_output.text(include_results=False).strip()
177
+ count += 1
170
178
 
171
179
  return code, tool_docs_str, tool_output
172
180
 
@@ -119,6 +119,120 @@ def _display_tool_trace(
119
119
  display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)
120
120
 
121
121
 
122
+ class ODModels(str, Enum):
123
+ COUNTGD = "countgd"
124
+ FLORENCE2 = "florence2"
125
+ OWLV2 = "owlv2"
126
+
127
+
128
+ def od_sam2_video_tracking(
129
+ od_model: ODModels,
130
+ prompt: str,
131
+ frames: List[np.ndarray],
132
+ chunk_length: Optional[int] = 10,
133
+ fine_tune_id: Optional[str] = None,
134
+ ) -> Dict[str, Any]:
135
+ results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
136
+
137
+ if chunk_length is None:
138
+ step = 1 # Process every frame
139
+ elif chunk_length <= 0:
140
+ raise ValueError("chunk_length must be a positive integer or None.")
141
+ else:
142
+ step = chunk_length # Process frames with the specified step size
143
+
144
+ for idx in range(0, len(frames), step):
145
+ if od_model == ODModels.COUNTGD:
146
+ results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
147
+ function_name = "countgd_object_detection"
148
+ elif od_model == ODModels.OWLV2:
149
+ results[idx] = owl_v2_image(
150
+ prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
151
+ )
152
+ function_name = "owl_v2_image"
153
+ elif od_model == ODModels.FLORENCE2:
154
+ results[idx] = florence2_sam2_image(
155
+ prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
156
+ )
157
+ function_name = "florence2_sam2_image"
158
+ else:
159
+ raise NotImplementedError(
160
+ f"Object detection model '{od_model}' is not implemented."
161
+ )
162
+
163
+ image_size = frames[0].shape[:2]
164
+
165
+ def _transform_detections(
166
+ input_list: List[Optional[List[Dict[str, Any]]]]
167
+ ) -> List[Optional[Dict[str, Any]]]:
168
+ output_list: List[Optional[Dict[str, Any]]] = []
169
+
170
+ for _, frame in enumerate(input_list):
171
+ if frame is not None:
172
+ labels = [detection["label"] for detection in frame]
173
+ bboxes = [
174
+ denormalize_bbox(detection["bbox"], image_size)
175
+ for detection in frame
176
+ ]
177
+
178
+ output_list.append(
179
+ {
180
+ "labels": labels,
181
+ "bboxes": bboxes,
182
+ }
183
+ )
184
+ else:
185
+ output_list.append(None)
186
+
187
+ return output_list
188
+
189
+ output = _transform_detections(results)
190
+
191
+ buffer_bytes = frames_to_bytes(frames)
192
+ files = [("video", buffer_bytes)]
193
+ payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
194
+ metadata = {"function_name": function_name}
195
+
196
+ detections = send_task_inference_request(
197
+ payload,
198
+ "sam2",
199
+ files=files,
200
+ metadata=metadata,
201
+ )
202
+
203
+ return_data = []
204
+ for frame in detections:
205
+ return_frame_data = []
206
+ for detection in frame:
207
+ mask = rle_decode_array(detection["mask"])
208
+ label = str(detection["id"]) + ": " + detection["label"]
209
+ return_frame_data.append(
210
+ {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
211
+ )
212
+ return_data.append(return_frame_data)
213
+ return_data = add_bboxes_from_masks(return_data)
214
+ return_data = nms(return_data, iou_threshold=0.95)
215
+
216
+ # We save the RLE for display purposes; re-calculating RLE can get very expensive.
218
+ # It is deleted here because we return the numpy masks instead.
218
+ display_data = []
219
+ for frame in return_data:
220
+ display_frame_data = []
221
+ for obj in frame:
222
+ display_frame_data.append(
223
+ {
224
+ "label": obj["label"],
225
+ "score": obj["score"],
226
+ "bbox": denormalize_bbox(obj["bbox"], image_size),
227
+ "mask": obj["rle"],
228
+ }
229
+ )
230
+ del obj["rle"]
231
+ display_data.append(display_frame_data)
232
+
233
+ return {"files": files, "return_data": return_data, "display_data": display_data}
234
+
235
+
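The chunking logic above only runs the object detector on every `chunk_length`-th frame and lets SAM2 propagate those detections across the rest of the clip. A small self-contained illustration of which frame indices get a fresh detection pass (it mirrors the step selection above; the helper itself is not part of the package):

```python
from typing import List, Optional

def detection_indices(num_frames: int, chunk_length: Optional[int]) -> List[int]:
    # Mirrors od_sam2_video_tracking's step selection
    if chunk_length is None:
        step = 1                      # detect on every frame
    elif chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer or None.")
    else:
        step = chunk_length
    return list(range(0, num_frames, step))

print(detection_indices(25, 10))      # [0, 10, 20]; SAM2 tracks the frames between
print(detection_indices(25, None))    # every index 0..24 is re-detected
```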
122
236
  def owl_v2_image(
123
237
  prompt: str,
124
238
  image: np.ndarray,
@@ -302,6 +416,64 @@ def owl_v2_video(
302
416
  return bboxes_formatted
303
417
 
304
418
 
419
+ def owlv2_sam2_video_tracking(
420
+ prompt: str,
421
+ frames: List[np.ndarray],
422
+ chunk_length: Optional[int] = 10,
423
+ fine_tune_id: Optional[str] = None,
424
+ ) -> List[List[Dict[str, Any]]]:
425
+ """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
426
+ prompt such as category names or referring expressions. The categories in the text
427
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
428
+ masks and associated probability scores.
429
+
430
+ Parameters:
431
+ prompt (str): The prompt to ground to the frames.
432
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
433
+
434
+ Returns:
435
+ List[List[Dict[str, Any]]]: A list, per frame, of dictionaries containing the score, label,
436
+ bounding box, and mask of the detected objects with normalized coordinates
437
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
438
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
439
+ The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
440
+ the background.
441
+
442
+ Example
443
+ -------
444
+ >>> owlv2_sam2_video_tracking("car, dinosaur", frames)
445
+ [
446
+ [
447
+ {
448
+ 'label': '0: dinosaur',
449
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
450
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
451
+ [0, 0, 0, ..., 0, 0, 0],
452
+ ...,
453
+ [0, 0, 0, ..., 0, 0, 0],
454
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
455
+ },
456
+ ],
457
+ ...
458
+ ]
459
+ """
460
+
461
+ ret = od_sam2_video_tracking(
462
+ ODModels.OWLV2,
463
+ prompt=prompt,
464
+ frames=frames,
465
+ chunk_length=chunk_length,
466
+ fine_tune_id=fine_tune_id,
467
+ )
468
+ _display_tool_trace(
469
+ owlv2_sam2_video_tracking.__name__,
470
+ {},
471
+ ret["display_data"],
472
+ ret["files"],
473
+ )
474
+ return ret["return_data"] # type: ignore
475
+
476
+
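A short sketch of consuming the per-frame output, for example measuring object areas from the returned binary masks (the prompt and `frames` are placeholders):

```python
# Sketch: run the tracker and report mask area per tracked object per frame
tracks = owlv2_sam2_video_tracking("person", frames, chunk_length=10)
for frame_idx, detections in enumerate(tracks):
    for det in detections:
        area = int(det["mask"].sum())             # mask is a binary 2D uint8 array
        print(frame_idx, det["label"], det["bbox"], area)
```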
305
477
  def florence2_sam2_image(
306
478
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
307
479
  ) -> List[Dict[str, Any]]:
@@ -834,6 +1006,59 @@ def countgd_sam2_object_detection(
834
1006
  return seg_ret["return_data"] # type: ignore
835
1007
 
836
1008
 
1009
+ def countgd_sam2_video_tracking(
1010
+ prompt: str,
1011
+ frames: List[np.ndarray],
1012
+ chunk_length: Optional[int] = 10,
1013
+ ) -> List[List[Dict[str, Any]]]:
1014
+ """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
1015
+ prompt such as category names or referring expressions. The categories in the text
1016
+ prompt are separated by commas. It returns a list of bounding boxes, label names,
1017
+ masks and associated probability scores.
1018
+
1019
+ Parameters:
1020
+ prompt (str): The prompt to ground to the image.
1021
+ image (np.ndarray): The image to ground the prompt to.
1022
+
1023
+ Returns:
1024
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label,
1025
+ bounding box, and mask of the detected objects with normalized coordinates
1026
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
1027
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
1028
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
1029
+ the background.
1030
+
1031
+ Example
1032
+ -------
1033
+ >>> countgd_sam2_video_tracking("car, dinosaur", frames)
1034
+ [
1035
+ [
1036
+ {
1037
+ 'label': '0: dinosaur',
1038
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
1039
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
1040
+ [0, 0, 0, ..., 0, 0, 0],
1041
+ ...,
1042
+ [0, 0, 0, ..., 0, 0, 0],
1043
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
1044
+ },
1045
+ ],
1046
+ ...
1047
+ ]
1048
+ """
1049
+
1050
+ ret = od_sam2_video_tracking(
1051
+ ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
1052
+ )
1053
+ _display_tool_trace(
1054
+ countgd_sam2_video_tracking.__name__,
1055
+ {},
1056
+ ret["display_data"],
1057
+ ret["files"],
1058
+ )
1059
+ return ret["return_data"] # type: ignore
1060
+
1061
+
837
1062
  def countgd_example_based_counting(
838
1063
  visual_prompts: List[List[float]],
839
1064
  image: np.ndarray,
@@ -1879,11 +2104,11 @@ def closest_box_distance(
1879
2104
  return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1880
2105
 
1881
2106
 
1882
- def document_analysis(image: np.ndarray) -> Dict[str, Any]:
1883
- """'document_analysis' is an understanding tool that can handle various
1884
- types of document image layouts. It returns a structured output containing the text,
1885
- tables, pictures, charts and information caption, summary, labels, bounding boxes, etc
1886
- avoiding information loss.
2107
+ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
2108
+ """'document_extraction' is a tool that can extract structured information out of
2109
+ documents with different layouts. It returns the extracted data in a structured
2110
+ hierarchical format containing text, tables, pictures, charts, and other
2111
+ information.
1887
2112
 
1888
2113
  Parameters:
1889
2114
  image (np.ndarray): The document image to analyze
@@ -1894,20 +2119,18 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
1894
2119
  Example
1895
2120
  -------
1896
2121
  >>> document_extraction(image)
1897
- {'pages': [{'bbox': [left_0, top_0, right_0, bottom_0],
1898
- 'chunks': [{'bbox': [left_1, top_1, right_1, bottom_1],
1899
- 'caption': 'TITLE',
2122
+ {'pages':
2123
+ [{'bbox': [0, 0, 1.0, 1.0],
2124
+ 'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
1900
2125
  'label': 'page_header',
1901
- 'summary': 'The image contains a single word ...' },
1902
- {'bbox': [left_2, top_2, right_2, bottom_2],
1903
- 'caption': {'data': [{'value': 200, 'year': '2024' ...},
1904
- 'title': 'Total CapEx Spending',
1905
- 'type': 'bar chart',
1906
- 'unit': 'Billion USD',
1907
- 'xAxis': 'Year',
1908
- 'yAxis': 'Total CapEx Spending'},
1909
- 'label': 'picture',
1910
- 'summary': 'This bar chart illustrates the trend of ...'},
2126
+ 'order': 75,
2127
+ 'caption': 'Annual Report 2024',
2128
+ 'summary': 'This annual report summarizes ...' },
2129
+ {'bbox': [0.2, 0.9, 0.9, 1.0],
2130
+ 'label': 'table',
2131
+ 'order': 1119,
2132
+ 'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'}],
2133
+ 'summary': 'This table illustrates a trend of ...'},
1911
2134
  ],
1912
2135
  """
1913
2136
 
@@ -1919,7 +2142,7 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
1919
2142
  "model": "document-analysis",
1920
2143
  }
1921
2144
 
1922
- response: dict[str, Any] = send_inference_request(
2145
+ data: Dict[str, Any] = send_inference_request(
1923
2146
  payload=payload,
1924
2147
  endpoint_name="document-analysis",
1925
2148
  files=files,
@@ -1927,14 +2150,99 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
1927
2150
  metadata_payload={"function_name": "document_analysis"},
1928
2151
  )
1929
2152
 
2153
+ # display the raw response; bboxes are normalized below
1930
2154
  _display_tool_trace(
1931
- document_analysis.__name__,
2155
+ document_extraction.__name__,
1932
2156
  payload,
1933
- response,
2157
+ data,
1934
2158
  files,
1935
2159
  )
1936
2160
 
1937
- return response
2161
+ def normalize(data: Any) -> Dict[str, Any]:
2162
+ if isinstance(data, Dict):
2163
+ if "bbox" in data:
2164
+ data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
2165
+ for key in data:
2166
+ data[key] = normalize(data[key])
2167
+ elif isinstance(data, List):
2168
+ for i in range(len(data)):
2169
+ data[i] = normalize(data[i])
2170
+ return data # type: ignore
2171
+
2172
+ data = normalize(data)
2173
+
2174
+ return data
2175
+
2176
+
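For intuition, the recursive `normalize` walk above rewrites every `bbox` it encounters from pixel coordinates into the 0-1 range via `normalize_bbox` (x values divided by image width, y values by height). A toy illustration with made-up values:

```python
# Toy example (values invented) of the effect of the recursive normalize pass
page = {
    "bbox": [0, 0, 1700, 2200],                               # W=1700, H=2200 pixels
    "chunks": [{"bbox": [201, 1119, 1497, 1647], "label": "table"}],
}
normalized = normalize(page)
# -> {"bbox": [0.0, 0.0, 1.0, 1.0],
#     "chunks": [{"bbox": [~0.12, ~0.51, ~0.88, ~0.75], "label": "table"}]}
```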
2177
+ def document_qa(
2178
+ prompt: str,
2179
+ image: np.ndarray,
2180
+ ) -> str:
2181
+ """'document_qa' is a tool that can answer any questions about arbitrary
2182
+ images of documents or presentations. It answers by analyzing the contextual document data
2183
+ and then using a model to answer specific questions. It returns text as an answer to the question.
2184
+
2185
+ Parameters:
2186
+ prompt (str): The question to be answered about the document image
2187
+ image (np.ndarray): The document image to analyze
2188
+
2189
+ Returns:
2190
+ str: The answer to the question based on the document's context.
2191
+
2192
+ Example
2193
+ -------
2194
+ >>> document_qa(question, image)
2195
+ 'The answer to the question ...'
2196
+ """
2197
+
2198
+ image_file = numpy_to_bytes(image)
2199
+
2200
+ files = [("image", image_file)]
2201
+
2202
+ payload = {
2203
+ "model": "document-analysis",
2204
+ }
2205
+
2206
+ data: dict[str, Any] = send_inference_request(
2207
+ payload=payload,
2208
+ endpoint_name="document-analysis",
2209
+ files=files,
2210
+ v2=True,
2211
+ metadata_payload={"function_name": "document_qa"},
2212
+ )
2213
+
2214
+ def normalize(data: Any) -> Dict[str, Any]:
2215
+ if isinstance(data, Dict):
2216
+ if "bbox" in data:
2217
+ data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
2218
+ for key in data:
2219
+ data[key] = normalize(data[key])
2220
+ elif isinstance(data, List):
2221
+ for i in range(len(data)):
2222
+ data[i] = normalize(data[i])
2223
+ return data # type: ignore
2224
+
2225
+ data = normalize(data)
2226
+
2227
+ prompt = f"""
2228
+ Document Context:
2229
+ {data}\n
2230
+ Question: {prompt}\n
2231
+ Please provide a clear, concise answer using only the information from the document. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
2232
+ """
2233
+
2234
+ lmm = AnthropicLMM()
2235
+ llm_output = lmm.generate(prompt=prompt)
2236
+ llm_output = cast(str, llm_output)
2237
+
2238
+ _display_tool_trace(
2239
+ document_qa.__name__,
2240
+ payload,
2241
+ llm_output,
2242
+ files,
2243
+ )
2244
+
2245
+ return llm_output
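Note that `document_qa` re-runs the document-analysis request on every call before querying the Anthropic model. When asking several questions about the same document, a cheaper pattern is to extract once and reuse the context; a sketch, assuming `AnthropicLMM` is importable from `vision_agent.lmm` as used above:

```python
from vision_agent.lmm import AnthropicLMM
from vision_agent.tools import document_extraction

context = document_extraction(image)            # single extraction pass
lmm = AnthropicLMM()
for question in ["What year is covered?", "Who is the auditor?"]:
    prompt = (
        f"Document Context:\n{context}\n\n"
        f"Question: {question}\n"
        "Answer using only information from the document."
    )
    print(lmm.generate(prompt=prompt))
```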
1938
2246
 
1939
2247
 
1940
2248
  # Utility and visualization functions
@@ -2453,197 +2761,6 @@ def _plot_counting(
2453
2761
  return image
2454
2762
 
2455
2763
 
2456
- class ODModels(str, Enum):
2457
- COUNTGD = "countgd"
2458
- FLORENCE2 = "florence2"
2459
- OWLV2 = "owlv2"
2460
-
2461
-
2462
- def od_sam2_video_tracking(
2463
- od_model: ODModels,
2464
- prompt: str,
2465
- frames: List[np.ndarray],
2466
- chunk_length: Optional[int] = 10,
2467
- fine_tune_id: Optional[str] = None,
2468
- ) -> List[List[Dict[str, Any]]]:
2469
-
2470
- results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
2471
-
2472
- if chunk_length is None:
2473
- step = 1 # Process every frame
2474
- elif chunk_length <= 0:
2475
- raise ValueError("chunk_length must be a positive integer or None.")
2476
- else:
2477
- step = chunk_length # Process frames with the specified step size
2478
-
2479
- for idx in range(0, len(frames), step):
2480
- if od_model == ODModels.COUNTGD:
2481
- results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
2482
- function_name = "countgd_object_detection"
2483
- elif od_model == ODModels.OWLV2:
2484
- results[idx] = owl_v2_image(
2485
- prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
2486
- )
2487
- function_name = "owl_v2_image"
2488
- elif od_model == ODModels.FLORENCE2:
2489
- results[idx] = florence2_sam2_image(
2490
- prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
2491
- )
2492
- function_name = "florence2_sam2_image"
2493
- else:
2494
- raise NotImplementedError(
2495
- f"Object detection model '{od_model}' is not implemented."
2496
- )
2497
-
2498
- image_size = frames[0].shape[:2]
2499
-
2500
- def _transform_detections(
2501
- input_list: List[Optional[List[Dict[str, Any]]]]
2502
- ) -> List[Optional[Dict[str, Any]]]:
2503
- output_list: List[Optional[Dict[str, Any]]] = []
2504
-
2505
- for idx, frame in enumerate(input_list):
2506
- if frame is not None:
2507
- labels = [detection["label"] for detection in frame]
2508
- bboxes = [
2509
- denormalize_bbox(detection["bbox"], image_size)
2510
- for detection in frame
2511
- ]
2512
-
2513
- output_list.append(
2514
- {
2515
- "labels": labels,
2516
- "bboxes": bboxes,
2517
- }
2518
- )
2519
- else:
2520
- output_list.append(None)
2521
-
2522
- return output_list
2523
-
2524
- output = _transform_detections(results)
2525
-
2526
- buffer_bytes = frames_to_bytes(frames)
2527
- files = [("video", buffer_bytes)]
2528
- payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
2529
- metadata = {"function_name": function_name}
2530
-
2531
- detections = send_task_inference_request(
2532
- payload,
2533
- "sam2",
2534
- files=files,
2535
- metadata=metadata,
2536
- )
2537
-
2538
- return_data = []
2539
- for frame in detections:
2540
- return_frame_data = []
2541
- for detection in frame:
2542
- mask = rle_decode_array(detection["mask"])
2543
- label = str(detection["id"]) + ": " + detection["label"]
2544
- return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
2545
- return_data.append(return_frame_data)
2546
- return_data = add_bboxes_from_masks(return_data)
2547
- return nms(return_data, iou_threshold=0.95)
2548
-
2549
-
2550
- def countgd_sam2_video_tracking(
2551
- prompt: str,
2552
- frames: List[np.ndarray],
2553
- chunk_length: Optional[int] = 10,
2554
- ) -> List[List[Dict[str, Any]]]:
2555
- """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
2556
- prompt such as category names or referring expressions. The categories in the text
2557
- prompt are separated by commas. It returns a list of bounding boxes, label names,
2558
- mask file names and associated probability scores.
2559
-
2560
- Parameters:
2561
- prompt (str): The prompt to ground to the image.
2562
- image (np.ndarray): The image to ground the prompt to.
2563
-
2564
- Returns:
2565
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
2566
- bounding box, and mask of the detected objects with normalized coordinates
2567
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
2568
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
2569
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
2570
- the background.
2571
-
2572
- Example
2573
- -------
2574
- >>> countgd_sam2_video_tracking("car, dinosaur", frames)
2575
- [
2576
- [
2577
- {
2578
- 'label': '0: dinosaur',
2579
- 'bbox': [0.1, 0.11, 0.35, 0.4],
2580
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2581
- [0, 0, 0, ..., 0, 0, 0],
2582
- ...,
2583
- [0, 0, 0, ..., 0, 0, 0],
2584
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2585
- },
2586
- ],
2587
- ...
2588
- ]
2589
- """
2590
-
2591
- return od_sam2_video_tracking(
2592
- ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
2593
- )
2594
-
2595
-
2596
- def owlv2_sam2_video_tracking(
2597
- prompt: str,
2598
- frames: List[np.ndarray],
2599
- chunk_length: Optional[int] = 10,
2600
- fine_tune_id: Optional[str] = None,
2601
- ) -> List[List[Dict[str, Any]]]:
2602
- """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
2603
- prompt such as category names or referring expressions. The categories in the text
2604
- prompt are separated by commas. It returns a list of bounding boxes, label names,
2605
- mask file names and associated probability scores.
2606
-
2607
- Parameters:
2608
- prompt (str): The prompt to ground to the image.
2609
- image (np.ndarray): The image to ground the prompt to.
2610
-
2611
- Returns:
2612
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
2613
- bounding box, and mask of the detected objects with normalized coordinates
2614
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
2615
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
2616
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
2617
- the background.
2618
-
2619
- Example
2620
- -------
2621
- >>> countgd_sam2_video_tracking("car, dinosaur", frames)
2622
- [
2623
- [
2624
- {
2625
- 'label': '0: dinosaur',
2626
- 'bbox': [0.1, 0.11, 0.35, 0.4],
2627
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
2628
- [0, 0, 0, ..., 0, 0, 0],
2629
- ...,
2630
- [0, 0, 0, ..., 0, 0, 0],
2631
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
2632
- },
2633
- ],
2634
- ...
2635
- ]
2636
- """
2637
-
2638
- return od_sam2_video_tracking(
2639
- ODModels.OWLV2,
2640
- prompt=prompt,
2641
- frames=frames,
2642
- chunk_length=chunk_length,
2643
- fine_tune_id=fine_tune_id,
2644
- )
2645
-
2646
-
2647
2764
  FUNCTION_TOOLS = [
2648
2765
  owl_v2_image,
2649
2766
  owl_v2_video,
@@ -2663,6 +2780,7 @@ FUNCTION_TOOLS = [
2663
2780
  minimum_distance,
2664
2781
  qwen2_vl_images_vqa,
2665
2782
  qwen2_vl_video_vqa,
2783
+ document_extraction,
2666
2784
  video_temporal_localization,
2667
2785
  flux_image_inpainting,
2668
2786
  siglip_classification,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.215
3
+ Version: 0.2.217
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -1,5 +1,5 @@
1
- vision_agent/.sim_tools/df.csv,sha256=la1TzS9GK-oaI4rbJsj2UArr-eiz5qjNc01CEs98acQ,36494
2
- vision_agent/.sim_tools/embs.npy,sha256=N73_ritehDS9S6JnOd7qIR_htu4qvyuq7VLMjlW4N4c,356480
1
+ vision_agent/.sim_tools/df.csv,sha256=nHhcCD55RO9XTiWq_uQ8pHKkVxLXciCHH-SbGPAQEy0,41969
2
+ vision_agent/.sim_tools/embs.npy,sha256=UmnXd2Zv1xBu4a7pxHHf4wOhTLKub629rVX9fAusTxY,393344
3
3
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
4
4
  vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
5
5
  vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
26
26
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
27
27
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
28
28
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
29
- vision_agent/tools/__init__.py,sha256=Ny522Y4h1xDQTW6kBP_ceUM4jc0Y14dRhcHdtMDdr24,2793
29
+ vision_agent/tools/__init__.py,sha256=Jdq34jMw_KuYwk4Wexqm4DRjuLLoL1Q8wukm0NBv1Tc,2812
30
30
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
31
- vision_agent/tools/planner_tools.py,sha256=k7PPu-HhwDwusQgFSPTCWKRVVHBzPMeYB6h2xSEjdUo,13273
31
+ vision_agent/tools/planner_tools.py,sha256=tU1qz_VIQM_yPKDmuxjMWu68ZlAZ7ePWI1g7zswyWhI,13540
32
32
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
33
33
  vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
34
- vision_agent/tools/tools.py,sha256=xzN1uOkVQ9l1MaXsJxT_VlDp6nLQfdBX04kex_jE0fc,92692
34
+ vision_agent/tools/tools.py,sha256=Xcm_9EQdDCR9X5FhIm7VJaTL0qWqhnJUVTRVrRtETrA,96112
35
35
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
36
36
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
37
37
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
40
40
  vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
41
41
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
42
42
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
43
- vision_agent-0.2.215.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
- vision_agent-0.2.215.dist-info/METADATA,sha256=nSGpnpDpzJmWmGYDSShBvfjD5dbB6ZWSgOXGQ2Ci_yM,19071
45
- vision_agent-0.2.215.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
- vision_agent-0.2.215.dist-info/RECORD,,
43
+ vision_agent-0.2.217.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
44
+ vision_agent-0.2.217.dist-info/METADATA,sha256=xl9AmXP9RBpC5frlASsiG7YktdIOTRuJgv8WZdRV_bA,19071
45
+ vision_agent-0.2.217.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
46
+ vision_agent-0.2.217.dist-info/RECORD,,