vision-agent 0.2.215__tar.gz → 0.2.216__tar.gz

Files changed (46)
  1. {vision_agent-0.2.215 → vision_agent-0.2.216}/PKG-INFO +1 -1
  2. {vision_agent-0.2.215 → vision_agent-0.2.216}/pyproject.toml +1 -1
  3. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/.sim_tools/df.csv +101 -0
  4. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/.sim_tools/embs.npy +0 -0
  5. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/tools/__init__.py +1 -1
  6. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/tools/planner_tools.py +9 -1
  7. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/tools/tools.py +260 -213
  8. {vision_agent-0.2.215 → vision_agent-0.2.216}/LICENSE +0 -0
  9. {vision_agent-0.2.215 → vision_agent-0.2.216}/README.md +0 -0
  10. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/__init__.py +0 -0
  11. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/README.md +0 -0
  12. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/__init__.py +0 -0
  13. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/agent.py +0 -0
  14. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/agent_utils.py +0 -0
  15. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/types.py +0 -0
  16. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent.py +0 -0
  17. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_coder.py +0 -0
  18. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  19. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  20. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  21. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_planner.py +0 -0
  22. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  23. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  24. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  25. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_prompts.py +0 -0
  26. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  27. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/agent/vision_agent_v2.py +0 -0
  28. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/clients/__init__.py +0 -0
  29. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/clients/http.py +0 -0
  30. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/clients/landing_public_api.py +0 -0
  31. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/fonts/__init__.py +0 -0
  32. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  33. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/lmm/__init__.py +0 -0
  34. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/lmm/lmm.py +0 -0
  35. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/lmm/types.py +0 -0
  36. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/tools/meta_tools.py +0 -0
  37. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/tools/prompts.py +0 -0
  38. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/tools/tool_utils.py +0 -0
  39. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/tools/tools_types.py +0 -0
  40. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/utils/__init__.py +0 -0
  41. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/utils/exceptions.py +0 -0
  42. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/utils/execute.py +0 -0
  43. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/utils/image_utils.py +0 -0
  44. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/utils/sim.py +0 -0
  45. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/utils/type_defs.py +0 -0
  46. {vision_agent-0.2.215 → vision_agent-0.2.216}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.215
+ Version: 0.2.216
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.215"
+ version = "0.2.216"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
@@ -444,6 +444,35 @@ desc,doc,name
      >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
      'Lionel Messi'
  ",qwen2_vl_video_vqa
+ "'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
+     'document_extraction' is a tool that can extract structured information out of
+     documents with different layouts. It returns the extracted data in a structured
+     hierarchical format containing text, tables, pictures, charts, and other
+     information.
+ 
+     Parameters:
+         image (np.ndarray): The document image to analyze
+ 
+     Returns:
+         Dict[str, Any]: A dictionary containing the extracted information.
+ 
+     Example
+     -------
+     >>> document_analysis(image)
+     {'pages':
+       [{'bbox': [0, 0, 1700, 2200],
+         'chunks': [{'bbox': [1371, 75, 1503, 112],
+                     'label': 'page_header',
+                     'order': 75
+                     'caption': 'Annual Report 2024',
+                     'summary': 'This annual report summarizes ...' },
+                    {'bbox': [201, 1119, 1497, 1647],
+                     'label': table',
+                     'order': 1119,
+                     'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
+                     'summary': 'This table illustrates a trend of ...'},
+         ],
+ ",document_extraction
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
      'video_temporal_localization' will run qwen2vl on each chunk_length_frames
      value selected for the video. It can detect multiple objects independently per
@@ -513,6 +542,78 @@ desc,doc,name
      >>> siglip_classification(image, ['dog', 'cat', 'bird'])
      {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
  ",siglip_classification
+ "'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+     'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+     prompt such as category names or referring expressions. The categories in the text
+     prompt are separated by commas. It returns a list of bounding boxes, label names,
+     mask file names and associated probability scores.
+ 
+     Parameters:
+         prompt (str): The prompt to ground to the image.
+         image (np.ndarray): The image to ground the prompt to.
+ 
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+             bounding box, and mask of the detected objects with normalized coordinates
+             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+             the background.
+ 
+     Example
+     -------
+     >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
+     [
+         [
+             {
+                 'label': '0: dinosaur',
+                 'bbox': [0.1, 0.11, 0.35, 0.4],
+                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0],
+                     ...,
+                     [0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+             },
+         ],
+         ...
+     ]
+ ",owlv2_sam2_video_tracking
+ "'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+     'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+     prompt such as category names or referring expressions. The categories in the text
+     prompt are separated by commas. It returns a list of bounding boxes, label names,
+     mask file names and associated probability scores.
+ 
+     Parameters:
+         prompt (str): The prompt to ground to the image.
+         image (np.ndarray): The image to ground the prompt to.
+ 
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+             bounding box, and mask of the detected objects with normalized coordinates
+             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+             the background.
+ 
+     Example
+     -------
+     >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
+     [
+         [
+             {
+                 'label': '0: dinosaur',
+                 'bbox': [0.1, 0.11, 0.35, 0.4],
+                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0],
+                     ...,
+                     [0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+             },
+         ],
+         ...
+     ]
+ ",countgd_sam2_video_tracking
  "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
      'extract_frames_and_timestamps' extracts frames and timestamps from a video
      which can be a file path, url or youtube link, returns a list of dictionaries
@@ -32,7 +32,7 @@ from .tools import (
      countgd_sam2_video_tracking,
      depth_anything_v2,
      detr_segmentation,
-     document_analysis,
+     document_extraction,
      extract_frames_and_timestamps,
      florence2_ocr,
      florence2_phrase_grounding,
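
Note that document_analysis is no longer exported from vision_agent.tools, so downstream code has to switch to the new name. A minimal sketch of the updated import (this assumes no backwards-compatible alias is kept, since none appears in this diff):

    # 0.2.215
    from vision_agent.tools import document_analysis

    # 0.2.216 - the tool was renamed, so imports need updating
    from vision_agent.tools import document_extraction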
@@ -143,7 +143,14 @@ def run_tool_testing(
      code = extract_tag(response, "code")  # type: ignore
      if code is None:
          raise ValueError(f"Could not extract code from response: {response}")
-     code = process_code(code)
+ 
+     # If there's a syntax error with the code, process_code can crash. Executing the
+     # code and then sending the error to the LLM should correct it.
+     try:
+         code = process_code(code)
+     except Exception as e:
+         _LOGGER.error(f"Error processing code: {e}")
+ 
      tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
      tool_output_str = tool_output.text(include_results=False).strip()

@@ -167,6 +174,7 @@ def run_tool_testing(
              DefaultImports.prepend_imports(code)
          )
          tool_output_str = tool_output.text(include_results=False).strip()
+         count += 1

      return code, tool_docs_str, tool_output

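Taken together, the two planner_tools.py changes make tool testing more forgiving: process_code becomes best-effort, and the loop that re-runs failing code finally advances its counter. The surrounding retry loop is not shown in this diff, so the following is a self-contained toy of the execute-then-repair pattern, with stand-in helpers, not the real planner_tools API:

    # Toy sketch of run_tool_testing's execute-and-retry flow (all names are illustrative stand-ins).
    import ast
    import logging
    from typing import Optional

    _LOGGER = logging.getLogger(__name__)

    def process_code(code: str) -> str:
        ast.parse(code)  # raises on syntax errors, like the real helper can
        return code

    def execute(code: str) -> Optional[str]:
        try:
            exec(compile(code, "<test>", "exec"), {})
            return None  # ran cleanly
        except Exception as e:
            return str(e)  # error text that would be fed back to the LLM

    def repair(code: str, error: str) -> str:
        return code.replace("1/0", "1")  # stand-in for an LLM-generated fix

    def run_tool_test(code: str, max_retries: int = 3) -> str:
        try:
            code = process_code(code)  # best-effort, as in the 0.2.216 change
        except Exception as e:
            _LOGGER.error(f"Error processing code: {e}")

        error = execute(code)
        count = 0
        while error is not None and count < max_retries:
            code = repair(code, error)
            error = execute(code)
            count += 1  # the increment added in this release keeps the retries bounded
        return code

    print(run_tool_test("x = 1/0"))  # -> "x = 1"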
@@ -119,6 +119,120 @@ def _display_tool_trace(
      display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)


+ class ODModels(str, Enum):
+     COUNTGD = "countgd"
+     FLORENCE2 = "florence2"
+     OWLV2 = "owlv2"
+ 
+ 
+ def od_sam2_video_tracking(
+     od_model: ODModels,
+     prompt: str,
+     frames: List[np.ndarray],
+     chunk_length: Optional[int] = 10,
+     fine_tune_id: Optional[str] = None,
+ ) -> Dict[str, Any]:
+     results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+ 
+     if chunk_length is None:
+         step = 1  # Process every frame
+     elif chunk_length <= 0:
+         raise ValueError("chunk_length must be a positive integer or None.")
+     else:
+         step = chunk_length  # Process frames with the specified step size
+ 
+     for idx in range(0, len(frames), step):
+         if od_model == ODModels.COUNTGD:
+             results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+             function_name = "countgd_object_detection"
+         elif od_model == ODModels.OWLV2:
+             results[idx] = owl_v2_image(
+                 prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+             )
+             function_name = "owl_v2_image"
+         elif od_model == ODModels.FLORENCE2:
+             results[idx] = florence2_sam2_image(
+                 prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+             )
+             function_name = "florence2_sam2_image"
+         else:
+             raise NotImplementedError(
+                 f"Object detection model '{od_model}' is not implemented."
+             )
+ 
+     image_size = frames[0].shape[:2]
+ 
+     def _transform_detections(
+         input_list: List[Optional[List[Dict[str, Any]]]]
+     ) -> List[Optional[Dict[str, Any]]]:
+         output_list: List[Optional[Dict[str, Any]]] = []
+ 
+         for _, frame in enumerate(input_list):
+             if frame is not None:
+                 labels = [detection["label"] for detection in frame]
+                 bboxes = [
+                     denormalize_bbox(detection["bbox"], image_size)
+                     for detection in frame
+                 ]
+ 
+                 output_list.append(
+                     {
+                         "labels": labels,
+                         "bboxes": bboxes,
+                     }
+                 )
+             else:
+                 output_list.append(None)
+ 
+         return output_list
+ 
+     output = _transform_detections(results)
+ 
+     buffer_bytes = frames_to_bytes(frames)
+     files = [("video", buffer_bytes)]
+     payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
+     metadata = {"function_name": function_name}
+ 
+     detections = send_task_inference_request(
+         payload,
+         "sam2",
+         files=files,
+         metadata=metadata,
+     )
+ 
+     return_data = []
+     for frame in detections:
+         return_frame_data = []
+         for detection in frame:
+             mask = rle_decode_array(detection["mask"])
+             label = str(detection["id"]) + ": " + detection["label"]
+             return_frame_data.append(
+                 {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
+             )
+         return_data.append(return_frame_data)
+     return_data = add_bboxes_from_masks(return_data)
+     return_data = nms(return_data, iou_threshold=0.95)
+ 
+     # We save the RLE for display purposes, re-calculting RLE can get very expensive.
+     # Deleted here because we are returning the numpy masks instead
+     display_data = []
+     for frame in return_data:
+         display_frame_data = []
+         for obj in frame:
+             display_frame_data.append(
+                 {
+                     "label": obj["label"],
+                     "score": obj["score"],
+                     "bbox": denormalize_bbox(obj["bbox"], image_size),
+                     "mask": obj["rle"],
+                 }
+             )
+             del obj["rle"]
+         display_data.append(display_frame_data)
+ 
+     return {"files": files, "return_data": return_data, "display_data": detections}
+ 
+ 
  def owl_v2_image(
      prompt: str,
      image: np.ndarray,
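
The new od_sam2_video_tracking helper only runs the chosen detector on every chunk_length-th frame and lets the SAM2 endpoint propagate masks across the frames in between. A small, self-contained illustration of which frame indices receive a fresh detection under the default chunk_length=10 (the frame count is illustrative):

    # Frames that get a fresh object detection before SAM2 fills in the gaps.
    num_frames = 25
    chunk_length = 10  # default in od_sam2_video_tracking
    step = 1 if chunk_length is None else chunk_length
    detection_frames = list(range(0, num_frames, step))
    print(detection_frames)  # [0, 10, 20] -> the detector runs 3 times instead of 25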
@@ -302,6 +416,64 @@ def owl_v2_video(
      return bboxes_formatted


+ def owlv2_sam2_video_tracking(
+     prompt: str,
+     frames: List[np.ndarray],
+     chunk_length: Optional[int] = 10,
+     fine_tune_id: Optional[str] = None,
+ ) -> List[List[Dict[str, Any]]]:
+     """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+     prompt such as category names or referring expressions. The categories in the text
+     prompt are separated by commas. It returns a list of bounding boxes, label names,
+     mask file names and associated probability scores.
+ 
+     Parameters:
+         prompt (str): The prompt to ground to the image.
+         image (np.ndarray): The image to ground the prompt to.
+ 
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+             bounding box, and mask of the detected objects with normalized coordinates
+             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+             the background.
+ 
+     Example
+     -------
+     >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+     [
+         [
+             {
+                 'label': '0: dinosaur',
+                 'bbox': [0.1, 0.11, 0.35, 0.4],
+                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0],
+                     ...,
+                     [0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+             },
+         ],
+         ...
+     ]
+     """
+ 
+     ret = od_sam2_video_tracking(
+         ODModels.OWLV2,
+         prompt=prompt,
+         frames=frames,
+         chunk_length=chunk_length,
+         fine_tune_id=fine_tune_id,
+     )
+     _display_tool_trace(
+         owlv2_sam2_video_tracking.__name__,
+         {},
+         ret["display_data"],
+         ret["files"],
+     )
+     return ret["return_data"]  # type: ignore
+ 
+ 
  def florence2_sam2_image(
      prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
  ) -> List[Dict[str, Any]]:
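
A minimal usage sketch for the new wrapper, pairing it with the package's extract_frames_and_timestamps helper; the video path, fps, and prompt are illustrative, and it assumes both functions are exported from vision_agent.tools like the other tools:

    from vision_agent.tools import extract_frames_and_timestamps, owlv2_sam2_video_tracking

    # Sample a clip at 1 frame per second (path is illustrative).
    frames = [f["frame"] for f in extract_frames_and_timestamps("traffic.mp4", fps=1)]

    # OWLv2 detects on every 10th frame; SAM2 tracks the masks in between.
    tracks = owlv2_sam2_video_tracking("car, truck", frames, chunk_length=10)

    for frame_idx, detections in enumerate(tracks):
        for det in detections:
            print(frame_idx, det["label"], det["bbox"])  # det["mask"] is a binary numpy array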
@@ -834,6 +1006,59 @@ def countgd_sam2_object_detection(
      return seg_ret["return_data"]  # type: ignore


+ def countgd_sam2_video_tracking(
+     prompt: str,
+     frames: List[np.ndarray],
+     chunk_length: Optional[int] = 10,
+ ) -> List[List[Dict[str, Any]]]:
+     """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+     prompt such as category names or referring expressions. The categories in the text
+     prompt are separated by commas. It returns a list of bounding boxes, label names,
+     mask file names and associated probability scores.
+ 
+     Parameters:
+         prompt (str): The prompt to ground to the image.
+         image (np.ndarray): The image to ground the prompt to.
+ 
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+             bounding box, and mask of the detected objects with normalized coordinates
+             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+             the background.
+ 
+     Example
+     -------
+     >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+     [
+         [
+             {
+                 'label': '0: dinosaur',
+                 'bbox': [0.1, 0.11, 0.35, 0.4],
+                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0],
+                     ...,
+                     [0, 0, 0, ..., 0, 0, 0],
+                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+             },
+         ],
+         ...
+     ]
+     """
+ 
+     ret = od_sam2_video_tracking(
+         ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+     )
+     _display_tool_trace(
+         countgd_sam2_video_tracking.__name__,
+         {},
+         ret["display_data"],
+         ret["files"],
+     )
+     return ret["return_data"]  # type: ignore
+ 
+ 
  def countgd_example_based_counting(
      visual_prompts: List[List[float]],
      image: np.ndarray,
@@ -1879,11 +2104,11 @@ def closest_box_distance(
      return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))


- def document_analysis(image: np.ndarray) -> Dict[str, Any]:
-     """'document_analysis' is an understanding tool that can handle various
-     types of document image layouts. It returns a structured output containing the text,
-     tables, pictures, charts and information caption, summary, labels, bounding boxes, etc
-     avoiding information loss.
+ def document_extraction(image: np.ndarray) -> Dict[str, Any]:
+     """'document_extraction' is a tool that can extract structured information out of
+     documents with different layouts. It returns the extracted data in a structured
+     hierarchical format containing text, tables, pictures, charts, and other
+     information.

      Parameters:
          image (np.ndarray): The document image to analyze
@@ -1894,20 +2119,18 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
      Example
      -------
      >>> document_analysis(image)
-     {'pages': [{'bbox': [left_0, top_0, right_0, bottom_0],
-                 'chunks': [{'bbox': [left_1, top_1, right_1, bottom_1],
-                             'caption': 'TITLE',
+     {'pages':
+       [{'bbox': [0, 0, 1.0, 1.0],
+         'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
                      'label': 'page_header',
-                     'summary': 'The image contains a single word ...' },
-                    {'bbox': [left_2, top_2, right_2, bottom_2],
-                     'caption': {'data': [{'value': 200, 'year': '2024' ...},
-                                 'title': 'Total CapEx Spending',
-                                 'type': 'bar chart',
-                                 'unit': 'Billion USD',
-                                 'xAxis': 'Year',
-                                 'yAxis': 'Total CapEx Spending'},
-                     'label': 'picture',
-                     'summary': 'This bar chart illustrates the trend of ...'},
+                     'order': 75
+                     'caption': 'Annual Report 2024',
+                     'summary': 'This annual report summarizes ...' },
+                    {'bbox': [0.2, 0.9, 0.9, 1.0],
+                     'label': table',
+                     'order': 1119,
+                     'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
+                     'summary': 'This table illustrates a trend of ...'},
          ],
      """

@@ -1919,7 +2142,7 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
          "model": "document-analysis",
      }

-     response: dict[str, Any] = send_inference_request(
+     data: Dict[str, Any] = send_inference_request(
          payload=payload,
          endpoint_name="document-analysis",
          files=files,
@@ -1927,14 +2150,28 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
          metadata_payload={"function_name": "document_analysis"},
      )

+     # don't display normalized bboxes
      _display_tool_trace(
-         document_analysis.__name__,
+         document_extraction.__name__,
          payload,
-         response,
+         data,
          files,
      )

-     return response
+     def normalize(data: Any) -> Dict[str, Any]:
+         if isinstance(data, Dict):
+             if "bbox" in data:
+                 data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
+             for key in data:
+                 data[key] = normalize(data[key])
+         elif isinstance(data, List):
+             for i in range(len(data)):
+                 data[i] = normalize(data[i])
+         return data  # type: ignore
+ 
+     data = normalize(data)
+ 
+     return data


  # Utility and visualization functions
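
Because the response is now normalized before being returned, every bbox in the result can be treated as fractions of the page size. A minimal usage sketch (the image path and Pillow-based loading are illustrative; only the fields documented in the docstring above are assumed):

    import numpy as np
    from PIL import Image
    from vision_agent.tools import document_extraction

    image = np.array(Image.open("report_page.png"))
    doc = document_extraction(image)

    height, width = image.shape[:2]
    for page in doc["pages"]:
        for chunk in page["chunks"]:
            xmin, ymin, xmax, ymax = chunk["bbox"]  # normalized to [0, 1] by the tool
            print(chunk["label"], int(xmin * width), int(ymin * height), int(xmax * width), int(ymax * height))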
@@ -2453,197 +2690,6 @@ def _plot_counting(
      return image


- class ODModels(str, Enum):
-     COUNTGD = "countgd"
-     FLORENCE2 = "florence2"
-     OWLV2 = "owlv2"
- 
- 
- def od_sam2_video_tracking(
-     od_model: ODModels,
-     prompt: str,
-     frames: List[np.ndarray],
-     chunk_length: Optional[int] = 10,
-     fine_tune_id: Optional[str] = None,
- ) -> List[List[Dict[str, Any]]]:
- 
-     results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
- 
-     if chunk_length is None:
-         step = 1  # Process every frame
-     elif chunk_length <= 0:
-         raise ValueError("chunk_length must be a positive integer or None.")
-     else:
-         step = chunk_length  # Process frames with the specified step size
- 
-     for idx in range(0, len(frames), step):
-         if od_model == ODModels.COUNTGD:
-             results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
-             function_name = "countgd_object_detection"
-         elif od_model == ODModels.OWLV2:
-             results[idx] = owl_v2_image(
-                 prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
-             )
-             function_name = "owl_v2_image"
-         elif od_model == ODModels.FLORENCE2:
-             results[idx] = florence2_sam2_image(
-                 prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
-             )
-             function_name = "florence2_sam2_image"
-         else:
-             raise NotImplementedError(
-                 f"Object detection model '{od_model}' is not implemented."
-             )
- 
-     image_size = frames[0].shape[:2]
- 
-     def _transform_detections(
-         input_list: List[Optional[List[Dict[str, Any]]]]
-     ) -> List[Optional[Dict[str, Any]]]:
-         output_list: List[Optional[Dict[str, Any]]] = []
- 
-         for idx, frame in enumerate(input_list):
-             if frame is not None:
-                 labels = [detection["label"] for detection in frame]
-                 bboxes = [
-                     denormalize_bbox(detection["bbox"], image_size)
-                     for detection in frame
-                 ]
- 
-                 output_list.append(
-                     {
-                         "labels": labels,
-                         "bboxes": bboxes,
-                     }
-                 )
-             else:
-                 output_list.append(None)
- 
-         return output_list
- 
-     output = _transform_detections(results)
- 
-     buffer_bytes = frames_to_bytes(frames)
-     files = [("video", buffer_bytes)]
-     payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
-     metadata = {"function_name": function_name}
- 
-     detections = send_task_inference_request(
-         payload,
-         "sam2",
-         files=files,
-         metadata=metadata,
-     )
- 
-     return_data = []
-     for frame in detections:
-         return_frame_data = []
-         for detection in frame:
-             mask = rle_decode_array(detection["mask"])
-             label = str(detection["id"]) + ": " + detection["label"]
-             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
-         return_data.append(return_frame_data)
-     return_data = add_bboxes_from_masks(return_data)
-     return nms(return_data, iou_threshold=0.95)
- 
- 
- def countgd_sam2_video_tracking(
-     prompt: str,
-     frames: List[np.ndarray],
-     chunk_length: Optional[int] = 10,
- ) -> List[List[Dict[str, Any]]]:
-     """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
-     prompt such as category names or referring expressions. The categories in the text
-     prompt are separated by commas. It returns a list of bounding boxes, label names,
-     mask file names and associated probability scores.
- 
-     Parameters:
-         prompt (str): The prompt to ground to the image.
-         image (np.ndarray): The image to ground the prompt to.
- 
-     Returns:
-         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-             bounding box, and mask of the detected objects with normalized coordinates
-             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-             the background.
- 
-     Example
-     -------
-     >>> countgd_sam2_video_tracking("car, dinosaur", frames)
-     [
-         [
-             {
-                 'label': '0: dinosaur',
-                 'bbox': [0.1, 0.11, 0.35, 0.4],
-                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                     [0, 0, 0, ..., 0, 0, 0],
-                     ...,
-                     [0, 0, 0, ..., 0, 0, 0],
-                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-             },
-         ],
-         ...
-     ]
-     """
- 
-     return od_sam2_video_tracking(
-         ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
-     )
- 
- 
- def owlv2_sam2_video_tracking(
-     prompt: str,
-     frames: List[np.ndarray],
-     chunk_length: Optional[int] = 10,
-     fine_tune_id: Optional[str] = None,
- ) -> List[List[Dict[str, Any]]]:
-     """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
-     prompt such as category names or referring expressions. The categories in the text
-     prompt are separated by commas. It returns a list of bounding boxes, label names,
-     mask file names and associated probability scores.
- 
-     Parameters:
-         prompt (str): The prompt to ground to the image.
-         image (np.ndarray): The image to ground the prompt to.
- 
-     Returns:
-         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-             bounding box, and mask of the detected objects with normalized coordinates
-             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-             and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-             the background.
- 
-     Example
-     -------
-     >>> countgd_sam2_video_tracking("car, dinosaur", frames)
-     [
-         [
-             {
-                 'label': '0: dinosaur',
-                 'bbox': [0.1, 0.11, 0.35, 0.4],
-                 'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                     [0, 0, 0, ..., 0, 0, 0],
-                     ...,
-                     [0, 0, 0, ..., 0, 0, 0],
-                     [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-             },
-         ],
-         ...
-     ]
-     """
- 
-     return od_sam2_video_tracking(
-         ODModels.OWLV2,
-         prompt=prompt,
-         frames=frames,
-         chunk_length=chunk_length,
-         fine_tune_id=fine_tune_id,
-     )
- 
- 
  FUNCTION_TOOLS = [
      owl_v2_image,
      owl_v2_video,
@@ -2663,6 +2709,7 @@ FUNCTION_TOOLS = [
      minimum_distance,
      qwen2_vl_images_vqa,
      qwen2_vl_video_vqa,
+     document_extraction,
      video_temporal_localization,
      flux_image_inpainting,
      siglip_classification,