vision-agent 0.2.215__py3-none-any.whl → 0.2.217__py3-none-any.whl
- vision_agent/.sim_tools/df.csv +101 -0
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/planner_tools.py +9 -1
- vision_agent/tools/tools.py +331 -213
- {vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/METADATA +1 -1
- {vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/RECORD +9 -9
- {vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -444,6 +444,35 @@ desc,doc,name
     >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
     'Lionel Messi'
 ",qwen2_vl_video_vqa
+"'document_extraction' is a tool that can extract structured information out of documents with different layouts. It returns the extracted data in a structured hierarchical format containing text, tables, pictures, charts, and other information.","document_extraction(image: numpy.ndarray) -> Dict[str, Any]:
+    'document_extraction' is a tool that can extract structured information out of
+    documents with different layouts. It returns the extracted data in a structured
+    hierarchical format containing text, tables, pictures, charts, and other
+    information.
+
+    Parameters:
+        image (np.ndarray): The document image to analyze
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the extracted information.
+
+    Example
+    -------
+    >>> document_analysis(image)
+    {'pages':
+        [{'bbox': [0, 0, 1700, 2200],
+          'chunks': [{'bbox': [1371, 75, 1503, 112],
+                      'label': 'page_header',
+                      'order': 75
+                      'caption': 'Annual Report 2024',
+                      'summary': 'This annual report summarizes ...' },
+                     {'bbox': [201, 1119, 1497, 1647],
+                      'label': table',
+                      'order': 1119,
+                      'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
+                      'summary': 'This table illustrates a trend of ...'},
+        ],
+",document_extraction
 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
     'video_temporal_localization' will run qwen2vl on each chunk_length_frames
     value selected for the video. It can detect multiple objects independently per
@@ -513,6 +542,78 @@ desc,doc,name
     >>> siglip_classification(image, ['dog', 'cat', 'bird'])
     {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
 ",siglip_classification
+"'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
+    'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+    >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+",owlv2_sam2_video_tracking
+"'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
+    'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+    >>> countgd_sam2_video_tracking(""car, dinosaur"", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+",countgd_sam2_video_tracking
 "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
     'extract_frames_and_timestamps' extracts frames and timestamps from a video
     which can be a file path, url or youtube link, returns a list of dictionaries
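Both new video-tracking entries share the signature documented above. A minimal usage sketch, assuming vision_agent is installed, its inference backend is configured, and both tools are importable from vision_agent.tools (the video path is a placeholder):

    import vision_agent.tools as T

    # Sample one frame per second from a local video (placeholder path),
    # then track every car across the sampled frames.
    frames_and_ts = T.extract_frames_and_timestamps("video.mp4", fps=1)
    frames = [d["frame"] for d in frames_and_ts]

    tracks = T.countgd_sam2_video_tracking("car", frames, chunk_length=10)
    # One list per frame; each detection carries 'label', 'bbox', 'mask', 'score'.
    print(len(tracks) == len(frames))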
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
vision_agent/tools/planner_tools.py
CHANGED
@@ -143,7 +143,14 @@ def run_tool_testing(
     code = extract_tag(response, "code")  # type: ignore
     if code is None:
         raise ValueError(f"Could not extract code from response: {response}")
-
+
+    # If there's a syntax error with the code, process_code can crash. Executing the
+    # code and then sending the error to the LLM should correct it.
+    try:
+        code = process_code(code)
+    except Exception as e:
+        _LOGGER.error(f"Error processing code: {e}")
+
     tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
     tool_output_str = tool_output.text(include_results=False).strip()
 
@@ -167,6 +174,7 @@ def run_tool_testing(
             DefaultImports.prepend_imports(code)
         )
         tool_output_str = tool_output.text(include_results=False).strip()
+        count += 1
 
     return code, tool_docs_str, tool_output
 
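The new try/except keeps a malformed code block from the LLM from crashing run_tool_testing: processing errors are logged and the raw code is executed anyway, so the failure can be reported back for correction. A standalone sketch of that pattern (the helper name try_process is illustrative, not part of the library):

    import logging

    _LOGGER = logging.getLogger(__name__)

    def try_process(code: str, process) -> str:
        # Attempt post-processing; if it fails (e.g. a syntax error in the
        # generated code), log the error and fall back to the unprocessed code
        # so the execution step can surface the real problem.
        try:
            return process(code)
        except Exception as e:
            _LOGGER.error(f"Error processing code: {e}")
            return code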
|
vision_agent/tools/tools.py
CHANGED
@@ -119,6 +119,120 @@ def _display_tool_trace(
     display({MimeType.APPLICATION_JSON: tool_call_trace.model_dump()}, raw=True)
 
 
+class ODModels(str, Enum):
+    COUNTGD = "countgd"
+    FLORENCE2 = "florence2"
+    OWLV2 = "owlv2"
+
+
+def od_sam2_video_tracking(
+    od_model: ODModels,
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
+
+    if chunk_length is None:
+        step = 1  # Process every frame
+    elif chunk_length <= 0:
+        raise ValueError("chunk_length must be a positive integer or None.")
+    else:
+        step = chunk_length  # Process frames with the specified step size
+
+    for idx in range(0, len(frames), step):
+        if od_model == ODModels.COUNTGD:
+            results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
+            function_name = "countgd_object_detection"
+        elif od_model == ODModels.OWLV2:
+            results[idx] = owl_v2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "owl_v2_image"
+        elif od_model == ODModels.FLORENCE2:
+            results[idx] = florence2_sam2_image(
+                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
+            )
+            function_name = "florence2_sam2_image"
+        else:
+            raise NotImplementedError(
+                f"Object detection model '{od_model}' is not implemented."
+            )
+
+    image_size = frames[0].shape[:2]
+
+    def _transform_detections(
+        input_list: List[Optional[List[Dict[str, Any]]]]
+    ) -> List[Optional[Dict[str, Any]]]:
+        output_list: List[Optional[Dict[str, Any]]] = []
+
+        for _, frame in enumerate(input_list):
+            if frame is not None:
+                labels = [detection["label"] for detection in frame]
+                bboxes = [
+                    denormalize_bbox(detection["bbox"], image_size)
+                    for detection in frame
+                ]
+
+                output_list.append(
+                    {
+                        "labels": labels,
+                        "bboxes": bboxes,
+                    }
+                )
+            else:
+                output_list.append(None)
+
+        return output_list
+
+    output = _transform_detections(results)
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
+    metadata = {"function_name": function_name}
+
+    detections = send_task_inference_request(
+        payload,
+        "sam2",
+        files=files,
+        metadata=metadata,
+    )
+
+    return_data = []
+    for frame in detections:
+        return_frame_data = []
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
+            return_frame_data.append(
+                {"label": label, "mask": mask, "score": 1.0, "rle": detection["mask"]}
+            )
+        return_data.append(return_frame_data)
+    return_data = add_bboxes_from_masks(return_data)
+    return_data = nms(return_data, iou_threshold=0.95)
+
+    # We save the RLE for display purposes, re-calculting RLE can get very expensive.
+    # Deleted here because we are returning the numpy masks instead
+    display_data = []
+    for frame in return_data:
+        display_frame_data = []
+        for obj in frame:
+            display_frame_data.append(
+                {
+                    "label": obj["label"],
+                    "score": obj["score"],
+                    "bbox": denormalize_bbox(obj["bbox"], image_size),
+                    "mask": obj["rle"],
+                }
+            )
+            del obj["rle"]
+        display_data.append(display_frame_data)
+
+    return {"files": files, "return_data": return_data, "display_data": detections}
+
+
 def owl_v2_image(
     prompt: str,
     image: np.ndarray,
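In od_sam2_video_tracking above, chunk_length controls how often the underlying object detector is re-run; the SAM2 request then propagates masks across the frames in between. A small, self-contained illustration of which frame indices receive a fresh detector pass under that logic:

    # Stand-in values; mirrors range(0, len(frames), step) from the code above.
    num_frames = 95
    chunk_length = 10

    step = 1 if chunk_length is None else chunk_length
    detector_indices = list(range(0, num_frames, step))
    print(detector_indices)  # [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]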
@@ -302,6 +416,64 @@ def owl_v2_video(
     return bboxes_formatted
 
 
+def owlv2_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+    fine_tune_id: Optional[str] = None,
+) -> List[List[Dict[str, Any]]]:
+    """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+    >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.OWLV2,
+        prompt=prompt,
+        frames=frames,
+        chunk_length=chunk_length,
+        fine_tune_id=fine_tune_id,
+    )
+    _display_tool_trace(
+        owlv2_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def florence2_sam2_image(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
@@ -834,6 +1006,59 @@ def countgd_sam2_object_detection(
     return seg_ret["return_data"]  # type: ignore
 
 
+def countgd_sam2_video_tracking(
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 10,
+) -> List[List[Dict[str, Any]]]:
+    """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+    >>> countgd_sam2_video_tracking("car, dinosaur", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+        ...
+    ]
+    """
+
+    ret = od_sam2_video_tracking(
+        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
+    )
+    _display_tool_trace(
+        countgd_sam2_video_tracking.__name__,
+        {},
+        ret["display_data"],
+        ret["files"],
+    )
+    return ret["return_data"]  # type: ignore
+
+
 def countgd_example_based_counting(
     visual_prompts: List[List[float]],
     image: np.ndarray,
@@ -1879,11 +2104,11 @@ def closest_box_distance(
     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
 
 
-def document_analysis(image: np.ndarray) -> Dict[str, Any]:
-    """'
-
-
-
+def document_extraction(image: np.ndarray) -> Dict[str, Any]:
+    """'document_extraction' is a tool that can extract structured information out of
+    documents with different layouts. It returns the extracted data in a structured
+    hierarchical format containing text, tables, pictures, charts, and other
+    information.
 
     Parameters:
         image (np.ndarray): The document image to analyze
@@ -1894,20 +2119,18 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
     Example
     -------
     >>> document_analysis(image)
-    {'pages':
-
-
+    {'pages':
+        [{'bbox': [0, 0, 1.0, 1.0],
+         'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
                     'label': 'page_header',
-                    '
-
-                    '
-
-
-
-
-
-                    'label': 'picture',
-                    'summary': 'This bar chart illustrates the trend of ...'},
+                    'order': 75
+                    'caption': 'Annual Report 2024',
+                    'summary': 'This annual report summarizes ...' },
+                    {'bbox': [0.2, 0.9, 0.9, 1.0],
+                    'label': table',
+                    'order': 1119,
+                    'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
+                    'summary': 'This table illustrates a trend of ...'},
         ],
     """
 
@@ -1919,7 +2142,7 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
         "model": "document-analysis",
     }
 
-
+    data: Dict[str, Any] = send_inference_request(
         payload=payload,
         endpoint_name="document-analysis",
         files=files,
@@ -1927,14 +2150,99 @@ def document_analysis(image: np.ndarray) -> Dict[str, Any]:
         metadata_payload={"function_name": "document_analysis"},
     )
 
+    # don't display normalized bboxes
     _display_tool_trace(
-
+        document_extraction.__name__,
         payload,
-
+        data,
         files,
     )
 
-
+    def normalize(data: Any) -> Dict[str, Any]:
+        if isinstance(data, Dict):
+            if "bbox" in data:
+                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
+            for key in data:
+                data[key] = normalize(data[key])
+        elif isinstance(data, List):
+            for i in range(len(data)):
+                data[i] = normalize(data[i])
+        return data  # type: ignore
+
+    data = normalize(data)
+
+    return data
+
+
+def document_qa(
+    prompt: str,
+    image: np.ndarray,
+) -> str:
+    """'document_qa' is a tool that can answer any questions about arbitrary
+    images of documents or presentations. It answers by analyzing the contextual document data
+    and then using a model to answer specific questions. It returns text as an answer to the question.
+
+    Parameters:
+        prompt (str): The question to be answered about the document image
+        image (np.ndarray): The document image to analyze
+
+    Returns:
+        str: The answer to the question based on the document's context.
+
+    Example
+    -------
+    >>> document_qa(image, question)
+    'The answer to the question ...'
+    """
+
+    image_file = numpy_to_bytes(image)
+
+    files = [("image", image_file)]
+
+    payload = {
+        "model": "document-analysis",
+    }
+
+    data: dict[str, Any] = send_inference_request(
+        payload=payload,
+        endpoint_name="document-analysis",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "document_qa"},
+    )
+
+    def normalize(data: Any) -> Dict[str, Any]:
+        if isinstance(data, Dict):
+            if "bbox" in data:
+                data["bbox"] = normalize_bbox(data["bbox"], image.shape[:2])
+            for key in data:
+                data[key] = normalize(data[key])
+        elif isinstance(data, List):
+            for i in range(len(data)):
+                data[i] = normalize(data[i])
+        return data  # type: ignore
+
+    data = normalize(data)
+
+    prompt = f"""
+    Document Context:
+    {data}\n
+    Question: {prompt}\n
+    Please provide a clear, concise answer using only the information from the document. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
+    """
+
+    lmm = AnthropicLMM()
+    llm_output = lmm.generate(prompt=prompt)
+    llm_output = cast(str, llm_output)
+
+    _display_tool_trace(
+        document_qa.__name__,
+        payload,
+        llm_output,
+        files,
+    )
+
+    return llm_output
 
 
 # Utility and visualization functions
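A hedged usage sketch for the new document_qa tool. It is not added to FUNCTION_TOOLS in this release, so it is imported from the module directly; this assumes the document-analysis endpoint and Anthropic credentials are configured, and the zero array is only a stand-in for a real scanned page:

    import numpy as np
    from vision_agent.tools.tools import document_qa

    page = np.zeros((2200, 1700, 3), dtype=np.uint8)  # placeholder document image
    answer = document_qa("What was the total revenue in 2024?", page)
    print(answer)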
@@ -2453,197 +2761,6 @@ def _plot_counting(
     return image
 
 
-class ODModels(str, Enum):
-    COUNTGD = "countgd"
-    FLORENCE2 = "florence2"
-    OWLV2 = "owlv2"
-
-
-def od_sam2_video_tracking(
-    od_model: ODModels,
-    prompt: str,
-    frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
-    fine_tune_id: Optional[str] = None,
-) -> List[List[Dict[str, Any]]]:
-
-    results: List[Optional[List[Dict[str, Any]]]] = [None] * len(frames)
-
-    if chunk_length is None:
-        step = 1  # Process every frame
-    elif chunk_length <= 0:
-        raise ValueError("chunk_length must be a positive integer or None.")
-    else:
-        step = chunk_length  # Process frames with the specified step size
-
-    for idx in range(0, len(frames), step):
-        if od_model == ODModels.COUNTGD:
-            results[idx] = countgd_object_detection(prompt=prompt, image=frames[idx])
-            function_name = "countgd_object_detection"
-        elif od_model == ODModels.OWLV2:
-            results[idx] = owl_v2_image(
-                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
-            )
-            function_name = "owl_v2_image"
-        elif od_model == ODModels.FLORENCE2:
-            results[idx] = florence2_sam2_image(
-                prompt=prompt, image=frames[idx], fine_tune_id=fine_tune_id
-            )
-            function_name = "florence2_sam2_image"
-        else:
-            raise NotImplementedError(
-                f"Object detection model '{od_model}' is not implemented."
-            )
-
-    image_size = frames[0].shape[:2]
-
-    def _transform_detections(
-        input_list: List[Optional[List[Dict[str, Any]]]]
-    ) -> List[Optional[Dict[str, Any]]]:
-        output_list: List[Optional[Dict[str, Any]]] = []
-
-        for idx, frame in enumerate(input_list):
-            if frame is not None:
-                labels = [detection["label"] for detection in frame]
-                bboxes = [
-                    denormalize_bbox(detection["bbox"], image_size)
-                    for detection in frame
-                ]
-
-                output_list.append(
-                    {
-                        "labels": labels,
-                        "bboxes": bboxes,
-                    }
-                )
-            else:
-                output_list.append(None)
-
-        return output_list
-
-    output = _transform_detections(results)
-
-    buffer_bytes = frames_to_bytes(frames)
-    files = [("video", buffer_bytes)]
-    payload = {"bboxes": json.dumps(output), "chunk_length": chunk_length}
-    metadata = {"function_name": function_name}
-
-    detections = send_task_inference_request(
-        payload,
-        "sam2",
-        files=files,
-        metadata=metadata,
-    )
-
-    return_data = []
-    for frame in detections:
-        return_frame_data = []
-        for detection in frame:
-            mask = rle_decode_array(detection["mask"])
-            label = str(detection["id"]) + ": " + detection["label"]
-            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
-        return_data.append(return_frame_data)
-    return_data = add_bboxes_from_masks(return_data)
-    return nms(return_data, iou_threshold=0.95)
-
-
-def countgd_sam2_video_tracking(
-    prompt: str,
-    frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
-) -> List[List[Dict[str, Any]]]:
-    """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
-    prompt such as category names or referring expressions. The categories in the text
-    prompt are separated by commas. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
-
-    Parameters:
-        prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-            bounding box, and mask of the detected objects with normalized coordinates
-            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-            the background.
-
-    Example
-    -------
-    >>> countgd_sam2_video_tracking("car, dinosaur", frames)
-    [
-        [
-            {
-                'label': '0: dinosaur',
-                'bbox': [0.1, 0.11, 0.35, 0.4],
-                'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0],
-                    ...,
-                    [0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-            },
-        ],
-        ...
-    ]
-    """
-
-    return od_sam2_video_tracking(
-        ODModels.COUNTGD, prompt=prompt, frames=frames, chunk_length=chunk_length
-    )
-
-
-def owlv2_sam2_video_tracking(
-    prompt: str,
-    frames: List[np.ndarray],
-    chunk_length: Optional[int] = 10,
-    fine_tune_id: Optional[str] = None,
-) -> List[List[Dict[str, Any]]]:
-    """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
-    prompt such as category names or referring expressions. The categories in the text
-    prompt are separated by commas. It returns a list of bounding boxes, label names,
-    mask file names and associated probability scores.
-
-    Parameters:
-        prompt (str): The prompt to ground to the image.
-        image (np.ndarray): The image to ground the prompt to.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-            bounding box, and mask of the detected objects with normalized coordinates
-            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
-            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
-            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-            the background.
-
-    Example
-    -------
-    >>> countgd_sam2_video_tracking("car, dinosaur", frames)
-    [
-        [
-            {
-                'label': '0: dinosaur',
-                'bbox': [0.1, 0.11, 0.35, 0.4],
-                'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0],
-                    ...,
-                    [0, 0, 0, ..., 0, 0, 0],
-                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-            },
-        ],
-        ...
-    ]
-    """
-
-    return od_sam2_video_tracking(
-        ODModels.OWLV2,
-        prompt=prompt,
-        frames=frames,
-        chunk_length=chunk_length,
-        fine_tune_id=fine_tune_id,
-    )
-
-
 FUNCTION_TOOLS = [
     owl_v2_image,
     owl_v2_video,
@@ -2663,6 +2780,7 @@ FUNCTION_TOOLS = [
     minimum_distance,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
+    document_extraction,
     video_temporal_localization,
     flux_image_inpainting,
     siglip_classification,
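With document_extraction now registered in FUNCTION_TOOLS, it should be callable like the other planner tools. A minimal sketch, assuming the package exports it from vision_agent.tools and the backend endpoint is reachable; the expected return shape follows the docstring example above:

    import numpy as np
    import vision_agent.tools as T

    page = np.zeros((2200, 1700, 3), dtype=np.uint8)  # placeholder document image
    result = T.document_extraction(page)
    for chunk in result.get("pages", [{}])[0].get("chunks", []):
        print(chunk.get("label"), chunk.get("bbox"))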
{vision_agent-0.2.215.dist-info → vision_agent-0.2.217.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-vision_agent/.sim_tools/df.csv,sha256=
-vision_agent/.sim_tools/embs.npy,sha256=
+vision_agent/.sim_tools/df.csv,sha256=nHhcCD55RO9XTiWq_uQ8pHKkVxLXciCHH-SbGPAQEy0,41969
+vision_agent/.sim_tools/embs.npy,sha256=UmnXd2Zv1xBu4a7pxHHf4wOhTLKub629rVX9fAusTxY,393344
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
 vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
@@ -26,12 +26,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=Jdq34jMw_KuYwk4Wexqm4DRjuLLoL1Q8wukm0NBv1Tc,2812
 vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
-vision_agent/tools/planner_tools.py,sha256=
+vision_agent/tools/planner_tools.py,sha256=tU1qz_VIQM_yPKDmuxjMWu68ZlAZ7ePWI1g7zswyWhI,13540
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=Xcm_9EQdDCR9X5FhIm7VJaTL0qWqhnJUVTRVrRtETrA,96112
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -40,7 +40,7 @@ vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50
 vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.217.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.217.dist-info/METADATA,sha256=xl9AmXP9RBpC5frlASsiG7YktdIOTRuJgv8WZdRV_bA,19071
+vision_agent-0.2.217.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.217.dist-info/RECORD,,
File without changes
|
File without changes
|