vision-agent 0.2.221__py3-none-any.whl → 0.2.222__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +253 -244
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/vision_agent_planner_prompts_v2.py +28 -23
- vision_agent/tools/__init__.py +6 -10
- vision_agent/tools/tools.py +639 -787
- vision_agent/utils/sim.py +24 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/METADATA +1 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/RECORD +10 -10
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
vision_agent/agent/vision_agent_planner_prompts_v2.py
CHANGED
@@ -330,11 +330,11 @@ get_tool_for_task('Identify if there is tape on the boxes', crops[:3])
 
 OBSERVATION:
 [get_tool_for_task output]
-
+owlv2_object_detection performed best as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
 
-'
-prompt such as category names or referring expressions on images. The categories
-text prompt are separated by commas. It returns a list of bounding boxes with
+'owlv2_object_detection' is a tool that can detect and count multiple objects given a
+text prompt such as category names or referring expressions on images. The categories
+in text prompt are separated by commas. It returns a list of bounding boxes with
 normalized coordinates, label names and associated probability scores.
 
 Parameters:
@@ -354,7 +354,7 @@ Returns:
 
 Example
 -------
->>>
+>>> owlv2_object_detection("car, dinosaur", image)
 [
     {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
     {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
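For readers tracking the rename from the 0.2.221 tool, a minimal sketch of calling the detector under the contract this docstring describes; `load_image` and `owlv2_object_detection` are both exported in this release (see the `vision_agent/tools/__init__.py` hunk below), while the image path and score cutoff are illustrative assumptions:

```python
from vision_agent.tools import load_image, owlv2_object_detection

image = load_image("image.jpg")  # placeholder path, not from the diff
detections = owlv2_object_detection("car, dinosaur", image)

# Per the updated docstring, each detection has a normalized bbox,
# a label name, and a probability score.
for det in detections:
    if det["score"] > 0.5:  # illustrative confidence cutoff
        print(det["label"], det["bbox"], det["score"])
```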
@@ -375,7 +375,7 @@ for frame, frame_predictions in zip(frames, track_predictions):
         int(obj["bbox"][0] * width) : int(obj["bbox"][2] * width),
         :,
     ]
-    detections =
+    detections = owlv2_object_detection("tape", crop)
     obj_to_info[obj["label"]].extend(detections)
 
 
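Because the returned boxes are normalized to [0, 1], the example above scales them by the frame dimensions before slicing. A self-contained sketch of that cropping pattern, with a synthetic frame and a made-up detection standing in for the values flowing through the hunk:

```python
import numpy as np

# Synthetic 480x640 RGB frame and one tracked object with a normalized bbox.
frame = np.zeros((480, 640, 3), dtype=np.uint8)
obj = {"label": "0: box", "bbox": [0.25, 0.1, 0.75, 0.9]}

height, width = frame.shape[:2]
crop = frame[
    int(obj["bbox"][1] * height) : int(obj["bbox"][3] * height),
    int(obj["bbox"][0] * width) : int(obj["bbox"][2] * width),
    :,
]
print(crop.shape)  # region that would be handed to owlv2_object_detection
```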
@@ -441,7 +441,8 @@ PICK_PLAN = """
 
 CATEGORIZE_TOOL_REQUEST = """
 You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
-- "object detection and counting" - detecting objects or counting objects from a text prompt in an image
+- "object detection and counting" - detecting objects or counting objects from a text prompt in an image.
+- "instance segmentation" - segmenting objects in an image given a text prompt.
 - "classification" - classifying objects in an image given a text prompt.
 - "segmentation" - segmenting objects in an image or video given a text prompt.
 - "OCR" - extracting text from an image.
@@ -477,8 +478,9 @@ TEST_TOOLS = """
 1. List all the tools under **Tools** and the user request. Write a program to load the media and call the most relevant tools in parallel and print it's output along with other relevant information.
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
-4.
-5.
+4. For video tracking, use chunk_length=1 and at least 3 frames to ensure the best results when evaluating the tool.
+5. Print this final dictionary.
+6. Output your code in the following format wrapped in <code> tags:
 <code>
 # Your code here
 </code>
@@ -494,17 +496,17 @@ Count the number of pedestrians across all the images.
 
 <code>
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, owlv2_object_detection, florence2_object_detection, countgd_object_detection
 
 # process functions in a try catch so that if it fails it doesn't cause `as_completed` to hang
-def
+def process_owlv2(image_paths):
     try:
         results = []
         for image_path in image_paths:
             image = load_image(image_path)
-            results.extend(
+            results.extend(owlv2_object_detection("person", image))
     except Exception as e:
-        results = f"Encountered error when executing
+        results = f"Encountered error when executing process_owlv2: {str(e)}"
     return results
 
 def process_florence2(image_paths):
@@ -512,7 +514,7 @@ def process_florence2(image_paths):
         results = []
         for image_path in image_paths:
             image = load_image(image_path)
-            results.extend(
+            results.extend(florence2_object_detection("person", image))
     except Exception as e:
         results = f"Encountered error when executing process_florence2: {str(e)}"
     return results
@@ -531,7 +533,7 @@ image_paths = ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]
 
 with ThreadPoolExecutor() as executor:
     futures = {{
-        executor.submit(
+        executor.submit(process_owlv2, image_paths): "owlv2_object_detection",
         executor.submit(process_florence2, image_paths): "florence2_phrase_grounding",
         executor.submit(process_countgd, image_paths): "countgd_object_detection",
     }}
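The hunk stops at the futures dictionary, before the results are gathered. For context, a sketch of the usual `as_completed` collection loop that such a dictionary feeds into; the worker function here is a stand-in, and the collection shape is an assumption rather than something shown in this diff:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def slow_square(x: int) -> int:  # stand-in for a process_* tool wrapper
    return x * x

with ThreadPoolExecutor() as executor:
    # map each future back to a display name, mirroring the futures dict above
    futures = {executor.submit(slow_square, n): f"square_{n}" for n in range(3)}

    final_results = {}
    for future in as_completed(futures):
        # future.result() re-raises worker exceptions here, which is why the
        # prompt's process_* helpers catch their own errors and return strings
        final_results[futures[future]] = future.result()

print(final_results)
```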
@@ -557,7 +559,7 @@ Count the number of people in the video.
 <code>
 import numpy as np
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from vision_agent.tools import extract_frames_and_timestamps,
+from vision_agent.tools import extract_frames_and_timestamps, owlv2_sam2_video_tracking, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
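A note on the frame handoff: the name `extract_frames_and_timestamps` suggests frames arrive paired with timestamps, so the elided code presumably unpacks plain frames before tracking. A hedged sketch, where the dict keys are an assumption about the return shape rather than something confirmed by this diff:

```python
from vision_agent.tools import extract_frames_and_timestamps

# Assumption: each entry looks like {"frame": np.ndarray, "timestamp": float}.
frames = extract_frames_and_timestamps("video.mp4", 1)
frames = [f["frame"] for f in frames][:10]  # first 10 frames, per the hunk's comment
```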
@@ -574,16 +576,18 @@ def remove_arrays(o):
     else:
         return o
 
-def
+def process_owlv2_sam2_video_tracking(frames):
     try:
-
+        # run with chunk_length=1 to ensure best results
+        results = owlv2_sam2_video_tracking("person", frames, chunk_length=1)
     except Exception as e:
-        results = f"Encountered error when executing
+        results = f"Encountered error when executing process_owlv2_sam2_video_tracking: {str(e)}"
     return results
 
-def
+def process_florence2_sam2_video_tracking(frames):
     try:
-
+        # run with chunk_length=1 to ensure best results
+        results = florence2_sam2_video_tracking("person", frames, chunk_length=1)
     except Exception as e:
         results = f"Encountered error when executing process_florence2_sam2: {str(e)}"
     return results
@@ -591,8 +595,8 @@ def process_florence2_sam2(frames):
 
 with ThreadPoolExecutor() as executor:
     futures = {{
-        executor.submit(
-        executor.submit(
+        executor.submit(process_owlv2_sam2_video_tracking, frames): "owlv2_sam2_video_tracking",
+        executor.submit(process_florence2_sam2_video_tracking, frames): "florence2_sam2_video_tracking",
     }}
     final_results = {{}}
     for future in as_completed(futures):
@@ -686,6 +690,7 @@ FINALIZE_PLAN = """
 3. Include ALL relevant python code in your plan to accomplish the user request.
 4. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
 5. Do not include {excluded_tools} tools in your instructions.
+6. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_file` or `save_video`.
 6. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
 
 <json>
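The new instruction 6 asks every finalized plan to end with a visualization and save step. A minimal sketch of what that last step might look like for video tracking, built only from helpers visible elsewhere in this diff (`owlv2_sam2_video_tracking`, `overlay_segmentation_masks`, `save_video`); the frame unpacking repeats the return-shape assumption above, and the paths are placeholders:

```python
from vision_agent.tools import (
    extract_frames_and_timestamps,
    overlay_segmentation_masks,
    owlv2_sam2_video_tracking,
    save_video,
)

frames = extract_frames_and_timestamps("video.mp4", 1)  # placeholder path
frames = [f["frame"] for f in frames]  # assumed return shape

track_predictions = owlv2_sam2_video_tracking("person", frames, chunk_length=1)

# overlay per-frame masks, then write the annotated frames back out
viz_frames = [
    overlay_segmentation_masks(frame, preds)
    for frame, preds in zip(frames, track_predictions)
]
save_video(viz_frames, "result.mp4")
```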
vision_agent/tools/__init__.py
CHANGED
@@ -26,25 +26,22 @@ from .tools import (
     claude35_text_extraction,
     closest_box_distance,
     closest_mask_distance,
-    countgd_example_based_counting,
     countgd_object_detection,
-
+    countgd_sam2_instance_segmentation,
     countgd_sam2_video_tracking,
+    countgd_visual_prompt_object_detection,
     depth_anything_v2,
     detr_segmentation,
     document_extraction,
     document_qa,
     extract_frames_and_timestamps,
+    florence2_object_detection,
     florence2_ocr,
-
-    florence2_phrase_grounding_video,
-    florence2_sam2_image,
+    florence2_sam2_instance_segmentation,
     florence2_sam2_video_tracking,
     flux_image_inpainting,
     generate_pose_image,
     get_tool_documentation,
-    gpt4o_image_vqa,
-    gpt4o_video_vqa,
     load_image,
     minimum_distance,
     ocr,
@@ -52,8 +49,8 @@ from .tools import (
     overlay_bounding_boxes,
     overlay_heat_map,
     overlay_segmentation_masks,
-
-
+    owlv2_object_detection,
+    owlv2_sam2_instance_segmentation,
     owlv2_sam2_video_tracking,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
@@ -62,7 +59,6 @@ from .tools import (
     save_json,
     save_video,
     siglip_classification,
-    stella_embeddings,
     template_match,
     video_temporal_localization,
     vit_image_classification,
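For downstream code, the net effect of these hunks is a batch of renames and removals; for example, `florence2_sam2_image` appears to be succeeded by `florence2_sam2_instance_segmentation`. A before/after sketch of the public imports, listing only names actually visible in the hunks (removed lines whose content was truncated in the rendering are not guessed at):

```python
# 0.2.221 -- exports removed in this release (visible removals only)
from vision_agent.tools import (
    countgd_example_based_counting,
    florence2_phrase_grounding_video,
    florence2_sam2_image,
    gpt4o_image_vqa,
    gpt4o_video_vqa,
    stella_embeddings,
)

# 0.2.222 -- exports added in this release
from vision_agent.tools import (
    countgd_sam2_instance_segmentation,
    countgd_visual_prompt_object_detection,
    florence2_object_detection,
    florence2_sam2_instance_segmentation,
    owlv2_object_detection,
    owlv2_sam2_instance_segmentation,
)
```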