vision-agent 0.2.221__py3-none-any.whl → 0.2.222__py3-none-any.whl
- vision_agent/.sim_tools/df.csv +253 -244
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/vision_agent_planner_prompts_v2.py +28 -23
- vision_agent/tools/__init__.py +6 -10
- vision_agent/tools/tools.py +639 -787
- vision_agent/utils/sim.py +24 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/METADATA +1 -1
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/RECORD +10 -10
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.221.dist-info → vision_agent-0.2.222.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
vision_agent/agent/vision_agent_planner_prompts_v2.py
CHANGED
@@ -330,11 +330,11 @@ get_tool_for_task('Identify if there is tape on the boxes', crops[:3])
 
 OBSERVATION:
 [get_tool_for_task output]
-
+owlv2_object_detection performed best as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
 
-'
-prompt such as category names or referring expressions on images. The categories
-text prompt are separated by commas. It returns a list of bounding boxes with
+'owlv2_object_detection' is a tool that can detect and count multiple objects given a
+text prompt such as category names or referring expressions on images. The categories
+in text prompt are separated by commas. It returns a list of bounding boxes with
 normalized coordinates, label names and associated probability scores.
 
 Parameters:
@@ -354,7 +354,7 @@ Returns:
 
 Example
 -------
->>>
+>>> owlv2_object_detection("car, dinosaur", image)
 [
     {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
     {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
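For orientation, a minimal usage sketch of the renamed tool, assuming the signature and normalized-coordinate return format shown in this hunk (the file name is illustrative):

<code>
from vision_agent.tools import load_image, owlv2_object_detection

image = load_image("street.jpg")  # RGB numpy array
height, width = image.shape[:2]

# 'bbox' values are normalized [x1, y1, x2, y2]; scale to pixels for display
for det in owlv2_object_detection("car, dinosaur", image):
    x1, y1, x2, y2 = det["bbox"]
    print(det["label"], det["score"],
          int(x1 * width), int(y1 * height), int(x2 * width), int(y2 * height))
</code>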
@@ -375,7 +375,7 @@ for frame, frame_predictions in zip(frames, track_predictions):
             int(obj["bbox"][0] * width) : int(obj["bbox"][2] * width),
             :,
         ]
-        detections =
+        detections = owlv2_object_detection("tape", crop)
         obj_to_info[obj["label"]].extend(detections)
 
 
@@ -441,7 +441,8 @@ PICK_PLAN = """
 
 CATEGORIZE_TOOL_REQUEST = """
 You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
-- "object detection and counting" - detecting objects or counting objects from a text prompt in an image
+- "object detection and counting" - detecting objects or counting objects from a text prompt in an image.
+- "instance segmentation" - segmenting objects in an image given a text prompt.
 - "classification" - classifying objects in an image given a text prompt.
 - "segmentation" - segmenting objects in an image or video given a text prompt.
 - "OCR" - extracting text from an image.
@@ -477,8 +478,9 @@ TEST_TOOLS = """
 1. List all the tools under **Tools** and the user request. Write a program to load the media and call the most relevant tools in parallel and print it's output along with other relevant information.
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
-4.
-5.
+4. For video tracking, use chunk_length=1 and at least 3 frames to ensure the best results when evaluating the tool.
+5. Print this final dictionary.
+6. Output your code in the following format wrapped in <code> tags:
 <code>
 # Your code here
 </code>
@@ -494,17 +496,17 @@ Count the number of pedestrians across all the images.
 
 <code>
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, owlv2_object_detection, florence2_object_detection, countgd_object_detection
 
 # process functions in a try catch so that if it fails it doesn't cause `as_completed` to hang
-def
+def process_owlv2(image_paths):
     try:
         results = []
         for image_path in image_paths:
             image = load_image(image_path)
-            results.extend(
+            results.extend(owlv2_object_detection("person", image))
     except Exception as e:
-        results = f"Encountered error when executing
+        results = f"Encountered error when executing process_owlv2: {str(e)}"
     return results
 
 def process_florence2(image_paths):
@@ -512,7 +514,7 @@ def process_florence2(image_paths):
         results = []
         for image_path in image_paths:
             image = load_image(image_path)
-            results.extend(
+            results.extend(florence2_object_detection("person", image))
     except Exception as e:
         results = f"Encountered error when executing process_florence2: {str(e)}"
     return results
@@ -531,7 +533,7 @@ image_paths = ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]
 
 with ThreadPoolExecutor() as executor:
     futures = {{
-        executor.submit(
+        executor.submit(process_owlv2, image_paths): "owlv2_object_detection",
         executor.submit(process_florence2, image_paths): "florence2_phrase_grounding",
         executor.submit(process_countgd, image_paths): "countgd_object_detection",
     }}
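The doubled braces (`{{`, `}}`) above are `str.format` escapes, since this example lives inside the TEST_TOOLS prompt template; rendered, the pattern maps each submitted future to its tool name and collects results as they complete. A standalone sketch with the escapes removed and stub workers standing in for the `process_*` helpers:

<code>
from concurrent.futures import ThreadPoolExecutor, as_completed

# stand-ins for the process_owlv2 / process_florence2 helpers shown above
def process_owlv2(image_paths):
    return [{"score": 0.99, "label": "person", "bbox": [0.1, 0.1, 0.2, 0.3]}]

def process_florence2(image_paths):
    return []

image_paths = ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]

with ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(process_owlv2, image_paths): "owlv2_object_detection",
        executor.submit(process_florence2, image_paths): "florence2_phrase_grounding",
    }
    final_results = {}
    for future in as_completed(futures):
        # each worker returns its detections, or an error string on failure
        final_results[futures[future]] = future.result()

print(final_results)
</code>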
@@ -557,7 +559,7 @@ Count the number of people in the video.
 <code>
 import numpy as np
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from vision_agent.tools import extract_frames_and_timestamps,
+from vision_agent.tools import extract_frames_and_timestamps, owlv2_sam2_video_tracking, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -574,16 +576,18 @@ def remove_arrays(o):
     else:
         return o
 
-def
+def process_owlv2_sam2_video_tracking(frames):
     try:
-
+        # run with chunk_length=1 to ensure best results
+        results = owlv2_sam2_video_tracking("person", frames, chunk_length=1)
     except Exception as e:
-        results = f"Encountered error when executing
+        results = f"Encountered error when executing process_owlv2_sam2_video_tracking: {str(e)}"
     return results
 
-def
+def process_florence2_sam2_video_tracking(frames):
     try:
-
+        # run with chunk_length=1 to ensure best results
+        results = florence2_sam2_video_tracking("person", frames, chunk_length=1)
     except Exception as e:
         results = f"Encountered error when executing process_florence2_sam2: {str(e)}"
     return results
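Only the tail of `remove_arrays` (`else: return o`) appears as context in this hunk. A plausible reconstruction of such a helper, an assumption rather than the file's verbatim code, would recurse through containers and drop numpy arrays so the printed dictionary stays readable:

<code>
import numpy as np

def remove_arrays(o):
    # assumed shape: strip numpy arrays, recurse into dicts and lists, pass the rest through
    if isinstance(o, np.ndarray):
        return "np.ndarray removed"
    elif isinstance(o, dict):
        return {k: remove_arrays(v) for k, v in o.items()}
    elif isinstance(o, list):
        return [remove_arrays(v) for v in o]
    else:
        return o
</code>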
@@ -591,8 +595,8 @@ def process_florence2_sam2(frames):
 
 with ThreadPoolExecutor() as executor:
     futures = {{
-        executor.submit(
-        executor.submit(
+        executor.submit(process_owlv2_sam2_video_tracking, frames): "owlv2_sam2_video_tracking",
+        executor.submit(process_florence2_sam2_video_tracking, frames): "florence2_sam2_video_tracking",
     }}
     final_results = {{}}
     for future in as_completed(futures):
@@ -686,6 +690,7 @@ FINALIZE_PLAN = """
 3. Include ALL relevant python code in your plan to accomplish the user request.
 4. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
 5. Do not include {excluded_tools} tools in your instructions.
+6. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_file` or `save_video`.
 6. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
 
 <json>
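The new step 6 asks plans to finish by visualizing and saving results. A sketch of that closing step for the video case, using only tool names this diff exports (`overlay_bounding_boxes`, `save_video`); the per-frame shape of the tracking output is assumed from the tracking example earlier in the file:

<code>
from vision_agent.tools import (
    extract_frames_and_timestamps,
    overlay_bounding_boxes,
    owlv2_sam2_video_tracking,
    save_video,
)

frames = [f["frame"] for f in extract_frames_and_timestamps("video.mp4", 1)]
track_predictions = owlv2_sam2_video_tracking("person", frames, chunk_length=1)

# draw each frame's predictions onto the frame, then write the result to disk
viz = [
    overlay_bounding_boxes(frame, preds)
    for frame, preds in zip(frames, track_predictions)
]
save_video(viz, "tracked.mp4")
</code>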
vision_agent/tools/__init__.py
CHANGED
@@ -26,25 +26,22 @@ from .tools import (
     claude35_text_extraction,
     closest_box_distance,
     closest_mask_distance,
-    countgd_example_based_counting,
     countgd_object_detection,
-
+    countgd_sam2_instance_segmentation,
     countgd_sam2_video_tracking,
+    countgd_visual_prompt_object_detection,
     depth_anything_v2,
     detr_segmentation,
     document_extraction,
     document_qa,
     extract_frames_and_timestamps,
+    florence2_object_detection,
     florence2_ocr,
-
-    florence2_phrase_grounding_video,
-    florence2_sam2_image,
+    florence2_sam2_instance_segmentation,
     florence2_sam2_video_tracking,
     flux_image_inpainting,
     generate_pose_image,
     get_tool_documentation,
-    gpt4o_image_vqa,
-    gpt4o_video_vqa,
     load_image,
     minimum_distance,
     ocr,
@@ -52,8 +49,8 @@ from .tools import (
     overlay_bounding_boxes,
     overlay_heat_map,
     overlay_segmentation_masks,
-
-
+    owlv2_object_detection,
+    owlv2_sam2_instance_segmentation,
     owlv2_sam2_video_tracking,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
@@ -62,7 +59,6 @@ from .tools import (
     save_json,
     save_video,
     siglip_classification,
-    stella_embeddings,
     template_match,
     video_temporal_localization,
     vit_image_classification,
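For downstream imports, the net effect of this export change is a set of removals and renames. The pairing below is inferred from the removed and added lines, not stated by the diff itself:

<code>
# removed in 0.2.222: countgd_example_based_counting, florence2_phrase_grounding_video,
# florence2_sam2_image, gpt4o_image_vqa, gpt4o_video_vqa, stella_embeddings
# new exports that appear to replace them (inferred mapping):
from vision_agent.tools import (
    countgd_visual_prompt_object_detection,   # visual-prompt (example-based) counting
    florence2_object_detection,
    florence2_sam2_instance_segmentation,     # formerly florence2_sam2_image
    owlv2_object_detection,
    owlv2_sam2_instance_segmentation,
)
</code>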