vision-agent 0.2.208__tar.gz → 0.2.209__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.208 → vision_agent-0.2.209}/PKG-INFO +2 -1
- {vision_agent-0.2.208 → vision_agent-0.2.209}/pyproject.toml +2 -1
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/.sim_tools/df.csv +61 -36
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_planner_prompts_v2.py +16 -15
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_planner_v2.py +1 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/__init__.py +4 -1
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/planner_tools.py +0 -4
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/tools.py +204 -25
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/image_utils.py +4 -4
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/sim.py +4 -1
- {vision_agent-0.2.208 → vision_agent-0.2.209}/LICENSE +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/README.md +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/types.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.208
|
3
|
+
Version: 0.2.209
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -31,6 +31,7 @@ Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
|
|
31
31
|
Requires-Dist: pytube (==15.0.0)
|
32
32
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
33
33
|
Requires-Dist: rich (>=13.7.1,<14.0.0)
|
34
|
+
Requires-Dist: scikit-learn (>=1.5.2,<2.0.0)
|
34
35
|
Requires-Dist: scipy (>=1.13.0,<1.14.0)
|
35
36
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
36
37
|
Requires-Dist: tenacity (>=8.3.0,<9.0.0)
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
4
4
|
|
5
5
|
[tool.poetry]
|
6
6
|
name = "vision-agent"
|
7
|
-
version = "0.2.208"
|
7
|
+
version = "0.2.209"
|
8
8
|
description = "Toolset for Vision Agent"
|
9
9
|
authors = ["Landing AI <dev@landing.ai>"]
|
10
10
|
readme = "README.md"
|
@@ -46,6 +46,7 @@ pydantic = "2.7.4"
|
|
46
46
|
av = "^11.0.0"
|
47
47
|
libcst = "^1.5.0"
|
48
48
|
matplotlib = "^3.9.2"
|
49
|
+
scikit-learn = "^1.5.2"
|
49
50
|
|
50
51
|
[tool.poetry.group.dev.dependencies]
|
51
52
|
autoflake = "1.*"
|
@@ -112,10 +112,11 @@ desc,doc,name
|
|
112
112
|
>>> vit_nsfw_classification(image)
|
113
113
|
{""label"": ""normal"", ""scores"": 0.68},
|
114
114
|
",vit_nsfw_classification
|
115
|
-
"'
|
116
|
-
'
|
117
|
-
given a text prompt. It is particularly useful when trying to detect and
|
118
|
-
large number of objects.
|
115
|
+
"'countgd_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
116
|
+
'countgd_object_detection' is a tool that can detect multiple instances of an
|
117
|
+
object given a text prompt. It is particularly useful when trying to detect and
|
118
|
+
count a large number of objects. You can optionally separate object names in the
|
119
|
+
prompt with commas. It returns a list of bounding boxes with normalized
|
119
120
|
coordinates, label names and associated confidence scores.
|
120
121
|
|
121
122
|
Parameters:
|
@@ -133,14 +134,51 @@ desc,doc,name
|
|
133
134
|
|
134
135
|
Example
|
135
136
|
-------
|
136
|
-
>>>
|
137
|
+
>>> countgd_object_detection(""flower"", image)
|
137
138
|
[
|
138
139
|
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
139
140
|
{'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
140
141
|
{'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
|
141
142
|
{'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
|
142
143
|
]
|
143
|
-
",
|
144
|
+
",countgd_object_detection
|
145
|
+
"'countgd_sam2_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names, masks associated confidence scores.","countgd_sam2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
146
|
+
'countgd_sam2_object_detection' is a tool that can detect multiple instances of
|
147
|
+
an object given a text prompt. It is particularly useful when trying to detect and
|
148
|
+
count a large number of objects. You can optionally separate object names in the
|
149
|
+
prompt with commas. It returns a list of bounding boxes with normalized coordinates,
|
150
|
+
label names, masks and associated confidence scores.
|
151
|
+
|
152
|
+
Parameters:
|
153
|
+
prompt (str): The object that needs to be counted.
|
154
|
+
image (np.ndarray): The image that contains multiple instances of the object.
|
155
|
+
box_threshold (float, optional): The threshold for detection. Defaults
|
156
|
+
to 0.23.
|
157
|
+
|
158
|
+
Returns:
|
159
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
160
|
+
bounding box, and mask of the detected objects with normalized coordinates
|
161
|
+
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
162
|
+
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
163
|
+
The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
|
164
|
+
the background.
|
165
|
+
|
166
|
+
Example
|
167
|
+
-------
|
168
|
+
>>> countgd_sam2_object_detection(""flower"", image)
|
169
|
+
[
|
170
|
+
{
|
171
|
+
'score': 0.49,
|
172
|
+
'label': 'flower',
|
173
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
174
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
175
|
+
[0, 0, 0, ..., 0, 0, 0],
|
176
|
+
...,
|
177
|
+
[0, 0, 0, ..., 0, 0, 0],
|
178
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
179
|
+
},
|
180
|
+
]
|
181
|
+
",countgd_sam2_object_detection
|
144
182
|
"'florence2_ocr' is a tool that can detect text and text regions in an image. Each text region contains one line of text. It returns a list of detected text, the text region as a bounding box with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","florence2_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
|
145
183
|
'florence2_ocr' is a tool that can detect text and text regions in an image.
|
146
184
|
Each text region contains one line of text. It returns a list of detected text,
|
@@ -214,8 +252,8 @@ desc,doc,name
|
|
214
252
|
|
215
253
|
Returns:
|
216
254
|
List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
|
217
|
-
label,segment mask and bounding boxes. The outer list represents each frame
|
218
|
-
the inner list is the entities per frame. The label contains the object ID
|
255
|
+
label, segment mask and bounding boxes. The outer list represents each frame
|
256
|
+
and the inner list is the entities per frame. The label contains the object ID
|
219
257
|
followed by the label name. The objects are only identified in the first frame
|
220
258
|
and tracked throughout the video.
|
221
259
|
|
@@ -237,12 +275,12 @@ desc,doc,name
|
|
237
275
|
...
|
238
276
|
]
|
239
277
|
",florence2_sam2_video_tracking
|
240
|
-
"'florence2_phrase_grounding' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated
|
278
|
+
"'florence2_phrase_grounding' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_phrase_grounding(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
|
241
279
|
'florence2_phrase_grounding' is a tool that can detect multiple
|
242
280
|
objects given a text prompt which can be object names or caption. You
|
243
281
|
can optionally separate the object names in the text with commas. It returns a list
|
244
282
|
of bounding boxes with normalized coordinates, label names and associated
|
245
|
-
|
283
|
+
confidence scores of 1.0.
|
246
284
|
|
247
285
|
Parameters:
|
248
286
|
prompt (str): The prompt to ground to the image.
|
@@ -353,37 +391,24 @@ desc,doc,name
|
|
353
391
|
[10, 11, 15, ..., 202, 202, 205],
|
354
392
|
[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
|
355
393
|
",generate_pose_image
|
356
|
-
'
|
357
|
-
'
|
358
|
-
|
359
|
-
|
360
|
-
mask1 (np.ndarray): The first mask.
|
361
|
-
mask2 (np.ndarray): The second mask.
|
362
|
-
|
363
|
-
Returns:
|
364
|
-
float: The closest distance between the two masks.
|
365
|
-
|
366
|
-
Example
|
367
|
-
-------
|
368
|
-
>>> closest_mask_distance(mask1, mask2)
|
369
|
-
0.5
|
370
|
-
",closest_mask_distance
|
371
|
-
'closest_box_distance' calculates the closest distance between two bounding boxes.,"closest_box_distance(box1: List[float], box2: List[float], image_size: Tuple[int, int]) -> float:
|
372
|
-
'closest_box_distance' calculates the closest distance between two bounding boxes.
|
394
|
+
"'minimum_distance' calculates the minimum distance between two detections which can include bounding boxes and or masks. This will return the closest distance between the objects, not the distance between the centers of the objects.","minimum_distance(det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]) -> float:
|
395
|
+
'minimum_distance' calculates the minimum distance between two detections which
|
396
|
+
can include bounding boxes and/or masks. This will return the closest distance
|
397
|
+
between the objects, not the distance between the centers of the objects.
|
373
398
|
|
374
399
|
Parameters:
|
375
|
-
|
376
|
-
|
400
|
+
det1 (Dict[str, Any]): The first detection of boxes or masks.
|
401
|
+
det2 (Dict[str, Any]): The second detection of boxes or masks.
|
377
402
|
image_size (Tuple[int, int]): The size of the image given as (height, width).
|
378
403
|
|
379
404
|
Returns:
|
380
|
-
float: The closest distance between the two
|
405
|
+
float: The closest distance between the two detections.
|
381
406
|
|
382
407
|
Example
|
383
408
|
-------
|
384
|
-
>>>
|
409
|
+
>>> minimum_distance(det1, det2, image_size)
|
385
410
|
141.42
|
386
|
-
",
|
411
|
+
",minimum_distance
|
387
412
|
'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen2_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
|
388
413
|
'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
|
389
414
|
images including regular images or images of documents or presentations. It can be
|
@@ -561,9 +586,9 @@ desc,doc,name
|
|
561
586
|
>>> save_video(frames)
|
562
587
|
""/tmp/tmpvideo123.mp4""
|
563
588
|
",save_video
|
564
|
-
'overlay_bounding_boxes' is a utility function that displays bounding boxes on an image.,"overlay_bounding_boxes(medias: Union[numpy.ndarray, List[numpy.ndarray]], bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]) -> Union[numpy.ndarray, List[numpy.ndarray]]:
|
589
|
+
'overlay_bounding_boxes' is a utility function that displays bounding boxes on an image. It will draw a box around the detected object with the label and score.,"overlay_bounding_boxes(medias: Union[numpy.ndarray, List[numpy.ndarray]], bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]) -> Union[numpy.ndarray, List[numpy.ndarray]]:
|
565
590
|
'overlay_bounding_boxes' is a utility function that displays bounding boxes on
|
566
|
-
an image.
|
591
|
+
an image. It will draw a box around the detected object with the label and score.
|
567
592
|
|
568
593
|
Parameters:
|
569
594
|
medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display the
|
@@ -581,9 +606,9 @@ desc,doc,name
|
|
581
606
|
image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
|
582
607
|
)
|
583
608
|
",overlay_bounding_boxes
|
584
|
-
'overlay_segmentation_masks' is a utility function that displays segmentation masks.,"overlay_segmentation_masks(medias: Union[numpy.ndarray, List[numpy.ndarray]], masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]], draw_label: bool = True, secondary_label_key: str = 'tracking_label') -> Union[numpy.ndarray, List[numpy.ndarray]]:
|
609
|
+
'overlay_segmentation_masks' is a utility function that displays segmentation masks. It will overlay a colored mask on the detected object with the label.,"overlay_segmentation_masks(medias: Union[numpy.ndarray, List[numpy.ndarray]], masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]], draw_label: bool = True, secondary_label_key: str = 'tracking_label') -> Union[numpy.ndarray, List[numpy.ndarray]]:
|
585
610
|
'overlay_segmentation_masks' is a utility function that displays segmentation
|
586
|
-
masks.
|
611
|
+
masks. It will overlay a colored mask on the detected object with the label.
|
587
612
|
|
588
613
|
Parameters:
|
589
614
|
medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
|
Binary file
|
{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_planner_prompts_v2.py
RENAMED
@@ -1,7 +1,7 @@
|
|
1
1
|
PLAN = """
|
2
2
|
**Role**: You are an expert planning agent that can understand the user request and search for a plan to accomplish it.
|
3
3
|
|
4
|
-
**Task**: As a planning agent you are required to understand the user's request and search for a plan to accomplish it. Use Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Esnure your response is clear, concise,
|
4
|
+
**Task**: As a planning agent you are required to understand the user's request and search for a plan to accomplish it. Use Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>, each execution is a new cell so old code and outputs are saved.
|
5
5
|
|
6
6
|
**Documentation**: this is the documentation for the functions you can use to accomplish the task:
|
7
7
|
{tool_desc}
|
@@ -18,7 +18,7 @@ PLAN = """
|
|
18
18
|
1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You can <count> number of turns to complete the user's request.
|
19
19
|
2. You can execute python code in the ipython notebook using <execute_python> tags. Only output one <execute_python> tag at a time.
|
20
20
|
3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
|
21
|
-
4. Only load/save files from {media_list} unless you specifically saved the previously.
|
21
|
+
4. Only load/save files from {media_list} unless you specifically saved the file previously.
|
22
22
|
5. Ensure you always call `suggestion` initially and `get_tool_for_task` to get the right tool for the subtask.
|
23
23
|
6. Calling `plt.imshow` or `save_image` will display the image to you, use this to visually check your results.
|
24
24
|
7. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
|
@@ -130,13 +130,14 @@ In these aerial images, I can see approximately 5-6 pedestrians walking in vario
|
|
130
130
|
[end of claude35_vqa_output]
|
131
131
|
|
132
132
|
[get_tool_for_task output]
|
133
|
-
After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The
|
133
|
+
After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool to count the number of pedestrians in the image.
|
134
134
|
|
135
135
|
Tool Documentation:
|
136
|
-
|
137
|
-
'
|
138
|
-
object given a text prompt. It
|
139
|
-
|
136
|
+
countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
137
|
+
'countgd_object_detection' is a tool that can detect multiple instances of an
|
138
|
+
object given a text prompt. It is particularly useful when trying to detect and
|
139
|
+
count a large number of objects. It returns a list of bounding boxes with
|
140
|
+
normalized coordinates, label names and associated confidence scores.
|
140
141
|
|
141
142
|
Parameters:
|
142
143
|
prompt (str): The object that needs to be counted.
|
@@ -153,7 +154,7 @@ countgd_counting(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23)
|
|
153
154
|
|
154
155
|
Example
|
155
156
|
-------
|
156
|
-
>>>
|
157
|
+
>>> countgd_object_detection("flower", image)
|
157
158
|
[
|
158
159
|
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
159
160
|
]
|
@@ -195,7 +196,7 @@ def merge_bounding_box_list(bboxes):
|
|
195
196
|
|
196
197
|
detections = []
|
197
198
|
for region in subdivide_image(image):
|
198
|
-
detections.extend(
|
199
|
+
detections.extend(countgd_object_detection("pedestrian", region))
|
199
200
|
detections = merge_bounding_box_list(detections)
|
200
201
|
print(len(detections))
|
201
202
|
</execute_python>
|
@@ -203,8 +204,8 @@ print(len(detections))
|
|
203
204
|
OBSERVATION: 24
|
204
205
|
<count>7</count>
|
205
206
|
|
206
|
-
AGENT: <thinking>
|
207
|
-
<finalize_plan>This plan successfully detected 24 pedestrians in the image by subdividing the image into smaller regions and using the
|
207
|
+
AGENT: <thinking>countgd_object_detection detected 24 pedestrians in the image by subdividing the image into smaller regions, this differs from claude35_vqa's answer of 5-6 pedestrians. I will use countgd_object_detection's answer since it is specifically designed to count objects.</thinking>
|
208
|
+
<finalize_plan>This plan successfully detected 24 pedestrians in the image by subdividing the image into smaller regions and using the countgd_object_detection tool on each region. The bounding boxes were then merged to count the unique pedestrians.</finalize_plan>
|
208
209
|
--- END EXAMPLE1 ---
|
209
210
|
"""
|
210
211
|
|
@@ -448,7 +449,7 @@ You are given a task: {task} from the user. Your task is to extract the type of
|
|
448
449
|
- "video object tracking" - tracking objects in a video.
|
449
450
|
- "depth and pose estimation" - estimating the depth or pose of objects in an image.
|
450
451
|
|
451
|
-
Return the category inside tags <category># your categories here</category>.
|
452
|
+
Return the category or categories (comma separated) inside tags <category># your categories here</category>.
|
452
453
|
"""
|
453
454
|
|
454
455
|
TEST_TOOLS = """
|
@@ -492,7 +493,7 @@ Count the number of pedestrians across all the images.
|
|
492
493
|
|
493
494
|
<code>
|
494
495
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
495
|
-
from vision_agent.tools import load_image, owl_v2_image, florence2_phrase_grounding,
|
496
|
+
from vision_agent.tools import load_image, owl_v2_image, florence2_phrase_grounding, countgd_object_detection
|
496
497
|
|
497
498
|
# process functions in a try catch so that if it fails it doesn't cause `as_completed` to hang
|
498
499
|
def process_owl_v2(image_paths):
|
@@ -520,7 +521,7 @@ def process_countgd(image_paths):
|
|
520
521
|
results = []
|
521
522
|
for image_path in image_paths:
|
522
523
|
image = load_image(image_path)
|
523
|
-
results.extend(
|
524
|
+
results.extend(countgd_object_detection("person", image))
|
524
525
|
except Exception as e:
|
525
526
|
results = f"Encountered error when executing process_countgd: {str(e)}"
|
526
527
|
return results
|
@@ -531,7 +532,7 @@ with ThreadPoolExecutor() as executor:
|
|
531
532
|
futures = {{
|
532
533
|
executor.submit(process_owl_v2, image_paths): "owl_v2_image",
|
533
534
|
executor.submit(process_florence2, image_paths): "florence2_phrase_grounding",
|
534
|
-
executor.submit(process_countgd, image_paths): "
|
535
|
+
executor.submit(process_countgd, image_paths): "countgd_object_detection",
|
535
536
|
}}
|
536
537
|
|
537
538
|
final_results = {{}}
|
@@ -28,8 +28,9 @@ from .tools import (
|
|
28
28
|
clip,
|
29
29
|
closest_box_distance,
|
30
30
|
closest_mask_distance,
|
31
|
-
countgd_counting,
|
32
31
|
countgd_example_based_counting,
|
32
|
+
countgd_object_detection,
|
33
|
+
countgd_sam2_object_detection,
|
33
34
|
depth_anything_v2,
|
34
35
|
detr_segmentation,
|
35
36
|
dpt_hybrid_midas,
|
@@ -56,6 +57,7 @@ from .tools import (
|
|
56
57
|
load_image,
|
57
58
|
loca_visual_prompt_counting,
|
58
59
|
loca_zero_shot_counting,
|
60
|
+
minimum_distance,
|
59
61
|
ocr,
|
60
62
|
overlay_bounding_boxes,
|
61
63
|
overlay_heat_map,
|
@@ -64,6 +66,7 @@ from .tools import (
|
|
64
66
|
owl_v2_video,
|
65
67
|
qwen2_vl_images_vqa,
|
66
68
|
qwen2_vl_video_vqa,
|
69
|
+
sam2,
|
67
70
|
save_image,
|
68
71
|
save_json,
|
69
72
|
save_video,
|
@@ -4,6 +4,7 @@ import logging
|
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
6
|
import urllib.request
|
7
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
7
8
|
from functools import lru_cache
|
8
9
|
from importlib import resources
|
9
10
|
from pathlib import Path
|
@@ -484,8 +485,8 @@ def florence2_sam2_video_tracking(
|
|
484
485
|
|
485
486
|
Returns:
|
486
487
|
List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
|
487
|
-
label,segment mask and bounding boxes. The outer list represents each frame
|
488
|
-
the inner list is the entities per frame. The label contains the object ID
|
488
|
+
label, segment mask and bounding boxes. The outer list represents each frame
|
489
|
+
and the inner list is the entities per frame. The label contains the object ID
|
489
490
|
followed by the label name. The objects are only identified in the first frame
|
490
491
|
and tracked throughout the video.
|
491
492
|
|
@@ -684,14 +685,15 @@ def loca_visual_prompt_counting(
|
|
684
685
|
return resp_data
|
685
686
|
|
686
687
|
|
687
|
-
def
|
688
|
+
def countgd_object_detection(
|
688
689
|
prompt: str,
|
689
690
|
image: np.ndarray,
|
690
691
|
box_threshold: float = 0.23,
|
691
692
|
) -> List[Dict[str, Any]]:
|
692
|
-
"""'
|
693
|
-
given a text prompt. It is particularly useful when trying to detect and
|
694
|
-
large number of objects.
|
693
|
+
"""'countgd_object_detection' is a tool that can detect multiple instances of an
|
694
|
+
object given a text prompt. It is particularly useful when trying to detect and
|
695
|
+
count a large number of objects. You can optionally separate object names in the
|
696
|
+
prompt with commas. It returns a list of bounding boxes with normalized
|
695
697
|
coordinates, label names and associated confidence scores.
|
696
698
|
|
697
699
|
Parameters:
|
@@ -709,7 +711,7 @@ def countgd_counting(
|
|
709
711
|
|
710
712
|
Example
|
711
713
|
-------
|
712
|
-
>>>
|
714
|
+
>>> countgd_object_detection("flower", image)
|
713
715
|
[
|
714
716
|
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
715
717
|
{'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
@@ -723,19 +725,28 @@ def countgd_counting(
|
|
723
725
|
|
724
726
|
buffer_bytes = numpy_to_bytes(image)
|
725
727
|
files = [("image", buffer_bytes)]
|
726
|
-
|
727
|
-
"prompts": [prompt.replace(", ", ". ")],
|
728
|
-
"confidence": box_threshold, # still not being used in the API
|
729
|
-
"model": "countgd",
|
730
|
-
}
|
731
|
-
metadata = {"function_name": "countgd_counting"}
|
728
|
+
prompts = [p.strip() for p in prompt.split(", ")]
|
732
729
|
|
733
|
-
|
734
|
-
payload
|
735
|
-
|
730
|
+
def _run_countgd(prompt: str) -> List[Dict[str, Any]]:
|
731
|
+
payload = {
|
732
|
+
"prompts": [prompt],
|
733
|
+
"confidence": box_threshold, # still not being used in the API
|
734
|
+
"model": "countgd",
|
735
|
+
}
|
736
|
+
metadata = {"function_name": "countgd_counting"}
|
737
|
+
|
738
|
+
detections = send_task_inference_request(
|
739
|
+
payload, "text-to-object-detection", files=files, metadata=metadata
|
740
|
+
)
|
741
|
+
# get the first frame
|
742
|
+
return detections[0] # type: ignore
|
743
|
+
|
744
|
+
bboxes = []
|
745
|
+
with ThreadPoolExecutor() as executor:
|
746
|
+
futures = [executor.submit(_run_countgd, prompt) for prompt in prompts]
|
747
|
+
for future in as_completed(futures):
|
748
|
+
bboxes.extend(future.result())
|
736
749
|
|
737
|
-
# get the first frame
|
738
|
-
bboxes = detections[0]
|
739
750
|
bboxes_formatted = [
|
740
751
|
ODResponseData(
|
741
752
|
label=bbox["label"],
|
@@ -750,6 +761,131 @@ def countgd_counting(
|
|
750
761
|
return single_nms(return_data, iou_threshold=0.80)
|
751
762
|
|
752
763
|
|
764
|
+
def sam2(
|
765
|
+
image: np.ndarray,
|
766
|
+
detections: List[Dict[str, Any]],
|
767
|
+
) -> List[Dict[str, Any]]:
|
768
|
+
"""'sam2' is a tool that can segment multiple objects given an input bounding box,
|
769
|
+
label and score. It returns a set of masks along with the corresponding bounding
|
770
|
+
boxes and labels.
|
771
|
+
|
772
|
+
Parameters:
|
773
|
+
image (np.ndarray): The image that contains multiple instances of the object.
|
774
|
+
detections (List[Dict[str, Any]]): A list of dictionaries containing the score,
|
775
|
+
label, and bounding box of the detected objects with normalized coordinates
|
776
|
+
between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
|
777
|
+
of the top-left and xmax and ymax are the coordinates of the bottom-right of
|
778
|
+
the bounding box.
|
779
|
+
|
780
|
+
Returns:
|
781
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
782
|
+
bounding box, and mask of the detected objects with normalized coordinates
|
783
|
+
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
784
|
+
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
785
|
+
The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
|
786
|
+
the background.
|
787
|
+
|
788
|
+
Example
|
789
|
+
-------
|
790
|
+
>>> sam2(image, [
|
791
|
+
{'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
792
|
+
])
|
793
|
+
[
|
794
|
+
{
|
795
|
+
'score': 0.49,
|
796
|
+
'label': 'flower',
|
797
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
798
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
799
|
+
[0, 0, 0, ..., 0, 0, 0],
|
800
|
+
...,
|
801
|
+
[0, 0, 0, ..., 0, 0, 0],
|
802
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
803
|
+
},
|
804
|
+
]
|
805
|
+
"""
|
806
|
+
image_size = image.shape[:2]
|
807
|
+
|
808
|
+
files = [("images", numpy_to_bytes(image))]
|
809
|
+
payload = {
|
810
|
+
"model": "sam2",
|
811
|
+
"bboxes": json.dumps(
|
812
|
+
[
|
813
|
+
{
|
814
|
+
"labels": [d["label"] for d in detections],
|
815
|
+
"bboxes": [
|
816
|
+
denormalize_bbox(d["bbox"], image_size) for d in detections
|
817
|
+
],
|
818
|
+
}
|
819
|
+
]
|
820
|
+
),
|
821
|
+
}
|
822
|
+
metadata = {"function_name": "sam2"}
|
823
|
+
pred_detections = send_task_inference_request(
|
824
|
+
payload, "sam2", files=files, metadata=metadata
|
825
|
+
)
|
826
|
+
frame = pred_detections[0]
|
827
|
+
return_data = []
|
828
|
+
for inp_detection, detection in zip(detections, frame):
|
829
|
+
mask = rle_decode_array(detection["mask"])
|
830
|
+
label = detection["label"]
|
831
|
+
bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
|
832
|
+
return_data.append(
|
833
|
+
{
|
834
|
+
"label": label,
|
835
|
+
"bbox": bbox,
|
836
|
+
"mask": mask,
|
837
|
+
"score": inp_detection["score"],
|
838
|
+
}
|
839
|
+
)
|
840
|
+
return return_data
|
841
|
+
|
842
|
+
|
843
|
+
def countgd_sam2_object_detection(
|
844
|
+
prompt: str,
|
845
|
+
image: np.ndarray,
|
846
|
+
box_threshold: float = 0.23,
|
847
|
+
) -> List[Dict[str, Any]]:
|
848
|
+
"""'countgd_sam2_object_detection' is a tool that can detect multiple instances of
|
849
|
+
an object given a text prompt. It is particularly useful when trying to detect and
|
850
|
+
count a large number of objects. You can optionally separate object names in the
|
851
|
+
prompt with commas. It returns a list of bounding boxes with normalized coordinates,
|
852
|
+
label names, masks and associated confidence scores.
|
853
|
+
|
854
|
+
Parameters:
|
855
|
+
prompt (str): The object that needs to be counted.
|
856
|
+
image (np.ndarray): The image that contains multiple instances of the object.
|
857
|
+
box_threshold (float, optional): The threshold for detection. Defaults
|
858
|
+
to 0.23.
|
859
|
+
|
860
|
+
Returns:
|
861
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
862
|
+
bounding box, and mask of the detected objects with normalized coordinates
|
863
|
+
(xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
|
864
|
+
and xmax and ymax are the coordinates of the bottom-right of the bounding box.
|
865
|
+
The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
|
866
|
+
the background.
|
867
|
+
|
868
|
+
Example
|
869
|
+
-------
|
870
|
+
>>> countgd_sam2_object_detection("flower", image)
|
871
|
+
[
|
872
|
+
{
|
873
|
+
'score': 0.49,
|
874
|
+
'label': 'flower',
|
875
|
+
'bbox': [0.1, 0.11, 0.35, 0.4],
|
876
|
+
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
877
|
+
[0, 0, 0, ..., 0, 0, 0],
|
878
|
+
...,
|
879
|
+
[0, 0, 0, ..., 0, 0, 0],
|
880
|
+
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
881
|
+
},
|
882
|
+
]
|
883
|
+
"""
|
884
|
+
detections = countgd_object_detection(prompt, image, box_threshold)
|
885
|
+
detections_with_masks = sam2(image, detections)
|
886
|
+
return detections_with_masks
|
887
|
+
|
888
|
+
|
753
889
|
def countgd_example_based_counting(
|
754
890
|
visual_prompts: List[List[float]],
|
755
891
|
image: np.ndarray,
|
@@ -1299,7 +1435,7 @@ def florence2_phrase_grounding(
|
|
1299
1435
|
objects given a text prompt which can be object names or caption. You
|
1300
1436
|
can optionally separate the object names in the text with commas. It returns a list
|
1301
1437
|
of bounding boxes with normalized coordinates, label names and associated
|
1302
|
-
|
1438
|
+
confidence scores of 1.0.
|
1303
1439
|
|
1304
1440
|
Parameters:
|
1305
1441
|
prompt (str): The prompt to ground to the image.
|
@@ -1732,6 +1868,35 @@ def template_match(
|
|
1732
1868
|
return return_data
|
1733
1869
|
|
1734
1870
|
|
1871
|
+
def minimum_distance(
|
1872
|
+
det1: Dict[str, Any], det2: Dict[str, Any], image_size: Tuple[int, int]
|
1873
|
+
) -> float:
|
1874
|
+
"""'minimum_distance' calculates the minimum distance between two detections which
|
1875
|
+
can include bounding boxes and/or masks. This will return the closest distance
|
1876
|
+
between the objects, not the distance between the centers of the objects.
|
1877
|
+
|
1878
|
+
Parameters:
|
1879
|
+
det1 (Dict[str, Any]): The first detection of boxes or masks.
|
1880
|
+
det2 (Dict[str, Any]): The second detection of boxes or masks.
|
1881
|
+
image_size (Tuple[int, int]): The size of the image given as (height, width).
|
1882
|
+
|
1883
|
+
Returns:
|
1884
|
+
float: The closest distance between the two detections.
|
1885
|
+
|
1886
|
+
Example
|
1887
|
+
-------
|
1888
|
+
>>> minimum_distance(det1, det2, image_size)
|
1889
|
+
141.42
|
1890
|
+
"""
|
1891
|
+
|
1892
|
+
if "mask" in det1 and "mask" in det2:
|
1893
|
+
return closest_mask_distance(det1["mask"], det2["mask"])
|
1894
|
+
elif "bbox" in det1 and "bbox" in det2:
|
1895
|
+
return closest_box_distance(det1["bbox"], det2["bbox"], image_size)
|
1896
|
+
else:
|
1897
|
+
raise ValueError("Both detections must have either bbox or mask")
|
1898
|
+
|
1899
|
+
|
1735
1900
|
def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
|
1736
1901
|
"""'closest_mask_distance' calculates the closest distance between two masks.
|
1737
1902
|
|
@@ -2156,7 +2321,7 @@ def overlay_bounding_boxes(
|
|
2156
2321
|
bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
|
2157
2322
|
) -> Union[np.ndarray, List[np.ndarray]]:
|
2158
2323
|
"""'overlay_bounding_boxes' is a utility function that displays bounding boxes on
|
2159
|
-
an image.
|
2324
|
+
an image. It will draw a box around the detected object with the label and score.
|
2160
2325
|
|
2161
2326
|
Parameters:
|
2162
2327
|
medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display the
|
@@ -2270,7 +2435,7 @@ def overlay_segmentation_masks(
|
|
2270
2435
|
secondary_label_key: str = "tracking_label",
|
2271
2436
|
) -> Union[np.ndarray, List[np.ndarray]]:
|
2272
2437
|
"""'overlay_segmentation_masks' is a utility function that displays segmentation
|
2273
|
-
masks.
|
2438
|
+
masks. It will overlay a colored mask on the detected object with the label.
|
2274
2439
|
|
2275
2440
|
Parameters:
|
2276
2441
|
medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
|
@@ -2329,11 +2494,25 @@ def overlay_segmentation_masks(
|
|
2329
2494
|
mask = elt["mask"]
|
2330
2495
|
label = elt["label"]
|
2331
2496
|
tracking_lbl = elt.get(secondary_label_key, None)
|
2497
|
+
|
2498
|
+
# Create semi-transparent mask overlay
|
2332
2499
|
np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
|
2333
|
-
np_mask[mask > 0, :] = color[label] + (255 * 0.
|
2500
|
+
np_mask[mask > 0, :] = color[label] + (255 * 0.7,)
|
2334
2501
|
mask_img = Image.fromarray(np_mask.astype(np.uint8))
|
2335
2502
|
pil_image = Image.alpha_composite(pil_image, mask_img)
|
2336
2503
|
|
2504
|
+
# Draw contour border
|
2505
|
+
mask_uint8 = mask.astype(np.uint8) * 255
|
2506
|
+
contours, _ = cv2.findContours(
|
2507
|
+
mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
2508
|
+
)
|
2509
|
+
border_mask = np.zeros(
|
2510
|
+
(pil_image.size[1], pil_image.size[0], 4), dtype=np.uint8
|
2511
|
+
)
|
2512
|
+
cv2.drawContours(border_mask, contours, -1, color[label] + (255,), 8)
|
2513
|
+
border_img = Image.fromarray(border_mask)
|
2514
|
+
pil_image = Image.alpha_composite(pil_image, border_img)
|
2515
|
+
|
2337
2516
|
if draw_label:
|
2338
2517
|
draw = ImageDraw.Draw(pil_image)
|
2339
2518
|
text = tracking_lbl if tracking_lbl else label
|
@@ -2452,7 +2631,8 @@ FUNCTION_TOOLS = [
|
|
2452
2631
|
ocr,
|
2453
2632
|
vit_image_classification,
|
2454
2633
|
vit_nsfw_classification,
|
2455
|
-
|
2634
|
+
countgd_object_detection,
|
2635
|
+
countgd_sam2_object_detection,
|
2456
2636
|
florence2_ocr,
|
2457
2637
|
florence2_sam2_image,
|
2458
2638
|
florence2_sam2_video_tracking,
|
@@ -2461,8 +2641,7 @@ FUNCTION_TOOLS = [
|
|
2461
2641
|
detr_segmentation,
|
2462
2642
|
depth_anything_v2,
|
2463
2643
|
generate_pose_image,
|
2464
|
-
|
2465
|
-
closest_box_distance,
|
2644
|
+
minimum_distance,
|
2466
2645
|
qwen2_vl_images_vqa,
|
2467
2646
|
qwen2_vl_video_vqa,
|
2468
2647
|
video_temporal_localization,
|
@@ -42,10 +42,10 @@ def normalize_bbox(
|
|
42
42
|
) -> List[float]:
|
43
43
|
r"""Normalize the bounding box coordinates to be between 0 and 1."""
|
44
44
|
x1, y1, x2, y2 = bbox
|
45
|
-
x1 = max(round(x1 / image_size[1],
|
46
|
-
y1 = max(round(y1 / image_size[0],
|
47
|
-
x2 = min(round(x2 / image_size[1],
|
48
|
-
y2 = min(round(y2 / image_size[0],
|
45
|
+
x1 = max(round(x1 / image_size[1], 3), 0)
|
46
|
+
y1 = max(round(y1 / image_size[0], 3), 0)
|
47
|
+
x2 = min(round(x2 / image_size[1], 3), image_size[1])
|
48
|
+
y2 = min(round(y2 / image_size[0], 3), image_size[0])
|
49
49
|
return [x1, y1, x2, y2]
|
50
50
|
|
51
51
|
|
@@ -109,7 +109,10 @@ class Sim:
|
|
109
109
|
|
110
110
|
@lru_cache(maxsize=256)
|
111
111
|
def top_k(
|
112
|
-
self,
|
112
|
+
self,
|
113
|
+
query: str,
|
114
|
+
k: int = 5,
|
115
|
+
thresh: Optional[float] = None,
|
113
116
|
) -> Sequence[Dict]:
|
114
117
|
"""Returns the top k most similar items to the query.
|
115
118
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_coder_prompts_v2.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.208 → vision_agent-0.2.209}/vision_agent/agent/vision_agent_planner_prompts.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|