vision-agent 0.2.153__py3-none-any.whl → 0.2.155__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder_prompts.py +5 -5
- vision_agent/agent/vision_agent_prompts.py +7 -7
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/meta_tools.py +2 -2
- vision_agent/tools/tools.py +15 -12
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.155.dist-info}/METADATA +1 -1
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.155.dist-info}/RECORD +9 -9
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.155.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.155.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder_prompts.py
CHANGED
@@ -101,7 +101,7 @@ plan1:
 - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:
 
 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)
 
 # plan2
-florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)
 
 # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)
 
 final_out = {{
     "owl_v2_video": owl_v2_out,
-    "florence2_phrase_grounding_image": florence2_out,
+    "florence2_phrase_grounding": florence2_out,
     "florence2_sam2_video_tracking": f2s2_out,
 }}
 
 counts = {{
     "owl_v2_video": owl_v2_counts,
-    "florence2_phrase_grounding_image": florence2_counts,
+    "florence2_phrase_grounding": florence2_counts,
     "florence2_sam2_video_tracking": f2s2_counts,
 }}
 
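The plan code above calls a `get_counts` helper that is defined elsewhere in the same prompt and not shown in these hunks. A purely illustrative reconstruction, assuming it summarizes per-frame detection counts — the actual helper in the prompt may differ:

```python
from typing import Any, Dict, List

def get_counts(detections_per_frame: List[List[Dict[str, Any]]]) -> Dict[str, int]:
    # Illustrative only: summarize how many detections each plan produced per frame
    counts = [len(frame_dets) for frame_dets in detections_per_frame]
    return {"min": min(counts), "max": max(counts)}
```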
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi
 
 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = florence2_phrase_grounding_image("worker, helmet", image)
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
-from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 def count_workers_with_helmets(image_path: str, output_path: str):
     image = load_image(image_path)
--    detections = florence2_phrase_grounding_image("worker, helmet", image)
-+    detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
     workers = [d for d in detections if d['label'] == 'worker']
     helmets = [d for d in detections if d['label'] == 'helmet']
     count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
 ----- stdout -----
 3
 
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
vision_agent/tools/__init__.py
CHANGED
@@ -24,7 +24,8 @@ from .tools import (
     extract_frames_and_timestamps,
     florence2_image_caption,
     florence2_ocr,
-    florence2_phrase_grounding_image,
+    florence2_phrase_grounding,
+    florence2_phrase_grounding_video,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
vision_agent/tools/meta_tools.py
CHANGED
@@ -668,8 +668,8 @@ def use_object_detection_fine_tuning(
 
     patterns_with_fine_tune_id = [
         (
-            r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
             r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
vision_agent/tools/tools.py
CHANGED
@@ -700,18 +700,22 @@ def countgd_counting(
             {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
         ]
     """
-
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
     prompt = prompt.replace(", ", " .")
-    payload = {"
+    payload = {"prompts": [prompt], "model": "countgd"}
     metadata = {"function_name": "countgd_counting"}
-    resp_data = send_task_inference_request(
+    resp_data = send_task_inference_request(
+        payload, "text-to-object-detection", files=files, metadata=metadata
+    )
+    bboxes_per_frame = resp_data[0]
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
             score=round(bbox["score"], 2),
         )
-        for bbox in
+        for bbox in bboxes_per_frame
     ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
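The reworked body uploads the image once and routes through `send_task_inference_request`, but the return shape documented in the docstring is unchanged. A small consumer sketch against that shape; the image path and score cutoff are illustrative assumptions, not part of this diff:

```python
from vision_agent.tools import countgd_counting, load_image

image = load_image("flowers.jpg")  # hypothetical input image
detections = countgd_counting("flower", image)

# Each entry follows the documented shape:
# {'score': 0.98, 'label': 'flower', 'bbox': [x1, y1, x2, y2]} (normalized coordinates)
confident = [d for d in detections if d["score"] >= 0.5]
print(f"{len(confident)} flowers detected with score >= 0.5")
```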
@@ -1143,10 +1147,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding_image(
+def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+    """'florence2_phrase_grounding' will run florence2 on a image. It can
     detect multiple objects given a text prompt which can be object names or caption.
     You can optionally separate the object names in the text with commas. It returns
     a list of bounding boxes with normalized coordinates, label names and associated
@@ -1167,7 +1171,7 @@ def florence2_phrase_grounding_image(
 
     Example
     -------
-        >>> florence2_phrase_grounding_image('person looking at a coyote', image)
+        >>> florence2_phrase_grounding('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1196,7 +1200,7 @@ def florence2_phrase_grounding_image(
         "florence2-ft",
         v2=True,
         is_form=True,
-        metadata_payload={"function_name": "florence2_phrase_grounding_image"},
+        metadata_payload={"function_name": "florence2_phrase_grounding"},
     )
     # get the first frame
     detection = detections[0]
@@ -1205,7 +1209,7 @@ def florence2_phrase_grounding_image(
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florence2_phrase_grounding_image",
+        "function_name": "florence2_phrase_grounding",
     }
     detections = send_inference_request(data, "florence2", v2=True)
     detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
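Across these hunks the tool is only renamed; the signature shown above (`prompt`, `image`, optional `fine_tune_id`) is unchanged. A minimal call sketch against the renamed API — the placeholder image and the reuse of the example fine-tune id are assumptions for illustration:

```python
import numpy as np
from vision_agent.tools import florence2_phrase_grounding

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder frame; use a real image in practice

# Base model: the prompt can be comma-separated object names, per the docstring above
detections = florence2_phrase_grounding("worker, helmet", image)

# Passing the optional fine_tune_id routes the request to a fine-tuned checkpoint
detections_ft = florence2_phrase_grounding(
    "worker, helmet", image, fine_tune_id="23b3b022-5ebf-4798-9373-20ef36429abf"
)
# Both return [{'score': ..., 'label': ..., 'bbox': [x1, y1, x2, y2]}] with normalized coordinates
```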
@@ -2164,8 +2168,7 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
-    florence2_phrase_grounding_image,
-    florence2_phrase_grounding_video,
+    florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,
{vision_agent-0.2.153.dist-info → vision_agent-0.2.155.dist-info}/RECORD
CHANGED
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
 vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
 vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
 vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
+vision_agent/agent/vision_agent_prompts.py,sha256=3n92aF-jpUyyrAy06izdHIMPEMZPKD1JV0wfQvt-PD8,11251
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/__init__.py,sha256=u41fm9KGX1s9DWzVAGnuungEooxH4X8fSDk5hjXvDiY,2450
+vision_agent/tools/meta_tools.py,sha256=FN2oMhXzCzSzmk6Na6uKw1r5-CGO3lCk94izcWNFKwA,25167
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=3T5h9dewsqkKu66BlNdBwXnEKNCBl0_FhdHwTNYQolI,78471
 vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
-vision_agent-0.2.153.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.153.dist-info/METADATA,sha256=
-vision_agent-0.2.153.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.153.dist-info/RECORD,,
+vision_agent-0.2.155.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.155.dist-info/METADATA,sha256=lueDmQRoKz_BUNDRApWHxege_xxXnPI117OBh1nZJcg,13758
+vision_agent-0.2.155.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.155.dist-info/RECORD,,
{vision_agent-0.2.153.dist-info → vision_agent-0.2.155.dist-info}/LICENSE
File without changes
{vision_agent-0.2.153.dist-info → vision_agent-0.2.155.dist-info}/WHEEL
File without changes