vision-agent 0.2.153__py3-none-any.whl → 0.2.154__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder_prompts.py +5 -5
- vision_agent/agent/vision_agent_prompts.py +7 -7
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/meta_tools.py +2 -2
- vision_agent/tools/tools.py +6 -7
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/METADATA +1 -1
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/RECORD +9 -9
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder_prompts.py
CHANGED
@@ -101,7 +101,7 @@ plan1:
 - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the '
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:

 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video,
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking

 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)

 # plan2
-florence2_out = [
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)

 # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)

 final_out = {{
 "owl_v2_video": owl_v2_out,
-"
+"florence2_phrase_grounding": florence2_out,
 "florence2_sam2_video_tracking": f2s2_out,
 }}

 counts = {{
 "owl_v2_video": owl_v2_counts,
-"
+"florence2_phrase_grounding": florence2_counts,
 "florence2_sam2_video_tracking": f2s2_counts,
 }}

vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi

 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image,
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2| image = load_image(image_path)
-3| detections =
+3| detections = florence2_phrase_grounding("worker, helmet", image)
 4| workers = [d for d in detections if d['label'] == 'worker']
 5| helmets = [d for d in detections if d['label'] == 'helmet']
 6| count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 def count_workers_with_helmets(image_path: str, output_path: str):
 image = load_image(image_path)
-- detections =
-+ detections =
+- detections = florence2_phrase_grounding("worker, helmet", image)
++ detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
 workers = [d for d in detections if d['label'] == 'worker']
 helmets = [d for d in detections if d['label'] == 'helmet']
 count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
 ----- stdout -----
 3

-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
vision_agent/tools/__init__.py
CHANGED
@@ -24,7 +24,8 @@ from .tools import (
 extract_frames_and_timestamps,
 florence2_image_caption,
 florence2_ocr,
-
+florence2_phrase_grounding,
+florence2_phrase_grounding_video,
 florence2_roberta_vqa,
 florence2_sam2_image,
 florence2_sam2_video_tracking,
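For reference, a minimal import sketch against the updated export list above; it assumes nothing beyond the two names added in this hunk being re-exported from `vision_agent.tools` in 0.2.154.

```python
# Both grounding tools shown in the hunk above are importable from the package namespace.
from vision_agent.tools import (
    florence2_phrase_grounding,        # image-level phrase grounding
    florence2_phrase_grounding_video,  # video-level phrase grounding
)
```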
vision_agent/tools/meta_tools.py
CHANGED
@@ -668,8 +668,8 @@ def use_object_detection_fine_tuning(

 patterns_with_fine_tune_id = [
 (
-r'
-lambda match: f'
+r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
 ),
 (
 r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
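To illustrate what the updated pattern does, here is a small, self-contained sketch (not the packaged implementation) that applies the regex and replacement lambda from the hunk above to a sample line; the sample source line and the fine-tune id are taken from the prompt example earlier in this diff.

```python
import re

# Regex and replacement taken from the meta_tools.py hunk above: rewrite an
# existing florence2_phrase_grounding(...) call so it carries a fine-tune id
# as the third argument, replacing any id that was already there.
PATTERN = r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"  # example id used elsewhere in this diff

line = 'detections = florence2_phrase_grounding("worker, helmet", image)'
patched = re.sub(
    PATTERN,
    lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
    line,
)
print(patched)
# detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```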
vision_agent/tools/tools.py
CHANGED
@@ -1143,10 +1143,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
 return answer[task] # type: ignore


-def
+def florence2_phrase_grounding(
 prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-"""'
+"""'florence2_phrase_grounding' will run florence2 on a image. It can
 detect multiple objects given a text prompt which can be object names or caption.
 You can optionally separate the object names in the text with commas. It returns
 a list of bounding boxes with normalized coordinates, label names and associated
@@ -1167,7 +1167,7 @@ def florence2_phrase_grounding_image(

 Example
 -------
->>>
+>>> florence2_phrase_grounding('person looking at a coyote', image)
 [
 {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
 {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1196,7 +1196,7 @@ def florence2_phrase_grounding_image(
 "florence2-ft",
 v2=True,
 is_form=True,
-metadata_payload={"function_name": "
+metadata_payload={"function_name": "florence2_phrase_grounding"},
 )
 # get the first frame
 detection = detections[0]
@@ -1205,7 +1205,7 @@ def florence2_phrase_grounding_image(
 "image": image_b64,
 "task": "<CAPTION_TO_PHRASE_GROUNDING>",
 "prompt": prompt,
-"function_name": "
+"function_name": "florence2_phrase_grounding",
 }
 detections = send_inference_request(data, "florence2", v2=True)
 detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
@@ -2164,8 +2164,7 @@ FUNCTION_TOOLS = [
 florence2_ocr,
 florence2_sam2_image,
 florence2_sam2_video_tracking,
-
-florence2_phrase_grounding_video,
+florence2_phrase_grounding,
 ixc25_image_vqa,
 ixc25_video_vqa,
 detr_segmentation,
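As a quick orientation to the signature shown in the tools.py hunks above, a minimal usage sketch follows. It is a sketch, not package documentation: the file name "workers.jpg" and the use of `load_image` are illustrative assumptions; the call shape (prompt, image, optional fine_tune_id) and the returned keys (score, label, bbox) come from the hunks above.

```python
# Usage sketch for the renamed tool (assumes vision-agent 0.2.154 is installed).
import numpy as np
from vision_agent.tools import load_image, florence2_phrase_grounding

# "workers.jpg" is a hypothetical local image used only for illustration.
image: np.ndarray = load_image("workers.jpg")

# Default Florence-2 model:
detections = florence2_phrase_grounding("worker, helmet", image)

# Optionally pass a fine-tune id as the third argument, which is exactly what the
# updated use_object_detection_fine_tuning pattern rewrites existing calls to do:
# detections = florence2_phrase_grounding(
#     "worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf"
# )

for det in detections:
    # Each detection carries a confidence score, a label, and a normalized bounding box.
    print(det["score"], det["label"], det["bbox"])
```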
{vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/RECORD
CHANGED
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
 vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
 vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
 vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
+vision_agent/agent/vision_agent_prompts.py,sha256=3n92aF-jpUyyrAy06izdHIMPEMZPKD1JV0wfQvt-PD8,11251
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/__init__.py,sha256=u41fm9KGX1s9DWzVAGnuungEooxH4X8fSDk5hjXvDiY,2450
+vision_agent/tools/meta_tools.py,sha256=FN2oMhXzCzSzmk6Na6uKw1r5-CGO3lCk94izcWNFKwA,25167
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=l6PreDTod2tJYuaaWxJljJ7PDsKtBV3YJ-nHgl0AzeI,78332
 vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.154.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.154.dist-info/METADATA,sha256=gpOhgbgqKO1ypr5HfV4ZLpn0jFA3_v5MX4qNe08154g,13758
+vision_agent-0.2.154.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.154.dist-info/RECORD,,
{vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/LICENSE
File without changes
{vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/WHEEL
File without changes