vision-agent 0.2.153__py3-none-any.whl → 0.2.154__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/vision_agent_coder_prompts.py +5 -5
- vision_agent/agent/vision_agent_prompts.py +7 -7
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/meta_tools.py +2 -2
- vision_agent/tools/tools.py +6 -7
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/METADATA +1 -1
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/RECORD +9 -9
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.153.dist-info → vision_agent-0.2.154.dist-info}/WHEEL +0 -0
@@ -101,7 +101,7 @@ plan1:
|
|
101
101
|
- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
|
102
102
|
plan2:
|
103
103
|
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
104
|
-
- Use the '
|
104
|
+
- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
|
105
105
|
plan3:
|
106
106
|
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
107
107
|
- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
|
@@ -109,7 +109,7 @@ plan3:
|
|
109
109
|
|
110
110
|
```python
|
111
111
|
import numpy as np
|
112
|
-
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video,
|
112
|
+
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
|
113
113
|
|
114
114
|
# sample at 1 FPS and use the first 10 frames to reduce processing time
|
115
115
|
frames = extract_frames_and_timestamps("video.mp4", 1)
|
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
|
|
143
143
|
owl_v2_counts = get_counts(owl_v2_out)
|
144
144
|
|
145
145
|
# plan2
|
146
|
-
florence2_out = [
|
146
|
+
florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
|
147
147
|
florence2_counts = get_counts(florence2_out)
|
148
148
|
|
149
149
|
# plan3
|
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)
|
|
153
153
|
|
154
154
|
final_out = {{
|
155
155
|
"owl_v2_video": owl_v2_out,
|
156
|
-
"
|
156
|
+
"florence2_phrase_grounding": florence2_out,
|
157
157
|
"florence2_sam2_video_tracking": f2s2_out,
|
158
158
|
}}
|
159
159
|
|
160
160
|
counts = {{
|
161
161
|
"owl_v2_video": owl_v2_counts,
|
162
|
-
"
|
162
|
+
"florence2_phrase_grounding": florence2_counts,
|
163
163
|
"florence2_sam2_video_tracking": f2s2_counts,
|
164
164
|
}}
|
165
165
|
|
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi
|
|
131
131
|
|
132
132
|
OBSERVATION:
|
133
133
|
[Artifact code.py]
|
134
|
-
0|from vision_agent.tools import load_image,
|
134
|
+
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
|
135
135
|
1|def count_workers_with_helmets(image_path: str, output_path: str):
|
136
136
|
2| image = load_image(image_path)
|
137
|
-
3| detections =
|
137
|
+
3| detections = florence2_phrase_grounding("worker, helmet", image)
|
138
138
|
4| workers = [d for d in detections if d['label'] == 'worker']
|
139
139
|
5| helmets = [d for d in detections if d['label'] == 'helmet']
|
140
140
|
6| count = 0
|
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
|
|
166
166
|
OBSERVATION:
|
167
167
|
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
|
168
168
|
|
169
|
-
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original
|
169
|
+
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will now update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
|
170
170
|
|
171
171
|
OBSERVATION:
|
172
172
|
[Artifact code.py edits]
|
173
173
|
---
|
174
174
|
+++
|
175
175
|
@@ -1,7 +1,7 @@
|
176
|
-
from vision_agent.tools import load_image,
|
176
|
+
from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
|
177
177
|
def count_workers_with_helmets(image_path: str, output_path: str):
|
178
178
|
image = load_image(image_path)
|
179
|
-
- detections =
|
180
|
-
+ detections =
|
179
|
+
- detections = florence2_phrase_grounding("worker, helmet", image)
|
180
|
+
+ detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
|
181
181
|
workers = [d for d in detections if d['label'] == 'worker']
|
182
182
|
helmets = [d for d in detections if d['label'] == 'helmet']
|
183
183
|
count = 0
|
@@ -189,5 +189,5 @@ OBSERVATION:
|
|
189
189
|
----- stdout -----
|
190
190
|
3
|
191
191
|
|
192
|
-
AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned
|
192
|
+
AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
|
193
193
|
"""
|
vision_agent/tools/__init__.py
CHANGED
@@ -24,7 +24,8 @@ from .tools import (
|
|
24
24
|
extract_frames_and_timestamps,
|
25
25
|
florence2_image_caption,
|
26
26
|
florence2_ocr,
|
27
|
-
|
27
|
+
florence2_phrase_grounding,
|
28
|
+
florence2_phrase_grounding_video,
|
28
29
|
florence2_roberta_vqa,
|
29
30
|
florence2_sam2_image,
|
30
31
|
florence2_sam2_video_tracking,
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -668,8 +668,8 @@ def use_object_detection_fine_tuning(
|
|
668
668
|
|
669
669
|
patterns_with_fine_tune_id = [
|
670
670
|
(
|
671
|
-
r'
|
672
|
-
lambda match: f'
|
671
|
+
r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
|
672
|
+
lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
|
673
673
|
),
|
674
674
|
(
|
675
675
|
r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
|
vision_agent/tools/tools.py
CHANGED
@@ -1143,10 +1143,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
|
|
1143
1143
|
return answer[task] # type: ignore
|
1144
1144
|
|
1145
1145
|
|
1146
|
-
def
|
1146
|
+
def florence2_phrase_grounding(
|
1147
1147
|
prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
|
1148
1148
|
) -> List[Dict[str, Any]]:
|
1149
|
-
"""'
|
1149
|
+
"""'florence2_phrase_grounding' will run florence2 on an image. It can
|
1150
1150
|
detect multiple objects given a text prompt which can be object names or caption.
|
1151
1151
|
You can optionally separate the object names in the text with commas. It returns
|
1152
1152
|
a list of bounding boxes with normalized coordinates, label names and associated
|
@@ -1167,7 +1167,7 @@ def florence2_phrase_grounding_image(
|
|
1167
1167
|
|
1168
1168
|
Example
|
1169
1169
|
-------
|
1170
|
-
>>>
|
1170
|
+
>>> florence2_phrase_grounding('person looking at a coyote', image)
|
1171
1171
|
[
|
1172
1172
|
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
1173
1173
|
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
|
@@ -1196,7 +1196,7 @@ def florence2_phrase_grounding_image(
|
|
1196
1196
|
"florence2-ft",
|
1197
1197
|
v2=True,
|
1198
1198
|
is_form=True,
|
1199
|
-
metadata_payload={"function_name": "
|
1199
|
+
metadata_payload={"function_name": "florence2_phrase_grounding"},
|
1200
1200
|
)
|
1201
1201
|
# get the first frame
|
1202
1202
|
detection = detections[0]
|
@@ -1205,7 +1205,7 @@ def florence2_phrase_grounding_image(
|
|
1205
1205
|
"image": image_b64,
|
1206
1206
|
"task": "<CAPTION_TO_PHRASE_GROUNDING>",
|
1207
1207
|
"prompt": prompt,
|
1208
|
-
"function_name": "
|
1208
|
+
"function_name": "florence2_phrase_grounding",
|
1209
1209
|
}
|
1210
1210
|
detections = send_inference_request(data, "florence2", v2=True)
|
1211
1211
|
detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
|
@@ -2164,8 +2164,7 @@ FUNCTION_TOOLS = [
|
|
2164
2164
|
florence2_ocr,
|
2165
2165
|
florence2_sam2_image,
|
2166
2166
|
florence2_sam2_video_tracking,
|
2167
|
-
|
2168
|
-
florence2_phrase_grounding_video,
|
2167
|
+
florence2_phrase_grounding,
|
2169
2168
|
ixc25_image_vqa,
|
2170
2169
|
ixc25_video_vqa,
|
2171
2170
|
detr_segmentation,
|
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
|
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
|
5
5
|
vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
|
6
6
|
vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
|
7
|
-
vision_agent/agent/vision_agent_coder_prompts.py,sha256=
|
8
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
7
|
+
vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
|
8
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=3n92aF-jpUyyrAy06izdHIMPEMZPKD1JV0wfQvt-PD8,11251
|
9
9
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
11
11
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
14
14
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
15
15
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
16
16
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
17
|
-
vision_agent/tools/__init__.py,sha256=
|
18
|
-
vision_agent/tools/meta_tools.py,sha256=
|
17
|
+
vision_agent/tools/__init__.py,sha256=u41fm9KGX1s9DWzVAGnuungEooxH4X8fSDk5hjXvDiY,2450
|
18
|
+
vision_agent/tools/meta_tools.py,sha256=FN2oMhXzCzSzmk6Na6uKw1r5-CGO3lCk94izcWNFKwA,25167
|
19
19
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
20
20
|
vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
|
21
|
-
vision_agent/tools/tools.py,sha256=
|
21
|
+
vision_agent/tools/tools.py,sha256=l6PreDTod2tJYuaaWxJljJ7PDsKtBV3YJ-nHgl0AzeI,78332
|
22
22
|
vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
|
23
23
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
24
24
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
27
27
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
28
28
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
29
29
|
vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
|
30
|
-
vision_agent-0.2.
|
31
|
-
vision_agent-0.2.
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
30
|
+
vision_agent-0.2.154.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
31
|
+
vision_agent-0.2.154.dist-info/METADATA,sha256=gpOhgbgqKO1ypr5HfV4ZLpn0jFA3_v5MX4qNe08154g,13758
|
32
|
+
vision_agent-0.2.154.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
33
|
+
vision_agent-0.2.154.dist-info/RECORD,,
|
File without changes
|
File without changes
|