vision-agent 0.2.153__tar.gz → 0.2.154__tar.gz
- {vision_agent-0.2.153 → vision_agent-0.2.154}/PKG-INFO +1 -1
- {vision_agent-0.2.153 → vision_agent-0.2.154}/pyproject.toml +1 -1
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder_prompts.py +5 -5
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_prompts.py +7 -7
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/__init__.py +2 -1
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/meta_tools.py +2 -2
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools.py +6 -7
- {vision_agent-0.2.153 → vision_agent-0.2.154}/LICENSE +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/README.md +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
@@ -101,7 +101,7 @@ plan1:
 - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:
 
 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)
 
 # plan2
-florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)
 
 # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)
 
 final_out = {{
     "owl_v2_video": owl_v2_out,
-    "florence2_phrase_grounding_image": florence2_out,
+    "florence2_phrase_grounding": florence2_out,
     "florence2_sam2_video_tracking": f2s2_out,
 }}
 
 counts = {{
     "owl_v2_video": owl_v2_counts,
-    "florence2_phrase_grounding_image": florence2_counts,
+    "florence2_phrase_grounding": florence2_counts,
     "florence2_sam2_video_tracking": f2s2_counts,
 }}
 
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_prompts.py
RENAMED
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi
 
 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = florence2_phrase_grounding_image("worker, helmet", image)
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
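The artifact hunk stops at line 6, so the pairing and visualization logic that follows is not shown. Below is a plausible completion of the example under the imports listed on line 0 — the loop structure, the 0.01 distance threshold, and the exact `closest_box_distance`/`overlay_bounding_boxes`/`save_image` call shapes are assumptions for illustration, not the package's verbatim prompt text:

```python
from vision_agent.tools import (
    load_image,
    florence2_phrase_grounding,
    closest_box_distance,
    overlay_bounding_boxes,
    save_image,
)

def count_workers_with_helmets(image_path: str, output_path: str) -> int:
    image = load_image(image_path)
    detections = florence2_phrase_grounding("worker, helmet", image)
    workers = [d for d in detections if d["label"] == "worker"]
    helmets = [d for d in detections if d["label"] == "helmet"]
    count = 0
    for worker in workers:
        # Treat a worker as helmeted when some helmet box sits within a small
        # distance of the worker box (threshold and image-size argument are
        # assumptions for this sketch).
        if any(
            closest_box_distance(worker["bbox"], helmet["bbox"], image.shape[:2]) < 0.01
            for helmet in helmets
        ):
            count += 1
    viz = overlay_bounding_boxes(image, detections)  # annotate all detections
    save_image(viz, output_path)
    return count
```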
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
-from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 def count_workers_with_helmets(image_path: str, output_path: str):
     image = load_image(image_path)
--    detections = florence2_phrase_grounding_image("worker, helmet", image)
-+    detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
     workers = [d for d in detections if d['label'] == 'worker']
     helmets = [d for d in detections if d['label'] == 'helmet']
     count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
 ----- stdout -----
 3
 
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/__init__.py
RENAMED
@@ -24,7 +24,8 @@ from .tools import (
     extract_frames_and_timestamps,
     florence2_image_caption,
     florence2_ocr,
-    florence2_phrase_grounding_image,
+    florence2_phrase_grounding,
+    florence2_phrase_grounding_video,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
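Net effect of the export change, as a quick sanity check — this sketch assumes 0.2.154 is installed, and that `florence2_phrase_grounding_image` is gone in this release (its import line is dropped here and FUNCTION_TOOLS loses it further down):

```python
# Both phrase-grounding entry points now import from the package root:
from vision_agent.tools import (
    florence2_phrase_grounding,        # image variant, renamed back from *_image
    florence2_phrase_grounding_video,  # video variant, newly re-exported here
)
```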
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/meta_tools.py
RENAMED
@@ -668,8 +668,8 @@ def use_object_detection_fine_tuning(
 
     patterns_with_fine_tune_id = [
         (
-            r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
             r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
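The pattern/replacement pair above rewrites source text rather than live objects: it finds a `florence2_phrase_grounding(prompt, image[, id])` call in the artifact and re-emits it with the fine-tune id as the third argument. A standalone sketch of that substitution — the regex and lambda are copied from the hunk, while the sample input line is hypothetical:

```python
import re

fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"

# Pattern and replacement as they appear in use_object_detection_fine_tuning:
pattern = (
    r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,'
    r'\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
)
replace = lambda m: (
    f'florence2_phrase_grounding("{m.group(1)}", {m.group(2)}, "{fine_tune_id}")'
)

code = 'detections = florence2_phrase_grounding("worker, helmet", image)'
print(re.sub(pattern, replace, code))
# detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```

Because the optional third capture group also consumes an existing quoted id, re-running the rewrite replaces the id in place instead of appending a second one.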
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools.py
RENAMED
@@ -1143,10 +1143,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding_image(
+def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+    """'florence2_phrase_grounding' will run florence2 on a image. It can
     detect multiple objects given a text prompt which can be object names or caption.
     You can optionally separate the object names in the text with commas. It returns
     a list of bounding boxes with normalized coordinates, label names and associated
@@ -1167,7 +1167,7 @@ def florence2_phrase_grounding_image(
 
     Example
     -------
-        >>> florence2_phrase_grounding_image('person looking at a coyote', image)
+        >>> florence2_phrase_grounding('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
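For comparison with the docstring example, here is a minimal call sketch under the renamed signature from the @@ -1143 hunk, where `fine_tune_id` defaults to None. The image file and the reuse of the fine-tune id from the earlier hunks are illustrative, and a configured vision-agent API credential is assumed:

```python
from vision_agent.tools import florence2_phrase_grounding, load_image

image = load_image("workers.png")  # hypothetical local image

# Hosted base Florence-2 model:
detections = florence2_phrase_grounding("worker, helmet", image)

# Same call routed to a fine-tuned checkpoint via the optional third argument:
detections_ft = florence2_phrase_grounding(
    "worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf"
)
```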
@@ -1196,7 +1196,7 @@ def florence2_phrase_grounding_image(
         "florence2-ft",
         v2=True,
         is_form=True,
-        metadata_payload={"function_name": "florence2_phrase_grounding_image"},
+        metadata_payload={"function_name": "florence2_phrase_grounding"},
     )
     # get the first frame
     detection = detections[0]
@@ -1205,7 +1205,7 @@ def florence2_phrase_grounding_image(
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florence2_phrase_grounding_image",
+        "function_name": "florence2_phrase_grounding",
     }
     detections = send_inference_request(data, "florence2", v2=True)
     detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
@@ -2164,8 +2164,7 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
-    florence2_phrase_grounding_image,
-    florence2_phrase_grounding_video,
+    florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,