vision-agent 0.2.147__py3-none-any.whl → 0.2.149__py3-none-any.whl
- vision_agent/agent/vision_agent.py +62 -41
- vision_agent/agent/vision_agent_coder.py +4 -4
- vision_agent/agent/vision_agent_coder_prompts.py +90 -14
- vision_agent/agent/vision_agent_prompts.py +5 -5
- vision_agent/tools/meta_tools.py +51 -54
- {vision_agent-0.2.147.dist-info → vision_agent-0.2.149.dist-info}/METADATA +1 -1
- {vision_agent-0.2.147.dist-info → vision_agent-0.2.149.dist-info}/RECORD +9 -9
- {vision_agent-0.2.147.dist-info → vision_agent-0.2.149.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.147.dist-info → vision_agent-0.2.149.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py CHANGED
@@ -87,7 +87,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     return extract_json(orch([message], stream=False))  # type: ignore


-def run_code_action(
+def execute_code_action(
     code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
 ) -> Tuple[Execution, str]:
     result = code_interpreter.exec_isolation(
@@ -106,19 +106,53 @@ def parse_execution(
     customed_tool_names: Optional[List[str]] = None,
 ) -> Optional[str]:
     code = None
-    if "<execute_python>" in response:
-        code = response[response.find("<execute_python>") + len("<execute_python>") :]
-        code = code[: code.find("</execute_python>")]
+    remaining = response
+    all_code = []
+    while "<execute_python>" in remaining:
+        code_i = remaining[
+            remaining.find("<execute_python>") + len("<execute_python>") :
+        ]
+        code_i = code_i[: code_i.find("</execute_python>")]
+        remaining = remaining[
+            remaining.find("</execute_python>") + len("</execute_python>") :
+        ]
+        all_code.append(code_i)
+
+    if len(all_code) > 0:
+        code = "\n".join(all_code)

     if code is not None:
         code = use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names)
     return code


+def execute_user_code_action(
+    last_user_message: Message,
+    code_interpreter: CodeInterpreter,
+    artifact_remote_path: str,
+) -> Tuple[Optional[Execution], Optional[str]]:
+    user_result = None
+    user_obs = None
+
+    if last_user_message["role"] != "user":
+        return user_result, user_obs
+
+    last_user_content = cast(str, last_user_message.get("content", ""))
+
+    user_code_action = parse_execution(last_user_content, False)
+    if user_code_action is not None:
+        user_result, user_obs = execute_code_action(
+            user_code_action, code_interpreter, artifact_remote_path
+        )
+        if user_result.error:
+            user_obs += f"\n{user_result.error}"
+    return user_result, user_obs
+
+
 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
     agents to generate code for it. Vision Agent uses python code to execute actions
-    for the user. Vision Agent is inspired by by
+    for the user. Vision Agent is inspired by by OpenDevin
     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030

     Example
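The new loop replaces the old single-block extraction: every `<execute_python>…</execute_python>` pair in the response is collected and the blocks are joined with newlines. A minimal standalone sketch of that behavior, mirroring the added loop above (the function name and sample response are illustrative, not part of the package):

```python
from typing import Optional

# Standalone sketch mirroring the added parse_execution loop; the
# function name and the sample response are illustrative only.
def extract_blocks(response: str) -> Optional[str]:
    remaining = response
    all_code = []
    while "<execute_python>" in remaining:
        # take everything after the opening tag ...
        code_i = remaining[remaining.find("<execute_python>") + len("<execute_python>") :]
        # ... up to the matching closing tag
        code_i = code_i[: code_i.find("</execute_python>")]
        remaining = remaining[remaining.find("</execute_python>") + len("</execute_python>") :]
        all_code.append(code_i)
    return "\n".join(all_code) if all_code else None

resp = (
    "Load first. <execute_python>img = load_image('a.png')</execute_python> "
    "Then check. <execute_python>print(img.shape)</execute_python>"
)
print(extract_blocks(resp))
# img = load_image('a.png')
# print(img.shape)
```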
@@ -278,9 +312,24 @@ class VisionAgent(Agent):
             orig_chat.append({"role": "observation", "content": artifacts_loaded})
             self.streaming_message({"role": "observation", "content": artifacts_loaded})

-            finished = self.execute_user_code_action(
-                last_user_message, code_interpreter, remote_artifacts_path
+            user_result, user_obs = execute_user_code_action(
+                last_user_message, code_interpreter, str(remote_artifacts_path)
             )
+            finished = user_result is not None and user_obs is not None
+            if user_result is not None and user_obs is not None:
+                # be sure to update the chat with user execution results
+                chat_elt: Message = {"role": "observation", "content": user_obs}
+                int_chat.append(chat_elt)
+                chat_elt["execution"] = user_result
+                orig_chat.append(chat_elt)
+                self.streaming_message(
+                    {
+                        "role": "observation",
+                        "content": user_obs,
+                        "execution": user_result,
+                        "finished": finished,
+                    }
+                )

             while not finished and iterations < self.max_iterations:
                 response = run_conversation(self.agent, int_chat)
@@ -322,7 +371,7 @@ class VisionAgent(Agent):
                 )

                 if code_action is not None:
-                    result, obs = run_code_action(
+                    result, obs = execute_code_action(
                         code_action, code_interpreter, str(remote_artifacts_path)
                     )

@@ -331,17 +380,17 @@ class VisionAgent(Agent):
                 if self.verbosity >= 1:
                     _LOGGER.info(obs)

-                chat_elt: Message = {"role": "observation", "content": obs}
+                obs_chat_elt: Message = {"role": "observation", "content": obs}
                 if media_obs and result.success:
-                    chat_elt["media"] = [
+                    obs_chat_elt["media"] = [
                         Path(code_interpreter.remote_path) / media_ob
                         for media_ob in media_obs
                     ]

                 # don't add execution results to internal chat
-                int_chat.append(chat_elt)
-                chat_elt["execution"] = result
-                orig_chat.append(chat_elt)
+                int_chat.append(obs_chat_elt)
+                obs_chat_elt["execution"] = result
+                orig_chat.append(obs_chat_elt)
                 self.streaming_message(
                     {
                         "role": "observation",
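One Python-semantics note on the hunk above (our observation, not something the diff states): `obs_chat_elt` is appended to `int_chat` before the `execution` key is set, but lists hold references, so the later mutation is visible through both lists. A standalone demo:

```python
# Lists store references: mutating a dict after appending it is visible
# through every list that holds it.
int_chat, orig_chat = [], []
obs_chat_elt = {"role": "observation", "content": "obs"}
int_chat.append(obs_chat_elt)        # appended before "execution" is set
obs_chat_elt["execution"] = "<result>"
orig_chat.append(obs_chat_elt)
print(int_chat[0] is orig_chat[0])   # True
print("execution" in int_chat[0])    # True: the mutation shows up here too
```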
@@ -362,34 +411,6 @@ class VisionAgent(Agent):
             artifacts.save()
         return orig_chat, artifacts

-    def execute_user_code_action(
-        self,
-        last_user_message: Message,
-        code_interpreter: CodeInterpreter,
-        remote_artifacts_path: Path,
-    ) -> bool:
-        if last_user_message["role"] != "user":
-            return False
-        user_code_action = parse_execution(
-            cast(str, last_user_message.get("content", "")), False
-        )
-        if user_code_action is not None:
-            user_result, user_obs = run_code_action(
-                user_code_action, code_interpreter, str(remote_artifacts_path)
-            )
-            if self.verbosity >= 1:
-                _LOGGER.info(user_obs)
-            self.streaming_message(
-                {
-                    "role": "observation",
-                    "content": user_obs,
-                    "execution": user_result,
-                    "finished": True,
-                }
-            )
-            return True
-        return False
-
     def streaming_message(self, message: Dict[str, Any]) -> None:
         if self.callback_message:
             self.callback_message(message)
vision_agent/agent/vision_agent_coder.py CHANGED
@@ -691,7 +691,7 @@ class VisionAgentCoder(Agent):
         chat: List[Message],
         test_multi_plan: bool = True,
         display_visualization: bool = False,
-        customized_tool_names: Optional[List[str]] = None,
+        custom_tool_names: Optional[List[str]] = None,
     ) -> Dict[str, Any]:
         """Chat with VisionAgentCoder and return intermediate information regarding the
         task.
@@ -707,8 +707,8 @@ class VisionAgentCoder(Agent):
                 with the first plan.
             display_visualization (bool): If True, it opens a new window locally to
                 show the image(s) created by visualization code (if there is any).
-
-                If not provided, default to full tool set from vision_agent.tools.
+            custom_tool_names (List[str]): A list of custom tools for the agent to pick
+                and use. If not provided, default to full tool set from vision_agent.tools.

         Returns:
             Dict[str, Any]: A dictionary containing the code, test, test result, plan,
@@ -760,7 +760,7 @@ class VisionAgentCoder(Agent):
         success = False

         plans = self._create_plans(
-            int_chat, customized_tool_names, working_memory, self.planner
+            int_chat, custom_tool_names, working_memory, self.planner
         )

         if test_multi_plan:
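For callers, the rename means the keyword argument changes. A hypothetical invocation sketch (the chat content and tool list are made up for illustration; the returned dictionary keys follow the docstring above):

```python
from vision_agent.agent import VisionAgentCoder

# Hypothetical call: `custom_tool_names` replaces the old
# `customized_tool_names` keyword of chat_with_workflow.
agent = VisionAgentCoder()
result = agent.chat_with_workflow(
    [{"role": "user", "content": "Count the cars", "media": ["cars.jpg"]}],
    test_multi_plan=True,
    custom_tool_names=["owl_v2_image", "countgd_counting"],
)
print(result["code"])
```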
vision_agent/agent/vision_agent_coder_prompts.py CHANGED
@@ -67,14 +67,7 @@ This is the documentation for the functions you have access to. You may call any
 **Previous Attempts**:
 {previous_attempts}

-**Instructions**:
-1. Write a program to load the media and call each tool and print it's output along with other relevant information.
-2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
-3. Your test case MUST run only on the given images which are {media}
-4. Print this final dictionary.
-5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
-
-**Example**:
+**Examples**:
 --- EXAMPLE1 ---
 plan1:
 - Load the image from the provided file path 'image.jpg'.
@@ -100,6 +93,7 @@ cgd_out = countgd_counting(image)

 final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
 print(final_out)
+--- END EXAMPLE1 ---

 --- EXAMPLE2 ---
 plan1:
@@ -173,6 +167,14 @@ print(final_out)
 print(labels_and_scores)
 print(counts)
 ```
+--- END EXAMPLE2 ---
+
+**Instructions**:
+1. Write a program to load the media and call each tool and print it's output along with other relevant information.
+2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
+3. Your test case MUST run only on the given images which are {media}
+4. Print this final dictionary.
+5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
 """


@@ -224,11 +226,6 @@ This is the documentation for the functions you have access to. You may call any

 {docstring}

-**Input Code Snippet**:
-```python
-# Your code here
-```
-
 **User Instructions**:
 {question}

@@ -241,11 +238,90 @@ This is the documentation for the functions you have access to. You may call any
 **Previous Feedback**:
 {feedback}

+**Examples**:
+--- EXAMPLE1 ---
+**User Instructions**:
+
+## User Request
+Can you write a program to check if each person is wearing a helmet? First detect all the people in the image, then detect the helmets, check whether or not a person is wearing a helmet if the helmet is on the worker. Return a dictionary with the count of people with helments and people without helmets. Media name worker_helmets.webp
+
+## Subtasks
+
+This plan uses the owl_v2_image tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
+-Use owl_v2_image with prompt 'person, helmet' to detect both people and helmets in the image
+-Process the detections to match helmets with people based on bounding box proximity
+-Count people with and without helmets based on the matching results
+-Return a dictionary with the counts
+
+
+**Tool Tests and Outputs**:
+After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owl_v2_image seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
+
+**Tool Output Thoughts**:
+```python
+...
+```
+----- stdout -----
+Plan 1 - owl_v2_image:
+
+[{{'label': 'helmet', 'score': 0.15, 'bbox': [0.85, 0.41, 0.87, 0.45]}}, {{'label': 'helmet', 'score': 0.3, 'bbox': [0.8, 0.43, 0.81, 0.46]}}, {{'label': 'helmet', 'score': 0.31, 'bbox': [0.85, 0.45, 0.86, 0.46]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.84, 0.45, 0.88, 0.58]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.78, 0.43, 0.82, 0.57]}}, {{'label': 'helmet', 'score': 0.33, 'bbox': [0.3, 0.65, 0.32, 0.67]}}, {{'label': 'person', 'score': 0.29, 'bbox': [0.28, 0.65, 0.36, 0.84]}}, {{'label': 'helmet', 'score': 0.29, 'bbox': [0.13, 0.82, 0.15, 0.85]}}, {{'label': 'person', 'score': 0.3, 'bbox': [0.1, 0.82, 0.24, 1.0]}}]
+
+...
+
+**Input Code Snippet**:
+```python
+from vision_agent.tools import load_image, owl_v2_image
+
+def check_helmets(image_path):
+    image = load_image(image_path)
+    # Detect people and helmets, filter out the lowest confidence helmet score of 0.15
+    detections = owl_v2_image("person, helmet", image, box_threshold=0.15)
+    height, width = image.shape[:2]
+
+    # Separate people and helmets
+    people = [d for d in detections if d['label'] == 'person']
+    helmets = [d for d in detections if d['label'] == 'helmet']
+
+    people_with_helmets = 0
+    people_without_helmets = 0
+
+    for person in people:
+        person_x = (person['bbox'][0] + person['bbox'][2]) / 2
+        person_y = person['bbox'][1]  # Top of the bounding box
+
+        helmet_found = False
+        for helmet in helmets:
+            helmet_x = (helmet['bbox'][0] + helmet['bbox'][2]) / 2
+            helmet_y = (helmet['bbox'][1] + helmet['bbox'][3]) / 2
+
+            # Check if the helmet is within 20 pixels of the person's head. Unnormalize
+            # the coordinates so we can better compare them.
+            if (abs((helmet_x - person_x) * width) < 20 and
+                -5 < ((helmet_y - person_y) * height) < 20):
+                helmet_found = True
+                break
+
+        if helmet_found:
+            people_with_helmets += 1
+        else:
+            people_without_helmets += 1
+
+    return {{
+        "people_with_helmets": people_with_helmets,
+        "people_without_helmets": people_without_helmets
+    }}
+```
+--- END EXAMPLE1 ---
+
 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task.
 2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you.
 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
-4. **Code Generation**: Translate your pseudocode into executable Python code.
+4. **Code Generation**: Translate your pseudocode into executable Python code.
+    4.1. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`.
+    4.2. Coordinates are always returned normalized from `vision_agent.tools`.
+    4.3. Do not create dummy input or functions, the code must be usable if the user provides new media.
+    4.4. Use unnormalized coordinates when comparing bounding boxes.
 """

 TEST = """
vision_agent/agent/vision_agent_prompts.py CHANGED
@@ -26,7 +26,9 @@ Here is the current conversation so far:

 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
-2. **Output in JSON**: Respond in the following format in JSON:
+2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
+3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
+4. **Output in JSON**: Respond in the following format in JSON:

 ```json
 {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
@@ -149,7 +151,7 @@ OBSERVATION:
 16| return count
 [End of artifact]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -164,7 +166,6 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

-
 AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

 OBSERVATION:
@@ -182,8 +183,7 @@ OBSERVATION:
     count = 0
 [End of edits]

-
-AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
vision_agent/tools/meta_tools.py CHANGED
@@ -8,6 +8,7 @@ import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union

+import numpy as np
 from IPython.display import display

 import vision_agent as va
@@ -17,7 +18,8 @@ from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
 from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
-from vision_agent.utils.image_utils import convert_to_b64
+from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
+from vision_agent.utils.video import frames_to_bytes

 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

@@ -328,7 +330,7 @@ def generate_vision_code(
     chat: str,
     media: List[str],
     test_multi_plan: bool = True,
-    customized_tool_names: Optional[List[str]] = None,
+    custom_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Generates python code to solve vision based tasks.

@@ -338,7 +340,7 @@ def generate_vision_code(
         chat (str): The chat message from the user.
         media (List[str]): The media files to use.
         test_multi_plan (bool): Do not change this parameter.
-        customized_tool_names (Optional[List[str]]): Do not change this parameter.
+        custom_tool_names (Optional[List[str]]): Do not change this parameter.

     Returns:
         str: The generated code.
@@ -366,7 +368,7 @@ def generate_vision_code(
     response = agent.chat_with_workflow(
         fixed_chat,
         test_multi_plan=test_multi_plan,
-        customized_tool_names=customized_tool_names,
+        custom_tool_names=custom_tool_names,
     )
     redisplay_results(response["test_result"])
     code = response["code"]
@@ -432,19 +434,21 @@ def edit_vision_code(

     # Append latest code to second to last message from assistant
     fixed_chat_history: List[Message] = []
+    user_message = "Previous user requests:"
     for i, chat in enumerate(chat_history):
-        if i
-
-
-            fixed_chat_history.append(
-
+        if i < len(chat_history) - 1:
+            user_message += " " + chat
+        else:
+            fixed_chat_history.append(
+                {"role": "user", "content": user_message, "media": media}
+            )
     fixed_chat_history.append({"role": "assistant", "content": code})
     fixed_chat_history.append({"role": "user", "content": chat})

     response = agent.chat_with_workflow(
         fixed_chat_history,
         test_multi_plan=False,
-        customized_tool_names=customized_tool_names,
+        custom_tool_names=customized_tool_names,
     )
     redisplay_results(response["test_result"])
     code = response["code"]
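The rewritten loop folds every chat turn except the last into a single "Previous user requests:" user message before appending the latest code and request. A standalone sketch of the resulting history (the chat strings and code placeholder are illustrative):

```python
# Standalone sketch of the rewritten history-collapsing loop above;
# the chat strings and code placeholder are illustrative.
chat_history = ["Detect the dogs", "Now count them", "Use a higher threshold"]
media = ["dogs.jpg"]
code = "<latest generated code>"

fixed_chat_history = []
user_message = "Previous user requests:"
for i, chat in enumerate(chat_history):
    if i < len(chat_history) - 1:
        user_message += " " + chat          # folded into one summary message
    else:
        fixed_chat_history.append(
            {"role": "user", "content": user_message, "media": media}
        )
fixed_chat_history.append({"role": "assistant", "content": code})
fixed_chat_history.append({"role": "user", "content": chat})  # the latest request
print(fixed_chat_history[0]["content"])
# Previous user requests: Detect the dogs Now count them
```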
@@ -467,17 +471,34 @@ def edit_vision_code(
     return view_lines(code_lines, 0, total_lines, name, total_lines)


-def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
+def write_media_artifact(
+    artifacts: Artifacts,
+    name: str,
+    media: Union[str, np.ndarray, List[np.ndarray]],
+    fps: Optional[float] = None,
+) -> str:
     """Writes a media file to the artifacts object.

     Parameters:
         artifacts (Artifacts): The artifacts object to save the media to.
-        local_path (str): The local path to the media file.
+        name (str): The name of the media artifact to save.
+        media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
+            be a file path, single image or list of frames for a video.
+        fps (Optional[float]): The frames per second if you are writing a video.
     """
-    with open(local_path, "rb") as f:
-        media = f.read()
-    artifacts[Path(local_path).name] = media
-    return f"[Media {Path(local_path).name} saved]"
+    if isinstance(media, str):
+        with open(media, "rb") as f:
+            media_bytes = f.read()
+    elif isinstance(media, list):
+        media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
+    elif isinstance(media, np.ndarray):
+        media_bytes = numpy_to_bytes(media)
+    else:
+        print(f"[Invalid media type {type(media)}]")
+        return f"[Invalid media type {type(media)}]"
+    artifacts[name] = media_bytes
+    print(f"[Media {name} saved]")
+    return f"[Media {name} saved]"


 def list_artifacts(artifacts: Artifacts) -> str:
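The signature change shows up at call sites (see the updated prompt examples earlier, which now pass a name plus a path). A hypothetical usage sketch of the new signature; the Artifacts constructor argument and file names are assumptions, not taken from the diff:

```python
import numpy as np
from vision_agent.tools.meta_tools import Artifacts, write_media_artifact

# Hypothetical usage of the new signature; the save path and artifact
# names are illustrative.
artifacts = Artifacts("artifacts.pkl")
write_media_artifact(artifacts, "viz.png", "local/viz.png")   # from a file path
img = np.zeros((64, 64, 3), dtype=np.uint8)
write_media_artifact(artifacts, "mask.png", img)              # from a single image
frames = [img.copy() for _ in range(10)]
write_media_artifact(artifacts, "clip.mp4", frames, fps=1.0)  # from video frames
```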
@@ -491,16 +512,14 @@ def check_and_load_image(code: str) -> List[str]:
     if not code.strip():
         return []

-    pattern = r"
-    match = re.search(pattern, code)
-    if match:
-        name = match.group(2)
-        return [name]
-    return []
+    pattern = r"view_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)"
+    matches = re.findall(pattern, code)
+    return [match[1] for match in matches]


 def view_media_artifact(artifacts: Artifacts, name: str) -> str:
-    """
+    """Allows you to view the media artifact with the given name. This does not show
+    the media to the user, the user can already see all media saved in the artifacts.

     Parameters:
         artifacts (Artifacts): The artifacts object to show the image from.
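A quick standalone check of the new multi-match behavior (the pattern is copied from the added line; the code snippet being scanned is made up):

```python
import re

# Pattern copied from the new check_and_load_image; the scanned snippet
# is illustrative.
pattern = r"view_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)"
snippet = (
    "view_media_artifact(artifacts, 'dog.jpg')\n"
    "view_media_artifact(artifacts, 'cat.jpg')\n"
)
print([m[1] for m in re.findall(pattern, snippet)])
# ['dog.jpg', 'cat.jpg']
```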
@@ -598,7 +617,7 @@ def use_extra_vision_agent_args(
         arg = match.group(1)
         out_str = f"generate_vision_code({arg}, test_multi_plan={test_multi_plan}"
         if customized_tool_names is not None:
-            out_str += f", customized_tool_names={customized_tool_names})"
+            out_str += f", custom_tool_names={customized_tool_names})"
         else:
             out_str += ")"
         return out_str
@@ -609,7 +628,7 @@ def use_extra_vision_agent_args(
         arg = match.group(1)
         out_str = f"edit_vision_code({arg}"
         if customized_tool_names is not None:
-            out_str += f", customized_tool_names={customized_tool_names})"
+            out_str += f", custom_tool_names={customized_tool_names})"
         else:
             out_str += ")"
         return out_str
@@ -646,50 +665,28 @@ def use_object_detection_fine_tuning(

     patterns_with_fine_tune_id = [
         (
-            r'florence2_phrase_grounding\(\s*"([^"]+)"\s*,\s*([^,]+)(?:,\s*"[^"]+")?\s*\)',
+            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
             lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
-            r'owl_v2_image\(\s*"([^"]+)"\s*,\s*([^,]+)(?:,\s*"[^"]+")?\s*\)',
+            r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
             lambda match: f'owl_v2_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
-            r'florence2_sam2_image\(\s*"([^"]+)"\s*,\s*([^,]+)(?:,\s*"[^"]+")?\s*\)',
+            r'florence2_sam2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
             lambda match: f'florence2_sam2_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
     ]

-    patterns_without_fine_tune_id = [
-        (
-            r"florence2_phrase_grounding\(\s*([^\)]+)\s*\)",
-            lambda match: f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")',
-        ),
-        (
-            r"owl_v2_image\(\s*([^\)]+)\s*\)",
-            lambda match: f'owl_v2_image({match.group(1)}, "{fine_tune_id}")',
-        ),
-        (
-            r"florence2_sam2_image\(\s*([^\)]+)\s*\)",
-            lambda match: f'florence2_sam2_image({match.group(1)}, "{fine_tune_id}")',
-        ),
-    ]
-
     new_code = code
-    for index, (
-        pattern_with_fine_tune_id,
-        replacer_with_fine_tune_id,
-    ) in enumerate(patterns_with_fine_tune_id):
+    for (
+        pattern_with_fine_tune_id,
+        replacer_with_fine_tune_id,
+    ) in patterns_with_fine_tune_id:
         if re.search(pattern_with_fine_tune_id, new_code):
             new_code = re.sub(
                 pattern_with_fine_tune_id, replacer_with_fine_tune_id, new_code
             )
-        else:
-            (pattern_without_fine_tune_id, replacer_without_fine_tune_id) = (
-                patterns_without_fine_tune_id[index]
-            )
-            new_code = re.sub(
-                pattern_without_fine_tune_id, replacer_without_fine_tune_id, new_code
-            )

     if new_code == code:
         output_str = (
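The quote-class change means single-quoted prompts are now rewritten too. A standalone sketch with one of the new patterns (the fine-tune id is the example value used elsewhere in this diff):

```python
import re

# One of the broadened patterns, copied from the diff; it now accepts
# single- or double-quoted prompts.
fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"
pattern = r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
code = "dets = owl_v2_image('person, helmet', image)"
print(re.sub(
    pattern,
    lambda m: f'owl_v2_image("{m.group(1)}", {m.group(2)}, "{fine_tune_id}")',
    code,
))
# dets = owl_v2_image("person, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```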
{vision_agent-0.2.147.dist-info → vision_agent-0.2.149.dist-info}/RECORD CHANGED
@@ -2,10 +2,10 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
 vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
-vision_agent/agent/vision_agent.py,sha256=
-vision_agent/agent/vision_agent_coder.py,sha256=
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=MDXIM5md1V6y62-chyGiDg_138Rns6KKOO2wMqb6vD8,18431
+vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
+vision_agent/agent/vision_agent_prompts.py,sha256=3n92aF-jpUyyrAy06izdHIMPEMZPKD1JV0wfQvt-PD8,11251
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -15,7 +15,7 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
 vision_agent/tools/__init__.py,sha256=zUv3aVPN1MXfyQiQi5To4rkQGtG7mxLQ1NjLI3pxM80,2412
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/meta_tools.py,sha256=rudM9heiuTfNjp741ZNcUGRJdpfDZ38BuellA1IPuIo,24747
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
 vision_agent/tools/tools.py,sha256=c7SjtZD7YfxhEAGYYe-ExVCBA4NDXmRwerBIbd-XEH8,74557
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.149.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.149.dist-info/METADATA,sha256=kdaPDsxWxc0gwBoPFzN0EiQ79NCDBeJzkHzEnh7mRjM,13758
+vision_agent-0.2.149.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.149.dist-info/RECORD,,
{vision_agent-0.2.147.dist-info → vision_agent-0.2.149.dist-info}/LICENSE: File without changes
{vision_agent-0.2.147.dist-info → vision_agent-0.2.149.dist-info}/WHEEL: File without changes