vision-agent 0.2.164__py3-none-any.whl → 0.2.166__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +94 -64
- vision_agent/agent/vision_agent_prompts.py +78 -21
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +67 -61
- vision_agent/tools/tools.py +25 -41
- vision_agent/utils/video.py +23 -3
- {vision_agent-0.2.164.dist-info → vision_agent-0.2.166.dist-info}/METADATA +7 -2
- {vision_agent-0.2.164.dist-info → vision_agent-0.2.166.dist-info}/RECORD +10 -10
- {vision_agent-0.2.164.dist-info → vision_agent-0.2.166.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.164.dist-info → vision_agent-0.2.166.dist-info}/WHEEL +0 -0
@@ -1,4 +1,5 @@
|
|
1
1
|
import copy
|
2
|
+
import json
|
2
3
|
import logging
|
3
4
|
import os
|
4
5
|
import tempfile
|
@@ -6,7 +7,7 @@ from pathlib import Path
|
|
6
7
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
7
8
|
|
8
9
|
from vision_agent.agent import Agent
|
9
|
-
from vision_agent.agent.agent_utils import extract_json
|
10
|
+
from vision_agent.agent.agent_utils import extract_json, extract_tag
|
10
11
|
from vision_agent.agent.vision_agent_prompts import (
|
11
12
|
EXAMPLES_CODE1,
|
12
13
|
EXAMPLES_CODE2,
|
@@ -18,6 +19,7 @@ from vision_agent.tools.meta_tools import (
|
|
18
19
|
META_TOOL_DOCSTRING,
|
19
20
|
Artifacts,
|
20
21
|
check_and_load_image,
|
22
|
+
extract_and_save_files_to_artifacts,
|
21
23
|
use_extra_vision_agent_args,
|
22
24
|
)
|
23
25
|
from vision_agent.utils import CodeInterpreterFactory
|
@@ -35,7 +37,7 @@ class BoilerplateCode:
|
|
35
37
|
pre_code = [
|
36
38
|
"from typing import *",
|
37
39
|
"from vision_agent.utils.execute import CodeInterpreter",
|
38
|
-
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code,
|
40
|
+
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
|
39
41
|
"artifacts = Artifacts('{remote_path}')",
|
40
42
|
"artifacts.load('{remote_path}')",
|
41
43
|
]
|
@@ -57,6 +59,32 @@ class BoilerplateCode:
|
|
57
59
|
)
|
58
60
|
|
59
61
|
|
62
|
+
def format_agent_message(agent_message: str) -> str:
    """Render a JSON-encoded agent message as tagged conversation text.

    The message is parsed with ``extract_json`` and every field that is present
    and truthy is emitted in a fixed order: ``thinking``, ``response``,
    ``execute_python`` and finally ``let_user_respond``. Fields that are missing
    or falsy are left out entirely.

    Parameters:
        agent_message (str): The agent message to parse.

    Returns:
        str: The concatenated tag markup, or an empty string when no field is set.
    """
    parsed = extract_json(agent_message)

    def _value(key: str) -> Any:
        # A missing key and a falsy value are both treated as "not set".
        return parsed[key] if key in parsed else None

    parts: List[str] = []

    thinking = _value("thinking")
    if thinking:
        parts += ["<thinking>", thinking, "</thinking>"]

    reply = _value("response")
    if reply:
        parts += ["<response>", reply, "</response>"]

    code = _value("execute_python")
    if code:
        # Code is placed on its own lines between the tags.
        parts += ["\n<execute_python>\n", code, "\n</execute_python>\n"]

    hand_over = _value("let_user_respond")
    if hand_over:
        # The flag may be a bool, so it is stringified before joining.
        parts += ["<let_user_respond>", str(hand_over), "</let_user_respond>"]

    return "".join(parts)
|
86
|
+
|
87
|
+
|
60
88
|
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
61
89
|
chat = copy.deepcopy(chat)
|
62
90
|
|
@@ -67,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
67
95
|
elif chat_i["role"] == "observation":
|
68
96
|
conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
|
69
97
|
elif chat_i["role"] == "assistant":
|
70
|
-
conversation += f"AGENT: {chat_i['content']}\n\n"
|
98
|
+
conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" # type: ignore
|
71
99
|
else:
|
72
100
|
raise ValueError(f"role {chat_i['role']} is not supported")
|
73
101
|
|
@@ -84,15 +112,31 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
84
112
|
and len(chat[-1]["media"]) > 0 # type: ignore
|
85
113
|
):
|
86
114
|
message["media"] = chat[-1]["media"]
|
87
|
-
|
115
|
+
conv_resp = cast(str, orch([message], stream=False))
|
116
|
+
|
117
|
+
let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
|
118
|
+
let_user_respond = (
|
119
|
+
"true" in let_user_respond_str.lower() if let_user_respond_str else False
|
120
|
+
)
|
121
|
+
|
122
|
+
return {
|
123
|
+
"thinking": extract_tag(conv_resp, "thinking"),
|
124
|
+
"response": extract_tag(conv_resp, "response"),
|
125
|
+
"execute_python": extract_tag(conv_resp, "execute_python"),
|
126
|
+
"let_user_respond": let_user_respond,
|
127
|
+
}
|
88
128
|
|
89
129
|
|
90
130
|
def execute_code_action(
|
91
|
-
|
131
|
+
artifacts: Artifacts,
|
132
|
+
code: str,
|
133
|
+
code_interpreter: CodeInterpreter,
|
134
|
+
artifact_remote_path: str,
|
92
135
|
) -> Tuple[Execution, str]:
|
93
136
|
result = code_interpreter.exec_isolation(
|
94
137
|
BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
|
95
138
|
)
|
139
|
+
extract_and_save_files_to_artifacts(artifacts, code)
|
96
140
|
|
97
141
|
obs = str(result.logs)
|
98
142
|
if result.error:
|
@@ -100,33 +144,8 @@ def execute_code_action(
|
|
100
144
|
return result, obs
|
101
145
|
|
102
146
|
|
103
|
-
def parse_execution(
|
104
|
-
response: str,
|
105
|
-
test_multi_plan: bool = True,
|
106
|
-
custom_tool_names: Optional[List[str]] = None,
|
107
|
-
) -> Optional[str]:
|
108
|
-
code = None
|
109
|
-
remaining = response
|
110
|
-
all_code = []
|
111
|
-
while "<execute_python>" in remaining:
|
112
|
-
code_i = remaining[
|
113
|
-
remaining.find("<execute_python>") + len("<execute_python>") :
|
114
|
-
]
|
115
|
-
code_i = code_i[: code_i.find("</execute_python>")]
|
116
|
-
remaining = remaining[
|
117
|
-
remaining.find("</execute_python>") + len("</execute_python>") :
|
118
|
-
]
|
119
|
-
all_code.append(code_i)
|
120
|
-
|
121
|
-
if len(all_code) > 0:
|
122
|
-
code = "\n".join(all_code)
|
123
|
-
|
124
|
-
if code is not None:
|
125
|
-
code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
|
126
|
-
return code
|
127
|
-
|
128
|
-
|
129
147
|
def execute_user_code_action(
|
148
|
+
artifacts: Artifacts,
|
130
149
|
last_user_message: Message,
|
131
150
|
code_interpreter: CodeInterpreter,
|
132
151
|
artifact_remote_path: str,
|
@@ -138,40 +157,43 @@ def execute_user_code_action(
|
|
138
157
|
return user_result, user_obs
|
139
158
|
|
140
159
|
last_user_content = cast(str, last_user_message.get("content", ""))
|
160
|
+
try:
|
161
|
+
user_code_action = json.loads(last_user_content).get("execute_python", None)
|
162
|
+
except json.JSONDecodeError:
|
163
|
+
return user_result, user_obs
|
141
164
|
|
142
|
-
user_code_action = parse_execution(last_user_content, False)
|
143
165
|
if user_code_action is not None:
|
166
|
+
user_code_action = use_extra_vision_agent_args(user_code_action, False)
|
144
167
|
user_result, user_obs = execute_code_action(
|
145
|
-
user_code_action, code_interpreter, artifact_remote_path
|
168
|
+
artifacts, user_code_action, code_interpreter, artifact_remote_path
|
146
169
|
)
|
147
170
|
if user_result.error:
|
148
171
|
user_obs += f"\n{user_result.error}"
|
149
172
|
return user_result, user_obs
|
150
173
|
|
151
174
|
|
152
|
-
def add_step_descriptions(response: Dict[str,
|
175
|
+
def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``response`` whose "response" text describes the code step.

    When ``response`` carries a truthy ``"execute_python"`` entry, the copy's
    ``"response"`` value is replaced by a short description of the meta tools
    referenced in that code, or by "Executing code." when none of the known
    tool names occur in it. Without code to execute the copy is returned
    unchanged. The input dictionary is never modified.

    Parameters:
        response (Dict[str, Any]): The agent response dictionary.

    Returns:
        Dict[str, Any]: A deep copy of ``response`` with the description applied.
    """
    response = copy.deepcopy(response)

    code = response.get("execute_python")
    if not code:
        return response

    # Only these tools get a dedicated description; any other code is
    # reported as plain "Executing code."
    description_map = {
        "open_code_artifact": "Reading file.",
        "create_code_artifact": "Creating file.",
        "edit_code_artifact": "Editing file.",
        "generate_vision_code": "Generating vision code.",
        "edit_vision_code": "Editing vision code.",
    }
    matched = [text for tool, text in description_map.items() if tool in code]

    # Join with single spaces so the description carries no trailing whitespace.
    response["response"] = " ".join(matched) if matched else "Executing code."
    return response
|
176
198
|
|
177
199
|
|
@@ -369,7 +391,10 @@ class VisionAgent(Agent):
|
|
369
391
|
self.streaming_message({"role": "observation", "content": artifacts_loaded})
|
370
392
|
|
371
393
|
user_result, user_obs = execute_user_code_action(
|
372
|
-
|
394
|
+
artifacts,
|
395
|
+
last_user_message,
|
396
|
+
code_interpreter,
|
397
|
+
str(remote_artifacts_path),
|
373
398
|
)
|
374
399
|
finished = user_result is not None and user_obs is not None
|
375
400
|
if user_result is not None and user_obs is not None:
|
@@ -394,13 +419,13 @@ class VisionAgent(Agent):
|
|
394
419
|
int_chat.append(
|
395
420
|
{
|
396
421
|
"role": "assistant",
|
397
|
-
"content":
|
422
|
+
"content": json.dumps(add_step_descriptions(response)),
|
398
423
|
}
|
399
424
|
)
|
400
425
|
orig_chat.append(
|
401
426
|
{
|
402
427
|
"role": "assistant",
|
403
|
-
"content":
|
428
|
+
"content": json.dumps(add_step_descriptions(response)),
|
404
429
|
}
|
405
430
|
)
|
406
431
|
|
@@ -408,11 +433,13 @@ class VisionAgent(Agent):
|
|
408
433
|
if last_response == response:
|
409
434
|
response["let_user_respond"] = True
|
410
435
|
|
411
|
-
finished = response
|
436
|
+
finished = response.get("let_user_respond", False)
|
412
437
|
|
413
|
-
code_action =
|
414
|
-
|
415
|
-
|
438
|
+
code_action = response.get("execute_python", None)
|
439
|
+
if code_action is not None:
|
440
|
+
code_action = use_extra_vision_agent_args(
|
441
|
+
code_action, test_multi_plan, custom_tool_names
|
442
|
+
)
|
416
443
|
|
417
444
|
if last_response == response:
|
418
445
|
self.streaming_message(
|
@@ -431,14 +458,17 @@ class VisionAgent(Agent):
|
|
431
458
|
self.streaming_message(
|
432
459
|
{
|
433
460
|
"role": "assistant",
|
434
|
-
"content": response,
|
461
|
+
"content": json.dumps(response),
|
435
462
|
"finished": finished and code_action is None,
|
436
463
|
}
|
437
464
|
)
|
438
465
|
|
439
466
|
if code_action is not None:
|
440
467
|
result, obs = execute_code_action(
|
441
|
-
|
468
|
+
artifacts,
|
469
|
+
code_action,
|
470
|
+
code_interpreter,
|
471
|
+
str(remote_artifacts_path),
|
442
472
|
)
|
443
473
|
|
444
474
|
media_obs = check_and_load_image(code_action)
|
@@ -27,13 +27,14 @@ Here is the current conversation so far:
|
|
27
27
|
**Instructions**:
|
28
28
|
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
29
29
|
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
|
30
|
-
3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set
|
30
|
+
3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
|
31
31
|
4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
|
32
|
-
5. **Output
|
32
|
+
5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
|
33
33
|
|
34
|
-
|
35
|
-
|
36
|
-
|
34
|
+
<thinking>Your thoughts here...</thinking>
|
35
|
+
<response>Your response to the user here...</response>
|
36
|
+
<execute_python>Your code here...</execute_python>
|
37
|
+
<let_user_respond>true/false for whether or not you want to let the user respond.</let_user_respond>
|
37
38
|
"""
|
38
39
|
|
39
40
|
|
@@ -45,7 +46,11 @@ OBSERVATION:
|
|
45
46
|
Artifact dog.jpg loaded to /path/to/images/dog.jpg
|
46
47
|
[End of artifacts]
|
47
48
|
|
48
|
-
AGENT:
|
49
|
+
AGENT: <thinking>I will use the generate_vision_code to detect the dogs in the image.</thinking>
|
50
|
+
<execute_python>
|
51
|
+
generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
|
52
|
+
</execute_python>
|
53
|
+
<let_user_respond>false</let_user_respond>
|
49
54
|
|
50
55
|
OBSERVATION:
|
51
56
|
[Artifact dog_detector.py]
|
@@ -56,14 +61,21 @@ OBSERVATION:
|
|
56
61
|
4| return dogs
|
57
62
|
[End of artifact]
|
58
63
|
|
59
|
-
AGENT:
|
64
|
+
AGENT: <thinking>I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.</thinking>
|
65
|
+
<execute_python>
|
66
|
+
from dog_detector import detect_dogs
|
67
|
+
print(detect_dogs('/example/workspace/dog.jpg'))
|
68
|
+
</execute_python>
|
69
|
+
<let_user_respond>false</let_user_respond>
|
60
70
|
|
61
71
|
OBSERVATION:
|
62
72
|
----- stdout -----
|
63
73
|
[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
|
64
74
|
|
65
75
|
|
66
|
-
AGENT:
|
76
|
+
AGENT: <thinking>Two dogs are detected, I will show this to the user and ask them if the result looks good.</thinking>
|
77
|
+
<response>The code detected two dogs, do the results look good to you?</response>
|
78
|
+
<let_user_respond>true</let_user_respond>
|
67
79
|
"""
|
68
80
|
|
69
81
|
EXAMPLES_CODE1_EXTRA = """
|
@@ -75,7 +87,11 @@ Artifact dog.jpg loaded to /path/to/images/dog.jpg
|
|
75
87
|
Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
|
76
88
|
[End of artifacts]
|
77
89
|
|
78
|
-
AGENT:
|
90
|
+
AGENT: <thinking>I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that</thinking>
|
91
|
+
<execute_python>
|
92
|
+
edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
|
93
|
+
</execute_python>
|
94
|
+
<let_user_respond>false</let_user_respond>
|
79
95
|
|
80
96
|
OBSERVATION:
|
81
97
|
[Artifact dog_detector.py]
|
@@ -86,13 +102,20 @@ OBSERVATION:
|
|
86
102
|
4| return dogs
|
87
103
|
[End of artifact]
|
88
104
|
|
89
|
-
AGENT:
|
105
|
+
AGENT: <thinking>I have edited the code to detect only one dog, I must now run the code and print the results to get the output.</thinking>
|
106
|
+
<execute_python>
|
107
|
+
from dog_detector import detect_dogs
|
108
|
+
print(detect_dogs('/path/to/images/dog.jpg'))
|
109
|
+
</execute_python>
|
110
|
+
<let_user_respond>false</let_user_respond>
|
90
111
|
|
91
112
|
OBSERVATION:
|
92
113
|
----- stdout -----
|
93
114
|
[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
|
94
115
|
|
95
|
-
AGENT:
|
116
|
+
AGENT: <thinking>One dog is detected, I will show this to the user and ask them if the result looks good.</thinking>
|
117
|
+
<response>The code detected one dog, do these results look good to you?</response>
|
118
|
+
<let_user_respond>true</let_user_respond>
|
96
119
|
"""
|
97
120
|
|
98
121
|
EXAMPLES_CODE2 = """
|
@@ -103,12 +126,18 @@ OBSERVATION:
|
|
103
126
|
Artifact image.jpg loaded to /path/to/images/image.jpg
|
104
127
|
[End of artifacts]
|
105
128
|
|
106
|
-
AGENT:
|
129
|
+
AGENT: <thinking>The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.</thinking>
|
130
|
+
<execute_python>
|
131
|
+
view_media_artifact('image.jpg')
|
132
|
+
</execute_python>
|
133
|
+
<let_user_respond>false</let_user_respond>
|
107
134
|
|
108
135
|
OBSERVATION:
|
109
136
|
[Image image.jpg displayed]
|
110
137
|
|
111
|
-
AGENT:
|
138
|
+
AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.</thinking>
|
139
|
+
<response>The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?</response>
|
140
|
+
<let_user_respond>true</let_user_respond>
|
112
141
|
"""
|
113
142
|
|
114
143
|
|
@@ -119,7 +148,9 @@ OBSERVATION:
|
|
119
148
|
[Artifacts loaded]
|
120
149
|
[End of artifacts]
|
121
150
|
|
122
|
-
AGENT:
|
151
|
+
AGENT: <thinking>The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.</thinking>
|
152
|
+
<response>Can you provide an image of workers with helmets?</response>
|
153
|
+
<let_user_respond>true</let_user_respond>
|
123
154
|
|
124
155
|
USER: Yes you can use workers.png
|
125
156
|
|
@@ -128,7 +159,11 @@ OBSERVATION:
|
|
128
159
|
Artifact workers.png loaded to /path/to/images/workers.png
|
129
160
|
[End of artifacts]
|
130
161
|
|
131
|
-
AGENT:
|
162
|
+
AGENT: <thinking>I will use the generate_vision_code to count the workers with helmets in the image.</thinking>
|
163
|
+
<execute_python>
|
164
|
+
generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?', media=['/path/to/images/workers.png'])
|
165
|
+
</execute_python>
|
166
|
+
<let_user_respond>false</let_user_respond>
|
132
167
|
|
133
168
|
OBSERVATION:
|
134
169
|
[Artifact code.py]
|
@@ -152,22 +187,37 @@ OBSERVATION:
|
|
152
187
|
16| return count
|
153
188
|
[End of artifact]
|
154
189
|
|
155
|
-
AGENT:
|
190
|
+
AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
|
191
|
+
<execute_python>
|
192
|
+
from code import count_workers_with_helmets
|
193
|
+
print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
|
194
|
+
</execute_python>
|
195
|
+
<let_user_respond>false</let_user_respond>
|
156
196
|
|
157
197
|
OBSERVATION:
|
158
198
|
----- stdout -----
|
159
199
|
2
|
160
200
|
|
161
|
-
AGENT:
|
201
|
+
AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
|
202
|
+
<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
|
203
|
+
<let_user_respond>true</let_user_respond>
|
162
204
|
|
163
205
|
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
|
164
206
|
|
165
|
-
AGENT:
|
207
|
+
AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
|
208
|
+
<execute_python>
|
209
|
+
object_detection_fine_tuning([{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
|
210
|
+
</execute_python>
|
211
|
+
<let_user_respond>false</let_user_respond>
|
166
212
|
|
167
213
|
OBSERVATION:
|
168
214
|
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
|
169
215
|
|
170
|
-
AGENT:
|
216
|
+
AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.</thinking>
|
217
|
+
<execute_python>
|
218
|
+
use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
|
219
|
+
</execute_python>
|
220
|
+
<let_user_respond>false</let_user_respond>
|
171
221
|
|
172
222
|
OBSERVATION:
|
173
223
|
[Artifact code.py edits]
|
@@ -184,11 +234,18 @@ OBSERVATION:
|
|
184
234
|
count = 0
|
185
235
|
[End of edits]
|
186
236
|
|
187
|
-
AGENT:
|
237
|
+
AGENT: <thinking>I have updated the code to use the fine tuned model, I must now run the code and print the output.</thinking>
|
238
|
+
<execute_python>
|
239
|
+
from code import count_workers_with_helmets
|
240
|
+
print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
|
241
|
+
</execute_python>
|
242
|
+
<let_user_respond>false</let_user_respond>
|
188
243
|
|
189
244
|
OBSERVATION:
|
190
245
|
----- stdout -----
|
191
246
|
3
|
192
247
|
|
193
|
-
AGENT:
|
248
|
+
AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
|
249
|
+
<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
|
250
|
+
<let_user_respond>true</let_user_respond>
|
194
251
|
"""
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
import base64
|
2
1
|
import difflib
|
3
2
|
import json
|
4
3
|
import os
|
@@ -9,7 +8,6 @@ import tempfile
|
|
9
8
|
from pathlib import Path
|
10
9
|
from typing import Any, Dict, List, Optional, Union
|
11
10
|
|
12
|
-
import numpy as np
|
13
11
|
from IPython.display import display
|
14
12
|
from redbaron import RedBaron # type: ignore
|
15
13
|
|
@@ -22,8 +20,7 @@ from vision_agent.tools.tool_utils import get_tool_documentation
|
|
22
20
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
23
21
|
from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
|
24
22
|
from vision_agent.utils.execute import Execution, MimeType
|
25
|
-
from vision_agent.utils.image_utils import convert_to_b64
|
26
|
-
from vision_agent.utils.video import frames_to_bytes
|
23
|
+
from vision_agent.utils.image_utils import convert_to_b64
|
27
24
|
|
28
25
|
CURRENT_FILE = None
|
29
26
|
CURRENT_LINE = 0
|
@@ -393,19 +390,6 @@ def generate_vision_plan(
|
|
393
390
|
redisplay_results(response.test_results)
|
394
391
|
response.test_results = None
|
395
392
|
artifacts[name] = response.model_dump_json()
|
396
|
-
media_names = extract_json(
|
397
|
-
AnthropicLMM()( # type: ignore
|
398
|
-
f"""Extract any media file names from this output in the following JSON format:
|
399
|
-
{{"media": ["image1.jpg", "image2.jpg"]}}
|
400
|
-
|
401
|
-
{artifacts[name]}"""
|
402
|
-
)
|
403
|
-
)
|
404
|
-
if "media" in media_names and isinstance(media_names, dict):
|
405
|
-
for media in media_names["media"]:
|
406
|
-
if isinstance(media, str):
|
407
|
-
with open(media, "rb") as f:
|
408
|
-
artifacts[media] = f.read()
|
409
393
|
|
410
394
|
output_str = f"[Start Plan Context, saved at {name}]"
|
411
395
|
for plan in response.plans.keys():
|
@@ -466,6 +450,12 @@ def generate_vision_code(
|
|
466
450
|
test_multi_plan=test_multi_plan,
|
467
451
|
custom_tool_names=custom_tool_names,
|
468
452
|
)
|
453
|
+
|
454
|
+
# capture and save any files that were saved in the code to the artifacts
|
455
|
+
extract_and_save_files_to_artifacts(
|
456
|
+
artifacts, response["code"] + "\n" + response["test"]
|
457
|
+
)
|
458
|
+
|
469
459
|
redisplay_results(response["test_result"])
|
470
460
|
code = response["code"]
|
471
461
|
artifacts[name] = code
|
@@ -546,6 +536,11 @@ def edit_vision_code(
|
|
546
536
|
test_multi_plan=False,
|
547
537
|
custom_tool_names=custom_tool_names,
|
548
538
|
)
|
539
|
+
# capture and save any files that were saved in the code to the artifacts
|
540
|
+
extract_and_save_files_to_artifacts(
|
541
|
+
artifacts, response["code"] + "\n" + response["test"]
|
542
|
+
)
|
543
|
+
|
549
544
|
redisplay_results(response["test_result"])
|
550
545
|
code = response["code"]
|
551
546
|
artifacts[name] = code
|
@@ -567,49 +562,6 @@ def edit_vision_code(
|
|
567
562
|
return view_lines(code_lines, 0, total_lines, name, total_lines)
|
568
563
|
|
569
564
|
|
570
|
-
def write_media_artifact(
|
571
|
-
artifacts: Artifacts,
|
572
|
-
name: str,
|
573
|
-
media: Union[str, np.ndarray, List[np.ndarray]],
|
574
|
-
fps: Optional[float] = None,
|
575
|
-
) -> str:
|
576
|
-
"""Writes a media file to the artifacts object.
|
577
|
-
|
578
|
-
Parameters:
|
579
|
-
artifacts (Artifacts): The artifacts object to save the media to.
|
580
|
-
name (str): The name of the media artifact to save.
|
581
|
-
media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
|
582
|
-
be a file path, single image or list of frames for a video.
|
583
|
-
fps (Optional[float]): The frames per second if you are writing a video.
|
584
|
-
"""
|
585
|
-
if isinstance(media, str):
|
586
|
-
with open(media, "rb") as f:
|
587
|
-
media_bytes = f.read()
|
588
|
-
elif isinstance(media, list):
|
589
|
-
media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
|
590
|
-
elif isinstance(media, np.ndarray):
|
591
|
-
media_bytes = numpy_to_bytes(media)
|
592
|
-
else:
|
593
|
-
print(f"[Invalid media type {type(media)}]")
|
594
|
-
return f"[Invalid media type {type(media)}]"
|
595
|
-
artifacts[name] = media_bytes
|
596
|
-
print(f"[Media {name} saved]")
|
597
|
-
display(
|
598
|
-
{
|
599
|
-
MimeType.APPLICATION_ARTIFACT: json.dumps(
|
600
|
-
{
|
601
|
-
"name": name,
|
602
|
-
"action": "create",
|
603
|
-
"content": base64.b64encode(media_bytes).decode("utf-8"),
|
604
|
-
"contentType": "media_output",
|
605
|
-
}
|
606
|
-
)
|
607
|
-
},
|
608
|
-
raw=True,
|
609
|
-
)
|
610
|
-
return f"[Media {name} saved]"
|
611
|
-
|
612
|
-
|
613
565
|
def list_artifacts(artifacts: Artifacts) -> str:
|
614
566
|
"""Lists all the artifacts that have been loaded into the artifacts object."""
|
615
567
|
output_str = artifacts.show()
|
@@ -813,6 +765,61 @@ def use_object_detection_fine_tuning(
|
|
813
765
|
return diff
|
814
766
|
|
815
767
|
|
768
|
+
# Suffixes, written with the leading dot that ``Path.suffix`` returns, of the
# files that are read as text. Every other file is read as raw bytes.
_TEXT_FILE_EXTENSIONS = frozenset(
    {
        ".txt",
        ".md",
        ".rtf",
        ".html",
        ".htm",
        ".xml",
        ".json",
        ".csv",
        ".tsv",
        ".yaml",
        ".yml",
        ".toml",
        ".conf",
        ".env",
        ".ini",
        ".log",
        ".py",
        ".java",
        ".js",
        ".cpp",
        ".c",
        ".sql",
        ".sh",
    }
)


def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None:
    """Extracts and saves files used in the code to the artifacts object.

    An Anthropic LMM is asked which file paths the code snippet saves. Every
    returned path that exists as a regular file is read and stored in
    ``artifacts`` under its base name: as text when its suffix is a known text
    extension, otherwise as bytes. Nothing is saved when the model's reply
    cannot be decoded as JSON or does not contain a ``"file_paths"`` list.

    Parameters:
        artifacts (Artifacts): The artifacts object to save the files to.
        code (str): The code to extract the files from.
    """
    try:
        response = extract_json(
            AnthropicLMM()(  # type: ignore
                f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. Below is the code snippet:

```python
{code}
```

Return the file paths in the following JSON format:
{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
            )
        )
    except json.JSONDecodeError:
        # Best effort: an unparseable reply simply means nothing is saved.
        return

    if "file_paths" not in response or not isinstance(response["file_paths"], list):
        return

    for file_path in response["file_paths"]:
        # The list comes from a model reply, so skip entries that are not paths.
        if not isinstance(file_path, str):
            continue
        path = Path(file_path)
        if not path.is_file():
            continue
        read_mode = "r" if path.suffix in _TEXT_FILE_EXTENSIONS else "rb"
        with open(path, read_mode) as f:
            artifacts[path.name] = f.read()
|
821
|
+
|
822
|
+
|
816
823
|
META_TOOL_DOCSTRING = get_tool_documentation(
|
817
824
|
[
|
818
825
|
get_tool_descriptions,
|
@@ -822,7 +829,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
|
|
822
829
|
generate_vision_plan,
|
823
830
|
generate_vision_code,
|
824
831
|
edit_vision_code,
|
825
|
-
write_media_artifact,
|
826
832
|
view_media_artifact,
|
827
833
|
object_detection_fine_tuning,
|
828
834
|
use_object_detection_fine_tuning,
|
vision_agent/tools/tools.py
CHANGED
@@ -181,6 +181,8 @@ def owl_v2_image(
|
|
181
181
|
"""
|
182
182
|
|
183
183
|
image_size = image.shape[:2]
|
184
|
+
if image_size[0] < 1 or image_size[1] < 1:
|
185
|
+
return []
|
184
186
|
|
185
187
|
if fine_tune_id is not None:
|
186
188
|
image_b64 = convert_to_b64(image)
|
@@ -413,6 +415,9 @@ def florence2_sam2_image(
|
|
413
415
|
},
|
414
416
|
]
|
415
417
|
"""
|
418
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
419
|
+
return []
|
420
|
+
|
416
421
|
if fine_tune_id is not None:
|
417
422
|
image_b64 = convert_to_b64(image)
|
418
423
|
landing_api = LandingPublicAPI()
|
@@ -701,6 +706,8 @@ def countgd_counting(
|
|
701
706
|
]
|
702
707
|
"""
|
703
708
|
image_size = image.shape[:2]
|
709
|
+
if image_size[0] < 1 or image_size[1] < 1:
|
710
|
+
return []
|
704
711
|
buffer_bytes = numpy_to_bytes(image)
|
705
712
|
files = [("image", buffer_bytes)]
|
706
713
|
prompt = prompt.replace(", ", " .")
|
@@ -759,6 +766,8 @@ def countgd_example_based_counting(
|
|
759
766
|
]
|
760
767
|
"""
|
761
768
|
image_size = image.shape[:2]
|
769
|
+
if image_size[0] < 1 or image_size[1] < 1:
|
770
|
+
return []
|
762
771
|
buffer_bytes = numpy_to_bytes(image)
|
763
772
|
files = [("image", buffer_bytes)]
|
764
773
|
visual_prompts = [
|
@@ -828,6 +837,8 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
|
|
828
837
|
>>> ixc25_image_vqa('What is the cat doing?', image)
|
829
838
|
'drinking milk'
|
830
839
|
"""
|
840
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
841
|
+
raise ValueError(f"Image is empty, image shape: {image.shape}")
|
831
842
|
|
832
843
|
buffer_bytes = numpy_to_bytes(image)
|
833
844
|
files = [("image", buffer_bytes)]
|
@@ -871,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
|
|
871
882
|
return cast(str, data["answer"])
|
872
883
|
|
873
884
|
|
874
|
-
def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
|
875
|
-
"""'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
|
876
|
-
given a prompt that can be other an object or a phrase. It returns a list of
|
877
|
-
boolean values indicating whether the object or phrase is present in the
|
878
|
-
corresponding frame.
|
879
|
-
|
880
|
-
Parameters:
|
881
|
-
prompt (str): The question about the video
|
882
|
-
frames (List[np.ndarray]): The reference frames used for the question
|
883
|
-
|
884
|
-
Returns:
|
885
|
-
List[bool]: A list of boolean values indicating whether the object or phrase is
|
886
|
-
present in the corresponding frame.
|
887
|
-
|
888
|
-
Example
|
889
|
-
-------
|
890
|
-
>>> output = ixc25_temporal_localization('soccer goal', frames)
|
891
|
-
>>> print(output)
|
892
|
-
[False, False, False, True, True, True, False, False, False, False]
|
893
|
-
>>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
|
894
|
-
"""
|
895
|
-
|
896
|
-
buffer_bytes = frames_to_bytes(frames)
|
897
|
-
files = [("video", buffer_bytes)]
|
898
|
-
payload = {
|
899
|
-
"prompt": prompt,
|
900
|
-
"chunk_length": 2,
|
901
|
-
"function_name": "ixc25_temporal_localization",
|
902
|
-
}
|
903
|
-
data: List[int] = send_inference_request(
|
904
|
-
payload,
|
905
|
-
"video-temporal-localization?model=internlm-xcomposer",
|
906
|
-
files=files,
|
907
|
-
v2=True,
|
908
|
-
)
|
909
|
-
chunk_size = round(len(frames) / len(data))
|
910
|
-
data_explode = [[elt] * chunk_size for elt in data]
|
911
|
-
data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
|
912
|
-
return data_bool[: len(frames)]
|
913
|
-
|
914
|
-
|
915
885
|
def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
|
916
886
|
"""'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
|
917
887
|
including regular images or images of documents or presentations. It returns text
|
@@ -1024,6 +994,9 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
|
|
1024
994
|
{"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
|
1025
995
|
"""
|
1026
996
|
|
997
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
998
|
+
return {"labels": [], "scores": []}
|
999
|
+
|
1027
1000
|
image_b64 = convert_to_b64(image)
|
1028
1001
|
data = {
|
1029
1002
|
"prompt": ",".join(classes),
|
@@ -1052,6 +1025,8 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
|
|
1052
1025
|
>>> vit_image_classification(image)
|
1053
1026
|
{"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
|
1054
1027
|
"""
|
1028
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
1029
|
+
return {"labels": [], "scores": []}
|
1055
1030
|
|
1056
1031
|
image_b64 = convert_to_b64(image)
|
1057
1032
|
data = {
|
@@ -1080,6 +1055,8 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
|
|
1080
1055
|
>>> vit_nsfw_classification(image)
|
1081
1056
|
{"label": "normal", "scores": 0.68},
|
1082
1057
|
"""
|
1058
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
1059
|
+
raise ValueError(f"Image is empty, image shape: {image.shape}")
|
1083
1060
|
|
1084
1061
|
image_b64 = convert_to_b64(image)
|
1085
1062
|
data = {
|
@@ -1180,6 +1157,8 @@ def florence2_phrase_grounding(
|
|
1180
1157
|
]
|
1181
1158
|
"""
|
1182
1159
|
image_size = image.shape[:2]
|
1160
|
+
if image_size[0] < 1 or image_size[1] < 1:
|
1161
|
+
return []
|
1183
1162
|
image_b64 = convert_to_b64(image)
|
1184
1163
|
|
1185
1164
|
if fine_tune_id is not None:
|
@@ -1399,6 +1378,8 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
|
|
1399
1378
|
},
|
1400
1379
|
]
|
1401
1380
|
"""
|
1381
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
1382
|
+
return []
|
1402
1383
|
image_b64 = convert_to_b64(image)
|
1403
1384
|
data = {
|
1404
1385
|
"image": image_b64,
|
@@ -1442,6 +1423,9 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
|
|
1442
1423
|
[10, 11, 15, ..., 202, 202, 205],
|
1443
1424
|
[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
|
1444
1425
|
"""
|
1426
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
1427
|
+
raise ValueError(f"Image is empty, image shape: {image.shape}")
|
1428
|
+
|
1445
1429
|
image_b64 = convert_to_b64(image)
|
1446
1430
|
data = {
|
1447
1431
|
"image": image_b64,
|
vision_agent/utils/video.py
CHANGED
@@ -11,6 +11,9 @@ import numpy as np
|
|
11
11
|
_LOGGER = logging.getLogger(__name__)
|
12
12
|
# The maximum length of the clip to extract frames from, in seconds
|
13
13
|
|
14
|
+
_DEFAULT_VIDEO_FPS = 24
|
15
|
+
_DEFAULT_INPUT_FPS = 1.0
|
16
|
+
|
14
17
|
|
15
18
|
def play_video(video_base64: str) -> None:
|
16
19
|
"""Play a video file"""
|
@@ -51,7 +54,9 @@ def _resize_frame(frame: np.ndarray) -> np.ndarray:
|
|
51
54
|
|
52
55
|
|
53
56
|
def video_writer(
|
54
|
-
frames: List[np.ndarray],
|
57
|
+
frames: List[np.ndarray],
|
58
|
+
fps: float = _DEFAULT_INPUT_FPS,
|
59
|
+
filename: Optional[str] = None,
|
55
60
|
) -> str:
|
56
61
|
if filename is None:
|
57
62
|
filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
|
@@ -78,7 +83,7 @@ def video_writer(
|
|
78
83
|
|
79
84
|
|
80
85
|
def frames_to_bytes(
|
81
|
-
frames: List[np.ndarray], fps: float =
|
86
|
+
frames: List[np.ndarray], fps: float = _DEFAULT_INPUT_FPS, file_ext: str = ".mp4"
|
82
87
|
) -> bytes:
|
83
88
|
r"""Convert a list of frames to a video file encoded into a byte string.
|
84
89
|
|
@@ -101,7 +106,7 @@ def frames_to_bytes(
|
|
101
106
|
# same file name and the time savings are very large.
|
102
107
|
@lru_cache(maxsize=8)
|
103
108
|
def extract_frames_from_video(
|
104
|
-
video_uri: str, fps: float =
|
109
|
+
video_uri: str, fps: float = _DEFAULT_INPUT_FPS
|
105
110
|
) -> List[Tuple[np.ndarray, float]]:
|
106
111
|
"""Extract frames from a video along with the timestamp in seconds.
|
107
112
|
|
@@ -118,6 +123,16 @@ def extract_frames_from_video(
|
|
118
123
|
|
119
124
|
cap = cv2.VideoCapture(video_uri)
|
120
125
|
orig_fps = cap.get(cv2.CAP_PROP_FPS)
|
126
|
+
if not orig_fps or orig_fps <= 0:
|
127
|
+
_LOGGER.warning(
|
128
|
+
f"Input video, {video_uri}, has no fps, using the default value {_DEFAULT_VIDEO_FPS}"
|
129
|
+
)
|
130
|
+
orig_fps = _DEFAULT_VIDEO_FPS
|
131
|
+
if not fps or fps <= 0:
|
132
|
+
_LOGGER.warning(
|
133
|
+
f"Input fps, {fps}, is illegal, using the default value: {_DEFAULT_INPUT_FPS}"
|
134
|
+
)
|
135
|
+
fps = _DEFAULT_INPUT_FPS
|
121
136
|
orig_frame_time = 1 / orig_fps
|
122
137
|
targ_frame_time = 1 / fps
|
123
138
|
frames: List[Tuple[np.ndarray, float]] = []
|
@@ -129,10 +144,15 @@ def extract_frames_from_video(
|
|
129
144
|
break
|
130
145
|
|
131
146
|
elapsed_time += orig_frame_time
|
147
|
+
# This is to prevent a floating-point precision loss issue, which can cause
|
148
|
+
# the elapsed time to be slightly less than the target frame time, which
|
149
|
+
# causes the last frame to be skipped
|
150
|
+
elapsed_time = round(elapsed_time, 8)
|
132
151
|
if elapsed_time >= targ_frame_time:
|
133
152
|
frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps))
|
134
153
|
elapsed_time -= targ_frame_time
|
135
154
|
|
136
155
|
i += 1
|
137
156
|
cap.release()
|
157
|
+
_LOGGER.info(f"Extracted {len(frames)} frames from {video_uri}")
|
138
158
|
return frames
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.164
|
3
|
+
Version: 0.2.166
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -41,7 +41,7 @@ Project-URL: repository, https://github.com/landing-ai/vision-agent
|
|
41
41
|
Description-Content-Type: text/markdown
|
42
42
|
|
43
43
|
<div align="center">
|
44
|
-
<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.
|
44
|
+
<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">
|
45
45
|
|
46
46
|
# 🔍🤖 Vision Agent
|
47
47
|
[](https://discord.gg/wPdN8RCYew)
|
@@ -387,6 +387,11 @@ result = agent.generate_code(conv)
|
|
387
387
|
|
388
388
|
|
389
389
|
## Additional Backends
|
390
|
+
### E2B Code Execution
|
391
|
+
If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
|
392
|
+
set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
|
393
|
+
run all the agent-generated code on the E2B backend.
|
394
|
+
|
390
395
|
### Anthropic
|
391
396
|
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
|
392
397
|
Anthropic API key and set it in your environment variables:
|
@@ -2,12 +2,12 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
2
2
|
vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=cbY_V3f85_g8JmASa3m2LBX4G6xgsOKX1n7YtCf-C98,23676
|
6
6
|
vision_agent/agent/vision_agent_coder.py,sha256=aVkl0b9LKvy-auuHGYSag-ixYnue0iRQqD1PYLPBR-s,29312
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
8
8
|
vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
|
9
9
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
|
10
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
10
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=_xAITNDKcS45tqhEax5i6vDQa4V39f9n55iRGk2R6RM,11218
|
11
11
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
13
13
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
16
16
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
|
-
vision_agent/tools/__init__.py,sha256=
|
20
|
-
vision_agent/tools/meta_tools.py,sha256=
|
19
|
+
vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
|
20
|
+
vision_agent/tools/meta_tools.py,sha256=ZF-7z3KT-Su08MvF5OhSm3Taqeu1Ek-EZjFhpN5w1uU,28257
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4
|
|
28
28
|
vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
|
-
vision_agent/utils/video.py,sha256=
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
31
|
+
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
+
vision_agent-0.2.166.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.166.dist-info/METADATA,sha256=e15d4yNaAJvLCViaBUFo_RNHII88W-y9WgJauEFfbyU,18034
|
34
|
+
vision_agent-0.2.166.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.166.dist-info/RECORD,,
|
File without changes
|
File without changes
|