vision-agent 0.2.164__py3-none-any.whl → 0.2.166__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import copy
2
+ import json
2
3
  import logging
3
4
  import os
4
5
  import tempfile
@@ -6,7 +7,7 @@ from pathlib import Path
6
7
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
7
8
 
8
9
  from vision_agent.agent import Agent
9
- from vision_agent.agent.agent_utils import extract_json
10
+ from vision_agent.agent.agent_utils import extract_json, extract_tag
10
11
  from vision_agent.agent.vision_agent_prompts import (
11
12
  EXAMPLES_CODE1,
12
13
  EXAMPLES_CODE2,
@@ -18,6 +19,7 @@ from vision_agent.tools.meta_tools import (
18
19
  META_TOOL_DOCSTRING,
19
20
  Artifacts,
20
21
  check_and_load_image,
22
+ extract_and_save_files_to_artifacts,
21
23
  use_extra_vision_agent_args,
22
24
  )
23
25
  from vision_agent.utils import CodeInterpreterFactory
@@ -35,7 +37,7 @@ class BoilerplateCode:
35
37
  pre_code = [
36
38
  "from typing import *",
37
39
  "from vision_agent.utils.execute import CodeInterpreter",
38
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
40
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
39
41
  "artifacts = Artifacts('{remote_path}')",
40
42
  "artifacts.load('{remote_path}')",
41
43
  ]
@@ -57,6 +59,32 @@ class BoilerplateCode:
57
59
  )
58
60
 
59
61
 
62
def format_agent_message(agent_message: str) -> str:
    """Render a JSON-encoded agent message in the tag-based text format.

    The message is parsed with ``extract_json``. Each of the fields
    ``thinking``, ``response``, ``execute_python`` and ``let_user_respond``
    that is present and truthy is wrapped in a tag of the same name and
    appended in that order. Missing or falsy fields are left out entirely.

    Parameters:
        agent_message (str): The agent message to parse with ``extract_json``.

    Returns:
        str: The concatenated tags, or an empty string when no field is set.
    """
    # presumably extract_json returns a dict; verify against its definition
    agent_message_json = extract_json(agent_message)
    parts = []

    thinking = agent_message_json.get("thinking")
    if thinking:
        # f-string formatting stringifies the value, so a non-string JSON
        # value no longer raises TypeError on concatenation.
        parts.append(f"<thinking>{thinking}</thinking>")

    reply = agent_message_json.get("response")
    if reply:
        parts.append(f"<response>{reply}</response>")

    code = agent_message_json.get("execute_python")
    if code:
        # The code block is the only tag surrounded by newlines.
        parts.append(f"\n<execute_python>\n{code}\n</execute_python>\n")

    let_user_respond = agent_message_json.get("let_user_respond")
    if let_user_respond:
        # NOTE(review): a false value is omitted rather than rendered as
        # <let_user_respond>false</let_user_respond> - confirm this is intended.
        parts.append(f"<let_user_respond>{let_user_respond}</let_user_respond>")

    return "".join(parts)
86
+
87
+
60
88
  def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
61
89
  chat = copy.deepcopy(chat)
62
90
 
@@ -67,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
67
95
  elif chat_i["role"] == "observation":
68
96
  conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
69
97
  elif chat_i["role"] == "assistant":
70
- conversation += f"AGENT: {chat_i['content']}\n\n"
98
+ conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" # type: ignore
71
99
  else:
72
100
  raise ValueError(f"role {chat_i['role']} is not supported")
73
101
 
@@ -84,15 +112,31 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
84
112
  and len(chat[-1]["media"]) > 0 # type: ignore
85
113
  ):
86
114
  message["media"] = chat[-1]["media"]
87
- return extract_json(orch([message], stream=False)) # type: ignore
115
+ conv_resp = cast(str, orch([message], stream=False))
116
+
117
+ let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
118
+ let_user_respond = (
119
+ "true" in let_user_respond_str.lower() if let_user_respond_str else False
120
+ )
121
+
122
+ return {
123
+ "thinking": extract_tag(conv_resp, "thinking"),
124
+ "response": extract_tag(conv_resp, "response"),
125
+ "execute_python": extract_tag(conv_resp, "execute_python"),
126
+ "let_user_respond": let_user_respond,
127
+ }
88
128
 
89
129
 
90
130
  def execute_code_action(
91
- code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
131
+ artifacts: Artifacts,
132
+ code: str,
133
+ code_interpreter: CodeInterpreter,
134
+ artifact_remote_path: str,
92
135
  ) -> Tuple[Execution, str]:
93
136
  result = code_interpreter.exec_isolation(
94
137
  BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
95
138
  )
139
+ extract_and_save_files_to_artifacts(artifacts, code)
96
140
 
97
141
  obs = str(result.logs)
98
142
  if result.error:
@@ -100,33 +144,8 @@ def execute_code_action(
100
144
  return result, obs
101
145
 
102
146
 
103
- def parse_execution(
104
- response: str,
105
- test_multi_plan: bool = True,
106
- custom_tool_names: Optional[List[str]] = None,
107
- ) -> Optional[str]:
108
- code = None
109
- remaining = response
110
- all_code = []
111
- while "<execute_python>" in remaining:
112
- code_i = remaining[
113
- remaining.find("<execute_python>") + len("<execute_python>") :
114
- ]
115
- code_i = code_i[: code_i.find("</execute_python>")]
116
- remaining = remaining[
117
- remaining.find("</execute_python>") + len("</execute_python>") :
118
- ]
119
- all_code.append(code_i)
120
-
121
- if len(all_code) > 0:
122
- code = "\n".join(all_code)
123
-
124
- if code is not None:
125
- code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
126
- return code
127
-
128
-
129
147
  def execute_user_code_action(
148
+ artifacts: Artifacts,
130
149
  last_user_message: Message,
131
150
  code_interpreter: CodeInterpreter,
132
151
  artifact_remote_path: str,
@@ -138,40 +157,43 @@ def execute_user_code_action(
138
157
  return user_result, user_obs
139
158
 
140
159
  last_user_content = cast(str, last_user_message.get("content", ""))
160
+ try:
161
+ user_code_action = json.loads(last_user_content).get("execute_python", None)
162
+ except json.JSONDecodeError:
163
+ return user_result, user_obs
141
164
 
142
- user_code_action = parse_execution(last_user_content, False)
143
165
  if user_code_action is not None:
166
+ user_code_action = use_extra_vision_agent_args(user_code_action, False)
144
167
  user_result, user_obs = execute_code_action(
145
- user_code_action, code_interpreter, artifact_remote_path
168
+ artifacts, user_code_action, code_interpreter, artifact_remote_path
146
169
  )
147
170
  if user_result.error:
148
171
  user_obs += f"\n{user_result.error}"
149
172
  return user_result, user_obs
150
173
 
151
174
 
152
def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``response`` with a short description of its code step.

    When ``execute_python`` is present and truthy, the ``response`` field of
    the copy is replaced by a description built from the known tool names
    found in the code, or by ``"Executing code."`` when none is found.
    Otherwise the copy is returned unchanged. The input dict is never mutated.

    Parameters:
        response (Dict[str, Any]): The agent response.

    Returns:
        Dict[str, Any]: A deep copy of ``response``, possibly with an updated
            ``response`` field.
    """
    response = copy.deepcopy(response)

    code = response.get("execute_python")
    if not code:
        return response

    # only include descriptions for these, the rest will just have executing
    # code
    description_map = {
        "open_code_artifact": "Reading file.",
        "create_code_artifact": "Creating file.",
        "edit_code_artifact": "Editing file.",
        "generate_vision_code": "Generating vision code.",
        "edit_vision_code": "Editing vision code.",
    }
    matched = [text for name, text in description_map.items() if name in code]

    # Join with single spaces so the description carries no trailing space.
    response["response"] = " ".join(matched) if matched else "Executing code."
    return response
176
198
 
177
199
 
@@ -369,7 +391,10 @@ class VisionAgent(Agent):
369
391
  self.streaming_message({"role": "observation", "content": artifacts_loaded})
370
392
 
371
393
  user_result, user_obs = execute_user_code_action(
372
- last_user_message, code_interpreter, str(remote_artifacts_path)
394
+ artifacts,
395
+ last_user_message,
396
+ code_interpreter,
397
+ str(remote_artifacts_path),
373
398
  )
374
399
  finished = user_result is not None and user_obs is not None
375
400
  if user_result is not None and user_obs is not None:
@@ -394,13 +419,13 @@ class VisionAgent(Agent):
394
419
  int_chat.append(
395
420
  {
396
421
  "role": "assistant",
397
- "content": str(add_step_descriptions(response)),
422
+ "content": json.dumps(add_step_descriptions(response)),
398
423
  }
399
424
  )
400
425
  orig_chat.append(
401
426
  {
402
427
  "role": "assistant",
403
- "content": str(add_step_descriptions(response)),
428
+ "content": json.dumps(add_step_descriptions(response)),
404
429
  }
405
430
  )
406
431
 
@@ -408,11 +433,13 @@ class VisionAgent(Agent):
408
433
  if last_response == response:
409
434
  response["let_user_respond"] = True
410
435
 
411
- finished = response["let_user_respond"]
436
+ finished = response.get("let_user_respond", False)
412
437
 
413
- code_action = parse_execution(
414
- response["response"], test_multi_plan, custom_tool_names
415
- )
438
+ code_action = response.get("execute_python", None)
439
+ if code_action is not None:
440
+ code_action = use_extra_vision_agent_args(
441
+ code_action, test_multi_plan, custom_tool_names
442
+ )
416
443
 
417
444
  if last_response == response:
418
445
  self.streaming_message(
@@ -431,14 +458,17 @@ class VisionAgent(Agent):
431
458
  self.streaming_message(
432
459
  {
433
460
  "role": "assistant",
434
- "content": response,
461
+ "content": json.dumps(response),
435
462
  "finished": finished and code_action is None,
436
463
  }
437
464
  )
438
465
 
439
466
  if code_action is not None:
440
467
  result, obs = execute_code_action(
441
- code_action, code_interpreter, str(remote_artifacts_path)
468
+ artifacts,
469
+ code_action,
470
+ code_interpreter,
471
+ str(remote_artifacts_path),
442
472
  )
443
473
 
444
474
  media_obs = check_and_load_image(code_action)
@@ -27,13 +27,14 @@ Here is the current conversation so far:
27
27
  **Instructions**:
28
28
  1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
29
29
  2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
30
- 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
30
+ 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
31
31
  4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
32
- 5. **Output in JSON**: Respond in the following format in JSON:
32
+ 5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
33
33
 
34
- ```json
35
- {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
36
- ```
34
+ <thinking>Your thoughts here...</thinking>
35
+ <response>Your response to the user here...</response>
36
+ <execute_python>Your code here...</execute_python>
37
+ <let_user_respond>true/false for whether or not to you want to let the user respond.</let_user_respond>
37
38
  """
38
39
 
39
40
 
@@ -45,7 +46,11 @@ OBSERVATION:
45
46
  Artifact dog.jpg loaded to /path/to/images/dog.jpg
46
47
  [End of artifacts]
47
48
 
48
- AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
49
+ AGENT: <thinking>I will use the generate_vision_code to detect the dogs in the image.</thinking>
50
+ <execute_python>
51
+ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
52
+ </execute_python>
53
+ <let_user_respond>false</let_user_respond>
49
54
 
50
55
  OBSERVATION:
51
56
  [Artifact dog_detector.py]
@@ -56,14 +61,21 @@ OBSERVATION:
56
61
  4| return dogs
57
62
  [End of artifact]
58
63
 
59
- AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
64
+ AGENT: <thinking>I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.</thinking>
65
+ <execute_python>
66
+ from dog_detector import detect_dogs
67
+ print(detect_dogs('/example/workspace/dog.jpg'))
68
+ </execute_python>
69
+ <let_user_respond>false</let_user_respond>
60
70
 
61
71
  OBSERVATION:
62
72
  ----- stdout -----
63
73
  [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
64
74
 
65
75
 
66
- AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
76
+ AGENT: <thinking>Two dogs are detected, I will show this to the user and ask them if the result looks good.</thinking>
77
+ <response>The code detectd two dogs, do the results look good to you?</response>
78
+ <let_user_respond>true</let_user_respond>
67
79
  """
68
80
 
69
81
  EXAMPLES_CODE1_EXTRA = """
@@ -75,7 +87,11 @@ Artifact dog.jpg loaded to /path/to/images/dog.jpg
75
87
  Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
76
88
  [End of artifacts]
77
89
 
78
- AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
90
+ AGENT: <thinking>I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that</thinking>
91
+ <execute_python>
92
+ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
93
+ </execute_python>
94
+ <let_user_respond>false</let_user_respond>
79
95
 
80
96
  OBSERVATION:
81
97
  [Artifact dog_detector.py]
@@ -86,13 +102,20 @@ OBSERVATION:
86
102
  4| return dogs
87
103
  [End of artifact]
88
104
 
89
- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
105
+ AGENT: <thinking>I have edited the code to detect only one dog, I must now run the code and print the results to get the output.</thinking>
106
+ <execute_python>
107
+ from dog_detector import detect_dogs
108
+ print(detect_dogs('/path/to/images/dog.jpg'))
109
+ </execute_python>
110
+ <let_user_respond>false</let_user_respond>
90
111
 
91
112
  OBSERVATION:
92
113
  ----- stdout -----
93
114
  [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
94
115
 
95
- AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
116
+ AGENT: <thinking>One dog is detected, I will show this to the user and ask them if the result looks good.</thinking>
117
+ <response>The code detected one dog, do these results look good to you?</response>
118
+ <let_user_respond>true</let_user_respond>
96
119
  """
97
120
 
98
121
  EXAMPLES_CODE2 = """
@@ -103,12 +126,18 @@ OBSERVATION:
103
126
  Artifact image.jpg loaded to /path/to/images/image.jpg
104
127
  [End of artifacts]
105
128
 
106
- AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
129
+ AGENT: <thinking>The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.</thinking>
130
+ <execute_python>
131
+ view_media_artifacts('image.jpg')
132
+ </execute_python>
133
+ <let_user_respond>false</let_user_respond>
107
134
 
108
135
  OBSERVATION:
109
136
  [Image image.jpg displayed]
110
137
 
111
- AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
138
+ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.</thinking>
139
+ <response>The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?</response>
140
+ <let_user_respond>true</let_user_respond>
112
141
  """
113
142
 
114
143
 
@@ -119,7 +148,9 @@ OBSERVATION:
119
148
  [Artifacts loaded]
120
149
  [End of artifacts]
121
150
 
122
- AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
151
+ AGENT: <thinking>The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.</thinking>
152
+ <response>Can you provide an image of workers with helmets?</response>
153
+ <let_user_respond>true</let_user_respond>
123
154
 
124
155
  USER: Yes you can use workers.png
125
156
 
@@ -128,7 +159,11 @@ OBSERVATION:
128
159
  Artifact workers.png loaded to /path/to/images/workers.png
129
160
  [End of artifacts]
130
161
 
131
- AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
162
+ AGENT: <thinking>I will use the generate_vision_code to count the workers with helmets in the image.</thinking>
163
+ <execute_python>
164
+ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])
165
+ </execute_python>
166
+ <let_user_respond>false</let_user_respond>
132
167
 
133
168
  OBSERVATION:
134
169
  [Artifact code.py]
@@ -152,22 +187,37 @@ OBSERVATION:
152
187
  16| return count
153
188
  [End of artifact]
154
189
 
155
- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
190
+ AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
191
+ <execute_python>
192
+ from code import count_workers_with_helmets
193
+ print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
194
+ </execute_python>
195
+ <let_user_respond>false</let_user_respond>
156
196
 
157
197
  OBSERVATION:
158
198
  ----- stdout -----
159
199
  2
160
200
 
161
- AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
201
+ AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
202
+ <response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
203
+ <let_user_respond>true</let_user_respond>
162
204
 
163
205
  USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
164
206
 
165
- AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
207
+ AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
208
+ <execute_python>
209
+ object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
210
+ </execute_python>
211
+ <let_user_respond>false</let_user_respond>
166
212
 
167
213
  OBSERVATION:
168
214
  [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
169
215
 
170
- AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
216
+ AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.</thinking>
217
+ <execute_python>
218
+ use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
219
+ </execute_python>
220
+ <let_user_respond>false</let_user_respond>
171
221
 
172
222
  OBSERVATION:
173
223
  [Artifact code.py edits]
@@ -184,11 +234,18 @@ OBSERVATION:
184
234
  count = 0
185
235
  [End of edits]
186
236
 
187
- AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
237
+ AGENT: <thinking>I have updated the code to use the fine tuned model, I must now run the code and print the output.</thinking>
238
+ <execute_python>
239
+ from code import count_workers_with_helmets
240
+ print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
241
+ </execute_python>
242
+ <let_user_respond>false</let_user_respond>
188
243
 
189
244
  OBSERVATION:
190
245
  ----- stdout -----
191
246
  3
192
247
 
193
- AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
248
+ AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
249
+ <response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
250
+ <let_user_respond>true</let_user_respond>
194
251
  """
@@ -37,7 +37,6 @@ from .tools import (
37
37
  grounding_dino,
38
38
  grounding_sam,
39
39
  ixc25_image_vqa,
40
- ixc25_temporal_localization,
41
40
  ixc25_video_vqa,
42
41
  load_image,
43
42
  loca_visual_prompt_counting,
@@ -1,4 +1,3 @@
1
- import base64
2
1
  import difflib
3
2
  import json
4
3
  import os
@@ -9,7 +8,6 @@ import tempfile
9
8
  from pathlib import Path
10
9
  from typing import Any, Dict, List, Optional, Union
11
10
 
12
- import numpy as np
13
11
  from IPython.display import display
14
12
  from redbaron import RedBaron # type: ignore
15
13
 
@@ -22,8 +20,7 @@ from vision_agent.tools.tool_utils import get_tool_documentation
22
20
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
23
21
  from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
24
22
  from vision_agent.utils.execute import Execution, MimeType
25
- from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
26
- from vision_agent.utils.video import frames_to_bytes
23
+ from vision_agent.utils.image_utils import convert_to_b64
27
24
 
28
25
  CURRENT_FILE = None
29
26
  CURRENT_LINE = 0
@@ -393,19 +390,6 @@ def generate_vision_plan(
393
390
  redisplay_results(response.test_results)
394
391
  response.test_results = None
395
392
  artifacts[name] = response.model_dump_json()
396
- media_names = extract_json(
397
- AnthropicLMM()( # type: ignore
398
- f"""Extract any media file names from this output in the following JSON format:
399
- {{"media": ["image1.jpg", "image2.jpg"]}}
400
-
401
- {artifacts[name]}"""
402
- )
403
- )
404
- if "media" in media_names and isinstance(media_names, dict):
405
- for media in media_names["media"]:
406
- if isinstance(media, str):
407
- with open(media, "rb") as f:
408
- artifacts[media] = f.read()
409
393
 
410
394
  output_str = f"[Start Plan Context, saved at {name}]"
411
395
  for plan in response.plans.keys():
@@ -466,6 +450,12 @@ def generate_vision_code(
466
450
  test_multi_plan=test_multi_plan,
467
451
  custom_tool_names=custom_tool_names,
468
452
  )
453
+
454
+ # capture and save any files that were saved in the code to the artifacts
455
+ extract_and_save_files_to_artifacts(
456
+ artifacts, response["code"] + "\n" + response["test"]
457
+ )
458
+
469
459
  redisplay_results(response["test_result"])
470
460
  code = response["code"]
471
461
  artifacts[name] = code
@@ -546,6 +536,11 @@ def edit_vision_code(
546
536
  test_multi_plan=False,
547
537
  custom_tool_names=custom_tool_names,
548
538
  )
539
+ # capture and save any files that were saved in the code to the artifacts
540
+ extract_and_save_files_to_artifacts(
541
+ artifacts, response["code"] + "\n" + response["test"]
542
+ )
543
+
549
544
  redisplay_results(response["test_result"])
550
545
  code = response["code"]
551
546
  artifacts[name] = code
@@ -567,49 +562,6 @@ def edit_vision_code(
567
562
  return view_lines(code_lines, 0, total_lines, name, total_lines)
568
563
 
569
564
 
570
- def write_media_artifact(
571
- artifacts: Artifacts,
572
- name: str,
573
- media: Union[str, np.ndarray, List[np.ndarray]],
574
- fps: Optional[float] = None,
575
- ) -> str:
576
- """Writes a media file to the artifacts object.
577
-
578
- Parameters:
579
- artifacts (Artifacts): The artifacts object to save the media to.
580
- name (str): The name of the media artifact to save.
581
- media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
582
- be a file path, single image or list of frames for a video.
583
- fps (Optional[float]): The frames per second if you are writing a video.
584
- """
585
- if isinstance(media, str):
586
- with open(media, "rb") as f:
587
- media_bytes = f.read()
588
- elif isinstance(media, list):
589
- media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
590
- elif isinstance(media, np.ndarray):
591
- media_bytes = numpy_to_bytes(media)
592
- else:
593
- print(f"[Invalid media type {type(media)}]")
594
- return f"[Invalid media type {type(media)}]"
595
- artifacts[name] = media_bytes
596
- print(f"[Media {name} saved]")
597
- display(
598
- {
599
- MimeType.APPLICATION_ARTIFACT: json.dumps(
600
- {
601
- "name": name,
602
- "action": "create",
603
- "content": base64.b64encode(media_bytes).decode("utf-8"),
604
- "contentType": "media_output",
605
- }
606
- )
607
- },
608
- raw=True,
609
- )
610
- return f"[Media {name} saved]"
611
-
612
-
613
565
  def list_artifacts(artifacts: Artifacts) -> str:
614
566
  """Lists all the artifacts that have been loaded into the artifacts object."""
615
567
  output_str = artifacts.show()
@@ -813,6 +765,61 @@ def use_object_detection_fine_tuning(
813
765
  return diff
814
766
 
815
767
 
768
def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None:
    """Extracts and saves files used in the code to the artifacts object.

    An ``AnthropicLMM`` call is asked to list the file paths the code snippet
    saves. Every listed path that exists as a regular file is read and stored
    in ``artifacts`` under its base name. Files whose suffix is a known text
    extension are read as text, everything else as bytes. If the model output
    cannot be decoded as JSON, nothing is saved.

    Parameters:
        artifacts (Artifacts): The artifacts object to save the files to.
        code (str): The code to extract the files from.
    """
    try:
        response = extract_json(
            AnthropicLMM()(  # type: ignore
                f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. Below is the code snippet:

```python
{code}
```

Return the file paths in the following JSON format:
{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
            )
        )
    except json.JSONDecodeError:
        return

    # Every entry needs its leading dot to match Path.suffix, and every entry
    # needs its own comma: adjacent string literals are silently concatenated.
    text_file_ext = {
        ".txt",
        ".md",
        ".rtf",
        ".html",
        ".htm",
        ".xml",
        ".json",
        ".csv",
        ".tsv",
        ".yaml",
        ".yml",
        ".toml",
        ".conf",
        ".env",
        ".ini",
        ".log",
        ".py",
        ".java",
        ".js",
        ".cpp",
        ".c",
        ".sql",
        ".sh",
    }

    if not isinstance(response, dict):
        return
    file_paths = response.get("file_paths")
    if not isinstance(file_paths, list):
        return

    for file_path in file_paths:
        # The list comes from model output, so skip anything that is not a path string.
        if not isinstance(file_path, str):
            continue
        path = Path(file_path)
        if not path.is_file():
            continue

        if path.suffix.lower() in text_file_ext:
            try:
                with open(path, "r") as f:
                    content: Union[str, bytes] = f.read()
            except UnicodeDecodeError:
                # A text-looking suffix does not guarantee decodable text;
                # fall back to raw bytes instead of failing the whole step.
                with open(path, "rb") as f:
                    content = f.read()
        else:
            with open(path, "rb") as f:
                content = f.read()

        artifacts[path.name] = content
821
+
822
+
816
823
  META_TOOL_DOCSTRING = get_tool_documentation(
817
824
  [
818
825
  get_tool_descriptions,
@@ -822,7 +829,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
822
829
  generate_vision_plan,
823
830
  generate_vision_code,
824
831
  edit_vision_code,
825
- write_media_artifact,
826
832
  view_media_artifact,
827
833
  object_detection_fine_tuning,
828
834
  use_object_detection_fine_tuning,
@@ -181,6 +181,8 @@ def owl_v2_image(
181
181
  """
182
182
 
183
183
  image_size = image.shape[:2]
184
+ if image_size[0] < 1 or image_size[1] < 1:
185
+ return []
184
186
 
185
187
  if fine_tune_id is not None:
186
188
  image_b64 = convert_to_b64(image)
@@ -413,6 +415,9 @@ def florence2_sam2_image(
413
415
  },
414
416
  ]
415
417
  """
418
+ if image.shape[0] < 1 or image.shape[1] < 1:
419
+ return []
420
+
416
421
  if fine_tune_id is not None:
417
422
  image_b64 = convert_to_b64(image)
418
423
  landing_api = LandingPublicAPI()
@@ -701,6 +706,8 @@ def countgd_counting(
701
706
  ]
702
707
  """
703
708
  image_size = image.shape[:2]
709
+ if image_size[0] < 1 or image_size[1] < 1:
710
+ return []
704
711
  buffer_bytes = numpy_to_bytes(image)
705
712
  files = [("image", buffer_bytes)]
706
713
  prompt = prompt.replace(", ", " .")
@@ -759,6 +766,8 @@ def countgd_example_based_counting(
759
766
  ]
760
767
  """
761
768
  image_size = image.shape[:2]
769
+ if image_size[0] < 1 or image_size[1] < 1:
770
+ return []
762
771
  buffer_bytes = numpy_to_bytes(image)
763
772
  files = [("image", buffer_bytes)]
764
773
  visual_prompts = [
@@ -828,6 +837,8 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
828
837
  >>> ixc25_image_vqa('What is the cat doing?', image)
829
838
  'drinking milk'
830
839
  """
840
+ if image.shape[0] < 1 or image.shape[1] < 1:
841
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
831
842
 
832
843
  buffer_bytes = numpy_to_bytes(image)
833
844
  files = [("image", buffer_bytes)]
@@ -871,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
871
882
  return cast(str, data["answer"])
872
883
 
873
884
 
874
- def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
875
- """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
876
- given a prompt that can be other an object or a phrase. It returns a list of
877
- boolean values indicating whether the object or phrase is present in the
878
- corresponding frame.
879
-
880
- Parameters:
881
- prompt (str): The question about the video
882
- frames (List[np.ndarray]): The reference frames used for the question
883
-
884
- Returns:
885
- List[bool]: A list of boolean values indicating whether the object or phrase is
886
- present in the corresponding frame.
887
-
888
- Example
889
- -------
890
- >>> output = ixc25_temporal_localization('soccer goal', frames)
891
- >>> print(output)
892
- [False, False, False, True, True, True, False, False, False, False]
893
- >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
894
- """
895
-
896
- buffer_bytes = frames_to_bytes(frames)
897
- files = [("video", buffer_bytes)]
898
- payload = {
899
- "prompt": prompt,
900
- "chunk_length": 2,
901
- "function_name": "ixc25_temporal_localization",
902
- }
903
- data: List[int] = send_inference_request(
904
- payload,
905
- "video-temporal-localization?model=internlm-xcomposer",
906
- files=files,
907
- v2=True,
908
- )
909
- chunk_size = round(len(frames) / len(data))
910
- data_explode = [[elt] * chunk_size for elt in data]
911
- data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
912
- return data_bool[: len(frames)]
913
-
914
-
915
885
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
916
886
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
917
887
  including regular images or images of documents or presentations. It returns text
@@ -1024,6 +994,9 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
1024
994
  {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
1025
995
  """
1026
996
 
997
+ if image.shape[0] < 1 or image.shape[1] < 1:
998
+ return {"labels": [], "scores": []}
999
+
1027
1000
  image_b64 = convert_to_b64(image)
1028
1001
  data = {
1029
1002
  "prompt": ",".join(classes),
@@ -1052,6 +1025,8 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1052
1025
  >>> vit_image_classification(image)
1053
1026
  {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
1054
1027
  """
1028
+ if image.shape[0] < 1 or image.shape[1] < 1:
1029
+ return {"labels": [], "scores": []}
1055
1030
 
1056
1031
  image_b64 = convert_to_b64(image)
1057
1032
  data = {
@@ -1080,6 +1055,8 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
1080
1055
  >>> vit_nsfw_classification(image)
1081
1056
  {"label": "normal", "scores": 0.68},
1082
1057
  """
1058
+ if image.shape[0] < 1 or image.shape[1] < 1:
1059
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
1083
1060
 
1084
1061
  image_b64 = convert_to_b64(image)
1085
1062
  data = {
@@ -1180,6 +1157,8 @@ def florence2_phrase_grounding(
1180
1157
  ]
1181
1158
  """
1182
1159
  image_size = image.shape[:2]
1160
+ if image_size[0] < 1 or image_size[1] < 1:
1161
+ return []
1183
1162
  image_b64 = convert_to_b64(image)
1184
1163
 
1185
1164
  if fine_tune_id is not None:
@@ -1399,6 +1378,8 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
1399
1378
  },
1400
1379
  ]
1401
1380
  """
1381
+ if image.shape[0] < 1 or image.shape[1] < 1:
1382
+ return []
1402
1383
  image_b64 = convert_to_b64(image)
1403
1384
  data = {
1404
1385
  "image": image_b64,
@@ -1442,6 +1423,9 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
1442
1423
  [10, 11, 15, ..., 202, 202, 205],
1443
1424
  [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1444
1425
  """
1426
+ if image.shape[0] < 1 or image.shape[1] < 1:
1427
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
1428
+
1445
1429
  image_b64 = convert_to_b64(image)
1446
1430
  data = {
1447
1431
  "image": image_b64,
@@ -11,6 +11,9 @@ import numpy as np
11
11
  _LOGGER = logging.getLogger(__name__)
12
12
  # The maximum length of the clip to extract frames from, in seconds
13
13
 
14
+ _DEFAULT_VIDEO_FPS = 24
15
+ _DEFAULT_INPUT_FPS = 1.0
16
+
14
17
 
15
18
  def play_video(video_base64: str) -> None:
16
19
  """Play a video file"""
@@ -51,7 +54,9 @@ def _resize_frame(frame: np.ndarray) -> np.ndarray:
51
54
 
52
55
 
53
56
  def video_writer(
54
- frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
57
+ frames: List[np.ndarray],
58
+ fps: float = _DEFAULT_INPUT_FPS,
59
+ filename: Optional[str] = None,
55
60
  ) -> str:
56
61
  if filename is None:
57
62
  filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
@@ -78,7 +83,7 @@ def video_writer(
78
83
 
79
84
 
80
85
  def frames_to_bytes(
81
- frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4"
86
+ frames: List[np.ndarray], fps: float = _DEFAULT_INPUT_FPS, file_ext: str = ".mp4"
82
87
  ) -> bytes:
83
88
  r"""Convert a list of frames to a video file encoded into a byte string.
84
89
 
@@ -101,7 +106,7 @@ def frames_to_bytes(
101
106
  # same file name and the time savings are very large.
102
107
  @lru_cache(maxsize=8)
103
108
  def extract_frames_from_video(
104
- video_uri: str, fps: float = 1.0
109
+ video_uri: str, fps: float = _DEFAULT_INPUT_FPS
105
110
  ) -> List[Tuple[np.ndarray, float]]:
106
111
  """Extract frames from a video along with the timestamp in seconds.
107
112
 
@@ -118,6 +123,16 @@ def extract_frames_from_video(
118
123
 
119
124
  cap = cv2.VideoCapture(video_uri)
120
125
  orig_fps = cap.get(cv2.CAP_PROP_FPS)
126
+ if not orig_fps or orig_fps <= 0:
127
+ _LOGGER.warning(
128
+ f"Input video, {video_uri}, has no fps, using the default value {_DEFAULT_VIDEO_FPS}"
129
+ )
130
+ orig_fps = _DEFAULT_VIDEO_FPS
131
+ if not fps or fps <= 0:
132
+ _LOGGER.warning(
133
+ f"Input fps, {fps}, is illegal, using the default value: {_DEFAULT_INPUT_FPS}"
134
+ )
135
+ fps = _DEFAULT_INPUT_FPS
121
136
  orig_frame_time = 1 / orig_fps
122
137
  targ_frame_time = 1 / fps
123
138
  frames: List[Tuple[np.ndarray, float]] = []
@@ -129,10 +144,15 @@ def extract_frames_from_video(
129
144
  break
130
145
 
131
146
  elapsed_time += orig_frame_time
147
+ # This is to prevent floating-point precision loss, which can cause
148
+ # the elapsed time to be slightly less than the target frame time, which
149
+ # causes the last frame to be skipped
150
+ elapsed_time = round(elapsed_time, 8)
132
151
  if elapsed_time >= targ_frame_time:
133
152
  frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps))
134
153
  elapsed_time -= targ_frame_time
135
154
 
136
155
  i += 1
137
156
  cap.release()
157
+ _LOGGER.info(f"Extracted {len(frames)} frames from {video_uri}")
138
158
  return frames
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.164
3
+ Version: 0.2.166
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -41,7 +41,7 @@ Project-URL: repository, https://github.com/landing-ai/vision-agent
41
41
  Description-Content-Type: text/markdown
42
42
 
43
43
  <div align="center">
44
- <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
44
+ <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">
45
45
 
46
46
  # 🔍🤖 Vision Agent
47
47
  [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
@@ -387,6 +387,11 @@ result = agent.generate_code(conv)
387
387
 
388
388
 
389
389
  ## Additional Backends
390
+ ### E2B Code Execution
391
+ If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
392
+ set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
393
+ run all the agent-generated code on the E2B backend.
394
+
390
395
  ### Anthropic
391
396
  `AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
392
397
  Anthropic API key and set it in your environment variables:
@@ -2,12 +2,12 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
2
  vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
3
3
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
4
4
  vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
5
- vision_agent/agent/vision_agent.py,sha256=MUigVufYML2sYn9Hsngswa77XxlZBgCwQyBfK8tlsio,22551
5
+ vision_agent/agent/vision_agent.py,sha256=cbY_V3f85_g8JmASa3m2LBX4G6xgsOKX1n7YtCf-C98,23676
6
6
  vision_agent/agent/vision_agent_coder.py,sha256=aVkl0b9LKvy-auuHGYSag-ixYnue0iRQqD1PYLPBR-s,29312
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
8
8
  vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
9
9
  vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
10
- vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
10
+ vision_agent/agent/vision_agent_prompts.py,sha256=_xAITNDKcS45tqhEax5i6vDQa4V39f9n55iRGk2R6RM,11218
11
11
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
13
13
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
16
16
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
- vision_agent/tools/__init__.py,sha256=50wwisjudmZn7_SEwigTiiDxQ0HXbSIhVI4O8kvE9Es,2365
20
- vision_agent/tools/meta_tools.py,sha256=MULJrZiTODOAN20TGceLdXcwoSGMNaE7bQbywySITnA,28458
19
+ vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
20
+ vision_agent/tools/meta_tools.py,sha256=ZF-7z3KT-Su08MvF5OhSm3Taqeu1Ek-EZjFhpN5w1uU,28257
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
- vision_agent/tools/tools.py,sha256=hjv1mZdq8AHgsX_0zmya0i9yiEK6My5FO6mWbGjZvV4,78521
23
+ vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4
28
28
  vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
- vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
32
- vision_agent-0.2.164.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.164.dist-info/METADATA,sha256=v8XdfcxjcFNC1sgOI5BBGklOoEtb6QxpHremU02onX0,17785
34
- vision_agent-0.2.164.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.164.dist-info/RECORD,,
31
+ vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
+ vision_agent-0.2.166.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.166.dist-info/METADATA,sha256=e15d4yNaAJvLCViaBUFo_RNHII88W-y9WgJauEFfbyU,18034
34
+ vision_agent-0.2.166.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.166.dist-info/RECORD,,