vision-agent 0.2.164__py3-none-any.whl → 0.2.166__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,5 @@
1
1
  import copy
2
+ import json
2
3
  import logging
3
4
  import os
4
5
  import tempfile
@@ -6,7 +7,7 @@ from pathlib import Path
6
7
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
7
8
 
8
9
  from vision_agent.agent import Agent
9
- from vision_agent.agent.agent_utils import extract_json
10
+ from vision_agent.agent.agent_utils import extract_json, extract_tag
10
11
  from vision_agent.agent.vision_agent_prompts import (
11
12
  EXAMPLES_CODE1,
12
13
  EXAMPLES_CODE2,
@@ -18,6 +19,7 @@ from vision_agent.tools.meta_tools import (
18
19
  META_TOOL_DOCSTRING,
19
20
  Artifacts,
20
21
  check_and_load_image,
22
+ extract_and_save_files_to_artifacts,
21
23
  use_extra_vision_agent_args,
22
24
  )
23
25
  from vision_agent.utils import CodeInterpreterFactory
@@ -35,7 +37,7 @@ class BoilerplateCode:
35
37
  pre_code = [
36
38
  "from typing import *",
37
39
  "from vision_agent.utils.execute import CodeInterpreter",
38
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
40
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
39
41
  "artifacts = Artifacts('{remote_path}')",
40
42
  "artifacts.load('{remote_path}')",
41
43
  ]
@@ -57,6 +59,32 @@ class BoilerplateCode:
57
59
  )
58
60
 
59
61
 
62
def format_agent_message(agent_message: str) -> str:
    """Convert a JSON-encoded agent message into its tagged text form.

    The message is parsed with ``extract_json``. Each of the keys
    ``thinking``, ``response``, ``execute_python`` and ``let_user_respond``
    that is present with a truthy value is emitted, in that order, wrapped in
    a tag of the same name. Missing or falsy keys are skipped entirely.

    Parameters:
        agent_message (str): The agent message containing the JSON payload.

    Returns:
        str: The concatenated tagged sections, or an empty string when none
            of the keys holds a truthy value.
    """
    fields = extract_json(agent_message)
    sections = []

    if fields.get("thinking"):
        sections.append("<thinking>" + fields["thinking"] + "</thinking>")

    if fields.get("response"):
        sections.append("<response>" + fields["response"] + "</response>")

    if fields.get("execute_python"):
        # The code section sits on its own lines, surrounded by newlines.
        sections.append(
            "\n<execute_python>\n" + fields["execute_python"] + "\n</execute_python>\n"
        )

    if fields.get("let_user_respond"):
        # The flag may be a bool, so it is stringified before concatenation.
        sections.append(
            "<let_user_respond>"
            + str(fields["let_user_respond"])
            + "</let_user_respond>"
        )

    return "".join(sections)
86
+
87
+
60
88
  def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
61
89
  chat = copy.deepcopy(chat)
62
90
 
@@ -67,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
67
95
  elif chat_i["role"] == "observation":
68
96
  conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
69
97
  elif chat_i["role"] == "assistant":
70
- conversation += f"AGENT: {chat_i['content']}\n\n"
98
+ conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" # type: ignore
71
99
  else:
72
100
  raise ValueError(f"role {chat_i['role']} is not supported")
73
101
 
@@ -84,15 +112,31 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
84
112
  and len(chat[-1]["media"]) > 0 # type: ignore
85
113
  ):
86
114
  message["media"] = chat[-1]["media"]
87
- return extract_json(orch([message], stream=False)) # type: ignore
115
+ conv_resp = cast(str, orch([message], stream=False))
116
+
117
+ let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
118
+ let_user_respond = (
119
+ "true" in let_user_respond_str.lower() if let_user_respond_str else False
120
+ )
121
+
122
+ return {
123
+ "thinking": extract_tag(conv_resp, "thinking"),
124
+ "response": extract_tag(conv_resp, "response"),
125
+ "execute_python": extract_tag(conv_resp, "execute_python"),
126
+ "let_user_respond": let_user_respond,
127
+ }
88
128
 
89
129
 
90
130
  def execute_code_action(
91
- code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
131
+ artifacts: Artifacts,
132
+ code: str,
133
+ code_interpreter: CodeInterpreter,
134
+ artifact_remote_path: str,
92
135
  ) -> Tuple[Execution, str]:
93
136
  result = code_interpreter.exec_isolation(
94
137
  BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
95
138
  )
139
+ extract_and_save_files_to_artifacts(artifacts, code)
96
140
 
97
141
  obs = str(result.logs)
98
142
  if result.error:
@@ -100,33 +144,8 @@ def execute_code_action(
100
144
  return result, obs
101
145
 
102
146
 
103
- def parse_execution(
104
- response: str,
105
- test_multi_plan: bool = True,
106
- custom_tool_names: Optional[List[str]] = None,
107
- ) -> Optional[str]:
108
- code = None
109
- remaining = response
110
- all_code = []
111
- while "<execute_python>" in remaining:
112
- code_i = remaining[
113
- remaining.find("<execute_python>") + len("<execute_python>") :
114
- ]
115
- code_i = code_i[: code_i.find("</execute_python>")]
116
- remaining = remaining[
117
- remaining.find("</execute_python>") + len("</execute_python>") :
118
- ]
119
- all_code.append(code_i)
120
-
121
- if len(all_code) > 0:
122
- code = "\n".join(all_code)
123
-
124
- if code is not None:
125
- code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
126
- return code
127
-
128
-
129
147
  def execute_user_code_action(
148
+ artifacts: Artifacts,
130
149
  last_user_message: Message,
131
150
  code_interpreter: CodeInterpreter,
132
151
  artifact_remote_path: str,
@@ -138,40 +157,43 @@ def execute_user_code_action(
138
157
  return user_result, user_obs
139
158
 
140
159
  last_user_content = cast(str, last_user_message.get("content", ""))
160
+ try:
161
+ user_code_action = json.loads(last_user_content).get("execute_python", None)
162
+ except json.JSONDecodeError:
163
+ return user_result, user_obs
141
164
 
142
- user_code_action = parse_execution(last_user_content, False)
143
165
  if user_code_action is not None:
166
+ user_code_action = use_extra_vision_agent_args(user_code_action, False)
144
167
  user_result, user_obs = execute_code_action(
145
- user_code_action, code_interpreter, artifact_remote_path
168
+ artifacts, user_code_action, code_interpreter, artifact_remote_path
146
169
  )
147
170
  if user_result.error:
148
171
  user_obs += f"\n{user_result.error}"
149
172
  return user_result, user_obs
150
173
 
151
174
 
152
def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``response`` whose ``response`` text describes the code step.

    When ``execute_python`` holds a non-empty value, the ``response`` entry of
    the copy is replaced by a short human-readable description of what the
    code does. Only the meta tools listed below get a specific description;
    any other code is described as "Executing code.". When there is no code to
    execute the copy is returned unchanged.

    Parameters:
        response (Dict[str, Any]): The parsed agent response. It is never
            mutated; a deep copy is modified and returned instead.

    Returns:
        Dict[str, Any]: The copied response, with ``response`` overwritten by
            the step description when ``execute_python`` is non-empty.
    """
    response = copy.deepcopy(response)

    code = response.get("execute_python")
    if not code:
        return response

    # only include descriptions for these, the rest will just have executing
    # code
    description_map = {
        "open_code_artifact": "Reading file.",
        "create_code_artifact": "Creating file.",
        "edit_code_artifact": "Editing file.",
        "generate_vision_code": "Generating vision code.",
        "edit_vision_code": "Editing vision code.",
    }
    descriptions = [text for name, text in description_map.items() if name in code]

    # Join with single spaces so the description carries no trailing space.
    response["response"] = (
        " ".join(descriptions) if descriptions else "Executing code."
    )
    return response
176
198
 
177
199
 
@@ -369,7 +391,10 @@ class VisionAgent(Agent):
369
391
  self.streaming_message({"role": "observation", "content": artifacts_loaded})
370
392
 
371
393
  user_result, user_obs = execute_user_code_action(
372
- last_user_message, code_interpreter, str(remote_artifacts_path)
394
+ artifacts,
395
+ last_user_message,
396
+ code_interpreter,
397
+ str(remote_artifacts_path),
373
398
  )
374
399
  finished = user_result is not None and user_obs is not None
375
400
  if user_result is not None and user_obs is not None:
@@ -394,13 +419,13 @@ class VisionAgent(Agent):
394
419
  int_chat.append(
395
420
  {
396
421
  "role": "assistant",
397
- "content": str(add_step_descriptions(response)),
422
+ "content": json.dumps(add_step_descriptions(response)),
398
423
  }
399
424
  )
400
425
  orig_chat.append(
401
426
  {
402
427
  "role": "assistant",
403
- "content": str(add_step_descriptions(response)),
428
+ "content": json.dumps(add_step_descriptions(response)),
404
429
  }
405
430
  )
406
431
 
@@ -408,11 +433,13 @@ class VisionAgent(Agent):
408
433
  if last_response == response:
409
434
  response["let_user_respond"] = True
410
435
 
411
- finished = response["let_user_respond"]
436
+ finished = response.get("let_user_respond", False)
412
437
 
413
- code_action = parse_execution(
414
- response["response"], test_multi_plan, custom_tool_names
415
- )
438
+ code_action = response.get("execute_python", None)
439
+ if code_action is not None:
440
+ code_action = use_extra_vision_agent_args(
441
+ code_action, test_multi_plan, custom_tool_names
442
+ )
416
443
 
417
444
  if last_response == response:
418
445
  self.streaming_message(
@@ -431,14 +458,17 @@ class VisionAgent(Agent):
431
458
  self.streaming_message(
432
459
  {
433
460
  "role": "assistant",
434
- "content": response,
461
+ "content": json.dumps(response),
435
462
  "finished": finished and code_action is None,
436
463
  }
437
464
  )
438
465
 
439
466
  if code_action is not None:
440
467
  result, obs = execute_code_action(
441
- code_action, code_interpreter, str(remote_artifacts_path)
468
+ artifacts,
469
+ code_action,
470
+ code_interpreter,
471
+ str(remote_artifacts_path),
442
472
  )
443
473
 
444
474
  media_obs = check_and_load_image(code_action)
@@ -27,13 +27,14 @@ Here is the current conversation so far:
27
27
  **Instructions**:
28
28
  1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
29
29
  2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
30
- 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
30
+ 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
31
31
  4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
32
- 5. **Output in JSON**: Respond in the following format in JSON:
32
+ 5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
33
33
 
34
- ```json
35
- {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
36
- ```
34
+ <thinking>Your thoughts here...</thinking>
35
+ <response>Your response to the user here...</response>
36
+ <execute_python>Your code here...</execute_python>
37
+ <let_user_respond>true/false for whether or not you want to let the user respond.</let_user_respond>
37
38
  """
38
39
 
39
40
 
@@ -45,7 +46,11 @@ OBSERVATION:
45
46
  Artifact dog.jpg loaded to /path/to/images/dog.jpg
46
47
  [End of artifacts]
47
48
 
48
- AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
49
+ AGENT: <thinking>I will use the generate_vision_code to detect the dogs in the image.</thinking>
50
+ <execute_python>
51
+ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
52
+ </execute_python>
53
+ <let_user_respond>false</let_user_respond>
49
54
 
50
55
  OBSERVATION:
51
56
  [Artifact dog_detector.py]
@@ -56,14 +61,21 @@ OBSERVATION:
56
61
  4| return dogs
57
62
  [End of artifact]
58
63
 
59
- AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
64
+ AGENT: <thinking>I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.</thinking>
65
+ <execute_python>
66
+ from dog_detector import detect_dogs
67
+ print(detect_dogs('/example/workspace/dog.jpg'))
68
+ </execute_python>
69
+ <let_user_respond>false</let_user_respond>
60
70
 
61
71
  OBSERVATION:
62
72
  ----- stdout -----
63
73
  [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
64
74
 
65
75
 
66
- AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
76
+ AGENT: <thinking>Two dogs are detected, I will show this to the user and ask them if the result looks good.</thinking>
77
+ <response>The code detected two dogs, do the results look good to you?</response>
78
+ <let_user_respond>true</let_user_respond>
67
79
  """
68
80
 
69
81
  EXAMPLES_CODE1_EXTRA = """
@@ -75,7 +87,11 @@ Artifact dog.jpg loaded to /path/to/images/dog.jpg
75
87
  Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
76
88
  [End of artifacts]
77
89
 
78
- AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
90
+ AGENT: <thinking>I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that</thinking>
91
+ <execute_python>
92
+ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
93
+ </execute_python>
94
+ <let_user_respond>false</let_user_respond>
79
95
 
80
96
  OBSERVATION:
81
97
  [Artifact dog_detector.py]
@@ -86,13 +102,20 @@ OBSERVATION:
86
102
  4| return dogs
87
103
  [End of artifact]
88
104
 
89
- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
105
+ AGENT: <thinking>I have edited the code to detect only one dog, I must now run the code and print the results to get the output.</thinking>
106
+ <execute_python>
107
+ from dog_detector import detect_dogs
108
+ print(detect_dogs('/path/to/images/dog.jpg'))
109
+ </execute_python>
110
+ <let_user_respond>false</let_user_respond>
90
111
 
91
112
  OBSERVATION:
92
113
  ----- stdout -----
93
114
  [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
94
115
 
95
- AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
116
+ AGENT: <thinking>One dog is detected, I will show this to the user and ask them if the result looks good.</thinking>
117
+ <response>The code detected one dog, do these results look good to you?</response>
118
+ <let_user_respond>true</let_user_respond>
96
119
  """
97
120
 
98
121
  EXAMPLES_CODE2 = """
@@ -103,12 +126,18 @@ OBSERVATION:
103
126
  Artifact image.jpg loaded to /path/to/images/image.jpg
104
127
  [End of artifacts]
105
128
 
106
- AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
129
+ AGENT: <thinking>The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.</thinking>
130
+ <execute_python>
131
+ view_media_artifacts('image.jpg')
132
+ </execute_python>
133
+ <let_user_respond>false</let_user_respond>
107
134
 
108
135
  OBSERVATION:
109
136
  [Image image.jpg displayed]
110
137
 
111
- AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
138
+ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.</thinking>
139
+ <response>The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?</response>
140
+ <let_user_respond>true</let_user_respond>
112
141
  """
113
142
 
114
143
 
@@ -119,7 +148,9 @@ OBSERVATION:
119
148
  [Artifacts loaded]
120
149
  [End of artifacts]
121
150
 
122
- AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
151
+ AGENT: <thinking>The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.</thinking>
152
+ <response>Can you provide an image of workers with helmets?</response>
153
+ <let_user_respond>true</let_user_respond>
123
154
 
124
155
  USER: Yes you can use workers.png
125
156
 
@@ -128,7 +159,11 @@ OBSERVATION:
128
159
  Artifact workers.png loaded to /path/to/images/workers.png
129
160
  [End of artifacts]
130
161
 
131
- AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
162
+ AGENT: <thinking>I will use the generate_vision_code to count the workers with helmets in the image.</thinking>
163
+ <execute_python>
164
+ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])
165
+ </execute_python>
166
+ <let_user_respond>false</let_user_respond>
132
167
 
133
168
  OBSERVATION:
134
169
  [Artifact code.py]
@@ -152,22 +187,37 @@ OBSERVATION:
152
187
  16| return count
153
188
  [End of artifact]
154
189
 
155
- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
190
+ AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
191
+ <execute_python>
192
+ from code import count_workers_with_helmets
193
+ print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
194
+ </execute_python>
195
+ <let_user_respond>false</let_user_respond>
156
196
 
157
197
  OBSERVATION:
158
198
  ----- stdout -----
159
199
  2
160
200
 
161
- AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
201
+ AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
202
+ <response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
203
+ <let_user_respond>true</let_user_respond>
162
204
 
163
205
  USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
164
206
 
165
- AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
207
+ AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
208
+ <execute_python>
209
+ object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
210
+ </execute_python>
211
+ <let_user_respond>false</let_user_respond>
166
212
 
167
213
  OBSERVATION:
168
214
  [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
169
215
 
170
- AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
216
+ AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.</thinking>
217
+ <execute_python>
218
+ use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
219
+ </execute_python>
220
+ <let_user_respond>false</let_user_respond>
171
221
 
172
222
  OBSERVATION:
173
223
  [Artifact code.py edits]
@@ -184,11 +234,18 @@ OBSERVATION:
184
234
  count = 0
185
235
  [End of edits]
186
236
 
187
- AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
237
+ AGENT: <thinking>I have updated the code to use the fine tuned model, I must now run the code and print the output.</thinking>
238
+ <execute_python>
239
+ from code import count_workers_with_helmets
240
+ print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
241
+ </execute_python>
242
+ <let_user_respond>false</let_user_respond>
188
243
 
189
244
  OBSERVATION:
190
245
  ----- stdout -----
191
246
  3
192
247
 
193
- AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
248
+ AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
249
+ <response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
250
+ <let_user_respond>true</let_user_respond>
194
251
  """
@@ -37,7 +37,6 @@ from .tools import (
37
37
  grounding_dino,
38
38
  grounding_sam,
39
39
  ixc25_image_vqa,
40
- ixc25_temporal_localization,
41
40
  ixc25_video_vqa,
42
41
  load_image,
43
42
  loca_visual_prompt_counting,
@@ -1,4 +1,3 @@
1
- import base64
2
1
  import difflib
3
2
  import json
4
3
  import os
@@ -9,7 +8,6 @@ import tempfile
9
8
  from pathlib import Path
10
9
  from typing import Any, Dict, List, Optional, Union
11
10
 
12
- import numpy as np
13
11
  from IPython.display import display
14
12
  from redbaron import RedBaron # type: ignore
15
13
 
@@ -22,8 +20,7 @@ from vision_agent.tools.tool_utils import get_tool_documentation
22
20
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
23
21
  from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
24
22
  from vision_agent.utils.execute import Execution, MimeType
25
- from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
26
- from vision_agent.utils.video import frames_to_bytes
23
+ from vision_agent.utils.image_utils import convert_to_b64
27
24
 
28
25
  CURRENT_FILE = None
29
26
  CURRENT_LINE = 0
@@ -393,19 +390,6 @@ def generate_vision_plan(
393
390
  redisplay_results(response.test_results)
394
391
  response.test_results = None
395
392
  artifacts[name] = response.model_dump_json()
396
- media_names = extract_json(
397
- AnthropicLMM()( # type: ignore
398
- f"""Extract any media file names from this output in the following JSON format:
399
- {{"media": ["image1.jpg", "image2.jpg"]}}
400
-
401
- {artifacts[name]}"""
402
- )
403
- )
404
- if "media" in media_names and isinstance(media_names, dict):
405
- for media in media_names["media"]:
406
- if isinstance(media, str):
407
- with open(media, "rb") as f:
408
- artifacts[media] = f.read()
409
393
 
410
394
  output_str = f"[Start Plan Context, saved at {name}]"
411
395
  for plan in response.plans.keys():
@@ -466,6 +450,12 @@ def generate_vision_code(
466
450
  test_multi_plan=test_multi_plan,
467
451
  custom_tool_names=custom_tool_names,
468
452
  )
453
+
454
+ # capture and save any files that were saved in the code to the artifacts
455
+ extract_and_save_files_to_artifacts(
456
+ artifacts, response["code"] + "\n" + response["test"]
457
+ )
458
+
469
459
  redisplay_results(response["test_result"])
470
460
  code = response["code"]
471
461
  artifacts[name] = code
@@ -546,6 +536,11 @@ def edit_vision_code(
546
536
  test_multi_plan=False,
547
537
  custom_tool_names=custom_tool_names,
548
538
  )
539
+ # capture and save any files that were saved in the code to the artifacts
540
+ extract_and_save_files_to_artifacts(
541
+ artifacts, response["code"] + "\n" + response["test"]
542
+ )
543
+
549
544
  redisplay_results(response["test_result"])
550
545
  code = response["code"]
551
546
  artifacts[name] = code
@@ -567,49 +562,6 @@ def edit_vision_code(
567
562
  return view_lines(code_lines, 0, total_lines, name, total_lines)
568
563
 
569
564
 
570
- def write_media_artifact(
571
- artifacts: Artifacts,
572
- name: str,
573
- media: Union[str, np.ndarray, List[np.ndarray]],
574
- fps: Optional[float] = None,
575
- ) -> str:
576
- """Writes a media file to the artifacts object.
577
-
578
- Parameters:
579
- artifacts (Artifacts): The artifacts object to save the media to.
580
- name (str): The name of the media artifact to save.
581
- media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
582
- be a file path, single image or list of frames for a video.
583
- fps (Optional[float]): The frames per second if you are writing a video.
584
- """
585
- if isinstance(media, str):
586
- with open(media, "rb") as f:
587
- media_bytes = f.read()
588
- elif isinstance(media, list):
589
- media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
590
- elif isinstance(media, np.ndarray):
591
- media_bytes = numpy_to_bytes(media)
592
- else:
593
- print(f"[Invalid media type {type(media)}]")
594
- return f"[Invalid media type {type(media)}]"
595
- artifacts[name] = media_bytes
596
- print(f"[Media {name} saved]")
597
- display(
598
- {
599
- MimeType.APPLICATION_ARTIFACT: json.dumps(
600
- {
601
- "name": name,
602
- "action": "create",
603
- "content": base64.b64encode(media_bytes).decode("utf-8"),
604
- "contentType": "media_output",
605
- }
606
- )
607
- },
608
- raw=True,
609
- )
610
- return f"[Media {name} saved]"
611
-
612
-
613
565
  def list_artifacts(artifacts: Artifacts) -> str:
614
566
  """Lists all the artifacts that have been loaded into the artifacts object."""
615
567
  output_str = artifacts.show()
@@ -813,6 +765,61 @@ def use_object_detection_fine_tuning(
813
765
  return diff
814
766
 
815
767
 
768
# Suffixes (as returned by ``Path.suffix``, i.e. including the leading dot) of
# files that are read in text mode; everything else is read as raw bytes.
_TEXT_FILE_EXTENSIONS = frozenset(
    {
        ".txt",
        ".md",
        ".rtf",
        ".html",
        ".htm",
        ".xml",
        ".json",
        ".csv",
        ".tsv",
        ".yaml",
        ".yml",
        ".toml",
        ".conf",
        ".env",
        ".ini",
        ".log",
        ".py",
        ".java",
        ".js",
        ".cpp",
        ".c",
        ".sql",
        ".sh",
    }
)


def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None:
    """Extracts and saves files used in the code to the artifacts object.

    An Anthropic LMM is asked to list the file paths the code snippet saves.
    Every returned path that exists as a regular file is read and stored in
    ``artifacts`` under its base name. Files with a known text extension are
    read as text, all others as bytes. If the model's reply cannot be decoded
    as JSON, nothing is saved.

    Parameters:
        artifacts (Artifacts): The artifacts object to save the files to.
        code (str): The code to extract the files from.
    """
    try:
        response = extract_json(
            AnthropicLMM()(  # type: ignore
                f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. Below is the code snippet:

```python
{code}
```

Return the file paths in the following JSON format:
{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
            )
        )
    except json.JSONDecodeError:
        return

    if "file_paths" not in response or not isinstance(response["file_paths"], list):
        return

    for file_path in response["file_paths"]:
        # The list comes from a model; ignore anything that is not a path string
        # instead of letting Path() raise.
        if not isinstance(file_path, str):
            continue

        path = Path(file_path)
        if not path.is_file():
            continue

        read_mode = "r" if path.suffix in _TEXT_FILE_EXTENSIONS else "rb"
        with open(file_path, read_mode) as f:
            artifacts[path.name] = f.read()
821
+
822
+
816
823
  META_TOOL_DOCSTRING = get_tool_documentation(
817
824
  [
818
825
  get_tool_descriptions,
@@ -822,7 +829,6 @@ META_TOOL_DOCSTRING = get_tool_documentation(
822
829
  generate_vision_plan,
823
830
  generate_vision_code,
824
831
  edit_vision_code,
825
- write_media_artifact,
826
832
  view_media_artifact,
827
833
  object_detection_fine_tuning,
828
834
  use_object_detection_fine_tuning,
@@ -181,6 +181,8 @@ def owl_v2_image(
181
181
  """
182
182
 
183
183
  image_size = image.shape[:2]
184
+ if image_size[0] < 1 or image_size[1] < 1:
185
+ return []
184
186
 
185
187
  if fine_tune_id is not None:
186
188
  image_b64 = convert_to_b64(image)
@@ -413,6 +415,9 @@ def florence2_sam2_image(
413
415
  },
414
416
  ]
415
417
  """
418
+ if image.shape[0] < 1 or image.shape[1] < 1:
419
+ return []
420
+
416
421
  if fine_tune_id is not None:
417
422
  image_b64 = convert_to_b64(image)
418
423
  landing_api = LandingPublicAPI()
@@ -701,6 +706,8 @@ def countgd_counting(
701
706
  ]
702
707
  """
703
708
  image_size = image.shape[:2]
709
+ if image_size[0] < 1 or image_size[1] < 1:
710
+ return []
704
711
  buffer_bytes = numpy_to_bytes(image)
705
712
  files = [("image", buffer_bytes)]
706
713
  prompt = prompt.replace(", ", " .")
@@ -759,6 +766,8 @@ def countgd_example_based_counting(
759
766
  ]
760
767
  """
761
768
  image_size = image.shape[:2]
769
+ if image_size[0] < 1 or image_size[1] < 1:
770
+ return []
762
771
  buffer_bytes = numpy_to_bytes(image)
763
772
  files = [("image", buffer_bytes)]
764
773
  visual_prompts = [
@@ -828,6 +837,8 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
828
837
  >>> ixc25_image_vqa('What is the cat doing?', image)
829
838
  'drinking milk'
830
839
  """
840
+ if image.shape[0] < 1 or image.shape[1] < 1:
841
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
831
842
 
832
843
  buffer_bytes = numpy_to_bytes(image)
833
844
  files = [("image", buffer_bytes)]
@@ -871,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
871
882
  return cast(str, data["answer"])
872
883
 
873
884
 
874
def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
    """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
    given a prompt that can be either an object or a phrase. It returns a list of
    boolean values indicating whether the object or phrase is present in the
    corresponding frame.

    Parameters:
        prompt (str): The question about the video
        frames (List[np.ndarray]): The reference frames used for the question

    Returns:
        List[bool]: A list of boolean values indicating whether the object or phrase is
            present in the corresponding frame. The list always has exactly
            `len(frames)` entries.

    Raises:
        ValueError: If the inference service returns no per-chunk results.

    Example
    -------
        >>> output = ixc25_temporal_localization('soccer goal', frames)
        >>> print(output)
        [False, False, False, True, True, True, False, False, False, False]
        >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
    """
    # Nothing to localize; skip the network round trip.
    if not frames:
        return []

    buffer_bytes = frames_to_bytes(frames)
    files = [("video", buffer_bytes)]
    payload = {
        "prompt": prompt,
        "chunk_length": 2,
        "function_name": "ixc25_temporal_localization",
    }
    data: List[int] = send_inference_request(
        payload,
        "video-temporal-localization?model=internlm-xcomposer",
        files=files,
        v2=True,
    )
    if not data:
        # Previously this surfaced as an opaque ZeroDivisionError below.
        raise ValueError("Temporal localization returned no results for the video")

    # Each returned value describes one chunk of the video; repeat it for every
    # frame in that chunk. Clamp to 1 so a rounded-down size never drops all flags.
    chunk_size = max(1, round(len(frames) / len(data)))
    data_bool = [bool(elt) for elt in data for _ in range(chunk_size)]

    # Rounding can leave the expanded list short; the trailing frames belong to the
    # last chunk, so reuse its value to keep one flag per frame.
    missing = len(frames) - len(data_bool)
    if missing > 0:
        data_bool.extend([data_bool[-1]] * missing)
    return data_bool[: len(frames)]
913
-
914
-
915
885
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
916
886
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
917
887
  including regular images or images of documents or presentations. It returns text
@@ -1024,6 +994,9 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
1024
994
  {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
1025
995
  """
1026
996
 
997
+ if image.shape[0] < 1 or image.shape[1] < 1:
998
+ return {"labels": [], "scores": []}
999
+
1027
1000
  image_b64 = convert_to_b64(image)
1028
1001
  data = {
1029
1002
  "prompt": ",".join(classes),
@@ -1052,6 +1025,8 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
1052
1025
  >>> vit_image_classification(image)
1053
1026
  {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
1054
1027
  """
1028
+ if image.shape[0] < 1 or image.shape[1] < 1:
1029
+ return {"labels": [], "scores": []}
1055
1030
 
1056
1031
  image_b64 = convert_to_b64(image)
1057
1032
  data = {
@@ -1080,6 +1055,8 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
1080
1055
  >>> vit_nsfw_classification(image)
1081
1056
  {"label": "normal", "scores": 0.68},
1082
1057
  """
1058
+ if image.shape[0] < 1 or image.shape[1] < 1:
1059
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
1083
1060
 
1084
1061
  image_b64 = convert_to_b64(image)
1085
1062
  data = {
@@ -1180,6 +1157,8 @@ def florence2_phrase_grounding(
1180
1157
  ]
1181
1158
  """
1182
1159
  image_size = image.shape[:2]
1160
+ if image_size[0] < 1 or image_size[1] < 1:
1161
+ return []
1183
1162
  image_b64 = convert_to_b64(image)
1184
1163
 
1185
1164
  if fine_tune_id is not None:
@@ -1399,6 +1378,8 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
1399
1378
  },
1400
1379
  ]
1401
1380
  """
1381
+ if image.shape[0] < 1 or image.shape[1] < 1:
1382
+ return []
1402
1383
  image_b64 = convert_to_b64(image)
1403
1384
  data = {
1404
1385
  "image": image_b64,
@@ -1442,6 +1423,9 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
1442
1423
  [10, 11, 15, ..., 202, 202, 205],
1443
1424
  [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
1444
1425
  """
1426
+ if image.shape[0] < 1 or image.shape[1] < 1:
1427
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
1428
+
1445
1429
  image_b64 = convert_to_b64(image)
1446
1430
  data = {
1447
1431
  "image": image_b64,
@@ -11,6 +11,9 @@ import numpy as np
11
11
  _LOGGER = logging.getLogger(__name__)
12
12
  # The maximum length of the clip to extract frames from, in seconds
13
13
 
14
+ _DEFAULT_VIDEO_FPS = 24
15
+ _DEFAULT_INPUT_FPS = 1.0
16
+
14
17
 
15
18
  def play_video(video_base64: str) -> None:
16
19
  """Play a video file"""
@@ -51,7 +54,9 @@ def _resize_frame(frame: np.ndarray) -> np.ndarray:
51
54
 
52
55
 
53
56
  def video_writer(
54
- frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
57
+ frames: List[np.ndarray],
58
+ fps: float = _DEFAULT_INPUT_FPS,
59
+ filename: Optional[str] = None,
55
60
  ) -> str:
56
61
  if filename is None:
57
62
  filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
@@ -78,7 +83,7 @@ def video_writer(
78
83
 
79
84
 
80
85
  def frames_to_bytes(
81
- frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4"
86
+ frames: List[np.ndarray], fps: float = _DEFAULT_INPUT_FPS, file_ext: str = ".mp4"
82
87
  ) -> bytes:
83
88
  r"""Convert a list of frames to a video file encoded into a byte string.
84
89
 
@@ -101,7 +106,7 @@ def frames_to_bytes(
101
106
  # same file name and the time savings are very large.
102
107
  @lru_cache(maxsize=8)
103
108
  def extract_frames_from_video(
104
- video_uri: str, fps: float = 1.0
109
+ video_uri: str, fps: float = _DEFAULT_INPUT_FPS
105
110
  ) -> List[Tuple[np.ndarray, float]]:
106
111
  """Extract frames from a video along with the timestamp in seconds.
107
112
 
@@ -118,6 +123,16 @@ def extract_frames_from_video(
118
123
 
119
124
  cap = cv2.VideoCapture(video_uri)
120
125
  orig_fps = cap.get(cv2.CAP_PROP_FPS)
126
+ if not orig_fps or orig_fps <= 0:
127
+ _LOGGER.warning(
128
+ f"Input video, {video_uri}, has no fps, using the default value {_DEFAULT_VIDEO_FPS}"
129
+ )
130
+ orig_fps = _DEFAULT_VIDEO_FPS
131
+ if not fps or fps <= 0:
132
+ _LOGGER.warning(
133
+ f"Input fps, {fps}, is illegal, using the default value: {_DEFAULT_INPUT_FPS}"
134
+ )
135
+ fps = _DEFAULT_INPUT_FPS
121
136
  orig_frame_time = 1 / orig_fps
122
137
  targ_frame_time = 1 / fps
123
138
  frames: List[Tuple[np.ndarray, float]] = []
@@ -129,10 +144,15 @@ def extract_frames_from_video(
129
144
  break
130
145
 
131
146
  elapsed_time += orig_frame_time
147
+ # This is to prevent a floating-point precision issue, which can cause
148
+ # the elapsed time to be slightly less than the target frame time, which
149
+ # causes the last frame to be skipped
150
+ elapsed_time = round(elapsed_time, 8)
132
151
  if elapsed_time >= targ_frame_time:
133
152
  frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps))
134
153
  elapsed_time -= targ_frame_time
135
154
 
136
155
  i += 1
137
156
  cap.release()
157
+ _LOGGER.info(f"Extracted {len(frames)} frames from {video_uri}")
138
158
  return frames
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.164
3
+ Version: 0.2.166
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -41,7 +41,7 @@ Project-URL: repository, https://github.com/landing-ai/vision-agent
41
41
  Description-Content-Type: text/markdown
42
42
 
43
43
  <div align="center">
44
- <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
44
+ <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">
45
45
 
46
46
  # 🔍🤖 Vision Agent
47
47
  [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
@@ -387,6 +387,11 @@ result = agent.generate_code(conv)
387
387
 
388
388
 
389
389
  ## Additional Backends
390
+ ### E2B Code Execution
391
+ If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
392
+ set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
393
+ run all the agent-generated code on the E2B backend.
394
+
390
395
  ### Anthropic
391
396
  `AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
392
397
  Anthropic API key and set it in your environment variables:
@@ -2,12 +2,12 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
2
  vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
3
3
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
4
4
  vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
5
- vision_agent/agent/vision_agent.py,sha256=MUigVufYML2sYn9Hsngswa77XxlZBgCwQyBfK8tlsio,22551
5
+ vision_agent/agent/vision_agent.py,sha256=cbY_V3f85_g8JmASa3m2LBX4G6xgsOKX1n7YtCf-C98,23676
6
6
  vision_agent/agent/vision_agent_coder.py,sha256=aVkl0b9LKvy-auuHGYSag-ixYnue0iRQqD1PYLPBR-s,29312
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
8
8
  vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
9
9
  vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
10
- vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
10
+ vision_agent/agent/vision_agent_prompts.py,sha256=_xAITNDKcS45tqhEax5i6vDQa4V39f9n55iRGk2R6RM,11218
11
11
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
13
13
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
16
16
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
- vision_agent/tools/__init__.py,sha256=50wwisjudmZn7_SEwigTiiDxQ0HXbSIhVI4O8kvE9Es,2365
20
- vision_agent/tools/meta_tools.py,sha256=MULJrZiTODOAN20TGceLdXcwoSGMNaE7bQbywySITnA,28458
19
+ vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
20
+ vision_agent/tools/meta_tools.py,sha256=ZF-7z3KT-Su08MvF5OhSm3Taqeu1Ek-EZjFhpN5w1uU,28257
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
- vision_agent/tools/tools.py,sha256=hjv1mZdq8AHgsX_0zmya0i9yiEK6My5FO6mWbGjZvV4,78521
23
+ vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4
28
28
  vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
- vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
32
- vision_agent-0.2.164.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.164.dist-info/METADATA,sha256=v8XdfcxjcFNC1sgOI5BBGklOoEtb6QxpHremU02onX0,17785
34
- vision_agent-0.2.164.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.164.dist-info/RECORD,,
31
+ vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
+ vision_agent-0.2.166.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.166.dist-info/METADATA,sha256=e15d4yNaAJvLCViaBUFo_RNHII88W-y9WgJauEFfbyU,18034
34
+ vision_agent-0.2.166.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.166.dist-info/RECORD,,