vision-agent 0.2.161__py3-none-any.whl → 0.2.163__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +98 -2
- vision_agent/agent/vision_agent.py +54 -22
- vision_agent/agent/vision_agent_coder.py +222 -512
- vision_agent/agent/vision_agent_coder_prompts.py +12 -221
- vision_agent/agent/vision_agent_planner.py +583 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +107 -35
- vision_agent/tools/tools.py +2 -2
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/METADATA +8 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/RECORD +14 -12
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/WHEEL +0 -0
vision_agent/agent/__init__.py
CHANGED
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
|
|
7
7
|
OpenAIVisionAgentCoder,
|
8
8
|
VisionAgentCoder,
|
9
9
|
)
|
10
|
+
from .vision_agent_planner import (
|
11
|
+
AnthropicVisionAgentPlanner,
|
12
|
+
AzureVisionAgentPlanner,
|
13
|
+
OllamaVisionAgentPlanner,
|
14
|
+
OpenAIVisionAgentPlanner,
|
15
|
+
PlanContext,
|
16
|
+
VisionAgentPlanner,
|
17
|
+
)
|
vision_agent/agent/agent_utils.py
CHANGED
@@ -2,10 +2,18 @@ import json
|
|
2
2
|
import logging
|
3
3
|
import re
|
4
4
|
import sys
|
5
|
-
from typing import Any, Dict, Optional
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
from rich.console import Console
|
8
|
+
from rich.style import Style
|
9
|
+
from rich.syntax import Syntax
|
10
|
+
|
11
|
+
import vision_agent.tools as T
|
6
12
|
|
7
13
|
logging.basicConfig(stream=sys.stdout)
|
8
14
|
_LOGGER = logging.getLogger(__name__)
|
15
|
+
_CONSOLE = Console()
|
16
|
+
_MAX_TABULATE_COL_WIDTH = 80
|
9
17
|
|
10
18
|
|
11
19
|
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
|
@@ -41,11 +49,16 @@ def _strip_markdown_code(inp_str: str) -> str:
|
|
41
49
|
|
42
50
|
def extract_json(json_str: str) -> Dict[str, Any]:
|
43
51
|
json_str_mod = json_str.replace("\n", " ").strip()
|
44
|
-
json_str_mod = json_str_mod.replace("'", '"')
|
45
52
|
json_str_mod = json_str_mod.replace(": True", ": true").replace(
|
46
53
|
": False", ": false"
|
47
54
|
)
|
48
55
|
|
56
|
+
# sometimes the json is in single quotes
|
57
|
+
try:
|
58
|
+
return json.loads(json_str_mod.replace("'", '"')) # type: ignore
|
59
|
+
except json.JSONDecodeError:
|
60
|
+
pass
|
61
|
+
|
49
62
|
try:
|
50
63
|
return json.loads(json_str_mod) # type: ignore
|
51
64
|
except json.JSONDecodeError:
|
@@ -79,7 +92,90 @@ def extract_code(code: str) -> str:
|
|
79
92
|
return code
|
80
93
|
|
81
94
|
|
95
|
+
def extract_tag(
|
96
|
+
content: str,
|
97
|
+
tag: str,
|
98
|
+
) -> Optional[str]:
|
99
|
+
inner_content = None
|
100
|
+
remaning = content
|
101
|
+
all_inner_content = []
|
102
|
+
|
103
|
+
while f"<{tag}>" in remaning:
|
104
|
+
inner_content_i = remaning[remaning.find(f"<{tag}>") + len(f"<{tag}>") :]
|
105
|
+
if f"</{tag}>" not in inner_content_i:
|
106
|
+
break
|
107
|
+
inner_content_i = inner_content_i[: inner_content_i.find(f"</{tag}>")]
|
108
|
+
remaning = remaning[remaning.find(f"</{tag}>") + len(f"</{tag}>") :]
|
109
|
+
all_inner_content.append(inner_content_i)
|
110
|
+
|
111
|
+
if len(all_inner_content) > 0:
|
112
|
+
inner_content = "\n".join(all_inner_content)
|
113
|
+
return inner_content
|
114
|
+
|
115
|
+
|
82
116
|
def remove_installs_from_code(code: str) -> str:
|
83
117
|
pattern = r"\n!pip install.*?(\n|\Z)\n"
|
84
118
|
code = re.sub(pattern, "", code, flags=re.DOTALL)
|
85
119
|
return code
|
120
|
+
|
121
|
+
|
122
|
+
def format_memory(memory: List[Dict[str, str]]) -> str:
|
123
|
+
output_str = ""
|
124
|
+
for i, m in enumerate(memory):
|
125
|
+
output_str += f"### Feedback {i}:\n"
|
126
|
+
output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
|
127
|
+
output_str += f"Feedback {i}: {m['feedback']}\n\n"
|
128
|
+
if "edits" in m:
|
129
|
+
output_str += f"Edits {i}:\n{m['edits']}\n"
|
130
|
+
output_str += "\n"
|
131
|
+
|
132
|
+
return output_str
|
133
|
+
|
134
|
+
|
135
|
+
def format_plans(plans: Dict[str, Any]) -> str:
|
136
|
+
plan_str = ""
|
137
|
+
for k, v in plans.items():
|
138
|
+
plan_str += "\n" + f"{k}: {v['thoughts']}\n"
|
139
|
+
plan_str += " -" + "\n -".join([e for e in v["instructions"]])
|
140
|
+
|
141
|
+
return plan_str
|
142
|
+
|
143
|
+
|
144
|
+
class DefaultImports:
|
145
|
+
"""Container for default imports used in the code execution."""
|
146
|
+
|
147
|
+
common_imports = [
|
148
|
+
"import os",
|
149
|
+
"import numpy as np",
|
150
|
+
"from vision_agent.tools import *",
|
151
|
+
"from typing import *",
|
152
|
+
"from pillow_heif import register_heif_opener",
|
153
|
+
"register_heif_opener()",
|
154
|
+
]
|
155
|
+
|
156
|
+
@staticmethod
|
157
|
+
def to_code_string() -> str:
|
158
|
+
return "\n".join(DefaultImports.common_imports + T.__new_tools__)
|
159
|
+
|
160
|
+
@staticmethod
|
161
|
+
def prepend_imports(code: str) -> str:
|
162
|
+
"""Run this method to prepend the default imports to the code.
|
163
|
+
NOTE: be sure to run this method after the custom tools have been registered.
|
164
|
+
"""
|
165
|
+
return DefaultImports.to_code_string() + "\n\n" + code
|
166
|
+
|
167
|
+
|
168
|
+
def print_code(title: str, code: str, test: Optional[str] = None) -> None:
|
169
|
+
_CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
|
170
|
+
_CONSOLE.print("=" * 30 + " Code " + "=" * 30)
|
171
|
+
_CONSOLE.print(
|
172
|
+
Syntax(
|
173
|
+
DefaultImports.prepend_imports(code),
|
174
|
+
"python",
|
175
|
+
theme="gruvbox-dark",
|
176
|
+
line_numbers=True,
|
177
|
+
)
|
178
|
+
)
|
179
|
+
if test:
|
180
|
+
_CONSOLE.print("=" * 30 + " Test " + "=" * 30)
|
181
|
+
_CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
|
vision_agent/agent/vision_agent.py
CHANGED
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
14
14
|
VA_CODE,
|
15
15
|
)
|
16
16
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
17
|
-
from vision_agent.tools import META_TOOL_DOCSTRING
|
18
17
|
from vision_agent.tools.meta_tools import (
|
18
|
+
META_TOOL_DOCSTRING,
|
19
19
|
Artifacts,
|
20
20
|
check_and_load_image,
|
21
21
|
use_extra_vision_agent_args,
|
@@ -103,7 +103,7 @@ def execute_code_action(
|
|
103
103
|
def parse_execution(
|
104
104
|
response: str,
|
105
105
|
test_multi_plan: bool = True,
|
106
|
-
|
106
|
+
custom_tool_names: Optional[List[str]] = None,
|
107
107
|
) -> Optional[str]:
|
108
108
|
code = None
|
109
109
|
remaining = response
|
@@ -122,7 +122,7 @@ def parse_execution(
|
|
122
122
|
code = "\n".join(all_code)
|
123
123
|
|
124
124
|
if code is not None:
|
125
|
-
code = use_extra_vision_agent_args(code, test_multi_plan,
|
125
|
+
code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
|
126
126
|
return code
|
127
127
|
|
128
128
|
|
@@ -195,9 +195,8 @@ class VisionAgent(Agent):
|
|
195
195
|
agent: Optional[LMM] = None,
|
196
196
|
verbosity: int = 0,
|
197
197
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
198
|
-
code_sandbox_runtime: Optional[str] = None,
|
199
198
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
200
|
-
code_interpreter: Optional[CodeInterpreter] = None,
|
199
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
201
200
|
) -> None:
|
202
201
|
"""Initialize the VisionAgent.
|
203
202
|
|
@@ -207,14 +206,17 @@ class VisionAgent(Agent):
|
|
207
206
|
verbosity (int): The verbosity level of the agent.
|
208
207
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
209
208
|
artifacts file.
|
210
|
-
|
211
|
-
|
209
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
210
|
+
function to send intermediate update messages.
|
211
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
212
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
213
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
214
|
+
object is provided it will use that.
|
212
215
|
"""
|
213
216
|
|
214
217
|
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
215
218
|
self.max_iterations = 12
|
216
219
|
self.verbosity = verbosity
|
217
|
-
self.code_sandbox_runtime = code_sandbox_runtime
|
218
220
|
self.code_interpreter = code_interpreter
|
219
221
|
self.callback_message = callback_message
|
220
222
|
if self.verbosity >= 1:
|
@@ -233,7 +235,7 @@ class VisionAgent(Agent):
|
|
233
235
|
input: Union[str, List[Message]],
|
234
236
|
media: Optional[Union[str, Path]] = None,
|
235
237
|
artifacts: Optional[Artifacts] = None,
|
236
|
-
) ->
|
238
|
+
) -> str:
|
237
239
|
"""Chat with VisionAgent and get the conversation response.
|
238
240
|
|
239
241
|
Parameters:
|
@@ -250,15 +252,33 @@ class VisionAgent(Agent):
|
|
250
252
|
input = [{"role": "user", "content": input}]
|
251
253
|
if media is not None:
|
252
254
|
input[0]["media"] = [media]
|
253
|
-
results, _ = self.
|
254
|
-
return results
|
255
|
+
results, _ = self.chat_with_artifacts(input, artifacts)
|
256
|
+
return results[-1]["content"] # type: ignore
|
257
|
+
|
258
|
+
def chat(
|
259
|
+
self,
|
260
|
+
chat: List[Message],
|
261
|
+
) -> List[Message]:
|
262
|
+
"""Chat with VisionAgent, it will use code to execute actions to accomplish
|
263
|
+
its tasks.
|
264
|
+
|
265
|
+
Parameters:
|
266
|
+
chat (List[Message]): A conversation in the format of:
|
267
|
+
[{"role": "user", "content": "describe your task here..."}]
|
268
|
+
or if it contains media files, it should be in the format of:
|
269
|
+
[{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
|
270
|
+
|
271
|
+
Returns:
|
272
|
+
List[Message]: The conversation response.
|
273
|
+
"""
|
274
|
+
return self.chat_with_artifacts(chat)[0]
|
255
275
|
|
256
|
-
def
|
276
|
+
def chat_with_artifacts(
|
257
277
|
self,
|
258
278
|
chat: List[Message],
|
259
279
|
artifacts: Optional[Artifacts] = None,
|
260
280
|
test_multi_plan: bool = True,
|
261
|
-
|
281
|
+
custom_tool_names: Optional[List[str]] = None,
|
262
282
|
) -> Tuple[List[Message], Artifacts]:
|
263
283
|
"""Chat with VisionAgent, it will use code to execute actions to accomplish
|
264
284
|
its tasks.
|
@@ -272,7 +292,7 @@ class VisionAgent(Agent):
|
|
272
292
|
test_multi_plan (bool): If True, it will test tools for multiple plans and
|
273
293
|
pick the best one based off of the tool results. If False, it will go
|
274
294
|
with the first plan.
|
275
|
-
|
295
|
+
custom_tool_names (List[str]): A list of customized tools for agent to
|
276
296
|
pick and use. If not provided, default to full tool set from
|
277
297
|
vision_agent.tools.
|
278
298
|
|
@@ -287,11 +307,13 @@ class VisionAgent(Agent):
|
|
287
307
|
# this is setting remote artifacts path
|
288
308
|
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
289
309
|
|
310
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
290
311
|
code_interpreter = (
|
291
312
|
self.code_interpreter
|
292
313
|
if self.code_interpreter is not None
|
314
|
+
and not isinstance(self.code_interpreter, str)
|
293
315
|
else CodeInterpreterFactory.new_instance(
|
294
|
-
code_sandbox_runtime=self.
|
316
|
+
code_sandbox_runtime=self.code_interpreter,
|
295
317
|
)
|
296
318
|
)
|
297
319
|
with code_interpreter:
|
@@ -389,7 +411,7 @@ class VisionAgent(Agent):
|
|
389
411
|
finished = response["let_user_respond"]
|
390
412
|
|
391
413
|
code_action = parse_execution(
|
392
|
-
response["response"], test_multi_plan,
|
414
|
+
response["response"], test_multi_plan, custom_tool_names
|
393
415
|
)
|
394
416
|
|
395
417
|
if last_response == response:
|
@@ -480,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
480
502
|
agent: Optional[LMM] = None,
|
481
503
|
verbosity: int = 0,
|
482
504
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
483
|
-
code_sandbox_runtime: Optional[str] = None,
|
484
505
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
506
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
485
507
|
) -> None:
|
486
508
|
"""Initialize the VisionAgent using OpenAI LMMs.
|
487
509
|
|
@@ -491,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
|
|
491
513
|
verbosity (int): The verbosity level of the agent.
|
492
514
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
493
515
|
artifacts file.
|
494
|
-
|
516
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
517
|
+
function to send intermediate update messages.
|
518
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
519
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
520
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
521
|
+
object is provided it will use that.
|
495
522
|
"""
|
496
523
|
|
497
524
|
agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
@@ -499,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
499
526
|
agent,
|
500
527
|
verbosity,
|
501
528
|
local_artifacts_path,
|
502
|
-
code_sandbox_runtime,
|
503
529
|
callback_message,
|
530
|
+
code_interpreter,
|
504
531
|
)
|
505
532
|
|
506
533
|
|
@@ -510,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
|
|
510
537
|
agent: Optional[LMM] = None,
|
511
538
|
verbosity: int = 0,
|
512
539
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
513
|
-
code_sandbox_runtime: Optional[str] = None,
|
514
540
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
541
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
515
542
|
) -> None:
|
516
543
|
"""Initialize the VisionAgent using Anthropic LMMs.
|
517
544
|
|
@@ -521,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
|
|
521
548
|
verbosity (int): The verbosity level of the agent.
|
522
549
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
523
550
|
artifacts file.
|
524
|
-
|
551
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
552
|
+
function to send intermediate update messages.
|
553
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
554
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
555
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
556
|
+
object is provided it will use that.
|
525
557
|
"""
|
526
558
|
|
527
559
|
agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
@@ -529,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
529
561
|
agent,
|
530
562
|
verbosity,
|
531
563
|
local_artifacts_path,
|
532
|
-
code_sandbox_runtime,
|
533
564
|
callback_message,
|
565
|
+
code_interpreter,
|
534
566
|
)
|