vision_agent-0.2.161-py3-none-any.whl → vision_agent-0.2.162-py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +76 -2
- vision_agent/agent/vision_agent.py +49 -17
- vision_agent/agent/vision_agent_coder.py +163 -489
- vision_agent/agent/vision_agent_coder_prompts.py +0 -203
- vision_agent/agent/vision_agent_planner.py +553 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +84 -3
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/METADATA +7 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/RECORD +13 -11
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/WHEEL +0 -0
vision_agent/agent/__init__.py
CHANGED
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
|
|
7
7
|
OpenAIVisionAgentCoder,
|
8
8
|
VisionAgentCoder,
|
9
9
|
)
|
10
|
+
from .vision_agent_planner import (
|
11
|
+
AnthropicVisionAgentPlanner,
|
12
|
+
AzureVisionAgentPlanner,
|
13
|
+
OllamaVisionAgentPlanner,
|
14
|
+
OpenAIVisionAgentPlanner,
|
15
|
+
PlanContext,
|
16
|
+
VisionAgentPlanner,
|
17
|
+
)
|
vision_agent/agent/agent_utils.py
CHANGED
@@ -2,10 +2,17 @@ import json
|
|
2
2
|
import logging
|
3
3
|
import re
|
4
4
|
import sys
|
5
|
-
from typing import Any, Dict, Optional
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
from rich.console import Console
|
8
|
+
from rich.style import Style
|
9
|
+
from rich.syntax import Syntax
|
10
|
+
|
11
|
+
import vision_agent.tools as T
|
6
12
|
|
7
13
|
logging.basicConfig(stream=sys.stdout)
|
8
14
|
_LOGGER = logging.getLogger(__name__)
|
15
|
+
_CONSOLE = Console()
|
9
16
|
|
10
17
|
|
11
18
|
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
|
@@ -41,11 +48,16 @@ def _strip_markdown_code(inp_str: str) -> str:
|
|
41
48
|
|
42
49
|
def extract_json(json_str: str) -> Dict[str, Any]:
|
43
50
|
json_str_mod = json_str.replace("\n", " ").strip()
|
44
|
-
json_str_mod = json_str_mod.replace("'", '"')
|
45
51
|
json_str_mod = json_str_mod.replace(": True", ": true").replace(
|
46
52
|
": False", ": false"
|
47
53
|
)
|
48
54
|
|
55
|
+
# sometimes the json is in single quotes
|
56
|
+
try:
|
57
|
+
return json.loads(json_str_mod.replace("'", '"')) # type: ignore
|
58
|
+
except json.JSONDecodeError:
|
59
|
+
pass
|
60
|
+
|
49
61
|
try:
|
50
62
|
return json.loads(json_str_mod) # type: ignore
|
51
63
|
except json.JSONDecodeError:
|
@@ -83,3 +95,65 @@ def remove_installs_from_code(code: str) -> str:
|
|
83
95
|
pattern = r"\n!pip install.*?(\n|\Z)\n"
|
84
96
|
code = re.sub(pattern, "", code, flags=re.DOTALL)
|
85
97
|
return code
|
98
|
+
|
99
|
+
|
100
|
+
def format_memory(memory: List[Dict[str, str]]) -> str:
|
101
|
+
output_str = ""
|
102
|
+
for i, m in enumerate(memory):
|
103
|
+
output_str += f"### Feedback {i}:\n"
|
104
|
+
output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
|
105
|
+
output_str += f"Feedback {i}: {m['feedback']}\n\n"
|
106
|
+
if "edits" in m:
|
107
|
+
output_str += f"Edits {i}:\n{m['edits']}\n"
|
108
|
+
output_str += "\n"
|
109
|
+
|
110
|
+
return output_str
|
111
|
+
|
112
|
+
|
113
|
+
def format_plans(plans: Dict[str, Any]) -> str:
|
114
|
+
plan_str = ""
|
115
|
+
for k, v in plans.items():
|
116
|
+
plan_str += "\n" + f"{k}: {v['thoughts']}\n"
|
117
|
+
plan_str += " -" + "\n -".join([e for e in v["instructions"]])
|
118
|
+
|
119
|
+
return plan_str
|
120
|
+
|
121
|
+
|
122
|
+
class DefaultImports:
|
123
|
+
"""Container for default imports used in the code execution."""
|
124
|
+
|
125
|
+
common_imports = [
|
126
|
+
"import os",
|
127
|
+
"import numpy as np",
|
128
|
+
"from vision_agent.tools import *",
|
129
|
+
"from typing import *",
|
130
|
+
"from pillow_heif import register_heif_opener",
|
131
|
+
"register_heif_opener()",
|
132
|
+
]
|
133
|
+
|
134
|
+
@staticmethod
|
135
|
+
def to_code_string() -> str:
|
136
|
+
return "\n".join(DefaultImports.common_imports + T.__new_tools__)
|
137
|
+
|
138
|
+
@staticmethod
|
139
|
+
def prepend_imports(code: str) -> str:
|
140
|
+
"""Run this method to prepend the default imports to the code.
|
141
|
+
NOTE: be sure to run this method after the custom tools have been registered.
|
142
|
+
"""
|
143
|
+
return DefaultImports.to_code_string() + "\n\n" + code
|
144
|
+
|
145
|
+
|
146
|
+
def print_code(title: str, code: str, test: Optional[str] = None) -> None:
|
147
|
+
_CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
|
148
|
+
_CONSOLE.print("=" * 30 + " Code " + "=" * 30)
|
149
|
+
_CONSOLE.print(
|
150
|
+
Syntax(
|
151
|
+
DefaultImports.prepend_imports(code),
|
152
|
+
"python",
|
153
|
+
theme="gruvbox-dark",
|
154
|
+
line_numbers=True,
|
155
|
+
)
|
156
|
+
)
|
157
|
+
if test:
|
158
|
+
_CONSOLE.print("=" * 30 + " Test " + "=" * 30)
|
159
|
+
_CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
|
vision_agent/agent/vision_agent.py
CHANGED
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
14
14
|
VA_CODE,
|
15
15
|
)
|
16
16
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
17
|
-
from vision_agent.tools import META_TOOL_DOCSTRING
|
18
17
|
from vision_agent.tools.meta_tools import (
|
18
|
+
META_TOOL_DOCSTRING,
|
19
19
|
Artifacts,
|
20
20
|
check_and_load_image,
|
21
21
|
use_extra_vision_agent_args,
|
@@ -195,9 +195,8 @@ class VisionAgent(Agent):
|
|
195
195
|
agent: Optional[LMM] = None,
|
196
196
|
verbosity: int = 0,
|
197
197
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
198
|
-
code_sandbox_runtime: Optional[str] = None,
|
199
198
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
200
|
-
code_interpreter: Optional[CodeInterpreter] = None,
|
199
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
201
200
|
) -> None:
|
202
201
|
"""Initialize the VisionAgent.
|
203
202
|
|
@@ -207,14 +206,17 @@ class VisionAgent(Agent):
|
|
207
206
|
verbosity (int): The verbosity level of the agent.
|
208
207
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
209
208
|
artifacts file.
|
210
|
-
|
211
|
-
|
209
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
210
|
+
function to send intermediate update messages.
|
211
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
212
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
213
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
214
|
+
object is provided it will use that.
|
212
215
|
"""
|
213
216
|
|
214
217
|
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
215
218
|
self.max_iterations = 12
|
216
219
|
self.verbosity = verbosity
|
217
|
-
self.code_sandbox_runtime = code_sandbox_runtime
|
218
220
|
self.code_interpreter = code_interpreter
|
219
221
|
self.callback_message = callback_message
|
220
222
|
if self.verbosity >= 1:
|
@@ -233,7 +235,7 @@ class VisionAgent(Agent):
|
|
233
235
|
input: Union[str, List[Message]],
|
234
236
|
media: Optional[Union[str, Path]] = None,
|
235
237
|
artifacts: Optional[Artifacts] = None,
|
236
|
-
) -> List[Message]:
|
238
|
+
) -> str:
|
237
239
|
"""Chat with VisionAgent and get the conversation response.
|
238
240
|
|
239
241
|
Parameters:
|
@@ -250,10 +252,28 @@ class VisionAgent(Agent):
|
|
250
252
|
input = [{"role": "user", "content": input}]
|
251
253
|
if media is not None:
|
252
254
|
input[0]["media"] = [media]
|
253
|
-
results, _ = self.chat_with_code(input, artifacts)
|
254
|
-
return results
|
255
|
+
results, _ = self.chat_with_artifacts(input, artifacts)
|
256
|
+
return results[-1]["content"] # type: ignore
|
257
|
+
|
258
|
+
def chat(
|
259
|
+
self,
|
260
|
+
chat: List[Message],
|
261
|
+
) -> List[Message]:
|
262
|
+
"""Chat with VisionAgent, it will use code to execute actions to accomplish
|
263
|
+
its tasks.
|
264
|
+
|
265
|
+
Parameters:
|
266
|
+
chat (List[Message]): A conversation in the format of:
|
267
|
+
[{"role": "user", "content": "describe your task here..."}]
|
268
|
+
or if it contains media files, it should be in the format of:
|
269
|
+
[{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
|
270
|
+
|
271
|
+
Returns:
|
272
|
+
List[Message]: The conversation response.
|
273
|
+
"""
|
274
|
+
return self.chat_with_artifacts(chat)[0]
|
255
275
|
|
256
|
-
def chat_with_code(
|
276
|
+
def chat_with_artifacts(
|
257
277
|
self,
|
258
278
|
chat: List[Message],
|
259
279
|
artifacts: Optional[Artifacts] = None,
|
@@ -287,11 +307,13 @@ class VisionAgent(Agent):
|
|
287
307
|
# this is setting remote artifacts path
|
288
308
|
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
289
309
|
|
310
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
290
311
|
code_interpreter = (
|
291
312
|
self.code_interpreter
|
292
313
|
if self.code_interpreter is not None
|
314
|
+
and not isinstance(self.code_interpreter, str)
|
293
315
|
else CodeInterpreterFactory.new_instance(
|
294
|
-
code_sandbox_runtime=self.code_sandbox_runtime,
|
316
|
+
code_sandbox_runtime=self.code_interpreter,
|
295
317
|
)
|
296
318
|
)
|
297
319
|
with code_interpreter:
|
@@ -480,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
480
502
|
agent: Optional[LMM] = None,
|
481
503
|
verbosity: int = 0,
|
482
504
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
483
|
-
code_sandbox_runtime: Optional[str] = None,
|
484
505
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
506
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
485
507
|
) -> None:
|
486
508
|
"""Initialize the VisionAgent using OpenAI LMMs.
|
487
509
|
|
@@ -491,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
|
|
491
513
|
verbosity (int): The verbosity level of the agent.
|
492
514
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
493
515
|
artifacts file.
|
494
|
-
|
516
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
517
|
+
function to send intermediate update messages.
|
518
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
519
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
520
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
521
|
+
object is provided it will use that.
|
495
522
|
"""
|
496
523
|
|
497
524
|
agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
@@ -499,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
499
526
|
agent,
|
500
527
|
verbosity,
|
501
528
|
local_artifacts_path,
|
502
|
-
code_sandbox_runtime,
|
503
529
|
callback_message,
|
530
|
+
code_interpreter,
|
504
531
|
)
|
505
532
|
|
506
533
|
|
@@ -510,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
|
|
510
537
|
agent: Optional[LMM] = None,
|
511
538
|
verbosity: int = 0,
|
512
539
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
513
|
-
code_sandbox_runtime: Optional[str] = None,
|
514
540
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
541
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
515
542
|
) -> None:
|
516
543
|
"""Initialize the VisionAgent using Anthropic LMMs.
|
517
544
|
|
@@ -521,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
|
|
521
548
|
verbosity (int): The verbosity level of the agent.
|
522
549
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
523
550
|
artifacts file.
|
524
|
-
|
551
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
552
|
+
function to send intermediate update messages.
|
553
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
554
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
555
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
556
|
+
object is provided it will use that.
|
525
557
|
"""
|
526
558
|
|
527
559
|
agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
@@ -529,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
529
561
|
agent,
|
530
562
|
verbosity,
|
531
563
|
local_artifacts_path,
|
532
|
-
code_sandbox_runtime,
|
533
564
|
callback_message,
|
565
|
+
code_interpreter,
|
534
566
|
)
|