vision-agent 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +76 -2
- vision_agent/agent/vision_agent.py +49 -17
- vision_agent/agent/vision_agent_coder.py +163 -489
- vision_agent/agent/vision_agent_coder_prompts.py +0 -203
- vision_agent/agent/vision_agent_planner.py +553 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +84 -3
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/METADATA +7 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/RECORD +13 -11
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/WHEEL +0 -0
vision_agent/agent/__init__.py
CHANGED
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
|
|
7
7
|
OpenAIVisionAgentCoder,
|
8
8
|
VisionAgentCoder,
|
9
9
|
)
|
10
|
+
from .vision_agent_planner import (
|
11
|
+
AnthropicVisionAgentPlanner,
|
12
|
+
AzureVisionAgentPlanner,
|
13
|
+
OllamaVisionAgentPlanner,
|
14
|
+
OpenAIVisionAgentPlanner,
|
15
|
+
PlanContext,
|
16
|
+
VisionAgentPlanner,
|
17
|
+
)
|
@@ -2,10 +2,17 @@ import json
|
|
2
2
|
import logging
|
3
3
|
import re
|
4
4
|
import sys
|
5
|
-
from typing import Any, Dict, Optional
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
from rich.console import Console
|
8
|
+
from rich.style import Style
|
9
|
+
from rich.syntax import Syntax
|
10
|
+
|
11
|
+
import vision_agent.tools as T
|
6
12
|
|
7
13
|
logging.basicConfig(stream=sys.stdout)
|
8
14
|
_LOGGER = logging.getLogger(__name__)
|
15
|
+
_CONSOLE = Console()
|
9
16
|
|
10
17
|
|
11
18
|
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
|
@@ -41,11 +48,16 @@ def _strip_markdown_code(inp_str: str) -> str:
|
|
41
48
|
|
42
49
|
def extract_json(json_str: str) -> Dict[str, Any]:
|
43
50
|
json_str_mod = json_str.replace("\n", " ").strip()
|
44
|
-
json_str_mod = json_str_mod.replace("'", '"')
|
45
51
|
json_str_mod = json_str_mod.replace(": True", ": true").replace(
|
46
52
|
": False", ": false"
|
47
53
|
)
|
48
54
|
|
55
|
+
# sometimes the json is in single quotes
|
56
|
+
try:
|
57
|
+
return json.loads(json_str_mod.replace("'", '"')) # type: ignore
|
58
|
+
except json.JSONDecodeError:
|
59
|
+
pass
|
60
|
+
|
49
61
|
try:
|
50
62
|
return json.loads(json_str_mod) # type: ignore
|
51
63
|
except json.JSONDecodeError:
|
@@ -83,3 +95,65 @@ def remove_installs_from_code(code: str) -> str:
|
|
83
95
|
pattern = r"\n!pip install.*?(\n|\Z)\n"
|
84
96
|
code = re.sub(pattern, "", code, flags=re.DOTALL)
|
85
97
|
return code
|
98
|
+
|
99
|
+
|
100
|
+
def format_memory(memory: List[Dict[str, str]]) -> str:
    """Render a list of code/feedback entries as one text block.

    Each entry is written under a "### Feedback {i}" heading, followed by its
    code inside a python code fence, its feedback text and, when the entry has
    an "edits" key, its edits. Entries are numbered from 0 in list order.

    Parameters:
        memory (List[Dict[str, str]]): Entries to render. Every entry must have
            the keys "code" and "feedback"; the key "edits" is optional.

    Returns:
        str: The rendered text, or an empty string when memory is empty.

    Raises:
        KeyError: If an entry has no "code" or "feedback" key.
    """
    sections = []
    for i, m in enumerate(memory):
        parts = [
            f"### Feedback {i}:\n",
            # The closing fence directly follows the code, so the code is
            # expected to end with its own newline.
            f"Code {i}:\n```python\n{m['code']}```\n\n",
            f"Feedback {i}: {m['feedback']}\n\n",
        ]
        if "edits" in m:
            parts.append(f"Edits {i}:\n{m['edits']}\n")
        parts.append("\n")
        sections.append("".join(parts))

    # Join once at the end instead of growing a string with += on every step.
    return "".join(sections)
|
111
|
+
|
112
|
+
|
113
|
+
def format_plans(plans: Dict[str, Any]) -> str:
    """Render a mapping of plans as text, one section per plan.

    Each section is a blank-line-prefixed "{name}: {thoughts}" line followed by
    the plan's instructions, one " -" bullet per instruction. Sections appear
    in the iteration order of the mapping.

    Parameters:
        plans (Dict[str, Any]): Maps a plan name to a dict with the keys
            "thoughts" and "instructions" (an iterable of strings).

    Returns:
        str: The rendered text, or an empty string when plans is empty. A plan
            with no instructions gets only its "{name}: {thoughts}" line.

    Raises:
        KeyError: If a plan has no "thoughts" or "instructions" key.
    """
    sections = []
    for name, plan in plans.items():
        section = f"\n{name}: {plan['thoughts']}\n"
        bullets = list(plan["instructions"])
        # Skip the bullet list entirely when there is nothing to list, so an
        # empty plan does not end in a dangling " -" marker.
        if bullets:
            section += " -" + "\n -".join(bullets)
        sections.append(section)

    return "".join(sections)
|
120
|
+
|
121
|
+
|
122
|
+
class DefaultImports:
    """Holder for the import statements placed ahead of code."""

    # Statements that always come first, before any entries contributed by
    # T.__new_tools__.
    common_imports = [
        "import os",
        "import numpy as np",
        "from vision_agent.tools import *",
        "from typing import *",
        "from pillow_heif import register_heif_opener",
        "register_heif_opener()",
    ]

    @staticmethod
    def to_code_string() -> str:
        """Return the common imports followed by T.__new_tools__, one per line."""
        import_lines = DefaultImports.common_imports + T.__new_tools__
        return "\n".join(import_lines)

    @staticmethod
    def prepend_imports(code: str) -> str:
        """Return code with the default imports and a blank line placed before it.

        NOTE: be sure to run this method after the custom tools have been registered.
        """
        header = DefaultImports.to_code_string()
        return "\n\n".join((header, code))
|
144
|
+
|
145
|
+
|
146
|
+
def print_code(title: str, code: str, test: Optional[str] = None) -> None:
    """Print a titled, syntax-highlighted listing of code to the module console.

    The title is printed first, then a "Code" divider and the code with the
    default imports prepended. When test is given and non-empty, a "Test"
    divider and the test source follow.

    Parameters:
        title (str): Heading printed above the listing.
        code (str): Source code to show; default imports are prepended to it.
        test (Optional[str]): Test source to show after the code, if any.
    """
    bar = "=" * 30

    def _highlight(source: str) -> Any:
        # Same highlighting settings for both the code and the test listing.
        return Syntax(source, "python", theme="gruvbox-dark", line_numbers=True)

    _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
    _CONSOLE.print(f"{bar} Code {bar}")
    _CONSOLE.print(_highlight(DefaultImports.prepend_imports(code)))

    if not test:
        return

    _CONSOLE.print(f"{bar} Test {bar}")
    _CONSOLE.print(_highlight(test))
|
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
14
14
|
VA_CODE,
|
15
15
|
)
|
16
16
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
17
|
-
from vision_agent.tools import META_TOOL_DOCSTRING
|
18
17
|
from vision_agent.tools.meta_tools import (
|
18
|
+
META_TOOL_DOCSTRING,
|
19
19
|
Artifacts,
|
20
20
|
check_and_load_image,
|
21
21
|
use_extra_vision_agent_args,
|
@@ -195,9 +195,8 @@ class VisionAgent(Agent):
|
|
195
195
|
agent: Optional[LMM] = None,
|
196
196
|
verbosity: int = 0,
|
197
197
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
198
|
-
code_sandbox_runtime: Optional[str] = None,
|
199
198
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
200
|
-
code_interpreter: Optional[CodeInterpreter] = None,
|
199
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
201
200
|
) -> None:
|
202
201
|
"""Initialize the VisionAgent.
|
203
202
|
|
@@ -207,14 +206,17 @@ class VisionAgent(Agent):
|
|
207
206
|
verbosity (int): The verbosity level of the agent.
|
208
207
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
209
208
|
artifacts file.
|
210
|
-
|
211
|
-
|
209
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
210
|
+
function to send intermediate update messages.
|
211
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
212
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
213
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
214
|
+
object is provided it will use that.
|
212
215
|
"""
|
213
216
|
|
214
217
|
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
215
218
|
self.max_iterations = 12
|
216
219
|
self.verbosity = verbosity
|
217
|
-
self.code_sandbox_runtime = code_sandbox_runtime
|
218
220
|
self.code_interpreter = code_interpreter
|
219
221
|
self.callback_message = callback_message
|
220
222
|
if self.verbosity >= 1:
|
@@ -233,7 +235,7 @@ class VisionAgent(Agent):
|
|
233
235
|
input: Union[str, List[Message]],
|
234
236
|
media: Optional[Union[str, Path]] = None,
|
235
237
|
artifacts: Optional[Artifacts] = None,
|
236
|
-
) ->
|
238
|
+
) -> str:
|
237
239
|
"""Chat with VisionAgent and get the conversation response.
|
238
240
|
|
239
241
|
Parameters:
|
@@ -250,10 +252,28 @@ class VisionAgent(Agent):
|
|
250
252
|
input = [{"role": "user", "content": input}]
|
251
253
|
if media is not None:
|
252
254
|
input[0]["media"] = [media]
|
253
|
-
results, _ = self.
|
254
|
-
return results
|
255
|
+
results, _ = self.chat_with_artifacts(input, artifacts)
|
256
|
+
return results[-1]["content"] # type: ignore
|
257
|
+
|
258
|
+
def chat(
    self,
    chat: List[Message],
) -> List[Message]:
    """Chat with VisionAgent; it uses code to execute actions to accomplish
    its tasks.

    Parameters:
        chat (List[Message]): A conversation in the format of:
            [{"role": "user", "content": "describe your task here..."}]
            or if it contains media files, it should be in the format of:
            [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]

    Returns:
        List[Message]: The conversation response, i.e. the first element of
            the result of chat_with_artifacts.
    """
    outcome = self.chat_with_artifacts(chat)
    # Only the conversation is returned; the rest of the result is dropped.
    return outcome[0]
|
255
275
|
|
256
|
-
def
|
276
|
+
def chat_with_artifacts(
|
257
277
|
self,
|
258
278
|
chat: List[Message],
|
259
279
|
artifacts: Optional[Artifacts] = None,
|
@@ -287,11 +307,13 @@ class VisionAgent(Agent):
|
|
287
307
|
# this is setting remote artifacts path
|
288
308
|
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
289
309
|
|
310
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
290
311
|
code_interpreter = (
|
291
312
|
self.code_interpreter
|
292
313
|
if self.code_interpreter is not None
|
314
|
+
and not isinstance(self.code_interpreter, str)
|
293
315
|
else CodeInterpreterFactory.new_instance(
|
294
|
-
code_sandbox_runtime=self.
|
316
|
+
code_sandbox_runtime=self.code_interpreter,
|
295
317
|
)
|
296
318
|
)
|
297
319
|
with code_interpreter:
|
@@ -480,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
480
502
|
agent: Optional[LMM] = None,
|
481
503
|
verbosity: int = 0,
|
482
504
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
483
|
-
code_sandbox_runtime: Optional[str] = None,
|
484
505
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
506
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
485
507
|
) -> None:
|
486
508
|
"""Initialize the VisionAgent using OpenAI LMMs.
|
487
509
|
|
@@ -491,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
|
|
491
513
|
verbosity (int): The verbosity level of the agent.
|
492
514
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
493
515
|
artifacts file.
|
494
|
-
|
516
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
517
|
+
function to send intermediate update messages.
|
518
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
519
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
520
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
521
|
+
object is provided it will use that.
|
495
522
|
"""
|
496
523
|
|
497
524
|
agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
@@ -499,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
499
526
|
agent,
|
500
527
|
verbosity,
|
501
528
|
local_artifacts_path,
|
502
|
-
code_sandbox_runtime,
|
503
529
|
callback_message,
|
530
|
+
code_interpreter,
|
504
531
|
)
|
505
532
|
|
506
533
|
|
@@ -510,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
|
|
510
537
|
agent: Optional[LMM] = None,
|
511
538
|
verbosity: int = 0,
|
512
539
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
513
|
-
code_sandbox_runtime: Optional[str] = None,
|
514
540
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
541
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
515
542
|
) -> None:
|
516
543
|
"""Initialize the VisionAgent using Anthropic LMMs.
|
517
544
|
|
@@ -521,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
|
|
521
548
|
verbosity (int): The verbosity level of the agent.
|
522
549
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
523
550
|
artifacts file.
|
524
|
-
|
551
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
552
|
+
function to send intermediate update messages.
|
553
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
554
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
555
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
556
|
+
object is provided it will use that.
|
525
557
|
"""
|
526
558
|
|
527
559
|
agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
@@ -529,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
529
561
|
agent,
|
530
562
|
verbosity,
|
531
563
|
local_artifacts_path,
|
532
|
-
code_sandbox_runtime,
|
533
564
|
callback_message,
|
565
|
+
code_interpreter,
|
534
566
|
)
|