vision-agent 0.2.160__py3-none-any.whl → 0.2.162__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +76 -2
- vision_agent/agent/vision_agent.py +57 -17
- vision_agent/agent/vision_agent_coder.py +163 -489
- vision_agent/agent/vision_agent_coder_prompts.py +0 -203
- vision_agent/agent/vision_agent_planner.py +553 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +87 -5
- {vision_agent-0.2.160.dist-info → vision_agent-0.2.162.dist-info}/METADATA +7 -7
- {vision_agent-0.2.160.dist-info → vision_agent-0.2.162.dist-info}/RECORD +13 -11
- {vision_agent-0.2.160.dist-info → vision_agent-0.2.162.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.160.dist-info → vision_agent-0.2.162.dist-info}/WHEEL +0 -0
vision_agent/agent/__init__.py
CHANGED
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
|
|
7
7
|
OpenAIVisionAgentCoder,
|
8
8
|
VisionAgentCoder,
|
9
9
|
)
|
10
|
+
from .vision_agent_planner import (
|
11
|
+
AnthropicVisionAgentPlanner,
|
12
|
+
AzureVisionAgentPlanner,
|
13
|
+
OllamaVisionAgentPlanner,
|
14
|
+
OpenAIVisionAgentPlanner,
|
15
|
+
PlanContext,
|
16
|
+
VisionAgentPlanner,
|
17
|
+
)
|
vision_agent/agent/agent_utils.py
CHANGED
@@ -2,10 +2,17 @@ import json
|
|
2
2
|
import logging
|
3
3
|
import re
|
4
4
|
import sys
|
5
|
-
from typing import Any, Dict, Optional
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
from rich.console import Console
|
8
|
+
from rich.style import Style
|
9
|
+
from rich.syntax import Syntax
|
10
|
+
|
11
|
+
import vision_agent.tools as T
|
6
12
|
|
7
13
|
logging.basicConfig(stream=sys.stdout)
|
8
14
|
_LOGGER = logging.getLogger(__name__)
|
15
|
+
_CONSOLE = Console()
|
9
16
|
|
10
17
|
|
11
18
|
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
|
@@ -41,11 +48,16 @@ def _strip_markdown_code(inp_str: str) -> str:
|
|
41
48
|
|
42
49
|
def extract_json(json_str: str) -> Dict[str, Any]:
|
43
50
|
json_str_mod = json_str.replace("\n", " ").strip()
|
44
|
-
json_str_mod = json_str_mod.replace("'", '"')
|
45
51
|
json_str_mod = json_str_mod.replace(": True", ": true").replace(
|
46
52
|
": False", ": false"
|
47
53
|
)
|
48
54
|
|
55
|
+
# sometimes the json is in single quotes
|
56
|
+
try:
|
57
|
+
return json.loads(json_str_mod.replace("'", '"')) # type: ignore
|
58
|
+
except json.JSONDecodeError:
|
59
|
+
pass
|
60
|
+
|
49
61
|
try:
|
50
62
|
return json.loads(json_str_mod) # type: ignore
|
51
63
|
except json.JSONDecodeError:
|
@@ -83,3 +95,65 @@ def remove_installs_from_code(code: str) -> str:
|
|
83
95
|
pattern = r"\n!pip install.*?(\n|\Z)\n"
|
84
96
|
code = re.sub(pattern, "", code, flags=re.DOTALL)
|
85
97
|
return code
|
98
|
+
|
99
|
+
|
100
|
+
def format_memory(memory: List[Dict[str, str]]) -> str:
|
101
|
+
output_str = ""
|
102
|
+
for i, m in enumerate(memory):
|
103
|
+
output_str += f"### Feedback {i}:\n"
|
104
|
+
output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
|
105
|
+
output_str += f"Feedback {i}: {m['feedback']}\n\n"
|
106
|
+
if "edits" in m:
|
107
|
+
output_str += f"Edits {i}:\n{m['edits']}\n"
|
108
|
+
output_str += "\n"
|
109
|
+
|
110
|
+
return output_str
|
111
|
+
|
112
|
+
|
113
|
+
def format_plans(plans: Dict[str, Any]) -> str:
|
114
|
+
plan_str = ""
|
115
|
+
for k, v in plans.items():
|
116
|
+
plan_str += "\n" + f"{k}: {v['thoughts']}\n"
|
117
|
+
plan_str += " -" + "\n -".join([e for e in v["instructions"]])
|
118
|
+
|
119
|
+
return plan_str
|
120
|
+
|
121
|
+
|
122
|
+
class DefaultImports:
|
123
|
+
"""Container for default imports used in the code execution."""
|
124
|
+
|
125
|
+
common_imports = [
|
126
|
+
"import os",
|
127
|
+
"import numpy as np",
|
128
|
+
"from vision_agent.tools import *",
|
129
|
+
"from typing import *",
|
130
|
+
"from pillow_heif import register_heif_opener",
|
131
|
+
"register_heif_opener()",
|
132
|
+
]
|
133
|
+
|
134
|
+
@staticmethod
|
135
|
+
def to_code_string() -> str:
|
136
|
+
return "\n".join(DefaultImports.common_imports + T.__new_tools__)
|
137
|
+
|
138
|
+
@staticmethod
|
139
|
+
def prepend_imports(code: str) -> str:
|
140
|
+
"""Run this method to prepend the default imports to the code.
|
141
|
+
NOTE: be sure to run this method after the custom tools have been registered.
|
142
|
+
"""
|
143
|
+
return DefaultImports.to_code_string() + "\n\n" + code
|
144
|
+
|
145
|
+
|
146
|
+
def print_code(title: str, code: str, test: Optional[str] = None) -> None:
|
147
|
+
_CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
|
148
|
+
_CONSOLE.print("=" * 30 + " Code " + "=" * 30)
|
149
|
+
_CONSOLE.print(
|
150
|
+
Syntax(
|
151
|
+
DefaultImports.prepend_imports(code),
|
152
|
+
"python",
|
153
|
+
theme="gruvbox-dark",
|
154
|
+
line_numbers=True,
|
155
|
+
)
|
156
|
+
)
|
157
|
+
if test:
|
158
|
+
_CONSOLE.print("=" * 30 + " Test " + "=" * 30)
|
159
|
+
_CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
|
vision_agent/agent/vision_agent.py
CHANGED
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
14
14
|
VA_CODE,
|
15
15
|
)
|
16
16
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
17
|
-
from vision_agent.tools import META_TOOL_DOCSTRING
|
18
17
|
from vision_agent.tools.meta_tools import (
|
18
|
+
META_TOOL_DOCSTRING,
|
19
19
|
Artifacts,
|
20
20
|
check_and_load_image,
|
21
21
|
use_extra_vision_agent_args,
|
@@ -195,8 +195,8 @@ class VisionAgent(Agent):
|
|
195
195
|
agent: Optional[LMM] = None,
|
196
196
|
verbosity: int = 0,
|
197
197
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
198
|
-
code_sandbox_runtime: Optional[str] = None,
|
199
198
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
199
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
200
200
|
) -> None:
|
201
201
|
"""Initialize the VisionAgent.
|
202
202
|
|
@@ -206,13 +206,18 @@ class VisionAgent(Agent):
|
|
206
206
|
verbosity (int): The verbosity level of the agent.
|
207
207
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
208
208
|
artifacts file.
|
209
|
-
|
209
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
210
|
+
function to send intermediate update messages.
|
211
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
212
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
213
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
214
|
+
object is provided it will use that.
|
210
215
|
"""
|
211
216
|
|
212
217
|
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
213
218
|
self.max_iterations = 12
|
214
219
|
self.verbosity = verbosity
|
215
|
-
self.
|
220
|
+
self.code_interpreter = code_interpreter
|
216
221
|
self.callback_message = callback_message
|
217
222
|
if self.verbosity >= 1:
|
218
223
|
_LOGGER.setLevel(logging.INFO)
|
@@ -230,7 +235,7 @@ class VisionAgent(Agent):
|
|
230
235
|
input: Union[str, List[Message]],
|
231
236
|
media: Optional[Union[str, Path]] = None,
|
232
237
|
artifacts: Optional[Artifacts] = None,
|
233
|
-
) ->
|
238
|
+
) -> str:
|
234
239
|
"""Chat with VisionAgent and get the conversation response.
|
235
240
|
|
236
241
|
Parameters:
|
@@ -247,10 +252,28 @@ class VisionAgent(Agent):
|
|
247
252
|
input = [{"role": "user", "content": input}]
|
248
253
|
if media is not None:
|
249
254
|
input[0]["media"] = [media]
|
250
|
-
results, _ = self.
|
251
|
-
return results
|
255
|
+
results, _ = self.chat_with_artifacts(input, artifacts)
|
256
|
+
return results[-1]["content"] # type: ignore
|
252
257
|
|
253
|
-
def
|
258
|
+
def chat(
|
259
|
+
self,
|
260
|
+
chat: List[Message],
|
261
|
+
) -> List[Message]:
|
262
|
+
"""Chat with VisionAgent, it will use code to execute actions to accomplish
|
263
|
+
its tasks.
|
264
|
+
|
265
|
+
Parameters:
|
266
|
+
chat (List[Message]): A conversation in the format of:
|
267
|
+
[{"role": "user", "content": "describe your task here..."}]
|
268
|
+
or if it contains media files, it should be in the format of:
|
269
|
+
[{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
|
270
|
+
|
271
|
+
Returns:
|
272
|
+
List[Message]: The conversation response.
|
273
|
+
"""
|
274
|
+
return self.chat_with_artifacts(chat)[0]
|
275
|
+
|
276
|
+
def chat_with_artifacts(
|
254
277
|
self,
|
255
278
|
chat: List[Message],
|
256
279
|
artifacts: Optional[Artifacts] = None,
|
@@ -284,9 +307,16 @@ class VisionAgent(Agent):
|
|
284
307
|
# this is setting remote artifacts path
|
285
308
|
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
286
309
|
|
287
|
-
|
288
|
-
|
289
|
-
|
310
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
311
|
+
code_interpreter = (
|
312
|
+
self.code_interpreter
|
313
|
+
if self.code_interpreter is not None
|
314
|
+
and not isinstance(self.code_interpreter, str)
|
315
|
+
else CodeInterpreterFactory.new_instance(
|
316
|
+
code_sandbox_runtime=self.code_interpreter,
|
317
|
+
)
|
318
|
+
)
|
319
|
+
with code_interpreter:
|
290
320
|
orig_chat = copy.deepcopy(chat)
|
291
321
|
int_chat = copy.deepcopy(chat)
|
292
322
|
last_user_message = chat[-1]
|
@@ -472,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
472
502
|
agent: Optional[LMM] = None,
|
473
503
|
verbosity: int = 0,
|
474
504
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
475
|
-
code_sandbox_runtime: Optional[str] = None,
|
476
505
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
506
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
477
507
|
) -> None:
|
478
508
|
"""Initialize the VisionAgent using OpenAI LMMs.
|
479
509
|
|
@@ -483,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
|
|
483
513
|
verbosity (int): The verbosity level of the agent.
|
484
514
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
485
515
|
artifacts file.
|
486
|
-
|
516
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
517
|
+
function to send intermediate update messages.
|
518
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
519
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
520
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
521
|
+
object is provided it will use that.
|
487
522
|
"""
|
488
523
|
|
489
524
|
agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
@@ -491,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
491
526
|
agent,
|
492
527
|
verbosity,
|
493
528
|
local_artifacts_path,
|
494
|
-
code_sandbox_runtime,
|
495
529
|
callback_message,
|
530
|
+
code_interpreter,
|
496
531
|
)
|
497
532
|
|
498
533
|
|
@@ -502,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
|
|
502
537
|
agent: Optional[LMM] = None,
|
503
538
|
verbosity: int = 0,
|
504
539
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
505
|
-
code_sandbox_runtime: Optional[str] = None,
|
506
540
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
541
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
507
542
|
) -> None:
|
508
543
|
"""Initialize the VisionAgent using Anthropic LMMs.
|
509
544
|
|
@@ -513,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
|
|
513
548
|
verbosity (int): The verbosity level of the agent.
|
514
549
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
515
550
|
artifacts file.
|
516
|
-
|
551
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
552
|
+
function to send intermediate update messages.
|
553
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
554
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
555
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
556
|
+
object is provided it will use that.
|
517
557
|
"""
|
518
558
|
|
519
559
|
agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
@@ -521,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
521
561
|
agent,
|
522
562
|
verbosity,
|
523
563
|
local_artifacts_path,
|
524
|
-
code_sandbox_runtime,
|
525
564
|
callback_message,
|
565
|
+
code_interpreter,
|
526
566
|
)
|