vision_agent-0.2.160-py3-none-any.whl → vision_agent-0.2.162-py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +76 -2
- vision_agent/agent/vision_agent.py +57 -17
- vision_agent/agent/vision_agent_coder.py +163 -489
- vision_agent/agent/vision_agent_coder_prompts.py +0 -203
- vision_agent/agent/vision_agent_planner.py +553 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +87 -5
- {vision_agent-0.2.160.dist-info → vision_agent-0.2.162.dist-info}/METADATA +7 -7
- {vision_agent-0.2.160.dist-info → vision_agent-0.2.162.dist-info}/RECORD +13 -11
- {vision_agent-0.2.160.dist-info → vision_agent-0.2.162.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.160.dist-info → vision_agent-0.2.162.dist-info}/WHEEL +0 -0
vision_agent/agent/__init__.py
CHANGED
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
|
|
7
7
|
OpenAIVisionAgentCoder,
|
8
8
|
VisionAgentCoder,
|
9
9
|
)
|
10
|
+
from .vision_agent_planner import (
|
11
|
+
AnthropicVisionAgentPlanner,
|
12
|
+
AzureVisionAgentPlanner,
|
13
|
+
OllamaVisionAgentPlanner,
|
14
|
+
OpenAIVisionAgentPlanner,
|
15
|
+
PlanContext,
|
16
|
+
VisionAgentPlanner,
|
17
|
+
)
|
vision_agent/agent/agent_utils.py
CHANGED
@@ -2,10 +2,17 @@ import json
|
|
2
2
|
import logging
|
3
3
|
import re
|
4
4
|
import sys
|
5
|
-
from typing import Any, Dict, Optional
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
|
+
|
7
|
+
from rich.console import Console
|
8
|
+
from rich.style import Style
|
9
|
+
from rich.syntax import Syntax
|
10
|
+
|
11
|
+
import vision_agent.tools as T
|
6
12
|
|
7
13
|
logging.basicConfig(stream=sys.stdout)
|
8
14
|
_LOGGER = logging.getLogger(__name__)
|
15
|
+
_CONSOLE = Console()
|
9
16
|
|
10
17
|
|
11
18
|
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
|
@@ -41,11 +48,16 @@ def _strip_markdown_code(inp_str: str) -> str:
|
|
41
48
|
|
42
49
|
def extract_json(json_str: str) -> Dict[str, Any]:
|
43
50
|
json_str_mod = json_str.replace("\n", " ").strip()
|
44
|
-
json_str_mod = json_str_mod.replace("'", '"')
|
45
51
|
json_str_mod = json_str_mod.replace(": True", ": true").replace(
|
46
52
|
": False", ": false"
|
47
53
|
)
|
48
54
|
|
55
|
+
# sometimes the json is in single quotes
|
56
|
+
try:
|
57
|
+
return json.loads(json_str_mod.replace("'", '"')) # type: ignore
|
58
|
+
except json.JSONDecodeError:
|
59
|
+
pass
|
60
|
+
|
49
61
|
try:
|
50
62
|
return json.loads(json_str_mod) # type: ignore
|
51
63
|
except json.JSONDecodeError:
|
@@ -83,3 +95,65 @@ def remove_installs_from_code(code: str) -> str:
|
|
83
95
|
pattern = r"\n!pip install.*?(\n|\Z)\n"
|
84
96
|
code = re.sub(pattern, "", code, flags=re.DOTALL)
|
85
97
|
return code
|
98
|
+
|
99
|
+
|
100
|
+
def format_memory(memory: List[Dict[str, str]]) -> str:
|
101
|
+
output_str = ""
|
102
|
+
for i, m in enumerate(memory):
|
103
|
+
output_str += f"### Feedback {i}:\n"
|
104
|
+
output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
|
105
|
+
output_str += f"Feedback {i}: {m['feedback']}\n\n"
|
106
|
+
if "edits" in m:
|
107
|
+
output_str += f"Edits {i}:\n{m['edits']}\n"
|
108
|
+
output_str += "\n"
|
109
|
+
|
110
|
+
return output_str
|
111
|
+
|
112
|
+
|
113
|
+
def format_plans(plans: Dict[str, Any]) -> str:
|
114
|
+
plan_str = ""
|
115
|
+
for k, v in plans.items():
|
116
|
+
plan_str += "\n" + f"{k}: {v['thoughts']}\n"
|
117
|
+
plan_str += " -" + "\n -".join([e for e in v["instructions"]])
|
118
|
+
|
119
|
+
return plan_str
|
120
|
+
|
121
|
+
|
122
|
+
class DefaultImports:
|
123
|
+
"""Container for default imports used in the code execution."""
|
124
|
+
|
125
|
+
common_imports = [
|
126
|
+
"import os",
|
127
|
+
"import numpy as np",
|
128
|
+
"from vision_agent.tools import *",
|
129
|
+
"from typing import *",
|
130
|
+
"from pillow_heif import register_heif_opener",
|
131
|
+
"register_heif_opener()",
|
132
|
+
]
|
133
|
+
|
134
|
+
@staticmethod
|
135
|
+
def to_code_string() -> str:
|
136
|
+
return "\n".join(DefaultImports.common_imports + T.__new_tools__)
|
137
|
+
|
138
|
+
@staticmethod
|
139
|
+
def prepend_imports(code: str) -> str:
|
140
|
+
"""Run this method to prepend the default imports to the code.
|
141
|
+
NOTE: be sure to run this method after the custom tools have been registered.
|
142
|
+
"""
|
143
|
+
return DefaultImports.to_code_string() + "\n\n" + code
|
144
|
+
|
145
|
+
|
146
|
+
def print_code(title: str, code: str, test: Optional[str] = None) -> None:
|
147
|
+
_CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
|
148
|
+
_CONSOLE.print("=" * 30 + " Code " + "=" * 30)
|
149
|
+
_CONSOLE.print(
|
150
|
+
Syntax(
|
151
|
+
DefaultImports.prepend_imports(code),
|
152
|
+
"python",
|
153
|
+
theme="gruvbox-dark",
|
154
|
+
line_numbers=True,
|
155
|
+
)
|
156
|
+
)
|
157
|
+
if test:
|
158
|
+
_CONSOLE.print("=" * 30 + " Test " + "=" * 30)
|
159
|
+
_CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
|
vision_agent/agent/vision_agent.py
CHANGED
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
14
14
|
VA_CODE,
|
15
15
|
)
|
16
16
|
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
17
|
-
from vision_agent.tools import META_TOOL_DOCSTRING
|
18
17
|
from vision_agent.tools.meta_tools import (
|
18
|
+
META_TOOL_DOCSTRING,
|
19
19
|
Artifacts,
|
20
20
|
check_and_load_image,
|
21
21
|
use_extra_vision_agent_args,
|
@@ -195,8 +195,8 @@ class VisionAgent(Agent):
|
|
195
195
|
agent: Optional[LMM] = None,
|
196
196
|
verbosity: int = 0,
|
197
197
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
198
|
-
code_sandbox_runtime: Optional[str] = None,
|
199
198
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
199
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
200
200
|
) -> None:
|
201
201
|
"""Initialize the VisionAgent.
|
202
202
|
|
@@ -206,13 +206,18 @@ class VisionAgent(Agent):
|
|
206
206
|
verbosity (int): The verbosity level of the agent.
|
207
207
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
208
208
|
artifacts file.
|
209
|
-
|
209
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
210
|
+
function to send intermediate update messages.
|
211
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
212
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
213
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
214
|
+
object is provided it will use that.
|
210
215
|
"""
|
211
216
|
|
212
217
|
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
213
218
|
self.max_iterations = 12
|
214
219
|
self.verbosity = verbosity
|
215
|
-
self.code_sandbox_runtime = code_sandbox_runtime
|
220
|
+
self.code_interpreter = code_interpreter
|
216
221
|
self.callback_message = callback_message
|
217
222
|
if self.verbosity >= 1:
|
218
223
|
_LOGGER.setLevel(logging.INFO)
|
@@ -230,7 +235,7 @@ class VisionAgent(Agent):
|
|
230
235
|
input: Union[str, List[Message]],
|
231
236
|
media: Optional[Union[str, Path]] = None,
|
232
237
|
artifacts: Optional[Artifacts] = None,
|
233
|
-
) ->
|
238
|
+
) -> str:
|
234
239
|
"""Chat with VisionAgent and get the conversation response.
|
235
240
|
|
236
241
|
Parameters:
|
@@ -247,10 +252,28 @@ class VisionAgent(Agent):
|
|
247
252
|
input = [{"role": "user", "content": input}]
|
248
253
|
if media is not None:
|
249
254
|
input[0]["media"] = [media]
|
250
|
-
results, _ = self.
|
251
|
-
return results
|
255
|
+
results, _ = self.chat_with_artifacts(input, artifacts)
|
256
|
+
return results[-1]["content"] # type: ignore
|
252
257
|
|
253
|
-
def
|
258
|
+
def chat(
|
259
|
+
self,
|
260
|
+
chat: List[Message],
|
261
|
+
) -> List[Message]:
|
262
|
+
"""Chat with VisionAgent, it will use code to execute actions to accomplish
|
263
|
+
its tasks.
|
264
|
+
|
265
|
+
Parameters:
|
266
|
+
chat (List[Message]): A conversation in the format of:
|
267
|
+
[{"role": "user", "content": "describe your task here..."}]
|
268
|
+
or if it contains media files, it should be in the format of:
|
269
|
+
[{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
|
270
|
+
|
271
|
+
Returns:
|
272
|
+
List[Message]: The conversation response.
|
273
|
+
"""
|
274
|
+
return self.chat_with_artifacts(chat)[0]
|
275
|
+
|
276
|
+
def chat_with_artifacts(
|
254
277
|
self,
|
255
278
|
chat: List[Message],
|
256
279
|
artifacts: Optional[Artifacts] = None,
|
@@ -284,9 +307,16 @@ class VisionAgent(Agent):
|
|
284
307
|
# this is setting remote artifacts path
|
285
308
|
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
|
286
309
|
|
287
|
-
|
288
|
-
|
289
|
-
|
310
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
311
|
+
code_interpreter = (
|
312
|
+
self.code_interpreter
|
313
|
+
if self.code_interpreter is not None
|
314
|
+
and not isinstance(self.code_interpreter, str)
|
315
|
+
else CodeInterpreterFactory.new_instance(
|
316
|
+
code_sandbox_runtime=self.code_interpreter,
|
317
|
+
)
|
318
|
+
)
|
319
|
+
with code_interpreter:
|
290
320
|
orig_chat = copy.deepcopy(chat)
|
291
321
|
int_chat = copy.deepcopy(chat)
|
292
322
|
last_user_message = chat[-1]
|
@@ -472,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
472
502
|
agent: Optional[LMM] = None,
|
473
503
|
verbosity: int = 0,
|
474
504
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
475
|
-
code_sandbox_runtime: Optional[str] = None,
|
476
505
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
506
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
477
507
|
) -> None:
|
478
508
|
"""Initialize the VisionAgent using OpenAI LMMs.
|
479
509
|
|
@@ -483,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
|
|
483
513
|
verbosity (int): The verbosity level of the agent.
|
484
514
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
485
515
|
artifacts file.
|
486
|
-
|
516
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
517
|
+
function to send intermediate update messages.
|
518
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
519
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
520
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
521
|
+
object is provided it will use that.
|
487
522
|
"""
|
488
523
|
|
489
524
|
agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
@@ -491,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
|
|
491
526
|
agent,
|
492
527
|
verbosity,
|
493
528
|
local_artifacts_path,
|
494
|
-
code_sandbox_runtime,
|
495
529
|
callback_message,
|
530
|
+
code_interpreter,
|
496
531
|
)
|
497
532
|
|
498
533
|
|
@@ -502,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
|
|
502
537
|
agent: Optional[LMM] = None,
|
503
538
|
verbosity: int = 0,
|
504
539
|
local_artifacts_path: Optional[Union[str, Path]] = None,
|
505
|
-
code_sandbox_runtime: Optional[str] = None,
|
506
540
|
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
541
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
507
542
|
) -> None:
|
508
543
|
"""Initialize the VisionAgent using Anthropic LMMs.
|
509
544
|
|
@@ -513,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
|
|
513
548
|
verbosity (int): The verbosity level of the agent.
|
514
549
|
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
515
550
|
artifacts file.
|
516
|
-
|
551
|
+
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
552
|
+
function to send intermediate update messages.
|
553
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
554
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
555
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
556
|
+
object is provided it will use that.
|
517
557
|
"""
|
518
558
|
|
519
559
|
agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
@@ -521,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
|
|
521
561
|
agent,
|
522
562
|
verbosity,
|
523
563
|
local_artifacts_path,
|
524
|
-
code_sandbox_runtime,
|
525
564
|
callback_message,
|
565
|
+
code_interpreter,
|
526
566
|
)
|