vision-agent 0.2.160__py3-none-any.whl → 0.2.162__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
7
7
  OpenAIVisionAgentCoder,
8
8
  VisionAgentCoder,
9
9
  )
10
+ from .vision_agent_planner import (
11
+ AnthropicVisionAgentPlanner,
12
+ AzureVisionAgentPlanner,
13
+ OllamaVisionAgentPlanner,
14
+ OpenAIVisionAgentPlanner,
15
+ PlanContext,
16
+ VisionAgentPlanner,
17
+ )
@@ -2,10 +2,17 @@ import json
2
2
  import logging
3
3
  import re
4
4
  import sys
5
- from typing import Any, Dict, Optional
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from rich.console import Console
8
+ from rich.style import Style
9
+ from rich.syntax import Syntax
10
+
11
+ import vision_agent.tools as T
6
12
 
7
13
  logging.basicConfig(stream=sys.stdout)
8
14
  _LOGGER = logging.getLogger(__name__)
15
+ _CONSOLE = Console()
9
16
 
10
17
 
11
18
  def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
@@ -41,11 +48,16 @@ def _strip_markdown_code(inp_str: str) -> str:
41
48
 
42
49
  def extract_json(json_str: str) -> Dict[str, Any]:
43
50
  json_str_mod = json_str.replace("\n", " ").strip()
44
- json_str_mod = json_str_mod.replace("'", '"')
45
51
  json_str_mod = json_str_mod.replace(": True", ": true").replace(
46
52
  ": False", ": false"
47
53
  )
48
54
 
55
+ # sometimes the json is in single quotes
56
+ try:
57
+ return json.loads(json_str_mod.replace("'", '"')) # type: ignore
58
+ except json.JSONDecodeError:
59
+ pass
60
+
49
61
  try:
50
62
  return json.loads(json_str_mod) # type: ignore
51
63
  except json.JSONDecodeError:
@@ -83,3 +95,65 @@ def remove_installs_from_code(code: str) -> str:
83
95
  pattern = r"\n!pip install.*?(\n|\Z)\n"
84
96
  code = re.sub(pattern, "", code, flags=re.DOTALL)
85
97
  return code
98
+
99
+
100
def format_memory(memory: List[Dict[str, str]]) -> str:
    """Render previous code/feedback entries as one text block.

    Each entry becomes a "### Feedback i" section holding the code inside a
    fenced python block, the feedback text and, when the entry has an "edits"
    key, the edits.

    Parameters:
        memory (List[Dict[str, str]]): Entries with a "code" key and a
            "feedback" key, plus an optional "edits" key.

    Returns:
        str: The concatenated sections in input order, or an empty string when
            memory is empty.
    """
    sections = []
    for i, m in enumerate(memory):
        code = m["code"]
        # The closing fence has to start on its own line; without this, code
        # that lacks a trailing newline ends up as `last_line```` and the
        # fenced block is never terminated properly.
        if code and not code.endswith("\n"):
            code += "\n"

        section = (
            f"### Feedback {i}:\n"
            f"Code {i}:\n```python\n{code}```\n\n"
            f"Feedback {i}: {m['feedback']}\n\n"
        )
        if "edits" in m:
            section += f"Edits {i}:\n{m['edits']}\n"
        sections.append(section + "\n")

    # Join once instead of growing a string with += on every iteration.
    return "".join(sections)
111
+
112
+
113
def format_plans(plans: Dict[str, Any]) -> str:
    """Render a mapping of plans as text, one bulleted block per plan.

    Every plan is written as "<key>: <thoughts>" followed by one " -" bullet
    line per instruction.

    Parameters:
        plans (Dict[str, Any]): Maps a plan name to a dict with a "thoughts"
            entry and an "instructions" entry. "instructions" is a list of
            strings; a single bare string is accepted and treated as one
            instruction.

    Returns:
        str: The formatted plans in the mapping's iteration order, or an empty
            string when plans is empty.
    """
    parts = []
    for name, plan in plans.items():
        instructions = plan["instructions"]
        # A bare string would otherwise be iterated character by character and
        # produce one bullet per letter.
        if isinstance(instructions, str):
            instructions = [instructions]

        parts.append("\n" + f"{name}: {plan['thoughts']}\n")
        parts.append(" -" + "\n -".join(instructions))

    return "".join(parts)
120
+
121
+
122
class DefaultImports:
    """Holds the import statements that are placed in front of executed code."""

    # Statements emitted first, ahead of the entries in T.__new_tools__.
    common_imports = [
        "import os",
        "import numpy as np",
        "from vision_agent.tools import *",
        "from typing import *",
        "from pillow_heif import register_heif_opener",
        "register_heif_opener()",
    ]

    @staticmethod
    def to_code_string() -> str:
        """Return the common imports plus T.__new_tools__, one entry per line."""
        import_lines = DefaultImports.common_imports + T.__new_tools__
        return "\n".join(import_lines)

    @staticmethod
    def prepend_imports(code: str) -> str:
        """Return code with the default imports and a blank line put in front.

        NOTE: be sure to run this method after the custom tools have been
        registered.
        """
        header = DefaultImports.to_code_string()
        return "\n\n".join((header, code))
144
+
145
+
146
def print_code(title: str, code: str, test: Optional[str] = None) -> None:
    """Print a titled, syntax-highlighted listing of code to the console.

    The code is shown with the default imports prepended. When test is given
    and non-empty, it is printed as a second highlighted listing.

    Parameters:
        title (str): Heading printed above the listing.
        code (str): Python source to display.
        test (Optional[str]): Python test source to display after the code.
    """
    rule = "=" * 30
    title_style = Style(bgcolor="dark_orange3", bold=True)

    _CONSOLE.print(title, style=title_style)
    _CONSOLE.print(rule + " Code " + rule)
    code_listing = Syntax(
        DefaultImports.prepend_imports(code),
        "python",
        theme="gruvbox-dark",
        line_numbers=True,
    )
    _CONSOLE.print(code_listing)

    if not test:
        return

    _CONSOLE.print(rule + " Test " + rule)
    test_listing = Syntax(test, "python", theme="gruvbox-dark", line_numbers=True)
    _CONSOLE.print(test_listing)
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
14
14
  VA_CODE,
15
15
  )
16
16
  from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
17
- from vision_agent.tools import META_TOOL_DOCSTRING
18
17
  from vision_agent.tools.meta_tools import (
18
+ META_TOOL_DOCSTRING,
19
19
  Artifacts,
20
20
  check_and_load_image,
21
21
  use_extra_vision_agent_args,
@@ -195,8 +195,8 @@ class VisionAgent(Agent):
195
195
  agent: Optional[LMM] = None,
196
196
  verbosity: int = 0,
197
197
  local_artifacts_path: Optional[Union[str, Path]] = None,
198
- code_sandbox_runtime: Optional[str] = None,
199
198
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
199
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
200
200
  ) -> None:
201
201
  """Initialize the VisionAgent.
202
202
 
@@ -206,13 +206,18 @@ class VisionAgent(Agent):
206
206
  verbosity (int): The verbosity level of the agent.
207
207
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
208
208
  artifacts file.
209
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
209
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
210
+ function to send intermediate update messages.
211
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
212
+ it can be one of: None, "local" or "e2b". If None, it will read from
213
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
214
+ object is provided it will use that.
210
215
  """
211
216
 
212
217
  self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
213
218
  self.max_iterations = 12
214
219
  self.verbosity = verbosity
215
- self.code_sandbox_runtime = code_sandbox_runtime
220
+ self.code_interpreter = code_interpreter
216
221
  self.callback_message = callback_message
217
222
  if self.verbosity >= 1:
218
223
  _LOGGER.setLevel(logging.INFO)
@@ -230,7 +235,7 @@ class VisionAgent(Agent):
230
235
  input: Union[str, List[Message]],
231
236
  media: Optional[Union[str, Path]] = None,
232
237
  artifacts: Optional[Artifacts] = None,
233
- ) -> List[Message]:
238
+ ) -> str:
234
239
  """Chat with VisionAgent and get the conversation response.
235
240
 
236
241
  Parameters:
@@ -247,10 +252,28 @@ class VisionAgent(Agent):
247
252
  input = [{"role": "user", "content": input}]
248
253
  if media is not None:
249
254
  input[0]["media"] = [media]
250
- results, _ = self.chat_with_code(input, artifacts)
251
- return results
255
+ results, _ = self.chat_with_artifacts(input, artifacts)
256
+ return results[-1]["content"] # type: ignore
252
257
 
253
- def chat_with_code(
258
+ def chat(
259
+ self,
260
+ chat: List[Message],
261
+ ) -> List[Message]:
262
+ """Chat with VisionAgent, it will use code to execute actions to accomplish
263
+ its tasks.
264
+
265
+ Parameters:
266
+ chat (List[Message]): A conversation in the format of:
267
+ [{"role": "user", "content": "describe your task here..."}]
268
+ or if it contains media files, it should be in the format of:
269
+ [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
270
+
271
+ Returns:
272
+ List[Message]: The conversation response.
273
+ """
274
+ return self.chat_with_artifacts(chat)[0]
275
+
276
+ def chat_with_artifacts(
254
277
  self,
255
278
  chat: List[Message],
256
279
  artifacts: Optional[Artifacts] = None,
@@ -284,9 +307,16 @@ class VisionAgent(Agent):
284
307
  # this is setting remote artifacts path
285
308
  artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
286
309
 
287
- with CodeInterpreterFactory.new_instance(
288
- code_sandbox_runtime=self.code_sandbox_runtime,
289
- ) as code_interpreter:
310
+ # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
311
+ code_interpreter = (
312
+ self.code_interpreter
313
+ if self.code_interpreter is not None
314
+ and not isinstance(self.code_interpreter, str)
315
+ else CodeInterpreterFactory.new_instance(
316
+ code_sandbox_runtime=self.code_interpreter,
317
+ )
318
+ )
319
+ with code_interpreter:
290
320
  orig_chat = copy.deepcopy(chat)
291
321
  int_chat = copy.deepcopy(chat)
292
322
  last_user_message = chat[-1]
@@ -472,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
472
502
  agent: Optional[LMM] = None,
473
503
  verbosity: int = 0,
474
504
  local_artifacts_path: Optional[Union[str, Path]] = None,
475
- code_sandbox_runtime: Optional[str] = None,
476
505
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
506
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
477
507
  ) -> None:
478
508
  """Initialize the VisionAgent using OpenAI LMMs.
479
509
 
@@ -483,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
483
513
  verbosity (int): The verbosity level of the agent.
484
514
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
485
515
  artifacts file.
486
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
516
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
517
+ function to send intermediate update messages.
518
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
519
+ it can be one of: None, "local" or "e2b". If None, it will read from
520
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
521
+ object is provided it will use that.
487
522
  """
488
523
 
489
524
  agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
@@ -491,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
491
526
  agent,
492
527
  verbosity,
493
528
  local_artifacts_path,
494
- code_sandbox_runtime,
495
529
  callback_message,
530
+ code_interpreter,
496
531
  )
497
532
 
498
533
 
@@ -502,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
502
537
  agent: Optional[LMM] = None,
503
538
  verbosity: int = 0,
504
539
  local_artifacts_path: Optional[Union[str, Path]] = None,
505
- code_sandbox_runtime: Optional[str] = None,
506
540
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
541
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
507
542
  ) -> None:
508
543
  """Initialize the VisionAgent using Anthropic LMMs.
509
544
 
@@ -513,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
513
548
  verbosity (int): The verbosity level of the agent.
514
549
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
515
550
  artifacts file.
516
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
551
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
552
+ function to send intermediate update messages.
553
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
554
+ it can be one of: None, "local" or "e2b". If None, it will read from
555
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
556
+ object is provided it will use that.
517
557
  """
518
558
 
519
559
  agent = AnthropicLMM(temperature=0.0) if agent is None else agent
@@ -521,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
521
561
  agent,
522
562
  verbosity,
523
563
  local_artifacts_path,
524
- code_sandbox_runtime,
525
564
  callback_message,
565
+ code_interpreter,
526
566
  )