vision-agent 0.2.160__py3-none-any.whl → 0.2.162__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
7
7
  OpenAIVisionAgentCoder,
8
8
  VisionAgentCoder,
9
9
  )
10
+ from .vision_agent_planner import (
11
+ AnthropicVisionAgentPlanner,
12
+ AzureVisionAgentPlanner,
13
+ OllamaVisionAgentPlanner,
14
+ OpenAIVisionAgentPlanner,
15
+ PlanContext,
16
+ VisionAgentPlanner,
17
+ )
@@ -2,10 +2,17 @@ import json
2
2
  import logging
3
3
  import re
4
4
  import sys
5
- from typing import Any, Dict, Optional
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from rich.console import Console
8
+ from rich.style import Style
9
+ from rich.syntax import Syntax
10
+
11
+ import vision_agent.tools as T
6
12
 
7
13
  logging.basicConfig(stream=sys.stdout)
8
14
  _LOGGER = logging.getLogger(__name__)
15
+ _CONSOLE = Console()
9
16
 
10
17
 
11
18
  def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
@@ -41,11 +48,16 @@ def _strip_markdown_code(inp_str: str) -> str:
41
48
 
42
49
  def extract_json(json_str: str) -> Dict[str, Any]:
43
50
  json_str_mod = json_str.replace("\n", " ").strip()
44
- json_str_mod = json_str_mod.replace("'", '"')
45
51
  json_str_mod = json_str_mod.replace(": True", ": true").replace(
46
52
  ": False", ": false"
47
53
  )
48
54
 
55
+ # sometimes the json is in single quotes
56
+ try:
57
+ return json.loads(json_str_mod.replace("'", '"')) # type: ignore
58
+ except json.JSONDecodeError:
59
+ pass
60
+
49
61
  try:
50
62
  return json.loads(json_str_mod) # type: ignore
51
63
  except json.JSONDecodeError:
@@ -83,3 +95,65 @@ def remove_installs_from_code(code: str) -> str:
83
95
  pattern = r"\n!pip install.*?(\n|\Z)\n"
84
96
  code = re.sub(pattern, "", code, flags=re.DOTALL)
85
97
  return code
98
+
99
+
100
def format_memory(memory: List[Dict[str, str]]) -> str:
    """Render previous code/feedback attempts as a single text block.

    Parameters:
        memory (List[Dict[str, str]]): One entry per attempt. Every entry must
            provide the keys "code" and "feedback"; the key "edits" is optional
            and is rendered only when present.

    Returns:
        str: One "### Feedback {i}" section per entry, in input order. An empty
            `memory` yields an empty string.

    Raises:
        KeyError: If an entry is missing "code" or "feedback".
    """
    pieces = []
    for i, m in enumerate(memory):
        code = f"{m['code']}"
        # Keep the closing fence on its own line: without a trailing newline the
        # fence would be appended directly to the last line of code.
        if code and not code.endswith("\n"):
            code += "\n"

        pieces.append(f"### Feedback {i}:\n")
        pieces.append(f"Code {i}:\n```python\n{code}```\n\n")
        pieces.append(f"Feedback {i}: {m['feedback']}\n\n")
        if "edits" in m:
            pieces.append(f"Edits {i}:\n{m['edits']}\n")
        pieces.append("\n")

    return "".join(pieces)
111
+
112
+
113
def format_plans(plans: Dict[str, Any]) -> str:
    """Render a mapping of plans as text, one bulleted instruction per line.

    Parameters:
        plans (Dict[str, Any]): Maps a plan name to a dict holding the keys
            "thoughts" and "instructions". "instructions" is normally a list
            of steps; a single bare string is treated as one step.

    Returns:
        str: For each plan, a "{name}: {thoughts}" line followed by its
            instructions as " -" bullets. An empty `plans` yields an empty
            string.

    Raises:
        KeyError: If a plan is missing "thoughts" or "instructions".
    """
    parts = []
    for name, plan in plans.items():
        steps = plan["instructions"]
        # A bare string would otherwise be iterated character by character,
        # producing one bullet per character.
        if isinstance(steps, str):
            steps = [steps]

        parts.append("\n" + f"{name}: {plan['thoughts']}\n")
        # str() keeps join from raising on non-string steps (e.g. numbers).
        parts.append(" -" + "\n -".join(str(step) for step in steps))

    return "".join(parts)
120
+
121
+
122
class DefaultImports:
    """Holds the import statements placed ahead of code that gets executed."""

    # Import lines shared by every snippet. The lines in ``T.__new_tools__``
    # are appended to these when the code string is built.
    common_imports = [
        "import os",
        "import numpy as np",
        "from vision_agent.tools import *",
        "from typing import *",
        "from pillow_heif import register_heif_opener",
        "register_heif_opener()",
    ]

    @staticmethod
    def to_code_string() -> str:
        """Return the common imports plus ``T.__new_tools__``, one per line."""
        import_lines = DefaultImports.common_imports + T.__new_tools__
        return "\n".join(import_lines)

    @staticmethod
    def prepend_imports(code: str) -> str:
        """Return `code` with the default imports and a blank line before it.

        NOTE: be sure to run this method after the custom tools have been registered.
        """
        header = DefaultImports.to_code_string()
        return "\n\n".join((header, code))
144
+
145
+
146
def print_code(title: str, code: str, test: Optional[str] = None) -> None:
    """Print a titled, syntax-highlighted view of code and, optionally, its test.

    Parameters:
        title (str): Heading printed above the code.
        code (str): Python source to display; the default imports are
            prepended before it is rendered.
        test (Optional[str]): Python test source shown in a second section.
            Nothing extra is printed when it is None or empty.
    """
    rule = "=" * 30
    syntax_options = {"theme": "gruvbox-dark", "line_numbers": True}

    title_style = Style(bgcolor="dark_orange3", bold=True)
    _CONSOLE.print(title, style=title_style)

    _CONSOLE.print(rule + " Code " + rule)
    full_code = DefaultImports.prepend_imports(code)
    _CONSOLE.print(Syntax(full_code, "python", **syntax_options))

    if not test:
        return

    _CONSOLE.print(rule + " Test " + rule)
    _CONSOLE.print(Syntax(test, "python", **syntax_options))
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
14
14
  VA_CODE,
15
15
  )
16
16
  from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
17
- from vision_agent.tools import META_TOOL_DOCSTRING
18
17
  from vision_agent.tools.meta_tools import (
18
+ META_TOOL_DOCSTRING,
19
19
  Artifacts,
20
20
  check_and_load_image,
21
21
  use_extra_vision_agent_args,
@@ -195,8 +195,8 @@ class VisionAgent(Agent):
195
195
  agent: Optional[LMM] = None,
196
196
  verbosity: int = 0,
197
197
  local_artifacts_path: Optional[Union[str, Path]] = None,
198
- code_sandbox_runtime: Optional[str] = None,
199
198
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
199
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
200
200
  ) -> None:
201
201
  """Initialize the VisionAgent.
202
202
 
@@ -206,13 +206,18 @@ class VisionAgent(Agent):
206
206
  verbosity (int): The verbosity level of the agent.
207
207
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
208
208
  artifacts file.
209
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
209
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
210
+ function to send intermediate update messages.
211
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
212
+ it can be one of: None, "local" or "e2b". If None, it will read from
213
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
214
+ object is provided it will use that.
210
215
  """
211
216
 
212
217
  self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
213
218
  self.max_iterations = 12
214
219
  self.verbosity = verbosity
215
- self.code_sandbox_runtime = code_sandbox_runtime
220
+ self.code_interpreter = code_interpreter
216
221
  self.callback_message = callback_message
217
222
  if self.verbosity >= 1:
218
223
  _LOGGER.setLevel(logging.INFO)
@@ -230,7 +235,7 @@ class VisionAgent(Agent):
230
235
  input: Union[str, List[Message]],
231
236
  media: Optional[Union[str, Path]] = None,
232
237
  artifacts: Optional[Artifacts] = None,
233
- ) -> List[Message]:
238
+ ) -> str:
234
239
  """Chat with VisionAgent and get the conversation response.
235
240
 
236
241
  Parameters:
@@ -247,10 +252,28 @@ class VisionAgent(Agent):
247
252
  input = [{"role": "user", "content": input}]
248
253
  if media is not None:
249
254
  input[0]["media"] = [media]
250
- results, _ = self.chat_with_code(input, artifacts)
251
- return results
255
+ results, _ = self.chat_with_artifacts(input, artifacts)
256
+ return results[-1]["content"] # type: ignore
252
257
 
253
- def chat_with_code(
258
def chat(
    self,
    chat: List[Message],
) -> List[Message]:
    """Chat with VisionAgent; it uses code to execute actions that accomplish
    its tasks.

    Parameters:
        chat (List[Message]): A conversation in the format of:
            [{"role": "user", "content": "describe your task here..."}]
            or if it contains media files, it should be in the format of:
            [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]

    Returns:
        List[Message]: The conversation response.
    """
    outcome = self.chat_with_artifacts(chat)
    # Only the first element of the result is handed back to the caller.
    return outcome[0]
275
+
276
+ def chat_with_artifacts(
254
277
  self,
255
278
  chat: List[Message],
256
279
  artifacts: Optional[Artifacts] = None,
@@ -284,9 +307,16 @@ class VisionAgent(Agent):
284
307
  # this is setting remote artifacts path
285
308
  artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
286
309
 
287
- with CodeInterpreterFactory.new_instance(
288
- code_sandbox_runtime=self.code_sandbox_runtime,
289
- ) as code_interpreter:
310
+ # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
311
+ code_interpreter = (
312
+ self.code_interpreter
313
+ if self.code_interpreter is not None
314
+ and not isinstance(self.code_interpreter, str)
315
+ else CodeInterpreterFactory.new_instance(
316
+ code_sandbox_runtime=self.code_interpreter,
317
+ )
318
+ )
319
+ with code_interpreter:
290
320
  orig_chat = copy.deepcopy(chat)
291
321
  int_chat = copy.deepcopy(chat)
292
322
  last_user_message = chat[-1]
@@ -472,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
472
502
  agent: Optional[LMM] = None,
473
503
  verbosity: int = 0,
474
504
  local_artifacts_path: Optional[Union[str, Path]] = None,
475
- code_sandbox_runtime: Optional[str] = None,
476
505
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
506
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
477
507
  ) -> None:
478
508
  """Initialize the VisionAgent using OpenAI LMMs.
479
509
 
@@ -483,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
483
513
  verbosity (int): The verbosity level of the agent.
484
514
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
485
515
  artifacts file.
486
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
516
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
517
+ function to send intermediate update messages.
518
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
519
+ it can be one of: None, "local" or "e2b". If None, it will read from
520
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
521
+ object is provided it will use that.
487
522
  """
488
523
 
489
524
  agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
@@ -491,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
491
526
  agent,
492
527
  verbosity,
493
528
  local_artifacts_path,
494
- code_sandbox_runtime,
495
529
  callback_message,
530
+ code_interpreter,
496
531
  )
497
532
 
498
533
 
@@ -502,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
502
537
  agent: Optional[LMM] = None,
503
538
  verbosity: int = 0,
504
539
  local_artifacts_path: Optional[Union[str, Path]] = None,
505
- code_sandbox_runtime: Optional[str] = None,
506
540
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
541
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
507
542
  ) -> None:
508
543
  """Initialize the VisionAgent using Anthropic LMMs.
509
544
 
@@ -513,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
513
548
  verbosity (int): The verbosity level of the agent.
514
549
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
515
550
  artifacts file.
516
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
551
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
552
+ function to send intermediate update messages.
553
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
554
+ it can be one of: None, "local" or "e2b". If None, it will read from
555
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
556
+ object is provided it will use that.
517
557
  """
518
558
 
519
559
  agent = AnthropicLMM(temperature=0.0) if agent is None else agent
@@ -521,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
521
561
  agent,
522
562
  verbosity,
523
563
  local_artifacts_path,
524
- code_sandbox_runtime,
525
564
  callback_message,
565
+ code_interpreter,
526
566
  )