vision-agent 0.2.160__tar.gz → 0.2.162__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. {vision_agent-0.2.160 → vision_agent-0.2.162}/PKG-INFO +7 -7
  2. {vision_agent-0.2.160 → vision_agent-0.2.162}/README.md +6 -6
  3. {vision_agent-0.2.160 → vision_agent-0.2.162}/pyproject.toml +1 -1
  4. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/agent/__init__.py +8 -0
  5. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/agent/agent_utils.py +76 -2
  6. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/agent/vision_agent.py +57 -17
  7. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/agent/vision_agent_coder.py +163 -489
  8. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/agent/vision_agent_coder_prompts.py +0 -203
  9. vision_agent-0.2.162/vision_agent/agent/vision_agent_planner.py +553 -0
  10. vision_agent-0.2.162/vision_agent/agent/vision_agent_planner_prompts.py +199 -0
  11. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/tools/__init__.py +0 -1
  12. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/tools/meta_tools.py +87 -5
  13. {vision_agent-0.2.160 → vision_agent-0.2.162}/LICENSE +0 -0
  14. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/__init__.py +0 -0
  15. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/agent/agent.py +0 -0
  16. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/agent/vision_agent_prompts.py +0 -0
  17. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/clients/__init__.py +0 -0
  18. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/clients/http.py +0 -0
  19. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/clients/landing_public_api.py +0 -0
  20. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/fonts/__init__.py +0 -0
  21. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  22. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/lmm/__init__.py +0 -0
  23. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/lmm/lmm.py +0 -0
  24. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/lmm/types.py +0 -0
  25. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/tools/prompts.py +0 -0
  26. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/tools/tool_utils.py +0 -0
  27. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/tools/tools.py +0 -0
  28. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/tools/tools_types.py +0 -0
  29. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/utils/execute.py +0 -0
  32. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/utils/image_utils.py +0 -0
  33. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/utils/sim.py +0 -0
  34. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/utils/type_defs.py +0 -0
  35. {vision_agent-0.2.160 → vision_agent-0.2.162}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.160
3
+ Version: 0.2.162
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -142,7 +142,7 @@ continuing, for example it may want to execute code and look at the output befor
142
142
  letting the user respond.
143
143
 
144
144
  ### Chatting and Artifacts
145
- If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`'s
145
+ If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`'s
146
146
  are a way to sync files between local and remote environments. The agent will read and
147
147
  write to the artifact object, which is just a pickle object, when it wants to save or
148
148
  load files.
@@ -159,7 +159,7 @@ with open("image.png", "rb") as f:
159
159
  artifacts["image.png"] = f.read()
160
160
 
161
161
  agent = va.agent.VisionAgent()
162
- response, artifacts = agent.chat_with_code(
162
+ response, artifacts = agent.chat_with_artifacts(
163
163
  [
164
164
  {
165
165
  "role": "user",
@@ -339,11 +339,11 @@ mode by passing in the verbose argument:
339
339
  ```
340
340
 
341
341
  ### Detailed Usage
342
- You can also have it return more information by calling `chat_with_workflow`. The format
342
+ You can also have it return more information by calling `generate_code`. The format
343
343
  of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
344
344
 
345
345
  ```python
346
- >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
346
+ >>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
347
347
  >>> print(results)
348
348
  {
349
349
  "code": "from vision_agent.tools import ..."
@@ -372,7 +372,7 @@ conv = [
372
372
  "media": ["workers.png"],
373
373
  }
374
374
  ]
375
- result = agent.chat_with_workflow(conv)
375
+ result = agent.generate_code(conv)
376
376
  code = result["code"]
377
377
  conv.append({"role": "assistant", "content": code})
378
378
  conv.append(
@@ -381,7 +381,7 @@ conv.append(
381
381
  "content": "Can you also return the number of workers wearing safety gear?",
382
382
  }
383
383
  )
384
- result = agent.chat_with_workflow(conv)
384
+ result = agent.generate_code(conv)
385
385
  ```
386
386
 
387
387
 
@@ -101,7 +101,7 @@ continuing, for example it may want to execute code and look at the output befor
101
101
  letting the user respond.
102
102
 
103
103
  ### Chatting and Artifacts
104
- If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`'s
104
+ If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`'s
105
105
  are a way to sync files between local and remote environments. The agent will read and
106
106
  write to the artifact object, which is just a pickle object, when it wants to save or
107
107
  load files.
@@ -118,7 +118,7 @@ with open("image.png", "rb") as f:
118
118
  artifacts["image.png"] = f.read()
119
119
 
120
120
  agent = va.agent.VisionAgent()
121
- response, artifacts = agent.chat_with_code(
121
+ response, artifacts = agent.chat_with_artifacts(
122
122
  [
123
123
  {
124
124
  "role": "user",
@@ -298,11 +298,11 @@ mode by passing in the verbose argument:
298
298
  ```
299
299
 
300
300
  ### Detailed Usage
301
- You can also have it return more information by calling `chat_with_workflow`. The format
301
+ You can also have it return more information by calling `generate_code`. The format
302
302
  of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
303
303
 
304
304
  ```python
305
- >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
305
+ >>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
306
306
  >>> print(results)
307
307
  {
308
308
  "code": "from vision_agent.tools import ..."
@@ -331,7 +331,7 @@ conv = [
331
331
  "media": ["workers.png"],
332
332
  }
333
333
  ]
334
- result = agent.chat_with_workflow(conv)
334
+ result = agent.generate_code(conv)
335
335
  code = result["code"]
336
336
  conv.append({"role": "assistant", "content": code})
337
337
  conv.append(
@@ -340,7 +340,7 @@ conv.append(
340
340
  "content": "Can you also return the number of workers wearing safety gear?",
341
341
  }
342
342
  )
343
- result = agent.chat_with_workflow(conv)
343
+ result = agent.generate_code(conv)
344
344
  ```
345
345
 
346
346
 
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.160"
7
+ version = "0.2.162"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -7,3 +7,11 @@ from .vision_agent_coder import (
7
7
  OpenAIVisionAgentCoder,
8
8
  VisionAgentCoder,
9
9
  )
10
+ from .vision_agent_planner import (
11
+ AnthropicVisionAgentPlanner,
12
+ AzureVisionAgentPlanner,
13
+ OllamaVisionAgentPlanner,
14
+ OpenAIVisionAgentPlanner,
15
+ PlanContext,
16
+ VisionAgentPlanner,
17
+ )
@@ -2,10 +2,17 @@ import json
2
2
  import logging
3
3
  import re
4
4
  import sys
5
- from typing import Any, Dict, Optional
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from rich.console import Console
8
+ from rich.style import Style
9
+ from rich.syntax import Syntax
10
+
11
+ import vision_agent.tools as T
6
12
 
7
13
  logging.basicConfig(stream=sys.stdout)
8
14
  _LOGGER = logging.getLogger(__name__)
15
+ _CONSOLE = Console()
9
16
 
10
17
 
11
18
  def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
@@ -41,11 +48,16 @@ def _strip_markdown_code(inp_str: str) -> str:
41
48
 
42
49
  def extract_json(json_str: str) -> Dict[str, Any]:
43
50
  json_str_mod = json_str.replace("\n", " ").strip()
44
- json_str_mod = json_str_mod.replace("'", '"')
45
51
  json_str_mod = json_str_mod.replace(": True", ": true").replace(
46
52
  ": False", ": false"
47
53
  )
48
54
 
55
+ # sometimes the json is in single quotes
56
+ try:
57
+ return json.loads(json_str_mod.replace("'", '"')) # type: ignore
58
+ except json.JSONDecodeError:
59
+ pass
60
+
49
61
  try:
50
62
  return json.loads(json_str_mod) # type: ignore
51
63
  except json.JSONDecodeError:
@@ -83,3 +95,65 @@ def remove_installs_from_code(code: str) -> str:
83
95
  pattern = r"\n!pip install.*?(\n|\Z)\n"
84
96
  code = re.sub(pattern, "", code, flags=re.DOTALL)
85
97
  return code
98
+
99
+
100
+ def format_memory(memory: List[Dict[str, str]]) -> str:
101
+ output_str = ""
102
+ for i, m in enumerate(memory):
103
+ output_str += f"### Feedback {i}:\n"
104
+ output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
105
+ output_str += f"Feedback {i}: {m['feedback']}\n\n"
106
+ if "edits" in m:
107
+ output_str += f"Edits {i}:\n{m['edits']}\n"
108
+ output_str += "\n"
109
+
110
+ return output_str
111
+
112
+
113
+ def format_plans(plans: Dict[str, Any]) -> str:
114
+ plan_str = ""
115
+ for k, v in plans.items():
116
+ plan_str += "\n" + f"{k}: {v['thoughts']}\n"
117
+ plan_str += " -" + "\n -".join([e for e in v["instructions"]])
118
+
119
+ return plan_str
120
+
121
+
122
+ class DefaultImports:
123
+ """Container for default imports used in the code execution."""
124
+
125
+ common_imports = [
126
+ "import os",
127
+ "import numpy as np",
128
+ "from vision_agent.tools import *",
129
+ "from typing import *",
130
+ "from pillow_heif import register_heif_opener",
131
+ "register_heif_opener()",
132
+ ]
133
+
134
+ @staticmethod
135
+ def to_code_string() -> str:
136
+ return "\n".join(DefaultImports.common_imports + T.__new_tools__)
137
+
138
+ @staticmethod
139
+ def prepend_imports(code: str) -> str:
140
+ """Run this method to prepend the default imports to the code.
141
+ NOTE: be sure to run this method after the custom tools have been registered.
142
+ """
143
+ return DefaultImports.to_code_string() + "\n\n" + code
144
+
145
+
146
+ def print_code(title: str, code: str, test: Optional[str] = None) -> None:
147
+ _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
148
+ _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
149
+ _CONSOLE.print(
150
+ Syntax(
151
+ DefaultImports.prepend_imports(code),
152
+ "python",
153
+ theme="gruvbox-dark",
154
+ line_numbers=True,
155
+ )
156
+ )
157
+ if test:
158
+ _CONSOLE.print("=" * 30 + " Test " + "=" * 30)
159
+ _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
14
14
  VA_CODE,
15
15
  )
16
16
  from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
17
- from vision_agent.tools import META_TOOL_DOCSTRING
18
17
  from vision_agent.tools.meta_tools import (
18
+ META_TOOL_DOCSTRING,
19
19
  Artifacts,
20
20
  check_and_load_image,
21
21
  use_extra_vision_agent_args,
@@ -195,8 +195,8 @@ class VisionAgent(Agent):
195
195
  agent: Optional[LMM] = None,
196
196
  verbosity: int = 0,
197
197
  local_artifacts_path: Optional[Union[str, Path]] = None,
198
- code_sandbox_runtime: Optional[str] = None,
199
198
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
199
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
200
200
  ) -> None:
201
201
  """Initialize the VisionAgent.
202
202
 
@@ -206,13 +206,18 @@ class VisionAgent(Agent):
206
206
  verbosity (int): The verbosity level of the agent.
207
207
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
208
208
  artifacts file.
209
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
209
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
210
+ function to send intermediate update messages.
211
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
212
+ it can be one of: None, "local" or "e2b". If None, it will read from
213
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
214
+ object is provided it will use that.
210
215
  """
211
216
 
212
217
  self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
213
218
  self.max_iterations = 12
214
219
  self.verbosity = verbosity
215
- self.code_sandbox_runtime = code_sandbox_runtime
220
+ self.code_interpreter = code_interpreter
216
221
  self.callback_message = callback_message
217
222
  if self.verbosity >= 1:
218
223
  _LOGGER.setLevel(logging.INFO)
@@ -230,7 +235,7 @@ class VisionAgent(Agent):
230
235
  input: Union[str, List[Message]],
231
236
  media: Optional[Union[str, Path]] = None,
232
237
  artifacts: Optional[Artifacts] = None,
233
- ) -> List[Message]:
238
+ ) -> str:
234
239
  """Chat with VisionAgent and get the conversation response.
235
240
 
236
241
  Parameters:
@@ -247,10 +252,28 @@ class VisionAgent(Agent):
247
252
  input = [{"role": "user", "content": input}]
248
253
  if media is not None:
249
254
  input[0]["media"] = [media]
250
- results, _ = self.chat_with_code(input, artifacts)
251
- return results
255
+ results, _ = self.chat_with_artifacts(input, artifacts)
256
+ return results[-1]["content"] # type: ignore
252
257
 
253
- def chat_with_code(
258
+ def chat(
259
+ self,
260
+ chat: List[Message],
261
+ ) -> List[Message]:
262
+ """Chat with VisionAgent, it will use code to execute actions to accomplish
263
+ its tasks.
264
+
265
+ Parameters:
266
+ chat (List[Message]): A conversation in the format of:
267
+ [{"role": "user", "content": "describe your task here..."}]
268
+ or if it contains media files, it should be in the format of:
269
+ [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
270
+
271
+ Returns:
272
+ List[Message]: The conversation response.
273
+ """
274
+ return self.chat_with_artifacts(chat)[0]
275
+
276
+ def chat_with_artifacts(
254
277
  self,
255
278
  chat: List[Message],
256
279
  artifacts: Optional[Artifacts] = None,
@@ -284,9 +307,16 @@ class VisionAgent(Agent):
284
307
  # this is setting remote artifacts path
285
308
  artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
286
309
 
287
- with CodeInterpreterFactory.new_instance(
288
- code_sandbox_runtime=self.code_sandbox_runtime,
289
- ) as code_interpreter:
310
+ # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
311
+ code_interpreter = (
312
+ self.code_interpreter
313
+ if self.code_interpreter is not None
314
+ and not isinstance(self.code_interpreter, str)
315
+ else CodeInterpreterFactory.new_instance(
316
+ code_sandbox_runtime=self.code_interpreter,
317
+ )
318
+ )
319
+ with code_interpreter:
290
320
  orig_chat = copy.deepcopy(chat)
291
321
  int_chat = copy.deepcopy(chat)
292
322
  last_user_message = chat[-1]
@@ -472,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
472
502
  agent: Optional[LMM] = None,
473
503
  verbosity: int = 0,
474
504
  local_artifacts_path: Optional[Union[str, Path]] = None,
475
- code_sandbox_runtime: Optional[str] = None,
476
505
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
506
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
477
507
  ) -> None:
478
508
  """Initialize the VisionAgent using OpenAI LMMs.
479
509
 
@@ -483,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
483
513
  verbosity (int): The verbosity level of the agent.
484
514
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
485
515
  artifacts file.
486
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
516
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
517
+ function to send intermediate update messages.
518
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
519
+ it can be one of: None, "local" or "e2b". If None, it will read from
520
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
521
+ object is provided it will use that.
487
522
  """
488
523
 
489
524
  agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
@@ -491,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
491
526
  agent,
492
527
  verbosity,
493
528
  local_artifacts_path,
494
- code_sandbox_runtime,
495
529
  callback_message,
530
+ code_interpreter,
496
531
  )
497
532
 
498
533
 
@@ -502,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
502
537
  agent: Optional[LMM] = None,
503
538
  verbosity: int = 0,
504
539
  local_artifacts_path: Optional[Union[str, Path]] = None,
505
- code_sandbox_runtime: Optional[str] = None,
506
540
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
541
+ code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
507
542
  ) -> None:
508
543
  """Initialize the VisionAgent using Anthropic LMMs.
509
544
 
@@ -513,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
513
548
  verbosity (int): The verbosity level of the agent.
514
549
  local_artifacts_path (Optional[Union[str, Path]]): The path to the local
515
550
  artifacts file.
516
- code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
551
+ callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
552
+ function to send intermediate update messages.
553
+ code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
554
+ it can be one of: None, "local" or "e2b". If None, it will read from
555
+ the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
556
+ object is provided it will use that.
517
557
  """
518
558
 
519
559
  agent = AnthropicLMM(temperature=0.0) if agent is None else agent
@@ -521,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
521
561
  agent,
522
562
  verbosity,
523
563
  local_artifacts_path,
524
- code_sandbox_runtime,
525
564
  callback_message,
565
+ code_interpreter,
526
566
  )