vision-agent 0.2.29__py3-none-any.whl → 0.2.31__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  from .agent import Agent
2
2
  from .agent_coder import AgentCoder
3
+ from .data_interpreter import DataInterpreter
3
4
  from .easytool import EasyTool
5
+ from .easytool_v2 import EasyToolV2
4
6
  from .reflexion import Reflexion
5
7
  from .vision_agent import VisionAgent
6
- from .vision_agent_v2 import VisionAgentV2
7
- from .vision_agent_v3 import VisionAgentV3
@@ -1,6 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from pathlib import Path
3
- from typing import Dict, List, Optional, Union, Any
3
+ from typing import Any, Dict, List, Optional, Union
4
4
 
5
5
 
6
6
  class Agent(ABC):
@@ -8,7 +8,7 @@ class Agent(ABC):
8
8
  def __call__(
9
9
  self,
10
10
  input: Union[List[Dict[str, str]], str],
11
- image: Optional[Union[str, Path]] = None,
11
+ media: Optional[Union[str, Path]] = None,
12
12
  ) -> str:
13
13
  pass
14
14
 
@@ -3,7 +3,7 @@ import logging
3
3
  import os
4
4
  import sys
5
5
  from pathlib import Path
6
- from typing import Dict, List, Optional, Union, Any
6
+ from typing import Any, Dict, List, Optional, Union
7
7
 
8
8
  from rich.console import Console
9
9
  from rich.syntax import Syntax
@@ -18,7 +18,7 @@ from vision_agent.agent.agent_coder_prompts import (
18
18
  )
19
19
  from vision_agent.llm import LLM, OpenAILLM
20
20
  from vision_agent.lmm import LMM, OpenAILMM
21
- from vision_agent.tools.tools_v2 import TOOL_DOCSTRING, UTILITIES_DOCSTRING
21
+ from vision_agent.tools import TOOL_DOCSTRING, UTILITIES_DOCSTRING
22
22
  from vision_agent.utils import Execute
23
23
 
24
24
  IMPORT_HELPER = """
@@ -38,7 +38,7 @@ import numpy as np
38
38
  import string
39
39
  from typing import *
40
40
  from collections import *
41
- from vision_agent.tools.tools_v2 import *
41
+ from vision_agent.tools import *
42
42
  """
43
43
  logging.basicConfig(stream=sys.stdout)
44
44
  _LOGGER = logging.getLogger(__name__)
@@ -150,20 +150,20 @@ class AgentCoder(Agent):
150
150
  def __call__(
151
151
  self,
152
152
  input: Union[List[Dict[str, str]], str],
153
- image: Optional[Union[str, Path]] = None,
153
+ media: Optional[Union[str, Path]] = None,
154
154
  ) -> str:
155
155
  if isinstance(input, str):
156
156
  input = [{"role": "user", "content": input}]
157
- return self.chat(input, image)
157
+ return self.chat(input, media)
158
158
 
159
159
  def chat(
160
160
  self,
161
161
  input: List[Dict[str, str]],
162
- image: Optional[Union[str, Path]] = None,
162
+ media: Optional[Union[str, Path]] = None,
163
163
  ) -> str:
164
164
  question = input[0]["content"]
165
- if image:
166
- question += f" Input file path: {os.path.abspath(image)}"
165
+ if media:
166
+ question += f" Input file path: {os.path.abspath(media)}"
167
167
 
168
168
  code = ""
169
169
  feedback = ""
@@ -10,7 +10,7 @@ from rich.syntax import Syntax
10
10
  from tabulate import tabulate
11
11
 
12
12
  from vision_agent.agent import Agent
13
- from vision_agent.agent.vision_agent_v2_prompts import (
13
+ from vision_agent.agent.data_interpreter_prompts import (
14
14
  CODE,
15
15
  CODE_SYS_MSG,
16
16
  DEBUG,
@@ -25,7 +25,7 @@ from vision_agent.agent.vision_agent_v2_prompts import (
25
25
  USER_REQ_SUBTASK_WM_CONTEXT,
26
26
  )
27
27
  from vision_agent.llm import LLM, OpenAILLM
28
- from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
28
+ from vision_agent.tools import TOOL_DESCRIPTIONS, TOOLS_DF
29
29
  from vision_agent.utils import Execute, Sim
30
30
 
31
31
  logging.basicConfig(level=logging.INFO)
@@ -331,11 +331,11 @@ def run_plan(
331
331
  return current_code, current_test, plan, working_memory
332
332
 
333
333
 
334
- class VisionAgentV2(Agent):
335
- """Vision Agent is an AI agentic framework geared towards outputting Python code to
336
- solve vision tasks. It is inspired by MetaGPT's Data Interpreter
337
- https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it
338
- generate code:
334
+ class DataInterpreter(Agent):
335
+ """This version of Data Interpreter is an AI agentic framework geared towards
336
+ outputting Python code to solve vision tasks. It is inspired by MetaGPT's Data
337
+ Interpreter https://arxiv.org/abs/2402.18679. This version of Data Interpreter has
338
+ several key features to help it generate code:
339
339
 
340
340
  - A planner to generate a plan of tasks to solve a user requirement. The planner
341
341
  can output code tasks or test tasks, where test tasks are used to verify the code.
@@ -379,29 +379,29 @@ class VisionAgentV2(Agent):
379
379
  def __call__(
380
380
  self,
381
381
  input: Union[List[Dict[str, str]], str],
382
- image: Optional[Union[str, Path]] = None,
382
+ media: Optional[Union[str, Path]] = None,
383
383
  plan: Optional[List[Dict[str, Any]]] = None,
384
384
  ) -> str:
385
385
  if isinstance(input, str):
386
386
  input = [{"role": "user", "content": input}]
387
- results = self.chat_with_workflow(input, image, plan)
387
+ results = self.chat_with_workflow(input, media, plan)
388
388
  return results["code"] # type: ignore
389
389
 
390
390
  @traceable
391
391
  def chat_with_workflow(
392
392
  self,
393
393
  chat: List[Dict[str, str]],
394
- image: Optional[Union[str, Path]] = None,
394
+ media: Optional[Union[str, Path]] = None,
395
395
  plan: Optional[List[Dict[str, Any]]] = None,
396
396
  ) -> Dict[str, Any]:
397
397
  if len(chat) == 0:
398
398
  raise ValueError("Input cannot be empty.")
399
399
 
400
- if image is not None:
400
+ if media is not None:
401
401
  # append file names to all user messages
402
402
  for chat_i in chat:
403
403
  if chat_i["role"] == "user":
404
- chat_i["content"] += f" Image name {image}"
404
+ chat_i["content"] += f" Image name {media}"
405
405
 
406
406
  working_code = ""
407
407
  if plan is not None:
@@ -74,15 +74,15 @@ CODE = """
74
74
 
75
75
  # Constraints
76
76
  - Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'.
77
- - Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
77
+ - Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools import *` import.
78
78
  - You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
79
- - Use the `save_json` function from `vision_agent.tools.tools_v2` to save your output as a json file.
79
+ - Use the `save_json` function from `vision_agent.tools` to save your output as a json file.
80
80
  - Write clean, readable, and well-documented code.
81
81
 
82
82
  # Output
83
83
  While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. Output code in the following format:
84
84
  ```python
85
- from vision_agent.tools.tools_v2 imoprt *
85
+ from vision_agent.tools imoprt *
86
86
 
87
87
  # your code goes here
88
88
  ```
@@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from vision_agent.llm import LLM, OpenAILLM
8
8
  from vision_agent.lmm import LMM
9
- from vision_agent.tools import TOOLS
9
+ from vision_agent.tools.easytool_tools import TOOLS
10
10
 
11
11
  from .agent import Agent
12
12
  from .easytool_prompts import (
@@ -272,7 +272,7 @@ class EasyTool(Agent):
272
272
  def __call__(
273
273
  self,
274
274
  input: Union[List[Dict[str, str]], str],
275
- image: Optional[Union[str, Path]] = None,
275
+ media: Optional[Union[str, Path]] = None,
276
276
  ) -> str:
277
277
  """Invoke the vision agent.
278
278
 
@@ -285,14 +285,14 @@ class EasyTool(Agent):
285
285
  """
286
286
  if isinstance(input, str):
287
287
  input = [{"role": "user", "content": input}]
288
- return self.chat(input, image=image)
288
+ return self.chat(input, media=media)
289
289
 
290
290
  def chat_with_workflow(
291
- self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
291
+ self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
292
292
  ) -> Tuple[str, List[Dict]]:
293
293
  question = chat[0]["content"]
294
- if image:
295
- question += f" Image name: {image}"
294
+ if media:
295
+ question += f" Image name: {media}"
296
296
  tasks = task_decompose(
297
297
  self.task_model,
298
298
  question,
@@ -340,7 +340,7 @@ class EasyTool(Agent):
340
340
  return answer_summarize(self.answer_model, question, answers), all_tool_results
341
341
 
342
342
  def chat(
343
- self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
343
+ self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
344
344
  ) -> str:
345
- answer, _ = self.chat_with_workflow(chat, image=image)
345
+ answer, _ = self.chat_with_workflow(chat, media=media)
346
346
  return answer