vision-agent 0.2.30__tar.gz → 0.2.31__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. vision_agent-0.2.31/PKG-INFO +175 -0
  2. vision_agent-0.2.31/README.md +141 -0
  3. {vision_agent-0.2.30 → vision_agent-0.2.31}/pyproject.toml +1 -1
  4. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/__init__.py +2 -2
  5. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/agent.py +1 -1
  6. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/agent_coder.py +7 -7
  7. vision_agent-0.2.30/vision_agent/agent/vision_agent_v2.py → vision_agent-0.2.31/vision_agent/agent/data_interpreter.py +12 -12
  8. vision_agent-0.2.30/vision_agent/agent/vision_agent_v2_prompts.py → vision_agent-0.2.31/vision_agent/agent/data_interpreter_prompts.py +3 -3
  9. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/easytool.py +8 -8
  10. vision_agent-0.2.30/vision_agent/agent/vision_agent.py → vision_agent-0.2.31/vision_agent/agent/easytool_v2.py +20 -20
  11. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/reflexion.py +8 -8
  12. vision_agent-0.2.30/vision_agent/agent/vision_agent_v3.py → vision_agent-0.2.31/vision_agent/agent/vision_agent.py +68 -15
  13. vision_agent-0.2.30/vision_agent/agent/vision_agent_v3_prompts.py → vision_agent-0.2.31/vision_agent/agent/vision_agent_prompts.py +4 -4
  14. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/llm/llm.py +3 -4
  15. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/lmm/lmm.py +6 -6
  16. vision_agent-0.2.31/vision_agent/tools/__init__.py +24 -0
  17. vision_agent-0.2.30/PKG-INFO +0 -226
  18. vision_agent-0.2.30/README.md +0 -192
  19. vision_agent-0.2.30/vision_agent/tools/__init__.py +0 -25
  20. {vision_agent-0.2.30 → vision_agent-0.2.31}/LICENSE +0 -0
  21. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/__init__.py +0 -0
  22. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/agent_coder_prompts.py +0 -0
  23. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/easytool_prompts.py +0 -0
  24. /vision_agent-0.2.30/vision_agent/agent/vision_agent_prompts.py → /vision_agent-0.2.31/vision_agent/agent/easytool_v2_prompts.py +0 -0
  25. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/reflexion_prompts.py +0 -0
  26. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/fonts/__init__.py +0 -0
  27. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  28. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/llm/__init__.py +0 -0
  29. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/lmm/__init__.py +0 -0
  30. /vision_agent-0.2.30/vision_agent/tools/tools.py → /vision_agent-0.2.31/vision_agent/tools/easytool_tools.py +0 -0
  31. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/tools/prompts.py +0 -0
  32. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/tools/tool_utils.py +0 -0
  33. /vision_agent-0.2.30/vision_agent/tools/tools_v2.py → /vision_agent-0.2.31/vision_agent/tools/tools.py +0 -0
  34. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/__init__.py +0 -0
  35. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/execute.py +0 -0
  36. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/image_utils.py +0 -0
  37. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/sim.py +0 -0
  38. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/type_defs.py +0 -0
  39. {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/video.py +0 -0
@@ -0,0 +1,175 @@
1
+ Metadata-Version: 2.1
2
+ Name: vision-agent
3
+ Version: 0.2.31
4
+ Summary: Toolset for Vision Agent
5
+ Author: Landing AI
6
+ Author-email: dev@landing.ai
7
+ Requires-Python: >=3.9,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
13
+ Requires-Dist: langsmith (>=0.1.58,<0.2.0)
14
+ Requires-Dist: moviepy (>=1.0.0,<2.0.0)
15
+ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
16
+ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
17
+ Requires-Dist: numpy (>=1.21.0,<2.0.0)
18
+ Requires-Dist: openai (>=1.0.0,<2.0.0)
19
+ Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
20
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
21
+ Requires-Dist: pillow (>=10.0.0,<11.0.0)
22
+ Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
23
+ Requires-Dist: requests (>=2.0.0,<3.0.0)
24
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
25
+ Requires-Dist: scipy (>=1.13.0,<1.14.0)
26
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
27
+ Requires-Dist: tqdm (>=4.64.0,<5.0.0)
28
+ Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
29
+ Project-URL: Homepage, https://landing.ai
30
+ Project-URL: documentation, https://github.com/landing-ai/vision-agent
31
+ Project-URL: repository, https://github.com/landing-ai/vision-agent
32
+ Description-Content-Type: text/markdown
33
+
34
+ <div align="center">
35
+ <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
36
+
37
+ # 🔍🤖 Vision Agent
38
+
39
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
40
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
41
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
42
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
43
+ </div>
44
+
45
+ Vision Agent is a library that helps you use agent frameworks to generate code to
46
+ solve your vision tasks. Many current vision problems can easily take hours or days to
47
+ solve: you need to find the right model, figure out how to use it, and program it to
48
+ accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
49
+ allowing users to describe their problem in text and have the agent framework generate
50
+ code to solve the task for them. Check out our Discord for updates and roadmaps!
51
+
52
+ ## Documentation
53
+
54
+ - [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
55
+
56
+
57
+ ## Getting Started
58
+ ### Installation
59
+ To get started, you can install the library using pip:
60
+
61
+ ```bash
62
+ pip install vision-agent
63
+ ```
64
+
65
+ Ensure you have an OpenAI API key and set it as an environment variable (if you are
66
+ using Azure OpenAI, please see the Azure Setup section):
67
+
68
+ ```bash
69
+ export OPENAI_API_KEY="your-api-key"
70
+ ```
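If you want to fail fast when the key is missing, a small, purely illustrative check (not part of vision-agent) can be run before constructing the agent:

```python
# Illustrative sanity check, not part of the vision-agent API: confirm the key
# exported above is actually visible to the Python process.
import os

if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; export it before creating an agent.")
```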
71
+
72
+ ### Vision Agent
73
+ You can interact with the agent as you would with any LLM or LMM:
74
+
75
+ ```python
76
+ >>> from vision_agent.agent import VisionAgent
77
+ >>> agent = VisionAgent()
78
+ >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
79
+ ```
80
+
81
+ Which produces the following code:
82
+ ```python
83
+ from vision_agent.tools import load_image, grounding_sam
84
+
85
+ def calculate_filled_percentage(image_path: str) -> float:
86
+ # Step 1: Load the image
87
+ image = load_image(image_path)
88
+
89
+ # Step 2: Segment the jar
90
+ jar_segments = grounding_sam(prompt="jar", image=image)
91
+
92
+ # Step 3: Segment the coffee beans
93
+ coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
94
+
95
+ # Step 4: Calculate the area of the segmented jar
96
+ jar_area = 0
97
+ for segment in jar_segments:
98
+ jar_area += segment['mask'].sum()
99
+
100
+ # Step 5: Calculate the area of the segmented coffee beans
101
+ coffee_beans_area = 0
102
+ for segment in coffee_beans_segments:
103
+ coffee_beans_area += segment['mask'].sum()
104
+
105
+ # Step 6: Compute the percentage of the jar area that is filled with coffee beans
106
+ if jar_area == 0:
107
+ return 0.0 # To avoid division by zero
108
+ filled_percentage = (coffee_beans_area / jar_area) * 100
109
+
110
+ # Step 7: Return the computed percentage
111
+ return filled_percentage
112
+ ```
113
+
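Because the agent returns the generated code as a plain string, one way to keep and exercise it is sketched below. This is only an illustration (the file name and the `exec` approach are not part of the library); the function name comes from the generated code shown above.

```python
# Illustrative only: persist the generated code and call the function it defines.
# `code` is the string returned by agent(...) above; "generated_tool.py" is an
# arbitrary file name chosen for this sketch.
with open("generated_tool.py", "w") as f:
    f.write(code)

namespace = {}
exec(compile(code, "generated_tool.py", "exec"), namespace)  # fine for a quick local check
print(namespace["calculate_filled_percentage"]("jar.jpg"))
```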
114
+ To better understand how the model came up with its answer, you can run it in debug
115
+ mode by passing in the verbose argument:
116
+
117
+ ```python
118
+ >>> agent = VisionAgent(verbose=2)
119
+ ```
120
+
121
+ You can also have it return more information by calling `chat_with_workflow`:
122
+
123
+ ```python
124
+ >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
125
+ >>> print(results)
126
+ {
127
+ "code": "from vision_agent.tools import ..."
128
+ "test": "calculate_filled_percentage('jar.jpg')",
129
+ "test_result": "...",
130
+ "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
131
+ "working_memory": ...,
132
+ }
133
+ ```
134
+
135
+ With this you can examine more detailed information such as the testing code, testing
136
+ results, plan, or working memory the agent used to complete the task.
137
+
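As a quick illustration of what you might do with that dictionary, the sketch below just prints the pieces shown in the example output above (the exact keys and value types may differ from this example):

```python
# Illustrative sketch based on the example `results` dict shown above.
print(results["code"])         # the final generated code
print(results["test"])         # the test the agent wrote for it
print(results["test_result"])  # what running that test produced

# each plan entry in the example pairs a sub-plan with its code and test
for step in results["plan"]:
    print(step["plan"])
```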
138
+ ### Tools
139
+ There are a variety of tools for the model or the user to use. Some are executed locally
140
+ while others are hosted for you. You can also ask an LLM directly to build a tool for
141
+ you. For example:
142
+
143
+ ```python
144
+ >>> import vision_agent as va
145
+ >>> llm = va.llm.OpenAILLM()
146
+ >>> detector = llm.generate_detector("Can you build a jar detector for me?")
147
+ >>> detector("jar.jpg")
148
+ [{"labels": ["jar",],
149
+ "scores": [0.99],
150
+ "bboxes": [
151
+ [0.58, 0.2, 0.72, 0.45],
152
+ ]
153
+ }]
154
+ ```
155
+
156
+ ### Azure Setup
157
+ If you want to use Azure OpenAI models, set the following environment variables:
158
+
159
+ ```bash
160
+ export AZURE_OPENAI_API_KEY="your-api-key"
161
+ export AZURE_OPENAI_ENDPOINT="your-endpoint"
162
+ ```
163
+
164
+ You can then run Vision Agent using the Azure OpenAI models:
165
+
166
+ ```python
167
+ >>> import vision_agent as va
168
+ >>> agent = va.agent.VisionAgent(
169
+ >>> task_model=va.llm.AzureOpenAILLM(),
170
+ >>> answer_model=va.lmm.AzureOpenAILMM(),
171
+ >>> reflection_model=va.lmm.AzureOpenAILMM(),
172
+ >>> )
173
+ ```
174
+
175
+
@@ -0,0 +1,141 @@
1
+ <div align="center">
2
+ <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
3
+
4
+ # 🔍🤖 Vision Agent
5
+
6
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
7
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
8
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
9
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
10
+ </div>
11
+
12
+ Vision Agent is a library that helps you use agent frameworks to generate code to
13
+ solve your vision tasks. Many current vision problems can easily take hours or days to
14
+ solve: you need to find the right model, figure out how to use it, and program it to
15
+ accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
16
+ allowing users to describe their problem in text and have the agent framework generate
17
+ code to solve the task for them. Check out our Discord for updates and roadmaps!
18
+
19
+ ## Documentation
20
+
21
+ - [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
22
+
23
+
24
+ ## Getting Started
25
+ ### Installation
26
+ To get started, you can install the library using pip:
27
+
28
+ ```bash
29
+ pip install vision-agent
30
+ ```
31
+
32
+ Ensure you have an OpenAI API key and set it as an environment variable (if you are
33
+ using Azure OpenAI, please see the Azure Setup section):
34
+
35
+ ```bash
36
+ export OPENAI_API_KEY="your-api-key"
37
+ ```
38
+
39
+ ### Vision Agent
40
+ You can interact with the agent as you would with any LLM or LMM:
41
+
42
+ ```python
43
+ >>> from vision_agent.agent import VisionAgent
44
+ >>> agent = VisionAgent()
45
+ >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
46
+ ```
47
+
48
+ Which produces the following code:
49
+ ```python
50
+ from vision_agent.tools import load_image, grounding_sam
51
+
52
+ def calculate_filled_percentage(image_path: str) -> float:
53
+ # Step 1: Load the image
54
+ image = load_image(image_path)
55
+
56
+ # Step 2: Segment the jar
57
+ jar_segments = grounding_sam(prompt="jar", image=image)
58
+
59
+ # Step 3: Segment the coffee beans
60
+ coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
61
+
62
+ # Step 4: Calculate the area of the segmented jar
63
+ jar_area = 0
64
+ for segment in jar_segments:
65
+ jar_area += segment['mask'].sum()
66
+
67
+ # Step 5: Calculate the area of the segmented coffee beans
68
+ coffee_beans_area = 0
69
+ for segment in coffee_beans_segments:
70
+ coffee_beans_area += segment['mask'].sum()
71
+
72
+ # Step 6: Compute the percentage of the jar area that is filled with coffee beans
73
+ if jar_area == 0:
74
+ return 0.0 # To avoid division by zero
75
+ filled_percentage = (coffee_beans_area / jar_area) * 100
76
+
77
+ # Step 7: Return the computed percentage
78
+ return filled_percentage
79
+ ```
80
+
81
+ To better understand how the model came up with its answer, you can run it in debug
82
+ mode by passing in the verbose argument:
83
+
84
+ ```python
85
+ >>> agent = VisionAgent(verbose=2)
86
+ ```
87
+
88
+ You can also have it return more information by calling `chat_with_workflow`:
89
+
90
+ ```python
91
+ >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
92
+ >>> print(results)
93
+ {
94
+ "code": "from vision_agent.tools import ..."
95
+ "test": "calculate_filled_percentage('jar.jpg')",
96
+ "test_result": "...",
97
+ "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
98
+ "working_memory": ...,
99
+ }
100
+ ```
101
+
102
+ With this you can examine more detailed information such as the testing code, testing
103
+ results, plan, or working memory the agent used to complete the task.
104
+
105
+ ### Tools
106
+ There are a variety of tools for the model or the user to use. Some are executed locally
107
+ while others are hosted for you. You can also ask an LLM directly to build a tool for
108
+ you. For example:
109
+
110
+ ```python
111
+ >>> import vision_agent as va
112
+ >>> llm = va.llm.OpenAILLM()
113
+ >>> detector = llm.generate_detector("Can you build a jar detector for me?")
114
+ >>> detector("jar.jpg")
115
+ [{"labels": ["jar",],
116
+ "scores": [0.99],
117
+ "bboxes": [
118
+ [0.58, 0.2, 0.72, 0.45],
119
+ ]
120
+ }]
121
+ ```
122
+
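The bounding boxes in the example output look like coordinates normalized to the image size. A sketch for turning them into pixel coordinates is shown below; the `[xmin, ymin, xmax, ymax]` layout is an assumption based on the example, not documented behaviour.

```python
# Sketch only: scale the normalized boxes from the example above to pixels.
# Assumes an [xmin, ymin, xmax, ymax] layout normalized to the image size.
from PIL import Image

image = Image.open("jar.jpg")
width, height = image.size

for detection in detector("jar.jpg"):
    for label, box in zip(detection["labels"], detection["bboxes"]):
        xmin, ymin, xmax, ymax = box
        print(label, (xmin * width, ymin * height, xmax * width, ymax * height))
```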
123
+ ### Azure Setup
124
+ If you want to use Azure OpenAI models, set the following environment variables:
125
+
126
+ ```bash
127
+ export AZURE_OPENAI_API_KEY="your-api-key"
128
+ export AZURE_OPENAI_ENDPOINT="your-endpoint"
129
+ ```
130
+
131
+ You can then run Vision Agent using the Azure OpenAI models:
132
+
133
+ ```python
134
+ >>> import vision_agent as va
135
+ >>> agent = va.agent.VisionAgent(
136
+ >>> task_model=va.llm.AzureOpenAILLM(),
137
+ >>> answer_model=va.lmm.AzureOpenAILMM(),
138
+ >>> reflection_model=va.lmm.AzureOpenAILMM(),
139
+ >>> )
140
+ ```
141
+
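As with the OpenAI key earlier, a purely illustrative pre-flight check (not part of vision-agent) can confirm both Azure variables are visible before the Azure models are constructed:

```python
# Illustrative sanity check, not part of the vision-agent API.
import os

for var in ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"):
    if not os.getenv(var):
        raise RuntimeError(f"{var} is not set; export it before using the Azure models.")
```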
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.30"
7
+ version = "0.2.31"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -1,7 +1,7 @@
1
1
  from .agent import Agent
2
2
  from .agent_coder import AgentCoder
3
+ from .data_interpreter import DataInterpreter
3
4
  from .easytool import EasyTool
5
+ from .easytool_v2 import EasyToolV2
4
6
  from .reflexion import Reflexion
5
7
  from .vision_agent import VisionAgent
6
- from .vision_agent_v2 import VisionAgentV2
7
- from .vision_agent_v3 import VisionAgentV3
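Based only on the imports above and the `image` → `media` rename shown in the hunks that follow, the updated public surface would presumably be used along these lines (a sketch, not an official example; the prompt string is made up):

```python
# Sketch inferred from this diff: the agent classes now exported from
# vision_agent.agent, called with the renamed `media=` keyword.
from vision_agent.agent import DataInterpreter, EasyToolV2, VisionAgent

agent = VisionAgent()
code = agent("How many jars are in the image?", media="jars.jpg")  # was image= in 0.2.30
```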
@@ -8,7 +8,7 @@ class Agent(ABC):
8
8
  def __call__(
9
9
  self,
10
10
  input: Union[List[Dict[str, str]], str],
11
- image: Optional[Union[str, Path]] = None,
11
+ media: Optional[Union[str, Path]] = None,
12
12
  ) -> str:
13
13
  pass
14
14
 
@@ -18,7 +18,7 @@ from vision_agent.agent.agent_coder_prompts import (
18
18
  )
19
19
  from vision_agent.llm import LLM, OpenAILLM
20
20
  from vision_agent.lmm import LMM, OpenAILMM
21
- from vision_agent.tools.tools_v2 import TOOL_DOCSTRING, UTILITIES_DOCSTRING
21
+ from vision_agent.tools import TOOL_DOCSTRING, UTILITIES_DOCSTRING
22
22
  from vision_agent.utils import Execute
23
23
 
24
24
  IMPORT_HELPER = """
@@ -38,7 +38,7 @@ import numpy as np
38
38
  import string
39
39
  from typing import *
40
40
  from collections import *
41
- from vision_agent.tools.tools_v2 import *
41
+ from vision_agent.tools import *
42
42
  """
43
43
  logging.basicConfig(stream=sys.stdout)
44
44
  _LOGGER = logging.getLogger(__name__)
@@ -150,20 +150,20 @@ class AgentCoder(Agent):
150
150
  def __call__(
151
151
  self,
152
152
  input: Union[List[Dict[str, str]], str],
153
- image: Optional[Union[str, Path]] = None,
153
+ media: Optional[Union[str, Path]] = None,
154
154
  ) -> str:
155
155
  if isinstance(input, str):
156
156
  input = [{"role": "user", "content": input}]
157
- return self.chat(input, image)
157
+ return self.chat(input, media)
158
158
 
159
159
  def chat(
160
160
  self,
161
161
  input: List[Dict[str, str]],
162
- image: Optional[Union[str, Path]] = None,
162
+ media: Optional[Union[str, Path]] = None,
163
163
  ) -> str:
164
164
  question = input[0]["content"]
165
- if image:
166
- question += f" Input file path: {os.path.abspath(image)}"
165
+ if media:
166
+ question += f" Input file path: {os.path.abspath(media)}"
167
167
 
168
168
  code = ""
169
169
  feedback = ""
@@ -10,7 +10,7 @@ from rich.syntax import Syntax
10
10
  from tabulate import tabulate
11
11
 
12
12
  from vision_agent.agent import Agent
13
- from vision_agent.agent.vision_agent_v2_prompts import (
13
+ from vision_agent.agent.data_interpreter_prompts import (
14
14
  CODE,
15
15
  CODE_SYS_MSG,
16
16
  DEBUG,
@@ -25,7 +25,7 @@ from vision_agent.agent.vision_agent_v2_prompts import (
25
25
  USER_REQ_SUBTASK_WM_CONTEXT,
26
26
  )
27
27
  from vision_agent.llm import LLM, OpenAILLM
28
- from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
28
+ from vision_agent.tools import TOOL_DESCRIPTIONS, TOOLS_DF
29
29
  from vision_agent.utils import Execute, Sim
30
30
 
31
31
  logging.basicConfig(level=logging.INFO)
@@ -331,11 +331,11 @@ def run_plan(
331
331
  return current_code, current_test, plan, working_memory
332
332
 
333
333
 
334
- class VisionAgentV2(Agent):
335
- """Vision Agent is an AI agentic framework geared towards outputting Python code to
336
- solve vision tasks. It is inspired by MetaGPT's Data Interpreter
337
- https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it
338
- generate code:
334
+ class DataInterpreter(Agent):
335
+ """This version of Data Interpreter is an AI agentic framework geared towards
336
+ outputting Python code to solve vision tasks. It is inspired by MetaGPT's Data
337
+ Interpreter https://arxiv.org/abs/2402.18679. This version of Data Interpreter has
338
+ several key features to help it generate code:
339
339
 
340
340
  - A planner to generate a plan of tasks to solve a user requirement. The planner
341
341
  can output code tasks or test tasks, where test tasks are used to verify the code.
@@ -379,29 +379,29 @@ class VisionAgentV2(Agent):
379
379
  def __call__(
380
380
  self,
381
381
  input: Union[List[Dict[str, str]], str],
382
- image: Optional[Union[str, Path]] = None,
382
+ media: Optional[Union[str, Path]] = None,
383
383
  plan: Optional[List[Dict[str, Any]]] = None,
384
384
  ) -> str:
385
385
  if isinstance(input, str):
386
386
  input = [{"role": "user", "content": input}]
387
- results = self.chat_with_workflow(input, image, plan)
387
+ results = self.chat_with_workflow(input, media, plan)
388
388
  return results["code"] # type: ignore
389
389
 
390
390
  @traceable
391
391
  def chat_with_workflow(
392
392
  self,
393
393
  chat: List[Dict[str, str]],
394
- image: Optional[Union[str, Path]] = None,
394
+ media: Optional[Union[str, Path]] = None,
395
395
  plan: Optional[List[Dict[str, Any]]] = None,
396
396
  ) -> Dict[str, Any]:
397
397
  if len(chat) == 0:
398
398
  raise ValueError("Input cannot be empty.")
399
399
 
400
- if image is not None:
400
+ if media is not None:
401
401
  # append file names to all user messages
402
402
  for chat_i in chat:
403
403
  if chat_i["role"] == "user":
404
- chat_i["content"] += f" Image name {image}"
404
+ chat_i["content"] += f" Image name {media}"
405
405
 
406
406
  working_code = ""
407
407
  if plan is not None:
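For orientation, here is a hedged sketch of how the renamed class might be driven, based only on the `__call__` signature shown above (a default constructor and the example prompt are assumptions):

```python
# Assumes DataInterpreter can be constructed with defaults; the prompt and
# file name are placeholders, not taken from the package documentation.
from vision_agent.agent import DataInterpreter

agent = DataInterpreter()
code = agent(
    "Count the number of cars in the image",
    media="cars.jpg",  # renamed from `image=` in this release
)
```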
@@ -74,15 +74,15 @@ CODE = """
74
74
 
75
75
  # Constraints
76
76
  - Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'.
77
- - Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
77
+ - Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools import *` import.
78
78
  - You may receive previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running into the same issues when writing your code.
79
- - Use the `save_json` function from `vision_agent.tools.tools_v2` to save your output as a json file.
79
+ - Use the `save_json` function from `vision_agent.tools` to save your output as a json file.
80
80
  - Write clean, readable, and well-documented code.
81
81
 
82
82
  # Output
83
83
  While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. Output code in the following format:
84
84
  ```python
85
- from vision_agent.tools.tools_v2 import *
85
+ from vision_agent.tools import *
86
86
 
87
87
  # your code goes here
88
88
  ```
@@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from vision_agent.llm import LLM, OpenAILLM
8
8
  from vision_agent.lmm import LMM
9
- from vision_agent.tools import TOOLS
9
+ from vision_agent.tools.easytool_tools import TOOLS
10
10
 
11
11
  from .agent import Agent
12
12
  from .easytool_prompts import (
@@ -272,7 +272,7 @@ class EasyTool(Agent):
272
272
  def __call__(
273
273
  self,
274
274
  input: Union[List[Dict[str, str]], str],
275
- image: Optional[Union[str, Path]] = None,
275
+ media: Optional[Union[str, Path]] = None,
276
276
  ) -> str:
277
277
  """Invoke the vision agent.
278
278
 
@@ -285,14 +285,14 @@ class EasyTool(Agent):
285
285
  """
286
286
  if isinstance(input, str):
287
287
  input = [{"role": "user", "content": input}]
288
- return self.chat(input, image=image)
288
+ return self.chat(input, media=media)
289
289
 
290
290
  def chat_with_workflow(
291
- self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
291
+ self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
292
292
  ) -> Tuple[str, List[Dict]]:
293
293
  question = chat[0]["content"]
294
- if image:
295
- question += f" Image name: {image}"
294
+ if media:
295
+ question += f" Image name: {media}"
296
296
  tasks = task_decompose(
297
297
  self.task_model,
298
298
  question,
@@ -340,7 +340,7 @@ class EasyTool(Agent):
340
340
  return answer_summarize(self.answer_model, question, answers), all_tool_results
341
341
 
342
342
  def chat(
343
- self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
343
+ self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
344
344
  ) -> str:
345
- answer, _ = self.chat_with_workflow(chat, image=image)
345
+ answer, _ = self.chat_with_workflow(chat, media=media)
346
346
  return answer
@@ -17,7 +17,7 @@ from vision_agent.agent.easytool_prompts import (
17
17
  TASK_DECOMPOSE,
18
18
  TASK_TOPOLOGY,
19
19
  )
20
- from vision_agent.agent.vision_agent_prompts import (
20
+ from vision_agent.agent.easytool_v2_prompts import (
21
21
  ANSWER_GENERATE_DEPENDS,
22
22
  ANSWER_SUMMARIZE_DEPENDS,
23
23
  CHOOSE_PARAMETER_DEPENDS,
@@ -27,7 +27,7 @@ from vision_agent.agent.vision_agent_prompts import (
27
27
  )
28
28
  from vision_agent.llm import LLM, OpenAILLM
29
29
  from vision_agent.lmm import LMM, OpenAILMM
30
- from vision_agent.tools import TOOLS
30
+ from vision_agent.tools.easytool_tools import TOOLS
31
31
  from vision_agent.utils.image_utils import (
32
32
  convert_to_b64,
33
33
  overlay_bboxes,
@@ -427,9 +427,9 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
427
427
  return visualized_images
428
428
 
429
429
 
430
- class VisionAgent(Agent):
431
- r"""Vision Agent is an agent framework that utilizes tools as well as self
432
- reflection to accomplish tasks, in particular vision tasks. Vision Agent is based
430
+ class EasyToolV2(Agent):
431
+ r"""EasyToolV2 is an agent framework that utilizes tools as well as self
432
+ reflection to accomplish tasks, in particular vision tasks. EasyToolV2 is based
433
433
  off of EasyTool https://arxiv.org/abs/2401.06201 and Reflexion
434
434
  https://arxiv.org/abs/2303.11366 where it will attempt to complete a task and then
435
435
  reflect on whether or not it was able to accomplish the task based off of the plan
@@ -437,8 +437,8 @@ class VisionAgent(Agent):
437
437
 
438
438
  Example
439
439
  -------
440
- >>> from vision_agent.agent import VisionAgent
441
- >>> agent = VisionAgent()
440
+ >>> from vision_agent.agent import EasyToolV2
441
+ >>> agent = EasyToolV2()
442
442
  >>> resp = agent("If red tomatoes cost $5 each and yellow tomatoes cost $2.50 each, what is the total cost of all the tomatoes in the image?", image="tomatoes.jpg")
443
443
  >>> print(resp)
444
444
  "The total cost is $57.50."
@@ -453,7 +453,7 @@ class VisionAgent(Agent):
453
453
  verbose: bool = False,
454
454
  report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
455
455
  ):
456
- """VisionAgent constructor.
456
+ """EasyToolV2 constructor.
457
457
 
458
458
  Parameters:
459
459
  task_model: the model to use for task decomposition.
@@ -461,7 +461,7 @@ class VisionAgent(Agent):
461
461
  reflect_model: the model to use for self reflection.
462
462
  max_retries: maximum number of retries to attempt to complete the task.
463
463
  verbose: whether to print more logs.
464
- report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgent instances are running in parallel. This callback ensures that the progress are not mixed up.
464
+ report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple EasyToolV2 instances are running in parallel. This callback ensures that the progress are not mixed up.
465
465
  """
466
466
  self.task_model = (
467
467
  OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
@@ -487,7 +487,7 @@ class VisionAgent(Agent):
487
487
  def __call__(
488
488
  self,
489
489
  input: Union[List[Dict[str, str]], str],
490
- image: Optional[Union[str, Path]] = None,
490
+ media: Optional[Union[str, Path]] = None,
491
491
  reference_data: Optional[Dict[str, str]] = None,
492
492
  visualize_output: Optional[bool] = False,
493
493
  self_reflection: Optional[bool] = True,
@@ -512,7 +512,7 @@ class VisionAgent(Agent):
512
512
  input = [{"role": "user", "content": input}]
513
513
  return self.chat(
514
514
  input,
515
- image=image,
515
+ media=media,
516
516
  visualize_output=visualize_output,
517
517
  reference_data=reference_data,
518
518
  self_reflection=self_reflection,
@@ -539,12 +539,12 @@ class VisionAgent(Agent):
539
539
  def chat_with_workflow(
540
540
  self,
541
541
  chat: List[Dict[str, str]],
542
- image: Optional[Union[str, Path]] = None,
542
+ media: Optional[Union[str, Path]] = None,
543
543
  reference_data: Optional[Dict[str, str]] = None,
544
544
  visualize_output: Optional[bool] = False,
545
545
  self_reflection: Optional[bool] = True,
546
546
  ) -> Tuple[str, List[Dict]]:
547
- """Chat with the vision agent and return the final answer and all tool results.
547
+ """Chat with EasyToolV2 and return the final answer and all tool results.
548
548
 
549
549
  Parameters:
550
550
  chat: A conversation in the format of
@@ -566,8 +566,8 @@ class VisionAgent(Agent):
566
566
  raise ValueError("Input cannot be empty.")
567
567
 
568
568
  question = chat[0]["content"]
569
- if image:
570
- question += f" Image name: {image}"
569
+ if media:
570
+ question += f" Image name: {media}"
571
571
  if reference_data:
572
572
  question += (
573
573
  f" Reference image: {reference_data['image']}"
@@ -630,8 +630,8 @@ class VisionAgent(Agent):
630
630
  all_tool_results.append({"visualized_output": visualized_output})
631
631
  if len(visualized_output) > 0:
632
632
  reflection_images = sample_n_evenly_spaced(visualized_output, 3)
633
- elif image is not None:
634
- reflection_images = [image]
633
+ elif media is not None:
634
+ reflection_images = [media]
635
635
  else:
636
636
  reflection_images = None
637
637
 
@@ -658,7 +658,7 @@ class VisionAgent(Agent):
658
658
  # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
659
659
  self.log_progress(
660
660
  {
661
- "log": f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
661
+ "log": f"EasyToolV2 has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
662
662
  }
663
663
  )
664
664
 
@@ -675,14 +675,14 @@ class VisionAgent(Agent):
675
675
  def chat(
676
676
  self,
677
677
  chat: List[Dict[str, str]],
678
- image: Optional[Union[str, Path]] = None,
678
+ media: Optional[Union[str, Path]] = None,
679
679
  reference_data: Optional[Dict[str, str]] = None,
680
680
  visualize_output: Optional[bool] = False,
681
681
  self_reflection: Optional[bool] = True,
682
682
  ) -> str:
683
683
  answer, _ = self.chat_with_workflow(
684
684
  chat,
685
- image=image,
685
+ media=media,
686
686
  visualize_output=visualize_output,
687
687
  reference_data=reference_data,
688
688
  self_reflection=self_reflection,