vision-agent 0.2.30__tar.gz → 0.2.31__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent-0.2.31/PKG-INFO +175 -0
- vision_agent-0.2.31/README.md +141 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/pyproject.toml +1 -1
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/__init__.py +2 -2
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/agent.py +1 -1
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/agent_coder.py +7 -7
- vision_agent-0.2.30/vision_agent/agent/vision_agent_v2.py → vision_agent-0.2.31/vision_agent/agent/data_interpreter.py +12 -12
- vision_agent-0.2.30/vision_agent/agent/vision_agent_v2_prompts.py → vision_agent-0.2.31/vision_agent/agent/data_interpreter_prompts.py +3 -3
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/easytool.py +8 -8
- vision_agent-0.2.30/vision_agent/agent/vision_agent.py → vision_agent-0.2.31/vision_agent/agent/easytool_v2.py +20 -20
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/reflexion.py +8 -8
- vision_agent-0.2.30/vision_agent/agent/vision_agent_v3.py → vision_agent-0.2.31/vision_agent/agent/vision_agent.py +68 -15
- vision_agent-0.2.30/vision_agent/agent/vision_agent_v3_prompts.py → vision_agent-0.2.31/vision_agent/agent/vision_agent_prompts.py +4 -4
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/llm/llm.py +3 -4
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/lmm/lmm.py +6 -6
- vision_agent-0.2.31/vision_agent/tools/__init__.py +24 -0
- vision_agent-0.2.30/PKG-INFO +0 -226
- vision_agent-0.2.30/README.md +0 -192
- vision_agent-0.2.30/vision_agent/tools/__init__.py +0 -25
- {vision_agent-0.2.30 → vision_agent-0.2.31}/LICENSE +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/agent_coder_prompts.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/easytool_prompts.py +0 -0
- /vision_agent-0.2.30/vision_agent/agent/vision_agent_prompts.py → /vision_agent-0.2.31/vision_agent/agent/easytool_v2_prompts.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/agent/reflexion_prompts.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/llm/__init__.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/lmm/__init__.py +0 -0
- /vision_agent-0.2.30/vision_agent/tools/tools.py → /vision_agent-0.2.31/vision_agent/tools/easytool_tools.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/tools/tool_utils.py +0 -0
- /vision_agent-0.2.30/vision_agent/tools/tools_v2.py → /vision_agent-0.2.31/vision_agent/tools/tools.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.30 → vision_agent-0.2.31}/vision_agent/utils/video.py +0 -0
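The rename list above retires the old versioned module names: `vision_agent_v2.py` becomes `data_interpreter.py`, `vision_agent.py` becomes `easytool_v2.py`, `vision_agent_v3.py` becomes the new `vision_agent.py`, and `tools_v2.py` becomes `tools.py`. A minimal sketch of how import paths shift between the two versions follows; the 0.2.30 lines are assumptions inferred from the removed imports in the `vision_agent/agent/__init__.py` hunk further down, while the 0.2.31 lines come from the new `__init__.py` and README in this diff.

```python
# vision-agent 0.2.30 (old layout, inferred from the removed imports shown below)
# from vision_agent.agent import VisionAgentV2, VisionAgentV3
# from vision_agent.tools.tools_v2 import ...   # hypothetical old path for the renamed tools module

# vision-agent 0.2.31 (new layout, per the renamed modules in this diff)
from vision_agent.agent import VisionAgent, DataInterpreter, EasyToolV2
from vision_agent.tools import load_image, grounding_sam  # formerly tools_v2.py
```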
@@ -0,0 +1,175 @@
+Metadata-Version: 2.1
+Name: vision-agent
+Version: 0.2.31
+Summary: Toolset for Vision Agent
+Author: Landing AI
+Author-email: dev@landing.ai
+Requires-Python: >=3.9,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+Requires-Dist: langsmith (>=0.1.58,<0.2.0)
+Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+Requires-Dist: nbformat (>=5.10.4,<6.0.0)
+Requires-Dist: numpy (>=1.21.0,<2.0.0)
+Requires-Dist: openai (>=1.0.0,<2.0.0)
+Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
+Requires-Dist: pandas (>=2.0.0,<3.0.0)
+Requires-Dist: pillow (>=10.0.0,<11.0.0)
+Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+Requires-Dist: requests (>=2.0.0,<3.0.0)
+Requires-Dist: rich (>=13.7.1,<14.0.0)
+Requires-Dist: scipy (>=1.13.0,<1.14.0)
+Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
+Project-URL: Homepage, https://landing.ai
+Project-URL: documentation, https://github.com/landing-ai/vision-agent
+Project-URL: repository, https://github.com/landing-ai/vision-agent
+Description-Content-Type: text/markdown
+
+<div align="center">
+<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+
+# 🔍🤖 Vision Agent
+
+[](https://discord.gg/wPdN8RCYew)
+
+[](https://badge.fury.io/py/vision-agent)
+
+</div>
+
+Vision Agent is a library that helps you utilize agent frameworks to generate code to
+solve your vision task. Many current vision problems can easily take hours or days to
+solve, you need to find the right model, figure out how to use it and program it to
+accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
+allowing users to describe their problem in text and have the agent framework generate
+code to solve the task for them. Check out our discord for updates and roadmaps!
+
+## Documentation
+
+- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+
+## Getting Started
+### Installation
+To get started, you can install the library using pip:
+
+```bash
+pip install vision-agent
+```
+
+Ensure you have an OpenAI API key and set it as an environment variable (if you are
+using Azure OpenAI please see the Azure setup section):
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+```
+
+### Vision Agent
+You can interact with the agent as you would with any LLM or LMM model:
+
+```python
+>>> from vision_agent.agent import VisionAgent
+>>> agent = VisionAgent()
+>>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+```
+
+Which produces the following code:
+```python
+from vision_agent.tools import load_image, grounding_sam
+
+def calculate_filled_percentage(image_path: str) -> float:
+    # Step 1: Load the image
+    image = load_image(image_path)
+
+    # Step 2: Segment the jar
+    jar_segments = grounding_sam(prompt="jar", image=image)
+
+    # Step 3: Segment the coffee beans
+    coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
+
+    # Step 4: Calculate the area of the segmented jar
+    jar_area = 0
+    for segment in jar_segments:
+        jar_area += segment['mask'].sum()
+
+    # Step 5: Calculate the area of the segmented coffee beans
+    coffee_beans_area = 0
+    for segment in coffee_beans_segments:
+        coffee_beans_area += segment['mask'].sum()
+
+    # Step 6: Compute the percentage of the jar area that is filled with coffee beans
+    if jar_area == 0:
+        return 0.0  # To avoid division by zero
+    filled_percentage = (coffee_beans_area / jar_area) * 100
+
+    # Step 7: Return the computed percentage
+    return filled_percentage
+```
+
+To better understand how the model came up with it's answer, you can run it in debug
+mode by passing in the verbose argument:
+
+```python
+>>> agent = VisionAgent(verbose=2)
+```
+
+You can also have it return more information by calling `chat_with_workflow`:
+
+```python
+>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+>>> print(results)
+{
+    "code": "from vision_agent.tools import ..."
+    "test": "calculate_filled_percentage('jar.jpg')",
+    "test_result": "...",
+    "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+    "working_memory": ...,
+}
+```
+
+With this you can examine more detailed information such as the etesting code, testing
+results, plan or working memory it used to complete the task.
+
+### Tools
+There are a variety of tools for the model or the user to use. Some are executed locally
+while others are hosted for you. You can also ask an LLM directly to build a tool for
+you. For example:
+
+```python
+>>> import vision_agent as va
+>>> llm = va.llm.OpenAILLM()
+>>> detector = llm.generate_detector("Can you build a jar detector for me?")
+>>> detector("jar.jpg")
+[{"labels": ["jar",],
+  "scores": [0.99],
+  "bboxes": [
+    [0.58, 0.2, 0.72, 0.45],
+  ]
+}]
+```
+
+### Azure Setup
+If you want to use Azure OpenAI models, you can set the environment variable:
+
+```bash
+export AZURE_OPENAI_API_KEY="your-api-key"
+export AZURE_OPENAI_ENDPOINT="your-endpoint"
+```
+
+You can then run Vision Agent using the Azure OpenAI models:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.VisionAgent(
+>>>     task_model=va.llm.AzureOpenAILLM(),
+>>>     answer_model=va.lmm.AzureOpenAILMM(),
+>>>     reflection_model=va.lmm.AzureOpenAILMM(),
+>>> )
+```
+
+
@@ -0,0 +1,141 @@
+<div align="center">
+<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+
+# 🔍🤖 Vision Agent
+
+[](https://discord.gg/wPdN8RCYew)
+
+[](https://badge.fury.io/py/vision-agent)
+
+</div>
+
+Vision Agent is a library that helps you utilize agent frameworks to generate code to
+solve your vision task. Many current vision problems can easily take hours or days to
+solve, you need to find the right model, figure out how to use it and program it to
+accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
+allowing users to describe their problem in text and have the agent framework generate
+code to solve the task for them. Check out our discord for updates and roadmaps!
+
+## Documentation
+
+- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+
+## Getting Started
+### Installation
+To get started, you can install the library using pip:
+
+```bash
+pip install vision-agent
+```
+
+Ensure you have an OpenAI API key and set it as an environment variable (if you are
+using Azure OpenAI please see the Azure setup section):
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+```
+
+### Vision Agent
+You can interact with the agent as you would with any LLM or LMM model:
+
+```python
+>>> from vision_agent.agent import VisionAgent
+>>> agent = VisionAgent()
+>>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+```
+
+Which produces the following code:
+```python
+from vision_agent.tools import load_image, grounding_sam
+
+def calculate_filled_percentage(image_path: str) -> float:
+    # Step 1: Load the image
+    image = load_image(image_path)
+
+    # Step 2: Segment the jar
+    jar_segments = grounding_sam(prompt="jar", image=image)
+
+    # Step 3: Segment the coffee beans
+    coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
+
+    # Step 4: Calculate the area of the segmented jar
+    jar_area = 0
+    for segment in jar_segments:
+        jar_area += segment['mask'].sum()
+
+    # Step 5: Calculate the area of the segmented coffee beans
+    coffee_beans_area = 0
+    for segment in coffee_beans_segments:
+        coffee_beans_area += segment['mask'].sum()
+
+    # Step 6: Compute the percentage of the jar area that is filled with coffee beans
+    if jar_area == 0:
+        return 0.0  # To avoid division by zero
+    filled_percentage = (coffee_beans_area / jar_area) * 100
+
+    # Step 7: Return the computed percentage
+    return filled_percentage
+```
+
+To better understand how the model came up with it's answer, you can run it in debug
+mode by passing in the verbose argument:
+
+```python
+>>> agent = VisionAgent(verbose=2)
+```
+
+You can also have it return more information by calling `chat_with_workflow`:
+
+```python
+>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+>>> print(results)
+{
+    "code": "from vision_agent.tools import ..."
+    "test": "calculate_filled_percentage('jar.jpg')",
+    "test_result": "...",
+    "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+    "working_memory": ...,
+}
+```
+
+With this you can examine more detailed information such as the etesting code, testing
+results, plan or working memory it used to complete the task.
+
+### Tools
+There are a variety of tools for the model or the user to use. Some are executed locally
+while others are hosted for you. You can also ask an LLM directly to build a tool for
+you. For example:
+
+```python
+>>> import vision_agent as va
+>>> llm = va.llm.OpenAILLM()
+>>> detector = llm.generate_detector("Can you build a jar detector for me?")
+>>> detector("jar.jpg")
+[{"labels": ["jar",],
+  "scores": [0.99],
+  "bboxes": [
+    [0.58, 0.2, 0.72, 0.45],
+  ]
+}]
+```
+
+### Azure Setup
+If you want to use Azure OpenAI models, you can set the environment variable:
+
+```bash
+export AZURE_OPENAI_API_KEY="your-api-key"
+export AZURE_OPENAI_ENDPOINT="your-endpoint"
+```
+
+You can then run Vision Agent using the Azure OpenAI models:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.VisionAgent(
+>>>     task_model=va.llm.AzureOpenAILLM(),
+>>>     answer_model=va.lmm.AzureOpenAILMM(),
+>>>     reflection_model=va.lmm.AzureOpenAILMM(),
+>>> )
+```
+
@@ -1,7 +1,7 @@
 from .agent import Agent
 from .agent_coder import AgentCoder
+from .data_interpreter import DataInterpreter
 from .easytool import EasyTool
+from .easytool_v2 import EasyToolV2
 from .reflexion import Reflexion
 from .vision_agent import VisionAgent
-from .vision_agent_v2 import VisionAgentV2
-from .vision_agent_v3 import VisionAgentV3
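Taken together with the README shipped in this release, the new export list suggests that upgrading code constructs the renamed classes directly. The snippet below is a sketch only: `VisionAgent()` with no arguments and the `media=` keyword follow the README examples earlier in this diff, while `DataInterpreter` and `EasyToolV2` are assumed to allow similar default construction.

```python
from vision_agent.agent import VisionAgent, DataInterpreter, EasyToolV2

# VisionAgent (formerly VisionAgentV3) generates code for a vision task.
agent = VisionAgent()
code = agent(
    "What percentage of the area of the jar is filled with coffee beans?",
    media="jar.jpg",
)
```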
@@ -18,7 +18,7 @@ from vision_agent.agent.agent_coder_prompts import (
 )
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM, OpenAILMM
-from vision_agent.tools
+from vision_agent.tools import TOOL_DOCSTRING, UTILITIES_DOCSTRING
 from vision_agent.utils import Execute

 IMPORT_HELPER = """
@@ -38,7 +38,7 @@ import numpy as np
 import string
 from typing import *
 from collections import *
-from vision_agent.tools
+from vision_agent.tools import *
 """
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
@@ -150,20 +150,20 @@ class AgentCoder(Agent):
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        return self.chat(input,
+        return self.chat(input, media)

     def chat(
         self,
         input: List[Dict[str, str]],
-
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
         question = input[0]["content"]
-        if
-            question += f" Input file path: {os.path.abspath(
+        if media:
+            question += f" Input file path: {os.path.abspath(media)}"

         code = ""
         feedback = ""
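This hunk shows `AgentCoder.__call__` and `AgentCoder.chat` gaining a `media` parameter, with the file's absolute path appended to the question. A hedged usage sketch follows; the default constructor is an assumption, since `AgentCoder.__init__` is not part of this diff.

```python
from vision_agent.agent import AgentCoder

coder = AgentCoder()  # default construction assumed; constructor args are not shown in this diff
answer = coder(
    "Count the number of jars in the image",
    media="jar.jpg",  # per the hunk, appended as "Input file path: <absolute path>"
)
```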
@@ -10,7 +10,7 @@ from rich.syntax import Syntax
 from tabulate import tabulate

 from vision_agent.agent import Agent
-from vision_agent.agent.
+from vision_agent.agent.data_interpreter_prompts import (
     CODE,
     CODE_SYS_MSG,
     DEBUG,
@@ -25,7 +25,7 @@ from vision_agent.agent.vision_agent_v2_prompts import (
     USER_REQ_SUBTASK_WM_CONTEXT,
 )
 from vision_agent.llm import LLM, OpenAILLM
-from vision_agent.tools
+from vision_agent.tools import TOOL_DESCRIPTIONS, TOOLS_DF
 from vision_agent.utils import Execute, Sim

 logging.basicConfig(level=logging.INFO)
@@ -331,11 +331,11 @@ def run_plan(
     return current_code, current_test, plan, working_memory


-class
-    """
-    solve vision tasks. It is inspired by MetaGPT's Data
-    https://arxiv.org/abs/2402.18679.
-    generate code:
+class DataInterpreter(Agent):
+    """This version of Data Interpreter is an AI agentic framework geared towards
+    outputting Python code to solve vision tasks. It is inspired by MetaGPT's Data
+    Interpreter https://arxiv.org/abs/2402.18679. This version of Data Interpreter has
+    several key features to help it generate code:

     - A planner to generate a plan of tasks to solve a user requirement. The planner
     can output code tasks or test tasks, where test tasks are used to verify the code.
@@ -379,29 +379,29 @@ class VisionAgentV2(Agent):
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-
+        media: Optional[Union[str, Path]] = None,
         plan: Optional[List[Dict[str, Any]]] = None,
     ) -> str:
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        results = self.chat_with_workflow(input,
+        results = self.chat_with_workflow(input, media, plan)
         return results["code"]  # type: ignore

     @traceable
     def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
-
+        media: Optional[Union[str, Path]] = None,
         plan: Optional[List[Dict[str, Any]]] = None,
     ) -> Dict[str, Any]:
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")

-        if
+        if media is not None:
             # append file names to all user messages
             for chat_i in chat:
                 if chat_i["role"] == "user":
-                    chat_i["content"] += f" Image name {
+                    chat_i["content"] += f" Image name {media}"

         working_code = ""
         if plan is not None:
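`DataInterpreter` picks up the same `media` parameter, and `chat_with_workflow` additionally accepts an optional pre-computed `plan`. A minimal sketch, again assuming a default constructor (its arguments are not shown in this diff):

```python
from vision_agent.agent import DataInterpreter

interpreter = DataInterpreter()  # constructor arguments are not shown in this diff
results = interpreter.chat_with_workflow(
    [{"role": "user", "content": "Count the coffee beans in the jar"}],
    media="jar.jpg",  # the image name is appended to the user messages
    plan=None,        # optionally resume from a previously generated plan
)
```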
@@ -74,15 +74,15 @@ CODE = """
 
 # Constraints
 - Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'.
-- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools
+- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools import *` import.
 - You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
-- Use the `save_json` function from `vision_agent.tools
+- Use the `save_json` function from `vision_agent.tools` to save your output as a json file.
 - Write clean, readable, and well-documented code.

 # Output
 While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. Output code in the following format:
 ```python
-from vision_agent.tools
+from vision_agent.tools imoprt *

 # your code goes here
 ```
@@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM
-from vision_agent.tools import TOOLS
+from vision_agent.tools.easytool_tools import TOOLS

 from .agent import Agent
 from .easytool_prompts import (
@@ -272,7 +272,7 @@ class EasyTool(Agent):
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
         """Invoke the vision agent.

@@ -285,14 +285,14 @@ class EasyTool(Agent):
         """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        return self.chat(input,
+        return self.chat(input, media=media)

     def chat_with_workflow(
-        self, chat: List[Dict[str, str]],
+        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
     ) -> Tuple[str, List[Dict]]:
         question = chat[0]["content"]
-        if
-            question += f" Image name: {
+        if media:
+            question += f" Image name: {media}"
         tasks = task_decompose(
             self.task_model,
             question,
@@ -340,7 +340,7 @@ class EasyTool(Agent):
         return answer_summarize(self.answer_model, question, answers), all_tool_results

     def chat(
-        self, chat: List[Dict[str, str]],
+        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
     ) -> str:
-        answer, _ = self.chat_with_workflow(chat,
+        answer, _ = self.chat_with_workflow(chat, media=media)
         return answer
@@ -17,7 +17,7 @@ from vision_agent.agent.easytool_prompts import (
     TASK_DECOMPOSE,
     TASK_TOPOLOGY,
 )
-from vision_agent.agent.
+from vision_agent.agent.easytool_v2_prompts import (
     ANSWER_GENERATE_DEPENDS,
     ANSWER_SUMMARIZE_DEPENDS,
     CHOOSE_PARAMETER_DEPENDS,
@@ -27,7 +27,7 @@ from vision_agent.agent.vision_agent_prompts import (
 )
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM, OpenAILMM
-from vision_agent.tools import TOOLS
+from vision_agent.tools.easytool_tools import TOOLS
 from vision_agent.utils.image_utils import (
     convert_to_b64,
     overlay_bboxes,
@@ -427,9 +427,9 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
     return visualized_images


-class
-    r"""
-    reflection to accomplish tasks, in particular vision tasks.
+class EasyToolV2(Agent):
+    r"""EasyToolV2 is an agent framework that utilizes tools as well as self
+    reflection to accomplish tasks, in particular vision tasks. EasyToolV2 is based
     off of EasyTool https://arxiv.org/abs/2401.06201 and Reflexion
     https://arxiv.org/abs/2303.11366 where it will attempt to complete a task and then
     reflect on whether or not it was able to accomplish the task based off of the plan
@@ -437,8 +437,8 @@ class VisionAgent(Agent):
 
     Example
     -------
-    >>> from vision_agent.agent import
-    >>> agent =
+    >>> from vision_agent.agent import EasyToolV2
+    >>> agent = EasyToolV2()
     >>> resp = agent("If red tomatoes cost $5 each and yellow tomatoes cost $2.50 each, what is the total cost of all the tomatoes in the image?", image="tomatoes.jpg")
     >>> print(resp)
     "The total cost is $57.50."
@@ -453,7 +453,7 @@ class VisionAgent(Agent):
         verbose: bool = False,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
     ):
-        """
+        """EasyToolV2 constructor.

         Parameters:
             task_model: the model to use for task decomposition.
@@ -461,7 +461,7 @@ class VisionAgent(Agent):
             reflect_model: the model to use for self reflection.
             max_retries: maximum number of retries to attempt to complete the task.
             verbose: whether to print more logs.
-            report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple
+            report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple EasyToolV2 instances are running in parallel. This callback ensures that the progress are not mixed up.
         """
         self.task_model = (
             OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
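The expanded docstring describes `report_progress_callback` as a way to keep logs separate when several EasyToolV2 instances run in parallel, for example when streaming to a web application. A minimal sketch of such a callback, assuming only the `Callable[[Dict[str, Any]], None]` type shown in the constructor signature and the `"log"` key used later in this diff:

```python
import logging
from typing import Any, Dict

logging.basicConfig(level=logging.INFO)
_progress_logger = logging.getLogger("easytool_v2.progress")

def report_progress(payload: Dict[str, Any]) -> None:
    # Only the "log" key is visible in this diff; fall back to the whole payload otherwise.
    _progress_logger.info(payload.get("log", payload))

# Hypothetical wiring, based on the constructor parameter shown above:
# agent = EasyToolV2(report_progress_callback=report_progress)
```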
@@ -487,7 +487,7 @@ class VisionAgent(Agent):
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-
+        media: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
         self_reflection: Optional[bool] = True,
@@ -512,7 +512,7 @@ class VisionAgent(Agent):
             input = [{"role": "user", "content": input}]
         return self.chat(
             input,
-
+            media=media,
             visualize_output=visualize_output,
             reference_data=reference_data,
             self_reflection=self_reflection,
@@ -539,12 +539,12 @@ class VisionAgent(Agent):
     def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
-
+        media: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
         self_reflection: Optional[bool] = True,
     ) -> Tuple[str, List[Dict]]:
-        """Chat with
+        """Chat with EasyToolV2 and return the final answer and all tool results.

         Parameters:
             chat: A conversation in the format of
@@ -566,8 +566,8 @@ class VisionAgent(Agent):
             raise ValueError("Input cannot be empty.")

         question = chat[0]["content"]
-        if
-            question += f" Image name: {
+        if media:
+            question += f" Image name: {media}"
         if reference_data:
             question += (
                 f" Reference image: {reference_data['image']}"
@@ -630,8 +630,8 @@ class VisionAgent(Agent):
         all_tool_results.append({"visualized_output": visualized_output})
         if len(visualized_output) > 0:
             reflection_images = sample_n_evenly_spaced(visualized_output, 3)
-        elif
-            reflection_images = [
+        elif media is not None:
+            reflection_images = [media]
         else:
             reflection_images = None

@@ -658,7 +658,7 @@ class VisionAgent(Agent):
         # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
             {
-                "log": f"
+                "log": f"EasyToolV2 has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
             }
         )

@@ -675,14 +675,14 @@ class VisionAgent(Agent):
     def chat(
         self,
         chat: List[Dict[str, str]],
-
+        media: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
         self_reflection: Optional[bool] = True,
     ) -> str:
         answer, _ = self.chat_with_workflow(
             chat,
-
+            media=media,
             visualize_output=visualize_output,
             reference_data=reference_data,
             self_reflection=self_reflection,