vision-agent 0.2.29__tar.gz → 0.2.31__tar.gz
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in that registry.
- vision_agent-0.2.31/PKG-INFO +175 -0
- vision_agent-0.2.31/README.md +141 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/pyproject.toml +1 -1
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/__init__.py +2 -2
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/agent.py +2 -2
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/agent_coder.py +8 -8
- vision_agent-0.2.29/vision_agent/agent/vision_agent_v2.py → vision_agent-0.2.31/vision_agent/agent/data_interpreter.py +12 -12
- vision_agent-0.2.29/vision_agent/agent/vision_agent_v2_prompts.py → vision_agent-0.2.31/vision_agent/agent/data_interpreter_prompts.py +3 -3
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/easytool.py +8 -8
- vision_agent-0.2.29/vision_agent/agent/vision_agent.py → vision_agent-0.2.31/vision_agent/agent/easytool_v2.py +20 -20
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/reflexion.py +8 -8
- vision_agent-0.2.29/vision_agent/agent/vision_agent_v3.py → vision_agent-0.2.31/vision_agent/agent/vision_agent.py +78 -17
- vision_agent-0.2.29/vision_agent/agent/vision_agent_v3_prompts.py → vision_agent-0.2.31/vision_agent/agent/vision_agent_prompts.py +13 -5
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/llm/llm.py +3 -4
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/lmm/lmm.py +6 -6
- vision_agent-0.2.31/vision_agent/tools/__init__.py +24 -0
- vision_agent-0.2.29/PKG-INFO +0 -226
- vision_agent-0.2.29/README.md +0 -192
- vision_agent-0.2.29/vision_agent/tools/__init__.py +0 -25
- {vision_agent-0.2.29 → vision_agent-0.2.31}/LICENSE +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/agent_coder_prompts.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/easytool_prompts.py +0 -0
- /vision_agent-0.2.29/vision_agent/agent/vision_agent_prompts.py → /vision_agent-0.2.31/vision_agent/agent/easytool_v2_prompts.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/reflexion_prompts.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/llm/__init__.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/lmm/__init__.py +0 -0
- /vision_agent-0.2.29/vision_agent/tools/tools.py → /vision_agent-0.2.31/vision_agent/tools/easytool_tools.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/tools/tool_utils.py +0 -0
- /vision_agent-0.2.29/vision_agent/tools/tools_v2.py → /vision_agent-0.2.31/vision_agent/tools/tools.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/video.py +0 -0
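Taken together, the renames promote the newer agents to the primary names. Below is a minimal sketch of how caller imports shift between the two versions, assuming the exported class names follow the module renames listed above (the agent-class exports are confirmed by the `__init__.py` diff further down); the caller code itself is hypothetical:

```python
# 0.2.29 module / class                              ->  0.2.31 module / class
# vision_agent.agent.vision_agent_v3.VisionAgentV3   ->  vision_agent.agent.vision_agent.VisionAgent
# vision_agent.agent.vision_agent_v2.VisionAgentV2   ->  vision_agent.agent.data_interpreter.DataInterpreter
# vision_agent.agent.vision_agent.VisionAgent        ->  vision_agent.agent.easytool_v2.EasyToolV2
# vision_agent.tools.tools_v2                        ->  vision_agent.tools.tools
# vision_agent.tools.tools                           ->  vision_agent.tools.easytool_tools

# New-style imports against 0.2.31:
from vision_agent.agent import VisionAgent, DataInterpreter, EasyToolV2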
--- /dev/null
+++ vision_agent-0.2.31/PKG-INFO
@@ -0,0 +1,175 @@
+Metadata-Version: 2.1
+Name: vision-agent
+Version: 0.2.31
+Summary: Toolset for Vision Agent
+Author: Landing AI
+Author-email: dev@landing.ai
+Requires-Python: >=3.9,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+Requires-Dist: langsmith (>=0.1.58,<0.2.0)
+Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+Requires-Dist: nbformat (>=5.10.4,<6.0.0)
+Requires-Dist: numpy (>=1.21.0,<2.0.0)
+Requires-Dist: openai (>=1.0.0,<2.0.0)
+Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
+Requires-Dist: pandas (>=2.0.0,<3.0.0)
+Requires-Dist: pillow (>=10.0.0,<11.0.0)
+Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+Requires-Dist: requests (>=2.0.0,<3.0.0)
+Requires-Dist: rich (>=13.7.1,<14.0.0)
+Requires-Dist: scipy (>=1.13.0,<1.14.0)
+Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
+Project-URL: Homepage, https://landing.ai
+Project-URL: documentation, https://github.com/landing-ai/vision-agent
+Project-URL: repository, https://github.com/landing-ai/vision-agent
+Description-Content-Type: text/markdown
+
+<div align="center">
+    <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+
+# 🔍🤖 Vision Agent
+
+[](https://discord.gg/wPdN8RCYew)
+
+[](https://badge.fury.io/py/vision-agent)
+
+</div>
+
+Vision Agent is a library that helps you utilize agent frameworks to generate code to
+solve your vision task. Many current vision problems can easily take hours or days to
+solve, you need to find the right model, figure out how to use it and program it to
+accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
+allowing users to describe their problem in text and have the agent framework generate
+code to solve the task for them. Check out our discord for updates and roadmaps!
+
+## Documentation
+
+- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+
+## Getting Started
+### Installation
+To get started, you can install the library using pip:
+
+```bash
+pip install vision-agent
+```
+
+Ensure you have an OpenAI API key and set it as an environment variable (if you are
+using Azure OpenAI please see the Azure setup section):
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+```
+
+### Vision Agent
+You can interact with the agent as you would with any LLM or LMM model:
+
+```python
+>>> from vision_agent.agent import VisionAgent
+>>> agent = VisionAgent()
+>>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+```
+
+Which produces the following code:
+```python
+from vision_agent.tools import load_image, grounding_sam
+
+def calculate_filled_percentage(image_path: str) -> float:
+    # Step 1: Load the image
+    image = load_image(image_path)
+
+    # Step 2: Segment the jar
+    jar_segments = grounding_sam(prompt="jar", image=image)
+
+    # Step 3: Segment the coffee beans
+    coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
+
+    # Step 4: Calculate the area of the segmented jar
+    jar_area = 0
+    for segment in jar_segments:
+        jar_area += segment['mask'].sum()
+
+    # Step 5: Calculate the area of the segmented coffee beans
+    coffee_beans_area = 0
+    for segment in coffee_beans_segments:
+        coffee_beans_area += segment['mask'].sum()
+
+    # Step 6: Compute the percentage of the jar area that is filled with coffee beans
+    if jar_area == 0:
+        return 0.0  # To avoid division by zero
+    filled_percentage = (coffee_beans_area / jar_area) * 100
+
+    # Step 7: Return the computed percentage
+    return filled_percentage
+```
+
+To better understand how the model came up with it's answer, you can run it in debug
+mode by passing in the verbose argument:
+
+```python
+>>> agent = VisionAgent(verbose=2)
+```
+
+You can also have it return more information by calling `chat_with_workflow`:
+
+```python
+>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+>>> print(results)
+{
+    "code": "from vision_agent.tools import ..."
+    "test": "calculate_filled_percentage('jar.jpg')",
+    "test_result": "...",
+    "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+    "working_memory": ...,
+}
+```
+
+With this you can examine more detailed information such as the etesting code, testing
+results, plan or working memory it used to complete the task.
+
+### Tools
+There are a variety of tools for the model or the user to use. Some are executed locally
+while others are hosted for you. You can also ask an LLM directly to build a tool for
+you. For example:
+
+```python
+>>> import vision_agent as va
+>>> llm = va.llm.OpenAILLM()
+>>> detector = llm.generate_detector("Can you build a jar detector for me?")
+>>> detector("jar.jpg")
+[{"labels": ["jar",],
+    "scores": [0.99],
+    "bboxes": [
+        [0.58, 0.2, 0.72, 0.45],
+    ]
+}]
+```
+
+### Azure Setup
+If you want to use Azure OpenAI models, you can set the environment variable:
+
+```bash
+export AZURE_OPENAI_API_KEY="your-api-key"
+export AZURE_OPENAI_ENDPOINT="your-endpoint"
+```
+
+You can then run Vision Agent using the Azure OpenAI models:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.VisionAgent(
+>>>     task_model=va.llm.AzureOpenAILLM(),
+>>>     answer_model=va.lmm.AzureOpenAILMM(),
+>>>     reflection_model=va.lmm.AzureOpenAILMM(),
+>>> )
+```
+
+
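The metadata above can be cross-checked against an installed copy with the standard library; a small sketch, assuming only that `vision-agent` is installed in the current environment:

```python
from importlib.metadata import requires, version

print(version("vision-agent"))      # expected to print 0.2.31 for this release
for req in requires("vision-agent") or []:
    print(req)                      # mirrors the Requires-Dist lines in PKG-INFO
```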
--- /dev/null
+++ vision_agent-0.2.31/README.md
@@ -0,0 +1,141 @@
+<div align="center">
+    <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+
+# 🔍🤖 Vision Agent
+
+[](https://discord.gg/wPdN8RCYew)
+
+[](https://badge.fury.io/py/vision-agent)
+
+</div>
+
+Vision Agent is a library that helps you utilize agent frameworks to generate code to
+solve your vision task. Many current vision problems can easily take hours or days to
+solve, you need to find the right model, figure out how to use it and program it to
+accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
+allowing users to describe their problem in text and have the agent framework generate
+code to solve the task for them. Check out our discord for updates and roadmaps!
+
+## Documentation
+
+- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+
+## Getting Started
+### Installation
+To get started, you can install the library using pip:
+
+```bash
+pip install vision-agent
+```
+
+Ensure you have an OpenAI API key and set it as an environment variable (if you are
+using Azure OpenAI please see the Azure setup section):
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+```
+
+### Vision Agent
+You can interact with the agent as you would with any LLM or LMM model:
+
+```python
+>>> from vision_agent.agent import VisionAgent
+>>> agent = VisionAgent()
+>>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+```
+
+Which produces the following code:
+```python
+from vision_agent.tools import load_image, grounding_sam
+
+def calculate_filled_percentage(image_path: str) -> float:
+    # Step 1: Load the image
+    image = load_image(image_path)
+
+    # Step 2: Segment the jar
+    jar_segments = grounding_sam(prompt="jar", image=image)
+
+    # Step 3: Segment the coffee beans
+    coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
+
+    # Step 4: Calculate the area of the segmented jar
+    jar_area = 0
+    for segment in jar_segments:
+        jar_area += segment['mask'].sum()
+
+    # Step 5: Calculate the area of the segmented coffee beans
+    coffee_beans_area = 0
+    for segment in coffee_beans_segments:
+        coffee_beans_area += segment['mask'].sum()
+
+    # Step 6: Compute the percentage of the jar area that is filled with coffee beans
+    if jar_area == 0:
+        return 0.0  # To avoid division by zero
+    filled_percentage = (coffee_beans_area / jar_area) * 100
+
+    # Step 7: Return the computed percentage
+    return filled_percentage
+```
+
+To better understand how the model came up with it's answer, you can run it in debug
+mode by passing in the verbose argument:
+
+```python
+>>> agent = VisionAgent(verbose=2)
+```
+
+You can also have it return more information by calling `chat_with_workflow`:
+
+```python
+>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+>>> print(results)
+{
+    "code": "from vision_agent.tools import ..."
+    "test": "calculate_filled_percentage('jar.jpg')",
+    "test_result": "...",
+    "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+    "working_memory": ...,
+}
+```
+
+With this you can examine more detailed information such as the etesting code, testing
+results, plan or working memory it used to complete the task.
+
+### Tools
+There are a variety of tools for the model or the user to use. Some are executed locally
+while others are hosted for you. You can also ask an LLM directly to build a tool for
+you. For example:
+
+```python
+>>> import vision_agent as va
+>>> llm = va.llm.OpenAILLM()
+>>> detector = llm.generate_detector("Can you build a jar detector for me?")
+>>> detector("jar.jpg")
+[{"labels": ["jar",],
+    "scores": [0.99],
+    "bboxes": [
+        [0.58, 0.2, 0.72, 0.45],
+    ]
+}]
+```
+
+### Azure Setup
+If you want to use Azure OpenAI models, you can set the environment variable:
+
+```bash
+export AZURE_OPENAI_API_KEY="your-api-key"
+export AZURE_OPENAI_ENDPOINT="your-endpoint"
+```
+
+You can then run Vision Agent using the Azure OpenAI models:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.VisionAgent(
+>>>     task_model=va.llm.AzureOpenAILLM(),
+>>>     answer_model=va.lmm.AzureOpenAILMM(),
+>>>     reflection_model=va.lmm.AzureOpenAILMM(),
+>>> )
+```
+
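The README above documents the keys returned by `chat_with_workflow`. A hedged sketch of consuming that result dict, assuming only the keys shown there (`code`, `test`, `test_result`, `plan`, `working_memory`) and an `OPENAI_API_KEY` in the environment; the output file name is arbitrary:

```python
from vision_agent.agent import VisionAgent

agent = VisionAgent(verbose=2)
results = agent.chat_with_workflow(
    [{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}],
    media="jar.jpg",
)

# Persist the generated solution and its test so they can be rerun outside the agent.
with open("generated_solution.py", "w") as f:
    f.write(results["code"] + "\n\n" + results["test"] + "\n")

print(results["test_result"])   # output of the agent's own test run
for step in results["plan"]:    # each step carries its own "code", "test" and "plan"
    print(step["plan"])
```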
--- vision_agent-0.2.29/vision_agent/agent/__init__.py
+++ vision_agent-0.2.31/vision_agent/agent/__init__.py
@@ -1,7 +1,7 @@
 from .agent import Agent
 from .agent_coder import AgentCoder
+from .data_interpreter import DataInterpreter
 from .easytool import EasyTool
+from .easytool_v2 import EasyToolV2
 from .reflexion import Reflexion
 from .vision_agent import VisionAgent
-from .vision_agent_v2 import VisionAgentV2
-from .vision_agent_v3 import VisionAgentV3
--- vision_agent-0.2.29/vision_agent/agent/agent.py
+++ vision_agent-0.2.31/vision_agent/agent/agent.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 
 class Agent(ABC):
@@ -8,7 +8,7 @@ class Agent(ABC):
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-        image: Optional[Union[str, Path]] = None,
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
         pass
 
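The abstract `__call__` now threads a `media` path instead of an image-specific argument. A self-contained sketch of the updated signature, mirroring the hunk above rather than importing the package (the `EchoAgent` subclass is purely illustrative):

```python
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Union


class Agent(ABC):
    # Same __call__ signature as vision_agent/agent/agent.py after this change.
    @abstractmethod
    def __call__(
        self,
        input: Union[List[Dict[str, str]], str],
        media: Optional[Union[str, Path]] = None,
    ) -> str:
        pass


class EchoAgent(Agent):
    """Toy subclass: restates the request and the media path it was given."""

    def __call__(self, input, media=None):
        if isinstance(input, list):
            input = input[0]["content"]
        return f"request={input!r} media={media}"


print(EchoAgent()("count the jars", media="jar.jpg"))
```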
--- vision_agent-0.2.29/vision_agent/agent/agent_coder.py
+++ vision_agent-0.2.31/vision_agent/agent/agent_coder.py
@@ -3,7 +3,7 @@ import logging
 import os
 import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from rich.console import Console
 from rich.syntax import Syntax
@@ -18,7 +18,7 @@ from vision_agent.agent.agent_coder_prompts import (
 )
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM, OpenAILMM
-from vision_agent.tools
+from vision_agent.tools import TOOL_DOCSTRING, UTILITIES_DOCSTRING
 from vision_agent.utils import Execute
 
 IMPORT_HELPER = """
@@ -38,7 +38,7 @@ import numpy as np
 import string
 from typing import *
 from collections import *
-from vision_agent.tools
+from vision_agent.tools import *
 """
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
@@ -150,20 +150,20 @@ class AgentCoder(Agent):
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-        image: Optional[Union[str, Path]] = None,
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        return self.chat(input, image)
+        return self.chat(input, media)
 
     def chat(
         self,
         input: List[Dict[str, str]],
-        image: Optional[Union[str, Path]] = None,
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
         question = input[0]["content"]
-        if image:
-            question += f" Input file path: {os.path.abspath(image)}"
+        if media:
+            question += f" Input file path: {os.path.abspath(media)}"
 
         code = ""
         feedback = ""
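For AgentCoder the change is the same rename: callers now pass `media`, which `__call__` forwards to `chat`. A usage sketch, assuming the default constructor works with an `OPENAI_API_KEY` in the environment and that `jar.jpg` exists locally; the prompt is illustrative:

```python
from vision_agent.agent import AgentCoder

agent = AgentCoder()
# __call__ wraps the string into a chat message and calls chat(input, media).
code = agent("Count the coffee beans in the image", media="jar.jpg")
print(code)
```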
--- vision_agent-0.2.29/vision_agent/agent/vision_agent_v2.py
+++ vision_agent-0.2.31/vision_agent/agent/data_interpreter.py
@@ -10,7 +10,7 @@ from rich.syntax import Syntax
 from tabulate import tabulate
 
 from vision_agent.agent import Agent
-from vision_agent.agent.vision_agent_v2_prompts import (
+from vision_agent.agent.data_interpreter_prompts import (
     CODE,
     CODE_SYS_MSG,
     DEBUG,
@@ -25,7 +25,7 @@ from vision_agent.agent.vision_agent_v2_prompts import (
     USER_REQ_SUBTASK_WM_CONTEXT,
 )
 from vision_agent.llm import LLM, OpenAILLM
-from vision_agent.tools
+from vision_agent.tools import TOOL_DESCRIPTIONS, TOOLS_DF
 from vision_agent.utils import Execute, Sim
 
 logging.basicConfig(level=logging.INFO)
@@ -331,11 +331,11 @@ def run_plan(
     return current_code, current_test, plan, working_memory
 
 
-class VisionAgentV2(Agent):
-    """
-    solve vision tasks. It is inspired by MetaGPT's Data
-    https://arxiv.org/abs/2402.18679.
-    generate code:
+class DataInterpreter(Agent):
+    """This version of Data Interpreter is an AI agentic framework geared towards
+    outputting Python code to solve vision tasks. It is inspired by MetaGPT's Data
+    Interpreter https://arxiv.org/abs/2402.18679. This version of Data Interpreter has
+    several key features to help it generate code:
 
     - A planner to generate a plan of tasks to solve a user requirement. The planner
     can output code tasks or test tasks, where test tasks are used to verify the code.
@@ -379,29 +379,29 @@ class VisionAgentV2(Agent):
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-        image: Optional[Union[str, Path]] = None,
+        media: Optional[Union[str, Path]] = None,
         plan: Optional[List[Dict[str, Any]]] = None,
     ) -> str:
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        results = self.chat_with_workflow(input, image, plan)
+        results = self.chat_with_workflow(input, media, plan)
         return results["code"]  # type: ignore
 
     @traceable
     def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
-        image: Optional[Union[str, Path]] = None,
+        media: Optional[Union[str, Path]] = None,
         plan: Optional[List[Dict[str, Any]]] = None,
     ) -> Dict[str, Any]:
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")
 
-        if image is not None:
+        if media is not None:
             # append file names to all user messages
             for chat_i in chat:
                 if chat_i["role"] == "user":
-                    chat_i["content"] += f" Image name {image}"
+                    chat_i["content"] += f" Image name {media}"
 
         working_code = ""
         if plan is not None:
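DataInterpreter keeps the old VisionAgentV2 workflow under its new name. A sketch of the two call shapes visible in the hunks above (`media` plus an optional pre-computed `plan`); the request text is illustrative and the default constructor is assumed:

```python
from vision_agent.agent import DataInterpreter

agent = DataInterpreter()

# Plain call: returns only the generated code string.
code = agent("Save the number of jars in the image to output.json", media="jar.jpg")

# Workflow call: same chat format, optionally seeded with an existing plan.
results = agent.chat_with_workflow(
    [{"role": "user", "content": "Save the number of jars in the image to output.json"}],
    media="jar.jpg",
    plan=None,
)
print(results["code"])
```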
--- vision_agent-0.2.29/vision_agent/agent/vision_agent_v2_prompts.py
+++ vision_agent-0.2.31/vision_agent/agent/data_interpreter_prompts.py
@@ -74,15 +74,15 @@ CODE = """
 
 # Constraints
 - Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'.
-- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools
+- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools import *` import.
 - You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
-- Use the `save_json` function from `vision_agent.tools
+- Use the `save_json` function from `vision_agent.tools` to save your output as a json file.
 - Write clean, readable, and well-documented code.
 
 # Output
 While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. Output code in the following format:
 ```python
-from vision_agent.tools
+from vision_agent.tools imoprt *
 
 # your code goes here
 ```
--- vision_agent-0.2.29/vision_agent/agent/easytool.py
+++ vision_agent-0.2.31/vision_agent/agent/easytool.py
@@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM
-from vision_agent.tools import TOOLS
+from vision_agent.tools.easytool_tools import TOOLS
 
 from .agent import Agent
 from .easytool_prompts import (
@@ -272,7 +272,7 @@ class EasyTool(Agent):
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-        image: Optional[Union[str, Path]] = None,
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
         """Invoke the vision agent.
 
@@ -285,14 +285,14 @@ class EasyTool(Agent):
         """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        return self.chat(input, image=image)
+        return self.chat(input, media=media)
 
     def chat_with_workflow(
-        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
     ) -> Tuple[str, List[Dict]]:
         question = chat[0]["content"]
-        if image:
-            question += f" Image name: {image}"
+        if media:
+            question += f" Image name: {media}"
         tasks = task_decompose(
             self.task_model,
             question,
@@ -340,7 +340,7 @@ class EasyTool(Agent):
         return answer_summarize(self.answer_model, question, answers), all_tool_results
 
     def chat(
-        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+        self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
     ) -> str:
-        answer, _ = self.chat_with_workflow(chat, image=image)
+        answer, _ = self.chat_with_workflow(chat, media=media)
         return answer
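EasyTool's entry points pick up the same `media` keyword. A usage sketch mirroring the signatures in the hunks above, assuming the default constructor and an `OPENAI_API_KEY` in the environment; the question is illustrative:

```python
from vision_agent.agent import EasyTool

agent = EasyTool()
answer, tool_results = agent.chat_with_workflow(
    [{"role": "user", "content": "How many jars are in the image?"}],
    media="jar.jpg",
)
print(answer)        # summarized answer
print(tool_results)  # per-task tool calls and their outputs
```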