vision-agent 0.2.55__tar.gz → 0.2.57__tar.gz
This diff compares the contents of publicly available package versions as released to their respective public registries and is provided for informational purposes only.
- {vision_agent-0.2.55 → vision_agent-0.2.57}/PKG-INFO +48 -15
- {vision_agent-0.2.55 → vision_agent-0.2.57}/README.md +47 -14
- {vision_agent-0.2.55 → vision_agent-0.2.57}/pyproject.toml +1 -1
- vision_agent-0.2.57/vision_agent/__init__.py +2 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/agent.py +3 -1
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/vision_agent.py +110 -81
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/vision_agent_prompts.py +1 -1
- vision_agent-0.2.57/vision_agent/lmm/__init__.py +1 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/lmm/lmm.py +54 -116
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/tools/__init__.py +2 -1
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/tools/tools.py +3 -3
- vision_agent-0.2.55/vision_agent/__init__.py +0 -3
- vision_agent-0.2.55/vision_agent/agent/agent_coder.py +0 -216
- vision_agent-0.2.55/vision_agent/agent/agent_coder_prompts.py +0 -135
- vision_agent-0.2.55/vision_agent/agent/data_interpreter.py +0 -475
- vision_agent-0.2.55/vision_agent/agent/data_interpreter_prompts.py +0 -186
- vision_agent-0.2.55/vision_agent/agent/easytool.py +0 -346
- vision_agent-0.2.55/vision_agent/agent/easytool_prompts.py +0 -89
- vision_agent-0.2.55/vision_agent/agent/easytool_v2.py +0 -781
- vision_agent-0.2.55/vision_agent/agent/easytool_v2_prompts.py +0 -152
- vision_agent-0.2.55/vision_agent/agent/reflexion.py +0 -299
- vision_agent-0.2.55/vision_agent/agent/reflexion_prompts.py +0 -100
- vision_agent-0.2.55/vision_agent/llm/__init__.py +0 -1
- vision_agent-0.2.55/vision_agent/llm/llm.py +0 -176
- vision_agent-0.2.55/vision_agent/lmm/__init__.py +0 -1
- vision_agent-0.2.55/vision_agent/tools/easytool_tools.py +0 -1242
- {vision_agent-0.2.55 → vision_agent-0.2.57}/LICENSE +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.55 → vision_agent-0.2.57}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.55
+Version: 0.2.57
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -38,7 +38,6 @@ Description-Content-Type: text/markdown
 <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">

 # 🔍🤖 Vision Agent
-
 [](https://discord.gg/wPdN8RCYew)

 [](https://badge.fury.io/py/vision-agent)
@@ -52,9 +51,14 @@ accomplish the task you want. Vision Agent aims to provide an in-seconds experie
 allowing users to describe their problem in text and have the agent framework generate
 code to solve the task for them. Check out our discord for updates and roadmaps!

+
+## Web Application
+
+Try Vision Agent live on [va.landing.ai](https://va.landing.ai/)
+
 ## Documentation

-
+[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)


 ## Getting Started
@@ -73,6 +77,7 @@ export OPENAI_API_KEY="your-api-key"
 ```

 ### Vision Agent
+#### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:

 ```python
@@ -88,28 +93,28 @@ from vision_agent.tools import load_image, grounding_sam
 def calculate_filled_percentage(image_path: str) -> float:
     # Step 1: Load the image
     image = load_image(image_path)
-
+
     # Step 2: Segment the jar
     jar_segments = grounding_sam(prompt="jar", image=image)
-
+
     # Step 3: Segment the coffee beans
     coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
-
+
     # Step 4: Calculate the area of the segmented jar
     jar_area = 0
     for segment in jar_segments:
         jar_area += segment['mask'].sum()
-
+
     # Step 5: Calculate the area of the segmented coffee beans
     coffee_beans_area = 0
     for segment in coffee_beans_segments:
         coffee_beans_area += segment['mask'].sum()
-
+
     # Step 6: Compute the percentage of the jar area that is filled with coffee beans
     if jar_area == 0:
         return 0.0 # To avoid division by zero
     filled_percentage = (coffee_beans_area / jar_area) * 100
-
+
     # Step 7: Return the computed percentage
     return filled_percentage
 ```
@@ -121,10 +126,12 @@ mode by passing in the verbose argument:
 >>> agent = VisionAgent(verbose=2)
 ```

-
+#### Detailed Usage
+You can also have it return more information by calling `chat_with_workflow`. The format
+of the input is a list of dictionaries with the keys `role`, `content`, and `media`:

 ```python
->>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"
+>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
 >>> print(results)
 {
     "code": "from vision_agent.tools import ..."
@@ -135,19 +142,45 @@ You can also have it return more information by calling `chat_with_workflow`:
 }
 ```

-With this you can examine more detailed information such as the
+With this you can examine more detailed information such as the testing code, testing
 results, plan or working memory it used to complete the task.

+#### Multi-turn conversations
+You can have multi-turn conversations with vision-agent as well, giving it feedback on
+the code and having it update. You just need to add the code as a response from the
+assistant:
+
+```python
+agent = va.agent.VisionAgent(verbosity=2)
+conv = [
+    {
+        "role": "user",
+        "content": "Are these workers wearing safety gear? Output only a True or False value.",
+        "media": ["workers.png"],
+    }
+]
+result = agent.chat_with_workflow(conv)
+code = result["code"]
+conv.append({"role": "assistant", "content": code})
+conv.append(
+    {
+        "role": "user",
+        "content": "Can you also return the number of workers wearing safety gear?",
+    }
+)
+result = agent.chat_with_workflow(conv)
+```
+
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
-while others are hosted for you. You can also ask an
+while others are hosted for you. You can also ask an LMM directly to build a tool for
 you. For example:

 ```python
 >>> import vision_agent as va
->>> llm = va.llm.
+>>> llm = va.llm.OpenAILMM()
 >>> detector = llm.generate_detector("Can you build a jar detector for me?")
->>> detector("jar.jpg")
+>>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
   "scores": [0.99],
   "bboxes": [
{vision_agent-0.2.55 → vision_agent-0.2.57}/README.md

@@ -2,7 +2,6 @@
 <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">

 # 🔍🤖 Vision Agent
-
 [](https://discord.gg/wPdN8RCYew)

 [](https://badge.fury.io/py/vision-agent)
@@ -16,9 +15,14 @@ accomplish the task you want. Vision Agent aims to provide an in-seconds experie
 allowing users to describe their problem in text and have the agent framework generate
 code to solve the task for them. Check out our discord for updates and roadmaps!

+
+## Web Application
+
+Try Vision Agent live on [va.landing.ai](https://va.landing.ai/)
+
 ## Documentation

-
+[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)


 ## Getting Started
@@ -37,6 +41,7 @@ export OPENAI_API_KEY="your-api-key"
 ```

 ### Vision Agent
+#### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:

 ```python
@@ -52,28 +57,28 @@ from vision_agent.tools import load_image, grounding_sam
 def calculate_filled_percentage(image_path: str) -> float:
     # Step 1: Load the image
     image = load_image(image_path)
-
+
     # Step 2: Segment the jar
     jar_segments = grounding_sam(prompt="jar", image=image)
-
+
     # Step 3: Segment the coffee beans
     coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
-
+
     # Step 4: Calculate the area of the segmented jar
     jar_area = 0
     for segment in jar_segments:
         jar_area += segment['mask'].sum()
-
+
     # Step 5: Calculate the area of the segmented coffee beans
     coffee_beans_area = 0
     for segment in coffee_beans_segments:
         coffee_beans_area += segment['mask'].sum()
-
+
     # Step 6: Compute the percentage of the jar area that is filled with coffee beans
     if jar_area == 0:
         return 0.0 # To avoid division by zero
     filled_percentage = (coffee_beans_area / jar_area) * 100
-
+
     # Step 7: Return the computed percentage
     return filled_percentage
 ```
@@ -85,10 +90,12 @@ mode by passing in the verbose argument:
 >>> agent = VisionAgent(verbose=2)
 ```

-
+#### Detailed Usage
+You can also have it return more information by calling `chat_with_workflow`. The format
+of the input is a list of dictionaries with the keys `role`, `content`, and `media`:

 ```python
->>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"
+>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
 >>> print(results)
 {
     "code": "from vision_agent.tools import ..."
@@ -99,19 +106,45 @@ You can also have it return more information by calling `chat_with_workflow`:
 }
 ```

-With this you can examine more detailed information such as the
+With this you can examine more detailed information such as the testing code, testing
 results, plan or working memory it used to complete the task.

+#### Multi-turn conversations
+You can have multi-turn conversations with vision-agent as well, giving it feedback on
+the code and having it update. You just need to add the code as a response from the
+assistant:
+
+```python
+agent = va.agent.VisionAgent(verbosity=2)
+conv = [
+    {
+        "role": "user",
+        "content": "Are these workers wearing safety gear? Output only a True or False value.",
+        "media": ["workers.png"],
+    }
+]
+result = agent.chat_with_workflow(conv)
+code = result["code"]
+conv.append({"role": "assistant", "content": code})
+conv.append(
+    {
+        "role": "user",
+        "content": "Can you also return the number of workers wearing safety gear?",
+    }
+)
+result = agent.chat_with_workflow(conv)
+```
+
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
-while others are hosted for you. You can also ask an
+while others are hosted for you. You can also ask an LMM directly to build a tool for
 you. For example:

 ```python
 >>> import vision_agent as va
->>> llm = va.llm.
+>>> llm = va.llm.OpenAILMM()
 >>> detector = llm.generate_detector("Can you build a jar detector for me?")
->>> detector("jar.jpg")
+>>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
   "scores": [0.99],
   "bboxes": [
{vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/agent.py

@@ -2,12 +2,14 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union

+from vision_agent.lmm import Message
+

 class Agent(ABC):
     @abstractmethod
     def __call__(
         self,
-        input: Union[
+        input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
     ) -> str:
         pass
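The hunk above switches the abstract `Agent.__call__` signature to the `Message` type that `vision_agent.lmm` now exports. As a rough sketch of what that means for callers, a message is a role/content pair with an optional list of media paths; the `TypedDict` below is an illustrative stand-in inferred from the README examples in this diff, not the package's actual definition:

```python
# Illustrative stand-in for vision_agent.lmm.Message, inferred from the
# README examples in this diff; the real definition in vision_agent/lmm/lmm.py
# may differ in detail.
from typing import List, TypedDict


class Message(TypedDict, total=False):
    role: str         # "user" or "assistant"
    content: str      # text of the turn
    media: List[str]  # optional image/video paths attached to the turn


chat: List[Message] = [
    {"role": "user", "content": "Count the jars in this image.", "media": ["jar.jpg"]}
]
```

Any agent implementing this interface receives either a plain string or such a list.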
{vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/vision_agent.py

@@ -13,7 +13,6 @@ from rich.style import Style
 from rich.syntax import Syntax
 from tabulate import tabulate

-from vision_agent.llm.llm import AzureOpenAILLM
 import vision_agent.tools as T
 from vision_agent.agent import Agent
 from vision_agent.agent.vision_agent_prompts import (
@@ -25,8 +24,7 @@ from vision_agent.agent.vision_agent_prompts import (
     SIMPLE_TEST,
     USER_REQ,
 )
-from vision_agent.
-from vision_agent.lmm import LMM, OpenAILMM
+from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil
@@ -133,11 +131,10 @@ def extract_image(


 def write_plan(
-    chat: List[
+    chat: List[Message],
     tool_desc: str,
     working_memory: str,
-    model:
-    media: Optional[Sequence[Union[str, Path]]] = None,
+    model: LMM,
 ) -> List[Dict[str, str]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -147,18 +144,58 @@ def write_plan(
     context = USER_REQ.format(user_request=user_request)
     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
     chat[-1]["content"] = prompt
-
-
-
-
-
+    return extract_json(model.chat(chat))["plan"]  # type: ignore
+
+
+def write_code(
+    coder: LMM,
+    chat: List[Message],
+    tool_info: str,
+    feedback: str,
+) -> str:
+    chat = copy.deepcopy(chat)
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    user_request = chat[-1]["content"]
+    prompt = CODE.format(
+        docstring=tool_info,
+        question=user_request,
+        feedback=feedback,
+    )
+    chat[-1]["content"] = prompt
+    return extract_code(coder(chat))
+
+
+def write_test(
+    tester: LMM,
+    chat: List[Message],
+    tool_utils: str,
+    code: str,
+    feedback: str,
+    media: Optional[Sequence[Union[str, Path]]] = None,
+) -> str:
+    chat = copy.deepcopy(chat)
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    user_request = chat[-1]["content"]
+    prompt = SIMPLE_TEST.format(
+        docstring=tool_utils,
+        question=user_request,
+        code=code,
+        feedback=feedback,
+        media=media,
+    )
+    chat[-1]["content"] = prompt
+    return extract_code(tester(chat))


 def reflect(
-    chat: List[
+    chat: List[Message],
     plan: str,
     code: str,
-    model:
+    model: LMM,
 ) -> Dict[str, Union[str, bool]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -168,22 +205,22 @@ def reflect(
     context = USER_REQ.format(user_request=user_request)
     prompt = REFLECT.format(context=context, plan=plan, code=code)
     chat[-1]["content"] = prompt
-    return extract_json(model
+    return extract_json(model(chat))


 def write_and_test_code(
-
+    chat: List[Message],
     tool_info: str,
     tool_utils: str,
     working_memory: List[Dict[str, str]],
-    coder:
-    tester:
-    debugger:
+    coder: LMM,
+    tester: LMM,
+    debugger: LMM,
     code_interpreter: CodeInterpreter,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
     max_retries: int = 3,
-
+    media: Optional[Sequence[Union[str, Path]]] = None,
 ) -> Dict[str, Any]:
     log_progress(
         {
@@ -191,25 +228,9 @@ def write_and_test_code(
             "status": "started",
         }
     )
-    code =
-
-
-                docstring=tool_info,
-                question=task,
-                feedback=format_memory(working_memory),
-            )
-        )
-    )
-    test = extract_code(
-        tester(
-            SIMPLE_TEST.format(
-                docstring=tool_utils,
-                question=task,
-                code=code,
-                feedback=working_memory,
-                media=input_media,
-            )
-        )
+    code = write_code(coder, chat, tool_info, format_memory(working_memory))
+    test = write_test(
+        tester, chat, tool_utils, code, format_memory(working_memory), media
     )

     log_progress(
@@ -392,10 +413,10 @@ class VisionAgent(Agent):

     def __init__(
         self,
-        planner: Optional[
-        coder: Optional[
-        tester: Optional[
-        debugger: Optional[
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
         tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
@@ -403,10 +424,10 @@ class VisionAgent(Agent):
        """Initialize the Vision Agent.

        Parameters:
-            planner (Optional[
-            coder (Optional[
-            tester (Optional[
-            debugger (Optional[
+            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
+            debugger (Optional[LMM]): The debugger model to
             tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
@@ -418,12 +439,12 @@ class VisionAgent(Agent):
         """

         self.planner = (
-
+            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
         )
-        self.coder =
-        self.tester =
+        self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
+        self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
         self.debugger = (
-
+            OpenAILMM(temperature=0.0, json_mode=True) if debugger is None else debugger
         )

         self.tool_recommender = (
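The three hunks above replace the old LLM defaults with `OpenAILMM` instances for the planner, coder, tester, and debugger; the remaining `vision_agent.py` hunks continue below. A minimal sketch of overriding those defaults, using only the constructor arguments visible in this diff (any other `OpenAILMM` parameters would need to be checked against `vision_agent/lmm/lmm.py`):

```python
# Sketch: supply your own LMM instances instead of the defaults.
# Only keyword arguments that appear in the diff (temperature, json_mode)
# are used here; everything else is left at its default.
import vision_agent as va
from vision_agent.lmm import OpenAILMM

agent = va.agent.VisionAgent(
    planner=OpenAILMM(temperature=0.0, json_mode=True),   # planning/reflection want JSON output
    coder=OpenAILMM(temperature=0.0),                      # writes the task code
    tester=OpenAILMM(temperature=0.0),                     # writes the test code
    debugger=OpenAILMM(temperature=0.0, json_mode=True),   # repairs failing code, JSON output
    verbosity=2,
)
```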
@@ -437,7 +458,7 @@ class VisionAgent(Agent):

     def __call__(
         self,
-        input: Union[
+        input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
     ) -> str:
         """Chat with Vision Agent and return intermediate information regarding the task.
@@ -454,23 +475,26 @@ class VisionAgent(Agent):

         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-
+            if media is not None:
+                input[0]["media"] = [media]
+        results = self.chat_with_workflow(input)
         results.pop("working_memory")
         return results  # type: ignore

     def chat_with_workflow(
         self,
-        chat: List[
-        media: Optional[Union[str, Path]] = None,
+        chat: List[Message],
         self_reflection: bool = False,
         display_visualization: bool = False,
     ) -> Dict[str, Any]:
         """Chat with Vision Agent and return intermediate information regarding the task.

         Parameters:
-            chat (List[
-
-
+            chat (List[MediaChatItem]): A conversation
+                in the format of:
+                [{"role": "user", "content": "describe your task here..."}]
+                or if it contains media files, it should be in the format of:
+                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
             self_reflection (bool): Whether to reflect on the task and debug the code.
             display_visualization (bool): If True, it opens a new window locally to
                 show the image(s) created by visualization code (if there is any).
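With the `__call__` change above, a plain string prompt plus an optional `media` path is wrapped into a single user message and forwarded to `chat_with_workflow`; the internals of that method follow in the next hunks. A small usage sketch (the image path is a placeholder):

```python
# Sketch of the simple call path: a string prompt plus one media file.
import vision_agent as va

agent = va.agent.VisionAgent(verbosity=2)
result = agent(
    "Are these workers wearing safety gear? Output only a True or False value.",
    media="workers.png",  # placeholder path
)
# Per the hunk above, __call__ returns the chat_with_workflow results dict
# (minus "working_memory") despite the `-> str` annotation, so the generated
# code is available as:
print(result["code"])
```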
@@ -485,11 +509,19 @@ class VisionAgent(Agent):

         # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
         with CodeInterpreterFactory.new_instance() as code_interpreter:
-
-
-
-
-
+            chat = copy.deepcopy(chat)
+            media_list = []
+            for chat_i in chat:
+                if "media" in chat_i:
+                    for media in chat_i["media"]:
+                        media = code_interpreter.upload_file(media)
+                        chat_i["content"] += f" Media name {media}"  # type: ignore
+                        media_list.append(media)
+
+            int_chat = cast(
+                List[Message],
+                [{"role": c["role"], "content": c["content"]} for c in chat],
+            )

             code = ""
             test = ""
@@ -507,11 +539,10 @@ class VisionAgent(Agent):
                     }
                 )
                 plan_i = write_plan(
-
+                    int_chat,
                     T.TOOL_DESCRIPTIONS,
                     format_memory(working_memory),
                     self.planner,
-                    media=[media] if media else None,
                 )
                 plan_i_str = "\n-".join([e["instructions"] for e in plan_i])

@@ -534,9 +565,7 @@ class VisionAgent(Agent):
                     self.verbosity,
                 )
                 results = write_and_test_code(
-
-                        user_request=chat[0]["content"], subtasks=plan_i_str
-                    ),
+                    chat=int_chat,
                     tool_info=tool_info,
                     tool_utils=T.UTILITIES_DOCSTRING,
                     working_memory=working_memory,
@@ -546,7 +575,7 @@ class VisionAgent(Agent):
                     code_interpreter=code_interpreter,
                     log_progress=self.log_progress,
                     verbosity=self.verbosity,
-
+                    media=media_list,
                 )
                 success = cast(bool, results["success"])
                 code = cast(str, results["code"])
@@ -564,7 +593,7 @@ class VisionAgent(Agent):
                         }
                     )
                     reflection = reflect(
-
+                        int_chat,
                         FULL_TASK.format(
                             user_request=chat[0]["content"], subtasks=plan_i_str
                         ),
@@ -634,10 +663,10 @@ class AzureVisionAgent(VisionAgent):

     def __init__(
         self,
-        planner: Optional[
-        coder: Optional[
-        tester: Optional[
-        debugger: Optional[
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
         tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
@@ -645,10 +674,10 @@ class AzureVisionAgent(VisionAgent):
         """Initialize the Vision Agent.

         Parameters:
-            planner (Optional[
-            coder (Optional[
-            tester (Optional[
-            debugger (Optional[
+            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
+            debugger (Optional[LMM]): The debugger model to
             tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
@@ -660,14 +689,14 @@ class AzureVisionAgent(VisionAgent):
         """
         super().__init__(
             planner=(
-
+                AzureOpenAILMM(temperature=0.0, json_mode=True)
                 if planner is None
                 else planner
             ),
-            coder=
-            tester=
+            coder=AzureOpenAILMM(temperature=0.0) if coder is None else coder,
+            tester=AzureOpenAILMM(temperature=0.0) if tester is None else tester,
             debugger=(
-
+                AzureOpenAILMM(temperature=0.0, json_mode=True)
                 if debugger is None
                 else debugger
             ),
{vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/vision_agent_prompts.py

@@ -171,7 +171,7 @@ This is the documentation for the functions you have access to. You may call any
 **Instructions**:
 1. Verify the fundamental functionality under normal conditions.
 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
-3. Your test case MUST run only on the given
+3. Your test case MUST run only on the given images which are {media}
 4. Your test case MUST run only with the given values which is available in the question - {question}
 5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions.
 6. DO NOT mock any functions, you must test their functionality as is.
vision_agent-0.2.57/vision_agent/lmm/__init__.py

@@ -0,0 +1 @@
+from .lmm import LMM, AzureOpenAILMM, Message, OpenAILMM