vision-agent 0.2.91__tar.gz → 0.2.92__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.91 → vision_agent-0.2.92}/PKG-INFO +42 -12
- {vision_agent-0.2.91 → vision_agent-0.2.92}/README.md +41 -11
- {vision_agent-0.2.91 → vision_agent-0.2.92}/pyproject.toml +1 -1
- vision_agent-0.2.92/vision_agent/agent/__init__.py +3 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/agent/agent.py +1 -1
- vision_agent-0.2.92/vision_agent/agent/agent_utils.py +43 -0
- vision_agent-0.2.92/vision_agent/agent/vision_agent.py +230 -0
- vision_agent-0.2.91/vision_agent/agent/vision_agent.py → vision_agent-0.2.92/vision_agent/agent/vision_agent_coder.py +112 -153
- vision_agent-0.2.91/vision_agent/agent/vision_agent_prompts.py → vision_agent-0.2.92/vision_agent/agent/vision_agent_coder_prompts.py +3 -2
- vision_agent-0.2.92/vision_agent/agent/vision_agent_prompts.py +114 -0
- vision_agent-0.2.92/vision_agent/lmm/__init__.py +2 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/lmm/lmm.py +3 -5
- vision_agent-0.2.92/vision_agent/lmm/types.py +5 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/tools/__init__.py +1 -0
- vision_agent-0.2.92/vision_agent/tools/meta_tools.py +402 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/tools/tool_utils.py +47 -1
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/tools/tools.py +7 -49
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/execute.py +52 -76
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/image_utils.py +1 -1
- vision_agent-0.2.91/vision_agent/agent/__init__.py +0 -2
- vision_agent-0.2.91/vision_agent/lmm/__init__.py +0 -1
- {vision_agent-0.2.91 → vision_agent-0.2.92}/LICENSE +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.91 → vision_agent-0.2.92}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.91
+Version: 0.2.92
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -57,7 +57,7 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 
 ## Web Application
 
-Try Vision Agent live on [va.landing.ai](https://va.landing.ai/)
+Try Vision Agent live on (note this may not be running the most up-to-date version) [va.landing.ai](https://va.landing.ai/)
 
 ## Documentation
 
@@ -79,16 +79,44 @@ using Azure OpenAI please see the Azure setup section):
 export OPENAI_API_KEY="your-api-key"
 ```
 
-### Important Note on API Usage
-Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
-
 ### Vision Agent
+There are two agents that you can use. Vision Agent is a conversational agent that has
+access to tools that allow it to write and navigate python code and file systems. It can
+converse with the user in natural language. VisionAgentCoder is an agent that can write
+code for vision tasks, such as counting people in an image. However, it cannot converse
+and can only respond with code. VisionAgent can call VisionAgentCoder to write vision
+code.
+
 #### Basic Usage
-
+To run the streamlit app locally to chat with Vision Agent, you can run the following
+command:
+
+```bash
+pip install -r examples/chat/requirements.txt
+export WORKSPACE=/path/to/your/workspace
+export ZMQ_PORT=5555
+streamlit run examples/chat/app.py
+```
+You can find more details about the streamlit app [here](examples/chat/).
 
+#### Basic Programmatic Usage
 ```python
 >>> from vision_agent.agent import VisionAgent
 >>> agent = VisionAgent()
+>>> resp = agent("Hello")
+>>> print(resp)
+[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
+>>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]})
+>>> resp = agent(resp)
+```
+
+### Vision Agent Coder
+#### Basic Usage
+You can interact with the agent as you would with any LLM or LMM model:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder()
 >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
 ```
 
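Putting the programmatic usage above together: the `VisionAgent` call returns the whole message history, so a multi-turn exchange just appends to the returned list and calls the agent again. A minimal sketch under that assumption (the API is exactly as shown in this diff; `people.jpg` is a placeholder path):

```python
from vision_agent.agent import VisionAgent

agent = VisionAgent()

# The agent returns the full conversation as a list of role/content messages.
chat = agent("Hello")

# Append a follow-up turn with an attached image and run the agent again.
chat.append(
    {
        "role": "user",
        "content": "Can you count the number of people in this image?",
        "media": ["people.jpg"],  # placeholder path
    }
)
chat = agent(chat)

# The last message in the returned history holds the agent's latest reply.
print(chat[-1]["content"])
```

Returning the full history keeps the agent stateless between calls; all conversational context lives in the list you pass back in.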
@@ -129,7 +157,7 @@ To better understand how the model came up with it's answer, you can run it in d
 mode by passing in the verbose argument:
 
 ```python
->>> agent = VisionAgent(verbose=2)
+>>> agent = VisionAgentCoder(verbose=2)
 ```
 
 #### Detailed Usage
@@ -219,9 +247,11 @@ def custom_tool(image_path: str) -> str:
     return np.zeros((10, 10))
 ```
 
-You need to ensure you call `@va.tools.register_tool` with any imports it uses.
-
-
+You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
+variables will not be captured by `register_tool` so you need to include them in the
+function. Make sure the documentation is in the same format above with description,
+`Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
+[here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
 
 ### Azure Setup
 If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
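For reference, a tool that satisfies the requirements in this hunk would look roughly like the sketch below. It reassembles the `custom_tool` context lines visible in the hunk; the `imports=` keyword on `register_tool` is an assumption based on the instruction to pass the tool's imports along, and the body is the README's own placeholder:

```python
import vision_agent as va
import numpy as np

# The docstring must follow the documented layout (description, then
# `Parameters:`, `Returns:`, and an `Example` section) because the agent
# reads it to decide when and how to call the tool.
@va.tools.register_tool(imports=["import numpy as np"])
def custom_tool(image_path: str) -> str:
    """My custom tool documentation.

    Parameters:
        image_path (str): The path to the image.

    Returns:
        str: The result of the tool.

    Example
    -------
    >>> custom_tool("image.jpg")
    """
    return np.zeros((10, 10))
```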
@@ -248,7 +278,7 @@ You can then run Vision Agent using the Azure OpenAI models:
 
 ```python
 import vision_agent as va
-agent = va.agent.AzureVisionAgent()
+agent = va.agent.AzureVisionAgentCoder()
 ```
 
 ******************************************************************************************************************************
@@ -257,7 +287,7 @@ agent = va.agent.AzureVisionAgent()
 
 #### How to get started with OpenAI API credits
 
-1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
+1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
 2. Follow the instructions to purchase and manage your API credits.
 3. Ensure your API key is correctly configured in your project settings.
 
{vision_agent-0.2.91 → vision_agent-0.2.92}/README.md

@@ -18,7 +18,7 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 
 ## Web Application
 
-Try Vision Agent live on [va.landing.ai](https://va.landing.ai/)
+Try Vision Agent live on (note this may not be running the most up-to-date version) [va.landing.ai](https://va.landing.ai/)
 
 ## Documentation
 
@@ -40,16 +40,44 @@ using Azure OpenAI please see the Azure setup section):
 export OPENAI_API_KEY="your-api-key"
 ```
 
-### Important Note on API Usage
-Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
-
 ### Vision Agent
+There are two agents that you can use. Vision Agent is a conversational agent that has
+access to tools that allow it to write and navigate python code and file systems. It can
+converse with the user in natural language. VisionAgentCoder is an agent that can write
+code for vision tasks, such as counting people in an image. However, it cannot converse
+and can only respond with code. VisionAgent can call VisionAgentCoder to write vision
+code.
+
 #### Basic Usage
-
+To run the streamlit app locally to chat with Vision Agent, you can run the following
+command:
+
+```bash
+pip install -r examples/chat/requirements.txt
+export WORKSPACE=/path/to/your/workspace
+export ZMQ_PORT=5555
+streamlit run examples/chat/app.py
+```
+You can find more details about the streamlit app [here](examples/chat/).
 
+#### Basic Programmatic Usage
 ```python
 >>> from vision_agent.agent import VisionAgent
 >>> agent = VisionAgent()
+>>> resp = agent("Hello")
+>>> print(resp)
+[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
+>>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]})
+>>> resp = agent(resp)
+```
+
+### Vision Agent Coder
+#### Basic Usage
+You can interact with the agent as you would with any LLM or LMM model:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder()
 >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
 ```
 
@@ -90,7 +118,7 @@ To better understand how the model came up with it's answer, you can run it in d
 mode by passing in the verbose argument:
 
 ```python
->>> agent = VisionAgent(verbose=2)
+>>> agent = VisionAgentCoder(verbose=2)
 ```
 
 #### Detailed Usage
@@ -180,9 +208,11 @@ def custom_tool(image_path: str) -> str:
     return np.zeros((10, 10))
 ```
 
-You need to ensure you call `@va.tools.register_tool` with any imports it uses.
-
-
+You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
+variables will not be captured by `register_tool` so you need to include them in the
+function. Make sure the documentation is in the same format above with description,
+`Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
+[here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
 
 ### Azure Setup
 If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
@@ -209,7 +239,7 @@ You can then run Vision Agent using the Azure OpenAI models:
 
 ```python
 import vision_agent as va
-agent = va.agent.AzureVisionAgent()
+agent = va.agent.AzureVisionAgentCoder()
 ```
 
 ******************************************************************************************************************************
@@ -218,7 +248,7 @@ agent = va.agent.AzureVisionAgent()
 
 #### How to get started with OpenAI API credits
 
-1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
+1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
 2. Follow the instructions to purchase and manage your API credits.
 3. Ensure your API key is correctly configured in your project settings.
 
vision_agent-0.2.92/vision_agent/agent/agent_utils.py

@@ -0,0 +1,43 @@
+import json
+import logging
+import sys
+from typing import Any, Dict
+
+logging.basicConfig(stream=sys.stdout)
+_LOGGER = logging.getLogger(__name__)
+
+
+def extract_json(json_str: str) -> Dict[str, Any]:
+    try:
+        json_dict = json.loads(json_str)
+    except json.JSONDecodeError:
+        input_json_str = json_str
+        if "```json" in json_str:
+            json_str = json_str[json_str.find("```json") + len("```json") :]
+            json_str = json_str[: json_str.find("```")]
+        elif "```" in json_str:
+            json_str = json_str[json_str.find("```") + len("```") :]
+            # get the last ``` not one from an intermediate string
+            json_str = json_str[: json_str.find("}```")]
+        try:
+            json_dict = json.loads(json_str)
+        except json.JSONDecodeError as e:
+            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
+            _LOGGER.exception(error_msg)
+            raise ValueError(error_msg) from e
+    return json_dict  # type: ignore
+
+
+def extract_code(code: str) -> str:
+    if "\n```python" in code:
+        start = "\n```python"
+    elif "```python" in code:
+        start = "```python"
+    else:
+        return code
+
+    code = code[code.find(start) + len(start) :]
+    code = code[: code.find("```")]
+    if code.startswith("python\n"):
+        code = code[len("python\n") :]
+    return code
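In short, `extract_json` retries `json.loads` after stripping a fenced code block from the model output, and `extract_code` pulls the body out of a python fence. A quick illustration, assuming the helpers exactly as added above:

```python
from vision_agent.agent.agent_utils import extract_json, extract_code

# Fenced JSON is unwrapped before json.loads is retried.
raw = 'Plan:\n```json\n{"thoughts": "count people", "let_user_respond": false}\n```'
print(extract_json(raw))
# {'thoughts': 'count people', 'let_user_respond': False}

# Fenced Python is reduced to just the code between the fences.
text = "Here is the code.\n```python\nprint('hello')\n```"
print(extract_code(text))
# prints the fenced body: print('hello')
```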
vision_agent-0.2.92/vision_agent/agent/vision_agent.py

@@ -0,0 +1,230 @@
+import copy
+import logging
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union, cast
+
+from vision_agent.agent import Agent
+from vision_agent.agent.agent_utils import extract_json
+from vision_agent.agent.vision_agent_prompts import (
+    EXAMPLES_CODE1,
+    EXAMPLES_CODE2,
+    VA_CODE,
+)
+from vision_agent.lmm import LMM, Message, OpenAILMM
+from vision_agent.tools import META_TOOL_DOCSTRING
+from vision_agent.utils import CodeInterpreterFactory
+from vision_agent.utils.execute import CodeInterpreter
+
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
+WORKSPACE = Path(os.getenv("WORKSPACE", ""))
+WORKSPACE.mkdir(parents=True, exist_ok=True)
+if str(WORKSPACE) != "":
+    os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
+
+
+class DefaultImports:
+    code = [
+        "from typing import *",
+        "from vision_agent.utils.execute import CodeInterpreter",
+        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
+    ]
+
+    @staticmethod
+    def to_code_string() -> str:
+        return "\n".join(DefaultImports.code)
+
+    @staticmethod
+    def prepend_imports(code: str) -> str:
+        """Run this method to prepend the default imports to the code.
+        NOTE: be sure to run this method after the custom tools have been registered.
+        """
+        return DefaultImports.to_code_string() + "\n\n" + code
+
+
+def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
+    chat = copy.deepcopy(chat)
+
+    conversation = ""
+    for chat_i in chat:
+        if chat_i["role"] == "user":
+            conversation += f"USER: {chat_i['content']}\n\n"
+        elif chat_i["role"] == "observation":
+            conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
+        elif chat_i["role"] == "assistant":
+            conversation += f"AGENT: {chat_i['content']}\n\n"
+        else:
+            raise ValueError(f"role {chat_i['role']} is not supported")
+
+    prompt = VA_CODE.format(
+        documentation=META_TOOL_DOCSTRING,
+        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
+        dir=WORKSPACE,
+        conversation=conversation,
+    )
+    return extract_json(orch([{"role": "user", "content": prompt}]))
+
+
+def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
+    # Note the code interpreter needs to keep running in the same environment because
+    # the SWE tools hold state like line numbers and currently open files.
+    result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
+
+    return_str = ""
+    if result.success:
+        for res in result.results:
+            if res.text is not None:
+                return_str += res.text.replace("\\n", "\n")
+        if result.logs.stdout:
+            return_str += "----- stdout -----\n"
+            for log in result.logs.stdout:
+                return_str += log.replace("\\n", "\n")
+    else:
+        # for log in result.logs.stderr:
+        # return_str += log.replace("\\n", "\n")
+        if result.error:
+            return_str += (
+                "\n" + result.error.value + "\n".join(result.error.traceback_raw)
+            )
+
+    return return_str
+
+
+def parse_execution(response: str) -> Optional[str]:
+    code = None
+    if "<execute_python>" in response:
+        code = response[response.find("<execute_python>") + len("<execute_python>") :]
+        code = code[: code.find("</execute_python>")]
+    return code
+
+
+class VisionAgent(Agent):
+    """Vision Agent is an agent that can chat with the user and call tools or other
+    agents to generate code for it. Vision Agent uses python code to execute actions
+    for the user. Vision Agent is inspired by OpenDevin
+    https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030
+
+    Example
+    -------
+        >>> from vision_agent.agent import VisionAgent
+        >>> agent = VisionAgent()
+        >>> resp = agent("Hello")
+        >>> resp.append({"role": "user", "content": "Can you write a function that counts dogs?", "media": ["dog.jpg"]})
+        >>> resp = agent(resp)
+    """
+
+    def __init__(
+        self,
+        agent: Optional[LMM] = None,
+        verbosity: int = 0,
+        code_sandbox_runtime: Optional[str] = None,
+    ) -> None:
+        self.agent = (
+            OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
+        )
+        self.max_iterations = 100
+        self.verbosity = verbosity
+        self.code_sandbox_runtime = code_sandbox_runtime
+        if self.verbosity >= 1:
+            _LOGGER.setLevel(logging.INFO)
+
+    def __call__(
+        self,
+        input: Union[str, List[Message]],
+        media: Optional[Union[str, Path]] = None,
+    ) -> str:
+        """Chat with VisionAgent and get the conversation response.
+
+        Parameters:
+            input (Union[str, List[Message]]): A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}, ...] or a
+                string of just the contents.
+            media (Optional[Union[str, Path]]): The media file to be used in the task.
+
+        Returns:
+            str: The conversation response.
+        """
+        if isinstance(input, str):
+            input = [{"role": "user", "content": input}]
+            if media is not None:
+                input[0]["media"] = [media]
+        results = self.chat_with_code(input)
+        return results  # type: ignore
+
+    def chat_with_code(
+        self,
+        chat: List[Message],
+    ) -> List[Message]:
+        """Chat with VisionAgent, it will use code to execute actions to accomplish
+        its tasks.
+
+        Parameters:
+            chat (List[Message]): A conversation
+                in the format of:
+                [{"role": "user", "content": "describe your task here..."}]
+                or if it contains media files, it should be in the format of:
+                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+
+        Returns:
+            List[Message]: The conversation response.
+        """
+
+        if not chat:
+            raise ValueError("chat cannot be empty")
+
+        with CodeInterpreterFactory.new_instance(
+            code_sandbox_runtime=self.code_sandbox_runtime
+        ) as code_interpreter:
+            orig_chat = copy.deepcopy(chat)
+            int_chat = copy.deepcopy(chat)
+            media_list = []
+            for chat_i in int_chat:
+                if "media" in chat_i:
+                    for media in chat_i["media"]:
+                        media = code_interpreter.upload_file(media)
+                        chat_i["content"] += f" Media name {media}"  # type: ignore
+                        media_list.append(media)
+
+            int_chat = cast(
+                List[Message],
+                [
+                    (
+                        {
+                            "role": c["role"],
+                            "content": c["content"],
+                            "media": c["media"],
+                        }
+                        if "media" in c
+                        else {"role": c["role"], "content": c["content"]}
+                    )
+                    for c in int_chat
+                ],
+            )
+
+            finished = False
+            iterations = 0
+            while not finished and iterations < self.max_iterations:
+                response = run_conversation(self.agent, int_chat)
+                if self.verbosity >= 1:
+                    _LOGGER.info(response)
+                int_chat.append({"role": "assistant", "content": str(response)})
+                orig_chat.append({"role": "assistant", "content": str(response)})
+
+                if response["let_user_respond"]:
+                    break
+
+                code_action = parse_execution(response["response"])
+
+                if code_action is not None:
+                    obs = run_code_action(code_action, code_interpreter)
+                    if self.verbosity >= 1:
+                        _LOGGER.info(obs)
+                    int_chat.append({"role": "observation", "content": obs})
+                    orig_chat.append({"role": "observation", "content": obs})
+
+                iterations += 1
+            return orig_chat
+
+    def log_progress(self, data: Dict[str, Any]) -> None:
+        pass
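The loop above relies on two conventions from the prompts: the orchestrator LMM replies with JSON containing `thoughts`, `response`, and `let_user_respond` (visible in the chat example earlier in this diff), and any action code is wrapped in `<execute_python>` tags inside `response`. A small sketch of that contract with hypothetical values (the `generate_vision_code` call and its arguments are illustrative only):

```python
from vision_agent.agent.vision_agent import parse_execution

# Hypothetical orchestrator reply following the schema the loop expects.
response = {
    "thoughts": "I should generate vision code to count the people.",
    "response": (
        "Let me write that code. "
        "<execute_python>generate_vision_code('count.py', 'count people', ['people.jpg'])</execute_python>"
    ),
    "let_user_respond": False,
}

# parse_execution extracts exactly the code between the tags; the loop then
# hands it to run_code_action, which executes it in the shared sandbox and
# appends the output to the chat as an "observation" turn.
code_action = parse_execution(response["response"])
print(code_action)
# generate_vision_code('count.py', 'count people', ['people.jpg'])
```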