vision-agent 0.2.91__tar.gz → 0.2.92__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {vision_agent-0.2.91 → vision_agent-0.2.92}/PKG-INFO +42 -12
  2. {vision_agent-0.2.91 → vision_agent-0.2.92}/README.md +41 -11
  3. {vision_agent-0.2.91 → vision_agent-0.2.92}/pyproject.toml +1 -1
  4. vision_agent-0.2.92/vision_agent/agent/__init__.py +3 -0
  5. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/agent/agent.py +1 -1
  6. vision_agent-0.2.92/vision_agent/agent/agent_utils.py +43 -0
  7. vision_agent-0.2.92/vision_agent/agent/vision_agent.py +230 -0
  8. vision_agent-0.2.91/vision_agent/agent/vision_agent.py → vision_agent-0.2.92/vision_agent/agent/vision_agent_coder.py +112 -153
  9. vision_agent-0.2.91/vision_agent/agent/vision_agent_prompts.py → vision_agent-0.2.92/vision_agent/agent/vision_agent_coder_prompts.py +3 -2
  10. vision_agent-0.2.92/vision_agent/agent/vision_agent_prompts.py +114 -0
  11. vision_agent-0.2.92/vision_agent/lmm/__init__.py +2 -0
  12. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/lmm/lmm.py +3 -5
  13. vision_agent-0.2.92/vision_agent/lmm/types.py +5 -0
  14. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/tools/__init__.py +1 -0
  15. vision_agent-0.2.92/vision_agent/tools/meta_tools.py +402 -0
  16. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/tools/tool_utils.py +47 -1
  17. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/tools/tools.py +7 -49
  18. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/execute.py +52 -76
  19. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/image_utils.py +1 -1
  20. vision_agent-0.2.91/vision_agent/agent/__init__.py +0 -2
  21. vision_agent-0.2.91/vision_agent/lmm/__init__.py +0 -1
  22. {vision_agent-0.2.91 → vision_agent-0.2.92}/LICENSE +0 -0
  23. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/__init__.py +0 -0
  24. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/fonts/__init__.py +0 -0
  25. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  26. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/__init__.py +0 -0
  28. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/exceptions.py +0 -0
  29. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/sim.py +0 -0
  30. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/type_defs.py +0 -0
  31. {vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.91 → vision_agent-0.2.92}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.91
+ Version: 0.2.92
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -57,7 +57,7 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 
  ## Web Application
 
- Try Vision Agent live on [va.landing.ai](https://va.landing.ai/)
+ Try Vision Agent live on [va.landing.ai](https://va.landing.ai/) (note this may not be running the most up-to-date version)
 
  ## Documentation
 
@@ -79,16 +79,44 @@ using Azure OpenAI please see the Azure setup section):
  export OPENAI_API_KEY="your-api-key"
  ```
 
- ### Important Note on API Usage
- Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
-
  ### Vision Agent
+ There are two agents that you can use. Vision Agent is a conversational agent that has
+ access to tools that allow it to write and navigate python code and file systems. It can
+ converse with the user in natural language. VisionAgentCoder is an agent that can write
+ code for vision tasks, such as counting people in an image. However, it cannot converse
+ and can only respond with code. VisionAgent can call VisionAgentCoder to write vision
+ code.
+
  #### Basic Usage
- You can interact with the agent as you would with any LLM or LMM model:
+ To run the streamlit app locally to chat with Vision Agent, you can run the following
+ command:
+
+ ```bash
+ pip install -r examples/chat/requirements.txt
+ export WORKSPACE=/path/to/your/workspace
+ export ZMQ_PORT=5555
+ streamlit run examples/chat/app.py
+ ```
+ You can find more details about the streamlit app [here](examples/chat/).
 
+ #### Basic Programmatic Usage
  ```python
  >>> from vision_agent.agent import VisionAgent
  >>> agent = VisionAgent()
+ >>> resp = agent("Hello")
+ >>> print(resp)
+ [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
+ >>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]})
+ >>> resp = agent(resp)
+ ```
+
+ ### Vision Agent Coder
+ #### Basic Usage
+ You can interact with the agent as you would with any LLM or LMM model:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgentCoder
+ >>> agent = VisionAgentCoder()
  >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
  ```
 
@@ -129,7 +157,7 @@ To better understand how the model came up with its answer, you can run it in debug
  mode by passing in the verbose argument:
 
  ```python
- >>> agent = VisionAgent(verbose=2)
+ >>> agent = VisionAgentCoder(verbose=2)
  ```
 
  #### Detailed Usage
@@ -219,9 +247,11 @@ def custom_tool(image_path: str) -> str:
      return np.zeros((10, 10))
  ```
 
- You need to ensure you call `@va.tools.register_tool` with any imports it might use and
- ensure the documentation is in the same format above with description, `Parameters:`,
- `Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/).
+ You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
+ variables will not be captured by `register_tool`, so you need to include them in the
+ function. Make sure the documentation is in the same format as above, with a description,
+ `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
+ [here](examples/custom_tools/), as this is what the agent uses to pick and use the tool.
 
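As a concrete illustration of the docstring format described in the added lines above, here is a minimal sketch of a registered custom tool. The tool name, its body, and the `imports=` keyword are illustrative assumptions (the text only says to call `@va.tools.register_tool` with any imports the tool uses); check the `register_tool` signature in your installed version.

```python
import vision_agent as va

# Sketch only: `bright_pixel_count` is a made-up tool, and `imports=` is an
# assumed argument based on the "with any imports it uses" note above.
@va.tools.register_tool(imports=["import numpy as np", "from PIL import Image"])
def bright_pixel_count(image_path: str) -> int:
    """Counts pixels brighter than mid-gray in an image.

    Parameters:
        image_path (str): Path to the input image.

    Returns:
        int: The number of pixels with intensity above 127.

    Example
    -------
    >>> bright_pixel_count("jar.jpg")
    5041
    """
    # Globals are not captured by register_tool, so import inside the function.
    import numpy as np
    from PIL import Image

    img = np.asarray(Image.open(image_path).convert("L"))
    return int((img > 127).sum())
```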
  ### Azure Setup
  If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
@@ -248,7 +278,7 @@ You can then run Vision Agent using the Azure OpenAI models:
 
  ```python
  import vision_agent as va
- agent = va.agent.AzureVisionAgent()
+ agent = va.agent.AzureVisionAgentCoder()
  ```
 
  ******************************************************************************************************************************
@@ -257,7 +287,7 @@ agent = va.agent.AzureVisionAgent()
 
  #### How to get started with OpenAI API credits
 
- 1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
+ 1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
  2. Follow the instructions to purchase and manage your API credits.
  3. Ensure your API key is correctly configured in your project settings.
 
{vision_agent-0.2.91 → vision_agent-0.2.92}/README.md
@@ -18,7 +18,7 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 
  ## Web Application
 
- Try Vision Agent live on [va.landing.ai](https://va.landing.ai/)
+ Try Vision Agent live on [va.landing.ai](https://va.landing.ai/) (note this may not be running the most up-to-date version)
 
  ## Documentation
 
@@ -40,16 +40,44 @@ using Azure OpenAI please see the Azure setup section):
  export OPENAI_API_KEY="your-api-key"
  ```
 
- ### Important Note on API Usage
- Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
-
  ### Vision Agent
+ There are two agents that you can use. Vision Agent is a conversational agent that has
+ access to tools that allow it to write and navigate python code and file systems. It can
+ converse with the user in natural language. VisionAgentCoder is an agent that can write
+ code for vision tasks, such as counting people in an image. However, it cannot converse
+ and can only respond with code. VisionAgent can call VisionAgentCoder to write vision
+ code.
+
  #### Basic Usage
- You can interact with the agent as you would with any LLM or LMM model:
+ To run the streamlit app locally to chat with Vision Agent, you can run the following
+ command:
+
+ ```bash
+ pip install -r examples/chat/requirements.txt
+ export WORKSPACE=/path/to/your/workspace
+ export ZMQ_PORT=5555
+ streamlit run examples/chat/app.py
+ ```
+ You can find more details about the streamlit app [here](examples/chat/).
 
+ #### Basic Programmatic Usage
  ```python
  >>> from vision_agent.agent import VisionAgent
  >>> agent = VisionAgent()
+ >>> resp = agent("Hello")
+ >>> print(resp)
+ [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
+ >>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]})
+ >>> resp = agent(resp)
+ ```
+
+ ### Vision Agent Coder
+ #### Basic Usage
+ You can interact with the agent as you would with any LLM or LMM model:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgentCoder
+ >>> agent = VisionAgentCoder()
  >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
  ```
 
@@ -90,7 +118,7 @@ To better understand how the model came up with its answer, you can run it in debug
  mode by passing in the verbose argument:
 
  ```python
- >>> agent = VisionAgent(verbose=2)
+ >>> agent = VisionAgentCoder(verbose=2)
  ```
 
  #### Detailed Usage
@@ -180,9 +208,11 @@ def custom_tool(image_path: str) -> str:
      return np.zeros((10, 10))
  ```
 
- You need to ensure you call `@va.tools.register_tool` with any imports it might use and
- ensure the documentation is in the same format above with description, `Parameters:`,
- `Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/).
+ You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global
+ variables will not be captured by `register_tool`, so you need to include them in the
+ function. Make sure the documentation is in the same format as above, with a description,
+ `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
+ [here](examples/custom_tools/), as this is what the agent uses to pick and use the tool.
 
  ### Azure Setup
  If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
@@ -209,7 +239,7 @@ You can then run Vision Agent using the Azure OpenAI models:
 
  ```python
  import vision_agent as va
- agent = va.agent.AzureVisionAgent()
+ agent = va.agent.AzureVisionAgentCoder()
  ```
 
  ******************************************************************************************************************************
@@ -218,7 +248,7 @@ agent = va.agent.AzureVisionAgent()
 
  #### How to get started with OpenAI API credits
 
- 1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
+ 1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
  2. Follow the instructions to purchase and manage your API credits.
  3. Ensure your API key is correctly configured in your project settings.
 
{vision_agent-0.2.91 → vision_agent-0.2.92}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "vision-agent"
- version = "0.2.91"
+ version = "0.2.92"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
vision_agent-0.2.92/vision_agent/agent/__init__.py
@@ -0,0 +1,3 @@
+ from .agent import Agent
+ from .vision_agent import VisionAgent
+ from .vision_agent_coder import AzureVisionAgentCoder, VisionAgentCoder
{vision_agent-0.2.91 → vision_agent-0.2.92}/vision_agent/agent/agent.py
@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
  from pathlib import Path
  from typing import Any, Dict, List, Optional, Union
 
- from vision_agent.lmm import Message
+ from vision_agent.lmm.types import Message
 
 
  class Agent(ABC):
vision_agent-0.2.92/vision_agent/agent/agent_utils.py
@@ -0,0 +1,43 @@
+ import json
+ import logging
+ import sys
+ from typing import Any, Dict
+
+ logging.basicConfig(stream=sys.stdout)
+ _LOGGER = logging.getLogger(__name__)
+
+
+ def extract_json(json_str: str) -> Dict[str, Any]:
+     try:
+         json_dict = json.loads(json_str)
+     except json.JSONDecodeError:
+         input_json_str = json_str
+         if "```json" in json_str:
+             json_str = json_str[json_str.find("```json") + len("```json") :]
+             json_str = json_str[: json_str.find("```")]
+         elif "```" in json_str:
+             json_str = json_str[json_str.find("```") + len("```") :]
+             # get the last ``` not one from an intermediate string
+             json_str = json_str[: json_str.find("}```") + 1]  # keep the closing "}"
+         try:
+             json_dict = json.loads(json_str)
+         except json.JSONDecodeError as e:
+             error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
+             _LOGGER.exception(error_msg)
+             raise ValueError(error_msg) from e
+     return json_dict  # type: ignore
+
+
+ def extract_code(code: str) -> str:
+     if "\n```python" in code:
+         start = "\n```python"
+     elif "```python" in code:
+         start = "```python"
+     else:
+         return code
+
+     code = code[code.find(start) + len(start) :]
+     code = code[: code.find("```")]
+     if code.startswith("python\n"):
+         code = code[len("python\n") :]
+     return code
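For reference, a short sketch of how these helpers behave on typical LMM output; the reply strings below are invented for illustration:

```python
from vision_agent.agent.agent_utils import extract_code, extract_json

# A model reply that wraps its JSON answer in a ```json fence.
reply = 'Sure:\n```json\n{"thoughts": "count people", "let_user_respond": false}\n```'
print(extract_json(reply))
# {'thoughts': 'count people', 'let_user_respond': False}

# A model reply that wraps code in a ```python fence.
reply = "Here it is:\n```python\nprint('hello')\n```"
print(extract_code(reply))  # prints the bare code: print('hello')
```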
vision_agent-0.2.92/vision_agent/agent/vision_agent.py
@@ -0,0 +1,230 @@
+ import copy
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union, cast
+
+ from vision_agent.agent import Agent
+ from vision_agent.agent.agent_utils import extract_json
+ from vision_agent.agent.vision_agent_prompts import (
+     EXAMPLES_CODE1,
+     EXAMPLES_CODE2,
+     VA_CODE,
+ )
+ from vision_agent.lmm import LMM, Message, OpenAILMM
+ from vision_agent.tools import META_TOOL_DOCSTRING
+ from vision_agent.utils import CodeInterpreterFactory
+ from vision_agent.utils.execute import CodeInterpreter
+
+ logging.basicConfig(level=logging.INFO)
+ _LOGGER = logging.getLogger(__name__)
+ WORKSPACE = Path(os.getenv("WORKSPACE", ""))
+ WORKSPACE.mkdir(parents=True, exist_ok=True)
+ if str(WORKSPACE) != "":
+     os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
+
+
+ class DefaultImports:
+     code = [
+         "from typing import *",
+         "from vision_agent.utils.execute import CodeInterpreter",
+         "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
+     ]
+
+     @staticmethod
+     def to_code_string() -> str:
+         return "\n".join(DefaultImports.code)
+
+     @staticmethod
+     def prepend_imports(code: str) -> str:
+         """Run this method to prepend the default imports to the code.
+         NOTE: be sure to run this method after the custom tools have been registered.
+         """
+         return DefaultImports.to_code_string() + "\n\n" + code
+
+
+ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
+     chat = copy.deepcopy(chat)
+
+     conversation = ""
+     for chat_i in chat:
+         if chat_i["role"] == "user":
+             conversation += f"USER: {chat_i['content']}\n\n"
+         elif chat_i["role"] == "observation":
+             conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
+         elif chat_i["role"] == "assistant":
+             conversation += f"AGENT: {chat_i['content']}\n\n"
+         else:
+             raise ValueError(f"role {chat_i['role']} is not supported")
+
+     prompt = VA_CODE.format(
+         documentation=META_TOOL_DOCSTRING,
+         examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
+         dir=WORKSPACE,
+         conversation=conversation,
+     )
+     return extract_json(orch([{"role": "user", "content": prompt}]))
+
+
+ def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
+     # Note the code interpreter needs to keep running in the same environment because
+     # the SWE tools hold state like line numbers and currently open files.
+     result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
+
+     return_str = ""
+     if result.success:
+         for res in result.results:
+             if res.text is not None:
+                 return_str += res.text.replace("\\n", "\n")
+         if result.logs.stdout:
+             return_str += "----- stdout -----\n"
+             for log in result.logs.stdout:
+                 return_str += log.replace("\\n", "\n")
+     else:
+         # for log in result.logs.stderr:
+         #     return_str += log.replace("\\n", "\n")
+         if result.error:
+             return_str += (
+                 "\n" + result.error.value + "\n".join(result.error.traceback_raw)
+             )
+
+     return return_str
+
+
+ def parse_execution(response: str) -> Optional[str]:
+     code = None
+     if "<execute_python>" in response:
+         code = response[response.find("<execute_python>") + len("<execute_python>") :]
+         code = code[: code.find("</execute_python>")]
+     return code
+
+
+ class VisionAgent(Agent):
+     """Vision Agent is an agent that can chat with the user and call tools or other
+     agents to generate code for it. Vision Agent uses python code to execute actions for
+     the user. Vision Agent is inspired by OpenDevin
+     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030
+
+     Example
+     -------
+     >>> from vision_agent.agent import VisionAgent
+     >>> agent = VisionAgent()
+     >>> resp = agent("Hello")
+     >>> resp.append({"role": "user", "content": "Can you write a function that counts dogs?", "media": ["dog.jpg"]})
+     >>> resp = agent(resp)
+     """
+
+     def __init__(
+         self,
+         agent: Optional[LMM] = None,
+         verbosity: int = 0,
+         code_sandbox_runtime: Optional[str] = None,
+     ) -> None:
+         self.agent = (
+             OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
+         )
+         self.max_iterations = 100
+         self.verbosity = verbosity
+         self.code_sandbox_runtime = code_sandbox_runtime
+         if self.verbosity >= 1:
+             _LOGGER.setLevel(logging.INFO)
+
+     def __call__(
+         self,
+         input: Union[str, List[Message]],
+         media: Optional[Union[str, Path]] = None,
+     ) -> str:
+         """Chat with VisionAgent and get the conversation response.
+
+         Parameters:
+             input (Union[str, List[Message]]): A conversation in the format of
+                 [{"role": "user", "content": "describe your task here..."}, ...] or a
+                 string of just the contents.
+             media (Optional[Union[str, Path]]): The media file to be used in the task.
+
+         Returns:
+             str: The conversation response.
+         """
+         if isinstance(input, str):
+             input = [{"role": "user", "content": input}]
+             if media is not None:
+                 input[0]["media"] = [media]
+         results = self.chat_with_code(input)
+         return results  # type: ignore
+
+     def chat_with_code(
+         self,
+         chat: List[Message],
+     ) -> List[Message]:
+         """Chat with VisionAgent; it will use code to execute actions to accomplish
+         its tasks.
+
+         Parameters:
+             chat (List[Message]): A conversation
+                 in the format of:
+                 [{"role": "user", "content": "describe your task here..."}]
+                 or if it contains media files, it should be in the format of:
+                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+
+         Returns:
+             List[Message]: The conversation response.
+         """
+
+         if not chat:
+             raise ValueError("chat cannot be empty")
+
+         with CodeInterpreterFactory.new_instance(
+             code_sandbox_runtime=self.code_sandbox_runtime
+         ) as code_interpreter:
+             orig_chat = copy.deepcopy(chat)
+             int_chat = copy.deepcopy(chat)
+             media_list = []
+             for chat_i in int_chat:
+                 if "media" in chat_i:
+                     for media in chat_i["media"]:
+                         media = code_interpreter.upload_file(media)
+                         chat_i["content"] += f" Media name {media}"  # type: ignore
+                         media_list.append(media)
+
+             int_chat = cast(
+                 List[Message],
+                 [
+                     (
+                         {
+                             "role": c["role"],
+                             "content": c["content"],
+                             "media": c["media"],
+                         }
+                         if "media" in c
+                         else {"role": c["role"], "content": c["content"]}
+                     )
+                     for c in int_chat
+                 ],
+             )
+
+             finished = False
+             iterations = 0
+             while not finished and iterations < self.max_iterations:
+                 response = run_conversation(self.agent, int_chat)
+                 if self.verbosity >= 1:
+                     _LOGGER.info(response)
+                 int_chat.append({"role": "assistant", "content": str(response)})
+                 orig_chat.append({"role": "assistant", "content": str(response)})
+
+                 if response["let_user_respond"]:
+                     break
+
+                 code_action = parse_execution(response["response"])
+
+                 if code_action is not None:
+                     obs = run_code_action(code_action, code_interpreter)
+                     if self.verbosity >= 1:
+                         _LOGGER.info(obs)
+                     int_chat.append({"role": "observation", "content": obs})
+                     orig_chat.append({"role": "observation", "content": obs})
+
+                 iterations += 1
+         return orig_chat
+
+     def log_progress(self, data: Dict[str, Any]) -> None:
+         pass
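To make the orchestration loop concrete, here is a small sketch of the `<execute_python>` convention that `parse_execution` extracts. The response string is invented, shaped like the assistant output shown in the README hunks above; in a real turn the code between the tags would call the meta tools from `DefaultImports` (e.g. `generate_vision_code`) rather than a toy `print`.

```python
from vision_agent.agent.vision_agent import parse_execution

# The orchestrator LMM wraps any code it wants executed in <execute_python>
# tags; the agent loop extracts it, runs it via run_code_action, and appends
# the output back to the chat as an "observation" message.
response = (
    "{'thoughts': 'I will run a quick check', "
    "'response': '<execute_python>print(1 + 1)</execute_python>', "
    "'let_user_respond': False}"
)
print(parse_execution(response))  # -> print(1 + 1)

# No tags means there is nothing to execute this turn.
assert parse_execution("Just a plain chat reply.") is None
```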