vision-agent 0.2.30__py3-none-any.whl → 0.2.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
+ Metadata-Version: 2.1
+ Name: vision-agent
+ Version: 0.2.31
+ Summary: Toolset for Vision Agent
+ Author: Landing AI
+ Author-email: dev@landing.ai
+ Requires-Python: >=3.9,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+ Requires-Dist: langsmith (>=0.1.58,<0.2.0)
+ Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
+ Requires-Dist: numpy (>=1.21.0,<2.0.0)
+ Requires-Dist: openai (>=1.0.0,<2.0.0)
+ Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
+ Requires-Dist: pillow (>=10.0.0,<11.0.0)
+ Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+ Requires-Dist: requests (>=2.0.0,<3.0.0)
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
+ Requires-Dist: scipy (>=1.13.0,<1.14.0)
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+ Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+ Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
+ Project-URL: Homepage, https://landing.ai
+ Project-URL: documentation, https://github.com/landing-ai/vision-agent
+ Project-URL: repository, https://github.com/landing-ai/vision-agent
+ Description-Content-Type: text/markdown
+
+ <div align="center">
+     <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+
+ # 🔍🤖 Vision Agent
+
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
+ </div>
+
+ Vision Agent is a library that helps you use agent frameworks to generate code to
+ solve your vision task. Many current vision problems can easily take hours or days to
+ solve: you need to find the right model, figure out how to use it, and program it to
+ accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
+ allowing users to describe their problem in text and have the agent framework generate
+ code to solve the task for them. Check out our Discord for updates and roadmaps!
+
+ ## Documentation
+
+ - [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+
+ ## Getting Started
+ ### Installation
+ To get started, you can install the library using pip:
+
+ ```bash
+ pip install vision-agent
+ ```
+
+ Ensure you have an OpenAI API key and set it as an environment variable (if you are
+ using Azure OpenAI, please see the Azure Setup section):
+
+ ```bash
+ export OPENAI_API_KEY="your-api-key"
+ ```
+
+ ### Vision Agent
+ You can interact with the agent as you would with any LLM or LMM:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgent
+ >>> agent = VisionAgent()
+ >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+ ```
+
+ Which produces the following code:
+ ```python
+ from vision_agent.tools import load_image, grounding_sam
+
+ def calculate_filled_percentage(image_path: str) -> float:
+     # Step 1: Load the image
+     image = load_image(image_path)
+
+     # Step 2: Segment the jar
+     jar_segments = grounding_sam(prompt="jar", image=image)
+
+     # Step 3: Segment the coffee beans
+     coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
+
+     # Step 4: Calculate the area of the segmented jar
+     jar_area = 0
+     for segment in jar_segments:
+         jar_area += segment['mask'].sum()
+
+     # Step 5: Calculate the area of the segmented coffee beans
+     coffee_beans_area = 0
+     for segment in coffee_beans_segments:
+         coffee_beans_area += segment['mask'].sum()
+
+     # Step 6: Compute the percentage of the jar area that is filled with coffee beans
+     if jar_area == 0:
+         return 0.0  # To avoid division by zero
+     filled_percentage = (coffee_beans_area / jar_area) * 100
+
+     # Step 7: Return the computed percentage
+     return filled_percentage
+ ```
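+
+ The call above returns the generated code, so you can save it out and run it yourself.
+ A minimal sketch, assuming the agent returns the code as a plain string (the file name
+ here is just for illustration):
+ ```python
+ # Persist the generated code to a module and call the function it defines.
+ with open("generated_code.py", "w") as f:
+     f.write(code)
+
+ from generated_code import calculate_filled_percentage
+ print(calculate_filled_percentage("jar.jpg"))
+ ```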
+
+ To better understand how the model came up with its answer, you can run it in debug
+ mode by passing in the `verbose` argument:
+
+ ```python
+ >>> agent = VisionAgent(verbose=2)
+ ```
+
+ You can also have it return more information by calling `chat_with_workflow`:
+
+ ```python
+ >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+ >>> print(results)
+ {
+     "code": "from vision_agent.tools import ...",
+     "test": "calculate_filled_percentage('jar.jpg')",
+     "test_result": "...",
+     "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+     "working_memory": ...,
+ }
+ ```
+
+ With this you can examine more detailed information such as the testing code, test
+ results, plan, or working memory it used to complete the task.
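+
+ For example, a quick sketch of pulling those fields out of the returned dictionary
+ (using only the keys shown above):
+ ```python
+ print(results["code"])        # the final generated code
+ print(results["test"])        # the test code the agent wrote
+ for step in results["plan"]:  # one entry per planning iteration
+     print(step["plan"])
+ ```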
+
+ ### Tools
+ There are a variety of tools for the model or the user to use. Some are executed locally
+ while others are hosted for you. You can also ask an LLM directly to build a tool for
+ you. For example:
+
+ ```python
+ >>> import vision_agent as va
+ >>> llm = va.llm.OpenAILLM()
+ >>> detector = llm.generate_detector("Can you build a jar detector for me?")
+ >>> detector("jar.jpg")
+ [{"labels": ["jar"],
+   "scores": [0.99],
+   "bboxes": [
+       [0.58, 0.2, 0.72, 0.45],
+   ]
+ }]
+ ```
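+
+ The bounding boxes above appear to be normalized to the image size; a small sketch of
+ scaling one to pixel coordinates (the `xmin, ymin, xmax, ymax` ordering is an
+ assumption to verify against the tool's documentation):
+ ```python
+ from PIL import Image
+
+ # Read the image dimensions and scale the normalized box to pixels.
+ width, height = Image.open("jar.jpg").size
+ xmin, ymin, xmax, ymax = [0.58, 0.2, 0.72, 0.45]
+ pixel_box = (xmin * width, ymin * height, xmax * width, ymax * height)
+ ```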
+
+ ### Azure Setup
+ If you want to use Azure OpenAI models, you can set the following environment variables:
+
+ ```bash
+ export AZURE_OPENAI_API_KEY="your-api-key"
+ export AZURE_OPENAI_ENDPOINT="your-endpoint"
+ ```
+
+ You can then run Vision Agent using the Azure OpenAI models:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.VisionAgent(
+ >>>     task_model=va.llm.AzureOpenAILLM(),
+ >>>     answer_model=va.lmm.AzureOpenAILMM(),
+ >>>     reflection_model=va.lmm.AzureOpenAILMM(),
+ >>> )
+ ```
+
+
@@ -0,0 +1,36 @@
+ vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
+ vision_agent/agent/__init__.py,sha256=iiC5eknTQnv87iSwAoHqBthJ3g2Zm6D0dWbYPDfuQ7A,245
+ vision_agent/agent/agent.py,sha256=TXh93MOwmArNRieOkYrhliq1rf7wIkhxvCdTiGhTqFs,538
+ vision_agent/agent/agent_coder.py,sha256=MQw8SPeNy1D9tUvB-u60H9ab1eLXnrpV0Ggn7Eq_mIo,6988
+ vision_agent/agent/agent_coder_prompts.py,sha256=CJe3v7xvHQ32u3RQAXQga_Tk_4UgU64RBAMHZ3S70KY,5538
+ vision_agent/agent/data_interpreter.py,sha256=YCREEHWiyTYpKT8hibotylEkx1kF5AH0k9wnmymwPBY,15143
+ vision_agent/agent/data_interpreter_prompts.py,sha256=RDJggOfXwGaEoIcTYGX41ZEayCgYei1AootDOc_SN2g,6134
+ vision_agent/agent/easytool.py,sha256=wMa9-tpAaiC4E2ONbidxmMM9YvAOw4_Sypf5mGKco_w,11526
+ vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
+ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFIypCA,27274
+ vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
+ vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
+ vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
+ vision_agent/agent/vision_agent.py,sha256=5Bfxif2sqRKS1ZUlQ4yT468EfevI9CQ6V7_Y6xRbbq0,14992
+ vision_agent/agent/vision_agent_prompts.py,sha256=s6T5UnyrKIAcaKqcMudWQOBCHt6Obn9QpX3QtqiDv2I,8034
+ vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
+ vision_agent/llm/llm.py,sha256=UZ73GqQHE-NKOJWsrOTWfmdHYsbCBkJ5rZ7dhcSCHHw,5951
+ vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
+ vision_agent/lmm/lmm.py,sha256=NwcZYLTzi95LSMAk0sTtw7G_zBLa9lU-DHM5GUUCiK4,10622
+ vision_agent/tools/__init__.py,sha256=1kyJy4euA8t73_ALhKZIUOjVb2A1IyYztu-MJJJ0TYI,505
+ vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
+ vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+ vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
+ vision_agent/tools/tools.py,sha256=mio0A1l5QcyRC5IgaD4Trfqg7hFTZ8rOjx1dYivwb4Q,21585
+ vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
+ vision_agent/utils/execute.py,sha256=8_SfK-IkHH4lXF0JVyV7sDFszZn9HKsh1bFITKGCJ1g,3881
+ vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
+ vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
+ vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
+ vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
+ vision_agent-0.2.31.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.31.dist-info/METADATA,sha256=tsCUD6WuSXUt5XLCmOD89DMzDTAxyrCPiA0cAES85AI,5942
+ vision_agent-0.2.31.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.31.dist-info/RECORD,,
@@ -1,394 +0,0 @@
- import copy
- import json
- import logging
- import sys
- from pathlib import Path
- from typing import Any, Callable, Dict, List, Optional, Union, cast, no_type_check
-
- from rich.console import Console
- from rich.syntax import Syntax
- from tabulate import tabulate
-
- from vision_agent.agent import Agent
- from vision_agent.agent.vision_agent_v3_prompts import (
-     CODE,
-     FEEDBACK,
-     FIX_BUG,
-     FULL_TASK,
-     PLAN,
-     REFLECT,
-     SIMPLE_TEST,
-     USER_REQ,
- )
- from vision_agent.llm import LLM, OpenAILLM
- from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF, UTILITIES_DOCSTRING
- from vision_agent.utils import Execute
- from vision_agent.utils.sim import Sim
-
- logging.basicConfig(stream=sys.stdout)
- _LOGGER = logging.getLogger(__name__)
- _MAX_TABULATE_COL_WIDTH = 80
- _EXECUTE = Execute(600)
- _CONSOLE = Console()
-
-
- def format_memory(memory: List[Dict[str, str]]) -> str:
-     return FEEDBACK.format(
-         feedback="\n".join(
-             [
-                 f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n"
-                 for i, m in enumerate(memory)
-             ]
-         )
-     )
-
-
- def extract_code(code: str) -> str:
-     if "\n```python" in code:
-         start = "\n```python"
-     elif "```python" in code:
-         start = "```python"
-     else:
-         return code
-
-     code = code[code.find(start) + len(start) :]
-     code = code[: code.find("```")]
-     if code.startswith("python\n"):
-         code = code[len("python\n") :]
-     return code
-
-
- def extract_json(json_str: str) -> Dict[str, Any]:
-     try:
-         json_dict = json.loads(json_str)
-     except json.JSONDecodeError:
-         if "```json" in json_str:
-             json_str = json_str[json_str.find("```json") + len("```json") :]
-             json_str = json_str[: json_str.find("```")]
-         elif "```" in json_str:
-             json_str = json_str[json_str.find("```") + len("```") :]
-             # get the last ``` not one from an intermediate string
-             json_str = json_str[: json_str.find("}```")]
-         json_dict = json.loads(json_str)
-     return json_dict  # type: ignore
-
-
- def write_plan(
-     chat: List[Dict[str, str]],
-     tool_desc: str,
-     working_memory: str,
-     model: LLM,
- ) -> List[Dict[str, str]]:
-     chat = copy.deepcopy(chat)
-     if chat[-1]["role"] != "user":
-         raise ValueError("Last chat message must be from the user.")
-
-     user_request = chat[-1]["content"]
-     context = USER_REQ.format(user_request=user_request)
-     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
-     chat[-1]["content"] = prompt
-     return extract_json(model.chat(chat))["plan"]  # type: ignore
-
-
- def reflect(
-     chat: List[Dict[str, str]],
-     plan: str,
-     code: str,
-     model: LLM,
- ) -> Dict[str, Union[str, bool]]:
-     chat = copy.deepcopy(chat)
-     if chat[-1]["role"] != "user":
-         raise ValueError("Last chat message must be from the user.")
-
-     user_request = chat[-1]["content"]
-     context = USER_REQ.format(user_request=user_request)
-     prompt = REFLECT.format(context=context, plan=plan, code=code)
-     chat[-1]["content"] = prompt
-     return extract_json(model.chat(chat))
-
-
- def write_and_test_code(
-     task: str,
-     tool_info: str,
-     tool_utils: str,
-     working_memory: str,
-     coder: LLM,
-     tester: LLM,
-     debugger: LLM,
-     log_progress: Callable[[Dict[str, Any]], None],
-     verbosity: int = 0,
-     max_retries: int = 3,
-     input_media: Optional[Union[str, Path]] = None,
- ) -> Dict[str, Any]:
-     code = extract_code(
-         coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory))
-     )
-     test = extract_code(
-         tester(
-             SIMPLE_TEST.format(
-                 docstring=tool_utils,
-                 question=task,
-                 code=code,
-                 feedback=working_memory,
-                 media=input_media,
-             )
-         )
-     )
-
-     success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
-     if verbosity == 2:
-         _LOGGER.info("Initial code and tests:")
-         log_progress(
-             {
-                 "log": "Code:",
-                 "code": code,
-             }
-         )
-         log_progress(
-             {
-                 "log": "Test:",
-                 "code": test,
-             }
-         )
-         _CONSOLE.print(
-             Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
-         )
-         log_progress(
-             {
-                 "log": "Result:",
-                 "result": result,
-             }
-         )
-         _LOGGER.info(f"Initial result: {result}")
-
-     count = 0
-     new_working_memory = []
-     while not success and count < max_retries:
-         fixed_code_and_test = extract_json(
-             debugger(
-                 FIX_BUG.format(
-                     code=code, tests=test, result=result, feedback=working_memory
-                 )
-             )
-         )
-         if fixed_code_and_test["code"].strip() != "":
-             code = extract_code(fixed_code_and_test["code"])
-         if fixed_code_and_test["test"].strip() != "":
-             test = extract_code(fixed_code_and_test["test"])
-         new_working_memory.append(
-             {"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]}
-         )
-
-         success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
-         if verbosity == 2:
-             log_progress(
-                 {
-                     "log": f"Debug attempt {count + 1}, reflection:",
-                     "result": fixed_code_and_test["reflections"],
-                 }
-             )
-             _LOGGER.info(
-                 f"Debug attempt {count + 1}, reflection: {fixed_code_and_test['reflections']}"
-             )
-             _CONSOLE.print(
-                 Syntax(
-                     f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True
-                 )
-             )
-             log_progress(
-                 {
-                     "log": "Debug result:",
-                     "result": result,
-                 }
-             )
-             _LOGGER.info(f"Debug result: {result}")
-         count += 1
-
-     if verbosity >= 1:
-         _LOGGER.info("Final code and tests:")
-         _CONSOLE.print(
-             Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
-         )
-         _LOGGER.info(f"Final Result: {result}")
-
-     return {
-         "code": code,
-         "test": test,
-         "success": success,
-         "test_result": result,
-         "working_memory": new_working_memory,
-     }
-
-
- def retrieve_tools(
-     plan: List[Dict[str, str]],
-     tool_recommender: Sim,
-     log_progress: Callable[[Dict[str, Any]], None],
-     verbosity: int = 0,
- ) -> str:
-     tool_info = []
-     tool_desc = []
-     for task in plan:
-         tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
-         tool_info.extend([e["doc"] for e in tools])
-         tool_desc.extend([e["desc"] for e in tools])
-     if verbosity == 2:
-         log_progress(
-             {
-                 "log": "Retrieved tools:",
-                 "tools": tool_desc,
-             }
-         )
-         _LOGGER.info(f"Tools: {tool_desc}")
-     tool_info_set = set(tool_info)
-     return "\n\n".join(tool_info_set)
-
-
- class VisionAgentV3(Agent):
-     def __init__(
-         self,
-         timeout: int = 600,
-         planner: Optional[LLM] = None,
-         coder: Optional[LLM] = None,
-         tester: Optional[LLM] = None,
-         debugger: Optional[LLM] = None,
-         tool_recommender: Optional[Sim] = None,
-         verbosity: int = 0,
-         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-     ) -> None:
-         self.planner = (
-             OpenAILLM(temperature=0.0, json_mode=True) if planner is None else planner
-         )
-         self.coder = OpenAILLM(temperature=0.0) if coder is None else coder
-         self.tester = OpenAILLM(temperature=0.0) if tester is None else tester
-         self.debugger = (
-             OpenAILLM(temperature=0.0, json_mode=True) if debugger is None else debugger
-         )
-
-         self.tool_recommender = (
-             Sim(TOOLS_DF, sim_key="desc")
-             if tool_recommender is None
-             else tool_recommender
-         )
-         self.verbosity = verbosity
-         self.max_retries = 2
-         self.report_progress_callback = report_progress_callback
-
-     @no_type_check
-     def __call__(
-         self,
-         input: Union[List[Dict[str, str]], str],
-         image: Optional[Union[str, Path]] = None,
-     ) -> Dict[str, Any]:
-         if isinstance(input, str):
-             input = [{"role": "user", "content": input}]
-         results = self.chat_with_workflow(input, image)
-         results.pop("working_memory")
-         return results
-
-     def chat_with_workflow(
-         self,
-         chat: List[Dict[str, str]],
-         image: Optional[Union[str, Path]] = None,
-         self_reflection: bool = False,
-     ) -> Dict[str, Any]:
-         if len(chat) == 0:
-             raise ValueError("Chat cannot be empty.")
-
-         if image is not None:
-             for chat_i in chat:
-                 if chat_i["role"] == "user":
-                     chat_i["content"] += f" Image name {image}"
-
-         code = ""
-         test = ""
-         working_memory: List[Dict[str, str]] = []
-         results = {"code": "", "test": "", "plan": []}
-         plan = []
-         success = False
-         retries = 0
-
-         while not success and retries < self.max_retries:
-             plan_i = write_plan(
-                 chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner
-             )
-             plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
-             if self.verbosity >= 1:
-                 self.log_progress(
-                     {
-                         "log": "Going to run the following plan(s) in sequence:\n",
-                         "plan": plan_i,
-                     }
-                 )
-
-                 _LOGGER.info(
-                     f"""
- {tabulate(tabular_data=plan_i, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
-                 )
-
-             tool_info = retrieve_tools(
-                 plan_i,
-                 self.tool_recommender,
-                 self.log_progress,
-                 self.verbosity,
-             )
-             results = write_and_test_code(
-                 FULL_TASK.format(user_request=chat[0]["content"], subtasks=plan_i_str),
-                 tool_info,
-                 UTILITIES_DOCSTRING,
-                 format_memory(working_memory),
-                 self.coder,
-                 self.tester,
-                 self.debugger,
-                 self.log_progress,
-                 verbosity=self.verbosity,
-                 input_media=image,
-             )
-             success = cast(bool, results["success"])
-             code = cast(str, results["code"])
-             test = cast(str, results["test"])
-             working_memory.extend(results["working_memory"])  # type: ignore
-             plan.append({"code": code, "test": test, "plan": plan_i})
-
-             if self_reflection:
-                 reflection = reflect(
-                     chat,
-                     FULL_TASK.format(
-                         user_request=chat[0]["content"], subtasks=plan_i_str
-                     ),
-                     code,
-                     self.planner,
-                 )
-                 if self.verbosity > 0:
-                     self.log_progress(
-                         {
-                             "log": "Reflection:",
-                             "reflection": reflection,
-                         }
-                     )
-                     _LOGGER.info(f"Reflection: {reflection}")
-                 feedback = cast(str, reflection["feedback"])
-                 success = cast(bool, reflection["success"])
-                 working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
-
-             retries += 1
-
-         self.log_progress(
-             {
-                 "log": f"The Vision Agent V3 has concluded this chat.\nSuccess: {success}",
-                 "finished": True,
-             }
-         )
-
-         return {
-             "code": code,
-             "test": test,
-             "test_result": results["test_result"],
-             "plan": plan,
-             "working_memory": working_memory,
-         }
-
-     def log_progress(self, data: Dict[str, Any]) -> None:
-         if self.report_progress_callback is not None:
-             self.report_progress_callback(data)
-         pass