vision-agent 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +76 -2
- vision_agent/agent/vision_agent.py +49 -17
- vision_agent/agent/vision_agent_coder.py +163 -489
- vision_agent/agent/vision_agent_coder_prompts.py +0 -203
- vision_agent/agent/vision_agent_planner.py +553 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +84 -3
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/METADATA +7 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/RECORD +13 -11
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/WHEEL +0 -0
@@ -2,32 +2,33 @@ import copy
|
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import sys
|
5
|
-
from json import JSONDecodeError
|
6
5
|
from pathlib import Path
|
7
|
-
from typing import Any, Callable, Dict, List, Optional, Sequence,
|
6
|
+
from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
|
8
7
|
|
9
|
-
from rich.console import Console
|
10
|
-
from rich.style import Style
|
11
|
-
from rich.syntax import Syntax
|
12
8
|
from tabulate import tabulate
|
13
9
|
|
14
10
|
import vision_agent.tools as T
|
15
|
-
from vision_agent.agent import Agent
|
11
|
+
from vision_agent.agent.agent import Agent
|
16
12
|
from vision_agent.agent.agent_utils import (
|
13
|
+
DefaultImports,
|
17
14
|
extract_code,
|
18
15
|
extract_json,
|
16
|
+
format_memory,
|
17
|
+
print_code,
|
19
18
|
remove_installs_from_code,
|
20
19
|
)
|
21
20
|
from vision_agent.agent.vision_agent_coder_prompts import (
|
22
21
|
CODE,
|
23
22
|
FIX_BUG,
|
24
23
|
FULL_TASK,
|
25
|
-
PICK_PLAN,
|
26
|
-
PLAN,
|
27
|
-
PREVIOUS_FAILED,
|
28
24
|
SIMPLE_TEST,
|
29
|
-
|
30
|
-
|
25
|
+
)
|
26
|
+
from vision_agent.agent.vision_agent_planner import (
|
27
|
+
AnthropicVisionAgentPlanner,
|
28
|
+
AzureVisionAgentPlanner,
|
29
|
+
OllamaVisionAgentPlanner,
|
30
|
+
OpenAIVisionAgentPlanner,
|
31
|
+
PlanContext,
|
31
32
|
)
|
32
33
|
from vision_agent.lmm import (
|
33
34
|
LMM,
|
@@ -40,241 +41,11 @@ from vision_agent.lmm import (
|
|
40
41
|
from vision_agent.tools.meta_tools import get_diff
|
41
42
|
from vision_agent.utils import CodeInterpreterFactory, Execution
|
42
43
|
from vision_agent.utils.execute import CodeInterpreter
|
43
|
-
from vision_agent.utils.image_utils import b64_to_pil
|
44
|
-
from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
|
45
|
-
from vision_agent.utils.video import play_video
|
46
44
|
|
47
45
|
logging.basicConfig(stream=sys.stdout)
|
48
46
|
WORKSPACE = Path(os.getenv("WORKSPACE", ""))
|
49
47
|
_LOGGER = logging.getLogger(__name__)
|
50
48
|
_MAX_TABULATE_COL_WIDTH = 80
|
51
|
-
_CONSOLE = Console()
|
52
|
-
|
53
|
-
|
54
|
-
class DefaultImports:
|
55
|
-
"""Container for default imports used in the code execution."""
|
56
|
-
|
57
|
-
common_imports = [
|
58
|
-
"import os",
|
59
|
-
"import numpy as np",
|
60
|
-
"from vision_agent.tools import *",
|
61
|
-
"from typing import *",
|
62
|
-
"from pillow_heif import register_heif_opener",
|
63
|
-
"register_heif_opener()",
|
64
|
-
]
|
65
|
-
|
66
|
-
@staticmethod
|
67
|
-
def to_code_string() -> str:
|
68
|
-
return "\n".join(DefaultImports.common_imports + T.__new_tools__)
|
69
|
-
|
70
|
-
@staticmethod
|
71
|
-
def prepend_imports(code: str) -> str:
|
72
|
-
"""Run this method to prepend the default imports to the code.
|
73
|
-
NOTE: be sure to run this method after the custom tools have been registered.
|
74
|
-
"""
|
75
|
-
return DefaultImports.to_code_string() + "\n\n" + code
|
76
|
-
|
77
|
-
|
78
|
-
def format_memory(memory: List[Dict[str, str]]) -> str:
|
79
|
-
output_str = ""
|
80
|
-
for i, m in enumerate(memory):
|
81
|
-
output_str += f"### Feedback {i}:\n"
|
82
|
-
output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
|
83
|
-
output_str += f"Feedback {i}: {m['feedback']}\n\n"
|
84
|
-
if "edits" in m:
|
85
|
-
output_str += f"Edits {i}:\n{m['edits']}\n"
|
86
|
-
output_str += "\n"
|
87
|
-
|
88
|
-
return output_str
|
89
|
-
|
90
|
-
|
91
|
-
def format_plans(plans: Dict[str, Any]) -> str:
|
92
|
-
plan_str = ""
|
93
|
-
for k, v in plans.items():
|
94
|
-
plan_str += "\n" + f"{k}: {v['thoughts']}\n"
|
95
|
-
plan_str += " -" + "\n -".join([e for e in v["instructions"]])
|
96
|
-
|
97
|
-
return plan_str
|
98
|
-
|
99
|
-
|
100
|
-
def write_plans(
|
101
|
-
chat: List[Message],
|
102
|
-
tool_desc: str,
|
103
|
-
working_memory: str,
|
104
|
-
model: LMM,
|
105
|
-
) -> Dict[str, Any]:
|
106
|
-
chat = copy.deepcopy(chat)
|
107
|
-
if chat[-1]["role"] != "user":
|
108
|
-
raise ValueError("Last chat message must be from the user.")
|
109
|
-
|
110
|
-
user_request = chat[-1]["content"]
|
111
|
-
context = USER_REQ.format(user_request=user_request)
|
112
|
-
prompt = PLAN.format(
|
113
|
-
context=context,
|
114
|
-
tool_desc=tool_desc,
|
115
|
-
feedback=working_memory,
|
116
|
-
)
|
117
|
-
chat[-1]["content"] = prompt
|
118
|
-
return extract_json(model(chat, stream=False)) # type: ignore
|
119
|
-
|
120
|
-
|
121
|
-
def pick_plan(
|
122
|
-
chat: List[Message],
|
123
|
-
plans: Dict[str, Any],
|
124
|
-
tool_info: str,
|
125
|
-
model: LMM,
|
126
|
-
code_interpreter: CodeInterpreter,
|
127
|
-
media: List[str],
|
128
|
-
log_progress: Callable[[Dict[str, Any]], None],
|
129
|
-
verbosity: int = 0,
|
130
|
-
max_retries: int = 3,
|
131
|
-
) -> Tuple[Dict[str, str], str]:
|
132
|
-
log_progress(
|
133
|
-
{
|
134
|
-
"type": "log",
|
135
|
-
"log_content": "Generating code to pick the best plan",
|
136
|
-
"status": "started",
|
137
|
-
}
|
138
|
-
)
|
139
|
-
|
140
|
-
chat = copy.deepcopy(chat)
|
141
|
-
if chat[-1]["role"] != "user":
|
142
|
-
raise ValueError("Last chat message must be from the user.")
|
143
|
-
|
144
|
-
plan_str = format_plans(plans)
|
145
|
-
prompt = TEST_PLANS.format(
|
146
|
-
docstring=tool_info, plans=plan_str, previous_attempts="", media=media
|
147
|
-
)
|
148
|
-
|
149
|
-
code = extract_code(model(prompt, stream=False)) # type: ignore
|
150
|
-
log_progress(
|
151
|
-
{
|
152
|
-
"type": "log",
|
153
|
-
"log_content": "Executing code to test plans",
|
154
|
-
"code": DefaultImports.prepend_imports(code),
|
155
|
-
"status": "running",
|
156
|
-
}
|
157
|
-
)
|
158
|
-
tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
|
159
|
-
# Because of the way we trace function calls the trace information ends up in the
|
160
|
-
# results. We don't want to show this info to the LLM so we don't include it in the
|
161
|
-
# tool_output_str.
|
162
|
-
tool_output_str = tool_output.text(include_results=False).strip()
|
163
|
-
|
164
|
-
if verbosity == 2:
|
165
|
-
_print_code("Initial code and tests:", code)
|
166
|
-
_LOGGER.info(f"Initial code execution result:\n{tool_output_str}")
|
167
|
-
|
168
|
-
log_progress(
|
169
|
-
{
|
170
|
-
"type": "log",
|
171
|
-
"log_content": (
|
172
|
-
"Code execution succeeded"
|
173
|
-
if tool_output.success
|
174
|
-
else "Code execution failed"
|
175
|
-
),
|
176
|
-
"code": DefaultImports.prepend_imports(code),
|
177
|
-
# "payload": tool_output.to_json(),
|
178
|
-
"status": "completed" if tool_output.success else "failed",
|
179
|
-
}
|
180
|
-
)
|
181
|
-
|
182
|
-
# retry if the tool output is empty or code fails
|
183
|
-
count = 0
|
184
|
-
while (
|
185
|
-
not tool_output.success
|
186
|
-
or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
|
187
|
-
) and count < max_retries:
|
188
|
-
prompt = TEST_PLANS.format(
|
189
|
-
docstring=tool_info,
|
190
|
-
plans=plan_str,
|
191
|
-
previous_attempts=PREVIOUS_FAILED.format(
|
192
|
-
code=code, error="\n".join(tool_output_str.splitlines()[-50:])
|
193
|
-
),
|
194
|
-
media=media,
|
195
|
-
)
|
196
|
-
log_progress(
|
197
|
-
{
|
198
|
-
"type": "log",
|
199
|
-
"log_content": "Retrying code to test plans",
|
200
|
-
"status": "running",
|
201
|
-
"code": DefaultImports.prepend_imports(code),
|
202
|
-
}
|
203
|
-
)
|
204
|
-
code = extract_code(model(prompt, stream=False)) # type: ignore
|
205
|
-
tool_output = code_interpreter.exec_isolation(
|
206
|
-
DefaultImports.prepend_imports(code)
|
207
|
-
)
|
208
|
-
log_progress(
|
209
|
-
{
|
210
|
-
"type": "log",
|
211
|
-
"log_content": (
|
212
|
-
"Code execution succeeded"
|
213
|
-
if tool_output.success
|
214
|
-
else "Code execution failed"
|
215
|
-
),
|
216
|
-
"code": DefaultImports.prepend_imports(code),
|
217
|
-
# "payload": tool_output.to_json(),
|
218
|
-
"status": "completed" if tool_output.success else "failed",
|
219
|
-
}
|
220
|
-
)
|
221
|
-
tool_output_str = tool_output.text(include_results=False).strip()
|
222
|
-
|
223
|
-
if verbosity == 2:
|
224
|
-
_print_code("Code and test after attempted fix:", code)
|
225
|
-
_LOGGER.info(f"Code execution result after attempt {count + 1}")
|
226
|
-
_LOGGER.info(f"{tool_output_str}")
|
227
|
-
|
228
|
-
count += 1
|
229
|
-
|
230
|
-
if verbosity >= 1:
|
231
|
-
_print_code("Final code:", code)
|
232
|
-
|
233
|
-
user_req = chat[-1]["content"]
|
234
|
-
context = USER_REQ.format(user_request=user_req)
|
235
|
-
# because the tool picker model gets the image as well, we have to be careful with
|
236
|
-
# how much text we send it, so we truncate the tool output to 20,000 characters
|
237
|
-
prompt = PICK_PLAN.format(
|
238
|
-
context=context,
|
239
|
-
plans=format_plans(plans),
|
240
|
-
tool_output=tool_output_str[:20_000],
|
241
|
-
)
|
242
|
-
chat[-1]["content"] = prompt
|
243
|
-
|
244
|
-
count = 0
|
245
|
-
plan_thoughts = None
|
246
|
-
while plan_thoughts is None and count < max_retries:
|
247
|
-
try:
|
248
|
-
plan_thoughts = extract_json(model(chat, stream=False)) # type: ignore
|
249
|
-
except JSONDecodeError as e:
|
250
|
-
_LOGGER.exception(
|
251
|
-
f"Error while extracting JSON during picking best plan {str(e)}"
|
252
|
-
)
|
253
|
-
pass
|
254
|
-
count += 1
|
255
|
-
|
256
|
-
if (
|
257
|
-
plan_thoughts is None
|
258
|
-
or "best_plan" not in plan_thoughts
|
259
|
-
or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
|
260
|
-
):
|
261
|
-
_LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}")
|
262
|
-
plan_thoughts = {"best_plan": list(plans.keys())[0]}
|
263
|
-
|
264
|
-
if "thoughts" not in plan_thoughts:
|
265
|
-
plan_thoughts["thoughts"] = ""
|
266
|
-
|
267
|
-
if verbosity >= 1:
|
268
|
-
_LOGGER.info(f"Best plan:\n{plan_thoughts}")
|
269
|
-
log_progress(
|
270
|
-
{
|
271
|
-
"type": "log",
|
272
|
-
"log_content": "Picked best plan",
|
273
|
-
"status": "completed",
|
274
|
-
"payload": plans[plan_thoughts["best_plan"]],
|
275
|
-
}
|
276
|
-
)
|
277
|
-
return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str
|
278
49
|
|
279
50
|
|
280
51
|
def write_code(
|
@@ -393,7 +164,7 @@ def write_and_test_code(
|
|
393
164
|
}
|
394
165
|
)
|
395
166
|
if verbosity == 2:
|
396
|
-
|
167
|
+
print_code("Initial code and tests:", code, test)
|
397
168
|
_LOGGER.info(
|
398
169
|
f"Initial code execution result:\n{result.text(include_logs=True)}"
|
399
170
|
)
|
@@ -418,7 +189,7 @@ def write_and_test_code(
|
|
418
189
|
count += 1
|
419
190
|
|
420
191
|
if verbosity >= 1:
|
421
|
-
|
192
|
+
print_code("Final code and tests:", code, test)
|
422
193
|
|
423
194
|
return {
|
424
195
|
"code": code,
|
@@ -537,7 +308,7 @@ def debug_code(
|
|
537
308
|
}
|
538
309
|
)
|
539
310
|
if verbosity == 2:
|
540
|
-
|
311
|
+
print_code("Code and test after attempted fix:", code, test)
|
541
312
|
_LOGGER.info(
|
542
313
|
f"Reflection: {fixed_code_and_test['reflections']}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
|
543
314
|
)
|
@@ -545,62 +316,6 @@ def debug_code(
|
|
545
316
|
return code, test, result
|
546
317
|
|
547
318
|
|
548
|
-
def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
|
549
|
-
_CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
|
550
|
-
_CONSOLE.print("=" * 30 + " Code " + "=" * 30)
|
551
|
-
_CONSOLE.print(
|
552
|
-
Syntax(
|
553
|
-
DefaultImports.prepend_imports(code),
|
554
|
-
"python",
|
555
|
-
theme="gruvbox-dark",
|
556
|
-
line_numbers=True,
|
557
|
-
)
|
558
|
-
)
|
559
|
-
if test:
|
560
|
-
_CONSOLE.print("=" * 30 + " Test " + "=" * 30)
|
561
|
-
_CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
|
562
|
-
|
563
|
-
|
564
|
-
def retrieve_tools(
|
565
|
-
plans: Dict[str, Dict[str, Any]],
|
566
|
-
tool_recommender: Sim,
|
567
|
-
log_progress: Callable[[Dict[str, Any]], None],
|
568
|
-
verbosity: int = 0,
|
569
|
-
) -> Dict[str, str]:
|
570
|
-
log_progress(
|
571
|
-
{
|
572
|
-
"type": "log",
|
573
|
-
"log_content": ("Retrieving tools for each plan"),
|
574
|
-
"status": "started",
|
575
|
-
}
|
576
|
-
)
|
577
|
-
tool_info = []
|
578
|
-
tool_desc = []
|
579
|
-
tool_lists: Dict[str, List[Dict[str, str]]] = {}
|
580
|
-
for k, plan in plans.items():
|
581
|
-
tool_lists[k] = []
|
582
|
-
for task in plan["instructions"]:
|
583
|
-
tools = tool_recommender.top_k(task, k=2, thresh=0.3)
|
584
|
-
tool_info.extend([e["doc"] for e in tools])
|
585
|
-
tool_desc.extend([e["desc"] for e in tools])
|
586
|
-
tool_lists[k].extend(
|
587
|
-
{"description": e["desc"], "documentation": e["doc"]} for e in tools
|
588
|
-
)
|
589
|
-
|
590
|
-
if verbosity == 2:
|
591
|
-
tool_desc_str = "\n".join(set(tool_desc))
|
592
|
-
_LOGGER.info(f"Tools Description:\n{tool_desc_str}")
|
593
|
-
|
594
|
-
tool_lists_unique = {}
|
595
|
-
for k in tool_lists:
|
596
|
-
tool_lists_unique[k] = "\n\n".join(
|
597
|
-
set(e["documentation"] for e in tool_lists[k])
|
598
|
-
)
|
599
|
-
all_tools = "\n\n".join(set(tool_info))
|
600
|
-
tool_lists_unique["all"] = all_tools
|
601
|
-
return tool_lists_unique
|
602
|
-
|
603
|
-
|
604
319
|
class VisionAgentCoder(Agent):
|
605
320
|
"""Vision Agent Coder is an agentic framework that can output code based on a user
|
606
321
|
request. It can plan tasks, retrieve relevant tools, write code, write tests and
|
@@ -616,23 +331,22 @@ class VisionAgentCoder(Agent):
|
|
616
331
|
|
617
332
|
def __init__(
|
618
333
|
self,
|
619
|
-
planner: Optional[
|
334
|
+
planner: Optional[Agent] = None,
|
620
335
|
coder: Optional[LMM] = None,
|
621
336
|
tester: Optional[LMM] = None,
|
622
337
|
debugger: Optional[LMM] = None,
|
623
|
-
tool_recommender: Optional[Sim] = None,
|
624
338
|
verbosity: int = 0,
|
625
339
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
626
|
-
|
340
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
627
341
|
) -> None:
|
628
342
|
"""Initialize the Vision Agent Coder.
|
629
343
|
|
630
344
|
Parameters:
|
631
|
-
planner (Optional[
|
345
|
+
planner (Optional[Agent]): The planner model to use. Defaults to
|
346
|
+
AnthropicVisionAgentPlanner.
|
632
347
|
coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
|
633
348
|
tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
|
634
349
|
debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
|
635
|
-
tool_recommender (Optional[Sim]): The tool recommender model to use.
|
636
350
|
verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
|
637
351
|
highest verbosity level which will output all intermediate debugging
|
638
352
|
code.
|
@@ -641,14 +355,17 @@ class VisionAgentCoder(Agent):
|
|
641
355
|
in a web application where multiple VisionAgentCoder instances are
|
642
356
|
running in parallel. This callback ensures that the progress are not
|
643
357
|
mixed up.
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
If it's also None, the local python runtime environment will be used.
|
358
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
359
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
360
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
361
|
+
object is provided it will use that.
|
649
362
|
"""
|
650
363
|
|
651
|
-
self.planner =
|
364
|
+
self.planner = (
|
365
|
+
AnthropicVisionAgentPlanner(verbosity=verbosity)
|
366
|
+
if planner is None
|
367
|
+
else planner
|
368
|
+
)
|
652
369
|
self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
|
653
370
|
self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
|
654
371
|
self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
|
@@ -656,21 +373,15 @@ class VisionAgentCoder(Agent):
|
|
656
373
|
if self.verbosity > 0:
|
657
374
|
_LOGGER.setLevel(logging.INFO)
|
658
375
|
|
659
|
-
self.tool_recommender = (
|
660
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
661
|
-
if tool_recommender is None
|
662
|
-
else tool_recommender
|
663
|
-
)
|
664
376
|
self.report_progress_callback = report_progress_callback
|
665
|
-
self.
|
377
|
+
self.code_interpreter = code_interpreter
|
666
378
|
|
667
379
|
def __call__(
|
668
380
|
self,
|
669
381
|
input: Union[str, List[Message]],
|
670
382
|
media: Optional[Union[str, Path]] = None,
|
671
383
|
) -> str:
|
672
|
-
"""
|
673
|
-
task.
|
384
|
+
"""Generate code based on a user request.
|
674
385
|
|
675
386
|
Parameters:
|
676
387
|
input (Union[str, List[Message]]): A conversation in the format of
|
@@ -686,46 +397,58 @@ class VisionAgentCoder(Agent):
|
|
686
397
|
input = [{"role": "user", "content": input}]
|
687
398
|
if media is not None:
|
688
399
|
input[0]["media"] = [media]
|
689
|
-
|
690
|
-
|
691
|
-
return results["code"] # type: ignore
|
400
|
+
code_and_context = self.generate_code(input)
|
401
|
+
return code_and_context["code"] # type: ignore
|
692
402
|
|
693
|
-
def
|
403
|
+
def generate_code_from_plan(
|
694
404
|
self,
|
695
405
|
chat: List[Message],
|
696
|
-
|
697
|
-
|
698
|
-
custom_tool_names: Optional[List[str]] = None,
|
406
|
+
plan_context: PlanContext,
|
407
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
699
408
|
) -> Dict[str, Any]:
|
700
|
-
"""
|
701
|
-
|
409
|
+
"""Generates code and other intermediate outputs from a chat input and a plan.
|
410
|
+
The plan includes:
|
411
|
+
- plans: The plans generated by the planner.
|
412
|
+
- best_plan: The best plan selected by the planner.
|
413
|
+
- plan_thoughts: The thoughts of the planner, including any modifications
|
414
|
+
to the plan.
|
415
|
+
- tool_doc: The tool documentation for the best plan.
|
416
|
+
- tool_output: The tool output from the tools used by the best plan.
|
702
417
|
|
703
418
|
Parameters:
|
704
|
-
chat (List[Message]): A conversation
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
with the first plan.
|
712
|
-
display_visualization (bool): If True, it opens a new window locally to
|
713
|
-
show the image(s) created by visualization code (if there is any).
|
714
|
-
custom_tool_names (List[str]): A list of custom tools for the agent to pick
|
715
|
-
and use. If not provided, default to full tool set from vision_agent.tools.
|
419
|
+
chat (List[Message]): A conversation in the format of
|
420
|
+
[{"role": "user", "content": "describe your task here..."}].
|
421
|
+
plan_context (PlanContext): The context of the plan, including the plans,
|
422
|
+
best_plan, plan_thoughts, tool_doc, and tool_output.
|
423
|
+
test_multi_plan (bool): Whether to test multiple plans or just the best plan.
|
424
|
+
custom_tool_names (Optional[List[str]]): A list of custom tool names to use
|
425
|
+
for the planner.
|
716
426
|
|
717
427
|
Returns:
|
718
|
-
Dict[str, Any]: A dictionary containing the code
|
719
|
-
and
|
428
|
+
Dict[str, Any]: A dictionary containing the code output by the
|
429
|
+
VisionAgentCoder and other intermediate outputs. include:
|
430
|
+
- status (str): Whether or not the agent completed or failed generating
|
431
|
+
the code.
|
432
|
+
- code (str): The code output by the VisionAgentCoder.
|
433
|
+
- test (str): The test output by the VisionAgentCoder.
|
434
|
+
- test_result (Execution): The result of the test execution.
|
435
|
+
- plans (Dict[str, Any]): The plans generated by the planner.
|
436
|
+
- plan_thoughts (str): The thoughts of the planner.
|
437
|
+
- working_memory (List[Dict[str, str]]): The working memory of the agent.
|
720
438
|
"""
|
721
|
-
|
722
439
|
if not chat:
|
723
440
|
raise ValueError("Chat cannot be empty.")
|
724
441
|
|
725
442
|
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
726
|
-
|
727
|
-
|
728
|
-
|
443
|
+
code_interpreter = (
|
444
|
+
self.code_interpreter
|
445
|
+
if self.code_interpreter is not None
|
446
|
+
and not isinstance(self.code_interpreter, str)
|
447
|
+
else CodeInterpreterFactory.new_instance(
|
448
|
+
code_sandbox_runtime=self.code_interpreter,
|
449
|
+
)
|
450
|
+
)
|
451
|
+
with code_interpreter:
|
729
452
|
chat = copy.deepcopy(chat)
|
730
453
|
media_list = []
|
731
454
|
for chat_i in chat:
|
@@ -759,74 +482,22 @@ class VisionAgentCoder(Agent):
|
|
759
482
|
code = ""
|
760
483
|
test = ""
|
761
484
|
working_memory: List[Dict[str, str]] = []
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
plans = self._create_plans(
|
767
|
-
int_chat, custom_tool_names, working_memory, self.planner
|
768
|
-
)
|
769
|
-
|
770
|
-
if test_multi_plan:
|
771
|
-
self._log_plans(plans, self.verbosity)
|
772
|
-
|
773
|
-
tool_infos = retrieve_tools(
|
774
|
-
plans,
|
775
|
-
self.tool_recommender,
|
776
|
-
self.log_progress,
|
777
|
-
self.verbosity,
|
778
|
-
)
|
779
|
-
|
780
|
-
if test_multi_plan:
|
781
|
-
plan_thoughts, tool_output_str = pick_plan(
|
782
|
-
int_chat,
|
783
|
-
plans,
|
784
|
-
tool_infos["all"],
|
785
|
-
self.coder,
|
786
|
-
code_interpreter,
|
787
|
-
media_list,
|
788
|
-
self.log_progress,
|
789
|
-
verbosity=self.verbosity,
|
790
|
-
)
|
791
|
-
best_plan = plan_thoughts["best_plan"]
|
792
|
-
plan_thoughts_str = plan_thoughts["thoughts"]
|
793
|
-
else:
|
794
|
-
best_plan = list(plans.keys())[0]
|
795
|
-
tool_output_str = ""
|
796
|
-
plan_thoughts_str = ""
|
797
|
-
|
798
|
-
if best_plan in plans and best_plan in tool_infos:
|
799
|
-
plan_i = plans[best_plan]
|
800
|
-
tool_info = tool_infos[best_plan]
|
801
|
-
else:
|
802
|
-
if self.verbosity >= 1:
|
803
|
-
_LOGGER.warning(
|
804
|
-
f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
|
805
|
-
)
|
806
|
-
k = list(plans.keys())[0]
|
807
|
-
plan_i = plans[k]
|
808
|
-
tool_info = tool_infos[k]
|
809
|
-
|
810
|
-
self.log_progress(
|
811
|
-
{
|
812
|
-
"type": "log",
|
813
|
-
"log_content": "Creating plans",
|
814
|
-
"status": "completed",
|
815
|
-
"payload": tool_info,
|
816
|
-
}
|
817
|
-
)
|
485
|
+
plan = plan_context.plans[plan_context.best_plan]
|
486
|
+
tool_doc = plan_context.tool_doc
|
487
|
+
tool_output_str = plan_context.tool_output
|
488
|
+
plan_thoughts_str = str(plan_context.plan_thoughts)
|
818
489
|
|
819
490
|
if self.verbosity >= 1:
|
820
|
-
|
491
|
+
plan_fixed = [{"instructions": e} for e in plan["instructions"]]
|
821
492
|
_LOGGER.info(
|
822
|
-
f"Picked best plan:\n{tabulate(tabular_data=
|
493
|
+
f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
823
494
|
)
|
824
495
|
|
825
496
|
results = write_and_test_code(
|
826
497
|
chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
|
827
|
-
plan=f"\n{
|
828
|
-
+ "\n-".join([e for e in
|
829
|
-
tool_info=
|
498
|
+
plan=f"\n{plan['thoughts']}\n-"
|
499
|
+
+ "\n-".join([e for e in plan["instructions"]]),
|
500
|
+
tool_info=tool_doc,
|
830
501
|
tool_output=tool_output_str,
|
831
502
|
plan_thoughts=plan_thoughts_str,
|
832
503
|
tool_utils=T.UTILITIES_DOCSTRING,
|
@@ -842,64 +513,83 @@ class VisionAgentCoder(Agent):
|
|
842
513
|
success = cast(bool, results["success"])
|
843
514
|
code = remove_installs_from_code(cast(str, results["code"]))
|
844
515
|
test = remove_installs_from_code(cast(str, results["test"]))
|
845
|
-
working_memory.extend(results["working_memory"])
|
846
|
-
plan.append({"code": code, "test": test, "plan": plan_i})
|
516
|
+
working_memory.extend(results["working_memory"])
|
847
517
|
|
848
518
|
execution_result = cast(Execution, results["test_result"])
|
849
519
|
|
850
|
-
if display_visualization:
|
851
|
-
for res in execution_result.results:
|
852
|
-
if res.png:
|
853
|
-
b64_to_pil(res.png).show()
|
854
|
-
if res.mp4:
|
855
|
-
play_video(res.mp4)
|
856
|
-
|
857
520
|
return {
|
858
521
|
"status": "completed" if success else "failed",
|
859
522
|
"code": DefaultImports.prepend_imports(code),
|
860
523
|
"test": test,
|
861
524
|
"test_result": execution_result,
|
862
|
-
"plans": plans,
|
525
|
+
"plans": plan_context.plans,
|
863
526
|
"plan_thoughts": plan_thoughts_str,
|
864
527
|
"working_memory": working_memory,
|
865
528
|
}
|
866
529
|
|
867
|
-
def
|
868
|
-
if self.report_progress_callback is not None:
|
869
|
-
self.report_progress_callback(data)
|
870
|
-
|
871
|
-
def _create_plans(
|
530
|
+
def generate_code(
|
872
531
|
self,
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
planner: LMM,
|
532
|
+
chat: List[Message],
|
533
|
+
test_multi_plan: bool = True,
|
534
|
+
custom_tool_names: Optional[List[str]] = None,
|
877
535
|
) -> Dict[str, Any]:
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
"
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
536
|
+
"""Generates code and other intermediate outputs from a chat input.
|
537
|
+
|
538
|
+
Parameters:
|
539
|
+
chat (List[Message]): A conversation in the format of
|
540
|
+
[{"role": "user", "content": "describe your task here..."}].
|
541
|
+
test_multi_plan (bool): Whether to test multiple plans or just the best plan.
|
542
|
+
custom_tool_names (Optional[List[str]]): A list of custom tool names to use
|
543
|
+
for the planner.
|
544
|
+
|
545
|
+
Returns:
|
546
|
+
Dict[str, Any]: A dictionary containing the code output by the
|
547
|
+
VisionAgentCoder and other intermediate outputs. include:
|
548
|
+
- status (str): Whether or not the agent completed or failed generating
|
549
|
+
the code.
|
550
|
+
- code (str): The code output by the VisionAgentCoder.
|
551
|
+
- test (str): The test output by the VisionAgentCoder.
|
552
|
+
- test_result (Execution): The result of the test execution.
|
553
|
+
- plans (Dict[str, Any]): The plans generated by the planner.
|
554
|
+
- plan_thoughts (str): The thoughts of the planner.
|
555
|
+
- working_memory (List[Dict[str, str]]): The working memory of the agent.
|
556
|
+
"""
|
557
|
+
if not chat:
|
558
|
+
raise ValueError("Chat cannot be empty.")
|
559
|
+
|
560
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
561
|
+
code_interpreter = (
|
562
|
+
self.code_interpreter
|
563
|
+
if self.code_interpreter is not None
|
564
|
+
and not isinstance(self.code_interpreter, str)
|
565
|
+
else CodeInterpreterFactory.new_instance(
|
566
|
+
code_sandbox_runtime=self.code_interpreter,
|
567
|
+
)
|
892
568
|
)
|
893
|
-
|
569
|
+
with code_interpreter:
|
570
|
+
plan_context = self.planner.generate_plan( # type: ignore
|
571
|
+
chat,
|
572
|
+
test_multi_plan=test_multi_plan,
|
573
|
+
custom_tool_names=custom_tool_names,
|
574
|
+
code_interpreter=code_interpreter,
|
575
|
+
)
|
894
576
|
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
577
|
+
code_and_context = self.generate_code_from_plan(
|
578
|
+
chat,
|
579
|
+
plan_context,
|
580
|
+
code_interpreter=code_interpreter,
|
581
|
+
)
|
582
|
+
return code_and_context
|
583
|
+
|
584
|
+
def chat(self, chat: List[Message]) -> List[Message]:
|
585
|
+
chat = copy.deepcopy(chat)
|
586
|
+
code = self.generate_code(chat)
|
587
|
+
chat.append({"role": "agent", "content": code["code"]})
|
588
|
+
return chat
|
589
|
+
|
590
|
+
def log_progress(self, data: Dict[str, Any]) -> None:
|
591
|
+
if self.report_progress_callback is not None:
|
592
|
+
self.report_progress_callback(data)
|
903
593
|
|
904
594
|
|
905
595
|
class OpenAIVisionAgentCoder(VisionAgentCoder):
|
@@ -907,17 +597,18 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
|
|
907
597
|
|
908
598
|
def __init__(
|
909
599
|
self,
|
910
|
-
planner: Optional[
|
600
|
+
planner: Optional[Agent] = None,
|
911
601
|
coder: Optional[LMM] = None,
|
912
602
|
tester: Optional[LMM] = None,
|
913
603
|
debugger: Optional[LMM] = None,
|
914
|
-
tool_recommender: Optional[Sim] = None,
|
915
604
|
verbosity: int = 0,
|
916
605
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
917
|
-
|
606
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
918
607
|
) -> None:
|
919
608
|
self.planner = (
|
920
|
-
|
609
|
+
OpenAIVisionAgentPlanner(verbosity=verbosity)
|
610
|
+
if planner is None
|
611
|
+
else planner
|
921
612
|
)
|
922
613
|
self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
|
923
614
|
self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
|
@@ -926,13 +617,8 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
|
|
926
617
|
if self.verbosity > 0:
|
927
618
|
_LOGGER.setLevel(logging.INFO)
|
928
619
|
|
929
|
-
self.tool_recommender = (
|
930
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
931
|
-
if tool_recommender is None
|
932
|
-
else tool_recommender
|
933
|
-
)
|
934
620
|
self.report_progress_callback = report_progress_callback
|
935
|
-
self.
|
621
|
+
self.code_interpreter = code_interpreter
|
936
622
|
|
937
623
|
|
938
624
|
class AnthropicVisionAgentCoder(VisionAgentCoder):
|
@@ -940,17 +626,20 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
|
|
940
626
|
|
941
627
|
def __init__(
|
942
628
|
self,
|
943
|
-
planner: Optional[
|
629
|
+
planner: Optional[Agent] = None,
|
944
630
|
coder: Optional[LMM] = None,
|
945
631
|
tester: Optional[LMM] = None,
|
946
632
|
debugger: Optional[LMM] = None,
|
947
|
-
tool_recommender: Optional[Sim] = None,
|
948
633
|
verbosity: int = 0,
|
949
634
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
950
|
-
|
635
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
951
636
|
) -> None:
|
952
637
|
# NOTE: Claude doesn't have an official JSON mode
|
953
|
-
self.planner =
|
638
|
+
self.planner = (
|
639
|
+
AnthropicVisionAgentPlanner(verbosity=verbosity)
|
640
|
+
if planner is None
|
641
|
+
else planner
|
642
|
+
)
|
954
643
|
self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
|
955
644
|
self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
|
956
645
|
self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
|
@@ -958,15 +647,8 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
|
|
958
647
|
if self.verbosity > 0:
|
959
648
|
_LOGGER.setLevel(logging.INFO)
|
960
649
|
|
961
|
-
# Anthropic does not offer any embedding models and instead recomends Voyage,
|
962
|
-
# we're using OpenAI's embedder for now.
|
963
|
-
self.tool_recommender = (
|
964
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
965
|
-
if tool_recommender is None
|
966
|
-
else tool_recommender
|
967
|
-
)
|
968
650
|
self.report_progress_callback = report_progress_callback
|
969
|
-
self.
|
651
|
+
self.code_interpreter = code_interpreter
|
970
652
|
|
971
653
|
|
972
654
|
class OllamaVisionAgentCoder(VisionAgentCoder):
|
@@ -988,17 +670,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
|
|
988
670
|
|
989
671
|
def __init__(
|
990
672
|
self,
|
991
|
-
planner: Optional[
|
673
|
+
planner: Optional[Agent] = None,
|
992
674
|
coder: Optional[LMM] = None,
|
993
675
|
tester: Optional[LMM] = None,
|
994
676
|
debugger: Optional[LMM] = None,
|
995
|
-
tool_recommender: Optional[Sim] = None,
|
996
677
|
verbosity: int = 0,
|
997
678
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
679
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
998
680
|
) -> None:
|
999
681
|
super().__init__(
|
1000
682
|
planner=(
|
1001
|
-
|
683
|
+
OllamaVisionAgentPlanner(verbosity=verbosity)
|
1002
684
|
if planner is None
|
1003
685
|
else planner
|
1004
686
|
),
|
@@ -1017,13 +699,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
|
|
1017
699
|
if debugger is None
|
1018
700
|
else debugger
|
1019
701
|
),
|
1020
|
-
tool_recommender=(
|
1021
|
-
OllamaSim(T.TOOLS_DF, sim_key="desc")
|
1022
|
-
if tool_recommender is None
|
1023
|
-
else tool_recommender
|
1024
|
-
),
|
1025
702
|
verbosity=verbosity,
|
1026
703
|
report_progress_callback=report_progress_callback,
|
704
|
+
code_interpreter=code_interpreter,
|
1027
705
|
)
|
1028
706
|
|
1029
707
|
|
@@ -1043,22 +721,22 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1043
721
|
|
1044
722
|
def __init__(
|
1045
723
|
self,
|
1046
|
-
planner: Optional[
|
724
|
+
planner: Optional[Agent] = None,
|
1047
725
|
coder: Optional[LMM] = None,
|
1048
726
|
tester: Optional[LMM] = None,
|
1049
727
|
debugger: Optional[LMM] = None,
|
1050
|
-
tool_recommender: Optional[Sim] = None,
|
1051
728
|
verbosity: int = 0,
|
1052
729
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
730
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
1053
731
|
) -> None:
|
1054
732
|
"""Initialize the Vision Agent Coder.
|
1055
733
|
|
1056
734
|
Parameters:
|
1057
|
-
planner (Optional[
|
735
|
+
planner (Optional[Agent]): The planner model to use. Defaults to
|
736
|
+
AzureVisionAgentPlanner.
|
1058
737
|
coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
|
1059
738
|
tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
|
1060
739
|
debugger (Optional[LMM]): The debugger model to
|
1061
|
-
tool_recommender (Optional[Sim]): The tool recommender model to use.
|
1062
740
|
verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
|
1063
741
|
highest verbosity level which will output all intermediate debugging
|
1064
742
|
code.
|
@@ -1069,7 +747,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1069
747
|
"""
|
1070
748
|
super().__init__(
|
1071
749
|
planner=(
|
1072
|
-
|
750
|
+
AzureVisionAgentPlanner(verbosity=verbosity)
|
1073
751
|
if planner is None
|
1074
752
|
else planner
|
1075
753
|
),
|
@@ -1078,11 +756,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1078
756
|
debugger=(
|
1079
757
|
AzureOpenAILMM(temperature=0.0) if debugger is None else debugger
|
1080
758
|
),
|
1081
|
-
tool_recommender=(
|
1082
|
-
AzureSim(T.TOOLS_DF, sim_key="desc")
|
1083
|
-
if tool_recommender is None
|
1084
|
-
else tool_recommender
|
1085
|
-
),
|
1086
759
|
verbosity=verbosity,
|
1087
760
|
report_progress_callback=report_progress_callback,
|
761
|
+
code_interpreter=code_interpreter,
|
1088
762
|
)
|