vision-agent 0.2.161__py3-none-any.whl → 0.2.163__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +98 -2
- vision_agent/agent/vision_agent.py +54 -22
- vision_agent/agent/vision_agent_coder.py +222 -512
- vision_agent/agent/vision_agent_coder_prompts.py +12 -221
- vision_agent/agent/vision_agent_planner.py +583 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +107 -35
- vision_agent/tools/tools.py +2 -2
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/METADATA +8 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/RECORD +14 -12
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/WHEEL +0 -0
@@ -2,32 +2,35 @@ import copy
|
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import sys
|
5
|
-
from json import JSONDecodeError
|
6
5
|
from pathlib import Path
|
7
|
-
from typing import Any, Callable, Dict, List, Optional, Sequence,
|
6
|
+
from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
|
8
7
|
|
9
|
-
from
|
10
|
-
from rich.style import Style
|
11
|
-
from rich.syntax import Syntax
|
8
|
+
from redbaron import RedBaron # type: ignore
|
12
9
|
from tabulate import tabulate
|
13
10
|
|
14
11
|
import vision_agent.tools as T
|
15
|
-
from vision_agent.agent import Agent
|
12
|
+
from vision_agent.agent.agent import Agent
|
16
13
|
from vision_agent.agent.agent_utils import (
|
14
|
+
_MAX_TABULATE_COL_WIDTH,
|
15
|
+
DefaultImports,
|
17
16
|
extract_code,
|
18
|
-
|
17
|
+
extract_tag,
|
18
|
+
format_memory,
|
19
|
+
print_code,
|
19
20
|
remove_installs_from_code,
|
20
21
|
)
|
21
22
|
from vision_agent.agent.vision_agent_coder_prompts import (
|
22
23
|
CODE,
|
23
24
|
FIX_BUG,
|
24
25
|
FULL_TASK,
|
25
|
-
PICK_PLAN,
|
26
|
-
PLAN,
|
27
|
-
PREVIOUS_FAILED,
|
28
26
|
SIMPLE_TEST,
|
29
|
-
|
30
|
-
|
27
|
+
)
|
28
|
+
from vision_agent.agent.vision_agent_planner import (
|
29
|
+
AnthropicVisionAgentPlanner,
|
30
|
+
AzureVisionAgentPlanner,
|
31
|
+
OllamaVisionAgentPlanner,
|
32
|
+
OpenAIVisionAgentPlanner,
|
33
|
+
PlanContext,
|
31
34
|
)
|
32
35
|
from vision_agent.lmm import (
|
33
36
|
LMM,
|
@@ -40,241 +43,48 @@ from vision_agent.lmm import (
|
|
40
43
|
from vision_agent.tools.meta_tools import get_diff
|
41
44
|
from vision_agent.utils import CodeInterpreterFactory, Execution
|
42
45
|
from vision_agent.utils.execute import CodeInterpreter
|
43
|
-
from vision_agent.utils.image_utils import b64_to_pil
|
44
|
-
from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
|
45
|
-
from vision_agent.utils.video import play_video
|
46
46
|
|
47
47
|
logging.basicConfig(stream=sys.stdout)
|
48
48
|
WORKSPACE = Path(os.getenv("WORKSPACE", ""))
|
49
49
|
_LOGGER = logging.getLogger(__name__)
|
50
|
-
_MAX_TABULATE_COL_WIDTH = 80
|
51
|
-
_CONSOLE = Console()
|
52
|
-
|
53
|
-
|
54
|
-
class DefaultImports:
|
55
|
-
"""Container for default imports used in the code execution."""
|
56
|
-
|
57
|
-
common_imports = [
|
58
|
-
"import os",
|
59
|
-
"import numpy as np",
|
60
|
-
"from vision_agent.tools import *",
|
61
|
-
"from typing import *",
|
62
|
-
"from pillow_heif import register_heif_opener",
|
63
|
-
"register_heif_opener()",
|
64
|
-
]
|
65
|
-
|
66
|
-
@staticmethod
|
67
|
-
def to_code_string() -> str:
|
68
|
-
return "\n".join(DefaultImports.common_imports + T.__new_tools__)
|
69
|
-
|
70
|
-
@staticmethod
|
71
|
-
def prepend_imports(code: str) -> str:
|
72
|
-
"""Run this method to prepend the default imports to the code.
|
73
|
-
NOTE: be sure to run this method after the custom tools have been registered.
|
74
|
-
"""
|
75
|
-
return DefaultImports.to_code_string() + "\n\n" + code
|
76
|
-
|
77
|
-
|
78
|
-
def format_memory(memory: List[Dict[str, str]]) -> str:
|
79
|
-
output_str = ""
|
80
|
-
for i, m in enumerate(memory):
|
81
|
-
output_str += f"### Feedback {i}:\n"
|
82
|
-
output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
|
83
|
-
output_str += f"Feedback {i}: {m['feedback']}\n\n"
|
84
|
-
if "edits" in m:
|
85
|
-
output_str += f"Edits {i}:\n{m['edits']}\n"
|
86
|
-
output_str += "\n"
|
87
|
-
|
88
|
-
return output_str
|
89
|
-
|
90
|
-
|
91
|
-
def format_plans(plans: Dict[str, Any]) -> str:
|
92
|
-
plan_str = ""
|
93
|
-
for k, v in plans.items():
|
94
|
-
plan_str += "\n" + f"{k}: {v['thoughts']}\n"
|
95
|
-
plan_str += " -" + "\n -".join([e for e in v["instructions"]])
|
96
|
-
|
97
|
-
return plan_str
|
98
|
-
|
99
|
-
|
100
|
-
def write_plans(
|
101
|
-
chat: List[Message],
|
102
|
-
tool_desc: str,
|
103
|
-
working_memory: str,
|
104
|
-
model: LMM,
|
105
|
-
) -> Dict[str, Any]:
|
106
|
-
chat = copy.deepcopy(chat)
|
107
|
-
if chat[-1]["role"] != "user":
|
108
|
-
raise ValueError("Last chat message must be from the user.")
|
109
|
-
|
110
|
-
user_request = chat[-1]["content"]
|
111
|
-
context = USER_REQ.format(user_request=user_request)
|
112
|
-
prompt = PLAN.format(
|
113
|
-
context=context,
|
114
|
-
tool_desc=tool_desc,
|
115
|
-
feedback=working_memory,
|
116
|
-
)
|
117
|
-
chat[-1]["content"] = prompt
|
118
|
-
return extract_json(model(chat, stream=False)) # type: ignore
|
119
|
-
|
120
|
-
|
121
|
-
def pick_plan(
|
122
|
-
chat: List[Message],
|
123
|
-
plans: Dict[str, Any],
|
124
|
-
tool_info: str,
|
125
|
-
model: LMM,
|
126
|
-
code_interpreter: CodeInterpreter,
|
127
|
-
media: List[str],
|
128
|
-
log_progress: Callable[[Dict[str, Any]], None],
|
129
|
-
verbosity: int = 0,
|
130
|
-
max_retries: int = 3,
|
131
|
-
) -> Tuple[Dict[str, str], str]:
|
132
|
-
log_progress(
|
133
|
-
{
|
134
|
-
"type": "log",
|
135
|
-
"log_content": "Generating code to pick the best plan",
|
136
|
-
"status": "started",
|
137
|
-
}
|
138
|
-
)
|
139
50
|
|
140
|
-
chat = copy.deepcopy(chat)
|
141
|
-
if chat[-1]["role"] != "user":
|
142
|
-
raise ValueError("Last chat message must be from the user.")
|
143
51
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
)
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
)
|
181
|
-
|
182
|
-
# retry if the tool output is empty or code fails
|
183
|
-
count = 0
|
184
|
-
while (
|
185
|
-
not tool_output.success
|
186
|
-
or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
|
187
|
-
) and count < max_retries:
|
188
|
-
prompt = TEST_PLANS.format(
|
189
|
-
docstring=tool_info,
|
190
|
-
plans=plan_str,
|
191
|
-
previous_attempts=PREVIOUS_FAILED.format(
|
192
|
-
code=code, error="\n".join(tool_output_str.splitlines()[-50:])
|
193
|
-
),
|
194
|
-
media=media,
|
195
|
-
)
|
196
|
-
log_progress(
|
197
|
-
{
|
198
|
-
"type": "log",
|
199
|
-
"log_content": "Retrying code to test plans",
|
200
|
-
"status": "running",
|
201
|
-
"code": DefaultImports.prepend_imports(code),
|
202
|
-
}
|
203
|
-
)
|
204
|
-
code = extract_code(model(prompt, stream=False)) # type: ignore
|
205
|
-
tool_output = code_interpreter.exec_isolation(
|
206
|
-
DefaultImports.prepend_imports(code)
|
207
|
-
)
|
208
|
-
log_progress(
|
209
|
-
{
|
210
|
-
"type": "log",
|
211
|
-
"log_content": (
|
212
|
-
"Code execution succeeded"
|
213
|
-
if tool_output.success
|
214
|
-
else "Code execution failed"
|
215
|
-
),
|
216
|
-
"code": DefaultImports.prepend_imports(code),
|
217
|
-
# "payload": tool_output.to_json(),
|
218
|
-
"status": "completed" if tool_output.success else "failed",
|
219
|
-
}
|
220
|
-
)
|
221
|
-
tool_output_str = tool_output.text(include_results=False).strip()
|
222
|
-
|
223
|
-
if verbosity == 2:
|
224
|
-
_print_code("Code and test after attempted fix:", code)
|
225
|
-
_LOGGER.info(f"Code execution result after attempt {count + 1}")
|
226
|
-
_LOGGER.info(f"{tool_output_str}")
|
227
|
-
|
228
|
-
count += 1
|
229
|
-
|
230
|
-
if verbosity >= 1:
|
231
|
-
_print_code("Final code:", code)
|
232
|
-
|
233
|
-
user_req = chat[-1]["content"]
|
234
|
-
context = USER_REQ.format(user_request=user_req)
|
235
|
-
# because the tool picker model gets the image as well, we have to be careful with
|
236
|
-
# how much text we send it, so we truncate the tool output to 20,000 characters
|
237
|
-
prompt = PICK_PLAN.format(
|
238
|
-
context=context,
|
239
|
-
plans=format_plans(plans),
|
240
|
-
tool_output=tool_output_str[:20_000],
|
241
|
-
)
|
242
|
-
chat[-1]["content"] = prompt
|
243
|
-
|
244
|
-
count = 0
|
245
|
-
plan_thoughts = None
|
246
|
-
while plan_thoughts is None and count < max_retries:
|
247
|
-
try:
|
248
|
-
plan_thoughts = extract_json(model(chat, stream=False)) # type: ignore
|
249
|
-
except JSONDecodeError as e:
|
250
|
-
_LOGGER.exception(
|
251
|
-
f"Error while extracting JSON during picking best plan {str(e)}"
|
252
|
-
)
|
253
|
-
pass
|
254
|
-
count += 1
|
255
|
-
|
256
|
-
if (
|
257
|
-
plan_thoughts is None
|
258
|
-
or "best_plan" not in plan_thoughts
|
259
|
-
or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
|
260
|
-
):
|
261
|
-
_LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}")
|
262
|
-
plan_thoughts = {"best_plan": list(plans.keys())[0]}
|
263
|
-
|
264
|
-
if "thoughts" not in plan_thoughts:
|
265
|
-
plan_thoughts["thoughts"] = ""
|
266
|
-
|
267
|
-
if verbosity >= 1:
|
268
|
-
_LOGGER.info(f"Best plan:\n{plan_thoughts}")
|
269
|
-
log_progress(
|
270
|
-
{
|
271
|
-
"type": "log",
|
272
|
-
"log_content": "Picked best plan",
|
273
|
-
"status": "completed",
|
274
|
-
"payload": plans[plan_thoughts["best_plan"]],
|
275
|
-
}
|
276
|
-
)
|
277
|
-
return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str
|
52
|
+
def strip_function_calls(code: str, exclusions: Optional[List[str]] = None) -> str:
|
53
|
+
"""This will strip out all code that calls functions except for functions included
|
54
|
+
in exclusions.
|
55
|
+
"""
|
56
|
+
if exclusions is None:
|
57
|
+
exclusions = []
|
58
|
+
|
59
|
+
red = RedBaron(code)
|
60
|
+
nodes_to_remove = []
|
61
|
+
for node in red:
|
62
|
+
if node.type == "def":
|
63
|
+
continue
|
64
|
+
elif node.type == "import" or node.type == "from_import":
|
65
|
+
continue
|
66
|
+
elif node.type == "call":
|
67
|
+
if node.value and node.value[0].value in exclusions:
|
68
|
+
continue
|
69
|
+
nodes_to_remove.append(node)
|
70
|
+
elif node.type == "atomtrailers":
|
71
|
+
if node[0].value in exclusions:
|
72
|
+
continue
|
73
|
+
nodes_to_remove.append(node)
|
74
|
+
elif node.type == "assignment":
|
75
|
+
if node.value.type == "call" or node.value.type == "atomtrailers":
|
76
|
+
func_name = node.value[0].value
|
77
|
+
if func_name in exclusions:
|
78
|
+
continue
|
79
|
+
nodes_to_remove.append(node)
|
80
|
+
elif node.type == "endl":
|
81
|
+
continue
|
82
|
+
else:
|
83
|
+
nodes_to_remove.append(node)
|
84
|
+
for node in nodes_to_remove:
|
85
|
+
node.parent.remove(node)
|
86
|
+
cleaned_code = red.dumps().strip()
|
87
|
+
return cleaned_code if isinstance(cleaned_code, str) else code
|
278
88
|
|
279
89
|
|
280
90
|
def write_code(
|
@@ -359,6 +169,7 @@ def write_and_test_code(
|
|
359
169
|
plan_thoughts,
|
360
170
|
format_memory(working_memory),
|
361
171
|
)
|
172
|
+
code = strip_function_calls(code)
|
362
173
|
test = write_test(
|
363
174
|
tester, chat, tool_utils, code, format_memory(working_memory), media
|
364
175
|
)
|
@@ -393,7 +204,7 @@ def write_and_test_code(
|
|
393
204
|
}
|
394
205
|
)
|
395
206
|
if verbosity == 2:
|
396
|
-
|
207
|
+
print_code("Initial code and tests:", code, test)
|
397
208
|
_LOGGER.info(
|
398
209
|
f"Initial code execution result:\n{result.text(include_logs=True)}"
|
399
210
|
)
|
@@ -418,7 +229,7 @@ def write_and_test_code(
|
|
418
229
|
count += 1
|
419
230
|
|
420
231
|
if verbosity >= 1:
|
421
|
-
|
232
|
+
print_code("Final code and tests:", code, test)
|
422
233
|
|
423
234
|
return {
|
424
235
|
"code": code,
|
@@ -449,7 +260,9 @@ def debug_code(
|
|
449
260
|
}
|
450
261
|
)
|
451
262
|
|
452
|
-
|
263
|
+
fixed_code = None
|
264
|
+
fixed_test = None
|
265
|
+
thoughts = ""
|
453
266
|
success = False
|
454
267
|
count = 0
|
455
268
|
while not success and count < 3:
|
@@ -472,21 +285,16 @@ def debug_code(
|
|
472
285
|
stream=False,
|
473
286
|
)
|
474
287
|
fixed_code_and_test_str = cast(str, fixed_code_and_test_str)
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
fixed_code_and_test["test"] = ""
|
486
|
-
if "which_code" in fixed_code_and_test:
|
487
|
-
del fixed_code_and_test["which_code"]
|
488
|
-
|
489
|
-
success = True
|
288
|
+
thoughts_tag = extract_tag(fixed_code_and_test_str, "thoughts")
|
289
|
+
thoughts = thoughts_tag if thoughts_tag is not None else ""
|
290
|
+
fixed_code = extract_tag(fixed_code_and_test_str, "code")
|
291
|
+
fixed_test = extract_tag(fixed_code_and_test_str, "test")
|
292
|
+
|
293
|
+
if fixed_code is None and fixed_test is None:
|
294
|
+
success = False
|
295
|
+
else:
|
296
|
+
success = True
|
297
|
+
|
490
298
|
except Exception as e:
|
491
299
|
_LOGGER.exception(f"Error while extracting JSON: {e}")
|
492
300
|
|
@@ -495,15 +303,15 @@ def debug_code(
|
|
495
303
|
old_code = code
|
496
304
|
old_test = test
|
497
305
|
|
498
|
-
if
|
499
|
-
code =
|
500
|
-
if
|
501
|
-
test =
|
306
|
+
if fixed_code is not None and fixed_code.strip() != "":
|
307
|
+
code = fixed_code
|
308
|
+
if fixed_test is not None and fixed_test.strip() != "":
|
309
|
+
test = fixed_test
|
502
310
|
|
503
311
|
new_working_memory.append(
|
504
312
|
{
|
505
313
|
"code": f"{code}\n{test}",
|
506
|
-
"feedback":
|
314
|
+
"feedback": thoughts,
|
507
315
|
"edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
|
508
316
|
}
|
509
317
|
)
|
@@ -537,70 +345,14 @@ def debug_code(
|
|
537
345
|
}
|
538
346
|
)
|
539
347
|
if verbosity == 2:
|
540
|
-
|
348
|
+
print_code("Code and test after attempted fix:", code, test)
|
541
349
|
_LOGGER.info(
|
542
|
-
f"Reflection: {
|
350
|
+
f"Reflection: {thoughts}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
|
543
351
|
)
|
544
352
|
|
545
353
|
return code, test, result
|
546
354
|
|
547
355
|
|
548
|
-
def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
|
549
|
-
_CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
|
550
|
-
_CONSOLE.print("=" * 30 + " Code " + "=" * 30)
|
551
|
-
_CONSOLE.print(
|
552
|
-
Syntax(
|
553
|
-
DefaultImports.prepend_imports(code),
|
554
|
-
"python",
|
555
|
-
theme="gruvbox-dark",
|
556
|
-
line_numbers=True,
|
557
|
-
)
|
558
|
-
)
|
559
|
-
if test:
|
560
|
-
_CONSOLE.print("=" * 30 + " Test " + "=" * 30)
|
561
|
-
_CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
|
562
|
-
|
563
|
-
|
564
|
-
def retrieve_tools(
|
565
|
-
plans: Dict[str, Dict[str, Any]],
|
566
|
-
tool_recommender: Sim,
|
567
|
-
log_progress: Callable[[Dict[str, Any]], None],
|
568
|
-
verbosity: int = 0,
|
569
|
-
) -> Dict[str, str]:
|
570
|
-
log_progress(
|
571
|
-
{
|
572
|
-
"type": "log",
|
573
|
-
"log_content": ("Retrieving tools for each plan"),
|
574
|
-
"status": "started",
|
575
|
-
}
|
576
|
-
)
|
577
|
-
tool_info = []
|
578
|
-
tool_desc = []
|
579
|
-
tool_lists: Dict[str, List[Dict[str, str]]] = {}
|
580
|
-
for k, plan in plans.items():
|
581
|
-
tool_lists[k] = []
|
582
|
-
for task in plan["instructions"]:
|
583
|
-
tools = tool_recommender.top_k(task, k=2, thresh=0.3)
|
584
|
-
tool_info.extend([e["doc"] for e in tools])
|
585
|
-
tool_desc.extend([e["desc"] for e in tools])
|
586
|
-
tool_lists[k].extend(
|
587
|
-
{"description": e["desc"], "documentation": e["doc"]} for e in tools
|
588
|
-
)
|
589
|
-
|
590
|
-
if verbosity == 2:
|
591
|
-
tool_desc_str = "\n".join(set(tool_desc))
|
592
|
-
_LOGGER.info(f"Tools Description:\n{tool_desc_str}")
|
593
|
-
|
594
|
-
tool_lists_unique = {}
|
595
|
-
for k in tool_lists:
|
596
|
-
tool_lists_unique[k] = "\n\n".join(
|
597
|
-
set(e["documentation"] for e in tool_lists[k])
|
598
|
-
)
|
599
|
-
all_tools = "\n\n".join(set(tool_info))
|
600
|
-
tool_lists_unique["all"] = all_tools
|
601
|
-
return tool_lists_unique
|
602
|
-
|
603
|
-
|
604
356
|
class VisionAgentCoder(Agent):
|
605
357
|
"""Vision Agent Coder is an agentic framework that can output code based on a user
|
606
358
|
request. It can plan tasks, retrieve relevant tools, write code, write tests and
|
@@ -616,23 +368,22 @@ class VisionAgentCoder(Agent):
|
|
616
368
|
|
617
369
|
def __init__(
|
618
370
|
self,
|
619
|
-
planner: Optional[
|
371
|
+
planner: Optional[Agent] = None,
|
620
372
|
coder: Optional[LMM] = None,
|
621
373
|
tester: Optional[LMM] = None,
|
622
374
|
debugger: Optional[LMM] = None,
|
623
|
-
tool_recommender: Optional[Sim] = None,
|
624
375
|
verbosity: int = 0,
|
625
376
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
626
|
-
|
377
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
627
378
|
) -> None:
|
628
379
|
"""Initialize the Vision Agent Coder.
|
629
380
|
|
630
381
|
Parameters:
|
631
|
-
planner (Optional[
|
382
|
+
planner (Optional[Agent]): The planner model to use. Defaults to
|
383
|
+
AnthropicVisionAgentPlanner.
|
632
384
|
coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
|
633
385
|
tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
|
634
386
|
debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
|
635
|
-
tool_recommender (Optional[Sim]): The tool recommender model to use.
|
636
387
|
verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
|
637
388
|
highest verbosity level which will output all intermediate debugging
|
638
389
|
code.
|
@@ -641,14 +392,17 @@ class VisionAgentCoder(Agent):
|
|
641
392
|
in a web application where multiple VisionAgentCoder instances are
|
642
393
|
running in parallel. This callback ensures that the progress are not
|
643
394
|
mixed up.
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
If it's also None, the local python runtime environment will be used.
|
395
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
396
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
397
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
398
|
+
object is provided it will use that.
|
649
399
|
"""
|
650
400
|
|
651
|
-
self.planner =
|
401
|
+
self.planner = (
|
402
|
+
AnthropicVisionAgentPlanner(verbosity=verbosity)
|
403
|
+
if planner is None
|
404
|
+
else planner
|
405
|
+
)
|
652
406
|
self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
|
653
407
|
self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
|
654
408
|
self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
|
@@ -656,21 +410,15 @@ class VisionAgentCoder(Agent):
|
|
656
410
|
if self.verbosity > 0:
|
657
411
|
_LOGGER.setLevel(logging.INFO)
|
658
412
|
|
659
|
-
self.tool_recommender = (
|
660
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
661
|
-
if tool_recommender is None
|
662
|
-
else tool_recommender
|
663
|
-
)
|
664
413
|
self.report_progress_callback = report_progress_callback
|
665
|
-
self.
|
414
|
+
self.code_interpreter = code_interpreter
|
666
415
|
|
667
416
|
def __call__(
|
668
417
|
self,
|
669
418
|
input: Union[str, List[Message]],
|
670
419
|
media: Optional[Union[str, Path]] = None,
|
671
420
|
) -> str:
|
672
|
-
"""
|
673
|
-
task.
|
421
|
+
"""Generate code based on a user request.
|
674
422
|
|
675
423
|
Parameters:
|
676
424
|
input (Union[str, List[Message]]): A conversation in the format of
|
@@ -686,46 +434,58 @@ class VisionAgentCoder(Agent):
|
|
686
434
|
input = [{"role": "user", "content": input}]
|
687
435
|
if media is not None:
|
688
436
|
input[0]["media"] = [media]
|
689
|
-
|
690
|
-
|
691
|
-
return results["code"] # type: ignore
|
437
|
+
code_and_context = self.generate_code(input)
|
438
|
+
return code_and_context["code"] # type: ignore
|
692
439
|
|
693
|
-
def
|
440
|
+
def generate_code_from_plan(
|
694
441
|
self,
|
695
442
|
chat: List[Message],
|
696
|
-
|
697
|
-
|
698
|
-
custom_tool_names: Optional[List[str]] = None,
|
443
|
+
plan_context: PlanContext,
|
444
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
699
445
|
) -> Dict[str, Any]:
|
700
|
-
"""
|
701
|
-
|
446
|
+
"""Generates code and other intermediate outputs from a chat input and a plan.
|
447
|
+
The plan includes:
|
448
|
+
- plans: The plans generated by the planner.
|
449
|
+
- best_plan: The best plan selected by the planner.
|
450
|
+
- plan_thoughts: The thoughts of the planner, including any modifications
|
451
|
+
to the plan.
|
452
|
+
- tool_doc: The tool documentation for the best plan.
|
453
|
+
- tool_output: The tool output from the tools used by the best plan.
|
702
454
|
|
703
455
|
Parameters:
|
704
|
-
chat (List[Message]): A conversation
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
with the first plan.
|
712
|
-
display_visualization (bool): If True, it opens a new window locally to
|
713
|
-
show the image(s) created by visualization code (if there is any).
|
714
|
-
custom_tool_names (List[str]): A list of custom tools for the agent to pick
|
715
|
-
and use. If not provided, default to full tool set from vision_agent.tools.
|
456
|
+
chat (List[Message]): A conversation in the format of
|
457
|
+
[{"role": "user", "content": "describe your task here..."}].
|
458
|
+
plan_context (PlanContext): The context of the plan, including the plans,
|
459
|
+
best_plan, plan_thoughts, tool_doc, and tool_output.
|
460
|
+
test_multi_plan (bool): Whether to test multiple plans or just the best plan.
|
461
|
+
custom_tool_names (Optional[List[str]]): A list of custom tool names to use
|
462
|
+
for the planner.
|
716
463
|
|
717
464
|
Returns:
|
718
|
-
Dict[str, Any]: A dictionary containing the code
|
719
|
-
and
|
465
|
+
Dict[str, Any]: A dictionary containing the code output by the
|
466
|
+
VisionAgentCoder and other intermediate outputs. include:
|
467
|
+
- status (str): Whether or not the agent completed or failed generating
|
468
|
+
the code.
|
469
|
+
- code (str): The code output by the VisionAgentCoder.
|
470
|
+
- test (str): The test output by the VisionAgentCoder.
|
471
|
+
- test_result (Execution): The result of the test execution.
|
472
|
+
- plans (Dict[str, Any]): The plans generated by the planner.
|
473
|
+
- plan_thoughts (str): The thoughts of the planner.
|
474
|
+
- working_memory (List[Dict[str, str]]): The working memory of the agent.
|
720
475
|
"""
|
721
|
-
|
722
476
|
if not chat:
|
723
477
|
raise ValueError("Chat cannot be empty.")
|
724
478
|
|
725
479
|
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
726
|
-
|
727
|
-
|
728
|
-
|
480
|
+
code_interpreter = (
|
481
|
+
self.code_interpreter
|
482
|
+
if self.code_interpreter is not None
|
483
|
+
and not isinstance(self.code_interpreter, str)
|
484
|
+
else CodeInterpreterFactory.new_instance(
|
485
|
+
code_sandbox_runtime=self.code_interpreter,
|
486
|
+
)
|
487
|
+
)
|
488
|
+
with code_interpreter:
|
729
489
|
chat = copy.deepcopy(chat)
|
730
490
|
media_list = []
|
731
491
|
for chat_i in chat:
|
@@ -759,74 +519,22 @@ class VisionAgentCoder(Agent):
|
|
759
519
|
code = ""
|
760
520
|
test = ""
|
761
521
|
working_memory: List[Dict[str, str]] = []
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
plans = self._create_plans(
|
767
|
-
int_chat, custom_tool_names, working_memory, self.planner
|
768
|
-
)
|
769
|
-
|
770
|
-
if test_multi_plan:
|
771
|
-
self._log_plans(plans, self.verbosity)
|
772
|
-
|
773
|
-
tool_infos = retrieve_tools(
|
774
|
-
plans,
|
775
|
-
self.tool_recommender,
|
776
|
-
self.log_progress,
|
777
|
-
self.verbosity,
|
778
|
-
)
|
779
|
-
|
780
|
-
if test_multi_plan:
|
781
|
-
plan_thoughts, tool_output_str = pick_plan(
|
782
|
-
int_chat,
|
783
|
-
plans,
|
784
|
-
tool_infos["all"],
|
785
|
-
self.coder,
|
786
|
-
code_interpreter,
|
787
|
-
media_list,
|
788
|
-
self.log_progress,
|
789
|
-
verbosity=self.verbosity,
|
790
|
-
)
|
791
|
-
best_plan = plan_thoughts["best_plan"]
|
792
|
-
plan_thoughts_str = plan_thoughts["thoughts"]
|
793
|
-
else:
|
794
|
-
best_plan = list(plans.keys())[0]
|
795
|
-
tool_output_str = ""
|
796
|
-
plan_thoughts_str = ""
|
797
|
-
|
798
|
-
if best_plan in plans and best_plan in tool_infos:
|
799
|
-
plan_i = plans[best_plan]
|
800
|
-
tool_info = tool_infos[best_plan]
|
801
|
-
else:
|
802
|
-
if self.verbosity >= 1:
|
803
|
-
_LOGGER.warning(
|
804
|
-
f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
|
805
|
-
)
|
806
|
-
k = list(plans.keys())[0]
|
807
|
-
plan_i = plans[k]
|
808
|
-
tool_info = tool_infos[k]
|
809
|
-
|
810
|
-
self.log_progress(
|
811
|
-
{
|
812
|
-
"type": "log",
|
813
|
-
"log_content": "Creating plans",
|
814
|
-
"status": "completed",
|
815
|
-
"payload": tool_info,
|
816
|
-
}
|
817
|
-
)
|
522
|
+
plan = plan_context.plans[plan_context.best_plan]
|
523
|
+
tool_doc = plan_context.tool_doc
|
524
|
+
tool_output_str = plan_context.tool_output
|
525
|
+
plan_thoughts_str = str(plan_context.plan_thoughts)
|
818
526
|
|
819
527
|
if self.verbosity >= 1:
|
820
|
-
|
528
|
+
plan_fixed = [{"instructions": e} for e in plan["instructions"]]
|
821
529
|
_LOGGER.info(
|
822
|
-
f"Picked best plan:\n{tabulate(tabular_data=
|
530
|
+
f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
823
531
|
)
|
824
532
|
|
825
533
|
results = write_and_test_code(
|
826
534
|
chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
|
827
|
-
plan=f"\n{
|
828
|
-
+ "\n-".join([e for e in
|
829
|
-
tool_info=
|
535
|
+
plan=f"\n{plan['thoughts']}\n-"
|
536
|
+
+ "\n-".join([e for e in plan["instructions"]]),
|
537
|
+
tool_info=tool_doc,
|
830
538
|
tool_output=tool_output_str,
|
831
539
|
plan_thoughts=plan_thoughts_str,
|
832
540
|
tool_utils=T.UTILITIES_DOCSTRING,
|
@@ -842,64 +550,82 @@ class VisionAgentCoder(Agent):
|
|
842
550
|
success = cast(bool, results["success"])
|
843
551
|
code = remove_installs_from_code(cast(str, results["code"]))
|
844
552
|
test = remove_installs_from_code(cast(str, results["test"]))
|
845
|
-
working_memory.extend(results["working_memory"])
|
846
|
-
plan.append({"code": code, "test": test, "plan": plan_i})
|
847
|
-
|
553
|
+
working_memory.extend(results["working_memory"])
|
848
554
|
execution_result = cast(Execution, results["test_result"])
|
849
555
|
|
850
|
-
if display_visualization:
|
851
|
-
for res in execution_result.results:
|
852
|
-
if res.png:
|
853
|
-
b64_to_pil(res.png).show()
|
854
|
-
if res.mp4:
|
855
|
-
play_video(res.mp4)
|
856
|
-
|
857
556
|
return {
|
858
557
|
"status": "completed" if success else "failed",
|
859
558
|
"code": DefaultImports.prepend_imports(code),
|
860
559
|
"test": test,
|
861
560
|
"test_result": execution_result,
|
862
|
-
"plans": plans,
|
561
|
+
"plans": plan_context.plans,
|
863
562
|
"plan_thoughts": plan_thoughts_str,
|
864
563
|
"working_memory": working_memory,
|
865
564
|
}
|
866
565
|
|
867
|
-
def
|
868
|
-
if self.report_progress_callback is not None:
|
869
|
-
self.report_progress_callback(data)
|
870
|
-
|
871
|
-
def _create_plans(
|
566
|
+
def generate_code(
|
872
567
|
self,
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
planner: LMM,
|
568
|
+
chat: List[Message],
|
569
|
+
test_multi_plan: bool = True,
|
570
|
+
custom_tool_names: Optional[List[str]] = None,
|
877
571
|
) -> Dict[str, Any]:
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
"
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
572
|
+
"""Generates code and other intermediate outputs from a chat input.
|
573
|
+
|
574
|
+
Parameters:
|
575
|
+
chat (List[Message]): A conversation in the format of
|
576
|
+
[{"role": "user", "content": "describe your task here..."}].
|
577
|
+
test_multi_plan (bool): Whether to test multiple plans or just the best plan.
|
578
|
+
custom_tool_names (Optional[List[str]]): A list of custom tool names to use
|
579
|
+
for the planner.
|
580
|
+
|
581
|
+
Returns:
|
582
|
+
Dict[str, Any]: A dictionary containing the code output by the
|
583
|
+
VisionAgentCoder and other intermediate outputs. include:
|
584
|
+
- status (str): Whether or not the agent completed or failed generating
|
585
|
+
the code.
|
586
|
+
- code (str): The code output by the VisionAgentCoder.
|
587
|
+
- test (str): The test output by the VisionAgentCoder.
|
588
|
+
- test_result (Execution): The result of the test execution.
|
589
|
+
- plans (Dict[str, Any]): The plans generated by the planner.
|
590
|
+
- plan_thoughts (str): The thoughts of the planner.
|
591
|
+
- working_memory (List[Dict[str, str]]): The working memory of the agent.
|
592
|
+
"""
|
593
|
+
if not chat:
|
594
|
+
raise ValueError("Chat cannot be empty.")
|
595
|
+
|
596
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
597
|
+
code_interpreter = (
|
598
|
+
self.code_interpreter
|
599
|
+
if self.code_interpreter is not None
|
600
|
+
and not isinstance(self.code_interpreter, str)
|
601
|
+
else CodeInterpreterFactory.new_instance(
|
602
|
+
code_sandbox_runtime=self.code_interpreter,
|
603
|
+
)
|
892
604
|
)
|
893
|
-
|
605
|
+
with code_interpreter:
|
606
|
+
plan_context = self.planner.generate_plan( # type: ignore
|
607
|
+
chat,
|
608
|
+
test_multi_plan=test_multi_plan,
|
609
|
+
custom_tool_names=custom_tool_names,
|
610
|
+
code_interpreter=code_interpreter,
|
611
|
+
)
|
894
612
|
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
613
|
+
code_and_context = self.generate_code_from_plan(
|
614
|
+
chat,
|
615
|
+
plan_context,
|
616
|
+
code_interpreter=code_interpreter,
|
617
|
+
)
|
618
|
+
return code_and_context
|
619
|
+
|
620
|
+
def chat(self, chat: List[Message]) -> List[Message]:
|
621
|
+
chat = copy.deepcopy(chat)
|
622
|
+
code = self.generate_code(chat)
|
623
|
+
chat.append({"role": "agent", "content": code["code"]})
|
624
|
+
return chat
|
625
|
+
|
626
|
+
def log_progress(self, data: Dict[str, Any]) -> None:
|
627
|
+
if self.report_progress_callback is not None:
|
628
|
+
self.report_progress_callback(data)
|
903
629
|
|
904
630
|
|
905
631
|
class OpenAIVisionAgentCoder(VisionAgentCoder):
|
@@ -907,17 +633,18 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
|
|
907
633
|
|
908
634
|
def __init__(
|
909
635
|
self,
|
910
|
-
planner: Optional[
|
636
|
+
planner: Optional[Agent] = None,
|
911
637
|
coder: Optional[LMM] = None,
|
912
638
|
tester: Optional[LMM] = None,
|
913
639
|
debugger: Optional[LMM] = None,
|
914
|
-
tool_recommender: Optional[Sim] = None,
|
915
640
|
verbosity: int = 0,
|
916
641
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
917
|
-
|
642
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
918
643
|
) -> None:
|
919
644
|
self.planner = (
|
920
|
-
|
645
|
+
OpenAIVisionAgentPlanner(verbosity=verbosity)
|
646
|
+
if planner is None
|
647
|
+
else planner
|
921
648
|
)
|
922
649
|
self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
|
923
650
|
self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
|
@@ -926,13 +653,8 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
|
|
926
653
|
if self.verbosity > 0:
|
927
654
|
_LOGGER.setLevel(logging.INFO)
|
928
655
|
|
929
|
-
self.tool_recommender = (
|
930
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
931
|
-
if tool_recommender is None
|
932
|
-
else tool_recommender
|
933
|
-
)
|
934
656
|
self.report_progress_callback = report_progress_callback
|
935
|
-
self.
|
657
|
+
self.code_interpreter = code_interpreter
|
936
658
|
|
937
659
|
|
938
660
|
class AnthropicVisionAgentCoder(VisionAgentCoder):
|
@@ -940,17 +662,20 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
|
|
940
662
|
|
941
663
|
def __init__(
|
942
664
|
self,
|
943
|
-
planner: Optional[
|
665
|
+
planner: Optional[Agent] = None,
|
944
666
|
coder: Optional[LMM] = None,
|
945
667
|
tester: Optional[LMM] = None,
|
946
668
|
debugger: Optional[LMM] = None,
|
947
|
-
tool_recommender: Optional[Sim] = None,
|
948
669
|
verbosity: int = 0,
|
949
670
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
950
|
-
|
671
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
951
672
|
) -> None:
|
952
673
|
# NOTE: Claude doesn't have an official JSON mode
|
953
|
-
self.planner =
|
674
|
+
self.planner = (
|
675
|
+
AnthropicVisionAgentPlanner(verbosity=verbosity)
|
676
|
+
if planner is None
|
677
|
+
else planner
|
678
|
+
)
|
954
679
|
self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
|
955
680
|
self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
|
956
681
|
self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
|
@@ -958,15 +683,8 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
|
|
958
683
|
if self.verbosity > 0:
|
959
684
|
_LOGGER.setLevel(logging.INFO)
|
960
685
|
|
961
|
-
# Anthropic does not offer any embedding models and instead recomends Voyage,
|
962
|
-
# we're using OpenAI's embedder for now.
|
963
|
-
self.tool_recommender = (
|
964
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
965
|
-
if tool_recommender is None
|
966
|
-
else tool_recommender
|
967
|
-
)
|
968
686
|
self.report_progress_callback = report_progress_callback
|
969
|
-
self.
|
687
|
+
self.code_interpreter = code_interpreter
|
970
688
|
|
971
689
|
|
972
690
|
class OllamaVisionAgentCoder(VisionAgentCoder):
|
@@ -988,17 +706,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
|
|
988
706
|
|
989
707
|
def __init__(
|
990
708
|
self,
|
991
|
-
planner: Optional[
|
709
|
+
planner: Optional[Agent] = None,
|
992
710
|
coder: Optional[LMM] = None,
|
993
711
|
tester: Optional[LMM] = None,
|
994
712
|
debugger: Optional[LMM] = None,
|
995
|
-
tool_recommender: Optional[Sim] = None,
|
996
713
|
verbosity: int = 0,
|
997
714
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
715
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
998
716
|
) -> None:
|
999
717
|
super().__init__(
|
1000
718
|
planner=(
|
1001
|
-
|
719
|
+
OllamaVisionAgentPlanner(verbosity=verbosity)
|
1002
720
|
if planner is None
|
1003
721
|
else planner
|
1004
722
|
),
|
@@ -1017,13 +735,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
|
|
1017
735
|
if debugger is None
|
1018
736
|
else debugger
|
1019
737
|
),
|
1020
|
-
tool_recommender=(
|
1021
|
-
OllamaSim(T.TOOLS_DF, sim_key="desc")
|
1022
|
-
if tool_recommender is None
|
1023
|
-
else tool_recommender
|
1024
|
-
),
|
1025
738
|
verbosity=verbosity,
|
1026
739
|
report_progress_callback=report_progress_callback,
|
740
|
+
code_interpreter=code_interpreter,
|
1027
741
|
)
|
1028
742
|
|
1029
743
|
|
@@ -1043,22 +757,22 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1043
757
|
|
1044
758
|
def __init__(
|
1045
759
|
self,
|
1046
|
-
planner: Optional[
|
760
|
+
planner: Optional[Agent] = None,
|
1047
761
|
coder: Optional[LMM] = None,
|
1048
762
|
tester: Optional[LMM] = None,
|
1049
763
|
debugger: Optional[LMM] = None,
|
1050
|
-
tool_recommender: Optional[Sim] = None,
|
1051
764
|
verbosity: int = 0,
|
1052
765
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
766
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
1053
767
|
) -> None:
|
1054
768
|
"""Initialize the Vision Agent Coder.
|
1055
769
|
|
1056
770
|
Parameters:
|
1057
|
-
planner (Optional[
|
771
|
+
planner (Optional[Agent]): The planner model to use. Defaults to
|
772
|
+
AzureVisionAgentPlanner.
|
1058
773
|
coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
|
1059
774
|
tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
|
1060
775
|
debugger (Optional[LMM]): The debugger model to
|
1061
|
-
tool_recommender (Optional[Sim]): The tool recommender model to use.
|
1062
776
|
verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
|
1063
777
|
highest verbosity level which will output all intermediate debugging
|
1064
778
|
code.
|
@@ -1069,7 +783,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1069
783
|
"""
|
1070
784
|
super().__init__(
|
1071
785
|
planner=(
|
1072
|
-
|
786
|
+
AzureVisionAgentPlanner(verbosity=verbosity)
|
1073
787
|
if planner is None
|
1074
788
|
else planner
|
1075
789
|
),
|
@@ -1078,11 +792,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1078
792
|
debugger=(
|
1079
793
|
AzureOpenAILMM(temperature=0.0) if debugger is None else debugger
|
1080
794
|
),
|
1081
|
-
tool_recommender=(
|
1082
|
-
AzureSim(T.TOOLS_DF, sim_key="desc")
|
1083
|
-
if tool_recommender is None
|
1084
|
-
else tool_recommender
|
1085
|
-
),
|
1086
795
|
verbosity=verbosity,
|
1087
796
|
report_progress_callback=report_progress_callback,
|
797
|
+
code_interpreter=code_interpreter,
|
1088
798
|
)
|