vision-agent 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +76 -2
- vision_agent/agent/vision_agent.py +49 -17
- vision_agent/agent/vision_agent_coder.py +163 -489
- vision_agent/agent/vision_agent_coder_prompts.py +0 -203
- vision_agent/agent/vision_agent_planner.py +553 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +84 -3
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/METADATA +7 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/RECORD +13 -11
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.162.dist-info}/WHEEL +0 -0
@@ -2,32 +2,33 @@ import copy
|
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import sys
|
5
|
-
from json import JSONDecodeError
|
6
5
|
from pathlib import Path
|
7
|
-
from typing import Any, Callable, Dict, List, Optional, Sequence,
|
6
|
+
from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
|
8
7
|
|
9
|
-
from rich.console import Console
|
10
|
-
from rich.style import Style
|
11
|
-
from rich.syntax import Syntax
|
12
8
|
from tabulate import tabulate
|
13
9
|
|
14
10
|
import vision_agent.tools as T
|
15
|
-
from vision_agent.agent import Agent
|
11
|
+
from vision_agent.agent.agent import Agent
|
16
12
|
from vision_agent.agent.agent_utils import (
|
13
|
+
DefaultImports,
|
17
14
|
extract_code,
|
18
15
|
extract_json,
|
16
|
+
format_memory,
|
17
|
+
print_code,
|
19
18
|
remove_installs_from_code,
|
20
19
|
)
|
21
20
|
from vision_agent.agent.vision_agent_coder_prompts import (
|
22
21
|
CODE,
|
23
22
|
FIX_BUG,
|
24
23
|
FULL_TASK,
|
25
|
-
PICK_PLAN,
|
26
|
-
PLAN,
|
27
|
-
PREVIOUS_FAILED,
|
28
24
|
SIMPLE_TEST,
|
29
|
-
|
30
|
-
|
25
|
+
)
|
26
|
+
from vision_agent.agent.vision_agent_planner import (
|
27
|
+
AnthropicVisionAgentPlanner,
|
28
|
+
AzureVisionAgentPlanner,
|
29
|
+
OllamaVisionAgentPlanner,
|
30
|
+
OpenAIVisionAgentPlanner,
|
31
|
+
PlanContext,
|
31
32
|
)
|
32
33
|
from vision_agent.lmm import (
|
33
34
|
LMM,
|
@@ -40,241 +41,11 @@ from vision_agent.lmm import (
|
|
40
41
|
from vision_agent.tools.meta_tools import get_diff
|
41
42
|
from vision_agent.utils import CodeInterpreterFactory, Execution
|
42
43
|
from vision_agent.utils.execute import CodeInterpreter
|
43
|
-
from vision_agent.utils.image_utils import b64_to_pil
|
44
|
-
from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
|
45
|
-
from vision_agent.utils.video import play_video
|
46
44
|
|
47
45
|
logging.basicConfig(stream=sys.stdout)
|
48
46
|
WORKSPACE = Path(os.getenv("WORKSPACE", ""))
|
49
47
|
_LOGGER = logging.getLogger(__name__)
|
50
48
|
_MAX_TABULATE_COL_WIDTH = 80
|
51
|
-
_CONSOLE = Console()
|
52
|
-
|
53
|
-
|
54
|
-
class DefaultImports:
|
55
|
-
"""Container for default imports used in the code execution."""
|
56
|
-
|
57
|
-
common_imports = [
|
58
|
-
"import os",
|
59
|
-
"import numpy as np",
|
60
|
-
"from vision_agent.tools import *",
|
61
|
-
"from typing import *",
|
62
|
-
"from pillow_heif import register_heif_opener",
|
63
|
-
"register_heif_opener()",
|
64
|
-
]
|
65
|
-
|
66
|
-
@staticmethod
|
67
|
-
def to_code_string() -> str:
|
68
|
-
return "\n".join(DefaultImports.common_imports + T.__new_tools__)
|
69
|
-
|
70
|
-
@staticmethod
|
71
|
-
def prepend_imports(code: str) -> str:
|
72
|
-
"""Run this method to prepend the default imports to the code.
|
73
|
-
NOTE: be sure to run this method after the custom tools have been registered.
|
74
|
-
"""
|
75
|
-
return DefaultImports.to_code_string() + "\n\n" + code
|
76
|
-
|
77
|
-
|
78
|
-
def format_memory(memory: List[Dict[str, str]]) -> str:
|
79
|
-
output_str = ""
|
80
|
-
for i, m in enumerate(memory):
|
81
|
-
output_str += f"### Feedback {i}:\n"
|
82
|
-
output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
|
83
|
-
output_str += f"Feedback {i}: {m['feedback']}\n\n"
|
84
|
-
if "edits" in m:
|
85
|
-
output_str += f"Edits {i}:\n{m['edits']}\n"
|
86
|
-
output_str += "\n"
|
87
|
-
|
88
|
-
return output_str
|
89
|
-
|
90
|
-
|
91
|
-
def format_plans(plans: Dict[str, Any]) -> str:
|
92
|
-
plan_str = ""
|
93
|
-
for k, v in plans.items():
|
94
|
-
plan_str += "\n" + f"{k}: {v['thoughts']}\n"
|
95
|
-
plan_str += " -" + "\n -".join([e for e in v["instructions"]])
|
96
|
-
|
97
|
-
return plan_str
|
98
|
-
|
99
|
-
|
100
|
-
def write_plans(
|
101
|
-
chat: List[Message],
|
102
|
-
tool_desc: str,
|
103
|
-
working_memory: str,
|
104
|
-
model: LMM,
|
105
|
-
) -> Dict[str, Any]:
|
106
|
-
chat = copy.deepcopy(chat)
|
107
|
-
if chat[-1]["role"] != "user":
|
108
|
-
raise ValueError("Last chat message must be from the user.")
|
109
|
-
|
110
|
-
user_request = chat[-1]["content"]
|
111
|
-
context = USER_REQ.format(user_request=user_request)
|
112
|
-
prompt = PLAN.format(
|
113
|
-
context=context,
|
114
|
-
tool_desc=tool_desc,
|
115
|
-
feedback=working_memory,
|
116
|
-
)
|
117
|
-
chat[-1]["content"] = prompt
|
118
|
-
return extract_json(model(chat, stream=False)) # type: ignore
|
119
|
-
|
120
|
-
|
121
|
-
def pick_plan(
|
122
|
-
chat: List[Message],
|
123
|
-
plans: Dict[str, Any],
|
124
|
-
tool_info: str,
|
125
|
-
model: LMM,
|
126
|
-
code_interpreter: CodeInterpreter,
|
127
|
-
media: List[str],
|
128
|
-
log_progress: Callable[[Dict[str, Any]], None],
|
129
|
-
verbosity: int = 0,
|
130
|
-
max_retries: int = 3,
|
131
|
-
) -> Tuple[Dict[str, str], str]:
|
132
|
-
log_progress(
|
133
|
-
{
|
134
|
-
"type": "log",
|
135
|
-
"log_content": "Generating code to pick the best plan",
|
136
|
-
"status": "started",
|
137
|
-
}
|
138
|
-
)
|
139
|
-
|
140
|
-
chat = copy.deepcopy(chat)
|
141
|
-
if chat[-1]["role"] != "user":
|
142
|
-
raise ValueError("Last chat message must be from the user.")
|
143
|
-
|
144
|
-
plan_str = format_plans(plans)
|
145
|
-
prompt = TEST_PLANS.format(
|
146
|
-
docstring=tool_info, plans=plan_str, previous_attempts="", media=media
|
147
|
-
)
|
148
|
-
|
149
|
-
code = extract_code(model(prompt, stream=False)) # type: ignore
|
150
|
-
log_progress(
|
151
|
-
{
|
152
|
-
"type": "log",
|
153
|
-
"log_content": "Executing code to test plans",
|
154
|
-
"code": DefaultImports.prepend_imports(code),
|
155
|
-
"status": "running",
|
156
|
-
}
|
157
|
-
)
|
158
|
-
tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
|
159
|
-
# Because of the way we trace function calls the trace information ends up in the
|
160
|
-
# results. We don't want to show this info to the LLM so we don't include it in the
|
161
|
-
# tool_output_str.
|
162
|
-
tool_output_str = tool_output.text(include_results=False).strip()
|
163
|
-
|
164
|
-
if verbosity == 2:
|
165
|
-
_print_code("Initial code and tests:", code)
|
166
|
-
_LOGGER.info(f"Initial code execution result:\n{tool_output_str}")
|
167
|
-
|
168
|
-
log_progress(
|
169
|
-
{
|
170
|
-
"type": "log",
|
171
|
-
"log_content": (
|
172
|
-
"Code execution succeeded"
|
173
|
-
if tool_output.success
|
174
|
-
else "Code execution failed"
|
175
|
-
),
|
176
|
-
"code": DefaultImports.prepend_imports(code),
|
177
|
-
# "payload": tool_output.to_json(),
|
178
|
-
"status": "completed" if tool_output.success else "failed",
|
179
|
-
}
|
180
|
-
)
|
181
|
-
|
182
|
-
# retry if the tool output is empty or code fails
|
183
|
-
count = 0
|
184
|
-
while (
|
185
|
-
not tool_output.success
|
186
|
-
or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
|
187
|
-
) and count < max_retries:
|
188
|
-
prompt = TEST_PLANS.format(
|
189
|
-
docstring=tool_info,
|
190
|
-
plans=plan_str,
|
191
|
-
previous_attempts=PREVIOUS_FAILED.format(
|
192
|
-
code=code, error="\n".join(tool_output_str.splitlines()[-50:])
|
193
|
-
),
|
194
|
-
media=media,
|
195
|
-
)
|
196
|
-
log_progress(
|
197
|
-
{
|
198
|
-
"type": "log",
|
199
|
-
"log_content": "Retrying code to test plans",
|
200
|
-
"status": "running",
|
201
|
-
"code": DefaultImports.prepend_imports(code),
|
202
|
-
}
|
203
|
-
)
|
204
|
-
code = extract_code(model(prompt, stream=False)) # type: ignore
|
205
|
-
tool_output = code_interpreter.exec_isolation(
|
206
|
-
DefaultImports.prepend_imports(code)
|
207
|
-
)
|
208
|
-
log_progress(
|
209
|
-
{
|
210
|
-
"type": "log",
|
211
|
-
"log_content": (
|
212
|
-
"Code execution succeeded"
|
213
|
-
if tool_output.success
|
214
|
-
else "Code execution failed"
|
215
|
-
),
|
216
|
-
"code": DefaultImports.prepend_imports(code),
|
217
|
-
# "payload": tool_output.to_json(),
|
218
|
-
"status": "completed" if tool_output.success else "failed",
|
219
|
-
}
|
220
|
-
)
|
221
|
-
tool_output_str = tool_output.text(include_results=False).strip()
|
222
|
-
|
223
|
-
if verbosity == 2:
|
224
|
-
_print_code("Code and test after attempted fix:", code)
|
225
|
-
_LOGGER.info(f"Code execution result after attempt {count + 1}")
|
226
|
-
_LOGGER.info(f"{tool_output_str}")
|
227
|
-
|
228
|
-
count += 1
|
229
|
-
|
230
|
-
if verbosity >= 1:
|
231
|
-
_print_code("Final code:", code)
|
232
|
-
|
233
|
-
user_req = chat[-1]["content"]
|
234
|
-
context = USER_REQ.format(user_request=user_req)
|
235
|
-
# because the tool picker model gets the image as well, we have to be careful with
|
236
|
-
# how much text we send it, so we truncate the tool output to 20,000 characters
|
237
|
-
prompt = PICK_PLAN.format(
|
238
|
-
context=context,
|
239
|
-
plans=format_plans(plans),
|
240
|
-
tool_output=tool_output_str[:20_000],
|
241
|
-
)
|
242
|
-
chat[-1]["content"] = prompt
|
243
|
-
|
244
|
-
count = 0
|
245
|
-
plan_thoughts = None
|
246
|
-
while plan_thoughts is None and count < max_retries:
|
247
|
-
try:
|
248
|
-
plan_thoughts = extract_json(model(chat, stream=False)) # type: ignore
|
249
|
-
except JSONDecodeError as e:
|
250
|
-
_LOGGER.exception(
|
251
|
-
f"Error while extracting JSON during picking best plan {str(e)}"
|
252
|
-
)
|
253
|
-
pass
|
254
|
-
count += 1
|
255
|
-
|
256
|
-
if (
|
257
|
-
plan_thoughts is None
|
258
|
-
or "best_plan" not in plan_thoughts
|
259
|
-
or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
|
260
|
-
):
|
261
|
-
_LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}")
|
262
|
-
plan_thoughts = {"best_plan": list(plans.keys())[0]}
|
263
|
-
|
264
|
-
if "thoughts" not in plan_thoughts:
|
265
|
-
plan_thoughts["thoughts"] = ""
|
266
|
-
|
267
|
-
if verbosity >= 1:
|
268
|
-
_LOGGER.info(f"Best plan:\n{plan_thoughts}")
|
269
|
-
log_progress(
|
270
|
-
{
|
271
|
-
"type": "log",
|
272
|
-
"log_content": "Picked best plan",
|
273
|
-
"status": "completed",
|
274
|
-
"payload": plans[plan_thoughts["best_plan"]],
|
275
|
-
}
|
276
|
-
)
|
277
|
-
return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str
|
278
49
|
|
279
50
|
|
280
51
|
def write_code(
|
@@ -393,7 +164,7 @@ def write_and_test_code(
|
|
393
164
|
}
|
394
165
|
)
|
395
166
|
if verbosity == 2:
|
396
|
-
|
167
|
+
print_code("Initial code and tests:", code, test)
|
397
168
|
_LOGGER.info(
|
398
169
|
f"Initial code execution result:\n{result.text(include_logs=True)}"
|
399
170
|
)
|
@@ -418,7 +189,7 @@ def write_and_test_code(
|
|
418
189
|
count += 1
|
419
190
|
|
420
191
|
if verbosity >= 1:
|
421
|
-
|
192
|
+
print_code("Final code and tests:", code, test)
|
422
193
|
|
423
194
|
return {
|
424
195
|
"code": code,
|
@@ -537,7 +308,7 @@ def debug_code(
|
|
537
308
|
}
|
538
309
|
)
|
539
310
|
if verbosity == 2:
|
540
|
-
|
311
|
+
print_code("Code and test after attempted fix:", code, test)
|
541
312
|
_LOGGER.info(
|
542
313
|
f"Reflection: {fixed_code_and_test['reflections']}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
|
543
314
|
)
|
@@ -545,62 +316,6 @@ def debug_code(
|
|
545
316
|
return code, test, result
|
546
317
|
|
547
318
|
|
548
|
-
def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
|
549
|
-
_CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
|
550
|
-
_CONSOLE.print("=" * 30 + " Code " + "=" * 30)
|
551
|
-
_CONSOLE.print(
|
552
|
-
Syntax(
|
553
|
-
DefaultImports.prepend_imports(code),
|
554
|
-
"python",
|
555
|
-
theme="gruvbox-dark",
|
556
|
-
line_numbers=True,
|
557
|
-
)
|
558
|
-
)
|
559
|
-
if test:
|
560
|
-
_CONSOLE.print("=" * 30 + " Test " + "=" * 30)
|
561
|
-
_CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
|
562
|
-
|
563
|
-
|
564
|
-
def retrieve_tools(
|
565
|
-
plans: Dict[str, Dict[str, Any]],
|
566
|
-
tool_recommender: Sim,
|
567
|
-
log_progress: Callable[[Dict[str, Any]], None],
|
568
|
-
verbosity: int = 0,
|
569
|
-
) -> Dict[str, str]:
|
570
|
-
log_progress(
|
571
|
-
{
|
572
|
-
"type": "log",
|
573
|
-
"log_content": ("Retrieving tools for each plan"),
|
574
|
-
"status": "started",
|
575
|
-
}
|
576
|
-
)
|
577
|
-
tool_info = []
|
578
|
-
tool_desc = []
|
579
|
-
tool_lists: Dict[str, List[Dict[str, str]]] = {}
|
580
|
-
for k, plan in plans.items():
|
581
|
-
tool_lists[k] = []
|
582
|
-
for task in plan["instructions"]:
|
583
|
-
tools = tool_recommender.top_k(task, k=2, thresh=0.3)
|
584
|
-
tool_info.extend([e["doc"] for e in tools])
|
585
|
-
tool_desc.extend([e["desc"] for e in tools])
|
586
|
-
tool_lists[k].extend(
|
587
|
-
{"description": e["desc"], "documentation": e["doc"]} for e in tools
|
588
|
-
)
|
589
|
-
|
590
|
-
if verbosity == 2:
|
591
|
-
tool_desc_str = "\n".join(set(tool_desc))
|
592
|
-
_LOGGER.info(f"Tools Description:\n{tool_desc_str}")
|
593
|
-
|
594
|
-
tool_lists_unique = {}
|
595
|
-
for k in tool_lists:
|
596
|
-
tool_lists_unique[k] = "\n\n".join(
|
597
|
-
set(e["documentation"] for e in tool_lists[k])
|
598
|
-
)
|
599
|
-
all_tools = "\n\n".join(set(tool_info))
|
600
|
-
tool_lists_unique["all"] = all_tools
|
601
|
-
return tool_lists_unique
|
602
|
-
|
603
|
-
|
604
319
|
class VisionAgentCoder(Agent):
|
605
320
|
"""Vision Agent Coder is an agentic framework that can output code based on a user
|
606
321
|
request. It can plan tasks, retrieve relevant tools, write code, write tests and
|
@@ -616,23 +331,22 @@ class VisionAgentCoder(Agent):
|
|
616
331
|
|
617
332
|
def __init__(
|
618
333
|
self,
|
619
|
-
planner: Optional[
|
334
|
+
planner: Optional[Agent] = None,
|
620
335
|
coder: Optional[LMM] = None,
|
621
336
|
tester: Optional[LMM] = None,
|
622
337
|
debugger: Optional[LMM] = None,
|
623
|
-
tool_recommender: Optional[Sim] = None,
|
624
338
|
verbosity: int = 0,
|
625
339
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
626
|
-
|
340
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
627
341
|
) -> None:
|
628
342
|
"""Initialize the Vision Agent Coder.
|
629
343
|
|
630
344
|
Parameters:
|
631
|
-
planner (Optional[
|
345
|
+
planner (Optional[Agent]): The planner model to use. Defaults to
|
346
|
+
AnthropicVisionAgentPlanner.
|
632
347
|
coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
|
633
348
|
tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
|
634
349
|
debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
|
635
|
-
tool_recommender (Optional[Sim]): The tool recommender model to use.
|
636
350
|
verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
|
637
351
|
highest verbosity level which will output all intermediate debugging
|
638
352
|
code.
|
@@ -641,14 +355,17 @@ class VisionAgentCoder(Agent):
|
|
641
355
|
in a web application where multiple VisionAgentCoder instances are
|
642
356
|
running in parallel. This callback ensures that the progress is not
|
643
357
|
mixed up.
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
If it's also None, the local python runtime environment will be used.
|
358
|
+
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
|
359
|
+
it can be one of: None, "local" or "e2b". If None, it will read from
|
360
|
+
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
|
361
|
+
object is provided it will use that.
|
649
362
|
"""
|
650
363
|
|
651
|
-
self.planner =
|
364
|
+
self.planner = (
|
365
|
+
AnthropicVisionAgentPlanner(verbosity=verbosity)
|
366
|
+
if planner is None
|
367
|
+
else planner
|
368
|
+
)
|
652
369
|
self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
|
653
370
|
self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
|
654
371
|
self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
|
@@ -656,21 +373,15 @@ class VisionAgentCoder(Agent):
|
|
656
373
|
if self.verbosity > 0:
|
657
374
|
_LOGGER.setLevel(logging.INFO)
|
658
375
|
|
659
|
-
self.tool_recommender = (
|
660
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
661
|
-
if tool_recommender is None
|
662
|
-
else tool_recommender
|
663
|
-
)
|
664
376
|
self.report_progress_callback = report_progress_callback
|
665
|
-
self.
|
377
|
+
self.code_interpreter = code_interpreter
|
666
378
|
|
667
379
|
def __call__(
|
668
380
|
self,
|
669
381
|
input: Union[str, List[Message]],
|
670
382
|
media: Optional[Union[str, Path]] = None,
|
671
383
|
) -> str:
|
672
|
-
"""
|
673
|
-
task.
|
384
|
+
"""Generate code based on a user request.
|
674
385
|
|
675
386
|
Parameters:
|
676
387
|
input (Union[str, List[Message]]): A conversation in the format of
|
@@ -686,46 +397,58 @@ class VisionAgentCoder(Agent):
|
|
686
397
|
input = [{"role": "user", "content": input}]
|
687
398
|
if media is not None:
|
688
399
|
input[0]["media"] = [media]
|
689
|
-
|
690
|
-
|
691
|
-
return results["code"] # type: ignore
|
400
|
+
code_and_context = self.generate_code(input)
|
401
|
+
return code_and_context["code"] # type: ignore
|
692
402
|
|
693
|
-
def
|
403
|
+
def generate_code_from_plan(
|
694
404
|
self,
|
695
405
|
chat: List[Message],
|
696
|
-
|
697
|
-
|
698
|
-
custom_tool_names: Optional[List[str]] = None,
|
406
|
+
plan_context: PlanContext,
|
407
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
699
408
|
) -> Dict[str, Any]:
|
700
|
-
"""
|
701
|
-
|
409
|
+
"""Generates code and other intermediate outputs from a chat input and a plan.
|
410
|
+
The plan includes:
|
411
|
+
- plans: The plans generated by the planner.
|
412
|
+
- best_plan: The best plan selected by the planner.
|
413
|
+
- plan_thoughts: The thoughts of the planner, including any modifications
|
414
|
+
to the plan.
|
415
|
+
- tool_doc: The tool documentation for the best plan.
|
416
|
+
- tool_output: The tool output from the tools used by the best plan.
|
702
417
|
|
703
418
|
Parameters:
|
704
|
-
chat (List[Message]): A conversation
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
with the first plan.
|
712
|
-
display_visualization (bool): If True, it opens a new window locally to
|
713
|
-
show the image(s) created by visualization code (if there is any).
|
714
|
-
custom_tool_names (List[str]): A list of custom tools for the agent to pick
|
715
|
-
and use. If not provided, default to full tool set from vision_agent.tools.
|
419
|
+
chat (List[Message]): A conversation in the format of
|
420
|
+
[{"role": "user", "content": "describe your task here..."}].
|
421
|
+
plan_context (PlanContext): The context of the plan, including the plans,
|
422
|
+
best_plan, plan_thoughts, tool_doc, and tool_output.
|
423
|
+
test_multi_plan (bool): Whether to test multiple plans or just the best plan.
|
424
|
+
custom_tool_names (Optional[List[str]]): A list of custom tool names to use
|
425
|
+
for the planner.
|
716
426
|
|
717
427
|
Returns:
|
718
|
-
Dict[str, Any]: A dictionary containing the code
|
719
|
-
and
|
428
|
+
Dict[str, Any]: A dictionary containing the code output by the
|
429
|
+
VisionAgentCoder and other intermediate outputs, including:
|
430
|
+
- status (str): Whether or not the agent completed or failed generating
|
431
|
+
the code.
|
432
|
+
- code (str): The code output by the VisionAgentCoder.
|
433
|
+
- test (str): The test output by the VisionAgentCoder.
|
434
|
+
- test_result (Execution): The result of the test execution.
|
435
|
+
- plans (Dict[str, Any]): The plans generated by the planner.
|
436
|
+
- plan_thoughts (str): The thoughts of the planner.
|
437
|
+
- working_memory (List[Dict[str, str]]): The working memory of the agent.
|
720
438
|
"""
|
721
|
-
|
722
439
|
if not chat:
|
723
440
|
raise ValueError("Chat cannot be empty.")
|
724
441
|
|
725
442
|
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
726
|
-
|
727
|
-
|
728
|
-
|
443
|
+
code_interpreter = (
|
444
|
+
self.code_interpreter
|
445
|
+
if self.code_interpreter is not None
|
446
|
+
and not isinstance(self.code_interpreter, str)
|
447
|
+
else CodeInterpreterFactory.new_instance(
|
448
|
+
code_sandbox_runtime=self.code_interpreter,
|
449
|
+
)
|
450
|
+
)
|
451
|
+
with code_interpreter:
|
729
452
|
chat = copy.deepcopy(chat)
|
730
453
|
media_list = []
|
731
454
|
for chat_i in chat:
|
@@ -759,74 +482,22 @@ class VisionAgentCoder(Agent):
|
|
759
482
|
code = ""
|
760
483
|
test = ""
|
761
484
|
working_memory: List[Dict[str, str]] = []
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
plans = self._create_plans(
|
767
|
-
int_chat, custom_tool_names, working_memory, self.planner
|
768
|
-
)
|
769
|
-
|
770
|
-
if test_multi_plan:
|
771
|
-
self._log_plans(plans, self.verbosity)
|
772
|
-
|
773
|
-
tool_infos = retrieve_tools(
|
774
|
-
plans,
|
775
|
-
self.tool_recommender,
|
776
|
-
self.log_progress,
|
777
|
-
self.verbosity,
|
778
|
-
)
|
779
|
-
|
780
|
-
if test_multi_plan:
|
781
|
-
plan_thoughts, tool_output_str = pick_plan(
|
782
|
-
int_chat,
|
783
|
-
plans,
|
784
|
-
tool_infos["all"],
|
785
|
-
self.coder,
|
786
|
-
code_interpreter,
|
787
|
-
media_list,
|
788
|
-
self.log_progress,
|
789
|
-
verbosity=self.verbosity,
|
790
|
-
)
|
791
|
-
best_plan = plan_thoughts["best_plan"]
|
792
|
-
plan_thoughts_str = plan_thoughts["thoughts"]
|
793
|
-
else:
|
794
|
-
best_plan = list(plans.keys())[0]
|
795
|
-
tool_output_str = ""
|
796
|
-
plan_thoughts_str = ""
|
797
|
-
|
798
|
-
if best_plan in plans and best_plan in tool_infos:
|
799
|
-
plan_i = plans[best_plan]
|
800
|
-
tool_info = tool_infos[best_plan]
|
801
|
-
else:
|
802
|
-
if self.verbosity >= 1:
|
803
|
-
_LOGGER.warning(
|
804
|
-
f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
|
805
|
-
)
|
806
|
-
k = list(plans.keys())[0]
|
807
|
-
plan_i = plans[k]
|
808
|
-
tool_info = tool_infos[k]
|
809
|
-
|
810
|
-
self.log_progress(
|
811
|
-
{
|
812
|
-
"type": "log",
|
813
|
-
"log_content": "Creating plans",
|
814
|
-
"status": "completed",
|
815
|
-
"payload": tool_info,
|
816
|
-
}
|
817
|
-
)
|
485
|
+
plan = plan_context.plans[plan_context.best_plan]
|
486
|
+
tool_doc = plan_context.tool_doc
|
487
|
+
tool_output_str = plan_context.tool_output
|
488
|
+
plan_thoughts_str = str(plan_context.plan_thoughts)
|
818
489
|
|
819
490
|
if self.verbosity >= 1:
|
820
|
-
|
491
|
+
plan_fixed = [{"instructions": e} for e in plan["instructions"]]
|
821
492
|
_LOGGER.info(
|
822
|
-
f"Picked best plan:\n{tabulate(tabular_data=
|
493
|
+
f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
823
494
|
)
|
824
495
|
|
825
496
|
results = write_and_test_code(
|
826
497
|
chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
|
827
|
-
plan=f"\n{
|
828
|
-
+ "\n-".join([e for e in
|
829
|
-
tool_info=
|
498
|
+
plan=f"\n{plan['thoughts']}\n-"
|
499
|
+
+ "\n-".join([e for e in plan["instructions"]]),
|
500
|
+
tool_info=tool_doc,
|
830
501
|
tool_output=tool_output_str,
|
831
502
|
plan_thoughts=plan_thoughts_str,
|
832
503
|
tool_utils=T.UTILITIES_DOCSTRING,
|
@@ -842,64 +513,83 @@ class VisionAgentCoder(Agent):
|
|
842
513
|
success = cast(bool, results["success"])
|
843
514
|
code = remove_installs_from_code(cast(str, results["code"]))
|
844
515
|
test = remove_installs_from_code(cast(str, results["test"]))
|
845
|
-
working_memory.extend(results["working_memory"])
|
846
|
-
plan.append({"code": code, "test": test, "plan": plan_i})
|
516
|
+
working_memory.extend(results["working_memory"])
|
847
517
|
|
848
518
|
execution_result = cast(Execution, results["test_result"])
|
849
519
|
|
850
|
-
if display_visualization:
|
851
|
-
for res in execution_result.results:
|
852
|
-
if res.png:
|
853
|
-
b64_to_pil(res.png).show()
|
854
|
-
if res.mp4:
|
855
|
-
play_video(res.mp4)
|
856
|
-
|
857
520
|
return {
|
858
521
|
"status": "completed" if success else "failed",
|
859
522
|
"code": DefaultImports.prepend_imports(code),
|
860
523
|
"test": test,
|
861
524
|
"test_result": execution_result,
|
862
|
-
"plans": plans,
|
525
|
+
"plans": plan_context.plans,
|
863
526
|
"plan_thoughts": plan_thoughts_str,
|
864
527
|
"working_memory": working_memory,
|
865
528
|
}
|
866
529
|
|
867
|
-
def
|
868
|
-
if self.report_progress_callback is not None:
|
869
|
-
self.report_progress_callback(data)
|
870
|
-
|
871
|
-
def _create_plans(
|
530
|
+
def generate_code(
|
872
531
|
self,
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
planner: LMM,
|
532
|
+
chat: List[Message],
|
533
|
+
test_multi_plan: bool = True,
|
534
|
+
custom_tool_names: Optional[List[str]] = None,
|
877
535
|
) -> Dict[str, Any]:
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
"
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
536
|
+
"""Generates code and other intermediate outputs from a chat input.
|
537
|
+
|
538
|
+
Parameters:
|
539
|
+
chat (List[Message]): A conversation in the format of
|
540
|
+
[{"role": "user", "content": "describe your task here..."}].
|
541
|
+
test_multi_plan (bool): Whether to test multiple plans or just the best plan.
|
542
|
+
custom_tool_names (Optional[List[str]]): A list of custom tool names to use
|
543
|
+
for the planner.
|
544
|
+
|
545
|
+
Returns:
|
546
|
+
Dict[str, Any]: A dictionary containing the code output by the
|
547
|
+
VisionAgentCoder and other intermediate outputs, including:
|
548
|
+
- status (str): Whether or not the agent completed or failed generating
|
549
|
+
the code.
|
550
|
+
- code (str): The code output by the VisionAgentCoder.
|
551
|
+
- test (str): The test output by the VisionAgentCoder.
|
552
|
+
- test_result (Execution): The result of the test execution.
|
553
|
+
- plans (Dict[str, Any]): The plans generated by the planner.
|
554
|
+
- plan_thoughts (str): The thoughts of the planner.
|
555
|
+
- working_memory (List[Dict[str, str]]): The working memory of the agent.
|
556
|
+
"""
|
557
|
+
if not chat:
|
558
|
+
raise ValueError("Chat cannot be empty.")
|
559
|
+
|
560
|
+
# NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
|
561
|
+
code_interpreter = (
|
562
|
+
self.code_interpreter
|
563
|
+
if self.code_interpreter is not None
|
564
|
+
and not isinstance(self.code_interpreter, str)
|
565
|
+
else CodeInterpreterFactory.new_instance(
|
566
|
+
code_sandbox_runtime=self.code_interpreter,
|
567
|
+
)
|
892
568
|
)
|
893
|
-
|
569
|
+
with code_interpreter:
|
570
|
+
plan_context = self.planner.generate_plan( # type: ignore
|
571
|
+
chat,
|
572
|
+
test_multi_plan=test_multi_plan,
|
573
|
+
custom_tool_names=custom_tool_names,
|
574
|
+
code_interpreter=code_interpreter,
|
575
|
+
)
|
894
576
|
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
577
|
+
code_and_context = self.generate_code_from_plan(
|
578
|
+
chat,
|
579
|
+
plan_context,
|
580
|
+
code_interpreter=code_interpreter,
|
581
|
+
)
|
582
|
+
return code_and_context
|
583
|
+
|
584
|
+
def chat(self, chat: List[Message]) -> List[Message]:
|
585
|
+
chat = copy.deepcopy(chat)
|
586
|
+
code = self.generate_code(chat)
|
587
|
+
chat.append({"role": "agent", "content": code["code"]})
|
588
|
+
return chat
|
589
|
+
|
590
|
+
def log_progress(self, data: Dict[str, Any]) -> None:
|
591
|
+
if self.report_progress_callback is not None:
|
592
|
+
self.report_progress_callback(data)
|
903
593
|
|
904
594
|
|
905
595
|
class OpenAIVisionAgentCoder(VisionAgentCoder):
|
@@ -907,17 +597,18 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
|
|
907
597
|
|
908
598
|
def __init__(
|
909
599
|
self,
|
910
|
-
planner: Optional[
|
600
|
+
planner: Optional[Agent] = None,
|
911
601
|
coder: Optional[LMM] = None,
|
912
602
|
tester: Optional[LMM] = None,
|
913
603
|
debugger: Optional[LMM] = None,
|
914
|
-
tool_recommender: Optional[Sim] = None,
|
915
604
|
verbosity: int = 0,
|
916
605
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
917
|
-
|
606
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
918
607
|
) -> None:
|
919
608
|
self.planner = (
|
920
|
-
|
609
|
+
OpenAIVisionAgentPlanner(verbosity=verbosity)
|
610
|
+
if planner is None
|
611
|
+
else planner
|
921
612
|
)
|
922
613
|
self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
|
923
614
|
self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
|
@@ -926,13 +617,8 @@ class OpenAIVisionAgentCoder(VisionAgentCoder):
|
|
926
617
|
if self.verbosity > 0:
|
927
618
|
_LOGGER.setLevel(logging.INFO)
|
928
619
|
|
929
|
-
self.tool_recommender = (
|
930
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
931
|
-
if tool_recommender is None
|
932
|
-
else tool_recommender
|
933
|
-
)
|
934
620
|
self.report_progress_callback = report_progress_callback
|
935
|
-
self.
|
621
|
+
self.code_interpreter = code_interpreter
|
936
622
|
|
937
623
|
|
938
624
|
class AnthropicVisionAgentCoder(VisionAgentCoder):
|
@@ -940,17 +626,20 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
|
|
940
626
|
|
941
627
|
def __init__(
|
942
628
|
self,
|
943
|
-
planner: Optional[
|
629
|
+
planner: Optional[Agent] = None,
|
944
630
|
coder: Optional[LMM] = None,
|
945
631
|
tester: Optional[LMM] = None,
|
946
632
|
debugger: Optional[LMM] = None,
|
947
|
-
tool_recommender: Optional[Sim] = None,
|
948
633
|
verbosity: int = 0,
|
949
634
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
950
|
-
|
635
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
951
636
|
) -> None:
|
952
637
|
# NOTE: Claude doesn't have an official JSON mode
|
953
|
-
self.planner =
|
638
|
+
self.planner = (
|
639
|
+
AnthropicVisionAgentPlanner(verbosity=verbosity)
|
640
|
+
if planner is None
|
641
|
+
else planner
|
642
|
+
)
|
954
643
|
self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
|
955
644
|
self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
|
956
645
|
self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
|
@@ -958,15 +647,8 @@ class AnthropicVisionAgentCoder(VisionAgentCoder):
|
|
958
647
|
if self.verbosity > 0:
|
959
648
|
_LOGGER.setLevel(logging.INFO)
|
960
649
|
|
961
|
-
# Anthropic does not offer any embedding models and instead recomends Voyage,
|
962
|
-
# we're using OpenAI's embedder for now.
|
963
|
-
self.tool_recommender = (
|
964
|
-
Sim(T.TOOLS_DF, sim_key="desc")
|
965
|
-
if tool_recommender is None
|
966
|
-
else tool_recommender
|
967
|
-
)
|
968
650
|
self.report_progress_callback = report_progress_callback
|
969
|
-
self.
|
651
|
+
self.code_interpreter = code_interpreter
|
970
652
|
|
971
653
|
|
972
654
|
class OllamaVisionAgentCoder(VisionAgentCoder):
|
@@ -988,17 +670,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
|
|
988
670
|
|
989
671
|
def __init__(
|
990
672
|
self,
|
991
|
-
planner: Optional[
|
673
|
+
planner: Optional[Agent] = None,
|
992
674
|
coder: Optional[LMM] = None,
|
993
675
|
tester: Optional[LMM] = None,
|
994
676
|
debugger: Optional[LMM] = None,
|
995
|
-
tool_recommender: Optional[Sim] = None,
|
996
677
|
verbosity: int = 0,
|
997
678
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
679
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
998
680
|
) -> None:
|
999
681
|
super().__init__(
|
1000
682
|
planner=(
|
1001
|
-
|
683
|
+
OllamaVisionAgentPlanner(verbosity=verbosity)
|
1002
684
|
if planner is None
|
1003
685
|
else planner
|
1004
686
|
),
|
@@ -1017,13 +699,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
|
|
1017
699
|
if debugger is None
|
1018
700
|
else debugger
|
1019
701
|
),
|
1020
|
-
tool_recommender=(
|
1021
|
-
OllamaSim(T.TOOLS_DF, sim_key="desc")
|
1022
|
-
if tool_recommender is None
|
1023
|
-
else tool_recommender
|
1024
|
-
),
|
1025
702
|
verbosity=verbosity,
|
1026
703
|
report_progress_callback=report_progress_callback,
|
704
|
+
code_interpreter=code_interpreter,
|
1027
705
|
)
|
1028
706
|
|
1029
707
|
|
@@ -1043,22 +721,22 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1043
721
|
|
1044
722
|
def __init__(
|
1045
723
|
self,
|
1046
|
-
planner: Optional[
|
724
|
+
planner: Optional[Agent] = None,
|
1047
725
|
coder: Optional[LMM] = None,
|
1048
726
|
tester: Optional[LMM] = None,
|
1049
727
|
debugger: Optional[LMM] = None,
|
1050
|
-
tool_recommender: Optional[Sim] = None,
|
1051
728
|
verbosity: int = 0,
|
1052
729
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
730
|
+
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
1053
731
|
) -> None:
|
1054
732
|
"""Initialize the Vision Agent Coder.
|
1055
733
|
|
1056
734
|
Parameters:
|
1057
|
-
planner (Optional[
|
735
|
+
planner (Optional[Agent]): The planner model to use. Defaults to
|
736
|
+
AzureVisionAgentPlanner.
|
1058
737
|
coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
|
1059
738
|
tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
|
1060
739
|
debugger (Optional[LMM]): The debugger model to
|
1061
|
-
tool_recommender (Optional[Sim]): The tool recommender model to use.
|
1062
740
|
verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
|
1063
741
|
highest verbosity level which will output all intermediate debugging
|
1064
742
|
code.
|
@@ -1069,7 +747,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1069
747
|
"""
|
1070
748
|
super().__init__(
|
1071
749
|
planner=(
|
1072
|
-
|
750
|
+
AzureVisionAgentPlanner(verbosity=verbosity)
|
1073
751
|
if planner is None
|
1074
752
|
else planner
|
1075
753
|
),
|
@@ -1078,11 +756,7 @@ class AzureVisionAgentCoder(VisionAgentCoder):
|
|
1078
756
|
debugger=(
|
1079
757
|
AzureOpenAILMM(temperature=0.0) if debugger is None else debugger
|
1080
758
|
),
|
1081
|
-
tool_recommender=(
|
1082
|
-
AzureSim(T.TOOLS_DF, sim_key="desc")
|
1083
|
-
if tool_recommender is None
|
1084
|
-
else tool_recommender
|
1085
|
-
),
|
1086
759
|
verbosity=verbosity,
|
1087
760
|
report_progress_callback=report_progress_callback,
|
761
|
+
code_interpreter=code_interpreter,
|
1088
762
|
)
|