vision-agent 1.0.5__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +0 -16
- vision_agent/tools/__init__.py +0 -6
- vision_agent/tools/meta_tools.py +1 -492
- vision_agent/utils/tools.py +3 -1
- vision_agent-1.0.8.dist-info/METADATA +259 -0
- {vision_agent-1.0.5.dist-info → vision_agent-1.0.8.dist-info}/RECORD +8 -14
- {vision_agent-1.0.5.dist-info → vision_agent-1.0.8.dist-info}/WHEEL +1 -1
- vision_agent/agent/vision_agent.py +0 -605
- vision_agent/agent/vision_agent_coder.py +0 -742
- vision_agent/agent/vision_agent_coder_prompts.py +0 -290
- vision_agent/agent/vision_agent_planner.py +0 -564
- vision_agent/agent/vision_agent_planner_prompts.py +0 -199
- vision_agent/agent/vision_agent_prompts.py +0 -312
- vision_agent-1.0.5.dist-info/METADATA +0 -179
- {vision_agent-1.0.5.dist-info → vision_agent-1.0.8.dist-info}/LICENSE +0 -0
vision_agent/agent/vision_agent_coder.py
@@ -1,742 +0,0 @@
-import copy
-import logging
-import os
-import sys
-from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
-
-from tabulate import tabulate
-
-import vision_agent.tools as T
-from vision_agent.agent.agent import Agent
-from vision_agent.agent.vision_agent_coder_prompts import (
-    CODE,
-    FIX_BUG,
-    FULL_TASK,
-    SIMPLE_TEST,
-)
-from vision_agent.agent.vision_agent_planner import (
-    AnthropicVisionAgentPlanner,
-    AzureVisionAgentPlanner,
-    OllamaVisionAgentPlanner,
-    OpenAIVisionAgentPlanner,
-    PlanContext,
-)
-from vision_agent.lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
-from vision_agent.models import Message
-from vision_agent.tools.meta_tools import get_diff
-from vision_agent.utils import CodeInterpreterFactory, Execution
-from vision_agent.utils.agent import (
-    _MAX_TABULATE_COL_WIDTH,
-    DefaultImports,
-    extract_code,
-    extract_tag,
-    format_feedback,
-    print_code,
-    remove_installs_from_code,
-    strip_function_calls,
-)
-from vision_agent.utils.execute import CodeInterpreter
-
-logging.basicConfig(stream=sys.stdout)
-WORKSPACE = Path(os.getenv("WORKSPACE", ""))
-_LOGGER = logging.getLogger(__name__)
-
-
-def write_code(
-    coder: LMM,
-    chat: List[Message],
-    plan: str,
-    tool_info: str,
-    plan_thoughts: str,
-    tool_output: str,
-    feedback: str,
-) -> str:
-    chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
-        raise ValueError("Last chat message must be from the user.")
-
-    user_request = chat[-1]["content"]
-    prompt = CODE.format(
-        docstring=tool_info,
-        question=FULL_TASK.format(user_request=user_request, subtasks=plan),
-        tool_output=tool_output,
-        plan_thoughts=plan_thoughts,
-        feedback=feedback,
-    )
-    chat[-1]["content"] = prompt
-    return extract_code(coder(chat, stream=False))  # type: ignore
-
-
-def write_test(
-    tester: LMM,
-    chat: List[Message],
-    tool_utils: str,
-    code: str,
-    feedback: str,
-    media: Optional[Sequence[Union[str, Path]]] = None,
-) -> str:
-    chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
-        raise ValueError("Last chat message must be from the user.")
-
-    user_request = chat[-1]["content"]
-    prompt = SIMPLE_TEST.format(
-        docstring=tool_utils,
-        question=user_request,
-        code=code,
-        feedback=feedback,
-        media=media,
-    )
-    chat[-1]["content"] = prompt
-    return extract_code(tester(chat, stream=False))  # type: ignore
-
-
-def write_and_test_code(
-    chat: List[Message],
-    plan: str,
-    tool_info: str,
-    tool_output: str,
-    plan_thoughts: str,
-    tool_utils: str,
-    working_memory: List[Dict[str, str]],
-    coder: LMM,
-    tester: LMM,
-    debugger: LMM,
-    code_interpreter: CodeInterpreter,
-    log_progress: Callable[[Dict[str, Any]], None],
-    verbosity: int = 0,
-    max_retries: int = 3,
-    media: Optional[Sequence[Union[str, Path]]] = None,
-) -> Dict[str, Any]:
-    log_progress(
-        {
-            "type": "log",
-            "log_content": "Generating code",
-            "status": "started",
-        }
-    )
-    code = write_code(
-        coder,
-        chat,
-        plan,
-        tool_info,
-        tool_output,
-        plan_thoughts,
-        format_feedback(working_memory),
-    )
-    code = strip_function_calls(code)
-    test = write_test(
-        tester, chat, tool_utils, code, format_feedback(working_memory), media
-    )
-
-    log_progress(
-        {
-            "type": "log",
-            "log_content": "Running code",
-            "status": "running",
-            "code": DefaultImports.prepend_imports(code),
-            "payload": {
-                "test": test,
-            },
-        }
-    )
-    result = code_interpreter.exec_isolation(
-        f"{DefaultImports.to_code_string()}\n{code}\n{test}"
-    )
-    log_progress(
-        {
-            "type": "log",
-            "log_content": (
-                "Code execution succeeded"
-                if result.success
-                else "Code execution failed"
-            ),
-            "status": "completed" if result.success else "failed",
-            "code": DefaultImports.prepend_imports(code),
-            "payload": {
-                "test": test,
-            },
-        }
-    )
-    if verbosity == 2:
-        print_code("Initial code and tests:", code, test)
-        _LOGGER.info(
-            f"Initial code execution result:\n{result.text(include_logs=True)}"
-        )
-
-    count = 0
-    new_working_memory: List[Dict[str, str]] = []
-    while not result.success and count < max_retries:
-        if verbosity == 2:
-            _LOGGER.info(f"Start debugging attempt {count + 1}")
-        code, test, result = debug_code(
-            working_memory,
-            debugger,
-            code_interpreter,
-            tool_info,
-            code,
-            test,
-            result,
-            new_working_memory,
-            log_progress,
-            verbosity,
-        )
-        count += 1
-
-    if verbosity >= 1:
-        print_code("Final code and tests:", code, test)
-
-    return {
-        "code": code,
-        "test": test,
-        "success": result.success,
-        "test_result": result,
-        "working_memory": new_working_memory,
-    }
-
-
-def debug_code(
-    working_memory: List[Dict[str, str]],
-    debugger: LMM,
-    code_interpreter: CodeInterpreter,
-    tool_info: str,
-    code: str,
-    test: str,
-    result: Execution,
-    new_working_memory: List[Dict[str, str]],
-    log_progress: Callable[[Dict[str, Any]], None],
-    verbosity: int = 0,
-) -> tuple[str, str, Execution]:
-    log_progress(
-        {
-            "type": "log",
-            "log_content": ("Debugging code"),
-            "status": "started",
-        }
-    )
-
-    fixed_code = None
-    fixed_test = None
-    thoughts = ""
-    success = False
-    count = 0
-    while not success and count < 3:
-        try:
-            # LLMs write worse code when it's in JSON, so we have it write JSON
-            # followed by code each wrapped in markdown blocks.
-            fixed_code_and_test_str = debugger(
-                FIX_BUG.format(
-                    docstring=tool_info,
-                    code=code,
-                    tests=test,
-                    # Because of the way we trace function calls the trace information
-                    # ends up in the results. We don't want to show this info to the
-                    # LLM so we don't include it in the tool_output_str.
-                    result="\n".join(
-                        result.text(include_results=False).splitlines()[-50:]
-                    ),
-                    feedback=format_feedback(working_memory + new_working_memory),
-                ),
-                stream=False,
-            )
-            fixed_code_and_test_str = cast(str, fixed_code_and_test_str)
-            thoughts_tag = extract_tag(fixed_code_and_test_str, "thoughts")
-            thoughts = thoughts_tag if thoughts_tag is not None else ""
-            fixed_code = extract_tag(fixed_code_and_test_str, "code")
-            fixed_test = extract_tag(fixed_code_and_test_str, "test")
-
-            if fixed_code is None and fixed_test is None:
-                success = False
-            else:
-                success = True
-
-        except Exception as e:
-            _LOGGER.exception(f"Error while extracting JSON: {e}")
-
-        count += 1
-
-    old_code = code
-    old_test = test
-
-    if fixed_code is not None and fixed_code.strip() != "":
-        code = fixed_code
-    if fixed_test is not None and fixed_test.strip() != "":
-        test = fixed_test
-
-    new_working_memory.append(
-        {
-            "code": f"{code}\n{test}",
-            "feedback": thoughts,
-            "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
-        }
-    )
-    log_progress(
-        {
-            "type": "log",
-            "log_content": ("Running code"),
-            "status": "running",
-            "code": DefaultImports.prepend_imports(code),
-            "payload": {
-                "test": test,
-            },
-        }
-    )
-
-    result = code_interpreter.exec_isolation(
-        f"{DefaultImports.to_code_string()}\n{code}\n{test}"
-    )
-    log_progress(
-        {
-            "type": "log",
-            "log_content": (
-                "Code execution succeeded" if result.success else "Code execution failed"
-            ),
-            "status": "completed" if result.success else "failed",
-            "code": DefaultImports.prepend_imports(code),
-            "payload": {
-                "test": test,
-                # "result": result.to_json(),
-            },
-        }
-    )
-    if verbosity == 2:
-        print_code("Code and test after attempted fix:", code, test)
-        _LOGGER.info(
-            f"Reflection: {thoughts}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
-        )
-
-    return code, test, result
-
-
-class VisionAgentCoder(Agent):
-    """Vision Agent Coder is an agentic framework that can output code based on a user
-    request. It can plan tasks, retrieve relevant tools, write code, write tests, and
-    reflect on failed test cases to debug code. It is inspired by AgentCoder
-    https://arxiv.org/abs/2312.13010 and Data Interpreter https://arxiv.org/abs/2402.18679
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> agent = va.agent.VisionAgentCoder()
-        >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
-    """
-
-    def __init__(
-        self,
-        planner: Optional[Agent] = None,
-        coder: Optional[LMM] = None,
-        tester: Optional[LMM] = None,
-        debugger: Optional[LMM] = None,
-        verbosity: int = 0,
-        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
-    ) -> None:
-        """Initialize the Vision Agent Coder.
-
-        Parameters:
-            planner (Optional[Agent]): The planner model to use. Defaults to
-                AnthropicVisionAgentPlanner.
-            coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
-            tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
-            debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
-            verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
-                highest verbosity level, which will output all intermediate debugging
-                code.
-            report_progress_callback (Optional[Callable[[Dict[str, Any]], None]]): a callback
-                to report the progress of the agent. This is useful for streaming logs
-                in a web application where multiple VisionAgentCoder instances are
-                running in parallel. This callback ensures that the progress is not
-                mixed up.
-            code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
-                it can be one of: None or "local". If None, it will read from
-                the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
-                object is provided it will use that.
-        """
-
-        self.planner = (
-            AnthropicVisionAgentPlanner(verbosity=verbosity)
-            if planner is None
-            else planner
-        )
-        self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
-        self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
-        self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
-        self.verbosity = verbosity
-        if self.verbosity > 0:
-            _LOGGER.setLevel(logging.INFO)
-
-        self.report_progress_callback = report_progress_callback
-        self.code_interpreter = code_interpreter
-
-    def __call__(
-        self,
-        input: Union[str, List[Message]],
-        media: Optional[Union[str, Path]] = None,
-    ) -> str:
-        """Generate code based on a user request.
-
-        Parameters:
-            input (Union[str, List[Message]]): A conversation in the format of
-                [{"role": "user", "content": "describe your task here..."}] or a string
-                of just the contents.
-            media (Optional[Union[str, Path]]): The media file to be used in the task.
-
-        Returns:
-            str: The code output by the VisionAgentCoder.
-        """
-
-        if isinstance(input, str):
-            input = [{"role": "user", "content": input}]
-            if media is not None:
-                input[0]["media"] = [media]
-        code_and_context = self.generate_code(input)
-        return code_and_context["code"]  # type: ignore
-
-    def generate_code_from_plan(
-        self,
-        chat: List[Message],
-        plan_context: PlanContext,
-        code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Dict[str, Any]:
-        """Generates code and other intermediate outputs from a chat input and a plan.
-        The plan includes:
-            - plans: The plans generated by the planner.
-            - best_plan: The best plan selected by the planner.
-            - plan_thoughts: The thoughts of the planner, including any modifications
-                to the plan.
-            - tool_doc: The tool documentation for the best plan.
-            - tool_output: The tool output from the tools used by the best plan.
-
-        Parameters:
-            chat (List[Message]): A conversation in the format of
-                [{"role": "user", "content": "describe your task here..."}].
-            plan_context (PlanContext): The context of the plan, including the plans,
-                best_plan, plan_thoughts, tool_doc, and tool_output.
-
-        Returns:
-            Dict[str, Any]: A dictionary containing the code output by the
-                VisionAgentCoder and other intermediate outputs, including:
-                - status (str): Whether the agent completed or failed generating
-                    the code.
-                - code (str): The code output by the VisionAgentCoder.
-                - test (str): The test output by the VisionAgentCoder.
-                - test_result (Execution): The result of the test execution.
-                - plans (Dict[str, Any]): The plans generated by the planner.
-                - plan_thoughts (str): The thoughts of the planner.
-                - working_memory (List[Dict[str, str]]): The working memory of the agent.
-        """
-        if not chat:
-            raise ValueError("Chat cannot be empty.")
-
-        # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
-        code_interpreter = (
-            self.code_interpreter
-            if self.code_interpreter is not None
-            and not isinstance(self.code_interpreter, str)
-            else CodeInterpreterFactory.new_instance(
-                code_sandbox_runtime=self.code_interpreter,
-            )
-        )
-        with code_interpreter:
-            chat = copy.deepcopy(chat)
-            media_list = []
-            for chat_i in chat:
-                if "media" in chat_i:
-                    for media in chat_i["media"]:
-                        chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(str(media))
-
-            int_chat = cast(
-                List[Message],
-                [
-                    (
-                        {
-                            "role": c["role"],
-                            "content": c["content"],
-                            "media": c["media"],
-                        }
-                        if "media" in c
-                        else {"role": c["role"], "content": c["content"]}
-                    )
-                    for c in chat
-                ],
-            )
-
-            code = ""
-            test = ""
-            working_memory: List[Dict[str, str]] = []
-            plan = plan_context.plans[plan_context.best_plan]
-            tool_doc = plan_context.tool_doc
-            tool_output_str = plan_context.tool_output
-            plan_thoughts_str = str(plan_context.plan_thoughts)
-
-            if self.verbosity >= 1:
-                plan_fixed = [{"instructions": e} for e in plan["instructions"]]
-                _LOGGER.info(
-                    f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
-                )
-
-            results = write_and_test_code(
-                chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-                plan=f"\n{plan['thoughts']}\n-"
-                + "\n-".join([e for e in plan["instructions"]]),
-                tool_info=tool_doc,
-                tool_output=tool_output_str,
-                plan_thoughts=plan_thoughts_str,
-                tool_utils=T.get_utilties_docstring(),
-                working_memory=working_memory,
-                coder=self.coder,
-                tester=self.tester,
-                debugger=self.debugger,
-                code_interpreter=code_interpreter,
-                log_progress=self.log_progress,
-                verbosity=self.verbosity,
-                media=media_list,
-            )
-            success = cast(bool, results["success"])
-            code = remove_installs_from_code(cast(str, results["code"]))
-            test = remove_installs_from_code(cast(str, results["test"]))
-            working_memory.extend(results["working_memory"])
-            execution_result = cast(Execution, results["test_result"])
-
-            return {
-                "status": "completed" if success else "failed",
-                "code": DefaultImports.prepend_imports(code),
-                "test": test,
-                "test_result": execution_result,
-                "plans": plan_context.plans,
-                "plan_thoughts": plan_thoughts_str,
-                "working_memory": working_memory,
-            }
-
-    def generate_code(
-        self,
-        chat: List[Message],
-        test_multi_plan: bool = True,
-        custom_tool_names: Optional[List[str]] = None,
-    ) -> Dict[str, Any]:
-        """Generates code and other intermediate outputs from a chat input.
-
-        Parameters:
-            chat (List[Message]): A conversation in the format of
-                [{"role": "user", "content": "describe your task here..."}].
-            test_multi_plan (bool): Whether to test multiple plans or just the best plan.
-            custom_tool_names (Optional[List[str]]): A list of custom tool names to use
-                for the planner.
-
-        Returns:
-            Dict[str, Any]: A dictionary containing the code output by the
-                VisionAgentCoder and other intermediate outputs, including:
-                - status (str): Whether the agent completed or failed generating
-                    the code.
-                - code (str): The code output by the VisionAgentCoder.
-                - test (str): The test output by the VisionAgentCoder.
-                - test_result (Execution): The result of the test execution.
-                - plans (Dict[str, Any]): The plans generated by the planner.
-                - plan_thoughts (str): The thoughts of the planner.
-                - working_memory (List[Dict[str, str]]): The working memory of the agent.
-        """
-        if not chat:
-            raise ValueError("Chat cannot be empty.")
-
-        # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
-        code_interpreter = (
-            self.code_interpreter
-            if self.code_interpreter is not None
-            and not isinstance(self.code_interpreter, str)
-            else CodeInterpreterFactory.new_instance(
-                code_sandbox_runtime=self.code_interpreter,
-            )
-        )
-        with code_interpreter:
-            plan_context = self.planner.generate_plan(  # type: ignore
-                chat,
-                test_multi_plan=test_multi_plan,
-                custom_tool_names=custom_tool_names,
-                code_interpreter=code_interpreter,
-            )
-
-            code_and_context = self.generate_code_from_plan(
-                chat,
-                plan_context,
-                code_interpreter=code_interpreter,
-            )
-        return code_and_context
-
-    def chat(self, chat: List[Message]) -> List[Message]:
-        chat = copy.deepcopy(chat)
-        code = self.generate_code(chat)
-        chat.append({"role": "agent", "content": code["code"]})
-        return chat
-
-    def log_progress(self, data: Dict[str, Any]) -> None:
-        if self.report_progress_callback is not None:
-            self.report_progress_callback(data)
-
-
-class OpenAIVisionAgentCoder(VisionAgentCoder):
-    """Initializes Vision Agent Coder using OpenAI models for planning, coding, testing."""
-
-    def __init__(
-        self,
-        planner: Optional[Agent] = None,
-        coder: Optional[LMM] = None,
-        tester: Optional[LMM] = None,
-        debugger: Optional[LMM] = None,
-        verbosity: int = 0,
-        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
-    ) -> None:
-        self.planner = (
-            OpenAIVisionAgentPlanner(verbosity=verbosity)
-            if planner is None
-            else planner
-        )
-        self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
-        self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
-        self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
-        self.verbosity = verbosity
-        if self.verbosity > 0:
-            _LOGGER.setLevel(logging.INFO)
-
-        self.report_progress_callback = report_progress_callback
-        self.code_interpreter = code_interpreter
-
-
-class AnthropicVisionAgentCoder(VisionAgentCoder):
-    """Initializes Vision Agent Coder using Anthropic models for planning, coding, testing."""
-
-    def __init__(
-        self,
-        planner: Optional[Agent] = None,
-        coder: Optional[LMM] = None,
-        tester: Optional[LMM] = None,
-        debugger: Optional[LMM] = None,
-        verbosity: int = 0,
-        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
-    ) -> None:
-        # NOTE: Claude doesn't have an official JSON mode
-        self.planner = (
-            AnthropicVisionAgentPlanner(verbosity=verbosity)
-            if planner is None
-            else planner
-        )
-        self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
-        self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
-        self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
-        self.verbosity = verbosity
-        if self.verbosity > 0:
-            _LOGGER.setLevel(logging.INFO)
-
-        self.report_progress_callback = report_progress_callback
-        self.code_interpreter = code_interpreter
-
-
-class OllamaVisionAgentCoder(VisionAgentCoder):
-    """VisionAgentCoder that uses Ollama models for planning, coding, testing.
-
-    Prerequisites:
-    1. Run ollama pull llama3.2-vision for the LMM
-    2. Run ollama pull mxbai-embed-large for the embedding similarity model
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> agent = va.agent.OllamaVisionAgentCoder()
-        >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
-    """
-
-    def __init__(
-        self,
-        planner: Optional[Agent] = None,
-        coder: Optional[LMM] = None,
-        tester: Optional[LMM] = None,
-        debugger: Optional[LMM] = None,
-        verbosity: int = 0,
-        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
-    ) -> None:
-        super().__init__(
-            planner=(
-                OllamaVisionAgentPlanner(verbosity=verbosity)
-                if planner is None
-                else planner
-            ),
-            coder=(
-                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
-                if coder is None
-                else coder
-            ),
-            tester=(
-                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
-                if tester is None
-                else tester
-            ),
-            debugger=(
-                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
-                if debugger is None
-                else debugger
-            ),
-            verbosity=verbosity,
-            report_progress_callback=report_progress_callback,
-            code_interpreter=code_interpreter,
-        )
-
-
-class AzureVisionAgentCoder(VisionAgentCoder):
-    """VisionAgentCoder that uses Azure OpenAI APIs for planning, coding, testing.
-
-    Prerequisites:
-    1. Set the environment variable AZURE_OPENAI_API_KEY to your Azure OpenAI API key.
-    2. Set the environment variable AZURE_OPENAI_ENDPOINT to your Azure OpenAI endpoint.
-
-    Example
-    -------
-        >>> import vision_agent as va
-        >>> agent = va.agent.AzureVisionAgentCoder()
-        >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
-    """
-
-    def __init__(
-        self,
-        planner: Optional[Agent] = None,
-        coder: Optional[LMM] = None,
-        tester: Optional[LMM] = None,
-        debugger: Optional[LMM] = None,
-        verbosity: int = 0,
-        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
-    ) -> None:
-        """Initialize the Vision Agent Coder.
-
-        Parameters:
-            planner (Optional[Agent]): The planner model to use. Defaults to
-                AzureVisionAgentPlanner.
-            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
-            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
-            debugger (Optional[LMM]): The debugger model to use. Defaults to OpenAILMM.
-            verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
-                highest verbosity level, which will output all intermediate debugging
-                code.
-            report_progress_callback: a callback to report the progress of the agent.
-                This is useful for streaming logs in a web application where multiple
-                VisionAgentCoder instances are running in parallel. This callback
-                ensures that the progress is not mixed up.
-        """
-        super().__init__(
-            planner=(
-                AzureVisionAgentPlanner(verbosity=verbosity)
-                if planner is None
-                else planner
-            ),
-            coder=AzureOpenAILMM(temperature=0.0) if coder is None else coder,
-            tester=AzureOpenAILMM(temperature=0.0) if tester is None else tester,
-            debugger=(
-                AzureOpenAILMM(temperature=0.0) if debugger is None else debugger
-            ),
-            verbosity=verbosity,
-            report_progress_callback=report_progress_callback,
-            code_interpreter=code_interpreter,
-        )