vision-agent 0.2.140__py3-none-any.whl → 0.2.141__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +64 -32
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/METADATA +60 -12
- vision_agent-0.2.141.dist-info/RECORD +33 -0
- vision_agent-0.2.140.dist-info/RECORD +0 -33
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/WHEEL +0 -0
vision_agent/agent/agent_utils.py
CHANGED
@@ -40,12 +40,18 @@ def _strip_markdown_code(inp_str: str) -> str:
 
 
 def extract_json(json_str: str) -> Dict[str, Any]:
-    json_str = json_str.replace("\n", " ").strip()
+    json_str_mod = json_str.replace("\n", " ").strip()
+    json_str_mod = json_str_mod.replace("'", '"')
+    json_str_mod = json_str_mod.replace(": True", ": true").replace(
+        ": False", ": false"
+    )
 
     try:
-        return json.loads(json_str)  # type: ignore
+        return json.loads(json_str_mod)  # type: ignore
    except json.JSONDecodeError:
         json_orig = json_str
+        # don't replace quotes here or booleans since it can also introduce errors
+        json_str = json_str.replace("\n", " ").strip()
         json_str = _strip_markdown_code(json_str)
         json_str = _find_markdown_json(json_str)
         json_dict = _extract_sub_json(json_str)
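The change makes `extract_json` tolerant of Python-literal output (single-quoted keys, `True`/`False`) before falling back to the markdown-stripping path. A standalone sketch of the same normalization, not the library helper itself:

```python
import json
from typing import Any, Dict


def parse_lenient_json(raw: str) -> Dict[str, Any]:
    # Normalize Python-literal style dicts ('single quotes', True/False)
    # into strict JSON before attempting json.loads.
    mod = raw.replace("\n", " ").strip()
    mod = mod.replace("'", '"')
    mod = mod.replace(": True", ": true").replace(": False", ": false")
    return json.loads(mod)


# An orchestrator reply written as a Python dict rather than JSON:
print(parse_lenient_json("{'thoughts': 'done', 'let_user_respond': True}"))
# -> {'thoughts': 'done', 'let_user_respond': True}
```

Note that the `except` branch above deliberately skips the quote/boolean replacement: applying it to content that legitimately contains apostrophes could corrupt otherwise-valid JSON, as the new inline comment says.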
vision_agent/agent/vision_agent.py
CHANGED
@@ -3,18 +3,23 @@ import logging
 import os
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
 from vision_agent.agent.vision_agent_prompts import (
     EXAMPLES_CODE1,
     EXAMPLES_CODE2,
+    EXAMPLES_CODE3,
     VA_CODE,
 )
-from vision_agent.lmm import LMM, Message, OpenAILMM
+from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
-from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
+from vision_agent.tools.meta_tools import (
+    Artifacts,
+    check_and_load_image,
+    use_extra_vision_agent_args,
+)
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
@@ -30,7 +35,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact,
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -68,10 +73,18 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
 
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
-        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
+        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
         conversation=conversation,
     )
-    return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
+    message: Message = {"role": "user", "content": prompt}
+    # only add recent media so we don't overload the model with old images
+    if (
+        chat[-1]["role"] == "observation"
+        and "media" in chat[-1]
+        and len(chat[-1]["media"]) > 0  # type: ignore
+    ):
+        message["media"] = chat[-1]["media"]
+    return extract_json(orch([message], stream=False))  # type: ignore
 
 
 def run_code_action(
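`run_conversation` now forwards media only from the most recent observation message. A minimal standalone sketch of that guard, assuming chat messages are plain dicts with an optional "media" list:

```python
from typing import Any, Dict, List

chat: List[Dict[str, Any]] = [
    {"role": "user", "content": "count the cars"},
    {"role": "observation", "content": "wrote output.png", "media": ["output.png"]},
]

message: Dict[str, Any] = {"role": "user", "content": "<formatted VA_CODE prompt>"}
last = chat[-1]
# Attach media only from the latest observation so older images don't
# accumulate in the orchestrator's context window.
if last["role"] == "observation" and len(last.get("media", [])) > 0:
    message["media"] = last["media"]

print(message["media"])  # ['output.png']
```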
@@ -136,10 +149,8 @@ class VisionAgent(Agent):
         code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
         """
 
-        self.agent = (
-            OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
-        )
-        self.max_iterations = 100
+        self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
+        self.max_iterations = 12
         self.verbosity = verbosity
         self.code_sandbox_runtime = code_sandbox_runtime
         self.callback_message = callback_message
@@ -267,7 +278,8 @@ class VisionAgent(Agent):
             orig_chat.append({"role": "observation", "content": artifacts_loaded})
             self.streaming_message({"role": "observation", "content": artifacts_loaded})
 
-            if
+            if int_chat[-1]["role"] == "user":
+                last_user_message_content = cast(str, int_chat[-1].get("content", ""))
             user_code_action = parse_execution(last_user_message_content, False)
             if user_code_action is not None:
                 user_result, user_obs = run_code_action(
@@ -309,8 +321,7 @@ class VisionAgent(Agent):
             else:
                 self.streaming_message({"role": "assistant", "content": response})
 
-            if response["let_user_respond"]:
-                break
+            finished = response["let_user_respond"]
 
             code_action = parse_execution(
                 response["response"], test_multi_plan, customized_tool_names
@@ -321,13 +332,22 @@ class VisionAgent(Agent):
                     code_action, code_interpreter, str(remote_artifacts_path)
                 )
 
+                media_obs = check_and_load_image(code_action)
+
                 if self.verbosity >= 1:
                     _LOGGER.info(obs)
+
+                chat_elt: Message = {"role": "observation", "content": obs}
+                if media_obs and result.success:
+                    chat_elt["media"] = [
+                        Path(code_interpreter.remote_path) / media_ob
+                        for media_ob in media_obs
+                    ]
+
                 # don't add execution results to internal chat
-                int_chat.append({"role": "observation", "content": obs})
-                orig_chat.append(
-                    {"role": "observation", "content": obs, "execution": result}
-                )
+                int_chat.append(chat_elt)
+                chat_elt["execution"] = result
+                orig_chat.append(chat_elt)
                 self.streaming_message(
                     {
                         "role": "observation",
@@ -353,3 +373,63 @@ class VisionAgent(Agent):
 
     def log_progress(self, data: Dict[str, Any]) -> None:
         pass
+
+
+class OpenAIVisionAgent(VisionAgent):
+    def __init__(
+        self,
+        agent: Optional[LMM] = None,
+        verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+        callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
+    ) -> None:
+        """Initialize the VisionAgent using OpenAI LMMs.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
+        agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
+        super().__init__(
+            agent,
+            verbosity,
+            local_artifacts_path,
+            code_sandbox_runtime,
+            callback_message,
+        )
+
+
+class AnthropicVisionAgent(VisionAgent):
+    def __init__(
+        self,
+        agent: Optional[LMM] = None,
+        verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+        callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
+    ) -> None:
+        """Initialize the VisionAgent using Anthropic LMMs.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
+        agent = AnthropicLMM(temperature=0.0) if agent is None else agent
+        super().__init__(
+            agent,
+            verbosity,
+            local_artifacts_path,
+            code_sandbox_runtime,
+            callback_message,
+        )
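Based on the constructors added above, the two subclasses differ only in the default orchestration LMM. A hypothetical usage sketch (assumes the relevant ANTHROPIC_API_KEY / OPENAI_API_KEY is configured; nothing beyond construction is shown in this diff):

```python
from vision_agent.agent.vision_agent import AnthropicVisionAgent, OpenAIVisionAgent

# Defaults per the constructors above: AnthropicLMM(temperature=0.0) vs.
# OpenAILMM(temperature=0.0, json_mode=True); all other parameters are
# forwarded unchanged to VisionAgent.__init__.
agent = AnthropicVisionAgent(verbosity=1)
# agent = OpenAIVisionAgent(verbosity=1)  # OpenAI-backed alternative
```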
vision_agent/agent/vision_agent_coder.py
CHANGED
@@ -2,12 +2,10 @@ import copy
 import logging
 import os
 import sys
-import tempfile
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
 
-from PIL import Image
 from rich.console import Console
 from rich.style import Style
 from rich.syntax import Syntax
@@ -29,8 +27,8 @@ from vision_agent.agent.vision_agent_coder_prompts import (
 )
 from vision_agent.lmm import (
     LMM,
+    AnthropicLMM,
     AzureOpenAILMM,
-    ClaudeSonnetLMM,
     Message,
     OllamaLMM,
     OpenAILMM,
@@ -53,6 +51,9 @@ class DefaultImports:
     """Container for default imports used in the code execution."""
 
     common_imports = [
+        "import os",
+        "import numpy as np",
+        "from vision_agent.tools import *",
         "from typing import *",
         "from pillow_heif import register_heif_opener",
         "register_heif_opener()",
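`DefaultImports.prepend_imports(code)` is called later in this diff to put these lines ahead of generated code before it runs in the sandbox. A minimal sketch of that pattern; the method's actual body isn't shown in this hunk, so treat this as an assumption:

```python
# Sketch of the prepend-imports pattern used by DefaultImports.
COMMON_IMPORTS = [
    "import os",
    "import numpy as np",
    "from vision_agent.tools import *",
    "from typing import *",
    "from pillow_heif import register_heif_opener",
    "register_heif_opener()",
]


def prepend_imports(code: str) -> str:
    # Generated code then executes with os/numpy/tools already in scope.
    return "\n".join(COMMON_IMPORTS) + "\n\n" + code


print(prepend_imports("image = load_image(os.path.join('data', 'dog.jpg'))"))
```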
@@ -92,29 +93,6 @@ def format_plans(plans: Dict[str, Any]) -> str:
     return plan_str
 
 
-def extract_image(
-    media: Optional[Sequence[Union[str, Path]]],
-) -> Optional[Sequence[Union[str, Path]]]:
-    if media is None:
-        return None
-
-    new_media = []
-    for m in media:
-        m = Path(m)
-        extension = m.suffix
-        if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
-            new_media.append(m)
-        elif extension in [".mp4", ".mov"]:
-            frames = T.extract_frames(m)
-            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-                if len(frames) > 0:
-                    Image.fromarray(frames[0][0]).save(tmp.name)
-                    new_media.append(Path(tmp.name))
-    if len(new_media) == 0:
-        return None
-    return new_media
-
-
 def write_plans(
     chat: List[Message],
     tool_desc: str,
@@ -146,7 +124,7 @@ def pick_plan(
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
     max_retries: int = 3,
-) -> Tuple[str, str]:
+) -> Tuple[Dict[str, str], str]:
     log_progress(
         {
             "type": "log",
@@ -199,7 +177,10 @@ def pick_plan(
 
     # retry if the tool output is empty or code fails
     count = 0
-    while (
+    while (
+        not tool_output.success
+        or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
+    ) and count < max_retries:
         prompt = TEST_PLANS.format(
             docstring=tool_info,
             plans=plan_str,
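The reworked loop retries not only when execution fails but also when it succeeds silently, since empty output gives the plan-picking model nothing to compare plans with. A runnable standalone sketch of the condition; `run_candidate_code` and the dataclasses are hypothetical stand-ins for the regenerate-and-execute step:

```python
import random
from dataclasses import dataclass, field
from typing import List


@dataclass
class Logs:
    stdout: List[str] = field(default_factory=list)
    stderr: List[str] = field(default_factory=list)


@dataclass
class Execution:
    success: bool
    logs: Logs


def run_candidate_code() -> Execution:
    # Hypothetical stand-in for "regenerate the test code and execute it".
    ok = random.random() > 0.5
    return Execution(success=ok, logs=Logs(stdout=["plan1: 3 cars"] if ok else []))


max_retries = 3
count = 0
tool_output = run_candidate_code()
# Retry while the run failed OR succeeded with no stdout/stderr at all.
while (
    not tool_output.success
    or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
) and count < max_retries:
    tool_output = run_candidate_code()
    count += 1
```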
@@ -238,6 +219,7 @@ def pick_plan(
         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
             _LOGGER.info(f"Code execution result after attempt {count + 1}")
+            _LOGGER.info(f"{tool_output_str}")
 
         count += 1
 
@@ -256,10 +238,10 @@ def pick_plan(
     chat[-1]["content"] = prompt
 
     count = 0
-    best_plan = None
-    while best_plan is None and count < max_retries:
+    plan_thoughts = None
+    while plan_thoughts is None and count < max_retries:
         try:
-            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+            plan_thoughts = extract_json(model(chat, stream=False))  # type: ignore
         except JSONDecodeError as e:
             _LOGGER.exception(
                 f"Error while extracting JSON during picking best plan {str(e)}"
@@ -268,23 +250,27 @@ def pick_plan(
         count += 1
 
     if (
-        best_plan is None
-        or "best_plan" not in best_plan
-        or ("best_plan" in best_plan and best_plan["best_plan"] not in plans)
+        plan_thoughts is None
+        or "best_plan" not in plan_thoughts
+        or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
     ):
-        best_plan = {"best_plan": list(plans.keys())[0]}
+        _LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}")
+        plan_thoughts = {"best_plan": list(plans.keys())[0]}
+
+    if "thoughts" not in plan_thoughts:
+        plan_thoughts["thoughts"] = ""
 
     if verbosity >= 1:
-        _LOGGER.info(f"Best plan:\n{best_plan}")
+        _LOGGER.info(f"Best plan:\n{plan_thoughts}")
     log_progress(
         {
             "type": "log",
             "log_content": "Picked best plan",
             "status": "completed",
-            "payload": plans[best_plan["best_plan"]],
+            "payload": plans[plan_thoughts["best_plan"]],
         }
     )
-    return best_plan["best_plan"], "```python\n" + code + "\n```\n" + tool_output_str
+    return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str
 
 
 def write_code(
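`pick_plan` now returns the whole `plan_thoughts` dict and guards against malformed model output: a missing or unknown plan name falls back to the first plan, and a missing "thoughts" key becomes an empty string. A standalone sketch of that fallback logic:

```python
from typing import Any, Dict, Optional

plans = {
    "plan1": {"instructions": ["use tool A"]},
    "plan2": {"instructions": ["use tool B"]},
}

# Simulate malformed model output naming a plan that doesn't exist.
plan_thoughts: Optional[Dict[str, Any]] = {"best_plan": "plan9"}

if (
    plan_thoughts is None
    or "best_plan" not in plan_thoughts
    or plan_thoughts["best_plan"] not in plans
):
    plan_thoughts = {"best_plan": list(plans.keys())[0]}
plan_thoughts.setdefault("thoughts", "")

print(plan_thoughts)  # {'best_plan': 'plan1', 'thoughts': ''}
```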
@@ -292,6 +278,7 @@ def write_code(
     chat: List[Message],
     plan: str,
     tool_info: str,
+    plan_thoughts: str,
     tool_output: str,
     feedback: str,
 ) -> str:
@@ -304,6 +291,7 @@ def write_code(
         docstring=tool_info,
         question=FULL_TASK.format(user_request=user_request, subtasks=plan),
         tool_output=tool_output,
+        plan_thoughts=plan_thoughts,
         feedback=feedback,
     )
     chat[-1]["content"] = prompt
@@ -339,6 +327,7 @@ def write_and_test_code(
     plan: str,
     tool_info: str,
     tool_output: str,
+    plan_thoughts: str,
     tool_utils: str,
     working_memory: List[Dict[str, str]],
     coder: LMM,
@@ -363,6 +352,7 @@ def write_and_test_code(
         plan,
         tool_info,
         tool_output,
+        plan_thoughts,
         format_memory(working_memory),
     )
     test = write_test(
@@ -634,31 +624,30 @@ class VisionAgentCoder(Agent):
         """Initialize the Vision Agent Coder.
 
         Parameters:
-            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
-            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
-            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
-            debugger (Optional[LMM]): The debugger model to use. Defaults to OpenAILMM.
+            planner (Optional[LMM]): The planner model to use. Defaults to AnthropicLMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
+            debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
             tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
                 code.
-            report_progress_callback: a callback to report the progress of the agent.
-                This is useful for streaming logs in a web application where multiple
-                VisionAgentCoder instances are running in parallel. This callback
-                ensures that the progress are not mixed up.
-            code_sandbox_runtime: the code sandbox runtime to use. A code sandbox is
-                used to run the generated code. It can be one of the following
-                values: None, "local" or "e2b". If None, VisionAgentCoder will read
-                the value from the environment variable CODE_SANDBOX_RUNTIME. If it's
-                also None, the local python runtime environment will be used.
+            report_progress_callback (Optional[Callable[Dict[str, Any]]]): a callback
+                to report the progress of the agent. This is useful for streaming logs
+                in a web application where multiple VisionAgentCoder instances are
+                running in parallel. This callback ensures that the progress are not
+                mixed up.
+            code_sandbox_runtime (Optional[str]): the code sandbox runtime to use. A
+                code sandbox is used to run the generated code. It can be one of the
+                following values: None, "local" or "e2b". If None, VisionAgentCoder
+                will read the value from the environment variable CODE_SANDBOX_RUNTIME.
+                If it's also None, the local python runtime environment will be used.
         """
 
-        self.planner = (
-            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
-        )
-        self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
-        self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
-        self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
+        self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
+        self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
+        self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
+        self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
         self.verbosity = verbosity
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)
@@ -785,7 +774,7 @@ class VisionAgentCoder(Agent):
         )
 
         if test_multi_plan:
-            best_plan, tool_output_str = pick_plan(
+            plan_thoughts, tool_output_str = pick_plan(
                 int_chat,
                 plans,
                 tool_infos["all"],
@@ -795,9 +784,12 @@ class VisionAgentCoder(Agent):
                 self.log_progress,
                 verbosity=self.verbosity,
             )
+            best_plan = plan_thoughts["best_plan"]
+            plan_thoughts_str = plan_thoughts["thoughts"]
         else:
             best_plan = list(plans.keys())[0]
             tool_output_str = ""
+            plan_thoughts_str = ""
 
         if best_plan in plans and best_plan in tool_infos:
             plan_i = plans[best_plan]
@@ -832,6 +824,7 @@ class VisionAgentCoder(Agent):
                 + "\n-".join([e for e in plan_i["instructions"]]),
                 tool_info=tool_info,
                 tool_output=tool_output_str,
+                plan_thoughts=plan_thoughts_str,
                 tool_utils=T.UTILITIES_DOCSTRING,
                 working_memory=working_memory,
                 coder=self.coder,
@@ -862,7 +855,8 @@ class VisionAgentCoder(Agent):
             "code": DefaultImports.prepend_imports(code),
             "test": test,
             "test_result": execution_result,
-            "
+            "plans": plans,
+            "plan_thoughts": plan_thoughts_str,
             "working_memory": working_memory,
         }
 
@@ -904,7 +898,9 @@ class VisionAgentCoder(Agent):
         )
 
 
-class ClaudeVisionAgentCoder(VisionAgentCoder):
+class OpenAIVisionAgentCoder(VisionAgentCoder):
+    """Initializes Vision Agent Coder using OpenAI models for planning, coding, testing."""
+
     def __init__(
         self,
         planner: Optional[LMM] = None,
@@ -916,13 +912,44 @@ class ClaudeVisionAgentCoder(VisionAgentCoder):
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
         code_sandbox_runtime: Optional[str] = None,
     ) -> None:
-        self.planner = ClaudeSonnetLMM(temperature=0.0) if planner is None else planner
-        self.coder = ClaudeSonnetLMM(temperature=0.0) if coder is None else coder
-        self.tester = ClaudeSonnetLMM(temperature=0.0) if tester is None else tester
-        self.debugger = ClaudeSonnetLMM(temperature=0.0) if debugger is None else debugger
-        self.tool_recommender = (
-            Sim(T.TOOLS_DF, sim_key="desc") if tool_recommender is None else tool_recommender
+        self.planner = (
+            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
+        )
+        self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
+        self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
+        self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
+        self.verbosity = verbosity
+        if self.verbosity > 0:
+            _LOGGER.setLevel(logging.INFO)
+
+        self.tool_recommender = (
+            Sim(T.TOOLS_DF, sim_key="desc")
+            if tool_recommender is None
+            else tool_recommender
         )
+        self.report_progress_callback = report_progress_callback
+        self.code_sandbox_runtime = code_sandbox_runtime
+
+
+class AnthropicVisionAgentCoder(VisionAgentCoder):
+    """Initializes Vision Agent Coder using Anthropic models for planning, coding, testing."""
+
+    def __init__(
+        self,
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
+        tool_recommender: Optional[Sim] = None,
+        verbosity: int = 0,
+        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+    ) -> None:
+        # NOTE: Claude doesn't have an official JSON mode
+        self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
+        self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
+        self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
+        self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
         self.verbosity = verbosity
         if self.verbosity > 0:
             _LOGGER.setLevel(logging.INFO)