vision-agent 0.2.140__py3-none-any.whl → 0.2.142__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +65 -33
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/METADATA +60 -12
- vision_agent-0.2.142.dist-info/RECORD +33 -0
- vision_agent-0.2.140.dist-info/RECORD +0 -33
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/WHEEL +0 -0
vision_agent/agent/__init__.py
CHANGED
@@ -40,12 +40,18 @@ def _strip_markdown_code(inp_str: str) -> str:
|
|
40
40
|
|
41
41
|
|
42
42
|
def extract_json(json_str: str) -> Dict[str, Any]:
|
43
|
-
|
43
|
+
json_str_mod = json_str.replace("\n", " ").strip()
|
44
|
+
json_str_mod = json_str_mod.replace("'", '"')
|
45
|
+
json_str_mod = json_str_mod.replace(": True", ": true").replace(
|
46
|
+
": False", ": false"
|
47
|
+
)
|
44
48
|
|
45
49
|
try:
|
46
|
-
return json.loads(
|
50
|
+
return json.loads(json_str_mod) # type: ignore
|
47
51
|
except json.JSONDecodeError:
|
48
52
|
json_orig = json_str
|
53
|
+
# don't replace quotes here or booleans since it can also introduce errors
|
54
|
+
json_str = json_str.replace("\n", " ").strip()
|
49
55
|
json_str = _strip_markdown_code(json_str)
|
50
56
|
json_str = _find_markdown_json(json_str)
|
51
57
|
json_dict = _extract_sub_json(json_str)
|
@@ -3,18 +3,23 @@ import logging
|
|
3
3
|
import os
|
4
4
|
import tempfile
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
6
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
7
7
|
|
8
8
|
from vision_agent.agent import Agent
|
9
9
|
from vision_agent.agent.agent_utils import extract_json
|
10
10
|
from vision_agent.agent.vision_agent_prompts import (
|
11
11
|
EXAMPLES_CODE1,
|
12
12
|
EXAMPLES_CODE2,
|
13
|
+
EXAMPLES_CODE3,
|
13
14
|
VA_CODE,
|
14
15
|
)
|
15
|
-
from vision_agent.lmm import LMM, Message, OpenAILMM
|
16
|
+
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
|
16
17
|
from vision_agent.tools import META_TOOL_DOCSTRING
|
17
|
-
from vision_agent.tools.meta_tools import
|
18
|
+
from vision_agent.tools.meta_tools import (
|
19
|
+
Artifacts,
|
20
|
+
check_and_load_image,
|
21
|
+
use_extra_vision_agent_args,
|
22
|
+
)
|
18
23
|
from vision_agent.utils import CodeInterpreterFactory
|
19
24
|
from vision_agent.utils.execute import CodeInterpreter, Execution
|
20
25
|
|
@@ -30,7 +35,7 @@ class BoilerplateCode:
|
|
30
35
|
pre_code = [
|
31
36
|
"from typing import *",
|
32
37
|
"from vision_agent.utils.execute import CodeInterpreter",
|
33
|
-
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact,
|
38
|
+
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
|
34
39
|
"artifacts = Artifacts('{remote_path}')",
|
35
40
|
"artifacts.load('{remote_path}')",
|
36
41
|
]
|
@@ -68,10 +73,18 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
68
73
|
|
69
74
|
prompt = VA_CODE.format(
|
70
75
|
documentation=META_TOOL_DOCSTRING,
|
71
|
-
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
|
76
|
+
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
|
72
77
|
conversation=conversation,
|
73
78
|
)
|
74
|
-
|
79
|
+
message: Message = {"role": "user", "content": prompt}
|
80
|
+
# only add recent media so we don't overload the model with old images
|
81
|
+
if (
|
82
|
+
chat[-1]["role"] == "observation"
|
83
|
+
and "media" in chat[-1]
|
84
|
+
and len(chat[-1]["media"]) > 0 # type: ignore
|
85
|
+
):
|
86
|
+
message["media"] = chat[-1]["media"]
|
87
|
+
return extract_json(orch([message], stream=False)) # type: ignore
|
75
88
|
|
76
89
|
|
77
90
|
def run_code_action(
|
@@ -136,10 +149,8 @@ class VisionAgent(Agent):
|
|
136
149
|
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
|
137
150
|
"""
|
138
151
|
|
139
|
-
self.agent = (
|
140
|
-
|
141
|
-
)
|
142
|
-
self.max_iterations = 100
|
152
|
+
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
153
|
+
self.max_iterations = 12
|
143
154
|
self.verbosity = verbosity
|
144
155
|
self.code_sandbox_runtime = code_sandbox_runtime
|
145
156
|
self.callback_message = callback_message
|
@@ -267,7 +278,8 @@ class VisionAgent(Agent):
|
|
267
278
|
orig_chat.append({"role": "observation", "content": artifacts_loaded})
|
268
279
|
self.streaming_message({"role": "observation", "content": artifacts_loaded})
|
269
280
|
|
270
|
-
if
|
281
|
+
if int_chat[-1]["role"] == "user":
|
282
|
+
last_user_message_content = cast(str, int_chat[-1].get("content", ""))
|
271
283
|
user_code_action = parse_execution(last_user_message_content, False)
|
272
284
|
if user_code_action is not None:
|
273
285
|
user_result, user_obs = run_code_action(
|
@@ -309,8 +321,7 @@ class VisionAgent(Agent):
|
|
309
321
|
else:
|
310
322
|
self.streaming_message({"role": "assistant", "content": response})
|
311
323
|
|
312
|
-
|
313
|
-
break
|
324
|
+
finished = response["let_user_respond"]
|
314
325
|
|
315
326
|
code_action = parse_execution(
|
316
327
|
response["response"], test_multi_plan, customized_tool_names
|
@@ -321,13 +332,22 @@ class VisionAgent(Agent):
|
|
321
332
|
code_action, code_interpreter, str(remote_artifacts_path)
|
322
333
|
)
|
323
334
|
|
335
|
+
media_obs = check_and_load_image(code_action)
|
336
|
+
|
324
337
|
if self.verbosity >= 1:
|
325
338
|
_LOGGER.info(obs)
|
339
|
+
|
340
|
+
chat_elt: Message = {"role": "observation", "content": obs}
|
341
|
+
if media_obs and result.success:
|
342
|
+
chat_elt["media"] = [
|
343
|
+
Path(code_interpreter.remote_path) / media_ob
|
344
|
+
for media_ob in media_obs
|
345
|
+
]
|
346
|
+
|
326
347
|
# don't add execution results to internal chat
|
327
|
-
int_chat.append(
|
328
|
-
|
329
|
-
|
330
|
-
)
|
348
|
+
int_chat.append(chat_elt)
|
349
|
+
chat_elt["execution"] = result
|
350
|
+
orig_chat.append(chat_elt)
|
331
351
|
self.streaming_message(
|
332
352
|
{
|
333
353
|
"role": "observation",
|
@@ -353,3 +373,63 @@ class VisionAgent(Agent):
|
|
353
373
|
|
354
374
|
def log_progress(self, data: Dict[str, Any]) -> None:
|
355
375
|
pass
|
376
|
+
|
377
|
+
|
378
|
+
class OpenAIVisionAgent(VisionAgent):
|
379
|
+
def __init__(
|
380
|
+
self,
|
381
|
+
agent: Optional[LMM] = None,
|
382
|
+
verbosity: int = 0,
|
383
|
+
local_artifacts_path: Optional[Union[str, Path]] = None,
|
384
|
+
code_sandbox_runtime: Optional[str] = None,
|
385
|
+
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
386
|
+
) -> None:
|
387
|
+
"""Initialize the VisionAgent using OpenAI LMMs.
|
388
|
+
|
389
|
+
Parameters:
|
390
|
+
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
391
|
+
of other agents.
|
392
|
+
verbosity (int): The verbosity level of the agent.
|
393
|
+
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
394
|
+
artifacts file.
|
395
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
|
396
|
+
"""
|
397
|
+
|
398
|
+
agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
|
399
|
+
super().__init__(
|
400
|
+
agent,
|
401
|
+
verbosity,
|
402
|
+
local_artifacts_path,
|
403
|
+
code_sandbox_runtime,
|
404
|
+
callback_message,
|
405
|
+
)
|
406
|
+
|
407
|
+
|
408
|
+
class AnthropicVisionAgent(VisionAgent):
|
409
|
+
def __init__(
|
410
|
+
self,
|
411
|
+
agent: Optional[LMM] = None,
|
412
|
+
verbosity: int = 0,
|
413
|
+
local_artifacts_path: Optional[Union[str, Path]] = None,
|
414
|
+
code_sandbox_runtime: Optional[str] = None,
|
415
|
+
callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
|
416
|
+
) -> None:
|
417
|
+
"""Initialize the VisionAgent using Anthropic LMMs.
|
418
|
+
|
419
|
+
Parameters:
|
420
|
+
agent (Optional[LMM]): The agent to use for conversation and orchestration
|
421
|
+
of other agents.
|
422
|
+
verbosity (int): The verbosity level of the agent.
|
423
|
+
local_artifacts_path (Optional[Union[str, Path]]): The path to the local
|
424
|
+
artifacts file.
|
425
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
|
426
|
+
"""
|
427
|
+
|
428
|
+
agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
429
|
+
super().__init__(
|
430
|
+
agent,
|
431
|
+
verbosity,
|
432
|
+
local_artifacts_path,
|
433
|
+
code_sandbox_runtime,
|
434
|
+
callback_message,
|
435
|
+
)
|
@@ -2,12 +2,10 @@ import copy
|
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import sys
|
5
|
-
import tempfile
|
6
5
|
from json import JSONDecodeError
|
7
6
|
from pathlib import Path
|
8
7
|
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
|
9
8
|
|
10
|
-
from PIL import Image
|
11
9
|
from rich.console import Console
|
12
10
|
from rich.style import Style
|
13
11
|
from rich.syntax import Syntax
|
@@ -29,8 +27,8 @@ from vision_agent.agent.vision_agent_coder_prompts import (
|
|
29
27
|
)
|
30
28
|
from vision_agent.lmm import (
|
31
29
|
LMM,
|
30
|
+
AnthropicLMM,
|
32
31
|
AzureOpenAILMM,
|
33
|
-
ClaudeSonnetLMM,
|
34
32
|
Message,
|
35
33
|
OllamaLMM,
|
36
34
|
OpenAILMM,
|
@@ -53,6 +51,9 @@ class DefaultImports:
|
|
53
51
|
"""Container for default imports used in the code execution."""
|
54
52
|
|
55
53
|
common_imports = [
|
54
|
+
"import os",
|
55
|
+
"import numpy as np",
|
56
|
+
"from vision_agent.tools import *",
|
56
57
|
"from typing import *",
|
57
58
|
"from pillow_heif import register_heif_opener",
|
58
59
|
"register_heif_opener()",
|
@@ -92,29 +93,6 @@ def format_plans(plans: Dict[str, Any]) -> str:
|
|
92
93
|
return plan_str
|
93
94
|
|
94
95
|
|
95
|
-
def extract_image(
|
96
|
-
media: Optional[Sequence[Union[str, Path]]],
|
97
|
-
) -> Optional[Sequence[Union[str, Path]]]:
|
98
|
-
if media is None:
|
99
|
-
return None
|
100
|
-
|
101
|
-
new_media = []
|
102
|
-
for m in media:
|
103
|
-
m = Path(m)
|
104
|
-
extension = m.suffix
|
105
|
-
if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
|
106
|
-
new_media.append(m)
|
107
|
-
elif extension in [".mp4", ".mov"]:
|
108
|
-
frames = T.extract_frames(m)
|
109
|
-
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
|
110
|
-
if len(frames) > 0:
|
111
|
-
Image.fromarray(frames[0][0]).save(tmp.name)
|
112
|
-
new_media.append(Path(tmp.name))
|
113
|
-
if len(new_media) == 0:
|
114
|
-
return None
|
115
|
-
return new_media
|
116
|
-
|
117
|
-
|
118
96
|
def write_plans(
|
119
97
|
chat: List[Message],
|
120
98
|
tool_desc: str,
|
@@ -146,7 +124,7 @@ def pick_plan(
|
|
146
124
|
log_progress: Callable[[Dict[str, Any]], None],
|
147
125
|
verbosity: int = 0,
|
148
126
|
max_retries: int = 3,
|
149
|
-
) -> Tuple[str, str]:
|
127
|
+
) -> Tuple[Dict[str, str], str]:
|
150
128
|
log_progress(
|
151
129
|
{
|
152
130
|
"type": "log",
|
@@ -199,7 +177,10 @@ def pick_plan(
|
|
199
177
|
|
200
178
|
# retry if the tool output is empty or code fails
|
201
179
|
count = 0
|
202
|
-
while (
|
180
|
+
while (
|
181
|
+
not tool_output.success
|
182
|
+
or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0)
|
183
|
+
) and count < max_retries:
|
203
184
|
prompt = TEST_PLANS.format(
|
204
185
|
docstring=tool_info,
|
205
186
|
plans=plan_str,
|
@@ -238,6 +219,7 @@ def pick_plan(
|
|
238
219
|
if verbosity == 2:
|
239
220
|
_print_code("Code and test after attempted fix:", code)
|
240
221
|
_LOGGER.info(f"Code execution result after attempt {count + 1}")
|
222
|
+
_LOGGER.info(f"{tool_output_str}")
|
241
223
|
|
242
224
|
count += 1
|
243
225
|
|
@@ -256,10 +238,10 @@ def pick_plan(
|
|
256
238
|
chat[-1]["content"] = prompt
|
257
239
|
|
258
240
|
count = 0
|
259
|
-
|
260
|
-
while
|
241
|
+
plan_thoughts = None
|
242
|
+
while plan_thoughts is None and count < max_retries:
|
261
243
|
try:
|
262
|
-
|
244
|
+
plan_thoughts = extract_json(model(chat, stream=False)) # type: ignore
|
263
245
|
except JSONDecodeError as e:
|
264
246
|
_LOGGER.exception(
|
265
247
|
f"Error while extracting JSON during picking best plan {str(e)}"
|
@@ -268,23 +250,27 @@ def pick_plan(
|
|
268
250
|
count += 1
|
269
251
|
|
270
252
|
if (
|
271
|
-
|
272
|
-
or "best_plan" not in
|
273
|
-
or ("best_plan" in
|
253
|
+
plan_thoughts is None
|
254
|
+
or "best_plan" not in plan_thoughts
|
255
|
+
or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
|
274
256
|
):
|
275
|
-
|
257
|
+
_LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}")
|
258
|
+
plan_thoughts = {"best_plan": list(plans.keys())[0]}
|
259
|
+
|
260
|
+
if "thoughts" not in plan_thoughts:
|
261
|
+
plan_thoughts["thoughts"] = ""
|
276
262
|
|
277
263
|
if verbosity >= 1:
|
278
|
-
_LOGGER.info(f"Best plan:\n{
|
264
|
+
_LOGGER.info(f"Best plan:\n{plan_thoughts}")
|
279
265
|
log_progress(
|
280
266
|
{
|
281
267
|
"type": "log",
|
282
268
|
"log_content": "Picked best plan",
|
283
269
|
"status": "completed",
|
284
|
-
"payload": plans[
|
270
|
+
"payload": plans[plan_thoughts["best_plan"]],
|
285
271
|
}
|
286
272
|
)
|
287
|
-
return
|
273
|
+
return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str
|
288
274
|
|
289
275
|
|
290
276
|
def write_code(
|
@@ -292,6 +278,7 @@ def write_code(
|
|
292
278
|
chat: List[Message],
|
293
279
|
plan: str,
|
294
280
|
tool_info: str,
|
281
|
+
plan_thoughts: str,
|
295
282
|
tool_output: str,
|
296
283
|
feedback: str,
|
297
284
|
) -> str:
|
@@ -304,6 +291,7 @@ def write_code(
|
|
304
291
|
docstring=tool_info,
|
305
292
|
question=FULL_TASK.format(user_request=user_request, subtasks=plan),
|
306
293
|
tool_output=tool_output,
|
294
|
+
plan_thoughts=plan_thoughts,
|
307
295
|
feedback=feedback,
|
308
296
|
)
|
309
297
|
chat[-1]["content"] = prompt
|
@@ -339,6 +327,7 @@ def write_and_test_code(
|
|
339
327
|
plan: str,
|
340
328
|
tool_info: str,
|
341
329
|
tool_output: str,
|
330
|
+
plan_thoughts: str,
|
342
331
|
tool_utils: str,
|
343
332
|
working_memory: List[Dict[str, str]],
|
344
333
|
coder: LMM,
|
@@ -363,6 +352,7 @@ def write_and_test_code(
|
|
363
352
|
plan,
|
364
353
|
tool_info,
|
365
354
|
tool_output,
|
355
|
+
plan_thoughts,
|
366
356
|
format_memory(working_memory),
|
367
357
|
)
|
368
358
|
test = write_test(
|
@@ -634,31 +624,30 @@ class VisionAgentCoder(Agent):
|
|
634
624
|
"""Initialize the Vision Agent Coder.
|
635
625
|
|
636
626
|
Parameters:
|
637
|
-
planner (Optional[LMM]): The planner model to use. Defaults to
|
638
|
-
coder (Optional[LMM]): The coder model to use. Defaults to
|
639
|
-
tester (Optional[LMM]): The tester model to use. Defaults to
|
640
|
-
debugger (Optional[LMM]): The debugger model to
|
627
|
+
planner (Optional[LMM]): The planner model to use. Defaults to AnthropicLMM.
|
628
|
+
coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
|
629
|
+
tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
|
630
|
+
debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
|
641
631
|
tool_recommender (Optional[Sim]): The tool recommender model to use.
|
642
632
|
verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
|
643
633
|
highest verbosity level which will output all intermediate debugging
|
644
634
|
code.
|
645
|
-
report_progress_callback: a callback
|
646
|
-
This is useful for streaming logs
|
647
|
-
|
648
|
-
ensures that the progress are not
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
635
|
+
report_progress_callback (Optional[Callable[Dict[str, Any]]]): a callback
|
636
|
+
to report the progress of the agent. This is useful for streaming logs
|
637
|
+
in a web application where multiple VisionAgentCoder instances are
|
638
|
+
running in parallel. This callback ensures that the progress are not
|
639
|
+
mixed up.
|
640
|
+
code_sandbox_runtime (Optional[str]): the code sandbox runtime to use. A
|
641
|
+
code sandbox is used to run the generated code. It can be one of the
|
642
|
+
following values: None, "local" or "e2b". If None, VisionAgentCoder
|
643
|
+
will read the value from the environment variable CODE_SANDBOX_RUNTIME.
|
644
|
+
If it's also None, the local python runtime environment will be used.
|
654
645
|
"""
|
655
646
|
|
656
|
-
self.planner = (
|
657
|
-
|
658
|
-
)
|
659
|
-
self.
|
660
|
-
self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
|
661
|
-
self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
|
647
|
+
self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
|
648
|
+
self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
|
649
|
+
self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
|
650
|
+
self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
|
662
651
|
self.verbosity = verbosity
|
663
652
|
if self.verbosity > 0:
|
664
653
|
_LOGGER.setLevel(logging.INFO)
|
@@ -785,7 +774,7 @@ class VisionAgentCoder(Agent):
|
|
785
774
|
)
|
786
775
|
|
787
776
|
if test_multi_plan:
|
788
|
-
|
777
|
+
plan_thoughts, tool_output_str = pick_plan(
|
789
778
|
int_chat,
|
790
779
|
plans,
|
791
780
|
tool_infos["all"],
|
@@ -795,9 +784,12 @@ class VisionAgentCoder(Agent):
|
|
795
784
|
self.log_progress,
|
796
785
|
verbosity=self.verbosity,
|
797
786
|
)
|
787
|
+
best_plan = plan_thoughts["best_plan"]
|
788
|
+
plan_thoughts_str = plan_thoughts["thoughts"]
|
798
789
|
else:
|
799
790
|
best_plan = list(plans.keys())[0]
|
800
791
|
tool_output_str = ""
|
792
|
+
plan_thoughts_str = ""
|
801
793
|
|
802
794
|
if best_plan in plans and best_plan in tool_infos:
|
803
795
|
plan_i = plans[best_plan]
|
@@ -832,6 +824,7 @@ class VisionAgentCoder(Agent):
|
|
832
824
|
+ "\n-".join([e for e in plan_i["instructions"]]),
|
833
825
|
tool_info=tool_info,
|
834
826
|
tool_output=tool_output_str,
|
827
|
+
plan_thoughts=plan_thoughts_str,
|
835
828
|
tool_utils=T.UTILITIES_DOCSTRING,
|
836
829
|
working_memory=working_memory,
|
837
830
|
coder=self.coder,
|
@@ -862,7 +855,8 @@ class VisionAgentCoder(Agent):
|
|
862
855
|
"code": DefaultImports.prepend_imports(code),
|
863
856
|
"test": test,
|
864
857
|
"test_result": execution_result,
|
865
|
-
"
|
858
|
+
"plans": plans,
|
859
|
+
"plan_thoughts": plan_thoughts_str,
|
866
860
|
"working_memory": working_memory,
|
867
861
|
}
|
868
862
|
|
@@ -904,7 +898,9 @@ class VisionAgentCoder(Agent):
|
|
904
898
|
)
|
905
899
|
|
906
900
|
|
907
|
-
class
|
901
|
+
class OpenAIVisionAgentCoder(VisionAgentCoder):
|
902
|
+
"""Initializes Vision Agent Coder using OpenAI models for planning, coding, testing."""
|
903
|
+
|
908
904
|
def __init__(
|
909
905
|
self,
|
910
906
|
planner: Optional[LMM] = None,
|
@@ -916,13 +912,44 @@ class ClaudeVisionAgentCoder(VisionAgentCoder):
|
|
916
912
|
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
917
913
|
code_sandbox_runtime: Optional[str] = None,
|
918
914
|
) -> None:
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
self.
|
923
|
-
self.
|
924
|
-
|
915
|
+
self.planner = (
|
916
|
+
OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
|
917
|
+
)
|
918
|
+
self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
|
919
|
+
self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
|
920
|
+
self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger
|
921
|
+
self.verbosity = verbosity
|
922
|
+
if self.verbosity > 0:
|
923
|
+
_LOGGER.setLevel(logging.INFO)
|
924
|
+
|
925
|
+
self.tool_recommender = (
|
926
|
+
Sim(T.TOOLS_DF, sim_key="desc")
|
927
|
+
if tool_recommender is None
|
928
|
+
else tool_recommender
|
925
929
|
)
|
930
|
+
self.report_progress_callback = report_progress_callback
|
931
|
+
self.code_sandbox_runtime = code_sandbox_runtime
|
932
|
+
|
933
|
+
|
934
|
+
class AnthropicVisionAgentCoder(VisionAgentCoder):
|
935
|
+
"""Initializes Vision Agent Coder using Anthropic models for planning, coding, testing."""
|
936
|
+
|
937
|
+
def __init__(
|
938
|
+
self,
|
939
|
+
planner: Optional[LMM] = None,
|
940
|
+
coder: Optional[LMM] = None,
|
941
|
+
tester: Optional[LMM] = None,
|
942
|
+
debugger: Optional[LMM] = None,
|
943
|
+
tool_recommender: Optional[Sim] = None,
|
944
|
+
verbosity: int = 0,
|
945
|
+
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
946
|
+
code_sandbox_runtime: Optional[str] = None,
|
947
|
+
) -> None:
|
948
|
+
# NOTE: Claude doesn't have an official JSON mode
|
949
|
+
self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner
|
950
|
+
self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
|
951
|
+
self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
|
952
|
+
self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
|
926
953
|
self.verbosity = verbosity
|
927
954
|
if self.verbosity > 0:
|
928
955
|
_LOGGER.setLevel(logging.INFO)
|