vision-agent 0.2.199__tar.gz → 0.2.200__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.199 → vision_agent-0.2.200}/PKG-INFO +1 -1
- {vision_agent-0.2.199 → vision_agent-0.2.200}/pyproject.toml +1 -1
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/__init__.py +2 -1
- vision_agent-0.2.200/vision_agent/agent/agent.py +55 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/agent_utils.py +47 -34
- vision_agent-0.2.200/vision_agent/agent/types.py +51 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder_v2.py +131 -43
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner_v2.py +109 -50
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_prompts.py +4 -4
- vision_agent-0.2.200/vision_agent/agent/vision_agent_prompts_v2.py +46 -0
- vision_agent-0.2.200/vision_agent/agent/vision_agent_v2.py +215 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/utils/execute.py +1 -1
- vision_agent-0.2.199/vision_agent/agent/agent.py +0 -22
- {vision_agent-0.2.199 → vision_agent-0.2.200}/LICENSE +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/README.md +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/.sim_tools/df.csv +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/tools/planner_tools.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/tools/tools.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.199 → vision_agent-0.2.200}/vision_agent/utils/video.py +0 -0
@@ -1,4 +1,4 @@
|
|
1
|
-
from .agent import Agent
|
1
|
+
from .agent import Agent, AgentCoder, AgentPlanner
|
2
2
|
from .vision_agent import VisionAgent
|
3
3
|
from .vision_agent_coder import (
|
4
4
|
AnthropicVisionAgentCoder,
|
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
|
|
17
17
|
VisionAgentPlanner,
|
18
18
|
)
|
19
19
|
from .vision_agent_planner_v2 import VisionAgentPlannerV2
|
20
|
+
from .vision_agent_v2 import VisionAgentV2
|
@@ -0,0 +1,55 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Any, Dict, List, Optional, Union
|
4
|
+
|
5
|
+
from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
|
6
|
+
from vision_agent.lmm.types import Message
|
7
|
+
from vision_agent.utils.execute import CodeInterpreter
|
8
|
+
|
9
|
+
|
10
|
+
class Agent(ABC):
|
11
|
+
@abstractmethod
|
12
|
+
def __call__(
|
13
|
+
self,
|
14
|
+
input: Union[str, List[Message]],
|
15
|
+
media: Optional[Union[str, Path]] = None,
|
16
|
+
) -> Union[str, List[Message]]:
|
17
|
+
pass
|
18
|
+
|
19
|
+
@abstractmethod
|
20
|
+
def log_progress(self, data: Dict[str, Any]) -> None:
|
21
|
+
"""Log the progress of the agent.
|
22
|
+
This is a hook that is intended for reporting the progress of the agent.
|
23
|
+
"""
|
24
|
+
pass
|
25
|
+
|
26
|
+
|
27
|
+
class AgentCoder(Agent):
|
28
|
+
@abstractmethod
|
29
|
+
def generate_code(
|
30
|
+
self,
|
31
|
+
chat: List[AgentMessage],
|
32
|
+
max_steps: Optional[int] = None,
|
33
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
34
|
+
) -> CodeContext:
|
35
|
+
pass
|
36
|
+
|
37
|
+
@abstractmethod
|
38
|
+
def generate_code_from_plan(
|
39
|
+
self,
|
40
|
+
chat: List[AgentMessage],
|
41
|
+
plan_context: PlanContext,
|
42
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
43
|
+
) -> CodeContext:
|
44
|
+
pass
|
45
|
+
|
46
|
+
|
47
|
+
class AgentPlanner(Agent):
|
48
|
+
@abstractmethod
|
49
|
+
def generate_plan(
|
50
|
+
self,
|
51
|
+
chat: List[AgentMessage],
|
52
|
+
max_steps: Optional[int] = None,
|
53
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
54
|
+
) -> PlanContext:
|
55
|
+
pass
|
@@ -4,16 +4,17 @@ import logging
|
|
4
4
|
import re
|
5
5
|
import sys
|
6
6
|
import tempfile
|
7
|
-
from
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
8
9
|
|
9
10
|
import libcst as cst
|
10
|
-
from pydantic import BaseModel
|
11
11
|
from rich.console import Console
|
12
12
|
from rich.style import Style
|
13
13
|
from rich.syntax import Syntax
|
14
14
|
from rich.table import Table
|
15
15
|
|
16
16
|
import vision_agent.tools as T
|
17
|
+
from vision_agent.agent.types import AgentMessage, PlanContext
|
17
18
|
from vision_agent.lmm.types import Message
|
18
19
|
from vision_agent.utils.execute import CodeInterpreter, Execution
|
19
20
|
from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
|
@@ -24,19 +25,6 @@ _CONSOLE = Console()
|
|
24
25
|
_MAX_TABULATE_COL_WIDTH = 80
|
25
26
|
|
26
27
|
|
27
|
-
class PlanContext(BaseModel):
|
28
|
-
plan: str
|
29
|
-
instructions: List[str]
|
30
|
-
code: str
|
31
|
-
|
32
|
-
|
33
|
-
class CodeContext(BaseModel):
|
34
|
-
code: str
|
35
|
-
test: str
|
36
|
-
success: bool
|
37
|
-
test_result: Execution
|
38
|
-
|
39
|
-
|
40
28
|
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
|
41
29
|
json_pattern = r"\{.*\}"
|
42
30
|
match = re.search(json_pattern, json_str, re.DOTALL)
|
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
|
|
228
216
|
|
229
217
|
|
230
218
|
def add_media_to_chat(
|
231
|
-
chat: List[
|
232
|
-
) -> Tuple[List[
|
219
|
+
chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
|
220
|
+
) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
|
233
221
|
orig_chat = copy.deepcopy(chat)
|
234
222
|
int_chat = copy.deepcopy(chat)
|
235
|
-
media_list = []
|
223
|
+
media_list: List[Union[str, Path]] = []
|
236
224
|
for chat_i in int_chat:
|
237
|
-
if
|
238
|
-
media_list_i = []
|
239
|
-
for media in chat_i
|
225
|
+
if chat_i.media is not None:
|
226
|
+
media_list_i: List[Union[str, Path]] = []
|
227
|
+
for media in chat_i.media:
|
240
228
|
if isinstance(media, str) and media.startswith("data:image/"):
|
241
229
|
media_pil = b64_to_pil(media)
|
242
230
|
with tempfile.NamedTemporaryFile(
|
@@ -244,25 +232,29 @@ def add_media_to_chat(
|
|
244
232
|
) as temp_file:
|
245
233
|
media_pil.save(temp_file, format="PNG")
|
246
234
|
media = str(temp_file.name)
|
247
|
-
|
235
|
+
if code_interpreter is not None:
|
236
|
+
media = str(code_interpreter.upload_file(media))
|
248
237
|
media_list_i.append(media)
|
249
|
-
# don't duplicate appending media name
|
250
|
-
if
|
251
|
-
chat_i
|
252
|
-
|
238
|
+
# don't duplicate appending media name and only add them for user messages
|
239
|
+
if (
|
240
|
+
not str(chat_i.content).endswith(f" Media name {media}")
|
241
|
+
and chat_i.role == "user"
|
242
|
+
):
|
243
|
+
chat_i.content += f" Media name {media}"
|
244
|
+
chat_i.media = media_list_i if len(media_list_i) > 0 else None
|
253
245
|
media_list.extend(media_list_i)
|
254
246
|
|
255
247
|
int_chat = cast(
|
256
|
-
List[
|
248
|
+
List[AgentMessage],
|
257
249
|
[
|
258
250
|
(
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
if
|
265
|
-
else
|
251
|
+
AgentMessage(
|
252
|
+
role=c.role,
|
253
|
+
content=c.content,
|
254
|
+
media=c.media,
|
255
|
+
)
|
256
|
+
if c.media is not None
|
257
|
+
else AgentMessage(role=c.role, content=c.content, media=None)
|
266
258
|
)
|
267
259
|
for c in int_chat
|
268
260
|
],
|
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
|
|
283
275
|
return images
|
284
276
|
|
285
277
|
|
278
|
+
def convert_message_to_agentmessage(
|
279
|
+
input: Union[str, List[Message]],
|
280
|
+
media: Optional[Union[str, Path]] = None,
|
281
|
+
) -> List[AgentMessage]:
|
282
|
+
if isinstance(input, str):
|
283
|
+
input_msg = [
|
284
|
+
AgentMessage(
|
285
|
+
role="user",
|
286
|
+
content=input,
|
287
|
+
media=([media] if media is not None else None),
|
288
|
+
)
|
289
|
+
]
|
290
|
+
else:
|
291
|
+
input_msg = [
|
292
|
+
AgentMessage(role=msg["role"], content=msg["content"], media=None)
|
293
|
+
for msg in input
|
294
|
+
]
|
295
|
+
input_msg[0].media = [media] if media is not None else None
|
296
|
+
return input_msg
|
297
|
+
|
298
|
+
|
286
299
|
def strip_function_calls( # noqa: C901
|
287
300
|
code: str, exclusions: Optional[List[str]] = None
|
288
301
|
) -> str:
|
@@ -0,0 +1,51 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import List, Literal, Optional, Union
|
3
|
+
|
4
|
+
from pydantic import BaseModel
|
5
|
+
|
6
|
+
from vision_agent.utils.execute import Execution
|
7
|
+
|
8
|
+
|
9
|
+
class AgentMessage(BaseModel):
|
10
|
+
"""AgentMessage encompasses messages sent to the entire Agentic system, which includes
|
11
|
+
both LMMs and sub-agents.
|
12
|
+
|
13
|
+
user: The user's message.
|
14
|
+
assistant: The assistant's message.
|
15
|
+
observation: An observation made after conducting an action, either by the user or
|
16
|
+
assistant.
|
17
|
+
interaction: An interaction between the user and the assistant. For example if the
|
18
|
+
assistant wants to ask the user for help on a task, it could send an
|
19
|
+
interaction message.
|
20
|
+
conversation: Messages coming from the conversation agent, this is a type of
|
21
|
+
assistant messages.
|
22
|
+
planner: Messages coming from the planner agent, this is a type of assistant
|
23
|
+
messages.
|
24
|
+
coder: Messages coming from the coder agent, this is a type of assistant messages.
|
25
|
+
|
26
|
+
"""
|
27
|
+
|
28
|
+
role: Union[
|
29
|
+
Literal["user"],
|
30
|
+
Literal["assistant"], # planner, coder and conversation are of type assistant
|
31
|
+
Literal["observation"],
|
32
|
+
Literal["interaction"],
|
33
|
+
Literal["conversation"],
|
34
|
+
Literal["planner"],
|
35
|
+
Literal["coder"],
|
36
|
+
]
|
37
|
+
content: str
|
38
|
+
media: Optional[List[Union[str, Path]]] = None
|
39
|
+
|
40
|
+
|
41
|
+
class PlanContext(BaseModel):
|
42
|
+
plan: str
|
43
|
+
instructions: List[str]
|
44
|
+
code: str
|
45
|
+
|
46
|
+
|
47
|
+
class CodeContext(BaseModel):
|
48
|
+
code: str
|
49
|
+
test: str
|
50
|
+
success: bool
|
51
|
+
test_result: Execution
|
@@ -6,19 +6,19 @@ from rich.console import Console
|
|
6
6
|
from rich.markup import escape
|
7
7
|
|
8
8
|
import vision_agent.tools as T
|
9
|
-
from vision_agent.agent import
|
9
|
+
from vision_agent.agent import AgentCoder, AgentPlanner
|
10
10
|
from vision_agent.agent.agent_utils import (
|
11
|
-
CodeContext,
|
12
11
|
DefaultImports,
|
13
|
-
PlanContext,
|
14
12
|
add_media_to_chat,
|
15
13
|
capture_media_from_exec,
|
14
|
+
convert_message_to_agentmessage,
|
16
15
|
extract_tag,
|
17
16
|
format_feedback,
|
18
17
|
format_plan_v2,
|
19
18
|
print_code,
|
20
19
|
strip_function_calls,
|
21
20
|
)
|
21
|
+
from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
|
22
22
|
from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
|
23
23
|
from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
|
24
24
|
from vision_agent.lmm import LMM, AnthropicLMM
|
@@ -34,6 +34,12 @@ from vision_agent.utils.sim import Sim, load_cached_sim
|
|
34
34
|
_CONSOLE = Console()
|
35
35
|
|
36
36
|
|
37
|
+
def format_code_context(
|
38
|
+
code_context: CodeContext,
|
39
|
+
) -> str:
|
40
|
+
return f"<final_code>{code_context.code}</final_code>\n<final_test>{code_context.test}</final_test>"
|
41
|
+
|
42
|
+
|
37
43
|
def retrieve_tools(
|
38
44
|
plan: List[str],
|
39
45
|
tool_recommender: Sim,
|
@@ -49,46 +55,54 @@ def retrieve_tools(
|
|
49
55
|
|
50
56
|
def write_code(
|
51
57
|
coder: LMM,
|
52
|
-
chat: List[
|
58
|
+
chat: List[AgentMessage],
|
53
59
|
tool_docs: str,
|
54
60
|
plan: str,
|
55
61
|
) -> str:
|
56
62
|
chat = copy.deepcopy(chat)
|
57
|
-
if chat[-1]
|
63
|
+
if chat[-1].role != "user":
|
58
64
|
raise ValueError("Last chat message must be from the user.")
|
59
65
|
|
60
|
-
user_request = chat[-1]
|
66
|
+
user_request = chat[-1].content
|
61
67
|
prompt = CODE.format(
|
62
68
|
docstring=tool_docs,
|
63
69
|
question=user_request,
|
64
70
|
plan=plan,
|
65
71
|
)
|
66
|
-
|
67
|
-
|
68
|
-
|
72
|
+
response = cast(str, coder([{"role": "user", "content": prompt}], stream=False))
|
73
|
+
maybe_code = extract_tag(response, "code")
|
74
|
+
|
75
|
+
# if the response wasn't properly formatted with the code tags just return the response
|
76
|
+
if maybe_code is None:
|
77
|
+
return response
|
78
|
+
return maybe_code
|
69
79
|
|
70
80
|
|
71
81
|
def write_test(
|
72
82
|
tester: LMM,
|
73
|
-
chat: List[
|
83
|
+
chat: List[AgentMessage],
|
74
84
|
tool_util_docs: str,
|
75
85
|
code: str,
|
76
86
|
media_list: Optional[Sequence[Union[str, Path]]] = None,
|
77
87
|
) -> str:
|
78
88
|
chat = copy.deepcopy(chat)
|
79
|
-
if chat[-1]
|
89
|
+
if chat[-1].role != "user":
|
80
90
|
raise ValueError("Last chat message must be from the user.")
|
81
91
|
|
82
|
-
user_request = chat[-1]
|
92
|
+
user_request = chat[-1].content
|
83
93
|
prompt = TEST.format(
|
84
94
|
docstring=tool_util_docs,
|
85
95
|
question=user_request,
|
86
96
|
code=code,
|
87
97
|
media=media_list,
|
88
98
|
)
|
89
|
-
|
90
|
-
|
91
|
-
|
99
|
+
response = cast(str, tester([{"role": "user", "content": prompt}], stream=False))
|
100
|
+
maybe_code = extract_tag(response, "code")
|
101
|
+
|
102
|
+
# if the response wasn't properly formatted with the code tags just return the response
|
103
|
+
if maybe_code is None:
|
104
|
+
return response
|
105
|
+
return maybe_code
|
92
106
|
|
93
107
|
|
94
108
|
def debug_code(
|
@@ -170,12 +184,11 @@ def write_and_test_code(
|
|
170
184
|
coder: LMM,
|
171
185
|
tester: LMM,
|
172
186
|
debugger: LMM,
|
173
|
-
chat: List[
|
187
|
+
chat: List[AgentMessage],
|
174
188
|
plan: str,
|
175
189
|
tool_docs: str,
|
176
190
|
code_interpreter: CodeInterpreter,
|
177
191
|
media_list: List[Union[str, Path]],
|
178
|
-
update_callback: Callable[[Dict[str, Any]], None],
|
179
192
|
verbose: bool,
|
180
193
|
) -> CodeContext:
|
181
194
|
code = write_code(
|
@@ -226,14 +239,6 @@ def write_and_test_code(
|
|
226
239
|
f"[bold cyan]Code execution result after attempted fix:[/bold cyan] [yellow]{escape(result.text(include_logs=True))}[/yellow]"
|
227
240
|
)
|
228
241
|
|
229
|
-
update_callback(
|
230
|
-
{
|
231
|
-
"role": "assistant",
|
232
|
-
"content": f"<final_code>{DefaultImports.to_code_string()}\n{code}</final_code>\n<final_test>{DefaultImports.to_code_string()}\n{test}</final_test>",
|
233
|
-
"media": capture_media_from_exec(result),
|
234
|
-
}
|
235
|
-
)
|
236
|
-
|
237
242
|
return CodeContext(
|
238
243
|
code=f"{DefaultImports.to_code_string()}\n{code}",
|
239
244
|
test=f"{DefaultImports.to_code_string()}\n{test}",
|
@@ -242,10 +247,12 @@ def write_and_test_code(
|
|
242
247
|
)
|
243
248
|
|
244
249
|
|
245
|
-
class VisionAgentCoderV2(
|
250
|
+
class VisionAgentCoderV2(AgentCoder):
|
251
|
+
"""VisionAgentCoderV2 is an agent that will write vision code for you."""
|
252
|
+
|
246
253
|
def __init__(
|
247
254
|
self,
|
248
|
-
planner: Optional[
|
255
|
+
planner: Optional[AgentPlanner] = None,
|
249
256
|
coder: Optional[LMM] = None,
|
250
257
|
tester: Optional[LMM] = None,
|
251
258
|
debugger: Optional[LMM] = None,
|
@@ -254,6 +261,25 @@ class VisionAgentCoderV2(Agent):
|
|
254
261
|
code_sandbox_runtime: Optional[str] = None,
|
255
262
|
update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
|
256
263
|
) -> None:
|
264
|
+
"""Initialize the VisionAgentCoderV2.
|
265
|
+
|
266
|
+
Parameters:
|
267
|
+
planner (Optional[AgentPlanner]): The planner agent to use for generating
|
268
|
+
vision plans. If None, a default VisionAgentPlannerV2 will be used.
|
269
|
+
coder (Optional[LMM]): The language model to use for the coder agent. If
|
270
|
+
None, a default AnthropicLMM will be used.
|
271
|
+
tester (Optional[LMM]): The language model to use for the tester agent. If
|
272
|
+
None, a default AnthropicLMM will be used.
|
273
|
+
debugger (Optional[LMM]): The language model to use for the debugger agent.
|
274
|
+
tool_recommender (Optional[Union[str, Sim]]): The tool recommender to use.
|
275
|
+
verbose (bool): Whether to print out debug information.
|
276
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
|
277
|
+
be one of: None, "local" or "e2b". If None, it will read from the
|
278
|
+
environment variable CODE_SANDBOX_RUNTIME.
|
279
|
+
update_callback (Callable[[Dict[str, Any]], None]): The callback function
|
280
|
+
that will send back intermediate conversation messages.
|
281
|
+
"""
|
282
|
+
|
257
283
|
self.planner = (
|
258
284
|
planner
|
259
285
|
if planner is not None
|
@@ -290,20 +316,52 @@ class VisionAgentCoderV2(Agent):
|
|
290
316
|
self,
|
291
317
|
input: Union[str, List[Message]],
|
292
318
|
media: Optional[Union[str, Path]] = None,
|
293
|
-
) ->
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
input[
|
298
|
-
|
299
|
-
|
300
|
-
|
319
|
+
) -> str:
|
320
|
+
"""Generate vision code from a conversation.
|
321
|
+
|
322
|
+
Parameters:
|
323
|
+
input (Union[str, List[Message]]): The input to the agent. This can be a
|
324
|
+
string or a list of messages in the format of [{"role": "user",
|
325
|
+
"content": "describe your task here..."}, ...].
|
326
|
+
media (Optional[Union[str, Path]]): The path to the media file to use with
|
327
|
+
the input. This can be an image or video file.
|
328
|
+
|
329
|
+
Returns:
|
330
|
+
str: The generated code as a string.
|
331
|
+
"""
|
332
|
+
|
333
|
+
input_msg = convert_message_to_agentmessage(input, media)
|
334
|
+
return self.generate_code(input_msg).code
|
335
|
+
|
336
|
+
def generate_code(
|
337
|
+
self,
|
338
|
+
chat: List[AgentMessage],
|
339
|
+
max_steps: Optional[int] = None,
|
340
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
341
|
+
) -> CodeContext:
|
342
|
+
"""Generate vision code from a conversation.
|
343
|
+
|
344
|
+
Parameters:
|
345
|
+
chat (List[AgentMessage]): The input to the agent. This should be a list of
|
346
|
+
AgentMessage objects.
|
347
|
+
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
348
|
+
|
349
|
+
Returns:
|
350
|
+
CodeContext: The generated code as a CodeContext object which includes the
|
351
|
+
code, test code, whether or not it was executed successfully, and the
|
352
|
+
execution result.
|
353
|
+
"""
|
354
|
+
|
301
355
|
chat = copy.deepcopy(chat)
|
302
|
-
with
|
303
|
-
self.code_sandbox_runtime
|
356
|
+
with (
|
357
|
+
CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
|
358
|
+
if code_interpreter is None
|
359
|
+
else code_interpreter
|
304
360
|
) as code_interpreter:
|
305
361
|
int_chat, orig_chat, _ = add_media_to_chat(chat, code_interpreter)
|
306
|
-
plan_context = self.planner.generate_plan(
|
362
|
+
plan_context = self.planner.generate_plan(
|
363
|
+
int_chat, max_steps=max_steps, code_interpreter=code_interpreter
|
364
|
+
)
|
307
365
|
code_context = self.generate_code_from_plan(
|
308
366
|
orig_chat,
|
309
367
|
plan_context,
|
@@ -313,13 +371,30 @@ class VisionAgentCoderV2(Agent):
|
|
313
371
|
|
314
372
|
def generate_code_from_plan(
|
315
373
|
self,
|
316
|
-
chat: List[
|
374
|
+
chat: List[AgentMessage],
|
317
375
|
plan_context: PlanContext,
|
318
376
|
code_interpreter: Optional[CodeInterpreter] = None,
|
319
377
|
) -> CodeContext:
|
378
|
+
"""Generate vision code from a conversation and a previously made plan. This
|
379
|
+
will skip the planning step and go straight to generating code.
|
380
|
+
|
381
|
+
Parameters:
|
382
|
+
chat (List[AgentMessage]): The input to the agent. This should be a list of
|
383
|
+
AgentMessage objects.
|
384
|
+
plan_context (PlanContext): The plan context that was previously generated.
|
385
|
+
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
386
|
+
|
387
|
+
Returns:
|
388
|
+
CodeContext: The generated code as a CodeContext object which includes the
|
389
|
+
code, test code, whether or not it was executed successfully, and the
|
390
|
+
execution result.
|
391
|
+
"""
|
392
|
+
|
320
393
|
chat = copy.deepcopy(chat)
|
321
|
-
with
|
322
|
-
self.code_sandbox_runtime
|
394
|
+
with (
|
395
|
+
CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
|
396
|
+
if code_interpreter is None
|
397
|
+
else code_interpreter
|
323
398
|
) as code_interpreter:
|
324
399
|
int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
|
325
400
|
tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
|
@@ -331,10 +406,23 @@ class VisionAgentCoderV2(Agent):
|
|
331
406
|
plan=format_plan_v2(plan_context),
|
332
407
|
tool_docs=tool_docs,
|
333
408
|
code_interpreter=code_interpreter,
|
334
|
-
media_list=media_list,
|
335
|
-
update_callback=self.update_callback,
|
409
|
+
media_list=media_list,
|
336
410
|
verbose=self.verbose,
|
337
411
|
)
|
412
|
+
|
413
|
+
self.update_callback(
|
414
|
+
{
|
415
|
+
"role": "coder",
|
416
|
+
"content": format_code_context(code_context),
|
417
|
+
"media": capture_media_from_exec(code_context.test_result),
|
418
|
+
}
|
419
|
+
)
|
420
|
+
self.update_callback(
|
421
|
+
{
|
422
|
+
"role": "observation",
|
423
|
+
"content": code_context.test_result.text(),
|
424
|
+
}
|
425
|
+
)
|
338
426
|
return code_context
|
339
427
|
|
340
428
|
def log_progress(self, data: Dict[str, Any]) -> None:
|