vision-agent 0.2.198__py3-none-any.whl → 0.2.200__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/.sim_tools/df.csv +18 -18
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent.py +33 -0
- vision_agent/agent/agent_utils.py +47 -34
- vision_agent/agent/types.py +51 -0
- vision_agent/agent/vision_agent_coder_v2.py +131 -43
- vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
- vision_agent/agent/vision_agent_planner_v2.py +109 -50
- vision_agent/agent/vision_agent_prompts.py +4 -4
- vision_agent/agent/vision_agent_prompts_v2.py +46 -0
- vision_agent/agent/vision_agent_v2.py +215 -0
- vision_agent/tools/tools.py +1 -1
- vision_agent/utils/execute.py +1 -1
- {vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/METADATA +1 -1
- {vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/RECORD +18 -15
- {vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -80,24 +80,6 @@ desc,doc,name
|
|
80
80
|
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
|
81
81
|
]
|
82
82
|
",ocr
|
83
|
-
'clip' is a tool that can classify an image or a cropped detection given a list of input classes or tags. It returns the same list of the input classes along with their probability scores based on image content.,"clip(image: numpy.ndarray, classes: List[str]) -> Dict[str, Any]:
|
84
|
-
'clip' is a tool that can classify an image or a cropped detection given a list
|
85
|
-
of input classes or tags. It returns the same list of the input classes along with
|
86
|
-
their probability scores based on image content.
|
87
|
-
|
88
|
-
Parameters:
|
89
|
-
image (np.ndarray): The image to classify or tag
|
90
|
-
classes (List[str]): The list of classes or tags that is associated with the image
|
91
|
-
|
92
|
-
Returns:
|
93
|
-
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
94
|
-
contains a list of given labels and other a list of scores.
|
95
|
-
|
96
|
-
Example
|
97
|
-
-------
|
98
|
-
>>> clip(image, ['dog', 'cat', 'bird'])
|
99
|
-
{""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
|
100
|
-
",clip
|
101
83
|
'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
|
102
84
|
'vit_image_classification' is a tool that can classify an image. It returns a
|
103
85
|
list of classes and their probability scores based on image content.
|
@@ -488,6 +470,24 @@ desc,doc,name
|
|
488
470
|
... )
|
489
471
|
>>> save_image(result, ""inpainted_room.png"")
|
490
472
|
",flux_image_inpainting
|
473
|
+
'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
|
474
|
+
'siglip_classification' is a tool that can classify an image or a cropped detection given a list
|
475
|
+
of input labels or tags. It returns the same list of the input labels along with
|
476
|
+
their probability scores based on image content.
|
477
|
+
|
478
|
+
Parameters:
|
479
|
+
image (np.ndarray): The image to classify or tag
|
480
|
+
labels (List[str]): The list of labels or tags that is associated with the image
|
481
|
+
|
482
|
+
Returns:
|
483
|
+
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
484
|
+
contains a list of given labels and other a list of scores.
|
485
|
+
|
486
|
+
Example
|
487
|
+
-------
|
488
|
+
>>> siglip_classification(image, ['dog', 'cat', 'bird'])
|
489
|
+
{""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
|
490
|
+
",siglip_classification
|
491
491
|
"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
|
492
492
|
'extract_frames_and_timestamps' extracts frames and timestamps from a video
|
493
493
|
which can be a file path, url or youtube link, returns a list of dictionaries
|
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
|
vision_agent/agent/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from .agent import Agent
|
1
|
+
from .agent import Agent, AgentCoder, AgentPlanner
|
2
2
|
from .vision_agent import VisionAgent
|
3
3
|
from .vision_agent_coder import (
|
4
4
|
AnthropicVisionAgentCoder,
|
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
|
|
17
17
|
VisionAgentPlanner,
|
18
18
|
)
|
19
19
|
from .vision_agent_planner_v2 import VisionAgentPlannerV2
|
20
|
+
from .vision_agent_v2 import VisionAgentV2
|
vision_agent/agent/agent.py
CHANGED
@@ -2,7 +2,9 @@ from abc import ABC, abstractmethod
|
|
2
2
|
from pathlib import Path
|
3
3
|
from typing import Any, Dict, List, Optional, Union
|
4
4
|
|
5
|
+
from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
|
5
6
|
from vision_agent.lmm.types import Message
|
7
|
+
from vision_agent.utils.execute import CodeInterpreter
|
6
8
|
|
7
9
|
|
8
10
|
class Agent(ABC):
|
@@ -20,3 +22,34 @@ class Agent(ABC):
|
|
20
22
|
This is a hook that is intended for reporting the progress of the agent.
|
21
23
|
"""
|
22
24
|
pass
|
25
|
+
|
26
|
+
|
27
|
+
class AgentCoder(Agent):
|
28
|
+
@abstractmethod
|
29
|
+
def generate_code(
|
30
|
+
self,
|
31
|
+
chat: List[AgentMessage],
|
32
|
+
max_steps: Optional[int] = None,
|
33
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
34
|
+
) -> CodeContext:
|
35
|
+
pass
|
36
|
+
|
37
|
+
@abstractmethod
|
38
|
+
def generate_code_from_plan(
|
39
|
+
self,
|
40
|
+
chat: List[AgentMessage],
|
41
|
+
plan_context: PlanContext,
|
42
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
43
|
+
) -> CodeContext:
|
44
|
+
pass
|
45
|
+
|
46
|
+
|
47
|
+
class AgentPlanner(Agent):
|
48
|
+
@abstractmethod
|
49
|
+
def generate_plan(
|
50
|
+
self,
|
51
|
+
chat: List[AgentMessage],
|
52
|
+
max_steps: Optional[int] = None,
|
53
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
54
|
+
) -> PlanContext:
|
55
|
+
pass
|
@@ -4,16 +4,17 @@ import logging
|
|
4
4
|
import re
|
5
5
|
import sys
|
6
6
|
import tempfile
|
7
|
-
from
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
8
9
|
|
9
10
|
import libcst as cst
|
10
|
-
from pydantic import BaseModel
|
11
11
|
from rich.console import Console
|
12
12
|
from rich.style import Style
|
13
13
|
from rich.syntax import Syntax
|
14
14
|
from rich.table import Table
|
15
15
|
|
16
16
|
import vision_agent.tools as T
|
17
|
+
from vision_agent.agent.types import AgentMessage, PlanContext
|
17
18
|
from vision_agent.lmm.types import Message
|
18
19
|
from vision_agent.utils.execute import CodeInterpreter, Execution
|
19
20
|
from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
|
@@ -24,19 +25,6 @@ _CONSOLE = Console()
|
|
24
25
|
_MAX_TABULATE_COL_WIDTH = 80
|
25
26
|
|
26
27
|
|
27
|
-
class PlanContext(BaseModel):
|
28
|
-
plan: str
|
29
|
-
instructions: List[str]
|
30
|
-
code: str
|
31
|
-
|
32
|
-
|
33
|
-
class CodeContext(BaseModel):
|
34
|
-
code: str
|
35
|
-
test: str
|
36
|
-
success: bool
|
37
|
-
test_result: Execution
|
38
|
-
|
39
|
-
|
40
28
|
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
|
41
29
|
json_pattern = r"\{.*\}"
|
42
30
|
match = re.search(json_pattern, json_str, re.DOTALL)
|
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
|
|
228
216
|
|
229
217
|
|
230
218
|
def add_media_to_chat(
|
231
|
-
chat: List[
|
232
|
-
) -> Tuple[List[
|
219
|
+
chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
|
220
|
+
) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
|
233
221
|
orig_chat = copy.deepcopy(chat)
|
234
222
|
int_chat = copy.deepcopy(chat)
|
235
|
-
media_list = []
|
223
|
+
media_list: List[Union[str, Path]] = []
|
236
224
|
for chat_i in int_chat:
|
237
|
-
if
|
238
|
-
media_list_i = []
|
239
|
-
for media in chat_i
|
225
|
+
if chat_i.media is not None:
|
226
|
+
media_list_i: List[Union[str, Path]] = []
|
227
|
+
for media in chat_i.media:
|
240
228
|
if isinstance(media, str) and media.startswith("data:image/"):
|
241
229
|
media_pil = b64_to_pil(media)
|
242
230
|
with tempfile.NamedTemporaryFile(
|
@@ -244,25 +232,29 @@ def add_media_to_chat(
|
|
244
232
|
) as temp_file:
|
245
233
|
media_pil.save(temp_file, format="PNG")
|
246
234
|
media = str(temp_file.name)
|
247
|
-
|
235
|
+
if code_interpreter is not None:
|
236
|
+
media = str(code_interpreter.upload_file(media))
|
248
237
|
media_list_i.append(media)
|
249
|
-
# don't duplicate appending media name
|
250
|
-
if
|
251
|
-
chat_i
|
252
|
-
|
238
|
+
# don't duplicate appending media name and only add them for user messages
|
239
|
+
if (
|
240
|
+
not str(chat_i.content).endswith(f" Media name {media}")
|
241
|
+
and chat_i.role == "user"
|
242
|
+
):
|
243
|
+
chat_i.content += f" Media name {media}"
|
244
|
+
chat_i.media = media_list_i if len(media_list_i) > 0 else None
|
253
245
|
media_list.extend(media_list_i)
|
254
246
|
|
255
247
|
int_chat = cast(
|
256
|
-
List[
|
248
|
+
List[AgentMessage],
|
257
249
|
[
|
258
250
|
(
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
if
|
265
|
-
else
|
251
|
+
AgentMessage(
|
252
|
+
role=c.role,
|
253
|
+
content=c.content,
|
254
|
+
media=c.media,
|
255
|
+
)
|
256
|
+
if c.media is not None
|
257
|
+
else AgentMessage(role=c.role, content=c.content, media=None)
|
266
258
|
)
|
267
259
|
for c in int_chat
|
268
260
|
],
|
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
|
|
283
275
|
return images
|
284
276
|
|
285
277
|
|
278
|
+
def convert_message_to_agentmessage(
|
279
|
+
input: Union[str, List[Message]],
|
280
|
+
media: Optional[Union[str, Path]] = None,
|
281
|
+
) -> List[AgentMessage]:
|
282
|
+
if isinstance(input, str):
|
283
|
+
input_msg = [
|
284
|
+
AgentMessage(
|
285
|
+
role="user",
|
286
|
+
content=input,
|
287
|
+
media=([media] if media is not None else None),
|
288
|
+
)
|
289
|
+
]
|
290
|
+
else:
|
291
|
+
input_msg = [
|
292
|
+
AgentMessage(role=msg["role"], content=msg["content"], media=None)
|
293
|
+
for msg in input
|
294
|
+
]
|
295
|
+
input_msg[0].media = [media] if media is not None else None
|
296
|
+
return input_msg
|
297
|
+
|
298
|
+
|
286
299
|
def strip_function_calls( # noqa: C901
|
287
300
|
code: str, exclusions: Optional[List[str]] = None
|
288
301
|
) -> str:
|
@@ -0,0 +1,51 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import List, Literal, Optional, Union
|
3
|
+
|
4
|
+
from pydantic import BaseModel
|
5
|
+
|
6
|
+
from vision_agent.utils.execute import Execution
|
7
|
+
|
8
|
+
|
9
|
+
class AgentMessage(BaseModel):
|
10
|
+
"""AgentMessage encompasses messages sent to the entire Agentic system, which includes
|
11
|
+
both LMMs and sub-agents.
|
12
|
+
|
13
|
+
user: The user's message.
|
14
|
+
assistant: The assistant's message.
|
15
|
+
observation: An observation made after conducting an action, either by the user or
|
16
|
+
assistant.
|
17
|
+
interaction: An interaction between the user and the assistant. For example if the
|
18
|
+
assistant wants to ask the user for help on a task, it could send an
|
19
|
+
interaction message.
|
20
|
+
conversation: Messages coming from the conversation agent, this is a type of
|
21
|
+
assistant messages.
|
22
|
+
planner: Messages coming from the planner agent, this is a type of assistant
|
23
|
+
messages.
|
24
|
+
coder: Messages coming from the coder agent, this is a type of assistant messages.
|
25
|
+
|
26
|
+
"""
|
27
|
+
|
28
|
+
role: Union[
|
29
|
+
Literal["user"],
|
30
|
+
Literal["assistant"], # planner, coder and conversation are of type assistant
|
31
|
+
Literal["observation"],
|
32
|
+
Literal["interaction"],
|
33
|
+
Literal["conversation"],
|
34
|
+
Literal["planner"],
|
35
|
+
Literal["coder"],
|
36
|
+
]
|
37
|
+
content: str
|
38
|
+
media: Optional[List[Union[str, Path]]] = None
|
39
|
+
|
40
|
+
|
41
|
+
class PlanContext(BaseModel):
|
42
|
+
plan: str
|
43
|
+
instructions: List[str]
|
44
|
+
code: str
|
45
|
+
|
46
|
+
|
47
|
+
class CodeContext(BaseModel):
|
48
|
+
code: str
|
49
|
+
test: str
|
50
|
+
success: bool
|
51
|
+
test_result: Execution
|
@@ -6,19 +6,19 @@ from rich.console import Console
|
|
6
6
|
from rich.markup import escape
|
7
7
|
|
8
8
|
import vision_agent.tools as T
|
9
|
-
from vision_agent.agent import
|
9
|
+
from vision_agent.agent import AgentCoder, AgentPlanner
|
10
10
|
from vision_agent.agent.agent_utils import (
|
11
|
-
CodeContext,
|
12
11
|
DefaultImports,
|
13
|
-
PlanContext,
|
14
12
|
add_media_to_chat,
|
15
13
|
capture_media_from_exec,
|
14
|
+
convert_message_to_agentmessage,
|
16
15
|
extract_tag,
|
17
16
|
format_feedback,
|
18
17
|
format_plan_v2,
|
19
18
|
print_code,
|
20
19
|
strip_function_calls,
|
21
20
|
)
|
21
|
+
from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
|
22
22
|
from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
|
23
23
|
from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
|
24
24
|
from vision_agent.lmm import LMM, AnthropicLMM
|
@@ -34,6 +34,12 @@ from vision_agent.utils.sim import Sim, load_cached_sim
|
|
34
34
|
_CONSOLE = Console()
|
35
35
|
|
36
36
|
|
37
|
+
def format_code_context(
|
38
|
+
code_context: CodeContext,
|
39
|
+
) -> str:
|
40
|
+
return f"<final_code>{code_context.code}</final_code>\n<final_test>{code_context.test}</final_test>"
|
41
|
+
|
42
|
+
|
37
43
|
def retrieve_tools(
|
38
44
|
plan: List[str],
|
39
45
|
tool_recommender: Sim,
|
@@ -49,46 +55,54 @@ def retrieve_tools(
|
|
49
55
|
|
50
56
|
def write_code(
|
51
57
|
coder: LMM,
|
52
|
-
chat: List[
|
58
|
+
chat: List[AgentMessage],
|
53
59
|
tool_docs: str,
|
54
60
|
plan: str,
|
55
61
|
) -> str:
|
56
62
|
chat = copy.deepcopy(chat)
|
57
|
-
if chat[-1]
|
63
|
+
if chat[-1].role != "user":
|
58
64
|
raise ValueError("Last chat message must be from the user.")
|
59
65
|
|
60
|
-
user_request = chat[-1]
|
66
|
+
user_request = chat[-1].content
|
61
67
|
prompt = CODE.format(
|
62
68
|
docstring=tool_docs,
|
63
69
|
question=user_request,
|
64
70
|
plan=plan,
|
65
71
|
)
|
66
|
-
|
67
|
-
|
68
|
-
|
72
|
+
response = cast(str, coder([{"role": "user", "content": prompt}], stream=False))
|
73
|
+
maybe_code = extract_tag(response, "code")
|
74
|
+
|
75
|
+
# if the response wasn't properly formatted with the code tags just return the response
|
76
|
+
if maybe_code is None:
|
77
|
+
return response
|
78
|
+
return maybe_code
|
69
79
|
|
70
80
|
|
71
81
|
def write_test(
|
72
82
|
tester: LMM,
|
73
|
-
chat: List[
|
83
|
+
chat: List[AgentMessage],
|
74
84
|
tool_util_docs: str,
|
75
85
|
code: str,
|
76
86
|
media_list: Optional[Sequence[Union[str, Path]]] = None,
|
77
87
|
) -> str:
|
78
88
|
chat = copy.deepcopy(chat)
|
79
|
-
if chat[-1]
|
89
|
+
if chat[-1].role != "user":
|
80
90
|
raise ValueError("Last chat message must be from the user.")
|
81
91
|
|
82
|
-
user_request = chat[-1]
|
92
|
+
user_request = chat[-1].content
|
83
93
|
prompt = TEST.format(
|
84
94
|
docstring=tool_util_docs,
|
85
95
|
question=user_request,
|
86
96
|
code=code,
|
87
97
|
media=media_list,
|
88
98
|
)
|
89
|
-
|
90
|
-
|
91
|
-
|
99
|
+
response = cast(str, tester([{"role": "user", "content": prompt}], stream=False))
|
100
|
+
maybe_code = extract_tag(response, "code")
|
101
|
+
|
102
|
+
# if the response wasn't properly formatted with the code tags just return the response
|
103
|
+
if maybe_code is None:
|
104
|
+
return response
|
105
|
+
return maybe_code
|
92
106
|
|
93
107
|
|
94
108
|
def debug_code(
|
@@ -170,12 +184,11 @@ def write_and_test_code(
|
|
170
184
|
coder: LMM,
|
171
185
|
tester: LMM,
|
172
186
|
debugger: LMM,
|
173
|
-
chat: List[
|
187
|
+
chat: List[AgentMessage],
|
174
188
|
plan: str,
|
175
189
|
tool_docs: str,
|
176
190
|
code_interpreter: CodeInterpreter,
|
177
191
|
media_list: List[Union[str, Path]],
|
178
|
-
update_callback: Callable[[Dict[str, Any]], None],
|
179
192
|
verbose: bool,
|
180
193
|
) -> CodeContext:
|
181
194
|
code = write_code(
|
@@ -226,14 +239,6 @@ def write_and_test_code(
|
|
226
239
|
f"[bold cyan]Code execution result after attempted fix:[/bold cyan] [yellow]{escape(result.text(include_logs=True))}[/yellow]"
|
227
240
|
)
|
228
241
|
|
229
|
-
update_callback(
|
230
|
-
{
|
231
|
-
"role": "assistant",
|
232
|
-
"content": f"<final_code>{DefaultImports.to_code_string()}\n{code}</final_code>\n<final_test>{DefaultImports.to_code_string()}\n{test}</final_test>",
|
233
|
-
"media": capture_media_from_exec(result),
|
234
|
-
}
|
235
|
-
)
|
236
|
-
|
237
242
|
return CodeContext(
|
238
243
|
code=f"{DefaultImports.to_code_string()}\n{code}",
|
239
244
|
test=f"{DefaultImports.to_code_string()}\n{test}",
|
@@ -242,10 +247,12 @@ def write_and_test_code(
|
|
242
247
|
)
|
243
248
|
|
244
249
|
|
245
|
-
class VisionAgentCoderV2(
|
250
|
+
class VisionAgentCoderV2(AgentCoder):
|
251
|
+
"""VisionAgentCoderV2 is an agent that will write vision code for you."""
|
252
|
+
|
246
253
|
def __init__(
|
247
254
|
self,
|
248
|
-
planner: Optional[
|
255
|
+
planner: Optional[AgentPlanner] = None,
|
249
256
|
coder: Optional[LMM] = None,
|
250
257
|
tester: Optional[LMM] = None,
|
251
258
|
debugger: Optional[LMM] = None,
|
@@ -254,6 +261,25 @@ class VisionAgentCoderV2(Agent):
|
|
254
261
|
code_sandbox_runtime: Optional[str] = None,
|
255
262
|
update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
|
256
263
|
) -> None:
|
264
|
+
"""Initialize the VisionAgentCoderV2.
|
265
|
+
|
266
|
+
Parameters:
|
267
|
+
planner (Optional[AgentPlanner]): The planner agent to use for generating
|
268
|
+
vision plans. If None, a default VisionAgentPlannerV2 will be used.
|
269
|
+
coder (Optional[LMM]): The language model to use for the coder agent. If
|
270
|
+
None, a default AnthropicLMM will be used.
|
271
|
+
tester (Optional[LMM]): The language model to use for the tester agent. If
|
272
|
+
None, a default AnthropicLMM will be used.
|
273
|
+
debugger (Optional[LMM]): The language model to use for the debugger agent.
|
274
|
+
tool_recommender (Optional[Union[str, Sim]]): The tool recommender to use.
|
275
|
+
verbose (bool): Whether to print out debug information.
|
276
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
|
277
|
+
be one of: None, "local" or "e2b". If None, it will read from the
|
278
|
+
environment variable CODE_SANDBOX_RUNTIME.
|
279
|
+
update_callback (Callable[[Dict[str, Any]], None]): The callback function
|
280
|
+
that will send back intermediate conversation messages.
|
281
|
+
"""
|
282
|
+
|
257
283
|
self.planner = (
|
258
284
|
planner
|
259
285
|
if planner is not None
|
@@ -290,20 +316,52 @@ class VisionAgentCoderV2(Agent):
|
|
290
316
|
self,
|
291
317
|
input: Union[str, List[Message]],
|
292
318
|
media: Optional[Union[str, Path]] = None,
|
293
|
-
) ->
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
input[
|
298
|
-
|
299
|
-
|
300
|
-
|
319
|
+
) -> str:
|
320
|
+
"""Generate vision code from a conversation.
|
321
|
+
|
322
|
+
Parameters:
|
323
|
+
input (Union[str, List[Message]]): The input to the agent. This can be a
|
324
|
+
string or a list of messages in the format of [{"role": "user",
|
325
|
+
"content": "describe your task here..."}, ...].
|
326
|
+
media (Optional[Union[str, Path]]): The path to the media file to use with
|
327
|
+
the input. This can be an image or video file.
|
328
|
+
|
329
|
+
Returns:
|
330
|
+
str: The generated code as a string.
|
331
|
+
"""
|
332
|
+
|
333
|
+
input_msg = convert_message_to_agentmessage(input, media)
|
334
|
+
return self.generate_code(input_msg).code
|
335
|
+
|
336
|
+
def generate_code(
|
337
|
+
self,
|
338
|
+
chat: List[AgentMessage],
|
339
|
+
max_steps: Optional[int] = None,
|
340
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
341
|
+
) -> CodeContext:
|
342
|
+
"""Generate vision code from a conversation.
|
343
|
+
|
344
|
+
Parameters:
|
345
|
+
chat (List[AgentMessage]): The input to the agent. This should be a list of
|
346
|
+
AgentMessage objects.
|
347
|
+
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
348
|
+
|
349
|
+
Returns:
|
350
|
+
CodeContext: The generated code as a CodeContext object which includes the
|
351
|
+
code, test code, whether or not it was executed successfully, and the
|
352
|
+
execution result.
|
353
|
+
"""
|
354
|
+
|
301
355
|
chat = copy.deepcopy(chat)
|
302
|
-
with
|
303
|
-
self.code_sandbox_runtime
|
356
|
+
with (
|
357
|
+
CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
|
358
|
+
if code_interpreter is None
|
359
|
+
else code_interpreter
|
304
360
|
) as code_interpreter:
|
305
361
|
int_chat, orig_chat, _ = add_media_to_chat(chat, code_interpreter)
|
306
|
-
plan_context = self.planner.generate_plan(
|
362
|
+
plan_context = self.planner.generate_plan(
|
363
|
+
int_chat, max_steps=max_steps, code_interpreter=code_interpreter
|
364
|
+
)
|
307
365
|
code_context = self.generate_code_from_plan(
|
308
366
|
orig_chat,
|
309
367
|
plan_context,
|
@@ -313,13 +371,30 @@ class VisionAgentCoderV2(Agent):
|
|
313
371
|
|
314
372
|
def generate_code_from_plan(
|
315
373
|
self,
|
316
|
-
chat: List[
|
374
|
+
chat: List[AgentMessage],
|
317
375
|
plan_context: PlanContext,
|
318
376
|
code_interpreter: Optional[CodeInterpreter] = None,
|
319
377
|
) -> CodeContext:
|
378
|
+
"""Generate vision code from a conversation and a previously made plan. This
|
379
|
+
will skip the planning step and go straight to generating code.
|
380
|
+
|
381
|
+
Parameters:
|
382
|
+
chat (List[AgentMessage]): The input to the agent. This should be a list of
|
383
|
+
AgentMessage objects.
|
384
|
+
plan_context (PlanContext): The plan context that was previously generated.
|
385
|
+
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
386
|
+
|
387
|
+
Returns:
|
388
|
+
CodeContext: The generated code as a CodeContext object which includes the
|
389
|
+
code, test code, whether or not it was executed successfully, and the
|
390
|
+
execution result.
|
391
|
+
"""
|
392
|
+
|
320
393
|
chat = copy.deepcopy(chat)
|
321
|
-
with
|
322
|
-
self.code_sandbox_runtime
|
394
|
+
with (
|
395
|
+
CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
|
396
|
+
if code_interpreter is None
|
397
|
+
else code_interpreter
|
323
398
|
) as code_interpreter:
|
324
399
|
int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
|
325
400
|
tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
|
@@ -331,10 +406,23 @@ class VisionAgentCoderV2(Agent):
|
|
331
406
|
plan=format_plan_v2(plan_context),
|
332
407
|
tool_docs=tool_docs,
|
333
408
|
code_interpreter=code_interpreter,
|
334
|
-
media_list=media_list,
|
335
|
-
update_callback=self.update_callback,
|
409
|
+
media_list=media_list,
|
336
410
|
verbose=self.verbose,
|
337
411
|
)
|
412
|
+
|
413
|
+
self.update_callback(
|
414
|
+
{
|
415
|
+
"role": "coder",
|
416
|
+
"content": format_code_context(code_context),
|
417
|
+
"media": capture_media_from_exec(code_context.test_result),
|
418
|
+
}
|
419
|
+
)
|
420
|
+
self.update_callback(
|
421
|
+
{
|
422
|
+
"role": "observation",
|
423
|
+
"content": code_context.test_result.text(),
|
424
|
+
}
|
425
|
+
)
|
338
426
|
return code_context
|
339
427
|
|
340
428
|
def log_progress(self, data: Dict[str, Any]) -> None:
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
|
+
import time
|
3
4
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
4
5
|
from pathlib import Path
|
5
6
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
@@ -10,16 +11,17 @@ from rich.markup import escape
|
|
10
11
|
|
11
12
|
import vision_agent.tools as T
|
12
13
|
import vision_agent.tools.planner_tools as pt
|
13
|
-
from vision_agent.agent import
|
14
|
+
from vision_agent.agent import AgentPlanner
|
14
15
|
from vision_agent.agent.agent_utils import (
|
15
|
-
PlanContext,
|
16
16
|
add_media_to_chat,
|
17
17
|
capture_media_from_exec,
|
18
|
+
convert_message_to_agentmessage,
|
18
19
|
extract_json,
|
19
20
|
extract_tag,
|
20
21
|
print_code,
|
21
22
|
print_table,
|
22
23
|
)
|
24
|
+
from vision_agent.agent.types import AgentMessage, PlanContext
|
23
25
|
from vision_agent.agent.vision_agent_planner_prompts_v2 import (
|
24
26
|
CRITIQUE_PLAN,
|
25
27
|
EXAMPLE_PLAN1,
|
@@ -70,26 +72,24 @@ class DefaultPlanningImports:
|
|
70
72
|
|
71
73
|
|
72
74
|
def get_planning(
|
73
|
-
chat: List[
|
75
|
+
chat: List[AgentMessage],
|
74
76
|
) -> str:
|
75
77
|
chat = copy.deepcopy(chat)
|
76
78
|
planning = ""
|
77
79
|
for chat_i in chat:
|
78
|
-
if chat_i
|
79
|
-
planning += f"USER: {chat_i
|
80
|
-
elif chat_i
|
81
|
-
planning += f"OBSERVATION: {chat_i
|
82
|
-
elif chat_i
|
83
|
-
planning += f"
|
84
|
-
else:
|
85
|
-
raise ValueError(f"Unknown role: {chat_i['role']}")
|
80
|
+
if chat_i.role == "user":
|
81
|
+
planning += f"USER: {chat_i.content}\n\n"
|
82
|
+
elif chat_i.role == "observation":
|
83
|
+
planning += f"OBSERVATION: {chat_i.content}\n\n"
|
84
|
+
elif chat_i.role == "planner":
|
85
|
+
planning += f"AGENT: {chat_i.content}\n\n"
|
86
86
|
|
87
87
|
return planning
|
88
88
|
|
89
89
|
|
90
90
|
def run_planning(
|
91
|
-
chat: List[
|
92
|
-
media_list: List[str],
|
91
|
+
chat: List[AgentMessage],
|
92
|
+
media_list: List[Union[str, Path]],
|
93
93
|
model: LMM,
|
94
94
|
) -> str:
|
95
95
|
# only keep last 10 messages for planning
|
@@ -102,16 +102,16 @@ def run_planning(
|
|
102
102
|
)
|
103
103
|
|
104
104
|
message: Message = {"role": "user", "content": prompt}
|
105
|
-
if chat[-1]
|
106
|
-
message["media"] = chat[-1]
|
105
|
+
if chat[-1].role == "observation" and chat[-1].media is not None:
|
106
|
+
message["media"] = chat[-1].media
|
107
107
|
|
108
108
|
response = model.chat([message])
|
109
109
|
return cast(str, response)
|
110
110
|
|
111
111
|
|
112
112
|
def run_multi_trial_planning(
|
113
|
-
chat: List[
|
114
|
-
media_list: List[str],
|
113
|
+
chat: List[AgentMessage],
|
114
|
+
media_list: List[Union[str, Path]],
|
115
115
|
model: LMM,
|
116
116
|
) -> str:
|
117
117
|
planning = get_planning(chat)
|
@@ -123,8 +123,8 @@ def run_multi_trial_planning(
|
|
123
123
|
)
|
124
124
|
|
125
125
|
message: Message = {"role": "user", "content": prompt}
|
126
|
-
if chat[-1]
|
127
|
-
message["media"] = chat[-1]
|
126
|
+
if chat[-1].role == "observation" and chat[-1].media is not None:
|
127
|
+
message["media"] = chat[-1].media
|
128
128
|
|
129
129
|
responses = []
|
130
130
|
with ThreadPoolExecutor() as executor:
|
@@ -151,7 +151,9 @@ def run_multi_trial_planning(
|
|
151
151
|
return cast(str, responses[0])
|
152
152
|
|
153
153
|
|
154
|
-
def run_critic(
|
154
|
+
def run_critic(
|
155
|
+
chat: List[AgentMessage], media_list: List[Union[str, Path]], model: LMM
|
156
|
+
) -> Optional[str]:
|
155
157
|
planning = get_planning(chat)
|
156
158
|
prompt = CRITIQUE_PLAN.format(
|
157
159
|
planning=planning,
|
@@ -196,17 +198,19 @@ def response_safeguards(response: str) -> str:
|
|
196
198
|
def execute_code_action(
|
197
199
|
code: str,
|
198
200
|
code_interpreter: CodeInterpreter,
|
199
|
-
chat: List[
|
201
|
+
chat: List[AgentMessage],
|
200
202
|
model: LMM,
|
201
203
|
verbose: bool = False,
|
202
204
|
) -> Tuple[Execution, str, str]:
|
203
205
|
if verbose:
|
204
206
|
print_code("Code to Execute:", code)
|
207
|
+
start = time.time()
|
205
208
|
execution = code_interpreter.exec_cell(DefaultPlanningImports.prepend_imports(code))
|
209
|
+
end = time.time()
|
206
210
|
obs = execution.text(include_results=False).strip()
|
207
211
|
if verbose:
|
208
212
|
_CONSOLE.print(
|
209
|
-
f"[bold cyan]Code Execution Output:[/bold cyan] [yellow]{escape(obs)}[/yellow]"
|
213
|
+
f"[bold cyan]Code Execution Output ({end - start:.2f} sec):[/bold cyan] [yellow]{escape(obs)}[/yellow]"
|
210
214
|
)
|
211
215
|
|
212
216
|
count = 1
|
@@ -246,13 +250,13 @@ def find_and_replace_code(response: str, code: str) -> str:
|
|
246
250
|
def maybe_run_code(
|
247
251
|
code: Optional[str],
|
248
252
|
response: str,
|
249
|
-
chat: List[
|
250
|
-
media_list: List[str],
|
253
|
+
chat: List[AgentMessage],
|
254
|
+
media_list: List[Union[str, Path]],
|
251
255
|
model: LMM,
|
252
256
|
code_interpreter: CodeInterpreter,
|
253
257
|
verbose: bool = False,
|
254
|
-
) -> List[
|
255
|
-
return_chat: List[
|
258
|
+
) -> List[AgentMessage]:
|
259
|
+
return_chat: List[AgentMessage] = []
|
256
260
|
if code is not None:
|
257
261
|
code = code_safeguards(code)
|
258
262
|
execution, obs, code = execute_code_action(
|
@@ -262,30 +266,32 @@ def maybe_run_code(
|
|
262
266
|
# if we had to debug the code to fix an issue, replace the old code
|
263
267
|
# with the fixed code in the response
|
264
268
|
fixed_response = find_and_replace_code(response, code)
|
265
|
-
return_chat.append(
|
269
|
+
return_chat.append(
|
270
|
+
AgentMessage(role="planner", content=fixed_response, media=None)
|
271
|
+
)
|
266
272
|
|
267
273
|
media_data = capture_media_from_exec(execution)
|
268
|
-
int_chat_elt
|
274
|
+
int_chat_elt = AgentMessage(role="observation", content=obs, media=None)
|
269
275
|
if media_list:
|
270
|
-
int_chat_elt
|
276
|
+
int_chat_elt.media = cast(List[Union[str, Path]], media_data)
|
271
277
|
return_chat.append(int_chat_elt)
|
272
278
|
else:
|
273
|
-
return_chat.append(
|
279
|
+
return_chat.append(AgentMessage(role="planner", content=response, media=None))
|
274
280
|
return return_chat
|
275
281
|
|
276
282
|
|
277
283
|
def create_finalize_plan(
|
278
|
-
chat: List[
|
284
|
+
chat: List[AgentMessage],
|
279
285
|
model: LMM,
|
280
286
|
verbose: bool = False,
|
281
|
-
) -> Tuple[List[
|
287
|
+
) -> Tuple[List[AgentMessage], PlanContext]:
|
282
288
|
prompt = FINALIZE_PLAN.format(
|
283
289
|
planning=get_planning(chat),
|
284
290
|
excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
|
285
291
|
)
|
286
292
|
response = model.chat([{"role": "user", "content": prompt}])
|
287
293
|
plan_str = cast(str, response)
|
288
|
-
return_chat
|
294
|
+
return_chat = [AgentMessage(role="planner", content=plan_str, media=None)]
|
289
295
|
|
290
296
|
plan_json = extract_tag(plan_str, "json")
|
291
297
|
plan = (
|
@@ -305,7 +311,16 @@ def create_finalize_plan(
|
|
305
311
|
return return_chat, PlanContext(**plan)
|
306
312
|
|
307
313
|
|
308
|
-
|
314
|
+
def get_steps(chat: List[AgentMessage], max_steps: int) -> int:
|
315
|
+
for chat_elt in reversed(chat):
|
316
|
+
if "<count>" in chat_elt.content:
|
317
|
+
return int(extract_tag(chat_elt.content, "count")) # type: ignore
|
318
|
+
return max_steps
|
319
|
+
|
320
|
+
|
321
|
+
class VisionAgentPlannerV2(AgentPlanner):
|
322
|
+
"""VisionAgentPlannerV2 is a class that generates a plan to solve a vision task."""
|
323
|
+
|
309
324
|
def __init__(
|
310
325
|
self,
|
311
326
|
planner: Optional[LMM] = None,
|
@@ -317,6 +332,25 @@ class VisionAgentPlannerV2(Agent):
|
|
317
332
|
code_sandbox_runtime: Optional[str] = None,
|
318
333
|
update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
|
319
334
|
) -> None:
|
335
|
+
"""Initialize the VisionAgentPlannerV2.
|
336
|
+
|
337
|
+
Parameters:
|
338
|
+
planner (Optional[LMM]): The language model to use for planning. If None, a
|
339
|
+
default AnthropicLMM will be used.
|
340
|
+
critic (Optional[LMM]): The language model to use for critiquing the plan.
|
341
|
+
If None, a default AnthropicLMM will be used.
|
342
|
+
max_steps (int): The maximum number of steps to plan.
|
343
|
+
use_multi_trial_planning (bool): Whether to use multi-trial planning.
|
344
|
+
critique_steps (int): The number of steps between critiques. If critic steps
|
345
|
+
is larger than max_steps no critiques will be made.
|
346
|
+
verbose (bool): Whether to print out debug information.
|
347
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
|
348
|
+
be one of: None, "local" or "e2b". If None, it will read from the
|
349
|
+
environment variable CODE_SANDBOX_RUNTIME.
|
350
|
+
update_callback (Callable[[Dict[str, Any]], None]): The callback function
|
351
|
+
that will send back intermediate conversation messages.
|
352
|
+
"""
|
353
|
+
|
320
354
|
self.planner = (
|
321
355
|
planner
|
322
356
|
if planner is not None
|
@@ -339,20 +373,42 @@ class VisionAgentPlannerV2(Agent):
|
|
339
373
|
self,
|
340
374
|
input: Union[str, List[Message]],
|
341
375
|
media: Optional[Union[str, Path]] = None,
|
342
|
-
) ->
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
376
|
+
) -> str:
|
377
|
+
"""Generate a plan to solve a vision task.
|
378
|
+
|
379
|
+
Parameters:
|
380
|
+
input (Union[str, List[Message]]): The input to the agent. This can be a
|
381
|
+
string or a list of messages in the format of [{"role": "user",
|
382
|
+
"content": "describe your task here..."}, ...].
|
383
|
+
media (Optional[Union[str, Path]]): The path to the media file to use with
|
384
|
+
the input. This can be an image or video file.
|
385
|
+
|
386
|
+
Returns:
|
387
|
+
str: The generated plan as a string.
|
388
|
+
"""
|
389
|
+
|
390
|
+
input_msg = convert_message_to_agentmessage(input, media)
|
391
|
+
plan = self.generate_plan(input_msg)
|
392
|
+
return plan.plan
|
350
393
|
|
351
394
|
def generate_plan(
|
352
395
|
self,
|
353
|
-
chat: List[
|
396
|
+
chat: List[AgentMessage],
|
397
|
+
max_steps: Optional[int] = None,
|
354
398
|
code_interpreter: Optional[CodeInterpreter] = None,
|
355
399
|
) -> PlanContext:
|
400
|
+
"""Generate a plan to solve a vision task.
|
401
|
+
|
402
|
+
Parameters:
|
403
|
+
chat (List[AgentMessage]): The conversation messages to generate a plan for.
|
404
|
+
max_steps (Optional[int]): The maximum number of steps to plan.
|
405
|
+
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
406
|
+
|
407
|
+
Returns:
|
408
|
+
PlanContext: The generated plan including the instructions and code snippets
|
409
|
+
needed to solve the task.
|
410
|
+
"""
|
411
|
+
|
356
412
|
if not chat:
|
357
413
|
raise ValueError("Chat cannot be empty")
|
358
414
|
|
@@ -360,13 +416,16 @@ class VisionAgentPlannerV2(Agent):
|
|
360
416
|
code_interpreter = code_interpreter or CodeInterpreterFactory.new_instance(
|
361
417
|
self.code_sandbox_runtime
|
362
418
|
)
|
419
|
+
max_steps = max_steps or self.max_steps
|
363
420
|
|
364
421
|
with code_interpreter:
|
365
422
|
critque_steps = 1
|
366
|
-
step = self.max_steps
|
367
423
|
finished = False
|
368
424
|
int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
|
369
|
-
|
425
|
+
|
426
|
+
step = get_steps(int_chat, max_steps)
|
427
|
+
if "<count>" not in int_chat[-1].content and step == max_steps:
|
428
|
+
int_chat[-1].content += f"\n<count>{step}</count>\n"
|
370
429
|
while step > 0 and not finished:
|
371
430
|
if self.use_multi_trial_planning:
|
372
431
|
response = run_multi_trial_planning(
|
@@ -402,29 +461,29 @@ class VisionAgentPlannerV2(Agent):
|
|
402
461
|
|
403
462
|
if critque_steps % self.critique_steps == 0:
|
404
463
|
critique = run_critic(int_chat, media_list, self.critic)
|
405
|
-
if critique is not None and int_chat[-1]
|
464
|
+
if critique is not None and int_chat[-1].role == "observation":
|
406
465
|
_CONSOLE.print(
|
407
466
|
f"[bold cyan]Critique:[/bold cyan] [red]{critique}[/red]"
|
408
467
|
)
|
409
468
|
critique_str = f"\n[critique]\n{critique}\n[end of critique]"
|
410
|
-
updated_chat[-1]
|
469
|
+
updated_chat[-1].content += critique_str
|
411
470
|
# if plan was critiqued, ensure we don't finish so we can
|
412
471
|
# respond to the critique
|
413
472
|
finished = False
|
414
473
|
|
415
474
|
critque_steps += 1
|
416
475
|
step -= 1
|
417
|
-
updated_chat[-1]
|
476
|
+
updated_chat[-1].content += f"\n<count>{step}</count>\n"
|
418
477
|
int_chat.extend(updated_chat)
|
419
478
|
for chat_elt in updated_chat:
|
420
|
-
self.update_callback(chat_elt)
|
479
|
+
self.update_callback(chat_elt.model_dump())
|
421
480
|
|
422
481
|
updated_chat, plan_context = create_finalize_plan(
|
423
482
|
int_chat, self.planner, self.verbose
|
424
483
|
)
|
425
484
|
int_chat.extend(updated_chat)
|
426
485
|
for chat_elt in updated_chat:
|
427
|
-
self.update_callback(chat_elt)
|
486
|
+
self.update_callback(chat_elt.model_dump())
|
428
487
|
|
429
488
|
return plan_context
|
430
489
|
|
@@ -55,10 +55,10 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
|
|
55
55
|
|
56
56
|
OBSERVATION:
|
57
57
|
[Artifact dog_detector.py (5 lines total)]
|
58
|
-
0|from vision_agent.tools import load_image,
|
58
|
+
0|from vision_agent.tools import load_image, owl_v2_image
|
59
59
|
1|def detect_dogs(image_path: str):
|
60
60
|
2| image = load_image(image_path)
|
61
|
-
3| dogs =
|
61
|
+
3| dogs = owl_v2_image("dog", image)
|
62
62
|
4| return dogs
|
63
63
|
[End of artifact]
|
64
64
|
|
@@ -96,10 +96,10 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
|
|
96
96
|
|
97
97
|
OBSERVATION:
|
98
98
|
[Artifact dog_detector.py (5 lines total)]
|
99
|
-
0|from vision_agent.tools import load_image,
|
99
|
+
0|from vision_agent.tools import load_image, owl_v2_image
|
100
100
|
1|def detect_dogs(image_path: str):
|
101
101
|
2| image = load_image(image_path)
|
102
|
-
3| dogs =
|
102
|
+
3| dogs = owl_v2_image("dog", image, threshold=0.24)
|
103
103
|
4| return dogs
|
104
104
|
[End of artifact]
|
105
105
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Prompt template for the conversational agent. It is filled in with
# ``CONVERSATION.format(conversation=...)``, so ``{conversation}`` must stay the
# only brace-delimited field in the text.
CONVERSATION = """
**Role**: You are a help agent that assists users with their vision-related questions.

**Actions**:
`generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user, this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.

`edit_code` - If the user already has code for a vision task, from a previous call to `generate_or_edit_vision_code` and wants to quickly modify the code, you can use this action to edit the code. This is good for non-vision related changes such as formatting the response as a json or removing print statements.

**Examples**:
Here is an example of how you can interact with a user and Actions:
--- START EXAMPLES ---
USER: Can you help me write code to detect dogs in this image? Media name images/dogs.jpg

AGENT: <response>Yes, I can help you with that. I will write the code to detect dogs in the image.</response><action>generate_or_edit_vision_code</action>

OBSERVATION:
<final_code>
from vision_agent.tools import load_image, owl_v2_image
def detect_dogs(image_path: str):
    image = load_image(image_path)
    dogs = owl_v2_image("dog", image)
    return dogs
</final_code>
<final_test>
def test_detect_dogs():
    dogs = detect_dogs("images/dogs.jpg")
    assert len(dogs) > 0
</final_test>

AGENT: <response>Here is the code to detect dogs in the image.</response>

--- END EXAMPLES ---

**Conversation**:
Here is the current conversation so far:
--- START CONVERSATION ---
{conversation}
--- END CONVERSATION ---

**Instructions**:
1. Only respond with a single <response> tag and a single <action> tag.
2. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:

<response>Your response to the user's message</response>
<action>The action you want to take from **Actions**</action>
"""
|
@@ -0,0 +1,215 @@
|
|
1
|
+
import copy
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
4
|
+
|
5
|
+
from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
|
6
|
+
from vision_agent.agent.agent_utils import (
|
7
|
+
add_media_to_chat,
|
8
|
+
convert_message_to_agentmessage,
|
9
|
+
extract_tag,
|
10
|
+
)
|
11
|
+
from vision_agent.agent.types import AgentMessage, PlanContext
|
12
|
+
from vision_agent.agent.vision_agent_coder_v2 import format_code_context
|
13
|
+
from vision_agent.agent.vision_agent_prompts_v2 import CONVERSATION
|
14
|
+
from vision_agent.lmm import LMM, AnthropicLMM
|
15
|
+
from vision_agent.lmm.types import Message
|
16
|
+
from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
|
17
|
+
|
18
|
+
|
19
|
+
def format_conversation(chat: List[AgentMessage]) -> str:
    """Render a chat as the plain-text transcript used in the prompt.

    Each message becomes ``"<LABEL>: <content>"`` followed by a blank line.
    The label depends on the message role:

    * ``"user"`` -> ``USER``
    * ``"observation"`` and ``"coder"`` -> ``OBSERVATION``
    * ``"conversation"`` -> ``AGENT``

    Messages with any other role are left out of the transcript.

    The messages are only read, never modified, so the input is not copied.

    Parameters:
        chat (List[AgentMessage]): The messages to render, oldest first.

    Returns:
        str: The transcript, or an empty string when nothing is rendered.
    """
    label_by_role = {
        "user": "USER",
        "observation": "OBSERVATION",
        "coder": "OBSERVATION",
        "conversation": "AGENT",
    }
    return "".join(
        f"{label_by_role[msg.role]}: {msg.content}\n\n"
        for msg in chat
        if msg.role in label_by_role
    )
|
30
|
+
|
31
|
+
|
32
|
+
def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
    """Ask the conversational model for its next reply.

    The 10 most recent messages of ``chat`` are rendered with
    ``format_conversation``, inserted into the ``CONVERSATION`` prompt, and
    sent to ``agent`` as a single user message with streaming disabled.

    Parameters:
        agent (LMM): The model that is called with the prompt.
        chat (List[AgentMessage]): The conversation so far.

    Returns:
        str: The model's response.
    """
    recent_messages = chat[-10:]  # only keep last 10 messages
    filled_prompt = CONVERSATION.format(
        conversation=format_conversation(recent_messages),
    )
    user_turn = {"role": "user", "content": filled_prompt}
    return cast(str, agent([user_turn], stream=False))
|
40
|
+
|
41
|
+
|
42
|
+
def extract_conversation_for_generate_code(
    chat: List[AgentMessage],
) -> List[AgentMessage]:
    """Select the messages that are handed to the coder.

    Works on a deep copy of ``chat``, so the returned messages are
    independent of the caller's objects. A message is kept when it is

    * a ``"user"`` message, or
    * a ``"coder"`` message whose content contains both ``<final_code>`` and
      ``<final_test>``.

    All other messages are dropped. The original order is preserved.

    Parameters:
        chat (List[AgentMessage]): The full conversation.

    Returns:
        List[AgentMessage]: Copies of the kept messages.
    """

    def _is_relevant(msg: AgentMessage) -> bool:
        if msg.role == "user":
            return True
        return (
            msg.role == "coder"
            and "<final_code>" in msg.content
            and "<final_test>" in msg.content
        )

    return [msg for msg in copy.deepcopy(chat) if _is_relevant(msg)]
|
55
|
+
|
56
|
+
|
57
|
+
def _code_context_to_messages(code_context: Any) -> List[AgentMessage]:
    """Build the [coder, observation] message pair for a coder result."""
    return [
        AgentMessage(role="coder", content=format_code_context(code_context)),
        AgentMessage(role="observation", content=code_context.test_result.text()),
    ]


def maybe_run_action(
    coder: AgentCoder,
    action: Optional[str],
    chat: List[AgentMessage],
    code_interpreter: Optional[CodeInterpreter] = None,
) -> Optional[List[AgentMessage]]:
    """Run the action chosen by the conversational agent, if there is one.

    Supported actions:

    * ``"generate_or_edit_vision_code"`` - calls ``coder.generate_code`` on the
      user and final-code messages extracted from ``chat``.
    * ``"edit_code"`` - calls ``coder.generate_code_from_plan`` on the same
      extracted messages with a fixed plan that asks for a minimal edit of
      the latest code.

    Parameters:
        coder (AgentCoder): The coder agent that generates or edits the code.
        action (Optional[str]): The action name, or None for no action.
        chat (List[AgentMessage]): The conversation so far.
        code_interpreter (Optional[CodeInterpreter]): Interpreter forwarded to
            the coder.

    Returns:
        Optional[List[AgentMessage]]: A "coder" message with the formatted code
            context followed by an "observation" message with the test result
            text, or None when ``action`` is not one of the supported actions.
    """
    if action == "generate_or_edit_vision_code":
        extracted_chat = extract_conversation_for_generate_code(chat)
        # There's an issue here because coder.generate_code will send its
        # code_context to the outside user via its update_callback, but we don't
        # necessarily have access to that update_callback here, so we re-create
        # the message using format_code_context.
        code_context = coder.generate_code(
            extracted_chat, code_interpreter=code_interpreter
        )
        return _code_context_to_messages(code_context)

    if action == "edit_code":
        extracted_chat = extract_conversation_for_generate_code(chat)
        plan_context = PlanContext(
            plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
            instructions=[],
            code="",
        )
        code_context = coder.generate_code_from_plan(
            extracted_chat, plan_context, code_interpreter=code_interpreter
        )
        return _code_context_to_messages(code_context)

    # "view_image" has no implementation; it, None and any other action name
    # produce no messages.
    return None
|
94
|
+
|
95
|
+
|
96
|
+
class VisionAgentV2(Agent):
    """VisionAgentV2 is a conversational agent that allows you to more easily use a
    coder agent such as VisionAgentCoderV2 to write vision code for you.
    """

    def __init__(
        self,
        agent: Optional[LMM] = None,
        coder: Optional[AgentCoder] = None,
        verbose: bool = False,
        code_sandbox_runtime: Optional[str] = None,
        update_callback: Callable[[Dict[str, Any]], None] = lambda x: None,
    ) -> None:
        """Initialize the VisionAgentV2.

        Parameters:
            agent (Optional[LMM]): The language model to use for the agent. If None, a
                default AnthropicLMM will be used.
            coder (Optional[AgentCoder]): The coder agent to use for generating vision
                code. If None, a default VisionAgentCoderV2 will be used.
            verbose (bool): Whether to print out debug information.
            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
                be one of: None, "local" or "e2b". If None, it will read from the
                environment variable CODE_SANDBOX_RUNTIME.
            update_callback (Callable[[Dict[str, Any]], None]): The callback function
                that will send back intermediate conversation messages.
        """

        self.agent = (
            agent
            if agent is not None
            else AnthropicLMM(
                model_name="claude-3-5-sonnet-20241022",
                temperature=0.0,
            )
        )
        self.coder = (
            coder
            if coder is not None
            else VisionAgentCoderV2(verbose=verbose, update_callback=update_callback)
        )

        self.verbose = verbose
        self.code_sandbox_runtime = code_sandbox_runtime
        self.update_callback = update_callback

        # Force the coder to use the same update_callback, including a coder
        # that was passed in by the caller with a different callback.
        if hasattr(self.coder, "update_callback"):
            self.coder.update_callback = update_callback

    def __call__(
        self,
        input: Union[str, List[Message]],
        media: Optional[Union[str, Path]] = None,
    ) -> str:
        """Conversational interface to the agent. This is the main method to use to
        interact with the agent. It takes in a string or list of messages and returns
        the agent's response as a string.

        Parameters:
            input (Union[str, List[Message]]): The input to the agent. This can be a
                string or a list of messages in the format of [{"role": "user",
                "content": "describe your task here..."}, ...].
            media (Optional[Union[str, Path]]): The path to the media file to use with
                the input. This can be an image or video file.

        Returns:
            str: The content of the last message produced by `chat`.
        """

        input_msg = convert_message_to_agentmessage(input, media)
        return self.chat(input_msg)[-1].content

    def chat(
        self,
        chat: List[AgentMessage],
    ) -> List[AgentMessage]:
        """Conversational interface to the agent. It takes in a list of messages and
        returns the agent's response as a list of messages.

        The agent first replies to the conversation. If that reply contains an
        <action> tag naming a supported action, the action is run with the coder and
        the agent is asked for a second reply that takes the action's result into
        account. Every reply is also sent to `update_callback`.

        Parameters:
            chat (List[AgentMessage]): The input to the agent. This should be a list of
                AgentMessage objects.

        Returns:
            List[AgentMessage]: The agent's replies, one or two "conversation"
                messages. Messages produced by the action itself are not included.
        """

        return_chat = []
        with CodeInterpreterFactory.new_instance(
            self.code_sandbox_runtime
        ) as code_interpreter:
            int_chat, _, _ = add_media_to_chat(chat, code_interpreter)
            response_context = run_conversation(self.agent, int_chat)
            return_chat.append(
                AgentMessage(role="conversation", content=response_context)
            )
            self.update_callback(return_chat[-1].model_dump())

            action = extract_tag(response_context, "action")

            updated_chat = maybe_run_action(
                self.coder, action, int_chat, code_interpreter=code_interpreter
            )
            if updated_chat is not None:
                # do not append updated_chat to return_chat because the observation
                # from running the action will have already been added via the callbacks
                #
                # The follow-up prompt includes int_chat so the model sees the
                # user's request and earlier history, not just its own reply and
                # the action output.
                obs_response_context = run_conversation(
                    self.agent, int_chat + return_chat + updated_chat
                )
                return_chat.append(
                    AgentMessage(role="conversation", content=obs_response_context)
                )
                self.update_callback(return_chat[-1].model_dump())

        return return_chat

    def log_progress(self, data: Dict[str, Any]) -> None:
        """Do nothing with the progress data."""
        pass
|
vision_agent/tools/tools.py
CHANGED
@@ -2453,7 +2453,6 @@ FUNCTION_TOOLS = [
|
|
2453
2453
|
owl_v2_image,
|
2454
2454
|
owl_v2_video,
|
2455
2455
|
ocr,
|
2456
|
-
clip,
|
2457
2456
|
vit_image_classification,
|
2458
2457
|
vit_nsfw_classification,
|
2459
2458
|
countgd_counting,
|
@@ -2471,6 +2470,7 @@ FUNCTION_TOOLS = [
|
|
2471
2470
|
qwen2_vl_video_vqa,
|
2472
2471
|
video_temporal_localization,
|
2473
2472
|
flux_image_inpainting,
|
2473
|
+
siglip_classification,
|
2474
2474
|
]
|
2475
2475
|
|
2476
2476
|
UTIL_TOOLS = [
|
vision_agent/utils/execute.py
CHANGED
@@ -1,19 +1,22 @@
|
|
1
|
-
vision_agent/.sim_tools/df.csv,sha256=
|
2
|
-
vision_agent/.sim_tools/embs.npy,sha256=
|
1
|
+
vision_agent/.sim_tools/df.csv,sha256=0fmLwTDjnRTiqYwamTOdCPjruE6wZz0AVrONIPTHxZY,34086
|
2
|
+
vision_agent/.sim_tools/embs.npy,sha256=xF8Cg7Xd09QCTySj831aL1O2_0kRNaaH8XRJIRjgWzQ,356480
|
3
3
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
4
|
-
vision_agent/agent/__init__.py,sha256=
|
5
|
-
vision_agent/agent/agent.py,sha256=
|
6
|
-
vision_agent/agent/agent_utils.py,sha256=
|
4
|
+
vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
|
5
|
+
vision_agent/agent/agent.py,sha256=sf8JcA3LNy_4GaS_gQb2Q-PXkl4dBuGh-7raI9KAtZo,1470
|
6
|
+
vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
|
7
|
+
vision_agent/agent/types.py,sha256=aAd_ez1-NQh04k27cmywyOV2uA_vWWYE-Ok7zq_JoAk,1532
|
7
8
|
vision_agent/agent/vision_agent.py,sha256=rr1P9iTbr7OsjgMYWCeIxQYI4cLwPWia3NIMJNi-9Yo,26110
|
8
9
|
vision_agent/agent/vision_agent_coder.py,sha256=waCmw_NTgsy9G-UqlRZFhsFJJVuWVrjxVnShe4Xp_lI,27743
|
9
10
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
10
11
|
vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
|
11
|
-
vision_agent/agent/vision_agent_coder_v2.py,sha256=
|
12
|
+
vision_agent/agent/vision_agent_coder_v2.py,sha256=SVIJC0N5TBgq9z-F99UebLimRuQuAe_HHvTFupBzVfo,14715
|
12
13
|
vision_agent/agent/vision_agent_planner.py,sha256=F_5opnc0XmQmNH40rs2T7DFrai4CC6aDYe02Z8e93AM,18875
|
13
14
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=Y3jz9HRf8fz9NLUseN7cTgZqewP0RazxR7vw1sPhcn0,6691
|
14
|
-
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=
|
15
|
-
vision_agent/agent/vision_agent_planner_v2.py,sha256=
|
16
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
15
|
+
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=Tzon3h5iZdHJglesk8GVS-2myNf5-fhf7HUbkpZWHQk,33143
|
16
|
+
vision_agent/agent/vision_agent_planner_v2.py,sha256=mxQxD_B8sKYharh8e7W0uc1tN11YCztyLowc83seScc,17023
|
17
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=PENFd8VM_vHKxeZPiotVM1RBVW9NrXimKbpvI1UteKI,13772
|
18
|
+
vision_agent/agent/vision_agent_prompts_v2.py,sha256=-vCWat-ARlCOOOeIDIFhg-kcwRRwjTXYEwsvvqPeaCs,1972
|
19
|
+
vision_agent/agent/vision_agent_v2.py,sha256=Cudp_ZZBI9rDwMjIYlvY4jzh_srsulYgfRWZLo4_2TQ,8366
|
17
20
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
21
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
19
22
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -27,16 +30,16 @@ vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB
|
|
27
30
|
vision_agent/tools/planner_tools.py,sha256=FROahw_6Taqvytv6pOjCHUEypOfjsi_f8Vo1c5vz6Mw,8823
|
28
31
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
29
32
|
vision_agent/tools/tool_utils.py,sha256=GDGOmBCo4UfYz-DJ-olREJHPsqs5mzHu0YXiAnpNE8E,10179
|
30
|
-
vision_agent/tools/tools.py,sha256=
|
33
|
+
vision_agent/tools/tools.py,sha256=wXDs0m_Yb601FQVp5fPYYVtt4lHUeMnuqIbfDZhsE4Q,87852
|
31
34
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
32
35
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
33
36
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
34
|
-
vision_agent/utils/execute.py,sha256=
|
37
|
+
vision_agent/utils/execute.py,sha256=b3AA1G16Ixwlgd-kke13brKclxh5nJXQTrk25oj1W3o,28027
|
35
38
|
vision_agent/utils/image_utils.py,sha256=rRWcxKggPXIRXIY_XT9rZt30ECDRq8zq7FDeXRDqQWs,11679
|
36
39
|
vision_agent/utils/sim.py,sha256=NZc9QGD6BTY5O29NVbHH7oxDePL_QMnylT1lYcDUn1Y,7437
|
37
40
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
38
41
|
vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
|
39
|
-
vision_agent-0.2.
|
40
|
-
vision_agent-0.2.
|
41
|
-
vision_agent-0.2.
|
42
|
-
vision_agent-0.2.
|
42
|
+
vision_agent-0.2.200.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
43
|
+
vision_agent-0.2.200.dist-info/METADATA,sha256=goRTW73tD79-UlJiy4cL0twnVYm9iSjU9f5HsC4A1ZI,19026
|
44
|
+
vision_agent-0.2.200.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
45
|
+
vision_agent-0.2.200.dist-info/RECORD,,
|
File without changes
|
File without changes
|