vision-agent 0.2.198__py3-none-any.whl → 0.2.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +18 -18
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent.py +33 -0
- vision_agent/agent/agent_utils.py +47 -34
- vision_agent/agent/types.py +51 -0
- vision_agent/agent/vision_agent_coder_v2.py +131 -43
- vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
- vision_agent/agent/vision_agent_planner_v2.py +109 -50
- vision_agent/agent/vision_agent_prompts.py +4 -4
- vision_agent/agent/vision_agent_prompts_v2.py +46 -0
- vision_agent/agent/vision_agent_v2.py +215 -0
- vision_agent/tools/tools.py +1 -1
- vision_agent/utils/execute.py +1 -1
- {vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/METADATA +1 -1
- {vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/RECORD +18 -15
- {vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.198.dist-info → vision_agent-0.2.200.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -80,24 +80,6 @@ desc,doc,name
|
|
80
80
|
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
|
81
81
|
]
|
82
82
|
",ocr
|
83
|
-
'clip' is a tool that can classify an image or a cropped detection given a list of input classes or tags. It returns the same list of the input classes along with their probability scores based on image content.,"clip(image: numpy.ndarray, classes: List[str]) -> Dict[str, Any]:
|
84
|
-
'clip' is a tool that can classify an image or a cropped detection given a list
|
85
|
-
of input classes or tags. It returns the same list of the input classes along with
|
86
|
-
their probability scores based on image content.
|
87
|
-
|
88
|
-
Parameters:
|
89
|
-
image (np.ndarray): The image to classify or tag
|
90
|
-
classes (List[str]): The list of classes or tags that is associated with the image
|
91
|
-
|
92
|
-
Returns:
|
93
|
-
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
94
|
-
contains a list of given labels and other a list of scores.
|
95
|
-
|
96
|
-
Example
|
97
|
-
-------
|
98
|
-
>>> clip(image, ['dog', 'cat', 'bird'])
|
99
|
-
{""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
|
100
|
-
",clip
|
101
83
|
'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
|
102
84
|
'vit_image_classification' is a tool that can classify an image. It returns a
|
103
85
|
list of classes and their probability scores based on image content.
|
@@ -488,6 +470,24 @@ desc,doc,name
|
|
488
470
|
... )
|
489
471
|
>>> save_image(result, ""inpainted_room.png"")
|
490
472
|
",flux_image_inpainting
|
473
|
+
'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
|
474
|
+
'siglip_classification' is a tool that can classify an image or a cropped detection given a list
|
475
|
+
of input labels or tags. It returns the same list of the input labels along with
|
476
|
+
their probability scores based on image content.
|
477
|
+
|
478
|
+
Parameters:
|
479
|
+
image (np.ndarray): The image to classify or tag
|
480
|
+
labels (List[str]): The list of labels or tags that is associated with the image
|
481
|
+
|
482
|
+
Returns:
|
483
|
+
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
484
|
+
contains a list of given labels and other a list of scores.
|
485
|
+
|
486
|
+
Example
|
487
|
+
-------
|
488
|
+
>>> siglip_classification(image, ['dog', 'cat', 'bird'])
|
489
|
+
{""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
|
490
|
+
",siglip_classification
|
491
491
|
"'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
|
492
492
|
'extract_frames_and_timestamps' extracts frames and timestamps from a video
|
493
493
|
which can be a file path, url or youtube link, returns a list of dictionaries
|
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
|
vision_agent/agent/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from .agent import Agent
|
1
|
+
from .agent import Agent, AgentCoder, AgentPlanner
|
2
2
|
from .vision_agent import VisionAgent
|
3
3
|
from .vision_agent_coder import (
|
4
4
|
AnthropicVisionAgentCoder,
|
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
|
|
17
17
|
VisionAgentPlanner,
|
18
18
|
)
|
19
19
|
from .vision_agent_planner_v2 import VisionAgentPlannerV2
|
20
|
+
from .vision_agent_v2 import VisionAgentV2
|
vision_agent/agent/agent.py
CHANGED
@@ -2,7 +2,9 @@ from abc import ABC, abstractmethod
|
|
2
2
|
from pathlib import Path
|
3
3
|
from typing import Any, Dict, List, Optional, Union
|
4
4
|
|
5
|
+
from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
|
5
6
|
from vision_agent.lmm.types import Message
|
7
|
+
from vision_agent.utils.execute import CodeInterpreter
|
6
8
|
|
7
9
|
|
8
10
|
class Agent(ABC):
|
@@ -20,3 +22,34 @@ class Agent(ABC):
|
|
20
22
|
This is a hook that is intended for reporting the progress of the agent.
|
21
23
|
"""
|
22
24
|
pass
|
25
|
+
|
26
|
+
|
27
|
+
class AgentCoder(Agent):
    """Abstract base for agents that expose code-generation entry points.

    Concrete subclasses must implement both ``generate_code`` and
    ``generate_code_from_plan``.
    """

    @abstractmethod
    def generate_code(
        self,
        chat: List[AgentMessage],
        max_steps: Optional[int] = None,
        code_interpreter: Optional[CodeInterpreter] = None,
    ) -> CodeContext:
        """Abstract hook: build a ``CodeContext`` from a list of agent messages.

        Parameters:
            chat (List[AgentMessage]): The conversation messages.
            max_steps (Optional[int]): Optional step limit, defaults to None.
            code_interpreter (Optional[CodeInterpreter]): Optional interpreter to
                use, defaults to None.

        Returns:
            CodeContext: Provided by the overriding implementation; this base
                method has no behavior of its own.
        """

    @abstractmethod
    def generate_code_from_plan(
        self,
        chat: List[AgentMessage],
        plan_context: PlanContext,
        code_interpreter: Optional[CodeInterpreter] = None,
    ) -> CodeContext:
        """Abstract hook: build a ``CodeContext`` from messages plus an existing plan.

        Parameters:
            chat (List[AgentMessage]): The conversation messages.
            plan_context (PlanContext): A previously produced plan.
            code_interpreter (Optional[CodeInterpreter]): Optional interpreter to
                use, defaults to None.

        Returns:
            CodeContext: Provided by the overriding implementation; this base
                method has no behavior of its own.
        """
|
45
|
+
|
46
|
+
|
47
|
+
class AgentPlanner(Agent):
    """Abstract base for agents that expose a plan-generation entry point.

    Concrete subclasses must implement ``generate_plan``.
    """

    @abstractmethod
    def generate_plan(
        self,
        chat: List[AgentMessage],
        max_steps: Optional[int] = None,
        code_interpreter: Optional[CodeInterpreter] = None,
    ) -> PlanContext:
        """Abstract hook: build a ``PlanContext`` from a list of agent messages.

        Parameters:
            chat (List[AgentMessage]): The conversation messages.
            max_steps (Optional[int]): Optional step limit, defaults to None.
            code_interpreter (Optional[CodeInterpreter]): Optional interpreter to
                use, defaults to None.

        Returns:
            PlanContext: Provided by the overriding implementation; this base
                method has no behavior of its own.
        """
|
@@ -4,16 +4,17 @@ import logging
|
|
4
4
|
import re
|
5
5
|
import sys
|
6
6
|
import tempfile
|
7
|
-
from
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
8
9
|
|
9
10
|
import libcst as cst
|
10
|
-
from pydantic import BaseModel
|
11
11
|
from rich.console import Console
|
12
12
|
from rich.style import Style
|
13
13
|
from rich.syntax import Syntax
|
14
14
|
from rich.table import Table
|
15
15
|
|
16
16
|
import vision_agent.tools as T
|
17
|
+
from vision_agent.agent.types import AgentMessage, PlanContext
|
17
18
|
from vision_agent.lmm.types import Message
|
18
19
|
from vision_agent.utils.execute import CodeInterpreter, Execution
|
19
20
|
from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
|
@@ -24,19 +25,6 @@ _CONSOLE = Console()
|
|
24
25
|
_MAX_TABULATE_COL_WIDTH = 80
|
25
26
|
|
26
27
|
|
27
|
-
class PlanContext(BaseModel):
|
28
|
-
plan: str
|
29
|
-
instructions: List[str]
|
30
|
-
code: str
|
31
|
-
|
32
|
-
|
33
|
-
class CodeContext(BaseModel):
|
34
|
-
code: str
|
35
|
-
test: str
|
36
|
-
success: bool
|
37
|
-
test_result: Execution
|
38
|
-
|
39
|
-
|
40
28
|
def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
|
41
29
|
json_pattern = r"\{.*\}"
|
42
30
|
match = re.search(json_pattern, json_str, re.DOTALL)
|
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
|
|
228
216
|
|
229
217
|
|
230
218
|
def add_media_to_chat(
|
231
|
-
chat: List[
|
232
|
-
) -> Tuple[List[
|
219
|
+
chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
|
220
|
+
) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
|
233
221
|
orig_chat = copy.deepcopy(chat)
|
234
222
|
int_chat = copy.deepcopy(chat)
|
235
|
-
media_list = []
|
223
|
+
media_list: List[Union[str, Path]] = []
|
236
224
|
for chat_i in int_chat:
|
237
|
-
if
|
238
|
-
media_list_i = []
|
239
|
-
for media in chat_i
|
225
|
+
if chat_i.media is not None:
|
226
|
+
media_list_i: List[Union[str, Path]] = []
|
227
|
+
for media in chat_i.media:
|
240
228
|
if isinstance(media, str) and media.startswith("data:image/"):
|
241
229
|
media_pil = b64_to_pil(media)
|
242
230
|
with tempfile.NamedTemporaryFile(
|
@@ -244,25 +232,29 @@ def add_media_to_chat(
|
|
244
232
|
) as temp_file:
|
245
233
|
media_pil.save(temp_file, format="PNG")
|
246
234
|
media = str(temp_file.name)
|
247
|
-
|
235
|
+
if code_interpreter is not None:
|
236
|
+
media = str(code_interpreter.upload_file(media))
|
248
237
|
media_list_i.append(media)
|
249
|
-
# don't duplicate appending media name
|
250
|
-
if
|
251
|
-
chat_i
|
252
|
-
|
238
|
+
# don't duplicate appending media name and only add them for user messages
|
239
|
+
if (
|
240
|
+
not str(chat_i.content).endswith(f" Media name {media}")
|
241
|
+
and chat_i.role == "user"
|
242
|
+
):
|
243
|
+
chat_i.content += f" Media name {media}"
|
244
|
+
chat_i.media = media_list_i if len(media_list_i) > 0 else None
|
253
245
|
media_list.extend(media_list_i)
|
254
246
|
|
255
247
|
int_chat = cast(
|
256
|
-
List[
|
248
|
+
List[AgentMessage],
|
257
249
|
[
|
258
250
|
(
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
if
|
265
|
-
else
|
251
|
+
AgentMessage(
|
252
|
+
role=c.role,
|
253
|
+
content=c.content,
|
254
|
+
media=c.media,
|
255
|
+
)
|
256
|
+
if c.media is not None
|
257
|
+
else AgentMessage(role=c.role, content=c.content, media=None)
|
266
258
|
)
|
267
259
|
for c in int_chat
|
268
260
|
],
|
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
|
|
283
275
|
return images
|
284
276
|
|
285
277
|
|
278
|
+
def convert_message_to_agentmessage(
    input: Union[str, List[Message]],
    media: Optional[Union[str, Path]] = None,
) -> List[AgentMessage]:
    """Convert a prompt string or a list of message dicts into AgentMessage objects.

    Parameters:
        input (Union[str, List[Message]]): Either a single string, which becomes one
            message with role "user", or a list of dicts that are read through
            their "role" and "content" keys.
        media (Optional[Union[str, Path]]): Optional media reference. When given, it
            is attached (wrapped in a one-element list) to the first message of
            the result.

    Returns:
        List[AgentMessage]: The converted messages, in input order. An empty input
            list yields an empty list.
    """
    # Build the media payload once; both branches attach the same value.
    media_list: Optional[List[Union[str, Path]]] = (
        [media] if media is not None else None
    )

    if isinstance(input, str):
        return [AgentMessage(role="user", content=input, media=media_list)]

    # Any "media" entry on the incoming dicts is not read here; only the ``media``
    # argument is attached, and only to the first message.
    input_msg = [
        AgentMessage(role=msg["role"], content=msg["content"], media=None)
        for msg in input
    ]
    # An empty conversation has no first message to attach media to. Indexing
    # input_msg[0] unconditionally raised IndexError in that case.
    if input_msg:
        input_msg[0].media = media_list
    return input_msg
|
297
|
+
|
298
|
+
|
286
299
|
def strip_function_calls( # noqa: C901
|
287
300
|
code: str, exclusions: Optional[List[str]] = None
|
288
301
|
) -> str:
|
@@ -0,0 +1,51 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import List, Literal, Optional, Union
|
3
|
+
|
4
|
+
from pydantic import BaseModel
|
5
|
+
|
6
|
+
from vision_agent.utils.execute import Execution
|
7
|
+
|
8
|
+
|
9
|
+
class AgentMessage(BaseModel):
    """AgentMessage encompasses messages sent to the entire Agentic system, which includes
    both LMMs and sub-agents.

    user: The user's message.
    assistant: The assistant's message.
    observation: An observation made after conducting an action, either by the user or
        assistant.
    interaction: An interaction between the user and the assistant. For example if the
        assistant wants to ask the user for help on a task, it could send an
        interaction message.
    conversation: Messages coming from the conversation agent, this is a type of
        assistant messages.
    planner: Messages coming from the planner agent, this is a type of assistant
        messages.
    coder: Messages coming from the coder agent, this is a type of assistant messages.

    """

    # Who produced the message; restricted to the literal role names listed in the
    # class docstring.
    role: Union[
        Literal["user"],
        Literal["assistant"],  # planner, coder and conversation are of type assistant
        Literal["observation"],
        Literal["interaction"],
        Literal["conversation"],
        Literal["planner"],
        Literal["coder"],
    ]
    # Text body of the message.
    content: str
    # Optional list of media references (strings or Paths); None when there is no media.
    media: Optional[List[Union[str, Path]]] = None
|
39
|
+
|
40
|
+
|
41
|
+
class PlanContext(BaseModel):
    # Result produced by a planner (the return type of AgentPlanner.generate_plan)
    # and consumed by AgentCoder.generate_code_from_plan.

    # The plan, as a single string.
    plan: str
    # List of instruction strings; the coder passes these to tool retrieval.
    instructions: List[str]
    # Code string carried alongside the plan.
    # NOTE(review): presumably the code the planner wrote while planning; confirm.
    code: str
|
45
|
+
|
46
|
+
|
47
|
+
class CodeContext(BaseModel):
    # Result produced by a coder: the code, the test code, whether it ran
    # successfully, and the execution result.

    # The generated code.
    code: str
    # The generated test code.
    test: str
    # Whether or not the code was executed successfully.
    success: bool
    # Execution object holding the result of the run.
    test_result: Execution
|
@@ -6,19 +6,19 @@ from rich.console import Console
|
|
6
6
|
from rich.markup import escape
|
7
7
|
|
8
8
|
import vision_agent.tools as T
|
9
|
-
from vision_agent.agent import
|
9
|
+
from vision_agent.agent import AgentCoder, AgentPlanner
|
10
10
|
from vision_agent.agent.agent_utils import (
|
11
|
-
CodeContext,
|
12
11
|
DefaultImports,
|
13
|
-
PlanContext,
|
14
12
|
add_media_to_chat,
|
15
13
|
capture_media_from_exec,
|
14
|
+
convert_message_to_agentmessage,
|
16
15
|
extract_tag,
|
17
16
|
format_feedback,
|
18
17
|
format_plan_v2,
|
19
18
|
print_code,
|
20
19
|
strip_function_calls,
|
21
20
|
)
|
21
|
+
from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
|
22
22
|
from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
|
23
23
|
from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
|
24
24
|
from vision_agent.lmm import LMM, AnthropicLMM
|
@@ -34,6 +34,12 @@ from vision_agent.utils.sim import Sim, load_cached_sim
|
|
34
34
|
_CONSOLE = Console()
|
35
35
|
|
36
36
|
|
37
|
+
def format_code_context(
    code_context: CodeContext,
) -> str:
    """Render the code and test of a CodeContext as tagged text.

    Parameters:
        code_context (CodeContext): The context whose ``code`` and ``test``
            attributes are rendered.

    Returns:
        str: The code wrapped in ``<final_code>`` tags, a newline, then the test
            wrapped in ``<final_test>`` tags.
    """
    final_code = f"<final_code>{code_context.code}</final_code>"
    final_test = f"<final_test>{code_context.test}</final_test>"
    return "\n".join((final_code, final_test))
|
41
|
+
|
42
|
+
|
37
43
|
def retrieve_tools(
|
38
44
|
plan: List[str],
|
39
45
|
tool_recommender: Sim,
|
@@ -49,46 +55,54 @@ def retrieve_tools(
|
|
49
55
|
|
50
56
|
def write_code(
|
51
57
|
coder: LMM,
|
52
|
-
chat: List[
|
58
|
+
chat: List[AgentMessage],
|
53
59
|
tool_docs: str,
|
54
60
|
plan: str,
|
55
61
|
) -> str:
|
56
62
|
chat = copy.deepcopy(chat)
|
57
|
-
if chat[-1]
|
63
|
+
if chat[-1].role != "user":
|
58
64
|
raise ValueError("Last chat message must be from the user.")
|
59
65
|
|
60
|
-
user_request = chat[-1]
|
66
|
+
user_request = chat[-1].content
|
61
67
|
prompt = CODE.format(
|
62
68
|
docstring=tool_docs,
|
63
69
|
question=user_request,
|
64
70
|
plan=plan,
|
65
71
|
)
|
66
|
-
|
67
|
-
|
68
|
-
|
72
|
+
response = cast(str, coder([{"role": "user", "content": prompt}], stream=False))
|
73
|
+
maybe_code = extract_tag(response, "code")
|
74
|
+
|
75
|
+
# if the response wasn't properly formatted with the code tags just return the response
|
76
|
+
if maybe_code is None:
|
77
|
+
return response
|
78
|
+
return maybe_code
|
69
79
|
|
70
80
|
|
71
81
|
def write_test(
|
72
82
|
tester: LMM,
|
73
|
-
chat: List[
|
83
|
+
chat: List[AgentMessage],
|
74
84
|
tool_util_docs: str,
|
75
85
|
code: str,
|
76
86
|
media_list: Optional[Sequence[Union[str, Path]]] = None,
|
77
87
|
) -> str:
|
78
88
|
chat = copy.deepcopy(chat)
|
79
|
-
if chat[-1]
|
89
|
+
if chat[-1].role != "user":
|
80
90
|
raise ValueError("Last chat message must be from the user.")
|
81
91
|
|
82
|
-
user_request = chat[-1]
|
92
|
+
user_request = chat[-1].content
|
83
93
|
prompt = TEST.format(
|
84
94
|
docstring=tool_util_docs,
|
85
95
|
question=user_request,
|
86
96
|
code=code,
|
87
97
|
media=media_list,
|
88
98
|
)
|
89
|
-
|
90
|
-
|
91
|
-
|
99
|
+
response = cast(str, tester([{"role": "user", "content": prompt}], stream=False))
|
100
|
+
maybe_code = extract_tag(response, "code")
|
101
|
+
|
102
|
+
# if the response wasn't properly formatted with the code tags just return the response
|
103
|
+
if maybe_code is None:
|
104
|
+
return response
|
105
|
+
return maybe_code
|
92
106
|
|
93
107
|
|
94
108
|
def debug_code(
|
@@ -170,12 +184,11 @@ def write_and_test_code(
|
|
170
184
|
coder: LMM,
|
171
185
|
tester: LMM,
|
172
186
|
debugger: LMM,
|
173
|
-
chat: List[
|
187
|
+
chat: List[AgentMessage],
|
174
188
|
plan: str,
|
175
189
|
tool_docs: str,
|
176
190
|
code_interpreter: CodeInterpreter,
|
177
191
|
media_list: List[Union[str, Path]],
|
178
|
-
update_callback: Callable[[Dict[str, Any]], None],
|
179
192
|
verbose: bool,
|
180
193
|
) -> CodeContext:
|
181
194
|
code = write_code(
|
@@ -226,14 +239,6 @@ def write_and_test_code(
|
|
226
239
|
f"[bold cyan]Code execution result after attempted fix:[/bold cyan] [yellow]{escape(result.text(include_logs=True))}[/yellow]"
|
227
240
|
)
|
228
241
|
|
229
|
-
update_callback(
|
230
|
-
{
|
231
|
-
"role": "assistant",
|
232
|
-
"content": f"<final_code>{DefaultImports.to_code_string()}\n{code}</final_code>\n<final_test>{DefaultImports.to_code_string()}\n{test}</final_test>",
|
233
|
-
"media": capture_media_from_exec(result),
|
234
|
-
}
|
235
|
-
)
|
236
|
-
|
237
242
|
return CodeContext(
|
238
243
|
code=f"{DefaultImports.to_code_string()}\n{code}",
|
239
244
|
test=f"{DefaultImports.to_code_string()}\n{test}",
|
@@ -242,10 +247,12 @@ def write_and_test_code(
|
|
242
247
|
)
|
243
248
|
|
244
249
|
|
245
|
-
class VisionAgentCoderV2(
|
250
|
+
class VisionAgentCoderV2(AgentCoder):
|
251
|
+
"""VisionAgentCoderV2 is an agent that will write vision code for you."""
|
252
|
+
|
246
253
|
def __init__(
|
247
254
|
self,
|
248
|
-
planner: Optional[
|
255
|
+
planner: Optional[AgentPlanner] = None,
|
249
256
|
coder: Optional[LMM] = None,
|
250
257
|
tester: Optional[LMM] = None,
|
251
258
|
debugger: Optional[LMM] = None,
|
@@ -254,6 +261,25 @@ class VisionAgentCoderV2(Agent):
|
|
254
261
|
code_sandbox_runtime: Optional[str] = None,
|
255
262
|
update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
|
256
263
|
) -> None:
|
264
|
+
"""Initialize the VisionAgentCoderV2.
|
265
|
+
|
266
|
+
Parameters:
|
267
|
+
planner (Optional[AgentPlanner]): The planner agent to use for generating
|
268
|
+
vision plans. If None, a default VisionAgentPlannerV2 will be used.
|
269
|
+
coder (Optional[LMM]): The language model to use for the coder agent. If
|
270
|
+
None, a default AnthropicLMM will be used.
|
271
|
+
tester (Optional[LMM]): The language model to use for the tester agent. If
|
272
|
+
None, a default AnthropicLMM will be used.
|
273
|
+
debugger (Optional[LMM]): The language model to use for the debugger agent.
|
274
|
+
tool_recommender (Optional[Union[str, Sim]]): The tool recommender to use.
|
275
|
+
verbose (bool): Whether to print out debug information.
|
276
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
|
277
|
+
be one of: None, "local" or "e2b". If None, it will read from the
|
278
|
+
environment variable CODE_SANDBOX_RUNTIME.
|
279
|
+
update_callback (Callable[[Dict[str, Any]], None]): The callback function
|
280
|
+
that will send back intermediate conversation messages.
|
281
|
+
"""
|
282
|
+
|
257
283
|
self.planner = (
|
258
284
|
planner
|
259
285
|
if planner is not None
|
@@ -290,20 +316,52 @@ class VisionAgentCoderV2(Agent):
|
|
290
316
|
self,
|
291
317
|
input: Union[str, List[Message]],
|
292
318
|
media: Optional[Union[str, Path]] = None,
|
293
|
-
) ->
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
input[
|
298
|
-
|
299
|
-
|
300
|
-
|
319
|
+
) -> str:
|
320
|
+
"""Generate vision code from a conversation.
|
321
|
+
|
322
|
+
Parameters:
|
323
|
+
input (Union[str, List[Message]]): The input to the agent. This can be a
|
324
|
+
string or a list of messages in the format of [{"role": "user",
|
325
|
+
"content": "describe your task here..."}, ...].
|
326
|
+
media (Optional[Union[str, Path]]): The path to the media file to use with
|
327
|
+
the input. This can be an image or video file.
|
328
|
+
|
329
|
+
Returns:
|
330
|
+
str: The generated code as a string.
|
331
|
+
"""
|
332
|
+
|
333
|
+
input_msg = convert_message_to_agentmessage(input, media)
|
334
|
+
return self.generate_code(input_msg).code
|
335
|
+
|
336
|
+
def generate_code(
|
337
|
+
self,
|
338
|
+
chat: List[AgentMessage],
|
339
|
+
max_steps: Optional[int] = None,
|
340
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
341
|
+
) -> CodeContext:
|
342
|
+
"""Generate vision code from a conversation.
|
343
|
+
|
344
|
+
Parameters:
|
345
|
+
chat (List[AgentMessage]): The input to the agent. This should be a list of
|
346
|
+
AgentMessage objects.
|
347
|
+
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
348
|
+
|
349
|
+
Returns:
|
350
|
+
CodeContext: The generated code as a CodeContext object which includes the
|
351
|
+
code, test code, whether or not it was executed successfully, and the
|
352
|
+
execution result.
|
353
|
+
"""
|
354
|
+
|
301
355
|
chat = copy.deepcopy(chat)
|
302
|
-
with
|
303
|
-
self.code_sandbox_runtime
|
356
|
+
with (
|
357
|
+
CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
|
358
|
+
if code_interpreter is None
|
359
|
+
else code_interpreter
|
304
360
|
) as code_interpreter:
|
305
361
|
int_chat, orig_chat, _ = add_media_to_chat(chat, code_interpreter)
|
306
|
-
plan_context = self.planner.generate_plan(
|
362
|
+
plan_context = self.planner.generate_plan(
|
363
|
+
int_chat, max_steps=max_steps, code_interpreter=code_interpreter
|
364
|
+
)
|
307
365
|
code_context = self.generate_code_from_plan(
|
308
366
|
orig_chat,
|
309
367
|
plan_context,
|
@@ -313,13 +371,30 @@ class VisionAgentCoderV2(Agent):
|
|
313
371
|
|
314
372
|
def generate_code_from_plan(
|
315
373
|
self,
|
316
|
-
chat: List[
|
374
|
+
chat: List[AgentMessage],
|
317
375
|
plan_context: PlanContext,
|
318
376
|
code_interpreter: Optional[CodeInterpreter] = None,
|
319
377
|
) -> CodeContext:
|
378
|
+
"""Generate vision code from a conversation and a previously made plan. This
|
379
|
+
will skip the planning step and go straight to generating code.
|
380
|
+
|
381
|
+
Parameters:
|
382
|
+
chat (List[AgentMessage]): The input to the agent. This should be a list of
|
383
|
+
AgentMessage objects.
|
384
|
+
plan_context (PlanContext): The plan context that was previously generated.
|
385
|
+
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
386
|
+
|
387
|
+
Returns:
|
388
|
+
CodeContext: The generated code as a CodeContext object which includes the
|
389
|
+
code, test code, whether or not it was executed successfully, and the
|
390
|
+
execution result.
|
391
|
+
"""
|
392
|
+
|
320
393
|
chat = copy.deepcopy(chat)
|
321
|
-
with
|
322
|
-
self.code_sandbox_runtime
|
394
|
+
with (
|
395
|
+
CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
|
396
|
+
if code_interpreter is None
|
397
|
+
else code_interpreter
|
323
398
|
) as code_interpreter:
|
324
399
|
int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
|
325
400
|
tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
|
@@ -331,10 +406,23 @@ class VisionAgentCoderV2(Agent):
|
|
331
406
|
plan=format_plan_v2(plan_context),
|
332
407
|
tool_docs=tool_docs,
|
333
408
|
code_interpreter=code_interpreter,
|
334
|
-
media_list=media_list,
|
335
|
-
update_callback=self.update_callback,
|
409
|
+
media_list=media_list,
|
336
410
|
verbose=self.verbose,
|
337
411
|
)
|
412
|
+
|
413
|
+
self.update_callback(
|
414
|
+
{
|
415
|
+
"role": "coder",
|
416
|
+
"content": format_code_context(code_context),
|
417
|
+
"media": capture_media_from_exec(code_context.test_result),
|
418
|
+
}
|
419
|
+
)
|
420
|
+
self.update_callback(
|
421
|
+
{
|
422
|
+
"role": "observation",
|
423
|
+
"content": code_context.test_result.text(),
|
424
|
+
}
|
425
|
+
)
|
338
426
|
return code_context
|
339
427
|
|
340
428
|
def log_progress(self, data: Dict[str, Any]) -> None:
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
|
+
import time
|
3
4
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
4
5
|
from pathlib import Path
|
5
6
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
@@ -10,16 +11,17 @@ from rich.markup import escape
|
|
10
11
|
|
11
12
|
import vision_agent.tools as T
|
12
13
|
import vision_agent.tools.planner_tools as pt
|
13
|
-
from vision_agent.agent import
|
14
|
+
from vision_agent.agent import AgentPlanner
|
14
15
|
from vision_agent.agent.agent_utils import (
|
15
|
-
PlanContext,
|
16
16
|
add_media_to_chat,
|
17
17
|
capture_media_from_exec,
|
18
|
+
convert_message_to_agentmessage,
|
18
19
|
extract_json,
|
19
20
|
extract_tag,
|
20
21
|
print_code,
|
21
22
|
print_table,
|
22
23
|
)
|
24
|
+
from vision_agent.agent.types import AgentMessage, PlanContext
|
23
25
|
from vision_agent.agent.vision_agent_planner_prompts_v2 import (
|
24
26
|
CRITIQUE_PLAN,
|
25
27
|
EXAMPLE_PLAN1,
|
@@ -70,26 +72,24 @@ class DefaultPlanningImports:
|
|
70
72
|
|
71
73
|
|
72
74
|
def get_planning(
|
73
|
-
chat: List[
|
75
|
+
chat: List[AgentMessage],
|
74
76
|
) -> str:
|
75
77
|
chat = copy.deepcopy(chat)
|
76
78
|
planning = ""
|
77
79
|
for chat_i in chat:
|
78
|
-
if chat_i
|
79
|
-
planning += f"USER: {chat_i
|
80
|
-
elif chat_i
|
81
|
-
planning += f"OBSERVATION: {chat_i
|
82
|
-
elif chat_i
|
83
|
-
planning += f"
|
84
|
-
else:
|
85
|
-
raise ValueError(f"Unknown role: {chat_i['role']}")
|
80
|
+
if chat_i.role == "user":
|
81
|
+
planning += f"USER: {chat_i.content}\n\n"
|
82
|
+
elif chat_i.role == "observation":
|
83
|
+
planning += f"OBSERVATION: {chat_i.content}\n\n"
|
84
|
+
elif chat_i.role == "planner":
|
85
|
+
planning += f"AGENT: {chat_i.content}\n\n"
|
86
86
|
|
87
87
|
return planning
|
88
88
|
|
89
89
|
|
90
90
|
def run_planning(
|
91
|
-
chat: List[
|
92
|
-
media_list: List[str],
|
91
|
+
chat: List[AgentMessage],
|
92
|
+
media_list: List[Union[str, Path]],
|
93
93
|
model: LMM,
|
94
94
|
) -> str:
|
95
95
|
# only keep last 10 messages for planning
|
@@ -102,16 +102,16 @@ def run_planning(
|
|
102
102
|
)
|
103
103
|
|
104
104
|
message: Message = {"role": "user", "content": prompt}
|
105
|
-
if chat[-1]
|
106
|
-
message["media"] = chat[-1]
|
105
|
+
if chat[-1].role == "observation" and chat[-1].media is not None:
|
106
|
+
message["media"] = chat[-1].media
|
107
107
|
|
108
108
|
response = model.chat([message])
|
109
109
|
return cast(str, response)
|
110
110
|
|
111
111
|
|
112
112
|
def run_multi_trial_planning(
|
113
|
-
chat: List[
|
114
|
-
media_list: List[str],
|
113
|
+
chat: List[AgentMessage],
|
114
|
+
media_list: List[Union[str, Path]],
|
115
115
|
model: LMM,
|
116
116
|
) -> str:
|
117
117
|
planning = get_planning(chat)
|
@@ -123,8 +123,8 @@ def run_multi_trial_planning(
|
|
123
123
|
)
|
124
124
|
|
125
125
|
message: Message = {"role": "user", "content": prompt}
|
126
|
-
if chat[-1]
|
127
|
-
message["media"] = chat[-1]
|
126
|
+
if chat[-1].role == "observation" and chat[-1].media is not None:
|
127
|
+
message["media"] = chat[-1].media
|
128
128
|
|
129
129
|
responses = []
|
130
130
|
with ThreadPoolExecutor() as executor:
|
@@ -151,7 +151,9 @@ def run_multi_trial_planning(
|
|
151
151
|
return cast(str, responses[0])
|
152
152
|
|
153
153
|
|
154
|
-
def run_critic(
|
154
|
+
def run_critic(
|
155
|
+
chat: List[AgentMessage], media_list: List[Union[str, Path]], model: LMM
|
156
|
+
) -> Optional[str]:
|
155
157
|
planning = get_planning(chat)
|
156
158
|
prompt = CRITIQUE_PLAN.format(
|
157
159
|
planning=planning,
|
@@ -196,17 +198,19 @@ def response_safeguards(response: str) -> str:
|
|
196
198
|
def execute_code_action(
|
197
199
|
code: str,
|
198
200
|
code_interpreter: CodeInterpreter,
|
199
|
-
chat: List[
|
201
|
+
chat: List[AgentMessage],
|
200
202
|
model: LMM,
|
201
203
|
verbose: bool = False,
|
202
204
|
) -> Tuple[Execution, str, str]:
|
203
205
|
if verbose:
|
204
206
|
print_code("Code to Execute:", code)
|
207
|
+
start = time.time()
|
205
208
|
execution = code_interpreter.exec_cell(DefaultPlanningImports.prepend_imports(code))
|
209
|
+
end = time.time()
|
206
210
|
obs = execution.text(include_results=False).strip()
|
207
211
|
if verbose:
|
208
212
|
_CONSOLE.print(
|
209
|
-
f"[bold cyan]Code Execution Output:[/bold cyan] [yellow]{escape(obs)}[/yellow]"
|
213
|
+
f"[bold cyan]Code Execution Output ({end - start:.2f} sec):[/bold cyan] [yellow]{escape(obs)}[/yellow]"
|
210
214
|
)
|
211
215
|
|
212
216
|
count = 1
|
@@ -246,13 +250,13 @@ def find_and_replace_code(response: str, code: str) -> str:
|
|
246
250
|
def maybe_run_code(
|
247
251
|
code: Optional[str],
|
248
252
|
response: str,
|
249
|
-
chat: List[
|
250
|
-
media_list: List[str],
|
253
|
+
chat: List[AgentMessage],
|
254
|
+
media_list: List[Union[str, Path]],
|
251
255
|
model: LMM,
|
252
256
|
code_interpreter: CodeInterpreter,
|
253
257
|
verbose: bool = False,
|
254
|
-
) -> List[
|
255
|
-
return_chat: List[
|
258
|
+
) -> List[AgentMessage]:
|
259
|
+
return_chat: List[AgentMessage] = []
|
256
260
|
if code is not None:
|
257
261
|
code = code_safeguards(code)
|
258
262
|
execution, obs, code = execute_code_action(
|
@@ -262,30 +266,32 @@ def maybe_run_code(
|
|
262
266
|
# if we had to debug the code to fix an issue, replace the old code
|
263
267
|
# with the fixed code in the response
|
264
268
|
fixed_response = find_and_replace_code(response, code)
|
265
|
-
return_chat.append(
|
269
|
+
return_chat.append(
|
270
|
+
AgentMessage(role="planner", content=fixed_response, media=None)
|
271
|
+
)
|
266
272
|
|
267
273
|
media_data = capture_media_from_exec(execution)
|
268
|
-
int_chat_elt
|
274
|
+
int_chat_elt = AgentMessage(role="observation", content=obs, media=None)
|
269
275
|
if media_list:
|
270
|
-
int_chat_elt
|
276
|
+
int_chat_elt.media = cast(List[Union[str, Path]], media_data)
|
271
277
|
return_chat.append(int_chat_elt)
|
272
278
|
else:
|
273
|
-
return_chat.append(
|
279
|
+
return_chat.append(AgentMessage(role="planner", content=response, media=None))
|
274
280
|
return return_chat
|
275
281
|
|
276
282
|
|
277
283
|
def create_finalize_plan(
|
278
|
-
chat: List[
|
284
|
+
chat: List[AgentMessage],
|
279
285
|
model: LMM,
|
280
286
|
verbose: bool = False,
|
281
|
-
) -> Tuple[List[
|
287
|
+
) -> Tuple[List[AgentMessage], PlanContext]:
|
282
288
|
prompt = FINALIZE_PLAN.format(
|
283
289
|
planning=get_planning(chat),
|
284
290
|
excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
|
285
291
|
)
|
286
292
|
response = model.chat([{"role": "user", "content": prompt}])
|
287
293
|
plan_str = cast(str, response)
|
288
|
-
return_chat
|
294
|
+
return_chat = [AgentMessage(role="planner", content=plan_str, media=None)]
|
289
295
|
|
290
296
|
plan_json = extract_tag(plan_str, "json")
|
291
297
|
plan = (
|
@@ -305,7 +311,16 @@ def create_finalize_plan(
|
|
305
311
|
return return_chat, PlanContext(**plan)
|
306
312
|
|
307
313
|
|
308
|
-
|
314
|
+
def get_steps(chat: List[AgentMessage], max_steps: int) -> int:
    """Return the value of the most recent usable ``<count>`` tag in the chat.

    Parameters:
        chat (List[AgentMessage]): The messages to search, scanned from the last
            message to the first.
        max_steps (int): The value returned when no usable ``<count>`` tag is found.

    Returns:
        int: The integer inside the newest ``<count>`` tag that can be parsed,
            otherwise ``max_steps``.
    """
    for chat_elt in reversed(chat):
        if "<count>" not in chat_elt.content:
            continue
        count = extract_tag(chat_elt.content, "count")
        # extract_tag can yield None even though the opening tag is present, and
        # the tag body is free-form text. int() would raise TypeError/ValueError
        # in those cases, so skip the unusable tag and keep looking.
        if count is None:
            continue
        try:
            return int(count)
        except ValueError:
            continue
    return max_steps
|
319
|
+
|
320
|
+
|
321
|
+
class VisionAgentPlannerV2(AgentPlanner):
|
322
|
+
"""VisionAgentPlannerV2 is a class that generates a plan to solve a vision task."""
|
323
|
+
|
309
324
|
def __init__(
|
310
325
|
self,
|
311
326
|
planner: Optional[LMM] = None,
|
@@ -317,6 +332,25 @@ class VisionAgentPlannerV2(Agent):
|
|
317
332
|
code_sandbox_runtime: Optional[str] = None,
|
318
333
|
update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
|
319
334
|
) -> None:
|
335
|
+
"""Initialize the VisionAgentPlannerV2.
|
336
|
+
|
337
|
+
Parameters:
|
338
|
+
planner (Optional[LMM]): The language model to use for planning. If None, a
|
339
|
+
default AnthropicLMM will be used.
|
340
|
+
critic (Optional[LMM]): The language model to use for critiquing the plan.
|
341
|
+
If None, a default AnthropicLMM will be used.
|
342
|
+
max_steps (int): The maximum number of steps to plan.
|
343
|
+
use_multi_trial_planning (bool): Whether to use multi-trial planning.
|
344
|
+
critique_steps (int): The number of steps between critiques. If critic steps
|
345
|
+
is larger than max_steps no critiques will be made.
|
346
|
+
verbose (bool): Whether to print out debug information.
|
347
|
+
code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
|
348
|
+
be one of: None, "local" or "e2b". If None, it will read from the
|
349
|
+
environment variable CODE_SANDBOX_RUNTIME.
|
350
|
+
update_callback (Callable[[Dict[str, Any]], None]): The callback function
|
351
|
+
that will send back intermediate conversation messages.
|
352
|
+
"""
|
353
|
+
|
320
354
|
self.planner = (
|
321
355
|
planner
|
322
356
|
if planner is not None
|
@@ -339,20 +373,42 @@ class VisionAgentPlannerV2(Agent):
|
|
339
373
|
self,
|
340
374
|
input: Union[str, List[Message]],
|
341
375
|
media: Optional[Union[str, Path]] = None,
|
342
|
-
) ->
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
376
|
+
) -> str:
|
377
|
+
"""Generate a plan to solve a vision task.
|
378
|
+
|
379
|
+
Parameters:
|
380
|
+
input (Union[str, List[Message]]): The input to the agent. This can be a
|
381
|
+
string or a list of messages in the format of [{"role": "user",
|
382
|
+
"content": "describe your task here..."}, ...].
|
383
|
+
media (Optional[Union[str, Path]]): The path to the media file to use with
|
384
|
+
the input. This can be an image or video file.
|
385
|
+
|
386
|
+
Returns:
|
387
|
+
str: The generated plan as a string.
|
388
|
+
"""
|
389
|
+
|
390
|
+
input_msg = convert_message_to_agentmessage(input, media)
|
391
|
+
plan = self.generate_plan(input_msg)
|
392
|
+
return plan.plan
|
350
393
|
|
351
394
|
def generate_plan(
|
352
395
|
self,
|
353
|
-
chat: List[
|
396
|
+
chat: List[AgentMessage],
|
397
|
+
max_steps: Optional[int] = None,
|
354
398
|
code_interpreter: Optional[CodeInterpreter] = None,
|
355
399
|
) -> PlanContext:
|
400
|
+
"""Generate a plan to solve a vision task.
|
401
|
+
|
402
|
+
Parameters:
|
403
|
+
chat (List[AgentMessage]): The conversation messages to generate a plan for.
|
404
|
+
max_steps (Optional[int]): The maximum number of steps to plan.
|
405
|
+
code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
|
406
|
+
|
407
|
+
Returns:
|
408
|
+
PlanContext: The generated plan including the instructions and code snippets
|
409
|
+
needed to solve the task.
|
410
|
+
"""
|
411
|
+
|
356
412
|
if not chat:
|
357
413
|
raise ValueError("Chat cannot be empty")
|
358
414
|
|
@@ -360,13 +416,16 @@ class VisionAgentPlannerV2(Agent):
|
|
360
416
|
code_interpreter = code_interpreter or CodeInterpreterFactory.new_instance(
|
361
417
|
self.code_sandbox_runtime
|
362
418
|
)
|
419
|
+
max_steps = max_steps or self.max_steps
|
363
420
|
|
364
421
|
with code_interpreter:
|
365
422
|
critque_steps = 1
|
366
|
-
step = self.max_steps
|
367
423
|
finished = False
|
368
424
|
int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
|
369
|
-
|
425
|
+
|
426
|
+
step = get_steps(int_chat, max_steps)
|
427
|
+
if "<count>" not in int_chat[-1].content and step == max_steps:
|
428
|
+
int_chat[-1].content += f"\n<count>{step}</count>\n"
|
370
429
|
while step > 0 and not finished:
|
371
430
|
if self.use_multi_trial_planning:
|
372
431
|
response = run_multi_trial_planning(
|
@@ -402,29 +461,29 @@ class VisionAgentPlannerV2(Agent):
|
|
402
461
|
|
403
462
|
if critque_steps % self.critique_steps == 0:
|
404
463
|
critique = run_critic(int_chat, media_list, self.critic)
|
405
|
-
if critique is not None and int_chat[-1]
|
464
|
+
if critique is not None and int_chat[-1].role == "observation":
|
406
465
|
_CONSOLE.print(
|
407
466
|
f"[bold cyan]Critique:[/bold cyan] [red]{critique}[/red]"
|
408
467
|
)
|
409
468
|
critique_str = f"\n[critique]\n{critique}\n[end of critique]"
|
410
|
-
updated_chat[-1]
|
469
|
+
updated_chat[-1].content += critique_str
|
411
470
|
# if plan was critiqued, ensure we don't finish so we can
|
412
471
|
# respond to the critique
|
413
472
|
finished = False
|
414
473
|
|
415
474
|
critque_steps += 1
|
416
475
|
step -= 1
|
417
|
-
updated_chat[-1]
|
476
|
+
updated_chat[-1].content += f"\n<count>{step}</count>\n"
|
418
477
|
int_chat.extend(updated_chat)
|
419
478
|
for chat_elt in updated_chat:
|
420
|
-
self.update_callback(chat_elt)
|
479
|
+
self.update_callback(chat_elt.model_dump())
|
421
480
|
|
422
481
|
updated_chat, plan_context = create_finalize_plan(
|
423
482
|
int_chat, self.planner, self.verbose
|
424
483
|
)
|
425
484
|
int_chat.extend(updated_chat)
|
426
485
|
for chat_elt in updated_chat:
|
427
|
-
self.update_callback(chat_elt)
|
486
|
+
self.update_callback(chat_elt.model_dump())
|
428
487
|
|
429
488
|
return plan_context
|
430
489
|
|
@@ -55,10 +55,10 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
|
|
55
55
|
|
56
56
|
OBSERVATION:
|
57
57
|
[Artifact dog_detector.py (5 lines total)]
|
58
|
-
0|from vision_agent.tools import load_image,
|
58
|
+
0|from vision_agent.tools import load_image, owl_v2_image
|
59
59
|
1|def detect_dogs(image_path: str):
|
60
60
|
2| image = load_image(image_path)
|
61
|
-
3| dogs =
|
61
|
+
3| dogs = owl_v2_image("dog", image)
|
62
62
|
4| return dogs
|
63
63
|
[End of artifact]
|
64
64
|
|
@@ -96,10 +96,10 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
|
|
96
96
|
|
97
97
|
OBSERVATION:
|
98
98
|
[Artifact dog_detector.py (5 lines total)]
|
99
|
-
0|from vision_agent.tools import load_image,
|
99
|
+
0|from vision_agent.tools import load_image, owl_v2_image
|
100
100
|
1|def detect_dogs(image_path: str):
|
101
101
|
2| image = load_image(image_path)
|
102
|
-
3| dogs =
|
102
|
+
3| dogs = owl_v2_image("dog", image, threshold=0.24)
|
103
103
|
4| return dogs
|
104
104
|
[End of artifact]
|
105
105
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Prompt template for the conversational agent. The only placeholder is
# `{conversation}`, which is filled with the formatted chat transcript.
CONVERSATION = """
**Role**: You are a help agent that assists users with their vision-related questions.

**Actions**:
`generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user, this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.

`edit_code` - If the user already has code for a vision task, from a previous call to `generate_or_edit_vision_code` and wants to quickly modify the code, you can use this action to edit the code. This is good for non-vision related changes such as formatting the response as a json or removing print statements.

**Examples**:
Here is an example of how you can interact with a user and Actions:
--- START EXAMPLES ---
USER: Can you help me write code to detect dogs in this image? Media name images/dogs.jpg

AGENT: <response>Yes, I can help you with that. I will write the code to detect dogs in the image.</response><action>generate_or_edit_vision_code</action>

OBSERVATION:
<final_code>
from vision_agent.tools import load_image, owl_v2_image
def detect_dogs(image_path: str):
    image = load_image(image_path)
    dogs = owl_v2_image("dog", image)
    return dogs
</final_code>
<final_test>
def test_detect_dogs():
    dogs = detect_dogs("images/dogs.jpg")
    assert len(dogs) > 0
</final_test>

AGENT: <response>Here is the code to detect dogs in the image.</response>

--- END EXAMPLES ---

**Conversation**:
Here is the current conversation so far:
--- START CONVERSATION ---
{conversation}
--- END CONVERSATION ---

**Instructions**:
1. Only respond with a single <response> tag and a single <action> tag.
2. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:

<response>Your response to the user's message</response>
<action>The action you want to take from **Actions**</action>
"""
|
@@ -0,0 +1,215 @@
|
|
1
|
+
import copy
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
4
|
+
|
5
|
+
from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
|
6
|
+
from vision_agent.agent.agent_utils import (
|
7
|
+
add_media_to_chat,
|
8
|
+
convert_message_to_agentmessage,
|
9
|
+
extract_tag,
|
10
|
+
)
|
11
|
+
from vision_agent.agent.types import AgentMessage, PlanContext
|
12
|
+
from vision_agent.agent.vision_agent_coder_v2 import format_code_context
|
13
|
+
from vision_agent.agent.vision_agent_prompts_v2 import CONVERSATION
|
14
|
+
from vision_agent.lmm import LMM, AnthropicLMM
|
15
|
+
from vision_agent.lmm.types import Message
|
16
|
+
from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
|
17
|
+
|
18
|
+
|
19
|
+
def format_conversation(chat: List[AgentMessage]) -> str:
    """Render chat messages as the plain-text transcript used in prompts.

    Each message becomes ``"<LABEL>: <content>"`` followed by a blank line.
    Messages with the role "user" are labelled USER, "observation" and "coder"
    are labelled OBSERVATION, and "conversation" is labelled AGENT. Messages
    with any other role are left out. The input list is only read, never
    modified.

    Parameters:
        chat (List[AgentMessage]): The messages to render, oldest first.

    Returns:
        str: The transcript, or an empty string when nothing is rendered.
    """
    # Transcript label for every role that is shown to the model.
    label_by_role = {
        "user": "USER",
        "observation": "OBSERVATION",
        "coder": "OBSERVATION",
        "conversation": "AGENT",
    }
    return "".join(
        f"{label_by_role[message.role]}: {message.content}\n\n"
        for message in chat
        if message.role in label_by_role
    )
|
30
|
+
|
31
|
+
|
32
|
+
def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
    """Ask the conversational model for its next reply.

    The ten most recent messages of ``chat`` are formatted into a transcript,
    inserted into the CONVERSATION prompt, and sent to ``agent`` as a single
    user message with streaming disabled.

    Parameters:
        agent (LMM): The model that is called with the prompt.
        chat (List[AgentMessage]): The conversation so far.

    Returns:
        str: The model's response.
    """
    recent_messages = chat[-10:]  # only keep last 10 messages
    transcript = format_conversation(recent_messages)
    filled_prompt = CONVERSATION.format(conversation=transcript)
    user_turn = {"role": "user", "content": filled_prompt}
    reply = agent([user_turn], stream=False)
    return cast(str, reply)
|
40
|
+
|
41
|
+
|
42
|
+
def extract_conversation_for_generate_code(
    chat: List[AgentMessage],
) -> List[AgentMessage]:
    """Select the messages that are handed to the coder.

    Works on a deep copy of ``chat`` and keeps, in their original order, every
    "user" message plus every "coder" message whose content contains both a
    ``<final_code>`` and a ``<final_test>`` tag.

    Parameters:
        chat (List[AgentMessage]): The full conversation.

    Returns:
        List[AgentMessage]: Copies of the selected messages.
    """

    def _keep(message: AgentMessage) -> bool:
        # User turns are always kept; coder turns only when they carry a
        # complete code + test result.
        if message.role == "user":
            return True
        return (
            message.role == "coder"
            and "<final_code>" in message.content
            and "<final_test>" in message.content
        )

    return [message for message in copy.deepcopy(chat) if _keep(message)]
|
55
|
+
|
56
|
+
|
57
|
+
def maybe_run_action(
    coder: AgentCoder,
    action: Optional[str],
    chat: List[AgentMessage],
    code_interpreter: Optional[CodeInterpreter] = None,
) -> Optional[List[AgentMessage]]:
    """Run the coder for ``action`` when the action produces code.

    "generate_or_edit_vision_code" calls ``coder.generate_code`` and
    "edit_code" calls ``coder.generate_code_from_plan`` with a fixed editing
    plan. Both receive the messages selected by
    ``extract_conversation_for_generate_code``. Any other action, including
    "view_image" and ``None``, does nothing.

    Parameters:
        coder (AgentCoder): The coder agent that writes or edits the code.
        action (Optional[str]): The action name chosen by the agent.
        chat (List[AgentMessage]): The conversation so far.
        code_interpreter (Optional[CodeInterpreter]): The code interpreter that
            is forwarded to the coder.

    Returns:
        Optional[List[AgentMessage]]: A "coder" message with the formatted code
            context followed by an "observation" message with the test result
            text, or None when no action was run.
    """
    if action == "generate_or_edit_vision_code":
        relevant_chat = extract_conversation_for_generate_code(chat)
        # coder.generate_code sends its code_context to the outside user via
        # its own update_callback, which is not necessarily available here, so
        # the message is re-created below using format_code_context.
        code_context = coder.generate_code(
            relevant_chat, code_interpreter=code_interpreter
        )
    elif action == "edit_code":
        relevant_chat = extract_conversation_for_generate_code(chat)
        edit_plan = PlanContext(
            plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
            instructions=[],
            code="",
        )
        code_context = coder.generate_code_from_plan(
            relevant_chat, edit_plan, code_interpreter=code_interpreter
        )
    else:
        # "view_image" and unrecognised or missing actions produce no messages.
        return None

    coder_message = AgentMessage(
        role="coder", content=format_code_context(code_context)
    )
    observation_message = AgentMessage(
        role="observation", content=code_context.test_result.text()
    )
    return [coder_message, observation_message]
|
94
|
+
|
95
|
+
|
96
|
+
class VisionAgentV2(Agent):
    """VisionAgentV2 is a conversational agent that allows you to more easily use a
    coder agent such as VisionAgentCoderV2 to write vision code for you.
    """

    def __init__(
        self,
        agent: Optional[LMM] = None,
        coder: Optional[AgentCoder] = None,
        verbose: bool = False,
        code_sandbox_runtime: Optional[str] = None,
        update_callback: Callable[[Dict[str, Any]], None] = lambda x: None,
    ) -> None:
        """Initialize the VisionAgentV2.

        Parameters:
            agent (Optional[LMM]): The language model to use for the agent. If None, a
                default AnthropicLMM will be used.
            coder (Optional[AgentCoder]): The coder agent to use for generating vision
                code. If None, a default VisionAgentCoderV2 will be used.
            verbose (bool): Whether to print out debug information.
            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
                be one of: None, "local" or "e2b". If None, it will read from the
                environment variable CODE_SANDBOX_RUNTIME.
            update_callback (Callable[[Dict[str, Any]], None]): The callback function
                that will send back intermediate conversation messages.
        """

        self.agent = (
            agent
            if agent is not None
            else AnthropicLMM(
                model_name="claude-3-5-sonnet-20241022",
                temperature=0.0,
            )
        )
        self.coder = (
            coder
            if coder is not None
            else VisionAgentCoderV2(verbose=verbose, update_callback=update_callback)
        )

        self.verbose = verbose
        self.code_sandbox_runtime = code_sandbox_runtime
        self.update_callback = update_callback

        # force coder to use the same update_callback
        if hasattr(self.coder, "update_callback"):
            self.coder.update_callback = update_callback

    def __call__(
        self,
        input: Union[str, List[Message]],
        media: Optional[Union[str, Path]] = None,
    ) -> str:
        """Conversational interface to the agent. This is the main method to use to
        interact with the agent. It takes in a string or list of messages and returns
        the agent's response as a string.

        Parameters:
            input (Union[str, List[Message]]): The input to the agent. This can be a
                string or a list of messages in the format of [{"role": "user",
                "content": "describe your task here..."}, ...].
            media (Optional[Union[str, Path]]): The path to the media file to use with
                the input. This can be an image or video file.

        Returns:
            str: The agent's response as a string.
        """

        input_msg = convert_message_to_agentmessage(input, media)
        # Only the content of the last message produced by chat() is returned.
        return self.chat(input_msg)[-1].content

    def chat(
        self,
        chat: List[AgentMessage],
    ) -> List[AgentMessage]:
        """Conversational interface to the agent. This is the main method to use to
        interact with the agent. It takes in a list of messages and returns the agent's
        response as a list of messages.

        Parameters:
            chat (List[AgentMessage]): The input to the agent. This should be a list of
                AgentMessage objects.

        Returns:
            List[AgentMessage]: The agent's response as a list of AgentMessage objects.
        """

        return_chat: List[AgentMessage] = []
        with CodeInterpreterFactory.new_instance(
            self.code_sandbox_runtime
        ) as code_interpreter:
            int_chat, _, _ = add_media_to_chat(chat, code_interpreter)
            response_context = run_conversation(self.agent, int_chat)
            return_chat.append(
                AgentMessage(role="conversation", content=response_context)
            )
            self.update_callback(return_chat[-1].model_dump())

            action = extract_tag(response_context, "action")

            updated_chat = maybe_run_action(
                self.coder, action, int_chat, code_interpreter=code_interpreter
            )
            if updated_chat is not None:
                # do not append updated_chat to return_chat because the observation
                # from running the action will have already been added via the callbacks.
                # The earlier conversation (int_chat) is included so the agent
                # still sees the user's request when it comments on the result.
                obs_response_context = run_conversation(
                    self.agent, [*int_chat, *return_chat, *updated_chat]
                )
                return_chat.append(
                    AgentMessage(role="conversation", content=obs_response_context)
                )
                self.update_callback(return_chat[-1].model_dump())

        return return_chat

    def log_progress(self, data: Dict[str, Any]) -> None:
        """Accept a progress update and do nothing with it."""
        pass
|
vision_agent/tools/tools.py
CHANGED
@@ -2453,7 +2453,6 @@ FUNCTION_TOOLS = [
|
|
2453
2453
|
owl_v2_image,
|
2454
2454
|
owl_v2_video,
|
2455
2455
|
ocr,
|
2456
|
-
clip,
|
2457
2456
|
vit_image_classification,
|
2458
2457
|
vit_nsfw_classification,
|
2459
2458
|
countgd_counting,
|
@@ -2471,6 +2470,7 @@ FUNCTION_TOOLS = [
|
|
2471
2470
|
qwen2_vl_video_vqa,
|
2472
2471
|
video_temporal_localization,
|
2473
2472
|
flux_image_inpainting,
|
2473
|
+
siglip_classification,
|
2474
2474
|
]
|
2475
2475
|
|
2476
2476
|
UTIL_TOOLS = [
|
vision_agent/utils/execute.py
CHANGED
@@ -1,19 +1,22 @@
|
|
1
|
-
vision_agent/.sim_tools/df.csv,sha256=
|
2
|
-
vision_agent/.sim_tools/embs.npy,sha256=
|
1
|
+
vision_agent/.sim_tools/df.csv,sha256=0fmLwTDjnRTiqYwamTOdCPjruE6wZz0AVrONIPTHxZY,34086
|
2
|
+
vision_agent/.sim_tools/embs.npy,sha256=xF8Cg7Xd09QCTySj831aL1O2_0kRNaaH8XRJIRjgWzQ,356480
|
3
3
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
4
|
-
vision_agent/agent/__init__.py,sha256=
|
5
|
-
vision_agent/agent/agent.py,sha256=
|
6
|
-
vision_agent/agent/agent_utils.py,sha256=
|
4
|
+
vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
|
5
|
+
vision_agent/agent/agent.py,sha256=sf8JcA3LNy_4GaS_gQb2Q-PXkl4dBuGh-7raI9KAtZo,1470
|
6
|
+
vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
|
7
|
+
vision_agent/agent/types.py,sha256=aAd_ez1-NQh04k27cmywyOV2uA_vWWYE-Ok7zq_JoAk,1532
|
7
8
|
vision_agent/agent/vision_agent.py,sha256=rr1P9iTbr7OsjgMYWCeIxQYI4cLwPWia3NIMJNi-9Yo,26110
|
8
9
|
vision_agent/agent/vision_agent_coder.py,sha256=waCmw_NTgsy9G-UqlRZFhsFJJVuWVrjxVnShe4Xp_lI,27743
|
9
10
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
10
11
|
vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
|
11
|
-
vision_agent/agent/vision_agent_coder_v2.py,sha256=
|
12
|
+
vision_agent/agent/vision_agent_coder_v2.py,sha256=SVIJC0N5TBgq9z-F99UebLimRuQuAe_HHvTFupBzVfo,14715
|
12
13
|
vision_agent/agent/vision_agent_planner.py,sha256=F_5opnc0XmQmNH40rs2T7DFrai4CC6aDYe02Z8e93AM,18875
|
13
14
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=Y3jz9HRf8fz9NLUseN7cTgZqewP0RazxR7vw1sPhcn0,6691
|
14
|
-
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=
|
15
|
-
vision_agent/agent/vision_agent_planner_v2.py,sha256=
|
16
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
15
|
+
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=Tzon3h5iZdHJglesk8GVS-2myNf5-fhf7HUbkpZWHQk,33143
|
16
|
+
vision_agent/agent/vision_agent_planner_v2.py,sha256=mxQxD_B8sKYharh8e7W0uc1tN11YCztyLowc83seScc,17023
|
17
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=PENFd8VM_vHKxeZPiotVM1RBVW9NrXimKbpvI1UteKI,13772
|
18
|
+
vision_agent/agent/vision_agent_prompts_v2.py,sha256=-vCWat-ARlCOOOeIDIFhg-kcwRRwjTXYEwsvvqPeaCs,1972
|
19
|
+
vision_agent/agent/vision_agent_v2.py,sha256=Cudp_ZZBI9rDwMjIYlvY4jzh_srsulYgfRWZLo4_2TQ,8366
|
17
20
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
21
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
19
22
|
vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
|
@@ -27,16 +30,16 @@ vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB
|
|
27
30
|
vision_agent/tools/planner_tools.py,sha256=FROahw_6Taqvytv6pOjCHUEypOfjsi_f8Vo1c5vz6Mw,8823
|
28
31
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
29
32
|
vision_agent/tools/tool_utils.py,sha256=GDGOmBCo4UfYz-DJ-olREJHPsqs5mzHu0YXiAnpNE8E,10179
|
30
|
-
vision_agent/tools/tools.py,sha256=
|
33
|
+
vision_agent/tools/tools.py,sha256=wXDs0m_Yb601FQVp5fPYYVtt4lHUeMnuqIbfDZhsE4Q,87852
|
31
34
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
32
35
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
33
36
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
34
|
-
vision_agent/utils/execute.py,sha256=
|
37
|
+
vision_agent/utils/execute.py,sha256=b3AA1G16Ixwlgd-kke13brKclxh5nJXQTrk25oj1W3o,28027
|
35
38
|
vision_agent/utils/image_utils.py,sha256=rRWcxKggPXIRXIY_XT9rZt30ECDRq8zq7FDeXRDqQWs,11679
|
36
39
|
vision_agent/utils/sim.py,sha256=NZc9QGD6BTY5O29NVbHH7oxDePL_QMnylT1lYcDUn1Y,7437
|
37
40
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
38
41
|
vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
|
39
|
-
vision_agent-0.2.
|
40
|
-
vision_agent-0.2.
|
41
|
-
vision_agent-0.2.
|
42
|
-
vision_agent-0.2.
|
42
|
+
vision_agent-0.2.200.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
43
|
+
vision_agent-0.2.200.dist-info/METADATA,sha256=goRTW73tD79-UlJiy4cL0twnVYm9iSjU9f5HsC4A1ZI,19026
|
44
|
+
vision_agent-0.2.200.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
45
|
+
vision_agent-0.2.200.dist-info/RECORD,,
|
File without changes
|
File without changes
|