vision-agent 0.2.198__tar.gz → 0.2.200__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. {vision_agent-0.2.198 → vision_agent-0.2.200}/PKG-INFO +1 -1
  2. {vision_agent-0.2.198 → vision_agent-0.2.200}/pyproject.toml +1 -1
  3. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/.sim_tools/df.csv +18 -18
  4. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/.sim_tools/embs.npy +0 -0
  5. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/__init__.py +2 -1
  6. vision_agent-0.2.200/vision_agent/agent/agent.py +55 -0
  7. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/agent_utils.py +47 -34
  8. vision_agent-0.2.200/vision_agent/agent/types.py +51 -0
  9. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder_v2.py +131 -43
  10. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner_prompts_v2.py +1 -1
  11. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner_v2.py +109 -50
  12. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_prompts.py +4 -4
  13. vision_agent-0.2.200/vision_agent/agent/vision_agent_prompts_v2.py +46 -0
  14. vision_agent-0.2.200/vision_agent/agent/vision_agent_v2.py +215 -0
  15. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/tools/tools.py +1 -1
  16. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/utils/execute.py +1 -1
  17. vision_agent-0.2.198/vision_agent/agent/agent.py +0 -22
  18. {vision_agent-0.2.198 → vision_agent-0.2.200}/LICENSE +0 -0
  19. {vision_agent-0.2.198 → vision_agent-0.2.200}/README.md +0 -0
  20. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/__init__.py +0 -0
  21. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent.py +0 -0
  22. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder.py +0 -0
  23. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  24. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  25. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner.py +0 -0
  26. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  27. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/clients/__init__.py +0 -0
  28. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/clients/http.py +0 -0
  29. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/clients/landing_public_api.py +0 -0
  30. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/fonts/__init__.py +0 -0
  31. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  32. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/lmm/__init__.py +0 -0
  33. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/lmm/lmm.py +0 -0
  34. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/lmm/types.py +0 -0
  35. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/tools/__init__.py +0 -0
  36. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/tools/meta_tools.py +0 -0
  37. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/tools/planner_tools.py +0 -0
  38. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/tools/prompts.py +0 -0
  39. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/tools/tool_utils.py +0 -0
  40. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/tools/tools_types.py +0 -0
  41. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/utils/__init__.py +0 -0
  42. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/utils/exceptions.py +0 -0
  43. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/utils/image_utils.py +0 -0
  44. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/utils/sim.py +0 -0
  45. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/utils/type_defs.py +0 -0
  46. {vision_agent-0.2.198 → vision_agent-0.2.200}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.198
3
+ Version: 0.2.200
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.198"
7
+ version = "0.2.200"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -80,24 +80,6 @@ desc,doc,name
80
80
  {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
81
81
  ]
82
82
  ",ocr
83
- 'clip' is a tool that can classify an image or a cropped detection given a list of input classes or tags. It returns the same list of the input classes along with their probability scores based on image content.,"clip(image: numpy.ndarray, classes: List[str]) -> Dict[str, Any]:
84
- 'clip' is a tool that can classify an image or a cropped detection given a list
85
- of input classes or tags. It returns the same list of the input classes along with
86
- their probability scores based on image content.
87
-
88
- Parameters:
89
- image (np.ndarray): The image to classify or tag
90
- classes (List[str]): The list of classes or tags that is associated with the image
91
-
92
- Returns:
93
- Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
94
- contains a list of given labels and other a list of scores.
95
-
96
- Example
97
- -------
98
- >>> clip(image, ['dog', 'cat', 'bird'])
99
- {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
100
- ",clip
101
83
  'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
102
84
  'vit_image_classification' is a tool that can classify an image. It returns a
103
85
  list of classes and their probability scores based on image content.
@@ -488,6 +470,24 @@ desc,doc,name
488
470
  ... )
489
471
  >>> save_image(result, ""inpainted_room.png"")
490
472
  ",flux_image_inpainting
473
+ 'siglip_classification' is a tool that can classify an image or a cropped detection given a list of input labels or tags. It returns the same list of the input labels along with their probability scores based on image content.,"siglip_classification(image: numpy.ndarray, labels: List[str]) -> Dict[str, Any]:
474
+ 'siglip_classification' is a tool that can classify an image or a cropped detection given a list
475
+ of input labels or tags. It returns the same list of the input labels along with
476
+ their probability scores based on image content.
477
+
478
+ Parameters:
479
+ image (np.ndarray): The image to classify or tag
480
+ labels (List[str]): The list of labels or tags that is associated with the image
481
+
482
+ Returns:
483
+ Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
484
+ contains a list of given labels and other a list of scores.
485
+
486
+ Example
487
+ -------
488
+ >>> siglip_classification(image, ['dog', 'cat', 'bird'])
489
+ {""labels"": [""dog"", ""cat"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
490
+ ",siglip_classification
491
491
  "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
492
492
  'extract_frames_and_timestamps' extracts frames and timestamps from a video
493
493
  which can be a file path, url or youtube link, returns a list of dictionaries
@@ -1,4 +1,4 @@
1
- from .agent import Agent
1
+ from .agent import Agent, AgentCoder, AgentPlanner
2
2
  from .vision_agent import VisionAgent
3
3
  from .vision_agent_coder import (
4
4
  AnthropicVisionAgentCoder,
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
17
17
  VisionAgentPlanner,
18
18
  )
19
19
  from .vision_agent_planner_v2 import VisionAgentPlannerV2
20
+ from .vision_agent_v2 import VisionAgentV2
@@ -0,0 +1,55 @@
1
+ from abc import ABC, abstractmethod
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
6
+ from vision_agent.lmm.types import Message
7
+ from vision_agent.utils.execute import CodeInterpreter
8
+
9
+
10
+ class Agent(ABC):
11
+ @abstractmethod
12
+ def __call__(
13
+ self,
14
+ input: Union[str, List[Message]],
15
+ media: Optional[Union[str, Path]] = None,
16
+ ) -> Union[str, List[Message]]:
17
+ pass
18
+
19
+ @abstractmethod
20
+ def log_progress(self, data: Dict[str, Any]) -> None:
21
+ """Log the progress of the agent.
22
+ This is a hook that is intended for reporting the progress of the agent.
23
+ """
24
+ pass
25
+
26
+
27
+ class AgentCoder(Agent):
28
+ @abstractmethod
29
+ def generate_code(
30
+ self,
31
+ chat: List[AgentMessage],
32
+ max_steps: Optional[int] = None,
33
+ code_interpreter: Optional[CodeInterpreter] = None,
34
+ ) -> CodeContext:
35
+ pass
36
+
37
+ @abstractmethod
38
+ def generate_code_from_plan(
39
+ self,
40
+ chat: List[AgentMessage],
41
+ plan_context: PlanContext,
42
+ code_interpreter: Optional[CodeInterpreter] = None,
43
+ ) -> CodeContext:
44
+ pass
45
+
46
+
47
+ class AgentPlanner(Agent):
48
+ @abstractmethod
49
+ def generate_plan(
50
+ self,
51
+ chat: List[AgentMessage],
52
+ max_steps: Optional[int] = None,
53
+ code_interpreter: Optional[CodeInterpreter] = None,
54
+ ) -> PlanContext:
55
+ pass
@@ -4,16 +4,17 @@ import logging
4
4
  import re
5
5
  import sys
6
6
  import tempfile
7
- from typing import Any, Dict, List, Optional, Tuple, cast
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
8
9
 
9
10
  import libcst as cst
10
- from pydantic import BaseModel
11
11
  from rich.console import Console
12
12
  from rich.style import Style
13
13
  from rich.syntax import Syntax
14
14
  from rich.table import Table
15
15
 
16
16
  import vision_agent.tools as T
17
+ from vision_agent.agent.types import AgentMessage, PlanContext
17
18
  from vision_agent.lmm.types import Message
18
19
  from vision_agent.utils.execute import CodeInterpreter, Execution
19
20
  from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
@@ -24,19 +25,6 @@ _CONSOLE = Console()
24
25
  _MAX_TABULATE_COL_WIDTH = 80
25
26
 
26
27
 
27
- class PlanContext(BaseModel):
28
- plan: str
29
- instructions: List[str]
30
- code: str
31
-
32
-
33
- class CodeContext(BaseModel):
34
- code: str
35
- test: str
36
- success: bool
37
- test_result: Execution
38
-
39
-
40
28
  def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
41
29
  json_pattern = r"\{.*\}"
42
30
  match = re.search(json_pattern, json_str, re.DOTALL)
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
228
216
 
229
217
 
230
218
  def add_media_to_chat(
231
- chat: List[Message], code_interpreter: CodeInterpreter
232
- ) -> Tuple[List[Message], List[Message], List[str]]:
219
+ chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
220
+ ) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
233
221
  orig_chat = copy.deepcopy(chat)
234
222
  int_chat = copy.deepcopy(chat)
235
- media_list = []
223
+ media_list: List[Union[str, Path]] = []
236
224
  for chat_i in int_chat:
237
- if "media" in chat_i:
238
- media_list_i = []
239
- for media in chat_i["media"]:
225
+ if chat_i.media is not None:
226
+ media_list_i: List[Union[str, Path]] = []
227
+ for media in chat_i.media:
240
228
  if isinstance(media, str) and media.startswith("data:image/"):
241
229
  media_pil = b64_to_pil(media)
242
230
  with tempfile.NamedTemporaryFile(
@@ -244,25 +232,29 @@ def add_media_to_chat(
244
232
  ) as temp_file:
245
233
  media_pil.save(temp_file, format="PNG")
246
234
  media = str(temp_file.name)
247
- media = str(code_interpreter.upload_file(media)) # type: ignore
235
+ if code_interpreter is not None:
236
+ media = str(code_interpreter.upload_file(media))
248
237
  media_list_i.append(media)
249
- # don't duplicate appending media name
250
- if not str(chat_i["content"]).endswith(f" Media name {media}"):
251
- chat_i["content"] += f" Media name {media}" # type: ignore
252
- chat_i["media"] = media_list_i
238
+ # don't duplicate appending media name and only add them for user messages
239
+ if (
240
+ not str(chat_i.content).endswith(f" Media name {media}")
241
+ and chat_i.role == "user"
242
+ ):
243
+ chat_i.content += f" Media name {media}"
244
+ chat_i.media = media_list_i if len(media_list_i) > 0 else None
253
245
  media_list.extend(media_list_i)
254
246
 
255
247
  int_chat = cast(
256
- List[Message],
248
+ List[AgentMessage],
257
249
  [
258
250
  (
259
- {
260
- "role": c["role"],
261
- "content": c["content"],
262
- "media": c["media"],
263
- }
264
- if "media" in c
265
- else {"role": c["role"], "content": c["content"]}
251
+ AgentMessage(
252
+ role=c.role,
253
+ content=c.content,
254
+ media=c.media,
255
+ )
256
+ if c.media is not None
257
+ else AgentMessage(role=c.role, content=c.content, media=None)
266
258
  )
267
259
  for c in int_chat
268
260
  ],
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
283
275
  return images
284
276
 
285
277
 
278
+ def convert_message_to_agentmessage(
279
+ input: Union[str, List[Message]],
280
+ media: Optional[Union[str, Path]] = None,
281
+ ) -> List[AgentMessage]:
282
+ if isinstance(input, str):
283
+ input_msg = [
284
+ AgentMessage(
285
+ role="user",
286
+ content=input,
287
+ media=([media] if media is not None else None),
288
+ )
289
+ ]
290
+ else:
291
+ input_msg = [
292
+ AgentMessage(role=msg["role"], content=msg["content"], media=None)
293
+ for msg in input
294
+ ]
295
+ input_msg[0].media = [media] if media is not None else None
296
+ return input_msg
297
+
298
+
286
299
  def strip_function_calls( # noqa: C901
287
300
  code: str, exclusions: Optional[List[str]] = None
288
301
  ) -> str:
@@ -0,0 +1,51 @@
1
+ from pathlib import Path
2
+ from typing import List, Literal, Optional, Union
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from vision_agent.utils.execute import Execution
7
+
8
+
9
+ class AgentMessage(BaseModel):
10
+ """AgentMessage encompasses messages sent to the entire Agentic system, which includes
11
+ both LMMs and sub-agents.
12
+
13
+ user: The user's message.
14
+ assistant: The assistant's message.
15
+ observation: An observation made after conducting an action, either by the user or
16
+ assistant.
17
+ interaction: An interaction between the user and the assistant. For example if the
18
+ assistant wants to ask the user for help on a task, it could send an
19
+ interaction message.
20
+ conversation: Messages coming from the conversation agent, this is a type of
21
+ assistant messages.
22
+ planner: Messages coming from the planner agent, this is a type of assistant
23
+ messages.
24
+ coder: Messages coming from the coder agent, this is a type of assistant messages.
25
+
26
+ """
27
+
28
+ role: Union[
29
+ Literal["user"],
30
+ Literal["assistant"], # planner, coder and conversation are of type assistant
31
+ Literal["observation"],
32
+ Literal["interaction"],
33
+ Literal["conversation"],
34
+ Literal["planner"],
35
+ Literal["coder"],
36
+ ]
37
+ content: str
38
+ media: Optional[List[Union[str, Path]]] = None
39
+
40
+
41
+ class PlanContext(BaseModel):
42
+ plan: str
43
+ instructions: List[str]
44
+ code: str
45
+
46
+
47
+ class CodeContext(BaseModel):
48
+ code: str
49
+ test: str
50
+ success: bool
51
+ test_result: Execution
@@ -6,19 +6,19 @@ from rich.console import Console
6
6
  from rich.markup import escape
7
7
 
8
8
  import vision_agent.tools as T
9
- from vision_agent.agent import Agent
9
+ from vision_agent.agent import AgentCoder, AgentPlanner
10
10
  from vision_agent.agent.agent_utils import (
11
- CodeContext,
12
11
  DefaultImports,
13
- PlanContext,
14
12
  add_media_to_chat,
15
13
  capture_media_from_exec,
14
+ convert_message_to_agentmessage,
16
15
  extract_tag,
17
16
  format_feedback,
18
17
  format_plan_v2,
19
18
  print_code,
20
19
  strip_function_calls,
21
20
  )
21
+ from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
22
22
  from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
23
23
  from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
24
24
  from vision_agent.lmm import LMM, AnthropicLMM
@@ -34,6 +34,12 @@ from vision_agent.utils.sim import Sim, load_cached_sim
34
34
  _CONSOLE = Console()
35
35
 
36
36
 
37
+ def format_code_context(
38
+ code_context: CodeContext,
39
+ ) -> str:
40
+ return f"<final_code>{code_context.code}</final_code>\n<final_test>{code_context.test}</final_test>"
41
+
42
+
37
43
  def retrieve_tools(
38
44
  plan: List[str],
39
45
  tool_recommender: Sim,
@@ -49,46 +55,54 @@ def retrieve_tools(
49
55
 
50
56
  def write_code(
51
57
  coder: LMM,
52
- chat: List[Message],
58
+ chat: List[AgentMessage],
53
59
  tool_docs: str,
54
60
  plan: str,
55
61
  ) -> str:
56
62
  chat = copy.deepcopy(chat)
57
- if chat[-1]["role"] != "user":
63
+ if chat[-1].role != "user":
58
64
  raise ValueError("Last chat message must be from the user.")
59
65
 
60
- user_request = chat[-1]["content"]
66
+ user_request = chat[-1].content
61
67
  prompt = CODE.format(
62
68
  docstring=tool_docs,
63
69
  question=user_request,
64
70
  plan=plan,
65
71
  )
66
- chat[-1]["content"] = prompt
67
- response = coder(chat, stream=False)
68
- return extract_tag(response, "code") # type: ignore
72
+ response = cast(str, coder([{"role": "user", "content": prompt}], stream=False))
73
+ maybe_code = extract_tag(response, "code")
74
+
75
+ # if the response wasn't properly formatted with the code tags just return the response
76
+ if maybe_code is None:
77
+ return response
78
+ return maybe_code
69
79
 
70
80
 
71
81
  def write_test(
72
82
  tester: LMM,
73
- chat: List[Message],
83
+ chat: List[AgentMessage],
74
84
  tool_util_docs: str,
75
85
  code: str,
76
86
  media_list: Optional[Sequence[Union[str, Path]]] = None,
77
87
  ) -> str:
78
88
  chat = copy.deepcopy(chat)
79
- if chat[-1]["role"] != "user":
89
+ if chat[-1].role != "user":
80
90
  raise ValueError("Last chat message must be from the user.")
81
91
 
82
- user_request = chat[-1]["content"]
92
+ user_request = chat[-1].content
83
93
  prompt = TEST.format(
84
94
  docstring=tool_util_docs,
85
95
  question=user_request,
86
96
  code=code,
87
97
  media=media_list,
88
98
  )
89
- chat[-1]["content"] = prompt
90
- response = tester(chat, stream=False)
91
- return extract_tag(response, "code") # type: ignore
99
+ response = cast(str, tester([{"role": "user", "content": prompt}], stream=False))
100
+ maybe_code = extract_tag(response, "code")
101
+
102
+ # if the response wasn't properly formatted with the code tags just return the response
103
+ if maybe_code is None:
104
+ return response
105
+ return maybe_code
92
106
 
93
107
 
94
108
  def debug_code(
@@ -170,12 +184,11 @@ def write_and_test_code(
170
184
  coder: LMM,
171
185
  tester: LMM,
172
186
  debugger: LMM,
173
- chat: List[Message],
187
+ chat: List[AgentMessage],
174
188
  plan: str,
175
189
  tool_docs: str,
176
190
  code_interpreter: CodeInterpreter,
177
191
  media_list: List[Union[str, Path]],
178
- update_callback: Callable[[Dict[str, Any]], None],
179
192
  verbose: bool,
180
193
  ) -> CodeContext:
181
194
  code = write_code(
@@ -226,14 +239,6 @@ def write_and_test_code(
226
239
  f"[bold cyan]Code execution result after attempted fix:[/bold cyan] [yellow]{escape(result.text(include_logs=True))}[/yellow]"
227
240
  )
228
241
 
229
- update_callback(
230
- {
231
- "role": "assistant",
232
- "content": f"<final_code>{DefaultImports.to_code_string()}\n{code}</final_code>\n<final_test>{DefaultImports.to_code_string()}\n{test}</final_test>",
233
- "media": capture_media_from_exec(result),
234
- }
235
- )
236
-
237
242
  return CodeContext(
238
243
  code=f"{DefaultImports.to_code_string()}\n{code}",
239
244
  test=f"{DefaultImports.to_code_string()}\n{test}",
@@ -242,10 +247,12 @@ def write_and_test_code(
242
247
  )
243
248
 
244
249
 
245
- class VisionAgentCoderV2(Agent):
250
+ class VisionAgentCoderV2(AgentCoder):
251
+ """VisionAgentCoderV2 is an agent that will write vision code for you."""
252
+
246
253
  def __init__(
247
254
  self,
248
- planner: Optional[Agent] = None,
255
+ planner: Optional[AgentPlanner] = None,
249
256
  coder: Optional[LMM] = None,
250
257
  tester: Optional[LMM] = None,
251
258
  debugger: Optional[LMM] = None,
@@ -254,6 +261,25 @@ class VisionAgentCoderV2(Agent):
254
261
  code_sandbox_runtime: Optional[str] = None,
255
262
  update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
256
263
  ) -> None:
264
+ """Initialize the VisionAgentCoderV2.
265
+
266
+ Parameters:
267
+ planner (Optional[AgentPlanner]): The planner agent to use for generating
268
+ vision plans. If None, a default VisionAgentPlannerV2 will be used.
269
+ coder (Optional[LMM]): The language model to use for the coder agent. If
270
+ None, a default AnthropicLMM will be used.
271
+ tester (Optional[LMM]): The language model to use for the tester agent. If
272
+ None, a default AnthropicLMM will be used.
273
+ debugger (Optional[LMM]): The language model to use for the debugger agent.
274
+ tool_recommender (Optional[Union[str, Sim]]): The tool recommender to use.
275
+ verbose (bool): Whether to print out debug information.
276
+ code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
277
+ be one of: None, "local" or "e2b". If None, it will read from the
278
+ environment variable CODE_SANDBOX_RUNTIME.
279
+ update_callback (Callable[[Dict[str, Any]], None]): The callback function
280
+ that will send back intermediate conversation messages.
281
+ """
282
+
257
283
  self.planner = (
258
284
  planner
259
285
  if planner is not None
@@ -290,20 +316,52 @@ class VisionAgentCoderV2(Agent):
290
316
  self,
291
317
  input: Union[str, List[Message]],
292
318
  media: Optional[Union[str, Path]] = None,
293
- ) -> Union[str, List[Message]]:
294
- if isinstance(input, str):
295
- input = [{"role": "user", "content": input}]
296
- if media is not None:
297
- input[0]["media"] = [media]
298
- return self.generate_code(input).code
299
-
300
- def generate_code(self, chat: List[Message]) -> CodeContext:
319
+ ) -> str:
320
+ """Generate vision code from a conversation.
321
+
322
+ Parameters:
323
+ input (Union[str, List[Message]]): The input to the agent. This can be a
324
+ string or a list of messages in the format of [{"role": "user",
325
+ "content": "describe your task here..."}, ...].
326
+ media (Optional[Union[str, Path]]): The path to the media file to use with
327
+ the input. This can be an image or video file.
328
+
329
+ Returns:
330
+ str: The generated code as a string.
331
+ """
332
+
333
+ input_msg = convert_message_to_agentmessage(input, media)
334
+ return self.generate_code(input_msg).code
335
+
336
+ def generate_code(
337
+ self,
338
+ chat: List[AgentMessage],
339
+ max_steps: Optional[int] = None,
340
+ code_interpreter: Optional[CodeInterpreter] = None,
341
+ ) -> CodeContext:
342
+ """Generate vision code from a conversation.
343
+
344
+ Parameters:
345
+ chat (List[AgentMessage]): The input to the agent. This should be a list of
346
+ AgentMessage objects.
347
+ code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
348
+
349
+ Returns:
350
+ CodeContext: The generated code as a CodeContext object which includes the
351
+ code, test code, whether or not it was executed successfully, and the
352
+ execution result.
353
+ """
354
+
301
355
  chat = copy.deepcopy(chat)
302
- with CodeInterpreterFactory.new_instance(
303
- self.code_sandbox_runtime
356
+ with (
357
+ CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
358
+ if code_interpreter is None
359
+ else code_interpreter
304
360
  ) as code_interpreter:
305
361
  int_chat, orig_chat, _ = add_media_to_chat(chat, code_interpreter)
306
- plan_context = self.planner.generate_plan(int_chat, code_interpreter) # type: ignore
362
+ plan_context = self.planner.generate_plan(
363
+ int_chat, max_steps=max_steps, code_interpreter=code_interpreter
364
+ )
307
365
  code_context = self.generate_code_from_plan(
308
366
  orig_chat,
309
367
  plan_context,
@@ -313,13 +371,30 @@ class VisionAgentCoderV2(Agent):
313
371
 
314
372
  def generate_code_from_plan(
315
373
  self,
316
- chat: List[Message],
374
+ chat: List[AgentMessage],
317
375
  plan_context: PlanContext,
318
376
  code_interpreter: Optional[CodeInterpreter] = None,
319
377
  ) -> CodeContext:
378
+ """Generate vision code from a conversation and a previously made plan. This
379
+ will skip the planning step and go straight to generating code.
380
+
381
+ Parameters:
382
+ chat (List[AgentMessage]): The input to the agent. This should be a list of
383
+ AgentMessage objects.
384
+ plan_context (PlanContext): The plan context that was previously generated.
385
+ code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
386
+
387
+ Returns:
388
+ CodeContext: The generated code as a CodeContext object which includes the
389
+ code, test code, whether or not it was executed successfully, and the
390
+ execution result.
391
+ """
392
+
320
393
  chat = copy.deepcopy(chat)
321
- with CodeInterpreterFactory.new_instance(
322
- self.code_sandbox_runtime
394
+ with (
395
+ CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
396
+ if code_interpreter is None
397
+ else code_interpreter
323
398
  ) as code_interpreter:
324
399
  int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
325
400
  tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
@@ -331,10 +406,23 @@ class VisionAgentCoderV2(Agent):
331
406
  plan=format_plan_v2(plan_context),
332
407
  tool_docs=tool_docs,
333
408
  code_interpreter=code_interpreter,
334
- media_list=media_list, # type: ignore
335
- update_callback=self.update_callback,
409
+ media_list=media_list,
336
410
  verbose=self.verbose,
337
411
  )
412
+
413
+ self.update_callback(
414
+ {
415
+ "role": "coder",
416
+ "content": format_code_context(code_context),
417
+ "media": capture_media_from_exec(code_context.test_result),
418
+ }
419
+ )
420
+ self.update_callback(
421
+ {
422
+ "role": "observation",
423
+ "content": code_context.test_result.text(),
424
+ }
425
+ )
338
426
  return code_context
339
427
 
340
428
  def log_progress(self, data: Dict[str, Any]) -> None:
@@ -389,7 +389,7 @@ for infos in obj_to_info:
389
389
  print(f"{len(objects_with_tape)} boxes with tape found")
390
390
  </execute_python>
391
391
 
392
- OBJERVATION:
392
+ OBSERVATION:
393
393
  3 boxes were tracked
394
394
  2 boxes with tape found
395
395
  <count>6</count>