vision-agent 0.2.199__py3-none-any.whl → 0.2.200__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,4 @@
1
- from .agent import Agent
1
+ from .agent import Agent, AgentCoder, AgentPlanner
2
2
  from .vision_agent import VisionAgent
3
3
  from .vision_agent_coder import (
4
4
  AnthropicVisionAgentCoder,
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
17
17
  VisionAgentPlanner,
18
18
  )
19
19
  from .vision_agent_planner_v2 import VisionAgentPlannerV2
20
+ from .vision_agent_v2 import VisionAgentV2
@@ -2,7 +2,9 @@ from abc import ABC, abstractmethod
2
2
  from pathlib import Path
3
3
  from typing import Any, Dict, List, Optional, Union
4
4
 
5
+ from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
5
6
  from vision_agent.lmm.types import Message
7
+ from vision_agent.utils.execute import CodeInterpreter
6
8
 
7
9
 
8
10
  class Agent(ABC):
@@ -20,3 +22,34 @@ class Agent(ABC):
20
22
  This is a hook that is intended for reporting the progress of the agent.
21
23
  """
22
24
  pass
25
+
26
+
27
+ class AgentCoder(Agent):
28
+ @abstractmethod
29
+ def generate_code(
30
+ self,
31
+ chat: List[AgentMessage],
32
+ max_steps: Optional[int] = None,
33
+ code_interpreter: Optional[CodeInterpreter] = None,
34
+ ) -> CodeContext:
35
+ pass
36
+
37
+ @abstractmethod
38
+ def generate_code_from_plan(
39
+ self,
40
+ chat: List[AgentMessage],
41
+ plan_context: PlanContext,
42
+ code_interpreter: Optional[CodeInterpreter] = None,
43
+ ) -> CodeContext:
44
+ pass
45
+
46
+
47
+ class AgentPlanner(Agent):
48
+ @abstractmethod
49
+ def generate_plan(
50
+ self,
51
+ chat: List[AgentMessage],
52
+ max_steps: Optional[int] = None,
53
+ code_interpreter: Optional[CodeInterpreter] = None,
54
+ ) -> PlanContext:
55
+ pass
@@ -4,16 +4,17 @@ import logging
4
4
  import re
5
5
  import sys
6
6
  import tempfile
7
- from typing import Any, Dict, List, Optional, Tuple, cast
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
8
9
 
9
10
  import libcst as cst
10
- from pydantic import BaseModel
11
11
  from rich.console import Console
12
12
  from rich.style import Style
13
13
  from rich.syntax import Syntax
14
14
  from rich.table import Table
15
15
 
16
16
  import vision_agent.tools as T
17
+ from vision_agent.agent.types import AgentMessage, PlanContext
17
18
  from vision_agent.lmm.types import Message
18
19
  from vision_agent.utils.execute import CodeInterpreter, Execution
19
20
  from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
@@ -24,19 +25,6 @@ _CONSOLE = Console()
24
25
  _MAX_TABULATE_COL_WIDTH = 80
25
26
 
26
27
 
27
- class PlanContext(BaseModel):
28
- plan: str
29
- instructions: List[str]
30
- code: str
31
-
32
-
33
- class CodeContext(BaseModel):
34
- code: str
35
- test: str
36
- success: bool
37
- test_result: Execution
38
-
39
-
40
28
  def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
41
29
  json_pattern = r"\{.*\}"
42
30
  match = re.search(json_pattern, json_str, re.DOTALL)
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
228
216
 
229
217
 
230
218
  def add_media_to_chat(
231
- chat: List[Message], code_interpreter: CodeInterpreter
232
- ) -> Tuple[List[Message], List[Message], List[str]]:
219
+ chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
220
+ ) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
233
221
  orig_chat = copy.deepcopy(chat)
234
222
  int_chat = copy.deepcopy(chat)
235
- media_list = []
223
+ media_list: List[Union[str, Path]] = []
236
224
  for chat_i in int_chat:
237
- if "media" in chat_i:
238
- media_list_i = []
239
- for media in chat_i["media"]:
225
+ if chat_i.media is not None:
226
+ media_list_i: List[Union[str, Path]] = []
227
+ for media in chat_i.media:
240
228
  if isinstance(media, str) and media.startswith("data:image/"):
241
229
  media_pil = b64_to_pil(media)
242
230
  with tempfile.NamedTemporaryFile(
@@ -244,25 +232,29 @@ def add_media_to_chat(
244
232
  ) as temp_file:
245
233
  media_pil.save(temp_file, format="PNG")
246
234
  media = str(temp_file.name)
247
- media = str(code_interpreter.upload_file(media)) # type: ignore
235
+ if code_interpreter is not None:
236
+ media = str(code_interpreter.upload_file(media))
248
237
  media_list_i.append(media)
249
- # don't duplicate appending media name
250
- if not str(chat_i["content"]).endswith(f" Media name {media}"):
251
- chat_i["content"] += f" Media name {media}" # type: ignore
252
- chat_i["media"] = media_list_i
238
+ # don't duplicate appending media name and only add them for user messages
239
+ if (
240
+ not str(chat_i.content).endswith(f" Media name {media}")
241
+ and chat_i.role == "user"
242
+ ):
243
+ chat_i.content += f" Media name {media}"
244
+ chat_i.media = media_list_i if len(media_list_i) > 0 else None
253
245
  media_list.extend(media_list_i)
254
246
 
255
247
  int_chat = cast(
256
- List[Message],
248
+ List[AgentMessage],
257
249
  [
258
250
  (
259
- {
260
- "role": c["role"],
261
- "content": c["content"],
262
- "media": c["media"],
263
- }
264
- if "media" in c
265
- else {"role": c["role"], "content": c["content"]}
251
+ AgentMessage(
252
+ role=c.role,
253
+ content=c.content,
254
+ media=c.media,
255
+ )
256
+ if c.media is not None
257
+ else AgentMessage(role=c.role, content=c.content, media=None)
266
258
  )
267
259
  for c in int_chat
268
260
  ],
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
283
275
  return images
284
276
 
285
277
 
278
+ def convert_message_to_agentmessage(
279
+ input: Union[str, List[Message]],
280
+ media: Optional[Union[str, Path]] = None,
281
+ ) -> List[AgentMessage]:
282
+ if isinstance(input, str):
283
+ input_msg = [
284
+ AgentMessage(
285
+ role="user",
286
+ content=input,
287
+ media=([media] if media is not None else None),
288
+ )
289
+ ]
290
+ else:
291
+ input_msg = [
292
+ AgentMessage(role=msg["role"], content=msg["content"], media=None)
293
+ for msg in input
294
+ ]
295
+ input_msg[0].media = [media] if media is not None else None
296
+ return input_msg
297
+
298
+
286
299
  def strip_function_calls( # noqa: C901
287
300
  code: str, exclusions: Optional[List[str]] = None
288
301
  ) -> str:
@@ -0,0 +1,51 @@
1
+ from pathlib import Path
2
+ from typing import List, Literal, Optional, Union
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from vision_agent.utils.execute import Execution
7
+
8
+
9
+ class AgentMessage(BaseModel):
10
+ """AgentMessage encompasses messages sent to the entire Agentic system, which includes
11
+ both LMMs and sub-agents.
12
+
13
+ user: The user's message.
14
+ assistant: The assistant's message.
15
+ observation: An observation made after conducting an action, either by the user or
16
+ assistant.
17
+ interaction: An interaction between the user and the assistant. For example if the
18
+ assistant wants to ask the user for help on a task, it could send an
19
+ interaction message.
20
+ conversation: Messages coming from the conversation agent, this is a type of
21
+ assistant messages.
22
+ planner: Messages coming from the planner agent, this is a type of assistant
23
+ messages.
24
+ coder: Messages coming from the coder agent, this is a type of assistant messages.
25
+
26
+ """
27
+
28
+ role: Union[
29
+ Literal["user"],
30
+ Literal["assistant"], # planner, coder and conversation are of type assistant
31
+ Literal["observation"],
32
+ Literal["interaction"],
33
+ Literal["conversation"],
34
+ Literal["planner"],
35
+ Literal["coder"],
36
+ ]
37
+ content: str
38
+ media: Optional[List[Union[str, Path]]] = None
39
+
40
+
41
+ class PlanContext(BaseModel):
42
+ plan: str
43
+ instructions: List[str]
44
+ code: str
45
+
46
+
47
+ class CodeContext(BaseModel):
48
+ code: str
49
+ test: str
50
+ success: bool
51
+ test_result: Execution
@@ -6,19 +6,19 @@ from rich.console import Console
6
6
  from rich.markup import escape
7
7
 
8
8
  import vision_agent.tools as T
9
- from vision_agent.agent import Agent
9
+ from vision_agent.agent import AgentCoder, AgentPlanner
10
10
  from vision_agent.agent.agent_utils import (
11
- CodeContext,
12
11
  DefaultImports,
13
- PlanContext,
14
12
  add_media_to_chat,
15
13
  capture_media_from_exec,
14
+ convert_message_to_agentmessage,
16
15
  extract_tag,
17
16
  format_feedback,
18
17
  format_plan_v2,
19
18
  print_code,
20
19
  strip_function_calls,
21
20
  )
21
+ from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
22
22
  from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
23
23
  from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
24
24
  from vision_agent.lmm import LMM, AnthropicLMM
@@ -34,6 +34,12 @@ from vision_agent.utils.sim import Sim, load_cached_sim
34
34
  _CONSOLE = Console()
35
35
 
36
36
 
37
+ def format_code_context(
38
+ code_context: CodeContext,
39
+ ) -> str:
40
+ return f"<final_code>{code_context.code}</final_code>\n<final_test>{code_context.test}</final_test>"
41
+
42
+
37
43
  def retrieve_tools(
38
44
  plan: List[str],
39
45
  tool_recommender: Sim,
@@ -49,46 +55,54 @@ def retrieve_tools(
49
55
 
50
56
  def write_code(
51
57
  coder: LMM,
52
- chat: List[Message],
58
+ chat: List[AgentMessage],
53
59
  tool_docs: str,
54
60
  plan: str,
55
61
  ) -> str:
56
62
  chat = copy.deepcopy(chat)
57
- if chat[-1]["role"] != "user":
63
+ if chat[-1].role != "user":
58
64
  raise ValueError("Last chat message must be from the user.")
59
65
 
60
- user_request = chat[-1]["content"]
66
+ user_request = chat[-1].content
61
67
  prompt = CODE.format(
62
68
  docstring=tool_docs,
63
69
  question=user_request,
64
70
  plan=plan,
65
71
  )
66
- chat[-1]["content"] = prompt
67
- response = coder(chat, stream=False)
68
- return extract_tag(response, "code") # type: ignore
72
+ response = cast(str, coder([{"role": "user", "content": prompt}], stream=False))
73
+ maybe_code = extract_tag(response, "code")
74
+
75
+ # if the response wasn't properly formatted with the code tags just return the response
76
+ if maybe_code is None:
77
+ return response
78
+ return maybe_code
69
79
 
70
80
 
71
81
  def write_test(
72
82
  tester: LMM,
73
- chat: List[Message],
83
+ chat: List[AgentMessage],
74
84
  tool_util_docs: str,
75
85
  code: str,
76
86
  media_list: Optional[Sequence[Union[str, Path]]] = None,
77
87
  ) -> str:
78
88
  chat = copy.deepcopy(chat)
79
- if chat[-1]["role"] != "user":
89
+ if chat[-1].role != "user":
80
90
  raise ValueError("Last chat message must be from the user.")
81
91
 
82
- user_request = chat[-1]["content"]
92
+ user_request = chat[-1].content
83
93
  prompt = TEST.format(
84
94
  docstring=tool_util_docs,
85
95
  question=user_request,
86
96
  code=code,
87
97
  media=media_list,
88
98
  )
89
- chat[-1]["content"] = prompt
90
- response = tester(chat, stream=False)
91
- return extract_tag(response, "code") # type: ignore
99
+ response = cast(str, tester([{"role": "user", "content": prompt}], stream=False))
100
+ maybe_code = extract_tag(response, "code")
101
+
102
+ # if the response wasn't properly formatted with the code tags just return the response
103
+ if maybe_code is None:
104
+ return response
105
+ return maybe_code
92
106
 
93
107
 
94
108
  def debug_code(
@@ -170,12 +184,11 @@ def write_and_test_code(
170
184
  coder: LMM,
171
185
  tester: LMM,
172
186
  debugger: LMM,
173
- chat: List[Message],
187
+ chat: List[AgentMessage],
174
188
  plan: str,
175
189
  tool_docs: str,
176
190
  code_interpreter: CodeInterpreter,
177
191
  media_list: List[Union[str, Path]],
178
- update_callback: Callable[[Dict[str, Any]], None],
179
192
  verbose: bool,
180
193
  ) -> CodeContext:
181
194
  code = write_code(
@@ -226,14 +239,6 @@ def write_and_test_code(
226
239
  f"[bold cyan]Code execution result after attempted fix:[/bold cyan] [yellow]{escape(result.text(include_logs=True))}[/yellow]"
227
240
  )
228
241
 
229
- update_callback(
230
- {
231
- "role": "assistant",
232
- "content": f"<final_code>{DefaultImports.to_code_string()}\n{code}</final_code>\n<final_test>{DefaultImports.to_code_string()}\n{test}</final_test>",
233
- "media": capture_media_from_exec(result),
234
- }
235
- )
236
-
237
242
  return CodeContext(
238
243
  code=f"{DefaultImports.to_code_string()}\n{code}",
239
244
  test=f"{DefaultImports.to_code_string()}\n{test}",
@@ -242,10 +247,12 @@ def write_and_test_code(
242
247
  )
243
248
 
244
249
 
245
- class VisionAgentCoderV2(Agent):
250
+ class VisionAgentCoderV2(AgentCoder):
251
+ """VisionAgentCoderV2 is an agent that will write vision code for you."""
252
+
246
253
  def __init__(
247
254
  self,
248
- planner: Optional[Agent] = None,
255
+ planner: Optional[AgentPlanner] = None,
249
256
  coder: Optional[LMM] = None,
250
257
  tester: Optional[LMM] = None,
251
258
  debugger: Optional[LMM] = None,
@@ -254,6 +261,25 @@ class VisionAgentCoderV2(Agent):
254
261
  code_sandbox_runtime: Optional[str] = None,
255
262
  update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
256
263
  ) -> None:
264
+ """Initialize the VisionAgentCoderV2.
265
+
266
+ Parameters:
267
+ planner (Optional[AgentPlanner]): The planner agent to use for generating
268
+ vision plans. If None, a default VisionAgentPlannerV2 will be used.
269
+ coder (Optional[LMM]): The language model to use for the coder agent. If
270
+ None, a default AnthropicLMM will be used.
271
+ tester (Optional[LMM]): The language model to use for the tester agent. If
272
+ None, a default AnthropicLMM will be used.
273
+ debugger (Optional[LMM]): The language model to use for the debugger agent.
274
+ tool_recommender (Optional[Union[str, Sim]]): The tool recommender to use.
275
+ verbose (bool): Whether to print out debug information.
276
+ code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
277
+ be one of: None, "local" or "e2b". If None, it will read from the
278
+ environment variable CODE_SANDBOX_RUNTIME.
279
+ update_callback (Callable[[Dict[str, Any]], None]): The callback function
280
+ that will send back intermediate conversation messages.
281
+ """
282
+
257
283
  self.planner = (
258
284
  planner
259
285
  if planner is not None
@@ -290,20 +316,52 @@ class VisionAgentCoderV2(Agent):
290
316
  self,
291
317
  input: Union[str, List[Message]],
292
318
  media: Optional[Union[str, Path]] = None,
293
- ) -> Union[str, List[Message]]:
294
- if isinstance(input, str):
295
- input = [{"role": "user", "content": input}]
296
- if media is not None:
297
- input[0]["media"] = [media]
298
- return self.generate_code(input).code
299
-
300
- def generate_code(self, chat: List[Message]) -> CodeContext:
319
+ ) -> str:
320
+ """Generate vision code from a conversation.
321
+
322
+ Parameters:
323
+ input (Union[str, List[Message]]): The input to the agent. This can be a
324
+ string or a list of messages in the format of [{"role": "user",
325
+ "content": "describe your task here..."}, ...].
326
+ media (Optional[Union[str, Path]]): The path to the media file to use with
327
+ the input. This can be an image or video file.
328
+
329
+ Returns:
330
+ str: The generated code as a string.
331
+ """
332
+
333
+ input_msg = convert_message_to_agentmessage(input, media)
334
+ return self.generate_code(input_msg).code
335
+
336
+ def generate_code(
337
+ self,
338
+ chat: List[AgentMessage],
339
+ max_steps: Optional[int] = None,
340
+ code_interpreter: Optional[CodeInterpreter] = None,
341
+ ) -> CodeContext:
342
+ """Generate vision code from a conversation.
343
+
344
+ Parameters:
345
+ chat (List[AgentMessage]): The input to the agent. This should be a list of
346
+ AgentMessage objects.
347
+ code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
348
+
349
+ Returns:
350
+ CodeContext: The generated code as a CodeContext object which includes the
351
+ code, test code, whether or not it was executed successfully, and the
352
+ execution result.
353
+ """
354
+
301
355
  chat = copy.deepcopy(chat)
302
- with CodeInterpreterFactory.new_instance(
303
- self.code_sandbox_runtime
356
+ with (
357
+ CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
358
+ if code_interpreter is None
359
+ else code_interpreter
304
360
  ) as code_interpreter:
305
361
  int_chat, orig_chat, _ = add_media_to_chat(chat, code_interpreter)
306
- plan_context = self.planner.generate_plan(int_chat, code_interpreter) # type: ignore
362
+ plan_context = self.planner.generate_plan(
363
+ int_chat, max_steps=max_steps, code_interpreter=code_interpreter
364
+ )
307
365
  code_context = self.generate_code_from_plan(
308
366
  orig_chat,
309
367
  plan_context,
@@ -313,13 +371,30 @@ class VisionAgentCoderV2(Agent):
313
371
 
314
372
  def generate_code_from_plan(
315
373
  self,
316
- chat: List[Message],
374
+ chat: List[AgentMessage],
317
375
  plan_context: PlanContext,
318
376
  code_interpreter: Optional[CodeInterpreter] = None,
319
377
  ) -> CodeContext:
378
+ """Generate vision code from a conversation and a previously made plan. This
379
+ will skip the planning step and go straight to generating code.
380
+
381
+ Parameters:
382
+ chat (List[AgentMessage]): The input to the agent. This should be a list of
383
+ AgentMessage objects.
384
+ plan_context (PlanContext): The plan context that was previously generated.
385
+ code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
386
+
387
+ Returns:
388
+ CodeContext: The generated code as a CodeContext object which includes the
389
+ code, test code, whether or not it was exceuted successfully, and the
390
+ execution result.
391
+ """
392
+
320
393
  chat = copy.deepcopy(chat)
321
- with CodeInterpreterFactory.new_instance(
322
- self.code_sandbox_runtime
394
+ with (
395
+ CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
396
+ if code_interpreter is None
397
+ else code_interpreter
323
398
  ) as code_interpreter:
324
399
  int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
325
400
  tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
@@ -331,10 +406,23 @@ class VisionAgentCoderV2(Agent):
331
406
  plan=format_plan_v2(plan_context),
332
407
  tool_docs=tool_docs,
333
408
  code_interpreter=code_interpreter,
334
- media_list=media_list, # type: ignore
335
- update_callback=self.update_callback,
409
+ media_list=media_list,
336
410
  verbose=self.verbose,
337
411
  )
412
+
413
+ self.update_callback(
414
+ {
415
+ "role": "coder",
416
+ "content": format_code_context(code_context),
417
+ "media": capture_media_from_exec(code_context.test_result),
418
+ }
419
+ )
420
+ self.update_callback(
421
+ {
422
+ "role": "observation",
423
+ "content": code_context.test_result.text(),
424
+ }
425
+ )
338
426
  return code_context
339
427
 
340
428
  def log_progress(self, data: Dict[str, Any]) -> None:
@@ -389,7 +389,7 @@ for infos in obj_to_info:
389
389
  print(f"{len(objects_with_tape)} boxes with tape found")
390
390
  </execute_python>
391
391
 
392
- OBJERVATION:
392
+ OBSERVATION:
393
393
  3 boxes were tracked
394
394
  2 boxes with tape found
395
395
  <count>6</count>
@@ -1,5 +1,6 @@
1
1
  import copy
2
2
  import logging
3
+ import time
3
4
  from concurrent.futures import ThreadPoolExecutor, as_completed
4
5
  from pathlib import Path
5
6
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
@@ -10,16 +11,17 @@ from rich.markup import escape
10
11
 
11
12
  import vision_agent.tools as T
12
13
  import vision_agent.tools.planner_tools as pt
13
- from vision_agent.agent import Agent
14
+ from vision_agent.agent import AgentPlanner
14
15
  from vision_agent.agent.agent_utils import (
15
- PlanContext,
16
16
  add_media_to_chat,
17
17
  capture_media_from_exec,
18
+ convert_message_to_agentmessage,
18
19
  extract_json,
19
20
  extract_tag,
20
21
  print_code,
21
22
  print_table,
22
23
  )
24
+ from vision_agent.agent.types import AgentMessage, PlanContext
23
25
  from vision_agent.agent.vision_agent_planner_prompts_v2 import (
24
26
  CRITIQUE_PLAN,
25
27
  EXAMPLE_PLAN1,
@@ -70,26 +72,24 @@ class DefaultPlanningImports:
70
72
 
71
73
 
72
74
  def get_planning(
73
- chat: List[Message],
75
+ chat: List[AgentMessage],
74
76
  ) -> str:
75
77
  chat = copy.deepcopy(chat)
76
78
  planning = ""
77
79
  for chat_i in chat:
78
- if chat_i["role"] == "user":
79
- planning += f"USER: {chat_i['content']}\n\n"
80
- elif chat_i["role"] == "observation":
81
- planning += f"OBSERVATION: {chat_i['content']}\n\n"
82
- elif chat_i["role"] == "assistant":
83
- planning += f"ASSISTANT: {chat_i['content']}\n\n"
84
- else:
85
- raise ValueError(f"Unknown role: {chat_i['role']}")
80
+ if chat_i.role == "user":
81
+ planning += f"USER: {chat_i.content}\n\n"
82
+ elif chat_i.role == "observation":
83
+ planning += f"OBSERVATION: {chat_i.content}\n\n"
84
+ elif chat_i.role == "planner":
85
+ planning += f"AGENT: {chat_i.content}\n\n"
86
86
 
87
87
  return planning
88
88
 
89
89
 
90
90
  def run_planning(
91
- chat: List[Message],
92
- media_list: List[str],
91
+ chat: List[AgentMessage],
92
+ media_list: List[Union[str, Path]],
93
93
  model: LMM,
94
94
  ) -> str:
95
95
  # only keep last 10 messages for planning
@@ -102,16 +102,16 @@ def run_planning(
102
102
  )
103
103
 
104
104
  message: Message = {"role": "user", "content": prompt}
105
- if chat[-1]["role"] == "observation" and "media" in chat[-1]:
106
- message["media"] = chat[-1]["media"]
105
+ if chat[-1].role == "observation" and chat[-1].media is not None:
106
+ message["media"] = chat[-1].media
107
107
 
108
108
  response = model.chat([message])
109
109
  return cast(str, response)
110
110
 
111
111
 
112
112
  def run_multi_trial_planning(
113
- chat: List[Message],
114
- media_list: List[str],
113
+ chat: List[AgentMessage],
114
+ media_list: List[Union[str, Path]],
115
115
  model: LMM,
116
116
  ) -> str:
117
117
  planning = get_planning(chat)
@@ -123,8 +123,8 @@ def run_multi_trial_planning(
123
123
  )
124
124
 
125
125
  message: Message = {"role": "user", "content": prompt}
126
- if chat[-1]["role"] == "observation" and "media" in chat[-1]:
127
- message["media"] = chat[-1]["media"]
126
+ if chat[-1].role == "observation" and chat[-1].media is not None:
127
+ message["media"] = chat[-1].media
128
128
 
129
129
  responses = []
130
130
  with ThreadPoolExecutor() as executor:
@@ -151,7 +151,9 @@ def run_multi_trial_planning(
151
151
  return cast(str, responses[0])
152
152
 
153
153
 
154
- def run_critic(chat: List[Message], media_list: List[str], model: LMM) -> Optional[str]:
154
+ def run_critic(
155
+ chat: List[AgentMessage], media_list: List[Union[str, Path]], model: LMM
156
+ ) -> Optional[str]:
155
157
  planning = get_planning(chat)
156
158
  prompt = CRITIQUE_PLAN.format(
157
159
  planning=planning,
@@ -196,17 +198,19 @@ def response_safeguards(response: str) -> str:
196
198
  def execute_code_action(
197
199
  code: str,
198
200
  code_interpreter: CodeInterpreter,
199
- chat: List[Message],
201
+ chat: List[AgentMessage],
200
202
  model: LMM,
201
203
  verbose: bool = False,
202
204
  ) -> Tuple[Execution, str, str]:
203
205
  if verbose:
204
206
  print_code("Code to Execute:", code)
207
+ start = time.time()
205
208
  execution = code_interpreter.exec_cell(DefaultPlanningImports.prepend_imports(code))
209
+ end = time.time()
206
210
  obs = execution.text(include_results=False).strip()
207
211
  if verbose:
208
212
  _CONSOLE.print(
209
- f"[bold cyan]Code Execution Output:[/bold cyan] [yellow]{escape(obs)}[/yellow]"
213
+ f"[bold cyan]Code Execution Output ({end - start:.2f} sec):[/bold cyan] [yellow]{escape(obs)}[/yellow]"
210
214
  )
211
215
 
212
216
  count = 1
@@ -246,13 +250,13 @@ def find_and_replace_code(response: str, code: str) -> str:
246
250
  def maybe_run_code(
247
251
  code: Optional[str],
248
252
  response: str,
249
- chat: List[Message],
250
- media_list: List[str],
253
+ chat: List[AgentMessage],
254
+ media_list: List[Union[str, Path]],
251
255
  model: LMM,
252
256
  code_interpreter: CodeInterpreter,
253
257
  verbose: bool = False,
254
- ) -> List[Message]:
255
- return_chat: List[Message] = []
258
+ ) -> List[AgentMessage]:
259
+ return_chat: List[AgentMessage] = []
256
260
  if code is not None:
257
261
  code = code_safeguards(code)
258
262
  execution, obs, code = execute_code_action(
@@ -262,30 +266,32 @@ def maybe_run_code(
262
266
  # if we had to debug the code to fix an issue, replace the old code
263
267
  # with the fixed code in the response
264
268
  fixed_response = find_and_replace_code(response, code)
265
- return_chat.append({"role": "assistant", "content": fixed_response})
269
+ return_chat.append(
270
+ AgentMessage(role="planner", content=fixed_response, media=None)
271
+ )
266
272
 
267
273
  media_data = capture_media_from_exec(execution)
268
- int_chat_elt: Message = {"role": "observation", "content": obs}
274
+ int_chat_elt = AgentMessage(role="observation", content=obs, media=None)
269
275
  if media_list:
270
- int_chat_elt["media"] = media_data
276
+ int_chat_elt.media = cast(List[Union[str, Path]], media_data)
271
277
  return_chat.append(int_chat_elt)
272
278
  else:
273
- return_chat.append({"role": "assistant", "content": response})
279
+ return_chat.append(AgentMessage(role="planner", content=response, media=None))
274
280
  return return_chat
275
281
 
276
282
 
277
283
  def create_finalize_plan(
278
- chat: List[Message],
284
+ chat: List[AgentMessage],
279
285
  model: LMM,
280
286
  verbose: bool = False,
281
- ) -> Tuple[List[Message], PlanContext]:
287
+ ) -> Tuple[List[AgentMessage], PlanContext]:
282
288
  prompt = FINALIZE_PLAN.format(
283
289
  planning=get_planning(chat),
284
290
  excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
285
291
  )
286
292
  response = model.chat([{"role": "user", "content": prompt}])
287
293
  plan_str = cast(str, response)
288
- return_chat: List[Message] = [{"role": "assistant", "content": plan_str}]
294
+ return_chat = [AgentMessage(role="planner", content=plan_str, media=None)]
289
295
 
290
296
  plan_json = extract_tag(plan_str, "json")
291
297
  plan = (
@@ -305,7 +311,16 @@ def create_finalize_plan(
305
311
  return return_chat, PlanContext(**plan)
306
312
 
307
313
 
308
- class VisionAgentPlannerV2(Agent):
314
+ def get_steps(chat: List[AgentMessage], max_steps: int) -> int:
315
+ for chat_elt in reversed(chat):
316
+ if "<count>" in chat_elt.content:
317
+ return int(extract_tag(chat_elt.content, "count")) # type: ignore
318
+ return max_steps
319
+
320
+
321
+ class VisionAgentPlannerV2(AgentPlanner):
322
+ """VisionAgentPlannerV2 is a class that generates a plan to solve a vision task."""
323
+
309
324
  def __init__(
310
325
  self,
311
326
  planner: Optional[LMM] = None,
@@ -317,6 +332,25 @@ class VisionAgentPlannerV2(Agent):
317
332
  code_sandbox_runtime: Optional[str] = None,
318
333
  update_callback: Callable[[Dict[str, Any]], None] = lambda _: None,
319
334
  ) -> None:
335
+ """Initialize the VisionAgentPlannerV2.
336
+
337
+ Parameters:
338
+ planner (Optional[LMM]): The language model to use for planning. If None, a
339
+ default AnthropicLMM will be used.
340
+ critic (Optional[LMM]): The language model to use for critiquing the plan.
341
+ If None, a default AnthropicLMM will be used.
342
+ max_steps (int): The maximum number of steps to plan.
343
+ use_multi_trial_planning (bool): Whether to use multi-trial planning.
344
+ critique_steps (int): The number of steps between critiques. If critique_steps
345
+ is larger than max_steps no critiques will be made.
346
+ verbose (bool): Whether to print out debug information.
347
+ code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
348
+ be one of: None, "local" or "e2b". If None, it will read from the
349
+ environment variable CODE_SANDBOX_RUNTIME.
350
+ update_callback (Callable[[Dict[str, Any]], None]): The callback function
351
+ that will send back intermediate conversation messages.
352
+ """
353
+
320
354
  self.planner = (
321
355
  planner
322
356
  if planner is not None
@@ -339,20 +373,42 @@ class VisionAgentPlannerV2(Agent):
339
373
  self,
340
374
  input: Union[str, List[Message]],
341
375
  media: Optional[Union[str, Path]] = None,
342
- ) -> Union[str, List[Message]]:
343
- if isinstance(input, str):
344
- if media is not None:
345
- input = [{"role": "user", "content": input, "media": [media]}]
346
- else:
347
- input = [{"role": "user", "content": input}]
348
- plan = self.generate_plan(input)
349
- return str(plan)
376
+ ) -> str:
377
+ """Generate a plan to solve a vision task.
378
+
379
+ Parameters:
380
+ input (Union[str, List[Message]]): The input to the agent. This can be a
381
+ string or a list of messages in the format of [{"role": "user",
382
+ "content": "describe your task here..."}, ...].
383
+ media (Optional[Union[str, Path]]): The path to the media file to use with
384
+ the input. This can be an image or video file.
385
+
386
+ Returns:
387
+ str: The generated plan as a string.
388
+ """
389
+
390
+ input_msg = convert_message_to_agentmessage(input, media)
391
+ plan = self.generate_plan(input_msg)
392
+ return plan.plan
350
393
 
351
394
  def generate_plan(
352
395
  self,
353
- chat: List[Message],
396
+ chat: List[AgentMessage],
397
+ max_steps: Optional[int] = None,
354
398
  code_interpreter: Optional[CodeInterpreter] = None,
355
399
  ) -> PlanContext:
400
+ """Generate a plan to solve a vision task.
401
+
402
+ Parameters:
403
+ chat (List[AgentMessage]): The conversation messages to generate a plan for.
404
+ max_steps (Optional[int]): The maximum number of steps to plan.
405
+ code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
406
+
407
+ Returns:
408
+ PlanContext: The generated plan including the instructions and code snippets
409
+ needed to solve the task.
410
+ """
411
+
356
412
  if not chat:
357
413
  raise ValueError("Chat cannot be empty")
358
414
 
@@ -360,13 +416,16 @@ class VisionAgentPlannerV2(Agent):
360
416
  code_interpreter = code_interpreter or CodeInterpreterFactory.new_instance(
361
417
  self.code_sandbox_runtime
362
418
  )
419
+ max_steps = max_steps or self.max_steps
363
420
 
364
421
  with code_interpreter:
365
422
  critque_steps = 1
366
- step = self.max_steps
367
423
  finished = False
368
424
  int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
369
- int_chat[-1]["content"] += f"\n<count>{step}</count>\n" # type: ignore
425
+
426
+ step = get_steps(int_chat, max_steps)
427
+ if "<count>" not in int_chat[-1].content and step == max_steps:
428
+ int_chat[-1].content += f"\n<count>{step}</count>\n"
370
429
  while step > 0 and not finished:
371
430
  if self.use_multi_trial_planning:
372
431
  response = run_multi_trial_planning(
@@ -402,29 +461,29 @@ class VisionAgentPlannerV2(Agent):
402
461
 
403
462
  if critque_steps % self.critique_steps == 0:
404
463
  critique = run_critic(int_chat, media_list, self.critic)
405
- if critique is not None and int_chat[-1]["role"] == "observation":
464
+ if critique is not None and int_chat[-1].role == "observation":
406
465
  _CONSOLE.print(
407
466
  f"[bold cyan]Critique:[/bold cyan] [red]{critique}[/red]"
408
467
  )
409
468
  critique_str = f"\n[critique]\n{critique}\n[end of critique]"
410
- updated_chat[-1]["content"] += critique_str # type: ignore
469
+ updated_chat[-1].content += critique_str
411
470
  # if plan was critiqued, ensure we don't finish so we can
412
471
  # respond to the critique
413
472
  finished = False
414
473
 
415
474
  critque_steps += 1
416
475
  step -= 1
417
- updated_chat[-1]["content"] += f"\n<count>{step}</count>\n" # type: ignore
476
+ updated_chat[-1].content += f"\n<count>{step}</count>\n"
418
477
  int_chat.extend(updated_chat)
419
478
  for chat_elt in updated_chat:
420
- self.update_callback(chat_elt)
479
+ self.update_callback(chat_elt.model_dump())
421
480
 
422
481
  updated_chat, plan_context = create_finalize_plan(
423
482
  int_chat, self.planner, self.verbose
424
483
  )
425
484
  int_chat.extend(updated_chat)
426
485
  for chat_elt in updated_chat:
427
- self.update_callback(chat_elt)
486
+ self.update_callback(chat_elt.model_dump())
428
487
 
429
488
  return plan_context
430
489
 
@@ -55,10 +55,10 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
55
55
 
56
56
  OBSERVATION:
57
57
  [Artifact dog_detector.py (5 lines total)]
58
- 0|from vision_agent.tools import load_image, owl_v2
58
+ 0|from vision_agent.tools import load_image, owl_v2_image
59
59
  1|def detect_dogs(image_path: str):
60
60
  2| image = load_image(image_path)
61
- 3| dogs = owl_v2("dog", image)
61
+ 3| dogs = owl_v2_image("dog", image)
62
62
  4| return dogs
63
63
  [End of artifact]
64
64
 
@@ -96,10 +96,10 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
96
96
 
97
97
  OBSERVATION:
98
98
  [Artifact dog_detector.py (5 lines total)]
99
- 0|from vision_agent.tools import load_image, owl_v2
99
+ 0|from vision_agent.tools import load_image, owl_v2_image
100
100
  1|def detect_dogs(image_path: str):
101
101
  2| image = load_image(image_path)
102
- 3| dogs = owl_v2("dog", image, threshold=0.24)
102
+ 3| dogs = owl_v2_image("dog", image, threshold=0.24)
103
103
  4| return dogs
104
104
  [End of artifact]
105
105
 
@@ -0,0 +1,46 @@
1
+ CONVERSATION = """
2
+ **Role**: You are a help agent that assists users with their vision-related questions.
3
+
4
+ **Actions**:
5
+ `generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user, this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.
6
+
7
+ `edit_code` - If the user already has code for a vision task, from a previous call to `generate_or_edit_vision_code` and wants to quickly modify the code, you can use this action to edit the code. This is good for non-vision related changes such as formatting the response as a json or removing print statements.
8
+
9
+ **Examples**:
10
+ Here is an example of how you can interact with a user and Actions:
11
+ --- START EXAMPLES ---
12
+ USER: Can you help me write code to detect dogs in this image? Media name images/dogs.jpg
13
+
14
+ AGENT: <response>Yes, I can help you with that. I will write the code to detect dogs in the image.</response><action>generate_or_edit_vision_code</action>
15
+
16
+ OBSERVATION:
17
+ <final_code>
18
+ from vision_agent.tools import load_image, owl_v2_image
19
+ def detect_dogs(image_path: str):
20
+ image = load_image(image_path)
21
+ dogs = owl_v2_image(image)
22
+ return dogs
23
+ </final_code>
24
+ <final_test>
25
+ def test_detect_dogs():
26
+ dogs = detect_dogs("images/dogs.jpg")
27
+ assert len(dogs) > 0
28
+ </final_test>
29
+
30
+ AGENT: <response>Here is the code to detect dogs in the image.</response>
31
+
32
+ --- END EXAMPLES ---
33
+
34
+ **Conversation**:
35
+ Here is the current conversation so far:
36
+ --- START CONVERSATION ---
37
+ {conversation}
38
+ --- END CONVERSATION ---
39
+
40
+ **Instructions**:
41
+ 1. Only respond with a single <response> tag and a single <action> tag.
42
+ 2. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
43
+
44
+ <response>Your response to the user's message</response>
45
+ <action>The action you want to take from **Actions**</action>
46
+ """
@@ -0,0 +1,215 @@
1
+ import copy
2
+ from pathlib import Path
3
+ from typing import Any, Callable, Dict, List, Optional, Union, cast
4
+
5
+ from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
6
+ from vision_agent.agent.agent_utils import (
7
+ add_media_to_chat,
8
+ convert_message_to_agentmessage,
9
+ extract_tag,
10
+ )
11
+ from vision_agent.agent.types import AgentMessage, PlanContext
12
+ from vision_agent.agent.vision_agent_coder_v2 import format_code_context
13
+ from vision_agent.agent.vision_agent_prompts_v2 import CONVERSATION
14
+ from vision_agent.lmm import LMM, AnthropicLMM
15
+ from vision_agent.lmm.types import Message
16
+ from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
17
+
18
+
19
+ def format_conversation(chat: List[AgentMessage]) -> str:
20
+ chat = copy.deepcopy(chat)
21
+ prompt = ""
22
+ for chat_i in chat:
23
+ if chat_i.role == "user":
24
+ prompt += f"USER: {chat_i.content}\n\n"
25
+ elif chat_i.role == "observation" or chat_i.role == "coder":
26
+ prompt += f"OBSERVATION: {chat_i.content}\n\n"
27
+ elif chat_i.role == "conversation":
28
+ prompt += f"AGENT: {chat_i.content}\n\n"
29
+ return prompt
30
+
31
+
32
+ def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
33
+ # only keep last 10 messages
34
+ conv = format_conversation(chat[-10:])
35
+ prompt = CONVERSATION.format(
36
+ conversation=conv,
37
+ )
38
+ response = agent([{"role": "user", "content": prompt}], stream=False)
39
+ return cast(str, response)
40
+
41
+
42
+ def extract_conversation_for_generate_code(
43
+ chat: List[AgentMessage],
44
+ ) -> List[AgentMessage]:
45
+ chat = copy.deepcopy(chat)
46
+ extracted_chat = []
47
+ for chat_i in chat:
48
+ if chat_i.role == "user":
49
+ extracted_chat.append(chat_i)
50
+ elif chat_i.role == "coder":
51
+ if "<final_code>" in chat_i.content and "<final_test>" in chat_i.content:
52
+ extracted_chat.append(chat_i)
53
+
54
+ return extracted_chat
55
+
56
+
57
+ def maybe_run_action(
58
+ coder: AgentCoder,
59
+ action: Optional[str],
60
+ chat: List[AgentMessage],
61
+ code_interpreter: Optional[CodeInterpreter] = None,
62
+ ) -> Optional[List[AgentMessage]]:
63
+ if action == "generate_or_edit_vision_code":
64
+ extracted_chat = extract_conversation_for_generate_code(chat)
65
+ # there's an issue here because coder.generate_code will send its code_context
66
+ # to the outside user via its update_callback, but we don't necessarily have
67
+ # access to that update_callback here, so we re-create the message using
68
+ # format_code_context.
69
+ code_context = coder.generate_code(
70
+ extracted_chat, code_interpreter=code_interpreter
71
+ )
72
+ return [
73
+ AgentMessage(role="coder", content=format_code_context(code_context)),
74
+ AgentMessage(role="observation", content=code_context.test_result.text()),
75
+ ]
76
+ elif action == "edit_code":
77
+ extracted_chat = extract_conversation_for_generate_code(chat)
78
+ plan_context = PlanContext(
79
+ plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
80
+ instructions=[],
81
+ code="",
82
+ )
83
+ code_context = coder.generate_code_from_plan(
84
+ extracted_chat, plan_context, code_interpreter=code_interpreter
85
+ )
86
+ return [
87
+ AgentMessage(role="coder", content=format_code_context(code_context)),
88
+ AgentMessage(role="observation", content=code_context.test_result.text()),
89
+ ]
90
+ elif action == "view_image":
91
+ pass
92
+
93
+ return None
94
+
95
+
96
+ class VisionAgentV2(Agent):
97
+ """VisionAgentV2 is a conversational agent that allows you to more easily use a
98
+ coder agent such as VisionAgentCoderV2 to write vision code for you.
99
+ """
100
+
101
+ def __init__(
102
+ self,
103
+ agent: Optional[LMM] = None,
104
+ coder: Optional[AgentCoder] = None,
105
+ verbose: bool = False,
106
+ code_sandbox_runtime: Optional[str] = None,
107
+ update_callback: Callable[[Dict[str, Any]], None] = lambda x: None,
108
+ ) -> None:
109
+ """Initialize the VisionAgentV2.
110
+
111
+ Parameters:
112
+ agent (Optional[LMM]): The language model to use for the agent. If None, a
113
+ default AnthropicLMM will be used.
114
+ coder (Optional[AgentCoder]): The coder agent to use for generating vision
115
+ code. If None, a default VisionAgentCoderV2 will be used.
116
+ verbose (bool): Whether to print out debug information.
117
+ code_sandbox_runtime (Optional[str]): The code sandbox runtime to use, can
118
+ be one of: None, "local" or "e2b". If None, it will read from the
119
+ environment variable CODE_SANDBOX_RUNTIME.
120
+ update_callback (Callable[[Dict[str, Any]], None]): The callback function
121
+ that will send back intermediate conversation messages.
122
+ """
123
+
124
+ self.agent = (
125
+ agent
126
+ if agent is not None
127
+ else AnthropicLMM(
128
+ model_name="claude-3-5-sonnet-20241022",
129
+ temperature=0.0,
130
+ )
131
+ )
132
+ self.coder = (
133
+ coder
134
+ if coder is not None
135
+ else VisionAgentCoderV2(verbose=verbose, update_callback=update_callback)
136
+ )
137
+
138
+ self.verbose = verbose
139
+ self.code_sandbox_runtime = code_sandbox_runtime
140
+ self.update_callback = update_callback
141
+
142
+ # force coder to use the same update_callback
143
+ if hasattr(self.coder, "update_callback"):
144
+ self.coder.update_callback = update_callback
145
+
146
+ def __call__(
147
+ self,
148
+ input: Union[str, List[Message]],
149
+ media: Optional[Union[str, Path]] = None,
150
+ ) -> str:
151
+ """Conversational interface to the agent. This is the main method to use to
152
+ interact with the agent. It takes in a string or list of messages and returns
153
+ the agent's response as a string.
154
+
155
+ Parameters:
156
+ input (Union[str, List[Message]]): The input to the agent. This can be a
157
+ string or a list of messages in the format of [{"role": "user",
158
+ "content": "describe your task here..."}, ...].
159
+ media (Optional[Union[str, Path]]): The path to the media file to use with
160
+ the input. This can be an image or video file.
161
+
162
+ Returns:
163
+ str: The agent's response as a string.
164
+ """
165
+
166
+ input_msg = convert_message_to_agentmessage(input, media)
167
+ return self.chat(input_msg)[-1].content
168
+
169
+ def chat(
170
+ self,
171
+ chat: List[AgentMessage],
172
+ ) -> List[AgentMessage]:
173
+ """Conversational interface to the agent. This is the main method to use to
174
+ interact with the agent. It takes in a list of messages and returns the agent's
175
+ response as a list of messages.
176
+
177
+ Parameters:
178
+ chat (List[AgentMessage]): The input to the agent. This should be a list of
179
+ AgentMessage objects.
180
+
181
+ Returns:
182
+ List[AgentMessage]: The agent's response as a list of AgentMessage objects.
183
+ """
184
+
185
+ return_chat = []
186
+ with CodeInterpreterFactory.new_instance(
187
+ self.code_sandbox_runtime
188
+ ) as code_interpreter:
189
+ int_chat, _, _ = add_media_to_chat(chat, code_interpreter)
190
+ response_context = run_conversation(self.agent, int_chat)
191
+ return_chat.append(
192
+ AgentMessage(role="conversation", content=response_context)
193
+ )
194
+ self.update_callback(return_chat[-1].model_dump())
195
+
196
+ action = extract_tag(response_context, "action")
197
+
198
+ updated_chat = maybe_run_action(
199
+ self.coder, action, int_chat, code_interpreter=code_interpreter
200
+ )
201
+ if updated_chat is not None:
202
+ # do not append updated_chat to return_chat because the observation
203
+ # from running the action will have already been added via the callbacks
204
+ obs_response_context = run_conversation(
205
+ self.agent, return_chat + updated_chat
206
+ )
207
+ return_chat.append(
208
+ AgentMessage(role="conversation", content=obs_response_context)
209
+ )
210
+ self.update_callback(return_chat[-1].model_dump())
211
+
212
+ return return_chat
213
+
214
+ def log_progress(self, data: Dict[str, Any]) -> None:
215
+ pass
@@ -38,7 +38,7 @@ from vision_agent.utils.exceptions import (
38
38
 
39
39
  load_dotenv()
40
40
  _LOGGER = logging.getLogger(__name__)
41
- _SESSION_TIMEOUT = 600 # 10 minutes
41
+ _SESSION_TIMEOUT = 180 # 3 minutes
42
42
  WORKSPACE = Path(os.getenv("WORKSPACE", ""))
43
43
 
44
44
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.199
3
+ Version: 0.2.200
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -1,19 +1,22 @@
1
1
  vision_agent/.sim_tools/df.csv,sha256=0fmLwTDjnRTiqYwamTOdCPjruE6wZz0AVrONIPTHxZY,34086
2
2
  vision_agent/.sim_tools/embs.npy,sha256=xF8Cg7Xd09QCTySj831aL1O2_0kRNaaH8XRJIRjgWzQ,356480
3
3
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
4
- vision_agent/agent/__init__.py,sha256=j4W3zHXKE96o93ZziY62ZBWgicLYEink1rIU3gPsfwM,548
5
- vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
6
- vision_agent/agent/agent_utils.py,sha256=jDkvanBsT_ZH7MnPWP_Wa_ToPOy4hdy4kTw9FZytwwo,12765
4
+ vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
5
+ vision_agent/agent/agent.py,sha256=sf8JcA3LNy_4GaS_gQb2Q-PXkl4dBuGh-7raI9KAtZo,1470
6
+ vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
7
+ vision_agent/agent/types.py,sha256=aAd_ez1-NQh04k27cmywyOV2uA_vWWYE-Ok7zq_JoAk,1532
7
8
  vision_agent/agent/vision_agent.py,sha256=rr1P9iTbr7OsjgMYWCeIxQYI4cLwPWia3NIMJNi-9Yo,26110
8
9
  vision_agent/agent/vision_agent_coder.py,sha256=waCmw_NTgsy9G-UqlRZFhsFJJVuWVrjxVnShe4Xp_lI,27743
9
10
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
10
11
  vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
11
- vision_agent/agent/vision_agent_coder_v2.py,sha256=LVV5Ij-2s03Cj27VJZI11dMKios8ALYZ4_mZTpeMDJU,10863
12
+ vision_agent/agent/vision_agent_coder_v2.py,sha256=SVIJC0N5TBgq9z-F99UebLimRuQuAe_HHvTFupBzVfo,14715
12
13
  vision_agent/agent/vision_agent_planner.py,sha256=F_5opnc0XmQmNH40rs2T7DFrai4CC6aDYe02Z8e93AM,18875
13
14
  vision_agent/agent/vision_agent_planner_prompts.py,sha256=Y3jz9HRf8fz9NLUseN7cTgZqewP0RazxR7vw1sPhcn0,6691
14
- vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=PrihfrkxbeVQNzR2Vu3UwG_PRjFsjoC9IQko3WfUqPM,33143
15
- vision_agent/agent/vision_agent_planner_v2.py,sha256=11pCfaXXsivV9DKWI7nDcLf5dJV3IyHX0IR4Zn7UC9E,14288
16
- vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
15
+ vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=Tzon3h5iZdHJglesk8GVS-2myNf5-fhf7HUbkpZWHQk,33143
16
+ vision_agent/agent/vision_agent_planner_v2.py,sha256=mxQxD_B8sKYharh8e7W0uc1tN11YCztyLowc83seScc,17023
17
+ vision_agent/agent/vision_agent_prompts.py,sha256=PENFd8VM_vHKxeZPiotVM1RBVW9NrXimKbpvI1UteKI,13772
18
+ vision_agent/agent/vision_agent_prompts_v2.py,sha256=-vCWat-ARlCOOOeIDIFhg-kcwRRwjTXYEwsvvqPeaCs,1972
19
+ vision_agent/agent/vision_agent_v2.py,sha256=Cudp_ZZBI9rDwMjIYlvY4jzh_srsulYgfRWZLo4_2TQ,8366
17
20
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
21
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
19
22
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -31,12 +34,12 @@ vision_agent/tools/tools.py,sha256=wXDs0m_Yb601FQVp5fPYYVtt4lHUeMnuqIbfDZhsE4Q,8
31
34
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
32
35
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
33
36
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
34
- vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo,28028
37
+ vision_agent/utils/execute.py,sha256=b3AA1G16Ixwlgd-kke13brKclxh5nJXQTrk25oj1W3o,28027
35
38
  vision_agent/utils/image_utils.py,sha256=rRWcxKggPXIRXIY_XT9rZt30ECDRq8zq7FDeXRDqQWs,11679
36
39
  vision_agent/utils/sim.py,sha256=NZc9QGD6BTY5O29NVbHH7oxDePL_QMnylT1lYcDUn1Y,7437
37
40
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
38
41
  vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
39
- vision_agent-0.2.199.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
40
- vision_agent-0.2.199.dist-info/METADATA,sha256=NbaPI49uOha3uZXbfOokpji32pilLujBz7DcmhaXW1M,19026
41
- vision_agent-0.2.199.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
42
- vision_agent-0.2.199.dist-info/RECORD,,
42
+ vision_agent-0.2.200.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
43
+ vision_agent-0.2.200.dist-info/METADATA,sha256=goRTW73tD79-UlJiy4cL0twnVYm9iSjU9f5HsC4A1ZI,19026
44
+ vision_agent-0.2.200.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
45
+ vision_agent-0.2.200.dist-info/RECORD,,