vision-agent 0.2.241__tar.gz → 0.2.242__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {vision_agent-0.2.241 → vision_agent-0.2.242}/PKG-INFO +1 -1
  2. {vision_agent-0.2.241 → vision_agent-0.2.242}/pyproject.toml +1 -1
  3. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/agent.py +3 -2
  4. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_coder_v2.py +6 -1
  5. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_planner_v2.py +27 -10
  6. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_prompts_v2.py +15 -3
  7. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_v2.py +25 -6
  8. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/models/__init__.py +7 -1
  9. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/models/agent_types.py +16 -1
  10. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/agent.py +5 -4
  11. {vision_agent-0.2.241 → vision_agent-0.2.242}/LICENSE +0 -0
  12. {vision_agent-0.2.241 → vision_agent-0.2.242}/README.md +0 -0
  13. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/.sim_tools/df.csv +0 -0
  14. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/.sim_tools/embs.npy +0 -0
  15. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/__init__.py +0 -0
  16. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/README.md +0 -0
  17. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/__init__.py +0 -0
  18. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent.py +0 -0
  19. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_coder.py +0 -0
  20. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  21. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  22. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_planner.py +0 -0
  23. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  24. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  25. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_prompts.py +0 -0
  26. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/clients/__init__.py +0 -0
  27. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/clients/http.py +0 -0
  28. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/configs/__init__.py +0 -0
  29. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/configs/anthropic_config.py +0 -0
  30. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/configs/anthropic_openai_config.py +0 -0
  31. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/configs/config.py +0 -0
  32. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/configs/openai_config.py +0 -0
  33. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/fonts/__init__.py +0 -0
  34. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  35. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/lmm/__init__.py +0 -0
  36. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/lmm/lmm.py +0 -0
  37. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/models/lmm_types.py +0 -0
  38. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/models/tools_types.py +0 -0
  39. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/sim/__init__.py +0 -0
  40. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/sim/sim.py +0 -0
  41. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/tools/__init__.py +0 -0
  42. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/tools/meta_tools.py +0 -0
  43. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/tools/planner_tools.py +0 -0
  44. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/tools/prompts.py +0 -0
  45. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/tools/tools.py +0 -0
  46. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/__init__.py +0 -0
  47. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/exceptions.py +0 -0
  48. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/execute.py +0 -0
  49. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/image_utils.py +0 -0
  50. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/tools.py +0 -0
  51. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/tools_doc.py +0 -0
  52. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/type_defs.py +0 -0
  53. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/video.py +0 -0
  54. {vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/video_tracking.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.241
3
+ Version: 0.2.242
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.241"
7
+ version = "0.2.242"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union
5
5
  from vision_agent.models import (
6
6
  AgentMessage,
7
7
  CodeContext,
8
+ ErrorContext,
8
9
  InteractionContext,
9
10
  Message,
10
11
  PlanContext,
@@ -36,7 +37,7 @@ class AgentCoder(Agent):
36
37
  chat: List[AgentMessage],
37
38
  max_steps: Optional[int] = None,
38
39
  code_interpreter: Optional[CodeInterpreter] = None,
39
- ) -> Union[CodeContext, InteractionContext]:
40
+ ) -> Union[CodeContext, InteractionContext, ErrorContext]:
40
41
  pass
41
42
 
42
43
  @abstractmethod
@@ -56,5 +57,5 @@ class AgentPlanner(Agent):
56
57
  chat: List[AgentMessage],
57
58
  max_steps: Optional[int] = None,
58
59
  code_interpreter: Optional[CodeInterpreter] = None,
59
- ) -> Union[PlanContext, InteractionContext]:
60
+ ) -> Union[PlanContext, InteractionContext, ErrorContext]:
60
61
  pass
@@ -13,6 +13,7 @@ from vision_agent.lmm import LMM
13
13
  from vision_agent.models import (
14
14
  AgentMessage,
15
15
  CodeContext,
16
+ ErrorContext,
16
17
  InteractionContext,
17
18
  Message,
18
19
  PlanContext,
@@ -365,6 +366,8 @@ class VisionAgentCoderV2(AgentCoder):
365
366
  code_or_interaction = self.generate_code(input_msg)
366
367
  if isinstance(code_or_interaction, InteractionContext):
367
368
  return code_or_interaction.chat[-1].content
369
+ elif isinstance(code_or_interaction, ErrorContext):
370
+ return code_or_interaction.error
368
371
  return code_or_interaction.code
369
372
 
370
373
  def generate_code(
@@ -372,7 +375,7 @@ class VisionAgentCoderV2(AgentCoder):
372
375
  chat: List[AgentMessage],
373
376
  max_steps: Optional[int] = None,
374
377
  code_interpreter: Optional[CodeInterpreter] = None,
375
- ) -> Union[CodeContext, InteractionContext]:
378
+ ) -> Union[CodeContext, InteractionContext, ErrorContext]:
376
379
  """Generate vision code from a conversation.
377
380
 
378
381
  Parameters:
@@ -404,6 +407,8 @@ class VisionAgentCoderV2(AgentCoder):
404
407
  # the planner needs an interaction, so return before generating code
405
408
  if isinstance(plan_context, InteractionContext):
406
409
  return plan_context
410
+ elif isinstance(plan_context, ErrorContext):
411
+ return plan_context
407
412
 
408
413
  code_context = self.generate_code_from_plan(
409
414
  orig_chat,
@@ -24,7 +24,13 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
24
24
  )
25
25
  from vision_agent.configs import Config
26
26
  from vision_agent.lmm import LMM
27
- from vision_agent.models import AgentMessage, InteractionContext, Message, PlanContext
27
+ from vision_agent.models import (
28
+ AgentMessage,
29
+ ErrorContext,
30
+ InteractionContext,
31
+ Message,
32
+ PlanContext,
33
+ )
28
34
  from vision_agent.tools.planner_tools import check_function_call
29
35
  from vision_agent.utils.agent import (
30
36
  add_media_to_chat,
@@ -322,7 +328,7 @@ def create_finalize_plan(
322
328
  model: LMM,
323
329
  chat: List[AgentMessage],
324
330
  verbose: bool = False,
325
- ) -> Tuple[List[AgentMessage], PlanContext]:
331
+ ) -> Tuple[List[AgentMessage], Union[PlanContext, ErrorContext]]:
326
332
  # if we're in the middle of an interaction, don't finalize the plan
327
333
  if chat[-1].role == "interaction":
328
334
  return [], PlanContext(plan="", instructions=[], code="")
@@ -337,11 +343,19 @@ def create_finalize_plan(
337
343
  return_chat = [AgentMessage(role="planner", content=plan_str, media=None)]
338
344
 
339
345
  plan_json = extract_tag(plan_str, "json")
340
- plan = (
341
- extract_json(plan_json)
342
- if plan_json is not None
343
- else {"plan": plan_str, "instructions": [], "code": ""}
344
- )
346
+
347
+ # sometimes the planner model will refuse to answer a question because of some
348
+ # safety concern, we then won't be able to parse the response so we have to send
349
+ # it back to the user/conversation agent
350
+ try:
351
+ plan = (
352
+ extract_json(plan_json)
353
+ if plan_json is not None
354
+ else {"plan": plan_str, "instructions": [], "code": ""}
355
+ )
356
+ except json.JSONDecodeError:
357
+ return return_chat, ErrorContext(error=plan_str)
358
+
345
359
  code_snippets = extract_tag(plan_str, "code")
346
360
  plan["code"] = code_snippets if code_snippets is not None else ""
347
361
  if verbose:
@@ -473,14 +487,17 @@ class VisionAgentPlannerV2(AgentPlanner):
473
487
  plan_or_interaction = self.generate_plan(input_msg)
474
488
  if isinstance(plan_or_interaction, InteractionContext):
475
489
  return plan_or_interaction.chat[-1].content
476
- return plan_or_interaction.plan
490
+ elif isinstance(plan_or_interaction, PlanContext):
491
+ return plan_or_interaction.plan
492
+ else:
493
+ return plan_or_interaction.error
477
494
 
478
495
  def generate_plan(
479
496
  self,
480
497
  chat: List[AgentMessage],
481
498
  max_steps: Optional[int] = None,
482
499
  code_interpreter: Optional[CodeInterpreter] = None,
483
- ) -> Union[PlanContext, InteractionContext]:
500
+ ) -> Union[PlanContext, InteractionContext, ErrorContext]:
484
501
  """Generate a plan to solve a vision task.
485
502
 
486
503
  Parameters:
@@ -571,7 +588,7 @@ class VisionAgentPlannerV2(AgentPlanner):
571
588
  for chat_elt in updated_chat:
572
589
  self.update_callback(chat_elt.model_dump())
573
590
 
574
- context: Union[PlanContext, InteractionContext]
591
+ context: Union[PlanContext, InteractionContext, ErrorContext]
575
592
  if interaction:
576
593
  context = InteractionContext(chat=int_chat)
577
594
  else:
@@ -16,17 +16,29 @@ AGENT: <response>Yes, I can help you with that. I will write the code to detect
16
16
  OBSERVATION:
17
17
  <final_code>
18
18
  from vision_agent.tools import load_image, owl_v2_image
19
- def detect_dogs(image_path: str):
19
+ def detect_dogs(image_path: str) -> int:
20
20
  image = load_image(image_path)
21
21
  dogs = owl_v2_image(image)
22
- return dogs
22
+ return len(dogs)
23
23
  </final_code>
24
24
  <final_test>
25
25
  def test_detect_dogs():
26
26
  dogs = detect_dogs("images/dogs.jpg")
27
- assert len(dogs) > 0
27
+ assert isinstance(dogs, int)
28
+ print(f"Number of dogs detected: {{dogs}}")
29
+ return dogs
28
30
  </final_test>
29
31
 
32
+ OBSERVATION: ----- stdout -----
33
+ Number of dogs detected: 8
34
+
35
+ ----- stderr -----
36
+
37
+ ----- Intermediate output-----
38
+ None
39
+ ----- Final output -----
40
+ 8
41
+
30
42
  AGENT: <response>Here is the code to detect dogs in the image.</response>
31
43
  --- END EXAMPLE1 ---
32
44
 
@@ -11,6 +11,7 @@ from vision_agent.lmm import LMM
11
11
  from vision_agent.models import (
12
12
  AgentMessage,
13
13
  CodeContext,
14
+ ErrorContext,
14
15
  InteractionContext,
15
16
  Message,
16
17
  PlanContext,
@@ -27,7 +28,9 @@ CONFIG = Config()
27
28
 
28
29
 
29
30
  def extract_conversation(
30
- chat: List[AgentMessage], include_conv: bool = False
31
+ chat: List[AgentMessage],
32
+ include_conv: bool = False,
33
+ include_errors: bool = False,
31
34
  ) -> Tuple[List[AgentMessage], Optional[str]]:
32
35
  chat = copy.deepcopy(chat)
33
36
 
@@ -43,13 +46,18 @@ def extract_conversation(
43
46
  elif chat_i.role == "coder":
44
47
  if "<final_code>" in chat_i.content:
45
48
  extracted_chat.append(chat_i)
49
+ elif chat_i.role == "final_observation":
50
+ extracted_chat.append(chat_i)
46
51
  elif include_conv and chat_i.role == "conversation":
47
52
  extracted_chat.append(chat_i)
53
+ elif include_errors and chat_i.role == "error_observation":
54
+ extracted_chat.append(chat_i)
48
55
 
49
- # only keep the last <final_code> and <final_test>
56
+ # only keep the last <final_code>, <final_test>
50
57
  final_code = None
51
58
  extracted_chat_strip_code: List[AgentMessage] = []
52
- for chat_i in reversed(extracted_chat):
59
+ for chat_i in reversed((extracted_chat)):
60
+ # don't check role here because user could send updated <final_code>
53
61
  if "<final_code>" in chat_i.content and final_code is None:
54
62
  extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
55
63
  final_code = extract_tag(chat_i.content, "final_code")
@@ -66,7 +74,12 @@ def extract_conversation(
66
74
 
67
75
 
68
76
  def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
69
- extracted_chat, _ = extract_conversation(chat, include_conv=True)
77
+ # Include conversation and error messages. The error messages can come from one of
78
+ # the agents refusing to write a correctly formatted message, so we want to inform the
79
+ # conversation agent of this.
80
+ extracted_chat, _ = extract_conversation(
81
+ chat, include_conv=True, include_errors=True
82
+ )
70
83
 
71
84
  conv = format_conversation(extracted_chat)
72
85
  prompt = CONVERSATION.format(
@@ -101,7 +114,9 @@ def maybe_run_action(
101
114
  if isinstance(context, CodeContext):
102
115
  return [
103
116
  AgentMessage(role="coder", content=format_code_context(context)),
104
- AgentMessage(role="observation", content=context.test_result.text()),
117
+ AgentMessage(
118
+ role="final_observation", content=context.test_result.text()
119
+ ),
105
120
  ]
106
121
  elif isinstance(context, InteractionContext):
107
122
  return [
@@ -110,6 +125,10 @@ def maybe_run_action(
110
125
  content=json.dumps([elt.model_dump() for elt in context.chat]),
111
126
  )
112
127
  ]
128
+ elif isinstance(context, ErrorContext):
129
+ return [
130
+ AgentMessage(role="error_observation", content=context.error),
131
+ ]
113
132
  elif action == "edit_code":
114
133
  # We don't want to pass code in plan_context.code so the coder will generate
115
134
  # new code from plan_context.plan
@@ -129,7 +148,7 @@ def maybe_run_action(
129
148
  )
130
149
  return [
131
150
  AgentMessage(role="coder", content=format_code_context(context)),
132
- AgentMessage(role="observation", content=context.test_result.text()),
151
+ AgentMessage(role="final_observation", content=context.test_result.text()),
133
152
  ]
134
153
  elif action == "view_image":
135
154
  pass
@@ -1,4 +1,10 @@
1
- from .agent_types import AgentMessage, CodeContext, InteractionContext, PlanContext
1
+ from .agent_types import (
2
+ AgentMessage,
3
+ CodeContext,
4
+ ErrorContext,
5
+ InteractionContext,
6
+ PlanContext,
7
+ )
2
8
  from .lmm_types import Message, TextOrImage
3
9
  from .tools_types import (
4
10
  BboxInput,
@@ -29,11 +29,15 @@ class AgentMessage(BaseModel):
29
29
  Literal["user"],
30
30
  Literal["assistant"], # planner, coder and conversation are of type assistant
31
31
  Literal["observation"],
32
+ Literal["final_observation"], # the observation from the final code output
33
+ Literal["error_observation"], # the observation from the error message
32
34
  Literal["interaction"],
33
35
  Literal["interaction_response"],
34
36
  Literal["conversation"],
35
37
  Literal["planner"],
36
- Literal["planner_update"],
38
+ Literal[
39
+ "planner_update"
40
+ ], # an intermediate update from the planner to show partial information
37
41
  Literal["coder"],
38
42
  ]
39
43
  content: str
@@ -75,3 +79,14 @@ class InteractionContext(BaseModel):
75
79
  """
76
80
 
77
81
  chat: List[AgentMessage]
82
+
83
+
84
+ class ErrorContext(BaseModel):
85
+ """ErrorContext is a data model that represents an error message. These errors can
86
+ happen in the planning phase when a model does not output correctly formatted
87
+ messages (often because it considers some response to be a safety issue).
88
+
89
+ error: The error message.
90
+ """
91
+
92
+ error: str
@@ -159,11 +159,12 @@ def format_conversation(chat: List[AgentMessage]) -> str:
159
159
  chat = copy.deepcopy(chat)
160
160
  prompt = ""
161
161
  for chat_i in chat:
162
- if chat_i.role == "user" or chat_i.role == "coder":
163
- if "<final_code>" in chat_i.content:
164
- prompt += f"OBSERVATION: {chat_i.content}\n\n"
165
- elif chat_i.role == "user":
162
+ # we want to print user messages, final code, final code observations or errors
163
+ if chat_i.role in ["user", "coder", "final_observation", "error_observation"]:
164
+ if chat_i.role == "user":
166
165
  prompt += f"USER: {chat_i.content}\n\n"
166
+ else:
167
+ prompt += f"OBSERVATION: {chat_i.content}\n\n"
167
168
  elif chat_i.role == "conversation":
168
169
  prompt += f"AGENT: {chat_i.content}\n\n"
169
170
  return prompt
File without changes
File without changes