PyPI - vision-agent - Versions diffs - 0.2.241__tar.gz → 0.2.242__tar.gz - Mend

vision-agent 0.2.241tar.gz → 0.2.242tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

{vision_agent-0.2.241 → vision_agent-0.2.242}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.241
+Version: 0.2.242
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.241 → vision_agent-0.2.242}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.241"
+version = "0.2.242"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/agent.py RENAMED Viewed

@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional, Union
 from vision_agent.models import (
     AgentMessage,
     CodeContext,
+    ErrorContext,
     InteractionContext,
     Message,
     PlanContext,
@@ -36,7 +37,7 @@ class AgentCoder(Agent):
         chat: List[AgentMessage],
         max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Union[CodeContext, InteractionContext]:
+    ) -> Union[CodeContext, InteractionContext, ErrorContext]:
         pass
     @abstractmethod
@@ -56,5 +57,5 @@ class AgentPlanner(Agent):
         chat: List[AgentMessage],
         max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Union[PlanContext, InteractionContext]:
+    ) -> Union[PlanContext, InteractionContext, ErrorContext]:
         pass

{vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_coder_v2.py RENAMED Viewed

@@ -13,6 +13,7 @@ from vision_agent.lmm import LMM
 from vision_agent.models import (
     AgentMessage,
     CodeContext,
+    ErrorContext,
     InteractionContext,
     Message,
     PlanContext,
@@ -365,6 +366,8 @@ class VisionAgentCoderV2(AgentCoder):
         code_or_interaction = self.generate_code(input_msg)
         if isinstance(code_or_interaction, InteractionContext):
             return code_or_interaction.chat[-1].content
+        elif isinstance(code_or_interaction, ErrorContext):
+            return code_or_interaction.error
         return code_or_interaction.code
     def generate_code(
@@ -372,7 +375,7 @@ class VisionAgentCoderV2(AgentCoder):
         chat: List[AgentMessage],
         max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Union[CodeContext, InteractionContext]:
+    ) -> Union[CodeContext, InteractionContext, ErrorContext]:
         """Generate vision code from a conversation.
         Parameters:
@@ -404,6 +407,8 @@ class VisionAgentCoderV2(AgentCoder):
             # the planner needs an interaction, so return before generating code
             if isinstance(plan_context, InteractionContext):
                 return plan_context
+            elif isinstance(plan_context, ErrorContext):
+                return plan_context
             code_context = self.generate_code_from_plan(
                 orig_chat,

{vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_planner_v2.py RENAMED Viewed

@@ -24,7 +24,13 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
 )
 from vision_agent.configs import Config
 from vision_agent.lmm import LMM
-from vision_agent.models import AgentMessage, InteractionContext, Message, PlanContext
+from vision_agent.models import (
+    AgentMessage,
+    ErrorContext,
+    InteractionContext,
+    Message,
+    PlanContext,
+)
 from vision_agent.tools.planner_tools import check_function_call
 from vision_agent.utils.agent import (
     add_media_to_chat,
@@ -322,7 +328,7 @@ def create_finalize_plan(
     model: LMM,
     chat: List[AgentMessage],
     verbose: bool = False,
-) -> Tuple[List[AgentMessage], PlanContext]:
+) -> Tuple[List[AgentMessage], Union[PlanContext, ErrorContext]]:
     # if we're in the middle of an interaction, don't finalize the plan
     if chat[-1].role == "interaction":
         return [], PlanContext(plan="", instructions=[], code="")
@@ -337,11 +343,19 @@ def create_finalize_plan(
     return_chat = [AgentMessage(role="planner", content=plan_str, media=None)]
     plan_json = extract_tag(plan_str, "json")
-    plan = (
-        extract_json(plan_json)
-        if plan_json is not None
-        else {"plan": plan_str, "instructions": [], "code": ""}
-    )
+    # sometimes the planner model will refuse to answer a question because of some
+    # safety concern; we then won't be able to parse the response, so we have to
+    # send it back to the user/conversation agent
+    try:
+        plan = (
+            extract_json(plan_json)
+            if plan_json is not None
+            else {"plan": plan_str, "instructions": [], "code": ""}
+        )
+    except json.JSONDecodeError:
+        return return_chat, ErrorContext(error=plan_str)
     code_snippets = extract_tag(plan_str, "code")
     plan["code"] = code_snippets if code_snippets is not None else ""
     if verbose:
@@ -473,14 +487,17 @@ class VisionAgentPlannerV2(AgentPlanner):
         plan_or_interaction = self.generate_plan(input_msg)
         if isinstance(plan_or_interaction, InteractionContext):
             return plan_or_interaction.chat[-1].content
-        return plan_or_interaction.plan
+        elif isinstance(plan_or_interaction, PlanContext):
+            return plan_or_interaction.plan
+        else:
+            return plan_or_interaction.error
     def generate_plan(
         self,
         chat: List[AgentMessage],
         max_steps: Optional[int] = None,
         code_interpreter: Optional[CodeInterpreter] = None,
-    ) -> Union[PlanContext, InteractionContext]:
+    ) -> Union[PlanContext, InteractionContext, ErrorContext]:
         """Generate a plan to solve a vision task.
         Parameters:
@@ -571,7 +588,7 @@ class VisionAgentPlannerV2(AgentPlanner):
                 for chat_elt in updated_chat:
                     self.update_callback(chat_elt.model_dump())
-            context: Union[PlanContext, InteractionContext]
+            context: Union[PlanContext, InteractionContext, ErrorContext]
             if interaction:
                 context = InteractionContext(chat=int_chat)
             else:

{vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_prompts_v2.py RENAMED Viewed

@@ -16,17 +16,29 @@ AGENT: <response>Yes, I can help you with that. I will write the code to detect
 OBSERVATION:
 <final_code>
 from vision_agent.tools import load_image, owl_v2_image
-def detect_dogs(image_path: str):
+def detect_dogs(image_path: str) -> int:
     image = load_image(image_path)
     dogs = owl_v2_image(image)
-    return dogs
+    return len(dogs)
 </final_code>
 <final_test>
 def test_detect_dogs():
     dogs = detect_dogs("images/dogs.jpg")
-    assert len(dogs) > 0
+    assert isinstance(dogs, int)
+    print(f"Number of dogs detected: {{dogs}}")
+    return dogs
 </final_test>
+OBSERVATION: ----- stdout -----
+Number of dogs detected: 8
+----- stderr -----
+----- Intermediate output-----
+None
+----- Final output -----
+8
 AGENT: <response>Here is the code to detect dogs in the image.</response>
 --- END EXAMPLE1 ---

{vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/agent/vision_agent_v2.py RENAMED Viewed

@@ -11,6 +11,7 @@ from vision_agent.lmm import LMM
 from vision_agent.models import (
     AgentMessage,
     CodeContext,
+    ErrorContext,
     InteractionContext,
     Message,
     PlanContext,
@@ -27,7 +28,9 @@ CONFIG = Config()
 def extract_conversation(
-    chat: List[AgentMessage], include_conv: bool = False
+    chat: List[AgentMessage],
+    include_conv: bool = False,
+    include_errors: bool = False,
 ) -> Tuple[List[AgentMessage], Optional[str]]:
     chat = copy.deepcopy(chat)
@@ -43,13 +46,18 @@ def extract_conversation(
         elif chat_i.role == "coder":
             if "<final_code>" in chat_i.content:
                 extracted_chat.append(chat_i)
+        elif chat_i.role == "final_observation":
+            extracted_chat.append(chat_i)
         elif include_conv and chat_i.role == "conversation":
             extracted_chat.append(chat_i)
+        elif include_errors and chat_i.role == "error_observation":
+            extracted_chat.append(chat_i)
-    # only keep the last <final_code> and <final_test>
+    # only keep the last <final_code>, <final_test>
     final_code = None
     extracted_chat_strip_code: List[AgentMessage] = []
-    for chat_i in reversed(extracted_chat):
+    for chat_i in reversed((extracted_chat)):
+        # don't check role here because user could send updated <final_code>
         if "<final_code>" in chat_i.content and final_code is None:
             extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
             final_code = extract_tag(chat_i.content, "final_code")
@@ -66,7 +74,12 @@ def extract_conversation(
 def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
-    extracted_chat, _ = extract_conversation(chat, include_conv=True)
+    # Include conversation and error messages. The error messages can come from one of
+    # the agents refusing to write a correctly formatted message; we want to inform
+    # the conversation agent of this.
+    extracted_chat, _ = extract_conversation(
+        chat, include_conv=True, include_errors=True
+    )
     conv = format_conversation(extracted_chat)
     prompt = CONVERSATION.format(
@@ -101,7 +114,9 @@ def maybe_run_action(
         if isinstance(context, CodeContext):
             return [
                 AgentMessage(role="coder", content=format_code_context(context)),
-                AgentMessage(role="observation", content=context.test_result.text()),
+                AgentMessage(
+                    role="final_observation", content=context.test_result.text()
+                ),
             ]
         elif isinstance(context, InteractionContext):
             return [
@@ -110,6 +125,10 @@ def maybe_run_action(
                     content=json.dumps([elt.model_dump() for elt in context.chat]),
                 )
             ]
+        elif isinstance(context, ErrorContext):
+            return [
+                AgentMessage(role="error_observation", content=context.error),
+            ]
     elif action == "edit_code":
         # We don't want to pass code in plan_context.code so the coder will generate
         # new code from plan_context.plan
@@ -129,7 +148,7 @@ def maybe_run_action(
         )
         return [
             AgentMessage(role="coder", content=format_code_context(context)),
-            AgentMessage(role="observation", content=context.test_result.text()),
+            AgentMessage(role="final_observation", content=context.test_result.text()),
         ]
     elif action == "view_image":
         pass

{vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/models/__init__.py RENAMED Viewed

@@ -1,4 +1,10 @@
-from .agent_types import AgentMessage, CodeContext, InteractionContext, PlanContext
+from .agent_types import (
+    AgentMessage,
+    CodeContext,
+    ErrorContext,
+    InteractionContext,
+    PlanContext,
+)
 from .lmm_types import Message, TextOrImage
 from .tools_types import (
     BboxInput,

{vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/models/agent_types.py RENAMED Viewed

@@ -29,11 +29,15 @@ class AgentMessage(BaseModel):
         Literal["user"],
         Literal["assistant"],  # planner, coder and conversation are of type assistant
         Literal["observation"],
+        Literal["final_observation"],  # the observation from the final code output
+        Literal["error_observation"],  # the observation from the error message
         Literal["interaction"],
         Literal["interaction_response"],
         Literal["conversation"],
         Literal["planner"],
-        Literal["planner_update"],
+        Literal[
+            "planner_update"
+        ],  # an intermediate update from the planner to show partial information
         Literal["coder"],
     ]
     content: str
@@ -75,3 +79,14 @@ class InteractionContext(BaseModel):
     """
     chat: List[AgentMessage]
+class ErrorContext(BaseModel):
+    """ErrorContext is a data model that represents an error message. These errors can
+    happen in the planning phase when a model does not output correctly formatted
+    messages (often because it considers some response to be a safety issue).
+    error: The error message.
+    """
+    error: str

{vision_agent-0.2.241 → vision_agent-0.2.242}/vision_agent/utils/agent.py RENAMED Viewed

@@ -159,11 +159,12 @@ def format_conversation(chat: List[AgentMessage]) -> str:
     chat = copy.deepcopy(chat)
     prompt = ""
     for chat_i in chat:
-        if chat_i.role == "user" or chat_i.role == "coder":
-            if "<final_code>" in chat_i.content:
-                prompt += f"OBSERVATION: {chat_i.content}\n\n"
-            elif chat_i.role == "user":
+        # we want to print user messages, final code, final code observations or errors
+        if chat_i.role in ["user", "coder", "final_observation", "error_observation"]:
+            if chat_i.role == "user":
                 prompt += f"USER: {chat_i.content}\n\n"
+            else:
+                prompt += f"OBSERVATION: {chat_i.content}\n\n"
         elif chat_i.role == "conversation":
             prompt += f"AGENT: {chat_i.content}\n\n"
     return prompt