PyPI - vision-agent - Versions diffs - 0.2.166__tar.gz → 0.2.168__tar.gz - Mend

vision-agent 0.2.166tar.gz → 0.2.168tar.gz

Files changed (35) hide show

{vision_agent-0.2.166 → vision_agent-0.2.168}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.166
+Version: 0.2.168
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.166 → vision_agent-0.2.168}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.166"
+version = "0.2.168"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.166 → vision_agent-0.2.168}/vision_agent/agent/vision_agent.py RENAMED Viewed

@@ -85,6 +85,15 @@ def format_agent_message(agent_message: str) -> str:
     return output
+def _clean_response(response: str) -> str:
+    """Cut off anything the LLM wrote after its first ``<execute_python>`` block.
+
+    Sometimes the LLM will hallucinate responses to an <execute_python> tag as if
+    it had already executed the code. Everything after the first closing
+    ``</execute_python>`` tag that follows the opening tag is removed.
+
+    Parameters:
+        response (str): The raw text returned by the LLM.
+
+    Returns:
+        str: ``response`` truncated right after ``</execute_python>``, or the
+            unchanged ``response`` when there is no complete
+            ``<execute_python>...</execute_python>`` block.
+    """
+    open_tag = "<execute_python>"
+    close_tag = "</execute_python>"
+    start = response.find(open_tag)
+    if start == -1:
+        return response
+    # Search from the opening tag so a stray closing tag earlier in the text
+    # cannot cut the code block itself out of the response.
+    end = response.find(close_tag, start)
+    if end == -1:
+        # Unterminated block: str.find returns -1, and slicing with
+        # -1 + len(close_tag) would silently keep only the first 16 characters.
+        return response
+    return response[: end + len(close_tag)]
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     chat = copy.deepcopy(chat)
@@ -114,6 +123,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         message["media"] = chat[-1]["media"]
     conv_resp = cast(str, orch([message], stream=False))
+    # clean the response first, if we are executing code, do not respond or end
+    # conversation before the code has been executed.
+    conv_resp = _clean_response(conv_resp)
     let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
     let_user_respond = (
         "true" in let_user_respond_str.lower() if let_user_respond_str else False
@@ -197,6 +210,51 @@ def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
     return response
+def new_format_to_old_format(new_format: Dict[str, Any]) -> Dict[str, Any]:
+    """Translate a new-format response dict into the old-format dict.
+
+    The new format keeps "thinking", "response" and "execute_python" as separate
+    entries; the old format has "thoughts" and a single "response" string with
+    the code appended inside <execute_python> tags.
+
+    Parameters:
+        new_format (Dict[str, Any]): Dict with the keys "thinking", "response",
+            "execute_python" and "let_user_respond".
+
+    Returns:
+        Dict[str, Any]: Dict with the keys "thoughts", "response" and
+            "let_user_respond".
+    """
+    thinking = new_format["thinking"]
+    message = new_format["response"]
+    code = new_format["execute_python"]
+    # None entries are represented as empty strings in the old format.
+    thoughts = "" if thinking is None else thinking
+    response = "" if message is None else message
+    if code is not None:
+        response += f"\n<execute_python>\n{code}\n</execute_python>"
+    old_format = {"thoughts": thoughts, "response": response}
+    old_format["let_user_respond"] = new_format["let_user_respond"]
+    return old_format
+def old_format_to_new_format(old_format_str: str) -> str:
+    """Convert a JSON-encoded old-format message into the new JSON format.
+
+    The old format is a JSON object with "thoughts", "response" and
+    "let_user_respond", where "response" may embed code inside <execute_python>
+    tags. The new format splits that into "thinking", "response",
+    "execute_python" and "let_user_respond".
+
+    Parameters:
+        old_format_str (str): Message content; may be an old-format JSON string
+            or arbitrary text.
+
+    Returns:
+        str: The new-format JSON string, or ``old_format_str`` unchanged when it
+            is not an old-format message.
+    """
+    try:
+        old_format = json.loads(old_format_str)
+    except (json.JSONDecodeError, TypeError):
+        # Not JSON (or not a str/bytes payload at all): pass it through.
+        return old_format_str
+    # Valid JSON is not necessarily an old-format message: content such as "42"
+    # or '{"a": 1}' parses fine but would fail on the key lookups below.
+    required_keys = ("thoughts", "response", "let_user_respond")
+    if not isinstance(old_format, dict):
+        return old_format_str
+    if any(key not in old_format for key in required_keys):
+        return old_format_str
+    response = old_format["response"]
+    if not isinstance(response, str):
+        return old_format_str
+    thoughts = old_format["thoughts"]
+    # Blank or missing thoughts are represented as None in the new format.
+    thinking = thoughts if isinstance(thoughts, str) and thoughts.strip() else None
+    execute_python = None
+    if "<execute_python>" in response:
+        execute_python = extract_tag(response, "execute_python")
+    # NOTE(review): extract_tag is assumed to possibly return None (e.g. for an
+    # unterminated tag) -- confirm; in that case the response is left untouched.
+    if execute_python is not None:
+        response = (
+            response.replace(execute_python, "")
+            .replace("<execute_python>", "")
+            .replace("</execute_python>", "")
+            .strip()
+        )
+    return json.dumps(
+        {
+            "thinking": thinking,
+            "response": response,
+            "execute_python": execute_python,
+            "let_user_respond": old_format["let_user_respond"],
+        }
+    )
 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
     agents to generate code for it. Vision Agent uses python code to execute actions
@@ -361,11 +419,11 @@ class VisionAgent(Agent):
                     (
                         {
                             "role": c["role"],
-                            "content": c["content"],
+                            "content": old_format_to_new_format(c["content"]),  # type: ignore
                             "media": c["media"],
                         }
                         if "media" in c
-                        else {"role": c["role"], "content": c["content"]}
+                        else {"role": c["role"], "content": old_format_to_new_format(c["content"])}  # type: ignore
                     )
                     for c in int_chat
                 ],
@@ -419,13 +477,17 @@ class VisionAgent(Agent):
                 int_chat.append(
                     {
                         "role": "assistant",
-                        "content": json.dumps(add_step_descriptions(response)),
+                        "content": json.dumps(
+                            new_format_to_old_format(add_step_descriptions(response))
+                        ),
                     }
                 )
                 orig_chat.append(
                     {
                         "role": "assistant",
-                        "content": json.dumps(add_step_descriptions(response)),
+                        "content": json.dumps(
+                            new_format_to_old_format(add_step_descriptions(response))
+                        ),
                     }
                 )
@@ -458,7 +520,11 @@ class VisionAgent(Agent):
                     self.streaming_message(
                         {
                             "role": "assistant",
-                            "content": json.dumps(response),
+                            "content": json.dumps(
+                                new_format_to_old_format(
+                                    add_step_descriptions(response)
+                                )
+                            ),
                             "finished": finished and code_action is None,
                         }
                     )

{vision_agent-0.2.166 → vision_agent-0.2.168}/vision_agent/tools/meta_tools.py RENAMED Viewed

@@ -676,12 +676,13 @@ def use_extra_vision_agent_args(
     for node in red:
         # seems to always be atomtrailers not call type
         if node.type == "atomtrailers":
+            if node.name.value == "generate_vision_code":
+                node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
             if (
                 node.name.value == "generate_vision_code"
                 or node.name.value == "edit_vision_code"
             ):
-                node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
                 if custom_tool_names is not None:
                     node.value[1].value.append(f"custom_tool_names={custom_tool_names}")
     cleaned_code = red.dumps().strip()