PyPI - vision-agent - Versions diffs - 0.2.161__tar.gz → 0.2.163__tar.gz - Mend

vision-agent 0.2.161__tar.gz → 0.2.163__tar.gz

Files changed (36) hide show

{vision_agent-0.2.161 → vision_agent-0.2.163}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.161
+Version: 0.2.163
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -27,6 +27,7 @@ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
 Requires-Dist: pydantic (==2.7.4)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: pytube (==15.0.0)
+Requires-Dist: redbaron (>=0.9.2,<0.10.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
@@ -142,7 +143,7 @@ continuing, for example it may want to execute code and look at the output befor
 letting the user respond.
 ### Chatting and Artifacts
-If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`'s
+If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`'s
 are a way to sync files between local and remote environments. The agent will read and
 write to the artifact object, which is just a pickle object, when it wants to save or
 load files.
@@ -159,7 +160,7 @@ with open("image.png", "rb") as f:
     artifacts["image.png"] = f.read()
 agent = va.agent.VisionAgent()
-response, artifacts = agent.chat_with_code(
+response, artifacts = agent.chat_with_artifacts(
     [
         {
             "role": "user",
@@ -339,11 +340,11 @@ mode by passing in the verbose argument:
 ```
 ### Detailed Usage
-You can also have it return more information by calling `chat_with_workflow`. The format
+You can also have it return more information by calling `generate_code`. The format
 of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
 ```python
->>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
+>>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
 >>> print(results)
 {
     "code": "from vision_agent.tools import ..."
@@ -372,7 +373,7 @@ conv = [
         "media": ["workers.png"],
     }
 ]
-result = agent.chat_with_workflow(conv)
+result = agent.generate_code(conv)
 code = result["code"]
 conv.append({"role": "assistant", "content": code})
 conv.append(
@@ -381,7 +382,7 @@ conv.append(
         "content": "Can you also return the number of workers wearing safety gear?",
     }
 )
-result = agent.chat_with_workflow(conv)
+result = agent.generate_code(conv)
 ```

{vision_agent-0.2.161 → vision_agent-0.2.163}/README.md RENAMED Viewed

@@ -101,7 +101,7 @@ continuing, for example it may want to execute code and look at the output befor
 letting the user respond.
 ### Chatting and Artifacts
-If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`'s
+If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`'s
 are a way to sync files between local and remote environments. The agent will read and
 write to the artifact object, which is just a pickle object, when it wants to save or
 load files.
@@ -118,7 +118,7 @@ with open("image.png", "rb") as f:
     artifacts["image.png"] = f.read()
 agent = va.agent.VisionAgent()
-response, artifacts = agent.chat_with_code(
+response, artifacts = agent.chat_with_artifacts(
     [
         {
             "role": "user",
@@ -298,11 +298,11 @@ mode by passing in the verbose argument:
 ```
 ### Detailed Usage
-You can also have it return more information by calling `chat_with_workflow`. The format
+You can also have it return more information by calling `generate_code`. The format
 of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
 ```python
->>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
+>>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
 >>> print(results)
 {
     "code": "from vision_agent.tools import ..."
@@ -331,7 +331,7 @@ conv = [
         "media": ["workers.png"],
     }
 ]
-result = agent.chat_with_workflow(conv)
+result = agent.generate_code(conv)
 code = result["code"]
 conv.append({"role": "assistant", "content": code})
 conv.append(
@@ -340,7 +340,7 @@ conv.append(
         "content": "Can you also return the number of workers wearing safety gear?",
     }
 )
-result = agent.chat_with_workflow(conv)
+result = agent.generate_code(conv)
 ```

{vision_agent-0.2.161 → vision_agent-0.2.163}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.161"
+version = "0.2.163"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -43,6 +43,7 @@ pytube = "15.0.0"
 anthropic = "^0.31.0"
 pydantic = "2.7.4"
 av = "^11.0.0"
+redbaron = "^0.9.2"
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"

{vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/agent/__init__.py RENAMED Viewed

@@ -7,3 +7,11 @@ from .vision_agent_coder import (
     OpenAIVisionAgentCoder,
     VisionAgentCoder,
 )
+from .vision_agent_planner import (
+    AnthropicVisionAgentPlanner,
+    AzureVisionAgentPlanner,
+    OllamaVisionAgentPlanner,
+    OpenAIVisionAgentPlanner,
+    PlanContext,
+    VisionAgentPlanner,
+)

vision_agent-0.2.163/vision_agent/agent/agent_utils.py ADDED Viewed

@@ -0,0 +1,181 @@
+import json
+import logging
+import re
+import sys
+from typing import Any, Dict, List, Optional
+from rich.console import Console
+from rich.style import Style
+from rich.syntax import Syntax
+import vision_agent.tools as T
+logging.basicConfig(stream=sys.stdout)
+_LOGGER = logging.getLogger(__name__)
+_CONSOLE = Console()
+_MAX_TABULATE_COL_WIDTH = 80
+def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
+    json_pattern = r"\{.*\}"
+    match = re.search(json_pattern, json_str, re.DOTALL)
+    if match:
+        json_str = match.group()
+        try:
+            # remove trailing comma
+            trailing_bracket_pattern = r",\s+\}"
+            json_str = re.sub(trailing_bracket_pattern, "}", json_str, flags=re.DOTALL)
+            json_dict = json.loads(json_str)
+            return json_dict  # type: ignore
+        except json.JSONDecodeError:
+            return None
+    return None
+def _find_markdown_json(json_str: str) -> str:
+    pattern = r"```json(.*?)```"
+    match = re.search(pattern, json_str, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    return json_str
+def _strip_markdown_code(inp_str: str) -> str:
+    pattern = r"```python.*?```"
+    cleaned_str = re.sub(pattern, "", inp_str, flags=re.DOTALL)
+    return cleaned_str
+def extract_json(json_str: str) -> Dict[str, Any]:
+    json_str_mod = json_str.replace("\n", " ").strip()
+    json_str_mod = json_str_mod.replace(": True", ": true").replace(
+        ": False", ": false"
+    )
+    # sometimes the json is in single quotes
+    try:
+        return json.loads(json_str_mod.replace("'", '"'))  # type: ignore
+    except json.JSONDecodeError:
+        pass
+    try:
+        return json.loads(json_str_mod)  # type: ignore
+    except json.JSONDecodeError:
+        json_orig = json_str
+        # don't replace quotes here or booleans since it can also introduce errors
+        json_str = json_str.replace("\n", " ").strip()
+        json_str = _strip_markdown_code(json_str)
+        json_str = _find_markdown_json(json_str)
+        json_dict = _extract_sub_json(json_str)
+        if json_dict is None:
+            error_msg = f"Could not extract JSON from the given str: {json_orig}"
+            _LOGGER.exception(error_msg)
+            raise ValueError(error_msg)
+        return json_dict
+def extract_code(code: str) -> str:
+    if "\n```python" in code:
+        start = "\n```python"
+    elif "```python" in code:
+        start = "```python"
+    else:
+        return code
+    code = code[code.find(start) + len(start) :]
+    code = code[: code.find("```")]
+    if code.startswith("python\n"):
+        code = code[len("python\n") :]
+    return code
+def extract_tag(
+    content: str,
+    tag: str,
+) -> Optional[str]:
+    inner_content = None
+    remaning = content
+    all_inner_content = []
+    while f"<{tag}>" in remaning:
+        inner_content_i = remaning[remaning.find(f"<{tag}>") + len(f"<{tag}>") :]
+        if f"</{tag}>" not in inner_content_i:
+            break
+        inner_content_i = inner_content_i[: inner_content_i.find(f"</{tag}>")]
+        remaning = remaning[remaning.find(f"</{tag}>") + len(f"</{tag}>") :]
+        all_inner_content.append(inner_content_i)
+    if len(all_inner_content) > 0:
+        inner_content = "\n".join(all_inner_content)
+    return inner_content
+def remove_installs_from_code(code: str) -> str:
+    pattern = r"\n!pip install.*?(\n|\Z)\n"
+    code = re.sub(pattern, "", code, flags=re.DOTALL)
+    return code
+def format_memory(memory: List[Dict[str, str]]) -> str:
+    output_str = ""
+    for i, m in enumerate(memory):
+        output_str += f"### Feedback {i}:\n"
+        output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
+        output_str += f"Feedback {i}: {m['feedback']}\n\n"
+        if "edits" in m:
+            output_str += f"Edits {i}:\n{m['edits']}\n"
+        output_str += "\n"
+    return output_str
+def format_plans(plans: Dict[str, Any]) -> str:
+    plan_str = ""
+    for k, v in plans.items():
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += "    -" + "\n    -".join([e for e in v["instructions"]])
+    return plan_str
+class DefaultImports:
+    """Container for default imports used in the code execution."""
+    common_imports = [
+        "import os",
+        "import numpy as np",
+        "from vision_agent.tools import *",
+        "from typing import *",
+        "from pillow_heif import register_heif_opener",
+        "register_heif_opener()",
+    ]
+    @staticmethod
+    def to_code_string() -> str:
+        return "\n".join(DefaultImports.common_imports + T.__new_tools__)
+    @staticmethod
+    def prepend_imports(code: str) -> str:
+        """Run this method to prepend the default imports to the code.
+        NOTE: be sure to run this method after the custom tools have been registered.
+        """
+        return DefaultImports.to_code_string() + "\n\n" + code
+def print_code(title: str, code: str, test: Optional[str] = None) -> None:
+    _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
+    _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
+    _CONSOLE.print(
+        Syntax(
+            DefaultImports.prepend_imports(code),
+            "python",
+            theme="gruvbox-dark",
+            line_numbers=True,
+        )
+    )
+    if test:
+        _CONSOLE.print("=" * 30 + " Test " + "=" * 30)
+        _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))

{vision_agent-0.2.161 → vision_agent-0.2.163}/vision_agent/agent/vision_agent.py RENAMED Viewed

@@ -14,8 +14,8 @@ from vision_agent.agent.vision_agent_prompts import (
     VA_CODE,
 )
 from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
-from vision_agent.tools import META_TOOL_DOCSTRING
 from vision_agent.tools.meta_tools import (
+    META_TOOL_DOCSTRING,
     Artifacts,
     check_and_load_image,
     use_extra_vision_agent_args,
@@ -103,7 +103,7 @@ def execute_code_action(
 def parse_execution(
     response: str,
     test_multi_plan: bool = True,
-    customed_tool_names: Optional[List[str]] = None,
+    custom_tool_names: Optional[List[str]] = None,
 ) -> Optional[str]:
     code = None
     remaining = response
@@ -122,7 +122,7 @@ def parse_execution(
         code = "\n".join(all_code)
     if code is not None:
-        code = use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names)
+        code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
     return code
@@ -195,9 +195,8 @@ class VisionAgent(Agent):
         agent: Optional[LMM] = None,
         verbosity: int = 0,
         local_artifacts_path: Optional[Union[str, Path]] = None,
-        code_sandbox_runtime: Optional[str] = None,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
-        code_interpreter: Optional[CodeInterpreter] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the VisionAgent.
@@ -207,14 +206,17 @@ class VisionAgent(Agent):
             verbosity (int): The verbosity level of the agent.
             local_artifacts_path (Optional[Union[str, Path]]): The path to the local
                 artifacts file.
-            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
-            code_interpreter (Optional[CodeInterpreter]): if not None, use this CodeInterpreter
+            callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
+                function to send intermediate update messages.
+            code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
+                it can be one of: None, "local" or "e2b". If None, it will read from
+                the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
+                object is provided it will use that.
         """
         self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
         self.max_iterations = 12
         self.verbosity = verbosity
-        self.code_sandbox_runtime = code_sandbox_runtime
         self.code_interpreter = code_interpreter
         self.callback_message = callback_message
         if self.verbosity >= 1:
@@ -233,7 +235,7 @@ class VisionAgent(Agent):
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
         artifacts: Optional[Artifacts] = None,
-    ) -> List[Message]:
+    ) -> str:
         """Chat with VisionAgent and get the conversation response.
         Parameters:
@@ -250,15 +252,33 @@ class VisionAgent(Agent):
             input = [{"role": "user", "content": input}]
             if media is not None:
                 input[0]["media"] = [media]
-        results, _ = self.chat_with_code(input, artifacts)
-        return results
+        results, _ = self.chat_with_artifacts(input, artifacts)
+        return results[-1]["content"]  # type: ignore
+    def chat(
+        self,
+        chat: List[Message],
+    ) -> List[Message]:
+        """Chat with VisionAgent, it will use code to execute actions to accomplish
+        its tasks.
+        Parameters:
+            chat (List[Message]): A conversation in the format of:
+                [{"role": "user", "content": "describe your task here..."}]
+                or if it contains media files, it should be in the format of:
+                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+        Returns:
+            List[Message]: The conversation response.
+        """
+        return self.chat_with_artifacts(chat)[0]
-    def chat_with_code(
+    def chat_with_artifacts(
         self,
         chat: List[Message],
         artifacts: Optional[Artifacts] = None,
         test_multi_plan: bool = True,
-        customized_tool_names: Optional[List[str]] = None,
+        custom_tool_names: Optional[List[str]] = None,
     ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
@@ -272,7 +292,7 @@ class VisionAgent(Agent):
             test_multi_plan (bool): If True, it will test tools for multiple plans and
                 pick the best one based off of the tool results. If False, it will go
                 with the first plan.
-            customized_tool_names (List[str]): A list of customized tools for agent to
+            custom_tool_names (List[str]): A list of customized tools for agent to
                 pick and use. If not provided, default to full tool set from
                 vision_agent.tools.
@@ -287,11 +307,13 @@ class VisionAgent(Agent):
             # this is setting remote artifacts path
             artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
+        # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
         code_interpreter = (
             self.code_interpreter
             if self.code_interpreter is not None
+            and not isinstance(self.code_interpreter, str)
             else CodeInterpreterFactory.new_instance(
-                code_sandbox_runtime=self.code_sandbox_runtime,
+                code_sandbox_runtime=self.code_interpreter,
             )
         )
         with code_interpreter:
@@ -389,7 +411,7 @@ class VisionAgent(Agent):
                 finished = response["let_user_respond"]
                 code_action = parse_execution(
-                    response["response"], test_multi_plan, customized_tool_names
+                    response["response"], test_multi_plan, custom_tool_names
                 )
                 if last_response == response:
@@ -480,8 +502,8 @@ class OpenAIVisionAgent(VisionAgent):
         agent: Optional[LMM] = None,
         verbosity: int = 0,
         local_artifacts_path: Optional[Union[str, Path]] = None,
-        code_sandbox_runtime: Optional[str] = None,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the VisionAgent using OpenAI LMMs.
@@ -491,7 +513,12 @@ class OpenAIVisionAgent(VisionAgent):
             verbosity (int): The verbosity level of the agent.
             local_artifacts_path (Optional[Union[str, Path]]): The path to the local
                 artifacts file.
-            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+            callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
+                function to send intermediate update messages.
+            code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
+                it can be one of: None, "local" or "e2b". If None, it will read from
+                the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
+                object is provided it will use that.
         """
         agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
@@ -499,8 +526,8 @@ class OpenAIVisionAgent(VisionAgent):
             agent,
             verbosity,
             local_artifacts_path,
-            code_sandbox_runtime,
             callback_message,
+            code_interpreter,
         )
@@ -510,8 +537,8 @@ class AnthropicVisionAgent(VisionAgent):
         agent: Optional[LMM] = None,
         verbosity: int = 0,
         local_artifacts_path: Optional[Union[str, Path]] = None,
-        code_sandbox_runtime: Optional[str] = None,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
         """Initialize the VisionAgent using Anthropic LMMs.
@@ -521,7 +548,12 @@ class AnthropicVisionAgent(VisionAgent):
             verbosity (int): The verbosity level of the agent.
             local_artifacts_path (Optional[Union[str, Path]]): The path to the local
                 artifacts file.
-            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+            callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
+                function to send intermediate update messages.
+            code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
+                it can be one of: None, "local" or "e2b". If None, it will read from
+                the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
+                object is provided it will use that.
         """
         agent = AnthropicLMM(temperature=0.0) if agent is None else agent
@@ -529,6 +561,6 @@ class AnthropicVisionAgent(VisionAgent):
             agent,
             verbosity,
             local_artifacts_path,
-            code_sandbox_runtime,
             callback_message,
+            code_interpreter,
         )

vision-agent 0.2.161__tar.gz → 0.2.163__tar.gz

vision-agent 0.2.161__tar.gz → 0.2.163__tar.gz