PyPI - vision-agent - Versions diffs - 0.2.203__tar.gz → 0.2.207__tar.gz - Mend

vision-agent 0.2.203 (tar.gz) → 0.2.207 (tar.gz)

Files changed (46)

{vision_agent-0.2.203 → vision_agent-0.2.207}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.203
+Version: 0.2.207
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.203 → vision_agent-0.2.207}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.203"
+version = "0.2.207"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.203 → vision_agent-0.2.207}/vision_agent/agent/vision_agent_coder_v2.py RENAMED Viewed

@@ -34,7 +34,7 @@ from vision_agent.utils.execute import (
     CodeInterpreterFactory,
     Execution,
 )
-from vision_agent.utils.sim import Sim, load_cached_sim
+from vision_agent.utils.sim import Sim
 _CONSOLE = Console()
@@ -316,7 +316,7 @@ class VisionAgentCoderV2(AgentCoder):
             elif isinstance(tool_recommender, Sim):
                 self.tool_recommender = tool_recommender
         else:
-            self.tool_recommender = load_cached_sim(T.TOOLS_DF)
+            self.tool_recommender = T.get_tool_recommender()
         self.verbose = verbose
         self.code_sandbox_runtime = code_sandbox_runtime

{vision_agent-0.2.203 → vision_agent-0.2.207}/vision_agent/agent/vision_agent_planner_v2.py RENAMED Viewed

@@ -367,8 +367,10 @@ def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]
                 response = json.loads(chat[i + 1].content)
                 function_name = response["function_name"]
                 tool_doc = get_tool_documentation(function_name)
+                if "box_threshold" in response:
+                    tool_doc = f"Use the following function with box_threshold={response['box_threshold']}\n\n{tool_doc}"
                 new_chat.append(AgentMessage(role="observation", content=tool_doc))
-            except json.JSONDecodeError:
+            except (json.JSONDecodeError, KeyError):
                 raise ValueError(f"Invalid JSON in interaction response: {chat_i}")
         else:
             new_chat.append(chat_i)

{vision_agent-0.2.203 → vision_agent-0.2.207}/vision_agent/agent/vision_agent_prompts.py RENAMED Viewed

@@ -280,7 +280,7 @@ def main():
 if __name__ == "__main__":
     main()
 '''
-edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
+edit_code_artifact(artifacts, 'streamlit_app.py', 0, 0, CODE)
 </execute_python>
 OBSERVATION:

{vision_agent-0.2.203 → vision_agent-0.2.207}/vision_agent/tools/__init__.py RENAMED Viewed

@@ -45,6 +45,7 @@ from .tools import (
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
+    get_tool_recommender,
     git_vqa_v2,
     gpt4o_image_vqa,
     gpt4o_video_vqa,

{vision_agent-0.2.203 → vision_agent-0.2.207}/vision_agent/tools/planner_tools.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import inspect
 import logging
 import shutil
 import tempfile
@@ -31,10 +32,8 @@ from vision_agent.utils.execute import (
     MimeType,
 )
 from vision_agent.utils.image_utils import convert_to_b64
-from vision_agent.utils.sim import load_cached_sim
 TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}
-TOOL_RECOMMENDER = load_cached_sim(T.TOOLS_DF)
 _LOGGER = logging.getLogger(__name__)
 EXAMPLES = f"\n{TEST_TOOLS_EXAMPLE1}\n{TEST_TOOLS_EXAMPLE2}\n"
@@ -51,7 +50,7 @@ def format_tool_output(tool_thoughts: str, tool_docstring: str) -> str:
 def extract_tool_info(
-    tool_choice_context: Dict[str, Any]
+    tool_choice_context: Dict[str, Any],
 ) -> Tuple[Optional[Callable], str, str, str]:
     tool_thoughts = tool_choice_context.get("thoughts", "")
     tool_docstring = ""
@@ -63,12 +62,55 @@ def extract_tool_info(
     return tool, tool_thoughts, tool_docstring, ""
+def replace_box_threshold(code: str, functions: List[str], box_threshold: float) -> str:
+    class ReplaceBoxThresholdTransformer(cst.CSTTransformer):
+        def leave_Call(
+            self, original_node: cst.Call, updated_node: cst.Call
+        ) -> cst.Call:
+            if (
+                isinstance(updated_node.func, cst.Name)
+                and updated_node.func.value in functions
+            ) or (
+                isinstance(updated_node.func, cst.Attribute)
+                and updated_node.func.attr.value in functions
+            ):
+                new_args = []
+                found = False
+                for arg in updated_node.args:
+                    if arg.keyword and arg.keyword.value == "box_threshold":
+                        new_arg = arg.with_changes(value=cst.Float(str(box_threshold)))
+                        new_args.append(new_arg)
+                        found = True
+                    else:
+                        new_args.append(arg)
+                if not found:
+                    new_args.append(
+                        cst.Arg(
+                            keyword=cst.Name("box_threshold"),
+                            value=cst.Float(str(box_threshold)),
+                            equal=cst.AssignEqual(
+                                whitespace_before=cst.SimpleWhitespace(""),
+                                whitespace_after=cst.SimpleWhitespace(""),
+                            ),
+                        )
+                    )
+                return updated_node.with_changes(args=new_args)
+            return updated_node
+    tree = cst.parse_module(code)
+    transformer = ReplaceBoxThresholdTransformer()
+    new_tree = tree.visit(transformer)
+    return new_tree.code
 def run_tool_testing(
     task: str,
     image_paths: List[str],
     lmm: LMM,
     exclude_tools: Optional[List[str]],
     code_interpreter: CodeInterpreter,
+    process_code: Callable[[str], str] = lambda x: x,
 ) -> tuple[str, str, Execution]:
     """Helper function to generate and run tool testing code."""
     query = lmm.generate(CATEGORIZE_TOOL_REQUEST.format(task=task))
@@ -80,7 +122,7 @@ def run_tool_testing(
             f"I need models from the {category.strip()} category of tools. {task}"
         )
-    tool_docs = TOOL_RECOMMENDER.top_k(category, k=10, thresh=0.2)
+    tool_docs = T.get_tool_recommender().top_k(category, k=10, thresh=0.2)
     if exclude_tools is not None and len(exclude_tools) > 0:
         cleaned_tool_docs = []
         for tool_doc in tool_docs:
@@ -101,6 +143,7 @@ def run_tool_testing(
     code = extract_tag(response, "code")  # type: ignore
     if code is None:
         raise ValueError(f"Could not extract code from response: {response}")
+    code = process_code(code)
     tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
     tool_output_str = tool_output.text(include_results=False).strip()
@@ -119,6 +162,7 @@ def run_tool_testing(
             media=str(image_paths),
         )
         code = extract_code(lmm.generate(prompt, media=image_paths))  # type: ignore
+        code = process_code(code)
         tool_output = code_interpreter.exec_isolation(
             DefaultImports.prepend_imports(code)
         )
@@ -200,7 +244,9 @@ def get_tool_for_task(
                 context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
                 previous_attempts=error_message,
             )
-            tool_choice_context_dict = extract_json(lmm.generate(prompt, media=image_paths))  # type: ignore
+            tool_choice_context_dict = extract_json(
+                lmm.generate(prompt, media=image_paths)  # type: ignore
+            )
             tool, tool_thoughts, tool_docstring, error_message = extract_tool_info(
                 tool_choice_context_dict
             )
@@ -221,36 +267,7 @@ def get_tool_documentation(tool_name: str) -> str:
 def get_tool_for_task_human_reviewer(
     task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
 ) -> None:
-    # NOTE: this should be the same documentation as get_tool_for_task
-    """Given a task and one or more images this function will find a tool to accomplish
-    the jobs. It prints the tool documentation and thoughts on why it chose the tool.
-    It can produce tools for the following types of tasks:
-        - Object detection and counting
-        - Classification
-        - Segmentation
-        - OCR
-        - VQA
-        - Depth and pose estimation
-        - Video object tracking
-    Wait until the documentation is printed to use the function so you know what the
-    input and output signatures are.
-    Parameters:
-        task: str: The task to accomplish.
-        images: List[np.ndarray]: The images to use for the task.
-        exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
-            recommendations. This is helpful if you are calling get_tool_for_task twice
-            and do not want the same tool recommended.
-    Returns:
-        The tool to use for the task is printed to stdout
-    Examples
-    --------
-        >>> get_tool_for_task("Give me an OCR model that can find 'hot chocolate' in the image", [image])
-    """
+    # NOTE: this will have the same documentation as get_tool_for_task
     lmm = AnthropicLMM()
     with (
@@ -263,8 +280,19 @@ def get_tool_for_task_human_reviewer(
             Image.fromarray(image).save(image_path)
             image_paths.append(image_path)
+        tools = [
+            t.__name__
+            for t in T.TOOLS
+            if inspect.signature(t).parameters.get("box_threshold")  # type: ignore
+        ]
         _, _, tool_output = run_tool_testing(
-            task, image_paths, lmm, exclude_tools, code_interpreter
+            task,
+            image_paths,
+            lmm,
+            exclude_tools,
+            code_interpreter,
+            process_code=lambda x: replace_box_threshold(x, tools, 0.05),
         )
         # need to re-display results for the outer notebook to see them

{vision_agent-0.2.203 → vision_agent-0.2.207}/vision_agent/tools/tools.py RENAMED Viewed

@@ -4,6 +4,7 @@ import logging
 import os
 import tempfile
 import urllib.request
+from functools import lru_cache
 from importlib import resources
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -45,6 +46,7 @@ from vision_agent.utils.image_utils import (
     rle_decode,
     rle_decode_array,
 )
+from vision_agent.utils.sim import Sim, load_cached_sim
 from vision_agent.utils.video import (
     extract_frames_from_video,
     frames_to_bytes,
@@ -80,6 +82,11 @@ _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
 _LOGGER = logging.getLogger(__name__)
+@lru_cache(maxsize=1)
+def get_tool_recommender() -> Sim:
+    return load_cached_sim(TOOLS_DF)
 def grounding_dino(
     prompt: str,
     image: np.ndarray,