vision-agent 0.2.219__tar.gz → 0.2.221__tar.gz

Files changed (47)
  1. {vision_agent-0.2.219 → vision_agent-0.2.221}/PKG-INFO +5 -9
  2. {vision_agent-0.2.219 → vision_agent-0.2.221}/README.md +4 -8
  3. {vision_agent-0.2.219 → vision_agent-0.2.221}/pyproject.toml +1 -1
  4. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/.sim_tools/df.csv +21 -3
  5. vision_agent-0.2.221/vision_agent/.sim_tools/embs.npy +0 -0
  6. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_coder_v2.py +3 -3
  7. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_planner_prompts_v2.py +4 -3
  8. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/__init__.py +1 -1
  9. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/planner_tools.py +4 -5
  10. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/tools.py +29 -18
  11. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/__init__.py +0 -1
  12. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/execute.py +2 -2
  13. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/image_utils.py +1 -1
  14. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/sim.py +44 -3
  15. vision_agent-0.2.219/vision_agent/.sim_tools/embs.npy +0 -0
  16. {vision_agent-0.2.219 → vision_agent-0.2.221}/LICENSE +0 -0
  17. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/__init__.py +0 -0
  18. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/README.md +0 -0
  19. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/__init__.py +0 -0
  20. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/agent.py +0 -0
  21. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/agent_utils.py +0 -0
  22. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/types.py +0 -0
  23. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent.py +0 -0
  24. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_coder.py +0 -0
  25. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  26. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  27. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_planner.py +0 -0
  28. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  29. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  30. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_prompts.py +0 -0
  31. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  32. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_v2.py +0 -0
  33. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/clients/__init__.py +0 -0
  34. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/clients/http.py +0 -0
  35. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/clients/landing_public_api.py +0 -0
  36. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/fonts/__init__.py +0 -0
  37. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  38. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/lmm/__init__.py +0 -0
  39. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/lmm/lmm.py +0 -0
  40. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/lmm/types.py +0 -0
  41. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/meta_tools.py +0 -0
  42. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/prompts.py +0 -0
  43. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/tool_utils.py +0 -0
  44. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/tools_types.py +0 -0
  45. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/exceptions.py +0 -0
  46. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/type_defs.py +0 -0
  47. {vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/video.py +0 -0

{vision_agent-0.2.219 → vision_agent-0.2.221}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.219
+ Version: 0.2.221
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -89,18 +89,15 @@ To get started with the python library, you can install it using pip:
  pip install vision-agent
  ```

- Ensure you have both an Anthropic key and an OpenAI API key and set in your environment
- variables (if you are using Azure OpenAI please see the Azure setup section):
-
  ```bash
  export ANTHROPIC_API_KEY="your-api-key"
- export OPENAI_API_KEY="your-api-key"
  ```

  ---
  **NOTE**
- You must have both Anthropic and OpenAI API keys set in your environment variables to
- use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+ You must have the Anthropic API key set in your environment variables to use
+ VisionAgent. If you don't have an Anthropic key you can use another provider like
+ OpenAI or Ollama.
  ---

  #### Chatting with VisionAgent
@@ -161,8 +158,7 @@ Anthropic/OpenAI models.
  ### Chatting and Message Formats
  `VisionAgent` is an agent that can chat with you and call other tools or agents to
  write vision code for you. You can interact with it like you would ChatGPT or any other
- chatbot. The agent uses Clause-3.5 for it's LMM and OpenAI for embeddings for searching
- for tools.
+ chatbot. The agent uses Clause-3.5 for it's LMM.

  The message format is:
  ```json

{vision_agent-0.2.219 → vision_agent-0.2.221}/README.md
@@ -44,18 +44,15 @@ To get started with the python library, you can install it using pip:
  pip install vision-agent
  ```

- Ensure you have both an Anthropic key and an OpenAI API key and set in your environment
- variables (if you are using Azure OpenAI please see the Azure setup section):
-
  ```bash
  export ANTHROPIC_API_KEY="your-api-key"
- export OPENAI_API_KEY="your-api-key"
  ```

  ---
  **NOTE**
- You must have both Anthropic and OpenAI API keys set in your environment variables to
- use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+ You must have the Anthropic API key set in your environment variables to use
+ VisionAgent. If you don't have an Anthropic key you can use another provider like
+ OpenAI or Ollama.
  ---

  #### Chatting with VisionAgent
@@ -116,8 +113,7 @@ Anthropic/OpenAI models.
  ### Chatting and Message Formats
  `VisionAgent` is an agent that can chat with you and call other tools or agents to
  write vision code for you. You can interact with it like you would ChatGPT or any other
- chatbot. The agent uses Clause-3.5 for it's LMM and OpenAI for embeddings for searching
- for tools.
+ chatbot. The agent uses Clause-3.5 for it's LMM.

  The message format is:
  ```json

{vision_agent-0.2.219 → vision_agent-0.2.221}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.219"
+ version = "0.2.221"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/.sim_tools/df.csv
@@ -460,19 +460,37 @@ desc,doc,name
  -------
  >>> document_analysis(image)
  {'pages':
- [{'bbox': [0, 0, 1700, 2200],
- 'chunks': [{'bbox': [1371, 75, 1503, 112],
+ [{'bbox': [0, 0, 1.0, 1.0],
+ 'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
  'label': 'page_header',
  'order': 75
  'caption': 'Annual Report 2024',
  'summary': 'This annual report summarizes ...' },
- {'bbox': [201, 1119, 1497, 1647],
+ {'bbox': [0.2, 0.9, 0.9, 1.0],
  'label': table',
  'order': 1119,
  'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
  'summary': 'This table illustrates a trend of ...'},
  ],
  ",document_extraction
+ "'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
+ 'document_qa' is a tool that can answer any questions about arbitrary documents,
+ presentations, or tables. It's very useful for document QA tasks, you can ask it a
+ specific question or ask it to return a JSON object answering multiple questions
+ about the document.
+
+ Parameters:
+ prompt (str): The question to be answered about the document image.
+ image (np.ndarray): The document image to analyze.
+
+ Returns:
+ str: The answer to the question based on the document's context.
+
+ Example
+ -------
+ >>> document_qa(image, question)
+ 'The answer to the question ...'
+ ",document_qa
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames
  value selected for the video. It can detect multiple objects independently per
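
The registry entry above documents the new `document_qa` tool alongside `document_extraction`. As a quick illustration of the documented signature, a minimal usage sketch (the file name and question are hypothetical, and Landing AI API credentials are assumed to be configured):

```python
# Minimal sketch of calling the new document_qa tool per the signature
# documented above: document_qa(prompt: str, image: np.ndarray) -> str.
# The image path and question are placeholders.
from vision_agent.tools import document_qa, load_image

image = load_image("annual_report_page1.png")  # hypothetical document image
answer = document_qa("What is the total revenue reported?", image)
print(answer)
```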

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_coder_v2.py
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
  from rich.console import Console
  from rich.markup import escape

- import vision_agent.tools as T
+ import vision_agent.tools.tools as T
  from vision_agent.agent import AgentCoder, AgentPlanner
  from vision_agent.agent.agent_utils import (
  DefaultImports,
@@ -34,7 +34,7 @@ from vision_agent.utils.execute import (
  CodeInterpreterFactory,
  Execution,
  )
- from vision_agent.utils.sim import Sim
+ from vision_agent.utils.sim import Sim, get_tool_recommender

  _CONSOLE = Console()

@@ -316,7 +316,7 @@ class VisionAgentCoderV2(AgentCoder):
  elif isinstance(tool_recommender, Sim):
  self.tool_recommender = tool_recommender
  else:
- self.tool_recommender = T.get_tool_recommender()
+ self.tool_recommender = get_tool_recommender()

  self.verbose = verbose
  self.code_sandbox_runtime = code_sandbox_runtime
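
With this change the default tool recommender for `VisionAgentCoderV2` comes from `vision_agent.utils.sim.get_tool_recommender()` instead of the helper previously exposed in `vision_agent.tools.tools`. A minimal construction sketch, assuming the remaining constructor defaults are sufficient:

```python
# Sketch of the two construction paths after this change: with no
# tool_recommender argument the agent falls back to the cached recommender
# from vision_agent.utils.sim; an explicit Sim instance is still accepted.
from vision_agent.agent.vision_agent_coder_v2 import VisionAgentCoderV2
from vision_agent.utils.sim import get_tool_recommender

coder_default = VisionAgentCoderV2()  # resolves get_tool_recommender() internally
coder_custom = VisionAgentCoderV2(tool_recommender=get_tool_recommender())
```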

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/agent/vision_agent_planner_prompts_v2.py
@@ -440,16 +440,17 @@ PICK_PLAN = """
  """

  CATEGORIZE_TOOL_REQUEST = """
- You are given a task: {task} from the user. Your task is to extract the type of category this task belongs to, it can be one or more of the following:
+ You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
  - "object detection and counting" - detecting objects or counting objects from a text prompt in an image or video.
  - "classification" - classifying objects in an image given a text prompt.
  - "segmentation" - segmenting objects in an image or video given a text prompt.
  - "OCR" - extracting text from an image.
  - "VQA" - answering questions about an image or video, can also be used for text extraction.
+ - "DocQA" - answering questions about a document or extracting information from a document.
  - "video object tracking" - tracking objects in a video.
  - "depth and pose estimation" - estimating the depth or pose of objects in an image.

- Return the category or categories (comma separated) inside tags <category># your categories here</category>.
+ Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
  """

  TEST_TOOLS = """
@@ -473,7 +474,7 @@ TEST_TOOLS = """
  {examples}

  **Instructions**:
- 1. List all the tools under **Tools** and the user request. Write a program to load the media and call every tool in parallel and print it's output along with other relevant information.
+ 1. List all the tools under **Tools** and the user request. Write a program to load the media and call the most relevant tools in parallel and print it's output along with other relevant information.
  2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
  3. Your test case MUST run only on the given images which are {media}
  4. Print this final dictionary.
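
For context, the updated `CATEGORIZE_TOOL_REQUEST` prompt is consumed by `run_tool_testing` in `planner_tools.py` (next section). A small sketch of how it gets filled in, using a hypothetical task string:

```python
# Sketch of formatting the updated CATEGORIZE_TOOL_REQUEST prompt before it is
# sent to the LMM; the task string is a placeholder. A well-formed reply would
# wrap comma-separated categories in <category> tags, e.g.
# "<category>OCR, DocQA</category>".
from vision_agent.agent.vision_agent_planner_prompts_v2 import CATEGORIZE_TOOL_REQUEST

prompt = CATEGORIZE_TOOL_REQUEST.format(task="pull the line items out of this invoice")
print(prompt)
```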

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/__init__.py
@@ -43,7 +43,6 @@ from .tools import (
  flux_image_inpainting,
  generate_pose_image,
  get_tool_documentation,
- get_tool_recommender,
  gpt4o_image_vqa,
  gpt4o_video_vqa,
  load_image,
@@ -63,6 +62,7 @@ from .tools import (
  save_json,
  save_video,
  siglip_classification,
+ stella_embeddings,
  template_match,
  video_temporal_localization,
  vit_image_classification,

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/planner_tools.py
@@ -32,6 +32,7 @@ from vision_agent.utils.execute import (
  MimeType,
  )
  from vision_agent.utils.image_utils import convert_to_b64
+ from vision_agent.utils.sim import get_tool_recommender

  TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}

@@ -116,13 +117,11 @@ def run_tool_testing(
  query = lmm.generate(CATEGORIZE_TOOL_REQUEST.format(task=task))
  category = extract_tag(query, "category") # type: ignore
  if category is None:
- category = task
+ query = task
  else:
- category = (
- f"I need models from the {category.strip()} category of tools. {task}"
- )
+ query = f"{category.strip()}. {task}"

- tool_docs = T.get_tool_recommender().top_k(category, k=10, thresh=0.2)
+ tool_docs = get_tool_recommender().top_k(query, k=5, thresh=0.3)
  if exclude_tools is not None and len(exclude_tools) > 0:
  cleaned_tool_docs = []
  for tool_doc in tool_docs:
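
The retrieval query is now the LMM's category answer prepended to the raw task, and the index lookup was tightened from `k=10, thresh=0.2` to `k=5, thresh=0.3`. A sketch of the new flow with placeholder task and category values (it assumes `top_k` returns rows from the tool DataFrame, which carries a `name` column):

```python
# Sketch of the new retrieval query built in run_tool_testing: the category
# string is prepended to the task before querying the cached tool index.
# Task and category values are placeholders.
from vision_agent.utils.sim import get_tool_recommender

task = "count the cars in the parking lot"
category = "object detection and counting"  # hypothetical LMM answer
query = f"{category.strip()}. {task}" if category else task

tool_docs = get_tool_recommender().top_k(query, k=5, thresh=0.3)
print([doc["name"] for doc in tool_docs])  # assumes rows expose a 'name' field
```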

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/tools/tools.py
@@ -7,7 +7,6 @@ import urllib.request
  from base64 import b64encode
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from enum import Enum
- from functools import lru_cache
  from importlib import resources
  from pathlib import Path
  from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -49,7 +48,6 @@ from vision_agent.utils.image_utils import (
  rle_decode,
  rle_decode_array,
  )
- from vision_agent.utils.sim import Sim, load_cached_sim
  from vision_agent.utils.video import (
  extract_frames_from_video,
  frames_to_bytes,
@@ -85,11 +83,6 @@ _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
  _LOGGER = logging.getLogger(__name__)


- @lru_cache(maxsize=1)
- def get_tool_recommender() -> Sim:
- return load_cached_sim(TOOLS_DF)
-
-
  def _display_tool_trace(
  function_name: str,
  request: Dict[str, Any],
@@ -410,7 +403,7 @@ def owl_v2_video(
  _display_tool_trace(
  owl_v2_video.__name__,
  payload,
- detections[0],
+ detections,
  files,
  )
  return bboxes_formatted
@@ -2178,13 +2171,14 @@ def document_qa(
  prompt: str,
  image: np.ndarray,
  ) -> str:
- """'document_qa' is a tool that can answer any questions about arbitrary
- images of documents or presentations. It answers by analyzing the contextual document data
- and then using a model to answer specific questions. It returns text as an answer to the question.
+ """'document_qa' is a tool that can answer any questions about arbitrary documents,
+ presentations, or tables. It's very useful for document QA tasks, you can ask it a
+ specific question or ask it to return a JSON object answering multiple questions
+ about the document.

  Parameters:
- prompt (str): The question to be answered about the document image
- image (np.ndarray): The document image to analyze
+ prompt (str): The question to be answered about the document image.
+ image (np.ndarray): The document image to analyze.

  Returns:
  str: The answer to the question based on the document's context.
@@ -2203,7 +2197,7 @@ def document_qa(
  "model": "document-analysis",
  }

- data: dict[str, Any] = send_inference_request(
+ data: Dict[str, Any] = send_inference_request(
  payload=payload,
  endpoint_name="document-analysis",
  files=files,
@@ -2225,10 +2219,10 @@ def document_qa(
  data = normalize(data)

  prompt = f"""
- Document Context:
- {data}\n
- Question: {prompt}\n
- Please provide a clear, concise answer using only the information from the document. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
+ Document Context:
+ {data}\n
+ Question: {prompt}\n
+ Answer the question directly using only the information from the document, do not answer with any additional text besides the answer. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
  """

  lmm = AnthropicLMM()
@@ -2245,6 +2239,22 @@ def document_qa(
  return llm_output


+ def stella_embeddings(prompts: List[str]) -> List[np.ndarray]:
+ payload = {
+ "input": prompts,
+ "model": "stella1.5b",
+ }
+
+ data: Dict[str, Any] = send_inference_request(
+ payload=payload,
+ endpoint_name="embeddings",
+ v2=True,
+ metadata_payload={"function_name": "get_embeddings"},
+ is_form=True,
+ )
+ return [d["embedding"] for d in data] # type: ignore
+
+
  # Utility and visualization functions


@@ -2781,6 +2791,7 @@ FUNCTION_TOOLS = [
  qwen2_vl_images_vqa,
  qwen2_vl_video_vqa,
  document_extraction,
+ document_qa,
  video_temporal_localization,
  flux_image_inpainting,
  siglip_classification,
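
`stella_embeddings` is new public surface in `vision_agent.tools`; it posts the prompts to a hosted "embeddings" endpoint (model `stella1.5b`), so network access and Landing AI credentials are assumed. A minimal sketch:

```python
# Sketch of calling the new stella_embeddings tool directly; the prompts are
# sent to the hosted embeddings endpoint, so an API key is assumed to be
# configured in the environment.
from vision_agent.tools import stella_embeddings

vecs = stella_embeddings(["segment the road signs", "read the text on the receipt"])
print(len(vecs))    # one embedding per prompt
print(vecs[0][:8])  # leading values; dimensionality depends on the hosted model
```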

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/__init__.py
@@ -7,4 +7,3 @@ from .execute import (
  Result,
  )
  from .sim import AzureSim, OllamaSim, Sim, load_sim, merge_sim
- from .video import extract_frames_from_video, video_writer

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/execute.py
@@ -28,10 +28,10 @@ from nbclient import __version__ as nbclient_version
  from nbclient.exceptions import CellTimeoutError, DeadKernelError
  from nbclient.util import run_sync
  from nbformat.v4 import new_code_cell
+ from opentelemetry.context import get_current
+ from opentelemetry.trace import SpanKind, Status, StatusCode, get_tracer
  from pydantic import BaseModel, field_serializer
  from typing_extensions import Self
- from opentelemetry.trace import get_tracer, Status, StatusCode, SpanKind
- from opentelemetry.context import get_current

  from vision_agent.utils.exceptions import (
  RemoteSandboxCreationError,

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/image_utils.py
@@ -11,7 +11,7 @@ import numpy as np
  from PIL import Image, ImageDraw, ImageFont
  from PIL.Image import Image as ImageType

- from vision_agent.utils import extract_frames_from_video
+ from vision_agent.utils.video import extract_frames_from_video

  COLORS = [
  (158, 218, 229),

{vision_agent-0.2.219 → vision_agent-0.2.221}/vision_agent/utils/sim.py
@@ -12,6 +12,13 @@ import requests
  from openai import AzureOpenAI, OpenAI
  from scipy.spatial.distance import cosine # type: ignore

+ from vision_agent.tools.tools import TOOLS_DF, stella_embeddings
+
+
+ @lru_cache(maxsize=1)
+ def get_tool_recommender() -> "Sim":
+ return load_cached_sim(TOOLS_DF)
+

  @lru_cache(maxsize=512)
  def get_embedding(
@@ -27,13 +34,13 @@ def load_cached_sim(
  cached_dir_full_path = str(resources.files("vision_agent") / cached_dir)
  if os.path.exists(cached_dir_full_path):
  if tools_df is not None:
- if Sim.check_load(cached_dir_full_path, tools_df):
+ if StellaSim.check_load(cached_dir_full_path, tools_df):
  # don't pass sim_key to loaded Sim object or else it will re-calculate embeddings
- return Sim.load(cached_dir_full_path)
+ return StellaSim.load(cached_dir_full_path)
  if os.path.exists(cached_dir_full_path):
  shutil.rmtree(cached_dir_full_path)

- sim = Sim(tools_df, sim_key=sim_key)
+ sim = StellaSim(tools_df, sim_key=sim_key)
  sim.save(cached_dir_full_path)
  return sim

@@ -214,6 +221,40 @@ class OllamaSim(Sim):
  )


+ class StellaSim(Sim):
+ def __init__(
+ self,
+ df: pd.DataFrame,
+ sim_key: Optional[str] = None,
+ ) -> None:
+ self.df = df
+
+ def emb_call(text: List[str]) -> List[float]:
+ return stella_embeddings(text)[0] # type: ignore
+
+ self.emb_call = emb_call
+
+ if "embs" not in df.columns and sim_key is None:
+ raise ValueError("key is required if no column 'embs' is present.")
+
+ if sim_key is not None:
+ self.df["embs"] = self.df[sim_key].apply(
+ lambda x: get_embedding(emb_call, x)
+ )
+
+ @staticmethod
+ def load(
+ load_dir: Union[str, Path],
+ api_key: Optional[str] = None,
+ model: str = "stella1.5b",
+ ) -> "StellaSim":
+ load_dir = Path(load_dir)
+ df = pd.read_csv(load_dir / "df.csv")
+ embs = np.load(load_dir / "embs.npy")
+ df["embs"] = list(embs)
+ return StellaSim(df)
+
+
  def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
  return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))

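
`StellaSim` reuses the `Sim` machinery but swaps the embedding call for `stella_embeddings`, and the `lru_cache`d `get_tool_recommender()` factory now lives here. A sketch of loading the packaged index the way `load_cached_sim` does (the `.sim_tools` path and the expected columns are assumptions based on the shipped `df.csv`/`embs.npy` files):

```python
# Sketch of loading the packaged tool index: StellaSim.load reads df.csv and
# embs.npy from the cache directory and attaches the vectors as an 'embs'
# column; get_tool_recommender() memoizes one instance per process.
from importlib import resources

from vision_agent.utils.sim import StellaSim, get_tool_recommender

cache_dir = str(resources.files("vision_agent") / ".sim_tools")
sim = StellaSim.load(cache_dir)
print(sim.df.columns.tolist())        # expected: desc, doc, name, embs

recommender = get_tool_recommender()  # cached instance backed by the same files
print(type(recommender).__name__)     # expected: "StellaSim"
```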