PyPI - vision-agent - Versions diffs - 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl - Mend

vision-agent 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

vision_agent/.sim_tools/df.csv +12 -12
vision_agent/.sim_tools/embs.npy +0 -0
vision_agent/agent/__init__.py +1 -0
vision_agent/agent/vision_agent_prompts_v3.py +372 -0
vision_agent/agent/vision_agent_v3.py +278 -0
vision_agent/lmm/lmm.py +219 -57
vision_agent/tools/__init__.py +3 -3
vision_agent/tools/planner_v3_tools.py +206 -0
vision_agent/tools/tools.py +55 -64
vision_agent/utils/agent.py +24 -8
vision_agent/utils/tools.py +1 -1
{vision_agent-1.1.16.dist-info → vision_agent-1.1.18.dist-info}/METADATA +4 -4
{vision_agent-1.1.16.dist-info → vision_agent-1.1.18.dist-info}/RECORD +15 -12
{vision_agent-1.1.16.dist-info → vision_agent-1.1.18.dist-info}/WHEEL +0 -0
{vision_agent-1.1.16.dist-info → vision_agent-1.1.18.dist-info}/licenses/LICENSE +0 -0

vision_agent/agent/vision_agent_v3.py ADDED Viewed

@@ -0,0 +1,278 @@
+import copy
+import re
+import time
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from rich.console import Console
+from rich.markup import escape
+from vision_agent.agent import Agent
+from vision_agent.agent.vision_agent_prompts_v3 import get_init_prompt
+from vision_agent.configs import Config
+from vision_agent.lmm import LMM, AnthropicLMM
+from vision_agent.models import AgentMessage, Message
+from vision_agent.utils.agent import (
+    add_media_to_chat,
+    capture_media_from_exec,
+    convert_message_to_agentmessage,
+    extract_tag,
+    print_code,
+    remove_installs_from_code,
+)
+from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
+# Package configuration instance created once at import time.
+CONFIG = Config()
+# Upper bound on the number of generated images run_code returns per execution.
+MAX_IMAGES = 10
+# Shared rich console used for the agent's progress output.
+_CONSOLE = Console()
+class DefaultImports:
+    """Import preamble placed ahead of every snippet passed to ``prepend_imports``."""
+    # Statements are emitted in exactly this order, one per line.
+    imports = [
+        "import os",
+        "import numpy as np",
+        "import cv2",
+        "from typing import *",
+        "from pillow_heif import register_heif_opener",
+        "from vision_agent.tools import load_image",
+        "from vision_agent.tools.planner_v3_tools import instance_segmentation, ocr, depth_estimation, visualize_bounding_boxes, visualize_segmentation_masks, get_crops, rotate_90, display_image, iou",
+        "register_heif_opener()",
+        "import matplotlib.pyplot as plt",
+    ]
+    @staticmethod
+    def prepend_imports(code: str) -> str:
+        """Return ``code`` preceded by the default imports and one blank line."""
+        preamble = "\n".join(DefaultImports.imports)
+        return f"{preamble}\n\n{code}"
+def run_chat(
+    model: LMM,
+    chat: List[AgentMessage],
+    kwargs: Optional[Dict[str, Any]] = None,
+) -> str:
+    """Send ``chat`` to ``model`` and return its reply as a string.
+
+    Agent roles are collapsed to the two roles passed to the model: user,
+    observation, final_observation and error_observation become "user";
+    every other role becomes "assistant". The input list is deep-copied first.
+
+    Args:
+        model: Callable LMM invoked with the formatted messages.
+        chat: Conversation so far.
+        kwargs: Extra keyword arguments forwarded to ``model``; None means none.
+    """
+    user_like_roles = ("user", "observation", "final_observation", "error_observation")
+    formatted_chat = [
+        {
+            "role": "user" if message.role in user_like_roles else "assistant",
+            "content": message.content,
+            "media": message.media,
+        }
+        for message in copy.deepcopy(chat)
+    ]
+    extra_args = kwargs or {}
+    return cast(str, model(formatted_chat, **extra_args))  # type: ignore
+def strip_signature(response: str) -> str:
+    """Return ``response`` without its ``<signature>...</signature>`` element.
+
+    The signature text is obtained from ``extract_tag``; when that returns
+    None the response is returned unchanged.
+    """
+    signature = extract_tag(response, "signature")
+    if signature is None:
+        return response
+    # NOTE(review): the replace only matches if extract_tag returns the tag
+    # body verbatim (no stripping) - confirm against extract_tag.
+    wrapped = f"<signature>{signature}</signature>"
+    return response.replace(wrapped, "")
+def strip_signature_from_agentmessage(
+    response: AgentMessage,
+) -> AgentMessage:
+    """Build a new AgentMessage with the same role and media as ``response``
+    and with the signature element removed from its content."""
+    cleaned_content = strip_signature(response.content)
+    return AgentMessage(
+        role=response.role, content=cleaned_content, media=response.media
+    )
+def fix_xml_code_tags(response: str) -> str:
+    """Normalize a model reply so code and answers use closed XML-style tags.
+
+    The first complete ```` ```python ... ``` ```` fence is rewritten as
+    ``<code>\\n...\\n</code>`` (fenced text stripped). Afterwards an opened but
+    unclosed ``<answer>`` or ``<code>`` gets its closing tag appended.
+    """
+    before, fence, remainder = response.partition("```python")
+    if fence:
+        fenced_body, closing_fence, after = remainder.partition("```")
+        # An unterminated fence is left untouched.
+        if closing_fence:
+            response = f"{before}<code>\n{fenced_body.strip()}\n</code>{after}"
+    # Close dangling tags; <answer> is handled before <code>, as a reply that
+    # opened both gets "</answer>" appended first.
+    for tag in ("answer", "code"):
+        if f"<{tag}>" in response and f"</{tag}>" not in response:
+            response += f"</{tag}>"
+    return response
+def strip_extra_content(response: str) -> str:
+    """Reduce a reply containing code to its thinking, signature and code parts.
+
+    When ``response`` holds a complete ``<code>...</code>`` element, the result
+    is the first ``<thinking>`` element, the first ``<signature>`` element and
+    the first ``<code>`` element concatenated in that order; everything else
+    (for example a premature ``<answer>``) is dropped. A thinking or signature
+    element that is absent or unclosed is simply omitted. When there is no
+    complete code element the response is returned unchanged.
+    """
+    def _element(tag: str) -> str:
+        # Return the first "<tag>...</tag>" span, or "" if it is missing or
+        # unclosed. Guarding both lookups keeps a -1 from str.find from ever
+        # being used as a slice bound, which previously produced garbage.
+        open_tag, close_tag = f"<{tag}>", f"</{tag}>"
+        start = response.find(open_tag)
+        if start == -1:
+            return ""
+        end = response.find(close_tag, start + len(open_tag))
+        if end == -1:
+            return ""
+        return response[start : end + len(close_tag)]
+    code = _element("code")
+    if not code:
+        return response
+    return _element("thinking") + _element("signature") + code
+def run_code(
+    code: str,
+    code_interpreter: CodeInterpreter,
+) -> Tuple[str, List[str], float]:
+    """Execute ``code`` in ``code_interpreter`` and collect its output.
+
+    Install commands are removed and the default imports are prepended before
+    execution.
+
+    Returns:
+        A tuple of (stripped text output including logs, at most MAX_IMAGES
+        captured media items, wall-clock seconds spent in ``exec_cell``).
+    """
+    code = remove_installs_from_code(code)
+    started = time.time()
+    execution = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
+    elapsed = time.time() - started
+    obs = execution.text(include_logs=True).strip()
+    result_images = capture_media_from_exec(execution)
+    if not result_images:
+        return obs, [], elapsed
+    return_images = result_images[:MAX_IMAGES]
+    # Tell the model how many images accompany this observation.
+    obs += f"\n\n[{len(return_images)} images were generated by your code and are included with this message]"
+    return obs, return_images, elapsed
+def format_obs_message(
+    obs: str,
+    turn: int,
+    turns: int,
+) -> str:
+    """Wrap an execution result in a turn header for the model.
+
+    Args:
+        obs: Text output of the executed code.
+        turn: Zero-based index of the current turn.
+        turns: Total number of turns available.
+
+    Returns:
+        The header plus ``obs``; on the second-to-last turn a warning is
+        appended announcing that the next turn is the final one.
+    """
+    message = f"[Turn {turn + 1}/{turns}] Code execution result:\n{obs}"
+    next_turn_is_last = turn + 2 == turns
+    if not next_turn_is_last:
+        return message
+    return (
+        message
+        + "\n\n⚠️CRITICAL: The next turn will be your FINAL turn. Please make sure to provide your final answer in <answer> tags in your next response, no need to incude <code> tags. Rember to print out final answers without any explaination, it could be a single word, number, price or a list of bounding boxes of object detection."
+    )
+class VisionAgentV3(Agent):
+    """Agent that answers a vision question by alternating LMM turns and code execution.
+
+    On each turn the model reply is normalized, any ``<code>`` it contains is
+    run in a code interpreter and the output is fed back as an observation.
+    The loop stops as soon as the model emits an ``<answer>`` or after
+    ``self.turns`` turns.
+    """
+    def __init__(
+        self,
+        agent: Optional[LMM] = None,
+        hil: bool = False,
+        verbose: bool = False,
+        code_sandbox_runtime: Optional[str] = None,
+        update_callback: Callable[[Dict[str, Any]], None] = lambda x: None,
+    ) -> None:
+        """Configure the agent.
+
+        Args:
+            agent: LMM used for every turn. Defaults to an AnthropicLMM on
+                claude-3-7-sonnet-20250219 with max_tokens=8192.
+            hil: Accepted for interface compatibility; not used in this class.
+            verbose: When True, print each step's thoughts, answer and code.
+            code_sandbox_runtime: Passed to CodeInterpreterFactory.new_instance
+                when ``chat`` is not given an interpreter.
+            update_callback: Called with the dumped form of every message the
+                agent produces.
+        """
+        # Fix: a caller-supplied LMM used to be dropped, leaving self.agent
+        # undefined and making chat() fail with AttributeError.
+        self.agent = (
+            agent
+            if agent is not None
+            else AnthropicLMM(
+                model_name="claude-3-7-sonnet-20250219", max_tokens=8192
+            )
+        )
+        # Extra arguments sent with every model call: thinking is enabled and
+        # generation stops at the closing code/answer tag.
+        self.kwargs = {
+            "thinking": {"type": "enabled", "budget_tokens": 4096},
+            "stop_sequences": ["</code>", "</answer>"],
+        }
+        self.turns = 7
+        self.verbose = verbose
+        self.code_sandbox_runtime = code_sandbox_runtime
+        self.update_callback = update_callback
+    def __call__(
+        self,
+        input: Union[str, List[Message]],
+        media: Optional[Union[str, Path]] = None,
+    ) -> str:
+        """Run the agent on ``input`` and return the content of the last message produced."""
+        msg = convert_message_to_agentmessage(input, media)
+        return self.chat(msg)[-1].content
+    def chat(
+        self,
+        chat: List[AgentMessage],
+        code_interpreter: Optional[CodeInterpreter] = None,
+    ) -> List[AgentMessage]:
+        """Run the turn loop and return the messages it generated.
+
+        Args:
+            chat: Conversation whose last message has role "user" or
+                "interaction_response". It is deep-copied, not mutated.
+            code_interpreter: Interpreter to run code in; a new one is created
+                from ``self.code_sandbox_runtime`` when None.
+
+        Returns:
+            The initial prompt message followed by the assistant and
+            observation messages, in order.
+
+        Raises:
+            ValueError: If ``chat`` is empty or its last message has another role.
+        """
+        chat = copy.deepcopy(chat)
+        # Fix: an empty chat used to raise IndexError while formatting the
+        # error message below instead of the intended ValueError.
+        if not chat:
+            raise ValueError("Chat must contain at least one message.")
+        if chat[-1].role not in {"user", "interaction_response"}:
+            raise ValueError(
+                f"Last chat message must be from the user or interaction_response, got {chat[-1].role}."
+            )
+        return_chat: List[AgentMessage] = []
+        with (
+            CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
+            if code_interpreter is None
+            else code_interpreter
+        ) as code_interpreter:
+            int_chat, _, _ = add_media_to_chat(
+                chat, code_interpreter, append_to_prompt=False
+            )
+            # Only the first message supplies the question and media for the prompt.
+            init_prompt = get_init_prompt(
+                model="",
+                turns=self.turns,
+                question=int_chat[0].content,
+                category="",
+                image_path=str(int_chat[0].media),
+            )
+            return_chat.append(
+                AgentMessage(role="user", content=init_prompt, media=int_chat[0].media)
+            )
+            for turn in range(self.turns):
+                response = run_chat(self.agent, return_chat, self.kwargs)
+                response = fix_xml_code_tags(response)
+                response = strip_extra_content(response)
+                return_chat.append(AgentMessage(role="assistant", content=response))
+                self.update_callback(
+                    strip_signature_from_agentmessage(return_chat[-1]).model_dump()
+                )
+                code = extract_tag(response, "code")
+                thoughts = extract_tag(response, "thinking")
+                answer = extract_tag(response, "answer")
+                if self.verbose:
+                    # Model text is escaped so stray "[...]" is not parsed as
+                    # rich markup; the step number is 1-based to match the
+                    # "[Turn i/n]" header of the observations.
+                    shown_thoughts = escape(thoughts) if thoughts is not None else ""
+                    _CONSOLE.print(
+                        f"[bold cyan]Step {turn + 1}/{self.turns}[/bold cyan]\n"
+                        f"[green]{shown_thoughts}[/green]\n"
+                    )
+                    if answer is not None:
+                        _CONSOLE.print(
+                            f"[magenta]Final answer: {escape(answer)}[/magenta]\n"
+                        )
+                    if code is not None:
+                        print_code("Code:", code)
+                if answer is not None:
+                    # The final answer is already in the assistant message just
+                    # appended, so nothing more is added to return_chat.
+                    self.update_callback(
+                        AgentMessage(
+                            role="final_observation",
+                            content=f"<answer>{answer}</answer>",
+                        ).model_dump()
+                    )
+                    # Fix: stop here. Without this the loop kept querying the
+                    # model with an assistant message last, after the answer.
+                    break
+                elif code is not None:
+                    obs, images, latency = run_code(code, code_interpreter)
+                    obs = format_obs_message(obs, turn, self.turns)
+                    # NOTE(review): printed regardless of self.verbose - confirm intended.
+                    _CONSOLE.print(
+                        f"[bold cyan]Code execution took {latency:.2f} seconds.[/bold cyan]\n"
+                        f"[yellow]{escape(obs)}[/yellow]\n"
+                    )
+                    return_chat.append(
+                        AgentMessage(role="observation", content=obs, media=images)
+                    )
+                    self.update_callback(
+                        strip_signature_from_agentmessage(return_chat[-1]).model_dump()
+                    )
+                # NOTE(review): a reply with neither <answer> nor <code> falls
+                # through and the next turn is requested with an assistant
+                # message last - confirm this is intended.
+        return return_chat
+    def log_progress(self, data: Dict[str, Any]) -> None:
+        """No-op progress hook."""
+        pass

vision_agent/lmm/lmm.py CHANGED Viewed

@@ -1,19 +1,33 @@
+import base64
 import json
 import os
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
-import base64
+from typing import (
+    Any,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Union,
+    cast,
+)
 import anthropic
 import requests
-from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
-from openai import AzureOpenAI, OpenAI
+from anthropic.types import (
+    ImageBlockParam,
+    MessageParam,
+    TextBlockParam,
+    ThinkingBlockParam,
+)
 from google import genai  # type: ignore
 from google.genai import types  # type: ignore
+from openai import AzureOpenAI, OpenAI
 from vision_agent.models import Message
+from vision_agent.utils.agent import extract_tag
 from vision_agent.utils.image_utils import encode_media
@@ -99,11 +113,15 @@ class OpenAILMM(LMM):
                 [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
         """
         fixed_chat = []
-        for c in chat:
-            fixed_c = {"role": c["role"]}
-            fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
-            if "media" in c and self.model_name != "o3-mini":
-                for media in c["media"]:
+        for msg in chat:
+            fixed_c = {"role": msg["role"]}
+            fixed_c["content"] = [{"type": "text", "text": msg["content"]}]  # type: ignore
+            if (
+                "media" in msg
+                and msg["media"] is not None
+                and self.model_name != "o3-mini"
+            ):
+                for media in msg["media"]:
                     resize = kwargs["resize"] if "resize" in kwargs else self.image_size
                     image_detail = (
                         kwargs["image_detail"]
@@ -297,14 +315,14 @@ class OllamaLMM(LMM):
                 [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
         """
         fixed_chat = []
-        for message in chat:
-            if "media" in message:
+        for msg in chat:
+            if "media" in msg and msg["media"] is not None:
                 resize = kwargs["resize"] if "resize" in kwargs else self.image_size
-                message["images"] = [
-                    encode_media(cast(str, m), resize=resize) for m in message["media"]
+                msg["images"] = [
+                    encode_media(cast(str, m), resize=resize) for m in msg["media"]
                 ]
-                del message["media"]
-            fixed_chat.append(message)
+                del msg["media"]
+            fixed_chat.append(msg)
         url = f"{self.url}/chat"
         model = self.model_name
         messages = fixed_chat
@@ -410,63 +428,207 @@ class AnthropicLMM(LMM):
     def __call__(
         self,
-        input: Union[str, Sequence[Dict[str, Any]]],
+        input: Union[str, Sequence[Message]],
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
         if isinstance(input, str):
             return self.generate(input, **kwargs)
         return self.chat(input, **kwargs)
-    def chat(
+    def create_thinking_assistant_message(
         self,
-        chat: Sequence[Dict[str, Any]],
-        **kwargs: Any,
-    ) -> Union[str, Iterator[Optional[str]]]:
+        msg_content: str,
+    ) -> MessageParam:
+        """Rebuild an assistant MessageParam from tagged assistant text.
+
+        ``msg_content`` is expected to carry ``<thinking>`` and ``<signature>``
+        elements. The thinking text and signature become a ThinkingBlockParam;
+        what remains after removing both elements becomes a TextBlockParam.
+
+        Args:
+            msg_content: Assistant message text, possibly containing the tags.
+
+        Returns:
+            An assistant MessageParam with zero, one or two content blocks.
+        """
+        content: List[Union[TextBlockParam, ThinkingBlockParam]] = []
+        thinking_content = extract_tag(msg_content, "thinking")
+        signature = extract_tag(msg_content, "signature")
+        if thinking_content:
+            content.append(
+                ThinkingBlockParam(
+                    type="thinking",
+                    thinking=thinking_content.strip(),
+                    # An absent signature is sent as an empty string.
+                    signature=signature.strip() if signature else "",
+                )
+            )
+        # Same lookup as ``signature`` above, repeated for the text clean-up.
+        signature_content = extract_tag(msg_content, "signature")
+        if signature_content:
+            text_content = msg_content.replace(
+                f"<thinking>{thinking_content}</thinking>", ""
+            ).replace(f"<signature>{signature_content}</signature>", "")
+        else:
+            text_content = msg_content.replace(
+                f"<thinking>{thinking_content}</thinking>", ""
+            )
+        # NOTE(review): a <redacted_thinking> element is not handled here and
+        # would stay in the text block - confirm whether that is intended.
+        if text_content.strip():
+            content.append(TextBlockParam(type="text", text=text_content.strip()))
+        return MessageParam(role="assistant", content=content)
+    def _setup_chat_kwargs(self, kwargs: Dict[str, Any]) -> tuple[Dict[str, Any], bool]:
+        """Merge call-time kwargs over the instance defaults and detect thinking mode.
+
+        Returns:
+            A tuple of (merged kwargs, whether ``thinking.type`` is "enabled").
+            When thinking is enabled the merged ``temperature`` is set to 1.0.
+        """
+        # Call-time values override the instance defaults.
+        merged = self.kwargs | kwargs
+        thinking_enabled = False
+        if "thinking" in merged:
+            thinking_cfg = merged["thinking"]
+            thinking_enabled = (
+                "type" in thinking_cfg and thinking_cfg["type"] == "enabled"
+            )
+        if thinking_enabled:
+            merged["temperature"] = 1.0
+        return merged, thinking_enabled
+    def _convert_messages_to_anthropic_format(
+        self, chat: Sequence[Message], thinking_enabled: bool, **kwargs: Any
+    ) -> List[MessageParam]:
+        """Convert chat messages to Anthropic format.
+
+        User messages become a text block followed by one base64 PNG image
+        block per media item. Assistant messages become a single text block,
+        or are rebuilt through create_thinking_assistant_message when
+        ``thinking_enabled`` is True.
+
+        Args:
+            chat: Messages with "role", "content" and optionally "media".
+            thinking_enabled: Whether assistant text should be split into
+                thinking and text blocks.
+            **kwargs: Only "resize" is read; it overrides ``self.image_size``.
+
+        Raises:
+            ValueError: If a message role is neither "user" nor "assistant".
+        """
         messages: List[MessageParam] = []
         for msg in chat:
-            content: List[Union[TextBlockParam, ImageBlockParam]] = [
-                TextBlockParam(type="text", text=msg["content"])
-            ]
-            if "media" in msg:
-                for media_path in msg["media"]:
-                    resize = kwargs["resize"] if "resize" in kwargs else self.image_size
-                    encoded_media = encode_media(media_path, resize=resize)
-                    if encoded_media.startswith("data:image/png;base64,"):
-                        encoded_media = encoded_media[len("data:image/png;base64,") :]
-                    content.append(
-                        ImageBlockParam(
-                            type="image",
-                            source={
-                                "type": "base64",
-                                "media_type": "image/png",
-                                "data": encoded_media,
-                            },
+            if msg["role"] == "user":
+                content: List[Union[TextBlockParam, ImageBlockParam]] = [
+                    TextBlockParam(type="text", text=cast(str, msg["content"]))
+                ]
+                # A "media" key that is present but None is treated as no media.
+                if "media" in msg and msg["media"] is not None:
+                    for media_path in msg["media"]:
+                        resize = (
+                            kwargs["resize"] if "resize" in kwargs else self.image_size
+                        )
+                        encoded_media = encode_media(
+                            cast(str, media_path), resize=resize
+                        )
+                        # Drop the data-URL prefix; only the raw base64 payload is sent.
+                        if encoded_media.startswith("data:image/png;base64,"):
+                            encoded_media = encoded_media[
+                                len("data:image/png;base64,") :
+                            ]
+                        content.append(
+                            ImageBlockParam(
+                                type="image",
+                                source={
+                                    "type": "base64",
+                                    "media_type": "image/png",
+                                    "data": encoded_media,
+                                },
+                            )
+                        )
+                messages.append({"role": "user", "content": content})
+            elif msg["role"] == "assistant":
+                if thinking_enabled:
+                    messages.append(
+                        self.create_thinking_assistant_message(
+                            cast(str, msg["content"]),
+                        )
+                    )
+                else:
+                    messages.append(
+                        MessageParam(
+                            role="assistant",
+                            content=[
+                                {"type": "text", "text": cast(str, msg["content"])}
+                            ],
                         )
                     )
-            messages.append({"role": msg["role"], "content": content})
+            else:
+                raise ValueError(
+                    f"Unsupported role {msg['role']}. Only 'user' and 'assistant' roles are supported."
+                )
-        # prefers kwargs from second dictionary over first
-        tmp_kwargs = self.kwargs | kwargs
-        response = self.client.messages.create(
-            model=self.model_name, messages=messages, **tmp_kwargs
+        return messages
+    def _handle_streaming_response(
+        self, stream_response: anthropic.Stream[anthropic.MessageStreamEvent]
+    ) -> Iterator[Optional[str]]:
+        """Handle streaming response from Anthropic API.
+
+        Thinking and signature deltas are wrapped in ``<thinking>`` and
+        ``<signature>`` pseudo-tags so the concatenated stream has the same
+        layout as the non-streaming output:
+        ``<thinking>..</thinking>\\n<signature>..</signature>\\ntext``.
+
+        Args:
+            stream_response: Event stream returned by ``messages.create``.
+
+        Returns:
+            A generator of text fragments, followed by None at message_stop.
+        """
+        def f() -> Iterator[Optional[str]]:
+            # Pseudo-tag currently open: "thinking", "signature" or None.
+            open_tag: Optional[str] = None
+            for chunk in stream_response:
+                if chunk.type == "content_block_delta":
+                    delta = chunk.delta
+                    if delta.type == "text_delta":
+                        section, payload = None, delta.text
+                    elif delta.type == "thinking_delta":
+                        section, payload = "thinking", delta.thinking
+                    elif delta.type == "signature_delta":
+                        section, payload = "signature", delta.signature
+                    else:
+                        # Other delta kinds are ignored.
+                        continue
+                    if section == open_tag:
+                        yield payload
+                        continue
+                    # Fix: close the previous section before opening the next.
+                    # The old code opened <signature> while <thinking> was
+                    # still open, producing mis-nested tags.
+                    prefix = f"</{open_tag}>\n" if open_tag else ""
+                    if section is not None:
+                        prefix += f"<{section}>"
+                    open_tag = section
+                    yield prefix + payload
+                elif chunk.type == "message_stop":
+                    # Fix: a stream ending inside a section left its tag open.
+                    if open_tag:
+                        yield f"</{open_tag}>"
+                        open_tag = None
+                    yield None
+        return f()
+    def _format_thinking_response(self, msg_response: anthropic.types.Message) -> str:
+        """Format thinking mode response with proper tags.
+
+        Thinking, redacted-thinking and text blocks are concatenated per kind;
+        the last non-empty signature seen on a thinking block is kept. The
+        result is ``<thinking>`` first, then ``<redacted_thinking>`` and
+        ``<signature>`` when non-empty, then the plain text.
+        """
+        thinking_parts: List[str] = []
+        redacted_parts: List[str] = []
+        text_parts: List[str] = []
+        signature = ""
+        for block in msg_response.content:
+            kind = block.type
+            if kind == "thinking":
+                thinking_parts.append(block.thinking)
+                if block.signature:
+                    signature = block.signature
+            elif kind == "text":
+                text_parts.append(block.text)
+            elif kind == "redacted_thinking":
+                redacted_parts.append(block.data)
+        thinking = "".join(thinking_parts)
+        redacted_thinking = "".join(redacted_parts)
+        sections = [f"<thinking>{thinking}</thinking>\n"]
+        if redacted_thinking:
+            sections.append(
+                f"<redacted_thinking>{redacted_thinking}</redacted_thinking>\n"
+            )
+        if signature:
+            sections.append(f"<signature>{signature}</signature>\n")
+        sections.append("".join(text_parts))
+        return "".join(
+            sections
         )
-        if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
-            def f() -> Iterator[Optional[str]]:
-                for chunk in response:
-                    if (
-                        chunk.type == "message_start"
-                        or chunk.type == "content_block_start"
-                    ):
-                        continue
-                    elif chunk.type == "content_block_delta":
-                        yield chunk.delta.text
-                    elif chunk.type == "message_stop":
-                        yield None
+    def _handle_non_streaming_response(
+        self, response_untyped: Any, thinking_enabled: bool
+    ) -> str:
+        """Handle non-streaming response from Anthropic API.
+
+        Returns the tagged thinking format when ``thinking_enabled`` is True,
+        otherwise the text of the first content block.
+        """
+        message = cast(anthropic.types.Message, response_untyped)
+        if not thinking_enabled:
+            first_block = cast(anthropic.types.TextBlock, message.content[0])
+            return first_block.text
+        return self._format_thinking_response(message)
-            return f()
+    def chat(
+        self,
+        chat: Sequence[Message],
+        **kwargs: Any,
+    ) -> Union[str, Iterator[Optional[str]]]:
+        """Send a conversation to the Anthropic Messages API.
+
+        Args:
+            chat: Messages with "user" or "assistant" roles.
+            **kwargs: Overrides merged over ``self.kwargs`` and forwarded to
+                ``client.messages.create``.
+
+        Returns:
+            The reply text, or a generator of fragments when ``stream`` is truthy.
+        """
+        tmp_kwargs, thinking_enabled = self._setup_chat_kwargs(kwargs)
+        messages = self._convert_messages_to_anthropic_format(
+            chat, thinking_enabled, **kwargs
+        )
+        # NOTE(review): every merged kwarg, including a "resize" read by the
+        # converter above, is forwarded to the API call - confirm intended.
+        response_untyped = self.client.messages.create(
+            model=self.model_name, messages=messages, **tmp_kwargs
+        )
+        is_stream = bool(tmp_kwargs.get("stream", False))
+        if is_stream:
+            stream_response = cast(
+                anthropic.Stream[anthropic.MessageStreamEvent], response_untyped
+            )
+            return self._handle_streaming_response(stream_response)
         else:
-            return cast(str, response.content[0].text)
+            return self._handle_non_streaming_response(
+                response_untyped, thinking_enabled
+            )
     def generate(
         self,

vision_agent/tools/__init__.py CHANGED Viewed

@@ -21,7 +21,7 @@ from .tools import (
     countgd_sam2_visual_instance_segmentation,
     countgd_visual_object_detection,
     custom_object_detection,
-    depth_anything_v2,
+    depth_pro,
     detr_segmentation,
     document_extraction,
     document_qa,
@@ -42,7 +42,6 @@ from .tools import (
     glee_sam2_video_tracking,
     load_image,
     minimum_distance,
-    ocr,
     od_sam2_video_tracking,
     overlay_bounding_boxes,
     overlay_heat_map,
@@ -50,6 +49,7 @@ from .tools import (
     owlv2_object_detection,
     owlv2_sam2_instance_segmentation,
     owlv2_sam2_video_tracking,
+    paddle_ocr,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
     qwen25_vl_images_vqa,
@@ -74,7 +74,7 @@ def register_tool(imports: Optional[List] = None) -> Callable:
     def decorator(tool: Callable) -> Callable:
         import inspect
-        global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO
+        global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO  # noqa: F824
         from vision_agent.tools.tools import TOOLS
         if tool not in TOOLS:  # type: ignore

vision-agent 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl

vision-agent 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl