vision-agent 0.2.229__py3-none-any.whl → 0.2.231__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,160 @@
+ from typing import Type
+
+ from pydantic import BaseModel, Field
+
+ from vision_agent.lmm import LMM, OpenAILMM
+
+
+ class Config(BaseModel):
+     # for vision_agent_v2
+     agent: Type[LMM] = Field(default=OpenAILMM)
+     agent_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_planner_v2
+     planner: Type[LMM] = Field(default=OpenAILMM)
+     planner_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_planner_v2
+     summarizer: Type[LMM] = Field(default=OpenAILMM)
+     summarizer_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "o1",
+             "temperature": 1.0,
+             "image_size": 768,
+         }
+     )
+
+     # for vision_agent_planner_v2
+     critic: Type[LMM] = Field(default=OpenAILMM)
+     critic_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_coder_v2
+     coder: Type[LMM] = Field(default=OpenAILMM)
+     coder_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_coder_v2
+     tester: Type[LMM] = Field(default=OpenAILMM)
+     tester_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_coder_v2
+     debugger: Type[LMM] = Field(default=OpenAILMM)
+     debugger_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for get_tool_for_task
+     tool_tester: Type[LMM] = Field(default=OpenAILMM)
+     tool_tester_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for get_tool_for_task
+     tool_chooser: Type[LMM] = Field(default=OpenAILMM)
+     tool_chooser_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 1.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for suggestions module
+     suggester: Type[LMM] = Field(default=OpenAILMM)
+     suggester_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 1.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vqa module
+     vqa: Type[LMM] = Field(default=OpenAILMM)
+     vqa_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     def create_agent(self) -> LMM:
+         return self.agent(**self.agent_kwargs)
+
+     def create_planner(self) -> LMM:
+         return self.planner(**self.planner_kwargs)
+
+     def create_summarizer(self) -> LMM:
+         return self.summarizer(**self.summarizer_kwargs)
+
+     def create_critic(self) -> LMM:
+         return self.critic(**self.critic_kwargs)
+
+     def create_coder(self) -> LMM:
+         return self.coder(**self.coder_kwargs)
+
+     def create_tester(self) -> LMM:
+         return self.tester(**self.tester_kwargs)
+
+     def create_debugger(self) -> LMM:
+         return self.debugger(**self.debugger_kwargs)
+
+     def create_tool_tester(self) -> LMM:
+         return self.tool_tester(**self.tool_tester_kwargs)
+
+     def create_tool_chooser(self) -> LMM:
+         return self.tool_chooser(**self.tool_chooser_kwargs)
+
+     def create_suggester(self) -> LMM:
+         return self.suggester(**self.suggester_kwargs)
+
+     def create_vqa(self) -> LMM:
+         return self.vqa(**self.vqa_kwargs)
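For orientation, a minimal usage sketch of the new Config object. The class, its fields, and the create_* helpers come from the added file above; the AnthropicLMM override is purely illustrative, and the sketch assumes the relevant API keys (OPENAI_API_KEY, ANTHROPIC_API_KEY) are available in the environment.

from vision_agent.configs import Config
from vision_agent.lmm import AnthropicLMM

config = Config()                  # defaults: OpenAILMM with gpt-4o-2024-08-06 for most roles
planner = config.create_planner()  # equivalent to config.planner(**config.planner_kwargs)

# Config is a pydantic model, so individual roles can be swapped at construction time,
# e.g. running the coder on Claude instead of GPT-4o (illustrative values):
config = Config(
    coder=AnthropicLMM,
    coder_kwargs={"model_name": "claude-3-5-sonnet-20240620", "temperature": 0.0},
)
coder = config.create_coder()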
@@ -1,2 +1,2 @@
- from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
+ from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, GoogleLMM, OllamaLMM, OpenAILMM
  from .types import Message
vision_agent/lmm/lmm.py CHANGED
@@ -50,6 +50,8 @@ class OpenAILMM(LMM):
          api_key: Optional[str] = None,
          max_tokens: int = 4096,
          json_mode: bool = False,
+         image_size: int = 768,
+         image_detail: str = "low",
          **kwargs: Any,
      ):
          if not api_key:
@@ -59,7 +61,10 @@ class OpenAILMM(LMM):

          self.client = OpenAI(api_key=api_key)
          self.model_name = model_name
-         if "max_tokens" not in kwargs:
+         self.image_size = image_size
+         self.image_detail = image_detail
+         # o1 does not use max_tokens
+         if "max_tokens" not in kwargs and not model_name.startswith("o1"):
              kwargs["max_tokens"] = max_tokens
          if json_mode:
              kwargs["response_format"] = {"type": "json_object"}
@@ -94,7 +99,13 @@ class OpenAILMM(LMM):
              fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
              if "media" in c:
                  for media in c["media"]:
-                     encoded_media = encode_media(cast(str, media))
+                     resize = kwargs["resize"] if "resize" in kwargs else self.image_size
+                     image_detail = (
+                         kwargs["image_detail"]
+                         if "image_detail" in kwargs
+                         else self.image_detail
+                     )
+                     encoded_media = encode_media(cast(str, media), resize=resize)

                      fixed_c["content"].append(  # type: ignore
                          {
@@ -106,7 +117,7 @@ class OpenAILMM(LMM):
                                      or encoded_media.startswith("data:image/")
                                      else f"data:image/png;base64,{encoded_media}"
                                  ),
-                                 "detail": "low",
+                                 "detail": image_detail,
                              },
                          },
                      )
@@ -144,7 +155,13 @@ class OpenAILMM(LMM):
          ]
          if media and len(media) > 0:
              for m in media:
-                 encoded_media = encode_media(m)
+                 resize = kwargs["resize"] if "resize" in kwargs else None
+                 image_detail = (
+                     kwargs["image_detail"]
+                     if "image_detail" in kwargs
+                     else self.image_detail
+                 )
+                 encoded_media = encode_media(m, resize=resize)
                  message[0]["content"].append(
                      {
                          "type": "image_url",
@@ -155,7 +172,7 @@
                              or encoded_media.startswith("data:image/")
                              else f"data:image/png;base64,{encoded_media}"
                          ),
-                         "detail": "low",
+                         "detail": image_detail,
                      },
                  },
              )
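These hunks make the previously hard-coded media handling configurable: image_size is forwarded to encode_media(..., resize=...) when media are attached, and image_detail replaces the fixed "detail": "low" field in the request payload. A minimal construction sketch, assuming OPENAI_API_KEY is set; the "high" detail value is only an illustration:

from vision_agent.lmm import OpenAILMM

# Defaults in the diff are image_size=768 and image_detail="low".
lmm = OpenAILMM(
    model_name="gpt-4o-2024-08-06",
    image_size=768,
    image_detail="high",
)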
@@ -186,6 +203,7 @@ class AzureOpenAILMM(OpenAILMM):
          azure_endpoint: Optional[str] = None,
          max_tokens: int = 4096,
          json_mode: bool = False,
+         image_detail: str = "low",
          **kwargs: Any,
      ):
          if not api_key:
@@ -208,6 +226,7 @@ class AzureOpenAILMM(OpenAILMM):
              azure_endpoint=azure_endpoint,
          )
          self.model_name = model_name
+         self.image_detail = image_detail

          if "max_tokens" not in kwargs:
              kwargs["max_tokens"] = max_tokens
@@ -225,6 +244,7 @@ class OllamaLMM(LMM):
          base_url: Optional[str] = "http://localhost:11434/api",
          json_mode: bool = False,
          num_ctx: int = 128_000,
+         image_size: int = 768,
          **kwargs: Any,
      ):
          """Initializes the Ollama LMM. kwargs are passed as 'options' to the model.
@@ -241,6 +261,7 @@ class OllamaLMM(LMM):

          self.url = base_url
          self.model_name = model_name
+         self.image_size = image_size
          self.kwargs = {"options": kwargs}

          if json_mode:
@@ -273,8 +294,9 @@ class OllamaLMM(LMM):
          fixed_chat = []
          for message in chat:
              if "media" in message:
+                 resize = kwargs["resize"] if "resize" in kwargs else self.image_size
                  message["images"] = [
-                     encode_media(cast(str, m)) for m in message["media"]
+                     encode_media(cast(str, m), resize=resize) for m in message["media"]
                  ]
                  del message["media"]
              fixed_chat.append(message)
@@ -328,7 +350,8 @@ class OllamaLMM(LMM):

          if media and len(media) > 0:
              for m in media:
-                 data["images"].append(encode_media(m))
+                 resize = kwargs["resize"] if "resize" in kwargs else self.image_size
+                 data["images"].append(encode_media(m, resize=resize))

          tmp_kwargs = self.kwargs | kwargs
          data.update(tmp_kwargs)
@@ -370,9 +393,11 @@ class AnthropicLMM(LMM):
          api_key: Optional[str] = None,
          model_name: str = "claude-3-5-sonnet-20240620",
          max_tokens: int = 4096,
+         image_size: int = 768,
          **kwargs: Any,
      ):
          self.client = anthropic.Anthropic(api_key=api_key)
+         self.image_size = image_size
          self.model_name = model_name
          if "max_tokens" not in kwargs:
              kwargs["max_tokens"] = max_tokens
@@ -399,7 +424,8 @@ class AnthropicLMM(LMM):
              ]
              if "media" in msg:
                  for media_path in msg["media"]:
-                     encoded_media = encode_media(media_path, resize=768)
+                     resize = kwargs["resize"] if "resize" in kwargs else self.image_size
+                     encoded_media = encode_media(media_path, resize=resize)
                      if encoded_media.startswith("data:image/png;base64,"):
                          encoded_media = encoded_media[len("data:image/png;base64,") :]
                      content.append(
@@ -448,7 +474,8 @@ class AnthropicLMM(LMM):
          ]
          if media:
              for m in media:
-                 encoded_media = encode_media(m, resize=768)
+                 resize = kwargs["resize"] if "resize" in kwargs else self.image_size
+                 encoded_media = encode_media(m, resize=resize)
                  if encoded_media.startswith("data:image/png;base64,"):
                      encoded_media = encoded_media[len("data:image/png;base64,") :]
                  content.append(
@@ -486,3 +513,30 @@ class AnthropicLMM(LMM):
              return f()
          else:
              return cast(str, response.content[0].text)
+
+
+ class GoogleLMM(OpenAILMM):
+     r"""An LMM class for the Google LMMs."""
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model_name: str = "gemini-2.0-flash-exp",
+         max_tokens: int = 4096,
+         image_detail: str = "low",
+         image_size: int = 768,
+         **kwargs: Any,
+     ):
+         base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
+         if not api_key:
+             api_key = os.environ.get("GEMINI_API_KEY")
+
+         self.client = OpenAI(api_key=api_key, base_url=base_url)
+
+         self.model_name = model_name
+         self.image_size = image_size
+         self.image_detail = image_detail
+
+         if "max_tokens" not in kwargs:
+             kwargs["max_tokens"] = max_tokens
+         self.kwargs = kwargs
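The new GoogleLMM reuses the OpenAILMM request path against Gemini's OpenAI-compatible endpoint. A minimal sketch, assuming GEMINI_API_KEY is set in the environment; the prompt and image path are placeholders:

from vision_agent.lmm import GoogleLMM

# The constructor falls back to the GEMINI_API_KEY environment variable when api_key is omitted.
lmm = GoogleLMM(model_name="gemini-2.0-flash-exp", image_size=768, image_detail="low")
print(lmm.generate("Describe this image.", media=["example.png"]))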
@@ -23,6 +23,9 @@ from .tools import (
      TOOLS_INFO,
      UTIL_TOOLS,
      UTILITIES_DOCSTRING,
+     agentic_object_detection,
+     agentic_sam2_instance_segmentation,
+     agentic_sam2_video_tracking,
      claude35_text_extraction,
      closest_box_distance,
      closest_mask_distance,
@@ -30,6 +33,7 @@ from .tools import (
      countgd_sam2_instance_segmentation,
      countgd_sam2_video_tracking,
      countgd_visual_prompt_object_detection,
+     custom_object_detection,
      depth_anything_v2,
      detr_segmentation,
      document_extraction,
@@ -63,10 +67,6 @@ from .tools import (
      video_temporal_localization,
      vit_image_classification,
      vit_nsfw_classification,
-     custom_object_detection,
-     agentic_object_detection,
-     agentic_sam2_instance_segmentation,
-     agentic_sam2_video_tracking,
  )

  __new_tools__ = [
@@ -1,7 +1,7 @@
  import inspect
  import logging
- import shutil
  import tempfile
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from typing import Any, Callable, Dict, List, Optional, Tuple, cast

  import libcst as cst
@@ -10,12 +10,7 @@ from IPython.display import display
  from PIL import Image

  import vision_agent.tools as T
- from vision_agent.agent.agent_utils import (
-     DefaultImports,
-     extract_code,
-     extract_json,
-     extract_tag,
- )
+ from vision_agent.agent.agent_utils import DefaultImports, extract_json, extract_tag
  from vision_agent.agent.vision_agent_planner_prompts_v2 import (
      CATEGORIZE_TOOL_REQUEST,
      FINALIZE_PLAN,
@@ -24,6 +19,7 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
      TEST_TOOLS_EXAMPLE1,
      TEST_TOOLS_EXAMPLE2,
  )
+ from vision_agent.configs import Config
  from vision_agent.lmm import LMM, AnthropicLMM
  from vision_agent.utils.execute import (
      CodeInterpreter,
@@ -35,7 +31,11 @@ from vision_agent.utils.image_utils import convert_to_b64
  from vision_agent.utils.sim import get_tool_recommender

  TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}
+ LOAD_TOOLS_DOCSTRING = T.get_tool_documentation(
+     [T.load_image, T.extract_frames_and_timestamps]
+ )

+ CONFIG = Config()
  _LOGGER = logging.getLogger(__name__)
  EXAMPLES = f"\n{TEST_TOOLS_EXAMPLE1}\n{TEST_TOOLS_EXAMPLE2}\n"

@@ -50,6 +50,54 @@ def format_tool_output(tool_thoughts: str, tool_docstring: str) -> str:
      return return_str


+ def run_multi_judge(
+     tool_chooser: LMM,
+     tool_docs_str: str,
+     task: str,
+     code: str,
+     tool_output_str: str,
+     image_paths: List[str],
+ ) -> Tuple[Optional[Callable], str, str]:
+     error_message = ""
+     prompt = PICK_TOOL.format(
+         tool_docs=tool_docs_str,
+         user_request=task,
+         context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
+         previous_attempts=error_message,
+     )
+
+     def run_judge() -> Tuple[Optional[Callable], str, str]:
+         response = tool_chooser.generate(prompt, media=image_paths, temperature=1.0)
+         tool_choice_context = extract_tag(response, "json")  # type: ignore
+         tool_choice_context_dict = extract_json(tool_choice_context)  # type: ignore
+         tool, tool_thoughts, tool_docstring, _ = extract_tool_info(
+             tool_choice_context_dict
+         )
+         return tool, tool_thoughts, tool_docstring
+
+     responses = []
+     with ThreadPoolExecutor() as executor:
+         futures = [executor.submit(run_judge) for _ in range(3)]
+         for future in as_completed(futures):
+             responses.append(future.result())
+
+     responses = [r for r in responses if r[0] is not None]
+     counts: Dict[str, int] = {}
+     for tool, tool_thoughts, tool_docstring in responses:
+         if tool is not None:
+             counts[tool.__name__] = counts.get(tool.__name__, 0) + 1
+             if counts[tool.__name__] >= 2:
+                 return tool, tool_thoughts, tool_docstring
+
+     if len(responses) == 0:
+         return (
+             None,
+             "No tool could be found, please try again with a different prompt or image",
+             "",
+         )
+     return responses[0]
+
+
  def extract_tool_info(
      tool_choice_context: Dict[str, Any],
  ) -> Tuple[Optional[Callable], str, str, str]:
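run_multi_judge fans the PICK_TOOL prompt out to three parallel judge calls and returns the first tool that collects two votes, falling back to the first valid response otherwise. A self-contained sketch of just that voting rule on canned judge outputs (the helper name and values are illustrative, not part of the package):

from typing import Dict, Optional, Tuple

def majority_pick(votes: list) -> Tuple[Optional[str], str, str]:
    # votes: (tool_name or None, thoughts, docstring) triples, e.g. from three judges.
    votes = [v for v in votes if v[0] is not None]
    counts: Dict[str, int] = {}
    for name, thoughts, docstring in votes:
        counts[name] = counts.get(name, 0) + 1
        if counts[name] >= 2:  # first tool to reach two votes wins
            return name, thoughts, docstring
    return votes[0] if votes else (None, "no valid judge response", "")

print(majority_pick([("ocr", "t1", "d1"), ("countgd", "t2", "d2"), ("ocr", "t3", "d3")]))
# ocr reaches two votes on the third response -> ('ocr', 't3', 'd3')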
@@ -129,6 +177,7 @@ def run_tool_testing(
          cleaned_tool_docs.append(tool_doc)
      tool_docs = cleaned_tool_docs
      tool_docs_str = "\n".join([e["doc"] for e in tool_docs])
+     tool_docs_str += "\n" + LOAD_TOOLS_DOCSTRING

      prompt = TEST_TOOLS.format(
          tool_docs=tool_docs_str,
@@ -167,8 +216,15 @@
          examples=EXAMPLES,
          media=str(image_paths),
      )
-     code = extract_code(lmm.generate(prompt, media=image_paths))  # type: ignore
-     code = process_code(code)
+     response = cast(str, lmm.generate(prompt, media=image_paths))
+     code = extract_tag(response, "code")
+     if code is None:
+         code = response
+
+     try:
+         code = process_code(code)
+     except Exception as e:
+         _LOGGER.error(f"Error processing code: {e}")
      tool_output = code_interpreter.exec_isolation(
          DefaultImports.prepend_imports(code)
      )
@@ -212,7 +268,8 @@ def get_tool_for_task(
      --------
      >>> get_tool_for_task("Give me an OCR model that can find 'hot chocolate' in the image", [image])
      """
-     lmm = AnthropicLMM()
+     tool_tester = CONFIG.create_tool_tester()
+     tool_chooser = CONFIG.create_tool_chooser()

      with (
          tempfile.TemporaryDirectory() as tmpdirname,
@@ -225,45 +282,14 @@
              image_paths.append(image_path)

          code, tool_docs_str, tool_output = run_tool_testing(
-             task, image_paths, lmm, exclude_tools, code_interpreter
+             task, image_paths, tool_tester, exclude_tools, code_interpreter
          )
          tool_output_str = tool_output.text(include_results=False).strip()

-         error_message = ""
-         prompt = PICK_TOOL.format(
-             tool_docs=tool_docs_str,
-             user_request=task,
-             context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
-             previous_attempts=error_message,
-         )
-
-         response = lmm.generate(prompt, media=image_paths)
-         tool_choice_context = extract_tag(response, "json")  # type: ignore
-         tool_choice_context_dict = extract_json(tool_choice_context)  # type: ignore
-
-         tool, tool_thoughts, tool_docstring, error_message = extract_tool_info(
-             tool_choice_context_dict
+         _, tool_thoughts, tool_docstring = run_multi_judge(
+             tool_chooser, tool_docs_str, task, code, tool_output_str, image_paths
          )

-         count = 1
-         while tool is None and count <= 3:
-             prompt = PICK_TOOL.format(
-                 tool_docs=tool_docs_str,
-                 user_request=task,
-                 context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
-                 previous_attempts=error_message,
-             )
-             tool_choice_context_dict = extract_json(
-                 lmm.generate(prompt, media=image_paths)  # type: ignore
-             )
-             tool, tool_thoughts, tool_docstring, error_message = extract_tool_info(
-                 tool_choice_context_dict
-             )
-         try:
-             shutil.rmtree(tmpdirname)
-         except Exception as e:
-             _LOGGER.error(f"Error removing temp directory: {e}")
-
      print(format_tool_output(tool_thoughts, tool_docstring))


@@ -277,7 +303,7 @@ def get_tool_for_task_human_reviewer(
      task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
  ) -> None:
      # NOTE: this will have the same documentation as get_tool_for_task
-     lmm = AnthropicLMM()
+     tool_tester = CONFIG.create_tool_tester()

      with (
          tempfile.TemporaryDirectory() as tmpdirname,
@@ -298,7 +324,7 @@
          _, _, tool_output = run_tool_testing(
              task,
              image_paths,
-             lmm,
+             tool_tester,
              exclude_tools,
              code_interpreter,
              process_code=lambda x: replace_box_threshold(x, tools, 0.05),
@@ -349,7 +375,7 @@ def claude35_vqa(prompt: str, medias: List[np.ndarray]) -> None:
      medias: List[np.ndarray]: The images to ask the question about, it could also
          be frames from a video. You can send up to 5 frames from a video.
      """
-     lmm = AnthropicLMM()
+     vqa = CONFIG.create_vqa()
      if isinstance(medias, np.ndarray):
          medias = [medias]
      if isinstance(medias, list) and len(medias) > 5:
@@ -358,7 +384,7 @@
          "data:image/png;base64," + convert_to_b64(media) for media in medias
      ]

-     response = cast(str, lmm.generate(prompt, media=all_media_b64))
+     response = cast(str, vqa.generate(prompt, media=all_media_b64))
      print(f"[claude35_vqa output]\n{response}\n[end of claude35_vqa output]")


@@ -318,6 +318,9 @@ def single_nms(
  def nms(
      all_preds: List[List[Dict[str, Any]]], iou_threshold: float
  ) -> List[List[Dict[str, Any]]]:
+     if not isinstance(all_preds[0], List):
+         all_preds = [all_preds]
+
      return_preds = []
      for frame_preds in all_preds:
          frame_preds = single_nms(frame_preds, iou_threshold)
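The added isinstance guard lets nms accept a bare single-frame prediction list as well as a list of per-frame predictions. A self-contained sketch mirroring that normalization (_normalize is a stand-in rather than a package function, and the prediction dicts are made up but follow the label/score/bbox shape the detection tools return):

from typing import Any, Dict, List

def _normalize(all_preds):
    # Stand-in mirroring the added guard: wrap a bare single frame into a list of frames.
    if not isinstance(all_preds[0], List):
        all_preds = [all_preds]
    return all_preds

frame_preds: List[Dict[str, Any]] = [
    {"label": "car", "score": 0.9, "bbox": [0.10, 0.10, 0.50, 0.50]},
    {"label": "car", "score": 0.7, "bbox": [0.12, 0.11, 0.52, 0.51]},
]

frames = _normalize(frame_preds)    # bare single frame gets wrapped
print(len(frames), len(frames[0]))  # 1 2
frames = _normalize([frame_preds])  # already a list of frames, left unchanged
print(len(frames), len(frames[0]))  # 1 2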