vision-agent 0.2.228__py3-none-any.whl → 0.2.230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,160 @@
+ from typing import Type
+
+ from pydantic import BaseModel, Field
+
+ from vision_agent.lmm import LMM, OpenAILMM
+
+
+ class Config(BaseModel):
+     # for vision_agent_v2
+     agent: Type[LMM] = Field(default=OpenAILMM)
+     agent_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_planner_v2
+     planner: Type[LMM] = Field(default=OpenAILMM)
+     planner_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_planner_v2
+     summarizer: Type[LMM] = Field(default=OpenAILMM)
+     summarizer_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "o1",
+             "temperature": 1.0,
+             "image_size": 768,
+         }
+     )
+
+     # for vision_agent_planner_v2
+     critic: Type[LMM] = Field(default=OpenAILMM)
+     critic_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_coder_v2
+     coder: Type[LMM] = Field(default=OpenAILMM)
+     coder_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_coder_v2
+     tester: Type[LMM] = Field(default=OpenAILMM)
+     tester_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vision_agent_coder_v2
+     debugger: Type[LMM] = Field(default=OpenAILMM)
+     debugger_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for get_tool_for_task
+     tool_tester: Type[LMM] = Field(default=OpenAILMM)
+     tool_tester_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for get_tool_for_task
+     tool_chooser: Type[LMM] = Field(default=OpenAILMM)
+     tool_chooser_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for suggestions module
+     suggester: Type[LMM] = Field(default=OpenAILMM)
+     suggester_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     # for vqa module
+     vqa: Type[LMM] = Field(default=OpenAILMM)
+     vqa_kwargs: dict = Field(
+         default_factory=lambda: {
+             "model_name": "gpt-4o-2024-08-06",
+             "temperature": 0.0,
+             "image_size": 768,
+             "image_detail": "low",
+         }
+     )
+
+     def create_agent(self) -> LMM:
+         return self.agent(**self.agent_kwargs)
+
+     def create_planner(self) -> LMM:
+         return self.planner(**self.planner_kwargs)
+
+     def create_summarizer(self) -> LMM:
+         return self.summarizer(**self.summarizer_kwargs)
+
+     def create_critic(self) -> LMM:
+         return self.critic(**self.critic_kwargs)
+
+     def create_coder(self) -> LMM:
+         return self.coder(**self.coder_kwargs)
+
+     def create_tester(self) -> LMM:
+         return self.tester(**self.tester_kwargs)
+
+     def create_debugger(self) -> LMM:
+         return self.debugger(**self.debugger_kwargs)
+
+     def create_tool_tester(self) -> LMM:
+         return self.tool_tester(**self.tool_tester_kwargs)
+
+     def create_tool_chooser(self) -> LMM:
+         return self.tool_chooser(**self.tool_chooser_kwargs)
+
+     def create_suggester(self) -> LMM:
+         return self.suggester(**self.suggester_kwargs)
+
+     def create_vqa(self) -> LMM:
+         return self.vqa(**self.vqa_kwargs)
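
The new Config class centralizes which LMM backs each vision-agent role. A minimal sketch of how it is consumed, mirroring the "from vision_agent.configs import Config" import that appears later in this diff; the AnthropicLMM override shown for the vqa role is illustrative only, not part of the release:

    from vision_agent.configs import Config
    from vision_agent.lmm import AnthropicLMM

    # Defaults: every role is an OpenAILMM built from the kwargs above.
    config = Config()
    planner = config.create_planner()  # OpenAILMM(model_name="gpt-4o-2024-08-06", ...)

    # Config is a pydantic BaseModel, so individual roles can be swapped at
    # construction time (hypothetical override, shown only for illustration).
    custom = Config(
        vqa=AnthropicLMM,
        vqa_kwargs={"model_name": "claude-3-5-sonnet-20240620"},
    )
    vqa_model = custom.create_vqa()
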
@@ -1,2 +1,2 @@
- from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
+ from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, GoogleLMM, OllamaLMM, OpenAILMM
  from .types import Message
vision_agent/lmm/lmm.py CHANGED
@@ -50,6 +50,8 @@ class OpenAILMM(LMM):
  api_key: Optional[str] = None,
  max_tokens: int = 4096,
  json_mode: bool = False,
+ image_size: int = 768,
+ image_detail: str = "low",
  **kwargs: Any,
  ):
  if not api_key:
@@ -59,7 +61,10 @@ class OpenAILMM(LMM):

  self.client = OpenAI(api_key=api_key)
  self.model_name = model_name
- if "max_tokens" not in kwargs:
+ self.image_size = image_size
+ self.image_detail = image_detail
+ # o1 does not use max_tokens
+ if "max_tokens" not in kwargs and not model_name.startswith("o1"):
  kwargs["max_tokens"] = max_tokens
  if json_mode:
  kwargs["response_format"] = {"type": "json_object"}
@@ -94,7 +99,13 @@ class OpenAILMM(LMM):
  fixed_c["content"] = [{"type": "text", "text": c["content"]}] # type: ignore
  if "media" in c:
  for media in c["media"]:
- encoded_media = encode_media(cast(str, media))
+ resize = kwargs["resize"] if "resize" in kwargs else self.image_size
+ image_detail = (
+ kwargs["image_detail"]
+ if "image_detail" in kwargs
+ else self.image_detail
+ )
+ encoded_media = encode_media(cast(str, media), resize=resize)

  fixed_c["content"].append( # type: ignore
  {
@@ -106,7 +117,7 @@ class OpenAILMM(LMM):
  or encoded_media.startswith("data:image/")
  else f"data:image/png;base64,{encoded_media}"
  ),
- "detail": "low",
+ "detail": image_detail,
  },
  },
  )
@@ -144,7 +155,13 @@ class OpenAILMM(LMM):
  ]
  if media and len(media) > 0:
  for m in media:
- encoded_media = encode_media(m)
+ resize = kwargs["resize"] if "resize" in kwargs else None
+ image_detail = (
+ kwargs["image_detail"]
+ if "image_detail" in kwargs
+ else self.image_detail
+ )
+ encoded_media = encode_media(m, resize=resize)
  message[0]["content"].append(
  {
  "type": "image_url",
@@ -155,7 +172,7 @@ class OpenAILMM(LMM):
  or encoded_media.startswith("data:image/")
  else f"data:image/png;base64,{encoded_media}"
  ),
- "detail": "low",
+ "detail": image_detail,
  },
  },
  )
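
OpenAILMM now carries image_size and image_detail defaults and checks per-call kwargs for "resize" and "image_detail" overrides when encoding media (falling back to image_size in the chat path and to no resizing in the single-prompt path). A minimal construction sketch, assuming generate() keeps its existing prompt/media signature; photo.png is a stand-in path:

    from vision_agent.lmm import OpenAILMM

    # Images are resized to 768px and sent with detail="low" unless a call
    # supplies resize=.../image_detail=... overrides via kwargs.
    lmm = OpenAILMM(
        model_name="gpt-4o-2024-08-06",
        image_size=768,
        image_detail="low",
    )
    print(lmm.generate("Describe the scene.", media=["photo.png"]))
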
@@ -186,6 +203,7 @@ class AzureOpenAILMM(OpenAILMM):
  azure_endpoint: Optional[str] = None,
  max_tokens: int = 4096,
  json_mode: bool = False,
+ image_detail: str = "low",
  **kwargs: Any,
  ):
  if not api_key:
@@ -208,6 +226,7 @@ class AzureOpenAILMM(OpenAILMM):
  azure_endpoint=azure_endpoint,
  )
  self.model_name = model_name
+ self.image_detail = image_detail

  if "max_tokens" not in kwargs:
  kwargs["max_tokens"] = max_tokens
@@ -225,6 +244,7 @@ class OllamaLMM(LMM):
  base_url: Optional[str] = "http://localhost:11434/api",
  json_mode: bool = False,
  num_ctx: int = 128_000,
+ image_size: int = 768,
  **kwargs: Any,
  ):
  """Initializes the Ollama LMM. kwargs are passed as 'options' to the model.
@@ -241,6 +261,7 @@ class OllamaLMM(LMM):

  self.url = base_url
  self.model_name = model_name
+ self.image_size = image_size
  self.kwargs = {"options": kwargs}

  if json_mode:
@@ -273,8 +294,9 @@ class OllamaLMM(LMM):
  fixed_chat = []
  for message in chat:
  if "media" in message:
+ resize = kwargs["resize"] if "resize" in kwargs else self.image_size
  message["images"] = [
- encode_media(cast(str, m)) for m in message["media"]
+ encode_media(cast(str, m), resize=resize) for m in message["media"]
  ]
  del message["media"]
  fixed_chat.append(message)
@@ -328,7 +350,8 @@ class OllamaLMM(LMM):

  if media and len(media) > 0:
  for m in media:
- data["images"].append(encode_media(m))
+ resize = kwargs["resize"] if "resize" in kwargs else self.image_size
+ data["images"].append(encode_media(m, resize=resize))

  tmp_kwargs = self.kwargs | kwargs
  data.update(tmp_kwargs)
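
OllamaLMM gains the same image_size default, applied via encode_media when chatting or generating. A usage sketch under the assumption of a local Ollama server with a multimodal model such as llava pulled; the model name and image path are stand-ins:

    from vision_agent.lmm import OllamaLMM

    # Media are resized to 768px before being base64-encoded for the Ollama API.
    lmm = OllamaLMM(
        model_name="llava",
        base_url="http://localhost:11434/api",
        image_size=768,
    )
    print(lmm.generate("Describe this image.", media=["photo.png"]))
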
@@ -370,9 +393,11 @@ class AnthropicLMM(LMM):
  api_key: Optional[str] = None,
  model_name: str = "claude-3-5-sonnet-20240620",
  max_tokens: int = 4096,
+ image_size: int = 768,
  **kwargs: Any,
  ):
  self.client = anthropic.Anthropic(api_key=api_key)
+ self.image_size = image_size
  self.model_name = model_name
  if "max_tokens" not in kwargs:
  kwargs["max_tokens"] = max_tokens
@@ -399,7 +424,8 @@ class AnthropicLMM(LMM):
  ]
  if "media" in msg:
  for media_path in msg["media"]:
- encoded_media = encode_media(media_path, resize=768)
+ resize = kwargs["resize"] if "resize" in kwargs else self.image_size
+ encoded_media = encode_media(media_path, resize=resize)
  if encoded_media.startswith("data:image/png;base64,"):
  encoded_media = encoded_media[len("data:image/png;base64,") :]
  content.append(
@@ -448,7 +474,8 @@ class AnthropicLMM(LMM):
  ]
  if media:
  for m in media:
- encoded_media = encode_media(m, resize=768)
+ resize = kwargs["resize"] if "resize" in kwargs else self.image_size
+ encoded_media = encode_media(m, resize=resize)
  if encoded_media.startswith("data:image/png;base64,"):
  encoded_media = encoded_media[len("data:image/png;base64,") :]
  content.append(
@@ -486,3 +513,30 @@ class AnthropicLMM(LMM):
  return f()
  else:
  return cast(str, response.content[0].text)
+
+
+ class GoogleLMM(OpenAILMM):
+     r"""An LMM class for the Google LMMs."""
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         model_name: str = "gemini-2.0-flash-exp",
+         max_tokens: int = 4096,
+         image_detail: str = "low",
+         image_size: int = 768,
+         **kwargs: Any,
+     ):
+         base_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
+         if not api_key:
+             api_key = os.environ.get("GEMINI_API_KEY")
+
+         self.client = OpenAI(api_key=api_key, base_url=base_url)
+
+         self.model_name = model_name
+         self.image_size = image_size
+         self.image_detail = image_detail
+
+         if "max_tokens" not in kwargs:
+             kwargs["max_tokens"] = max_tokens
+         self.kwargs = kwargs
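
GoogleLMM reuses the OpenAI-compatible client against Google's Generative Language endpoint, so it inherits OpenAILMM's chat/generate interface. A minimal usage sketch, assuming GEMINI_API_KEY is set in the environment (as read by the constructor above) and crowd.jpg is a stand-in image:

    from vision_agent.lmm import GoogleLMM

    # Defaults to gemini-2.0-flash-exp; api_key falls back to GEMINI_API_KEY.
    lmm = GoogleLMM()
    answer = lmm.generate("How many people are in this image?", media=["crowd.jpg"])
    print(answer)
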
@@ -1,7 +1,7 @@
  import inspect
  import logging
- import shutil
  import tempfile
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from typing import Any, Callable, Dict, List, Optional, Tuple, cast

  import libcst as cst
@@ -24,6 +24,7 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
  TEST_TOOLS_EXAMPLE1,
  TEST_TOOLS_EXAMPLE2,
  )
+ from vision_agent.configs import Config
  from vision_agent.lmm import LMM, AnthropicLMM
  from vision_agent.utils.execute import (
  CodeInterpreter,
@@ -36,6 +37,7 @@ from vision_agent.utils.sim import get_tool_recommender

  TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}

+ CONFIG = Config()
  _LOGGER = logging.getLogger(__name__)
  EXAMPLES = f"\n{TEST_TOOLS_EXAMPLE1}\n{TEST_TOOLS_EXAMPLE2}\n"

@@ -50,6 +52,54 @@ def format_tool_output(tool_thoughts: str, tool_docstring: str) -> str:
  return return_str


+ def run_multi_judge(
+     tool_chooser: LMM,
+     tool_docs_str: str,
+     task: str,
+     code: str,
+     tool_output_str: str,
+     image_paths: List[str],
+ ) -> Tuple[Optional[Callable], str, str]:
+     error_message = ""
+     prompt = PICK_TOOL.format(
+         tool_docs=tool_docs_str,
+         user_request=task,
+         context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
+         previous_attempts=error_message,
+     )
+
+     def run_judge() -> Tuple[Optional[Callable], str, str]:
+         response = tool_chooser.generate(prompt, media=image_paths, temperature=1.0)
+         tool_choice_context = extract_tag(response, "json") # type: ignore
+         tool_choice_context_dict = extract_json(tool_choice_context) # type: ignore
+         tool, tool_thoughts, tool_docstring, _ = extract_tool_info(
+             tool_choice_context_dict
+         )
+         return tool, tool_thoughts, tool_docstring
+
+     responses = []
+     with ThreadPoolExecutor() as executor:
+         futures = [executor.submit(run_judge) for _ in range(3)]
+         for future in as_completed(futures):
+             responses.append(future.result())
+
+     responses = [r for r in responses if r[0] is not None]
+     counts: Dict[str, int] = {}
+     for tool, tool_thoughts, tool_docstring in responses:
+         if tool is not None:
+             counts[tool.__name__] = counts.get(tool.__name__, 0) + 1
+             if counts[tool.__name__] >= 2:
+                 return tool, tool_thoughts, tool_docstring
+
+     if len(responses) == 0:
+         return (
+             None,
+             "No tool could be found, please try again with a different prompt or image",
+             "",
+         )
+     return responses[0]
+
+
  def extract_tool_info(
  tool_choice_context: Dict[str, Any],
  ) -> Tuple[Optional[Callable], str, str, str]:
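
run_multi_judge replaces the old retry loop with three concurrent judge calls and a first-to-two-votes majority, falling back to the first valid response when no tool reaches two votes. The voting pattern, reduced to a self-contained sketch (judge, Candidate, and the return values are stand-ins, not the library's API):

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from typing import Callable, Dict, List, Optional, Tuple

    Candidate = Tuple[Optional[str], str]  # (tool name, reasoning)

    def majority_vote(judge: Callable[[], Candidate], n: int = 3) -> Optional[Candidate]:
        # Run the judge n times in parallel; each call may pick a different tool.
        responses: List[Candidate] = []
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(judge) for _ in range(n)]
            for future in as_completed(futures):
                responses.append(future.result())

        # Drop failed judgments, then return the first tool to collect two votes.
        responses = [r for r in responses if r[0] is not None]
        counts: Dict[str, int] = {}
        for name, reasoning in responses:
            counts[name] = counts.get(name, 0) + 1
            if counts[name] >= 2:
                return name, reasoning

        # No majority: fall back to any valid answer, or None if all failed.
        return responses[0] if responses else None
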
@@ -212,7 +262,8 @@ def get_tool_for_task(
  --------
  >>> get_tool_for_task("Give me an OCR model that can find 'hot chocolate' in the image", [image])
  """
- lmm = AnthropicLMM()
+ tool_tester = CONFIG.create_tool_tester()
+ tool_chooser = CONFIG.create_tool_chooser()

  with (
  tempfile.TemporaryDirectory() as tmpdirname,
@@ -225,45 +276,14 @@ def get_tool_for_task(
  image_paths.append(image_path)

  code, tool_docs_str, tool_output = run_tool_testing(
- task, image_paths, lmm, exclude_tools, code_interpreter
+ task, image_paths, tool_tester, exclude_tools, code_interpreter
  )
  tool_output_str = tool_output.text(include_results=False).strip()

- error_message = ""
- prompt = PICK_TOOL.format(
- tool_docs=tool_docs_str,
- user_request=task,
- context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
- previous_attempts=error_message,
- )
-
- response = lmm.generate(prompt, media=image_paths)
- tool_choice_context = extract_tag(response, "json") # type: ignore
- tool_choice_context_dict = extract_json(tool_choice_context) # type: ignore
-
- tool, tool_thoughts, tool_docstring, error_message = extract_tool_info(
- tool_choice_context_dict
+ _, tool_thoughts, tool_docstring = run_multi_judge(
+ tool_chooser, tool_docs_str, task, code, tool_output_str, image_paths
  )

- count = 1
- while tool is None and count <= 3:
- prompt = PICK_TOOL.format(
- tool_docs=tool_docs_str,
- user_request=task,
- context=f"<code>\n{code}\n</code>\n<tool_output>\n{tool_output_str}\n</tool_output>",
- previous_attempts=error_message,
- )
- tool_choice_context_dict = extract_json(
- lmm.generate(prompt, media=image_paths) # type: ignore
- )
- tool, tool_thoughts, tool_docstring, error_message = extract_tool_info(
- tool_choice_context_dict
- )
- try:
- shutil.rmtree(tmpdirname)
- except Exception as e:
- _LOGGER.error(f"Error removing temp directory: {e}")
-
  print(format_tool_output(tool_thoughts, tool_docstring))


@@ -277,7 +297,7 @@ def get_tool_for_task_human_reviewer(
  task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
  ) -> None:
  # NOTE: this will have the same documentation as get_tool_for_task
- lmm = AnthropicLMM()
+ tool_tester = CONFIG.create_tool_tester()

  with (
  tempfile.TemporaryDirectory() as tmpdirname,
@@ -298,7 +318,7 @@ def get_tool_for_task_human_reviewer(
  _, _, tool_output = run_tool_testing(
  task,
  image_paths,
- lmm,
+ tool_tester,
  exclude_tools,
  code_interpreter,
  process_code=lambda x: replace_box_threshold(x, tools, 0.05),
@@ -349,7 +369,7 @@ def claude35_vqa(prompt: str, medias: List[np.ndarray]) -> None:
  medias: List[np.ndarray]: The images to ask the question about, it could also
  be frames from a video. You can send up to 5 frames from a video.
  """
- lmm = AnthropicLMM()
+ vqa = CONFIG.create_vqa()
  if isinstance(medias, np.ndarray):
  medias = [medias]
  if isinstance(medias, list) and len(medias) > 5:
@@ -358,7 +378,7 @@ def claude35_vqa(prompt: str, medias: List[np.ndarray]) -> None:
  "data:image/png;base64," + convert_to_b64(media) for media in medias
  ]

- response = cast(str, lmm.generate(prompt, media=all_media_b64))
+ response = cast(str, vqa.generate(prompt, media=all_media_b64))
  print(f"[claude35_vqa output]\n{response}\n[end of claude35_vqa output]")

@@ -72,8 +72,7 @@ def send_inference_request(

  response = _call_post(url, payload, session, files, function_name, is_form)

- # TODO: consider making the response schema the same between below two sources
- return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
+ return response["data"]


  def send_task_inference_request(
@@ -595,14 +595,14 @@ def owlv2_sam2_video_tracking(
  def florence2_object_detection(
  prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
  ) -> List[Dict[str, Any]]:
- """'florence2_object_detection' is a tool that can detect multiple
- objects given a text prompt which can be object names or caption. You
- can optionally separate the object names in the text with commas. It returns a list
- of bounding boxes with normalized coordinates, label names and associated
- confidence scores of 1.0.
+ """'florence2_object_detection' is a tool that can detect multiple objects given a
+ text prompt which can be object names or caption. You can optionally separate the
+ object names in the text with commas. It returns a list of bounding boxes with
+ normalized coordinates, label names and associated confidence scores of 1.0.

  Parameters:
- prompt (str): The prompt to ground to the image.
+ prompt (str): The prompt to ground to the image. Use exclusive categories that
+ do not overlap such as 'person, car' and NOT 'person, athlete'.
  image (np.ndarray): The image to used to detect objects
  fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
  fine-tuned model ID here to use it.
@@ -681,7 +681,8 @@ def florence2_sam2_instance_segmentation(
  1.0.

  Parameters:
- prompt (str): The prompt to ground to the image.
+ prompt (str): The prompt to ground to the image. Use exclusive categories that
+ do not overlap such as 'person, car' and NOT 'person, athlete'.
  image (np.ndarray): The image to ground the prompt to.
  fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
  fine-tuned model ID here to use it.
@@ -769,7 +770,8 @@ def florence2_sam2_video_tracking(
  is useful for tracking and counting without duplicating counts.

  Parameters:
- prompt (str): The prompt to ground to the video.
+ prompt (str): The prompt to ground to the image. Use exclusive categories that
+ do not overlap such as 'person, car' and NOT 'person, athlete'.
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
  chunk_length (Optional[int]): The number of frames to re-run florence2 to find
  new objects.
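
The updated docstrings ask for non-overlapping category names in detection prompts. A hypothetical call following that guidance, assuming the usual vision_agent.tools import and a stand-in image array:

    import numpy as np

    from vision_agent.tools import florence2_object_detection

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in image
    # 'person' and 'car' are exclusive categories; avoid overlaps like 'person, athlete'.
    dets = florence2_object_detection("person, car", image)
    print(dets)  # list of dicts with normalized bounding boxes, labels and scores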