vision-agent 0.2.56__py3-none-any.whl → 0.2.58__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- vision_agent/__init__.py +1 -2
- vision_agent/agent/agent.py +3 -1
- vision_agent/agent/vision_agent.py +110 -81
- vision_agent/agent/vision_agent_prompts.py +1 -1
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +54 -116
- vision_agent/tools/__init__.py +2 -1
- vision_agent/tools/tools.py +3 -3
- {vision_agent-0.2.56.dist-info → vision_agent-0.2.58.dist-info}/METADATA +36 -7
- vision_agent-0.2.58.dist-info/RECORD +23 -0
- vision_agent/agent/agent_coder.py +0 -216
- vision_agent/agent/agent_coder_prompts.py +0 -135
- vision_agent/agent/data_interpreter.py +0 -475
- vision_agent/agent/data_interpreter_prompts.py +0 -186
- vision_agent/agent/easytool.py +0 -346
- vision_agent/agent/easytool_prompts.py +0 -89
- vision_agent/agent/easytool_v2.py +0 -781
- vision_agent/agent/easytool_v2_prompts.py +0 -152
- vision_agent/agent/reflexion.py +0 -299
- vision_agent/agent/reflexion_prompts.py +0 -100
- vision_agent/llm/__init__.py +0 -1
- vision_agent/llm/llm.py +0 -176
- vision_agent/tools/easytool_tools.py +0 -1242
- vision_agent-0.2.56.dist-info/RECORD +0 -36
- {vision_agent-0.2.56.dist-info → vision_agent-0.2.58.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.56.dist-info → vision_agent-0.2.58.dist-info}/WHEEL +0 -0
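The headline change in this release is the removal of the old vision_agent.llm module and the legacy agents (agent_coder, data_interpreter, easytool, reflexion) in favor of the single multimodal vision_agent.lmm module. A minimal migration sketch, assuming your 0.2.56 code imported a text-only LLM class from vision_agent.llm ("photo.jpg" is a placeholder filename):

    # Before (0.2.56): text-only LLM classes lived in vision_agent.llm.
    # from vision_agent.llm import OpenAILLM
    # After (0.2.58): one multimodal module covers both text and images.
    from vision_agent.lmm import LMM, OpenAILMM

    model: LMM = OpenAILMM(temperature=0.0)  # the default model is now "gpt-4o"
    print(model.generate("Describe this image", media=["photo.jpg"]))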
vision_agent/__init__.py
CHANGED
vision_agent/agent/agent.py
CHANGED
@@ -2,12 +2,14 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+from vision_agent.lmm import Message
+
 
 class Agent(ABC):
     @abstractmethod
     def __call__(
         self,
-        input: Union[
+        input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
     ) -> str:
         pass
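Message is the new chat type used throughout 0.2.58; it is defined in vision_agent/lmm/lmm.py below as Dict[str, TextOrImage]. In practice a message is a plain dict with an optional "media" key; a sketch with a placeholder filename:

    from vision_agent.lmm import Message

    msg: Message = {
        "role": "user",
        "content": "Count the cars in this image",
        "media": ["image.jpg"],  # placeholder path to a local image
    }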
vision_agent/agent/vision_agent.py
CHANGED
@@ -13,7 +13,6 @@ from rich.style import Style
 from rich.syntax import Syntax
 from tabulate import tabulate
 
-from vision_agent.llm.llm import AzureOpenAILLM
 import vision_agent.tools as T
 from vision_agent.agent import Agent
 from vision_agent.agent.vision_agent_prompts import (
@@ -25,8 +24,7 @@ from vision_agent.agent.vision_agent_prompts import (
     SIMPLE_TEST,
     USER_REQ,
 )
-from vision_agent.
-from vision_agent.lmm import LMM, OpenAILMM
+from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil
@@ -133,11 +131,10 @@ def extract_image(
 
 
 def write_plan(
-    chat: List[
+    chat: List[Message],
     tool_desc: str,
     working_memory: str,
-    model:
-    media: Optional[Sequence[Union[str, Path]]] = None,
+    model: LMM,
 ) -> List[Dict[str, str]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -147,18 +144,58 @@ def write_plan(
     context = USER_REQ.format(user_request=user_request)
     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
     chat[-1]["content"] = prompt
-
-
-
-
-
+    return extract_json(model.chat(chat))["plan"]  # type: ignore
+
+
+def write_code(
+    coder: LMM,
+    chat: List[Message],
+    tool_info: str,
+    feedback: str,
+) -> str:
+    chat = copy.deepcopy(chat)
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    user_request = chat[-1]["content"]
+    prompt = CODE.format(
+        docstring=tool_info,
+        question=user_request,
+        feedback=feedback,
+    )
+    chat[-1]["content"] = prompt
+    return extract_code(coder(chat))
+
+
+def write_test(
+    tester: LMM,
+    chat: List[Message],
+    tool_utils: str,
+    code: str,
+    feedback: str,
+    media: Optional[Sequence[Union[str, Path]]] = None,
+) -> str:
+    chat = copy.deepcopy(chat)
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    user_request = chat[-1]["content"]
+    prompt = SIMPLE_TEST.format(
+        docstring=tool_utils,
+        question=user_request,
+        code=code,
+        feedback=feedback,
+        media=media,
+    )
+    chat[-1]["content"] = prompt
+    return extract_code(tester(chat))
 
 
 def reflect(
-    chat: List[
+    chat: List[Message],
     plan: str,
     code: str,
-    model:
+    model: LMM,
 ) -> Dict[str, Union[str, bool]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -168,22 +205,22 @@ def reflect(
     context = USER_REQ.format(user_request=user_request)
     prompt = REFLECT.format(context=context, plan=plan, code=code)
     chat[-1]["content"] = prompt
-    return extract_json(model
+    return extract_json(model(chat))
 
 
 def write_and_test_code(
-
+    chat: List[Message],
     tool_info: str,
     tool_utils: str,
     working_memory: List[Dict[str, str]],
-    coder:
-    tester:
-    debugger:
+    coder: LMM,
+    tester: LMM,
+    debugger: LMM,
     code_interpreter: CodeInterpreter,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
     max_retries: int = 3,
-
+    media: Optional[Sequence[Union[str, Path]]] = None,
 ) -> Dict[str, Any]:
     log_progress(
         {
@@ -191,25 +228,9 @@ def write_and_test_code(
             "status": "started",
         }
     )
-    code =
-
-
-                docstring=tool_info,
-                question=task,
-                feedback=format_memory(working_memory),
-            )
-        )
-    )
-    test = extract_code(
-        tester(
-            SIMPLE_TEST.format(
-                docstring=tool_utils,
-                question=task,
-                code=code,
-                feedback=working_memory,
-                media=input_media,
-            )
-        )
+    code = write_code(coder, chat, tool_info, format_memory(working_memory))
+    test = write_test(
+        tester, chat, tool_utils, code, format_memory(working_memory), media
     )
 
     log_progress(
@@ -392,10 +413,10 @@ class VisionAgent(Agent):
 
     def __init__(
         self,
-        planner: Optional[
-        coder: Optional[
-        tester: Optional[
-        debugger: Optional[
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
         tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
@@ -403,10 +424,10 @@ class VisionAgent(Agent):
         """Initialize the Vision Agent.
 
         Parameters:
-            planner (Optional[
-            coder (Optional[
-            tester (Optional[
-            debugger (Optional[
+            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
+            debugger (Optional[LMM]): The debugger model to use. Defaults to OpenAILMM.
             tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
@@ -418,12 +439,12 @@ class VisionAgent(Agent):
         """
 
         self.planner = (
-
+            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
         )
-        self.coder =
-        self.tester =
+        self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
+        self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
         self.debugger = (
-
+            OpenAILMM(temperature=0.0, json_mode=True) if debugger is None else debugger
         )
 
         self.tool_recommender = (
@@ -437,7 +458,7 @@ class VisionAgent(Agent):
 
     def __call__(
         self,
-        input: Union[
+        input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
     ) -> str:
         """Chat with Vision Agent and return intermediate information regarding the task.
@@ -454,23 +475,26 @@ class VisionAgent(Agent):
 
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-
+            if media is not None:
+                input[0]["media"] = [media]
+        results = self.chat_with_workflow(input)
         results.pop("working_memory")
         return results  # type: ignore
 
     def chat_with_workflow(
         self,
-        chat: List[
-        media: Optional[Union[str, Path]] = None,
+        chat: List[Message],
         self_reflection: bool = False,
         display_visualization: bool = False,
     ) -> Dict[str, Any]:
         """Chat with Vision Agent and return intermediate information regarding the task.
 
         Parameters:
-            chat (List[
-
-
+            chat (List[MediaChatItem]): A conversation
+                in the format of:
+                [{"role": "user", "content": "describe your task here..."}]
+                or if it contains media files, it should be in the format of:
+                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
             self_reflection (bool): Whether to reflect on the task and debug the code.
             display_visualization (bool): If True, it opens a new window locally to
                 show the image(s) created by visualization code (if there is any).
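Taken together, the new entry points can be exercised as below; a usage sketch where the task and filename are placeholders, and the result keys shown are the ones visible elsewhere in this diff ("code", "success", "working_memory"):

    from vision_agent.agent import VisionAgent

    agent = VisionAgent()
    # String task plus an optional media path:
    answer = agent("Count the cars in this image", media="cars.jpg")
    # Or the full workflow, with per-message "media" lists:
    results = agent.chat_with_workflow(
        [{"role": "user", "content": "Count the cars", "media": ["cars.jpg"]}]
    )
    print(results["code"])  # the generated code for the task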
@@ -485,11 +509,19 @@ class VisionAgent(Agent):
 
         # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
         with CodeInterpreterFactory.new_instance() as code_interpreter:
-
-
-
-
-
+            chat = copy.deepcopy(chat)
+            media_list = []
+            for chat_i in chat:
+                if "media" in chat_i:
+                    for media in chat_i["media"]:
+                        media = code_interpreter.upload_file(media)
+                        chat_i["content"] += f" Media name {media}"  # type: ignore
+                        media_list.append(media)
+
+            int_chat = cast(
+                List[Message],
+                [{"role": c["role"], "content": c["content"]} for c in chat],
+            )
 
             code = ""
             test = ""
@@ -507,11 +539,10 @@ class VisionAgent(Agent):
                 }
             )
             plan_i = write_plan(
-
+                int_chat,
                 T.TOOL_DESCRIPTIONS,
                 format_memory(working_memory),
                 self.planner,
-                media=[media] if media else None,
             )
             plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
 
@@ -534,9 +565,7 @@ class VisionAgent(Agent):
                 self.verbosity,
             )
             results = write_and_test_code(
-
-                    user_request=chat[0]["content"], subtasks=plan_i_str
-                ),
+                chat=int_chat,
                 tool_info=tool_info,
                 tool_utils=T.UTILITIES_DOCSTRING,
                 working_memory=working_memory,
@@ -546,7 +575,7 @@ class VisionAgent(Agent):
                 code_interpreter=code_interpreter,
                 log_progress=self.log_progress,
                 verbosity=self.verbosity,
-
+                media=media_list,
             )
             success = cast(bool, results["success"])
             code = cast(str, results["code"])
@@ -564,7 +593,7 @@ class VisionAgent(Agent):
                 }
             )
             reflection = reflect(
-
+                int_chat,
                 FULL_TASK.format(
                     user_request=chat[0]["content"], subtasks=plan_i_str
                 ),
@@ -634,10 +663,10 @@ class AzureVisionAgent(VisionAgent):
 
     def __init__(
        self,
-        planner: Optional[
-        coder: Optional[
-        tester: Optional[
-        debugger: Optional[
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
         tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
@@ -645,10 +674,10 @@ class AzureVisionAgent(VisionAgent):
         """Initialize the Vision Agent.
 
         Parameters:
-            planner (Optional[
-            coder (Optional[
-            tester (Optional[
-            debugger (Optional[
+            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
+            debugger (Optional[LMM]): The debugger model to use. Defaults to OpenAILMM.
             tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
@@ -660,14 +689,14 @@ class AzureVisionAgent(VisionAgent):
         """
         super().__init__(
             planner=(
-
+                AzureOpenAILMM(temperature=0.0, json_mode=True)
                 if planner is None
                 else planner
             ),
-            coder=
-            tester=
+            coder=AzureOpenAILMM(temperature=0.0) if coder is None else coder,
+            tester=AzureOpenAILMM(temperature=0.0) if tester is None else tester,
             debugger=(
-
+                AzureOpenAILMM(temperature=0.0, json_mode=True)
                 if debugger is None
                 else debugger
             ),
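AzureVisionAgent wires AzureOpenAILMM into the same constructor. A construction sketch, assuming the Azure OpenAI credentials (for example AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT) are already configured in the environment, since AzureOpenAILMM reads them at init time:

    from vision_agent.agent import AzureVisionAgent
    from vision_agent.lmm import AzureOpenAILMM

    # Explicitly passing what the defaults above construct:
    agent = AzureVisionAgent(
        planner=AzureOpenAILMM(temperature=0.0, json_mode=True),
        coder=AzureOpenAILMM(temperature=0.0),
    )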
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -171,7 +171,7 @@ This is the documentation for the functions you have access to. You may call any
 **Instructions**:
 1. Verify the fundamental functionality under normal conditions.
 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
-3. Your test case MUST run only on the given
+3. Your test case MUST run only on the given images which are {media}
 4. Your test case MUST run only with the given values which is available in the question - {question}
 5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions.
 6. DO NOT mock any functions, you must test their functionality as is.
vision_agent/lmm/__init__.py
CHANGED
@@ -1 +1 @@
-from .lmm import LMM, AzureOpenAILMM,
+from .lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
vision_agent/lmm/lmm.py
CHANGED
@@ -6,15 +6,13 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union, cast
 
-import requests
 from openai import AzureOpenAI, OpenAI
 
+import vision_agent.tools as T
 from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 
 _LOGGER = logging.getLogger(__name__)
 
-_LLAVA_ENDPOINT = "https://svtswgdnleslqcsjvilau4p6u40jwrkn.lambda-url.us-east-2.on.aws"
-
 
 def encode_image(image: Union[str, Path]) -> str:
     with open(image, "rb") as f:
@@ -22,84 +20,38 @@ def encode_image(image: Union[str, Path]) -> str:
     return encoded_image
 
 
+TextOrImage = Union[str, List[Union[str, Path]]]
+Message = Dict[str, TextOrImage]
+
+
 class LMM(ABC):
     @abstractmethod
     def generate(
-        self, prompt: str,
+        self, prompt: str, media: Optional[List[Union[str, Path]]] = None
     ) -> str:
         pass
 
     @abstractmethod
     def chat(
         self,
-        chat: List[
-        images: Optional[List[Union[str, Path]]] = None,
+        chat: List[Message],
     ) -> str:
         pass
 
     @abstractmethod
     def __call__(
         self,
-        input: Union[str, List[
-        images: Optional[List[Union[str, Path]]] = None,
+        input: Union[str, List[Message]],
     ) -> str:
         pass
 
 
-class LLaVALMM(LMM):
-    r"""An LMM class for the LLaVA-1.6 34B model."""
-
-    def __init__(self, model_name: str):
-        self.model_name = model_name
-
-    def __call__(
-        self,
-        input: Union[str, List[Dict[str, str]]],
-        images: Optional[List[Union[str, Path]]] = None,
-    ) -> str:
-        if isinstance(input, str):
-            return self.generate(input, images)
-        return self.chat(input, images)
-
-    def chat(
-        self,
-        chat: List[Dict[str, str]],
-        images: Optional[List[Union[str, Path]]] = None,
-    ) -> str:
-        raise NotImplementedError("Chat not supported for LLaVA")
-
-    def generate(
-        self,
-        prompt: str,
-        images: Optional[List[Union[str, Path]]] = None,
-        temperature: float = 0.1,
-        max_new_tokens: int = 1500,
-    ) -> str:
-        data = {"prompt": prompt}
-        if images and len(images) > 0:
-            data["image"] = encode_image(images[0])
-        data["temperature"] = temperature  # type: ignore
-        data["max_new_tokens"] = max_new_tokens  # type: ignore
-        res = requests.post(
-            _LLAVA_ENDPOINT,
-            headers={"Content-Type": "application/json"},
-            json=data,
-        )
-        resp_json: Dict[str, Any] = res.json()
-        if (
-            "statusCode" in resp_json and resp_json["statusCode"] != 200
-        ) or "statusCode" not in resp_json:
-            _LOGGER.error(f"Request failed: {resp_json}")
-            raise ValueError(f"Request failed: {resp_json}")
-        return cast(str, resp_json["data"])
-
-
 class OpenAILMM(LMM):
     r"""An LMM class for the OpenAI GPT-4 Vision model."""
 
     def __init__(
         self,
-        model_name: str = "gpt-
+        model_name: str = "gpt-4o",
         api_key: Optional[str] = None,
         max_tokens: int = 1024,
         json_mode: bool = False,
@@ -120,44 +72,49 @@ class OpenAILMM(LMM):
 
     def __call__(
         self,
-        input: Union[str, List[
-        images: Optional[List[Union[str, Path]]] = None,
+        input: Union[str, List[Message]],
     ) -> str:
         if isinstance(input, str):
-            return self.generate(input
-        return self.chat(input
+            return self.generate(input)
+        return self.chat(input)
 
     def chat(
         self,
-        chat: List[
-        images: Optional[List[Union[str, Path]]] = None,
+        chat: List[Message],
     ) -> str:
+        """Chat with the LMM model.
+
+        Parameters:
+            chat (List[Dict[str, str]]): A list of dictionaries containing the chat
+                messages. The messages can be in the format:
+                [{"role": "user", "content": "Hello!"}, ...]
+                or if it contains media, it should be in the format:
+                [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
+        """
         fixed_chat = []
         for c in chat:
             fixed_c = {"role": c["role"]}
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                                "url": f"data:image/{extension};base64,{encoded_image}",
-                                "detail": "low",
+            if "media" in c:
+                for image in c["media"]:
+                    extension = Path(image).suffix
+                    if extension.lower() == ".jpeg" or extension.lower() == ".jpg":
+                        extension = "jpg"
+                    elif extension.lower() == ".png":
+                        extension = "png"
+                    else:
+                        raise ValueError(f"Unsupported image extension: {extension}")
+                    encoded_image = encode_image(image)
+                    fixed_c["content"].append(  # type: ignore
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/{extension};base64,{encoded_image}",  # type: ignore
+                                "detail": "low",
+                            },
                         },
-
-
+                    )
+            fixed_chat.append(fixed_c)
 
         response = self.client.chat.completions.create(
             model=self.model_name, messages=fixed_chat, **self.kwargs  # type: ignore
@@ -168,7 +125,7 @@ class OpenAILMM(LMM):
     def generate(
         self,
         prompt: str,
-
+        media: Optional[List[Union[str, Path]]] = None,
     ) -> str:
         message: List[Dict[str, Any]] = [
             {
@@ -178,10 +135,10 @@ class OpenAILMM(LMM):
                 ],
             }
         ]
-        if
-            for
-                extension = Path(
-                encoded_image = encode_image(
+        if media and len(media) > 0:
+            for m in media:
+                extension = Path(m).suffix
+                encoded_image = encode_image(m)
                 message[0]["content"].append(
                     {
                         "type": "image_url",
@@ -198,9 +155,7 @@ class OpenAILMM(LMM):
         return cast(str, response.choices[0].message.content)
 
     def generate_classifier(self, question: str) -> Callable:
-
-
-        api_doc = CLIP.description + "\n" + str(CLIP.usage)
+        api_doc = T.get_tool_documentation([T.clip])
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
             model=self.model_name,
@@ -220,12 +175,10 @@ class OpenAILMM(LMM):
             )
             raise ValueError("Failed to decode response")
 
-        return lambda x:
+        return lambda x: T.clip(x, params["prompt"])
 
     def generate_detector(self, question: str) -> Callable:
-
-
-        api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage)
+        api_doc = T.get_tool_documentation([T.grounding_dino])
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
             model=self.model_name,
@@ -245,12 +198,10 @@ class OpenAILMM(LMM):
             )
             raise ValueError("Failed to decode response")
 
-        return lambda x:
+        return lambda x: T.grounding_dino(params["prompt"], x)
 
     def generate_segmentor(self, question: str) -> Callable:
-
-
-        api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage)
+        api_doc = T.get_tool_documentation([T.grounding_sam])
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
             model=self.model_name,
@@ -270,17 +221,13 @@ class OpenAILMM(LMM):
             )
             raise ValueError("Failed to decode response")
 
-        return lambda x:
+        return lambda x: T.grounding_sam(params["prompt"], x)
 
     def generate_zero_shot_counter(self, question: str) -> Callable:
-
-
-        return lambda x: ZeroShotCounting()(**{"image": x})
+        return T.zero_shot_counting
 
     def generate_image_qa_tool(self, question: str) -> Callable:
-
-
-        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+        return lambda x: T.image_question_answering(question, x)
 
 
 class AzureOpenAILMM(OpenAILMM):
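The generate_* factories now close over the functional tools in vision_agent.tools instead of the removed tool classes. A sketch of the detector path, using a stand-in image array since the returned callable forwards its argument straight to T.grounding_dino:

    import numpy as np

    from vision_agent.lmm import OpenAILMM

    lmm = OpenAILMM()
    detect = lmm.generate_detector("Can you detect the dogs in this image?")
    image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a real image
    detections = detect(image)  # calls T.grounding_dino(params["prompt"], image)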
@@ -314,12 +261,3 @@ class AzureOpenAILMM(OpenAILMM):
         if json_mode:
             kwargs["response_format"] = {"type": "json_object"}
         self.kwargs = kwargs
-
-
-def get_lmm(name: str) -> LMM:
-    if name == "openai":
-        return OpenAILMM(name)
-    elif name == "llava":
-        return LLaVALMM(name)
-    else:
-        raise ValueError(f"Unknown LMM: {name}, current support openai, llava")