vision-agent 0.2.117__py3-none-any.whl → 0.2.119__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ class Agent(ABC):
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> str:
+    ) -> Union[str, List[Message]]:
         pass

     @abstractmethod
@@ -1,8 +1,9 @@
 import copy
 import logging
 import os
+import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast

 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
@@ -13,8 +14,9 @@ from vision_agent.agent.vision_agent_prompts import (
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
+from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.utils import CodeInterpreterFactory
-from vision_agent.utils.execute import CodeInterpreter
+from vision_agent.utils.execute import CodeInterpreter, Execution

 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
@@ -24,23 +26,30 @@ if str(WORKSPACE) != "":
     os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"


-class DefaultImports:
-    code = [
+class BoilerplateCode:
+    pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "artifacts = Artifacts('{remote_path}')",
+        "artifacts.load('{remote_path}')",
+    ]
+    post_code = [
+        "artifacts.save()",
     ]

     @staticmethod
-    def to_code_string() -> str:
-        return "\n".join(DefaultImports.code)
-
-    @staticmethod
-    def prepend_imports(code: str) -> str:
+    def add_boilerplate(code: str, **format: Any) -> str:
        """Run this method to prepend the default imports to the code.
        NOTE: be sure to run this method after the custom tools have been registered.
        """
-        return DefaultImports.to_code_string() + "\n\n" + code
+        return (
+            "\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
+            + "\n\n"
+            + code
+            + "\n\n"
+            + "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
+        )


 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
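To make the new wrapping concrete, here is a minimal sketch (not part of the diff; the module path and the remote path are assumptions for illustration) of what BoilerplateCode.add_boilerplate is expected to produce:

from vision_agent.agent.vision_agent import BoilerplateCode  # module path assumed

# Wrap an arbitrary snippet; '/home/user/artifacts.pkl' is an illustrative remote path.
wrapped = BoilerplateCode.add_boilerplate(
    "print(artifacts.show())", remote_path="/home/user/artifacts.pkl"
)
print(wrapped)
# The result starts with the pre_code block (meta-tool imports, then
# artifacts = Artifacts('/home/user/artifacts.pkl') and artifacts.load(...)),
# contains the snippet itself, and ends with the post_code line artifacts.save().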
@@ -60,35 +69,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
         examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
-        dir=WORKSPACE,
         conversation=conversation,
     )
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore


-def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
-    # Note the code interpreter needs to keep running in the same environment because
-    # the SWE tools hold state like line numbers and currently open files.
-    result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
-
-    return_str = ""
-    if result.success:
-        for res in result.results:
-            if res.text is not None:
-                return_str += res.text.replace("\\n", "\n")
-        if result.logs.stdout:
-            return_str += "----- stdout -----\n"
-            for log in result.logs.stdout:
-                return_str += log.replace("\\n", "\n")
-    else:
-        # for log in result.logs.stderr:
-        #     return_str += log.replace("\\n", "\n")
-        if result.error:
-            return_str += (
-                "\n" + result.error.value + "\n".join(result.error.traceback_raw)
-            )
-
-    return return_str
+def run_code_action(
+    code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+) -> Execution:
+    return code_interpreter.exec_isolation(
+        BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
+    )


 def parse_execution(response: str) -> Optional[str]:
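As a rough illustration of the new contract, the caller now receives the whole Execution and stringifies its logs for the observation, instead of assembling a string itself. A hedged sketch (module path, default sandbox, and the local artifacts file are assumptions):

from vision_agent.agent.vision_agent import run_code_action  # module path assumed
from vision_agent.utils import CodeInterpreterFactory

with CodeInterpreterFactory.new_instance() as code_interpreter:
    # assumes a local 'artifacts.pkl' exists; it is uploaded so the boilerplate can load it
    remote_artifacts_path = code_interpreter.upload_file("artifacts.pkl")
    result = run_code_action(
        "print(artifacts.show())", code_interpreter, str(remote_artifacts_path)
    )
    obs = str(result.logs)  # stdout/stderr of the isolated run become the observation text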
@@ -101,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:

 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
-    agents to generate code for it. Vision Agent uses python code to execute actions for
-    the user. Vision Agent is inspired by by OpenDev
+    agents to generate code for it. Vision Agent uses python code to execute actions
+    for the user. Vision Agent is inspired by by OpenDev
     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030

     Example
@@ -118,8 +109,20 @@ class VisionAgent(Agent):
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
         code_sandbox_runtime: Optional[str] = None,
     ) -> None:
+        """Initialize the VisionAgent.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
         self.agent = (
             OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
         )
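A minimal construction sketch for the new parameter (not from the diff; the path is illustrative, and it may be omitted, in which case a temporary file is used per the next hunk):

from vision_agent.agent import VisionAgent

# local_artifacts_path backs the artifacts pickle that is synced with the sandbox
agent = VisionAgent(verbosity=1, local_artifacts_path="my_artifacts.pkl")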
@@ -128,12 +131,21 @@ class VisionAgent(Agent):
         self.code_sandbox_runtime = code_sandbox_runtime
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
+        self.local_artifacts_path = cast(
+            str,
+            (
+                Path(local_artifacts_path)
+                if local_artifacts_path is not None
+                else Path(tempfile.NamedTemporaryFile(delete=False).name)
+            ),
+        )

     def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> str:
+        artifacts: Optional[Artifacts] = None,
+    ) -> List[Message]:
         """Chat with VisionAgent and get the conversation response.

         Parameters:
@@ -141,6 +153,7 @@ class VisionAgent(Agent):
                 [{"role": "user", "content": "describe your task here..."}, ...] or a
                 string of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.

         Returns:
             str: The conversation response.
@@ -149,22 +162,23 @@ class VisionAgent(Agent):
             input = [{"role": "user", "content": input}]
         if media is not None:
             input[0]["media"] = [media]
-        results = self.chat_with_code(input)
-        return results  # type: ignore
+        results, _ = self.chat_with_code(input, artifacts)
+        return results

     def chat_with_code(
         self,
         chat: List[Message],
-    ) -> List[Message]:
+        artifacts: Optional[Artifacts] = None,
+    ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.

         Parameters:
-            chat (List[Message]): A conversation
-                in the format of:
+            chat (List[Message]): A conversation in the format of:
                 [{"role": "user", "content": "describe your task here..."}]
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.

         Returns:
             List[Message]: The conversation response.
@@ -173,6 +187,10 @@ class VisionAgent(Agent):
         if not chat:
             raise ValueError("chat cannot be empty")

+        if not artifacts:
+            # this is setting remote artifacts path
+            artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
+
         with CodeInterpreterFactory.new_instance(
             code_sandbox_runtime=self.code_sandbox_runtime
         ) as code_interpreter:
@@ -182,9 +200,14 @@ class VisionAgent(Agent):
             for chat_i in int_chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
-                        chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(media)
+                        media = cast(str, media)
+                        artifacts.artifacts[Path(media).name] = open(media, "rb").read()
+
+                        media_remote_path = (
+                            Path(code_interpreter.remote_path) / Path(media).name
+                        )
+                        chat_i["content"] += f" Media name {media_remote_path}"  # type: ignore
+                        media_list.append(media_remote_path)

             int_chat = cast(
                 List[Message],
@@ -204,6 +227,22 @@ class VisionAgent(Agent):

             finished = False
             iterations = 0
+            last_response = None
+
+            # Save the current state of artifacts, will include any images the user
+            # passed in.
+            artifacts.save(self.local_artifacts_path)
+
+            # Upload artifacts to remote location and show where they are going
+            # to be loaded to. The actual loading happens in BoilerplateCode as
+            # part of the pre_code.
+            remote_artifacts_path = code_interpreter.upload_file(
+                self.local_artifacts_path
+            )
+            artifacts_loaded = artifacts.show()
+            int_chat.append({"role": "observation", "content": artifacts_loaded})
+            orig_chat.append({"role": "observation", "content": artifacts_loaded})
+
             while not finished and iterations < self.max_iterations:
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
@@ -211,20 +250,39 @@ class VisionAgent(Agent):
                 int_chat.append({"role": "assistant", "content": str(response)})
                 orig_chat.append({"role": "assistant", "content": str(response)})

+                # sometimes it gets stuck in a loop, so we force it to exit
+                if last_response == response:
+                    response["let_user_respond"] = True
+
                 if response["let_user_respond"]:
                     break

                 code_action = parse_execution(response["response"])

                 if code_action is not None:
-                    obs = run_code_action(code_action, code_interpreter)
+                    result = run_code_action(
+                        code_action, code_interpreter, str(remote_artifacts_path)
+                    )
+                    obs = str(result.logs)
+
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
+                    # don't add execution results to internal chat
                     int_chat.append({"role": "observation", "content": obs})
-                    orig_chat.append({"role": "observation", "content": obs})
+                    orig_chat.append(
+                        {"role": "observation", "content": obs, "execution": result}
+                    )

                 iterations += 1
-            return orig_chat
+                last_response = response
+
+            # after running the agent, download the artifacts locally
+            code_interpreter.download_file(
+                str(remote_artifacts_path.name), str(self.local_artifacts_path)
+            )
+            artifacts.load(self.local_artifacts_path)
+            artifacts.save()
+            return orig_chat, artifacts

     def log_progress(self, data: Dict[str, Any]) -> None:
         pass
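Taken together, these hunks change the public flow of VisionAgent: user media are copied into an Artifacts container, the container is uploaded before the conversation loop, each executed action runs against it, and chat_with_code now returns both the conversation and the updated artifacts. A hedged end-to-end sketch (file names and the request are made up):

from vision_agent.agent import VisionAgent
from vision_agent.tools.meta_tools import Artifacts

agent = VisionAgent(verbosity=1)
artifacts = Artifacts("artifacts.pkl")  # local container backing the remote artifacts.pkl
messages, artifacts = agent.chat_with_code(
    [{"role": "user", "content": "Detect the dogs in this image", "media": ["dog.jpg"]}],
    artifacts,
)
# Observation messages now also carry the raw Execution under the "execution" key,
# and the returned artifacts hold any files the agent wrote remotely.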
@@ -718,9 +718,14 @@ class VisionAgentCoder(Agent):
             for chat_i in chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
+                        media = (
+                            media
+                            if type(media) is str
+                            and media.startswith(("http", "https"))
+                            else code_interpreter.upload_file(cast(str, media))
+                        )
                         chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(media)
+                        media_list.append(str(media))

             int_chat = cast(
                 List[Message],
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
             results = {"code": "", "test": "", "plan": []}
             plan = []
             success = False
-            self.log_progress(
-                {
-                    "type": "log",
-                    "log_content": "Creating plans",
-                    "status": "started",
-                }
-            )
-            plans = write_plans(
-                int_chat,
-                T.get_tool_descriptions_by_names(
-                    customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
-                ),
-                format_memory(working_memory),
-                self.planner,
+
+            plans = self._create_plans(
+                int_chat, customized_tool_names, working_memory, self.planner
             )

-            if self.verbosity >= 1:
-                for p in plans:
-                    # tabulate will fail if the keys are not the same for all elements
-                    p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
-                    _LOGGER.info(
-                        f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
-                    )
+            if test_multi_plan:
+                self._log_plans(plans, self.verbosity)
+
             tool_infos = retrieve_tools(
                 plans,
                 self.tool_recommender,
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
         if self.report_progress_callback is not None:
             self.report_progress_callback(data)

+    def _create_plans(
+        self,
+        int_chat: List[Message],
+        customized_tool_names: Optional[List[str]],
+        working_memory: List[Dict[str, str]],
+        planner: LMM,
+    ) -> Dict[str, Any]:
+        self.log_progress(
+            {
+                "type": "log",
+                "log_content": "Creating plans",
+                "status": "started",
+            }
+        )
+        plans = write_plans(
+            int_chat,
+            T.get_tool_descriptions_by_names(
+                customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
+            ),
+            format_memory(working_memory),
+            planner,
+        )
+        return plans
+
+    def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
+        if verbosity >= 1:
+            for p in plans:
+                # tabulate will fail if the keys are not the same for all elements
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
+                _LOGGER.info(
+                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                )
+

 class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
@@ -1,7 +1,7 @@
 VA_CODE = """
 **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.

-**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.

 <execute_python>
 print("Hello World!")
@@ -15,7 +15,6 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---

@@ -26,24 +25,28 @@ Here is an example of how you can interact with a user and Actions to complete a
 **Conversation**:
 Here is the current conversation so far:
 --- START CONVERSATION ---
-[Current directory: {dir}]
-
 {conversation}
 """

+
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg

-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
+OBSERVATION:
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
-[End of file]
+[End of artifact]

 AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

@@ -56,18 +59,23 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask

 USER: The the image only has one dog, can you fix this?

-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
-[End of file]
+[End of artifact]

-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -76,23 +84,34 @@ OBSERVATION:
 AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
 """

+
 EXAMPLES_CODE2 = """
-USER: Can you create a function to count workers with helmets?
+USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
+
+OBSERVATION:
+[Artifacts loaded]
+[End of artifacts]

 AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}

 USER: Yes you can use workers.png

-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
+OBSERVATION:
+[Artifacts loaded]
+Artifact workers.png loaded to /path/to/images/workers.png
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /example/workspace/code.py]
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
-1|def count_workers_with_helmets(image_path: str):
+[Artifact code.py]
+0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    workers = owl_v2("worker", image)
-4|    helmets = owl_v2("helmet", image)
-5|    count = 0
+3|    detections = owl_v2("worker, helmet", image)
+4|    workers = [d for d in detections if d['label'] == 'worker']
+5|    helmets = [d for d in detections if d['label'] == 'helmet']
+6|    count = 0
 6|    for worker in workers:
 7|        person_box = worker['bbox']
 8|        person_has_helmet = False
@@ -102,14 +121,16 @@ OBSERVATION:
 12|                break
 13|        if person_has_helmet:
 14|            count += 1
+15|    overlay_bounding_boxes(image, detections)
+16|    save_image(output_path, image)
 15|    return count
-[End of file]
+[End of artifact]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
 2

-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
@@ -5,9 +5,9 @@ from uuid import UUID
 from requests.exceptions import HTTPError

 from vision_agent.clients.http import BaseHTTP
-from vision_agent.utils.type_defs import LandingaiAPIKey
+from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
 from vision_agent.utils.exceptions import FineTuneModelNotFound
-from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
+from vision_agent.utils.type_defs import LandingaiAPIKey


 class LandingPublicAPI(BaseHTTP):
vision_agent/lmm/lmm.py CHANGED
@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:


 def encode_media(media: Union[str, Path]) -> str:
+    if type(media) is str and media.startswith(("http", "https")):
+        # for mp4 video url, we assume there is a same url but ends with png
+        # vision-agent-ui will upload this png when uploading the video
+        if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
+            return media[:-4] + ".png"
+        return media
     extension = "png"
     extension = Path(media).suffix
     if extension.lower() not in {
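Behavioral sketch of the new branch above (the URLs are made up): plain http(s) media are passed through untouched, while .mp4/.mov URLs containing "vision-agent-dev.s3" are rewritten to a sibling .png that the UI is assumed to have uploaded alongside the video.

from vision_agent.lmm.lmm import encode_media

print(encode_media("https://example.com/cat.jpg"))
# -> "https://example.com/cat.jpg" (returned as-is, no base64 encoding)
print(encode_media("https://vision-agent-dev.s3.amazonaws.com/videos/clip.mp4"))
# -> "https://vision-agent-dev.s3.amazonaws.com/videos/clip.png"
# Local file paths still fall through to the existing base64 path below this branch.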
@@ -132,13 +138,17 @@ class OpenAILMM(LMM):
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
             if "media" in c:
                 for media in c["media"]:
-                    encoded_media = encode_media(media)
+                    encoded_media = encode_media(cast(str, media))

                     fixed_c["content"].append(  # type: ignore
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/png;base64,{encoded_media}",
+                                "url": (
+                                    encoded_media
+                                    if encoded_media.startswith(("http", "https"))
+                                    else f"data:image/png;base64,{encoded_media}"
+                                ),
                                 "detail": "low",
                             },
                         },
@@ -379,7 +389,9 @@ class OllamaLMM(LMM):
         fixed_chat = []
         for message in chat:
             if "media" in message:
-                message["images"] = [encode_media(m) for m in message["media"]]
+                message["images"] = [
+                    encode_media(cast(str, m)) for m in message["media"]
+                ]
                 del message["media"]
             fixed_chat.append(message)
         url = f"{self.url}/chat"
@@ -390,7 +402,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
-
             json_data = json.dumps(data)

             def f() -> Iterator[Optional[str]]:
@@ -424,7 +435,6 @@ class OllamaLMM(LMM):
         media: Optional[List[Union[str, Path]]] = None,
         **kwargs: Any,
     ) -> Union[str, Iterator[Optional[str]]]:
-
         url = f"{self.url}/generate"
         data: Dict[str, Any] = {
             "model": self.model_name,
@@ -439,7 +449,6 @@ class OllamaLMM(LMM):
         tmp_kwargs = self.kwargs | kwargs
         data.update(tmp_kwargs)
         if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
-
             json_data = json.dumps(data)

             def f() -> Iterator[Optional[str]]:
vision_agent/lmm/types.py CHANGED
@@ -1,5 +1,7 @@
 from pathlib import Path
 from typing import Dict, Sequence, Union

+from vision_agent.utils.execute import Execution
+
 TextOrImage = Union[str, Sequence[Union[str, Path]]]
-Message = Dict[str, TextOrImage]
+Message = Dict[str, Union[TextOrImage, Execution]]
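Since Message now admits an Execution value, an observation entry can carry the raw sandbox result alongside its text, which is how orig_chat is populated in the VisionAgent hunk above. A small illustrative helper (the helper itself is hypothetical, not part of the package):

from vision_agent.lmm import Message
from vision_agent.utils.execute import Execution

def observation_message(obs_text: str, result: Execution) -> Message:
    # mirrors the dict built in chat_with_code
    return {"role": "observation", "content": obs_text, "execution": result}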
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional

-from .meta_tools import META_TOOL_DOCSTRING
+from .meta_tools import META_TOOL_DOCSTRING, Artifacts
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tool_utils import get_tool_descriptions_by_names
 from .tools import (
@@ -21,8 +21,8 @@ from .tools import (
     dpt_hybrid_midas,
     extract_frames,
     florence2_image_caption,
-    florence2_object_detection,
     florence2_ocr,
+    florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video,
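The last hunk renames the exported Florence-2 grounding tool, so downstream imports need a one-line update. A hedged before/after sketch:

# Before (0.2.117):
# from vision_agent.tools import florence2_object_detection
# After (0.2.119):
from vision_agent.tools import florence2_phrase_grounding
# The call style presumably mirrors the owl_v2 examples above,
# e.g. florence2_phrase_grounding("dog", image).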