vision-agent 0.2.118__py3-none-any.whl → 0.2.119__py3-none-any.whl

vision_agent/agent/agent.py CHANGED
@@ -11,7 +11,7 @@ class Agent(ABC):
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> str:
+    ) -> Union[str, List[Message]]:
         pass
 
     @abstractmethod
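With the widened return type, an Agent implementation may hand back either a plain string or the whole conversation. A brief illustrative sketch (not part of the package; it assumes log_progress is the remaining abstract method, matching the trailing @abstractmethod in this hunk):

from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from vision_agent.agent import Agent
from vision_agent.lmm import Message


class EchoAgent(Agent):
    # Hypothetical subclass used only to illustrate the new signature.
    def __call__(
        self,
        input: Union[str, List[Message]],
        media: Optional[Union[str, Path]] = None,
    ) -> Union[str, List[Message]]:
        if isinstance(input, str):
            input = [{"role": "user", "content": input}]
        # Returning the conversation itself is now allowed by the signature.
        return input + [{"role": "assistant", "content": "echo"}]

    def log_progress(self, data: Dict[str, Any]) -> None:
        pass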
vision_agent/agent/vision_agent.py CHANGED
@@ -1,8 +1,9 @@
 import copy
 import logging
 import os
+import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
@@ -13,8 +14,9 @@ from vision_agent.agent.vision_agent_prompts import (
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
+from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.utils import CodeInterpreterFactory
-from vision_agent.utils.execute import CodeInterpreter
+from vision_agent.utils.execute import CodeInterpreter, Execution
 
 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
@@ -24,23 +26,30 @@ if str(WORKSPACE) != "":
     os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
 
 
-class DefaultImports:
-    code = [
+class BoilerplateCode:
+    pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "artifacts = Artifacts('{remote_path}')",
+        "artifacts.load('{remote_path}')",
+    ]
+    post_code = [
+        "artifacts.save()",
     ]
 
     @staticmethod
-    def to_code_string() -> str:
-        return "\n".join(DefaultImports.code)
-
-    @staticmethod
-    def prepend_imports(code: str) -> str:
+    def add_boilerplate(code: str, **format: Any) -> str:
         """Run this method to prepend the default imports to the code.
         NOTE: be sure to run this method after the custom tools have been registered.
         """
-        return DefaultImports.to_code_string() + "\n\n" + code
+        return (
+            "\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
+            + "\n\n"
+            + code
+            + "\n\n"
+            + "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
+        )
 
 
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
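For context, a standalone sketch (not the library's code) of what add_boilerplate produces: pre_code, the agent's code, and post_code joined with blank lines, with '{remote_path}' filled in via str.format:

# Standalone illustration of the composition; the remote_path value is made up.
pre_code = [
    "artifacts = Artifacts('{remote_path}')",
    "artifacts.load('{remote_path}')",
]
post_code = ["artifacts.save()"]


def add_boilerplate(code: str, **fmt: str) -> str:
    # Mirrors the join/format logic of BoilerplateCode.add_boilerplate above.
    return (
        "\n".join(s.format(**fmt) for s in pre_code)
        + "\n\n"
        + code
        + "\n\n"
        + "\n".join(s.format(**fmt) for s in post_code)
    )


print(add_boilerplate("print('hello')", remote_path="/home/user/artifacts.pkl"))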
@@ -60,35 +69,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
         examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
-        dir=WORKSPACE,
         conversation=conversation,
     )
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
 
 
-def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
-    # Note the code interpreter needs to keep running in the same environment because
-    # the SWE tools hold state like line numbers and currently open files.
-    result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
-
-    return_str = ""
-    if result.success:
-        for res in result.results:
-            if res.text is not None:
-                return_str += res.text.replace("\\n", "\n")
-        if result.logs.stdout:
-            return_str += "----- stdout -----\n"
-            for log in result.logs.stdout:
-                return_str += log.replace("\\n", "\n")
-    else:
-        # for log in result.logs.stderr:
-        #     return_str += log.replace("\\n", "\n")
-        if result.error:
-            return_str += (
-                "\n" + result.error.value + "\n".join(result.error.traceback_raw)
-            )
-
-    return return_str
+def run_code_action(
+    code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+) -> Execution:
+    return code_interpreter.exec_isolation(
+        BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
+    )
 
 
 def parse_execution(response: str) -> Optional[str]:
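run_code_action now returns the raw Execution instead of a formatted string; callers derive the observation text from its logs. A hedged sketch of that pattern, using only the calls visible in this diff (passing code_sandbox_runtime=None selects the factory default):

from vision_agent.utils import CodeInterpreterFactory

with CodeInterpreterFactory.new_instance(code_sandbox_runtime=None) as interpreter:
    result = interpreter.exec_isolation("print('hello from the sandbox')")
    obs = str(result.logs)  # the same conversion VisionAgent uses for observations
    print(obs)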
@@ -101,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:
 
 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
-    agents to generate code for it. Vision Agent uses python code to execute actions for
-    the user. Vision Agent is inspired by by OpenDev
+    agents to generate code for it. Vision Agent uses python code to execute actions
+    for the user. Vision Agent is inspired by by OpenDev
     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030
 
     Example
@@ -118,8 +109,20 @@ class VisionAgent(Agent):
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
         code_sandbox_runtime: Optional[str] = None,
     ) -> None:
+        """Initialize the VisionAgent.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
         self.agent = (
             OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
         )
@@ -128,12 +131,21 @@ class VisionAgent(Agent):
         self.code_sandbox_runtime = code_sandbox_runtime
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
+        self.local_artifacts_path = cast(
+            str,
+            (
+                Path(local_artifacts_path)
+                if local_artifacts_path is not None
+                else Path(tempfile.NamedTemporaryFile(delete=False).name)
+            ),
+        )
 
     def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> str:
+        artifacts: Optional[Artifacts] = None,
+    ) -> List[Message]:
         """Chat with VisionAgent and get the conversation response.
 
         Parameters:
@@ -141,6 +153,7 @@ class VisionAgent(Agent):
                 [{"role": "user", "content": "describe your task here..."}, ...] or a
                 string of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.
 
         Returns:
             str: The conversation response.
@@ -149,22 +162,23 @@ class VisionAgent(Agent):
            input = [{"role": "user", "content": input}]
            if media is not None:
                input[0]["media"] = [media]
-        results = self.chat_with_code(input)
-        return results  # type: ignore
+        results, _ = self.chat_with_code(input, artifacts)
+        return results
 
     def chat_with_code(
         self,
         chat: List[Message],
-    ) -> List[Message]:
+        artifacts: Optional[Artifacts] = None,
+    ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
 
         Parameters:
-            chat (List[Message]): A conversation
-                in the format of:
+            chat (List[Message]): A conversation in the format of:
                 [{"role": "user", "content": "describe your task here..."}]
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.
 
         Returns:
             List[Message]: The conversation response.
@@ -173,6 +187,10 @@ class VisionAgent(Agent):
         if not chat:
             raise ValueError("chat cannot be empty")
 
+        if not artifacts:
+            # this is setting remote artifacts path
+            artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
+
         with CodeInterpreterFactory.new_instance(
             code_sandbox_runtime=self.code_sandbox_runtime
         ) as code_interpreter:
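The default container created above is the same Artifacts object the media loop below populates. A hedged sketch of that API using only the operations visible in this diff (file names are illustrative):

from pathlib import Path

from vision_agent.tools.meta_tools import Artifacts

# Hypothetical local pre-seeding of artifacts before a chat.
artifacts = Artifacts("artifacts.pkl")
artifacts.artifacts[Path("dog.jpg").name] = open("dog.jpg", "rb").read()
artifacts.save("artifacts.pkl")  # persisted, as VisionAgent does before uploading
print(artifacts.show())  # the "[Artifacts loaded]" listing the agent sees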
@@ -182,9 +200,14 @@ class VisionAgent(Agent):
             for chat_i in int_chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
-                        chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(media)
+                        media = cast(str, media)
+                        artifacts.artifacts[Path(media).name] = open(media, "rb").read()
+
+                        media_remote_path = (
+                            Path(code_interpreter.remote_path) / Path(media).name
+                        )
+                        chat_i["content"] += f" Media name {media_remote_path}"  # type: ignore
+                        media_list.append(media_remote_path)
 
             int_chat = cast(
                 List[Message],
@@ -204,6 +227,22 @@ class VisionAgent(Agent):
 
             finished = False
             iterations = 0
+            last_response = None
+
+            # Save the current state of artifacts, will include any images the user
+            # passed in.
+            artifacts.save(self.local_artifacts_path)
+
+            # Upload artifacts to remote location and show where they are going
+            # to be loaded to. The actual loading happens in BoilerplateCode as
+            # part of the pre_code.
+            remote_artifacts_path = code_interpreter.upload_file(
+                self.local_artifacts_path
+            )
+            artifacts_loaded = artifacts.show()
+            int_chat.append({"role": "observation", "content": artifacts_loaded})
+            orig_chat.append({"role": "observation", "content": artifacts_loaded})
+
             while not finished and iterations < self.max_iterations:
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
@@ -211,20 +250,39 @@ class VisionAgent(Agent):
                 int_chat.append({"role": "assistant", "content": str(response)})
                 orig_chat.append({"role": "assistant", "content": str(response)})
 
+                # sometimes it gets stuck in a loop, so we force it to exit
+                if last_response == response:
+                    response["let_user_respond"] = True
+
                 if response["let_user_respond"]:
                     break
 
                 code_action = parse_execution(response["response"])
 
                 if code_action is not None:
-                    obs = run_code_action(code_action, code_interpreter)
+                    result = run_code_action(
+                        code_action, code_interpreter, str(remote_artifacts_path)
+                    )
+                    obs = str(result.logs)
+
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
+                    # don't add execution results to internal chat
                     int_chat.append({"role": "observation", "content": obs})
-                    orig_chat.append({"role": "observation", "content": obs})
+                    orig_chat.append(
+                        {"role": "observation", "content": obs, "execution": result}
+                    )
 
                 iterations += 1
-            return orig_chat
+                last_response = response
+
+            # after running the agent, download the artifacts locally
+            code_interpreter.download_file(
+                str(remote_artifacts_path.name), str(self.local_artifacts_path)
+            )
+            artifacts.load(self.local_artifacts_path)
+            artifacts.save()
+            return orig_chat, artifacts
 
     def log_progress(self, data: Dict[str, Any]) -> None:
         pass
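Taken together, these changes alter the public surface of VisionAgent: chat_with_code takes an optional Artifacts argument and returns a (messages, artifacts) tuple, and observation messages carry the raw Execution. A hypothetical usage sketch for 0.2.119 (file names are illustrative):

from vision_agent.agent import VisionAgent

agent = VisionAgent(verbosity=1, local_artifacts_path="chat_artifacts.pkl")

conversation, artifacts = agent.chat_with_code(
    [
        {
            "role": "user",
            "content": "Can you detect the dogs in this image?",
            "media": ["dog.jpg"],
        }
    ]
)
# chat_with_code now returns (messages, artifacts); the updated artifacts
# (generated code files, saved visualizations) are downloaded and reloaded
# locally after the run.
print(artifacts.show())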
vision_agent/agent/vision_agent_coder.py CHANGED
@@ -722,10 +722,10 @@ class VisionAgentCoder(Agent):
                             media
                             if type(media) is str
                             and media.startswith(("http", "https"))
-                            else code_interpreter.upload_file(media)
+                            else code_interpreter.upload_file(cast(str, media))
                         )
                         chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(media)
+                        media_list.append(str(media))
 
             int_chat = cast(
                 List[Message],
vision_agent/agent/vision_agent_prompts.py CHANGED
@@ -1,7 +1,7 @@
 VA_CODE = """
 **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
 
-**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
 
 <execute_python>
 print("Hello World!")
@@ -15,7 +15,6 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---
 
@@ -26,24 +25,28 @@ Here is an example of how you can interact with a user and Actions to complete a task:
 **Conversation**:
 Here is the current conversation so far:
 --- START CONVERSATION ---
-[Current directory: {dir}]
-
 {conversation}
 """
 
+
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
+OBSERVATION:
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
-[End of file]
+[End of artifact]
 
 AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
@@ -56,18 +59,23 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
 
 USER: The the image only has one dog, can you fix this?
 
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
-[End of file]
+[End of artifact]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -76,23 +84,34 @@ OBSERVATION:
 AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
 """
 
+
 EXAMPLES_CODE2 = """
-USER: Can you create a function to count workers with helmets?
+USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
+
+OBSERVATION:
+[Artifacts loaded]
+[End of artifacts]
 
 AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
 
 USER: Yes you can use workers.png
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
+OBSERVATION:
+[Artifacts loaded]
+Artifact workers.png loaded to /path/to/images/workers.png
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/code.py]
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
-1|def count_workers_with_helmets(image_path: str):
+[Artifact code.py]
+0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    workers = owl_v2("worker", image)
-4|    helmets = owl_v2("helmet", image)
-5|    count = 0
+3|    detections = owl_v2("worker, helmet", image)
+4|    workers = [d for d in detections if d['label'] == 'worker']
+5|    helmets = [d for d in detections if d['label'] == 'helmet']
+6|    count = 0
 6|    for worker in workers:
 7|        person_box = worker['bbox']
 8|        person_has_helmet = False
@@ -102,14 +121,16 @@ OBSERVATION:
 12|            break
 13|        if person_has_helmet:
 14|            count += 1
+15|    overlay_bounding_boxes(image, detections)
+16|    save_image(output_path, image)
 15|    return count
-[End of file]
+[End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
vision_agent/clients/landing_public_api.py CHANGED
@@ -5,9 +5,9 @@ from uuid import UUID
 from requests.exceptions import HTTPError
 
 from vision_agent.clients.http import BaseHTTP
-from vision_agent.utils.type_defs import LandingaiAPIKey
+from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
 from vision_agent.utils.exceptions import FineTuneModelNotFound
-from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
+from vision_agent.utils.type_defs import LandingaiAPIKey
 
 
 class LandingPublicAPI(BaseHTTP):
vision_agent/lmm/lmm.py CHANGED
@@ -138,7 +138,7 @@ class OpenAILMM(LMM):
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
             if "media" in c:
                 for media in c["media"]:
-                    encoded_media = encode_media(media)
+                    encoded_media = encode_media(cast(str, media))
 
                     fixed_c["content"].append(  # type: ignore
                         {
@@ -389,7 +389,9 @@ class OllamaLMM(LMM):
         fixed_chat = []
         for message in chat:
             if "media" in message:
-                message["images"] = [encode_media(m) for m in message["media"]]
+                message["images"] = [
+                    encode_media(cast(str, m)) for m in message["media"]
+                ]
                 del message["media"]
             fixed_chat.append(message)
         url = f"{self.url}/chat"
vision_agent/lmm/types.py CHANGED
@@ -1,5 +1,7 @@
 from pathlib import Path
 from typing import Dict, Sequence, Union
 
+from vision_agent.utils.execute import Execution
+
 TextOrImage = Union[str, Sequence[Union[str, Path]]]
-Message = Dict[str, TextOrImage]
+Message = Dict[str, Union[TextOrImage, Execution]]
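With Execution added to the union, an observation message can carry the raw execution result alongside its text. A small illustrative helper (not part of the package):

from vision_agent.lmm.types import Message
from vision_agent.utils.execute import Execution


def make_observation(text: str, result: Execution) -> Message:
    # Keys follow the usage in vision_agent.py above.
    return {"role": "observation", "content": text, "execution": result}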
vision_agent/tools/__init__.py CHANGED
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional
 
-from .meta_tools import META_TOOL_DOCSTRING
+from .meta_tools import META_TOOL_DOCSTRING, Artifacts
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tool_utils import get_tool_descriptions_by_names
 from .tools import (
@@ -21,8 +21,8 @@ from .tools import (
     dpt_hybrid_midas,
     extract_frames,
     florence2_image_caption,
-    florence2_phrase_grounding,
     florence2_ocr,
+    florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video,