vision-agent 0.2.120__py3-none-any.whl → 0.2.122__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,7 @@ class BoilerplateCode:
  pre_code = [
  "from typing import *",
  "from vision_agent.utils.execute import CodeInterpreter",
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
  "artifacts = Artifacts('{remote_path}')",
  "artifacts.load('{remote_path}')",
  ]
@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:

  def run_code_action(
  code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
- ) -> Execution:
- return code_interpreter.exec_isolation(
+ ) -> Tuple[Execution, str]:
+ result = code_interpreter.exec_isolation(
  BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
  )

+ obs = str(result.logs)
+ if result.error:
+ obs += f"\n{result.error}"
+ return result, obs
+

  def parse_execution(response: str) -> Optional[str]:
  code = None
@@ -192,7 +197,7 @@ class VisionAgent(Agent):
  artifacts = Artifacts(WORKSPACE / "artifacts.pkl")

  with CodeInterpreterFactory.new_instance(
- code_sandbox_runtime=self.code_sandbox_runtime
+ code_sandbox_runtime=self.code_sandbox_runtime,
  ) as code_interpreter:
  orig_chat = copy.deepcopy(chat)
  int_chat = copy.deepcopy(chat)
@@ -260,10 +265,9 @@ class VisionAgent(Agent):
  code_action = parse_execution(response["response"])

  if code_action is not None:
- result = run_code_action(
+ result, obs = run_code_action(
  code_action, code_interpreter, str(remote_artifacts_path)
  )
- obs = str(result.logs)

  if self.verbosity >= 1:
  _LOGGER.info(obs)
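For orientation, a hedged sketch (not part of the diff) of how the new `(Execution, str)` return value of `run_code_action` is consumed; the code string and artifact path are illustrative:

```python
from vision_agent.agent.vision_agent import run_code_action
from vision_agent.utils import CodeInterpreterFactory

# Sketch: run_code_action now returns the Execution object plus a ready-made
# observation string (logs with any error text appended), so callers no longer
# rebuild the observation themselves.
with CodeInterpreterFactory.new_instance() as code_interpreter:
    result, obs = run_code_action(
        "print(get_tool_descriptions())",  # illustrative code_action string
        code_interpreter,
        "artifacts.pkl",                   # illustrative artifact_remote_path
    )
    print(obs)
```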
@@ -1,5 +1,4 @@
  import copy
- import difflib
  import logging
  import os
  import sys
@@ -29,6 +28,7 @@ from vision_agent.agent.vision_agent_coder_prompts import (
  USER_REQ,
  )
  from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+ from vision_agent.tools.meta_tools import get_diff
  from vision_agent.utils import CodeInterpreterFactory, Execution
  from vision_agent.utils.execute import CodeInterpreter
  from vision_agent.utils.image_utils import b64_to_pil
@@ -63,14 +63,6 @@ class DefaultImports:
  return DefaultImports.to_code_string() + "\n\n" + code


- def get_diff(before: str, after: str) -> str:
- return "".join(
- difflib.unified_diff(
- before.splitlines(keepends=True), after.splitlines(keepends=True)
- )
- )
-
-
  def format_memory(memory: List[Dict[str, str]]) -> str:
  output_str = ""
  for i, m in enumerate(memory):
@@ -81,20 +81,19 @@ plan2:
  - Count the number of detected objects labeled as 'person'.
  plan3:
  - Load the image from the provided file path 'image.jpg'.
- - Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people.
+ - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.

  ```python
- from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting
+ from vision_agent.tools import load_image, owl_v2, grounding_sam, countgd_counting
  image = load_image("image.jpg")
  owl_v2_out = owl_v2("person", image)

  gsam_out = grounding_sam("person", image)
  gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]

- loca_out = loca_zero_shot_counting(image)
- loca_out = loca_out["count"]
+ cgd_out = countgd_counting(image)

- final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "loca_zero_shot_counting": loca_out}}
+ final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "countgd_counting": cgd_out}}
  print(final_out)
  ```
  """
@@ -48,7 +48,7 @@ OBSERVATION:
  4| return dogs
  [End of artifact]

- AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -75,7 +75,7 @@ OBSERVATION:
  4| return dogs
  [End of artifact]

- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -126,7 +126,7 @@ OBSERVATION:
  15| return count
  [End of artifact]

- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
vision_agent/lmm/lmm.py CHANGED
@@ -286,9 +286,6 @@ class OpenAILMM(LMM):

  return lambda x: T.grounding_sam(params["prompt"], x)

- def generate_zero_shot_counter(self, question: str) -> Callable:
- return T.loca_zero_shot_counting
-
  def generate_image_qa_tool(self, question: str) -> Callable:
  return lambda x: T.git_vqa_v2(question, x)

@@ -37,10 +37,13 @@ from .tools import (
  load_image,
  loca_visual_prompt_counting,
  loca_zero_shot_counting,
+ countgd_counting,
+ countgd_example_based_counting,
  ocr,
  overlay_bounding_boxes,
  overlay_heat_map,
  overlay_segmentation_masks,
+ overlay_counting_results,
  owl_v2,
  save_image,
  save_json,
@@ -1,5 +1,7 @@
+ import difflib
  import os
  import pickle as pkl
+ import re
  import subprocess
  import tempfile
  from pathlib import Path
@@ -8,10 +10,13 @@ from typing import Any, Dict, List, Optional, Union
  from IPython.display import display

  import vision_agent as va
+ from vision_agent.clients.landing_public_api import LandingPublicAPI
  from vision_agent.lmm.types import Message
  from vision_agent.tools.tool_utils import get_tool_documentation
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+ from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
  from vision_agent.utils.execute import Execution, MimeType
+ from vision_agent.utils.image_utils import convert_to_b64

  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

@@ -99,13 +104,14 @@ class Artifacts:

  def show(self) -> str:
  """Shows the artifacts that have been loaded and their remote save paths."""
- out_str = "[Artifacts loaded]\n"
+ output_str = "[Artifacts loaded]\n"
  for k in self.artifacts.keys():
- out_str += (
+ output_str += (
  f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
  )
- out_str += "[End of artifacts]\n"
- return out_str
+ output_str += "[End of artifacts]\n"
+ print(output_str)
+ return output_str

  def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
  save_path = (
@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:


  def view_lines(
- lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
+ lines: List[str],
+ line_num: int,
+ window_size: int,
+ name: str,
+ total_lines: int,
+ print_output: bool = True,
  ) -> str:
  start = max(0, line_num - window_size)
  end = min(len(lines), line_num + window_size)
@@ -148,7 +159,9 @@ def view_lines(
  else f"[{len(lines) - end} more lines]"
  )
  )
- print(return_str)
+
+ if print_output:
+ print(return_str)
  return return_str


@@ -231,7 +244,7 @@ def edit_code_artifact(
  new_content_lines = [
  line if line.endswith("\n") else line + "\n" for line in new_content_lines
  ]
- lines = artifacts[name].splitlines()
+ lines = artifacts[name].splitlines(keepends=True)
  edited_lines = lines[:start] + new_content_lines + lines[end:]

  cur_line = start + len(content.split("\n")) // 2
@@ -261,13 +274,20 @@ def edit_code_artifact(
  DEFAULT_WINDOW_SIZE,
  name,
  total_lines,
+ print_output=False,
  )
  total_lines_edit = sum(1 for _ in edited_lines)
  edited_view = view_lines(
- edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+ edited_lines,
+ cur_line,
+ DEFAULT_WINDOW_SIZE,
+ name,
+ total_lines_edit,
+ print_output=False,
  )

  error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+ print(error_msg)
  return error_msg

  artifacts[name] = "".join(edited_lines)
@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
  return f"[Media {Path(local_path).name} saved]"


+ def list_artifacts(artifacts: Artifacts) -> str:
+ """Lists all the artifacts that have been loaded into the artifacts object."""
+ output_str = artifacts.show()
+ print(output_str)
+ return output_str
+
+

  def get_tool_descriptions() -> str:
  """Returns a description of all the tools that `generate_vision_code` has access to.
@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
  Helpful for answering questions about what types of vision tasks you can do with
  return TOOL_DESCRIPTIONS

+ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+ """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+ objects in an image based on a given dataset. It returns the fine tuning job id.
+
+ Parameters:
+ bboxes (List[BboxInput]): A list of BboxInput containing the
+ image path, labels and bounding boxes.
+ task (str): The florencev2 fine-tuning task. The options are
+ 'phrase_grounding'.
+
+ Returns:
+ UUID: The fine tuning job id, this id will used to retrieve the fine
+ tuned model.
+
+ Example
+ -------
+ >>> fine_tuning_job_id = florencev2_fine_tuning(
+ [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+ {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+ "phrase_grounding"
+ )
+ """
+ bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+ task_type = PromptTask[task.upper()]
+ fine_tuning_request = [
+ BboxInputBase64(
+ image=convert_to_b64(bbox_input.image_path),
+ filename=Path(bbox_input.image_path).name,
+ labels=bbox_input.labels,
+ bboxes=bbox_input.bboxes,
+ )
+ for bbox_input in bboxes_input
+ ]
+ landing_api = LandingPublicAPI()
+ fine_tune_id = str(
+ landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+ )
+ print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+ return fine_tune_id
+
+
+ def get_diff(before: str, after: str) -> str:
+ return "".join(
+ difflib.unified_diff(
+ before.splitlines(keepends=True), after.splitlines(keepends=True)
+ )
+ )
+
+
+ def use_florence2_fine_tuning(
+ artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+ ) -> str:
+ """Replaces florence2 calls with the fine tuning id. This ensures that the code
+ utilizes the fined tuned florence2 model. Returns the diff between the original
+ code and the new code.
+
+ Parameters:
+ artifacts (Artifacts): The artifacts object to edit the code from.
+ name (str): The name of the artifact to edit.
+ task (str): The task to fine tune the model for. The options are
+ 'phrase_grounding'.
+ fine_tune_id (str): The fine tuning job id.
+
+ Examples
+ --------
+ >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+ """
+
+ task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+ if name not in artifacts:
+ output_str = f"[Artifact {name} does not exist]"
+ print(output_str)
+ return output_str
+
+ code = artifacts[name]
+ if task.lower() == "phrase_grounding":
+ pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
+
+ def replacer(match: re.Match) -> str:
+ arg = match.group(1)  # capture all initial arguments
+ return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
+
+ else:
+ raise ValueError(f"Task {task} is not supported.")
+
+ new_code = re.sub(pattern, replacer, code)
+
+ if new_code == code:
+ output_str = (
+ f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
+ )
+ print(output_str)
+ return output_str
+
+ artifacts[name] = new_code
+
+ diff = get_diff(code, new_code)
+ print(diff)
+ return diff
+
+
  META_TOOL_DOCSTRING = get_tool_documentation(
  [
  get_tool_descriptions,
@@ -406,5 +535,8 @@ META_TOOL_DOCSTRING = get_tool_documentation(
  generate_vision_code,
  edit_vision_code,
  write_media_artifact,
+ florence2_fine_tuning,
+ use_florence2_fine_tuning,
+ list_artifacts,
  ]
  )
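For context, a hedged end-to-end sketch of the two new meta tools (image paths, labels and artifact names are illustrative; the fine-tuning job must reach SUCCEEDED before the returned id is usable):

```python
from vision_agent.tools.meta_tools import (
    Artifacts,
    florence2_fine_tuning,
    use_florence2_fine_tuning,
)

# Sketch: launch a phrase-grounding fine-tuning job from labeled boxes ...
bboxes = [
    {"image_path": "screws_1.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]},
    {"image_path": "screws_2.png", "labels": ["screw"], "bboxes": [[120, 0, 300, 170]]},
]
fine_tune_id = florence2_fine_tuning(bboxes, "phrase_grounding")

# ... then rewrite florence2_phrase_grounding(...) calls inside a code artifact
# so they pass the fine-tuned model id; the return value is a unified diff.
artifacts = Artifacts("artifacts.pkl")  # illustrative local artifacts store
artifacts["code.py"] = 'dets = florence2_phrase_grounding("screw", image)\n'
diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", fine_tune_id)
print(diff)
```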
@@ -1,6 +1,6 @@
+ import os
  import inspect
  import logging
- import os
  from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

  import pandas as pd
@@ -13,6 +13,7 @@ from urllib3.util.retry import Retry
  from vision_agent.utils.exceptions import RemoteToolCallFailed
  from vision_agent.utils.execute import Error, MimeType
  from vision_agent.utils.type_defs import LandingaiAPIKey
+ from vision_agent.tools.tools_types import BoundingBoxes

  _LOGGER = logging.getLogger(__name__)
  _LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
@@ -34,61 +35,58 @@ def send_inference_request(
  files: Optional[List[Tuple[Any, ...]]] = None,
  v2: bool = False,
  metadata_payload: Optional[Dict[str, Any]] = None,
- ) -> Dict[str, Any]:
+ ) -> Any:
  # TODO: runtime_tag and function_name should be metadata_payload and now included
  # in the service payload
- try:
- if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
- payload["runtime_tag"] = runtime_tag
+ if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
+ payload["runtime_tag"] = runtime_tag
+
+ url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
+ if "TOOL_ENDPOINT_URL" in os.environ:
+ url = os.environ["TOOL_ENDPOINT_URL"]
+
+ headers = {"apikey": _LND_API_KEY}
+ if "TOOL_ENDPOINT_AUTH" in os.environ:
+ headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
+ headers.pop("apikey")
+
+ session = _create_requests_session(
+ url=url,
+ num_retry=3,
+ headers=headers,
+ )

- url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
- if "TOOL_ENDPOINT_URL" in os.environ:
- url = os.environ["TOOL_ENDPOINT_URL"]
+ function_name = "unknown"
+ if "function_name" in payload:
+ function_name = payload["function_name"]
+ elif metadata_payload is not None and "function_name" in metadata_payload:
+ function_name = metadata_payload["function_name"]

- tool_call_trace = ToolCallTrace(
- endpoint_url=url,
- request=payload,
- response={},
- error=None,
- )
- headers = {"apikey": _LND_API_KEY}
- if "TOOL_ENDPOINT_AUTH" in os.environ:
- headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
- headers.pop("apikey")
-
- session = _create_requests_session(
- url=url,
- num_retry=3,
- headers=headers,
- )
+ response = _call_post(url, payload, session, files, function_name)

- if files is not None:
- res = session.post(url, data=payload, files=files)
- else:
- res = session.post(url, json=payload)
- if res.status_code != 200:
- tool_call_trace.error = Error(
- name="RemoteToolCallFailed",
- value=f"{res.status_code} - {res.text}",
- traceback_raw=[],
- )
- _LOGGER.error(f"Request failed: {res.status_code} {res.text}")
- # TODO: function_name should be in metadata_payload
- function_name = "unknown"
- if "function_name" in payload:
- function_name = payload["function_name"]
- elif metadata_payload is not None and "function_name" in metadata_payload:
- function_name = metadata_payload["function_name"]
- raise RemoteToolCallFailed(function_name, res.status_code, res.text)
-
- resp = res.json()
- tool_call_trace.response = resp
- # TODO: consider making the response schema the same between below two sources
- return resp if "TOOL_ENDPOINT_AUTH" in os.environ else resp["data"] # type: ignore
- finally:
- trace = tool_call_trace.model_dump()
- trace["type"] = "tool_call"
- display({MimeType.APPLICATION_JSON: trace}, raw=True)
+ # TODO: consider making the response schema the same between below two sources
+ return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
+
+
+ def send_task_inference_request(
+ payload: Dict[str, Any],
+ task_name: str,
+ files: Optional[List[Tuple[Any, ...]]] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> Any:
+ url = f"{_LND_API_URL_v2}/{task_name}"
+ headers = {"apikey": _LND_API_KEY}
+ session = _create_requests_session(
+ url=url,
+ num_retry=3,
+ headers=headers,
+ )
+
+ function_name = "unknown"
+ if metadata is not None and "function_name" in metadata:
+ function_name = metadata["function_name"]
+ response = _call_post(url, payload, session, files, function_name)
+ return response["data"]


  def _create_requests_session(
@@ -195,3 +193,49 @@ def get_tools_info(funcs: List[Callable[..., Any]]) -> Dict[str, str]:
  data[func.__name__] = f"{func.__name__}{inspect.signature(func)}:\n{desc}"

  return data
+
+
+ def _call_post(
+ url: str,
+ payload: dict[str, Any],
+ session: Session,
+ files: Optional[List[Tuple[Any, ...]]] = None,
+ function_name: str = "unknown",
+ ) -> Any:
+ try:
+ tool_call_trace = ToolCallTrace(
+ endpoint_url=url,
+ request=payload,
+ response={},
+ error=None,
+ )
+
+ if files is not None:
+ response = session.post(url, data=payload, files=files)
+ else:
+ response = session.post(url, json=payload)
+
+ if response.status_code != 200:
+ tool_call_trace.error = Error(
+ name="RemoteToolCallFailed",
+ value=f"{response.status_code} - {response.text}",
+ traceback_raw=[],
+ )
+ _LOGGER.error(f"Request failed: {response.status_code} {response.text}")
+ raise RemoteToolCallFailed(
+ function_name, response.status_code, response.text
+ )
+
+ result = response.json()
+ tool_call_trace.response = result
+ return result
+ finally:
+ trace = tool_call_trace.model_dump()
+ trace["type"] = "tool_call"
+ display({MimeType.APPLICATION_JSON: trace}, raw=True)
+
+
+ def filter_bboxes_by_threshold(
+ bboxes: BoundingBoxes, threshold: float
+ ) -> BoundingBoxes:
+ return list(filter(lambda bbox: bbox.score >= threshold, bboxes))
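A small sketch of the new `filter_bboxes_by_threshold` helper, using the `ODResponseData` model added in `tools_types.py` further down (scores are invented):

```python
from vision_agent.tools.tool_utils import filter_bboxes_by_threshold
from vision_agent.tools.tools_types import ODResponseData

# Sketch: drop detections whose score is below the threshold.
bboxes = [
    ODResponseData(label="flower", score=0.18, bounding_box=[0.10, 0.11, 0.35, 0.40]),
    ODResponseData(label="flower", score=0.68, bounding_box=[0.20, 0.21, 0.45, 0.50]),
]
kept = filter_bboxes_by_threshold(bboxes, 0.23)
print([b.model_dump() for b in kept])  # only the 0.68 detection remains
```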
@@ -13,7 +13,7 @@ import cv2
  import numpy as np
  import requests
  from moviepy.editor import ImageSequenceClip
- from PIL import Image, ImageDraw, ImageFont
+ from PIL import Image, ImageDraw, ImageFont, ImageEnhance
  from pillow_heif import register_heif_opener  # type: ignore
  from pytube import YouTube  # type: ignore

@@ -24,14 +24,15 @@ from vision_agent.tools.tool_utils import (
  get_tools_df,
  get_tools_info,
  send_inference_request,
+ send_task_inference_request,
+ filter_bboxes_by_threshold,
  )
  from vision_agent.tools.tools_types import (
- BboxInput,
- BboxInputBase64,
  FineTuning,
- Florencev2FtRequest,
+ Florence2FtRequest,
  JobStatus,
  PromptTask,
+ ODResponseData,
  )
  from vision_agent.utils import extract_frames_from_video
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
@@ -455,7 +456,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
  "image": image_b64,
  "function_name": "loca_zero_shot_counting",
  }
- resp_data = send_inference_request(data, "loca", v2=True)
+ resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
  resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
  return resp_data

@@ -469,6 +470,8 @@ def loca_visual_prompt_counting(

  Parameters:
  image (np.ndarray): The image that contains lot of instances of a single object
+ visual_prompt (Dict[str, List[float]]): Bounding box of the object in format
+ [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.

  Returns:
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
@@ -496,11 +499,109 @@ def loca_visual_prompt_counting(
  "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
  "function_name": "loca_visual_prompt_counting",
  }
- resp_data = send_inference_request(data, "loca", v2=True)
+ resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
  resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
  return resp_data


+ def countgd_counting(
+ prompt: str,
+ image: np.ndarray,
+ box_threshold: float = 0.23,
+ ) -> List[Dict[str, Any]]:
+ """'countgd_counting' is a tool that can precisely count multiple instances of an
+ object given a text prompt. It returns a list of bounding boxes with normalized
+ coordinates, label names and associated confidence scores.
+
+ Parameters:
+ prompt (str): The object that needs to be counted.
+ image (np.ndarray): The image that contains multiple instances of the object.
+ box_threshold (float, optional): The threshold for detection. Defaults
+ to 0.23.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+ bounding box of the detected objects with normalized coordinates between 0
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
+ bounding box.
+
+ Example
+ -------
+ >>> countgd_counting("flower", image)
+ [
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
+ {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
+ {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
+ ]
+ """
+ buffer_bytes = numpy_to_bytes(image)
+ files = [("image", buffer_bytes)]
+ prompt = prompt.replace(", ", " .")
+ payload = {"prompts": [prompt], "model": "countgd"}
+ metadata = {"function_name": "countgd_counting"}
+ resp_data = send_task_inference_request(
+ payload, "text-to-object-detection", files=files, metadata=metadata
+ )
+ bboxes_per_frame = resp_data[0]
+ bboxes_formatted = [ODResponseData(**bbox) for bbox in bboxes_per_frame]
+ filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+ return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
+ def countgd_example_based_counting(
+ visual_prompts: List[List[float]],
+ image: np.ndarray,
+ box_threshold: float = 0.23,
+ ) -> List[Dict[str, Any]]:
+ """'countgd_example_based_counting' is a tool that can precisely count multiple
+ instances of an object given few visual example prompts. It returns a list of bounding
+ boxes with normalized coordinates, label names and associated confidence scores.
+
+ Parameters:
+ visual_prompts (List[List[float]]): Bounding boxes of the object in format
+ [xmin, ymin, xmax, ymax]. Upto 3 bounding boxes can be provided.
+ image (np.ndarray): The image that contains multiple instances of the object.
+ box_threshold (float, optional): The threshold for detection. Defaults
+ to 0.23.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+ bounding box of the detected objects with normalized coordinates between 0
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
+ bounding box.
+
+ Example
+ -------
+ >>> countgd_example_based_counting(
+ visual_prompts=[[0.1, 0.1, 0.4, 0.42], [0.2, 0.3, 0.25, 0.35]],
+ image=image
+ )
+ [
+ {'score': 0.49, 'label': 'object', 'bounding_box': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 0.68, 'label': 'object', 'bounding_box': [0.2, 0.21, 0.45, 0.5},
+ {'score': 0.78, 'label': 'object', 'bounding_box': [0.3, 0.35, 0.48, 0.52},
+ {'score': 0.98, 'label': 'object', 'bounding_box': [0.44, 0.24, 0.49, 0.58},
+ ]
+ """
+ buffer_bytes = numpy_to_bytes(image)
+ files = [("image", buffer_bytes)]
+ visual_prompts = [
+ denormalize_bbox(bbox, image.shape[:2]) for bbox in visual_prompts
+ ]
+ payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
+ metadata = {"function_name": "countgd_example_based_counting"}
+ resp_data = send_task_inference_request(
+ payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
+ )
+ bboxes_per_frame = resp_data[0]
+ bboxes_formatted = [ODResponseData(**bbox) for bbox in bboxes_per_frame]
+ filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+ return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
  def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
  """'florence2_roberta_vqa' is a tool that takes an image and analyzes
  its contents, generates detailed captions and then tries to answer the given
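A minimal usage sketch of the new `countgd_counting` tool (hypothetical `people.jpg`; not part of the diff):

```python
import vision_agent.tools as T

# Sketch: text-prompted CountGD counting; boxes below the default 0.23 score
# threshold are filtered out before the list is returned.
image = T.load_image("people.jpg")
detections = T.countgd_counting("person", image)
print(f"counted {len(detections)} people")
```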
@@ -646,7 +747,7 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
  "tool": "closed_set_image_classification",
  "function_name": "clip",
  }
- resp_data = send_inference_request(data, "tools")
+ resp_data: dict[str, Any] = send_inference_request(data, "tools")
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
  return resp_data

@@ -674,7 +775,7 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
  "tool": "image_classification",
  "function_name": "vit_image_classification",
  }
- resp_data = send_inference_request(data, "tools")
+ resp_data: dict[str, Any] = send_inference_request(data, "tools")
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
  return resp_data

@@ -701,7 +802,9 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
  "image": image_b64,
  "function_name": "vit_nsfw_classification",
  }
- resp_data = send_inference_request(data, "nsfw-classification", v2=True)
+ resp_data: dict[str, Any] = send_inference_request(
+ data, "nsfw-classification", v2=True
+ )
  resp_data["score"] = round(resp_data["score"], 4)
  return resp_data

@@ -762,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
  return answer[task]  # type: ignore


- def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+ def florence2_phrase_grounding(
+ prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+ ) -> List[Dict[str, Any]]:
  """'florence2_phrase_grounding' is a tool that can detect multiple
  objects given a text prompt which can be object names or caption. You
  can optionally separate the object names in the text with commas. It returns a list
@@ -772,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
  Parameters:
  prompt (str): The prompt to ground to the image.
  image (np.ndarray): The image to used to detect objects
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.

  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -790,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
  """
  image_size = image.shape[:2]
  image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "task": "<CAPTION_TO_PHRASE_GROUNDING>",
- "prompt": prompt,
- "function_name": "florence2_phrase_grounding",
- }

- detections = send_inference_request(data, "florence2", v2=True)
+ if fine_tune_id is not None:
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+ if status is not JobStatus.SUCCEEDED:
+ raise FineTuneModelIsNotReady(
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
+ )
+
+ data_obj = Florence2FtRequest(
+ image=image_b64,
+ task=PromptTask.PHRASE_GROUNDING,
+ tool="florencev2_fine_tuning",
+ prompt=prompt,
+ fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ )
+ data = data_obj.model_dump(by_alias=True)
+ detections = send_inference_request(data, "tools", v2=False)
+ else:
+ data = {
+ "image": image_b64,
+ "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+ "prompt": prompt,
+ "function_name": "florence2_phrase_grounding",
+ }
+ detections = send_inference_request(data, "florence2", v2=True)
+

  detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
  return_data = []
@@ -1559,117 +1685,72 @@ def overlay_heat_map(
  return np.array(combined)


- # TODO: add this function to the imports so that is picked in the agent
- def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
- to detect objects in an image based on a given dataset. It returns the fine
- tuning job id.
+ def overlay_counting_results(
+ image: np.ndarray, instances: List[Dict[str, Any]]
+ ) -> np.ndarray:
+ """'overlay_counting_results' is a utility function that displays counting results on
+ an image.

  Parameters:
- bboxes (List[BboxInput]): A list of BboxInput containing the
- image path, labels and bounding boxes.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+ image (np.ndarray): The image to display the bounding boxes on.
+ instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+ box information of each instance

  Returns:
- UUID: The fine tuning job id, this id will used to retrieve the fine
- tuned model.
+ np.ndarray: The image with the instance_id dislpayed

  Example
  -------
- >>> fine_tuning_job_id = florencev2_fine_tuning(
- [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
- {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
- "OBJECT_DETECTION"
+ >>> image_with_bboxes = overlay_counting_results(
+ image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
  )
  """
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
- task_input = PromptTask[task]
- fine_tuning_request = [
- BboxInputBase64(
- image=convert_to_b64(bbox_input.image_path),
- filename=bbox_input.image_path.split("/")[-1],
- labels=bbox_input.labels,
- bboxes=bbox_input.bboxes,
- )
- for bbox_input in bboxes_input
- ]
- landing_api = LandingPublicAPI()
- return landing_api.launch_fine_tuning_job(
- "florencev2", task_input, fine_tuning_request
- )
-
+ pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
+ color = (158, 218, 229)

- # TODO: add this function to the imports so that is picked in the agent
- def florencev2_fine_tuned_object_detection(
- image: np.ndarray, prompt: str, model_id: UUID, task: str
- ) -> List[Dict[str, Any]]:
- """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
- to detect objects given a text prompt such as a phrase or class names separated by
- commas. It returns a list of detected objects as labels and their location as
- bounding boxes with score of 1.0.
+ width, height = pil_image.size
+ fontsize = max(10, int(min(width, height) / 80))
+ pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5)
+ draw = ImageDraw.Draw(pil_image)
+ font = ImageFont.truetype(
+ str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+ fontsize,
+ )

- Parameters:
- image (np.ndarray): The image to used to detect objects.
- prompt (str): The prompt to help find objects in the image.
- model_id (UUID): The fine-tuned model id.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+ for i, elt in enumerate(instances):
+ label = f"{i}"
+ box = elt["bbox"]

- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
- bounding box of the detected objects with normalized coordinates between 0
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
- top-left and xmax and ymax are the coordinates of the bottom-right of the
- bounding box. The scores are always 1.0 and cannot be thresholded
+ # denormalize the box if it is normalized
+ box = denormalize_bbox(box, (height, width))
+ x0, y0, x1, y1 = box
+ cx, cy = (x0 + x1) / 2, (y0 + y1) / 2

- Example
- -------
- >>> florencev2_fine_tuned_object_detection(
- image,
- 'person looking at a coyote',
- UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
+ text_box = draw.textbbox(
+ (cx, cy), text=label, font=font, align="center", anchor="mm"
  )
- [
- {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
- ]
- """
- # check if job succeeded first
- landing_api = LandingPublicAPI()
- status = landing_api.check_fine_tuning_job(model_id)
- if status is not JobStatus.SUCCEEDED:
- raise FineTuneModelIsNotReady()
-
- task = PromptTask[task]
- if task is PromptTask.OBJECT_DETECTION:
- prompt = ""
-
- data_obj = Florencev2FtRequest(
- image=convert_to_b64(image),
- task=task,
- tool="florencev2_fine_tuning",
- prompt=prompt,
- fine_tuning=FineTuning(job_id=model_id),
- )
- data = data_obj.model_dump(by_alias=True)
- metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
- detections = send_inference_request(
- data, "tools", v2=False, metadata_payload=metadata_payload
- )

- detections = detections[task.value]
- return_data = []
- image_size = image.shape[:2]
- for i in range(len(detections["bboxes"])):
- return_data.append(
- {
- "score": 1.0,
- "label": detections["labels"][i],
- "bbox": normalize_bbox(detections["bboxes"][i], image_size),
- }
+ # Calculate the offset to center the text within the bounding box
+ text_width = text_box[2] - text_box[0]
+ text_height = text_box[3] - text_box[1]
+ text_x0 = cx - text_width / 2
+ text_y0 = cy - text_height / 2
+ text_x1 = cx + text_width / 2
+ text_y1 = cy + text_height / 2
+
+ # Draw the rectangle encapsulating the text
+ draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color)
+
+ # Draw the text at the center of the bounding box
+ draw.text(
+ (text_x0, text_y0),
+ label,
+ fill="black",
+ font=font,
+ anchor="lt",
  )
- return return_data
+
+ return np.array(pil_image)


  FUNCTION_TOOLS = [
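The new counting pieces compose naturally; a hedged sketch (hypothetical `flowers.jpg`, not part of the diff):

```python
import vision_agent.tools as T

# Sketch: count with CountGD, then burn numbered markers into the image.
image = T.load_image("flowers.jpg")  # hypothetical local image
instances = T.countgd_counting("flower", image)
viz = T.overlay_counting_results(image, instances)
print(f"{len(instances)} instances marked; overlay shape: {viz.shape}")
```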
@@ -1679,8 +1760,7 @@ FUNCTION_TOOLS = [
  clip,
  vit_image_classification,
  vit_nsfw_classification,
- loca_zero_shot_counting,
- loca_visual_prompt_counting,
+ countgd_counting,
  florence2_image_caption,
  florence2_ocr,
  florence2_sam2_image,
@@ -1703,6 +1783,7 @@ UTIL_TOOLS = [
  overlay_bounding_boxes,
  overlay_segmentation_masks,
  overlay_heat_map,
+ overlay_counting_results,
  ]

  TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
@@ -1720,5 +1801,6 @@ UTILITIES_DOCSTRING = get_tool_documentation(
  overlay_bounding_boxes,
  overlay_segmentation_masks,
  overlay_heat_map,
+ overlay_counting_results,
  ]
  )
@@ -1,8 +1,8 @@
  from enum import Enum
- from typing import List, Optional, Tuple
  from uuid import UUID
+ from typing import List, Tuple, Optional, Union

- from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
+ from pydantic import BaseModel, ConfigDict, Field, field_serializer, SerializationInfo


  class BboxInput(BaseModel):
@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):


  class PromptTask(str, Enum):
- """
- Valid task prompts options for the Florencev2 model.
- """
+ """Valid task prompts options for the Florence2 model."""

- CAPTION = "<CAPTION>"
- """"""
- CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
- """"""
- OBJECT_DETECTION = "<OD>"
- """"""
+ PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


  class FineTuning(BaseModel):
@@ -41,7 +34,7 @@ class FineTuning(BaseModel):
  return str(job_id)


- class Florencev2FtRequest(BaseModel):
+ class Florence2FtRequest(BaseModel):
  model_config = ConfigDict(populate_by_name=True)

  image: str
@@ -82,3 +75,16 @@ class JobStatus(str, Enum):
  SUCCEEDED = "SUCCEEDED"
  FAILED = "FAILED"
  STOPPED = "STOPPED"
+
+
+ class ODResponseData(BaseModel):
+ label: str
+ score: float
+ bbox: Union[list[int], list[float]] = Field(alias="bounding_box")
+
+ model_config = ConfigDict(
+ populate_by_name=True,
+ )
+
+
+ BoundingBoxes = list[ODResponseData]
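For reference, a small sketch of the new `ODResponseData` model and its `bounding_box` alias (values invented):

```python
from vision_agent.tools.tools_types import ODResponseData

# The service returns "bounding_box"; the model exposes it as .bbox, and
# populate_by_name=True means bbox= is accepted as well.
det = ODResponseData(label="person", score=0.91, bounding_box=[0.1, 0.2, 0.4, 0.6])
print(det.bbox)          # [0.1, 0.2, 0.4, 0.6]
print(det.model_dump())  # keys use the field name: 'bbox'
```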
@@ -564,7 +564,13 @@ class LocalCodeInterpreter(CodeInterpreter):
  ) -> None:
  super().__init__(timeout=timeout)
  self.nb = nbformat.v4.new_notebook()
- self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+ # Set the notebook execution path to the remote path
+ self.resources = {"metadata": {"path": str(self.remote_path)}}
+ self.nb_client = NotebookClient(
+ self.nb,
+ timeout=self.timeout,
+ resources=self.resources,
+ )
  _LOGGER.info(
  f"""Local code interpreter initialized
  Python version: {sys.version}
@@ -606,7 +612,9 @@ Timeout: {self.timeout}"""
  def restart_kernel(self) -> None:
  self.close()
  self.nb = nbformat.v4.new_notebook()
- self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+ self.nb_client = NotebookClient(
+ self.nb, timeout=self.timeout, resources=self.resources
+ )
  sleep(1)
  self._new_kernel()

@@ -636,7 +644,7 @@ Timeout: {self.timeout}"""
  f.write(contents)
  _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")

- return Path(self.remote_path / file_path)
+ return Path(self.remote_path / Path(file_path).name)

  def download_file(
  self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
@@ -672,7 +680,8 @@ class CodeInterpreterFactory:

  @staticmethod
  def new_instance(
- code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None
+ code_sandbox_runtime: Optional[str] = None,
+ remote_path: Optional[Union[str, Path]] = None,
  ) -> CodeInterpreter:
  if not code_sandbox_runtime:
  code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
@@ -181,7 +181,7 @@ def denormalize_bbox(
  raise ValueError("Bounding box must be of length 4.")

  arr = np.array(bbox)
- if np.all((arr >= 0) & (arr <= 1)):
+ if np.all((arr[:2] >= 0) & (arr[:2] <= 1)):
  x1, y1, x2, y2 = bbox
  x1 = round(x1 * image_size[1])
  y1 = round(y1 * image_size[0])
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.120
+ Version: 0.2.122
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -2,32 +2,32 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
  vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepovI,176
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
  vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
- vision_agent/agent/vision_agent.py,sha256=IEyXT_JPCuWmBHdEnM1Wrsj7hmCe5pKLf0gnZFJTddI,11046
- vision_agent/agent/vision_agent_coder.py,sha256=DOTmDdGPxcI06Jp6yx4ekRMP0vhiVaK9B9Dl8UyJHeo,34396
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=xIya1txRZM8qoQHAWTEkEFCL8L3iZD7QD09t3ZtdxSE,11305
- vision_agent/agent/vision_agent_prompts.py,sha256=0GliXFtBf32aPu2ClU63FI5ii5CTxWYsvrsmnnDp-gs,7134
+ vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
+ vision_agent/agent/vision_agent_coder.py,sha256=ujctkpmQkX2C6YXjlp7VLZFqSB00xwkGe-9swA8Gv8s,34240
+ vision_agent/agent/vision_agent_coder_prompts.py,sha256=Rg7-Ih7oFgFbHFFno0EHpaZEgm0SYj_nTdqqdp21YLo,11246
+ vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
  vision_agent/clients/landing_public_api.py,sha256=rGtACkr8o5egDuMHQ5MBO4NuvsgPTp9Ew3rbq4R-vs0,1507
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
- vision_agent/lmm/lmm.py,sha256=AYrZNdhghG293wd3aKZ1jK1lUm2NLWwALktbM4wNais,20862
+ vision_agent/lmm/lmm.py,sha256=H3a5V7c073-vXRJfQOblE2j_CsZkH1CNNRoQgLjJZuQ,20751
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
- vision_agent/tools/__init__.py,sha256=i7JOLxRaLdcY7-vCNOGAeOFMBfiAUIwWhnT32FO97VE,2201
- vision_agent/tools/meta_tools.py,sha256=Vu9WnKicGhafx9dPzDbQjQdcIzRCYYFPF68o79hDP-8,14616
+ vision_agent/tools/__init__.py,sha256=TILaqdFYicScvpnCXMxgBsFmSW22NQDIvucvEgo0etw,2289
+ vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tool_utils.py,sha256=qMsb9d8QtpXGgF9rpPO2dA390BewKdYO68oWKDu-TGg,6504
- vision_agent/tools/tools.py,sha256=kbbMToAaHxl42dDEvyz9Mvtpqts0l0hGoC5YQQyozr8,59953
- vision_agent/tools/tools_types.py,sha256=iLWSirheC87fKQolIhx_O4Jk8Lv7DRiLuE8PJqLGiVQ,2216
+ vision_agent/tools/tool_utils.py,sha256=e_p-G2nwgWOpoaqpDitY3FJ6fFuTEg5GhDOD67wI2bE,7527
+ vision_agent/tools/tools.py,sha256=jOBsuN-spY_2TlvpahoRYGvyInhQDTPXXukx9q72lEU,63454
+ vision_agent/tools/tools_types.py,sha256=qs11HGLRXc9zytahBtG6TQxCh8Gigvn232at3jk54jI,2356
  vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
- vision_agent/utils/execute.py,sha256=Ap8Yx80spQq5f2QtKGx1MK03BR45mJKhlp1kfh-rIao,26751
- vision_agent/utils/image_utils.py,sha256=eNghu_2L8624jEXy8ZZS9OX46Mv0DT9bcvLForujwTs,9848
+ vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4,27037
+ vision_agent/utils/image_utils.py,sha256=UloC4byIQLM4CSCaH41SBciQ7X2OqKvsVvNOVKqIH_k,9856
  vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
- vision_agent-0.2.120.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.120.dist-info/METADATA,sha256=-FuNdlrzt5cTK6Ou_HTTROGVvsIwP3trsB5Edt2St3o,12255
- vision_agent-0.2.120.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.120.dist-info/RECORD,,
+ vision_agent-0.2.122.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.122.dist-info/METADATA,sha256=WMdLNPyKY4Ot6ifOzwXNDiVm2TsStY-l-ge8t72Ynhk,12255
+ vision_agent-0.2.122.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.122.dist-info/RECORD,,