vision-agent 0.2.120__tar.gz → 0.2.122__tar.gz

Files changed (33)
  1. {vision_agent-0.2.120 → vision_agent-0.2.122}/PKG-INFO +1 -1
  2. {vision_agent-0.2.120 → vision_agent-0.2.122}/pyproject.toml +1 -1
  3. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/agent/vision_agent.py +10 -6
  4. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/agent/vision_agent_coder.py +1 -9
  5. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/agent/vision_agent_coder_prompts.py +4 -5
  6. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/agent/vision_agent_prompts.py +3 -3
  7. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/lmm/lmm.py +0 -3
  8. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/tools/__init__.py +3 -0
  9. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/tools/meta_tools.py +140 -8
  10. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/tools/tool_utils.py +95 -51
  11. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/tools/tools.py +196 -114
  12. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/tools/tools_types.py +18 -12
  13. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/utils/execute.py +13 -4
  14. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/utils/image_utils.py +1 -1
  15. {vision_agent-0.2.120 → vision_agent-0.2.122}/LICENSE +0 -0
  16. {vision_agent-0.2.120 → vision_agent-0.2.122}/README.md +0 -0
  17. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/__init__.py +0 -0
  18. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/agent/__init__.py +0 -0
  19. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/agent/agent.py +0 -0
  20. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/agent/agent_utils.py +0 -0
  21. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/clients/__init__.py +0 -0
  22. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/clients/http.py +0 -0
  23. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/clients/landing_public_api.py +0 -0
  24. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/fonts/__init__.py +0 -0
  25. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  26. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/lmm/__init__.py +0 -0
  27. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/lmm/types.py +0 -0
  28. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/tools/prompts.py +0 -0
  29. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.120 → vision_agent-0.2.122}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.120
+ Version: 0.2.122
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.120"
+ version = "0.2.122"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
@@ -30,7 +30,7 @@ class BoilerplateCode:
  pre_code = [
  "from typing import *",
  "from vision_agent.utils.execute import CodeInterpreter",
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
  "artifacts = Artifacts('{remote_path}')",
  "artifacts.load('{remote_path}')",
  ]
@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:

  def run_code_action(
  code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
- ) -> Execution:
- return code_interpreter.exec_isolation(
+ ) -> Tuple[Execution, str]:
+ result = code_interpreter.exec_isolation(
  BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
  )

+ obs = str(result.logs)
+ if result.error:
+ obs += f"\n{result.error}"
+ return result, obs
+

  def parse_execution(response: str) -> Optional[str]:
  code = None
@@ -192,7 +197,7 @@ class VisionAgent(Agent):
  artifacts = Artifacts(WORKSPACE / "artifacts.pkl")

  with CodeInterpreterFactory.new_instance(
- code_sandbox_runtime=self.code_sandbox_runtime
+ code_sandbox_runtime=self.code_sandbox_runtime,
  ) as code_interpreter:
  orig_chat = copy.deepcopy(chat)
  int_chat = copy.deepcopy(chat)
@@ -260,10 +265,9 @@ class VisionAgent(Agent):
  code_action = parse_execution(response["response"])

  if code_action is not None:
- result = run_code_action(
+ result, obs = run_code_action(
  code_action, code_interpreter, str(remote_artifacts_path)
  )
- obs = str(result.logs)

  if self.verbosity >= 1:
  _LOGGER.info(obs)
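
The reworked `run_code_action` above now returns the execution result together with the observation string the agent logs, folding any error into that string. A minimal standalone sketch of the new contract (the `FakeExecution` stub below is illustrative, not part of the package):

```python
from typing import NamedTuple, Optional


class FakeExecution(NamedTuple):
    """Stand-in for vision_agent.utils.execute.Execution (logs plus optional error)."""
    logs: str
    error: Optional[str] = None


def build_observation(result: FakeExecution) -> str:
    # Mirrors the 0.2.122 behavior: logs first, then the error appended, if any.
    obs = str(result.logs)
    if result.error:
        obs += f"\n{result.error}"
    return obs


print(build_observation(FakeExecution(logs="3 dogs detected")))
print(build_observation(FakeExecution(logs="", error="NameError: name 'x' is not defined")))
```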
@@ -1,5 +1,4 @@
  import copy
- import difflib
  import logging
  import os
  import sys
@@ -29,6 +28,7 @@ from vision_agent.agent.vision_agent_coder_prompts import (
  USER_REQ,
  )
  from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+ from vision_agent.tools.meta_tools import get_diff
  from vision_agent.utils import CodeInterpreterFactory, Execution
  from vision_agent.utils.execute import CodeInterpreter
  from vision_agent.utils.image_utils import b64_to_pil
@@ -63,14 +63,6 @@ class DefaultImports:
  return DefaultImports.to_code_string() + "\n\n" + code


- def get_diff(before: str, after: str) -> str:
- return "".join(
- difflib.unified_diff(
- before.splitlines(keepends=True), after.splitlines(keepends=True)
- )
- )
-
-
  def format_memory(memory: List[Dict[str, str]]) -> str:
  output_str = ""
  for i, m in enumerate(memory):
@@ -81,20 +81,19 @@ plan2:
  - Count the number of detected objects labeled as 'person'.
  plan3:
  - Load the image from the provided file path 'image.jpg'.
- - Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people.
+ - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.

  ```python
- from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting
+ from vision_agent.tools import load_image, owl_v2, grounding_sam, countgd_counting
  image = load_image("image.jpg")
  owl_v2_out = owl_v2("person", image)

  gsam_out = grounding_sam("person", image)
  gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]

- loca_out = loca_zero_shot_counting(image)
- loca_out = loca_out["count"]
+ cgd_out = countgd_counting(image)

- final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "loca_zero_shot_counting": loca_out}}
+ final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "countgd_counting": cgd_out}}
  print(final_out)
  ```
  """
@@ -48,7 +48,7 @@ OBSERVATION:
  4| return dogs
  [End of artifact]

- AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -75,7 +75,7 @@ OBSERVATION:
  4| return dogs
  [End of artifact]

- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -126,7 +126,7 @@ OBSERVATION:
  15| return count
  [End of artifact]

- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -286,9 +286,6 @@ class OpenAILMM(LMM):

  return lambda x: T.grounding_sam(params["prompt"], x)

- def generate_zero_shot_counter(self, question: str) -> Callable:
- return T.loca_zero_shot_counting
-
  def generate_image_qa_tool(self, question: str) -> Callable:
  return lambda x: T.git_vqa_v2(question, x)

@@ -37,10 +37,13 @@ from .tools import (
  load_image,
  loca_visual_prompt_counting,
  loca_zero_shot_counting,
+ countgd_counting,
+ countgd_example_based_counting,
  ocr,
  overlay_bounding_boxes,
  overlay_heat_map,
  overlay_segmentation_masks,
+ overlay_counting_results,
  owl_v2,
  save_image,
  save_json,
@@ -1,5 +1,7 @@
+ import difflib
  import os
  import pickle as pkl
+ import re
  import subprocess
  import tempfile
  from pathlib import Path
@@ -8,10 +10,13 @@ from typing import Any, Dict, List, Optional, Union
  from IPython.display import display

  import vision_agent as va
+ from vision_agent.clients.landing_public_api import LandingPublicAPI
  from vision_agent.lmm.types import Message
  from vision_agent.tools.tool_utils import get_tool_documentation
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+ from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
  from vision_agent.utils.execute import Execution, MimeType
+ from vision_agent.utils.image_utils import convert_to_b64

  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

@@ -99,13 +104,14 @@ class Artifacts:

  def show(self) -> str:
  """Shows the artifacts that have been loaded and their remote save paths."""
- out_str = "[Artifacts loaded]\n"
+ output_str = "[Artifacts loaded]\n"
  for k in self.artifacts.keys():
- out_str += (
+ output_str += (
  f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
  )
- out_str += "[End of artifacts]\n"
- return out_str
+ output_str += "[End of artifacts]\n"
+ print(output_str)
+ return output_str

  def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
  save_path = (
@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:


  def view_lines(
- lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
+ lines: List[str],
+ line_num: int,
+ window_size: int,
+ name: str,
+ total_lines: int,
+ print_output: bool = True,
  ) -> str:
  start = max(0, line_num - window_size)
  end = min(len(lines), line_num + window_size)
@@ -148,7 +159,9 @@ def view_lines(
  else f"[{len(lines) - end} more lines]"
  )
  )
- print(return_str)
+
+ if print_output:
+ print(return_str)
  return return_str


@@ -231,7 +244,7 @@ def edit_code_artifact(
  new_content_lines = [
  line if line.endswith("\n") else line + "\n" for line in new_content_lines
  ]
- lines = artifacts[name].splitlines()
+ lines = artifacts[name].splitlines(keepends=True)
  edited_lines = lines[:start] + new_content_lines + lines[end:]

  cur_line = start + len(content.split("\n")) // 2
@@ -261,13 +274,20 @@ def edit_code_artifact(
  DEFAULT_WINDOW_SIZE,
  name,
  total_lines,
+ print_output=False,
  )
  total_lines_edit = sum(1 for _ in edited_lines)
  edited_view = view_lines(
- edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+ edited_lines,
+ cur_line,
+ DEFAULT_WINDOW_SIZE,
+ name,
+ total_lines_edit,
+ print_output=False,
  )

  error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+ print(error_msg)
  return error_msg

  artifacts[name] = "".join(edited_lines)
@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
  return f"[Media {Path(local_path).name} saved]"


+ def list_artifacts(artifacts: Artifacts) -> str:
+ """Lists all the artifacts that have been loaded into the artifacts object."""
+ output_str = artifacts.show()
+ print(output_str)
+ return output_str
+
+
  def get_tool_descriptions() -> str:
  """Returns a description of all the tools that `generate_vision_code` has access to.
  Helpful for answering questions about what types of vision tasks you can do with
@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
  return TOOL_DESCRIPTIONS


+ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+ """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+ objects in an image based on a given dataset. It returns the fine tuning job id.
+
+ Parameters:
+ bboxes (List[BboxInput]): A list of BboxInput containing the
+ image path, labels and bounding boxes.
+ task (str): The florencev2 fine-tuning task. The options are
+ 'phrase_grounding'.
+
+ Returns:
+ UUID: The fine tuning job id, this id will used to retrieve the fine
+ tuned model.
+
+ Example
+ -------
+ >>> fine_tuning_job_id = florencev2_fine_tuning(
+ [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+ {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+ "phrase_grounding"
+ )
+ """
+ bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+ task_type = PromptTask[task.upper()]
+ fine_tuning_request = [
+ BboxInputBase64(
+ image=convert_to_b64(bbox_input.image_path),
+ filename=Path(bbox_input.image_path).name,
+ labels=bbox_input.labels,
+ bboxes=bbox_input.bboxes,
+ )
+ for bbox_input in bboxes_input
+ ]
+ landing_api = LandingPublicAPI()
+ fine_tune_id = str(
+ landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+ )
+ print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+ return fine_tune_id
+
+
+ def get_diff(before: str, after: str) -> str:
+ return "".join(
+ difflib.unified_diff(
+ before.splitlines(keepends=True), after.splitlines(keepends=True)
+ )
+ )
+
+
+ def use_florence2_fine_tuning(
+ artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+ ) -> str:
+ """Replaces florence2 calls with the fine tuning id. This ensures that the code
+ utilizes the fined tuned florence2 model. Returns the diff between the original
+ code and the new code.
+
+ Parameters:
+ artifacts (Artifacts): The artifacts object to edit the code from.
+ name (str): The name of the artifact to edit.
+ task (str): The task to fine tune the model for. The options are
+ 'phrase_grounding'.
+ fine_tune_id (str): The fine tuning job id.
+
+ Examples
+ --------
+ >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+ """
+
+ task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+ if name not in artifacts:
+ output_str = f"[Artifact {name} does not exist]"
+ print(output_str)
+ return output_str
+
+ code = artifacts[name]
+ if task.lower() == "phrase_grounding":
+ pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
+
+ def replacer(match: re.Match) -> str:
+ arg = match.group(1) # capture all initial arguments
+ return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
+
+ else:
+ raise ValueError(f"Task {task} is not supported.")
+
+ new_code = re.sub(pattern, replacer, code)
+
+ if new_code == code:
+ output_str = (
+ f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
+ )
+ print(output_str)
+ return output_str
+
+ artifacts[name] = new_code
+
+ diff = get_diff(code, new_code)
+ print(diff)
+ return diff
+
+
  META_TOOL_DOCSTRING = get_tool_documentation(
  [
  get_tool_descriptions,
@@ -406,5 +535,8 @@ META_TOOL_DOCSTRING = get_tool_documentation(
  generate_vision_code,
  edit_vision_code,
  write_media_artifact,
+ florence2_fine_tuning,
+ use_florence2_fine_tuning,
+ list_artifacts,
  ]
  )
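
Taken together, the new meta tools support a fine-tune-then-rewrite loop: launch a Florence2 phrase-grounding job, then patch an existing code artifact so it calls the fine-tuned model. A hedged usage sketch based on the docstrings above (the artifact name `code.py`, the `artifacts.pkl` path, and `filename.png` are placeholders, and both calls go through the LandingAI API, so an API key is required):

```python
from vision_agent.tools.meta_tools import (
    Artifacts,
    florence2_fine_tuning,
    use_florence2_fine_tuning,
)

# Load an existing artifacts store (path is a placeholder).
artifacts = Artifacts("artifacts.pkl")
artifacts.load("artifacts.pkl")

# Launch a phrase-grounding fine-tuning job from labeled bounding boxes.
job_id = florence2_fine_tuning(
    [
        {"image_path": "filename.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]},
        {"image_path": "filename.png", "labels": ["screw"], "bboxes": [[120, 0, 300, 170]]},
    ],
    "phrase_grounding",
)

# Rewrite florence2_phrase_grounding(...) calls inside the artifact so they pass
# the job id, and get back the unified diff of that edit.
diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", job_id)
```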
@@ -1,6 +1,6 @@
+ import os
  import inspect
  import logging
- import os
  from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

  import pandas as pd
@@ -13,6 +13,7 @@ from urllib3.util.retry import Retry
  from vision_agent.utils.exceptions import RemoteToolCallFailed
  from vision_agent.utils.execute import Error, MimeType
  from vision_agent.utils.type_defs import LandingaiAPIKey
+ from vision_agent.tools.tools_types import BoundingBoxes

  _LOGGER = logging.getLogger(__name__)
  _LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
@@ -34,61 +35,58 @@ def send_inference_request(
  files: Optional[List[Tuple[Any, ...]]] = None,
  v2: bool = False,
  metadata_payload: Optional[Dict[str, Any]] = None,
- ) -> Dict[str, Any]:
+ ) -> Any:
  # TODO: runtime_tag and function_name should be metadata_payload and now included
  # in the service payload
- try:
- if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
- payload["runtime_tag"] = runtime_tag
+ if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
+ payload["runtime_tag"] = runtime_tag
+
+ url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
+ if "TOOL_ENDPOINT_URL" in os.environ:
+ url = os.environ["TOOL_ENDPOINT_URL"]
+
+ headers = {"apikey": _LND_API_KEY}
+ if "TOOL_ENDPOINT_AUTH" in os.environ:
+ headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
+ headers.pop("apikey")
+
+ session = _create_requests_session(
+ url=url,
+ num_retry=3,
+ headers=headers,
+ )

- url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
- if "TOOL_ENDPOINT_URL" in os.environ:
- url = os.environ["TOOL_ENDPOINT_URL"]
+ function_name = "unknown"
+ if "function_name" in payload:
+ function_name = payload["function_name"]
+ elif metadata_payload is not None and "function_name" in metadata_payload:
+ function_name = metadata_payload["function_name"]

- tool_call_trace = ToolCallTrace(
- endpoint_url=url,
- request=payload,
- response={},
- error=None,
- )
- headers = {"apikey": _LND_API_KEY}
- if "TOOL_ENDPOINT_AUTH" in os.environ:
- headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
- headers.pop("apikey")
-
- session = _create_requests_session(
- url=url,
- num_retry=3,
- headers=headers,
- )
+ response = _call_post(url, payload, session, files, function_name)

- if files is not None:
- res = session.post(url, data=payload, files=files)
- else:
- res = session.post(url, json=payload)
- if res.status_code != 200:
- tool_call_trace.error = Error(
- name="RemoteToolCallFailed",
- value=f"{res.status_code} - {res.text}",
- traceback_raw=[],
- )
- _LOGGER.error(f"Request failed: {res.status_code} {res.text}")
- # TODO: function_name should be in metadata_payload
- function_name = "unknown"
- if "function_name" in payload:
- function_name = payload["function_name"]
- elif metadata_payload is not None and "function_name" in metadata_payload:
- function_name = metadata_payload["function_name"]
- raise RemoteToolCallFailed(function_name, res.status_code, res.text)
-
- resp = res.json()
- tool_call_trace.response = resp
- # TODO: consider making the response schema the same between below two sources
- return resp if "TOOL_ENDPOINT_AUTH" in os.environ else resp["data"] # type: ignore
- finally:
- trace = tool_call_trace.model_dump()
- trace["type"] = "tool_call"
- display({MimeType.APPLICATION_JSON: trace}, raw=True)
+ # TODO: consider making the response schema the same between below two sources
+ return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
+
+
+ def send_task_inference_request(
+ payload: Dict[str, Any],
+ task_name: str,
+ files: Optional[List[Tuple[Any, ...]]] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> Any:
+ url = f"{_LND_API_URL_v2}/{task_name}"
+ headers = {"apikey": _LND_API_KEY}
+ session = _create_requests_session(
+ url=url,
+ num_retry=3,
+ headers=headers,
+ )
+
+ function_name = "unknown"
+ if metadata is not None and "function_name" in metadata:
+ function_name = metadata["function_name"]
+ response = _call_post(url, payload, session, files, function_name)
+ return response["data"]


  def _create_requests_session(
@@ -195,3 +193,49 @@ def get_tools_info(funcs: List[Callable[..., Any]]) -> Dict[str, str]:
  data[func.__name__] = f"{func.__name__}{inspect.signature(func)}:\n{desc}"

  return data
+
+
+ def _call_post(
+ url: str,
+ payload: dict[str, Any],
+ session: Session,
+ files: Optional[List[Tuple[Any, ...]]] = None,
+ function_name: str = "unknown",
+ ) -> Any:
+ try:
+ tool_call_trace = ToolCallTrace(
+ endpoint_url=url,
+ request=payload,
+ response={},
+ error=None,
+ )
+
+ if files is not None:
+ response = session.post(url, data=payload, files=files)
+ else:
+ response = session.post(url, json=payload)
+
+ if response.status_code != 200:
+ tool_call_trace.error = Error(
+ name="RemoteToolCallFailed",
+ value=f"{response.status_code} - {response.text}",
+ traceback_raw=[],
+ )
+ _LOGGER.error(f"Request failed: {response.status_code} {response.text}")
+ raise RemoteToolCallFailed(
+ function_name, response.status_code, response.text
+ )
+
+ result = response.json()
+ tool_call_trace.response = result
+ return result
+ finally:
+ trace = tool_call_trace.model_dump()
+ trace["type"] = "tool_call"
+ display({MimeType.APPLICATION_JSON: trace}, raw=True)
+
+
+ def filter_bboxes_by_threshold(
+ bboxes: BoundingBoxes, threshold: float
+ ) -> BoundingBoxes:
+ return list(filter(lambda bbox: bbox.score >= threshold, bboxes))
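
`filter_bboxes_by_threshold` is a plain score filter over the `BoundingBoxes` alias defined in `tools_types.py` further down. A small sketch of how the new CountGD tools use it (assuming the 0.2.122 package is installed):

```python
from vision_agent.tools.tool_utils import filter_bboxes_by_threshold
from vision_agent.tools.tools_types import ODResponseData

detections = [
    ODResponseData(label="flower", score=0.12, bounding_box=[0.10, 0.10, 0.20, 0.20]),
    ODResponseData(label="flower", score=0.81, bounding_box=[0.40, 0.40, 0.60, 0.60]),
]

# Only the 0.81 detection survives the default CountGD threshold of 0.23.
kept = filter_bboxes_by_threshold(detections, 0.23)
print([d.model_dump() for d in kept])
```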
@@ -13,7 +13,7 @@ import cv2
  import numpy as np
  import requests
  from moviepy.editor import ImageSequenceClip
- from PIL import Image, ImageDraw, ImageFont
+ from PIL import Image, ImageDraw, ImageFont, ImageEnhance
  from pillow_heif import register_heif_opener # type: ignore
  from pytube import YouTube # type: ignore

@@ -24,14 +24,15 @@ from vision_agent.tools.tool_utils import (
  get_tools_df,
  get_tools_info,
  send_inference_request,
+ send_task_inference_request,
+ filter_bboxes_by_threshold,
  )
  from vision_agent.tools.tools_types import (
- BboxInput,
- BboxInputBase64,
  FineTuning,
- Florencev2FtRequest,
+ Florence2FtRequest,
  JobStatus,
  PromptTask,
+ ODResponseData,
  )
  from vision_agent.utils import extract_frames_from_video
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
@@ -455,7 +456,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
  "image": image_b64,
  "function_name": "loca_zero_shot_counting",
  }
- resp_data = send_inference_request(data, "loca", v2=True)
+ resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
  resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
  return resp_data

@@ -469,6 +470,8 @@ def loca_visual_prompt_counting(

  Parameters:
  image (np.ndarray): The image that contains lot of instances of a single object
+ visual_prompt (Dict[str, List[float]]): Bounding box of the object in format
+ [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.

  Returns:
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
@@ -496,11 +499,109 @@ def loca_visual_prompt_counting(
  "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
  "function_name": "loca_visual_prompt_counting",
  }
- resp_data = send_inference_request(data, "loca", v2=True)
+ resp_data: dict[str, Any] = send_inference_request(data, "loca", v2=True)
  resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
  return resp_data


+ def countgd_counting(
+ prompt: str,
+ image: np.ndarray,
+ box_threshold: float = 0.23,
+ ) -> List[Dict[str, Any]]:
+ """'countgd_counting' is a tool that can precisely count multiple instances of an
+ object given a text prompt. It returns a list of bounding boxes with normalized
+ coordinates, label names and associated confidence scores.
+
+ Parameters:
+ prompt (str): The object that needs to be counted.
+ image (np.ndarray): The image that contains multiple instances of the object.
+ box_threshold (float, optional): The threshold for detection. Defaults
+ to 0.23.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+ bounding box of the detected objects with normalized coordinates between 0
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
+ bounding box.
+
+ Example
+ -------
+ >>> countgd_counting("flower", image)
+ [
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5},
+ {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52},
+ {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
+ ]
+ """
+ buffer_bytes = numpy_to_bytes(image)
+ files = [("image", buffer_bytes)]
+ prompt = prompt.replace(", ", " .")
+ payload = {"prompts": [prompt], "model": "countgd"}
+ metadata = {"function_name": "countgd_counting"}
+ resp_data = send_task_inference_request(
+ payload, "text-to-object-detection", files=files, metadata=metadata
+ )
+ bboxes_per_frame = resp_data[0]
+ bboxes_formatted = [ODResponseData(**bbox) for bbox in bboxes_per_frame]
+ filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+ return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
+ def countgd_example_based_counting(
+ visual_prompts: List[List[float]],
+ image: np.ndarray,
+ box_threshold: float = 0.23,
+ ) -> List[Dict[str, Any]]:
+ """'countgd_example_based_counting' is a tool that can precisely count multiple
+ instances of an object given few visual example prompts. It returns a list of bounding
+ boxes with normalized coordinates, label names and associated confidence scores.
+
+ Parameters:
+ visual_prompts (List[List[float]]): Bounding boxes of the object in format
+ [xmin, ymin, xmax, ymax]. Upto 3 bounding boxes can be provided.
+ image (np.ndarray): The image that contains multiple instances of the object.
+ box_threshold (float, optional): The threshold for detection. Defaults
+ to 0.23.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+ bounding box of the detected objects with normalized coordinates between 0
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
+ bounding box.
+
+ Example
+ -------
+ >>> countgd_example_based_counting(
+ visual_prompts=[[0.1, 0.1, 0.4, 0.42], [0.2, 0.3, 0.25, 0.35]],
+ image=image
+ )
+ [
+ {'score': 0.49, 'label': 'object', 'bounding_box': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 0.68, 'label': 'object', 'bounding_box': [0.2, 0.21, 0.45, 0.5},
+ {'score': 0.78, 'label': 'object', 'bounding_box': [0.3, 0.35, 0.48, 0.52},
+ {'score': 0.98, 'label': 'object', 'bounding_box': [0.44, 0.24, 0.49, 0.58},
+ ]
+ """
+ buffer_bytes = numpy_to_bytes(image)
+ files = [("image", buffer_bytes)]
+ visual_prompts = [
+ denormalize_bbox(bbox, image.shape[:2]) for bbox in visual_prompts
+ ]
+ payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
+ metadata = {"function_name": "countgd_example_based_counting"}
+ resp_data = send_task_inference_request(
+ payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
+ )
+ bboxes_per_frame = resp_data[0]
+ bboxes_formatted = [ODResponseData(**bbox) for bbox in bboxes_per_frame]
+ filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+ return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
  def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
  """'florence2_roberta_vqa' is a tool that takes an image and analyzes
  its contents, generates detailed captions and then tries to answer the given
@@ -646,7 +747,7 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
  "tool": "closed_set_image_classification",
  "function_name": "clip",
  }
- resp_data = send_inference_request(data, "tools")
+ resp_data: dict[str, Any] = send_inference_request(data, "tools")
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
  return resp_data

@@ -674,7 +775,7 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
  "tool": "image_classification",
  "function_name": "vit_image_classification",
  }
- resp_data = send_inference_request(data, "tools")
+ resp_data: dict[str, Any] = send_inference_request(data, "tools")
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
  return resp_data

@@ -701,7 +802,9 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
  "image": image_b64,
  "function_name": "vit_nsfw_classification",
  }
- resp_data = send_inference_request(data, "nsfw-classification", v2=True)
+ resp_data: dict[str, Any] = send_inference_request(
+ data, "nsfw-classification", v2=True
+ )
  resp_data["score"] = round(resp_data["score"], 4)
  return resp_data

@@ -762,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
  return answer[task] # type: ignore


- def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+ def florence2_phrase_grounding(
+ prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+ ) -> List[Dict[str, Any]]:
  """'florence2_phrase_grounding' is a tool that can detect multiple
  objects given a text prompt which can be object names or caption. You
  can optionally separate the object names in the text with commas. It returns a list
@@ -772,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
  Parameters:
  prompt (str): The prompt to ground to the image.
  image (np.ndarray): The image to used to detect objects
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.

  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -790,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
  """
  image_size = image.shape[:2]
  image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "task": "<CAPTION_TO_PHRASE_GROUNDING>",
- "prompt": prompt,
- "function_name": "florence2_phrase_grounding",
- }

- detections = send_inference_request(data, "florence2", v2=True)
+ if fine_tune_id is not None:
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+ if status is not JobStatus.SUCCEEDED:
+ raise FineTuneModelIsNotReady(
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
+ )
+
+ data_obj = Florence2FtRequest(
+ image=image_b64,
+ task=PromptTask.PHRASE_GROUNDING,
+ tool="florencev2_fine_tuning",
+ prompt=prompt,
+ fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ )
+ data = data_obj.model_dump(by_alias=True)
+ detections = send_inference_request(data, "tools", v2=False)
+ else:
+ data = {
+ "image": image_b64,
+ "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+ "prompt": prompt,
+ "function_name": "florence2_phrase_grounding",
+ }
+ detections = send_inference_request(data, "florence2", v2=True)
+
  detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
  return_data = []
  for i in range(len(detections["bboxes"])):
@@ -1559,117 +1685,72 @@ def overlay_heat_map(
  return np.array(combined)


- # TODO: add this function to the imports so that is picked in the agent
- def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
- to detect objects in an image based on a given dataset. It returns the fine
- tuning job id.
+ def overlay_counting_results(
+ image: np.ndarray, instances: List[Dict[str, Any]]
+ ) -> np.ndarray:
+ """'overlay_counting_results' is a utility function that displays counting results on
+ an image.

  Parameters:
- bboxes (List[BboxInput]): A list of BboxInput containing the
- image path, labels and bounding boxes.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+ image (np.ndarray): The image to display the bounding boxes on.
+ instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+ box information of each instance

  Returns:
- UUID: The fine tuning job id, this id will used to retrieve the fine
- tuned model.
+ np.ndarray: The image with the instance_id dislpayed

  Example
  -------
- >>> fine_tuning_job_id = florencev2_fine_tuning(
- [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
- {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
- "OBJECT_DETECTION"
+ >>> image_with_bboxes = overlay_counting_results(
+ image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
  )
  """
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
- task_input = PromptTask[task]
- fine_tuning_request = [
- BboxInputBase64(
- image=convert_to_b64(bbox_input.image_path),
- filename=bbox_input.image_path.split("/")[-1],
- labels=bbox_input.labels,
- bboxes=bbox_input.bboxes,
- )
- for bbox_input in bboxes_input
- ]
- landing_api = LandingPublicAPI()
- return landing_api.launch_fine_tuning_job(
- "florencev2", task_input, fine_tuning_request
- )
-
+ pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
+ color = (158, 218, 229)

- # TODO: add this function to the imports so that is picked in the agent
- def florencev2_fine_tuned_object_detection(
- image: np.ndarray, prompt: str, model_id: UUID, task: str
- ) -> List[Dict[str, Any]]:
- """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
- to detect objects given a text prompt such as a phrase or class names separated by
- commas. It returns a list of detected objects as labels and their location as
- bounding boxes with score of 1.0.
+ width, height = pil_image.size
+ fontsize = max(10, int(min(width, height) / 80))
+ pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5)
+ draw = ImageDraw.Draw(pil_image)
+ font = ImageFont.truetype(
+ str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+ fontsize,
+ )

- Parameters:
- image (np.ndarray): The image to used to detect objects.
- prompt (str): The prompt to help find objects in the image.
- model_id (UUID): The fine-tuned model id.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+ for i, elt in enumerate(instances):
+ label = f"{i}"
+ box = elt["bbox"]

- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
- bounding box of the detected objects with normalized coordinates between 0
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
- top-left and xmax and ymax are the coordinates of the bottom-right of the
- bounding box. The scores are always 1.0 and cannot be thresholded
+ # denormalize the box if it is normalized
+ box = denormalize_bbox(box, (height, width))
+ x0, y0, x1, y1 = box
+ cx, cy = (x0 + x1) / 2, (y0 + y1) / 2

- Example
- -------
- >>> florencev2_fine_tuned_object_detection(
- image,
- 'person looking at a coyote',
- UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
+ text_box = draw.textbbox(
+ (cx, cy), text=label, font=font, align="center", anchor="mm"
  )
- [
- {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
- ]
- """
- # check if job succeeded first
- landing_api = LandingPublicAPI()
- status = landing_api.check_fine_tuning_job(model_id)
- if status is not JobStatus.SUCCEEDED:
- raise FineTuneModelIsNotReady()
-
- task = PromptTask[task]
- if task is PromptTask.OBJECT_DETECTION:
- prompt = ""
-
- data_obj = Florencev2FtRequest(
- image=convert_to_b64(image),
- task=task,
- tool="florencev2_fine_tuning",
- prompt=prompt,
- fine_tuning=FineTuning(job_id=model_id),
- )
- data = data_obj.model_dump(by_alias=True)
- metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
- detections = send_inference_request(
- data, "tools", v2=False, metadata_payload=metadata_payload
- )

- detections = detections[task.value]
- return_data = []
- image_size = image.shape[:2]
- for i in range(len(detections["bboxes"])):
- return_data.append(
- {
- "score": 1.0,
- "label": detections["labels"][i],
- "bbox": normalize_bbox(detections["bboxes"][i], image_size),
- }
+ # Calculate the offset to center the text within the bounding box
+ text_width = text_box[2] - text_box[0]
+ text_height = text_box[3] - text_box[1]
+ text_x0 = cx - text_width / 2
+ text_y0 = cy - text_height / 2
+ text_x1 = cx + text_width / 2
+ text_y1 = cy + text_height / 2
+
+ # Draw the rectangle encapsulating the text
+ draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color)
+
+ # Draw the text at the center of the bounding box
+ draw.text(
+ (text_x0, text_y0),
+ label,
+ fill="black",
+ font=font,
+ anchor="lt",
  )
- return return_data
+
+ return np.array(pil_image)


  FUNCTION_TOOLS = [
@@ -1679,8 +1760,7 @@ FUNCTION_TOOLS = [
  clip,
  vit_image_classification,
  vit_nsfw_classification,
- loca_zero_shot_counting,
- loca_visual_prompt_counting,
+ countgd_counting,
  florence2_image_caption,
  florence2_ocr,
  florence2_sam2_image,
@@ -1703,6 +1783,7 @@ UTIL_TOOLS = [
  overlay_bounding_boxes,
  overlay_segmentation_masks,
  overlay_heat_map,
+ overlay_counting_results,
  ]

  TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
@@ -1720,5 +1801,6 @@ UTILITIES_DOCSTRING = get_tool_documentation(
  overlay_bounding_boxes,
  overlay_segmentation_masks,
  overlay_heat_map,
+ overlay_counting_results,
  ]
  )
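
A short end-to-end sketch of the new counting path, based on the docstrings above: detect and count with `countgd_counting`, then render numbered markers with `overlay_counting_results`. The image path is a placeholder and the counting call goes through the hosted endpoint, so a LandingAI API key is required:

```python
from vision_agent.tools import load_image, countgd_counting, overlay_counting_results

image = load_image("flowers.jpg")  # placeholder path

# Bounding boxes with normalized coordinates, filtered at the default 0.23 threshold.
detections = countgd_counting("flower", image)
print(f"{len(detections)} flowers found")

# Dims the image and draws a numbered marker at the center of each detection.
viz = overlay_counting_results(image, detections)
```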
@@ -1,8 +1,8 @@
  from enum import Enum
- from typing import List, Optional, Tuple
  from uuid import UUID
+ from typing import List, Tuple, Optional, Union

- from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
+ from pydantic import BaseModel, ConfigDict, Field, field_serializer, SerializationInfo


  class BboxInput(BaseModel):
@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):


  class PromptTask(str, Enum):
- """
- Valid task prompts options for the Florencev2 model.
- """
+ """Valid task prompts options for the Florence2 model."""

- CAPTION = "<CAPTION>"
- """"""
- CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
- """"""
- OBJECT_DETECTION = "<OD>"
- """"""
+ PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


  class FineTuning(BaseModel):
@@ -41,7 +34,7 @@ class FineTuning(BaseModel):
  return str(job_id)


- class Florencev2FtRequest(BaseModel):
+ class Florence2FtRequest(BaseModel):
  model_config = ConfigDict(populate_by_name=True)

  image: str
@@ -82,3 +75,16 @@ class JobStatus(str, Enum):
  SUCCEEDED = "SUCCEEDED"
  FAILED = "FAILED"
  STOPPED = "STOPPED"
+
+
+ class ODResponseData(BaseModel):
+ label: str
+ score: float
+ bbox: Union[list[int], list[float]] = Field(alias="bounding_box")
+
+ model_config = ConfigDict(
+ populate_by_name=True,
+ )
+
+
+ BoundingBoxes = list[ODResponseData]
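
`ODResponseData` is what lets the service's `bounding_box` field surface as the `bbox` key the tools return. A small sketch of that alias behavior (pydantic v2 with `populate_by_name=True`):

```python
from vision_agent.tools.tools_types import ODResponseData

# The service payload uses "bounding_box"; populate_by_name also accepts "bbox".
det = ODResponseData(label="flower", score=0.81, bounding_box=[0.1, 0.1, 0.2, 0.2])

print(det.model_dump())               # keys: label, score, bbox
print(det.model_dump(by_alias=True))  # keys: label, score, bounding_box
```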
@@ -564,7 +564,13 @@ class LocalCodeInterpreter(CodeInterpreter):
  ) -> None:
  super().__init__(timeout=timeout)
  self.nb = nbformat.v4.new_notebook()
- self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+ # Set the notebook execution path to the remote path
+ self.resources = {"metadata": {"path": str(self.remote_path)}}
+ self.nb_client = NotebookClient(
+ self.nb,
+ timeout=self.timeout,
+ resources=self.resources,
+ )
  _LOGGER.info(
  f"""Local code interpreter initialized
  Python version: {sys.version}
@@ -606,7 +612,9 @@ Timeout: {self.timeout}"""
  def restart_kernel(self) -> None:
  self.close()
  self.nb = nbformat.v4.new_notebook()
- self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+ self.nb_client = NotebookClient(
+ self.nb, timeout=self.timeout, resources=self.resources
+ )
  sleep(1)
  self._new_kernel()

@@ -636,7 +644,7 @@ Timeout: {self.timeout}"""
  f.write(contents)
  _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")

- return Path(self.remote_path / file_path)
+ return Path(self.remote_path / Path(file_path).name)

  def download_file(
  self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
@@ -672,7 +680,8 @@ class CodeInterpreterFactory:

  @staticmethod
  def new_instance(
- code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None
+ code_sandbox_runtime: Optional[str] = None,
+ remote_path: Optional[Union[str, Path]] = None,
  ) -> CodeInterpreter:
  if not code_sandbox_runtime:
  code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
@@ -181,7 +181,7 @@ def denormalize_bbox(
  raise ValueError("Bounding box must be of length 4.")

  arr = np.array(bbox)
- if np.all((arr >= 0) & (arr <= 1)):
+ if np.all((arr[:2] >= 0) & (arr[:2] <= 1)):
  x1, y1, x2, y2 = bbox
  x1 = round(x1 * image_size[1])
  y1 = round(y1 * image_size[0])
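
The `denormalize_bbox` change relaxes the normalized-box check to look only at xmin and ymin. A standalone sketch of how that changes classification of a box that spills slightly past the right edge (the helper names below are illustrative, not from the package):

```python
import numpy as np


def is_normalized_old(bbox):
    arr = np.array(bbox)
    return bool(np.all((arr >= 0) & (arr <= 1)))


def is_normalized_new(bbox):
    arr = np.array(bbox)
    return bool(np.all((arr[:2] >= 0) & (arr[:2] <= 1)))


# A normalized box extending just past the right edge: the old check treated it
# as pixel coordinates, the new check still scales it by the image size.
bbox = [0.2, 0.3, 1.4, 0.9]
print(is_normalized_old(bbox), is_normalized_new(bbox))  # False True
```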