vision-agent 0.2.155__tar.gz → 0.2.156__tar.gz

Files changed (33)
  1. {vision_agent-0.2.155 → vision_agent-0.2.156}/PKG-INFO +1 -1
  2. {vision_agent-0.2.155 → vision_agent-0.2.156}/pyproject.toml +1 -1
  3. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/agent_utils.py +6 -0
  4. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/vision_agent.py +0 -2
  5. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_coder.py +7 -3
  6. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_prompts.py +7 -6
  7. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/__init__.py +0 -1
  8. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/meta_tools.py +3 -1
  9. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/tools.py +58 -59
  10. {vision_agent-0.2.155 → vision_agent-0.2.156}/LICENSE +0 -0
  11. {vision_agent-0.2.155 → vision_agent-0.2.156}/README.md +0 -0
  12. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/__init__.py +0 -0
  13. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  16. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/clients/__init__.py +0 -0
  17. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/clients/http.py +0 -0
  18. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/clients/landing_public_api.py +0 -0
  19. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/lmm/__init__.py +0 -0
  22. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/lmm/lmm.py +0 -0
  23. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/lmm/types.py +0 -0
  24. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/prompts.py +0 -0
  25. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/tool_utils.py +1 -1
  26. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/tools_types.py +1 -1
  27. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/__init__.py +0 -0
  28. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/exceptions.py +0 -0
  29. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/execute.py +0 -0
  30. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/image_utils.py +0 -0
  31. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/video.py +0 -0
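
In brief: this release bumps the version to 0.2.156. agent_utils.py adds a remove_installs_from_code helper that strips stray `!pip install` lines from generated code, and vision_agent_coder.py now runs both generated code and tests through it. The conversation prompts gain an explicit rule to keep agent responses short, with the example responses trimmed to match. vision_agent.py drops a redundant artifacts load/save after downloading artifacts. In tools.py, the public overlay_counting_results tool is removed in favor of an internal _plot_counting helper that overlay_bounding_boxes now calls automatically when a frame has more than 20 boxes; save_image and save_video gain input validation; the florence2_phrase_grounding docstring is reworded; and the now-unused ImageEnhance import is dropped. meta_tools.py clarifies the artifact-loaded message, and tool_utils.py and tools_types.py reorder imports.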
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.155
+Version: 0.2.156
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.155"
+version = "0.2.156"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
vision_agent/agent/agent_utils.py
@@ -77,3 +77,9 @@ def extract_code(code: str) -> str:
     if code.startswith("python\n"):
         code = code[len("python\n") :]
     return code
+
+
+def remove_installs_from_code(code: str) -> str:
+    pattern = r"\n!pip install.*?(\n|\Z)\n"
+    code = re.sub(pattern, "", code, flags=re.DOTALL)
+    return code
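
For context, a minimal standalone sketch of what the new helper does; it assumes `re` is already imported at the top of agent_utils.py, which the hunk does not show:

import re

def remove_installs_from_code(code: str) -> str:
    # Drop notebook-style "!pip install ..." lines that a model may emit into
    # generated code; they are invalid outside an IPython environment.
    pattern = r"\n!pip install.*?(\n|\Z)\n"
    return re.sub(pattern, "", code, flags=re.DOTALL)

snippet = "import cv2\n\n!pip install opencv-python\n\nprint('ok')\n"
print(remove_installs_from_code(snippet))  # the import and print lines survive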
vision_agent/agent/vision_agent.py
@@ -407,8 +407,6 @@ class VisionAgent(Agent):
             code_interpreter.download_file(
                 str(remote_artifacts_path.name), str(self.local_artifacts_path)
             )
-            artifacts.load(self.local_artifacts_path)
-            artifacts.save()
         return orig_chat, artifacts
 
     def streaming_message(self, message: Dict[str, Any]) -> None:
vision_agent/agent/vision_agent_coder.py
@@ -13,7 +13,11 @@ from tabulate import tabulate
 
 import vision_agent.tools as T
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_code, extract_json
+from vision_agent.agent.agent_utils import (
+    extract_code,
+    extract_json,
+    remove_installs_from_code,
+)
 from vision_agent.agent.vision_agent_coder_prompts import (
     CODE,
     FIX_BUG,
@@ -836,8 +840,8 @@ class VisionAgentCoder(Agent):
                 media=media_list,
             )
             success = cast(bool, results["success"])
-            code = cast(str, results["code"])
-            test = cast(str, results["test"])
+            code = remove_installs_from_code(cast(str, results["code"]))
+            test = remove_installs_from_code(cast(str, results["test"]))
             working_memory.extend(results["working_memory"]) # type: ignore
             plan.append({"code": code, "test": test, "plan": plan_i})
 
vision_agent/agent/vision_agent_prompts.py
@@ -28,7 +28,8 @@ Here is the current conversation so far:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
-4. **Output in JSON**: Respond in the following format in JSON:
+4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
+5. **Output in JSON**: Respond in the following format in JSON:
 
 ```json
 {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
@@ -62,7 +63,7 @@ OBSERVATION:
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
 
 
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected two dogs, do the results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE1_EXTRA = """
@@ -91,7 +92,7 @@ OBSERVATION:
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
 
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE2 = """
@@ -157,16 +158,16 @@ OBSERVATION:
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
 
 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
 
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
vision_agent/tools/__init__.py
@@ -45,7 +45,6 @@ from .tools import (
     loca_zero_shot_counting,
     ocr,
     overlay_bounding_boxes,
-    overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
vision_agent/tools/meta_tools.py
@@ -116,7 +116,9 @@ class Artifacts:
         )
         output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            output_str += f"Artifact {k} loaded to {str(loaded_path / k)}\n"
+            output_str += (
+                f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n"
+            )
         output_str += "[End of artifacts]\n"
         print(output_str)
         return output_str
vision_agent/tools/tools.py
@@ -13,7 +13,7 @@ from uuid import UUID
 import cv2
 import numpy as np
 import requests
-from PIL import Image, ImageDraw, ImageEnhance, ImageFont
+from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener # type: ignore
 from pytube import YouTube # type: ignore
 
@@ -1150,10 +1150,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
 def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding' will run florence2 on a image. It can
-    detect multiple objects given a text prompt which can be object names or caption.
-    You can optionally separate the object names in the text with commas. It returns
-    a list of bounding boxes with normalized coordinates, label names and associated
+    """'florence2_phrase_grounding' is a tool that can detect multiple
+    objects given a text prompt which can be object names or caption. You
+    can optionally separate the object names in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
 
     Parameters:
@@ -1812,6 +1812,11 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     """
     from IPython.display import display
 
+    if not isinstance(image, np.ndarray) or (
+        image.shape[0] == 0 and image.shape[1] == 0
+    ):
+        raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")
+
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
     display(pil_image)
     pil_image.save(file_path)
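
A standalone sketch of the new guard's behavior (`_validate_image` is an illustrative name; in the package the check is inlined in `save_image`). Note the `and`: only an array whose height and width are both zero is rejected.

import numpy as np

def _validate_image(image) -> None:
    # Mirrors the guard added to save_image in the hunk above.
    if not isinstance(image, np.ndarray) or (
        image.shape[0] == 0 and image.shape[1] == 0
    ):
        raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")

_validate_image(np.zeros((480, 640, 3), dtype=np.uint8))  # passes
_validate_image(np.zeros((0, 0, 3), dtype=np.uint8))      # raises ValueError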
@@ -1838,6 +1843,15 @@ def save_video(
     if fps <= 0:
         raise ValueError(f"fps must be greater than 0 got {fps}")
 
+    if not isinstance(frames, list) or len(frames) == 0:
+        raise ValueError("Frames must be a list of NumPy arrays")
+
+    for frame in frames:
+        if not isinstance(frame, np.ndarray) or (
+            frame.shape[0] == 0 and frame.shape[1] == 0
+        ):
+            raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)")
+
     if output_video_path is None:
         output_video_path = tempfile.NamedTemporaryFile(
             delete=False, suffix=".mp4"
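
With these checks, invalid input fails fast instead of producing a corrupt video file. A hedged usage sketch follows; the full save_video signature is not shown in this hunk, so the calls are left as comments:

import numpy as np
# from vision_agent.tools import save_video

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(24)]
# save_video(frames, fps=24.0)   # OK: non-empty list of H x W x C arrays
# save_video([], fps=24.0)       # raises: "Frames must be a list of NumPy arrays"
# save_video(frames, fps=0)      # raises: fps must be greater than 0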
@@ -1907,30 +1921,36 @@ def overlay_bounding_boxes(
         bboxes = bbox_int[i]
         bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
 
-        width, height = pil_image.size
-        fontsize = max(12, int(min(width, height) / 40))
-        draw = ImageDraw.Draw(pil_image)
-        font = ImageFont.truetype(
-            str(
-                resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")
-            ),
-            fontsize,
-        )
-
-        for elt in bboxes:
-            label = elt["label"]
-            box = elt["bbox"]
-            scores = elt["score"]
-
-            # denormalize the box if it is normalized
-            box = denormalize_bbox(box, (height, width))
-            draw.rectangle(box, outline=color[label], width=4)
-            text = f"{label}: {scores:.2f}"
-            text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
-            draw.rectangle(
-                (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+        if len(bboxes) > 20:
+            pil_image = _plot_counting(pil_image, bboxes, color)
+        else:
+            width, height = pil_image.size
+            fontsize = max(12, int(min(width, height) / 40))
+            draw = ImageDraw.Draw(pil_image)
+            font = ImageFont.truetype(
+                str(
+                    resources.files("vision_agent.fonts").joinpath(
+                        "default_font_ch_en.ttf"
+                    )
+                ),
+                fontsize,
             )
-            draw.text((box[0], box[1]), text, fill="black", font=font)
+
+            for elt in bboxes:
+                label = elt["label"]
+                box = elt["bbox"]
+                scores = elt["score"]
+
+                # denormalize the box if it is normalized
+                box = denormalize_bbox(box, (height, width))
+                draw.rectangle(box, outline=color[label], width=4)
+                text = f"{label}: {scores:.2f}"
+                text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+                draw.rectangle(
+                    (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+                )
+                draw.text((box[0], box[1]), text, fill="black", font=font)
+
         frame_out.append(np.array(pil_image))
     return frame_out[0] if len(frame_out) == 1 else frame_out
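
The effect of the new branch: dense detection results no longer produce unreadable stacks of labeled rectangles. A hedged usage sketch from the caller's side; the public call shape `overlay_bounding_boxes(image, bboxes)` is assumed rather than shown in this hunk, so the call is left commented:

import numpy as np
# from vision_agent.tools import overlay_bounding_boxes

# 25 detections of one class: len(bboxes) > 20, so the per-frame loop above
# renders numbered markers via _plot_counting instead of labeled rectangles.
dets = [
    {"score": 0.99, "label": "apple", "bbox": [0.03 * i, 0.1, 0.03 * i + 0.04, 0.2]}
    for i in range(25)
]
# img = overlay_bounding_boxes(np.zeros((480, 640, 3), dtype=np.uint8), dets)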
@@ -2089,39 +2109,19 @@ def overlay_heat_map(
     return np.array(combined)
 
 
-def overlay_counting_results(
-    image: np.ndarray, instances: List[Dict[str, Any]]
-) -> np.ndarray:
-    """'overlay_counting_results' is a utility function that displays counting results on
-    an image.
-
-    Parameters:
-        image (np.ndarray): The image to display the bounding boxes on.
-        instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding
-            box information of each instance
-
-    Returns:
-        np.ndarray: The image with the instance_id dislpayed
-
-    Example
-    -------
-    >>> image_with_bboxes = overlay_counting_results(
-        image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
-    )
-    """
-    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
-    color = (158, 218, 229)
-
-    width, height = pil_image.size
+def _plot_counting(
+    image: Image.Image,
+    bboxes: List[Dict[str, Any]],
+    colors: Dict[str, Tuple[int, int, int]],
+) -> Image.Image:
+    width, height = image.size
     fontsize = max(10, int(min(width, height) / 80))
-    pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5)
-    draw = ImageDraw.Draw(pil_image)
+    draw = ImageDraw.Draw(image)
     font = ImageFont.truetype(
         str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
         fontsize,
     )
-
-    for i, elt in enumerate(instances, 1):
+    for i, elt in enumerate(bboxes, 1):
         label = f"{i}"
         box = elt["bbox"]
 
@@ -2143,7 +2143,7 @@ def overlay_counting_results(
         text_y1 = cy + text_height / 2
 
         # Draw the rectangle encapsulating the text
-        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color)
+        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]])
 
         # Draw the text at the center of the bounding box
         draw.text(
@@ -2154,7 +2154,7 @@
             anchor="lt",
         )
 
-    return np.array(pil_image)
+    return image
 
 
 FUNCTION_TOOLS = [
@@ -2187,7 +2187,6 @@ UTIL_TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
-    overlay_counting_results,
 ]
 
 TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
vision_agent/tools/tool_utils.py
@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
vision_agent/tools/tools_types.py
@@ -1,6 +1,6 @@
 from enum import Enum
-from uuid import UUID
 from typing import List, Optional, Tuple, Union
+from uuid import UUID
 
 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 