vision-agent 0.2.154__tar.gz → 0.2.156__tar.gz

Files changed (33)
  1. {vision_agent-0.2.154 → vision_agent-0.2.156}/PKG-INFO +1 -1
  2. {vision_agent-0.2.154 → vision_agent-0.2.156}/pyproject.toml +1 -1
  3. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/agent_utils.py +6 -0
  4. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/vision_agent.py +0 -2
  5. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_coder.py +7 -3
  6. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_prompts.py +7 -6
  7. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/__init__.py +0 -1
  8. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/meta_tools.py +3 -1
  9. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/tools.py +67 -64
  10. {vision_agent-0.2.154 → vision_agent-0.2.156}/LICENSE +0 -0
  11. {vision_agent-0.2.154 → vision_agent-0.2.156}/README.md +0 -0
  12. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/__init__.py +0 -0
  13. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  16. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/clients/__init__.py +0 -0
  17. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/clients/http.py +0 -0
  18. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/clients/landing_public_api.py +0 -0
  19. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/lmm/__init__.py +0 -0
  22. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/lmm/lmm.py +0 -0
  23. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/lmm/types.py +0 -0
  24. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/prompts.py +0 -0
  25. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/tool_utils.py +1 -1
  26. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/tools_types.py +1 -1
  27. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/__init__.py +0 -0
  28. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/exceptions.py +0 -0
  29. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/execute.py +0 -0
  30. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/image_utils.py +0 -0
  31. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/video.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.154
+Version: 0.2.156
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.154"
+version = "0.2.156"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
vision_agent/agent/agent_utils.py
@@ -77,3 +77,9 @@ def extract_code(code: str) -> str:
     if code.startswith("python\n"):
         code = code[len("python\n") :]
     return code
+
+
+def remove_installs_from_code(code: str) -> str:
+    pattern = r"\n!pip install.*?(\n|\Z)\n"
+    code = re.sub(pattern, "", code, flags=re.DOTALL)
+    return code
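
The new `remove_installs_from_code` helper strips stray "!pip install ..." lines that a model sometimes emits inside generated code. A minimal, self-contained sketch of its behavior, assuming `re` is already imported in agent_utils.py (the sample input below is illustrative only):

    import re

    def remove_installs_from_code(code: str) -> str:
        # Same regex as the helper above: with re.DOTALL it removes a
        # "!pip install ..." line along with the newline before it and
        # the blank line after it.
        pattern = r"\n!pip install.*?(\n|\Z)\n"
        return re.sub(pattern, "", code, flags=re.DOTALL)

    generated = "import cv2\n\n!pip install torch\n\nprint('ok')\n"
    print(remove_installs_from_code(generated))
    # import cv2
    # print('ok')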
vision_agent/agent/vision_agent.py
@@ -407,8 +407,6 @@ class VisionAgent(Agent):
         code_interpreter.download_file(
             str(remote_artifacts_path.name), str(self.local_artifacts_path)
         )
-        artifacts.load(self.local_artifacts_path)
-        artifacts.save()
         return orig_chat, artifacts
 
     def streaming_message(self, message: Dict[str, Any]) -> None:
vision_agent/agent/vision_agent_coder.py
@@ -13,7 +13,11 @@ from tabulate import tabulate
 
 import vision_agent.tools as T
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_code, extract_json
+from vision_agent.agent.agent_utils import (
+    extract_code,
+    extract_json,
+    remove_installs_from_code,
+)
 from vision_agent.agent.vision_agent_coder_prompts import (
     CODE,
     FIX_BUG,
@@ -836,8 +840,8 @@ class VisionAgentCoder(Agent):
                 media=media_list,
             )
            success = cast(bool, results["success"])
-            code = cast(str, results["code"])
-            test = cast(str, results["test"])
+            code = remove_installs_from_code(cast(str, results["code"]))
+            test = remove_installs_from_code(cast(str, results["test"]))
             working_memory.extend(results["working_memory"])  # type: ignore
             plan.append({"code": code, "test": test, "plan": plan_i})
 
vision_agent/agent/vision_agent_prompts.py
@@ -28,7 +28,8 @@ Here is the current conversation so far:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
-4. **Output in JSON**: Respond in the following format in JSON:
+4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
+5. **Output in JSON**: Respond in the following format in JSON:
 
 ```json
 {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
@@ -62,7 +63,7 @@ OBSERVATION:
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
 
 
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE1_EXTRA = """
@@ -91,7 +92,7 @@ OBSERVATION:
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
 
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE2 = """
@@ -157,16 +158,16 @@ OBSERVATION:
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
 
 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
 
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
vision_agent/tools/__init__.py
@@ -45,7 +45,6 @@ from .tools import (
     loca_zero_shot_counting,
     ocr,
     overlay_bounding_boxes,
-    overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
vision_agent/tools/meta_tools.py
@@ -116,7 +116,9 @@ class Artifacts:
         )
         output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            output_str += f"Artifact {k} loaded to {str(loaded_path / k)}\n"
+            output_str += (
+                f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n"
+            )
         output_str += "[End of artifacts]\n"
         print(output_str)
         return output_str
vision_agent/tools/tools.py
@@ -13,7 +13,7 @@ from uuid import UUID
 import cv2
 import numpy as np
 import requests
-from PIL import Image, ImageDraw, ImageEnhance, ImageFont
+from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
 
@@ -700,18 +700,22 @@ def countgd_counting(
            {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
        ]
     """
-    image_b64 = convert_to_b64(image)
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
     prompt = prompt.replace(", ", " .")
-    payload = {"prompt": prompt, "image": image_b64}
+    payload = {"prompts": [prompt], "model": "countgd"}
     metadata = {"function_name": "countgd_counting"}
-    resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
+    resp_data = send_task_inference_request(
+        payload, "text-to-object-detection", files=files, metadata=metadata
+    )
+    bboxes_per_frame = resp_data[0]
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
             score=round(bbox["score"], 2),
         )
-        for bbox in resp_data
+        for bbox in bboxes_per_frame
     ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
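
With this change `countgd_counting` uploads the image as multipart bytes to the shared "text-to-object-detection" endpoint instead of embedding base64 JSON, and the response becomes a list of detections per frame with boxes under a "bounding_box" key. A sketch of the new response handling, with a mocked payload whose exact shape is assumed from the hunk above:

    from typing import Any, Dict, List

    def parse_od_response(resp_data: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        # One list of detections per frame; a single image uses the first
        # (only) entry, and each detection stores its box under "bounding_box".
        bboxes_per_frame = resp_data[0]
        return [
            {
                "label": det["label"],
                "bbox": [round(x, 2) for x in det["bounding_box"]],
                "score": round(det["score"], 2),
            }
            for det in bboxes_per_frame
        ]

    # Mocked response, shape assumed from the diff:
    mock = [[{"label": "flower", "bounding_box": [0.44, 0.241, 0.49, 0.58], "score": 0.983}]]
    print(parse_od_response(mock))
    # [{'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58], 'score': 0.98}]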
vision_agent/tools/tools.py
@@ -1146,10 +1150,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
 def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding' will run florence2 on a image. It can
-    detect multiple objects given a text prompt which can be object names or caption.
-    You can optionally separate the object names in the text with commas. It returns
-    a list of bounding boxes with normalized coordinates, label names and associated
+    """'florence2_phrase_grounding' is a tool that can detect multiple
+    objects given a text prompt which can be object names or caption. You
+    can optionally separate the object names in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
 
     Parameters:
vision_agent/tools/tools.py
@@ -1808,6 +1812,11 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     """
     from IPython.display import display
 
+    if not isinstance(image, np.ndarray) or (
+        image.shape[0] == 0 and image.shape[1] == 0
+    ):
+        raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")
+
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
     display(pil_image)
     pil_image.save(file_path)
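
`save_image` now validates its input before handing it to PIL. A small sketch of what the new guard accepts and rejects (the `_validate_image` wrapper is hypothetical; the condition is copied from the hunk above):

    import numpy as np

    def _validate_image(image: object) -> None:
        # Hypothetical standalone version of the new check: reject
        # non-arrays, and arrays whose first two dimensions are both zero.
        if not isinstance(image, np.ndarray) or (
            image.shape[0] == 0 and image.shape[1] == 0
        ):
            raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")

    _validate_image(np.zeros((480, 640, 3), dtype=np.uint8))  # passes
    _validate_image(np.empty((0, 0, 3), dtype=np.uint8))      # raises ValueError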
vision_agent/tools/tools.py
@@ -1834,6 +1843,15 @@ def save_video(
     if fps <= 0:
         raise ValueError(f"fps must be greater than 0 got {fps}")
 
+    if not isinstance(frames, list) or len(frames) == 0:
+        raise ValueError("Frames must be a list of NumPy arrays")
+
+    for frame in frames:
+        if not isinstance(frame, np.ndarray) or (
+            frame.shape[0] == 0 and frame.shape[1] == 0
+        ):
+            raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)")
+
     if output_video_path is None:
         output_video_path = tempfile.NamedTemporaryFile(
             delete=False, suffix=".mp4"
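
`save_video` gets the same kind of guard for its frame list. An illustrative usage sketch (only the `frames` and `fps` parameters visible in this diff are used; the full signature is not shown here):

    import numpy as np
    from vision_agent.tools import save_video

    frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(30)]
    save_video(frames, fps=10)   # OK: a non-empty list of valid frames
    # save_video([], fps=10)     # raises: "Frames must be a list of NumPy arrays"
    # save_video(frames, fps=0)  # raises: "fps must be greater than 0 got 0"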
vision_agent/tools/tools.py
@@ -1903,30 +1921,36 @@ def overlay_bounding_boxes(
         bboxes = bbox_int[i]
         bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
 
-        width, height = pil_image.size
-        fontsize = max(12, int(min(width, height) / 40))
-        draw = ImageDraw.Draw(pil_image)
-        font = ImageFont.truetype(
-            str(
-                resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")
-            ),
-            fontsize,
-        )
-
-        for elt in bboxes:
-            label = elt["label"]
-            box = elt["bbox"]
-            scores = elt["score"]
-
-            # denormalize the box if it is normalized
-            box = denormalize_bbox(box, (height, width))
-            draw.rectangle(box, outline=color[label], width=4)
-            text = f"{label}: {scores:.2f}"
-            text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
-            draw.rectangle(
-                (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+        if len(bboxes) > 20:
+            pil_image = _plot_counting(pil_image, bboxes, color)
+        else:
+            width, height = pil_image.size
+            fontsize = max(12, int(min(width, height) / 40))
+            draw = ImageDraw.Draw(pil_image)
+            font = ImageFont.truetype(
+                str(
+                    resources.files("vision_agent.fonts").joinpath(
+                        "default_font_ch_en.ttf"
+                    )
+                ),
+                fontsize,
             )
-            draw.text((box[0], box[1]), text, fill="black", font=font)
+
+            for elt in bboxes:
+                label = elt["label"]
+                box = elt["bbox"]
+                scores = elt["score"]
+
+                # denormalize the box if it is normalized
+                box = denormalize_bbox(box, (height, width))
+                draw.rectangle(box, outline=color[label], width=4)
+                text = f"{label}: {scores:.2f}"
+                text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+                draw.rectangle(
+                    (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+                )
+                draw.text((box[0], box[1]), text, fill="black", font=font)
+
         frame_out.append(np.array(pil_image))
     return frame_out[0] if len(frame_out) == 1 else frame_out
 
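`overlay_bounding_boxes` now switches to the numbered counting rendering whenever a frame has more than 20 detections; below that threshold it keeps the labeled-rectangle path, which denormalizes each box before drawing. A hedged sketch of what the `denormalize_bbox` step does (an illustrative reimplementation; the real helper lives elsewhere in the package and may differ):

    from typing import List, Tuple

    def denormalize_bbox(bbox: List[float], image_size: Tuple[int, int]) -> List[float]:
        # Illustrative: if every coordinate looks normalized (within [0, 1]),
        # scale to pixels; image_size is (height, width) as in the call above.
        if all(0 <= v <= 1 for v in bbox):
            height, width = image_size
            return [bbox[0] * width, bbox[1] * height, bbox[2] * width, bbox[3] * height]
        return bbox

    print(denormalize_bbox([0.1, 0.2, 0.3, 0.4], (480, 640)))  # [64.0, 96.0, 192.0, 192.0]
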
vision_agent/tools/tools.py
@@ -2085,39 +2109,19 @@ def overlay_heat_map(
     return np.array(combined)
 
 
-def overlay_counting_results(
-    image: np.ndarray, instances: List[Dict[str, Any]]
-) -> np.ndarray:
-    """'overlay_counting_results' is a utility function that displays counting results on
-    an image.
-
-    Parameters:
-        image (np.ndarray): The image to display the bounding boxes on.
-        instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding
-            box information of each instance
-
-    Returns:
-        np.ndarray: The image with the instance_id dislpayed
-
-    Example
-    -------
-        >>> image_with_bboxes = overlay_counting_results(
-            image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
-        )
-    """
-    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
-    color = (158, 218, 229)
-
-    width, height = pil_image.size
+def _plot_counting(
+    image: Image.Image,
+    bboxes: List[Dict[str, Any]],
+    colors: Dict[str, Tuple[int, int, int]],
+) -> Image.Image:
+    width, height = image.size
     fontsize = max(10, int(min(width, height) / 80))
-    pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5)
-    draw = ImageDraw.Draw(pil_image)
+    draw = ImageDraw.Draw(image)
     font = ImageFont.truetype(
         str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
         fontsize,
     )
-
-    for i, elt in enumerate(instances, 1):
+    for i, elt in enumerate(bboxes, 1):
         label = f"{i}"
         box = elt["bbox"]
 
@@ -2139,7 +2143,7 @@ def overlay_counting_results(
         text_y1 = cy + text_height / 2
 
         # Draw the rectangle encapsulating the text
-        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color)
+        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]])
 
         # Draw the text at the center of the bounding box
         draw.text(
@@ -2150,7 +2154,7 @@ def overlay_counting_results(
             anchor="lt",
         )
 
-    return np.array(pil_image)
+    return image
 
 
 FUNCTION_TOOLS = [
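
`_plot_counting` draws each instance number inside a filled box centered on its bounding box, now colored per label instead of a fixed color. A sketch of the centering math implied by the context line `text_y1 = cy + text_height / 2` (the helper below is hypothetical; `cx`, `cy`, and the text metrics are assumed to come from the box center and font measurements):

    from typing import List, Tuple

    def centered_text_box(
        box: List[float], text_width: float, text_height: float
    ) -> Tuple[float, float, float, float]:
        # Hypothetical reconstruction: a rectangle the size of the text,
        # centered on the bounding box center (cx, cy).
        cx, cy = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
        return (
            cx - text_width / 2,
            cy - text_height / 2,
            cx + text_width / 2,
            cy + text_height / 2,
        )

    print(centered_text_box([100, 100, 200, 160], 18, 12))  # (141.0, 124.0, 159.0, 136.0)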
vision_agent/tools/tools.py
@@ -2183,7 +2187,6 @@ UTIL_TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
-    overlay_counting_results,
 ]
 
 TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
vision_agent/tools/tool_utils.py
@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
vision_agent/tools/tools_types.py
@@ -1,6 +1,6 @@
 from enum import Enum
-from uuid import UUID
 from typing import List, Optional, Tuple, Union
+from uuid import UUID
 
 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 