vision-agent 0.0.36__tar.gz → 0.0.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.0.36 → vision_agent-0.0.37}/PKG-INFO +1 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/pyproject.toml +1 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/easytool.py +11 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/reflexion.py +11 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/vision_agent.py +11 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/image_utils.py +27 -4
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/tools/tools.py +54 -6
- {vision_agent-0.0.36 → vision_agent-0.0.37}/LICENSE +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/README.md +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/easytool_prompts.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/reflexion_prompts.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/data/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/data/data.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/emb/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/emb/emb.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/llm/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/llm/llm.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/tools/prompts.py +0 -0
vision_agent/agent/easytool.py

@@ -241,7 +241,8 @@ class EasyTool(Agent):
     based on the original implementation https://github.com/microsoft/JARVIS/tree/main/easytool
     from the funcQA code.
 
-
+    Example
+    -------
         >>> from vision_agent.agent import EasyTool
         >>> agent = EasyTool()
         >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?")

@@ -273,6 +274,15 @@ class EasyTool(Agent):
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
     ) -> str:
+        """Invoke the vision agent.
+
+        Parameters:
+            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the prompt parameter.
+
+        Returns:
+            A text response.
+        """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
         return self.chat(input, image=image)
vision_agent/agent/reflexion.py

@@ -68,7 +68,8 @@ class Reflexion(Agent):
     self_reflect_model. Using Reflexion with LMMs may not work well, if it gets it wrong
     the first time, chances are it can't actually see the thing you want it to see.
 
-
+    Example
+    -------
         >>> from vision_agent.agent import Reflexion
         >>> agent = Reflexion()
         >>> question = "How many tires does a truck have?"

@@ -139,6 +140,15 @@ class Reflexion(Agent):
         input: Union[str, List[Dict[str, str]]],
         image: Optional[Union[str, Path]] = None,
     ) -> str:
+        """Invoke the vision agent.
+
+        Parameters:
+            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the prompt parameter.
+
+        Returns:
+            A text response.
+        """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
         return self.chat(input, image)
vision_agent/agent/vision_agent.py

@@ -344,7 +344,8 @@ class VisionAgent(Agent):
     reflect on whether or not it was able to accomplish the task based off of the plan
     and final results, if not it will redo the task with this newly added reflection.
 
-
+    Example
+    -------
         >>> from vision_agent.agent import VisionAgent
         >>> agent = VisionAgent()
         >>> resp = agent("If red tomatoes cost $5 each and yellow tomatoes cost $2.50 each, what is the total cost of all the tomatoes in the image?", image="tomatoes.jpg")

@@ -376,6 +377,15 @@ class VisionAgent(Agent):
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
     ) -> str:
+        """Invoke the vision agent.
+
+        Parameters:
+            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the prompt parameter.
+
+        Returns:
+            The result of the vision agent in text.
+        """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
         return self.chat(input, image=image)
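The new __call__ docstrings in all three agents describe the same two accepted input forms: a plain string prompt, or a conversation list that a string prompt is normalized into before being handed to chat(). A minimal usage sketch of both forms, assuming the model credentials the agents rely on are configured and that "tomatoes.jpg" exists locally (both are illustrative, not part of this diff):

from vision_agent.agent import VisionAgent

agent = VisionAgent()

# Form 1: a plain string prompt; __call__ wraps it as
# [{"role": "user", "content": "..."}] before calling chat().
resp = agent(
    "What is the total cost of all the tomatoes in the image?",
    image="tomatoes.jpg",
)

# Form 2: an explicit conversation in the documented format.
resp = agent(
    [{"role": "user", "content": "What is the total cost of all the tomatoes in the image?"}],
    image="tomatoes.jpg",
)
print(resp)  # a text response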
vision_agent/image_utils.py

@@ -1,3 +1,5 @@
+"""Utility functions for image processing."""
+
 import base64
 from io import BytesIO
 from pathlib import Path

@@ -9,6 +11,14 @@ from PIL.Image import Image as ImageType
 
 
 def b64_to_pil(b64_str: str) -> ImageType:
+    """Convert a base64 string to a PIL Image.
+
+    Parameters:
+        b64_str: the base64 encoded image
+
+    Returns:
+        The decoded PIL Image
+    """
     # , can't be encoded in b64 data so must be part of prefix
     if "," in b64_str:
         b64_str = b64_str.split(",")[1]

@@ -16,16 +26,29 @@ def b64_to_pil(b64_str: str) -> ImageType:
 
 
 def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
+    """Get the size of an image.
+
+    Parameters:
+        data: the input image
+
+    Returns:
+        The size of the image in the form (height, width)
+    """
     if isinstance(data, (str, Path)):
         data = Image.open(data)
 
-    if isinstance(data, Image.Image):
-        return data.size[::-1]
-    else:
-        return data.shape[:2]
+    return data.size[::-1] if isinstance(data, Image.Image) else data.shape[:2]
 
 
 def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
+    """Convert an image to a base64 string.
+
+    Parameters:
+        data: the input image
+
+    Returns:
+        The base64 encoded image
+    """
     if data is None:
         raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
     if isinstance(data, (str, Path)):
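The new image_utils docstrings pin down the helper contracts: get_image_size returns (height, width), which is why it reverses PIL's (width, height) .size, and convert_to_b64 / b64_to_pil round-trip an image through base64. A rough sketch, assuming Pillow and numpy are installed and that "ct_scan1.jpg" is an image on disk (the filename is illustrative):

from vision_agent.image_utils import b64_to_pil, convert_to_b64, get_image_size

size = get_image_size("ct_scan1.jpg")  # (height, width), per the new docstring
b64 = convert_to_b64("ct_scan1.jpg")   # base64-encoded image string
img = b64_to_pil(b64)                  # decoded back into a PIL Image
print(size, img.size)                  # note: PIL's .size is (width, height)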
vision_agent/tools/tools.py

@@ -30,7 +30,7 @@ def normalize_bbox(
 def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
     r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.
 
-
+    Parameters:
         mask_rle: Run-length as string formated (start length)
         shape: The (height, width) of array to return
     """

@@ -54,7 +54,8 @@ class CLIP(Tool):
     r"""CLIP is a tool that can classify or tag any image given a set if input classes
     or tags.
 
-
+    Example
+    -------
         >>> import vision_agent as va
         >>> clip = va.tools.CLIP()
         >>> clip(["red line", "yellow dot"], "ct_scan1.jpg"))

@@ -89,7 +90,17 @@ class CLIP(Tool):
         ],
     }
 
+    # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+        """Invoke the CLIP model.
+
+        Parameters:
+            prompt: a list of classes or tags to classify the image.
+            image: the input image to classify.
+
+        Returns:
+            A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
+        """
         image_b64 = convert_to_b64(image)
         data = {
             "classes": prompt,

@@ -117,7 +128,8 @@ class GroundingDINO(Tool):
     r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
     category names or referring expressions.
 
-
+    Example
+    -------
         >>> import vision_agent as va
         >>> t = va.tools.GroundingDINO()
         >>> t("red line. yellow dot", "ct_scan1.jpg")

@@ -154,7 +166,17 @@ class GroundingDINO(Tool):
         ],
     }
 
+    # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict]:
+        """Invoke the Grounding DINO model.
+
+        Parameters:
+            prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
+            image: the input image to run against.
+
+        Returns:
+            A list of dictionaries containing the labels, scores, and bboxes. Each dictionary contains the detection result for an image.
+        """
         image_size = get_image_size(image)
         image_b64 = convert_to_b64(image)
         data = {

@@ -188,7 +210,8 @@ class GroundingSAM(Tool):
     r"""Grounding SAM is a tool that can detect and segment arbitrary objects with
     inputs such as category names or referring expressions.
 
-
+    Example
+    -------
         >>> import vision_agent as va
         >>> t = va.tools.GroundingSAM()
         >>> t(["red line", "yellow dot"], ct_scan1.jpg"])

@@ -234,7 +257,17 @@ class GroundingSAM(Tool):
         ],
     }
 
+    # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+        """Invoke the Grounding SAM model.
+
+        Parameters:
+            prompt: a list of classes to segment.
+            image: the input image to segment.
+
+        Returns:
+            A list of dictionaries containing the labels, scores, bboxes and masks. Each dictionary contains the segmentation result for an image.
+        """
         image_size = get_image_size(image)
         image_b64 = convert_to_b64(image)
         data = {

@@ -260,8 +293,7 @@ class GroundingSAM(Tool):
             ret_pred["labels"].append(pred["label_name"])
             ret_pred["bboxes"].append(normalize_bbox(pred["bbox"], image_size))
             ret_pred["masks"].append(mask)
-
-        return ret_preds
+        return [ret_pred]
 
 
 class AgentGroundingSAM(GroundingSAM):

@@ -282,6 +314,8 @@ class AgentGroundingSAM(GroundingSAM):
 
 
 class Counter(Tool):
+    r"""Counter detects and counts the number of objects in an image given an input such as a category name or referring expression."""
+
     name = "counter_"
     description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression."
     usage = {

@@ -307,6 +341,8 @@ class Counter(Tool):
 
 
 class Crop(Tool):
+    r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
+
     name = "crop_"
     description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image."
     usage = {

@@ -343,6 +379,8 @@ class Crop(Tool):
 
 
 class BboxArea(Tool):
+    r"""BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places."""
+
     name = "bbox_area_"
     description = "'bbox_area_' returns the area of the bounding box in pixels normalized to 2 decimal places."
     usage = {

@@ -371,6 +409,8 @@ class BboxArea(Tool):
 
 
 class SegArea(Tool):
+    r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places."""
+
     name = "seg_area_"
     description = "'seg_area_' returns the area of the segmentation mask in pixels normalized to 2 decimal places."
     usage = {

@@ -390,6 +430,8 @@ class SegArea(Tool):
 
 
 class Add(Tool):
+    r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places."""
+
     name = "add_"
     description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places."
     usage = {

@@ -407,6 +449,8 @@ class Add(Tool):
 
 
 class Subtract(Tool):
+    r"""Subtract returns the difference of all the arguments passed to it, normalized to 2 decimal places."""
+
     name = "subtract_"
     description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places."
     usage = {

@@ -424,6 +468,8 @@ class Subtract(Tool):
 
 
 class Multiply(Tool):
+    r"""Multiply returns the product of all the arguments passed to it, normalized to 2 decimal places."""
+
     name = "multiply_"
     description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places."
     usage = {

@@ -441,6 +487,8 @@ class Multiply(Tool):
 
 
 class Divide(Tool):
+    r"""Divide returns the division of all the arguments passed to it, normalized to 2 decimal places."""
+
     name = "divide_"
     description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places."
     usage = {
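The tool __call__ docstrings above spell out the prompt formats and the list-of-dicts return shape, and the last GroundingSAM hunk changes its return value from ret_preds to [ret_pred], a one-element list holding the result for the single input image. A hedged usage sketch following those documented signatures, assuming the hosted tool endpoints these classes call are reachable and that "ct_scan1.jpg" exists locally (the filename is illustrative):

import vision_agent as va

# CLIP takes a list of classes or tags plus one image.
clip = va.tools.CLIP()
print(clip(["red line", "yellow dot"], "ct_scan1.jpg"))
# e.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]

# Grounding DINO takes a period-separated string of class names.
dino = va.tools.GroundingDINO()
print(dino("red line. yellow dot", "ct_scan1.jpg"))
# a list with one dict of labels, scores and bboxes for the image

# Grounding SAM takes a list of classes; after this change it returns [ret_pred],
# a one-element list with the labels, bboxes and masks for the single input image.
sam = va.tools.GroundingSAM()
preds = sam(["red line", "yellow dot"], "ct_scan1.jpg")
print(preds[0].keys())

Per the new TODO comments, each call still accepts a single image even though the return type is already shaped as a list of per-image results.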