vision-agent 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -241,7 +241,8 @@ class EasyTool(Agent):
      based on the original implementation https://github.com/microsoft/JARVIS/tree/main/easytool
      from the funcQA code.

-     Examples::
+     Example
+     -------
      >>> from vision_agent.agent import EasyTool
      >>> agent = EasyTool()
      >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?")
@@ -273,6 +274,15 @@ class EasyTool(Agent):
          input: Union[List[Dict[str, str]], str],
          image: Optional[Union[str, Path]] = None,
      ) -> str:
+         """Invoke the vision agent.
+
+         Parameters:
+             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+             image: the input image referenced in the prompt parameter.
+
+         Returns:
+             A text response.
+         """
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
          return self.chat(input, image=image)
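The new docstring documents two equivalent input forms. A quick sketch of both (the prompt text is taken from the doctest above); the same contract applies to the identical `__call__` docstrings added to Reflexion and VisionAgent below:

```python
from vision_agent.agent import EasyTool

agent = EasyTool()
# plain string prompt
resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?")
# equivalent conversation format, as documented in the new docstring
resp = agent([{"role": "user", "content": "If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?"}])
```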
@@ -68,7 +68,8 @@ class Reflexion(Agent):
      self_reflect_model. Using Reflexion with LMMs may not work well, if it gets it wrong
      the first time, chances are it can't actually see the thing you want it to see.

-     Examples::
+     Example
+     -------
      >>> from vision_agent.agent import Reflexion
      >>> agent = Reflexion()
      >>> question = "How many tires does a truck have?"
@@ -139,6 +140,15 @@ class Reflexion(Agent):
          input: Union[str, List[Dict[str, str]]],
          image: Optional[Union[str, Path]] = None,
      ) -> str:
+         """Invoke the vision agent.
+
+         Parameters:
+             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+             image: the input image referenced in the prompt parameter.
+
+         Returns:
+             A text response.
+         """
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
          return self.chat(input, image)
@@ -315,12 +315,16 @@ def create_tasks(
  def self_reflect(
      reflect_model: Union[LLM, LMM],
      question: str,
+     tools: Dict[int, Any],
      tool_result: List[Dict],
      final_answer: str,
      image: Optional[Union[str, Path]] = None,
  ) -> str:
      prompt = VISION_AGENT_REFLECTION.format(
-         question=question, tool_results=str(tool_result), final_answer=final_answer
+         question=question,
+         tools=format_tools(tools),
+         tool_results=str(tool_result),
+         final_answer=final_answer,
      )
      if issubclass(type(reflect_model), LMM):
          return reflect_model(prompt, image=image)  # type: ignore
@@ -328,7 +332,8 @@ def self_reflect(


  def parse_reflect(reflect: str) -> bool:
-     return reflect.lower() == "finish"
+     # GPT-4V has a hard time following directions, so make the criteria less strict
+     return "finish" in reflect.lower() and len(reflect) < 100


  class VisionAgent(Agent):
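The relaxed `parse_reflect` check changes which reflections terminate the retry loop. A small sketch of the new behavior (the function body is copied from the diff; the sample strings are hypothetical):

```python
def parse_reflect(reflect: str) -> bool:
    # GPT-4V has a hard time following directions, so make the criteria less strict
    return "finish" in reflect.lower() and len(reflect) < 100

assert parse_reflect("Finish")                              # exact match still terminates
assert parse_reflect("The answer looks correct. Finish.")   # now also terminates
assert not parse_reflect("Finish. " + "But consider..." * 20)  # long replies count as a new plan
```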
@@ -339,7 +344,8 @@ class VisionAgent(Agent):
      reflect on whether or not it was able to accomplish the task based off of the plan
      and final results, if not it will redo the task with this newly added reflection.

-     Examples::
+     Example
+     -------
      >>> from vision_agent.agent import VisionAgent
      >>> agent = VisionAgent()
      >>> resp = agent("If red tomatoes cost $5 each and yellow tomatoes cost $2.50 each, what is the total cost of all the tomatoes in the image?", image="tomatoes.jpg")
@@ -371,6 +377,15 @@ class VisionAgent(Agent):
          input: Union[List[Dict[str, str]], str],
          image: Optional[Union[str, Path]] = None,
      ) -> str:
+         """Invoke the vision agent.
+
+         Parameters:
+             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+             image: the input image referenced in the prompt parameter.
+
+         Returns:
+             The result of the vision agent in text.
+         """
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
          return self.chat(input, image=image)
@@ -425,7 +440,12 @@ class VisionAgent(Agent):
              )

              reflection = self_reflect(
-                 self.reflect_model, question, all_tool_results, final_answer, image
+                 self.reflect_model,
+                 question,
+                 self.tools,
+                 all_tool_results,
+                 final_answer,
+                 image,
              )
              _LOGGER.info(f"\tReflection: {reflection}")
              if parse_reflect(reflection):
@@ -1,7 +1,10 @@
- VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self refection. You will be given a previous reasoning trial in which you were given the user's question, the decomposed tasks and tools that the agent used to answer teh question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agen'ts answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
+ VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure with the tools avilable. Use complete sentences.

  User's question: {question}

+ Tools available:
+ {tools}
+
  Tasks and tools used:
  {tool_results}

@@ -1,3 +1,5 @@
+ """Utility functions for image processing."""
+
  import base64
  from io import BytesIO
  from pathlib import Path
@@ -9,6 +11,14 @@ from PIL.Image import Image as ImageType


  def b64_to_pil(b64_str: str) -> ImageType:
+     """Convert a base64 string to a PIL Image.
+
+     Parameters:
+         b64_str: the base64 encoded image
+
+     Returns:
+         The decoded PIL Image
+     """
      # , can't be encoded in b64 data so must be part of prefix
      if "," in b64_str:
          b64_str = b64_str.split(",")[1]
@@ -16,16 +26,29 @@ def b64_to_pil(b64_str: str) -> ImageType:


  def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
+     """Get the size of an image.
+
+     Parameters:
+         data: the input image
+
+     Returns:
+         The size of the image in the form (height, width)
+     """
      if isinstance(data, (str, Path)):
          data = Image.open(data)

-     if isinstance(data, Image.Image):
-         return data.size[::-1]
-     else:
-         return data.shape[:2]
+     return data.size[::-1] if isinstance(data, Image.Image) else data.shape[:2]


  def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
+     """Convert an image to a base64 string.
+
+     Parameters:
+         data: the input image
+
+     Returns:
+         The base64 encoded image
+     """
      if data is None:
          raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
      if isinstance(data, (str, Path)):
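These helpers compose into a simple round trip. A minimal sketch, assuming a local image file named `ct_scan1.jpg` (the file name is hypothetical):

```python
from vision_agent.image_utils import b64_to_pil, convert_to_b64, get_image_size

b64 = convert_to_b64("ct_scan1.jpg")   # encode the image as a base64 string
img = b64_to_pil(b64)                  # decode it back into a PIL Image
print(get_image_size(img))             # (height, width), per the docstring above
```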
@@ -30,7 +30,7 @@ def normalize_bbox(
  def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
      r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.

-     Args:
+     Parameters:
          mask_rle: Run-length as string formated (start length)
          shape: The (height, width) of array to return
      """
@@ -54,7 +54,8 @@ class CLIP(Tool):
      r"""CLIP is a tool that can classify or tag any image given a set if input classes
      or tags.

-     Examples::
+     Example
+     -------
      >>> import vision_agent as va
      >>> clip = va.tools.CLIP()
      >>> clip(["red line", "yellow dot"], "ct_scan1.jpg"))
@@ -89,7 +90,17 @@ class CLIP(Tool):
          ],
      }

+     # TODO: Add support for input multiple images, which aligns with the output type.
      def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+         """Invoke the CLIP model.
+
+         Parameters:
+             prompt: a list of classes or tags to classify the image.
+             image: the input image to classify.
+
+         Returns:
+             A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
+         """
          image_b64 = convert_to_b64(image)
          data = {
              "classes": prompt,
@@ -117,7 +128,8 @@ class GroundingDINO(Tool):
      r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
      category names or referring expressions.

-     Examples::
+     Example
+     -------
      >>> import vision_agent as va
      >>> t = va.tools.GroundingDINO()
      >>> t("red line. yellow dot", "ct_scan1.jpg")
@@ -154,7 +166,17 @@ class GroundingDINO(Tool):
          ],
      }

+     # TODO: Add support for input multiple images, which aligns with the output type.
      def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict]:
+         """Invoke the Grounding DINO model.
+
+         Parameters:
+             prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
+             image: the input image to run against.
+
+         Returns:
+             A list of dictionaries containing the labels, scores, and bboxes. Each dictionary contains the detection result for an image.
+         """
          image_size = get_image_size(image)
          image_b64 = convert_to_b64(image)
          data = {
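The period-separated prompt format matters when detecting several classes at once. A minimal sketch (the image file name is hypothetical):

```python
import vision_agent as va

t = va.tools.GroundingDINO()
dets = t("big dog . small cat", "backyard.jpg")  # two classes, separated by a period
for label, score, bbox in zip(dets[0]["labels"], dets[0]["scores"], dets[0]["bboxes"]):
    print(label, score, bbox)  # fields per the documented return shape
```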
@@ -188,7 +210,8 @@ class GroundingSAM(Tool):
      r"""Grounding SAM is a tool that can detect and segment arbitrary objects with
      inputs such as category names or referring expressions.

-     Examples::
+     Example
+     -------
      >>> import vision_agent as va
      >>> t = va.tools.GroundingSAM()
      >>> t(["red line", "yellow dot"], ct_scan1.jpg"])
@@ -234,7 +257,17 @@ class GroundingSAM(Tool):
          ],
      }

+     # TODO: Add support for input multiple images, which aligns with the output type.
      def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+         """Invoke the Grounding SAM model.
+
+         Parameters:
+             prompt: a list of classes to segment.
+             image: the input image to segment.
+
+         Returns:
+             A list of dictionaries containing the labels, scores, bboxes and masks. Each dictionary contains the segmentation result for an image.
+         """
          image_size = get_image_size(image)
          image_b64 = convert_to_b64(image)
          data = {
@@ -260,8 +293,7 @@ class GroundingSAM(Tool):
              ret_pred["labels"].append(pred["label_name"])
              ret_pred["bboxes"].append(normalize_bbox(pred["bbox"], image_size))
              ret_pred["masks"].append(mask)
-         ret_preds = [ret_pred]
-         return ret_preds
+         return [ret_pred]


  class AgentGroundingSAM(GroundingSAM):
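Consuming the single-dict-per-image return value looks roughly like this (a sketch; the file name is hypothetical, and the masks are assumed to be binary numpy arrays, matching the `rle_decode` output documented above):

```python
import vision_agent as va

sam = va.tools.GroundingSAM()
preds = sam(["red line", "yellow dot"], "ct_scan1.jpg")
pred = preds[0]  # one dict per image, per the documented return shape
for label, mask in zip(pred["labels"], pred["masks"]):
    print(label, mask.sum())  # assumption: mask is a binary numpy array
```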
@@ -282,6 +314,8 @@ class AgentGroundingSAM(GroundingSAM):


  class Counter(Tool):
+     r"""Counter detects and counts the number of objects in an image given an input such as a category name or referring expression."""
+
      name = "counter_"
      description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression."
      usage = {
@@ -307,6 +341,8 @@ class Counter(Tool):


  class Crop(Tool):
+     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
+
      name = "crop_"
      description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image."
      usage = {
@@ -343,6 +379,8 @@ class Crop(Tool):


  class BboxArea(Tool):
+     r"""BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places."""
+
      name = "bbox_area_"
      description = "'bbox_area_' returns the area of the bounding box in pixels normalized to 2 decimal places."
      usage = {
@@ -371,6 +409,8 @@ class BboxArea(Tool):


  class SegArea(Tool):
+     r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places."""
+
      name = "seg_area_"
      description = "'seg_area_' returns the area of the segmentation mask in pixels normalized to 2 decimal places."
      usage = {
@@ -390,6 +430,8 @@ class SegArea(Tool):


  class Add(Tool):
+     r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places."""
+
      name = "add_"
      description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places."
      usage = {
@@ -407,6 +449,8 @@ class Add(Tool):


  class Subtract(Tool):
+     r"""Subtract returns the difference of all the arguments passed to it, normalized to 2 decimal places."""
+
      name = "subtract_"
      description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places."
      usage = {
@@ -424,6 +468,8 @@ class Subtract(Tool):


  class Multiply(Tool):
+     r"""Multiply returns the product of all the arguments passed to it, normalized to 2 decimal places."""
+
      name = "multiply_"
      description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places."
      usage = {
@@ -441,6 +487,8 @@ class Multiply(Tool):


  class Divide(Tool):
+     r"""Divide returns the division of all the arguments passed to it, normalized to 2 decimal places."""
+
      name = "divide_"
      description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places."
      usage = {
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.0.35
+ Version: 0.0.37
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -1,25 +1,25 @@
  vision_agent/__init__.py,sha256=wD1cssVTAJ55uTViNfBGooqJUV0p9fmVAuTMHHrmUBU,229
  vision_agent/agent/__init__.py,sha256=B4JVrbY4IRVCJfjmrgvcp7h1mTUEk8MZvL0Zmej4Ka0,127
  vision_agent/agent/agent.py,sha256=PRLItaPfMc94H6mAIPj_gBvJ8RezDEPanB6Cmu81A0M,306
- vision_agent/agent/easytool.py,sha256=SJ1Y8Lnz_HVGEzs2qSb-rq6glEjVG2slVHg8Sri17yo,11168
+ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
  vision_agent/agent/easytool_prompts.py,sha256=uNp12LOFRLr3i2zLhNuLuyFms2-s8es2t6P6h76QDow,4493
- vision_agent/agent/reflexion.py,sha256=TDNBpno_8Z-MIENr05msyqIqYOavW-ZP_ARPeXrPr_k,9758
+ vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
  vision_agent/agent/reflexion_prompts.py,sha256=UPGkt_qgHBMUY0VPVoF-BqhR0d_6WPjjrhbYLBYOtnQ,9342
- vision_agent/agent/vision_agent.py,sha256=5d_EuySLii7PNLlPsnNkX1_88xzl3ajE31HLJKBYyY0,14336
- vision_agent/agent/vision_agent_prompts.py,sha256=F4WEpyYx_HpQj-vDm2LTtUm-yaLCOug-AKhxr7MNCvc,6061
+ vision_agent/agent/vision_agent.py,sha256=JPoY92M5xNaViLdNf4d1oqAX00QUuQxk-gcc9jIlfqA,14981
+ vision_agent/agent/vision_agent_prompts.py,sha256=otaDRsaHc7bqw_tgWTnu-eUcFeOzBFrn9sPU7_xr2VQ,6151
  vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
  vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
  vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
  vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
- vision_agent/image_utils.py,sha256=b1iYAoBlmGgOo-ZRNV_Hdz2XsxH8Nuas6CBBoz2HFUQ,1202
+ vision_agent/image_utils.py,sha256=D5H-GN35Bz3u1Fq_JfYQVjNzAmZjJl138wma5fRtVjA,1684
  vision_agent/llm/__init__.py,sha256=fBKsIjL4z08eA0QYx6wvhRe4Nkp2pJ4VrZK0-uUL5Ec,32
  vision_agent/llm/llm.py,sha256=d8A7jmLVGx5HzoiYJ75mTMU7dbD5-bOYeXYlHaay6WA,3957
  vision_agent/lmm/__init__.py,sha256=I8mbeNUajTfWVNqLsuFQVOaNBDlkIhYp9DFU8H4kB7g,51
  vision_agent/lmm/lmm.py,sha256=ARcbgkcyP83TbVVoXI9B-gtG0gJuTaG_MjcUGbams4U,8052
  vision_agent/tools/__init__.py,sha256=aX0pU3pXU1V0Cj9FzYCvdsX76TAglFMHx59kNhXHbPs,131
  vision_agent/tools/prompts.py,sha256=9RBbyqlNlExsGKlJ89Jkph83DAEJ8PCVGaHoNbyN7TM,1416
- vision_agent/tools/tools.py,sha256=j_Jq_YHNmwrGXNR3fL9qi0yrHorqFui5UnAnLcEw20U,16826
- vision_agent-0.0.35.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.0.35.dist-info/METADATA,sha256=5ofgjIl0NMVqXu_gFeoZ5xlfedqVbNztHONDVa3xP2E,4966
- vision_agent-0.0.35.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.0.35.dist-info/RECORD,,
+ vision_agent/tools/tools.py,sha256=Vlb8H9qm4rA5HxGw5p-gJES6jgPIkfrtVlM7jcxw7d8,19141
+ vision_agent-0.0.37.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.0.37.dist-info/METADATA,sha256=Y9oIfWbRK-3EuNewrwK4WOnpHY2ca7FB8jDa5oucT5Y,4966
+ vision_agent-0.0.37.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.0.37.dist-info/RECORD,,