vision-agent 0.0.51__py3-none-any.whl → 0.0.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +35 -14
- vision_agent/agent/vision_agent_prompts.py +1 -3
- vision_agent/fonts/__init__.py +0 -0
- vision_agent/fonts/default_font_ch_en.ttf +0 -0
- vision_agent/image_utils.py +22 -10
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +109 -90
- {vision_agent-0.0.51.dist-info → vision_agent-0.0.53.dist-info}/METADATA +3 -2
- {vision_agent-0.0.51.dist-info → vision_agent-0.0.53.dist-info}/RECORD +11 -9
- {vision_agent-0.0.51.dist-info → vision_agent-0.0.53.dist-info}/LICENSE +0 -0
- {vision_agent-0.0.51.dist-info → vision_agent-0.0.53.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py
CHANGED
@@ -5,6 +5,7 @@ import tempfile
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+from PIL import Image
 from tabulate import tabulate
 
 from vision_agent.image_utils import overlay_bboxes, overlay_masks
@@ -288,9 +289,8 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
                 continue
             parameters = [parameters]
         elif isinstance(tool_result["parameters"], list):
-            if (
-                len(tool_result["parameters"]) < 1
-                and "image" not in tool_result["parameters"][0]
+            if len(tool_result["parameters"]) < 1 or (
+                "image" not in tool_result["parameters"][0]
             ):
                 continue
 
@@ -304,10 +304,16 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
             # if the call was successful, then we can add the image data
             image = param["image"]
             if image not in image_to_data:
-                image_to_data[image] = {"bboxes": [], "masks": [], "labels": []}
+                image_to_data[image] = {
+                    "bboxes": [],
+                    "masks": [],
+                    "labels": [],
+                    "scores": [],
+                }
 
             image_to_data[image]["bboxes"].extend(call_result["bboxes"])
             image_to_data[image]["labels"].extend(call_result["labels"])
+            image_to_data[image]["scores"].extend(call_result["scores"])
             if "masks" in call_result:
                 image_to_data[image]["masks"].extend(call_result["masks"])
 
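For orientation, each image key in image_to_data now accumulates a parallel "scores" list next to its boxes, masks and labels. A sketch of the resulting shape (the image name and values here are made up for illustration):

    image_to_data = {
        "jars.jpg": {
            "bboxes": [[0.58, 0.2, 0.72, 0.45]],  # relative xyxy boxes from the detection tools
            "masks": [],
            "labels": ["jar"],
            "scores": [0.93],
        }
    }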
@@ -380,6 +386,7 @@ class VisionAgent(Agent):
         self,
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
+        visualize_output: Optional[bool] = False,
     ) -> str:
         """Invoke the vision agent.
 
@@ -393,7 +400,7 @@ class VisionAgent(Agent):
         """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        return self.chat(input, image=image)
+        return self.chat(input, image=image, visualize_output=visualize_output)
 
     def log_progress(self, description: str) -> None:
         _LOGGER.info(description)
@@ -401,7 +408,10 @@ class VisionAgent(Agent):
             self.report_progress_callback(description)
 
     def chat_with_workflow(
-        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+        self,
+        chat: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+        visualize_output: Optional[bool] = False,
     ) -> Tuple[str, List[Dict]]:
         question = chat[0]["content"]
         if image:
@@ -449,31 +459,42 @@ class VisionAgent(Agent):
                 self.answer_model, question, answers, reflections
            )
 
-
-            all_tool_results.append({"
+            visualized_output = visualize_result(all_tool_results)
+            all_tool_results.append({"visualized_output": visualized_output})
             reflection = self_reflect(
                 self.reflect_model,
                 question,
                 self.tools,
                 all_tool_results,
                 final_answer,
-
+                visualized_output[0] if len(visualized_output) > 0 else image,
             )
             self.log_progress(f"Reflection: {reflection}")
             if parse_reflect(reflection):
                 break
             else:
-                reflections += reflection
-        # '<
+                reflections += "\n" + reflection
+        # '<END>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
-            f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}
+            f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</<ANSWER>"
        )
+
+        if visualize_output:
+            visualized_output = all_tool_results[-1]["visualized_output"]
+            for image in visualized_output:
+                Image.open(image).show()
+
         return final_answer, all_tool_results
 
     def chat(
-        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+        self,
+        chat: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+        visualize_output: Optional[bool] = False,
     ) -> str:
-        answer, _ = self.chat_with_workflow(chat, image=image)
+        answer, _ = self.chat_with_workflow(
+            chat, image=image, visualize_output=visualize_output
+        )
         return answer
 
     def retrieval(
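Taken together, the vision_agent.py changes thread a visualize_output flag from __call__ through chat and into chat_with_workflow, which opens each visualized result with PIL when the flag is set. A minimal usage sketch, assuming the constructor and call style shown in the package README (the image file name here is hypothetical):

    import vision_agent as va

    agent = va.agent.VisionAgent()
    answer = agent(
        "How many jars are in this image?",
        image="jars.jpg",           # hypothetical local image
        visualize_output=True,      # pops up the overlaid detections via Image.show()
    )
    print(answer)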
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -1,4 +1,4 @@
-VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise,
+VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, concrete plan that aims to mitigate the same failure with the tools available. Do not make vague steps like re-evaluate the threshold, instead make concrete steps like use a threshold of 0.5 or whatever threshold you think would fix this issue. If the task cannot be completed with the existing tools, respond with Finish. Use complete sentences.
 
 User's question: {question}
 
@@ -49,7 +49,6 @@ Output: """
 
 CHOOSE_TOOL = """This is the user's question: {question}
 These are the tools you can select to solve the question:
-
 {tools}
 
 Please note that:
@@ -63,7 +62,6 @@ Output: """
 
 CHOOSE_TOOL_DEPENDS = """This is the user's question: {question}
 These are the tools you can select to solve the question:
-
 {tools}
 
 This is a reflection from a previous failed attempt:
vision_agent/fonts/__init__.py
File without changes
vision_agent/fonts/default_font_ch_en.ttf
Binary file
vision_agent/image_utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Utility functions for image processing."""
 
 import base64
+from importlib import resources
 from io import BytesIO
 from pathlib import Path
 from typing import Dict, Tuple, Union
@@ -104,19 +105,28 @@ def overlay_bboxes(
 
     color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
 
-    draw = ImageDraw.Draw(image)
-    font = ImageFont.load_default()
     width, height = image.size
+    fontsize = max(12, int(min(width, height) / 40))
+    draw = ImageDraw.Draw(image)
+    font = ImageFont.truetype(
+        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+        fontsize,
+    )
     if "bboxes" not in bboxes:
         return image.convert("RGB")
 
-    for label, box in zip(bboxes["labels"], bboxes["bboxes"]):
-        box = [
-
-
-
-
-
+    for label, box, scores in zip(bboxes["labels"], bboxes["bboxes"], bboxes["scores"]):
+        box = [
+            int(box[0] * width),
+            int(box[1] * height),
+            int(box[2] * width),
+            int(box[3] * height),
+        ]
+        draw.rectangle(box, outline=color[label], width=4)
+        text = f"{label}: {scores:.2f}"
+        text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+        draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
+        draw.text((box[0], box[1]), text, fill="black", font=font)
     return image.convert("RGB")
 
 
@@ -138,7 +148,9 @@ def overlay_masks(
     elif isinstance(image, np.ndarray):
         image = Image.fromarray(image)
 
-    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(masks["labels"])}
+    color = {
+        label: COLORS[i % len(COLORS)] for i, label in enumerate(set(masks["labels"]))
+    }
     if "masks" not in masks:
         return image.convert("RGB")
 
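Because overlay_bboxes now zips "labels", "bboxes" and "scores" together, callers must pass all three lists in lockstep. A small sketch of the expected input, assuming the function accepts a PIL image the way overlay_masks does (file names and values are illustrative):

    from PIL import Image
    from vision_agent.image_utils import overlay_bboxes

    detections = {
        "labels": ["jar"],
        "bboxes": [[0.58, 0.2, 0.72, 0.45]],  # relative xyxy, scaled by image width/height internally
        "scores": [0.93],
    }
    annotated = overlay_bboxes(Image.open("jars.jpg"), detections)
    annotated.save("jars_annotated.png")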
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -53,9 +53,7 @@ class Tool(ABC):
 
 class NoOp(Tool):
     name = "noop_"
-    description = (
-        "'noop_' is a no-op tool that does nothing if you do not need to use a tool."
-    )
+    description = "'noop_' is a no-op tool that does nothing if you do not want answer the question directly and not use a tool."
     usage = {
         "required_parameters": [],
         "examples": [
@@ -85,7 +83,7 @@ class CLIP(Tool):
     _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "clip_"
-    description = "'clip_' is a tool that can classify
+    description = "'clip_' is a tool that can classify any image given a set of input names or tags. It returns a list of the input names along with their probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -146,6 +144,74 @@ class CLIP(Tool):
         return resp_json["data"]  # type: ignore
 
 
+class ImageCaption(Tool):
+    r"""ImageCaption is a tool that can caption an image based on its contents
+    or tags.
+
+    Example
+    -------
+    >>> import vision_agent as va
+    >>> caption = va.tools.ImageCaption()
+    >>> caption("image1.jpg")
+    {'text': ['a box of orange and white socks']}
+    """
+
+    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
+
+    name = "image_caption_"
+    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you describe this image ? Image name: cat.jpg",
+                "parameters": {"image": "cat.jpg"},
+            },
+            {
+                "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
+                "parameters": {"image": "cat_dog.jpg"},
+            },
+            {
+                "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
+                "parameters": {
+                    "image": "shirts.jpg",
+                },
+            },
+        ],
+    }
+
+    # TODO: Add support for input multiple images, which aligns with the output type.
+    def __call__(self, image: Union[str, ImageType]) -> Dict:
+        """Invoke the Image captioning model.
+
+        Parameters:
+            image: the input image to caption.
+
+        Returns:
+            A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
+        """
+        image_b64 = convert_to_b64(image)
+        data = {
+            "image": image_b64,
+            "tool": "image_captioning",
+        }
+        res = requests.post(
+            self._ENDPOINT,
+            headers={"Content-Type": "application/json"},
+            json=data,
+        )
+        resp_json: Dict[str, Any] = res.json()
+        if (
+            "statusCode" in resp_json and resp_json["statusCode"] != 200
+        ) or "statusCode" not in resp_json:
+            _LOGGER.error(f"Request failed: {resp_json}")
+            raise ValueError(f"Request failed: {resp_json}")
+
+        return resp_json["data"]  # type: ignore
+
+
 class GroundingDINO(Tool):
     r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
     category names or referring expressions.
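The docstring above already shows direct use; the same call works outside the agent loop. A quick sketch (the file name is hypothetical, and the call posts to the hosted Lambda endpoint in _ENDPOINT):

    import vision_agent as va

    caption = va.tools.ImageCaption()
    print(caption("cat.jpg"))  # e.g. {'text': ['a cat sitting on a couch']}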
@@ -163,7 +229,7 @@ class GroundingDINO(Tool):
     _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions."
+    description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -179,8 +245,11 @@ class GroundingDINO(Tool):
                 "parameters": {"prompt": "car", "image": ""},
             },
             {
-                "scenario": "Can you detect the person on the left? Image name: person.jpg",
-                "parameters": {
+                "scenario": "Can you detect the person on the left and right? Image name: person.jpg",
+                "parameters": {
+                    "prompt": "left person. right person",
+                    "image": "person.jpg",
+                },
             },
             {
                 "scenario": "Detect the red shirts and green shirst. Image name: shirts.jpg",
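The updated examples lean on Grounding DINO's period-separated multi-category prompts. A hedged sketch of calling the tool class directly in that style (the image path is hypothetical; the return payload comes from the hosted endpoint):

    from vision_agent.tools.tools import GroundingDINO

    dino = GroundingDINO()
    dets = dino(prompt="left person. right person", image="person.jpg")
    # per the updated description, dets carries bounding boxes, label names and probability scores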
@@ -269,7 +338,7 @@ class GroundingSAM(Tool):
     _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect
+    description = "'grounding_sam_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -285,8 +354,11 @@ class GroundingSAM(Tool):
                 "parameters": {"prompt": "car", "image": ""},
             },
             {
-                "scenario": "Can you segment the person on the left? Image name: person.jpg",
-                "parameters": {
+                "scenario": "Can you segment the person on the left and right? Image name: person.jpg",
+                "parameters": {
+                    "prompt": "left person. right person",
+                    "image": "person.jpg",
+                },
             },
             {
                 "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg",
@@ -370,8 +442,9 @@ class AgentGroundingSAM(GroundingSAM):
         mask_files = []
         for mask in rets["masks"]:
             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-
-
+                file_name = Path(tmp.name).with_suffix(".mask.png")
+                Image.fromarray(mask * 255).save(file_name)
+                mask_files.append(str(file_name))
         rets["masks"] = mask_files
         return rets
 
@@ -380,7 +453,7 @@ class Counter(Tool):
     r"""Counter detects and counts the number of objects in an image given an input such as a category name or referring expression."""
 
     name = "counter_"
-    description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression."
+    description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression. It returns a dictionary containing the labels and their counts."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -400,14 +473,14 @@
 
     def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
         resp = GroundingDINO()(prompt, image)
-        return dict(CounterClass(resp[
+        return dict(CounterClass(resp["labels"]))
 
 
 class Crop(Tool):
     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
 
     name = "crop_"
-    description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image."
+    description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image. It returns a file with the cropped image."
     usage = {
         "required_parameters": [
             {"name": "bbox", "type": "List[float]"},
@@ -495,9 +568,7 @@ class SegArea(Tool):
 
 class BboxIoU(Tool):
     name = "bbox_iou_"
-    description = (
-        "'bbox_iou_' returns the intersection over union of two bounding boxes."
-    )
+    description = "'bbox_iou_' returns the intersection over union of two bounding boxes. This is a good tool for determining if two objects are overlapping."
     usage = {
         "required_parameters": [
             {"name": "bbox1", "type": "List[int]"},
@@ -591,85 +662,35 @@ class ExtractFrames(Tool):
         )
         for frame, ts in frames:
             with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-
-
+                file_name = Path(tmp.name).with_suffix(".frame.png")
+                Image.fromarray(frame).save(file_name)
+                result.append((str(file_name), ts))
         return result
 
 
-class Add(Tool):
-    r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places."""
-
-    name = "add_"
-    description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places."
-    usage = {
-        "required_parameters": [{"name": "input", "type": "List[int]"}],
-        "examples": [
-            {
-                "scenario": "If you want to calculate 2 + 4",
-                "parameters": {"input": [2, 4]},
-            }
-        ],
-    }
-
-    def __call__(self, input: List[int]) -> float:
-        return round(sum(input), 2)
-
-
-class Subtract(Tool):
-    r"""Subtract returns the difference of all the arguments passed to it, normalized to 2 decimal places."""
+class Calculator(Tool):
+    r"""Calculator is a tool that can perform basic arithmetic operations."""
 
-    name = "subtract_"
-    description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places."
-    usage = {
-        "required_parameters": [{"name": "input", "type": "List[int]"}],
-        "examples": [
-            {
-                "scenario": "If you want to calculate 4 - 2",
-                "parameters": {"input": [4, 2]},
-            }
-        ],
-    }
-
-    def __call__(self, input: List[int]) -> float:
-        return round(input[0] - input[1], 2)
-
-
-class Multiply(Tool):
-    r"""Multiply returns the product of all the arguments passed to it, normalized to 2 decimal places."""
-
-    name = "multiply_"
-    description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places."
+    name = "calculator_"
+    description = (
+        "'calculator_' is a tool that can perform basic arithmetic operations."
+    )
     usage = {
-        "required_parameters": [{"name": "input", "type": "List[int]"}],
+        "required_parameters": [{"name": "equation", "type": "str"}],
         "examples": [
             {
-                "scenario": "If you want to calculate 2 * 4",
-                "parameters": {"input": [2, 4]},
-            }
-        ],
-    }
-
-    def __call__(self, input: List[int]) -> float:
-        return round(input[0] * input[1], 2)
-
-
-class Divide(Tool):
-    r"""Divide returns the division of all the arguments passed to it, normalized to 2 decimal places."""
-
-    name = "divide_"
-    description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places."
-    usage = {
-        "required_parameters": [{"name": "input", "type": "List[int]"}],
-        "examples": [
+                "scenario": "If you want to calculate (2 * 3) + 4",
+                "parameters": {"equation": "2 + 4"},
+            },
             {
-                "scenario": "If you want to calculate 4 / 2",
-                "parameters": {"input": [4, 2]},
-            }
+                "scenario": "If you want to calculate (4 + 2.5) / 2.1",
+                "parameters": {"equation": "(4 + 2.5) / 2.1"},
+            },
         ],
     }
 
-    def __call__(self, input: List[int]) -> float:
-        return round(input[0] / input[1], 2)
+    def __call__(self, equation: str) -> float:
+        return cast(float, round(eval(equation), 2))
 
 
 TOOLS = {
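The four arithmetic tools collapse into a single calculator_ tool that evaluates a whole expression string with Python's eval() and rounds the result to two decimals, so chained expressions no longer need multiple tool calls. A quick sketch of direct use (import path follows the module layout above):

    from vision_agent.tools.tools import Calculator

    calc = Calculator()
    print(calc("(4 + 2.5) / 2.1"))  # 3.1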
@@ -678,6 +699,7 @@ TOOLS = {
     [
         NoOp,
         CLIP,
+        ImageCaption,
         GroundingDINO,
         AgentGroundingSAM,
         ExtractFrames,
@@ -687,10 +709,7 @@ TOOLS = {
         SegArea,
         BboxIoU,
         SegIoU,
-        Add,
-        Subtract,
-        Multiply,
-        Divide,
+        Calculator,
     ]
     )
     if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
{vision_agent-0.0.51.dist-info → vision_agent-0.0.53.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.51
+Version: 0.0.53
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -103,7 +103,8 @@ the individual steps and tools to get the answer:
     }
 ]],
 "answer": "The jar is located at [0.58, 0.2, 0.72, 0.45].",
-}
+},
+{"visualize_output": "final_output.png"}]
 ```
 
 ### Tools
{vision_agent-0.0.51.dist-info → vision_agent-0.0.53.dist-info}/RECORD
CHANGED
@@ -5,22 +5,24 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=dYzWa_RaiaFSQ-CowoQOcFmjZtBTTljRyA809bLgrvU,4519
 vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=UV7_mqejfF4B-AqqmETqWvfiPvRcjfq-0nlNfeo_RxM,19765
+vision_agent/agent/vision_agent_prompts.py,sha256=dPg0mLVK_fGJpYK2xXGhm-zuXX1KVZW_zFXyYsspUz8,6567
 vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
 vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
 vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
 vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
-vision_agent/
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/image_utils.py,sha256=hFdPoRmeVU5jErFr5xaagMQ6Wy7Xbw8H8HXuLGdJIAM,4786
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
 vision_agent/llm/llm.py,sha256=tgL6ZtuwZKuxSNiCxJCuP2ETjNMrosdgxXkZJb0_00E,5024
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=LxwxCArp7DfnPbjf_Gl55xBxPwo2Qx8eDp1gCnGYSO0,9535
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=OEqEysxm5wnnOD73NKNCUggALB72GEmVg9FNsEkSBtA,253
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=WPqLHw8D0tkaP2LFYo6cBithP4q0vb6Bve4Nv577Prk,27045
 vision_agent/tools/video.py,sha256=40rscP8YvKN3lhZ4PDcOK4XbdFX2duCRpHY_krmBYKU,7476
-vision_agent-0.0.
-vision_agent-0.0.
-vision_agent-0.0.
-vision_agent-0.0.
+vision_agent-0.0.53.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.0.53.dist-info/METADATA,sha256=ybezBW-LYFhlCovdbKNq6iC93mb0wZNOQ29HD30OPz4,6184
+vision_agent-0.0.53.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.0.53.dist-info/RECORD,,
{vision_agent-0.0.51.dist-info → vision_agent-0.0.53.dist-info}/LICENSE
File without changes
{vision_agent-0.0.51.dist-info → vision_agent-0.0.53.dist-info}/WHEEL
File without changes