vision-agent 0.2.50__py3-none-any.whl → 0.2.51__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -428,12 +428,12 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
 
 
 class EasyToolV2(Agent):
-    r"""EasyToolV2 is an agent framework that utilizes tools as well as self
-    reflection to accomplish tasks, in particular vision tasks. EasyToolV2 is based
-    off of EasyTool https://arxiv.org/abs/2401.06201 and Reflexion
-    https://arxiv.org/abs/2303.11366 where it will attempt to complete a task and then
-    reflect on whether or not it was able to accomplish the task based off of the plan
-    and final results, if not it will redo the task with this newly added reflection.
+    """EasyToolV2 is an agent framework that utilizes tools as well as self reflection
+    to accomplish tasks, in particular vision tasks. EasyToolV2 is based off of EasyTool
+    https://arxiv.org/abs/2401.06201 and Reflexion https://arxiv.org/abs/2303.11366
+    where it will attempt to complete a task and then reflect on whether or not it was
+    able to accomplish the task based off of the plan and final results, if not it will
+    redo the task with this newly added reflection.
 
     Example
     -------
@@ -461,7 +461,10 @@ class EasyToolV2(Agent):
             reflect_model: the model to use for self reflection.
             max_retries: maximum number of retries to attempt to complete the task.
             verbose: whether to print more logs.
-            report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple EasyToolV2 instances are running in parallel. This callback ensures that the progress is not mixed up.
+            report_progress_callback: a callback to report the progress of the agent.
+                This is useful for streaming logs in a web application where multiple
+                EasyToolV2 instances are running in parallel. This callback ensures
+                that the progress is not mixed up.
         """
         self.task_model = (
             OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
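
The hunk above re-wraps the `report_progress_callback` docs. As a rough sketch of wiring the callback up, assuming `EasyToolV2` is importable from `vision_agent.agent` and the constructor accepts the keyword arguments listed in this docstring (the handler and the payload shape below are illustrative, not taken from the source):

    from vision_agent.agent import EasyToolV2

    def log_progress(data):
        # Illustrative handler: tag each record so logs from parallel
        # EasyToolV2 instances can be told apart in a shared stream.
        # The payload type passed to the callback is an assumption here.
        print(f"[agent-1] {data}")

    agent = EasyToolV2(verbose=True, report_progress_callback=log_progress)
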
@@ -495,9 +498,10 @@ class EasyToolV2(Agent):
         """Invoke the vision agent.
 
         Parameters:
-            chat: A conversation in the format of
-                [{"role": "user", "content": "describe your task here..."}].
-            image: The input image referenced in the chat parameter.
+            input: A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}] or a string
+                containing just the content.
+            media: The input media referenced in the chat parameter.
             reference_data: A dictionary containing the reference image, mask or bounding
                 box in the format of:
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
@@ -549,7 +553,7 @@ class EasyToolV2(Agent):
         Parameters:
             chat: A conversation in the format of
                 [{"role": "user", "content": "describe your task here..."}].
-            image: The input image referenced in the chat parameter.
+            media: The media image referenced in the chat parameter.
             reference_data: A dictionary containing the reference image, mask or bounding
                 box in the format of:
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
@@ -558,9 +562,8 @@ class EasyToolV2(Agent):
             self_reflection: boolean to enable and disable self reflection.
 
         Returns:
-            A tuple where the first item is the final answer and the second item is a
-            list of all the tool results. The last item in the tool results also
-            contains the visualized output.
+            Tuple[str, List[Dict]]: A tuple where the first item is the final answer
+                and the second item is a list of all the tool results.
         """
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")
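
Given the `Tuple[str, List[Dict]]` return type now documented above, consuming the result might look like the sketch below. The method name `chat_with_workflow` is an assumption (it is not shown in this hunk), and the chat content and media path are illustrative:

    final_answer, tool_results = agent.chat_with_workflow(
        [{"role": "user", "content": "Find the largest dinosaur."}],
        media="dinos.jpg",
    )
    print(final_answer)      # the final text answer
    print(tool_results[-1])  # the result dict of the last tool call
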
@@ -144,7 +144,7 @@ class Reflexion(Agent):
 
         Parameters:
             input: a prompt that describes the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
-            image: the input image referenced in the prompt parameter.
+            media: the input media referenced in the prompt parameter.
 
         Returns:
             A text response.
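
A hedged call sketch matching the renamed `media` parameter above, assuming `Reflexion` is importable from `vision_agent.agent`, takes a default constructor, and is invoked via `__call__` with these two arguments; the prompt and path are illustrative:

    from vision_agent.agent import Reflexion

    reflexion = Reflexion()
    answer = reflexion("Is the stop sign occluded?", media="intersection.jpg")
    print(answer)  # a text response, per the Returns section above
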
@@ -442,10 +442,10 @@ class VisionAgent(Agent):
         """Chat with Vision Agent and return intermediate information regarding the task.
 
         Parameters:
-            chat (List[Dict[str, str]]): A conversation in the format of
-                [{"role": "user", "content": "describe your task here..."}].
+            input (Union[List[Dict[str, str]], str]): A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}] or a string
+                of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
-            self_reflection (bool): Whether to reflect on the task and debug the code.
 
         Returns:
             str: The code output by the Vision Agent.
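
Per the widened `input` type above, a sketch of invoking the agent with a plain string, assuming `VisionAgent` is importable from `vision_agent.agent` and takes a default constructor; the prompt and media path are illustrative:

    from vision_agent.agent import VisionAgent

    coder = VisionAgent()
    code = coder("Detect all dogs and draw their bounding boxes.", media="dogs.jpg")
    print(code)  # the generated Python code, returned as a str
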
@@ -471,7 +471,8 @@ class VisionAgent(Agent):
                 [{"role": "user", "content": "describe your task here..."}].
             media (Optional[Union[str, Path]]): The media file to be used in the task.
             self_reflection (bool): Whether to reflect on the task and debug the code.
-            show_visualization (bool): If True, it opens a new window locally to show the image(s) created by visualization code (if there is any).
+            display_visualization (bool): If True, it opens a new window locally to
+                show the image(s) created by visualization code (if there is any).
 
         Returns:
             Dict[str, Any]: A dictionary containing the code, test, test result, plan,
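
A sketch of the renamed flag, continuing the `coder` from the previous sketch and assuming `chat_with_workflow` keeps the parameters shown above; the exact dict keys are assumptions based on the Returns description, and the chat content is illustrative:

    results = coder.chat_with_workflow(
        [{"role": "user", "content": "Detect all dogs and draw their bounding boxes."}],
        media="dogs.jpg",
        self_reflection=False,
        display_visualization=True,  # renamed from show_visualization in 0.2.50
    )
    print(results["code"], results["test"])  # key names assumed from the Returns section
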
@@ -75,17 +75,18 @@ def grounding_dino(
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-        bounding box of the detected objects with normalized coordinates between 0 and 1
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
-        xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
 
     Example
     -------
-    >>> grounding_dino("car. dinosaur", image)
-    [
-        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
-    ]
+        >>> grounding_dino("car. dinosaur", image)
+        [
+            {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+        ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
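
To make the doctest above runnable end to end, a hedged sketch combining it with `load_image` (documented later in this diff), assuming both functions are exported from `vision_agent.tools`; the image path is illustrative:

    from vision_agent.tools import grounding_dino, load_image

    image = load_image("scene.jpg")  # np.ndarray, per load_image's signature
    dets = grounding_dino("car. dinosaur", image)  # classes separated by ". "
    for det in dets:
        print(det["label"], det["score"], det["bbox"])  # bbox is normalized xyxy
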
@@ -129,27 +130,27 @@ def grounding_sam(
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
-        xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is a binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is a binary 2D numpy array where 1 indicates the object and 0
+            indicates the background.
 
     Example
     -------
-    >>> grounding_sam("car. dinosaur", image)
-    [
-        {
-            'score': 0.99,
-            'label': 'dinosaur',
-            'bbox': [0.1, 0.11, 0.35, 0.4],
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        },
-    ]
+        >>> grounding_sam("car. dinosaur", image)
+        [
+            {
+                'score': 0.99,
+                'label': 'dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
@@ -187,12 +188,12 @@ def extract_frames(
 
     Returns:
         List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
-        as a numpy array and the timestamp in seconds.
+            as a numpy array and the timestamp in seconds.
 
     Example
     -------
-    >>> extract_frames("path/to/video.mp4")
-    [(frame1, 0.0), (frame2, 0.5), ...]
+        >>> extract_frames("path/to/video.mp4")
+        [(frame1, 0.0), (frame2, 0.5), ...]
     """
 
     return extract_frames_from_video(str(video_uri), fps)
@@ -212,10 +213,10 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-    >>> ocr(image)
-    [
-        {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-    ]
+        >>> ocr(image)
+        [
+            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+        ]
     """
 
     pil_image = Image.fromarray(image).convert("RGB")
@@ -266,9 +267,8 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Example
     -------
-    >>> zero_shot_counting(image)
-    {'count': 45},
-
+        >>> zero_shot_counting(image)
+        {'count': 45},
     """
 
     image_b64 = convert_to_b64(image)
@@ -297,9 +297,8 @@ def visual_prompt_counting(
 
     Example
     -------
-    >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
-    {'count': 45},
-
+        >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+        {'count': 45},
     """
 
     image_size = get_image_size(image)
@@ -332,9 +331,8 @@ def image_question_answering(image: np.ndarray, prompt: str) -> str:
 
     Example
     -------
-    >>> image_question_answering(image, 'What is the cat doing ?')
-    'drinking milk'
-
+        >>> image_question_answering(image, 'What is the cat doing ?')
+        'drinking milk'
     """
 
     image_b64 = convert_to_b64(image)
@@ -363,9 +361,8 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
 
     Example
     -------
-    >>> clip(image, ['dog', 'cat', 'bird'])
-    {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
-
+        >>> clip(image, ['dog', 'cat', 'bird'])
+        {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
     """
 
     image_b64 = convert_to_b64(image)
@@ -391,9 +388,8 @@ def image_caption(image: np.ndarray) -> str:
 
     Example
     -------
-    >>> image_caption(image)
-    'This image contains a cat sitting on a table with a bowl of milk.'
-
+        >>> image_caption(image)
+        'This image contains a cat sitting on a table with a bowl of milk.'
     """
 
     image_b64 = convert_to_b64(image)
@@ -418,8 +414,8 @@ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
 
     Example
     -------
-    >>> closest_mask_distance(mask1, mask2)
-    0.5
+        >>> closest_mask_distance(mask1, mask2)
+        0.5
     """
 
     mask1 = np.clip(mask1, 0, 1)
@@ -474,8 +470,8 @@ def closest_box_distance(
 
     Example
     -------
-    >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
-    141.42
+        >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
+        141.42
     """
 
     x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
@@ -499,7 +495,7 @@ def save_json(data: Any, file_path: str) -> None:
 
     Example
     -------
-    >>> save_json(data, "path/to/file.json")
+        >>> save_json(data, "path/to/file.json")
     """
 
     class NumpyEncoder(json.JSONEncoder):
@@ -525,7 +521,7 @@ def load_image(image_path: str) -> np.ndarray:
 
     Example
     -------
-    >>> load_image("path/to/image.jpg")
+        >>> load_image("path/to/image.jpg")
     """
     # NOTE: sometimes the generated code passes in a NumPy array
     if isinstance(image_path, np.ndarray):
@@ -545,8 +541,8 @@ def save_image(image: np.ndarray) -> str:
 
     Example
     -------
-    >>> save_image(image)
-    "/tmp/tmpabc123.png"
+        >>> save_image(image)
+        "/tmp/tmpabc123.png"
     """
     from IPython.display import display
 
@@ -572,8 +568,8 @@ def save_video(
 
     Example
     -------
-    >>> save_video(frames)
-    "/tmp/tmpvideo123.mp4"
+        >>> save_video(frames)
+        "/tmp/tmpvideo123.mp4"
     """
     if fps <= 0:
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
@@ -619,9 +615,9 @@ def overlay_bounding_boxes(
 
     Example
     -------
-    >>> image_with_bboxes = overlay_bounding_boxes(
-        image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
-    )
+        >>> image_with_bboxes = overlay_bounding_boxes(
+            image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8))
 
@@ -675,18 +671,18 @@ def overlay_segmentation_masks(
 
     Example
     -------
-    >>> image_with_masks = overlay_segmentation_masks(
-        image,
-        [{
-            'score': 0.99,
-            'label': 'dinosaur',
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        }],
-    )
+        >>> image_with_masks = overlay_segmentation_masks(
+            image,
+            [{
+                'score': 0.99,
+                'label': 'dinosaur',
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            }],
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
 
@@ -727,16 +723,16 @@ def overlay_heat_map(
 
     Example
     -------
-    >>> image_with_heat_map = overlay_heat_map(
-        image,
-        {
-            'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
-        },
-    )
+        >>> image_with_heat_map = overlay_heat_map(
+            image,
+            {
+                'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
+            },
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
 
@@ -63,9 +63,9 @@ def extract_frames_from_video(
 
     Returns:
         a list of tuples containing the extracted frame and the timestamp in seconds.
-        E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
-        from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
-        the video. The frames are sorted by the timestamp in ascending order.
+            E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
+            from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
+            the video. The frames are sorted by the timestamp in ascending order.
     """
     with VideoFileClip(video_uri) as video:
         video_duration: float = video.duration
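
The `extract_frames` wrapper shown earlier in this diff calls `extract_frames_from_video(str(video_uri), fps)`, so a direct-usage sketch would look like the following (the module path is taken from the RECORD below; the video path and fps value are illustrative):

    from vision_agent.utils.video import extract_frames_from_video

    # Returns [(frame, timestamp_in_seconds), ...] sorted by timestamp.
    for frame, ts in extract_frames_from_video("path/to/video.mp4", 0.5):
        print(f"{ts:.3f}s -> frame shape {frame.shape}")
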
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.50
+Version: 0.2.51
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -7,11 +7,11 @@ vision_agent/agent/data_interpreter.py,sha256=YlCm3DVyhCM9T6wpccWxC5XHoIj9smsEsk
 vision_agent/agent/data_interpreter_prompts.py,sha256=RDJggOfXwGaEoIcTYGX41ZEayCgYei1AootDOc_SN2g,6134
 vision_agent/agent/easytool.py,sha256=wMa9-tpAaiC4E2ONbidxmMM9YvAOw4_Sypf5mGKco_w,11526
 vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
-vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFIypCA,27274
+vision_agent/agent/easytool_v2.py,sha256=LY2cqzjVHBr7QMn4WsrZ7AfpWrDN0LjJIrd5tMo2-PI,27323
 vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
-vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
+vision_agent/agent/reflexion.py,sha256=scck3YcME6DhX5Vs4Wr1rYb8S4wkBUkN9UksyazfrZg,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=0EqpLyyzpRGmT7fhS2XvLeUlktgCXTE5k1KGMQ8z3_s,20963
+vision_agent/agent/vision_agent.py,sha256=wGGISg6pDVNseF2fIAN1jH66OX2qZk2nDhuobeSNGHk,20957
 vision_agent/agent/vision_agent_prompts.py,sha256=hgnTlaYp2HMBHLi3e4faPb-DI5jQL9jfhKq9jyEUEgY,8370
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=Sng6dChynJJCYWjraXXM0tep_VPdnYl3L9vb0HMy_P
 vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
-vision_agent/tools/tools.py,sha256=Sc6tAYbH03TbrPKAT8XIj1YZIwhd9j2k4ia8iKHhxzM,26743
+vision_agent/tools/tools.py,sha256=L1_umAVxk_BlrDYEmV2eyu2cJnpieTW-Ipb03VwKqWU,27062
 vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
 vision_agent/utils/execute.py,sha256=GqoAodxtwTPBr1nujPTsWiZO2rBGvWVXTe8lgxY4d_g,20603
 vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
 vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
-vision_agent/utils/video.py,sha256=BJ9fomy2giAl038JThQP1WQZ-u4J4J_nsZB7QEWvlcQ,8767
-vision_agent-0.2.50.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.50.dist-info/METADATA,sha256=nLyeSFYnn4Bv_RyKzrP5iqnCRRkwCZT_d3euN1zgBOA,6817
-vision_agent-0.2.50.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.50.dist-info/RECORD,,
+vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
+vision_agent-0.2.51.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.51.dist-info/METADATA,sha256=xUYxi6YH3U4QTlYNWZ51YI365ER6NANcYBiVeXN4egQ,6817
+vision_agent-0.2.51.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.51.dist-info/RECORD,,