PyPI - vision-agent - Versions diffs - 0.2.102__py3-none-any.whl → 0.2.103__py3-none-any.whl - Mend

vision-agent 0.2.102__py3-none-any.whl → 0.2.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

vision_agent/tools/__init__.py CHANGED Viewed

@@ -19,6 +19,7 @@ from .tools import (
     florencev2_image_caption,
     florencev2_object_detection,
     florencev2_roberta_vqa,
+    florencev2_ocr,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,

vision_agent/tools/tools.py CHANGED Viewed

@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
     denormalize_bbox,
     get_image_size,
     normalize_bbox,
+    convert_quad_box_to_bbox,
     rle_decode,
 )
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
     return return_data
+def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
+    Each text region contains one line of text. It returns a list of detected text,
+    the text region as a bounding box with normalized coordinates, and confidence
+    scores. The results are sorted from top-left to bottom right.
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+            with nornmalized coordinates, and confidence score.
+    Example
+    -------
+        >>> florencev2_ocr(image)
+        [
+            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+        ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "task": "<OCR_WITH_REGION>",
+        "function_name": "florencev2_ocr",
+    }
+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<OCR_WITH_REGION>"]
+    return_data = []
+    for i in range(len(detections["quad_boxes"])):
+        return_data.append(
+            {
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(
+                    convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
+                ),
+                "score": 1.0,
+            }
+        )
+    return return_data
 def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     """'detr_segmentation' is a tool that can segment common objects in an
     image without any text prompt. It returns a list of detected objects
@@ -1248,6 +1294,7 @@ TOOLS = [
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
+    florencev2_ocr,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,

vision_agent/utils/image_utils.py CHANGED Viewed

@@ -140,6 +140,23 @@ def denormalize_bbox(
         return bbox
+def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
+    r"""Convert a quadrilateral bounding box to a rectangular bounding box.
+    Parameters:
+        quad_box: the quadrilateral bounding box
+    Returns:
+        The rectangular bounding box
+    """
+    x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
+    x_min = min(x1, x2, x3, x4)
+    x_max = max(x1, x2, x3, x4)
+    y_min = min(y1, y2, y3, y4)
+    y_max = max(y1, y2, y3, y4)
+    return [x_min, y_min, x_max, y_max]
 def overlay_bboxes(
     image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
 ) -> ImageType:

{vision_agent-0.2.102.dist-info → vision_agent-0.2.103.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.102
+Version: 0.2.103
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.102.dist-info → vision_agent-0.2.103.dist-info}/RECORD RENAMED Viewed

@@ -14,20 +14,20 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
 vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
 vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
-vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnvM,1906
+vision_agent/tools/__init__.py,sha256=MK0D8NtIChwGHwqsTz3LeV5BGuQecNVrNzUsyaEwuGA,1926
 vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
 vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
-vision_agent/tools/tools.py,sha256=jWWioqBNsoNaGa8WKVldKBk_y9ZD1shO52kSE-26MFc,43111
+vision_agent/tools/tools.py,sha256=fgPE0VHfBiQPJKkslBm_hugTOyRT-Hnw7eztvC-l4_o,44661
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
 vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
 vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
-vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
+vision_agent/utils/image_utils.py,sha256=c_g5i_cFC0C-Yw9gU_NaVgQdmBlyumw3bLIDtCU42xo,8200
 vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.102.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.102.dist-info/METADATA,sha256=NUnuzJmGX6d9tboY_RafIVOLAhcT_phRqcNh8Xgwd2Q,10729
-vision_agent-0.2.102.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.102.dist-info/RECORD,,
+vision_agent-0.2.103.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.103.dist-info/METADATA,sha256=DfZa2bcKHvQxsgAJRBdIEpPGdBjt18TuOwMzXOUIV_w,10729
+vision_agent-0.2.103.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.103.dist-info/RECORD,,

{vision_agent-0.2.102.dist-info → vision_agent-0.2.103.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.102.dist-info → vision_agent-0.2.103.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.2.102__py3-none-any.whl → 0.2.103__py3-none-any.whl

vision-agent 0.2.102__py3-none-any.whl → 0.2.103__py3-none-any.whl