PyPI - vision-agent - Versions diffs - 0.2.101__tar.gz → 0.2.103__tar.gz - Mend

vision-agent 0.2.101.tar.gz → 0.2.103.tar.gz

Files changed (33) hide show

{vision_agent-0.2.101 → vision_agent-0.2.103}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.101
+Version: 0.2.103
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.101 → vision_agent-0.2.103}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.101"
+version = "0.2.103"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent_coder.py RENAMED Viewed

@@ -735,8 +735,17 @@ class VisionAgentCoder(Agent):
             if self.verbosity >= 1:
                 for p in plans:
+                    # tabulate will fail if the keys are not the same for all elements
+                    p_fixed = [
+                        {
+                            "instructions": (
+                                e["instructions"] if "instructions" in e else ""
+                            )
+                        }
+                        for e in plans[p]
+                    ]
                     _LOGGER.info(
-                        f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                        f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                     )
             tool_infos = retrieve_tools(

{vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/__init__.py RENAMED Viewed

@@ -19,6 +19,7 @@ from .tools import (
     florencev2_image_caption,
     florencev2_object_detection,
     florencev2_roberta_vqa,
+    florencev2_ocr,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,

{vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/tools.py RENAMED Viewed

@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
     denormalize_bbox,
     get_image_size,
     normalize_bbox,
+    convert_quad_box_to_bbox,
     rle_decode,
 )
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
     return return_data
+def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
+    Each text region contains one line of text. It returns a list of detected text,
+    the text region as a bounding box with normalized coordinates, and confidence
+    scores. The results are sorted from top-left to bottom-right.
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+            with normalized coordinates, and confidence score.
+    Example
+    -------
+        >>> florencev2_ocr(image)
+        [
+            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+        ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "task": "<OCR_WITH_REGION>",
+        "function_name": "florencev2_ocr",
+    }
+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<OCR_WITH_REGION>"]
+    return_data = []
+    for i in range(len(detections["quad_boxes"])):
+        return_data.append(
+            {
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(
+                    convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
+                ),
+                "score": 1.0,
+            }
+        )
+    return return_data
 def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     """'detr_segmentation' is a tool that can segment common objects in an
     image without any text prompt. It returns a list of detected objects
@@ -1248,6 +1294,7 @@ TOOLS = [
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
+    florencev2_ocr,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,

{vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/image_utils.py RENAMED Viewed

@@ -140,6 +140,23 @@ def denormalize_bbox(
         return bbox
+def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
+    r"""Convert a quadrilateral bounding box to a rectangular bounding box.
+    Parameters:
+        quad_box: the quadrilateral bounding box, as [x1, y1, x2, y2, x3, y3, x4, y4]
+    Returns:
+        The axis-aligned rectangular bounding box, as [x_min, y_min, x_max, y_max]
+    """
+    x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
+    x_min = min(x1, x2, x3, x4)
+    x_max = max(x1, x2, x3, x4)
+    y_min = min(y1, y2, y3, y4)
+    y_max = max(y1, y2, y3, y4)
+    return [x_min, y_min, x_max, y_max]
 def overlay_bboxes(
     image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
 ) -> ImageType: