vision-agent 0.2.102__py3-none-any.whl → 0.2.103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@ from .tools import (
19
19
  florencev2_image_caption,
20
20
  florencev2_object_detection,
21
21
  florencev2_roberta_vqa,
22
+ florencev2_ocr,
22
23
  generate_pose_image,
23
24
  generate_soft_edge_image,
24
25
  get_tool_documentation,
@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
28
28
  denormalize_bbox,
29
29
  get_image_size,
30
30
  normalize_bbox,
31
+ convert_quad_box_to_bbox,
31
32
  rle_decode,
32
33
  )
33
34
 
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
652
653
  return return_data
653
654
 
654
655
 
656
+ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
657
+ """'florencev2_ocr' is a tool that can detect text and text regions in an image.
658
+ Each text region contains one line of text. It returns a list of detected text,
659
+ the text region as a bounding box with normalized coordinates, and confidence
660
+ scores. The results are sorted from top-left to bottom right.
661
+
662
+ Parameters:
663
+ image (np.ndarray): The image to extract text from.
664
+
665
+ Returns:
666
+ List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
667
+ with nornmalized coordinates, and confidence score.
668
+
669
+ Example
670
+ -------
671
+ >>> florencev2_ocr(image)
672
+ [
673
+ {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
674
+ ]
675
+ """
676
+
677
+ image_size = image.shape[:2]
678
+ image_b64 = convert_to_b64(image)
679
+ data = {
680
+ "image": image_b64,
681
+ "task": "<OCR_WITH_REGION>",
682
+ "function_name": "florencev2_ocr",
683
+ }
684
+
685
+ detections = send_inference_request(data, "florence2", v2=True)
686
+ detections = detections["<OCR_WITH_REGION>"]
687
+ return_data = []
688
+ for i in range(len(detections["quad_boxes"])):
689
+ return_data.append(
690
+ {
691
+ "label": detections["labels"][i],
692
+ "bbox": normalize_bbox(
693
+ convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
694
+ ),
695
+ "score": 1.0,
696
+ }
697
+ )
698
+ return return_data
699
+
700
+
655
701
  def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
656
702
  """'detr_segmentation' is a tool that can segment common objects in an
657
703
  image without any text prompt. It returns a list of detected objects
@@ -1248,6 +1294,7 @@ TOOLS = [
1248
1294
  loca_visual_prompt_counting,
1249
1295
  florencev2_roberta_vqa,
1250
1296
  florencev2_image_caption,
1297
+ florencev2_ocr,
1251
1298
  detr_segmentation,
1252
1299
  depth_anything_v2,
1253
1300
  generate_soft_edge_image,
@@ -140,6 +140,23 @@ def denormalize_bbox(
140
140
  return bbox
141
141
 
142
142
 
143
+ def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
144
+ r"""Convert a quadrilateral bounding box to a rectangular bounding box.
145
+
146
+ Parameters:
147
+ quad_box: the quadrilateral bounding box
148
+
149
+ Returns:
150
+ The rectangular bounding box
151
+ """
152
+ x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
153
+ x_min = min(x1, x2, x3, x4)
154
+ x_max = max(x1, x2, x3, x4)
155
+ y_min = min(y1, y2, y3, y4)
156
+ y_max = max(y1, y2, y3, y4)
157
+ return [x_min, y_min, x_max, y_max]
158
+
159
+
143
160
  def overlay_bboxes(
144
161
  image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
145
162
  ) -> ImageType:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.102
3
+ Version: 0.2.103
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -14,20 +14,20 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
14
14
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
15
15
  vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
16
16
  vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
17
- vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnvM,1906
17
+ vision_agent/tools/__init__.py,sha256=MK0D8NtIChwGHwqsTz3LeV5BGuQecNVrNzUsyaEwuGA,1926
18
18
  vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
19
19
  vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
20
20
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
21
21
  vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
22
- vision_agent/tools/tools.py,sha256=jWWioqBNsoNaGa8WKVldKBk_y9ZD1shO52kSE-26MFc,43111
22
+ vision_agent/tools/tools.py,sha256=fgPE0VHfBiQPJKkslBm_hugTOyRT-Hnw7eztvC-l4_o,44661
23
23
  vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
24
24
  vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
25
25
  vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
26
- vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
26
+ vision_agent/utils/image_utils.py,sha256=c_g5i_cFC0C-Yw9gU_NaVgQdmBlyumw3bLIDtCU42xo,8200
27
27
  vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
28
28
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
29
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
30
- vision_agent-0.2.102.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.102.dist-info/METADATA,sha256=NUnuzJmGX6d9tboY_RafIVOLAhcT_phRqcNh8Xgwd2Q,10729
32
- vision_agent-0.2.102.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.102.dist-info/RECORD,,
30
+ vision_agent-0.2.103.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.103.dist-info/METADATA,sha256=DfZa2bcKHvQxsgAJRBdIEpPGdBjt18TuOwMzXOUIV_w,10729
32
+ vision_agent-0.2.103.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.103.dist-info/RECORD,,