vision-agent 0.2.102__py3-none-any.whl → 0.2.103__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -19,6 +19,7 @@ from .tools import (
19
19
  florencev2_image_caption,
20
20
  florencev2_object_detection,
21
21
  florencev2_roberta_vqa,
22
+ florencev2_ocr,
22
23
  generate_pose_image,
23
24
  generate_soft_edge_image,
24
25
  get_tool_documentation,
@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
28
28
  denormalize_bbox,
29
29
  get_image_size,
30
30
  normalize_bbox,
31
+ convert_quad_box_to_bbox,
31
32
  rle_decode,
32
33
  )
33
34
 
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
652
653
  return return_data
653
654
 
654
655
 
656
def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
    Each text region contains one line of text. It returns a list of detected text,
    the text region as a bounding box with normalized coordinates, and a score.
    The results keep the order returned by the model, which is expected to be
    top-left to bottom-right; this function does not re-sort them.

    Parameters:
        image (np.ndarray): The image to extract text from.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the detected text
            ('label'), the bbox with normalized coordinates ('bbox'), and a score
            ('score'). The score is always 1.0: it is a placeholder, because no
            confidence value is read from the model response.

    Example
    -------
        >>> florencev2_ocr(image)
        [
            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 1.0},
        ]
    """

    # First two array dimensions; assumed to be (height, width) of the image.
    image_size = image.shape[:2]
    data = {
        "image": convert_to_b64(image),
        "task": "<OCR_WITH_REGION>",
        "function_name": "florencev2_ocr",
    }

    response = send_inference_request(data, "florence2", v2=True)
    # The payload is keyed by the task token that was sent in the request.
    ocr_result = response["<OCR_WITH_REGION>"]

    # "quad_boxes" and "labels" are parallel lists: entry i of each describes
    # the same text region. Each quadrilateral is collapsed to its enclosing
    # rectangle before being normalized against the image size.
    return [
        {
            "label": ocr_result["labels"][i],
            "bbox": normalize_bbox(convert_quad_box_to_bbox(quad_box), image_size),
            "score": 1.0,
        }
        for i, quad_box in enumerate(ocr_result["quad_boxes"])
    ]
699
+
700
+
655
701
  def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
656
702
  """'detr_segmentation' is a tool that can segment common objects in an
657
703
  image without any text prompt. It returns a list of detected objects
@@ -1248,6 +1294,7 @@ TOOLS = [
1248
1294
  loca_visual_prompt_counting,
1249
1295
  florencev2_roberta_vqa,
1250
1296
  florencev2_image_caption,
1297
+ florencev2_ocr,
1251
1298
  detr_segmentation,
1252
1299
  depth_anything_v2,
1253
1300
  generate_soft_edge_image,
@@ -140,6 +140,23 @@ def denormalize_bbox(
140
140
  return bbox
141
141
 
142
142
 
143
def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
    r"""Convert a quadrilateral bounding box to a rectangular bounding box.

    The result is the smallest axis-aligned rectangle that contains all four
    corners of the quadrilateral.

    Parameters:
        quad_box: the quadrilateral bounding box as eight numbers laid out as
            [x1, y1, x2, y2, x3, y3, x4, y4], i.e. four (x, y) corners.

    Returns:
        The rectangular bounding box as [x_min, y_min, x_max, y_max].

    Raises:
        ValueError: if quad_box does not contain exactly eight values.
    """
    coords = list(quad_box)
    if len(coords) != 8:
        # Fail with an explicit message instead of a bare unpacking error.
        raise ValueError(
            "quad_box must contain 8 values [x1, y1, x2, y2, x3, y3, x4, y4], "
            f"got {len(coords)}"
        )
    # Even positions hold the x coordinates, odd positions the y coordinates.
    xs = coords[0::2]
    ys = coords[1::2]
    return [min(xs), min(ys), max(xs), max(ys)]
158
+
159
+
143
160
  def overlay_bboxes(
144
161
  image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
145
162
  ) -> ImageType:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.102
3
+ Version: 0.2.103
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -14,20 +14,20 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
14
14
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
15
15
  vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
16
16
  vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
17
- vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnvM,1906
17
+ vision_agent/tools/__init__.py,sha256=MK0D8NtIChwGHwqsTz3LeV5BGuQecNVrNzUsyaEwuGA,1926
18
18
  vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
19
19
  vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
20
20
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
21
21
  vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
22
- vision_agent/tools/tools.py,sha256=jWWioqBNsoNaGa8WKVldKBk_y9ZD1shO52kSE-26MFc,43111
22
+ vision_agent/tools/tools.py,sha256=fgPE0VHfBiQPJKkslBm_hugTOyRT-Hnw7eztvC-l4_o,44661
23
23
  vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
24
24
  vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
25
25
  vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
26
- vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
26
+ vision_agent/utils/image_utils.py,sha256=c_g5i_cFC0C-Yw9gU_NaVgQdmBlyumw3bLIDtCU42xo,8200
27
27
  vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
28
28
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
29
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
30
- vision_agent-0.2.102.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.102.dist-info/METADATA,sha256=NUnuzJmGX6d9tboY_RafIVOLAhcT_phRqcNh8Xgwd2Q,10729
32
- vision_agent-0.2.102.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.102.dist-info/RECORD,,
30
+ vision_agent-0.2.103.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.103.dist-info/METADATA,sha256=DfZa2bcKHvQxsgAJRBdIEpPGdBjt18TuOwMzXOUIV_w,10729
32
+ vision_agent-0.2.103.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.103.dist-info/RECORD,,