doc-page-extractor 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

@@ -0,0 +1,71 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
def get_rotate_crop_image(img, points):
    """Warp the quadrilateral ``points`` out of ``img`` into a rectangular crop.

    The four corners are mapped, in the order given, onto the output corners
    ``(0, 0)``, ``(w, 0)``, ``(w, h)`` and ``(0, h)``, where ``w`` and ``h``
    are the longer of the two opposing edge lengths of the quadrilateral.

    Args:
        img: Source image array handed to ``cv2.warpPerspective``.
        points: Four ``(x, y)`` corners. Any array-like is accepted; the
            values are coerced to ``float32`` because
            ``cv2.getPerspectiveTransform`` only accepts that dtype.

    Returns:
        The warped crop. When its height/width ratio is at least 1.5 the
        crop is rotated once with ``np.rot90`` before being returned.

    Raises:
        AssertionError: If ``points`` does not contain exactly four entries.
    """
    # A float32 ndarray passes through unchanged; lists and arrays of other
    # dtypes are converted so the arithmetic and the OpenCV call both work.
    points = np.asarray(points, dtype=np.float32)
    assert len(points) == 4, "shape of points must be 4*2"

    # Output width: the longer of the edges 0-1 and 2-3.
    img_crop_width = int(
        max(
            np.linalg.norm(points[0] - points[1]),
            np.linalg.norm(points[2] - points[3]),
        )
    )
    # Output height: the longer of the edges 0-3 and 1-2.
    img_crop_height = int(
        max(
            np.linalg.norm(points[0] - points[3]),
            np.linalg.norm(points[1] - points[2]),
        )
    )

    # Destination corners, in the same order as the source corners.
    pts_std = np.float32(
        [
            [0, 0],
            [img_crop_width, 0],
            [img_crop_width, img_crop_height],
            [0, img_crop_height],
        ]
    )
    M = cv2.getPerspectiveTransform(points, pts_std)
    dst_img = cv2.warpPerspective(
        img,
        M,
        (img_crop_width, img_crop_height),
        borderMode=cv2.BORDER_REPLICATE,
        flags=cv2.INTER_CUBIC,
    )

    dst_img_height, dst_img_width = dst_img.shape[0:2]
    # Crops that are much taller than wide are turned by 90 degrees.
    if dst_img_height * 1.0 / dst_img_width >= 1.5:
        dst_img = np.rot90(dst_img)
    return dst_img
46
+
47
+
48
def get_minarea_rect_crop(img, points):
    """Crop ``img`` to the minimum-area rotated rectangle around ``points``.

    The points are cast to ``int32`` and passed to ``cv2.minAreaRect``; the
    four corners from ``cv2.boxPoints`` are then ordered and handed to
    ``get_rotate_crop_image``, whose result is returned.
    """
    rect = cv2.minAreaRect(np.array(points).astype(np.int32))
    # Sort by x: the first two corners are the left pair, the last two the
    # right pair.
    corners = sorted(cv2.boxPoints(rect), key=lambda corner: corner[0])

    # Within each pair, the corner with the smaller y comes first.
    if corners[1][1] > corners[0][1]:
        left_upper, left_lower = corners[0], corners[1]
    else:
        left_upper, left_lower = corners[1], corners[0]

    if corners[3][1] > corners[2][1]:
        right_upper, right_lower = corners[2], corners[3]
    else:
        right_upper, right_lower = corners[3], corners[2]

    # Order expected by get_rotate_crop_image: walk around the rectangle
    # starting at the upper-left corner.
    ordered = [left_upper, right_upper, right_lower, left_lower]
    return get_rotate_crop_image(img, np.array(ordered))
69
+
70
def str2bool(v):
    """Interpret ``v`` as a boolean flag.

    Args:
        v: A string such as ``"true"``, ``"t"`` or ``"1"`` (case-insensitive),
            or an actual ``bool``.

    Returns:
        ``v`` itself when it is already a ``bool``; otherwise ``True`` when
        the lower-cased string is one of ``"true"``, ``"t"``, ``"1"`` and
        ``False`` for every other string.
    """
    # A real bool has no .lower(); pass it through instead of raising.
    if isinstance(v, bool):
        return v
    return v.lower() in ("true", "t", "1")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: doc-page-extractor
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -9,11 +9,11 @@ Description-Content-Type: text/markdown
9
9
  License-File: LICENSE
10
10
  Requires-Dist: opencv-python<5.0,>=4.11.0
11
11
  Requires-Dist: pillow<11.0,>=10.3
12
- Requires-Dist: numpy<1.26,>=1.24.0
12
+ Requires-Dist: pyclipper<2.0,>=1.2.0
13
+ Requires-Dist: numpy<2.0,>=1.24.0
13
14
  Requires-Dist: shapely<3.0,>=2.0.0
14
15
  Requires-Dist: transformers<5.0,>=4.48.0
15
16
  Requires-Dist: doclayout_yolo>=0.0.3
16
- Requires-Dist: paddleocr==2.9.0
17
17
  Dynamic: author
18
18
  Dynamic: author-email
19
19
  Dynamic: description
@@ -36,10 +36,20 @@ doc page extractor can identify text and format in images and return structured
36
36
  pip install doc-page-extractor
37
37
  ```
38
38
 
39
+ ```shell
40
+ pip install onnxruntime==1.21.0
41
+ ```
42
+
39
43
  ## Using CUDA
40
44
 
41
45
  Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
42
46
 
47
+ In addition, replace the `onnxruntime` installation command from the previous section with the following:
48
+
49
+ ```shell
50
+ pip install onnxruntime-gpu==1.21.0
51
+ ```
52
+
43
53
  ## Example
44
54
 
45
55
  ```python
@@ -48,7 +58,7 @@ from doc_page_extractor import DocExtractor
48
58
 
49
59
  extractor = DocExtractor(
50
60
  model_dir_path=model_path, # Directory where the AI models are downloaded and installed
51
- device="cpu", # If you want to use CUDA, please change to device="cuda:0".
61
+ device="cpu", # If you want to use CUDA, please change to device="cuda".
52
62
  )
53
63
  with Image.open("/path/to/your/image.png") as image:
54
64
  result = extractor.extract(
@@ -62,6 +72,8 @@ for layout in result.layouts:
62
72
 
63
73
  ## Acknowledgements
64
74
 
75
+ The code of `doc_page_extractor/onnxocr` in this repo comes from [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR).
76
+
65
77
  - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
66
- - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
78
+ - [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR)
67
79
  - [layoutreader](https://github.com/ppaanngggg/layoutreader)
@@ -0,0 +1,33 @@
1
+ doc_page_extractor/__init__.py,sha256=jCf5lo3A9JVDquflYMlvH8nJIs3EjBt8AG5y8mwfS68,210
2
+ doc_page_extractor/clipper.py,sha256=PDafB_9JGwV_dRY7oWe1yf44roSsEaCuEdg3VRMvJNo,3125
3
+ doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
4
+ doc_page_extractor/extractor.py,sha256=D3SLWUAciq8jGU6mlkVwIon-4nHJaYoKpPjGCN_YLhQ,10055
5
+ doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
6
+ doc_page_extractor/ocr.py,sha256=6eLUVx6NSuRAwrq8Mc2zYs3yocxpOgUQS_4LIIqywnQ,5147
7
+ doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
8
+ doc_page_extractor/overlap.py,sha256=9_WbHxbKIbHM6R3ZUP2YG33pZlbLCHgwFb--NF3cCG0,5155
9
+ doc_page_extractor/plot.py,sha256=R8hbmdGjtw2pAH1lJkGc7Qbis4aRaaAkrkEo6WjbqyM,1378
10
+ doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
11
+ doc_page_extractor/rectangle.py,sha256=Tp__NPiY6JlYwYxejST7BUXhv_bl8tkmDXi4JgHCK6E,1539
12
+ doc_page_extractor/rotation.py,sha256=Dp8rXfgCzHQwqlAbU-uQt-zHC6Jm9KsIjcR6IhFQ5EU,4284
13
+ doc_page_extractor/types.py,sha256=UWghDwajMtEKEYUcOOjr5dM-MWiJ-P-8nWYagouf9ds,631
14
+ doc_page_extractor/utils.py,sha256=3rtIxiTJ7W5yOuY0UHedUJ3G34tPOw0jdHnUdOQ1tWI,207
15
+ doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
16
+ doc_page_extractor/onnxocr/cls_postprocess.py,sha256=o879Ned0RMUERYLviuToZ0xTvhn2UsYAb-yPC5gj8h4,822
17
+ doc_page_extractor/onnxocr/db_postprocess.py,sha256=R3yXXfReiQgLaYIvvfnrFfshI202LjHMvcZwcLpjmTY,7913
18
+ doc_page_extractor/onnxocr/imaug.py,sha256=Q192kIsRPI5zTm4RA_UUXlo6tvGJS8wrUaa-xrfnO_w,811
19
+ doc_page_extractor/onnxocr/operators.py,sha256=0nLiV1dWej9vdPa_DO04F7SvqF-l9NOFgHUuHUPNvsw,5556
20
+ doc_page_extractor/onnxocr/predict_base.py,sha256=LzRSPgxgFSRAreJOMpDTUKuBvvO_Qe5_5tK4lNuGl-w,1269
21
+ doc_page_extractor/onnxocr/predict_cls.py,sha256=28MliSQIyHc82EUbdkQb31KaB90rSzSmps1v6WsskQk,3065
22
+ doc_page_extractor/onnxocr/predict_det.py,sha256=VYsvNbCJQi1UuetwvR_hx-U7JScHyFjmJmo8YwHyQt4,4092
23
+ doc_page_extractor/onnxocr/predict_rec.py,sha256=qQrCs5jzCf5PYp-iEKJ53pcx_xRoJdJyavPvsvuh5Ic,10999
24
+ doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
25
+ doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
26
+ doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
27
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
29
+ doc_page_extractor-0.0.7.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
30
+ doc_page_extractor-0.0.7.dist-info/METADATA,sha256=s-ewJAyPQ1I_fgTee91NN99T42HcAaKFu1MAUhZKqdk,2203
31
+ doc_page_extractor-0.0.7.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
32
+ doc_page_extractor-0.0.7.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
33
+ doc_page_extractor-0.0.7.dist-info/RECORD,,
@@ -1,21 +0,0 @@
1
- doc_page_extractor/__init__.py,sha256=CeBP-ggpQ-MIMjaOm_MDNNkFfrlDyJshYDgYGjY8OoI,222
2
- doc_page_extractor/clipper.py,sha256=PDafB_9JGwV_dRY7oWe1yf44roSsEaCuEdg3VRMvJNo,3125
3
- doc_page_extractor/downloader.py,sha256=0rI8Ysafkq2jPpWm_FLWli68WTuBxxQYHAPJ8jn16sM,294
4
- doc_page_extractor/extractor.py,sha256=rksJO7nS0gmxCW_Xi3I-SLvh0c4WfYZQg8P1TZhdy8E,10328
5
- doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
6
- doc_page_extractor/ocr.py,sha256=KiT2rVALlJvIuW0p7fIs3Anrtmxutt4PZwlrZTSgYGY,4001
7
- doc_page_extractor/ocr_corrector.py,sha256=gaI_nj9xFGFp5KV5EBUZ3qtjG2pVRi8EMU_UT0obvVc,3953
8
- doc_page_extractor/overlap.py,sha256=9_WbHxbKIbHM6R3ZUP2YG33pZlbLCHgwFb--NF3cCG0,5155
9
- doc_page_extractor/plot.py,sha256=R8hbmdGjtw2pAH1lJkGc7Qbis4aRaaAkrkEo6WjbqyM,1378
10
- doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
11
- doc_page_extractor/rectangle.py,sha256=Tp__NPiY6JlYwYxejST7BUXhv_bl8tkmDXi4JgHCK6E,1539
12
- doc_page_extractor/rotation.py,sha256=Dp8rXfgCzHQwqlAbU-uQt-zHC6Jm9KsIjcR6IhFQ5EU,4284
13
- doc_page_extractor/types.py,sha256=UWghDwajMtEKEYUcOOjr5dM-MWiJ-P-8nWYagouf9ds,631
14
- doc_page_extractor/utils.py,sha256=3rtIxiTJ7W5yOuY0UHedUJ3G34tPOw0jdHnUdOQ1tWI,207
15
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
17
- doc_page_extractor-0.0.5.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
18
- doc_page_extractor-0.0.5.dist-info/METADATA,sha256=m5uJPWjeEqkoPqevT-M79nyFnrh6UmiOAgxY9Dgx34U,1882
19
- doc_page_extractor-0.0.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
20
- doc_page_extractor-0.0.5.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
21
- doc_page_extractor-0.0.5.dist-info/RECORD,,