doc-page-extractor 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

@@ -0,0 +1,71 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
def get_rotate_crop_image(img, points):
    """Warp the quadrilateral ``points`` out of ``img`` into an upright crop.

    Args:
        img: Source image handed to ``cv2.warpPerspective``.
        points: Four (x, y) corners. They are mapped, in order, onto the
            (0, 0), (width, 0), (width, height) and (0, height) corners of
            the output. Any array-like is accepted; it is converted to
            ``float32`` before being given to OpenCV.

    Returns:
        The warped crop. When its height is at least 1.5 times its width the
        crop is turned by 90 degrees with ``np.rot90`` before it is returned.

    Raises:
        AssertionError: If ``points`` does not contain exactly four entries.
    """
    assert len(points) == 4, "shape of points must be 4*2"
    # cv2.getPerspectiveTransform only accepts float32 corners; converting
    # here lets callers pass lists, integer arrays or float64 arrays.
    points = np.asarray(points, dtype=np.float32)

    # Output size: the longer of each pair of opposite edges, truncated to int.
    img_crop_width = int(
        max(
            np.linalg.norm(points[0] - points[1]),
            np.linalg.norm(points[2] - points[3]),
        )
    )
    img_crop_height = int(
        max(
            np.linalg.norm(points[0] - points[3]),
            np.linalg.norm(points[1] - points[2]),
        )
    )
    # NOTE(review): a quadrilateral with every edge shorter than one pixel
    # yields a zero-sized target here; confirm callers never pass one.
    pts_std = np.float32(
        [
            [0, 0],
            [img_crop_width, 0],
            [img_crop_width, img_crop_height],
            [0, img_crop_height],
        ]
    )
    M = cv2.getPerspectiveTransform(points, pts_std)
    dst_img = cv2.warpPerspective(
        img,
        M,
        (img_crop_width, img_crop_height),
        borderMode=cv2.BORDER_REPLICATE,
        flags=cv2.INTER_CUBIC,
    )
    dst_img_height, dst_img_width = dst_img.shape[0:2]
    # Tall, narrow crops are rotated so they come out wider than tall.
    if dst_img_height * 1.0 / dst_img_width >= 1.5:
        dst_img = np.rot90(dst_img)
    return dst_img
46
+
47
+
48
def get_minarea_rect_crop(img, points):
    """Crop the minimum-area rectangle around ``points`` out of ``img``.

    The rectangle comes from ``cv2.minAreaRect`` on the points cast to
    ``int32``. Its four corners are sorted by x and split into the two
    with the smallest x and the two with the largest x. Within each pair
    the corner with the smaller y is placed first, giving the order
    (left/small-y, right/small-y, right/large-y, left/large-y) that is
    passed to ``get_rotate_crop_image``.

    Args:
        img: Source image, forwarded unchanged to ``get_rotate_crop_image``.
        points: Point set accepted by ``np.array``.

    Returns:
        Whatever ``get_rotate_crop_image`` returns for the ordered corners.
    """
    rect = cv2.minAreaRect(np.array(points).astype(np.int32))
    corners = sorted(cv2.boxPoints(rect), key=lambda corner: corner[0])
    left_pair, right_pair = corners[:2], corners[2:]

    # Strict comparison: on equal y the second corner of the pair goes first.
    if left_pair[1][1] > left_pair[0][1]:
        first, fourth = left_pair
    else:
        fourth, first = left_pair

    if right_pair[1][1] > right_pair[0][1]:
        second, third = right_pair
    else:
        third, second = right_pair

    ordered = np.array([first, second, third, fourth])
    return get_rotate_crop_image(img, ordered)
69
+
70
def str2bool(v):
    """Interpret ``v`` as a boolean flag.

    Args:
        v: A string, or a value that is already a ``bool``.

    Returns:
        ``v`` itself when it is a ``bool``; otherwise ``True`` when the
        lower-cased string is one of ``"true"``, ``"t"`` or ``"1"``, and
        ``False`` for any other string.
    """
    # A real bool has no .lower(); return it as-is instead of raising.
    if isinstance(v, bool):
        return v
    return v.lower() in ("true", "t", "1")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: doc-page-extractor
3
- Version: 0.0.4
3
+ Version: 0.0.6
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -9,10 +9,12 @@ Description-Content-Type: text/markdown
9
9
  License-File: LICENSE
10
10
  Requires-Dist: opencv-python<5.0,>=4.11.0
11
11
  Requires-Dist: pillow<11.0,>=10.3
12
+ Requires-Dist: pyclipper<2.0,>=1.2.0
13
+ Requires-Dist: onnxruntime<2.0,>=1.19.0
14
+ Requires-Dist: numpy<2.0,>=1.24.0
12
15
  Requires-Dist: shapely<3.0,>=2.0.0
13
16
  Requires-Dist: transformers<5.0,>=4.48.0
14
17
  Requires-Dist: doclayout_yolo>=0.0.3
15
- Requires-Dist: paddleocr==2.9.0
16
18
  Dynamic: author
17
19
  Dynamic: author-email
18
20
  Dynamic: description
@@ -61,6 +63,8 @@ for layout in result.layouts:
61
63
 
62
64
  ## Acknowledgements
63
65
 
66
+ The code of `doc_page_extractor/onnxocr` in this repo comes from [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR).
67
+
64
68
  - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
65
- - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
69
+ - [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR)
66
70
  - [layoutreader](https://github.com/ppaanngggg/layoutreader)
@@ -0,0 +1,33 @@
1
+ doc_page_extractor/__init__.py,sha256=jCf5lo3A9JVDquflYMlvH8nJIs3EjBt8AG5y8mwfS68,210
2
+ doc_page_extractor/clipper.py,sha256=PDafB_9JGwV_dRY7oWe1yf44roSsEaCuEdg3VRMvJNo,3125
3
+ doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
4
+ doc_page_extractor/extractor.py,sha256=aoan_RgSrZiz5LIvjTlr4UpM9ErttpEZ8puqRxwlmDU,10232
5
+ doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
6
+ doc_page_extractor/ocr.py,sha256=6eLUVx6NSuRAwrq8Mc2zYs3yocxpOgUQS_4LIIqywnQ,5147
7
+ doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
8
+ doc_page_extractor/overlap.py,sha256=9_WbHxbKIbHM6R3ZUP2YG33pZlbLCHgwFb--NF3cCG0,5155
9
+ doc_page_extractor/plot.py,sha256=R8hbmdGjtw2pAH1lJkGc7Qbis4aRaaAkrkEo6WjbqyM,1378
10
+ doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
11
+ doc_page_extractor/rectangle.py,sha256=Tp__NPiY6JlYwYxejST7BUXhv_bl8tkmDXi4JgHCK6E,1539
12
+ doc_page_extractor/rotation.py,sha256=Dp8rXfgCzHQwqlAbU-uQt-zHC6Jm9KsIjcR6IhFQ5EU,4284
13
+ doc_page_extractor/types.py,sha256=UWghDwajMtEKEYUcOOjr5dM-MWiJ-P-8nWYagouf9ds,631
14
+ doc_page_extractor/utils.py,sha256=3rtIxiTJ7W5yOuY0UHedUJ3G34tPOw0jdHnUdOQ1tWI,207
15
+ doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
16
+ doc_page_extractor/onnxocr/cls_postprocess.py,sha256=o879Ned0RMUERYLviuToZ0xTvhn2UsYAb-yPC5gj8h4,822
17
+ doc_page_extractor/onnxocr/db_postprocess.py,sha256=R3yXXfReiQgLaYIvvfnrFfshI202LjHMvcZwcLpjmTY,7913
18
+ doc_page_extractor/onnxocr/imaug.py,sha256=Q192kIsRPI5zTm4RA_UUXlo6tvGJS8wrUaa-xrfnO_w,811
19
+ doc_page_extractor/onnxocr/operators.py,sha256=0nLiV1dWej9vdPa_DO04F7SvqF-l9NOFgHUuHUPNvsw,5556
20
+ doc_page_extractor/onnxocr/predict_base.py,sha256=LzRSPgxgFSRAreJOMpDTUKuBvvO_Qe5_5tK4lNuGl-w,1269
21
+ doc_page_extractor/onnxocr/predict_cls.py,sha256=28MliSQIyHc82EUbdkQb31KaB90rSzSmps1v6WsskQk,3065
22
+ doc_page_extractor/onnxocr/predict_det.py,sha256=VYsvNbCJQi1UuetwvR_hx-U7JScHyFjmJmo8YwHyQt4,4092
23
+ doc_page_extractor/onnxocr/predict_rec.py,sha256=qQrCs5jzCf5PYp-iEKJ53pcx_xRoJdJyavPvsvuh5Ic,10999
24
+ doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
25
+ doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
26
+ doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
27
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
29
+ doc_page_extractor-0.0.6.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
30
+ doc_page_extractor-0.0.6.dist-info/METADATA,sha256=A2fj-ylx5ug6h_eiTA4C-oFNpzRq8dP3-yQJ9ccyczQ,2046
31
+ doc_page_extractor-0.0.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
32
+ doc_page_extractor-0.0.6.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
33
+ doc_page_extractor-0.0.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,21 +0,0 @@
1
- doc_page_extractor/__init__.py,sha256=CeBP-ggpQ-MIMjaOm_MDNNkFfrlDyJshYDgYGjY8OoI,222
2
- doc_page_extractor/clipper.py,sha256=PDafB_9JGwV_dRY7oWe1yf44roSsEaCuEdg3VRMvJNo,3125
3
- doc_page_extractor/downloader.py,sha256=0rI8Ysafkq2jPpWm_FLWli68WTuBxxQYHAPJ8jn16sM,294
4
- doc_page_extractor/extractor.py,sha256=rksJO7nS0gmxCW_Xi3I-SLvh0c4WfYZQg8P1TZhdy8E,10328
5
- doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
6
- doc_page_extractor/ocr.py,sha256=KiT2rVALlJvIuW0p7fIs3Anrtmxutt4PZwlrZTSgYGY,4001
7
- doc_page_extractor/ocr_corrector.py,sha256=gaI_nj9xFGFp5KV5EBUZ3qtjG2pVRi8EMU_UT0obvVc,3953
8
- doc_page_extractor/overlap.py,sha256=9_WbHxbKIbHM6R3ZUP2YG33pZlbLCHgwFb--NF3cCG0,5155
9
- doc_page_extractor/plot.py,sha256=R8hbmdGjtw2pAH1lJkGc7Qbis4aRaaAkrkEo6WjbqyM,1378
10
- doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
11
- doc_page_extractor/rectangle.py,sha256=Tp__NPiY6JlYwYxejST7BUXhv_bl8tkmDXi4JgHCK6E,1539
12
- doc_page_extractor/rotation.py,sha256=Dp8rXfgCzHQwqlAbU-uQt-zHC6Jm9KsIjcR6IhFQ5EU,4284
13
- doc_page_extractor/types.py,sha256=UWghDwajMtEKEYUcOOjr5dM-MWiJ-P-8nWYagouf9ds,631
14
- doc_page_extractor/utils.py,sha256=3rtIxiTJ7W5yOuY0UHedUJ3G34tPOw0jdHnUdOQ1tWI,207
15
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
17
- doc_page_extractor-0.0.4.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
18
- doc_page_extractor-0.0.4.dist-info/METADATA,sha256=IiiZ0E74gR-g0yGXhNidyu7Tle_RJXCH2EAapD5lxKo,1847
19
- doc_page_extractor-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
- doc_page_extractor-0.0.4.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
21
- doc_page_extractor-0.0.4.dist-info/RECORD,,