doc-page-extractor 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of doc-page-extractor might be problematic. Click here for more details.
- doc_page_extractor/__init__.py +1 -1
- doc_page_extractor/downloader.py +4 -1
- doc_page_extractor/extractor.py +6 -7
- doc_page_extractor/ocr.py +110 -58
- doc_page_extractor/ocr_corrector.py +3 -3
- doc_page_extractor/onnxocr/__init__.py +1 -0
- doc_page_extractor/onnxocr/cls_postprocess.py +26 -0
- doc_page_extractor/onnxocr/db_postprocess.py +246 -0
- doc_page_extractor/onnxocr/imaug.py +32 -0
- doc_page_extractor/onnxocr/operators.py +187 -0
- doc_page_extractor/onnxocr/predict_base.py +52 -0
- doc_page_extractor/onnxocr/predict_cls.py +89 -0
- doc_page_extractor/onnxocr/predict_det.py +120 -0
- doc_page_extractor/onnxocr/predict_rec.py +321 -0
- doc_page_extractor/onnxocr/predict_system.py +97 -0
- doc_page_extractor/onnxocr/rec_postprocess.py +896 -0
- doc_page_extractor/onnxocr/utils.py +71 -0
- {doc_page_extractor-0.0.4.dist-info → doc_page_extractor-0.0.6.dist-info}/METADATA +7 -3
- doc_page_extractor-0.0.6.dist-info/RECORD +33 -0
- {doc_page_extractor-0.0.4.dist-info → doc_page_extractor-0.0.6.dist-info}/WHEEL +1 -1
- doc_page_extractor-0.0.4.dist-info/RECORD +0 -21
- {doc_page_extractor-0.0.4.dist-info → doc_page_extractor-0.0.6.dist-info}/LICENSE +0 -0
- {doc_page_extractor-0.0.4.dist-info → doc_page_extractor-0.0.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import cv2
|
|
3
|
+
|
|
4
|
+
def get_rotate_crop_image(img, points):
    """Warp the quadrilateral ``points`` out of ``img`` into an upright crop.

    Args:
        img: Source image passed to ``cv2.warpPerspective``.
        points: Four (x, y) corners. In order, they are mapped onto the
            (0, 0), (width, 0), (width, height) and (0, height) corners of
            the output. Any array-like is accepted; it is converted to a
            float32 array because ``cv2.getPerspectiveTransform`` requires
            float32 coordinates.

    Returns:
        The rectified crop. If its height is at least 1.5 times its width,
        the crop is rotated with ``np.rot90`` before being returned.

    Raises:
        AssertionError: If ``points`` does not hold exactly four entries.
    """
    assert len(points) == 4, "shape of points must be 4*2"
    # No copy is made when the caller already passes a float32 ndarray.
    points = np.asarray(points, dtype=np.float32)

    # Output size: the longer of each pair of opposite edges, truncated to int.
    top_edge = np.linalg.norm(points[0] - points[1])
    bottom_edge = np.linalg.norm(points[2] - points[3])
    img_crop_width = int(max(top_edge, bottom_edge))

    left_edge = np.linalg.norm(points[0] - points[3])
    right_edge = np.linalg.norm(points[1] - points[2])
    img_crop_height = int(max(left_edge, right_edge))

    pts_std = np.float32(
        [
            [0, 0],
            [img_crop_width, 0],
            [img_crop_width, img_crop_height],
            [0, img_crop_height],
        ]
    )
    M = cv2.getPerspectiveTransform(points, pts_std)
    dst_img = cv2.warpPerspective(
        img,
        M,
        (img_crop_width, img_crop_height),
        borderMode=cv2.BORDER_REPLICATE,
        flags=cv2.INTER_CUBIC,
    )

    dst_img_height, dst_img_width = dst_img.shape[0:2]
    # Tall, narrow crops are turned by 90 degrees.
    if dst_img_height * 1.0 / dst_img_width >= 1.5:
        dst_img = np.rot90(dst_img)
    return dst_img
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_minarea_rect_crop(img, points):
    """Crop the minimum-area rotated rectangle enclosing ``points`` from ``img``.

    The points are cast to int32, fitted with ``cv2.minAreaRect``, and the
    rectangle's four corners are reordered before being handed to
    ``get_rotate_crop_image``.

    Args:
        img: Source image, forwarded unchanged to ``get_rotate_crop_image``.
        points: Point set accepted by ``np.array``.

    Returns:
        Whatever ``get_rotate_crop_image`` returns for the reordered corners.
    """
    rect = cv2.minAreaRect(np.array(points).astype(np.int32))

    # Sort corners by x: the first two form the left side, the last two the right.
    corners = sorted(cv2.boxPoints(rect), key=lambda corner: corner[0])
    left_first, left_second, right_first, right_second = corners

    # Within each side, the corner with the smaller y comes first in the quad.
    if left_second[1] > left_first[1]:
        upper_left, lower_left = left_first, left_second
    else:
        upper_left, lower_left = left_second, left_first

    if right_second[1] > right_first[1]:
        upper_right, lower_right = right_first, right_second
    else:
        upper_right, lower_right = right_second, right_first

    quad = np.array([upper_left, upper_right, lower_right, lower_left])
    return get_rotate_crop_image(img, quad)
|
|
69
|
+
|
|
70
|
+
def str2bool(v):
    """Convert a textual flag value to a boolean.

    Args:
        v: A string to interpret, or an actual ``bool``, which is returned
            unchanged instead of failing on the missing ``lower`` method.

    Returns:
        True when ``v`` equals "true", "t" or "1" ignoring case; False for
        any other string.
    """
    if isinstance(v, bool):
        return v
    return v.lower() in ("true", "t", "1")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: doc-page-extractor
|
|
3
|
-
Version: 0.0.4
|
|
3
|
+
Version: 0.0.6
|
|
4
4
|
Summary: doc page extractor can identify text and format in images and return structured data.
|
|
5
5
|
Home-page: https://github.com/Moskize91/doc-page-extractor
|
|
6
6
|
Author: Tao Zeyu
|
|
@@ -9,10 +9,12 @@ Description-Content-Type: text/markdown
|
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Requires-Dist: opencv-python<5.0,>=4.11.0
|
|
11
11
|
Requires-Dist: pillow<11.0,>=10.3
|
|
12
|
+
Requires-Dist: pyclipper<2.0,>=1.2.0
|
|
13
|
+
Requires-Dist: onnxruntime<2.0,>=1.19.0
|
|
14
|
+
Requires-Dist: numpy<2.0,>=1.24.0
|
|
12
15
|
Requires-Dist: shapely<3.0,>=2.0.0
|
|
13
16
|
Requires-Dist: transformers<5.0,>=4.48.0
|
|
14
17
|
Requires-Dist: doclayout_yolo>=0.0.3
|
|
15
|
-
Requires-Dist: paddleocr==2.9.0
|
|
16
18
|
Dynamic: author
|
|
17
19
|
Dynamic: author-email
|
|
18
20
|
Dynamic: description
|
|
@@ -61,6 +63,8 @@ for layout in result.layouts:
|
|
|
61
63
|
|
|
62
64
|
## Acknowledgements
|
|
63
65
|
|
|
66
|
+
The code of `doc_page_extractor/onnxocr` in this repo comes from [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR).
|
|
67
|
+
|
|
64
68
|
- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
|
|
65
|
-
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
|
69
|
+
- [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR)
|
|
66
70
|
- [layoutreader](https://github.com/ppaanngggg/layoutreader)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
doc_page_extractor/__init__.py,sha256=jCf5lo3A9JVDquflYMlvH8nJIs3EjBt8AG5y8mwfS68,210
|
|
2
|
+
doc_page_extractor/clipper.py,sha256=PDafB_9JGwV_dRY7oWe1yf44roSsEaCuEdg3VRMvJNo,3125
|
|
3
|
+
doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
|
|
4
|
+
doc_page_extractor/extractor.py,sha256=aoan_RgSrZiz5LIvjTlr4UpM9ErttpEZ8puqRxwlmDU,10232
|
|
5
|
+
doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
|
|
6
|
+
doc_page_extractor/ocr.py,sha256=6eLUVx6NSuRAwrq8Mc2zYs3yocxpOgUQS_4LIIqywnQ,5147
|
|
7
|
+
doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
|
|
8
|
+
doc_page_extractor/overlap.py,sha256=9_WbHxbKIbHM6R3ZUP2YG33pZlbLCHgwFb--NF3cCG0,5155
|
|
9
|
+
doc_page_extractor/plot.py,sha256=R8hbmdGjtw2pAH1lJkGc7Qbis4aRaaAkrkEo6WjbqyM,1378
|
|
10
|
+
doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
|
|
11
|
+
doc_page_extractor/rectangle.py,sha256=Tp__NPiY6JlYwYxejST7BUXhv_bl8tkmDXi4JgHCK6E,1539
|
|
12
|
+
doc_page_extractor/rotation.py,sha256=Dp8rXfgCzHQwqlAbU-uQt-zHC6Jm9KsIjcR6IhFQ5EU,4284
|
|
13
|
+
doc_page_extractor/types.py,sha256=UWghDwajMtEKEYUcOOjr5dM-MWiJ-P-8nWYagouf9ds,631
|
|
14
|
+
doc_page_extractor/utils.py,sha256=3rtIxiTJ7W5yOuY0UHedUJ3G34tPOw0jdHnUdOQ1tWI,207
|
|
15
|
+
doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
|
|
16
|
+
doc_page_extractor/onnxocr/cls_postprocess.py,sha256=o879Ned0RMUERYLviuToZ0xTvhn2UsYAb-yPC5gj8h4,822
|
|
17
|
+
doc_page_extractor/onnxocr/db_postprocess.py,sha256=R3yXXfReiQgLaYIvvfnrFfshI202LjHMvcZwcLpjmTY,7913
|
|
18
|
+
doc_page_extractor/onnxocr/imaug.py,sha256=Q192kIsRPI5zTm4RA_UUXlo6tvGJS8wrUaa-xrfnO_w,811
|
|
19
|
+
doc_page_extractor/onnxocr/operators.py,sha256=0nLiV1dWej9vdPa_DO04F7SvqF-l9NOFgHUuHUPNvsw,5556
|
|
20
|
+
doc_page_extractor/onnxocr/predict_base.py,sha256=LzRSPgxgFSRAreJOMpDTUKuBvvO_Qe5_5tK4lNuGl-w,1269
|
|
21
|
+
doc_page_extractor/onnxocr/predict_cls.py,sha256=28MliSQIyHc82EUbdkQb31KaB90rSzSmps1v6WsskQk,3065
|
|
22
|
+
doc_page_extractor/onnxocr/predict_det.py,sha256=VYsvNbCJQi1UuetwvR_hx-U7JScHyFjmJmo8YwHyQt4,4092
|
|
23
|
+
doc_page_extractor/onnxocr/predict_rec.py,sha256=qQrCs5jzCf5PYp-iEKJ53pcx_xRoJdJyavPvsvuh5Ic,10999
|
|
24
|
+
doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
|
|
25
|
+
doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
|
|
26
|
+
doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
|
|
27
|
+
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
+
tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
|
|
29
|
+
doc_page_extractor-0.0.6.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
|
|
30
|
+
doc_page_extractor-0.0.6.dist-info/METADATA,sha256=A2fj-ylx5ug6h_eiTA4C-oFNpzRq8dP3-yQJ9ccyczQ,2046
|
|
31
|
+
doc_page_extractor-0.0.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
32
|
+
doc_page_extractor-0.0.6.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
|
|
33
|
+
doc_page_extractor-0.0.6.dist-info/RECORD,,
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
doc_page_extractor/__init__.py,sha256=CeBP-ggpQ-MIMjaOm_MDNNkFfrlDyJshYDgYGjY8OoI,222
|
|
2
|
-
doc_page_extractor/clipper.py,sha256=PDafB_9JGwV_dRY7oWe1yf44roSsEaCuEdg3VRMvJNo,3125
|
|
3
|
-
doc_page_extractor/downloader.py,sha256=0rI8Ysafkq2jPpWm_FLWli68WTuBxxQYHAPJ8jn16sM,294
|
|
4
|
-
doc_page_extractor/extractor.py,sha256=rksJO7nS0gmxCW_Xi3I-SLvh0c4WfYZQg8P1TZhdy8E,10328
|
|
5
|
-
doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
|
|
6
|
-
doc_page_extractor/ocr.py,sha256=KiT2rVALlJvIuW0p7fIs3Anrtmxutt4PZwlrZTSgYGY,4001
|
|
7
|
-
doc_page_extractor/ocr_corrector.py,sha256=gaI_nj9xFGFp5KV5EBUZ3qtjG2pVRi8EMU_UT0obvVc,3953
|
|
8
|
-
doc_page_extractor/overlap.py,sha256=9_WbHxbKIbHM6R3ZUP2YG33pZlbLCHgwFb--NF3cCG0,5155
|
|
9
|
-
doc_page_extractor/plot.py,sha256=R8hbmdGjtw2pAH1lJkGc7Qbis4aRaaAkrkEo6WjbqyM,1378
|
|
10
|
-
doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
|
|
11
|
-
doc_page_extractor/rectangle.py,sha256=Tp__NPiY6JlYwYxejST7BUXhv_bl8tkmDXi4JgHCK6E,1539
|
|
12
|
-
doc_page_extractor/rotation.py,sha256=Dp8rXfgCzHQwqlAbU-uQt-zHC6Jm9KsIjcR6IhFQ5EU,4284
|
|
13
|
-
doc_page_extractor/types.py,sha256=UWghDwajMtEKEYUcOOjr5dM-MWiJ-P-8nWYagouf9ds,631
|
|
14
|
-
doc_page_extractor/utils.py,sha256=3rtIxiTJ7W5yOuY0UHedUJ3G34tPOw0jdHnUdOQ1tWI,207
|
|
15
|
-
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
-
tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
|
|
17
|
-
doc_page_extractor-0.0.4.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
|
|
18
|
-
doc_page_extractor-0.0.4.dist-info/METADATA,sha256=IiiZ0E74gR-g0yGXhNidyu7Tle_RJXCH2EAapD5lxKo,1847
|
|
19
|
-
doc_page_extractor-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
20
|
-
doc_page_extractor-0.0.4.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
|
|
21
|
-
doc_page_extractor-0.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|