doc-page-extractor 0.2.4__cp310-cp310-macosx_15_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (34) hide show
  1. doc_page_extractor/__init__.py +16 -0
  2. doc_page_extractor/clipper.py +119 -0
  3. doc_page_extractor/downloader.py +16 -0
  4. doc_page_extractor/extractor.py +218 -0
  5. doc_page_extractor/latex.py +33 -0
  6. doc_page_extractor/layout_order.py +239 -0
  7. doc_page_extractor/layoutreader.py +126 -0
  8. doc_page_extractor/model.py +133 -0
  9. doc_page_extractor/ocr.py +196 -0
  10. doc_page_extractor/ocr_corrector.py +126 -0
  11. doc_page_extractor/onnxocr/__init__.py +1 -0
  12. doc_page_extractor/onnxocr/cls_postprocess.py +26 -0
  13. doc_page_extractor/onnxocr/db_postprocess.py +246 -0
  14. doc_page_extractor/onnxocr/imaug.py +32 -0
  15. doc_page_extractor/onnxocr/operators.py +187 -0
  16. doc_page_extractor/onnxocr/predict_base.py +57 -0
  17. doc_page_extractor/onnxocr/predict_cls.py +109 -0
  18. doc_page_extractor/onnxocr/predict_det.py +139 -0
  19. doc_page_extractor/onnxocr/predict_rec.py +344 -0
  20. doc_page_extractor/onnxocr/predict_system.py +97 -0
  21. doc_page_extractor/onnxocr/rec_postprocess.py +896 -0
  22. doc_page_extractor/onnxocr/utils.py +71 -0
  23. doc_page_extractor/overlap.py +167 -0
  24. doc_page_extractor/plot.py +93 -0
  25. doc_page_extractor/raw_optimizer.py +104 -0
  26. doc_page_extractor/rectangle.py +72 -0
  27. doc_page_extractor/rotation.py +158 -0
  28. doc_page_extractor/table.py +60 -0
  29. doc_page_extractor/types.py +68 -0
  30. doc_page_extractor/utils.py +32 -0
  31. doc_page_extractor-0.2.4.dist-info/LICENSE +661 -0
  32. doc_page_extractor-0.2.4.dist-info/METADATA +88 -0
  33. doc_page_extractor-0.2.4.dist-info/RECORD +34 -0
  34. doc_page_extractor-0.2.4.dist-info/WHEEL +4 -0
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.3
2
+ Name: doc-page-extractor
3
+ Version: 0.2.4
4
+ Summary:
5
+ License: AGPL-3.0
6
+ Author: Tao Zeyu
7
+ Author-email: i@taozeyu.com
8
+ Maintainer: Tao Zeyu
9
+ Maintainer-email: i@taozeyu.com
10
+ Requires-Python: >=3.10,<3.13
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Provides-Extra: cpu
20
+ Provides-Extra: cuda
21
+ Requires-Dist: accelerate (>=1.6.0,<2.0)
22
+ Requires-Dist: doclayout_yolo (>=0.0.3)
23
+ Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
24
+ Requires-Dist: numpy (>=1.24.0,<2.0)
25
+ Requires-Dist: onnxruntime (==1.21.0) ; extra == "cpu"
26
+ Requires-Dist: onnxruntime-gpu (==1.21.0) ; extra == "cuda"
27
+ Requires-Dist: opencv-python (>=4.10.0,<5.0)
28
+ Requires-Dist: pillow (>=10.3,<11.0)
29
+ Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
30
+ Requires-Dist: pyclipper (>=1.2.0,<2.0)
31
+ Requires-Dist: shapely (>=2.0.0,<3.0)
32
+ Requires-Dist: transformers (>=4.42.4,<=4.47)
33
+ Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
34
+ Description-Content-Type: text/markdown
35
+
36
+ # doc page extractor
37
+
38
+ English | [中文](./README_zh-CN.md)
39
+
40
+ ## Introduction
41
+
42
+ doc page extractor identifies text and its formatting in images and returns the result as structured data.
43
+
44
+ ## Installation
45
+
46
+ ```shell
47
+ pip install doc-page-extractor[cpu]
48
+ ```
49
+
50
+ ## Using CUDA
51
+
52
+ Please refer to the [PyTorch](https://pytorch.org/get-started/locally/) installation guide and select the installation command appropriate for your operating system.
53
+
54
+ Then install doc-page-extractor with the `cuda` extra (instead of the `cpu` extra shown in the Installation section) using the following command.
55
+
56
+ ```shell
57
+ pip install doc-page-extractor[cuda]
58
+ ```
59
+
60
+ ## Example
61
+
62
+ ```python
63
+ from PIL import Image
64
+ from doc_page_extractor import DocExtractor
65
+
66
+ extractor = DocExtractor(
67
+ model_dir_path=model_path, # Path of the folder where the AI models are downloaded and installed
68
+ device="cpu", # If you want to use CUDA, please change to device="cuda".
69
+ )
70
+ with Image.open("/path/to/your/image.png") as image:
71
+     result = extractor.extract(
72
+         image=image,
73
+         lang="ch", # Language of image text
74
+     )
75
+ for layout in result.layouts:
76
+     for fragment in layout.fragments:
77
+         print(fragment.rect, fragment.text)
78
+ ```
79
+
80
+ ## Acknowledgements
81
+
82
+ The code of `doc_page_extractor/onnxocr` in this repo comes from [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR).
83
+
84
+ - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
85
+ - [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR)
86
+ - [layoutreader](https://github.com/ppaanngggg/layoutreader)
87
+ - [StructEqTable](https://github.com/Alpha-Innovator/StructEqTable-Deploy)
88
+ - [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
@@ -0,0 +1,34 @@
1
+ doc_page_extractor/__init__.py,sha256=rt_XALcqNNg3iVkMTUHltWxvdweH2FY6Y_olU2TkVBY,355
2
+ doc_page_extractor/clipper.py,sha256=mI8AjYHig1Jt3idiaBhNTfaY4bnAnxBCInT2OECIxmQ,3250
3
+ doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
4
+ doc_page_extractor/extractor.py,sha256=5ue4Dz5tU3bHMu2OoBDEizdgtk2jZ9jkQfF7K8QCu1U,7232
5
+ doc_page_extractor/latex.py,sha256=kD3NIzZTEGUFIAqqyHYmNcfyTrlu77GB1YSFgzbFb7A,1024
6
+ doc_page_extractor/layout_order.py,sha256=TkumMyYMc8Tosodl0-v114N1_lEeur2Izo7iLTfQdDk,7463
7
+ doc_page_extractor/layoutreader.py,sha256=RbD55SJVGm9s7QkM9vMYJpcQ4t3x-2WXmOHYMJxqSYc,4033
8
+ doc_page_extractor/model.py,sha256=cI2heB1pMRu-0gCsKZOdxPbdrPo2zH1Dp9oQOFlTHaw,3754
9
+ doc_page_extractor/ocr.py,sha256=zY-nJJ5V7gRF6c6gwn_ipqanaX4DILlN1DJNnBXRBL0,5685
10
+ doc_page_extractor/ocr_corrector.py,sha256=MhFsfd-pmPctmbtog7CzExRqy8jG4QR_rRbs-zGunr4,3919
11
+ doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
12
+ doc_page_extractor/onnxocr/cls_postprocess.py,sha256=o879Ned0RMUERYLviuToZ0xTvhn2UsYAb-yPC5gj8h4,822
13
+ doc_page_extractor/onnxocr/db_postprocess.py,sha256=R3yXXfReiQgLaYIvvfnrFfshI202LjHMvcZwcLpjmTY,7913
14
+ doc_page_extractor/onnxocr/imaug.py,sha256=Q192kIsRPI5zTm4RA_UUXlo6tvGJS8wrUaa-xrfnO_w,811
15
+ doc_page_extractor/onnxocr/operators.py,sha256=0nLiV1dWej9vdPa_DO04F7SvqF-l9NOFgHUuHUPNvsw,5556
16
+ doc_page_extractor/onnxocr/predict_base.py,sha256=8AljJTHGNxlDZb2xWEJmuHor2MFVBHk7xUtstrU2G8M,1439
17
+ doc_page_extractor/onnxocr/predict_cls.py,sha256=ua5fN1O5-TmJX4Vk0rseZiFFKaf949I7X1Uehu1fjRo,3569
18
+ doc_page_extractor/onnxocr/predict_det.py,sha256=8LOBHYkxFRixEU_2a6VCO_mN2obQDi5lUeYPNSVP-q4,4576
19
+ doc_page_extractor/onnxocr/predict_rec.py,sha256=UsgPhl6X3frx5u-LzIEPITOM3WJ1iAmTVznsHgXq8f8,11555
20
+ doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
21
+ doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
22
+ doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
23
+ doc_page_extractor/overlap.py,sha256=fz0unHNLzJ91CttzddwU8bxnSXJEBjoKGG-I9Az0Lvo,5356
24
+ doc_page_extractor/plot.py,sha256=pww3D2n6T_iWsKH5jnbqtPuiwoR2uBU1Pdhn-gpVMAs,2640
25
+ doc_page_extractor/raw_optimizer.py,sha256=enDKBpuYbQ9yF4sxvyrgCTvavFRluXsBFvhahVf2wqc,2844
26
+ doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
27
+ doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
28
+ doc_page_extractor/table.py,sha256=EttX0T7XfMmg4a8zfhE-o4VLs2BoGHHYf6HhLaAOx34,1695
29
+ doc_page_extractor/types.py,sha256=b0rxGglHoH7qiT4329AoNFQz9FNYGeiw1SLjFTztg6c,1331
30
+ doc_page_extractor/utils.py,sha256=wOnNPf-Tb0HQuFEE9R4sAh_yji-P__aw1ZbJnUkqoXw,694
31
+ doc_page_extractor-0.2.4.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
32
+ doc_page_extractor-0.2.4.dist-info/METADATA,sha256=YEB09n5XqpxXbak7lrHF5o3LHv4mp91CKe6_gCMM-nE,2831
33
+ doc_page_extractor-0.2.4.dist-info/WHEEL,sha256=OGZjwypyJnl3Dt4tsa3cvutwo3VFPpH2hizQfbnLBAw,106
34
+ doc_page_extractor-0.2.4.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.3
3
+ Root-Is-Purelib: false
4
+ Tag: cp310-cp310-macosx_15_0_arm64