doc-page-extractor 0.1.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. doc_page_extractor/__init__.py +5 -14
  2. doc_page_extractor/check_env.py +40 -0
  3. doc_page_extractor/extractor.py +87 -212
  4. doc_page_extractor/model.py +97 -0
  5. doc_page_extractor/parser.py +51 -0
  6. doc_page_extractor/plot.py +52 -79
  7. doc_page_extractor/redacter.py +111 -0
  8. doc_page_extractor-1.0.2.dist-info/METADATA +120 -0
  9. doc_page_extractor-1.0.2.dist-info/RECORD +11 -0
  10. {doc_page_extractor-0.1.1.dist-info → doc_page_extractor-1.0.2.dist-info}/WHEEL +1 -2
  11. doc_page_extractor-1.0.2.dist-info/licenses/LICENSE +21 -0
  12. doc_page_extractor/clipper.py +0 -119
  13. doc_page_extractor/downloader.py +0 -16
  14. doc_page_extractor/latex.py +0 -57
  15. doc_page_extractor/layout_order.py +0 -240
  16. doc_page_extractor/layoutreader.py +0 -126
  17. doc_page_extractor/ocr.py +0 -175
  18. doc_page_extractor/ocr_corrector.py +0 -126
  19. doc_page_extractor/onnxocr/__init__.py +0 -1
  20. doc_page_extractor/onnxocr/cls_postprocess.py +0 -26
  21. doc_page_extractor/onnxocr/db_postprocess.py +0 -246
  22. doc_page_extractor/onnxocr/imaug.py +0 -32
  23. doc_page_extractor/onnxocr/operators.py +0 -187
  24. doc_page_extractor/onnxocr/predict_base.py +0 -52
  25. doc_page_extractor/onnxocr/predict_cls.py +0 -89
  26. doc_page_extractor/onnxocr/predict_det.py +0 -120
  27. doc_page_extractor/onnxocr/predict_rec.py +0 -321
  28. doc_page_extractor/onnxocr/predict_system.py +0 -97
  29. doc_page_extractor/onnxocr/rec_postprocess.py +0 -896
  30. doc_page_extractor/onnxocr/utils.py +0 -71
  31. doc_page_extractor/overlap.py +0 -167
  32. doc_page_extractor/raw_optimizer.py +0 -104
  33. doc_page_extractor/rectangle.py +0 -72
  34. doc_page_extractor/rotation.py +0 -158
  35. doc_page_extractor/struct_eqtable/__init__.py +0 -49
  36. doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -2
  37. doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -394
  38. doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -198
  39. doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -81
  40. doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -3
  41. doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -76
  42. doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -1047
  43. doc_page_extractor/table.py +0 -71
  44. doc_page_extractor/types.py +0 -67
  45. doc_page_extractor/utils.py +0 -32
  46. doc_page_extractor-0.1.1.dist-info/METADATA +0 -84
  47. doc_page_extractor-0.1.1.dist-info/RECORD +0 -44
  48. doc_page_extractor-0.1.1.dist-info/licenses/LICENSE +0 -661
  49. doc_page_extractor-0.1.1.dist-info/top_level.txt +0 -2
  50. tests/__init__.py +0 -0
  51. tests/test_history_bus.py +0 -55
@@ -1,71 +0,0 @@
1
- import os
2
- import torch
3
-
4
- from typing import Literal, Any
5
- from PIL.Image import Image
6
- from .types import TableLayoutParsedFormat
7
- from .utils import expand_image
8
-
9
-
10
OutputFormat = Literal["latex", "markdown", "html"]


class Table:
    """Lazy wrapper around the StructTable model that turns a table image into markup.

    The model is built only on the first call to ``predict`` that gets past
    the CUDA check. Parsing is disabled when the requested device is
    ``"cpu"`` or when ``torch.cuda.is_available()`` reports no CUDA device.
    """

    def __init__(
        self,
        device: Literal["cpu", "cuda"],
        model_path: str,
    ):
        """Remember the model directory and decide whether parsing is possible.

        Args:
            device: Requested device; ``"cpu"`` disables table parsing.
            model_path: Directory handed to the model builder as ``cache_dir``.
        """
        self._model: Any | None = None
        self._model_path: str = model_path
        # The model is always moved to CUDA in _get_model, so a CPU-only
        # setup can never run it.
        self._ban: bool = device == "cpu" or not torch.cuda.is_available()

    def predict(self, image: Image, format: TableLayoutParsedFormat) -> str | None:
        """Parse the table shown in ``image`` into the requested markup.

        Args:
            image: Picture of the table; it is padded with a 10% border
                before being passed to the model.
            format: Desired output markup.

        Returns:
            The first result produced by the model, or ``None`` when parsing
            is disabled (no CUDA) or the model returns no results.

        Raises:
            ValueError: If ``format`` is not LATEX, MARKDOWN or HTML. The
                check only happens when parsing is enabled.
        """
        if self._ban:
            print("CUDA is not available. You cannot parse table from image.")
            return None

        output_format = self._resolve_output_format(format)
        image = expand_image(image, 0.1)
        model = self._get_model()

        with torch.no_grad():
            results = model([image], output_format=output_format)

        if len(results) == 0:
            return None
        return results[0]

    @staticmethod
    def _resolve_output_format(format: TableLayoutParsedFormat) -> OutputFormat:
        """Translate the public format enum into the model's format name."""
        names: dict[TableLayoutParsedFormat, OutputFormat] = {
            TableLayoutParsedFormat.LATEX: "latex",
            TableLayoutParsedFormat.MARKDOWN: "markdown",
            TableLayoutParsedFormat.HTML: "html",
        }
        try:
            return names[format]
        except (KeyError, TypeError):
            # TypeError covers unhashable values; both cases mean "unsupported".
            raise ValueError(f"Table format {format} is not supported.") from None

    def _get_model(self):
        """Build the model on first use and return the cached instance."""
        if self._model is None:
            # An existing directory is treated as an already-populated cache.
            # NOTE(review): an existing but empty or incomplete directory also
            # forces local_files_only=True - confirm build_model copes with it.
            local_files_only = os.path.exists(self._model_path)
            if not local_files_only:
                # exist_ok avoids a crash if the directory appears between
                # the existence check and its creation.
                os.makedirs(self._model_path, exist_ok=True)

            # Imported lazily so the model code is only loaded when needed.
            from .struct_eqtable import build_model
            model = build_model(
                model_ckpt="U4R/StructTable-InternVL2-1B",
                max_new_tokens=1024,
                max_time=30,
                lmdeploy=False,
                flash_attn=True,
                batch_size=1,
                cache_dir=self._model_path,
                local_files_only=local_files_only,
            )
            self._model = model.cuda()
        return self._model
@@ -1,67 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Literal
3
- from enum import auto, Enum
4
- from PIL.Image import Image
5
- from .rectangle import Rectangle
6
-
7
-
8
@dataclass
class OCRFragment:
    """One piece of recognised text together with its position on the page."""

    # Position of this fragment in the reading sequence
    # (presumably assigned by the layout ordering step - confirm).
    order: int
    # Recognised text content.
    text: str
    # NOTE(review): looks like a recognition confidence score - confirm.
    rank: float
    # Area of the page that the fragment occupies.
    rect: Rectangle
14
-
15
class LayoutClass(Enum):
    """Integer-valued labels for the kinds of region found on a page.

    NOTE(review): the values presumably mirror the class ids emitted by the
    layout detection model - confirm before renumbering.
    """

    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    FIGURE = 3
    FIGURE_CAPTION = 4
    TABLE = 5
    TABLE_CAPTION = 6
    TABLE_FOOTNOTE = 7
    ISOLATE_FORMULA = 8
    FORMULA_CAPTION = 9
26
-
27
class TableLayoutParsedFormat(Enum):
    """Markup formats in which a parsed table can be expressed."""

    LATEX = auto()
    MARKDOWN = auto()
    HTML = auto()
31
-
32
@dataclass
class BaseLayout:
    """Fields shared by every layout kind: a page area and its text fragments."""

    # Area of the page covered by this layout.
    rect: Rectangle
    # Text fragments that belong to this layout.
    fragments: list[OCRFragment]
36
-
37
@dataclass
class PlainLayout(BaseLayout):
    """Layout that carries nothing beyond its class label, area and fragments."""

    # Every layout class except TABLE and ISOLATE_FORMULA, which have
    # dedicated layout types with extra payload.
    cls: Literal[
        LayoutClass.TITLE,
        LayoutClass.PLAIN_TEXT,
        LayoutClass.ABANDON,
        LayoutClass.FIGURE,
        LayoutClass.FIGURE_CAPTION,
        LayoutClass.TABLE_CAPTION,
        LayoutClass.TABLE_FOOTNOTE,
        LayoutClass.FORMULA_CAPTION,
    ]
49
-
50
@dataclass
class TableLayout(BaseLayout):
    """Layout for a table region, optionally carrying its parsed markup."""

    # Parsed table text paired with the format it is written in; None when
    # no parsed output is available (presumably when parsing was skipped or
    # produced nothing - confirm against the extractor).
    parsed: tuple[str, TableLayoutParsedFormat] | None
    # An enum member is a value, not a type, so it must be wrapped in
    # Literal to be a valid annotation.
    cls: Literal[LayoutClass.TABLE]
54
-
55
@dataclass
class FormulaLayout(BaseLayout):
    """Layout for an isolated formula region, optionally carrying its LaTeX."""

    # LaTeX source of the formula; None when no LaTeX is available
    # (presumably when recognition was skipped or failed - confirm).
    latex: str | None
    # An enum member is a value, not a type, so it must be wrapped in
    # Literal to be a valid annotation.
    cls: Literal[LayoutClass.ISOLATE_FORMULA]
59
-
60
- Layout = PlainLayout | TableLayout | FormulaLayout
61
-
62
@dataclass
class ExtractedResult:
    """Everything produced by extracting one page image."""

    # Rotation associated with the page
    # (units and sign convention are not visible here - TODO confirm).
    rotation: float
    # Layout regions found on the page.
    layouts: list[Layout]
    # Image the extraction was performed on.
    extracted_image: Image
    # Adjusted variant of the image, or None when there is none
    # (presumably only produced when an adjustment was requested - confirm).
    adjusted_image: Image | None
@@ -1,32 +0,0 @@
1
- import os
2
- import re
3
-
4
- from math import ceil
5
- from PIL.Image import Image
6
- from PIL.ImageOps import expand
7
-
8
-
9
def ensure_dir(path: str) -> str:
    """Create the directory ``path`` if it is missing and return its absolute path.

    Missing parent directories are created as well; an already existing
    directory is left untouched.
    """
    absolute = os.path.abspath(path)
    os.makedirs(absolute, exist_ok=True)
    return absolute
13
-
14
def is_space_text(text: str) -> bool:
    """Return True when ``text`` is empty or consists only of whitespace."""
    # re.match returns a Match object or None; compare against None so the
    # function really returns the bool its signature promises.
    return re.match(r"^\s*$", text) is not None
16
-
17
def expand_image(image: Image, percent: float) -> Image:
    """Return a copy of ``image`` surrounded by a white border.

    The border is ``ceil(width * percent)`` pixels on the left and right and
    ``ceil(height * percent)`` pixels on the top and bottom.

    Args:
        image: Image to pad; it is not modified.
        percent: Border size as a fraction of the image's width and height.

    Returns:
        The padded image.
    """
    width, height = image.size
    border = (ceil(width * percent), ceil(height * percent))
    # A colour name is resolved by Pillow into a value matching the image's
    # own mode, so this yields the same white as an explicit tuple for RGB
    # and RGBA while also working for single-band modes such as "L", where
    # a hard-coded 3-tuple is rejected.
    return expand(image=image, border=border, fill="white")
@@ -1,84 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: doc-page-extractor
3
- Version: 0.1.1
4
- Summary: doc page extractor can identify text and format in images and return structured data.
5
- Home-page: https://github.com/Moskize91/doc-page-extractor
6
- Author: Tao Zeyu
7
- Author-email: i@taozeyu.com
8
- Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: opencv-python<5.0,>=4.11.0
11
- Requires-Dist: pillow<11.0,>=10.3
12
- Requires-Dist: pyclipper<2.0,>=1.2.0
13
- Requires-Dist: numpy<2.0,>=1.24.0
14
- Requires-Dist: shapely<3.0,>=2.0.0
15
- Requires-Dist: transformers<=4.47,>=4.42.4
16
- Requires-Dist: doclayout_yolo>=0.0.3
17
- Requires-Dist: pix2tex<=0.2.0,>=0.1.4
18
- Requires-Dist: accelerate<2.0,>=1.6.0
19
- Dynamic: author
20
- Dynamic: author-email
21
- Dynamic: description
22
- Dynamic: description-content-type
23
- Dynamic: home-page
24
- Dynamic: license-file
25
- Dynamic: requires-dist
26
- Dynamic: summary
27
-
28
- # doc page extractor
29
-
30
- English | [中文](./README_zh-CN.md)
31
-
32
- ## Introduction
33
-
34
- doc page extractor can identify text and format in images and return structured data.
35
-
36
- ## Installation
37
-
38
- ```shell
39
- pip install doc-page-extractor
40
- ```
41
-
42
- ```shell
43
- pip install onnxruntime==1.21.0
44
- ```
45
-
46
- ## Using CUDA
47
-
48
- Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
49
-
50
- In addition, replace the `onnxruntime` installation command from the previous section with the following:
51
-
52
- ```shell
53
- pip install onnxruntime-gpu==1.21.0
54
- ```
55
-
56
- ## Example
57
-
58
- ```python
59
- from PIL import Image
60
- from doc_page_extractor import DocExtractor
61
-
62
- extractor = DocExtractor(
63
- model_dir_path=model_path, # Folder address where AI model is downloaded and installed
64
- device="cpu", # If you want to use CUDA, please change to device="cuda".
65
- )
66
- with Image.open("/path/to/your/image.png") as image:
67
- result = extractor.extract(
68
- image=image,
69
- lang="ch", # Language of image text
70
- )
71
- for layout in result.layouts:
72
- for fragment in layout.fragments:
73
- print(fragment.rect, fragment.text)
74
- ```
75
-
76
- ## Acknowledgements
77
-
78
- The code of `doc_page_extractor/onnxocr` in this repo comes from [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR).
79
-
80
- - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
81
- - [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR)
82
- - [layoutreader](https://github.com/ppaanngggg/layoutreader)
83
- - [StructEqTable](https://github.com/Alpha-Innovator/StructEqTable-Deploy)
84
- - [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
@@ -1,44 +0,0 @@
1
- doc_page_extractor/__init__.py,sha256=9rWKSMTgzP7Xv15zA4upsyPaR8S8JeNpMyhWElRCW0M,311
2
- doc_page_extractor/clipper.py,sha256=5S1TI0aqMebwlPv_Ih4Nxpp6MchEjOih-CiZfMWUAhI,3201
3
- doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
4
- doc_page_extractor/extractor.py,sha256=V0S9Nn65lKpkz8DnTUMcsAzUJGwSQvupIBTiVqzLpJ8,7303
5
- doc_page_extractor/latex.py,sha256=W_zAcksNRuru-WjCq4CSn07s_SWrDhikadJSy_Cg3Do,1954
6
- doc_page_extractor/layout_order.py,sha256=NwMzTPr4xsriz4slCwqwhw2-vrMu-qfwtcFsDu8d1yM,7426
7
- doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
8
- doc_page_extractor/ocr.py,sha256=hQhT9bdsJmWESqt1FODCoE19wfOroM8uHZiFoZZrkQU,5182
9
- doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
10
- doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
11
- doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
12
- doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
13
- doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
14
- doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
15
- doc_page_extractor/table.py,sha256=AWymTRbOet55uImW8QJqb90Qs_v2V2U1mZv0U6rSz3c,1891
16
- doc_page_extractor/types.py,sha256=7blT8YNKrOsc4qQdAhM7J7MEQjFcBwE0QV8-lipZBeQ,1305
17
- doc_page_extractor/utils.py,sha256=ZlQVOLPUg_v5J8u6SoD8XtMG_JkF-ERgjubc4LO5_Lg,688
18
- doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
19
- doc_page_extractor/onnxocr/cls_postprocess.py,sha256=o879Ned0RMUERYLviuToZ0xTvhn2UsYAb-yPC5gj8h4,822
20
- doc_page_extractor/onnxocr/db_postprocess.py,sha256=R3yXXfReiQgLaYIvvfnrFfshI202LjHMvcZwcLpjmTY,7913
21
- doc_page_extractor/onnxocr/imaug.py,sha256=Q192kIsRPI5zTm4RA_UUXlo6tvGJS8wrUaa-xrfnO_w,811
22
- doc_page_extractor/onnxocr/operators.py,sha256=0nLiV1dWej9vdPa_DO04F7SvqF-l9NOFgHUuHUPNvsw,5556
23
- doc_page_extractor/onnxocr/predict_base.py,sha256=LzRSPgxgFSRAreJOMpDTUKuBvvO_Qe5_5tK4lNuGl-w,1269
24
- doc_page_extractor/onnxocr/predict_cls.py,sha256=28MliSQIyHc82EUbdkQb31KaB90rSzSmps1v6WsskQk,3065
25
- doc_page_extractor/onnxocr/predict_det.py,sha256=VYsvNbCJQi1UuetwvR_hx-U7JScHyFjmJmo8YwHyQt4,4092
26
- doc_page_extractor/onnxocr/predict_rec.py,sha256=qQrCs5jzCf5PYp-iEKJ53pcx_xRoJdJyavPvsvuh5Ic,10999
27
- doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
28
- doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
29
- doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
30
- doc_page_extractor/struct_eqtable/__init__.py,sha256=QoTsNuJfpNSrMIMd6Cot1jJqWk88_lDqFP_C2rcVJO4,1329
31
- doc_page_extractor/struct_eqtable/internvl/__init__.py,sha256=2aOsU-aHkFv_gjdP8LeUXjj_9-0d4x79iyxh4cCzaEw,79
32
- doc_page_extractor/struct_eqtable/internvl/conversation.py,sha256=s7DceRlM6JtHmxgyuE6vqu5XVT1fHzhzCL_I6r8MI1c,15129
33
- doc_page_extractor/struct_eqtable/internvl/internvl.py,sha256=ovVZG-PuBrsj_9lEoNPOygJ-2en3v6gPzRfWjDpWNOM,7678
34
- doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py,sha256=ACHxFntxS38G43PzE955Nv4fjKk_-Oz4y_o9JEjQwlg,2608
35
- doc_page_extractor/struct_eqtable/pix2s/__init__.py,sha256=cXRo4eg6u1-TXktZ8rQf0HIzLmmScIwYQhbxMKl-MyA,76
36
- doc_page_extractor/struct_eqtable/pix2s/pix2s.py,sha256=fCNve8PNeJ3-AWJIhSeGtp-mYKoMXfW0CIpszkQnAaA,2535
37
- doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py,sha256=zSGw45JhWdZ3iuJel5Chsy-NzsOHx9QyPQIUAzzCjFE,43880
38
- doc_page_extractor-0.1.1.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
39
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
41
- doc_page_extractor-0.1.1.dist-info/METADATA,sha256=5bQtvYgjNghsYER1zmc19_6BH4JrgrLj9_KxUmnLnHc,2436
42
- doc_page_extractor-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
43
- doc_page_extractor-0.1.1.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
44
- doc_page_extractor-0.1.1.dist-info/RECORD,,