doc-page-extractor 0.2.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. doc_page_extractor/__init__.py +5 -15
  2. doc_page_extractor/check_env.py +40 -0
  3. doc_page_extractor/extractor.py +88 -215
  4. doc_page_extractor/model.py +97 -0
  5. doc_page_extractor/parser.py +51 -0
  6. doc_page_extractor/plot.py +52 -79
  7. doc_page_extractor/redacter.py +111 -0
  8. doc_page_extractor-1.0.2.dist-info/METADATA +120 -0
  9. doc_page_extractor-1.0.2.dist-info/RECORD +11 -0
  10. {doc_page_extractor-0.2.0.dist-info → doc_page_extractor-1.0.2.dist-info}/WHEEL +1 -2
  11. doc_page_extractor-1.0.2.dist-info/licenses/LICENSE +21 -0
  12. doc_page_extractor/clipper.py +0 -119
  13. doc_page_extractor/downloader.py +0 -16
  14. doc_page_extractor/latex.py +0 -31
  15. doc_page_extractor/layout_order.py +0 -237
  16. doc_page_extractor/layoutreader.py +0 -126
  17. doc_page_extractor/models.py +0 -92
  18. doc_page_extractor/ocr.py +0 -200
  19. doc_page_extractor/ocr_corrector.py +0 -126
  20. doc_page_extractor/onnxocr/__init__.py +0 -1
  21. doc_page_extractor/onnxocr/cls_postprocess.py +0 -26
  22. doc_page_extractor/onnxocr/db_postprocess.py +0 -246
  23. doc_page_extractor/onnxocr/imaug.py +0 -32
  24. doc_page_extractor/onnxocr/operators.py +0 -187
  25. doc_page_extractor/onnxocr/predict_base.py +0 -57
  26. doc_page_extractor/onnxocr/predict_cls.py +0 -109
  27. doc_page_extractor/onnxocr/predict_det.py +0 -139
  28. doc_page_extractor/onnxocr/predict_rec.py +0 -344
  29. doc_page_extractor/onnxocr/predict_system.py +0 -97
  30. doc_page_extractor/onnxocr/rec_postprocess.py +0 -896
  31. doc_page_extractor/onnxocr/utils.py +0 -71
  32. doc_page_extractor/overlap.py +0 -167
  33. doc_page_extractor/raw_optimizer.py +0 -104
  34. doc_page_extractor/rectangle.py +0 -72
  35. doc_page_extractor/rotation.py +0 -158
  36. doc_page_extractor/struct_eqtable/__init__.py +0 -49
  37. doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -2
  38. doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -394
  39. doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -198
  40. doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -81
  41. doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -3
  42. doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -76
  43. doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -1047
  44. doc_page_extractor/table.py +0 -70
  45. doc_page_extractor/types.py +0 -91
  46. doc_page_extractor/utils.py +0 -32
  47. doc_page_extractor-0.2.0.dist-info/METADATA +0 -85
  48. doc_page_extractor-0.2.0.dist-info/RECORD +0 -45
  49. doc_page_extractor-0.2.0.dist-info/licenses/LICENSE +0 -661
  50. doc_page_extractor-0.2.0.dist-info/top_level.txt +0 -2
  51. tests/__init__.py +0 -0
  52. tests/test_history_bus.py +0 -55
@@ -1,70 +0,0 @@
1
- import os
2
- import torch
3
-
4
- from typing import Literal, Any
5
- from PIL.Image import Image
6
- from .types import TableLayoutParsedFormat, GetModelDir
7
- from .utils import expand_image
8
-
9
-
10
- OutputFormat = Literal["latex", "markdown", "html"]
11
-
12
- class Table:
13
- def __init__(
14
- self,
15
- device: Literal["cpu", "cuda"],
16
- get_model_dir: GetModelDir,
17
- ):
18
- self._model: Any | None = None
19
- self._model_path: str = get_model_dir()
20
- self._ban: bool = False
21
- if device == "cpu" or not torch.cuda.is_available():
22
- self._ban = True
23
-
24
- def predict(self, image: Image, format: TableLayoutParsedFormat) -> str | None:
25
- if self._ban:
26
- print("CUDA is not available. You cannot parse table from image.")
27
- return None
28
-
29
- output_format: str
30
- if format == TableLayoutParsedFormat.LATEX:
31
- output_format = "latex"
32
- elif format == TableLayoutParsedFormat.MARKDOWN:
33
- output_format = "markdown"
34
- elif format == TableLayoutParsedFormat.HTML:
35
- output_format = "html"
36
- else:
37
- raise ValueError(f"Table format {format} is not supported.")
38
-
39
- image = expand_image(image, 0.1)
40
- model = self._get_model()
41
-
42
- with torch.no_grad():
43
- results = model([image], output_format=output_format)
44
-
45
- if len(results) == 0:
46
- return None
47
-
48
- return results[0]
49
-
50
- def _get_model(self):
51
- if self._model is None:
52
- local_files_only: bool
53
- if os.path.exists(self._model_path):
54
- local_files_only = True
55
- else:
56
- local_files_only = False
57
- os.makedirs(self._model_path)
58
-
59
- from .struct_eqtable import build_model
60
- model = build_model(
61
- model_ckpt=self._model_path,
62
- max_new_tokens=1024,
63
- max_time=30,
64
- lmdeploy=False,
65
- flash_attn=True,
66
- batch_size=1,
67
- local_files_only=local_files_only,
68
- )
69
- self._model = model.cuda()
70
- return self._model
@@ -1,91 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Literal, Callable, Protocol, runtime_checkable, List
3
- from enum import auto, Enum
4
- from PIL.Image import Image
5
- from .rectangle import Rectangle
6
-
7
-
8
- @dataclass
9
- class OCRFragment:
10
- order: int
11
- text: str
12
- rank: float
13
- rect: Rectangle
14
-
15
- class LayoutClass(Enum):
16
- TITLE = 0
17
- PLAIN_TEXT = 1
18
- ABANDON = 2
19
- FIGURE = 3
20
- FIGURE_CAPTION = 4
21
- TABLE = 5
22
- TABLE_CAPTION = 6
23
- TABLE_FOOTNOTE = 7
24
- ISOLATE_FORMULA = 8
25
- FORMULA_CAPTION = 9
26
-
27
- class TableLayoutParsedFormat(Enum):
28
- LATEX = auto()
29
- MARKDOWN = auto()
30
- HTML = auto()
31
-
32
- @dataclass
33
- class BaseLayout:
34
- rect: Rectangle
35
- fragments: List[OCRFragment]
36
-
37
- @dataclass
38
- class PlainLayout(BaseLayout):
39
- cls: Literal[
40
- LayoutClass.TITLE,
41
- LayoutClass.PLAIN_TEXT,
42
- LayoutClass.ABANDON,
43
- LayoutClass.FIGURE,
44
- LayoutClass.FIGURE_CAPTION,
45
- LayoutClass.TABLE_CAPTION,
46
- LayoutClass.TABLE_FOOTNOTE,
47
- LayoutClass.FORMULA_CAPTION,
48
- ]
49
-
50
- @dataclass
51
- class TableLayout(BaseLayout):
52
- parsed: tuple[str, TableLayoutParsedFormat] | None
53
- cls: LayoutClass.TABLE
54
-
55
- @dataclass
56
- class FormulaLayout(BaseLayout):
57
- latex: str | None
58
- cls: LayoutClass.ISOLATE_FORMULA
59
-
60
- Layout = PlainLayout | TableLayout | FormulaLayout
61
-
62
-
63
- @dataclass
64
- class ExtractedResult:
65
- rotation: float
66
- layouts: List[Layout]
67
- extracted_image: Image | None
68
- adjusted_image: Image | None
69
-
70
- GetModelDir = Callable[[], str]
71
-
72
-
73
- @runtime_checkable
74
- class ModelsDownloader(Protocol):
75
-
76
- def onnx_ocr(self) -> str:
77
- pass
78
-
79
- def yolo(self) -> str:
80
- pass
81
-
82
- def layoutreader(self) -> str:
83
- pass
84
-
85
- def struct_eqtable(self) -> str:
86
- pass
87
-
88
- def latex(self) -> str:
89
- pass
90
-
91
-
@@ -1,32 +0,0 @@
1
- import os
2
- import re
3
-
4
- from math import ceil
5
- from PIL.Image import Image
6
- from PIL.ImageOps import expand
7
-
8
-
9
- def ensure_dir(path: str) -> str:
10
- path = os.path.abspath(path)
11
- os.makedirs(path, exist_ok=True)
12
- return path
13
-
14
- def is_space_text(text: str) -> bool:
15
- return re.match(r"^\s*$", text)
16
-
17
- def expand_image(image: Image, percent: float):
18
- width, height = image.size
19
- border_width = ceil(width * percent)
20
- border_height = ceil(height * percent)
21
- fill_color: tuple[int, ...]
22
-
23
- if image.mode == "RGBA":
24
- fill_color = (255, 255, 255, 255)
25
- else:
26
- fill_color = (255, 255, 255)
27
-
28
- return expand(
29
- image=image,
30
- border=(border_width, border_height),
31
- fill=fill_color,
32
- )
@@ -1,85 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: doc-page-extractor
3
- Version: 0.2.0
4
- Summary: doc page extractor can identify text and format in images and return structured data.
5
- Home-page: https://github.com/Moskize91/doc-page-extractor
6
- Author: Tao Zeyu
7
- Author-email: i@taozeyu.com
8
- Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: opencv-python<5.0,>=4.10.0
11
- Requires-Dist: pillow<11.0,>=10.3
12
- Requires-Dist: pyclipper<2.0,>=1.2.0
13
- Requires-Dist: numpy<2.0,>=1.24.0
14
- Requires-Dist: shapely<3.0,>=2.0.0
15
- Requires-Dist: transformers<=4.47,>=4.42.4
16
- Requires-Dist: doclayout_yolo>=0.0.3
17
- Requires-Dist: pix2tex<=0.2.0,>=0.1.4
18
- Requires-Dist: accelerate<2.0,>=1.6.0
19
- Requires-Dist: huggingface_hub>=0.30.2
20
- Dynamic: author
21
- Dynamic: author-email
22
- Dynamic: description
23
- Dynamic: description-content-type
24
- Dynamic: home-page
25
- Dynamic: license-file
26
- Dynamic: requires-dist
27
- Dynamic: summary
28
-
29
- # doc page extractor
30
-
31
- English | [中文](./README_zh-CN.md)
32
-
33
- ## Introduction
34
-
35
- doc page extractor can identify text and format in images and return structured data.
36
-
37
- ## Installation
38
-
39
- ```shell
40
- pip install doc-page-extractor
41
- ```
42
-
43
- ```shell
44
- pip install onnxruntime==1.21.0
45
- ```
46
-
47
- ## Using CUDA
48
-
49
- Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
50
-
51
- In addition, replace the command to install `onnxruntime` in the previous article with the following:
52
-
53
- ```shell
54
- pip install onnxruntime-gpu==1.21.0
55
- ```
56
-
57
- ## Example
58
-
59
- ```python
60
- from PIL import Image
61
- from doc_page_extractor import DocExtractor
62
-
63
- extractor = DocExtractor(
64
- model_dir_path=model_path, # Folder address where AI model is downloaded and installed
65
- device="cpu", # If you want to use CUDA, please change to device="cuda".
66
- )
67
- with Image.open("/path/to/your/image.png") as image:
68
- result = extractor.extract(
69
- image=image,
70
- lang="ch", # Language of image text
71
- )
72
- for layout in result.layouts:
73
- for fragment in layout.fragments:
74
- print(fragment.rect, fragment.text)
75
- ```
76
-
77
- ## Acknowledgements
78
-
79
- The code of `doc_page_extractor/onnxocr` in this repo comes from [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR).
80
-
81
- - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
82
- - [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR)
83
- - [layoutreader](https://github.com/ppaanngggg/layoutreader)
84
- - [StructEqTable](https://github.com/Alpha-Innovator/StructEqTable-Deploy)
85
- - [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
@@ -1,45 +0,0 @@
1
- doc_page_extractor/__init__.py,sha256=gFRRw14P0SgDw67lggIwpMSi3vB4XRTZbGLZOFNWmsg,331
2
- doc_page_extractor/clipper.py,sha256=5S1TI0aqMebwlPv_Ih4Nxpp6MchEjOih-CiZfMWUAhI,3201
3
- doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
4
- doc_page_extractor/extractor.py,sha256=c1wmRWe63bcQUQwV2kRRcnRm25tZ0sIN4SXXZ93wIPg,7113
5
- doc_page_extractor/latex.py,sha256=rAKbzAO5xuN9twm8R_aWEUL8Lh8SAiAodBH7-nlJuR8,981
6
- doc_page_extractor/layout_order.py,sha256=IABcrSKbImrBQIfE1L1WmHxUsRVW1or__fWWOCWEQAg,7366
7
- doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
8
- doc_page_extractor/models.py,sha256=9cLHD5w6BbTIMso4guSLs9sLtN1AAjvZEePJ6WPSKYc,2786
9
- doc_page_extractor/ocr.py,sha256=32mnGRoooy403c8sltuGccKWPUgwbQj_gcDiqRm8Qls,5624
10
- doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
11
- doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
12
- doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
13
- doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
14
- doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
15
- doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
16
- doc_page_extractor/table.py,sha256=k6psiz43OShvV9tB0IX8K5h7KKZLEXkhLxOUPcnxK-M,1870
17
- doc_page_extractor/types.py,sha256=8dDxPxWdXgo5Q5jjPF49eYQl1TwFyM8Tr1IRZVtz1b4,1647
18
- doc_page_extractor/utils.py,sha256=ZlQVOLPUg_v5J8u6SoD8XtMG_JkF-ERgjubc4LO5_Lg,688
19
- doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
20
- doc_page_extractor/onnxocr/cls_postprocess.py,sha256=o879Ned0RMUERYLviuToZ0xTvhn2UsYAb-yPC5gj8h4,822
21
- doc_page_extractor/onnxocr/db_postprocess.py,sha256=R3yXXfReiQgLaYIvvfnrFfshI202LjHMvcZwcLpjmTY,7913
22
- doc_page_extractor/onnxocr/imaug.py,sha256=Q192kIsRPI5zTm4RA_UUXlo6tvGJS8wrUaa-xrfnO_w,811
23
- doc_page_extractor/onnxocr/operators.py,sha256=0nLiV1dWej9vdPa_DO04F7SvqF-l9NOFgHUuHUPNvsw,5556
24
- doc_page_extractor/onnxocr/predict_base.py,sha256=8AljJTHGNxlDZb2xWEJmuHor2MFVBHk7xUtstrU2G8M,1439
25
- doc_page_extractor/onnxocr/predict_cls.py,sha256=ua5fN1O5-TmJX4Vk0rseZiFFKaf949I7X1Uehu1fjRo,3569
26
- doc_page_extractor/onnxocr/predict_det.py,sha256=8LOBHYkxFRixEU_2a6VCO_mN2obQDi5lUeYPNSVP-q4,4576
27
- doc_page_extractor/onnxocr/predict_rec.py,sha256=UsgPhl6X3frx5u-LzIEPITOM3WJ1iAmTVznsHgXq8f8,11555
28
- doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
29
- doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
30
- doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
31
- doc_page_extractor/struct_eqtable/__init__.py,sha256=QoTsNuJfpNSrMIMd6Cot1jJqWk88_lDqFP_C2rcVJO4,1329
32
- doc_page_extractor/struct_eqtable/internvl/__init__.py,sha256=2aOsU-aHkFv_gjdP8LeUXjj_9-0d4x79iyxh4cCzaEw,79
33
- doc_page_extractor/struct_eqtable/internvl/conversation.py,sha256=s7DceRlM6JtHmxgyuE6vqu5XVT1fHzhzCL_I6r8MI1c,15129
34
- doc_page_extractor/struct_eqtable/internvl/internvl.py,sha256=ovVZG-PuBrsj_9lEoNPOygJ-2en3v6gPzRfWjDpWNOM,7678
35
- doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py,sha256=ACHxFntxS38G43PzE955Nv4fjKk_-Oz4y_o9JEjQwlg,2608
36
- doc_page_extractor/struct_eqtable/pix2s/__init__.py,sha256=cXRo4eg6u1-TXktZ8rQf0HIzLmmScIwYQhbxMKl-MyA,76
37
- doc_page_extractor/struct_eqtable/pix2s/pix2s.py,sha256=fCNve8PNeJ3-AWJIhSeGtp-mYKoMXfW0CIpszkQnAaA,2535
38
- doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py,sha256=zSGw45JhWdZ3iuJel5Chsy-NzsOHx9QyPQIUAzzCjFE,43880
39
- doc_page_extractor-0.2.0.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
40
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- tests/test_history_bus.py,sha256=g-bpDIiebyEHKDH0YS5OHF2ONfhZt3-EFLZhWJn94WE,2534
42
- doc_page_extractor-0.2.0.dist-info/METADATA,sha256=AkYx5BNYLoJ4ShceYRGgmstIukhU9JGsNk3wiCLVb60,2475
43
- doc_page_extractor-0.2.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
44
- doc_page_extractor-0.2.0.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
45
- doc_page_extractor-0.2.0.dist-info/RECORD,,