doc-page-extractor 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

@@ -2,6 +2,7 @@ from .extractor import DocExtractor
2
2
  from .clipper import clip, clip_from_image
3
3
  from .plot import plot
4
4
  from .rectangle import Point, Rectangle
5
+ from .model import Model, HuggingfaceModel
5
6
  from .types import (
6
7
  ExtractedResult,
7
8
  OCRFragment,
@@ -52,6 +52,13 @@ class DocExtractor:
52
52
  self._latex: LaTeX = LaTeX(device, model)
53
53
  self._layout_order: LayoutOrder = LayoutOrder(device, model)
54
54
 
55
+ def prepare_models(self):
56
+ self._model.get_onnx_ocr_path()
57
+ self._model.get_yolo_path()
58
+ self._model.get_layoutreader_path()
59
+ self._model.get_struct_eqtable_path()
60
+ self._model.get_latex_path()
61
+
55
62
  def extract(
56
63
  self,
57
64
  image: Image,
@@ -1,10 +1,14 @@
1
1
  from os import PathLike
2
+ from time import sleep
2
3
  from typing import runtime_checkable, Protocol
3
4
  from pathlib import Path
4
5
  from threading import Lock
5
6
  from huggingface_hub import hf_hub_download, snapshot_download, try_to_load_from_cache
6
7
 
7
8
 
9
+ _RETRY_TIMES = 6
10
+ _RETRY_SLEEP = 3.5
11
+
8
12
  @runtime_checkable
9
13
  class Model(Protocol):
10
14
  def get_onnx_ocr_path(self) -> Path:
@@ -31,9 +35,10 @@ class HuggingfaceModel(Model):
31
35
  def get_onnx_ocr_path(self) -> Path:
32
36
  return self._get_model_path(
33
37
  repo_id="moskize/OnnxOCR",
34
- filename=None,
38
+ filename="README.md",
35
39
  repo_type=None,
36
- is_snapshot=True
40
+ is_snapshot=True,
41
+ wanna_dir_path=True,
37
42
  )
38
43
 
39
44
  def get_yolo_path(self) -> Path:
@@ -42,14 +47,16 @@ class HuggingfaceModel(Model):
42
47
  filename="models/Layout/YOLO/doclayout_yolo_ft.pt",
43
48
  repo_type=None,
44
49
  is_snapshot=False,
50
+ wanna_dir_path=False,
45
51
  )
46
52
 
47
53
  def get_layoutreader_path(self) -> Path:
48
54
  return self._get_model_path(
49
55
  repo_id="hantian/layoutreader",
50
- filename=None,
56
+ filename="model.safetensors",
51
57
  repo_type=None,
52
58
  is_snapshot=True,
59
+ wanna_dir_path=True,
53
60
  )
54
61
 
55
62
  def get_struct_eqtable_path(self) -> Path:
@@ -58,6 +65,7 @@ class HuggingfaceModel(Model):
58
65
  filename="model.safetensors",
59
66
  repo_type=None,
60
67
  is_snapshot=True,
68
+ wanna_dir_path=True,
61
69
  )
62
70
 
63
71
  def get_latex_path(self) -> Path:
@@ -66,38 +74,60 @@ class HuggingfaceModel(Model):
66
74
  filename="checkpoints/weights.pth",
67
75
  repo_type="space",
68
76
  is_snapshot=True,
77
+ wanna_dir_path=True,
69
78
  )
70
79
 
71
80
  def _get_model_path(
72
81
  self,
73
82
  repo_id: str,
74
- filename: str | None,
83
+ filename: str,
75
84
  repo_type: str | None,
76
85
  is_snapshot: bool,
86
+ wanna_dir_path: bool,
77
87
  ) -> Path:
88
+
78
89
  with self._lock:
79
- cache_filename = "README.md"
80
- if filename is not None:
81
- cache_filename = filename
82
90
  model_path = try_to_load_from_cache(
83
91
  repo_id=repo_id,
84
- filename=cache_filename,
92
+ filename=filename,
85
93
  repo_type=repo_type,
86
94
  cache_dir=self._model_cache_dir
87
95
  )
88
96
  if isinstance(model_path, str):
89
- if filename is None:
90
- model_path = Path(model_path).parent
91
-
92
- elif is_snapshot:
93
- model_path = snapshot_download(
94
- cache_dir=self._model_cache_dir,
95
- repo_id=repo_id,
96
- )
97
+ model_path = Path(model_path)
98
+ if wanna_dir_path:
99
+ for _ in Path(filename).parts:
100
+ model_path = model_path.parent
101
+
97
102
  else:
98
- model_path = hf_hub_download(
99
- cache_dir=self._model_cache_dir,
100
- repo_id=repo_id,
101
- filename=filename,
102
- )
103
- return Path(model_path)
103
+ # https://github.com/huggingface/huggingface_hub/issues/1542#issuecomment-1630465844
104
+ latest_error: ConnectionError | None = None
105
+ for i in range(_RETRY_TIMES + 1):
106
+ if latest_error is not None:
107
+ print(f"Retrying to download {repo_id} model, attempt {i + 1}/{_RETRY_TIMES}...")
108
+ sleep(_RETRY_SLEEP)
109
+ try:
110
+ if is_snapshot:
111
+ model_path = snapshot_download(
112
+ cache_dir=self._model_cache_dir,
113
+ repo_id=repo_id,
114
+ repo_type=repo_type,
115
+ resume_download=True,
116
+ )
117
+ else:
118
+ model_path = hf_hub_download(
119
+ cache_dir=self._model_cache_dir,
120
+ repo_id=repo_id,
121
+ repo_type=repo_type,
122
+ filename=filename,
123
+ resume_download=True,
124
+ )
125
+ latest_error = None
126
+ except ConnectionError as err:
127
+ latest_error = err
128
+
129
+ if latest_error is not None:
130
+ raise latest_error
131
+ model_path = Path(model_path)
132
+
133
+ return model_path
@@ -1,30 +1,33 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.3
2
2
  Name: doc-page-extractor
3
- Version: 0.2.1
4
- Summary: doc page extractor can identify text and format in images and return structured data.
5
- Home-page: https://github.com/Moskize91/doc-page-extractor
3
+ Version: 0.2.3
4
+ Summary:
5
+ License: AGPL-3.0
6
6
  Author: Tao Zeyu
7
7
  Author-email: i@taozeyu.com
8
+ Maintainer: Tao Zeyu
9
+ Maintainer-email: i@taozeyu.com
10
+ Requires-Python: >=3.10,<3.13
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Dist: accelerate (>=1.6.0,<2.0)
20
+ Requires-Dist: doclayout_yolo (>=0.0.3)
21
+ Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
22
+ Requires-Dist: numpy (>=1.24.0,<2.0)
23
+ Requires-Dist: opencv-python (>=4.10.0,<5.0)
24
+ Requires-Dist: pillow (>=10.3,<11.0)
25
+ Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
26
+ Requires-Dist: pyclipper (>=1.2.0,<2.0)
27
+ Requires-Dist: shapely (>=2.0.0,<3.0)
28
+ Requires-Dist: transformers (>=4.42.4,<=4.47)
29
+ Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
8
30
  Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: opencv-python<5.0,>=4.10.0
11
- Requires-Dist: pillow<11.0,>=10.3
12
- Requires-Dist: pyclipper<2.0,>=1.2.0
13
- Requires-Dist: numpy<2.0,>=1.24.0
14
- Requires-Dist: shapely<3.0,>=2.0.0
15
- Requires-Dist: transformers<=4.47,>=4.42.4
16
- Requires-Dist: doclayout_yolo>=0.0.3
17
- Requires-Dist: pix2tex<=0.2.0,>=0.1.4
18
- Requires-Dist: accelerate<2.0,>=1.6.0
19
- Requires-Dist: huggingface_hub<1.0,>=0.30.2
20
- Dynamic: author
21
- Dynamic: author-email
22
- Dynamic: description
23
- Dynamic: description-content-type
24
- Dynamic: home-page
25
- Dynamic: license-file
26
- Dynamic: requires-dist
27
- Dynamic: summary
28
31
 
29
32
  # doc page extractor
30
33
 
@@ -1,21 +1,13 @@
1
- doc_page_extractor/__init__.py,sha256=Q3cZxT1wpjb1kGI0fJ9YvL4Fh_rBN3NdLqaZh4ATFQM,312
1
+ doc_page_extractor/__init__.py,sha256=rt_XALcqNNg3iVkMTUHltWxvdweH2FY6Y_olU2TkVBY,355
2
2
  doc_page_extractor/clipper.py,sha256=5S1TI0aqMebwlPv_Ih4Nxpp6MchEjOih-CiZfMWUAhI,3201
3
3
  doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
4
- doc_page_extractor/extractor.py,sha256=SGDPEIjmp5HiAYq_nSVsKFTt0IvqJJXXDZRfFFRlfzU,6993
4
+ doc_page_extractor/extractor.py,sha256=7dJ1N4d2_9tB49XAfKBIJjEUvNtjf0CGqcm2uf6BMPg,7205
5
5
  doc_page_extractor/latex.py,sha256=kD3NIzZTEGUFIAqqyHYmNcfyTrlu77GB1YSFgzbFb7A,1024
6
6
  doc_page_extractor/layout_order.py,sha256=lm_rXzRZ3AtufaRwxqZPfM9vplwaOlE2pnTNc7Z_oLM,7435
7
7
  doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
8
- doc_page_extractor/model.py,sha256=BBrsY2iQXCIsvqemjL37pKRPV3EtKkXsl7yA5_SI8RE,2610
8
+ doc_page_extractor/model.py,sha256=0J8Nt9Xl6mJu5VaFQ0nyUJrM3UmFg48_lXv4mDJSEpQ,3617
9
9
  doc_page_extractor/ocr.py,sha256=niZY4ZgbbbV-IeTrtF3Y8LPPrynU2wvZKRVYbBP1Dog,5586
10
10
  doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
11
- doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
12
- doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
13
- doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
14
- doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
15
- doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
16
- doc_page_extractor/table.py,sha256=3esYqvHS9btINF9htmLrnDgTec2q7VvljaMXzDLj34w,1690
17
- doc_page_extractor/types.py,sha256=XtNfGuG6ZEwozFdk04mOD77m34zxr8aD9-fR9hbrrNU,1313
18
- doc_page_extractor/utils.py,sha256=ZlQVOLPUg_v5J8u6SoD8XtMG_JkF-ERgjubc4LO5_Lg,688
19
11
  doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
20
12
  doc_page_extractor/onnxocr/cls_postprocess.py,sha256=o879Ned0RMUERYLviuToZ0xTvhn2UsYAb-yPC5gj8h4,822
21
13
  doc_page_extractor/onnxocr/db_postprocess.py,sha256=R3yXXfReiQgLaYIvvfnrFfshI202LjHMvcZwcLpjmTY,7913
@@ -28,18 +20,15 @@ doc_page_extractor/onnxocr/predict_rec.py,sha256=UsgPhl6X3frx5u-LzIEPITOM3WJ1iAm
28
20
  doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
29
21
  doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
30
22
  doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
31
- doc_page_extractor/struct_eqtable/__init__.py,sha256=QoTsNuJfpNSrMIMd6Cot1jJqWk88_lDqFP_C2rcVJO4,1329
32
- doc_page_extractor/struct_eqtable/internvl/__init__.py,sha256=2aOsU-aHkFv_gjdP8LeUXjj_9-0d4x79iyxh4cCzaEw,79
33
- doc_page_extractor/struct_eqtable/internvl/conversation.py,sha256=s7DceRlM6JtHmxgyuE6vqu5XVT1fHzhzCL_I6r8MI1c,15129
34
- doc_page_extractor/struct_eqtable/internvl/internvl.py,sha256=ovVZG-PuBrsj_9lEoNPOygJ-2en3v6gPzRfWjDpWNOM,7678
35
- doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py,sha256=ACHxFntxS38G43PzE955Nv4fjKk_-Oz4y_o9JEjQwlg,2608
36
- doc_page_extractor/struct_eqtable/pix2s/__init__.py,sha256=cXRo4eg6u1-TXktZ8rQf0HIzLmmScIwYQhbxMKl-MyA,76
37
- doc_page_extractor/struct_eqtable/pix2s/pix2s.py,sha256=fCNve8PNeJ3-AWJIhSeGtp-mYKoMXfW0CIpszkQnAaA,2535
38
- doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py,sha256=zSGw45JhWdZ3iuJel5Chsy-NzsOHx9QyPQIUAzzCjFE,43880
39
- doc_page_extractor-0.2.1.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
40
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- tests/test_history_bus.py,sha256=g-bpDIiebyEHKDH0YS5OHF2ONfhZt3-EFLZhWJn94WE,2534
42
- doc_page_extractor-0.2.1.dist-info/METADATA,sha256=lAM7CCs34AN2TPf9dvNQQZu2EsGFVLMXPKqSaRddZQQ,2480
43
- doc_page_extractor-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
- doc_page_extractor-0.2.1.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
45
- doc_page_extractor-0.2.1.dist-info/RECORD,,
23
+ doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
24
+ doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
25
+ doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
26
+ doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
27
+ doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
28
+ doc_page_extractor/table.py,sha256=3esYqvHS9btINF9htmLrnDgTec2q7VvljaMXzDLj34w,1690
29
+ doc_page_extractor/types.py,sha256=XtNfGuG6ZEwozFdk04mOD77m34zxr8aD9-fR9hbrrNU,1313
30
+ doc_page_extractor/utils.py,sha256=ZlQVOLPUg_v5J8u6SoD8XtMG_JkF-ERgjubc4LO5_Lg,688
31
+ doc_page_extractor-0.2.3.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
32
+ doc_page_extractor-0.2.3.dist-info/METADATA,sha256=KYcecqR5aLCGkepWf-gDLur-oRrIHrsd3DNB0BeI8EU,2756
33
+ doc_page_extractor-0.2.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
34
+ doc_page_extractor-0.2.3.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: poetry-core 2.1.3
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -1,49 +0,0 @@
1
- from .pix2s import Pix2Struct, Pix2StructTensorRT
2
- from .internvl import InternVL, InternVL_LMDeploy
3
-
4
- from transformers import AutoConfig
5
-
6
-
7
- __ALL_MODELS__ = {
8
- 'Pix2Struct': Pix2Struct,
9
- 'Pix2StructTensorRT': Pix2StructTensorRT,
10
- 'InternVL': InternVL,
11
- 'InternVL_LMDeploy': InternVL_LMDeploy,
12
- }
13
-
14
-
15
- def get_model_name(model_path):
16
- model_config = AutoConfig.from_pretrained(
17
- model_path,
18
- trust_remote_code=True,
19
- )
20
-
21
- if 'Pix2Struct' in model_config.architectures[0]:
22
- model_name = 'Pix2Struct'
23
- elif 'InternVL' in model_config.architectures[0]:
24
- model_name = 'InternVL'
25
- else:
26
- raise ValueError(f"Unsupported model type: {model_config.architectures[0]}")
27
-
28
- return model_name
29
-
30
-
31
- def build_model(
32
- model_ckpt='U4R/StructTable-InternVL2-1B',
33
- cache_dir=None,
34
- local_files_only=None,
35
- **kwargs,
36
- ):
37
- model_name = get_model_name(model_ckpt)
38
- if model_name == 'InternVL' and kwargs.get('lmdeploy', False):
39
- model_name = 'InternVL_LMDeploy'
40
- elif model_name == 'Pix2Struct' and kwargs.get('tensorrt_path', None):
41
- model_name = 'Pix2StructTensorRT'
42
-
43
- model = __ALL_MODELS__[model_name](
44
- model_ckpt,
45
- cache_dir=cache_dir,
46
- local_files_only=local_files_only,
47
- **kwargs
48
- )
49
- return model
@@ -1,2 +0,0 @@
1
- from .internvl import InternVL
2
- from .internvl_lmdeploy import InternVL_LMDeploy