doc-page-extractor 0.2.1__tar.gz → 0.2.3__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (51)
  1. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/PKG-INFO +26 -23
  2. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/__init__.py +1 -0
  3. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/extractor.py +7 -0
  4. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/model.py +52 -22
  5. doc_page_extractor-0.2.3/pyproject.toml +48 -0
  6. doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/__init__.py +0 -49
  7. doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -2
  8. doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -394
  9. doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -198
  10. doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -81
  11. doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -3
  12. doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -76
  13. doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -1047
  14. doc_page_extractor-0.2.1/doc_page_extractor.egg-info/PKG-INFO +0 -85
  15. doc_page_extractor-0.2.1/doc_page_extractor.egg-info/SOURCES.txt +0 -48
  16. doc_page_extractor-0.2.1/doc_page_extractor.egg-info/dependency_links.txt +0 -1
  17. doc_page_extractor-0.2.1/doc_page_extractor.egg-info/requires.txt +0 -10
  18. doc_page_extractor-0.2.1/doc_page_extractor.egg-info/top_level.txt +0 -2
  19. doc_page_extractor-0.2.1/setup.cfg +0 -4
  20. doc_page_extractor-0.2.1/setup.py +0 -28
  21. doc_page_extractor-0.2.1/tests/__init__.py +0 -0
  22. doc_page_extractor-0.2.1/tests/test_history_bus.py +0 -55
  23. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/LICENSE +0 -0
  24. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/README.md +0 -0
  25. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/clipper.py +0 -0
  26. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/downloader.py +0 -0
  27. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/latex.py +0 -0
  28. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/layout_order.py +0 -0
  29. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/layoutreader.py +0 -0
  30. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/ocr.py +0 -0
  31. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/ocr_corrector.py +0 -0
  32. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/__init__.py +0 -0
  33. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  34. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  35. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/imaug.py +0 -0
  36. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/operators.py +0 -0
  37. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  38. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  39. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  40. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  41. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  42. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  43. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/utils.py +0 -0
  44. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/overlap.py +0 -0
  45. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/plot.py +0 -0
  46. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/raw_optimizer.py +0 -0
  47. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/rectangle.py +0 -0
  48. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/rotation.py +0 -0
  49. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/table.py +0 -0
  50. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/types.py +0 -0
  51. {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/utils.py +0 -0
@@ -1,30 +1,33 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.3
2
2
  Name: doc-page-extractor
3
- Version: 0.2.1
4
- Summary: doc page extractor can identify text and format in images and return structured data.
5
- Home-page: https://github.com/Moskize91/doc-page-extractor
3
+ Version: 0.2.3
4
+ Summary:
5
+ License: AGPL-3.0
6
6
  Author: Tao Zeyu
7
7
  Author-email: i@taozeyu.com
8
+ Maintainer: Tao Zeyu
9
+ Maintainer-email: i@taozeyu.com
10
+ Requires-Python: >=3.10,<3.13
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Dist: accelerate (>=1.6.0,<2.0)
20
+ Requires-Dist: doclayout_yolo (>=0.0.3)
21
+ Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
22
+ Requires-Dist: numpy (>=1.24.0,<2.0)
23
+ Requires-Dist: opencv-python (>=4.10.0,<5.0)
24
+ Requires-Dist: pillow (>=10.3,<11.0)
25
+ Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
26
+ Requires-Dist: pyclipper (>=1.2.0,<2.0)
27
+ Requires-Dist: shapely (>=2.0.0,<3.0)
28
+ Requires-Dist: transformers (>=4.42.4,<=4.47)
29
+ Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
8
30
  Description-Content-Type: text/markdown
9
- License-File: LICENSE
10
- Requires-Dist: opencv-python<5.0,>=4.10.0
11
- Requires-Dist: pillow<11.0,>=10.3
12
- Requires-Dist: pyclipper<2.0,>=1.2.0
13
- Requires-Dist: numpy<2.0,>=1.24.0
14
- Requires-Dist: shapely<3.0,>=2.0.0
15
- Requires-Dist: transformers<=4.47,>=4.42.4
16
- Requires-Dist: doclayout_yolo>=0.0.3
17
- Requires-Dist: pix2tex<=0.2.0,>=0.1.4
18
- Requires-Dist: accelerate<2.0,>=1.6.0
19
- Requires-Dist: huggingface_hub<1.0,>=0.30.2
20
- Dynamic: author
21
- Dynamic: author-email
22
- Dynamic: description
23
- Dynamic: description-content-type
24
- Dynamic: home-page
25
- Dynamic: license-file
26
- Dynamic: requires-dist
27
- Dynamic: summary
28
31
 
29
32
  # doc page extractor
30
33
 
@@ -2,6 +2,7 @@ from .extractor import DocExtractor
2
2
  from .clipper import clip, clip_from_image
3
3
  from .plot import plot
4
4
  from .rectangle import Point, Rectangle
5
+ from .model import Model, HuggingfaceModel
5
6
  from .types import (
6
7
  ExtractedResult,
7
8
  OCRFragment,
@@ -52,6 +52,13 @@ class DocExtractor:
52
52
  self._latex: LaTeX = LaTeX(device, model)
53
53
  self._layout_order: LayoutOrder = LayoutOrder(device, model)
54
54
 
55
+ def prepare_models(self):
56
+ self._model.get_onnx_ocr_path()
57
+ self._model.get_yolo_path()
58
+ self._model.get_layoutreader_path()
59
+ self._model.get_struct_eqtable_path()
60
+ self._model.get_latex_path()
61
+
55
62
  def extract(
56
63
  self,
57
64
  image: Image,
@@ -1,10 +1,14 @@
1
1
  from os import PathLike
2
+ from time import sleep
2
3
  from typing import runtime_checkable, Protocol
3
4
  from pathlib import Path
4
5
  from threading import Lock
5
6
  from huggingface_hub import hf_hub_download, snapshot_download, try_to_load_from_cache
6
7
 
7
8
 
9
+ _RETRY_TIMES = 6
10
+ _RETRY_SLEEP = 3.5
11
+
8
12
  @runtime_checkable
9
13
  class Model(Protocol):
10
14
  def get_onnx_ocr_path(self) -> Path:
@@ -31,9 +35,10 @@ class HuggingfaceModel(Model):
31
35
  def get_onnx_ocr_path(self) -> Path:
32
36
  return self._get_model_path(
33
37
  repo_id="moskize/OnnxOCR",
34
- filename=None,
38
+ filename="README.md",
35
39
  repo_type=None,
36
- is_snapshot=True
40
+ is_snapshot=True,
41
+ wanna_dir_path=True,
37
42
  )
38
43
 
39
44
  def get_yolo_path(self) -> Path:
@@ -42,14 +47,16 @@ class HuggingfaceModel(Model):
42
47
  filename="models/Layout/YOLO/doclayout_yolo_ft.pt",
43
48
  repo_type=None,
44
49
  is_snapshot=False,
50
+ wanna_dir_path=False,
45
51
  )
46
52
 
47
53
  def get_layoutreader_path(self) -> Path:
48
54
  return self._get_model_path(
49
55
  repo_id="hantian/layoutreader",
50
- filename=None,
56
+ filename="model.safetensors",
51
57
  repo_type=None,
52
58
  is_snapshot=True,
59
+ wanna_dir_path=True,
53
60
  )
54
61
 
55
62
  def get_struct_eqtable_path(self) -> Path:
@@ -58,6 +65,7 @@ class HuggingfaceModel(Model):
58
65
  filename="model.safetensors",
59
66
  repo_type=None,
60
67
  is_snapshot=True,
68
+ wanna_dir_path=True,
61
69
  )
62
70
 
63
71
  def get_latex_path(self) -> Path:
@@ -66,38 +74,60 @@ class HuggingfaceModel(Model):
66
74
  filename="checkpoints/weights.pth",
67
75
  repo_type="space",
68
76
  is_snapshot=True,
77
+ wanna_dir_path=True,
69
78
  )
70
79
 
71
80
  def _get_model_path(
72
81
  self,
73
82
  repo_id: str,
74
- filename: str | None,
83
+ filename: str,
75
84
  repo_type: str | None,
76
85
  is_snapshot: bool,
86
+ wanna_dir_path: bool,
77
87
  ) -> Path:
88
+
78
89
  with self._lock:
79
- cache_filename = "README.md"
80
- if filename is not None:
81
- cache_filename = filename
82
90
  model_path = try_to_load_from_cache(
83
91
  repo_id=repo_id,
84
- filename=cache_filename,
92
+ filename=filename,
85
93
  repo_type=repo_type,
86
94
  cache_dir=self._model_cache_dir
87
95
  )
88
96
  if isinstance(model_path, str):
89
- if filename is None:
90
- model_path = Path(model_path).parent
91
-
92
- elif is_snapshot:
93
- model_path = snapshot_download(
94
- cache_dir=self._model_cache_dir,
95
- repo_id=repo_id,
96
- )
97
+ model_path = Path(model_path)
98
+ if wanna_dir_path:
99
+ for _ in Path(filename).parts:
100
+ model_path = model_path.parent
101
+
97
102
  else:
98
- model_path = hf_hub_download(
99
- cache_dir=self._model_cache_dir,
100
- repo_id=repo_id,
101
- filename=filename,
102
- )
103
- return Path(model_path)
103
+ # https://github.com/huggingface/huggingface_hub/issues/1542#issuecomment-1630465844
104
+ latest_error: ConnectionError | None = None
105
+ for i in range(_RETRY_TIMES + 1):
106
+ if latest_error is not None:
107
+ print(f"Retrying to download {repo_id} model, attempt {i + 1}/{_RETRY_TIMES}...")
108
+ sleep(_RETRY_SLEEP)
109
+ try:
110
+ if is_snapshot:
111
+ model_path = snapshot_download(
112
+ cache_dir=self._model_cache_dir,
113
+ repo_id=repo_id,
114
+ repo_type=repo_type,
115
+ resume_download=True,
116
+ )
117
+ else:
118
+ model_path = hf_hub_download(
119
+ cache_dir=self._model_cache_dir,
120
+ repo_id=repo_id,
121
+ repo_type=repo_type,
122
+ filename=filename,
123
+ resume_download=True,
124
+ )
125
+ latest_error = None
126
+ except ConnectionError as err:
127
+ latest_error = err
128
+
129
+ if latest_error is not None:
130
+ raise latest_error
131
+ model_path = Path(model_path)
132
+
133
+ return model_path
@@ -0,0 +1,48 @@
1
+ [project]
2
+ name = "doc-page-extractor"
3
+ version = "0.2.3"
4
+ description = ""
5
+ authors = [
6
+ {name = "Tao Zeyu",email = "i@taozeyu.com"}
7
+ ]
8
+ maintainers = [
9
+ {name = "Tao Zeyu", email = "i@taozeyu.com"}
10
+ ]
11
+ license = {text = "AGPL-3.0"}
12
+ readme = "README.md"
13
+ requires-python = ">=3.10,<3.13"
14
+ dependencies = [
15
+ "opencv-python>=4.10.0,<5.0",
16
+ "pillow>=10.3,<11.0",
17
+ "pyclipper>=1.2.0,<2.0",
18
+ "numpy>=1.24.0,<2.0",
19
+ "shapely>=2.0.0,<3.0",
20
+ "transformers>=4.42.4,<=4.47",
21
+ "doclayout_yolo>=0.0.3",
22
+ "pix2tex>=0.1.4,<=0.2.0",
23
+ "accelerate>=1.6.0,<2.0",
24
+ "huggingface_hub>=0.33.0,<1.0",
25
+ ]
26
+
27
+
28
+ [build-system]
29
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
30
+ build-backend = "poetry.core.masonry.api"
31
+
32
+ [tool.poetry]
33
+ license = "AGPL-3.0"
34
+ readme = "README.md"
35
+ repository = "https://github.com/moskize91/doc-page-extractor"
36
+ packages = [
37
+ {include = "doc_page_extractor" }
38
+ ]
39
+ classifiers=[
40
+ "Development Status :: 2 - Pre-Alpha",
41
+ "Intended Audience :: Developers",
42
+ "License :: OSI Approved :: GNU Affero General Public License v3",
43
+ "Programming Language :: Python",
44
+ "Programming Language :: Python :: 3.10",
45
+ ]
46
+
47
+ [tool.poetry.group.dev.dependencies]
48
+ pylint = "^3.3.7"
@@ -1,49 +0,0 @@
1
- from .pix2s import Pix2Struct, Pix2StructTensorRT
2
- from .internvl import InternVL, InternVL_LMDeploy
3
-
4
- from transformers import AutoConfig
5
-
6
-
7
- __ALL_MODELS__ = {
8
- 'Pix2Struct': Pix2Struct,
9
- 'Pix2StructTensorRT': Pix2StructTensorRT,
10
- 'InternVL': InternVL,
11
- 'InternVL_LMDeploy': InternVL_LMDeploy,
12
- }
13
-
14
-
15
- def get_model_name(model_path):
16
- model_config = AutoConfig.from_pretrained(
17
- model_path,
18
- trust_remote_code=True,
19
- )
20
-
21
- if 'Pix2Struct' in model_config.architectures[0]:
22
- model_name = 'Pix2Struct'
23
- elif 'InternVL' in model_config.architectures[0]:
24
- model_name = 'InternVL'
25
- else:
26
- raise ValueError(f"Unsupported model type: {model_config.architectures[0]}")
27
-
28
- return model_name
29
-
30
-
31
- def build_model(
32
- model_ckpt='U4R/StructTable-InternVL2-1B',
33
- cache_dir=None,
34
- local_files_only=None,
35
- **kwargs,
36
- ):
37
- model_name = get_model_name(model_ckpt)
38
- if model_name == 'InternVL' and kwargs.get('lmdeploy', False):
39
- model_name = 'InternVL_LMDeploy'
40
- elif model_name == 'Pix2Struct' and kwargs.get('tensorrt_path', None):
41
- model_name = 'Pix2StructTensorRT'
42
-
43
- model = __ALL_MODELS__[model_name](
44
- model_ckpt,
45
- cache_dir=cache_dir,
46
- local_files_only=local_files_only,
47
- **kwargs
48
- )
49
- return model
@@ -1,2 +0,0 @@
1
- from .internvl import InternVL
2
- from .internvl_lmdeploy import InternVL_LMDeploy