doc-page-extractor 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of doc-page-extractor might be problematic. Click here for more details.
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/PKG-INFO +26 -23
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/__init__.py +1 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/extractor.py +7 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/model.py +52 -22
- doc_page_extractor-0.2.3/pyproject.toml +48 -0
- doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/__init__.py +0 -49
- doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -2
- doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -394
- doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -198
- doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -81
- doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -3
- doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -76
- doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -1047
- doc_page_extractor-0.2.1/doc_page_extractor.egg-info/PKG-INFO +0 -85
- doc_page_extractor-0.2.1/doc_page_extractor.egg-info/SOURCES.txt +0 -48
- doc_page_extractor-0.2.1/doc_page_extractor.egg-info/dependency_links.txt +0 -1
- doc_page_extractor-0.2.1/doc_page_extractor.egg-info/requires.txt +0 -10
- doc_page_extractor-0.2.1/doc_page_extractor.egg-info/top_level.txt +0 -2
- doc_page_extractor-0.2.1/setup.cfg +0 -4
- doc_page_extractor-0.2.1/setup.py +0 -28
- doc_page_extractor-0.2.1/tests/__init__.py +0 -0
- doc_page_extractor-0.2.1/tests/test_history_bus.py +0 -55
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/LICENSE +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/README.md +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/clipper.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/downloader.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/latex.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/layout_order.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/layoutreader.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/ocr.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/ocr_corrector.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/__init__.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/imaug.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/operators.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_base.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_det.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/predict_system.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/onnxocr/utils.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/overlap.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/plot.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/raw_optimizer.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/rectangle.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/rotation.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/table.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/types.py +0 -0
- {doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/utils.py +0 -0
|
@@ -1,30 +1,33 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
2
|
Name: doc-page-extractor
|
|
3
|
-
Version: 0.2.1
|
|
4
|
-
Summary:
|
|
5
|
-
|
|
3
|
+
Version: 0.2.3
|
|
4
|
+
Summary:
|
|
5
|
+
License: AGPL-3.0
|
|
6
6
|
Author: Tao Zeyu
|
|
7
7
|
Author-email: i@taozeyu.com
|
|
8
|
+
Maintainer: Tao Zeyu
|
|
9
|
+
Maintainer-email: i@taozeyu.com
|
|
10
|
+
Requires-Python: >=3.10,<3.13
|
|
11
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
|
14
|
+
Classifier: Programming Language :: Python
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Dist: accelerate (>=1.6.0,<2.0)
|
|
20
|
+
Requires-Dist: doclayout_yolo (>=0.0.3)
|
|
21
|
+
Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
|
|
22
|
+
Requires-Dist: numpy (>=1.24.0,<2.0)
|
|
23
|
+
Requires-Dist: opencv-python (>=4.10.0,<5.0)
|
|
24
|
+
Requires-Dist: pillow (>=10.3,<11.0)
|
|
25
|
+
Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
|
|
26
|
+
Requires-Dist: pyclipper (>=1.2.0,<2.0)
|
|
27
|
+
Requires-Dist: shapely (>=2.0.0,<3.0)
|
|
28
|
+
Requires-Dist: transformers (>=4.42.4,<=4.47)
|
|
29
|
+
Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
|
|
8
30
|
Description-Content-Type: text/markdown
|
|
9
|
-
License-File: LICENSE
|
|
10
|
-
Requires-Dist: opencv-python<5.0,>=4.10.0
|
|
11
|
-
Requires-Dist: pillow<11.0,>=10.3
|
|
12
|
-
Requires-Dist: pyclipper<2.0,>=1.2.0
|
|
13
|
-
Requires-Dist: numpy<2.0,>=1.24.0
|
|
14
|
-
Requires-Dist: shapely<3.0,>=2.0.0
|
|
15
|
-
Requires-Dist: transformers<=4.47,>=4.42.4
|
|
16
|
-
Requires-Dist: doclayout_yolo>=0.0.3
|
|
17
|
-
Requires-Dist: pix2tex<=0.2.0,>=0.1.4
|
|
18
|
-
Requires-Dist: accelerate<2.0,>=1.6.0
|
|
19
|
-
Requires-Dist: huggingface_hub<1.0,>=0.30.2
|
|
20
|
-
Dynamic: author
|
|
21
|
-
Dynamic: author-email
|
|
22
|
-
Dynamic: description
|
|
23
|
-
Dynamic: description-content-type
|
|
24
|
-
Dynamic: home-page
|
|
25
|
-
Dynamic: license-file
|
|
26
|
-
Dynamic: requires-dist
|
|
27
|
-
Dynamic: summary
|
|
28
31
|
|
|
29
32
|
# doc page extractor
|
|
30
33
|
|
|
@@ -52,6 +52,13 @@ class DocExtractor:
|
|
|
52
52
|
self._latex: LaTeX = LaTeX(device, model)
|
|
53
53
|
self._layout_order: LayoutOrder = LayoutOrder(device, model)
|
|
54
54
|
|
|
55
|
+
def prepare_models(self):
|
|
56
|
+
self._model.get_onnx_ocr_path()
|
|
57
|
+
self._model.get_yolo_path()
|
|
58
|
+
self._model.get_layoutreader_path()
|
|
59
|
+
self._model.get_struct_eqtable_path()
|
|
60
|
+
self._model.get_latex_path()
|
|
61
|
+
|
|
55
62
|
def extract(
|
|
56
63
|
self,
|
|
57
64
|
image: Image,
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
from os import PathLike
|
|
2
|
+
from time import sleep
|
|
2
3
|
from typing import runtime_checkable, Protocol
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from threading import Lock
|
|
5
6
|
from huggingface_hub import hf_hub_download, snapshot_download, try_to_load_from_cache
|
|
6
7
|
|
|
7
8
|
|
|
9
|
+
_RETRY_TIMES = 6
|
|
10
|
+
_RETRY_SLEEP = 3.5
|
|
11
|
+
|
|
8
12
|
@runtime_checkable
|
|
9
13
|
class Model(Protocol):
|
|
10
14
|
def get_onnx_ocr_path(self) -> Path:
|
|
@@ -31,9 +35,10 @@ class HuggingfaceModel(Model):
|
|
|
31
35
|
def get_onnx_ocr_path(self) -> Path:
|
|
32
36
|
return self._get_model_path(
|
|
33
37
|
repo_id="moskize/OnnxOCR",
|
|
34
|
-
filename=
|
|
38
|
+
filename="README.md",
|
|
35
39
|
repo_type=None,
|
|
36
|
-
is_snapshot=True
|
|
40
|
+
is_snapshot=True,
|
|
41
|
+
wanna_dir_path=True,
|
|
37
42
|
)
|
|
38
43
|
|
|
39
44
|
def get_yolo_path(self) -> Path:
|
|
@@ -42,14 +47,16 @@ class HuggingfaceModel(Model):
|
|
|
42
47
|
filename="models/Layout/YOLO/doclayout_yolo_ft.pt",
|
|
43
48
|
repo_type=None,
|
|
44
49
|
is_snapshot=False,
|
|
50
|
+
wanna_dir_path=False,
|
|
45
51
|
)
|
|
46
52
|
|
|
47
53
|
def get_layoutreader_path(self) -> Path:
|
|
48
54
|
return self._get_model_path(
|
|
49
55
|
repo_id="hantian/layoutreader",
|
|
50
|
-
filename=
|
|
56
|
+
filename="model.safetensors",
|
|
51
57
|
repo_type=None,
|
|
52
58
|
is_snapshot=True,
|
|
59
|
+
wanna_dir_path=True,
|
|
53
60
|
)
|
|
54
61
|
|
|
55
62
|
def get_struct_eqtable_path(self) -> Path:
|
|
@@ -58,6 +65,7 @@ class HuggingfaceModel(Model):
|
|
|
58
65
|
filename="model.safetensors",
|
|
59
66
|
repo_type=None,
|
|
60
67
|
is_snapshot=True,
|
|
68
|
+
wanna_dir_path=True,
|
|
61
69
|
)
|
|
62
70
|
|
|
63
71
|
def get_latex_path(self) -> Path:
|
|
@@ -66,38 +74,60 @@ class HuggingfaceModel(Model):
|
|
|
66
74
|
filename="checkpoints/weights.pth",
|
|
67
75
|
repo_type="space",
|
|
68
76
|
is_snapshot=True,
|
|
77
|
+
wanna_dir_path=True,
|
|
69
78
|
)
|
|
70
79
|
|
|
71
80
|
def _get_model_path(
|
|
72
81
|
self,
|
|
73
82
|
repo_id: str,
|
|
74
|
-
filename: str
|
|
83
|
+
filename: str,
|
|
75
84
|
repo_type: str | None,
|
|
76
85
|
is_snapshot: bool,
|
|
86
|
+
wanna_dir_path: bool,
|
|
77
87
|
) -> Path:
|
|
88
|
+
|
|
78
89
|
with self._lock:
|
|
79
|
-
cache_filename = "README.md"
|
|
80
|
-
if filename is not None:
|
|
81
|
-
cache_filename = filename
|
|
82
90
|
model_path = try_to_load_from_cache(
|
|
83
91
|
repo_id=repo_id,
|
|
84
|
-
filename=cache_filename,
|
|
92
|
+
filename=filename,
|
|
85
93
|
repo_type=repo_type,
|
|
86
94
|
cache_dir=self._model_cache_dir
|
|
87
95
|
)
|
|
88
96
|
if isinstance(model_path, str):
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
cache_dir=self._model_cache_dir,
|
|
95
|
-
repo_id=repo_id,
|
|
96
|
-
)
|
|
97
|
+
model_path = Path(model_path)
|
|
98
|
+
if wanna_dir_path:
|
|
99
|
+
for _ in Path(filename).parts:
|
|
100
|
+
model_path = model_path.parent
|
|
101
|
+
|
|
97
102
|
else:
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
103
|
+
# https://github.com/huggingface/huggingface_hub/issues/1542#issuecomment-1630465844
|
|
104
|
+
latest_error: ConnectionError | None = None
|
|
105
|
+
for i in range(_RETRY_TIMES + 1):
|
|
106
|
+
if latest_error is not None:
|
|
107
|
+
print(f"Retrying to download {repo_id} model, attempt {i + 1}/{_RETRY_TIMES}...")
|
|
108
|
+
sleep(_RETRY_SLEEP)
|
|
109
|
+
try:
|
|
110
|
+
if is_snapshot:
|
|
111
|
+
model_path = snapshot_download(
|
|
112
|
+
cache_dir=self._model_cache_dir,
|
|
113
|
+
repo_id=repo_id,
|
|
114
|
+
repo_type=repo_type,
|
|
115
|
+
resume_download=True,
|
|
116
|
+
)
|
|
117
|
+
else:
|
|
118
|
+
model_path = hf_hub_download(
|
|
119
|
+
cache_dir=self._model_cache_dir,
|
|
120
|
+
repo_id=repo_id,
|
|
121
|
+
repo_type=repo_type,
|
|
122
|
+
filename=filename,
|
|
123
|
+
resume_download=True,
|
|
124
|
+
)
|
|
125
|
+
latest_error = None
|
|
126
|
+
except ConnectionError as err:
|
|
127
|
+
latest_error = err
|
|
128
|
+
|
|
129
|
+
if latest_error is not None:
|
|
130
|
+
raise latest_error
|
|
131
|
+
model_path = Path(model_path)
|
|
132
|
+
|
|
133
|
+
return model_path
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "doc-page-extractor"
|
|
3
|
+
version = "0.2.3"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Tao Zeyu",email = "i@taozeyu.com"}
|
|
7
|
+
]
|
|
8
|
+
maintainers = [
|
|
9
|
+
{name = "Tao Zeyu", email = "i@taozeyu.com"}
|
|
10
|
+
]
|
|
11
|
+
license = {text = "AGPL-3.0"}
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10,<3.13"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"opencv-python>=4.10.0,<5.0",
|
|
16
|
+
"pillow>=10.3,<11.0",
|
|
17
|
+
"pyclipper>=1.2.0,<2.0",
|
|
18
|
+
"numpy>=1.24.0,<2.0",
|
|
19
|
+
"shapely>=2.0.0,<3.0",
|
|
20
|
+
"transformers>=4.42.4,<=4.47",
|
|
21
|
+
"doclayout_yolo>=0.0.3",
|
|
22
|
+
"pix2tex>=0.1.4,<=0.2.0",
|
|
23
|
+
"accelerate>=1.6.0,<2.0",
|
|
24
|
+
"huggingface_hub>=0.33.0,<1.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
[build-system]
|
|
29
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
30
|
+
build-backend = "poetry.core.masonry.api"
|
|
31
|
+
|
|
32
|
+
[tool.poetry]
|
|
33
|
+
license = "AGPL-3.0"
|
|
34
|
+
readme = "README.md"
|
|
35
|
+
repository = "https://github.com/moskize91/doc-page-extractor"
|
|
36
|
+
packages = [
|
|
37
|
+
{include = "doc_page_extractor" }
|
|
38
|
+
]
|
|
39
|
+
classifiers=[
|
|
40
|
+
"Development Status :: 2 - Pre-Alpha",
|
|
41
|
+
"Intended Audience :: Developers",
|
|
42
|
+
"License :: OSI Approved :: GNU Affero General Public License v3",
|
|
43
|
+
"Programming Language :: Python",
|
|
44
|
+
"Programming Language :: Python :: 3.10",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[tool.poetry.group.dev.dependencies]
|
|
48
|
+
pylint = "^3.3.7"
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
from .pix2s import Pix2Struct, Pix2StructTensorRT
|
|
2
|
-
from .internvl import InternVL, InternVL_LMDeploy
|
|
3
|
-
|
|
4
|
-
from transformers import AutoConfig
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
__ALL_MODELS__ = {
|
|
8
|
-
'Pix2Struct': Pix2Struct,
|
|
9
|
-
'Pix2StructTensorRT': Pix2StructTensorRT,
|
|
10
|
-
'InternVL': InternVL,
|
|
11
|
-
'InternVL_LMDeploy': InternVL_LMDeploy,
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def get_model_name(model_path):
|
|
16
|
-
model_config = AutoConfig.from_pretrained(
|
|
17
|
-
model_path,
|
|
18
|
-
trust_remote_code=True,
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
if 'Pix2Struct' in model_config.architectures[0]:
|
|
22
|
-
model_name = 'Pix2Struct'
|
|
23
|
-
elif 'InternVL' in model_config.architectures[0]:
|
|
24
|
-
model_name = 'InternVL'
|
|
25
|
-
else:
|
|
26
|
-
raise ValueError(f"Unsupported model type: {model_config.architectures[0]}")
|
|
27
|
-
|
|
28
|
-
return model_name
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def build_model(
|
|
32
|
-
model_ckpt='U4R/StructTable-InternVL2-1B',
|
|
33
|
-
cache_dir=None,
|
|
34
|
-
local_files_only=None,
|
|
35
|
-
**kwargs,
|
|
36
|
-
):
|
|
37
|
-
model_name = get_model_name(model_ckpt)
|
|
38
|
-
if model_name == 'InternVL' and kwargs.get('lmdeploy', False):
|
|
39
|
-
model_name = 'InternVL_LMDeploy'
|
|
40
|
-
elif model_name == 'Pix2Struct' and kwargs.get('tensorrt_path', None):
|
|
41
|
-
model_name = 'Pix2StructTensorRT'
|
|
42
|
-
|
|
43
|
-
model = __ALL_MODELS__[model_name](
|
|
44
|
-
model_ckpt,
|
|
45
|
-
cache_dir=cache_dir,
|
|
46
|
-
local_files_only=local_files_only,
|
|
47
|
-
**kwargs
|
|
48
|
-
)
|
|
49
|
-
return model
|