doc-page-extractor 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of doc-page-extractor might be problematic.
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/PKG-INFO +33 -30
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/README.md +3 -7
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/clipper.py +3 -3
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/extractor.py +3 -3
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/layout_order.py +3 -3
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/layoutreader.py +1 -1
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/model.py +39 -20
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/ocr.py +8 -5
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/ocr_corrector.py +3 -3
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/overlap.py +3 -3
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/plot.py +6 -4
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/raw_optimizer.py +1 -1
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/table.py +1 -1
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/types.py +2 -2
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/utils.py +1 -1
- doc_page_extractor-0.2.4/pyproject.toml +59 -0
- doc_page_extractor-0.2.4/scripts/prebuild.py +8 -0
- doc_page_extractor-0.2.2/doc_page_extractor.egg-info/PKG-INFO +0 -85
- doc_page_extractor-0.2.2/doc_page_extractor.egg-info/SOURCES.txt +0 -48
- doc_page_extractor-0.2.2/doc_page_extractor.egg-info/dependency_links.txt +0 -1
- doc_page_extractor-0.2.2/doc_page_extractor.egg-info/requires.txt +0 -10
- doc_page_extractor-0.2.2/doc_page_extractor.egg-info/top_level.txt +0 -2
- doc_page_extractor-0.2.2/setup.cfg +0 -4
- doc_page_extractor-0.2.2/setup.py +0 -28
- doc_page_extractor-0.2.2/tests/__init__.py +0 -0
- doc_page_extractor-0.2.2/tests/test_history_bus.py +0 -55
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/LICENSE +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/__init__.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/downloader.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/latex.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/__init__.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/imaug.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/operators.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_base.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_det.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/predict_system.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/onnxocr/utils.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/rectangle.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/rotation.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/__init__.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -0
- {doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -0
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/PKG-INFO

@@ -1,30 +1,37 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.3
 Name: doc-page-extractor
-Version: 0.2.2
-Summary: doc page extractor can identify text and format in images and return structured data.
-Home-page: https://github.com/Moskize91/doc-page-extractor
+Version: 0.2.4
+Summary:
+License: AGPL-3.0
 Author: Tao Zeyu
 Author-email: i@taozeyu.com
+Maintainer: Tao Zeyu
+Maintainer-email: i@taozeyu.com
+Requires-Python: >=3.10,<3.13
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU Affero General Public License v3
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Provides-Extra: cpu
+Provides-Extra: cuda
+Requires-Dist: accelerate (>=1.6.0,<2.0)
+Requires-Dist: doclayout_yolo (>=0.0.3)
+Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
+Requires-Dist: numpy (>=1.24.0,<2.0)
+Requires-Dist: onnxruntime (==1.21.0) ; extra == "cpu"
+Requires-Dist: onnxruntime-gpu (==1.21.0) ; extra == "cuda"
+Requires-Dist: opencv-python (>=4.10.0,<5.0)
+Requires-Dist: pillow (>=10.3,<11.0)
+Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
+Requires-Dist: pyclipper (>=1.2.0,<2.0)
+Requires-Dist: shapely (>=2.0.0,<3.0)
+Requires-Dist: transformers (>=4.42.4,<=4.47)
+Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
 Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: opencv-python<5.0,>=4.10.0
-Requires-Dist: pillow<11.0,>=10.3
-Requires-Dist: pyclipper<2.0,>=1.2.0
-Requires-Dist: numpy<2.0,>=1.24.0
-Requires-Dist: shapely<3.0,>=2.0.0
-Requires-Dist: transformers<=4.47,>=4.42.4
-Requires-Dist: doclayout_yolo>=0.0.3
-Requires-Dist: pix2tex<=0.2.0,>=0.1.4
-Requires-Dist: accelerate<2.0,>=1.6.0
-Requires-Dist: huggingface_hub<1.0,>=0.30.2
-Dynamic: author
-Dynamic: author-email
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license-file
-Dynamic: requires-dist
-Dynamic: summary
 
 # doc page extractor
 
@@ -37,21 +44,17 @@ doc page extractor can identify text and format in images and return structured
 ## Installation
 
 ```shell
-pip install doc-page-extractor
-```
-
-```shell
-pip install onnxruntime==1.21.0
+pip install doc-page-extractor[cpu]
 ```
 
 ## Using CUDA
 
 Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
 
-In addition, replace the command to install `onnxruntime` in the previous article with the following:
+The installation mentioned above uses the following command.
 
 ```shell
-pip install onnxruntime-gpu==1.21.0
+pip install doc-page-extractor[cuda]
 ```
 
 ## Example
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/README.md

@@ -9,21 +9,17 @@ doc page extractor can identify text and format in images and return structured
 ## Installation
 
 ```shell
-pip install doc-page-extractor
-```
-
-```shell
-pip install onnxruntime==1.21.0
+pip install doc-page-extractor[cpu]
 ```
 
 ## Using CUDA
 
 Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
 
-In addition, replace the command to install `onnxruntime` in the previous article with the following:
+The installation mentioned above uses the following command.
 
 ```shell
-pip install onnxruntime-gpu==1.21.0
+pip install doc-page-extractor[cuda]
 ```
 
 ## Example
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/clipper.py

@@ -13,12 +13,12 @@ def clip(
 wrapped_width: float = 0.0,
 wrapped_height: float = 0.0,
 ) -> Image:
-image: Image
+image: Image | None
 if extracted_result.adjusted_image is None:
 image = extracted_result.extracted_image
 else:
 image = extracted_result.adjusted_image
-
+assert image is not None, "Image must not be None"
 return clip_from_image(
 image, layout.rect,
 wrapped_width, wrapped_height,

@@ -91,7 +91,7 @@ def _size_and_wrapper(rect: Rectangle):
 
 return width, height, max_width, max_height
 
-def _to_pillow_matrix(matrix
+def _to_pillow_matrix(matrix):
 return (
 matrix[0][0], matrix[0][1], matrix[0][2],
 matrix[1][0], matrix[1][1], matrix[1][2],
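The clipper change above keeps the selected image typed as `Image | None` and then narrows it with an explicit assert before use. A minimal, self-contained sketch of the same pattern, with plain strings standing in for PIL images since the full clipper code is not shown here:

```python
from typing import Optional


def pick_image(adjusted: Optional[str], extracted: Optional[str]) -> str:
    # The variable stays Optional while the source is being chosen; the assert
    # then documents (and tells a type checker) that it must be set before use.
    image: Optional[str] = extracted if adjusted is None else adjusted
    assert image is not None, "Image must not be None"
    return image
```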
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/extractor.py

@@ -1,7 +1,7 @@
 import torch
 
 from os import PathLike
-from typing import Literal, Generator
+from typing import cast, Any, Literal, Generator
 from PIL.Image import Image
 from doclayout_yolo import YOLOv10
 
@@ -99,7 +99,7 @@ class DocExtractor:
 # about source parameter to see:
 # https://github.com/opendatalab/DocLayout-YOLO/blob/7c4be36bc61f11b67cf4a44ee47f3c41e9800a91/doclayout_yolo/data/build.py#L157-L175
 det_res = self._get_yolo().predict(
-source=source,
+source=cast(Any, source),
 imgsz=1024,
 conf=0.2,
 device=self._device # Device to use (e.g., "cuda" or "cpu")
@@ -180,7 +180,7 @@ class DocExtractor:
 
 def _find_matched_layout(self, fragment: OCRFragment, layouts: list[Layout]) -> Layout | None:
 fragment_area = fragment.rect.area
-primary_layouts: list[
+primary_layouts: list[tuple[Layout, float]] = []
 
 if fragment_area == 0.0:
 return None
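`cast(Any, source)` in the extractor hunk is a static-typing hint only; `typing.cast` returns its argument unchanged at runtime. A small sketch of the idea, using a hypothetical `predict` callable rather than the real YOLOv10 API:

```python
from typing import Any, cast


def call_predict(predict, source: object) -> Any:
    # cast() performs no conversion or checking at runtime; it only tells the
    # type checker to treat `source` as Any, so a third-party signature that
    # cannot describe this argument precisely stops raising errors.
    return predict(source=cast(Any, source))
```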
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/layout_order.py

@@ -88,7 +88,7 @@ class LayoutOrder:
 y_scale = layoutreader_size / float(height)
 
 for bbox in bbox_list:
-x0, y0, x1, y1 = self._squeeze(bbox
+x0, y0, x1, y1 = self._squeeze(bbox, width, height)
 x0 = round(x0 * x_scale)
 y0 = round(y0 * y_scale)
 x1 = round(x1 * x_scale)

@@ -223,8 +223,8 @@ class LayoutOrder:
 mid2 = sorted_numbers[n // 2]
 return float((mid1 + mid2) / 2)
 
-def _squeeze(self, bbox: _BBox, width: int, height: int) ->
-x0, y0, x1, y1 = bbox
+def _squeeze(self, bbox: _BBox, width: int, height: int) -> tuple[float, float, float, float]:
+x0, y0, x1, y1 = bbox.value
 x0 = self._squeeze_value(x0, width)
 x1 = self._squeeze_value(x1, width)
 y0 = self._squeeze_value(y0, height)
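The layout_order changes pass the page size into `_squeeze` so each box can be clamped before being rescaled into layoutreader's fixed coordinate space. A hypothetical standalone version of that clamp-then-scale step (the names and the idea of a configurable target size are assumptions, not the project's code):

```python
def squeeze(value: float, limit: int) -> float:
    # Clamp a coordinate into [0, limit].
    return max(0.0, min(float(limit), value))


def scale_bbox(
    bbox: tuple[float, float, float, float],
    width: int,
    height: int,
    layoutreader_size: int,
) -> tuple[int, int, int, int]:
    # Clamp to the page, then rescale into the layoutreader coordinate space,
    # mirroring the x_scale / y_scale arithmetic visible in the hunk.
    x0, y0, x1, y1 = bbox
    x_scale = layoutreader_size / float(width)
    y_scale = layoutreader_size / float(height)
    x0, x1 = squeeze(x0, width), squeeze(x1, width)
    y0, y1 = squeeze(y0, height), squeeze(y1, height)
    return (
        round(x0 * x_scale),
        round(y0 * y_scale),
        round(x1 * x_scale),
        round(y1 * y_scale),
    )
```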
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/layoutreader.py

@@ -64,7 +64,7 @@ class DataCollator:
 return ret
 
 
-def boxes2inputs(boxes: List[List[
+def boxes2inputs(boxes: List[List[float]]) -> Dict[str, torch.Tensor]:
 bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
 input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
 attention_mask = [1] + [1] * len(boxes) + [1]
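For context, the `boxes2inputs` signature fixed above builds LayoutLM-style inputs by wrapping the boxes with sentinel entries and pairing each box with a placeholder token. The sketch below reproduces that pattern with made-up token ids; the real ids and tensor handling belong to the layoutreader model, so treat this as an illustration only:

```python
from typing import Dict, List

import torch

# Placeholder token ids; the actual values come from the layoutreader tokenizer.
CLS_TOKEN_ID, UNK_TOKEN_ID, EOS_TOKEN_ID = 0, 3, 2


def boxes_to_inputs(boxes: List[List[int]]) -> Dict[str, torch.Tensor]:
    # Wrap the boxes with sentinel [0, 0, 0, 0] entries and give each real box
    # an UNK placeholder token, as the hunk shows.
    bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
    input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
    attention_mask = [1] * len(input_ids)
    return {
        "bbox": torch.tensor([bbox], dtype=torch.long),
        "input_ids": torch.tensor([input_ids], dtype=torch.long),
        "attention_mask": torch.tensor([attention_mask], dtype=torch.long),
    }
```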
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/model.py

@@ -1,26 +1,30 @@
 from os import PathLike
-from
+from time import sleep
+from typing import cast, runtime_checkable, Protocol
 from pathlib import Path
 from threading import Lock
 from huggingface_hub import hf_hub_download, snapshot_download, try_to_load_from_cache
 
 
+_RETRY_TIMES = 6
+_RETRY_SLEEP = 3.5
+
 @runtime_checkable
 class Model(Protocol):
 def get_onnx_ocr_path(self) -> Path:
-
+raise NotImplementedError()
 
 def get_yolo_path(self) -> Path:
-
+raise NotImplementedError()
 
 def get_layoutreader_path(self) -> Path:
-
+raise NotImplementedError()
 
 def get_struct_eqtable_path(self) -> Path:
-
+raise NotImplementedError()
 
 def get_latex_path(self) -> Path:
-
+raise NotImplementedError()
 
 class HuggingfaceModel(Model):
 def __init__(self, model_cache_dir: PathLike):

@@ -96,19 +100,34 @@ class HuggingfaceModel(Model):
 model_path = model_path.parent
 
 else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# https://github.com/huggingface/huggingface_hub/issues/1542#issuecomment-1630465844
+latest_error: ConnectionError | None = None
+for i in range(_RETRY_TIMES + 1):
+if latest_error is not None:
+print(f"Retrying to download {repo_id} model, attempt {i + 1}/{_RETRY_TIMES}...")
+sleep(_RETRY_SLEEP)
+try:
+if is_snapshot:
+model_path = snapshot_download(
+cache_dir=self._model_cache_dir,
+repo_id=repo_id,
+repo_type=repo_type,
+resume_download=True,
+)
+else:
+model_path = hf_hub_download(
+cache_dir=self._model_cache_dir,
+repo_id=repo_id,
+repo_type=repo_type,
+filename=filename,
+resume_download=True,
+)
+latest_error = None
+except ConnectionError as err:
+latest_error = err
+
+if latest_error is not None:
+raise latest_error
+model_path = Path(cast(PathLike, model_path))
 
 return model_path
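The new model.py code retries Hugging Face downloads a fixed number of times, sleeping between attempts, and re-raises the last ConnectionError if every attempt fails. A generic version of that retry loop, deliberately decoupled from `snapshot_download` / `hf_hub_download` so it reads as a sketch of the pattern rather than the project's exact implementation:

```python
import time
from typing import Callable, TypeVar

T = TypeVar("T")


def download_with_retry(download: Callable[[], T], retries: int = 6, delay: float = 3.5) -> T:
    # Call `download` until it succeeds, sleeping between attempts; re-raise the
    # last ConnectionError if all attempts fail (mirroring _RETRY_TIMES and
    # _RETRY_SLEEP above).
    last_error: ConnectionError | None = None
    for _ in range(retries + 1):
        if last_error is not None:
            time.sleep(delay)
        try:
            return download()
        except ConnectionError as err:
            last_error = err
    assert last_error is not None
    raise last_error
```

A caller would pass a zero-argument closure around either `snapshot_download` or `hf_hub_download`, which is essentially what the hunk inlines.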
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/ocr.py

@@ -2,7 +2,7 @@ import numpy as np
 import cv2
 import os
 
-from typing import Literal, Generator
+from typing import cast, Any, Iterable, Literal, Generator
 from dataclasses import dataclass
 from .onnxocr import TextSystem
 from .types import OCRFragment

@@ -80,7 +80,10 @@ class OCR:
 image = self._preprocess_image(image)
 dt_boxes, rec_res = text_system(image)
 
-for box, res in zip(
+for box, res in zip(
+cast(Iterable[Any], dt_boxes),
+cast(Iterable[Any], rec_res),
+):
 yield box.tolist(), res
 
 def _get_text_system(self) -> TextSystem:

@@ -123,8 +126,8 @@ class OCR:
 model_paths.append(str(model_dir / file_name))
 return model_paths
 
-def _preprocess_image(self,
-image = self._alpha_to_color(
+def _preprocess_image(self, np_image: np.ndarray) -> np.ndarray:
+image = self._alpha_to_color(np_image, (255, 255, 255))
 # image = cv2.bitwise_not(image) # inv
 # image = self._binarize_img(image) # bin
 image = cv2.normalize(

@@ -148,7 +151,7 @@ class OCR:
 image = gpu_frame.download()
 elif cv2.ocl.haveOpenCL():
 cv2.ocl.setUseOpenCL(True)
-gpu_frame = cv2.UMat(image)
+gpu_frame = cv2.UMat(cast(Any, image))
 image = cv2.fastNlMeansDenoisingColored(
 src=gpu_frame,
 dst=None,
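The `_preprocess_image` fix above starts by flattening transparency onto white via `_alpha_to_color(np_image, (255, 255, 255))`. That helper is not shown in the diff; the following is a hypothetical re-implementation of the usual alpha-compositing step, for illustration only:

```python
import numpy as np


def alpha_to_color(image: np.ndarray, color=(255, 255, 255)) -> np.ndarray:
    # Composite an RGBA image onto a solid background so downstream OCR never
    # sees transparent pixels; 3-channel images are returned unchanged.
    if image.ndim == 3 and image.shape[2] == 4:
        alpha = image[:, :, 3:4].astype(np.float32) / 255.0
        rgb = image[:, :, :3].astype(np.float32)
        background = np.ones_like(rgb) * np.array(color, dtype=np.float32)
        image = (rgb * alpha + background * (1.0 - alpha)).astype(np.uint8)
    return image
```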
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/ocr_corrector.py

@@ -1,6 +1,6 @@
 import numpy as np
 
-from typing import Iterable
+from typing import cast, Iterable
 from shapely.geometry import Polygon
 from PIL.Image import new, Image, Resampling
 from .types import Layout, OCRFragment

@@ -90,13 +90,13 @@ def _match_fragments(
 ) -> tuple[list[tuple[OCRFragment, OCRFragment]], list[OCRFragment]]:
 
 zone_polygon = Polygon(zone_rect)
-fragments2
+fragments2 = list(fragments2)
 matched_fragments: list[tuple[OCRFragment, OCRFragment]] = []
 not_matched_fragments: list[OCRFragment] = []
 
 for fragment1 in fragments1:
 polygon1 = Polygon(fragment1.rect)
-polygon1 = zone_polygon.intersection(polygon1)
+polygon1 = cast(Polygon, zone_polygon.intersection(polygon1))
 if polygon1.is_empty:
 continue
 
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/overlap.py

@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import cast, Generator
 from shapely.geometry import Polygon
 from .types import Layout, OCRFragment
 from .rectangle import Rectangle

@@ -92,7 +92,7 @@ def merge_fragments_as_line(origin_fragments: list[OCRFragment]) -> list[OCRFrag
 continue
 
 fragments.append(OCRFragment(
-order=min_order,
+order=round(min_order),
 text=" ".join(texts),
 rank=text_rate_weights / proto_texts_len,
 rect=Rectangle(

@@ -141,7 +141,7 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
 # they are very sensitive to changes in height because they are very thin and long.
 # In order to make it equally sensitive to length and width, the ratio of area is not used.
 def overlap_rate(polygon1: Polygon, polygon2: Polygon) -> float:
-intersection
+intersection = cast(Polygon, polygon1.intersection(polygon2))
 if intersection.is_empty:
 return 0.0
 else:
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/plot.py

@@ -1,4 +1,4 @@
-from typing import Iterable
+from typing import cast, Iterable
 from PIL import ImageDraw
 from PIL.ImageFont import load_default, FreeTypeFont
 from PIL.Image import Image

@@ -9,8 +9,8 @@ _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
 _Color = tuple[int, int, int]
 
 def plot(image: Image, layouts: Iterable[Layout]) -> None:
-layout_font = load_default(size=35)
-fragment_font = load_default(size=25)
+layout_font = cast(FreeTypeFont, load_default(size=35))
+fragment_font = cast(FreeTypeFont, load_default(size=25))
 draw = ImageDraw.Draw(image, mode="RGBA")
 
 def _draw_number(position: Point, number: int, font: FreeTypeFont, bold: bool, color: _Color) -> None:

@@ -88,4 +88,6 @@ def _layout_color(layout: Layout) -> _Color:
 elif cls == LayoutClass.ISOLATE_FORMULA:
 return (0xFA, 0x38, 0x27) # Red
 elif cls == LayoutClass.FORMULA_CAPTION:
-return (0xFF, 0x9D, 0x24) # Orange
+return (0xFF, 0x9D, 0x24) # Orange
+else:
+return (0x00, 0x00, 0x00)
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/raw_optimizer.py

@@ -3,7 +3,7 @@ import numpy as np
 from dataclasses import dataclass
 from PIL.Image import Image
 from math import pi
-from .types import
+from .types import Layout, OCRFragment
 from .rotation import calculate_rotation, RotationAdjuster
 from .rectangle import Rectangle
 
{doc_page_extractor-0.2.2 → doc_page_extractor-0.2.4}/doc_page_extractor/types.py

@@ -50,12 +50,12 @@ class PlainLayout(BaseLayout):
 @dataclass
 class TableLayout(BaseLayout):
 parsed: tuple[str, TableLayoutParsedFormat] | None
-cls: LayoutClass.TABLE
+cls: Literal[LayoutClass.TABLE]
 
 @dataclass
 class FormulaLayout(BaseLayout):
 latex: str | None
-cls: LayoutClass.ISOLATE_FORMULA
+cls: Literal[LayoutClass.ISOLATE_FORMULA]
 
 Layout = PlainLayout | TableLayout | FormulaLayout
 
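Typing `cls` as `Literal[LayoutClass.TABLE]` / `Literal[LayoutClass.ISOLATE_FORMULA]` turns the `Layout` union into a tagged union, so a type checker can narrow on the `cls` field. A minimal sketch of the idea with invented names (not the project's actual classes):

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import Literal, Union


class Kind(Enum):
    TABLE = auto()
    FORMULA = auto()


@dataclass
class Table:
    cls: Literal[Kind.TABLE]
    parsed: Union[str, None]


@dataclass
class Formula:
    cls: Literal[Kind.FORMULA]
    latex: Union[str, None]


Item = Union[Table, Formula]


def describe(item: Item) -> str:
    # Because `cls` carries a Literal type, checkers such as pyright can narrow
    # the union here and know which optional payload field is available.
    if item.cls is Kind.TABLE:
        return f"table parsed: {item.parsed is not None}"
    return f"formula latex: {item.latex is not None}"
```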
doc_page_extractor-0.2.4/pyproject.toml

@@ -0,0 +1,59 @@
+[project]
+name = "doc-page-extractor"
+version = "0.2.4"
+description = ""
+authors = [
+{name = "Tao Zeyu",email = "i@taozeyu.com"}
+]
+maintainers = [
+{name = "Tao Zeyu", email = "i@taozeyu.com"}
+]
+license = {text = "AGPL-3.0"}
+readme = "README.md"
+requires-python = ">=3.10,<3.13"
+
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+license = "AGPL-3.0"
+readme = "README.md"
+repository = "https://github.com/moskize91/doc-page-extractor"
+packages = [
+{include = "doc_page_extractor" }
+]
+include = ["doc_page_extractor/struct_eqtable/**/*.py"]
+classifiers=[
+"Development Status :: 2 - Pre-Alpha",
+"Intended Audience :: Developers",
+"License :: OSI Approved :: GNU Affero General Public License v3",
+"Programming Language :: Python",
+"Programming Language :: Python :: 3.10",
+]
+
+[tool.poetry.dependencies]
+opencv-python = ">=4.10.0,<5.0"
+pillow = ">=10.3,<11.0"
+pyclipper = ">=1.2.0,<2.0"
+numpy = ">=1.24.0,<2.0"
+shapely = ">=2.0.0,<3.0"
+transformers = ">=4.42.4,<=4.47"
+doclayout_yolo = ">=0.0.3"
+pix2tex = ">=0.1.4,<=0.2.0"
+accelerate = ">=1.6.0,<2.0"
+huggingface_hub = ">=0.33.0,<1.0"
+
+onnxruntime = { version = "1.21.0", optional = true }
+onnxruntime-gpu = { version = "1.21.0", optional = true }
+
+[tool.poetry.extras]
+cpu = ["onnxruntime"]
+cuda = ["onnxruntime-gpu"]
+
+[tool.poetry.group.dev.dependencies]
+pylint = "^3.3.7"
+
+[tool.poetry.build]
+generate-setup-file = false
+script = "scripts/prebuild.py"
doc_page_extractor-0.2.2/doc_page_extractor.egg-info/PKG-INFO

@@ -1,85 +0,0 @@
-Metadata-Version: 2.4
-Name: doc-page-extractor
-Version: 0.2.2
-Summary: doc page extractor can identify text and format in images and return structured data.
-Home-page: https://github.com/Moskize91/doc-page-extractor
-Author: Tao Zeyu
-Author-email: i@taozeyu.com
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: opencv-python<5.0,>=4.10.0
-Requires-Dist: pillow<11.0,>=10.3
-Requires-Dist: pyclipper<2.0,>=1.2.0
-Requires-Dist: numpy<2.0,>=1.24.0
-Requires-Dist: shapely<3.0,>=2.0.0
-Requires-Dist: transformers<=4.47,>=4.42.4
-Requires-Dist: doclayout_yolo>=0.0.3
-Requires-Dist: pix2tex<=0.2.0,>=0.1.4
-Requires-Dist: accelerate<2.0,>=1.6.0
-Requires-Dist: huggingface_hub<1.0,>=0.30.2
-Dynamic: author
-Dynamic: author-email
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license-file
-Dynamic: requires-dist
-Dynamic: summary
-
-# doc page extractor
-
-English | [中文](./README_zh-CN.md)
-
-## Introduction
-
-doc page extractor can identify text and format in images and return structured data.
-
-## Installation
-
-```shell
-pip install doc-page-extractor
-```
-
-```shell
-pip install onnxruntime==1.21.0
-```
-
-## Using CUDA
-
-Please refer to the introduction of [PyTorch](https://pytorch.org/get-started/locally/) and select the appropriate command to install according to your operating system.
-
-In addition, replace the command to install `onnxruntime` in the previous article with the following:
-
-```shell
-pip install onnxruntime-gpu==1.21.0
-```
-
-## Example
-
-```python
-from PIL import Image
-from doc_page_extractor import DocExtractor
-
-extractor = DocExtractor(
-model_dir_path=model_path, # Folder address where AI model is downloaded and installed
-device="cpu", # If you want to use CUDA, please change to device="cuda".
-)
-with Image.open("/path/to/your/image.png") as image:
-result = extractor.extract(
-image=image,
-lang="ch", # Language of image text
-)
-for layout in result.layouts:
-for fragment in layout.fragments:
-print(fragment.rect, fragment.text)
-```
-
-## Acknowledgements
-
-The code of `doc_page_extractor/onnxocr` in this repo comes from [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR).
-
-- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
-- [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR)
-- [layoutreader](https://github.com/ppaanngggg/layoutreader)
-- [StructEqTable](https://github.com/Alpha-Innovator/StructEqTable-Deploy)
-- [LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR)
doc_page_extractor-0.2.2/doc_page_extractor.egg-info/SOURCES.txt

@@ -1,48 +0,0 @@
-LICENSE
-README.md
-setup.py
-doc_page_extractor/__init__.py
-doc_page_extractor/clipper.py
-doc_page_extractor/downloader.py
-doc_page_extractor/extractor.py
-doc_page_extractor/latex.py
-doc_page_extractor/layout_order.py
-doc_page_extractor/layoutreader.py
-doc_page_extractor/model.py
-doc_page_extractor/ocr.py
-doc_page_extractor/ocr_corrector.py
-doc_page_extractor/overlap.py
-doc_page_extractor/plot.py
-doc_page_extractor/raw_optimizer.py
-doc_page_extractor/rectangle.py
-doc_page_extractor/rotation.py
-doc_page_extractor/table.py
-doc_page_extractor/types.py
-doc_page_extractor/utils.py
-doc_page_extractor.egg-info/PKG-INFO
-doc_page_extractor.egg-info/SOURCES.txt
-doc_page_extractor.egg-info/dependency_links.txt
-doc_page_extractor.egg-info/requires.txt
-doc_page_extractor.egg-info/top_level.txt
-doc_page_extractor/onnxocr/__init__.py
-doc_page_extractor/onnxocr/cls_postprocess.py
-doc_page_extractor/onnxocr/db_postprocess.py
-doc_page_extractor/onnxocr/imaug.py
-doc_page_extractor/onnxocr/operators.py
-doc_page_extractor/onnxocr/predict_base.py
-doc_page_extractor/onnxocr/predict_cls.py
-doc_page_extractor/onnxocr/predict_det.py
-doc_page_extractor/onnxocr/predict_rec.py
-doc_page_extractor/onnxocr/predict_system.py
-doc_page_extractor/onnxocr/rec_postprocess.py
-doc_page_extractor/onnxocr/utils.py
-doc_page_extractor/struct_eqtable/__init__.py
-doc_page_extractor/struct_eqtable/internvl/__init__.py
-doc_page_extractor/struct_eqtable/internvl/conversation.py
-doc_page_extractor/struct_eqtable/internvl/internvl.py
-doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py
-doc_page_extractor/struct_eqtable/pix2s/__init__.py
-doc_page_extractor/struct_eqtable/pix2s/pix2s.py
-doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py
-tests/__init__.py
-tests/test_history_bus.py
doc_page_extractor-0.2.2/doc_page_extractor.egg-info/dependency_links.txt

@@ -1 +0,0 @@
-
doc_page_extractor-0.2.2/setup.py

@@ -1,28 +0,0 @@
-from setuptools import setup, find_packages
-
-if "doc_page_extractor.struct_eqtable" not in find_packages():
-raise RuntimeError("struct_eqtable not found. Please download struct_eqtable first.")
-
-setup(
-name="doc-page-extractor",
-version="0.2.2",
-author="Tao Zeyu",
-author_email="i@taozeyu.com",
-url="https://github.com/Moskize91/doc-page-extractor",
-description="doc page extractor can identify text and format in images and return structured data.",
-packages=find_packages(),
-long_description=open("./README.md", encoding="utf8").read(),
-long_description_content_type="text/markdown",
-install_requires=[
-"opencv-python>=4.10.0,<5.0",
-"pillow>=10.3,<11.0",
-"pyclipper>=1.2.0,<2.0",
-"numpy>=1.24.0,<2.0",
-"shapely>=2.0.0,<3.0",
-"transformers>=4.42.4,<=4.47",
-"doclayout_yolo>=0.0.3",
-"pix2tex>=0.1.4,<=0.2.0",
-"accelerate>=1.6.0,<2.0",
-"huggingface_hub>=0.30.2,<1.0",
-],
-)
doc_page_extractor-0.2.2/tests/__init__.py

File without changes

doc_page_extractor-0.2.2/tests/test_history_bus.py

@@ -1,55 +0,0 @@
-import os
-import unittest
-
-from PIL import Image
-from doc_page_extractor import DocExtractor, Layout, LayoutClass
-
-
-class TestGroup(unittest.TestCase):
-def test_history_bugs(self):
-model_path = os.path.join(self._project_path(), "model")
-image_path = os.path.join(self._project_path(), "tests", "images", "figure.png")
-os.makedirs(model_path, exist_ok=True)
-
-extractor = DocExtractor(model_path, "cpu")
-layouts: list[tuple[LayoutClass, list[str]]]
-
-with Image.open(image_path) as image:
-result = extractor.extract(image, extract_formula=False)
-layouts = [self._format_Layout(layout) for layout in result.layouts]
-
-self.assertEqual(layouts, [
-(LayoutClass.PLAIN_TEXT, [
-"口的11.8%①。这既是江南农业落后的反映,又是它的原因。当战国以",
-"后黄河流域因铁器牛耕的普及获得基本的开发,农区联结成一大片的",
-"时候,南方农业开发始终没有突破星点状或斑块状分布的格局。由于",
-"地旷人稀,耕作相当粗放,许多水田采取火耕水瓣的方式,旱田则多",
-"行刀耕火种②。司马迁在《史记·货殖列传》中说:“总之,楚越之",
-"地,地厂人希,饭稻囊鱼,或火耕而水瓣,果隋(蕨)赢(螺)蛤,",
-"不待贾而足,地势饶食,无饥谨之患,以故皆偷生,无积聚而多",
-"贫。”这种概括虽然未免太突出了南方经济的落后面,有一定片面性,",
-"但大体还是反映了实际情形的。战国秦汉时期,南方与黄河流域农业",
-"的差距显然拉大了。",
-]),
-(LayoutClass.FIGURE, []),
-(LayoutClass.FIGURE_CAPTION, [
-"西晋陶水田犁耙模型(广东连县出土)"
-]),
-(LayoutClass.FIGURE, []),
-(LayoutClass.FIGURE_CAPTION, [
-"南朝陶耙田模型 (广西苍梧倒水出土)"
-]),
-(LayoutClass.PLAIN_TEXT, [
-"①据赵文林、谢淑君:《中国人口史》(人民出版社1988年)有关资料统计。",
-"②《盐铁论·通有》:“荆扬…………伐木而树谷,焚莱而播粟,火耕而水。”"
-]),
-(LayoutClass.ABANDON, [
-"136"
-]),
-])
-
-def _format_Layout(self, layout: Layout) -> tuple[LayoutClass, list[str]]:
-return layout.cls, [f.text.strip() for f in layout.fragments]
-
-def _project_path(self) -> str:
-return os.path.abspath(os.path.join(__file__, "..", ".."))
All remaining files (LICENSE, doc_page_extractor/__init__.py, downloader.py, latex.py, rectangle.py, rotation.py, and the onnxocr and struct_eqtable modules) were renamed from doc_page_extractor-0.2.2 to doc_page_extractor-0.2.4 without changes.