doc-page-extractor 0.1.2__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/PKG-INFO +2 -2
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/extractor.py +19 -15
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/latex.py +4 -2
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/layout_order.py +2 -1
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/ocr.py +34 -8
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_base.py +9 -4
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_cls.py +23 -3
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_det.py +24 -5
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_rec.py +30 -7
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/types.py +7 -4
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor.egg-info/PKG-INFO +2 -2
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor.egg-info/requires.txt +1 -1
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/setup.py +2 -2
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/tests/test_history_bus.py +1 -1
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/LICENSE +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/README.md +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/__init__.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/clipper.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/downloader.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/layoutreader.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/models.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/ocr_corrector.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/__init__.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/imaug.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/operators.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_system.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/utils.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/overlap.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/plot.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/raw_optimizer.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/rectangle.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/rotation.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/__init__.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/table.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/utils.py +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor.egg-info/SOURCES.txt +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor.egg-info/dependency_links.txt +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor.egg-info/top_level.txt +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/setup.cfg +0 -0
- {doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/tests/__init__.py +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: doc-page-extractor
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: doc page extractor can identify text and format in images and return structured data.
|
|
5
5
|
Home-page: https://github.com/Moskize91/doc-page-extractor
|
|
6
6
|
Author: Tao Zeyu
|
|
7
7
|
Author-email: i@taozeyu.com
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
|
-
Requires-Dist: opencv-python<5.0,>=4.
|
|
10
|
+
Requires-Dist: opencv-python<5.0,>=4.10.0
|
|
11
11
|
Requires-Dist: pillow<11.0,>=10.3
|
|
12
12
|
Requires-Dist: pyclipper<2.0,>=1.2.0
|
|
13
13
|
Requires-Dist: numpy<2.0,>=1.24.0
|
|
@@ -18,12 +18,12 @@ from .types import (
|
|
|
18
18
|
ExtractedResult,
|
|
19
19
|
ModelsDownloader,
|
|
20
20
|
OCRFragment,
|
|
21
|
-
TableLayoutParsedFormat,
|
|
22
21
|
Layout,
|
|
23
22
|
LayoutClass,
|
|
24
23
|
PlainLayout,
|
|
25
24
|
TableLayout,
|
|
26
25
|
FormulaLayout,
|
|
26
|
+
TableLayoutParsedFormat
|
|
27
27
|
)
|
|
28
28
|
|
|
29
29
|
|
|
@@ -32,9 +32,6 @@ class DocExtractor:
|
|
|
32
32
|
self,
|
|
33
33
|
model_cache_dir: str | None = None,
|
|
34
34
|
device: Literal["cpu", "cuda"] = "cpu",
|
|
35
|
-
ocr_for_each_layouts: bool = True,
|
|
36
|
-
extract_formula: bool = True,
|
|
37
|
-
extract_table_format: TableLayoutParsedFormat | None = None,
|
|
38
35
|
models_downloader: ModelsDownloader | None = None,
|
|
39
36
|
logger: Logger | None = None,
|
|
40
37
|
):
|
|
@@ -42,9 +39,6 @@ class DocExtractor:
|
|
|
42
39
|
self._models_downloader = models_downloader or HuggingfaceModelsDownloader(self._logger, model_cache_dir)
|
|
43
40
|
|
|
44
41
|
self._device: Literal["cpu", "cuda"] = device
|
|
45
|
-
self._ocr_for_each_layouts: bool = ocr_for_each_layouts
|
|
46
|
-
self._extract_formula: bool = extract_formula
|
|
47
|
-
self._extract_table_format: TableLayoutParsedFormat | None = extract_table_format
|
|
48
42
|
self._yolo: YOLOv10 | None = None
|
|
49
43
|
self._ocr: OCR = OCR(
|
|
50
44
|
device=device,
|
|
@@ -56,6 +50,7 @@ class DocExtractor:
|
|
|
56
50
|
)
|
|
57
51
|
self._latex: LaTeX = LaTeX(
|
|
58
52
|
get_model_dir=self._models_downloader.latex,
|
|
53
|
+
device=device,
|
|
59
54
|
)
|
|
60
55
|
self._layout_order: LayoutOrder = LayoutOrder(
|
|
61
56
|
get_model_dir=self._models_downloader.layoutreader,
|
|
@@ -64,7 +59,10 @@ class DocExtractor:
|
|
|
64
59
|
def extract(
|
|
65
60
|
self,
|
|
66
61
|
image: Image,
|
|
67
|
-
|
|
62
|
+
extract_formula: bool,
|
|
63
|
+
extract_table_format: TableLayoutParsedFormat | None = None,
|
|
64
|
+
ocr_for_each_layouts: bool = False,
|
|
65
|
+
adjust_points: bool = False
|
|
68
66
|
) -> ExtractedResult:
|
|
69
67
|
|
|
70
68
|
raw_optimizer = RawOptimizer(image, adjust_points)
|
|
@@ -74,13 +72,13 @@ class DocExtractor:
|
|
|
74
72
|
layouts = self._layouts_matched_by_fragments(fragments, layouts)
|
|
75
73
|
layouts = remove_overlap_layouts(layouts)
|
|
76
74
|
|
|
77
|
-
if self._ocr_for_each_layouts:
|
|
75
|
+
if ocr_for_each_layouts:
|
|
78
76
|
self._correct_fragments_by_ocr_layouts(raw_optimizer.image, layouts)
|
|
79
77
|
|
|
80
78
|
layouts = self._layout_order.sort(layouts, raw_optimizer.image.size)
|
|
81
79
|
layouts = [layout for layout in layouts if self._should_keep_layout(layout)]
|
|
82
80
|
|
|
83
|
-
self._parse_table_and_formula_layouts(layouts, raw_optimizer)
|
|
81
|
+
self._parse_table_and_formula_layouts(layouts, raw_optimizer, extract_formula=extract_formula, extract_table_format=extract_table_format)
|
|
84
82
|
|
|
85
83
|
for layout in layouts:
|
|
86
84
|
layout.fragments = merge_fragments_as_line(layout.fragments)
|
|
@@ -142,16 +140,22 @@ class DocExtractor:
|
|
|
142
140
|
for layout in layouts:
|
|
143
141
|
correct_fragments(self._ocr, source, layout)
|
|
144
142
|
|
|
145
|
-
def _parse_table_and_formula_layouts(
|
|
143
|
+
def _parse_table_and_formula_layouts(
|
|
144
|
+
self,
|
|
145
|
+
layouts: list[Layout],
|
|
146
|
+
raw_optimizer: RawOptimizer,
|
|
147
|
+
extract_formula: bool,
|
|
148
|
+
extract_table_format: TableLayoutParsedFormat | None,
|
|
149
|
+
):
|
|
146
150
|
for layout in layouts:
|
|
147
|
-
if isinstance(layout, FormulaLayout) and self._extract_formula:
|
|
151
|
+
if isinstance(layout, FormulaLayout) and extract_formula:
|
|
148
152
|
image = clip_from_image(raw_optimizer.image, layout.rect)
|
|
149
153
|
layout.latex = self._latex.extract(image)
|
|
150
|
-
elif isinstance(layout, TableLayout) and self._extract_table_format is not None:
|
|
154
|
+
elif isinstance(layout, TableLayout) and extract_table_format is not None:
|
|
151
155
|
image = clip_from_image(raw_optimizer.image, layout.rect)
|
|
152
|
-
parsed = self._table.predict(image, self._extract_table_format)
|
|
156
|
+
parsed = self._table.predict(image, extract_table_format)
|
|
153
157
|
if parsed is not None:
|
|
154
|
-
layout.parsed = (parsed, self._extract_table_format)
|
|
158
|
+
layout.parsed = (parsed, extract_table_format)
|
|
155
159
|
|
|
156
160
|
def _split_layouts_by_group(self, layouts: list[Layout]):
|
|
157
161
|
texts_layouts: list[Layout] = []
|
|
@@ -4,13 +4,15 @@ import torch
|
|
|
4
4
|
from munch import Munch
|
|
5
5
|
from pix2tex.cli import LatexOCR
|
|
6
6
|
from PIL.Image import Image
|
|
7
|
+
from typing import Literal
|
|
7
8
|
from .utils import expand_image
|
|
8
9
|
from .types import GetModelDir
|
|
9
10
|
|
|
10
11
|
class LaTeX:
|
|
11
|
-
def __init__(self, get_model_dir: GetModelDir):
|
|
12
|
+
def __init__(self, device: Literal["cpu", "cuda"],get_model_dir: GetModelDir):
|
|
12
13
|
self._model_path: str = get_model_dir()
|
|
13
14
|
self._model: LatexOCR | None = None
|
|
15
|
+
self._device: Literal["cpu", "cuda"] = device
|
|
14
16
|
|
|
15
17
|
def extract(self, image: Image) -> str | None:
|
|
16
18
|
image = expand_image(image, 0.1) # 添加边缘提高识别准确率
|
|
@@ -23,7 +25,7 @@ class LaTeX:
|
|
|
23
25
|
self._model = LatexOCR(Munch({
|
|
24
26
|
"config": os.path.join("settings", "config.yaml"),
|
|
25
27
|
"checkpoint": os.path.join(self._model_path, "checkpoints", "weights.pth"),
|
|
26
|
-
"no_cuda":
|
|
28
|
+
"no_cuda": self._device == "cpu",
|
|
27
29
|
"no_resize": False,
|
|
28
30
|
}))
|
|
29
31
|
return self._model
|
|
@@ -20,13 +20,14 @@ class LayoutOrder:
|
|
|
20
20
|
def __init__(self, get_model_dir: GetModelDir):
|
|
21
21
|
self._model_path: str = get_model_dir()
|
|
22
22
|
self._model: LayoutLMv3ForTokenClassification | None = None
|
|
23
|
+
self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
23
24
|
|
|
24
25
|
def _get_model(self) -> LayoutLMv3ForTokenClassification:
|
|
25
26
|
if self._model is None:
|
|
26
27
|
self._model = LayoutLMv3ForTokenClassification.from_pretrained(
|
|
27
28
|
pretrained_model_name_or_path=self._model_path,
|
|
28
29
|
local_files_only=True,
|
|
29
|
-
)
|
|
30
|
+
).to(device=self._device)
|
|
30
31
|
return self._model
|
|
31
32
|
|
|
32
33
|
def sort(self, layouts: list[Layout], size: tuple[int, int]) -> list[Layout]:
|
|
@@ -141,14 +141,40 @@ class OCR:
|
|
|
141
141
|
beta=255,
|
|
142
142
|
norm_type=cv2.NORM_MINMAX,
|
|
143
143
|
)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
144
|
+
if cv2.cuda.getCudaEnabledDeviceCount() > 0:
|
|
145
|
+
gpu_frame = cv2.cuda.GpuMat()
|
|
146
|
+
gpu_frame.upload(image)
|
|
147
|
+
image = cv2.cuda.fastNlMeansDenoisingColored(
|
|
148
|
+
src=gpu_frame,
|
|
149
|
+
dst=None,
|
|
150
|
+
h_luminance=10,
|
|
151
|
+
photo_render=10,
|
|
152
|
+
search_window=15,
|
|
153
|
+
block_size=7,
|
|
154
|
+
)
|
|
155
|
+
image = gpu_frame.download()
|
|
156
|
+
elif cv2.ocl.haveOpenCL():
|
|
157
|
+
cv2.ocl.setUseOpenCL(True)
|
|
158
|
+
gpu_frame = cv2.UMat(image)
|
|
159
|
+
image = cv2.fastNlMeansDenoisingColored(
|
|
160
|
+
src=gpu_frame,
|
|
161
|
+
dst=None,
|
|
162
|
+
h=10,
|
|
163
|
+
hColor=10,
|
|
164
|
+
templateWindowSize=7,
|
|
165
|
+
searchWindowSize=15,
|
|
166
|
+
)
|
|
167
|
+
image = image.get()
|
|
168
|
+
else:
|
|
169
|
+
image = cv2.fastNlMeansDenoisingColored(
|
|
170
|
+
src=image,
|
|
171
|
+
dst=None,
|
|
172
|
+
h=10,
|
|
173
|
+
hColor=10,
|
|
174
|
+
templateWindowSize=7,
|
|
175
|
+
searchWindowSize=15,
|
|
176
|
+
)
|
|
177
|
+
|
|
152
178
|
# image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # image to gray
|
|
153
179
|
return image
|
|
154
180
|
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_base.py
RENAMED
|
@@ -1,8 +1,13 @@
|
|
|
1
|
-
import onnxruntime
|
|
2
|
-
|
|
3
1
|
class PredictBase(object):
|
|
4
2
|
def __init__(self):
|
|
5
|
-
|
|
3
|
+
self._onnxruntime = None
|
|
4
|
+
|
|
5
|
+
@property
|
|
6
|
+
def onnxruntime(self):
|
|
7
|
+
if self._onnxruntime is None:
|
|
8
|
+
import onnxruntime
|
|
9
|
+
self._onnxruntime = onnxruntime
|
|
10
|
+
return self._onnxruntime
|
|
6
11
|
|
|
7
12
|
def get_onnx_session(self, model_dir, use_gpu):
|
|
8
13
|
# 使用gpu
|
|
@@ -11,7 +16,7 @@ class PredictBase(object):
|
|
|
11
16
|
else:
|
|
12
17
|
providers = providers = ['CPUExecutionProvider']
|
|
13
18
|
|
|
14
|
-
onnx_session = onnxruntime.InferenceSession(model_dir, None,providers=providers)
|
|
19
|
+
onnx_session = self.onnxruntime.InferenceSession(model_dir, None, providers=providers)
|
|
15
20
|
|
|
16
21
|
# print("providers:", onnxruntime.get_device())
|
|
17
22
|
return onnx_session
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_cls.py
RENAMED
|
@@ -9,15 +9,35 @@ from .predict_base import PredictBase
|
|
|
9
9
|
|
|
10
10
|
class TextClassifier(PredictBase):
|
|
11
11
|
def __init__(self, args):
|
|
12
|
+
super().__init__()
|
|
12
13
|
self.cls_image_shape = args.cls_image_shape
|
|
13
14
|
self.cls_batch_num = args.cls_batch_num
|
|
14
15
|
self.cls_thresh = args.cls_thresh
|
|
15
16
|
self.postprocess_op = ClsPostProcess(label_list=args.label_list)
|
|
17
|
+
self._args = args
|
|
16
18
|
|
|
17
19
|
# 初始化模型
|
|
18
|
-
self.
|
|
19
|
-
self.
|
|
20
|
-
self.
|
|
20
|
+
self._cls_onnx_session = None
|
|
21
|
+
self._cls_input_name = None
|
|
22
|
+
self._cls_output_name = None
|
|
23
|
+
|
|
24
|
+
@property
|
|
25
|
+
def cls_onnx_session(self):
|
|
26
|
+
if self._cls_onnx_session is None:
|
|
27
|
+
self._cls_onnx_session = self.get_onnx_session(self._args.cls_model_dir, self._args.use_gpu)
|
|
28
|
+
return self._cls_onnx_session
|
|
29
|
+
|
|
30
|
+
@property
|
|
31
|
+
def cls_input_name(self):
|
|
32
|
+
if self._cls_input_name is None:
|
|
33
|
+
self._cls_input_name = self.get_input_name(self.cls_onnx_session)
|
|
34
|
+
return self._cls_input_name
|
|
35
|
+
|
|
36
|
+
@property
|
|
37
|
+
def cls_output_name(self):
|
|
38
|
+
if self._cls_output_name is None:
|
|
39
|
+
self._cls_output_name = self.get_output_name(self.cls_onnx_session)
|
|
40
|
+
return self._cls_output_name
|
|
21
41
|
|
|
22
42
|
def resize_norm_img(self, img):
|
|
23
43
|
imgC, imgH, imgW = self.cls_image_shape
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_det.py
RENAMED
|
@@ -6,7 +6,8 @@ from .predict_base import PredictBase
|
|
|
6
6
|
|
|
7
7
|
class TextDetector(PredictBase):
|
|
8
8
|
def __init__(self, args):
|
|
9
|
-
|
|
9
|
+
super().__init__()
|
|
10
|
+
self._args = args
|
|
10
11
|
self.det_algorithm = args.det_algorithm
|
|
11
12
|
pre_process_list = [
|
|
12
13
|
{
|
|
@@ -43,9 +44,27 @@ class TextDetector(PredictBase):
|
|
|
43
44
|
self.postprocess_op = DBPostProcess(**postprocess_params)
|
|
44
45
|
|
|
45
46
|
# 初始化模型
|
|
46
|
-
self.
|
|
47
|
-
self.
|
|
48
|
-
self.
|
|
47
|
+
self._det_onnx_session = None
|
|
48
|
+
self._det_input_name = None
|
|
49
|
+
self._det_output_name = None
|
|
50
|
+
|
|
51
|
+
@property
|
|
52
|
+
def det_onnx_session(self):
|
|
53
|
+
if self._det_onnx_session is None:
|
|
54
|
+
self._det_onnx_session = self.get_onnx_session(self._args.det_model_dir, self._args.use_gpu)
|
|
55
|
+
return self._det_onnx_session
|
|
56
|
+
|
|
57
|
+
@property
|
|
58
|
+
def det_input_name(self):
|
|
59
|
+
if self._det_input_name is None:
|
|
60
|
+
self._det_input_name = self.get_input_name(self.det_onnx_session)
|
|
61
|
+
return self._det_input_name
|
|
62
|
+
|
|
63
|
+
@property
|
|
64
|
+
def det_output_name(self):
|
|
65
|
+
if self._det_output_name is None:
|
|
66
|
+
self._det_output_name = self.get_output_name(self.det_onnx_session)
|
|
67
|
+
return self._det_output_name
|
|
49
68
|
|
|
50
69
|
def order_points_clockwise(self, pts):
|
|
51
70
|
rect = np.zeros((4, 2), dtype="float32")
|
|
@@ -112,7 +131,7 @@ class TextDetector(PredictBase):
|
|
|
112
131
|
post_result = self.postprocess_op(preds, shape_list)
|
|
113
132
|
dt_boxes = post_result[0]["points"]
|
|
114
133
|
|
|
115
|
-
if self.
|
|
134
|
+
if self._args.det_box_type == "poly":
|
|
116
135
|
dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
|
|
117
136
|
else:
|
|
118
137
|
dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_rec.py
RENAMED
|
@@ -10,6 +10,8 @@ from .predict_base import PredictBase
|
|
|
10
10
|
|
|
11
11
|
class TextRecognizer(PredictBase):
|
|
12
12
|
def __init__(self, args):
|
|
13
|
+
super().__init__()
|
|
14
|
+
self._args = args
|
|
13
15
|
self.rec_image_shape = args.rec_image_shape
|
|
14
16
|
self.rec_batch_num = args.rec_batch_num
|
|
15
17
|
self.rec_algorithm = args.rec_algorithm
|
|
@@ -19,9 +21,29 @@ class TextRecognizer(PredictBase):
|
|
|
19
21
|
)
|
|
20
22
|
|
|
21
23
|
# 初始化模型
|
|
22
|
-
self.
|
|
23
|
-
self.
|
|
24
|
-
self.
|
|
24
|
+
self._rec_onnx_session = None
|
|
25
|
+
self._rec_input_name = None
|
|
26
|
+
self._rec_output_name = None
|
|
27
|
+
|
|
28
|
+
@property
|
|
29
|
+
def rec_onnx_session(self):
|
|
30
|
+
if self._rec_onnx_session is None:
|
|
31
|
+
self._rec_onnx_session = self.get_onnx_session(
|
|
32
|
+
self._args.rec_model_dir, self._args.use_gpu
|
|
33
|
+
)
|
|
34
|
+
return self._rec_onnx_session
|
|
35
|
+
|
|
36
|
+
@property
|
|
37
|
+
def rec_input_name(self):
|
|
38
|
+
if self._rec_input_name is None:
|
|
39
|
+
self._rec_input_name = self.get_input_name(self.rec_onnx_session)
|
|
40
|
+
return self._rec_input_name
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def rec_output_name(self):
|
|
44
|
+
if self._rec_output_name is None:
|
|
45
|
+
self._rec_output_name = self.get_output_name(self.rec_onnx_session)
|
|
46
|
+
return self._rec_output_name
|
|
25
47
|
|
|
26
48
|
def resize_norm_img(self, img, max_wh_ratio):
|
|
27
49
|
imgC, imgH, imgW = self.rec_image_shape
|
|
@@ -30,9 +52,9 @@ class TextRecognizer(PredictBase):
|
|
|
30
52
|
# return padding_im
|
|
31
53
|
image_pil = Image.fromarray(np.uint8(img))
|
|
32
54
|
if self.rec_algorithm == "ViTSTR":
|
|
33
|
-
img = image_pil.resize([imgW, imgH], Image.BICUBIC)
|
|
55
|
+
img = image_pil.resize([imgW, imgH], Image.Resampling.BICUBIC)
|
|
34
56
|
else:
|
|
35
|
-
img = image_pil.resize([imgW, imgH], Image.
|
|
57
|
+
img = image_pil.resize([imgW, imgH], Image.Resampling.LANCZOS)
|
|
36
58
|
img = np.array(img)
|
|
37
59
|
norm_img = np.expand_dims(img, -1)
|
|
38
60
|
norm_img = norm_img.transpose((2, 0, 1))
|
|
@@ -250,8 +272,9 @@ class TextRecognizer(PredictBase):
|
|
|
250
272
|
def norm_img_can(self, img, image_shape):
|
|
251
273
|
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image
|
|
252
274
|
|
|
253
|
-
|
|
254
|
-
|
|
275
|
+
# FIXME
|
|
276
|
+
# if self.inverse:
|
|
277
|
+
# img = 255 - img
|
|
255
278
|
|
|
256
279
|
if self.rec_image_shape[0] == 1:
|
|
257
280
|
h, w = img.shape
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
|
-
from typing import Literal, Callable, Protocol, runtime_checkable
|
|
2
|
+
from typing import Literal, Callable, Protocol, runtime_checkable, List
|
|
3
3
|
from enum import auto, Enum
|
|
4
4
|
from PIL.Image import Image
|
|
5
5
|
from .rectangle import Rectangle
|
|
@@ -32,7 +32,7 @@ class TableLayoutParsedFormat(Enum):
|
|
|
32
32
|
@dataclass
|
|
33
33
|
class BaseLayout:
|
|
34
34
|
rect: Rectangle
|
|
35
|
-
fragments:
|
|
35
|
+
fragments: List[OCRFragment]
|
|
36
36
|
|
|
37
37
|
@dataclass
|
|
38
38
|
class PlainLayout(BaseLayout):
|
|
@@ -59,11 +59,12 @@ class FormulaLayout(BaseLayout):
|
|
|
59
59
|
|
|
60
60
|
Layout = PlainLayout | TableLayout | FormulaLayout
|
|
61
61
|
|
|
62
|
+
|
|
62
63
|
@dataclass
|
|
63
64
|
class ExtractedResult:
|
|
64
65
|
rotation: float
|
|
65
|
-
layouts:
|
|
66
|
-
extracted_image: Image
|
|
66
|
+
layouts: List[Layout]
|
|
67
|
+
extracted_image: Image | None
|
|
67
68
|
adjusted_image: Image | None
|
|
68
69
|
|
|
69
70
|
GetModelDir = Callable[[], str]
|
|
@@ -86,3 +87,5 @@ class ModelsDownloader(Protocol):
|
|
|
86
87
|
|
|
87
88
|
def latex(self) -> str:
|
|
88
89
|
pass
|
|
90
|
+
|
|
91
|
+
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: doc-page-extractor
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: doc page extractor can identify text and format in images and return structured data.
|
|
5
5
|
Home-page: https://github.com/Moskize91/doc-page-extractor
|
|
6
6
|
Author: Tao Zeyu
|
|
7
7
|
Author-email: i@taozeyu.com
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
|
-
Requires-Dist: opencv-python<5.0,>=4.
|
|
10
|
+
Requires-Dist: opencv-python<5.0,>=4.10.0
|
|
11
11
|
Requires-Dist: pillow<11.0,>=10.3
|
|
12
12
|
Requires-Dist: pyclipper<2.0,>=1.2.0
|
|
13
13
|
Requires-Dist: numpy<2.0,>=1.24.0
|
|
@@ -5,7 +5,7 @@ if "doc_page_extractor.struct_eqtable" not in find_packages():
|
|
|
5
5
|
|
|
6
6
|
setup(
|
|
7
7
|
name="doc-page-extractor",
|
|
8
|
-
version="0.1.2",
|
|
8
|
+
version="0.2.0",
|
|
9
9
|
author="Tao Zeyu",
|
|
10
10
|
author_email="i@taozeyu.com",
|
|
11
11
|
url="https://github.com/Moskize91/doc-page-extractor",
|
|
@@ -14,7 +14,7 @@ setup(
|
|
|
14
14
|
long_description=open("./README.md", encoding="utf8").read(),
|
|
15
15
|
long_description_content_type="text/markdown",
|
|
16
16
|
install_requires=[
|
|
17
|
-
"opencv-python>=4.
|
|
17
|
+
"opencv-python>=4.10.0,<5.0",
|
|
18
18
|
"pillow>=10.3,<11.0",
|
|
19
19
|
"pyclipper>=1.2.0,<2.0",
|
|
20
20
|
"numpy>=1.24.0,<2.0",
|
|
@@ -15,7 +15,7 @@ class TestGroup(unittest.TestCase):
|
|
|
15
15
|
layouts: list[tuple[LayoutClass, list[str]]]
|
|
16
16
|
|
|
17
17
|
with Image.open(image_path) as image:
|
|
18
|
-
result = extractor.extract(image,
|
|
18
|
+
result = extractor.extract(image, extract_formula=False)
|
|
19
19
|
layouts = [self._format_Layout(layout) for layout in result.layouts]
|
|
20
20
|
|
|
21
21
|
self.assertEqual(layouts, [
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/__init__.py
RENAMED
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/cls_postprocess.py
RENAMED
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/db_postprocess.py
RENAMED
|
File without changes
|
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/operators.py
RENAMED
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/predict_system.py
RENAMED
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/onnxocr/rec_postprocess.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor/struct_eqtable/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{doc_page_extractor-0.1.2 → doc_page_extractor-0.2.0}/doc_page_extractor.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|