deepdoc-lib 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoc/README.md +122 -0
- deepdoc/README_zh.md +116 -0
- deepdoc/__init__.py +43 -0
- deepdoc/_version.py +34 -0
- deepdoc/common/__init__.py +52 -0
- deepdoc/common/config_utils.py +63 -0
- deepdoc/common/connection_utils.py +73 -0
- deepdoc/common/file_utils.py +19 -0
- deepdoc/common/misc_utils.py +44 -0
- deepdoc/common/model_store.py +369 -0
- deepdoc/common/settings.py +42 -0
- deepdoc/common/tiktoken_cache.py +84 -0
- deepdoc/common/token_utils.py +96 -0
- deepdoc/config.py +149 -0
- deepdoc/depend/find_codec.py +42 -0
- deepdoc/depend/nltk_manager.py +114 -0
- deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
- deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
- deepdoc/depend/prompts.py +35 -0
- deepdoc/depend/rag_tokenizer.py +578 -0
- deepdoc/depend/simple_cv_model.py +469 -0
- deepdoc/depend/surname.py +91 -0
- deepdoc/depend/timeout.py +73 -0
- deepdoc/depend/vision_llm_chunk.py +35 -0
- deepdoc/dict/README.md +19 -0
- deepdoc/dict/huqie.txt +555629 -0
- deepdoc/download_models.py +169 -0
- deepdoc/llm_adapter/__init__.py +15 -0
- deepdoc/llm_adapter/adapter.py +223 -0
- deepdoc/llm_adapter/utils.py +104 -0
- deepdoc/llm_adapter/vision.py +163 -0
- deepdoc/parser/__init__.py +42 -0
- deepdoc/parser/docling_parser.py +889 -0
- deepdoc/parser/docx_parser.py +150 -0
- deepdoc/parser/excel_parser.py +270 -0
- deepdoc/parser/figure_parser.py +182 -0
- deepdoc/parser/html_parser.py +221 -0
- deepdoc/parser/json_parser.py +179 -0
- deepdoc/parser/markdown_parser.py +321 -0
- deepdoc/parser/mineru_parser.py +646 -0
- deepdoc/parser/pdf_parser.py +1591 -0
- deepdoc/parser/ppt_parser.py +96 -0
- deepdoc/parser/resume/__init__.py +109 -0
- deepdoc/parser/resume/entities/__init__.py +15 -0
- deepdoc/parser/resume/entities/corporations.py +128 -0
- deepdoc/parser/resume/entities/degrees.py +44 -0
- deepdoc/parser/resume/entities/industries.py +712 -0
- deepdoc/parser/resume/entities/regions.py +789 -0
- deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
- deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
- deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
- deepdoc/parser/resume/entities/res/good_corp.json +911 -0
- deepdoc/parser/resume/entities/res/good_sch.json +595 -0
- deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
- deepdoc/parser/resume/entities/res/schools.csv +5713 -0
- deepdoc/parser/resume/entities/schools.py +91 -0
- deepdoc/parser/resume/step_one.py +189 -0
- deepdoc/parser/resume/step_two.py +692 -0
- deepdoc/parser/tcadp_parser.py +538 -0
- deepdoc/parser/txt_parser.py +64 -0
- deepdoc/parser/utils.py +33 -0
- deepdoc/vision/__init__.py +90 -0
- deepdoc/vision/layout_recognizer.py +481 -0
- deepdoc/vision/ocr.py +757 -0
- deepdoc/vision/operators.py +733 -0
- deepdoc/vision/postprocess.py +370 -0
- deepdoc/vision/recognizer.py +451 -0
- deepdoc/vision/seeit.py +87 -0
- deepdoc/vision/t_ocr.py +101 -0
- deepdoc/vision/t_recognizer.py +186 -0
- deepdoc/vision/table_structure_recognizer.py +617 -0
- deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
- deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
- deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
- deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
- deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
- deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
- scripts/download_models.py +10 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
import io
|
|
17
|
+
import sys
|
|
18
|
+
import threading
|
|
19
|
+
|
|
20
|
+
import pdfplumber
|
|
21
|
+
|
|
22
|
+
from .ocr import OCR
|
|
23
|
+
from .recognizer import Recognizer
|
|
24
|
+
from .layout_recognizer import AscendLayoutRecognizer
|
|
25
|
+
from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
|
|
26
|
+
from .table_structure_recognizer import TableStructureRecognizer
|
|
27
|
+
|
|
28
|
+
# Name of the sys.modules slot that holds the lock used around pdfplumber calls.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
# Register the lock only once; an already registered lock object is kept as is.
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def init_in_out(args):
    """Load every input page image and derive one output path per image.

    ``args.inputs`` may name a single file or a directory; a directory is
    walked with ``traversal_files``. Files whose extension is ``pdf`` are
    rendered page by page through pdfplumber, every other file is opened
    with Pillow and converted to RGB. Non-PDF files that cannot be read or
    decoded are reported with a traceback and skipped.

    Args:
        args: object exposing ``inputs`` (file or directory path) and
            ``output_dir`` (directory for the results, created if missing).

    Returns:
        ``(images, outputs)``: two parallel lists. ``images[i]`` is the
        loaded page image and ``outputs[i]`` is its target path inside
        ``args.output_dir`` (``<file name>`` for plain images,
        ``<file name>_<page index>.jpg`` for PDF pages).
    """
    import os
    import traceback

    from PIL import Image

    from ..common.file_utils import traversal_files

    images = []
    outputs = []

    if not os.path.exists(args.output_dir):
        # makedirs also creates missing parents; exist_ok tolerates a
        # concurrent creation between the check and the call.
        os.makedirs(args.output_dir, exist_ok=True)

    def pdf_pages(fnm, zoomin=3):
        """Render all pages of the PDF ``fnm`` at ``72 * zoomin`` resolution."""
        base_name = os.path.split(fnm)[-1]
        # Opening and rendering happen under the shared pdfplumber lock; the
        # context manager closes the document even if rendering raises.
        with sys.modules[LOCK_KEY_pdfplumber]:
            with pdfplumber.open(fnm) as pdf:
                pages = [p.to_image(resolution=72 * zoomin).annotated for p in pdf.pages]

        # Extend (never replace) the shared lists so that images and outputs
        # stay parallel when several input files are processed.
        for page_index, page in enumerate(pages):
            images.append(page)
            outputs.append(base_name + f"_{page_index}.jpg")

    def images_and_outputs(fnm):
        """Load one input file, dispatching on its extension."""
        if fnm.split(".")[-1].lower() == "pdf":
            pdf_pages(fnm)
            return
        try:
            with open(fnm, "rb") as fp:
                binary = fp.read()
            image = Image.open(io.BytesIO(binary)).convert("RGB")
        except Exception:
            # Best effort: report the unreadable file and carry on.
            traceback.print_exc()
            return
        images.append(image)
        outputs.append(os.path.split(fnm)[-1])

    if os.path.isdir(args.inputs):
        for fnm in traversal_files(args.inputs):
            images_and_outputs(fnm)
    else:
        images_and_outputs(args.inputs)

    outputs = [os.path.join(args.output_dir, name) for name in outputs]

    return images, outputs
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Names exported by ``from deepdoc.vision import *``.
__all__ = [
    "OCR", "Recognizer",
    "LayoutRecognizer", "AscendLayoutRecognizer",
    "TableStructureRecognizer",
    "init_in_out",
]
|
|
@@ -0,0 +1,481 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import math
|
|
19
|
+
import os
|
|
20
|
+
# import re
|
|
21
|
+
from collections import Counter
|
|
22
|
+
from copy import deepcopy
|
|
23
|
+
|
|
24
|
+
import cv2
|
|
25
|
+
import numpy as np
|
|
26
|
+
from ..common.model_store import resolve_vision_model_dir
|
|
27
|
+
from deepdoc.vision import Recognizer
|
|
28
|
+
from deepdoc.vision.operators import nms
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LayoutRecognizer(Recognizer):
    """Layout detector that tags OCR text boxes with the page region they fall in.

    Layout regions come from the base ``Recognizer`` model or, when the
    ``TENSORRT_DLA_SVR`` environment variable is set, from a ``DLAClient``
    created with that variable's value.
    """

    # Class names handed to the base ``Recognizer`` constructor.
    labels = [
        "_background_",
        "Text",
        "Title",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Header",
        "Footer",
        "Reference",
        "Equation",
    ]

    def __init__(
        self,
        domain,
        model_dir: str | None = None,
        model_home: str | None = None,
        model_provider: str | None = None,
        offline: bool | None = None,
    ):
        """Initialise the recognizer.

        Args:
            domain: forwarded unchanged to the base ``Recognizer``.
            model_dir: directory holding the model; when empty it is resolved
                with ``resolve_vision_model_dir`` from the three options below.
            model_home: forwarded to ``resolve_vision_model_dir``.
            model_provider: forwarded to ``resolve_vision_model_dir`` as
                ``provider``.
            offline: forwarded to ``resolve_vision_model_dir``.
        """
        if not model_dir:
            model_dir = resolve_vision_model_dir(
                model_home=model_home,
                provider=model_provider,
                offline=offline,
            )
        super().__init__(self.labels, domain, model_dir)

        # Layout types whose text is dropped from the OCR result when
        # ``drop`` is enabled in ``__call__``.
        self.garbage_layouts = ["footer", "header", "reference"]
        self.client = None
        if os.environ.get("TENSORRT_DLA_SVR"):
            # NOTE(review): deepdoc/vision/dla_cli.py is not part of the
            # package file listing shown with this release - confirm the
            # module ships before relying on TENSORRT_DLA_SVR.
            from deepdoc.vision.dla_cli import DLAClient

            self.client = DLAClient(os.environ["TENSORRT_DLA_SVR"])

    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
        """Detect layouts on every page and annotate the OCR boxes with them.

        Args:
            image_list: one page image per entry. Besides being passed to the
                detector, only the page height is read (``.shape[0]`` when the
                object has a ``shape``, otherwise ``.size[1]``), and only when
                a box matches a header or footer layout.
            ocr_res: one list of OCR box dicts per page, parallel to
                ``image_list``. The per-page lists are modified in place:
                boxes get ``layout_type``/``layoutno`` keys, may be removed,
                and placeholder boxes for unmatched figures are appended.
            scale_factor: divisor applied to detected layout coordinates
                before they are compared with the OCR boxes.
            thr: score threshold forwarded to the base recognizer.
            batch_size: batch size forwarded to the base recognizer.
            drop: when true, boxes matched to a layout listed in
                ``self.garbage_layouts`` are removed (headers/footers only if
                they lie in the top/bottom 10% of the page).

        Returns:
            ``(boxes, page_layout)``: the remaining boxes of all pages as one
            flat list, and the cleaned-up layout list of every page.
        """

        def _is_garbage(b):
            # Text-pattern based filtering is deliberately switched off here;
            # the hook is kept so the tagging loop keeps its shape.
            return False

        def _page_height(image):
            # numpy arrays expose ``shape`` (rows first); otherwise fall back
            # to the ``size`` pair whose second entry is read as the height.
            shape = getattr(image, "shape", None)
            if shape is not None:
                return shape[0]
            return image.size[1]

        # Validate the cheap precondition before running inference.
        assert len(image_list) == len(ocr_res)

        if self.client:
            layouts = self.client.predict(image_list)
        else:
            layouts = super().__call__(image_list, thr, batch_size)
        assert len(image_list) == len(layouts)

        boxes = []
        garbages = {}
        page_layout = []
        for pn, lts in enumerate(layouts):
            bxs = ocr_res[pn]
            # Low-score detections are discarded only for garbage layout types.
            lts = [
                {
                    "type": b["type"],
                    "score": float(b["score"]),
                    "x0": b["bbox"][0] / scale_factor,
                    "x1": b["bbox"][2] / scale_factor,
                    "top": b["bbox"][1] / scale_factor,
                    "bottom": b["bbox"][-1] / scale_factor,
                    "page_number": pn,
                }
                for b in lts
                if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts
            ]
            if lts:
                # Guarded: the mean of an empty list is NaN and triggers a
                # numpy RuntimeWarning.
                lts = self.sort_Y_firstly(lts, np.mean([lt["bottom"] - lt["top"] for lt in lts]) / 2)
            lts = self.layouts_cleanup(bxs, lts)
            page_layout.append(lts)

            def find_layout(ty):
                """Tag the still untagged boxes of this page against layouts of type ``ty``."""
                # The comprehension keeps the original dict objects, so the
                # "visited" flag set below is visible through ``lts`` as well.
                lts_ = [lt for lt in lts if lt["type"] == ty]
                i = 0
                while i < len(bxs):
                    if bxs[i].get("layout_type"):
                        i += 1
                        continue
                    if _is_garbage(bxs[i]):
                        bxs.pop(i)
                        continue

                    ii = self.find_overlapped_with_threshold(bxs[i], lts_, thr=0.4)
                    if ii is None:
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue
                    lts_[ii]["visited"] = True
                    matched_type = lts_[ii]["type"]
                    # A "footer" that is not near the page bottom, or a
                    # "header" that is not near the page top, is kept.
                    keep_feats = [
                        matched_type == "footer" and bxs[i]["bottom"] < _page_height(image_list[pn]) * 0.9 / scale_factor,
                        matched_type == "header" and bxs[i]["top"] > _page_height(image_list[pn]) * 0.1 / scale_factor,
                    ]
                    if drop and matched_type in self.garbage_layouts and not any(keep_feats):
                        garbages.setdefault(matched_type, []).append(bxs[i]["text"])
                        bxs.pop(i)
                        continue

                    bxs[i]["layoutno"] = f"{ty}-{ii}"
                    bxs[i]["layout_type"] = matched_type if matched_type != "equation" else "figure"
                    i += 1

            for lt in ["footer", "header", "reference", "figure caption", "table caption", "title", "table", "text", "figure", "equation"]:
                find_layout(lt)

            # Add a text-less placeholder box for every figure/equation
            # layout that no OCR box was matched to.
            for i, lt in enumerate([lt for lt in lts if lt["type"] in ["figure", "equation"]]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                del lt["type"]
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)

            boxes.extend(bxs)

        # Texts dropped more than once within one garbage layout type are
        # also removed wherever else they still appear.
        garbag_set = {
            text
            for texts in garbages.values()
            for text, count in Counter(texts).items()
            if count > 1
        }

        ocr_res = [b for b in boxes if b["text"].strip() not in garbag_set]
        return ocr_res, page_layout

    def forward(self, image_list, thr=0.7, batch_size=16):
        """Run only the base recognizer's detection, without tagging any OCR boxes."""
        return super().__call__(image_list, thr, batch_size)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
    """``LayoutRecognizer`` variant with its own letterbox pre-processing and
    box decoding (``preprocess`` / ``postprocess``).
    """

    # Class names indexed by the class id found in the last column of the
    # model output rows. Some names appear twice in this list.
    labels = [
        "title",
        "Text",
        "Reference",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Table caption",
        "Equation",
        "Figure caption",
    ]

    def __init__(
        self,
        domain,
        model_dir: str | None = None,
        model_home: str | None = None,
        model_provider: str | None = None,
        offline: bool | None = None,
    ):
        """Initialise the recognizer.

        Args:
            domain: accepted for signature compatibility only; the value is
                ignored and ``"layout"`` is always used.
            model_dir: forwarded to ``LayoutRecognizer``.
            model_home: forwarded to ``LayoutRecognizer``.
            model_provider: forwarded to ``LayoutRecognizer``.
            offline: forwarded to ``LayoutRecognizer``.
        """
        domain = "layout"
        super().__init__(
            domain,
            model_dir=model_dir,
            model_home=model_home,
            model_provider=model_provider,
            offline=offline,
        )
        # Letterbox settings; only ``center`` is read by ``preprocess``.
        self.auto = False
        self.scaleFill = False
        self.scaleup = True
        self.stride = 32
        self.center = True

    def preprocess(self, image_list):
        """Letterbox every image to ``self.input_shape`` and build model inputs.

        Each image is colour-converted with ``cv2.COLOR_BGR2RGB``, resized
        with preserved aspect ratio, padded with the value 114, scaled to
        ``[0, 1]`` and laid out as a ``1 x C x H x W`` float32 array.

        Args:
            image_list: images as ``H x W x C`` numpy arrays.

        Returns:
            One dict per image with the tensor stored under
            ``self.input_names[0]`` and ``"scale_factor"`` set to
            ``[x_scale, y_scale, x_offset, y_offset]``, the values
            ``postprocess`` needs to map boxes back to the source image.
        """
        inputs = []
        new_shape = self.input_shape  # height, width
        for img in image_list:
            shape = img.shape[:2]  # current shape [height, width]
            # Scale ratio (new / old)
            r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
            # Resized size before padding, as (width, height)
            new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
            dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
            if self.center:
                # Split the padding between both sides of each axis.
                dw /= 2
                dh /= 2
                top, left = int(round(dh - 0.1)), int(round(dw - 0.1))
                x_offset, y_offset = dw, dh
            else:
                # All padding goes to the bottom/right, so the image content
                # starts at the origin and needs no offset correction.
                top = left = 0
                x_offset = y_offset = 0.0
            bottom, right = int(round(dh + 0.1)), int(round(dw + 0.1))
            ww, hh = new_unpad

            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # add border
            img /= 255.0
            img = img.transpose(2, 0, 1)
            img = img[np.newaxis, :, :, :].astype(np.float32)
            inputs.append({self.input_names[0]: img, "scale_factor": [shape[1] / ww, shape[0] / hh, x_offset, y_offset]})

        return inputs

    def postprocess(self, boxes, inputs, thr):
        """Decode raw model rows into layout dicts in source-image coordinates.

        Args:
            boxes: raw model output; after squeezing, every row is read as
                ``[x1, y1, x2, y2, score, ..., class_id]``.
            inputs: the dict ``preprocess`` produced for this image.
            thr: ignored; a fixed score threshold of 0.08 is applied.

        Returns:
            A list of ``{"type", "bbox", "score"}`` dicts that survive the
            score filter and per-class NMS (IoU threshold 0.45).
        """
        thr = 0.08  # fixed on purpose; the caller's value is not used
        boxes = np.squeeze(boxes)
        if boxes.size == 0:
            return []
        if boxes.ndim == 1:
            # A single detection collapses to 1-D when squeezed.
            boxes = boxes.reshape(1, -1)
        scores = boxes[:, 4]
        confident = scores > thr
        boxes = boxes[confident, :]
        scores = scores[confident]
        if len(boxes) == 0:
            return []
        class_ids = boxes[:, -1].astype(int)
        boxes = boxes[:, :4]
        x_scale, y_scale, x_offset, y_offset = inputs["scale_factor"][:4]
        # Remove the letterbox padding, then undo the resize.
        boxes[:, 0] -= x_offset
        boxes[:, 2] -= x_offset
        boxes[:, 1] -= y_offset
        boxes[:, 3] -= y_offset
        input_shape = np.array([x_scale, y_scale, x_scale, y_scale])
        boxes = np.multiply(boxes, input_shape, dtype=np.float32)

        # Non-maximum suppression is applied to every class separately.
        indices = []
        for class_id in np.unique(class_ids):
            class_indices = np.where(class_ids == class_id)[0]
            class_keep_boxes = nms(boxes[class_indices, :], scores[class_indices], 0.45)
            indices.extend(class_indices[class_keep_boxes])

        return [{"type": self.label_list[class_ids[i]].lower(), "bbox": [float(t) for t in boxes[i].tolist()], "score": float(scores[i])} for i in indices]
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class AscendLayoutRecognizer(Recognizer):
    """Layout recognizer that runs a ``.om`` model through an ``ais_bench``
    ``InferSession`` and tags OCR boxes with the detected layouts.
    """

    # Class names indexed by the class id column of the model output rows.
    # Some names appear twice in this list.
    labels = [
        "title",
        "Text",
        "Reference",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Table caption",
        "Equation",
        "Figure caption",
    ]

    def __init__(self, domain, model_dir: str | None = None):
        """Open the inference session for ``<model root>/<domain>.om``.

        The base ``Recognizer`` constructor is not called here.

        Args:
            domain: model base name; ``".om"`` is appended to build the file
                name.
            model_dir: directory holding the model file. Falls back to the
                ``DEEPDOC_ASCEND_MODEL_DIR`` environment variable.

        Raises:
            FileNotFoundError: neither ``model_dir`` nor the environment
                variable provides a directory.
            ValueError: the model file does not exist.
        """
        from ais_bench.infer.interface import InferSession

        model_root = model_dir or os.getenv("DEEPDOC_ASCEND_MODEL_DIR")
        if not model_root:
            raise FileNotFoundError(
                "Ascend layout recognizer requires DEEPDOC_ASCEND_MODEL_DIR or an explicit model_dir."
            )

        model_file_path = os.path.join(model_root, domain + ".om")

        if not os.path.exists(model_file_path):
            raise ValueError(f"Model file not found: {model_file_path}")

        # Device index is taken from the environment, defaulting to 0.
        device_id = int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0))
        self.session = InferSession(device_id=device_id, model_path=model_file_path)
        self.input_shape = self.session.get_inputs()[0].shape[2:4]  # H,W
        # Layout types whose text is dropped from the OCR result when
        # ``drop`` is enabled in ``__call__``.
        self.garbage_layouts = ["footer", "header", "reference"]

    def preprocess(self, image_list):
        """Letterbox every image to ``self.input_shape`` and build model inputs.

        Each image is colour-converted with ``cv2.COLOR_BGR2RGB``, resized
        with preserved aspect ratio, padded on all sides with the value 114,
        scaled to ``[0, 1]`` and laid out as a ``1 x C x H x W`` float32
        array.

        Args:
            image_list: images as ``H x W x C`` numpy arrays.

        Returns:
            One dict per image with keys ``"image"`` (the tensor),
            ``"scale_factor"`` (``[x_scale, y_scale]`` back to the source
            size), ``"pad"`` (``[dw, dh]`` padding per side) and
            ``"orig_shape"`` (``[h, w]``).
        """
        inputs = []
        H, W = self.input_shape
        for img in image_list:
            h, w = img.shape[:2]
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)

            r = min(H / h, W / w)
            new_unpad = (int(round(w * r)), int(round(h * r)))  # (width, height)
            dw, dh = (W - new_unpad[0]) / 2.0, (H - new_unpad[1]) / 2.0

            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
            # The +/-0.1 rounding splits an odd amount of padding as n / n+1.
            top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
            left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))

            img /= 255.0
            img = img.transpose(2, 0, 1)[np.newaxis, :, :, :].astype(np.float32)

            inputs.append(
                {
                    "image": img,
                    "scale_factor": [w / new_unpad[0], h / new_unpad[1]],
                    "pad": [dw, dh],
                    "orig_shape": [h, w],
                }
            )
        return inputs

    def postprocess(self, boxes, inputs, thr=0.25):
        """Decode raw model rows into layout dicts in source-image coordinates.

        Args:
            boxes: raw model output; after squeezing it must have six
                columns, ``[x1, y1, x2, y2, score, cls]``.
            inputs: the dict ``preprocess`` produced for this image.
            thr: minimum score (inclusive) for a row to be kept.

        Returns:
            A list of ``{"type", "bbox", "score"}`` dicts that survive the
            score filter and per-class NMS (IoU threshold 0.45). Rows whose
            class id is outside ``self.labels`` are skipped.

        Raises:
            ValueError: the squeezed output does not have six columns.
        """
        arr = np.squeeze(boxes)
        if arr.ndim == 1:
            # A single detection collapses to 1-D when squeezed.
            arr = arr.reshape(1, -1)

        if arr.shape[1] != 6:
            raise ValueError(f"Unexpected output shape: {arr.shape}")

        # [x1,y1,x2,y2,score,cls]
        arr = arr[arr[:, 4] >= thr]
        if arr.size == 0:
            return []
        xyxy = arr[:, :4].astype(np.float32)
        scores = arr[:, 4].astype(np.float32)
        cls_ids = arr[:, 5].astype(np.int32)

        sx, sy = inputs["scale_factor"]
        if "pad" in inputs:
            # Remove the letterbox padding before undoing the resize.
            dw, dh = inputs["pad"]
            xyxy[:, [0, 2]] -= dw
            xyxy[:, [1, 3]] -= dh
        xyxy *= np.array([sx, sy, sx, sy], dtype=np.float32)

        # Non-maximum suppression is applied to every class separately.
        keep_indices = []
        for c in np.unique(cls_ids):
            idx = np.where(cls_ids == c)[0]
            k = nms(xyxy[idx], scores[idx], 0.45)
            keep_indices.extend(idx[k])

        results = []
        for i in keep_indices:
            cid = int(cls_ids[i])
            if 0 <= cid < len(self.labels):
                results.append({"type": self.labels[cid].lower(), "bbox": [float(t) for t in xyxy[i].tolist()], "score": float(scores[i])})
        return results

    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
        """Detect layouts on every page and annotate the OCR boxes with them.

        Args:
            image_list: one page image per entry; entries that are not numpy
                arrays are converted with ``np.array``.
            ocr_res: one list of OCR box dicts per page, parallel to
                ``image_list``. The per-page lists are modified in place:
                boxes get ``layout_type``/``layoutno`` keys, may be removed,
                and placeholder boxes for unmatched figures are appended.
            scale_factor: divisor applied to detected layout coordinates
                before they are compared with the OCR boxes.
            thr: requested score threshold; values below 0.08 are raised to
                0.08.
            batch_size: number of images pre-processed together. Inference
                itself runs one image at a time.
            drop: when true, boxes matched to a layout listed in
                ``self.garbage_layouts`` are removed (headers/footers only if
                they lie in the top/bottom 10% of the page).

        Returns:
            ``(boxes, page_layout)``: the remaining boxes of all pages as one
            flat list, and the cleaned-up layout list of every page.
        """
        # ``re`` is not imported at module level in this file.
        import re

        assert len(image_list) == len(ocr_res)

        images = [np.array(im) if not isinstance(im, np.ndarray) else im for im in image_list]
        layouts_all_pages = []  # list of list[{"type","score","bbox":[x1,y1,x2,y2]}]

        conf_thr = max(thr, 0.08)

        batch_loop_cnt = math.ceil(float(len(images)) / batch_size)
        for bi in range(batch_loop_cnt):
            s = bi * batch_size
            e = min((bi + 1) * batch_size, len(images))
            batch_images = images[s:e]

            inputs_list = self.preprocess(batch_images)
            logging.debug("preprocess done")

            for ins in inputs_list:
                feeds = [ins["image"]]
                out_list = self.session.infer(feeds=feeds, mode="static")

                # NOTE(review): one page entry is appended per model output,
                # which assumes the model has exactly one output - confirm.
                for out in out_list:
                    lts = self.postprocess(out, ins, conf_thr)

                    page_lts = []
                    for b in lts:
                        # Low-score detections are discarded only for
                        # garbage layout types.
                        if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts:
                            x0, y0, x1, y1 = b["bbox"]
                            page_lts.append(
                                {
                                    "type": b["type"],
                                    "score": float(b["score"]),
                                    "x0": float(x0) / scale_factor,
                                    "x1": float(x1) / scale_factor,
                                    "top": float(y0) / scale_factor,
                                    "bottom": float(y1) / scale_factor,
                                    "page_number": len(layouts_all_pages),
                                }
                            )
                    layouts_all_pages.append(page_lts)

        # Compiled once instead of rebuilding the pattern list for every box.
        garbage_patterns = [
            re.compile(p)
            for p in (r"^•+$", r"^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", r"^http://[^ ]{12,}", r"\(cid *: *[0-9]+ *\)")
        ]

        def _is_garbage_text(box):
            text = box.get("text", "")
            return any(p.search(text) for p in garbage_patterns)

        boxes_out = []
        page_layout = []
        garbages = {}

        for pn, lts in enumerate(layouts_all_pages):
            if lts:
                avg_h = np.mean([lt["bottom"] - lt["top"] for lt in lts])
                lts = self.sort_Y_firstly(lts, avg_h / 2 if avg_h > 0 else 0)

            bxs = ocr_res[pn]
            lts = self.layouts_cleanup(bxs, lts)
            page_layout.append(lts)

            # Read the height from the converted array: the caller's image
            # may be a non-numpy object without a ``shape`` attribute.
            page_height = images[pn].shape[0]

            def _tag_layout(ty):
                """Tag the still untagged boxes of this page against layouts of type ``ty``."""
                # The comprehension keeps the original dict objects, so the
                # "visited" flag set below is visible through ``lts`` as well.
                lts_of_ty = [lt for lt in lts if lt["type"] == ty]
                i = 0
                while i < len(bxs):
                    if bxs[i].get("layout_type"):
                        i += 1
                        continue
                    if _is_garbage_text(bxs[i]):
                        bxs.pop(i)
                        continue

                    ii = self.find_overlapped_with_threshold(bxs[i], lts_of_ty, thr=0.4)
                    if ii is None:
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue

                    lts_of_ty[ii]["visited"] = True
                    matched_type = lts_of_ty[ii]["type"]

                    # A "footer" that is not near the page bottom, or a
                    # "header" that is not near the page top, is kept.
                    keep_feats = [
                        matched_type == "footer" and bxs[i]["bottom"] < page_height * 0.9 / scale_factor,
                        matched_type == "header" and bxs[i]["top"] > page_height * 0.1 / scale_factor,
                    ]
                    if drop and matched_type in self.garbage_layouts and not any(keep_feats):
                        garbages.setdefault(matched_type, []).append(bxs[i].get("text", ""))
                        bxs.pop(i)
                        continue

                    bxs[i]["layoutno"] = f"{ty}-{ii}"
                    bxs[i]["layout_type"] = matched_type if matched_type != "equation" else "figure"
                    i += 1

            for ty in ["footer", "header", "reference", "figure caption", "table caption", "title", "table", "text", "figure", "equation"]:
                _tag_layout(ty)

            # Add a text-less placeholder box for every figure/equation
            # layout that no OCR box was matched to.
            figs = [lt for lt in lts if lt["type"] in ["figure", "equation"]]
            for i, lt in enumerate(figs):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                lt.pop("type", None)
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)

            boxes_out.extend(bxs)

        # Texts dropped more than once within one garbage layout type are
        # also removed wherever else they still appear.
        garbag_set = {
            text
            for texts in garbages.values()
            for text, count in Counter(texts).items()
            if count > 1
        }

        ocr_res_new = [b for b in boxes_out if b.get("text", "").strip() not in garbag_set]
        return ocr_res_new, page_layout
|