deepdoc_lib-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,90 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import io
17
+ import sys
18
+ import threading
19
+
20
+ import pdfplumber
21
+
22
+ from .ocr import OCR
23
+ from .recognizer import Recognizer
24
+ from .layout_recognizer import AscendLayoutRecognizer
25
+ from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
26
+ from .table_structure_recognizer import TableStructureRecognizer
27
+
28
# Key under which a single lock guarding pdfplumber usage is registered in
# ``sys.modules``. An already-registered lock is reused; otherwise a new one is
# created here.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
31
+
32
+
33
def init_in_out(args):
    """Collect input images and the output paths their results are written to.

    ``args.inputs`` is either a single file or a directory (walked with
    ``traversal_files``). A PDF contributes one rendered image per page; any
    other file is opened with Pillow and converted to RGB. A non-PDF file that
    cannot be read as an image is reported with a traceback and skipped, while
    errors raised when opening or rendering a PDF propagate to the caller.

    Args:
        args: Object exposing ``inputs`` (file or directory path) and
            ``output_dir`` (directory for results; created when missing).

    Returns:
        A ``(images, outputs)`` pair of equally long lists: the loaded images
        and, for each image, its destination path inside ``args.output_dir``.
    """
    import os
    import traceback

    from PIL import Image

    from ..common.file_utils import traversal_files

    images = []
    outputs = []

    if not os.path.exists(args.output_dir):
        # makedirs also creates missing parents; exist_ok tolerates a
        # concurrent creation between the check and the call.
        os.makedirs(args.output_dir, exist_ok=True)

    def pdf_pages(fnm, zoomin=3):
        """Render every page of *fnm* and append the images and output names."""
        with sys.modules[LOCK_KEY_pdfplumber]:
            # The context manager closes the PDF even when rendering fails.
            with pdfplumber.open(fnm) as pdf:
                page_images = [p.to_image(resolution=72 * zoomin).annotated for p in pdf.pages]

        # Extend rather than rebind: rebinding would drop the images gathered
        # from previously processed files and desynchronise the two lists.
        base = os.path.split(fnm)[-1]
        images.extend(page_images)
        outputs.extend(f"{base}_{i}.jpg" for i in range(len(page_images)))

    def images_and_outputs(fnm):
        """Load one input file (PDF or image) into ``images``/``outputs``."""
        if fnm.split(".")[-1].lower() == "pdf":
            pdf_pages(fnm)
            return
        try:
            with open(fnm, "rb") as fp:
                binary = fp.read()
            images.append(Image.open(io.BytesIO(binary)).convert("RGB"))
            outputs.append(os.path.split(fnm)[-1])
        except Exception:
            # Best effort: an unreadable file must not abort the whole batch.
            traceback.print_exc()

    if os.path.isdir(args.inputs):
        for fnm in traversal_files(args.inputs):
            images_and_outputs(fnm)
    else:
        images_and_outputs(args.inputs)

    return images, [os.path.join(args.output_dir, name) for name in outputs]
81
+
82
+
83
# Names exported by a star-import of this package.
__all__ = [
    "OCR", "Recognizer",
    "LayoutRecognizer", "AscendLayoutRecognizer",
    "TableStructureRecognizer",
    "init_in_out",
]
@@ -0,0 +1,481 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import logging
18
+ import math
19
+ import os
20
+ # import re
21
+ from collections import Counter
22
+ from copy import deepcopy
23
+
24
+ import cv2
25
+ import numpy as np
26
+ from ..common.model_store import resolve_vision_model_dir
27
+ from deepdoc.vision import Recognizer
28
+ from deepdoc.vision.operators import nms
29
+
30
+
31
class LayoutRecognizer(Recognizer):
    """Detect page layout regions and tag OCR text boxes with them.

    Region detection is delegated to ``Recognizer.__call__`` or, when the
    ``TENSORRT_DLA_SVR`` environment variable is set, to a ``DLAClient`` built
    from that variable's value. ``__call__`` then assigns each OCR box the
    layout region it overlaps and optionally removes header/footer/reference
    text.
    """

    # Class names handed to the base ``Recognizer`` initializer.
    labels = [
        "_background_",
        "Text",
        "Title",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Header",
        "Footer",
        "Reference",
        "Equation",
    ]

    def __init__(
        self,
        domain,
        model_dir: str | None = None,
        model_home: str | None = None,
        model_provider: str | None = None,
        offline: bool | None = None,
    ):
        """Initialise the recognizer.

        Args:
            domain: Forwarded unchanged to ``Recognizer.__init__``.
            model_dir: Directory holding the model. When falsy it is resolved
                with ``resolve_vision_model_dir`` from the three arguments
                below.
            model_home: Passed to ``resolve_vision_model_dir`` as ``model_home``.
            model_provider: Passed to ``resolve_vision_model_dir`` as ``provider``.
            offline: Passed to ``resolve_vision_model_dir`` as ``offline``.
        """
        if not model_dir:
            model_dir = resolve_vision_model_dir(
                model_home=model_home,
                provider=model_provider,
                offline=offline,
            )
        super().__init__(self.labels, domain, model_dir)

        # Layout types whose text is treated as page furniture by __call__.
        self.garbage_layouts = ["footer", "header", "reference"]
        self.client = None
        if os.environ.get("TENSORRT_DLA_SVR"):
            # NOTE(review): deepdoc/vision/dla_cli.py does not appear in the
            # file listing of this wheel - confirm the module is shipped,
            # otherwise setting TENSORRT_DLA_SVR makes this import fail.
            from deepdoc.vision.dla_cli import DLAClient

            self.client = DLAClient(os.environ["TENSORRT_DLA_SVR"])

    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
        """Tag OCR boxes with layout information.

        Args:
            image_list: One image per page. ``image.size[1]`` is read as the
                page height (presumably PIL images - confirm with callers).
            ocr_res: One list of OCR box dicts per page. The ``"text"``,
                ``"top"`` and ``"bottom"`` keys are read here. The per-page
                lists and their dicts are modified in place.
            scale_factor: Divisor applied to detected layout coordinates.
            thr: Forwarded to ``Recognizer.__call__`` (unused when a
                ``DLAClient`` is configured).
            batch_size: Forwarded to ``Recognizer.__call__``.
            drop: When true, boxes matched to a layout type listed in
                ``self.garbage_layouts`` are removed, and their texts are
                remembered so repeated ones can be filtered everywhere.

        Returns:
            ``(boxes, page_layout)``: the flattened list of tagged boxes of all
            pages, and the list of cleaned-up layout regions per page.
        """

        def _is_garbage(box):
            # Pattern-based garbage detection is disabled in this class;
            # nothing is dropped on text content alone.
            return False

        assert len(image_list) == len(ocr_res)
        if self.client:
            layouts = self.client.predict(image_list)
        else:
            layouts = super().__call__(image_list, thr, batch_size)
        assert len(image_list) == len(layouts)

        # Layout types are matched in this order; a box keeps the first type
        # it is successfully matched to.
        tag_order = ["footer", "header", "reference", "figure caption", "table caption", "title", "table", "text", "figure", "equation"]

        boxes = []
        garbages = {}
        page_layout = []
        for pn, lts in enumerate(layouts):
            bxs = ocr_res[pn]
            # Low-score detections are kept unless they are garbage layouts.
            lts = [
                {
                    "type": b["type"],
                    "score": float(b["score"]),
                    "x0": b["bbox"][0] / scale_factor,
                    "x1": b["bbox"][2] / scale_factor,
                    "top": b["bbox"][1] / scale_factor,
                    "bottom": b["bbox"][-1] / scale_factor,
                    "page_number": pn,
                }
                for b in lts
                if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts
            ]
            if lts:
                # Guarded: the mean of an empty list is NaN and triggers a
                # NumPy RuntimeWarning; there is nothing to sort anyway.
                mean_height = np.mean([lt["bottom"] - lt["top"] for lt in lts])
                lts = self.sort_Y_firstly(lts, mean_height / 2)
            lts = self.layouts_cleanup(bxs, lts)
            page_layout.append(lts)

            def tag_boxes(ty):
                """Assign layout type *ty* to the still-untagged boxes of this page."""
                candidates = [lt for lt in lts if lt["type"] == ty]
                i = 0
                # Index-based loop because boxes are removed while scanning.
                while i < len(bxs):
                    box = bxs[i]
                    if box.get("layout_type"):
                        i += 1
                        continue
                    if _is_garbage(box):
                        bxs.pop(i)
                        continue

                    ii = self.find_overlapped_with_threshold(box, candidates, thr=0.4)
                    if ii is None:
                        # An empty string is falsy, so later types still get a try.
                        box["layout_type"] = ""
                        i += 1
                        continue

                    matched = candidates[ii]
                    matched["visited"] = True
                    # A "footer" above the bottom 10% of the page or a "header"
                    # below the top 10% is kept despite its layout type.
                    keep_feats = [
                        matched["type"] == "footer" and box["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
                        matched["type"] == "header" and box["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
                    ]
                    if drop and matched["type"] in self.garbage_layouts and not any(keep_feats):
                        garbages.setdefault(matched["type"], []).append(box["text"])
                        bxs.pop(i)
                        continue

                    box["layoutno"] = f"{ty}-{ii}"
                    box["layout_type"] = matched["type"] if matched["type"] != "equation" else "figure"
                    i += 1

            for ty in tag_order:
                tag_boxes(ty)

            # Figure/equation regions that matched no text box get a
            # placeholder box with empty text so they are not lost.
            for i, lt in enumerate([lt for lt in lts if lt["type"] in ["figure", "equation"]]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                del lt["type"]
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)

            boxes.extend(bxs)

        # Texts dropped as garbage more than once within a layout type are
        # filtered from every remaining box as well.
        garbag_set = set()
        for texts in garbages.values():
            for text, count in Counter(texts).items():
                if count > 1:
                    garbag_set.add(text)

        ocr_res = [b for b in boxes if b["text"].strip() not in garbag_set]
        return ocr_res, page_layout

    def forward(self, image_list, thr=0.7, batch_size=16):
        """Run only ``Recognizer.__call__`` and return its result unchanged."""
        return super().__call__(image_list, thr, batch_size)
168
+
169
+
170
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
    """``LayoutRecognizer`` variant with its own pre- and post-processing.

    Images are resized with preserved aspect ratio and padded to the model
    input size; detections are mapped back to original image coordinates and
    de-duplicated per class with ``nms``.
    """

    # Class names handed to the base initializer. Some names occur twice, so
    # several class indices map to the same layout type.
    labels = [
        "title",
        "Text",
        "Reference",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Table caption",
        "Equation",
        "Figure caption",
    ]

    def __init__(
        self,
        domain,
        model_dir: str | None = None,
        model_home: str | None = None,
        model_provider: str | None = None,
        offline: bool | None = None,
    ):
        """Initialise the recognizer.

        Args:
            domain: Accepted for signature compatibility only; the value is
                replaced by ``"layout"``.
            model_dir: Forwarded to ``LayoutRecognizer.__init__``.
            model_home: Forwarded to ``LayoutRecognizer.__init__``.
            model_provider: Forwarded to ``LayoutRecognizer.__init__``.
            offline: Forwarded to ``LayoutRecognizer.__init__``.
        """
        domain = "layout"
        super().__init__(
            domain,
            model_dir=model_dir,
            model_home=model_home,
            model_provider=model_provider,
            offline=offline,
        )
        # Of these settings only ``center`` is read within this class.
        self.auto = False
        self.scaleFill = False
        self.scaleup = True
        self.stride = 32
        self.center = True

    def preprocess(self, image_list):
        """Resize and pad each image to ``self.input_shape``.

        Args:
            image_list: Images as arrays whose first two axes are height and
                width; they are converted with ``cv2.COLOR_BGR2RGB``.

        Returns:
            One dict per image holding the network input under
            ``self.input_names[0]`` and ``"scale_factor"`` as
            ``[x_ratio, y_ratio, pad_w, pad_h]`` for mapping boxes back.
        """
        inputs = []
        new_shape = self.input_shape  # height, width
        for img in image_list:
            shape = img.shape[:2]  # current shape [height, width]
            # Scale ratio (new / old) that fits the image inside new_shape.
            r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
            new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (width, height)
            # Padding is split evenly between the two sides.
            dw = (new_shape[1] - new_unpad[0]) / 2
            dh = (new_shape[0] - new_unpad[1]) / 2
            ww, hh = new_unpad

            img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
            # The -0.1/+0.1 offsets make an odd padding total round to
            # floor/ceil on the two sides instead of both rounding the same way.
            top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
            left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # add border
            img /= 255.0
            img = img.transpose(2, 0, 1)  # HWC -> CHW
            img = img[np.newaxis, :, :, :].astype(np.float32)
            inputs.append({self.input_names[0]: img, "scale_factor": [shape[1] / ww, shape[0] / hh, dw, dh]})

        return inputs

    def postprocess(self, boxes, inputs, thr):
        """Convert raw model output into layout detections.

        Args:
            boxes: Model output; after squeezing, rows are read as
                ``[x1, y1, x2, y2, score, ..., class_id]``.
            inputs: The dict produced by ``preprocess`` for the same image.
            thr: Ignored; a fixed score threshold of 0.08 is applied.

        Returns:
            A list of dicts with ``"type"``, ``"bbox"`` and ``"score"``.
        """
        thr = 0.08
        boxes = np.squeeze(boxes)
        if boxes.ndim == 1:
            # A single detection row is squeezed down to 1-D; restore the row
            # axis so the column indexing below keeps working.
            boxes = boxes.reshape(1, -1)
        scores = boxes[:, 4]
        keep = scores > thr
        boxes = boxes[keep, :]
        scores = scores[keep]
        if len(boxes) == 0:
            return []
        class_ids = boxes[:, -1].astype(int)
        boxes = boxes[:, :4]
        # Remove the padding added in preprocess, then rescale to the
        # original image size.
        boxes[:, 0] -= inputs["scale_factor"][2]
        boxes[:, 2] -= inputs["scale_factor"][2]
        boxes[:, 1] -= inputs["scale_factor"][3]
        boxes[:, 3] -= inputs["scale_factor"][3]
        input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0], inputs["scale_factor"][1]])
        boxes = np.multiply(boxes, input_shape, dtype=np.float32)

        # Non-maximum suppression is applied separately per class.
        indices = []
        for class_id in np.unique(class_ids):
            class_indices = np.where(class_ids == class_id)[0]
            class_keep_boxes = nms(boxes[class_indices, :], scores[class_indices], 0.45)
            indices.extend(class_indices[class_keep_boxes])

        return [{"type": self.label_list[class_ids[i]].lower(), "bbox": [float(t) for t in boxes[i].tolist()], "score": float(scores[i])} for i in indices]
258
+
259
+
260
class AscendLayoutRecognizer(Recognizer):
    """Layout recognizer that runs an ``.om`` model through ``ais_bench``.

    ``Recognizer.__init__`` is not invoked; inference goes through an
    ``InferSession`` created in ``__init__``.
    """

    # Layout type per model class index. Some names occur twice, so several
    # class indices map to the same layout type.
    labels = [
        "title",
        "Text",
        "Reference",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Table caption",
        "Equation",
        "Figure caption",
    ]

    def __init__(self, domain, model_dir: str | None = None):
        """Load ``<domain>.om`` and open an inference session.

        Args:
            domain: Base name of the model file (``domain + ".om"``).
            model_dir: Directory containing the model file. Falls back to the
                ``DEEPDOC_ASCEND_MODEL_DIR`` environment variable.

        Raises:
            FileNotFoundError: Neither ``model_dir`` nor the environment
                variable provides a directory.
            ValueError: The model file does not exist in that directory.
        """
        from ais_bench.infer.interface import InferSession

        model_root = model_dir or os.getenv("DEEPDOC_ASCEND_MODEL_DIR")
        if not model_root:
            raise FileNotFoundError(
                "Ascend layout recognizer requires DEEPDOC_ASCEND_MODEL_DIR or an explicit model_dir."
            )

        model_file_path = os.path.join(model_root, domain + ".om")

        if not os.path.exists(model_file_path):
            raise ValueError(f"Model file not found: {model_file_path}")

        # Device selection via environment variable; defaults to device 0.
        device_id = int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0))
        self.session = InferSession(device_id=device_id, model_path=model_file_path)
        self.input_shape = self.session.get_inputs()[0].shape[2:4]  # H,W
        # Layout types whose text is treated as page furniture by __call__.
        self.garbage_layouts = ["footer", "header", "reference"]

    def preprocess(self, image_list):
        """Resize and pad each image to ``self.input_shape``.

        Args:
            image_list: Images as arrays whose first two axes are height and
                width; they are converted with ``cv2.COLOR_BGR2RGB``.

        Returns:
            One dict per image with the network input (``"image"``), the
            x/y ratios back to the original size (``"scale_factor"``), the
            padding added on each side (``"pad"``) and ``"orig_shape"``.
        """
        inputs = []
        H, W = self.input_shape
        for img in image_list:
            h, w = img.shape[:2]
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)

            # Fit the image inside (H, W) keeping its aspect ratio.
            r = min(H / h, W / w)
            new_unpad = (int(round(w * r)), int(round(h * r)))
            dw, dh = (W - new_unpad[0]) / 2.0, (H - new_unpad[1]) / 2.0

            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
            # The -0.1/+0.1 offsets make an odd padding total round to
            # floor/ceil on the two sides instead of both rounding the same way.
            top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
            left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))

            img /= 255.0
            img = img.transpose(2, 0, 1)[np.newaxis, :, :, :].astype(np.float32)  # HWC -> 1xCHW

            inputs.append(
                {
                    "image": img,
                    "scale_factor": [w / new_unpad[0], h / new_unpad[1]],
                    "pad": [dw, dh],
                    "orig_shape": [h, w],
                }
            )
        return inputs

    def postprocess(self, boxes, inputs, thr=0.25):
        """Convert raw model output into layout detections.

        Args:
            boxes: Model output whose squeezed form has six columns,
                ``[x1, y1, x2, y2, score, cls]``.
            inputs: The dict produced by ``preprocess`` for the same image.
            thr: Minimum score (inclusive) for a detection to be kept.

        Returns:
            A list of dicts with ``"type"``, ``"bbox"`` and ``"score"``.
            Detections whose class index is outside ``labels`` are skipped.

        Raises:
            ValueError: The squeezed output does not have six columns.
        """
        arr = np.squeeze(boxes)
        if arr.ndim == 1:
            # A single detection row was squeezed to 1-D; restore the row axis.
            arr = arr.reshape(1, -1)

        if arr.shape[1] != 6:
            raise ValueError(f"Unexpected output shape: {arr.shape}")

        # [x1,y1,x2,y2,score,cls]
        arr = arr[arr[:, 4] >= thr]
        if arr.size == 0:
            return []
        xyxy = arr[:, :4].astype(np.float32)
        scores = arr[:, 4].astype(np.float32)
        cls_ids = arr[:, 5].astype(np.int32)

        if "pad" in inputs:
            # Undo the padding added in preprocess before rescaling.
            dw, dh = inputs["pad"]
            xyxy[:, [0, 2]] -= dw
            xyxy[:, [1, 3]] -= dh
        sx, sy = inputs["scale_factor"]
        xyxy *= np.array([sx, sy, sx, sy], dtype=np.float32)

        # Non-maximum suppression is applied separately per class.
        keep_indices = []
        for c in np.unique(cls_ids):
            idx = np.where(cls_ids == c)[0]
            k = nms(xyxy[idx], scores[idx], 0.45)
            keep_indices.extend(idx[k])

        results = []
        for i in keep_indices:
            cid = int(cls_ids[i])
            if 0 <= cid < len(self.labels):
                results.append({"type": self.labels[cid].lower(), "bbox": [float(t) for t in xyxy[i].tolist()], "score": float(scores[i])})
        return results

    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
        """Detect layouts and tag OCR boxes with them.

        Args:
            image_list: One image per page; anything that is not already a
                NumPy array is converted with ``np.array``.
            ocr_res: One list of OCR box dicts per page. The ``"text"``,
                ``"top"`` and ``"bottom"`` keys are read here. The per-page
                lists and their dicts are modified in place.
            scale_factor: Divisor applied to detected layout coordinates.
            thr: Score threshold for detections; values below 0.08 are
                raised to 0.08.
            batch_size: Number of images preprocessed together.
            drop: When true, boxes matched to a layout type listed in
                ``self.garbage_layouts`` are removed, and their texts are
                remembered so repeated ones can be filtered everywhere.

        Returns:
            ``(boxes, page_layout)``: the flattened list of tagged boxes of all
            pages, and the list of cleaned-up layout regions per page.
        """
        import re

        assert len(image_list) == len(ocr_res)

        images = [np.array(im) if not isinstance(im, np.ndarray) else im for im in image_list]
        layouts_all_pages = []  # list of list[{"type","score","x0","x1","top","bottom","page_number"}]

        conf_thr = max(thr, 0.08)

        batch_loop_cnt = math.ceil(float(len(images)) / batch_size)
        for bi in range(batch_loop_cnt):
            s = bi * batch_size
            e = min((bi + 1) * batch_size, len(images))

            inputs_list = self.preprocess(images[s:e])
            logging.debug("preprocess done")

            for ins in inputs_list:
                out_list = self.session.infer(feeds=[ins["image"]], mode="static")

                # NOTE(review): one page entry is appended per element of
                # out_list; this presumably relies on the model returning a
                # single output per image - confirm.
                for out in out_list:
                    lts = self.postprocess(out, ins, conf_thr)

                    # Low-score detections are kept unless they are garbage layouts.
                    page_lts = []
                    for b in lts:
                        if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts:
                            x0, y0, x1, y1 = b["bbox"]
                            page_lts.append(
                                {
                                    "type": b["type"],
                                    "score": float(b["score"]),
                                    "x0": float(x0) / scale_factor,
                                    "x1": float(x1) / scale_factor,
                                    "top": float(y0) / scale_factor,
                                    "bottom": float(y1) / scale_factor,
                                    "page_number": len(layouts_all_pages),
                                }
                            )
                    layouts_all_pages.append(page_lts)

        # Compiled once per call instead of once per inspected box.
        garbage_patterns = [
            re.compile(p)
            for p in (r"^•+$", r"^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", r"^http://[^ ]{12,}", r"\(cid *: *[0-9]+ *\)")
        ]

        def _is_garbage_text(box):
            text = box.get("text", "")
            return any(p.search(text) for p in garbage_patterns)

        # Layout types are matched in this order; a box keeps the first type
        # it is successfully matched to.
        tag_order = ["footer", "header", "reference", "figure caption", "table caption", "title", "table", "text", "figure", "equation"]

        boxes_out = []
        page_layout = []
        garbages = {}

        for pn, lts in enumerate(layouts_all_pages):
            if lts:
                avg_h = np.mean([lt["bottom"] - lt["top"] for lt in lts])
                lts = self.sort_Y_firstly(lts, avg_h / 2 if avg_h > 0 else 0)

            bxs = ocr_res[pn]
            lts = self.layouts_cleanup(bxs, lts)
            page_layout.append(lts)

            # Use the converted array: the caller's image may be a non-ndarray
            # object (e.g. a PIL image) without a ``shape`` attribute.
            page_height = images[pn].shape[0]

            def _tag_layout(ty):
                """Assign layout type *ty* to the still-untagged boxes of this page."""
                lts_of_ty = [lt for lt in lts if lt["type"] == ty]
                i = 0
                # Index-based loop because boxes are removed while scanning.
                while i < len(bxs):
                    box = bxs[i]
                    if box.get("layout_type"):
                        i += 1
                        continue
                    if _is_garbage_text(box):
                        bxs.pop(i)
                        continue

                    ii = self.find_overlapped_with_threshold(box, lts_of_ty, thr=0.4)
                    if ii is None:
                        # An empty string is falsy, so later types still get a try.
                        box["layout_type"] = ""
                        i += 1
                        continue

                    matched = lts_of_ty[ii]
                    matched["visited"] = True

                    # A "footer" above the bottom 10% of the page or a "header"
                    # below the top 10% is kept despite its layout type.
                    keep_feats = [
                        matched["type"] == "footer" and box["bottom"] < page_height * 0.9 / scale_factor,
                        matched["type"] == "header" and box["top"] > page_height * 0.1 / scale_factor,
                    ]
                    if drop and matched["type"] in self.garbage_layouts and not any(keep_feats):
                        garbages.setdefault(matched["type"], []).append(box.get("text", ""))
                        bxs.pop(i)
                        continue

                    box["layoutno"] = f"{ty}-{ii}"
                    box["layout_type"] = matched["type"] if matched["type"] != "equation" else "figure"
                    i += 1

            for ty in tag_order:
                _tag_layout(ty)

            # Figure/equation regions that matched no text box get a
            # placeholder box with empty text so they are not lost.
            figs = [lt for lt in lts if lt["type"] in ["figure", "equation"]]
            for i, lt in enumerate(figs):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                lt.pop("type", None)
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)

            boxes_out.extend(bxs)

        # Texts dropped as garbage more than once within a layout type are
        # filtered from every remaining box as well.
        garbag_set = set()
        for lst in garbages.values():
            for g, c in Counter(lst).items():
                if c > 1:
                    garbag_set.add(g)

        ocr_res_new = [b for b in boxes_out if b["text"].strip() not in garbag_set]
        return ocr_res_new, page_layout