deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,451 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import logging
17
+ import math
18
+ import numpy as np
19
+ import cv2
20
+ from functools import cmp_to_key
21
+
22
+
23
+ from ..common.model_store import resolve_vision_model_dir
24
+ from .operators import * # noqa: F403
25
+ from .operators import preprocess
26
+ from . import operators
27
+ from .ocr import load_model
28
+
29
+ class Recognizer:
30
+ def __init__(
31
+ self,
32
+ label_list,
33
+ task_name,
34
+ model_dir=None,
35
+ model_home: str | None = None,
36
+ model_provider: str | None = None,
37
+ offline: bool | None = None,
38
+ ):
39
+ if not model_dir:
40
+ model_dir = resolve_vision_model_dir(
41
+ model_home=model_home,
42
+ provider=model_provider,
43
+ offline=offline,
44
+ )
45
+ self.ort_sess, self.run_options = load_model(model_dir, task_name)
46
+ self.input_names = [node.name for node in self.ort_sess.get_inputs()]
47
+ self.output_names = [node.name for node in self.ort_sess.get_outputs()]
48
+ self.input_shape = self.ort_sess.get_inputs()[0].shape[2:4]
49
+ self.label_list = label_list
50
+
51
+ @staticmethod
52
+ def sort_Y_firstly(arr, threshold):
53
+ def cmp(c1, c2):
54
+ diff = c1["top"] - c2["top"]
55
+ if abs(diff) < threshold:
56
+ diff = c1["x0"] - c2["x0"]
57
+ return diff
58
+ arr = sorted(arr, key=cmp_to_key(cmp))
59
+ return arr
60
+
61
+ @staticmethod
62
+ def sort_X_firstly(arr, threshold):
63
+ def cmp(c1, c2):
64
+ diff = c1["x0"] - c2["x0"]
65
+ if abs(diff) < threshold:
66
+ diff = c1["top"] - c2["top"]
67
+ return diff
68
+ arr = sorted(arr, key=cmp_to_key(cmp))
69
+ return arr
70
+
71
+ @staticmethod
72
+ def sort_C_firstly(arr, thr=0):
73
+ # sort using y1 first and then x1
74
+ # sorted(arr, key=lambda r: (r["x0"], r["top"]))
75
+ arr = Recognizer.sort_X_firstly(arr, thr)
76
+ for i in range(len(arr) - 1):
77
+ for j in range(i, -1, -1):
78
+ # restore the order using th
79
+ if "C" not in arr[j] or "C" not in arr[j + 1]:
80
+ continue
81
+ if arr[j + 1]["C"] < arr[j]["C"] \
82
+ or (
83
+ arr[j + 1]["C"] == arr[j]["C"]
84
+ and arr[j + 1]["top"] < arr[j]["top"]
85
+ ):
86
+ tmp = arr[j]
87
+ arr[j] = arr[j + 1]
88
+ arr[j + 1] = tmp
89
+ return arr
90
+
91
+ @staticmethod
92
+ def sort_R_firstly(arr, thr=0):
93
+ # sort using y1 first and then x1
94
+ # sorted(arr, key=lambda r: (r["top"], r["x0"]))
95
+ arr = Recognizer.sort_Y_firstly(arr, thr)
96
+ for i in range(len(arr) - 1):
97
+ for j in range(i, -1, -1):
98
+ if "R" not in arr[j] or "R" not in arr[j + 1]:
99
+ continue
100
+ if arr[j + 1]["R"] < arr[j]["R"] \
101
+ or (
102
+ arr[j + 1]["R"] == arr[j]["R"]
103
+ and arr[j + 1]["x0"] < arr[j]["x0"]
104
+ ):
105
+ tmp = arr[j]
106
+ arr[j] = arr[j + 1]
107
+ arr[j + 1] = tmp
108
+ return arr
109
+
110
+ @staticmethod
111
+ def overlapped_area(a, b, ratio=True):
112
+ tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
113
+ if b["x0"] > x1 or b["x1"] < x0:
114
+ return 0
115
+ if b["bottom"] < tp or b["top"] > btm:
116
+ return 0
117
+ x0_ = max(b["x0"], x0)
118
+ x1_ = min(b["x1"], x1)
119
+ assert x0_ <= x1_, "Bbox mismatch! T:{},B:{},X0:{},X1:{} ==> {}".format(
120
+ tp, btm, x0, x1, b)
121
+ tp_ = max(b["top"], tp)
122
+ btm_ = min(b["bottom"], btm)
123
+ assert tp_ <= btm_, "Bbox mismatch! T:{},B:{},X0:{},X1:{} => {}".format(
124
+ tp, btm, x0, x1, b)
125
+ ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
126
+ x0 != 0 and btm - tp != 0 else 0
127
+ if ov > 0 and ratio:
128
+ ov /= (x1 - x0) * (btm - tp)
129
+ return ov
130
+
131
+ @staticmethod
132
+ def layouts_cleanup(boxes, layouts, far=2, thr=0.7):
133
+ def not_overlapped(a, b):
134
+ return any([a["x1"] < b["x0"],
135
+ a["x0"] > b["x1"],
136
+ a["bottom"] < b["top"],
137
+ a["top"] > b["bottom"]])
138
+
139
+ i = 0
140
+ while i + 1 < len(layouts):
141
+ j = i + 1
142
+ while j < min(i + far, len(layouts)) \
143
+ and (layouts[i].get("type", "") != layouts[j].get("type", "")
144
+ or not_overlapped(layouts[i], layouts[j])):
145
+ j += 1
146
+ if j >= min(i + far, len(layouts)):
147
+ i += 1
148
+ continue
149
+ if Recognizer.overlapped_area(layouts[i], layouts[j]) < thr \
150
+ and Recognizer.overlapped_area(layouts[j], layouts[i]) < thr:
151
+ i += 1
152
+ continue
153
+
154
+ if layouts[i].get("score") and layouts[j].get("score"):
155
+ if layouts[i]["score"] > layouts[j]["score"]:
156
+ layouts.pop(j)
157
+ else:
158
+ layouts.pop(i)
159
+ continue
160
+
161
+ area_i, area_i_1 = 0, 0
162
+ for b in boxes:
163
+ if not not_overlapped(b, layouts[i]):
164
+ area_i += Recognizer.overlapped_area(b, layouts[i], False)
165
+ if not not_overlapped(b, layouts[j]):
166
+ area_i_1 += Recognizer.overlapped_area(b, layouts[j], False)
167
+
168
+ if area_i > area_i_1:
169
+ layouts.pop(j)
170
+ else:
171
+ layouts.pop(i)
172
+
173
+ return layouts
174
+
175
+ def create_inputs(self, imgs, im_info):
176
+ """generate input for different model type
177
+ Args:
178
+ imgs (list(numpy)): list of images (np.ndarray)
179
+ im_info (list(dict)): list of image info
180
+ Returns:
181
+ inputs (dict): input of model
182
+ """
183
+ inputs = {}
184
+
185
+ im_shape = []
186
+ scale_factor = []
187
+ if len(imgs) == 1:
188
+ inputs['image'] = np.array((imgs[0],)).astype('float32')
189
+ inputs['im_shape'] = np.array(
190
+ (im_info[0]['im_shape'],)).astype('float32')
191
+ inputs['scale_factor'] = np.array(
192
+ (im_info[0]['scale_factor'],)).astype('float32')
193
+ return inputs
194
+
195
+ im_shape = np.array([info['im_shape'] for info in im_info], dtype='float32')
196
+ scale_factor = np.array([info['scale_factor'] for info in im_info], dtype='float32')
197
+
198
+ inputs['im_shape'] = np.concatenate(im_shape, axis=0)
199
+ inputs['scale_factor'] = np.concatenate(scale_factor, axis=0)
200
+
201
+ imgs_shape = [[e.shape[1], e.shape[2]] for e in imgs]
202
+ max_shape_h = max([e[0] for e in imgs_shape])
203
+ max_shape_w = max([e[1] for e in imgs_shape])
204
+ padding_imgs = []
205
+ for img in imgs:
206
+ im_c, im_h, im_w = img.shape[:]
207
+ padding_im = np.zeros(
208
+ (im_c, max_shape_h, max_shape_w), dtype=np.float32)
209
+ padding_im[:, :im_h, :im_w] = img
210
+ padding_imgs.append(padding_im)
211
+ inputs['image'] = np.stack(padding_imgs, axis=0)
212
+ return inputs
213
+
214
+ @staticmethod
215
+ def find_overlapped(box, boxes_sorted_by_y, naive=False):
216
+ if not boxes_sorted_by_y:
217
+ return
218
+ bxs = boxes_sorted_by_y
219
+ s, e, ii = 0, len(bxs), 0
220
+ while s < e and not naive:
221
+ ii = (e + s) // 2
222
+ pv = bxs[ii]
223
+ if box["bottom"] < pv["top"]:
224
+ e = ii
225
+ continue
226
+ if box["top"] > pv["bottom"]:
227
+ s = ii + 1
228
+ continue
229
+ break
230
+ while s < ii:
231
+ if box["top"] > bxs[s]["bottom"]:
232
+ s += 1
233
+ break
234
+ while e - 1 > ii:
235
+ if box["bottom"] < bxs[e - 1]["top"]:
236
+ e -= 1
237
+ break
238
+
239
+ max_overlapped_i, max_overlapped = None, 0
240
+ for i in range(s, e):
241
+ ov = Recognizer.overlapped_area(bxs[i], box)
242
+ if ov <= max_overlapped:
243
+ continue
244
+ max_overlapped_i = i
245
+ max_overlapped = ov
246
+
247
+ return max_overlapped_i
248
+
249
+ @staticmethod
250
+ def find_horizontally_tightest_fit(box, boxes):
251
+ if not boxes:
252
+ return
253
+ min_dis, min_i = 1000000, None
254
+ for i,b in enumerate(boxes):
255
+ if box.get("layoutno", "0") != b.get("layoutno", "0"):
256
+ continue
257
+ dis = min(abs(box["x0"] - b["x0"]), abs(box["x1"] - b["x1"]), abs(box["x0"]+box["x1"] - b["x1"] - b["x0"])/2)
258
+ if dis < min_dis:
259
+ min_i = i
260
+ min_dis = dis
261
+ return min_i
262
+
263
+ @staticmethod
264
+ def find_overlapped_with_threshold(box, boxes, thr=0.3):
265
+ if not boxes:
266
+ return
267
+ max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
268
+ s, e = 0, len(boxes)
269
+ for i in range(s, e):
270
+ ov = Recognizer.overlapped_area(box, boxes[i])
271
+ _ov = Recognizer.overlapped_area(boxes[i], box)
272
+ if (ov, _ov) < (max_overlapped, _max_overlapped):
273
+ continue
274
+ max_overlapped_i = i
275
+ max_overlapped = ov
276
+ _max_overlapped = _ov
277
+
278
+ return max_overlapped_i
279
+
280
+ def preprocess(self, image_list):
281
+ inputs = []
282
+ if "scale_factor" in self.input_names:
283
+ preprocess_ops = []
284
+ for op_info in [
285
+ {'interp': 2, 'keep_ratio': False, 'target_size': [800, 608], 'type': 'LinearResize'},
286
+ {'is_scale': True, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'type': 'StandardizeImage'},
287
+ {'type': 'Permute'},
288
+ {'stride': 32, 'type': 'PadStride'}
289
+ ]:
290
+ new_op_info = op_info.copy()
291
+ op_type = new_op_info.pop('type')
292
+ preprocess_ops.append(getattr(operators, op_type)(**new_op_info))
293
+
294
+ for im_path in image_list:
295
+ im, im_info = preprocess(im_path, preprocess_ops)
296
+ inputs.append({"image": np.array((im,)).astype('float32'),
297
+ "scale_factor": np.array((im_info["scale_factor"],)).astype('float32')})
298
+ else:
299
+ hh, ww = self.input_shape
300
+ for img in image_list:
301
+ h, w = img.shape[:2]
302
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
303
+ img = cv2.resize(np.array(img).astype('float32'), (ww, hh))
304
+ # Scale input pixel values to 0 to 1
305
+ img /= 255.0
306
+ img = img.transpose(2, 0, 1)
307
+ img = img[np.newaxis, :, :, :].astype(np.float32)
308
+ inputs.append({self.input_names[0]: img, "scale_factor": [w/ww, h/hh]})
309
+ return inputs
310
+
311
+ def postprocess(self, boxes, inputs, thr):
312
+ if "scale_factor" in self.input_names:
313
+ bb = []
314
+ for b in boxes:
315
+ clsid, bbox, score = int(b[0]), b[2:], b[1]
316
+ if score < thr:
317
+ continue
318
+ if clsid >= len(self.label_list):
319
+ continue
320
+ bb.append({
321
+ "type": self.label_list[clsid].lower(),
322
+ "bbox": [float(t) for t in bbox.tolist()],
323
+ "score": float(score)
324
+ })
325
+ return bb
326
+
327
+ def xywh2xyxy(x):
328
+ # [x, y, w, h] to [x1, y1, x2, y2]
329
+ y = np.copy(x)
330
+ y[:, 0] = x[:, 0] - x[:, 2] / 2
331
+ y[:, 1] = x[:, 1] - x[:, 3] / 2
332
+ y[:, 2] = x[:, 0] + x[:, 2] / 2
333
+ y[:, 3] = x[:, 1] + x[:, 3] / 2
334
+ return y
335
+
336
+ def compute_iou(box, boxes):
337
+ # Compute xmin, ymin, xmax, ymax for both boxes
338
+ xmin = np.maximum(box[0], boxes[:, 0])
339
+ ymin = np.maximum(box[1], boxes[:, 1])
340
+ xmax = np.minimum(box[2], boxes[:, 2])
341
+ ymax = np.minimum(box[3], boxes[:, 3])
342
+
343
+ # Compute intersection area
344
+ intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
345
+
346
+ # Compute union area
347
+ box_area = (box[2] - box[0]) * (box[3] - box[1])
348
+ boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
349
+ union_area = box_area + boxes_area - intersection_area
350
+
351
+ # Compute IoU
352
+ iou = intersection_area / union_area
353
+
354
+ return iou
355
+
356
+ def iou_filter(boxes, scores, iou_threshold):
357
+ sorted_indices = np.argsort(scores)[::-1]
358
+
359
+ keep_boxes = []
360
+ while sorted_indices.size > 0:
361
+ # Pick the last box
362
+ box_id = sorted_indices[0]
363
+ keep_boxes.append(box_id)
364
+
365
+ # Compute IoU of the picked box with the rest
366
+ ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
367
+
368
+ # Remove boxes with IoU over the threshold
369
+ keep_indices = np.where(ious < iou_threshold)[0]
370
+
371
+ # print(keep_indices.shape, sorted_indices.shape)
372
+ sorted_indices = sorted_indices[keep_indices + 1]
373
+
374
+ return keep_boxes
375
+
376
+ boxes = np.squeeze(boxes).T
377
+ # Filter out object confidence scores below threshold
378
+ scores = np.max(boxes[:, 4:], axis=1)
379
+ boxes = boxes[scores > thr, :]
380
+ scores = scores[scores > thr]
381
+ if len(boxes) == 0:
382
+ return []
383
+
384
+ # Get the class with the highest confidence
385
+ class_ids = np.argmax(boxes[:, 4:], axis=1)
386
+ boxes = boxes[:, :4]
387
+ input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0], inputs["scale_factor"][1]])
388
+ boxes = np.multiply(boxes, input_shape, dtype=np.float32)
389
+ boxes = xywh2xyxy(boxes)
390
+
391
+ unique_class_ids = np.unique(class_ids)
392
+ indices = []
393
+ for class_id in unique_class_ids:
394
+ class_indices = np.where(class_ids == class_id)[0]
395
+ class_boxes = boxes[class_indices, :]
396
+ class_scores = scores[class_indices]
397
+ class_keep_boxes = iou_filter(class_boxes, class_scores, 0.2)
398
+ indices.extend(class_indices[class_keep_boxes])
399
+
400
+ return [{
401
+ "type": self.label_list[class_ids[i]].lower(),
402
+ "bbox": [float(t) for t in boxes[i].tolist()],
403
+ "score": float(scores[i])
404
+ } for i in indices]
405
+
406
+ def close(self):
407
+ # NOTE: `__del__` can run during interpreter shutdown when module
408
+ # globals (including `logging`/`gc`) may already be cleared to None.
409
+ try:
410
+ import logging as _logging
411
+ _logging.info("Close recognizer.")
412
+ except Exception:
413
+ pass
414
+ if hasattr(self, "ort_sess"):
415
+ del self.ort_sess
416
+ try:
417
+ import gc as _gc
418
+ _gc.collect()
419
+ except Exception:
420
+ pass
421
+
422
+ def __call__(self, image_list, thr=0.7, batch_size=16):
423
+ res = []
424
+ images = []
425
+ for i in range(len(image_list)):
426
+ if not isinstance(image_list[i], np.ndarray):
427
+ images.append(np.array(image_list[i]))
428
+ else:
429
+ images.append(image_list[i])
430
+
431
+ batch_loop_cnt = math.ceil(float(len(images)) / batch_size)
432
+ for i in range(batch_loop_cnt):
433
+ start_index = i * batch_size
434
+ end_index = min((i + 1) * batch_size, len(images))
435
+ batch_image_list = images[start_index:end_index]
436
+ inputs = self.preprocess(batch_image_list)
437
+ logging.debug("preprocess")
438
+ for ins in inputs:
439
+ bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names}, self.run_options)[0], ins, thr)
440
+ res.append(bb)
441
+
442
+ #seeit.save_results(image_list, res, self.label_list, threshold=thr)
443
+
444
+ return res
445
+
446
+ def __del__(self):
447
+ try:
448
+ self.close()
449
+ except Exception:
450
+ # Destructors must never raise.
451
+ pass
@@ -0,0 +1,87 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import logging
18
+ import os
19
+ import PIL
20
+ from PIL import ImageDraw
21
+
22
+
23
+ def save_results(image_list, results, labels, output_dir='output/', threshold=0.5):
24
+ if not os.path.exists(output_dir):
25
+ os.makedirs(output_dir)
26
+ for idx, im in enumerate(image_list):
27
+ im = draw_box(im, results[idx], labels, threshold=threshold)
28
+
29
+ out_path = os.path.join(output_dir, f"{idx}.jpg")
30
+ im.save(out_path, quality=95)
31
+ logging.debug("save result to: " + out_path)
32
+
33
+
34
+ def draw_box(im, result, labels, threshold=0.5):
35
+ draw_thickness = min(im.size) // 320
36
+ draw = ImageDraw.Draw(im)
37
+ color_list = get_color_map_list(len(labels))
38
+ clsid2color = {n.lower():color_list[i] for i,n in enumerate(labels)}
39
+ result = [r for r in result if r["score"] >= threshold]
40
+
41
+ for dt in result:
42
+ color = tuple(clsid2color[dt["type"]])
43
+ xmin, ymin, xmax, ymax = dt["bbox"]
44
+ draw.line(
45
+ [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
46
+ (xmin, ymin)],
47
+ width=draw_thickness,
48
+ fill=color)
49
+
50
+ # draw label
51
+ text = "{} {:.4f}".format(dt["type"], dt["score"])
52
+ tw, th = imagedraw_textsize_c(draw, text)
53
+ draw.rectangle(
54
+ [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
55
+ draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
56
+ return im
57
+
58
+
59
+ def get_color_map_list(num_classes):
60
+ """
61
+ Args:
62
+ num_classes (int): number of class
63
+ Returns:
64
+ color_map (list): RGB color list
65
+ """
66
+ color_map = num_classes * [0, 0, 0]
67
+ for i in range(0, num_classes):
68
+ j = 0
69
+ lab = i
70
+ while lab:
71
+ color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
72
+ color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
73
+ color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
74
+ j += 1
75
+ lab >>= 3
76
+ color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)]
77
+ return color_map
78
+
79
+
80
+ def imagedraw_textsize_c(draw, text):
81
+ if int(PIL.__version__.split('.')[0]) < 10:
82
+ tw, th = draw.textsize(text)
83
+ else:
84
+ left, top, right, bottom = draw.textbbox((0, 0), text)
85
+ tw, th = right - left, bottom - top
86
+
87
+ return tw, th
@@ -0,0 +1,101 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import asyncio
18
+ import logging
19
+ import os
20
+ import sys
21
# Put the directory two levels above this file at the front of the import
# path so the absolute ``deepdoc`` imports resolve when run as a script.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../')))
28
+
29
+ from deepdoc.vision.seeit import draw_box
30
+ from deepdoc.vision import OCR, init_in_out
31
+ import argparse
32
+ import numpy as np
33
+
34
# Default to the first GPU, but keep a CUDA_VISIBLE_DEVICES value that is
# already set in the environment (e.g. '0,2' for two non-contiguous GPUs, or
# '' for CPU only). Overwriting it unconditionally would pin the process to a
# single device and make the multi-GPU path in ``main`` unreachable.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '0')
37
+
38
+
39
def main(args):
    """Run OCR over all inputs and write an annotated image plus a text file each.

    Every image is processed in a worker thread. When more than one CUDA
    device is visible, images are assigned to devices round-robin and a
    per-device semaphore lets only one task use a device at a time.
    """
    import torch.cuda

    cuda_devices = torch.cuda.device_count()
    if cuda_devices > 1:
        limiter = [asyncio.Semaphore(1) for _ in range(cuda_devices)]
    else:
        limiter = None
    ocr = OCR()
    images, outputs = init_in_out(args)

    def __ocr(i, device_id, img):
        print("Task {} start".format(i))
        recognized = ocr(np.array(img), device_id)
        bxs = []
        for line in recognized:
            quad, text = line[0], line[1][0]
            # Skip quads whose corners are not in the expected order.
            if not (quad[0][0] <= quad[1][0] and quad[0][1] <= quad[-1][1]):
                continue
            bxs.append({
                "text": text,
                "bbox": [quad[0][0], quad[0][1], quad[1][0], quad[-1][1]],
                "type": "ocr",
                "score": 1})
        annotated = draw_box(images[i], bxs, ["ocr"], 1.)
        annotated.save(outputs[i], quality=95)
        with open(outputs[i] + ".txt", "w+", encoding='utf-8') as f:
            f.write("\n".join([o["text"] for o in bxs]))

        print("Task {} done".format(i))

    async def __ocr_thread(i, device_id, img, limiter=None):
        if not limiter:
            await asyncio.to_thread(__ocr, i, device_id, img)
            return
        async with limiter:
            print(f"Task {i} use device {device_id}")
            await asyncio.to_thread(__ocr, i, device_id, img)

    async def __ocr_launcher():
        tasks = []
        for i, img in enumerate(images):
            dev_id = i % cuda_devices if cuda_devices > 1 else 0
            semaphore = limiter[dev_id] if limiter else None
            tasks.append(asyncio.create_task(__ocr_thread(i, dev_id, img, semaphore)))

        try:
            await asyncio.gather(*tasks, return_exceptions=False)
        except Exception as e:
            logging.error("OCR tasks failed: {}".format(e))
            # Cancel the remaining tasks and wait for them before re-raising.
            for pending in tasks:
                pending.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise

    asyncio.run(__ocr_launcher())

    print("OCR tasks are all done")
91
+
92
+
93
if __name__ == "__main__":
    # Command-line entry point: parse the input/output locations and run OCR.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--inputs',
        required=True,
        help="Directory where to store images or PDFs, or a file path to a single image or PDF",
    )
    parser.add_argument(
        '--output_dir',
        default="./ocr_outputs",
        help="Directory where to store the output images. Default: './ocr_outputs'",
    )
    args = parser.parse_args()
    main(args)