xfmr-zem 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff shows the contents of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (53)
  1. xfmr_zem/cli.py +32 -3
  2. xfmr_zem/client.py +59 -8
  3. xfmr_zem/server.py +21 -4
  4. xfmr_zem/servers/data_juicer/server.py +1 -1
  5. xfmr_zem/servers/instruction_gen/server.py +1 -1
  6. xfmr_zem/servers/io/server.py +1 -1
  7. xfmr_zem/servers/llm/parameters.yml +10 -0
  8. xfmr_zem/servers/nemo_curator/server.py +1 -1
  9. xfmr_zem/servers/ocr/deepdoc_vietocr/__init__.py +90 -0
  10. xfmr_zem/servers/ocr/deepdoc_vietocr/implementations.py +1286 -0
  11. xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py +562 -0
  12. xfmr_zem/servers/ocr/deepdoc_vietocr/ocr.py +512 -0
  13. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/.gitattributes +35 -0
  14. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/README.md +5 -0
  15. xfmr_zem/servers/ocr/deepdoc_vietocr/onnx/ocr.res +6623 -0
  16. xfmr_zem/servers/ocr/deepdoc_vietocr/operators.py +725 -0
  17. xfmr_zem/servers/ocr/deepdoc_vietocr/phases.py +191 -0
  18. xfmr_zem/servers/ocr/deepdoc_vietocr/pipeline.py +561 -0
  19. xfmr_zem/servers/ocr/deepdoc_vietocr/postprocess.py +370 -0
  20. xfmr_zem/servers/ocr/deepdoc_vietocr/recognizer.py +436 -0
  21. xfmr_zem/servers/ocr/deepdoc_vietocr/table_structure_recognizer.py +569 -0
  22. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/__init__.py +81 -0
  23. xfmr_zem/servers/ocr/deepdoc_vietocr/utils/file_utils.py +246 -0
  24. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/__init__.py +0 -0
  25. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/base.yml +58 -0
  26. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/config/vgg-seq2seq.yml +38 -0
  27. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/__init__.py +0 -0
  28. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/cnn.py +25 -0
  29. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/backbone/vgg.py +51 -0
  30. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/seqmodel/seq2seq.py +175 -0
  31. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/transformerocr.py +29 -0
  32. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/model/vocab.py +36 -0
  33. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/config.py +37 -0
  34. xfmr_zem/servers/ocr/deepdoc_vietocr/vietocr/tool/translate.py +111 -0
  35. xfmr_zem/servers/ocr/engines.py +242 -0
  36. xfmr_zem/servers/ocr/install_models.py +63 -0
  37. xfmr_zem/servers/ocr/parameters.yml +4 -0
  38. xfmr_zem/servers/ocr/server.py +102 -0
  39. xfmr_zem/servers/profiler/parameters.yml +4 -0
  40. xfmr_zem/servers/sinks/parameters.yml +6 -0
  41. xfmr_zem/servers/unstructured/parameters.yml +6 -0
  42. xfmr_zem/servers/unstructured/server.py +62 -0
  43. xfmr_zem/zenml_wrapper.py +20 -7
  44. {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/METADATA +20 -1
  45. xfmr_zem-0.2.6.dist-info/RECORD +58 -0
  46. xfmr_zem-0.2.4.dist-info/RECORD +0 -23
  47. /xfmr_zem/servers/data_juicer/{parameter.yaml → parameters.yml} +0 -0
  48. /xfmr_zem/servers/instruction_gen/{parameter.yaml → parameters.yml} +0 -0
  49. /xfmr_zem/servers/io/{parameter.yaml → parameters.yml} +0 -0
  50. /xfmr_zem/servers/nemo_curator/{parameter.yaml → parameters.yml} +0 -0
  51. {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/WHEEL +0 -0
  52. {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/entry_points.txt +0 -0
  53. {xfmr_zem-0.2.4.dist-info → xfmr_zem-0.2.6.dist-info}/licenses/LICENSE +0 -0
xfmr_zem/servers/ocr/deepdoc_vietocr/layout_recognizer.py (new file)
@@ -0,0 +1,562 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import re
+from collections import Counter
+from copy import deepcopy
+from pathlib import Path
+
+try:
+    from doclayout_yolo import YOLOv10
+except ImportError:
+    YOLOv10 = None
+
+import cv2
+import numpy as np
+
+from .recognizer import Recognizer
+from .operators import nms
+
+
+def get_project_base_directory():
+    return Path(__file__).resolve().parent
+
+
+class LayoutRecognizer(Recognizer):
+    labels = [
+        "_background_",
+        "Text",
+        "Title",
+        "Figure",
+        "Figure caption",
+        "Table",
+        "Table caption",
+        "Header",
+        "Footer",
+        "Reference",
+        "Equation",
+    ]
+
+    def __init__(self, domain):
+        # Base init that doesn't load an ONNX model automatically.
+        # Subclasses should handle their own model loading.
+        self.domain = domain
+        self.garbage_layouts = ["footer", "header", "reference"]
+        self.client = None
+        if os.environ.get("TENSORRT_DLA_SVR"):
+            from deepdoc.vision.dla_cli import DLAClient
+            self.client = DLAClient(os.environ["TENSORRT_DLA_SVR"])
+
+    def get_layouts_from_model(self, image_list, thr, batch_size):
+        if self.client:
+            return self.client.predict(image_list)
+        raise NotImplementedError("Subclasses must implement get_layouts_from_model")
+
+    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
+        def __is_garbage(b):
+            patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$",
+                    r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
+                    "\\(cid *: *[0-9]+ *\\)"
+                    ]
+            return any([re.search(p, b["text"]) for p in patt])
+
+        layouts = self.get_layouts_from_model(image_list, thr, batch_size)
+        # save_results(image_list, layouts, self.labels, output_dir='output/', threshold=0.7)
+        assert len(image_list) == len(ocr_res)
+        # Tag layout type
+        boxes = []
+        assert len(image_list) == len(layouts)
+        garbages = {}
+        page_layout = []
+        for pn, lts in enumerate(layouts):
+            bxs = ocr_res[pn]
+            lts = [{"type": b["type"],
+                    "score": float(b["score"]),
+                    "x0": b["bbox"][0] / scale_factor, "x1": b["bbox"][2] / scale_factor,
+                    "top": b["bbox"][1] / scale_factor, "bottom": b["bbox"][-1] / scale_factor,
+                    "page_number": pn,
+                    } for b in lts if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts]
+            lts = self.sort_Y_firstly(lts, np.mean(
+                [lt["bottom"] - lt["top"] for lt in lts]) / 2)
+            lts = self.layouts_cleanup(bxs, lts)
+            page_layout.append(lts)
+
+            # Tag layout type, layouts are ready
+            def findLayout(ty):
+                nonlocal bxs, lts, self
+                lts_ = [lt for lt in lts if lt["type"] == ty]
+                i = 0
+                while i < len(bxs):
+                    if bxs[i].get("layout_type"):
+                        i += 1
+                        continue
+                    if __is_garbage(bxs[i]):
+                        bxs.pop(i)
+                        continue
+
+                    ii = self.find_overlapped_with_threashold(bxs[i], lts_, thr=0.4)
+                    if ii is None:  # belongs to nothing
+                        bxs[i]["layout_type"] = ""
+                        i += 1
+                        continue
+                    lts_[ii]["visited"] = True
+                    keep_feats = [
+                        lts_[ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
+                        lts_[ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
+                    ]
+                    if drop and lts_[ii]["type"] in self.garbage_layouts and not any(keep_feats):
+                        if lts_[ii]["type"] not in garbages:
+                            garbages[lts_[ii]["type"]] = []
+                        garbages[lts_[ii]["type"]].append(bxs[i]["text"])
+                        bxs.pop(i)
+                        continue
+
+                    bxs[i]["layoutno"] = f"{ty}-{ii}"
+                    bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"] != "equation" else "figure"
+                    i += 1
+
+            for lt in ["footer", "header", "reference", "figure caption",
+                       "table caption", "title", "table", "text", "figure", "equation"]:
+                findLayout(lt)
+
+            # add a box for figure layouts which have no text box
+            for i, lt in enumerate(
+                    [lt for lt in lts if lt["type"] in ["figure", "equation"]]):
+                if lt.get("visited"):
+                    continue
+                lt = deepcopy(lt)
+                del lt["type"]
+                lt["text"] = ""
+                lt["layout_type"] = "figure"
+                lt["layoutno"] = f"figure-{i}"
+                bxs.append(lt)
+
+            boxes.extend(bxs)
+
+        ocr_res = boxes
+
+        garbag_set = set()
+        for k in garbages.keys():
+            garbages[k] = Counter(garbages[k])
+            for g, c in garbages[k].items():
+                if c > 1:
+                    garbag_set.add(g)
+
+        ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
+        return ocr_res, page_layout
+
+    def forward(self, image_list, thr=0.7, batch_size=16):
+        return self.get_layouts_from_model(image_list, thr, batch_size)
+
+
+class LayoutRecognizerDocLayoutYOLO(LayoutRecognizer):
+
+    def __init__(self, domain):
+        # DocLayout-YOLO handles loading via from_pretrained
+        self.labels = LayoutRecognizer.labels
+        self.domain = domain
+        self.garbage_layouts = ["footer", "header", "reference"]
+        self.client = None
+
+        if YOLOv10 is None:
+            raise ImportError("Could not import YOLOv10 from doclayout_yolo. Please run 'pip install doclayout-yolo'.")
+
+        # Load YOLOv10 model (DocStructBench) using the official library
+        try:
+            # Use hf_hub_download explicitly for robustness
+            from huggingface_hub import hf_hub_download
+            model_path = hf_hub_download(
+                repo_id="juliozhao/DocLayout-YOLO-DocStructBench",
+                filename="doclayout_yolo_docstructbench_imgsz1024.pt"
+            )
+            self.model = YOLOv10(model_path)
+
+            # OPTIMIZATION: Try to use ONNX model for CPU acceleration
+            try:
+                onnx_path = model_path.replace(".pt", ".onnx")
+                import logging
+
+                # If ONNX doesn't exist, try to export it
+                if not os.path.exists(onnx_path):
+                    logging.info(f"⚡ Generating optimized ONNX model for Layout Detection (First run only)...")
+                    try:
+                        self.model.export(format="onnx", imgsz=1024)
+                        logging.info(f"✅ Exported ONNX model to: {onnx_path}")
+                    except Exception as e:
+                        logging.warning(f"⚠️ Could not export ONNX (using PyTorch fallback): {e}")
+
+                # Load ONNX model if available
+                if os.path.exists(onnx_path):
+                    logging.info(f"🚀 Loading optimized ONNX model: {onnx_path}")
+                    # Re-initialize with ONNX
+                    self.model = YOLOv10(onnx_path, task='detect')
+            except Exception as e:
+                logging.warning(f"⚠️ ONNX Optimization failed, using standard PyTorch: {e}")
+                self.model = YOLOv10(model_path)
+        except Exception as e:
+            # Fallback if download fails or other issue, though from_pretrained handles cache
+            raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}")
+
+    def get_layouts_from_model(self, image_list, thr, batch_size):
+        # Use batch processing as suggested in the guide
+        # image_list is expected to be a list of numpy arrays (cv2 images)
+        results = self.model.predict(
+            image_list,
+            imgsz=1024,
+            conf=thr,
+            verbose=False,
+            device="cpu",  # CPU by default for safety; set to "cuda" to run on a GPU
+        )
+
+        layouts = []
+        for res in results:
+            page_layout = []
+            if res.boxes:
+                for i in range(len(res.boxes)):
+                    box = res.boxes[i]
+                    # box.xyxy: [x1, y1, x2, y2]
+                    coords = box.xyxy[0].cpu().numpy().tolist()
+                    score = float(box.conf[0].item())
+                    cls_id = int(box.cls[0].item())
+                    label = res.names[cls_id]
+
+                    page_layout.append({
+                        "type": label.lower(),  # Ensure lowercase for compatibility
+                        "bbox": coords,
+                        "score": score
+                    })
+            layouts.append(page_layout)
+
+        return layouts
+
+    def forward(self, image_list, thr=0.7, batch_size=16):
+        return self.get_layouts_from_model(image_list, thr, batch_size)
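For orientation, here is a minimal usage sketch of the new DocLayout-YOLO-backed recognizer, assuming doclayout-yolo, huggingface_hub and opencv-python are installed so that __init__ can download the model. The import path follows the file list above; the domain string and the image path below are placeholders.

import cv2
from xfmr_zem.servers.ocr.deepdoc_vietocr.layout_recognizer import LayoutRecognizerDocLayoutYOLO

# "layout" is a placeholder domain label; __init__ only stores it.
recognizer = LayoutRecognizerDocLayoutYOLO("layout")

# forward() runs the detector and returns, per input image, a list of
# {"type", "bbox", "score"} dicts (bbox as [x1, y1, x2, y2]).
page = cv2.imread("page_0.png")  # placeholder image path
layouts = recognizer.forward([page], thr=0.7)
for box in layouts[0]:
    print(box["type"], round(box["score"], 2), box["bbox"])

# The full __call__(image_list, ocr_res, ...) path additionally merges these
# detections with OCR text boxes and drops headers, footers and references.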