docling-ibm-models 1.1.6__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/LICENSE +1 -1
  2. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/PKG-INFO +1 -1
  3. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/layoutmodel/layout_predictor.py +33 -25
  4. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/matching_post_processor.py +44 -24
  5. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/tf_predictor.py +21 -5
  6. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/bbox_decoder_rs.py +5 -0
  7. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py +4 -4
  8. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/otsl.py +3 -0
  9. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/utils/app_profiler.py +12 -1
  10. docling_ibm_models-1.2.0/docling_ibm_models/tableformer/utils/mem_monitor.py +175 -0
  11. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/pyproject.toml +1 -1
  12. docling_ibm_models-1.1.6/docling_ibm_models/tableformer/utils/variance.py +0 -175
  13. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/README.md +0 -0
  14. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/__init__.py +0 -0
  15. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/common.py +0 -0
  16. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/__init__.py +0 -0
  17. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/data_transformer.py +0 -0
  18. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/functional.py +0 -0
  19. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/tf_cell_matcher.py +0 -0
  20. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/tf_dataset.py +0 -0
  21. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/transforms.py +0 -0
  22. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/__init__.py +0 -0
  23. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/common/__init__.py +0 -0
  24. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/common/base_model.py +0 -0
  25. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/__init__.py +0 -0
  26. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/encoder04_rs.py +0 -0
  27. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/tablemodel04_rs.py +0 -0
  28. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/settings.py +0 -0
  29. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/test_dataset_cache.py +0 -0
  30. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/test_prepare_image.py +0 -0
  31. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/utils/__init__.py +0 -0
  32. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/utils/torch_utils.py +0 -0
  33. {docling_ibm_models-1.1.6 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) [year] [fullname]
3
+ Copyright (c) 2024 International Business Machines
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-ibm-models
3
- Version: 1.1.6
3
+ Version: 1.2.0
4
4
  Summary: This package contains the AI models used by the Docling PDF conversion package
5
5
  License: MIT
6
6
  Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
@@ -14,29 +14,6 @@ MODEL_CHECKPOINT_FN = "model.pt"
14
14
  DEFAULT_NUM_THREADS = 4
15
15
 
16
16
 
17
- # Classes:
18
- CLASSES_MAP = {
19
- 0: "background",
20
- 1: "Caption",
21
- 2: "Footnote",
22
- 3: "Formula",
23
- 4: "List-item",
24
- 5: "Page-footer",
25
- 6: "Page-header",
26
- 7: "Picture",
27
- 8: "Section-header",
28
- 9: "Table",
29
- 10: "Text",
30
- 11: "Title",
31
- 12: "Document Index",
32
- 13: "Code",
33
- 14: "Checkbox-Selected",
34
- 15: "Checkbox-Unselected",
35
- 16: "Form",
36
- 17: "Key-Value Region",
37
- }
38
-
39
-
40
17
  class LayoutPredictor:
41
18
  r"""
42
19
  Document layout prediction using ONNX
@@ -69,6 +46,31 @@ class LayoutPredictor:
69
46
  ------
70
47
  FileNotFoundError when the model's ONNX file is missing
71
48
  """
49
+ # Initialize classes map:
50
+ self._classes_map = {
51
+ 0: "background",
52
+ 1: "Caption",
53
+ 2: "Footnote",
54
+ 3: "Formula",
55
+ 4: "List-item",
56
+ 5: "Page-footer",
57
+ 6: "Page-header",
58
+ 7: "Picture",
59
+ 8: "Section-header",
60
+ 9: "Table",
61
+ 10: "Text",
62
+ 11: "Title",
63
+ 12: "Document Index",
64
+ 13: "Code",
65
+ 14: "Checkbox-Selected",
66
+ 15: "Checkbox-Unselected",
67
+ 16: "Form",
68
+ 17: "Key-Value Region",
69
+ }
70
+
71
+ # Blacklisted classes
72
+ self._black_classes = set(["Form", "Key-Value Region"])
73
+
72
74
  # Set basic params
73
75
  self._threshold = 0.6 # Score threshold
74
76
  self._image_size = 640
@@ -159,13 +161,19 @@ class LayoutPredictor:
159
161
  )
160
162
 
161
163
  # Yield output
162
- for label, box, score in zip(labels[0], boxes[0], scores[0]):
164
+ for label_idx, box, score in zip(labels[0], boxes[0], scores[0]):
165
+ # Filter out blacklisted classes
166
+ label = self._classes_map[label_idx]
167
+ if label in self._black_classes:
168
+ continue
169
+
170
+ # Check against threshold
163
171
  if score > self._threshold:
164
172
  yield {
165
173
  "l": box[0] / self._image_size * w,
166
174
  "t": box[1] / self._image_size * h,
167
175
  "r": box[2] / self._image_size * w,
168
176
  "b": box[3] / self._image_size * h,
169
- "label": CLASSES_MAP[label],
177
+ "label": label,
170
178
  "confidence": score,
171
179
  }
@@ -4,6 +4,7 @@
4
4
  #
5
5
  import json
6
6
  import logging
7
+ import math
7
8
  import statistics
8
9
 
9
10
  import docling_ibm_models.tableformer.settings as s
@@ -403,45 +404,63 @@ class MatchingPostProcessor:
403
404
  # Push horizontally
404
405
  if x1_min < x2_min:
405
406
  # Move box1 to the left and box2 to the right
406
- box1["bbox"][2] -= overlap_x
407
- box2["bbox"][0] += overlap_x
407
+ box1["bbox"][2] -= math.ceil(overlap_x / 2) + 2
408
+ box2["bbox"][0] += math.floor(overlap_x / 2)
408
409
  else:
409
410
  # Move box2 to the left and box1 to the right
410
- box2["bbox"][2] -= overlap_x
411
- box1["bbox"][0] += overlap_x
411
+ box2["bbox"][2] -= math.ceil(overlap_x / 2) + 2
412
+ box1["bbox"][0] += math.floor(overlap_x / 2)
412
413
  else:
413
414
  # Push vertically
414
415
  if y1_min < y2_min:
415
416
  # Move box1 up and box2 down
416
- box1["bbox"][3] -= overlap_y
417
- box2["bbox"][1] += overlap_y
417
+ box1["bbox"][3] -= math.ceil(overlap_y / 2) + 2
418
+ box2["bbox"][1] += math.floor(overlap_y / 2)
418
419
  else:
419
420
  # Move box2 up and box1 down
420
- box2["bbox"][3] -= overlap_y
421
- box1["bbox"][1] += overlap_y
421
+ box2["bbox"][3] -= math.ceil(overlap_y / 2) + 2
422
+ box1["bbox"][1] += math.floor(overlap_y / 2)
423
+
424
+ # Restore the proper coordinate order if the previous operations reversed it
425
+ box1["bbox"] = [
426
+ min(box1["bbox"][0], box1["bbox"][2]),
427
+ min(box1["bbox"][1], box1["bbox"][3]),
428
+ max(box1["bbox"][0], box1["bbox"][2]),
429
+ max(box1["bbox"][1], box1["bbox"][3]),
430
+ ]
431
+ box2["bbox"] = [
432
+ min(box2["bbox"][0], box2["bbox"][2]),
433
+ min(box2["bbox"][1], box2["bbox"][3]),
434
+ max(box2["bbox"][0], box2["bbox"][2]),
435
+ max(box2["bbox"][1], box2["bbox"][3]),
436
+ ]
422
437
 
423
438
  return box1, box2
424
439
 
425
440
  def do_boxes_overlap(box1, box2):
426
- # print("{} - {}".format(box1["bbox"], box2["bbox"]))
427
- # Extract coordinates from the bounding boxes
428
- x1_min, y1_min, x1_max, y1_max = box1["bbox"]
429
- x2_min, y2_min, x2_max, y2_max = box2["bbox"]
430
- # Check if one box is to the left of the other
431
- if x1_max < x2_min or x2_max < x1_min:
441
+ B1 = box1["bbox"]
442
+ B2 = box2["bbox"]
443
+ if (
444
+ (B1[0] >= B2[2])
445
+ or (B1[2] <= B2[0])
446
+ or (B1[3] <= B2[1])
447
+ or (B1[1] >= B2[3])
448
+ ):
432
449
  return False
433
- # Check if one box is above the other
434
- if y1_max < y2_min or y2_max < y1_min:
435
- return False
436
- return True
450
+ else:
451
+ return True
437
452
 
438
453
  def find_overlapping_pairs_indexes(bboxes):
439
454
  overlapping_indexes = []
440
455
  # Compare each box with every other box (combinations)
441
456
  for i in range(len(bboxes)):
442
457
  for j in range(i + 1, len(bboxes)):
443
- if do_boxes_overlap(bboxes[i], bboxes[j]):
444
- bboxes[i], bboxes[j] = correct_overlap(bboxes[i], bboxes[j])
458
+ if i != j:
459
+ if bboxes[i] != bboxes[j]:
460
+ if do_boxes_overlap(bboxes[i], bboxes[j]):
461
+ bboxes[i], bboxes[j] = correct_overlap(
462
+ bboxes[i], bboxes[j]
463
+ )
445
464
 
446
465
  return overlapping_indexes, bboxes
447
466
 
@@ -1144,7 +1163,7 @@ class MatchingPostProcessor:
1144
1163
  new_pdf_cells.append(pdf_cells[i])
1145
1164
  return new_pdf_cells
1146
1165
 
1147
- def process(self, matching_details):
1166
+ def process(self, matching_details, correct_overlapping_cells=False):
1148
1167
  r"""
1149
1168
  Do post processing, see details in the comments below
1150
1169
 
@@ -1348,9 +1367,10 @@ class MatchingPostProcessor:
1348
1367
  table_cells_wo = po2
1349
1368
  max_cell_id = po3
1350
1369
 
1351
- # As the last step - correct cell bboxes in a way that they don't overlap:
1352
- if len(table_cells_wo) <= 300: # For performance reasons
1353
- table_cells_wo = self._find_overlapping(table_cells_wo)
1370
+ if correct_overlapping_cells:
1371
+ # As the last step - correct cell bboxes in a way that they don't overlap:
1372
+ if len(table_cells_wo) <= 300: # For performance reasons
1373
+ table_cells_wo = self._find_overlapping(table_cells_wo)
1354
1374
 
1355
1375
  self._log().debug("*** final_matches_wo")
1356
1376
  self._log().debug(final_matches_wo)
@@ -523,8 +523,9 @@ class TFPredictor:
523
523
  # return the resized image
524
524
  return resized, sf
525
525
 
526
- def multi_table_predict(self, iocr_page, table_bboxes, do_matching=True):
527
- # def multi_table_predict(self, iocr_page, page_image, table_bboxes):
526
+ def multi_table_predict(
527
+ self, iocr_page, table_bboxes, do_matching=True, correct_overlapping_cells=False
528
+ ):
528
529
  multi_tf_output = []
529
530
  page_image = iocr_page["image"]
530
531
 
@@ -546,7 +547,12 @@ class TFPredictor:
546
547
  # Predict
547
548
  if do_matching:
548
549
  tf_responses, predict_details = self.predict(
549
- iocr_page, table_bbox, table_image, scale_factor, None
550
+ iocr_page,
551
+ table_bbox,
552
+ table_image,
553
+ scale_factor,
554
+ None,
555
+ correct_overlapping_cells,
550
556
  )
551
557
  else:
552
558
  tf_responses, predict_details = self.predict_dummy(
@@ -733,7 +739,13 @@ class TFPredictor:
733
739
  return tf_output, matching_details
734
740
 
735
741
  def predict(
736
- self, iocr_page, table_bbox, table_image, scale_factor, eval_res_preds=None
742
+ self,
743
+ iocr_page,
744
+ table_bbox,
745
+ table_image,
746
+ scale_factor,
747
+ eval_res_preds=None,
748
+ correct_overlapping_cells=False,
737
749
  ):
738
750
  r"""
739
751
  Predict the table out of an image in memory
@@ -744,6 +756,8 @@ class TFPredictor:
744
756
  Docling provided table data
745
757
  eval_res_preds : dict
746
758
  Ready predictions provided by the evaluation results
759
+ correct_overlapping_cells : boolean
760
+ Enables or disables the last post-processing step, which fixes cell bboxes to remove overlaps
747
761
 
748
762
  Returns
749
763
  -------
@@ -834,7 +848,9 @@ class TFPredictor:
834
848
  ): # There are at least some pdf cells to match with
835
849
  if self.enable_post_process:
836
850
  AggProfiler().begin("post_process", self._prof)
837
- matching_details = self._post_processor.process(matching_details)
851
+ matching_details = self._post_processor.process(
852
+ matching_details, correct_overlapping_cells
853
+ )
838
854
  AggProfiler().end("post_process", self._prof)
839
855
 
840
856
  # Generate the expected Docling responses
@@ -157,7 +157,12 @@ class BBoxDecoder(nn.Module):
157
157
  predictions_classes.append(self._class_embed(h))
158
158
  if len(predictions_bboxes) > 0:
159
159
  predictions_bboxes = torch.stack([x[0] for x in predictions_bboxes])
160
+ else:
161
+ predictions_bboxes = torch.empty(0)
162
+
160
163
  if len(predictions_classes) > 0:
161
164
  predictions_classes = torch.stack([x[0] for x in predictions_classes])
165
+ else:
166
+ predictions_classes = torch.empty(0)
162
167
 
163
168
  return predictions_classes, predictions_bboxes
@@ -149,11 +149,11 @@ class Tag_Transformer(nn.Module):
149
149
  self._positional_encoding = PositionalEncoding(embed_dim)
150
150
  self._td_encode = td_encode
151
151
 
152
+ encoder_layer = nn.TransformerEncoderLayer(
153
+ d_model=embed_dim, nhead=n_heads, dim_feedforward=dim_ff
154
+ )
152
155
  self._encoder = nn.TransformerEncoder(
153
- nn.TransformerEncoderLayer(
154
- d_model=embed_dim, nhead=n_heads, dim_feedforward=dim_ff
155
- ),
156
- num_layers=encoder_layers,
156
+ encoder_layer, num_layers=encoder_layers, enable_nested_tensor=False
157
157
  )
158
158
 
159
159
  self._decoder = TMTransformerDecoder(
@@ -123,6 +123,9 @@ def otsl_check_right(rs_split, x, y):
123
123
 
124
124
 
125
125
  def otsl_to_html(rs_list, logdebug):
126
+ if len(rs_list) == 0:
127
+ return []
128
+
126
129
  if rs_list[0] not in ["fcel", "ched", "rhed", "srow", "ecel"]:
127
130
  # Most likely already HTML...
128
131
  return rs_list
@@ -6,6 +6,8 @@ import time
6
6
  from collections import deque
7
7
  from statistics import mean, median
8
8
 
9
+ from docling_ibm_models.tableformer.utils.mem_monitor import MemMonitor
10
+
9
11
 
10
12
  class SingletonClass(type):
11
13
  r"""
@@ -37,11 +39,13 @@ class Profiler:
37
39
  def __init__(self):
38
40
  self._section_dts = {} # section name -> sum(section intervals)
39
41
  self._section_calls = {} # section name -> number of invocations
40
- self._section_kB = {} # section name -> max kB of used heap
42
+ self._section_kB = {} # section name -> max kB of used heap (resident set size)
41
43
 
42
44
  # section name -> beginning of the last interval
43
45
  self._last_begin = {}
44
46
 
47
+ self._mem_monitor = MemMonitor()
48
+
45
49
  def begin(self, section_name, enable=True):
46
50
  r"""
47
51
  Mark the beginning of an interval
@@ -83,13 +87,20 @@ class Profiler:
83
87
  if section_name not in self._last_begin:
84
88
  return False
85
89
 
90
+ # Get memory
91
+ kB = self._mem_monitor.get_memory()
92
+ if isinstance(kB, dict):
93
+ kB = kB["resident"]
94
+
86
95
  dt = time.time() - self._last_begin[section_name]
87
96
  if section_name not in self._section_dts:
88
97
  self._section_dts[section_name] = dt
89
98
  self._section_calls[section_name] = 1
99
+ self._section_kB[section_name] = kB
90
100
  else:
91
101
  self._section_dts[section_name] += dt
92
102
  self._section_calls[section_name] += 1
103
+ self._section_kB[section_name] = max(kB, self._section_kB[section_name])
93
104
 
94
105
  return True
95
106
 
@@ -0,0 +1,175 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+ import os
6
+ import platform
7
+ import re
8
+
9
+
10
+ class MemMonitor:
11
+ r"""
12
+ Memory monitor for Linux
13
+
14
+ It supports 2 approaches for extracting memory information:
15
+ - linux-native: It parses the `/proc` pseudo-files. It is available only for Linux
16
+ - psutil: Use the `psutil` library
17
+
18
+ ## Linux-Native approach
19
+
20
+ The linux-native approach implements 2 methods to extract the memory fields:
21
+
22
+ 1. The `get_memory()` method:
23
+
24
+ - It is very fast
25
+ - It parses the `/proc/<pid>/statm` pseudo-file
26
+ - It contains the following fields:
27
+ size (1) total program size
28
+ (same as VmSize in /proc/[pid]/status)
29
+ resident (2) resident set size
30
+ (same as VmRSS in /proc/[pid]/status)
31
+ shared (3) number of resident shared pages (i.e., backed by a file)
32
+ (same as RssFile+RssShmem in /proc/[pid]/status)
33
+ text (4) text (code)
34
+ lib (5) library (unused since Linux 2.6; always 0)
35
+ data (6) data + stack
36
+ dt (7) dirty pages (unused since Linux 2.6; always 0)
37
+
38
+
39
+ 2. The `get_memory_full()` method:
40
+
41
+ - It is slower to parse but contains more detailed information
42
+ - It uses regex to parse the `/proc/<pid>/status` pseudo-file
43
+ - It contains the following fields:
44
+ VmPeak: Peak virtual memory size.
45
+ VmSize: Virtual memory size.
46
+ VmLck: Locked memory size (see mlock(2)).
47
+ VmPin: Pinned memory size (since Linux 3.2). These are pages that can't be moved because
48
+ something needs to directly access physical memory.
49
+ VmHWM: Peak resident set size ("high water mark").
50
+ VmRSS: Resident set size. Note that the value here is the sum of RssAnon, RssFile, and
51
+ RssShmem.
52
+ RssAnon: Size of resident anonymous memory. (since Linux 4.5).
53
+ RssFile: Size of resident file mappings. (since Linux 4.5).
54
+ RssShmem: Size of resident shared memory (includes System V shared memory, mappings from
55
+ tmpfs(5), and shared anonymous mappings). (since Linux 4.5).
56
+ VmData, VmStk, VmExe: Size of data, stack, and text segments.
57
+ VmLib: Shared library code size.
58
+ VmPTE: Page table entries size (since Linux 2.6.10).
59
+ VmPMD: Size of second-level page tables (added in Linux 4.0; removed in Linux 4.15).
60
+ VmSwap: Swapped-out virtual memory size by anonymous private pages; shmem swap usage is
61
+ not included (since Linux 2.6.34).
62
+
63
+
64
+ ## The psutil library
65
+
66
+ - Apparently the psutil library parses the `/proc/<pid>/statm`
67
+ - The memory_info() function returns the fields: rss, vms, shared, text, lib, data, dirty
68
+
69
+
70
+ ## Field mappings
71
+
72
+ These are the fields returned by psutil memory_info() and their mapping in the /proc files:
73
+ (I put ? when I am not 100% sure about the mapping)
74
+
75
+ | psutil | /proc/$$/status | /proc/$$/statm |
76
+ |---------|--------------------|----------------|
77
+ | rss | VmRSS | resident |
78
+ | vms | VmSize | size |
79
+ | shared | RssFile + RssShmem | shared |
80
+ | text | VmExe ? | text |
81
+ | lib | RssShmem ? | lib |
82
+ | data | VmData + VmStk | data |
83
+ | dirty | VmSwap ? | dt |
84
+
85
+ """
86
+
87
+ def __init__(self, enable=True):
88
+ self._enable = enable
89
+ self._pid = os.getpid()
90
+
91
+ # Create regex for each memory field of the /proc/status pseudo-file
92
+ self._status_fields = [
93
+ "VmPeak",
94
+ "VmSize",
95
+ "VmLck",
96
+ "VmPin",
97
+ "VmHWM",
98
+ "VmRSS",
99
+ "RssAnon",
100
+ "RssFile",
101
+ "RssShmem",
102
+ "VmData",
103
+ "VmStk",
104
+ "VmExe",
105
+ "VmLib",
106
+ "VmPTE",
107
+ "VmPMD",
108
+ "VmSwap",
109
+ ]
110
+ self._status_regex = {}
111
+ for mem_field in self._status_fields:
112
+ regex_str = r"({}:)(\s+)(\d*)(.*)".format(mem_field)
113
+ self._status_regex[mem_field] = re.compile(regex_str)
114
+
115
+ def get_memory_full(self) -> dict:
116
+ r"""
117
+ - Parse /proc/<pid>/status to get all memory info.
118
+ - The method returns a dict with the fields self._status_fields
119
+ - This method is SLOW. Unless you need the full memory info, better to use `get_memory`
120
+
121
+ The returned values are in kB
122
+ """
123
+ if not self._enable:
124
+ return -2
125
+ if platform.system() != "Linux":
126
+ return -1
127
+ pid_fn = "/proc/{}/status".format(self._pid)
128
+
129
+ # Dict to collect all memory fields
130
+ memory = {}
131
+ with open(pid_fn, "r") as fn:
132
+ for ll in fn:
133
+ for mem_field in self._status_fields:
134
+ regex = self._status_regex[mem_field]
135
+ m = regex.match(ll)
136
+ if m is not None:
137
+ memory[mem_field] = int(m.group(3))
138
+ if len(memory) == len(self._status_fields):
139
+ break
140
+
141
+ return memory
142
+
143
+ def get_memory(self) -> dict:
144
+ r"""
145
+ - Parse /proc/<pid>/statm to get the most important memory fields
146
+ - This is a fast implementation.
147
+ - The method returns a dict with the fields:
148
+ "size", "resident", "shared", "text", "lib", "data", "dt"
149
+ - Check the documentation at the top for a mapping across the various fields
150
+
151
+ The returned values are in kB
152
+ """
153
+ if not self._enable:
154
+ return -2
155
+ if platform.system() != "Linux":
156
+ return -1
157
+ pid_fn = "/proc/{}/statm".format(self._pid)
158
+
159
+ # Dict to collect all memory fields
160
+ memory = {}
161
+ with open(pid_fn, "r") as fn:
162
+ ll = fn.read()
163
+ # The values are in pages
164
+ # Each page is 4096 bytes (4kB)
165
+ data = [int(x) << 2 for x in ll.split(" ")]
166
+ memory = {
167
+ "size": data[0],
168
+ "resident": data[1],
169
+ "shared": data[2],
170
+ "text": data[3],
171
+ "lib": data[4],
172
+ "data": data[5],
173
+ "dt": data[6],
174
+ }
175
+ return memory
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-ibm-models"
3
- version = "1.1.6" # DO NOT EDIT, updated automatically
3
+ version = "1.2.0" # DO NOT EDIT, updated automatically
4
4
  description = "This package contains the AI models used by the Docling PDF conversion package"
5
5
  authors = ["Nikos Livathinos <nli@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -1,175 +0,0 @@
1
- #
2
- # Copyright IBM Corp. 2024 - 2024
3
- # SPDX-License-Identifier: MIT
4
- #
5
- import logging
6
-
7
- import numpy as np
8
-
9
- import docling_ibm_models.tableformer.settings as s
10
-
11
- LOG_LEVEL = logging.INFO
12
-
13
-
14
- class MyWelford:
15
- r"""
16
- Running computation of the sample mean and sample variance using Welford's algorithm
17
- """
18
-
19
- def __init__(self):
20
- self._i = 0 # Running index
21
- self._m = 0 # Running mean
22
- self._s = 0 # (n - 1) * variance
23
-
24
- def reset(self):
25
- r"""
26
- Reset the object
27
- """
28
- self._i = 0
29
- self._m = 0
30
- self._s = 0
31
-
32
- def add(self, xi):
33
- r"""
34
- Invoke add each time a new sample arrives
35
-
36
- Inputs:
37
- xi: The next sample of data
38
- """
39
- self._i += 1
40
- old_m = self._m
41
- self._m = self._m + (xi - self._m) / self._i
42
- self._s = self._s + (xi - self._m) * (xi - old_m)
43
-
44
- def results(self):
45
- r"""
46
- Get the computed mean, variance and standard deviation up to now
47
-
48
- Outputs:
49
- m: Sample mean
50
- v: Sample variance
51
- std: Sample standard deviation
52
- """
53
- if self._i <= 1:
54
- return None, None, None
55
-
56
- # v = self._s / (self._i - 1) # Sample variance
57
- v = self._s / (self._i) # Population variance
58
- std = np.sqrt(v)
59
- return self._m, v, std
60
-
61
-
62
- class MyWelfordImg(MyWelford):
63
- r"""
64
- Welford algorithm to calculate running mean and sample variance for images
65
- """
66
-
67
- def __init__(self):
68
- super(MyWelfordImg, self).__init__()
69
-
70
- def add(self, img):
71
- r"""
72
- Input:
73
- img: An image numpy array (channel, width, height). The only requirement is to have the
74
- channels as the first dimension and have 3 dimensions in total
75
- """
76
- channels = img.shape[0]
77
- flat_dim = img.shape[1] * img.shape[2]
78
- img_r = img.reshape(channels, flat_dim)
79
-
80
- for i in range(flat_dim):
81
- super(MyWelfordImg, self).add(img_r[:, i])
82
-
83
-
84
- class ChanVarianceImg:
85
- r"""
86
- Chan's algorithm to compute a running variance with support of sub-samples
87
- In this implementation each sub-sample is an images
88
-
89
- Math for the original paper:
90
- https://github.ibm.com/nli/variance_formulae
91
- """
92
-
93
- def __init__(self):
94
- r""" """
95
- self._first = True
96
- # Size of the calculated dataset
97
- self._n = 0
98
- # Sum of the samples for the 3 image channels
99
- self._t = 0
100
- # Sum of the square differences of the deviations of the samples from the mean
101
- self._s = 0
102
-
103
- def add(self, img):
104
- r"""
105
- Add the provided image to the computation of the dataset statistics
106
-
107
- Input:
108
- img: An image numpy array (channel, width, height). The only requirement is to have the
109
- channels as the first dimension and have 3 dimensions in total
110
- """
111
- ch = img.shape[0]
112
- n = img.shape[1] * img.shape[2]
113
- img = img.reshape(ch, n)
114
- img_t = img.sum(axis=1)
115
- img_t_v = img_t.reshape(ch, 1)
116
- diff = (img - (img_t_v / n)) ** 2
117
- img_s = diff.sum(axis=1)
118
-
119
- if not self._first:
120
- c = (self._n / (n * (self._n + n))) * (
121
- ((n / self._n) * self._t - img_t) ** 2
122
- )
123
- self._s += img_s + c
124
- self._t += img_t
125
- else:
126
- self._s = img_s
127
- self._t = img_t
128
- self._first = False
129
- self._n += n
130
-
131
- def results(self):
132
- r"""
133
- Get the computed statistics
134
-
135
- Output:
136
- mean: Mean for the complete dataset
137
- var: Population variance for the complete dataset
138
- std: Population standard deviation for the complete dataset
139
- """
140
- mean = list(self._t / self._n)
141
- var = list(self._s / self._n) # Population variance
142
- std = list(np.sqrt(var))
143
-
144
- return mean, var, std
145
-
146
- def reset(self):
147
- r"""
148
- Reset the object to start over again
149
- """
150
- self._n = 0
151
- self._t = 0
152
- self._s = 0
153
- self._first = True
154
-
155
-
156
- if __name__ == "__main__":
157
- logger = s.get_custom_logger("variance", LOG_LEVEL)
158
-
159
- n = 50000
160
- channels = 3
161
- width = 448
162
- height = 448
163
-
164
- my = ChanVarianceImg()
165
- # Generate random images
166
- for i in range(n):
167
- logger.info(i)
168
- img = 255 * np.random.rand(channels, width, height)
169
- my.add(img)
170
-
171
- # Calculate the statistics
172
- m, v, std = my.results()
173
- assert m.shape == (3,), "Wrong mean dimension"
174
- assert v.shape == (3,), "Wrong variance dimension"
175
- assert std.shape == (3,), "Wrong std dimension"