docling-ibm-models 1.1.7__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/LICENSE +1 -1
  2. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/PKG-INFO +1 -1
  3. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/layoutmodel/layout_predictor.py +33 -25
  4. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py +4 -4
  5. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/utils/app_profiler.py +12 -1
  6. docling_ibm_models-1.2.0/docling_ibm_models/tableformer/utils/mem_monitor.py +175 -0
  7. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/pyproject.toml +1 -1
  8. docling_ibm_models-1.1.7/docling_ibm_models/tableformer/utils/variance.py +0 -175
  9. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/README.md +0 -0
  10. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/__init__.py +0 -0
  11. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/common.py +0 -0
  12. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/__init__.py +0 -0
  13. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/data_transformer.py +0 -0
  14. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/functional.py +0 -0
  15. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/matching_post_processor.py +0 -0
  16. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/tf_cell_matcher.py +0 -0
  17. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/tf_dataset.py +0 -0
  18. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/tf_predictor.py +0 -0
  19. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/data_management/transforms.py +0 -0
  20. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/__init__.py +0 -0
  21. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/common/__init__.py +0 -0
  22. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/common/base_model.py +0 -0
  23. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/__init__.py +0 -0
  24. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/bbox_decoder_rs.py +0 -0
  25. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/encoder04_rs.py +0 -0
  26. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/models/table04_rs/tablemodel04_rs.py +0 -0
  27. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/otsl.py +0 -0
  28. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/settings.py +0 -0
  29. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/test_dataset_cache.py +0 -0
  30. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/test_prepare_image.py +0 -0
  31. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/utils/__init__.py +0 -0
  32. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/utils/torch_utils.py +0 -0
  33. {docling_ibm_models-1.1.7 → docling_ibm_models-1.2.0}/docling_ibm_models/tableformer/utils/utils.py +0 -0
LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) [year] [fullname]
+Copyright (c) 2024 International Business Machines
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-ibm-models
-Version: 1.1.7
+Version: 1.2.0
 Summary: This package contains the AI models used by the Docling PDF conversion package
 License: MIT
 Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
docling_ibm_models/layoutmodel/layout_predictor.py
@@ -14,29 +14,6 @@ MODEL_CHECKPOINT_FN = "model.pt"
 DEFAULT_NUM_THREADS = 4
 
 
-# Classes:
-CLASSES_MAP = {
-    0: "background",
-    1: "Caption",
-    2: "Footnote",
-    3: "Formula",
-    4: "List-item",
-    5: "Page-footer",
-    6: "Page-header",
-    7: "Picture",
-    8: "Section-header",
-    9: "Table",
-    10: "Text",
-    11: "Title",
-    12: "Document Index",
-    13: "Code",
-    14: "Checkbox-Selected",
-    15: "Checkbox-Unselected",
-    16: "Form",
-    17: "Key-Value Region",
-}
-
-
 class LayoutPredictor:
     r"""
     Document layout prediction using ONNX
@@ -69,6 +46,31 @@ class LayoutPredictor:
         ------
         FileNotFoundError when the model's ONNX file is missing
         """
+        # Initialize classes map:
+        self._classes_map = {
+            0: "background",
+            1: "Caption",
+            2: "Footnote",
+            3: "Formula",
+            4: "List-item",
+            5: "Page-footer",
+            6: "Page-header",
+            7: "Picture",
+            8: "Section-header",
+            9: "Table",
+            10: "Text",
+            11: "Title",
+            12: "Document Index",
+            13: "Code",
+            14: "Checkbox-Selected",
+            15: "Checkbox-Unselected",
+            16: "Form",
+            17: "Key-Value Region",
+        }
+
+        # Blacklisted classes
+        self._black_classes = set(["Form", "Key-Value Region"])
+
         # Set basic params
         self._threshold = 0.6  # Score threshold
         self._image_size = 640
@@ -159,13 +161,19 @@ class LayoutPredictor:
         )
 
         # Yield output
-        for label, box, score in zip(labels[0], boxes[0], scores[0]):
+        for label_idx, box, score in zip(labels[0], boxes[0], scores[0]):
+            # Filter out blacklisted classes
+            label = self._classes_map[label_idx]
+            if label in self._black_classes:
+                continue
+
+            # Check against threshold
             if score > self._threshold:
                 yield {
                     "l": box[0] / self._image_size * w,
                     "t": box[1] / self._image_size * h,
                     "r": box[2] / self._image_size * w,
                     "b": box[3] / self._image_size * h,
-                    "label": CLASSES_MAP[label],
+                    "label": label,
                     "confidence": score,
                 }
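
The two hunks above move the class map from a module-level constant into the predictor instance and add a blacklist that is checked before the score threshold. A minimal sketch of the same filtering logic, with made-up labels, boxes, and scores standing in for the ONNX model output (none of the variable names below are part of the package API):

```python
# Hypothetical predictions: (label_idx, box in model coordinates, score)
classes_map = {7: "Picture", 9: "Table", 16: "Form", 17: "Key-Value Region"}
black_classes = {"Form", "Key-Value Region"}
threshold = 0.6
image_size, w, h = 640, 1275, 1650  # model input size and page size, illustrative

predictions = [
    (9, (64, 64, 576, 320), 0.91),    # kept
    (16, (0, 0, 640, 640), 0.88),     # "Form" -> blacklisted, dropped
    (7, (100, 400, 300, 600), 0.42),  # below threshold, dropped
]

for label_idx, box, score in predictions:
    label = classes_map[label_idx]
    if label in black_classes:
        continue
    if score > threshold:
        # Rescale from model coordinates to page coordinates
        print({
            "l": box[0] / image_size * w,
            "t": box[1] / image_size * h,
            "r": box[2] / image_size * w,
            "b": box[3] / image_size * h,
            "label": label,
            "confidence": score,
        })
```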
docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py
@@ -149,11 +149,11 @@ class Tag_Transformer(nn.Module):
         self._positional_encoding = PositionalEncoding(embed_dim)
         self._td_encode = td_encode
 
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=embed_dim, nhead=n_heads, dim_feedforward=dim_ff
+        )
         self._encoder = nn.TransformerEncoder(
-            nn.TransformerEncoderLayer(
-                d_model=embed_dim, nhead=n_heads, dim_feedforward=dim_ff
-            ),
-            num_layers=encoder_layers,
+            encoder_layer, num_layers=encoder_layers, enable_nested_tensor=False
         )
 
         self._decoder = TMTransformerDecoder(
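
The refactor above is behavior-preserving apart from one flag: the encoder layer is hoisted into a local variable so that `nn.TransformerEncoder` can be constructed with `enable_nested_tensor=False`, which opts out of PyTorch's nested-tensor fast path. A self-contained sketch with illustrative dimensions (the real values come from the TableFormer config):

```python
import torch
import torch.nn as nn

embed_dim, n_heads, dim_ff, encoder_layers = 512, 4, 1024, 4  # illustrative

encoder_layer = nn.TransformerEncoderLayer(
    d_model=embed_dim, nhead=n_heads, dim_feedforward=dim_ff
)
encoder = nn.TransformerEncoder(
    encoder_layer, num_layers=encoder_layers, enable_nested_tensor=False
)

x = torch.randn(10, 2, embed_dim)  # (sequence, batch, embed_dim)
print(encoder(x).shape)  # torch.Size([10, 2, 512])
```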
docling_ibm_models/tableformer/utils/app_profiler.py
@@ -6,6 +6,8 @@ import time
 from collections import deque
 from statistics import mean, median
 
+from docling_ibm_models.tableformer.utils.mem_monitor import MemMonitor
+
 
 class SingletonClass(type):
     r"""
@@ -37,11 +39,13 @@ class Profiler:
     def __init__(self):
         self._section_dts = {}  # section name -> sum(section intervals)
         self._section_calls = {}  # section name -> number of invocations
-        self._section_kB = {}  # section name -> max kB of used heap
+        self._section_kB = {}  # section name -> max kB of used heap (resident set size)
 
         # section name -> beginning of the last interval
         self._last_begin = {}
 
+        self._mem_monitor = MemMonitor()
+
     def begin(self, section_name, enable=True):
         r"""
         Mark the beginning of an interval
@@ -83,13 +87,20 @@ class Profiler:
         if section_name not in self._last_begin:
             return False
 
+        # Get memory
+        kB = self._mem_monitor.get_memory()
+        if isinstance(kB, dict):
+            kB = kB["resident"]
+
         dt = time.time() - self._last_begin[section_name]
         if section_name not in self._section_dts:
             self._section_dts[section_name] = dt
             self._section_calls[section_name] = 1
+            self._section_kB[section_name] = kB
         else:
             self._section_dts[section_name] += dt
             self._section_calls[section_name] += 1
+            self._section_kB[section_name] = max(kB, self._section_kB[section_name])
 
         return True
 
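With these changes every profiled section also tracks the peak resident set size observed when the section closes. A hedged usage sketch, assuming the patched method above is the `end()` counterpart to the `begin()` shown earlier:

```python
import time

from docling_ibm_models.tableformer.utils.app_profiler import Profiler

profiler = Profiler()
profiler.begin("inference")
time.sleep(0.1)  # stand-in for real work
profiler.end("inference")  # assumption: the hunk above patches Profiler.end()
# After end(), _section_kB["inference"] holds the max resident kB seen so far.
```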
docling_ibm_models/tableformer/utils/mem_monitor.py (new file)
@@ -0,0 +1,175 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+import os
+import platform
+import re
+
+
+class MemMonitor:
+    r"""
+    Memory monitor for Linux
+
+    It supports 2 approaches for extracting memory information:
+    - linux-native: It parses the `/proc` pseudo-files. It is available only on Linux.
+    - psutil: Use the `psutil` library
+
+    ## Linux-native approach
+
+    The linux-native approach implements 2 methods to extract the memory fields:
+
+    1. The `get_memory()` method:
+
+    - It is very fast
+    - It parses the `/proc/<pid>/statm` pseudo-file
+    - It contains the following fields:
+        size       (1) total program size
+                   (same as VmSize in /proc/[pid]/status)
+        resident   (2) resident set size
+                   (same as VmRSS in /proc/[pid]/status)
+        shared     (3) number of resident shared pages (i.e., backed by a file)
+                   (same as RssFile+RssShmem in /proc/[pid]/status)
+        text       (4) text (code)
+        lib        (5) library (unused since Linux 2.6; always 0)
+        data       (6) data + stack
+        dt         (7) dirty pages (unused since Linux 2.6; always 0)
+
+    2. The `get_memory_full()` method:
+
+    - It is slower to parse but contains more detailed information
+    - It uses regex to parse the `/proc/<pid>/status` pseudo-file
+    - It contains the following fields:
+        VmPeak: Peak virtual memory size.
+        VmSize: Virtual memory size.
+        VmLck: Locked memory size (see mlock(2)).
+        VmPin: Pinned memory size (since Linux 3.2). These are pages that can't be moved
+               because something needs to directly access physical memory.
+        VmHWM: Peak resident set size ("high water mark").
+        VmRSS: Resident set size. Note that the value here is the sum of RssAnon, RssFile,
+               and RssShmem.
+        RssAnon: Size of resident anonymous memory (since Linux 4.5).
+        RssFile: Size of resident file mappings (since Linux 4.5).
+        RssShmem: Size of resident shared memory (includes System V shared memory, mappings
+                  from tmpfs(5), and shared anonymous mappings) (since Linux 4.5).
+        VmData, VmStk, VmExe: Size of data, stack, and text segments.
+        VmLib: Shared library code size.
+        VmPTE: Page table entries size (since Linux 2.6.10).
+        VmPMD: Size of second-level page tables (added in Linux 4.0; removed in Linux 4.15).
+        VmSwap: Swapped-out virtual memory size by anonymous private pages; shmem swap
+                usage is not included (since Linux 2.6.34).
+
+    ## The psutil library
+
+    - Apparently the psutil library parses `/proc/<pid>/statm`
+    - The memory_info() function returns the fields: rss, vms, shared, text, lib, data, dirty
+
+    ## Field mappings
+
+    These are the fields returned by psutil memory_info() and their mapping in the /proc
+    files (a ? marks mappings I am not 100% sure about):
+
+    | psutil  | /proc/$$/status    | /proc/$$/statm |
+    |---------|--------------------|----------------|
+    | rss     | VmRSS              | resident       |
+    | vms     | VmSize             | size           |
+    | shared  | RssFile + RssShmem | shared         |
+    | text    | VmExe ?            | text           |
+    | lib     | RssShmem ?         | lib            |
+    | data    | VmData + VmStk     | data           |
+    | dirty   | VmSwap ?           | dt             |
+    """
+
+    def __init__(self, enable=True):
+        self._enable = enable
+        self._pid = os.getpid()
+
+        # Create a regex for each memory field of the /proc/status pseudo-file
+        self._status_fields = [
+            "VmPeak",
+            "VmSize",
+            "VmLck",
+            "VmPin",
+            "VmHWM",
+            "VmRSS",
+            "RssAnon",
+            "RssFile",
+            "RssShmem",
+            "VmData",
+            "VmStk",
+            "VmExe",
+            "VmLib",
+            "VmPTE",
+            "VmPMD",
+            "VmSwap",
+        ]
+        self._status_regex = {}
+        for mem_field in self._status_fields:
+            regex_str = r"({}:)(\s+)(\d*)(.*)".format(mem_field)
+            self._status_regex[mem_field] = re.compile(regex_str)
+
+    def get_memory_full(self) -> dict:
+        r"""
+        - Parse /proc/<pid>/status to get all memory info.
+        - The method returns a dict with the fields in self._status_fields
+        - This method is SLOW. Unless you need the full memory info, better to use `get_memory`
+
+        The returned values are in kB
+        """
+        if not self._enable:
+            return -2
+        if platform.system() != "Linux":
+            return -1
+        pid_fn = "/proc/{}/status".format(self._pid)
+
+        # Dict to collect all memory fields
+        memory = {}
+        with open(pid_fn, "r") as fn:
+            for ll in fn:
+                for mem_field in self._status_fields:
+                    regex = self._status_regex[mem_field]
+                    m = regex.match(ll)
+                    if m is not None:
+                        memory[mem_field] = int(m.group(3))
+                if len(memory) == len(self._status_fields):
+                    break
+
+        return memory
+
+    def get_memory(self) -> dict:
+        r"""
+        - Parse /proc/<pid>/statm to get the most important memory fields
+        - This is a fast implementation.
+        - The method returns a dict with the fields:
+          "size", "resident", "shared", "text", "lib", "data", "dt"
+        - Check the documentation at the top for a mapping across the various fields
+
+        The returned values are in kB
+        """
+        if not self._enable:
+            return -2
+        if platform.system() != "Linux":
+            return -1
+        pid_fn = "/proc/{}/statm".format(self._pid)
+
+        # Dict to collect all memory fields
+        memory = {}
+        with open(pid_fn, "r") as fn:
+            ll = fn.read()
+            # The values are in pages of 4096 bytes (4 kB); << 2 converts pages to kB
+            data = [int(x) << 2 for x in ll.split(" ")]
+            memory = {
+                "size": data[0],
+                "resident": data[1],
+                "shared": data[2],
+                "text": data[3],
+                "lib": data[4],
+                "data": data[5],
+                "dt": data[6],
+            }
+        return memory
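
A short usage sketch for the new module. Both methods are Linux-only: they return -1 on other platforms and -2 when monitoring is disabled, hence the `isinstance` guard (mirroring how the profiler consumes the result above):

```python
from docling_ibm_models.tableformer.utils.mem_monitor import MemMonitor

mm = MemMonitor()

mem = mm.get_memory()  # fast path: parses /proc/<pid>/statm
if isinstance(mem, dict):
    print("resident:", mem["resident"], "kB")

full = mm.get_memory_full()  # slow path: regex over /proc/<pid>/status
if isinstance(full, dict):
    print("peak RSS (VmHWM):", full["VmHWM"], "kB")
```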
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-ibm-models"
-version = "1.1.7"  # DO NOT EDIT, updated automatically
+version = "1.2.0"  # DO NOT EDIT, updated automatically
 description = "This package contains the AI models used by the Docling PDF conversion package"
 authors = ["Nikos Livathinos <nli@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
docling_ibm_models/tableformer/utils/variance.py (removed)
@@ -1,175 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-import logging
-
-import numpy as np
-
-import docling_ibm_models.tableformer.settings as s
-
-LOG_LEVEL = logging.INFO
-
-
-class MyWelford:
-    r"""
-    Running computation of the sample mean and sample variance using Welford's algorithm
-    """
-
-    def __init__(self):
-        self._i = 0  # Running index
-        self._m = 0  # Running mean
-        self._s = 0  # (n - 1) * variance
-
-    def reset(self):
-        r"""
-        Reset the object
-        """
-        self._i = 0
-        self._m = 0
-        self._s = 0
-
-    def add(self, xi):
-        r"""
-        Invoke add each time a new sample arrives
-
-        Inputs:
-            xi: The next sample of data
-        """
-        self._i += 1
-        old_m = self._m
-        self._m = self._m + (xi - self._m) / self._i
-        self._s = self._s + (xi - self._m) * (xi - old_m)
-
-    def results(self):
-        r"""
-        Get the computed mean, variance and standard deviation up to now
-
-        Outputs:
-            m: Sample mean
-            v: Sample variance
-            std: Sample standard deviation
-        """
-        if self._i <= 1:
-            return None, None, None
-
-        # v = self._s / (self._i - 1)  # Sample variance
-        v = self._s / (self._i)  # Population variance
-        std = np.sqrt(v)
-        return self._m, v, std
-
-
-class MyWelfordImg(MyWelford):
-    r"""
-    Welford algorithm to calculate running mean and sample variance for images
-    """
-
-    def __init__(self):
-        super(MyWelfordImg, self).__init__()
-
-    def add(self, img):
-        r"""
-        Input:
-            img: An image numpy array (channel, width, height). The only requirement is to
-                 have the channels as the first dimension and 3 dimensions in total
-        """
-        channels = img.shape[0]
-        flat_dim = img.shape[1] * img.shape[2]
-        img_r = img.reshape(channels, flat_dim)
-
-        for i in range(flat_dim):
-            super(MyWelfordImg, self).add(img_r[:, i])
-
-
-class ChanVarianceImg:
-    r"""
-    Chan's algorithm to compute a running variance with support for sub-samples.
-    In this implementation each sub-sample is an image.
-
-    Math from the original paper:
-    https://github.ibm.com/nli/variance_formulae
-    """
-
-    def __init__(self):
-        r""" """
-        self._first = True
-        # Size of the calculated dataset
-        self._n = 0
-        # Sum of the samples for the 3 image channels
-        self._t = 0
-        # Sum of the square differences of the deviations of the samples from the mean
-        self._s = 0
-
-    def add(self, img):
-        r"""
-        Add the provided image to the computation of the dataset statistics
-
-        Input:
-            img: An image numpy array (channel, width, height). The only requirement is to
-                 have the channels as the first dimension and 3 dimensions in total
-        """
-        ch = img.shape[0]
-        n = img.shape[1] * img.shape[2]
-        img = img.reshape(ch, n)
-        img_t = img.sum(axis=1)
-        img_t_v = img_t.reshape(ch, 1)
-        diff = (img - (img_t_v / n)) ** 2
-        img_s = diff.sum(axis=1)
-
-        if not self._first:
-            c = (self._n / (n * (self._n + n))) * (
-                ((n / self._n) * self._t - img_t) ** 2
-            )
-            self._s += img_s + c
-            self._t += img_t
-        else:
-            self._s = img_s
-            self._t = img_t
-            self._first = False
-        self._n += n
-
-    def results(self):
-        r"""
-        Get the computed statistics
-
-        Output:
-            mean: Mean for the complete dataset
-            var: Population variance for the complete dataset
-            std: Population standard deviation for the complete dataset
-        """
-        mean = list(self._t / self._n)
-        var = list(self._s / self._n)  # Population variance
-        std = list(np.sqrt(var))
-
-        return mean, var, std
-
-    def reset(self):
-        r"""
-        Reset the object to start over again
-        """
-        self._n = 0
-        self._t = 0
-        self._s = 0
-        self._first = True
-
-
-if __name__ == "__main__":
-    logger = s.get_custom_logger("variance", LOG_LEVEL)
-
-    n = 50000
-    channels = 3
-    width = 448
-    height = 448
-
-    my = ChanVarianceImg()
-    # Generate random images
-    for i in range(n):
-        logger.info(i)
-        img = 255 * np.random.rand(channels, width, height)
-        my.add(img)
-
-    # Calculate the statistics
-    m, v, std = my.results()
-    assert m.shape == (3,), "Wrong mean dimension"
-    assert v.shape == (3,), "Wrong variance dimension"
-    assert std.shape == (3,), "Wrong std dimension"
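
For reference, the Welford update removed here can be sanity-checked against numpy in a few lines (using the population variance `self._s / self._i`, matching the uncommented line in `results()` above):

```python
import numpy as np

xs = np.random.rand(1000)

i, m, s = 0, 0.0, 0.0
for x in xs:  # Welford's running update, as in the deleted MyWelford.add()
    i += 1
    old_m = m
    m += (x - m) / i
    s += (x - m) * (x - old_m)

assert np.isclose(m, xs.mean())
assert np.isclose(s / i, xs.var())  # population variance
```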