docling-ibm-models 2.0.8__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/PKG-INFO +3 -1
  2. docling_ibm_models-3.0.0/docling_ibm_models/layoutmodel/layout_predictor.py +175 -0
  3. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/data_management/tf_predictor.py +30 -35
  4. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/pyproject.toml +16 -1
  5. docling_ibm_models-2.0.8/docling_ibm_models/layoutmodel/layout_predictor.py +0 -167
  6. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/LICENSE +0 -0
  7. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/README.md +0 -0
  8. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/__init__.py +0 -0
  9. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/common.py +0 -0
  10. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/data_management/__init__.py +0 -0
  11. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/data_management/functional.py +0 -0
  12. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/data_management/matching_post_processor.py +0 -0
  13. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/data_management/tf_cell_matcher.py +0 -0
  14. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/data_management/transforms.py +0 -0
  15. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/models/__init__.py +0 -0
  16. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/models/common/__init__.py +0 -0
  17. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/models/common/base_model.py +0 -0
  18. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/models/table04_rs/__init__.py +0 -0
  19. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/models/table04_rs/bbox_decoder_rs.py +0 -0
  20. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/models/table04_rs/encoder04_rs.py +0 -0
  21. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/models/table04_rs/tablemodel04_rs.py +0 -0
  22. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py +0 -0
  23. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/otsl.py +0 -0
  24. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/settings.py +0 -0
  25. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/utils/__init__.py +0 -0
  26. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/utils/app_profiler.py +0 -0
  27. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/utils/mem_monitor.py +0 -0
  28. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/utils/torch_utils.py +0 -0
  29. {docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/utils/utils.py +0 -0

{docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-ibm-models
-Version: 2.0.8
+Version: 3.0.0
 Summary: This package contains the AI models used by the Docling PDF conversion package
 License: MIT
 Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
@@ -25,9 +25,11 @@ Requires-Dist: huggingface_hub (>=0.23,<1)
 Requires-Dist: jsonlines (>=3.1.0,<4.0.0)
 Requires-Dist: numpy (>=1.24.4,<3.0.0)
 Requires-Dist: opencv-python-headless (>=4.6.0.66,<5.0.0.0)
+Requires-Dist: safetensors[torch] (>=0.4.3,<1)
 Requires-Dist: torch (>=2.2.2,<3.0.0)
 Requires-Dist: torchvision (>=0,<1)
 Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+Requires-Dist: transformers (>=4.42.0,<5.0.0)
 Description-Content-Type: text/markdown
 
 [![PyPI version](https://img.shields.io/pypi/v/docling-ibm-models)](https://pypi.org/project/docling-ibm-models/)
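
The metadata changes above add two new runtime dependencies, safetensors[torch] and transformers, alongside the major version bump. A minimal sanity check that an installed environment satisfies the new pins (a hypothetical snippet, not part of the package):

    # Hypothetical check that the new 3.0.0 dependencies are installed and meet the pins.
    from importlib.metadata import version

    for pkg, minimum in (("transformers", "4.42.0"), ("safetensors", "0.4.3")):
        installed = version(pkg)  # raises PackageNotFoundError if the package is absent
        print(f"{pkg} {installed} (requires >= {minimum})")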

docling_ibm_models-3.0.0/docling_ibm_models/layoutmodel/layout_predictor.py (new file)

@@ -0,0 +1,175 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+import logging
+import os
+from collections.abc import Iterable
+from typing import Union
+
+import numpy as np
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
+
+_log = logging.getLogger(__name__)
+
+
+class LayoutPredictor:
+    """
+    Document layout prediction using safe tensors
+    """
+
+    def __init__(
+        self,
+        artifact_path: str,
+        device: str = "cpu",
+        num_threads: int = 4,
+    ):
+        """
+        Provide the artifact path that contains the LayoutModel file
+
+        Parameters
+        ----------
+        artifact_path: Path for the model torch file.
+        device: (Optional) device to run the inference.
+        num_threads: (Optional) Number of threads to run the inference if device = 'cpu'
+
+        Raises
+        ------
+        FileNotFoundError when the model's torch file is missing
+        """
+        # Initialize classes map:
+        self._classes_map = {
+            0: "background",
+            1: "Caption",
+            2: "Footnote",
+            3: "Formula",
+            4: "List-item",
+            5: "Page-footer",
+            6: "Page-header",
+            7: "Picture",
+            8: "Section-header",
+            9: "Table",
+            10: "Text",
+            11: "Title",
+            12: "Document Index",
+            13: "Code",
+            14: "Checkbox-Selected",
+            15: "Checkbox-Unselected",
+            16: "Form",
+            17: "Key-Value Region",
+        }
+
+        # Blacklisted classes
+        self._black_classes = set()  # ["Form", "Key-Value Region"])
+
+        # Set basic params
+        self._threshold = 0.3  # Score threshold
+        self._image_size = 640
+        self._size = np.asarray([[self._image_size, self._image_size]], dtype=np.int64)
+
+        # Set number of threads for CPU
+        self._device = torch.device(device)
+        self._num_threads = num_threads
+        if device == "cpu":
+            torch.set_num_threads(self._num_threads)
+
+        # Model file and configurations
+        self._st_fn = os.path.join(artifact_path, "model.safetensors")
+        if not os.path.isfile(self._st_fn):
+            raise FileNotFoundError("Missing safe tensors file: {}".format(self._st_fn))
+
+        # Load model and move to device
+        processor_config = os.path.join(artifact_path, "preprocessor_config.json")
+        model_config = os.path.join(artifact_path, "config.json")
+        self._image_processor = RTDetrImageProcessor.from_json_file(processor_config)
+        self._model = RTDetrForObjectDetection.from_pretrained(
+            artifact_path, config=model_config
+        ).to(self._device)
+        self._model.eval()
+
+        _log.debug("LayoutPredictor settings: {}".format(self.info()))
+
+    def info(self) -> dict:
+        """
+        Get information about the configuration of LayoutPredictor
+        """
+        info = {
+            "safe_tensors_file": self._st_fn,
+            "device": self._device.type,
+            "num_threads": self._num_threads,
+            "image_size": self._image_size,
+            "threshold": self._threshold,
+        }
+        return info
+
+    @torch.inference_mode()
+    def predict(self, orig_img: Union[Image.Image, np.ndarray]) -> Iterable[dict]:
+        """
+        Predict bounding boxes for a given image.
+        The origin (0, 0) is the top-left corner and the predicted bbox coords are provided as:
+        [left, top, right, bottom]
+
+        Parameter
+        ---------
+        origin_img: Image to be predicted as a PIL Image object or numpy array.
+
+        Yield
+        -----
+        Bounding box as a dict with the keys: "label", "confidence", "l", "t", "r", "b"
+
+        Raises
+        ------
+        TypeError when the input image is not supported
+        """
+        # Convert image format
+        if isinstance(orig_img, Image.Image):
+            page_img = orig_img.convert("RGB")
+        elif isinstance(orig_img, np.ndarray):
+            page_img = Image.fromarray(orig_img).convert("RGB")
+        else:
+            raise TypeError("Not supported input image format")
+
+        resize = {"height": self._image_size, "width": self._image_size}
+        inputs = self._image_processor(
+            images=page_img,
+            return_tensors="pt",
+            size=resize,
+        ).to(self._device)
+        outputs = self._model(**inputs)
+        results = self._image_processor.post_process_object_detection(
+            outputs,
+            target_sizes=torch.tensor([page_img.size[::-1]]),
+            threshold=self._threshold,
+        )
+
+        w, h = page_img.size
+
+        result = results[0]
+        for score, label_id, box in zip(
+            result["scores"], result["labels"], result["boxes"]
+        ):
+            score = float(score.item())
+
+            label_id = int(label_id.item()) + 1  # Advance the label_id
+            label_str = self._classes_map[label_id]
+
+            # Filter out blacklisted classes
+            if label_str in self._black_classes:
+                continue
+
+            bbox_float = [float(b.item()) for b in box]
+            l = min(w, max(0, bbox_float[0]))
+            t = min(h, max(0, bbox_float[1]))
+            r = min(w, max(0, bbox_float[2]))
+            b = min(h, max(0, bbox_float[3]))
+            yield {
+                "l": l,
+                "t": t,
+                "r": r,
+                "b": b,
+                "label": label_str,
+                "confidence": score,
+            }
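
For orientation, the new class can be exercised as below. This is a hedged usage sketch: the artifact directory and image file are assumptions, and the directory must contain the model.safetensors, config.json and preprocessor_config.json files that __init__ loads above.

    # Usage sketch for the safetensors-based LayoutPredictor (paths are assumptions).
    from PIL import Image

    from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

    # "artifacts/layout" is a placeholder directory holding model.safetensors,
    # config.json and preprocessor_config.json.
    predictor = LayoutPredictor("artifacts/layout", device="cpu", num_threads=4)

    page = Image.open("page_0.png")  # placeholder page image; any RGB-convertible input works
    for bbox in predictor.predict(page):
        print(bbox["label"], round(bbox["confidence"], 2),
              bbox["l"], bbox["t"], bbox["r"], bbox["b"])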

{docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/docling_ibm_models/tableformer/data_management/tf_predictor.py

@@ -2,14 +2,17 @@
 # Copyright IBM Corp. 2024 - 2024
 # SPDX-License-Identifier: MIT
 #
+import glob
 import json
 import logging
 import os
 from itertools import groupby
+from pathlib import Path
 
 import cv2
 import numpy as np
 import torch
+from safetensors.torch import load_model
 
 import docling_ibm_models.tableformer.common as c
 import docling_ibm_models.tableformer.data_management.transforms as T
@@ -82,45 +85,27 @@ def otsl_sqr_chk(rs_list, logdebug):
     return isSquare
 
 
-def decide_device(config: dict) -> str:
-    r"""
-    Decide the inference device based on the "predict.device_mode" parameter
-    """
-    device_mode = config["predict"].get("device_mode", "cpu")
-    num_gpus = torch.cuda.device_count()
-
-    if device_mode == "auto":
-        device = "cuda:0" if num_gpus > 0 else "cpu"
-    elif device_mode in ["gpu", "cuda"]:
-        device = "cuda:0"
-    else:
-        device = "cpu"
-    return device
-
-
 class TFPredictor:
     r"""
     Table predictions for the in-memory Docling API
     """
 
-    def __init__(self, config, num_threads: int = None):
+    def __init__(self, config, device: str = "cpu", num_threads: int = 4):
         r"""
-        The number of threads is decided, in the following order, by:
-        1. The init method parameter `num_threads`, if it is set.
-        2. The envvar "OMP_NUM_THREADS", if it is set.
-        3. The default value 4.
-
         Parameters
         ----------
-        config : dict
-            Parameters configuration
+        config : dict Parameters configuration
+        device: (Optional) torch device to run the inference.
+        num_threads: (Optional) Number of threads to run the inference if device = 'cpu'
+
        Raises
        ------
        ValueError
            When the model cannot be found
        """
-        self._device = decide_device(config)
-        self._log().info("Running on device: {}".format(self._device))
+        # self._device = torch.device(device)
+        self._device = device
+        self._log().info("Running on device: {}".format(device))
 
        self._config = config
        self.enable_post_process = True
@@ -133,11 +118,10 @@ class TFPredictor:
 
         self._init_word_map()
 
-        # Set the number of torch threads
-        if num_threads is None:
-            num_threads = int(os.environ.get("OMP_NUM_THREADS", 4))
-        self._num_threads = num_threads
-        torch.set_num_threads(num_threads)
+        # Set the number of threads
+        if device == "cpu":
+            self._num_threads = num_threads
+            torch.set_num_threads(self._num_threads)
 
         # Load the model
         self._model = self._load_model()
@@ -202,10 +186,21 @@
         if self._model_type == "TableModel02":
             self._remove_padding = True
 
-        # Load model from checkpoint
-        success, _, _, _, _ = model.load()
-        if not success:
-            err_msg = "Cannot load the model"
+        # Load model from safetensors
+        save_dir = self._config["model"]["save_dir"]
+        models_fn = glob.glob(f"{save_dir}/tableformer_*.safetensors")
+        if not models_fn:
+            err_msg = "Not able to find a model file for {}".format(self._model_type)
+            self._log().error(err_msg)
+            raise ValueError(err_msg)
+        model_fn = models_fn[
+            0
+        ]  # Take the first tableformer safetensors file inside the save_dir
+        missing, unexpected = load_model(model, model_fn, device=self._device)
+        if missing or unexpected:
+            err_msg = "Not able to load the model weights for {}".format(
+                self._model_type
+            )
             self._log().error(err_msg)
             raise ValueError(err_msg)
 
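
The visible API change here is the constructor: device and num_threads are now explicit arguments, replacing the removed decide_device() helper and the OMP_NUM_THREADS lookup. A hedged sketch of the new call follows; the config dict is a placeholder (a complete TableFormer config is required at runtime), with model.save_dir assumed to point at a directory containing a tableformer_*.safetensors file:

    # Sketch of the 3.0.0 TFPredictor constructor (config contents are placeholders).
    from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor

    config = {
        "model": {"save_dir": "artifacts/tableformer"},  # dir with tableformer_*.safetensors
        # ... the remaining TableFormer settings are omitted here for brevity
    }

    # device/num_threads were previously derived from config["predict"]["device_mode"]
    # and the OMP_NUM_THREADS envvar; in 3.0.0 they are passed directly.
    predictor = TFPredictor(config, device="cpu", num_threads=4)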

{docling_ibm_models-2.0.8 → docling_ibm_models-3.0.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-ibm-models"
-version = "2.0.8" # DO NOT EDIT, updated automatically
+version = "3.0.0" # DO NOT EDIT, updated automatically
 description = "This package contains the AI models used by the Docling PDF conversion package"
 authors = ["Nikos Livathinos <nli@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -24,12 +24,14 @@ packages = [
 python = "^3.9"
 torch = "^2.2.2"
 torchvision = "^0"
+transformers = "^4.42.0"
 numpy = ">=1.24.4,<3.0.0"
 jsonlines = "^3.1.0"
 Pillow = "^10.0.0"
 tqdm = "^4.64.0"
 opencv-python-headless = "^4.6.0.66"
 huggingface_hub = ">=0.23,<1"
+safetensors = {version=">=0.4.3,<1", extras=["torch"]}
 
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -96,3 +98,16 @@ branch = "main"
 parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
 parser_angular_minor_types = "feat"
 parser_angular_patch_types = "fix,perf"
+
+
+# [tool.mypy]
+# pretty = true
+# no_implicit_optional = true
+# python_version = "3.10"
+#
+# [[tool.mypy.overrides]]
+# module = [
+#     "torchvision.*",
+#     "transformers.*"
+# ]
+# ignore_missing_imports = true

docling_ibm_models-2.0.8/docling_ibm_models/layoutmodel/layout_predictor.py (deleted)

@@ -1,167 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-import os
-from collections.abc import Iterable
-from typing import Union
-
-import numpy as np
-import torch
-import torchvision.transforms as T
-from PIL import Image
-
-MODEL_CHECKPOINT_FN = "model.pt"
-DEFAULT_NUM_THREADS = 4
-
-
-class LayoutPredictor:
-    r"""
-    Document layout prediction using torch
-    """
-
-    def __init__(
-        self, artifact_path: str, num_threads: int = None, use_cpu_only: bool = False
-    ):
-        r"""
-        Provide the artifact path that contains the LayoutModel file
-
-        The number of threads is decided, in the following order, by:
-        1. The init method parameter `num_threads`, if it is set.
-        2. The envvar "OMP_NUM_THREADS", if it is set.
-        3. The default value DEFAULT_NUM_THREADS.
-
-        The execution provided is decided, in the following order:
-        1. If the init method parameter `cpu_only` is True or the envvar "USE_CPU_ONLY" is set,
-           it uses the "CPUExecutionProvider".
-        3. Otherwise if the "CUDAExecutionProvider" is present, use:
-           ["CUDAExecutionProvider", "CPUExecutionProvider"]:
-
-        Parameters
-        ----------
-        artifact_path: Path for the model torch file.
-        num_threads: (Optional) Number of threads to run the inference.
-        use_cpu_only: (Optional) If True, it forces CPU as the execution provider.
-
-        Raises
-        ------
-        FileNotFoundError when the model's torch file is missing
-        """
-        # Initialize classes map:
-        self._classes_map = {
-            0: "background",
-            1: "Caption",
-            2: "Footnote",
-            3: "Formula",
-            4: "List-item",
-            5: "Page-footer",
-            6: "Page-header",
-            7: "Picture",
-            8: "Section-header",
-            9: "Table",
-            10: "Text",
-            11: "Title",
-            12: "Document Index",
-            13: "Code",
-            14: "Checkbox-Selected",
-            15: "Checkbox-Unselected",
-            16: "Form",
-            17: "Key-Value Region",
-        }
-
-        # Blacklisted classes
-        self._black_classes = set(["Form", "Key-Value Region"])
-
-        # Set basic params
-        self._threshold = 0.6  # Score threshold
-        self._image_size = 640
-        self._size = np.asarray([[self._image_size, self._image_size]], dtype=np.int64)
-        self._use_cpu_only = use_cpu_only or ("USE_CPU_ONLY" in os.environ)
-
-        # Model file
-        self._torch_fn = os.path.join(artifact_path, MODEL_CHECKPOINT_FN)
-        if not os.path.isfile(self._torch_fn):
-            raise FileNotFoundError("Missing torch file: {}".format(self._torch_fn))
-
-        # Get env vars
-        if num_threads is None:
-            num_threads = int(os.environ.get("OMP_NUM_THREADS", DEFAULT_NUM_THREADS))
-        self._num_threads = num_threads
-
-        self.model = torch.jit.load(self._torch_fn)
-
-    def info(self) -> dict:
-        r"""
-        Get information about the configuration of LayoutPredictor
-        """
-        info = {
-            "torch_file": self._torch_fn,
-            "use_cpu_only": self._use_cpu_only,
-            "image_size": self._image_size,
-            "threshold": self._threshold,
-        }
-        return info
-
-    def predict(self, orig_img: Union[Image.Image, np.ndarray]) -> Iterable[dict]:
-        r"""
-        Predict bounding boxes for a given image.
-        The origin (0, 0) is the top-left corner and the predicted bbox coords are provided as:
-        [left, top, right, bottom]
-
-        Parameter
-        ---------
-        origin_img: Image to be predicted as a PIL Image object or numpy array.
-
-        Yield
-        -----
-        Bounding box as a dict with the keys: "label", "confidence", "l", "t", "r", "b"
-
-        Raises
-        ------
-        TypeError when the input image is not supported
-        """
-        # Convert image format
-        if isinstance(orig_img, Image.Image):
-            page_img = orig_img.convert("RGB")
-        elif isinstance(orig_img, np.ndarray):
-            page_img = Image.fromarray(orig_img).convert("RGB")
-        else:
-            raise TypeError("Not supported input image format")
-
-        w, h = page_img.size
-        orig_size = torch.tensor([w, h])[None]
-
-        transforms = T.Compose(
-            [
-                T.Resize((640, 640)),
-                T.ToTensor(),
-            ]
-        )
-        img = transforms(page_img)[None]
-        # Predict
-        with torch.no_grad():
-            labels, boxes, scores = self.model(img, orig_size)
-
-        # Yield output
-        for label_idx, box, score in zip(labels[0], boxes[0], scores[0]):
-            # Filter out blacklisted classes
-            label_idx = int(label_idx.item())
-            score = float(score.item())
-            label = self._classes_map[label_idx + 1]
-            if label in self._black_classes:
-                continue
-
-            # Check against threshold
-            if score > self._threshold:
-                l = min(w, max(0, box[0]))
-                t = min(h, max(0, box[1]))
-                r = min(w, max(0, box[2]))
-                b = min(h, max(0, box[3]))
-                yield {
-                    "l": l,
-                    "t": t,
-                    "r": r,
-                    "b": b,
-                    "label": label,
-                    "confidence": score,
-                }
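
One behavioral consequence of the rewrite worth noting: the deleted predictor filtered at score 0.6 and blacklisted "Form" and "Key-Value Region", while the 3.0.0 predictor post-processes at threshold 0.3 with an empty blacklist. Downstream code that relied on the old defaults could re-apply them on top of the 3.0.0 output; a minimal sketch, assuming only the dict keys documented in the diffs above:

    from typing import Dict, Iterable, List

    def filter_like_2_0_8(boxes: Iterable[Dict]) -> List[Dict]:
        """Re-apply the 2.0.8 defaults (score > 0.6, "Form"/"Key-Value Region" dropped)
        to the bounding-box dicts yielded by the 3.0.0 LayoutPredictor.predict()."""
        blacklist = {"Form", "Key-Value Region"}
        return [
            b for b in boxes
            if b["confidence"] > 0.6 and b["label"] not in blacklist
        ]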