docling-ibm-models 2.0.8 → 3.1.0 (py3-none-any.whl)

This diff shows the changes between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
--- docling_ibm_models/layoutmodel/layout_predictor.py (2.0.8)
+++ docling_ibm_models/layoutmodel/layout_predictor.py (3.1.0)
@@ -2,46 +2,41 @@
 # Copyright IBM Corp. 2024 - 2024
 # SPDX-License-Identifier: MIT
 #
+import logging
 import os
 from collections.abc import Iterable
-from typing import Union
+from typing import Set, Union
 
 import numpy as np
 import torch
 import torchvision.transforms as T
 from PIL import Image
+from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
 
-MODEL_CHECKPOINT_FN = "model.pt"
-DEFAULT_NUM_THREADS = 4
+_log = logging.getLogger(__name__)
 
 
 class LayoutPredictor:
-    r"""
-    Document layout prediction using torch
+    """
+    Document layout prediction using safe tensors
     """
 
     def __init__(
-        self, artifact_path: str, num_threads: int = None, use_cpu_only: bool = False
+        self,
+        artifact_path: str,
+        device: str = "cpu",
+        num_threads: int = 4,
+        base_threshold: float = 0.3,
+        blacklist_classes: Set[str] = set(),
     ):
-        r"""
+        """
         Provide the artifact path that contains the LayoutModel file
 
-        The number of threads is decided, in the following order, by:
-        1. The init method parameter `num_threads`, if it is set.
-        2. The envvar "OMP_NUM_THREADS", if it is set.
-        3. The default value DEFAULT_NUM_THREADS.
-
-        The execution provided is decided, in the following order:
-        1. If the init method parameter `cpu_only` is True or the envvar "USE_CPU_ONLY" is set,
-           it uses the "CPUExecutionProvider".
-        3. Otherwise if the "CUDAExecutionProvider" is present, use:
-           ["CUDAExecutionProvider", "CPUExecutionProvider"]:
-
         Parameters
         ----------
         artifact_path: Path for the model torch file.
-        num_threads: (Optional) Number of threads to run the inference.
-        use_cpu_only: (Optional) If True, it forces CPU as the execution provider.
+        device: (Optional) device to run the inference.
+        num_threads: (Optional) Number of threads to run the inference if device = 'cpu'
 
         Raises
         ------
@@ -70,40 +65,51 @@ class LayoutPredictor:
         }
 
         # Blacklisted classes
-        self._black_classes = set(["Form", "Key-Value Region"])
+        self._black_classes = blacklist_classes  # set(["Form", "Key-Value Region"])
 
         # Set basic params
-        self._threshold = 0.6  # Score threshold
+        self._threshold = base_threshold  # Score threshold
         self._image_size = 640
         self._size = np.asarray([[self._image_size, self._image_size]], dtype=np.int64)
-        self._use_cpu_only = use_cpu_only or ("USE_CPU_ONLY" in os.environ)
 
-        # Model file
-        self._torch_fn = os.path.join(artifact_path, MODEL_CHECKPOINT_FN)
-        if not os.path.isfile(self._torch_fn):
-            raise FileNotFoundError("Missing torch file: {}".format(self._torch_fn))
-
-        # Get env vars
-        if num_threads is None:
-            num_threads = int(os.environ.get("OMP_NUM_THREADS", DEFAULT_NUM_THREADS))
+        # Set number of threads for CPU
+        self._device = torch.device(device)
         self._num_threads = num_threads
+        if device == "cpu":
+            torch.set_num_threads(self._num_threads)
+
+        # Model file and configurations
+        self._st_fn = os.path.join(artifact_path, "model.safetensors")
+        if not os.path.isfile(self._st_fn):
+            raise FileNotFoundError("Missing safe tensors file: {}".format(self._st_fn))
 
-        self.model = torch.jit.load(self._torch_fn)
+        # Load model and move to device
+        processor_config = os.path.join(artifact_path, "preprocessor_config.json")
+        model_config = os.path.join(artifact_path, "config.json")
+        self._image_processor = RTDetrImageProcessor.from_json_file(processor_config)
+        self._model = RTDetrForObjectDetection.from_pretrained(
+            artifact_path, config=model_config
+        ).to(self._device)
+        self._model.eval()
+
+        _log.debug("LayoutPredictor settings: {}".format(self.info()))
 
     def info(self) -> dict:
-        r"""
+        """
         Get information about the configuration of LayoutPredictor
         """
         info = {
-            "torch_file": self._torch_fn,
-            "use_cpu_only": self._use_cpu_only,
+            "safe_tensors_file": self._st_fn,
+            "device": self._device.type,
+            "num_threads": self._num_threads,
             "image_size": self._image_size,
             "threshold": self._threshold,
         }
         return info
 
+    @torch.inference_mode()
     def predict(self, orig_img: Union[Image.Image, np.ndarray]) -> Iterable[dict]:
-        r"""
+        """
         Predict bounding boxes for a given image.
         The origin (0, 0) is the top-left corner and the predicted bbox coords are provided as:
         [left, top, right, bottom]
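
A minimal usage sketch of the new constructor, assuming an artifact directory that contains the model.safetensors, config.json, and preprocessor_config.json files loaded above (the path itself is illustrative):

    # Sketch only: constructing the 3.1.0 LayoutPredictor.
    from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

    predictor = LayoutPredictor(
        artifact_path="/path/to/layout/artifacts",  # illustrative path
        device="cpu",                    # num_threads only takes effect on CPU
        num_threads=4,
        base_threshold=0.3,              # replaces the hard-coded 0.6 of 2.0.x
        blacklist_classes={"Form", "Key-Value Region"},  # 2.0.x default, now opt-in
    )
    print(predictor.info())  # safe_tensors_file, device, num_threads, image_size, threshold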
@@ -128,40 +134,44 @@
         else:
             raise TypeError("Not supported input image format")
 
+        resize = {"height": self._image_size, "width": self._image_size}
+        inputs = self._image_processor(
+            images=page_img,
+            return_tensors="pt",
+            size=resize,
+        ).to(self._device)
+        outputs = self._model(**inputs)
+        results = self._image_processor.post_process_object_detection(
+            outputs,
+            target_sizes=torch.tensor([page_img.size[::-1]]),
+            threshold=self._threshold,
+        )
+
         w, h = page_img.size
-        orig_size = torch.tensor([w, h])[None]
 
-        transforms = T.Compose(
-            [
-                T.Resize((640, 640)),
-                T.ToTensor(),
-            ]
-        )
-        img = transforms(page_img)[None]
-        # Predict
-        with torch.no_grad():
-            labels, boxes, scores = self.model(img, orig_size)
+        result = results[0]
+        for score, label_id, box in zip(
+            result["scores"], result["labels"], result["boxes"]
+        ):
+            score = float(score.item())
+
+            label_id = int(label_id.item()) + 1  # Advance the label_id
+            label_str = self._classes_map[label_id]
 
-        # Yield output
-        for label_idx, box, score in zip(labels[0], boxes[0], scores[0]):
             # Filter out blacklisted classes
-            label_idx = int(label_idx.item())
-            score = float(score.item())
-            label = self._classes_map[label_idx + 1]
-            if label in self._black_classes:
+            if label_str in self._black_classes:
                 continue
 
-            # Check against threshold
-            if score > self._threshold:
-                l = min(w, max(0, box[0]))
-                t = min(h, max(0, box[1]))
-                r = min(w, max(0, box[2]))
-                b = min(h, max(0, box[3]))
-                yield {
-                    "l": l,
-                    "t": t,
-                    "r": r,
-                    "b": b,
-                    "label": label,
-                    "confidence": score,
-                }
+            bbox_float = [float(b.item()) for b in box]
+            l = min(w, max(0, bbox_float[0]))
+            t = min(h, max(0, bbox_float[1]))
+            r = min(w, max(0, bbox_float[2]))
+            b = min(h, max(0, bbox_float[3]))
+            yield {
+                "l": l,
+                "t": t,
+                "r": r,
+                "b": b,
+                "label": label_str,
+                "confidence": score,
+            }
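
predict() remains a generator of per-detection dicts; a short consumption sketch (the image file name is illustrative):

    from PIL import Image

    page_img = Image.open("page_0.png")  # illustrative file name
    for pred in predictor.predict(page_img):
        # bbox values are absolute pixel coordinates, origin at the top-left corner
        print("{label}: {confidence:.2f} [{l:.1f}, {t:.1f}, {r:.1f}, {b:.1f}]".format(**pred))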
--- docling_ibm_models/tableformer/data_management/tf_predictor.py (2.0.8)
+++ docling_ibm_models/tableformer/data_management/tf_predictor.py (3.1.0)
@@ -2,14 +2,17 @@
 # Copyright IBM Corp. 2024 - 2024
 # SPDX-License-Identifier: MIT
 #
+import glob
 import json
 import logging
 import os
 from itertools import groupby
+from pathlib import Path
 
 import cv2
 import numpy as np
 import torch
+from safetensors.torch import load_model
 
 import docling_ibm_models.tableformer.common as c
 import docling_ibm_models.tableformer.data_management.transforms as T
@@ -82,45 +85,27 @@ def otsl_sqr_chk(rs_list, logdebug):
     return isSquare
 
 
-def decide_device(config: dict) -> str:
-    r"""
-    Decide the inference device based on the "predict.device_mode" parameter
-    """
-    device_mode = config["predict"].get("device_mode", "cpu")
-    num_gpus = torch.cuda.device_count()
-
-    if device_mode == "auto":
-        device = "cuda:0" if num_gpus > 0 else "cpu"
-    elif device_mode in ["gpu", "cuda"]:
-        device = "cuda:0"
-    else:
-        device = "cpu"
-    return device
-
-
 class TFPredictor:
     r"""
     Table predictions for the in-memory Docling API
     """
 
-    def __init__(self, config, num_threads: int = None):
+    def __init__(self, config, device: str = "cpu", num_threads: int = 4):
         r"""
-        The number of threads is decided, in the following order, by:
-        1. The init method parameter `num_threads`, if it is set.
-        2. The envvar "OMP_NUM_THREADS", if it is set.
-        3. The default value 4.
-
         Parameters
         ----------
-        config : dict
-            Parameters configuration
+        config : dict Parameters configuration
+        device: (Optional) torch device to run the inference.
+        num_threads: (Optional) Number of threads to run the inference if device = 'cpu'
+
         Raises
         ------
         ValueError
             When the model cannot be found
         """
-        self._device = decide_device(config)
-        self._log().info("Running on device: {}".format(self._device))
+        # self._device = torch.device(device)
+        self._device = device
+        self._log().info("Running on device: {}".format(device))
 
         self._config = config
         self.enable_post_process = True
@@ -133,11 +118,10 @@ class TFPredictor:
 
         self._init_word_map()
 
-        # Set the number of torch threads
-        if num_threads is None:
-            num_threads = int(os.environ.get("OMP_NUM_THREADS", 4))
-        self._num_threads = num_threads
-        torch.set_num_threads(num_threads)
+        # Set the number of threads
+        if device == "cpu":
+            self._num_threads = num_threads
+            torch.set_num_threads(self._num_threads)
 
         # Load the model
         self._model = self._load_model()
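
With decide_device() removed, picking the device is now the caller's job; a sketch that reproduces the old "auto" mode under that assumption (config stands in for the same parameters dict TFPredictor accepted in 2.0.x):

    import torch
    from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor

    device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu"  # the removed "auto" logic
    predictor = TFPredictor(config, device=device, num_threads=4)  # config: as in 2.0.x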
@@ -202,10 +186,21 @@ class TFPredictor:
         if self._model_type == "TableModel02":
             self._remove_padding = True
 
-        # Load model from checkpoint
-        success, _, _, _, _ = model.load()
-        if not success:
-            err_msg = "Cannot load the model"
+        # Load model from safetensors
+        save_dir = self._config["model"]["save_dir"]
+        models_fn = glob.glob(f"{save_dir}/tableformer_*.safetensors")
+        if not models_fn:
+            err_msg = "Not able to find a model file for {}".format(self._model_type)
+            self._log().error(err_msg)
+            raise ValueError(err_msg)
+        model_fn = models_fn[
+            0
+        ]  # Take the first tableformer safetensors file inside the save_dir
+        missing, unexpected = load_model(model, model_fn, device=self._device)
+        if missing or unexpected:
+            err_msg = "Not able to load the model weights for {}".format(
+                self._model_type
+            )
             self._log().error(err_msg)
             raise ValueError(err_msg)
 
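The block above leans on safetensors.torch.load_model, which restores weights into an existing module in place and returns the (missing, unexpected) key lists. A standalone sketch of the same discovery-and-load pattern; model stands in for the already-built TableModel instance and the path is illustrative:

    import glob
    from safetensors.torch import load_model

    model = ...  # assumption: the nn.Module built by _load_model() before this step
    candidates = glob.glob("/path/to/save_dir/tableformer_*.safetensors")
    if not candidates:
        raise ValueError("No tableformer safetensors checkpoint found")
    # Mirrors the code above: load the first match and check for key mismatches.
    missing, unexpected = load_model(model, candidates[0], device="cpu")
    if missing or unexpected:
        raise ValueError(f"Bad checkpoint: missing={missing}, unexpected={unexpected}")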
--- docling_ibm_models-2.0.8.dist-info/METADATA
+++ docling_ibm_models-3.1.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-ibm-models
-Version: 2.0.8
+Version: 3.1.0
 Summary: This package contains the AI models used by the Docling PDF conversion package
 License: MIT
 Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
@@ -25,9 +25,11 @@ Requires-Dist: huggingface_hub (>=0.23,<1)
 Requires-Dist: jsonlines (>=3.1.0,<4.0.0)
 Requires-Dist: numpy (>=1.24.4,<3.0.0)
 Requires-Dist: opencv-python-headless (>=4.6.0.66,<5.0.0.0)
+Requires-Dist: safetensors[torch] (>=0.4.3,<1)
 Requires-Dist: torch (>=2.2.2,<3.0.0)
 Requires-Dist: torchvision (>=0,<1)
 Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+Requires-Dist: transformers (>=4.42.0,<5.0.0)
 Description-Content-Type: text/markdown
 
 [![PyPI version](https://img.shields.io/pypi/v/docling-ibm-models)](https://pypi.org/project/docling-ibm-models/)
--- docling_ibm_models-2.0.8.dist-info/RECORD
+++ docling_ibm_models-3.1.0.dist-info/RECORD
@@ -1,11 +1,11 @@
-docling_ibm_models/layoutmodel/layout_predictor.py,sha256=5JijEajVGy-vwMDOxDemcbtqlyjFWfWfnCcjMyj5Y_A,5405
+docling_ibm_models/layoutmodel/layout_predictor.py,sha256=ArVgs7FBOiu23TC-JoybcaTp7F7a4BgYC8uRVxTgx4E,5681
 docling_ibm_models/tableformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling_ibm_models/tableformer/common.py,sha256=2zgGZBFf4fXytEaXrZR2NU6FWdX2kxO0DHlGZmuvpNQ,3230
 docling_ibm_models/tableformer/data_management/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling_ibm_models/tableformer/data_management/functional.py,sha256=kJntHEXFz2SP7obEcHyjAqZNZC9qh-U75MwUJALLADI,3143
 docling_ibm_models/tableformer/data_management/matching_post_processor.py,sha256=meSM0jLWNLS8P95QjN6pEp095jFEbKdl9KKfRY1ocy0,58046
 docling_ibm_models/tableformer/data_management/tf_cell_matcher.py,sha256=IdZTaWIRhPpyEwzZgCmviZnYacR6kbcUqBvx7ilmkKY,21250
-docling_ibm_models/tableformer/data_management/tf_predictor.py,sha256=rRGcuyF_Kwika_P-mNrQvkOgDceTMvwgsekhHi4aafo,38920
+docling_ibm_models/tableformer/data_management/tf_predictor.py,sha256=BHd6KdAX0-b9TbX01m0872MO10zWDMValyf4UTIRkAU,39008
 docling_ibm_models/tableformer/data_management/transforms.py,sha256=NNaz_7GI7FCVmu_rJuenqH5VfzRSljJHUHpNQQ8Mq3Q,2983
 docling_ibm_models/tableformer/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling_ibm_models/tableformer/models/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -22,7 +22,7 @@ docling_ibm_models/tableformer/utils/app_profiler.py,sha256=Pb7o1zcikKXh7ninaNt4
 docling_ibm_models/tableformer/utils/mem_monitor.py,sha256=ycZ07fUBVVKKLTVGF54jGPDM2aTkKuZWk1kMbOS0wwQ,6353
 docling_ibm_models/tableformer/utils/torch_utils.py,sha256=uN0rK9mSXy1ewBnBnILrWebJhhVU4N-XJZBqNiLJwlQ,8893
 docling_ibm_models/tableformer/utils/utils.py,sha256=8Bxf1rEn977lFbY9NX0r5xh9PvxIRipQZX_EZW92XfA,10980
-docling_ibm_models-2.0.8.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling_ibm_models-2.0.8.dist-info/METADATA,sha256=-hV4IdslFbo69zhRSQvslUFR-AwxTXiaW0BtA_oaiKI,6930
-docling_ibm_models-2.0.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-docling_ibm_models-2.0.8.dist-info/RECORD,,
+docling_ibm_models-3.1.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling_ibm_models-3.1.0.dist-info/METADATA,sha256=tWJOjbwYGXZqg2qcjpMrOUm-3wCuThh05yqZmbY26s8,7023
+docling_ibm_models-3.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+docling_ibm_models-3.1.0.dist-info/RECORD,,