docling-ibm-models 3.1.2__py3-none-any.whl → 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+ import logging
6
+ from typing import List, Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ from PIL import Image
11
+ from transformers import AutoTokenizer
12
+
13
+ from docling_ibm_models.code_formula_model.models.sam_opt import SamOPTForCausalLM
14
+ from docling_ibm_models.code_formula_model.models.sam_opt_image_processor import (
15
+ SamOptImageProcessor,
16
+ )
17
+
18
# Module-level logger named after this module.
_log = logging.getLogger(__name__)
19
+
20
+
21
class CodeFormulaPredictor:
    """
    Code and Formula Predictor using a multi-modal vision-language model.

    This class enables the prediction of code or LaTeX representations
    from input images of code snippets or mathematical formulas.

    Attributes
    ----------
    _device : str
        The device on which the model is loaded (e.g., 'cpu' or 'cuda').
    _num_threads : int
        Number of threads used for inference when running on CPU.
    _tokenizer : transformers.PreTrainedTokenizer
        Tokenizer for processing textual inputs to the model.
    _model : transformers.PreTrainedModel
        Pretrained multi-modal vision-language model.
    _image_processor : transformers.ImageProcessor
        Processor for normalizing and preparing input images.
    """

    def __init__(
        self,
        artifacts_path: str,
        device: str = "cpu",
        num_threads: int = 4,
    ):
        """
        Initializes the CodeFormulaPredictor with the specified model artifacts.

        Parameters
        ----------
        artifacts_path : str
            Path to the directory containing the pretrained model files.
        device : str, optional
            Device to run the inference on ('cpu' or 'cuda'), by default "cpu".
        num_threads : int, optional
            Number of threads for CPU inference, by default 4.
        """
        self._device = device
        self._num_threads = num_threads
        if device == "cpu":
            torch.set_num_threads(self._num_threads)

        self._tokenizer = AutoTokenizer.from_pretrained(
            artifacts_path, use_fast=True, padding_side="left"
        )
        self._model = SamOPTForCausalLM.from_pretrained(artifacts_path).to(self._device)
        self._model.eval()

        self._image_processor = SamOptImageProcessor.from_pretrained(artifacts_path)

        _log.debug("CodeFormulaModel settings: %s", self.info())

    def info(self) -> dict:
        """
        Retrieves configuration details of the CodeFormulaPredictor instance.

        Returns
        -------
        dict
            A dictionary containing configuration details such as the device and
            the number of threads used.
        """
        return {
            "device": self._device,
            "num_threads": self._num_threads,
        }

    def _get_prompt(self, label: str) -> str:
        """
        Constructs the prompt for the model based on the input label.

        Parameters
        ----------
        label : str
            The type of input, either 'code' or 'formula'.

        Returns
        -------
        str
            The constructed prompt including necessary tokens and query.

        Raises
        ------
        NotImplementedError
            If the label is not 'code' or 'formula'.
        """
        if label == "code":
            query = "<code_image_to_text>"
        elif label == "formula":
            query = "<equation>"
        else:
            raise NotImplementedError("Label must be either code or formula")

        prompt = (
            "A chat between a curious user and an artificial intelligence"
            " assistant. The assistant gives helpful, detailed, and polite answers to"
            " the user's questions. USER:"
        )
        # 256 "<imgpad>" placeholders are emitted between "<img>" and "</img>".
        prompt += (
            "<img>" + "<imgpad>" * 256 + "</img>" + "\n" + " ASSISTANT:" + "\n" + query
        )

        return prompt

    @staticmethod
    def _to_rgb(image):
        """Converts a PIL image or numpy array to an RGB PIL image."""
        if isinstance(image, Image.Image):
            return image.convert("RGB")
        if isinstance(image, np.ndarray):
            return Image.fromarray(image).convert("RGB")
        raise TypeError("Not supported input image format")

    @torch.inference_mode()
    def predict(
        self,
        images: List[Union[Image.Image, np.ndarray]],
        labels: List[str],
        temperature: float = 0.1,
    ) -> List[str]:
        """
        Predicts the textual representation of input images (code or LaTeX).

        Parameters
        ----------
        images : List[Union[Image.Image, np.ndarray]]
            List of images to be processed, provided as PIL Image objects or numpy arrays.
        labels : List[str]
            List of labels indicating the type of each image ('code' or 'formula').
        temperature : float, optional
            Sampling temperature for generation, by default set to 0.1.
            A value of 0 disables sampling (greedy decoding).

        Returns
        -------
        List[str]
            List of predicted textual outputs for each input image in the given input
            order. An empty input yields an empty list.

        Raises
        ------
        TypeError
            If any of the input images is not of a supported type (PIL Image or numpy array).
        ValueError
            In case the temperature is an invalid number, or the number of images
            differs from the number of labels.
        NotImplementedError
            If any label is not 'code' or 'formula'.
        """
        # bool is an int subclass; reject it explicitly, as the original strict
        # type comparison did.
        if (
            isinstance(temperature, bool)
            or not isinstance(temperature, (int, float))
            or temperature < 0
        ):
            raise ValueError("Temperature must be a number greater or equal to 0.")

        if len(labels) != len(images):
            raise ValueError(
                "The number of images must be the same as the number of labels."
            )

        # Nothing to do for an empty batch (torch.stack would fail on an empty list).
        if len(images) == 0:
            return []

        # Temperature 0 means deterministic decoding: disable sampling and do
        # not pass a temperature to generate().
        do_sample = temperature != 0
        sampling_temperature = temperature if do_sample else None

        rgb_images = [self._to_rgb(image) for image in images]
        images_tensor = torch.stack(
            [self._image_processor(img) for img in rgb_images]
        ).to(self._device)

        prompts = [self._get_prompt(label) for label in labels]

        tokenized = self._tokenizer(prompts, padding=True, return_tensors="pt")
        tokenized = {k: v.to(self._device) for k, v in tokenized.items()}

        prompt_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]
        prompt_length = prompt_ids.shape[1]

        # The same arguments are used on every device. The attention mask is
        # always forwarded because the tokenizer may left-pad the batch.
        generate_kwargs = dict(
            input_ids=prompt_ids,
            attention_mask=attention_mask,
            images=images_tensor,
            do_sample=do_sample,
            temperature=sampling_temperature,
            # NOTE(review): 4096 is presumably the total sequence budget — confirm.
            max_new_tokens=4096 - prompt_length,
            use_cache=True,
        )

        # autocast expects the bare device type ("cuda"), not an indexed
        # device string such as "cuda:0".
        device_type = torch.device(self._device).type
        if device_type == "cpu":
            output_ids_list = self._model.generate(**generate_kwargs)
        else:
            with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                output_ids_list = self._model.generate(**generate_kwargs)

        # Drop the prompt tokens so only the generated continuation is decoded.
        outputs = self._tokenizer.batch_decode(
            output_ids_list[:, prompt_length:], skip_special_tokens=True
        )

        return outputs
@@ -0,0 +1,514 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This file was originally developed by Meta Platforms, Inc. as part of
5
+ # the Segment Anything project (https://github.com/facebookresearch/segment-anything).
6
+ # It has been adapted by contributors from the Vary-toy project
7
+ # (https://github.com/Ucas-HaoranWei/Vary-toy).
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at:
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+
22
+ from functools import partial
23
+ from typing import Optional, Tuple, Type
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+
29
+
30
class MLPBlock(nn.Module):
    """Two-layer feed-forward block: linear -> activation -> linear."""

    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        """
        Args:
            embedding_dim (int): Size of the input and output features.
            mlp_dim (int): Size of the hidden layer.
            act (nn.Module): Activation class instantiated between the two linear layers.
        """
        super().__init__()
        # Expand to the hidden width, then project back to the embedding width.
        self.lin1 = nn.Linear(in_features=embedding_dim, out_features=mlp_dim)
        self.lin2 = nn.Linear(in_features=mlp_dim, out_features=embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply lin1, the activation and lin2 to the last dimension of x."""
        hidden = self.lin1(x)
        activated = self.act(hidden)
        return self.lin2(activated)
44
+
45
+
46
+ # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
47
+ # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
48
class LayerNorm2d(nn.Module):
    """Layer normalization over dimension 1 of a tensor with two trailing dimensions."""

    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        """
        Args:
            num_channels (int): Size of dimension 1 of the input.
            eps (float): Value added to the variance before taking the square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize x along dimension 1, then apply the per-channel scale and shift."""
        mean = x.mean(1, keepdim=True)
        variance = (x - mean).pow(2).mean(1, keepdim=True)
        normalized = (x - mean) / torch.sqrt(variance + self.eps)
        # weight and bias are broadcast over the two trailing dimensions.
        scale = self.weight[:, None, None]
        shift = self.bias[:, None, None]
        return scale * normalized + shift
61
+
62
+
63
+ # This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
64
class ImageEncoderViT(nn.Module):
    """ViT image encoder followed by a convolutional neck and two stride-2 convolutions."""

    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
    ) -> None:
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            out_chans (int): Number of channels produced by the neck.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        """
        super().__init__()
        self.img_size = img_size
        # Number of patches along each side of the (square) input image.
        grid_size = img_size // patch_size

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            self.pos_embed = nn.Parameter(
                torch.zeros(1, grid_size, grid_size, embed_dim)
            )

        self.blocks = nn.ModuleList()
        for i in range(depth):
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                # Blocks listed in global_attn_indexes use global attention (window size 0).
                window_size=window_size if i not in global_attn_indexes else 0,
                input_size=(grid_size, grid_size),
            )
            self.blocks.append(block)

        self.neck = nn.Sequential(
            nn.Conv2d(
                embed_dim,
                out_chans,
                kernel_size=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
            nn.Conv2d(
                out_chans,
                out_chans,
                kernel_size=3,
                padding=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
        )

        # net_2 consumes the neck output, so its input width must follow
        # out_chans (identical to the previous hard-coded 256 for the default).
        self.net_2 = nn.Conv2d(
            out_chans, 512, kernel_size=3, stride=2, padding=1, bias=False
        )
        self.net_3 = nn.Conv2d(
            512, 1024, kernel_size=3, stride=2, padding=1, bias=False
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Encode a batch of images.

        The input is patch-embedded, optionally offset by the absolute positional
        embedding, passed through every block, moved to channels-first for the
        neck, and finally reduced by the two stride-2 convolutions (1024 output
        channels).
        """
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            x = x + self.pos_embed

        for blk in self.blocks:
            x = blk(x)

        # Blocks work channels-last; the convolutional layers expect channels-first.
        x = self.neck(x.permute(0, 3, 1, 2))
        x = self.net_2(x)
        x = self.net_3(x)

        return x
172
+
173
+
174
class Block(nn.Module):
    """Transformer blocks with support of window attention and residual propagation blocks"""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        # With windowed attention, the attention layer sees one window at a time.
        if window_size == 0:
            attn_input_size = input_size
        else:
            attn_input_size = (window_size, window_size)

        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            input_size=attn_input_size,
        )

        self.norm2 = norm_layer(dim)
        self.mlp = MLPBlock(
            embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer
        )

        self.window_size = window_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply pre-norm attention (optionally windowed) and the MLP, each with a residual."""
        residual = x
        x = self.norm1(x)

        if self.window_size > 0:
            # Split into windows, attend within each window, then stitch back.
            height, width = x.shape[1], x.shape[2]
            x, padded_hw = window_partition(x, self.window_size)
            x = self.attn(x)
            x = window_unpartition(x, self.window_size, padded_hw, (height, width))
        else:
            x = self.attn(x)

        x = residual + x
        return x + self.mlp(self.norm2(x))
238
+
239
+
240
class Attention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        per_head_dim = dim // num_heads
        self.num_heads = num_heads
        self.scale = per_head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert (
                input_size is not None
            ), "Input size must be provided if using relative positional encoding."
            # One embedding row per possible relative offset along each axis.
            rows, cols = 2 * input_size[0] - 1, 2 * input_size[1] - 1
            self.rel_pos_h = nn.Parameter(torch.zeros(rows, per_head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(cols, per_head_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run multi-head self-attention over a (B, H, W, C) input and return the same shape."""
        batch, height, width, _ = x.shape
        num_tokens = height * width

        # Projected qkv rearranged to (3, B, nHead, H * W, C).
        qkv = self.qkv(x).reshape(batch, num_tokens, 3, self.num_heads, -1)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        # q, k, v each with shape (B * nHead, H * W, C).
        q, k, v = qkv.reshape(3, batch * self.num_heads, num_tokens, -1).unbind(0)

        attn = (q * self.scale) @ k.transpose(-2, -1)

        if self.use_rel_pos:
            grid = (height, width)
            attn = add_decomposed_rel_pos(
                attn, q, self.rel_pos_h, self.rel_pos_w, grid, grid
            )

        attn = attn.softmax(dim=-1)

        # Merge the heads back into the channel dimension.
        merged = (attn @ v).view(batch, self.num_heads, height, width, -1)
        merged = merged.permute(0, 2, 3, 1, 4).reshape(batch, height, width, -1)
        return self.proj(merged)
305
+
306
+
307
def window_partition(
    x: torch.Tensor, window_size: int
) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    batch, height, width, channels = x.shape

    # Amount needed to round each spatial size up to a multiple of window_size.
    extra_h = -height % window_size
    extra_w = -width % window_size
    if extra_h > 0 or extra_w > 0:
        # F.pad lists pads from the last dimension backwards: C, then W, then H.
        x = F.pad(x, (0, 0, 0, extra_w, 0, extra_h))
    padded_h = height + extra_h
    padded_w = width + extra_w

    tiled = x.view(
        batch,
        padded_h // window_size,
        window_size,
        padded_w // window_size,
        window_size,
        channels,
    )
    # Bring the two window-grid axes together, then fold them into the batch axis.
    windows = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    windows = windows.view(-1, window_size, window_size, channels)
    return windows, (padded_h, padded_w)
333
+
334
+
335
def window_unpartition(
    windows: torch.Tensor,
    window_size: int,
    pad_hw: Tuple[int, int],
    hw: Tuple[int, int],
) -> torch.Tensor:
    """
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    padded_h, padded_w = pad_hw
    height, width = hw

    # The leading axis holds batch * windows-per-image; recover the batch size.
    windows_per_image = padded_h * padded_w // window_size // window_size
    batch = windows.shape[0] // windows_per_image

    grid = windows.view(
        batch,
        padded_h // window_size,
        padded_w // window_size,
        window_size,
        window_size,
        -1,
    )
    x = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, padded_h, padded_w, -1)

    # Crop away rows/columns that only exist because of padding.
    needs_crop = padded_h > height or padded_w > width
    if needs_crop:
        x = x[:, :height, :width, :].contiguous()
    return x
363
+
364
+
365
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Get relative positional embeddings according to the relative positions of
    query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)

    if rel_pos.shape[0] == max_rel_dist:
        table = rel_pos
    else:
        # Resize the table to max_rel_dist rows by linear interpolation;
        # F.interpolate works on (N, C, L), hence the permutes.
        as_ncl = rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1)
        resized = F.interpolate(as_ncl, size=max_rel_dist, mode="linear")
        table = resized.reshape(-1, max_rel_dist).permute(1, 0)

    # Scale the coords with short length if shapes for q and k are different.
    q_stride = max(k_size / q_size, 1.0)
    k_stride = max(q_size / k_size, 1.0)
    q_coords = torch.arange(q_size)[:, None] * q_stride
    k_coords = torch.arange(k_size)[None, :] * k_stride
    # Shift so the smallest relative offset maps to index 0.
    relative_coords = (q_coords - k_coords) + (k_size - 1) * k_stride

    return table[relative_coords.long()]
396
+
397
+
398
def add_decomposed_rel_pos(
    attn: torch.Tensor,
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    q_size: Tuple[int, int],
    k_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    height_table = get_rel_pos(q_h, k_h, rel_pos_h)
    width_table = get_rel_pos(q_w, k_w, rel_pos_w)

    batch, _, channels = q.shape
    q_grid = q.reshape(batch, q_h, q_w, channels)
    # Contract the query channels against the per-axis embedding tables.
    height_bias = torch.einsum("bhwc,hkc->bhwk", q_grid, height_table)
    width_bias = torch.einsum("bhwc,wkc->bhwk", q_grid, width_table)

    # Broadcast the height term over key width and the width term over key height.
    biased = (
        attn.view(batch, q_h, q_w, k_h, k_w)
        + height_bias[..., None]
        + width_bias[:, :, :, None, :]
    )
    return biased.view(batch, q_h * q_w, k_h * k_w)
437
+
438
+
439
class PatchEmbed(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self,
        kernel_size: Tuple[int, int] = (16, 16),
        stride: Tuple[int, int] = (16, 16),
        padding: Tuple[int, int] = (0, 0),
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()

        self.proj = nn.Conv2d(
            in_channels=in_chans,
            out_channels=embed_dim,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project the image with the convolution and return it channels-last."""
        projected = self.proj(x)
        # B C H W -> B H W C
        return projected.permute(0, 2, 3, 1)
471
+
472
+
473
def build_sam_vit_b(checkpoint=None, image_size=1024):
    """
    Build the image encoder with the fixed "vit_b" settings used in this file.

    Args:
        checkpoint: Optional checkpoint forwarded to _build_sam.
        image_size (int): Input image size forwarded to _build_sam.

    Returns:
        The encoder returned by _build_sam.
    """
    vit_b_settings = {
        "encoder_embed_dim": 768,
        "encoder_depth": 12,
        "encoder_num_heads": 12,
        "encoder_global_attn_indexes": [2, 5, 8, 11],
    }
    return _build_sam(
        checkpoint=checkpoint,
        image_size=image_size,
        **vit_b_settings,
    )
482
+
483
+
484
def _build_sam(
    encoder_embed_dim,
    encoder_depth,
    encoder_num_heads,
    encoder_global_attn_indexes,
    checkpoint=None,
    image_size=1024,
):
    """
    Construct an ImageEncoderViT and optionally load weights into it.

    Args:
        encoder_embed_dim (int): Patch embedding dimension of the encoder.
        encoder_depth (int): Number of transformer blocks.
        encoder_num_heads (int): Number of attention heads per block.
        encoder_global_attn_indexes: Indexes of the blocks that use global attention.
        checkpoint: Optional path or file-like object accepted by ``torch.load``
            holding a state dict for the encoder. If None, no weights are loaded.
        image_size (int): Input image size.

    Returns:
        ImageEncoderViT: The constructed encoder, with the checkpoint weights
        loaded when a checkpoint was given.
    """
    prompt_embed_dim = 256
    vit_patch_size = 16
    image_encoder = ImageEncoderViT(
        depth=encoder_depth,
        embed_dim=encoder_embed_dim,
        img_size=image_size,
        mlp_ratio=4,
        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
        num_heads=encoder_num_heads,
        patch_size=vit_patch_size,
        qkv_bias=True,
        use_rel_pos=True,
        global_attn_indexes=encoder_global_attn_indexes,
        window_size=14,
        out_chans=prompt_embed_dim,
    )

    if checkpoint is not None:
        # map_location="cpu" lets a checkpoint that was saved from GPU tensors be
        # loaded on a machine without that device; load_state_dict then copies
        # the values into the encoder's own parameters.
        # NOTE: torch.load unpickles the file — only load checkpoints from a
        # trusted source.
        state_dict = torch.load(checkpoint, map_location="cpu")

        image_encoder.load_state_dict(state_dict, strict=True)
    return image_encoder
@@ -0,0 +1,237 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # This file is part of the Vary project, originally located at:
4
+ # https://github.com/Ucas-HaoranWei/Vary-toy/blob/main/Vary-master/vary/model/vary_opt.py
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ from transformers import (
24
+ AutoConfig,
25
+ AutoModelForCausalLM,
26
+ OPTConfig,
27
+ OPTForCausalLM,
28
+ OPTModel,
29
+ )
30
+ from transformers.modeling_outputs import (
31
+ BaseModelOutputWithPast,
32
+ CausalLMOutputWithPast,
33
+ )
34
+
35
+ from docling_ibm_models.code_formula_model.models.sam import build_sam_vit_b
36
+
37
+
38
class SamOptConfig(OPTConfig):
    """OPT configuration extended with the three settings read by SamOPTModel."""

    model_type = "sam_opt"

    def __init__(
        self,
        sam_image_size=1024,
        sam_mm_projector_in=1024,
        sam_mm_projector_out=768,
        **kwargs,
    ):
        """
        Args:
            sam_image_size (int): Image size passed to the vision tower builder.
            sam_mm_projector_in (int): Input width of the multi-modal projector.
            sam_mm_projector_out (int): Output width of the multi-modal projector.
            **kwargs: Forwarded unchanged to OPTConfig.
        """
        # Let the base config consume the standard OPT options first.
        super().__init__(**kwargs)
        sam_settings = (
            ("sam_image_size", sam_image_size),
            ("sam_mm_projector_in", sam_mm_projector_in),
            ("sam_mm_projector_out", sam_mm_projector_out),
        )
        for attr_name, attr_value in sam_settings:
            setattr(self, attr_name, attr_value)
52
+
53
+
54
class SamOPTModel(OPTModel):
    """OPT model whose input embeddings are spliced with projected vision-tower features."""

    config_class = SamOptConfig

    def __init__(self, config: OPTConfig):
        """Build the OPT backbone, the vision tower and the linear multi-modal projector."""
        super(SamOPTModel, self).__init__(config)
        self.vision_tower = build_sam_vit_b(image_size=config.sam_image_size)

        self.mm_projector = nn.Linear(
            config.sam_mm_projector_in, config.sam_mm_projector_out
        )

    def embed_tokens(self, x):
        """Look up token embeddings through the model's input embedding layer."""
        return self.get_input_embeddings()(x)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: torch.FloatTensor = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Embed the tokens, overwrite the image placeholder span with image
        features, and run the parent OPT forward on the resulting embeddings.

        NOTE(review): ``input_ids.shape`` is read unconditionally below, so this
        method appears to require ``input_ids`` even when ``inputs_embeds`` is
        supplied — confirm against callers.
        """

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        vision_tower = getattr(self, "vision_tower", None)
        # Falls back to -1 when the config defines no "im_start_token".
        im_start_token = getattr(self.config, "im_start_token", -1)

        # Images are encoded unless exactly one token is fed outside training
        # (presumably the cached single-token decoding step — confirm).
        if input_ids.shape[1] != 1 or self.training:
            with torch.set_grad_enabled(self.training):
                image_features = vision_tower(images)
                # Flatten the trailing dimensions and put that axis before channels.
                image_features = image_features.flatten(2).permute(0, 2, 1)
                image_features = self.mm_projector(image_features)

            new_input_embeds = []
            for cur_input_ids, cur_input_embeds, cur_image_features in zip(
                input_ids, inputs_embeds, image_features
            ):
                # .item() requires exactly one im_start_token match in the sequence.
                image_start_token_position = torch.where(
                    cur_input_ids == im_start_token
                )[0].item()

                cur_image_features = cur_image_features.to(
                    device=cur_input_embeds.device
                )
                num_patches = cur_image_features.shape[0]
                # Keep everything up to and including the start token, insert the
                # image features, then resume after the num_patches positions
                # they replace.
                cur_input_embeds = torch.cat(
                    (
                        cur_input_embeds[: image_start_token_position + 1],
                        cur_image_features,
                        cur_input_embeds[
                            image_start_token_position + num_patches + 1 :
                        ],
                    ),
                    dim=0,
                )

                new_input_embeds.append(cur_input_embeds)

            inputs_embeds = torch.stack(new_input_embeds, dim=0)

        # The parent receives embeddings only; input_ids is deliberately None.
        return super(SamOPTModel, self).forward(
            input_ids=None,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
130
+
131
+
132
class SamOPTForCausalLM(OPTForCausalLM):
    """Causal language model head on top of SamOPTModel, with an extra ``images`` input."""

    config_class = SamOptConfig

    def __init__(self, config):
        """Create the SamOPTModel backbone and a bias-free linear LM head."""
        # super(OPTForCausalLM, self) starts the lookup after OPTForCausalLM in
        # the MRO, so OPTForCausalLM.__init__ itself is not run here.
        super(OPTForCausalLM, self).__init__(config)
        self.model = SamOPTModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()

    def get_model(self):
        """Return the wrapped SamOPTModel."""
        return self.model

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """
        Run the backbone and project its hidden states to vocabulary logits.

        Only input_ids, past_key_values, attention_mask, inputs_embeds, use_cache,
        output_attentions, output_hidden_states, images and return_dict are
        forwarded to the backbone; the remaining parameters (token_type_ids,
        position_ids, head_mask, encoder_*, labels) are accepted but unused in
        this method. No loss is computed: the returned ``loss`` is always None.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )

        outputs = self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            images=images,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states).contiguous()

        # NOTE(review): attribute access on `outputs` assumes the backbone
        # returned a ModelOutput rather than a plain tuple — confirm the
        # return_dict=False path is never used.
        return CausalLMOutputWithPast(
            loss=None,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
    ):
        """
        Assemble the keyword arguments for one generation step.

        When past_key_values is truthy, input_ids, token_type_ids and derived
        position_ids are cut down to their last position. The ``images`` entry
        from kwargs is passed through on every step.
        """
        token_type_ids = kwargs.get("token_type_ids", None)
        if past_key_values:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # Derive positions from the mask: running count of attended tokens
            # minus one, with masked-out positions overwritten by 1.
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            # Caller-supplied position_ids are discarded on this branch.
            position_ids = None

        # inputs_embeds is only used when past_key_values is None (first step).
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "images": kwargs.get("images", None),
            }
        )
        return model_inputs
234
+
235
+
236
+ AutoConfig.register("sam_opt", SamOptConfig)
237
+ AutoModelForCausalLM.register(SamOptConfig, SamOPTForCausalLM)
@@ -0,0 +1,31 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+ from PIL import Image
6
+ from torchvision.transforms import functional as F
7
+ from transformers import AutoImageProcessor
8
+ from transformers.image_processing_utils import ImageProcessingMixin
9
+
10
+
11
class SamOptImageProcessor(ImageProcessingMixin):
    """Image processor that resizes a PIL image, converts it to a tensor and normalizes it."""

    def __init__(self, size=(1024, 1024), mean=None, std=None, **kwargs):
        """
        Parameters
        ----------
        size : sequence of int, optional
            Target size passed to the resize step, by default (1024, 1024).
        mean : sequence of float, optional
            Per-channel mean used for normalization. If both ``mean`` and ``std``
            are None, normalization is skipped.
        std : sequence of float, optional
            Per-channel standard deviation used for normalization.
        **kwargs
            Forwarded to ``ImageProcessingMixin``.
        """
        super().__init__(**kwargs)
        self.size = size
        self.mean = mean
        self.std = std

    def __call__(self, image):
        """
        Convert a PIL image into a model-ready tensor.

        Parameters
        ----------
        image : PIL.Image.Image
            The image to process.

        Returns
        -------
        torch.Tensor
            The resized image as a tensor, normalized when ``mean`` and ``std``
            are configured.

        Raises
        ------
        ValueError
            If ``image`` is not a PIL Image, or if only one of ``mean`` / ``std``
            is configured.
        """
        if not isinstance(image, Image.Image):
            raise ValueError("Input must be a PIL Image")

        image = F.resize(image, self.size)
        image = F.to_tensor(image)

        # The constructor defaults leave mean/std unset; normalize cannot work
        # with None, so treat "both unset" as "no normalization".
        if self.mean is None and self.std is None:
            return image
        if self.mean is None or self.std is None:
            raise ValueError("Both mean and std must be provided for normalization")

        image = F.normalize(image, mean=self.mean, std=self.std)

        return image
29
+
30
+
31
+ AutoImageProcessor.register(SamOptImageProcessor, SamOptImageProcessor)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-ibm-models
3
- Version: 3.1.2
3
+ Version: 3.2.0
4
4
  Summary: This package contains the AI models used by the Docling PDF conversion package
5
5
  License: MIT
6
6
  Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
@@ -1,3 +1,7 @@
1
+ docling_ibm_models/code_formula_model/code_formula_predictor.py,sha256=vU18PzmG77htQFEabS2nKbJqNikbWk_BDaA7sqKQuqc,7358
2
+ docling_ibm_models/code_formula_model/models/sam.py,sha256=6MXf1ae_wRWJ4b1luISWXBRKyoQie7YbpY-qwq1OJJA,17841
3
+ docling_ibm_models/code_formula_model/models/sam_opt.py,sha256=qQjmZZgInmKWBp8qcpYZjR2pr5jzjpYRp404RcsJyZM,8333
4
+ docling_ibm_models/code_formula_model/models/sam_opt_image_processor.py,sha256=rA06J4vCK3s9qgfDreJJCcIYUyJzihBk0kHPskfUPGc,868
1
5
  docling_ibm_models/layoutmodel/layout_predictor.py,sha256=ArVgs7FBOiu23TC-JoybcaTp7F7a4BgYC8uRVxTgx4E,5681
2
6
  docling_ibm_models/tableformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
7
  docling_ibm_models/tableformer/common.py,sha256=2zgGZBFf4fXytEaXrZR2NU6FWdX2kxO0DHlGZmuvpNQ,3230
@@ -22,7 +26,7 @@ docling_ibm_models/tableformer/utils/app_profiler.py,sha256=Pb7o1zcikKXh7ninaNt4
22
26
  docling_ibm_models/tableformer/utils/mem_monitor.py,sha256=ycZ07fUBVVKKLTVGF54jGPDM2aTkKuZWk1kMbOS0wwQ,6353
23
27
  docling_ibm_models/tableformer/utils/torch_utils.py,sha256=uN0rK9mSXy1ewBnBnILrWebJhhVU4N-XJZBqNiLJwlQ,8893
24
28
  docling_ibm_models/tableformer/utils/utils.py,sha256=8Bxf1rEn977lFbY9NX0r5xh9PvxIRipQZX_EZW92XfA,10980
25
- docling_ibm_models-3.1.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
26
- docling_ibm_models-3.1.2.dist-info/METADATA,sha256=AamN7IRNfa5y0El3uhHEQsjLMLCdrK51qJTlTrC87XE,7347
27
- docling_ibm_models-3.1.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
28
- docling_ibm_models-3.1.2.dist-info/RECORD,,
29
+ docling_ibm_models-3.2.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
30
+ docling_ibm_models-3.2.0.dist-info/METADATA,sha256=3XWPmwMvKxWm_9mq4_LZV3ffVRN-e_le6WGI2gfOZww,7347
31
+ docling_ibm_models-3.2.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
32
+ docling_ibm_models-3.2.0.dist-info/RECORD,,