docling-2.34.0-py3-none-any.whl → docling-2.36.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. docling/backend/xml/jats_backend.py +0 -0
  2. docling/cli/main.py +48 -18
  3. docling/datamodel/accelerator_options.py +68 -0
  4. docling/datamodel/base_models.py +10 -8
  5. docling/datamodel/document.py +7 -2
  6. docling/datamodel/pipeline_options.py +29 -161
  7. docling/datamodel/pipeline_options_vlm_model.py +81 -0
  8. docling/datamodel/vlm_model_specs.py +144 -0
  9. docling/document_converter.py +5 -0
  10. docling/models/api_vlm_model.py +1 -1
  11. docling/models/base_ocr_model.py +2 -1
  12. docling/models/code_formula_model.py +6 -11
  13. docling/models/document_picture_classifier.py +6 -11
  14. docling/models/easyocr_model.py +1 -2
  15. docling/models/layout_model.py +22 -17
  16. docling/models/ocr_mac_model.py +1 -1
  17. docling/models/page_preprocessing_model.py +11 -6
  18. docling/models/picture_description_api_model.py +1 -1
  19. docling/models/picture_description_base_model.py +1 -1
  20. docling/models/picture_description_vlm_model.py +7 -22
  21. docling/models/rapid_ocr_model.py +1 -2
  22. docling/models/table_structure_model.py +6 -12
  23. docling/models/tesseract_ocr_cli_model.py +1 -1
  24. docling/models/tesseract_ocr_model.py +1 -1
  25. docling/models/utils/__init__.py +0 -0
  26. docling/models/utils/hf_model_download.py +40 -0
  27. docling/models/vlm_models_inline/__init__.py +0 -0
  28. docling/models/vlm_models_inline/hf_transformers_model.py +194 -0
  29. docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py} +56 -44
  30. docling/pipeline/standard_pdf_pipeline.py +69 -57
  31. docling/pipeline/vlm_pipeline.py +228 -61
  32. docling/utils/accelerator_utils.py +17 -2
  33. docling/utils/model_downloader.py +13 -12
  34. {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/METADATA +54 -55
  35. {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/RECORD +48 -41
  36. {docling-2.34.0.dist-info → docling-2.36.0.dist-info}/WHEEL +2 -1
  37. docling-2.36.0.dist-info/entry_points.txt +6 -0
  38. docling-2.36.0.dist-info/top_level.txt +1 -0
  39. docling/models/hf_vlm_model.py +0 -182
  40. docling-2.34.0.dist-info/entry_points.txt +0 -7
  41. {docling-2.34.0.dist-info → docling-2.36.0.dist-info/licenses}/LICENSE +0 -0
docling/models/vlm_models_inline/hf_transformers_model.py (new file)
@@ -0,0 +1,194 @@
+ import importlib.metadata
+ import logging
+ import time
+ from collections.abc import Iterable
+ from pathlib import Path
+ from typing import Any, Optional
+
+ from docling.datamodel.accelerator_options import (
+     AcceleratorOptions,
+ )
+ from docling.datamodel.base_models import Page, VlmPrediction
+ from docling.datamodel.document import ConversionResult
+ from docling.datamodel.pipeline_options_vlm_model import (
+     InlineVlmOptions,
+     TransformersModelType,
+ )
+ from docling.models.base_model import BasePageModel
+ from docling.models.utils.hf_model_download import (
+     HuggingFaceModelDownloadMixin,
+ )
+ from docling.utils.accelerator_utils import decide_device
+ from docling.utils.profiling import TimeRecorder
+
+ _log = logging.getLogger(__name__)
+
+
+ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
+     def __init__(
+         self,
+         enabled: bool,
+         artifacts_path: Optional[Path],
+         accelerator_options: AcceleratorOptions,
+         vlm_options: InlineVlmOptions,
+     ):
+         self.enabled = enabled
+
+         self.vlm_options = vlm_options
+
+         if self.enabled:
+             import torch
+             from transformers import (
+                 AutoModel,
+                 AutoModelForCausalLM,
+                 AutoModelForVision2Seq,
+                 AutoProcessor,
+                 BitsAndBytesConfig,
+                 GenerationConfig,
+             )
+
+             transformers_version = importlib.metadata.version("transformers")
+             if (
+                 self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct"
+                 and transformers_version >= "4.52.0"
+             ):
+                 raise NotImplementedError(
+                     f"Phi 4 only works with transformers<4.52.0 but you have {transformers_version=}. Please downgrage running pip install -U 'transformers<4.52.0'."
+                 )
+
+             self.device = decide_device(
+                 accelerator_options.device,
+                 supported_devices=vlm_options.supported_devices,
+             )
+             _log.debug(f"Available device for VLM: {self.device}")
+
+             self.use_cache = vlm_options.use_kv_cache
+             self.max_new_tokens = vlm_options.max_new_tokens
+             self.temperature = vlm_options.temperature
+
+             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
+             if artifacts_path is None:
+                 artifacts_path = self.download_models(self.vlm_options.repo_id)
+             elif (artifacts_path / repo_cache_folder).exists():
+                 artifacts_path = artifacts_path / repo_cache_folder
+
+             self.param_quantization_config: Optional[BitsAndBytesConfig] = None
+             if vlm_options.quantized:
+                 self.param_quantization_config = BitsAndBytesConfig(
+                     load_in_8bit=vlm_options.load_in_8bit,
+                     llm_int8_threshold=vlm_options.llm_int8_threshold,
+                 )
+
+             model_cls: Any = AutoModel
+             if (
+                 self.vlm_options.transformers_model_type
+                 == TransformersModelType.AUTOMODEL_CAUSALLM
+             ):
+                 model_cls = AutoModelForCausalLM
+             elif (
+                 self.vlm_options.transformers_model_type
+                 == TransformersModelType.AUTOMODEL_VISION2SEQ
+             ):
+                 model_cls = AutoModelForVision2Seq
+
+             self.processor = AutoProcessor.from_pretrained(
+                 artifacts_path,
+                 trust_remote_code=vlm_options.trust_remote_code,
+             )
+             self.vlm_model = model_cls.from_pretrained(
+                 artifacts_path,
+                 device_map=self.device,
+                 _attn_implementation=(
+                     "flash_attention_2"
+                     if self.device.startswith("cuda")
+                     and accelerator_options.cuda_use_flash_attention2
+                     else "eager"
+                 ),
+                 trust_remote_code=vlm_options.trust_remote_code,
+             )
+
+             # Load generation config
+             self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
+
+     def __call__(
+         self, conv_res: ConversionResult, page_batch: Iterable[Page]
+     ) -> Iterable[Page]:
+         for page in page_batch:
+             assert page._backend is not None
+             if not page._backend.is_valid():
+                 yield page
+             else:
+                 with TimeRecorder(conv_res, "vlm"):
+                     assert page.size is not None
+
+                     hi_res_image = page.get_image(scale=self.vlm_options.scale)
+
+                     # Define prompt structure
+                     prompt = self.formulate_prompt()
+
+                     inputs = self.processor(
+                         text=prompt, images=[hi_res_image], return_tensors="pt"
+                     ).to(self.device)
+
+                     start_time = time.time()
+                     # Call model to generate:
+                     generated_ids = self.vlm_model.generate(
+                         **inputs,
+                         max_new_tokens=self.max_new_tokens,
+                         use_cache=self.use_cache,
+                         temperature=self.temperature,
+                         generation_config=self.generation_config,
+                         **self.vlm_options.extra_generation_config,
+                     )
+
+                     generation_time = time.time() - start_time
+                     generated_texts = self.processor.batch_decode(
+                         generated_ids[:, inputs["input_ids"].shape[1] :],
+                         skip_special_tokens=False,
+                     )[0]
+
+                     num_tokens = len(generated_ids[0])
+                     _log.debug(
+                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                     )
+                     page.predictions.vlm_response = VlmPrediction(
+                         text=generated_texts,
+                         generation_time=generation_time,
+                     )
+
+                 yield page
+
+     def formulate_prompt(self) -> str:
+         """Formulate a prompt for the VLM."""
+
+         if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+             _log.debug("Using specialized prompt for Phi-4")
+             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
+
+             user_prompt = "<|user|>"
+             assistant_prompt = "<|assistant|>"
+             prompt_suffix = "<|end|>"
+
+             prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+
+             return prompt
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": "This is a page from a document.",
+                     },
+                     {"type": "image"},
+                     {"type": "text", "text": self.vlm_options.prompt},
+                 ],
+             }
+         ]
+         prompt = self.processor.apply_chat_template(
+             messages, add_generation_prompt=False
+         )
+         return prompt
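The new model above is driven entirely by `InlineVlmOptions`. For context, a minimal sketch of how a caller typically selects a transformers-backed VLM conversion, assuming docling's documented VLM pipeline entry points (`VlmPipelineOptions`, `VlmPipeline`, `PdfFormatOption`) and the `SMOLDOCLING_TRANSFORMERS` preset presumably exported by the new `vlm_model_specs.py`; none of these names are shown verbatim in this diff, so treat them as assumptions:

# Hedged sketch: wiring an inline VLM into a conversion (names assumed,
# not confirmed by this diff).
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.SMOLDOCLING_TRANSFORMERS,  # assumed preset name
)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("example.pdf")  # hypothetical input file
print(result.document.export_to_markdown())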
docling/models/{hf_mlx_model.py → vlm_models_inline/mlx_model.py}
@@ -4,29 +4,34 @@ from collections.abc import Iterable
  from pathlib import Path
  from typing import Optional
 
- from docling.datamodel.base_models import Page, VlmPrediction
- from docling.datamodel.document import ConversionResult
- from docling.datamodel.pipeline_options import (
+ from docling.datamodel.accelerator_options import (
      AcceleratorOptions,
-     HuggingFaceVlmOptions,
  )
+ from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
+ from docling.datamodel.document import ConversionResult
+ from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
  from docling.models.base_model import BasePageModel
+ from docling.models.utils.hf_model_download import (
+     HuggingFaceModelDownloadMixin,
+ )
  from docling.utils.profiling import TimeRecorder
 
  _log = logging.getLogger(__name__)
 
 
- class HuggingFaceMlxModel(BasePageModel):
+ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
      def __init__(
          self,
          enabled: bool,
          artifacts_path: Optional[Path],
          accelerator_options: AcceleratorOptions,
-         vlm_options: HuggingFaceVlmOptions,
+         vlm_options: InlineVlmOptions,
      ):
          self.enabled = enabled
 
          self.vlm_options = vlm_options
+         self.max_tokens = vlm_options.max_new_tokens
+         self.temperature = vlm_options.temperature
 
          if self.enabled:
              try:
@@ -39,42 +44,24 @@ class HuggingFaceMlxModel(BasePageModel):
                  )
 
              repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
              self.apply_chat_template = apply_chat_template
              self.stream_generate = stream_generate
 
              # PARAMETERS:
              if artifacts_path is None:
-                 artifacts_path = self.download_models(self.vlm_options.repo_id)
+                 artifacts_path = self.download_models(
+                     self.vlm_options.repo_id,
+                 )
              elif (artifacts_path / repo_cache_folder).exists():
                  artifacts_path = artifacts_path / repo_cache_folder
 
-             self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+             self.param_question = vlm_options.prompt
 
              ## Load the model
              self.vlm_model, self.processor = load(artifacts_path)
              self.config = load_config(artifacts_path)
 
-     @staticmethod
-     def download_models(
-         repo_id: str,
-         local_dir: Optional[Path] = None,
-         force: bool = False,
-         progress: bool = False,
-     ) -> Path:
-         from huggingface_hub import snapshot_download
-         from huggingface_hub.utils import disable_progress_bars
-
-         if not progress:
-             disable_progress_bars()
-         download_path = snapshot_download(
-             repo_id=repo_id,
-             force_download=force,
-             local_dir=local_dir,
-             # revision="v0.0.1",
-         )
-
-         return Path(download_path)
-
      def __call__(
          self, conv_res: ConversionResult, page_batch: Iterable[Page]
      ) -> Iterable[Page]:
@@ -83,12 +70,10 @@ class HuggingFaceMlxModel(BasePageModel):
              if not page._backend.is_valid():
                  yield page
              else:
-                 with TimeRecorder(conv_res, "vlm"):
+                 with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                      assert page.size is not None
 
-                     hi_res_image = page.get_image(scale=2.0)  # 144dpi
-                     # hi_res_image = page.get_image(scale=1.0)  # 72dpi
-
+                     hi_res_image = page.get_image(scale=self.vlm_options.scale)
                      if hi_res_image is not None:
                          im_width, im_height = hi_res_image.size
 
@@ -104,16 +89,45 @@ class HuggingFaceMlxModel(BasePageModel):
                      )
 
                      start_time = time.time()
+                     _log.debug("start generating ...")
+
                      # Call model to generate:
+                     tokens: list[VlmPredictionToken] = []
+
                      output = ""
                      for token in self.stream_generate(
                          self.vlm_model,
                          self.processor,
                          prompt,
                          [hi_res_image],
-                         max_tokens=4096,
+                         max_tokens=self.max_tokens,
                          verbose=False,
+                         temp=self.temperature,
                      ):
+                         if len(token.logprobs.shape) == 1:
+                             tokens.append(
+                                 VlmPredictionToken(
+                                     text=token.text,
+                                     token=token.token,
+                                     logprob=token.logprobs[token.token],
+                                 )
+                             )
+                         elif (
+                             len(token.logprobs.shape) == 2
+                             and token.logprobs.shape[0] == 1
+                         ):
+                             tokens.append(
+                                 VlmPredictionToken(
+                                     text=token.text,
+                                     token=token.token,
+                                     logprob=token.logprobs[0, token.token],
+                                 )
+                             )
+                         else:
+                             _log.warning(
+                                 f"incompatible shape for logprobs: {token.logprobs.shape}"
+                             )
+
                          output += token.text
                          if "</doctag>" in token.text:
                              break
@@ -121,15 +135,13 @@ class HuggingFaceMlxModel(BasePageModel):
                      generation_time = time.time() - start_time
                      page_tags = output
 
-                     _log.debug(f"Generation time {generation_time:.2f} seconds.")
-
-                     # inference_time = time.time() - start_time
-                     # tokens_per_second = num_tokens / generation_time
-                     # print("")
-                     # print(f"Page Inference Time: {inference_time:.2f} seconds")
-                     # print(f"Total tokens on page: {num_tokens:.2f}")
-                     # print(f"Tokens/sec: {tokens_per_second:.2f}")
-                     # print("")
-                     page.predictions.vlm_response = VlmPrediction(text=page_tags)
+                     _log.debug(
+                         f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
+                     )
+                     page.predictions.vlm_response = VlmPrediction(
+                         text=page_tags,
+                         generation_time=generation_time,
+                         generated_tokens=tokens,
+                     )
 
                  yield page
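The MLX backend now captures per-token log-probabilities in `VlmPrediction.generated_tokens` (each entry a `VlmPredictionToken` with `text`, `token`, and `logprob`, as in the hunk above). A short hedged sketch of how a caller might fold those into a rough page-level confidence after conversion; the `page.predictions.vlm_response` attribute path is taken from the code above, while the helper itself is purely illustrative:

import math

# Hedged sketch: summarize per-token logprobs into a naive page confidence
# (geometric-mean token probability). Returns NaN when no tokens were recorded.
def page_vlm_confidence(page) -> float:
    pred = page.predictions.vlm_response  # VlmPrediction set by the models above
    if pred is None or not pred.generated_tokens:
        return float("nan")
    mean_logprob = sum(t.logprob for t in pred.generated_tokens) / len(
        pred.generated_tokens
    )
    return math.exp(mean_logprob)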
docling/pipeline/standard_pdf_pipeline.py
@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 
  from docling.backend.abstract_backend import AbstractDocumentBackend
  from docling.backend.pdf_backend import PdfDocumentBackend
- from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
+ from docling.datamodel.base_models import AssembledUnit, Page
  from docling.datamodel.document import ConversionResult
  from docling.datamodel.pipeline_options import PdfPipelineOptions
  from docling.datamodel.settings import settings
@@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline):
                  "When defined, it must point to a folder containing all models required by the pipeline."
              )
 
-         self.keep_images = (
-             self.pipeline_options.generate_page_images
-             or self.pipeline_options.generate_picture_images
-             or self.pipeline_options.generate_table_images
-         )
+         with warnings.catch_warnings():  # deprecated generate_table_images
+             warnings.filterwarnings("ignore", category=DeprecationWarning)
+             self.keep_images = (
+                 self.pipeline_options.generate_page_images
+                 or self.pipeline_options.generate_picture_images
+                 or self.pipeline_options.generate_table_images
+             )
 
          self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
 
@@ -210,64 +212,74 @@
                      )
 
              # Generate images of the requested element types
-             if (
-                 self.pipeline_options.generate_picture_images
-                 or self.pipeline_options.generate_table_images
-             ):
-                 scale = self.pipeline_options.images_scale
-                 for element, _level in conv_res.document.iterate_items():
-                     if not isinstance(element, DocItem) or len(element.prov) == 0:
-                         continue
-                     if (
-                         isinstance(element, PictureItem)
-                         and self.pipeline_options.generate_picture_images
-                     ) or (
-                         isinstance(element, TableItem)
-                         and self.pipeline_options.generate_table_images
-                     ):
-                         page_ix = element.prov[0].page_no - 1
-                         page = next(
-                             (p for p in conv_res.pages if p.page_no == page_ix),
-                             cast("Page", None),
-                         )
-                         assert page is not None
-                         assert page.size is not None
-                         assert page.image is not None
-
-                         crop_bbox = (
-                             element.prov[0]
-                             .bbox.scaled(scale=scale)
-                             .to_top_left_origin(page_height=page.size.height * scale)
-                         )
-
-                         cropped_im = page.image.crop(crop_bbox.as_tuple())
-                         element.image = ImageRef.from_pil(
-                             cropped_im, dpi=int(72 * scale)
-                         )
+             with warnings.catch_warnings():  # deprecated generate_table_images
+                 warnings.filterwarnings("ignore", category=DeprecationWarning)
+                 if (
+                     self.pipeline_options.generate_picture_images
+                     or self.pipeline_options.generate_table_images
+                 ):
+                     scale = self.pipeline_options.images_scale
+                     for element, _level in conv_res.document.iterate_items():
+                         if not isinstance(element, DocItem) or len(element.prov) == 0:
+                             continue
+                         if (
+                             isinstance(element, PictureItem)
+                             and self.pipeline_options.generate_picture_images
+                         ) or (
+                             isinstance(element, TableItem)
+                             and self.pipeline_options.generate_table_images
+                         ):
+                             page_ix = element.prov[0].page_no - 1
+                             page = next(
+                                 (p for p in conv_res.pages if p.page_no == page_ix),
+                                 cast("Page", None),
+                             )
+                             assert page is not None
+                             assert page.size is not None
+                             assert page.image is not None
+
+                             crop_bbox = (
+                                 element.prov[0]
+                                 .bbox.scaled(scale=scale)
+                                 .to_top_left_origin(
+                                     page_height=page.size.height * scale
+                                 )
+                             )
+
+                             cropped_im = page.image.crop(crop_bbox.as_tuple())
+                             element.image = ImageRef.from_pil(
+                                 cropped_im, dpi=int(72 * scale)
+                             )
 
          # Aggregate confidence values for document:
          if len(conv_res.pages) > 0:
-             conv_res.confidence.layout_score = float(
-                 np.nanmean(
-                     [c.layout_score for c in conv_res.confidence.pages.values()]
+             with warnings.catch_warnings():
+                 warnings.filterwarnings(
+                     "ignore",
+                     category=RuntimeWarning,
+                     message="Mean of empty slice|All-NaN slice encountered",
                  )
-             )
-             conv_res.confidence.parse_score = float(
-                 np.nanquantile(
-                     [c.parse_score for c in conv_res.confidence.pages.values()],
-                     q=0.1,  # parse score should relate to worst 10% of pages.
+                 conv_res.confidence.layout_score = float(
+                     np.nanmean(
+                         [c.layout_score for c in conv_res.confidence.pages.values()]
+                     )
                  )
-             )
-             conv_res.confidence.table_score = float(
-                 np.nanmean(
-                     [c.table_score for c in conv_res.confidence.pages.values()]
+                 conv_res.confidence.parse_score = float(
+                     np.nanquantile(
+                         [c.parse_score for c in conv_res.confidence.pages.values()],
+                         q=0.1,  # parse score should relate to worst 10% of pages.
+                     )
                  )
-             )
-             conv_res.confidence.ocr_score = float(
-                 np.nanmean(
-                     [c.ocr_score for c in conv_res.confidence.pages.values()]
+                 conv_res.confidence.table_score = float(
+                     np.nanmean(
+                         [c.table_score for c in conv_res.confidence.pages.values()]
+                     )
+                 )
+                 conv_res.confidence.ocr_score = float(
+                     np.nanmean(
+                         [c.ocr_score for c in conv_res.confidence.pages.values()]
+                     )
                  )
-             )
 
          return conv_res
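The aggregation above deliberately tolerates pages whose scores are NaN: `np.nanmean` over an all-NaN slice returns NaN but emits a `RuntimeWarning`, which is why the block is wrapped in a warnings filter, and `q=0.1` ties the parse score to the worst 10% of pages rather than the average. A standalone illustration of that behaviour with toy scores (not docling API, values chosen only for the example):

import warnings
import numpy as np

# Toy per-page scores with missing values, mimicking the aggregation above.
layout_scores = [0.92, np.nan, 0.88]
parse_scores = [0.99, 0.97, 0.40, 0.95, np.nan]

with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore",
        category=RuntimeWarning,
        message="Mean of empty slice|All-NaN slice encountered",
    )
    layout_score = float(np.nanmean(layout_scores))  # 0.90; NaN pages are ignored
    parse_score = float(np.nanquantile(parse_scores, q=0.1))  # ~0.57, pulled toward the worst pages
    empty_score = float(np.nanmean([np.nan]))  # nan; the RuntimeWarning is suppressed

print(layout_score, parse_score, empty_score)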