docling 2.39.0__py3-none-any.whl → 2.41.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. docling/backend/docling_parse_v4_backend.py +14 -4
  2. docling/backend/msexcel_backend.py +33 -14
  3. docling/datamodel/asr_model_specs.py +6 -6
  4. docling/datamodel/base_models.py +23 -1
  5. docling/datamodel/layout_model_specs.py +90 -0
  6. docling/datamodel/pipeline_options.py +18 -0
  7. docling/datamodel/pipeline_options_vlm_model.py +11 -3
  8. docling/models/api_vlm_model.py +7 -5
  9. docling/models/base_ocr_model.py +6 -2
  10. docling/models/document_picture_classifier.py +12 -13
  11. docling/models/layout_model.py +27 -18
  12. docling/models/picture_description_vlm_model.py +16 -11
  13. docling/models/plugins/defaults.py +9 -9
  14. docling/models/readingorder_model.py +8 -1
  15. docling/models/table_structure_model.py +3 -1
  16. docling/models/tesseract_ocr_model.py +10 -4
  17. docling/models/vlm_models_inline/hf_transformers_model.py +39 -20
  18. docling/models/vlm_models_inline/mlx_model.py +5 -3
  19. docling/pipeline/standard_pdf_pipeline.py +3 -3
  20. docling/pipeline/vlm_pipeline.py +1 -0
  21. docling/utils/accelerator_utils.py +2 -2
  22. docling/utils/layout_postprocessor.py +7 -2
  23. docling/utils/model_downloader.py +2 -1
  24. docling/utils/ocr_utils.py +1 -1
  25. docling/utils/orientation.py +22 -28
  26. {docling-2.39.0.dist-info → docling-2.41.0.dist-info}/METADATA +5 -5
  27. {docling-2.39.0.dist-info → docling-2.41.0.dist-info}/RECORD +31 -30
  28. {docling-2.39.0.dist-info → docling-2.41.0.dist-info}/WHEEL +0 -0
  29. {docling-2.39.0.dist-info → docling-2.41.0.dist-info}/entry_points.txt +0 -0
  30. {docling-2.39.0.dist-info → docling-2.41.0.dist-info}/licenses/LICENSE +0 -0
  31. {docling-2.39.0.dist-info → docling-2.41.0.dist-info}/top_level.txt +0 -0
docling/backend/docling_parse_v4_backend.py

@@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
 
     def unload(self):
         super().unload()
-        self.dp_doc.unload()
-        with pypdfium2_lock:
-            self._pdoc.close()
-            self._pdoc = None
+        # Unload docling-parse document first
+        if self.dp_doc is not None:
+            self.dp_doc.unload()
+            self.dp_doc = None
+
+        # Then close pypdfium2 document with proper locking
+        if self._pdoc is not None:
+            with pypdfium2_lock:
+                try:
+                    self._pdoc.close()
+                except Exception:
+                    # Ignore cleanup errors
+                    pass
+            self._pdoc = None
docling/backend/msexcel_backend.py

@@ -337,10 +337,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         # Collect the data within the bounds
         data = []
         visited_cells: set[tuple[int, int]] = set()
-        for ri in range(start_row, max_row + 1):
-            for rj in range(start_col, max_col + 1):
-                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing
-
+        for ri, row in enumerate(
+            sheet.iter_rows(
+                min_row=start_row + 1,  # start_row is 0-based but iter_rows is 1-based
+                max_row=max_row + 1,
+                min_col=start_col + 1,
+                max_col=max_col + 1,
+                values_only=False,
+            ),
+            start_row,
+        ):
+            for rj, cell in enumerate(row, start_col):
                 # Check if the cell belongs to a merged range
                 row_span = 1
                 col_span = 1
@@ -397,10 +404,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         """
         max_row: int = start_row
 
-        while max_row < sheet.max_row - 1:
-            # Get the cell value or check if it is part of a merged cell
-            cell = sheet.cell(row=max_row + 2, column=start_col + 1)
-
+        for ri, (cell,) in enumerate(
+            sheet.iter_rows(
+                min_row=start_row + 2,
+                max_row=sheet.max_row,
+                min_col=start_col + 1,
+                max_col=start_col + 1,
+                values_only=False,
+            ),
+            start_row + 1,
+        ):
             # Check if the cell is part of a merged range
             merged_range = next(
                 (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
@@ -414,7 +427,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             if merged_range:
                 max_row = max(max_row, merged_range.max_row - 1)
             else:
-                max_row += 1
+                max_row = ri
 
         return max_row
 
@@ -433,10 +446,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         """
        max_col: int = start_col
 
-        while max_col < sheet.max_column - 1:
-            # Get the cell value or check if it is part of a merged cell
-            cell = sheet.cell(row=start_row + 1, column=max_col + 2)
-
+        for rj, (cell,) in enumerate(
+            sheet.iter_cols(
+                min_row=start_row + 1,
+                max_row=start_row + 1,
+                min_col=start_col + 2,
+                max_col=sheet.max_column,
+                values_only=False,
+            ),
+            start_col + 1,
+        ):
             # Check if the cell is part of a merged range
             merged_range = next(
                 (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
@@ -450,7 +469,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             if merged_range:
                 max_col = max(max_col, merged_range.max_col - 1)
             else:
-                max_col += 1
+                max_col = rj
 
         return max_col
 
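Note on the msexcel_backend change above: the per-cell sheet.cell(...) lookups are replaced with openpyxl's bulk iterators. A minimal standalone sketch of the same pattern, mapping 0-based table bounds onto openpyxl's 1-based iter_rows (workbook path and bounds are hypothetical):

    from openpyxl import load_workbook

    wb = load_workbook("example.xlsx")  # hypothetical workbook
    sheet = wb.active

    start_row, start_col, max_row, max_col = 0, 0, 4, 2  # 0-based bounds

    # iter_rows/iter_cols are 1-based, so the 0-based bounds are shifted by one;
    # enumerate(..., start) keeps ri/rj in the backend's 0-based coordinates.
    for ri, row in enumerate(
        sheet.iter_rows(
            min_row=start_row + 1,
            max_row=max_row + 1,
            min_col=start_col + 1,
            max_col=max_col + 1,
            values_only=False,
        ),
        start_row,
    ):
        for rj, cell in enumerate(row, start_col):
            print(ri, rj, cell.coordinate, cell.value)

Iterating whole rows at once avoids repeated per-cell lookups, which is the motivation behind the backend change.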
docling/datamodel/asr_model_specs.py

@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
docling/datamodel/base_models.py

@@ -12,6 +12,7 @@ from docling_core.types.doc import (
     Size,
     TableCell,
 )
+from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_core.types.io import (
     DocumentStream,
@@ -19,7 +20,14 @@ from docling_core.types.io import (
 
 # DO NOT REMOVE; explicitly exposed from this location
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, Field, computed_field
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    FieldSerializationInfo,
+    computed_field,
+    field_serializer,
+)
 
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
@@ -142,6 +150,10 @@ class Cluster(BaseModel):
     cells: List[TextCell] = []
     children: List["Cluster"] = []  # Add child cluster support
 
+    @field_serializer("confidence")
+    def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
+        return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
+
 
 class BasePageElement(BaseModel):
     label: DocItemLabel
@@ -194,6 +206,16 @@ class FigureElement(BasePageElement):
     predicted_class: Optional[str] = None
     confidence: Optional[float] = None
 
+    @field_serializer("confidence")
+    def _serialize(
+        self, value: Optional[float], info: FieldSerializationInfo
+    ) -> Optional[float]:
+        return (
+            round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
+            if value is not None
+            else None
+        )
+
 
 class FigureClassificationPrediction(BaseModel):
     figure_count: int = 0
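The field_serializer hooks added above round confidence values only when a precision is passed through the Pydantic serialization context. A self-contained sketch of that mechanism (generic context key; docling-core's actual PydanticSerCtxKey/round_pydantic_float helpers are assumed to behave along these lines):

    from pydantic import BaseModel, FieldSerializationInfo, field_serializer

    class ToyCluster(BaseModel):
        confidence: float = 0.987654

        @field_serializer("confidence")
        def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
            # Round only when a precision is supplied via the serialization context.
            precision = (info.context or {}).get("confid_prec")
            return round(value, precision) if precision is not None else value

    print(ToyCluster().model_dump())                            # {'confidence': 0.987654}
    print(ToyCluster().model_dump(context={"confid_prec": 2}))  # {'confidence': 0.99}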
docling/datamodel/layout_model_specs.py (new file)

@@ -0,0 +1,90 @@
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+from pydantic import BaseModel
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+
+_log = logging.getLogger(__name__)
+
+
+class LayoutModelConfig(BaseModel):
+    name: str
+    repo_id: str
+    revision: str
+    model_path: str
+    supported_devices: list[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    @property
+    def model_repo_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+# HuggingFace Layout Models
+
+# Default Docling Layout Model
+DOCLING_LAYOUT_V2 = LayoutModelConfig(
+    name="docling_layout_v2",
+    repo_id="ds4sd/docling-layout-old",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON = LayoutModelConfig(
+    name="docling_layout_heron",
+    repo_id="ds4sd/docling-layout-heron",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
+    name="docling_layout_heron_101",
+    repo_id="ds4sd/docling-layout-heron-101",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
+    name="docling_layout_egret_medium",
+    repo_id="ds4sd/docling-layout-egret-medium",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
+    name="docling_layout_egret_large",
+    repo_id="ds4sd/docling-layout-egret-large",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
+    name="docling_layout_egret_xlarge",
+    repo_id="ds4sd/docling-layout-egret-xlarge",
+    revision="main",
+    model_path="",
+)
+
+# Example for a hypothetical alternative model
+# ALTERNATIVE_LAYOUT = LayoutModelConfig(
+#     name="alternative_layout",
+#     repo_id="someorg/alternative-layout",
+#     revision="main",
+#     model_path="model_artifacts/layout_alt",
+# )
+
+
+class LayoutModelType(str, Enum):
+    DOCLING_LAYOUT_V2 = "docling_layout_v2"
+    DOCLING_LAYOUT_HERON = "docling_layout_heron"
+    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
+    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
+    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
+    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
+    # ALTERNATIVE_LAYOUT = "alternative_layout"
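From the new module above, each spec is a plain Pydantic model whose local cache folder name is derived from its repo id; a short usage sketch:

    from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON, LayoutModelType

    print(DOCLING_LAYOUT_HERON.repo_id)            # ds4sd/docling-layout-heron
    print(DOCLING_LAYOUT_HERON.model_repo_folder)  # ds4sd--docling-layout-heron
    LayoutModelType("docling_layout_heron") is LayoutModelType.DOCLING_LAYOUT_HERON  # True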
docling/datamodel/pipeline_options.py

@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -15,6 +16,15 @@ from docling.datamodel import asr_model_specs
 
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.layout_model_specs import (
+    DOCLING_LAYOUT_EGRET_LARGE,
+    DOCLING_LAYOUT_EGRET_MEDIUM,
+    DOCLING_LAYOUT_EGRET_XLARGE,
+    DOCLING_LAYOUT_HERON,
+    DOCLING_LAYOUT_HERON_101,
+    DOCLING_LAYOUT_V2,
+    LayoutModelConfig,
+)
 from docling.datamodel.pipeline_options_asr_model import (
     InlineAsrOptions,
 )
@@ -265,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
 
 
+class LayoutOptions(BaseModel):
+    """Options for layout processing."""
+
+    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+    model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
+
+
 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None
@@ -289,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
     )
+    layout_options: LayoutOptions = LayoutOptions()
 
     images_scale: float = 1.0
     generate_page_images: bool = False
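Tying the new LayoutOptions into PdfPipelineOptions, a hedged configuration sketch that uses only fields visible in this diff (the converter wiring itself is unchanged and omitted here):

    from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
    from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions

    pipeline_options = PdfPipelineOptions(
        layout_options=LayoutOptions(
            model_spec=DOCLING_LAYOUT_HERON,   # pick a non-default layout checkpoint
            create_orphan_clusters=False,      # skip clusters for unassigned cells
        )
    )
    # pipeline_options is then passed to a DocumentConverter via the usual
    # PdfFormatOption(pipeline_options=...) plumbing.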
docling/datamodel/pipeline_options_vlm_model.py

@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 
 
 class ResponseFormat(str, Enum):
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
 
     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat
 
     torch_dtype: Optional[str] = None
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
 
docling/models/api_vlm_model.py

@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
 
         self.timeout = self.vlm_options.timeout
         self.concurrency = self.vlm_options.concurrency
-        self.prompt_content = (
-            f"This is a page from a document.\n{self.vlm_options.prompt}"
-        )
         self.params = {
             **self.vlm_options.params,
-            "temperature": 0,
+            "temperature": self.vlm_options.temperature,
         }
 
     def __call__(
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    prompt = self.vlm_options.prompt
+
                 page_tags = api_image_request(
                     image=hi_res_image,
-                    prompt=self.prompt_content,
+                    prompt=prompt,
                     url=self.vlm_options.url,
                     timeout=self.timeout,
                     headers=self.vlm_options.headers,
docling/models/base_ocr_model.py

@@ -3,14 +3,13 @@ import logging
 from abc import abstractmethod
 from collections.abc import Iterable
 from pathlib import Path
-from typing import List, Optional, Type
+from typing import TYPE_CHECKING, List, Optional, Type
 
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import TextCell
 from PIL import Image, ImageDraw
 from rtree import index
-from scipy.ndimage import binary_dilation, find_objects, label
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page
@@ -31,11 +30,16 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
         options: OcrOptions,
         accelerator_options: AcceleratorOptions,
     ):
+        # Make sure any delay/error from import occurs on ocr model init and not first use
+        from scipy.ndimage import binary_dilation, find_objects, label
+
         self.enabled = enabled
         self.options = options
 
     # Computes the optimum amount and coordinates of rectangles to OCR on a given page
     def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
+        from scipy.ndimage import binary_dilation, find_objects, label
+
         BITMAP_COVERAGE_TRESHOLD = 0.75
         assert page.size is not None
 
docling/models/document_picture_classifier.py

@@ -14,7 +14,8 @@ from PIL import Image
 from pydantic import BaseModel
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.models.base_model import BaseEnrichmentModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
     kind: Literal["document_picture_classifier"] = "document_picture_classifier"
 
 
-class DocumentPictureClassifier(BaseEnrichmentModel):
+class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
     """
     A model for classifying pictures in documents.
 
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def __call__(
         self,
         doc: DoclingDocument,
-        element_batch: Iterable[NodeItem],
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
     ) -> Iterable[NodeItem]:
         """
         Processes a batch of elements and enriches them with classification predictions.
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         ----------
         doc : DoclingDocument
             The document containing the elements to be processed.
-        element_batch : Iterable[NodeItem]
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
             A batch of pictures to classify.
 
         Returns
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         """
         if not self.enabled:
             for element in element_batch:
-                yield element
+                yield element.item
             return
 
         images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
-            assert isinstance(el, PictureItem)
-            elements.append(el)
-            img = el.get_image(doc)
-            assert img is not None
-            images.append(img)
+            assert isinstance(el.item, PictureItem)
+            elements.append(el.item)
+            images.append(el.image)
 
         outputs = self.document_picture_classifier.predict(images)
 
-        for element, output in zip(elements, outputs):
-            element.annotations.append(
+        for item, output in zip(elements, outputs):
+            item.annotations.append(
                 PictureClassificationData(
                     provenance="DocumentPictureClassifier",
                     predicted_classes=[
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                 )
             )
 
-            yield element
+            yield item
docling/models/layout_model.py

@@ -7,12 +7,13 @@ from typing import Optional
 
 import numpy as np
 from docling_core.types.doc import DocItemLabel
-from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import Image
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
+from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
@@ -25,9 +26,6 @@ _log = logging.getLogger(__name__)
 
 
 class LayoutModel(BasePageModel):
-    _model_repo_folder = "ds4sd--docling-models"
-    _model_path = "model_artifacts/layout"
-
     TEXT_ELEM_LABELS = [
         DocItemLabel.TEXT,
         DocItemLabel.FOOTNOTE,
@@ -49,28 +47,38 @@ class LayoutModel(BasePageModel):
     CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
 
     def __init__(
-        self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
+        self,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        options: LayoutOptions,
     ):
+        from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
+
+        self.options = options
+
         device = decide_device(accelerator_options.device)
+        layout_model_config = options.model_spec
+        model_repo_folder = layout_model_config.model_repo_folder
+        model_path = layout_model_config.model_path
 
         if artifacts_path is None:
-            artifacts_path = self.download_models() / self._model_path
+            artifacts_path = (
+                self.download_models(layout_model_config=layout_model_config)
+                / model_path
+            )
         else:
-            # will become the default in the future
-            if (artifacts_path / self._model_repo_folder).exists():
-                artifacts_path = (
-                    artifacts_path / self._model_repo_folder / self._model_path
-                )
-            elif (artifacts_path / self._model_path).exists():
+            if (artifacts_path / model_repo_folder).exists():
+                artifacts_path = artifacts_path / model_repo_folder / model_path
+            elif (artifacts_path / model_path).exists():
                 warnings.warn(
                     "The usage of artifacts_path containing directly "
-                    f"{self._model_path} is deprecated. Please point "
+                    f"{model_path} is deprecated. Please point "
                     "the artifacts_path to the parent containing "
-                    f"the {self._model_repo_folder} folder.",
+                    f"the {model_repo_folder} folder.",
                     DeprecationWarning,
                     stacklevel=3,
                 )
-                artifacts_path = artifacts_path / self._model_path
+                artifacts_path = artifacts_path / model_path
 
         self.layout_predictor = LayoutPredictor(
             artifact_path=str(artifacts_path),
@@ -83,10 +91,11 @@ class LayoutModel(BasePageModel):
         local_dir: Optional[Path] = None,
         force: bool = False,
         progress: bool = False,
+        layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
     ) -> Path:
         return download_hf_model(
-            repo_id="ds4sd/docling-models",
-            revision="v2.2.0",
+            repo_id=layout_model_config.repo_id,
+            revision=layout_model_config.revision,
             local_dir=local_dir,
             force=force,
             progress=progress,
@@ -176,7 +185,7 @@ class LayoutModel(BasePageModel):
         # Apply postprocessing
 
         processed_clusters, processed_cells = LayoutPostprocessor(
-            page, clusters
+            page, clusters, self.options
        ).postprocess()
         # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
 
docling/models/picture_description_vlm_model.py

@@ -1,3 +1,4 @@
+import threading
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional, Type, Union
@@ -15,6 +16,9 @@ from docling.models.utils.hf_model_download import (
 )
 from docling.utils.accelerator_utils import decide_device
 
+# Global lock for model initialization to prevent threading issues
+_model_init_lock = threading.Lock()
+
 
 class PictureDescriptionVlmModel(
     PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
@@ -57,17 +61,18 @@ class PictureDescriptionVlmModel(
         )
 
         # Initialize processor and model
-        self.processor = AutoProcessor.from_pretrained(artifacts_path)
-        self.model = AutoModelForVision2Seq.from_pretrained(
-            artifacts_path,
-            torch_dtype=torch.bfloat16,
-            _attn_implementation=(
-                "flash_attention_2"
-                if self.device.startswith("cuda")
-                and accelerator_options.cuda_use_flash_attention2
-                else "eager"
-            ),
-        ).to(self.device)
+        with _model_init_lock:
+            self.processor = AutoProcessor.from_pretrained(artifacts_path)
+            self.model = AutoModelForVision2Seq.from_pretrained(
+                artifacts_path,
+                torch_dtype=torch.bfloat16,
+                _attn_implementation=(
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
+                ),
+            ).to(self.device)
 
         self.provenance = f"{self.options.repo_id}"
 
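The _model_init_lock introduced above serializes Hugging Face processor/model construction across threads. The general pattern, reduced to a standalone sketch (toy loader, not docling's class):

    import threading

    _init_lock = threading.Lock()
    _model = None

    def get_model(path: str):
        global _model
        # from_pretrained-style initialization is not guaranteed to be thread-safe,
        # so only one thread may run it; later callers reuse the cached instance.
        with _init_lock:
            if _model is None:
                _model = f"model loaded from {path}"  # placeholder for the real load
        return _model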
docling/models/plugins/defaults.py

@@ -1,13 +1,10 @@
-from docling.models.easyocr_model import EasyOcrModel
-from docling.models.ocr_mac_model import OcrMacModel
-from docling.models.picture_description_api_model import PictureDescriptionApiModel
-from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
-from docling.models.rapid_ocr_model import RapidOcrModel
-from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
-from docling.models.tesseract_ocr_model import TesseractOcrModel
-
-
 def ocr_engines():
+    from docling.models.easyocr_model import EasyOcrModel
+    from docling.models.ocr_mac_model import OcrMacModel
+    from docling.models.rapid_ocr_model import RapidOcrModel
+    from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
+    from docling.models.tesseract_ocr_model import TesseractOcrModel
+
     return {
         "ocr_engines": [
             EasyOcrModel,
@@ -20,6 +17,9 @@ def ocr_engines():
 
 
 def picture_description():
+    from docling.models.picture_description_api_model import PictureDescriptionApiModel
+    from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
+
     return {
         "picture_description": [
             PictureDescriptionVlmModel,
docling/models/readingorder_model.py

@@ -12,6 +12,9 @@ from docling_core.types.doc import (
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
+from docling_ibm_models.list_item_normalizer.list_marker_processor import (
+    ListItemMarkerProcessor,
+)
 from docling_ibm_models.reading_order.reading_order_rb import (
     PageElement as ReadingOrderPageElement,
     ReadingOrderPredictor,
@@ -40,6 +43,7 @@ class ReadingOrderModel:
     def __init__(self, options: ReadingOrderOptions):
         self.options = options
         self.ro_model = ReadingOrderPredictor()
+        self.list_item_processor = ListItemMarkerProcessor()
 
     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
@@ -92,7 +96,8 @@ class ReadingOrderModel:
             )
             if c_label == DocItemLabel.LIST_ITEM:
                 # TODO: Infer if this is a numbered or a bullet list item
-                doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
+                l_item = doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
+                self.list_item_processor.process_list_item(l_item)
             elif c_label == DocItemLabel.SECTION_HEADER:
                 doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
             else:
@@ -301,6 +306,8 @@ class ReadingOrderModel:
                     new_item = out_doc.add_list_item(
                         text=cap_text, enumerated=False, prov=prov, parent=current_list
                     )
+                    self.list_item_processor.process_list_item(new_item)
+
                 elif label == DocItemLabel.SECTION_HEADER:
                     current_list = None
 
docling/models/table_structure_model.py

@@ -10,7 +10,6 @@ from docling_core.types.doc.page import (
     BoundingRectangle,
     TextCellUnit,
 )
-from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
 
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -70,6 +69,9 @@ class TableStructureModel(BasePageModel):
 
         # Third Party
         import docling_ibm_models.tableformer.common as c
+        from docling_ibm_models.tableformer.data_management.tf_predictor import (
+            TFPredictor,
+        )
 
         device = decide_device(accelerator_options.device)
 
docling/models/tesseract_ocr_model.py

@@ -144,7 +144,10 @@ class TesseractOcrModel(BaseOcrModel):
 
                     local_reader = self.reader
                     self.osd_reader.SetImage(high_res_image)
+
+                    doc_orientation = 0
                     osd = self.osd_reader.DetectOrientationScript()
+
                     # No text, or Orientation and Script detection failure
                     if osd is None:
                         _log.error(
@@ -158,11 +161,14 @@ class TesseractOcrModel(BaseOcrModel):
                         # to OCR in the hope OCR will succeed while OSD failed
                         if self._is_auto:
                             continue
-                    doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
-                    if doc_orientation != 0:
-                        high_res_image = high_res_image.rotate(
-                            -doc_orientation, expand=True
+                    else:
+                        doc_orientation = parse_tesseract_orientation(
+                            osd["orient_deg"]
                         )
+                        if doc_orientation != 0:
+                            high_res_image = high_res_image.rotate(
+                                -doc_orientation, expand=True
+                            )
                     if self._is_auto:
                         script = osd["script_name"]
                         script = map_tesseract_script(script)
docling/models/vlm_models_inline/hf_transformers_model.py

@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
         from transformers import (
             AutoModel,
             AutoModelForCausalLM,
+            AutoModelForImageTextToText,
             AutoModelForVision2Seq,
             AutoProcessor,
             BitsAndBytesConfig,
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             == TransformersModelType.AUTOMODEL_VISION2SEQ
         ):
             model_cls = AutoModelForVision2Seq
+        elif (
+            self.vlm_options.transformers_model_type
+            == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+        ):
+            model_cls = AutoModelForImageTextToText
 
         self.processor = AutoProcessor.from_pretrained(
             artifacts_path,
@@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     )
 
                     # Define prompt structure
-                    prompt = self.formulate_prompt()
+                    if callable(self.vlm_options.prompt):
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        user_prompt = self.vlm_options.prompt
+                    prompt = self.formulate_prompt(user_prompt)
 
                     inputs = self.processor(
                         text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
 
                     yield page
 
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""
 
-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
 
@@ -173,25 +187,30 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             assistant_prompt = "<|assistant|>"
             prompt_suffix = "<|end|>"
 
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
 
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "This is a page from a document.",
-                    },
-                    {"type": "image"},
-                    {"type": "text", "text": self.vlm_options.prompt},
-                ],
-            }
-        ]
-        prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=False
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
         )
-        return prompt
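Given the TransformersPromptStyle handling above, a raw (non-chat-templated) prompt can be requested on inline VLM options roughly as follows; repo id and prompt text are placeholders, and the remaining fields follow the options class shown earlier in this diff:

    from docling.datamodel.pipeline_options_vlm_model import (
        InferenceFramework,
        InlineVlmOptions,
        ResponseFormat,
        TransformersPromptStyle,
    )

    vlm_options = InlineVlmOptions(
        repo_id="someorg/some-vlm",        # placeholder model id
        prompt="<doc_to_markdown>",        # passed through verbatim with RAW style
        inference_framework=InferenceFramework.TRANSFORMERS,
        transformers_prompt_style=TransformersPromptStyle.RAW,
        response_format=ResponseFormat.MARKDOWN,
    )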
docling/models/vlm_models_inline/mlx_model.py

@@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         elif (artifacts_path / repo_cache_folder).exists():
             artifacts_path = artifacts_path / repo_cache_folder
 
-        self.param_question = vlm_options.prompt
-
         ## Load the model
         self.vlm_model, self.processor = load(artifacts_path)
         self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
 
+                    if callable(self.vlm_options.prompt):
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        user_prompt = self.vlm_options.prompt
                     prompt = self.apply_chat_template(
-                        self.processor, self.config, self.param_question, num_images=1
+                        self.processor, self.config, user_prompt, num_images=1
                     )
 
                     start_time = time.time()
docling/pipeline/standard_pdf_pipeline.py

@@ -10,6 +10,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.layout_model_specs import LayoutModelConfig
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
@@ -36,9 +37,6 @@ _log = logging.getLogger(__name__)
 
 
 class StandardPdfPipeline(PaginatedPipeline):
-    _layout_model_path = LayoutModel._model_path
-    _table_model_path = TableStructureModel._model_path
-
     def __init__(self, pipeline_options: PdfPipelineOptions):
         super().__init__(pipeline_options)
         self.pipeline_options: PdfPipelineOptions
@@ -80,6 +78,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             LayoutModel(
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
+                options=pipeline_options.layout_options,
             ),
             # Table structure model
             TableStructureModel(
@@ -128,6 +127,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_classification
             or self.pipeline_options.do_picture_description
         ):
             self.keep_backend = True
docling/pipeline/vlm_pipeline.py

@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
docling/utils/accelerator_utils.py

@@ -1,8 +1,6 @@
 import logging
 from typing import List, Optional
 
-import torch
-
 from docling.datamodel.accelerator_options import AcceleratorDevice
 
 _log = logging.getLogger(__name__)
@@ -18,6 +16,8 @@ def decide_device(
     1. AUTO: Check for the best available device on the system.
     2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
     """
+    import torch
+
     device = "cpu"
 
     has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
docling/utils/layout_postprocessor.py

@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
 from rtree import index
 
 from docling.datamodel.base_models import BoundingBox, Cluster, Page
+from docling.datamodel.pipeline_options import LayoutOptions
 
 _log = logging.getLogger(__name__)
 
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
         DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
     }
 
-    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+    def __init__(
+        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+    ) -> None:
         """Initialize processor with page and clusters."""
+
         self.cells = page.cells
         self.page = page
         self.page_size = page.size
         self.all_clusters = clusters
+        self.options = options
         self.regular_clusters = [
             c for c in clusters if c.label not in self.SPECIAL_TYPES
         ]
@@ -267,7 +272,7 @@ class LayoutPostprocessor:
 
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)
-        if unassigned:
+        if unassigned and self.options.create_orphan_clusters:
             next_id = max((c.id for c in self.all_clusters), default=0) + 1
             orphan_clusters = []
             for i, cell in enumerate(unassigned):
docling/utils/model_downloader.py

@@ -2,6 +2,7 @@ import logging
 from pathlib import Path
 from typing import Optional
 
+from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2
 from docling.datamodel.pipeline_options import (
     granite_picture_description,
     smolvlm_picture_description,
@@ -46,7 +47,7 @@ def download_models(
     if with_layout:
         _log.info("Downloading layout model...")
         LayoutModel.download_models(
-            local_dir=output_dir / LayoutModel._model_repo_folder,
+            local_dir=output_dir / DOCLING_LAYOUT_V2.model_repo_folder,
             force=force,
             progress=progress,
         )
docling/utils/ocr_utils.py

@@ -41,7 +41,7 @@ def tesseract_box_to_bounding_rectangle(
     im_size: Tuple[int, int],
 ) -> BoundingRectangle:
     # box is in the top, left, height, width format, top left coordinates
-    rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
+    rect = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
     rect = BoundingRectangle(
         r_x0=rect.r_x0 / scale,
         r_y0=rect.r_y0 / scale,
docling/utils/orientation.py

@@ -14,43 +14,36 @@ def rotate_bounding_box(
     # coordinate system. Then other corners are found rotating counterclockwise
     bbox = bbox.to_top_left_origin(im_size[1])
     left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
-    im_h, im_w = im_size
+    im_w, im_h = im_size
     angle = angle % 360
     if angle == 0:
-        r_x0 = left
-        r_y0 = top + height
-        r_x1 = r_x0 + width
-        r_y1 = r_y0
-        r_x2 = r_x0 + width
-        r_y2 = r_y0 - height
-        r_x3 = r_x0
-        r_y3 = r_y0 - height
+        return BoundingRectangle.from_bounding_box(bbox)
     elif angle == 90:
-        r_x0 = im_w - (top + height)
-        r_y0 = left
+        r_x0 = top + height
+        r_y0 = im_w - left
         r_x1 = r_x0
-        r_y1 = r_y0 + width
-        r_x2 = r_x0 + height
-        r_y2 = r_y0 + width
-        r_x3 = r_x0
-        r_y3 = r_y0 + width
+        r_y1 = r_y0 - width
+        r_x2 = r_x1 - height
+        r_y2 = r_y1
+        r_x3 = r_x2
+        r_y3 = r_y0
     elif angle == 180:
-        r_x0 = im_h - left
-        r_y0 = im_w - (top + height)
+        r_x0 = im_w - left
+        r_y0 = im_h - (top + height)
         r_x1 = r_x0 - width
         r_y1 = r_y0
-        r_x2 = r_x0 - width
-        r_y2 = r_y0 + height
+        r_x2 = r_x1
+        r_y2 = r_y1 + height
         r_x3 = r_x0
-        r_y3 = r_y0 + height
+        r_y3 = r_y2
     elif angle == 270:
-        r_x0 = top + height
-        r_y0 = im_h - left
+        r_x0 = im_h - (top + height)
+        r_y0 = left
         r_x1 = r_x0
-        r_y1 = r_y0 - width
-        r_x2 = r_x0 - height
-        r_y2 = r_y0 - width
-        r_x3 = r_x0 - height
+        r_y1 = r_y0 + width
+        r_x2 = r_x1 + height
+        r_y2 = r_y1
+        r_x3 = r_x2
         r_y3 = r_y0
     else:
         msg = (
@@ -58,7 +51,7 @@ def rotate_bounding_box(
             f" {sorted(CLIPPED_ORIENTATIONS)}"
         )
         raise ValueError(msg)
-    return BoundingRectangle(
+    rectangle = BoundingRectangle(
         r_x0=r_x0,
         r_y0=r_y0,
         r_x1=r_x1,
@@ -69,3 +62,4 @@ def rotate_bounding_box(
         r_y3=r_y3,
         coord_origin=CoordOrigin.TOPLEFT,
     )
+    return rectangle
{docling-2.39.0.dist-info → docling-2.41.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.39.0
+Version: 2.41.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -26,9 +26,9 @@ Requires-Python: <4.0,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
-Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
-Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
+Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
 Requires-Dist: docling-parse<5.0.0,>=4.0.0
+Requires-Dist: docling-ibm-models<4,>=3.6.0
 Requires-Dist: filetype<2.0.0,>=1.2.0
 Requires-Dist: pypdfium2<5.0.0,>=4.30.0
 Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -57,12 +57,12 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
 Provides-Extra: vlm
 Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
-Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
+Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
 Provides-Extra: asr
-Requires-Dist: openai-whisper>=20240930; extra == "asr"
+Requires-Dist: openai-whisper>=20250625; extra == "asr"
 Dynamic: license-file
 
 <p align="center">
{docling-2.39.0.dist-info → docling-2.41.0.dist-info}/RECORD

@@ -8,10 +8,10 @@ docling/backend/asciidoc_backend.py,sha256=RDNLrPJHxROiM7-NQdZn3DdvAyiPAndbSWcZo
 docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
 docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
 docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
-docling/backend/docling_parse_v4_backend.py,sha256=7tQvpCwpYoq98PNszDkrXaFhy5eWmQqMP4RjWWPLPgw,6197
+docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
 docling/backend/html_backend.py,sha256=Z959dzqYQO2pPE4xgPRxC5MR9j3nFGtiD6_F_osQ2iI,20670
 docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
-docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
+docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
 docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
 docling/backend/msword_backend.py,sha256=7mzPCF4bGWZPst5ntoV3aSxH5WUu2nBP-l8lgQT3tdw,44544
 docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
@@ -33,66 +33,67 @@ docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
-docling/datamodel/asr_model_specs.py,sha256=L7ETXsUKVbPsVcPLhEIMxQjd4UzMGZBVsy74CLsZBkU,2181
-docling/datamodel/base_models.py,sha256=67o1ptOTT8tW7i-g6gM2JKEX_1CDbmKEMQ_B9ZYM2z0,11156
+docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
+docling/datamodel/base_models.py,sha256=9FslHkGUNmBp264LpLL_2JTfDAdaikldYs3SiQOHb5A,11828
 docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
-docling/datamodel/pipeline_options.py,sha256=7mKv1IThXYpu3osggp_Y2h7E5C8nbxJLQXS7JJPMvYQ,9479
+docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
+docling/datamodel/pipeline_options.py,sha256=aMwpbyEMbAC-xGJnjQp8iw2ocpSU4eiD8D73gHf7T4U,10033
 docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
-docling/datamodel/pipeline_options_vlm_model.py,sha256=rtDMVtKFZbgQD269w8FvHMXEhdRBrsA4rVYk6A-M-b4,2063
+docling/datamodel/pipeline_options_vlm_model.py,sha256=z-pUqwRA8nJp6C3SEXZLem2zvSYdgavaAVYa8wkAIZY,2400
 docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
 docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/api_vlm_model.py,sha256=GDDJGAia4SJjK7JFxsZy5oEU-D8yQo8Kb3NvvPbTvT0,2820
+docling/models/api_vlm_model.py,sha256=foBvzaWeHFH1t-VdvRWLdiXiiofhvhjvHqRI0eNA_3w,2923
 docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
-docling/models/base_ocr_model.py,sha256=HtrefTq9Zy4UnUInMchPv0tbobiA7CQU5VUauKJD7IU,8006
+docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
 docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
-docling/models/document_picture_classifier.py,sha256=fkJLV7pMy3v6iNwOzVb6zdBU1dGtBM1ARHLIRPfoAG4,6124
+docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
 docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
-docling/models/layout_model.py,sha256=EJuRXW0rFdnNPS5AifdEsr812EATUqAioeMCVjw8PL0,8460
+docling/models/layout_model.py,sha256=8bfLVKCS2A-ePTQK-T4M2K_Ah-jUVj71YOtwZvZ9rsU,8825
 docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
 docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
 docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
 docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
 docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
-docling/models/picture_description_vlm_model.py,sha256=7LeCx9ZdPxsmWJ468OtxCdAkH48A1HD0iwH9cs_7-1Q,3800
+docling/models/picture_description_vlm_model.py,sha256=nAUt-eZOX2GvaCiV2BJO7VppxUbP7udVIF4oe_sEYXo,4000
 docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
-docling/models/readingorder_model.py,sha256=QHb5fyiqmxU8lg4W5IzdukqHPh6V7rNw_57O4-z-Az4,14615
-docling/models/table_structure_model.py,sha256=dQf6u_zn5fHCkHzmTwYfCbRtZCBddsyAM0WNVBUUQzk,12473
+docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
+docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
 docling/models/tesseract_ocr_cli_model.py,sha256=qcM3-n7Z_dm1CGBhVUcNr2XT41iXnU32zk4RqKHBl9I,12775
-docling/models/tesseract_ocr_model.py,sha256=9DPAE7XP7smej7HYhr7mdwpuxSjAcv_GPrYZG3bb1RA,10587
+docling/models/tesseract_ocr_model.py,sha256=GdI5Cjfi87qcehVbM3wdKRvKkl_F9A4bwTUbjXZCJYA,10745
 docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
 docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
 docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
 docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
 docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurHCZjp4,858
+docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8cDucU4,886
 docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
 docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/vlm_models_inline/hf_transformers_model.py,sha256=w9_N4ccjmYYK5yYQou0LSMGaj6gs8l0hULvXbkfYXSQ,7425
-docling/models/vlm_models_inline/mlx_model.py,sha256=qpyi6fGHm0vPqW2yeTsRBKOTTshNJ1LAPbH1SBDp8Y8,5784
+docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6DQMLtCQQW3-YUPDMbgeD2tjfM8vLM,8415
+docling/models/vlm_models_inline/mlx_model.py,sha256=p-H6wG31iVRoOjsqYaCVa4pEzxMP3vzLcsUatMjDJDQ,5948
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
 docling/pipeline/base_pipeline.py,sha256=14yQrDjsojl4RgbBjKFSEfVBYR_sULZfBI1uDzFLi8Y,9331
 docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
-docling/pipeline/standard_pdf_pipeline.py,sha256=2Hqg2wnAXfbZbLUOQrRus8PMEuZ549jR1mfR86-CAB4,12659
-docling/pipeline/vlm_pipeline.py,sha256=IrjDbajCPmUPep_jATKNiABST4tQ8mvpkQz9mtBQ8qQ,15279
+docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
+docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/utils/accelerator_utils.py,sha256=Fww4UiTiuIB91iuPgUZTy-DYpCGRMI8YuCYKhFb0gjA,2905
+docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
 docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
 docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
 docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
-docling/utils/layout_postprocessor.py,sha256=laTPGGj-hv16Zh1TRcn8NK0POKs7d3jeaV1pRR_TjIU,24228
+docling/utils/layout_postprocessor.py,sha256=QuTZZq4LNs1eM_n_2gubVfAuLBMkJiozfs3hp-jUpK4,24399
 docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
-docling/utils/model_downloader.py,sha256=6TDxFOvMRYT8JyYyaQS_wXMJzNga61ImY3sFdks66qM,4004
-docling/utils/ocr_utils.py,sha256=AOaDAHr5S74d-IRVR_LKhKynUTIurAwLJ3wNeY58gPA,2326
-docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,2011
+docling/utils/model_downloader.py,sha256=3vijCsAIVwWqehGBDRxRq7mJ3yRb9-zBsG00iqjqegU,4076
+docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
+docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,1842
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.39.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.39.0.dist-info/METADATA,sha256=RhW8SMq3CPy0AIX67Fvkuk8CaF1slhq22Z-J78qGhHI,10273
-docling-2.39.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling-2.39.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
-docling-2.39.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
-docling-2.39.0.dist-info/RECORD,,
+docling-2.41.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.41.0.dist-info/METADATA,sha256=KYqB0miKX2x2ESNy8tNHdAlyTCONqhwGLR2iag2PcQ0,10274
+docling-2.41.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.41.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.41.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.41.0.dist-info/RECORD,,