docling 2.39.0__tar.gz → 2.40.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. {docling-2.39.0 → docling-2.40.0}/PKG-INFO +3 -3
  2. {docling-2.39.0 → docling-2.40.0}/docling/backend/docling_parse_v4_backend.py +14 -4
  3. {docling-2.39.0 → docling-2.40.0}/docling/backend/msexcel_backend.py +33 -14
  4. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/pipeline_options.py +8 -0
  5. {docling-2.39.0 → docling-2.40.0}/docling/models/base_ocr_model.py +6 -2
  6. {docling-2.39.0 → docling-2.40.0}/docling/models/layout_model.py +10 -3
  7. {docling-2.39.0 → docling-2.40.0}/docling/models/picture_description_vlm_model.py +16 -11
  8. docling-2.40.0/docling/models/plugins/defaults.py +28 -0
  9. {docling-2.39.0 → docling-2.40.0}/docling/models/readingorder_model.py +8 -1
  10. {docling-2.39.0 → docling-2.40.0}/docling/models/table_structure_model.py +3 -1
  11. {docling-2.39.0 → docling-2.40.0}/docling/models/tesseract_ocr_model.py +10 -4
  12. {docling-2.39.0 → docling-2.40.0}/docling/pipeline/standard_pdf_pipeline.py +1 -0
  13. {docling-2.39.0 → docling-2.40.0}/docling/utils/accelerator_utils.py +2 -2
  14. {docling-2.39.0 → docling-2.40.0}/docling/utils/layout_postprocessor.py +7 -2
  15. {docling-2.39.0 → docling-2.40.0}/docling.egg-info/PKG-INFO +3 -3
  16. {docling-2.39.0 → docling-2.40.0}/docling.egg-info/requires.txt +2 -2
  17. {docling-2.39.0 → docling-2.40.0}/pyproject.toml +3 -3
  18. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_docling_parse_v4.py +17 -0
  19. docling-2.39.0/docling/models/plugins/defaults.py +0 -28
  20. {docling-2.39.0 → docling-2.40.0}/LICENSE +0 -0
  21. {docling-2.39.0 → docling-2.40.0}/README.md +0 -0
  22. {docling-2.39.0 → docling-2.40.0}/docling/__init__.py +0 -0
  23. {docling-2.39.0 → docling-2.40.0}/docling/backend/__init__.py +0 -0
  24. {docling-2.39.0 → docling-2.40.0}/docling/backend/abstract_backend.py +0 -0
  25. {docling-2.39.0 → docling-2.40.0}/docling/backend/asciidoc_backend.py +0 -0
  26. {docling-2.39.0 → docling-2.40.0}/docling/backend/csv_backend.py +0 -0
  27. {docling-2.39.0 → docling-2.40.0}/docling/backend/docling_parse_backend.py +0 -0
  28. {docling-2.39.0 → docling-2.40.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  29. {docling-2.39.0 → docling-2.40.0}/docling/backend/docx/__init__.py +0 -0
  30. {docling-2.39.0 → docling-2.40.0}/docling/backend/docx/latex/__init__.py +0 -0
  31. {docling-2.39.0 → docling-2.40.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  32. {docling-2.39.0 → docling-2.40.0}/docling/backend/docx/latex/omml.py +0 -0
  33. {docling-2.39.0 → docling-2.40.0}/docling/backend/html_backend.py +0 -0
  34. {docling-2.39.0 → docling-2.40.0}/docling/backend/json/__init__.py +0 -0
  35. {docling-2.39.0 → docling-2.40.0}/docling/backend/json/docling_json_backend.py +0 -0
  36. {docling-2.39.0 → docling-2.40.0}/docling/backend/md_backend.py +0 -0
  37. {docling-2.39.0 → docling-2.40.0}/docling/backend/mspowerpoint_backend.py +0 -0
  38. {docling-2.39.0 → docling-2.40.0}/docling/backend/msword_backend.py +0 -0
  39. {docling-2.39.0 → docling-2.40.0}/docling/backend/noop_backend.py +0 -0
  40. {docling-2.39.0 → docling-2.40.0}/docling/backend/pdf_backend.py +0 -0
  41. {docling-2.39.0 → docling-2.40.0}/docling/backend/pypdfium2_backend.py +0 -0
  42. {docling-2.39.0 → docling-2.40.0}/docling/backend/xml/__init__.py +0 -0
  43. {docling-2.39.0 → docling-2.40.0}/docling/backend/xml/jats_backend.py +0 -0
  44. {docling-2.39.0 → docling-2.40.0}/docling/backend/xml/uspto_backend.py +0 -0
  45. {docling-2.39.0 → docling-2.40.0}/docling/chunking/__init__.py +0 -0
  46. {docling-2.39.0 → docling-2.40.0}/docling/cli/__init__.py +0 -0
  47. {docling-2.39.0 → docling-2.40.0}/docling/cli/main.py +0 -0
  48. {docling-2.39.0 → docling-2.40.0}/docling/cli/models.py +0 -0
  49. {docling-2.39.0 → docling-2.40.0}/docling/cli/tools.py +0 -0
  50. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/__init__.py +0 -0
  51. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/accelerator_options.py +0 -0
  52. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/asr_model_specs.py +0 -0
  53. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/base_models.py +0 -0
  54. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/document.py +0 -0
  55. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  56. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  57. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/settings.py +0 -0
  58. {docling-2.39.0 → docling-2.40.0}/docling/datamodel/vlm_model_specs.py +0 -0
  59. {docling-2.39.0 → docling-2.40.0}/docling/document_converter.py +0 -0
  60. {docling-2.39.0 → docling-2.40.0}/docling/exceptions.py +0 -0
  61. {docling-2.39.0 → docling-2.40.0}/docling/models/__init__.py +0 -0
  62. {docling-2.39.0 → docling-2.40.0}/docling/models/api_vlm_model.py +0 -0
  63. {docling-2.39.0 → docling-2.40.0}/docling/models/base_model.py +0 -0
  64. {docling-2.39.0 → docling-2.40.0}/docling/models/code_formula_model.py +0 -0
  65. {docling-2.39.0 → docling-2.40.0}/docling/models/document_picture_classifier.py +0 -0
  66. {docling-2.39.0 → docling-2.40.0}/docling/models/easyocr_model.py +0 -0
  67. {docling-2.39.0 → docling-2.40.0}/docling/models/factories/__init__.py +0 -0
  68. {docling-2.39.0 → docling-2.40.0}/docling/models/factories/base_factory.py +0 -0
  69. {docling-2.39.0 → docling-2.40.0}/docling/models/factories/ocr_factory.py +0 -0
  70. {docling-2.39.0 → docling-2.40.0}/docling/models/factories/picture_description_factory.py +0 -0
  71. {docling-2.39.0 → docling-2.40.0}/docling/models/ocr_mac_model.py +0 -0
  72. {docling-2.39.0 → docling-2.40.0}/docling/models/page_assemble_model.py +0 -0
  73. {docling-2.39.0 → docling-2.40.0}/docling/models/page_preprocessing_model.py +0 -0
  74. {docling-2.39.0 → docling-2.40.0}/docling/models/picture_description_api_model.py +0 -0
  75. {docling-2.39.0 → docling-2.40.0}/docling/models/picture_description_base_model.py +0 -0
  76. {docling-2.39.0 → docling-2.40.0}/docling/models/plugins/__init__.py +0 -0
  77. {docling-2.39.0 → docling-2.40.0}/docling/models/rapid_ocr_model.py +0 -0
  78. {docling-2.39.0 → docling-2.40.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  79. {docling-2.39.0 → docling-2.40.0}/docling/models/utils/__init__.py +0 -0
  80. {docling-2.39.0 → docling-2.40.0}/docling/models/utils/hf_model_download.py +0 -0
  81. {docling-2.39.0 → docling-2.40.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  82. {docling-2.39.0 → docling-2.40.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  83. {docling-2.39.0 → docling-2.40.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  84. {docling-2.39.0 → docling-2.40.0}/docling/pipeline/__init__.py +0 -0
  85. {docling-2.39.0 → docling-2.40.0}/docling/pipeline/asr_pipeline.py +0 -0
  86. {docling-2.39.0 → docling-2.40.0}/docling/pipeline/base_pipeline.py +0 -0
  87. {docling-2.39.0 → docling-2.40.0}/docling/pipeline/simple_pipeline.py +0 -0
  88. {docling-2.39.0 → docling-2.40.0}/docling/pipeline/vlm_pipeline.py +0 -0
  89. {docling-2.39.0 → docling-2.40.0}/docling/py.typed +0 -0
  90. {docling-2.39.0 → docling-2.40.0}/docling/utils/__init__.py +0 -0
  91. {docling-2.39.0 → docling-2.40.0}/docling/utils/api_image_request.py +0 -0
  92. {docling-2.39.0 → docling-2.40.0}/docling/utils/export.py +0 -0
  93. {docling-2.39.0 → docling-2.40.0}/docling/utils/glm_utils.py +0 -0
  94. {docling-2.39.0 → docling-2.40.0}/docling/utils/locks.py +0 -0
  95. {docling-2.39.0 → docling-2.40.0}/docling/utils/model_downloader.py +0 -0
  96. {docling-2.39.0 → docling-2.40.0}/docling/utils/ocr_utils.py +0 -0
  97. {docling-2.39.0 → docling-2.40.0}/docling/utils/orientation.py +0 -0
  98. {docling-2.39.0 → docling-2.40.0}/docling/utils/profiling.py +0 -0
  99. {docling-2.39.0 → docling-2.40.0}/docling/utils/utils.py +0 -0
  100. {docling-2.39.0 → docling-2.40.0}/docling/utils/visualization.py +0 -0
  101. {docling-2.39.0 → docling-2.40.0}/docling.egg-info/SOURCES.txt +0 -0
  102. {docling-2.39.0 → docling-2.40.0}/docling.egg-info/dependency_links.txt +0 -0
  103. {docling-2.39.0 → docling-2.40.0}/docling.egg-info/entry_points.txt +0 -0
  104. {docling-2.39.0 → docling-2.40.0}/docling.egg-info/top_level.txt +0 -0
  105. {docling-2.39.0 → docling-2.40.0}/setup.cfg +0 -0
  106. {docling-2.39.0 → docling-2.40.0}/tests/test_asr_pipeline.py +0 -0
  107. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_asciidoc.py +0 -0
  108. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_csv.py +0 -0
  109. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_docling_json.py +0 -0
  110. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_docling_parse.py +0 -0
  111. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_docling_parse_v2.py +0 -0
  112. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_html.py +0 -0
  113. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_jats.py +0 -0
  114. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_markdown.py +0 -0
  115. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_msexcel.py +0 -0
  116. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_msword.py +0 -0
  117. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_patent_uspto.py +0 -0
  118. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_pdfium.py +0 -0
  119. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_pptx.py +0 -0
  120. {docling-2.39.0 → docling-2.40.0}/tests/test_backend_webp.py +0 -0
  121. {docling-2.39.0 → docling-2.40.0}/tests/test_cli.py +0 -0
  122. {docling-2.39.0 → docling-2.40.0}/tests/test_code_formula.py +0 -0
  123. {docling-2.39.0 → docling-2.40.0}/tests/test_data_gen_flag.py +0 -0
  124. {docling-2.39.0 → docling-2.40.0}/tests/test_document_picture_classifier.py +0 -0
  125. {docling-2.39.0 → docling-2.40.0}/tests/test_e2e_conversion.py +0 -0
  126. {docling-2.39.0 → docling-2.40.0}/tests/test_e2e_ocr_conversion.py +0 -0
  127. {docling-2.39.0 → docling-2.40.0}/tests/test_input_doc.py +0 -0
  128. {docling-2.39.0 → docling-2.40.0}/tests/test_interfaces.py +0 -0
  129. {docling-2.39.0 → docling-2.40.0}/tests/test_invalid_input.py +0 -0
  130. {docling-2.39.0 → docling-2.40.0}/tests/test_legacy_format_transform.py +0 -0
  131. {docling-2.39.0 → docling-2.40.0}/tests/test_options.py +0 -0
  132. {docling-2.39.0 → docling-2.40.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.39.0
3
+ Version: 2.40.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,8 +27,8 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
30
- Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
31
30
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
31
+ Requires-Dist: docling-ibm-models<4,>=3.6.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -57,7 +57,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
57
57
  Provides-Extra: vlm
58
58
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
59
59
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
60
- Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
60
+ Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
61
61
  Provides-Extra: rapidocr
62
62
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
63
63
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
187
187
 
188
188
  def unload(self):
189
189
  super().unload()
190
- self.dp_doc.unload()
191
- with pypdfium2_lock:
192
- self._pdoc.close()
193
- self._pdoc = None
190
+ # Unload docling-parse document first
191
+ if self.dp_doc is not None:
192
+ self.dp_doc.unload()
193
+ self.dp_doc = None
194
+
195
+ # Then close pypdfium2 document with proper locking
196
+ if self._pdoc is not None:
197
+ with pypdfium2_lock:
198
+ try:
199
+ self._pdoc.close()
200
+ except Exception:
201
+ # Ignore cleanup errors
202
+ pass
203
+ self._pdoc = None
@@ -337,10 +337,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
337
337
  # Collect the data within the bounds
338
338
  data = []
339
339
  visited_cells: set[tuple[int, int]] = set()
340
- for ri in range(start_row, max_row + 1):
341
- for rj in range(start_col, max_col + 1):
342
- cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
343
-
340
+ for ri, row in enumerate(
341
+ sheet.iter_rows(
342
+ min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
343
+ max_row=max_row + 1,
344
+ min_col=start_col + 1,
345
+ max_col=max_col + 1,
346
+ values_only=False,
347
+ ),
348
+ start_row,
349
+ ):
350
+ for rj, cell in enumerate(row, start_col):
344
351
  # Check if the cell belongs to a merged range
345
352
  row_span = 1
346
353
  col_span = 1
@@ -397,10 +404,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
397
404
  """
398
405
  max_row: int = start_row
399
406
 
400
- while max_row < sheet.max_row - 1:
401
- # Get the cell value or check if it is part of a merged cell
402
- cell = sheet.cell(row=max_row + 2, column=start_col + 1)
403
-
407
+ for ri, (cell,) in enumerate(
408
+ sheet.iter_rows(
409
+ min_row=start_row + 2,
410
+ max_row=sheet.max_row,
411
+ min_col=start_col + 1,
412
+ max_col=start_col + 1,
413
+ values_only=False,
414
+ ),
415
+ start_row + 1,
416
+ ):
404
417
  # Check if the cell is part of a merged range
405
418
  merged_range = next(
406
419
  (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
@@ -414,7 +427,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
414
427
  if merged_range:
415
428
  max_row = max(max_row, merged_range.max_row - 1)
416
429
  else:
417
- max_row += 1
430
+ max_row = ri
418
431
 
419
432
  return max_row
420
433
 
@@ -433,10 +446,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
433
446
  """
434
447
  max_col: int = start_col
435
448
 
436
- while max_col < sheet.max_column - 1:
437
- # Get the cell value or check if it is part of a merged cell
438
- cell = sheet.cell(row=start_row + 1, column=max_col + 2)
439
-
449
+ for rj, (cell,) in enumerate(
450
+ sheet.iter_cols(
451
+ min_row=start_row + 1,
452
+ max_row=start_row + 1,
453
+ min_col=start_col + 2,
454
+ max_col=sheet.max_column,
455
+ values_only=False,
456
+ ),
457
+ start_col + 1,
458
+ ):
440
459
  # Check if the cell is part of a merged range
441
460
  merged_range = next(
442
461
  (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
@@ -450,7 +469,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
450
469
  if merged_range:
451
470
  max_col = max(max_col, merged_range.max_col - 1)
452
471
  else:
453
- max_col += 1
472
+ max_col = rj
454
473
 
455
474
  return max_col
456
475
 
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ from datetime import datetime
2
3
  from enum import Enum
3
4
  from pathlib import Path
4
5
  from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
265
266
  )
266
267
 
267
268
 
269
+ class LayoutOptions(BaseModel):
270
+ """Options for layout processing."""
271
+
272
+ create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
273
+
274
+
268
275
  class AsrPipelineOptions(PipelineOptions):
269
276
  asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
270
277
  artifacts_path: Optional[Union[Path, str]] = None
@@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
289
296
  picture_description_options: PictureDescriptionBaseOptions = (
290
297
  smolvlm_picture_description
291
298
  )
299
+ layout_options: LayoutOptions = LayoutOptions()
292
300
 
293
301
  images_scale: float = 1.0
294
302
  generate_page_images: bool = False
@@ -3,14 +3,13 @@ import logging
3
3
  from abc import abstractmethod
4
4
  from collections.abc import Iterable
5
5
  from pathlib import Path
6
- from typing import List, Optional, Type
6
+ from typing import TYPE_CHECKING, List, Optional, Type
7
7
 
8
8
  import numpy as np
9
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
10
10
  from docling_core.types.doc.page import TextCell
11
11
  from PIL import Image, ImageDraw
12
12
  from rtree import index
13
- from scipy.ndimage import binary_dilation, find_objects, label
14
13
 
15
14
  from docling.datamodel.accelerator_options import AcceleratorOptions
16
15
  from docling.datamodel.base_models import Page
@@ -31,11 +30,16 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
31
30
  options: OcrOptions,
32
31
  accelerator_options: AcceleratorOptions,
33
32
  ):
33
+ # Make sure any delay/error from import occurs on ocr model init and not first use
34
+ from scipy.ndimage import binary_dilation, find_objects, label
35
+
34
36
  self.enabled = enabled
35
37
  self.options = options
36
38
 
37
39
  # Computes the optimum amount and coordinates of rectangles to OCR on a given page
38
40
  def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
41
+ from scipy.ndimage import binary_dilation, find_objects, label
42
+
39
43
  BITMAP_COVERAGE_TRESHOLD = 0.75
40
44
  assert page.size is not None
41
45
 
@@ -7,12 +7,12 @@ from typing import Optional
7
7
 
8
8
  import numpy as np
9
9
  from docling_core.types.doc import DocItemLabel
10
- from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
11
10
  from PIL import Image
12
11
 
13
12
  from docling.datamodel.accelerator_options import AcceleratorOptions
14
13
  from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
15
14
  from docling.datamodel.document import ConversionResult
15
+ from docling.datamodel.pipeline_options import LayoutOptions
16
16
  from docling.datamodel.settings import settings
17
17
  from docling.models.base_model import BasePageModel
18
18
  from docling.models.utils.hf_model_download import download_hf_model
@@ -49,8 +49,15 @@ class LayoutModel(BasePageModel):
49
49
  CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
50
50
 
51
51
  def __init__(
52
- self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
52
+ self,
53
+ artifacts_path: Optional[Path],
54
+ accelerator_options: AcceleratorOptions,
55
+ options: LayoutOptions,
53
56
  ):
57
+ from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
58
+
59
+ self.options = options
60
+
54
61
  device = decide_device(accelerator_options.device)
55
62
 
56
63
  if artifacts_path is None:
@@ -176,7 +183,7 @@ class LayoutModel(BasePageModel):
176
183
  # Apply postprocessing
177
184
 
178
185
  processed_clusters, processed_cells = LayoutPostprocessor(
179
- page, clusters
186
+ page, clusters, self.options
180
187
  ).postprocess()
181
188
  # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
182
189
 
@@ -1,3 +1,4 @@
1
+ import threading
1
2
  from collections.abc import Iterable
2
3
  from pathlib import Path
3
4
  from typing import Optional, Type, Union
@@ -15,6 +16,9 @@ from docling.models.utils.hf_model_download import (
15
16
  )
16
17
  from docling.utils.accelerator_utils import decide_device
17
18
 
19
+ # Global lock for model initialization to prevent threading issues
20
+ _model_init_lock = threading.Lock()
21
+
18
22
 
19
23
  class PictureDescriptionVlmModel(
20
24
  PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
@@ -57,17 +61,18 @@ class PictureDescriptionVlmModel(
57
61
  )
58
62
 
59
63
  # Initialize processor and model
60
- self.processor = AutoProcessor.from_pretrained(artifacts_path)
61
- self.model = AutoModelForVision2Seq.from_pretrained(
62
- artifacts_path,
63
- torch_dtype=torch.bfloat16,
64
- _attn_implementation=(
65
- "flash_attention_2"
66
- if self.device.startswith("cuda")
67
- and accelerator_options.cuda_use_flash_attention2
68
- else "eager"
69
- ),
70
- ).to(self.device)
64
+ with _model_init_lock:
65
+ self.processor = AutoProcessor.from_pretrained(artifacts_path)
66
+ self.model = AutoModelForVision2Seq.from_pretrained(
67
+ artifacts_path,
68
+ torch_dtype=torch.bfloat16,
69
+ _attn_implementation=(
70
+ "flash_attention_2"
71
+ if self.device.startswith("cuda")
72
+ and accelerator_options.cuda_use_flash_attention2
73
+ else "eager"
74
+ ),
75
+ ).to(self.device)
71
76
 
72
77
  self.provenance = f"{self.options.repo_id}"
73
78
 
@@ -0,0 +1,28 @@
1
+ def ocr_engines():
2
+ from docling.models.easyocr_model import EasyOcrModel
3
+ from docling.models.ocr_mac_model import OcrMacModel
4
+ from docling.models.rapid_ocr_model import RapidOcrModel
5
+ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
6
+ from docling.models.tesseract_ocr_model import TesseractOcrModel
7
+
8
+ return {
9
+ "ocr_engines": [
10
+ EasyOcrModel,
11
+ OcrMacModel,
12
+ RapidOcrModel,
13
+ TesseractOcrModel,
14
+ TesseractOcrCliModel,
15
+ ]
16
+ }
17
+
18
+
19
+ def picture_description():
20
+ from docling.models.picture_description_api_model import PictureDescriptionApiModel
21
+ from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
22
+
23
+ return {
24
+ "picture_description": [
25
+ PictureDescriptionVlmModel,
26
+ PictureDescriptionApiModel,
27
+ ]
28
+ }
@@ -12,6 +12,9 @@ from docling_core.types.doc import (
12
12
  TableData,
13
13
  )
14
14
  from docling_core.types.doc.document import ContentLayer
15
+ from docling_ibm_models.list_item_normalizer.list_marker_processor import (
16
+ ListItemMarkerProcessor,
17
+ )
15
18
  from docling_ibm_models.reading_order.reading_order_rb import (
16
19
  PageElement as ReadingOrderPageElement,
17
20
  ReadingOrderPredictor,
@@ -40,6 +43,7 @@ class ReadingOrderModel:
40
43
  def __init__(self, options: ReadingOrderOptions):
41
44
  self.options = options
42
45
  self.ro_model = ReadingOrderPredictor()
46
+ self.list_item_processor = ListItemMarkerProcessor()
43
47
 
44
48
  def _assembled_to_readingorder_elements(
45
49
  self, conv_res: ConversionResult
@@ -92,7 +96,8 @@ class ReadingOrderModel:
92
96
  )
93
97
  if c_label == DocItemLabel.LIST_ITEM:
94
98
  # TODO: Infer if this is a numbered or a bullet list item
95
- doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
99
+ l_item = doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
100
+ self.list_item_processor.process_list_item(l_item)
96
101
  elif c_label == DocItemLabel.SECTION_HEADER:
97
102
  doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
98
103
  else:
@@ -301,6 +306,8 @@ class ReadingOrderModel:
301
306
  new_item = out_doc.add_list_item(
302
307
  text=cap_text, enumerated=False, prov=prov, parent=current_list
303
308
  )
309
+ self.list_item_processor.process_list_item(new_item)
310
+
304
311
  elif label == DocItemLabel.SECTION_HEADER:
305
312
  current_list = None
306
313
 
@@ -10,7 +10,6 @@ from docling_core.types.doc.page import (
10
10
  BoundingRectangle,
11
11
  TextCellUnit,
12
12
  )
13
- from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
14
13
  from PIL import ImageDraw
15
14
 
16
15
  from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -70,6 +69,9 @@ class TableStructureModel(BasePageModel):
70
69
 
71
70
  # Third Party
72
71
  import docling_ibm_models.tableformer.common as c
72
+ from docling_ibm_models.tableformer.data_management.tf_predictor import (
73
+ TFPredictor,
74
+ )
73
75
 
74
76
  device = decide_device(accelerator_options.device)
75
77
 
@@ -144,7 +144,10 @@ class TesseractOcrModel(BaseOcrModel):
144
144
 
145
145
  local_reader = self.reader
146
146
  self.osd_reader.SetImage(high_res_image)
147
+
148
+ doc_orientation = 0
147
149
  osd = self.osd_reader.DetectOrientationScript()
150
+
148
151
  # No text, or Orientation and Script detection failure
149
152
  if osd is None:
150
153
  _log.error(
@@ -158,11 +161,14 @@ class TesseractOcrModel(BaseOcrModel):
158
161
  # to OCR in the hope OCR will succeed while OSD failed
159
162
  if self._is_auto:
160
163
  continue
161
- doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
162
- if doc_orientation != 0:
163
- high_res_image = high_res_image.rotate(
164
- -doc_orientation, expand=True
164
+ else:
165
+ doc_orientation = parse_tesseract_orientation(
166
+ osd["orient_deg"]
165
167
  )
168
+ if doc_orientation != 0:
169
+ high_res_image = high_res_image.rotate(
170
+ -doc_orientation, expand=True
171
+ )
166
172
  if self._is_auto:
167
173
  script = osd["script_name"]
168
174
  script = map_tesseract_script(script)
@@ -80,6 +80,7 @@ class StandardPdfPipeline(PaginatedPipeline):
80
80
  LayoutModel(
81
81
  artifacts_path=artifacts_path,
82
82
  accelerator_options=pipeline_options.accelerator_options,
83
+ options=pipeline_options.layout_options,
83
84
  ),
84
85
  # Table structure model
85
86
  TableStructureModel(
@@ -1,8 +1,6 @@
1
1
  import logging
2
2
  from typing import List, Optional
3
3
 
4
- import torch
5
-
6
4
  from docling.datamodel.accelerator_options import AcceleratorDevice
7
5
 
8
6
  _log = logging.getLogger(__name__)
@@ -18,6 +16,8 @@ def decide_device(
18
16
  1. AUTO: Check for the best available device on the system.
19
17
  2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
20
18
  """
19
+ import torch
20
+
21
21
  device = "cpu"
22
22
 
23
23
  has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
9
9
  from rtree import index
10
10
 
11
11
  from docling.datamodel.base_models import BoundingBox, Cluster, Page
12
+ from docling.datamodel.pipeline_options import LayoutOptions
12
13
 
13
14
  _log = logging.getLogger(__name__)
14
15
 
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
194
195
  DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
195
196
  }
196
197
 
197
- def __init__(self, page: Page, clusters: List[Cluster]) -> None:
198
+ def __init__(
199
+ self, page: Page, clusters: List[Cluster], options: LayoutOptions
200
+ ) -> None:
198
201
  """Initialize processor with page and clusters."""
202
+
199
203
  self.cells = page.cells
200
204
  self.page = page
201
205
  self.page_size = page.size
202
206
  self.all_clusters = clusters
207
+ self.options = options
203
208
  self.regular_clusters = [
204
209
  c for c in clusters if c.label not in self.SPECIAL_TYPES
205
210
  ]
@@ -267,7 +272,7 @@ class LayoutPostprocessor:
267
272
 
268
273
  # Handle orphaned cells
269
274
  unassigned = self._find_unassigned_cells(clusters)
270
- if unassigned:
275
+ if unassigned and self.options.create_orphan_clusters:
271
276
  next_id = max((c.id for c in self.all_clusters), default=0) + 1
272
277
  orphan_clusters = []
273
278
  for i, cell in enumerate(unassigned):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.39.0
3
+ Version: 2.40.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,8 +27,8 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
30
- Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
31
30
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
31
+ Requires-Dist: docling-ibm-models<4,>=3.6.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -57,7 +57,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
57
57
  Provides-Extra: vlm
58
58
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
59
59
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
60
- Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
60
+ Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
61
61
  Provides-Extra: rapidocr
62
62
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
63
63
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -1,7 +1,7 @@
1
1
  pydantic<3.0.0,>=2.0.0
2
2
  docling-core[chunking]<3.0.0,>=2.39.0
3
- docling-ibm-models<4.0.0,>=3.4.4
4
3
  docling-parse<5.0.0,>=4.0.0
4
+ docling-ibm-models<4,>=3.6.0
5
5
  filetype<2.0.0,>=1.2.0
6
6
  pypdfium2<5.0.0,>=4.30.0
7
7
  pydantic-settings<3.0.0,>=2.3.0
@@ -46,4 +46,4 @@ transformers<5.0.0,>=4.46.0
46
46
  accelerate<2.0.0,>=1.2.1
47
47
 
48
48
  [vlm:python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"]
49
- mlx-vlm>=0.1.22
49
+ mlx-vlm<0.2,>=0.1.22
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.39.0" # DO NOT EDIT, updated automatically
3
+ version = "2.40.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -45,8 +45,8 @@ requires-python = '>=3.9,<4.0'
45
45
  dependencies = [
46
46
  'pydantic (>=2.0.0,<3.0.0)',
47
47
  'docling-core[chunking] (>=2.39.0,<3.0.0)',
48
- 'docling-ibm-models (>=3.4.4,<4.0.0)',
49
48
  'docling-parse (>=4.0.0,<5.0.0)',
49
+ 'docling-ibm-models (>=3.6.0,<4)',
50
50
  'filetype (>=1.2.0,<2.0.0)',
51
51
  'pypdfium2 (>=4.30.0,<5.0.0)',
52
52
  'pydantic-settings (>=2.3.0,<3.0.0)',
@@ -91,7 +91,7 @@ ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
91
91
  vlm = [
92
92
  'transformers (>=4.46.0,<5.0.0)',
93
93
  'accelerate (>=1.2.1,<2.0.0)',
94
- 'mlx-vlm >=0.1.22 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
94
+ 'mlx-vlm (>=0.1.22,<0.2) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
95
95
  ]
96
96
  rapidocr = [
97
97
  'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
@@ -46,6 +46,12 @@ def test_text_cell_counts():
46
46
  )
47
47
  last_cell_count = len(cells)
48
48
 
49
+ # Clean up page backend after each iteration
50
+ page_backend.unload()
51
+
52
+ # Explicitly clean up document backend to prevent race conditions in CI
53
+ doc_backend.unload()
54
+
49
55
 
50
56
  def test_get_text_from_rect(test_doc_path):
51
57
  doc_backend = _get_backend(test_doc_path)
@@ -59,6 +65,10 @@ def test_get_text_from_rect(test_doc_path):
59
65
 
60
66
  assert textpiece.strip() == ref
61
67
 
68
+ # Explicitly clean up resources
69
+ page_backend.unload()
70
+ doc_backend.unload()
71
+
62
72
 
63
73
  def test_crop_page_image(test_doc_path):
64
74
  doc_backend = _get_backend(test_doc_path)
@@ -70,7 +80,14 @@ def test_crop_page_image(test_doc_path):
70
80
  )
71
81
  # im.show()
72
82
 
83
+ # Explicitly clean up resources
84
+ page_backend.unload()
85
+ doc_backend.unload()
86
+
73
87
 
74
88
  def test_num_pages(test_doc_path):
75
89
  doc_backend = _get_backend(test_doc_path)
76
90
  doc_backend.page_count() == 9
91
+
92
+ # Explicitly clean up resources to prevent race conditions in CI
93
+ doc_backend.unload()
@@ -1,28 +0,0 @@
1
- from docling.models.easyocr_model import EasyOcrModel
2
- from docling.models.ocr_mac_model import OcrMacModel
3
- from docling.models.picture_description_api_model import PictureDescriptionApiModel
4
- from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
5
- from docling.models.rapid_ocr_model import RapidOcrModel
6
- from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
7
- from docling.models.tesseract_ocr_model import TesseractOcrModel
8
-
9
-
10
- def ocr_engines():
11
- return {
12
- "ocr_engines": [
13
- EasyOcrModel,
14
- OcrMacModel,
15
- RapidOcrModel,
16
- TesseractOcrModel,
17
- TesseractOcrCliModel,
18
- ]
19
- }
20
-
21
-
22
- def picture_description():
23
- return {
24
- "picture_description": [
25
- PictureDescriptionVlmModel,
26
- PictureDescriptionApiModel,
27
- ]
28
- }
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes