docling 2.38.1__tar.gz → 2.40.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. {docling-2.38.1 → docling-2.40.0}/PKG-INFO +4 -4
  2. {docling-2.38.1 → docling-2.40.0}/docling/backend/docling_parse_v4_backend.py +14 -4
  3. {docling-2.38.1 → docling-2.40.0}/docling/backend/html_backend.py +31 -42
  4. {docling-2.38.1 → docling-2.40.0}/docling/backend/md_backend.py +25 -12
  5. {docling-2.38.1 → docling-2.40.0}/docling/backend/msexcel_backend.py +33 -14
  6. {docling-2.38.1 → docling-2.40.0}/docling/backend/mspowerpoint_backend.py +4 -5
  7. {docling-2.38.1 → docling-2.40.0}/docling/backend/msword_backend.py +31 -36
  8. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/pipeline_options.py +8 -0
  9. {docling-2.38.1 → docling-2.40.0}/docling/models/base_ocr_model.py +6 -2
  10. {docling-2.38.1 → docling-2.40.0}/docling/models/layout_model.py +10 -3
  11. {docling-2.38.1 → docling-2.40.0}/docling/models/picture_description_vlm_model.py +16 -11
  12. docling-2.40.0/docling/models/plugins/defaults.py +28 -0
  13. {docling-2.38.1 → docling-2.40.0}/docling/models/readingorder_model.py +8 -1
  14. {docling-2.38.1 → docling-2.40.0}/docling/models/table_structure_model.py +3 -1
  15. {docling-2.38.1 → docling-2.40.0}/docling/models/tesseract_ocr_model.py +10 -4
  16. {docling-2.38.1 → docling-2.40.0}/docling/pipeline/standard_pdf_pipeline.py +1 -0
  17. {docling-2.38.1 → docling-2.40.0}/docling/utils/accelerator_utils.py +2 -2
  18. {docling-2.38.1 → docling-2.40.0}/docling/utils/layout_postprocessor.py +7 -2
  19. {docling-2.38.1 → docling-2.40.0}/docling.egg-info/PKG-INFO +4 -4
  20. {docling-2.38.1 → docling-2.40.0}/docling.egg-info/requires.txt +3 -3
  21. {docling-2.38.1 → docling-2.40.0}/pyproject.toml +4 -4
  22. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_docling_parse_v4.py +17 -0
  23. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_pptx.py +2 -2
  24. docling-2.38.1/docling/models/plugins/defaults.py +0 -28
  25. {docling-2.38.1 → docling-2.40.0}/LICENSE +0 -0
  26. {docling-2.38.1 → docling-2.40.0}/README.md +0 -0
  27. {docling-2.38.1 → docling-2.40.0}/docling/__init__.py +0 -0
  28. {docling-2.38.1 → docling-2.40.0}/docling/backend/__init__.py +0 -0
  29. {docling-2.38.1 → docling-2.40.0}/docling/backend/abstract_backend.py +0 -0
  30. {docling-2.38.1 → docling-2.40.0}/docling/backend/asciidoc_backend.py +0 -0
  31. {docling-2.38.1 → docling-2.40.0}/docling/backend/csv_backend.py +0 -0
  32. {docling-2.38.1 → docling-2.40.0}/docling/backend/docling_parse_backend.py +0 -0
  33. {docling-2.38.1 → docling-2.40.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  34. {docling-2.38.1 → docling-2.40.0}/docling/backend/docx/__init__.py +0 -0
  35. {docling-2.38.1 → docling-2.40.0}/docling/backend/docx/latex/__init__.py +0 -0
  36. {docling-2.38.1 → docling-2.40.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  37. {docling-2.38.1 → docling-2.40.0}/docling/backend/docx/latex/omml.py +0 -0
  38. {docling-2.38.1 → docling-2.40.0}/docling/backend/json/__init__.py +0 -0
  39. {docling-2.38.1 → docling-2.40.0}/docling/backend/json/docling_json_backend.py +0 -0
  40. {docling-2.38.1 → docling-2.40.0}/docling/backend/noop_backend.py +0 -0
  41. {docling-2.38.1 → docling-2.40.0}/docling/backend/pdf_backend.py +0 -0
  42. {docling-2.38.1 → docling-2.40.0}/docling/backend/pypdfium2_backend.py +0 -0
  43. {docling-2.38.1 → docling-2.40.0}/docling/backend/xml/__init__.py +0 -0
  44. {docling-2.38.1 → docling-2.40.0}/docling/backend/xml/jats_backend.py +0 -0
  45. {docling-2.38.1 → docling-2.40.0}/docling/backend/xml/uspto_backend.py +0 -0
  46. {docling-2.38.1 → docling-2.40.0}/docling/chunking/__init__.py +0 -0
  47. {docling-2.38.1 → docling-2.40.0}/docling/cli/__init__.py +0 -0
  48. {docling-2.38.1 → docling-2.40.0}/docling/cli/main.py +0 -0
  49. {docling-2.38.1 → docling-2.40.0}/docling/cli/models.py +0 -0
  50. {docling-2.38.1 → docling-2.40.0}/docling/cli/tools.py +0 -0
  51. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/__init__.py +0 -0
  52. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/accelerator_options.py +0 -0
  53. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/asr_model_specs.py +0 -0
  54. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/base_models.py +0 -0
  55. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/document.py +0 -0
  56. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  57. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  58. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/settings.py +0 -0
  59. {docling-2.38.1 → docling-2.40.0}/docling/datamodel/vlm_model_specs.py +0 -0
  60. {docling-2.38.1 → docling-2.40.0}/docling/document_converter.py +0 -0
  61. {docling-2.38.1 → docling-2.40.0}/docling/exceptions.py +0 -0
  62. {docling-2.38.1 → docling-2.40.0}/docling/models/__init__.py +0 -0
  63. {docling-2.38.1 → docling-2.40.0}/docling/models/api_vlm_model.py +0 -0
  64. {docling-2.38.1 → docling-2.40.0}/docling/models/base_model.py +0 -0
  65. {docling-2.38.1 → docling-2.40.0}/docling/models/code_formula_model.py +0 -0
  66. {docling-2.38.1 → docling-2.40.0}/docling/models/document_picture_classifier.py +0 -0
  67. {docling-2.38.1 → docling-2.40.0}/docling/models/easyocr_model.py +0 -0
  68. {docling-2.38.1 → docling-2.40.0}/docling/models/factories/__init__.py +0 -0
  69. {docling-2.38.1 → docling-2.40.0}/docling/models/factories/base_factory.py +0 -0
  70. {docling-2.38.1 → docling-2.40.0}/docling/models/factories/ocr_factory.py +0 -0
  71. {docling-2.38.1 → docling-2.40.0}/docling/models/factories/picture_description_factory.py +0 -0
  72. {docling-2.38.1 → docling-2.40.0}/docling/models/ocr_mac_model.py +0 -0
  73. {docling-2.38.1 → docling-2.40.0}/docling/models/page_assemble_model.py +0 -0
  74. {docling-2.38.1 → docling-2.40.0}/docling/models/page_preprocessing_model.py +0 -0
  75. {docling-2.38.1 → docling-2.40.0}/docling/models/picture_description_api_model.py +0 -0
  76. {docling-2.38.1 → docling-2.40.0}/docling/models/picture_description_base_model.py +0 -0
  77. {docling-2.38.1 → docling-2.40.0}/docling/models/plugins/__init__.py +0 -0
  78. {docling-2.38.1 → docling-2.40.0}/docling/models/rapid_ocr_model.py +0 -0
  79. {docling-2.38.1 → docling-2.40.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  80. {docling-2.38.1 → docling-2.40.0}/docling/models/utils/__init__.py +0 -0
  81. {docling-2.38.1 → docling-2.40.0}/docling/models/utils/hf_model_download.py +0 -0
  82. {docling-2.38.1 → docling-2.40.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  83. {docling-2.38.1 → docling-2.40.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  84. {docling-2.38.1 → docling-2.40.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  85. {docling-2.38.1 → docling-2.40.0}/docling/pipeline/__init__.py +0 -0
  86. {docling-2.38.1 → docling-2.40.0}/docling/pipeline/asr_pipeline.py +0 -0
  87. {docling-2.38.1 → docling-2.40.0}/docling/pipeline/base_pipeline.py +0 -0
  88. {docling-2.38.1 → docling-2.40.0}/docling/pipeline/simple_pipeline.py +0 -0
  89. {docling-2.38.1 → docling-2.40.0}/docling/pipeline/vlm_pipeline.py +0 -0
  90. {docling-2.38.1 → docling-2.40.0}/docling/py.typed +0 -0
  91. {docling-2.38.1 → docling-2.40.0}/docling/utils/__init__.py +0 -0
  92. {docling-2.38.1 → docling-2.40.0}/docling/utils/api_image_request.py +0 -0
  93. {docling-2.38.1 → docling-2.40.0}/docling/utils/export.py +0 -0
  94. {docling-2.38.1 → docling-2.40.0}/docling/utils/glm_utils.py +0 -0
  95. {docling-2.38.1 → docling-2.40.0}/docling/utils/locks.py +0 -0
  96. {docling-2.38.1 → docling-2.40.0}/docling/utils/model_downloader.py +0 -0
  97. {docling-2.38.1 → docling-2.40.0}/docling/utils/ocr_utils.py +0 -0
  98. {docling-2.38.1 → docling-2.40.0}/docling/utils/orientation.py +0 -0
  99. {docling-2.38.1 → docling-2.40.0}/docling/utils/profiling.py +0 -0
  100. {docling-2.38.1 → docling-2.40.0}/docling/utils/utils.py +0 -0
  101. {docling-2.38.1 → docling-2.40.0}/docling/utils/visualization.py +0 -0
  102. {docling-2.38.1 → docling-2.40.0}/docling.egg-info/SOURCES.txt +0 -0
  103. {docling-2.38.1 → docling-2.40.0}/docling.egg-info/dependency_links.txt +0 -0
  104. {docling-2.38.1 → docling-2.40.0}/docling.egg-info/entry_points.txt +0 -0
  105. {docling-2.38.1 → docling-2.40.0}/docling.egg-info/top_level.txt +0 -0
  106. {docling-2.38.1 → docling-2.40.0}/setup.cfg +0 -0
  107. {docling-2.38.1 → docling-2.40.0}/tests/test_asr_pipeline.py +0 -0
  108. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_asciidoc.py +0 -0
  109. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_csv.py +0 -0
  110. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_docling_json.py +0 -0
  111. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_docling_parse.py +0 -0
  112. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_docling_parse_v2.py +0 -0
  113. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_html.py +0 -0
  114. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_jats.py +0 -0
  115. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_markdown.py +0 -0
  116. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_msexcel.py +0 -0
  117. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_msword.py +0 -0
  118. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_patent_uspto.py +0 -0
  119. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_pdfium.py +0 -0
  120. {docling-2.38.1 → docling-2.40.0}/tests/test_backend_webp.py +0 -0
  121. {docling-2.38.1 → docling-2.40.0}/tests/test_cli.py +0 -0
  122. {docling-2.38.1 → docling-2.40.0}/tests/test_code_formula.py +0 -0
  123. {docling-2.38.1 → docling-2.40.0}/tests/test_data_gen_flag.py +0 -0
  124. {docling-2.38.1 → docling-2.40.0}/tests/test_document_picture_classifier.py +0 -0
  125. {docling-2.38.1 → docling-2.40.0}/tests/test_e2e_conversion.py +0 -0
  126. {docling-2.38.1 → docling-2.40.0}/tests/test_e2e_ocr_conversion.py +0 -0
  127. {docling-2.38.1 → docling-2.40.0}/tests/test_input_doc.py +0 -0
  128. {docling-2.38.1 → docling-2.40.0}/tests/test_interfaces.py +0 -0
  129. {docling-2.38.1 → docling-2.40.0}/tests/test_invalid_input.py +0 -0
  130. {docling-2.38.1 → docling-2.40.0}/tests/test_legacy_format_transform.py +0 -0
  131. {docling-2.38.1 → docling-2.40.0}/tests/test_options.py +0 -0
  132. {docling-2.38.1 → docling-2.40.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.38.1
3
+ Version: 2.40.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,9 +26,9 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.29.0
30
- Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
31
30
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
31
+ Requires-Dist: docling-ibm-models<4,>=3.6.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -57,7 +57,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
57
57
  Provides-Extra: vlm
58
58
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
59
59
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
60
- Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
60
+ Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
61
61
  Provides-Extra: rapidocr
62
62
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
63
63
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
187
187
 
188
188
  def unload(self):
189
189
  super().unload()
190
- self.dp_doc.unload()
191
- with pypdfium2_lock:
192
- self._pdoc.close()
193
- self._pdoc = None
190
+ # Unload docling-parse document first
191
+ if self.dp_doc is not None:
192
+ self.dp_doc.unload()
193
+ self.dp_doc = None
194
+
195
+ # Then close pypdfium2 document with proper locking
196
+ if self._pdoc is not None:
197
+ with pypdfium2_lock:
198
+ try:
199
+ self._pdoc.close()
200
+ except Exception:
201
+ # Ignore cleanup errors
202
+ pass
203
+ self._pdoc = None
@@ -17,6 +17,7 @@ from docling_core.types.doc import (
17
17
  TableData,
18
18
  )
19
19
  from docling_core.types.doc.document import ContentLayer
20
+ from pydantic import BaseModel
20
21
  from typing_extensions import override
21
22
 
22
23
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [
48
49
  ]
49
50
 
50
51
 
52
+ class _Context(BaseModel):
53
+ list_ordered_flag_by_ref: dict[str, bool] = {}
54
+ list_start_by_ref: dict[str, int] = {}
55
+
56
+
51
57
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
52
58
  @override
53
59
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
@@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
59
65
  self.max_levels = 10
60
66
  self.level = 0
61
67
  self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
68
+ self.ctx = _Context()
62
69
  for i in range(self.max_levels):
63
70
  self.parents[i] = None
64
71
 
@@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
121
128
  self.content_layer = (
122
129
  ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
123
130
  )
131
+ self.ctx = _Context() # reset context
124
132
  self.walk(content, doc)
125
133
  else:
126
134
  raise RuntimeError(
@@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
294
302
  def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
295
303
  """Handles list tags (ul, ol) and their list items."""
296
304
 
297
- if element.name == "ul":
298
- # create a list group
299
- self.parents[self.level + 1] = doc.add_group(
300
- parent=self.parents[self.level],
301
- name="list",
302
- label=GroupLabel.LIST,
303
- content_layer=self.content_layer,
304
- )
305
- elif element.name == "ol":
305
+ start: Optional[int] = None
306
+ if is_ordered := element.name == "ol":
306
307
  start_attr = element.get("start")
307
- start: int = (
308
- int(start_attr)
309
- if isinstance(start_attr, str) and start_attr.isnumeric()
310
- else 1
311
- )
312
- # create a list group
313
- self.parents[self.level + 1] = doc.add_group(
314
- parent=self.parents[self.level],
315
- name="ordered list" + (f" start {start}" if start != 1 else ""),
316
- label=GroupLabel.ORDERED_LIST,
317
- content_layer=self.content_layer,
318
- )
308
+ if isinstance(start_attr, str) and start_attr.isnumeric():
309
+ start = int(start_attr)
310
+ name = "ordered list" + (f" start {start}" if start is not None else "")
311
+ else:
312
+ name = "list"
313
+ # create a list group
314
+ list_group = doc.add_list_group(
315
+ name=name,
316
+ parent=self.parents[self.level],
317
+ content_layer=self.content_layer,
318
+ )
319
+ self.parents[self.level + 1] = list_group
320
+ self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
321
+ if is_ordered and start is not None:
322
+ self.ctx.list_start_by_ref[list_group.self_ref] = start
323
+
319
324
  self.level += 1
320
325
 
321
326
  self.walk(element, doc)
@@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
331
336
  if parent is None:
332
337
  _log.debug(f"list-item has no parent in DoclingDocument: {element}")
333
338
  return
334
- parent_label: str = parent.label
335
- index_in_list = len(parent.children) + 1
336
- if (
337
- parent_label == GroupLabel.ORDERED_LIST
338
- and isinstance(parent, GroupItem)
339
- and parent.name
340
- ):
341
- start_in_list: str = parent.name.split(" ")[-1]
342
- start: int = int(start_in_list) if start_in_list.isnumeric() else 1
343
- index_in_list += start - 1
339
+ enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
340
+ if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
341
+ marker = f"{start + len(parent.children)}."
342
+ else:
343
+ marker = ""
344
344
 
345
345
  if nested_list:
346
346
  # Text in list item can be hidden within hierarchy, hence
@@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
350
350
  text = text.replace("\n", "").replace("\r", "")
351
351
  text = " ".join(text.split()).strip()
352
352
 
353
- marker = ""
354
- enumerated = False
355
- if parent_label == GroupLabel.ORDERED_LIST:
356
- marker = str(index_in_list)
357
- enumerated = True
358
-
359
353
  if len(text) > 0:
360
354
  # create a list-item
361
355
  self.parents[self.level + 1] = doc.add_list_item(
@@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
375
369
  elif element.text.strip():
376
370
  text = element.text.strip()
377
371
 
378
- marker = ""
379
- enumerated = False
380
- if parent_label == GroupLabel.ORDERED_LIST:
381
- marker = f"{index_in_list!s}."
382
- enumerated = True
383
372
  doc.add_list_item(
384
373
  text=text,
385
374
  enumerated=enumerated,
@@ -14,13 +14,12 @@ from docling_core.types.doc import (
14
14
  DocItemLabel,
15
15
  DoclingDocument,
16
16
  DocumentOrigin,
17
- GroupLabel,
18
17
  NodeItem,
19
18
  TableCell,
20
19
  TableData,
21
20
  TextItem,
22
21
  )
23
- from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
22
+ from docling_core.types.doc.document import Formatting
24
23
  from marko import Markdown
25
24
  from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
26
25
  from typing_extensions import Annotated
@@ -51,6 +50,7 @@ class _HeadingCreationPayload(BaseModel):
51
50
 
52
51
  class _ListItemCreationPayload(BaseModel):
53
52
  kind: Literal["list_item"] = "list_item"
53
+ enumerated: bool
54
54
 
55
55
 
56
56
  _CreationPayload = Annotated[
@@ -187,15 +187,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
187
187
  doc: DoclingDocument,
188
188
  parent_item: Optional[NodeItem],
189
189
  text: str,
190
+ enumerated: bool,
190
191
  formatting: Optional[Formatting] = None,
191
192
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
192
193
  ):
193
- if not isinstance(parent_item, (OrderedList, UnorderedList)):
194
- _log.warning("ListItem would have not had a list parent, adding one.")
195
- parent_item = doc.add_unordered_list(parent=parent_item)
196
194
  item = doc.add_list_item(
197
195
  text=text,
198
- enumerated=(isinstance(parent_item, OrderedList)),
196
+ enumerated=enumerated,
199
197
  parent=parent_item,
200
198
  formatting=formatting,
201
199
  hyperlink=hyperlink,
@@ -238,6 +236,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
238
236
  creation_stack: list[
239
237
  _CreationPayload
240
238
  ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
239
+ list_ordered_flag_by_ref: dict[str, bool],
241
240
  parent_item: Optional[NodeItem] = None,
242
241
  formatting: Optional[Formatting] = None,
243
242
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -275,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
275
274
  self._close_table(doc)
276
275
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
277
276
  if has_non_empty_list_items:
278
- label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
279
- parent_item = doc.add_group(
280
- label=label, name="list", parent=parent_item
281
- )
277
+ parent_item = doc.add_list_group(name="list", parent=parent_item)
278
+ list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered
282
279
 
283
280
  elif (
284
281
  isinstance(element, marko.block.ListItem)
@@ -289,16 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
289
286
  self._close_table(doc)
290
287
  _log.debug(" - List item")
291
288
 
289
+ enumerated = (
290
+ list_ordered_flag_by_ref.get(parent_item.self_ref, False)
291
+ if parent_item
292
+ else False
293
+ )
292
294
  if len(child.children) > 1: # inline group will be created further down
293
295
  parent_item = self._create_list_item(
294
296
  doc=doc,
295
297
  parent_item=parent_item,
296
298
  text="",
299
+ enumerated=enumerated,
297
300
  formatting=formatting,
298
301
  hyperlink=hyperlink,
299
302
  )
300
303
  else:
301
- creation_stack.append(_ListItemCreationPayload())
304
+ creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
302
305
 
303
306
  elif isinstance(element, marko.inline.Image):
304
307
  self._close_table(doc)
@@ -335,7 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
335
338
  _log.debug(f" - Paragraph (raw text): {element.children}")
336
339
  snippet_text = element.children.strip()
337
340
  # Detect start of the table:
338
- if "|" in snippet_text:
341
+ if "|" in snippet_text or self.in_table:
339
342
  # most likely part of the markdown table
340
343
  self.in_table = True
341
344
  if len(self.md_table_buffer) > 0:
@@ -349,10 +352,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
349
352
  while len(creation_stack) > 0:
350
353
  to_create = creation_stack.pop()
351
354
  if isinstance(to_create, _ListItemCreationPayload):
355
+ enumerated = (
356
+ list_ordered_flag_by_ref.get(
357
+ parent_item.self_ref, False
358
+ )
359
+ if parent_item
360
+ else False
361
+ )
352
362
  parent_item = self._create_list_item(
353
363
  doc=doc,
354
364
  parent_item=parent_item,
355
365
  text=snippet_text,
366
+ enumerated=enumerated,
356
367
  formatting=formatting,
357
368
  hyperlink=hyperlink,
358
369
  )
@@ -453,6 +464,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
453
464
  doc=doc,
454
465
  visited=visited,
455
466
  creation_stack=creation_stack,
467
+ list_ordered_flag_by_ref=list_ordered_flag_by_ref,
456
468
  parent_item=parent_item,
457
469
  formatting=formatting,
458
470
  hyperlink=hyperlink,
@@ -497,6 +509,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
497
509
  parent_item=None,
498
510
  visited=set(),
499
511
  creation_stack=[],
512
+ list_ordered_flag_by_ref={},
500
513
  )
501
514
  self._close_table(doc=doc) # handle any last hanging table
502
515
 
@@ -337,10 +337,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
337
337
  # Collect the data within the bounds
338
338
  data = []
339
339
  visited_cells: set[tuple[int, int]] = set()
340
- for ri in range(start_row, max_row + 1):
341
- for rj in range(start_col, max_col + 1):
342
- cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
343
-
340
+ for ri, row in enumerate(
341
+ sheet.iter_rows(
342
+ min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
343
+ max_row=max_row + 1,
344
+ min_col=start_col + 1,
345
+ max_col=max_col + 1,
346
+ values_only=False,
347
+ ),
348
+ start_row,
349
+ ):
350
+ for rj, cell in enumerate(row, start_col):
344
351
  # Check if the cell belongs to a merged range
345
352
  row_span = 1
346
353
  col_span = 1
@@ -397,10 +404,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
397
404
  """
398
405
  max_row: int = start_row
399
406
 
400
- while max_row < sheet.max_row - 1:
401
- # Get the cell value or check if it is part of a merged cell
402
- cell = sheet.cell(row=max_row + 2, column=start_col + 1)
403
-
407
+ for ri, (cell,) in enumerate(
408
+ sheet.iter_rows(
409
+ min_row=start_row + 2,
410
+ max_row=sheet.max_row,
411
+ min_col=start_col + 1,
412
+ max_col=start_col + 1,
413
+ values_only=False,
414
+ ),
415
+ start_row + 1,
416
+ ):
404
417
  # Check if the cell is part of a merged range
405
418
  merged_range = next(
406
419
  (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
@@ -414,7 +427,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
414
427
  if merged_range:
415
428
  max_row = max(max_row, merged_range.max_row - 1)
416
429
  else:
417
- max_row += 1
430
+ max_row = ri
418
431
 
419
432
  return max_row
420
433
 
@@ -433,10 +446,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
433
446
  """
434
447
  max_col: int = start_col
435
448
 
436
- while max_col < sheet.max_column - 1:
437
- # Get the cell value or check if it is part of a merged cell
438
- cell = sheet.cell(row=start_row + 1, column=max_col + 2)
439
-
449
+ for rj, (cell,) in enumerate(
450
+ sheet.iter_cols(
451
+ min_row=start_row + 1,
452
+ max_row=start_row + 1,
453
+ min_col=start_col + 2,
454
+ max_col=sheet.max_column,
455
+ values_only=False,
456
+ ),
457
+ start_col + 1,
458
+ ):
440
459
  # Check if the cell is part of a merged range
441
460
  merged_range = next(
442
461
  (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
@@ -450,7 +469,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
450
469
  if merged_range:
451
470
  max_col = max(max_col, merged_range.max_col - 1)
452
471
  else:
453
- max_col += 1
472
+ max_col = rj
454
473
 
455
474
  return max_col
456
475
 
@@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
121
121
 
122
122
  return prov
123
123
 
124
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
124
+ def handle_text_elements(
125
+ self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
126
+ ):
125
127
  is_list_group_created = False
126
128
  enum_list_item_value = 0
127
129
  new_list = None
@@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
165
167
  enumerated = bullet_type == "Numbered"
166
168
 
167
169
  if not is_list_group_created:
168
- new_list = doc.add_group(
169
- label=GroupLabel.ORDERED_LIST
170
- if enumerated
171
- else GroupLabel.LIST,
170
+ new_list = doc.add_list_group(
172
171
  name="list",
173
172
  parent=parent_slide,
174
173
  )
@@ -10,11 +10,12 @@ from docling_core.types.doc import (
10
10
  DocumentOrigin,
11
11
  GroupLabel,
12
12
  ImageRef,
13
+ ListGroup,
13
14
  NodeItem,
14
15
  TableCell,
15
16
  TableData,
16
17
  )
17
- from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
18
+ from docling_core.types.doc.document import Formatting
18
19
  from docx import Document
19
20
  from docx.document import Document as DocxDocument
20
21
  from docx.oxml.table import CT_Tc
@@ -688,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
688
689
  paragraph_elements: list,
689
690
  ) -> Optional[NodeItem]:
690
691
  return (
691
- doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
692
+ doc.add_inline_group(parent=prev_parent)
692
693
  if len(paragraph_elements) > 1
693
694
  else prev_parent
694
695
  )
@@ -781,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
781
782
  else:
782
783
  # Inline equation
783
784
  level = self._get_level()
784
- inline_equation = doc.add_group(
785
- label=GroupLabel.INLINE, parent=self.parents[level - 1]
786
- )
785
+ inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
787
786
  text_tmp = text
788
787
  for eq in equations:
789
788
  if len(text_tmp) == 0:
@@ -931,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
931
930
  level: int,
932
931
  ) -> None:
933
932
  # This should not happen by construction
934
- if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
933
+ if not isinstance(self.parents[level], ListGroup):
934
+ return
935
+ if not elements:
935
936
  return
937
+
936
938
  if len(elements) == 1:
937
939
  text, format, hyperlink = elements[0]
938
- doc.add_list_item(
939
- marker=marker,
940
- enumerated=enumerated,
941
- parent=self.parents[level],
942
- text=text,
943
- formatting=format,
944
- hyperlink=hyperlink,
945
- )
940
+ if text:
941
+ doc.add_list_item(
942
+ marker=marker,
943
+ enumerated=enumerated,
944
+ parent=self.parents[level],
945
+ text=text,
946
+ formatting=format,
947
+ hyperlink=hyperlink,
948
+ )
946
949
  else:
947
950
  new_item = doc.add_list_item(
948
951
  marker=marker,
@@ -950,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
950
953
  parent=self.parents[level],
951
954
  text="",
952
955
  )
953
- new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
956
+ new_parent = doc.add_inline_group(parent=new_item)
954
957
  for text, format, hyperlink in elements:
955
- doc.add_text(
956
- label=DocItemLabel.TEXT,
957
- parent=new_parent,
958
- text=text,
959
- formatting=format,
960
- hyperlink=hyperlink,
961
- )
958
+ if text:
959
+ doc.add_text(
960
+ label=DocItemLabel.TEXT,
961
+ parent=new_parent,
962
+ text=text,
963
+ formatting=format,
964
+ hyperlink=hyperlink,
965
+ )
962
966
 
963
967
  def _add_list_item(
964
968
  self,
@@ -979,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
979
983
  if self._prev_numid() is None: # Open new list
980
984
  self.level_at_new_list = level
981
985
 
982
- self.parents[level] = doc.add_group(
983
- label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
986
+ self.parents[level] = doc.add_list_group(
987
+ name="list", parent=self.parents[level - 1]
984
988
  )
985
989
 
986
990
  # Set marker and enumerated arguments if this is an enumeration element.
@@ -1001,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1001
1005
  self.level_at_new_list + prev_indent + 1,
1002
1006
  self.level_at_new_list + ilevel + 1,
1003
1007
  ):
1004
- # Determine if this is an unordered list or an ordered list.
1005
- # Set GroupLabel.ORDERED_LIST when it fits.
1006
1008
  self.listIter = 0
1007
- if is_numbered:
1008
- self.parents[i] = doc.add_group(
1009
- label=GroupLabel.ORDERED_LIST,
1010
- name="list",
1011
- parent=self.parents[i - 1],
1012
- )
1013
- else:
1014
- self.parents[i] = doc.add_group(
1015
- label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
1016
- )
1009
+ self.parents[i] = doc.add_list_group(
1010
+ name="list", parent=self.parents[i - 1]
1011
+ )
1017
1012
 
1018
1013
  # TODO: Set marker and enumerated arguments if this is an enumeration element.
1019
1014
  self.listIter += 1
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ from datetime import datetime
2
3
  from enum import Enum
3
4
  from pathlib import Path
4
5
  from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
265
266
  )
266
267
 
267
268
 
269
+ class LayoutOptions(BaseModel):
270
+ """Options for layout processing."""
271
+
272
+ create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
273
+
274
+
268
275
  class AsrPipelineOptions(PipelineOptions):
269
276
  asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
270
277
  artifacts_path: Optional[Union[Path, str]] = None
@@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
289
296
  picture_description_options: PictureDescriptionBaseOptions = (
290
297
  smolvlm_picture_description
291
298
  )
299
+ layout_options: LayoutOptions = LayoutOptions()
292
300
 
293
301
  images_scale: float = 1.0
294
302
  generate_page_images: bool = False
@@ -3,14 +3,13 @@ import logging
3
3
  from abc import abstractmethod
4
4
  from collections.abc import Iterable
5
5
  from pathlib import Path
6
- from typing import List, Optional, Type
6
+ from typing import TYPE_CHECKING, List, Optional, Type
7
7
 
8
8
  import numpy as np
9
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
10
10
  from docling_core.types.doc.page import TextCell
11
11
  from PIL import Image, ImageDraw
12
12
  from rtree import index
13
- from scipy.ndimage import binary_dilation, find_objects, label
14
13
 
15
14
  from docling.datamodel.accelerator_options import AcceleratorOptions
16
15
  from docling.datamodel.base_models import Page
@@ -31,11 +30,16 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
31
30
  options: OcrOptions,
32
31
  accelerator_options: AcceleratorOptions,
33
32
  ):
33
+ # Make sure any delay/error from import occurs on ocr model init and not first use
34
+ from scipy.ndimage import binary_dilation, find_objects, label
35
+
34
36
  self.enabled = enabled
35
37
  self.options = options
36
38
 
37
39
  # Computes the optimum amount and coordinates of rectangles to OCR on a given page
38
40
  def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
41
+ from scipy.ndimage import binary_dilation, find_objects, label
42
+
39
43
  BITMAP_COVERAGE_TRESHOLD = 0.75
40
44
  assert page.size is not None
41
45
 
@@ -7,12 +7,12 @@ from typing import Optional
7
7
 
8
8
  import numpy as np
9
9
  from docling_core.types.doc import DocItemLabel
10
- from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
11
10
  from PIL import Image
12
11
 
13
12
  from docling.datamodel.accelerator_options import AcceleratorOptions
14
13
  from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
15
14
  from docling.datamodel.document import ConversionResult
15
+ from docling.datamodel.pipeline_options import LayoutOptions
16
16
  from docling.datamodel.settings import settings
17
17
  from docling.models.base_model import BasePageModel
18
18
  from docling.models.utils.hf_model_download import download_hf_model
@@ -49,8 +49,15 @@ class LayoutModel(BasePageModel):
49
49
  CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
50
50
 
51
51
  def __init__(
52
- self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
52
+ self,
53
+ artifacts_path: Optional[Path],
54
+ accelerator_options: AcceleratorOptions,
55
+ options: LayoutOptions,
53
56
  ):
57
+ from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
58
+
59
+ self.options = options
60
+
54
61
  device = decide_device(accelerator_options.device)
55
62
 
56
63
  if artifacts_path is None:
@@ -176,7 +183,7 @@ class LayoutModel(BasePageModel):
176
183
  # Apply postprocessing
177
184
 
178
185
  processed_clusters, processed_cells = LayoutPostprocessor(
179
- page, clusters
186
+ page, clusters, self.options
180
187
  ).postprocess()
181
188
  # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
182
189