docling 2.38.0__tar.gz → 2.39.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. {docling-2.38.0 → docling-2.39.0}/PKG-INFO +2 -2
  2. {docling-2.38.0 → docling-2.39.0}/docling/backend/html_backend.py +31 -42
  3. {docling-2.38.0 → docling-2.39.0}/docling/backend/md_backend.py +148 -50
  4. {docling-2.38.0 → docling-2.39.0}/docling/backend/mspowerpoint_backend.py +4 -5
  5. {docling-2.38.0 → docling-2.39.0}/docling/backend/msword_backend.py +36 -37
  6. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/base_models.py +1 -1
  7. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/pipeline_options.py +1 -1
  8. {docling-2.38.0 → docling-2.39.0}/docling.egg-info/PKG-INFO +2 -2
  9. {docling-2.38.0 → docling-2.39.0}/docling.egg-info/requires.txt +1 -1
  10. {docling-2.38.0 → docling-2.39.0}/pyproject.toml +2 -2
  11. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_pptx.py +2 -2
  12. {docling-2.38.0 → docling-2.39.0}/LICENSE +0 -0
  13. {docling-2.38.0 → docling-2.39.0}/README.md +0 -0
  14. {docling-2.38.0 → docling-2.39.0}/docling/__init__.py +0 -0
  15. {docling-2.38.0 → docling-2.39.0}/docling/backend/__init__.py +0 -0
  16. {docling-2.38.0 → docling-2.39.0}/docling/backend/abstract_backend.py +0 -0
  17. {docling-2.38.0 → docling-2.39.0}/docling/backend/asciidoc_backend.py +0 -0
  18. {docling-2.38.0 → docling-2.39.0}/docling/backend/csv_backend.py +0 -0
  19. {docling-2.38.0 → docling-2.39.0}/docling/backend/docling_parse_backend.py +0 -0
  20. {docling-2.38.0 → docling-2.39.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  21. {docling-2.38.0 → docling-2.39.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  22. {docling-2.38.0 → docling-2.39.0}/docling/backend/docx/__init__.py +0 -0
  23. {docling-2.38.0 → docling-2.39.0}/docling/backend/docx/latex/__init__.py +0 -0
  24. {docling-2.38.0 → docling-2.39.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  25. {docling-2.38.0 → docling-2.39.0}/docling/backend/docx/latex/omml.py +0 -0
  26. {docling-2.38.0 → docling-2.39.0}/docling/backend/json/__init__.py +0 -0
  27. {docling-2.38.0 → docling-2.39.0}/docling/backend/json/docling_json_backend.py +0 -0
  28. {docling-2.38.0 → docling-2.39.0}/docling/backend/msexcel_backend.py +0 -0
  29. {docling-2.38.0 → docling-2.39.0}/docling/backend/noop_backend.py +0 -0
  30. {docling-2.38.0 → docling-2.39.0}/docling/backend/pdf_backend.py +0 -0
  31. {docling-2.38.0 → docling-2.39.0}/docling/backend/pypdfium2_backend.py +0 -0
  32. {docling-2.38.0 → docling-2.39.0}/docling/backend/xml/__init__.py +0 -0
  33. {docling-2.38.0 → docling-2.39.0}/docling/backend/xml/jats_backend.py +0 -0
  34. {docling-2.38.0 → docling-2.39.0}/docling/backend/xml/uspto_backend.py +0 -0
  35. {docling-2.38.0 → docling-2.39.0}/docling/chunking/__init__.py +0 -0
  36. {docling-2.38.0 → docling-2.39.0}/docling/cli/__init__.py +0 -0
  37. {docling-2.38.0 → docling-2.39.0}/docling/cli/main.py +0 -0
  38. {docling-2.38.0 → docling-2.39.0}/docling/cli/models.py +0 -0
  39. {docling-2.38.0 → docling-2.39.0}/docling/cli/tools.py +0 -0
  40. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/__init__.py +0 -0
  41. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/accelerator_options.py +0 -0
  42. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/asr_model_specs.py +0 -0
  43. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/document.py +0 -0
  44. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  45. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  46. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/settings.py +0 -0
  47. {docling-2.38.0 → docling-2.39.0}/docling/datamodel/vlm_model_specs.py +0 -0
  48. {docling-2.38.0 → docling-2.39.0}/docling/document_converter.py +0 -0
  49. {docling-2.38.0 → docling-2.39.0}/docling/exceptions.py +0 -0
  50. {docling-2.38.0 → docling-2.39.0}/docling/models/__init__.py +0 -0
  51. {docling-2.38.0 → docling-2.39.0}/docling/models/api_vlm_model.py +0 -0
  52. {docling-2.38.0 → docling-2.39.0}/docling/models/base_model.py +0 -0
  53. {docling-2.38.0 → docling-2.39.0}/docling/models/base_ocr_model.py +0 -0
  54. {docling-2.38.0 → docling-2.39.0}/docling/models/code_formula_model.py +0 -0
  55. {docling-2.38.0 → docling-2.39.0}/docling/models/document_picture_classifier.py +0 -0
  56. {docling-2.38.0 → docling-2.39.0}/docling/models/easyocr_model.py +0 -0
  57. {docling-2.38.0 → docling-2.39.0}/docling/models/factories/__init__.py +0 -0
  58. {docling-2.38.0 → docling-2.39.0}/docling/models/factories/base_factory.py +0 -0
  59. {docling-2.38.0 → docling-2.39.0}/docling/models/factories/ocr_factory.py +0 -0
  60. {docling-2.38.0 → docling-2.39.0}/docling/models/factories/picture_description_factory.py +0 -0
  61. {docling-2.38.0 → docling-2.39.0}/docling/models/layout_model.py +0 -0
  62. {docling-2.38.0 → docling-2.39.0}/docling/models/ocr_mac_model.py +0 -0
  63. {docling-2.38.0 → docling-2.39.0}/docling/models/page_assemble_model.py +0 -0
  64. {docling-2.38.0 → docling-2.39.0}/docling/models/page_preprocessing_model.py +0 -0
  65. {docling-2.38.0 → docling-2.39.0}/docling/models/picture_description_api_model.py +0 -0
  66. {docling-2.38.0 → docling-2.39.0}/docling/models/picture_description_base_model.py +0 -0
  67. {docling-2.38.0 → docling-2.39.0}/docling/models/picture_description_vlm_model.py +0 -0
  68. {docling-2.38.0 → docling-2.39.0}/docling/models/plugins/__init__.py +0 -0
  69. {docling-2.38.0 → docling-2.39.0}/docling/models/plugins/defaults.py +0 -0
  70. {docling-2.38.0 → docling-2.39.0}/docling/models/rapid_ocr_model.py +0 -0
  71. {docling-2.38.0 → docling-2.39.0}/docling/models/readingorder_model.py +0 -0
  72. {docling-2.38.0 → docling-2.39.0}/docling/models/table_structure_model.py +0 -0
  73. {docling-2.38.0 → docling-2.39.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  74. {docling-2.38.0 → docling-2.39.0}/docling/models/tesseract_ocr_model.py +0 -0
  75. {docling-2.38.0 → docling-2.39.0}/docling/models/utils/__init__.py +0 -0
  76. {docling-2.38.0 → docling-2.39.0}/docling/models/utils/hf_model_download.py +0 -0
  77. {docling-2.38.0 → docling-2.39.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  78. {docling-2.38.0 → docling-2.39.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  79. {docling-2.38.0 → docling-2.39.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  80. {docling-2.38.0 → docling-2.39.0}/docling/pipeline/__init__.py +0 -0
  81. {docling-2.38.0 → docling-2.39.0}/docling/pipeline/asr_pipeline.py +0 -0
  82. {docling-2.38.0 → docling-2.39.0}/docling/pipeline/base_pipeline.py +0 -0
  83. {docling-2.38.0 → docling-2.39.0}/docling/pipeline/simple_pipeline.py +0 -0
  84. {docling-2.38.0 → docling-2.39.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  85. {docling-2.38.0 → docling-2.39.0}/docling/pipeline/vlm_pipeline.py +0 -0
  86. {docling-2.38.0 → docling-2.39.0}/docling/py.typed +0 -0
  87. {docling-2.38.0 → docling-2.39.0}/docling/utils/__init__.py +0 -0
  88. {docling-2.38.0 → docling-2.39.0}/docling/utils/accelerator_utils.py +0 -0
  89. {docling-2.38.0 → docling-2.39.0}/docling/utils/api_image_request.py +0 -0
  90. {docling-2.38.0 → docling-2.39.0}/docling/utils/export.py +0 -0
  91. {docling-2.38.0 → docling-2.39.0}/docling/utils/glm_utils.py +0 -0
  92. {docling-2.38.0 → docling-2.39.0}/docling/utils/layout_postprocessor.py +0 -0
  93. {docling-2.38.0 → docling-2.39.0}/docling/utils/locks.py +0 -0
  94. {docling-2.38.0 → docling-2.39.0}/docling/utils/model_downloader.py +0 -0
  95. {docling-2.38.0 → docling-2.39.0}/docling/utils/ocr_utils.py +0 -0
  96. {docling-2.38.0 → docling-2.39.0}/docling/utils/orientation.py +0 -0
  97. {docling-2.38.0 → docling-2.39.0}/docling/utils/profiling.py +0 -0
  98. {docling-2.38.0 → docling-2.39.0}/docling/utils/utils.py +0 -0
  99. {docling-2.38.0 → docling-2.39.0}/docling/utils/visualization.py +0 -0
  100. {docling-2.38.0 → docling-2.39.0}/docling.egg-info/SOURCES.txt +0 -0
  101. {docling-2.38.0 → docling-2.39.0}/docling.egg-info/dependency_links.txt +0 -0
  102. {docling-2.38.0 → docling-2.39.0}/docling.egg-info/entry_points.txt +0 -0
  103. {docling-2.38.0 → docling-2.39.0}/docling.egg-info/top_level.txt +0 -0
  104. {docling-2.38.0 → docling-2.39.0}/setup.cfg +0 -0
  105. {docling-2.38.0 → docling-2.39.0}/tests/test_asr_pipeline.py +0 -0
  106. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_asciidoc.py +0 -0
  107. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_csv.py +0 -0
  108. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_docling_json.py +0 -0
  109. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_docling_parse.py +0 -0
  110. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_docling_parse_v2.py +0 -0
  111. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_docling_parse_v4.py +0 -0
  112. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_html.py +0 -0
  113. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_jats.py +0 -0
  114. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_markdown.py +0 -0
  115. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_msexcel.py +0 -0
  116. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_msword.py +0 -0
  117. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_patent_uspto.py +0 -0
  118. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_pdfium.py +0 -0
  119. {docling-2.38.0 → docling-2.39.0}/tests/test_backend_webp.py +0 -0
  120. {docling-2.38.0 → docling-2.39.0}/tests/test_cli.py +0 -0
  121. {docling-2.38.0 → docling-2.39.0}/tests/test_code_formula.py +0 -0
  122. {docling-2.38.0 → docling-2.39.0}/tests/test_data_gen_flag.py +0 -0
  123. {docling-2.38.0 → docling-2.39.0}/tests/test_document_picture_classifier.py +0 -0
  124. {docling-2.38.0 → docling-2.39.0}/tests/test_e2e_conversion.py +0 -0
  125. {docling-2.38.0 → docling-2.39.0}/tests/test_e2e_ocr_conversion.py +0 -0
  126. {docling-2.38.0 → docling-2.39.0}/tests/test_input_doc.py +0 -0
  127. {docling-2.38.0 → docling-2.39.0}/tests/test_interfaces.py +0 -0
  128. {docling-2.38.0 → docling-2.39.0}/tests/test_invalid_input.py +0 -0
  129. {docling-2.38.0 → docling-2.39.0}/tests/test_legacy_format_transform.py +0 -0
  130. {docling-2.38.0 → docling-2.39.0}/tests/test_options.py +0 -0
  131. {docling-2.38.0 → docling-2.39.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.38.0
3
+ Version: 2.39.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.29.0
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
30
30
  Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
31
31
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -17,6 +17,7 @@ from docling_core.types.doc import (
17
17
  TableData,
18
18
  )
19
19
  from docling_core.types.doc.document import ContentLayer
20
+ from pydantic import BaseModel
20
21
  from typing_extensions import override
21
22
 
22
23
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [
48
49
  ]
49
50
 
50
51
 
52
+ class _Context(BaseModel):
53
+ list_ordered_flag_by_ref: dict[str, bool] = {}
54
+ list_start_by_ref: dict[str, int] = {}
55
+
56
+
51
57
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
52
58
  @override
53
59
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
@@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
59
65
  self.max_levels = 10
60
66
  self.level = 0
61
67
  self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
68
+ self.ctx = _Context()
62
69
  for i in range(self.max_levels):
63
70
  self.parents[i] = None
64
71
 
@@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
121
128
  self.content_layer = (
122
129
  ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
123
130
  )
131
+ self.ctx = _Context() # reset context
124
132
  self.walk(content, doc)
125
133
  else:
126
134
  raise RuntimeError(
@@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
294
302
  def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
295
303
  """Handles list tags (ul, ol) and their list items."""
296
304
 
297
- if element.name == "ul":
298
- # create a list group
299
- self.parents[self.level + 1] = doc.add_group(
300
- parent=self.parents[self.level],
301
- name="list",
302
- label=GroupLabel.LIST,
303
- content_layer=self.content_layer,
304
- )
305
- elif element.name == "ol":
305
+ start: Optional[int] = None
306
+ if is_ordered := element.name == "ol":
306
307
  start_attr = element.get("start")
307
- start: int = (
308
- int(start_attr)
309
- if isinstance(start_attr, str) and start_attr.isnumeric()
310
- else 1
311
- )
312
- # create a list group
313
- self.parents[self.level + 1] = doc.add_group(
314
- parent=self.parents[self.level],
315
- name="ordered list" + (f" start {start}" if start != 1 else ""),
316
- label=GroupLabel.ORDERED_LIST,
317
- content_layer=self.content_layer,
318
- )
308
+ if isinstance(start_attr, str) and start_attr.isnumeric():
309
+ start = int(start_attr)
310
+ name = "ordered list" + (f" start {start}" if start is not None else "")
311
+ else:
312
+ name = "list"
313
+ # create a list group
314
+ list_group = doc.add_list_group(
315
+ name=name,
316
+ parent=self.parents[self.level],
317
+ content_layer=self.content_layer,
318
+ )
319
+ self.parents[self.level + 1] = list_group
320
+ self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
321
+ if is_ordered and start is not None:
322
+ self.ctx.list_start_by_ref[list_group.self_ref] = start
323
+
319
324
  self.level += 1
320
325
 
321
326
  self.walk(element, doc)
@@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
331
336
  if parent is None:
332
337
  _log.debug(f"list-item has no parent in DoclingDocument: {element}")
333
338
  return
334
- parent_label: str = parent.label
335
- index_in_list = len(parent.children) + 1
336
- if (
337
- parent_label == GroupLabel.ORDERED_LIST
338
- and isinstance(parent, GroupItem)
339
- and parent.name
340
- ):
341
- start_in_list: str = parent.name.split(" ")[-1]
342
- start: int = int(start_in_list) if start_in_list.isnumeric() else 1
343
- index_in_list += start - 1
339
+ enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
340
+ if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
341
+ marker = f"{start + len(parent.children)}."
342
+ else:
343
+ marker = ""
344
344
 
345
345
  if nested_list:
346
346
  # Text in list item can be hidden within hierarchy, hence
@@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
350
350
  text = text.replace("\n", "").replace("\r", "")
351
351
  text = " ".join(text.split()).strip()
352
352
 
353
- marker = ""
354
- enumerated = False
355
- if parent_label == GroupLabel.ORDERED_LIST:
356
- marker = str(index_in_list)
357
- enumerated = True
358
-
359
353
  if len(text) > 0:
360
354
  # create a list-item
361
355
  self.parents[self.level + 1] = doc.add_list_item(
@@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
375
369
  elif element.text.strip():
376
370
  text = element.text.strip()
377
371
 
378
- marker = ""
379
- enumerated = False
380
- if parent_label == GroupLabel.ORDERED_LIST:
381
- marker = f"{index_in_list!s}."
382
- enumerated = True
383
372
  doc.add_list_item(
384
373
  text=text,
385
374
  enumerated=enumerated,
@@ -2,9 +2,10 @@ import logging
2
2
  import re
3
3
  import warnings
4
4
  from copy import deepcopy
5
+ from enum import Enum
5
6
  from io import BytesIO
6
7
  from pathlib import Path
7
- from typing import List, Optional, Set, Union
8
+ from typing import List, Literal, Optional, Set, Union
8
9
 
9
10
  import marko
10
11
  import marko.element
@@ -13,15 +14,15 @@ from docling_core.types.doc import (
13
14
  DocItemLabel,
14
15
  DoclingDocument,
15
16
  DocumentOrigin,
16
- GroupLabel,
17
17
  NodeItem,
18
18
  TableCell,
19
19
  TableData,
20
20
  TextItem,
21
21
  )
22
- from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
22
+ from docling_core.types.doc.document import Formatting
23
23
  from marko import Markdown
24
- from pydantic import AnyUrl, TypeAdapter
24
+ from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
25
+ from typing_extensions import Annotated
25
26
 
26
27
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
28
  from docling.backend.html_backend import HTMLDocumentBackend
@@ -35,6 +36,32 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
35
36
  _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
36
37
 
37
38
 
39
+ class _PendingCreationType(str, Enum):
40
+ """CoordOrigin."""
41
+
42
+ HEADING = "heading"
43
+ LIST_ITEM = "list_item"
44
+
45
+
46
+ class _HeadingCreationPayload(BaseModel):
47
+ kind: Literal["heading"] = "heading"
48
+ level: int
49
+
50
+
51
+ class _ListItemCreationPayload(BaseModel):
52
+ kind: Literal["list_item"] = "list_item"
53
+ enumerated: bool
54
+
55
+
56
+ _CreationPayload = Annotated[
57
+ Union[
58
+ _HeadingCreationPayload,
59
+ _ListItemCreationPayload,
60
+ ],
61
+ Field(discriminator="kind"),
62
+ ]
63
+
64
+
38
65
  class MarkdownDocumentBackend(DeclarativeDocumentBackend):
39
66
  def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
40
67
  # This regex will match any sequence of underscores
@@ -155,6 +182,50 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
155
182
  doc.add_table(data=table_data)
156
183
  return
157
184
 
185
+ def _create_list_item(
186
+ self,
187
+ doc: DoclingDocument,
188
+ parent_item: Optional[NodeItem],
189
+ text: str,
190
+ enumerated: bool,
191
+ formatting: Optional[Formatting] = None,
192
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
193
+ ):
194
+ item = doc.add_list_item(
195
+ text=text,
196
+ enumerated=enumerated,
197
+ parent=parent_item,
198
+ formatting=formatting,
199
+ hyperlink=hyperlink,
200
+ )
201
+ return item
202
+
203
+ def _create_heading_item(
204
+ self,
205
+ doc: DoclingDocument,
206
+ parent_item: Optional[NodeItem],
207
+ text: str,
208
+ level: int,
209
+ formatting: Optional[Formatting] = None,
210
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
211
+ ):
212
+ if level == 1:
213
+ item = doc.add_title(
214
+ text=text,
215
+ parent=parent_item,
216
+ formatting=formatting,
217
+ hyperlink=hyperlink,
218
+ )
219
+ else:
220
+ item = doc.add_heading(
221
+ text=text,
222
+ level=level - 1,
223
+ parent=parent_item,
224
+ formatting=formatting,
225
+ hyperlink=hyperlink,
226
+ )
227
+ return item
228
+
158
229
  def _iterate_elements( # noqa: C901
159
230
  self,
160
231
  *,
@@ -162,6 +233,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
162
233
  depth: int,
163
234
  doc: DoclingDocument,
164
235
  visited: Set[marko.element.Element],
236
+ creation_stack: list[
237
+ _CreationPayload
238
+ ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
239
+ list_ordered_flag_by_ref: dict[str, bool],
165
240
  parent_item: Optional[NodeItem] = None,
166
241
  formatting: Optional[Formatting] = None,
167
242
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -177,28 +252,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
177
252
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
178
253
  )
179
254
 
180
- if len(element.children) == 1:
181
- child = element.children[0]
182
- snippet_text = str(child.children) # type: ignore
183
- visited.add(child)
184
- else:
185
- snippet_text = "" # inline group will be created
186
-
187
- if element.level == 1:
188
- parent_item = doc.add_title(
189
- text=snippet_text,
190
- parent=parent_item,
255
+ if len(element.children) > 1: # inline group will be created further down
256
+ parent_item = self._create_heading_item(
257
+ doc=doc,
258
+ parent_item=parent_item,
259
+ text="",
260
+ level=element.level,
191
261
  formatting=formatting,
192
262
  hyperlink=hyperlink,
193
263
  )
194
264
  else:
195
- parent_item = doc.add_heading(
196
- text=snippet_text,
197
- level=element.level - 1,
198
- parent=parent_item,
199
- formatting=formatting,
200
- hyperlink=hyperlink,
201
- )
265
+ creation_stack.append(_HeadingCreationPayload(level=element.level))
202
266
 
203
267
  elif isinstance(element, marko.block.List):
204
268
  has_non_empty_list_items = False
@@ -210,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
210
274
  self._close_table(doc)
211
275
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
212
276
  if has_non_empty_list_items:
213
- label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
214
- parent_item = doc.add_group(
215
- label=label, name="list", parent=parent_item
216
- )
277
+ parent_item = doc.add_list_group(name="list", parent=parent_item)
278
+ list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered
217
279
 
218
280
  elif (
219
281
  isinstance(element, marko.block.ListItem)
@@ -224,22 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
224
286
  self._close_table(doc)
225
287
  _log.debug(" - List item")
226
288
 
227
- if len(child.children) == 1:
228
- snippet_text = str(child.children[0].children) # type: ignore
229
- visited.add(child)
230
- else:
231
- snippet_text = "" # inline group will be created
232
- is_numbered = isinstance(parent_item, OrderedList)
233
- if not isinstance(parent_item, (OrderedList, UnorderedList)):
234
- _log.warning("ListItem would have not had a list parent, adding one.")
235
- parent_item = doc.add_unordered_list(parent=parent_item)
236
- parent_item = doc.add_list_item(
237
- enumerated=is_numbered,
238
- parent=parent_item,
239
- text=snippet_text,
240
- formatting=formatting,
241
- hyperlink=hyperlink,
289
+ enumerated = (
290
+ list_ordered_flag_by_ref.get(parent_item.self_ref, False)
291
+ if parent_item
292
+ else False
242
293
  )
294
+ if len(child.children) > 1: # inline group will be created further down
295
+ parent_item = self._create_list_item(
296
+ doc=doc,
297
+ parent_item=parent_item,
298
+ text="",
299
+ enumerated=enumerated,
300
+ formatting=formatting,
301
+ hyperlink=hyperlink,
302
+ )
303
+ else:
304
+ creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
243
305
 
244
306
  elif isinstance(element, marko.inline.Image):
245
307
  self._close_table(doc)
@@ -276,7 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
276
338
  _log.debug(f" - Paragraph (raw text): {element.children}")
277
339
  snippet_text = element.children.strip()
278
340
  # Detect start of the table:
279
- if "|" in snippet_text:
341
+ if "|" in snippet_text or self.in_table:
280
342
  # most likely part of the markdown table
281
343
  self.in_table = True
282
344
  if len(self.md_table_buffer) > 0:
@@ -285,13 +347,46 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
285
347
  self.md_table_buffer.append(snippet_text)
286
348
  elif snippet_text:
287
349
  self._close_table(doc)
288
- doc.add_text(
289
- label=DocItemLabel.TEXT,
290
- parent=parent_item,
291
- text=snippet_text,
292
- formatting=formatting,
293
- hyperlink=hyperlink,
294
- )
350
+
351
+ if creation_stack:
352
+ while len(creation_stack) > 0:
353
+ to_create = creation_stack.pop()
354
+ if isinstance(to_create, _ListItemCreationPayload):
355
+ enumerated = (
356
+ list_ordered_flag_by_ref.get(
357
+ parent_item.self_ref, False
358
+ )
359
+ if parent_item
360
+ else False
361
+ )
362
+ parent_item = self._create_list_item(
363
+ doc=doc,
364
+ parent_item=parent_item,
365
+ text=snippet_text,
366
+ enumerated=enumerated,
367
+ formatting=formatting,
368
+ hyperlink=hyperlink,
369
+ )
370
+ elif isinstance(to_create, _HeadingCreationPayload):
371
+ # not keeping as parent_item as logic for correctly tracking
372
+ # that not implemented yet (section components not captured
373
+ # as heading children in marko)
374
+ self._create_heading_item(
375
+ doc=doc,
376
+ parent_item=parent_item,
377
+ text=snippet_text,
378
+ level=to_create.level,
379
+ formatting=formatting,
380
+ hyperlink=hyperlink,
381
+ )
382
+ else:
383
+ doc.add_text(
384
+ label=DocItemLabel.TEXT,
385
+ parent=parent_item,
386
+ text=snippet_text,
387
+ formatting=formatting,
388
+ hyperlink=hyperlink,
389
+ )
295
390
 
296
391
  elif isinstance(element, marko.inline.CodeSpan):
297
392
  self._close_table(doc)
@@ -353,7 +448,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
353
448
  parent_item = doc.add_inline_group(parent=parent_item)
354
449
 
355
450
  processed_block_types = (
356
- # marko.block.Heading,
357
451
  marko.block.CodeBlock,
358
452
  marko.block.FencedCode,
359
453
  marko.inline.RawText,
@@ -369,6 +463,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
369
463
  depth=depth + 1,
370
464
  doc=doc,
371
465
  visited=visited,
466
+ creation_stack=creation_stack,
467
+ list_ordered_flag_by_ref=list_ordered_flag_by_ref,
372
468
  parent_item=parent_item,
373
469
  formatting=formatting,
374
470
  hyperlink=hyperlink,
@@ -412,6 +508,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
412
508
  doc=doc,
413
509
  parent_item=None,
414
510
  visited=set(),
511
+ creation_stack=[],
512
+ list_ordered_flag_by_ref={},
415
513
  )
416
514
  self._close_table(doc=doc) # handle any last hanging table
417
515
 
@@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
121
121
 
122
122
  return prov
123
123
 
124
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
124
+ def handle_text_elements(
125
+ self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
126
+ ):
125
127
  is_list_group_created = False
126
128
  enum_list_item_value = 0
127
129
  new_list = None
@@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
165
167
  enumerated = bullet_type == "Numbered"
166
168
 
167
169
  if not is_list_group_created:
168
- new_list = doc.add_group(
169
- label=GroupLabel.ORDERED_LIST
170
- if enumerated
171
- else GroupLabel.LIST,
170
+ new_list = doc.add_list_group(
172
171
  name="list",
173
172
  parent=parent_slide,
174
173
  )
@@ -10,11 +10,12 @@ from docling_core.types.doc import (
10
10
  DocumentOrigin,
11
11
  GroupLabel,
12
12
  ImageRef,
13
+ ListGroup,
13
14
  NodeItem,
14
15
  TableCell,
15
16
  TableData,
16
17
  )
17
- from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
18
+ from docling_core.types.doc.document import Formatting
18
19
  from docx import Document
19
20
  from docx.document import Document as DocxDocument
20
21
  from docx.oxml.table import CT_Tc
@@ -397,7 +398,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
397
398
  if isinstance(c, Hyperlink):
398
399
  text = c.text
399
400
  hyperlink = Path(c.address)
400
- format = self._get_format_from_run(c.runs[0])
401
+ format = (
402
+ self._get_format_from_run(c.runs[0])
403
+ if c.runs and len(c.runs) > 0
404
+ else None
405
+ )
401
406
  elif isinstance(c, Run):
402
407
  text = c.text
403
408
  hyperlink = None
@@ -684,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
684
689
  paragraph_elements: list,
685
690
  ) -> Optional[NodeItem]:
686
691
  return (
687
- doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
692
+ doc.add_inline_group(parent=prev_parent)
688
693
  if len(paragraph_elements) > 1
689
694
  else prev_parent
690
695
  )
@@ -777,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
777
782
  else:
778
783
  # Inline equation
779
784
  level = self._get_level()
780
- inline_equation = doc.add_group(
781
- label=GroupLabel.INLINE, parent=self.parents[level - 1]
782
- )
785
+ inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
783
786
  text_tmp = text
784
787
  for eq in equations:
785
788
  if len(text_tmp) == 0:
@@ -927,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
927
930
  level: int,
928
931
  ) -> None:
929
932
  # This should not happen by construction
930
- if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
933
+ if not isinstance(self.parents[level], ListGroup):
934
+ return
935
+ if not elements:
931
936
  return
937
+
932
938
  if len(elements) == 1:
933
939
  text, format, hyperlink = elements[0]
934
- doc.add_list_item(
935
- marker=marker,
936
- enumerated=enumerated,
937
- parent=self.parents[level],
938
- text=text,
939
- formatting=format,
940
- hyperlink=hyperlink,
941
- )
940
+ if text:
941
+ doc.add_list_item(
942
+ marker=marker,
943
+ enumerated=enumerated,
944
+ parent=self.parents[level],
945
+ text=text,
946
+ formatting=format,
947
+ hyperlink=hyperlink,
948
+ )
942
949
  else:
943
950
  new_item = doc.add_list_item(
944
951
  marker=marker,
@@ -946,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
946
953
  parent=self.parents[level],
947
954
  text="",
948
955
  )
949
- new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
956
+ new_parent = doc.add_inline_group(parent=new_item)
950
957
  for text, format, hyperlink in elements:
951
- doc.add_text(
952
- label=DocItemLabel.TEXT,
953
- parent=new_parent,
954
- text=text,
955
- formatting=format,
956
- hyperlink=hyperlink,
957
- )
958
+ if text:
959
+ doc.add_text(
960
+ label=DocItemLabel.TEXT,
961
+ parent=new_parent,
962
+ text=text,
963
+ formatting=format,
964
+ hyperlink=hyperlink,
965
+ )
958
966
 
959
967
  def _add_list_item(
960
968
  self,
@@ -975,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
975
983
  if self._prev_numid() is None: # Open new list
976
984
  self.level_at_new_list = level
977
985
 
978
- self.parents[level] = doc.add_group(
979
- label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
986
+ self.parents[level] = doc.add_list_group(
987
+ name="list", parent=self.parents[level - 1]
980
988
  )
981
989
 
982
990
  # Set marker and enumerated arguments if this is an enumeration element.
@@ -997,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
997
1005
  self.level_at_new_list + prev_indent + 1,
998
1006
  self.level_at_new_list + ilevel + 1,
999
1007
  ):
1000
- # Determine if this is an unordered list or an ordered list.
1001
- # Set GroupLabel.ORDERED_LIST when it fits.
1002
1008
  self.listIter = 0
1003
- if is_numbered:
1004
- self.parents[i] = doc.add_group(
1005
- label=GroupLabel.ORDERED_LIST,
1006
- name="list",
1007
- parent=self.parents[i - 1],
1008
- )
1009
- else:
1010
- self.parents[i] = doc.add_group(
1011
- label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
1012
- )
1009
+ self.parents[i] = doc.add_list_group(
1010
+ name="list", parent=self.parents[i - 1]
1011
+ )
1013
1012
 
1014
1013
  # TODO: Set marker and enumerated arguments if this is an enumeration element.
1015
1014
  self.listIter += 1
@@ -301,7 +301,7 @@ class OpenAiChatMessage(BaseModel):
301
301
  class OpenAiResponseChoice(BaseModel):
302
302
  index: int
303
303
  message: OpenAiChatMessage
304
- finish_reason: str
304
+ finish_reason: Optional[str]
305
305
 
306
306
 
307
307
  class OpenAiResponseUsage(BaseModel):
@@ -207,7 +207,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
207
207
 
208
208
  # GraniteVision
209
209
  granite_picture_description = PictureDescriptionVlmOptions(
210
- repo_id="ibm-granite/granite-vision-3.1-2b-preview",
210
+ repo_id="ibm-granite/granite-vision-3.2-2b-preview",
211
211
  prompt="What is shown in this image?",
212
212
  )
213
213
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.38.0
3
+ Version: 2.39.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.29.0
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
30
30
  Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
31
31
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -1,5 +1,5 @@
1
1
  pydantic<3.0.0,>=2.0.0
2
- docling-core[chunking]<3.0.0,>=2.29.0
2
+ docling-core[chunking]<3.0.0,>=2.39.0
3
3
  docling-ibm-models<4.0.0,>=3.4.4
4
4
  docling-parse<5.0.0,>=4.0.0
5
5
  filetype<2.0.0,>=1.2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.38.0" # DO NOT EDIT, updated automatically
3
+ version = "2.39.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -44,7 +44,7 @@ authors = [
44
44
  requires-python = '>=3.9,<4.0'
45
45
  dependencies = [
46
46
  'pydantic (>=2.0.0,<3.0.0)',
47
- 'docling-core[chunking] (>=2.29.0,<3.0.0)',
47
+ 'docling-core[chunking] (>=2.39.0,<3.0.0)',
48
48
  'docling-ibm-models (>=3.4.4,<4.0.0)',
49
49
  'docling-parse (>=4.0.0,<5.0.0)',
50
50
  'filetype (>=1.2.0,<2.0.0)',
@@ -41,12 +41,12 @@ def test_e2e_pptx_conversions():
41
41
  doc: DoclingDocument = conv_result.document
42
42
 
43
43
  pred_md: str = doc.export_to_markdown()
44
- assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
44
+ assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"
45
45
 
46
46
  pred_itxt: str = doc._export_to_indented_text(
47
47
  max_text_len=70, explicit_tables=False
48
48
  )
49
- assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
49
+ assert verify_export(pred_itxt, str(gt_path) + ".itxt", GENERATE), (
50
50
  "export to indented-text"
51
51
  )
52
52
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes