docling 2.38.1__tar.gz → 2.39.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. {docling-2.38.1 → docling-2.39.0}/PKG-INFO +2 -2
  2. {docling-2.38.1 → docling-2.39.0}/docling/backend/html_backend.py +31 -42
  3. {docling-2.38.1 → docling-2.39.0}/docling/backend/md_backend.py +25 -12
  4. {docling-2.38.1 → docling-2.39.0}/docling/backend/mspowerpoint_backend.py +4 -5
  5. {docling-2.38.1 → docling-2.39.0}/docling/backend/msword_backend.py +31 -36
  6. {docling-2.38.1 → docling-2.39.0}/docling.egg-info/PKG-INFO +2 -2
  7. {docling-2.38.1 → docling-2.39.0}/docling.egg-info/requires.txt +1 -1
  8. {docling-2.38.1 → docling-2.39.0}/pyproject.toml +2 -2
  9. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_pptx.py +2 -2
  10. {docling-2.38.1 → docling-2.39.0}/LICENSE +0 -0
  11. {docling-2.38.1 → docling-2.39.0}/README.md +0 -0
  12. {docling-2.38.1 → docling-2.39.0}/docling/__init__.py +0 -0
  13. {docling-2.38.1 → docling-2.39.0}/docling/backend/__init__.py +0 -0
  14. {docling-2.38.1 → docling-2.39.0}/docling/backend/abstract_backend.py +0 -0
  15. {docling-2.38.1 → docling-2.39.0}/docling/backend/asciidoc_backend.py +0 -0
  16. {docling-2.38.1 → docling-2.39.0}/docling/backend/csv_backend.py +0 -0
  17. {docling-2.38.1 → docling-2.39.0}/docling/backend/docling_parse_backend.py +0 -0
  18. {docling-2.38.1 → docling-2.39.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  19. {docling-2.38.1 → docling-2.39.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  20. {docling-2.38.1 → docling-2.39.0}/docling/backend/docx/__init__.py +0 -0
  21. {docling-2.38.1 → docling-2.39.0}/docling/backend/docx/latex/__init__.py +0 -0
  22. {docling-2.38.1 → docling-2.39.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  23. {docling-2.38.1 → docling-2.39.0}/docling/backend/docx/latex/omml.py +0 -0
  24. {docling-2.38.1 → docling-2.39.0}/docling/backend/json/__init__.py +0 -0
  25. {docling-2.38.1 → docling-2.39.0}/docling/backend/json/docling_json_backend.py +0 -0
  26. {docling-2.38.1 → docling-2.39.0}/docling/backend/msexcel_backend.py +0 -0
  27. {docling-2.38.1 → docling-2.39.0}/docling/backend/noop_backend.py +0 -0
  28. {docling-2.38.1 → docling-2.39.0}/docling/backend/pdf_backend.py +0 -0
  29. {docling-2.38.1 → docling-2.39.0}/docling/backend/pypdfium2_backend.py +0 -0
  30. {docling-2.38.1 → docling-2.39.0}/docling/backend/xml/__init__.py +0 -0
  31. {docling-2.38.1 → docling-2.39.0}/docling/backend/xml/jats_backend.py +0 -0
  32. {docling-2.38.1 → docling-2.39.0}/docling/backend/xml/uspto_backend.py +0 -0
  33. {docling-2.38.1 → docling-2.39.0}/docling/chunking/__init__.py +0 -0
  34. {docling-2.38.1 → docling-2.39.0}/docling/cli/__init__.py +0 -0
  35. {docling-2.38.1 → docling-2.39.0}/docling/cli/main.py +0 -0
  36. {docling-2.38.1 → docling-2.39.0}/docling/cli/models.py +0 -0
  37. {docling-2.38.1 → docling-2.39.0}/docling/cli/tools.py +0 -0
  38. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/__init__.py +0 -0
  39. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/accelerator_options.py +0 -0
  40. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/asr_model_specs.py +0 -0
  41. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/base_models.py +0 -0
  42. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/document.py +0 -0
  43. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/pipeline_options.py +0 -0
  44. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  45. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  46. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/settings.py +0 -0
  47. {docling-2.38.1 → docling-2.39.0}/docling/datamodel/vlm_model_specs.py +0 -0
  48. {docling-2.38.1 → docling-2.39.0}/docling/document_converter.py +0 -0
  49. {docling-2.38.1 → docling-2.39.0}/docling/exceptions.py +0 -0
  50. {docling-2.38.1 → docling-2.39.0}/docling/models/__init__.py +0 -0
  51. {docling-2.38.1 → docling-2.39.0}/docling/models/api_vlm_model.py +0 -0
  52. {docling-2.38.1 → docling-2.39.0}/docling/models/base_model.py +0 -0
  53. {docling-2.38.1 → docling-2.39.0}/docling/models/base_ocr_model.py +0 -0
  54. {docling-2.38.1 → docling-2.39.0}/docling/models/code_formula_model.py +0 -0
  55. {docling-2.38.1 → docling-2.39.0}/docling/models/document_picture_classifier.py +0 -0
  56. {docling-2.38.1 → docling-2.39.0}/docling/models/easyocr_model.py +0 -0
  57. {docling-2.38.1 → docling-2.39.0}/docling/models/factories/__init__.py +0 -0
  58. {docling-2.38.1 → docling-2.39.0}/docling/models/factories/base_factory.py +0 -0
  59. {docling-2.38.1 → docling-2.39.0}/docling/models/factories/ocr_factory.py +0 -0
  60. {docling-2.38.1 → docling-2.39.0}/docling/models/factories/picture_description_factory.py +0 -0
  61. {docling-2.38.1 → docling-2.39.0}/docling/models/layout_model.py +0 -0
  62. {docling-2.38.1 → docling-2.39.0}/docling/models/ocr_mac_model.py +0 -0
  63. {docling-2.38.1 → docling-2.39.0}/docling/models/page_assemble_model.py +0 -0
  64. {docling-2.38.1 → docling-2.39.0}/docling/models/page_preprocessing_model.py +0 -0
  65. {docling-2.38.1 → docling-2.39.0}/docling/models/picture_description_api_model.py +0 -0
  66. {docling-2.38.1 → docling-2.39.0}/docling/models/picture_description_base_model.py +0 -0
  67. {docling-2.38.1 → docling-2.39.0}/docling/models/picture_description_vlm_model.py +0 -0
  68. {docling-2.38.1 → docling-2.39.0}/docling/models/plugins/__init__.py +0 -0
  69. {docling-2.38.1 → docling-2.39.0}/docling/models/plugins/defaults.py +0 -0
  70. {docling-2.38.1 → docling-2.39.0}/docling/models/rapid_ocr_model.py +0 -0
  71. {docling-2.38.1 → docling-2.39.0}/docling/models/readingorder_model.py +0 -0
  72. {docling-2.38.1 → docling-2.39.0}/docling/models/table_structure_model.py +0 -0
  73. {docling-2.38.1 → docling-2.39.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  74. {docling-2.38.1 → docling-2.39.0}/docling/models/tesseract_ocr_model.py +0 -0
  75. {docling-2.38.1 → docling-2.39.0}/docling/models/utils/__init__.py +0 -0
  76. {docling-2.38.1 → docling-2.39.0}/docling/models/utils/hf_model_download.py +0 -0
  77. {docling-2.38.1 → docling-2.39.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  78. {docling-2.38.1 → docling-2.39.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  79. {docling-2.38.1 → docling-2.39.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  80. {docling-2.38.1 → docling-2.39.0}/docling/pipeline/__init__.py +0 -0
  81. {docling-2.38.1 → docling-2.39.0}/docling/pipeline/asr_pipeline.py +0 -0
  82. {docling-2.38.1 → docling-2.39.0}/docling/pipeline/base_pipeline.py +0 -0
  83. {docling-2.38.1 → docling-2.39.0}/docling/pipeline/simple_pipeline.py +0 -0
  84. {docling-2.38.1 → docling-2.39.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  85. {docling-2.38.1 → docling-2.39.0}/docling/pipeline/vlm_pipeline.py +0 -0
  86. {docling-2.38.1 → docling-2.39.0}/docling/py.typed +0 -0
  87. {docling-2.38.1 → docling-2.39.0}/docling/utils/__init__.py +0 -0
  88. {docling-2.38.1 → docling-2.39.0}/docling/utils/accelerator_utils.py +0 -0
  89. {docling-2.38.1 → docling-2.39.0}/docling/utils/api_image_request.py +0 -0
  90. {docling-2.38.1 → docling-2.39.0}/docling/utils/export.py +0 -0
  91. {docling-2.38.1 → docling-2.39.0}/docling/utils/glm_utils.py +0 -0
  92. {docling-2.38.1 → docling-2.39.0}/docling/utils/layout_postprocessor.py +0 -0
  93. {docling-2.38.1 → docling-2.39.0}/docling/utils/locks.py +0 -0
  94. {docling-2.38.1 → docling-2.39.0}/docling/utils/model_downloader.py +0 -0
  95. {docling-2.38.1 → docling-2.39.0}/docling/utils/ocr_utils.py +0 -0
  96. {docling-2.38.1 → docling-2.39.0}/docling/utils/orientation.py +0 -0
  97. {docling-2.38.1 → docling-2.39.0}/docling/utils/profiling.py +0 -0
  98. {docling-2.38.1 → docling-2.39.0}/docling/utils/utils.py +0 -0
  99. {docling-2.38.1 → docling-2.39.0}/docling/utils/visualization.py +0 -0
  100. {docling-2.38.1 → docling-2.39.0}/docling.egg-info/SOURCES.txt +0 -0
  101. {docling-2.38.1 → docling-2.39.0}/docling.egg-info/dependency_links.txt +0 -0
  102. {docling-2.38.1 → docling-2.39.0}/docling.egg-info/entry_points.txt +0 -0
  103. {docling-2.38.1 → docling-2.39.0}/docling.egg-info/top_level.txt +0 -0
  104. {docling-2.38.1 → docling-2.39.0}/setup.cfg +0 -0
  105. {docling-2.38.1 → docling-2.39.0}/tests/test_asr_pipeline.py +0 -0
  106. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_asciidoc.py +0 -0
  107. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_csv.py +0 -0
  108. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_docling_json.py +0 -0
  109. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_docling_parse.py +0 -0
  110. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_docling_parse_v2.py +0 -0
  111. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_docling_parse_v4.py +0 -0
  112. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_html.py +0 -0
  113. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_jats.py +0 -0
  114. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_markdown.py +0 -0
  115. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_msexcel.py +0 -0
  116. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_msword.py +0 -0
  117. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_patent_uspto.py +0 -0
  118. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_pdfium.py +0 -0
  119. {docling-2.38.1 → docling-2.39.0}/tests/test_backend_webp.py +0 -0
  120. {docling-2.38.1 → docling-2.39.0}/tests/test_cli.py +0 -0
  121. {docling-2.38.1 → docling-2.39.0}/tests/test_code_formula.py +0 -0
  122. {docling-2.38.1 → docling-2.39.0}/tests/test_data_gen_flag.py +0 -0
  123. {docling-2.38.1 → docling-2.39.0}/tests/test_document_picture_classifier.py +0 -0
  124. {docling-2.38.1 → docling-2.39.0}/tests/test_e2e_conversion.py +0 -0
  125. {docling-2.38.1 → docling-2.39.0}/tests/test_e2e_ocr_conversion.py +0 -0
  126. {docling-2.38.1 → docling-2.39.0}/tests/test_input_doc.py +0 -0
  127. {docling-2.38.1 → docling-2.39.0}/tests/test_interfaces.py +0 -0
  128. {docling-2.38.1 → docling-2.39.0}/tests/test_invalid_input.py +0 -0
  129. {docling-2.38.1 → docling-2.39.0}/tests/test_legacy_format_transform.py +0 -0
  130. {docling-2.38.1 → docling-2.39.0}/tests/test_options.py +0 -0
  131. {docling-2.38.1 → docling-2.39.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.38.1
3
+ Version: 2.39.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.29.0
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
30
30
  Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
31
31
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -17,6 +17,7 @@ from docling_core.types.doc import (
17
17
  TableData,
18
18
  )
19
19
  from docling_core.types.doc.document import ContentLayer
20
+ from pydantic import BaseModel
20
21
  from typing_extensions import override
21
22
 
22
23
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [
48
49
  ]
49
50
 
50
51
 
52
+ class _Context(BaseModel):
53
+ list_ordered_flag_by_ref: dict[str, bool] = {}
54
+ list_start_by_ref: dict[str, int] = {}
55
+
56
+
51
57
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
52
58
  @override
53
59
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
@@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
59
65
  self.max_levels = 10
60
66
  self.level = 0
61
67
  self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
68
+ self.ctx = _Context()
62
69
  for i in range(self.max_levels):
63
70
  self.parents[i] = None
64
71
 
@@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
121
128
  self.content_layer = (
122
129
  ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
123
130
  )
131
+ self.ctx = _Context() # reset context
124
132
  self.walk(content, doc)
125
133
  else:
126
134
  raise RuntimeError(
@@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
294
302
  def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
295
303
  """Handles list tags (ul, ol) and their list items."""
296
304
 
297
- if element.name == "ul":
298
- # create a list group
299
- self.parents[self.level + 1] = doc.add_group(
300
- parent=self.parents[self.level],
301
- name="list",
302
- label=GroupLabel.LIST,
303
- content_layer=self.content_layer,
304
- )
305
- elif element.name == "ol":
305
+ start: Optional[int] = None
306
+ if is_ordered := element.name == "ol":
306
307
  start_attr = element.get("start")
307
- start: int = (
308
- int(start_attr)
309
- if isinstance(start_attr, str) and start_attr.isnumeric()
310
- else 1
311
- )
312
- # create a list group
313
- self.parents[self.level + 1] = doc.add_group(
314
- parent=self.parents[self.level],
315
- name="ordered list" + (f" start {start}" if start != 1 else ""),
316
- label=GroupLabel.ORDERED_LIST,
317
- content_layer=self.content_layer,
318
- )
308
+ if isinstance(start_attr, str) and start_attr.isnumeric():
309
+ start = int(start_attr)
310
+ name = "ordered list" + (f" start {start}" if start is not None else "")
311
+ else:
312
+ name = "list"
313
+ # create a list group
314
+ list_group = doc.add_list_group(
315
+ name=name,
316
+ parent=self.parents[self.level],
317
+ content_layer=self.content_layer,
318
+ )
319
+ self.parents[self.level + 1] = list_group
320
+ self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
321
+ if is_ordered and start is not None:
322
+ self.ctx.list_start_by_ref[list_group.self_ref] = start
323
+
319
324
  self.level += 1
320
325
 
321
326
  self.walk(element, doc)
@@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
331
336
  if parent is None:
332
337
  _log.debug(f"list-item has no parent in DoclingDocument: {element}")
333
338
  return
334
- parent_label: str = parent.label
335
- index_in_list = len(parent.children) + 1
336
- if (
337
- parent_label == GroupLabel.ORDERED_LIST
338
- and isinstance(parent, GroupItem)
339
- and parent.name
340
- ):
341
- start_in_list: str = parent.name.split(" ")[-1]
342
- start: int = int(start_in_list) if start_in_list.isnumeric() else 1
343
- index_in_list += start - 1
339
+ enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
340
+ if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
341
+ marker = f"{start + len(parent.children)}."
342
+ else:
343
+ marker = ""
344
344
 
345
345
  if nested_list:
346
346
  # Text in list item can be hidden within hierarchy, hence
@@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
350
350
  text = text.replace("\n", "").replace("\r", "")
351
351
  text = " ".join(text.split()).strip()
352
352
 
353
- marker = ""
354
- enumerated = False
355
- if parent_label == GroupLabel.ORDERED_LIST:
356
- marker = str(index_in_list)
357
- enumerated = True
358
-
359
353
  if len(text) > 0:
360
354
  # create a list-item
361
355
  self.parents[self.level + 1] = doc.add_list_item(
@@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
375
369
  elif element.text.strip():
376
370
  text = element.text.strip()
377
371
 
378
- marker = ""
379
- enumerated = False
380
- if parent_label == GroupLabel.ORDERED_LIST:
381
- marker = f"{index_in_list!s}."
382
- enumerated = True
383
372
  doc.add_list_item(
384
373
  text=text,
385
374
  enumerated=enumerated,
@@ -14,13 +14,12 @@ from docling_core.types.doc import (
14
14
  DocItemLabel,
15
15
  DoclingDocument,
16
16
  DocumentOrigin,
17
- GroupLabel,
18
17
  NodeItem,
19
18
  TableCell,
20
19
  TableData,
21
20
  TextItem,
22
21
  )
23
- from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
22
+ from docling_core.types.doc.document import Formatting
24
23
  from marko import Markdown
25
24
  from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
26
25
  from typing_extensions import Annotated
@@ -51,6 +50,7 @@ class _HeadingCreationPayload(BaseModel):
51
50
 
52
51
  class _ListItemCreationPayload(BaseModel):
53
52
  kind: Literal["list_item"] = "list_item"
53
+ enumerated: bool
54
54
 
55
55
 
56
56
  _CreationPayload = Annotated[
@@ -187,15 +187,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
187
187
  doc: DoclingDocument,
188
188
  parent_item: Optional[NodeItem],
189
189
  text: str,
190
+ enumerated: bool,
190
191
  formatting: Optional[Formatting] = None,
191
192
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
192
193
  ):
193
- if not isinstance(parent_item, (OrderedList, UnorderedList)):
194
- _log.warning("ListItem would have not had a list parent, adding one.")
195
- parent_item = doc.add_unordered_list(parent=parent_item)
196
194
  item = doc.add_list_item(
197
195
  text=text,
198
- enumerated=(isinstance(parent_item, OrderedList)),
196
+ enumerated=enumerated,
199
197
  parent=parent_item,
200
198
  formatting=formatting,
201
199
  hyperlink=hyperlink,
@@ -238,6 +236,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
238
236
  creation_stack: list[
239
237
  _CreationPayload
240
238
  ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
239
+ list_ordered_flag_by_ref: dict[str, bool],
241
240
  parent_item: Optional[NodeItem] = None,
242
241
  formatting: Optional[Formatting] = None,
243
242
  hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -275,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
275
274
  self._close_table(doc)
276
275
  _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
277
276
  if has_non_empty_list_items:
278
- label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
279
- parent_item = doc.add_group(
280
- label=label, name="list", parent=parent_item
281
- )
277
+ parent_item = doc.add_list_group(name="list", parent=parent_item)
278
+ list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered
282
279
 
283
280
  elif (
284
281
  isinstance(element, marko.block.ListItem)
@@ -289,16 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
289
286
  self._close_table(doc)
290
287
  _log.debug(" - List item")
291
288
 
289
+ enumerated = (
290
+ list_ordered_flag_by_ref.get(parent_item.self_ref, False)
291
+ if parent_item
292
+ else False
293
+ )
292
294
  if len(child.children) > 1: # inline group will be created further down
293
295
  parent_item = self._create_list_item(
294
296
  doc=doc,
295
297
  parent_item=parent_item,
296
298
  text="",
299
+ enumerated=enumerated,
297
300
  formatting=formatting,
298
301
  hyperlink=hyperlink,
299
302
  )
300
303
  else:
301
- creation_stack.append(_ListItemCreationPayload())
304
+ creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
302
305
 
303
306
  elif isinstance(element, marko.inline.Image):
304
307
  self._close_table(doc)
@@ -335,7 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
335
338
  _log.debug(f" - Paragraph (raw text): {element.children}")
336
339
  snippet_text = element.children.strip()
337
340
  # Detect start of the table:
338
- if "|" in snippet_text:
341
+ if "|" in snippet_text or self.in_table:
339
342
  # most likely part of the markdown table
340
343
  self.in_table = True
341
344
  if len(self.md_table_buffer) > 0:
@@ -349,10 +352,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
349
352
  while len(creation_stack) > 0:
350
353
  to_create = creation_stack.pop()
351
354
  if isinstance(to_create, _ListItemCreationPayload):
355
+ enumerated = (
356
+ list_ordered_flag_by_ref.get(
357
+ parent_item.self_ref, False
358
+ )
359
+ if parent_item
360
+ else False
361
+ )
352
362
  parent_item = self._create_list_item(
353
363
  doc=doc,
354
364
  parent_item=parent_item,
355
365
  text=snippet_text,
366
+ enumerated=enumerated,
356
367
  formatting=formatting,
357
368
  hyperlink=hyperlink,
358
369
  )
@@ -453,6 +464,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
453
464
  doc=doc,
454
465
  visited=visited,
455
466
  creation_stack=creation_stack,
467
+ list_ordered_flag_by_ref=list_ordered_flag_by_ref,
456
468
  parent_item=parent_item,
457
469
  formatting=formatting,
458
470
  hyperlink=hyperlink,
@@ -497,6 +509,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
497
509
  parent_item=None,
498
510
  visited=set(),
499
511
  creation_stack=[],
512
+ list_ordered_flag_by_ref={},
500
513
  )
501
514
  self._close_table(doc=doc) # handle any last hanging table
502
515
 
@@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
121
121
 
122
122
  return prov
123
123
 
124
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
124
+ def handle_text_elements(
125
+ self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
126
+ ):
125
127
  is_list_group_created = False
126
128
  enum_list_item_value = 0
127
129
  new_list = None
@@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
165
167
  enumerated = bullet_type == "Numbered"
166
168
 
167
169
  if not is_list_group_created:
168
- new_list = doc.add_group(
169
- label=GroupLabel.ORDERED_LIST
170
- if enumerated
171
- else GroupLabel.LIST,
170
+ new_list = doc.add_list_group(
172
171
  name="list",
173
172
  parent=parent_slide,
174
173
  )
@@ -10,11 +10,12 @@ from docling_core.types.doc import (
10
10
  DocumentOrigin,
11
11
  GroupLabel,
12
12
  ImageRef,
13
+ ListGroup,
13
14
  NodeItem,
14
15
  TableCell,
15
16
  TableData,
16
17
  )
17
- from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
18
+ from docling_core.types.doc.document import Formatting
18
19
  from docx import Document
19
20
  from docx.document import Document as DocxDocument
20
21
  from docx.oxml.table import CT_Tc
@@ -688,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
688
689
  paragraph_elements: list,
689
690
  ) -> Optional[NodeItem]:
690
691
  return (
691
- doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
692
+ doc.add_inline_group(parent=prev_parent)
692
693
  if len(paragraph_elements) > 1
693
694
  else prev_parent
694
695
  )
@@ -781,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
781
782
  else:
782
783
  # Inline equation
783
784
  level = self._get_level()
784
- inline_equation = doc.add_group(
785
- label=GroupLabel.INLINE, parent=self.parents[level - 1]
786
- )
785
+ inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
787
786
  text_tmp = text
788
787
  for eq in equations:
789
788
  if len(text_tmp) == 0:
@@ -931,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
931
930
  level: int,
932
931
  ) -> None:
933
932
  # This should not happen by construction
934
- if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
933
+ if not isinstance(self.parents[level], ListGroup):
934
+ return
935
+ if not elements:
935
936
  return
937
+
936
938
  if len(elements) == 1:
937
939
  text, format, hyperlink = elements[0]
938
- doc.add_list_item(
939
- marker=marker,
940
- enumerated=enumerated,
941
- parent=self.parents[level],
942
- text=text,
943
- formatting=format,
944
- hyperlink=hyperlink,
945
- )
940
+ if text:
941
+ doc.add_list_item(
942
+ marker=marker,
943
+ enumerated=enumerated,
944
+ parent=self.parents[level],
945
+ text=text,
946
+ formatting=format,
947
+ hyperlink=hyperlink,
948
+ )
946
949
  else:
947
950
  new_item = doc.add_list_item(
948
951
  marker=marker,
@@ -950,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
950
953
  parent=self.parents[level],
951
954
  text="",
952
955
  )
953
- new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
956
+ new_parent = doc.add_inline_group(parent=new_item)
954
957
  for text, format, hyperlink in elements:
955
- doc.add_text(
956
- label=DocItemLabel.TEXT,
957
- parent=new_parent,
958
- text=text,
959
- formatting=format,
960
- hyperlink=hyperlink,
961
- )
958
+ if text:
959
+ doc.add_text(
960
+ label=DocItemLabel.TEXT,
961
+ parent=new_parent,
962
+ text=text,
963
+ formatting=format,
964
+ hyperlink=hyperlink,
965
+ )
962
966
 
963
967
  def _add_list_item(
964
968
  self,
@@ -979,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
979
983
  if self._prev_numid() is None: # Open new list
980
984
  self.level_at_new_list = level
981
985
 
982
- self.parents[level] = doc.add_group(
983
- label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
986
+ self.parents[level] = doc.add_list_group(
987
+ name="list", parent=self.parents[level - 1]
984
988
  )
985
989
 
986
990
  # Set marker and enumerated arguments if this is an enumeration element.
@@ -1001,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1001
1005
  self.level_at_new_list + prev_indent + 1,
1002
1006
  self.level_at_new_list + ilevel + 1,
1003
1007
  ):
1004
- # Determine if this is an unordered list or an ordered list.
1005
- # Set GroupLabel.ORDERED_LIST when it fits.
1006
1008
  self.listIter = 0
1007
- if is_numbered:
1008
- self.parents[i] = doc.add_group(
1009
- label=GroupLabel.ORDERED_LIST,
1010
- name="list",
1011
- parent=self.parents[i - 1],
1012
- )
1013
- else:
1014
- self.parents[i] = doc.add_group(
1015
- label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
1016
- )
1009
+ self.parents[i] = doc.add_list_group(
1010
+ name="list", parent=self.parents[i - 1]
1011
+ )
1017
1012
 
1018
1013
  # TODO: Set marker and enumerated arguments if this is an enumeration element.
1019
1014
  self.listIter += 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.38.1
3
+ Version: 2.39.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.29.0
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
30
30
  Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
31
31
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -1,5 +1,5 @@
1
1
  pydantic<3.0.0,>=2.0.0
2
- docling-core[chunking]<3.0.0,>=2.29.0
2
+ docling-core[chunking]<3.0.0,>=2.39.0
3
3
  docling-ibm-models<4.0.0,>=3.4.4
4
4
  docling-parse<5.0.0,>=4.0.0
5
5
  filetype<2.0.0,>=1.2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.38.1" # DO NOT EDIT, updated automatically
3
+ version = "2.39.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -44,7 +44,7 @@ authors = [
44
44
  requires-python = '>=3.9,<4.0'
45
45
  dependencies = [
46
46
  'pydantic (>=2.0.0,<3.0.0)',
47
- 'docling-core[chunking] (>=2.29.0,<3.0.0)',
47
+ 'docling-core[chunking] (>=2.39.0,<3.0.0)',
48
48
  'docling-ibm-models (>=3.4.4,<4.0.0)',
49
49
  'docling-parse (>=4.0.0,<5.0.0)',
50
50
  'filetype (>=1.2.0,<2.0.0)',
@@ -41,12 +41,12 @@ def test_e2e_pptx_conversions():
41
41
  doc: DoclingDocument = conv_result.document
42
42
 
43
43
  pred_md: str = doc.export_to_markdown()
44
- assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
44
+ assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"
45
45
 
46
46
  pred_itxt: str = doc._export_to_indented_text(
47
47
  max_text_len=70, explicit_tables=False
48
48
  )
49
- assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
49
+ assert verify_export(pred_itxt, str(gt_path) + ".itxt", GENERATE), (
50
50
  "export to indented-text"
51
51
  )
52
52
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes