docling 2.64.1__tar.gz → 2.66.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. {docling-2.64.1 → docling-2.66.0}/PKG-INFO +1 -1
  2. {docling-2.64.1 → docling-2.66.0}/docling/backend/html_backend.py +5 -2
  3. {docling-2.64.1 → docling-2.66.0}/docling/backend/md_backend.py +4 -0
  4. {docling-2.64.1 → docling-2.66.0}/docling/backend/msword_backend.py +12 -6
  5. {docling-2.64.1 → docling-2.66.0}/docling/cli/main.py +11 -0
  6. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/backend_options.py +6 -0
  7. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/base_models.py +1 -0
  8. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/pipeline_options.py +68 -12
  9. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/vlm_model_specs.py +20 -4
  10. {docling-2.64.1 → docling-2.66.0}/docling/document_converter.py +109 -6
  11. {docling-2.64.1 → docling-2.66.0}/docling/models/rapid_ocr_model.py +1 -1
  12. {docling-2.64.1 → docling-2.66.0}/docling.egg-info/PKG-INFO +1 -1
  13. {docling-2.64.1 → docling-2.66.0}/pyproject.toml +1 -1
  14. {docling-2.64.1 → docling-2.66.0}/LICENSE +0 -0
  15. {docling-2.64.1 → docling-2.66.0}/README.md +0 -0
  16. {docling-2.64.1 → docling-2.66.0}/docling/__init__.py +0 -0
  17. {docling-2.64.1 → docling-2.66.0}/docling/backend/__init__.py +0 -0
  18. {docling-2.64.1 → docling-2.66.0}/docling/backend/abstract_backend.py +0 -0
  19. {docling-2.64.1 → docling-2.66.0}/docling/backend/asciidoc_backend.py +0 -0
  20. {docling-2.64.1 → docling-2.66.0}/docling/backend/csv_backend.py +0 -0
  21. {docling-2.64.1 → docling-2.66.0}/docling/backend/docling_parse_backend.py +0 -0
  22. {docling-2.64.1 → docling-2.66.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  23. {docling-2.64.1 → docling-2.66.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  24. {docling-2.64.1 → docling-2.66.0}/docling/backend/docx/__init__.py +0 -0
  25. {docling-2.64.1 → docling-2.66.0}/docling/backend/docx/drawingml/utils.py +0 -0
  26. {docling-2.64.1 → docling-2.66.0}/docling/backend/docx/latex/__init__.py +0 -0
  27. {docling-2.64.1 → docling-2.66.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  28. {docling-2.64.1 → docling-2.66.0}/docling/backend/docx/latex/omml.py +0 -0
  29. {docling-2.64.1 → docling-2.66.0}/docling/backend/image_backend.py +0 -0
  30. {docling-2.64.1 → docling-2.66.0}/docling/backend/json/__init__.py +0 -0
  31. {docling-2.64.1 → docling-2.66.0}/docling/backend/json/docling_json_backend.py +0 -0
  32. {docling-2.64.1 → docling-2.66.0}/docling/backend/mets_gbs_backend.py +0 -0
  33. {docling-2.64.1 → docling-2.66.0}/docling/backend/msexcel_backend.py +0 -0
  34. {docling-2.64.1 → docling-2.66.0}/docling/backend/mspowerpoint_backend.py +0 -0
  35. {docling-2.64.1 → docling-2.66.0}/docling/backend/noop_backend.py +0 -0
  36. {docling-2.64.1 → docling-2.66.0}/docling/backend/pdf_backend.py +0 -0
  37. {docling-2.64.1 → docling-2.66.0}/docling/backend/pypdfium2_backend.py +0 -0
  38. {docling-2.64.1 → docling-2.66.0}/docling/backend/webvtt_backend.py +0 -0
  39. {docling-2.64.1 → docling-2.66.0}/docling/backend/xml/__init__.py +0 -0
  40. {docling-2.64.1 → docling-2.66.0}/docling/backend/xml/jats_backend.py +0 -0
  41. {docling-2.64.1 → docling-2.66.0}/docling/backend/xml/uspto_backend.py +0 -0
  42. {docling-2.64.1 → docling-2.66.0}/docling/chunking/__init__.py +0 -0
  43. {docling-2.64.1 → docling-2.66.0}/docling/cli/__init__.py +0 -0
  44. {docling-2.64.1 → docling-2.66.0}/docling/cli/models.py +0 -0
  45. {docling-2.64.1 → docling-2.66.0}/docling/cli/tools.py +0 -0
  46. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/__init__.py +0 -0
  47. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/accelerator_options.py +0 -0
  48. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/asr_model_specs.py +0 -0
  49. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/document.py +0 -0
  50. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/extraction.py +0 -0
  51. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/layout_model_specs.py +0 -0
  52. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  53. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  54. {docling-2.64.1 → docling-2.66.0}/docling/datamodel/settings.py +0 -0
  55. {docling-2.64.1 → docling-2.66.0}/docling/document_extractor.py +0 -0
  56. {docling-2.64.1 → docling-2.66.0}/docling/exceptions.py +0 -0
  57. {docling-2.64.1 → docling-2.66.0}/docling/experimental/__init__.py +0 -0
  58. {docling-2.64.1 → docling-2.66.0}/docling/experimental/datamodel/__init__.py +0 -0
  59. {docling-2.64.1 → docling-2.66.0}/docling/experimental/datamodel/table_crops_layout_options.py +0 -0
  60. {docling-2.64.1 → docling-2.66.0}/docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +0 -0
  61. {docling-2.64.1 → docling-2.66.0}/docling/experimental/models/__init__.py +0 -0
  62. {docling-2.64.1 → docling-2.66.0}/docling/experimental/models/table_crops_layout_model.py +0 -0
  63. {docling-2.64.1 → docling-2.66.0}/docling/experimental/pipeline/__init__.py +0 -0
  64. {docling-2.64.1 → docling-2.66.0}/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +0 -0
  65. {docling-2.64.1 → docling-2.66.0}/docling/models/__init__.py +0 -0
  66. {docling-2.64.1 → docling-2.66.0}/docling/models/api_vlm_model.py +0 -0
  67. {docling-2.64.1 → docling-2.66.0}/docling/models/auto_ocr_model.py +0 -0
  68. {docling-2.64.1 → docling-2.66.0}/docling/models/base_layout_model.py +0 -0
  69. {docling-2.64.1 → docling-2.66.0}/docling/models/base_model.py +0 -0
  70. {docling-2.64.1 → docling-2.66.0}/docling/models/base_ocr_model.py +0 -0
  71. {docling-2.64.1 → docling-2.66.0}/docling/models/base_table_model.py +0 -0
  72. {docling-2.64.1 → docling-2.66.0}/docling/models/code_formula_model.py +0 -0
  73. {docling-2.64.1 → docling-2.66.0}/docling/models/document_picture_classifier.py +0 -0
  74. {docling-2.64.1 → docling-2.66.0}/docling/models/easyocr_model.py +0 -0
  75. {docling-2.64.1 → docling-2.66.0}/docling/models/factories/__init__.py +0 -0
  76. {docling-2.64.1 → docling-2.66.0}/docling/models/factories/base_factory.py +0 -0
  77. {docling-2.64.1 → docling-2.66.0}/docling/models/factories/layout_factory.py +0 -0
  78. {docling-2.64.1 → docling-2.66.0}/docling/models/factories/ocr_factory.py +0 -0
  79. {docling-2.64.1 → docling-2.66.0}/docling/models/factories/picture_description_factory.py +0 -0
  80. {docling-2.64.1 → docling-2.66.0}/docling/models/factories/table_factory.py +0 -0
  81. {docling-2.64.1 → docling-2.66.0}/docling/models/layout_model.py +0 -0
  82. {docling-2.64.1 → docling-2.66.0}/docling/models/ocr_mac_model.py +0 -0
  83. {docling-2.64.1 → docling-2.66.0}/docling/models/page_assemble_model.py +0 -0
  84. {docling-2.64.1 → docling-2.66.0}/docling/models/page_preprocessing_model.py +0 -0
  85. {docling-2.64.1 → docling-2.66.0}/docling/models/picture_description_api_model.py +0 -0
  86. {docling-2.64.1 → docling-2.66.0}/docling/models/picture_description_base_model.py +0 -0
  87. {docling-2.64.1 → docling-2.66.0}/docling/models/picture_description_vlm_model.py +0 -0
  88. {docling-2.64.1 → docling-2.66.0}/docling/models/plugins/__init__.py +0 -0
  89. {docling-2.64.1 → docling-2.66.0}/docling/models/plugins/defaults.py +0 -0
  90. {docling-2.64.1 → docling-2.66.0}/docling/models/readingorder_model.py +0 -0
  91. {docling-2.64.1 → docling-2.66.0}/docling/models/table_structure_model.py +0 -0
  92. {docling-2.64.1 → docling-2.66.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  93. {docling-2.64.1 → docling-2.66.0}/docling/models/tesseract_ocr_model.py +0 -0
  94. {docling-2.64.1 → docling-2.66.0}/docling/models/utils/__init__.py +0 -0
  95. {docling-2.64.1 → docling-2.66.0}/docling/models/utils/generation_utils.py +0 -0
  96. {docling-2.64.1 → docling-2.66.0}/docling/models/utils/hf_model_download.py +0 -0
  97. {docling-2.64.1 → docling-2.66.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  98. {docling-2.64.1 → docling-2.66.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  99. {docling-2.64.1 → docling-2.66.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  100. {docling-2.64.1 → docling-2.66.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
  101. {docling-2.64.1 → docling-2.66.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  102. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/__init__.py +0 -0
  103. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/asr_pipeline.py +0 -0
  104. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/base_extraction_pipeline.py +0 -0
  105. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/base_pipeline.py +0 -0
  106. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
  107. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/legacy_standard_pdf_pipeline.py +0 -0
  108. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/simple_pipeline.py +0 -0
  109. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  110. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  111. {docling-2.64.1 → docling-2.66.0}/docling/pipeline/vlm_pipeline.py +0 -0
  112. {docling-2.64.1 → docling-2.66.0}/docling/py.typed +0 -0
  113. {docling-2.64.1 → docling-2.66.0}/docling/utils/__init__.py +0 -0
  114. {docling-2.64.1 → docling-2.66.0}/docling/utils/accelerator_utils.py +0 -0
  115. {docling-2.64.1 → docling-2.66.0}/docling/utils/api_image_request.py +0 -0
  116. {docling-2.64.1 → docling-2.66.0}/docling/utils/export.py +0 -0
  117. {docling-2.64.1 → docling-2.66.0}/docling/utils/glm_utils.py +0 -0
  118. {docling-2.64.1 → docling-2.66.0}/docling/utils/layout_postprocessor.py +0 -0
  119. {docling-2.64.1 → docling-2.66.0}/docling/utils/locks.py +0 -0
  120. {docling-2.64.1 → docling-2.66.0}/docling/utils/model_downloader.py +0 -0
  121. {docling-2.64.1 → docling-2.66.0}/docling/utils/ocr_utils.py +0 -0
  122. {docling-2.64.1 → docling-2.66.0}/docling/utils/orientation.py +0 -0
  123. {docling-2.64.1 → docling-2.66.0}/docling/utils/profiling.py +0 -0
  124. {docling-2.64.1 → docling-2.66.0}/docling/utils/utils.py +0 -0
  125. {docling-2.64.1 → docling-2.66.0}/docling/utils/visualization.py +0 -0
  126. {docling-2.64.1 → docling-2.66.0}/docling.egg-info/SOURCES.txt +0 -0
  127. {docling-2.64.1 → docling-2.66.0}/docling.egg-info/dependency_links.txt +0 -0
  128. {docling-2.64.1 → docling-2.66.0}/docling.egg-info/entry_points.txt +0 -0
  129. {docling-2.64.1 → docling-2.66.0}/docling.egg-info/requires.txt +0 -0
  130. {docling-2.64.1 → docling-2.66.0}/docling.egg-info/top_level.txt +0 -0
  131. {docling-2.64.1 → docling-2.66.0}/setup.cfg +0 -0
  132. {docling-2.64.1 → docling-2.66.0}/tests/test_asr_mlx_whisper.py +0 -0
  133. {docling-2.64.1 → docling-2.66.0}/tests/test_asr_pipeline.py +0 -0
  134. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_asciidoc.py +0 -0
  135. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_csv.py +0 -0
  136. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_docling_json.py +0 -0
  137. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_docling_parse.py +0 -0
  138. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_docling_parse_v2.py +0 -0
  139. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_docling_parse_v4.py +0 -0
  140. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_html.py +0 -0
  141. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_image_native.py +0 -0
  142. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_jats.py +0 -0
  143. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_markdown.py +0 -0
  144. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_mets_gbs.py +0 -0
  145. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_msexcel.py +0 -0
  146. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_msword.py +0 -0
  147. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_patent_uspto.py +0 -0
  148. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_pdfium.py +0 -0
  149. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_pptx.py +0 -0
  150. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_vtt.py +0 -0
  151. {docling-2.64.1 → docling-2.66.0}/tests/test_backend_webp.py +0 -0
  152. {docling-2.64.1 → docling-2.66.0}/tests/test_cli.py +0 -0
  153. {docling-2.64.1 → docling-2.66.0}/tests/test_code_formula.py +0 -0
  154. {docling-2.64.1 → docling-2.66.0}/tests/test_conversion_result_json.py +0 -0
  155. {docling-2.64.1 → docling-2.66.0}/tests/test_data_gen_flag.py +0 -0
  156. {docling-2.64.1 → docling-2.66.0}/tests/test_document_picture_classifier.py +0 -0
  157. {docling-2.64.1 → docling-2.66.0}/tests/test_e2e_conversion.py +0 -0
  158. {docling-2.64.1 → docling-2.66.0}/tests/test_e2e_ocr_conversion.py +0 -0
  159. {docling-2.64.1 → docling-2.66.0}/tests/test_extraction.py +0 -0
  160. {docling-2.64.1 → docling-2.66.0}/tests/test_input_doc.py +0 -0
  161. {docling-2.64.1 → docling-2.66.0}/tests/test_interfaces.py +0 -0
  162. {docling-2.64.1 → docling-2.66.0}/tests/test_invalid_input.py +0 -0
  163. {docling-2.64.1 → docling-2.66.0}/tests/test_legacy_format_transform.py +0 -0
  164. {docling-2.64.1 → docling-2.66.0}/tests/test_ocr_utils.py +0 -0
  165. {docling-2.64.1 → docling-2.66.0}/tests/test_options.py +0 -0
  166. {docling-2.64.1 → docling-2.66.0}/tests/test_pdf_password.py +0 -0
  167. {docling-2.64.1 → docling-2.66.0}/tests/test_settings_load.py +0 -0
  168. {docling-2.64.1 → docling-2.66.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.64.1
3
+ Version: 2.66.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -236,6 +236,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
236
236
  options: HTMLBackendOptions = HTMLBackendOptions(),
237
237
  ):
238
238
  super().__init__(in_doc, path_or_stream, options)
239
+ self.options: HTMLBackendOptions
239
240
  self.soup: Optional[BeautifulSoup] = None
240
241
  self.path_or_stream: Union[BytesIO, Path] = path_or_stream
241
242
  self.base_path: Optional[str] = str(options.source_uri)
@@ -299,7 +300,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
299
300
  assert self.soup is not None
300
301
  # set the title as furniture, since it is part of the document metadata
301
302
  title = self.soup.title
302
- if title:
303
+ if title and self.options.add_title:
303
304
  title_text = title.get_text(separator=" ", strip=True)
304
305
  title_clean = HTMLDocumentBackend._clean_unicode(title_text)
305
306
  doc.add_title(
@@ -333,7 +334,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
333
334
  header = clean_headers[0]
334
335
  # Set starting content layer
335
336
  self.content_layer = (
336
- ContentLayer.BODY if header is None else ContentLayer.FURNITURE
337
+ ContentLayer.BODY
338
+ if (not self.options.infer_furniture) or (header is None)
339
+ else ContentLayer.FURNITURE
337
340
  )
338
341
  # reset context
339
342
  self.ctx = _Context()
@@ -12,6 +12,8 @@ import marko
12
12
  import marko.element
13
13
  import marko.inline
14
14
  from docling_core.types.doc import (
15
+ ContentLayer,
16
+ DocItem,
15
17
  DocItemLabel,
16
18
  DoclingDocument,
17
19
  DocumentOrigin,
@@ -593,6 +595,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
593
595
  enable_remote_fetch=md_options.enable_remote_fetch,
594
596
  fetch_images=md_options.fetch_images,
595
597
  source_uri=md_options.source_uri,
598
+ infer_furniture=False,
599
+ add_title=False,
596
600
  )
597
601
  in_doc = InputDocument(
598
602
  path_or_stream=stream,
@@ -25,6 +25,7 @@ from docx import Document
25
25
  from docx.document import Document as DocxDocument
26
26
  from docx.oxml.table import CT_Tc
27
27
  from docx.oxml.xmlchemy import BaseOxmlElement
28
+ from docx.styles.style import ParagraphStyle
28
29
  from docx.table import Table, _Cell
29
30
  from docx.text.hyperlink import Hyperlink
30
31
  from docx.text.paragraph import Paragraph
@@ -511,15 +512,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
511
512
  if paragraph.style is None:
512
513
  return "Normal", None
513
514
 
514
- label = paragraph.style.style_id
515
- name = paragraph.style.name
516
- base_style_label = None
517
- base_style_name = None
518
- if base_style := getattr(paragraph.style, "base_style", None):
515
+ label: str = paragraph.style.style_id
516
+ name: str = paragraph.style.name or ""
517
+ base_style_label: Optional[str] = None
518
+ base_style_name: Optional[str] = None
519
+ if isinstance(
520
+ base_style := getattr(paragraph.style, "base_style", None), ParagraphStyle
521
+ ):
519
522
  base_style_label = base_style.style_id
520
523
  base_style_name = base_style.name
521
524
 
522
- if label is None:
525
+ if not label:
523
526
  return "Normal", None
524
527
 
525
528
  if ":" in label:
@@ -1348,6 +1351,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1348
1351
  _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
1349
1352
  col_idx = 0
1350
1353
  while col_idx < num_cols:
1354
+ # Handle merged cells: row may have fewer cells than num_cols
1355
+ if col_idx >= len(row.cells):
1356
+ break
1351
1357
  cell: _Cell = row.cells[col_idx]
1352
1358
  _log.debug(
1353
1359
  f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
@@ -201,6 +201,7 @@ def export_documents(
201
201
  conv_results: Iterable[ConversionResult],
202
202
  output_dir: Path,
203
203
  export_json: bool,
204
+ export_yaml: bool,
204
205
  export_html: bool,
205
206
  export_html_split_page: bool,
206
207
  show_layout: bool,
@@ -225,6 +226,14 @@ def export_documents(
225
226
  filename=fname, image_mode=image_export_mode
226
227
  )
227
228
 
229
+ # Export YAML format:
230
+ if export_yaml:
231
+ fname = output_dir / f"{doc_filename}.yaml"
232
+ _log.info(f"writing YAML output to {fname}")
233
+ conv_res.document.save_as_yaml(
234
+ filename=fname, image_mode=image_export_mode
235
+ )
236
+
228
237
  # Export HTML format:
229
238
  if export_html:
230
239
  fname = output_dir / f"{doc_filename}.html"
@@ -602,6 +611,7 @@ def convert( # noqa: C901
602
611
  to_formats = [OutputFormat.MARKDOWN]
603
612
 
604
613
  export_json = OutputFormat.JSON in to_formats
614
+ export_yaml = OutputFormat.YAML in to_formats
605
615
  export_html = OutputFormat.HTML in to_formats
606
616
  export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
607
617
  export_md = OutputFormat.MARKDOWN in to_formats
@@ -873,6 +883,7 @@ def convert( # noqa: C901
873
883
  conv_results,
874
884
  output_dir=output,
875
885
  export_json=export_json,
886
+ export_yaml=export_yaml,
876
887
  export_html=export_html,
877
888
  export_html_split_page=export_html_split_page,
878
889
  show_layout=show_layout,
@@ -42,6 +42,12 @@ class HTMLBackendOptions(BaseBackendOptions):
42
42
  "will use it to resolve relative paths in the HTML document."
43
43
  ),
44
44
  )
45
+ add_title: bool = Field(
46
+ True, description="Add the HTML title tag as furniture in the DoclingDocument."
47
+ )
48
+ infer_furniture: bool = Field(
49
+ True, description="Infer all the content before the first header as furniture."
50
+ )
45
51
 
46
52
 
47
53
  class MarkdownBackendOptions(BaseBackendOptions):
@@ -75,6 +75,7 @@ class InputFormat(str, Enum):
75
75
  class OutputFormat(str, Enum):
76
76
  MARKDOWN = "md"
77
77
  JSON = "json"
78
+ YAML = "yaml"
78
79
  HTML = "html"
79
80
  HTML_SPLIT_PAGE = "html_split_page"
80
81
  TEXT = "text"
@@ -2,7 +2,7 @@ import logging
2
2
  from datetime import datetime
3
3
  from enum import Enum
4
4
  from pathlib import Path
5
- from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
5
+ from typing import Annotated, Any, ClassVar, Dict, List, Literal, Optional, Union
6
6
 
7
7
  from pydantic import (
8
8
  AnyUrl,
@@ -79,18 +79,41 @@ class TableStructureOptions(BaseTableStructureOptions):
79
79
  class OcrOptions(BaseOptions):
80
80
  """OCR options."""
81
81
 
82
- lang: List[str]
83
- force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
84
- bitmap_area_threshold: float = (
85
- 0.05 # percentage of the area for a bitmap to processed with OCR
86
- )
82
+ lang: Annotated[
83
+ List[str],
84
+ Field(
85
+ description="List of OCR languages to use. The format must match the values of the OCR engine of choice.",
86
+ examples=[["deu", "eng"]],
87
+ ),
88
+ ]
89
+
90
+ force_full_page_ocr: Annotated[
91
+ bool,
92
+ Field(
93
+ description="If enabled, a full-page OCR is always applied.",
94
+ examples=[False],
95
+ ),
96
+ ] = False
97
+
98
+ bitmap_area_threshold: Annotated[
99
+ float,
100
+ Field(
101
+ description="Percentage of the page area for a bitmap to be processed with OCR.",
102
+ examples=[0.05, 0.1],
103
+ ),
104
+ ] = 0.05
87
105
 
88
106
 
89
107
  class OcrAutoOptions(OcrOptions):
90
108
  """Options for pick OCR engine automatically."""
91
109
 
92
110
  kind: ClassVar[Literal["auto"]] = "auto"
93
- lang: List[str] = []
111
+ lang: Annotated[
112
+ List[str],
113
+ Field(
114
+ description="The automatic OCR engine will use the default values of the engine. Please specify the engine explicitly to change the language selection.",
115
+ ),
116
+ ] = []
94
117
 
95
118
 
96
119
  class RapidOcrOptions(OcrOptions):
@@ -278,11 +301,44 @@ class OcrEngine(str, Enum):
278
301
  class PipelineOptions(BaseOptions):
279
302
  """Base pipeline options."""
280
303
 
281
- document_timeout: Optional[float] = None
282
- accelerator_options: AcceleratorOptions = AcceleratorOptions()
283
- enable_remote_services: bool = False
284
- allow_external_plugins: bool = False
285
- artifacts_path: Optional[Union[Path, str]] = None
304
+ document_timeout: Annotated[
305
+ Optional[float],
306
+ Field(
307
+ description="Maximum allowed processing time for a document before timing out. If None, no timeout is enforced.",
308
+ examples=[10.0, 20.0],
309
+ ),
310
+ ] = None
311
+
312
+ accelerator_options: Annotated[
313
+ AcceleratorOptions,
314
+ Field(
315
+ description="Configuration options for hardware acceleration (e.g., GPU or optimized execution settings).",
316
+ ),
317
+ ] = AcceleratorOptions()
318
+
319
+ enable_remote_services: Annotated[
320
+ bool,
321
+ Field(
322
+ description="Enable calling external APIs or cloud services during pipeline execution.",
323
+ examples=[False],
324
+ ),
325
+ ] = False
326
+
327
+ allow_external_plugins: Annotated[
328
+ bool,
329
+ Field(
330
+ description="Allow loading external third-party plugins or modules. Disabled by default for safety.",
331
+ examples=[False],
332
+ ),
333
+ ] = False
334
+
335
+ artifacts_path: Annotated[
336
+ Optional[Union[Path, str]],
337
+ Field(
338
+ description="Filesystem path where pipeline artifacts should be stored. If None, artifacts will be fetched. You can use the utility `docling-tools models download` to pre-fetch the model artifacts.",
339
+ examples=["./artifacts", "/tmp/docling_outputs"],
340
+ ),
341
+ ] = None
286
342
 
287
343
 
288
344
  class ConvertPipelineOptions(PipelineOptions):
@@ -38,10 +38,6 @@ GRANITEDOCLING_TRANSFORMERS = InlineVlmOptions(
38
38
 
39
39
  GRANITEDOCLING_VLLM = GRANITEDOCLING_TRANSFORMERS.model_copy()
40
40
  GRANITEDOCLING_VLLM.inference_framework = InferenceFramework.VLLM
41
- GRANITEDOCLING_VLLM.revision = (
42
- "untied" # change back to "main" with next vllm relase after 0.10.2
43
- )
44
-
45
41
 
46
42
  GRANITEDOCLING_MLX = InlineVlmOptions(
47
43
  repo_id="ibm-granite/granite-docling-258M-mlx",
@@ -55,6 +51,26 @@ GRANITEDOCLING_MLX = InlineVlmOptions(
55
51
  stop_strings=["</doctag>", "<|end_of_text|>"],
56
52
  )
57
53
 
54
+ GRANITEDOCLING_VLLM_API = ApiVlmOptions(
55
+ url="http://localhost:8000/v1/chat/completions", # LM studio defaults to port 1234, VLLM to 8000
56
+ params=dict(
57
+ model=GRANITEDOCLING_TRANSFORMERS.repo_id,
58
+ max_tokens=4096,
59
+ skip_special_tokens=True,
60
+ ),
61
+ prompt=GRANITEDOCLING_TRANSFORMERS.prompt,
62
+ timeout=90,
63
+ scale=2.0,
64
+ temperature=0.0,
65
+ concurrency=4,
66
+ stop_strings=["</doctag>", "<|end_of_text|>"],
67
+ response_format=ResponseFormat.DOCTAGS,
68
+ )
69
+
70
+ GRANITEDOCLING_OLLAMA = GRANITEDOCLING_VLLM_API.model_copy()
71
+ GRANITEDOCLING_OLLAMA.url = AnyUrl("http://localhost:11434/v1/chat/completions")
72
+ GRANITEDOCLING_OLLAMA.params["model"] = "ibm/granite-docling:258m"
73
+
58
74
  # SmolDocling
59
75
  SMOLDOCLING_MLX = InlineVlmOptions(
60
76
  repo_id="docling-project/SmolDocling-256M-preview-mlx-bf16",
@@ -176,14 +176,38 @@ def _get_default_option(format: InputFormat) -> FormatOption:
176
176
 
177
177
 
178
178
  class DocumentConverter:
179
+ """Convert documents of various input formats to Docling documents.
180
+
181
+ `DocumentConverter` is the main entry point for converting documents in Docling.
182
+ It handles various input formats (PDF, DOCX, PPTX, images, HTML, Markdown, etc.)
183
+ and provides both single-document and batch conversion capabilities.
184
+
185
+ The conversion methods return a `ConversionResult` instance for each document,
186
+ which wraps a `DoclingDocument` object if the conversion was successful, along
187
+ with metadata about the conversion process.
188
+
189
+ Attributes:
190
+ allowed_formats: Allowed input formats.
191
+ format_to_options: Mapping of formats to their options.
192
+ initialized_pipelines: Cache of initialized pipelines keyed by
193
+ (pipeline class, options hash).
194
+ """
195
+
179
196
  _default_download_filename = "file"
180
197
 
181
198
  def __init__(
182
199
  self,
183
200
  allowed_formats: Optional[list[InputFormat]] = None,
184
201
  format_options: Optional[dict[InputFormat, FormatOption]] = None,
185
- ):
186
- self.allowed_formats = (
202
+ ) -> None:
203
+ """Initialize the converter based on format preferences.
204
+
205
+ Args:
206
+ allowed_formats: List of allowed input formats. By default, any
207
+ format supported by Docling is allowed.
208
+ format_options: Dictionary of format-specific options.
209
+ """
210
+ self.allowed_formats: list[InputFormat] = (
187
211
  allowed_formats if allowed_formats is not None else list(InputFormat)
188
212
  )
189
213
 
@@ -237,7 +261,19 @@ class DocumentConverter:
237
261
  ).hexdigest()
238
262
 
239
263
  def initialize_pipeline(self, format: InputFormat):
240
- """Initialize the conversion pipeline for the selected format."""
264
+ """Initialize the conversion pipeline for the selected format.
265
+
266
+ Args:
267
+ format: The input format for which to initialize the pipeline.
268
+
269
+ Raises:
270
+ ConversionError: If no pipeline could be initialized for the
271
+ given format.
272
+ RuntimeError: If `artifacts_path` is set in
273
+ `docling.datamodel.settings.settings` when required by
274
+ the pipeline, but points to a non-directory file.
275
+ FileNotFoundError: If local model files are not found.
276
+ """
241
277
  pipeline = self._get_pipeline(doc_format=format)
242
278
  if pipeline is None:
243
279
  raise ConversionError(
@@ -254,6 +290,30 @@ class DocumentConverter:
254
290
  max_file_size: int = sys.maxsize,
255
291
  page_range: PageRange = DEFAULT_PAGE_RANGE,
256
292
  ) -> ConversionResult:
293
+ """Convert one document fetched from a file path, URL, or DocumentStream.
294
+
295
+ Note: If the document content is given as a string (Markdown or HTML
296
+ content), use the `convert_string` method.
297
+
298
+ Args:
299
+ source: Source of input document given as file path, URL, or
300
+ DocumentStream.
301
+ headers: Optional headers given as a dictionary of string key-value pairs,
302
+ in case of URL input source.
303
+ raises_on_error: Whether to raise an error on the first conversion failure.
304
+ If False, errors are captured in the ConversionResult objects.
305
+ max_num_pages: Maximum number of pages accepted per document.
306
+ Documents exceeding this number will not be converted.
307
+ max_file_size: Maximum file size to convert.
308
+ page_range: Range of pages to convert.
309
+
310
+ Returns:
311
+ The conversion result, which contains a `DoclingDocument` in the `document`
312
+ attribute, and metadata about the conversion process.
313
+
314
+ Raises:
315
+ ConversionError: An error occurred during conversion.
316
+ """
257
317
  all_res = self.convert_all(
258
318
  source=[source],
259
319
  raises_on_error=raises_on_error,
@@ -269,11 +329,31 @@ class DocumentConverter:
269
329
  self,
270
330
  source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
271
331
  headers: Optional[dict[str, str]] = None,
272
- raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
332
+ raises_on_error: bool = True,
273
333
  max_num_pages: int = sys.maxsize,
274
334
  max_file_size: int = sys.maxsize,
275
335
  page_range: PageRange = DEFAULT_PAGE_RANGE,
276
336
  ) -> Iterator[ConversionResult]:
337
+ """Convert multiple documents from file paths, URLs, or DocumentStreams.
338
+
339
+ Args:
340
+ source: Source of input documents given as an iterable of file paths, URLs,
341
+ or DocumentStreams.
342
+ headers: Optional headers given as a (single) dictionary of string
343
+ key-value pairs, in case of URL input source.
344
+ raises_on_error: Whether to raise an error on the first conversion failure.
345
+ max_num_pages: Maximum number of pages accepted per document.
346
+ max_file_size: Maximum file size accepted per document. Documents
347
+ exceeding this size will be skipped.
348
+ page_range: Range of pages to convert in each document.
349
+
350
+ Yields:
351
+ The conversion results, each containing a `DoclingDocument` in the
352
+ `document` attribute and metadata about the conversion process.
353
+
354
+ Raises:
355
+ ConversionError: An error occurred during conversion.
356
+ """
277
357
  limits = DocumentLimits(
278
358
  max_num_pages=max_num_pages,
279
359
  max_file_size=max_file_size,
@@ -296,14 +376,16 @@ class DocumentConverter:
296
376
  error_messages = [err.error_message for err in conv_res.errors]
297
377
  error_details = f" Errors: {'; '.join(error_messages)}"
298
378
  raise ConversionError(
299
- f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}.{error_details}"
379
+ f"Conversion failed for: {conv_res.input.file} with status: "
380
+ f"{conv_res.status}.{error_details}"
300
381
  )
301
382
  else:
302
383
  yield conv_res
303
384
 
304
385
  if not had_result and raises_on_error:
305
386
  raise ConversionError(
306
- "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
387
+ "Conversion failed because the provided file has no recognizable "
388
+ "format or it wasn't in the list of allowed formats."
307
389
  )
308
390
 
309
391
  @validate_call(config=ConfigDict(strict=True))
@@ -313,6 +395,27 @@ class DocumentConverter:
313
395
  format: InputFormat,
314
396
  name: Optional[str] = None,
315
397
  ) -> ConversionResult:
398
+ """Convert a document given as a string using the specified format.
399
+
400
+ Only Markdown (`InputFormat.MD`) and HTML (`InputFormat.HTML`) formats
401
+ are supported. The content is wrapped in a `DocumentStream` and passed
402
+ to the main conversion pipeline.
403
+
404
+ Args:
405
+ content: The document content as a string.
406
+ format: The format of the input content.
407
+ name: The filename to associate with the document. If not provided, a
408
+ timestamp-based name is generated. The appropriate file extension (`md`
409
+ or `html`) is appended if missing.
410
+
411
+ Returns:
412
+ The conversion result, which contains a `DoclingDocument` in the `document`
413
+ attribute, and metadata about the conversion process.
414
+
415
+ Raises:
416
+ ValueError: If format is neither `InputFormat.MD` nor `InputFormat.HTML`.
417
+ ConversionError: An error occurred during conversion.
418
+ """
316
419
  name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
317
420
 
318
421
  if format == InputFormat.MD:
@@ -196,7 +196,7 @@ class RapidOcrModel(BaseOcrModel):
196
196
  # Recognition model settings
197
197
  "Rec.model_path": rec_model_path,
198
198
  "Rec.font_path": self.options.rec_font_path,
199
- "Rec.keys_path": rec_keys_path,
199
+ "Rec.rec_keys_path": rec_keys_path,
200
200
  "Rec.use_cuda": use_cuda,
201
201
  "Rec.use_dml": use_dml,
202
202
  "Rec.intra_op_num_threads": intra_op_num_threads,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.64.1
3
+ Version: 2.66.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.64.1" # DO NOT EDIT, updated automatically
3
+ version = "2.66.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes