docling 2.49.0__tar.gz → 2.51.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {docling-2.49.0 → docling-2.51.0}/PKG-INFO +10 -6
  2. {docling-2.49.0 → docling-2.51.0}/README.md +7 -3
  3. {docling-2.49.0 → docling-2.51.0}/docling/backend/docling_parse_v4_backend.py +12 -0
  4. {docling-2.49.0 → docling-2.51.0}/docling/backend/html_backend.py +3 -2
  5. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/pipeline_options.py +4 -2
  6. {docling-2.49.0 → docling-2.51.0}/docling/models/layout_model.py +3 -3
  7. {docling-2.49.0 → docling-2.51.0}/docling/models/page_preprocessing_model.py +1 -1
  8. {docling-2.49.0 → docling-2.51.0}/docling/models/table_structure_model.py +1 -1
  9. {docling-2.49.0 → docling-2.51.0}/docling/utils/model_downloader.py +2 -1
  10. {docling-2.49.0 → docling-2.51.0}/docling.egg-info/PKG-INFO +10 -6
  11. {docling-2.49.0 → docling-2.51.0}/docling.egg-info/requires.txt +2 -2
  12. {docling-2.49.0 → docling-2.51.0}/pyproject.toml +3 -3
  13. {docling-2.49.0 → docling-2.51.0}/tests/test_e2e_conversion.py +9 -1
  14. {docling-2.49.0 → docling-2.51.0}/LICENSE +0 -0
  15. {docling-2.49.0 → docling-2.51.0}/docling/__init__.py +0 -0
  16. {docling-2.49.0 → docling-2.51.0}/docling/backend/__init__.py +0 -0
  17. {docling-2.49.0 → docling-2.51.0}/docling/backend/abstract_backend.py +0 -0
  18. {docling-2.49.0 → docling-2.51.0}/docling/backend/asciidoc_backend.py +0 -0
  19. {docling-2.49.0 → docling-2.51.0}/docling/backend/csv_backend.py +0 -0
  20. {docling-2.49.0 → docling-2.51.0}/docling/backend/docling_parse_backend.py +0 -0
  21. {docling-2.49.0 → docling-2.51.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  22. {docling-2.49.0 → docling-2.51.0}/docling/backend/docx/__init__.py +0 -0
  23. {docling-2.49.0 → docling-2.51.0}/docling/backend/docx/latex/__init__.py +0 -0
  24. {docling-2.49.0 → docling-2.51.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  25. {docling-2.49.0 → docling-2.51.0}/docling/backend/docx/latex/omml.py +0 -0
  26. {docling-2.49.0 → docling-2.51.0}/docling/backend/json/__init__.py +0 -0
  27. {docling-2.49.0 → docling-2.51.0}/docling/backend/json/docling_json_backend.py +0 -0
  28. {docling-2.49.0 → docling-2.51.0}/docling/backend/md_backend.py +0 -0
  29. {docling-2.49.0 → docling-2.51.0}/docling/backend/mets_gbs_backend.py +0 -0
  30. {docling-2.49.0 → docling-2.51.0}/docling/backend/msexcel_backend.py +0 -0
  31. {docling-2.49.0 → docling-2.51.0}/docling/backend/mspowerpoint_backend.py +0 -0
  32. {docling-2.49.0 → docling-2.51.0}/docling/backend/msword_backend.py +0 -0
  33. {docling-2.49.0 → docling-2.51.0}/docling/backend/noop_backend.py +0 -0
  34. {docling-2.49.0 → docling-2.51.0}/docling/backend/pdf_backend.py +0 -0
  35. {docling-2.49.0 → docling-2.51.0}/docling/backend/pypdfium2_backend.py +0 -0
  36. {docling-2.49.0 → docling-2.51.0}/docling/backend/xml/__init__.py +0 -0
  37. {docling-2.49.0 → docling-2.51.0}/docling/backend/xml/jats_backend.py +0 -0
  38. {docling-2.49.0 → docling-2.51.0}/docling/backend/xml/uspto_backend.py +0 -0
  39. {docling-2.49.0 → docling-2.51.0}/docling/chunking/__init__.py +0 -0
  40. {docling-2.49.0 → docling-2.51.0}/docling/cli/__init__.py +0 -0
  41. {docling-2.49.0 → docling-2.51.0}/docling/cli/main.py +0 -0
  42. {docling-2.49.0 → docling-2.51.0}/docling/cli/models.py +0 -0
  43. {docling-2.49.0 → docling-2.51.0}/docling/cli/tools.py +0 -0
  44. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/__init__.py +0 -0
  45. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/accelerator_options.py +0 -0
  46. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/asr_model_specs.py +0 -0
  47. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/base_models.py +0 -0
  48. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/document.py +0 -0
  49. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/extraction.py +0 -0
  50. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/layout_model_specs.py +0 -0
  51. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  52. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  53. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/settings.py +0 -0
  54. {docling-2.49.0 → docling-2.51.0}/docling/datamodel/vlm_model_specs.py +0 -0
  55. {docling-2.49.0 → docling-2.51.0}/docling/document_converter.py +0 -0
  56. {docling-2.49.0 → docling-2.51.0}/docling/document_extractor.py +0 -0
  57. {docling-2.49.0 → docling-2.51.0}/docling/exceptions.py +0 -0
  58. {docling-2.49.0 → docling-2.51.0}/docling/models/__init__.py +0 -0
  59. {docling-2.49.0 → docling-2.51.0}/docling/models/api_vlm_model.py +0 -0
  60. {docling-2.49.0 → docling-2.51.0}/docling/models/base_model.py +0 -0
  61. {docling-2.49.0 → docling-2.51.0}/docling/models/base_ocr_model.py +0 -0
  62. {docling-2.49.0 → docling-2.51.0}/docling/models/code_formula_model.py +0 -0
  63. {docling-2.49.0 → docling-2.51.0}/docling/models/document_picture_classifier.py +0 -0
  64. {docling-2.49.0 → docling-2.51.0}/docling/models/easyocr_model.py +0 -0
  65. {docling-2.49.0 → docling-2.51.0}/docling/models/factories/__init__.py +0 -0
  66. {docling-2.49.0 → docling-2.51.0}/docling/models/factories/base_factory.py +0 -0
  67. {docling-2.49.0 → docling-2.51.0}/docling/models/factories/ocr_factory.py +0 -0
  68. {docling-2.49.0 → docling-2.51.0}/docling/models/factories/picture_description_factory.py +0 -0
  69. {docling-2.49.0 → docling-2.51.0}/docling/models/ocr_mac_model.py +0 -0
  70. {docling-2.49.0 → docling-2.51.0}/docling/models/page_assemble_model.py +0 -0
  71. {docling-2.49.0 → docling-2.51.0}/docling/models/picture_description_api_model.py +0 -0
  72. {docling-2.49.0 → docling-2.51.0}/docling/models/picture_description_base_model.py +0 -0
  73. {docling-2.49.0 → docling-2.51.0}/docling/models/picture_description_vlm_model.py +0 -0
  74. {docling-2.49.0 → docling-2.51.0}/docling/models/plugins/__init__.py +0 -0
  75. {docling-2.49.0 → docling-2.51.0}/docling/models/plugins/defaults.py +0 -0
  76. {docling-2.49.0 → docling-2.51.0}/docling/models/rapid_ocr_model.py +0 -0
  77. {docling-2.49.0 → docling-2.51.0}/docling/models/readingorder_model.py +0 -0
  78. {docling-2.49.0 → docling-2.51.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  79. {docling-2.49.0 → docling-2.51.0}/docling/models/tesseract_ocr_model.py +0 -0
  80. {docling-2.49.0 → docling-2.51.0}/docling/models/utils/__init__.py +0 -0
  81. {docling-2.49.0 → docling-2.51.0}/docling/models/utils/hf_model_download.py +0 -0
  82. {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  83. {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  84. {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  85. {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
  86. {docling-2.49.0 → docling-2.51.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  87. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/__init__.py +0 -0
  88. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/asr_pipeline.py +0 -0
  89. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/base_extraction_pipeline.py +0 -0
  90. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/base_pipeline.py +0 -0
  91. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
  92. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/simple_pipeline.py +0 -0
  93. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  94. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  95. {docling-2.49.0 → docling-2.51.0}/docling/pipeline/vlm_pipeline.py +0 -0
  96. {docling-2.49.0 → docling-2.51.0}/docling/py.typed +0 -0
  97. {docling-2.49.0 → docling-2.51.0}/docling/utils/__init__.py +0 -0
  98. {docling-2.49.0 → docling-2.51.0}/docling/utils/accelerator_utils.py +0 -0
  99. {docling-2.49.0 → docling-2.51.0}/docling/utils/api_image_request.py +0 -0
  100. {docling-2.49.0 → docling-2.51.0}/docling/utils/export.py +0 -0
  101. {docling-2.49.0 → docling-2.51.0}/docling/utils/glm_utils.py +0 -0
  102. {docling-2.49.0 → docling-2.51.0}/docling/utils/layout_postprocessor.py +0 -0
  103. {docling-2.49.0 → docling-2.51.0}/docling/utils/locks.py +0 -0
  104. {docling-2.49.0 → docling-2.51.0}/docling/utils/ocr_utils.py +0 -0
  105. {docling-2.49.0 → docling-2.51.0}/docling/utils/orientation.py +0 -0
  106. {docling-2.49.0 → docling-2.51.0}/docling/utils/profiling.py +0 -0
  107. {docling-2.49.0 → docling-2.51.0}/docling/utils/utils.py +0 -0
  108. {docling-2.49.0 → docling-2.51.0}/docling/utils/visualization.py +0 -0
  109. {docling-2.49.0 → docling-2.51.0}/docling.egg-info/SOURCES.txt +0 -0
  110. {docling-2.49.0 → docling-2.51.0}/docling.egg-info/dependency_links.txt +0 -0
  111. {docling-2.49.0 → docling-2.51.0}/docling.egg-info/entry_points.txt +0 -0
  112. {docling-2.49.0 → docling-2.51.0}/docling.egg-info/top_level.txt +0 -0
  113. {docling-2.49.0 → docling-2.51.0}/setup.cfg +0 -0
  114. {docling-2.49.0 → docling-2.51.0}/tests/test_asr_pipeline.py +0 -0
  115. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_asciidoc.py +0 -0
  116. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_csv.py +0 -0
  117. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_docling_json.py +0 -0
  118. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_docling_parse.py +0 -0
  119. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_docling_parse_v2.py +0 -0
  120. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_docling_parse_v4.py +0 -0
  121. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_html.py +0 -0
  122. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_jats.py +0 -0
  123. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_markdown.py +0 -0
  124. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_mets_gbs.py +0 -0
  125. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_msexcel.py +0 -0
  126. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_msword.py +0 -0
  127. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_patent_uspto.py +0 -0
  128. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_pdfium.py +0 -0
  129. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_pptx.py +0 -0
  130. {docling-2.49.0 → docling-2.51.0}/tests/test_backend_webp.py +0 -0
  131. {docling-2.49.0 → docling-2.51.0}/tests/test_cli.py +0 -0
  132. {docling-2.49.0 → docling-2.51.0}/tests/test_code_formula.py +0 -0
  133. {docling-2.49.0 → docling-2.51.0}/tests/test_data_gen_flag.py +0 -0
  134. {docling-2.49.0 → docling-2.51.0}/tests/test_document_picture_classifier.py +0 -0
  135. {docling-2.49.0 → docling-2.51.0}/tests/test_e2e_ocr_conversion.py +0 -0
  136. {docling-2.49.0 → docling-2.51.0}/tests/test_extraction.py +0 -0
  137. {docling-2.49.0 → docling-2.51.0}/tests/test_input_doc.py +0 -0
  138. {docling-2.49.0 → docling-2.51.0}/tests/test_interfaces.py +0 -0
  139. {docling-2.49.0 → docling-2.51.0}/tests/test_invalid_input.py +0 -0
  140. {docling-2.49.0 → docling-2.51.0}/tests/test_legacy_format_transform.py +0 -0
  141. {docling-2.49.0 → docling-2.51.0}/tests/test_ocr_utils.py +0 -0
  142. {docling-2.49.0 → docling-2.51.0}/tests/test_options.py +0 -0
  143. {docling-2.49.0 → docling-2.51.0}/tests/test_settings_load.py +0 -0
  144. {docling-2.49.0 → docling-2.51.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.49.0
3
+ Version: 2.51.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,8 +27,8 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
- Requires-Dist: docling-parse<5.0.0,>=4.2.2
31
- Requires-Dist: docling-ibm-models<4,>=3.9.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.4.0
31
+ Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -101,17 +101,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
101
101
 
102
102
  ## Features
103
103
 
104
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
104
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
105
105
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
106
106
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
107
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
107
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
108
108
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
109
109
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
110
110
  * 🔍 Extensive OCR support for scanned PDFs and images
111
111
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
112
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
112
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
113
113
  * 💻 Simple and convenient CLI
114
114
 
115
+ ### What's new
116
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
117
+
115
118
  ### Coming soon
116
119
 
117
120
  * 📝 Metadata extraction, including title, authors, references & language
@@ -222,3 +225,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
222
225
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
223
226
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
224
227
  [integrations]: https://docling-project.github.io/docling/integrations/
228
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -29,17 +29,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
29
29
 
30
30
  ## Features
31
31
 
32
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
32
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
33
33
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
34
34
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
35
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
35
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
36
36
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
37
37
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
38
38
  * 🔍 Extensive OCR support for scanned PDFs and images
39
39
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
40
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
40
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
41
41
  * 💻 Simple and convenient CLI
42
42
 
43
+ ### What's new
44
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
45
+
43
46
  ### Coming soon
44
47
 
45
48
  * 📝 Metadata extraction, including title, authors, references & language
@@ -150,3 +153,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
150
153
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
151
154
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
152
155
  [integrations]: https://docling-project.github.io/docling/integrations/
156
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
30
30
  page_no: int,
31
31
  create_words: bool = True,
32
32
  create_textlines: bool = True,
33
+ keep_chars: bool = False,
34
+ keep_lines: bool = False,
35
+ keep_images: bool = True,
33
36
  ):
34
37
  self._ppage = page_obj
35
38
  self._dp_doc = dp_doc
36
39
  self._page_no = page_no
40
+
37
41
  self._create_words = create_words
38
42
  self._create_textlines = create_textlines
39
43
 
44
+ self._keep_chars = keep_chars
45
+ self._keep_lines = keep_lines
46
+ self._keep_images = keep_images
47
+
40
48
  self._dpage: Optional[SegmentedPdfPage] = None
41
49
  self._unloaded = False
42
50
  self.valid = (self._ppage is not None) and (self._dp_doc is not None)
@@ -47,8 +55,12 @@ class DoclingParseV4PageBackend(PdfPageBackend):
47
55
 
48
56
  seg_page = self._dp_doc.get_page(
49
57
  self._page_no + 1,
58
+ keep_chars=self._keep_chars,
59
+ keep_lines=self._keep_lines,
60
+ keep_bitmaps=self._keep_images,
50
61
  create_words=self._create_words,
51
62
  create_textlines=self._create_textlines,
63
+ enforce_same_font=True,
52
64
  )
53
65
 
54
66
  # In Docling, all TextCell instances are expected with top-left origin.
@@ -467,13 +467,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
467
467
 
468
468
  @contextmanager
469
469
  def _use_hyperlink(self, tag: Tag):
470
+ old_hyperlink: Union[AnyUrl, Path, None] = None
471
+ new_hyperlink: Union[AnyUrl, Path, None] = None
470
472
  this_href = tag.get("href")
471
473
  if this_href is None:
472
474
  yield None
473
475
  else:
474
476
  if isinstance(this_href, str) and this_href:
475
- old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
476
- new_hyperlink: Union[AnyUrl, Path, None] = None
477
+ old_hyperlink = self.hyperlink
477
478
  if self.original_url is not None:
478
479
  this_href = urljoin(str(self.original_url), str(this_href))
479
480
  # ugly fix for relative links since pydantic does not support them.
@@ -237,7 +237,9 @@ class PdfBackend(str, Enum):
237
237
 
238
238
 
239
239
  # Define an enum for the ocr engines
240
- @deprecated("Use ocr_factory.registered_enum")
240
+ @deprecated(
241
+ "Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
242
+ )
241
243
  class OcrEngine(str, Enum):
242
244
  """Enum of valid OCR engines."""
243
245
 
@@ -283,10 +285,10 @@ class LayoutOptions(BaseModel):
283
285
  keep_empty_clusters: bool = (
284
286
  False # Whether to keep clusters that contain no text cells
285
287
  )
288
+ model_spec: LayoutModelConfig = DOCLING_LAYOUT_HERON
286
289
  skip_cell_assignment: bool = (
287
290
  False # Skip cell-to-cluster assignment for VLM-only processing
288
291
  )
289
- model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
290
292
 
291
293
 
292
294
  class AsrPipelineOptions(PipelineOptions):
@@ -91,7 +91,7 @@ class LayoutModel(BasePageModel):
91
91
  local_dir: Optional[Path] = None,
92
92
  force: bool = False,
93
93
  progress: bool = False,
94
- layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
94
+ layout_model_config: LayoutModelConfig = LayoutOptions().model_spec, # use default
95
95
  ) -> Path:
96
96
  return download_hf_model(
97
97
  repo_id=layout_model_config.repo_id,
@@ -122,8 +122,8 @@ class LayoutModel(BasePageModel):
122
122
  left_clusters = [c for c in clusters if c.label not in exclude_labels]
123
123
  right_clusters = [c for c in clusters if c.label in exclude_labels]
124
124
  # Create a deep copy of the original image for both sides
125
- left_image = copy.deepcopy(page.image)
126
- right_image = copy.deepcopy(page.image)
125
+ left_image = page.image.copy()
126
+ right_image = page.image.copy()
127
127
 
128
128
  # Draw clusters on both images
129
129
  draw_clusters(left_image, left_clusters, scale_x, scale_y)
@@ -90,7 +90,7 @@ class PagePreprocessingModel(BasePageModel):
90
90
 
91
91
  # DEBUG code:
92
92
  def draw_text_boxes(image, cells, show: bool = False):
93
- draw = ImageDraw.Draw(image)
93
+ draw = ImageDraw.Draw(image.copy())
94
94
  for c in cells:
95
95
  x0, y0, x1, y1 = (
96
96
  c.to_bounding_box().l,
@@ -94,7 +94,7 @@ class TableStructureModel(BasePageModel):
94
94
  ) -> Path:
95
95
  return download_hf_model(
96
96
  repo_id="ds4sd/docling-models",
97
- revision="v2.2.0",
97
+ revision="v2.3.0",
98
98
  local_dir=local_dir,
99
99
  force=force,
100
100
  progress=progress,
@@ -4,6 +4,7 @@ from typing import Optional
4
4
 
5
5
  from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2
6
6
  from docling.datamodel.pipeline_options import (
7
+ LayoutOptions,
7
8
  granite_picture_description,
8
9
  smolvlm_picture_description,
9
10
  )
@@ -47,7 +48,7 @@ def download_models(
47
48
  if with_layout:
48
49
  _log.info("Downloading layout model...")
49
50
  LayoutModel.download_models(
50
- local_dir=output_dir / DOCLING_LAYOUT_V2.model_repo_folder,
51
+ local_dir=output_dir / LayoutOptions().model_spec.model_repo_folder,
51
52
  force=force,
52
53
  progress=progress,
53
54
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.49.0
3
+ Version: 2.51.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,8 +27,8 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
- Requires-Dist: docling-parse<5.0.0,>=4.2.2
31
- Requires-Dist: docling-ibm-models<4,>=3.9.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.4.0
31
+ Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -101,17 +101,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
101
101
 
102
102
  ## Features
103
103
 
104
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
104
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
105
105
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
106
106
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
107
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
107
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
108
108
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
109
109
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
110
110
  * 🔍 Extensive OCR support for scanned PDFs and images
111
111
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
112
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
112
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
113
113
  * 💻 Simple and convenient CLI
114
114
 
115
+ ### What's new
116
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
117
+
115
118
  ### Coming soon
116
119
 
117
120
  * 📝 Metadata extraction, including title, authors, references & language
@@ -222,3 +225,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
222
225
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
223
226
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
224
227
  [integrations]: https://docling-project.github.io/docling/integrations/
228
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -1,7 +1,7 @@
1
1
  pydantic<3.0.0,>=2.0.0
2
2
  docling-core[chunking]<3.0.0,>=2.42.0
3
- docling-parse<5.0.0,>=4.2.2
4
- docling-ibm-models<4,>=3.9.0
3
+ docling-parse<5.0.0,>=4.4.0
4
+ docling-ibm-models<4,>=3.9.1
5
5
  filetype<2.0.0,>=1.2.0
6
6
  pypdfium2!=4.30.1,<5.0.0,>=4.30.0
7
7
  pydantic-settings<3.0.0,>=2.3.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.49.0" # DO NOT EDIT, updated automatically
3
+ version = "2.51.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -45,8 +45,8 @@ requires-python = '>=3.9,<4.0'
45
45
  dependencies = [
46
46
  'pydantic (>=2.0.0,<3.0.0)',
47
47
  'docling-core[chunking] (>=2.42.0,<3.0.0)',
48
- 'docling-parse (>=4.2.2,<5.0.0)',
49
- "docling-ibm-models>=3.9.0,<4",
48
+ 'docling-parse (>=4.4.0,<5.0.0)',
49
+ "docling-ibm-models>=3.9.1,<4",
50
50
  'filetype (>=1.2.0,<2.0.0)',
51
51
  'pypdfium2 (>=4.30.0,!=4.30.1,<5.0.0)',
52
52
  'pydantic-settings (>=2.3.0,<3.0.0)',
@@ -11,6 +11,8 @@ from .verify_utils import verify_conversion_result_v2
11
11
 
12
12
  GENERATE_V2 = GEN_TEST_DATA
13
13
 
14
+ SKIP_DOCTAGS_COMPARISON = ["2203.01017v2.pdf"]
15
+
14
16
 
15
17
  def get_pdf_paths():
16
18
  # Define the directory you want to search
@@ -50,6 +52,12 @@ def test_e2e_pdfs_conversions():
50
52
 
51
53
  doc_result: ConversionResult = converter.convert(pdf_path)
52
54
 
55
+ # Decide whether to skip the doctags comparison
56
+ verify_doctags = pdf_path.name not in SKIP_DOCTAGS_COMPARISON
57
+
53
58
  verify_conversion_result_v2(
54
- input_path=pdf_path, doc_result=doc_result, generate=GENERATE_V2
59
+ input_path=pdf_path,
60
+ doc_result=doc_result,
61
+ generate=GENERATE_V2,
62
+ verify_doctags=verify_doctags,
55
63
  )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes