docling 2.50.0__tar.gz → 2.52.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {docling-2.50.0 → docling-2.52.0}/PKG-INFO +15 -7
  2. {docling-2.50.0 → docling-2.52.0}/README.md +12 -4
  3. {docling-2.50.0 → docling-2.52.0}/docling/backend/docling_parse_v4_backend.py +12 -0
  4. {docling-2.50.0 → docling-2.52.0}/docling/cli/main.py +29 -0
  5. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/pipeline_options.py +17 -10
  6. {docling-2.50.0 → docling-2.52.0}/docling/models/base_model.py +27 -2
  7. {docling-2.50.0 → docling-2.52.0}/docling/models/easyocr_model.py +19 -9
  8. {docling-2.50.0 → docling-2.52.0}/docling/models/picture_description_vlm_model.py +1 -1
  9. {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/hf_transformers_model.py +1 -1
  10. {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +1 -1
  11. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/asr_pipeline.py +1 -13
  12. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/base_extraction_pipeline.py +17 -3
  13. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/base_pipeline.py +75 -9
  14. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/extraction_vlm_pipeline.py +9 -16
  15. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/simple_pipeline.py +6 -6
  16. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/standard_pdf_pipeline.py +6 -55
  17. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +102 -62
  18. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/vlm_pipeline.py +3 -15
  19. {docling-2.50.0 → docling-2.52.0}/docling.egg-info/PKG-INFO +15 -7
  20. {docling-2.50.0 → docling-2.52.0}/docling.egg-info/requires.txt +2 -2
  21. {docling-2.50.0 → docling-2.52.0}/pyproject.toml +3 -3
  22. {docling-2.50.0 → docling-2.52.0}/LICENSE +0 -0
  23. {docling-2.50.0 → docling-2.52.0}/docling/__init__.py +0 -0
  24. {docling-2.50.0 → docling-2.52.0}/docling/backend/__init__.py +0 -0
  25. {docling-2.50.0 → docling-2.52.0}/docling/backend/abstract_backend.py +0 -0
  26. {docling-2.50.0 → docling-2.52.0}/docling/backend/asciidoc_backend.py +0 -0
  27. {docling-2.50.0 → docling-2.52.0}/docling/backend/csv_backend.py +0 -0
  28. {docling-2.50.0 → docling-2.52.0}/docling/backend/docling_parse_backend.py +0 -0
  29. {docling-2.50.0 → docling-2.52.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  30. {docling-2.50.0 → docling-2.52.0}/docling/backend/docx/__init__.py +0 -0
  31. {docling-2.50.0 → docling-2.52.0}/docling/backend/docx/latex/__init__.py +0 -0
  32. {docling-2.50.0 → docling-2.52.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  33. {docling-2.50.0 → docling-2.52.0}/docling/backend/docx/latex/omml.py +0 -0
  34. {docling-2.50.0 → docling-2.52.0}/docling/backend/html_backend.py +0 -0
  35. {docling-2.50.0 → docling-2.52.0}/docling/backend/json/__init__.py +0 -0
  36. {docling-2.50.0 → docling-2.52.0}/docling/backend/json/docling_json_backend.py +0 -0
  37. {docling-2.50.0 → docling-2.52.0}/docling/backend/md_backend.py +0 -0
  38. {docling-2.50.0 → docling-2.52.0}/docling/backend/mets_gbs_backend.py +0 -0
  39. {docling-2.50.0 → docling-2.52.0}/docling/backend/msexcel_backend.py +0 -0
  40. {docling-2.50.0 → docling-2.52.0}/docling/backend/mspowerpoint_backend.py +0 -0
  41. {docling-2.50.0 → docling-2.52.0}/docling/backend/msword_backend.py +0 -0
  42. {docling-2.50.0 → docling-2.52.0}/docling/backend/noop_backend.py +0 -0
  43. {docling-2.50.0 → docling-2.52.0}/docling/backend/pdf_backend.py +0 -0
  44. {docling-2.50.0 → docling-2.52.0}/docling/backend/pypdfium2_backend.py +0 -0
  45. {docling-2.50.0 → docling-2.52.0}/docling/backend/xml/__init__.py +0 -0
  46. {docling-2.50.0 → docling-2.52.0}/docling/backend/xml/jats_backend.py +0 -0
  47. {docling-2.50.0 → docling-2.52.0}/docling/backend/xml/uspto_backend.py +0 -0
  48. {docling-2.50.0 → docling-2.52.0}/docling/chunking/__init__.py +0 -0
  49. {docling-2.50.0 → docling-2.52.0}/docling/cli/__init__.py +0 -0
  50. {docling-2.50.0 → docling-2.52.0}/docling/cli/models.py +0 -0
  51. {docling-2.50.0 → docling-2.52.0}/docling/cli/tools.py +0 -0
  52. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/__init__.py +0 -0
  53. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/accelerator_options.py +0 -0
  54. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/asr_model_specs.py +0 -0
  55. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/base_models.py +0 -0
  56. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/document.py +0 -0
  57. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/extraction.py +0 -0
  58. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/layout_model_specs.py +0 -0
  59. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  60. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  61. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/settings.py +0 -0
  62. {docling-2.50.0 → docling-2.52.0}/docling/datamodel/vlm_model_specs.py +0 -0
  63. {docling-2.50.0 → docling-2.52.0}/docling/document_converter.py +0 -0
  64. {docling-2.50.0 → docling-2.52.0}/docling/document_extractor.py +0 -0
  65. {docling-2.50.0 → docling-2.52.0}/docling/exceptions.py +0 -0
  66. {docling-2.50.0 → docling-2.52.0}/docling/models/__init__.py +0 -0
  67. {docling-2.50.0 → docling-2.52.0}/docling/models/api_vlm_model.py +0 -0
  68. {docling-2.50.0 → docling-2.52.0}/docling/models/base_ocr_model.py +0 -0
  69. {docling-2.50.0 → docling-2.52.0}/docling/models/code_formula_model.py +0 -0
  70. {docling-2.50.0 → docling-2.52.0}/docling/models/document_picture_classifier.py +0 -0
  71. {docling-2.50.0 → docling-2.52.0}/docling/models/factories/__init__.py +0 -0
  72. {docling-2.50.0 → docling-2.52.0}/docling/models/factories/base_factory.py +0 -0
  73. {docling-2.50.0 → docling-2.52.0}/docling/models/factories/ocr_factory.py +0 -0
  74. {docling-2.50.0 → docling-2.52.0}/docling/models/factories/picture_description_factory.py +0 -0
  75. {docling-2.50.0 → docling-2.52.0}/docling/models/layout_model.py +0 -0
  76. {docling-2.50.0 → docling-2.52.0}/docling/models/ocr_mac_model.py +0 -0
  77. {docling-2.50.0 → docling-2.52.0}/docling/models/page_assemble_model.py +0 -0
  78. {docling-2.50.0 → docling-2.52.0}/docling/models/page_preprocessing_model.py +0 -0
  79. {docling-2.50.0 → docling-2.52.0}/docling/models/picture_description_api_model.py +0 -0
  80. {docling-2.50.0 → docling-2.52.0}/docling/models/picture_description_base_model.py +0 -0
  81. {docling-2.50.0 → docling-2.52.0}/docling/models/plugins/__init__.py +0 -0
  82. {docling-2.50.0 → docling-2.52.0}/docling/models/plugins/defaults.py +0 -0
  83. {docling-2.50.0 → docling-2.52.0}/docling/models/rapid_ocr_model.py +0 -0
  84. {docling-2.50.0 → docling-2.52.0}/docling/models/readingorder_model.py +0 -0
  85. {docling-2.50.0 → docling-2.52.0}/docling/models/table_structure_model.py +0 -0
  86. {docling-2.50.0 → docling-2.52.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  87. {docling-2.50.0 → docling-2.52.0}/docling/models/tesseract_ocr_model.py +0 -0
  88. {docling-2.50.0 → docling-2.52.0}/docling/models/utils/__init__.py +0 -0
  89. {docling-2.50.0 → docling-2.52.0}/docling/models/utils/hf_model_download.py +0 -0
  90. {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  91. {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  92. {docling-2.50.0 → docling-2.52.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  93. {docling-2.50.0 → docling-2.52.0}/docling/pipeline/__init__.py +0 -0
  94. {docling-2.50.0 → docling-2.52.0}/docling/py.typed +0 -0
  95. {docling-2.50.0 → docling-2.52.0}/docling/utils/__init__.py +0 -0
  96. {docling-2.50.0 → docling-2.52.0}/docling/utils/accelerator_utils.py +0 -0
  97. {docling-2.50.0 → docling-2.52.0}/docling/utils/api_image_request.py +0 -0
  98. {docling-2.50.0 → docling-2.52.0}/docling/utils/export.py +0 -0
  99. {docling-2.50.0 → docling-2.52.0}/docling/utils/glm_utils.py +0 -0
  100. {docling-2.50.0 → docling-2.52.0}/docling/utils/layout_postprocessor.py +0 -0
  101. {docling-2.50.0 → docling-2.52.0}/docling/utils/locks.py +0 -0
  102. {docling-2.50.0 → docling-2.52.0}/docling/utils/model_downloader.py +0 -0
  103. {docling-2.50.0 → docling-2.52.0}/docling/utils/ocr_utils.py +0 -0
  104. {docling-2.50.0 → docling-2.52.0}/docling/utils/orientation.py +0 -0
  105. {docling-2.50.0 → docling-2.52.0}/docling/utils/profiling.py +0 -0
  106. {docling-2.50.0 → docling-2.52.0}/docling/utils/utils.py +0 -0
  107. {docling-2.50.0 → docling-2.52.0}/docling/utils/visualization.py +0 -0
  108. {docling-2.50.0 → docling-2.52.0}/docling.egg-info/SOURCES.txt +0 -0
  109. {docling-2.50.0 → docling-2.52.0}/docling.egg-info/dependency_links.txt +0 -0
  110. {docling-2.50.0 → docling-2.52.0}/docling.egg-info/entry_points.txt +0 -0
  111. {docling-2.50.0 → docling-2.52.0}/docling.egg-info/top_level.txt +0 -0
  112. {docling-2.50.0 → docling-2.52.0}/setup.cfg +0 -0
  113. {docling-2.50.0 → docling-2.52.0}/tests/test_asr_pipeline.py +0 -0
  114. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_asciidoc.py +0 -0
  115. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_csv.py +0 -0
  116. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_docling_json.py +0 -0
  117. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_docling_parse.py +0 -0
  118. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_docling_parse_v2.py +0 -0
  119. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_docling_parse_v4.py +0 -0
  120. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_html.py +0 -0
  121. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_jats.py +0 -0
  122. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_markdown.py +0 -0
  123. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_mets_gbs.py +0 -0
  124. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_msexcel.py +0 -0
  125. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_msword.py +0 -0
  126. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_patent_uspto.py +0 -0
  127. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_pdfium.py +0 -0
  128. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_pptx.py +0 -0
  129. {docling-2.50.0 → docling-2.52.0}/tests/test_backend_webp.py +0 -0
  130. {docling-2.50.0 → docling-2.52.0}/tests/test_cli.py +0 -0
  131. {docling-2.50.0 → docling-2.52.0}/tests/test_code_formula.py +0 -0
  132. {docling-2.50.0 → docling-2.52.0}/tests/test_data_gen_flag.py +0 -0
  133. {docling-2.50.0 → docling-2.52.0}/tests/test_document_picture_classifier.py +0 -0
  134. {docling-2.50.0 → docling-2.52.0}/tests/test_e2e_conversion.py +0 -0
  135. {docling-2.50.0 → docling-2.52.0}/tests/test_e2e_ocr_conversion.py +0 -0
  136. {docling-2.50.0 → docling-2.52.0}/tests/test_extraction.py +0 -0
  137. {docling-2.50.0 → docling-2.52.0}/tests/test_input_doc.py +0 -0
  138. {docling-2.50.0 → docling-2.52.0}/tests/test_interfaces.py +0 -0
  139. {docling-2.50.0 → docling-2.52.0}/tests/test_invalid_input.py +0 -0
  140. {docling-2.50.0 → docling-2.52.0}/tests/test_legacy_format_transform.py +0 -0
  141. {docling-2.50.0 → docling-2.52.0}/tests/test_ocr_utils.py +0 -0
  142. {docling-2.50.0 → docling-2.52.0}/tests/test_options.py +0 -0
  143. {docling-2.50.0 → docling-2.52.0}/tests/test_settings_load.py +0 -0
  144. {docling-2.50.0 → docling-2.52.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.50.0
3
+ Version: 2.52.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,8 +26,8 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
- Requires-Dist: docling-parse<5.0.0,>=4.2.2
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.4.0
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -101,22 +101,29 @@ Docling simplifies document processing, parsing diverse formats — including ad
101
101
 
102
102
  ## Features
103
103
 
104
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
104
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
105
105
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
106
106
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
107
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
107
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
108
108
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
109
109
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
110
110
  * 🔍 Extensive OCR support for scanned PDFs and images
111
111
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
112
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
112
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
113
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
113
114
  * 💻 Simple and convenient CLI
114
115
 
116
+ ### What's new
117
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
118
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
119
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
120
+
115
121
  ### Coming soon
116
122
 
117
123
  * 📝 Metadata extraction, including title, authors, references & language
118
124
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
119
125
  * 📝 Complex chemistry understanding (Molecular structures)
126
+ * 📝 Parsing of Web Video Text Tracks (WebVTT) files
120
127
 
121
128
  ## Installation
122
129
 
@@ -142,7 +149,7 @@ result = converter.convert(source)
142
149
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
143
150
  ```
144
151
 
145
- More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
152
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
146
153
  the docs.
147
154
 
148
155
  ## CLI
@@ -222,3 +229,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
222
229
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
223
230
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
224
231
  [integrations]: https://docling-project.github.io/docling/integrations/
232
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -29,22 +29,29 @@ Docling simplifies document processing, parsing diverse formats — including ad
29
29
 
30
30
  ## Features
31
31
 
32
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
32
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
33
33
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
34
34
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
35
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
35
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
36
36
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
37
37
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
38
38
  * 🔍 Extensive OCR support for scanned PDFs and images
39
39
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
40
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
40
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
41
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
41
42
  * 💻 Simple and convenient CLI
42
43
 
44
+ ### What's new
45
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
46
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
47
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
48
+
43
49
  ### Coming soon
44
50
 
45
51
  * 📝 Metadata extraction, including title, authors, references & language
46
52
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
47
53
  * 📝 Complex chemistry understanding (Molecular structures)
54
+ * 📝 Parsing of Web Video Text Tracks (WebVTT) files
48
55
 
49
56
  ## Installation
50
57
 
@@ -70,7 +77,7 @@ result = converter.convert(source)
70
77
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
71
78
  ```
72
79
 
73
- More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
80
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
74
81
  the docs.
75
82
 
76
83
  ## CLI
@@ -150,3 +157,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
150
157
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
151
158
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
152
159
  [integrations]: https://docling-project.github.io/docling/integrations/
160
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
30
30
  page_no: int,
31
31
  create_words: bool = True,
32
32
  create_textlines: bool = True,
33
+ keep_chars: bool = False,
34
+ keep_lines: bool = False,
35
+ keep_images: bool = True,
33
36
  ):
34
37
  self._ppage = page_obj
35
38
  self._dp_doc = dp_doc
36
39
  self._page_no = page_no
40
+
37
41
  self._create_words = create_words
38
42
  self._create_textlines = create_textlines
39
43
 
44
+ self._keep_chars = keep_chars
45
+ self._keep_lines = keep_lines
46
+ self._keep_images = keep_images
47
+
40
48
  self._dpage: Optional[SegmentedPdfPage] = None
41
49
  self._unloaded = False
42
50
  self.valid = (self._ppage is not None) and (self._dp_doc is not None)
@@ -47,8 +55,12 @@ class DoclingParseV4PageBackend(PdfPageBackend):
47
55
 
48
56
  seg_page = self._dp_doc.get_page(
49
57
  self._page_no + 1,
58
+ keep_chars=self._keep_chars,
59
+ keep_lines=self._keep_lines,
60
+ keep_bitmaps=self._keep_images,
50
61
  create_words=self._create_words,
51
62
  create_textlines=self._create_textlines,
63
+ enforce_same_font=True,
52
64
  )
53
65
 
54
66
  # In Docling, all TextCell instances are expected with top-left origin.
@@ -48,6 +48,7 @@ from docling.datamodel.base_models import (
48
48
  from docling.datamodel.document import ConversionResult
49
49
  from docling.datamodel.pipeline_options import (
50
50
  AsrPipelineOptions,
51
+ ConvertPipelineOptions,
51
52
  EasyOcrOptions,
52
53
  OcrOptions,
53
54
  PaginatedPipelineOptions,
@@ -71,8 +72,13 @@ from docling.datamodel.vlm_model_specs import (
71
72
  from docling.document_converter import (
72
73
  AudioFormatOption,
73
74
  DocumentConverter,
75
+ ExcelFormatOption,
74
76
  FormatOption,
77
+ HTMLFormatOption,
78
+ MarkdownFormatOption,
75
79
  PdfFormatOption,
80
+ PowerpointFormatOption,
81
+ WordFormatOption,
76
82
  )
77
83
  from docling.models.factories import get_ocr_factory
78
84
  from docling.pipeline.asr_pipeline import AsrPipeline
@@ -626,10 +632,33 @@ def convert( # noqa: C901
626
632
  backend=MetsGbsDocumentBackend,
627
633
  )
628
634
 
635
+ # SimplePipeline options
636
+ simple_format_option = ConvertPipelineOptions(
637
+ do_picture_description=enrich_picture_description,
638
+ do_picture_classification=enrich_picture_classes,
639
+ )
640
+ if artifacts_path is not None:
641
+ simple_format_option.artifacts_path = artifacts_path
642
+
629
643
  format_options = {
630
644
  InputFormat.PDF: pdf_format_option,
631
645
  InputFormat.IMAGE: pdf_format_option,
632
646
  InputFormat.METS_GBS: mets_gbs_format_option,
647
+ InputFormat.DOCX: WordFormatOption(
648
+ pipeline_options=simple_format_option
649
+ ),
650
+ InputFormat.PPTX: PowerpointFormatOption(
651
+ pipeline_options=simple_format_option
652
+ ),
653
+ InputFormat.XLSX: ExcelFormatOption(
654
+ pipeline_options=simple_format_option
655
+ ),
656
+ InputFormat.HTML: HTMLFormatOption(
657
+ pipeline_options=simple_format_option
658
+ ),
659
+ InputFormat.MD: MarkdownFormatOption(
660
+ pipeline_options=simple_format_option
661
+ ),
633
662
  }
634
663
 
635
664
  elif pipeline == ProcessingPipeline.VLM:
@@ -135,6 +135,8 @@ class EasyOcrOptions(OcrOptions):
135
135
  recog_network: Optional[str] = "standard"
136
136
  download_enabled: bool = True
137
137
 
138
+ suppress_mps_warnings: bool = True
139
+
138
140
  model_config = ConfigDict(
139
141
  extra="forbid",
140
142
  protected_namespaces=(),
@@ -237,7 +239,9 @@ class PdfBackend(str, Enum):
237
239
 
238
240
 
239
241
  # Define an enum for the ocr engines
240
- @deprecated("Use ocr_factory.registered_enum")
242
+ @deprecated(
243
+ "Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
244
+ )
241
245
  class OcrEngine(str, Enum):
242
246
  """Enum of valid OCR engines."""
243
247
 
@@ -255,11 +259,21 @@ class PipelineOptions(BaseOptions):
255
259
  accelerator_options: AcceleratorOptions = AcceleratorOptions()
256
260
  enable_remote_services: bool = False
257
261
  allow_external_plugins: bool = False
262
+ artifacts_path: Optional[Union[Path, str]] = None
258
263
 
259
264
 
260
- class PaginatedPipelineOptions(PipelineOptions):
261
- artifacts_path: Optional[Union[Path, str]] = None
265
+ class ConvertPipelineOptions(PipelineOptions):
266
+ """Base convert pipeline options."""
267
+
268
+ do_picture_classification: bool = False # True: classify pictures in documents
262
269
 
270
+ do_picture_description: bool = False # True: run describe pictures in documents
271
+ picture_description_options: PictureDescriptionBaseOptions = (
272
+ smolvlm_picture_description
273
+ )
274
+
275
+
276
+ class PaginatedPipelineOptions(ConvertPipelineOptions):
263
277
  images_scale: float = 1.0
264
278
  generate_page_images: bool = False
265
279
  generate_picture_images: bool = False
@@ -291,13 +305,11 @@ class LayoutOptions(BaseModel):
291
305
 
292
306
  class AsrPipelineOptions(PipelineOptions):
293
307
  asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
294
- artifacts_path: Optional[Union[Path, str]] = None
295
308
 
296
309
 
297
310
  class VlmExtractionPipelineOptions(PipelineOptions):
298
311
  """Options for extraction pipeline."""
299
312
 
300
- artifacts_path: Optional[Union[Path, str]] = None
301
313
  vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
302
314
 
303
315
 
@@ -308,8 +320,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
308
320
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
309
321
  do_code_enrichment: bool = False # True: perform code OCR
310
322
  do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
311
- do_picture_classification: bool = False # True: classify pictures in documents
312
- do_picture_description: bool = False # True: run describe pictures in documents
313
323
  force_backend_text: bool = (
314
324
  False # (To be used with vlms, or other generative models)
315
325
  )
@@ -317,9 +327,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
317
327
 
318
328
  table_structure_options: TableStructureOptions = TableStructureOptions()
319
329
  ocr_options: OcrOptions = EasyOcrOptions()
320
- picture_description_options: PictureDescriptionBaseOptions = (
321
- smolvlm_picture_description
322
- )
323
330
  layout_options: LayoutOptions = LayoutOptions()
324
331
 
325
332
  images_scale: float = 1.0
@@ -4,7 +4,13 @@ from collections.abc import Iterable
4
4
  from typing import Any, Generic, Optional, Protocol, Type, Union
5
5
 
6
6
  import numpy as np
7
- from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
7
+ from docling_core.types.doc import (
8
+ BoundingBox,
9
+ DocItem,
10
+ DoclingDocument,
11
+ NodeItem,
12
+ PictureItem,
13
+ )
8
14
  from PIL.Image import Image
9
15
  from typing_extensions import TypeVar
10
16
 
@@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel(
164
170
  return None
165
171
 
166
172
  assert isinstance(element, DocItem)
167
- element_prov = element.prov[0]
168
173
 
174
+ # Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
175
+ if len(element.prov) == 0 and isinstance(element, PictureItem):
176
+ embedded_im = element.get_image(conv_res.document)
177
+ if embedded_im is not None:
178
+ return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
179
+ else:
180
+ return None
181
+
182
+ # Crop the image from the page
183
+ element_prov = element.prov[0]
169
184
  bbox = element_prov.bbox
170
185
  width = bbox.r - bbox.l
171
186
  height = bbox.t - bbox.b
@@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel(
183
198
  cropped_image = conv_res.pages[page_ix].get_image(
184
199
  scale=self.images_scale, cropbox=expanded_bbox
185
200
  )
201
+
202
+ # Allow for images being embedded without the page backend or page images
203
+ if cropped_image is None and isinstance(element, PictureItem):
204
+ embedded_im = element.get_image(conv_res.document)
205
+ if embedded_im is not None:
206
+ return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
207
+ else:
208
+ return None
209
+
210
+ # Return the proper cropped image
186
211
  return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
@@ -78,14 +78,17 @@ class EasyOcrModel(BaseOcrModel):
78
78
  download_enabled = False
79
79
  model_storage_directory = str(artifacts_path / self._model_repo_folder)
80
80
 
81
- self.reader = easyocr.Reader(
82
- lang_list=self.options.lang,
83
- gpu=use_gpu,
84
- model_storage_directory=model_storage_directory,
85
- recog_network=self.options.recog_network,
86
- download_enabled=download_enabled,
87
- verbose=False,
88
- )
81
+ with warnings.catch_warnings():
82
+ if self.options.suppress_mps_warnings:
83
+ warnings.filterwarnings("ignore", message=".*pin_memory.*MPS.*")
84
+ self.reader = easyocr.Reader(
85
+ lang_list=self.options.lang,
86
+ gpu=use_gpu,
87
+ model_storage_directory=model_storage_directory,
88
+ recog_network=self.options.recog_network,
89
+ download_enabled=download_enabled,
90
+ verbose=False,
91
+ )
89
92
 
90
93
  @staticmethod
91
94
  def download_models(
@@ -147,7 +150,14 @@ class EasyOcrModel(BaseOcrModel):
147
150
  scale=self.scale, cropbox=ocr_rect
148
151
  )
149
152
  im = numpy.array(high_res_image)
150
- result = self.reader.readtext(im)
153
+
154
+ with warnings.catch_warnings():
155
+ if self.options.suppress_mps_warnings:
156
+ warnings.filterwarnings(
157
+ "ignore", message=".*pin_memory.*MPS.*"
158
+ )
159
+
160
+ result = self.reader.readtext(im)
151
161
 
152
162
  del high_res_image
153
163
  del im
@@ -67,7 +67,7 @@ class PictureDescriptionVlmModel(
67
67
  self.model = AutoModelForImageTextToText.from_pretrained(
68
68
  artifacts_path,
69
69
  device_map=self.device,
70
- torch_dtype=torch.bfloat16,
70
+ dtype=torch.bfloat16,
71
71
  _attn_implementation=(
72
72
  "flash_attention_2"
73
73
  if self.device.startswith("cuda")
@@ -112,7 +112,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
112
112
  self.vlm_model = model_cls.from_pretrained(
113
113
  artifacts_path,
114
114
  device_map=self.device,
115
- torch_dtype=self.vlm_options.torch_dtype,
115
+ dtype=self.vlm_options.torch_dtype,
116
116
  _attn_implementation=(
117
117
  "flash_attention_2"
118
118
  if self.device.startswith("cuda")
@@ -144,7 +144,7 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
144
144
  self.vlm_model = AutoModelForImageTextToText.from_pretrained(
145
145
  artifacts_path,
146
146
  device_map=self.device,
147
- torch_dtype=self.vlm_options.torch_dtype,
147
+ dtype=self.vlm_options.torch_dtype,
148
148
  _attn_implementation=(
149
149
  "flash_attention_2"
150
150
  if self.device.startswith("cuda")
@@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline):
208
208
 
209
209
  self.pipeline_options: AsrPipelineOptions = pipeline_options
210
210
 
211
- artifacts_path: Optional[Path] = None
212
- if pipeline_options.artifacts_path is not None:
213
- artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
214
- elif settings.artifacts_path is not None:
215
- artifacts_path = Path(settings.artifacts_path).expanduser()
216
-
217
- if artifacts_path is not None and not artifacts_path.is_dir():
218
- raise RuntimeError(
219
- f"The value of {artifacts_path=} is not valid. "
220
- "When defined, it must point to a folder containing all models required by the pipeline."
221
- )
222
-
223
211
  if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
224
212
  asr_options: InlineAsrNativeWhisperOptions = (
225
213
  self.pipeline_options.asr_options
226
214
  )
227
215
  self._model = _NativeWhisperModel(
228
216
  enabled=True, # must be always enabled for this pipeline to make sense.
229
- artifacts_path=artifacts_path,
217
+ artifacts_path=self.artifacts_path,
230
218
  accelerator_options=pipeline_options.accelerator_options,
231
219
  asr_options=asr_options,
232
220
  )
@@ -1,19 +1,33 @@
1
1
  import logging
2
2
  from abc import ABC, abstractmethod
3
+ from pathlib import Path
3
4
  from typing import Optional
4
5
 
5
6
  from docling.datamodel.base_models import ConversionStatus, ErrorItem
6
7
  from docling.datamodel.document import InputDocument
7
8
  from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
8
- from docling.datamodel.pipeline_options import BaseOptions
9
+ from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
10
+ from docling.datamodel.settings import settings
9
11
 
10
12
  _log = logging.getLogger(__name__)
11
13
 
12
14
 
13
15
  class BaseExtractionPipeline(ABC):
14
- def __init__(self, pipeline_options: BaseOptions):
16
+ def __init__(self, pipeline_options: PipelineOptions):
15
17
  self.pipeline_options = pipeline_options
16
18
 
19
+ self.artifacts_path: Optional[Path] = None
20
+ if pipeline_options.artifacts_path is not None:
21
+ self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
22
+ elif settings.artifacts_path is not None:
23
+ self.artifacts_path = Path(settings.artifacts_path).expanduser()
24
+
25
+ if self.artifacts_path is not None and not self.artifacts_path.is_dir():
26
+ raise RuntimeError(
27
+ f"The value of {self.artifacts_path=} is not valid. "
28
+ "When defined, it must point to a folder containing all models required by the pipeline."
29
+ )
30
+
17
31
  def execute(
18
32
  self,
19
33
  in_doc: InputDocument,
@@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC):
54
68
 
55
69
  @classmethod
56
70
  @abstractmethod
57
- def get_default_options(cls) -> BaseOptions:
71
+ def get_default_options(cls) -> PipelineOptions:
58
72
  pass
@@ -4,7 +4,8 @@ import time
4
4
  import traceback
5
5
  from abc import ABC, abstractmethod
6
6
  from collections.abc import Iterable
7
- from typing import Any, Callable, List
7
+ from pathlib import Path
8
+ from typing import Any, Callable, List, Optional
8
9
 
9
10
  from docling_core.types.doc import NodeItem
10
11
 
@@ -20,9 +21,19 @@ from docling.datamodel.base_models import (
20
21
  Page,
21
22
  )
22
23
  from docling.datamodel.document import ConversionResult, InputDocument
23
- from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
24
+ from docling.datamodel.pipeline_options import (
25
+ ConvertPipelineOptions,
26
+ PdfPipelineOptions,
27
+ PipelineOptions,
28
+ )
24
29
  from docling.datamodel.settings import settings
25
30
  from docling.models.base_model import GenericEnrichmentModel
31
+ from docling.models.document_picture_classifier import (
32
+ DocumentPictureClassifier,
33
+ DocumentPictureClassifierOptions,
34
+ )
35
+ from docling.models.factories import get_picture_description_factory
36
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
26
37
  from docling.utils.profiling import ProfilingScope, TimeRecorder
27
38
  from docling.utils.utils import chunkify
28
39
 
@@ -36,6 +47,18 @@ class BasePipeline(ABC):
36
47
  self.build_pipe: List[Callable] = []
37
48
  self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
38
49
 
50
+ self.artifacts_path: Optional[Path] = None
51
+ if pipeline_options.artifacts_path is not None:
52
+ self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
53
+ elif settings.artifacts_path is not None:
54
+ self.artifacts_path = Path(settings.artifacts_path).expanduser()
55
+
56
+ if self.artifacts_path is not None and not self.artifacts_path.is_dir():
57
+ raise RuntimeError(
58
+ f"The value of {self.artifacts_path=} is not valid. "
59
+ "When defined, it must point to a folder containing all models required by the pipeline."
60
+ )
61
+
39
62
  def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
40
63
  conv_res = ConversionResult(input=in_doc)
41
64
 
@@ -108,15 +131,58 @@ class BasePipeline(ABC):
108
131
  def is_backend_supported(cls, backend: AbstractDocumentBackend):
109
132
  pass
110
133
 
111
- # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
112
- # for model in self.build_pipe:
113
- # element_batch = model(element_batch)
114
- #
115
- # yield from element_batch
116
134
 
135
+ class ConvertPipeline(BasePipeline):
136
+ def __init__(self, pipeline_options: ConvertPipelineOptions):
137
+ super().__init__(pipeline_options)
138
+ self.pipeline_options: ConvertPipelineOptions
117
139
 
118
- class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
119
- def __init__(self, pipeline_options: PipelineOptions):
140
+ # ------ Common enrichment models working on all backends
141
+
142
+ # Picture description model
143
+ if (
144
+ picture_description_model := self._get_picture_description_model(
145
+ artifacts_path=self.artifacts_path
146
+ )
147
+ ) is None:
148
+ raise RuntimeError(
149
+ f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
150
+ )
151
+
152
+ self.enrichment_pipe = [
153
+ # Document Picture Classifier
154
+ DocumentPictureClassifier(
155
+ enabled=pipeline_options.do_picture_classification,
156
+ artifacts_path=self.artifacts_path,
157
+ options=DocumentPictureClassifierOptions(),
158
+ accelerator_options=pipeline_options.accelerator_options,
159
+ ),
160
+ # Document Picture description
161
+ picture_description_model,
162
+ ]
163
+
164
+ def _get_picture_description_model(
165
+ self, artifacts_path: Optional[Path] = None
166
+ ) -> Optional[PictureDescriptionBaseModel]:
167
+ factory = get_picture_description_factory(
168
+ allow_external_plugins=self.pipeline_options.allow_external_plugins
169
+ )
170
+ return factory.create_instance(
171
+ options=self.pipeline_options.picture_description_options,
172
+ enabled=self.pipeline_options.do_picture_description,
173
+ enable_remote_services=self.pipeline_options.enable_remote_services,
174
+ artifacts_path=artifacts_path,
175
+ accelerator_options=self.pipeline_options.accelerator_options,
176
+ )
177
+
178
+ @classmethod
179
+ @abstractmethod
180
+ def get_default_options(cls) -> ConvertPipelineOptions:
181
+ pass
182
+
183
+
184
+ class PaginatedPipeline(ConvertPipeline): # TODO this is a bad name.
185
+ def __init__(self, pipeline_options: ConvertPipelineOptions):
120
186
  super().__init__(pipeline_options)
121
187
  self.keep_backend = False
122
188
 
@@ -1,7 +1,6 @@
1
1
  import inspect
2
2
  import json
3
3
  import logging
4
- from pathlib import Path
5
4
  from typing import Optional
6
5
 
7
6
  from PIL.Image import Image
@@ -16,7 +15,10 @@ from docling.datamodel.extraction import (
16
15
  ExtractionResult,
17
16
  ExtractionTemplateType,
18
17
  )
19
- from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions
18
+ from docling.datamodel.pipeline_options import (
19
+ PipelineOptions,
20
+ VlmExtractionPipelineOptions,
21
+ )
20
22
  from docling.datamodel.settings import settings
21
23
  from docling.models.vlm_models_inline.nuextract_transformers_model import (
22
24
  NuExtractTransformersModel,
@@ -35,22 +37,10 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
35
37
  self.accelerator_options = pipeline_options.accelerator_options
36
38
  self.pipeline_options: VlmExtractionPipelineOptions
37
39
 
38
- artifacts_path: Optional[Path] = None
39
- if pipeline_options.artifacts_path is not None:
40
- artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
41
- elif settings.artifacts_path is not None:
42
- artifacts_path = Path(settings.artifacts_path).expanduser()
43
-
44
- if artifacts_path is not None and not artifacts_path.is_dir():
45
- raise RuntimeError(
46
- f"The value of {artifacts_path=} is not valid. "
47
- "When defined, it must point to a folder containing all models required by the pipeline."
48
- )
49
-
50
40
  # Create VLM model instance
51
41
  self.vlm_model = NuExtractTransformersModel(
52
42
  enabled=True,
53
- artifacts_path=artifacts_path, # Will download automatically
43
+ artifacts_path=self.artifacts_path, # Will download automatically
54
44
  accelerator_options=self.accelerator_options,
55
45
  vlm_options=pipeline_options.vlm_options,
56
46
  )
@@ -194,11 +184,14 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
194
184
  class ExtractionTemplateFactory(ModelFactory[template]): # type: ignore
195
185
  __use_examples__ = True # prefer Field(examples=...) when present
196
186
  __use_defaults__ = True # use field defaults instead of random values
187
+ __check_model__ = (
188
+ True # setting the value to avoid deprecation warnings
189
+ )
197
190
 
198
191
  return ExtractionTemplateFactory.build().model_dump_json(indent=2) # type: ignore
199
192
  else:
200
193
  raise ValueError(f"Unsupported template type: {type(template)}")
201
194
 
202
195
  @classmethod
203
- def get_default_options(cls) -> BaseOptions:
196
+ def get_default_options(cls) -> PipelineOptions:
204
197
  return VlmExtractionPipelineOptions()