docling 2.51.0__tar.gz → 2.53.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (144) hide show
  1. {docling-2.51.0 → docling-2.53.0}/PKG-INFO +10 -6
  2. {docling-2.51.0 → docling-2.53.0}/README.md +8 -4
  3. {docling-2.51.0 → docling-2.53.0}/docling/cli/main.py +44 -1
  4. {docling-2.51.0 → docling-2.53.0}/docling/cli/models.py +4 -0
  5. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/pipeline_options.py +21 -12
  6. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/vlm_model_specs.py +30 -0
  7. {docling-2.51.0 → docling-2.53.0}/docling/models/base_model.py +27 -2
  8. {docling-2.51.0 → docling-2.53.0}/docling/models/easyocr_model.py +19 -9
  9. {docling-2.51.0 → docling-2.53.0}/docling/models/picture_description_vlm_model.py +1 -1
  10. {docling-2.51.0 → docling-2.53.0}/docling/models/rapid_ocr_model.py +40 -25
  11. {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/hf_transformers_model.py +1 -1
  12. {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +1 -1
  13. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/asr_pipeline.py +1 -13
  14. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/base_extraction_pipeline.py +17 -3
  15. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/base_pipeline.py +75 -9
  16. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/extraction_vlm_pipeline.py +9 -16
  17. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/simple_pipeline.py +6 -6
  18. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/standard_pdf_pipeline.py +6 -55
  19. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +102 -62
  20. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/vlm_pipeline.py +3 -15
  21. {docling-2.51.0 → docling-2.53.0}/docling/utils/model_downloader.py +22 -0
  22. {docling-2.51.0 → docling-2.53.0}/docling.egg-info/PKG-INFO +10 -6
  23. {docling-2.51.0 → docling-2.53.0}/docling.egg-info/requires.txt +1 -1
  24. {docling-2.51.0 → docling-2.53.0}/pyproject.toml +2 -2
  25. {docling-2.51.0 → docling-2.53.0}/tests/test_e2e_ocr_conversion.py +10 -0
  26. {docling-2.51.0 → docling-2.53.0}/LICENSE +0 -0
  27. {docling-2.51.0 → docling-2.53.0}/docling/__init__.py +0 -0
  28. {docling-2.51.0 → docling-2.53.0}/docling/backend/__init__.py +0 -0
  29. {docling-2.51.0 → docling-2.53.0}/docling/backend/abstract_backend.py +0 -0
  30. {docling-2.51.0 → docling-2.53.0}/docling/backend/asciidoc_backend.py +0 -0
  31. {docling-2.51.0 → docling-2.53.0}/docling/backend/csv_backend.py +0 -0
  32. {docling-2.51.0 → docling-2.53.0}/docling/backend/docling_parse_backend.py +0 -0
  33. {docling-2.51.0 → docling-2.53.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  34. {docling-2.51.0 → docling-2.53.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  35. {docling-2.51.0 → docling-2.53.0}/docling/backend/docx/__init__.py +0 -0
  36. {docling-2.51.0 → docling-2.53.0}/docling/backend/docx/latex/__init__.py +0 -0
  37. {docling-2.51.0 → docling-2.53.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  38. {docling-2.51.0 → docling-2.53.0}/docling/backend/docx/latex/omml.py +0 -0
  39. {docling-2.51.0 → docling-2.53.0}/docling/backend/html_backend.py +0 -0
  40. {docling-2.51.0 → docling-2.53.0}/docling/backend/json/__init__.py +0 -0
  41. {docling-2.51.0 → docling-2.53.0}/docling/backend/json/docling_json_backend.py +0 -0
  42. {docling-2.51.0 → docling-2.53.0}/docling/backend/md_backend.py +0 -0
  43. {docling-2.51.0 → docling-2.53.0}/docling/backend/mets_gbs_backend.py +0 -0
  44. {docling-2.51.0 → docling-2.53.0}/docling/backend/msexcel_backend.py +0 -0
  45. {docling-2.51.0 → docling-2.53.0}/docling/backend/mspowerpoint_backend.py +0 -0
  46. {docling-2.51.0 → docling-2.53.0}/docling/backend/msword_backend.py +0 -0
  47. {docling-2.51.0 → docling-2.53.0}/docling/backend/noop_backend.py +0 -0
  48. {docling-2.51.0 → docling-2.53.0}/docling/backend/pdf_backend.py +0 -0
  49. {docling-2.51.0 → docling-2.53.0}/docling/backend/pypdfium2_backend.py +0 -0
  50. {docling-2.51.0 → docling-2.53.0}/docling/backend/xml/__init__.py +0 -0
  51. {docling-2.51.0 → docling-2.53.0}/docling/backend/xml/jats_backend.py +0 -0
  52. {docling-2.51.0 → docling-2.53.0}/docling/backend/xml/uspto_backend.py +0 -0
  53. {docling-2.51.0 → docling-2.53.0}/docling/chunking/__init__.py +0 -0
  54. {docling-2.51.0 → docling-2.53.0}/docling/cli/__init__.py +0 -0
  55. {docling-2.51.0 → docling-2.53.0}/docling/cli/tools.py +0 -0
  56. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/__init__.py +0 -0
  57. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/accelerator_options.py +0 -0
  58. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/asr_model_specs.py +0 -0
  59. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/base_models.py +0 -0
  60. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/document.py +0 -0
  61. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/extraction.py +0 -0
  62. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/layout_model_specs.py +0 -0
  63. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  64. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  65. {docling-2.51.0 → docling-2.53.0}/docling/datamodel/settings.py +0 -0
  66. {docling-2.51.0 → docling-2.53.0}/docling/document_converter.py +0 -0
  67. {docling-2.51.0 → docling-2.53.0}/docling/document_extractor.py +0 -0
  68. {docling-2.51.0 → docling-2.53.0}/docling/exceptions.py +0 -0
  69. {docling-2.51.0 → docling-2.53.0}/docling/models/__init__.py +0 -0
  70. {docling-2.51.0 → docling-2.53.0}/docling/models/api_vlm_model.py +0 -0
  71. {docling-2.51.0 → docling-2.53.0}/docling/models/base_ocr_model.py +0 -0
  72. {docling-2.51.0 → docling-2.53.0}/docling/models/code_formula_model.py +0 -0
  73. {docling-2.51.0 → docling-2.53.0}/docling/models/document_picture_classifier.py +0 -0
  74. {docling-2.51.0 → docling-2.53.0}/docling/models/factories/__init__.py +0 -0
  75. {docling-2.51.0 → docling-2.53.0}/docling/models/factories/base_factory.py +0 -0
  76. {docling-2.51.0 → docling-2.53.0}/docling/models/factories/ocr_factory.py +0 -0
  77. {docling-2.51.0 → docling-2.53.0}/docling/models/factories/picture_description_factory.py +0 -0
  78. {docling-2.51.0 → docling-2.53.0}/docling/models/layout_model.py +0 -0
  79. {docling-2.51.0 → docling-2.53.0}/docling/models/ocr_mac_model.py +0 -0
  80. {docling-2.51.0 → docling-2.53.0}/docling/models/page_assemble_model.py +0 -0
  81. {docling-2.51.0 → docling-2.53.0}/docling/models/page_preprocessing_model.py +0 -0
  82. {docling-2.51.0 → docling-2.53.0}/docling/models/picture_description_api_model.py +0 -0
  83. {docling-2.51.0 → docling-2.53.0}/docling/models/picture_description_base_model.py +0 -0
  84. {docling-2.51.0 → docling-2.53.0}/docling/models/plugins/__init__.py +0 -0
  85. {docling-2.51.0 → docling-2.53.0}/docling/models/plugins/defaults.py +0 -0
  86. {docling-2.51.0 → docling-2.53.0}/docling/models/readingorder_model.py +0 -0
  87. {docling-2.51.0 → docling-2.53.0}/docling/models/table_structure_model.py +0 -0
  88. {docling-2.51.0 → docling-2.53.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  89. {docling-2.51.0 → docling-2.53.0}/docling/models/tesseract_ocr_model.py +0 -0
  90. {docling-2.51.0 → docling-2.53.0}/docling/models/utils/__init__.py +0 -0
  91. {docling-2.51.0 → docling-2.53.0}/docling/models/utils/hf_model_download.py +0 -0
  92. {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  93. {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  94. {docling-2.51.0 → docling-2.53.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  95. {docling-2.51.0 → docling-2.53.0}/docling/pipeline/__init__.py +0 -0
  96. {docling-2.51.0 → docling-2.53.0}/docling/py.typed +0 -0
  97. {docling-2.51.0 → docling-2.53.0}/docling/utils/__init__.py +0 -0
  98. {docling-2.51.0 → docling-2.53.0}/docling/utils/accelerator_utils.py +0 -0
  99. {docling-2.51.0 → docling-2.53.0}/docling/utils/api_image_request.py +0 -0
  100. {docling-2.51.0 → docling-2.53.0}/docling/utils/export.py +0 -0
  101. {docling-2.51.0 → docling-2.53.0}/docling/utils/glm_utils.py +0 -0
  102. {docling-2.51.0 → docling-2.53.0}/docling/utils/layout_postprocessor.py +0 -0
  103. {docling-2.51.0 → docling-2.53.0}/docling/utils/locks.py +0 -0
  104. {docling-2.51.0 → docling-2.53.0}/docling/utils/ocr_utils.py +0 -0
  105. {docling-2.51.0 → docling-2.53.0}/docling/utils/orientation.py +0 -0
  106. {docling-2.51.0 → docling-2.53.0}/docling/utils/profiling.py +0 -0
  107. {docling-2.51.0 → docling-2.53.0}/docling/utils/utils.py +0 -0
  108. {docling-2.51.0 → docling-2.53.0}/docling/utils/visualization.py +0 -0
  109. {docling-2.51.0 → docling-2.53.0}/docling.egg-info/SOURCES.txt +0 -0
  110. {docling-2.51.0 → docling-2.53.0}/docling.egg-info/dependency_links.txt +0 -0
  111. {docling-2.51.0 → docling-2.53.0}/docling.egg-info/entry_points.txt +0 -0
  112. {docling-2.51.0 → docling-2.53.0}/docling.egg-info/top_level.txt +0 -0
  113. {docling-2.51.0 → docling-2.53.0}/setup.cfg +0 -0
  114. {docling-2.51.0 → docling-2.53.0}/tests/test_asr_pipeline.py +0 -0
  115. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_asciidoc.py +0 -0
  116. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_csv.py +0 -0
  117. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_docling_json.py +0 -0
  118. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_docling_parse.py +0 -0
  119. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_docling_parse_v2.py +0 -0
  120. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_docling_parse_v4.py +0 -0
  121. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_html.py +0 -0
  122. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_jats.py +0 -0
  123. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_markdown.py +0 -0
  124. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_mets_gbs.py +0 -0
  125. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_msexcel.py +0 -0
  126. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_msword.py +0 -0
  127. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_patent_uspto.py +0 -0
  128. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_pdfium.py +0 -0
  129. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_pptx.py +0 -0
  130. {docling-2.51.0 → docling-2.53.0}/tests/test_backend_webp.py +0 -0
  131. {docling-2.51.0 → docling-2.53.0}/tests/test_cli.py +0 -0
  132. {docling-2.51.0 → docling-2.53.0}/tests/test_code_formula.py +0 -0
  133. {docling-2.51.0 → docling-2.53.0}/tests/test_data_gen_flag.py +0 -0
  134. {docling-2.51.0 → docling-2.53.0}/tests/test_document_picture_classifier.py +0 -0
  135. {docling-2.51.0 → docling-2.53.0}/tests/test_e2e_conversion.py +0 -0
  136. {docling-2.51.0 → docling-2.53.0}/tests/test_extraction.py +0 -0
  137. {docling-2.51.0 → docling-2.53.0}/tests/test_input_doc.py +0 -0
  138. {docling-2.51.0 → docling-2.53.0}/tests/test_interfaces.py +0 -0
  139. {docling-2.51.0 → docling-2.53.0}/tests/test_invalid_input.py +0 -0
  140. {docling-2.51.0 → docling-2.53.0}/tests/test_legacy_format_transform.py +0 -0
  141. {docling-2.51.0 → docling-2.53.0}/tests/test_ocr_utils.py +0 -0
  142. {docling-2.51.0 → docling-2.53.0}/tests/test_options.py +0 -0
  143. {docling-2.51.0 → docling-2.53.0}/tests/test_settings_load.py +0 -0
  144. {docling-2.51.0 → docling-2.53.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.51.0
3
+ Version: 2.53.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
26
26
  Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
- Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
29
+ Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.0
30
30
  Requires-Dist: docling-parse<5.0.0,>=4.4.0
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -108,18 +108,22 @@ Docling simplifies document processing, parsing diverse formats — including ad
108
108
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
109
109
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
110
110
  * 🔍 Extensive OCR support for scanned PDFs and images
111
- * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
111
+ * 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
112
112
  * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
113
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
113
114
  * 💻 Simple and convenient CLI
114
115
 
115
116
  ### What's new
116
117
  * 📤 Structured [information extraction][extraction] \[🧪 beta\]
118
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
119
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
117
120
 
118
121
  ### Coming soon
119
122
 
120
123
  * 📝 Metadata extraction, including title, authors, references & language
121
124
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
122
125
  * 📝 Complex chemistry understanding (Molecular structures)
126
+ * 📝 Parsing of Web Video Text Tracks (WebVTT) files
123
127
 
124
128
  ## Installation
125
129
 
@@ -145,7 +149,7 @@ result = converter.convert(source)
145
149
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
146
150
  ```
147
151
 
148
- More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
152
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
149
153
  the docs.
150
154
 
151
155
  ## CLI
@@ -156,9 +160,9 @@ Docling has a built-in CLI to run conversions.
156
160
  docling https://arxiv.org/pdf/2206.01062
157
161
  ```
158
162
 
159
- You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
163
+ You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
160
164
  ```bash
161
- docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
165
+ docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
162
166
  ```
163
167
  This will use MLX acceleration on supported Apple Silicon hardware.
164
168
 
@@ -36,18 +36,22 @@ Docling simplifies document processing, parsing diverse formats — including ad
36
36
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
37
37
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
38
38
  * 🔍 Extensive OCR support for scanned PDFs and images
39
- * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
39
+ * 👓 Support of several Visual Language Models ([GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M))
40
40
  * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
41
+ * 🔌 Connect to any agent using the [MCP server](https://docling-project.github.io/docling/usage/mcp/)
41
42
  * 💻 Simple and convenient CLI
42
43
 
43
44
  ### What's new
44
45
  * 📤 Structured [information extraction][extraction] \[🧪 beta\]
46
+ * 📑 New layout model (**Heron**) by default, for faster PDF parsing
47
+ * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
45
48
 
46
49
  ### Coming soon
47
50
 
48
51
  * 📝 Metadata extraction, including title, authors, references & language
49
52
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
50
53
  * 📝 Complex chemistry understanding (Molecular structures)
54
+ * 📝 Parsing of Web Video Text Tracks (WebVTT) files
51
55
 
52
56
  ## Installation
53
57
 
@@ -73,7 +77,7 @@ result = converter.convert(source)
73
77
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
74
78
  ```
75
79
 
76
- More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
80
+ More [advanced usage options](https://docling-project.github.io/docling/usage/advanced_options/) are available in
77
81
  the docs.
78
82
 
79
83
  ## CLI
@@ -84,9 +88,9 @@ Docling has a built-in CLI to run conversions.
84
88
  docling https://arxiv.org/pdf/2206.01062
85
89
  ```
86
90
 
87
- You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
91
+ You can also use 🥚[GraniteDocling](https://huggingface.co/ibm-granite/granite-docling-258M) and other VLMs via Docling CLI:
88
92
  ```bash
89
- docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
93
+ docling --pipeline vlm --vlm-model granite_docling https://arxiv.org/pdf/2206.01062
90
94
  ```
91
95
  This will use MLX acceleration on supported Apple Silicon hardware.
92
96
 
@@ -48,6 +48,7 @@ from docling.datamodel.base_models import (
48
48
  from docling.datamodel.document import ConversionResult
49
49
  from docling.datamodel.pipeline_options import (
50
50
  AsrPipelineOptions,
51
+ ConvertPipelineOptions,
51
52
  EasyOcrOptions,
52
53
  OcrOptions,
53
54
  PaginatedPipelineOptions,
@@ -63,6 +64,8 @@ from docling.datamodel.vlm_model_specs import (
63
64
  GOT2_TRANSFORMERS,
64
65
  GRANITE_VISION_OLLAMA,
65
66
  GRANITE_VISION_TRANSFORMERS,
67
+ GRANITEDOCLING_MLX,
68
+ GRANITEDOCLING_TRANSFORMERS,
66
69
  SMOLDOCLING_MLX,
67
70
  SMOLDOCLING_TRANSFORMERS,
68
71
  SMOLDOCLING_VLLM,
@@ -71,8 +74,13 @@ from docling.datamodel.vlm_model_specs import (
71
74
  from docling.document_converter import (
72
75
  AudioFormatOption,
73
76
  DocumentConverter,
77
+ ExcelFormatOption,
74
78
  FormatOption,
79
+ HTMLFormatOption,
80
+ MarkdownFormatOption,
75
81
  PdfFormatOption,
82
+ PowerpointFormatOption,
83
+ WordFormatOption,
76
84
  )
77
85
  from docling.models.factories import get_ocr_factory
78
86
  from docling.pipeline.asr_pipeline import AsrPipeline
@@ -328,7 +336,7 @@ def convert( # noqa: C901
328
336
  vlm_model: Annotated[
329
337
  VlmModelType,
330
338
  typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
331
- ] = VlmModelType.SMOLDOCLING,
339
+ ] = VlmModelType.GRANITEDOCLING,
332
340
  asr_model: Annotated[
333
341
  AsrModelType,
334
342
  typer.Option(..., help="Choose the ASR model to use with audio/video files."),
@@ -626,10 +634,33 @@ def convert( # noqa: C901
626
634
  backend=MetsGbsDocumentBackend,
627
635
  )
628
636
 
637
+ # SimplePipeline options
638
+ simple_format_option = ConvertPipelineOptions(
639
+ do_picture_description=enrich_picture_description,
640
+ do_picture_classification=enrich_picture_classes,
641
+ )
642
+ if artifacts_path is not None:
643
+ simple_format_option.artifacts_path = artifacts_path
644
+
629
645
  format_options = {
630
646
  InputFormat.PDF: pdf_format_option,
631
647
  InputFormat.IMAGE: pdf_format_option,
632
648
  InputFormat.METS_GBS: mets_gbs_format_option,
649
+ InputFormat.DOCX: WordFormatOption(
650
+ pipeline_options=simple_format_option
651
+ ),
652
+ InputFormat.PPTX: PowerpointFormatOption(
653
+ pipeline_options=simple_format_option
654
+ ),
655
+ InputFormat.XLSX: ExcelFormatOption(
656
+ pipeline_options=simple_format_option
657
+ ),
658
+ InputFormat.HTML: HTMLFormatOption(
659
+ pipeline_options=simple_format_option
660
+ ),
661
+ InputFormat.MD: MarkdownFormatOption(
662
+ pipeline_options=simple_format_option
663
+ ),
633
664
  }
634
665
 
635
666
  elif pipeline == ProcessingPipeline.VLM:
@@ -655,6 +686,18 @@ def convert( # noqa: C901
655
686
  "To run SmolDocling faster, please install mlx-vlm:\n"
656
687
  "pip install mlx-vlm"
657
688
  )
689
+ elif vlm_model == VlmModelType.GRANITEDOCLING:
690
+ pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
691
+ if sys.platform == "darwin":
692
+ try:
693
+ import mlx_vlm
694
+
695
+ pipeline_options.vlm_options = GRANITEDOCLING_MLX
696
+ except ImportError:
697
+ _log.warning(
698
+ "To run GraniteDocling faster, please install mlx-vlm:\n"
699
+ "pip install mlx-vlm"
700
+ )
658
701
  elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
659
702
  pipeline_options.vlm_options = SMOLDOCLING_VLLM
660
703
 
@@ -33,6 +33,8 @@ class _AvailableModels(str, Enum):
33
33
  CODE_FORMULA = "code_formula"
34
34
  PICTURE_CLASSIFIER = "picture_classifier"
35
35
  SMOLVLM = "smolvlm"
36
+ GRANITEDOCLING = "granitedocling"
37
+ GRANITEDOCLING_MLX = "granitedocling_mlx"
36
38
  SMOLDOCLING = "smoldocling"
37
39
  SMOLDOCLING_MLX = "smoldocling_mlx"
38
40
  GRANITE_VISION = "granite_vision"
@@ -108,6 +110,8 @@ def download(
108
110
  with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
109
111
  with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
110
112
  with_smolvlm=_AvailableModels.SMOLVLM in to_download,
113
+ with_granitedocling=_AvailableModels.GRANITEDOCLING in to_download,
114
+ with_granitedocling_mlx=_AvailableModels.GRANITEDOCLING_MLX in to_download,
111
115
  with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
112
116
  with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
113
117
  with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
@@ -12,7 +12,7 @@ from pydantic import (
12
12
  )
13
13
  from typing_extensions import deprecated
14
14
 
15
- from docling.datamodel import asr_model_specs
15
+ from docling.datamodel import asr_model_specs, vlm_model_specs
16
16
 
17
17
  # Import the following for backwards compatibility
18
18
  from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -114,7 +114,11 @@ class RapidOcrOptions(OcrOptions):
114
114
  cls_model_path: Optional[str] = None # same default as rapidocr
115
115
  rec_model_path: Optional[str] = None # same default as rapidocr
116
116
  rec_keys_path: Optional[str] = None # same default as rapidocr
117
- rec_font_path: Optional[str] = None # same default as rapidocr
117
+ rec_font_path: Optional[str] = None # Deprecated, please use font_path instead
118
+ font_path: Optional[str] = None # same default as rapidocr
119
+
120
+ # Dictionary to overwrite or pass-through additional parameters
121
+ rapidocr_params: Dict[str, Any] = Field(default_factory=dict)
118
122
 
119
123
  model_config = ConfigDict(
120
124
  extra="forbid",
@@ -135,6 +139,8 @@ class EasyOcrOptions(OcrOptions):
135
139
  recog_network: Optional[str] = "standard"
136
140
  download_enabled: bool = True
137
141
 
142
+ suppress_mps_warnings: bool = True
143
+
138
144
  model_config = ConfigDict(
139
145
  extra="forbid",
140
146
  protected_namespaces=(),
@@ -257,11 +263,21 @@ class PipelineOptions(BaseOptions):
257
263
  accelerator_options: AcceleratorOptions = AcceleratorOptions()
258
264
  enable_remote_services: bool = False
259
265
  allow_external_plugins: bool = False
266
+ artifacts_path: Optional[Union[Path, str]] = None
260
267
 
261
268
 
262
- class PaginatedPipelineOptions(PipelineOptions):
263
- artifacts_path: Optional[Union[Path, str]] = None
269
+ class ConvertPipelineOptions(PipelineOptions):
270
+ """Base convert pipeline options."""
271
+
272
+ do_picture_classification: bool = False # True: classify pictures in documents
264
273
 
274
+ do_picture_description: bool = False # True: run describe pictures in documents
275
+ picture_description_options: PictureDescriptionBaseOptions = (
276
+ smolvlm_picture_description
277
+ )
278
+
279
+
280
+ class PaginatedPipelineOptions(ConvertPipelineOptions):
265
281
  images_scale: float = 1.0
266
282
  generate_page_images: bool = False
267
283
  generate_picture_images: bool = False
@@ -274,7 +290,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
274
290
  )
275
291
  # If True, text from backend will be used instead of generated text
276
292
  vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
277
- smoldocling_vlm_conversion_options
293
+ vlm_model_specs.GRANITEDOCLING_TRANSFORMERS
278
294
  )
279
295
 
280
296
 
@@ -293,13 +309,11 @@ class LayoutOptions(BaseModel):
293
309
 
294
310
  class AsrPipelineOptions(PipelineOptions):
295
311
  asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
296
- artifacts_path: Optional[Union[Path, str]] = None
297
312
 
298
313
 
299
314
  class VlmExtractionPipelineOptions(PipelineOptions):
300
315
  """Options for extraction pipeline."""
301
316
 
302
- artifacts_path: Optional[Union[Path, str]] = None
303
317
  vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
304
318
 
305
319
 
@@ -310,8 +324,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
310
324
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
311
325
  do_code_enrichment: bool = False # True: perform code OCR
312
326
  do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
313
- do_picture_classification: bool = False # True: classify pictures in documents
314
- do_picture_description: bool = False # True: run describe pictures in documents
315
327
  force_backend_text: bool = (
316
328
  False # (To be used with vlms, or other generative models)
317
329
  )
@@ -319,9 +331,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
319
331
 
320
332
  table_structure_options: TableStructureOptions = TableStructureOptions()
321
333
  ocr_options: OcrOptions = EasyOcrOptions()
322
- picture_description_options: PictureDescriptionBaseOptions = (
323
- smolvlm_picture_description
324
- )
325
334
  layout_options: LayoutOptions = LayoutOptions()
326
335
 
327
336
  images_scale: float = 1.0
@@ -18,6 +18,35 @@ from docling.datamodel.pipeline_options_vlm_model import (
18
18
  _log = logging.getLogger(__name__)
19
19
 
20
20
 
21
+ # Granite-Docling
22
+ GRANITEDOCLING_TRANSFORMERS = InlineVlmOptions(
23
+ repo_id="ibm-granite/granite-docling-258M",
24
+ prompt="Convert this page to docling.",
25
+ response_format=ResponseFormat.DOCTAGS,
26
+ inference_framework=InferenceFramework.TRANSFORMERS,
27
+ transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
28
+ supported_devices=[
29
+ AcceleratorDevice.CPU,
30
+ AcceleratorDevice.CUDA,
31
+ ],
32
+ scale=2.0,
33
+ temperature=0.0,
34
+ max_new_tokens=8192,
35
+ stop_strings=["</doctag>", "<|end_of_text|>"],
36
+ )
37
+
38
+ GRANITEDOCLING_MLX = InlineVlmOptions(
39
+ repo_id="ibm-granite/granite-docling-258M-mlx",
40
+ prompt="Convert this page to docling.",
41
+ response_format=ResponseFormat.DOCTAGS,
42
+ inference_framework=InferenceFramework.MLX,
43
+ supported_devices=[AcceleratorDevice.MPS],
44
+ scale=2.0,
45
+ temperature=0.0,
46
+ max_new_tokens=8192,
47
+ stop_strings=["</doctag>", "<|end_of_text|>"],
48
+ )
49
+
21
50
  # SmolDocling
22
51
  SMOLDOCLING_MLX = InlineVlmOptions(
23
52
  repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
@@ -272,3 +301,4 @@ class VlmModelType(str, Enum):
272
301
  GRANITE_VISION_VLLM = "granite_vision_vllm"
273
302
  GRANITE_VISION_OLLAMA = "granite_vision_ollama"
274
303
  GOT_OCR_2 = "got_ocr_2"
304
+ GRANITEDOCLING = "granite_docling"
@@ -4,7 +4,13 @@ from collections.abc import Iterable
4
4
  from typing import Any, Generic, Optional, Protocol, Type, Union
5
5
 
6
6
  import numpy as np
7
- from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
7
+ from docling_core.types.doc import (
8
+ BoundingBox,
9
+ DocItem,
10
+ DoclingDocument,
11
+ NodeItem,
12
+ PictureItem,
13
+ )
8
14
  from PIL.Image import Image
9
15
  from typing_extensions import TypeVar
10
16
 
@@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel(
164
170
  return None
165
171
 
166
172
  assert isinstance(element, DocItem)
167
- element_prov = element.prov[0]
168
173
 
174
+ # Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
175
+ if len(element.prov) == 0 and isinstance(element, PictureItem):
176
+ embedded_im = element.get_image(conv_res.document)
177
+ if embedded_im is not None:
178
+ return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
179
+ else:
180
+ return None
181
+
182
+ # Crop the image from the page
183
+ element_prov = element.prov[0]
169
184
  bbox = element_prov.bbox
170
185
  width = bbox.r - bbox.l
171
186
  height = bbox.t - bbox.b
@@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel(
183
198
  cropped_image = conv_res.pages[page_ix].get_image(
184
199
  scale=self.images_scale, cropbox=expanded_bbox
185
200
  )
201
+
202
+ # Allow for images being embedded without the page backend or page images
203
+ if cropped_image is None and isinstance(element, PictureItem):
204
+ embedded_im = element.get_image(conv_res.document)
205
+ if embedded_im is not None:
206
+ return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
207
+ else:
208
+ return None
209
+
210
+ # Return the proper cropped image
186
211
  return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
@@ -78,14 +78,17 @@ class EasyOcrModel(BaseOcrModel):
78
78
  download_enabled = False
79
79
  model_storage_directory = str(artifacts_path / self._model_repo_folder)
80
80
 
81
- self.reader = easyocr.Reader(
82
- lang_list=self.options.lang,
83
- gpu=use_gpu,
84
- model_storage_directory=model_storage_directory,
85
- recog_network=self.options.recog_network,
86
- download_enabled=download_enabled,
87
- verbose=False,
88
- )
81
+ with warnings.catch_warnings():
82
+ if self.options.suppress_mps_warnings:
83
+ warnings.filterwarnings("ignore", message=".*pin_memory.*MPS.*")
84
+ self.reader = easyocr.Reader(
85
+ lang_list=self.options.lang,
86
+ gpu=use_gpu,
87
+ model_storage_directory=model_storage_directory,
88
+ recog_network=self.options.recog_network,
89
+ download_enabled=download_enabled,
90
+ verbose=False,
91
+ )
89
92
 
90
93
  @staticmethod
91
94
  def download_models(
@@ -147,7 +150,14 @@ class EasyOcrModel(BaseOcrModel):
147
150
  scale=self.scale, cropbox=ocr_rect
148
151
  )
149
152
  im = numpy.array(high_res_image)
150
- result = self.reader.readtext(im)
153
+
154
+ with warnings.catch_warnings():
155
+ if self.options.suppress_mps_warnings:
156
+ warnings.filterwarnings(
157
+ "ignore", message=".*pin_memory.*MPS.*"
158
+ )
159
+
160
+ result = self.reader.readtext(im)
151
161
 
152
162
  del high_res_image
153
163
  del im
@@ -67,7 +67,7 @@ class PictureDescriptionVlmModel(
67
67
  self.model = AutoModelForImageTextToText.from_pretrained(
68
68
  artifacts_path,
69
69
  device_map=self.device,
70
- torch_dtype=torch.bfloat16,
70
+ dtype=torch.bfloat16,
71
71
  _attn_implementation=(
72
72
  "flash_attention_2"
73
73
  if self.device.startswith("cuda")
@@ -62,32 +62,44 @@ class RapidOcrModel(BaseOcrModel):
62
62
  }
63
63
  backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)
64
64
 
65
+ params = {
66
+ # Global settings (these are still correct)
67
+ "Global.text_score": self.options.text_score,
68
+ "Global.font_path": self.options.font_path,
69
+ # "Global.verbose": self.options.print_verbose,
70
+ # Detection model settings
71
+ "Det.model_path": self.options.det_model_path,
72
+ "Det.use_cuda": use_cuda,
73
+ "Det.use_dml": use_dml,
74
+ "Det.intra_op_num_threads": intra_op_num_threads,
75
+ # Classification model settings
76
+ "Cls.model_path": self.options.cls_model_path,
77
+ "Cls.use_cuda": use_cuda,
78
+ "Cls.use_dml": use_dml,
79
+ "Cls.intra_op_num_threads": intra_op_num_threads,
80
+ # Recognition model settings
81
+ "Rec.model_path": self.options.rec_model_path,
82
+ "Rec.font_path": self.options.rec_font_path,
83
+ "Rec.keys_path": self.options.rec_keys_path,
84
+ "Rec.use_cuda": use_cuda,
85
+ "Rec.use_dml": use_dml,
86
+ "Rec.intra_op_num_threads": intra_op_num_threads,
87
+ "Det.engine_type": backend_enum,
88
+ "Cls.engine_type": backend_enum,
89
+ "Rec.engine_type": backend_enum,
90
+ }
91
+
92
+ if self.options.rec_font_path is not None:
93
+ _log.warning(
94
+ "The 'rec_font_path' option for RapidOCR is deprecated. Please use 'font_path' instead."
95
+ )
96
+ user_params = self.options.rapidocr_params
97
+ if user_params:
98
+ _log.debug("Overwriting RapidOCR params with user-provided values.")
99
+ params.update(user_params)
100
+
65
101
  self.reader = RapidOCR(
66
- params={
67
- # Global settings (these are still correct)
68
- "Global.text_score": self.options.text_score,
69
- # "Global.verbose": self.options.print_verbose,
70
- # Detection model settings
71
- "Det.model_path": self.options.det_model_path,
72
- "Det.use_cuda": use_cuda,
73
- "Det.use_dml": use_dml,
74
- "Det.intra_op_num_threads": intra_op_num_threads,
75
- # Classification model settings
76
- "Cls.model_path": self.options.cls_model_path,
77
- "Cls.use_cuda": use_cuda,
78
- "Cls.use_dml": use_dml,
79
- "Cls.intra_op_num_threads": intra_op_num_threads,
80
- # Recognition model settings
81
- "Rec.model_path": self.options.rec_model_path,
82
- "Rec.font_path": self.options.rec_font_path,
83
- "Rec.keys_path": self.options.rec_keys_path,
84
- "Rec.use_cuda": use_cuda,
85
- "Rec.use_dml": use_dml,
86
- "Rec.intra_op_num_threads": intra_op_num_threads,
87
- "Det.engine_type": backend_enum,
88
- "Cls.engine_type": backend_enum,
89
- "Rec.engine_type": backend_enum,
90
- }
102
+ params=params,
91
103
  )
92
104
 
93
105
  def __call__(
@@ -120,6 +132,9 @@ class RapidOcrModel(BaseOcrModel):
120
132
  use_cls=self.options.use_cls,
121
133
  use_rec=self.options.use_rec,
122
134
  )
135
+ if result is None or result.boxes is None:
136
+ _log.warning("RapidOCR returned empty result!")
137
+ continue
123
138
  result = list(
124
139
  zip(result.boxes.tolist(), result.txts, result.scores)
125
140
  )
@@ -112,7 +112,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
112
112
  self.vlm_model = model_cls.from_pretrained(
113
113
  artifacts_path,
114
114
  device_map=self.device,
115
- torch_dtype=self.vlm_options.torch_dtype,
115
+ dtype=self.vlm_options.torch_dtype,
116
116
  _attn_implementation=(
117
117
  "flash_attention_2"
118
118
  if self.device.startswith("cuda")
@@ -144,7 +144,7 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
144
144
  self.vlm_model = AutoModelForImageTextToText.from_pretrained(
145
145
  artifacts_path,
146
146
  device_map=self.device,
147
- torch_dtype=self.vlm_options.torch_dtype,
147
+ dtype=self.vlm_options.torch_dtype,
148
148
  _attn_implementation=(
149
149
  "flash_attention_2"
150
150
  if self.device.startswith("cuda")
@@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline):
208
208
 
209
209
  self.pipeline_options: AsrPipelineOptions = pipeline_options
210
210
 
211
- artifacts_path: Optional[Path] = None
212
- if pipeline_options.artifacts_path is not None:
213
- artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
214
- elif settings.artifacts_path is not None:
215
- artifacts_path = Path(settings.artifacts_path).expanduser()
216
-
217
- if artifacts_path is not None and not artifacts_path.is_dir():
218
- raise RuntimeError(
219
- f"The value of {artifacts_path=} is not valid. "
220
- "When defined, it must point to a folder containing all models required by the pipeline."
221
- )
222
-
223
211
  if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
224
212
  asr_options: InlineAsrNativeWhisperOptions = (
225
213
  self.pipeline_options.asr_options
226
214
  )
227
215
  self._model = _NativeWhisperModel(
228
216
  enabled=True, # must be always enabled for this pipeline to make sense.
229
- artifacts_path=artifacts_path,
217
+ artifacts_path=self.artifacts_path,
230
218
  accelerator_options=pipeline_options.accelerator_options,
231
219
  asr_options=asr_options,
232
220
  )
@@ -1,19 +1,33 @@
1
1
  import logging
2
2
  from abc import ABC, abstractmethod
3
+ from pathlib import Path
3
4
  from typing import Optional
4
5
 
5
6
  from docling.datamodel.base_models import ConversionStatus, ErrorItem
6
7
  from docling.datamodel.document import InputDocument
7
8
  from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
8
- from docling.datamodel.pipeline_options import BaseOptions
9
+ from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
10
+ from docling.datamodel.settings import settings
9
11
 
10
12
  _log = logging.getLogger(__name__)
11
13
 
12
14
 
13
15
  class BaseExtractionPipeline(ABC):
14
- def __init__(self, pipeline_options: BaseOptions):
16
+ def __init__(self, pipeline_options: PipelineOptions):
15
17
  self.pipeline_options = pipeline_options
16
18
 
19
+ self.artifacts_path: Optional[Path] = None
20
+ if pipeline_options.artifacts_path is not None:
21
+ self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
22
+ elif settings.artifacts_path is not None:
23
+ self.artifacts_path = Path(settings.artifacts_path).expanduser()
24
+
25
+ if self.artifacts_path is not None and not self.artifacts_path.is_dir():
26
+ raise RuntimeError(
27
+ f"The value of {self.artifacts_path=} is not valid. "
28
+ "When defined, it must point to a folder containing all models required by the pipeline."
29
+ )
30
+
17
31
  def execute(
18
32
  self,
19
33
  in_doc: InputDocument,
@@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC):
54
68
 
55
69
  @classmethod
56
70
  @abstractmethod
57
- def get_default_options(cls) -> BaseOptions:
71
+ def get_default_options(cls) -> PipelineOptions:
58
72
  pass