docling 2.27.0__tar.gz → 2.28.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {docling-2.27.0 → docling-2.28.1}/PKG-INFO +27 -32
  2. {docling-2.27.0 → docling-2.28.1}/README.md +25 -30
  3. {docling-2.27.0 → docling-2.28.1}/docling/backend/docling_parse_v4_backend.py +20 -13
  4. {docling-2.27.0 → docling-2.28.1}/docling/backend/mspowerpoint_backend.py +18 -0
  5. {docling-2.27.0 → docling-2.28.1}/docling/backend/msword_backend.py +56 -14
  6. {docling-2.27.0 → docling-2.28.1}/docling/cli/main.py +81 -38
  7. {docling-2.27.0 → docling-2.28.1}/docling/datamodel/pipeline_options.py +28 -2
  8. {docling-2.27.0 → docling-2.28.1}/docling/document_converter.py +29 -17
  9. docling-2.28.1/docling/models/hf_mlx_model.py +137 -0
  10. {docling-2.27.0 → docling-2.28.1}/docling/models/page_preprocessing_model.py +7 -1
  11. docling-2.28.1/docling/pipeline/vlm_pipeline.py +214 -0
  12. {docling-2.27.0 → docling-2.28.1}/pyproject.toml +3 -2
  13. docling-2.27.0/docling/pipeline/vlm_pipeline.py +0 -534
  14. {docling-2.27.0 → docling-2.28.1}/LICENSE +0 -0
  15. {docling-2.27.0 → docling-2.28.1}/docling/__init__.py +0 -0
  16. {docling-2.27.0 → docling-2.28.1}/docling/backend/__init__.py +0 -0
  17. {docling-2.27.0 → docling-2.28.1}/docling/backend/abstract_backend.py +0 -0
  18. {docling-2.27.0 → docling-2.28.1}/docling/backend/asciidoc_backend.py +0 -0
  19. {docling-2.27.0 → docling-2.28.1}/docling/backend/csv_backend.py +0 -0
  20. {docling-2.27.0 → docling-2.28.1}/docling/backend/docling_parse_backend.py +0 -0
  21. {docling-2.27.0 → docling-2.28.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  22. {docling-2.27.0 → docling-2.28.1}/docling/backend/docx/__init__.py +0 -0
  23. {docling-2.27.0 → docling-2.28.1}/docling/backend/docx/latex/__init__.py +0 -0
  24. {docling-2.27.0 → docling-2.28.1}/docling/backend/docx/latex/latex_dict.py +0 -0
  25. {docling-2.27.0 → docling-2.28.1}/docling/backend/docx/latex/omml.py +0 -0
  26. {docling-2.27.0 → docling-2.28.1}/docling/backend/html_backend.py +0 -0
  27. {docling-2.27.0 → docling-2.28.1}/docling/backend/json/__init__.py +0 -0
  28. {docling-2.27.0 → docling-2.28.1}/docling/backend/json/docling_json_backend.py +0 -0
  29. {docling-2.27.0 → docling-2.28.1}/docling/backend/md_backend.py +0 -0
  30. {docling-2.27.0 → docling-2.28.1}/docling/backend/msexcel_backend.py +0 -0
  31. {docling-2.27.0 → docling-2.28.1}/docling/backend/pdf_backend.py +0 -0
  32. {docling-2.27.0 → docling-2.28.1}/docling/backend/pypdfium2_backend.py +0 -0
  33. {docling-2.27.0 → docling-2.28.1}/docling/backend/xml/__init__.py +0 -0
  34. {docling-2.27.0 → docling-2.28.1}/docling/backend/xml/jats_backend.py +0 -0
  35. {docling-2.27.0 → docling-2.28.1}/docling/backend/xml/uspto_backend.py +0 -0
  36. {docling-2.27.0 → docling-2.28.1}/docling/chunking/__init__.py +0 -0
  37. {docling-2.27.0 → docling-2.28.1}/docling/cli/__init__.py +0 -0
  38. {docling-2.27.0 → docling-2.28.1}/docling/cli/models.py +0 -0
  39. {docling-2.27.0 → docling-2.28.1}/docling/cli/tools.py +0 -0
  40. {docling-2.27.0 → docling-2.28.1}/docling/datamodel/__init__.py +0 -0
  41. {docling-2.27.0 → docling-2.28.1}/docling/datamodel/base_models.py +0 -0
  42. {docling-2.27.0 → docling-2.28.1}/docling/datamodel/document.py +0 -0
  43. {docling-2.27.0 → docling-2.28.1}/docling/datamodel/settings.py +0 -0
  44. {docling-2.27.0 → docling-2.28.1}/docling/exceptions.py +0 -0
  45. {docling-2.27.0 → docling-2.28.1}/docling/models/__init__.py +0 -0
  46. {docling-2.27.0 → docling-2.28.1}/docling/models/base_model.py +0 -0
  47. {docling-2.27.0 → docling-2.28.1}/docling/models/base_ocr_model.py +0 -0
  48. {docling-2.27.0 → docling-2.28.1}/docling/models/code_formula_model.py +0 -0
  49. {docling-2.27.0 → docling-2.28.1}/docling/models/document_picture_classifier.py +0 -0
  50. {docling-2.27.0 → docling-2.28.1}/docling/models/easyocr_model.py +0 -0
  51. {docling-2.27.0 → docling-2.28.1}/docling/models/factories/__init__.py +0 -0
  52. {docling-2.27.0 → docling-2.28.1}/docling/models/factories/base_factory.py +0 -0
  53. {docling-2.27.0 → docling-2.28.1}/docling/models/factories/ocr_factory.py +0 -0
  54. {docling-2.27.0 → docling-2.28.1}/docling/models/factories/picture_description_factory.py +0 -0
  55. {docling-2.27.0 → docling-2.28.1}/docling/models/hf_vlm_model.py +0 -0
  56. {docling-2.27.0 → docling-2.28.1}/docling/models/layout_model.py +0 -0
  57. {docling-2.27.0 → docling-2.28.1}/docling/models/ocr_mac_model.py +0 -0
  58. {docling-2.27.0 → docling-2.28.1}/docling/models/page_assemble_model.py +0 -0
  59. {docling-2.27.0 → docling-2.28.1}/docling/models/picture_description_api_model.py +0 -0
  60. {docling-2.27.0 → docling-2.28.1}/docling/models/picture_description_base_model.py +0 -0
  61. {docling-2.27.0 → docling-2.28.1}/docling/models/picture_description_vlm_model.py +0 -0
  62. {docling-2.27.0 → docling-2.28.1}/docling/models/plugins/__init__.py +0 -0
  63. {docling-2.27.0 → docling-2.28.1}/docling/models/plugins/defaults.py +0 -0
  64. {docling-2.27.0 → docling-2.28.1}/docling/models/rapid_ocr_model.py +0 -0
  65. {docling-2.27.0 → docling-2.28.1}/docling/models/readingorder_model.py +0 -0
  66. {docling-2.27.0 → docling-2.28.1}/docling/models/table_structure_model.py +0 -0
  67. {docling-2.27.0 → docling-2.28.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  68. {docling-2.27.0 → docling-2.28.1}/docling/models/tesseract_ocr_model.py +0 -0
  69. {docling-2.27.0 → docling-2.28.1}/docling/pipeline/__init__.py +0 -0
  70. {docling-2.27.0 → docling-2.28.1}/docling/pipeline/base_pipeline.py +0 -0
  71. {docling-2.27.0 → docling-2.28.1}/docling/pipeline/simple_pipeline.py +0 -0
  72. {docling-2.27.0 → docling-2.28.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  73. {docling-2.27.0 → docling-2.28.1}/docling/py.typed +0 -0
  74. {docling-2.27.0 → docling-2.28.1}/docling/utils/__init__.py +0 -0
  75. {docling-2.27.0 → docling-2.28.1}/docling/utils/accelerator_utils.py +0 -0
  76. {docling-2.27.0 → docling-2.28.1}/docling/utils/export.py +0 -0
  77. {docling-2.27.0 → docling-2.28.1}/docling/utils/glm_utils.py +0 -0
  78. {docling-2.27.0 → docling-2.28.1}/docling/utils/layout_postprocessor.py +0 -0
  79. {docling-2.27.0 → docling-2.28.1}/docling/utils/locks.py +0 -0
  80. {docling-2.27.0 → docling-2.28.1}/docling/utils/model_downloader.py +0 -0
  81. {docling-2.27.0 → docling-2.28.1}/docling/utils/ocr_utils.py +0 -0
  82. {docling-2.27.0 → docling-2.28.1}/docling/utils/profiling.py +0 -0
  83. {docling-2.27.0 → docling-2.28.1}/docling/utils/utils.py +0 -0
  84. {docling-2.27.0 → docling-2.28.1}/docling/utils/visualization.py +0 -0
{docling-2.27.0 → docling-2.28.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: docling
- Version: 2.27.0
+ Version: 2.28.1
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
  Home-page: https://github.com/docling-project/docling
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
  Requires-Dist: certifi (>=2024.7.4)
- Requires-Dist: docling-core[chunking] (>=2.23.0,<3.0.0)
+ Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)

  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.

@@ -98,12 +99,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
  * 🔍 Extensive OCR support for scanned PDFs and images
+ * 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
  * 💻 Simple and convenient CLI

  ### Coming soon

  * 📝 Metadata extraction, including title, authors, references & language
- * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
  * 📝 Complex chemistry understanding (Molecular structures)

@@ -120,7 +121,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl

  ## Getting started

- To convert individual documents, use `convert()`, for example:
+ To convert individual documents with python, use `convert()`, for example:

  ```python
  from docling.document_converter import DocumentConverter
@@ -134,6 +135,22 @@ print(result.document.export_to_markdown()) # output: "## Docling Technical Rep
  More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
  the docs.

+ ## CLI
+
+ Docling has a built-in CLI to run conversions.
+
+ ```bash
+ docling https://arxiv.org/pdf/2206.01062
+ ```
+
+ You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
+ ```bash
+ docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
+ ```
+ This will use MLX acceleration on supported Apple Silicon hardware.
+
+ Read more [here](https://docling-project.github.io/docling/usage/)
+
  ## Documentation

  Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
@@ -150,32 +167,6 @@ To further accelerate your AI application development, check out Docling's nativ
  [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
  and tools.

- ## Apify Actor
-
- <a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
-
- You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on Apify platform. Simply provide a document URL and get the processed result:
-
- ```bash
- apify call vancura/docling -i '{
-   "options": {
-     "to_formats": ["md", "json", "html", "text", "doctags"]
-   },
-   "http_sources": [
-     {"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
-     {"url": "https://arxiv.org/pdf/2408.09869"}
-   ]
- }'
- ```
-
- The Actor stores results in:
-
- * Processed document in key-value store (`OUTPUT_RESULT`)
- * Processing logs (`DOCLING_LOG`)
- * Dataset record with result URL and status
-
- Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
-
  ## Get help and support

  Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
@@ -210,9 +201,13 @@ If you use Docling in your projects, please consider citing the following:
  The Docling codebase is under MIT license.
  For individual model usage, please refer to the model licenses found in the original packages.

- ## IBM ❤️ Open Source AI
+ ## LF AI & Data
+
+ Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
+
+ ### IBM ❤️ Open Source AI

- Docling has been brought to you by IBM.
+ The project was started by the AI for knowledge team at IBM Research Zurich.

  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
{docling-2.27.0 → docling-2.28.1}/README.md

@@ -22,6 +22,7 @@
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)

  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.

@@ -34,12 +35,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
  * 🔍 Extensive OCR support for scanned PDFs and images
+ * 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
  * 💻 Simple and convenient CLI

  ### Coming soon

  * 📝 Metadata extraction, including title, authors, references & language
- * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
  * 📝 Complex chemistry understanding (Molecular structures)

@@ -56,7 +57,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl

  ## Getting started

- To convert individual documents, use `convert()`, for example:
+ To convert individual documents with python, use `convert()`, for example:

  ```python
  from docling.document_converter import DocumentConverter
@@ -70,6 +71,22 @@ print(result.document.export_to_markdown()) # output: "## Docling Technical Rep
  More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
  the docs.

+ ## CLI
+
+ Docling has a built-in CLI to run conversions.
+
+ ```bash
+ docling https://arxiv.org/pdf/2206.01062
+ ```
+
+ You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
+ ```bash
+ docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
+ ```
+ This will use MLX acceleration on supported Apple Silicon hardware.
+
+ Read more [here](https://docling-project.github.io/docling/usage/)
+
  ## Documentation

  Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
@@ -86,32 +103,6 @@ To further accelerate your AI application development, check out Docling's nativ
  [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
  and tools.

- ## Apify Actor
-
- <a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
-
- You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on Apify platform. Simply provide a document URL and get the processed result:
-
- ```bash
- apify call vancura/docling -i '{
-   "options": {
-     "to_formats": ["md", "json", "html", "text", "doctags"]
-   },
-   "http_sources": [
-     {"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
-     {"url": "https://arxiv.org/pdf/2408.09869"}
-   ]
- }'
- ```
-
- The Actor stores results in:
-
- * Processed document in key-value store (`OUTPUT_RESULT`)
- * Processing logs (`DOCLING_LOG`)
- * Dataset record with result URL and status
-
- Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
-
  ## Get help and support

  Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
@@ -146,9 +137,13 @@ If you use Docling in your projects, please consider citing the following:
  The Docling codebase is under MIT license.
  For individual model usage, please refer to the model licenses found in the original packages.

- ## IBM ❤️ Open Source AI
+ ## LF AI & Data
+
+ Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
+
+ ### IBM ❤️ Open Source AI

- Docling has been brought to you by IBM.
+ The project was started by the AI for knowledge team at IBM Research Zurich.

  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
{docling-2.27.0 → docling-2.28.1}/docling/backend/docling_parse_v4_backend.py

@@ -112,23 +112,30 @@ class DoclingParseV4PageBackend(PdfPageBackend):
          padbox.r = page_size.width - padbox.r
          padbox.t = page_size.height - padbox.t

-         image = (
-             self._ppage.render(
-                 scale=scale * 1.5,
-                 rotation=0,  # no additional rotation
-                 crop=padbox.as_tuple(),
-             )
-             .to_pil()
-             .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
-         )  # We resize the image from 1.5x the given scale to make it sharper.
+         with pypdfium2_lock:
+             image = (
+                 self._ppage.render(
+                     scale=scale * 1.5,
+                     rotation=0,  # no additional rotation
+                     crop=padbox.as_tuple(),
+                 )
+                 .to_pil()
+                 .resize(
+                     size=(round(cropbox.width * scale), round(cropbox.height * scale))
+                 )
+             )  # We resize the image from 1.5x the given scale to make it sharper.

          return image

      def get_size(self) -> Size:
-         return Size(
-             width=self._dpage.dimension.width,
-             height=self._dpage.dimension.height,
-         )
+         with pypdfium2_lock:
+             return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+
+         # TODO: Take width and height from docling-parse.
+         # return Size(
+         #     width=self._dpage.dimension.width,
+         #     height=self._dpage.dimension.height,
+         # )

      def unload(self):
          self._ppage = None
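The change above serializes every pypdfium2 call behind a shared lock, since pypdfium2 is not thread-safe. A minimal sketch of that pattern, assuming a module-level `threading.Lock` like the one docling keeps in `docling/utils/locks.py` (the standalone lock and helper function below are illustrative, not docling API):

```python
import threading

import pypdfium2 as pdfium

# Assumption: a single module-level lock that every component touching
# pypdfium2 imports and reuses, mirroring docling/utils/locks.py.
pypdfium2_lock = threading.Lock()


def render_page_image(pdf_path: str, page_no: int, scale: float = 2.0):
    """Render one page to a PIL image, serializing all pypdfium2 access."""
    with pypdfium2_lock:
        pdf = pdfium.PdfDocument(pdf_path)
        page = pdf[page_no]
        # render() produces a bitmap; to_pil() converts it to a PIL image.
        image = page.render(scale=scale).to_pil()
        pdf.close()
    return image
```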
{docling-2.27.0 → docling-2.28.1}/docling/backend/mspowerpoint_backend.py

@@ -16,6 +16,7 @@ from docling_core.types.doc import (
      TableCell,
      TableData,
  )
+ from docling_core.types.doc.document import ContentLayer
  from PIL import Image, UnidentifiedImageError
  from pptx import Presentation
  from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
              for shape in slide.shapes:
                  handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)

+             # Handle notes slide
+             if slide.has_notes_slide:
+                 notes_slide = slide.notes_slide
+                 notes_text = notes_slide.notes_text_frame.text.strip()
+                 if notes_text:
+                     bbox = BoundingBox(l=0, t=0, r=0, b=0)
+                     prov = ProvenanceItem(
+                         page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
+                     )
+                     doc.add_text(
+                         label=DocItemLabel.TEXT,
+                         parent=parent_slide,
+                         text=notes_text,
+                         prov=prov,
+                         content_layer=ContentLayer.FURNITURE,
+                     )
+
          return doc
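The speaker-notes accessors used above are plain python-pptx calls; a small standalone sketch of just that part (the file name is a placeholder):

```python
from pptx import Presentation

# Placeholder input file; any .pptx with speaker notes will do.
prs = Presentation("deck.pptx")

for slide_ind, slide in enumerate(prs.slides):
    if slide.has_notes_slide:
        # notes_text_frame holds the full text of the notes placeholder.
        notes_text = slide.notes_slide.notes_text_frame.text.strip()
        if notes_text:
            print(f"slide {slide_ind + 1}: {notes_text}")
```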
{docling-2.27.0 → docling-2.28.1}/docling/backend/msword_backend.py

@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
          self.max_levels: int = 10
          self.level_at_new_list: Optional[int] = None
          self.parents: dict[int, Optional[NodeItem]] = {}
+         self.numbered_headers: dict[int, int] = {}
          for i in range(-1, self.max_levels):
              self.parents[i] = None

@@ -275,8 +276,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
              only_equations.append(latex_equation)
              texts_and_equations.append(latex_equation)

-         if "".join(only_texts) != text:
-             return text
+         if "".join(only_texts).strip() != text.strip():
+             # If we are not able to reconstruct the initial raw text
+             # do not try to parse equations and return the original
+             return text, []

          return "".join(texts_and_equations), only_equations

@@ -344,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                  parent=None, label=DocItemLabel.TITLE, text=text
              )
          elif "Heading" in p_style_id:
-             self.add_header(doc, p_level, text)
+             style_element = getattr(paragraph.style, "element", None)
+             if style_element:
+                 is_numbered_style = (
+                     "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
+                 )
+             else:
+                 is_numbered_style = False
+             self.add_header(doc, p_level, text, is_numbered_style)

          elif len(equations) > 0:
              if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
@@ -365,6 +375,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
              for eq in equations:
                  if len(text_tmp) == 0:
                      break
+
                  pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
                  text_tmp = text_tmp.split(eq, maxsplit=1)[1]
                  if len(pre_eq_text) > 0:
@@ -412,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
          return

      def add_header(
-         self, doc: DoclingDocument, curr_level: Optional[int], text: str
+         self,
+         doc: DoclingDocument,
+         curr_level: Optional[int],
+         text: str,
+         is_numbered_style: bool = False,
      ) -> None:
          level = self.get_level()
          if isinstance(curr_level, int):
@@ -430,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                      if key >= curr_level:
                          self.parents[key] = None

-             self.parents[curr_level] = doc.add_heading(
-                 parent=self.parents[curr_level - 1],
-                 text=text,
-                 level=curr_level,
-             )
+             current_level = curr_level
+             parent_level = curr_level - 1
+             add_level = curr_level
          else:
-             self.parents[self.level] = doc.add_heading(
-                 parent=self.parents[self.level - 1],
-                 text=text,
-                 level=1,
-             )
+             current_level = self.level
+             parent_level = self.level - 1
+             add_level = 1
+
+         if is_numbered_style:
+             if add_level in self.numbered_headers:
+                 self.numbered_headers[add_level] += 1
+             else:
+                 self.numbered_headers[add_level] = 1
+             text = f"{self.numbered_headers[add_level]} {text}"
+
+             # Reset deeper levels
+             next_level = add_level + 1
+             while next_level in self.numbered_headers:
+                 self.numbered_headers[next_level] = 0
+                 next_level += 1
+
+             # Scan upper levels
+             previous_level = add_level - 1
+             while previous_level in self.numbered_headers:
+                 # MSWord convention: no empty sublevels
+                 # I.e., sub-sub section (2.0.1) without a sub-section (2.1)
+                 # is processed as 2.1.1
+                 if self.numbered_headers[previous_level] == 0:
+                     self.numbered_headers[previous_level] += 1
+
+                 text = f"{self.numbered_headers[previous_level]}.{text}"
+                 previous_level -= 1
+
+         self.parents[current_level] = doc.add_heading(
+             parent=self.parents[parent_level],
+             text=text,
+             level=add_level,
+         )
          return

      def add_listitem(
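The new `numbered_headers` dictionary gives headings whose Word style carries `<w:numPr>` an explicit section number. A standalone sketch of the same counter logic, where the helper name and the bare-dict state are assumptions of this sketch rather than docling API:

```python
def number_heading(counters: dict[int, int], level: int, text: str) -> str:
    """Prefix a heading with MS Word style section numbering (sketch)."""
    # Bump the counter of the current heading level.
    counters[level] = counters.get(level, 0) + 1
    text = f"{counters[level]} {text}"

    # Reset deeper levels so a new section restarts its subsection numbers.
    next_level = level + 1
    while next_level in counters:
        counters[next_level] = 0
        next_level += 1

    # Prepend upper-level counters; an empty sublevel is promoted to 1,
    # matching the convention noted above (2.0.1 is rendered as 2.1.1).
    previous_level = level - 1
    while previous_level in counters:
        if counters[previous_level] == 0:
            counters[previous_level] += 1
        text = f"{counters[previous_level]}.{text}"
        previous_level -= 1
    return text


counters: dict[int, int] = {}
print(number_heading(counters, 1, "Introduction"))  # 1 Introduction
print(number_heading(counters, 2, "Background"))    # 1.1 Background
print(number_heading(counters, 1, "Methods"))       # 2 Methods
```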
{docling-2.27.0 → docling-2.28.1}/docling/cli/main.py

@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
      AcceleratorOptions,
      EasyOcrOptions,
      OcrOptions,
+     PaginatedPipelineOptions,
      PdfBackend,
+     PdfPipeline,
      PdfPipelineOptions,
      TableFormerMode,
+     VlmModelType,
+     VlmPipelineOptions,
+     granite_vision_vlm_conversion_options,
+     smoldocling_vlm_conversion_options,
+     smoldocling_vlm_mlx_conversion_options,
  )
  from docling.datamodel.settings import settings
  from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
  from docling.models.factories import get_ocr_factory
+ from docling.pipeline.vlm_pipeline import VlmPipeline

  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -200,6 +208,14 @@ def convert(
              help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
          ),
      ] = ImageRefMode.EMBEDDED,
+     pipeline: Annotated[
+         PdfPipeline,
+         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
+     ] = PdfPipeline.STANDARD,
+     vlm_model: Annotated[
+         VlmModelType,
+         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
+     ] = VlmModelType.SMOLDOCLING,
      ocr: Annotated[
          bool,
          typer.Option(
@@ -420,50 +436,77 @@ def convert(
          ocr_options.lang = ocr_lang_list

      accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-     pipeline_options = PdfPipelineOptions(
-         allow_external_plugins=allow_external_plugins,
-         enable_remote_services=enable_remote_services,
-         accelerator_options=accelerator_options,
-         do_ocr=ocr,
-         ocr_options=ocr_options,
-         do_table_structure=True,
-         do_code_enrichment=enrich_code,
-         do_formula_enrichment=enrich_formula,
-         do_picture_description=enrich_picture_description,
-         do_picture_classification=enrich_picture_classes,
-         document_timeout=document_timeout,
-     )
-     pipeline_options.table_structure_options.do_cell_matching = (
-         True  # do_cell_matching
-     )
-     pipeline_options.table_structure_options.mode = table_mode
+     pipeline_options: PaginatedPipelineOptions
+
+     if pipeline == PdfPipeline.STANDARD:
+         pipeline_options = PdfPipelineOptions(
+             allow_external_plugins=allow_external_plugins,
+             enable_remote_services=enable_remote_services,
+             accelerator_options=accelerator_options,
+             do_ocr=ocr,
+             ocr_options=ocr_options,
+             do_table_structure=True,
+             do_code_enrichment=enrich_code,
+             do_formula_enrichment=enrich_formula,
+             do_picture_description=enrich_picture_description,
+             do_picture_classification=enrich_picture_classes,
+             document_timeout=document_timeout,
+         )
+         pipeline_options.table_structure_options.do_cell_matching = (
+             True  # do_cell_matching
+         )
+         pipeline_options.table_structure_options.mode = table_mode
+
+         if image_export_mode != ImageRefMode.PLACEHOLDER:
+             pipeline_options.generate_page_images = True
+             pipeline_options.generate_picture_images = (
+                 True  # FIXME: to be deprecated in verson 3
+             )
+             pipeline_options.images_scale = 2
+
+         backend: Type[PdfDocumentBackend]
+         if pdf_backend == PdfBackend.DLPARSE_V1:
+             backend = DoclingParseDocumentBackend
+         elif pdf_backend == PdfBackend.DLPARSE_V2:
+             backend = DoclingParseV2DocumentBackend
+         elif pdf_backend == PdfBackend.DLPARSE_V4:
+             backend = DoclingParseV4DocumentBackend  # type: ignore
+         elif pdf_backend == PdfBackend.PYPDFIUM2:
+             backend = PyPdfiumDocumentBackend  # type: ignore
+         else:
+             raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+
+         pdf_format_option = PdfFormatOption(
+             pipeline_options=pipeline_options,
+             backend=backend,  # pdf_backend
+         )
+     elif pipeline == PdfPipeline.VLM:
+         pipeline_options = VlmPipelineOptions()
+
+         if vlm_model == VlmModelType.GRANITE_VISION:
+             pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+         elif vlm_model == VlmModelType.SMOLDOCLING:
+             pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+             if sys.platform == "darwin":
+                 try:
+                     import mlx_vlm
+
+                     pipeline_options.vlm_options = (
+                         smoldocling_vlm_mlx_conversion_options
+                     )
+                 except ImportError:
+                     _log.warning(
+                         "To run SmolDocling faster, please install mlx-vlm:\n"
+                         "pip install mlx-vlm"
+                     )

-     if image_export_mode != ImageRefMode.PLACEHOLDER:
-         pipeline_options.generate_page_images = True
-         pipeline_options.generate_picture_images = (
-             True  # FIXME: to be deprecated in verson 3
+         pdf_format_option = PdfFormatOption(
+             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
          )
-         pipeline_options.images_scale = 2

      if artifacts_path is not None:
          pipeline_options.artifacts_path = artifacts_path

-     backend: Type[PdfDocumentBackend]
-     if pdf_backend == PdfBackend.DLPARSE_V1:
-         backend = DoclingParseDocumentBackend
-     elif pdf_backend == PdfBackend.DLPARSE_V2:
-         backend = DoclingParseV2DocumentBackend
-     elif pdf_backend == PdfBackend.DLPARSE_V4:
-         backend = DoclingParseV4DocumentBackend  # type: ignore
-     elif pdf_backend == PdfBackend.PYPDFIUM2:
-         backend = PyPdfiumDocumentBackend  # type: ignore
-     else:
-         raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-     pdf_format_option = PdfFormatOption(
-         pipeline_options=pipeline_options,
-         backend=backend,  # pdf_backend
-     )
      format_options: Dict[InputFormat, FormatOption] = {
          InputFormat.PDF: pdf_format_option,
          InputFormat.IMAGE: pdf_format_option,
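Outside the CLI, the same routing can be done in Python by passing `pipeline_cls=VlmPipeline` to `PdfFormatOption`, exactly as the new branch above does. A minimal sketch, assuming the import paths used elsewhere in docling (e.g. `InputFormat` from `docling.datamodel.base_models`):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    smoldocling_vlm_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Configure the VLM pipeline with the default SmolDocling preset.
pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = smoldocling_vlm_conversion_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        ),
    }
)

result = converter.convert("https://arxiv.org/pdf/2206.01062")
print(result.document.export_to_markdown())
```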
{docling-2.27.0 → docling-2.28.1}/docling/datamodel/pipeline_options.py

@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
      MARKDOWN = "markdown"


+ class InferenceFramework(str, Enum):
+     MLX = "mlx"
+     TRANSFORMERS = "transformers"
+
+
  class HuggingFaceVlmOptions(BaseVlmOptions):
      kind: Literal["hf_model_options"] = "hf_model_options"

@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
      llm_int8_threshold: float = 6.0
      quantized: bool = False

+     inference_framework: InferenceFramework
      response_format: ResponseFormat

      @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
          return self.repo_id.replace("/", "--")


+ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+     repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+     prompt="Convert this page to docling.",
+     response_format=ResponseFormat.DOCTAGS,
+     inference_framework=InferenceFramework.MLX,
+ )
+
+
  smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
      repo_id="ds4sd/SmolDocling-256M-preview",
      prompt="Convert this page to docling.",
      response_format=ResponseFormat.DOCTAGS,
+     inference_framework=InferenceFramework.TRANSFORMERS,
  )

  granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
      # prompt="OCR the full page to markdown.",
      prompt="OCR this image.",
      response_format=ResponseFormat.MARKDOWN,
+     inference_framework=InferenceFramework.TRANSFORMERS,
  )


+ class VlmModelType(str, Enum):
+     SMOLDOCLING = "smoldocling"
+     GRANITE_VISION = "granite_vision"
+
+
  # Define an enum for the backend options
  class PdfBackend(str, Enum):
      """Enum of valid PDF backends."""
@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):


  class PaginatedPipelineOptions(PipelineOptions):
+     artifacts_path: Optional[Union[Path, str]] = None
+
      images_scale: float = 1.0
      generate_page_images: bool = False
      generate_picture_images: bool = False


  class VlmPipelineOptions(PaginatedPipelineOptions):
-     artifacts_path: Optional[Union[Path, str]] = None

      generate_page_images: bool = True
      force_backend_text: bool = (
@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
  class PdfPipelineOptions(PaginatedPipelineOptions):
      """Options for the PDF pipeline."""

-     artifacts_path: Optional[Union[Path, str]] = None
      do_table_structure: bool = True  # True: perform table structure extraction
      do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
      do_code_enrichment: bool = False  # True: perform code OCR
@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
      )

      generate_parsed_pages: bool = False
+
+
+ class PdfPipeline(str, Enum):
+     STANDARD = "standard"
+     VLM = "vlm"
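The new fields compose as plain pydantic options: `HuggingFaceVlmOptions` now records which inference framework to use, and the presets above differ only in `repo_id`, `response_format`, and `inference_framework`. A hedged sketch of defining a custom preset in the same shape (the repo id below is a placeholder, not a published model):

```python
from docling.datamodel.pipeline_options import (
    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
    VlmPipelineOptions,
)

# Placeholder repo id; substitute any VLM checkpoint that emits Markdown.
my_vlm_options = HuggingFaceVlmOptions(
    repo_id="my-org/my-markdown-vlm",
    prompt="OCR this image.",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
)

# Attach it to the VLM pipeline options, as the CLI does for its presets.
pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = my_vlm_options
```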