docling 2.27.0__tar.gz → 2.28.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.27.0 → docling-2.28.1}/PKG-INFO +27 -32
- {docling-2.27.0 → docling-2.28.1}/README.md +25 -30
- {docling-2.27.0 → docling-2.28.1}/docling/backend/docling_parse_v4_backend.py +20 -13
- {docling-2.27.0 → docling-2.28.1}/docling/backend/mspowerpoint_backend.py +18 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/msword_backend.py +56 -14
- {docling-2.27.0 → docling-2.28.1}/docling/cli/main.py +81 -38
- {docling-2.27.0 → docling-2.28.1}/docling/datamodel/pipeline_options.py +28 -2
- {docling-2.27.0 → docling-2.28.1}/docling/document_converter.py +29 -17
- docling-2.28.1/docling/models/hf_mlx_model.py +137 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/page_preprocessing_model.py +7 -1
- docling-2.28.1/docling/pipeline/vlm_pipeline.py +214 -0
- {docling-2.27.0 → docling-2.28.1}/pyproject.toml +3 -2
- docling-2.27.0/docling/pipeline/vlm_pipeline.py +0 -534
- {docling-2.27.0 → docling-2.28.1}/LICENSE +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/abstract_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/csv_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/docx/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/html_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/json/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/md_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/pdf_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/xml/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/chunking/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/cli/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/cli/models.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/cli/tools.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/datamodel/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/datamodel/base_models.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/datamodel/document.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/datamodel/settings.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/exceptions.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/base_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/base_ocr_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/code_formula_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/easyocr_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/factories/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/factories/base_factory.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/hf_vlm_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/layout_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/page_assemble_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/plugins/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/plugins/defaults.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/readingorder_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/table_structure_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/pipeline/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/py.typed +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/__init__.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/export.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/glm_utils.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/locks.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/model_downloader.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/ocr_utils.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/profiling.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/utils.py +0 -0
- {docling-2.27.0 → docling-2.28.1}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.27.0
|
3
|
+
Version: 2.28.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -28,7 +28,7 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
29
29
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
30
30
|
Requires-Dist: certifi (>=2024.7.4)
|
31
|
-
Requires-Dist: docling-core[chunking] (>=2.23.0,<3.0.0)
|
31
|
+
Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
|
32
32
|
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
33
33
|
Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
|
34
34
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
|
|
86
86
|
[](https://opensource.org/licenses/MIT)
|
87
87
|
[](https://pepy.tech/projects/docling)
|
88
88
|
[](https://apify.com/vancura/docling)
|
89
|
+
[](https://lfaidata.foundation/projects/)
|
89
90
|
|
90
91
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
91
92
|
|
@@ -98,12 +99,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
98
99
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
99
100
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
100
101
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
102
|
+
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
|
101
103
|
* 💻 Simple and convenient CLI
|
102
104
|
|
103
105
|
### Coming soon
|
104
106
|
|
105
107
|
* 📝 Metadata extraction, including title, authors, references & language
|
106
|
-
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
|
107
108
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
108
109
|
* 📝 Complex chemistry understanding (Molecular structures)
|
109
110
|
|
@@ -120,7 +121,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl
|
|
120
121
|
|
121
122
|
## Getting started
|
122
123
|
|
123
|
-
To convert individual documents, use `convert()`, for example:
|
124
|
+
To convert individual documents with python, use `convert()`, for example:
|
124
125
|
|
125
126
|
```python
|
126
127
|
from docling.document_converter import DocumentConverter
|
@@ -134,6 +135,22 @@ print(result.document.export_to_markdown()) # output: "## Docling Technical Rep
|
|
134
135
|
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
135
136
|
the docs.
|
136
137
|
|
138
|
+
## CLI
|
139
|
+
|
140
|
+
Docling has a built-in CLI to run conversions.
|
141
|
+
|
142
|
+
```bash
|
143
|
+
docling https://arxiv.org/pdf/2206.01062
|
144
|
+
```
|
145
|
+
|
146
|
+
You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
|
147
|
+
```bash
|
148
|
+
docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
|
149
|
+
```
|
150
|
+
This will use MLX acceleration on supported Apple Silicon hardware.
|
151
|
+
|
152
|
+
Read more [here](https://docling-project.github.io/docling/usage/)
|
153
|
+
|
137
154
|
## Documentation
|
138
155
|
|
139
156
|
Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
|
@@ -150,32 +167,6 @@ To further accelerate your AI application development, check out Docling's nativ
|
|
150
167
|
[integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
|
151
168
|
and tools.
|
152
169
|
|
153
|
-
## Apify Actor
|
154
|
-
|
155
|
-
<a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
|
156
|
-
|
157
|
-
You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on Apify platform. Simply provide a document URL and get the processed result:
|
158
|
-
|
159
|
-
```bash
|
160
|
-
apify call vancura/docling -i '{
|
161
|
-
"options": {
|
162
|
-
"to_formats": ["md", "json", "html", "text", "doctags"]
|
163
|
-
},
|
164
|
-
"http_sources": [
|
165
|
-
{"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
|
166
|
-
{"url": "https://arxiv.org/pdf/2408.09869"}
|
167
|
-
]
|
168
|
-
}'
|
169
|
-
```
|
170
|
-
|
171
|
-
The Actor stores results in:
|
172
|
-
|
173
|
-
* Processed document in key-value store (`OUTPUT_RESULT`)
|
174
|
-
* Processing logs (`DOCLING_LOG`)
|
175
|
-
* Dataset record with result URL and status
|
176
|
-
|
177
|
-
Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
|
178
|
-
|
179
170
|
## Get help and support
|
180
171
|
|
181
172
|
Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
|
@@ -210,9 +201,13 @@ If you use Docling in your projects, please consider citing the following:
|
|
210
201
|
The Docling codebase is under MIT license.
|
211
202
|
For individual model usage, please refer to the model licenses found in the original packages.
|
212
203
|
|
213
|
-
## IBM ❤️ Open Source AI
|
204
|
+
## LF AI & Data
|
205
|
+
|
206
|
+
Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
|
207
|
+
|
208
|
+
### IBM ❤️ Open Source AI
|
214
209
|
|
215
|
-
|
210
|
+
The project was started by the AI for knowledge team at IBM Research Zurich.
|
216
211
|
|
217
212
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
218
213
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
@@ -22,6 +22,7 @@
|
|
22
22
|
[](https://opensource.org/licenses/MIT)
|
23
23
|
[](https://pepy.tech/projects/docling)
|
24
24
|
[](https://apify.com/vancura/docling)
|
25
|
+
[](https://lfaidata.foundation/projects/)
|
25
26
|
|
26
27
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
27
28
|
|
@@ -34,12 +35,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
34
35
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
35
36
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
36
37
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
38
|
+
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
|
37
39
|
* 💻 Simple and convenient CLI
|
38
40
|
|
39
41
|
### Coming soon
|
40
42
|
|
41
43
|
* 📝 Metadata extraction, including title, authors, references & language
|
42
|
-
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
|
43
44
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
44
45
|
* 📝 Complex chemistry understanding (Molecular structures)
|
45
46
|
|
@@ -56,7 +57,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl
|
|
56
57
|
|
57
58
|
## Getting started
|
58
59
|
|
59
|
-
To convert individual documents, use `convert()`, for example:
|
60
|
+
To convert individual documents with python, use `convert()`, for example:
|
60
61
|
|
61
62
|
```python
|
62
63
|
from docling.document_converter import DocumentConverter
|
@@ -70,6 +71,22 @@ print(result.document.export_to_markdown()) # output: "## Docling Technical Rep
|
|
70
71
|
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
71
72
|
the docs.
|
72
73
|
|
74
|
+
## CLI
|
75
|
+
|
76
|
+
Docling has a built-in CLI to run conversions.
|
77
|
+
|
78
|
+
```bash
|
79
|
+
docling https://arxiv.org/pdf/2206.01062
|
80
|
+
```
|
81
|
+
|
82
|
+
You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
|
83
|
+
```bash
|
84
|
+
docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
|
85
|
+
```
|
86
|
+
This will use MLX acceleration on supported Apple Silicon hardware.
|
87
|
+
|
88
|
+
Read more [here](https://docling-project.github.io/docling/usage/)
|
89
|
+
|
73
90
|
## Documentation
|
74
91
|
|
75
92
|
Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
|
@@ -86,32 +103,6 @@ To further accelerate your AI application development, check out Docling's nativ
|
|
86
103
|
[integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
|
87
104
|
and tools.
|
88
105
|
|
89
|
-
## Apify Actor
|
90
|
-
|
91
|
-
<a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
|
92
|
-
|
93
|
-
You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on Apify platform. Simply provide a document URL and get the processed result:
|
94
|
-
|
95
|
-
```bash
|
96
|
-
apify call vancura/docling -i '{
|
97
|
-
"options": {
|
98
|
-
"to_formats": ["md", "json", "html", "text", "doctags"]
|
99
|
-
},
|
100
|
-
"http_sources": [
|
101
|
-
{"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
|
102
|
-
{"url": "https://arxiv.org/pdf/2408.09869"}
|
103
|
-
]
|
104
|
-
}'
|
105
|
-
```
|
106
|
-
|
107
|
-
The Actor stores results in:
|
108
|
-
|
109
|
-
* Processed document in key-value store (`OUTPUT_RESULT`)
|
110
|
-
* Processing logs (`DOCLING_LOG`)
|
111
|
-
* Dataset record with result URL and status
|
112
|
-
|
113
|
-
Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
|
114
|
-
|
115
106
|
## Get help and support
|
116
107
|
|
117
108
|
Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
|
@@ -146,9 +137,13 @@ If you use Docling in your projects, please consider citing the following:
|
|
146
137
|
The Docling codebase is under MIT license.
|
147
138
|
For individual model usage, please refer to the model licenses found in the original packages.
|
148
139
|
|
149
|
-
## IBM ❤️ Open Source AI
|
140
|
+
## LF AI & Data
|
141
|
+
|
142
|
+
Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
|
143
|
+
|
144
|
+
### IBM ❤️ Open Source AI
|
150
145
|
|
151
|
-
|
146
|
+
The project was started by the AI for knowledge team at IBM Research Zurich.
|
152
147
|
|
153
148
|
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
154
149
|
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
@@ -112,23 +112,30 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
112
112
|
padbox.r = page_size.width - padbox.r
|
113
113
|
padbox.t = page_size.height - padbox.t
|
114
114
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
115
|
+
with pypdfium2_lock:
|
116
|
+
image = (
|
117
|
+
self._ppage.render(
|
118
|
+
scale=scale * 1.5,
|
119
|
+
rotation=0, # no additional rotation
|
120
|
+
crop=padbox.as_tuple(),
|
121
|
+
)
|
122
|
+
.to_pil()
|
123
|
+
.resize(
|
124
|
+
size=(round(cropbox.width * scale), round(cropbox.height * scale))
|
125
|
+
)
|
126
|
+
) # We resize the image from 1.5x the given scale to make it sharper.
|
124
127
|
|
125
128
|
return image
|
126
129
|
|
127
130
|
def get_size(self) -> Size:
|
128
|
-
|
129
|
-
width=self.
|
130
|
-
|
131
|
-
|
131
|
+
with pypdfium2_lock:
|
132
|
+
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
133
|
+
|
134
|
+
# TODO: Take width and height from docling-parse.
|
135
|
+
# return Size(
|
136
|
+
# width=self._dpage.dimension.width,
|
137
|
+
# height=self._dpage.dimension.height,
|
138
|
+
# )
|
132
139
|
|
133
140
|
def unload(self):
|
134
141
|
self._ppage = None
|
@@ -16,6 +16,7 @@ from docling_core.types.doc import (
|
|
16
16
|
TableCell,
|
17
17
|
TableData,
|
18
18
|
)
|
19
|
+
from docling_core.types.doc.document import ContentLayer
|
19
20
|
from PIL import Image, UnidentifiedImageError
|
20
21
|
from pptx import Presentation
|
21
22
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
421
422
|
for shape in slide.shapes:
|
422
423
|
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
|
423
424
|
|
425
|
+
# Handle notes slide
|
426
|
+
if slide.has_notes_slide:
|
427
|
+
notes_slide = slide.notes_slide
|
428
|
+
notes_text = notes_slide.notes_text_frame.text.strip()
|
429
|
+
if notes_text:
|
430
|
+
bbox = BoundingBox(l=0, t=0, r=0, b=0)
|
431
|
+
prov = ProvenanceItem(
|
432
|
+
page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
|
433
|
+
)
|
434
|
+
doc.add_text(
|
435
|
+
label=DocItemLabel.TEXT,
|
436
|
+
parent=parent_slide,
|
437
|
+
text=notes_text,
|
438
|
+
prov=prov,
|
439
|
+
content_layer=ContentLayer.FURNITURE,
|
440
|
+
)
|
441
|
+
|
424
442
|
return doc
|
@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
53
53
|
self.max_levels: int = 10
|
54
54
|
self.level_at_new_list: Optional[int] = None
|
55
55
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
56
|
+
self.numbered_headers: dict[int, int] = {}
|
56
57
|
for i in range(-1, self.max_levels):
|
57
58
|
self.parents[i] = None
|
58
59
|
|
@@ -275,8 +276,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
275
276
|
only_equations.append(latex_equation)
|
276
277
|
texts_and_equations.append(latex_equation)
|
277
278
|
|
278
|
-
if "".join(only_texts) != text:
|
279
|
-
|
279
|
+
if "".join(only_texts).strip() != text.strip():
|
280
|
+
# If we are not able to reconstruct the initial raw text
|
281
|
+
# do not try to parse equations and return the original
|
282
|
+
return text, []
|
280
283
|
|
281
284
|
return "".join(texts_and_equations), only_equations
|
282
285
|
|
@@ -344,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
344
347
|
parent=None, label=DocItemLabel.TITLE, text=text
|
345
348
|
)
|
346
349
|
elif "Heading" in p_style_id:
|
347
|
-
|
350
|
+
style_element = getattr(paragraph.style, "element", None)
|
351
|
+
if style_element:
|
352
|
+
is_numbered_style = (
|
353
|
+
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
|
354
|
+
)
|
355
|
+
else:
|
356
|
+
is_numbered_style = False
|
357
|
+
self.add_header(doc, p_level, text, is_numbered_style)
|
348
358
|
|
349
359
|
elif len(equations) > 0:
|
350
360
|
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
@@ -365,6 +375,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
365
375
|
for eq in equations:
|
366
376
|
if len(text_tmp) == 0:
|
367
377
|
break
|
378
|
+
|
368
379
|
pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
|
369
380
|
text_tmp = text_tmp.split(eq, maxsplit=1)[1]
|
370
381
|
if len(pre_eq_text) > 0:
|
@@ -412,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
412
423
|
return
|
413
424
|
|
414
425
|
def add_header(
|
415
|
-
self,
|
426
|
+
self,
|
427
|
+
doc: DoclingDocument,
|
428
|
+
curr_level: Optional[int],
|
429
|
+
text: str,
|
430
|
+
is_numbered_style: bool = False,
|
416
431
|
) -> None:
|
417
432
|
level = self.get_level()
|
418
433
|
if isinstance(curr_level, int):
|
@@ -430,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
430
445
|
if key >= curr_level:
|
431
446
|
self.parents[key] = None
|
432
447
|
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
level=curr_level,
|
437
|
-
)
|
448
|
+
current_level = curr_level
|
449
|
+
parent_level = curr_level - 1
|
450
|
+
add_level = curr_level
|
438
451
|
else:
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
452
|
+
current_level = self.level
|
453
|
+
parent_level = self.level - 1
|
454
|
+
add_level = 1
|
455
|
+
|
456
|
+
if is_numbered_style:
|
457
|
+
if add_level in self.numbered_headers:
|
458
|
+
self.numbered_headers[add_level] += 1
|
459
|
+
else:
|
460
|
+
self.numbered_headers[add_level] = 1
|
461
|
+
text = f"{self.numbered_headers[add_level]} {text}"
|
462
|
+
|
463
|
+
# Reset deeper levels
|
464
|
+
next_level = add_level + 1
|
465
|
+
while next_level in self.numbered_headers:
|
466
|
+
self.numbered_headers[next_level] = 0
|
467
|
+
next_level += 1
|
468
|
+
|
469
|
+
# Scan upper levels
|
470
|
+
previous_level = add_level - 1
|
471
|
+
while previous_level in self.numbered_headers:
|
472
|
+
# MSWord convention: no empty sublevels
|
473
|
+
# I.e., sub-sub section (2.0.1) without a sub-section (2.1)
|
474
|
+
# is processed as 2.1.1
|
475
|
+
if self.numbered_headers[previous_level] == 0:
|
476
|
+
self.numbered_headers[previous_level] += 1
|
477
|
+
|
478
|
+
text = f"{self.numbered_headers[previous_level]}.{text}"
|
479
|
+
previous_level -= 1
|
480
|
+
|
481
|
+
self.parents[current_level] = doc.add_heading(
|
482
|
+
parent=self.parents[parent_level],
|
483
|
+
text=text,
|
484
|
+
level=add_level,
|
485
|
+
)
|
444
486
|
return
|
445
487
|
|
446
488
|
def add_listitem(
|
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
|
|
32
32
|
AcceleratorOptions,
|
33
33
|
EasyOcrOptions,
|
34
34
|
OcrOptions,
|
35
|
+
PaginatedPipelineOptions,
|
35
36
|
PdfBackend,
|
37
|
+
PdfPipeline,
|
36
38
|
PdfPipelineOptions,
|
37
39
|
TableFormerMode,
|
40
|
+
VlmModelType,
|
41
|
+
VlmPipelineOptions,
|
42
|
+
granite_vision_vlm_conversion_options,
|
43
|
+
smoldocling_vlm_conversion_options,
|
44
|
+
smoldocling_vlm_mlx_conversion_options,
|
38
45
|
)
|
39
46
|
from docling.datamodel.settings import settings
|
40
47
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
41
48
|
from docling.models.factories import get_ocr_factory
|
49
|
+
from docling.pipeline.vlm_pipeline import VlmPipeline
|
42
50
|
|
43
51
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
44
52
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
@@ -200,6 +208,14 @@ def convert(
|
|
200
208
|
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
201
209
|
),
|
202
210
|
] = ImageRefMode.EMBEDDED,
|
211
|
+
pipeline: Annotated[
|
212
|
+
PdfPipeline,
|
213
|
+
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
|
214
|
+
] = PdfPipeline.STANDARD,
|
215
|
+
vlm_model: Annotated[
|
216
|
+
VlmModelType,
|
217
|
+
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
218
|
+
] = VlmModelType.SMOLDOCLING,
|
203
219
|
ocr: Annotated[
|
204
220
|
bool,
|
205
221
|
typer.Option(
|
@@ -420,50 +436,77 @@ def convert(
|
|
420
436
|
ocr_options.lang = ocr_lang_list
|
421
437
|
|
422
438
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
423
|
-
pipeline_options
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
439
|
+
pipeline_options: PaginatedPipelineOptions
|
440
|
+
|
441
|
+
if pipeline == PdfPipeline.STANDARD:
|
442
|
+
pipeline_options = PdfPipelineOptions(
|
443
|
+
allow_external_plugins=allow_external_plugins,
|
444
|
+
enable_remote_services=enable_remote_services,
|
445
|
+
accelerator_options=accelerator_options,
|
446
|
+
do_ocr=ocr,
|
447
|
+
ocr_options=ocr_options,
|
448
|
+
do_table_structure=True,
|
449
|
+
do_code_enrichment=enrich_code,
|
450
|
+
do_formula_enrichment=enrich_formula,
|
451
|
+
do_picture_description=enrich_picture_description,
|
452
|
+
do_picture_classification=enrich_picture_classes,
|
453
|
+
document_timeout=document_timeout,
|
454
|
+
)
|
455
|
+
pipeline_options.table_structure_options.do_cell_matching = (
|
456
|
+
True # do_cell_matching
|
457
|
+
)
|
458
|
+
pipeline_options.table_structure_options.mode = table_mode
|
459
|
+
|
460
|
+
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
461
|
+
pipeline_options.generate_page_images = True
|
462
|
+
pipeline_options.generate_picture_images = (
|
463
|
+
True # FIXME: to be deprecated in verson 3
|
464
|
+
)
|
465
|
+
pipeline_options.images_scale = 2
|
466
|
+
|
467
|
+
backend: Type[PdfDocumentBackend]
|
468
|
+
if pdf_backend == PdfBackend.DLPARSE_V1:
|
469
|
+
backend = DoclingParseDocumentBackend
|
470
|
+
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
471
|
+
backend = DoclingParseV2DocumentBackend
|
472
|
+
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
473
|
+
backend = DoclingParseV4DocumentBackend # type: ignore
|
474
|
+
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
475
|
+
backend = PyPdfiumDocumentBackend # type: ignore
|
476
|
+
else:
|
477
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
478
|
+
|
479
|
+
pdf_format_option = PdfFormatOption(
|
480
|
+
pipeline_options=pipeline_options,
|
481
|
+
backend=backend, # pdf_backend
|
482
|
+
)
|
483
|
+
elif pipeline == PdfPipeline.VLM:
|
484
|
+
pipeline_options = VlmPipelineOptions()
|
485
|
+
|
486
|
+
if vlm_model == VlmModelType.GRANITE_VISION:
|
487
|
+
pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
488
|
+
elif vlm_model == VlmModelType.SMOLDOCLING:
|
489
|
+
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
490
|
+
if sys.platform == "darwin":
|
491
|
+
try:
|
492
|
+
import mlx_vlm
|
493
|
+
|
494
|
+
pipeline_options.vlm_options = (
|
495
|
+
smoldocling_vlm_mlx_conversion_options
|
496
|
+
)
|
497
|
+
except ImportError:
|
498
|
+
_log.warning(
|
499
|
+
"To run SmolDocling faster, please install mlx-vlm:\n"
|
500
|
+
"pip install mlx-vlm"
|
501
|
+
)
|
440
502
|
|
441
|
-
|
442
|
-
|
443
|
-
pipeline_options.generate_picture_images = (
|
444
|
-
True # FIXME: to be deprecated in verson 3
|
503
|
+
pdf_format_option = PdfFormatOption(
|
504
|
+
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
445
505
|
)
|
446
|
-
pipeline_options.images_scale = 2
|
447
506
|
|
448
507
|
if artifacts_path is not None:
|
449
508
|
pipeline_options.artifacts_path = artifacts_path
|
450
509
|
|
451
|
-
backend: Type[PdfDocumentBackend]
|
452
|
-
if pdf_backend == PdfBackend.DLPARSE_V1:
|
453
|
-
backend = DoclingParseDocumentBackend
|
454
|
-
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
455
|
-
backend = DoclingParseV2DocumentBackend
|
456
|
-
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
457
|
-
backend = DoclingParseV4DocumentBackend # type: ignore
|
458
|
-
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
459
|
-
backend = PyPdfiumDocumentBackend # type: ignore
|
460
|
-
else:
|
461
|
-
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
462
|
-
|
463
|
-
pdf_format_option = PdfFormatOption(
|
464
|
-
pipeline_options=pipeline_options,
|
465
|
-
backend=backend, # pdf_backend
|
466
|
-
)
|
467
510
|
format_options: Dict[InputFormat, FormatOption] = {
|
468
511
|
InputFormat.PDF: pdf_format_option,
|
469
512
|
InputFormat.IMAGE: pdf_format_option,
|
@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
|
|
263
263
|
MARKDOWN = "markdown"
|
264
264
|
|
265
265
|
|
266
|
+
class InferenceFramework(str, Enum):
|
267
|
+
MLX = "mlx"
|
268
|
+
TRANSFORMERS = "transformers"
|
269
|
+
|
270
|
+
|
266
271
|
class HuggingFaceVlmOptions(BaseVlmOptions):
|
267
272
|
kind: Literal["hf_model_options"] = "hf_model_options"
|
268
273
|
|
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
|
|
271
276
|
llm_int8_threshold: float = 6.0
|
272
277
|
quantized: bool = False
|
273
278
|
|
279
|
+
inference_framework: InferenceFramework
|
274
280
|
response_format: ResponseFormat
|
275
281
|
|
276
282
|
@property
|
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
|
|
278
284
|
return self.repo_id.replace("/", "--")
|
279
285
|
|
280
286
|
|
287
|
+
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
288
|
+
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
289
|
+
prompt="Convert this page to docling.",
|
290
|
+
response_format=ResponseFormat.DOCTAGS,
|
291
|
+
inference_framework=InferenceFramework.MLX,
|
292
|
+
)
|
293
|
+
|
294
|
+
|
281
295
|
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
|
282
296
|
repo_id="ds4sd/SmolDocling-256M-preview",
|
283
297
|
prompt="Convert this page to docling.",
|
284
298
|
response_format=ResponseFormat.DOCTAGS,
|
299
|
+
inference_framework=InferenceFramework.TRANSFORMERS,
|
285
300
|
)
|
286
301
|
|
287
302
|
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
|
289
304
|
# prompt="OCR the full page to markdown.",
|
290
305
|
prompt="OCR this image.",
|
291
306
|
response_format=ResponseFormat.MARKDOWN,
|
307
|
+
inference_framework=InferenceFramework.TRANSFORMERS,
|
292
308
|
)
|
293
309
|
|
294
310
|
|
311
|
+
class VlmModelType(str, Enum):
|
312
|
+
SMOLDOCLING = "smoldocling"
|
313
|
+
GRANITE_VISION = "granite_vision"
|
314
|
+
|
315
|
+
|
295
316
|
# Define an enum for the backend options
|
296
317
|
class PdfBackend(str, Enum):
|
297
318
|
"""Enum of valid PDF backends."""
|
@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):
|
|
327
348
|
|
328
349
|
|
329
350
|
class PaginatedPipelineOptions(PipelineOptions):
|
351
|
+
artifacts_path: Optional[Union[Path, str]] = None
|
352
|
+
|
330
353
|
images_scale: float = 1.0
|
331
354
|
generate_page_images: bool = False
|
332
355
|
generate_picture_images: bool = False
|
333
356
|
|
334
357
|
|
335
358
|
class VlmPipelineOptions(PaginatedPipelineOptions):
|
336
|
-
artifacts_path: Optional[Union[Path, str]] = None
|
337
359
|
|
338
360
|
generate_page_images: bool = True
|
339
361
|
force_backend_text: bool = (
|
@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
346
368
|
class PdfPipelineOptions(PaginatedPipelineOptions):
|
347
369
|
"""Options for the PDF pipeline."""
|
348
370
|
|
349
|
-
artifacts_path: Optional[Union[Path, str]] = None
|
350
371
|
do_table_structure: bool = True # True: perform table structure extraction
|
351
372
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
352
373
|
do_code_enrichment: bool = False # True: perform code OCR
|
@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
377
398
|
)
|
378
399
|
|
379
400
|
generate_parsed_pages: bool = False
|
401
|
+
|
402
|
+
|
403
|
+
class PdfPipeline(str, Enum):
|
404
|
+
STANDARD = "standard"
|
405
|
+
VLM = "vlm"
|