docling 2.26.0__tar.gz → 2.28.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.26.0 → docling-2.28.0}/PKG-INFO +47 -23
- {docling-2.26.0 → docling-2.28.0}/README.md +40 -18
- {docling-2.26.0 → docling-2.28.0}/docling/backend/asciidoc_backend.py +1 -1
- {docling-2.26.0 → docling-2.28.0}/docling/backend/csv_backend.py +1 -1
- {docling-2.26.0 → docling-2.28.0}/docling/backend/docling_parse_backend.py +21 -13
- {docling-2.26.0 → docling-2.28.0}/docling/backend/docling_parse_v2_backend.py +20 -12
- docling-2.28.0/docling/backend/docling_parse_v4_backend.py +192 -0
- docling-2.28.0/docling/backend/docx/latex/latex_dict.py +271 -0
- docling-2.28.0/docling/backend/docx/latex/omml.py +453 -0
- {docling-2.26.0 → docling-2.28.0}/docling/backend/html_backend.py +7 -7
- {docling-2.26.0 → docling-2.28.0}/docling/backend/md_backend.py +1 -1
- {docling-2.26.0 → docling-2.28.0}/docling/backend/msexcel_backend.py +2 -45
- {docling-2.26.0 → docling-2.28.0}/docling/backend/mspowerpoint_backend.py +19 -1
- {docling-2.26.0 → docling-2.28.0}/docling/backend/msword_backend.py +68 -3
- {docling-2.26.0 → docling-2.28.0}/docling/backend/pdf_backend.py +7 -2
- {docling-2.26.0 → docling-2.28.0}/docling/backend/pypdfium2_backend.py +52 -30
- {docling-2.26.0 → docling-2.28.0}/docling/backend/xml/uspto_backend.py +1 -1
- {docling-2.26.0 → docling-2.28.0}/docling/cli/main.py +135 -53
- {docling-2.26.0 → docling-2.28.0}/docling/cli/models.py +1 -1
- {docling-2.26.0 → docling-2.28.0}/docling/datamodel/base_models.py +8 -10
- {docling-2.26.0 → docling-2.28.0}/docling/datamodel/pipeline_options.py +54 -32
- {docling-2.26.0 → docling-2.28.0}/docling/document_converter.py +5 -5
- {docling-2.26.0 → docling-2.28.0}/docling/models/base_model.py +9 -1
- {docling-2.26.0 → docling-2.28.0}/docling/models/base_ocr_model.py +27 -16
- {docling-2.26.0 → docling-2.28.0}/docling/models/easyocr_model.py +28 -13
- docling-2.28.0/docling/models/factories/__init__.py +27 -0
- docling-2.28.0/docling/models/factories/base_factory.py +122 -0
- docling-2.28.0/docling/models/factories/ocr_factory.py +11 -0
- docling-2.28.0/docling/models/factories/picture_description_factory.py +11 -0
- docling-2.28.0/docling/models/hf_mlx_model.py +137 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/ocr_mac_model.py +39 -11
- {docling-2.26.0 → docling-2.28.0}/docling/models/page_preprocessing_model.py +4 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/picture_description_api_model.py +20 -3
- {docling-2.26.0 → docling-2.28.0}/docling/models/picture_description_base_model.py +19 -3
- {docling-2.26.0 → docling-2.28.0}/docling/models/picture_description_vlm_model.py +14 -2
- docling-2.28.0/docling/models/plugins/__init__.py +0 -0
- docling-2.28.0/docling/models/plugins/defaults.py +28 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/rapid_ocr_model.py +34 -13
- {docling-2.26.0 → docling-2.28.0}/docling/models/table_structure_model.py +13 -4
- {docling-2.26.0 → docling-2.28.0}/docling/models/tesseract_ocr_cli_model.py +40 -15
- {docling-2.26.0 → docling-2.28.0}/docling/models/tesseract_ocr_model.py +37 -12
- docling-2.28.0/docling/pipeline/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/pipeline/standard_pdf_pipeline.py +25 -78
- docling-2.28.0/docling/pipeline/vlm_pipeline.py +214 -0
- docling-2.28.0/docling/utils/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/utils/export.py +8 -6
- {docling-2.26.0 → docling-2.28.0}/docling/utils/layout_postprocessor.py +26 -23
- {docling-2.26.0 → docling-2.28.0}/docling/utils/visualization.py +1 -1
- {docling-2.26.0 → docling-2.28.0}/pyproject.toml +71 -44
- docling-2.26.0/docling/pipeline/vlm_pipeline.py +0 -534
- {docling-2.26.0 → docling-2.28.0}/LICENSE +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/backend/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.26.0/docling/backend/json → docling-2.28.0/docling/backend/docx}/__init__.py +0 -0
- {docling-2.26.0/docling/backend/xml → docling-2.28.0/docling/backend/docx/latex}/__init__.py +0 -0
- {docling-2.26.0/docling/cli → docling-2.28.0/docling/backend/json}/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.26.0/docling/datamodel → docling-2.28.0/docling/backend/xml}/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/chunking/__init__.py +0 -0
- {docling-2.26.0/docling/models → docling-2.28.0/docling/cli}/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/cli/tools.py +0 -0
- {docling-2.26.0/docling/pipeline → docling-2.28.0/docling/datamodel}/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/datamodel/document.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/datamodel/settings.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/exceptions.py +0 -0
- {docling-2.26.0/docling/utils → docling-2.28.0/docling/models}/__init__.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/hf_vlm_model.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/layout_model.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/py.typed +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/utils/locks.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/utils/profiling.py +0 -0
- {docling-2.26.0 → docling-2.28.0}/docling/utils/utils.py +0 -0
@@ -1,8 +1,8 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.28.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
|
-
Home-page: https://github.com/
|
5
|
+
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
7
7
|
Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
|
8
8
|
Author: Christoph Auer
|
@@ -28,9 +28,9 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
29
29
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
30
30
|
Requires-Dist: certifi (>=2024.7.4)
|
31
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
31
|
+
Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
|
32
32
|
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
33
|
-
Requires-Dist: docling-parse (>=
|
33
|
+
Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
|
34
34
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
35
35
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
36
36
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -42,8 +42,10 @@ Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (ex
|
|
42
42
|
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
43
43
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
44
44
|
Requires-Dist: pillow (>=10.0.0,<12.0.0)
|
45
|
+
Requires-Dist: pluggy (>=1.0.0,<2.0.0)
|
45
46
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
46
47
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
48
|
+
Requires-Dist: pylatexenc (>=2.10,<3.0)
|
47
49
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
48
50
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
49
51
|
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
@@ -57,12 +59,12 @@ Requires-Dist: tqdm (>=4.65.0,<5.0.0)
|
|
57
59
|
Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
|
58
60
|
Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
59
61
|
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
60
|
-
Project-URL: Repository, https://github.com/
|
62
|
+
Project-URL: Repository, https://github.com/docling-project/docling
|
61
63
|
Description-Content-Type: text/markdown
|
62
64
|
|
63
65
|
<p align="center">
|
64
|
-
<a href="https://github.com/
|
65
|
-
<img loading="lazy" alt="Docling" src="https://github.com/
|
66
|
+
<a href="https://github.com/docling-project/docling">
|
67
|
+
<img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
|
66
68
|
</a>
|
67
69
|
</p>
|
68
70
|
|
@@ -73,7 +75,7 @@ Description-Content-Type: text/markdown
|
|
73
75
|
</p>
|
74
76
|
|
75
77
|
[](https://arxiv.org/abs/2408.09869)
|
76
|
-
[](https://
|
78
|
+
[](https://docling-project.github.io/docling/)
|
77
79
|
[](https://pypi.org/project/docling/)
|
78
80
|
[](https://pypi.org/project/docling/)
|
79
81
|
[](https://python-poetry.org/)
|
@@ -81,8 +83,10 @@ Description-Content-Type: text/markdown
|
|
81
83
|
[](https://pycqa.github.io/isort/)
|
82
84
|
[](https://pydantic.dev)
|
83
85
|
[](https://github.com/pre-commit/pre-commit)
|
84
|
-
[](https://opensource.org/licenses/MIT)
|
85
87
|
[](https://pepy.tech/projects/docling)
|
88
|
+
[](https://apify.com/vancura/docling)
|
89
|
+
[](https://lfaidata.foundation/projects/)
|
86
90
|
|
87
91
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
88
92
|
|
@@ -95,12 +99,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
95
99
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
96
100
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
97
101
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
102
|
+
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
|
98
103
|
* 💻 Simple and convenient CLI
|
99
104
|
|
100
105
|
### Coming soon
|
101
106
|
|
102
107
|
* 📝 Metadata extraction, including title, authors, references & language
|
103
|
-
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
|
104
108
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
105
109
|
* 📝 Complex chemistry understanding (Molecular structures)
|
106
110
|
|
@@ -113,11 +117,11 @@ pip install docling
|
|
113
117
|
|
114
118
|
Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectures.
|
115
119
|
|
116
|
-
More [detailed installation instructions](https://
|
120
|
+
More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs.
|
117
121
|
|
118
122
|
## Getting started
|
119
123
|
|
120
|
-
To convert individual documents, use `convert()`, for example:
|
124
|
+
To convert individual documents with python, use `convert()`, for example:
|
121
125
|
|
122
126
|
```python
|
123
127
|
from docling.document_converter import DocumentConverter
|
@@ -128,28 +132,44 @@ result = converter.convert(source)
|
|
128
132
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
129
133
|
```
|
130
134
|
|
131
|
-
More [advanced usage options](https://
|
135
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
132
136
|
the docs.
|
133
137
|
|
138
|
+
## CLI
|
139
|
+
|
140
|
+
Docling has a built-in CLI to run conversions.
|
141
|
+
|
142
|
+
```bash
|
143
|
+
docling https://arxiv.org/pdf/2206.01062
|
144
|
+
```
|
145
|
+
|
146
|
+
You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
|
147
|
+
```bash
|
148
|
+
docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
|
149
|
+
```
|
150
|
+
This will use MLX acceleration on supported Apple Silicon hardware.
|
151
|
+
|
152
|
+
Read more [here](https://docling-project.github.io/docling/usage/)
|
153
|
+
|
134
154
|
## Documentation
|
135
155
|
|
136
|
-
Check out Docling's [documentation](https://
|
156
|
+
Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
|
137
157
|
installation, usage, concepts, recipes, extensions, and more.
|
138
158
|
|
139
159
|
## Examples
|
140
160
|
|
141
|
-
Go hands-on with our [examples](https://
|
161
|
+
Go hands-on with our [examples](https://docling-project.github.io/docling/examples/),
|
142
162
|
demonstrating how to address different application use cases with Docling.
|
143
163
|
|
144
164
|
## Integrations
|
145
165
|
|
146
166
|
To further accelerate your AI application development, check out Docling's native
|
147
|
-
[integrations](https://
|
167
|
+
[integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
|
148
168
|
and tools.
|
149
169
|
|
150
170
|
## Get help and support
|
151
171
|
|
152
|
-
Please feel free to connect with us using the [discussion section](https://github.com/
|
172
|
+
Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
|
153
173
|
|
154
174
|
## Technical report
|
155
175
|
|
@@ -157,7 +177,7 @@ For more details on Docling's inner workings, check out the [Docling Technical R
|
|
157
177
|
|
158
178
|
## Contributing
|
159
179
|
|
160
|
-
Please read [Contributing to Docling](https://github.com/
|
180
|
+
Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details.
|
161
181
|
|
162
182
|
## References
|
163
183
|
|
@@ -181,11 +201,15 @@ If you use Docling in your projects, please consider citing the following:
|
|
181
201
|
The Docling codebase is under MIT license.
|
182
202
|
For individual model usage, please refer to the model licenses found in the original packages.
|
183
203
|
|
184
|
-
##
|
204
|
+
## LF AI & Data
|
205
|
+
|
206
|
+
Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
|
207
|
+
|
208
|
+
### IBM ❤️ Open Source AI
|
185
209
|
|
186
|
-
|
210
|
+
The project was started by the AI for knowledge team at IBM Research Zurich.
|
187
211
|
|
188
|
-
[supported_formats]: https://
|
189
|
-
[docling_document]: https://
|
190
|
-
[integrations]: https://
|
212
|
+
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
213
|
+
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
214
|
+
[integrations]: https://docling-project.github.io/docling/integrations/
|
191
215
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
<p align="center">
|
2
|
-
<a href="https://github.com/
|
3
|
-
<img loading="lazy" alt="Docling" src="https://github.com/
|
2
|
+
<a href="https://github.com/docling-project/docling">
|
3
|
+
<img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
|
4
4
|
</a>
|
5
5
|
</p>
|
6
6
|
|
@@ -11,7 +11,7 @@
|
|
11
11
|
</p>
|
12
12
|
|
13
13
|
[](https://arxiv.org/abs/2408.09869)
|
14
|
-
[](https://
|
14
|
+
[](https://docling-project.github.io/docling/)
|
15
15
|
[](https://pypi.org/project/docling/)
|
16
16
|
[](https://pypi.org/project/docling/)
|
17
17
|
[](https://python-poetry.org/)
|
@@ -19,8 +19,10 @@
|
|
19
19
|
[](https://pycqa.github.io/isort/)
|
20
20
|
[](https://pydantic.dev)
|
21
21
|
[](https://github.com/pre-commit/pre-commit)
|
22
|
-
[](https://opensource.org/licenses/MIT)
|
23
23
|
[](https://pepy.tech/projects/docling)
|
24
|
+
[](https://apify.com/vancura/docling)
|
25
|
+
[](https://lfaidata.foundation/projects/)
|
24
26
|
|
25
27
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
26
28
|
|
@@ -33,12 +35,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
33
35
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
34
36
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
35
37
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
38
|
+
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
|
36
39
|
* 💻 Simple and convenient CLI
|
37
40
|
|
38
41
|
### Coming soon
|
39
42
|
|
40
43
|
* 📝 Metadata extraction, including title, authors, references & language
|
41
|
-
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
|
42
44
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
43
45
|
* 📝 Complex chemistry understanding (Molecular structures)
|
44
46
|
|
@@ -51,11 +53,11 @@ pip install docling
|
|
51
53
|
|
52
54
|
Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectures.
|
53
55
|
|
54
|
-
More [detailed installation instructions](https://
|
56
|
+
More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs.
|
55
57
|
|
56
58
|
## Getting started
|
57
59
|
|
58
|
-
To convert individual documents, use `convert()`, for example:
|
60
|
+
To convert individual documents with python, use `convert()`, for example:
|
59
61
|
|
60
62
|
```python
|
61
63
|
from docling.document_converter import DocumentConverter
|
@@ -66,28 +68,44 @@ result = converter.convert(source)
|
|
66
68
|
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
67
69
|
```
|
68
70
|
|
69
|
-
More [advanced usage options](https://
|
71
|
+
More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
|
70
72
|
the docs.
|
71
73
|
|
74
|
+
## CLI
|
75
|
+
|
76
|
+
Docling has a built-in CLI to run conversions.
|
77
|
+
|
78
|
+
```bash
|
79
|
+
docling https://arxiv.org/pdf/2206.01062
|
80
|
+
```
|
81
|
+
|
82
|
+
You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
|
83
|
+
```bash
|
84
|
+
docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
|
85
|
+
```
|
86
|
+
This will use MLX acceleration on supported Apple Silicon hardware.
|
87
|
+
|
88
|
+
Read more [here](https://docling-project.github.io/docling/usage/)
|
89
|
+
|
72
90
|
## Documentation
|
73
91
|
|
74
|
-
Check out Docling's [documentation](https://
|
92
|
+
Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
|
75
93
|
installation, usage, concepts, recipes, extensions, and more.
|
76
94
|
|
77
95
|
## Examples
|
78
96
|
|
79
|
-
Go hands-on with our [examples](https://
|
97
|
+
Go hands-on with our [examples](https://docling-project.github.io/docling/examples/),
|
80
98
|
demonstrating how to address different application use cases with Docling.
|
81
99
|
|
82
100
|
## Integrations
|
83
101
|
|
84
102
|
To further accelerate your AI application development, check out Docling's native
|
85
|
-
[integrations](https://
|
103
|
+
[integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
|
86
104
|
and tools.
|
87
105
|
|
88
106
|
## Get help and support
|
89
107
|
|
90
|
-
Please feel free to connect with us using the [discussion section](https://github.com/
|
108
|
+
Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
|
91
109
|
|
92
110
|
## Technical report
|
93
111
|
|
@@ -95,7 +113,7 @@ For more details on Docling's inner workings, check out the [Docling Technical R
|
|
95
113
|
|
96
114
|
## Contributing
|
97
115
|
|
98
|
-
Please read [Contributing to Docling](https://github.com/
|
116
|
+
Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details.
|
99
117
|
|
100
118
|
## References
|
101
119
|
|
@@ -119,10 +137,14 @@ If you use Docling in your projects, please consider citing the following:
|
|
119
137
|
The Docling codebase is under MIT license.
|
120
138
|
For individual model usage, please refer to the model licenses found in the original packages.
|
121
139
|
|
122
|
-
##
|
140
|
+
## LF AI & Data
|
141
|
+
|
142
|
+
Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
|
143
|
+
|
144
|
+
### IBM ❤️ Open Source AI
|
123
145
|
|
124
|
-
|
146
|
+
The project was started by the AI for knowledge team at IBM Research Zurich.
|
125
147
|
|
126
|
-
[supported_formats]: https://
|
127
|
-
[docling_document]: https://
|
128
|
-
[integrations]: https://
|
148
|
+
[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
|
149
|
+
[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
|
150
|
+
[integrations]: https://docling-project.github.io/docling/integrations/
|
@@ -380,7 +380,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|
380
380
|
end_row_offset_idx=row_idx + row_span,
|
381
381
|
start_col_offset_idx=col_idx,
|
382
382
|
end_col_offset_idx=col_idx + col_span,
|
383
|
-
|
383
|
+
column_header=row_idx == 0,
|
384
384
|
row_header=False,
|
385
385
|
)
|
386
386
|
data.table_cells.append(cell)
|
@@ -111,7 +111,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
|
111
111
|
end_row_offset_idx=row_idx + 1,
|
112
112
|
start_col_offset_idx=col_idx,
|
113
113
|
end_col_offset_idx=col_idx + 1,
|
114
|
-
|
114
|
+
column_header=row_idx == 0, # First row as header
|
115
115
|
row_header=False,
|
116
116
|
)
|
117
117
|
table_data.table_cells.append(cell)
|
@@ -6,12 +6,12 @@ from typing import Iterable, List, Optional, Union
|
|
6
6
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
9
|
+
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
9
10
|
from docling_parse.pdf_parsers import pdf_parser_v1
|
10
11
|
from PIL import Image, ImageDraw
|
11
12
|
from pypdfium2 import PdfPage
|
12
13
|
|
13
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
14
|
-
from docling.datamodel.base_models import Cell
|
15
15
|
from docling.datamodel.document import InputDocument
|
16
16
|
|
17
17
|
_log = logging.getLogger(__name__)
|
@@ -68,8 +68,11 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
68
68
|
|
69
69
|
return text_piece
|
70
70
|
|
71
|
-
def
|
72
|
-
|
71
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
72
|
+
return None
|
73
|
+
|
74
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
75
|
+
cells: List[TextCell] = []
|
73
76
|
cell_counter = 0
|
74
77
|
|
75
78
|
if not self.valid:
|
@@ -91,19 +94,24 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
91
94
|
|
92
95
|
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
|
93
96
|
cells.append(
|
94
|
-
|
95
|
-
|
97
|
+
TextCell(
|
98
|
+
index=cell_counter,
|
96
99
|
text=text_piece,
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
100
|
+
orig=text_piece,
|
101
|
+
from_ocr=False,
|
102
|
+
rect=BoundingRectangle.from_bounding_box(
|
103
|
+
BoundingBox(
|
104
|
+
# l=x0, b=y0, r=x1, t=y1,
|
105
|
+
l=x0 * page_size.width / parser_width,
|
106
|
+
b=y0 * page_size.height / parser_height,
|
107
|
+
r=x1 * page_size.width / parser_width,
|
108
|
+
t=y1 * page_size.height / parser_height,
|
109
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
110
|
+
)
|
104
111
|
).to_top_left_origin(page_size.height),
|
105
112
|
)
|
106
113
|
)
|
114
|
+
|
107
115
|
cell_counter += 1
|
108
116
|
|
109
117
|
def draw_clusters_and_cells():
|
@@ -112,7 +120,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
112
120
|
) # make new image to avoid drawing on the saved ones
|
113
121
|
draw = ImageDraw.Draw(image)
|
114
122
|
for c in cells:
|
115
|
-
x0, y0, x1, y1 = c.
|
123
|
+
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
116
124
|
cell_color = (
|
117
125
|
random.randint(30, 140),
|
118
126
|
random.randint(30, 140),
|
@@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|
6
6
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
+
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
9
10
|
from docling_parse.pdf_parsers import pdf_parser_v2
|
10
11
|
from PIL import Image, ImageDraw
|
11
12
|
from pypdfium2 import PdfPage
|
12
13
|
|
13
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
14
|
-
from docling.datamodel.base_models import
|
15
|
+
from docling.datamodel.base_models import Size
|
15
16
|
from docling.utils.locks import pypdfium2_lock
|
16
17
|
|
17
18
|
if TYPE_CHECKING:
|
@@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
78
79
|
|
79
80
|
return text_piece
|
80
81
|
|
81
|
-
def
|
82
|
-
|
82
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
83
|
+
return None
|
84
|
+
|
85
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
86
|
+
cells: List[TextCell] = []
|
83
87
|
cell_counter = 0
|
84
88
|
|
85
89
|
if not self.valid:
|
@@ -106,16 +110,20 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
106
110
|
|
107
111
|
text_piece = cell_data[cells_header.index("text")]
|
108
112
|
cells.append(
|
109
|
-
|
110
|
-
|
113
|
+
TextCell(
|
114
|
+
index=cell_counter,
|
111
115
|
text=text_piece,
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
116
|
+
orig=text_piece,
|
117
|
+
from_ocr=False,
|
118
|
+
rect=BoundingRectangle.from_bounding_box(
|
119
|
+
BoundingBox(
|
120
|
+
# l=x0, b=y0, r=x1, t=y1,
|
121
|
+
l=x0 * page_size.width / parser_width,
|
122
|
+
b=y0 * page_size.height / parser_height,
|
123
|
+
r=x1 * page_size.width / parser_width,
|
124
|
+
t=y1 * page_size.height / parser_height,
|
125
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
126
|
+
)
|
119
127
|
).to_top_left_origin(page_size.height),
|
120
128
|
)
|
121
129
|
)
|
@@ -0,0 +1,192 @@
|
|
1
|
+
import logging
|
2
|
+
import random
|
3
|
+
from io import BytesIO
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
6
|
+
|
7
|
+
import pypdfium2 as pdfium
|
8
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
+
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
10
|
+
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
11
|
+
from PIL import Image, ImageDraw
|
12
|
+
from pypdfium2 import PdfPage
|
13
|
+
|
14
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
15
|
+
from docling.datamodel.base_models import Size
|
16
|
+
from docling.utils.locks import pypdfium2_lock
|
17
|
+
|
18
|
+
if TYPE_CHECKING:
|
19
|
+
from docling.datamodel.document import InputDocument
|
20
|
+
|
21
|
+
_log = logging.getLogger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
class DoclingParseV4PageBackend(PdfPageBackend):
|
25
|
+
def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
|
26
|
+
self._ppage = page_obj
|
27
|
+
self._dpage = parsed_page
|
28
|
+
self.valid = parsed_page is not None
|
29
|
+
|
30
|
+
def is_valid(self) -> bool:
|
31
|
+
return self.valid
|
32
|
+
|
33
|
+
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
34
|
+
# Find intersecting cells on the page
|
35
|
+
text_piece = ""
|
36
|
+
page_size = self.get_size()
|
37
|
+
|
38
|
+
scale = (
|
39
|
+
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
40
|
+
)
|
41
|
+
|
42
|
+
for i, cell in enumerate(self._dpage.textline_cells):
|
43
|
+
cell_bbox = (
|
44
|
+
cell.rect.to_bounding_box()
|
45
|
+
.to_top_left_origin(page_height=page_size.height)
|
46
|
+
.scaled(scale)
|
47
|
+
)
|
48
|
+
|
49
|
+
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
50
|
+
|
51
|
+
if overlap_frac > 0.5:
|
52
|
+
if len(text_piece) > 0:
|
53
|
+
text_piece += " "
|
54
|
+
text_piece += cell.text
|
55
|
+
|
56
|
+
return text_piece
|
57
|
+
|
58
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
59
|
+
return self._dpage
|
60
|
+
|
61
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
62
|
+
page_size = self.get_size()
|
63
|
+
|
64
|
+
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
|
65
|
+
|
66
|
+
# for cell in self._dpage.textline_cells:
|
67
|
+
# rect = cell.rect
|
68
|
+
#
|
69
|
+
# assert (
|
70
|
+
# rect.to_bounding_box().l <= rect.to_bounding_box().r
|
71
|
+
# ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
|
72
|
+
# assert (
|
73
|
+
# rect.to_bounding_box().t <= rect.to_bounding_box().b
|
74
|
+
# ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
|
75
|
+
|
76
|
+
return self._dpage.textline_cells
|
77
|
+
|
78
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
79
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
80
|
+
|
81
|
+
images = self._dpage.bitmap_resources
|
82
|
+
|
83
|
+
for img in images:
|
84
|
+
cropbox = img.rect.to_bounding_box().to_top_left_origin(
|
85
|
+
self.get_size().height
|
86
|
+
)
|
87
|
+
|
88
|
+
if cropbox.area() > AREA_THRESHOLD:
|
89
|
+
cropbox = cropbox.scaled(scale=scale)
|
90
|
+
|
91
|
+
yield cropbox
|
92
|
+
|
93
|
+
def get_page_image(
|
94
|
+
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
95
|
+
) -> Image.Image:
|
96
|
+
|
97
|
+
page_size = self.get_size()
|
98
|
+
|
99
|
+
if not cropbox:
|
100
|
+
cropbox = BoundingBox(
|
101
|
+
l=0,
|
102
|
+
r=page_size.width,
|
103
|
+
t=0,
|
104
|
+
b=page_size.height,
|
105
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
106
|
+
)
|
107
|
+
padbox = BoundingBox(
|
108
|
+
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
109
|
+
)
|
110
|
+
else:
|
111
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
112
|
+
padbox.r = page_size.width - padbox.r
|
113
|
+
padbox.t = page_size.height - padbox.t
|
114
|
+
|
115
|
+
with pypdfium2_lock:
|
116
|
+
image = (
|
117
|
+
self._ppage.render(
|
118
|
+
scale=scale * 1.5,
|
119
|
+
rotation=0, # no additional rotation
|
120
|
+
crop=padbox.as_tuple(),
|
121
|
+
)
|
122
|
+
.to_pil()
|
123
|
+
.resize(
|
124
|
+
size=(round(cropbox.width * scale), round(cropbox.height * scale))
|
125
|
+
)
|
126
|
+
) # We resize the image from 1.5x the given scale to make it sharper.
|
127
|
+
|
128
|
+
return image
|
129
|
+
|
130
|
+
def get_size(self) -> Size:
|
131
|
+
with pypdfium2_lock:
|
132
|
+
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
133
|
+
|
134
|
+
# TODO: Take width and height from docling-parse.
|
135
|
+
# return Size(
|
136
|
+
# width=self._dpage.dimension.width,
|
137
|
+
# height=self._dpage.dimension.height,
|
138
|
+
# )
|
139
|
+
|
140
|
+
def unload(self):
|
141
|
+
self._ppage = None
|
142
|
+
self._dpage = None
|
143
|
+
|
144
|
+
|
145
|
+
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
146
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
147
|
+
super().__init__(in_doc, path_or_stream)
|
148
|
+
|
149
|
+
with pypdfium2_lock:
|
150
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
151
|
+
self.parser = DoclingPdfParser(loglevel="fatal")
|
152
|
+
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
|
153
|
+
success = self.dp_doc is not None
|
154
|
+
|
155
|
+
if not success:
|
156
|
+
raise RuntimeError(
|
157
|
+
f"docling-parse v4 could not load document {self.document_hash}."
|
158
|
+
)
|
159
|
+
|
160
|
+
def page_count(self) -> int:
|
161
|
+
# return len(self._pdoc) # To be replaced with docling-parse API
|
162
|
+
|
163
|
+
len_1 = len(self._pdoc)
|
164
|
+
len_2 = self.dp_doc.number_of_pages()
|
165
|
+
|
166
|
+
if len_1 != len_2:
|
167
|
+
_log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
|
168
|
+
|
169
|
+
return len_2
|
170
|
+
|
171
|
+
def load_page(
|
172
|
+
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
173
|
+
) -> DoclingParseV4PageBackend:
|
174
|
+
with pypdfium2_lock:
|
175
|
+
return DoclingParseV4PageBackend(
|
176
|
+
self.dp_doc.get_page(
|
177
|
+
page_no + 1,
|
178
|
+
create_words=create_words,
|
179
|
+
create_textlines=create_textlines,
|
180
|
+
),
|
181
|
+
self._pdoc[page_no],
|
182
|
+
)
|
183
|
+
|
184
|
+
def is_valid(self) -> bool:
|
185
|
+
return self.page_count() > 0
|
186
|
+
|
187
|
+
def unload(self):
|
188
|
+
super().unload()
|
189
|
+
self.dp_doc.unload()
|
190
|
+
with pypdfium2_lock:
|
191
|
+
self._pdoc.close()
|
192
|
+
self._pdoc = None
|