docling 2.26.0__tar.gz → 2.28.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {docling-2.26.0 → docling-2.28.0}/PKG-INFO +47 -23
  2. {docling-2.26.0 → docling-2.28.0}/README.md +40 -18
  3. {docling-2.26.0 → docling-2.28.0}/docling/backend/asciidoc_backend.py +1 -1
  4. {docling-2.26.0 → docling-2.28.0}/docling/backend/csv_backend.py +1 -1
  5. {docling-2.26.0 → docling-2.28.0}/docling/backend/docling_parse_backend.py +21 -13
  6. {docling-2.26.0 → docling-2.28.0}/docling/backend/docling_parse_v2_backend.py +20 -12
  7. docling-2.28.0/docling/backend/docling_parse_v4_backend.py +192 -0
  8. docling-2.28.0/docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling-2.28.0/docling/backend/docx/latex/omml.py +453 -0
  10. {docling-2.26.0 → docling-2.28.0}/docling/backend/html_backend.py +7 -7
  11. {docling-2.26.0 → docling-2.28.0}/docling/backend/md_backend.py +1 -1
  12. {docling-2.26.0 → docling-2.28.0}/docling/backend/msexcel_backend.py +2 -45
  13. {docling-2.26.0 → docling-2.28.0}/docling/backend/mspowerpoint_backend.py +19 -1
  14. {docling-2.26.0 → docling-2.28.0}/docling/backend/msword_backend.py +68 -3
  15. {docling-2.26.0 → docling-2.28.0}/docling/backend/pdf_backend.py +7 -2
  16. {docling-2.26.0 → docling-2.28.0}/docling/backend/pypdfium2_backend.py +52 -30
  17. {docling-2.26.0 → docling-2.28.0}/docling/backend/xml/uspto_backend.py +1 -1
  18. {docling-2.26.0 → docling-2.28.0}/docling/cli/main.py +135 -53
  19. {docling-2.26.0 → docling-2.28.0}/docling/cli/models.py +1 -1
  20. {docling-2.26.0 → docling-2.28.0}/docling/datamodel/base_models.py +8 -10
  21. {docling-2.26.0 → docling-2.28.0}/docling/datamodel/pipeline_options.py +54 -32
  22. {docling-2.26.0 → docling-2.28.0}/docling/document_converter.py +5 -5
  23. {docling-2.26.0 → docling-2.28.0}/docling/models/base_model.py +9 -1
  24. {docling-2.26.0 → docling-2.28.0}/docling/models/base_ocr_model.py +27 -16
  25. {docling-2.26.0 → docling-2.28.0}/docling/models/easyocr_model.py +28 -13
  26. docling-2.28.0/docling/models/factories/__init__.py +27 -0
  27. docling-2.28.0/docling/models/factories/base_factory.py +122 -0
  28. docling-2.28.0/docling/models/factories/ocr_factory.py +11 -0
  29. docling-2.28.0/docling/models/factories/picture_description_factory.py +11 -0
  30. docling-2.28.0/docling/models/hf_mlx_model.py +137 -0
  31. {docling-2.26.0 → docling-2.28.0}/docling/models/ocr_mac_model.py +39 -11
  32. {docling-2.26.0 → docling-2.28.0}/docling/models/page_preprocessing_model.py +4 -0
  33. {docling-2.26.0 → docling-2.28.0}/docling/models/picture_description_api_model.py +20 -3
  34. {docling-2.26.0 → docling-2.28.0}/docling/models/picture_description_base_model.py +19 -3
  35. {docling-2.26.0 → docling-2.28.0}/docling/models/picture_description_vlm_model.py +14 -2
  36. docling-2.28.0/docling/models/plugins/__init__.py +0 -0
  37. docling-2.28.0/docling/models/plugins/defaults.py +28 -0
  38. {docling-2.26.0 → docling-2.28.0}/docling/models/rapid_ocr_model.py +34 -13
  39. {docling-2.26.0 → docling-2.28.0}/docling/models/table_structure_model.py +13 -4
  40. {docling-2.26.0 → docling-2.28.0}/docling/models/tesseract_ocr_cli_model.py +40 -15
  41. {docling-2.26.0 → docling-2.28.0}/docling/models/tesseract_ocr_model.py +37 -12
  42. docling-2.28.0/docling/pipeline/__init__.py +0 -0
  43. {docling-2.26.0 → docling-2.28.0}/docling/pipeline/standard_pdf_pipeline.py +25 -78
  44. docling-2.28.0/docling/pipeline/vlm_pipeline.py +214 -0
  45. docling-2.28.0/docling/utils/__init__.py +0 -0
  46. {docling-2.26.0 → docling-2.28.0}/docling/utils/export.py +8 -6
  47. {docling-2.26.0 → docling-2.28.0}/docling/utils/layout_postprocessor.py +26 -23
  48. {docling-2.26.0 → docling-2.28.0}/docling/utils/visualization.py +1 -1
  49. {docling-2.26.0 → docling-2.28.0}/pyproject.toml +71 -44
  50. docling-2.26.0/docling/pipeline/vlm_pipeline.py +0 -534
  51. {docling-2.26.0 → docling-2.28.0}/LICENSE +0 -0
  52. {docling-2.26.0 → docling-2.28.0}/docling/__init__.py +0 -0
  53. {docling-2.26.0 → docling-2.28.0}/docling/backend/__init__.py +0 -0
  54. {docling-2.26.0 → docling-2.28.0}/docling/backend/abstract_backend.py +0 -0
  55. {docling-2.26.0/docling/backend/json → docling-2.28.0/docling/backend/docx}/__init__.py +0 -0
  56. {docling-2.26.0/docling/backend/xml → docling-2.28.0/docling/backend/docx/latex}/__init__.py +0 -0
  57. {docling-2.26.0/docling/cli → docling-2.28.0/docling/backend/json}/__init__.py +0 -0
  58. {docling-2.26.0 → docling-2.28.0}/docling/backend/json/docling_json_backend.py +0 -0
  59. {docling-2.26.0/docling/datamodel → docling-2.28.0/docling/backend/xml}/__init__.py +0 -0
  60. {docling-2.26.0 → docling-2.28.0}/docling/backend/xml/jats_backend.py +0 -0
  61. {docling-2.26.0 → docling-2.28.0}/docling/chunking/__init__.py +0 -0
  62. {docling-2.26.0/docling/models → docling-2.28.0/docling/cli}/__init__.py +0 -0
  63. {docling-2.26.0 → docling-2.28.0}/docling/cli/tools.py +0 -0
  64. {docling-2.26.0/docling/pipeline → docling-2.28.0/docling/datamodel}/__init__.py +0 -0
  65. {docling-2.26.0 → docling-2.28.0}/docling/datamodel/document.py +0 -0
  66. {docling-2.26.0 → docling-2.28.0}/docling/datamodel/settings.py +0 -0
  67. {docling-2.26.0 → docling-2.28.0}/docling/exceptions.py +0 -0
  68. {docling-2.26.0/docling/utils → docling-2.28.0/docling/models}/__init__.py +0 -0
  69. {docling-2.26.0 → docling-2.28.0}/docling/models/code_formula_model.py +0 -0
  70. {docling-2.26.0 → docling-2.28.0}/docling/models/document_picture_classifier.py +0 -0
  71. {docling-2.26.0 → docling-2.28.0}/docling/models/hf_vlm_model.py +0 -0
  72. {docling-2.26.0 → docling-2.28.0}/docling/models/layout_model.py +0 -0
  73. {docling-2.26.0 → docling-2.28.0}/docling/models/page_assemble_model.py +0 -0
  74. {docling-2.26.0 → docling-2.28.0}/docling/models/readingorder_model.py +0 -0
  75. {docling-2.26.0 → docling-2.28.0}/docling/pipeline/base_pipeline.py +0 -0
  76. {docling-2.26.0 → docling-2.28.0}/docling/pipeline/simple_pipeline.py +0 -0
  77. {docling-2.26.0 → docling-2.28.0}/docling/py.typed +0 -0
  78. {docling-2.26.0 → docling-2.28.0}/docling/utils/accelerator_utils.py +0 -0
  79. {docling-2.26.0 → docling-2.28.0}/docling/utils/glm_utils.py +0 -0
  80. {docling-2.26.0 → docling-2.28.0}/docling/utils/locks.py +0 -0
  81. {docling-2.26.0 → docling-2.28.0}/docling/utils/model_downloader.py +0 -0
  82. {docling-2.26.0 → docling-2.28.0}/docling/utils/ocr_utils.py +0 -0
  83. {docling-2.26.0 → docling-2.28.0}/docling/utils/profiling.py +0 -0
  84. {docling-2.26.0 → docling-2.28.0}/docling/utils/utils.py +0 -0
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.26.0
3
+ Version: 2.28.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
- Home-page: https://github.com/DS4SD/docling
5
+ Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
7
7
  Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
8
8
  Author: Christoph Auer
@@ -28,9 +28,9 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
- Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
33
+ Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
35
35
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
36
36
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -42,8 +42,10 @@ Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (ex
42
42
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
43
43
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
44
44
  Requires-Dist: pillow (>=10.0.0,<12.0.0)
45
+ Requires-Dist: pluggy (>=1.0.0,<2.0.0)
45
46
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
46
47
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
48
+ Requires-Dist: pylatexenc (>=2.10,<3.0)
47
49
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
48
50
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
49
51
  Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
@@ -57,12 +59,12 @@ Requires-Dist: tqdm (>=4.65.0,<5.0.0)
57
59
  Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
58
60
  Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
59
61
  Requires-Dist: typer (>=0.12.5,<0.13.0)
60
- Project-URL: Repository, https://github.com/DS4SD/docling
62
+ Project-URL: Repository, https://github.com/docling-project/docling
61
63
  Description-Content-Type: text/markdown
62
64
 
63
65
  <p align="center">
64
- <a href="https://github.com/ds4sd/docling">
65
- <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
66
+ <a href="https://github.com/docling-project/docling">
67
+ <img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
66
68
  </a>
67
69
  </p>
68
70
 
@@ -73,7 +75,7 @@ Description-Content-Type: text/markdown
73
75
  </p>
74
76
 
75
77
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
76
- [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
78
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/)
77
79
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
78
80
  [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
79
81
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -81,8 +83,10 @@ Description-Content-Type: text/markdown
81
83
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
82
84
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
83
85
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
84
- [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
86
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
85
87
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
88
+ [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
89
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
86
90
 
87
91
  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
88
92
 
@@ -95,12 +99,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
95
99
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
96
100
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
97
101
  * 🔍 Extensive OCR support for scanned PDFs and images
102
+ * 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
98
103
  * 💻 Simple and convenient CLI
99
104
 
100
105
  ### Coming soon
101
106
 
102
107
  * 📝 Metadata extraction, including title, authors, references & language
103
- * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
104
108
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
105
109
  * 📝 Complex chemistry understanding (Molecular structures)
106
110
 
@@ -113,11 +117,11 @@ pip install docling
113
117
 
114
118
  Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectures.
115
119
 
116
- More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
120
+ More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs.
117
121
 
118
122
  ## Getting started
119
123
 
120
- To convert individual documents, use `convert()`, for example:
124
+ To convert individual documents with python, use `convert()`, for example:
121
125
 
122
126
  ```python
123
127
  from docling.document_converter import DocumentConverter
@@ -128,28 +132,44 @@ result = converter.convert(source)
128
132
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
129
133
  ```
130
134
 
131
- More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
135
+ More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
132
136
  the docs.
133
137
 
138
+ ## CLI
139
+
140
+ Docling has a built-in CLI to run conversions.
141
+
142
+ ```bash
143
+ docling https://arxiv.org/pdf/2206.01062
144
+ ```
145
+
146
+ You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
147
+ ```bash
148
+ docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
149
+ ```
150
+ This will use MLX acceleration on supported Apple Silicon hardware.
151
+
152
+ Read more [here](https://docling-project.github.io/docling/usage/)
153
+
134
154
  ## Documentation
135
155
 
136
- Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
156
+ Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
137
157
  installation, usage, concepts, recipes, extensions, and more.
138
158
 
139
159
  ## Examples
140
160
 
141
- Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
161
+ Go hands-on with our [examples](https://docling-project.github.io/docling/examples/),
142
162
  demonstrating how to address different application use cases with Docling.
143
163
 
144
164
  ## Integrations
145
165
 
146
166
  To further accelerate your AI application development, check out Docling's native
147
- [integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
167
+ [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
148
168
  and tools.
149
169
 
150
170
  ## Get help and support
151
171
 
152
- Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
172
+ Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
153
173
 
154
174
  ## Technical report
155
175
 
@@ -157,7 +177,7 @@ For more details on Docling's inner workings, check out the [Docling Technical R
157
177
 
158
178
  ## Contributing
159
179
 
160
- Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
180
+ Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details.
161
181
 
162
182
  ## References
163
183
 
@@ -181,11 +201,15 @@ If you use Docling in your projects, please consider citing the following:
181
201
  The Docling codebase is under MIT license.
182
202
  For individual model usage, please refer to the model licenses found in the original packages.
183
203
 
184
- ## IBM ❤️ Open Source AI
204
+ ## LF AI & Data
205
+
206
+ Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
207
+
208
+ ### IBM ❤️ Open Source AI
185
209
 
186
- Docling has been brought to you by IBM.
210
+ The project was started by the AI for knowledge team at IBM Research Zurich.
187
211
 
188
- [supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
189
- [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
190
- [integrations]: https://ds4sd.github.io/docling/integrations/
212
+ [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
213
+ [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
214
+ [integrations]: https://docling-project.github.io/docling/integrations/
191
215
 
@@ -1,6 +1,6 @@
1
1
  <p align="center">
2
- <a href="https://github.com/ds4sd/docling">
3
- <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
2
+ <a href="https://github.com/docling-project/docling">
3
+ <img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
4
4
  </a>
5
5
  </p>
6
6
 
@@ -11,7 +11,7 @@
11
11
  </p>
12
12
 
13
13
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
14
- [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
14
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/)
15
15
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
16
16
  [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
17
17
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -19,8 +19,10 @@
19
19
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
20
20
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
21
21
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
22
- [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
22
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
23
23
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
24
+ [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
25
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
24
26
 
25
27
  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
26
28
 
@@ -33,12 +35,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
33
35
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
34
36
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
35
37
  * 🔍 Extensive OCR support for scanned PDFs and images
38
+ * 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
36
39
  * 💻 Simple and convenient CLI
37
40
 
38
41
  ### Coming soon
39
42
 
40
43
  * 📝 Metadata extraction, including title, authors, references & language
41
- * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
42
44
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
43
45
  * 📝 Complex chemistry understanding (Molecular structures)
44
46
 
@@ -51,11 +53,11 @@ pip install docling
51
53
 
52
54
  Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectures.
53
55
 
54
- More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
56
+ More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs.
55
57
 
56
58
  ## Getting started
57
59
 
58
- To convert individual documents, use `convert()`, for example:
60
+ To convert individual documents with python, use `convert()`, for example:
59
61
 
60
62
  ```python
61
63
  from docling.document_converter import DocumentConverter
@@ -66,28 +68,44 @@ result = converter.convert(source)
66
68
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
67
69
  ```
68
70
 
69
- More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
71
+ More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
70
72
  the docs.
71
73
 
74
+ ## CLI
75
+
76
+ Docling has a built-in CLI to run conversions.
77
+
78
+ ```bash
79
+ docling https://arxiv.org/pdf/2206.01062
80
+ ```
81
+
82
+ You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
83
+ ```bash
84
+ docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
85
+ ```
86
+ This will use MLX acceleration on supported Apple Silicon hardware.
87
+
88
+ Read more [here](https://docling-project.github.io/docling/usage/)
89
+
72
90
  ## Documentation
73
91
 
74
- Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
92
+ Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
75
93
  installation, usage, concepts, recipes, extensions, and more.
76
94
 
77
95
  ## Examples
78
96
 
79
- Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
97
+ Go hands-on with our [examples](https://docling-project.github.io/docling/examples/),
80
98
  demonstrating how to address different application use cases with Docling.
81
99
 
82
100
  ## Integrations
83
101
 
84
102
  To further accelerate your AI application development, check out Docling's native
85
- [integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
103
+ [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
86
104
  and tools.
87
105
 
88
106
  ## Get help and support
89
107
 
90
- Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
108
+ Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
91
109
 
92
110
  ## Technical report
93
111
 
@@ -95,7 +113,7 @@ For more details on Docling's inner workings, check out the [Docling Technical R
95
113
 
96
114
  ## Contributing
97
115
 
98
- Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
116
+ Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details.
99
117
 
100
118
  ## References
101
119
 
@@ -119,10 +137,14 @@ If you use Docling in your projects, please consider citing the following:
119
137
  The Docling codebase is under MIT license.
120
138
  For individual model usage, please refer to the model licenses found in the original packages.
121
139
 
122
- ## IBM ❤️ Open Source AI
140
+ ## LF AI & Data
141
+
142
+ Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
143
+
144
+ ### IBM ❤️ Open Source AI
123
145
 
124
- Docling has been brought to you by IBM.
146
+ The project was started by the AI for knowledge team at IBM Research Zurich.
125
147
 
126
- [supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
127
- [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
128
- [integrations]: https://ds4sd.github.io/docling/integrations/
148
+ [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
149
+ [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
150
+ [integrations]: https://docling-project.github.io/docling/integrations/
@@ -380,7 +380,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
380
380
  end_row_offset_idx=row_idx + row_span,
381
381
  start_col_offset_idx=col_idx,
382
382
  end_col_offset_idx=col_idx + col_span,
383
- col_header=False,
383
+ column_header=row_idx == 0,
384
384
  row_header=False,
385
385
  )
386
386
  data.table_cells.append(cell)
@@ -111,7 +111,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
111
111
  end_row_offset_idx=row_idx + 1,
112
112
  start_col_offset_idx=col_idx,
113
113
  end_col_offset_idx=col_idx + 1,
114
- col_header=row_idx == 0, # First row as header
114
+ column_header=row_idx == 0, # First row as header
115
115
  row_header=False,
116
116
  )
117
117
  table_data.table_cells.append(cell)
@@ -6,12 +6,12 @@ from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
+ from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
9
10
  from docling_parse.pdf_parsers import pdf_parser_v1
10
11
  from PIL import Image, ImageDraw
11
12
  from pypdfium2 import PdfPage
12
13
 
13
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import Cell
15
15
  from docling.datamodel.document import InputDocument
16
16
 
17
17
  _log = logging.getLogger(__name__)
@@ -68,8 +68,11 @@ class DoclingParsePageBackend(PdfPageBackend):
68
68
 
69
69
  return text_piece
70
70
 
71
- def get_text_cells(self) -> Iterable[Cell]:
72
- cells: List[Cell] = []
71
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
72
+ return None
73
+
74
+ def get_text_cells(self) -> Iterable[TextCell]:
75
+ cells: List[TextCell] = []
73
76
  cell_counter = 0
74
77
 
75
78
  if not self.valid:
@@ -91,19 +94,24 @@ class DoclingParsePageBackend(PdfPageBackend):
91
94
 
92
95
  text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
93
96
  cells.append(
94
- Cell(
95
- id=cell_counter,
97
+ TextCell(
98
+ index=cell_counter,
96
99
  text=text_piece,
97
- bbox=BoundingBox(
98
- # l=x0, b=y0, r=x1, t=y1,
99
- l=x0 * page_size.width / parser_width,
100
- b=y0 * page_size.height / parser_height,
101
- r=x1 * page_size.width / parser_width,
102
- t=y1 * page_size.height / parser_height,
103
- coord_origin=CoordOrigin.BOTTOMLEFT,
100
+ orig=text_piece,
101
+ from_ocr=False,
102
+ rect=BoundingRectangle.from_bounding_box(
103
+ BoundingBox(
104
+ # l=x0, b=y0, r=x1, t=y1,
105
+ l=x0 * page_size.width / parser_width,
106
+ b=y0 * page_size.height / parser_height,
107
+ r=x1 * page_size.width / parser_width,
108
+ t=y1 * page_size.height / parser_height,
109
+ coord_origin=CoordOrigin.BOTTOMLEFT,
110
+ )
104
111
  ).to_top_left_origin(page_size.height),
105
112
  )
106
113
  )
114
+
107
115
  cell_counter += 1
108
116
 
109
117
  def draw_clusters_and_cells():
@@ -112,7 +120,7 @@ class DoclingParsePageBackend(PdfPageBackend):
112
120
  ) # make new image to avoid drawing on the saved ones
113
121
  draw = ImageDraw.Draw(image)
114
122
  for c in cells:
115
- x0, y0, x1, y1 = c.bbox.as_tuple()
123
+ x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
116
124
  cell_color = (
117
125
  random.randint(30, 140),
118
126
  random.randint(30, 140),
@@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
9
10
  from docling_parse.pdf_parsers import pdf_parser_v2
10
11
  from PIL import Image, ImageDraw
11
12
  from pypdfium2 import PdfPage
12
13
 
13
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import Cell, Size
15
+ from docling.datamodel.base_models import Size
15
16
  from docling.utils.locks import pypdfium2_lock
16
17
 
17
18
  if TYPE_CHECKING:
@@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
78
79
 
79
80
  return text_piece
80
81
 
81
- def get_text_cells(self) -> Iterable[Cell]:
82
- cells: List[Cell] = []
82
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
83
+ return None
84
+
85
+ def get_text_cells(self) -> Iterable[TextCell]:
86
+ cells: List[TextCell] = []
83
87
  cell_counter = 0
84
88
 
85
89
  if not self.valid:
@@ -106,16 +110,20 @@ class DoclingParseV2PageBackend(PdfPageBackend):
106
110
 
107
111
  text_piece = cell_data[cells_header.index("text")]
108
112
  cells.append(
109
- Cell(
110
- id=cell_counter,
113
+ TextCell(
114
+ index=cell_counter,
111
115
  text=text_piece,
112
- bbox=BoundingBox(
113
- # l=x0, b=y0, r=x1, t=y1,
114
- l=x0 * page_size.width / parser_width,
115
- b=y0 * page_size.height / parser_height,
116
- r=x1 * page_size.width / parser_width,
117
- t=y1 * page_size.height / parser_height,
118
- coord_origin=CoordOrigin.BOTTOMLEFT,
116
+ orig=text_piece,
117
+ from_ocr=False,
118
+ rect=BoundingRectangle.from_bounding_box(
119
+ BoundingBox(
120
+ # l=x0, b=y0, r=x1, t=y1,
121
+ l=x0 * page_size.width / parser_width,
122
+ b=y0 * page_size.height / parser_height,
123
+ r=x1 * page_size.width / parser_width,
124
+ t=y1 * page_size.height / parser_height,
125
+ coord_origin=CoordOrigin.BOTTOMLEFT,
126
+ )
119
127
  ).to_top_left_origin(page_size.height),
120
128
  )
121
129
  )
@@ -0,0 +1,192 @@
1
+ import logging
2
+ import random
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
+
7
+ import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
10
+ from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
11
+ from PIL import Image, ImageDraw
12
+ from pypdfium2 import PdfPage
13
+
14
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
+ from docling.datamodel.base_models import Size
16
+ from docling.utils.locks import pypdfium2_lock
17
+
18
+ if TYPE_CHECKING:
19
+ from docling.datamodel.document import InputDocument
20
+
21
+ _log = logging.getLogger(__name__)
22
+
23
+
24
class DoclingParseV4PageBackend(PdfPageBackend):
    """Page backend combining a docling-parse page with a pypdfium2 page.

    Text cells and bitmap resources are read from the docling-parse
    ``SegmentedPdfPage``; the page size and rendered images are taken from the
    pypdfium2 ``PdfPage``.
    """

    def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
        """Store both page handles.

        Args:
            parsed_page: The page as parsed by docling-parse. When it is
                ``None`` the backend reports itself as invalid.
            page_obj: The pypdfium2 page used for sizing and rendering.
        """
        self._ppage = page_obj
        self._dpage = parsed_page
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        """Return whether a parsed page was supplied at construction time."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all text-line cells lying mostly inside ``bbox``.

        A cell is included when more than half of its own area intersects
        ``bbox``. The selected cell texts are joined with single spaces in the
        order the cells appear on the parsed page.

        Args:
            bbox: The region to collect text from. It is compared against cell
                boxes converted to top-left origin.

        Returns:
            The concatenated text, or an empty string when nothing matches or
            the page is invalid.
        """
        if not self.valid:
            # No parsed page to read cells from.
            return ""

        text_piece = ""
        page_size = self.get_size()

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for cell in self._dpage.textline_cells:
            cell_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page_size.height)
                .scaled(scale)
            )

            cell_area = cell_bbox.area()
            if cell_area == 0:
                # A degenerate cell has no area to overlap with; skipping it
                # avoids a ZeroDivisionError in the ratio below.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell.text

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the docling-parse page object held by this backend."""
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the page's text-line cells after top-left origin conversion.

        Returns:
            The ``textline_cells`` collection of the parsed page, or an empty
            list when the page is invalid.
        """
        if not self.valid:
            return []

        page_size = self.get_size()

        # The return value is discarded, so this relies on
        # TextCell.to_top_left_origin updating each cell in place.
        # NOTE(review): confirm against the docling-core TextCell API.
        for tc in self._dpage.textline_cells:
            tc.to_top_left_origin(page_size.height)

        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of the page's bitmap resources.

        Each box is converted to top-left origin and multiplied by ``scale``.
        Boxes whose area does not exceed ``AREA_THRESHOLD`` are dropped.

        Args:
            scale: Factor applied to every yielded box.

        Yields:
            One scaled, top-left-origin ``BoundingBox`` per retained bitmap.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        if not self.valid:
            return

        # The page height is the same for every image; look it up once instead
        # of taking the pdfium lock again for each bitmap.
        page_height = self.get_size().height

        for img in self._dpage.bitmap_resources:
            cropbox = img.rect.to_bounding_box().to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                yield cropbox.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or a cropped region of it, as a PIL image.

        Args:
            scale: Output scale; the result measures
                ``cropbox.width * scale`` by ``cropbox.height * scale`` pixels
                (rounded).
            cropbox: Region to render. ``None`` renders the full page.

        Returns:
            The rendered image.
        """
        page_size = self.get_size()

        if cropbox is None:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Work on a copy so the caller's cropbox is left untouched, then
            # turn the right/top coordinates into distances from the right and
            # top page edges; the tuple is handed to render() as ``crop``.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        with pypdfium2_lock:
            image = (
                self._ppage.render(
                    scale=scale * 1.5,
                    rotation=0,  # no additional rotation
                    crop=padbox.as_tuple(),
                )
                .to_pil()
                .resize(
                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
                )
            )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page width and height as reported by pypdfium2."""
        with pypdfium2_lock:
            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

        # TODO: Take width and height from docling-parse.
        # return Size(
        #     width=self._dpage.dimension.width,
        #     height=self._dpage.dimension.height,
        # )

    def unload(self):
        """Drop the references to both underlying page objects."""
        self._ppage = None
        self._dpage = None
143
+
144
+
145
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with both pypdfium2 and docling-parse.

    pypdfium2 supplies the page objects used for sizing and rendering, while
    docling-parse supplies the parsed page content.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document with pypdfium2 and with docling-parse.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: Location or in-memory stream of the PDF.

        Raises:
            RuntimeError: If docling-parse returns no document.
        """
        super().__init__(in_doc, path_or_stream)

        with pypdfium2_lock:
            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = DoclingPdfParser(loglevel="fatal")
        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)

        if self.dp_doc is None:
            # Release the pypdfium2 handle opened above so a failed
            # construction does not leave the document open.
            with pypdfium2_lock:
                self._pdoc.close()
            self._pdoc = None
            raise RuntimeError(
                f"docling-parse v4 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the number of pages reported by docling-parse.

        The count is cross-checked against pypdfium2; a mismatch is logged as
        an error and the docling-parse value is still returned.
        """
        # return len(self._pdoc)  # To be replaced with docling-parse API

        len_1 = len(self._pdoc)
        len_2 = self.dp_doc.number_of_pages()

        if len_1 != len_2:
            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")

        return len_2

    def load_page(
        self, page_no: int, create_words: bool = True, create_textlines: bool = True
    ) -> DoclingParseV4PageBackend:
        """Build the page backend for page ``page_no``.

        Args:
            page_no: Page index as used for ``self._pdoc[page_no]``;
                docling-parse is asked for page ``page_no + 1``.
            create_words: Forwarded to docling-parse's ``get_page``.
            create_textlines: Forwarded to docling-parse's ``get_page``.

        Returns:
            A page backend wrapping the parsed page and the pypdfium2 page.
        """
        with pypdfium2_lock:
            return DoclingParseV4PageBackend(
                self.dp_doc.get_page(
                    page_no + 1,
                    create_words=create_words,
                    create_textlines=create_textlines,
                ),
                self._pdoc[page_no],
            )

    def is_valid(self) -> bool:
        """Return True when the document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release the docling-parse document and close the pypdfium2 document.

        Calling this more than once is safe: once the pypdfium2 handle has been
        cleared, later calls only run the base-class unload.
        """
        super().unload()

        if self._pdoc is None:
            # Already unloaded; nothing left to release here.
            return

        self.dp_doc.unload()
        with pypdfium2_lock:
            self._pdoc.close()
        self._pdoc = None