docling 2.26.0__tar.gz → 2.27.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. {docling-2.26.0 → docling-2.27.0}/PKG-INFO +48 -19
  2. {docling-2.26.0 → docling-2.27.0}/README.md +41 -14
  3. {docling-2.26.0 → docling-2.27.0}/docling/backend/asciidoc_backend.py +1 -1
  4. {docling-2.26.0 → docling-2.27.0}/docling/backend/csv_backend.py +1 -1
  5. {docling-2.26.0 → docling-2.27.0}/docling/backend/docling_parse_backend.py +21 -13
  6. {docling-2.26.0 → docling-2.27.0}/docling/backend/docling_parse_v2_backend.py +20 -12
  7. docling-2.27.0/docling/backend/docling_parse_v4_backend.py +185 -0
  8. docling-2.27.0/docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling-2.27.0/docling/backend/docx/latex/omml.py +453 -0
  10. {docling-2.26.0 → docling-2.27.0}/docling/backend/html_backend.py +7 -7
  11. {docling-2.26.0 → docling-2.27.0}/docling/backend/md_backend.py +1 -1
  12. {docling-2.26.0 → docling-2.27.0}/docling/backend/msexcel_backend.py +2 -45
  13. {docling-2.26.0 → docling-2.27.0}/docling/backend/mspowerpoint_backend.py +1 -1
  14. {docling-2.26.0 → docling-2.27.0}/docling/backend/msword_backend.py +65 -3
  15. {docling-2.26.0 → docling-2.27.0}/docling/backend/pdf_backend.py +7 -2
  16. {docling-2.26.0 → docling-2.27.0}/docling/backend/pypdfium2_backend.py +52 -30
  17. {docling-2.26.0 → docling-2.27.0}/docling/backend/xml/uspto_backend.py +1 -1
  18. {docling-2.26.0 → docling-2.27.0}/docling/cli/main.py +60 -21
  19. {docling-2.26.0 → docling-2.27.0}/docling/cli/models.py +1 -1
  20. {docling-2.26.0 → docling-2.27.0}/docling/datamodel/base_models.py +8 -10
  21. {docling-2.26.0 → docling-2.27.0}/docling/datamodel/pipeline_options.py +26 -30
  22. {docling-2.26.0 → docling-2.27.0}/docling/document_converter.py +5 -5
  23. {docling-2.26.0 → docling-2.27.0}/docling/models/base_model.py +9 -1
  24. {docling-2.26.0 → docling-2.27.0}/docling/models/base_ocr_model.py +27 -16
  25. {docling-2.26.0 → docling-2.27.0}/docling/models/easyocr_model.py +28 -13
  26. docling-2.27.0/docling/models/factories/__init__.py +27 -0
  27. docling-2.27.0/docling/models/factories/base_factory.py +122 -0
  28. docling-2.27.0/docling/models/factories/ocr_factory.py +11 -0
  29. docling-2.27.0/docling/models/factories/picture_description_factory.py +11 -0
  30. {docling-2.26.0 → docling-2.27.0}/docling/models/ocr_mac_model.py +39 -11
  31. {docling-2.26.0 → docling-2.27.0}/docling/models/page_preprocessing_model.py +4 -0
  32. {docling-2.26.0 → docling-2.27.0}/docling/models/picture_description_api_model.py +20 -3
  33. {docling-2.26.0 → docling-2.27.0}/docling/models/picture_description_base_model.py +19 -3
  34. {docling-2.26.0 → docling-2.27.0}/docling/models/picture_description_vlm_model.py +14 -2
  35. docling-2.27.0/docling/models/plugins/__init__.py +0 -0
  36. docling-2.27.0/docling/models/plugins/defaults.py +28 -0
  37. {docling-2.26.0 → docling-2.27.0}/docling/models/rapid_ocr_model.py +34 -13
  38. {docling-2.26.0 → docling-2.27.0}/docling/models/table_structure_model.py +13 -4
  39. {docling-2.26.0 → docling-2.27.0}/docling/models/tesseract_ocr_cli_model.py +40 -15
  40. {docling-2.26.0 → docling-2.27.0}/docling/models/tesseract_ocr_model.py +37 -12
  41. docling-2.27.0/docling/pipeline/__init__.py +0 -0
  42. {docling-2.26.0 → docling-2.27.0}/docling/pipeline/standard_pdf_pipeline.py +25 -78
  43. docling-2.27.0/docling/utils/__init__.py +0 -0
  44. {docling-2.26.0 → docling-2.27.0}/docling/utils/export.py +8 -6
  45. {docling-2.26.0 → docling-2.27.0}/docling/utils/layout_postprocessor.py +26 -23
  46. {docling-2.26.0 → docling-2.27.0}/docling/utils/visualization.py +1 -1
  47. {docling-2.26.0 → docling-2.27.0}/pyproject.toml +70 -44
  48. {docling-2.26.0 → docling-2.27.0}/LICENSE +0 -0
  49. {docling-2.26.0 → docling-2.27.0}/docling/__init__.py +0 -0
  50. {docling-2.26.0 → docling-2.27.0}/docling/backend/__init__.py +0 -0
  51. {docling-2.26.0 → docling-2.27.0}/docling/backend/abstract_backend.py +0 -0
  52. {docling-2.26.0/docling/backend/json → docling-2.27.0/docling/backend/docx}/__init__.py +0 -0
  53. {docling-2.26.0/docling/backend/xml → docling-2.27.0/docling/backend/docx/latex}/__init__.py +0 -0
  54. {docling-2.26.0/docling/cli → docling-2.27.0/docling/backend/json}/__init__.py +0 -0
  55. {docling-2.26.0 → docling-2.27.0}/docling/backend/json/docling_json_backend.py +0 -0
  56. {docling-2.26.0/docling/datamodel → docling-2.27.0/docling/backend/xml}/__init__.py +0 -0
  57. {docling-2.26.0 → docling-2.27.0}/docling/backend/xml/jats_backend.py +0 -0
  58. {docling-2.26.0 → docling-2.27.0}/docling/chunking/__init__.py +0 -0
  59. {docling-2.26.0/docling/models → docling-2.27.0/docling/cli}/__init__.py +0 -0
  60. {docling-2.26.0 → docling-2.27.0}/docling/cli/tools.py +0 -0
  61. {docling-2.26.0/docling/pipeline → docling-2.27.0/docling/datamodel}/__init__.py +0 -0
  62. {docling-2.26.0 → docling-2.27.0}/docling/datamodel/document.py +0 -0
  63. {docling-2.26.0 → docling-2.27.0}/docling/datamodel/settings.py +0 -0
  64. {docling-2.26.0 → docling-2.27.0}/docling/exceptions.py +0 -0
  65. {docling-2.26.0/docling/utils → docling-2.27.0/docling/models}/__init__.py +0 -0
  66. {docling-2.26.0 → docling-2.27.0}/docling/models/code_formula_model.py +0 -0
  67. {docling-2.26.0 → docling-2.27.0}/docling/models/document_picture_classifier.py +0 -0
  68. {docling-2.26.0 → docling-2.27.0}/docling/models/hf_vlm_model.py +0 -0
  69. {docling-2.26.0 → docling-2.27.0}/docling/models/layout_model.py +0 -0
  70. {docling-2.26.0 → docling-2.27.0}/docling/models/page_assemble_model.py +0 -0
  71. {docling-2.26.0 → docling-2.27.0}/docling/models/readingorder_model.py +0 -0
  72. {docling-2.26.0 → docling-2.27.0}/docling/pipeline/base_pipeline.py +0 -0
  73. {docling-2.26.0 → docling-2.27.0}/docling/pipeline/simple_pipeline.py +0 -0
  74. {docling-2.26.0 → docling-2.27.0}/docling/pipeline/vlm_pipeline.py +0 -0
  75. {docling-2.26.0 → docling-2.27.0}/docling/py.typed +0 -0
  76. {docling-2.26.0 → docling-2.27.0}/docling/utils/accelerator_utils.py +0 -0
  77. {docling-2.26.0 → docling-2.27.0}/docling/utils/glm_utils.py +0 -0
  78. {docling-2.26.0 → docling-2.27.0}/docling/utils/locks.py +0 -0
  79. {docling-2.26.0 → docling-2.27.0}/docling/utils/model_downloader.py +0 -0
  80. {docling-2.26.0 → docling-2.27.0}/docling/utils/ocr_utils.py +0 -0
  81. {docling-2.26.0 → docling-2.27.0}/docling/utils/profiling.py +0 -0
  82. {docling-2.26.0 → docling-2.27.0}/docling/utils/utils.py +0 -0
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.26.0
3
+ Version: 2.27.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
- Home-page: https://github.com/DS4SD/docling
5
+ Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
7
7
  Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
8
8
  Author: Christoph Auer
@@ -28,9 +28,9 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.23.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
- Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
33
+ Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
35
35
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
36
36
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -42,8 +42,10 @@ Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (ex
42
42
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
43
43
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
44
44
  Requires-Dist: pillow (>=10.0.0,<12.0.0)
45
+ Requires-Dist: pluggy (>=1.0.0,<2.0.0)
45
46
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
46
47
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
48
+ Requires-Dist: pylatexenc (>=2.10,<3.0)
47
49
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
48
50
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
49
51
  Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
@@ -57,12 +59,12 @@ Requires-Dist: tqdm (>=4.65.0,<5.0.0)
57
59
  Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
58
60
  Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
59
61
  Requires-Dist: typer (>=0.12.5,<0.13.0)
60
- Project-URL: Repository, https://github.com/DS4SD/docling
62
+ Project-URL: Repository, https://github.com/docling-project/docling
61
63
  Description-Content-Type: text/markdown
62
64
 
63
65
  <p align="center">
64
- <a href="https://github.com/ds4sd/docling">
65
- <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
66
+ <a href="https://github.com/docling-project/docling">
67
+ <img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
66
68
  </a>
67
69
  </p>
68
70
 
@@ -73,7 +75,7 @@ Description-Content-Type: text/markdown
73
75
  </p>
74
76
 
75
77
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
76
- [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
78
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/)
77
79
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
78
80
  [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
79
81
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -81,8 +83,9 @@ Description-Content-Type: text/markdown
81
83
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
82
84
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
83
85
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
84
- [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
86
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
85
87
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
88
+ [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
86
89
 
87
90
  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
88
91
 
@@ -113,7 +116,7 @@ pip install docling
113
116
 
114
117
  Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectures.
115
118
 
116
- More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
119
+ More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs.
117
120
 
118
121
  ## Getting started
119
122
 
@@ -128,28 +131,54 @@ result = converter.convert(source)
128
131
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
129
132
  ```
130
133
 
131
- More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
134
+ More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
132
135
  the docs.
133
136
 
134
137
  ## Documentation
135
138
 
136
- Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
139
+ Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
137
140
  installation, usage, concepts, recipes, extensions, and more.
138
141
 
139
142
  ## Examples
140
143
 
141
- Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
144
+ Go hands-on with our [examples](https://docling-project.github.io/docling/examples/),
142
145
  demonstrating how to address different application use cases with Docling.
143
146
 
144
147
  ## Integrations
145
148
 
146
149
  To further accelerate your AI application development, check out Docling's native
147
- [integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
150
+ [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
148
151
  and tools.
149
152
 
153
+ ## Apify Actor
154
+
155
+ <a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
156
+
157
+ You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on the Apify platform. Simply provide a document URL and get the processed result:
158
+
159
+ ```bash
160
+ apify call vancura/docling -i '{
161
+ "options": {
162
+ "to_formats": ["md", "json", "html", "text", "doctags"]
163
+ },
164
+ "http_sources": [
165
+ {"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
166
+ {"url": "https://arxiv.org/pdf/2408.09869"}
167
+ ]
168
+ }'
169
+ ```
170
+
171
+ The Actor stores results in:
172
+
173
+ * Processed document in key-value store (`OUTPUT_RESULT`)
174
+ * Processing logs (`DOCLING_LOG`)
175
+ * Dataset record with result URL and status
176
+
177
+ Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
178
+
150
179
  ## Get help and support
151
180
 
152
- Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
181
+ Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
153
182
 
154
183
  ## Technical report
155
184
 
@@ -157,7 +186,7 @@ For more details on Docling's inner workings, check out the [Docling Technical R
157
186
 
158
187
  ## Contributing
159
188
 
160
- Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
189
+ Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details.
161
190
 
162
191
  ## References
163
192
 
@@ -185,7 +214,7 @@ For individual model usage, please refer to the model licenses found in the orig
185
214
 
186
215
  Docling has been brought to you by IBM.
187
216
 
188
- [supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
189
- [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
190
- [integrations]: https://ds4sd.github.io/docling/integrations/
217
+ [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
218
+ [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
219
+ [integrations]: https://docling-project.github.io/docling/integrations/
191
220
 
@@ -1,6 +1,6 @@
1
1
  <p align="center">
2
- <a href="https://github.com/ds4sd/docling">
3
- <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
2
+ <a href="https://github.com/docling-project/docling">
3
+ <img loading="lazy" alt="Docling" src="https://github.com/docling-project/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
4
4
  </a>
5
5
  </p>
6
6
 
@@ -11,7 +11,7 @@
11
11
  </p>
12
12
 
13
13
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
14
- [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
14
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/)
15
15
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
16
16
  [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
17
17
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
@@ -19,8 +19,9 @@
19
19
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
20
20
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
21
21
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
22
- [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
22
+ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
23
23
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
24
+ [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
24
25
 
25
26
  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
26
27
 
@@ -51,7 +52,7 @@ pip install docling
51
52
 
52
53
  Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectures.
53
54
 
54
- More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
55
+ More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs.
55
56
 
56
57
  ## Getting started
57
58
 
@@ -66,28 +67,54 @@ result = converter.convert(source)
66
67
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
67
68
  ```
68
69
 
69
- More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
70
+ More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
70
71
  the docs.
71
72
 
72
73
  ## Documentation
73
74
 
74
- Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
75
+ Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
75
76
  installation, usage, concepts, recipes, extensions, and more.
76
77
 
77
78
  ## Examples
78
79
 
79
- Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
80
+ Go hands-on with our [examples](https://docling-project.github.io/docling/examples/),
80
81
  demonstrating how to address different application use cases with Docling.
81
82
 
82
83
  ## Integrations
83
84
 
84
85
  To further accelerate your AI application development, check out Docling's native
85
- [integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
86
+ [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
86
87
  and tools.
87
88
 
89
+ ## Apify Actor
90
+
91
+ <a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
92
+
93
+ You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on the Apify platform. Simply provide a document URL and get the processed result:
94
+
95
+ ```bash
96
+ apify call vancura/docling -i '{
97
+ "options": {
98
+ "to_formats": ["md", "json", "html", "text", "doctags"]
99
+ },
100
+ "http_sources": [
101
+ {"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
102
+ {"url": "https://arxiv.org/pdf/2408.09869"}
103
+ ]
104
+ }'
105
+ ```
106
+
107
+ The Actor stores results in:
108
+
109
+ * Processed document in key-value store (`OUTPUT_RESULT`)
110
+ * Processing logs (`DOCLING_LOG`)
111
+ * Dataset record with result URL and status
112
+
113
+ Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
114
+
88
115
  ## Get help and support
89
116
 
90
- Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
117
+ Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
91
118
 
92
119
  ## Technical report
93
120
 
@@ -95,7 +122,7 @@ For more details on Docling's inner workings, check out the [Docling Technical R
95
122
 
96
123
  ## Contributing
97
124
 
98
- Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
125
+ Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details.
99
126
 
100
127
  ## References
101
128
 
@@ -123,6 +150,6 @@ For individual model usage, please refer to the model licenses found in the orig
123
150
 
124
151
  Docling has been brought to you by IBM.
125
152
 
126
- [supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
127
- [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
128
- [integrations]: https://ds4sd.github.io/docling/integrations/
153
+ [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
154
+ [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
155
+ [integrations]: https://docling-project.github.io/docling/integrations/
@@ -380,7 +380,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
380
380
  end_row_offset_idx=row_idx + row_span,
381
381
  start_col_offset_idx=col_idx,
382
382
  end_col_offset_idx=col_idx + col_span,
383
- col_header=False,
383
+ column_header=row_idx == 0,
384
384
  row_header=False,
385
385
  )
386
386
  data.table_cells.append(cell)
@@ -111,7 +111,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
111
111
  end_row_offset_idx=row_idx + 1,
112
112
  start_col_offset_idx=col_idx,
113
113
  end_col_offset_idx=col_idx + 1,
114
- col_header=row_idx == 0, # First row as header
114
+ column_header=row_idx == 0, # First row as header
115
115
  row_header=False,
116
116
  )
117
117
  table_data.table_cells.append(cell)
@@ -6,12 +6,12 @@ from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
+ from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
9
10
  from docling_parse.pdf_parsers import pdf_parser_v1
10
11
  from PIL import Image, ImageDraw
11
12
  from pypdfium2 import PdfPage
12
13
 
13
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import Cell
15
15
  from docling.datamodel.document import InputDocument
16
16
 
17
17
  _log = logging.getLogger(__name__)
@@ -68,8 +68,11 @@ class DoclingParsePageBackend(PdfPageBackend):
68
68
 
69
69
  return text_piece
70
70
 
71
- def get_text_cells(self) -> Iterable[Cell]:
72
- cells: List[Cell] = []
71
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
72
+ return None
73
+
74
+ def get_text_cells(self) -> Iterable[TextCell]:
75
+ cells: List[TextCell] = []
73
76
  cell_counter = 0
74
77
 
75
78
  if not self.valid:
@@ -91,19 +94,24 @@ class DoclingParsePageBackend(PdfPageBackend):
91
94
 
92
95
  text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
93
96
  cells.append(
94
- Cell(
95
- id=cell_counter,
97
+ TextCell(
98
+ index=cell_counter,
96
99
  text=text_piece,
97
- bbox=BoundingBox(
98
- # l=x0, b=y0, r=x1, t=y1,
99
- l=x0 * page_size.width / parser_width,
100
- b=y0 * page_size.height / parser_height,
101
- r=x1 * page_size.width / parser_width,
102
- t=y1 * page_size.height / parser_height,
103
- coord_origin=CoordOrigin.BOTTOMLEFT,
100
+ orig=text_piece,
101
+ from_ocr=False,
102
+ rect=BoundingRectangle.from_bounding_box(
103
+ BoundingBox(
104
+ # l=x0, b=y0, r=x1, t=y1,
105
+ l=x0 * page_size.width / parser_width,
106
+ b=y0 * page_size.height / parser_height,
107
+ r=x1 * page_size.width / parser_width,
108
+ t=y1 * page_size.height / parser_height,
109
+ coord_origin=CoordOrigin.BOTTOMLEFT,
110
+ )
104
111
  ).to_top_left_origin(page_size.height),
105
112
  )
106
113
  )
114
+
107
115
  cell_counter += 1
108
116
 
109
117
  def draw_clusters_and_cells():
@@ -112,7 +120,7 @@ class DoclingParsePageBackend(PdfPageBackend):
112
120
  ) # make new image to avoid drawing on the saved ones
113
121
  draw = ImageDraw.Draw(image)
114
122
  for c in cells:
115
- x0, y0, x1, y1 = c.bbox.as_tuple()
123
+ x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
116
124
  cell_color = (
117
125
  random.randint(30, 140),
118
126
  random.randint(30, 140),
@@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
9
10
  from docling_parse.pdf_parsers import pdf_parser_v2
10
11
  from PIL import Image, ImageDraw
11
12
  from pypdfium2 import PdfPage
12
13
 
13
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import Cell, Size
15
+ from docling.datamodel.base_models import Size
15
16
  from docling.utils.locks import pypdfium2_lock
16
17
 
17
18
  if TYPE_CHECKING:
@@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
78
79
 
79
80
  return text_piece
80
81
 
81
- def get_text_cells(self) -> Iterable[Cell]:
82
- cells: List[Cell] = []
82
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
83
+ return None
84
+
85
+ def get_text_cells(self) -> Iterable[TextCell]:
86
+ cells: List[TextCell] = []
83
87
  cell_counter = 0
84
88
 
85
89
  if not self.valid:
@@ -106,16 +110,20 @@ class DoclingParseV2PageBackend(PdfPageBackend):
106
110
 
107
111
  text_piece = cell_data[cells_header.index("text")]
108
112
  cells.append(
109
- Cell(
110
- id=cell_counter,
113
+ TextCell(
114
+ index=cell_counter,
111
115
  text=text_piece,
112
- bbox=BoundingBox(
113
- # l=x0, b=y0, r=x1, t=y1,
114
- l=x0 * page_size.width / parser_width,
115
- b=y0 * page_size.height / parser_height,
116
- r=x1 * page_size.width / parser_width,
117
- t=y1 * page_size.height / parser_height,
118
- coord_origin=CoordOrigin.BOTTOMLEFT,
116
+ orig=text_piece,
117
+ from_ocr=False,
118
+ rect=BoundingRectangle.from_bounding_box(
119
+ BoundingBox(
120
+ # l=x0, b=y0, r=x1, t=y1,
121
+ l=x0 * page_size.width / parser_width,
122
+ b=y0 * page_size.height / parser_height,
123
+ r=x1 * page_size.width / parser_width,
124
+ t=y1 * page_size.height / parser_height,
125
+ coord_origin=CoordOrigin.BOTTOMLEFT,
126
+ )
119
127
  ).to_top_left_origin(page_size.height),
120
128
  )
121
129
  )
@@ -0,0 +1,185 @@
1
+ import logging
2
+ import random
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
+
7
+ import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
10
+ from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
11
+ from PIL import Image, ImageDraw
12
+ from pypdfium2 import PdfPage
13
+
14
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
+ from docling.datamodel.base_models import Size
16
+ from docling.utils.locks import pypdfium2_lock
17
+
18
+ if TYPE_CHECKING:
19
+ from docling.datamodel.document import InputDocument
20
+
21
+ _log = logging.getLogger(__name__)
22
+
23
+
24
class DoclingParseV4PageBackend(PdfPageBackend):
    """Page backend combining a docling-parse v4 page with a pypdfium2 page.

    Text cells, bitmap resources and the page dimension are read from the
    parsed ``SegmentedPdfPage`` (``self._dpage``); page images are rendered
    from the pypdfium2 page object (``self._ppage``).
    """

    def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
        """Store both page handles and record whether parsing produced a page."""
        self._ppage = page_obj
        self._dpage = parsed_page
        self.valid = parsed_page is not None

    def is_valid(self) -> bool:
        """Return True if a parsed page was supplied at construction time."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all textline cells that mostly lie inside ``bbox``.

        A cell contributes when more than half of its own area intersects
        ``bbox``. Contributions are joined with single spaces in cell order.

        NOTE(review): cell boxes are converted to top-left origin before the
        intersection, so ``bbox`` is presumably expected in top-left origin
        as well -- confirm against callers.
        """
        text_piece = ""
        page_size = self.get_size()

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for cell in self._dpage.textline_cells:
            cell_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page_size.height)
                .scaled(scale)
            )

            cell_area = cell_bbox.area()
            if cell_area <= 0:
                # Degenerate (zero-area) cells cannot overlap anything and
                # would otherwise trigger a division by zero below.
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell.text

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        """Return the underlying parsed page object."""
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return the page's textline cells after converting them to top-left origin.

        The result of ``to_top_left_origin`` is not captured, so this relies
        on the cells being converted in place; the same list held by the
        parsed page is returned.
        """
        page_size = self.get_size()

        # Plain loop: the call is made purely for its side effect.
        for cell in self._dpage.textline_cells:
            cell.to_top_left_origin(page_size.height)

        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the scaled, top-left-origin bounding boxes of bitmap resources.

        Only bitmaps whose (unscaled) area exceeds ``AREA_THRESHOLD`` are
        yielded.
        """
        AREA_THRESHOLD = 0  # 32 * 32

        # The page height is loop-invariant; look it up once.
        page_height = self.get_size().height

        for img in self._dpage.bitmap_resources:
            cropbox = img.rect.to_bounding_box().to_top_left_origin(page_height)

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page (or a crop of it) as a PIL image at ``scale``.

        Without ``cropbox`` the whole page is rendered. The page is rendered
        at 1.5x the requested scale and then resized down to the target size.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            # Full page: nothing is cut away on any side.
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # Work on a copy so the caller's cropbox is not modified below.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            # Turn the right/top coordinates into distances from the
            # right/top page edges, as passed to ``render(crop=...)``.
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page size taken from the parsed page's dimension."""
        return Size(
            width=self._dpage.dimension.width,
            height=self._dpage.dimension.height,
        )

    def unload(self):
        """Drop the references to both underlying page objects."""
        self._ppage = None
        self._dpage = None
136
+
137
+
138
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
    """Document backend that opens a PDF with both pypdfium2 and docling-parse v4."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document with pypdfium2 and parse it with docling-parse.

        Raises:
            RuntimeError: if the docling-parse loader returns ``None``.
        """
        super().__init__(in_doc, path_or_stream)

        # pypdfium2 access is serialized through the shared lock.
        with pypdfium2_lock:
            self._pdoc = pdfium.PdfDocument(self.path_or_stream)

        self.parser = DoclingPdfParser(loglevel="fatal")
        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)

        if self.dp_doc is None:
            raise RuntimeError(
                f"docling-parse v4 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the page count reported by docling-parse.

        The pypdfium2 count is compared against it and a mismatch is logged
        as an error; the docling-parse value is returned either way.
        """
        # return len(self._pdoc)  # To be replaced with docling-parse API
        pdfium_pages = len(self._pdoc)
        parsed_pages = self.dp_doc.number_of_pages()

        if pdfium_pages != parsed_pages:
            _log.error(f"Inconsistent number of pages: {pdfium_pages}!={parsed_pages}")

        return parsed_pages

    def load_page(
        self, page_no: int, create_words: bool = True, create_textlines: bool = True
    ) -> DoclingParseV4PageBackend:
        """Build the page backend for the zero-based index ``page_no``.

        docling-parse is queried with ``page_no + 1`` while pypdfium2 is
        indexed with ``page_no`` directly.
        """
        with pypdfium2_lock:
            parsed_page = self.dp_doc.get_page(
                page_no + 1,
                create_words=create_words,
                create_textlines=create_textlines,
            )
            pdfium_page = self._pdoc[page_no]
            return DoclingParseV4PageBackend(parsed_page, pdfium_page)

    def is_valid(self) -> bool:
        """Return True when the document reports at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release the docling-parse document and close the pypdfium2 document."""
        super().unload()
        self.dp_doc.unload()

        with pypdfium2_lock:
            self._pdoc.close()

        self._pdoc = None