docling 1.19.1__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling-2.0.0/PKG-INFO +149 -0
- docling-2.0.0/README.md +98 -0
- docling-2.0.0/docling/backend/abstract_backend.py +63 -0
- {docling-1.19.1 → docling-2.0.0}/docling/backend/docling_parse_backend.py +16 -12
- docling-2.0.0/docling/backend/docling_parse_v2_backend.py +240 -0
- docling-2.0.0/docling/backend/html_backend.py +425 -0
- docling-2.0.0/docling/backend/mspowerpoint_backend.py +375 -0
- docling-2.0.0/docling/backend/msword_backend.py +509 -0
- docling-2.0.0/docling/backend/pdf_backend.py +78 -0
- {docling-1.19.1 → docling-2.0.0}/docling/backend/pypdfium2_backend.py +15 -10
- {docling-1.19.1 → docling-2.0.0}/docling/cli/main.py +61 -60
- docling-2.0.0/docling/datamodel/base_models.py +204 -0
- docling-2.0.0/docling/datamodel/document.py +507 -0
- {docling-1.19.1 → docling-2.0.0}/docling/datamodel/pipeline_options.py +13 -0
- {docling-1.19.1 → docling-2.0.0}/docling/datamodel/settings.py +1 -0
- docling-2.0.0/docling/document_converter.py +260 -0
- docling-2.0.0/docling/models/base_model.py +25 -0
- {docling-1.19.1 → docling-2.0.0}/docling/models/base_ocr_model.py +10 -5
- docling-2.0.0/docling/models/ds_glm_model.py +275 -0
- {docling-1.19.1 → docling-2.0.0}/docling/models/easyocr_model.py +4 -1
- {docling-1.19.1 → docling-2.0.0}/docling/models/layout_model.py +73 -61
- {docling-1.19.1 → docling-2.0.0}/docling/models/page_assemble_model.py +21 -5
- docling-2.0.0/docling/models/page_preprocessing_model.py +57 -0
- {docling-1.19.1 → docling-2.0.0}/docling/models/table_structure_model.py +34 -32
- {docling-1.19.1 → docling-2.0.0}/docling/models/tesseract_ocr_cli_model.py +8 -5
- {docling-1.19.1 → docling-2.0.0}/docling/models/tesseract_ocr_model.py +8 -5
- docling-2.0.0/docling/pipeline/base_pipeline.py +190 -0
- docling-2.0.0/docling/pipeline/simple_pipeline.py +59 -0
- docling-2.0.0/docling/pipeline/standard_pdf_pipeline.py +198 -0
- {docling-1.19.1 → docling-2.0.0}/docling/utils/export.py +4 -3
- {docling-1.19.1 → docling-2.0.0}/docling/utils/layout_utils.py +17 -11
- {docling-1.19.1 → docling-2.0.0}/pyproject.toml +26 -11
- docling-1.19.1/PKG-INFO +0 -380
- docling-1.19.1/README.md +0 -332
- docling-1.19.1/docling/backend/abstract_backend.py +0 -68
- docling-1.19.1/docling/datamodel/base_models.py +0 -324
- docling-1.19.1/docling/datamodel/document.py +0 -461
- docling-1.19.1/docling/document_converter.py +0 -297
- docling-1.19.1/docling/models/ds_glm_model.py +0 -86
- docling-1.19.1/docling/pipeline/base_model_pipeline.py +0 -18
- docling-1.19.1/docling/pipeline/standard_model_pipeline.py +0 -66
- {docling-1.19.1 → docling-2.0.0}/LICENSE +0 -0
- {docling-1.19.1 → docling-2.0.0}/docling/__init__.py +0 -0
- {docling-1.19.1 → docling-2.0.0}/docling/backend/__init__.py +0 -0
- {docling-1.19.1 → docling-2.0.0}/docling/cli/__init__.py +0 -0
- {docling-1.19.1 → docling-2.0.0}/docling/datamodel/__init__.py +0 -0
- {docling-1.19.1 → docling-2.0.0}/docling/models/__init__.py +0 -0
- {docling-1.19.1 → docling-2.0.0}/docling/pipeline/__init__.py +0 -0
- {docling-1.19.1 → docling-2.0.0}/docling/utils/__init__.py +0 -0
- {docling-1.19.1 → docling-2.0.0}/docling/utils/utils.py +0 -0
docling-2.0.0/PKG-INFO
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: docling
|
3
|
+
Version: 2.0.0
|
4
|
+
Summary: Docling PDF conversion package
|
5
|
+
Home-page: https://github.com/DS4SD/docling
|
6
|
+
License: MIT
|
7
|
+
Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
|
8
|
+
Author: Christoph Auer
|
9
|
+
Author-email: cau@zurich.ibm.com
|
10
|
+
Requires-Python: >=3.10,<4.0
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
15
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
16
|
+
Classifier: Operating System :: POSIX :: Linux
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
+
Provides-Extra: tesserocr
|
23
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
24
|
+
Requires-Dist: certifi (>=2024.7.4)
|
25
|
+
Requires-Dist: deepsearch-glm (>=0.25.0,<0.26.0)
|
26
|
+
Requires-Dist: docling-core (>=2.0.0,<3.0.0)
|
27
|
+
Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
|
28
|
+
Requires-Dist: docling-parse (>=1.6.0,<2.0.0)
|
29
|
+
Requires-Dist: easyocr (>=1.7,<2.0)
|
30
|
+
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
|
+
Requires-Dist: huggingface_hub (>=0.23,<1)
|
32
|
+
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
33
|
+
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
34
|
+
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
35
|
+
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
36
|
+
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
37
|
+
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
38
|
+
Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
|
39
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
40
|
+
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
41
|
+
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
42
|
+
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
43
|
+
Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
44
|
+
Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
45
|
+
Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
46
|
+
Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
47
|
+
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
48
|
+
Project-URL: Repository, https://github.com/DS4SD/docling
|
49
|
+
Description-Content-Type: text/markdown
|
50
|
+
|
51
|
+
<p align="center">
|
52
|
+
<a href="https://github.com/ds4sd/docling">
|
53
|
+
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
|
54
|
+
</a>
|
55
|
+
</p>
|
56
|
+
|
57
|
+
# Docling
|
58
|
+
|
59
|
+
[arXiv](https://arxiv.org/abs/2408.09869)
|
60
|
+
[Docs](https://ds4sd.github.io/docling/)
|
61
|
+
[PyPI](https://pypi.org/project/docling/)
|
62
|
+

|
63
|
+
[Poetry](https://python-poetry.org/)
|
64
|
+
[Black](https://github.com/psf/black)
|
65
|
+
[isort](https://pycqa.github.io/isort/)
|
66
|
+
[Pydantic](https://pydantic.dev)
|
67
|
+
[pre-commit](https://github.com/pre-commit/pre-commit)
|
68
|
+
[MIT License](https://opensource.org/licenses/MIT)
|
69
|
+
|
70
|
+
Docling parses documents and exports them to the desired format with ease and speed.
|
71
|
+
|
72
|
+
## Features
|
73
|
+
|
74
|
+
* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
|
75
|
+
* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
|
76
|
+
* 📝 Metadata extraction, including title, authors, references & language
|
77
|
+
* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
|
78
|
+
* 🔍 OCR support for scanned PDFs
|
79
|
+
* 💻 Simple and convenient CLI
|
80
|
+
|
81
|
+
Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
|
82
|
+
|
83
|
+
|
84
|
+
## Installation
|
85
|
+
|
86
|
+
To use Docling, simply install `docling` from your package manager, e.g. pip:
|
87
|
+
```bash
|
88
|
+
pip install docling
|
89
|
+
```
|
90
|
+
|
91
|
+
Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
|
92
|
+
|
93
|
+
More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
|
94
|
+
|
95
|
+
## Getting started
|
96
|
+
|
97
|
+
To convert individual documents, use `convert()`, for example:
|
98
|
+
|
99
|
+
```python
|
100
|
+
from docling.document_converter import DocumentConverter
|
101
|
+
|
102
|
+
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
103
|
+
converter = DocumentConverter()
|
104
|
+
result = converter.convert(source)
|
105
|
+
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
106
|
+
print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
|
107
|
+
```
|
108
|
+
|
109
|
+
|
110
|
+
Check out [Getting started](https://ds4sd.github.io/docling/).
|
111
|
+
You will find lots of tuning options to leverage all the advanced capabilities.
|
112
|
+
|
113
|
+
|
114
|
+
## Get help and support
|
115
|
+
|
116
|
+
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
117
|
+
|
118
|
+
|
119
|
+
## Technical report
|
120
|
+
|
121
|
+
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
122
|
+
|
123
|
+
## Contributing
|
124
|
+
|
125
|
+
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
126
|
+
|
127
|
+
|
128
|
+
## References
|
129
|
+
|
130
|
+
If you use Docling in your projects, please consider citing the following:
|
131
|
+
|
132
|
+
```bib
|
133
|
+
@techreport{Docling,
|
134
|
+
author = {Deep Search Team},
|
135
|
+
month = {8},
|
136
|
+
title = {Docling Technical Report},
|
137
|
+
url = {https://arxiv.org/abs/2408.09869},
|
138
|
+
eprint = {2408.09869},
|
139
|
+
doi = {10.48550/arXiv.2408.09869},
|
140
|
+
version = {1.0.0},
|
141
|
+
year = {2024}
|
142
|
+
}
|
143
|
+
```
|
144
|
+
|
145
|
+
## License
|
146
|
+
|
147
|
+
The Docling codebase is under MIT license.
|
148
|
+
For individual model usage, please refer to the model licenses found in the original packages.
|
149
|
+
|
docling-2.0.0/README.md
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
<p align="center">
|
2
|
+
<a href="https://github.com/ds4sd/docling">
|
3
|
+
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
|
4
|
+
</a>
|
5
|
+
</p>
|
6
|
+
|
7
|
+
# Docling
|
8
|
+
|
9
|
+
[arXiv](https://arxiv.org/abs/2408.09869)
|
10
|
+
[Docs](https://ds4sd.github.io/docling/)
|
11
|
+
[PyPI](https://pypi.org/project/docling/)
|
12
|
+

|
13
|
+
[Poetry](https://python-poetry.org/)
|
14
|
+
[Black](https://github.com/psf/black)
|
15
|
+
[isort](https://pycqa.github.io/isort/)
|
16
|
+
[Pydantic](https://pydantic.dev)
|
17
|
+
[pre-commit](https://github.com/pre-commit/pre-commit)
|
18
|
+
[MIT License](https://opensource.org/licenses/MIT)
|
19
|
+
|
20
|
+
Docling parses documents and exports them to the desired format with ease and speed.
|
21
|
+
|
22
|
+
## Features
|
23
|
+
|
24
|
+
* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.)
|
25
|
+
* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
|
26
|
+
* 📝 Metadata extraction, including title, authors, references & language
|
27
|
+
* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
|
28
|
+
* 🔍 OCR support for scanned PDFs
|
29
|
+
* 💻 Simple and convenient CLI
|
30
|
+
|
31
|
+
Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
|
32
|
+
|
33
|
+
|
34
|
+
## Installation
|
35
|
+
|
36
|
+
To use Docling, simply install `docling` from your package manager, e.g. pip:
|
37
|
+
```bash
|
38
|
+
pip install docling
|
39
|
+
```
|
40
|
+
|
41
|
+
Works on macOS, Linux and Windows environments, on both x86_64 and arm64 architectures.
|
42
|
+
|
43
|
+
More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
|
44
|
+
|
45
|
+
## Getting started
|
46
|
+
|
47
|
+
To convert individual documents, use `convert()`, for example:
|
48
|
+
|
49
|
+
```python
|
50
|
+
from docling.document_converter import DocumentConverter
|
51
|
+
|
52
|
+
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
53
|
+
converter = DocumentConverter()
|
54
|
+
result = converter.convert(source)
|
55
|
+
print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
|
56
|
+
print(result.document.export_to_document_tokens()) # output: "<document><title><page_1><loc_20>..."
|
57
|
+
```
|
58
|
+
|
59
|
+
|
60
|
+
Check out [Getting started](https://ds4sd.github.io/docling/).
|
61
|
+
You will find lots of tuning options to leverage all the advanced capabilities.
|
62
|
+
|
63
|
+
|
64
|
+
## Get help and support
|
65
|
+
|
66
|
+
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
67
|
+
|
68
|
+
|
69
|
+
## Technical report
|
70
|
+
|
71
|
+
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
72
|
+
|
73
|
+
## Contributing
|
74
|
+
|
75
|
+
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
76
|
+
|
77
|
+
|
78
|
+
## References
|
79
|
+
|
80
|
+
If you use Docling in your projects, please consider citing the following:
|
81
|
+
|
82
|
+
```bib
|
83
|
+
@techreport{Docling,
|
84
|
+
author = {Deep Search Team},
|
85
|
+
month = {8},
|
86
|
+
title = {Docling Technical Report},
|
87
|
+
url = {https://arxiv.org/abs/2408.09869},
|
88
|
+
eprint = {2408.09869},
|
89
|
+
doi = {10.48550/arXiv.2408.09869},
|
90
|
+
version = {1.0.0},
|
91
|
+
year = {2024}
|
92
|
+
}
|
93
|
+
```
|
94
|
+
|
95
|
+
## License
|
96
|
+
|
97
|
+
The Docling codebase is under MIT license.
|
98
|
+
For individual model usage, please refer to the model licenses found in the original packages.
|
@@ -0,0 +1,63 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import TYPE_CHECKING, Set, Union
|
5
|
+
|
6
|
+
from docling_core.types.doc import DoclingDocument
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from docling.datamodel.base_models import InputFormat
|
10
|
+
from docling.datamodel.document import InputDocument
|
11
|
+
|
12
|
+
|
13
|
+
class AbstractDocumentBackend(ABC):
|
14
|
+
@abstractmethod
|
15
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
16
|
+
self.path_or_stream = path_or_stream
|
17
|
+
self.document_hash = in_doc.document_hash
|
18
|
+
self.input_format = in_doc.format
|
19
|
+
|
20
|
+
@abstractmethod
|
21
|
+
def is_valid(self) -> bool:
|
22
|
+
pass
|
23
|
+
|
24
|
+
@classmethod
|
25
|
+
@abstractmethod
|
26
|
+
def supports_pagination(cls) -> bool:
|
27
|
+
pass
|
28
|
+
|
29
|
+
@abstractmethod
|
30
|
+
def unload(self):
|
31
|
+
if isinstance(self.path_or_stream, BytesIO):
|
32
|
+
self.path_or_stream.close()
|
33
|
+
|
34
|
+
self.path_or_stream = None
|
35
|
+
|
36
|
+
@classmethod
|
37
|
+
@abstractmethod
|
38
|
+
def supported_formats(cls) -> Set["InputFormat"]:
|
39
|
+
pass
|
40
|
+
|
41
|
+
|
42
|
+
class PaginatedDocumentBackend(AbstractDocumentBackend):
|
43
|
+
"""PaginatedDocumentBackend.
|
44
|
+
|
45
|
+
A paginated document backend is a backend that exposes the number of pages
|
46
|
+
of its document through page_count().
|
47
|
+
"""
|
48
|
+
|
49
|
+
@abstractmethod
|
50
|
+
def page_count(self) -> int:
|
51
|
+
pass
|
52
|
+
|
53
|
+
|
54
|
+
class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
55
|
+
"""DeclarativeDocumentBackend.
|
56
|
+
|
57
|
+
A declarative document backend is a backend that can transform to DoclingDocument
|
58
|
+
straight without a recognition pipeline.
|
59
|
+
"""
|
60
|
+
|
61
|
+
@abstractmethod
|
62
|
+
def convert(self) -> DoclingDocument:
|
63
|
+
pass
|
@@ -5,12 +5,14 @@ from pathlib import Path
|
|
5
5
|
from typing import Iterable, List, Optional, Union
|
6
6
|
|
7
7
|
import pypdfium2 as pdfium
|
8
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
8
9
|
from docling_parse.docling_parse import pdf_parser
|
9
10
|
from PIL import Image, ImageDraw
|
10
11
|
from pypdfium2 import PdfPage
|
11
12
|
|
12
|
-
from docling.backend.
|
13
|
-
from docling.datamodel.base_models import
|
13
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
14
|
+
from docling.datamodel.base_models import Cell
|
15
|
+
from docling.datamodel.document import InputDocument
|
14
16
|
|
15
17
|
_log = logging.getLogger(__name__)
|
16
18
|
|
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
177
179
|
|
178
180
|
return image
|
179
181
|
|
180
|
-
def get_size(self) ->
|
181
|
-
return
|
182
|
+
def get_size(self) -> Size:
|
183
|
+
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
182
184
|
|
183
185
|
def unload(self):
|
184
186
|
self._ppage = None
|
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
186
188
|
|
187
189
|
|
188
190
|
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
189
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]
|
190
|
-
super().__init__(
|
191
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
192
|
+
super().__init__(in_doc, path_or_stream)
|
191
193
|
|
192
|
-
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
194
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
193
195
|
self.parser = pdf_parser()
|
194
196
|
|
195
197
|
success = False
|
196
|
-
if isinstance(path_or_stream, BytesIO):
|
198
|
+
if isinstance(self.path_or_stream, BytesIO):
|
197
199
|
success = self.parser.load_document_from_bytesio(
|
198
|
-
document_hash, path_or_stream
|
200
|
+
self.document_hash, self.path_or_stream
|
201
|
+
)
|
202
|
+
elif isinstance(self.path_or_stream, Path):
|
203
|
+
success = self.parser.load_document(
|
204
|
+
self.document_hash, str(self.path_or_stream)
|
199
205
|
)
|
200
|
-
elif isinstance(path_or_stream, Path):
|
201
|
-
success = self.parser.load_document(document_hash, str(path_or_stream))
|
202
206
|
|
203
207
|
if not success:
|
204
208
|
raise RuntimeError(
|
205
|
-
f"docling-parse could not load document {document_hash}."
|
209
|
+
f"docling-parse could not load document with hash {self.document_hash}."
|
206
210
|
)
|
207
211
|
|
208
212
|
def page_count(self) -> int:
|
@@ -0,0 +1,240 @@
|
|
1
|
+
import logging
|
2
|
+
import random
|
3
|
+
from io import BytesIO
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
6
|
+
|
7
|
+
import pypdfium2 as pdfium
|
8
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
+
from docling_parse.docling_parse import pdf_parser_v2
|
10
|
+
from PIL import Image, ImageDraw
|
11
|
+
from pypdfium2 import PdfPage
|
12
|
+
|
13
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
14
|
+
from docling.datamodel.base_models import Cell, Size
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from docling.datamodel.document import InputDocument
|
18
|
+
|
19
|
+
_log = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
class DoclingParseV2PageBackend(PdfPageBackend):
|
23
|
+
def __init__(
|
24
|
+
self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
|
25
|
+
):
|
26
|
+
self._ppage = page_obj
|
27
|
+
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
28
|
+
|
29
|
+
self.valid = "pages" in parsed_page
|
30
|
+
if self.valid:
|
31
|
+
self._dpage = parsed_page["pages"][page_no]
|
32
|
+
else:
|
33
|
+
_log.info(
|
34
|
+
f"An error occured when loading page {page_no} of document {document_hash}."
|
35
|
+
)
|
36
|
+
|
37
|
+
def is_valid(self) -> bool:
|
38
|
+
return self.valid
|
39
|
+
|
40
|
+
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
41
|
+
if not self.valid:
|
42
|
+
return ""
|
43
|
+
# Find intersecting cells on the page
|
44
|
+
text_piece = ""
|
45
|
+
page_size = self.get_size()
|
46
|
+
|
47
|
+
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
48
|
+
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
49
|
+
|
50
|
+
scale = (
|
51
|
+
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
52
|
+
)
|
53
|
+
|
54
|
+
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
55
|
+
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
56
|
+
|
57
|
+
for i, cell_data in enumerate(cells_data):
|
58
|
+
x0 = cell_data[cells_header.index("x0")]
|
59
|
+
y0 = cell_data[cells_header.index("y0")]
|
60
|
+
x1 = cell_data[cells_header.index("x1")]
|
61
|
+
y1 = cell_data[cells_header.index("y1")]
|
62
|
+
|
63
|
+
cell_bbox = BoundingBox(
|
64
|
+
l=x0 * scale * page_size.width / parser_width,
|
65
|
+
b=y0 * scale * page_size.height / parser_height,
|
66
|
+
r=x1 * scale * page_size.width / parser_width,
|
67
|
+
t=y1 * scale * page_size.height / parser_height,
|
68
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
69
|
+
).to_top_left_origin(page_height=page_size.height * scale)
|
70
|
+
|
71
|
+
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
72
|
+
|
73
|
+
if overlap_frac > 0.5:
|
74
|
+
if len(text_piece) > 0:
|
75
|
+
text_piece += " "
|
76
|
+
text_piece += cell_data[cells_header.index("text")]
|
77
|
+
|
78
|
+
return text_piece
|
79
|
+
|
80
|
+
def get_text_cells(self) -> Iterable[Cell]:
|
81
|
+
cells: List[Cell] = []
|
82
|
+
cell_counter = 0
|
83
|
+
|
84
|
+
if not self.valid:
|
85
|
+
return cells
|
86
|
+
|
87
|
+
page_size = self.get_size()
|
88
|
+
|
89
|
+
parser_width = self._dpage["sanitized"]["dimension"]["width"]
|
90
|
+
parser_height = self._dpage["sanitized"]["dimension"]["height"]
|
91
|
+
|
92
|
+
cells_data = self._dpage["sanitized"]["cells"]["data"]
|
93
|
+
cells_header = self._dpage["sanitized"]["cells"]["header"]
|
94
|
+
|
95
|
+
for i, cell_data in enumerate(cells_data):
|
96
|
+
x0 = cell_data[cells_header.index("x0")]
|
97
|
+
y0 = cell_data[cells_header.index("y0")]
|
98
|
+
x1 = cell_data[cells_header.index("x1")]
|
99
|
+
y1 = cell_data[cells_header.index("y1")]
|
100
|
+
|
101
|
+
if x1 < x0:
|
102
|
+
x0, x1 = x1, x0
|
103
|
+
if y1 < y0:
|
104
|
+
y0, y1 = y1, y0
|
105
|
+
|
106
|
+
text_piece = cell_data[cells_header.index("text")]
|
107
|
+
cells.append(
|
108
|
+
Cell(
|
109
|
+
id=cell_counter,
|
110
|
+
text=text_piece,
|
111
|
+
bbox=BoundingBox(
|
112
|
+
# l=x0, b=y0, r=x1, t=y1,
|
113
|
+
l=x0 * page_size.width / parser_width,
|
114
|
+
b=y0 * page_size.height / parser_height,
|
115
|
+
r=x1 * page_size.width / parser_width,
|
116
|
+
t=y1 * page_size.height / parser_height,
|
117
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
118
|
+
).to_top_left_origin(page_size.height),
|
119
|
+
)
|
120
|
+
)
|
121
|
+
cell_counter += 1
|
122
|
+
|
123
|
+
def draw_clusters_and_cells():
|
124
|
+
image = (
|
125
|
+
self.get_page_image()
|
126
|
+
) # make new image to avoid drawing on the saved ones
|
127
|
+
draw = ImageDraw.Draw(image)
|
128
|
+
for c in cells:
|
129
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
130
|
+
cell_color = (
|
131
|
+
random.randint(30, 140),
|
132
|
+
random.randint(30, 140),
|
133
|
+
random.randint(30, 140),
|
134
|
+
)
|
135
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
136
|
+
image.show()
|
137
|
+
|
138
|
+
# draw_clusters_and_cells()
|
139
|
+
|
140
|
+
return cells
|
141
|
+
|
142
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
143
|
+
AREA_THRESHOLD = 32 * 32
|
144
|
+
|
145
|
+
images = self._dpage["sanitized"]["images"]["data"]
|
146
|
+
images_header = self._dpage["sanitized"]["images"]["header"]
|
147
|
+
|
148
|
+
for row in images:
|
149
|
+
x0 = row[images_header.index("x0")]
|
150
|
+
y0 = row[images_header.index("y0")]
|
151
|
+
x1 = row[images_header.index("x1")]
|
152
|
+
y1 = row[images_header.index("y1")]
|
153
|
+
|
154
|
+
cropbox = BoundingBox.from_tuple(
|
155
|
+
(x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
|
156
|
+
).to_top_left_origin(self.get_size().height)
|
157
|
+
|
158
|
+
if cropbox.area() > AREA_THRESHOLD:
|
159
|
+
cropbox = cropbox.scaled(scale=scale)
|
160
|
+
|
161
|
+
yield cropbox
|
162
|
+
|
163
|
+
def get_page_image(
|
164
|
+
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
165
|
+
) -> Image.Image:
|
166
|
+
|
167
|
+
page_size = self.get_size()
|
168
|
+
|
169
|
+
if not cropbox:
|
170
|
+
cropbox = BoundingBox(
|
171
|
+
l=0,
|
172
|
+
r=page_size.width,
|
173
|
+
t=0,
|
174
|
+
b=page_size.height,
|
175
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
176
|
+
)
|
177
|
+
padbox = BoundingBox(
|
178
|
+
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
179
|
+
)
|
180
|
+
else:
|
181
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
182
|
+
padbox.r = page_size.width - padbox.r
|
183
|
+
padbox.t = page_size.height - padbox.t
|
184
|
+
|
185
|
+
image = (
|
186
|
+
self._ppage.render(
|
187
|
+
scale=scale * 1.5,
|
188
|
+
rotation=0, # no additional rotation
|
189
|
+
crop=padbox.as_tuple(),
|
190
|
+
)
|
191
|
+
.to_pil()
|
192
|
+
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
|
193
|
+
) # We resize the image from 1.5x the given scale to make it sharper.
|
194
|
+
|
195
|
+
return image
|
196
|
+
|
197
|
+
def get_size(self) -> Size:
|
198
|
+
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
199
|
+
|
200
|
+
def unload(self):
|
201
|
+
self._ppage = None
|
202
|
+
self._dpage = None
|
203
|
+
|
204
|
+
|
205
|
+
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
206
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
207
|
+
super().__init__(in_doc, path_or_stream)
|
208
|
+
|
209
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
210
|
+
self.parser = pdf_parser_v2("fatal")
|
211
|
+
|
212
|
+
success = False
|
213
|
+
if isinstance(path_or_stream, BytesIO):
|
214
|
+
success = self.parser.load_document_from_bytesio(
|
215
|
+
self.document_hash, path_or_stream
|
216
|
+
)
|
217
|
+
elif isinstance(path_or_stream, Path):
|
218
|
+
success = self.parser.load_document(self.document_hash, str(path_or_stream))
|
219
|
+
|
220
|
+
if not success:
|
221
|
+
raise RuntimeError(
|
222
|
+
f"docling-parse v2 could not load document {self.document_hash}."
|
223
|
+
)
|
224
|
+
|
225
|
+
def page_count(self) -> int:
|
226
|
+
return len(self._pdoc) # To be replaced with docling-parse API
|
227
|
+
|
228
|
+
def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
|
229
|
+
return DoclingParseV2PageBackend(
|
230
|
+
self.parser, self.document_hash, page_no, self._pdoc[page_no]
|
231
|
+
)
|
232
|
+
|
233
|
+
def is_valid(self) -> bool:
|
234
|
+
return self.page_count() > 0
|
235
|
+
|
236
|
+
def unload(self):
|
237
|
+
super().unload()
|
238
|
+
self.parser.unload_document(self.document_hash)
|
239
|
+
self._pdoc.close()
|
240
|
+
self._pdoc = None
|