docling 0.1.2__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. docling-1.5.0/PKG-INFO +192 -0
  2. docling-1.5.0/README.md +153 -0
  3. {docling-0.1.2 → docling-1.5.0}/docling/backend/abstract_backend.py +1 -1
  4. docling-1.5.0/docling/backend/docling_parse_backend.py +187 -0
  5. {docling-0.1.2 → docling-1.5.0}/docling/backend/pypdfium2_backend.py +5 -9
  6. {docling-0.1.2 → docling-1.5.0}/docling/datamodel/base_models.py +68 -11
  7. {docling-0.1.2 → docling-1.5.0}/docling/datamodel/document.py +27 -15
  8. {docling-0.1.2 → docling-1.5.0}/docling/document_converter.py +77 -6
  9. {docling-0.1.2 → docling-1.5.0}/docling/models/easyocr_model.py +1 -1
  10. {docling-0.1.2 → docling-1.5.0}/docling/models/layout_model.py +11 -1
  11. {docling-0.1.2 → docling-1.5.0}/docling/models/page_assemble_model.py +0 -12
  12. {docling-0.1.2 → docling-1.5.0}/docling/models/table_structure_model.py +43 -12
  13. {docling-0.1.2 → docling-1.5.0}/docling/pipeline/standard_model_pipeline.py +1 -1
  14. {docling-0.1.2 → docling-1.5.0}/pyproject.toml +28 -11
  15. docling-0.1.2/PKG-INFO +0 -132
  16. docling-0.1.2/README.md +0 -99
  17. {docling-0.1.2 → docling-1.5.0}/LICENSE +0 -0
  18. {docling-0.1.2 → docling-1.5.0}/docling/__init__.py +0 -0
  19. {docling-0.1.2 → docling-1.5.0}/docling/backend/__init__.py +0 -0
  20. {docling-0.1.2 → docling-1.5.0}/docling/datamodel/__init__.py +0 -0
  21. {docling-0.1.2 → docling-1.5.0}/docling/datamodel/settings.py +0 -0
  22. {docling-0.1.2 → docling-1.5.0}/docling/models/__init__.py +0 -0
  23. {docling-0.1.2 → docling-1.5.0}/docling/models/ds_glm_model.py +0 -0
  24. {docling-0.1.2 → docling-1.5.0}/docling/pipeline/__init__.py +0 -0
  25. {docling-0.1.2 → docling-1.5.0}/docling/pipeline/base_model_pipeline.py +0 -0
  26. {docling-0.1.2 → docling-1.5.0}/docling/utils/__init__.py +0 -0
  27. {docling-0.1.2 → docling-1.5.0}/docling/utils/layout_utils.py +0 -0
  28. {docling-0.1.2 → docling-1.5.0}/docling/utils/utils.py +0 -0
docling-1.5.0/PKG-INFO ADDED
@@ -0,0 +1,192 @@
+ Metadata-Version: 2.1
+ Name: docling
+ Version: 1.5.0
+ Summary: Docling PDF conversion package
+ Home-page: https://github.com/DS4SD/docling
+ License: MIT
+ Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
+ Author: Christoph Auer
+ Author-email: cau@zurich.ibm.com
+ Requires-Python: >=3.10,<4.0
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: POSIX :: Linux
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Provides-Extra: easyocr
+ Provides-Extra: ocr
+ Requires-Dist: certifi (>=2024.7.4)
+ Requires-Dist: deepsearch-glm (>=0.19.0,<1)
+ Requires-Dist: docling-core (>=1.1.2,<2.0.0)
+ Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
+ Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
+ Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
+ Requires-Dist: filetype (>=1.2.0,<2.0.0)
+ Requires-Dist: huggingface_hub (>=0.23,<1)
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
+ Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
+ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
+ Project-URL: Repository, https://github.com/DS4SD/docling
+ Description-Content-Type: text/markdown
+
+ <p align="center">
+   <a href="https://github.com/ds4sd/docling">
+     <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
+   </a>
+ </p>
+
+ # Docling
+
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
+
+ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
+
+ ## Features
+ * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
+ * 📑 Understands detailed page layout, reading order and recovers table structures
+ * 📝 Extracts metadata from the document, such as title, authors, references and language
+ * 🔍 Optionally applies OCR (use with scanned PDFs)
+
+ ## Installation
+
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
+ ```bash
+ pip install docling
+ ```
+
+ > [!NOTE]
+ > Works on macOS and Linux environments. Windows platforms are currently not tested.
+
+ ### Development setup
+
+ To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
+ ```bash
+ poetry install --all-extras
+ ```
+
+ ## Usage
+
+ ### Convert a single document
+
+ To convert individual PDF documents, use `convert_single()`, for example:
+ ```python
+ from docling.document_converter import DocumentConverter
+
+ source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+ converter = DocumentConverter()
+ doc = converter.convert_single(source)
+ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
+ ```
+
+ ### Convert a batch of documents
+
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
+
+ From a local repo clone, you can run it with:
+
+ ```
+ python examples/batch_convert.py
+ ```
+ The output of the above command will be written to `./scratch`.
+
+ ### Adjust pipeline features
+
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+ one can adjust the conversion pipeline and features.
+
+
+ #### Control pipeline options
+
+ You can control whether table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
+ ```python
+ doc_converter = DocumentConverter(
+     artifacts_path=artifacts_path,
+     pipeline_options=PipelineOptions(
+         do_table_structure=False,  # controls if table structure is recovered
+         do_ocr=True,  # controls if OCR is applied (ignores programmatic content)
+     ),
+ )
+ ```
+
+ #### Control table extraction options
+
+ You can control whether table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
+ This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
+
+
+ ```python
+ pipeline_options = PipelineOptions(do_table_structure=True)
+ pipeline_options.table_structure_options.do_cell_matching = False  # uses text cells predicted from table structure model
+
+ doc_converter = DocumentConverter(
+     artifacts_path=artifacts_path,
+     pipeline_options=pipeline_options,
+ )
+ ```
+
+ ### Impose limits on the document size
+
+ You can limit the file size and the number of pages that are processed per document:
+ ```python
+ conv_input = DocumentConversionInput.from_paths(
+     paths=[Path("./test/data/2206.01062.pdf")],
+     limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
+ )
+ ```
+
+ ### Convert from binary PDF streams
+
+ You can convert PDFs from a binary stream instead of from the filesystem as follows:
+ ```python
+ buf = BytesIO(your_binary_stream)
+ docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
+ conv_input = DocumentConversionInput.from_streams(docs)
+ converted_docs = doc_converter.convert(conv_input)
+ ```
+ ### Limit resource usage
+
+ You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default is 4 CPU threads.
+
+
+ ## Contributing
+
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
+
+
+ ## References
+
+ If you use Docling in your projects, please consider citing the following:
+
+ ```bib
+ @techreport{Docling,
+   author = {Deep Search Team},
+   month = {8},
+   title = {{Docling Technical Report}},
+   url = {https://arxiv.org/abs/2408.09869},
+   eprint = {2408.09869},
+   doi = {10.48550/arXiv.2408.09869},
+   version = {1.0.0},
+   year = {2024}
+ }
+ ```
+
+ ## License
+
+ The Docling codebase is under MIT license.
+ For individual model usage, please refer to the model licenses found in the original packages.
+
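The "Convert a batch of documents" section in the package description above defers to `examples/batch_convert.py` without an inline snippet. Below is a minimal sketch of the same flow, assuming the import path for `DocumentConversionInput` used by the bundled examples and assuming the items yielded by `convert()` expose the same `export_to_markdown()` shown for `convert_single()`; treat it as a sketch, not the canonical example.

```python
from pathlib import Path

from docling.datamodel.document import DocumentConversionInput  # assumed import path
from docling.document_converter import DocumentConverter

input_paths = sorted(Path("./test/data").glob("*.pdf"))  # placeholder input folder
conv_input = DocumentConversionInput.from_paths(paths=input_paths)

converter = DocumentConverter()
output_dir = Path("./scratch")
output_dir.mkdir(parents=True, exist_ok=True)

# convert() runs the full pipeline over the batch and yields one result per input.
for idx, converted in enumerate(converter.convert(conv_input)):
    (output_dir / f"doc_{idx}.md").write_text(converted.export_to_markdown())
```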
docling-1.5.0/README.md ADDED
@@ -0,0 +1,153 @@
+ <p align="center">
+   <a href="https://github.com/ds4sd/docling">
+     <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
+   </a>
+ </p>
+
+ # Docling
+
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
+
+ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
+
+ ## Features
+ * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
+ * 📑 Understands detailed page layout, reading order and recovers table structures
+ * 📝 Extracts metadata from the document, such as title, authors, references and language
+ * 🔍 Optionally applies OCR (use with scanned PDFs)
+
+ ## Installation
+
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
+ ```bash
+ pip install docling
+ ```
+
+ > [!NOTE]
+ > Works on macOS and Linux environments. Windows platforms are currently not tested.
+
+ ### Development setup
+
+ To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
+ ```bash
+ poetry install --all-extras
+ ```
+
+ ## Usage
+
+ ### Convert a single document
+
+ To convert individual PDF documents, use `convert_single()`, for example:
+ ```python
+ from docling.document_converter import DocumentConverter
+
+ source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+ converter = DocumentConverter()
+ doc = converter.convert_single(source)
+ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
+ ```
+
+ ### Convert a batch of documents
+
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
+
+ From a local repo clone, you can run it with:
+
+ ```
+ python examples/batch_convert.py
+ ```
+ The output of the above command will be written to `./scratch`.
+
+ ### Adjust pipeline features
+
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+ one can adjust the conversion pipeline and features.
+
+
+ #### Control pipeline options
+
+ You can control whether table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
+ ```python
+ doc_converter = DocumentConverter(
+     artifacts_path=artifacts_path,
+     pipeline_options=PipelineOptions(
+         do_table_structure=False,  # controls if table structure is recovered
+         do_ocr=True,  # controls if OCR is applied (ignores programmatic content)
+     ),
+ )
+ ```
+
+ #### Control table extraction options
+
+ You can control whether table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
+ This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
+
+
+ ```python
+ pipeline_options = PipelineOptions(do_table_structure=True)
+ pipeline_options.table_structure_options.do_cell_matching = False  # uses text cells predicted from table structure model
+
+ doc_converter = DocumentConverter(
+     artifacts_path=artifacts_path,
+     pipeline_options=pipeline_options,
+ )
+ ```
+
+ ### Impose limits on the document size
+
+ You can limit the file size and the number of pages that are processed per document:
+ ```python
+ conv_input = DocumentConversionInput.from_paths(
+     paths=[Path("./test/data/2206.01062.pdf")],
+     limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
+ )
+ ```
+
+ ### Convert from binary PDF streams
+
+ You can convert PDFs from a binary stream instead of from the filesystem as follows:
+ ```python
+ buf = BytesIO(your_binary_stream)
+ docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
+ conv_input = DocumentConversionInput.from_streams(docs)
+ converted_docs = doc_converter.convert(conv_input)
+ ```
+ ### Limit resource usage
+
+ You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default is 4 CPU threads.
+
+
+ ## Contributing
+
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
+
+
+ ## References
+
+ If you use Docling in your projects, please consider citing the following:
+
+ ```bib
+ @techreport{Docling,
+   author = {Deep Search Team},
+   month = {8},
+   title = {{Docling Technical Report}},
+   url = {https://arxiv.org/abs/2408.09869},
+   eprint = {2408.09869},
+   doi = {10.48550/arXiv.2408.09869},
+   version = {1.0.0},
+   year = {2024}
+ }
+ ```
+
+ ## License
+
+ The Docling codebase is under MIT license.
+ For individual model usage, please refer to the model licenses found in the original packages.
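The "Limit resource usage" note above only names the `OMP_NUM_THREADS` variable. A minimal sketch of pinning the thread count from Python follows; the assumption here is that the variable must be exported before Docling's numerical dependencies initialize their thread pools, so it is set before any `docling` import (setting it in the shell before launching Python works the same way).

```python
import os

# Assumption: must be set before docling (and its ML dependencies) are imported,
# otherwise the OpenMP runtime may already have picked the default of 4 threads.
os.environ["OMP_NUM_THREADS"] = "2"

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
doc = converter.convert_single("https://arxiv.org/pdf/2206.01062")
print(doc.export_to_markdown())
```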
{docling-0.1.2 → docling-1.5.0}/docling/backend/abstract_backend.py CHANGED
@@ -35,7 +35,7 @@ class PdfPageBackend(ABC):
 
  class PdfDocumentBackend(ABC):
      @abstractmethod
-     def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+     def __init__(self, path_or_stream: Union[BytesIO, Path]):
          pass
 
      @abstractmethod
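This signature change, mirrored in the concrete backends further below, means a backend instance now wraps exactly one document, passed as a single path or stream rather than an iterable. A minimal sketch against the pypdfium2 backend, with a placeholder file name (in normal use `DocumentConverter` constructs the backend itself):

```python
from io import BytesIO
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

# One backend per document: a filesystem path ...
backend = PyPdfiumDocumentBackend(Path("sample.pdf"))  # placeholder file
print(backend.page_count())

# ... or an in-memory stream, but no longer a list of either.
stream_backend = PyPdfiumDocumentBackend(BytesIO(Path("sample.pdf").read_bytes()))
```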
docling-1.5.0/docling/backend/docling_parse_backend.py ADDED
@@ -0,0 +1,187 @@
+ import logging
+ import random
+ import time
+ from io import BytesIO
+ from pathlib import Path
+ from typing import Iterable, List, Optional, Union
+
+ import pypdfium2 as pdfium
+ from docling_parse.docling_parse import pdf_parser
+ from PIL import Image, ImageDraw
+ from pypdfium2 import PdfPage
+
+ from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+ from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+
+ _log = logging.getLogger(__name__)
+
+
+ class DoclingParsePageBackend(PdfPageBackend):
+     def __init__(self, page_obj: PdfPage, docling_page_obj):
+         super().__init__(page_obj)
+         self._ppage = page_obj
+         self._dpage = docling_page_obj
+         self.text_page = None
+
+     def get_text_in_rect(self, bbox: BoundingBox) -> str:
+         # Find intersecting cells on the page
+         text_piece = ""
+         page_size = self.get_size()
+         parser_width = self._dpage["width"]
+         parser_height = self._dpage["height"]
+
+         scale = (
+             1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+         )
+
+         for i in range(len(self._dpage["cells"])):
+             rect = self._dpage["cells"][i]["box"]["device"]
+             x0, y0, x1, y1 = rect
+             cell_bbox = BoundingBox(
+                 l=x0 * scale * page_size.width / parser_width,
+                 b=y0 * scale * page_size.height / parser_height,
+                 r=x1 * scale * page_size.width / parser_width,
+                 t=y1 * scale * page_size.height / parser_height,
+                 coord_origin=CoordOrigin.BOTTOMLEFT,
+             ).to_top_left_origin(page_size.height * scale)
+
+             overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+
+             if overlap_frac > 0.5:
+                 if len(text_piece) > 0:
+                     text_piece += " "
+                 text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
+
+         return text_piece
+
+     def get_text_cells(self) -> Iterable[Cell]:
+         cells = []
+         cell_counter = 0
+
+         page_size = self.get_size()
+
+         parser_width = self._dpage["width"]
+         parser_height = self._dpage["height"]
+
+         for i in range(len(self._dpage["cells"])):
+             rect = self._dpage["cells"][i]["box"]["device"]
+             x0, y0, x1, y1 = rect
+             text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
+             cells.append(
+                 Cell(
+                     id=cell_counter,
+                     text=text_piece,
+                     bbox=BoundingBox(
+                         # l=x0, b=y0, r=x1, t=y1,
+                         l=x0 * page_size.width / parser_width,
+                         b=y0 * page_size.height / parser_height,
+                         r=x1 * page_size.width / parser_width,
+                         t=y1 * page_size.height / parser_height,
+                         coord_origin=CoordOrigin.BOTTOMLEFT,
+                     ).to_top_left_origin(page_size.height),
+                 )
+             )
+             cell_counter += 1
+
+         def draw_clusters_and_cells():
+             image = (
+                 self.get_page_image()
+             )  # make new image to avoid drawing on the saved ones
+             draw = ImageDraw.Draw(image)
+             for c in cells:
+                 x0, y0, x1, y1 = c.bbox.as_tuple()
+                 cell_color = (
+                     random.randint(30, 140),
+                     random.randint(30, 140),
+                     random.randint(30, 140),
+                 )
+                 draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+             image.show()
+
+         # before merge:
+         # draw_clusters_and_cells()
+
+         # cells = merge_horizontal_cells(cells)
+
+         # after merge:
+         # draw_clusters_and_cells()
+
+         return cells
+
+     def get_page_image(
+         self, scale: int = 1, cropbox: Optional[BoundingBox] = None
+     ) -> Image.Image:
+
+         page_size = self.get_size()
+
+         if not cropbox:
+             cropbox = BoundingBox(
+                 l=0,
+                 r=page_size.width,
+                 t=0,
+                 b=page_size.height,
+                 coord_origin=CoordOrigin.TOPLEFT,
+             )
+             padbox = BoundingBox(
+                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
+             )
+         else:
+             padbox = cropbox.to_bottom_left_origin(page_size.height)
+             padbox.r = page_size.width - padbox.r
+             padbox.t = page_size.height - padbox.t
+
+         image = (
+             self._ppage.render(
+                 scale=scale * 1.5,
+                 rotation=0,  # no additional rotation
+                 crop=padbox.as_tuple(),
+             )
+             .to_pil()
+             .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
+         )  # We resize the image from 1.5x the given scale to make it sharper.
+
+         return image
+
+     def get_size(self) -> PageSize:
+         return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+
+     def unload(self):
+         self._ppage = None
+         self._dpage = None
+         self.text_page = None
+
+
+ class DoclingParseDocumentBackend(PdfDocumentBackend):
+     def __init__(self, path_or_stream: Union[BytesIO, Path]):
+         super().__init__(path_or_stream)
+         self._pdoc = pdfium.PdfDocument(path_or_stream)
+         # Parsing cells with docling_parser call
+         parser = pdf_parser()
+
+         start_pb_time = time.time()
+
+         if isinstance(path_or_stream, BytesIO):
+             self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
+         else:
+             self._parser_doc = parser.find_cells(str(path_or_stream))
+
+         end_pb_time = time.time() - start_pb_time
+         _log.info(
+             f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
+         )
+
+     def page_count(self) -> int:
+         return len(self._parser_doc["pages"])
+
+     def load_page(self, page_no: int) -> PdfPage:
+         return DoclingParsePageBackend(
+             self._pdoc[page_no], self._parser_doc["pages"][page_no]
+         )
+
+     def is_valid(self) -> bool:
+         return self.page_count() > 0
+
+     def unload(self):
+         self._pdoc.close()
+         self._pdoc = None
+         self._parser_doc = None
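The new docling-parse backend above exposes the same `PdfDocumentBackend` interface as the pypdfium2 backend, so it can be exercised on its own. A minimal sketch, assuming a local `sample.pdf` (in normal conversions the backend is driven by `DocumentConverter`, not instantiated directly):

```python
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend

backend = DoclingParseDocumentBackend(Path("sample.pdf"))  # placeholder file
print("pages:", backend.page_count())

page = backend.load_page(0)            # DoclingParsePageBackend
print("size:", page.get_size())        # PageSize(width=..., height=...)
for cell in page.get_text_cells():     # programmatic text cells with bounding boxes
    print(cell.text, cell.bbox.as_tuple())

backend.unload()
```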
{docling-0.1.2 → docling-1.5.0}/docling/backend/pypdfium2_backend.py CHANGED
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
              return merged_cells
 
          def draw_clusters_and_cells():
-             image = self.get_page_image()
+             image = (
+                 self.get_page_image()
+             )  # make new image to avoid drawing on the saved ones
              draw = ImageDraw.Draw(image)
              for c in cells:
                  x0, y0, x1, y1 = c.bbox.as_tuple()
@@ -199,15 +201,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
 
 
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
-     def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
+     def __init__(self, path_or_stream: Union[BytesIO, Path]):
          super().__init__(path_or_stream)
-
-         if isinstance(path_or_stream, Path):
-             self._pdoc = pdfium.PdfDocument(path_or_stream)
-         elif isinstance(path_or_stream, BytesIO):
-             self._pdoc = pdfium.PdfDocument(
-                 path_or_stream
-             )  # TODO Fix me, won't accept bytes.
+         self._pdoc = pdfium.PdfDocument(path_or_stream)
 
      def page_count(self) -> int:
          return len(self._pdoc)
{docling-0.1.2 → docling-1.5.0}/docling/datamodel/base_models.py CHANGED
@@ -1,9 +1,12 @@
+ import copy
+ import warnings
  from enum import Enum, auto
  from io import BytesIO
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
 
  from PIL.Image import Image
- from pydantic import BaseModel, ConfigDict, model_validator
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
+ from typing_extensions import Self
 
  from docling.backend.abstract_backend import PdfPageBackend
 
@@ -47,6 +50,15 @@ class BoundingBox(BaseModel):
      def height(self):
          return abs(self.t - self.b)
 
+     def scaled(self, scale: float) -> "BoundingBox":
+         out_bbox = copy.deepcopy(self)
+         out_bbox.l *= scale
+         out_bbox.r *= scale
+         out_bbox.t *= scale
+         out_bbox.b *= scale
+
+         return out_bbox
+
      def as_tuple(self):
          if self.coord_origin == CoordOrigin.TOPLEFT:
              return (self.l, self.t, self.r, self.b)
@@ -180,8 +192,7 @@ class TableStructurePrediction(BaseModel):
      table_map: Dict[int, TableElement] = {}
 
 
- class TextElement(BasePageElement):
-     ...
+ class TextElement(BasePageElement): ...
 
 
  class FigureData(BaseModel):
@@ -225,14 +236,30 @@ class Page(BaseModel):
      model_config = ConfigDict(arbitrary_types_allowed=True)
 
      page_no: int
-     page_hash: str = None
-     size: PageSize = None
-     image: Image = None
+     page_hash: Optional[str] = None
+     size: Optional[PageSize] = None
      cells: List[Cell] = None
      predictions: PagePredictions = PagePredictions()
-     assembled: AssembledUnit = None
+     assembled: Optional[AssembledUnit] = None
+
+     _backend: Optional[PdfPageBackend] = (
+         None  # Internal PDF backend. By default it is cleared during assembling.
+     )
+     _default_image_scale: float = 1.0  # Default image scale for external usage.
+     _image_cache: Dict[float, Image] = (
+         {}
+     )  # Cache of images in different scales. By default it is cleared during assembling.
+
+     def get_image(self, scale: float = 1.0) -> Optional[Image]:
+         if self._backend is None:
+             return self._image_cache.get(scale, None)
+         if not scale in self._image_cache:
+             self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+         return self._image_cache[scale]
 
-     _backend: PdfPageBackend = None  # Internal PDF backend
+     @property
+     def image(self) -> Optional[Image]:
+         return self.get_image(scale=self._default_image_scale)
 
 
  class DocumentStream(BaseModel):
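With the hunk above, page images are no longer stored eagerly on `Page.image`; they are rendered on demand through the backend and cached per scale, and the `image` property is now just `get_image()` at the default scale. A minimal sketch of a helper built on that API (the function name is ours; behavior when the backend was already unloaded follows the code above):

```python
from typing import Optional

from PIL.Image import Image

from docling.datamodel.base_models import Page


def page_thumbnail(page: Page, scale: float = 0.5) -> Optional[Image]:
    """Render (or fetch from the per-scale cache) the page image at `scale`.

    Returns None if the page's backend was already unloaded and no image
    at this scale was cached during assembly.
    """
    return page.get_image(scale=scale)
```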
@@ -242,6 +269,36 @@ class DocumentStream(BaseModel):
      stream: BytesIO
 
 
+ class TableStructureOptions(BaseModel):
+     do_cell_matching: bool = (
+         True
+         # True: Matches predictions back to PDF cells. Can break table output if PDF cells
+         # are merged across table columns.
+         # False: Let table structure model define the text cells, ignore PDF cells.
+     )
+
+
  class PipelineOptions(BaseModel):
-     do_table_structure: bool = True
-     do_ocr: bool = False
+     do_table_structure: bool = True  # True: perform table structure extraction
+     do_ocr: bool = False  # True: perform OCR, replace programmatic PDF text
+
+     table_structure_options: TableStructureOptions = TableStructureOptions()
+
+
+ class AssembleOptions(BaseModel):
+     keep_page_images: Annotated[
+         bool,
+         Field(
+             deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
+         ),
+     ] = False  # False: page images are removed in the assemble step
+     images_scale: Optional[float] = None  # if set, the scale for generated images
+
+     @model_validator(mode="after")
+     def set_page_images_from_deprecated(self) -> Self:
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore", DeprecationWarning)
+             default_scale = 1.0
+             if self.keep_page_images and self.images_scale is None:
+                 self.images_scale = default_scale
+         return self
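Taken together, this hunk defines the options surface the updated README drives: `PipelineOptions` now nests `TableStructureOptions`, and `AssembleOptions` maps the deprecated `keep_page_images` flag onto `images_scale`. A minimal sketch of constructing these objects directly from the module shown in this diff (how they are consumed lives in `document_converter.py` and `page_assemble_model.py`, whose diffs are listed above but not expanded here):

```python
from docling.datamodel.base_models import (
    AssembleOptions,
    PipelineOptions,
    TableStructureOptions,
)

# Table structure on, but let the TableFormer model define the text cells itself.
pipeline_options = PipelineOptions(
    do_table_structure=True,
    do_ocr=False,
    table_structure_options=TableStructureOptions(do_cell_matching=False),
)

# The deprecated flag is translated by the model validator shown above.
assemble_options = AssembleOptions(keep_page_images=True)
print(assemble_options.images_scale)  # -> 1.0

# Preferred form: ask for page images at an explicit scale.
assemble_options = AssembleOptions(images_scale=2.0)
```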