docling 1.8.0__tar.gz → 1.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.8.0 → docling-1.8.2}/PKG-INFO +5 -5
- {docling-1.8.0 → docling-1.8.2}/README.md +4 -4
- {docling-1.8.0 → docling-1.8.2}/docling/datamodel/base_models.py +3 -3
- {docling-1.8.0 → docling-1.8.2}/docling/datamodel/document.py +20 -11
- {docling-1.8.0 → docling-1.8.2}/docling/document_converter.py +34 -38
- {docling-1.8.0 → docling-1.8.2}/docling/models/ds_glm_model.py +5 -5
- {docling-1.8.0 → docling-1.8.2}/pyproject.toml +1 -1
- {docling-1.8.0 → docling-1.8.2}/LICENSE +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/__init__.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/backend/__init__.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/backend/abstract_backend.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/backend/docling_parse_backend.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/datamodel/__init__.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/datamodel/settings.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/models/__init__.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/models/base_ocr_model.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/models/easyocr_model.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/models/layout_model.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/models/page_assemble_model.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/models/table_structure_model.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/pipeline/__init__.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/utils/__init__.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/utils/layout_utils.py +0 -0
- {docling-1.8.0 → docling-1.8.2}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.8.0
|
3
|
+
Version: 1.8.2
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -87,10 +87,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
|
|
87
87
|
```python
|
88
88
|
from docling.document_converter import DocumentConverter
|
89
89
|
|
90
|
-
source = "https://arxiv.org/pdf/
|
90
|
+
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
91
91
|
converter = DocumentConverter()
|
92
|
-
|
93
|
-
print(
|
92
|
+
result = converter.convert_single(source)
|
93
|
+
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
94
94
|
```
|
95
95
|
|
96
96
|
### Convert a batch of documents
|
@@ -156,7 +156,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
|
|
156
156
|
buf = BytesIO(your_binary_stream)
|
157
157
|
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
158
158
|
conv_input = DocumentConversionInput.from_streams(docs)
|
159
|
-
|
159
|
+
results = doc_converter.convert(conv_input)
|
160
160
|
```
|
161
161
|
### Limit resource usage
|
162
162
|
|
@@ -49,10 +49,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
|
|
49
49
|
```python
|
50
50
|
from docling.document_converter import DocumentConverter
|
51
51
|
|
52
|
-
source = "https://arxiv.org/pdf/
|
52
|
+
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
53
53
|
converter = DocumentConverter()
|
54
|
-
|
55
|
-
print(
|
54
|
+
result = converter.convert_single(source)
|
55
|
+
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
56
56
|
```
|
57
57
|
|
58
58
|
### Convert a batch of documents
|
@@ -118,7 +118,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
|
|
118
118
|
buf = BytesIO(your_binary_stream)
|
119
119
|
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
120
120
|
conv_input = DocumentConversionInput.from_streams(docs)
|
121
|
-
|
121
|
+
results = doc_converter.convert(conv_input)
|
122
122
|
```
|
123
123
|
### Limit resource usage
|
124
124
|
|
@@ -247,9 +247,9 @@ PageElement = Union[TextElement, TableElement, FigureElement]
|
|
247
247
|
|
248
248
|
|
249
249
|
class AssembledUnit(BaseModel):
|
250
|
-
elements: List[PageElement]
|
251
|
-
body: List[PageElement]
|
252
|
-
headers: List[PageElement]
|
250
|
+
elements: List[PageElement] = []
|
251
|
+
body: List[PageElement] = []
|
252
|
+
headers: List[PageElement] = []
|
253
253
|
|
254
254
|
|
255
255
|
class Page(BaseModel):
|
@@ -12,6 +12,7 @@ from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
|
12
12
|
from docling_core.types import Table as DsSchemaTable
|
13
13
|
from docling_core.types import TableCell
|
14
14
|
from pydantic import BaseModel
|
15
|
+
from typing_extensions import deprecated
|
15
16
|
|
16
17
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
17
18
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
@@ -49,6 +50,15 @@ layout_label_to_ds_type = {
|
|
49
50
|
"Text": "paragraph",
|
50
51
|
}
|
51
52
|
|
53
|
+
_EMPTY_DOC = DsDocument(
|
54
|
+
_name="",
|
55
|
+
description=DsDocumentDescription(logs=[]),
|
56
|
+
file_info=DsFileInfoObject(
|
57
|
+
filename="",
|
58
|
+
document_hash="",
|
59
|
+
),
|
60
|
+
)
|
61
|
+
|
52
62
|
|
53
63
|
class InputDocument(BaseModel):
|
54
64
|
file: PurePath = None
|
@@ -115,6 +125,7 @@ class InputDocument(BaseModel):
|
|
115
125
|
# raise
|
116
126
|
|
117
127
|
|
128
|
+
@deprecated("Use `ConversionResult` instead.")
|
118
129
|
class ConvertedDocument(BaseModel):
|
119
130
|
input: InputDocument
|
120
131
|
|
@@ -122,11 +133,11 @@ class ConvertedDocument(BaseModel):
|
|
122
133
|
errors: List[ErrorItem] = [] # structure to keep errors
|
123
134
|
|
124
135
|
pages: List[Page] = []
|
125
|
-
assembled:
|
136
|
+
assembled: AssembledUnit = AssembledUnit()
|
126
137
|
|
127
|
-
output:
|
138
|
+
output: DsDocument = _EMPTY_DOC
|
128
139
|
|
129
|
-
def
|
140
|
+
def _to_ds_document(self) -> DsDocument:
|
130
141
|
title = ""
|
131
142
|
desc = DsDocumentDescription(logs=[])
|
132
143
|
|
@@ -297,16 +308,10 @@ class ConvertedDocument(BaseModel):
|
|
297
308
|
return ds_doc
|
298
309
|
|
299
310
|
def render_as_dict(self):
|
300
|
-
|
301
|
-
return self.output.model_dump(by_alias=True, exclude_none=True)
|
302
|
-
else:
|
303
|
-
return {}
|
311
|
+
return self.output.model_dump(by_alias=True, exclude_none=True)
|
304
312
|
|
305
313
|
def render_as_markdown(self):
|
306
|
-
|
307
|
-
return self.output.export_to_markdown()
|
308
|
-
else:
|
309
|
-
return ""
|
314
|
+
return self.output.export_to_markdown()
|
310
315
|
|
311
316
|
def render_element_images(
|
312
317
|
self, element_types: Tuple[PageElement] = (FigureElement,)
|
@@ -323,6 +328,10 @@ class ConvertedDocument(BaseModel):
|
|
323
328
|
yield element, cropped_im
|
324
329
|
|
325
330
|
|
331
|
+
class ConversionResult(ConvertedDocument):
|
332
|
+
pass
|
333
|
+
|
334
|
+
|
326
335
|
class DocumentConversionInput(BaseModel):
|
327
336
|
|
328
337
|
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
@@ -7,7 +7,6 @@ from pathlib import Path
|
|
7
7
|
from typing import Iterable, Optional, Type, Union
|
8
8
|
|
9
9
|
import requests
|
10
|
-
from docling_core.types import Document
|
11
10
|
from PIL import ImageDraw
|
12
11
|
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
13
12
|
|
@@ -22,7 +21,7 @@ from docling.datamodel.base_models import (
|
|
22
21
|
PipelineOptions,
|
23
22
|
)
|
24
23
|
from docling.datamodel.document import (
|
25
|
-
|
24
|
+
ConversionResult,
|
26
25
|
DocumentConversionInput,
|
27
26
|
InputDocument,
|
28
27
|
)
|
@@ -73,7 +72,7 @@ class DocumentConverter:
|
|
73
72
|
|
74
73
|
return Path(download_path)
|
75
74
|
|
76
|
-
def convert(self, input: DocumentConversionInput) -> Iterable[
|
75
|
+
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
77
76
|
|
78
77
|
for input_batch in chunkify(
|
79
78
|
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
|
@@ -86,9 +85,9 @@ class DocumentConverter:
|
|
86
85
|
# yield from pool.map(self.process_document, input_batch)
|
87
86
|
|
88
87
|
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
89
|
-
yield from map(self.
|
88
|
+
yield from map(self._process_document, input_batch)
|
90
89
|
|
91
|
-
def convert_single(self, source: Path | AnyHttpUrl | str) ->
|
90
|
+
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
92
91
|
"""Convert a single document.
|
93
92
|
|
94
93
|
Args:
|
@@ -99,7 +98,7 @@ class DocumentConverter:
|
|
99
98
|
RuntimeError: If conversion fails.
|
100
99
|
|
101
100
|
Returns:
|
102
|
-
|
101
|
+
ConversionResult: The conversion result object.
|
103
102
|
"""
|
104
103
|
with tempfile.TemporaryDirectory() as temp_dir:
|
105
104
|
try:
|
@@ -129,52 +128,49 @@ class DocumentConverter:
|
|
129
128
|
f"Unexpected file path type encountered: {type(source)}"
|
130
129
|
)
|
131
130
|
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
132
|
-
|
133
|
-
|
134
|
-
if
|
131
|
+
conv_res_iter = self.convert(conv_inp)
|
132
|
+
conv_res: ConversionResult = next(conv_res_iter)
|
133
|
+
if conv_res.status not in {
|
135
134
|
ConversionStatus.SUCCESS,
|
136
|
-
ConversionStatus.
|
135
|
+
ConversionStatus.PARTIAL_SUCCESS,
|
137
136
|
}:
|
138
|
-
raise RuntimeError(f"Conversion failed with status: {
|
139
|
-
|
140
|
-
return doc
|
137
|
+
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
138
|
+
return conv_res
|
141
139
|
|
142
|
-
def
|
140
|
+
def _process_document(self, in_doc: InputDocument) -> ConversionResult:
|
143
141
|
start_doc_time = time.time()
|
144
|
-
|
142
|
+
conv_res = ConversionResult(input=in_doc)
|
145
143
|
|
146
144
|
_log.info(f"Processing document {in_doc.file.name}")
|
147
145
|
|
148
146
|
if not in_doc.valid:
|
149
|
-
|
150
|
-
return
|
147
|
+
conv_res.status = ConversionStatus.FAILURE
|
148
|
+
return conv_res
|
151
149
|
|
152
150
|
for i in range(0, in_doc.page_count):
|
153
|
-
|
151
|
+
conv_res.pages.append(Page(page_no=i))
|
154
152
|
|
155
153
|
all_assembled_pages = []
|
156
154
|
|
157
155
|
try:
|
158
156
|
# Iterate batches of pages (page_batch_size) in the doc
|
159
|
-
for page_batch in chunkify(
|
160
|
-
converted_doc.pages, settings.perf.page_batch_size
|
161
|
-
):
|
157
|
+
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
162
158
|
start_pb_time = time.time()
|
163
159
|
# Pipeline
|
164
160
|
|
165
161
|
# 1. Initialise the page resources
|
166
162
|
init_pages = map(
|
167
|
-
functools.partial(self.
|
163
|
+
functools.partial(self._initialize_page, in_doc), page_batch
|
168
164
|
)
|
169
165
|
|
170
166
|
# 2. Populate page image
|
171
167
|
pages_with_images = map(
|
172
|
-
functools.partial(self.
|
168
|
+
functools.partial(self._populate_page_images, in_doc), init_pages
|
173
169
|
)
|
174
170
|
|
175
171
|
# 3. Populate programmatic page cells
|
176
172
|
pages_with_cells = map(
|
177
|
-
functools.partial(self.
|
173
|
+
functools.partial(self._parse_page_cells, in_doc),
|
178
174
|
pages_with_images,
|
179
175
|
)
|
180
176
|
|
@@ -203,13 +199,13 @@ class DocumentConverter:
|
|
203
199
|
# Free up mem resources of PDF backend
|
204
200
|
in_doc._backend.unload()
|
205
201
|
|
206
|
-
|
207
|
-
self.
|
202
|
+
conv_res.pages = all_assembled_pages
|
203
|
+
self._assemble_doc(conv_res)
|
208
204
|
|
209
205
|
status = ConversionStatus.SUCCESS
|
210
|
-
for page in
|
206
|
+
for page in conv_res.pages:
|
211
207
|
if not page._backend.is_valid():
|
212
|
-
|
208
|
+
conv_res.errors.append(
|
213
209
|
ErrorItem(
|
214
210
|
component_type=DoclingComponentType.PDF_BACKEND,
|
215
211
|
module_name=type(page._backend).__name__,
|
@@ -218,10 +214,10 @@ class DocumentConverter:
|
|
218
214
|
)
|
219
215
|
status = ConversionStatus.PARTIAL_SUCCESS
|
220
216
|
|
221
|
-
|
217
|
+
conv_res.status = status
|
222
218
|
|
223
219
|
except Exception as e:
|
224
|
-
|
220
|
+
conv_res.status = ConversionStatus.FAILURE
|
225
221
|
trace = "\n".join(traceback.format_exception(e))
|
226
222
|
_log.info(
|
227
223
|
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
@@ -233,10 +229,10 @@ class DocumentConverter:
|
|
233
229
|
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
|
234
230
|
)
|
235
231
|
|
236
|
-
return
|
232
|
+
return conv_res
|
237
233
|
|
238
234
|
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
|
239
|
-
def
|
235
|
+
def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
240
236
|
page._backend = doc._backend.load_page(page.page_no)
|
241
237
|
page.size = page._backend.get_size()
|
242
238
|
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
|
@@ -244,7 +240,7 @@ class DocumentConverter:
|
|
244
240
|
return page
|
245
241
|
|
246
242
|
# Generate the page image and store it in the page object
|
247
|
-
def
|
243
|
+
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
248
244
|
# default scale
|
249
245
|
page.get_image(
|
250
246
|
scale=1.0
|
@@ -260,7 +256,7 @@ class DocumentConverter:
|
|
260
256
|
return page
|
261
257
|
|
262
258
|
# Extract and populate the page cells and store it in the page object
|
263
|
-
def
|
259
|
+
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
|
264
260
|
page.cells = page._backend.get_text_cells()
|
265
261
|
|
266
262
|
# DEBUG code:
|
@@ -275,12 +271,12 @@ class DocumentConverter:
|
|
275
271
|
|
276
272
|
return page
|
277
273
|
|
278
|
-
def
|
274
|
+
def _assemble_doc(self, conv_res: ConversionResult):
|
279
275
|
all_elements = []
|
280
276
|
all_headers = []
|
281
277
|
all_body = []
|
282
278
|
|
283
|
-
for p in
|
279
|
+
for p in conv_res.pages:
|
284
280
|
|
285
281
|
for el in p.assembled.body:
|
286
282
|
all_body.append(el)
|
@@ -289,8 +285,8 @@ class DocumentConverter:
|
|
289
285
|
for el in p.assembled.elements:
|
290
286
|
all_elements.append(el)
|
291
287
|
|
292
|
-
|
288
|
+
conv_res.assembled = AssembledUnit(
|
293
289
|
elements=all_elements, headers=all_headers, body=all_body
|
294
290
|
)
|
295
291
|
|
296
|
-
|
292
|
+
conv_res.output = self.glm_model(conv_res)
|
@@ -10,7 +10,7 @@ from docling_core.types import Ref
|
|
10
10
|
from PIL import ImageDraw
|
11
11
|
|
12
12
|
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
13
|
-
from docling.datamodel.document import
|
13
|
+
from docling.datamodel.document import ConversionResult
|
14
14
|
|
15
15
|
|
16
16
|
class GlmModel:
|
@@ -20,8 +20,8 @@ class GlmModel:
|
|
20
20
|
model = init_nlp_model(model_names="language;term;reference")
|
21
21
|
self.model = model
|
22
22
|
|
23
|
-
def __call__(self,
|
24
|
-
ds_doc =
|
23
|
+
def __call__(self, conv_res: ConversionResult) -> DsDocument:
|
24
|
+
ds_doc = conv_res._to_ds_document()
|
25
25
|
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
26
26
|
|
27
27
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
@@ -34,7 +34,7 @@ class GlmModel:
|
|
34
34
|
# DEBUG code:
|
35
35
|
def draw_clusters_and_cells(ds_document, page_no):
|
36
36
|
clusters_to_draw = []
|
37
|
-
image = copy.deepcopy(
|
37
|
+
image = copy.deepcopy(conv_res.pages[page_no].image)
|
38
38
|
for ix, elem in enumerate(ds_document.main_text):
|
39
39
|
if isinstance(elem, BaseText):
|
40
40
|
prov = elem.prov[0]
|
@@ -56,7 +56,7 @@ class GlmModel:
|
|
56
56
|
bbox=BoundingBox.from_tuple(
|
57
57
|
coord=prov.bbox,
|
58
58
|
origin=CoordOrigin.BOTTOMLEFT,
|
59
|
-
).to_top_left_origin(
|
59
|
+
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
60
60
|
)
|
61
61
|
)
|
62
62
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.8.0"
|
3
|
+
version = "1.8.2" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|