docling 1.8.1__tar.gz → 1.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.8.1 → docling-1.8.2}/PKG-INFO +5 -5
- {docling-1.8.1 → docling-1.8.2}/README.md +4 -4
- {docling-1.8.1 → docling-1.8.2}/docling/datamodel/base_models.py +3 -3
- {docling-1.8.1 → docling-1.8.2}/docling/datamodel/document.py +20 -11
- {docling-1.8.1 → docling-1.8.2}/docling/document_converter.py +33 -36
- {docling-1.8.1 → docling-1.8.2}/docling/models/ds_glm_model.py +5 -5
- {docling-1.8.1 → docling-1.8.2}/pyproject.toml +1 -1
- {docling-1.8.1 → docling-1.8.2}/LICENSE +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/__init__.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/backend/__init__.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/backend/abstract_backend.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/backend/docling_parse_backend.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/datamodel/__init__.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/datamodel/settings.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/models/__init__.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/models/base_ocr_model.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/models/easyocr_model.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/models/layout_model.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/models/page_assemble_model.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/models/table_structure_model.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/pipeline/__init__.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/utils/__init__.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/utils/layout_utils.py +0 -0
- {docling-1.8.1 → docling-1.8.2}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.8.1
|
3
|
+
Version: 1.8.2
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -87,10 +87,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
|
|
87
87
|
```python
|
88
88
|
from docling.document_converter import DocumentConverter
|
89
89
|
|
90
|
-
source = "https://arxiv.org/pdf/
|
90
|
+
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
91
91
|
converter = DocumentConverter()
|
92
|
-
|
93
|
-
print(
|
92
|
+
result = converter.convert_single(source)
|
93
|
+
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
94
94
|
```
|
95
95
|
|
96
96
|
### Convert a batch of documents
|
@@ -156,7 +156,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
|
|
156
156
|
buf = BytesIO(your_binary_stream)
|
157
157
|
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
158
158
|
conv_input = DocumentConversionInput.from_streams(docs)
|
159
|
-
|
159
|
+
results = doc_converter.convert(conv_input)
|
160
160
|
```
|
161
161
|
### Limit resource usage
|
162
162
|
|
@@ -49,10 +49,10 @@ To convert individual PDF documents, use `convert_single()`, for example:
|
|
49
49
|
```python
|
50
50
|
from docling.document_converter import DocumentConverter
|
51
51
|
|
52
|
-
source = "https://arxiv.org/pdf/
|
52
|
+
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
53
53
|
converter = DocumentConverter()
|
54
|
-
|
55
|
-
print(
|
54
|
+
result = converter.convert_single(source)
|
55
|
+
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
56
56
|
```
|
57
57
|
|
58
58
|
### Convert a batch of documents
|
@@ -118,7 +118,7 @@ You can convert PDFs from a binary stream instead of from the filesystem as foll
|
|
118
118
|
buf = BytesIO(your_binary_stream)
|
119
119
|
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
120
120
|
conv_input = DocumentConversionInput.from_streams(docs)
|
121
|
-
|
121
|
+
results = doc_converter.convert(conv_input)
|
122
122
|
```
|
123
123
|
### Limit resource usage
|
124
124
|
|
@@ -247,9 +247,9 @@ PageElement = Union[TextElement, TableElement, FigureElement]
|
|
247
247
|
|
248
248
|
|
249
249
|
class AssembledUnit(BaseModel):
|
250
|
-
elements: List[PageElement]
|
251
|
-
body: List[PageElement]
|
252
|
-
headers: List[PageElement]
|
250
|
+
elements: List[PageElement] = []
|
251
|
+
body: List[PageElement] = []
|
252
|
+
headers: List[PageElement] = []
|
253
253
|
|
254
254
|
|
255
255
|
class Page(BaseModel):
|
@@ -12,6 +12,7 @@ from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
|
12
12
|
from docling_core.types import Table as DsSchemaTable
|
13
13
|
from docling_core.types import TableCell
|
14
14
|
from pydantic import BaseModel
|
15
|
+
from typing_extensions import deprecated
|
15
16
|
|
16
17
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
17
18
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
@@ -49,6 +50,15 @@ layout_label_to_ds_type = {
|
|
49
50
|
"Text": "paragraph",
|
50
51
|
}
|
51
52
|
|
53
|
+
_EMPTY_DOC = DsDocument(
|
54
|
+
_name="",
|
55
|
+
description=DsDocumentDescription(logs=[]),
|
56
|
+
file_info=DsFileInfoObject(
|
57
|
+
filename="",
|
58
|
+
document_hash="",
|
59
|
+
),
|
60
|
+
)
|
61
|
+
|
52
62
|
|
53
63
|
class InputDocument(BaseModel):
|
54
64
|
file: PurePath = None
|
@@ -115,6 +125,7 @@ class InputDocument(BaseModel):
|
|
115
125
|
# raise
|
116
126
|
|
117
127
|
|
128
|
+
@deprecated("Use `ConversionResult` instead.")
|
118
129
|
class ConvertedDocument(BaseModel):
|
119
130
|
input: InputDocument
|
120
131
|
|
@@ -122,11 +133,11 @@ class ConvertedDocument(BaseModel):
|
|
122
133
|
errors: List[ErrorItem] = [] # structure to keep errors
|
123
134
|
|
124
135
|
pages: List[Page] = []
|
125
|
-
assembled:
|
136
|
+
assembled: AssembledUnit = AssembledUnit()
|
126
137
|
|
127
|
-
output:
|
138
|
+
output: DsDocument = _EMPTY_DOC
|
128
139
|
|
129
|
-
def
|
140
|
+
def _to_ds_document(self) -> DsDocument:
|
130
141
|
title = ""
|
131
142
|
desc = DsDocumentDescription(logs=[])
|
132
143
|
|
@@ -297,16 +308,10 @@ class ConvertedDocument(BaseModel):
|
|
297
308
|
return ds_doc
|
298
309
|
|
299
310
|
def render_as_dict(self):
|
300
|
-
|
301
|
-
return self.output.model_dump(by_alias=True, exclude_none=True)
|
302
|
-
else:
|
303
|
-
return {}
|
311
|
+
return self.output.model_dump(by_alias=True, exclude_none=True)
|
304
312
|
|
305
313
|
def render_as_markdown(self):
|
306
|
-
|
307
|
-
return self.output.export_to_markdown()
|
308
|
-
else:
|
309
|
-
return ""
|
314
|
+
return self.output.export_to_markdown()
|
310
315
|
|
311
316
|
def render_element_images(
|
312
317
|
self, element_types: Tuple[PageElement] = (FigureElement,)
|
@@ -323,6 +328,10 @@ class ConvertedDocument(BaseModel):
|
|
323
328
|
yield element, cropped_im
|
324
329
|
|
325
330
|
|
331
|
+
class ConversionResult(ConvertedDocument):
|
332
|
+
pass
|
333
|
+
|
334
|
+
|
326
335
|
class DocumentConversionInput(BaseModel):
|
327
336
|
|
328
337
|
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
@@ -7,7 +7,6 @@ from pathlib import Path
|
|
7
7
|
from typing import Iterable, Optional, Type, Union
|
8
8
|
|
9
9
|
import requests
|
10
|
-
from docling_core.types import Document
|
11
10
|
from PIL import ImageDraw
|
12
11
|
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
13
12
|
|
@@ -22,7 +21,7 @@ from docling.datamodel.base_models import (
|
|
22
21
|
PipelineOptions,
|
23
22
|
)
|
24
23
|
from docling.datamodel.document import (
|
25
|
-
|
24
|
+
ConversionResult,
|
26
25
|
DocumentConversionInput,
|
27
26
|
InputDocument,
|
28
27
|
)
|
@@ -73,7 +72,7 @@ class DocumentConverter:
|
|
73
72
|
|
74
73
|
return Path(download_path)
|
75
74
|
|
76
|
-
def convert(self, input: DocumentConversionInput) -> Iterable[
|
75
|
+
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
77
76
|
|
78
77
|
for input_batch in chunkify(
|
79
78
|
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
|
@@ -86,9 +85,9 @@ class DocumentConverter:
|
|
86
85
|
# yield from pool.map(self.process_document, input_batch)
|
87
86
|
|
88
87
|
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
89
|
-
yield from map(self.
|
88
|
+
yield from map(self._process_document, input_batch)
|
90
89
|
|
91
|
-
def convert_single(self, source: Path | AnyHttpUrl | str) ->
|
90
|
+
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
92
91
|
"""Convert a single document.
|
93
92
|
|
94
93
|
Args:
|
@@ -99,7 +98,7 @@ class DocumentConverter:
|
|
99
98
|
RuntimeError: If conversion fails.
|
100
99
|
|
101
100
|
Returns:
|
102
|
-
|
101
|
+
ConversionResult: The conversion result object.
|
103
102
|
"""
|
104
103
|
with tempfile.TemporaryDirectory() as temp_dir:
|
105
104
|
try:
|
@@ -129,51 +128,49 @@ class DocumentConverter:
|
|
129
128
|
f"Unexpected file path type encountered: {type(source)}"
|
130
129
|
)
|
131
130
|
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
132
|
-
|
133
|
-
|
134
|
-
if
|
131
|
+
conv_res_iter = self.convert(conv_inp)
|
132
|
+
conv_res: ConversionResult = next(conv_res_iter)
|
133
|
+
if conv_res.status not in {
|
135
134
|
ConversionStatus.SUCCESS,
|
136
135
|
ConversionStatus.PARTIAL_SUCCESS,
|
137
136
|
}:
|
138
|
-
raise RuntimeError(f"Conversion failed with status: {
|
139
|
-
return
|
137
|
+
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
138
|
+
return conv_res
|
140
139
|
|
141
|
-
def
|
140
|
+
def _process_document(self, in_doc: InputDocument) -> ConversionResult:
|
142
141
|
start_doc_time = time.time()
|
143
|
-
|
142
|
+
conv_res = ConversionResult(input=in_doc)
|
144
143
|
|
145
144
|
_log.info(f"Processing document {in_doc.file.name}")
|
146
145
|
|
147
146
|
if not in_doc.valid:
|
148
|
-
|
149
|
-
return
|
147
|
+
conv_res.status = ConversionStatus.FAILURE
|
148
|
+
return conv_res
|
150
149
|
|
151
150
|
for i in range(0, in_doc.page_count):
|
152
|
-
|
151
|
+
conv_res.pages.append(Page(page_no=i))
|
153
152
|
|
154
153
|
all_assembled_pages = []
|
155
154
|
|
156
155
|
try:
|
157
156
|
# Iterate batches of pages (page_batch_size) in the doc
|
158
|
-
for page_batch in chunkify(
|
159
|
-
converted_doc.pages, settings.perf.page_batch_size
|
160
|
-
):
|
157
|
+
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
161
158
|
start_pb_time = time.time()
|
162
159
|
# Pipeline
|
163
160
|
|
164
161
|
# 1. Initialise the page resources
|
165
162
|
init_pages = map(
|
166
|
-
functools.partial(self.
|
163
|
+
functools.partial(self._initialize_page, in_doc), page_batch
|
167
164
|
)
|
168
165
|
|
169
166
|
# 2. Populate page image
|
170
167
|
pages_with_images = map(
|
171
|
-
functools.partial(self.
|
168
|
+
functools.partial(self._populate_page_images, in_doc), init_pages
|
172
169
|
)
|
173
170
|
|
174
171
|
# 3. Populate programmatic page cells
|
175
172
|
pages_with_cells = map(
|
176
|
-
functools.partial(self.
|
173
|
+
functools.partial(self._parse_page_cells, in_doc),
|
177
174
|
pages_with_images,
|
178
175
|
)
|
179
176
|
|
@@ -202,13 +199,13 @@ class DocumentConverter:
|
|
202
199
|
# Free up mem resources of PDF backend
|
203
200
|
in_doc._backend.unload()
|
204
201
|
|
205
|
-
|
206
|
-
self.
|
202
|
+
conv_res.pages = all_assembled_pages
|
203
|
+
self._assemble_doc(conv_res)
|
207
204
|
|
208
205
|
status = ConversionStatus.SUCCESS
|
209
|
-
for page in
|
206
|
+
for page in conv_res.pages:
|
210
207
|
if not page._backend.is_valid():
|
211
|
-
|
208
|
+
conv_res.errors.append(
|
212
209
|
ErrorItem(
|
213
210
|
component_type=DoclingComponentType.PDF_BACKEND,
|
214
211
|
module_name=type(page._backend).__name__,
|
@@ -217,10 +214,10 @@ class DocumentConverter:
|
|
217
214
|
)
|
218
215
|
status = ConversionStatus.PARTIAL_SUCCESS
|
219
216
|
|
220
|
-
|
217
|
+
conv_res.status = status
|
221
218
|
|
222
219
|
except Exception as e:
|
223
|
-
|
220
|
+
conv_res.status = ConversionStatus.FAILURE
|
224
221
|
trace = "\n".join(traceback.format_exception(e))
|
225
222
|
_log.info(
|
226
223
|
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
@@ -232,10 +229,10 @@ class DocumentConverter:
|
|
232
229
|
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
|
233
230
|
)
|
234
231
|
|
235
|
-
return
|
232
|
+
return conv_res
|
236
233
|
|
237
234
|
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
|
238
|
-
def
|
235
|
+
def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
239
236
|
page._backend = doc._backend.load_page(page.page_no)
|
240
237
|
page.size = page._backend.get_size()
|
241
238
|
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
|
@@ -243,7 +240,7 @@ class DocumentConverter:
|
|
243
240
|
return page
|
244
241
|
|
245
242
|
# Generate the page image and store it in the page object
|
246
|
-
def
|
243
|
+
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
247
244
|
# default scale
|
248
245
|
page.get_image(
|
249
246
|
scale=1.0
|
@@ -259,7 +256,7 @@ class DocumentConverter:
|
|
259
256
|
return page
|
260
257
|
|
261
258
|
# Extract and populate the page cells and store it in the page object
|
262
|
-
def
|
259
|
+
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
|
263
260
|
page.cells = page._backend.get_text_cells()
|
264
261
|
|
265
262
|
# DEBUG code:
|
@@ -274,12 +271,12 @@ class DocumentConverter:
|
|
274
271
|
|
275
272
|
return page
|
276
273
|
|
277
|
-
def
|
274
|
+
def _assemble_doc(self, conv_res: ConversionResult):
|
278
275
|
all_elements = []
|
279
276
|
all_headers = []
|
280
277
|
all_body = []
|
281
278
|
|
282
|
-
for p in
|
279
|
+
for p in conv_res.pages:
|
283
280
|
|
284
281
|
for el in p.assembled.body:
|
285
282
|
all_body.append(el)
|
@@ -288,8 +285,8 @@ class DocumentConverter:
|
|
288
285
|
for el in p.assembled.elements:
|
289
286
|
all_elements.append(el)
|
290
287
|
|
291
|
-
|
288
|
+
conv_res.assembled = AssembledUnit(
|
292
289
|
elements=all_elements, headers=all_headers, body=all_body
|
293
290
|
)
|
294
291
|
|
295
|
-
|
292
|
+
conv_res.output = self.glm_model(conv_res)
|
@@ -10,7 +10,7 @@ from docling_core.types import Ref
|
|
10
10
|
from PIL import ImageDraw
|
11
11
|
|
12
12
|
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
13
|
-
from docling.datamodel.document import
|
13
|
+
from docling.datamodel.document import ConversionResult
|
14
14
|
|
15
15
|
|
16
16
|
class GlmModel:
|
@@ -20,8 +20,8 @@ class GlmModel:
|
|
20
20
|
model = init_nlp_model(model_names="language;term;reference")
|
21
21
|
self.model = model
|
22
22
|
|
23
|
-
def __call__(self,
|
24
|
-
ds_doc =
|
23
|
+
def __call__(self, conv_res: ConversionResult) -> DsDocument:
|
24
|
+
ds_doc = conv_res._to_ds_document()
|
25
25
|
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
26
26
|
|
27
27
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
@@ -34,7 +34,7 @@ class GlmModel:
|
|
34
34
|
# DEBUG code:
|
35
35
|
def draw_clusters_and_cells(ds_document, page_no):
|
36
36
|
clusters_to_draw = []
|
37
|
-
image = copy.deepcopy(
|
37
|
+
image = copy.deepcopy(conv_res.pages[page_no].image)
|
38
38
|
for ix, elem in enumerate(ds_document.main_text):
|
39
39
|
if isinstance(elem, BaseText):
|
40
40
|
prov = elem.prov[0]
|
@@ -56,7 +56,7 @@ class GlmModel:
|
|
56
56
|
bbox=BoundingBox.from_tuple(
|
57
57
|
coord=prov.bbox,
|
58
58
|
origin=CoordOrigin.BOTTOMLEFT,
|
59
|
-
).to_top_left_origin(
|
59
|
+
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
60
60
|
)
|
61
61
|
)
|
62
62
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.8.1" # DO NOT EDIT, updated automatically
|
3
|
+
version = "1.8.2" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|