docling-2.42.2-py3-none-any.whl → docling-2.43.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +1 -6
- docling/backend/md_backend.py +43 -11
- docling/datamodel/pipeline_options.py +15 -0
- docling/datamodel/settings.py +7 -12
- docling/document_converter.py +27 -17
- docling/models/layout_model.py +84 -66
- docling/pipeline/threaded_standard_pdf_pipeline.py +605 -0
- {docling-2.42.2.dist-info → docling-2.43.0.dist-info}/METADATA +3 -3
- {docling-2.42.2.dist-info → docling-2.43.0.dist-info}/RECORD +13 -12
- {docling-2.42.2.dist-info → docling-2.43.0.dist-info}/WHEEL +0 -0
- {docling-2.42.2.dist-info → docling-2.43.0.dist-info}/entry_points.txt +0 -0
- {docling-2.42.2.dist-info → docling-2.43.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.42.2.dist-info → docling-2.43.0.dist-info}/top_level.txt +0 -0
docling/backend/html_backend.py
CHANGED

@@ -1,6 +1,5 @@
 import logging
 import re
-import traceback
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast

@@ -144,11 +143,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         )
         # reset context
         self.ctx = _Context()
-
-        try:
-            self._walk(content, doc)
-        except Exception:
-            print(traceback.format_exc())
+        self._walk(content, doc)
 
         return doc
 
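With the try/except removed, exceptions raised while walking the HTML tree now propagate to the converter instead of being printed and swallowed. A minimal sketch of handling this at the call site, assuming the public `DocumentConverter.convert` API and a hypothetical input file:

```python
from docling.datamodel.base_models import ConversionStatus
from docling.document_converter import DocumentConverter

converter = DocumentConverter()

# With raises_on_error=False the backend exception is recorded in the
# result status rather than raised to the caller.
result = converter.convert("page.html", raises_on_error=False)  # hypothetical file
if result.status != ConversionStatus.SUCCESS:
    print(f"conversion ended with status {result.status}")
```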
docling/backend/md_backend.py
CHANGED

@@ -5,7 +5,7 @@ from copy import deepcopy
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
-from typing import
+from typing import Literal, Optional, Union, cast
 
 import marko
 import marko.element

@@ -14,6 +14,7 @@ from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
+    ListItem,
     NodeItem,
     TableCell,
     TableData,

@@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
-        _log.debug("
+        _log.debug("Starting MarkdownDocumentBackend...")
 
         # Markdown file:
         self.path_or_stream = path_or_stream

@@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         for md_table_row in self.md_table_buffer:
             _log.debug(md_table_row)
         _log.debug("=== TABLE END ===")
-        tcells:
+        tcells: list[TableCell] = []
         result_table = []
         for n, md_table_row in enumerate(self.md_table_buffer):
             data = []

@@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
-        visited:
+        visited: set[marko.element.Element],
         creation_stack: list[
             _CreationPayload
         ],  # stack for lazy item creation triggered deep in marko's AST (on RawText)
         list_ordered_flag_by_ref: dict[str, bool],
+        list_last_item_by_ref: dict[str, ListItem],
         parent_item: Optional[NodeItem] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,

@@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
 
         elif (
             isinstance(element, marko.block.ListItem)
-            and len(element.children)
+            and len(element.children) > 0
             and isinstance((child := element.children[0]), marko.block.Paragraph)
             and len(child.children) > 0
         ):

@@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 if parent_item
                 else False
             )
-
+            non_list_children: list[marko.element.Element] = [
+                item
+                for item in child.children
+                if not isinstance(item, marko.block.ListItem)
+            ]
+            if len(non_list_children) > 1:  # inline group will be created further down
+                parent_ref: Optional[str] = (
+                    parent_item.self_ref if parent_item else None
+                )
             parent_item = self._create_list_item(
                 doc=doc,
                 parent_item=parent_item,

@@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 formatting=formatting,
                 hyperlink=hyperlink,
             )
+            if parent_ref:
+                list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
         else:
             creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
 

@@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 element.dest
             )
 
-        elif isinstance(element, marko.inline.RawText):
-            _log.debug(f" -
-            snippet_text =
+        elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
+            _log.debug(f" - RawText/Literal: {element.children}")
+            snippet_text = (
+                element.children.strip() if isinstance(element.children, str) else ""
+            )
             # Detect start of the table:
             if "|" in snippet_text or self.in_table:
                 # most likely part of the markdown table

@@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                             if parent_item
                             else False
                         )
+                        parent_ref = parent_item.self_ref if parent_item else None
                         parent_item = self._create_list_item(
                             doc=doc,
                             parent_item=parent_item,

@@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                             formatting=formatting,
                             hyperlink=hyperlink,
                         )
+                        if parent_ref:
+                            list_last_item_by_ref[parent_ref] = cast(
+                                ListItem, parent_item
+                            )
+
                     elif isinstance(to_create, _HeadingCreationPayload):
                         # not keeping as parent_item as logic for correctly tracking
                         # that not implemented yet (section components not captured

@@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             element, processed_block_types
         ):
             for child in element.children:
+                if (
+                    isinstance(element, marko.block.ListItem)
+                    and isinstance(child, marko.block.List)
+                    and parent_item
+                    and list_last_item_by_ref.get(parent_item.self_ref, None)
+                ):
+                    _log.debug(
+                        f"walking into new List hanging from item of parent list {parent_item.self_ref}"
+                    )
+                    parent_item = list_last_item_by_ref[parent_item.self_ref]
+
                 self._iterate_elements(
                     element=child,
                     depth=depth + 1,

@@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     visited=visited,
                     creation_stack=creation_stack,
                     list_ordered_flag_by_ref=list_ordered_flag_by_ref,
+                    list_last_item_by_ref=list_last_item_by_ref,
                     parent_item=parent_item,
                     formatting=formatting,
                     hyperlink=hyperlink,

@@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         return False
 
     @classmethod
-    def supported_formats(cls) ->
+    def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.MD}
 
     def convert(self) -> DoclingDocument:

@@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             visited=set(),
             creation_stack=[],
             list_ordered_flag_by_ref={},
+            list_last_item_by_ref={},
         )
         self._close_table(doc=doc)  # handle any last hanging table
 

@@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         ]:
             html_str = _restore_original_html(txt=html_str, regex=regex)
         self._html_blocks = 0
-
         # delegate to HTML backend
         stream = BytesIO(bytes(html_str, encoding="utf-8"))
         in_doc = InputDocument(
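The new `list_last_item_by_ref` mapping remembers the most recent `ListItem` created under each list, so that a nested `marko.block.List` can be re-parented onto that item instead of onto the surrounding list. A small end-to-end sketch (the scratch file name is hypothetical):

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

md = """\
- parent item
  - child item 1
  - child item 2
- second parent
"""
path = Path("nested_lists.md")  # hypothetical scratch file
path.write_text(md, encoding="utf-8")

result = DocumentConverter().convert(path)
# Sub-lists now hang from the preceding ListItem, so the nesting
# survives a round trip through DoclingDocument.
print(result.document.export_to_markdown())
```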
docling/datamodel/pipeline_options.py
CHANGED

@@ -332,3 +332,18 @@ class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
     ASR = "asr"
+
+
+class ThreadedPdfPipelineOptions(PdfPipelineOptions):
+    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
+
+    # Batch sizes for different stages
+    ocr_batch_size: int = 4
+    layout_batch_size: int = 4
+    table_batch_size: int = 4
+
+    # Timing control
+    batch_timeout_seconds: float = 2.0
+
+    # Backpressure and queue control
+    queue_max_size: int = 100
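A sketch of enabling the new pipeline with these options, assuming the existing `PdfFormatOption(pipeline_cls=...)` mechanism for selecting a pipeline class:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline

opts = ThreadedPdfPipelineOptions(
    layout_batch_size=8,        # pages handed to the layout model per call
    ocr_batch_size=4,
    table_batch_size=4,
    batch_timeout_seconds=1.0,  # flush a partial batch after this delay
    queue_max_size=100,         # bound on inter-stage queues (backpressure)
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=ThreadedStandardPdfPipeline,
            pipeline_options=opts,
        )
    }
)
result = converter.convert("report.pdf")  # hypothetical input file
```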
docling/datamodel/settings.py
CHANGED

@@ -26,18 +26,13 @@ class DocumentLimits(BaseModel):
 
 
 class BatchConcurrencySettings(BaseModel):
-    doc_batch_size: int =
-    doc_batch_concurrency: int =
-    page_batch_size: int = 4
-    page_batch_concurrency: int =
-    elements_batch_size: int =
-
-
-    # doc_batch_concurrency: int = 1
-    # page_batch_size: int = 1
-    # page_batch_concurrency: int = 1
-
-    # model_concurrency: int = 2
+    doc_batch_size: int = 1  # Number of documents processed in one batch. Should be >= doc_batch_concurrency
+    doc_batch_concurrency: int = 1  # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
+    page_batch_size: int = 4  # Number of pages processed in one batch.
+    page_batch_concurrency: int = 1  # Currently unused.
+    elements_batch_size: int = (
+        16  # Number of elements processed in one batch, in enrichment models.
+    )
 
     # To force models into single core: export OMP_NUM_THREADS=1
 
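These values are read from the global `settings.perf` object when a conversion runs, so they can be tuned ahead of time. A brief sketch, keeping the experimental caveat above in mind:

```python
from docling.datamodel.settings import settings

# Process up to four documents per batch with four worker threads.
# Experimental: on a standard (GIL) Python build the threads mostly
# serialize, so little speedup should be expected.
settings.perf.doc_batch_size = 4
settings.perf.doc_batch_concurrency = 4
```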
docling/document_converter.py
CHANGED

@@ -4,6 +4,7 @@ import sys
 import threading
 import time
 from collections.abc import Iterable, Iterator
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union

@@ -284,24 +285,33 @@
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info("Going to convert document batch...")
+            process_func = partial(
+                self._process_document, raises_on_error=raises_on_error
+            )
 
-
-
-
-            # ) as pool:
-            #     yield from pool.map(self.process_document, input_batch)
-            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
-
-            for item in map(
-                partial(self._process_document, raises_on_error=raises_on_error),
-                input_batch,
+            if (
+                settings.perf.doc_batch_concurrency > 1
+                and settings.perf.doc_batch_size > 1
             ):
-
-
-
-
-
-
+                with ThreadPoolExecutor(
+                    max_workers=settings.perf.doc_batch_concurrency
+                ) as pool:
+                    for item in pool.map(
+                        process_func,
+                        input_batch,
+                    ):
+                        yield item
+            else:
+                for item in map(
+                    process_func,
+                    input_batch,
+                ):
+                    elapsed = time.monotonic() - start_time
+                    start_time = time.monotonic()
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
+                    yield item
 
     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
         """Retrieve or initialize a pipeline, reusing instances based on class and options."""

@@ -330,7 +340,7 @@
                 f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
             )
 
-
+        return self.initialized_pipelines[cache_key]
 
     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
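The concurrent branch leans on `ThreadPoolExecutor.map`, which yields results in submission order even when workers finish out of order, so callers see documents in the same order as the input batch. A self-contained sketch of that property:

```python
import time
from concurrent.futures import ThreadPoolExecutor

def process(doc_id: int) -> str:
    # Simulate uneven per-document work; later documents finish first.
    time.sleep(0.1 * (4 - doc_id))
    return f"doc-{doc_id} done"

with ThreadPoolExecutor(max_workers=4) as pool:
    # map() yields in input order, mirroring the sequential code path.
    for item in pool.map(process, range(4)):
        print(item)
```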
docling/models/layout_model.py
CHANGED

@@ -3,7 +3,7 @@ import logging
 import warnings
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union
 
 import numpy as np
 from docling_core.types.doc import DocItemLabel

@@ -148,72 +148,90 @@ class LayoutModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
+        # Convert to list to allow multiple iterations
+        pages = list(page_batch)
+
+        # Separate valid and invalid pages
+        valid_pages = []
+        valid_page_images: List[Union[Image.Image, np.ndarray]] = []
+
+        for page in pages:
             assert page._backend is not None
             if not page._backend.is_valid():
-
-            else:
-                with TimeRecorder(conv_res, "layout"):
-                    assert page.size is not None
-                    page_image = page.get_image(scale=1.0)
-                    assert page_image is not None
-
-                    clusters = []
-                    for ix, pred_item in enumerate(
-                        self.layout_predictor.predict(page_image)
-                    ):
-                        label = DocItemLabel(
-                            pred_item["label"]
-                            .lower()
-                            .replace(" ", "_")
-                            .replace("-", "_")
-                        )  # Temporary, until docling-ibm-model uses docling-core types
-                        cluster = Cluster(
-                            id=ix,
-                            label=label,
-                            confidence=pred_item["confidence"],
-                            bbox=BoundingBox.model_validate(pred_item),
-                            cells=[],
-                        )
-                        clusters.append(cluster)
-
-                    if settings.debug.visualize_raw_layout:
-                        self.draw_clusters_and_cells_side_by_side(
-                            conv_res, page, clusters, mode_prefix="raw"
-                        )
-
-                    # Apply postprocessing
-
-                    processed_clusters, processed_cells = LayoutPostprocessor(
-                        page, clusters, self.options
-                    ).postprocess()
-                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
-
-                    with warnings.catch_warnings():
-                        warnings.filterwarnings(
-                            "ignore",
-                            "Mean of empty slice|invalid value encountered in scalar divide",
-                            RuntimeWarning,
-                            "numpy",
-                        )
-
-                        conv_res.confidence.pages[page.page_no].layout_score = float(
-                            np.mean([c.confidence for c in processed_clusters])
-                        )
-
-                        conv_res.confidence.pages[page.page_no].ocr_score = float(
-                            np.mean(
-                                [c.confidence for c in processed_cells if c.from_ocr]
-                            )
-                        )
-
-                    page.predictions.layout = LayoutPrediction(
-                        clusters=processed_clusters
-                    )
-
-                    if settings.debug.visualize_layout:
-                        self.draw_clusters_and_cells_side_by_side(
-                            conv_res, page, processed_clusters, mode_prefix="postprocessed"
-                        )
+                continue
 
+            assert page.size is not None
+            page_image = page.get_image(scale=1.0)
+            assert page_image is not None
+
+            valid_pages.append(page)
+            valid_page_images.append(page_image)
+
+        # Process all valid pages with batch prediction
+        batch_predictions = []
+        if valid_page_images:
+            with TimeRecorder(conv_res, "layout"):
+                batch_predictions = self.layout_predictor.predict_batch(  # type: ignore[attr-defined]
+                    valid_page_images
+                )
+
+        # Process each page with its predictions
+        valid_page_idx = 0
+        for page in pages:
+            assert page._backend is not None
+            if not page._backend.is_valid():
                 yield page
+                continue
+
+            page_predictions = batch_predictions[valid_page_idx]
+            valid_page_idx += 1
+
+            clusters = []
+            for ix, pred_item in enumerate(page_predictions):
+                label = DocItemLabel(
+                    pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                )  # Temporary, until docling-ibm-model uses docling-core types
+                cluster = Cluster(
+                    id=ix,
+                    label=label,
+                    confidence=pred_item["confidence"],
+                    bbox=BoundingBox.model_validate(pred_item),
+                    cells=[],
+                )
+                clusters.append(cluster)
+
+            if settings.debug.visualize_raw_layout:
+                self.draw_clusters_and_cells_side_by_side(
+                    conv_res, page, clusters, mode_prefix="raw"
+                )
+
+            # Apply postprocessing
+            processed_clusters, processed_cells = LayoutPostprocessor(
+                page, clusters, self.options
+            ).postprocess()
+            # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
+
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    "Mean of empty slice|invalid value encountered in scalar divide",
+                    RuntimeWarning,
+                    "numpy",
+                )
+
+                conv_res.confidence.pages[page.page_no].layout_score = float(
+                    np.mean([c.confidence for c in processed_clusters])
+                )
+
+                conv_res.confidence.pages[page.page_no].ocr_score = float(
+                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                )
+
+            page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
+
+            if settings.debug.visualize_layout:
+                self.draw_clusters_and_cells_side_by_side(
+                    conv_res, page, processed_clusters, mode_prefix="postprocessed"
+                )
+
+            yield page
docling/pipeline/threaded_standard_pdf_pipeline.py
ADDED

@@ -0,0 +1,605 @@
+# threaded_standard_pdf_pipeline.py
+"""Thread-safe, production-ready PDF pipeline
+================================================
+A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
+
+* **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
+  threads so that concurrent invocations never share mutable state.
+* **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
+  relying on :pyfunc:`id`, which may clash after garbage collection.
+* **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
+  propagates downstream so stages terminate deterministically without sentinels.
+* **Minimal shared state** - heavyweight models are initialised once per pipeline instance
+  and only read by worker threads; no runtime mutability is exposed.
+* **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
+"""
+
+from __future__ import annotations
+
+import itertools
+import logging
+import threading
+import time
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Iterable, List, Optional, Sequence, Tuple
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
+from docling.datamodel.settings import settings
+from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
+from docling.models.document_picture_classifier import (
+    DocumentPictureClassifier,
+    DocumentPictureClassifierOptions,
+)
+from docling.models.factories import get_ocr_factory, get_picture_description_factory
+from docling.models.layout_model import LayoutModel
+from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
+from docling.models.page_preprocessing_model import (
+    PagePreprocessingModel,
+    PagePreprocessingOptions,
+)
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
+from docling.models.table_structure_model import TableStructureModel
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docling.utils.utils import chunkify
+
+_log = logging.getLogger(__name__)
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Helper data structures
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+@dataclass
+class ThreadedItem:
+    """Envelope that travels between pipeline stages."""
+
+    payload: Optional[Page]
+    run_id: int  # Unique per *execute* call, monotonic across pipeline instance
+    page_no: int
+    conv_res: ConversionResult
+    error: Optional[Exception] = None
+    is_failed: bool = False
+
+
+@dataclass
+class ProcessingResult:
+    """Aggregated outcome of a pipeline run."""
+
+    pages: List[Page] = field(default_factory=list)
+    failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
+    total_expected: int = 0
+
+    @property
+    def success_count(self) -> int:
+        return len(self.pages)
+
+    @property
+    def failure_count(self) -> int:
+        return len(self.failed_pages)
+
+    @property
+    def is_partial_success(self) -> bool:
+        return 0 < self.success_count < self.total_expected
+
+    @property
+    def is_complete_failure(self) -> bool:
+        return self.success_count == 0 and self.failure_count > 0
+
+
+class ThreadedQueue:
+    """Bounded queue with blocking put/ get_batch and explicit *close()* semantics."""
+
+    __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
+
+    def __init__(self, max_size: int) -> None:
+        self._max: int = max_size
+        self._items: deque[ThreadedItem] = deque()
+        self._lock = threading.Lock()
+        self._not_full = threading.Condition(self._lock)
+        self._not_empty = threading.Condition(self._lock)
+        self._closed = False
+
+    # ---------------------------------------------------------------- put()
+    def put(self, item: ThreadedItem, timeout: Optional[float] | None = None) -> bool:
+        """Block until queue accepts *item* or is closed. Returns *False* if closed."""
+        with self._not_full:
+            if self._closed:
+                return False
+            start = time.monotonic()
+            while len(self._items) >= self._max and not self._closed:
+                if timeout is not None:
+                    remaining = timeout - (time.monotonic() - start)
+                    if remaining <= 0:
+                        return False
+                    self._not_full.wait(remaining)
+                else:
+                    self._not_full.wait()
+            if self._closed:
+                return False
+            self._items.append(item)
+            self._not_empty.notify()
+            return True
+
+    # ------------------------------------------------------------ get_batch()
+    def get_batch(
+        self, size: int, timeout: Optional[float] | None = None
+    ) -> List[ThreadedItem]:
+        """Return up to *size* items. Blocks until ≥1 item present or queue closed/timeout."""
+        with self._not_empty:
+            start = time.monotonic()
+            while not self._items and not self._closed:
+                if timeout is not None:
+                    remaining = timeout - (time.monotonic() - start)
+                    if remaining <= 0:
+                        return []
+                    self._not_empty.wait(remaining)
+                else:
+                    self._not_empty.wait()
+            batch: List[ThreadedItem] = []
+            while self._items and len(batch) < size:
+                batch.append(self._items.popleft())
+            if batch:
+                self._not_full.notify_all()
+            return batch
+
+    # ---------------------------------------------------------------- close()
+    def close(self) -> None:
+        with self._lock:
+            self._closed = True
+            self._not_empty.notify_all()
+            self._not_full.notify_all()
+
+    # -------------------------------------------------------------- property
+    @property
+    def closed(self) -> bool:
+        return self._closed
+
+
+class ThreadedPipelineStage:
+    """A single pipeline stage backed by one worker thread."""
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        model: Any,
+        batch_size: int,
+        batch_timeout: float,
+        queue_max_size: int,
+    ) -> None:
+        self.name = name
+        self.model = model
+        self.batch_size = batch_size
+        self.batch_timeout = batch_timeout
+        self.input_queue = ThreadedQueue(queue_max_size)
+        self._outputs: list[ThreadedQueue] = []
+        self._thread: Optional[threading.Thread] = None
+        self._running = False
+
+    # ---------------------------------------------------------------- wiring
+    def add_output_queue(self, q: ThreadedQueue) -> None:
+        self._outputs.append(q)
+
+    # -------------------------------------------------------------- lifecycle
+    def start(self) -> None:
+        if self._running:
+            return
+        self._running = True
+        self._thread = threading.Thread(
+            target=self._run, name=f"Stage-{self.name}", daemon=False
+        )
+        self._thread.start()
+
+    def stop(self) -> None:
+        if not self._running:
+            return
+        self._running = False
+        self.input_queue.close()
+        if self._thread is not None:
+            self._thread.join(timeout=30.0)
+            if self._thread.is_alive():
+                _log.warning("Stage %s did not terminate cleanly within 30s", self.name)
+
+    # ------------------------------------------------------------------ _run
+    def _run(self) -> None:
+        try:
+            while self._running:
+                batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
+                if not batch and self.input_queue.closed:
+                    break
+                processed = self._process_batch(batch)
+                self._emit(processed)
+        except Exception:  # pragma: no cover - top-level guard
+            _log.exception("Fatal error in stage %s", self.name)
+        finally:
+            for q in self._outputs:
+                q.close()
+
+    # ----------------------------------------------------- _process_batch()
+    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+        """Run *model* on *batch* grouped by run_id to maximise batching."""
+        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+        for itm in batch:
+            groups[itm.run_id].append(itm)
+
+        result: list[ThreadedItem] = []
+        for rid, items in groups.items():
+            good: list[ThreadedItem] = [i for i in items if not i.is_failed]
+            if not good:
+                result.extend(items)
+                continue
+            try:
+                # Filter out None payloads and ensure type safety
+                pages_with_payloads = [
+                    (i, i.payload) for i in good if i.payload is not None
+                ]
+                if len(pages_with_payloads) != len(good):
+                    # Some items have None payloads, mark all as failed
+                    for it in items:
+                        it.is_failed = True
+                        it.error = RuntimeError("Page payload is None")
+                    result.extend(items)
+                    continue
+
+                pages: List[Page] = [payload for _, payload in pages_with_payloads]
+                processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
+                if len(processed_pages) != len(pages):  # strict mismatch guard
+                    raise RuntimeError(
+                        f"Model {self.name} returned wrong number of pages"
+                    )
+                for idx, page in enumerate(processed_pages):
+                    result.append(
+                        ThreadedItem(
+                            payload=page,
+                            run_id=rid,
+                            page_no=good[idx].page_no,
+                            conv_res=good[idx].conv_res,
+                        )
+                    )
+            except Exception as exc:
+                _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
+                for it in items:
+                    it.is_failed = True
+                    it.error = exc
+                result.extend(items)
+        return result
+
+    # -------------------------------------------------------------- _emit()
+    def _emit(self, items: Iterable[ThreadedItem]) -> None:
+        for item in items:
+            for q in self._outputs:
+                if not q.put(item):
+                    _log.error("Output queue closed while emitting from %s", self.name)
+
+
+@dataclass
+class RunContext:
+    """Wiring for a single *execute* call."""
+
+    stages: list[ThreadedPipelineStage]
+    first_stage: ThreadedPipelineStage
+    output_queue: ThreadedQueue
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Main pipeline
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+class ThreadedStandardPdfPipeline(BasePipeline):
+    """High-performance PDF pipeline with multi-threaded stages."""
+
+    def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
+        super().__init__(pipeline_options)
+        self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
+        self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
+
+        # initialise heavy models once
+        self._init_models()
+
+    # ────────────────────────────────────────────────────────────────────────
+    # Heavy-model initialisation & helpers
+    # ────────────────────────────────────────────────────────────────────────
+
+    def _init_models(self) -> None:
+        art_path = self._resolve_artifacts_path()
+        self.keep_images = (
+            self.pipeline_options.generate_page_images
+            or self.pipeline_options.generate_picture_images
+            or self.pipeline_options.generate_table_images
+        )
+        self.preprocessing_model = PagePreprocessingModel(
+            options=PagePreprocessingOptions(
+                images_scale=self.pipeline_options.images_scale
+            )
+        )
+        self.ocr_model = self._make_ocr_model(art_path)
+        self.layout_model = LayoutModel(
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+            options=self.pipeline_options.layout_options,
+        )
+        self.table_model = TableStructureModel(
+            enabled=self.pipeline_options.do_table_structure,
+            artifacts_path=art_path,
+            options=self.pipeline_options.table_structure_options,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+        self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
+        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+
+        # --- optional enrichment ------------------------------------------------
+        self.enrichment_pipe = []
+        code_formula = CodeFormulaModel(
+            enabled=self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_formula_enrichment,
+            artifacts_path=art_path,
+            options=CodeFormulaModelOptions(
+                do_code_enrichment=self.pipeline_options.do_code_enrichment,
+                do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
+            ),
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+        if code_formula.enabled:
+            self.enrichment_pipe.append(code_formula)
+
+        picture_classifier = DocumentPictureClassifier(
+            enabled=self.pipeline_options.do_picture_classification,
+            artifacts_path=art_path,
+            options=DocumentPictureClassifierOptions(),
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+        if picture_classifier.enabled:
+            self.enrichment_pipe.append(picture_classifier)
+
+        picture_descr = self._make_picture_description_model(art_path)
+        if picture_descr and picture_descr.enabled:
+            self.enrichment_pipe.append(picture_descr)
+
+        self.keep_backend = any(
+            (
+                self.pipeline_options.do_formula_enrichment,
+                self.pipeline_options.do_code_enrichment,
+                self.pipeline_options.do_picture_classification,
+                self.pipeline_options.do_picture_description,
+            )
+        )
+
+    # ---------------------------------------------------------------- helpers
+    def _resolve_artifacts_path(self) -> Optional[Path]:
+        if self.pipeline_options.artifacts_path:
+            p = Path(self.pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path:
+            p = Path(settings.artifacts_path).expanduser()
+        else:
+            return None
+        if not p.is_dir():
+            raise RuntimeError(
+                f"{p} does not exist or is not a directory containing the required models"
+            )
+        return p
+
+    def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
+        factory = get_ocr_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.ocr_options,
+            enabled=self.pipeline_options.do_ocr,
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
+    def _make_picture_description_model(
+        self, art_path: Optional[Path]
+    ) -> Optional[PictureDescriptionBaseModel]:
+        factory = get_picture_description_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.picture_description_options,
+            enabled=self.pipeline_options.do_picture_description,
+            enable_remote_services=self.pipeline_options.enable_remote_services,
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
+    # ────────────────────────────────────────────────────────────────────────
+    # Build - thread pipeline
+    # ────────────────────────────────────────────────────────────────────────
+
+    def _create_run_ctx(self) -> RunContext:
+        opts = self.pipeline_options
+        preprocess = ThreadedPipelineStage(
+            name="preprocess",
+            model=self.preprocessing_model,
+            batch_size=1,
+            batch_timeout=opts.batch_timeout_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+        ocr = ThreadedPipelineStage(
+            name="ocr",
+            model=self.ocr_model,
+            batch_size=opts.ocr_batch_size,
+            batch_timeout=opts.batch_timeout_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+        layout = ThreadedPipelineStage(
+            name="layout",
+            model=self.layout_model,
+            batch_size=opts.layout_batch_size,
+            batch_timeout=opts.batch_timeout_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+        table = ThreadedPipelineStage(
+            name="table",
+            model=self.table_model,
+            batch_size=opts.table_batch_size,
+            batch_timeout=opts.batch_timeout_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+        assemble = ThreadedPipelineStage(
+            name="assemble",
+            model=self.assemble_model,
+            batch_size=1,
+            batch_timeout=opts.batch_timeout_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+
+        # wire stages
+        output_q = ThreadedQueue(opts.queue_max_size)
+        preprocess.add_output_queue(ocr.input_queue)
+        ocr.add_output_queue(layout.input_queue)
+        layout.add_output_queue(table.input_queue)
+        table.add_output_queue(assemble.input_queue)
+        assemble.add_output_queue(output_q)
+
+        stages = [preprocess, ocr, layout, table, assemble]
+        return RunContext(stages=stages, first_stage=preprocess, output_queue=output_q)
+
+    # --------------------------------------------------------------------- build
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        """Stream-build the document while interleaving producer and consumer work."""
+        run_id = next(self._run_seq)
+        assert isinstance(conv_res.input._backend, PdfDocumentBackend)
+        backend = conv_res.input._backend
+
+        # preload & initialise pages -------------------------------------------------------------
+        start_page, end_page = conv_res.input.limits.page_range
+        pages: list[Page] = []
+        for i in range(conv_res.input.page_count):
+            if start_page - 1 <= i <= end_page - 1:
+                page = Page(page_no=i)
+                page._backend = backend.load_page(i)
+                if page._backend and page._backend.is_valid():
+                    page.size = page._backend.get_size()
+                    conv_res.pages.append(page)
+                    pages.append(page)
+
+        if not pages:
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        total_pages: int = len(pages)
+        ctx: RunContext = self._create_run_ctx()
+        for st in ctx.stages:
+            st.start()
+
+        proc = ProcessingResult(total_expected=total_pages)
+        fed_idx: int = 0  # number of pages successfully queued
+        batch_size: int = 32  # drain chunk
+        try:
+            while proc.success_count + proc.failure_count < total_pages:
+                # 1) feed - try to enqueue until the first queue is full
+                while fed_idx < total_pages:
+                    ok = ctx.first_stage.input_queue.put(
+                        ThreadedItem(
+                            payload=pages[fed_idx],
+                            run_id=run_id,
+                            page_no=pages[fed_idx].page_no,
+                            conv_res=conv_res,
+                        ),
+                        timeout=0.0,  # non-blocking try-put
+                    )
+                    if ok:
+                        fed_idx += 1
+                        if fed_idx == total_pages:
+                            ctx.first_stage.input_queue.close()
+                    else:  # queue full - switch to draining
+                        break
+
+                # 2) drain - pull whatever is ready from the output side
+                out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
+                for itm in out_batch:
+                    if itm.run_id != run_id:
+                        continue
+                    if itm.is_failed or itm.error:
+                        proc.failed_pages.append(
+                            (itm.page_no, itm.error or RuntimeError("unknown error"))
+                        )
+                    else:
+                        assert itm.payload is not None
+                        proc.pages.append(itm.payload)
+
+                # 3) failure safety - downstream closed early -> mark missing pages failed
+                if not out_batch and ctx.output_queue.closed:
+                    missing = total_pages - (proc.success_count + proc.failure_count)
+                    if missing > 0:
+                        proc.failed_pages.extend(
+                            [(-1, RuntimeError("pipeline terminated early"))] * missing
+                        )
+                    break
+        finally:
+            for st in ctx.stages:
+                st.stop()
+            ctx.output_queue.close()
+
+        self._integrate_results(conv_res, proc)
+        return conv_res
+
+    # ---------------------------------------------------- integrate_results()
+    def _integrate_results(
+        self, conv_res: ConversionResult, proc: ProcessingResult
+    ) -> None:
+        page_map = {p.page_no: p for p in proc.pages}
+        conv_res.pages = [
+            page_map.get(p.page_no, p)
+            for p in conv_res.pages
+            if p.page_no in page_map
+            or not any(fp == p.page_no for fp, _ in proc.failed_pages)
+        ]
+        if proc.is_complete_failure:
+            conv_res.status = ConversionStatus.FAILURE
+        elif proc.is_partial_success:
+            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+        else:
+            conv_res.status = ConversionStatus.SUCCESS
+        if not self.keep_images:
+            for p in conv_res.pages:
+                p._image_cache = {}
+        if not self.keep_backend:
+            for p in conv_res.pages:
+                if p._backend is not None:
+                    p._backend.unload()
+
+    # ---------------------------------------------------------------- assemble
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        elements, headers, body = [], [], []
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+            for p in conv_res.pages:
+                if p.assembled:
+                    elements.extend(p.assembled.elements)
+                    headers.extend(p.assembled.headers)
+                    body.extend(p.assembled.body)
+            conv_res.assembled = AssembledUnit(
+                elements=elements, headers=headers, body=body
+            )
+            conv_res.document = self.reading_order_model(conv_res)
+        return conv_res
+
+    # ---------------------------------------------------------------- misc
+    @classmethod
+    def get_default_options(cls) -> ThreadedPdfPipelineOptions:
+        return ThreadedPdfPipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
+        return isinstance(backend, PdfDocumentBackend)
+
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        return conv_res.status
+
+    def _unload(self, conv_res: ConversionResult) -> None:
+        for p in conv_res.pages:
+            if p._backend is not None:
+                p._backend.unload()
+        if conv_res.input._backend:
+            conv_res.input._backend.unload()
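The backpressure contract of the new pipeline lives in `ThreadedQueue`: `put()` blocks while the queue is at capacity, `get_batch()` blocks until at least one item arrives or a timeout expires, and `close()` wakes all waiters so stages drain and exit without sentinel objects. A toy driver for the class as defined above (payload and conv_res are loosened to None just to exercise the queue; the dataclass does not validate them):

```python
import threading
import time

from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedItem, ThreadedQueue

q = ThreadedQueue(max_size=2)  # tiny capacity to force backpressure

def producer() -> None:
    for i in range(5):
        # Blocks while the queue already holds max_size items; returns
        # False only if the queue was closed underneath us.
        q.put(ThreadedItem(payload=None, run_id=1, page_no=i, conv_res=None))
    q.close()  # wakes any consumer blocked in get_batch()

t = threading.Thread(target=producer)
t.start()
while True:
    batch = q.get_batch(size=3, timeout=0.5)
    if not batch and q.closed:
        break  # closed and fully drained
    print("drained pages", [item.page_no for item in batch])
    time.sleep(0.1)  # slow consumer -> producer briefly blocks
t.join()
```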
{docling-2.42.2.dist-info → docling-2.43.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.42.2
+Version: 2.43.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT

@@ -28,9 +28,9 @@ License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
 Requires-Dist: docling-parse<5.0.0,>=4.0.0
-Requires-Dist: docling-ibm-models<4,>=3.
+Requires-Dist: docling-ibm-models<4,>=3.9.0
 Requires-Dist: filetype<2.0.0,>=1.2.0
-Requires-Dist: pypdfium2
+Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
 Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
 Requires-Dist: huggingface_hub<1,>=0.23
 Requires-Dist: requests<3.0.0,>=2.32.2
{docling-2.42.2.dist-info → docling-2.43.0.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/document_converter.py,sha256=
+docling/document_converter.py,sha256=pYlozCp6X1iGO75m3KSudMfrSCrXihTlRpKARFN67BI,14757
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -9,8 +9,8 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
 docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
 docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
 docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
-docling/backend/html_backend.py,sha256=
-docling/backend/md_backend.py,sha256=
+docling/backend/html_backend.py,sha256=Nuzyp6kyjd0g_MsBEPiWdFWU5w9UM60yWSluwU5C0M4,20310
+docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
 docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
 docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
 docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904

@@ -37,10 +37,10 @@ docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF
 docling/datamodel/base_models.py,sha256=9FslHkGUNmBp264LpLL_2JTfDAdaikldYs3SiQOHb5A,11828
 docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
 docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
-docling/datamodel/pipeline_options.py,sha256=
+docling/datamodel/pipeline_options.py,sha256=TaBmCBRjSxyoh79UkpEkPzokLYS8BA2QJam86g9pT5g,10544
 docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
 docling/datamodel/pipeline_options_vlm_model.py,sha256=z-pUqwRA8nJp6C3SEXZLem2zvSYdgavaAVYa8wkAIZY,2400
-docling/datamodel/settings.py,sha256=
+docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
 docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/api_vlm_model.py,sha256=foBvzaWeHFH1t-VdvRWLdiXiiofhvhjvHqRI0eNA_3w,2923

@@ -49,7 +49,7 @@ docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDW
 docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
 docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
 docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
-docling/models/layout_model.py,sha256=
+docling/models/layout_model.py,sha256=Nfbo6keMB4vVjGoZdFMqD9CmZcWh-0bE3LkRjJTDJQ0,9146
 docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
 docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
 docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103

@@ -77,6 +77,7 @@ docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeX
 docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
 docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
 docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
+docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=Rjdq1x2fRHBA0rMHJ6rqqHzxVVzgTEALBBj5d30oOZ8,26018
 docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909

@@ -91,9 +92,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.
-docling-2.
-docling-2.
-docling-2.
-docling-2.
-docling-2.
+docling-2.43.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.43.0.dist-info/METADATA,sha256=HS5J6rDKaZ_G_d4p10XgAwrNe-FjmHV-u5EmoTP4hro,10458
+docling-2.43.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.43.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.43.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.43.0.dist-info/RECORD,,
{docling-2.42.2.dist-info → docling-2.43.0.dist-info}/WHEEL
File without changes

{docling-2.42.2.dist-info → docling-2.43.0.dist-info}/entry_points.txt
File without changes

{docling-2.42.2.dist-info → docling-2.43.0.dist-info}/licenses/LICENSE
File without changes

{docling-2.42.2.dist-info → docling-2.43.0.dist-info}/top_level.txt
File without changes