docling 2.59.0__py3-none-any.whl → 2.60.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries, and is provided for informational purposes only.

Potentially problematic release.

@@ -1,19 +1,39 @@
+"""Thread-safe, production-ready PDF pipeline
+================================================
+A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
+
+* **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
+  threads so that concurrent invocations never share mutable state.
+* **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
+  relying on :pyfunc:`id`, which may clash after garbage collection.
+* **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
+  propagates downstream so stages terminate deterministically without sentinels.
+* **Minimal shared state** - heavyweight models are initialised once per pipeline instance
+  and only read by worker threads; no runtime mutability is exposed.
+* **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
+"""
+
+from __future__ import annotations
+
+import itertools
 import logging
+import threading
+import time
 import warnings
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Optional, cast
+from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast

 import numpy as np
 from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page
+from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.layout_model_specs import LayoutModelConfig
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
 from docling.datamodel.settings import settings
-from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
 from docling.models.factories import get_ocr_factory
 from docling.models.layout_model import LayoutModel
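
The import swap above (`ThreadedPdfPipelineOptions` replacing `PdfPipelineOptions`) is the user-visible part of the change: per-stage batch sizes and queue bounds become pipeline options. A minimal usage sketch, assuming docling's public `DocumentConverter`/`PdfFormatOption` API; the option field names are taken from their usages further down in this diff, and the values are illustrative, not recommended defaults:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    # Field names mirror the opts.* reads in _create_run_ctx below.
    opts = ThreadedPdfPipelineOptions(
        ocr_batch_size=4,
        layout_batch_size=4,
        table_batch_size=4,
        queue_max_size=100,
    )
    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
    )
    result = converter.convert("report.pdf")  # hypothetical input file
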
@@ -24,132 +44,588 @@ from docling.models.page_preprocessing_model import (
 )
 from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
 from docling.models.table_structure_model import TableStructureModel
-from docling.pipeline.base_pipeline import PaginatedPipeline
-from docling.utils.model_downloader import download_models
+from docling.pipeline.base_pipeline import ConvertPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)

+# ──────────────────────────────────────────────────────────────────────────────
+# Helper data structures
+# ──────────────────────────────────────────────────────────────────────────────

-class StandardPdfPipeline(PaginatedPipeline):
-    def __init__(self, pipeline_options: PdfPipelineOptions):
-        super().__init__(pipeline_options)
-        self.pipeline_options: PdfPipelineOptions
-
-        with warnings.catch_warnings():  # deprecated generate_table_images
-            warnings.filterwarnings("ignore", category=DeprecationWarning)
-            self.keep_images = (
-                self.pipeline_options.generate_page_images
-                or self.pipeline_options.generate_picture_images
-                or self.pipeline_options.generate_table_images
-            )

-        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+@dataclass
+class ThreadedItem:
+    """Envelope that travels between pipeline stages."""
+
+    payload: Optional[Page]
+    run_id: int  # Unique per *execute* call, monotonic across pipeline instance
+    page_no: int
+    conv_res: ConversionResult
+    error: Optional[Exception] = None
+    is_failed: bool = False
+
+
+@dataclass
+class ProcessingResult:
+    """Aggregated outcome of a pipeline run."""
+
+    pages: List[Page] = field(default_factory=list)
+    failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
+    total_expected: int = 0
+
+    @property
+    def success_count(self) -> int:
+        return len(self.pages)
+
+    @property
+    def failure_count(self) -> int:
+        return len(self.failed_pages)
+
+    @property
+    def is_partial_success(self) -> bool:
+        return 0 < self.success_count < self.total_expected

-        ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)
+    @property
+    def is_complete_failure(self) -> bool:
+        return self.success_count == 0 and self.failure_count > 0

-        self.build_pipe = [
-            # Pre-processing
-            PagePreprocessingModel(
-                options=PagePreprocessingOptions(
-                    images_scale=pipeline_options.images_scale,
+
+class ThreadedQueue:
+    """Bounded queue with blocking put/get_batch and explicit *close()* semantics."""
+
+    __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
+
+    def __init__(self, max_size: int) -> None:
+        self._max: int = max_size
+        self._items: deque[ThreadedItem] = deque()
+        self._lock = threading.Lock()
+        self._not_full = threading.Condition(self._lock)
+        self._not_empty = threading.Condition(self._lock)
+        self._closed = False
+
+    # ---------------------------------------------------------------- put()
+    def put(self, item: ThreadedItem, timeout: Optional[float] | None = None) -> bool:
+        """Block until queue accepts *item* or is closed. Returns *False* if closed."""
+        with self._not_full:
+            if self._closed:
+                return False
+            start = time.monotonic()
+            while len(self._items) >= self._max and not self._closed:
+                if timeout is not None:
+                    remaining = timeout - (time.monotonic() - start)
+                    if remaining <= 0:
+                        return False
+                    self._not_full.wait(remaining)
+                else:
+                    self._not_full.wait()
+            if self._closed:
+                return False
+            self._items.append(item)
+            self._not_empty.notify()
+            return True
+
+    # ------------------------------------------------------------ get_batch()
+    def get_batch(
+        self, size: int, timeout: Optional[float] | None = None
+    ) -> List[ThreadedItem]:
+        """Return up to *size* items. Blocks until ≥1 item is present or the queue is closed/times out."""
+        with self._not_empty:
+            start = time.monotonic()
+            while not self._items and not self._closed:
+                if timeout is not None:
+                    remaining = timeout - (time.monotonic() - start)
+                    if remaining <= 0:
+                        return []
+                    self._not_empty.wait(remaining)
+                else:
+                    self._not_empty.wait()
+            batch: List[ThreadedItem] = []
+            while self._items and len(batch) < size:
+                batch.append(self._items.popleft())
+            if batch:
+                self._not_full.notify_all()
+            return batch
+
+    # ---------------------------------------------------------------- close()
+    def close(self) -> None:
+        with self._lock:
+            self._closed = True
+            self._not_empty.notify_all()
+            self._not_full.notify_all()
+
+    # -------------------------------------------------------------- property
+    @property
+    def closed(self) -> bool:
+        return self._closed
+
+
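
`ThreadedQueue` above is the backbone of the design: a bounded buffer whose `close()` wakes every waiter, so shutdown needs no sentinel items. A minimal sketch of those semantics, assuming only the class as defined above (the integer payloads stand in for `ThreadedItem`s; the annotation is not enforced at runtime):

    import threading

    q = ThreadedQueue(max_size=2)

    def consume() -> None:
        while True:
            batch = q.get_batch(size=8)  # blocks until items arrive or close()
            if not batch and q.closed:
                break  # queue drained and closed -> deterministic exit
            print(f"consumed {len(batch)} item(s)")

    t = threading.Thread(target=consume)
    t.start()
    for i in range(5):
        q.put(i)  # type: ignore[arg-type]  # blocks (back-pressure) while 2 items wait
    q.close()  # wakes the consumer; any subsequent put() returns False
    t.join()
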
+class ThreadedPipelineStage:
+    """A single pipeline stage backed by one worker thread."""
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        model: Any,
+        batch_size: int,
+        batch_timeout: float,
+        queue_max_size: int,
+        postprocess: Optional[Callable[[ThreadedItem], None]] = None,
+    ) -> None:
+        self.name = name
+        self.model = model
+        self.batch_size = batch_size
+        self.batch_timeout = batch_timeout
+        self.input_queue = ThreadedQueue(queue_max_size)
+        self._outputs: list[ThreadedQueue] = []
+        self._thread: Optional[threading.Thread] = None
+        self._running = False
+        self._postprocess = postprocess
+
+    # ---------------------------------------------------------------- wiring
+    def add_output_queue(self, q: ThreadedQueue) -> None:
+        self._outputs.append(q)
+
+    # -------------------------------------------------------------- lifecycle
+    def start(self) -> None:
+        if self._running:
+            return
+        self._running = True
+        self._thread = threading.Thread(
+            target=self._run, name=f"Stage-{self.name}", daemon=True
+        )
+        self._thread.start()
+
+    def stop(self) -> None:
+        if not self._running:
+            return
+        self._running = False
+        self.input_queue.close()
+        if self._thread is not None:
+            self._thread.join(timeout=30.0)
+            if self._thread.is_alive():
+                _log.warning("Stage %s did not terminate cleanly within 30s", self.name)
+
+    # ------------------------------------------------------------------ _run
+    def _run(self) -> None:
+        try:
+            while self._running:
+                batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
+                if not batch and self.input_queue.closed:
+                    break
+                processed = self._process_batch(batch)
+                self._emit(processed)
+        except Exception:  # pragma: no cover - top-level guard
+            _log.exception("Fatal error in stage %s", self.name)
+        finally:
+            for q in self._outputs:
+                q.close()
+
+    # ----------------------------------------------------- _process_batch()
+    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+        """Run *model* on *batch*, grouped by run_id to maximise batching."""
+        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+        for itm in batch:
+            groups[itm.run_id].append(itm)
+
+        result: list[ThreadedItem] = []
+        for rid, items in groups.items():
+            good: list[ThreadedItem] = [i for i in items if not i.is_failed]
+            if not good:
+                result.extend(items)
+                continue
+            try:
+                # Filter out None payloads and ensure type safety
+                pages_with_payloads = [
+                    (i, i.payload) for i in good if i.payload is not None
+                ]
+                if len(pages_with_payloads) != len(good):
+                    # Some items have None payloads, mark all as failed
+                    for it in items:
+                        it.is_failed = True
+                        it.error = RuntimeError("Page payload is None")
+                    result.extend(items)
+                    continue
+
+                pages: List[Page] = [payload for _, payload in pages_with_payloads]
+                processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
+                if len(processed_pages) != len(pages):  # strict mismatch guard
+                    raise RuntimeError(
+                        f"Model {self.name} returned wrong number of pages"
+                    )
+                for idx, page in enumerate(processed_pages):
+                    result.append(
+                        ThreadedItem(
+                            payload=page,
+                            run_id=rid,
+                            page_no=good[idx].page_no,
+                            conv_res=good[idx].conv_res,
+                        )
+                    )
+            except Exception as exc:
+                _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
+                for it in items:
+                    it.is_failed = True
+                    it.error = exc
+                result.extend(items)
+        return result
+
+    # -------------------------------------------------------------- _emit()
+    def _emit(self, items: Iterable[ThreadedItem]) -> None:
+        for item in items:
+            if self._postprocess is not None:
+                self._postprocess(item)
+            for q in self._outputs:
+                if not q.put(item):
+                    _log.error("Output queue closed while emitting from %s", self.name)
+
+
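
A stage is just a worker thread around a model callable with the shape `model(conv_res, pages) -> iterable of pages`. The following sketch runs one stage end to end with an identity model, assuming the classes above plus docling's `Page` (constructed as `Page(page_no=...)`, exactly as `_build_document` does below); passing `conv_res=None` is a shortcut the identity model never touches:

    def identity_model(conv_res: Any, pages: List[Page]) -> List[Page]:
        return pages  # a real model would return processed Page objects

    stage = ThreadedPipelineStage(
        name="noop",
        model=identity_model,
        batch_size=4,
        batch_timeout=0.1,
        queue_max_size=8,
    )
    sink = ThreadedQueue(max_size=8)
    stage.add_output_queue(sink)  # wiring: stage output -> sink
    stage.start()
    item = ThreadedItem(payload=Page(page_no=0), run_id=1, page_no=0, conv_res=None)  # type: ignore[arg-type]
    stage.input_queue.put(item)
    stage.input_queue.close()  # close propagates: the worker closes `sink` on exit
    done = sink.get_batch(size=4)  # the processed item re-emerges here
    stage.stop()
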
+class PreprocessThreadedStage(ThreadedPipelineStage):
+    """Pipeline stage that lazily loads PDF backends just-in-time."""
+
+    def __init__(
+        self,
+        *,
+        batch_timeout: float,
+        queue_max_size: int,
+        model: Any,
+    ) -> None:
+        super().__init__(
+            name="preprocess",
+            model=model,
+            batch_size=1,
+            batch_timeout=batch_timeout,
+            queue_max_size=queue_max_size,
+        )
+
+    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+        for itm in batch:
+            groups[itm.run_id].append(itm)
+
+        result: list[ThreadedItem] = []
+        for rid, items in groups.items():
+            good = [i for i in items if not i.is_failed]
+            if not good:
+                result.extend(items)
+                continue
+            try:
+                pages_with_payloads: list[tuple[ThreadedItem, Page]] = []
+                for it in good:
+                    page = it.payload
+                    if page is None:
+                        raise RuntimeError("Page payload is None")
+                    if page._backend is None:
+                        backend = it.conv_res.input._backend
+                        assert isinstance(backend, PdfDocumentBackend), (
+                            "Threaded pipeline only supports PdfDocumentBackend."
+                        )
+                        page_backend = backend.load_page(page.page_no)
+                        page._backend = page_backend
+                        if page_backend.is_valid():
+                            page.size = page_backend.get_size()
+                    pages_with_payloads.append((it, page))
+
+                pages = [payload for _, payload in pages_with_payloads]
+                processed_pages = list(
+                    self.model(good[0].conv_res, pages)  # type: ignore[arg-type]
                 )
-            ),
-            # OCR
-            ocr_model,
-            # Layout model
-            LayoutModel(
-                artifacts_path=self.artifacts_path,
-                accelerator_options=pipeline_options.accelerator_options,
-                options=pipeline_options.layout_options,
-            ),
-            # Table structure model
-            TableStructureModel(
-                enabled=pipeline_options.do_table_structure,
-                artifacts_path=self.artifacts_path,
-                options=pipeline_options.table_structure_options,
-                accelerator_options=pipeline_options.accelerator_options,
-            ),
-            # Page assemble
-            PageAssembleModel(options=PageAssembleOptions()),
-        ]
+                if len(processed_pages) != len(pages):
+                    raise RuntimeError(
+                        "PagePreprocessingModel returned unexpected number of pages"
+                    )
+                for idx, processed_page in enumerate(processed_pages):
+                    result.append(
+                        ThreadedItem(
+                            payload=processed_page,
+                            run_id=rid,
+                            page_no=good[idx].page_no,
+                            conv_res=good[idx].conv_res,
+                        )
+                    )
+            except Exception as exc:
+                _log.error("Stage preprocess failed for run %d: %s", rid, exc)
+                for it in items:
+                    it.is_failed = True
+                    it.error = exc
+                result.extend(items)
+        return result
+
+
+@dataclass
+class RunContext:
+    """Wiring for a single *execute* call."""
+
+    stages: list[ThreadedPipelineStage]
+    first_stage: ThreadedPipelineStage
+    output_queue: ThreadedQueue
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Main pipeline
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+class StandardPdfPipeline(ConvertPipeline):
+    """High-performance PDF pipeline with multi-threaded stages."""
+
+    def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
+        super().__init__(pipeline_options)
+        self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
+        self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
+
+        # initialise heavy models once
+        self._init_models()
+
+    # ────────────────────────────────────────────────────────────────────────
+    # Heavy-model initialisation & helpers
+    # ────────────────────────────────────────────────────────────────────────
+
+    def _init_models(self) -> None:
+        art_path = self.artifacts_path
+        self.keep_images = (
+            self.pipeline_options.generate_page_images
+            or self.pipeline_options.generate_picture_images
+            or self.pipeline_options.generate_table_images
+        )
+        self.preprocessing_model = PagePreprocessingModel(
+            options=PagePreprocessingOptions(
+                images_scale=self.pipeline_options.images_scale
+            )
+        )
+        self.ocr_model = self._make_ocr_model(art_path)
+        self.layout_model = LayoutModel(
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+            options=self.pipeline_options.layout_options,
+        )
+        self.table_model = TableStructureModel(
+            enabled=self.pipeline_options.do_table_structure,
+            artifacts_path=art_path,
+            options=self.pipeline_options.table_structure_options,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+        self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
+        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

+        # --- optional enrichment ------------------------------------------------
         self.enrichment_pipe = [
             # Code Formula Enrichment Model
             CodeFormulaModel(
-                enabled=pipeline_options.do_code_enrichment
-                or pipeline_options.do_formula_enrichment,
+                enabled=self.pipeline_options.do_code_enrichment
+                or self.pipeline_options.do_formula_enrichment,
                 artifacts_path=self.artifacts_path,
                 options=CodeFormulaModelOptions(
-                    do_code_enrichment=pipeline_options.do_code_enrichment,
-                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
+                    do_code_enrichment=self.pipeline_options.do_code_enrichment,
+                    do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
                 ),
-                accelerator_options=pipeline_options.accelerator_options,
+                accelerator_options=self.pipeline_options.accelerator_options,
             ),
             *self.enrichment_pipe,
         ]

-        if (
-            self.pipeline_options.do_formula_enrichment
-            or self.pipeline_options.do_code_enrichment
-            or self.pipeline_options.do_picture_classification
-            or self.pipeline_options.do_picture_description
-        ):
-            self.keep_backend = True
-
-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        warnings.warn(
-            "The usage of StandardPdfPipeline.download_models_hf() is deprecated "
-            "use instead the utility `docling-tools models download`, or "
-            "the upstream method docling.utils.models_downloader.download_all()",
-            DeprecationWarning,
-            stacklevel=3,
+        self.keep_backend = any(
+            (
+                self.pipeline_options.do_formula_enrichment,
+                self.pipeline_options.do_code_enrichment,
+                self.pipeline_options.do_picture_classification,
+                self.pipeline_options.do_picture_description,
+            )
         )

-        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
-        return output_dir
-
-    def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
+    # ---------------------------------------------------------------- helpers
+    def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
         factory = get_ocr_factory(
             allow_external_plugins=self.pipeline_options.allow_external_plugins
         )
         return factory.create_instance(
             options=self.pipeline_options.ocr_options,
             enabled=self.pipeline_options.do_ocr,
-            artifacts_path=artifacts_path,
+            artifacts_path=art_path,
             accelerator_options=self.pipeline_options.accelerator_options,
         )

-    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
-        with TimeRecorder(conv_res, "page_init"):
-            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
-            if page._backend is not None and page._backend.is_valid():
-                page.size = page._backend.get_size()
+    def _release_page_resources(self, item: ThreadedItem) -> None:
+        page = item.payload
+        if page is None:
+            return
+        if not self.keep_images:
+            page._image_cache = {}
+        if not self.keep_backend and page._backend is not None:
+            page._backend.unload()
+            page._backend = None
+        if not self.pipeline_options.generate_parsed_pages:
+            page.parsed_page = None

-        return page
+    # ────────────────────────────────────────────────────────────────────────
+    # Build - thread pipeline
+    # ────────────────────────────────────────────────────────────────────────

-    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
-        all_elements = []
-        all_headers = []
-        all_body = []
+    def _create_run_ctx(self) -> RunContext:
+        opts = self.pipeline_options
+        preprocess = PreprocessThreadedStage(
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            model=self.preprocessing_model,
+        )
+        ocr = ThreadedPipelineStage(
+            name="ocr",
+            model=self.ocr_model,
+            batch_size=opts.ocr_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+        layout = ThreadedPipelineStage(
+            name="layout",
+            model=self.layout_model,
+            batch_size=opts.layout_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+        table = ThreadedPipelineStage(
+            name="table",
+            model=self.table_model,
+            batch_size=opts.table_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+        assemble = ThreadedPipelineStage(
+            name="assemble",
+            model=self.assemble_model,
+            batch_size=1,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            postprocess=self._release_page_resources,
+        )

-        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+        # wire stages
+        output_q = ThreadedQueue(opts.queue_max_size)
+        preprocess.add_output_queue(ocr.input_queue)
+        ocr.add_output_queue(layout.input_queue)
+        layout.add_output_queue(table.input_queue)
+        table.add_output_queue(assemble.input_queue)
+        assemble.add_output_queue(output_q)
+
+        stages = [preprocess, ocr, layout, table, assemble]
+        return RunContext(stages=stages, first_stage=preprocess, output_queue=output_q)
+
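
`_create_run_ctx` is what delivers the per-run isolation promised in the module docstring: every call builds five fresh stages (preprocess → ocr → layout → table → assemble) with their own queues, so two concurrent runs never share a buffer. The wiring is a plain linear chain; the helper below restates it generically (a sketch for illustration, not part of the pipeline API):

    def chain_stages(stages: List[ThreadedPipelineStage], sink: ThreadedQueue) -> None:
        # Connect each stage's output to the next stage's input queue,
        # then route the final stage into the run's output queue.
        for upstream, downstream in zip(stages, stages[1:]):
            upstream.add_output_queue(downstream.input_queue)
        stages[-1].add_output_queue(sink)

    # chain_stages([preprocess, ocr, layout, table, assemble], output_q)
    # reproduces the five add_output_queue() calls above.
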
+    # --------------------------------------------------------------------- build
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        """Stream-build the document while interleaving producer and consumer work."""
+        run_id = next(self._run_seq)
+        assert isinstance(conv_res.input._backend, PdfDocumentBackend)
+
+        # Collect page placeholders; backends are loaded lazily in the preprocess stage
+        start_page, end_page = conv_res.input.limits.page_range
+        pages: list[Page] = []
+        for i in range(conv_res.input.page_count):
+            if start_page - 1 <= i <= end_page - 1:
+                page = Page(page_no=i)
+                conv_res.pages.append(page)
+                pages.append(page)
+
+        if not pages:
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        total_pages: int = len(pages)
+        ctx: RunContext = self._create_run_ctx()
+        for st in ctx.stages:
+            st.start()
+
+        proc = ProcessingResult(total_expected=total_pages)
+        fed_idx: int = 0  # number of pages successfully queued
+        batch_size: int = 32  # drain chunk
+        try:
+            while proc.success_count + proc.failure_count < total_pages:
+                # 1) feed - try to enqueue until the first queue is full
+                while fed_idx < total_pages:
+                    ok = ctx.first_stage.input_queue.put(
+                        ThreadedItem(
+                            payload=pages[fed_idx],
+                            run_id=run_id,
+                            page_no=pages[fed_idx].page_no,
+                            conv_res=conv_res,
+                        ),
+                        timeout=0.0,  # non-blocking try-put
+                    )
+                    if ok:
+                        fed_idx += 1
+                        if fed_idx == total_pages:
+                            ctx.first_stage.input_queue.close()
+                    else:  # queue full - switch to draining
+                        break
+
+                # 2) drain - pull whatever is ready from the output side
+                out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
+                for itm in out_batch:
+                    if itm.run_id != run_id:
+                        continue
+                    if itm.is_failed or itm.error:
+                        proc.failed_pages.append(
+                            (itm.page_no, itm.error or RuntimeError("unknown error"))
+                        )
+                    else:
+                        assert itm.payload is not None
+                        proc.pages.append(itm.payload)
+
+                # 3) failure safety - downstream closed early -> mark missing pages failed
+                if not out_batch and ctx.output_queue.closed:
+                    missing = total_pages - (proc.success_count + proc.failure_count)
+                    if missing > 0:
+                        proc.failed_pages.extend(
+                            [(-1, RuntimeError("pipeline terminated early"))] * missing
+                        )
+                    break
+        finally:
+            for st in ctx.stages:
+                st.stop()
+            ctx.output_queue.close()
+
+        self._integrate_results(conv_res, proc)
+        return conv_res
+
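
The loop above alternates between a non-blocking try-put (`timeout=0.0`) that feeds pages until the first queue exerts back-pressure, and a drain of the output queue; closing the first queue after the last page lets the close ripple through every stage. The bookkeeping lands in `ProcessingResult`, whose status arithmetic works as in this small sketch (assuming only the dataclass defined earlier in this diff):

    proc = ProcessingResult(total_expected=3)
    proc.pages.append(Page(page_no=0))  # one page succeeded
    proc.failed_pages.append((1, RuntimeError("layout failed")))
    proc.failed_pages.append((2, RuntimeError("table failed")))
    assert proc.is_partial_success       # 0 < 1 success < 3 expected
    assert not proc.is_complete_failure  # at least one page made it through
    # _integrate_results() below maps this to ConversionStatus.PARTIAL_SUCCESS.
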
+    # ---------------------------------------------------- integrate_results()
+    def _integrate_results(
+        self, conv_res: ConversionResult, proc: ProcessingResult
+    ) -> None:
+        page_map = {p.page_no: p for p in proc.pages}
+        conv_res.pages = [
+            page_map.get(p.page_no, p)
+            for p in conv_res.pages
+            if p.page_no in page_map
+            or not any(fp == p.page_no for fp, _ in proc.failed_pages)
+        ]
+        if proc.is_complete_failure:
+            conv_res.status = ConversionStatus.FAILURE
+        elif proc.is_partial_success:
+            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+        else:
+            conv_res.status = ConversionStatus.SUCCESS
+        if not self.keep_images:
             for p in conv_res.pages:
-                if p.assembled is not None:
-                    for el in p.assembled.body:
-                        all_body.append(el)
-                    for el in p.assembled.headers:
-                        all_headers.append(el)
-                    for el in p.assembled.elements:
-                        all_elements.append(el)
+                p._image_cache = {}
+        for p in conv_res.pages:
+            if not self.keep_backend and p._backend is not None:
+                p._backend.unload()
+            if not self.pipeline_options.generate_parsed_pages:
+                del p.parsed_page
+                p.parsed_page = None

+    # ---------------------------------------------------------------- assemble
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        elements, headers, body = [], [], []
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+            for p in conv_res.pages:
+                if p.assembled:
+                    elements.extend(p.assembled.elements)
+                    headers.extend(p.assembled.headers)
+                    body.extend(p.assembled.body)
             conv_res.assembled = AssembledUnit(
-                elements=all_elements, headers=all_headers, body=all_body
+                elements=elements, headers=headers, body=body
             )
-
             conv_res.document = self.reading_order_model(conv_res)

         # Generate page images in the output
@@ -233,10 +709,21 @@ class StandardPdfPipeline(PaginatedPipeline):

         return conv_res

+    # ---------------------------------------------------------------- misc
     @classmethod
-    def get_default_options(cls) -> PdfPipelineOptions:
-        return PdfPipelineOptions()
+    def get_default_options(cls) -> ThreadedPdfPipelineOptions:
+        return ThreadedPdfPipelineOptions()

     @classmethod
-    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
         return isinstance(backend, PdfDocumentBackend)
+
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        return conv_res.status
+
+    def _unload(self, conv_res: ConversionResult) -> None:
+        for p in conv_res.pages:
+            if p._backend is not None:
+                p._backend.unload()
+        if conv_res.input._backend:
+            conv_res.input._backend.unload()
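
Because each `_build_document` call draws a fresh run id and its own `RunContext`, the module docstring's isolation claim means overlapping conversions never share queues or worker threads. A closing sketch of what that enables at the API level, reusing the hedged `converter` from the example after the first hunk (thread-safety of the surrounding docling components is an assumption here, not something this diff shows):

    from concurrent.futures import ThreadPoolExecutor

    pdfs = ["a.pdf", "b.pdf"]  # hypothetical inputs
    with ThreadPoolExecutor(max_workers=2) as pool:
        results = list(pool.map(converter.convert, pdfs))
    for res in results:
        print(res.status)  # SUCCESS, PARTIAL_SUCCESS, or FAILURE per run
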