docling-2.69.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
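
The bulk of this release's new surface area is the threaded standard PDF pipeline whose diff follows (entry 115 above). As a minimal usage sketch — assuming the wheel keeps docling's usual DocumentConverter and PdfFormatOption entry points, and using only option names that appear verbatim in the diff below:

    # Illustrative sketch only; the converter wiring is assumed, while the
    # option fields (ocr_batch_size, layout_batch_size, table_batch_size,
    # queue_max_size, document_timeout) are taken from the diff below.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    opts = ThreadedPdfPipelineOptions(
        ocr_batch_size=4,        # pages handed to the OCR stage per batch
        layout_batch_size=4,     # pages per layout-model batch
        table_batch_size=4,      # pages per table-structure batch
        queue_max_size=100,      # bound on the inter-stage queues (back-pressure)
        document_timeout=300.0,  # seconds; a timed-out run ends as PARTIAL_SUCCESS
    )
    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
    )
    result = converter.convert("report.pdf")  # hypothetical input file
    print(result.status)
    print(result.document.export_to_markdown()[:500])
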
docling/pipeline/standard_pdf_pipeline.py
@@ -0,0 +1,859 @@
+"""Thread-safe, production-ready PDF pipeline
+================================================
+A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
+
+* **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
+  threads so that concurrent invocations never share mutable state.
+* **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
+  relying on :pyfunc:`id`, which may clash after garbage collection.
+* **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
+  propagates downstream so stages terminate deterministically without sentinels.
+* **Minimal shared state** - heavyweight models are initialised once per pipeline instance
+  and only read by worker threads; no runtime mutability is exposed.
+* **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
+"""
+
+from __future__ import annotations
+
+import itertools
+import logging
+import threading
+import time
+import warnings
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast
+
+import numpy as np
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import (
+    AssembledUnit,
+    ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
+    Page,
+)
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
+from docling.datamodel.settings import settings
+from docling.models.factories import (
+    get_layout_factory,
+    get_ocr_factory,
+    get_table_structure_factory,
+)
+from docling.models.stages.code_formula.code_formula_model import (
+    CodeFormulaModel,
+    CodeFormulaModelOptions,
+)
+from docling.models.stages.page_assemble.page_assemble_model import (
+    PageAssembleModel,
+    PageAssembleOptions,
+)
+from docling.models.stages.page_preprocessing.page_preprocessing_model import (
+    PagePreprocessingModel,
+    PagePreprocessingOptions,
+)
+from docling.models.stages.reading_order.readingorder_model import (
+    ReadingOrderModel,
+    ReadingOrderOptions,
+)
+from docling.pipeline.base_pipeline import ConvertPipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docling.utils.utils import chunkify
+
+_log = logging.getLogger(__name__)
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Helper data structures
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+@dataclass
+class ThreadedItem:
+    """Envelope that travels between pipeline stages."""
+
+    payload: Optional[Page]
+    run_id: int  # Unique per *execute* call, monotonic across pipeline instance
+    page_no: int
+    conv_res: ConversionResult
+    error: Optional[Exception] = None
+    is_failed: bool = False
+
+
+@dataclass
+class ProcessingResult:
+    """Aggregated outcome of a pipeline run."""
+
+    pages: List[Page] = field(default_factory=list)
+    failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
+    total_expected: int = 0
+
+    @property
+    def success_count(self) -> int:
+        return len(self.pages)
+
+    @property
+    def failure_count(self) -> int:
+        return len(self.failed_pages)
+
+    @property
+    def is_partial_success(self) -> bool:
+        return 0 < self.success_count < self.total_expected
+
+    @property
+    def is_complete_failure(self) -> bool:
+        return self.success_count == 0 and self.failure_count > 0
+
+
+class ThreadedQueue:
+    """Bounded queue with blocking put/get_batch and explicit *close()* semantics."""
+
+    __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
+
+    def __init__(self, max_size: int) -> None:
+        self._max: int = max_size
+        self._items: deque[ThreadedItem] = deque()
+        self._lock = threading.Lock()
+        self._not_full = threading.Condition(self._lock)
+        self._not_empty = threading.Condition(self._lock)
+        self._closed = False
+
+    # ---------------------------------------------------------------- put()
+    def put(self, item: ThreadedItem, timeout: Optional[float] = None) -> bool:
+        """Block until the queue accepts *item* or is closed. Returns *False* if closed."""
+        with self._not_full:
+            if self._closed:
+                return False
+            start = time.monotonic()
+            while len(self._items) >= self._max and not self._closed:
+                if timeout is not None:
+                    remaining = timeout - (time.monotonic() - start)
+                    if remaining <= 0:
+                        return False
+                    self._not_full.wait(remaining)
+                else:
+                    self._not_full.wait()
+            if self._closed:
+                return False
+            self._items.append(item)
+            self._not_empty.notify()
+            return True
+
+    # ------------------------------------------------------------ get_batch()
+    def get_batch(
+        self, size: int, timeout: Optional[float] = None
+    ) -> List[ThreadedItem]:
+        """Return up to *size* items. Blocks until ≥1 item is present or the queue is closed/timed out."""
+        with self._not_empty:
+            start = time.monotonic()
+            while not self._items and not self._closed:
+                if timeout is not None:
+                    remaining = timeout - (time.monotonic() - start)
+                    if remaining <= 0:
+                        return []
+                    self._not_empty.wait(remaining)
+                else:
+                    self._not_empty.wait()
+            batch: List[ThreadedItem] = []
+            while self._items and len(batch) < size:
+                batch.append(self._items.popleft())
+            if batch:
+                self._not_full.notify_all()
+            return batch
+
+    # ---------------------------------------------------------------- close()
+    def close(self) -> None:
+        with self._lock:
+            self._closed = True
+            self._not_empty.notify_all()
+            self._not_full.notify_all()
+
+    # -------------------------------------------------------------- property
+    @property
+    def closed(self) -> bool:
+        return self._closed
+
+
+class ThreadedPipelineStage:
+    """A single pipeline stage backed by one worker thread."""
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        model: Any,
+        batch_size: int,
+        batch_timeout: float,
+        queue_max_size: int,
+        postprocess: Optional[Callable[[ThreadedItem], None]] = None,
+        timed_out_run_ids: Optional[set[int]] = None,
+    ) -> None:
+        self.name = name
+        self.model = model
+        self.batch_size = batch_size
+        self.batch_timeout = batch_timeout
+        self.input_queue = ThreadedQueue(queue_max_size)
+        self._outputs: list[ThreadedQueue] = []
+        self._thread: Optional[threading.Thread] = None
+        self._running = False
+        self._postprocess = postprocess
+        self._timed_out_run_ids = (
+            timed_out_run_ids if timed_out_run_ids is not None else set()
+        )
+
+    # ---------------------------------------------------------------- wiring
+    def add_output_queue(self, q: ThreadedQueue) -> None:
+        self._outputs.append(q)
+
+    # -------------------------------------------------------------- lifecycle
+    def start(self) -> None:
+        if self._running:
+            return
+        self._running = True
+        self._thread = threading.Thread(
+            target=self._run, name=f"Stage-{self.name}", daemon=False
+        )
+        self._thread.start()
+
+    def stop(self) -> None:
+        if not self._running:
+            return
+        self._running = False
+        self.input_queue.close()
+        if self._thread is not None:
+            # Give the thread 15s to finish naturally before abandoning it
+            self._thread.join(timeout=15.0)
+            if self._thread.is_alive():
+                _log.warning(
+                    "Stage %s thread did not terminate within 15s. "
+                    "Thread is likely stuck in a blocking call and will be abandoned (resources may leak).",
+                    self.name,
+                )
+
+    # ------------------------------------------------------------------ _run
+    def _run(self) -> None:
+        try:
+            while self._running:
+                batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
+                if not batch and self.input_queue.closed:
+                    break
+                processed = self._process_batch(batch)
+                self._emit(processed)
+        except Exception:  # pragma: no cover - top-level guard
+            _log.exception("Fatal error in stage %s", self.name)
+        finally:
+            for q in self._outputs:
+                q.close()
+
+    # ----------------------------------------------------- _process_batch()
+    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+        """Run *model* on *batch* grouped by run_id to maximise batching."""
+        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+        for itm in batch:
+            groups[itm.run_id].append(itm)
+
+        result: list[ThreadedItem] = []
+        for rid, items in groups.items():
+            # If run_id is timed out, skip processing but pass through items as-is.
+            # This allows already-completed work to flow through while aborting new work.
+            if rid in self._timed_out_run_ids:
+                for it in items:
+                    it.is_failed = True
+                    if it.error is None:
+                        it.error = RuntimeError("document timeout exceeded")
+                result.extend(items)
+                continue
+
+            good: list[ThreadedItem] = [i for i in items if not i.is_failed]
+            if not good:
+                result.extend(items)
+                continue
+            try:
+                # Filter out None payloads and ensure type safety
+                pages_with_payloads = [
+                    (i, i.payload) for i in good if i.payload is not None
+                ]
+                if len(pages_with_payloads) != len(good):
+                    # Some items have None payloads, mark all as failed
+                    for it in items:
+                        it.is_failed = True
+                        it.error = RuntimeError("Page payload is None")
+                    result.extend(items)
+                    continue
+
+                pages: List[Page] = [payload for _, payload in pages_with_payloads]
+                processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
+                if len(processed_pages) != len(pages):  # strict mismatch guard
+                    raise RuntimeError(
+                        f"Model {self.name} returned wrong number of pages"
+                    )
+                for idx, page in enumerate(processed_pages):
+                    result.append(
+                        ThreadedItem(
+                            payload=page,
+                            run_id=rid,
+                            page_no=good[idx].page_no,
+                            conv_res=good[idx].conv_res,
+                        )
+                    )
+            except Exception as exc:
+                _log.error(
+                    "Stage %s failed for run %d: %s", self.name, rid, exc, exc_info=True
+                )
+                for it in items:
+                    it.is_failed = True
+                    it.error = exc
+                result.extend(items)
+        return result
+
+    # -------------------------------------------------------------- _emit()
+    def _emit(self, items: Iterable[ThreadedItem]) -> None:
+        for item in items:
+            if self._postprocess is not None:
+                self._postprocess(item)
+            for q in self._outputs:
+                if not q.put(item):
+                    _log.error("Output queue closed while emitting from %s", self.name)
+
+
+class PreprocessThreadedStage(ThreadedPipelineStage):
+    """Pipeline stage that lazily loads PDF backends just-in-time."""
+
+    def __init__(
+        self,
+        *,
+        batch_timeout: float,
+        queue_max_size: int,
+        model: Any,
+        timed_out_run_ids: Optional[set[int]] = None,
+    ) -> None:
+        super().__init__(
+            name="preprocess",
+            model=model,
+            batch_size=1,
+            batch_timeout=batch_timeout,
+            queue_max_size=queue_max_size,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+
+    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+        for itm in batch:
+            groups[itm.run_id].append(itm)
+
+        result: list[ThreadedItem] = []
+        for rid, items in groups.items():
+            # If run_id is timed out, skip processing but pass through items as-is.
+            # This allows already-completed work to flow through while aborting new work.
+            if rid in self._timed_out_run_ids:
+                for it in items:
+                    it.is_failed = True
+                    if it.error is None:
+                        it.error = RuntimeError("document timeout exceeded")
+                result.extend(items)
+                continue
+
+            good = [i for i in items if not i.is_failed]
+            if not good:
+                result.extend(items)
+                continue
+            try:
+                pages_with_payloads: list[tuple[ThreadedItem, Page]] = []
+                for it in good:
+                    page = it.payload
+                    if page is None:
+                        raise RuntimeError("Page payload is None")
+                    if page._backend is None:
+                        backend = it.conv_res.input._backend
+                        assert isinstance(backend, PdfDocumentBackend), (
+                            "Threaded pipeline only supports PdfDocumentBackend."
+                        )
+                        page_backend = backend.load_page(page.page_no - 1)
+                        page._backend = page_backend
+                        if page_backend.is_valid():
+                            page.size = page_backend.get_size()
+                    pages_with_payloads.append((it, page))
+
+                pages = [payload for _, payload in pages_with_payloads]
+                processed_pages = list(
+                    self.model(good[0].conv_res, pages)  # type: ignore[arg-type]
+                )
+                if len(processed_pages) != len(pages):
+                    raise RuntimeError(
+                        "PagePreprocessingModel returned unexpected number of pages"
+                    )
+                for idx, processed_page in enumerate(processed_pages):
+                    result.append(
+                        ThreadedItem(
+                            payload=processed_page,
+                            run_id=rid,
+                            page_no=good[idx].page_no,
+                            conv_res=good[idx].conv_res,
+                        )
+                    )
+            except Exception as exc:
+                page_numbers = [it.page_no for it in good]
+                _log.error(
+                    "Stage preprocess failed for run %d, pages %s: %s",
+                    rid,
+                    page_numbers,
+                    exc,
+                    exc_info=False,  # Set to True if you want detailed exception info
+                )
+                for it in good:
+                    it.is_failed = True
+                    it.error = exc
+                result.extend(items)
+        return result
+
+
+@dataclass
+class RunContext:
+    """Wiring for a single *execute* call."""
+
+    stages: list[ThreadedPipelineStage]
+    first_stage: ThreadedPipelineStage
+    output_queue: ThreadedQueue
+    timed_out_run_ids: set[int] = field(default_factory=set)
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Main pipeline
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+class StandardPdfPipeline(ConvertPipeline):
+    """High-performance PDF pipeline with multi-threaded stages."""
+
+    def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
+        super().__init__(pipeline_options)
+        self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
+        self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
+
+        # initialise heavy models once
+        self._init_models()
+
+    # ────────────────────────────────────────────────────────────────────────
+    # Heavy-model initialisation & helpers
+    # ────────────────────────────────────────────────────────────────────────
+
+    def _init_models(self) -> None:
+        art_path = self.artifacts_path
+        self.keep_images = (
+            self.pipeline_options.generate_page_images
+            or self.pipeline_options.generate_picture_images
+            or self.pipeline_options.generate_table_images
+        )
+        self.preprocessing_model = PagePreprocessingModel(
+            options=PagePreprocessingOptions(
+                images_scale=self.pipeline_options.images_scale
+            )
+        )
+        self.ocr_model = self._make_ocr_model(art_path)
+        layout_factory = get_layout_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        self.layout_model = layout_factory.create_instance(
+            options=self.pipeline_options.layout_options,
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+        table_factory = get_table_structure_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        self.table_model = table_factory.create_instance(
+            options=self.pipeline_options.table_structure_options,
+            enabled=self.pipeline_options.do_table_structure,
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+        self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
+        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+
+        # --- optional enrichment ------------------------------------------------
+        self.enrichment_pipe = [
+            # Code Formula Enrichment Model
+            CodeFormulaModel(
+                enabled=self.pipeline_options.do_code_enrichment
+                or self.pipeline_options.do_formula_enrichment,
+                artifacts_path=self.artifacts_path,
+                options=CodeFormulaModelOptions(
+                    do_code_enrichment=self.pipeline_options.do_code_enrichment,
+                    do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
+                ),
+                accelerator_options=self.pipeline_options.accelerator_options,
+            ),
+            *self.enrichment_pipe,
+        ]
+
+        self.keep_backend = any(
+            (
+                self.pipeline_options.do_formula_enrichment,
+                self.pipeline_options.do_code_enrichment,
+                self.pipeline_options.do_picture_classification,
+                self.pipeline_options.do_picture_description,
+            )
+        )
+
+    # ---------------------------------------------------------------- helpers
+    def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
+        factory = get_ocr_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.ocr_options,
+            enabled=self.pipeline_options.do_ocr,
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
+    def _release_page_resources(self, item: ThreadedItem) -> None:
+        page = item.payload
+        if page is None:
+            return
+        if not self.keep_images:
+            page._image_cache = {}
+        if not self.keep_backend and page._backend is not None:
+            page._backend.unload()
+            page._backend = None
+        if not self.pipeline_options.generate_parsed_pages:
+            page.parsed_page = None
+
+    # ────────────────────────────────────────────────────────────────────────
+    # Build - thread pipeline
+    # ────────────────────────────────────────────────────────────────────────
+
+    def _create_run_ctx(self) -> RunContext:
+        opts = self.pipeline_options
+        timed_out_run_ids: set[int] = set()
+        preprocess = PreprocessThreadedStage(
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            model=self.preprocessing_model,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+        ocr = ThreadedPipelineStage(
+            name="ocr",
+            model=self.ocr_model,
+            batch_size=opts.ocr_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+        layout = ThreadedPipelineStage(
+            name="layout",
+            model=self.layout_model,
+            batch_size=opts.layout_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+        table = ThreadedPipelineStage(
+            name="table",
+            model=self.table_model,
+            batch_size=opts.table_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+        assemble = ThreadedPipelineStage(
+            name="assemble",
+            model=self.assemble_model,
+            batch_size=1,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            postprocess=self._release_page_resources,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+
+        # wire stages
+        output_q = ThreadedQueue(opts.queue_max_size)
+        preprocess.add_output_queue(ocr.input_queue)
+        ocr.add_output_queue(layout.input_queue)
+        layout.add_output_queue(table.input_queue)
+        table.add_output_queue(assemble.input_queue)
+        assemble.add_output_queue(output_q)
+
+        stages = [preprocess, ocr, layout, table, assemble]
+        return RunContext(
+            stages=stages,
+            first_stage=preprocess,
+            output_queue=output_q,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+
+    # --------------------------------------------------------------------- build
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        """Stream-build the document while interleaving producer and consumer work.
+
+        Note: If a worker thread gets stuck in a blocking call (model inference or PDF backend
+        load_page/get_size), that thread will be abandoned after a brief wait (15s) during cleanup.
+        The thread continues running until the blocking call completes, potentially holding
+        resources (e.g., pypdfium2_lock).
+        """
+        run_id = next(self._run_seq)
+        assert isinstance(conv_res.input._backend, PdfDocumentBackend)
+
+        # Collect page placeholders; backends are loaded lazily in the preprocess stage
+        start_page, end_page = conv_res.input.limits.page_range
+        pages: list[Page] = []
+        for i in range(conv_res.input.page_count):
+            if start_page - 1 <= i <= end_page - 1:
+                page = Page(page_no=i + 1)
+                conv_res.pages.append(page)
+                pages.append(page)
+
+        if not pages:
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        total_pages: int = len(pages)
+        ctx: RunContext = self._create_run_ctx()
+        for st in ctx.stages:
+            st.start()
+
+        proc = ProcessingResult(total_expected=total_pages)
+        fed_idx: int = 0  # number of pages successfully queued
+        batch_size: int = 32  # drain chunk
+        start_time = time.monotonic()
+        timeout_exceeded = False
+        input_queue_closed = False
+        try:
+            while proc.success_count + proc.failure_count < total_pages:
+                # Check timeout
+                if (
+                    self.pipeline_options.document_timeout is not None
+                    and not timeout_exceeded
+                ):
+                    elapsed_time = time.monotonic() - start_time
+                    if elapsed_time > self.pipeline_options.document_timeout:
+                        _log.warning(
+                            f"Document processing time ({elapsed_time:.3f}s) "
+                            f"exceeded timeout of {self.pipeline_options.document_timeout:.3f}s"
+                        )
+                        timeout_exceeded = True
+                        ctx.timed_out_run_ids.add(run_id)
+                        if not input_queue_closed:
+                            ctx.first_stage.input_queue.close()
+                            input_queue_closed = True
+                        # Break immediately - don't wait for in-flight work
+                        break
+
+                # 1) feed - try to enqueue until the first queue is full
+                if not input_queue_closed:
+                    while fed_idx < total_pages:
+                        ok = ctx.first_stage.input_queue.put(
+                            ThreadedItem(
+                                payload=pages[fed_idx],
+                                run_id=run_id,
+                                page_no=pages[fed_idx].page_no,
+                                conv_res=conv_res,
+                            ),
+                            timeout=0.0,  # non-blocking try-put
+                        )
+                        if ok:
+                            fed_idx += 1
+                            if fed_idx == total_pages:
+                                ctx.first_stage.input_queue.close()
+                                input_queue_closed = True
+                        else:  # queue full - switch to draining
+                            break
+
+                # 2) drain - pull whatever is ready from the output side
+                out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
+                for itm in out_batch:
+                    if itm.run_id != run_id:
+                        continue
+                    if itm.is_failed or itm.error:
+                        proc.failed_pages.append(
+                            (itm.page_no, itm.error or RuntimeError("unknown error"))
+                        )
+                    else:
+                        assert itm.payload is not None
+                        proc.pages.append(itm.payload)
+
+                # 3) failure safety - downstream closed early
+                if not out_batch and ctx.output_queue.closed:
+                    missing = total_pages - (proc.success_count + proc.failure_count)
+                    if missing > 0:
+                        proc.failed_pages.extend(
+                            [(-1, RuntimeError("pipeline terminated early"))] * missing
+                        )
+                    break
+
+            # Mark remaining pages as failed if timeout occurred
+            if timeout_exceeded:
+                completed_page_nos = {p.page_no for p in proc.pages} | {
+                    fp for fp, _ in proc.failed_pages
+                }
+                for page in pages[fed_idx:]:
+                    if page.page_no not in completed_page_nos:
+                        proc.failed_pages.append(
+                            (page.page_no, RuntimeError("document timeout exceeded"))
+                        )
+        finally:
+            for st in ctx.stages:
+                st.stop()
+            ctx.output_queue.close()
+
+        self._integrate_results(conv_res, proc, timeout_exceeded=timeout_exceeded)
+        return conv_res
+
+    # ---------------------------------------------------- integrate_results()
+    def _integrate_results(
+        self,
+        conv_res: ConversionResult,
+        proc: ProcessingResult,
+        timeout_exceeded: bool = False,
+    ) -> None:
+        page_map = {p.page_no: p for p in proc.pages}
+        # Only keep pages that successfully completed processing
+        conv_res.pages = [
+            page_map[p.page_no] for p in conv_res.pages if p.page_no in page_map
+        ]
+        # Add error details from failed pages
+        for page_no, error in proc.failed_pages:
+            page_label = f"Page {page_no}" if page_no > 0 else "Unknown page"
+            error_msg = str(error) if error else ""
+            error_item = ErrorItem(
+                component_type=DoclingComponentType.PIPELINE,
+                module_name=self.__class__.__name__,
+                error_message=f"{page_label}: {error_msg}" if error_msg else page_label,
+            )
+            conv_res.errors.append(error_item)
+        if timeout_exceeded and proc.total_expected > 0:
+            # Timeout exceeded: set PARTIAL_SUCCESS if any pages were attempted
+            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+        elif proc.is_complete_failure:
+            conv_res.status = ConversionStatus.FAILURE
+        elif proc.is_partial_success:
+            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+        else:
+            conv_res.status = ConversionStatus.SUCCESS
+        if not self.keep_images:
+            for p in conv_res.pages:
+                p._image_cache = {}
+        for p in conv_res.pages:
+            if not self.keep_backend and p._backend is not None:
+                p._backend.unload()
+            if not self.pipeline_options.generate_parsed_pages:
+                del p.parsed_page
+                p.parsed_page = None
+
+    # ---------------------------------------------------------------- assemble
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        elements, headers, body = [], [], []
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+            for p in conv_res.pages:
+                if p.assembled:
+                    elements.extend(p.assembled.elements)
+                    headers.extend(p.assembled.headers)
+                    body.extend(p.assembled.body)
+            conv_res.assembled = AssembledUnit(
+                elements=elements, headers=headers, body=body
+            )
+            conv_res.document = self.reading_order_model(conv_res)
+
+            # Generate page images in the output
+            if self.pipeline_options.generate_page_images:
+                for page in conv_res.pages:
+                    assert page.image is not None
+                    page_no = page.page_no
+                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
+                        page.image, dpi=int(72 * self.pipeline_options.images_scale)
+                    )
+
+            # Generate images of the requested element types
+            with warnings.catch_warnings():  # deprecated generate_table_images
+                warnings.filterwarnings("ignore", category=DeprecationWarning)
+                if (
+                    self.pipeline_options.generate_picture_images
+                    or self.pipeline_options.generate_table_images
+                ):
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                            continue
+                        if (
+                            isinstance(element, PictureItem)
+                            and self.pipeline_options.generate_picture_images
+                        ) or (
+                            isinstance(element, TableItem)
+                            and self.pipeline_options.generate_table_images
+                        ):
+                            page_ix = element.prov[0].page_no
+                            page = next(
+                                (p for p in conv_res.pages if p.page_no == page_ix),
+                                cast("Page", None),
+                            )
+                            assert page is not None
+                            assert page.size is not None
+                            assert page.image is not None
+
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+
+                            cropped_im = page.image.crop(crop_bbox.as_tuple())
+                            element.image = ImageRef.from_pil(
+                                cropped_im, dpi=int(72 * scale)
+                            )
+
+            # Aggregate confidence values for document:
+            if len(conv_res.pages) > 0:
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        category=RuntimeWarning,
+                        message="Mean of empty slice|All-NaN slice encountered",
+                    )
+                    conv_res.confidence.layout_score = float(
+                        np.nanmean(
+                            [c.layout_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.parse_score = float(
+                        np.nanquantile(
+                            [c.parse_score for c in conv_res.confidence.pages.values()],
+                            q=0.1,  # parse score should relate to worst 10% of pages.
+                        )
+                    )
+                    conv_res.confidence.table_score = float(
+                        np.nanmean(
+                            [c.table_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.ocr_score = float(
+                        np.nanmean(
+                            [c.ocr_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+
+        return conv_res
+
+    # ---------------------------------------------------------------- misc
+    @classmethod
+    def get_default_options(cls) -> ThreadedPdfPipelineOptions:
+        return ThreadedPdfPipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
+        return isinstance(backend, PdfDocumentBackend)
+
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        return conv_res.status
+
+    def _unload(self, conv_res: ConversionResult) -> None:
+        for p in conv_res.pages:
+            if p._backend is not None:
+                p._backend.unload()
+        if conv_res.input._backend:
+            conv_res.input._backend.unload()
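
A note on the concurrency primitive: the module docstring's promise of explicit back-pressure and sentinel-free shutdown rests on the ThreadedQueue close() semantics above. A self-contained toy version of the same pattern, with generic payloads instead of ThreadedItem (all names below are illustrative, not part of docling), behaves like this:

    # Toy re-implementation of the bounded close()-able queue pattern used by
    # ThreadedQueue; standalone and runnable, not the docling class itself.
    import threading
    from collections import deque
    from typing import Any, List


    class ClosableQueue:
        def __init__(self, max_size: int) -> None:
            self._items: deque[Any] = deque()
            self._max = max_size
            self._lock = threading.Lock()
            self._not_full = threading.Condition(self._lock)
            self._not_empty = threading.Condition(self._lock)
            self._closed = False

        def put(self, item: Any) -> bool:
            with self._not_full:
                while len(self._items) >= self._max and not self._closed:
                    self._not_full.wait()  # back-pressure: producer blocks here
                if self._closed:
                    return False  # a closed queue refuses new work
                self._items.append(item)
                self._not_empty.notify()
                return True

        def get_batch(self, size: int, timeout: float) -> List[Any]:
            with self._not_empty:
                if not self._items and not self._closed:
                    self._not_empty.wait(timeout)
                batch = [self._items.popleft() for _ in range(min(size, len(self._items)))]
                if batch:
                    self._not_full.notify_all()
                return batch

        def close(self) -> None:
            with self._lock:
                self._closed = True
                self._not_empty.notify_all()  # wake consumers so they can drain and exit
                self._not_full.notify_all()   # wake any producer blocked in put()

        @property
        def closed(self) -> bool:
            return self._closed


    q = ClosableQueue(max_size=2)

    def consume() -> None:
        while True:
            batch = q.get_batch(8, timeout=0.05)
            if not batch and q.closed:
                break  # no sentinel needed: close() terminates the stage
            print("consumed", batch)

    worker = threading.Thread(target=consume)
    worker.start()
    for i in range(5):
        q.put(i)  # blocks whenever the consumer lags two items behind
    q.close()
    worker.join()

The shipped class layers run-id bookkeeping, per-run timeout sets, and batch timeouts on top of this same primitive.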