docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
docling/pipeline/standard_pdf_pipeline.py
@@ -0,0 +1,859 @@
+"""Thread-safe, production-ready PDF pipeline
+================================================
+A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
+
+* **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
+  threads so that concurrent invocations never share mutable state.
+* **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
+  relying on :py:func:`id`, which may clash after garbage collection.
+* **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
+  propagates downstream so stages terminate deterministically without sentinels.
+* **Minimal shared state** - heavyweight models are initialised once per pipeline instance
+  and only read by worker threads; no runtime mutability is exposed.
+* **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
+"""
+
+from __future__ import annotations
+
+import itertools
+import logging
+import threading
+import time
+import warnings
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast
+
+import numpy as np
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import (
+    AssembledUnit,
+    ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
+    Page,
+)
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
+from docling.datamodel.settings import settings
+from docling.models.factories import (
+    get_layout_factory,
+    get_ocr_factory,
+    get_table_structure_factory,
+)
+from docling.models.stages.code_formula.code_formula_model import (
+    CodeFormulaModel,
+    CodeFormulaModelOptions,
+)
+from docling.models.stages.page_assemble.page_assemble_model import (
+    PageAssembleModel,
+    PageAssembleOptions,
+)
+from docling.models.stages.page_preprocessing.page_preprocessing_model import (
+    PagePreprocessingModel,
+    PagePreprocessingOptions,
+)
+from docling.models.stages.reading_order.readingorder_model import (
+    ReadingOrderModel,
+    ReadingOrderOptions,
+)
+from docling.pipeline.base_pipeline import ConvertPipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docling.utils.utils import chunkify
+
+_log = logging.getLogger(__name__)
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Helper data structures
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+@dataclass
+class ThreadedItem:
+    """Envelope that travels between pipeline stages."""
+
+    payload: Optional[Page]
+    run_id: int  # Unique per *execute* call, monotonic across pipeline instance
+    page_no: int
+    conv_res: ConversionResult
+    error: Optional[Exception] = None
+    is_failed: bool = False
+
+
+@dataclass
+class ProcessingResult:
+    """Aggregated outcome of a pipeline run."""
+
+    pages: List[Page] = field(default_factory=list)
+    failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
+    total_expected: int = 0
+
+    @property
+    def success_count(self) -> int:
+        return len(self.pages)
+
+    @property
+    def failure_count(self) -> int:
+        return len(self.failed_pages)
+
+    @property
+    def is_partial_success(self) -> bool:
+        return 0 < self.success_count < self.total_expected
+
+    @property
+    def is_complete_failure(self) -> bool:
+        return self.success_count == 0 and self.failure_count > 0
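
To make the two terminal predicates concrete, here is a minimal sketch that exercises only the dataclass above; the page payload is stubbed with `None`, which the counters never inspect:

    proc = ProcessingResult(total_expected=3)
    proc.pages.append(None)                             # stand-in for one converted Page
    proc.failed_pages.append((2, RuntimeError("ocr")))  # one page failed
    assert proc.success_count == 1 and proc.failure_count == 1
    assert proc.is_partial_success        # 0 < 1 < 3
    assert not proc.is_complete_failure   # at least one page succeeded
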
+
+
+class ThreadedQueue:
+    """Bounded queue with blocking put/get_batch and explicit *close()* semantics."""
+
+    __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
+
+    def __init__(self, max_size: int) -> None:
+        self._max: int = max_size
+        self._items: deque[ThreadedItem] = deque()
+        self._lock = threading.Lock()
+        self._not_full = threading.Condition(self._lock)
+        self._not_empty = threading.Condition(self._lock)
+        self._closed = False
+
+    # ---------------------------------------------------------------- put()
+    def put(self, item: ThreadedItem, timeout: Optional[float] = None) -> bool:
+        """Block until queue accepts *item* or is closed. Returns *False* if closed."""
+        with self._not_full:
+            if self._closed:
+                return False
+            start = time.monotonic()
+            while len(self._items) >= self._max and not self._closed:
+                if timeout is not None:
+                    remaining = timeout - (time.monotonic() - start)
+                    if remaining <= 0:
+                        return False
+                    self._not_full.wait(remaining)
+                else:
+                    self._not_full.wait()
+            if self._closed:
+                return False
+            self._items.append(item)
+            self._not_empty.notify()
+            return True
+
+    # ------------------------------------------------------------ get_batch()
+    def get_batch(
+        self, size: int, timeout: Optional[float] = None
+    ) -> List[ThreadedItem]:
+        """Return up to *size* items. Blocks until ≥1 item present or queue closed/timeout."""
+        with self._not_empty:
+            start = time.monotonic()
+            while not self._items and not self._closed:
+                if timeout is not None:
+                    remaining = timeout - (time.monotonic() - start)
+                    if remaining <= 0:
+                        return []
+                    self._not_empty.wait(remaining)
+                else:
+                    self._not_empty.wait()
+            batch: List[ThreadedItem] = []
+            while self._items and len(batch) < size:
+                batch.append(self._items.popleft())
+            if batch:
+                self._not_full.notify_all()
+            return batch
+
+    # ---------------------------------------------------------------- close()
+    def close(self) -> None:
+        with self._lock:
+            self._closed = True
+            self._not_empty.notify_all()
+            self._not_full.notify_all()
+
+    # -------------------------------------------------------------- property
+    @property
+    def closed(self) -> bool:
+        return self._closed
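
The contract above is easiest to see end to end. A self-contained sketch, assuming the `ThreadedItem` and `ThreadedQueue` definitions from this module are in scope; `conv_res` is stubbed with `None`, which dataclasses do not validate at runtime:

    import threading

    q = ThreadedQueue(max_size=2)
    received: list = []

    def consume() -> None:
        while True:
            batch = q.get_batch(size=8, timeout=0.1)
            received.extend(batch)
            if not batch and q.closed:
                break  # queue closed and fully drained

    t = threading.Thread(target=consume)
    t.start()
    for i in range(5):  # put() blocks whenever two items are already in flight
        q.put(ThreadedItem(payload=None, run_id=1, page_no=i, conv_res=None))
    q.close()  # wakes the consumer; once drained, get_batch returns []
    t.join()
    assert len(received) == 5
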
+
+
+class ThreadedPipelineStage:
+    """A single pipeline stage backed by one worker thread."""
+
+    def __init__(
+        self,
+        *,
+        name: str,
+        model: Any,
+        batch_size: int,
+        batch_timeout: float,
+        queue_max_size: int,
+        postprocess: Optional[Callable[[ThreadedItem], None]] = None,
+        timed_out_run_ids: Optional[set[int]] = None,
+    ) -> None:
+        self.name = name
+        self.model = model
+        self.batch_size = batch_size
+        self.batch_timeout = batch_timeout
+        self.input_queue = ThreadedQueue(queue_max_size)
+        self._outputs: list[ThreadedQueue] = []
+        self._thread: Optional[threading.Thread] = None
+        self._running = False
+        self._postprocess = postprocess
+        self._timed_out_run_ids = (
+            timed_out_run_ids if timed_out_run_ids is not None else set()
+        )
+
+    # ---------------------------------------------------------------- wiring
+    def add_output_queue(self, q: ThreadedQueue) -> None:
+        self._outputs.append(q)
+
+    # -------------------------------------------------------------- lifecycle
+    def start(self) -> None:
+        if self._running:
+            return
+        self._running = True
+        self._thread = threading.Thread(
+            target=self._run, name=f"Stage-{self.name}", daemon=False
+        )
+        self._thread.start()
+
+    def stop(self) -> None:
+        if not self._running:
+            return
+        self._running = False
+        self.input_queue.close()
+        if self._thread is not None:
+            # Give the thread 15s to finish naturally before abandoning it
+            self._thread.join(timeout=15.0)
+            if self._thread.is_alive():
+                _log.warning(
+                    "Stage %s thread did not terminate within 15s. "
+                    "Thread is likely stuck in a blocking call and will be abandoned (resources may leak).",
+                    self.name,
+                )
+
+    # ------------------------------------------------------------------ _run
+    def _run(self) -> None:
+        try:
+            while self._running:
+                batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
+                if not batch and self.input_queue.closed:
+                    break
+                processed = self._process_batch(batch)
+                self._emit(processed)
+        except Exception:  # pragma: no cover - top-level guard
+            _log.exception("Fatal error in stage %s", self.name)
+        finally:
+            for q in self._outputs:
+                q.close()
+
+    # ----------------------------------------------------- _process_batch()
+    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+        """Run *model* on *batch* grouped by run_id to maximise batching."""
+        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+        for itm in batch:
+            groups[itm.run_id].append(itm)
+
+        result: list[ThreadedItem] = []
+        for rid, items in groups.items():
+            # If the run_id has timed out, skip processing but pass the items through as-is.
+            # This lets already-completed work flow through while aborting new work.
+            if rid in self._timed_out_run_ids:
+                for it in items:
+                    it.is_failed = True
+                    if it.error is None:
+                        it.error = RuntimeError("document timeout exceeded")
+                result.extend(items)
+                continue
+
+            good: list[ThreadedItem] = [i for i in items if not i.is_failed]
+            if not good:
+                result.extend(items)
+                continue
+            try:
+                # Filter out None payloads and ensure type safety
+                pages_with_payloads = [
+                    (i, i.payload) for i in good if i.payload is not None
+                ]
+                if len(pages_with_payloads) != len(good):
+                    # Some items have None payloads; mark all as failed
+                    for it in items:
+                        it.is_failed = True
+                        it.error = RuntimeError("Page payload is None")
+                    result.extend(items)
+                    continue
+
+                pages: List[Page] = [payload for _, payload in pages_with_payloads]
+                processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
+                if len(processed_pages) != len(pages):  # strict mismatch guard
+                    raise RuntimeError(
+                        f"Model {self.name} returned wrong number of pages"
+                    )
+                for idx, page in enumerate(processed_pages):
+                    result.append(
+                        ThreadedItem(
+                            payload=page,
+                            run_id=rid,
+                            page_no=good[idx].page_no,
+                            conv_res=good[idx].conv_res,
+                        )
+                    )
+            except Exception as exc:
+                _log.error(
+                    "Stage %s failed for run %d: %s", self.name, rid, exc, exc_info=True
+                )
+                for it in items:
+                    it.is_failed = True
+                    it.error = exc
+                result.extend(items)
+        return result
+
+    # -------------------------------------------------------------- _emit()
+    def _emit(self, items: Iterable[ThreadedItem]) -> None:
+        for item in items:
+            if self._postprocess is not None:
+                self._postprocess(item)
+            for q in self._outputs:
+                if not q.put(item):
+                    _log.error("Output queue closed while emitting from %s", self.name)
+
+
+class PreprocessThreadedStage(ThreadedPipelineStage):
+    """Pipeline stage that lazily loads PDF backends just-in-time."""
+
+    def __init__(
+        self,
+        *,
+        batch_timeout: float,
+        queue_max_size: int,
+        model: Any,
+        timed_out_run_ids: Optional[set[int]] = None,
+    ) -> None:
+        super().__init__(
+            name="preprocess",
+            model=model,
+            batch_size=1,
+            batch_timeout=batch_timeout,
+            queue_max_size=queue_max_size,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+
+    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+        for itm in batch:
+            groups[itm.run_id].append(itm)
+
+        result: list[ThreadedItem] = []
+        for rid, items in groups.items():
+            # If the run_id has timed out, skip processing but pass the items through as-is.
+            # This lets already-completed work flow through while aborting new work.
+            if rid in self._timed_out_run_ids:
+                for it in items:
+                    it.is_failed = True
+                    if it.error is None:
+                        it.error = RuntimeError("document timeout exceeded")
+                result.extend(items)
+                continue
+
+            good = [i for i in items if not i.is_failed]
+            if not good:
+                result.extend(items)
+                continue
+            try:
+                pages_with_payloads: list[tuple[ThreadedItem, Page]] = []
+                for it in good:
+                    page = it.payload
+                    if page is None:
+                        raise RuntimeError("Page payload is None")
+                    if page._backend is None:
+                        backend = it.conv_res.input._backend
+                        assert isinstance(backend, PdfDocumentBackend), (
+                            "Threaded pipeline only supports PdfDocumentBackend."
+                        )
+                        page_backend = backend.load_page(page.page_no - 1)
+                        page._backend = page_backend
+                        if page_backend.is_valid():
+                            page.size = page_backend.get_size()
+                    pages_with_payloads.append((it, page))
+
+                pages = [payload for _, payload in pages_with_payloads]
+                processed_pages = list(
+                    self.model(good[0].conv_res, pages)  # type: ignore[arg-type]
+                )
+                if len(processed_pages) != len(pages):
+                    raise RuntimeError(
+                        "PagePreprocessingModel returned unexpected number of pages"
+                    )
+                for idx, processed_page in enumerate(processed_pages):
+                    result.append(
+                        ThreadedItem(
+                            payload=processed_page,
+                            run_id=rid,
+                            page_no=good[idx].page_no,
+                            conv_res=good[idx].conv_res,
+                        )
+                    )
+            except Exception as exc:
+                page_numbers = [it.page_no for it in good]
+                _log.error(
+                    "Stage preprocess failed for run %d, pages %s: %s",
+                    rid,
+                    page_numbers,
+                    exc,
+                    exc_info=False,  # Set to True for detailed exception info
+                )
+                for it in good:
+                    it.is_failed = True
+                    it.error = exc
+                result.extend(items)
+        return result
+
+
+@dataclass
+class RunContext:
+    """Wiring for a single *execute* call."""
+
+    stages: list[ThreadedPipelineStage]
+    first_stage: ThreadedPipelineStage
+    output_queue: ThreadedQueue
+    timed_out_run_ids: set[int] = field(default_factory=set)
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Main pipeline
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+class StandardPdfPipeline(ConvertPipeline):
+    """High-performance PDF pipeline with multi-threaded stages."""
+
+    def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
+        super().__init__(pipeline_options)
+        self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
+        self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
+
+        # initialise heavy models once
+        self._init_models()
+
+    # ────────────────────────────────────────────────────────────────────────
+    # Heavy-model initialisation & helpers
+    # ────────────────────────────────────────────────────────────────────────
+
+    def _init_models(self) -> None:
+        art_path = self.artifacts_path
+        self.keep_images = (
+            self.pipeline_options.generate_page_images
+            or self.pipeline_options.generate_picture_images
+            or self.pipeline_options.generate_table_images
+        )
+        self.preprocessing_model = PagePreprocessingModel(
+            options=PagePreprocessingOptions(
+                images_scale=self.pipeline_options.images_scale
+            )
+        )
+        self.ocr_model = self._make_ocr_model(art_path)
+        layout_factory = get_layout_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        self.layout_model = layout_factory.create_instance(
+            options=self.pipeline_options.layout_options,
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+        table_factory = get_table_structure_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        self.table_model = table_factory.create_instance(
+            options=self.pipeline_options.table_structure_options,
+            enabled=self.pipeline_options.do_table_structure,
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+        self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
+        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+
+        # --- optional enrichment ------------------------------------------------
+        self.enrichment_pipe = [
+            # Code Formula Enrichment Model
+            CodeFormulaModel(
+                enabled=self.pipeline_options.do_code_enrichment
+                or self.pipeline_options.do_formula_enrichment,
+                artifacts_path=self.artifacts_path,
+                options=CodeFormulaModelOptions(
+                    do_code_enrichment=self.pipeline_options.do_code_enrichment,
+                    do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
+                ),
+                accelerator_options=self.pipeline_options.accelerator_options,
+            ),
+            *self.enrichment_pipe,
+        ]
+
+        self.keep_backend = any(
+            (
+                self.pipeline_options.do_formula_enrichment,
+                self.pipeline_options.do_code_enrichment,
+                self.pipeline_options.do_picture_classification,
+                self.pipeline_options.do_picture_description,
+            )
+        )
+
+    # ---------------------------------------------------------------- helpers
+    def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
+        factory = get_ocr_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.ocr_options,
+            enabled=self.pipeline_options.do_ocr,
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
+    def _release_page_resources(self, item: ThreadedItem) -> None:
+        page = item.payload
+        if page is None:
+            return
+        if not self.keep_images:
+            page._image_cache = {}
+        if not self.keep_backend and page._backend is not None:
+            page._backend.unload()
+            page._backend = None
+        if not self.pipeline_options.generate_parsed_pages:
+            page.parsed_page = None
+
+    # ────────────────────────────────────────────────────────────────────────
+    # Build - thread pipeline
+    # ────────────────────────────────────────────────────────────────────────
+
+    def _create_run_ctx(self) -> RunContext:
+        opts = self.pipeline_options
+        timed_out_run_ids: set[int] = set()
+        preprocess = PreprocessThreadedStage(
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            model=self.preprocessing_model,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+        ocr = ThreadedPipelineStage(
+            name="ocr",
+            model=self.ocr_model,
+            batch_size=opts.ocr_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+        layout = ThreadedPipelineStage(
+            name="layout",
+            model=self.layout_model,
+            batch_size=opts.layout_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+        table = ThreadedPipelineStage(
+            name="table",
+            model=self.table_model,
+            batch_size=opts.table_batch_size,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+        assemble = ThreadedPipelineStage(
+            name="assemble",
+            model=self.assemble_model,
+            batch_size=1,
+            batch_timeout=opts.batch_polling_interval_seconds,
+            queue_max_size=opts.queue_max_size,
+            postprocess=self._release_page_resources,
+            timed_out_run_ids=timed_out_run_ids,
+        )
+
+        # wire stages
+        output_q = ThreadedQueue(opts.queue_max_size)
+        preprocess.add_output_queue(ocr.input_queue)
+        ocr.add_output_queue(layout.input_queue)
+        layout.add_output_queue(table.input_queue)
+        table.add_output_queue(assemble.input_queue)
+        assemble.add_output_queue(output_q)
+
+        stages = [preprocess, ocr, layout, table, assemble]
+        return RunContext(
+            stages=stages,
+            first_stage=preprocess,
+            output_queue=output_q,
+            timed_out_run_ids=timed_out_run_ids,
+        )
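
For orientation, the dataflow assembled above, written out as a comment; every arrow is a bounded `ThreadedQueue`, so a slow stage back-pressures everything upstream:

    # pages -> preprocess(1) -> ocr(N) -> layout(N) -> table(N) -> assemble(1) -> output_q
    #
    # Batch sizes in parentheses: preprocess and assemble run page-at-a-time,
    # while ocr/layout/table batch up to their configured *_batch_size; the
    # assemble stage additionally frees page resources via its postprocess hook.
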
+
+    # --------------------------------------------------------------------- build
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        """Stream-build the document while interleaving producer and consumer work.
+
+        Note: If a worker thread gets stuck in a blocking call (model inference or PDF backend
+        load_page/get_size), that thread will be abandoned after a brief wait (15s) during cleanup.
+        The thread continues running until the blocking call completes, potentially holding
+        resources (e.g., pypdfium2_lock).
+        """
+        run_id = next(self._run_seq)
+        assert isinstance(conv_res.input._backend, PdfDocumentBackend)
+
+        # Collect page placeholders; backends are loaded lazily in the preprocess stage
+        start_page, end_page = conv_res.input.limits.page_range
+        pages: list[Page] = []
+        for i in range(conv_res.input.page_count):
+            if start_page - 1 <= i <= end_page - 1:
+                page = Page(page_no=i + 1)
+                conv_res.pages.append(page)
+                pages.append(page)
+
+        if not pages:
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        total_pages: int = len(pages)
+        ctx: RunContext = self._create_run_ctx()
+        for st in ctx.stages:
+            st.start()
+
+        proc = ProcessingResult(total_expected=total_pages)
+        fed_idx: int = 0  # number of pages successfully queued
+        batch_size: int = 32  # drain chunk
+        start_time = time.monotonic()
+        timeout_exceeded = False
+        input_queue_closed = False
+        try:
+            while proc.success_count + proc.failure_count < total_pages:
+                # Check timeout
+                if (
+                    self.pipeline_options.document_timeout is not None
+                    and not timeout_exceeded
+                ):
+                    elapsed_time = time.monotonic() - start_time
+                    if elapsed_time > self.pipeline_options.document_timeout:
+                        _log.warning(
+                            f"Document processing time ({elapsed_time:.3f}s) "
+                            f"exceeded timeout of {self.pipeline_options.document_timeout:.3f}s"
+                        )
+                        timeout_exceeded = True
+                        ctx.timed_out_run_ids.add(run_id)
+                        if not input_queue_closed:
+                            ctx.first_stage.input_queue.close()
+                            input_queue_closed = True
+                        # Break immediately - don't wait for in-flight work
+                        break
+
+                # 1) feed - try to enqueue until the first queue is full
+                if not input_queue_closed:
+                    while fed_idx < total_pages:
+                        ok = ctx.first_stage.input_queue.put(
+                            ThreadedItem(
+                                payload=pages[fed_idx],
+                                run_id=run_id,
+                                page_no=pages[fed_idx].page_no,
+                                conv_res=conv_res,
+                            ),
+                            timeout=0.0,  # non-blocking try-put
+                        )
+                        if ok:
+                            fed_idx += 1
+                            if fed_idx == total_pages:
+                                ctx.first_stage.input_queue.close()
+                                input_queue_closed = True
+                        else:  # queue full - switch to draining
+                            break
+
+                # 2) drain - pull whatever is ready from the output side
+                out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
+                for itm in out_batch:
+                    if itm.run_id != run_id:
+                        continue
+                    if itm.is_failed or itm.error:
+                        proc.failed_pages.append(
+                            (itm.page_no, itm.error or RuntimeError("unknown error"))
+                        )
+                    else:
+                        assert itm.payload is not None
+                        proc.pages.append(itm.payload)
+
+                # 3) failure safety - downstream closed early
+                if not out_batch and ctx.output_queue.closed:
+                    missing = total_pages - (proc.success_count + proc.failure_count)
+                    if missing > 0:
+                        proc.failed_pages.extend(
+                            [(-1, RuntimeError("pipeline terminated early"))] * missing
+                        )
+                    break
+
+            # Mark remaining pages as failed if the timeout occurred
+            if timeout_exceeded:
+                completed_page_nos = {p.page_no for p in proc.pages} | {
+                    fp for fp, _ in proc.failed_pages
+                }
+                for page in pages[fed_idx:]:
+                    if page.page_no not in completed_page_nos:
+                        proc.failed_pages.append(
+                            (page.page_no, RuntimeError("document timeout exceeded"))
+                        )
+        finally:
+            for st in ctx.stages:
+                st.stop()
+            ctx.output_queue.close()
+
+        self._integrate_results(conv_res, proc, timeout_exceeded=timeout_exceeded)
+        return conv_res
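
The loop above interleaves a non-blocking "feed" with a short blocking "drain" on the caller thread, so back-pressure from the first queue can never deadlock the feeder. A self-contained sketch of the same pattern, reusing this module's `ThreadedQueue` with a trivial pass-through worker standing in for the whole stage chain (fields stubbed as before):

    import threading

    q_in, q_out = ThreadedQueue(max_size=4), ThreadedQueue(max_size=4)

    def worker() -> None:
        while True:
            batch = q_in.get_batch(2, timeout=0.05)
            if not batch and q_in.closed:
                q_out.close()
                return
            for it in batch:
                q_out.put(it)

    t = threading.Thread(target=worker)
    t.start()
    todo = [ThreadedItem(payload=None, run_id=1, page_no=n, conv_res=None) for n in range(10)]
    fed, done = 0, []
    while len(done) < len(todo):
        # 1) feed: non-blocking try-put until the input queue pushes back
        while fed < len(todo) and q_in.put(todo[fed], timeout=0.0):
            fed += 1
            if fed == len(todo):
                q_in.close()
        # 2) drain: collect whatever the worker has finished
        done.extend(q_out.get_batch(32, timeout=0.05))
    t.join()
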
+
+    # ---------------------------------------------------- integrate_results()
+    def _integrate_results(
+        self,
+        conv_res: ConversionResult,
+        proc: ProcessingResult,
+        timeout_exceeded: bool = False,
+    ) -> None:
+        page_map = {p.page_no: p for p in proc.pages}
+        # Only keep pages that successfully completed processing
+        conv_res.pages = [
+            page_map[p.page_no] for p in conv_res.pages if p.page_no in page_map
+        ]
+        # Add error details from failed pages
+        for page_no, error in proc.failed_pages:
+            page_label = f"Page {page_no}" if page_no > 0 else "Unknown page"
+            error_msg = str(error) if error else ""
+            error_item = ErrorItem(
+                component_type=DoclingComponentType.PIPELINE,
+                module_name=self.__class__.__name__,
+                error_message=f"{page_label}: {error_msg}" if error_msg else page_label,
+            )
+            conv_res.errors.append(error_item)
+        if timeout_exceeded and proc.total_expected > 0:
+            # Timeout exceeded: set PARTIAL_SUCCESS if any pages were attempted
+            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+        elif proc.is_complete_failure:
+            conv_res.status = ConversionStatus.FAILURE
+        elif proc.is_partial_success:
+            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+        else:
+            conv_res.status = ConversionStatus.SUCCESS
+        if not self.keep_images:
+            for p in conv_res.pages:
+                p._image_cache = {}
+        for p in conv_res.pages:
+            if not self.keep_backend and p._backend is not None:
+                p._backend.unload()
+            if not self.pipeline_options.generate_parsed_pages:
+                del p.parsed_page
+                p.parsed_page = None
+
+    # ---------------------------------------------------------------- assemble
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        elements, headers, body = [], [], []
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+            for p in conv_res.pages:
+                if p.assembled:
+                    elements.extend(p.assembled.elements)
+                    headers.extend(p.assembled.headers)
+                    body.extend(p.assembled.body)
+            conv_res.assembled = AssembledUnit(
+                elements=elements, headers=headers, body=body
+            )
+            conv_res.document = self.reading_order_model(conv_res)
+
+            # Generate page images in the output
+            if self.pipeline_options.generate_page_images:
+                for page in conv_res.pages:
+                    assert page.image is not None
+                    page_no = page.page_no
+                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
+                        page.image, dpi=int(72 * self.pipeline_options.images_scale)
+                    )
+
+            # Generate images of the requested element types
+            with warnings.catch_warnings():  # deprecated generate_table_images
+                warnings.filterwarnings("ignore", category=DeprecationWarning)
+                if (
+                    self.pipeline_options.generate_picture_images
+                    or self.pipeline_options.generate_table_images
+                ):
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                            continue
+                        if (
+                            isinstance(element, PictureItem)
+                            and self.pipeline_options.generate_picture_images
+                        ) or (
+                            isinstance(element, TableItem)
+                            and self.pipeline_options.generate_table_images
+                        ):
+                            page_ix = element.prov[0].page_no
+                            page = next(
+                                (p for p in conv_res.pages if p.page_no == page_ix),
+                                cast("Page", None),
+                            )
+                            assert page is not None
+                            assert page.size is not None
+                            assert page.image is not None
+
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+
+                            cropped_im = page.image.crop(crop_bbox.as_tuple())
+                            element.image = ImageRef.from_pil(
+                                cropped_im, dpi=int(72 * scale)
+                            )
+
+            # Aggregate confidence values for the document:
+            if len(conv_res.pages) > 0:
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        category=RuntimeWarning,
+                        message="Mean of empty slice|All-NaN slice encountered",
+                    )
+                    conv_res.confidence.layout_score = float(
+                        np.nanmean(
+                            [c.layout_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.parse_score = float(
+                        np.nanquantile(
+                            [c.parse_score for c in conv_res.confidence.pages.values()],
+                            q=0.1,  # parse score should relate to the worst 10% of pages
+                        )
+                    )
+                    conv_res.confidence.table_score = float(
+                        np.nanmean(
+                            [c.table_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.ocr_score = float(
+                        np.nanmean(
+                            [c.ocr_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+
+        return conv_res
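
The reducers are NaN-aware on purpose: pages that never produced a score contribute NaN and are simply skipped, while `parse_score` is pinned to the worst decile so a single bad page dominates it. A small worked example with plain numpy:

    import numpy as np

    layout_scores = [0.9, float("nan"), 0.7]  # middle page produced no score
    print(float(np.nanmean(layout_scores)))   # 0.8 - the NaN page is ignored
    parse_scores = [0.95, 0.9, 0.2, 0.9, 0.9]
    print(float(np.nanquantile(parse_scores, q=0.1)))  # 0.48 - dragged down by the worst page
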
+
+    # ---------------------------------------------------------------- misc
+    @classmethod
+    def get_default_options(cls) -> ThreadedPdfPipelineOptions:
+        return ThreadedPdfPipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
+        return isinstance(backend, PdfDocumentBackend)
+
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        return conv_res.status
+
+    def _unload(self, conv_res: ConversionResult) -> None:
+        for p in conv_res.pages:
+            if p._backend is not None:
+                p._backend.unload()
+        if conv_res.input._backend:
+            conv_res.input._backend.unload()