docling 2.59.0__py3-none-any.whl → 2.60.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.

Potentially problematic release.

@@ -1,647 +1,5 @@
- # threaded_standard_pdf_pipeline.py
- """Thread-safe, production-ready PDF pipeline
- ================================================
- A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
+ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

- * **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
-   threads so that concurrent invocations never share mutable state.
- * **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
-   relying on :pyfunc:`id`, which may clash after garbage collection.
- * **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
-   propagates downstream so stages terminate deterministically without sentinels.
- * **Minimal shared state** - heavyweight models are initialised once per pipeline instance
-   and only read by worker threads; no runtime mutability is exposed.
- * **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
- """

- from __future__ import annotations
-
- import itertools
- import logging
- import threading
- import time
- import warnings
- from collections import defaultdict, deque
- from dataclasses import dataclass, field
- from pathlib import Path
- from typing import Any, Iterable, List, Optional, Sequence, Tuple, cast
-
- import numpy as np
- from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
-
- from docling.backend.abstract_backend import AbstractDocumentBackend
- from docling.backend.pdf_backend import PdfDocumentBackend
- from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
- from docling.datamodel.document import ConversionResult
- from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
- from docling.datamodel.settings import settings
- from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
- from docling.models.factories import get_ocr_factory
- from docling.models.layout_model import LayoutModel
- from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
- from docling.models.page_preprocessing_model import (
-     PagePreprocessingModel,
-     PagePreprocessingOptions,
- )
- from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
- from docling.models.table_structure_model import TableStructureModel
- from docling.pipeline.base_pipeline import ConvertPipeline
- from docling.utils.profiling import ProfilingScope, TimeRecorder
- from docling.utils.utils import chunkify
-
- _log = logging.getLogger(__name__)
-
- # ──────────────────────────────────────────────────────────────────────────────
- # Helper data structures
- # ──────────────────────────────────────────────────────────────────────────────
-
-
- @dataclass
- class ThreadedItem:
-     """Envelope that travels between pipeline stages."""
-
-     payload: Optional[Page]
-     run_id: int  # Unique per *execute* call, monotonic across pipeline instance
-     page_no: int
-     conv_res: ConversionResult
-     error: Optional[Exception] = None
-     is_failed: bool = False
-
-
- @dataclass
- class ProcessingResult:
-     """Aggregated outcome of a pipeline run."""
-
-     pages: List[Page] = field(default_factory=list)
-     failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
-     total_expected: int = 0
-
-     @property
-     def success_count(self) -> int:
-         return len(self.pages)
-
-     @property
-     def failure_count(self) -> int:
-         return len(self.failed_pages)
-
-     @property
-     def is_partial_success(self) -> bool:
-         return 0 < self.success_count < self.total_expected
-
-     @property
-     def is_complete_failure(self) -> bool:
-         return self.success_count == 0 and self.failure_count > 0
-
-
- class ThreadedQueue:
-     """Bounded queue with blocking put/get_batch and explicit *close()* semantics."""
-
-     __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
-
-     def __init__(self, max_size: int) -> None:
-         self._max: int = max_size
-         self._items: deque[ThreadedItem] = deque()
-         self._lock = threading.Lock()
-         self._not_full = threading.Condition(self._lock)
-         self._not_empty = threading.Condition(self._lock)
-         self._closed = False
-
-     # ---------------------------------------------------------------- put()
-     def put(self, item: ThreadedItem, timeout: Optional[float] | None = None) -> bool:
-         """Block until queue accepts *item* or is closed. Returns *False* if closed."""
-         with self._not_full:
-             if self._closed:
-                 return False
-             start = time.monotonic()
-             while len(self._items) >= self._max and not self._closed:
-                 if timeout is not None:
-                     remaining = timeout - (time.monotonic() - start)
-                     if remaining <= 0:
-                         return False
-                     self._not_full.wait(remaining)
-                 else:
-                     self._not_full.wait()
-             if self._closed:
-                 return False
-             self._items.append(item)
-             self._not_empty.notify()
-             return True
-
-     # ------------------------------------------------------------ get_batch()
-     def get_batch(
-         self, size: int, timeout: Optional[float] | None = None
-     ) -> List[ThreadedItem]:
-         """Return up to *size* items. Blocks until ≥1 item present or queue closed/timeout."""
-         with self._not_empty:
-             start = time.monotonic()
-             while not self._items and not self._closed:
-                 if timeout is not None:
-                     remaining = timeout - (time.monotonic() - start)
-                     if remaining <= 0:
-                         return []
-                     self._not_empty.wait(remaining)
-                 else:
-                     self._not_empty.wait()
-             batch: List[ThreadedItem] = []
-             while self._items and len(batch) < size:
-                 batch.append(self._items.popleft())
-             if batch:
-                 self._not_full.notify_all()
-             return batch
-
-     # ---------------------------------------------------------------- close()
-     def close(self) -> None:
-         with self._lock:
-             self._closed = True
-             self._not_empty.notify_all()
-             self._not_full.notify_all()
-
-     # -------------------------------------------------------------- property
-     @property
-     def closed(self) -> bool:
-         return self._closed
-
-
- class ThreadedPipelineStage:
-     """A single pipeline stage backed by one worker thread."""
-
-     def __init__(
-         self,
-         *,
-         name: str,
-         model: Any,
-         batch_size: int,
-         batch_timeout: float,
-         queue_max_size: int,
-     ) -> None:
-         self.name = name
-         self.model = model
-         self.batch_size = batch_size
-         self.batch_timeout = batch_timeout
-         self.input_queue = ThreadedQueue(queue_max_size)
-         self._outputs: list[ThreadedQueue] = []
-         self._thread: Optional[threading.Thread] = None
-         self._running = False
-
-     # ---------------------------------------------------------------- wiring
-     def add_output_queue(self, q: ThreadedQueue) -> None:
-         self._outputs.append(q)
-
-     # -------------------------------------------------------------- lifecycle
-     def start(self) -> None:
-         if self._running:
-             return
-         self._running = True
-         self._thread = threading.Thread(
-             target=self._run, name=f"Stage-{self.name}", daemon=True
-         )
-         self._thread.start()
-
-     def stop(self) -> None:
-         if not self._running:
-             return
-         self._running = False
-         self.input_queue.close()
-         if self._thread is not None:
-             self._thread.join(timeout=30.0)
-             if self._thread.is_alive():
-                 _log.warning("Stage %s did not terminate cleanly within 30s", self.name)
-
-     # ------------------------------------------------------------------ _run
-     def _run(self) -> None:
-         try:
-             while self._running:
-                 batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
-                 if not batch and self.input_queue.closed:
-                     break
-                 processed = self._process_batch(batch)
-                 self._emit(processed)
-         except Exception:  # pragma: no cover - top-level guard
-             _log.exception("Fatal error in stage %s", self.name)
-         finally:
-             for q in self._outputs:
-                 q.close()
-
-     # ----------------------------------------------------- _process_batch()
-     def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
-         """Run *model* on *batch* grouped by run_id to maximise batching."""
-         groups: dict[int, list[ThreadedItem]] = defaultdict(list)
-         for itm in batch:
-             groups[itm.run_id].append(itm)
-
-         result: list[ThreadedItem] = []
-         for rid, items in groups.items():
-             good: list[ThreadedItem] = [i for i in items if not i.is_failed]
-             if not good:
-                 result.extend(items)
-                 continue
-             try:
-                 # Filter out None payloads and ensure type safety
-                 pages_with_payloads = [
-                     (i, i.payload) for i in good if i.payload is not None
-                 ]
-                 if len(pages_with_payloads) != len(good):
-                     # Some items have None payloads, mark all as failed
-                     for it in items:
-                         it.is_failed = True
-                         it.error = RuntimeError("Page payload is None")
-                     result.extend(items)
-                     continue
-
-                 pages: List[Page] = [payload for _, payload in pages_with_payloads]
-                 processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
-                 if len(processed_pages) != len(pages):  # strict mismatch guard
-                     raise RuntimeError(
-                         f"Model {self.name} returned wrong number of pages"
-                     )
-                 for idx, page in enumerate(processed_pages):
-                     result.append(
-                         ThreadedItem(
-                             payload=page,
-                             run_id=rid,
-                             page_no=good[idx].page_no,
-                             conv_res=good[idx].conv_res,
-                         )
-                     )
-             except Exception as exc:
-                 _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
-                 for it in items:
-                     it.is_failed = True
-                     it.error = exc
-                 result.extend(items)
-         return result
-
-     # -------------------------------------------------------------- _emit()
-     def _emit(self, items: Iterable[ThreadedItem]) -> None:
-         for item in items:
-             for q in self._outputs:
-                 if not q.put(item):
-                     _log.error("Output queue closed while emitting from %s", self.name)
-
-
- @dataclass
- class RunContext:
-     """Wiring for a single *execute* call."""
-
-     stages: list[ThreadedPipelineStage]
-     first_stage: ThreadedPipelineStage
-     output_queue: ThreadedQueue
-
-
- # ──────────────────────────────────────────────────────────────────────────────
- # Main pipeline
- # ──────────────────────────────────────────────────────────────────────────────
-
-
- class ThreadedStandardPdfPipeline(ConvertPipeline):
-     """High-performance PDF pipeline with multi-threaded stages."""
-
-     def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
-         super().__init__(pipeline_options)
-         self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
-         self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
-
-         # initialise heavy models once
-         self._init_models()
-
-     # ────────────────────────────────────────────────────────────────────────
-     # Heavy-model initialisation & helpers
-     # ────────────────────────────────────────────────────────────────────────
-
-     def _init_models(self) -> None:
-         art_path = self.artifacts_path
-         self.keep_images = (
-             self.pipeline_options.generate_page_images
-             or self.pipeline_options.generate_picture_images
-             or self.pipeline_options.generate_table_images
-         )
-         self.preprocessing_model = PagePreprocessingModel(
-             options=PagePreprocessingOptions(
-                 images_scale=self.pipeline_options.images_scale
-             )
-         )
-         self.ocr_model = self._make_ocr_model(art_path)
-         self.layout_model = LayoutModel(
-             artifacts_path=art_path,
-             accelerator_options=self.pipeline_options.accelerator_options,
-             options=self.pipeline_options.layout_options,
-         )
-         self.table_model = TableStructureModel(
-             enabled=self.pipeline_options.do_table_structure,
-             artifacts_path=art_path,
-             options=self.pipeline_options.table_structure_options,
-             accelerator_options=self.pipeline_options.accelerator_options,
-         )
-         self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
-         self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
-
-         # --- optional enrichment ------------------------------------------------
-         self.enrichment_pipe = [
-             # Code Formula Enrichment Model
-             CodeFormulaModel(
-                 enabled=self.pipeline_options.do_code_enrichment
-                 or self.pipeline_options.do_formula_enrichment,
-                 artifacts_path=self.artifacts_path,
-                 options=CodeFormulaModelOptions(
-                     do_code_enrichment=self.pipeline_options.do_code_enrichment,
-                     do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
-                 ),
-                 accelerator_options=self.pipeline_options.accelerator_options,
-             ),
-             *self.enrichment_pipe,
-         ]
-
-         self.keep_backend = any(
-             (
-                 self.pipeline_options.do_formula_enrichment,
-                 self.pipeline_options.do_code_enrichment,
-                 self.pipeline_options.do_picture_classification,
-                 self.pipeline_options.do_picture_description,
-             )
-         )
-
-     # ---------------------------------------------------------------- helpers
-     def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
-         factory = get_ocr_factory(
-             allow_external_plugins=self.pipeline_options.allow_external_plugins
-         )
-         return factory.create_instance(
-             options=self.pipeline_options.ocr_options,
-             enabled=self.pipeline_options.do_ocr,
-             artifacts_path=art_path,
-             accelerator_options=self.pipeline_options.accelerator_options,
-         )
-
-     # ────────────────────────────────────────────────────────────────────────
-     # Build - thread pipeline
-     # ────────────────────────────────────────────────────────────────────────
-
-     def _create_run_ctx(self) -> RunContext:
-         opts = self.pipeline_options
-         preprocess = ThreadedPipelineStage(
-             name="preprocess",
-             model=self.preprocessing_model,
-             batch_size=1,
-             batch_timeout=opts.batch_timeout_seconds,
-             queue_max_size=opts.queue_max_size,
-         )
-         ocr = ThreadedPipelineStage(
-             name="ocr",
-             model=self.ocr_model,
-             batch_size=opts.ocr_batch_size,
-             batch_timeout=opts.batch_timeout_seconds,
-             queue_max_size=opts.queue_max_size,
-         )
-         layout = ThreadedPipelineStage(
-             name="layout",
-             model=self.layout_model,
-             batch_size=opts.layout_batch_size,
-             batch_timeout=opts.batch_timeout_seconds,
-             queue_max_size=opts.queue_max_size,
-         )
-         table = ThreadedPipelineStage(
-             name="table",
-             model=self.table_model,
-             batch_size=opts.table_batch_size,
-             batch_timeout=opts.batch_timeout_seconds,
-             queue_max_size=opts.queue_max_size,
-         )
-         assemble = ThreadedPipelineStage(
-             name="assemble",
-             model=self.assemble_model,
-             batch_size=1,
-             batch_timeout=opts.batch_timeout_seconds,
-             queue_max_size=opts.queue_max_size,
-         )
-
-         # wire stages
-         output_q = ThreadedQueue(opts.queue_max_size)
-         preprocess.add_output_queue(ocr.input_queue)
-         ocr.add_output_queue(layout.input_queue)
-         layout.add_output_queue(table.input_queue)
-         table.add_output_queue(assemble.input_queue)
-         assemble.add_output_queue(output_q)
-
-         stages = [preprocess, ocr, layout, table, assemble]
-         return RunContext(stages=stages, first_stage=preprocess, output_queue=output_q)
-
-     # --------------------------------------------------------------------- build
-     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-         """Stream-build the document while interleaving producer and consumer work."""
-         run_id = next(self._run_seq)
-         assert isinstance(conv_res.input._backend, PdfDocumentBackend)
-         backend = conv_res.input._backend
-
-         # preload & initialise pages -------------------------------------------------------------
-         start_page, end_page = conv_res.input.limits.page_range
-         pages: list[Page] = []
-         for i in range(conv_res.input.page_count):
-             if start_page - 1 <= i <= end_page - 1:
-                 page = Page(page_no=i)
-                 page._backend = backend.load_page(i)
-                 if page._backend and page._backend.is_valid():
-                     page.size = page._backend.get_size()
-                     conv_res.pages.append(page)
-                     pages.append(page)
-
-         if not pages:
-             conv_res.status = ConversionStatus.FAILURE
-             return conv_res
-
-         total_pages: int = len(pages)
-         ctx: RunContext = self._create_run_ctx()
-         for st in ctx.stages:
-             st.start()
-
-         proc = ProcessingResult(total_expected=total_pages)
-         fed_idx: int = 0  # number of pages successfully queued
-         batch_size: int = 32  # drain chunk
-         try:
-             while proc.success_count + proc.failure_count < total_pages:
-                 # 1) feed - try to enqueue until the first queue is full
-                 while fed_idx < total_pages:
-                     ok = ctx.first_stage.input_queue.put(
-                         ThreadedItem(
-                             payload=pages[fed_idx],
-                             run_id=run_id,
-                             page_no=pages[fed_idx].page_no,
-                             conv_res=conv_res,
-                         ),
-                         timeout=0.0,  # non-blocking try-put
-                     )
-                     if ok:
-                         fed_idx += 1
-                         if fed_idx == total_pages:
-                             ctx.first_stage.input_queue.close()
-                     else:  # queue full - switch to draining
-                         break
-
-                 # 2) drain - pull whatever is ready from the output side
-                 out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
-                 for itm in out_batch:
-                     if itm.run_id != run_id:
-                         continue
-                     if itm.is_failed or itm.error:
-                         proc.failed_pages.append(
-                             (itm.page_no, itm.error or RuntimeError("unknown error"))
-                         )
-                     else:
-                         assert itm.payload is not None
-                         proc.pages.append(itm.payload)
-
-                 # 3) failure safety - downstream closed early -> mark missing pages failed
-                 if not out_batch and ctx.output_queue.closed:
-                     missing = total_pages - (proc.success_count + proc.failure_count)
-                     if missing > 0:
-                         proc.failed_pages.extend(
-                             [(-1, RuntimeError("pipeline terminated early"))] * missing
-                         )
-                     break
-         finally:
-             for st in ctx.stages:
-                 st.stop()
-             ctx.output_queue.close()
-
-         self._integrate_results(conv_res, proc)
-         return conv_res
-
-     # ---------------------------------------------------- integrate_results()
-     def _integrate_results(
-         self, conv_res: ConversionResult, proc: ProcessingResult
-     ) -> None:
-         page_map = {p.page_no: p for p in proc.pages}
-         conv_res.pages = [
-             page_map.get(p.page_no, p)
-             for p in conv_res.pages
-             if p.page_no in page_map
-             or not any(fp == p.page_no for fp, _ in proc.failed_pages)
-         ]
-         if proc.is_complete_failure:
-             conv_res.status = ConversionStatus.FAILURE
-         elif proc.is_partial_success:
-             conv_res.status = ConversionStatus.PARTIAL_SUCCESS
-         else:
-             conv_res.status = ConversionStatus.SUCCESS
-         if not self.keep_images:
-             for p in conv_res.pages:
-                 p._image_cache = {}
-         for p in conv_res.pages:
-             if not self.keep_backend and p._backend is not None:
-                 p._backend.unload()
-             if not self.pipeline_options.generate_parsed_pages:
-                 del p.parsed_page
-                 p.parsed_page = None
-
-     # ---------------------------------------------------------------- assemble
-     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
-         elements, headers, body = [], [], []
-         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
-             for p in conv_res.pages:
-                 if p.assembled:
-                     elements.extend(p.assembled.elements)
-                     headers.extend(p.assembled.headers)
-                     body.extend(p.assembled.body)
-             conv_res.assembled = AssembledUnit(
-                 elements=elements, headers=headers, body=body
-             )
-             conv_res.document = self.reading_order_model(conv_res)
-
-             # Generate page images in the output
-             if self.pipeline_options.generate_page_images:
-                 for page in conv_res.pages:
-                     assert page.image is not None
-                     page_no = page.page_no + 1
-                     conv_res.document.pages[page_no].image = ImageRef.from_pil(
-                         page.image, dpi=int(72 * self.pipeline_options.images_scale)
-                     )
-
-             # Generate images of the requested element types
-             with warnings.catch_warnings():  # deprecated generate_table_images
-                 warnings.filterwarnings("ignore", category=DeprecationWarning)
-                 if (
-                     self.pipeline_options.generate_picture_images
-                     or self.pipeline_options.generate_table_images
-                 ):
-                     scale = self.pipeline_options.images_scale
-                     for element, _level in conv_res.document.iterate_items():
-                         if not isinstance(element, DocItem) or len(element.prov) == 0:
-                             continue
-                         if (
-                             isinstance(element, PictureItem)
-                             and self.pipeline_options.generate_picture_images
-                         ) or (
-                             isinstance(element, TableItem)
-                             and self.pipeline_options.generate_table_images
-                         ):
-                             page_ix = element.prov[0].page_no - 1
-                             page = next(
-                                 (p for p in conv_res.pages if p.page_no == page_ix),
-                                 cast("Page", None),
-                             )
-                             assert page is not None
-                             assert page.size is not None
-                             assert page.image is not None
-
-                             crop_bbox = (
-                                 element.prov[0]
-                                 .bbox.scaled(scale=scale)
-                                 .to_top_left_origin(
-                                     page_height=page.size.height * scale
-                                 )
-                             )
-
-                             cropped_im = page.image.crop(crop_bbox.as_tuple())
-                             element.image = ImageRef.from_pil(
-                                 cropped_im, dpi=int(72 * scale)
-                             )
-
-             # Aggregate confidence values for document:
-             if len(conv_res.pages) > 0:
-                 with warnings.catch_warnings():
-                     warnings.filterwarnings(
-                         "ignore",
-                         category=RuntimeWarning,
-                         message="Mean of empty slice|All-NaN slice encountered",
-                     )
-                     conv_res.confidence.layout_score = float(
-                         np.nanmean(
-                             [c.layout_score for c in conv_res.confidence.pages.values()]
-                         )
-                     )
-                     conv_res.confidence.parse_score = float(
-                         np.nanquantile(
-                             [c.parse_score for c in conv_res.confidence.pages.values()],
-                             q=0.1,  # parse score should relate to worst 10% of pages.
-                         )
-                     )
-                     conv_res.confidence.table_score = float(
-                         np.nanmean(
-                             [c.table_score for c in conv_res.confidence.pages.values()]
-                         )
-                     )
-                     conv_res.confidence.ocr_score = float(
-                         np.nanmean(
-                             [c.ocr_score for c in conv_res.confidence.pages.values()]
-                         )
-                     )
-
-         return conv_res
-
-     # ---------------------------------------------------------------- misc
-     @classmethod
-     def get_default_options(cls) -> ThreadedPdfPipelineOptions:
-         return ThreadedPdfPipelineOptions()
-
-     @classmethod
-     def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
-         return isinstance(backend, PdfDocumentBackend)
-
-     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
-         return conv_res.status
-
-     def _unload(self, conv_res: ConversionResult) -> None:
-         for p in conv_res.pages:
-             if p._backend is not None:
-                 p._backend.unload()
-         if conv_res.input._backend:
-             conv_res.input._backend.unload()
+ class ThreadedStandardPdfPipeline(StandardPdfPipeline):
+     """Backwards compatible import for ThreadedStandardPdfPipeline."""
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docling
- Version: 2.59.0
+ Version: 2.60.0
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
  License-Expression: MIT