docling 2.42.2__py3-none-any.whl → 2.44.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,605 @@
+ # threaded_standard_pdf_pipeline.py
+ """Thread-safe, production-ready PDF pipeline
+ ================================================
+ A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
+
+ * **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
+   threads so that concurrent invocations never share mutable state.
+ * **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
+   relying on :py:func:`id`, which may clash after garbage collection.
+ * **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
+   propagates downstream so stages terminate deterministically without sentinels.
+ * **Minimal shared state** - heavyweight models are initialised once per pipeline instance
+   and only read by worker threads; no runtime mutability is exposed.
+ * **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
+ """
+
+ from __future__ import annotations
+
+ import itertools
+ import logging
+ import threading
+ import time
+ from collections import defaultdict, deque
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any, Iterable, List, Optional, Sequence, Tuple
+
+ from docling.backend.abstract_backend import AbstractDocumentBackend
+ from docling.backend.pdf_backend import PdfDocumentBackend
+ from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
+ from docling.datamodel.document import ConversionResult
+ from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
+ from docling.datamodel.settings import settings
+ from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
+ from docling.models.document_picture_classifier import (
+     DocumentPictureClassifier,
+     DocumentPictureClassifierOptions,
+ )
+ from docling.models.factories import get_ocr_factory, get_picture_description_factory
+ from docling.models.layout_model import LayoutModel
+ from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
+ from docling.models.page_preprocessing_model import (
+     PagePreprocessingModel,
+     PagePreprocessingOptions,
+ )
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+ from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
+ from docling.models.table_structure_model import TableStructureModel
+ from docling.pipeline.base_pipeline import BasePipeline
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
+ from docling.utils.utils import chunkify
+
+ _log = logging.getLogger(__name__)
+
+ # ──────────────────────────────────────────────────────────────────────────────
+ # Helper data structures
+ # ──────────────────────────────────────────────────────────────────────────────
+
+
+ @dataclass
+ class ThreadedItem:
+     """Envelope that travels between pipeline stages."""
+
+     payload: Optional[Page]
+     run_id: int  # Unique per *execute* call, monotonic across pipeline instance
+     page_no: int
+     conv_res: ConversionResult
+     error: Optional[Exception] = None
+     is_failed: bool = False
+
+
+ @dataclass
+ class ProcessingResult:
+     """Aggregated outcome of a pipeline run."""
+
+     pages: List[Page] = field(default_factory=list)
+     failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
+     total_expected: int = 0
+
+     @property
+     def success_count(self) -> int:
+         return len(self.pages)
+
+     @property
+     def failure_count(self) -> int:
+         return len(self.failed_pages)
+
+     @property
+     def is_partial_success(self) -> bool:
+         return 0 < self.success_count < self.total_expected
+
+     @property
+     def is_complete_failure(self) -> bool:
+         return self.success_count == 0 and self.failure_count > 0
+
+
+ class ThreadedQueue:
+     """Bounded queue with blocking put()/get_batch() and explicit *close()* semantics."""
+
+     __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
+
+     def __init__(self, max_size: int) -> None:
+         self._max: int = max_size
+         self._items: deque[ThreadedItem] = deque()
+         self._lock = threading.Lock()
+         self._not_full = threading.Condition(self._lock)
+         self._not_empty = threading.Condition(self._lock)
+         self._closed = False
+
+     # ---------------------------------------------------------------- put()
+     def put(self, item: ThreadedItem, timeout: Optional[float] = None) -> bool:
+         """Block until the queue accepts *item* or is closed. Returns *False* if closed."""
+         with self._not_full:
+             if self._closed:
+                 return False
+             start = time.monotonic()
+             while len(self._items) >= self._max and not self._closed:
+                 if timeout is not None:
+                     remaining = timeout - (time.monotonic() - start)
+                     if remaining <= 0:
+                         return False
+                     self._not_full.wait(remaining)
+                 else:
+                     self._not_full.wait()
+             if self._closed:
+                 return False
+             self._items.append(item)
+             self._not_empty.notify()
+             return True
+
+     # ------------------------------------------------------------ get_batch()
+     def get_batch(
+         self, size: int, timeout: Optional[float] = None
+     ) -> List[ThreadedItem]:
+         """Return up to *size* items. Blocks until ≥1 item is present or the queue is closed/times out."""
+         with self._not_empty:
+             start = time.monotonic()
+             while not self._items and not self._closed:
+                 if timeout is not None:
+                     remaining = timeout - (time.monotonic() - start)
+                     if remaining <= 0:
+                         return []
+                     self._not_empty.wait(remaining)
+                 else:
+                     self._not_empty.wait()
+             batch: List[ThreadedItem] = []
+             while self._items and len(batch) < size:
+                 batch.append(self._items.popleft())
+             if batch:
+                 self._not_full.notify_all()
+             return batch
+
+     # ---------------------------------------------------------------- close()
+     def close(self) -> None:
+         with self._lock:
+             self._closed = True
+             self._not_empty.notify_all()
+             self._not_full.notify_all()
+
+     # -------------------------------------------------------------- property
+     @property
+     def closed(self) -> bool:
+         return self._closed
+
+
+ class ThreadedPipelineStage:
+     """A single pipeline stage backed by one worker thread."""
+
+     def __init__(
+         self,
+         *,
+         name: str,
+         model: Any,
+         batch_size: int,
+         batch_timeout: float,
+         queue_max_size: int,
+     ) -> None:
+         self.name = name
+         self.model = model
+         self.batch_size = batch_size
+         self.batch_timeout = batch_timeout
+         self.input_queue = ThreadedQueue(queue_max_size)
+         self._outputs: list[ThreadedQueue] = []
+         self._thread: Optional[threading.Thread] = None
+         self._running = False
+
+     # ---------------------------------------------------------------- wiring
+     def add_output_queue(self, q: ThreadedQueue) -> None:
+         self._outputs.append(q)
+
+     # -------------------------------------------------------------- lifecycle
+     def start(self) -> None:
+         if self._running:
+             return
+         self._running = True
+         self._thread = threading.Thread(
+             target=self._run, name=f"Stage-{self.name}", daemon=False
+         )
+         self._thread.start()
+
+     def stop(self) -> None:
+         if not self._running:
+             return
+         self._running = False
+         self.input_queue.close()
+         if self._thread is not None:
+             self._thread.join(timeout=30.0)
+             if self._thread.is_alive():
+                 _log.warning("Stage %s did not terminate cleanly within 30s", self.name)
+
+     # ------------------------------------------------------------------ _run
+     def _run(self) -> None:
+         try:
+             while self._running:
+                 batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
+                 if not batch and self.input_queue.closed:
+                     break
+                 processed = self._process_batch(batch)
+                 self._emit(processed)
+         except Exception:  # pragma: no cover - top-level guard
+             _log.exception("Fatal error in stage %s", self.name)
+         finally:
+             for q in self._outputs:
+                 q.close()
+
+     # ----------------------------------------------------- _process_batch()
+     def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
+         """Run *model* on *batch*, grouped by run_id to maximise batching."""
+         groups: dict[int, list[ThreadedItem]] = defaultdict(list)
+         for itm in batch:
+             groups[itm.run_id].append(itm)
+
+         result: list[ThreadedItem] = []
+         for rid, items in groups.items():
+             good: list[ThreadedItem] = [i for i in items if not i.is_failed]
+             if not good:
+                 result.extend(items)
+                 continue
+             try:
+                 # Filter out None payloads and ensure type safety
+                 pages_with_payloads = [
+                     (i, i.payload) for i in good if i.payload is not None
+                 ]
+                 if len(pages_with_payloads) != len(good):
+                     # Some items have None payloads; mark the whole group as failed
+                     for it in items:
+                         it.is_failed = True
+                         it.error = RuntimeError("Page payload is None")
+                     result.extend(items)
+                     continue
+
+                 pages: List[Page] = [payload for _, payload in pages_with_payloads]
+                 processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
+                 if len(processed_pages) != len(pages):  # strict mismatch guard
+                     raise RuntimeError(
+                         f"Model {self.name} returned wrong number of pages"
+                     )
+                 for idx, page in enumerate(processed_pages):
+                     result.append(
+                         ThreadedItem(
+                             payload=page,
+                             run_id=rid,
+                             page_no=good[idx].page_no,
+                             conv_res=good[idx].conv_res,
+                         )
+                     )
+             except Exception as exc:
+                 _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
+                 for it in items:
+                     it.is_failed = True
+                     it.error = exc
+                 result.extend(items)
+         return result
+
+     # -------------------------------------------------------------- _emit()
+     def _emit(self, items: Iterable[ThreadedItem]) -> None:
+         for item in items:
+             for q in self._outputs:
+                 if not q.put(item):
+                     _log.error("Output queue closed while emitting from %s", self.name)
+
+
+ @dataclass
+ class RunContext:
+     """Wiring for a single *execute* call."""
+
+     stages: list[ThreadedPipelineStage]
+     first_stage: ThreadedPipelineStage
+     output_queue: ThreadedQueue
+
+
+ # ──────────────────────────────────────────────────────────────────────────────
+ # Main pipeline
+ # ──────────────────────────────────────────────────────────────────────────────
+
+
+ class ThreadedStandardPdfPipeline(BasePipeline):
+     """High-performance PDF pipeline with multi-threaded stages."""
+
+     def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
+         super().__init__(pipeline_options)
+         self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
+         self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
+
+         # initialise heavy models once
+         self._init_models()
+
+     # ────────────────────────────────────────────────────────────────────────
+     # Heavy-model initialisation & helpers
+     # ────────────────────────────────────────────────────────────────────────
+
+     def _init_models(self) -> None:
+         art_path = self._resolve_artifacts_path()
+         self.keep_images = (
+             self.pipeline_options.generate_page_images
+             or self.pipeline_options.generate_picture_images
+             or self.pipeline_options.generate_table_images
+         )
+         self.preprocessing_model = PagePreprocessingModel(
+             options=PagePreprocessingOptions(
+                 images_scale=self.pipeline_options.images_scale
+             )
+         )
+         self.ocr_model = self._make_ocr_model(art_path)
+         self.layout_model = LayoutModel(
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+             options=self.pipeline_options.layout_options,
+         )
+         self.table_model = TableStructureModel(
+             enabled=self.pipeline_options.do_table_structure,
+             artifacts_path=art_path,
+             options=self.pipeline_options.table_structure_options,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
+         self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+
+         # --- optional enrichment ------------------------------------------------
+         self.enrichment_pipe = []
+         code_formula = CodeFormulaModel(
+             enabled=self.pipeline_options.do_code_enrichment
+             or self.pipeline_options.do_formula_enrichment,
+             artifacts_path=art_path,
+             options=CodeFormulaModelOptions(
+                 do_code_enrichment=self.pipeline_options.do_code_enrichment,
+                 do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
+             ),
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         if code_formula.enabled:
+             self.enrichment_pipe.append(code_formula)
+
+         picture_classifier = DocumentPictureClassifier(
+             enabled=self.pipeline_options.do_picture_classification,
+             artifacts_path=art_path,
+             options=DocumentPictureClassifierOptions(),
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+         if picture_classifier.enabled:
+             self.enrichment_pipe.append(picture_classifier)
+
+         picture_descr = self._make_picture_description_model(art_path)
+         if picture_descr and picture_descr.enabled:
+             self.enrichment_pipe.append(picture_descr)
+
+         self.keep_backend = any(
+             (
+                 self.pipeline_options.do_formula_enrichment,
+                 self.pipeline_options.do_code_enrichment,
+                 self.pipeline_options.do_picture_classification,
+                 self.pipeline_options.do_picture_description,
+             )
+         )
+
+     # ---------------------------------------------------------------- helpers
+     def _resolve_artifacts_path(self) -> Optional[Path]:
+         if self.pipeline_options.artifacts_path:
+             p = Path(self.pipeline_options.artifacts_path).expanduser()
+         elif settings.artifacts_path:
+             p = Path(settings.artifacts_path).expanduser()
+         else:
+             return None
+         if not p.is_dir():
+             raise RuntimeError(
+                 f"{p} does not exist or is not a directory containing the required models"
+             )
+         return p
+
+     def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
+         factory = get_ocr_factory(
+             allow_external_plugins=self.pipeline_options.allow_external_plugins
+         )
+         return factory.create_instance(
+             options=self.pipeline_options.ocr_options,
+             enabled=self.pipeline_options.do_ocr,
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+
+     def _make_picture_description_model(
+         self, art_path: Optional[Path]
+     ) -> Optional[PictureDescriptionBaseModel]:
+         factory = get_picture_description_factory(
+             allow_external_plugins=self.pipeline_options.allow_external_plugins
+         )
+         return factory.create_instance(
+             options=self.pipeline_options.picture_description_options,
+             enabled=self.pipeline_options.do_picture_description,
+             enable_remote_services=self.pipeline_options.enable_remote_services,
+             artifacts_path=art_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+
+     # ────────────────────────────────────────────────────────────────────────
+     # Build - thread pipeline
+     # ────────────────────────────────────────────────────────────────────────
+
+     def _create_run_ctx(self) -> RunContext:
+         opts = self.pipeline_options
+         preprocess = ThreadedPipelineStage(
+             name="preprocess",
+             model=self.preprocessing_model,
+             batch_size=1,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         ocr = ThreadedPipelineStage(
+             name="ocr",
+             model=self.ocr_model,
+             batch_size=opts.ocr_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         layout = ThreadedPipelineStage(
+             name="layout",
+             model=self.layout_model,
+             batch_size=opts.layout_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         table = ThreadedPipelineStage(
+             name="table",
+             model=self.table_model,
+             batch_size=opts.table_batch_size,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+         assemble = ThreadedPipelineStage(
+             name="assemble",
+             model=self.assemble_model,
+             batch_size=1,
+             batch_timeout=opts.batch_timeout_seconds,
+             queue_max_size=opts.queue_max_size,
+         )
+
+         # wire stages
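+         # preprocess -> ocr -> layout -> table -> assemble -> output_q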
+         output_q = ThreadedQueue(opts.queue_max_size)
+         preprocess.add_output_queue(ocr.input_queue)
+         ocr.add_output_queue(layout.input_queue)
+         layout.add_output_queue(table.input_queue)
+         table.add_output_queue(assemble.input_queue)
+         assemble.add_output_queue(output_q)
+
+         stages = [preprocess, ocr, layout, table, assemble]
+         return RunContext(stages=stages, first_stage=preprocess, output_queue=output_q)
+
+     # --------------------------------------------------------------------- build
+     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+         """Stream-build the document while interleaving producer and consumer work."""
+         run_id = next(self._run_seq)
+         assert isinstance(conv_res.input._backend, PdfDocumentBackend)
+         backend = conv_res.input._backend
+
+         # preload & initialise pages -------------------------------------------------------------
+         start_page, end_page = conv_res.input.limits.page_range
+         pages: list[Page] = []
+         for i in range(conv_res.input.page_count):
+             if start_page - 1 <= i <= end_page - 1:
+                 page = Page(page_no=i)
+                 page._backend = backend.load_page(i)
+                 if page._backend and page._backend.is_valid():
+                     page.size = page._backend.get_size()
+                     conv_res.pages.append(page)
+                     pages.append(page)
+
+         if not pages:
+             conv_res.status = ConversionStatus.FAILURE
+             return conv_res
+
+         total_pages: int = len(pages)
+         ctx: RunContext = self._create_run_ctx()
+         for st in ctx.stages:
+             st.start()
+
+         proc = ProcessingResult(total_expected=total_pages)
+         fed_idx: int = 0  # number of pages successfully queued
+         batch_size: int = 32  # drain chunk
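+         # Interleave feeding and draining so the bounded queues cannot deadlock:
+         # pages are try-put with timeout=0.0, and as soon as the first queue
+         # reports full we switch to draining finished pages from the output side.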
+         try:
+             while proc.success_count + proc.failure_count < total_pages:
+                 # 1) feed - try to enqueue until the first queue is full
+                 while fed_idx < total_pages:
+                     ok = ctx.first_stage.input_queue.put(
+                         ThreadedItem(
+                             payload=pages[fed_idx],
+                             run_id=run_id,
+                             page_no=pages[fed_idx].page_no,
+                             conv_res=conv_res,
+                         ),
+                         timeout=0.0,  # non-blocking try-put
+                     )
+                     if ok:
+                         fed_idx += 1
+                         if fed_idx == total_pages:
+                             ctx.first_stage.input_queue.close()
+                     else:  # queue full - switch to draining
+                         break
+
+                 # 2) drain - pull whatever is ready from the output side
+                 out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
+                 for itm in out_batch:
+                     if itm.run_id != run_id:
+                         continue
+                     if itm.is_failed or itm.error:
+                         proc.failed_pages.append(
+                             (itm.page_no, itm.error or RuntimeError("unknown error"))
+                         )
+                     else:
+                         assert itm.payload is not None
+                         proc.pages.append(itm.payload)
+
+                 # 3) failure safety - downstream closed early -> mark missing pages failed
+                 if not out_batch and ctx.output_queue.closed:
+                     missing = total_pages - (proc.success_count + proc.failure_count)
+                     if missing > 0:
+                         proc.failed_pages.extend(
+                             [(-1, RuntimeError("pipeline terminated early"))] * missing
+                         )
+                     break
+         finally:
+             for st in ctx.stages:
+                 st.stop()
+             ctx.output_queue.close()
+
+         self._integrate_results(conv_res, proc)
+         return conv_res
+
+     # ---------------------------------------------------- integrate_results()
+     def _integrate_results(
+         self, conv_res: ConversionResult, proc: ProcessingResult
+     ) -> None:
+         page_map = {p.page_no: p for p in proc.pages}
+         conv_res.pages = [
+             page_map.get(p.page_no, p)
+             for p in conv_res.pages
+             if p.page_no in page_map
+             or not any(fp == p.page_no for fp, _ in proc.failed_pages)
+         ]
+         if proc.is_complete_failure:
+             conv_res.status = ConversionStatus.FAILURE
+         elif proc.is_partial_success:
+             conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+         else:
+             conv_res.status = ConversionStatus.SUCCESS
+         if not self.keep_images:
+             for p in conv_res.pages:
+                 p._image_cache = {}
+         if not self.keep_backend:
+             for p in conv_res.pages:
+                 if p._backend is not None:
+                     p._backend.unload()
+
+     # ---------------------------------------------------------------- assemble
+     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+         elements, headers, body = [], [], []
+         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+             for p in conv_res.pages:
+                 if p.assembled:
+                     elements.extend(p.assembled.elements)
+                     headers.extend(p.assembled.headers)
+                     body.extend(p.assembled.body)
+             conv_res.assembled = AssembledUnit(
+                 elements=elements, headers=headers, body=body
+             )
+             conv_res.document = self.reading_order_model(conv_res)
+         return conv_res
+
+     # ---------------------------------------------------------------- misc
+     @classmethod
+     def get_default_options(cls) -> ThreadedPdfPipelineOptions:
+         return ThreadedPdfPipelineOptions()
+
+     @classmethod
+     def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
+         return isinstance(backend, PdfDocumentBackend)
+
+     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+         return conv_res.status
+
+     def _unload(self, conv_res: ConversionResult) -> None:
+         for p in conv_res.pages:
+             if p._backend is not None:
+                 p._backend.unload()
+         if conv_res.input._backend:
+             conv_res.input._backend.unload()
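
A minimal usage sketch for the new pipeline (illustrative, not part of the diff): it assumes the module ships as `docling.pipeline.threaded_standard_pdf_pipeline` and that `PdfFormatOption` accepts a `pipeline_cls` override, as in prior docling releases; the option fields are the ones read in `_create_run_ctx` above, with made-up values.

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline

# Batch sizes, the batching timeout, and the queue bound mirror the fields
# read in _create_run_ctx(); the concrete values here are illustrative only.
opts = ThreadedPdfPipelineOptions(
    ocr_batch_size=4,
    layout_batch_size=4,
    table_batch_size=4,
    batch_timeout_seconds=2.0,
    queue_max_size=100,
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=ThreadedStandardPdfPipeline,
            pipeline_options=opts,
        )
    }
)
result = converter.convert("example.pdf")  # hypothetical input path
print(result.status, len(result.pages))
```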
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docling
- Version: 2.42.2
+ Version: 2.44.0
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
  License-Expression: MIT
@@ -28,9 +28,9 @@ License-File: LICENSE
  Requires-Dist: pydantic<3.0.0,>=2.0.0
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
  Requires-Dist: docling-parse<5.0.0,>=4.0.0
- Requires-Dist: docling-ibm-models<4,>=3.6.0
+ Requires-Dist: docling-ibm-models<4,>=3.9.0
  Requires-Dist: filetype<2.0.0,>=1.2.0
- Requires-Dist: pypdfium2<5.0.0,>=4.30.0
+ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
  Requires-Dist: huggingface_hub<1,>=0.23
  Requires-Dist: requests<3.0.0,>=2.32.2
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
  Provides-Extra: vlm
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
- Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
+ Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
  Provides-Extra: rapidocr
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"