docling 2.59.0__tar.gz → 2.60.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (153)
  1. {docling-2.59.0 → docling-2.60.0}/PKG-INFO +1 -1
  2. {docling-2.59.0 → docling-2.60.0}/docling/backend/pypdfium2_backend.py +4 -4
  3. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/pipeline_options.py +13 -10
  4. {docling-2.59.0 → docling-2.60.0}/docling/models/layout_model.py +4 -0
  5. docling-2.59.0/docling/pipeline/standard_pdf_pipeline.py → docling-2.60.0/docling/pipeline/legacy_standard_pdf_pipeline.py +2 -2
  6. docling-2.59.0/docling/pipeline/threaded_standard_pdf_pipeline.py → docling-2.60.0/docling/pipeline/standard_pdf_pipeline.py +101 -19
  7. docling-2.60.0/docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  8. {docling-2.59.0 → docling-2.60.0}/docling.egg-info/PKG-INFO +1 -1
  9. {docling-2.59.0 → docling-2.60.0}/docling.egg-info/SOURCES.txt +1 -0
  10. {docling-2.59.0 → docling-2.60.0}/pyproject.toml +1 -1
  11. {docling-2.59.0 → docling-2.60.0}/tests/test_threaded_pipeline.py +23 -1
  12. {docling-2.59.0 → docling-2.60.0}/LICENSE +0 -0
  13. {docling-2.59.0 → docling-2.60.0}/README.md +0 -0
  14. {docling-2.59.0 → docling-2.60.0}/docling/__init__.py +0 -0
  15. {docling-2.59.0 → docling-2.60.0}/docling/backend/__init__.py +0 -0
  16. {docling-2.59.0 → docling-2.60.0}/docling/backend/abstract_backend.py +0 -0
  17. {docling-2.59.0 → docling-2.60.0}/docling/backend/asciidoc_backend.py +0 -0
  18. {docling-2.59.0 → docling-2.60.0}/docling/backend/csv_backend.py +0 -0
  19. {docling-2.59.0 → docling-2.60.0}/docling/backend/docling_parse_backend.py +0 -0
  20. {docling-2.59.0 → docling-2.60.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  21. {docling-2.59.0 → docling-2.60.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  22. {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/__init__.py +0 -0
  23. {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/drawingml/utils.py +0 -0
  24. {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/latex/__init__.py +0 -0
  25. {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  26. {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/latex/omml.py +0 -0
  27. {docling-2.59.0 → docling-2.60.0}/docling/backend/html_backend.py +0 -0
  28. {docling-2.59.0 → docling-2.60.0}/docling/backend/json/__init__.py +0 -0
  29. {docling-2.59.0 → docling-2.60.0}/docling/backend/json/docling_json_backend.py +0 -0
  30. {docling-2.59.0 → docling-2.60.0}/docling/backend/md_backend.py +0 -0
  31. {docling-2.59.0 → docling-2.60.0}/docling/backend/mets_gbs_backend.py +0 -0
  32. {docling-2.59.0 → docling-2.60.0}/docling/backend/msexcel_backend.py +0 -0
  33. {docling-2.59.0 → docling-2.60.0}/docling/backend/mspowerpoint_backend.py +0 -0
  34. {docling-2.59.0 → docling-2.60.0}/docling/backend/msword_backend.py +0 -0
  35. {docling-2.59.0 → docling-2.60.0}/docling/backend/noop_backend.py +0 -0
  36. {docling-2.59.0 → docling-2.60.0}/docling/backend/pdf_backend.py +0 -0
  37. {docling-2.59.0 → docling-2.60.0}/docling/backend/webvtt_backend.py +0 -0
  38. {docling-2.59.0 → docling-2.60.0}/docling/backend/xml/__init__.py +0 -0
  39. {docling-2.59.0 → docling-2.60.0}/docling/backend/xml/jats_backend.py +0 -0
  40. {docling-2.59.0 → docling-2.60.0}/docling/backend/xml/uspto_backend.py +0 -0
  41. {docling-2.59.0 → docling-2.60.0}/docling/chunking/__init__.py +0 -0
  42. {docling-2.59.0 → docling-2.60.0}/docling/cli/__init__.py +0 -0
  43. {docling-2.59.0 → docling-2.60.0}/docling/cli/main.py +0 -0
  44. {docling-2.59.0 → docling-2.60.0}/docling/cli/models.py +0 -0
  45. {docling-2.59.0 → docling-2.60.0}/docling/cli/tools.py +0 -0
  46. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/__init__.py +0 -0
  47. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/accelerator_options.py +0 -0
  48. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/asr_model_specs.py +0 -0
  49. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/backend_options.py +0 -0
  50. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/base_models.py +0 -0
  51. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/document.py +0 -0
  52. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/extraction.py +0 -0
  53. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/layout_model_specs.py +0 -0
  54. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  55. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  56. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/settings.py +0 -0
  57. {docling-2.59.0 → docling-2.60.0}/docling/datamodel/vlm_model_specs.py +0 -0
  58. {docling-2.59.0 → docling-2.60.0}/docling/document_converter.py +0 -0
  59. {docling-2.59.0 → docling-2.60.0}/docling/document_extractor.py +0 -0
  60. {docling-2.59.0 → docling-2.60.0}/docling/exceptions.py +0 -0
  61. {docling-2.59.0 → docling-2.60.0}/docling/models/__init__.py +0 -0
  62. {docling-2.59.0 → docling-2.60.0}/docling/models/api_vlm_model.py +0 -0
  63. {docling-2.59.0 → docling-2.60.0}/docling/models/auto_ocr_model.py +0 -0
  64. {docling-2.59.0 → docling-2.60.0}/docling/models/base_model.py +0 -0
  65. {docling-2.59.0 → docling-2.60.0}/docling/models/base_ocr_model.py +0 -0
  66. {docling-2.59.0 → docling-2.60.0}/docling/models/code_formula_model.py +0 -0
  67. {docling-2.59.0 → docling-2.60.0}/docling/models/document_picture_classifier.py +0 -0
  68. {docling-2.59.0 → docling-2.60.0}/docling/models/easyocr_model.py +0 -0
  69. {docling-2.59.0 → docling-2.60.0}/docling/models/factories/__init__.py +0 -0
  70. {docling-2.59.0 → docling-2.60.0}/docling/models/factories/base_factory.py +0 -0
  71. {docling-2.59.0 → docling-2.60.0}/docling/models/factories/ocr_factory.py +0 -0
  72. {docling-2.59.0 → docling-2.60.0}/docling/models/factories/picture_description_factory.py +0 -0
  73. {docling-2.59.0 → docling-2.60.0}/docling/models/ocr_mac_model.py +0 -0
  74. {docling-2.59.0 → docling-2.60.0}/docling/models/page_assemble_model.py +0 -0
  75. {docling-2.59.0 → docling-2.60.0}/docling/models/page_preprocessing_model.py +0 -0
  76. {docling-2.59.0 → docling-2.60.0}/docling/models/picture_description_api_model.py +0 -0
  77. {docling-2.59.0 → docling-2.60.0}/docling/models/picture_description_base_model.py +0 -0
  78. {docling-2.59.0 → docling-2.60.0}/docling/models/picture_description_vlm_model.py +0 -0
  79. {docling-2.59.0 → docling-2.60.0}/docling/models/plugins/__init__.py +0 -0
  80. {docling-2.59.0 → docling-2.60.0}/docling/models/plugins/defaults.py +0 -0
  81. {docling-2.59.0 → docling-2.60.0}/docling/models/rapid_ocr_model.py +0 -0
  82. {docling-2.59.0 → docling-2.60.0}/docling/models/readingorder_model.py +0 -0
  83. {docling-2.59.0 → docling-2.60.0}/docling/models/table_structure_model.py +0 -0
  84. {docling-2.59.0 → docling-2.60.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  85. {docling-2.59.0 → docling-2.60.0}/docling/models/tesseract_ocr_model.py +0 -0
  86. {docling-2.59.0 → docling-2.60.0}/docling/models/utils/__init__.py +0 -0
  87. {docling-2.59.0 → docling-2.60.0}/docling/models/utils/generation_utils.py +0 -0
  88. {docling-2.59.0 → docling-2.60.0}/docling/models/utils/hf_model_download.py +0 -0
  89. {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  90. {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  91. {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  92. {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
  93. {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  94. {docling-2.59.0 → docling-2.60.0}/docling/pipeline/__init__.py +0 -0
  95. {docling-2.59.0 → docling-2.60.0}/docling/pipeline/asr_pipeline.py +0 -0
  96. {docling-2.59.0 → docling-2.60.0}/docling/pipeline/base_extraction_pipeline.py +0 -0
  97. {docling-2.59.0 → docling-2.60.0}/docling/pipeline/base_pipeline.py +0 -0
  98. {docling-2.59.0 → docling-2.60.0}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
  99. {docling-2.59.0 → docling-2.60.0}/docling/pipeline/simple_pipeline.py +0 -0
  100. {docling-2.59.0 → docling-2.60.0}/docling/pipeline/vlm_pipeline.py +0 -0
  101. {docling-2.59.0 → docling-2.60.0}/docling/py.typed +0 -0
  102. {docling-2.59.0 → docling-2.60.0}/docling/utils/__init__.py +0 -0
  103. {docling-2.59.0 → docling-2.60.0}/docling/utils/accelerator_utils.py +0 -0
  104. {docling-2.59.0 → docling-2.60.0}/docling/utils/api_image_request.py +0 -0
  105. {docling-2.59.0 → docling-2.60.0}/docling/utils/export.py +0 -0
  106. {docling-2.59.0 → docling-2.60.0}/docling/utils/glm_utils.py +0 -0
  107. {docling-2.59.0 → docling-2.60.0}/docling/utils/layout_postprocessor.py +0 -0
  108. {docling-2.59.0 → docling-2.60.0}/docling/utils/locks.py +0 -0
  109. {docling-2.59.0 → docling-2.60.0}/docling/utils/model_downloader.py +0 -0
  110. {docling-2.59.0 → docling-2.60.0}/docling/utils/ocr_utils.py +0 -0
  111. {docling-2.59.0 → docling-2.60.0}/docling/utils/orientation.py +0 -0
  112. {docling-2.59.0 → docling-2.60.0}/docling/utils/profiling.py +0 -0
  113. {docling-2.59.0 → docling-2.60.0}/docling/utils/utils.py +0 -0
  114. {docling-2.59.0 → docling-2.60.0}/docling/utils/visualization.py +0 -0
  115. {docling-2.59.0 → docling-2.60.0}/docling.egg-info/dependency_links.txt +0 -0
  116. {docling-2.59.0 → docling-2.60.0}/docling.egg-info/entry_points.txt +0 -0
  117. {docling-2.59.0 → docling-2.60.0}/docling.egg-info/requires.txt +0 -0
  118. {docling-2.59.0 → docling-2.60.0}/docling.egg-info/top_level.txt +0 -0
  119. {docling-2.59.0 → docling-2.60.0}/setup.cfg +0 -0
  120. {docling-2.59.0 → docling-2.60.0}/tests/test_asr_mlx_whisper.py +0 -0
  121. {docling-2.59.0 → docling-2.60.0}/tests/test_asr_pipeline.py +0 -0
  122. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_asciidoc.py +0 -0
  123. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_csv.py +0 -0
  124. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_docling_json.py +0 -0
  125. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_docling_parse.py +0 -0
  126. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_docling_parse_v2.py +0 -0
  127. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_docling_parse_v4.py +0 -0
  128. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_html.py +0 -0
  129. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_jats.py +0 -0
  130. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_markdown.py +0 -0
  131. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_mets_gbs.py +0 -0
  132. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_msexcel.py +0 -0
  133. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_msword.py +0 -0
  134. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_patent_uspto.py +0 -0
  135. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_pdfium.py +0 -0
  136. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_pptx.py +0 -0
  137. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_vtt.py +0 -0
  138. {docling-2.59.0 → docling-2.60.0}/tests/test_backend_webp.py +0 -0
  139. {docling-2.59.0 → docling-2.60.0}/tests/test_cli.py +0 -0
  140. {docling-2.59.0 → docling-2.60.0}/tests/test_code_formula.py +0 -0
  141. {docling-2.59.0 → docling-2.60.0}/tests/test_data_gen_flag.py +0 -0
  142. {docling-2.59.0 → docling-2.60.0}/tests/test_document_picture_classifier.py +0 -0
  143. {docling-2.59.0 → docling-2.60.0}/tests/test_e2e_conversion.py +0 -0
  144. {docling-2.59.0 → docling-2.60.0}/tests/test_e2e_ocr_conversion.py +0 -0
  145. {docling-2.59.0 → docling-2.60.0}/tests/test_extraction.py +0 -0
  146. {docling-2.59.0 → docling-2.60.0}/tests/test_input_doc.py +0 -0
  147. {docling-2.59.0 → docling-2.60.0}/tests/test_interfaces.py +0 -0
  148. {docling-2.59.0 → docling-2.60.0}/tests/test_invalid_input.py +0 -0
  149. {docling-2.59.0 → docling-2.60.0}/tests/test_legacy_format_transform.py +0 -0
  150. {docling-2.59.0 → docling-2.60.0}/tests/test_ocr_utils.py +0 -0
  151. {docling-2.59.0 → docling-2.60.0}/tests/test_options.py +0 -0
  152. {docling-2.59.0 → docling-2.60.0}/tests/test_pdf_password.py +0 -0
  153. {docling-2.59.0 → docling-2.60.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.59.0
3
+ Version: 2.60.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -229,10 +229,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
229
229
  b=max(cell.rect.to_bounding_box().b for cell in group),
230
230
  )
231
231
 
232
- assert self._ppage is not None
233
- self.text_page = self._ppage.get_textpage()
232
+ assert self.text_page is not None
234
233
  bbox = merged_bbox.to_bottom_left_origin(page_size.height)
235
- merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
234
+ with pypdfium2_lock:
235
+ merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
236
236
 
237
237
  return TextCell(
238
238
  index=group[0].index,
@@ -255,9 +255,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
255
255
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
256
256
  AREA_THRESHOLD = 0 # 32 * 32
257
257
  page_size = self.get_size()
258
- rotation = self._ppage.get_rotation()
259
258
 
260
259
  with pypdfium2_lock:
260
+ rotation = self._ppage.get_rotation()
261
261
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
262
262
  pos = obj.get_pos()
263
263
  if rotation == 90:
@@ -361,15 +361,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
361
361
 
362
362
  generate_parsed_pages: bool = False
363
363
 
364
-
365
- class ProcessingPipeline(str, Enum):
366
- STANDARD = "standard"
367
- VLM = "vlm"
368
- ASR = "asr"
369
-
370
-
371
- class ThreadedPdfPipelineOptions(PdfPipelineOptions):
372
- """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
364
+ ### Arguments for threaded PDF pipeline with batching and backpressure control
373
365
 
374
366
  # Batch sizes for different stages
375
367
  ocr_batch_size: int = 4
@@ -377,7 +369,18 @@ class ThreadedPdfPipelineOptions(PdfPipelineOptions):
377
369
  table_batch_size: int = 4
378
370
 
379
371
  # Timing control
380
- batch_timeout_seconds: float = 2.0
372
+ batch_polling_interval_seconds: float = 0.5
381
373
 
382
374
  # Backpressure and queue control
383
375
  queue_max_size: int = 100
376
+
377
+
378
+ class ProcessingPipeline(str, Enum):
379
+ LEGACY = "legacy"
380
+ STANDARD = "standard"
381
+ VLM = "vlm"
382
+ ASR = "asr"
383
+
384
+
385
+ class ThreadedPdfPipelineOptions(PdfPipelineOptions):
386
+ """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
@@ -167,6 +167,10 @@ class LayoutModel(BasePageModel):
167
167
  valid_pages.append(page)
168
168
  valid_page_images.append(page_image)
169
169
 
170
+ print(f"{len(pages)=}, {pages[0].page_no}-{pages[-1].page_no}")
171
+ print(f"{len(valid_pages)=}")
172
+ print(f"{len(valid_page_images)=}")
173
+
170
174
  # Process all valid pages with batch prediction
171
175
  batch_predictions = []
172
176
  if valid_page_images:
@@ -31,7 +31,7 @@ from docling.utils.profiling import ProfilingScope, TimeRecorder
31
31
  _log = logging.getLogger(__name__)
32
32
 
33
33
 
34
- class StandardPdfPipeline(PaginatedPipeline):
34
+ class LegacyStandardPdfPipeline(PaginatedPipeline):
35
35
  def __init__(self, pipeline_options: PdfPipelineOptions):
36
36
  super().__init__(pipeline_options)
37
37
  self.pipeline_options: PdfPipelineOptions
@@ -102,7 +102,7 @@ class StandardPdfPipeline(PaginatedPipeline):
102
102
  local_dir: Optional[Path] = None, force: bool = False
103
103
  ) -> Path:
104
104
  warnings.warn(
105
- "The usage of StandardPdfPipeline.download_models_hf() is deprecated "
105
+ "The usage of LegacyStandardPdfPipeline.download_models_hf() is deprecated "
106
106
  "use instead the utility `docling-tools models download`, or "
107
107
  "the upstream method docling.utils.models_downloader.download_all()",
108
108
  DeprecationWarning,
@@ -1,4 +1,3 @@
1
- # threaded_standard_pdf_pipeline.py
2
1
  """Thread-safe, production-ready PDF pipeline
3
2
  ================================================
4
3
  A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
@@ -24,7 +23,7 @@ import warnings
24
23
  from collections import defaultdict, deque
25
24
  from dataclasses import dataclass, field
26
25
  from pathlib import Path
27
- from typing import Any, Iterable, List, Optional, Sequence, Tuple, cast
26
+ from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast
28
27
 
29
28
  import numpy as np
30
29
  from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
@@ -173,6 +172,7 @@ class ThreadedPipelineStage:
173
172
  batch_size: int,
174
173
  batch_timeout: float,
175
174
  queue_max_size: int,
175
+ postprocess: Optional[Callable[[ThreadedItem], None]] = None,
176
176
  ) -> None:
177
177
  self.name = name
178
178
  self.model = model
@@ -182,6 +182,7 @@ class ThreadedPipelineStage:
182
182
  self._outputs: list[ThreadedQueue] = []
183
183
  self._thread: Optional[threading.Thread] = None
184
184
  self._running = False
185
+ self._postprocess = postprocess
185
186
 
186
187
  # ---------------------------------------------------------------- wiring
187
188
  def add_output_queue(self, q: ThreadedQueue) -> None:
@@ -274,11 +275,85 @@ class ThreadedPipelineStage:
274
275
  # -------------------------------------------------------------- _emit()
275
276
  def _emit(self, items: Iterable[ThreadedItem]) -> None:
276
277
  for item in items:
278
+ if self._postprocess is not None:
279
+ self._postprocess(item)
277
280
  for q in self._outputs:
278
281
  if not q.put(item):
279
282
  _log.error("Output queue closed while emitting from %s", self.name)
280
283
 
281
284
 
285
+ class PreprocessThreadedStage(ThreadedPipelineStage):
286
+ """Pipeline stage that lazily loads PDF backends just-in-time."""
287
+
288
+ def __init__(
289
+ self,
290
+ *,
291
+ batch_timeout: float,
292
+ queue_max_size: int,
293
+ model: Any,
294
+ ) -> None:
295
+ super().__init__(
296
+ name="preprocess",
297
+ model=model,
298
+ batch_size=1,
299
+ batch_timeout=batch_timeout,
300
+ queue_max_size=queue_max_size,
301
+ )
302
+
303
+ def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
304
+ groups: dict[int, list[ThreadedItem]] = defaultdict(list)
305
+ for itm in batch:
306
+ groups[itm.run_id].append(itm)
307
+
308
+ result: list[ThreadedItem] = []
309
+ for rid, items in groups.items():
310
+ good = [i for i in items if not i.is_failed]
311
+ if not good:
312
+ result.extend(items)
313
+ continue
314
+ try:
315
+ pages_with_payloads: list[tuple[ThreadedItem, Page]] = []
316
+ for it in good:
317
+ page = it.payload
318
+ if page is None:
319
+ raise RuntimeError("Page payload is None")
320
+ if page._backend is None:
321
+ backend = it.conv_res.input._backend
322
+ assert isinstance(backend, PdfDocumentBackend), (
323
+ "Threaded pipeline only supports PdfDocumentBackend."
324
+ )
325
+ page_backend = backend.load_page(page.page_no)
326
+ page._backend = page_backend
327
+ if page_backend.is_valid():
328
+ page.size = page_backend.get_size()
329
+ pages_with_payloads.append((it, page))
330
+
331
+ pages = [payload for _, payload in pages_with_payloads]
332
+ processed_pages = list(
333
+ self.model(good[0].conv_res, pages) # type: ignore[arg-type]
334
+ )
335
+ if len(processed_pages) != len(pages):
336
+ raise RuntimeError(
337
+ "PagePreprocessingModel returned unexpected number of pages"
338
+ )
339
+ for idx, processed_page in enumerate(processed_pages):
340
+ result.append(
341
+ ThreadedItem(
342
+ payload=processed_page,
343
+ run_id=rid,
344
+ page_no=good[idx].page_no,
345
+ conv_res=good[idx].conv_res,
346
+ )
347
+ )
348
+ except Exception as exc:
349
+ _log.error("Stage preprocess failed for run %d: %s", rid, exc)
350
+ for it in items:
351
+ it.is_failed = True
352
+ it.error = exc
353
+ result.extend(items)
354
+ return result
355
+
356
+
282
357
  @dataclass
283
358
  class RunContext:
284
359
  """Wiring for a single *execute* call."""
@@ -293,7 +368,7 @@ class RunContext:
293
368
  # ──────────────────────────────────────────────────────────────────────────────
294
369
 
295
370
 
296
- class ThreadedStandardPdfPipeline(ConvertPipeline):
371
+ class StandardPdfPipeline(ConvertPipeline):
297
372
  """High-performance PDF pipeline with multi-threaded stages."""
298
373
 
299
374
  def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
@@ -372,46 +447,57 @@ class ThreadedStandardPdfPipeline(ConvertPipeline):
372
447
  accelerator_options=self.pipeline_options.accelerator_options,
373
448
  )
374
449
 
450
+ def _release_page_resources(self, item: ThreadedItem) -> None:
451
+ page = item.payload
452
+ if page is None:
453
+ return
454
+ if not self.keep_images:
455
+ page._image_cache = {}
456
+ if not self.keep_backend and page._backend is not None:
457
+ page._backend.unload()
458
+ page._backend = None
459
+ if not self.pipeline_options.generate_parsed_pages:
460
+ page.parsed_page = None
461
+
375
462
  # ────────────────────────────────────────────────────────────────────────
376
463
  # Build - thread pipeline
377
464
  # ────────────────────────────────────────────────────────────────────────
378
465
 
379
466
  def _create_run_ctx(self) -> RunContext:
380
467
  opts = self.pipeline_options
381
- preprocess = ThreadedPipelineStage(
382
- name="preprocess",
383
- model=self.preprocessing_model,
384
- batch_size=1,
385
- batch_timeout=opts.batch_timeout_seconds,
468
+ preprocess = PreprocessThreadedStage(
469
+ batch_timeout=opts.batch_polling_interval_seconds,
386
470
  queue_max_size=opts.queue_max_size,
471
+ model=self.preprocessing_model,
387
472
  )
388
473
  ocr = ThreadedPipelineStage(
389
474
  name="ocr",
390
475
  model=self.ocr_model,
391
476
  batch_size=opts.ocr_batch_size,
392
- batch_timeout=opts.batch_timeout_seconds,
477
+ batch_timeout=opts.batch_polling_interval_seconds,
393
478
  queue_max_size=opts.queue_max_size,
394
479
  )
395
480
  layout = ThreadedPipelineStage(
396
481
  name="layout",
397
482
  model=self.layout_model,
398
483
  batch_size=opts.layout_batch_size,
399
- batch_timeout=opts.batch_timeout_seconds,
484
+ batch_timeout=opts.batch_polling_interval_seconds,
400
485
  queue_max_size=opts.queue_max_size,
401
486
  )
402
487
  table = ThreadedPipelineStage(
403
488
  name="table",
404
489
  model=self.table_model,
405
490
  batch_size=opts.table_batch_size,
406
- batch_timeout=opts.batch_timeout_seconds,
491
+ batch_timeout=opts.batch_polling_interval_seconds,
407
492
  queue_max_size=opts.queue_max_size,
408
493
  )
409
494
  assemble = ThreadedPipelineStage(
410
495
  name="assemble",
411
496
  model=self.assemble_model,
412
497
  batch_size=1,
413
- batch_timeout=opts.batch_timeout_seconds,
498
+ batch_timeout=opts.batch_polling_interval_seconds,
414
499
  queue_max_size=opts.queue_max_size,
500
+ postprocess=self._release_page_resources,
415
501
  )
416
502
 
417
503
  # wire stages
@@ -430,19 +516,15 @@ class ThreadedStandardPdfPipeline(ConvertPipeline):
430
516
  """Stream-build the document while interleaving producer and consumer work."""
431
517
  run_id = next(self._run_seq)
432
518
  assert isinstance(conv_res.input._backend, PdfDocumentBackend)
433
- backend = conv_res.input._backend
434
519
 
435
- # preload & initialise pages -------------------------------------------------------------
520
+ # Collect page placeholders; backends are loaded lazily in preprocess stage
436
521
  start_page, end_page = conv_res.input.limits.page_range
437
522
  pages: list[Page] = []
438
523
  for i in range(conv_res.input.page_count):
439
524
  if start_page - 1 <= i <= end_page - 1:
440
525
  page = Page(page_no=i)
441
- page._backend = backend.load_page(i)
442
- if page._backend and page._backend.is_valid():
443
- page.size = page._backend.get_size()
444
- conv_res.pages.append(page)
445
- pages.append(page)
526
+ conv_res.pages.append(page)
527
+ pages.append(page)
446
528
 
447
529
  if not pages:
448
530
  conv_res.status = ConversionStatus.FAILURE
@@ -0,0 +1,5 @@
1
+ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
2
+
3
+
4
+ class ThreadedStandardPdfPipeline(StandardPdfPipeline):
5
+ """Backwards compatible import for ThreadedStandardPdfPipeline."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.59.0
3
+ Version: 2.60.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -96,6 +96,7 @@ docling/pipeline/asr_pipeline.py
96
96
  docling/pipeline/base_extraction_pipeline.py
97
97
  docling/pipeline/base_pipeline.py
98
98
  docling/pipeline/extraction_vlm_pipeline.py
99
+ docling/pipeline/legacy_standard_pdf_pipeline.py
99
100
  docling/pipeline/simple_pipeline.py
100
101
  docling/pipeline/standard_pdf_pipeline.py
101
102
  docling/pipeline/threaded_standard_pdf_pipeline.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.59.0" # DO NOT EDIT, updated automatically
3
+ version = "2.60.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -5,6 +5,7 @@ from typing import List
5
5
 
6
6
  import pytest
7
7
 
8
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
8
9
  from docling.datamodel.base_models import ConversionStatus, InputFormat
9
10
  from docling.datamodel.document import ConversionResult
10
11
  from docling.datamodel.pipeline_options import (
@@ -41,7 +42,7 @@ def test_threaded_pipeline_multiple_documents():
41
42
  layout_batch_size=1,
42
43
  table_batch_size=1,
43
44
  ocr_batch_size=1,
44
- batch_timeout_seconds=1.0,
45
+ batch_polling_interval_seconds=1.0,
45
46
  do_table_structure=do_ts,
46
47
  do_ocr=do_ocr,
47
48
  ),
@@ -171,6 +172,27 @@ def test_pipeline_comparison():
171
172
  assert len(sync_doc.texts) == len(threaded_doc.texts)
172
173
 
173
174
 
175
+ def test_pypdfium_threaded_pipeline():
176
+ doc_converter = (
177
+ DocumentConverter( # all of the below is optional, has internal defaults.
178
+ format_options={
179
+ InputFormat.PDF: PdfFormatOption(
180
+ pipeline_cls=ThreadedStandardPdfPipeline,
181
+ backend=PyPdfiumDocumentBackend,
182
+ ),
183
+ },
184
+ )
185
+ )
186
+
187
+ test_file = "tests/data/pdf/2206.01062.pdf"
188
+ for i in range(6):
189
+ print(f"iteration {i=}")
190
+ conv_result = doc_converter.convert(test_file)
191
+ assert conv_result.status == ConversionStatus.SUCCESS
192
+ print(f"[{i=}] Success")
193
+ print("All done!")
194
+
195
+
174
196
  if __name__ == "__main__":
175
197
  # Run basic performance test
176
198
  test_pipeline_comparison()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes