docling 2.59.0__tar.gz → 2.60.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- {docling-2.59.0 → docling-2.60.0}/PKG-INFO +1 -1
- {docling-2.59.0 → docling-2.60.0}/docling/backend/pypdfium2_backend.py +4 -4
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/pipeline_options.py +13 -10
- {docling-2.59.0 → docling-2.60.0}/docling/models/layout_model.py +4 -0
- docling-2.59.0/docling/pipeline/standard_pdf_pipeline.py → docling-2.60.0/docling/pipeline/legacy_standard_pdf_pipeline.py +2 -2
- docling-2.59.0/docling/pipeline/threaded_standard_pdf_pipeline.py → docling-2.60.0/docling/pipeline/standard_pdf_pipeline.py +101 -19
- docling-2.60.0/docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- {docling-2.59.0 → docling-2.60.0}/docling.egg-info/PKG-INFO +1 -1
- {docling-2.59.0 → docling-2.60.0}/docling.egg-info/SOURCES.txt +1 -0
- {docling-2.59.0 → docling-2.60.0}/pyproject.toml +1 -1
- {docling-2.59.0 → docling-2.60.0}/tests/test_threaded_pipeline.py +23 -1
- {docling-2.59.0 → docling-2.60.0}/LICENSE +0 -0
- {docling-2.59.0 → docling-2.60.0}/README.md +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/drawingml/utils.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/html_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/md_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/webvtt_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/chunking/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/cli/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/cli/main.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/cli/models.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/cli/tools.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/backend_options.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/document.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/extraction.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/settings.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/datamodel/vlm_model_specs.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/document_converter.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/document_extractor.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/exceptions.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/auto_ocr_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/base_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/utils/generation_utils.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/utils/hf_model_download.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/vllm_model.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/pipeline/base_extraction_pipeline.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/py.typed +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/__init__.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/export.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/locks.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/orientation.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/profiling.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/utils.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling/utils/visualization.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling.egg-info/requires.txt +0 -0
- {docling-2.59.0 → docling-2.60.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.59.0 → docling-2.60.0}/setup.cfg +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_asr_mlx_whisper.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_csv.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_html.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_jats.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_msword.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_vtt.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_backend_webp.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_cli.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_code_formula.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_e2e_conversion.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_extraction.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_input_doc.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_interfaces.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_invalid_input.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_options.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_pdf_password.py +0 -0
- {docling-2.59.0 → docling-2.60.0}/tests/test_settings_load.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.59.0
|
|
3
|
+
Version: 2.60.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -229,10 +229,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
|
229
229
|
b=max(cell.rect.to_bounding_box().b for cell in group),
|
|
230
230
|
)
|
|
231
231
|
|
|
232
|
-
assert self.
|
|
233
|
-
self.text_page = self._ppage.get_textpage()
|
|
232
|
+
assert self.text_page is not None
|
|
234
233
|
bbox = merged_bbox.to_bottom_left_origin(page_size.height)
|
|
235
|
-
|
|
234
|
+
with pypdfium2_lock:
|
|
235
|
+
merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
|
|
236
236
|
|
|
237
237
|
return TextCell(
|
|
238
238
|
index=group[0].index,
|
|
@@ -255,9 +255,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
|
255
255
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
|
256
256
|
AREA_THRESHOLD = 0 # 32 * 32
|
|
257
257
|
page_size = self.get_size()
|
|
258
|
-
rotation = self._ppage.get_rotation()
|
|
259
258
|
|
|
260
259
|
with pypdfium2_lock:
|
|
260
|
+
rotation = self._ppage.get_rotation()
|
|
261
261
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
|
262
262
|
pos = obj.get_pos()
|
|
263
263
|
if rotation == 90:
|
|
@@ -361,15 +361,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
|
361
361
|
|
|
362
362
|
generate_parsed_pages: bool = False
|
|
363
363
|
|
|
364
|
-
|
|
365
|
-
class ProcessingPipeline(str, Enum):
|
|
366
|
-
STANDARD = "standard"
|
|
367
|
-
VLM = "vlm"
|
|
368
|
-
ASR = "asr"
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
class ThreadedPdfPipelineOptions(PdfPipelineOptions):
|
|
372
|
-
"""Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
|
|
364
|
+
### Arguments for threaded PDF pipeline with batching and backpressure control
|
|
373
365
|
|
|
374
366
|
# Batch sizes for different stages
|
|
375
367
|
ocr_batch_size: int = 4
|
|
@@ -377,7 +369,18 @@ class ThreadedPdfPipelineOptions(PdfPipelineOptions):
|
|
|
377
369
|
table_batch_size: int = 4
|
|
378
370
|
|
|
379
371
|
# Timing control
|
|
380
|
-
|
|
372
|
+
batch_polling_interval_seconds: float = 0.5
|
|
381
373
|
|
|
382
374
|
# Backpressure and queue control
|
|
383
375
|
queue_max_size: int = 100
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
class ProcessingPipeline(str, Enum):
|
|
379
|
+
LEGACY = "legacy"
|
|
380
|
+
STANDARD = "standard"
|
|
381
|
+
VLM = "vlm"
|
|
382
|
+
ASR = "asr"
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
class ThreadedPdfPipelineOptions(PdfPipelineOptions):
|
|
386
|
+
"""Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
|
|
@@ -167,6 +167,10 @@ class LayoutModel(BasePageModel):
|
|
|
167
167
|
valid_pages.append(page)
|
|
168
168
|
valid_page_images.append(page_image)
|
|
169
169
|
|
|
170
|
+
print(f"{len(pages)=}, {pages[0].page_no}-{pages[-1].page_no}")
|
|
171
|
+
print(f"{len(valid_pages)=}")
|
|
172
|
+
print(f"{len(valid_page_images)=}")
|
|
173
|
+
|
|
170
174
|
# Process all valid pages with batch prediction
|
|
171
175
|
batch_predictions = []
|
|
172
176
|
if valid_page_images:
|
|
@@ -31,7 +31,7 @@ from docling.utils.profiling import ProfilingScope, TimeRecorder
|
|
|
31
31
|
_log = logging.getLogger(__name__)
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
class StandardPdfPipeline(PaginatedPipeline):
|
|
34
|
+
class LegacyStandardPdfPipeline(PaginatedPipeline):
|
|
35
35
|
def __init__(self, pipeline_options: PdfPipelineOptions):
|
|
36
36
|
super().__init__(pipeline_options)
|
|
37
37
|
self.pipeline_options: PdfPipelineOptions
|
|
@@ -102,7 +102,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
|
102
102
|
local_dir: Optional[Path] = None, force: bool = False
|
|
103
103
|
) -> Path:
|
|
104
104
|
warnings.warn(
|
|
105
|
-
"The usage of StandardPdfPipeline.download_models_hf() is deprecated "
|
|
105
|
+
"The usage of LegacyStandardPdfPipeline.download_models_hf() is deprecated "
|
|
106
106
|
"use instead the utility `docling-tools models download`, or "
|
|
107
107
|
"the upstream method docling.utils.models_downloader.download_all()",
|
|
108
108
|
DeprecationWarning,
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# threaded_standard_pdf_pipeline.py
|
|
2
1
|
"""Thread-safe, production-ready PDF pipeline
|
|
3
2
|
================================================
|
|
4
3
|
A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.
|
|
@@ -24,7 +23,7 @@ import warnings
|
|
|
24
23
|
from collections import defaultdict, deque
|
|
25
24
|
from dataclasses import dataclass, field
|
|
26
25
|
from pathlib import Path
|
|
27
|
-
from typing import Any, Iterable, List, Optional, Sequence, Tuple, cast
|
|
26
|
+
from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast
|
|
28
27
|
|
|
29
28
|
import numpy as np
|
|
30
29
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|
@@ -173,6 +172,7 @@ class ThreadedPipelineStage:
|
|
|
173
172
|
batch_size: int,
|
|
174
173
|
batch_timeout: float,
|
|
175
174
|
queue_max_size: int,
|
|
175
|
+
postprocess: Optional[Callable[[ThreadedItem], None]] = None,
|
|
176
176
|
) -> None:
|
|
177
177
|
self.name = name
|
|
178
178
|
self.model = model
|
|
@@ -182,6 +182,7 @@ class ThreadedPipelineStage:
|
|
|
182
182
|
self._outputs: list[ThreadedQueue] = []
|
|
183
183
|
self._thread: Optional[threading.Thread] = None
|
|
184
184
|
self._running = False
|
|
185
|
+
self._postprocess = postprocess
|
|
185
186
|
|
|
186
187
|
# ---------------------------------------------------------------- wiring
|
|
187
188
|
def add_output_queue(self, q: ThreadedQueue) -> None:
|
|
@@ -274,11 +275,85 @@ class ThreadedPipelineStage:
|
|
|
274
275
|
# -------------------------------------------------------------- _emit()
|
|
275
276
|
def _emit(self, items: Iterable[ThreadedItem]) -> None:
|
|
276
277
|
for item in items:
|
|
278
|
+
if self._postprocess is not None:
|
|
279
|
+
self._postprocess(item)
|
|
277
280
|
for q in self._outputs:
|
|
278
281
|
if not q.put(item):
|
|
279
282
|
_log.error("Output queue closed while emitting from %s", self.name)
|
|
280
283
|
|
|
281
284
|
|
|
285
|
+
class PreprocessThreadedStage(ThreadedPipelineStage):
|
|
286
|
+
"""Pipeline stage that lazily loads PDF backends just-in-time."""
|
|
287
|
+
|
|
288
|
+
def __init__(
|
|
289
|
+
self,
|
|
290
|
+
*,
|
|
291
|
+
batch_timeout: float,
|
|
292
|
+
queue_max_size: int,
|
|
293
|
+
model: Any,
|
|
294
|
+
) -> None:
|
|
295
|
+
super().__init__(
|
|
296
|
+
name="preprocess",
|
|
297
|
+
model=model,
|
|
298
|
+
batch_size=1,
|
|
299
|
+
batch_timeout=batch_timeout,
|
|
300
|
+
queue_max_size=queue_max_size,
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
|
|
304
|
+
groups: dict[int, list[ThreadedItem]] = defaultdict(list)
|
|
305
|
+
for itm in batch:
|
|
306
|
+
groups[itm.run_id].append(itm)
|
|
307
|
+
|
|
308
|
+
result: list[ThreadedItem] = []
|
|
309
|
+
for rid, items in groups.items():
|
|
310
|
+
good = [i for i in items if not i.is_failed]
|
|
311
|
+
if not good:
|
|
312
|
+
result.extend(items)
|
|
313
|
+
continue
|
|
314
|
+
try:
|
|
315
|
+
pages_with_payloads: list[tuple[ThreadedItem, Page]] = []
|
|
316
|
+
for it in good:
|
|
317
|
+
page = it.payload
|
|
318
|
+
if page is None:
|
|
319
|
+
raise RuntimeError("Page payload is None")
|
|
320
|
+
if page._backend is None:
|
|
321
|
+
backend = it.conv_res.input._backend
|
|
322
|
+
assert isinstance(backend, PdfDocumentBackend), (
|
|
323
|
+
"Threaded pipeline only supports PdfDocumentBackend."
|
|
324
|
+
)
|
|
325
|
+
page_backend = backend.load_page(page.page_no)
|
|
326
|
+
page._backend = page_backend
|
|
327
|
+
if page_backend.is_valid():
|
|
328
|
+
page.size = page_backend.get_size()
|
|
329
|
+
pages_with_payloads.append((it, page))
|
|
330
|
+
|
|
331
|
+
pages = [payload for _, payload in pages_with_payloads]
|
|
332
|
+
processed_pages = list(
|
|
333
|
+
self.model(good[0].conv_res, pages) # type: ignore[arg-type]
|
|
334
|
+
)
|
|
335
|
+
if len(processed_pages) != len(pages):
|
|
336
|
+
raise RuntimeError(
|
|
337
|
+
"PagePreprocessingModel returned unexpected number of pages"
|
|
338
|
+
)
|
|
339
|
+
for idx, processed_page in enumerate(processed_pages):
|
|
340
|
+
result.append(
|
|
341
|
+
ThreadedItem(
|
|
342
|
+
payload=processed_page,
|
|
343
|
+
run_id=rid,
|
|
344
|
+
page_no=good[idx].page_no,
|
|
345
|
+
conv_res=good[idx].conv_res,
|
|
346
|
+
)
|
|
347
|
+
)
|
|
348
|
+
except Exception as exc:
|
|
349
|
+
_log.error("Stage preprocess failed for run %d: %s", rid, exc)
|
|
350
|
+
for it in items:
|
|
351
|
+
it.is_failed = True
|
|
352
|
+
it.error = exc
|
|
353
|
+
result.extend(items)
|
|
354
|
+
return result
|
|
355
|
+
|
|
356
|
+
|
|
282
357
|
@dataclass
|
|
283
358
|
class RunContext:
|
|
284
359
|
"""Wiring for a single *execute* call."""
|
|
@@ -293,7 +368,7 @@ class RunContext:
|
|
|
293
368
|
# ──────────────────────────────────────────────────────────────────────────────
|
|
294
369
|
|
|
295
370
|
|
|
296
|
-
class ThreadedStandardPdfPipeline(ConvertPipeline):
|
|
371
|
+
class StandardPdfPipeline(ConvertPipeline):
|
|
297
372
|
"""High-performance PDF pipeline with multi-threaded stages."""
|
|
298
373
|
|
|
299
374
|
def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
|
|
@@ -372,46 +447,57 @@ class ThreadedStandardPdfPipeline(ConvertPipeline):
|
|
|
372
447
|
accelerator_options=self.pipeline_options.accelerator_options,
|
|
373
448
|
)
|
|
374
449
|
|
|
450
|
+
def _release_page_resources(self, item: ThreadedItem) -> None:
|
|
451
|
+
page = item.payload
|
|
452
|
+
if page is None:
|
|
453
|
+
return
|
|
454
|
+
if not self.keep_images:
|
|
455
|
+
page._image_cache = {}
|
|
456
|
+
if not self.keep_backend and page._backend is not None:
|
|
457
|
+
page._backend.unload()
|
|
458
|
+
page._backend = None
|
|
459
|
+
if not self.pipeline_options.generate_parsed_pages:
|
|
460
|
+
page.parsed_page = None
|
|
461
|
+
|
|
375
462
|
# ────────────────────────────────────────────────────────────────────────
|
|
376
463
|
# Build - thread pipeline
|
|
377
464
|
# ────────────────────────────────────────────────────────────────────────
|
|
378
465
|
|
|
379
466
|
def _create_run_ctx(self) -> RunContext:
|
|
380
467
|
opts = self.pipeline_options
|
|
381
|
-
preprocess = ThreadedPipelineStage(
|
|
382
|
-
|
|
383
|
-
model=self.preprocessing_model,
|
|
384
|
-
batch_size=1,
|
|
385
|
-
batch_timeout=opts.batch_timeout_seconds,
|
|
468
|
+
preprocess = PreprocessThreadedStage(
|
|
469
|
+
batch_timeout=opts.batch_polling_interval_seconds,
|
|
386
470
|
queue_max_size=opts.queue_max_size,
|
|
471
|
+
model=self.preprocessing_model,
|
|
387
472
|
)
|
|
388
473
|
ocr = ThreadedPipelineStage(
|
|
389
474
|
name="ocr",
|
|
390
475
|
model=self.ocr_model,
|
|
391
476
|
batch_size=opts.ocr_batch_size,
|
|
392
|
-
batch_timeout=opts.batch_timeout_seconds,
|
|
477
|
+
batch_timeout=opts.batch_polling_interval_seconds,
|
|
393
478
|
queue_max_size=opts.queue_max_size,
|
|
394
479
|
)
|
|
395
480
|
layout = ThreadedPipelineStage(
|
|
396
481
|
name="layout",
|
|
397
482
|
model=self.layout_model,
|
|
398
483
|
batch_size=opts.layout_batch_size,
|
|
399
|
-
batch_timeout=opts.batch_timeout_seconds,
|
|
484
|
+
batch_timeout=opts.batch_polling_interval_seconds,
|
|
400
485
|
queue_max_size=opts.queue_max_size,
|
|
401
486
|
)
|
|
402
487
|
table = ThreadedPipelineStage(
|
|
403
488
|
name="table",
|
|
404
489
|
model=self.table_model,
|
|
405
490
|
batch_size=opts.table_batch_size,
|
|
406
|
-
batch_timeout=opts.batch_timeout_seconds,
|
|
491
|
+
batch_timeout=opts.batch_polling_interval_seconds,
|
|
407
492
|
queue_max_size=opts.queue_max_size,
|
|
408
493
|
)
|
|
409
494
|
assemble = ThreadedPipelineStage(
|
|
410
495
|
name="assemble",
|
|
411
496
|
model=self.assemble_model,
|
|
412
497
|
batch_size=1,
|
|
413
|
-
batch_timeout=opts.batch_timeout_seconds,
|
|
498
|
+
batch_timeout=opts.batch_polling_interval_seconds,
|
|
414
499
|
queue_max_size=opts.queue_max_size,
|
|
500
|
+
postprocess=self._release_page_resources,
|
|
415
501
|
)
|
|
416
502
|
|
|
417
503
|
# wire stages
|
|
@@ -430,19 +516,15 @@ class ThreadedStandardPdfPipeline(ConvertPipeline):
|
|
|
430
516
|
"""Stream-build the document while interleaving producer and consumer work."""
|
|
431
517
|
run_id = next(self._run_seq)
|
|
432
518
|
assert isinstance(conv_res.input._backend, PdfDocumentBackend)
|
|
433
|
-
backend = conv_res.input._backend
|
|
434
519
|
|
|
435
|
-
#
|
|
520
|
+
# Collect page placeholders; backends are loaded lazily in preprocess stage
|
|
436
521
|
start_page, end_page = conv_res.input.limits.page_range
|
|
437
522
|
pages: list[Page] = []
|
|
438
523
|
for i in range(conv_res.input.page_count):
|
|
439
524
|
if start_page - 1 <= i <= end_page - 1:
|
|
440
525
|
page = Page(page_no=i)
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
page.size = page._backend.get_size()
|
|
444
|
-
conv_res.pages.append(page)
|
|
445
|
-
pages.append(page)
|
|
526
|
+
conv_res.pages.append(page)
|
|
527
|
+
pages.append(page)
|
|
446
528
|
|
|
447
529
|
if not pages:
|
|
448
530
|
conv_res.status = ConversionStatus.FAILURE
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.59.0
|
|
3
|
+
Version: 2.60.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -96,6 +96,7 @@ docling/pipeline/asr_pipeline.py
|
|
|
96
96
|
docling/pipeline/base_extraction_pipeline.py
|
|
97
97
|
docling/pipeline/base_pipeline.py
|
|
98
98
|
docling/pipeline/extraction_vlm_pipeline.py
|
|
99
|
+
docling/pipeline/legacy_standard_pdf_pipeline.py
|
|
99
100
|
docling/pipeline/simple_pipeline.py
|
|
100
101
|
docling/pipeline/standard_pdf_pipeline.py
|
|
101
102
|
docling/pipeline/threaded_standard_pdf_pipeline.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling"
|
|
3
|
-
version = "2.59.0" # DO NOT EDIT, updated automatically
|
|
3
|
+
version = "2.60.0" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
keywords = [
|
|
@@ -5,6 +5,7 @@ from typing import List
|
|
|
5
5
|
|
|
6
6
|
import pytest
|
|
7
7
|
|
|
8
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
8
9
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
|
9
10
|
from docling.datamodel.document import ConversionResult
|
|
10
11
|
from docling.datamodel.pipeline_options import (
|
|
@@ -41,7 +42,7 @@ def test_threaded_pipeline_multiple_documents():
|
|
|
41
42
|
layout_batch_size=1,
|
|
42
43
|
table_batch_size=1,
|
|
43
44
|
ocr_batch_size=1,
|
|
44
|
-
|
|
45
|
+
batch_polling_interval_seconds=1.0,
|
|
45
46
|
do_table_structure=do_ts,
|
|
46
47
|
do_ocr=do_ocr,
|
|
47
48
|
),
|
|
@@ -171,6 +172,27 @@ def test_pipeline_comparison():
|
|
|
171
172
|
assert len(sync_doc.texts) == len(threaded_doc.texts)
|
|
172
173
|
|
|
173
174
|
|
|
175
|
+
def test_pypdfium_threaded_pipeline():
|
|
176
|
+
doc_converter = (
|
|
177
|
+
DocumentConverter( # all of the below is optional, has internal defaults.
|
|
178
|
+
format_options={
|
|
179
|
+
InputFormat.PDF: PdfFormatOption(
|
|
180
|
+
pipeline_cls=ThreadedStandardPdfPipeline,
|
|
181
|
+
backend=PyPdfiumDocumentBackend,
|
|
182
|
+
),
|
|
183
|
+
},
|
|
184
|
+
)
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
test_file = "tests/data/pdf/2206.01062.pdf"
|
|
188
|
+
for i in range(6):
|
|
189
|
+
print(f"iteration {i=}")
|
|
190
|
+
conv_result = doc_converter.convert(test_file)
|
|
191
|
+
assert conv_result.status == ConversionStatus.SUCCESS
|
|
192
|
+
print(f"[{i=}] Success")
|
|
193
|
+
print("All done!")
|
|
194
|
+
|
|
195
|
+
|
|
174
196
|
if __name__ == "__main__":
|
|
175
197
|
# Run basic performance test
|
|
176
198
|
test_pipeline_comparison()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling-2.59.0 → docling-2.60.0}/docling/models/vlm_models_inline/nuextract_transformers_model.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|