docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling has been flagged as possibly problematic.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
@@ -0,0 +1,439 @@
+"""Threaded Layout+VLM Pipeline
+================================
+A specialized two-stage threaded pipeline that combines layout model preprocessing
+with VLM processing. The layout model detects document elements and coordinates,
+which are then injected into the VLM prompt for enhanced structured output.
+"""
+
+from __future__ import annotations
+
+import itertools
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Optional, Union, cast
+
+from docling_core.types.doc import DoclingDocument
+from docling_core.types.doc.document import DocTagsDocument
+from PIL import Image as PILImage
+
+if TYPE_CHECKING:
+    from docling_core.types.doc.page import SegmentedPage
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options_vlm_model import (
+    ApiVlmOptions,
+    InferenceFramework,
+    InlineVlmOptions,
+)
+from docling.datamodel.settings import settings
+from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
+    ThreadedLayoutVlmPipelineOptions,
+)
+from docling.models.base_model import BaseVlmPageModel
+from docling.models.stages.layout.layout_model import LayoutModel
+from docling.models.vlm_pipeline_models.api_vlm_model import ApiVlmModel
+from docling.models.vlm_pipeline_models.hf_transformers_model import (
+    HuggingFaceTransformersVlmModel,
+)
+from docling.models.vlm_pipeline_models.mlx_model import HuggingFaceMlxModel
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.pipeline.standard_pdf_pipeline import (
+    ProcessingResult,
+    RunContext,
+    ThreadedItem,
+    ThreadedPipelineStage,
+    ThreadedQueue,
+)
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class ThreadedLayoutVlmPipeline(BasePipeline):
+    """Two-stage threaded pipeline: Layout Model → VLM Model."""
+
+    def __init__(self, pipeline_options: ThreadedLayoutVlmPipelineOptions) -> None:
+        super().__init__(pipeline_options)
+        self.pipeline_options: ThreadedLayoutVlmPipelineOptions = pipeline_options
+        self._run_seq = itertools.count(1)  # deterministic, monotonic run ids
+
+        # VLM model type (initialized in _init_models)
+        self.vlm_model: BaseVlmPageModel
+
+        # Initialize models
+        self._init_models()
+
+    def _init_models(self) -> None:
+        """Initialize layout and VLM models."""
+        art_path = self._resolve_artifacts_path()
+
+        # Layout model
+        self.layout_model = LayoutModel(
+            artifacts_path=art_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+            options=self.pipeline_options.layout_options,
+        )
+
+        # VLM model based on options type
+        # Create layout-aware VLM options internally
+        base_vlm_options = self.pipeline_options.vlm_options
+
+        class LayoutAwareVlmOptions(type(base_vlm_options)):  # type: ignore[misc]
+            def build_prompt(
+                self,
+                page: Optional[SegmentedPage],
+                *,
+                _internal_page: Optional[Page] = None,
+            ) -> str:
+                base_prompt = self.prompt
+                augmented_prompt = base_prompt
+
+                # In this layout-aware pipeline, _internal_page is always provided
+                if _internal_page is None:
+                    return base_prompt
+
+                if not _internal_page.size:
+                    _log.warning(
+                        f"Page size not available for page {_internal_page.page_no}. Cannot enhance prompt with layout info."
+                    )
+                    return base_prompt
+
+                if _internal_page.predictions.layout:
+                    from docling_core.types.doc.tokens import DocumentToken
+
+                    layout_elements = []
+                    for cluster in _internal_page.predictions.layout.clusters:
+                        # Get proper tag name from DocItemLabel
+                        tag_name = DocumentToken.create_token_name_from_doc_item_label(
+                            label=cluster.label
+                        )
+
+                        # Convert bbox to tuple and get location tokens
+                        bbox_tuple = cluster.bbox.as_tuple()
+                        location_tokens = DocumentToken.get_location(
+                            bbox=bbox_tuple,
+                            page_w=_internal_page.size.width,
+                            page_h=_internal_page.size.height,
+                        )
+
+                        # Create XML element with DocTags format
+                        xml_element = f"<{tag_name}>{location_tokens}</{tag_name}>"
+                        layout_elements.append(xml_element)
+
+                    if layout_elements:
+                        # Join elements with newlines and wrap in layout tags
+                        layout_xml = (
+                            "<layout>" + "\n".join(layout_elements) + "</layout>"
+                        )
+                        layout_injection = f"{layout_xml}"
+
+                        augmented_prompt = base_prompt + layout_injection
+
+                _log.debug(
+                    "Enhanced Prompt with Layout Info: %s\n", augmented_prompt
+                )
+
+                return augmented_prompt
+
+        vlm_options = LayoutAwareVlmOptions(**base_vlm_options.model_dump())
+
+        if isinstance(base_vlm_options, ApiVlmOptions):
+            self.vlm_model = ApiVlmModel(
+                enabled=True,
+                enable_remote_services=self.pipeline_options.enable_remote_services,
+                vlm_options=vlm_options,
+            )
+        elif isinstance(base_vlm_options, InlineVlmOptions):
+            if vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
+                self.vlm_model = HuggingFaceTransformersVlmModel(
+                    enabled=True,
+                    artifacts_path=art_path,
+                    accelerator_options=self.pipeline_options.accelerator_options,
+                    vlm_options=vlm_options,
+                )
+            elif vlm_options.inference_framework == InferenceFramework.MLX:
+                self.vlm_model = HuggingFaceMlxModel(
+                    enabled=True,
+                    artifacts_path=art_path,
+                    accelerator_options=self.pipeline_options.accelerator_options,
+                    vlm_options=vlm_options,
+                )
+            elif vlm_options.inference_framework == InferenceFramework.VLLM:
+                from docling.models.vlm_pipeline_models.vllm_model import VllmVlmModel
+
+                self.vlm_model = VllmVlmModel(
+                    enabled=True,
+                    artifacts_path=art_path,
+                    accelerator_options=self.pipeline_options.accelerator_options,
+                    vlm_options=vlm_options,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported VLM inference framework: {vlm_options.inference_framework}"
+                )
+        else:
+            raise ValueError(f"Unsupported VLM options type: {type(base_vlm_options)}")
+
+    def _resolve_artifacts_path(self) -> Optional[Path]:
+        """Resolve artifacts path from options or settings."""
+        if self.pipeline_options.artifacts_path:
+            p = Path(self.pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path:
+            p = Path(settings.artifacts_path).expanduser()
+        else:
+            return None
+        if not p.is_dir():
+            raise RuntimeError(
+                f"{p} does not exist or is not a directory containing the required models"
+            )
+        return p
+
+    def _create_run_ctx(self) -> RunContext:
+        """Create pipeline stages and wire them together."""
+        opts = self.pipeline_options
+
+        # Layout stage
+        layout_stage = ThreadedPipelineStage(
+            name="layout",
+            model=self.layout_model,
+            batch_size=opts.layout_batch_size,
+            batch_timeout=opts.batch_timeout_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+
+        # VLM stage - now layout-aware through enhanced build_prompt
+        vlm_stage = ThreadedPipelineStage(
+            name="vlm",
+            model=self.vlm_model,
+            batch_size=opts.vlm_batch_size,
+            batch_timeout=opts.batch_timeout_seconds,
+            queue_max_size=opts.queue_max_size,
+        )
+
+        # Wire stages
+        output_q = ThreadedQueue(opts.queue_max_size)
+        layout_stage.add_output_queue(vlm_stage.input_queue)
+        vlm_stage.add_output_queue(output_q)
+
+        stages = [layout_stage, vlm_stage]
+        return RunContext(
+            stages=stages, first_stage=layout_stage, output_queue=output_q
+        )
+
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        """Build document using threaded layout+VLM pipeline."""
+        run_id = next(self._run_seq)
+        assert isinstance(conv_res.input._backend, PdfDocumentBackend)
+        backend = conv_res.input._backend
+
+        # Initialize pages
+        start_page, end_page = conv_res.input.limits.page_range
+        pages: List[Page] = []
+        images_scale = self.pipeline_options.images_scale
+        for i in range(conv_res.input.page_count):
+            if start_page - 1 <= i <= end_page - 1:
+                page = Page(page_no=i)
+                if images_scale is not None:
+                    page._default_image_scale = images_scale
+                page._backend = backend.load_page(i)
+                if page._backend and page._backend.is_valid():
+                    page.size = page._backend.get_size()
+                    conv_res.pages.append(page)
+                    pages.append(page)
+
+        if not pages:
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        total_pages = len(pages)
+        ctx = self._create_run_ctx()
+        for st in ctx.stages:
+            st.start()
+
+        proc = ProcessingResult(total_expected=total_pages)
+        fed_idx = 0
+        batch_size = 32
+
+        try:
+            while proc.success_count + proc.failure_count < total_pages:
+                # Feed pages to first stage
+                while fed_idx < total_pages:
+                    ok = ctx.first_stage.input_queue.put(
+                        ThreadedItem(
+                            payload=pages[fed_idx],
+                            run_id=run_id,
+                            page_no=pages[fed_idx].page_no,
+                            conv_res=conv_res,
+                        ),
+                        timeout=0.0,
+                    )
+                    if ok:
+                        fed_idx += 1
+                        if fed_idx == total_pages:
+                            ctx.first_stage.input_queue.close()
+                    else:
+                        break
+
+                # Drain results from output
+                out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
+                for itm in out_batch:
+                    if itm.run_id != run_id:
+                        continue
+                    if itm.is_failed or itm.error:
+                        proc.failed_pages.append(
+                            (itm.page_no, itm.error or RuntimeError("unknown error"))
+                        )
+                    else:
+                        assert itm.payload is not None
+                        proc.pages.append(itm.payload)
+
+                # Handle early termination
+                if not out_batch and ctx.output_queue.closed:
+                    missing = total_pages - (proc.success_count + proc.failure_count)
+                    if missing > 0:
+                        proc.failed_pages.extend(
+                            [(-1, RuntimeError("pipeline terminated early"))] * missing
+                        )
+                    break
+        finally:
+            for st in ctx.stages:
+                st.stop()
+            ctx.output_queue.close()
+
+        self._integrate_results(conv_res, proc)
+        return conv_res
+
+    def _integrate_results(
+        self, conv_res: ConversionResult, proc: ProcessingResult
+    ) -> None:
+        """Integrate processing results into conversion result."""
+        page_map = {p.page_no: p for p in proc.pages}
+
+        # Track failed pages for cleanup
+        failed_page_nos = {fp for fp, _ in proc.failed_pages}
+
+        # Collect pages that will be removed (failed pages) for resource cleanup
+        pages_to_remove = [p for p in conv_res.pages if p.page_no in failed_page_nos]
+
+        conv_res.pages = [
+            page_map.get(p.page_no, p)
+            for p in conv_res.pages
+            if p.page_no in page_map
+            or not any(fp == p.page_no for fp, _ in proc.failed_pages)
+        ]
+
+        if proc.is_complete_failure:
+            conv_res.status = ConversionStatus.FAILURE
+        elif proc.is_partial_success:
+            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+        else:
+            conv_res.status = ConversionStatus.SUCCESS
+
+        # Clean up resources for failed pages that were removed
+        for p in pages_to_remove:
+            if p._backend is not None:
+                p._backend.unload()
+            p._image_cache = {}
+            # Clean up parsed_page if it exists (it's Optional[SegmentedPdfPage])
+            if p.parsed_page is not None:
+                del p.parsed_page
+                p.parsed_page = None
+
+        # Clean up images if not needed for remaining pages
+        if not self.pipeline_options.generate_page_images:
+            for p in conv_res.pages:
+                p._image_cache = {}
+
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        """Assemble final document from VLM predictions."""
+        from docling_core.types.doc import DocItem, ImageRef, PictureItem
+
+        from docling.datamodel.pipeline_options_vlm_model import ResponseFormat
+
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+            # Response format validation is done in ThreadedLayoutVlmPipelineOptions
+            # This check is kept as a safety net, but should never trigger if validation works
+            if (
+                self.pipeline_options.vlm_options.response_format
+                != ResponseFormat.DOCTAGS
+            ):
+                raise RuntimeError(
+                    f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}. Only DOCTAGS format is supported."
+                )
+            conv_res.document = self._turn_dt_into_doc(conv_res)
+
+            # Generate images of the requested element types
+            if self.pipeline_options.generate_picture_images:
+                # Create mapping from page_no to Page object since pages may be non-continuous
+                page_map = {p.page_no: p for p in conv_res.pages}
+                scale = self.pipeline_options.images_scale
+                for element, _level in conv_res.document.iterate_items():
+                    if not isinstance(element, DocItem) or len(element.prov) == 0:
+                        continue
+                    if (
+                        isinstance(element, PictureItem)
+                        and self.pipeline_options.generate_picture_images
+                    ):
+                        page_no = element.prov[0].page_no
+                        page = page_map.get(page_no)
+                        if page is None:
+                            _log.warning(
+                                f"Page {page_no} not found in conversion result for picture element. Skipping image generation."
+                            )
+                            continue
+                        assert page.size is not None
+                        assert page.image is not None
+
+                        crop_bbox = (
+                            element.prov[0]
+                            .bbox.scaled(scale=scale)
+                            .to_top_left_origin(page_height=page.size.height * scale)
+                        )
+
+                        cropped_im = page.image.crop(crop_bbox.as_tuple())
+                        element.image = ImageRef.from_pil(
+                            cropped_im, dpi=int(72 * scale)
+                        )
+
+        return conv_res
+
+    def _turn_dt_into_doc(self, conv_res: ConversionResult) -> DoclingDocument:
+        """Convert DOCTAGS response format to DoclingDocument."""
+        doctags_list = []
+        image_list = []
+        for page in conv_res.pages:
+            # Only include pages that have both an image and VLM predictions
+            if page.image and page.predictions.vlm_response:
+                predicted_doctags = page.predictions.vlm_response.text
+                image_list.append(page.image)
+                doctags_list.append(predicted_doctags)
+
+        doctags_list_c = cast(List[Union[Path, str]], doctags_list)
+        image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
+        doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+            doctags_list_c, image_list_c
+        )
+        document = DoclingDocument.load_from_doctags(doctag_document=doctags_doc)
+
+        return document
+
+    @classmethod
+    def get_default_options(cls) -> ThreadedLayoutVlmPipelineOptions:
+        return ThreadedLayoutVlmPipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
+        return isinstance(backend, PdfDocumentBackend)
+
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        return conv_res.status
+
+    def _unload(self, conv_res: ConversionResult) -> None:
+        for p in conv_res.pages:
+            if p._backend is not None:
+                p._backend.unload()
+        if conv_res.input._backend:
+            conv_res.input._backend.unload()
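The pipeline above is not normally instantiated directly; docling selects a pipeline class per input format. The following is a minimal usage sketch, not part of this package diff: it assumes the experimental pipeline is wired through PdfFormatOption(pipeline_cls=...) the same way docling's other PDF pipelines are, and that the default ThreadedLayoutVlmPipelineOptions are acceptable.

# Hypothetical usage sketch (assumptions noted above), not code from the wheel.
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
    ThreadedLayoutVlmPipelineOptions,
)
from docling.experimental.pipeline.threaded_layout_vlm_pipeline import (
    ThreadedLayoutVlmPipeline,
)

# Default options; specific fields (batch sizes, VLM choice) are assumptions here.
pipeline_options = ThreadedLayoutVlmPipelineOptions()

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=ThreadedLayoutVlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

# "document.pdf" is a placeholder input path.
result = converter.convert("document.pdf")
print(result.document.export_to_markdown())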
docling/models/base_layout_model.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from collections.abc import Iterable, Sequence
+from typing import Type
+
+from docling.datamodel.base_models import LayoutPrediction, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import BaseLayoutOptions
+from docling.models.base_model import BaseModelWithOptions, BasePageModel
+
+
+class BaseLayoutModel(BasePageModel, BaseModelWithOptions, ABC):
+    """Shared interface for layout models."""
+
+    @classmethod
+    @abstractmethod
+    def get_options_type(cls) -> Type[BaseLayoutOptions]:
+        """Return the options type supported by this layout model."""
+
+    @abstractmethod
+    def predict_layout(
+        self,
+        conv_res: ConversionResult,
+        pages: Sequence[Page],
+    ) -> Sequence[LayoutPrediction]:
+        """Produce layout predictions for the provided pages."""
+
+    def __call__(
+        self,
+        conv_res: ConversionResult,
+        page_batch: Iterable[Page],
+    ) -> Iterable[Page]:
+        pages = list(page_batch)
+        predictions = self.predict_layout(conv_res, pages)
+
+        for page, prediction in zip(pages, predictions):
+            page.predictions.layout = prediction
+            yield page