docling 2.58.0__py3-none-any.whl → 2.60.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/msexcel_backend.py +6 -2
- docling/backend/pypdfium2_backend.py +4 -4
- docling/cli/main.py +19 -8
- docling/datamodel/base_models.py +2 -0
- docling/datamodel/pipeline_options.py +13 -10
- docling/datamodel/pipeline_options_vlm_model.py +1 -0
- docling/models/api_vlm_model.py +5 -3
- docling/models/layout_model.py +4 -0
- docling/models/picture_description_vlm_model.py +5 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +13 -3
- docling/models/vlm_models_inline/mlx_model.py +9 -3
- docling/models/vlm_models_inline/nuextract_transformers_model.py +13 -3
- docling/models/vlm_models_inline/vllm_model.py +42 -8
- docling/pipeline/asr_pipeline.py +10 -3
- docling/pipeline/legacy_standard_pdf_pipeline.py +242 -0
- docling/pipeline/standard_pdf_pipeline.py +583 -96
- docling/pipeline/threaded_standard_pdf_pipeline.py +3 -645
- docling/utils/api_image_request.py +17 -6
- {docling-2.58.0.dist-info → docling-2.60.0.dist-info}/METADATA +9 -8
- {docling-2.58.0.dist-info → docling-2.60.0.dist-info}/RECORD +24 -23
- {docling-2.58.0.dist-info → docling-2.60.0.dist-info}/WHEEL +0 -0
- {docling-2.58.0.dist-info → docling-2.60.0.dist-info}/entry_points.txt +0 -0
- {docling-2.58.0.dist-info → docling-2.60.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.58.0.dist-info → docling-2.60.0.dist-info}/top_level.txt +0 -0
docling/backend/msexcel_backend.py
CHANGED

```diff
@@ -139,10 +139,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         self.workbook = None
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                self.workbook = load_workbook(
+                self.workbook = load_workbook(
+                    filename=self.path_or_stream, data_only=True
+                )

             elif isinstance(self.path_or_stream, Path):
-                self.workbook = load_workbook(
+                self.workbook = load_workbook(
+                    filename=str(self.path_or_stream), data_only=True
+                )

             self.valid = self.workbook is not None
         except Exception as e:
```
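The only functional change here is `data_only=True`: with it, openpyxl returns the cached, last-computed value of formula cells instead of the formula string (and `None` when no cached value exists, e.g. for a workbook that was never evaluated by a spreadsheet application). A minimal sketch of the difference with an in-memory workbook; the cell contents are illustrative:

```python
from io import BytesIO

from openpyxl import Workbook, load_workbook

# Build a tiny workbook in memory with one formula cell.
wb = Workbook()
ws = wb.active
ws["A1"] = 2
ws["A2"] = 3
ws["A3"] = "=SUM(A1:A2)"

buf = BytesIO()
wb.save(buf)

buf.seek(0)
print(load_workbook(filename=buf, data_only=False).active["A3"].value)  # "=SUM(A1:A2)"

buf.seek(0)
# data_only=True returns the cached result; it is None here because nothing has
# evaluated the formula yet, but for real .xlsx files it is the stored value.
print(load_workbook(filename=buf, data_only=True).active["A3"].value)
```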
docling/backend/pypdfium2_backend.py
CHANGED

```diff
@@ -229,10 +229,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 b=max(cell.rect.to_bounding_box().b for cell in group),
             )

-            assert self.
-            self.text_page = self._ppage.get_textpage()
+            assert self.text_page is not None
            bbox = merged_bbox.to_bottom_left_origin(page_size.height)
-
+            with pypdfium2_lock:
+                merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())

             return TextCell(
                 index=group[0].index,
@@ -255,9 +255,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32
         page_size = self.get_size()
-        rotation = self._ppage.get_rotation()

         with pypdfium2_lock:
+            rotation = self._ppage.get_rotation()
             for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
                 pos = obj.get_pos()
                 if rotation == 90:
```
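pdfium is not thread-safe, so the backend serializes raw page calls behind the module-level `pypdfium2_lock`; these hunks move `get_rotation()` and the bounded-text lookup inside that critical section. A generic sketch of the pattern, with an illustrative worker standing in for the real pdfium calls:

```python
import threading
from concurrent.futures import ThreadPoolExecutor

# One lock shared by every thread that touches the non-thread-safe C library,
# mirroring the module-level pypdfium2_lock used by the backend.
pdfium_lock = threading.Lock()

def extract_text(page_no: int) -> str:
    # Pure-Python preparation can run concurrently...
    prepared = f"page {page_no}"
    with pdfium_lock:
        # ...but only one thread at a time enters the library calls
        # (stand-in for textpage.get_text_bounded(...) and friends).
        return f"text of {prepared}"

with ThreadPoolExecutor(max_workers=4) as pool:
    print(list(pool.map(extract_text, range(3))))
```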
docling/cli/main.py
CHANGED

```diff
@@ -738,10 +738,15 @@ def convert(  # noqa: C901

                     pipeline_options.vlm_options = SMOLDOCLING_MLX
                 except ImportError:
-
-
-
-
+                    if sys.version_info < (3, 14):
+                        _log.warning(
+                            "To run SmolDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+                    else:
+                        _log.warning(
+                            "You can run SmolDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                        )

         elif vlm_model == VlmModelType.GRANITEDOCLING:
             pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
@@ -751,10 +756,16 @@ def convert(  # noqa: C901

                     pipeline_options.vlm_options = GRANITEDOCLING_MLX
                 except ImportError:
-
-
-
-
+                    if sys.version_info < (3, 14):
+                        _log.warning(
+                            "To run GraniteDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+                    else:
+                        _log.warning(
+                            "You can run GraniteDocling faster with MLX support, but it is unfortunately not yet available on Python 3.14."
+                        )
+
         elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
             pipeline_options.vlm_options = SMOLDOCLING_VLLM

```
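The CLI still prefers the MLX-converted weights when `mlx-vlm` can be imported and now tailors the fallback warning to the interpreter version, since mlx-vlm has no Python 3.14 builds yet. A standalone sketch of that selection logic; the `docling.datamodel.vlm_model_specs` import path and constant names follow what the CLI code references, but treat the exact wiring as an assumption:

```python
import sys

def pick_smoldocling_options():
    """Prefer the MLX variant of SmolDocling when mlx-vlm is importable,
    otherwise fall back to the transformers variant (mirrors the CLI logic)."""
    from docling.datamodel.vlm_model_specs import (  # assumed import path
        SMOLDOCLING_MLX,
        SMOLDOCLING_TRANSFORMERS,
    )

    try:
        import mlx_vlm  # noqa: F401  # probe availability only
        return SMOLDOCLING_MLX
    except ImportError:
        if sys.version_info < (3, 14):
            print("To run SmolDocling faster, install mlx-vlm: pip install mlx-vlm")
        else:
            print("mlx-vlm is not yet available on Python 3.14.")
        return SMOLDOCLING_TRANSFORMERS
```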
docling/datamodel/base_models.py
CHANGED

```diff
@@ -207,6 +207,8 @@ class VlmPrediction(BaseModel):
     text: str = ""
     generated_tokens: list[VlmPredictionToken] = []
     generation_time: float = -1
+    num_tokens: Optional[int] = None
+    stop_reason: Optional[str] = None  # todo define an enum for possible stop reasons


 class ContainerElement(
```
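With the two new optional fields, callers can see how many tokens a generation produced and why it stopped, while existing code that never sets them keeps working because both default to `None`. An illustrative construction (values are made up; `stop_reason` is still a free-form string, as the TODO notes):

```python
from docling.datamodel.base_models import VlmPrediction

# Illustrative values only.
prediction = VlmPrediction(
    text="<doctag>...</doctag>",
    generation_time=1.42,
    num_tokens=256,
    stop_reason="eos",
)
print(prediction.num_tokens, prediction.stop_reason)
```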
docling/datamodel/pipeline_options.py
CHANGED

```diff
@@ -361,15 +361,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):

     generate_parsed_pages: bool = False

-
-class ProcessingPipeline(str, Enum):
-    STANDARD = "standard"
-    VLM = "vlm"
-    ASR = "asr"
-
-
-class ThreadedPdfPipelineOptions(PdfPipelineOptions):
-    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
+    ### Arguments for threaded PDF pipeline with batching and backpressure control

     # Batch sizes for different stages
     ocr_batch_size: int = 4
@@ -377,7 +369,18 @@ class ThreadedPdfPipelineOptions(PdfPipelineOptions):
     table_batch_size: int = 4

     # Timing control
-
+    batch_polling_interval_seconds: float = 0.5

     # Backpressure and queue control
     queue_max_size: int = 100
+
+
+class ProcessingPipeline(str, Enum):
+    LEGACY = "legacy"
+    STANDARD = "standard"
+    VLM = "vlm"
+    ASR = "asr"
+
+
+class ThreadedPdfPipelineOptions(PdfPipelineOptions):
+    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
```
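The batching and backpressure fields now live directly on `PdfPipelineOptions` (the `ThreadedPdfPipelineOptions` subclass is kept for compatibility), a `batch_polling_interval_seconds` field appears under the timing-control comment, and `ProcessingPipeline` gains a `LEGACY` member. A small sketch of setting these options; only fields visible in the diff are used and the values are illustrative:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions, ProcessingPipeline

opts = PdfPipelineOptions(
    ocr_batch_size=4,
    table_batch_size=4,
    batch_polling_interval_seconds=0.5,
    queue_max_size=100,
)

assert ProcessingPipeline.LEGACY.value == "legacy"
```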
docling/models/api_vlm_model.py
CHANGED

```diff
@@ -73,7 +73,7 @@ class ApiVlmModel(BasePageModel):
                 # Skip non-GenerationStopper criteria (should have been caught in validation)

                 # Streaming path with early abort support
-                page_tags = api_image_request_streaming(
+                page_tags, num_tokens = api_image_request_streaming(
                     image=hi_res_image,
                     prompt=prompt,
                     url=self.vlm_options.url,
@@ -84,7 +84,7 @@ class ApiVlmModel(BasePageModel):
                 )
             else:
                 # Non-streaming fallback (existing behavior)
-                page_tags = api_image_request(
+                page_tags, num_tokens = api_image_request(
                     image=hi_res_image,
                     prompt=prompt,
                     url=self.vlm_options.url,
@@ -94,7 +94,9 @@ class ApiVlmModel(BasePageModel):
                 )

             page_tags = self.vlm_options.decode_response(page_tags)
-            page.predictions.vlm_response = VlmPrediction(
+            page.predictions.vlm_response = VlmPrediction(
+                text=page_tags, num_tokens=num_tokens
+            )
             return page

         with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
```
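Both request helpers now return the generated text together with a token count, which the model forwards into `VlmPrediction`. A stand-in sketch of the caller-side contract; `fake_api_image_request` is a placeholder, because the updated signature in `docling/utils/api_image_request.py` is not shown in this diff:

```python
from typing import Optional, Tuple

from docling.datamodel.base_models import VlmPrediction

def fake_api_image_request(prompt: str) -> Tuple[str, Optional[int]]:
    """Placeholder returning the same shape as the real helper: (text, num_tokens)."""
    return f"<doctag>{prompt}</doctag>", 42

page_tags, num_tokens = fake_api_image_request("page 1")
prediction = VlmPrediction(text=page_tags, num_tokens=num_tokens)
print(prediction.num_tokens)
```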
docling/models/layout_model.py
CHANGED

```diff
@@ -167,6 +167,10 @@ class LayoutModel(BasePageModel):
                     valid_pages.append(page)
                     valid_page_images.append(page_image)

+        print(f"{len(pages)=}, {pages[0].page_no}-{pages[-1].page_no}")
+        print(f"{len(valid_pages)=}")
+        print(f"{len(valid_page_images)=}")
+
         # Process all valid pages with batch prediction
         batch_predictions = []
         if valid_page_images:
```
docling/models/picture_description_vlm_model.py
CHANGED

```diff
@@ -1,3 +1,4 @@
+import sys
 import threading
 from collections.abc import Iterable
 from pathlib import Path
@@ -75,7 +76,10 @@ class PictureDescriptionVlmModel(
                 else "sdpa"
             ),
         )
-
+        if sys.version_info < (3, 14):
+            self.model = torch.compile(self.model)  # type: ignore
+        else:
+            self.model.eval()

         self.provenance = f"{self.options.repo_id}"

```
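The same guard shows up in several model wrappers in this release: `torch.compile` is applied only on Python versions where it is supported (below 3.14); otherwise the model is simply left in eval mode. A self-contained sketch of the pattern using a toy module:

```python
import sys

import torch
import torch.nn as nn

model: nn.Module = nn.Linear(8, 8)  # toy stand-in for the loaded vision-language model

if sys.version_info < (3, 14):
    # torch.compile wraps the module in an optimized callable
    model = torch.compile(model)  # type: ignore[assignment]
else:
    # no torch.compile here, so just switch to inference mode
    model.eval()

print(type(model).__name__)
```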
docling/models/vlm_models_inline/hf_transformers_model.py
CHANGED

```diff
@@ -1,5 +1,6 @@
 import importlib.metadata
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -129,7 +130,10 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
                 trust_remote_code=vlm_options.trust_remote_code,
                 revision=vlm_options.revision,
             )
-
+            if sys.version_info < (3, 14):
+                self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+            else:
+                self.vlm_model.eval()

             # Load generation config
             self.generation_config = GenerationConfig.from_pretrained(
@@ -363,13 +367,19 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         decoded_texts = [text.rstrip(pad_token) for text in decoded_texts]

         # -- Optional logging
+        num_tokens = None
         if generated_ids.shape[0] > 0:
+            num_tokens = int(generated_ids[0].shape[0])
             _log.debug(
-                f"Generated {
+                f"Generated {num_tokens} tokens in {generation_time:.2f}s "
                 f"for batch size {generated_ids.shape[0]}."
             )

         for text in decoded_texts:
             # Apply decode_response to the output text
             decoded_text = self.vlm_options.decode_response(text)
-            yield VlmPrediction(
+            yield VlmPrediction(
+                text=decoded_text,
+                generation_time=generation_time,
+                num_tokens=num_tokens,
+            )
```
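The previously truncated debug f-string is completed by computing `num_tokens` from the generated id tensor, and the count is now also attached to each `VlmPrediction`. A minimal sketch of that counting step, with a random tensor standing in for the `generate()` output:

```python
import torch

# Stand-in for the (batch, sequence_length) tensor returned by model.generate(...)
generated_ids = torch.randint(0, 100, (2, 57))

num_tokens = None
if generated_ids.shape[0] > 0:
    # Length of the first sequence in the batch, as in the updated logging code
    num_tokens = int(generated_ids[0].shape[0])

print(f"Generated {num_tokens} tokens for batch size {generated_ids.shape[0]}.")
```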
docling/models/vlm_models_inline/mlx_model.py
CHANGED

```diff
@@ -50,9 +50,14 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
             from mlx_vlm.utils import load_config  # type: ignore
         except ImportError:
-
-
-
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
+            else:
+                raise ImportError(
+                    "mlx-vlm is not installed. It is not yet available on Python 3.14."
+                )

         repo_cache_folder = vlm_options.repo_id.replace("/", "--")

@@ -313,5 +318,6 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
                     text=decoded_output,
                     generation_time=generation_time,
                     generated_tokens=tokens,
+                    num_tokens=len(tokens),
                 )
                 _log.debug("MLX model: Released global lock")
```
docling/models/vlm_models_inline/nuextract_transformers_model.py
CHANGED

```diff
@@ -1,4 +1,5 @@
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -153,7 +154,10 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
             ),
             trust_remote_code=vlm_options.trust_remote_code,
         )
-
+        if sys.version_info < (3, 14):
+            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+        else:
+            self.vlm_model.eval()

         # Load generation config
         self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
@@ -278,13 +282,19 @@ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
         )

         # Optional logging
+        num_tokens = None
         if generated_ids.shape[0] > 0:  # type: ignore
+            num_tokens = int(generated_ids[0].shape[0])
             _log.debug(
-                f"Generated {
+                f"Generated {num_tokens} tokens in {generation_time:.2f}s "
                 f"for batch size {generated_ids.shape[0]}."  # type: ignore
             )

         for text in decoded_texts:
             # Apply decode_response to the output text
             decoded_text = self.vlm_options.decode_response(text)
-            yield VlmPrediction(
+            yield VlmPrediction(
+                text=decoded_text,
+                generation_time=generation_time,
+                num_tokens=num_tokens,
+            )
```
docling/models/vlm_models_inline/vllm_model.py
CHANGED

```diff
@@ -1,4 +1,5 @@
 import logging
+import sys
 import time
 from collections.abc import Iterable
 from pathlib import Path
@@ -8,7 +9,7 @@ import numpy as np
 from PIL.Image import Image

 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
@@ -87,7 +88,7 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         vlm_options: InlineVlmOptions,
     ):
         self.enabled = enabled
-        self.vlm_options = vlm_options
+        self.vlm_options: InlineVlmOptions = vlm_options

         self.llm = None
         self.sampling_params = None
@@ -100,7 +101,18 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             return

         from transformers import AutoProcessor
-
+
+        try:
+            from vllm import LLM, SamplingParams
+        except ImportError:
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "vllm is not installed. Please install it via `pip install vllm`."
+                )
+            else:
+                raise ImportError(
+                    "vllm is not installed. It is not yet available on Python 3.14."
+                )

         # Device selection
         self.device = decide_device(
@@ -222,7 +234,8 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
                 pages_with_images.append(page)

         if images:
-
+            with TimeRecorder(conv_res, "vlm_inference"):
+                predictions = list(self.process_images(images, user_prompts))
             for page, prediction in zip(pages_with_images, predictions):
                 page.predictions.vlm_response = prediction

@@ -288,13 +301,34 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         # Optional debug
         if outputs:
             try:
-
-                _log.debug(
+                num_tokens_within_batch = len(outputs[0].outputs[0].token_ids)
+                _log.debug(
+                    f"Generated {num_tokens_within_batch} tokens for batch in {generation_time:.2f}s."
+                )
             except Exception:
-
+                num_tokens_within_batch = 0

         # Emit predictions
         for output in outputs:
             text = output.outputs[0].text if output.outputs else ""
+            stop_reason = output.outputs[0].stop_reason if output.outputs else ""
+            generated_tokens = [
+                VlmPredictionToken(token=int(p)) for p in output.outputs[0].token_ids
+            ]
+            num_tokens = len(generated_tokens)
             decoded_text = self.vlm_options.decode_response(text)
-
+            if self.vlm_options.track_generated_tokens:
+                yield VlmPrediction(
+                    text=decoded_text,
+                    generation_time=generation_time,
+                    num_tokens=num_tokens,
+                    stop_reason=stop_reason,
+                    generated_tokens=generated_tokens,
+                )
+            else:
+                yield VlmPrediction(
+                    text=decoded_text,
+                    generation_time=generation_time,
+                    num_tokens=num_tokens,
+                    stop_reason=stop_reason,
+                )
```
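On the vLLM path, each output's `token_ids` and `stop_reason` are now propagated into `VlmPrediction`, with the full token list attached only when `track_generated_tokens` is enabled. A short sketch of reading those fields off a vLLM `RequestOutput`; the model id and prompt are illustrative and a working vLLM install is assumed:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # illustrative model id
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))

for output in outputs:
    completion = output.outputs[0]
    print(
        len(completion.token_ids),  # becomes VlmPrediction.num_tokens
        completion.stop_reason,     # becomes VlmPrediction.stop_reason
        completion.text,
    )
```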
docling/pipeline/asr_pipeline.py
CHANGED

```diff
@@ -1,6 +1,7 @@
 import logging
 import os
 import re
+import sys
 import tempfile
 from io import BytesIO
 from pathlib import Path
@@ -117,9 +118,15 @@ class _NativeWhisperModel:
         try:
             import whisper  # type: ignore
         except ImportError:
-
-
-
+            if sys.version_info < (3, 14):
+                raise ImportError(
+                    "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
+                )
+            else:
+                raise ImportError(
+                    "whisper is not installed. Unfortunately its dependencies are not yet available for Python 3.14."
+                )
+
         self.asr_options = asr_options
         self.max_tokens = asr_options.max_new_tokens
         self.temperature = asr_options.temperature
```
docling/pipeline/legacy_standard_pdf_pipeline.py
ADDED

```diff
@@ -0,0 +1,242 @@
+import logging
+import warnings
+from pathlib import Path
+from typing import Optional, cast
+
+import numpy as np
+from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import AssembledUnit, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.layout_model_specs import LayoutModelConfig
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
+from docling.models.factories import get_ocr_factory
+from docling.models.layout_model import LayoutModel
+from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
+from docling.models.page_preprocessing_model import (
+    PagePreprocessingModel,
+    PagePreprocessingOptions,
+)
+from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
+from docling.models.table_structure_model import TableStructureModel
+from docling.pipeline.base_pipeline import PaginatedPipeline
+from docling.utils.model_downloader import download_models
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class LegacyStandardPdfPipeline(PaginatedPipeline):
+    def __init__(self, pipeline_options: PdfPipelineOptions):
+        super().__init__(pipeline_options)
+        self.pipeline_options: PdfPipelineOptions
+
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            self.keep_images = (
+                self.pipeline_options.generate_page_images
+                or self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
+            )
+
+        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
+
+        ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)
+
+        self.build_pipe = [
+            # Pre-processing
+            PagePreprocessingModel(
+                options=PagePreprocessingOptions(
+                    images_scale=pipeline_options.images_scale,
+                )
+            ),
+            # OCR
+            ocr_model,
+            # Layout model
+            LayoutModel(
+                artifacts_path=self.artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                options=pipeline_options.layout_options,
+            ),
+            # Table structure model
+            TableStructureModel(
+                enabled=pipeline_options.do_table_structure,
+                artifacts_path=self.artifacts_path,
+                options=pipeline_options.table_structure_options,
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
+            # Page assemble
+            PageAssembleModel(options=PageAssembleOptions()),
+        ]
+
+        self.enrichment_pipe = [
+            # Code Formula Enrichment Model
+            CodeFormulaModel(
+                enabled=pipeline_options.do_code_enrichment
+                or pipeline_options.do_formula_enrichment,
+                artifacts_path=self.artifacts_path,
+                options=CodeFormulaModelOptions(
+                    do_code_enrichment=pipeline_options.do_code_enrichment,
+                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
+                ),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
+            *self.enrichment_pipe,
+        ]
+
+        if (
+            self.pipeline_options.do_formula_enrichment
+            or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_classification
+            or self.pipeline_options.do_picture_description
+        ):
+            self.keep_backend = True
+
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        warnings.warn(
+            "The usage of LegacyStandardPdfPipeline.download_models_hf() is deprecated "
+            "use instead the utility `docling-tools models download`, or "
+            "the upstream method docling.utils.models_downloader.download_all()",
+            DeprecationWarning,
+            stacklevel=3,
+        )
+
+        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
+        return output_dir
+
+    def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
+        factory = get_ocr_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.ocr_options,
+            enabled=self.pipeline_options.do_ocr,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
+        with TimeRecorder(conv_res, "page_init"):
+            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
+            if page._backend is not None and page._backend.is_valid():
+                page.size = page._backend.get_size()
+
+        return page
+
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        all_elements = []
+        all_headers = []
+        all_body = []
+
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+            for p in conv_res.pages:
+                if p.assembled is not None:
+                    for el in p.assembled.body:
+                        all_body.append(el)
+                    for el in p.assembled.headers:
+                        all_headers.append(el)
+                    for el in p.assembled.elements:
+                        all_elements.append(el)
+
+            conv_res.assembled = AssembledUnit(
+                elements=all_elements, headers=all_headers, body=all_body
+            )
+
+            conv_res.document = self.reading_order_model(conv_res)
+
+            # Generate page images in the output
+            if self.pipeline_options.generate_page_images:
+                for page in conv_res.pages:
+                    assert page.image is not None
+                    page_no = page.page_no + 1
+                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
+                        page.image, dpi=int(72 * self.pipeline_options.images_scale)
+                    )
+
+            # Generate images of the requested element types
+            with warnings.catch_warnings():  # deprecated generate_table_images
+                warnings.filterwarnings("ignore", category=DeprecationWarning)
+                if (
+                    self.pipeline_options.generate_picture_images
+                    or self.pipeline_options.generate_table_images
+                ):
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                            continue
+                        if (
+                            isinstance(element, PictureItem)
+                            and self.pipeline_options.generate_picture_images
+                        ) or (
+                            isinstance(element, TableItem)
+                            and self.pipeline_options.generate_table_images
+                        ):
+                            page_ix = element.prov[0].page_no - 1
+                            page = next(
+                                (p for p in conv_res.pages if p.page_no == page_ix),
+                                cast("Page", None),
+                            )
+                            assert page is not None
+                            assert page.size is not None
+                            assert page.image is not None
+
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+
+                            cropped_im = page.image.crop(crop_bbox.as_tuple())
+                            element.image = ImageRef.from_pil(
+                                cropped_im, dpi=int(72 * scale)
+                            )
+
+            # Aggregate confidence values for document:
+            if len(conv_res.pages) > 0:
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        category=RuntimeWarning,
+                        message="Mean of empty slice|All-NaN slice encountered",
+                    )
+                    conv_res.confidence.layout_score = float(
+                        np.nanmean(
+                            [c.layout_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.parse_score = float(
+                        np.nanquantile(
+                            [c.parse_score for c in conv_res.confidence.pages.values()],
+                            q=0.1,  # parse score should relate to worst 10% of pages.
+                        )
+                    )
+                    conv_res.confidence.table_score = float(
+                        np.nanmean(
+                            [c.table_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.ocr_score = float(
+                        np.nanmean(
+                            [c.ocr_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+
+        return conv_res
+
+    @classmethod
+    def get_default_options(cls) -> PdfPipelineOptions:
+        return PdfPipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, PdfDocumentBackend)
```