docling 2.56.1__py3-none-any.whl → 2.58.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling might be problematic.
- docling/backend/abstract_backend.py +24 -3
- docling/backend/asciidoc_backend.py +3 -3
- docling/backend/docling_parse_v4_backend.py +15 -4
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/html_backend.py +130 -20
- docling/backend/md_backend.py +27 -5
- docling/backend/msexcel_backend.py +115 -27
- docling/backend/mspowerpoint_backend.py +2 -2
- docling/backend/msword_backend.py +104 -29
- docling/backend/pdf_backend.py +9 -2
- docling/backend/pypdfium2_backend.py +12 -3
- docling/cli/main.py +85 -30
- docling/datamodel/asr_model_specs.py +408 -6
- docling/datamodel/backend_options.py +82 -0
- docling/datamodel/base_models.py +17 -2
- docling/datamodel/document.py +81 -48
- docling/datamodel/pipeline_options_asr_model.py +21 -1
- docling/document_converter.py +37 -45
- docling/document_extractor.py +12 -11
- docling/models/readingorder_model.py +6 -7
- docling/pipeline/asr_pipeline.py +139 -3
- docling/pipeline/vlm_pipeline.py +53 -33
- docling/utils/api_image_request.py +4 -4
- docling/utils/layout_postprocessor.py +23 -24
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/METADATA +4 -2
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/RECORD +30 -28
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/WHEEL +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/entry_points.txt +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/top_level.txt +0 -0
docling/pipeline/asr_pipeline.py
CHANGED

```diff
@@ -4,7 +4,7 @@ import re
 import tempfile
 from io import BytesIO
 from pathlib import Path
-from typing import List, Optional, Union, cast
+from typing import TYPE_CHECKING, List, Optional, Union, cast
 
 from docling_core.types.doc import DoclingDocument, DocumentOrigin
 
@@ -32,6 +32,7 @@ from docling.datamodel.pipeline_options import (
     AsrPipelineOptions,
 )
 from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrMlxWhisperOptions,
     InlineAsrNativeWhisperOptions,
     # AsrResponseFormat,
     InlineAsrOptions,
@@ -228,22 +229,157 @@ class _NativeWhisperModel:
         return convo
 
 
+class _MlxWhisperModel:
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        asr_options: InlineAsrMlxWhisperOptions,
+    ):
+        """
+        Transcriber using MLX Whisper for Apple Silicon optimization.
+        """
+        self.enabled = enabled
+
+        _log.info(f"artifacts-path: {artifacts_path}")
+        _log.info(f"accelerator_options: {accelerator_options}")
+
+        if self.enabled:
+            try:
+                import mlx_whisper  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "mlx-whisper is not installed. Please install it via `pip install mlx-whisper` or do `uv sync --extra asr`."
+                )
+            self.asr_options = asr_options
+            self.mlx_whisper = mlx_whisper
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=asr_options.supported_devices,
+            )
+            _log.info(f"Available device for MLX Whisper: {self.device}")
+
+            self.model_name = asr_options.repo_id
+            _log.info(f"loading _MlxWhisperModel({self.model_name})")
+
+            # MLX Whisper models are loaded differently - they use HuggingFace repos
+            self.model_path = self.model_name
+
+            # Store MLX-specific options
+            self.language = asr_options.language
+            self.task = asr_options.task
+            self.word_timestamps = asr_options.word_timestamps
+            self.no_speech_threshold = asr_options.no_speech_threshold
+            self.logprob_threshold = asr_options.logprob_threshold
+            self.compression_ratio_threshold = asr_options.compression_ratio_threshold
+
+    def run(self, conv_res: ConversionResult) -> ConversionResult:
+        audio_path: Path = Path(conv_res.input.file).resolve()
+
+        try:
+            conversation = self.transcribe(audio_path)
+
+            # Ensure we have a proper DoclingDocument
+            origin = DocumentOrigin(
+                filename=conv_res.input.file.name or "audio.wav",
+                mimetype="audio/x-wav",
+                binary_hash=conv_res.input.document_hash,
+            )
+            conv_res.document = DoclingDocument(
+                name=conv_res.input.file.stem or "audio.wav", origin=origin
+            )
+
+            for citem in conversation:
+                conv_res.document.add_text(
+                    label=DocItemLabel.TEXT, text=citem.to_string()
+                )
+
+            conv_res.status = ConversionStatus.SUCCESS
+            return conv_res
+
+        except Exception as exc:
+            _log.error(f"MLX Audio transcription has an error: {exc}")
+
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
+        """
+        Transcribe audio using MLX Whisper.
+
+        Args:
+            fpath: Path to audio file
+
+        Returns:
+            List of conversation items with timestamps
+        """
+        result = self.mlx_whisper.transcribe(
+            str(fpath),
+            path_or_hf_repo=self.model_path,
+            language=self.language,
+            task=self.task,
+            word_timestamps=self.word_timestamps,
+            no_speech_threshold=self.no_speech_threshold,
+            logprob_threshold=self.logprob_threshold,
+            compression_ratio_threshold=self.compression_ratio_threshold,
+        )
+
+        convo: list[_ConversationItem] = []
+
+        # MLX Whisper returns segments similar to native Whisper
+        for segment in result.get("segments", []):
+            item = _ConversationItem(
+                start_time=segment.get("start"),
+                end_time=segment.get("end"),
+                text=segment.get("text", "").strip(),
+                words=[],
+            )
+
+            # Add word-level timestamps if available
+            if self.word_timestamps and "words" in segment:
+                item.words = []
+                for word_data in segment["words"]:
+                    item.words.append(
+                        _ConversationWord(
+                            start_time=word_data.get("start"),
+                            end_time=word_data.get("end"),
+                            text=word_data.get("word", ""),
+                        )
+                    )
+            convo.append(item)
+
+        return convo
+
+
 class AsrPipeline(BasePipeline):
     def __init__(self, pipeline_options: AsrPipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = True
 
         self.pipeline_options: AsrPipelineOptions = pipeline_options
+        self._model: Union[_NativeWhisperModel, _MlxWhisperModel]
 
         if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
-            asr_options: InlineAsrNativeWhisperOptions = (
+            native_asr_options: InlineAsrNativeWhisperOptions = (
                 self.pipeline_options.asr_options
             )
             self._model = _NativeWhisperModel(
                 enabled=True,  # must be always enabled for this pipeline to make sense.
                 artifacts_path=self.artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
-                asr_options=asr_options,
+                asr_options=native_asr_options,
+            )
+        elif isinstance(self.pipeline_options.asr_options, InlineAsrMlxWhisperOptions):
+            mlx_asr_options: InlineAsrMlxWhisperOptions = (
+                self.pipeline_options.asr_options
+            )
+            self._model = _MlxWhisperModel(
+                enabled=True,  # must be always enabled for this pipeline to make sense.
+                artifacts_path=self.artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                asr_options=mlx_asr_options,
            )
        else:
            _log.error(f"No model support for {self.pipeline_options.asr_options}")
```
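The pipeline dispatches purely on the runtime type of `pipeline_options.asr_options`: an `InlineAsrMlxWhisperOptions` instance selects `_MlxWhisperModel`, while native Whisper options keep selecting `_NativeWhisperModel`. A minimal sketch of opting into the MLX branch, assuming `InlineAsrMlxWhisperOptions` accepts as constructor fields the attributes read in `__init__` above (`repo_id`, `language`, `task`, `word_timestamps`, ...); the model repo shown is only an example:

```python
# Sketch: field names are assumed from the attribute reads in the diff above.
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.datamodel.pipeline_options_asr_model import InlineAsrMlxWhisperOptions
from docling.pipeline.asr_pipeline import AsrPipeline

mlx_options = InlineAsrMlxWhisperOptions(
    repo_id="mlx-community/whisper-large-v3-turbo",  # hypothetical HF repo id
    language="en",
    task="transcribe",
    word_timestamps=True,
)

# AsrPipeline dispatches on the options type, so this selects _MlxWhisperModel
# (on Apple Silicon, per the supported_devices check in decide_device).
pipeline = AsrPipeline(AsrPipelineOptions(asr_options=mlx_options))
```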
docling/pipeline/vlm_pipeline.py
CHANGED

```diff
@@ -6,6 +6,7 @@ from typing import List, Optional, Union, cast
 
 from docling_core.types.doc import (
     BoundingBox,
+    ContentLayer,
     DocItem,
     DoclingDocument,
     ImageRef,
@@ -251,9 +252,9 @@ class VlmPipeline(PaginatedPipeline):
             # No code blocks found, return original text
             return text
 
-        for pg_idx, page in enumerate(conv_res.pages):
-            page_no = pg_idx + 1  # FIXME: might be incorrect
+        page_docs = []
 
+        for pg_idx, page in enumerate(conv_res.pages):
             predicted_text = ""
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text + "\n\n"
@@ -273,6 +274,24 @@ class VlmPipeline(PaginatedPipeline):
             )
             page_doc = backend.convert()
 
+            # Modify provenance in place for all items in the page document
+            for item, level in page_doc.iterate_items(
+                with_groups=True,
+                traverse_pictures=True,
+                included_content_layers=set(ContentLayer),
+            ):
+                if isinstance(item, DocItem):
+                    item.prov = [
+                        ProvenanceItem(
+                            page_no=pg_idx + 1,
+                            bbox=BoundingBox(
+                                t=0.0, b=0.0, l=0.0, r=0.0
+                            ),  # FIXME: would be nice not to have to "fake" it
+                            charspan=[0, 0],
+                        )
+                    ]
+
+            # Add page metadata to the page document before concatenation
             if page.image is not None:
                 pg_width = page.image.width
                 pg_height = page.image.height
@@ -280,27 +299,18 @@ class VlmPipeline(PaginatedPipeline):
                 pg_width = 1
                 pg_height = 1
 
-            conv_res.document.add_page(
-                page_no=page_no,
+            page_doc.add_page(
+                page_no=pg_idx + 1,
                 size=Size(width=pg_width, height=pg_height),
                 image=ImageRef.from_pil(image=page.image, dpi=72)
                 if page.image
                 else None,
             )
 
-            for item, level in page_doc.iterate_items():
-                item.prov = [
-                    ProvenanceItem(
-                        page_no=pg_idx + 1,
-                        bbox=BoundingBox(
-                            t=0.0, b=0.0, l=0.0, r=0.0
-                        ),  # FIXME: would be nice not to have to "fake" it
-                        charspan=[0, 0],
-                    )
-                ]
-                conv_res.document.append_child_item(child=item)
+            page_docs.append(page_doc)
 
-        return conv_res.document
+        final_doc = DoclingDocument.concatenate(docs=page_docs)
+        return final_doc
 
     def _turn_html_into_doc(self, conv_res):
         def _extract_html_code(text):
@@ -328,9 +338,9 @@ class VlmPipeline(PaginatedPipeline):
             # No code blocks found, return original text
             return text
 
-        for pg_idx, page in enumerate(conv_res.pages):
-            page_no = pg_idx + 1  # FIXME: might be incorrect
+        page_docs = []
 
+        for pg_idx, page in enumerate(conv_res.pages):
             predicted_text = ""
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text + "\n\n"
@@ -341,7 +351,7 @@ class VlmPipeline(PaginatedPipeline):
             out_doc = InputDocument(
                 path_or_stream=response_bytes,
                 filename=conv_res.input.file.name,
-                format=InputFormat.
+                format=InputFormat.HTML,
                 backend=HTMLDocumentBackend,
             )
             backend = HTMLDocumentBackend(
@@ -350,6 +360,24 @@ class VlmPipeline(PaginatedPipeline):
             )
             page_doc = backend.convert()
 
+            # Modify provenance in place for all items in the page document
+            for item, level in page_doc.iterate_items(
+                with_groups=True,
+                traverse_pictures=True,
+                included_content_layers=set(ContentLayer),
+            ):
+                if isinstance(item, DocItem):
+                    item.prov = [
+                        ProvenanceItem(
+                            page_no=pg_idx + 1,
+                            bbox=BoundingBox(
+                                t=0.0, b=0.0, l=0.0, r=0.0
+                            ),  # FIXME: would be nice not to have to "fake" it
+                            charspan=[0, 0],
+                        )
+                    ]
+
+            # Add page metadata to the page document before concatenation
             if page.image is not None:
                 pg_width = page.image.width
                 pg_height = page.image.height
@@ -357,27 +385,19 @@ class VlmPipeline(PaginatedPipeline):
                 pg_width = 1
                 pg_height = 1
 
-            conv_res.document.add_page(
-                page_no=page_no,
+            page_doc.add_page(
+                page_no=pg_idx + 1,
                 size=Size(width=pg_width, height=pg_height),
                 image=ImageRef.from_pil(image=page.image, dpi=72)
                 if page.image
                 else None,
             )
 
-            for item, level in page_doc.iterate_items():
-                item.prov = [
-                    ProvenanceItem(
-                        page_no=pg_idx + 1,
-                        bbox=BoundingBox(
-                            t=0.0, b=0.0, l=0.0, r=0.0
-                        ),  # FIXME: would be nice not to have to "fake" it
-                        charspan=[0, 0],
-                    )
-                ]
-                conv_res.document.append_child_item(child=item)
+            page_docs.append(page_doc)
 
-        return conv_res.document
+        # Concatenate all page documents to preserve hierarchy
+        final_doc = DoclingDocument.concatenate(docs=page_docs)
+        return final_doc
 
    @classmethod
    def get_default_options(cls) -> VlmPipelineOptions:
```
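Both `_turn_md_into_doc` and `_turn_html_into_doc` now follow the same shape: build one `DoclingDocument` per page from the VLM response, stamp every `DocItem` with a single-page provenance entry, register the page metadata, and merge at the end. A minimal sketch of that assembly pattern, using only the `DoclingDocument` calls that appear in this diff (the labels and sizes here are placeholders):

```python
from docling_core.types.doc import DocItemLabel, DoclingDocument, Size

page_docs: list[DoclingDocument] = []
for pg_idx, page_text in enumerate(["first page", "second page"]):
    page_doc = DoclingDocument(name=f"page_{pg_idx + 1}")
    page_doc.add_text(label=DocItemLabel.TEXT, text=page_text)
    # Register the page itself so page numbers survive the merge.
    page_doc.add_page(page_no=pg_idx + 1, size=Size(width=1, height=1))
    page_docs.append(page_doc)

# concatenate() merges the per-page trees into one document; this is what
# replaces the old per-item append_child_item() loop.
final_doc = DoclingDocument.concatenate(docs=page_docs)
```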
docling/utils/api_image_request.py
CHANGED

```diff
@@ -2,7 +2,7 @@ import base64
 import json
 import logging
 from io import BytesIO
-from typing import Dict, List, Optional
+from typing import Optional
 
 import requests
 from PIL import Image
@@ -19,7 +19,7 @@ def api_image_request(
     prompt: str,
     url: AnyUrl,
     timeout: float = 20,
-    headers: Optional[Dict[str, str]] = None,
+    headers: Optional[dict[str, str]] = None,
     **params,
 ) -> str:
     img_io = BytesIO()
@@ -69,8 +69,8 @@ def api_image_request_streaming(
     url: AnyUrl,
     *,
     timeout: float = 20,
-    headers: Optional[Dict[str, str]] = None,
-    generation_stoppers: List[GenerationStopper] = [],
+    headers: Optional[dict[str, str]] = None,
+    generation_stoppers: list[GenerationStopper] = [],
     **params,
 ) -> str:
     """
```
docling/utils/layout_postprocessor.py
CHANGED

```diff
@@ -2,7 +2,6 @@ import bisect
 import logging
 import sys
 from collections import defaultdict
-from typing import Dict, List, Set, Tuple
 
 from docling_core.types.doc import DocItemLabel, Size
 from docling_core.types.doc.page import TextCell
@@ -39,7 +38,7 @@ class UnionFind:
             self.parent[root_y] = root_x
             self.rank[root_x] += 1
 
-    def get_groups(self) -> Dict[int, List[int]]:
+    def get_groups(self) -> dict[int, list[int]]:
         """Returns groups as {root: [elements]}."""
         groups = defaultdict(list)
         for elem in self.parent:
@@ -50,13 +49,13 @@ class UnionFind:
 class SpatialClusterIndex:
     """Efficient spatial indexing for clusters using R-tree and interval trees."""
 
-    def __init__(self, clusters: List[Cluster]):
+    def __init__(self, clusters: list[Cluster]):
         p = index.Property()
         p.dimension = 2
         self.spatial_index = index.Index(properties=p)
         self.x_intervals = IntervalTree()
         self.y_intervals = IntervalTree()
-        self.clusters_by_id: Dict[int, Cluster] = {}
+        self.clusters_by_id: dict[int, Cluster] = {}
 
         for cluster in clusters:
             self.add_cluster(cluster)
@@ -72,7 +71,7 @@ class SpatialClusterIndex:
         self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
         del self.clusters_by_id[cluster.id]
 
-    def find_candidates(self, bbox: BoundingBox) -> Set[int]:
+    def find_candidates(self, bbox: BoundingBox) -> set[int]:
         """Find potential overlapping cluster IDs using all indexes."""
         spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
         x_candidates = self.x_intervals.find_containing(
@@ -123,13 +122,13 @@ class IntervalTree:
     """Memory-efficient interval tree for 1D overlap queries."""
 
     def __init__(self):
-        self.intervals: List[Interval] = []  # Sorted by min_val
+        self.intervals: list[Interval] = []  # Sorted by min_val
 
     def insert(self, min_val: float, max_val: float, id: int):
         interval = Interval(min_val, max_val, id)
         bisect.insort(self.intervals, interval)
 
-    def find_containing(self, point: float) -> Set[int]:
+    def find_containing(self, point: float) -> set[int]:
         """Find all intervals containing the point."""
         pos = bisect.bisect_left(self.intervals, point)
         result = set()
@@ -196,7 +195,7 @@ class LayoutPostprocessor:
     }
 
     def __init__(
-        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+        self, page: Page, clusters: list[Cluster], options: LayoutOptions
     ) -> None:
         """Initialize processor with page and clusters."""
 
@@ -219,7 +218,7 @@ class LayoutPostprocessor:
             [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
         )
 
-    def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
+    def postprocess(self) -> tuple[list[Cluster], list[TextCell]]:
         """Main processing pipeline."""
         self.regular_clusters = self._process_regular_clusters()
         self.special_clusters = self._process_special_clusters()
@@ -254,7 +253,7 @@ class LayoutPostprocessor:
 
         return final_clusters, self.cells
 
-    def _process_regular_clusters(self) -> List[Cluster]:
+    def _process_regular_clusters(self) -> list[Cluster]:
         """Process regular clusters with iterative refinement."""
         clusters = [
             c
@@ -311,7 +310,7 @@ class LayoutPostprocessor:
 
         return clusters
 
-    def _process_special_clusters(self) -> List[Cluster]:
+    def _process_special_clusters(self) -> list[Cluster]:
         special_clusters = [
             c
             for c in self.special_clusters
@@ -381,7 +380,7 @@ class LayoutPostprocessor:
 
         return picture_clusters + wrapper_clusters
 
-    def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
+    def _handle_cross_type_overlaps(self, special_clusters) -> list[Cluster]:
         """Handle overlaps between regular and wrapper clusters before child assignment.
 
         In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
@@ -454,7 +453,7 @@ class LayoutPostprocessor:
 
     def _select_best_cluster_from_group(
         self,
-        group_clusters: List[Cluster],
+        group_clusters: list[Cluster],
         params: dict,
     ) -> Cluster:
         """Select best cluster from a group of overlapping clusters based on all rules."""
@@ -487,11 +486,11 @@ class LayoutPostprocessor:
 
     def _remove_overlapping_clusters(
         self,
-        clusters: List[Cluster],
+        clusters: list[Cluster],
         cluster_type: str,
         overlap_threshold: float = 0.8,
         containment_threshold: float = 0.8,
-    ) -> List[Cluster]:
+    ) -> list[Cluster]:
         if not clusters:
             return []
 
@@ -544,7 +543,7 @@ class LayoutPostprocessor:
 
     def _select_best_cluster(
         self,
-        clusters: List[Cluster],
+        clusters: list[Cluster],
         area_threshold: float,
         conf_threshold: float,
     ) -> Cluster:
@@ -572,7 +571,7 @@ class LayoutPostprocessor:
 
         return current_best if current_best else clusters[0]
 
-    def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
+    def _deduplicate_cells(self, cells: list[TextCell]) -> list[TextCell]:
         """Ensure each cell appears only once, maintaining order of first appearance."""
         seen_ids = set()
         unique_cells = []
@@ -583,8 +582,8 @@ class LayoutPostprocessor:
         return unique_cells
 
     def _assign_cells_to_clusters(
-        self, clusters: List[Cluster], min_overlap: float = 0.2
-    ) -> List[Cluster]:
+        self, clusters: list[Cluster], min_overlap: float = 0.2
+    ) -> list[Cluster]:
         """Assign cells to best overlapping cluster."""
         for cluster in clusters:
             cluster.cells = []
@@ -616,7 +615,7 @@ class LayoutPostprocessor:
 
         return clusters
 
-    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
+    def _find_unassigned_cells(self, clusters: list[Cluster]) -> list[TextCell]:
         """Find cells not assigned to any cluster."""
         assigned = {cell.index for cluster in clusters for cell in cluster.cells}
         return [
@@ -625,7 +624,7 @@ class LayoutPostprocessor:
             if cell.index not in assigned and cell.text.strip()
         ]
 
-    def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
+    def _adjust_cluster_bboxes(self, clusters: list[Cluster]) -> list[Cluster]:
         """Adjust cluster bounding boxes to contain their cells."""
         for cluster in clusters:
             if not cluster.cells:
@@ -651,13 +650,13 @@ class LayoutPostprocessor:
 
         return clusters
 
-    def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
+    def _sort_cells(self, cells: list[TextCell]) -> list[TextCell]:
         """Sort cells in native reading order."""
         return sorted(cells, key=lambda c: (c.index))
 
     def _sort_clusters(
-        self, clusters: List[Cluster], mode: str = "id"
-    ) -> List[Cluster]:
+        self, clusters: list[Cluster], mode: str = "id"
+    ) -> list[Cluster]:
         """Sort clusters in reading order (top-to-bottom, left-to-right)."""
         if mode == "id":  # sort in the order the cells are printed in the PDF.
             return sorted(
```
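Every hunk in this file applies the same mechanical change: the deprecated `typing.Dict`/`List`/`Set`/`Tuple` aliases become the PEP 585 builtin generics, letting the `typing` import be dropped. The pattern in isolation, as a simplified stand-in for `UnionFind.get_groups` (not the class's exact implementation):

```python
from collections import defaultdict

# Before: from typing import Dict, List
#         def get_groups(self) -> Dict[int, List[int]]: ...

# After: builtin generics, valid on Python 3.9+ with no typing import.
def get_groups(parent: dict[int, int]) -> dict[int, list[int]]:
    """Group each element under its root, as {root: [elements]}."""
    groups: dict[int, list[int]] = defaultdict(list)
    for elem, root in parent.items():
        groups[root].append(elem)
    return dict(groups)
```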
{docling-2.56.1.dist-info → docling-2.58.0.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.56.1
+Version: 2.58.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.2
-Requires-Dist: docling-parse<5.0.0,>=4.
+Requires-Dist: docling-parse<5.0.0,>=4.7.0
 Requires-Dist: docling-ibm-models<4,>=3.9.1
 Requires-Dist: filetype<2.0.0,>=1.2.0
 Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -69,6 +69,7 @@ Provides-Extra: rapidocr
 Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
 Provides-Extra: asr
+Requires-Dist: mlx-whisper>=0.4.3; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "asr"
 Requires-Dist: openai-whisper>=20250625; extra == "asr"
 Dynamic: license-file
 
@@ -96,6 +97,7 @@ Dynamic: license-file
 [](https://pepy.tech/projects/docling)
 [](https://apify.com/vancura/docling)
 [](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
+[](https://docling.ai/discord)
 [](https://www.bestpractices.dev/projects/10101)
 [](https://lfaidata.foundation/projects/)
```