docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +39 -18
- docling/backend/docling_parse_backend.py +61 -59
- docling/backend/docling_parse_v2_backend.py +72 -62
- docling/backend/docling_parse_v4_backend.py +21 -19
- docling/backend/md_backend.py +101 -81
- docling/backend/mspowerpoint_backend.py +72 -113
- docling/backend/msword_backend.py +99 -80
- docling/backend/noop_backend.py +51 -0
- docling/backend/pypdfium2_backend.py +127 -53
- docling/cli/main.py +82 -14
- docling/datamodel/asr_model_specs.py +92 -0
- docling/datamodel/base_models.py +21 -4
- docling/datamodel/document.py +3 -1
- docling/datamodel/pipeline_options.py +15 -2
- docling/datamodel/pipeline_options_asr_model.py +57 -0
- docling/datamodel/pipeline_options_vlm_model.py +4 -4
- docling/document_converter.py +8 -0
- docling/models/api_vlm_model.py +3 -1
- docling/models/base_model.py +1 -1
- docling/models/base_ocr_model.py +33 -11
- docling/models/easyocr_model.py +1 -1
- docling/models/layout_model.py +2 -3
- docling/models/ocr_mac_model.py +1 -1
- docling/models/page_preprocessing_model.py +3 -6
- docling/models/rapid_ocr_model.py +1 -1
- docling/models/readingorder_model.py +3 -3
- docling/models/tesseract_ocr_cli_model.py +4 -3
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
- docling/models/vlm_models_inline/mlx_model.py +3 -1
- docling/pipeline/asr_pipeline.py +253 -0
- docling/pipeline/base_pipeline.py +11 -0
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/utils/layout_postprocessor.py +11 -6
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ import re
|
|
2
2
|
import warnings
|
3
3
|
from collections.abc import Iterable
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import Optional
|
5
|
+
from typing import Literal, Optional
|
6
6
|
|
7
7
|
import numpy as np
|
8
8
|
from PIL import ImageDraw
|
@@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
|
|
17
17
|
|
18
18
|
class PagePreprocessingOptions(BaseModel):
|
19
19
|
images_scale: Optional[float]
|
20
|
-
create_parsed_page: bool
|
21
20
|
|
22
21
|
|
23
22
|
class PagePreprocessingModel(BasePageModel):
|
@@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
|
|
66
65
|
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
67
66
|
assert page._backend is not None
|
68
67
|
|
69
|
-
page.
|
70
|
-
|
71
|
-
if self.options.create_parsed_page:
|
72
|
-
page.parsed_page = page._backend.get_segmented_page()
|
68
|
+
page.parsed_page = page._backend.get_segmented_page()
|
69
|
+
assert page.parsed_page is not None
|
73
70
|
|
74
71
|
# Rate the text quality from the PDF parser, and aggregate on page
|
75
72
|
text_scores = []
|
@@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
|
|
134
134
|
all_ocr_cells.extend(cells)
|
135
135
|
|
136
136
|
# Post-process the cells
|
137
|
-
|
137
|
+
self.post_process_cells(all_ocr_cells, page)
|
138
138
|
|
139
139
|
# DEBUG code:
|
140
140
|
if settings.debug.visualize_ocr:
|
@@ -124,7 +124,7 @@ class ReadingOrderModel:
|
|
124
124
|
page_no = page.page_no + 1
|
125
125
|
size = page.size
|
126
126
|
|
127
|
-
assert size is not None
|
127
|
+
assert size is not None, "Page size is not initialized."
|
128
128
|
|
129
129
|
out_doc.add_page(page_no=page_no, size=size)
|
130
130
|
|
@@ -334,12 +334,12 @@ class ReadingOrderModel:
|
|
334
334
|
"Labels of merged elements must match."
|
335
335
|
)
|
336
336
|
prov = ProvenanceItem(
|
337
|
-
page_no=
|
337
|
+
page_no=merged_elem.page_no + 1,
|
338
338
|
charspan=(
|
339
339
|
len(new_item.text) + 1,
|
340
340
|
len(new_item.text) + 1 + len(merged_elem.text),
|
341
341
|
),
|
342
|
-
bbox=
|
342
|
+
bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
|
343
343
|
)
|
344
344
|
new_item.text += f" {merged_elem.text}"
|
345
345
|
new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
|
@@ -99,12 +99,12 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
99
99
|
|
100
100
|
return name, version
|
101
101
|
|
102
|
-
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
|
102
|
+
def _run_tesseract(self, ifilename: str, osd: Optional[pd.DataFrame]):
|
103
103
|
r"""
|
104
104
|
Run tesseract CLI
|
105
105
|
"""
|
106
106
|
cmd = [self.options.tesseract_cmd]
|
107
|
-
if self._is_auto:
|
107
|
+
if self._is_auto and osd is not None:
|
108
108
|
lang = self._parse_language(osd)
|
109
109
|
if lang is not None:
|
110
110
|
cmd.append("-l")
|
@@ -231,6 +231,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
231
231
|
fname = image_file.name
|
232
232
|
high_res_image.save(image_file)
|
233
233
|
doc_orientation = 0
|
234
|
+
df_osd: Optional[pd.DataFrame] = None
|
234
235
|
try:
|
235
236
|
df_osd = self._perform_osd(fname)
|
236
237
|
doc_orientation = _parse_orientation(df_osd)
|
@@ -305,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
305
306
|
all_ocr_cells.append(cell)
|
306
307
|
|
307
308
|
# Post-process the cells
|
308
|
-
|
309
|
+
self.post_process_cells(all_ocr_cells, page)
|
309
310
|
|
310
311
|
# DEBUG code:
|
311
312
|
if settings.debug.visualize_ocr:
|
@@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
235
235
|
all_ocr_cells.extend(cells)
|
236
236
|
|
237
237
|
# Post-process the cells
|
238
|
-
|
238
|
+
self.post_process_cells(all_ocr_cells, page)
|
239
239
|
|
240
240
|
# DEBUG code:
|
241
241
|
if settings.debug.visualize_ocr:
|
@@ -99,6 +99,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
99
99
|
self.vlm_model = model_cls.from_pretrained(
|
100
100
|
artifacts_path,
|
101
101
|
device_map=self.device,
|
102
|
+
torch_dtype=self.vlm_options.torch_dtype,
|
102
103
|
_attn_implementation=(
|
103
104
|
"flash_attention_2"
|
104
105
|
if self.device.startswith("cuda")
|
@@ -122,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
122
123
|
with TimeRecorder(conv_res, "vlm"):
|
123
124
|
assert page.size is not None
|
124
125
|
|
125
|
-
hi_res_image = page.get_image(
|
126
|
+
hi_res_image = page.get_image(
|
127
|
+
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
128
|
+
)
|
126
129
|
|
127
130
|
# Define prompt structure
|
128
131
|
prompt = self.formulate_prompt()
|
@@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
73
73
|
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
|
74
74
|
assert page.size is not None
|
75
75
|
|
76
|
-
hi_res_image = page.get_image(
|
76
|
+
hi_res_image = page.get_image(
|
77
|
+
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
78
|
+
)
|
77
79
|
if hi_res_image is not None:
|
78
80
|
im_width, im_height = hi_res_image.size
|
79
81
|
|
@@ -0,0 +1,253 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import re
|
4
|
+
from io import BytesIO
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import List, Optional, Union, cast
|
7
|
+
|
8
|
+
from docling_core.types.doc import DoclingDocument, DocumentOrigin
|
9
|
+
|
10
|
+
# import whisper # type: ignore
|
11
|
+
# import librosa
|
12
|
+
# import numpy as np
|
13
|
+
# import soundfile as sf # type: ignore
|
14
|
+
from docling_core.types.doc.labels import DocItemLabel
|
15
|
+
from pydantic import BaseModel, Field, validator
|
16
|
+
|
17
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
18
|
+
from docling.backend.noop_backend import NoOpBackend
|
19
|
+
|
20
|
+
# from pydub import AudioSegment # type: ignore
|
21
|
+
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
|
22
|
+
from docling.datamodel.accelerator_options import (
|
23
|
+
AcceleratorOptions,
|
24
|
+
)
|
25
|
+
from docling.datamodel.base_models import (
|
26
|
+
ConversionStatus,
|
27
|
+
FormatToMimeType,
|
28
|
+
)
|
29
|
+
from docling.datamodel.document import ConversionResult, InputDocument
|
30
|
+
from docling.datamodel.pipeline_options import (
|
31
|
+
AsrPipelineOptions,
|
32
|
+
)
|
33
|
+
from docling.datamodel.pipeline_options_asr_model import (
|
34
|
+
InlineAsrNativeWhisperOptions,
|
35
|
+
# AsrResponseFormat,
|
36
|
+
InlineAsrOptions,
|
37
|
+
)
|
38
|
+
from docling.datamodel.pipeline_options_vlm_model import (
|
39
|
+
InferenceFramework,
|
40
|
+
)
|
41
|
+
from docling.datamodel.settings import settings
|
42
|
+
from docling.pipeline.base_pipeline import BasePipeline
|
43
|
+
from docling.utils.accelerator_utils import decide_device
|
44
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
45
|
+
|
46
|
+
_log = logging.getLogger(__name__)
|
47
|
+
|
48
|
+
|
49
|
+
class _ConversationWord(BaseModel):
|
50
|
+
text: str
|
51
|
+
start_time: Optional[float] = Field(
|
52
|
+
None, description="Start time in seconds from video start"
|
53
|
+
)
|
54
|
+
end_time: Optional[float] = Field(
|
55
|
+
None, ge=0, description="End time in seconds from video start"
|
56
|
+
)
|
57
|
+
|
58
|
+
|
59
|
+
class _ConversationItem(BaseModel):
|
60
|
+
text: str
|
61
|
+
start_time: Optional[float] = Field(
|
62
|
+
None, description="Start time in seconds from video start"
|
63
|
+
)
|
64
|
+
end_time: Optional[float] = Field(
|
65
|
+
None, ge=0, description="End time in seconds from video start"
|
66
|
+
)
|
67
|
+
speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
|
68
|
+
speaker: Optional[str] = Field(
|
69
|
+
None, description="Speaker name, defaults to speaker-{speaker_id}"
|
70
|
+
)
|
71
|
+
words: Optional[list[_ConversationWord]] = Field(
|
72
|
+
None, description="Individual words with time-stamps"
|
73
|
+
)
|
74
|
+
|
75
|
+
def __lt__(self, other):
|
76
|
+
if not isinstance(other, _ConversationItem):
|
77
|
+
return NotImplemented
|
78
|
+
return self.start_time < other.start_time
|
79
|
+
|
80
|
+
def __eq__(self, other):
|
81
|
+
if not isinstance(other, _ConversationItem):
|
82
|
+
return NotImplemented
|
83
|
+
return self.start_time == other.start_time
|
84
|
+
|
85
|
+
def to_string(self) -> str:
|
86
|
+
"""Format the conversation entry as a string"""
|
87
|
+
result = ""
|
88
|
+
if (self.start_time is not None) and (self.end_time is not None):
|
89
|
+
result += f"[time: {self.start_time}-{self.end_time}] "
|
90
|
+
|
91
|
+
if self.speaker is not None:
|
92
|
+
result += f"[speaker:{self.speaker}] "
|
93
|
+
|
94
|
+
result += self.text
|
95
|
+
return result
|
96
|
+
|
97
|
+
|
98
|
+
class _NativeWhisperModel:
|
99
|
+
def __init__(
|
100
|
+
self,
|
101
|
+
enabled: bool,
|
102
|
+
artifacts_path: Optional[Path],
|
103
|
+
accelerator_options: AcceleratorOptions,
|
104
|
+
asr_options: InlineAsrNativeWhisperOptions,
|
105
|
+
):
|
106
|
+
"""
|
107
|
+
Transcriber using native Whisper.
|
108
|
+
"""
|
109
|
+
self.enabled = enabled
|
110
|
+
|
111
|
+
_log.info(f"artifacts-path: {artifacts_path}")
|
112
|
+
_log.info(f"accelerator_options: {accelerator_options}")
|
113
|
+
|
114
|
+
if self.enabled:
|
115
|
+
try:
|
116
|
+
import whisper # type: ignore
|
117
|
+
except ImportError:
|
118
|
+
raise ImportError(
|
119
|
+
"whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
|
120
|
+
)
|
121
|
+
self.asr_options = asr_options
|
122
|
+
self.max_tokens = asr_options.max_new_tokens
|
123
|
+
self.temperature = asr_options.temperature
|
124
|
+
|
125
|
+
self.device = decide_device(
|
126
|
+
accelerator_options.device,
|
127
|
+
supported_devices=asr_options.supported_devices,
|
128
|
+
)
|
129
|
+
_log.info(f"Available device for Whisper: {self.device}")
|
130
|
+
|
131
|
+
self.model_name = asr_options.repo_id
|
132
|
+
_log.info(f"loading _NativeWhisperModel({self.model_name})")
|
133
|
+
if artifacts_path is not None:
|
134
|
+
_log.info(f"loading {self.model_name} from {artifacts_path}")
|
135
|
+
self.model = whisper.load_model(
|
136
|
+
name=self.model_name,
|
137
|
+
device=self.device,
|
138
|
+
download_root=str(artifacts_path),
|
139
|
+
)
|
140
|
+
else:
|
141
|
+
self.model = whisper.load_model(
|
142
|
+
name=self.model_name, device=self.device
|
143
|
+
)
|
144
|
+
|
145
|
+
self.verbose = asr_options.verbose
|
146
|
+
self.timestamps = asr_options.timestamps
|
147
|
+
self.word_timestamps = asr_options.word_timestamps
|
148
|
+
|
149
|
+
def run(self, conv_res: ConversionResult) -> ConversionResult:
|
150
|
+
audio_path: Path = Path(conv_res.input.file).resolve()
|
151
|
+
|
152
|
+
try:
|
153
|
+
conversation = self.transcribe(audio_path)
|
154
|
+
|
155
|
+
# Ensure we have a proper DoclingDocument
|
156
|
+
origin = DocumentOrigin(
|
157
|
+
filename=conv_res.input.file.name or "audio.wav",
|
158
|
+
mimetype="audio/x-wav",
|
159
|
+
binary_hash=conv_res.input.document_hash,
|
160
|
+
)
|
161
|
+
conv_res.document = DoclingDocument(
|
162
|
+
name=conv_res.input.file.stem or "audio.wav", origin=origin
|
163
|
+
)
|
164
|
+
|
165
|
+
for citem in conversation:
|
166
|
+
conv_res.document.add_text(
|
167
|
+
label=DocItemLabel.TEXT, text=citem.to_string()
|
168
|
+
)
|
169
|
+
|
170
|
+
conv_res.status = ConversionStatus.SUCCESS
|
171
|
+
return conv_res
|
172
|
+
|
173
|
+
except Exception as exc:
|
174
|
+
_log.error(f"Audio tranciption has an error: {exc}")
|
175
|
+
|
176
|
+
conv_res.status = ConversionStatus.FAILURE
|
177
|
+
return conv_res
|
178
|
+
|
179
|
+
def transcribe(self, fpath: Path) -> list[_ConversationItem]:
|
180
|
+
result = self.model.transcribe(
|
181
|
+
str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
|
182
|
+
)
|
183
|
+
|
184
|
+
convo: list[_ConversationItem] = []
|
185
|
+
for _ in result["segments"]:
|
186
|
+
item = _ConversationItem(
|
187
|
+
start_time=_["start"], end_time=_["end"], text=_["text"], words=[]
|
188
|
+
)
|
189
|
+
if "words" in _ and self.word_timestamps:
|
190
|
+
item.words = []
|
191
|
+
for __ in _["words"]:
|
192
|
+
item.words.append(
|
193
|
+
_ConversationWord(
|
194
|
+
start_time=__["start"],
|
195
|
+
end_time=__["end"],
|
196
|
+
text=__["word"],
|
197
|
+
)
|
198
|
+
)
|
199
|
+
convo.append(item)
|
200
|
+
|
201
|
+
return convo
|
202
|
+
|
203
|
+
|
204
|
+
class AsrPipeline(BasePipeline):
|
205
|
+
def __init__(self, pipeline_options: AsrPipelineOptions):
|
206
|
+
super().__init__(pipeline_options)
|
207
|
+
self.keep_backend = True
|
208
|
+
|
209
|
+
self.pipeline_options: AsrPipelineOptions = pipeline_options
|
210
|
+
|
211
|
+
artifacts_path: Optional[Path] = None
|
212
|
+
if pipeline_options.artifacts_path is not None:
|
213
|
+
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
|
214
|
+
elif settings.artifacts_path is not None:
|
215
|
+
artifacts_path = Path(settings.artifacts_path).expanduser()
|
216
|
+
|
217
|
+
if artifacts_path is not None and not artifacts_path.is_dir():
|
218
|
+
raise RuntimeError(
|
219
|
+
f"The value of {artifacts_path=} is not valid. "
|
220
|
+
"When defined, it must point to a folder containing all models required by the pipeline."
|
221
|
+
)
|
222
|
+
|
223
|
+
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
|
224
|
+
asr_options: InlineAsrNativeWhisperOptions = (
|
225
|
+
self.pipeline_options.asr_options
|
226
|
+
)
|
227
|
+
self._model = _NativeWhisperModel(
|
228
|
+
enabled=True, # must be always enabled for this pipeline to make sense.
|
229
|
+
artifacts_path=artifacts_path,
|
230
|
+
accelerator_options=pipeline_options.accelerator_options,
|
231
|
+
asr_options=asr_options,
|
232
|
+
)
|
233
|
+
else:
|
234
|
+
_log.error(f"No model support for {self.pipeline_options.asr_options}")
|
235
|
+
|
236
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
237
|
+
status = ConversionStatus.SUCCESS
|
238
|
+
return status
|
239
|
+
|
240
|
+
@classmethod
|
241
|
+
def get_default_options(cls) -> AsrPipelineOptions:
|
242
|
+
return AsrPipelineOptions()
|
243
|
+
|
244
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
245
|
+
_log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
|
246
|
+
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
247
|
+
self._model.run(conv_res=conv_res)
|
248
|
+
|
249
|
+
return conv_res
|
250
|
+
|
251
|
+
@classmethod
|
252
|
+
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
253
|
+
return isinstance(backend, NoOpBackend)
|
@@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
193
193
|
)
|
194
194
|
raise e
|
195
195
|
|
196
|
+
# Filter out uninitialized pages (those with size=None) that may remain
|
197
|
+
# after timeout or processing failures to prevent assertion errors downstream
|
198
|
+
initial_page_count = len(conv_res.pages)
|
199
|
+
conv_res.pages = [page for page in conv_res.pages if page.size is not None]
|
200
|
+
|
201
|
+
if len(conv_res.pages) < initial_page_count:
|
202
|
+
_log.info(
|
203
|
+
f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
|
204
|
+
f"due to timeout or processing failures"
|
205
|
+
)
|
206
|
+
|
196
207
|
return conv_res
|
197
208
|
|
198
209
|
def _unload(self, conv_res: ConversionResult) -> ConversionResult:
|
@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
|
|
8
8
|
from docling_core.types.doc.page import TextCell
|
9
9
|
from rtree import index
|
10
10
|
|
11
|
-
from docling.datamodel.base_models import BoundingBox, Cluster
|
11
|
+
from docling.datamodel.base_models import BoundingBox, Cluster, Page
|
12
12
|
|
13
13
|
_log = logging.getLogger(__name__)
|
14
14
|
|
@@ -194,11 +194,11 @@ class LayoutPostprocessor:
|
|
194
194
|
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
195
195
|
}
|
196
196
|
|
197
|
-
def __init__(self,
|
198
|
-
"""Initialize processor with
|
199
|
-
|
200
|
-
self.
|
201
|
-
self.page_size =
|
197
|
+
def __init__(self, page: Page, clusters: List[Cluster]) -> None:
|
198
|
+
"""Initialize processor with page and clusters."""
|
199
|
+
self.cells = page.cells
|
200
|
+
self.page = page
|
201
|
+
self.page_size = page.size
|
202
202
|
self.all_clusters = clusters
|
203
203
|
self.regular_clusters = [
|
204
204
|
c for c in clusters if c.label not in self.SPECIAL_TYPES
|
@@ -240,6 +240,10 @@ class LayoutPostprocessor:
|
|
240
240
|
for child in cluster.children:
|
241
241
|
child.cells = self._sort_cells(child.cells)
|
242
242
|
|
243
|
+
assert self.page.parsed_page is not None
|
244
|
+
self.page.parsed_page.textline_cells = self.cells
|
245
|
+
self.page.parsed_page.has_lines = len(self.cells) > 0
|
246
|
+
|
243
247
|
return final_clusters, self.cells
|
244
248
|
|
245
249
|
def _process_regular_clusters(self) -> List[Cluster]:
|
@@ -301,6 +305,7 @@ class LayoutPostprocessor:
|
|
301
305
|
special_clusters = self._handle_cross_type_overlaps(special_clusters)
|
302
306
|
|
303
307
|
# Calculate page area from known page size
|
308
|
+
assert self.page_size is not None
|
304
309
|
page_area = self.page_size.width * self.page_size.height
|
305
310
|
if page_area > 0:
|
306
311
|
# Filter out full-page pictures
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.36.1
|
3
|
+
Version: 2.38.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -61,6 +61,8 @@ Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "d
|
|
61
61
|
Provides-Extra: rapidocr
|
62
62
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
63
63
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
64
|
+
Provides-Extra: asr
|
65
|
+
Requires-Dist: openai-whisper>=20240930; extra == "asr"
|
64
66
|
Dynamic: license-file
|
65
67
|
|
66
68
|
<p align="center">
|
@@ -93,14 +95,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|
93
95
|
|
94
96
|
## Features
|
95
97
|
|
96
|
-
* 🗂️
|
98
|
+
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
|
97
99
|
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
|
98
100
|
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
|
99
|
-
* ↪️
|
101
|
+
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
|
100
102
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
101
103
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
102
104
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
103
|
-
*
|
105
|
+
* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
106
|
+
* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
|
104
107
|
* 💻 Simple and convenient CLI
|
105
108
|
|
106
109
|
### Coming soon
|
@@ -1,21 +1,22 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/document_converter.py,sha256=
|
2
|
+
docling/document_converter.py,sha256=3jWywP_TLy-1PMvjJBUlnTM9FNzpBLRCHYA1RKFvGR4,14333
|
3
3
|
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
4
4
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
5
5
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
|
7
|
-
docling/backend/asciidoc_backend.py,sha256=
|
7
|
+
docling/backend/asciidoc_backend.py,sha256=RDNLrPJHxROiM7-NQdZn3DdvAyiPAndbSWcZo9PbCKU,14417
|
8
8
|
docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
|
9
|
-
docling/backend/docling_parse_backend.py,sha256=
|
10
|
-
docling/backend/docling_parse_v2_backend.py,sha256=
|
11
|
-
docling/backend/docling_parse_v4_backend.py,sha256=
|
9
|
+
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
10
|
+
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
|
+
docling/backend/docling_parse_v4_backend.py,sha256=7tQvpCwpYoq98PNszDkrXaFhy5eWmQqMP4RjWWPLPgw,6197
|
12
12
|
docling/backend/html_backend.py,sha256=3K-l5SUAAyqISNEb7nPst_I51xzYOVOkgmwXh3lv9sw,21063
|
13
|
-
docling/backend/md_backend.py,sha256=
|
13
|
+
docling/backend/md_backend.py,sha256=ghIU_NSaENKrRu49Dn5GvjYtcAgEU7ZHbf-TeYg49nY,17673
|
14
14
|
docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
|
15
|
-
docling/backend/mspowerpoint_backend.py,sha256=
|
16
|
-
docling/backend/msword_backend.py,sha256=
|
15
|
+
docling/backend/mspowerpoint_backend.py,sha256=0lsb8ZeQFxbDt7jZpSQyk5wYHYa3SP2T2y2dMI-o30o,15216
|
16
|
+
docling/backend/msword_backend.py,sha256=C4qs4mQEt1JzonCg5v6_yUxdngzcTzSO9k1ik8_DW5Q,44855
|
17
|
+
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
17
18
|
docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
|
18
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
19
|
+
docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
|
19
20
|
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
21
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
22
|
docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
|
@@ -27,36 +28,38 @@ docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e
|
|
27
28
|
docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
|
28
29
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
29
30
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
-
docling/cli/main.py,sha256=
|
31
|
+
docling/cli/main.py,sha256=D2gEoArnQ2yQ9BesH9CkxZbYQyhZRGgjjNWYqmRRUtU,29617
|
31
32
|
docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
|
32
33
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
33
34
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
35
|
docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
|
35
|
-
docling/datamodel/
|
36
|
-
docling/datamodel/
|
37
|
-
docling/datamodel/
|
38
|
-
docling/datamodel/
|
36
|
+
docling/datamodel/asr_model_specs.py,sha256=L7ETXsUKVbPsVcPLhEIMxQjd4UzMGZBVsy74CLsZBkU,2181
|
37
|
+
docling/datamodel/base_models.py,sha256=L35qXLmADZQNEzBC0M6K2xrfLyqrTqDlbPD6E6DkWMc,11146
|
38
|
+
docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
|
39
|
+
docling/datamodel/pipeline_options.py,sha256=N7my7hmvuX6EzlujHeF6RObPSrG_HjN_nfPzILTqP-E,9479
|
40
|
+
docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
|
41
|
+
docling/datamodel/pipeline_options_vlm_model.py,sha256=rtDMVtKFZbgQD269w8FvHMXEhdRBrsA4rVYk6A-M-b4,2063
|
39
42
|
docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
|
40
43
|
docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
|
41
44
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
42
|
-
docling/models/api_vlm_model.py,sha256=
|
43
|
-
docling/models/base_model.py,sha256=
|
44
|
-
docling/models/base_ocr_model.py,sha256=
|
45
|
+
docling/models/api_vlm_model.py,sha256=GDDJGAia4SJjK7JFxsZy5oEU-D8yQo8Kb3NvvPbTvT0,2820
|
46
|
+
docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
|
47
|
+
docling/models/base_ocr_model.py,sha256=HtrefTq9Zy4UnUInMchPv0tbobiA7CQU5VUauKJD7IU,8006
|
45
48
|
docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
|
46
49
|
docling/models/document_picture_classifier.py,sha256=fkJLV7pMy3v6iNwOzVb6zdBU1dGtBM1ARHLIRPfoAG4,6124
|
47
|
-
docling/models/easyocr_model.py,sha256=
|
48
|
-
docling/models/layout_model.py,sha256=
|
49
|
-
docling/models/ocr_mac_model.py,sha256=
|
50
|
+
docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
|
51
|
+
docling/models/layout_model.py,sha256=EJuRXW0rFdnNPS5AifdEsr812EATUqAioeMCVjw8PL0,8460
|
52
|
+
docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
|
50
53
|
docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
|
51
|
-
docling/models/page_preprocessing_model.py,sha256=
|
54
|
+
docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
|
52
55
|
docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
|
53
56
|
docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
|
54
57
|
docling/models/picture_description_vlm_model.py,sha256=7LeCx9ZdPxsmWJ468OtxCdAkH48A1HD0iwH9cs_7-1Q,3800
|
55
|
-
docling/models/rapid_ocr_model.py,sha256=
|
56
|
-
docling/models/readingorder_model.py,sha256=
|
58
|
+
docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
|
59
|
+
docling/models/readingorder_model.py,sha256=QHb5fyiqmxU8lg4W5IzdukqHPh6V7rNw_57O4-z-Az4,14615
|
57
60
|
docling/models/table_structure_model.py,sha256=dQf6u_zn5fHCkHzmTwYfCbRtZCBddsyAM0WNVBUUQzk,12473
|
58
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
59
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
61
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=qcM3-n7Z_dm1CGBhVUcNr2XT41iXnU32zk4RqKHBl9I,12775
|
62
|
+
docling/models/tesseract_ocr_model.py,sha256=9DPAE7XP7smej7HYhr7mdwpuxSjAcv_GPrYZG3bb1RA,10587
|
60
63
|
docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
|
61
64
|
docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
|
62
65
|
docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
|
@@ -66,19 +69,20 @@ docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurH
|
|
66
69
|
docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
67
70
|
docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
|
68
71
|
docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
69
|
-
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=
|
70
|
-
docling/models/vlm_models_inline/mlx_model.py,sha256=
|
72
|
+
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=w9_N4ccjmYYK5yYQou0LSMGaj6gs8l0hULvXbkfYXSQ,7425
|
73
|
+
docling/models/vlm_models_inline/mlx_model.py,sha256=qpyi6fGHm0vPqW2yeTsRBKOTTshNJ1LAPbH1SBDp8Y8,5784
|
71
74
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
72
|
-
docling/pipeline/
|
75
|
+
docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
|
76
|
+
docling/pipeline/base_pipeline.py,sha256=14yQrDjsojl4RgbBjKFSEfVBYR_sULZfBI1uDzFLi8Y,9331
|
73
77
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
74
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
78
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=2Hqg2wnAXfbZbLUOQrRus8PMEuZ549jR1mfR86-CAB4,12659
|
75
79
|
docling/pipeline/vlm_pipeline.py,sha256=IrjDbajCPmUPep_jATKNiABST4tQ8mvpkQz9mtBQ8qQ,15279
|
76
80
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
77
81
|
docling/utils/accelerator_utils.py,sha256=Fww4UiTiuIB91iuPgUZTy-DYpCGRMI8YuCYKhFb0gjA,2905
|
78
82
|
docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
|
79
83
|
docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
|
80
84
|
docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
|
81
|
-
docling/utils/layout_postprocessor.py,sha256=
|
85
|
+
docling/utils/layout_postprocessor.py,sha256=laTPGGj-hv16Zh1TRcn8NK0POKs7d3jeaV1pRR_TjIU,24228
|
82
86
|
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
83
87
|
docling/utils/model_downloader.py,sha256=6TDxFOvMRYT8JyYyaQS_wXMJzNga61ImY3sFdks66qM,4004
|
84
88
|
docling/utils/ocr_utils.py,sha256=AOaDAHr5S74d-IRVR_LKhKynUTIurAwLJ3wNeY58gPA,2326
|
@@ -86,9 +90,9 @@ docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,
|
|
86
90
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
87
91
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
88
92
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
89
|
-
docling-2.
|
90
|
-
docling-2.
|
91
|
-
docling-2.
|
92
|
-
docling-2.
|
93
|
-
docling-2.
|
94
|
-
docling-2.
|
93
|
+
docling-2.38.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
94
|
+
docling-2.38.0.dist-info/METADATA,sha256=vT8Zko4wD8iyKUjLAJ83Cm7ntscjEk5ojHvcJXlvT5A,10273
|
95
|
+
docling-2.38.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
96
|
+
docling-2.38.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
97
|
+
docling-2.38.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
98
|
+
docling-2.38.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|