docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,572 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Annotated, ClassVar, Literal, Optional, Union, cast
|
|
6
|
+
|
|
7
|
+
from docling_core.types.doc import (
|
|
8
|
+
ContentLayer,
|
|
9
|
+
DocItemLabel,
|
|
10
|
+
DoclingDocument,
|
|
11
|
+
DocumentOrigin,
|
|
12
|
+
Formatting,
|
|
13
|
+
GroupLabel,
|
|
14
|
+
NodeItem,
|
|
15
|
+
)
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
17
|
+
from pydantic.types import StringConstraints
|
|
18
|
+
from typing_extensions import Self, override
|
|
19
|
+
|
|
20
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
|
21
|
+
from docling.datamodel.base_models import InputFormat
|
|
22
|
+
from docling.datamodel.document import InputDocument
|
|
23
|
+
|
|
24
|
+
_log = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class _WebVTTTimestamp(BaseModel):
    """Model representing a WebVTT timestamp.

    A WebVTT timestamp is always interpreted relative to the current playback position
    of the media data that the WebVTT file is to be synchronized with.

    The timestamp is kept verbatim in ``raw``; its hour, minute, second and
    millisecond components are extracted once, during validation, into private
    attributes.
    """

    model_config = ConfigDict(regex_engine="python-re")

    raw: Annotated[
        str,
        Field(
            description="A representation of the WebVTT Timestamp as a single string"
        ),
    ]

    # Optional hours (two or more digits) followed by mm:ss.ttt
    _pattern: ClassVar[re.Pattern] = re.compile(
        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
    )
    _hours: int
    _minutes: int
    _seconds: int
    _millis: int

    @model_validator(mode="after")
    def validate_raw(self) -> Self:
        """Parse ``raw`` and cache its numeric components.

        Raises:
            ValueError: If ``raw`` does not match the timestamp pattern or a
                component is out of range.
        """
        parsed = self._pattern.match(self.raw)
        if parsed is None:
            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")

        hours, minutes, seconds, millis = parsed.groups()
        # The hours group is optional and is None when absent.
        self._hours = int(hours) if hours else 0
        self._minutes = int(minutes)
        self._seconds = int(seconds)
        self._millis = int(millis)

        if not 0 <= self._minutes <= 59:
            raise ValueError("Minutes must be between 0 and 59")
        if not 0 <= self._seconds <= 59:
            raise ValueError("Seconds must be between 0 and 59")

        return self

    @property
    def seconds(self) -> float:
        """A representation of the WebVTT Timestamp in seconds"""
        whole_seconds = self._hours * 3600 + self._minutes * 60 + self._seconds
        return whole_seconds + self._millis / 1000.0

    @override
    def __str__(self) -> str:
        """Return the timestamp exactly as it was provided."""
        return self.raw
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# A cue identifier is a non-empty, single-line string that must not contain the
# timing arrow "-->" (the negative lookahead rejects it anywhere in the value).
_WebVTTCueIdentifier = Annotated[
    str,
    StringConstraints(pattern=r"^(?!.*-->)[^\n\r]+$", strict=True),
]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class _WebVTTCueTimings(BaseModel):
    """Model representing WebVTT cue timings (a start and an end timestamp)."""

    start: Annotated[
        _WebVTTTimestamp, Field(description="Start time offset of the cue")
    ]
    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]

    @model_validator(mode="after")
    def check_order(self) -> Self:
        """Ensure the cue ends strictly after it starts.

        Raises:
            ValueError: If the end timestamp is not greater than the start one.
        """
        both_present = bool(self.start and self.end)
        if both_present and self.end.seconds <= self.start.seconds:
            raise ValueError("End timestamp must be greater than start timestamp")
        return self

    @override
    def __str__(self):
        """Render the timings as they appear on a WebVTT timing line."""
        return f"{self.start} --> {self.end}"
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class _WebVTTCueTextSpan(BaseModel):
    """Model representing a WebVTT cue text span (plain text without markup)."""

    text: str
    span_type: Literal["text"] = "text"

    @field_validator("text", mode="after")
    @classmethod
    def validate_text(cls, value: str) -> str:
        """Reject text that is empty or contains line breaks, '&' or '<'.

        Raises:
            ValueError: If the text contains a forbidden character or is empty.
        """
        forbidden = {"\n", "\r", "&", "<"}
        if forbidden.intersection(value):
            raise ValueError("Cue text span contains invalid characters")
        if not value:
            raise ValueError("Cue text span cannot be empty")
        return value

    @override
    def __str__(self):
        """Return the span text unchanged."""
        return self.text
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class _WebVTTCueVoiceSpan(BaseModel):
    """Model representing a WebVTT cue voice span.

    A voice span is serialized as ``<v.class1.class2 annotation>...</v>``, where
    the annotation names the voice and the optional classes qualify it.
    """

    annotation: Annotated[
        str,
        Field(
            description=(
                "Cue span start tag annotation text representing the name of thevoice"
            )
        ),
    ]
    classes: Annotated[
        list[str],
        Field(description="List of classes representing the cue span's significance"),
    ] = []
    components: Annotated[
        list["_WebVTTCueComponent"],
        Field(description="The components representing the cue internal text"),
    ] = []
    span_type: Literal["v"] = "v"

    @field_validator("annotation", mode="after")
    @classmethod
    def validate_annotation(cls, value: str) -> str:
        """Reject an annotation that is empty or contains line breaks, '&' or '>'.

        Raises:
            ValueError: If the annotation contains a forbidden character or is
                empty.
        """
        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
            raise ValueError(
                "Cue span start tag annotation contains invalid characters"
            )
        if not value:
            # The message names the annotation (not the cue text span) so that a
            # validation failure points at the field that is actually empty.
            raise ValueError("Cue span start tag annotation cannot be empty")
        return value

    @field_validator("classes", mode="after")
    @classmethod
    def validate_classes(cls, value: list[str]) -> list[str]:
        """Reject classes that are empty or contain whitespace, '&', '<', '>' or '.'.

        Raises:
            ValueError: If any class contains a forbidden character or is empty.
        """
        for item in value:
            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                raise ValueError(
                    "A cue span start tag class contains invalid characters"
                )
            if not item:
                raise ValueError("Cue span start tag classes cannot be empty")
        return value

    @override
    def __str__(self):
        """Serialize the span back to WebVTT markup, including the end tag."""
        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
        inner = "".join(str(span) for span in self.components)
        return f"<{tag} {self.annotation}>{inner}</v>"
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class _WebVTTCueClassSpan(BaseModel):
    """Model representing a WebVTT cue class span (``<c>...</c>``)."""

    span_type: Literal["c"] = "c"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Serialize the span and its children back to WebVTT markup."""
        body = "".join(map(str, self.components))
        return "<c>" + body + "</c>"
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class _WebVTTCueItalicSpan(BaseModel):
    """Model representing a WebVTT cue italics span (``<i>...</i>``)."""

    span_type: Literal["i"] = "i"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Serialize the span and its children back to WebVTT markup."""
        body = "".join(map(str, self.components))
        return "<i>" + body + "</i>"
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class _WebVTTCueBoldSpan(BaseModel):
    """Model representing a WebVTT cue bold span (``<b>...</b>``)."""

    span_type: Literal["b"] = "b"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Serialize the span and its children back to WebVTT markup."""
        body = "".join(map(str, self.components))
        return "<b>" + body + "</b>"
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class _WebVTTCueUnderlineSpan(BaseModel):
    """Model representing a WebVTT cue underline span (``<u>...</u>``)."""

    span_type: Literal["u"] = "u"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        """Serialize the span and its children back to WebVTT markup."""
        body = "".join(map(str, self.components))
        return "<u>" + body + "</u>"
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# Any span that may appear in a cue payload. The union is discriminated on each
# model's literal ``span_type`` field ("text", "c", "i", "b", "u" or "v").
_WebVTTCueComponent = Annotated[
    Union[
        _WebVTTCueTextSpan,
        _WebVTTCueClassSpan,
        _WebVTTCueItalicSpan,
        _WebVTTCueBoldSpan,
        _WebVTTCueUnderlineSpan,
        _WebVTTCueVoiceSpan,
    ],
    Field(discriminator="span_type", description="The WebVTT cue component"),
]
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class _WebVTTCueBlock(BaseModel):
    """Model representing a WebVTT cue block.

    The optional WebVTT cue settings list is not supported.
    The cue payload is limited to the following spans: text, class, italic, bold,
    underline, and voice.
    """

    model_config = ConfigDict(regex_engine="python-re")

    identifier: Optional[_WebVTTCueIdentifier] = Field(
        None, description="The WebVTT cue identifier"
    )
    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]

    # Matches any supported start or end tag inside the cue payload.
    _pattern_block: ClassVar[re.Pattern] = re.compile(
        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
    )
    # Splits a voice start tag into its class string and its annotation.
    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
    )

    @field_validator("payload", mode="after")
    @classmethod
    def validate_payload(cls, payload):
        """Reject payloads whose serialized form contains the timing arrow.

        Raises:
            ValueError: If any component renders to text containing '-->'.
        """
        for component in payload:
            if "-->" in str(component):
                raise ValueError("Cue payload must not contain '-->'")
        return payload

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTCueBlock":
        """Build a cue block from the raw text of a single WebVTT cue.

        Args:
            raw: The lines of one cue block: an optional identifier line, the
                timing line and the payload lines.

        Returns:
            The parsed cue block.

        Raises:
            ValueError: If the block is empty, has no valid timing line, or its
                payload contains an end tag without a matching start tag. Model
                validation failures are reported as ``ValueError`` as well.
        """
        lines = raw.strip().splitlines()
        if not lines:
            raise ValueError("Cue block must have at least one line")
        identifier: Optional[_WebVTTCueIdentifier] = None
        timing_line = lines[0]
        if "-->" not in timing_line and len(lines) > 1:
            # The first line is a cue identifier; timings follow on the next one.
            identifier = timing_line
            timing_line = lines[1]
            cue_lines = lines[2:]
        else:
            cue_lines = lines[1:]

        if "-->" not in timing_line:
            raise ValueError("Cue block must contain WebVTT cue timings")

        start, end = [t.strip() for t in timing_line.split("-->")]
        end = re.split(" |\t", end)[0]  # ignore the cue settings list
        timings: _WebVTTCueTimings = _WebVTTCueTimings(
            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
        )
        cue_text = " ".join(cue_lines).strip()
        if cue_text.startswith("<v") and "</v>" not in cue_text:
            # adding close tag for cue voice spans without end tag
            cue_text += "</v>"

        # stack[0] collects the top-level payload; every open span pushes a new
        # list that receives its children until the matching end tag pops it.
        stack: list[list[_WebVTTCueComponent]] = [[]]
        tag_stack: list[Union[str, tuple]] = []

        pos = 0
        for match in cls._pattern_block.finditer(cue_text):
            if match.start() > pos:
                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
            tag = match.group(0)

            closes_span = tag in ("</i>", "</b>", "</u>", "</c>") or tag.startswith(
                "</v"
            )
            if closes_span and len(stack) < 2:
                # Popping here would remove the top-level payload list and fail
                # later with an IndexError, which callers do not handle. Report
                # the malformed markup as a ValueError like other parse errors.
                raise ValueError(
                    f"Cue payload has an end tag without a matching start tag: {tag}"
                )

            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
                tag_type = tag[1:2]
                tag_stack.append(tag_type)
                stack.append([])
            elif tag == "</i>":
                children = stack.pop()
                stack[-1].append(_WebVTTCueItalicSpan(components=children))
                tag_stack.pop()
            elif tag == "</b>":
                children = stack.pop()
                stack[-1].append(_WebVTTCueBoldSpan(components=children))
                tag_stack.pop()
            elif tag == "</u>":
                children = stack.pop()
                stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
                tag_stack.pop()
            elif tag == "</c>":
                children = stack.pop()
                stack[-1].append(_WebVTTCueClassSpan(components=children))
                tag_stack.pop()
            elif tag.startswith("<v"):
                # Keep the full start tag: classes and annotation are extracted
                # from it when the span is closed.
                tag_stack.append(("v", tag))
                stack.append([])
            elif tag.startswith("</v"):
                children = stack.pop()
                if (
                    tag_stack
                    and isinstance(tag_stack[-1], tuple)
                    and tag_stack[-1][0] == "v"
                ):
                    _, voice = cast(tuple, tag_stack.pop())
                    voice_match = cls._pattern_voice_tag.match(voice)
                    if voice_match:
                        class_string = voice_match.group("class")
                        annotation = voice_match.group("annotation")
                        if annotation:
                            classes: list[str] = []
                            if class_string:
                                classes = [c for c in class_string.split(".") if c]
                            stack[-1].append(
                                _WebVTTCueVoiceSpan(
                                    annotation=annotation.strip(),
                                    classes=classes,
                                    components=children,
                                )
                            )

            pos = match.end()

        if pos < len(cue_text):
            # Trailing text after the last tag (or the whole text if no tags).
            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))

        return cls(
            identifier=identifier,
            timings=timings,
            payload=stack[0],
        )

    def __str__(self) -> str:
        """Serialize the cue block back to WebVTT text."""
        parts = []
        if self.identifier:
            parts.append(f"{self.identifier}\n")
        timings_line = str(self.timings)
        parts.append(timings_line + "\n")
        for idx, span in enumerate(self.payload):
            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
                # the end tag may be omitted for brevity
                parts.append(str(span).removesuffix("</v>"))
            else:
                parts.append(str(span))

        return "".join(parts)
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
class _WebVTTFile(BaseModel):
    """A model representing a WebVTT file as a sequence of cue blocks."""

    cue_blocks: list[_WebVTTCueBlock]

    @staticmethod
    def verify_signature(content: str) -> bool:
        """Check whether the content starts with a WebVTT file signature.

        The signature is the string "WEBVTT", optionally preceded by a byte
        order mark, and followed by either the end of the content or a space,
        tab or line terminator (LF or CR).

        Args:
            content: The decoded text of the file.

        Returns:
            True if the content carries a valid signature, False otherwise.
        """
        if not content:
            return False
        # Decoding as plain UTF-8 keeps a leading BOM in the text.
        content = content.removeprefix("\ufeff")
        if not content.startswith("WEBVTT"):
            return False
        # "\r" is accepted so that CRLF/CR content that has not been newline
        # normalized yet is still recognized.
        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTFile":
        """Parse the text of a WebVTT file into its cue blocks.

        Cue blocks that cannot be parsed are logged and skipped.

        Args:
            raw: The decoded text of the file.

        Returns:
            The parsed file with the cue blocks that could be read.

        Raises:
            ValueError: If the content lacks a valid WebVTT signature.
        """
        # Drop an optional byte order mark and normalize newlines to LF
        raw = raw.removeprefix("\ufeff")
        raw = raw.replace("\r\n", "\n").replace("\r", "\n")

        # Check WebVTT signature
        if not cls.verify_signature(raw):
            raise ValueError("Invalid WebVTT file signature")

        # Strip "WEBVTT" header line
        _, _, body = raw.partition("\n")

        # Remove NOTE/STYLE/REGION blocks, each up to its terminating blank line.
        body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
        # The header line must be consumed explicitly; otherwise the lazy group
        # matches zero lines and only the "STYLE"/"REGION" line itself is removed.
        body = re.sub(
            r"^(?:STYLE|REGION)[ \t]*\n(?:.+\n)*?\n", "", body, flags=re.MULTILINE
        )

        # Split into cue blocks
        raw_blocks = re.split(r"\n\s*\n", body.strip())
        cues: list[_WebVTTCueBlock] = []
        for block in raw_blocks:
            if not block.strip():
                # A file without cues yields one empty block; nothing to parse.
                continue
            try:
                cues.append(_WebVTTCueBlock.parse(block))
            except ValueError as e:
                _log.warning("Failed to parse cue block:\n%s\n%s", block, e)

        return cls(cue_blocks=cues)

    def __iter__(self):
        """Iterate over the cue blocks in file order."""
        return iter(self.cue_blocks)

    def __getitem__(self, idx):
        """Return the cue block(s) at the given index or slice."""
        return self.cue_blocks[idx]

    def __len__(self):
        """Return the number of cue blocks."""
        return len(self.cue_blocks)
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
class WebVTTDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for WebVTT (.vtt) files.

    This parser reads the content of a WebVTT file and converts
    it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1

    Every cue block becomes a section group holding text items for the optional
    identifier, the timings and the payload; voice spans get an inline group of
    their own.
    """

    @override
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Read the whole WebVTT source into memory as UTF-8 text.

        Raises:
            RuntimeError: If the source cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        self.content: str = ""
        source = self.path_or_stream
        try:
            if isinstance(source, BytesIO):
                self.content = source.getvalue().decode("utf-8")
            elif isinstance(source, Path):
                with open(source, encoding="utf-8") as handle:
                    self.content = handle.read()
        except Exception as e:
            raise RuntimeError(
                "Could not initialize the WebVTT backend for file with hash "
                f"{self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        """Tell whether the loaded content carries a WebVTT signature."""
        return _WebVTTFile.verify_signature(self.content)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """WebVTT documents are not paginated."""
        return False

    @override
    def unload(self):
        """Close the underlying stream, if any, and drop the reference to it."""
        stream = self.path_or_stream
        if isinstance(stream, BytesIO):
            stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.VTT}

    @staticmethod
    def _add_text_from_component(
        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
    ) -> None:
        """Adds a TextItem to a document by extracting text from a cue span component.

        Only the text of the component itself (for a text span) or of its direct
        text-span children (for any other span) is used. Nothing is added when
        the resulting text is blank.

        TODO: address nesting
        """
        if isinstance(item, _WebVTTCueTextSpan):
            raw_text = item.text
        else:
            # TODO: address nesting
            raw_text = "".join(
                child.text
                for child in item.components
                if isinstance(child, _WebVTTCueTextSpan)
            )

        style = Formatting()
        if isinstance(item, _WebVTTCueItalicSpan):
            style.italic = True
        elif isinstance(item, _WebVTTCueBoldSpan):
            style.bold = True
        elif isinstance(item, _WebVTTCueUnderlineSpan):
            style.underline = True

        cleaned = raw_text.strip()
        if not cleaned:
            return
        doc.add_text(
            label=DocItemLabel.TEXT,
            text=cleaned,
            parent=parent,
            content_layer=ContentLayer.BODY,
            formatting=style,
        )

    @override
    def convert(self) -> DoclingDocument:
        """Convert the loaded WebVTT content into a DoclingDocument.

        Raises:
            RuntimeError: If the content does not carry a WebVTT signature.
        """
        _log.debug("Starting WebVTT conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid WebVTT document.")

        doc = DoclingDocument(
            name=self.file.stem or "file",
            origin=DocumentOrigin(
                filename=self.file.name or "file",
                mimetype="text/vtt",
                binary_hash=self.document_hash,
            ),
        )
        add_component = WebVTTDocumentBackend._add_text_from_component

        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
        for block in vtt.cue_blocks:
            block_group = doc.add_group(
                label=GroupLabel.SECTION,
                name="WebVTT cue block",
                parent=None,
                content_layer=ContentLayer.BODY,
            )
            if block.identifier:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=str(block.identifier),
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=str(block.timings),
                parent=block_group,
                content_layer=ContentLayer.BODY,
            )
            for cue_span in block.payload:
                if not isinstance(cue_span, _WebVTTCueVoiceSpan):
                    add_component(doc, cue_span, block_group)
                    continue

                voice_group = doc.add_group(
                    label=GroupLabel.INLINE,
                    name="WebVTT cue voice span",
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
                # Speaker label: "<annotation> (<class>, ...): "
                speaker = cue_span.annotation
                if cue_span.classes:
                    speaker = f"{speaker} ({', '.join(cue_span.classes)})"
                speaker = f"{speaker}: "
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=speaker,
                    parent=voice_group,
                    content_layer=ContentLayer.BODY,
                )
                for child in cue_span.components:
                    add_component(doc, child, voice_group)

        return doc
|
|
File without changes
|