docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,572 @@
1
+ import logging
2
+ import re
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import Annotated, ClassVar, Literal, Optional, Union, cast
6
+
7
+ from docling_core.types.doc import (
8
+ ContentLayer,
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ Formatting,
13
+ GroupLabel,
14
+ NodeItem,
15
+ )
16
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
17
+ from pydantic.types import StringConstraints
18
+ from typing_extensions import Self, override
19
+
20
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
21
+ from docling.datamodel.base_models import InputFormat
22
+ from docling.datamodel.document import InputDocument
23
+
24
+ _log = logging.getLogger(__name__)
25
+
26
+
27
class _WebVTTTimestamp(BaseModel):
    """Model representing a WebVTT timestamp.

    A WebVTT timestamp is always interpreted relative to the current playback position
    of the media data that the WebVTT file is to be synchronized with.

    The accepted textual form is ``[hh:]mm:ss.ttt``: an optional hours field of two
    or more digits, two-digit minutes and seconds in the range 00-59, and exactly
    three digits of milliseconds.
    """

    model_config = ConfigDict(regex_engine="python-re")

    raw: Annotated[
        str,
        Field(
            description="A representation of the WebVTT Timestamp as a single string"
        ),
    ]

    # re.ASCII restricts ``\d`` to the characters 0-9, so digits from other scripts
    # are rejected instead of being silently converted by int().
    _pattern: ClassVar[re.Pattern] = re.compile(
        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$", re.ASCII
    )
    # Components of ``raw``; populated by ``validate_raw``.
    _hours: int
    _minutes: int
    _seconds: int
    _millis: int

    @model_validator(mode="after")
    def validate_raw(self) -> Self:
        """Parse ``raw`` and cache its hour/minute/second/millisecond components.

        Raises:
            ValueError: If ``raw`` is not a well-formed WebVTT timestamp.
        """
        # fullmatch() also rejects a trailing newline, which "$" alone would accept.
        m = self._pattern.fullmatch(self.raw)
        if not m:
            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
        hours, minutes, seconds, millis = m.groups()
        # The hours group is optional and is None when absent.
        self._hours = int(hours) if hours else 0
        # The pattern already limits minutes and seconds to 00-59, so no further
        # range check is needed here.
        self._minutes = int(minutes)
        self._seconds = int(seconds)
        self._millis = int(millis)

        return self

    @property
    def seconds(self) -> float:
        """A representation of the WebVTT Timestamp in seconds"""
        return (
            self._hours * 3600
            + self._minutes * 60
            + self._seconds
            + self._millis / 1000.0
        )

    @override
    def __str__(self) -> str:
        return self.raw
81
+
82
+
83
# A cue identifier is a non-empty, single-line string (no CR/LF) that does not
# contain the "-->" timing separator anywhere.
_WebVTTCueIdentifier = Annotated[
    str,
    StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$"),
]
86
+
87
+
88
class _WebVTTCueTimings(BaseModel):
    """Model representing WebVTT cue timings."""

    start: Annotated[
        _WebVTTTimestamp, Field(description="Start time offset of the cue")
    ]
    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]

    @model_validator(mode="after")
    def check_order(self) -> Self:
        """Ensure the end timestamp lies strictly after the start timestamp.

        Raises:
            ValueError: If ``end`` is not later than ``start``.
        """
        both_present = self.start and self.end
        if both_present and self.end.seconds <= self.start.seconds:
            raise ValueError("End timestamp must be greater than start timestamp")
        return self

    @override
    def __str__(self):
        # Rendered in the same "start --> end" form used by a cue timing line.
        return " --> ".join((str(self.start), str(self.end)))
106
+
107
+
108
class _WebVTTCueTextSpan(BaseModel):
    """Model representing a WebVTT cue text span."""

    text: str
    span_type: Literal["text"] = "text"

    @field_validator("text", mode="after")
    @classmethod
    def validate_text(cls, value: str) -> str:
        """Reject text that is empty or contains a line break, "&" or "<".

        Raises:
            ValueError: If ``value`` is empty or contains a forbidden character.
        """
        for forbidden in ("\n", "\r", "&", "<"):
            if forbidden in value:
                raise ValueError("Cue text span contains invalid characters")
        if not value:
            raise ValueError("Cue text span cannot be empty")
        return value

    @override
    def __str__(self):
        return self.text
126
+
127
+
128
class _WebVTTCueVoiceSpan(BaseModel):
    """Model representing a WebVTT cue voice span.

    Rendered by ``__str__`` as ``<v[.class...] annotation>components</v>``.
    """

    annotation: Annotated[
        str,
        Field(
            description=(
                "Cue span start tag annotation text representing the name of the voice"
            )
        ),
    ]
    classes: Annotated[
        list[str],
        Field(description="List of classes representing the cue span's significance"),
    ] = []
    components: Annotated[
        list["_WebVTTCueComponent"],
        Field(description="The components representing the cue internal text"),
    ] = []
    span_type: Literal["v"] = "v"

    @field_validator("annotation", mode="after")
    @classmethod
    def validate_annotation(cls, value: str) -> str:
        """Reject an annotation that is empty or contains a line break, "&" or ">".

        Raises:
            ValueError: If ``value`` is empty or contains a forbidden character.
        """
        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
            raise ValueError(
                "Cue span start tag annotation contains invalid characters"
            )
        if not value:
            # The message names the annotation (it previously referred to a
            # "cue text span", which is a different component).
            raise ValueError("Cue span start tag annotation cannot be empty")
        return value

    @field_validator("classes", mode="after")
    @classmethod
    def validate_classes(cls, value: list[str]) -> list[str]:
        """Reject class names that are empty or contain separator characters.

        Raises:
            ValueError: If any class is empty or contains whitespace, "&", "<",
                ">" or ".".
        """
        for item in value:
            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                raise ValueError(
                    "A cue span start tag class contains invalid characters"
                )
            if not item:
                raise ValueError("Cue span start tag classes cannot be empty")
        return value

    @override
    def __str__(self):
        # Classes are appended to the tag name, separated by dots: "v.a.b".
        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
        inner = "".join(str(span) for span in self.components)
        return f"<{tag} {self.annotation}>{inner}</v>"
177
+
178
+
179
class _WebVTTCueClassSpan(BaseModel):
    """Model representing a WebVTT cue class span, rendered as ``<c>...</c>``."""

    span_type: Literal["c"] = "c"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        rendered_children = [str(child) for child in self.components]
        return "<c>" + "".join(rendered_children) + "</c>"
187
+
188
+
189
class _WebVTTCueItalicSpan(BaseModel):
    """Model representing a WebVTT cue italic span, rendered as ``<i>...</i>``."""

    span_type: Literal["i"] = "i"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        rendered_children = [str(child) for child in self.components]
        return "<i>" + "".join(rendered_children) + "</i>"
197
+
198
+
199
class _WebVTTCueBoldSpan(BaseModel):
    """Model representing a WebVTT cue bold span, rendered as ``<b>...</b>``."""

    span_type: Literal["b"] = "b"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        rendered_children = [str(child) for child in self.components]
        return "<b>" + "".join(rendered_children) + "</b>"
207
+
208
+
209
class _WebVTTCueUnderlineSpan(BaseModel):
    """Model representing a WebVTT cue underline span, rendered as ``<u>...</u>``."""

    span_type: Literal["u"] = "u"
    components: list["_WebVTTCueComponent"]

    @override
    def __str__(self):
        rendered_children = [str(child) for child in self.components]
        return "<u>" + "".join(rendered_children) + "</u>"
217
+
218
+
219
# Any component that may appear in a cue payload. The concrete model is selected
# through the ``span_type`` literal that every span class declares.
_WebVTTCueComponent = Annotated[
    Union[
        _WebVTTCueTextSpan,
        _WebVTTCueClassSpan,
        _WebVTTCueItalicSpan,
        _WebVTTCueBoldSpan,
        _WebVTTCueUnderlineSpan,
        _WebVTTCueVoiceSpan,
    ],
    Field(discriminator="span_type", description="The WebVTT cue component"),
]
230
+
231
+
232
class _WebVTTCueBlock(BaseModel):
    """Model representing a WebVTT cue block.

    The optional WebVTT cue settings list is not supported.
    The cue payload is limited to the following spans: text, class, italic, bold,
    underline, and voice.
    """

    model_config = ConfigDict(regex_engine="python-re")

    identifier: Optional[_WebVTTCueIdentifier] = Field(
        None, description="The WebVTT cue identifier"
    )
    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]

    # Matches any start or end tag of the supported span types.
    _pattern_block: ClassVar[re.Pattern] = re.compile(
        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
    )
    # Splits a voice start tag into its dotted class string and its annotation.
    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
    )

    @field_validator("payload", mode="after")
    @classmethod
    def validate_payload(cls, payload):
        """Reject payloads whose rendered text contains the "-->" separator.

        Raises:
            ValueError: If any component renders to a string containing "-->".
        """
        for component in payload:
            if "-->" in str(component):
                raise ValueError("Cue payload must not contain '-->'")
        return payload

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTCueBlock":
        """Build a cue block from its raw text.

        The first line is taken as the cue identifier when it does not contain
        "-->" and more lines follow; the next line must then hold the timings.
        The remaining lines are joined with spaces and parsed into span components.

        Args:
            raw: The text of a single cue block.

        Returns:
            The parsed cue block.

        Raises:
            ValueError: If the block is empty, has no timing line, has malformed
                timings, contains a closing tag without a matching open span, or
                fails model validation.
        """
        lines = raw.strip().splitlines()
        if not lines:
            raise ValueError("Cue block must have at least one line")
        identifier: Optional[_WebVTTCueIdentifier] = None
        timing_line = lines[0]
        if "-->" not in timing_line and len(lines) > 1:
            identifier = timing_line
            timing_line = lines[1]
            cue_lines = lines[2:]
        else:
            cue_lines = lines[1:]

        if "-->" not in timing_line:
            raise ValueError("Cue block must contain WebVTT cue timings")

        # Unpacking raises ValueError when the line holds more than one "-->".
        start, end = [t.strip() for t in timing_line.split("-->")]
        end = re.split(" |\t", end)[0]  # ignore the cue settings list
        timings: _WebVTTCueTimings = _WebVTTCueTimings(
            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
        )
        cue_text = " ".join(cue_lines).strip()
        if cue_text.startswith("<v") and "</v>" not in cue_text:
            # adding close tag for cue voice spans without end tag
            cue_text += "</v>"

        # stack[0] collects the top-level payload; every open span pushes a new
        # list that gathers its children until the span is closed.
        stack: list[list[_WebVTTCueComponent]] = [[]]
        tag_stack: list[Union[str, tuple]] = []
        simple_closers = {
            "</i>": _WebVTTCueItalicSpan,
            "</b>": _WebVTTCueBoldSpan,
            "</u>": _WebVTTCueUnderlineSpan,
            "</c>": _WebVTTCueClassSpan,
        }

        pos = 0
        for match in cls._pattern_block.finditer(cue_text):
            if match.start() > pos:
                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
            tag = match.group(0)

            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
                tag_stack.append(tag[1:2])
                stack.append([])
            elif tag in simple_closers or tag.startswith("</v"):
                # Popping the root list would make the later stack[-1]/stack[0]
                # accesses raise IndexError, which callers do not catch; report
                # the stray closing tag as a ValueError instead.
                if len(stack) < 2:
                    raise ValueError(f"Unexpected closing tag {tag} in cue payload")
                children = stack.pop()
                if tag in simple_closers:
                    stack[-1].append(simple_closers[tag](components=children))
                    tag_stack.pop()
                elif (
                    tag_stack
                    and isinstance(tag_stack[-1], tuple)
                    and tag_stack[-1][0] == "v"
                ):
                    _, voice = cast(tuple, tag_stack.pop())
                    voice_match = cls._pattern_voice_tag.match(voice)
                    if voice_match:
                        class_string = voice_match.group("class")
                        annotation = voice_match.group("annotation")
                        if annotation:
                            classes: list[str] = []
                            if class_string:
                                classes = [c for c in class_string.split(".") if c]
                            stack[-1].append(
                                _WebVTTCueVoiceSpan(
                                    annotation=annotation.strip(),
                                    classes=classes,
                                    components=children,
                                )
                            )
            elif tag.startswith("<v"):
                # Keep the full start tag so classes and annotation can be
                # extracted when the span is closed.
                tag_stack.append(("v", tag))
                stack.append([])

            pos = match.end()

        if pos < len(cue_text):
            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))

        # Only stack[0] is returned: children of spans that were never closed
        # are not part of the payload.
        return cls(
            identifier=identifier,
            timings=timings,
            payload=stack[0],
        )

    def __str__(self):
        parts = []
        if self.identifier:
            parts.append(f"{self.identifier}\n")
        timings_line = str(self.timings)
        parts.append(timings_line + "\n")
        for idx, span in enumerate(self.payload):
            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
                # the end tag may be omitted for brevity
                parts.append(str(span).removesuffix("</v>"))
            else:
                parts.append(str(span))

        return "".join(parts)
376
+
377
+
378
class _WebVTTFile(BaseModel):
    """A model representing a WebVTT file."""

    cue_blocks: list[_WebVTTCueBlock]

    @staticmethod
    def verify_signature(content: str) -> bool:
        """Check that ``content`` starts with a WebVTT file signature.

        The signature is an optional byte order mark, the string "WEBVTT", and
        then either the end of the content or a space, tab or line terminator.

        Args:
            content: The decoded file content.

        Returns:
            True when the signature is present, False otherwise.
        """
        if not content:
            return False
        content = content.removeprefix("\ufeff")
        if not content.startswith("WEBVTT"):
            return False
        # "\r" is accepted so that content with CR or CRLF line endings that has
        # not been normalized yet is still recognized.
        return len(content) == 6 or content[6] in (" ", "\t", "\n", "\r")

    @classmethod
    def parse(cls, raw: str) -> "_WebVTTFile":
        """Parse the text of a WebVTT file into cue blocks.

        NOTE, STYLE and REGION blocks are skipped. Blocks that cannot be parsed
        as a cue are logged with a warning and skipped.

        Args:
            raw: The decoded file content.

        Returns:
            The parsed file.

        Raises:
            ValueError: If the content does not start with a WebVTT signature.
        """
        # Normalize newlines to LF
        raw = raw.replace("\r\n", "\n").replace("\r", "\n")

        # Check WebVTT signature
        if not cls.verify_signature(raw):
            raise ValueError("Invalid WebVTT file signature")

        # Strip "WEBVTT" header line
        _, _, body = raw.partition("\n")

        # Split into blocks first, then classify each block by its first line.
        # Matching whole blocks avoids leaving the body of a STYLE/REGION block
        # behind and avoids dropping cues whose identifier merely starts with
        # "NOTE", "STYLE" or "REGION".
        cues: list[_WebVTTCueBlock] = []
        for block in re.split(r"\n\s*\n", body.strip()):
            if not block:
                # Nothing after the header line; there is no cue to parse.
                continue
            first_line = block.split("\n", 1)[0]
            if first_line == "NOTE" or first_line.startswith(("NOTE ", "NOTE\t")):
                continue
            if first_line.rstrip(" \t") in ("STYLE", "REGION"):
                continue
            try:
                cues.append(_WebVTTCueBlock.parse(block))
            except ValueError as e:
                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")

        return cls(cue_blocks=cues)

    def __iter__(self):
        return iter(self.cue_blocks)

    def __getitem__(self, idx):
        return self.cue_blocks[idx]

    def __len__(self):
        return len(self.cue_blocks)
430
+
431
+
432
class WebVTTDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for WebVTT (.vtt) files.

    This parser reads the content of a WebVTT file and converts
    it to a DoclingDocument, following the W3C specs on https://www.w3.org/TR/webvtt1

    Each cue block becomes a section group holding text items for its identifier
    (if any), its timings and its payload. Groups are appended to the document
    body in the order the cue blocks appear in the file.
    """

    @override
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Read and decode the WebVTT content from a stream or a file path.

        Raises:
            RuntimeError: If the content cannot be read or decoded.
        """
        super().__init__(in_doc, path_or_stream)

        self.content: str = ""
        try:
            # "utf-8-sig" decodes plain UTF-8 and drops a leading byte order mark.
            if isinstance(self.path_or_stream, BytesIO):
                text = self.path_or_stream.getvalue().decode("utf-8-sig")
                # Text-mode open() below translates CR and CRLF to LF; do the
                # same here so both input kinds yield the same content.
                self.content = text.replace("\r\n", "\n").replace("\r", "\n")
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, encoding="utf-8-sig") as f:
                    self.content = f.read()
        except Exception as e:
            raise RuntimeError(
                "Could not initialize the WebVTT backend for file with hash "
                f"{self.document_hash}."
            ) from e

    @override
    def is_valid(self) -> bool:
        """Return whether the loaded content starts with a WebVTT signature."""
        return _WebVTTFile.verify_signature(self.content)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        return False

    @override
    def unload(self):
        """Close the underlying stream, if any, and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.VTT}

    @staticmethod
    def _add_text_from_component(
        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
    ) -> None:
        """Adds a TextItem to a document by extracting text from a cue span component.

        Italic, bold and underline spans set the matching formatting flag. For a
        non-text span only its direct text-span children contribute text; nothing
        is added when the resulting text is empty after stripping.

        TODO: address nesting
        """
        formatting = Formatting()
        text = ""
        if isinstance(item, _WebVTTCueItalicSpan):
            formatting.italic = True
        elif isinstance(item, _WebVTTCueBoldSpan):
            formatting.bold = True
        elif isinstance(item, _WebVTTCueUnderlineSpan):
            formatting.underline = True
        if isinstance(item, _WebVTTCueTextSpan):
            text = item.text
        else:
            # TODO: address nesting
            text = "".join(
                [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)]
            )
        if text := text.strip():
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=text,
                parent=parent,
                content_layer=ContentLayer.BODY,
                formatting=formatting,
            )

    @override
    def convert(self) -> DoclingDocument:
        """Convert the loaded WebVTT content into a DoclingDocument.

        Raises:
            RuntimeError: If the content does not start with a WebVTT signature.
        """
        _log.debug("Starting WebVTT conversion...")
        if not self.is_valid():
            raise RuntimeError("Invalid WebVTT document.")

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/vtt",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
        for block in vtt.cue_blocks:
            block_group = doc.add_group(
                label=GroupLabel.SECTION,
                name="WebVTT cue block",
                parent=None,
                content_layer=ContentLayer.BODY,
            )
            if block.identifier:
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=str(block.identifier),
                    parent=block_group,
                    content_layer=ContentLayer.BODY,
                )
            doc.add_text(
                label=DocItemLabel.TEXT,
                text=str(block.timings),
                parent=block_group,
                content_layer=ContentLayer.BODY,
            )
            for cue_span in block.payload:
                if isinstance(cue_span, _WebVTTCueVoiceSpan):
                    # A voice span gets its own inline group: first a
                    # "name (classes): " label, then its components.
                    voice_group = doc.add_group(
                        label=GroupLabel.INLINE,
                        name="WebVTT cue voice span",
                        parent=block_group,
                        content_layer=ContentLayer.BODY,
                    )
                    voice = cue_span.annotation
                    if classes := cue_span.classes:
                        voice += f" ({', '.join(classes)})"
                    voice += ": "
                    doc.add_text(
                        label=DocItemLabel.TEXT,
                        text=voice,
                        parent=voice_group,
                        content_layer=ContentLayer.BODY,
                    )
                    for item in cue_span.components:
                        WebVTTDocumentBackend._add_text_from_component(
                            doc, item, voice_group
                        )
                else:
                    WebVTTDocumentBackend._add_text_from_component(
                        doc, cue_span, block_group
                    )

        return doc
File without changes