docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,431 @@
1
+ import logging
2
+ import os
3
+ import re
4
+ import sys
5
+ import tempfile
6
+ from io import BytesIO
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, List, Optional, Union, cast
9
+
10
+ from docling_core.types.doc import DoclingDocument, DocumentOrigin
11
+
12
+ # import whisper # type: ignore
13
+ # import librosa
14
+ # import numpy as np
15
+ # import soundfile as sf # type: ignore
16
+ from docling_core.types.doc.labels import DocItemLabel
17
+ from pydantic import BaseModel, Field, validator
18
+
19
+ from docling.backend.abstract_backend import AbstractDocumentBackend
20
+ from docling.backend.noop_backend import NoOpBackend
21
+
22
+ # from pydub import AudioSegment # type: ignore
23
+ # from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
24
+ from docling.datamodel.accelerator_options import (
25
+ AcceleratorOptions,
26
+ )
27
+ from docling.datamodel.base_models import (
28
+ ConversionStatus,
29
+ FormatToMimeType,
30
+ )
31
+ from docling.datamodel.document import ConversionResult, InputDocument
32
+ from docling.datamodel.pipeline_options import (
33
+ AsrPipelineOptions,
34
+ )
35
+ from docling.datamodel.pipeline_options_asr_model import (
36
+ InlineAsrMlxWhisperOptions,
37
+ InlineAsrNativeWhisperOptions,
38
+ # AsrResponseFormat,
39
+ InlineAsrOptions,
40
+ )
41
+ from docling.datamodel.pipeline_options_vlm_model import (
42
+ InferenceFramework,
43
+ )
44
+ from docling.datamodel.settings import settings
45
+ from docling.pipeline.base_pipeline import BasePipeline
46
+ from docling.utils.accelerator_utils import decide_device
47
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
48
+
49
+ _log = logging.getLogger(__name__)
50
+
51
+
52
class _ConversationWord(BaseModel):
    # One transcribed word together with its optional time span.
    # Both offsets default to None; only ``end_time`` carries a ``ge=0``
    # validation constraint.
    text: str
    start_time: Optional[float] = Field(
        default=None,
        description="Start time in seconds from video start",
    )
    end_time: Optional[float] = Field(
        default=None,
        ge=0,
        description="End time in seconds from video start",
    )
60
+
61
+
62
class _ConversationItem(BaseModel):
    # One transcribed segment: its text, optional time span, optional speaker
    # information and optional word-level breakdown.
    # Ordering (``<``) and equality (``==``) are defined purely on
    # ``start_time``; the other fields do not participate in comparisons.
    text: str
    start_time: Optional[float] = Field(
        None, description="Start time in seconds from video start"
    )
    end_time: Optional[float] = Field(
        None, ge=0, description="End time in seconds from video start"
    )
    speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
    speaker: Optional[str] = Field(
        None, description="Speaker name, defaults to speaker-{speaker_id}"
    )
    words: Optional[list[_ConversationWord]] = Field(
        None, description="Individual words with time-stamps"
    )

    def __lt__(self, other):
        """Return True when this item starts before ``other``.

        ``start_time`` is optional, so a plain ``<`` would raise ``TypeError``
        as soon as one side is ``None``. Items without a start time are
        ordered before items that have one; two items without a start time
        are not less than each other.
        """
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        if self.start_time is None or other.start_time is None:
            # Only "None vs. timestamp" counts as strictly smaller.
            return self.start_time is None and other.start_time is not None
        return self.start_time < other.start_time

    def __eq__(self, other):
        """Return True when both items share the same ``start_time``."""
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        return self.start_time == other.start_time

    def to_string(self) -> str:
        """Format the conversation entry as a string.

        The result is the segment text, prefixed by a ``[time: start-end]``
        tag when both timestamps are set and by a ``[speaker:name]`` tag when
        a speaker name is set.
        """
        result = ""
        if (self.start_time is not None) and (self.end_time is not None):
            result += f"[time: {self.start_time}-{self.end_time}] "

        if self.speaker is not None:
            result += f"[speaker:{self.speaker}] "

        result += self.text
        return result
99
+
100
+
101
class _NativeWhisperModel:
    """Transcriber backed by the ``whisper`` (openai-whisper) package."""

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        asr_options: InlineAsrNativeWhisperOptions,
    ):
        """
        Transcriber using native Whisper.

        Args:
            enabled: When False, nothing is imported or loaded and none of the
                model attributes are set.
            artifacts_path: Optional directory passed to Whisper as
                ``download_root``; when None, Whisper's own default is used.
            accelerator_options: Source of the requested device.
            asr_options: Model name (``repo_id``) and transcription settings.

        Raises:
            ImportError: If ``enabled`` is True and ``whisper`` cannot be
                imported.
        """
        self.enabled = enabled

        _log.info(f"artifacts-path: {artifacts_path}")
        _log.info(f"accelerator_options: {accelerator_options}")

        if self.enabled:
            try:
                import whisper  # type: ignore
            except ImportError as exc:
                # Chain the original error so the root cause stays visible.
                if sys.version_info < (3, 14):
                    raise ImportError(
                        "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
                    ) from exc
                else:
                    raise ImportError(
                        "whisper is not installed. Unfortunately its dependencies are not yet available for Python 3.14."
                    ) from exc

            self.asr_options = asr_options
            self.max_tokens = asr_options.max_new_tokens
            self.temperature = asr_options.temperature

            self.device = decide_device(
                accelerator_options.device,
                supported_devices=asr_options.supported_devices,
            )
            _log.info(f"Available device for Whisper: {self.device}")

            self.model_name = asr_options.repo_id
            _log.info(f"loading _NativeWhisperModel({self.model_name})")
            if artifacts_path is not None:
                _log.info(f"loading {self.model_name} from {artifacts_path}")
                self.model = whisper.load_model(
                    name=self.model_name,
                    device=self.device,
                    download_root=str(artifacts_path),
                )
            else:
                self.model = whisper.load_model(
                    name=self.model_name, device=self.device
                )

            self.verbose = asr_options.verbose
            self.timestamps = asr_options.timestamps
            self.word_timestamps = asr_options.word_timestamps

    def run(self, conv_res: ConversionResult) -> ConversionResult:
        """Transcribe the input of ``conv_res`` and store it as a document.

        Each transcribed segment becomes one TEXT item of a new
        ``DoclingDocument``. Errors during transcription or document creation
        are logged and reported through ``conv_res.status = FAILURE`` instead
        of being raised.

        Raises:
            RuntimeError: If the backend holds neither a ``Path`` nor a
                ``BytesIO``.
        """
        # Access the file path from the backend, similar to how other pipelines handle it
        path_or_stream = conv_res.input._backend.path_or_stream

        # Handle both Path and BytesIO inputs
        temp_file_path: Optional[Path] = None

        if isinstance(path_or_stream, BytesIO):
            # For BytesIO, write to a temporary file since whisper requires a file path
            suffix = Path(conv_res.input.file.name).suffix or ".wav"
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(path_or_stream.getvalue())
                temp_file_path = Path(tmp_file.name)
            audio_path = temp_file_path
        elif isinstance(path_or_stream, Path):
            audio_path = path_or_stream
        else:
            raise RuntimeError(
                f"ASR pipeline requires a file path or BytesIO stream, but got {type(path_or_stream)}"
            )

        try:
            conversation = self.transcribe(audio_path)

            # Ensure we have a proper DoclingDocument
            origin = DocumentOrigin(
                filename=conv_res.input.file.name or "audio.wav",
                mimetype="audio/x-wav",
                binary_hash=conv_res.input.document_hash,
            )
            conv_res.document = DoclingDocument(
                name=conv_res.input.file.stem or "audio.wav", origin=origin
            )

            for citem in conversation:
                conv_res.document.add_text(
                    label=DocItemLabel.TEXT, text=citem.to_string()
                )

            return conv_res

        except Exception as exc:
            _log.error(f"Audio transcription has an error: {exc}")
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        finally:
            # Clean up temporary file if created
            if temp_file_path is not None and temp_file_path.exists():
                try:
                    temp_file_path.unlink()
                except Exception as e:
                    _log.warning(
                        f"Failed to delete temporary file {temp_file_path}: {e}"
                    )

    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
        """Run Whisper on ``fpath`` and convert its segments.

        Args:
            fpath: Path to the audio file.

        Returns:
            One ``_ConversationItem`` per entry of the result's ``"segments"``
            list. ``words`` is filled only when word timestamps are enabled
            and the segment has a ``"words"`` entry; otherwise it is empty.
        """
        result = self.model.transcribe(
            str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
        )

        convo: list[_ConversationItem] = []
        for segment in result["segments"]:
            words: list[_ConversationWord] = []
            if self.word_timestamps and "words" in segment:
                words = [
                    _ConversationWord(
                        start_time=word["start"],
                        end_time=word["end"],
                        text=word["word"],
                    )
                    for word in segment["words"]
                ]
            convo.append(
                _ConversationItem(
                    start_time=segment["start"],
                    end_time=segment["end"],
                    text=segment["text"],
                    words=words,
                )
            )

        return convo
237
+
238
+
239
class _MlxWhisperModel:
    """Transcriber backed by the ``mlx_whisper`` package."""

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        asr_options: InlineAsrMlxWhisperOptions,
    ):
        """
        Transcriber using MLX Whisper for Apple Silicon optimization.

        Args:
            enabled: When False, nothing is imported and none of the model
                attributes are set.
            artifacts_path: Only logged; it is not used to locate the model.
            accelerator_options: Source of the requested device.
            asr_options: Model repository (``repo_id``) and transcription
                settings.

        Raises:
            ImportError: If ``enabled`` is True and ``mlx_whisper`` cannot be
                imported.
        """
        self.enabled = enabled

        _log.info(f"artifacts-path: {artifacts_path}")
        _log.info(f"accelerator_options: {accelerator_options}")

        if self.enabled:
            try:
                import mlx_whisper  # type: ignore
            except ImportError as exc:
                # Chain the original error so the root cause stays visible.
                raise ImportError(
                    "mlx-whisper is not installed. Please install it via `pip install mlx-whisper` or do `uv sync --extra asr`."
                ) from exc
            self.asr_options = asr_options
            self.mlx_whisper = mlx_whisper

            self.device = decide_device(
                accelerator_options.device,
                supported_devices=asr_options.supported_devices,
            )
            _log.info(f"Available device for MLX Whisper: {self.device}")

            self.model_name = asr_options.repo_id
            _log.info(f"loading _MlxWhisperModel({self.model_name})")

            # MLX Whisper models are loaded differently - they use HuggingFace repos
            self.model_path = self.model_name

            # Store MLX-specific options
            self.language = asr_options.language
            self.task = asr_options.task
            self.word_timestamps = asr_options.word_timestamps
            self.no_speech_threshold = asr_options.no_speech_threshold
            self.logprob_threshold = asr_options.logprob_threshold
            self.compression_ratio_threshold = asr_options.compression_ratio_threshold

    def run(self, conv_res: ConversionResult) -> ConversionResult:
        """Transcribe the input of ``conv_res`` and store it as a document.

        Each transcribed segment becomes one TEXT item of a new
        ``DoclingDocument``. On success the status is set to SUCCESS; errors
        during transcription or document creation are logged and reported
        through ``conv_res.status = FAILURE`` instead of being raised.
        """
        temp_file_path: Optional[Path] = None

        # In-memory inputs have no file on disk, so resolving
        # ``conv_res.input.file`` cannot work for them. Spill the stream to a
        # temporary file first, as the native Whisper model does.
        backend = getattr(conv_res.input, "_backend", None)
        path_or_stream = getattr(backend, "path_or_stream", None)

        if isinstance(path_or_stream, BytesIO):
            suffix = Path(conv_res.input.file.name).suffix or ".wav"
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(path_or_stream.getvalue())
                temp_file_path = Path(tmp_file.name)
            audio_path: Path = temp_file_path
        else:
            # Unchanged behaviour for file-based inputs.
            audio_path = Path(conv_res.input.file).resolve()

        try:
            conversation = self.transcribe(audio_path)

            # Ensure we have a proper DoclingDocument
            origin = DocumentOrigin(
                filename=conv_res.input.file.name or "audio.wav",
                mimetype="audio/x-wav",
                binary_hash=conv_res.input.document_hash,
            )
            conv_res.document = DoclingDocument(
                name=conv_res.input.file.stem or "audio.wav", origin=origin
            )

            for citem in conversation:
                conv_res.document.add_text(
                    label=DocItemLabel.TEXT, text=citem.to_string()
                )

            conv_res.status = ConversionStatus.SUCCESS
            return conv_res

        except Exception as exc:
            _log.error(f"MLX Audio transcription has an error: {exc}")

            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        finally:
            # Clean up temporary file if created
            if temp_file_path is not None and temp_file_path.exists():
                try:
                    temp_file_path.unlink()
                except Exception as e:
                    _log.warning(
                        f"Failed to delete temporary file {temp_file_path}: {e}"
                    )

    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
        """
        Transcribe audio using MLX Whisper.

        Args:
            fpath: Path to audio file

        Returns:
            List of conversation items with timestamps
        """
        result = self.mlx_whisper.transcribe(
            str(fpath),
            path_or_hf_repo=self.model_path,
            language=self.language,
            task=self.task,
            word_timestamps=self.word_timestamps,
            no_speech_threshold=self.no_speech_threshold,
            logprob_threshold=self.logprob_threshold,
            compression_ratio_threshold=self.compression_ratio_threshold,
        )

        convo: list[_ConversationItem] = []

        # MLX Whisper returns segments similar to native Whisper
        for segment in result.get("segments", []):
            item = _ConversationItem(
                start_time=segment.get("start"),
                end_time=segment.get("end"),
                text=segment.get("text", "").strip(),
                words=[],
            )

            # Add word-level timestamps if available
            if self.word_timestamps and "words" in segment:
                item.words = []
                for word_data in segment["words"]:
                    item.words.append(
                        _ConversationWord(
                            start_time=word_data.get("start"),
                            end_time=word_data.get("end"),
                            text=word_data.get("word", ""),
                        )
                    )
            convo.append(item)

        return convo
361
+
362
+
363
class AsrPipeline(BasePipeline):
    """Pipeline that turns audio input into a document via a Whisper model."""

    def __init__(self, pipeline_options: AsrPipelineOptions):
        """Select and build the transcription model from ``asr_options``.

        Native Whisper options create a ``_NativeWhisperModel``; MLX Whisper
        options create a ``_MlxWhisperModel``. For any other options type an
        error is logged and ``self._model`` is left unset.
        """
        super().__init__(pipeline_options)
        # NOTE(review): presumably keeps the input backend available while the
        # pipeline runs; the native Whisper model reads the input through
        # ``conv_res.input._backend`` - confirm against BasePipeline.
        self.keep_backend = True

        self.pipeline_options: AsrPipelineOptions = pipeline_options
        self._model: Union[_NativeWhisperModel, _MlxWhisperModel]

        if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
            native_asr_options: InlineAsrNativeWhisperOptions = (
                self.pipeline_options.asr_options
            )
            self._model = _NativeWhisperModel(
                enabled=True,  # must be always enabled for this pipeline to make sense.
                artifacts_path=self.artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                asr_options=native_asr_options,
            )
        elif isinstance(self.pipeline_options.asr_options, InlineAsrMlxWhisperOptions):
            mlx_asr_options: InlineAsrMlxWhisperOptions = (
                self.pipeline_options.asr_options
            )
            self._model = _MlxWhisperModel(
                enabled=True,  # must be always enabled for this pipeline to make sense.
                artifacts_path=self.artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                asr_options=mlx_asr_options,
            )
        else:
            # No model is assigned here, so a later ``_build_document`` call
            # fails with AttributeError on ``self._model``.
            _log.error(f"No model support for {self.pipeline_options.asr_options}")

    def _has_text(self, document: "DoclingDocument") -> bool:
        """
        Helper method to check if the document contains any transcribed text.
        A transcription is considered non-empty if the .texts list contains items with actual, non whitespace content.
        """
        if not document or not document.texts:
            return False
        return any(item.text and item.text.strip() for item in document.texts)

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        """Determines the final status of ASR Conversion based on its result.

        Returns FAILURE when the result is already marked failed or carries
        errors, PARTIAL_SUCCESS when the document has no non-whitespace text,
        and SUCCESS otherwise.
        """
        if conv_res.status == ConversionStatus.FAILURE or conv_res.errors:
            return ConversionStatus.FAILURE
        if not self._has_text(conv_res.document):
            # The two literals are concatenated, so the separating space must
            # be part of the first one.
            _log.warning(
                "ASR conversion resulted in an empty document. "
                f"File: {conv_res.input.file.name}"
            )
            return ConversionStatus.PARTIAL_SUCCESS
        return ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> AsrPipelineOptions:
        """Return a default-constructed ``AsrPipelineOptions``."""
        return AsrPipelineOptions()

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Run the transcription model on ``conv_res`` under a timing scope."""
        _log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            self._model.run(conv_res=conv_res)

        return conv_res

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return True only for ``NoOpBackend`` instances."""
        return isinstance(backend, NoOpBackend)
@@ -0,0 +1,72 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem
7
+ from docling.datamodel.document import InputDocument
8
+ from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
9
+ from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
10
+ from docling.datamodel.settings import settings
11
+
12
+ _log = logging.getLogger(__name__)
13
+
14
+
15
class BaseExtractionPipeline(ABC):
    """Abstract base class for extraction pipelines.

    Subclasses supply the extraction step, the status decision and their
    default options; this class resolves the artifacts folder and wraps the
    extraction in common error handling.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        """Store the options and resolve the artifacts folder.

        The pipeline's own ``artifacts_path`` is used when set; otherwise the
        one from the global settings. A resolved path must be an existing
        directory.

        Raises:
            RuntimeError: If the resolved path is not a directory.
        """
        self.pipeline_options = pipeline_options

        # The pipeline-level value wins; the global setting is only consulted
        # when the pipeline does not define one.
        configured = pipeline_options.artifacts_path
        if configured is None:
            configured = settings.artifacts_path

        self.artifacts_path: Optional[Path] = (
            None if configured is None else Path(configured).expanduser()
        )

        if not (self.artifacts_path is None or self.artifacts_path.is_dir()):
            raise RuntimeError(
                f"The value of {self.artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

    def execute(
        self,
        in_doc: InputDocument,
        raises_on_error: bool,
        template: Optional[ExtractionTemplateType] = None,
    ) -> ExtractionResult:
        """Run the extraction for ``in_doc`` and return its result.

        Any exception from the extraction or the status decision marks the
        result as FAILURE and is recorded as an ``ErrorItem``. It is re-raised
        only when ``raises_on_error`` is true.
        """
        result = ExtractionResult(input=in_doc)

        try:
            result = self._extract_data(result, template)
            result.status = self._determine_status(result)
        except Exception as exc:
            result.status = ConversionStatus.FAILURE
            result.errors.append(
                ErrorItem(
                    component_type="extraction_pipeline",
                    module_name=self.__class__.__name__,
                    error_message=str(exc),
                )
            )
            if raises_on_error:
                raise exc

        return result

    @abstractmethod
    def _extract_data(
        self,
        ext_res: ExtractionResult,
        template: Optional[ExtractionTemplateType] = None,
    ) -> ExtractionResult:
        """Fill ``ext_res`` (pages/errors) and return it; implemented by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
        """Map ``ext_res`` to SUCCESS/PARTIAL_SUCCESS/FAILURE; implemented by subclasses."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return the default options for the pipeline; implemented by subclasses."""