docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,559 @@
1
+ import hashlib
2
+ import logging
3
+ import sys
4
+ import threading
5
+ import time
6
+ import warnings
7
+ from collections.abc import Iterable, Iterator
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from datetime import datetime
10
+ from functools import partial
11
+ from io import BytesIO
12
+ from pathlib import Path
13
+ from typing import Optional, Type, Union
14
+
15
+ from pydantic import ConfigDict, model_validator, validate_call
16
+ from typing_extensions import Self
17
+
18
+ from docling.backend.abstract_backend import (
19
+ AbstractDocumentBackend,
20
+ )
21
+ from docling.backend.asciidoc_backend import AsciiDocBackend
22
+ from docling.backend.csv_backend import CsvDocumentBackend
23
+ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
24
+ from docling.backend.html_backend import HTMLDocumentBackend
25
+ from docling.backend.image_backend import ImageDocumentBackend
26
+ from docling.backend.json.docling_json_backend import DoclingJSONBackend
27
+ from docling.backend.md_backend import MarkdownDocumentBackend
28
+ from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
29
+ from docling.backend.msexcel_backend import MsExcelDocumentBackend
30
+ from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
31
+ from docling.backend.msword_backend import MsWordDocumentBackend
32
+ from docling.backend.noop_backend import NoOpBackend
33
+ from docling.backend.webvtt_backend import WebVTTDocumentBackend
34
+ from docling.backend.xml.jats_backend import JatsDocumentBackend
35
+ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
36
+ from docling.datamodel.backend_options import (
37
+ BackendOptions,
38
+ HTMLBackendOptions,
39
+ MarkdownBackendOptions,
40
+ PdfBackendOptions,
41
+ )
42
+ from docling.datamodel.base_models import (
43
+ BaseFormatOption,
44
+ ConversionStatus,
45
+ DoclingComponentType,
46
+ DocumentStream,
47
+ ErrorItem,
48
+ InputFormat,
49
+ )
50
+ from docling.datamodel.document import (
51
+ ConversionResult,
52
+ InputDocument,
53
+ _DocumentConversionInput,
54
+ )
55
+ from docling.datamodel.pipeline_options import PipelineOptions
56
+ from docling.datamodel.settings import (
57
+ DEFAULT_PAGE_RANGE,
58
+ DocumentLimits,
59
+ PageRange,
60
+ settings,
61
+ )
62
+ from docling.exceptions import ConversionError
63
+ from docling.pipeline.asr_pipeline import AsrPipeline
64
+ from docling.pipeline.base_pipeline import BasePipeline
65
+ from docling.pipeline.simple_pipeline import SimplePipeline
66
+ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
67
+ from docling.utils.utils import chunkify
68
+
69
# Module-level logger named after this module.
_log = logging.getLogger(__name__)
# Lock taken in DocumentConverter._get_pipeline while a pipeline is looked up or
# created in the cache. It lives at module scope, so it is shared by all
# DocumentConverter instances in the process.
_PIPELINE_CACHE_LOCK = threading.Lock()
71
+
72
+
73
class FormatOption(BaseFormatOption):
    # Per-format conversion settings: which pipeline class handles the format
    # and, optionally, which options are handed to the document backend.
    pipeline_cls: Type[BasePipeline]
    backend_options: Optional[BackendOptions] = None

    @model_validator(mode="after")
    def set_optional_field_default(self) -> Self:
        """Fill in `pipeline_options` from the pipeline class when it was not given."""
        if self.pipeline_options is not None:
            # Explicit options supplied by the caller always win.
            return self

        self.pipeline_options = self.pipeline_cls.get_default_options()
        return self
83
+
84
+
85
class CsvFormatOption(FormatOption):
    # Defaults used for InputFormat.CSV: SimplePipeline with CsvDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
88
+
89
+
90
class ExcelFormatOption(FormatOption):
    # Defaults used for InputFormat.XLSX: SimplePipeline with MsExcelDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
93
+
94
+
95
class WordFormatOption(FormatOption):
    # Defaults used for InputFormat.DOCX: SimplePipeline with MsWordDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
98
+
99
+
100
class PowerpointFormatOption(FormatOption):
    # Defaults used for InputFormat.PPTX: SimplePipeline with
    # MsPowerpointDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
103
+
104
+
105
class MarkdownFormatOption(FormatOption):
    # Defaults used for InputFormat.MD: SimplePipeline with MarkdownDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
    # Re-declares the inherited field with the Markdown-specific options type.
    backend_options: Optional[MarkdownBackendOptions] = None
109
+
110
+
111
class AsciiDocFormatOption(FormatOption):
    # Defaults used for InputFormat.ASCIIDOC: SimplePipeline with AsciiDocBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = AsciiDocBackend
114
+
115
+
116
class HTMLFormatOption(FormatOption):
    # Defaults used for InputFormat.HTML: SimplePipeline with HTMLDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
    # Re-declares the inherited field with the HTML-specific options type.
    backend_options: Optional[HTMLBackendOptions] = None
120
+
121
+
122
class PatentUsptoFormatOption(FormatOption):
    # Defaults used for InputFormat.XML_USPTO: SimplePipeline with
    # PatentUsptoDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    # NOTE(review): annotated with the concrete backend class, whereas the sibling
    # option classes use Type[AbstractDocumentBackend] -- confirm this is intended.
    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
125
+
126
+
127
class XMLJatsFormatOption(FormatOption):
    # Defaults used for InputFormat.XML_JATS: SimplePipeline with JatsDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
130
+
131
+
132
class ImageFormatOption(FormatOption):
    # Defaults used for InputFormat.IMAGE: StandardPdfPipeline with
    # ImageDocumentBackend. DocumentConverter.__init__ also converts custom IMAGE
    # options that use a different backend into this class.
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = ImageDocumentBackend
135
+
136
+
137
class PdfFormatOption(FormatOption):
    # Defaults used for InputFormat.PDF: StandardPdfPipeline with
    # DoclingParseV4DocumentBackend.
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
    # Re-declares the inherited field with the PDF-specific options type.
    backend_options: Optional[PdfBackendOptions] = None
141
+
142
+
143
class AudioFormatOption(FormatOption):
    # Defaults used for InputFormat.AUDIO: AsrPipeline with NoOpBackend.
    pipeline_cls: Type = AsrPipeline
    backend: Type[AbstractDocumentBackend] = NoOpBackend
146
+
147
+
148
def _get_default_option(format: InputFormat) -> FormatOption:
    """Build the default `FormatOption` for an input format.

    Only the option for the requested format is instantiated; a new instance
    is returned on every call.

    Args:
        format: The input format to look up.

    Returns:
        A freshly constructed option object for `format`.

    Raises:
        RuntimeError: If no default option is configured for `format`.
    """
    # Map each format to a zero-argument factory rather than to an instance, so
    # a lookup does not construct (and validate) the options of every other
    # format as a side effect.
    default_option_factories = {
        InputFormat.CSV: CsvFormatOption,
        InputFormat.XLSX: ExcelFormatOption,
        InputFormat.DOCX: WordFormatOption,
        InputFormat.PPTX: PowerpointFormatOption,
        InputFormat.MD: MarkdownFormatOption,
        InputFormat.ASCIIDOC: AsciiDocFormatOption,
        InputFormat.HTML: HTMLFormatOption,
        InputFormat.XML_USPTO: PatentUsptoFormatOption,
        InputFormat.XML_JATS: XMLJatsFormatOption,
        InputFormat.METS_GBS: partial(
            FormatOption,
            pipeline_cls=StandardPdfPipeline,
            backend=MetsGbsDocumentBackend,
        ),
        InputFormat.IMAGE: ImageFormatOption,
        InputFormat.PDF: PdfFormatOption,
        InputFormat.JSON_DOCLING: partial(
            FormatOption,
            pipeline_cls=SimplePipeline,
            backend=DoclingJSONBackend,
        ),
        InputFormat.AUDIO: AudioFormatOption,
        InputFormat.VTT: partial(
            FormatOption,
            pipeline_cls=SimplePipeline,
            backend=WebVTTDocumentBackend,
        ),
    }

    factory = default_option_factories.get(format)
    if factory is None:
        raise RuntimeError(f"No default options configured for {format}")
    return factory()
176
+
177
+
178
class DocumentConverter:
    """Convert documents of various input formats to Docling documents.

    `DocumentConverter` is the main entry point for converting documents in Docling.
    It handles various input formats (PDF, DOCX, PPTX, images, HTML, Markdown, etc.)
    and provides both single-document and batch conversion capabilities.

    The conversion methods return a `ConversionResult` instance for each document,
    which wraps a `DoclingDocument` object if the conversion was successful, along
    with metadata about the conversion process.

    Attributes:
        allowed_formats: Allowed input formats.
        format_to_options: Mapping of formats to their options.
        initialized_pipelines: Cache of initialized pipelines keyed by
            (pipeline class, options hash).
    """

    _default_download_filename = "file"

    def __init__(
        self,
        allowed_formats: Optional[list[InputFormat]] = None,
        format_options: Optional[dict[InputFormat, FormatOption]] = None,
    ) -> None:
        """Initialize the converter based on format preferences.

        Args:
            allowed_formats: List of allowed input formats. By default, any
                format supported by Docling is allowed.
            format_options: Dictionary of format-specific options.
        """
        self.allowed_formats: list[InputFormat] = (
            allowed_formats if allowed_formats is not None else list(InputFormat)
        )

        # Normalize format options: ensure IMAGE format uses ImageDocumentBackend
        # for backward compatibility (old code might use PdfFormatOption or other
        # backends for images).
        normalized_format_options: dict[InputFormat, FormatOption] = {}
        for fmt, option in (format_options or {}).items():
            if fmt == InputFormat.IMAGE and option.backend is not ImageDocumentBackend:
                # stacklevel=2 attributes the warning to the code constructing
                # the converter, so the warn call must stay directly in __init__.
                warnings.warn(
                    f"Using {option.backend.__name__} for InputFormat.IMAGE is deprecated. "
                    "Images should use ImageDocumentBackend via ImageFormatOption. "
                    "Automatically correcting the backend, please update your code to avoid this warning.",
                    DeprecationWarning,
                    stacklevel=2,
                )
                # Convert to ImageFormatOption while preserving pipeline and
                # backend options.
                option = ImageFormatOption(
                    pipeline_cls=option.pipeline_cls,
                    pipeline_options=option.pipeline_options,
                    backend_options=option.backend_options,
                )
            normalized_format_options[fmt] = option

        # Every allowed format gets an option: the caller's if provided,
        # otherwise the built-in default for that format.
        self.format_to_options: dict[InputFormat, FormatOption] = {}
        for fmt in self.allowed_formats:
            custom_option = normalized_format_options.get(fmt)
            self.format_to_options[fmt] = (
                custom_option
                if custom_option is not None
                else _get_default_option(format=fmt)
            )

        self.initialized_pipelines: dict[
            tuple[Type[BasePipeline], str], BasePipeline
        ] = {}

    def _get_initialized_pipelines(
        self,
    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
        """Return the pipeline cache (the live dict, not a copy)."""
        return self.initialized_pipelines

    def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
        """Generate a hash of pipeline options to use as part of the cache key."""
        options_str = str(pipeline_options.model_dump())
        # MD5 is used purely as a fingerprint here, not for security.
        return hashlib.md5(
            options_str.encode("utf-8"), usedforsecurity=False
        ).hexdigest()

    def initialize_pipeline(self, format: InputFormat) -> None:
        """Initialize the conversion pipeline for the selected format.

        Args:
            format: The input format for which to initialize the pipeline.

        Raises:
            ConversionError: If no pipeline could be initialized for the
                given format.
            RuntimeError: If `artifacts_path` is set in
                `docling.datamodel.settings.settings` when required by
                the pipeline, but points to a non-directory file.
            FileNotFoundError: If local model files are not found.
        """
        pipeline = self._get_pipeline(doc_format=format)
        if pipeline is None:
            raise ConversionError(
                f"No pipeline could be initialized for format {format}"
            )

    @validate_call(config=ConfigDict(strict=True))
    def convert(
        self,
        source: Union[Path, str, DocumentStream],  # TODO review naming
        headers: Optional[dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> ConversionResult:
        """Convert one document fetched from a file path, URL, or DocumentStream.

        Note: If the document content is given as a string (Markdown or HTML
        content), use the `convert_string` method.

        Args:
            source: Source of input document given as file path, URL, or
                DocumentStream.
            headers: Optional headers given as a dictionary of string key-value pairs,
                in case of URL input source.
            raises_on_error: Whether to raise an error on the first conversion failure.
                If False, errors are captured in the ConversionResult objects.
            max_num_pages: Maximum number of pages accepted per document.
                Documents exceeding this number will not be converted.
            max_file_size: Maximum file size to convert.
            page_range: Range of pages to convert.

        Returns:
            The conversion result, which contains a `DoclingDocument` in the `document`
            attribute, and metadata about the conversion process.

        Raises:
            ConversionError: An error occurred during conversion, or the
                conversion produced no result at all.
        """
        all_res = self.convert_all(
            source=[source],
            raises_on_error=raises_on_error,
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
            headers=headers,
            page_range=page_range,
        )
        try:
            return next(all_res)
        except StopIteration:
            # With raises_on_error=False the generator may finish without
            # yielding anything. Surface that as a ConversionError instead of
            # leaking a bare StopIteration to the caller.
            raise ConversionError(
                "Conversion failed because the provided file has no recognizable "
                "format or it wasn't in the list of allowed formats."
            ) from None

    @validate_call(config=ConfigDict(strict=True))
    def convert_all(
        self,
        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
        headers: Optional[dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> Iterator[ConversionResult]:
        """Convert multiple documents from file paths, URLs, or DocumentStreams.

        Args:
            source: Source of input documents given as an iterable of file paths, URLs,
                or DocumentStreams.
            headers: Optional headers given as a (single) dictionary of string
                key-value pairs, in case of URL input source.
            raises_on_error: Whether to raise an error on the first conversion failure.
            max_num_pages: Maximum number of pages accepted per document.
                Documents exceeding this number will not be converted.
            max_file_size: Maximum file size to convert.
            page_range: Range of pages to convert in each document.

        Yields:
            The conversion results, each containing a `DoclingDocument` in the
            `document` attribute and metadata about the conversion process.

        Raises:
            ConversionError: An error occurred during conversion.
        """
        limits = DocumentLimits(
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
            page_range=page_range,
        )
        conv_input = _DocumentConversionInput(
            path_or_stream_iterator=source, limits=limits, headers=headers
        )
        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)

        acceptable_statuses = {
            ConversionStatus.SUCCESS,
            ConversionStatus.PARTIAL_SUCCESS,
        }
        had_result = False
        for conv_res in conv_res_iter:
            had_result = True
            if raises_on_error and conv_res.status not in acceptable_statuses:
                error_details = ""
                if conv_res.errors:
                    error_messages = [err.error_message for err in conv_res.errors]
                    error_details = f" Errors: {'; '.join(error_messages)}"
                raise ConversionError(
                    f"Conversion failed for: {conv_res.input.file} with status: "
                    f"{conv_res.status}.{error_details}"
                )
            yield conv_res

        if not had_result and raises_on_error:
            raise ConversionError(
                "Conversion failed because the provided file has no recognizable "
                "format or it wasn't in the list of allowed formats."
            )

    @validate_call(config=ConfigDict(strict=True))
    def convert_string(
        self,
        content: str,
        format: InputFormat,
        name: Optional[str] = None,
    ) -> ConversionResult:
        """Convert a document given as a string using the specified format.

        Only Markdown (`InputFormat.MD`) and HTML (`InputFormat.HTML`) formats
        are supported. The content is wrapped in a `DocumentStream` and passed
        to the main conversion pipeline.

        Args:
            content: The document content as a string.
            format: The format of the input content.
            name: The filename to associate with the document. If not provided, a
                timestamp-based name is generated. The appropriate file extension (`md`
                or `html`) is appended if missing.

        Returns:
            The conversion result, which contains a `DoclingDocument` in the `document`
            attribute, and metadata about the conversion process.

        Raises:
            ValueError: If format is neither `InputFormat.MD` nor `InputFormat.HTML`.
            ConversionError: An error occurred during conversion.
        """
        # File extension expected for each supported string format.
        suffix_by_format = {InputFormat.MD: ".md", InputFormat.HTML: ".html"}
        suffix = suffix_by_format.get(format)
        if suffix is None:
            raise ValueError(f"format {format} is not supported in `convert_string`")

        name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        if not name.endswith(suffix):
            name += suffix

        buff = BytesIO(content.encode("utf-8"))
        doc_stream = DocumentStream(name=name, stream=buff)
        return self.convert(doc_stream)

    def _convert(
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
    ) -> Iterator[ConversionResult]:
        """Convert the input documents batch by batch, yielding each result.

        Batches of `settings.perf.doc_batch_size` documents are processed
        either through a thread pool (when both batch size and
        `settings.perf.doc_batch_concurrency` exceed 1) or sequentially.
        Per-document timing is logged only on the sequential path.
        """
        start_time = time.monotonic()
        # The worker callable does not depend on the batch, so build it once.
        process_func = partial(self._process_document, raises_on_error=raises_on_error)

        for input_batch in chunkify(
            conv_input.docs(self.format_to_options),  # pass format_options
            settings.perf.doc_batch_size,
        ):
            _log.info("Going to convert document batch...")

            if (
                settings.perf.doc_batch_concurrency > 1
                and settings.perf.doc_batch_size > 1
            ):
                with ThreadPoolExecutor(
                    max_workers=settings.perf.doc_batch_concurrency
                ) as pool:
                    yield from pool.map(process_func, input_batch)
            else:
                for item in map(process_func, input_batch):
                    now = time.monotonic()
                    elapsed = now - start_time
                    start_time = now
                    _log.info(
                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
                    )
                    yield item

    def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
        """Retrieve or initialize a pipeline, reusing instances based on class and options.

        Returns None when the format has no configured option or the option
        carries no pipeline options.
        """
        fopt = self.format_to_options.get(doc_format)

        if fopt is None or fopt.pipeline_options is None:
            return None

        pipeline_class = fopt.pipeline_cls
        pipeline_options = fopt.pipeline_options
        options_hash = self._get_pipeline_options_hash(pipeline_options)

        # Use a composite key to cache pipelines
        cache_key = (pipeline_class, options_hash)

        # The lock is held across construction so a pipeline for a given key
        # is only ever built once.
        with _PIPELINE_CACHE_LOCK:
            if cache_key not in self.initialized_pipelines:
                _log.info(
                    f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                )
                self.initialized_pipelines[cache_key] = pipeline_class(
                    pipeline_options=pipeline_options
                )
            else:
                _log.debug(
                    f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                )

            return self.initialized_pipelines[cache_key]

    @staticmethod
    def _make_error_result(
        in_doc: InputDocument, status: ConversionStatus, error_message: str
    ) -> ConversionResult:
        """Build a result without a converted document that carries one error item."""
        error_item = ErrorItem(
            component_type=DoclingComponentType.USER_INPUT,
            module_name="",
            error_message=error_message,
        )
        return ConversionResult(input=in_doc, status=status, errors=[error_item])

    def _process_document(
        self, in_doc: InputDocument, raises_on_error: bool
    ) -> ConversionResult:
        """Convert one input document, or reject it if its format is not allowed.

        Raises:
            ConversionError: If the format is not allowed and `raises_on_error`
                is True (otherwise a SKIPPED result is returned).
        """
        valid = (
            self.allowed_formats is not None and in_doc.format in self.allowed_formats
        )
        if valid:
            return self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

        error_message = f"File format not allowed: {in_doc.file}"
        if raises_on_error:
            raise ConversionError(error_message)
        return self._make_error_result(
            in_doc, ConversionStatus.SKIPPED, error_message
        )

    def _execute_pipeline(
        self, in_doc: InputDocument, raises_on_error: bool
    ) -> ConversionResult:
        """Run the pipeline configured for the document's format.

        Raises:
            ConversionError: If the document is invalid or no pipeline is
                available and `raises_on_error` is True. Otherwise a FAILURE
                result is returned whose `errors` list explains the reason.
        """
        if not in_doc.valid:
            # invalid doc or not of desired format
            error_message = f"Input document {in_doc.file} is not valid."
        elif (pipeline := self._get_pipeline(in_doc.format)) is None:
            error_message = f"No pipeline could be initialized for {in_doc.file}."
        else:
            return pipeline.execute(in_doc, raises_on_error=raises_on_error)

        if raises_on_error:
            raise ConversionError(error_message)
        # Record why the conversion failed so callers that opted out of
        # exceptions can still see the reason on the result.
        return self._make_error_result(
            in_doc, ConversionStatus.FAILURE, error_message
        )