docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic; see the registry's advisory page for details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,559 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
import warnings
|
|
7
|
+
from collections.abc import Iterable, Iterator
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from functools import partial
|
|
11
|
+
from io import BytesIO
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional, Type, Union
|
|
14
|
+
|
|
15
|
+
from pydantic import ConfigDict, model_validator, validate_call
|
|
16
|
+
from typing_extensions import Self
|
|
17
|
+
|
|
18
|
+
from docling.backend.abstract_backend import (
|
|
19
|
+
AbstractDocumentBackend,
|
|
20
|
+
)
|
|
21
|
+
from docling.backend.asciidoc_backend import AsciiDocBackend
|
|
22
|
+
from docling.backend.csv_backend import CsvDocumentBackend
|
|
23
|
+
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
|
24
|
+
from docling.backend.html_backend import HTMLDocumentBackend
|
|
25
|
+
from docling.backend.image_backend import ImageDocumentBackend
|
|
26
|
+
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
|
27
|
+
from docling.backend.md_backend import MarkdownDocumentBackend
|
|
28
|
+
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
|
29
|
+
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
|
30
|
+
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
|
31
|
+
from docling.backend.msword_backend import MsWordDocumentBackend
|
|
32
|
+
from docling.backend.noop_backend import NoOpBackend
|
|
33
|
+
from docling.backend.webvtt_backend import WebVTTDocumentBackend
|
|
34
|
+
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
|
35
|
+
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
|
36
|
+
from docling.datamodel.backend_options import (
|
|
37
|
+
BackendOptions,
|
|
38
|
+
HTMLBackendOptions,
|
|
39
|
+
MarkdownBackendOptions,
|
|
40
|
+
PdfBackendOptions,
|
|
41
|
+
)
|
|
42
|
+
from docling.datamodel.base_models import (
|
|
43
|
+
BaseFormatOption,
|
|
44
|
+
ConversionStatus,
|
|
45
|
+
DoclingComponentType,
|
|
46
|
+
DocumentStream,
|
|
47
|
+
ErrorItem,
|
|
48
|
+
InputFormat,
|
|
49
|
+
)
|
|
50
|
+
from docling.datamodel.document import (
|
|
51
|
+
ConversionResult,
|
|
52
|
+
InputDocument,
|
|
53
|
+
_DocumentConversionInput,
|
|
54
|
+
)
|
|
55
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
|
56
|
+
from docling.datamodel.settings import (
|
|
57
|
+
DEFAULT_PAGE_RANGE,
|
|
58
|
+
DocumentLimits,
|
|
59
|
+
PageRange,
|
|
60
|
+
settings,
|
|
61
|
+
)
|
|
62
|
+
from docling.exceptions import ConversionError
|
|
63
|
+
from docling.pipeline.asr_pipeline import AsrPipeline
|
|
64
|
+
from docling.pipeline.base_pipeline import BasePipeline
|
|
65
|
+
from docling.pipeline.simple_pipeline import SimplePipeline
|
|
66
|
+
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|
67
|
+
from docling.utils.utils import chunkify
|
|
68
|
+
|
|
69
|
+
# Module-level logger for the converter.
_log = logging.getLogger(__name__)

# Guards concurrent reads/writes of DocumentConverter.initialized_pipelines.
_PIPELINE_CACHE_LOCK = threading.Lock()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class FormatOption(BaseFormatOption):
    """Pairing of a pipeline class and backend options for one input format."""

    # Pipeline class used to process documents of this format.
    pipeline_cls: Type[BasePipeline]
    # Backend-specific options; None means the backend's own defaults.
    backend_options: Optional[BackendOptions] = None

    @model_validator(mode="after")
    def set_optional_field_default(self) -> Self:
        # Fill in pipeline_options with the pipeline's defaults when the
        # caller did not provide any.
        if self.pipeline_options is None:
            self.pipeline_options = self.pipeline_cls.get_default_options()

        return self
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class CsvFormatOption(FormatOption):
    # Defaults for CSV input: declarative backend run through SimplePipeline.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class ExcelFormatOption(FormatOption):
    # Defaults for XLSX input.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class WordFormatOption(FormatOption):
    # Defaults for DOCX input.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class PowerpointFormatOption(FormatOption):
    # Defaults for PPTX input.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class MarkdownFormatOption(FormatOption):
    # Defaults for Markdown input; narrows backend_options to the
    # Markdown-specific options model.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
    backend_options: Optional[MarkdownBackendOptions] = None
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class AsciiDocFormatOption(FormatOption):
    # Defaults for AsciiDoc input.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = AsciiDocBackend
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class HTMLFormatOption(FormatOption):
    # Defaults for HTML input; narrows backend_options to the
    # HTML-specific options model.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
    backend_options: Optional[HTMLBackendOptions] = None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class PatentUsptoFormatOption(FormatOption):
    # Defaults for USPTO patent XML input.
    pipeline_cls: Type = SimplePipeline
    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class XMLJatsFormatOption(FormatOption):
    # Defaults for JATS XML input.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class ImageFormatOption(FormatOption):
    # Defaults for image input: images go through the full PDF pipeline
    # (layout, OCR, etc.) via the image backend.
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = ImageDocumentBackend
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class PdfFormatOption(FormatOption):
    # Defaults for PDF input; narrows backend_options to the
    # PDF-specific options model.
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
    backend_options: Optional[PdfBackendOptions] = None
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class AudioFormatOption(FormatOption):
    # Defaults for audio input: the ASR pipeline does the work, so the
    # backend is a no-op.
    pipeline_cls: Type = AsrPipeline
    backend: Type[AbstractDocumentBackend] = NoOpBackend
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _get_default_option(format: InputFormat) -> FormatOption:
    """Build the default FormatOption for the given input format.

    A fresh option instance is created on each call so callers never share
    mutable option state.

    Args:
        format: The input format to look up.

    Returns:
        A newly constructed FormatOption with the format's default pipeline
        and backend.

    Raises:
        RuntimeError: If no default configuration exists for the format.
    """
    format_to_default_options = {
        InputFormat.CSV: CsvFormatOption(),
        InputFormat.XLSX: ExcelFormatOption(),
        InputFormat.DOCX: WordFormatOption(),
        InputFormat.PPTX: PowerpointFormatOption(),
        InputFormat.MD: MarkdownFormatOption(),
        InputFormat.ASCIIDOC: AsciiDocFormatOption(),
        InputFormat.HTML: HTMLFormatOption(),
        InputFormat.XML_USPTO: PatentUsptoFormatOption(),
        InputFormat.XML_JATS: XMLJatsFormatOption(),
        InputFormat.METS_GBS: FormatOption(
            pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
        ),
        InputFormat.IMAGE: ImageFormatOption(),
        InputFormat.PDF: PdfFormatOption(),
        InputFormat.JSON_DOCLING: FormatOption(
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
        ),
        InputFormat.AUDIO: AudioFormatOption(),
        InputFormat.VTT: FormatOption(
            pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
        ),
    }
    default = format_to_default_options.get(format)
    if default is None:
        raise RuntimeError(f"No default options configured for {format}")
    return default
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class DocumentConverter:
    """Convert documents of various input formats to Docling documents.

    `DocumentConverter` is the main entry point for converting documents in Docling.
    It handles various input formats (PDF, DOCX, PPTX, images, HTML, Markdown, etc.)
    and provides both single-document and batch conversion capabilities.

    The conversion methods return a `ConversionResult` instance for each document,
    which wraps a `DoclingDocument` object if the conversion was successful, along
    with metadata about the conversion process.

    Attributes:
        allowed_formats: Allowed input formats.
        format_to_options: Mapping of formats to their options.
        initialized_pipelines: Cache of initialized pipelines keyed by
            (pipeline class, options hash).
    """

    # Fallback base filename; presumably used when fetching a URL source
    # that carries no filename — TODO confirm against _DocumentConversionInput.
    _default_download_filename = "file"
|
|
197
|
+
|
|
198
|
+
def __init__(
|
|
199
|
+
self,
|
|
200
|
+
allowed_formats: Optional[list[InputFormat]] = None,
|
|
201
|
+
format_options: Optional[dict[InputFormat, FormatOption]] = None,
|
|
202
|
+
) -> None:
|
|
203
|
+
"""Initialize the converter based on format preferences.
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
allowed_formats: List of allowed input formats. By default, any
|
|
207
|
+
format supported by Docling is allowed.
|
|
208
|
+
format_options: Dictionary of format-specific options.
|
|
209
|
+
"""
|
|
210
|
+
self.allowed_formats: list[InputFormat] = (
|
|
211
|
+
allowed_formats if allowed_formats is not None else list(InputFormat)
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
# Normalize format options: ensure IMAGE format uses ImageDocumentBackend
|
|
215
|
+
# for backward compatibility (old code might use PdfFormatOption or other backends for images)
|
|
216
|
+
normalized_format_options: dict[InputFormat, FormatOption] = {}
|
|
217
|
+
if format_options:
|
|
218
|
+
for format, option in format_options.items():
|
|
219
|
+
if (
|
|
220
|
+
format == InputFormat.IMAGE
|
|
221
|
+
and option.backend is not ImageDocumentBackend
|
|
222
|
+
):
|
|
223
|
+
warnings.warn(
|
|
224
|
+
f"Using {option.backend.__name__} for InputFormat.IMAGE is deprecated. "
|
|
225
|
+
"Images should use ImageDocumentBackend via ImageFormatOption. "
|
|
226
|
+
"Automatically correcting the backend, please update your code to avoid this warning.",
|
|
227
|
+
DeprecationWarning,
|
|
228
|
+
stacklevel=2,
|
|
229
|
+
)
|
|
230
|
+
# Convert to ImageFormatOption while preserving pipeline and backend options
|
|
231
|
+
normalized_format_options[format] = ImageFormatOption(
|
|
232
|
+
pipeline_cls=option.pipeline_cls,
|
|
233
|
+
pipeline_options=option.pipeline_options,
|
|
234
|
+
backend_options=option.backend_options,
|
|
235
|
+
)
|
|
236
|
+
else:
|
|
237
|
+
normalized_format_options[format] = option
|
|
238
|
+
|
|
239
|
+
self.format_to_options: dict[InputFormat, FormatOption] = {
|
|
240
|
+
format: (
|
|
241
|
+
_get_default_option(format=format)
|
|
242
|
+
if (custom_option := normalized_format_options.get(format)) is None
|
|
243
|
+
else custom_option
|
|
244
|
+
)
|
|
245
|
+
for format in self.allowed_formats
|
|
246
|
+
}
|
|
247
|
+
self.initialized_pipelines: dict[
|
|
248
|
+
tuple[Type[BasePipeline], str], BasePipeline
|
|
249
|
+
] = {}
|
|
250
|
+
|
|
251
|
+
    def _get_initialized_pipelines(
        self,
    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
        """Return the cache of initialized pipelines keyed by (pipeline class, options hash)."""
        return self.initialized_pipelines
|
|
255
|
+
|
|
256
|
+
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
|
257
|
+
"""Generate a hash of pipeline options to use as part of the cache key."""
|
|
258
|
+
options_str = str(pipeline_options.model_dump())
|
|
259
|
+
return hashlib.md5(
|
|
260
|
+
options_str.encode("utf-8"), usedforsecurity=False
|
|
261
|
+
).hexdigest()
|
|
262
|
+
|
|
263
|
+
def initialize_pipeline(self, format: InputFormat):
|
|
264
|
+
"""Initialize the conversion pipeline for the selected format.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
format: The input format for which to initialize the pipeline.
|
|
268
|
+
|
|
269
|
+
Raises:
|
|
270
|
+
ConversionError: If no pipeline could be initialized for the
|
|
271
|
+
given format.
|
|
272
|
+
RuntimeError: If `artifacts_path` is set in
|
|
273
|
+
`docling.datamodel.settings.settings` when required by
|
|
274
|
+
the pipeline, but points to a non-directory file.
|
|
275
|
+
FileNotFoundError: If local model files are not found.
|
|
276
|
+
"""
|
|
277
|
+
pipeline = self._get_pipeline(doc_format=format)
|
|
278
|
+
if pipeline is None:
|
|
279
|
+
raise ConversionError(
|
|
280
|
+
f"No pipeline could be initialized for format {format}"
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
@validate_call(config=ConfigDict(strict=True))
|
|
284
|
+
def convert(
|
|
285
|
+
self,
|
|
286
|
+
source: Union[Path, str, DocumentStream], # TODO review naming
|
|
287
|
+
headers: Optional[dict[str, str]] = None,
|
|
288
|
+
raises_on_error: bool = True,
|
|
289
|
+
max_num_pages: int = sys.maxsize,
|
|
290
|
+
max_file_size: int = sys.maxsize,
|
|
291
|
+
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
|
292
|
+
) -> ConversionResult:
|
|
293
|
+
"""Convert one document fetched from a file path, URL, or DocumentStream.
|
|
294
|
+
|
|
295
|
+
Note: If the document content is given as a string (Markdown or HTML
|
|
296
|
+
content), use the `convert_string` method.
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
source: Source of input document given as file path, URL, or
|
|
300
|
+
DocumentStream.
|
|
301
|
+
headers: Optional headers given as a dictionary of string key-value pairs,
|
|
302
|
+
in case of URL input source.
|
|
303
|
+
raises_on_error: Whether to raise an error on the first conversion failure.
|
|
304
|
+
If False, errors are captured in the ConversionResult objects.
|
|
305
|
+
max_num_pages: Maximum number of pages accepted per document.
|
|
306
|
+
Documents exceeding this number will not be converted.
|
|
307
|
+
max_file_size: Maximum file size to convert.
|
|
308
|
+
page_range: Range of pages to convert.
|
|
309
|
+
|
|
310
|
+
Returns:
|
|
311
|
+
The conversion result, which contains a `DoclingDocument` in the `document`
|
|
312
|
+
attribute, and metadata about the conversion process.
|
|
313
|
+
|
|
314
|
+
Raises:
|
|
315
|
+
ConversionError: An error occurred during conversion.
|
|
316
|
+
"""
|
|
317
|
+
all_res = self.convert_all(
|
|
318
|
+
source=[source],
|
|
319
|
+
raises_on_error=raises_on_error,
|
|
320
|
+
max_num_pages=max_num_pages,
|
|
321
|
+
max_file_size=max_file_size,
|
|
322
|
+
headers=headers,
|
|
323
|
+
page_range=page_range,
|
|
324
|
+
)
|
|
325
|
+
return next(all_res)
|
|
326
|
+
|
|
327
|
+
@validate_call(config=ConfigDict(strict=True))
|
|
328
|
+
def convert_all(
|
|
329
|
+
self,
|
|
330
|
+
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
|
331
|
+
headers: Optional[dict[str, str]] = None,
|
|
332
|
+
raises_on_error: bool = True,
|
|
333
|
+
max_num_pages: int = sys.maxsize,
|
|
334
|
+
max_file_size: int = sys.maxsize,
|
|
335
|
+
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
|
336
|
+
) -> Iterator[ConversionResult]:
|
|
337
|
+
"""Convert multiple documents from file paths, URLs, or DocumentStreams.
|
|
338
|
+
|
|
339
|
+
Args:
|
|
340
|
+
source: Source of input documents given as an iterable of file paths, URLs,
|
|
341
|
+
or DocumentStreams.
|
|
342
|
+
headers: Optional headers given as a (single) dictionary of string
|
|
343
|
+
key-value pairs, in case of URL input source.
|
|
344
|
+
raises_on_error: Whether to raise an error on the first conversion failure.
|
|
345
|
+
max_num_pages: Maximum number of pages to convert.
|
|
346
|
+
max_file_size: Maximum number of pages accepted per document. Documents
|
|
347
|
+
exceeding this number will be skipped.
|
|
348
|
+
page_range: Range of pages to convert in each document.
|
|
349
|
+
|
|
350
|
+
Yields:
|
|
351
|
+
The conversion results, each containing a `DoclingDocument` in the
|
|
352
|
+
`document` attribute and metadata about the conversion process.
|
|
353
|
+
|
|
354
|
+
Raises:
|
|
355
|
+
ConversionError: An error occurred during conversion.
|
|
356
|
+
"""
|
|
357
|
+
limits = DocumentLimits(
|
|
358
|
+
max_num_pages=max_num_pages,
|
|
359
|
+
max_file_size=max_file_size,
|
|
360
|
+
page_range=page_range,
|
|
361
|
+
)
|
|
362
|
+
conv_input = _DocumentConversionInput(
|
|
363
|
+
path_or_stream_iterator=source, limits=limits, headers=headers
|
|
364
|
+
)
|
|
365
|
+
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
|
366
|
+
|
|
367
|
+
had_result = False
|
|
368
|
+
for conv_res in conv_res_iter:
|
|
369
|
+
had_result = True
|
|
370
|
+
if raises_on_error and conv_res.status not in {
|
|
371
|
+
ConversionStatus.SUCCESS,
|
|
372
|
+
ConversionStatus.PARTIAL_SUCCESS,
|
|
373
|
+
}:
|
|
374
|
+
error_details = ""
|
|
375
|
+
if conv_res.errors:
|
|
376
|
+
error_messages = [err.error_message for err in conv_res.errors]
|
|
377
|
+
error_details = f" Errors: {'; '.join(error_messages)}"
|
|
378
|
+
raise ConversionError(
|
|
379
|
+
f"Conversion failed for: {conv_res.input.file} with status: "
|
|
380
|
+
f"{conv_res.status}.{error_details}"
|
|
381
|
+
)
|
|
382
|
+
else:
|
|
383
|
+
yield conv_res
|
|
384
|
+
|
|
385
|
+
if not had_result and raises_on_error:
|
|
386
|
+
raise ConversionError(
|
|
387
|
+
"Conversion failed because the provided file has no recognizable "
|
|
388
|
+
"format or it wasn't in the list of allowed formats."
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
@validate_call(config=ConfigDict(strict=True))
|
|
392
|
+
def convert_string(
|
|
393
|
+
self,
|
|
394
|
+
content: str,
|
|
395
|
+
format: InputFormat,
|
|
396
|
+
name: Optional[str] = None,
|
|
397
|
+
) -> ConversionResult:
|
|
398
|
+
"""Convert a document given as a string using the specified format.
|
|
399
|
+
|
|
400
|
+
Only Markdown (`InputFormat.MD`) and HTML (`InputFormat.HTML`) formats
|
|
401
|
+
are supported. The content is wrapped in a `DocumentStream` and passed
|
|
402
|
+
to the main conversion pipeline.
|
|
403
|
+
|
|
404
|
+
Args:
|
|
405
|
+
content: The document content as a string.
|
|
406
|
+
format: The format of the input content.
|
|
407
|
+
name: The filename to associate with the document. If not provided, a
|
|
408
|
+
timestamp-based name is generated. The appropriate file extension (`md`
|
|
409
|
+
or `html`) is appended if missing.
|
|
410
|
+
|
|
411
|
+
Returns:
|
|
412
|
+
The conversion result, which contains a `DoclingDocument` in the `document`
|
|
413
|
+
attribute, and metadata about the conversion process.
|
|
414
|
+
|
|
415
|
+
Raises:
|
|
416
|
+
ValueError: If format is neither `InputFormat.MD` nor `InputFormat.HTML`.
|
|
417
|
+
ConversionError: An error occurred during conversion.
|
|
418
|
+
"""
|
|
419
|
+
name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
|
420
|
+
|
|
421
|
+
if format == InputFormat.MD:
|
|
422
|
+
if not name.endswith(".md"):
|
|
423
|
+
name += ".md"
|
|
424
|
+
|
|
425
|
+
buff = BytesIO(content.encode("utf-8"))
|
|
426
|
+
doc_stream = DocumentStream(name=name, stream=buff)
|
|
427
|
+
|
|
428
|
+
return self.convert(doc_stream)
|
|
429
|
+
elif format == InputFormat.HTML:
|
|
430
|
+
if not name.endswith(".html"):
|
|
431
|
+
name += ".html"
|
|
432
|
+
|
|
433
|
+
buff = BytesIO(content.encode("utf-8"))
|
|
434
|
+
doc_stream = DocumentStream(name=name, stream=buff)
|
|
435
|
+
|
|
436
|
+
return self.convert(doc_stream)
|
|
437
|
+
else:
|
|
438
|
+
raise ValueError(f"format {format} is not supported in `convert_string`")
|
|
439
|
+
|
|
440
|
+
    def _convert(
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
    ) -> Iterator[ConversionResult]:
        """Yield conversion results for all documents in `conv_input`.

        Documents are processed in batches of `settings.perf.doc_batch_size`;
        when `settings.perf.doc_batch_concurrency > 1` each batch is fanned
        out over a thread pool, otherwise processed sequentially.
        """
        start_time = time.monotonic()

        for input_batch in chunkify(
            conv_input.docs(self.format_to_options),
            settings.perf.doc_batch_size,  # pass format_options
        ):
            _log.info("Going to convert document batch...")
            # Bind raises_on_error so the per-document callable takes one arg.
            process_func = partial(
                self._process_document, raises_on_error=raises_on_error
            )

            if (
                settings.perf.doc_batch_concurrency > 1
                and settings.perf.doc_batch_size > 1
            ):
                # Concurrent path: pool.map preserves input order.
                # NOTE(review): per-document timing is only logged on the
                # sequential path below.
                with ThreadPoolExecutor(
                    max_workers=settings.perf.doc_batch_concurrency
                ) as pool:
                    for item in pool.map(
                        process_func,
                        input_batch,
                    ):
                        yield item
            else:
                # Sequential path: time each document. Because this is a
                # generator, the interval also includes any time the consumer
                # spent between pulls; the clock is reset after each item.
                for item in map(
                    process_func,
                    input_batch,
                ):
                    elapsed = time.monotonic() - start_time
                    start_time = time.monotonic()
                    _log.info(
                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
                    )
                    yield item
|
|
477
|
+
|
|
478
|
+
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
|
479
|
+
"""Retrieve or initialize a pipeline, reusing instances based on class and options."""
|
|
480
|
+
fopt = self.format_to_options.get(doc_format)
|
|
481
|
+
|
|
482
|
+
if fopt is None or fopt.pipeline_options is None:
|
|
483
|
+
return None
|
|
484
|
+
|
|
485
|
+
pipeline_class = fopt.pipeline_cls
|
|
486
|
+
pipeline_options = fopt.pipeline_options
|
|
487
|
+
options_hash = self._get_pipeline_options_hash(pipeline_options)
|
|
488
|
+
|
|
489
|
+
# Use a composite key to cache pipelines
|
|
490
|
+
cache_key = (pipeline_class, options_hash)
|
|
491
|
+
|
|
492
|
+
with _PIPELINE_CACHE_LOCK:
|
|
493
|
+
if cache_key not in self.initialized_pipelines:
|
|
494
|
+
_log.info(
|
|
495
|
+
f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
|
496
|
+
)
|
|
497
|
+
self.initialized_pipelines[cache_key] = pipeline_class(
|
|
498
|
+
pipeline_options=pipeline_options
|
|
499
|
+
)
|
|
500
|
+
else:
|
|
501
|
+
_log.debug(
|
|
502
|
+
f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
|
503
|
+
)
|
|
504
|
+
|
|
505
|
+
return self.initialized_pipelines[cache_key]
|
|
506
|
+
|
|
507
|
+
def _process_document(
|
|
508
|
+
self, in_doc: InputDocument, raises_on_error: bool
|
|
509
|
+
) -> ConversionResult:
|
|
510
|
+
valid = (
|
|
511
|
+
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
|
512
|
+
)
|
|
513
|
+
if valid:
|
|
514
|
+
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
|
515
|
+
else:
|
|
516
|
+
error_message = f"File format not allowed: {in_doc.file}"
|
|
517
|
+
if raises_on_error:
|
|
518
|
+
raise ConversionError(error_message)
|
|
519
|
+
else:
|
|
520
|
+
error_item = ErrorItem(
|
|
521
|
+
component_type=DoclingComponentType.USER_INPUT,
|
|
522
|
+
module_name="",
|
|
523
|
+
error_message=error_message,
|
|
524
|
+
)
|
|
525
|
+
conv_res = ConversionResult(
|
|
526
|
+
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
|
527
|
+
)
|
|
528
|
+
|
|
529
|
+
return conv_res
|
|
530
|
+
|
|
531
|
+
def _execute_pipeline(
|
|
532
|
+
self, in_doc: InputDocument, raises_on_error: bool
|
|
533
|
+
) -> ConversionResult:
|
|
534
|
+
if in_doc.valid:
|
|
535
|
+
pipeline = self._get_pipeline(in_doc.format)
|
|
536
|
+
if pipeline is not None:
|
|
537
|
+
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
|
538
|
+
else:
|
|
539
|
+
if raises_on_error:
|
|
540
|
+
raise ConversionError(
|
|
541
|
+
f"No pipeline could be initialized for {in_doc.file}."
|
|
542
|
+
)
|
|
543
|
+
else:
|
|
544
|
+
conv_res = ConversionResult(
|
|
545
|
+
input=in_doc,
|
|
546
|
+
status=ConversionStatus.FAILURE,
|
|
547
|
+
)
|
|
548
|
+
else:
|
|
549
|
+
if raises_on_error:
|
|
550
|
+
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
|
551
|
+
else:
|
|
552
|
+
# invalid doc or not of desired format
|
|
553
|
+
conv_res = ConversionResult(
|
|
554
|
+
input=in_doc,
|
|
555
|
+
status=ConversionStatus.FAILURE,
|
|
556
|
+
)
|
|
557
|
+
# TODO add error log why it failed.
|
|
558
|
+
|
|
559
|
+
return conv_res
|