docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of docling has been flagged as potentially problematic.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
docling/datamodel/document.py
@@ -0,0 +1,699 @@
import csv
import importlib
import json
import logging
import platform
import re
import sys
import tarfile
import zipfile
from collections.abc import Iterable, Mapping
from datetime import datetime
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import (
    TYPE_CHECKING,
    Annotated,
    Literal,
    Optional,
    Type,
    Union,
    cast,
)

import filetype

# DO NOT REMOVE; explicitly exposed from this location
from docling_core.types.doc import (
    DocItem,
    DocItemLabel,
    DoclingDocument,
    PictureItem,
    SectionHeaderItem,
    TableItem,
    TextItem,
)
from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import (
    BaseText,
    Figure,
    GlmTableCell,
    PageDimensions,
    PageReference,
    Prov,
    Ref,
    Table as DsSchemaTable,
    TableCell,
)
from docling_core.types.legacy_doc.document import (
    CCSDocumentDescription as DsDocumentDescription,
    CCSFileInfoObject as DsFileInfoObject,
    ExportedCCSDocument as DsDocument,
)
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel, Field
from typing_extensions import deprecated

from docling.backend.abstract_backend import (
    AbstractDocumentBackend,
    DeclarativeDocumentBackend,
    PaginatedDocumentBackend,
)
from docling.datamodel.backend_options import BackendOptions
from docling.datamodel.base_models import (
    AssembledUnit,
    ConfidenceReport,
    ConversionStatus,
    DocumentStream,
    ErrorItem,
    FormatToExtensions,
    FormatToMimeType,
    InputFormat,
    MimeTypeToFormat,
    Page,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash

if TYPE_CHECKING:
    from docling.datamodel.base_models import BaseFormatOption
    from docling.document_converter import FormatOption

_log = logging.getLogger(__name__)

layout_label_to_ds_type = {
    DocItemLabel.TITLE: "title",
    DocItemLabel.DOCUMENT_INDEX: "table",
    DocItemLabel.SECTION_HEADER: "subtitle-level-1",
    DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
    DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
    DocItemLabel.CAPTION: "caption",
    DocItemLabel.PAGE_HEADER: "page-header",
    DocItemLabel.PAGE_FOOTER: "page-footer",
    DocItemLabel.FOOTNOTE: "footnote",
    DocItemLabel.TABLE: "table",
    DocItemLabel.FORMULA: "equation",
    DocItemLabel.LIST_ITEM: "paragraph",
    DocItemLabel.CODE: "paragraph",
    DocItemLabel.PICTURE: "figure",
    DocItemLabel.TEXT: "paragraph",
    DocItemLabel.PARAGRAPH: "paragraph",
    DocItemLabel.FORM: DocItemLabel.FORM.value,
    DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
}

_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")


class InputDocument(BaseModel):
    """A document as an input of a Docling conversion."""

    file: Annotated[
        PurePath, Field(description="A path representation of the input document.")
    ]
    document_hash: Annotated[
        str,
        Field(description="A stable hash of the path or stream of the input document."),
    ]
    valid: bool = Field(True, description="Whether this is a valid input document.")
    backend_options: Optional[BackendOptions] = Field(
        None, description="Custom options for backends."
    )
    limits: DocumentLimits = Field(
        DocumentLimits(), description="Limits in the input document for the conversion."
    )
    format: Annotated[InputFormat, Field(description="The document format.")]

    filesize: Optional[int] = Field(
        None, description="Size of the input file, in bytes."
    )
    page_count: int = Field(0, description="Number of pages in the input document.")

    _backend: AbstractDocumentBackend

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
        format: InputFormat,
        backend: Type[AbstractDocumentBackend],
        backend_options: Optional[BackendOptions] = None,
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
    ) -> None:
        super().__init__(
            file="",
            document_hash="",
            format=InputFormat.PDF,
            backend_options=backend_options,
        )  # initialize with dummy values
        self.limits = limits or DocumentLimits()
        self.format = format

        try:
            if isinstance(path_or_stream, Path):
                self.file = path_or_stream
                self.filesize = path_or_stream.stat().st_size
                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._init_doc(backend, path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
                assert filename is not None, (
                    "Can't construct InputDocument from stream without providing "
                    "filename arg."
                )
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._init_doc(backend, path_or_stream)
            else:
                raise RuntimeError(
                    f"Unexpected type path_or_stream: {type(path_or_stream)}"
                )

            # For paginated backends, check if the maximum page count is exceeded.
            if self.valid and self._backend.is_valid():
                if self._backend.supports_pagination() and isinstance(
                    self._backend, PaginatedDocumentBackend
                ):
                    self.page_count = self._backend.page_count()
                    if not self.page_count <= self.limits.max_num_pages:
                        self.valid = False
                    elif self.page_count < self.limits.page_range[0]:
                        self.valid = False

        except (FileNotFoundError, OSError) as e:
            self.valid = False
            _log.exception(
                f"File {self.file.name} not found or cannot be opened.", exc_info=e
            )
            # raise
        except RuntimeError as e:
            self.valid = False
            _log.exception(
                "An unexpected error occurred while opening the document "
                f"{self.file.name}",
                exc_info=e,
            )
            # raise

    def _init_doc(
        self,
        backend: Type[AbstractDocumentBackend],
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
        if self.backend_options:
            self._backend = backend(
                self,
                path_or_stream=path_or_stream,
                options=self.backend_options,
            )
        else:
            self._backend = backend(self, path_or_stream=path_or_stream)

        if not self._backend.is_valid():
            self.valid = False

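For illustration only (this sketch is not part of document.py): constructing an InputDocument from an in-memory stream via the constructor above. It assumes a hypothetical local sample.pdf and that PyPdfiumDocumentBackend is the pypdfium2 backend shipped in docling.backend.pypdfium2_backend; when a BytesIO is passed, the filename argument is mandatory.

    from io import BytesIO
    from pathlib import Path

    from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.document import InputDocument

    in_doc = InputDocument(
        path_or_stream=BytesIO(Path("sample.pdf").read_bytes()),  # hypothetical file
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
        filename="sample.pdf",  # required for streams, used as the document name
    )
    print(in_doc.valid, in_doc.page_count)
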
class DocumentFormat(str, Enum):
    V2 = "v2"
    V1 = "v1"


class DoclingVersion(BaseModel):
    docling_version: str = importlib.metadata.version("docling")
    docling_core_version: str = importlib.metadata.version("docling-core")
    docling_ibm_models_version: str = importlib.metadata.version("docling-ibm-models")
    docling_parse_version: str = importlib.metadata.version("docling-parse")
    platform_str: str = platform.platform()
    py_impl_version: str = sys.implementation.cache_tag
    py_lang_version: str = platform.python_version()


class ConversionAssets(BaseModel):
    version: DoclingVersion = DoclingVersion()
    # When the assets were saved (ISO string from datetime.now())
    timestamp: Optional[str] = None

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
    errors: list[ErrorItem] = []  # structure to keep errors

    pages: list[Page] = []
    timings: dict[str, ProfilingItem] = {}
    confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)

    document: DoclingDocument = _EMPTY_DOCLING_DOC

    @property
    @deprecated("Use document instead.")
    def legacy_document(self):
        return docling_document_to_legacy(self.document)

    def save(
        self,
        *,
        filename: Union[str, Path],
        indent: Optional[int] = 2,
    ):
        """Serialize the full ConversionAssets to a ZIP archive of JSON files."""
        if isinstance(filename, str):
            filename = Path(filename)
        # Build an in-memory ZIP archive containing JSON for each asset
        buf = BytesIO()

        def to_jsonable(obj):
            try:
                # pydantic v2 models
                if hasattr(obj, "model_dump"):
                    return obj.model_dump(mode="json")  # type: ignore[attr-defined]
            except TypeError:
                # some models may not accept mode argument
                return obj.model_dump()  # type: ignore[attr-defined]

            # enums
            try:
                from enum import Enum

                if isinstance(obj, Enum):
                    return obj.value
            except Exception:
                pass

            # containers
            if isinstance(obj, list):
                return [to_jsonable(x) for x in obj]
            if isinstance(obj, dict):
                return {k: to_jsonable(v) for k, v in obj.items()}

            # passthrough primitives
            return obj

        with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:

            def write_json(name: str, payload) -> None:
                data = json.dumps(
                    to_jsonable(payload), ensure_ascii=False, indent=indent
                )
                zf.writestr(name, data.encode("utf-8"))

            # Update and persist a save timestamp
            self.timestamp = datetime.now().isoformat()
            write_json("timestamp.json", self.timestamp)

            # Store each component in its own JSON file
            write_json("version.json", self.version)
            write_json("status.json", self.status)
            write_json("errors.json", self.errors)
            write_json("pages.json", self.pages)
            write_json("timings.json", self.timings)
            write_json("confidence.json", self.confidence)
            # For the document, ensure stable schema via export_to_dict
            doc_dict = self.document.export_to_dict()
            zf.writestr(
                "document.json",
                json.dumps(doc_dict, ensure_ascii=False, indent=indent).encode("utf-8"),
            )

        # Persist the ZIP to disk
        buf.seek(0)
        if filename.parent and not filename.parent.exists():
            filename.parent.mkdir(parents=True, exist_ok=True)
        with filename.open("wb") as f:
            f.write(buf.getvalue())

    @classmethod
    def load(cls, filename: Union[str, Path]) -> "ConversionAssets":
        """Load a ConversionAssets instance from a ZIP archive."""
        if isinstance(filename, str):
            filename = Path(filename)

        # Read the ZIP and deserialize all items
        version_info: DoclingVersion = DoclingVersion()
        timestamp: Optional[str] = None
        status = ConversionStatus.PENDING
        errors: list[ErrorItem] = []
        pages: list[Page] = []
        timings: dict[str, ProfilingItem] = {}
        confidence = ConfidenceReport()
        document: DoclingDocument = _EMPTY_DOCLING_DOC

        with zipfile.ZipFile(filename, mode="r") as zf:

            def read_json(name: str):
                try:
                    with zf.open(name, "r") as fp:
                        return json.loads(fp.read().decode("utf-8"))
                except KeyError:
                    return None

            # version
            if (data := read_json("version.json")) is not None:
                try:
                    version_info = DoclingVersion.model_validate(data)
                except Exception as exc:
                    _log.error(f"Could not read version: {exc}")

            # timestamp
            if (data := read_json("timestamp.json")) is not None:
                if isinstance(data, str):
                    timestamp = data

            # status
            if (data := read_json("status.json")) is not None:
                try:
                    status = ConversionStatus(data)
                except Exception:
                    status = ConversionStatus.PENDING

            # errors
            if (data := read_json("errors.json")) is not None and isinstance(
                data, list
            ):
                errors = [ErrorItem.model_validate(item) for item in data]

            # pages
            if (data := read_json("pages.json")) is not None and isinstance(data, list):
                pages = [Page.model_validate(item) for item in data]

            # timings
            if (data := read_json("timings.json")) is not None and isinstance(
                data, dict
            ):
                timings = {k: ProfilingItem.model_validate(v) for k, v in data.items()}

            # confidence
            if (data := read_json("confidence.json")) is not None and isinstance(
                data, dict
            ):
                confidence = ConfidenceReport.model_validate(data)

            # document
            if (data := read_json("document.json")) is not None and isinstance(
                data, dict
            ):
                document = DoclingDocument.model_validate(data)

        return cls(
            version=version_info,
            timestamp=timestamp,
            status=status,
            errors=errors,
            pages=pages,
            timings=timings,
            confidence=confidence,
            document=document,
        )

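A minimal round-trip sketch for the save()/load() pair above (illustrative, not part of the package): save() writes a ZIP whose entries are version.json, timestamp.json, status.json, errors.json, pages.json, timings.json, confidence.json, and document.json, and load() tolerates missing entries by keeping the defaults.

    from docling.datamodel.document import ConversionAssets

    assets = ConversionAssets()                    # defaults: PENDING status, empty document
    assets.save(filename="conversion_assets.zip")  # hypothetical output path
    restored = ConversionAssets.load("conversion_assets.zip")
    print(restored.status, restored.timestamp)
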
class ConversionResult(ConversionAssets):
    input: InputDocument
    assembled: AssembledUnit = AssembledUnit()


class _DummyBackend(AbstractDocumentBackend):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def is_valid(self) -> bool:
        return False

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        return set()

    @classmethod
    def supports_pagination(cls) -> bool:
        return False

    def unload(self):
        return super().unload()

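To make the flow of the class below concrete, a hedged sketch of driving _DocumentConversionInput.docs() directly (it is private API; DocumentConverter normally builds the format_options mapping). PdfFormatOption is docling's public per-format option class from docling.document_converter, and report.pdf is a hypothetical local file.

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.document import _DocumentConversionInput
    from docling.document_converter import PdfFormatOption

    conv_input = _DocumentConversionInput(path_or_stream_iterator=["report.pdf"])
    for in_doc in conv_input.docs({InputFormat.PDF: PdfFormatOption()}):
        print(in_doc.file, in_doc.format, in_doc.valid)
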
class _DocumentConversionInput(BaseModel):
    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
    headers: Optional[dict[str, str]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

    def docs(
        self,
        format_options: Mapping[InputFormat, "BaseFormatOption"],
    ) -> Iterable[InputDocument]:
        for item in self.path_or_stream_iterator:
            obj = (
                resolve_source_to_stream(item, self.headers)
                if isinstance(item, str)
                else item
            )
            format = self._guess_format(obj)
            backend: Type[AbstractDocumentBackend]
            backend_options: Optional[BackendOptions] = None
            if not format or format not in format_options:
                _log.error(
                    f"Input document {obj.name} with format {format} does not match "
                    f"any allowed format: ({format_options.keys()})"
                )
                backend = _DummyBackend
            else:
                options = format_options[format]
                backend = options.backend
                if "backend_options" in options.model_fields_set:
                    backend_options = cast("FormatOption", options).backend_options

            path_or_stream: Union[BytesIO, Path]
            if isinstance(obj, Path):
                path_or_stream = obj
            elif isinstance(obj, DocumentStream):
                path_or_stream = obj.stream
            else:
                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

            yield InputDocument(
                path_or_stream=path_or_stream,
                format=format,  # type: ignore[arg-type]
                filename=obj.name,
                limits=self.limits,
                backend=backend,
                backend_options=backend_options,
            )

    def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
        content = b""  # empty binary blob
        formats: list[InputFormat] = []

        if isinstance(obj, Path):
            mime = filetype.guess_mime(str(obj))
            if mime is None:
                ext = obj.suffix[1:]
                mime = _DocumentConversionInput._mime_from_extension(ext)
            if mime is None:  # must guess from the content
                with obj.open("rb") as f:
                    content = f.read(1024)  # Read first 1KB
            if mime is not None and mime.lower() == "application/zip":
                mime_root = "application/vnd.openxmlformats-officedocument"
                if obj.suffixes[-1].lower() == ".xlsx":
                    mime = mime_root + ".spreadsheetml.sheet"
                elif obj.suffixes[-1].lower() == ".docx":
                    mime = mime_root + ".wordprocessingml.document"
                elif obj.suffixes[-1].lower() == ".pptx":
                    mime = mime_root + ".presentationml.presentation"

        elif isinstance(obj, DocumentStream):
            content = obj.stream.read(8192)
            obj.stream.seek(0)
            mime = filetype.guess_mime(content)
            if mime is None:
                ext = (
                    obj.name.rsplit(".", 1)[-1]
                    if ("." in obj.name and not obj.name.startswith("."))
                    else ""
                )
                mime = _DocumentConversionInput._mime_from_extension(ext.lower())
            if mime is not None and mime.lower() == "application/zip":
                objname = obj.name.lower()
                mime_root = "application/vnd.openxmlformats-officedocument"
                if objname.endswith(".xlsx"):
                    mime = mime_root + ".spreadsheetml.sheet"
                elif objname.endswith(".docx"):
                    mime = mime_root + ".wordprocessingml.document"
                elif objname.endswith(".pptx"):
                    mime = mime_root + ".presentationml.presentation"

        if mime is not None and mime.lower() == "application/gzip":
            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
                mime = detected_mime

        mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
        mime = mime or _DocumentConversionInput._detect_csv(content)
        mime = mime or "text/plain"
        formats = MimeTypeToFormat.get(mime, [])
        _log.info(f"detected formats: {formats}")

        if formats:
            if len(formats) == 1 and mime != "text/plain":
                return formats[0]
            else:  # ambiguity in formats
                return _DocumentConversionInput._guess_from_content(
                    content, mime, formats
                )
        else:
            return None

    @staticmethod
    def _guess_from_content(
        content: bytes, mime: str, formats: list[InputFormat]
    ) -> Optional[InputFormat]:
        """Guess the input format of a document by checking part of its content."""
        input_format: Optional[InputFormat] = None

        if mime == "application/xml":
            content_str = content.decode("utf-8")
            match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
            if match_doctype:
                xml_doctype = match_doctype.group()
                if InputFormat.XML_USPTO in formats and any(
                    item in xml_doctype
                    for item in (
                        "us-patent-application-v4",
                        "us-patent-grant-v4",
                        "us-grant-025",
                        "patent-application-publication",
                    )
                ):
                    input_format = InputFormat.XML_USPTO

                if InputFormat.XML_JATS in formats and (
                    "JATS-journalpublishing" in xml_doctype
                    or "JATS-archive" in xml_doctype
                ):
                    input_format = InputFormat.XML_JATS

        elif mime == "text/plain":
            content_str = content.decode("utf-8")
            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                input_format = InputFormat.XML_USPTO

        return input_format

    @staticmethod
    def _mime_from_extension(ext):
        mime = None
        if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
            mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
        elif ext in FormatToExtensions[InputFormat.HTML]:
            mime = FormatToMimeType[InputFormat.HTML][0]
        elif ext in FormatToExtensions[InputFormat.MD]:
            mime = FormatToMimeType[InputFormat.MD][0]
        elif ext in FormatToExtensions[InputFormat.CSV]:
            mime = FormatToMimeType[InputFormat.CSV][0]
        elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
            mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
        elif ext in FormatToExtensions[InputFormat.PDF]:
            mime = FormatToMimeType[InputFormat.PDF][0]
        elif ext in FormatToExtensions[InputFormat.DOCX]:
            mime = FormatToMimeType[InputFormat.DOCX][0]
        elif ext in FormatToExtensions[InputFormat.PPTX]:
            mime = FormatToMimeType[InputFormat.PPTX][0]
        elif ext in FormatToExtensions[InputFormat.XLSX]:
            mime = FormatToMimeType[InputFormat.XLSX][0]
        elif ext in FormatToExtensions[InputFormat.VTT]:
            mime = FormatToMimeType[InputFormat.VTT][0]

        return mime

    @staticmethod
    def _detect_html_xhtml(
        content: bytes,
    ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
        """Guess the mime type of an XHTML, HTML, or XML file from its content.

        Args:
            content: A short piece of a document from its beginning.

        Returns:
            The mime type of an XHTML, HTML, or XML file, or None if the content does
            not match any of these formats.
        """
        content_str = content.decode("ascii", errors="ignore").lower()
        # Remove XML comments
        content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
        content_str = content_str.lstrip()

        if re.match(r"<\?xml", content_str):
            if "xhtml" in content_str[:1000]:
                return "application/xhtml+xml"
            else:
                return "application/xml"

        if re.match(
            r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
            content_str,
            re.DOTALL,
        ):
            return "text/html"

        p = re.compile(
            r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
        )
        if p.search(content_str):
            return "application/xml"

        return None

    @staticmethod
    def _detect_csv(
        content: bytes,
    ) -> Optional[Literal["text/csv"]]:
        """Guess the mime type of a CSV file from its content.

        Args:
            content: A short piece of a document from its beginning.

        Returns:
            The mime type of a CSV file, or None if the content does
            not match the format.
        """
        content_str = content.decode("ascii", errors="ignore").strip()

        # Ensure there's at least one newline (CSV is usually multi-line)
        if "\n" not in content_str:
            return None

        # Use csv.Sniffer to detect CSV characteristics
        try:
            dialect = csv.Sniffer().sniff(content_str)
            if dialect.delimiter in {",", ";", "\t", "|"}:  # Common delimiters
                return "text/csv"
        except csv.Error:
            return None

        return None

    @staticmethod
    def _detect_mets_gbs(
        obj: Union[Path, DocumentStream],
    ) -> Optional[Literal["application/mets+xml"]]:
        content = obj if isinstance(obj, Path) else obj.stream
        tar: tarfile.TarFile
        member: tarfile.TarInfo
        with tarfile.open(
            name=content if isinstance(content, Path) else None,
            fileobj=content if isinstance(content, BytesIO) else None,
            mode="r:gz",
        ) as tar:
            for member in tar.getmembers():
                if member.name.endswith(".xml"):
                    file = tar.extractfile(member)
                    if file is not None:
                        content_str = file.read().decode(errors="ignore")
                        if "http://www.loc.gov/METS/" in content_str:
                            return "application/mets+xml"
        return None
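The content sniffers at the end of the module can be exercised in isolation; a short sketch (illustrative only, these are private helpers):

    from docling.datamodel.document import _DocumentConversionInput

    print(_DocumentConversionInput._detect_html_xhtml(b"<!doctype html><html>"))  # text/html
    print(_DocumentConversionInput._detect_csv(b"a,b,c\n1,2,3\n4,5,6\n"))         # text/csv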