docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,699 @@
1
import csv
import importlib
import importlib.metadata  # required: `import importlib` alone does not load the submodule
import json
import logging
import platform
import re
import sys
import tarfile
import zipfile
from collections.abc import Iterable, Mapping
from datetime import datetime
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import (
    TYPE_CHECKING,
    Annotated,
    Literal,
    Optional,
    Type,
    Union,
    cast,
)

import filetype

# DO NOT REMOVE; explicitly exposed from this location
from docling_core.types.doc import (
    DocItem,
    DocItemLabel,
    DoclingDocument,
    PictureItem,
    SectionHeaderItem,
    TableItem,
    TextItem,
)
from docling_core.types.doc.document import ListItem
from docling_core.types.legacy_doc.base import (
    BaseText,
    Figure,
    GlmTableCell,
    PageDimensions,
    PageReference,
    Prov,
    Ref,
    Table as DsSchemaTable,
    TableCell,
)
from docling_core.types.legacy_doc.document import (
    CCSDocumentDescription as DsDocumentDescription,
    CCSFileInfoObject as DsFileInfoObject,
    ExportedCCSDocument as DsDocument,
)
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel, Field
from typing_extensions import deprecated

from docling.backend.abstract_backend import (
    AbstractDocumentBackend,
    DeclarativeDocumentBackend,
    PaginatedDocumentBackend,
)
from docling.datamodel.backend_options import BackendOptions
from docling.datamodel.base_models import (
    AssembledUnit,
    ConfidenceReport,
    ConversionStatus,
    DocumentStream,
    ErrorItem,
    FormatToExtensions,
    FormatToMimeType,
    InputFormat,
    MimeTypeToFormat,
    Page,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash
80
+
81
+ if TYPE_CHECKING:
82
+ from docling.datamodel.base_models import BaseFormatOption
83
+ from docling.document_converter import FormatOption
84
+
85
+ _log = logging.getLogger(__name__)
86
+
87
# Mapping from DoclingDocument item labels to the element-type strings used by
# the legacy DS document schema.  Several labels intentionally collapse onto
# the same legacy type (LIST_ITEM, CODE, TEXT and PARAGRAPH all map to
# "paragraph"; DOCUMENT_INDEX maps to "table").
layout_label_to_ds_type = {
    DocItemLabel.TITLE: "title",
    DocItemLabel.DOCUMENT_INDEX: "table",
    DocItemLabel.SECTION_HEADER: "subtitle-level-1",
    DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
    DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
    DocItemLabel.CAPTION: "caption",
    DocItemLabel.PAGE_HEADER: "page-header",
    DocItemLabel.PAGE_FOOTER: "page-footer",
    DocItemLabel.FOOTNOTE: "footnote",
    DocItemLabel.TABLE: "table",
    DocItemLabel.FORMULA: "equation",
    DocItemLabel.LIST_ITEM: "paragraph",
    DocItemLabel.CODE: "paragraph",
    DocItemLabel.PICTURE: "figure",
    DocItemLabel.TEXT: "paragraph",
    DocItemLabel.PARAGRAPH: "paragraph",
    # FORM and KEY_VALUE_REGION keep their own enum values as type names.
    DocItemLabel.FORM: DocItemLabel.FORM.value,
    DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
}

# Shared placeholder document used as the default `document` value for
# results that have not (yet) produced a real conversion output.
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
109
+
110
+
111
class InputDocument(BaseModel):
    """A document as an input of a Docling conversion.

    Wraps a filesystem path or in-memory stream together with its detected
    format, the backend class used to open it, and validity bookkeeping
    against the configured :class:`DocumentLimits`.  Construction never
    raises for unreadable inputs; failures are logged and recorded by
    setting ``valid`` to ``False``.
    """

    file: Annotated[
        PurePath, Field(description="A path representation the input document.")
    ]
    document_hash: Annotated[
        str,
        Field(description="A stable hash of the path or stream of the input document."),
    ]
    valid: bool = Field(True, description="Whether this is a valid input document.")
    backend_options: Optional[BackendOptions] = Field(
        None, description="Custom options for backends."
    )
    limits: DocumentLimits = Field(
        DocumentLimits(), description="Limits in the input document for the conversion."
    )
    format: Annotated[InputFormat, Field(description="The document format.")]

    filesize: Optional[int] = Field(
        None, description="Size of the input file, in bytes."
    )
    page_count: int = Field(0, description="Number of pages in the input document.")

    # Backend instance created by _init_doc; a private attribute, not a
    # pydantic field.
    _backend: AbstractDocumentBackend

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
        format: InputFormat,
        backend: Type[AbstractDocumentBackend],
        backend_options: Optional[BackendOptions] = None,
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
    ) -> None:
        """Create an input document and eagerly open its backend.

        Args:
            path_or_stream: Source of the document content.
            format: The (already detected) input format.
            backend: Backend class instantiated to read the document.
            backend_options: Optional backend-specific options.
            filename: Required when ``path_or_stream`` is a stream, to carry
                the original file name.
            limits: Size/page limits; defaults to ``DocumentLimits()``.
        """
        super().__init__(
            file="",
            document_hash="",
            format=InputFormat.PDF,
            backend_options=backend_options,
        )  # initialize with dummy values; real values are assigned below
        self.limits = limits or DocumentLimits()
        self.format = format

        try:
            if isinstance(path_or_stream, Path):
                self.file = path_or_stream
                self.filesize = path_or_stream.stat().st_size
                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._init_doc(backend, path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
                assert filename is not None, (
                    "Can't construct InputDocument from stream without providing "
                    "filename arg."
                )
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._init_doc(backend, path_or_stream)
            else:
                raise RuntimeError(
                    f"Unexpected type path_or_stream: {type(path_or_stream)}"
                )

            # For paginated backends, check if the maximum page count is exceeded.
            if self.valid and self._backend.is_valid():
                if self._backend.supports_pagination() and isinstance(
                    self._backend, PaginatedDocumentBackend
                ):
                    self.page_count = self._backend.page_count()
                    if self.page_count > self.limits.max_num_pages:
                        self.valid = False
                    elif self.page_count < self.limits.page_range[0]:
                        self.valid = False

        except (FileNotFoundError, OSError) as e:
            # Unreadable input is reported via `valid`, not raised.
            self.valid = False
            _log.exception(
                f"File {self.file.name} not found or cannot be opened.", exc_info=e
            )
            # raise
        except RuntimeError as e:
            self.valid = False
            _log.exception(
                "An unexpected error occurred while opening the document "
                f"{self.file.name}",
                exc_info=e,
            )
            # raise

    def _init_doc(
        self,
        backend: Type[AbstractDocumentBackend],
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
        """Instantiate the backend for this document and record its validity."""
        if self.backend_options:
            self._backend = backend(
                self,
                path_or_stream=path_or_stream,
                options=self.backend_options,
            )
        else:
            # Backends without option support keep the two-argument signature.
            self._backend = backend(self, path_or_stream=path_or_stream)

        if not self._backend.is_valid():
            self.valid = False
225
+
226
+
227
class DocumentFormat(str, Enum):
    """Version of the exported document schema."""

    V2 = "v2"
    V1 = "v1"
230
+
231
+
232
class DoclingVersion(BaseModel):
    """Snapshot of the installed docling stack and Python runtime versions.

    All defaults are evaluated once, at class-definition (import) time.
    """

    # NOTE(review): relies on `importlib.metadata` being importable as an
    # attribute of `importlib` — confirm `import importlib.metadata` is present
    # at the top of the file.
    docling_version: str = importlib.metadata.version("docling")
    docling_core_version: str = importlib.metadata.version("docling-core")
    docling_ibm_models_version: str = importlib.metadata.version("docling-ibm-models")
    docling_parse_version: str = importlib.metadata.version("docling-parse")
    platform_str: str = platform.platform()
    py_impl_version: str = sys.implementation.cache_tag
    py_lang_version: str = platform.python_version()
240
+
241
+
242
class ConversionAssets(BaseModel):
    """Artifacts produced by a conversion run, with ZIP-based persistence.

    ``save()`` writes each component as its own JSON file inside a single ZIP
    archive; ``load()`` reads such an archive back, tolerating missing or
    malformed entries by falling back to defaults.
    """

    version: DoclingVersion = DoclingVersion()
    # When the assets were saved (ISO string from datetime.now())
    timestamp: Optional[str] = None

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
    errors: list[ErrorItem] = []  # structure to keep errors

    pages: list[Page] = []
    timings: dict[str, ProfilingItem] = {}
    confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)

    document: DoclingDocument = _EMPTY_DOCLING_DOC

    @property
    @deprecated("Use document instead.")
    def legacy_document(self):
        # Converted on every access; result is not cached.
        return docling_document_to_legacy(self.document)

    def save(
        self,
        *,
        filename: Union[str, Path],
        indent: Optional[int] = 2,
    ):
        """Serialize the full ConversionAssets to JSON.

        Builds the whole ZIP archive in memory first, then writes it to
        ``filename`` in one pass, creating parent directories as needed.
        Also updates ``self.timestamp`` as a side effect.
        """
        if isinstance(filename, str):
            filename = Path(filename)
        # Build an in-memory ZIP archive containing JSON for each asset
        buf = BytesIO()

        def to_jsonable(obj):
            # Best-effort conversion of pydantic models, enums, and containers
            # into plain JSON-serializable structures.
            try:
                # pydantic v2 models
                if hasattr(obj, "model_dump"):
                    return obj.model_dump(mode="json")  # type: ignore[attr-defined]
            except TypeError:
                # some models may not accept the `mode` argument
                return obj.model_dump()  # type: ignore[attr-defined]

            # enums
            try:
                from enum import Enum

                if isinstance(obj, Enum):
                    return obj.value
            except Exception:
                pass

            # containers (recursively converted)
            if isinstance(obj, list):
                return [to_jsonable(x) for x in obj]
            if isinstance(obj, dict):
                return {k: to_jsonable(v) for k, v in obj.items()}

            # passthrough primitives
            return obj

        with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:

            def write_json(name: str, payload) -> None:
                # One JSON file per asset inside the archive.
                data = json.dumps(
                    to_jsonable(payload), ensure_ascii=False, indent=indent
                )
                zf.writestr(name, data.encode("utf-8"))

            # Update and persist a save timestamp
            self.timestamp = datetime.now().isoformat()
            write_json("timestamp.json", self.timestamp)

            # Store each component in its own JSON file
            write_json("version.json", self.version)
            write_json("status.json", self.status)
            write_json("errors.json", self.errors)
            write_json("pages.json", self.pages)
            write_json("timings.json", self.timings)
            write_json("confidence.json", self.confidence)
            # For the document, ensure stable schema via export_to_dict
            doc_dict = self.document.export_to_dict()
            zf.writestr(
                "document.json",
                json.dumps(doc_dict, ensure_ascii=False, indent=indent).encode("utf-8"),
            )

        # Persist the ZIP to disk
        buf.seek(0)
        if filename.parent and not filename.parent.exists():
            filename.parent.mkdir(parents=True, exist_ok=True)
        with filename.open("wb") as f:
            f.write(buf.getvalue())

    @classmethod
    def load(cls, filename: Union[str, Path]) -> "ConversionAssets":
        """Load a ConversionAssets.

        Each archive member is read independently; a missing or invalid
        member leaves the corresponding field at its default instead of
        failing the whole load.
        """
        if isinstance(filename, str):
            filename = Path(filename)

        # Read the ZIP and deserialize all items
        version_info: DoclingVersion = DoclingVersion()
        timestamp: Optional[str] = None
        status = ConversionStatus.PENDING
        errors: list[ErrorItem] = []
        pages: list[Page] = []
        timings: dict[str, ProfilingItem] = {}
        confidence = ConfidenceReport()
        document: DoclingDocument = _EMPTY_DOCLING_DOC

        with zipfile.ZipFile(filename, mode="r") as zf:

            def read_json(name: str):
                # Returns None when the archive member is absent
                # (zipfile raises KeyError for unknown member names).
                try:
                    with zf.open(name, "r") as fp:
                        return json.loads(fp.read().decode("utf-8"))
                except KeyError:
                    return None

            # version
            if (data := read_json("version.json")) is not None:
                try:
                    version_info = DoclingVersion.model_validate(data)
                except Exception as exc:
                    _log.error(f"Could not read version: {exc}")

            # timestamp
            if (data := read_json("timestamp.json")) is not None:
                if isinstance(data, str):
                    timestamp = data

            # status
            if (data := read_json("status.json")) is not None:
                try:
                    status = ConversionStatus(data)
                except Exception:
                    status = ConversionStatus.PENDING

            # errors
            if (data := read_json("errors.json")) is not None and isinstance(
                data, list
            ):
                errors = [ErrorItem.model_validate(item) for item in data]

            # pages
            if (data := read_json("pages.json")) is not None and isinstance(data, list):
                pages = [Page.model_validate(item) for item in data]

            # timings
            if (data := read_json("timings.json")) is not None and isinstance(
                data, dict
            ):
                timings = {k: ProfilingItem.model_validate(v) for k, v in data.items()}

            # confidence
            if (data := read_json("confidence.json")) is not None and isinstance(
                data, dict
            ):
                confidence = ConfidenceReport.model_validate(data)

            # document
            if (data := read_json("document.json")) is not None and isinstance(
                data, dict
            ):
                document = DoclingDocument.model_validate(data)

        return cls(
            version=version_info,
            timestamp=timestamp,
            status=status,
            errors=errors,
            pages=pages,
            timings=timings,
            confidence=confidence,
            document=document,
        )
415
+
416
+
417
class ConversionResult(ConversionAssets):
    """Conversion assets plus the originating input document."""

    input: InputDocument
    assembled: AssembledUnit = AssembledUnit()
420
+
421
+
422
class _DummyBackend(AbstractDocumentBackend):
    """Placeholder backend assigned when no allowed format matches an input.

    Always reports itself invalid so downstream processing skips the
    document instead of attempting to convert it.

    Note: the original redundant ``__init__`` (which only forwarded
    ``*args, **kwargs`` to ``super().__init__``) has been removed; the
    inherited constructor is used directly.
    """

    def is_valid(self) -> bool:
        # A dummy backend never yields a usable document.
        return False

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        # Matches no input format by design.
        return set()

    @classmethod
    def supports_pagination(cls) -> bool:
        return False

    def unload(self):
        return super().unload()
439
+
440
+
441
class _DocumentConversionInput(BaseModel):
    """Resolves raw conversion inputs (paths, URLs, streams) into InputDocuments.

    Performs format detection (MIME sniffing via ``filetype``, extension
    lookup, and content heuristics) and pairs each input with the backend
    configured for its detected format.
    """

    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
    headers: Optional[dict[str, str]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

    def docs(
        self,
        format_options: Mapping[InputFormat, "BaseFormatOption"],
    ) -> Iterable[InputDocument]:
        """Yield an InputDocument for every item in the input iterator.

        Inputs whose detected format is not present in ``format_options``
        are still yielded, but with a ``_DummyBackend`` so they are flagged
        invalid downstream rather than dropped silently.
        """
        for item in self.path_or_stream_iterator:
            # Strings are treated as sources (e.g. URLs) and resolved to streams.
            obj = (
                resolve_source_to_stream(item, self.headers)
                if isinstance(item, str)
                else item
            )
            format = self._guess_format(obj)
            backend: Type[AbstractDocumentBackend]
            backend_options: Optional[BackendOptions] = None
            if not format or format not in format_options:
                _log.error(
                    f"Input document {obj.name} with format {format} does not match "
                    f"any allowed format: ({format_options.keys()})"
                )
                backend = _DummyBackend
            else:
                options = format_options[format]
                backend = options.backend
                # Only FormatOption instances carry backend_options; detect via
                # the pydantic fields-set bookkeeping.
                if "backend_options" in options.model_fields_set:
                    backend_options = cast("FormatOption", options).backend_options

            path_or_stream: Union[BytesIO, Path]
            if isinstance(obj, Path):
                path_or_stream = obj
            elif isinstance(obj, DocumentStream):
                path_or_stream = obj.stream
            else:
                raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

            yield InputDocument(
                path_or_stream=path_or_stream,
                format=format,  # type: ignore[arg-type]
                filename=obj.name,
                limits=self.limits,
                backend=backend,
                backend_options=backend_options,
            )

    def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
        """Detect the InputFormat of a path or stream, or None if unknown."""
        content = b""  # empty binary blob
        formats: list[InputFormat] = []

        if isinstance(obj, Path):
            mime = filetype.guess_mime(str(obj))
            if mime is None:
                ext = obj.suffix[1:]
                mime = _DocumentConversionInput._mime_from_extension(ext)
            if mime is None:  # must guess from the content heuristics below
                with obj.open("rb") as f:
                    content = f.read(1024)  # Read first 1KB
            if mime is not None and mime.lower() == "application/zip":
                # OOXML files are ZIP containers; disambiguate by extension.
                mime_root = "application/vnd.openxmlformats-officedocument"
                if obj.suffixes[-1].lower() == ".xlsx":
                    mime = mime_root + ".spreadsheetml.sheet"
                elif obj.suffixes[-1].lower() == ".docx":
                    mime = mime_root + ".wordprocessingml.document"
                elif obj.suffixes[-1].lower() == ".pptx":
                    mime = mime_root + ".presentationml.presentation"

        elif isinstance(obj, DocumentStream):
            content = obj.stream.read(8192)
            obj.stream.seek(0)
            mime = filetype.guess_mime(content)
            if mime is None:
                ext = (
                    obj.name.rsplit(".", 1)[-1]
                    if ("." in obj.name and not obj.name.startswith("."))
                    else ""
                )
                mime = _DocumentConversionInput._mime_from_extension(ext.lower())
            if mime is not None and mime.lower() == "application/zip":
                objname = obj.name.lower()
                mime_root = "application/vnd.openxmlformats-officedocument"
                if objname.endswith(".xlsx"):
                    mime = mime_root + ".spreadsheetml.sheet"
                elif objname.endswith(".docx"):
                    mime = mime_root + ".wordprocessingml.document"
                elif objname.endswith(".pptx"):
                    mime = mime_root + ".presentationml.presentation"

        if mime is not None and mime.lower() == "application/gzip":
            # Gzip may be a METS (Google Books) tarball; check the contents.
            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
                mime = detected_mime

        mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
        mime = mime or _DocumentConversionInput._detect_csv(content)
        mime = mime or "text/plain"
        formats = MimeTypeToFormat.get(mime, [])
        _log.info(f"detected formats: {formats}")

        if formats:
            # BUG FIX: was `mime not in ("text/plain")` — a parenthesized
            # string, so `in` performed a substring test, not tuple membership.
            if len(formats) == 1 and mime != "text/plain":
                return formats[0]
            else:  # ambiguity in formats
                return _DocumentConversionInput._guess_from_content(
                    content, mime, formats
                )
        else:
            return None

    @staticmethod
    def _guess_from_content(
        content: bytes, mime: str, formats: list[InputFormat]
    ) -> Optional[InputFormat]:
        """Guess the input format of a document by checking part of its content."""
        input_format: Optional[InputFormat] = None

        if mime == "application/xml":
            # errors="replace" guards against a multi-byte char cut by the
            # fixed-size read of the content prefix.
            content_str = content.decode("utf-8", errors="replace")
            match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
            if match_doctype:
                xml_doctype = match_doctype.group()
                if InputFormat.XML_USPTO in formats and any(
                    item in xml_doctype
                    for item in (
                        "us-patent-application-v4",
                        "us-patent-grant-v4",
                        "us-grant-025",
                        "patent-application-publication",
                    )
                ):
                    input_format = InputFormat.XML_USPTO

                if InputFormat.XML_JATS in formats and (
                    "JATS-journalpublishing" in xml_doctype
                    or "JATS-archive" in xml_doctype
                ):
                    input_format = InputFormat.XML_JATS

        elif mime == "text/plain":
            content_str = content.decode("utf-8", errors="replace")
            # Plain-text USPTO patent files start with a PATN record.
            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                input_format = InputFormat.XML_USPTO

        return input_format

    @staticmethod
    def _mime_from_extension(ext):
        """Map a bare file extension (no dot) to a canonical MIME type."""
        mime = None
        if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
            mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
        elif ext in FormatToExtensions[InputFormat.HTML]:
            mime = FormatToMimeType[InputFormat.HTML][0]
        elif ext in FormatToExtensions[InputFormat.MD]:
            mime = FormatToMimeType[InputFormat.MD][0]
        elif ext in FormatToExtensions[InputFormat.CSV]:
            mime = FormatToMimeType[InputFormat.CSV][0]
        elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
            mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
        elif ext in FormatToExtensions[InputFormat.PDF]:
            mime = FormatToMimeType[InputFormat.PDF][0]
        elif ext in FormatToExtensions[InputFormat.DOCX]:
            mime = FormatToMimeType[InputFormat.DOCX][0]
        elif ext in FormatToExtensions[InputFormat.PPTX]:
            mime = FormatToMimeType[InputFormat.PPTX][0]
        elif ext in FormatToExtensions[InputFormat.XLSX]:
            mime = FormatToMimeType[InputFormat.XLSX][0]
        elif ext in FormatToExtensions[InputFormat.VTT]:
            mime = FormatToMimeType[InputFormat.VTT][0]

        return mime

    @staticmethod
    def _detect_html_xhtml(
        content: bytes,
    ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
        """Guess the mime type of an XHTML, HTML, or XML file from its content.

        Args:
            content: A short piece of a document from its beginning.

        Returns:
            The mime type of an XHTML, HTML, or XML file, or None if the content does
            not match any of these formats.
        """
        content_str = content.decode("ascii", errors="ignore").lower()
        # Remove XML comments
        content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
        content_str = content_str.lstrip()

        if re.match(r"<\?xml", content_str):
            if "xhtml" in content_str[:1000]:
                return "application/xhtml+xml"
            else:
                return "application/xml"

        if re.match(
            r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
            content_str,
            re.DOTALL,
        ):
            return "text/html"

        # Generic XML: a doctype whose root element opens right after it.
        p = re.compile(
            r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
        )
        if p.search(content_str):
            return "application/xml"

        return None

    @staticmethod
    def _detect_csv(
        content: bytes,
    ) -> Optional[Literal["text/csv"]]:
        """Guess the mime type of a CSV file from its content.

        Args:
            content: A short piece of a document from its beginning.

        Returns:
            The mime type of a CSV file, or None if the content does
            not match any of the format.
        """
        content_str = content.decode("ascii", errors="ignore").strip()

        # Ensure there's at least one newline (CSV is usually multi-line)
        if "\n" not in content_str:
            return None

        # Use csv.Sniffer to detect CSV characteristics
        try:
            dialect = csv.Sniffer().sniff(content_str)
            if dialect.delimiter in {",", ";", "\t", "|"}:  # Common delimiters
                return "text/csv"
        except csv.Error:
            return None

        return None

    @staticmethod
    def _detect_mets_gbs(
        obj: Union[Path, DocumentStream],
    ) -> Optional[Literal["application/mets+xml"]]:
        """Check whether a gzip input is a METS (Google Books) tarball.

        Returns the METS mime type when an XML member referencing the METS
        namespace is found, else None.  A gzip that is not a valid tar is
        treated as "not METS" instead of raising.
        """
        source = obj if isinstance(obj, Path) else obj.stream
        try:
            with tarfile.open(
                name=source if isinstance(source, Path) else None,
                fileobj=source if isinstance(source, BytesIO) else None,
                mode="r:gz",
            ) as tar:
                for member in tar.getmembers():
                    if member.name.endswith(".xml"):
                        file = tar.extractfile(member)
                        if file is not None:
                            content_str = file.read().decode(errors="ignore")
                            if "http://www.loc.gov/METS/" in content_str:
                                return "application/mets+xml"
        except tarfile.TarError:
            # BUG FIX: tarfile.ReadError previously propagated out of format
            # detection for non-tar gzip inputs.
            return None
        finally:
            # BUG FIX: restore the stream position so later readers (hashing,
            # backends) see the full content.
            if isinstance(source, BytesIO):
                source.seek(0)
        return None