docling-2.69.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
docling/document_extractor.py ADDED
@@ -0,0 +1,327 @@
1
+ import hashlib
2
+ import logging
3
+ import sys
4
+ import threading
5
+ import time
6
+ import warnings
7
+ from collections.abc import Iterable, Iterator
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from functools import partial
10
+ from pathlib import Path
11
+ from typing import Optional, Type, Union
12
+
13
+ from pydantic import ConfigDict, model_validator, validate_call
14
+ from typing_extensions import Self
15
+
16
+ from docling.backend.abstract_backend import AbstractDocumentBackend
17
+ from docling.backend.image_backend import ImageDocumentBackend
18
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
19
+ from docling.datamodel.base_models import (
20
+ BaseFormatOption,
21
+ ConversionStatus,
22
+ DoclingComponentType,
23
+ DocumentStream,
24
+ ErrorItem,
25
+ InputFormat,
26
+ )
27
+ from docling.datamodel.document import (
28
+ InputDocument,
29
+ _DocumentConversionInput, # intentionally reused builder
30
+ )
31
+ from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
32
+ from docling.datamodel.pipeline_options import PipelineOptions
33
+ from docling.datamodel.settings import (
34
+ DEFAULT_PAGE_RANGE,
35
+ DocumentLimits,
36
+ PageRange,
37
+ settings,
38
+ )
39
+ from docling.exceptions import ConversionError
40
+ from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
41
+ from docling.pipeline.extraction_vlm_pipeline import ExtractionVlmPipeline
42
+ from docling.utils.utils import chunkify
43
+
44
+ _log = logging.getLogger(__name__)
45
+ _PIPELINE_CACHE_LOCK = threading.Lock()
46
+
47
+
48
class ExtractionFormatOption(BaseFormatOption):
    """Extraction settings attached to a single input format.

    Notes:
        - `pipeline_cls` must subclass `BaseExtractionPipeline`.
        - `pipeline_options` is typed as `PipelineOptions` which MUST inherit from
          `BaseOptions` (as used by `BaseExtractionPipeline`).
        - `backend` is the document-opening backend used by `_DocumentConversionInput`.
    """

    # Pipeline class that is instantiated to run the extraction.
    pipeline_cls: Type[BaseExtractionPipeline]

    @model_validator(mode="after")
    def set_optional_field_default(self) -> Self:
        """Populate `pipeline_options` from the pipeline class when it is unset.

        Returns:
            The validated model instance itself.
        """
        if self.pipeline_options is not None:
            return self

        # `get_default_options` comes from BaseExtractionPipeline
        default_options = self.pipeline_cls.get_default_options()
        self.pipeline_options = default_options  # type: ignore[assignment]
        return self
66
+
67
+
68
def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
    """Build the default extraction option for the input format `fmt`.

    The backend choice mirrors the converter's *backend* defaults, while the
    pipeline is always the VLM extractor. This duplication will be removed when
    the format registry is deduplicated between convert/extract.

    Args:
        fmt: Input format to look up.

    Returns:
        An `ExtractionFormatOption` pairing `ExtractionVlmPipeline` with the
        default backend for `fmt`.

    Raises:
        RuntimeError: If no default backend is registered for `fmt`.
    """
    default_backends: dict[InputFormat, Type[AbstractDocumentBackend]] = {
        InputFormat.IMAGE: ImageDocumentBackend,
        InputFormat.PDF: PyPdfiumDocumentBackend,
    }

    if fmt not in default_backends:
        raise RuntimeError(f"No default extraction backend configured for {fmt}")

    return ExtractionFormatOption(
        pipeline_cls=ExtractionVlmPipeline,
        backend=default_backends[fmt],
    )
88
+
89
+
90
class DocumentExtractor:
    """Standalone extractor that runs an extraction pipeline over documents.

    Public API:
        - `extract(...) -> ExtractionResult`
        - `extract_all(...) -> Iterator[ExtractionResult]`

    Implementation intentionally reuses `_DocumentConversionInput` to build
    `InputDocument` with the correct backend per format.
    """

    def __init__(
        self,
        allowed_formats: Optional[list[InputFormat]] = None,
        extraction_format_options: Optional[
            dict[InputFormat, ExtractionFormatOption]
        ] = None,
    ) -> None:
        """Resolve the per-format extraction options.

        Args:
            allowed_formats: Formats this extractor accepts. When omitted, every
                `InputFormat` that has either a user override or a built-in
                default option is enabled.
            extraction_format_options: Per-format overrides; an override always
                wins over the built-in default for that format.

        Raises:
            RuntimeError: If a format listed explicitly in `allowed_formats` has
                neither an override nor a built-in default backend.
        """
        overrides = extraction_format_options or {}

        def resolve(fmt: InputFormat) -> ExtractionFormatOption:
            # Build the default lazily: the default factory raises RuntimeError
            # for formats without a built-in backend, which must not prevent a
            # user-supplied override from being used.
            if fmt in overrides:
                return overrides[fmt]
            return _get_default_extraction_option(fmt)

        format_to_options: dict[InputFormat, ExtractionFormatOption] = {}
        if allowed_formats is None:
            # No explicit allow-list: keep only formats that can be served, so
            # that default construction does not fail on unsupported formats.
            for fmt in InputFormat:
                try:
                    format_to_options[fmt] = resolve(fmt)
                except RuntimeError:
                    continue
            formats = list(format_to_options)
        else:
            # Explicit allow-list: an unsupported format is a caller error.
            for fmt in allowed_formats:
                format_to_options[fmt] = resolve(fmt)
            formats = allowed_formats

        self.allowed_formats: list[InputFormat] = formats
        self.extraction_format_to_options: dict[
            InputFormat, ExtractionFormatOption
        ] = format_to_options

        # Cache pipelines by (class, options-hash)
        self._initialized_pipelines: dict[
            tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
        ] = {}

    # ---------------------------- Public API ---------------------------------

    @validate_call(config=ConfigDict(strict=True))
    def extract(
        self,
        source: Union[Path, str, DocumentStream],
        template: ExtractionTemplateType,
        headers: Optional[dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> ExtractionResult:
        """Extract from a single source and return its result.

        Thin wrapper around `extract_all` called with a one-element source list;
        all other arguments are forwarded unchanged.

        Raises:
            ConversionError: Propagated from `extract_all`, or raised here when
                no result was produced for `source`.
        """
        results = self.extract_all(
            source=[source],
            headers=headers,
            raises_on_error=raises_on_error,
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
            page_range=page_range,
            template=template,
        )
        try:
            return next(results)
        except StopIteration:
            # Without this, an empty result stream would leak a bare
            # StopIteration to the caller (and turn into a RuntimeError if the
            # caller is itself a generator).
            raise ConversionError(
                "Extraction produced no result for the provided source."
            ) from None

    @validate_call(config=ConfigDict(strict=True))
    def extract_all(
        self,
        source: Iterable[Union[Path, str, DocumentStream]],
        template: ExtractionTemplateType,
        headers: Optional[dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> Iterator[ExtractionResult]:
        """Lazily extract from every item in `source`.

        This is a generator: the warning below and all processing only happen
        once iteration starts.

        Yields:
            One `ExtractionResult` per processed document.

        Raises:
            ConversionError: With `raises_on_error` set, when a result has a
                status other than SUCCESS / PARTIAL_SUCCESS, or when no result
                was produced at all.
        """
        warnings.warn(
            "The extract API is currently experimental and may change without prior notice.\n"
            "Only PDF and image formats are supported.",
            UserWarning,
            stacklevel=2,
        )

        limits = DocumentLimits(
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
            page_range=page_range,
        )
        conv_input = _DocumentConversionInput(
            path_or_stream_iterator=source, limits=limits, headers=headers
        )

        ext_res_iter = self._extract(
            conv_input, raises_on_error=raises_on_error, template=template
        )

        accepted_statuses = {
            ConversionStatus.SUCCESS,
            ConversionStatus.PARTIAL_SUCCESS,
        }
        had_result = False
        for ext_res in ext_res_iter:
            had_result = True
            if raises_on_error and ext_res.status not in accepted_statuses:
                raise ConversionError(
                    f"Extraction failed for: {ext_res.input.file} with status: {ext_res.status}"
                )
            yield ext_res

        if not had_result and raises_on_error:
            raise ConversionError(
                "Extraction failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
            )

    # --------------------------- Internal engine ------------------------------

    def _extract(
        self,
        conv_input: _DocumentConversionInput,
        raises_on_error: bool,
        template: ExtractionTemplateType,
    ) -> Iterator[ExtractionResult]:
        """Process the input documents batch by batch and yield their results.

        Batches are sized by `settings.perf.doc_batch_size`. A thread pool is
        used only when both the batch size and
        `settings.perf.doc_batch_concurrency` exceed 1; otherwise documents are
        processed sequentially.
        """
        start_time = time.monotonic()

        # The per-document callable does not depend on the batch.
        process_func = partial(
            self._process_document_extraction,
            raises_on_error=raises_on_error,
            template=template,
        )

        for input_batch in chunkify(
            conv_input.docs(self.extraction_format_to_options),
            settings.perf.doc_batch_size,
        ):
            _log.info("Going to extract document batch...")

            run_threaded = (
                settings.perf.doc_batch_concurrency > 1
                and settings.perf.doc_batch_size > 1
            )
            if run_threaded:
                # Note: no per-document timing is logged on this path.
                with ThreadPoolExecutor(
                    max_workers=settings.perf.doc_batch_concurrency
                ) as pool:
                    for item in pool.map(process_func, input_batch):
                        yield item
            else:
                for item in map(process_func, input_batch):
                    # Time since the previous result (or since the start).
                    elapsed = time.monotonic() - start_time
                    start_time = time.monotonic()
                    _log.info(
                        f"Finished extracting document {item.input.file.name} in {elapsed:.2f} sec."
                    )
                    yield item

    def _process_document_extraction(
        self,
        in_doc: InputDocument,
        raises_on_error: bool,
        template: ExtractionTemplateType,
    ) -> ExtractionResult:
        """Run extraction for `in_doc` if its format is allowed.

        Returns:
            The pipeline result, or a SKIPPED result carrying a USER_INPUT error
            item when the format is not allowed and `raises_on_error` is false.

        Raises:
            ConversionError: If the format is not allowed and `raises_on_error`
                is true.
        """
        if self.allowed_formats is not None and in_doc.format in self.allowed_formats:
            return self._execute_extraction_pipeline(
                in_doc, raises_on_error=raises_on_error, template=template
            )

        error_message = f"File format not allowed: {in_doc.file}"
        if raises_on_error:
            raise ConversionError(error_message)

        error_item = ErrorItem(
            component_type=DoclingComponentType.USER_INPUT,
            module_name="",
            error_message=error_message,
        )
        return ExtractionResult(
            input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
        )

    def _execute_extraction_pipeline(
        self,
        in_doc: InputDocument,
        raises_on_error: bool,
        template: ExtractionTemplateType,
    ) -> ExtractionResult:
        """Execute the pipeline configured for the format of `in_doc`.

        Returns:
            The pipeline's result, or a FAILURE result when the document is
            invalid or no pipeline is available and `raises_on_error` is false.

        Raises:
            ConversionError: In those same two situations when `raises_on_error`
                is true.
        """
        if not in_doc.valid:
            if raises_on_error:
                raise ConversionError(f"Input document {in_doc.file} is not valid.")
            return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)

        pipeline = self._get_pipeline(in_doc.format)
        if pipeline is None:
            if raises_on_error:
                raise ConversionError(
                    f"No extraction pipeline could be initialized for {in_doc.file}."
                )
            return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)

        return pipeline.execute(
            in_doc, raises_on_error=raises_on_error, template=template
        )

    def _get_pipeline(
        self, doc_format: InputFormat
    ) -> Optional[BaseExtractionPipeline]:
        """Retrieve or initialize a pipeline, reusing instances based on class and options.

        Returns:
            The cached or newly created pipeline, or `None` when `doc_format`
            has no configured option or the option has no pipeline options.
        """
        fopt = self.extraction_format_to_options.get(doc_format)
        if fopt is None or fopt.pipeline_options is None:
            return None

        pipeline_class = fopt.pipeline_cls
        pipeline_options = fopt.pipeline_options
        options_hash = self._get_pipeline_options_hash(pipeline_options)
        cache_key = (pipeline_class, options_hash)

        # The lock is module-level, so pipeline construction is serialized
        # across all extractor instances, not just this one.
        with _PIPELINE_CACHE_LOCK:
            if cache_key in self._initialized_pipelines:
                _log.debug(
                    f"Reusing cached extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                )
            else:
                _log.info(
                    f"Initializing extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                )
                self._initialized_pipelines[cache_key] = pipeline_class(
                    pipeline_options=pipeline_options  # type: ignore[arg-type]
                )

            return self._initialized_pipelines[cache_key]

    @staticmethod
    def _get_pipeline_options_hash(pipeline_options: PipelineOptions) -> str:
        """Generate a stable hash of pipeline options to use as part of the cache key.

        The hash is the MD5 hex digest of the string form of
        `pipeline_options.model_dump()`; it is a cache key only, not a security
        measure.
        """
        options_str = str(pipeline_options.model_dump())
        digest = hashlib.md5(options_str.encode("utf-8"), usedforsecurity=False)
        return digest.hexdigest()
docling/exceptions.py ADDED
@@ -0,0 +1,10 @@
1
class BaseError(RuntimeError):
    """Common base class for the exceptions defined in this module."""
3
+
4
+
5
class ConversionError(BaseError):
    """Signals that processing a document failed (e.g. raised by `DocumentExtractor`)."""
7
+
8
+
9
class OperationNotAllowed(BaseError):
    """Error for an operation that is not allowed."""
docling/experimental/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Experimental modules for Docling.
2
+
3
+ This package contains experimental features that are under development
4
+ and may change or be removed in future versions.
5
+ """
docling/experimental/datamodel/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Experimental datamodel modules."""
docling/experimental/datamodel/table_crops_layout_options.py ADDED
@@ -0,0 +1,13 @@
1
+ """Internal options for the experimental TableCrops layout model."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from docling.datamodel.pipeline_options import BaseLayoutOptions
6
+
7
+ __all__ = ["TableCropsLayoutOptions"]
8
+
9
+
10
class TableCropsLayoutOptions(BaseLayoutOptions):
    """Configuration for `TableCropsLayoutModel` (internal-only)."""

    # Class-level identifier string for this options type.
    kind: ClassVar[str] = "docling_experimental_table_crops_layout"
docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py ADDED
@@ -0,0 +1,45 @@
1
+ """Options for the threaded layout+VLM pipeline."""
2
+
3
+ from typing import Union
4
+
5
+ from pydantic import model_validator
6
+
7
+ from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
8
+ from docling.datamodel.pipeline_options import LayoutOptions, PaginatedPipelineOptions
9
+ from docling.datamodel.pipeline_options_vlm_model import (
10
+ ApiVlmOptions,
11
+ InlineVlmOptions,
12
+ ResponseFormat,
13
+ )
14
+ from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS
15
+
16
+
17
class ThreadedLayoutVlmPipelineOptions(PaginatedPipelineOptions):
    """Settings for the threaded pipeline that combines layout and VLM stages."""

    images_scale: float = 2.0

    # VLM configuration (will be enhanced with layout awareness by the pipeline)
    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = GRANITEDOCLING_TRANSFORMERS

    # Layout model configuration
    layout_options: LayoutOptions = LayoutOptions(
        model_spec=DOCLING_LAYOUT_HERON, skip_cell_assignment=True
    )

    # Threading and batching controls
    layout_batch_size: int = 4
    vlm_batch_size: int = 4
    batch_timeout_seconds: float = 2.0
    queue_max_size: int = 50

    @model_validator(mode="after")
    def validate_response_format(self):
        """Reject any VLM response format other than DOCTAGS.

        Returns:
            The validated options instance.

        Raises:
            ValueError: If `vlm_options.response_format` is not
                `ResponseFormat.DOCTAGS`.
        """
        response_format = self.vlm_options.response_format
        if response_format != ResponseFormat.DOCTAGS:
            message = (
                "ThreadedLayoutVlmPipeline only supports DOCTAGS response format, "
                f"but got {response_format}. "
                "Please set vlm_options.response_format=ResponseFormat.DOCTAGS"
            )
            raise ValueError(message)
        return self
docling/experimental/models/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Experimental models for Docling."""
2
+
3
+ __all__: list[str] = []
docling/experimental/models/table_crops_layout_model.py ADDED
@@ -0,0 +1,114 @@
1
+ """Internal TableCrops layout model that marks full pages as table clusters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import warnings
6
+ from collections.abc import Sequence
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import numpy as np
11
+ from docling_core.types.doc import DocItemLabel
12
+
13
+ from docling.datamodel.accelerator_options import AcceleratorOptions
14
+ from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
15
+ from docling.datamodel.document import ConversionResult
16
+ from docling.experimental.datamodel.table_crops_layout_options import (
17
+ TableCropsLayoutOptions,
18
+ )
19
+ from docling.models.base_layout_model import BaseLayoutModel
20
+
21
+ __all__ = ["TableCropsLayoutModel"]
22
+
23
+
24
class TableCropsLayoutModel(BaseLayoutModel):
    """Experimental layout model that labels each full page as one table cluster.

    This is useful in cases where a Docling pipeline is applied to images of
    table crops only.

    This model is internal and not part of the stable public interface.
    """

    def __init__(
        self,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        options: TableCropsLayoutOptions,
    ):
        """Store the constructor arguments on the instance; nothing is loaded here."""
        self.artifacts_path = artifacts_path
        self.accelerator_options = accelerator_options
        self.options = options

    @classmethod
    def get_options_type(cls) -> type[TableCropsLayoutOptions]:
        """Return the options class associated with this model."""
        return TableCropsLayoutOptions

    def predict_layout(
        self,
        conv_res: ConversionResult,
        pages: Sequence[Page],
    ) -> Sequence[LayoutPrediction]:
        """Produce one layout prediction per page, in input order.

        Pages without a valid backend keep their existing layout prediction (or
        get an empty one); all other pages get a freshly built prediction and
        have their confidence scores updated on `conv_res`.
        """
        results: list[LayoutPrediction] = []

        for page in pages:
            backend = page._backend
            if backend is not None and backend.is_valid():
                page_clusters = self._build_page_clusters(page)
                page_prediction = LayoutPrediction(clusters=page_clusters)
                self._update_confidence(conv_res, page, page_clusters)
            else:
                page_prediction = page.predictions.layout or LayoutPrediction()

            results.append(page_prediction)

        return results

    def _build_page_clusters(self, page: Page) -> list[Cluster]:
        """Build the cluster list for `page`: at most one full-page TABLE cluster.

        Returns an empty list when the page has no size, or when cell assignment
        is enabled, the page has no cells, and empty clusters are not kept.
        """
        size = page.size
        if size is None:
            return []

        # Single cluster spanning the page from the origin to (width, height).
        table_cluster = Cluster(
            id=0,
            label=DocItemLabel.TABLE,
            bbox=BoundingBox(l=0.0, t=0.0, r=size.width, b=size.height),
            confidence=1.0,
            cells=[],
        )

        if self.options.skip_cell_assignment:
            return [table_cluster]

        assigned_cells = list(page.cells)
        table_cluster.cells = assigned_cells

        if not assigned_cells and not self.options.keep_empty_clusters:
            return []

        return [table_cluster]

    def _update_confidence(
        self, conv_res: ConversionResult, page: Page, clusters: list[Cluster]
    ) -> None:
        """Populate layout and OCR confidence scores for the page.

        The layout score is fixed at 1.0; the OCR score is the mean confidence
        of the page's OCR cells. `clusters` is accepted but not used.
        """
        with warnings.catch_warnings():
            # Averaging an empty list yields NaN; silence numpy's warnings for it.
            warnings.filterwarnings(
                "ignore",
                "Mean of empty slice|invalid value encountered in scalar divide",
                RuntimeWarning,
                "numpy",
            )

            page_scores = conv_res.confidence.pages[page.page_no]
            page_scores.layout_score = 1.0

            ocr_confidences = [cell.confidence for cell in page.cells if cell.from_ocr]
            page_scores.ocr_score = float(np.mean(ocr_confidences))
docling/experimental/pipeline/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Experimental pipeline modules."""