docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,305 @@
1
+ import logging
2
+ import sys
3
+ import time
4
+ from collections.abc import Iterable
5
+ from pathlib import Path
6
+ from typing import Any, Optional, Union
7
+
8
+ import numpy as np
9
+ from PIL.Image import Image
10
+ from transformers import AutoModelForImageTextToText, AutoProcessor, GenerationConfig
11
+
12
+ from docling.datamodel.accelerator_options import (
13
+ AcceleratorOptions,
14
+ )
15
+ from docling.datamodel.base_models import VlmPrediction, VlmStopReason
16
+ from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
17
+ from docling.models.base_model import BaseVlmModel
18
+ from docling.models.utils.hf_model_download import (
19
+ HuggingFaceModelDownloadMixin,
20
+ )
21
+ from docling.utils.accelerator_utils import decide_device
22
+
23
+ _log = logging.getLogger(__name__)
24
+
25
+
26
+ # Source code from https://huggingface.co/numind/NuExtract-2.0-8B
27
def process_all_vision_info(messages, examples=None):
    """
    Collect the images referenced by messages and in-context examples.

    Both single inputs and batches are supported; inputs are normalized to the
    batched form internally.

    Args:
        messages: List of message dictionaries (single input) OR list of message lists (batch input)
        examples: Optional list of example dictionaries (single input) OR list of example lists (batch)

    Returns:
        A flat list of all images in the correct order:
        - For single input: example images followed by message images
        - For batch input: interleaved as (item1 examples, item1 input, item2 examples, item2 input, etc.)
        - Returns None if no images were found

    Raises:
        ImportError: If ``qwen_vl_utils`` cannot be imported.
        ValueError: If examples are given and their batch length differs from
            the messages batch length.
    """
    # Imported lazily so the dependency is only needed when this function runs.
    try:
        from qwen_vl_utils import fetch_image, process_vision_info
    except ImportError as exc:
        raise ImportError(
            "qwen-vl-utils is required for NuExtractTransformersModel. "
            "Please install it with: pip install qwen-vl-utils"
        ) from exc

    # Helper function to extract images from examples
    def extract_example_images(example_item):
        if not example_item:
            return []

        # Handle both list of examples and single example
        examples_to_process = (
            example_item if isinstance(example_item, list) else [example_item]
        )
        return [
            fetch_image(example["input"])
            for example in examples_to_process
            if isinstance(example.get("input"), dict)
            and example["input"].get("type") == "image"
        ]

    # Normalize inputs to always be batched format
    is_batch = messages and isinstance(messages[0], list)
    messages_batch = messages if is_batch else [messages]
    is_batch_examples = (
        examples
        and isinstance(examples, list)
        and (isinstance(examples[0], list) or examples[0] is None)
    )
    examples_batch = (
        examples
        if is_batch_examples
        else ([examples] if examples is not None else None)
    )

    # Ensure examples batch matches messages batch if provided. A non-batched
    # input always yields a messages batch of length 1, so any mismatch is an
    # error; no special case is needed for the single-input form.
    if examples and len(examples_batch) != len(messages_batch):
        raise ValueError("Examples batch length must match messages batch length")

    # Process all inputs, maintaining correct order
    all_images = []
    for i, message_group in enumerate(messages_batch):
        # Example images for this input come first
        if examples and i < len(examples_batch):
            all_images.extend(extract_example_images(examples_batch[i]))

        # process_vision_info returns a tuple; element 0 holds the images
        # (or a falsy value when there are none).
        all_images.extend(process_vision_info(message_group)[0] or [])

    return all_images if all_images else None
106
+
107
+
108
class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
    """NuExtract vision-language model executed through Hugging Face transformers.

    When enabled, the constructor resolves the model directory, loads the
    processor, the model and the generation config. ``process_images`` then runs
    batched, template-driven generation over a set of images.
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        vlm_options: InlineVlmOptions,
    ):
        """Store the options and, if enabled, load processor, model and config.

        Args:
            enabled: When false, only ``enabled`` and ``vlm_options`` are set.
            artifacts_path: Directory holding model files. If None the model is
                fetched via ``download_models``; otherwise a sub-folder named
                after the repo id (``/`` replaced by ``--``) is used when it
                exists.
            accelerator_options: Source of the requested device and the
                flash-attention switch.
            vlm_options: Model-specific options (repo id, revision, dtype,
                generation limits, ...).
        """
        self.enabled = enabled
        self.vlm_options = vlm_options

        if not self.enabled:
            # A disabled model loads nothing.
            return

        import torch

        self.device = decide_device(
            accelerator_options.device,
            supported_devices=vlm_options.supported_devices,
        )
        _log.debug(f"Available device for NuExtract VLM: {self.device}")

        self.max_new_tokens = vlm_options.max_new_tokens
        self.temperature = vlm_options.temperature

        repo_cache_folder = vlm_options.repo_id.replace("/", "--")

        # Resolve the directory the weights are read from.
        model_dir = artifacts_path
        if model_dir is None:
            model_dir = self.download_models(
                repo_id=self.vlm_options.repo_id,
                revision=self.vlm_options.revision,
            )
        else:
            repo_dir = model_dir / repo_cache_folder
            if repo_dir.exists():
                model_dir = repo_dir

        self.processor = AutoProcessor.from_pretrained(
            model_dir,
            trust_remote_code=vlm_options.trust_remote_code,
            use_fast=True,
        )
        # Prompts are padded on the left so generated tokens start at the same
        # column for every row (process_images slices them off by input length).
        self.processor.tokenizer.padding_side = "left"

        wants_flash_attention = (
            self.device.startswith("cuda")
            and accelerator_options.cuda_use_flash_attention2
        )
        attn_implementation = "flash_attention_2" if wants_flash_attention else "sdpa"

        self.vlm_model = AutoModelForImageTextToText.from_pretrained(
            model_dir,
            device_map=self.device,
            dtype=self.vlm_options.torch_dtype,
            _attn_implementation=attn_implementation,
            trust_remote_code=vlm_options.trust_remote_code,
        )
        # torch.compile is only applied on interpreters older than 3.14;
        # otherwise the model is just switched to eval mode.
        if sys.version_info < (3, 14):
            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
        else:
            self.vlm_model.eval()

        # Load generation config
        self.generation_config = GenerationConfig.from_pretrained(model_dir)

    def process_images(
        self,
        image_batch: Iterable[Union[Image, np.ndarray]],
        prompt: Union[str, list[str]],
    ) -> Iterable[VlmPrediction]:
        """
        Batched inference for NuExtract VLM using the specialized input format.

        Args:
            image_batch: Iterable of PIL Images or numpy arrays
            prompt: Either:
                - str: Single template used for all images
                - list[str]: List of templates (one per image, must match image count)

        Yields:
            One ``VlmPrediction`` per image, in input order. Nothing is yielded
            for an empty batch.

        Raises:
            ValueError: For numpy arrays of unsupported shape, or when the
                number of templates differs from the number of images.
        """
        import torch
        from PIL import Image as PILImage

        def to_rgb(img):
            # Arrays become PIL images (HxWx3/4 or HxW); everything ends up RGB.
            if not isinstance(img, np.ndarray):
                pil_img = img
            elif img.ndim == 3 and img.shape[2] in (3, 4):
                pil_img = PILImage.fromarray(img.astype(np.uint8))
            elif img.ndim == 2:
                pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
            else:
                raise ValueError(f"Unsupported numpy array shape: {img.shape}")
            return pil_img if pil_img.mode == "RGB" else pil_img.convert("RGB")

        pil_images: list[Image] = [to_rgb(img) for img in image_batch]
        if not pil_images:
            return

        # One template per image
        if isinstance(prompt, str):
            templates = [prompt] * len(pil_images)
        elif len(prompt) != len(pil_images):
            raise ValueError(
                f"Number of templates ({len(prompt)}) must match number of images ({len(pil_images)})"
            )
        else:
            templates = prompt

        # One single-turn conversation per image, carrying only the image.
        messages = [
            [
                {
                    "role": "user",
                    "content": [{"type": "image", "image": pil_img}],
                }
            ]
            for pil_img in pil_images
        ]

        # The template is handed to the chat template per conversation.
        texts = [
            self.processor.tokenizer.apply_chat_template(
                conversation,
                template=template,
                tokenize=False,
                add_generation_prompt=True,
            )
            for conversation, template in zip(messages, templates)
        ]

        # Gather the images in processor order
        image_inputs = process_all_vision_info(messages)

        encoded = self.processor(
            text=texts,
            images=image_inputs,
            padding=True,
            return_tensors="pt",
            **self.vlm_options.extra_processor_kwargs,
        )
        processor_inputs = {
            name: tensor.to(self.device) for name, tensor in encoded.items()
        }

        # Generation arguments; extra_generation_config may override the
        # defaults above it, while the sampling switches below are set last.
        gen_kwargs = {
            **processor_inputs,
            "max_new_tokens": self.max_new_tokens,
            "generation_config": self.generation_config,
            **self.vlm_options.extra_generation_config,
        }
        sampling = self.temperature > 0
        gen_kwargs["do_sample"] = sampling
        if sampling:
            gen_kwargs["temperature"] = self.temperature

        start_time = time.time()
        with torch.inference_mode():
            generated_ids = self.vlm_model.generate(**gen_kwargs)
        generation_time = time.time() - start_time

        # Drop the prompt columns so only newly generated tokens are decoded.
        prompt_width = processor_inputs["input_ids"].shape[1]
        decoded_texts: list[str] = self.processor.batch_decode(
            generated_ids[:, prompt_width:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        # Optional logging
        # NOTE(review): num_tokens is the length of the first *untrimmed*
        # sequence and is reported for every prediction of the batch.
        # Todo: confirm num tokens is actually from first item, code was already like this
        num_tokens = None
        if generated_ids.shape[0] > 0:  # type: ignore
            num_tokens = int(generated_ids[0].shape[0])
            _log.debug(
                f"Generated {num_tokens} tokens in {generation_time:.2f}s "
                f"for batch size {generated_ids.shape[0]}."  # type: ignore
            )

        for text in decoded_texts:
            # Post-process the raw model output with the configured decoder.
            yield VlmPrediction(
                text=self.vlm_options.decode_response(text),
                generation_time=generation_time,
                num_tokens=num_tokens,
                stop_reason=VlmStopReason.UNSPECIFIED,
            )
@@ -0,0 +1,47 @@
1
+ import logging
2
+ from functools import lru_cache
3
+
4
+ from docling.models.factories.layout_factory import LayoutFactory
5
+ from docling.models.factories.ocr_factory import OcrFactory
6
+ from docling.models.factories.picture_description_factory import (
7
+ PictureDescriptionFactory,
8
+ )
9
+ from docling.models.factories.table_factory import TableStructureFactory
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@lru_cache
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
    """Return the OCR factory populated from plugins.

    The result is memoized by ``lru_cache``, so each distinct value of
    ``allow_external_plugins`` builds and loads its factory only once.
    """
    ocr_factory = OcrFactory()
    ocr_factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
    logger.info("Registered ocr engines: %r", ocr_factory.registered_kind)
    return ocr_factory
20
+
21
+
22
@lru_cache
def get_picture_description_factory(
    allow_external_plugins: bool = False,
) -> PictureDescriptionFactory:
    """Return the picture-description factory populated from plugins.

    The result is memoized by ``lru_cache``, so each distinct value of
    ``allow_external_plugins`` builds and loads its factory only once.
    """
    description_factory = PictureDescriptionFactory()
    description_factory.load_from_plugins(
        allow_external_plugins=allow_external_plugins
    )
    logger.info(
        "Registered picture descriptions: %r", description_factory.registered_kind
    )
    return description_factory
30
+
31
+
32
@lru_cache
def get_layout_factory(allow_external_plugins: bool = False) -> LayoutFactory:
    """Return the layout factory populated from plugins.

    The result is memoized by ``lru_cache``, so each distinct value of
    ``allow_external_plugins`` builds and loads its factory only once.
    """
    layout_factory = LayoutFactory()
    layout_factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
    logger.info("Registered layout engines: %r", layout_factory.registered_kind)
    return layout_factory
38
+
39
+
40
@lru_cache
def get_table_structure_factory(
    allow_external_plugins: bool = False,
) -> TableStructureFactory:
    """Return the table-structure factory populated from plugins.

    The result is memoized by ``lru_cache``, so each distinct value of
    ``allow_external_plugins`` builds and loads its factory only once.
    """
    table_factory = TableStructureFactory()
    table_factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
    logger.info(
        "Registered table structure engines: %r", table_factory.registered_kind
    )
    return table_factory
@@ -0,0 +1,122 @@
1
+ import enum
2
+ import logging
3
+ from abc import ABCMeta
4
+ from typing import Generic, Optional, Type, TypeVar
5
+
6
+ from pluggy import PluginManager
7
+ from pydantic import BaseModel
8
+
9
+ from docling.datamodel.pipeline_options import BaseOptions
10
+ from docling.models.base_model import BaseModelWithOptions
11
+
12
+ A = TypeVar("A", bound=BaseModelWithOptions)
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class FactoryMeta(BaseModel):
    """Bookkeeping record describing where a registered class came from."""

    # ``kind`` value of the options type the class was registered under.
    kind: str
    # Name of the plugin that contributed the class.
    plugin_name: str
    # Name of the module the plugin was loaded from.
    module: str
22
+
23
+
24
class BaseFactory(Generic[A], metaclass=ABCMeta):
    """Registry that maps option types to the model classes consuming them.

    Classes are registered under the options type returned by their
    ``get_options_type()`` and can be discovered from pluggy plugins. Each
    plugin module exposes a callable named ``plugin_attr_name`` whose result is
    indexed with that same name to obtain the classes to register.
    """

    default_plugin_name = "docling"

    def __init__(self, plugin_attr_name: str, plugin_name=default_plugin_name):
        """Create an empty factory.

        Args:
            plugin_attr_name: Attribute looked up on plugin modules, and key
                read from the configuration that attribute returns.
            plugin_name: Default plugin/entry-point name for
                ``load_from_plugins``.
        """
        self.plugin_name = plugin_name
        self.plugin_attr_name = plugin_attr_name

        self._classes: dict[Type[BaseOptions], Type[A]] = {}
        self._meta: dict[Type[BaseOptions], FactoryMeta] = {}

    @property
    def registered_kind(self) -> list[str]:
        """``kind`` of every registered options type, in registration order."""
        return [opt.kind for opt in self._classes]

    def get_enum(self) -> Type[enum.Enum]:
        """Build a ``str``-mixed Enum class with one member per registered kind."""
        return enum.Enum(
            self.plugin_attr_name + "_enum",
            names={kind: kind for kind in self.registered_kind},
            type=str,
            module=__name__,
        )

    @property
    def classes(self):
        """Mapping from options type to registered class (the live dict)."""
        return self._classes

    @property
    def registered_meta(self):
        """Mapping from options type to its ``FactoryMeta`` (the live dict)."""
        return self._meta

    def create_instance(self, options: BaseOptions, **kwargs) -> A:
        """Instantiate the class registered for ``type(options)``.

        Raises:
            RuntimeError: If no class is registered for the options type.
        """
        # Only the registry lookup is guarded: a KeyError raised inside the
        # model constructor must not be reported as "No class found".
        try:
            model_cls = self._classes[type(options)]
        except KeyError as exc:
            raise RuntimeError(
                self._err_msg_on_class_not_found(options.kind)
            ) from exc
        return model_cls(options=options, **kwargs)

    def create_options(self, kind: str, *args, **kwargs) -> BaseOptions:
        """Instantiate the first registered options type whose ``kind`` matches.

        Raises:
            RuntimeError: If no registered options type has that kind.
        """
        for opt_cls in self._classes:
            if opt_cls.kind == kind:
                return opt_cls(*args, **kwargs)
        raise RuntimeError(self._err_msg_on_class_not_found(kind))

    def _err_msg_on_class_not_found(self, kind: str):
        """Format the "not found" message, listing all known registrations."""
        msg_str = "\n".join(
            f"\t{opt.kind!r} => {cls!r}" for opt, cls in self._classes.items()
        )

        return f"No class found with the name {kind!r}, known classes are:\n{msg_str}"

    def register(self, cls: Type[A], plugin_name: str, plugin_module_name: str):
        """Register ``cls`` under its options type and record its origin.

        Raises:
            ValueError: If the options type is already registered.
        """
        opt_type = cls.get_options_type()

        if opt_type in self._classes:
            raise ValueError(
                f"{opt_type.kind!r} already registered to class {self._classes[opt_type]!r}"
            )

        self._classes[opt_type] = cls
        self._meta[opt_type] = FactoryMeta(
            kind=opt_type.kind, plugin_name=plugin_name, module=plugin_module_name
        )

    def load_from_plugins(
        self, plugin_name: Optional[str] = None, allow_external_plugins: bool = False
    ):
        """Discover plugins through pluggy and register what they provide.

        Args:
            plugin_name: Name passed to the ``PluginManager`` and to its
                entry-point loading; defaults to ``self.plugin_name``.
            allow_external_plugins: When false, plugins whose module name does
                not start with ``"docling."`` are skipped with a warning.
        """
        plugin_name = plugin_name or self.plugin_name

        plugin_manager = PluginManager(plugin_name)
        plugin_manager.load_setuptools_entrypoints(plugin_name)

        # A distinct loop variable avoids shadowing the ``plugin_name`` argument.
        for loaded_name, plugin_module in plugin_manager.list_name_plugin():
            plugin_module_name = str(plugin_module.__name__)  # type: ignore

            if not allow_external_plugins and not plugin_module_name.startswith(
                "docling."
            ):
                logger.warning(
                    f"The plugin {loaded_name} will not be loaded because Docling is being executed with allow_external_plugins=false."
                )
                continue

            attr = getattr(plugin_module, self.plugin_attr_name, None)

            # Modules without a matching callable contribute nothing here.
            if callable(attr):
                logger.info("Loading plugin %r", loaded_name)

                config = attr()
                self.process_plugin(config, loaded_name, plugin_module_name)

    def process_plugin(self, config, plugin_name: str, plugin_module_name: str):
        """Register every class listed under ``config[self.plugin_attr_name]``.

        Duplicate registrations are logged as warnings instead of raised.
        """
        for item in config[self.plugin_attr_name]:
            try:
                self.register(item, plugin_name, plugin_module_name)
            except ValueError:
                logger.warning("%r already registered", item)
@@ -0,0 +1,7 @@
1
+ from docling.models.base_layout_model import BaseLayoutModel
2
+ from docling.models.factories.base_factory import BaseFactory
3
+
4
+
5
class LayoutFactory(BaseFactory[BaseLayoutModel]):
    """Factory for layout models, bound to the ``layout_engines`` plugin attribute."""

    def __init__(self, *args, **kwargs):
        # Fix the plugin attribute name; remaining arguments are forwarded as given.
        plugin_attr_name = "layout_engines"
        super().__init__(plugin_attr_name, *args, **kwargs)
@@ -0,0 +1,11 @@
1
+ import logging
2
+
3
+ from docling.models.base_ocr_model import BaseOcrModel
4
+ from docling.models.factories.base_factory import BaseFactory
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class OcrFactory(BaseFactory[BaseOcrModel]):
    """Factory for OCR models, bound to the ``ocr_engines`` plugin attribute."""

    def __init__(self, *args, **kwargs):
        # Fix the plugin attribute name; remaining arguments are forwarded as given.
        plugin_attr_name = "ocr_engines"
        super().__init__(plugin_attr_name, *args, **kwargs)
@@ -0,0 +1,11 @@
1
+ import logging
2
+
3
+ from docling.models.factories.base_factory import BaseFactory
4
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class PictureDescriptionFactory(BaseFactory[PictureDescriptionBaseModel]):
    """Factory for picture-description models, bound to the ``picture_description`` plugin attribute."""

    def __init__(self, *args, **kwargs):
        # Fix the plugin attribute name; remaining arguments are forwarded as given.
        plugin_attr_name = "picture_description"
        super().__init__(plugin_attr_name, *args, **kwargs)
@@ -0,0 +1,7 @@
1
+ from docling.models.base_table_model import BaseTableStructureModel
2
+ from docling.models.factories.base_factory import BaseFactory
3
+
4
+
5
class TableStructureFactory(BaseFactory[BaseTableStructureModel]):
    """Factory for table-structure models, bound to the ``table_structure_engines`` plugin attribute."""

    def __init__(self, *args, **kwargs):
        # Fix the plugin attribute name; remaining arguments are forwarded as given.
        plugin_attr_name = "table_structure_engines"
        super().__init__(plugin_attr_name, *args, **kwargs)
@@ -0,0 +1,149 @@
1
+ from abc import abstractmethod
2
+ from collections.abc import Iterable
3
+ from pathlib import Path
4
+ from typing import List, Optional, Type, Union
5
+
6
+ from docling_core.types.doc import (
7
+ DescriptionMetaField,
8
+ DoclingDocument,
9
+ NodeItem,
10
+ PictureClassificationLabel,
11
+ PictureItem,
12
+ PictureMeta,
13
+ )
14
+ from docling_core.types.doc.document import PictureDescriptionData
15
+ from PIL import Image
16
+
17
+ from docling.datamodel.accelerator_options import AcceleratorOptions
18
+ from docling.datamodel.pipeline_options import (
19
+ PictureDescriptionBaseOptions,
20
+ )
21
+ from docling.models.base_model import (
22
+ BaseItemAndImageEnrichmentModel,
23
+ BaseModelWithOptions,
24
+ ItemAndImageEnrichmentElement,
25
+ )
26
+
27
+
28
class PictureDescriptionBaseModel(
    BaseItemAndImageEnrichmentModel, BaseModelWithOptions
):
    """Base class for enrichment models that attach text descriptions to pictures.

    Subclasses provide ``_annotate_images`` and ``get_options_type``. This
    class handles the filtering of pictures (by relative area and by
    classification) and the writing of results onto the items.
    """

    images_scale: float = 2.0

    def __init__(
        self,
        *,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionBaseOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Keep the enabled flag and options; the other arguments are unused here."""
        self.enabled = enabled
        self.options = options
        # Placeholder provenance stamped on every description written by __call__.
        self.provenance = "not-implemented"

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """Accept only ``PictureItem`` elements, and only while enabled."""
        is_picture = isinstance(element, PictureItem)
        return self.enabled and is_picture

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        """Produce one description per image; must be overridden."""
        raise NotImplementedError

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        """Describe the eligible pictures of a batch and yield the updated items.

        When disabled, every item of the batch is yielded unchanged. Otherwise
        only the pictures that pass the area and classification filters are
        annotated and yielded.
        """
        if not self.enabled:
            yield from (element.item for element in element_batch)
            return

        def below_area_threshold(picture: PictureItem) -> bool:
            # True when the picture covers a smaller fraction of its page than
            # the configured threshold; undeterminable cases count as "not below".
            if not picture.prov:
                return False
            prov = picture.prov[0]  # PictureItems have at most a single provenance
            page = doc.pages.get(prov.page_no)
            if page is None:
                return False
            page_area = page.size.width * page.size.height
            if page_area <= 0:
                return False
            return prov.bbox.area() / page_area < self.options.picture_area_threshold

        elements: List[PictureItem] = []
        images: List[Image.Image] = []
        for el in element_batch:
            assert isinstance(el.item, PictureItem)
            if below_area_threshold(el.item):
                continue
            if not _passes_classification(
                el.item.meta,
                self.options.classification_allow,
                self.options.classification_deny,
                self.options.classification_min_confidence,
            ):
                continue
            elements.append(el.item)
            images.append(el.image)

        descriptions = self._annotate_images(images)

        for item, description in zip(elements, descriptions):
            # FIXME: ``annotations`` is deprecated; kept alongside the meta field below.
            item.annotations.append(
                PictureDescriptionData(text=description, provenance=self.provenance)
            )

            # Store the description in the meta field as well.
            if item.meta is None:
                item.meta = PictureMeta()
            item.meta.description = DescriptionMetaField(
                text=description,
                created_by=self.provenance,
            )

            yield item

    @classmethod
    @abstractmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        """Return the options class this model is configured with."""
110
+
111
+
112
def _passes_classification(
    meta: Optional[PictureMeta],
    allow: Optional[List[PictureClassificationLabel]],
    deny: Optional[List[PictureClassificationLabel]],
    min_confidence: float,
) -> bool:
    """Decide whether a picture passes the allow/deny classification filters.

    Args:
        meta: Picture metadata; its ``classification.predictions`` are used
            when present.
        allow: Labels to accept; ``None`` or empty means no allow filter.
        deny: Labels to reject; ``None`` or empty means no deny filter.
        min_confidence: Threshold a prediction must meet to be considered
            (see ``_meets_confidence``).

    Returns:
        True when the picture should be processed. A confident deny match
        always rejects. With an allow filter, at least one confident
        prediction must match it.
    """
    if not allow and not deny:
        return True

    predicted = None
    if meta and meta.classification:
        predicted = meta.classification.predictions

    if not predicted:
        # Nothing can match without predictions: reject only if an allow
        # filter is active. An empty allow list counts as "no filter", like
        # in every other branch of this function.
        return not allow

    if deny:
        deny_set = {_label_value(label) for label in deny}
        if any(
            _meets_confidence(entry.confidence, min_confidence)
            and entry.class_name in deny_set
            for entry in predicted
        ):
            return False

    if allow:
        allow_set = {_label_value(label) for label in allow}
        return any(
            _meets_confidence(entry.confidence, min_confidence)
            and entry.class_name in allow_set
            for entry in predicted
        )

    return True
140
+
141
+
142
def _label_value(label: Union[PictureClassificationLabel, str]) -> str:
    """Return the enum member's ``value`` for labels, ``str(label)`` otherwise."""
    if isinstance(label, PictureClassificationLabel):
        return label.value
    return str(label)
144
+
145
+
146
def _meets_confidence(confidence: Optional[float], min_confidence: float) -> bool:
    """Check a prediction confidence against a minimum threshold.

    A non-positive threshold accepts everything, including a missing
    confidence. Otherwise the confidence must be present and at least
    ``min_confidence``.
    """
    if min_confidence <= 0:
        return True
    if confidence is None:
        return False
    return confidence >= min_confidence
+ )
File without changes