docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling has been flagged as possibly problematic.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
docling/models/extraction/nuextract_transformers_model.py
@@ -0,0 +1,305 @@
+import logging
+import sys
+import time
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Optional, Union
+
+import numpy as np
+from PIL.Image import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor, GenerationConfig
+
+from docling.datamodel.accelerator_options import (
+    AcceleratorOptions,
+)
+from docling.datamodel.base_models import VlmPrediction, VlmStopReason
+from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
+from docling.models.base_model import BaseVlmModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
+from docling.utils.accelerator_utils import decide_device
+
+_log = logging.getLogger(__name__)
+
+
+# Source code from https://huggingface.co/numind/NuExtract-2.0-8B
+def process_all_vision_info(messages, examples=None):
+    """
+    Process vision information from both messages and in-context examples, supporting batch processing.
+
+    Args:
+        messages: List of message dictionaries (single input) OR list of message lists (batch input)
+        examples: Optional list of example dictionaries (single input) OR list of example lists (batch)
+
+    Returns:
+        A flat list of all images in the correct order:
+        - For single input: example images followed by message images
+        - For batch input: interleaved as (item1 examples, item1 input, item2 examples, item2 input, etc.)
+        - Returns None if no images were found
+    """
+    try:
+        from qwen_vl_utils import fetch_image, process_vision_info
+    except ImportError:
+        raise ImportError(
+            "qwen-vl-utils is required for NuExtractTransformersModel. "
+            "Please install it with: pip install qwen-vl-utils"
+        )
+
+    from qwen_vl_utils import fetch_image, process_vision_info
+
+    # Helper function to extract images from examples
+    def extract_example_images(example_item):
+        if not example_item:
+            return []
+
+        # Handle both list of examples and single example
+        examples_to_process = (
+            example_item if isinstance(example_item, list) else [example_item]
+        )
+        images = []
+
+        for example in examples_to_process:
+            if (
+                isinstance(example.get("input"), dict)
+                and example["input"].get("type") == "image"
+            ):
+                images.append(fetch_image(example["input"]))
+
+        return images
+
+    # Normalize inputs to always be batched format
+    is_batch = messages and isinstance(messages[0], list)
+    messages_batch = messages if is_batch else [messages]
+    is_batch_examples = (
+        examples
+        and isinstance(examples, list)
+        and (isinstance(examples[0], list) or examples[0] is None)
+    )
+    examples_batch = (
+        examples
+        if is_batch_examples
+        else ([examples] if examples is not None else None)
+    )
+
+    # Ensure examples batch matches messages batch if provided
+    if examples and len(examples_batch) != len(messages_batch):
+        if not is_batch and len(examples_batch) == 1:
+            # Single example set for a single input is fine
+            pass
+        else:
+            raise ValueError("Examples batch length must match messages batch length")
+
+    # Process all inputs, maintaining correct order
+    all_images = []
+    for i, message_group in enumerate(messages_batch):
+        # Get example images for this input
+        if examples and i < len(examples_batch):
+            input_example_images = extract_example_images(examples_batch[i])
+            all_images.extend(input_example_images)
+
+        # Get message images for this input
+        input_message_images = process_vision_info(message_group)[0] or []
+        all_images.extend(input_message_images)
+
+    return all_images if all_images else None
+
+
+class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: InlineVlmOptions,
+    ):
+        self.enabled = enabled
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+            import torch
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=vlm_options.supported_devices,
+            )
+            _log.debug(f"Available device for NuExtract VLM: {self.device}")
+
+            self.max_new_tokens = vlm_options.max_new_tokens
+            self.temperature = vlm_options.temperature
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models(
+                    repo_id=self.vlm_options.repo_id,
+                    revision=self.vlm_options.revision,
+                )
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.processor = AutoProcessor.from_pretrained(
+                artifacts_path,
+                trust_remote_code=vlm_options.trust_remote_code,
+                use_fast=True,
+            )
+            self.processor.tokenizer.padding_side = "left"
+
+            self.vlm_model = AutoModelForImageTextToText.from_pretrained(
+                artifacts_path,
+                device_map=self.device,
+                dtype=self.vlm_options.torch_dtype,
+                _attn_implementation=(
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "sdpa"
+                ),
+                trust_remote_code=vlm_options.trust_remote_code,
+            )
+            if sys.version_info < (3, 14):
+                self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+            else:
+                self.vlm_model.eval()
+
+            # Load generation config
+            self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
+
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """
+        Batched inference for NuExtract VLM using the specialized input format.
+
+        Args:
+            image_batch: Iterable of PIL Images or numpy arrays
+            prompt: Either:
+                - str: Single template used for all images
+                - list[str]: List of templates (one per image, must match image count)
+        """
+        import torch
+        from PIL import Image as PILImage
+
+        # Normalize images to RGB PIL
+        pil_images: list[Image] = []
+        for img in image_batch:
+            if isinstance(img, np.ndarray):
+                if img.ndim == 3 and img.shape[2] in (3, 4):
+                    pil_img = PILImage.fromarray(img.astype(np.uint8))
+                elif img.ndim == 2:
+                    pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
+                else:
+                    raise ValueError(f"Unsupported numpy array shape: {img.shape}")
+            else:
+                pil_img = img
+            if pil_img.mode != "RGB":
+                pil_img = pil_img.convert("RGB")
+            pil_images.append(pil_img)
+
+        if not pil_images:
+            return
+
+        # Normalize templates (1 per image)
+        if isinstance(prompt, str):
+            templates = [prompt] * len(pil_images)
+        else:
+            if len(prompt) != len(pil_images):
+                raise ValueError(
+                    f"Number of templates ({len(prompt)}) must match number of images ({len(pil_images)})"
+                )
+            templates = prompt
+
+        # Construct NuExtract input format
+        inputs = []
+        for pil_img, template in zip(pil_images, templates):
+            input_item = {
+                "document": {"type": "image", "image": pil_img},
+                "template": template,
+            }
+            inputs.append(input_item)
+
+        # Create messages structure for batch processing
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [x["document"]],
+                }
+            ]
+            for x in inputs
+        ]
+
+        # Apply chat template to each example individually
+        texts = [
+            self.processor.tokenizer.apply_chat_template(
+                messages[i],
+                template=x["template"],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            for i, x in enumerate(inputs)
+        ]
+
+        # Process vision inputs using qwen-vl-utils
+        image_inputs = process_all_vision_info(messages)
+
+        # Process with the processor
+        processor_inputs = self.processor(
+            text=texts,
+            images=image_inputs,
+            padding=True,
+            return_tensors="pt",
+            **self.vlm_options.extra_processor_kwargs,
+        )
+        processor_inputs = {k: v.to(self.device) for k, v in processor_inputs.items()}
+
+        # Generate
+        gen_kwargs = {
+            **processor_inputs,
+            "max_new_tokens": self.max_new_tokens,
+            "generation_config": self.generation_config,
+            **self.vlm_options.extra_generation_config,
+        }
+        if self.temperature > 0:
+            gen_kwargs["do_sample"] = True
+            gen_kwargs["temperature"] = self.temperature
+        else:
+            gen_kwargs["do_sample"] = False
+
+        start_time = time.time()
+        with torch.inference_mode():
+            generated_ids = self.vlm_model.generate(**gen_kwargs)
+        generation_time = time.time() - start_time
+
+        # Trim generated sequences
+        input_len = processor_inputs["input_ids"].shape[1]
+        trimmed_sequences = generated_ids[:, input_len:]
+
+        # Decode with the processor/tokenizer
+        decoded_texts: list[str] = self.processor.batch_decode(
+            trimmed_sequences,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+
+        # Optional logging
+        num_tokens = None
+        if generated_ids.shape[0] > 0:  # type: ignore
+            # Todo: confirm num tokens is actually from first item, code was already like this
+            num_tokens = int(generated_ids[0].shape[0])
+            _log.debug(
+                f"Generated {num_tokens} tokens in {generation_time:.2f}s "
+                f"for batch size {generated_ids.shape[0]}."  # type: ignore
+            )
+
+        for text in decoded_texts:
+            # Apply decode_response to the output text
+            decoded_text = self.vlm_options.decode_response(text)
+            yield VlmPrediction(
+                text=decoded_text,
+                generation_time=generation_time,
+                num_tokens=num_tokens,
+                stop_reason=VlmStopReason.UNSPECIFIED,
+            )
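For orientation, here is a sketch of how this model can be driven end to end. It is illustrative, not an API documented by this release: `NUEXTRACT_SPEC` stands in for an `InlineVlmOptions` you construct or import yourself (the source comment above points at `numind/NuExtract-2.0-8B`), and `qwen-vl-utils` must be installed.

```python
# Hypothetical usage sketch for NuExtractTransformersModel; NUEXTRACT_SPEC is
# an assumed InlineVlmOptions instance, not a symbol confirmed by this diff.
from PIL import Image

from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.models.extraction.nuextract_transformers_model import (
    NuExtractTransformersModel,
)

model = NuExtractTransformersModel(
    enabled=True,
    artifacts_path=None,  # None triggers download_models() from the HF Hub
    accelerator_options=AcceleratorOptions(),
    vlm_options=NUEXTRACT_SPEC,  # e.g. repo_id="numind/NuExtract-2.0-8B"
)

# A single str template is broadcast to every image; a list must match 1:1.
template = '{"invoice_number": "verbatim-string", "total": "number"}'
pages = [Image.open("page_1.png"), Image.open("page_2.png")]

for prediction in model.process_images(pages, template):
    print(prediction.text)  # extraction result shaped by the template
```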
docling/models/factories/__init__.py
@@ -0,0 +1,47 @@
+import logging
+from functools import lru_cache
+
+from docling.models.factories.layout_factory import LayoutFactory
+from docling.models.factories.ocr_factory import OcrFactory
+from docling.models.factories.picture_description_factory import (
+    PictureDescriptionFactory,
+)
+from docling.models.factories.table_factory import TableStructureFactory
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache
+def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
+    factory = OcrFactory()
+    factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
+    logger.info("Registered ocr engines: %r", factory.registered_kind)
+    return factory
+
+
+@lru_cache
+def get_picture_description_factory(
+    allow_external_plugins: bool = False,
+) -> PictureDescriptionFactory:
+    factory = PictureDescriptionFactory()
+    factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
+    logger.info("Registered picture descriptions: %r", factory.registered_kind)
+    return factory
+
+
+@lru_cache
+def get_layout_factory(allow_external_plugins: bool = False) -> LayoutFactory:
+    factory = LayoutFactory()
+    factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
+    logger.info("Registered layout engines: %r", factory.registered_kind)
+    return factory
+
+
+@lru_cache
+def get_table_structure_factory(
+    allow_external_plugins: bool = False,
+) -> TableStructureFactory:
+    factory = TableStructureFactory()
+    factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
+    logger.info("Registered table structure engines: %r", factory.registered_kind)
+    return factory
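Because each accessor is wrapped in `@lru_cache`, every caller in a process shares a single factory per type, and plugins are loaded only once. A small sketch of resolving an OCR engine through them; the `"easyocr"` kind is an assumption about what docling's default plugin registers:

```python
# Sketch: resolving an OCR engine through the cached accessors above.
from docling.models.factories import get_ocr_factory

factory = get_ocr_factory()      # @lru_cache: repeat calls return the same object
print(factory.registered_kind)   # kinds contributed by the loaded plugins

# create_options() matches `kind` against each registered options class;
# "easyocr" is an assumed kind here.
ocr_options = factory.create_options(kind="easyocr")
```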
docling/models/factories/base_factory.py
@@ -0,0 +1,122 @@
+import enum
+import logging
+from abc import ABCMeta
+from typing import Generic, Optional, Type, TypeVar
+
+from pluggy import PluginManager
+from pydantic import BaseModel
+
+from docling.datamodel.pipeline_options import BaseOptions
+from docling.models.base_model import BaseModelWithOptions
+
+A = TypeVar("A", bound=BaseModelWithOptions)
+
+
+logger = logging.getLogger(__name__)
+
+
+class FactoryMeta(BaseModel):
+    kind: str
+    plugin_name: str
+    module: str
+
+
+class BaseFactory(Generic[A], metaclass=ABCMeta):
+    default_plugin_name = "docling"
+
+    def __init__(self, plugin_attr_name: str, plugin_name=default_plugin_name):
+        self.plugin_name = plugin_name
+        self.plugin_attr_name = plugin_attr_name
+
+        self._classes: dict[Type[BaseOptions], Type[A]] = {}
+        self._meta: dict[Type[BaseOptions], FactoryMeta] = {}
+
+    @property
+    def registered_kind(self) -> list[str]:
+        return [opt.kind for opt in self._classes.keys()]
+
+    def get_enum(self) -> enum.Enum:
+        return enum.Enum(
+            self.plugin_attr_name + "_enum",
+            names={kind: kind for kind in self.registered_kind},
+            type=str,
+            module=__name__,
+        )
+
+    @property
+    def classes(self):
+        return self._classes
+
+    @property
+    def registered_meta(self):
+        return self._meta
+
+    def create_instance(self, options: BaseOptions, **kwargs) -> A:
+        try:
+            _cls = self._classes[type(options)]
+            return _cls(options=options, **kwargs)
+        except KeyError:
+            raise RuntimeError(self._err_msg_on_class_not_found(options.kind))
+
+    def create_options(self, kind: str, *args, **kwargs) -> BaseOptions:
+        for opt_cls, _ in self._classes.items():
+            if opt_cls.kind == kind:
+                return opt_cls(*args, **kwargs)
+        raise RuntimeError(self._err_msg_on_class_not_found(kind))
+
+    def _err_msg_on_class_not_found(self, kind: str):
+        msg = []
+
+        for opt, cls in self._classes.items():
+            msg.append(f"\t{opt.kind!r} => {cls!r}")
+
+        msg_str = "\n".join(msg)
+
+        return f"No class found with the name {kind!r}, known classes are:\n{msg_str}"
+
+    def register(self, cls: Type[A], plugin_name: str, plugin_module_name: str):
+        opt_type = cls.get_options_type()
+
+        if opt_type in self._classes:
+            raise ValueError(
+                f"{opt_type.kind!r} already registered to class {self._classes[opt_type]!r}"
+            )
+
+        self._classes[opt_type] = cls
+        self._meta[opt_type] = FactoryMeta(
+            kind=opt_type.kind, plugin_name=plugin_name, module=plugin_module_name
+        )
+
+    def load_from_plugins(
+        self, plugin_name: Optional[str] = None, allow_external_plugins: bool = False
+    ):
+        plugin_name = plugin_name or self.plugin_name
+
+        plugin_manager = PluginManager(plugin_name)
+        plugin_manager.load_setuptools_entrypoints(plugin_name)
+
+        for plugin_name, plugin_module in plugin_manager.list_name_plugin():
+            plugin_module_name = str(plugin_module.__name__)  # type: ignore
+
+            if not allow_external_plugins and not plugin_module_name.startswith(
+                "docling."
+            ):
+                logger.warning(
+                    f"The plugin {plugin_name} will not be loaded because Docling is being executed with allow_external_plugins=false."
+                )
+                continue
+
+            attr = getattr(plugin_module, self.plugin_attr_name, None)
+
+            if callable(attr):
+                logger.info("Loading plugin %r", plugin_name)
+
+                config = attr()
+                self.process_plugin(config, plugin_name, plugin_module_name)
+
+    def process_plugin(self, config, plugin_name: str, plugin_module_name: str):
+        for item in config[self.plugin_attr_name]:
+            try:
+                self.register(item, plugin_name, plugin_module_name)
+            except ValueError:
+                logger.warning("%r already registered", item)
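The plugin contract implied by `load_from_plugins` and `process_plugin`: a plugin is a module reachable through a setuptools entry point in the `docling` group that exposes a callable named after the factory's `plugin_attr_name` (e.g. `ocr_engines`), returning a dict keyed by that same name. A sketch with hypothetical names (`my_pkg`, `MyOcrModel`):

```python
# my_pkg/docling_plugin.py -- hypothetical external plugin module.
# load_from_plugins() discovers it via the "docling" entry-point group;
# process_plugin() reads config["ocr_engines"] and registers each class
# under its options type via cls.get_options_type().
def ocr_engines():
    from my_pkg.ocr import MyOcrModel  # a BaseOcrModel subclass (assumed)

    return {"ocr_engines": [MyOcrModel]}


# In the external package's pyproject.toml:
# [project.entry-points."docling"]
# my_pkg = "my_pkg.docling_plugin"
```

Note that a module whose name does not start with `docling.` is skipped with a warning unless the factory is loaded with `allow_external_plugins=True`.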
docling/models/factories/layout_factory.py
@@ -0,0 +1,7 @@
+from docling.models.base_layout_model import BaseLayoutModel
+from docling.models.factories.base_factory import BaseFactory
+
+
+class LayoutFactory(BaseFactory[BaseLayoutModel]):
+    def __init__(self, *args, **kwargs):
+        super().__init__("layout_engines", *args, **kwargs)
docling/models/factories/ocr_factory.py
@@ -0,0 +1,11 @@
+import logging
+
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.models.factories.base_factory import BaseFactory
+
+logger = logging.getLogger(__name__)
+
+
+class OcrFactory(BaseFactory[BaseOcrModel]):
+    def __init__(self, *args, **kwargs):
+        super().__init__("ocr_engines", *args, **kwargs)
docling/models/factories/picture_description_factory.py
@@ -0,0 +1,11 @@
+import logging
+
+from docling.models.factories.base_factory import BaseFactory
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class PictureDescriptionFactory(BaseFactory[PictureDescriptionBaseModel]):
+    def __init__(self, *args, **kwargs):
+        super().__init__("picture_description", *args, **kwargs)
docling/models/factories/table_factory.py
@@ -0,0 +1,7 @@
+from docling.models.base_table_model import BaseTableStructureModel
+from docling.models.factories.base_factory import BaseFactory
+
+
+class TableStructureFactory(BaseFactory[BaseTableStructureModel]):
+    def __init__(self, *args, **kwargs):
+        super().__init__("table_structure_engines", *args, **kwargs)
docling/models/picture_description_base_model.py
@@ -0,0 +1,149 @@
+from abc import abstractmethod
+from collections.abc import Iterable
+from pathlib import Path
+from typing import List, Optional, Type, Union
+
+from docling_core.types.doc import (
+    DescriptionMetaField,
+    DoclingDocument,
+    NodeItem,
+    PictureClassificationLabel,
+    PictureItem,
+    PictureMeta,
+)
+from docling_core.types.doc.document import PictureDescriptionData
+from PIL import Image
+
+from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.pipeline_options import (
+    PictureDescriptionBaseOptions,
+)
+from docling.models.base_model import (
+    BaseItemAndImageEnrichmentModel,
+    BaseModelWithOptions,
+    ItemAndImageEnrichmentElement,
+)
+
+
+class PictureDescriptionBaseModel(
+    BaseItemAndImageEnrichmentModel, BaseModelWithOptions
+):
+    images_scale: float = 2.0
+
+    def __init__(
+        self,
+        *,
+        enabled: bool,
+        enable_remote_services: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: PictureDescriptionBaseOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        self.enabled = enabled
+        self.options = options
+        self.provenance = "not-implemented"
+
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        return self.enabled and isinstance(element, PictureItem)
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        raise NotImplementedError
+
+    def __call__(
+        self,
+        doc: DoclingDocument,
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
+    ) -> Iterable[NodeItem]:
+        if not self.enabled:
+            for element in element_batch:
+                yield element.item
+            return
+
+        images: List[Image.Image] = []
+        elements: List[PictureItem] = []
+        for el in element_batch:
+            assert isinstance(el.item, PictureItem)
+            describe_image = True
+            # Don't describe the image if it's smaller than the threshold
+            if len(el.item.prov) > 0:
+                prov = el.item.prov[0]  # PictureItems have at most a single provenance
+                page = doc.pages.get(prov.page_no)
+                if page is not None:
+                    page_area = page.size.width * page.size.height
+                    if page_area > 0:
+                        area_fraction = prov.bbox.area() / page_area
+                        if area_fraction < self.options.picture_area_threshold:
+                            describe_image = False
+            if describe_image and not _passes_classification(
+                el.item.meta,
+                self.options.classification_allow,
+                self.options.classification_deny,
+                self.options.classification_min_confidence,
+            ):
+                describe_image = False
+            if describe_image:
+                elements.append(el.item)
+                images.append(el.image)
+
+        outputs = self._annotate_images(images)
+
+        for item, output in zip(elements, outputs):
+            # FIXME: annotations is deprecated, remove once all consumers use meta.classification
+            item.annotations.append(
+                PictureDescriptionData(text=output, provenance=self.provenance)
+            )
+
+            # Store classification in the new meta field
+            if item.meta is None:
+                item.meta = PictureMeta()
+            item.meta.description = DescriptionMetaField(
+                text=output,
+                created_by=self.provenance,
+            )
+
+            yield item
+
+    @classmethod
+    @abstractmethod
+    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
+        pass
+
+
+def _passes_classification(
+    meta: Optional[PictureMeta],
+    allow: Optional[List[PictureClassificationLabel]],
+    deny: Optional[List[PictureClassificationLabel]],
+    min_confidence: float,
+) -> bool:
+    if not allow and not deny:
+        return True
+    predicted = None
+    if meta and meta.classification:
+        predicted = meta.classification.predictions
+    if not predicted:
+        return allow is None
+    if deny:
+        deny_set = {_label_value(label) for label in deny}
+        for entry in predicted:
+            if _meets_confidence(entry.confidence, min_confidence) and (
+                entry.class_name in deny_set
+            ):
+                return False
+    if allow:
+        allow_set = {_label_value(label) for label in allow}
+        return any(
+            _meets_confidence(entry.confidence, min_confidence)
+            and entry.class_name in allow_set
+            for entry in predicted
+        )
+    return True
+
+
+def _label_value(label: Union[PictureClassificationLabel, str]) -> str:
+    return label.value if isinstance(label, PictureClassificationLabel) else str(label)
+
+
+def _meets_confidence(confidence: Optional[float], min_confidence: float) -> bool:
+    return min_confidence <= 0 or (
+        confidence is not None and confidence >= min_confidence
+    )
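A concrete subclass only needs `get_options_type` and `_annotate_images`; the base `__call__` takes care of batching, the picture-area threshold, and the classification allow/deny filter. A minimal sketch, with hypothetical names and an assumed `ClassVar`-style `kind` declaration:

```python
# Hypothetical minimal PictureDescriptionBaseModel subclass.
from collections.abc import Iterable
from typing import ClassVar, Type

from PIL import Image

from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.picture_description_base_model import (
    PictureDescriptionBaseModel,
)


class EchoDescriptionOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[str] = "echo_description"  # assumed declaration style


class EchoDescriptionModel(PictureDescriptionBaseModel):
    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        return EchoDescriptionOptions

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Yield exactly one caption per image, in order; __call__ zips the
        # outputs back onto the PictureItems that passed the filters.
        for image in images:
            yield f"Picture of size {image.width}x{image.height} px"
```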