docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import threading
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import List, Literal, Optional, Union
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from docling_core.types.doc import (
|
|
9
|
+
DoclingDocument,
|
|
10
|
+
NodeItem,
|
|
11
|
+
PictureClassificationClass,
|
|
12
|
+
PictureClassificationData,
|
|
13
|
+
PictureClassificationMetaField,
|
|
14
|
+
PictureItem,
|
|
15
|
+
PictureMeta,
|
|
16
|
+
)
|
|
17
|
+
from docling_core.types.doc.document import PictureClassificationPrediction
|
|
18
|
+
from PIL import Image
|
|
19
|
+
from pydantic import BaseModel
|
|
20
|
+
|
|
21
|
+
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
22
|
+
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
|
23
|
+
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
|
24
|
+
from docling.models.utils.hf_model_download import HuggingFaceModelDownloadMixin
|
|
25
|
+
from docling.utils.accelerator_utils import decide_device
|
|
26
|
+
|
|
27
|
+
# Global lock for model initialization to prevent threading issues
|
|
28
|
+
_model_init_lock = threading.Lock()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DocumentPictureClassifierOptions(BaseModel):
    """
    Settings model for the DocumentPictureClassifier.

    Holds the identifier and revision of the model repository and derives
    the folder name under which a local copy of that repository is looked up.
    """

    # Fixed discriminator value identifying this options type.
    kind: Literal["document_picture_classifier"] = "document_picture_classifier"
    # Repository identifier of the classification model.
    repo_id: str = "docling-project/DocumentFigureClassifier-v2.0"
    # Repository revision handed to the model download.
    revision: str = "main"

    @property
    def repo_cache_folder(self) -> str:
        """
        Folder name derived from ``repo_id``.

        Returns
        -------
        str
            ``repo_id`` with every "/" replaced by "--".
        """
        parts = self.repo_id.split("/")
        return "--".join(parts)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class DocumentPictureClassifier(
    BaseItemAndImageEnrichmentModel, HuggingFaceModelDownloadMixin
):
    """
    A model for classifying pictures in documents.

    This class enriches document pictures with predicted classifications
    based on the label set of the loaded image-classification model.

    Attributes
    ----------
    enabled : bool
        Whether the classifier is enabled for use.
    options : DocumentPictureClassifierOptions
        Configuration options for the classifier.
    images_scale : int
        Class-level constant (2).
        NOTE(review): presumably read by the base enrichment model when
        rendering picture crops - confirm against the base class.

    The following private attributes exist only when the classifier is
    enabled: ``_device`` (device string chosen by ``decide_device``),
    ``_processor`` (image processor), ``_model`` (classification model) and
    ``_classes`` (the model config's ``id2label`` mapping).

    Methods
    -------
    __init__(enabled, artifacts_path, options, accelerator_options)
        Initializes the classifier with specified configurations.
    is_processable(doc, element)
        Checks if the given element can be processed by the classifier.
    __call__(doc, element_batch)
        Processes a batch of elements and adds classification annotations.
    """

    images_scale = 2

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: DocumentPictureClassifierOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Initializes the DocumentPictureClassifier.

        Parameters
        ----------
        enabled : bool
            Indicates whether the classifier is enabled. When False, no
            model is loaded.
        artifacts_path : Optional[Union[Path, str]]
            Path to the directory containing model artifacts. When None, the
            model is downloaded. When given, the sub-folder named
            ``options.repo_cache_folder`` is used if it exists, otherwise the
            path itself is used.
        options : DocumentPictureClassifierOptions
            Configuration options for the classifier.
        accelerator_options : AcceleratorOptions
            Options for configuring the device and parallelism.
        """
        self.enabled = enabled
        self.options = options

        if self.enabled:
            self._device = decide_device(accelerator_options.device)

            repo_cache_folder = self.options.repo_cache_folder

            if artifacts_path is None:
                artifacts_path = self.download_models(
                    self.options.repo_id, revision=self.options.revision
                )
            else:
                # Accept plain strings as well; the "/" operator below
                # requires a Path.
                artifacts_path = Path(artifacts_path)
                if (artifacts_path / repo_cache_folder).exists():
                    artifacts_path = artifacts_path / repo_cache_folder

            # Imported lazily so that a disabled classifier does not pull in
            # torch / transformers.
            import torch
            from transformers import AutoImageProcessor, AutoModelForImageClassification

            # Model construction is serialized across threads by the
            # module-level lock.
            with _model_init_lock:
                # Image processor
                self._processor = AutoImageProcessor.from_pretrained(
                    artifacts_path, use_fast=True
                )

                # Model
                self._model = AutoModelForImageClassification.from_pretrained(
                    artifacts_path,
                    device_map=self._device,
                )

                # Select inference mode explicitly on every Python version;
                # previously this happened only when compilation was skipped.
                self._model.eval()

                # Compilation is skipped on Python 3.14 and newer.
                if sys.version_info < (3, 14):
                    self._model = torch.compile(self._model)  # type: ignore

            self._classes = self._model.config.id2label

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """
        Determines if the given element can be processed by the classifier.

        Parameters
        ----------
        doc : DoclingDocument
            The document containing the element.
        element : NodeItem
            The element to be checked.

        Returns
        -------
        bool
            True if the element is a PictureItem and processing is enabled; False otherwise.
        """
        return self.enabled and isinstance(element, PictureItem)

    @staticmethod
    def _as_rgb_image(raw_image) -> Image.Image:
        """Return ``raw_image`` (PIL image or numpy array) as an RGB PIL image."""
        if isinstance(raw_image, Image.Image):
            return raw_image.convert("RGB")
        if isinstance(raw_image, np.ndarray):
            return Image.fromarray(raw_image).convert("RGB")
        raise TypeError(
            "Supported input formats are PIL.Image.Image or numpy.ndarray."
        )

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        """
        Processes a batch of elements and enriches them with classification predictions.

        Parameters
        ----------
        doc : DoclingDocument
            The document containing the elements to be processed.
        element_batch : Iterable[ItemAndImageEnrichmentElement]
            A batch of pictures to classify.

        Returns
        -------
        Iterable[NodeItem]
            The picture items, in input order. For each item a
            PictureClassificationData entry is appended to ``annotations``
            (deprecated) and ``meta.classification`` is set; both list every
            class sorted by descending confidence. When the classifier is
            disabled the items are yielded untouched.

        Raises
        ------
        TypeError
            If an element image is neither a PIL image nor a numpy array.
        """
        if not self.enabled:
            for element in element_batch:
                yield element.item
            return

        images: List[Image.Image] = []
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el.item, PictureItem)
            elements.append(el.item)
            images.append(self._as_rgb_image(el.image))

        # Nothing to classify: do not hand an empty image list to the processor.
        if not images:
            return

        import torch

        inputs = self._processor(images=images, return_tensors="pt")
        # move inputs to the same device as the model
        inputs = {k: v.to(self._device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits  # (batch_size, num_classes)
            probs_batch = logits.softmax(dim=1)  # (batch_size, num_classes)
            probs_batch = probs_batch.cpu().numpy().tolist()

        # One (label, probability) list per image, most confident class first.
        predictions_batch = []
        for probs_image in probs_batch:
            preds = [(self._classes[i], prob) for i, prob in enumerate(probs_image)]
            preds.sort(key=lambda t: t[1], reverse=True)
            predictions_batch.append(preds)

        for item, output in zip(elements, predictions_batch):
            predicted_classes = [
                PictureClassificationClass(
                    class_name=pred[0],
                    confidence=pred[1],
                )
                for pred in output
            ]

            # FIXME: annotations is deprecated, remove once all consumers use meta.classification
            item.annotations.append(
                PictureClassificationData(
                    provenance="DocumentPictureClassifier",
                    predicted_classes=predicted_classes,
                )
            )

            # Store classification in the new meta field
            predictions = [
                PictureClassificationPrediction(
                    class_name=pred.class_name,
                    confidence=pred.confidence,
                    created_by="DocumentPictureClassifier",
                )
                for pred in predicted_classes
            ]
            classification_data = PictureClassificationMetaField(
                predictions=predictions,
            )

            # Keep any existing meta content and only set the classification.
            if item.meta is not None:
                item.meta.classification = classification_data
            else:
                item.meta = PictureMeta(classification=classification_data)

            yield item
|
|
File without changes
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional, Type, Union
|
|
5
|
+
|
|
6
|
+
from PIL import Image
|
|
7
|
+
|
|
8
|
+
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
9
|
+
from docling.datamodel.pipeline_options import (
|
|
10
|
+
PictureDescriptionApiOptions,
|
|
11
|
+
PictureDescriptionBaseOptions,
|
|
12
|
+
)
|
|
13
|
+
from docling.exceptions import OperationNotAllowed
|
|
14
|
+
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
|
15
|
+
from docling.utils.api_image_request import api_image_request
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
    """
    Picture description model that obtains descriptions through
    ``api_image_request``, issuing one request per image from a thread pool.
    """

    # elements_batch_size = 4

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        """Return the options class handled by this model."""
        return PictureDescriptionApiOptions

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionApiOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Set up the model and check that remote calls are permitted.

        All arguments are forwarded unchanged to the base class initializer.

        Raises
        ------
        OperationNotAllowed
            If the model is enabled while ``enable_remote_services`` is False.
        """
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: PictureDescriptionApiOptions
        # Size of the thread pool used in _annotate_images.
        self.concurrency = self.options.concurrency

        remote_calls_forbidden = self.enabled and not enable_remote_services
        if remote_calls_forbidden:
            raise OperationNotAllowed(
                "Connections to remote services is only allowed when set explicitly. "
                "pipeline_options.enable_remote_services=True."
            )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        """
        Yield one description per image, in input order.

        Each image is sent in its own request; up to ``self.concurrency``
        requests run at the same time.
        """

        # Note: a single batched request would be possible in principle,
        # but not every API accepts it. For example, vllm won't allow more than 1.
        def _describe(picture):
            response = api_image_request(
                image=picture,
                prompt=self.options.prompt,
                url=self.options.url,
                timeout=self.options.timeout,
                headers=self.options.headers,
                **self.options.params,
            )
            # Only the first of the three returned values is used.
            text, _, _ = response
            return text

        with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
            yield from pool.map(_describe, images)
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import threading
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Type, Union
|
|
6
|
+
|
|
7
|
+
from PIL import Image
|
|
8
|
+
|
|
9
|
+
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
10
|
+
from docling.datamodel.pipeline_options import (
|
|
11
|
+
PictureDescriptionBaseOptions,
|
|
12
|
+
PictureDescriptionVlmOptions,
|
|
13
|
+
)
|
|
14
|
+
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
|
15
|
+
from docling.models.utils.hf_model_download import (
|
|
16
|
+
HuggingFaceModelDownloadMixin,
|
|
17
|
+
)
|
|
18
|
+
from docling.utils.accelerator_utils import decide_device
|
|
19
|
+
|
|
20
|
+
# Global lock for model initialization to prevent threading issues
|
|
21
|
+
_model_init_lock = threading.Lock()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PictureDescriptionVlmModel(
    PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
):
    """
    Picture description model backed by a locally loaded
    image-text-to-text model from the ``transformers`` library.

    When enabled, the instance holds ``device`` (device string chosen by
    ``decide_device``), ``processor``, ``model`` and ``provenance`` (the
    repository id of the model).
    """

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        """Return the options class handled by this model."""
        return PictureDescriptionVlmOptions

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionVlmOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Set up the model and, when enabled, load processor and weights.

        Parameters
        ----------
        enabled : bool
            Whether the model is enabled. When False, nothing is loaded.
        enable_remote_services : bool
            Forwarded to the base class initializer.
        artifacts_path : Optional[Union[Path, str]]
            Directory holding model artifacts. When None, the model is
            downloaded; otherwise the sub-folder named
            ``options.repo_cache_folder`` inside it is used.
        options : PictureDescriptionVlmOptions
            Configuration options (repository id, prompt, generation config).
        accelerator_options : AcceleratorOptions
            Options used to choose the device and the attention
            implementation.

        Raises
        ------
        ImportError
            If torch or the required transformers classes cannot be imported.
        """
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: PictureDescriptionVlmOptions

        if self.enabled:
            if artifacts_path is None:
                artifacts_path = self.download_models(repo_id=self.options.repo_id)
            else:
                artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder

            self.device = decide_device(accelerator_options.device)

            try:
                import torch

                # Only the names actually used are imported, so that an
                # unrelated missing symbol cannot trigger the error below.
                from transformers import (
                    AutoModelForImageTextToText,
                    AutoProcessor,
                )
            except ImportError as err:
                # Chain the original error so the real cause stays visible.
                raise ImportError(
                    "transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`."
                ) from err

            # Initialize processor and model; construction is serialized
            # across threads by the module-level lock.
            with _model_init_lock:
                self.processor = AutoProcessor.from_pretrained(artifacts_path)
                self.model = AutoModelForImageTextToText.from_pretrained(
                    artifacts_path,
                    device_map=self.device,
                    dtype=torch.bfloat16,
                    # Flash attention only on CUDA and only when requested.
                    _attn_implementation=(
                        "flash_attention_2"
                        if self.device.startswith("cuda")
                        and accelerator_options.cuda_use_flash_attention2
                        else "sdpa"
                    ),
                )

                # Select inference mode explicitly on every Python version;
                # previously this happened only when compilation was skipped.
                self.model.eval()

                # Compilation is skipped on Python 3.14 and newer.
                if sys.version_info < (3, 14):
                    self.model = torch.compile(self.model)  # type: ignore

            self.provenance = f"{self.options.repo_id}"

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        """
        Yield one generated description per image, in input order.

        Every image is processed on its own with the same chat prompt built
        from ``options.prompt``; the decoded continuation (without the
        prompt tokens) is stripped of surrounding whitespace.
        """
        from transformers import GenerationConfig

        # Create input messages
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": self.options.prompt},
                ],
            },
        ]

        # The rendered prompt is identical for every image, so build it once.
        prompt = self.processor.apply_chat_template(
            messages, add_generation_prompt=True
        )

        # TODO: do batch generation

        for image in images:
            # Prepare inputs
            inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
            inputs = inputs.to(self.device)

            # Generate outputs
            generated_ids = self.model.generate(
                **inputs,
                generation_config=GenerationConfig(**self.options.generation_config),
            )
            # Drop the prompt tokens and decode only the newly generated part.
            generated_texts = self.processor.batch_decode(
                generated_ids[:, inputs["input_ids"].shape[1] :],
                skip_special_tokens=True,
            )

            yield generated_texts[0].strip()
|
|
File without changes
|