docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,246 @@
1
+ import sys
2
+ import threading
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import List, Literal, Optional, Union
6
+
7
+ import numpy as np
8
+ from docling_core.types.doc import (
9
+ DoclingDocument,
10
+ NodeItem,
11
+ PictureClassificationClass,
12
+ PictureClassificationData,
13
+ PictureClassificationMetaField,
14
+ PictureItem,
15
+ PictureMeta,
16
+ )
17
+ from docling_core.types.doc.document import PictureClassificationPrediction
18
+ from PIL import Image
19
+ from pydantic import BaseModel
20
+
21
+ from docling.datamodel.accelerator_options import AcceleratorOptions
22
+ from docling.datamodel.base_models import ItemAndImageEnrichmentElement
23
+ from docling.models.base_model import BaseItemAndImageEnrichmentModel
24
+ from docling.models.utils.hf_model_download import HuggingFaceModelDownloadMixin
25
+ from docling.utils.accelerator_utils import decide_device
26
+
27
# Module-wide lock shared by every classifier instance: processor/model
# construction is serialized so concurrent initializations do not overlap.
_model_init_lock = threading.Lock()
29
+
30
+
31
class DocumentPictureClassifierOptions(BaseModel):
    """
    Options for configuring the DocumentPictureClassifier.
    """

    # Discriminator tag identifying this options model.
    kind: Literal["document_picture_classifier"] = "document_picture_classifier"
    # Hugging Face repository that hosts the classifier weights.
    repo_id: str = "docling-project/DocumentFigureClassifier-v2.0"
    # Repository revision (branch, tag or commit) to fetch.
    revision: str = "main"

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``/`` replaced by ``--``."""
        return "--".join(self.repo_id.split("/"))
43
+
44
+
45
class DocumentPictureClassifier(
    BaseItemAndImageEnrichmentModel, HuggingFaceModelDownloadMixin
):
    """
    A model for classifying pictures in documents.

    This class enriches document pictures with predicted classifications,
    using the label set stored in the loaded model's configuration.

    Attributes
    ----------
    enabled : bool
        Whether the classifier is enabled for use.
    options : DocumentPictureClassifierOptions
        Configuration options for the classifier.
    images_scale : int
        Class-level scale setting (2); presumably consumed by the enrichment
        base class when preparing picture crops -- confirm against the base.
    _device : str
        Device chosen by ``decide_device``; only set when enabled.
    _processor, _model
        Image processor and image-classification model loaded from the
        artifacts directory; only set when enabled.
    _classes : mapping
        ``id2label`` mapping taken from the model config; only set when enabled.

    Methods
    -------
    __init__(enabled, artifacts_path, options, accelerator_options)
        Initializes the classifier with specified configurations.
    is_processable(doc, element)
        Checks if the given element can be processed by the classifier.
    __call__(doc, element_batch)
        Processes a batch of elements and adds classification annotations.
    """

    images_scale = 2

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: DocumentPictureClassifierOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Initializes the DocumentPictureClassifier.

        Parameters
        ----------
        enabled : bool
            Indicates whether the classifier is enabled. When False no model
            is downloaded or loaded.
        artifacts_path : Optional[Union[Path, str]]
            Path to the directory containing model artifacts. When None the
            model is fetched via ``download_models``. When the directory has a
            sub-folder named after ``options.repo_cache_folder``, that
            sub-folder is used; otherwise the directory itself is used.
        options : DocumentPictureClassifierOptions
            Configuration options for the classifier.
        accelerator_options : AcceleratorOptions
            Options for configuring the device and parallelism.
        """
        self.enabled = enabled
        self.options = options

        if self.enabled:
            self._device = decide_device(accelerator_options.device)

            repo_cache_folder = self.options.repo_cache_folder

            if artifacts_path is None:
                artifacts_path = self.download_models(
                    self.options.repo_id, revision=self.options.revision
                )
            else:
                # Accept plain strings as well as Path objects: the "/" join
                # below is only defined for Path.
                artifacts_path = Path(artifacts_path)
                nested_path = artifacts_path / repo_cache_folder
                if nested_path.exists():
                    artifacts_path = nested_path

            # Heavy dependencies are imported lazily, only when enabled.
            import torch
            from transformers import AutoImageProcessor, AutoModelForImageClassification

            with _model_init_lock:
                # Image processor
                self._processor = AutoImageProcessor.from_pretrained(
                    artifacts_path, use_fast=True
                )

                # Model
                self._model = AutoModelForImageClassification.from_pretrained(
                    artifacts_path,
                    device_map=self._device,
                )

                # torch.compile is only attempted on interpreters older than
                # 3.14; newer ones run the plain model in eval mode.
                if sys.version_info < (3, 14):
                    self._model = torch.compile(self._model)  # type: ignore
                else:
                    self._model.eval()

            self._classes = self._model.config.id2label

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """
        Determines if the given element can be processed by the classifier.

        Parameters
        ----------
        doc : DoclingDocument
            The document containing the element.
        element : NodeItem
            The element to be checked.

        Returns
        -------
        bool
            True if the element is a PictureItem and processing is enabled; False otherwise.
        """
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[NodeItem]:
        """
        Processes a batch of elements and enriches them with classification predictions.

        Parameters
        ----------
        doc : DoclingDocument
            The document containing the elements to be processed.
        element_batch : Iterable[ItemAndImageEnrichmentElement]
            A batch of pictures to classify.

        Returns
        -------
        Iterable[NodeItem]
            The processed items, in input order. For each picture a
            PictureClassificationData entry is appended to ``annotations``
            (deprecated) and ``meta.classification`` is set, both holding all
            classes sorted by descending confidence. When the classifier is
            disabled the items are yielded unchanged.

        Raises
        ------
        TypeError
            If an element image is neither a PIL image nor a numpy array.
        """
        if not self.enabled:
            for element in element_batch:
                yield element.item
            return

        import torch

        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el.item, PictureItem)
            elements.append(el.item)

            raw_image = el.image
            if isinstance(raw_image, Image.Image):
                raw_image = raw_image.convert("RGB")
            elif isinstance(raw_image, np.ndarray):
                raw_image = Image.fromarray(raw_image).convert("RGB")
            else:
                raise TypeError(
                    "Supported input formats are PIL.Image.Image or numpy.ndarray."
                )
            images.append(raw_image)

        # Nothing to classify: avoid handing an empty batch to the processor.
        if not images:
            return

        inputs = self._processor(images=images, return_tensors="pt")
        # move inputs to the same device as the model
        inputs = {k: v.to(self._device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits  # (batch_size, num_classes)
            probs_batch = logits.softmax(dim=1)  # (batch_size, num_classes)
            probs_batch = probs_batch.cpu().numpy().tolist()

        for item, probs_image in zip(elements, probs_batch):
            # Pair every class label with its probability, best first.
            ranked = sorted(
                ((self._classes[idx], prob) for idx, prob in enumerate(probs_image)),
                key=lambda pair: pair[1],
                reverse=True,
            )
            predicted_classes = [
                PictureClassificationClass(class_name=label, confidence=prob)
                for label, prob in ranked
            ]

            # FIXME: annotations is deprecated, remove once all consumers use meta.classification
            item.annotations.append(
                PictureClassificationData(
                    provenance="DocumentPictureClassifier",
                    predicted_classes=predicted_classes,
                )
            )

            # Store classification in the new meta field
            predictions = [
                PictureClassificationPrediction(
                    class_name=pred.class_name,
                    confidence=pred.confidence,
                    created_by="DocumentPictureClassifier",
                )
                for pred in predicted_classes
            ]
            classification_data = PictureClassificationMetaField(
                predictions=predictions,
            )

            if item.meta is not None:
                item.meta.classification = classification_data
            else:
                item.meta = PictureMeta(classification=classification_data)

            yield item
File without changes
@@ -0,0 +1,66 @@
1
+ from collections.abc import Iterable
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from pathlib import Path
4
+ from typing import Optional, Type, Union
5
+
6
+ from PIL import Image
7
+
8
+ from docling.datamodel.accelerator_options import AcceleratorOptions
9
+ from docling.datamodel.pipeline_options import (
10
+ PictureDescriptionApiOptions,
11
+ PictureDescriptionBaseOptions,
12
+ )
13
+ from docling.exceptions import OperationNotAllowed
14
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
15
+ from docling.utils.api_image_request import api_image_request
16
+
17
+
18
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
    """Picture description model that sends each image to a remote API."""

    # Disabled override, kept for reference:
    # elements_batch_size = 4

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        """Return the options class this model is configured with."""
        return PictureDescriptionApiOptions

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionApiOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Set up the API-backed model.

        All arguments are forwarded unchanged to the base class. The number
        of worker threads is read from ``options.concurrency``.

        Raises
        ------
        OperationNotAllowed
            If the model is enabled while ``enable_remote_services`` is falsy.
        """
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: PictureDescriptionApiOptions
        self.concurrency = self.options.concurrency

        # Remote calls require an explicit opt-in from the pipeline options.
        if self.enabled and not enable_remote_services:
            raise OperationNotAllowed(
                "Connections to remote services is only allowed when set explicitly. "
                "pipeline_options.enable_remote_services=True."
            )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        """
        Yield one API response text per image, in input order.

        Requests are issued one image at a time (not every API accepts a
        batched request; e.g. vllm won't allow more than 1) and are spread
        over a thread pool with ``self.concurrency`` workers.
        """

        def _describe(picture):
            # Only the first element of the returned triple is used.
            description, _, _ = api_image_request(
                image=picture,
                prompt=self.options.prompt,
                url=self.options.url,
                timeout=self.options.timeout,
                headers=self.options.headers,
                **self.options.params,
            )
            return description

        with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
            for description in pool.map(_describe, images):
                yield description
@@ -0,0 +1,123 @@
1
+ import sys
2
+ import threading
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import Optional, Type, Union
6
+
7
+ from PIL import Image
8
+
9
+ from docling.datamodel.accelerator_options import AcceleratorOptions
10
+ from docling.datamodel.pipeline_options import (
11
+ PictureDescriptionBaseOptions,
12
+ PictureDescriptionVlmOptions,
13
+ )
14
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
15
+ from docling.models.utils.hf_model_download import (
16
+ HuggingFaceModelDownloadMixin,
17
+ )
18
+ from docling.utils.accelerator_utils import decide_device
19
+
20
# Module-wide lock shared by every VLM description model instance:
# processor/model construction is serialized so concurrent initializations
# do not overlap.
_model_init_lock = threading.Lock()
22
+
23
+
24
class PictureDescriptionVlmModel(
    PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
):
    """Picture description model backed by a locally loaded vision-language model."""

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        """Return the options class this model is configured with."""
        return PictureDescriptionVlmOptions

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: PictureDescriptionVlmOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Set up the model and, when enabled, load processor and weights.

        Parameters
        ----------
        enabled : bool
            When False nothing is downloaded or loaded.
        enable_remote_services : bool
            Forwarded to the base class.
        artifacts_path : Optional[Union[Path, str]]
            Directory holding model artifacts; the model is read from its
            ``options.repo_cache_folder`` sub-folder. When None the model is
            fetched via ``download_models``.
        options : PictureDescriptionVlmOptions
            Model configuration (repo id, prompt, generation config, ...).
        accelerator_options : AcceleratorOptions
            Device selection and flash-attention preference.

        Raises
        ------
        ImportError
            If ``torch`` or the required ``transformers`` classes cannot be
            imported.
        """
        super().__init__(
            enabled=enabled,
            enable_remote_services=enable_remote_services,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: PictureDescriptionVlmOptions

        if self.enabled:
            if artifacts_path is None:
                artifacts_path = self.download_models(repo_id=self.options.repo_id)
            else:
                artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder

            self.device = decide_device(accelerator_options.device)

            try:
                import torch

                # Import only the names that are actually used, so that an
                # unrelated missing symbol cannot trigger the error below.
                from transformers import (
                    AutoModelForImageTextToText,
                    AutoProcessor,
                )
            except ImportError as err:
                raise ImportError(
                    "transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`."
                ) from err

            # Flash attention is only requested on CUDA devices and only when
            # the accelerator options ask for it; otherwise use SDPA.
            use_flash_attention = (
                self.device.startswith("cuda")
                and accelerator_options.cuda_use_flash_attention2
            )

            # Initialize processor and model
            with _model_init_lock:
                self.processor = AutoProcessor.from_pretrained(artifacts_path)
                self.model = AutoModelForImageTextToText.from_pretrained(
                    artifacts_path,
                    device_map=self.device,
                    dtype=torch.bfloat16,
                    _attn_implementation=(
                        "flash_attention_2" if use_flash_attention else "sdpa"
                    ),
                )
                # torch.compile is only attempted on interpreters older than
                # 3.14; newer ones run the plain model in eval mode.
                if sys.version_info < (3, 14):
                    self.model = torch.compile(self.model)  # type: ignore
                else:
                    self.model.eval()

            self.provenance = f"{self.options.repo_id}"

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        """
        Yield one generated description per image, in input order.

        Every image is processed on its own with the same chat prompt built
        from ``options.prompt``; only the newly generated tokens are decoded.
        """
        from transformers import GenerationConfig

        # Create input messages
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": self.options.prompt},
                ],
            },
        ]

        # The rendered prompt does not depend on the image, so build it once.
        prompt = self.processor.apply_chat_template(
            messages, add_generation_prompt=True
        )

        # TODO: do batch generation

        for image in images:
            # Prepare inputs
            inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
            inputs = inputs.to(self.device)

            # Generate outputs
            generated_ids = self.model.generate(
                **inputs,
                generation_config=GenerationConfig(**self.options.generation_config),
            )
            # Drop the prompt tokens so only the continuation is decoded.
            prompt_length = inputs["input_ids"].shape[1]
            generated_texts = self.processor.batch_decode(
                generated_ids[:, prompt_length:],
                skip_special_tokens=True,
            )

            yield generated_texts[0].strip()
File without changes