docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,39 @@
1
+ """Data models for document extraction functionality."""
2
+
3
+ from typing import Any, Dict, List, Optional, Type, Union
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem, VlmStopReason
8
+ from docling.datamodel.document import InputDocument
9
+
10
+
11
class ExtractedPageData(BaseModel):
    """Data model for extracted content from a single page."""

    # Required; the description states that numbering starts at 1.
    page_no: int = Field(..., description="1-indexed page number")
    # Both payload fields are optional and default to None.
    extracted_data: Optional[Dict[str, Any]] = Field(
        default=None, description="Extracted structured data from the page"
    )
    raw_text: Optional[str] = Field(default=None, description="Raw extracted text")
    # Built through a factory so every instance starts with its own empty list.
    errors: List[str] = Field(
        description="Any errors encountered during extraction for this page",
        default_factory=list,
    )
23
+
24
+
25
class ExtractionResult(BaseModel):
    """Result of document extraction."""

    # Required field; no default is declared.
    input: InputDocument
    # A freshly built result reports PENDING unless a status is supplied.
    status: ConversionStatus = ConversionStatus.PENDING
    errors: List[ErrorItem] = []

    # Always a list, for consistency.
    pages: List[ExtractedPageData] = Field(
        description="Extracted data from each page", default_factory=list
    )
36
+
37
+
38
# Type alias for template parameters. Accepted forms: a string, a dict keyed
# by strings, a BaseModel instance, or a BaseModel subclass (the class itself).
ExtractionTemplateType = Union[str, Dict[str, Any], BaseModel, Type[BaseModel]]
@@ -0,0 +1,91 @@
1
+ import logging
2
+ from enum import Enum
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from docling.datamodel.accelerator_options import AcceleratorDevice
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
class LayoutModelConfig(BaseModel):
    """Specification of a layout model: name, repository id, revision and path."""

    name: str
    repo_id: str
    revision: str
    model_path: str
    # Devices listed as supported unless a spec overrides the list.
    supported_devices: list[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ]

    @property
    def model_repo_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--"."""
        repo_parts = self.repo_id.split("/")
        return "--".join(repo_parts)
28
+
29
+
30
# HuggingFace Layout Models
#
# Each constant below is a LayoutModelConfig pinned to the "main" revision with
# an empty model_path; only the name and repository id differ between them.

# Docling layout v2 (repository "docling-project/docling-layout-old").
# NOTE(review): formerly commented as the default model; LayoutOptions.model_spec
# in pipeline_options.py defaults to DOCLING_LAYOUT_HERON instead.
DOCLING_LAYOUT_V2 = LayoutModelConfig(
    name="docling_layout_v2",
    repo_id="docling-project/docling-layout-old",
    revision="main",
    model_path="",
)

DOCLING_LAYOUT_HERON = LayoutModelConfig(
    name="docling_layout_heron",
    repo_id="docling-project/docling-layout-heron",
    revision="main",
    model_path="",
)

DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
    name="docling_layout_heron_101",
    repo_id="docling-project/docling-layout-heron-101",
    revision="main",
    model_path="",
)

DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
    name="docling_layout_egret_medium",
    repo_id="docling-project/docling-layout-egret-medium",
    revision="main",
    model_path="",
)

DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
    name="docling_layout_egret_large",
    repo_id="docling-project/docling-layout-egret-large",
    revision="main",
    model_path="",
)

DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
    name="docling_layout_egret_xlarge",
    repo_id="docling-project/docling-layout-egret-xlarge",
    revision="main",
    model_path="",
)

# Example for a hypothetical alternative model
# ALTERNATIVE_LAYOUT = LayoutModelConfig(
#     name="alternative_layout",
#     repo_id="someorg/alternative-layout",
#     revision="main",
#     model_path="model_artifacts/layout_alt",
# )
82
+
83
+
84
class LayoutModelType(str, Enum):
    """String-valued enum naming the available layout models.

    Each value equals the ``name`` given to the matching ``DOCLING_LAYOUT_*``
    config constant defined in this module.
    """

    DOCLING_LAYOUT_V2 = "docling_layout_v2"
    DOCLING_LAYOUT_HERON = "docling_layout_heron"
    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
    # ALTERNATIVE_LAYOUT = "alternative_layout"
@@ -0,0 +1,457 @@
1
+ import logging
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from pathlib import Path
5
+ from typing import Annotated, Any, ClassVar, Dict, List, Literal, Optional, Union
6
+
7
+ from docling_core.types.doc import PictureClassificationLabel
8
+ from pydantic import (
9
+ AnyUrl,
10
+ BaseModel,
11
+ ConfigDict,
12
+ Field,
13
+ )
14
+ from typing_extensions import deprecated
15
+
16
+ from docling.datamodel import asr_model_specs, vlm_model_specs
17
+
18
+ # Import the following for backwards compatibility
19
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
20
+ from docling.datamodel.layout_model_specs import (
21
+ DOCLING_LAYOUT_EGRET_LARGE,
22
+ DOCLING_LAYOUT_EGRET_MEDIUM,
23
+ DOCLING_LAYOUT_EGRET_XLARGE,
24
+ DOCLING_LAYOUT_HERON,
25
+ DOCLING_LAYOUT_HERON_101,
26
+ DOCLING_LAYOUT_V2,
27
+ LayoutModelConfig,
28
+ )
29
+ from docling.datamodel.pipeline_options_asr_model import (
30
+ InlineAsrOptions,
31
+ )
32
+ from docling.datamodel.pipeline_options_vlm_model import (
33
+ ApiVlmOptions,
34
+ InferenceFramework,
35
+ InlineVlmOptions,
36
+ ResponseFormat,
37
+ )
38
+ from docling.datamodel.vlm_model_specs import (
39
+ GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
40
+ GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
41
+ NU_EXTRACT_2B_TRANSFORMERS,
42
+ SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
43
+ SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
44
+ VlmModelType,
45
+ )
46
+
47
+ _log = logging.getLogger(__name__)
48
+
49
+
50
class BaseOptions(BaseModel):
    """Base class for options."""

    # Class-level tag, declared without a value here; the concrete option
    # classes in this module assign a string to it.
    kind: ClassVar[str]
54
+
55
+
56
class TableFormerMode(str, Enum):
    """Modes for the TableFormer model."""

    # String-valued members; TableStructureOptions.mode defaults to ACCURATE.
    FAST = "fast"
    ACCURATE = "accurate"
61
+
62
+
63
class BaseTableStructureOptions(BaseOptions):
    """Base options for table structure models."""

    # Declares no fields of its own; TableStructureOptions derives from it.
65
+
66
+
67
class TableStructureOptions(BaseTableStructureOptions):
    """Options for the table structure."""

    kind: ClassVar[str] = "docling_tableformer"

    # True: matches predictions back to PDF cells. Can break table output if
    #       PDF cells are merged across table columns.
    # False: let the table structure model define the text cells and ignore
    #        PDF cells.
    do_cell_matching: bool = True

    mode: TableFormerMode = TableFormerMode.ACCURATE
78
+
79
+
80
class OcrOptions(BaseOptions):
    """OCR options."""

    # No default is declared here; the engine-specific subclasses in this
    # module each supply their own default language list.
    lang: Annotated[
        List[str],
        Field(
            description="List of OCR languages to use. The format must match the values of the OCR engine of choice.",
            examples=[["deu", "eng"]],
        ),
    ]

    force_full_page_ocr: Annotated[
        bool,
        Field(
            description="If enabled, a full-page OCR is always applied.",
            examples=[False],
        ),
    ] = False

    # NOTE(review): described as a percentage, yet the default and examples
    # (0.05, 0.1) look like fractions of the page area — confirm the unit.
    bitmap_area_threshold: Annotated[
        float,
        Field(
            description="Percentage of the page area for a bitmap to be processed with OCR.",
            examples=[0.05, 0.1],
        ),
    ] = 0.05
106
+
107
+
108
class OcrAutoOptions(OcrOptions):
    """Options for picking the OCR engine automatically."""

    kind: ClassVar[Literal["auto"]] = "auto"
    # Overrides the required base field with an empty default list.
    lang: Annotated[
        List[str],
        Field(
            description="The automatic OCR engine will use the default values of the engine. Please specify the engine explicitly to change the language selection.",
        ),
    ] = []
118
+
119
+
120
class RapidOcrOptions(OcrOptions):
    """Options for the RapidOCR engine."""

    kind: ClassVar[Literal["rapidocr"]] = "rapidocr"

    # English and Chinese are the most commonly used models and have been
    # tested with RapidOCR. However, language as a parameter is not supported
    # by rapidocr yet, so changing this option doesn't affect anything.
    #
    # For more details on supported languages by RapidOCR visit
    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
    lang: List[str] = ["english", "chinese"]

    # For more details on the following options visit
    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
    # https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#__tabbed_3_4
    backend: Literal["onnxruntime", "openvino", "paddle", "torch"] = "onnxruntime"
    text_score: float = 0.5  # same default as rapidocr

    use_det: Optional[bool] = None  # same default as rapidocr
    use_cls: Optional[bool] = None  # same default as rapidocr
    use_rec: Optional[bool] = None  # same default as rapidocr

    print_verbose: bool = False  # same default as rapidocr

    det_model_path: Optional[str] = None  # same default as rapidocr
    cls_model_path: Optional[str] = None  # same default as rapidocr
    rec_model_path: Optional[str] = None  # same default as rapidocr
    rec_keys_path: Optional[str] = None  # same default as rapidocr
    rec_font_path: Optional[str] = None  # Deprecated, please use font_path instead
    font_path: Optional[str] = None  # same default as rapidocr

    # Dictionary to overwrite or pass through additional parameters.
    rapidocr_params: Dict[str, Any] = Field(default_factory=dict)

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")
162
+
163
+
164
class EasyOcrOptions(OcrOptions):
    """Options for the EasyOCR engine."""

    kind: ClassVar[Literal["easyocr"]] = "easyocr"
    lang: List[str] = ["fr", "de", "es", "en"]

    # None leaves the GPU decision unset in these options.
    use_gpu: Optional[bool] = None

    confidence_threshold: float = 0.5

    model_storage_directory: Optional[str] = None
    recog_network: Optional[str] = "standard"
    download_enabled: bool = True

    suppress_mps_warnings: bool = True

    # Unknown fields are rejected. The empty protected_namespaces tuple
    # presumably silences pydantic's "model_" prefix warning for
    # model_storage_directory — TODO confirm.
    model_config = ConfigDict(extra="forbid", protected_namespaces=())
184
+
185
+
186
class TesseractCliOcrOptions(OcrOptions):
    """Options for the TesseractCli engine."""

    kind: ClassVar[Literal["tesseract"]] = "tesseract"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    tesseract_cmd: str = "tesseract"
    path: Optional[str] = None
    # Page Segmentation Mode (0-13), defaults to tesseract's default.
    psm: Optional[int] = None

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")
200
+
201
+
202
class TesseractOcrOptions(OcrOptions):
    """Options for the Tesseract engine."""

    kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    path: Optional[str] = None
    # Page Segmentation Mode (0-13), defaults to tesseract's default.
    psm: Optional[int] = None

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")
215
+
216
+
217
class OcrMacOptions(OcrOptions):
    """Options for the Mac OCR engine."""

    kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
    # Language tags in "language-REGION" form.
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
    recognition: str = "accurate"
    framework: str = "vision"

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")
228
+
229
+
230
class PictureDescriptionBaseOptions(BaseOptions):
    """Options shared by the picture description option classes."""

    batch_size: int = 8
    scale: float = 2

    # Percentage of the area for a picture to be processed with the models.
    picture_area_threshold: float = 0.05

    # Optional label filters; both default to None.
    classification_allow: Optional[List[PictureClassificationLabel]] = None
    classification_deny: Optional[List[PictureClassificationLabel]] = None
    classification_min_confidence: float = 0.0
240
+
241
+
242
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
    """Picture description options tagged with kind "api"."""

    kind: ClassVar[Literal["api"]] = "api"

    # The default URL points at a chat-completions path on localhost:8000.
    url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    # NOTE(review): presumably seconds — confirm against the API client.
    timeout: float = 20
    concurrency: int = 1

    prompt: str = "Describe this image in a few sentences."
    provenance: str = ""
253
+
254
+
255
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
    """Picture description options tagged with kind "vlm"."""

    kind: ClassVar[Literal["vlm"]] = "vlm"

    # Required; no default repository is declared.
    repo_id: str
    prompt: str = "Describe this image in a few sentences."
    # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
    generation_config: Dict[str, Any] = {"max_new_tokens": 200, "do_sample": False}

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--"."""
        repo_parts = self.repo_id.split("/")
        return "--".join(repo_parts)
266
+
267
+
268
# SmolVLM
# Preset that sets only the repository id; every other field keeps the
# PictureDescriptionVlmOptions default.
smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)

# GraniteVision
# Preset that sets the repository id and replaces the default prompt.
granite_picture_description = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.3-2b",
    prompt="What is shown in this image?",
)
278
+
279
+
280
+ # Define an enum for the backend options
281
class PdfBackend(str, Enum):
    """Enum of valid PDF backends."""

    # String-valued members, so each one also compares equal to its value.
    PYPDFIUM2 = "pypdfium2"
    DLPARSE_V1 = "dlparse_v1"
    DLPARSE_V2 = "dlparse_v2"
    DLPARSE_V4 = "dlparse_v4"
288
+
289
+
290
+ # Define an enum for the ocr engines
291
@deprecated(
    "Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
)
class OcrEngine(str, Enum):
    """Enum of valid OCR engines."""

    # NOTE(review): these values do not line up with the `kind` tags on the
    # option classes in this module (TesseractCliOcrOptions.kind is
    # "tesseract", TesseractOcrOptions.kind is "tesserocr") — confirm which
    # naming callers rely on before mapping between the two.
    AUTO = "auto"
    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    OCRMAC = "ocrmac"
    RAPIDOCR = "rapidocr"
303
+
304
+
305
class PipelineOptions(BaseOptions):
    """Base pipeline options."""

    # None (the default) means no timeout is enforced.
    # NOTE(review): presumably seconds — confirm.
    document_timeout: Annotated[
        Optional[float],
        Field(
            description="Maximum allowed processing time for a document before timing out. If None, no timeout is enforced.",
            examples=[10.0, 20.0],
        ),
    ] = None

    accelerator_options: Annotated[
        AcceleratorOptions,
        Field(
            description="Configuration options for hardware acceleration (e.g., GPU or optimized execution settings).",
        ),
    ] = AcceleratorOptions()

    # Remote services and external plugins are both off by default.
    enable_remote_services: Annotated[
        bool,
        Field(
            description="Enable calling external APIs or cloud services during pipeline execution.",
            examples=[False],
        ),
    ] = False

    allow_external_plugins: Annotated[
        bool,
        Field(
            description="Allow loading external third-party plugins or modules. Disabled by default for safety.",
            examples=[False],
        ),
    ] = False

    # Accepts either a Path or a plain string.
    artifacts_path: Annotated[
        Optional[Union[Path, str]],
        Field(
            description="Filesystem path where pipeline artifacts should be stored. If None, artifacts will be fetched. You can use the utility `docling-tools models download` to pre-fetch the model artifacts.",
            examples=["./artifacts", "/tmp/docling_outputs"],
        ),
    ] = None
346
+
347
+
348
class ConvertPipelineOptions(PipelineOptions):
    """Base convert pipeline options."""

    # True: classify pictures in documents.
    do_picture_classification: bool = False

    # True: describe pictures in documents.
    do_picture_description: bool = False
    # Defaults to the module-level SmolVLM preset.
    picture_description_options: PictureDescriptionBaseOptions = smolvlm_picture_description
357
+
358
+
359
class PaginatedPipelineOptions(ConvertPipelineOptions):
    """Convert pipeline options extended with image scale and image generation flags."""

    images_scale: float = 1.0
    # Both image generation flags are off by default.
    generate_page_images: bool = False
    generate_picture_images: bool = False
363
+
364
+
365
class VlmPipelineOptions(PaginatedPipelineOptions):
    """Options for the VLM pipeline."""

    # Overrides the inherited default of False.
    generate_page_images: bool = True
    # To be used with VLMs or other generative models: if True, text from the
    # backend will be used instead of generated text.
    force_backend_text: bool = False
    # Either inline or API-based VLM options are accepted.
    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = vlm_model_specs.GRANITEDOCLING_TRANSFORMERS
374
+
375
+
376
class BaseLayoutOptions(BaseOptions):
    """Base options for layout models."""

    # Whether to keep clusters that contain no text cells.
    keep_empty_clusters: bool = False
    # Skip cell-to-cluster assignment for VLM-only processing.
    skip_cell_assignment: bool = False
385
+
386
+
387
class LayoutOptions(BaseLayoutOptions):
    """Options for layout processing."""

    kind: ClassVar[str] = "docling_layout_default"
    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
    # The heron spec is the layout model used unless another one is supplied.
    model_spec: LayoutModelConfig = DOCLING_LAYOUT_HERON
393
+
394
+
395
class AsrPipelineOptions(PipelineOptions):
    """Options for the ASR pipeline."""

    # A one-member Union collapses to its member, so the bare type is equivalent.
    asr_options: InlineAsrOptions = asr_model_specs.WHISPER_TINY
397
+
398
+
399
class VlmExtractionPipelineOptions(PipelineOptions):
    """Options for extraction pipeline."""

    # A one-member Union collapses to its member, so the bare type is equivalent.
    vlm_options: InlineVlmOptions = NU_EXTRACT_2B_TRANSFORMERS
403
+
404
+
405
class PdfPipelineOptions(PaginatedPipelineOptions):
    """Options for the PDF pipeline."""

    # --- Stage toggles ---
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
    do_code_enrichment: bool = False  # True: perform code OCR
    do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
    # To be used with VLMs or other generative models: if True, text from the
    # backend will be used instead of generated text.
    force_backend_text: bool = False

    # --- Per-stage option objects ---
    table_structure_options: BaseTableStructureOptions = TableStructureOptions()
    ocr_options: OcrOptions = OcrAutoOptions()
    layout_options: BaseLayoutOptions = LayoutOptions()

    # --- Image generation ---
    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
    generate_table_images: bool = Field(
        deprecated=(
            "Field `generate_table_images` is deprecated. "
            "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
            "before conversion and then use the `TableItem.get_image` function."
        ),
        default=False,
    )

    generate_parsed_pages: bool = False

    # --- Arguments for threaded PDF pipeline with batching and backpressure control ---

    # Batch sizes for different stages
    ocr_batch_size: int = 4
    layout_batch_size: int = 4
    table_batch_size: int = 4

    # Timing control
    batch_polling_interval_seconds: float = 0.5

    # Backpressure and queue control
    queue_max_size: int = 100
447
+
448
+
449
class ProcessingPipeline(str, Enum):
    """String-valued enum naming the processing pipelines."""

    LEGACY = "legacy"
    STANDARD = "standard"
    VLM = "vlm"
    ASR = "asr"
454
+
455
+
456
class ThreadedPdfPipelineOptions(PdfPipelineOptions):
    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""

    # Adds no fields; the batching and queue settings are declared on
    # PdfPipelineOptions and inherited unchanged.
@@ -0,0 +1,78 @@
1
+ from enum import Enum
2
+ from typing import Any, Dict, List, Literal, Optional, Union
3
+
4
+ from pydantic import AnyUrl, BaseModel
5
+ from typing_extensions import deprecated
6
+
7
+ from docling.datamodel.accelerator_options import AcceleratorDevice
8
+ from docling.datamodel.pipeline_options_vlm_model import (
9
+ # InferenceFramework,
10
+ TransformersModelType,
11
+ )
12
+
13
+
14
class BaseAsrOptions(BaseModel):
    """Base class for ASR option models."""

    # Required string tag; InlineAsrOptions narrows it to a single literal.
    kind: str
    # prompt: str
17
+
18
+
19
class InferenceAsrFramework(str, Enum):
    """String-valued enum of the ASR inference frameworks."""

    MLX = "mlx"
    # TRANSFORMERS = "transformers" # disabled for now
    WHISPER = "whisper"
23
+
24
+
25
class InlineAsrOptions(BaseAsrOptions):
    """Options for an inline ASR model identified by a repository id."""

    kind: Literal["inline_model_options"] = "inline_model_options"

    # Required; no default repository is declared.
    repo_id: str

    verbose: bool = False
    timestamps: bool = True

    temperature: float = 0.0
    max_new_tokens: int = 256
    # NOTE(review): presumably seconds of audio per chunk — confirm.
    max_time_chunk: float = 30.0

    torch_dtype: Optional[str] = None
    # Devices listed as supported unless a subclass narrows the list.
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
        AcceleratorDevice.XPU,
    ]

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--"."""
        repo_parts = self.repo_id.split("/")
        return "--".join(repo_parts)
48
+
49
+
50
class InlineAsrNativeWhisperOptions(InlineAsrOptions):
    """Inline ASR options using the WHISPER inference framework."""

    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER

    language: str = "en"
    # Narrows the inherited device list to CPU and CUDA.
    supported_devices: List[AcceleratorDevice] = [AcceleratorDevice.CPU, AcceleratorDevice.CUDA]
    word_timestamps: bool = True
59
+
60
+
61
class InlineAsrMlxWhisperOptions(InlineAsrOptions):
    """
    MLX Whisper options for Apple Silicon optimization.

    Uses mlx-whisper library for efficient inference on Apple Silicon devices.
    """

    inference_framework: InferenceAsrFramework = InferenceAsrFramework.MLX

    language: str = "en"
    # "transcribe" or "translate"
    task: str = "transcribe"
    # MLX is optimized for Apple Silicon, so only MPS is listed.
    supported_devices: List[AcceleratorDevice] = [AcceleratorDevice.MPS]
    word_timestamps: bool = True
    # Threshold for detecting speech
    no_speech_threshold: float = 0.6
    # Log probability threshold
    logprob_threshold: float = -1.0
    # Compression ratio threshold
    compression_ratio_threshold: float = 2.4