docling 2.47.1__py3-none-any.whl → 2.49.0__py3-none-any.whl

This diff compares the published contents of two package versions released to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
@@ -0,0 +1,39 @@
+ """Data models for document extraction functionality."""
+
+ from typing import Any, Dict, List, Optional, Type, Union
+
+ from pydantic import BaseModel, Field
+
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem
+ from docling.datamodel.document import InputDocument
+
+
+ class ExtractedPageData(BaseModel):
+     """Data model for extracted content from a single page."""
+
+     page_no: int = Field(..., description="1-indexed page number")
+     extracted_data: Optional[Dict[str, Any]] = Field(
+         None, description="Extracted structured data from the page"
+     )
+     raw_text: Optional[str] = Field(None, description="Raw extracted text")
+     errors: List[str] = Field(
+         default_factory=list,
+         description="Any errors encountered during extraction for this page",
+     )
+
+
+ class ExtractionResult(BaseModel):
+     """Result of document extraction."""
+
+     input: InputDocument
+     status: ConversionStatus = ConversionStatus.PENDING
+     errors: List[ErrorItem] = []
+
+     # Pages field - always a list for consistency
+     pages: List[ExtractedPageData] = Field(
+         default_factory=list, description="Extracted data from each page"
+     )
+
+
+ # Type alias for template parameters that can be string, dict, or BaseModel
+ ExtractionTemplateType = Union[str, Dict[str, Any], BaseModel, Type[BaseModel]]
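Taken together, these models define the output shape of the new extraction API: one `ExtractionResult` per document, with per-page payloads in `pages`. A minimal consumption sketch; the `InvoiceTemplate` model and its fields are hypothetical placeholders, not part of this release:

    from pydantic import BaseModel

    class InvoiceTemplate(BaseModel):  # hypothetical ExtractionTemplateType value
        invoice_number: str
        total: float

    def summarize(result: "ExtractionResult") -> None:
        # Walk the per-page payloads; each page reports its own errors.
        for page in result.pages:
            print(page.page_no, page.extracted_data, page.errors)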
@@ -37,6 +37,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
  from docling.datamodel.vlm_model_specs import (
      GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
      GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
+     NU_EXTRACT_2B_TRANSFORMERS,
      SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
      SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
      VlmModelType,
@@ -99,6 +100,8 @@ class RapidOcrOptions(OcrOptions):
      # For more details on the following options visit
      # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/

+     # https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#__tabbed_3_4
+     backend: Literal["onnxruntime", "openvino", "paddle", "torch"] = "onnxruntime"
      text_score: float = 0.5  # same default as rapidocr

      use_det: Optional[bool] = None  # same default as rapidocr
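The new `backend` field selects which inference engine RapidOCR runs on, keeping `"onnxruntime"` as the default. A sketch of switching to OpenVINO, assuming the usual docling pattern of wiring OCR options into `PdfPipelineOptions` (only the `backend` field itself is introduced by this diff):

    from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions

    ocr_options = RapidOcrOptions(backend="openvino")  # "onnxruntime" remains the default
    pipeline_options = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_options)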
@@ -111,6 +114,7 @@ class RapidOcrOptions(OcrOptions):
      cls_model_path: Optional[str] = None  # same default as rapidocr
      rec_model_path: Optional[str] = None  # same default as rapidocr
      rec_keys_path: Optional[str] = None  # same default as rapidocr
+     rec_font_path: Optional[str] = None  # same default as rapidocr

      model_config = ConfigDict(
          extra="forbid",
@@ -244,12 +248,9 @@ class OcrEngine(str, Enum):
      RAPIDOCR = "rapidocr"


- class PipelineOptions(BaseModel):
+ class PipelineOptions(BaseOptions):
      """Base pipeline options."""

-     create_legacy_output: bool = (
-         True  # This default will be set to False on a future version of docling
-     )
      document_timeout: Optional[float] = None
      accelerator_options: AcceleratorOptions = AcceleratorOptions()
      enable_remote_services: bool = False
@@ -293,6 +294,13 @@ class AsrPipelineOptions(PipelineOptions):
      artifacts_path: Optional[Union[Path, str]] = None


+ class VlmExtractionPipelineOptions(PipelineOptions):
+     """Options for extraction pipeline."""
+
+     artifacts_path: Optional[Union[Path, str]] = None
+     vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
+
+
  class PdfPipelineOptions(PaginatedPipelineOptions):
      """Options for the PDF pipeline."""

@@ -247,6 +247,23 @@ DOLPHIN_TRANSFORMERS = InlineVlmOptions(
      temperature=0.0,
  )

+ # NuExtract
+ NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
+     repo_id="numind/NuExtract-2.0-2B",
+     prompt="",  # This won't be used, template is passed separately
+     torch_dtype="bfloat16",
+     inference_framework=InferenceFramework.TRANSFORMERS,
+     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+     response_format=ResponseFormat.PLAINTEXT,
+     supported_devices=[
+         AcceleratorDevice.CPU,
+         AcceleratorDevice.CUDA,
+         AcceleratorDevice.MPS,
+     ],
+     scale=2.0,
+     temperature=0.0,
+ )
+

  class VlmModelType(str, Enum):
      SMOLDOCLING = "smoldocling"
@@ -28,6 +28,7 @@ from docling.backend.noop_backend import NoOpBackend
  from docling.backend.xml.jats_backend import JatsDocumentBackend
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
  from docling.datamodel.base_models import (
+     BaseFormatOption,
      ConversionStatus,
      DoclingComponentType,
      DocumentStream,
@@ -57,12 +58,8 @@ _log = logging.getLogger(__name__)
  _PIPELINE_CACHE_LOCK = threading.Lock()


- class FormatOption(BaseModel):
+ class FormatOption(BaseFormatOption):
      pipeline_cls: Type[BasePipeline]
-     pipeline_options: Optional[PipelineOptions] = None
-     backend: Type[AbstractDocumentBackend]
-
-     model_config = ConfigDict(arbitrary_types_allowed=True)

      @model_validator(mode="after")
      def set_optional_field_default(self) -> "FormatOption":
@@ -191,7 +188,7 @@ class DocumentConverter:
          self.allowed_formats = (
              allowed_formats if allowed_formats is not None else list(InputFormat)
          )
-         self.format_to_options = {
+         self.format_to_options: Dict[InputFormat, FormatOption] = {
              format: (
                  _get_default_option(format=format)
                  if (custom_option := (format_options or {}).get(format)) is None
@@ -0,0 +1,325 @@
+ import hashlib
+ import logging
+ import sys
+ import threading
+ import time
+ import warnings
+ from collections.abc import Iterable, Iterator
+ from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple, Type, Union
+
+ from pydantic import ConfigDict, model_validator, validate_call
+
+ from docling.backend.abstract_backend import AbstractDocumentBackend
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+ from docling.datamodel.base_models import (
+     BaseFormatOption,
+     ConversionStatus,
+     DoclingComponentType,
+     DocumentStream,
+     ErrorItem,
+     InputFormat,
+ )
+ from docling.datamodel.document import (
+     InputDocument,
+     _DocumentConversionInput,  # intentionally reused builder
+ )
+ from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
+ from docling.datamodel.pipeline_options import PipelineOptions
+ from docling.datamodel.settings import (
+     DEFAULT_PAGE_RANGE,
+     DocumentLimits,
+     PageRange,
+     settings,
+ )
+ from docling.exceptions import ConversionError
+ from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
+ from docling.pipeline.extraction_vlm_pipeline import ExtractionVlmPipeline
+ from docling.utils.utils import chunkify
+
+ _log = logging.getLogger(__name__)
+ _PIPELINE_CACHE_LOCK = threading.Lock()
+
+
+ class ExtractionFormatOption(BaseFormatOption):
+     """Per-format configuration for extraction.
+
+     Notes:
+     - `pipeline_cls` must subclass `BaseExtractionPipeline`.
+     - `pipeline_options` is typed as `PipelineOptions` which MUST inherit from
+       `BaseOptions` (as used by `BaseExtractionPipeline`).
+     - `backend` is the document-opening backend used by `_DocumentConversionInput`.
+     """
+
+     pipeline_cls: Type[BaseExtractionPipeline]
+
+     @model_validator(mode="after")
+     def set_optional_field_default(self) -> "ExtractionFormatOption":
+         if self.pipeline_options is None:
+             # `get_default_options` comes from BaseExtractionPipeline
+             self.pipeline_options = self.pipeline_cls.get_default_options()  # type: ignore[assignment]
+         return self
+
+
+ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
+     """Return the default extraction option for a given input format.
+
+     Defaults mirror the converter's *backend* choices, while the pipeline is
+     the VLM extractor. This duplication will be removed when we deduplicate
+     the format registry between convert/extract.
+     """
+     format_to_default_backend: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
+         InputFormat.IMAGE: PyPdfiumDocumentBackend,
+         InputFormat.PDF: PyPdfiumDocumentBackend,
+     }
+
+     backend = format_to_default_backend.get(fmt)
+     if backend is None:
+         raise RuntimeError(f"No default extraction backend configured for {fmt}")
+
+     return ExtractionFormatOption(
+         pipeline_cls=ExtractionVlmPipeline,
+         backend=backend,
+     )
+
+
+ class DocumentExtractor:
+     """Standalone extractor class.
+
+     Public API:
+     - `extract(...) -> ExtractionResult`
+     - `extract_all(...) -> Iterator[ExtractionResult]`
+
+     Implementation intentionally reuses `_DocumentConversionInput` to build
+     `InputDocument` with the correct backend per format.
+     """
+
+     def __init__(
+         self,
+         allowed_formats: Optional[List[InputFormat]] = None,
+         extraction_format_options: Optional[
+             Dict[InputFormat, ExtractionFormatOption]
+         ] = None,
+     ) -> None:
+         self.allowed_formats: List[InputFormat] = (
+             allowed_formats if allowed_formats is not None else list(InputFormat)
+         )
+         # Build per-format options with defaults, then apply any user overrides
+         overrides = extraction_format_options or {}
+         self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {
+             fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
+             for fmt in self.allowed_formats
+         }
+
+         # Cache pipelines by (class, options-hash)
+         self._initialized_pipelines: Dict[
+             Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
+         ] = {}
+
+     # ---------------------------- Public API ---------------------------------
+
+     @validate_call(config=ConfigDict(strict=True))
+     def extract(
+         self,
+         source: Union[Path, str, DocumentStream],
+         template: ExtractionTemplateType,
+         headers: Optional[Dict[str, str]] = None,
+         raises_on_error: bool = True,
+         max_num_pages: int = sys.maxsize,
+         max_file_size: int = sys.maxsize,
+         page_range: PageRange = DEFAULT_PAGE_RANGE,
+     ) -> ExtractionResult:
+         all_res = self.extract_all(
+             source=[source],
+             headers=headers,
+             raises_on_error=raises_on_error,
+             max_num_pages=max_num_pages,
+             max_file_size=max_file_size,
+             page_range=page_range,
+             template=template,
+         )
+         return next(all_res)
+
+     @validate_call(config=ConfigDict(strict=True))
+     def extract_all(
+         self,
+         source: Iterable[Union[Path, str, DocumentStream]],
+         template: ExtractionTemplateType,
+         headers: Optional[Dict[str, str]] = None,
+         raises_on_error: bool = True,
+         max_num_pages: int = sys.maxsize,
+         max_file_size: int = sys.maxsize,
+         page_range: PageRange = DEFAULT_PAGE_RANGE,
+     ) -> Iterator[ExtractionResult]:
+         warnings.warn(
+             "The extract API is currently experimental and may change without prior notice.\n"
+             "Only PDF and image formats are supported.",
+             UserWarning,
+             stacklevel=2,
+         )
+
+         limits = DocumentLimits(
+             max_num_pages=max_num_pages,
+             max_file_size=max_file_size,
+             page_range=page_range,
+         )
+         conv_input = _DocumentConversionInput(
+             path_or_stream_iterator=source, limits=limits, headers=headers
+         )
+
+         ext_res_iter = self._extract(
+             conv_input, raises_on_error=raises_on_error, template=template
+         )
+
+         had_result = False
+         for ext_res in ext_res_iter:
+             had_result = True
+             if raises_on_error and ext_res.status not in {
+                 ConversionStatus.SUCCESS,
+                 ConversionStatus.PARTIAL_SUCCESS,
+             }:
+                 raise ConversionError(
+                     f"Extraction failed for: {ext_res.input.file} with status: {ext_res.status}"
+                 )
+             else:
+                 yield ext_res
+
+         if not had_result and raises_on_error:
+             raise ConversionError(
+                 "Extraction failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+             )
+
+     # --------------------------- Internal engine ------------------------------
+
+     def _extract(
+         self,
+         conv_input: _DocumentConversionInput,
+         raises_on_error: bool,
+         template: ExtractionTemplateType,
+     ) -> Iterator[ExtractionResult]:
+         start_time = time.monotonic()
+
+         for input_batch in chunkify(
+             conv_input.docs(self.extraction_format_to_options),
+             settings.perf.doc_batch_size,
+         ):
+             _log.info("Going to extract document batch...")
+             process_func = partial(
+                 self._process_document_extraction,
+                 raises_on_error=raises_on_error,
+                 template=template,
+             )
+
+             if (
+                 settings.perf.doc_batch_concurrency > 1
+                 and settings.perf.doc_batch_size > 1
+             ):
+                 with ThreadPoolExecutor(
+                     max_workers=settings.perf.doc_batch_concurrency
+                 ) as pool:
+                     for item in pool.map(
+                         process_func,
+                         input_batch,
+                     ):
+                         yield item
+             else:
+                 for item in map(
+                     process_func,
+                     input_batch,
+                 ):
+                     elapsed = time.monotonic() - start_time
+                     start_time = time.monotonic()
+                     _log.info(
+                         f"Finished extracting document {item.input.file.name} in {elapsed:.2f} sec."
+                     )
+                     yield item
+
+     def _process_document_extraction(
+         self,
+         in_doc: InputDocument,
+         raises_on_error: bool,
+         template: ExtractionTemplateType,
+     ) -> ExtractionResult:
+         valid = (
+             self.allowed_formats is not None and in_doc.format in self.allowed_formats
+         )
+         if valid:
+             return self._execute_extraction_pipeline(
+                 in_doc, raises_on_error=raises_on_error, template=template
+             )
+         else:
+             error_message = f"File format not allowed: {in_doc.file}"
+             if raises_on_error:
+                 raise ConversionError(error_message)
+             else:
+                 error_item = ErrorItem(
+                     component_type=DoclingComponentType.USER_INPUT,
+                     module_name="",
+                     error_message=error_message,
+                 )
+                 return ExtractionResult(
+                     input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
+                 )
+
+     def _execute_extraction_pipeline(
+         self,
+         in_doc: InputDocument,
+         raises_on_error: bool,
+         template: ExtractionTemplateType,
+     ) -> ExtractionResult:
+         if not in_doc.valid:
+             if raises_on_error:
+                 raise ConversionError(f"Input document {in_doc.file} is not valid.")
+             else:
+                 return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
+
+         pipeline = self._get_pipeline(in_doc.format)
+         if pipeline is None:
+             if raises_on_error:
+                 raise ConversionError(
+                     f"No extraction pipeline could be initialized for {in_doc.file}."
+                 )
+             else:
+                 return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
+
+         return pipeline.execute(
+             in_doc, raises_on_error=raises_on_error, template=template
+         )
+
+     def _get_pipeline(
+         self, doc_format: InputFormat
+     ) -> Optional[BaseExtractionPipeline]:
+         """Retrieve or initialize a pipeline, reusing instances based on class and options."""
+         fopt = self.extraction_format_to_options.get(doc_format)
+         if fopt is None or fopt.pipeline_options is None:
+             return None
+
+         pipeline_class = fopt.pipeline_cls
+         pipeline_options = fopt.pipeline_options
+         options_hash = self._get_pipeline_options_hash(pipeline_options)
+
+         cache_key = (pipeline_class, options_hash)
+         with _PIPELINE_CACHE_LOCK:
+             if cache_key not in self._initialized_pipelines:
+                 _log.info(
+                     f"Initializing extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+                 )
+                 self._initialized_pipelines[cache_key] = pipeline_class(
+                     pipeline_options=pipeline_options  # type: ignore[arg-type]
+                 )
+             else:
+                 _log.debug(
+                     f"Reusing cached extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+                 )
+
+         return self._initialized_pipelines[cache_key]
+
+     @staticmethod
+     def _get_pipeline_options_hash(pipeline_options: PipelineOptions) -> str:
+         """Generate a stable hash of pipeline options to use as part of the cache key."""
+         options_str = str(pipeline_options.model_dump())
+         return hashlib.md5(
+             options_str.encode("utf-8"), usedforsecurity=False
+         ).hexdigest()
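End to end, `DocumentExtractor` mirrors the ergonomics of `DocumentConverter`. A minimal usage sketch based only on the API above; the module path, file name, and template string are illustrative assumptions:

    from docling.document_extractor import DocumentExtractor  # module path assumed

    extractor = DocumentExtractor()  # allowed_formats defaults to all of InputFormat
    result = extractor.extract(
        source="invoice.pdf",                     # placeholder path
        template='{"invoice_number": "string"}',  # str/dict/BaseModel all satisfy ExtractionTemplateType
    )
    for page in result.pages:
        print(page.page_no, page.extracted_data)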
@@ -42,10 +42,10 @@ class RapidOcrModel(BaseOcrModel):

          if self.enabled:
              try:
-                 from rapidocr_onnxruntime import RapidOCR  # type: ignore
+                 from rapidocr import EngineType, RapidOCR  # type: ignore
              except ImportError:
                  raise ImportError(
-                     "RapidOCR is not installed. Please install it via `pip install rapidocr_onnxruntime` to use this OCR engine. "
+                     "RapidOCR is not installed. Please install it via `pip install rapidocr onnxruntime` to use this OCR engine. "
                      "Alternatively, Docling has support for other OCR engines. See the documentation."
                  )

@@ -54,21 +54,40 @@ class RapidOcrModel(BaseOcrModel):
              use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
              use_dml = accelerator_options.device == AcceleratorDevice.AUTO
              intra_op_num_threads = accelerator_options.num_threads
+             _ALIASES = {
+                 "onnxruntime": EngineType.ONNXRUNTIME,
+                 "openvino": EngineType.OPENVINO,
+                 "paddle": EngineType.PADDLE,
+                 "torch": EngineType.TORCH,
+             }
+             backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)

              self.reader = RapidOCR(
-                 text_score=self.options.text_score,
-                 cls_use_cuda=use_cuda,
-                 rec_use_cuda=use_cuda,
-                 det_use_cuda=use_cuda,
-                 det_use_dml=use_dml,
-                 cls_use_dml=use_dml,
-                 rec_use_dml=use_dml,
-                 intra_op_num_threads=intra_op_num_threads,
-                 print_verbose=self.options.print_verbose,
-                 det_model_path=self.options.det_model_path,
-                 cls_model_path=self.options.cls_model_path,
-                 rec_model_path=self.options.rec_model_path,
-                 rec_keys_path=self.options.rec_keys_path,
+                 params={
+                     # Global settings (these are still correct)
+                     "Global.text_score": self.options.text_score,
+                     # "Global.verbose": self.options.print_verbose,
+                     # Detection model settings
+                     "Det.model_path": self.options.det_model_path,
+                     "Det.use_cuda": use_cuda,
+                     "Det.use_dml": use_dml,
+                     "Det.intra_op_num_threads": intra_op_num_threads,
+                     # Classification model settings
+                     "Cls.model_path": self.options.cls_model_path,
+                     "Cls.use_cuda": use_cuda,
+                     "Cls.use_dml": use_dml,
+                     "Cls.intra_op_num_threads": intra_op_num_threads,
+                     # Recognition model settings
+                     "Rec.model_path": self.options.rec_model_path,
+                     "Rec.font_path": self.options.rec_font_path,
+                     "Rec.keys_path": self.options.rec_keys_path,
+                     "Rec.use_cuda": use_cuda,
+                     "Rec.use_dml": use_dml,
+                     "Rec.intra_op_num_threads": intra_op_num_threads,
+                     "Det.engine_type": backend_enum,
+                     "Cls.engine_type": backend_enum,
+                     "Rec.engine_type": backend_enum,
+                 }
              )

      def __call__(
@@ -95,12 +114,15 @@ class RapidOcrModel(BaseOcrModel):
                              scale=self.scale, cropbox=ocr_rect
                          )
                          im = numpy.array(high_res_image)
-                         result, _ = self.reader(
+                         result = self.reader(
                              im,
                              use_det=self.options.use_det,
                              use_cls=self.options.use_cls,
                              use_rec=self.options.use_rec,
                          )
+                         result = list(
+                             zip(result.boxes.tolist(), result.txts, result.scores)
+                         )

                          del high_res_image
                          del im
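The call-site change tracks rapidocr's new result type: instead of a `(result, elapse)` tuple, the reader now returns an output object exposing `boxes`, `txts`, and `scores`, which the code zips back into the `(box, text, score)` triplets the downstream cell parsing expects. A standalone sketch of that adaptation, hedged on the rapidocr>=2 API as used in this hunk:

    import numpy
    from rapidocr import RapidOCR  # rapidocr>=2 replaces rapidocr_onnxruntime

    reader = RapidOCR()
    output = reader(numpy.zeros((64, 64, 3), dtype=numpy.uint8))  # dummy image
    # Guard against empty results before normalizing to the legacy triplet list.
    triplets = (
        list(zip(output.boxes.tolist(), output.txts, output.scores))
        if output.txts
        else []
    )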