docling 2.48.0__py3-none-any.whl → 2.49.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -1,10 +1,11 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Any, Union, cast
4
+ from typing import Any, Optional, Union, cast
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  BoundingBox,
8
+ ContentLayer,
8
9
  CoordOrigin,
9
10
  DocItem,
10
11
  DoclingDocument,
@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
197
198
  parent=None,
198
199
  label=GroupLabel.SECTION,
199
200
  name=f"sheet: {sheet_name}",
201
+ content_layer=self._get_sheet_content_layer(sheet),
200
202
  )
201
203
  doc = self._convert_sheet(doc, sheet)
202
204
  width, height = self._find_page_size(doc, page_no)
@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
237
239
  """
238
240
 
239
241
  if self.workbook is not None:
242
+ content_layer = self._get_sheet_content_layer(sheet)
240
243
  tables = self._find_data_tables(sheet)
241
244
 
242
245
  for excel_table in tables:
@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
282
285
  origin=CoordOrigin.TOPLEFT,
283
286
  ),
284
287
  ),
288
+ content_layer=content_layer,
285
289
  )
286
290
 
287
291
  return doc
@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
486
490
  The updated DoclingDocument.
487
491
  """
488
492
  if self.workbook is not None:
493
+ content_layer = self._get_sheet_content_layer(sheet)
489
494
  # Iterate over byte images in the sheet
490
495
  for item in sheet._images: # type: ignore[attr-defined]
491
496
  try:
@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
511
516
  anchor, origin=CoordOrigin.TOPLEFT
512
517
  ),
513
518
  ),
519
+ content_layer=content_layer,
514
520
  )
515
521
  except Exception:
516
522
  _log.error("could not extract the image from excel sheets")
@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
536
542
  bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
537
543
 
538
544
  return (right - left, bottom - top)
545
+
546
+ @staticmethod
547
+ def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
548
+ return (
549
+ None
550
+ if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE
551
+ else ContentLayer.INVISIBLE
552
+ )
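
Note: the new _get_sheet_content_layer helper tags everything parsed from a hidden worksheet (the sheet group, its tables, and its images) with ContentLayer.INVISIBLE, while visible sheets return None so items keep the default layer. A minimal sketch of the same check against openpyxl (the workbook file name is hypothetical):

from typing import Optional

from docling_core.types.doc import ContentLayer
from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet


def sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
    # Hidden and "very hidden" sheets go to the invisible layer; visible
    # sheets return None so their items keep the default content layer.
    if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE:
        return None
    return ContentLayer.INVISIBLE


wb = load_workbook("workbook.xlsx")  # hypothetical input file
for ws in wb.worksheets:
    print(ws.title, ws.sheet_state, sheet_content_layer(ws))
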
@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
254
254
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
255
255
  AREA_THRESHOLD = 0 # 32 * 32
256
256
  page_size = self.get_size()
257
+ rotation = self._ppage.get_rotation()
258
+
257
259
  with pypdfium2_lock:
258
260
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
259
261
  pos = obj.get_pos()
262
+ if rotation == 90:
263
+ pos = (
264
+ pos[1],
265
+ page_size.height - pos[2],
266
+ pos[3],
267
+ page_size.height - pos[0],
268
+ )
269
+ elif rotation == 180:
270
+ pos = (
271
+ page_size.width - pos[2],
272
+ page_size.height - pos[3],
273
+ page_size.width - pos[0],
274
+ page_size.height - pos[1],
275
+ )
276
+ elif rotation == 270:
277
+ pos = (
278
+ page_size.width - pos[3],
279
+ pos[0],
280
+ page_size.width - pos[1],
281
+ pos[2],
282
+ )
283
+
260
284
  cropbox = BoundingBox.from_tuple(
261
285
  pos, origin=CoordOrigin.BOTTOMLEFT
262
286
  ).to_top_left_origin(page_height=page_size.height)
263
-
264
287
  if cropbox.area() > AREA_THRESHOLD:
265
288
  cropbox = cropbox.scaled(scale=scale)
266
-
267
289
  yield cropbox
268
290
 
269
291
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
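
Note: the new branches adjust pdfium's reported image-object positions, given as (left, bottom, right, top) with a bottom-left origin, for pages stored with a /Rotate of 90, 180, or 270 degrees, so the resulting bounding boxes line up with the page size returned by get_size(). A worked example of the 90-degree branch with made-up numbers:

page_height = 842.0  # hypothetical height of the rotated page
pos = (100.0, 200.0, 300.0, 400.0)  # (left, bottom, right, top), unrotated frame

rotated = (
    pos[1],                # new left   = old bottom
    page_height - pos[2],  # new bottom = height - old right
    pos[3],                # new right  = old top
    page_height - pos[0],  # new top    = height - old left
)
print(rotated)  # (200.0, 542.0, 400.0, 742.0)
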
@@ -1,7 +1,7 @@
1
1
  import math
2
2
  from collections import defaultdict
3
3
  from enum import Enum
4
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
+ from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
5
5
 
6
6
  import numpy as np
7
7
  from docling_core.types.doc import (
@@ -32,6 +32,18 @@ from pydantic import (
32
32
  if TYPE_CHECKING:
33
33
  from docling.backend.pdf_backend import PdfPageBackend
34
34
 
35
+ from docling.backend.abstract_backend import AbstractDocumentBackend
36
+ from docling.datamodel.pipeline_options import PipelineOptions
37
+
38
+
39
+ class BaseFormatOption(BaseModel):
40
+ """Base class for format options used by _DocumentConversionInput."""
41
+
42
+ pipeline_options: Optional[PipelineOptions] = None
43
+ backend: Type[AbstractDocumentBackend]
44
+
45
+ model_config = ConfigDict(arbitrary_types_allowed=True)
46
+
35
47
 
36
48
  class ConversionStatus(str, Enum):
37
49
  PENDING = "pending"
@@ -2,12 +2,13 @@ import csv
2
2
  import logging
3
3
  import re
4
4
  import tarfile
5
- from collections.abc import Iterable
5
+ from collections.abc import Iterable, Mapping
6
6
  from enum import Enum
7
7
  from io import BytesIO
8
8
  from pathlib import Path, PurePath
9
9
  from typing import (
10
10
  TYPE_CHECKING,
11
+ Any,
11
12
  Dict,
12
13
  List,
13
14
  Literal,
@@ -72,7 +73,7 @@ from docling.utils.profiling import ProfilingItem
72
73
  from docling.utils.utils import create_file_hash
73
74
 
74
75
  if TYPE_CHECKING:
75
- from docling.document_converter import FormatOption
76
+ from docling.datamodel.base_models import BaseFormatOption
76
77
 
77
78
  _log = logging.getLogger(__name__)
78
79
 
@@ -238,7 +239,8 @@ class _DocumentConversionInput(BaseModel):
238
239
  limits: Optional[DocumentLimits] = DocumentLimits()
239
240
 
240
241
  def docs(
241
- self, format_options: Dict[InputFormat, "FormatOption"]
242
+ self,
243
+ format_options: Mapping[InputFormat, "BaseFormatOption"],
242
244
  ) -> Iterable[InputDocument]:
243
245
  for item in self.path_or_stream_iterator:
244
246
  obj = (
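
Note on the signature change: Dict is invariant in its value type, so a Dict[InputFormat, FormatOption] from the converter or a Dict[InputFormat, ExtractionFormatOption] from the extractor would not type-check against Dict[InputFormat, BaseFormatOption]; Mapping is covariant in its values, so both pass. A generic illustration of that typing rule:

from typing import Dict, Mapping


class Base: ...


class Derived(Base): ...


def consume(options: Mapping[str, Base]) -> None:
    for value in options.values():
        print(type(value).__name__)


derived_options: Dict[str, Derived] = {"x": Derived()}
consume(derived_options)  # accepted: Mapping is covariant in its value type
# A parameter annotated Dict[str, Base] would be rejected by type checkers here.
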
@@ -0,0 +1,39 @@
1
+ """Data models for document extraction functionality."""
2
+
3
+ from typing import Any, Dict, List, Optional, Type, Union
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem
8
+ from docling.datamodel.document import InputDocument
9
+
10
+
11
+ class ExtractedPageData(BaseModel):
12
+ """Data model for extracted content from a single page."""
13
+
14
+ page_no: int = Field(..., description="1-indexed page number")
15
+ extracted_data: Optional[Dict[str, Any]] = Field(
16
+ None, description="Extracted structured data from the page"
17
+ )
18
+ raw_text: Optional[str] = Field(None, description="Raw extracted text")
19
+ errors: List[str] = Field(
20
+ default_factory=list,
21
+ description="Any errors encountered during extraction for this page",
22
+ )
23
+
24
+
25
+ class ExtractionResult(BaseModel):
26
+ """Result of document extraction."""
27
+
28
+ input: InputDocument
29
+ status: ConversionStatus = ConversionStatus.PENDING
30
+ errors: List[ErrorItem] = []
31
+
32
+ # Pages field - always a list for consistency
33
+ pages: List[ExtractedPageData] = Field(
34
+ default_factory=list, description="Extracted data from each page"
35
+ )
36
+
37
+
38
+ # Type alias for template parameters that can be string, dict, or BaseModel
39
+ ExtractionTemplateType = Union[str, Dict[str, Any], BaseModel, Type[BaseModel]]
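
The alias admits four template shapes. A small sketch of equivalent templates, using a hypothetical invoice schema:

from pydantic import BaseModel, Field


class Invoice(BaseModel):  # hypothetical extraction schema
    invoice_number: str = Field(examples=["INV-0001"])
    total: float = Field(examples=[123.45])


as_string = '{"invoice_number": "INV-0001", "total": 123.45}'
as_dict = {"invoice_number": "INV-0001", "total": 123.45}
as_model_class = Invoice  # a BaseModel subclass
as_model_instance = Invoice(invoice_number="INV-0001", total=123.45)
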
@@ -37,6 +37,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
37
37
  from docling.datamodel.vlm_model_specs import (
38
38
  GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
39
39
  GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
40
+ NU_EXTRACT_2B_TRANSFORMERS,
40
41
  SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
41
42
  SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
42
43
  VlmModelType,
@@ -113,6 +114,7 @@ class RapidOcrOptions(OcrOptions):
113
114
  cls_model_path: Optional[str] = None # same default as rapidocr
114
115
  rec_model_path: Optional[str] = None # same default as rapidocr
115
116
  rec_keys_path: Optional[str] = None # same default as rapidocr
117
+ rec_font_path: Optional[str] = None # same default as rapidocr
116
118
 
117
119
  model_config = ConfigDict(
118
120
  extra="forbid",
@@ -246,12 +248,9 @@ class OcrEngine(str, Enum):
246
248
  RAPIDOCR = "rapidocr"
247
249
 
248
250
 
249
- class PipelineOptions(BaseModel):
251
+ class PipelineOptions(BaseOptions):
250
252
  """Base pipeline options."""
251
253
 
252
- create_legacy_output: bool = (
253
- True # This default will be set to False on a future version of docling
254
- )
255
254
  document_timeout: Optional[float] = None
256
255
  accelerator_options: AcceleratorOptions = AcceleratorOptions()
257
256
  enable_remote_services: bool = False
@@ -295,6 +294,13 @@ class AsrPipelineOptions(PipelineOptions):
295
294
  artifacts_path: Optional[Union[Path, str]] = None
296
295
 
297
296
 
297
+ class VlmExtractionPipelineOptions(PipelineOptions):
298
+ """Options for extraction pipeline."""
299
+
300
+ artifacts_path: Optional[Union[Path, str]] = None
301
+ vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
302
+
303
+
298
304
  class PdfPipelineOptions(PaginatedPipelineOptions):
299
305
  """Options for the PDF pipeline."""
300
306
 
@@ -247,6 +247,23 @@ DOLPHIN_TRANSFORMERS = InlineVlmOptions(
247
247
  temperature=0.0,
248
248
  )
249
249
 
250
+ # NuExtract
251
+ NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
252
+ repo_id="numind/NuExtract-2.0-2B",
253
+ prompt="", # This won't be used, template is passed separately
254
+ torch_dtype="bfloat16",
255
+ inference_framework=InferenceFramework.TRANSFORMERS,
256
+ transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
257
+ response_format=ResponseFormat.PLAINTEXT,
258
+ supported_devices=[
259
+ AcceleratorDevice.CPU,
260
+ AcceleratorDevice.CUDA,
261
+ AcceleratorDevice.MPS,
262
+ ],
263
+ scale=2.0,
264
+ temperature=0.0,
265
+ )
266
+
250
267
 
251
268
  class VlmModelType(str, Enum):
252
269
  SMOLDOCLING = "smoldocling"
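
The new spec is the default value of vlm_options on VlmExtractionPipelineOptions (added in pipeline_options.py above); passing it explicitly is equivalent:

from docling.datamodel.pipeline_options import VlmExtractionPipelineOptions
from docling.datamodel.vlm_model_specs import NU_EXTRACT_2B_TRANSFORMERS

options = VlmExtractionPipelineOptions(vlm_options=NU_EXTRACT_2B_TRANSFORMERS)
print(options.vlm_options.repo_id)  # numind/NuExtract-2.0-2B
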
@@ -28,6 +28,7 @@ from docling.backend.noop_backend import NoOpBackend
28
28
  from docling.backend.xml.jats_backend import JatsDocumentBackend
29
29
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
30
30
  from docling.datamodel.base_models import (
31
+ BaseFormatOption,
31
32
  ConversionStatus,
32
33
  DoclingComponentType,
33
34
  DocumentStream,
@@ -57,12 +58,8 @@ _log = logging.getLogger(__name__)
57
58
  _PIPELINE_CACHE_LOCK = threading.Lock()
58
59
 
59
60
 
60
- class FormatOption(BaseModel):
61
+ class FormatOption(BaseFormatOption):
61
62
  pipeline_cls: Type[BasePipeline]
62
- pipeline_options: Optional[PipelineOptions] = None
63
- backend: Type[AbstractDocumentBackend]
64
-
65
- model_config = ConfigDict(arbitrary_types_allowed=True)
66
63
 
67
64
  @model_validator(mode="after")
68
65
  def set_optional_field_default(self) -> "FormatOption":
@@ -191,7 +188,7 @@ class DocumentConverter:
191
188
  self.allowed_formats = (
192
189
  allowed_formats if allowed_formats is not None else list(InputFormat)
193
190
  )
194
- self.format_to_options = {
191
+ self.format_to_options: Dict[InputFormat, FormatOption] = {
195
192
  format: (
196
193
  _get_default_option(format=format)
197
194
  if (custom_option := (format_options or {}).get(format)) is None
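
FormatOption keeps its public shape (pipeline_cls plus the pipeline_options, backend, and model_config now inherited from BaseFormatOption), so existing converter configuration is unchanged. A sketch, assuming the usual PdfFormatOption helper and the do_ocr flag on PdfPipelineOptions:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(do_ocr=False)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = converter.convert("document.pdf")  # hypothetical input file
print(result.status)
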
@@ -0,0 +1,325 @@
1
+ import hashlib
2
+ import logging
3
+ import sys
4
+ import threading
5
+ import time
6
+ import warnings
7
+ from collections.abc import Iterable, Iterator
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from functools import partial
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional, Tuple, Type, Union
12
+
13
+ from pydantic import ConfigDict, model_validator, validate_call
14
+
15
+ from docling.backend.abstract_backend import AbstractDocumentBackend
16
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
17
+ from docling.datamodel.base_models import (
18
+ BaseFormatOption,
19
+ ConversionStatus,
20
+ DoclingComponentType,
21
+ DocumentStream,
22
+ ErrorItem,
23
+ InputFormat,
24
+ )
25
+ from docling.datamodel.document import (
26
+ InputDocument,
27
+ _DocumentConversionInput, # intentionally reused builder
28
+ )
29
+ from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
30
+ from docling.datamodel.pipeline_options import PipelineOptions
31
+ from docling.datamodel.settings import (
32
+ DEFAULT_PAGE_RANGE,
33
+ DocumentLimits,
34
+ PageRange,
35
+ settings,
36
+ )
37
+ from docling.exceptions import ConversionError
38
+ from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
39
+ from docling.pipeline.extraction_vlm_pipeline import ExtractionVlmPipeline
40
+ from docling.utils.utils import chunkify
41
+
42
+ _log = logging.getLogger(__name__)
43
+ _PIPELINE_CACHE_LOCK = threading.Lock()
44
+
45
+
46
+ class ExtractionFormatOption(BaseFormatOption):
47
+ """Per-format configuration for extraction.
48
+
49
+ Notes:
50
+ - `pipeline_cls` must subclass `BaseExtractionPipeline`.
51
+ - `pipeline_options` is typed as `PipelineOptions` which MUST inherit from
52
+ `BaseOptions` (as used by `BaseExtractionPipeline`).
53
+ - `backend` is the document-opening backend used by `_DocumentConversionInput`.
54
+ """
55
+
56
+ pipeline_cls: Type[BaseExtractionPipeline]
57
+
58
+ @model_validator(mode="after")
59
+ def set_optional_field_default(self) -> "ExtractionFormatOption":
60
+ if self.pipeline_options is None:
61
+ # `get_default_options` comes from BaseExtractionPipeline
62
+ self.pipeline_options = self.pipeline_cls.get_default_options() # type: ignore[assignment]
63
+ return self
64
+
65
+
66
+ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
67
+ """Return the default extraction option for a given input format.
68
+
69
+ Defaults mirror the converter's *backend* choices, while the pipeline is
70
+ the VLM extractor. This duplication will be removed when we deduplicate
71
+ the format registry between convert/extract.
72
+ """
73
+ format_to_default_backend: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
74
+ InputFormat.IMAGE: PyPdfiumDocumentBackend,
75
+ InputFormat.PDF: PyPdfiumDocumentBackend,
76
+ }
77
+
78
+ backend = format_to_default_backend.get(fmt)
79
+ if backend is None:
80
+ raise RuntimeError(f"No default extraction backend configured for {fmt}")
81
+
82
+ return ExtractionFormatOption(
83
+ pipeline_cls=ExtractionVlmPipeline,
84
+ backend=backend,
85
+ )
86
+
87
+
88
+ class DocumentExtractor:
89
+ """Standalone extractor class.
90
+
91
+ Public API:
92
+ - `extract(...) -> ExtractionResult`
93
+ - `extract_all(...) -> Iterator[ExtractionResult]`
94
+
95
+ Implementation intentionally reuses `_DocumentConversionInput` to build
96
+ `InputDocument` with the correct backend per format.
97
+ """
98
+
99
+ def __init__(
100
+ self,
101
+ allowed_formats: Optional[List[InputFormat]] = None,
102
+ extraction_format_options: Optional[
103
+ Dict[InputFormat, ExtractionFormatOption]
104
+ ] = None,
105
+ ) -> None:
106
+ self.allowed_formats: List[InputFormat] = (
107
+ allowed_formats if allowed_formats is not None else list(InputFormat)
108
+ )
109
+ # Build per-format options with defaults, then apply any user overrides
110
+ overrides = extraction_format_options or {}
111
+ self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {
112
+ fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
113
+ for fmt in self.allowed_formats
114
+ }
115
+
116
+ # Cache pipelines by (class, options-hash)
117
+ self._initialized_pipelines: Dict[
118
+ Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
119
+ ] = {}
120
+
121
+ # ---------------------------- Public API ---------------------------------
122
+
123
+ @validate_call(config=ConfigDict(strict=True))
124
+ def extract(
125
+ self,
126
+ source: Union[Path, str, DocumentStream],
127
+ template: ExtractionTemplateType,
128
+ headers: Optional[Dict[str, str]] = None,
129
+ raises_on_error: bool = True,
130
+ max_num_pages: int = sys.maxsize,
131
+ max_file_size: int = sys.maxsize,
132
+ page_range: PageRange = DEFAULT_PAGE_RANGE,
133
+ ) -> ExtractionResult:
134
+ all_res = self.extract_all(
135
+ source=[source],
136
+ headers=headers,
137
+ raises_on_error=raises_on_error,
138
+ max_num_pages=max_num_pages,
139
+ max_file_size=max_file_size,
140
+ page_range=page_range,
141
+ template=template,
142
+ )
143
+ return next(all_res)
144
+
145
+ @validate_call(config=ConfigDict(strict=True))
146
+ def extract_all(
147
+ self,
148
+ source: Iterable[Union[Path, str, DocumentStream]],
149
+ template: ExtractionTemplateType,
150
+ headers: Optional[Dict[str, str]] = None,
151
+ raises_on_error: bool = True,
152
+ max_num_pages: int = sys.maxsize,
153
+ max_file_size: int = sys.maxsize,
154
+ page_range: PageRange = DEFAULT_PAGE_RANGE,
155
+ ) -> Iterator[ExtractionResult]:
156
+ warnings.warn(
157
+ "The extract API is currently experimental and may change without prior notice.\n"
158
+ "Only PDF and image formats are supported.",
159
+ UserWarning,
160
+ stacklevel=2,
161
+ )
162
+
163
+ limits = DocumentLimits(
164
+ max_num_pages=max_num_pages,
165
+ max_file_size=max_file_size,
166
+ page_range=page_range,
167
+ )
168
+ conv_input = _DocumentConversionInput(
169
+ path_or_stream_iterator=source, limits=limits, headers=headers
170
+ )
171
+
172
+ ext_res_iter = self._extract(
173
+ conv_input, raises_on_error=raises_on_error, template=template
174
+ )
175
+
176
+ had_result = False
177
+ for ext_res in ext_res_iter:
178
+ had_result = True
179
+ if raises_on_error and ext_res.status not in {
180
+ ConversionStatus.SUCCESS,
181
+ ConversionStatus.PARTIAL_SUCCESS,
182
+ }:
183
+ raise ConversionError(
184
+ f"Extraction failed for: {ext_res.input.file} with status: {ext_res.status}"
185
+ )
186
+ else:
187
+ yield ext_res
188
+
189
+ if not had_result and raises_on_error:
190
+ raise ConversionError(
191
+ "Extraction failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
192
+ )
193
+
194
+ # --------------------------- Internal engine ------------------------------
195
+
196
+ def _extract(
197
+ self,
198
+ conv_input: _DocumentConversionInput,
199
+ raises_on_error: bool,
200
+ template: ExtractionTemplateType,
201
+ ) -> Iterator[ExtractionResult]:
202
+ start_time = time.monotonic()
203
+
204
+ for input_batch in chunkify(
205
+ conv_input.docs(self.extraction_format_to_options),
206
+ settings.perf.doc_batch_size,
207
+ ):
208
+ _log.info("Going to extract document batch...")
209
+ process_func = partial(
210
+ self._process_document_extraction,
211
+ raises_on_error=raises_on_error,
212
+ template=template,
213
+ )
214
+
215
+ if (
216
+ settings.perf.doc_batch_concurrency > 1
217
+ and settings.perf.doc_batch_size > 1
218
+ ):
219
+ with ThreadPoolExecutor(
220
+ max_workers=settings.perf.doc_batch_concurrency
221
+ ) as pool:
222
+ for item in pool.map(
223
+ process_func,
224
+ input_batch,
225
+ ):
226
+ yield item
227
+ else:
228
+ for item in map(
229
+ process_func,
230
+ input_batch,
231
+ ):
232
+ elapsed = time.monotonic() - start_time
233
+ start_time = time.monotonic()
234
+ _log.info(
235
+ f"Finished extracting document {item.input.file.name} in {elapsed:.2f} sec."
236
+ )
237
+ yield item
238
+
239
+ def _process_document_extraction(
240
+ self,
241
+ in_doc: InputDocument,
242
+ raises_on_error: bool,
243
+ template: ExtractionTemplateType,
244
+ ) -> ExtractionResult:
245
+ valid = (
246
+ self.allowed_formats is not None and in_doc.format in self.allowed_formats
247
+ )
248
+ if valid:
249
+ return self._execute_extraction_pipeline(
250
+ in_doc, raises_on_error=raises_on_error, template=template
251
+ )
252
+ else:
253
+ error_message = f"File format not allowed: {in_doc.file}"
254
+ if raises_on_error:
255
+ raise ConversionError(error_message)
256
+ else:
257
+ error_item = ErrorItem(
258
+ component_type=DoclingComponentType.USER_INPUT,
259
+ module_name="",
260
+ error_message=error_message,
261
+ )
262
+ return ExtractionResult(
263
+ input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
264
+ )
265
+
266
+ def _execute_extraction_pipeline(
267
+ self,
268
+ in_doc: InputDocument,
269
+ raises_on_error: bool,
270
+ template: ExtractionTemplateType,
271
+ ) -> ExtractionResult:
272
+ if not in_doc.valid:
273
+ if raises_on_error:
274
+ raise ConversionError(f"Input document {in_doc.file} is not valid.")
275
+ else:
276
+ return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
277
+
278
+ pipeline = self._get_pipeline(in_doc.format)
279
+ if pipeline is None:
280
+ if raises_on_error:
281
+ raise ConversionError(
282
+ f"No extraction pipeline could be initialized for {in_doc.file}."
283
+ )
284
+ else:
285
+ return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
286
+
287
+ return pipeline.execute(
288
+ in_doc, raises_on_error=raises_on_error, template=template
289
+ )
290
+
291
+ def _get_pipeline(
292
+ self, doc_format: InputFormat
293
+ ) -> Optional[BaseExtractionPipeline]:
294
+ """Retrieve or initialize a pipeline, reusing instances based on class and options."""
295
+ fopt = self.extraction_format_to_options.get(doc_format)
296
+ if fopt is None or fopt.pipeline_options is None:
297
+ return None
298
+
299
+ pipeline_class = fopt.pipeline_cls
300
+ pipeline_options = fopt.pipeline_options
301
+ options_hash = self._get_pipeline_options_hash(pipeline_options)
302
+
303
+ cache_key = (pipeline_class, options_hash)
304
+ with _PIPELINE_CACHE_LOCK:
305
+ if cache_key not in self._initialized_pipelines:
306
+ _log.info(
307
+ f"Initializing extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
308
+ )
309
+ self._initialized_pipelines[cache_key] = pipeline_class(
310
+ pipeline_options=pipeline_options # type: ignore[arg-type]
311
+ )
312
+ else:
313
+ _log.debug(
314
+ f"Reusing cached extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
315
+ )
316
+
317
+ return self._initialized_pipelines[cache_key]
318
+
319
+ @staticmethod
320
+ def _get_pipeline_options_hash(pipeline_options: PipelineOptions) -> str:
321
+ """Generate a stable hash of pipeline options to use as part of the cache key."""
322
+ options_str = str(pipeline_options.model_dump())
323
+ return hashlib.md5(
324
+ options_str.encode("utf-8"), usedforsecurity=False
325
+ ).hexdigest()
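
End-to-end, the new extractor is used much like the converter; the API is explicitly experimental (see the warning in extract_all) and currently covers PDF and image inputs. A sketch with a hypothetical file and schema:

from pydantic import BaseModel, Field

from docling.document_extractor import DocumentExtractor


class Invoice(BaseModel):  # hypothetical extraction template
    invoice_number: str = Field(examples=["INV-0001"])
    total: float = Field(examples=[123.45])


extractor = DocumentExtractor()
result = extractor.extract(source="invoice.pdf", template=Invoice)  # hypothetical file
for page in result.pages:
    print(page.page_no, page.extracted_data or page.raw_text)
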
@@ -79,6 +79,7 @@ class RapidOcrModel(BaseOcrModel):
79
79
  "Cls.intra_op_num_threads": intra_op_num_threads,
80
80
  # Recognition model settings
81
81
  "Rec.model_path": self.options.rec_model_path,
82
+ "Rec.font_path": self.options.rec_font_path,
82
83
  "Rec.keys_path": self.options.rec_keys_path,
83
84
  "Rec.use_cuda": use_cuda,
84
85
  "Rec.use_dml": use_dml,
@@ -0,0 +1,290 @@
1
+ import logging
2
+ import time
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import Any, Optional, Union
6
+
7
+ import numpy as np
8
+ from PIL.Image import Image
9
+ from transformers import AutoModelForImageTextToText, AutoProcessor, GenerationConfig
10
+
11
+ from docling.datamodel.accelerator_options import (
12
+ AcceleratorOptions,
13
+ )
14
+ from docling.datamodel.base_models import VlmPrediction
15
+ from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
16
+ from docling.models.base_model import BaseVlmModel
17
+ from docling.models.utils.hf_model_download import (
18
+ HuggingFaceModelDownloadMixin,
19
+ )
20
+ from docling.utils.accelerator_utils import decide_device
21
+
22
+ _log = logging.getLogger(__name__)
23
+
24
+
25
+ # Source code from https://huggingface.co/numind/NuExtract-2.0-8B
26
+ def process_all_vision_info(messages, examples=None):
27
+ """
28
+ Process vision information from both messages and in-context examples, supporting batch processing.
29
+
30
+ Args:
31
+ messages: List of message dictionaries (single input) OR list of message lists (batch input)
32
+ examples: Optional list of example dictionaries (single input) OR list of example lists (batch)
33
+
34
+ Returns:
35
+ A flat list of all images in the correct order:
36
+ - For single input: example images followed by message images
37
+ - For batch input: interleaved as (item1 examples, item1 input, item2 examples, item2 input, etc.)
38
+ - Returns None if no images were found
39
+ """
40
+ try:
41
+ from qwen_vl_utils import fetch_image, process_vision_info
42
+ except ImportError:
43
+ raise ImportError(
44
+ "qwen-vl-utils is required for NuExtractTransformersModel. "
45
+ "Please install it with: pip install qwen-vl-utils"
46
+ )
47
+
48
+ from qwen_vl_utils import fetch_image, process_vision_info
49
+
50
+ # Helper function to extract images from examples
51
+ def extract_example_images(example_item):
52
+ if not example_item:
53
+ return []
54
+
55
+ # Handle both list of examples and single example
56
+ examples_to_process = (
57
+ example_item if isinstance(example_item, list) else [example_item]
58
+ )
59
+ images = []
60
+
61
+ for example in examples_to_process:
62
+ if (
63
+ isinstance(example.get("input"), dict)
64
+ and example["input"].get("type") == "image"
65
+ ):
66
+ images.append(fetch_image(example["input"]))
67
+
68
+ return images
69
+
70
+ # Normalize inputs to always be batched format
71
+ is_batch = messages and isinstance(messages[0], list)
72
+ messages_batch = messages if is_batch else [messages]
73
+ is_batch_examples = (
74
+ examples
75
+ and isinstance(examples, list)
76
+ and (isinstance(examples[0], list) or examples[0] is None)
77
+ )
78
+ examples_batch = (
79
+ examples
80
+ if is_batch_examples
81
+ else ([examples] if examples is not None else None)
82
+ )
83
+
84
+ # Ensure examples batch matches messages batch if provided
85
+ if examples and len(examples_batch) != len(messages_batch):
86
+ if not is_batch and len(examples_batch) == 1:
87
+ # Single example set for a single input is fine
88
+ pass
89
+ else:
90
+ raise ValueError("Examples batch length must match messages batch length")
91
+
92
+ # Process all inputs, maintaining correct order
93
+ all_images = []
94
+ for i, message_group in enumerate(messages_batch):
95
+ # Get example images for this input
96
+ if examples and i < len(examples_batch):
97
+ input_example_images = extract_example_images(examples_batch[i])
98
+ all_images.extend(input_example_images)
99
+
100
+ # Get message images for this input
101
+ input_message_images = process_vision_info(message_group)[0] or []
102
+ all_images.extend(input_message_images)
103
+
104
+ return all_images if all_images else None
105
+
106
+
107
+ class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
108
+ def __init__(
109
+ self,
110
+ enabled: bool,
111
+ artifacts_path: Optional[Path],
112
+ accelerator_options: AcceleratorOptions,
113
+ vlm_options: InlineVlmOptions,
114
+ ):
115
+ self.enabled = enabled
116
+ self.vlm_options = vlm_options
117
+
118
+ if self.enabled:
119
+ import torch
120
+
121
+ self.device = decide_device(
122
+ accelerator_options.device,
123
+ supported_devices=vlm_options.supported_devices,
124
+ )
125
+ _log.debug(f"Available device for NuExtract VLM: {self.device}")
126
+
127
+ self.max_new_tokens = vlm_options.max_new_tokens
128
+ self.temperature = vlm_options.temperature
129
+
130
+ repo_cache_folder = vlm_options.repo_id.replace("/", "--")
131
+
132
+ if artifacts_path is None:
133
+ artifacts_path = self.download_models(self.vlm_options.repo_id)
134
+ elif (artifacts_path / repo_cache_folder).exists():
135
+ artifacts_path = artifacts_path / repo_cache_folder
136
+
137
+ self.processor = AutoProcessor.from_pretrained(
138
+ artifacts_path,
139
+ trust_remote_code=vlm_options.trust_remote_code,
140
+ use_fast=True,
141
+ )
142
+ self.processor.tokenizer.padding_side = "left"
143
+
144
+ self.vlm_model = AutoModelForImageTextToText.from_pretrained(
145
+ artifacts_path,
146
+ device_map=self.device,
147
+ torch_dtype=self.vlm_options.torch_dtype,
148
+ _attn_implementation=(
149
+ "flash_attention_2"
150
+ if self.device.startswith("cuda")
151
+ and accelerator_options.cuda_use_flash_attention2
152
+ else "sdpa"
153
+ ),
154
+ trust_remote_code=vlm_options.trust_remote_code,
155
+ )
156
+ self.vlm_model = torch.compile(self.vlm_model) # type: ignore
157
+
158
+ # Load generation config
159
+ self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
160
+
161
+ def process_images(
162
+ self,
163
+ image_batch: Iterable[Union[Image, np.ndarray]],
164
+ prompt: Union[str, list[str]],
165
+ ) -> Iterable[VlmPrediction]:
166
+ """
167
+ Batched inference for NuExtract VLM using the specialized input format.
168
+
169
+ Args:
170
+ image_batch: Iterable of PIL Images or numpy arrays
171
+ prompt: Either:
172
+ - str: Single template used for all images
173
+ - list[str]: List of templates (one per image, must match image count)
174
+ """
175
+ import torch
176
+ from PIL import Image as PILImage
177
+
178
+ # Normalize images to RGB PIL
179
+ pil_images: list[Image] = []
180
+ for img in image_batch:
181
+ if isinstance(img, np.ndarray):
182
+ if img.ndim == 3 and img.shape[2] in (3, 4):
183
+ pil_img = PILImage.fromarray(img.astype(np.uint8))
184
+ elif img.ndim == 2:
185
+ pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
186
+ else:
187
+ raise ValueError(f"Unsupported numpy array shape: {img.shape}")
188
+ else:
189
+ pil_img = img
190
+ if pil_img.mode != "RGB":
191
+ pil_img = pil_img.convert("RGB")
192
+ pil_images.append(pil_img)
193
+
194
+ if not pil_images:
195
+ return
196
+
197
+ # Normalize templates (1 per image)
198
+ if isinstance(prompt, str):
199
+ templates = [prompt] * len(pil_images)
200
+ else:
201
+ if len(prompt) != len(pil_images):
202
+ raise ValueError(
203
+ f"Number of templates ({len(prompt)}) must match number of images ({len(pil_images)})"
204
+ )
205
+ templates = prompt
206
+
207
+ # Construct NuExtract input format
208
+ inputs = []
209
+ for pil_img, template in zip(pil_images, templates):
210
+ input_item = {
211
+ "document": {"type": "image", "image": pil_img},
212
+ "template": template,
213
+ }
214
+ inputs.append(input_item)
215
+
216
+ # Create messages structure for batch processing
217
+ messages = [
218
+ [
219
+ {
220
+ "role": "user",
221
+ "content": [x["document"]],
222
+ }
223
+ ]
224
+ for x in inputs
225
+ ]
226
+
227
+ # Apply chat template to each example individually
228
+ texts = [
229
+ self.processor.tokenizer.apply_chat_template(
230
+ messages[i],
231
+ template=x["template"],
232
+ tokenize=False,
233
+ add_generation_prompt=True,
234
+ )
235
+ for i, x in enumerate(inputs)
236
+ ]
237
+
238
+ # Process vision inputs using qwen-vl-utils
239
+ image_inputs = process_all_vision_info(messages)
240
+
241
+ # Process with the processor
242
+ processor_inputs = self.processor(
243
+ text=texts,
244
+ images=image_inputs,
245
+ padding=True,
246
+ return_tensors="pt",
247
+ **self.vlm_options.extra_processor_kwargs,
248
+ )
249
+ processor_inputs = {k: v.to(self.device) for k, v in processor_inputs.items()}
250
+
251
+ # Generate
252
+ gen_kwargs = {
253
+ **processor_inputs,
254
+ "max_new_tokens": self.max_new_tokens,
255
+ "generation_config": self.generation_config,
256
+ **self.vlm_options.extra_generation_config,
257
+ }
258
+ if self.temperature > 0:
259
+ gen_kwargs["do_sample"] = True
260
+ gen_kwargs["temperature"] = self.temperature
261
+ else:
262
+ gen_kwargs["do_sample"] = False
263
+
264
+ start_time = time.time()
265
+ with torch.inference_mode():
266
+ generated_ids = self.vlm_model.generate(**gen_kwargs)
267
+ generation_time = time.time() - start_time
268
+
269
+ # Trim generated sequences
270
+ input_len = processor_inputs["input_ids"].shape[1]
271
+ trimmed_sequences = generated_ids[:, input_len:]
272
+
273
+ # Decode with the processor/tokenizer
274
+ decoded_texts: list[str] = self.processor.batch_decode(
275
+ trimmed_sequences,
276
+ skip_special_tokens=True,
277
+ clean_up_tokenization_spaces=False,
278
+ )
279
+
280
+ # Optional logging
281
+ if generated_ids.shape[0] > 0: # type: ignore
282
+ _log.debug(
283
+ f"Generated {int(generated_ids[0].shape[0])} tokens in {generation_time:.2f}s "
284
+ f"for batch size {generated_ids.shape[0]}." # type: ignore
285
+ )
286
+
287
+ for text in decoded_texts:
288
+ # Apply decode_response to the output text
289
+ decoded_text = self.vlm_options.decode_response(text)
290
+ yield VlmPrediction(text=decoded_text, generation_time=generation_time)
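
For reference, the per-image payload that process_images assembles before calling the processor looks like this (values made up; it mirrors the inputs/messages construction above):

from PIL import Image

page = Image.new("RGB", (640, 480), "white")  # stand-in for a rendered page
template = '{"invoice_number": "verbatim-string"}'  # NuExtract-style template

inputs = [{"document": {"type": "image", "image": page}, "template": template}]
messages = [[{"role": "user", "content": [item["document"]]}] for item in inputs]
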
@@ -0,0 +1,58 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import Optional
4
+
5
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem
6
+ from docling.datamodel.document import InputDocument
7
+ from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
8
+ from docling.datamodel.pipeline_options import BaseOptions
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
+ class BaseExtractionPipeline(ABC):
14
+ def __init__(self, pipeline_options: BaseOptions):
15
+ self.pipeline_options = pipeline_options
16
+
17
+ def execute(
18
+ self,
19
+ in_doc: InputDocument,
20
+ raises_on_error: bool,
21
+ template: Optional[ExtractionTemplateType] = None,
22
+ ) -> ExtractionResult:
23
+ ext_res = ExtractionResult(input=in_doc)
24
+
25
+ try:
26
+ ext_res = self._extract_data(ext_res, template)
27
+ ext_res.status = self._determine_status(ext_res)
28
+ except Exception as e:
29
+ ext_res.status = ConversionStatus.FAILURE
30
+ error_item = ErrorItem(
31
+ component_type="extraction_pipeline",
32
+ module_name=self.__class__.__name__,
33
+ error_message=str(e),
34
+ )
35
+ ext_res.errors.append(error_item)
36
+ if raises_on_error:
37
+ raise e
38
+
39
+ return ext_res
40
+
41
+ @abstractmethod
42
+ def _extract_data(
43
+ self,
44
+ ext_res: ExtractionResult,
45
+ template: Optional[ExtractionTemplateType] = None,
46
+ ) -> ExtractionResult:
47
+ """Subclass must populate ext_res.pages/errors and return the result."""
48
+ raise NotImplementedError
49
+
50
+ @abstractmethod
51
+ def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
52
+ """Subclass must decide SUCCESS/PARTIAL_SUCCESS/FAILURE based on ext_res."""
53
+ raise NotImplementedError
54
+
55
+ @classmethod
56
+ @abstractmethod
57
+ def get_default_options(cls) -> BaseOptions:
58
+ pass
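
A minimal subclass only has to fill pages, decide a status, and expose default options. An illustrative stub (it records the template instead of running a model):

from typing import Optional

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.extraction import (
    ExtractedPageData,
    ExtractionResult,
    ExtractionTemplateType,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline


class EchoExtractionPipeline(BaseExtractionPipeline):
    def _extract_data(
        self,
        ext_res: ExtractionResult,
        template: Optional[ExtractionTemplateType] = None,
    ) -> ExtractionResult:
        # A real pipeline would render pages and run a model here.
        ext_res.pages.append(ExtractedPageData(page_no=1, raw_text=str(template)))
        return ext_res

    def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
        return ConversionStatus.SUCCESS if ext_res.pages else ConversionStatus.FAILURE

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        return PipelineOptions()
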
@@ -0,0 +1,204 @@
1
+ import inspect
2
+ import json
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from PIL.Image import Image
8
+ from pydantic import BaseModel
9
+
10
+ from docling.backend.abstract_backend import PaginatedDocumentBackend
11
+ from docling.backend.pdf_backend import PdfDocumentBackend
12
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem
13
+ from docling.datamodel.document import InputDocument
14
+ from docling.datamodel.extraction import (
15
+ ExtractedPageData,
16
+ ExtractionResult,
17
+ ExtractionTemplateType,
18
+ )
19
+ from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions
20
+ from docling.datamodel.settings import settings
21
+ from docling.models.vlm_models_inline.nuextract_transformers_model import (
22
+ NuExtractTransformersModel,
23
+ )
24
+ from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
25
+ from docling.utils.accelerator_utils import decide_device
26
+
27
+ _log = logging.getLogger(__name__)
28
+
29
+
30
+ class ExtractionVlmPipeline(BaseExtractionPipeline):
31
+ def __init__(self, pipeline_options: VlmExtractionPipelineOptions):
32
+ super().__init__(pipeline_options)
33
+
34
+ # Initialize VLM model with default options
35
+ self.accelerator_options = pipeline_options.accelerator_options
36
+ self.pipeline_options: VlmExtractionPipelineOptions
37
+
38
+ artifacts_path: Optional[Path] = None
39
+ if pipeline_options.artifacts_path is not None:
40
+ artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
41
+ elif settings.artifacts_path is not None:
42
+ artifacts_path = Path(settings.artifacts_path).expanduser()
43
+
44
+ if artifacts_path is not None and not artifacts_path.is_dir():
45
+ raise RuntimeError(
46
+ f"The value of {artifacts_path=} is not valid. "
47
+ "When defined, it must point to a folder containing all models required by the pipeline."
48
+ )
49
+
50
+ # Create VLM model instance
51
+ self.vlm_model = NuExtractTransformersModel(
52
+ enabled=True,
53
+ artifacts_path=artifacts_path, # Will download automatically
54
+ accelerator_options=self.accelerator_options,
55
+ vlm_options=pipeline_options.vlm_options,
56
+ )
57
+
58
+ def _extract_data(
59
+ self,
60
+ ext_res: ExtractionResult,
61
+ template: Optional[ExtractionTemplateType] = None,
62
+ ) -> ExtractionResult:
63
+ """Extract data using the VLM model."""
64
+ try:
65
+ # Get images from input document using the backend
66
+ images = self._get_images_from_input(ext_res.input)
67
+ if not images:
68
+ ext_res.status = ConversionStatus.FAILURE
69
+ ext_res.errors.append(
70
+ ErrorItem(
71
+ component_type="extraction_pipeline",
72
+ module_name=self.__class__.__name__,
73
+ error_message="No images found in document",
74
+ )
75
+ )
76
+ return ext_res
77
+
78
+ # Use provided template or default prompt
79
+ if template is not None:
80
+ prompt = self._serialize_template(template)
81
+ else:
82
+ prompt = "Extract all text and structured information from this document. Return as JSON."
83
+
84
+ # Process all images with VLM model
85
+ start_page, end_page = ext_res.input.limits.page_range
86
+ for i, image in enumerate(images):
87
+ # Calculate the actual page number based on the filtered range
88
+ page_number = start_page + i
89
+ try:
90
+ predictions = list(self.vlm_model.process_images([image], prompt))
91
+
92
+ if predictions:
93
+ # Parse the extracted text as JSON if possible, otherwise use as-is
94
+ extracted_text = predictions[0].text
95
+ extracted_data = None
96
+
97
+ try:
98
+ extracted_data = json.loads(extracted_text)
99
+ except (json.JSONDecodeError, ValueError):
100
+ # If not valid JSON, keep extracted_data as None
101
+ pass
102
+
103
+ # Create page data with proper structure
104
+ page_data = ExtractedPageData(
105
+ page_no=page_number,
106
+ extracted_data=extracted_data,
107
+ raw_text=extracted_text, # Always populate raw_text
108
+ )
109
+ ext_res.pages.append(page_data)
110
+ else:
111
+ # Add error page data
112
+ page_data = ExtractedPageData(
113
+ page_no=page_number,
114
+ extracted_data=None,
115
+ errors=["No extraction result from VLM model"],
116
+ )
117
+ ext_res.pages.append(page_data)
118
+
119
+ except Exception as e:
120
+ _log.error(f"Error processing page {page_number}: {e}")
121
+ page_data = ExtractedPageData(
122
+ page_no=page_number, extracted_data=None, errors=[str(e)]
123
+ )
124
+ ext_res.pages.append(page_data)
125
+
126
+ except Exception as e:
127
+ _log.error(f"Error during extraction: {e}")
128
+ ext_res.errors.append(
129
+ ErrorItem(
130
+ component_type="extraction_pipeline",
131
+ module_name=self.__class__.__name__,
132
+ error_message=str(e),
133
+ )
134
+ )
135
+
136
+ return ext_res
137
+
138
+ def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
139
+ """Determine the status based on extraction results."""
140
+ if ext_res.pages and not any(page.errors for page in ext_res.pages):
141
+ return ConversionStatus.SUCCESS
142
+ else:
143
+ return ConversionStatus.FAILURE
144
+
145
+ def _get_images_from_input(self, input_doc: InputDocument) -> list[Image]:
146
+ """Extract images from input document using the backend."""
147
+ images = []
148
+
149
+ try:
150
+ backend = input_doc._backend
151
+
152
+ assert isinstance(backend, PdfDocumentBackend)
153
+ # Use the backend's pagination interface
154
+ page_count = backend.page_count()
155
+
156
+ # Respect page range limits, following the same pattern as PaginatedPipeline
157
+ start_page, end_page = input_doc.limits.page_range
158
+ _log.info(
159
+ f"Processing pages {start_page}-{end_page} of {page_count} total pages for extraction"
160
+ )
161
+
162
+ for page_num in range(page_count):
163
+ # Only process pages within the specified range (0-based indexing)
164
+ if start_page - 1 <= page_num <= end_page - 1:
165
+ try:
166
+ page_backend = backend.load_page(page_num)
167
+ if page_backend.is_valid():
168
+ # Get page image at a reasonable scale
169
+ page_image = page_backend.get_page_image(
170
+ scale=self.pipeline_options.vlm_options.scale
171
+ )
172
+ images.append(page_image)
173
+ else:
174
+ _log.warning(f"Page {page_num + 1} backend is not valid")
175
+ except Exception as e:
176
+ _log.error(f"Error loading page {page_num + 1}: {e}")
177
+
178
+ except Exception as e:
179
+ _log.error(f"Error getting images from input document: {e}")
180
+
181
+ return images
182
+
183
+ def _serialize_template(self, template: ExtractionTemplateType) -> str:
184
+ """Serialize template to string based on its type."""
185
+ if isinstance(template, str):
186
+ return template
187
+ elif isinstance(template, dict):
188
+ return json.dumps(template, indent=2)
189
+ elif isinstance(template, BaseModel):
190
+ return template.model_dump_json(indent=2)
191
+ elif inspect.isclass(template) and issubclass(template, BaseModel):
192
+ from polyfactory.factories.pydantic_factory import ModelFactory
193
+
194
+ class ExtractionTemplateFactory(ModelFactory[template]): # type: ignore
195
+ __use_examples__ = True # prefer Field(examples=...) when present
196
+ __use_defaults__ = True # use field defaults instead of random values
197
+
198
+ return ExtractionTemplateFactory.build().model_dump_json(indent=2) # type: ignore
199
+ else:
200
+ raise ValueError(f"Unsupported template type: {type(template)}")
201
+
202
+ @classmethod
203
+ def get_default_options(cls) -> BaseOptions:
204
+ return VlmExtractionPipelineOptions()
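
Wiring this pipeline explicitly into the extractor, with its default options spelled out:

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmExtractionPipelineOptions
from docling.document_extractor import DocumentExtractor, ExtractionFormatOption
from docling.pipeline.extraction_vlm_pipeline import ExtractionVlmPipeline

extractor = DocumentExtractor(
    allowed_formats=[InputFormat.PDF],
    extraction_format_options={
        InputFormat.PDF: ExtractionFormatOption(
            pipeline_cls=ExtractionVlmPipeline,
            pipeline_options=VlmExtractionPipelineOptions(),
            backend=PyPdfiumDocumentBackend,
        )
    },
)
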
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.48.0
3
+ Version: 2.49.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -51,6 +51,7 @@ Requires-Dist: pluggy<2.0.0,>=1.0.0
51
51
  Requires-Dist: pylatexenc<3.0,>=2.10
52
52
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
53
  Requires-Dist: accelerate<2,>=1.0.0
54
+ Requires-Dist: polyfactory>=2.22.2
54
55
  Provides-Extra: tesserocr
55
56
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
56
57
  Provides-Extra: ocrmac
@@ -60,6 +61,7 @@ Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
61
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
62
  Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
63
  Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
64
+ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
63
65
  Provides-Extra: rapidocr
64
66
  Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
65
67
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -1,5 +1,6 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/document_converter.py,sha256=7lid_uhGNuurYICweaA1jqtSbnhf3hpuUYUNleHh-Ww,15924
2
+ docling/document_converter.py,sha256=CKMlobhTt8Y5yZ_tQOnPAP7_otBiddQ_klRGT5Bgwyo,15827
3
+ docling/document_extractor.py,sha256=-RbQRvLWLXF15HYqBbV_lJhh08Zl487UEQKhP-_FR8k,11969
3
4
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
4
5
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
5
6
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -12,12 +13,12 @@ docling/backend/docling_parse_v4_backend.py,sha256=MbCMxNGmoW4iuev9tX1Vt4jtIeak2
12
13
  docling/backend/html_backend.py,sha256=MqtU9fA83lcjqb85lFTmGDedOH72WxTmwvj0ZzPur1I,42224
13
14
  docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
14
15
  docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
15
- docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
16
+ docling/backend/msexcel_backend.py,sha256=5JRbPwOjR1r45AMeIts1rj6InbOgLBf_CtAhvNPVmsQ,19157
16
17
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
17
18
  docling/backend/msword_backend.py,sha256=fKeAMGGR5ABimedo_ofCQAybzdqmqWA3A3mpLl7X6qY,49129
18
19
  docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
19
20
  docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
20
- docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
21
+ docling/backend/pypdfium2_backend.py,sha256=AYhWs9S8W_TkAK0-OkRmUNf4HUZl26FP7-XYjwU5zDk,14209
21
22
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
23
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
24
  docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
@@ -35,14 +36,15 @@ docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
35
36
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
37
  docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
37
38
  docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
38
- docling/datamodel/base_models.py,sha256=OI2-tBjH3PZMF_Zyyc4eezJ4gFXIBiKT4BYKYy6n81E,11924
39
- docling/datamodel/document.py,sha256=zsxFYXvo6GtwGNogSDoBB1TFvkm7IOrP_VnqXNqBhJs,17329
39
+ docling/datamodel/base_models.py,sha256=vOt895z0GsFirHkkI3hM23e9oyUuz9RXfcGFtoINLtw,12334
40
+ docling/datamodel/document.py,sha256=ElY7G6FYJ6Bayyw433_tbnxyE47fnQRoBG_mygvOBrA,17370
41
+ docling/datamodel/extraction.py,sha256=7dgvtK5SuvgfB8LHAwS1FwrW1kcMQJuJG0ol8uAQgoQ,1323
40
42
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
41
- docling/datamodel/pipeline_options.py,sha256=0Qk2nyzEo90NWxSKaiHaVhIV_6zB20CXwC-Icn7g3gw,10760
43
+ docling/datamodel/pipeline_options.py,sha256=0J0xVOSfI3pqRMkXlzX_rtmVBgCTsR2QJz54xugP8sg,10963
42
44
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
43
45
  docling/datamodel/pipeline_options_vlm_model.py,sha256=AcqqThSW74hwQ6x7pazzm57LnJiUqB7gQi5wFayGlbk,2628
44
46
  docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
45
- docling/datamodel/vlm_model_specs.py,sha256=dFObfYlPyN7AbTCudsubsWvWTTx4F4Xz9GEJPkEV2_M,8175
47
+ docling/datamodel/vlm_model_specs.py,sha256=8D-bF95EoaD-Wd29lVX094HPJT1gYN393aFmzv7RipQ,8713
46
48
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
49
  docling/models/api_vlm_model.py,sha256=-zisU32pgDRbychyG6-neB0qweNbPaYnLXwiGT7SEdI,2859
48
50
  docling/models/base_model.py,sha256=tXFM7zJwF6Kn2EhtaB4QmgK4O2ruv1C7SjdBgM5QKak,6225
@@ -57,7 +59,7 @@ docling/models/page_preprocessing_model.py,sha256=rHNX1uP1ScTjVUlsxZ0eamK2uNUqI9
57
59
  docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
58
60
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
59
61
  docling/models/picture_description_vlm_model.py,sha256=5BJvaF3PHuL9lCVYqPv9krh3h_7YwNSdKYw1EVEj13k,4156
60
- docling/models/rapid_ocr_model.py,sha256=h5f-UMPzGoKv7jJKkH1bkb1OcB33zxs3yZpIFOgZdsw,7037
62
+ docling/models/rapid_ocr_model.py,sha256=7yZC7I1qoC9xC8xJIjTk2c8VFm89RfB6Vr7IDOnr5gs,7102
61
63
  docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
62
64
  docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
63
65
  docling/models/tesseract_ocr_cli_model.py,sha256=I3Gn28Y-LD8OfvyCElN9fLiNgpo2sT0uMkVt258253s,12881
@@ -73,10 +75,13 @@ docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnu
73
75
  docling/models/vlm_models_inline/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
74
76
  docling/models/vlm_models_inline/hf_transformers_model.py,sha256=G0RpKwdzm5NiqIBHG5nWLwBsrDfDebzErzRkyXppZPw,12134
75
77
  docling/models/vlm_models_inline/mlx_model.py,sha256=VP05v97mqzmaG4o9bOpJcxIlEqvNzAapJ15Zz3E3ACI,10169
78
+ docling/models/vlm_models_inline/nuextract_transformers_model.py,sha256=iWoGF8TgQfOOMqS__tSODcUuDnKTPaK7gIRFum5bPzc,10512
76
79
  docling/models/vlm_models_inline/vllm_model.py,sha256=_EnK1nfpAPJky7aRlyp8SUIghiZOQO8AkDN_hHqXLZg,8615
77
80
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
81
  docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
82
+ docling/pipeline/base_extraction_pipeline.py,sha256=aJj7qbppgAelwoaVKB1W-s7kFg_OcXRE64NpIIOxZGE,1905
79
83
  docling/pipeline/base_pipeline.py,sha256=Tl_C3adFABNxtE7hX83VSdx-j7D8GRvoFcno5A3Z-YQ,10062
84
+ docling/pipeline/extraction_vlm_pipeline.py,sha256=WIRZygpBJmKjszRsFqW4qfPUZ5Frd_Hqoiysp2dGx8Y,8723
80
85
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
81
86
  docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
82
87
  docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=NgdZxpfpElnvCgGlrQ8kSvq44LNzJcc6wOqD-AMrKZ0,26132
@@ -94,9 +99,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
94
99
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
95
100
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
96
101
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
97
- docling-2.48.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
98
- docling-2.48.0.dist-info/METADATA,sha256=EEjk7em4miqz1ZEyCZg9lRnzPBsoOljSwSFfi12a98g,10643
99
- docling-2.48.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
- docling-2.48.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
101
- docling-2.48.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
102
- docling-2.48.0.dist-info/RECORD,,
102
+ docling-2.49.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
103
+ docling-2.49.0.dist-info/METADATA,sha256=Gn1u-LwLRMCqHamlyu1M4w9a8NvGfk-jfcCh0XjhsfQ,10731
104
+ docling-2.49.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
105
+ docling-2.49.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
106
+ docling-2.49.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
107
+ docling-2.49.0.dist-info/RECORD,,