docling 2.48.0__py3-none-any.whl → 2.49.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/msexcel_backend.py +15 -1
- docling/backend/pypdfium2_backend.py +24 -2
- docling/datamodel/base_models.py +13 -1
- docling/datamodel/document.py +5 -3
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/pipeline_options.py +10 -4
- docling/datamodel/vlm_model_specs.py +17 -0
- docling/document_converter.py +3 -6
- docling/document_extractor.py +325 -0
- docling/models/rapid_ocr_model.py +1 -0
- docling/models/vlm_models_inline/nuextract_transformers_model.py +290 -0
- docling/pipeline/base_extraction_pipeline.py +58 -0
- docling/pipeline/extraction_vlm_pipeline.py +204 -0
- {docling-2.48.0.dist-info → docling-2.49.0.dist-info}/METADATA +3 -1
- {docling-2.48.0.dist-info → docling-2.49.0.dist-info}/RECORD +19 -14
- {docling-2.48.0.dist-info → docling-2.49.0.dist-info}/WHEEL +0 -0
- {docling-2.48.0.dist-info → docling-2.49.0.dist-info}/entry_points.txt +0 -0
- {docling-2.48.0.dist-info → docling-2.49.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.48.0.dist-info → docling-2.49.0.dist-info}/top_level.txt +0 -0
docling/backend/msexcel_backend.py
CHANGED
@@ -1,10 +1,11 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Union, cast
+from typing import Any, Optional, Union, cast

 from docling_core.types.doc import (
     BoundingBox,
+    ContentLayer,
     CoordOrigin,
     DocItem,
     DoclingDocument,

@@ -197,6 +198,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                 parent=None,
                 label=GroupLabel.SECTION,
                 name=f"sheet: {sheet_name}",
+                content_layer=self._get_sheet_content_layer(sheet),
             )
             doc = self._convert_sheet(doc, sheet)
             width, height = self._find_page_size(doc, page_no)

@@ -237,6 +239,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         """

         if self.workbook is not None:
+            content_layer = self._get_sheet_content_layer(sheet)
             tables = self._find_data_tables(sheet)

             for excel_table in tables:

@@ -282,6 +285,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                         origin=CoordOrigin.TOPLEFT,
                     ),
                 ),
+                content_layer=content_layer,
             )

         return doc

@@ -486,6 +490,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             The updated DoclingDocument.
         """
         if self.workbook is not None:
+            content_layer = self._get_sheet_content_layer(sheet)
             # Iterate over byte images in the sheet
             for item in sheet._images:  # type: ignore[attr-defined]
                 try:

@@ -511,6 +516,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                             anchor, origin=CoordOrigin.TOPLEFT
                         ),
                     ),
+                    content_layer=content_layer,
                 )
             except Exception:
                 _log.error("could not extract the image from excel sheets")

@@ -536,3 +542,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
             bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b

         return (right - left, bottom - top)
+
+    @staticmethod
+    def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
+        return (
+            None
+            if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE
+            else ContentLayer.INVISIBLE
+        )
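The new helper leaves visible sheets on the default content layer and routes hidden ones to `ContentLayer.INVISIBLE`. A minimal sketch of the same check against openpyxl's sheet states (the workbook path is hypothetical):

```python
from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet

wb = load_workbook("report.xlsx")  # hypothetical workbook
for ws in wb.worksheets:
    # sheet_state is "visible", "hidden", or "veryHidden" in openpyxl
    hidden = ws.sheet_state != Worksheet.SHEETSTATE_VISIBLE
    print(f"{ws.title}: {'INVISIBLE layer' if hidden else 'default layer'}")
```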
docling/backend/pypdfium2_backend.py
CHANGED
@@ -254,16 +254,38 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32
         page_size = self.get_size()
+        rotation = self._ppage.get_rotation()
+
        with pypdfium2_lock:
            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
                pos = obj.get_pos()
+                if rotation == 90:
+                    pos = (
+                        pos[1],
+                        page_size.height - pos[2],
+                        pos[3],
+                        page_size.height - pos[0],
+                    )
+                elif rotation == 180:
+                    pos = (
+                        page_size.width - pos[2],
+                        page_size.height - pos[3],
+                        page_size.width - pos[0],
+                        page_size.height - pos[1],
+                    )
+                elif rotation == 270:
+                    pos = (
+                        page_size.width - pos[3],
+                        pos[0],
+                        page_size.width - pos[1],
+                        pos[2],
+                    )
+
                cropbox = BoundingBox.from_tuple(
                    pos, origin=CoordOrigin.BOTTOMLEFT
                ).to_top_left_origin(page_height=page_size.height)
-
                if cropbox.area() > AREA_THRESHOLD:
                    cropbox = cropbox.scaled(scale=scale)
-
                    yield cropbox

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
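As I read this hunk, `obj.get_pos()` yields an `(l, b, r, t)` tuple that must be remapped into the rotated page's frame when the page carries a 90/180/270 rotation. A standalone sketch of the same arithmetic (the function name is ours, not docling's):

```python
def remap_pos(pos, rotation, width, height):
    """Apply the same (l, b, r, t) remapping as the hunk above."""
    if rotation == 90:
        return (pos[1], height - pos[2], pos[3], height - pos[0])
    if rotation == 180:
        return (width - pos[2], height - pos[3], width - pos[0], height - pos[1])
    if rotation == 270:
        return (width - pos[3], pos[0], width - pos[1], pos[2])
    return pos

# On a 100x200 page rotated 180 degrees, a box at (10, 20, 30, 60)
# maps to (70, 140, 90, 180) -- still a valid l < r, b < t box.
print(remap_pos((10, 20, 30, 60), 180, 100, 200))
```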
docling/datamodel/base_models.py
CHANGED
@@ -1,7 +1,7 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union

 import numpy as np
 from docling_core.types.doc import (

@@ -32,6 +32,18 @@ from pydantic import (
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend

+    from docling.backend.abstract_backend import AbstractDocumentBackend
+    from docling.datamodel.pipeline_options import PipelineOptions
+
+
+class BaseFormatOption(BaseModel):
+    """Base class for format options used by _DocumentConversionInput."""
+
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+

 class ConversionStatus(str, Enum):
     PENDING = "pending"
docling/datamodel/document.py
CHANGED
@@ -2,12 +2,13 @@ import csv
 import logging
 import re
 import tarfile
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
+    Any,
     Dict,
     List,
     Literal,

@@ -72,7 +73,7 @@ from docling.utils.profiling import ProfilingItem
 from docling.utils.utils import create_file_hash

 if TYPE_CHECKING:
-    from docling.
+    from docling.datamodel.base_models import BaseFormatOption

 _log = logging.getLogger(__name__)

@@ -238,7 +239,8 @@ class _DocumentConversionInput(BaseModel):
     limits: Optional[DocumentLimits] = DocumentLimits()

     def docs(
-        self,
+        self,
+        format_options: Mapping[InputFormat, "BaseFormatOption"],
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
             obj = (
docling/datamodel/extraction.py
ADDED
@@ -0,0 +1,39 @@
+"""Data models for document extraction functionality."""
+
+from typing import Any, Dict, List, Optional, Type, Union
+
+from pydantic import BaseModel, Field
+
+from docling.datamodel.base_models import ConversionStatus, ErrorItem
+from docling.datamodel.document import InputDocument
+
+
+class ExtractedPageData(BaseModel):
+    """Data model for extracted content from a single page."""
+
+    page_no: int = Field(..., description="1-indexed page number")
+    extracted_data: Optional[Dict[str, Any]] = Field(
+        None, description="Extracted structured data from the page"
+    )
+    raw_text: Optional[str] = Field(None, description="Raw extracted text")
+    errors: List[str] = Field(
+        default_factory=list,
+        description="Any errors encountered during extraction for this page",
+    )
+
+
+class ExtractionResult(BaseModel):
+    """Result of document extraction."""
+
+    input: InputDocument
+    status: ConversionStatus = ConversionStatus.PENDING
+    errors: List[ErrorItem] = []
+
+    # Pages field - always a list for consistency
+    pages: List[ExtractedPageData] = Field(
+        default_factory=list, description="Extracted data from each page"
+    )
+
+
+# Type alias for template parameters that can be string, dict, or BaseModel
+ExtractionTemplateType = Union[str, Dict[str, Any], BaseModel, Type[BaseModel]]
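`ExtractionTemplateType` admits four shapes; a quick sketch of each (field names are illustrative only):

```python
from pydantic import BaseModel, Field


class Invoice(BaseModel):
    vendor: str = Field(examples=["ACME Corp."])
    total: float = 0.0


template_as_str = '{"vendor": "verbatim-string", "total": "number"}'
template_as_dict = {"vendor": "verbatim-string", "total": "number"}
template_as_instance = Invoice(vendor="ACME Corp.", total=0.0)
template_as_class = Invoice  # serialized via polyfactory downstream
```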
docling/datamodel/pipeline_options.py
CHANGED
@@ -37,6 +37,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
 from docling.datamodel.vlm_model_specs import (
     GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
     GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
+    NU_EXTRACT_2B_TRANSFORMERS,
     SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
     SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
     VlmModelType,

@@ -113,6 +114,7 @@ class RapidOcrOptions(OcrOptions):
     cls_model_path: Optional[str] = None  # same default as rapidocr
     rec_model_path: Optional[str] = None  # same default as rapidocr
     rec_keys_path: Optional[str] = None  # same default as rapidocr
+    rec_font_path: Optional[str] = None  # same default as rapidocr

     model_config = ConfigDict(
         extra="forbid",

@@ -246,12 +248,9 @@ class OcrEngine(str, Enum):
     RAPIDOCR = "rapidocr"


-class PipelineOptions(
+class PipelineOptions(BaseOptions):
     """Base pipeline options."""

-    create_legacy_output: bool = (
-        True  # This default will be set to False on a future version of docling
-    )
     document_timeout: Optional[float] = None
     accelerator_options: AcceleratorOptions = AcceleratorOptions()
     enable_remote_services: bool = False

@@ -295,6 +294,13 @@ class AsrPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None


+class VlmExtractionPipelineOptions(PipelineOptions):
+    """Options for extraction pipeline."""
+
+    artifacts_path: Optional[Union[Path, str]] = None
+    vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
+
+
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

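A minimal configuration sketch for the new options class; `artifacts_path` is hypothetical and only needed when model weights are pre-downloaded, since `vlm_options` already defaults to the NuExtract 2.0 2B spec:

```python
from docling.datamodel.pipeline_options import VlmExtractionPipelineOptions

opts = VlmExtractionPipelineOptions(artifacts_path="~/.cache/docling/models")
print(opts.vlm_options.repo_id)  # numind/NuExtract-2.0-2B
```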
docling/datamodel/vlm_model_specs.py
CHANGED
@@ -247,6 +247,23 @@ DOLPHIN_TRANSFORMERS = InlineVlmOptions(
     temperature=0.0,
 )

+# NuExtract
+NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
+    repo_id="numind/NuExtract-2.0-2B",
+    prompt="",  # This won't be used, template is passed separately
+    torch_dtype="bfloat16",
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    response_format=ResponseFormat.PLAINTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+)
+

 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
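Since `InlineVlmOptions` is a pydantic model, a variant spec can be derived without mutating the shared constant; the higher rendering scale below is purely illustrative:

```python
from docling.datamodel.vlm_model_specs import NU_EXTRACT_2B_TRANSFORMERS

# Render pages at 3x instead of the default 2x (illustrative tweak).
hi_res_nuextract = NU_EXTRACT_2B_TRANSFORMERS.model_copy(update={"scale": 3.0})
```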
docling/document_converter.py
CHANGED
@@ -28,6 +28,7 @@ from docling.backend.noop_backend import NoOpBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
+    BaseFormatOption,
     ConversionStatus,
     DoclingComponentType,
     DocumentStream,

@@ -57,12 +58,8 @@ _log = logging.getLogger(__name__)
 _PIPELINE_CACHE_LOCK = threading.Lock()


-class FormatOption(
+class FormatOption(BaseFormatOption):
     pipeline_cls: Type[BasePipeline]
-    pipeline_options: Optional[PipelineOptions] = None
-    backend: Type[AbstractDocumentBackend]
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)

     @model_validator(mode="after")
     def set_optional_field_default(self) -> "FormatOption":

@@ -191,7 +188,7 @@ class DocumentConverter:
         self.allowed_formats = (
             allowed_formats if allowed_formats is not None else list(InputFormat)
         )
-        self.format_to_options = {
+        self.format_to_options: Dict[InputFormat, FormatOption] = {
             format: (
                 _get_default_option(format=format)
                 if (custom_option := (format_options or {}).get(format)) is None
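`FormatOption` now inherits its `pipeline_options`/`backend` fields from the shared `BaseFormatOption`, so converter configuration is unchanged for callers. A usage sketch (the input file is hypothetical):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

opts = PdfPipelineOptions(document_timeout=120.0)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
)
result = converter.convert("sample.pdf")  # hypothetical input file
```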
docling/document_extractor.py
ADDED
@@ -0,0 +1,325 @@
+import hashlib
+import logging
+import sys
+import threading
+import time
+import warnings
+from collections.abc import Iterable, Iterator
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Type, Union
+
+from pydantic import ConfigDict, model_validator, validate_call
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import (
+    BaseFormatOption,
+    ConversionStatus,
+    DoclingComponentType,
+    DocumentStream,
+    ErrorItem,
+    InputFormat,
+)
+from docling.datamodel.document import (
+    InputDocument,
+    _DocumentConversionInput,  # intentionally reused builder
+)
+from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
+from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.settings import (
+    DEFAULT_PAGE_RANGE,
+    DocumentLimits,
+    PageRange,
+    settings,
+)
+from docling.exceptions import ConversionError
+from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
+from docling.pipeline.extraction_vlm_pipeline import ExtractionVlmPipeline
+from docling.utils.utils import chunkify
+
+_log = logging.getLogger(__name__)
+_PIPELINE_CACHE_LOCK = threading.Lock()
+
+
+class ExtractionFormatOption(BaseFormatOption):
+    """Per-format configuration for extraction.
+
+    Notes:
+    - `pipeline_cls` must subclass `BaseExtractionPipeline`.
+    - `pipeline_options` is typed as `PipelineOptions` which MUST inherit from
+      `BaseOptions` (as used by `BaseExtractionPipeline`).
+    - `backend` is the document-opening backend used by `_DocumentConversionInput`.
+    """
+
+    pipeline_cls: Type[BaseExtractionPipeline]
+
+    @model_validator(mode="after")
+    def set_optional_field_default(self) -> "ExtractionFormatOption":
+        if self.pipeline_options is None:
+            # `get_default_options` comes from BaseExtractionPipeline
+            self.pipeline_options = self.pipeline_cls.get_default_options()  # type: ignore[assignment]
+        return self
+
+
+def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
+    """Return the default extraction option for a given input format.
+
+    Defaults mirror the converter's *backend* choices, while the pipeline is
+    the VLM extractor. This duplication will be removed when we deduplicate
+    the format registry between convert/extract.
+    """
+    format_to_default_backend: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
+        InputFormat.IMAGE: PyPdfiumDocumentBackend,
+        InputFormat.PDF: PyPdfiumDocumentBackend,
+    }
+
+    backend = format_to_default_backend.get(fmt)
+    if backend is None:
+        raise RuntimeError(f"No default extraction backend configured for {fmt}")
+
+    return ExtractionFormatOption(
+        pipeline_cls=ExtractionVlmPipeline,
+        backend=backend,
+    )
+
+
+class DocumentExtractor:
+    """Standalone extractor class.
+
+    Public API:
+    - `extract(...) -> ExtractionResult`
+    - `extract_all(...) -> Iterator[ExtractionResult]`
+
+    Implementation intentionally reuses `_DocumentConversionInput` to build
+    `InputDocument` with the correct backend per format.
+    """
+
+    def __init__(
+        self,
+        allowed_formats: Optional[List[InputFormat]] = None,
+        extraction_format_options: Optional[
+            Dict[InputFormat, ExtractionFormatOption]
+        ] = None,
+    ) -> None:
+        self.allowed_formats: List[InputFormat] = (
+            allowed_formats if allowed_formats is not None else list(InputFormat)
+        )
+        # Build per-format options with defaults, then apply any user overrides
+        overrides = extraction_format_options or {}
+        self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {
+            fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
+            for fmt in self.allowed_formats
+        }
+
+        # Cache pipelines by (class, options-hash)
+        self._initialized_pipelines: Dict[
+            Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
+        ] = {}
+
+    # ---------------------------- Public API ---------------------------------
+
+    @validate_call(config=ConfigDict(strict=True))
+    def extract(
+        self,
+        source: Union[Path, str, DocumentStream],
+        template: ExtractionTemplateType,
+        headers: Optional[Dict[str, str]] = None,
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+        page_range: PageRange = DEFAULT_PAGE_RANGE,
+    ) -> ExtractionResult:
+        all_res = self.extract_all(
+            source=[source],
+            headers=headers,
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+            page_range=page_range,
+            template=template,
+        )
+        return next(all_res)
+
+    @validate_call(config=ConfigDict(strict=True))
+    def extract_all(
+        self,
+        source: Iterable[Union[Path, str, DocumentStream]],
+        template: ExtractionTemplateType,
+        headers: Optional[Dict[str, str]] = None,
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+        page_range: PageRange = DEFAULT_PAGE_RANGE,
+    ) -> Iterator[ExtractionResult]:
+        warnings.warn(
+            "The extract API is currently experimental and may change without prior notice.\n"
+            "Only PDF and image formats are supported.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+            page_range=page_range,
+        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source, limits=limits, headers=headers
+        )
+
+        ext_res_iter = self._extract(
+            conv_input, raises_on_error=raises_on_error, template=template
+        )
+
+        had_result = False
+        for ext_res in ext_res_iter:
+            had_result = True
+            if raises_on_error and ext_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise ConversionError(
+                    f"Extraction failed for: {ext_res.input.file} with status: {ext_res.status}"
+                )
+            else:
+                yield ext_res
+
+        if not had_result and raises_on_error:
+            raise ConversionError(
+                "Extraction failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+            )
+
+    # --------------------------- Internal engine ------------------------------
+
+    def _extract(
+        self,
+        conv_input: _DocumentConversionInput,
+        raises_on_error: bool,
+        template: ExtractionTemplateType,
+    ) -> Iterator[ExtractionResult]:
+        start_time = time.monotonic()
+
+        for input_batch in chunkify(
+            conv_input.docs(self.extraction_format_to_options),
+            settings.perf.doc_batch_size,
+        ):
+            _log.info("Going to extract document batch...")
+            process_func = partial(
+                self._process_document_extraction,
+                raises_on_error=raises_on_error,
+                template=template,
+            )
+
+            if (
+                settings.perf.doc_batch_concurrency > 1
+                and settings.perf.doc_batch_size > 1
+            ):
+                with ThreadPoolExecutor(
+                    max_workers=settings.perf.doc_batch_concurrency
+                ) as pool:
+                    for item in pool.map(
+                        process_func,
+                        input_batch,
+                    ):
+                        yield item
+            else:
+                for item in map(
+                    process_func,
+                    input_batch,
+                ):
+                    elapsed = time.monotonic() - start_time
+                    start_time = time.monotonic()
+                    _log.info(
+                        f"Finished extracting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
+                    yield item
+
+    def _process_document_extraction(
+        self,
+        in_doc: InputDocument,
+        raises_on_error: bool,
+        template: ExtractionTemplateType,
+    ) -> ExtractionResult:
+        valid = (
+            self.allowed_formats is not None and in_doc.format in self.allowed_formats
+        )
+        if valid:
+            return self._execute_extraction_pipeline(
+                in_doc, raises_on_error=raises_on_error, template=template
+            )
+        else:
+            error_message = f"File format not allowed: {in_doc.file}"
+            if raises_on_error:
+                raise ConversionError(error_message)
+            else:
+                error_item = ErrorItem(
+                    component_type=DoclingComponentType.USER_INPUT,
+                    module_name="",
+                    error_message=error_message,
+                )
+                return ExtractionResult(
+                    input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
+                )
+
+    def _execute_extraction_pipeline(
+        self,
+        in_doc: InputDocument,
+        raises_on_error: bool,
+        template: ExtractionTemplateType,
+    ) -> ExtractionResult:
+        if not in_doc.valid:
+            if raises_on_error:
+                raise ConversionError(f"Input document {in_doc.file} is not valid.")
+            else:
+                return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
+
+        pipeline = self._get_pipeline(in_doc.format)
+        if pipeline is None:
+            if raises_on_error:
+                raise ConversionError(
+                    f"No extraction pipeline could be initialized for {in_doc.file}."
+                )
+            else:
+                return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)
+
+        return pipeline.execute(
+            in_doc, raises_on_error=raises_on_error, template=template
+        )
+
+    def _get_pipeline(
+        self, doc_format: InputFormat
+    ) -> Optional[BaseExtractionPipeline]:
+        """Retrieve or initialize a pipeline, reusing instances based on class and options."""
+        fopt = self.extraction_format_to_options.get(doc_format)
+        if fopt is None or fopt.pipeline_options is None:
+            return None
+
+        pipeline_class = fopt.pipeline_cls
+        pipeline_options = fopt.pipeline_options
+        options_hash = self._get_pipeline_options_hash(pipeline_options)
+
+        cache_key = (pipeline_class, options_hash)
+        with _PIPELINE_CACHE_LOCK:
+            if cache_key not in self._initialized_pipelines:
+                _log.info(
+                    f"Initializing extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+                )
+                self._initialized_pipelines[cache_key] = pipeline_class(
+                    pipeline_options=pipeline_options  # type: ignore[arg-type]
+                )
+            else:
+                _log.debug(
+                    f"Reusing cached extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+                )
+
+        return self._initialized_pipelines[cache_key]
+
+    @staticmethod
+    def _get_pipeline_options_hash(pipeline_options: PipelineOptions) -> str:
+        """Generate a stable hash of pipeline options to use as part of the cache key."""
+        options_str = str(pipeline_options.model_dump())
+        return hashlib.md5(
+            options_str.encode("utf-8"), usedforsecurity=False
+        ).hexdigest()
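`DocumentExtractor` is the entry point for the experimental extract API above. A minimal usage sketch, assuming a local `invoice.pdf` (hypothetical) and a caller-defined pydantic template:

```python
from pydantic import BaseModel, Field

from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor


class Invoice(BaseModel):
    vendor: str = Field(examples=["ACME Corp."])
    total: float = Field(examples=[1234.56])


extractor = DocumentExtractor(allowed_formats=[InputFormat.PDF])
result = extractor.extract(source="invoice.pdf", template=Invoice)
for page in result.pages:
    print(page.page_no, page.extracted_data or page.raw_text)
```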
docling/models/rapid_ocr_model.py
CHANGED
@@ -79,6 +79,7 @@ class RapidOcrModel(BaseOcrModel):
             "Cls.intra_op_num_threads": intra_op_num_threads,
             # Recognition model settings
             "Rec.model_path": self.options.rec_model_path,
+            "Rec.font_path": self.options.rec_font_path,
             "Rec.keys_path": self.options.rec_keys_path,
             "Rec.use_cuda": use_cuda,
             "Rec.use_dml": use_dml,
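The new `rec_font_path` option is forwarded to RapidOCR as `Rec.font_path` alongside the existing recognition paths. A configuration sketch (all paths hypothetical):

```python
from docling.datamodel.pipeline_options import RapidOcrOptions

ocr_options = RapidOcrOptions(
    rec_model_path="/models/rapidocr/rec.onnx",
    rec_keys_path="/models/rapidocr/keys.txt",
    rec_font_path="/models/rapidocr/font.ttf",  # new in 2.49.0
)
```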
docling/models/vlm_models_inline/nuextract_transformers_model.py
ADDED
@@ -0,0 +1,290 @@
+import logging
+import time
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Optional, Union
+
+import numpy as np
+from PIL.Image import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor, GenerationConfig
+
+from docling.datamodel.accelerator_options import (
+    AcceleratorOptions,
+)
+from docling.datamodel.base_models import VlmPrediction
+from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
+from docling.models.base_model import BaseVlmModel
+from docling.models.utils.hf_model_download import (
+    HuggingFaceModelDownloadMixin,
+)
+from docling.utils.accelerator_utils import decide_device
+
+_log = logging.getLogger(__name__)
+
+
+# Source code from https://huggingface.co/numind/NuExtract-2.0-8B
+def process_all_vision_info(messages, examples=None):
+    """
+    Process vision information from both messages and in-context examples, supporting batch processing.
+
+    Args:
+        messages: List of message dictionaries (single input) OR list of message lists (batch input)
+        examples: Optional list of example dictionaries (single input) OR list of example lists (batch)
+
+    Returns:
+        A flat list of all images in the correct order:
+        - For single input: example images followed by message images
+        - For batch input: interleaved as (item1 examples, item1 input, item2 examples, item2 input, etc.)
+        - Returns None if no images were found
+    """
+    try:
+        from qwen_vl_utils import fetch_image, process_vision_info
+    except ImportError:
+        raise ImportError(
+            "qwen-vl-utils is required for NuExtractTransformersModel. "
+            "Please install it with: pip install qwen-vl-utils"
+        )
+
+    from qwen_vl_utils import fetch_image, process_vision_info
+
+    # Helper function to extract images from examples
+    def extract_example_images(example_item):
+        if not example_item:
+            return []
+
+        # Handle both list of examples and single example
+        examples_to_process = (
+            example_item if isinstance(example_item, list) else [example_item]
+        )
+        images = []
+
+        for example in examples_to_process:
+            if (
+                isinstance(example.get("input"), dict)
+                and example["input"].get("type") == "image"
+            ):
+                images.append(fetch_image(example["input"]))
+
+        return images
+
+    # Normalize inputs to always be batched format
+    is_batch = messages and isinstance(messages[0], list)
+    messages_batch = messages if is_batch else [messages]
+    is_batch_examples = (
+        examples
+        and isinstance(examples, list)
+        and (isinstance(examples[0], list) or examples[0] is None)
+    )
+    examples_batch = (
+        examples
+        if is_batch_examples
+        else ([examples] if examples is not None else None)
+    )
+
+    # Ensure examples batch matches messages batch if provided
+    if examples and len(examples_batch) != len(messages_batch):
+        if not is_batch and len(examples_batch) == 1:
+            # Single example set for a single input is fine
+            pass
+        else:
+            raise ValueError("Examples batch length must match messages batch length")
+
+    # Process all inputs, maintaining correct order
+    all_images = []
+    for i, message_group in enumerate(messages_batch):
+        # Get example images for this input
+        if examples and i < len(examples_batch):
+            input_example_images = extract_example_images(examples_batch[i])
+            all_images.extend(input_example_images)
+
+        # Get message images for this input
+        input_message_images = process_vision_info(message_group)[0] or []
+        all_images.extend(input_message_images)
+
+    return all_images if all_images else None
+
+
+class NuExtractTransformersModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: InlineVlmOptions,
+    ):
+        self.enabled = enabled
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+            import torch
+
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=vlm_options.supported_devices,
+            )
+            _log.debug(f"Available device for NuExtract VLM: {self.device}")
+
+            self.max_new_tokens = vlm_options.max_new_tokens
+            self.temperature = vlm_options.temperature
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.processor = AutoProcessor.from_pretrained(
+                artifacts_path,
+                trust_remote_code=vlm_options.trust_remote_code,
+                use_fast=True,
+            )
+            self.processor.tokenizer.padding_side = "left"
+
+            self.vlm_model = AutoModelForImageTextToText.from_pretrained(
+                artifacts_path,
+                device_map=self.device,
+                torch_dtype=self.vlm_options.torch_dtype,
+                _attn_implementation=(
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "sdpa"
+                ),
+                trust_remote_code=vlm_options.trust_remote_code,
+            )
+            self.vlm_model = torch.compile(self.vlm_model)  # type: ignore
+
+            # Load generation config
+            self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
+
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """
+        Batched inference for NuExtract VLM using the specialized input format.
+
+        Args:
+            image_batch: Iterable of PIL Images or numpy arrays
+            prompt: Either:
+                - str: Single template used for all images
+                - list[str]: List of templates (one per image, must match image count)
+        """
+        import torch
+        from PIL import Image as PILImage
+
+        # Normalize images to RGB PIL
+        pil_images: list[Image] = []
+        for img in image_batch:
+            if isinstance(img, np.ndarray):
+                if img.ndim == 3 and img.shape[2] in (3, 4):
+                    pil_img = PILImage.fromarray(img.astype(np.uint8))
+                elif img.ndim == 2:
+                    pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
+                else:
+                    raise ValueError(f"Unsupported numpy array shape: {img.shape}")
+            else:
+                pil_img = img
+            if pil_img.mode != "RGB":
+                pil_img = pil_img.convert("RGB")
+            pil_images.append(pil_img)
+
+        if not pil_images:
+            return
+
+        # Normalize templates (1 per image)
+        if isinstance(prompt, str):
+            templates = [prompt] * len(pil_images)
+        else:
+            if len(prompt) != len(pil_images):
+                raise ValueError(
+                    f"Number of templates ({len(prompt)}) must match number of images ({len(pil_images)})"
+                )
+            templates = prompt
+
+        # Construct NuExtract input format
+        inputs = []
+        for pil_img, template in zip(pil_images, templates):
+            input_item = {
+                "document": {"type": "image", "image": pil_img},
+                "template": template,
+            }
+            inputs.append(input_item)
+
+        # Create messages structure for batch processing
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [x["document"]],
+                }
+            ]
+            for x in inputs
+        ]
+
+        # Apply chat template to each example individually
+        texts = [
+            self.processor.tokenizer.apply_chat_template(
+                messages[i],
+                template=x["template"],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            for i, x in enumerate(inputs)
+        ]
+
+        # Process vision inputs using qwen-vl-utils
+        image_inputs = process_all_vision_info(messages)
+
+        # Process with the processor
+        processor_inputs = self.processor(
+            text=texts,
+            images=image_inputs,
+            padding=True,
+            return_tensors="pt",
+            **self.vlm_options.extra_processor_kwargs,
+        )
+        processor_inputs = {k: v.to(self.device) for k, v in processor_inputs.items()}
+
+        # Generate
+        gen_kwargs = {
+            **processor_inputs,
+            "max_new_tokens": self.max_new_tokens,
+            "generation_config": self.generation_config,
+            **self.vlm_options.extra_generation_config,
+        }
+        if self.temperature > 0:
+            gen_kwargs["do_sample"] = True
+            gen_kwargs["temperature"] = self.temperature
+        else:
+            gen_kwargs["do_sample"] = False
+
+        start_time = time.time()
+        with torch.inference_mode():
+            generated_ids = self.vlm_model.generate(**gen_kwargs)
+        generation_time = time.time() - start_time
+
+        # Trim generated sequences
+        input_len = processor_inputs["input_ids"].shape[1]
+        trimmed_sequences = generated_ids[:, input_len:]
+
+        # Decode with the processor/tokenizer
+        decoded_texts: list[str] = self.processor.batch_decode(
+            trimmed_sequences,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+
+        # Optional logging
+        if generated_ids.shape[0] > 0:  # type: ignore
+            _log.debug(
+                f"Generated {int(generated_ids[0].shape[0])} tokens in {generation_time:.2f}s "
+                f"for batch size {generated_ids.shape[0]}."  # type: ignore
+            )
+
+        for text in decoded_texts:
+            # Apply decode_response to the output text
+            decoded_text = self.vlm_options.decode_response(text)
+            yield VlmPrediction(text=decoded_text, generation_time=generation_time)
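A hedged sketch of driving the model class directly, outside the pipeline; the blank PIL image stands in for a rendered page, and instantiating with `artifacts_path=None` downloads the 2B checkpoint:

```python
from PIL import Image

from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.vlm_model_specs import NU_EXTRACT_2B_TRANSFORMERS
from docling.models.vlm_models_inline.nuextract_transformers_model import (
    NuExtractTransformersModel,
)

model = NuExtractTransformersModel(
    enabled=True,
    artifacts_path=None,  # None triggers a download of numind/NuExtract-2.0-2B
    accelerator_options=AcceleratorOptions(),
    vlm_options=NU_EXTRACT_2B_TRANSFORMERS,
)
page = Image.new("RGB", (1024, 1448), "white")  # stand-in for a rendered page
template = '{"title": "verbatim-string"}'
for prediction in model.process_images([page], template):
    print(prediction.text)
```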
docling/pipeline/base_extraction_pipeline.py
ADDED
@@ -0,0 +1,58 @@
+import logging
+from abc import ABC, abstractmethod
+from typing import Optional
+
+from docling.datamodel.base_models import ConversionStatus, ErrorItem
+from docling.datamodel.document import InputDocument
+from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
+from docling.datamodel.pipeline_options import BaseOptions
+
+_log = logging.getLogger(__name__)
+
+
+class BaseExtractionPipeline(ABC):
+    def __init__(self, pipeline_options: BaseOptions):
+        self.pipeline_options = pipeline_options
+
+    def execute(
+        self,
+        in_doc: InputDocument,
+        raises_on_error: bool,
+        template: Optional[ExtractionTemplateType] = None,
+    ) -> ExtractionResult:
+        ext_res = ExtractionResult(input=in_doc)
+
+        try:
+            ext_res = self._extract_data(ext_res, template)
+            ext_res.status = self._determine_status(ext_res)
+        except Exception as e:
+            ext_res.status = ConversionStatus.FAILURE
+            error_item = ErrorItem(
+                component_type="extraction_pipeline",
+                module_name=self.__class__.__name__,
+                error_message=str(e),
+            )
+            ext_res.errors.append(error_item)
+            if raises_on_error:
+                raise e
+
+        return ext_res
+
+    @abstractmethod
+    def _extract_data(
+        self,
+        ext_res: ExtractionResult,
+        template: Optional[ExtractionTemplateType] = None,
+    ) -> ExtractionResult:
+        """Subclass must populate ext_res.pages/errors and return the result."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
+        """Subclass must decide SUCCESS/PARTIAL_SUCCESS/FAILURE based on ext_res."""
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def get_default_options(cls) -> BaseOptions:
+        pass
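A minimal sketch of a custom pipeline against this ABC; the class below simply echoes the template back as raw text and is illustrative, not part of docling:

```python
from typing import Optional

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.extraction import (
    ExtractedPageData,
    ExtractionResult,
    ExtractionTemplateType,
)
from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline


class EchoExtractionPipeline(BaseExtractionPipeline):
    def _extract_data(
        self,
        ext_res: ExtractionResult,
        template: Optional[ExtractionTemplateType] = None,
    ) -> ExtractionResult:
        ext_res.pages.append(ExtractedPageData(page_no=1, raw_text=str(template)))
        return ext_res

    def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
        return ConversionStatus.SUCCESS if ext_res.pages else ConversionStatus.FAILURE

    @classmethod
    def get_default_options(cls) -> BaseOptions:
        return PipelineOptions()
```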
docling/pipeline/extraction_vlm_pipeline.py
ADDED
@@ -0,0 +1,204 @@
+import inspect
+import json
+import logging
+from pathlib import Path
+from typing import Optional
+
+from PIL.Image import Image
+from pydantic import BaseModel
+
+from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, ErrorItem
+from docling.datamodel.document import InputDocument
+from docling.datamodel.extraction import (
+    ExtractedPageData,
+    ExtractionResult,
+    ExtractionTemplateType,
+)
+from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions
+from docling.datamodel.settings import settings
+from docling.models.vlm_models_inline.nuextract_transformers_model import (
+    NuExtractTransformersModel,
+)
+from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
+from docling.utils.accelerator_utils import decide_device
+
+_log = logging.getLogger(__name__)
+
+
+class ExtractionVlmPipeline(BaseExtractionPipeline):
+    def __init__(self, pipeline_options: VlmExtractionPipelineOptions):
+        super().__init__(pipeline_options)
+
+        # Initialize VLM model with default options
+        self.accelerator_options = pipeline_options.accelerator_options
+        self.pipeline_options: VlmExtractionPipelineOptions
+
+        artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            artifacts_path = Path(settings.artifacts_path).expanduser()
+
+        if artifacts_path is not None and not artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )
+
+        # Create VLM model instance
+        self.vlm_model = NuExtractTransformersModel(
+            enabled=True,
+            artifacts_path=artifacts_path,  # Will download automatically
+            accelerator_options=self.accelerator_options,
+            vlm_options=pipeline_options.vlm_options,
+        )
+
+    def _extract_data(
+        self,
+        ext_res: ExtractionResult,
+        template: Optional[ExtractionTemplateType] = None,
+    ) -> ExtractionResult:
+        """Extract data using the VLM model."""
+        try:
+            # Get images from input document using the backend
+            images = self._get_images_from_input(ext_res.input)
+            if not images:
+                ext_res.status = ConversionStatus.FAILURE
+                ext_res.errors.append(
+                    ErrorItem(
+                        component_type="extraction_pipeline",
+                        module_name=self.__class__.__name__,
+                        error_message="No images found in document",
+                    )
+                )
+                return ext_res
+
+            # Use provided template or default prompt
+            if template is not None:
+                prompt = self._serialize_template(template)
+            else:
+                prompt = "Extract all text and structured information from this document. Return as JSON."
+
+            # Process all images with VLM model
+            start_page, end_page = ext_res.input.limits.page_range
+            for i, image in enumerate(images):
+                # Calculate the actual page number based on the filtered range
+                page_number = start_page + i
+                try:
+                    predictions = list(self.vlm_model.process_images([image], prompt))
+
+                    if predictions:
+                        # Parse the extracted text as JSON if possible, otherwise use as-is
+                        extracted_text = predictions[0].text
+                        extracted_data = None
+
+                        try:
+                            extracted_data = json.loads(extracted_text)
+                        except (json.JSONDecodeError, ValueError):
+                            # If not valid JSON, keep extracted_data as None
+                            pass
+
+                        # Create page data with proper structure
+                        page_data = ExtractedPageData(
+                            page_no=page_number,
+                            extracted_data=extracted_data,
+                            raw_text=extracted_text,  # Always populate raw_text
+                        )
+                        ext_res.pages.append(page_data)
+                    else:
+                        # Add error page data
+                        page_data = ExtractedPageData(
+                            page_no=page_number,
+                            extracted_data=None,
+                            errors=["No extraction result from VLM model"],
+                        )
+                        ext_res.pages.append(page_data)
+
+                except Exception as e:
+                    _log.error(f"Error processing page {page_number}: {e}")
+                    page_data = ExtractedPageData(
+                        page_no=page_number, extracted_data=None, errors=[str(e)]
+                    )
+                    ext_res.pages.append(page_data)
+
+        except Exception as e:
+            _log.error(f"Error during extraction: {e}")
+            ext_res.errors.append(
+                ErrorItem(
+                    component_type="extraction_pipeline",
+                    module_name=self.__class__.__name__,
+                    error_message=str(e),
+                )
+            )
+
+        return ext_res
+
+    def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
+        """Determine the status based on extraction results."""
+        if ext_res.pages and not any(page.errors for page in ext_res.pages):
+            return ConversionStatus.SUCCESS
+        else:
+            return ConversionStatus.FAILURE
+
+    def _get_images_from_input(self, input_doc: InputDocument) -> list[Image]:
+        """Extract images from input document using the backend."""
+        images = []
+
+        try:
+            backend = input_doc._backend
+
+            assert isinstance(backend, PdfDocumentBackend)
+            # Use the backend's pagination interface
+            page_count = backend.page_count()
+
+            # Respect page range limits, following the same pattern as PaginatedPipeline
+            start_page, end_page = input_doc.limits.page_range
+            _log.info(
+                f"Processing pages {start_page}-{end_page} of {page_count} total pages for extraction"
+            )
+
+            for page_num in range(page_count):
+                # Only process pages within the specified range (0-based indexing)
+                if start_page - 1 <= page_num <= end_page - 1:
+                    try:
+                        page_backend = backend.load_page(page_num)
+                        if page_backend.is_valid():
+                            # Get page image at a reasonable scale
+                            page_image = page_backend.get_page_image(
+                                scale=self.pipeline_options.vlm_options.scale
+                            )
+                            images.append(page_image)
+                        else:
+                            _log.warning(f"Page {page_num + 1} backend is not valid")
+                    except Exception as e:
+                        _log.error(f"Error loading page {page_num + 1}: {e}")
+
+        except Exception as e:
+            _log.error(f"Error getting images from input document: {e}")
+
+        return images
+
+    def _serialize_template(self, template: ExtractionTemplateType) -> str:
+        """Serialize template to string based on its type."""
+        if isinstance(template, str):
+            return template
+        elif isinstance(template, dict):
+            return json.dumps(template, indent=2)
+        elif isinstance(template, BaseModel):
+            return template.model_dump_json(indent=2)
+        elif inspect.isclass(template) and issubclass(template, BaseModel):
+            from polyfactory.factories.pydantic_factory import ModelFactory
+
+            class ExtractionTemplateFactory(ModelFactory[template]):  # type: ignore
+                __use_examples__ = True  # prefer Field(examples=...) when present
+                __use_defaults__ = True  # use field defaults instead of random values
+
+            return ExtractionTemplateFactory.build().model_dump_json(indent=2)  # type: ignore
+        else:
+            raise ValueError(f"Unsupported template type: {type(template)}")
+
+    @classmethod
+    def get_default_options(cls) -> BaseOptions:
+        return VlmExtractionPipelineOptions()
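A sketch of what `_serialize_template` does for a pydantic *class*: polyfactory builds an instance preferring `Field(examples=...)` and field defaults, and its JSON dump becomes the NuExtract template (the model below is illustrative):

```python
from polyfactory.factories.pydantic_factory import ModelFactory
from pydantic import BaseModel, Field


class Receipt(BaseModel):
    merchant: str = Field(examples=["Coffee House"])
    total: float = 0.0


class ReceiptFactory(ModelFactory[Receipt]):
    __use_examples__ = True
    __use_defaults__ = True


# Prints a JSON skeleton like {"merchant": "Coffee House", "total": 0.0}
print(ReceiptFactory.build().model_dump_json(indent=2))
```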
{docling-2.48.0.dist-info → docling-2.49.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.
+Version: 2.49.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT

@@ -51,6 +51,7 @@ Requires-Dist: pluggy<2.0.0,>=1.0.0
 Requires-Dist: pylatexenc<3.0,>=2.10
 Requires-Dist: scipy<2.0.0,>=1.6.0
 Requires-Dist: accelerate<2,>=1.0.0
+Requires-Dist: polyfactory>=2.22.2
 Provides-Extra: tesserocr
 Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
 Provides-Extra: ocrmac

@@ -60,6 +61,7 @@ Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
 Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
 Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
+Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
{docling-2.48.0.dist-info → docling-2.49.0.dist-info}/RECORD
CHANGED
@@ -1,5 +1,6 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/document_converter.py,sha256=
+docling/document_converter.py,sha256=CKMlobhTt8Y5yZ_tQOnPAP7_otBiddQ_klRGT5Bgwyo,15827
+docling/document_extractor.py,sha256=-RbQRvLWLXF15HYqBbV_lJhh08Zl487UEQKhP-_FR8k,11969
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -12,12 +13,12 @@ docling/backend/docling_parse_v4_backend.py,sha256=MbCMxNGmoW4iuev9tX1Vt4jtIeak2
 docling/backend/html_backend.py,sha256=MqtU9fA83lcjqb85lFTmGDedOH72WxTmwvj0ZzPur1I,42224
 docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
 docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
-docling/backend/msexcel_backend.py,sha256=
+docling/backend/msexcel_backend.py,sha256=5JRbPwOjR1r45AMeIts1rj6InbOgLBf_CtAhvNPVmsQ,19157
 docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
 docling/backend/msword_backend.py,sha256=fKeAMGGR5ABimedo_ofCQAybzdqmqWA3A3mpLl7X6qY,49129
 docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
 docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
-docling/backend/pypdfium2_backend.py,sha256=
+docling/backend/pypdfium2_backend.py,sha256=AYhWs9S8W_TkAK0-OkRmUNf4HUZl26FP7-XYjwU5zDk,14209
 docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593

@@ -35,14 +36,15 @@ docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
 docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
-docling/datamodel/base_models.py,sha256=
-docling/datamodel/document.py,sha256=
+docling/datamodel/base_models.py,sha256=vOt895z0GsFirHkkI3hM23e9oyUuz9RXfcGFtoINLtw,12334
+docling/datamodel/document.py,sha256=ElY7G6FYJ6Bayyw433_tbnxyE47fnQRoBG_mygvOBrA,17370
+docling/datamodel/extraction.py,sha256=7dgvtK5SuvgfB8LHAwS1FwrW1kcMQJuJG0ol8uAQgoQ,1323
 docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
-docling/datamodel/pipeline_options.py,sha256=
+docling/datamodel/pipeline_options.py,sha256=0J0xVOSfI3pqRMkXlzX_rtmVBgCTsR2QJz54xugP8sg,10963
 docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
 docling/datamodel/pipeline_options_vlm_model.py,sha256=AcqqThSW74hwQ6x7pazzm57LnJiUqB7gQi5wFayGlbk,2628
 docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
-docling/datamodel/vlm_model_specs.py,sha256=
+docling/datamodel/vlm_model_specs.py,sha256=8D-bF95EoaD-Wd29lVX094HPJT1gYN393aFmzv7RipQ,8713
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/api_vlm_model.py,sha256=-zisU32pgDRbychyG6-neB0qweNbPaYnLXwiGT7SEdI,2859
 docling/models/base_model.py,sha256=tXFM7zJwF6Kn2EhtaB4QmgK4O2ruv1C7SjdBgM5QKak,6225

@@ -57,7 +59,7 @@ docling/models/page_preprocessing_model.py,sha256=rHNX1uP1ScTjVUlsxZ0eamK2uNUqI9
 docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
 docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
 docling/models/picture_description_vlm_model.py,sha256=5BJvaF3PHuL9lCVYqPv9krh3h_7YwNSdKYw1EVEj13k,4156
-docling/models/rapid_ocr_model.py,sha256=
+docling/models/rapid_ocr_model.py,sha256=7yZC7I1qoC9xC8xJIjTk2c8VFm89RfB6Vr7IDOnr5gs,7102
 docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
 docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
 docling/models/tesseract_ocr_cli_model.py,sha256=I3Gn28Y-LD8OfvyCElN9fLiNgpo2sT0uMkVt258253s,12881

@@ -73,10 +75,13 @@ docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnu
 docling/models/vlm_models_inline/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/models/vlm_models_inline/hf_transformers_model.py,sha256=G0RpKwdzm5NiqIBHG5nWLwBsrDfDebzErzRkyXppZPw,12134
 docling/models/vlm_models_inline/mlx_model.py,sha256=VP05v97mqzmaG4o9bOpJcxIlEqvNzAapJ15Zz3E3ACI,10169
+docling/models/vlm_models_inline/nuextract_transformers_model.py,sha256=iWoGF8TgQfOOMqS__tSODcUuDnKTPaK7gIRFum5bPzc,10512
 docling/models/vlm_models_inline/vllm_model.py,sha256=_EnK1nfpAPJky7aRlyp8SUIghiZOQO8AkDN_hHqXLZg,8615
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
+docling/pipeline/base_extraction_pipeline.py,sha256=aJj7qbppgAelwoaVKB1W-s7kFg_OcXRE64NpIIOxZGE,1905
 docling/pipeline/base_pipeline.py,sha256=Tl_C3adFABNxtE7hX83VSdx-j7D8GRvoFcno5A3Z-YQ,10062
+docling/pipeline/extraction_vlm_pipeline.py,sha256=WIRZygpBJmKjszRsFqW4qfPUZ5Frd_Hqoiysp2dGx8Y,8723
 docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
 docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
 docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=NgdZxpfpElnvCgGlrQ8kSvq44LNzJcc6wOqD-AMrKZ0,26132

@@ -94,9 +99,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.
-docling-2.
-docling-2.
-docling-2.
-docling-2.
-docling-2.
+docling-2.49.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.49.0.dist-info/METADATA,sha256=Gn1u-LwLRMCqHamlyu1M4w9a8NvGfk-jfcCh0XjhsfQ,10731
+docling-2.49.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.49.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.49.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.49.0.dist-info/RECORD,,
{docling-2.48.0.dist-info → docling-2.49.0.dist-info}/WHEEL
File without changes
{docling-2.48.0.dist-info → docling-2.49.0.dist-info}/entry_points.txt
File without changes
{docling-2.48.0.dist-info → docling-2.49.0.dist-info}/licenses/LICENSE
File without changes
{docling-2.48.0.dist-info → docling-2.49.0.dist-info}/top_level.txt
File without changes