docling 2.47.1__py3-none-any.whl → 2.49.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +172 -76
- docling/backend/msexcel_backend.py +15 -1
- docling/backend/pypdfium2_backend.py +24 -2
- docling/datamodel/base_models.py +13 -1
- docling/datamodel/document.py +5 -3
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/pipeline_options.py +12 -4
- docling/datamodel/vlm_model_specs.py +17 -0
- docling/document_converter.py +3 -6
- docling/document_extractor.py +325 -0
- docling/models/rapid_ocr_model.py +38 -16
- docling/models/vlm_models_inline/nuextract_transformers_model.py +290 -0
- docling/pipeline/base_extraction_pipeline.py +58 -0
- docling/pipeline/extraction_vlm_pipeline.py +204 -0
- {docling-2.47.1.dist-info → docling-2.49.0.dist-info}/METADATA +5 -2
- {docling-2.47.1.dist-info → docling-2.49.0.dist-info}/RECORD +20 -15
- {docling-2.47.1.dist-info → docling-2.49.0.dist-info}/WHEEL +0 -0
- {docling-2.47.1.dist-info → docling-2.49.0.dist-info}/entry_points.txt +0 -0
- {docling-2.47.1.dist-info → docling-2.49.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.47.1.dist-info → docling-2.49.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
"""Data models for document extraction functionality."""
|
2
|
+
|
3
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
4
|
+
|
5
|
+
from pydantic import BaseModel, Field
|
6
|
+
|
7
|
+
from docling.datamodel.base_models import ConversionStatus, ErrorItem
|
8
|
+
from docling.datamodel.document import InputDocument
|
9
|
+
|
10
|
+
|
11
|
+
class ExtractedPageData(BaseModel):
    """Extraction output for one page of a document."""

    # Required; page numbers are counted from 1.
    page_no: int = Field(description="1-indexed page number")

    # Structured result for the page; stays None unless one is supplied.
    extracted_data: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Extracted structured data from the page",
    )

    # Unstructured text for the page; stays None unless one is supplied.
    raw_text: Optional[str] = Field(default=None, description="Raw extracted text")

    # Per-page error messages; every instance starts with its own empty list.
    errors: List[str] = Field(
        description="Any errors encountered during extraction for this page",
        default_factory=list,
    )
|
23
|
+
|
24
|
+
|
25
|
+
class ExtractionResult(BaseModel):
    """Outcome of running extraction on a single input document."""

    # The document this result belongs to.
    input: InputDocument

    # Processing status; a freshly built result is PENDING.
    status: ConversionStatus = Field(default=ConversionStatus.PENDING)

    # Document-level errors collected while processing.
    errors: List[ErrorItem] = Field(default=[])

    # Always a list (possibly empty) so callers can iterate unconditionally.
    pages: List[ExtractedPageData] = Field(
        description="Extracted data from each page",
        default_factory=list,
    )
|
36
|
+
|
37
|
+
|
38
|
+
# Type alias for template parameters: a string, a dict, a BaseModel instance, or a BaseModel subclass
|
39
|
+
ExtractionTemplateType = Union[str, Dict[str, Any], BaseModel, Type[BaseModel]]
|
@@ -37,6 +37,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
|
|
37
37
|
from docling.datamodel.vlm_model_specs import (
|
38
38
|
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
39
39
|
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
40
|
+
NU_EXTRACT_2B_TRANSFORMERS,
|
40
41
|
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
|
41
42
|
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
|
42
43
|
VlmModelType,
|
@@ -99,6 +100,8 @@ class RapidOcrOptions(OcrOptions):
|
|
99
100
|
# For more details on the following options visit
|
100
101
|
# https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
|
101
102
|
|
103
|
+
# https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#__tabbed_3_4
|
104
|
+
backend: Literal["onnxruntime", "openvino", "paddle", "torch"] = "onnxruntime"
|
102
105
|
text_score: float = 0.5 # same default as rapidocr
|
103
106
|
|
104
107
|
use_det: Optional[bool] = None # same default as rapidocr
|
@@ -111,6 +114,7 @@ class RapidOcrOptions(OcrOptions):
|
|
111
114
|
cls_model_path: Optional[str] = None # same default as rapidocr
|
112
115
|
rec_model_path: Optional[str] = None # same default as rapidocr
|
113
116
|
rec_keys_path: Optional[str] = None # same default as rapidocr
|
117
|
+
rec_font_path: Optional[str] = None # same default as rapidocr
|
114
118
|
|
115
119
|
model_config = ConfigDict(
|
116
120
|
extra="forbid",
|
@@ -244,12 +248,9 @@ class OcrEngine(str, Enum):
|
|
244
248
|
RAPIDOCR = "rapidocr"
|
245
249
|
|
246
250
|
|
247
|
-
class PipelineOptions(
|
251
|
+
class PipelineOptions(BaseOptions):
|
248
252
|
"""Base pipeline options."""
|
249
253
|
|
250
|
-
create_legacy_output: bool = (
|
251
|
-
True # This default will be set to False on a future version of docling
|
252
|
-
)
|
253
254
|
document_timeout: Optional[float] = None
|
254
255
|
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
255
256
|
enable_remote_services: bool = False
|
@@ -293,6 +294,13 @@ class AsrPipelineOptions(PipelineOptions):
|
|
293
294
|
artifacts_path: Optional[Union[Path, str]] = None
|
294
295
|
|
295
296
|
|
297
|
+
class VlmExtractionPipelineOptions(PipelineOptions):
    """Pipeline options for the extraction pipeline."""

    # Optional artifacts location, given as a Path or a string; unset by default.
    artifacts_path: Union[Path, str, None] = None

    # Inline VLM configuration; defaults to the NuExtract 2B transformers spec.
    vlm_options: InlineVlmOptions = NU_EXTRACT_2B_TRANSFORMERS
|
302
|
+
|
303
|
+
|
296
304
|
class PdfPipelineOptions(PaginatedPipelineOptions):
|
297
305
|
"""Options for the PDF pipeline."""
|
298
306
|
|
@@ -247,6 +247,23 @@ DOLPHIN_TRANSFORMERS = InlineVlmOptions(
|
|
247
247
|
temperature=0.0,
|
248
248
|
)
|
249
249
|
|
250
|
+
# NuExtract
|
251
|
+
NU_EXTRACT_2B_TRANSFORMERS = InlineVlmOptions(
|
252
|
+
repo_id="numind/NuExtract-2.0-2B",
|
253
|
+
prompt="", # This won't be used, template is passed separately
|
254
|
+
torch_dtype="bfloat16",
|
255
|
+
inference_framework=InferenceFramework.TRANSFORMERS,
|
256
|
+
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
|
257
|
+
response_format=ResponseFormat.PLAINTEXT,
|
258
|
+
supported_devices=[
|
259
|
+
AcceleratorDevice.CPU,
|
260
|
+
AcceleratorDevice.CUDA,
|
261
|
+
AcceleratorDevice.MPS,
|
262
|
+
],
|
263
|
+
scale=2.0,
|
264
|
+
temperature=0.0,
|
265
|
+
)
|
266
|
+
|
250
267
|
|
251
268
|
class VlmModelType(str, Enum):
|
252
269
|
SMOLDOCLING = "smoldocling"
|
docling/document_converter.py
CHANGED
@@ -28,6 +28,7 @@ from docling.backend.noop_backend import NoOpBackend
|
|
28
28
|
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
29
29
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
30
30
|
from docling.datamodel.base_models import (
|
31
|
+
BaseFormatOption,
|
31
32
|
ConversionStatus,
|
32
33
|
DoclingComponentType,
|
33
34
|
DocumentStream,
|
@@ -57,12 +58,8 @@ _log = logging.getLogger(__name__)
|
|
57
58
|
_PIPELINE_CACHE_LOCK = threading.Lock()
|
58
59
|
|
59
60
|
|
60
|
-
class FormatOption(
|
61
|
+
class FormatOption(BaseFormatOption):
|
61
62
|
pipeline_cls: Type[BasePipeline]
|
62
|
-
pipeline_options: Optional[PipelineOptions] = None
|
63
|
-
backend: Type[AbstractDocumentBackend]
|
64
|
-
|
65
|
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
66
63
|
|
67
64
|
@model_validator(mode="after")
|
68
65
|
def set_optional_field_default(self) -> "FormatOption":
|
@@ -191,7 +188,7 @@ class DocumentConverter:
|
|
191
188
|
self.allowed_formats = (
|
192
189
|
allowed_formats if allowed_formats is not None else list(InputFormat)
|
193
190
|
)
|
194
|
-
self.format_to_options = {
|
191
|
+
self.format_to_options: Dict[InputFormat, FormatOption] = {
|
195
192
|
format: (
|
196
193
|
_get_default_option(format=format)
|
197
194
|
if (custom_option := (format_options or {}).get(format)) is None
|
@@ -0,0 +1,325 @@
|
|
1
|
+
import hashlib
|
2
|
+
import logging
|
3
|
+
import sys
|
4
|
+
import threading
|
5
|
+
import time
|
6
|
+
import warnings
|
7
|
+
from collections.abc import Iterable, Iterator
|
8
|
+
from concurrent.futures import ThreadPoolExecutor
|
9
|
+
from functools import partial
|
10
|
+
from pathlib import Path
|
11
|
+
from typing import Dict, List, Optional, Tuple, Type, Union
|
12
|
+
|
13
|
+
from pydantic import ConfigDict, model_validator, validate_call
|
14
|
+
|
15
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
16
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
17
|
+
from docling.datamodel.base_models import (
|
18
|
+
BaseFormatOption,
|
19
|
+
ConversionStatus,
|
20
|
+
DoclingComponentType,
|
21
|
+
DocumentStream,
|
22
|
+
ErrorItem,
|
23
|
+
InputFormat,
|
24
|
+
)
|
25
|
+
from docling.datamodel.document import (
|
26
|
+
InputDocument,
|
27
|
+
_DocumentConversionInput, # intentionally reused builder
|
28
|
+
)
|
29
|
+
from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
|
30
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
31
|
+
from docling.datamodel.settings import (
|
32
|
+
DEFAULT_PAGE_RANGE,
|
33
|
+
DocumentLimits,
|
34
|
+
PageRange,
|
35
|
+
settings,
|
36
|
+
)
|
37
|
+
from docling.exceptions import ConversionError
|
38
|
+
from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
|
39
|
+
from docling.pipeline.extraction_vlm_pipeline import ExtractionVlmPipeline
|
40
|
+
from docling.utils.utils import chunkify
|
41
|
+
|
42
|
+
_log = logging.getLogger(__name__)
|
43
|
+
_PIPELINE_CACHE_LOCK = threading.Lock()
|
44
|
+
|
45
|
+
|
46
|
+
class ExtractionFormatOption(BaseFormatOption):
    """Extraction settings attached to one input format.

    Notes:
        - `pipeline_cls` has to be a subclass of `BaseExtractionPipeline`.
        - `pipeline_options` is typed as `PipelineOptions`, which MUST inherit
          from `BaseOptions` (as used by `BaseExtractionPipeline`).
        - `backend` is the document-opening backend that
          `_DocumentConversionInput` uses.
    """

    pipeline_cls: Type[BaseExtractionPipeline]

    @model_validator(mode="after")
    def set_optional_field_default(self) -> "ExtractionFormatOption":
        """Fill in `pipeline_options` from the pipeline class when it is unset."""
        if self.pipeline_options is not None:
            return self

        # `get_default_options` is provided by BaseExtractionPipeline.
        self.pipeline_options = self.pipeline_cls.get_default_options()  # type: ignore[assignment]
        return self
|
64
|
+
|
65
|
+
|
66
|
+
def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
    """Build the default extraction option for ``fmt``.

    The backend defaults mirror the converter's *backend* choices, while the
    pipeline is always the VLM extractor. This duplication is meant to go away
    once the format registry is shared between convert and extract.

    Raises:
        RuntimeError: If no default backend is configured for ``fmt``.
    """
    default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
        InputFormat.IMAGE: PyPdfiumDocumentBackend,
        InputFormat.PDF: PyPdfiumDocumentBackend,
    }

    # Guard clause: formats outside the table have no default.
    if fmt not in default_backends:
        raise RuntimeError(f"No default extraction backend configured for {fmt}")

    return ExtractionFormatOption(
        pipeline_cls=ExtractionVlmPipeline,
        backend=default_backends[fmt],
    )
|
86
|
+
|
87
|
+
|
88
|
+
class DocumentExtractor:
    """Standalone extractor class.

    Public API:
        - `extract(...) -> ExtractionResult`
        - `extract_all(...) -> Iterator[ExtractionResult]`

    Implementation intentionally reuses `_DocumentConversionInput` to build
    `InputDocument` with the correct backend per format.
    """

    def __init__(
        self,
        allowed_formats: Optional[List[InputFormat]] = None,
        extraction_format_options: Optional[
            Dict[InputFormat, ExtractionFormatOption]
        ] = None,
    ) -> None:
        """Set up per-format extraction options and an empty pipeline cache.

        Args:
            allowed_formats: Formats this extractor accepts. When omitted, every
                member of `InputFormat` is allowed.
            extraction_format_options: Per-format overrides. An override always
                wins over the built-in default for its format.

        Raises:
            RuntimeError: If `allowed_formats` was given explicitly and one of
                its formats has neither an override nor a built-in default.
        """
        self.allowed_formats: List[InputFormat] = (
            allowed_formats if allowed_formats is not None else list(InputFormat)
        )

        # Overrides are consulted first and defaults are computed lazily.
        # Evaluating the default eagerly (as the `default` argument of
        # `dict.get`) raised for every format without a default backend, even
        # when the caller had supplied an override for that very format.
        overrides = extraction_format_options or {}
        self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {}
        for fmt in self.allowed_formats:
            if fmt in overrides:
                self.extraction_format_to_options[fmt] = overrides[fmt]
                continue
            try:
                self.extraction_format_to_options[fmt] = (
                    _get_default_extraction_option(fmt)
                )
            except RuntimeError:
                # `_get_default_extraction_option` raises RuntimeError for
                # formats without a default backend. An explicit allow-list
                # keeps failing loudly, as before. With the implicit "all
                # formats" default the format is simply left unconfigured;
                # `_get_pipeline` then returns None for it and the document is
                # reported as failed instead of breaking construction.
                if allowed_formats is not None:
                    raise

        # Cache pipelines by (class, options-hash)
        self._initialized_pipelines: Dict[
            Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
        ] = {}

    # ---------------------------- Public API ---------------------------------

    @validate_call(config=ConfigDict(strict=True))
    def extract(
        self,
        source: Union[Path, str, DocumentStream],
        template: ExtractionTemplateType,
        headers: Optional[Dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> ExtractionResult:
        """Extract from a single source.

        Wraps `source` in a one-element list, delegates to `extract_all` and
        returns the first result it produces.

        Args:
            source: Path, string or stream identifying the document.
            template: Extraction template handed to the pipeline.
            headers: Optional headers forwarded to the input builder.
            raises_on_error: Forwarded to `extract_all`.
            max_num_pages: Page-count limit for the document.
            max_file_size: File-size limit for the document.
            page_range: Range of pages to process.

        Returns:
            The `ExtractionResult` for `source`.
        """
        all_res = self.extract_all(
            source=[source],
            headers=headers,
            raises_on_error=raises_on_error,
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
            page_range=page_range,
            template=template,
        )
        return next(all_res)

    @validate_call(config=ConfigDict(strict=True))
    def extract_all(
        self,
        source: Iterable[Union[Path, str, DocumentStream]],
        template: ExtractionTemplateType,
        headers: Optional[Dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> Iterator[ExtractionResult]:
        """Lazily extract from every source, yielding one result per document.

        This is a generator: nothing (including the experimental-API warning)
        runs until the first result is requested.

        Args:
            source: Documents to process.
            template: Extraction template handed to the pipeline.
            headers: Optional headers forwarded to the input builder.
            raises_on_error: When True, a result whose status is neither
                SUCCESS nor PARTIAL_SUCCESS raises instead of being yielded,
                and producing no result at all raises as well.
            max_num_pages: Page-count limit per document.
            max_file_size: File-size limit per document.
            page_range: Range of pages to process.

        Yields:
            One `ExtractionResult` per processed document.

        Raises:
            ConversionError: See `raises_on_error`.
        """
        warnings.warn(
            "The extract API is currently experimental and may change without prior notice.\n"
            "Only PDF and image formats are supported.",
            UserWarning,
            stacklevel=2,
        )

        limits = DocumentLimits(
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
            page_range=page_range,
        )
        conv_input = _DocumentConversionInput(
            path_or_stream_iterator=source, limits=limits, headers=headers
        )

        ext_res_iter = self._extract(
            conv_input, raises_on_error=raises_on_error, template=template
        )

        had_result = False
        for ext_res in ext_res_iter:
            had_result = True
            if raises_on_error and ext_res.status not in {
                ConversionStatus.SUCCESS,
                ConversionStatus.PARTIAL_SUCCESS,
            }:
                raise ConversionError(
                    f"Extraction failed for: {ext_res.input.file} with status: {ext_res.status}"
                )
            else:
                yield ext_res

        if not had_result and raises_on_error:
            raise ConversionError(
                "Extraction failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
            )

    # --------------------------- Internal engine ------------------------------

    def _extract(
        self,
        conv_input: _DocumentConversionInput,
        raises_on_error: bool,
        template: ExtractionTemplateType,
    ) -> Iterator[ExtractionResult]:
        """Process the input documents batch by batch and yield their results.

        Batches are sized by `settings.perf.doc_batch_size`. A thread pool is
        used only when both the batch size and
        `settings.perf.doc_batch_concurrency` exceed 1; otherwise documents are
        handled sequentially and the time per document is logged.
        """
        start_time = time.monotonic()

        for input_batch in chunkify(
            conv_input.docs(self.extraction_format_to_options),
            settings.perf.doc_batch_size,
        ):
            _log.info("Going to extract document batch...")
            process_func = partial(
                self._process_document_extraction,
                raises_on_error=raises_on_error,
                template=template,
            )

            if (
                settings.perf.doc_batch_concurrency > 1
                and settings.perf.doc_batch_size > 1
            ):
                with ThreadPoolExecutor(
                    max_workers=settings.perf.doc_batch_concurrency
                ) as pool:
                    for item in pool.map(
                        process_func,
                        input_batch,
                    ):
                        yield item
            else:
                for item in map(
                    process_func,
                    input_batch,
                ):
                    # Time since the previous document finished (or since start).
                    elapsed = time.monotonic() - start_time
                    start_time = time.monotonic()
                    _log.info(
                        f"Finished extracting document {item.input.file.name} in {elapsed:.2f} sec."
                    )
                    yield item

    def _process_document_extraction(
        self,
        in_doc: InputDocument,
        raises_on_error: bool,
        template: ExtractionTemplateType,
    ) -> ExtractionResult:
        """Run extraction for one document if its format is allowed.

        Returns:
            The pipeline result, or a SKIPPED result carrying a USER_INPUT
            error item when the format is not allowed and `raises_on_error`
            is False.

        Raises:
            ConversionError: If the format is not allowed and
                `raises_on_error` is True.
        """
        valid = (
            self.allowed_formats is not None and in_doc.format in self.allowed_formats
        )
        if valid:
            return self._execute_extraction_pipeline(
                in_doc, raises_on_error=raises_on_error, template=template
            )

        error_message = f"File format not allowed: {in_doc.file}"
        if raises_on_error:
            raise ConversionError(error_message)

        error_item = ErrorItem(
            component_type=DoclingComponentType.USER_INPUT,
            module_name="",
            error_message=error_message,
        )
        return ExtractionResult(
            input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
        )

    def _execute_extraction_pipeline(
        self,
        in_doc: InputDocument,
        raises_on_error: bool,
        template: ExtractionTemplateType,
    ) -> ExtractionResult:
        """Look up the pipeline for the document's format and execute it.

        Returns:
            The pipeline's result, or a FAILURE result when the document is
            invalid or no pipeline is available and `raises_on_error` is False.

        Raises:
            ConversionError: In those same two situations when
                `raises_on_error` is True.
        """
        if not in_doc.valid:
            if raises_on_error:
                raise ConversionError(f"Input document {in_doc.file} is not valid.")
            else:
                return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)

        pipeline = self._get_pipeline(in_doc.format)
        if pipeline is None:
            if raises_on_error:
                raise ConversionError(
                    f"No extraction pipeline could be initialized for {in_doc.file}."
                )
            else:
                return ExtractionResult(input=in_doc, status=ConversionStatus.FAILURE)

        return pipeline.execute(
            in_doc, raises_on_error=raises_on_error, template=template
        )

    def _get_pipeline(
        self, doc_format: InputFormat
    ) -> Optional[BaseExtractionPipeline]:
        """Retrieve or initialize a pipeline, reusing instances based on class and options.

        Returns:
            The cached or newly created pipeline, or None when `doc_format`
            has no configured option or that option has no pipeline options.
        """
        fopt = self.extraction_format_to_options.get(doc_format)
        if fopt is None or fopt.pipeline_options is None:
            return None

        pipeline_class = fopt.pipeline_cls
        pipeline_options = fopt.pipeline_options
        options_hash = self._get_pipeline_options_hash(pipeline_options)

        cache_key = (pipeline_class, options_hash)
        # Lookup and creation happen under the module-level lock.
        with _PIPELINE_CACHE_LOCK:
            if cache_key not in self._initialized_pipelines:
                _log.info(
                    f"Initializing extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                )
                self._initialized_pipelines[cache_key] = pipeline_class(
                    pipeline_options=pipeline_options  # type: ignore[arg-type]
                )
            else:
                _log.debug(
                    f"Reusing cached extraction pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                )

            return self._initialized_pipelines[cache_key]

    @staticmethod
    def _get_pipeline_options_hash(pipeline_options: PipelineOptions) -> str:
        """Generate a stable hash of pipeline options to use as part of the cache key.

        The hash is the MD5 hex digest of the string form of
        `pipeline_options.model_dump()`; it is a cache key, not a security
        measure (hence `usedforsecurity=False`).
        """
        options_str = str(pipeline_options.model_dump())
        return hashlib.md5(
            options_str.encode("utf-8"), usedforsecurity=False
        ).hexdigest()
|
@@ -42,10 +42,10 @@ class RapidOcrModel(BaseOcrModel):
|
|
42
42
|
|
43
43
|
if self.enabled:
|
44
44
|
try:
|
45
|
-
from
|
45
|
+
from rapidocr import EngineType, RapidOCR # type: ignore
|
46
46
|
except ImportError:
|
47
47
|
raise ImportError(
|
48
|
-
"RapidOCR is not installed. Please install it via `pip install
|
48
|
+
"RapidOCR is not installed. Please install it via `pip install rapidocr onnxruntime` to use this OCR engine. "
|
49
49
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
50
50
|
)
|
51
51
|
|
@@ -54,21 +54,40 @@ class RapidOcrModel(BaseOcrModel):
|
|
54
54
|
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
|
55
55
|
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
|
56
56
|
intra_op_num_threads = accelerator_options.num_threads
|
57
|
+
_ALIASES = {
|
58
|
+
"onnxruntime": EngineType.ONNXRUNTIME,
|
59
|
+
"openvino": EngineType.OPENVINO,
|
60
|
+
"paddle": EngineType.PADDLE,
|
61
|
+
"torch": EngineType.TORCH,
|
62
|
+
}
|
63
|
+
backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)
|
57
64
|
|
58
65
|
self.reader = RapidOCR(
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
66
|
+
params={
|
67
|
+
# Global settings (these are still correct)
|
68
|
+
"Global.text_score": self.options.text_score,
|
69
|
+
# "Global.verbose": self.options.print_verbose,
|
70
|
+
# Detection model settings
|
71
|
+
"Det.model_path": self.options.det_model_path,
|
72
|
+
"Det.use_cuda": use_cuda,
|
73
|
+
"Det.use_dml": use_dml,
|
74
|
+
"Det.intra_op_num_threads": intra_op_num_threads,
|
75
|
+
# Classification model settings
|
76
|
+
"Cls.model_path": self.options.cls_model_path,
|
77
|
+
"Cls.use_cuda": use_cuda,
|
78
|
+
"Cls.use_dml": use_dml,
|
79
|
+
"Cls.intra_op_num_threads": intra_op_num_threads,
|
80
|
+
# Recognition model settings
|
81
|
+
"Rec.model_path": self.options.rec_model_path,
|
82
|
+
"Rec.font_path": self.options.rec_font_path,
|
83
|
+
"Rec.keys_path": self.options.rec_keys_path,
|
84
|
+
"Rec.use_cuda": use_cuda,
|
85
|
+
"Rec.use_dml": use_dml,
|
86
|
+
"Rec.intra_op_num_threads": intra_op_num_threads,
|
87
|
+
"Det.engine_type": backend_enum,
|
88
|
+
"Cls.engine_type": backend_enum,
|
89
|
+
"Rec.engine_type": backend_enum,
|
90
|
+
}
|
72
91
|
)
|
73
92
|
|
74
93
|
def __call__(
|
@@ -95,12 +114,15 @@ class RapidOcrModel(BaseOcrModel):
|
|
95
114
|
scale=self.scale, cropbox=ocr_rect
|
96
115
|
)
|
97
116
|
im = numpy.array(high_res_image)
|
98
|
-
result
|
117
|
+
result = self.reader(
|
99
118
|
im,
|
100
119
|
use_det=self.options.use_det,
|
101
120
|
use_cls=self.options.use_cls,
|
102
121
|
use_rec=self.options.use_rec,
|
103
122
|
)
|
123
|
+
result = list(
|
124
|
+
zip(result.boxes.tolist(), result.txts, result.scores)
|
125
|
+
)
|
104
126
|
|
105
127
|
del high_res_image
|
106
128
|
del im
|