docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: the registry flags this release of docling as potentially problematic.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
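
For orientation before the file diffs below, here is a minimal usage sketch of the package this wheel ships. It is not taken from this diff: `DocumentConverter` is docling's documented public entry point (see `docling/document_converter.py` in the manifest), and the input path is a placeholder.

```python
# Minimal conversion sketch (not part of the diff).
# "report.pdf" is a hypothetical input file.
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("report.pdf")

# The result carries a DoclingDocument that can be exported.
print(result.document.export_to_markdown())
```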
docling/pipeline/base_pipeline.py (new file)

@@ -0,0 +1,326 @@
+import functools
+import logging
+import time
+import traceback
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Callable, List, Optional
+
+from docling_core.types.doc import NodeItem
+
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    PaginatedDocumentBackend,
+)
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
+    Page,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import (
+    ConvertPipelineOptions,
+    PdfPipelineOptions,
+    PipelineOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_model import GenericEnrichmentModel
+from docling.models.factories import get_picture_description_factory
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.stages.picture_classifier.document_picture_classifier import (
+    DocumentPictureClassifier,
+    DocumentPictureClassifierOptions,
+)
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docling.utils.utils import chunkify
+
+_log = logging.getLogger(__name__)
+
+
+class BasePipeline(ABC):
+    def __init__(self, pipeline_options: PipelineOptions):
+        self.pipeline_options = pipeline_options
+        self.keep_images = False
+        self.build_pipe: List[Callable] = []
+        self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
+
+        self.artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            self.artifacts_path = Path(settings.artifacts_path).expanduser()
+
+        if self.artifacts_path is not None and not self.artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {self.artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )
+
+    def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
+        conv_res = ConversionResult(input=in_doc)
+
+        _log.info(f"Processing document {in_doc.file.name}")
+        try:
+            with TimeRecorder(
+                conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
+            ):
+                # These steps are building and assembling the structure of the
+                # output DoclingDocument.
+                conv_res = self._build_document(conv_res)
+                conv_res = self._assemble_document(conv_res)
+                # From this stage, all operations should rely only on conv_res.output
+                conv_res = self._enrich_document(conv_res)
+                conv_res.status = self._determine_status(conv_res)
+        except Exception as e:
+            conv_res.status = ConversionStatus.FAILURE
+            if not raises_on_error:
+                error_item = ErrorItem(
+                    component_type=DoclingComponentType.PIPELINE,
+                    module_name=self.__class__.__name__,
+                    error_message=str(e),
+                )
+                conv_res.errors.append(error_item)
+            else:
+                raise RuntimeError(f"Pipeline {self.__class__.__name__} failed") from e
+        finally:
+            self._unload(conv_res)
+
+        return conv_res
+
+    @abstractmethod
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        pass
+
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        return conv_res
+
+    def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
+        def _prepare_elements(
+            conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
+        ) -> Iterable[NodeItem]:
+            for doc_element, _level in conv_res.document.iterate_items():
+                prepared_element = model.prepare_element(
+                    conv_res=conv_res, element=doc_element
+                )
+                if prepared_element is not None:
+                    yield prepared_element
+
+        with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
+            for model in self.enrichment_pipe:
+                for element_batch in chunkify(
+                    _prepare_elements(conv_res, model),
+                    model.elements_batch_size,
+                ):
+                    for element in model(
+                        doc=conv_res.document, element_batch=element_batch
+                    ):  # Must exhaust!
+                        pass
+
+        return conv_res
+
+    @abstractmethod
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        pass
+
+    def _unload(self, conv_res: ConversionResult):
+        pass
+
+    @classmethod
+    @abstractmethod
+    def get_default_options(cls) -> PipelineOptions:
+        pass
+
+    @classmethod
+    @abstractmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        pass
+
+
+class ConvertPipeline(BasePipeline):
+    def __init__(self, pipeline_options: ConvertPipelineOptions):
+        super().__init__(pipeline_options)
+        self.pipeline_options: ConvertPipelineOptions
+
+        # ------ Common enrichment models working on all backends
+
+        # Picture description model
+        if (
+            picture_description_model := self._get_picture_description_model(
+                artifacts_path=self.artifacts_path
+            )
+        ) is None:
+            raise RuntimeError(
+                f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
+            )
+
+        self.enrichment_pipe = [
+            # Document Picture Classifier
+            DocumentPictureClassifier(
+                enabled=pipeline_options.do_picture_classification,
+                artifacts_path=self.artifacts_path,
+                options=DocumentPictureClassifierOptions(),
+                accelerator_options=pipeline_options.accelerator_options,
+            ),
+            # Document Picture description
+            picture_description_model,
+        ]
+
+    def _get_picture_description_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[PictureDescriptionBaseModel]:
+        factory = get_picture_description_factory(
+            allow_external_plugins=self.pipeline_options.allow_external_plugins
+        )
+        return factory.create_instance(
+            options=self.pipeline_options.picture_description_options,
+            enabled=self.pipeline_options.do_picture_description,
+            enable_remote_services=self.pipeline_options.enable_remote_services,
+            artifacts_path=artifacts_path,
+            accelerator_options=self.pipeline_options.accelerator_options,
+        )
+
+    @classmethod
+    @abstractmethod
+    def get_default_options(cls) -> ConvertPipelineOptions:
+        pass
+
+
+class PaginatedPipeline(ConvertPipeline):  # TODO this is a bad name.
+    def __init__(self, pipeline_options: ConvertPipelineOptions):
+        super().__init__(pipeline_options)
+        self.keep_backend = False
+
+    def _apply_on_pages(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for model in self.build_pipe:
+            page_batch = model(conv_res, page_batch)
+
+        yield from page_batch
+
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        if not isinstance(conv_res.input._backend, PaginatedDocumentBackend):
+            raise RuntimeError(
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a paginated backend. "
+                f"Can not convert this with a paginated PDF pipeline. "
+                f"Please check your format configuration on DocumentConverter."
+            )
+            # conv_res.status = ConversionStatus.FAILURE
+            # return conv_res
+
+        total_elapsed_time = 0.0
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            for i in range(conv_res.input.page_count):
+                start_page, end_page = conv_res.input.limits.page_range
+                if (start_page - 1) <= i <= (end_page - 1):
+                    conv_res.pages.append(Page(page_no=i + 1))
+
+            try:
+                total_pages_processed = 0
+                # Iterate batches of pages (page_batch_size) in the doc
+                for page_batch in chunkify(
+                    conv_res.pages, settings.perf.page_batch_size
+                ):
+                    start_batch_time = time.monotonic()
+
+                    # 1. Initialise the page resources
+                    init_pages = map(
+                        functools.partial(self.initialize_page, conv_res), page_batch
+                    )
+
+                    # 2. Run pipeline stages
+                    pipeline_pages = self._apply_on_pages(conv_res, init_pages)
+
+                    for p in pipeline_pages:  # Must exhaust!
+                        # Cleanup cached images
+                        if not self.keep_images:
+                            p._image_cache = {}
+
+                        # Cleanup page backends
+                        if not self.keep_backend and p._backend is not None:
+                            p._backend.unload()
+                        if (
+                            isinstance(self.pipeline_options, PdfPipelineOptions)
+                            and not self.pipeline_options.generate_parsed_pages
+                        ):
+                            del p.parsed_page
+                            p.parsed_page = None
+
+                    end_batch_time = time.monotonic()
+                    total_elapsed_time += end_batch_time - start_batch_time
+                    if (
+                        self.pipeline_options.document_timeout is not None
+                        and total_elapsed_time > self.pipeline_options.document_timeout
+                    ):
+                        _log.warning(
+                            f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
+                        )
+                        conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+                        break
+                    total_pages_processed += len(page_batch)
+                    _log.debug(
+                        f"Finished converting pages {total_pages_processed}/{len(conv_res.pages)} time={end_batch_time:.3f}"
+                    )
+
+            except Exception as e:
+                conv_res.status = ConversionStatus.FAILURE
+                trace = "\n".join(
+                    traceback.format_exception(type(e), e, e.__traceback__)
+                )
+                _log.warning(
+                    f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
+                    f"{trace}"
+                )
+                raise e
+
+        # Filter out uninitialized pages (those with size=None) that may remain
+        # after timeout or processing failures to prevent assertion errors downstream
+        initial_page_count = len(conv_res.pages)
+        conv_res.pages = [page for page in conv_res.pages if page.size is not None]
+
+        if len(conv_res.pages) < initial_page_count:
+            _log.info(
+                f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
+                f"due to timeout or processing failures"
+            )
+
+        return conv_res
+
+    def _unload(self, conv_res: ConversionResult) -> ConversionResult:
+        for page in conv_res.pages:
+            if page._backend is not None:
+                page._backend.unload()
+
+        if conv_res.input._backend:
+            conv_res.input._backend.unload()
+
+        return conv_res
+
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        status = conv_res.status
+        if status in [
+            ConversionStatus.PENDING,
+            ConversionStatus.STARTED,
+        ]:  # preserves ConversionStatus.PARTIAL_SUCCESS
+            status = ConversionStatus.SUCCESS
+
+        for page in conv_res.pages:
+            if page._backend is None or not page._backend.is_valid():
+                conv_res.errors.append(
+                    ErrorItem(
+                        component_type=DoclingComponentType.DOCUMENT_BACKEND,
+                        module_name=type(page._backend).__name__,
+                        error_message=f"Page {page.page_no} failed to parse.",
+                    )
+                )
+                status = ConversionStatus.PARTIAL_SUCCESS
+
+        return status
+
+    # Initialise and load resources for a page
+    @abstractmethod
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
+        pass
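
The classes above form a template method: `execute()` drives `_build_document` → `_assemble_document` → `_enrich_document` → `_determine_status` and always `_unload`s in a `finally`, while `PaginatedPipeline._build_document` lazily pulls page batches through the callables in `build_pipe`. A skeletal subclass makes the contract concrete; everything named `MinimalPipeline` or `_echo_stage` below is invented for illustration and is not part of this release.

```python
# Illustrative sketch only: a do-nothing concrete pipeline showing which
# hooks a PaginatedPipeline subclass must fill in.
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.pipeline.base_pipeline import PaginatedPipeline


class MinimalPipeline(PaginatedPipeline):
    def __init__(self, pipeline_options: ConvertPipelineOptions):
        super().__init__(pipeline_options)
        # build_pipe stages are callables (conv_res, page_batch) -> page_batch,
        # chained lazily by _apply_on_pages().
        self.build_pipe = [self._echo_stage]

    @staticmethod
    def _echo_stage(conv_res, page_batch):
        for page in page_batch:  # a real stage would run OCR, layout, etc.
            yield page

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        # Load the page backend and record the page size, so the page
        # survives the size-is-not-None filter in _build_document().
        page._backend = conv_res.input._backend.load_page(page.page_no - 1)
        page.size = page._backend.get_size()
        return page

    @classmethod
    def get_default_options(cls) -> ConvertPipelineOptions:
        return ConvertPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return isinstance(backend, PdfDocumentBackend)
```

Note that error handling, timeouts, and resource cleanup all live in the base classes, so a subclass only supplies stages and page initialization.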
docling/pipeline/extraction_vlm_pipeline.py (new file)

@@ -0,0 +1,207 @@
+import inspect
+import json
+import logging
+from typing import Optional
+
+from PIL.Image import Image
+from pydantic import BaseModel
+
+from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, ErrorItem, VlmStopReason
+from docling.datamodel.document import InputDocument
+from docling.datamodel.extraction import (
+    ExtractedPageData,
+    ExtractionResult,
+    ExtractionTemplateType,
+)
+from docling.datamodel.pipeline_options import (
+    PipelineOptions,
+    VlmExtractionPipelineOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.extraction.nuextract_transformers_model import (
+    NuExtractTransformersModel,
+)
+from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
+from docling.utils.accelerator_utils import decide_device
+
+_log = logging.getLogger(__name__)
+
+
+class ExtractionVlmPipeline(BaseExtractionPipeline):
+    def __init__(self, pipeline_options: VlmExtractionPipelineOptions):
+        super().__init__(pipeline_options)
+
+        # Initialize VLM model with default options
+        self.accelerator_options = pipeline_options.accelerator_options
+        self.pipeline_options: VlmExtractionPipelineOptions
+
+        # Create VLM model instance
+        self.vlm_model = NuExtractTransformersModel(
+            enabled=True,
+            artifacts_path=self.artifacts_path,  # Will download automatically
+            accelerator_options=self.accelerator_options,
+            vlm_options=pipeline_options.vlm_options,
+        )
+
+    def _extract_data(
+        self,
+        ext_res: ExtractionResult,
+        template: Optional[ExtractionTemplateType] = None,
+    ) -> ExtractionResult:
+        """Extract data using the VLM model."""
+        try:
+            # Get images from input document using the backend
+            images = self._get_images_from_input(ext_res.input)
+            if not images:
+                ext_res.status = ConversionStatus.FAILURE
+                ext_res.errors.append(
+                    ErrorItem(
+                        component_type="extraction_pipeline",
+                        module_name=self.__class__.__name__,
+                        error_message="No images found in document",
+                    )
+                )
+                return ext_res
+
+            # Use provided template or default prompt
+            if template is not None:
+                prompt = self._serialize_template(template)
+            else:
+                prompt = "Extract all text and structured information from this document. Return as JSON."
+
+            # Process all images with VLM model
+            start_page, end_page = ext_res.input.limits.page_range
+            for i, image in enumerate(images):
+                # Calculate the actual page number based on the filtered range
+                page_number = start_page + i
+                try:
+                    predictions = list(self.vlm_model.process_images([image], prompt))
+
+                    if predictions:
+                        # Parse the extracted text as JSON if possible, otherwise use as-is
+                        extracted_text = predictions[0].text
+                        extracted_data = None
+                        vlm_stop_reason: VlmStopReason = predictions[0].stop_reason
+                        if (
+                            vlm_stop_reason == VlmStopReason.LENGTH
+                            or vlm_stop_reason == VlmStopReason.STOP_SEQUENCE
+                        ):
+                            ext_res.status = ConversionStatus.PARTIAL_SUCCESS
+
+                        try:
+                            extracted_data = json.loads(extracted_text)
+                        except (json.JSONDecodeError, ValueError):
+                            # If not valid JSON, keep extracted_data as None
+                            pass
+
+                        # Create page data with proper structure
+                        page_data = ExtractedPageData(
+                            page_no=page_number,
+                            extracted_data=extracted_data,
+                            raw_text=extracted_text,  # Always populate raw_text
+                        )
+                        ext_res.pages.append(page_data)
+                    else:
+                        # Add error page data
+                        page_data = ExtractedPageData(
+                            page_no=page_number,
+                            extracted_data=None,
+                            errors=["No extraction result from VLM model"],
+                        )
+                        ext_res.pages.append(page_data)
+
+                except Exception as e:
+                    _log.error(f"Error processing page {page_number}: {e}")
+                    page_data = ExtractedPageData(
+                        page_no=page_number, extracted_data=None, errors=[str(e)]
+                    )
+                    ext_res.pages.append(page_data)
+
+        except Exception as e:
+            _log.error(f"Error during extraction: {e}")
+            ext_res.errors.append(
+                ErrorItem(
+                    component_type="extraction_pipeline",
+                    module_name=self.__class__.__name__,
+                    error_message=str(e),
+                )
+            )
+
+        return ext_res
+
+    def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
+        """Determine the status based on extraction results."""
+        if ext_res.pages and not any(page.errors for page in ext_res.pages):
+            return (
+                ConversionStatus.PARTIAL_SUCCESS
+                if ext_res.status == ConversionStatus.PARTIAL_SUCCESS
+                else ConversionStatus.SUCCESS
+            )
+        else:
+            return ConversionStatus.FAILURE
+
+    def _get_images_from_input(self, input_doc: InputDocument) -> list[Image]:
+        """Extract images from input document using the backend."""
+        images = []
+
+        try:
+            backend = input_doc._backend
+
+            assert isinstance(backend, PdfDocumentBackend)
+            # Use the backend's pagination interface
+            page_count = backend.page_count()
+
+            # Respect page range limits, following the same pattern as PaginatedPipeline
+            start_page, end_page = input_doc.limits.page_range
+            _log.info(
+                f"Processing pages {start_page}-{end_page} of {page_count} total pages for extraction"
+            )
+
+            for page_num in range(page_count):
+                # Only process pages within the specified range (0-based indexing)
+                if start_page - 1 <= page_num <= end_page - 1:
+                    try:
+                        page_backend = backend.load_page(page_num)
+                        if page_backend.is_valid():
+                            # Get page image at a reasonable scale
+                            page_image = page_backend.get_page_image(
+                                scale=self.pipeline_options.vlm_options.scale
+                            )
+                            images.append(page_image)
+                        else:
+                            _log.warning(f"Page {page_num + 1} backend is not valid")
+                    except Exception as e:
+                        _log.error(f"Error loading page {page_num + 1}: {e}")
+
+        except Exception as e:
+            _log.error(f"Error getting images from input document: {e}")
+
+        return images
+
+    def _serialize_template(self, template: ExtractionTemplateType) -> str:
+        """Serialize template to string based on its type."""
+        if isinstance(template, str):
+            return template
+        elif isinstance(template, dict):
+            return json.dumps(template, indent=2)
+        elif isinstance(template, BaseModel):
+            return template.model_dump_json(indent=2)
+        elif inspect.isclass(template) and issubclass(template, BaseModel):
+            from polyfactory.factories.pydantic_factory import ModelFactory
+
+            class ExtractionTemplateFactory(ModelFactory[template]):  # type: ignore
+                __use_examples__ = True  # prefer Field(examples=...) when present
+                __use_defaults__ = True  # use field defaults instead of random values
+                __check_model__ = (
+                    True  # setting the value to avoid deprecation warnings
+                )
+
+            return ExtractionTemplateFactory.build().model_dump_json(indent=2)  # type: ignore
+        else:
+            raise ValueError(f"Unsupported template type: {type(template)}")
+
+    @classmethod
+    def get_default_options(cls) -> PipelineOptions:
+        return VlmExtractionPipelineOptions()