docling-2.69.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
docling/pipeline/base_pipeline.py
@@ -0,0 +1,326 @@
+ import functools
+ import logging
+ import time
+ import traceback
+ from abc import ABC, abstractmethod
+ from collections.abc import Iterable
+ from pathlib import Path
+ from typing import Any, Callable, List, Optional
+
+ from docling_core.types.doc import NodeItem
+
+ from docling.backend.abstract_backend import (
+     AbstractDocumentBackend,
+     PaginatedDocumentBackend,
+ )
+ from docling.backend.pdf_backend import PdfDocumentBackend
+ from docling.datamodel.base_models import (
+     ConversionStatus,
+     DoclingComponentType,
+     ErrorItem,
+     Page,
+ )
+ from docling.datamodel.document import ConversionResult, InputDocument
+ from docling.datamodel.pipeline_options import (
+     ConvertPipelineOptions,
+     PdfPipelineOptions,
+     PipelineOptions,
+ )
+ from docling.datamodel.settings import settings
+ from docling.models.base_model import GenericEnrichmentModel
+ from docling.models.factories import get_picture_description_factory
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+ from docling.models.stages.picture_classifier.document_picture_classifier import (
+     DocumentPictureClassifier,
+     DocumentPictureClassifierOptions,
+ )
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
+ from docling.utils.utils import chunkify
+
+ _log = logging.getLogger(__name__)
+
+
+ class BasePipeline(ABC):
+     def __init__(self, pipeline_options: PipelineOptions):
+         self.pipeline_options = pipeline_options
+         self.keep_images = False
+         self.build_pipe: List[Callable] = []
+         self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
+
+         self.artifacts_path: Optional[Path] = None
+         if pipeline_options.artifacts_path is not None:
+             self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+         elif settings.artifacts_path is not None:
+             self.artifacts_path = Path(settings.artifacts_path).expanduser()
+
+         if self.artifacts_path is not None and not self.artifacts_path.is_dir():
+             raise RuntimeError(
+                 f"The value of {self.artifacts_path=} is not valid. "
+                 "When defined, it must point to a folder containing all models required by the pipeline."
+             )
+
+     def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
+         conv_res = ConversionResult(input=in_doc)
+
+         _log.info(f"Processing document {in_doc.file.name}")
+         try:
+             with TimeRecorder(
+                 conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
+             ):
+                 # These steps are building and assembling the structure of the
+                 # output DoclingDocument.
+                 conv_res = self._build_document(conv_res)
+                 conv_res = self._assemble_document(conv_res)
+                 # From this stage, all operations should rely only on conv_res.output
+                 conv_res = self._enrich_document(conv_res)
+                 conv_res.status = self._determine_status(conv_res)
+         except Exception as e:
+             conv_res.status = ConversionStatus.FAILURE
+             if not raises_on_error:
+                 error_item = ErrorItem(
+                     component_type=DoclingComponentType.PIPELINE,
+                     module_name=self.__class__.__name__,
+                     error_message=str(e),
+                 )
+                 conv_res.errors.append(error_item)
+             else:
+                 raise RuntimeError(f"Pipeline {self.__class__.__name__} failed") from e
+         finally:
+             self._unload(conv_res)
+
+         return conv_res
+
+     @abstractmethod
+     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+         pass
+
+     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+         return conv_res
+
+     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
+         def _prepare_elements(
+             conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
+         ) -> Iterable[NodeItem]:
+             for doc_element, _level in conv_res.document.iterate_items():
+                 prepared_element = model.prepare_element(
+                     conv_res=conv_res, element=doc_element
+                 )
+                 if prepared_element is not None:
+                     yield prepared_element
+
+         with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
+             for model in self.enrichment_pipe:
+                 for element_batch in chunkify(
+                     _prepare_elements(conv_res, model),
+                     model.elements_batch_size,
+                 ):
+                     for element in model(
+                         doc=conv_res.document, element_batch=element_batch
+                     ):  # Must exhaust!
+                         pass
+
+         return conv_res
+
+     @abstractmethod
+     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+         pass
+
+     def _unload(self, conv_res: ConversionResult):
+         pass
+
+     @classmethod
+     @abstractmethod
+     def get_default_options(cls) -> PipelineOptions:
+         pass
+
+     @classmethod
+     @abstractmethod
+     def is_backend_supported(cls, backend: AbstractDocumentBackend):
+         pass
+
+
+ class ConvertPipeline(BasePipeline):
+     def __init__(self, pipeline_options: ConvertPipelineOptions):
+         super().__init__(pipeline_options)
+         self.pipeline_options: ConvertPipelineOptions
+
+         # ------ Common enrichment models working on all backends
+
+         # Picture description model
+         if (
+             picture_description_model := self._get_picture_description_model(
+                 artifacts_path=self.artifacts_path
+             )
+         ) is None:
+             raise RuntimeError(
+                 f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
+             )
+
+         self.enrichment_pipe = [
+             # Document Picture Classifier
+             DocumentPictureClassifier(
+                 enabled=pipeline_options.do_picture_classification,
+                 artifacts_path=self.artifacts_path,
+                 options=DocumentPictureClassifierOptions(),
+                 accelerator_options=pipeline_options.accelerator_options,
+             ),
+             # Document Picture description
+             picture_description_model,
+         ]
+
+     def _get_picture_description_model(
+         self, artifacts_path: Optional[Path] = None
+     ) -> Optional[PictureDescriptionBaseModel]:
+         factory = get_picture_description_factory(
+             allow_external_plugins=self.pipeline_options.allow_external_plugins
+         )
+         return factory.create_instance(
+             options=self.pipeline_options.picture_description_options,
+             enabled=self.pipeline_options.do_picture_description,
+             enable_remote_services=self.pipeline_options.enable_remote_services,
+             artifacts_path=artifacts_path,
+             accelerator_options=self.pipeline_options.accelerator_options,
+         )
+
+     @classmethod
+     @abstractmethod
+     def get_default_options(cls) -> ConvertPipelineOptions:
+         pass
+
+
+ class PaginatedPipeline(ConvertPipeline):  # TODO this is a bad name.
+     def __init__(self, pipeline_options: ConvertPipelineOptions):
+         super().__init__(pipeline_options)
+         self.keep_backend = False
+
+     def _apply_on_pages(
+         self, conv_res: ConversionResult, page_batch: Iterable[Page]
+     ) -> Iterable[Page]:
+         for model in self.build_pipe:
+             page_batch = model(conv_res, page_batch)
+
+         yield from page_batch
+
+     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+         if not isinstance(conv_res.input._backend, PaginatedDocumentBackend):
+             raise RuntimeError(
+                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a paginated backend. "
+                 f"Can not convert this with a paginated PDF pipeline. "
+                 f"Please check your format configuration on DocumentConverter."
+             )
+             # conv_res.status = ConversionStatus.FAILURE
+             # return conv_res
+
+         total_elapsed_time = 0.0
+         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+             for i in range(conv_res.input.page_count):
+                 start_page, end_page = conv_res.input.limits.page_range
+                 if (start_page - 1) <= i <= (end_page - 1):
+                     conv_res.pages.append(Page(page_no=i + 1))
+
+             try:
+                 total_pages_processed = 0
+                 # Iterate batches of pages (page_batch_size) in the doc
+                 for page_batch in chunkify(
+                     conv_res.pages, settings.perf.page_batch_size
+                 ):
+                     start_batch_time = time.monotonic()
+
+                     # 1. Initialise the page resources
+                     init_pages = map(
+                         functools.partial(self.initialize_page, conv_res), page_batch
+                     )
+
+                     # 2. Run pipeline stages
+                     pipeline_pages = self._apply_on_pages(conv_res, init_pages)
+
+                     for p in pipeline_pages:  # Must exhaust!
+                         # Cleanup cached images
+                         if not self.keep_images:
+                             p._image_cache = {}
+
+                         # Cleanup page backends
+                         if not self.keep_backend and p._backend is not None:
+                             p._backend.unload()
+                         if (
+                             isinstance(self.pipeline_options, PdfPipelineOptions)
+                             and not self.pipeline_options.generate_parsed_pages
+                         ):
+                             del p.parsed_page
+                             p.parsed_page = None
+
+                     end_batch_time = time.monotonic()
+                     total_elapsed_time += end_batch_time - start_batch_time
+                     if (
+                         self.pipeline_options.document_timeout is not None
+                         and total_elapsed_time > self.pipeline_options.document_timeout
+                     ):
+                         _log.warning(
+                             f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
+                         )
+                         conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+                         break
+                     total_pages_processed += len(page_batch)
+                     _log.debug(
+                         f"Finished converting pages {total_pages_processed}/{len(conv_res.pages)} time={end_batch_time:.3f}"
+                     )
+
+             except Exception as e:
+                 conv_res.status = ConversionStatus.FAILURE
+                 trace = "\n".join(
+                     traceback.format_exception(type(e), e, e.__traceback__)
+                 )
+                 _log.warning(
+                     f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
+                     f"{trace}"
+                 )
+                 raise e
+
+         # Filter out uninitialized pages (those with size=None) that may remain
+         # after timeout or processing failures to prevent assertion errors downstream
+         initial_page_count = len(conv_res.pages)
+         conv_res.pages = [page for page in conv_res.pages if page.size is not None]
+
+         if len(conv_res.pages) < initial_page_count:
+             _log.info(
+                 f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
+                 f"due to timeout or processing failures"
+             )
+
+         return conv_res
+
+     def _unload(self, conv_res: ConversionResult) -> ConversionResult:
+         for page in conv_res.pages:
+             if page._backend is not None:
+                 page._backend.unload()
+
+         if conv_res.input._backend:
+             conv_res.input._backend.unload()
+
+         return conv_res
+
+     def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+         status = conv_res.status
+         if status in [
+             ConversionStatus.PENDING,
+             ConversionStatus.STARTED,
+         ]:  # preserves ConversionStatus.PARTIAL_SUCCESS
+             status = ConversionStatus.SUCCESS
+
+         for page in conv_res.pages:
+             if page._backend is None or not page._backend.is_valid():
+                 conv_res.errors.append(
+                     ErrorItem(
+                         component_type=DoclingComponentType.DOCUMENT_BACKEND,
+                         module_name=type(page._backend).__name__,
+                         error_message=f"Page {page.page_no} failed to parse.",
+                     )
+                 )
+                 status = ConversionStatus.PARTIAL_SUCCESS
+
+         return status
+
+     # Initialise and load resources for a page
+     @abstractmethod
+     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
+         pass
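
This hunk is the abstract conversion skeleton: execute() drives _build_document, _assemble_document, _enrich_document, and _determine_status in order, collects errors per the raises_on_error flag, and always unloads backends in the finally block. Below is a minimal sketch of how a concrete subclass plugs into those hooks. It is not part of the release: EchoPipeline and its no-op behaviour are hypothetical, and only the overridden method names come from the diff above.

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_pipeline import BasePipeline


class EchoPipeline(BasePipeline):
    """Hypothetical pipeline that returns the conversion result unchanged."""

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        # A real pipeline would populate conv_res here (compare PaginatedPipeline,
        # which batches pages through self.build_pipe).
        return conv_res

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        return ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return True


# execute() then wraps these hooks with profiling, error collection, and cleanup:
# EchoPipeline(PipelineOptions()).execute(in_doc, raises_on_error=False)
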
docling/pipeline/extraction_vlm_pipeline.py
@@ -0,0 +1,207 @@
+ import inspect
+ import json
+ import logging
+ from typing import Optional
+
+ from PIL.Image import Image
+ from pydantic import BaseModel
+
+ from docling.backend.abstract_backend import PaginatedDocumentBackend
+ from docling.backend.pdf_backend import PdfDocumentBackend
+ from docling.datamodel.base_models import ConversionStatus, ErrorItem, VlmStopReason
+ from docling.datamodel.document import InputDocument
+ from docling.datamodel.extraction import (
+     ExtractedPageData,
+     ExtractionResult,
+     ExtractionTemplateType,
+ )
+ from docling.datamodel.pipeline_options import (
+     PipelineOptions,
+     VlmExtractionPipelineOptions,
+ )
+ from docling.datamodel.settings import settings
+ from docling.models.extraction.nuextract_transformers_model import (
+     NuExtractTransformersModel,
+ )
+ from docling.pipeline.base_extraction_pipeline import BaseExtractionPipeline
+ from docling.utils.accelerator_utils import decide_device
+
+ _log = logging.getLogger(__name__)
+
+
+ class ExtractionVlmPipeline(BaseExtractionPipeline):
+     def __init__(self, pipeline_options: VlmExtractionPipelineOptions):
+         super().__init__(pipeline_options)
+
+         # Initialize VLM model with default options
+         self.accelerator_options = pipeline_options.accelerator_options
+         self.pipeline_options: VlmExtractionPipelineOptions
+
+         # Create VLM model instance
+         self.vlm_model = NuExtractTransformersModel(
+             enabled=True,
+             artifacts_path=self.artifacts_path,  # Will download automatically
+             accelerator_options=self.accelerator_options,
+             vlm_options=pipeline_options.vlm_options,
+         )
+
+     def _extract_data(
+         self,
+         ext_res: ExtractionResult,
+         template: Optional[ExtractionTemplateType] = None,
+     ) -> ExtractionResult:
+         """Extract data using the VLM model."""
+         try:
+             # Get images from input document using the backend
+             images = self._get_images_from_input(ext_res.input)
+             if not images:
+                 ext_res.status = ConversionStatus.FAILURE
+                 ext_res.errors.append(
+                     ErrorItem(
+                         component_type="extraction_pipeline",
+                         module_name=self.__class__.__name__,
+                         error_message="No images found in document",
+                     )
+                 )
+                 return ext_res
+
+             # Use provided template or default prompt
+             if template is not None:
+                 prompt = self._serialize_template(template)
+             else:
+                 prompt = "Extract all text and structured information from this document. Return as JSON."
+
+             # Process all images with VLM model
+             start_page, end_page = ext_res.input.limits.page_range
+             for i, image in enumerate(images):
+                 # Calculate the actual page number based on the filtered range
+                 page_number = start_page + i
+                 try:
+                     predictions = list(self.vlm_model.process_images([image], prompt))
+
+                     if predictions:
+                         # Parse the extracted text as JSON if possible, otherwise use as-is
+                         extracted_text = predictions[0].text
+                         extracted_data = None
+                         vlm_stop_reason: VlmStopReason = predictions[0].stop_reason
+                         if (
+                             vlm_stop_reason == VlmStopReason.LENGTH
+                             or vlm_stop_reason == VlmStopReason.STOP_SEQUENCE
+                         ):
+                             ext_res.status = ConversionStatus.PARTIAL_SUCCESS
+
+                         try:
+                             extracted_data = json.loads(extracted_text)
+                         except (json.JSONDecodeError, ValueError):
+                             # If not valid JSON, keep extracted_data as None
+                             pass
+
+                         # Create page data with proper structure
+                         page_data = ExtractedPageData(
+                             page_no=page_number,
+                             extracted_data=extracted_data,
+                             raw_text=extracted_text,  # Always populate raw_text
+                         )
+                         ext_res.pages.append(page_data)
+                     else:
+                         # Add error page data
+                         page_data = ExtractedPageData(
+                             page_no=page_number,
+                             extracted_data=None,
+                             errors=["No extraction result from VLM model"],
+                         )
+                         ext_res.pages.append(page_data)
+
+                 except Exception as e:
+                     _log.error(f"Error processing page {page_number}: {e}")
+                     page_data = ExtractedPageData(
+                         page_no=page_number, extracted_data=None, errors=[str(e)]
+                     )
+                     ext_res.pages.append(page_data)
+
+         except Exception as e:
+             _log.error(f"Error during extraction: {e}")
+             ext_res.errors.append(
+                 ErrorItem(
+                     component_type="extraction_pipeline",
+                     module_name=self.__class__.__name__,
+                     error_message=str(e),
+                 )
+             )
+
+         return ext_res
+
+     def _determine_status(self, ext_res: ExtractionResult) -> ConversionStatus:
+         """Determine the status based on extraction results."""
+         if ext_res.pages and not any(page.errors for page in ext_res.pages):
+             return (
+                 ConversionStatus.PARTIAL_SUCCESS
+                 if ext_res.status == ConversionStatus.PARTIAL_SUCCESS
+                 else ConversionStatus.SUCCESS
+             )
+         else:
+             return ConversionStatus.FAILURE
+
+     def _get_images_from_input(self, input_doc: InputDocument) -> list[Image]:
+         """Extract images from input document using the backend."""
+         images = []
+
+         try:
+             backend = input_doc._backend
+
+             assert isinstance(backend, PdfDocumentBackend)
+             # Use the backend's pagination interface
+             page_count = backend.page_count()
+
+             # Respect page range limits, following the same pattern as PaginatedPipeline
+             start_page, end_page = input_doc.limits.page_range
+             _log.info(
+                 f"Processing pages {start_page}-{end_page} of {page_count} total pages for extraction"
+             )
+
+             for page_num in range(page_count):
+                 # Only process pages within the specified range (0-based indexing)
+                 if start_page - 1 <= page_num <= end_page - 1:
+                     try:
+                         page_backend = backend.load_page(page_num)
+                         if page_backend.is_valid():
+                             # Get page image at a reasonable scale
+                             page_image = page_backend.get_page_image(
+                                 scale=self.pipeline_options.vlm_options.scale
+                             )
+                             images.append(page_image)
+                         else:
+                             _log.warning(f"Page {page_num + 1} backend is not valid")
+                     except Exception as e:
+                         _log.error(f"Error loading page {page_num + 1}: {e}")
+
+         except Exception as e:
+             _log.error(f"Error getting images from input document: {e}")
+
+         return images
+
+     def _serialize_template(self, template: ExtractionTemplateType) -> str:
+         """Serialize template to string based on its type."""
+         if isinstance(template, str):
+             return template
+         elif isinstance(template, dict):
+             return json.dumps(template, indent=2)
+         elif isinstance(template, BaseModel):
+             return template.model_dump_json(indent=2)
+         elif inspect.isclass(template) and issubclass(template, BaseModel):
+             from polyfactory.factories.pydantic_factory import ModelFactory
+
+             class ExtractionTemplateFactory(ModelFactory[template]):  # type: ignore
+                 __use_examples__ = True  # prefer Field(examples=...) when present
+                 __use_defaults__ = True  # use field defaults instead of random values
+                 __check_model__ = (
+                     True  # setting the value to avoid deprecation warnings
+                 )
+
+             return ExtractionTemplateFactory.build().model_dump_json(indent=2)  # type: ignore
+         else:
+             raise ValueError(f"Unsupported template type: {type(template)}")
+
+     @classmethod
+     def get_default_options(cls) -> PipelineOptions:
+         return VlmExtractionPipelineOptions()
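
For reference, _serialize_template() above accepts four template shapes: a plain string (used verbatim), a dict (dumped as JSON), a pydantic model instance (model_dump_json), or a pydantic model class, which polyfactory expands into an example instance that then serves as the VLM prompt. Below is a hedged, standalone sketch of that last branch; the Invoice model is hypothetical, while the three factory flags mirror the ones set in the hunk above.

from polyfactory.factories.pydantic_factory import ModelFactory
from pydantic import BaseModel, Field


class Invoice(BaseModel):
    """Hypothetical extraction template."""

    vendor: str = Field(default="", examples=["ACME Corp."])
    total: float = Field(default=0.0, examples=[123.45])


class InvoiceTemplateFactory(ModelFactory[Invoice]):
    __use_examples__ = True  # prefer Field(examples=...) when present
    __use_defaults__ = True  # fall back to field defaults, not random values
    __check_model__ = True


# Produces an example JSON payload (e.g. vendor/total filled from the examples
# above) of the kind the pipeline sends to the VLM as the extraction prompt.
print(InvoiceTemplateFactory.build().model_dump_json(indent=2))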