docling 2.37.0__py3-none-any.whl → 2.38.1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
docling/backend/noop_backend.py ADDED
@@ -0,0 +1,51 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class NoOpBackend(AbstractDocumentBackend):
+    """
+    A no-op backend that only validates input existence.
+    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
+    """
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")
+
+        # Validate input
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                # Check if stream has content
+                self.valid = len(self.path_or_stream.getvalue()) > 0
+                _log.debug(
+                    f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
+                )
+            elif isinstance(self.path_or_stream, Path):
+                # Check if file exists
+                self.valid = self.path_or_stream.exists()
+                _log.debug(f"File exists: {self.valid}")
+            else:
+                self.valid = False
+        except Exception as e:
+            _log.error(f"NoOpBackend validation failed: {e}")
+            self.valid = False
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set(InputFormat)
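
The new backend deliberately skips parsing: it only marks the input valid so the pipeline (for audio, the ASR pipeline) can take over. Its capabilities can be checked without constructing a document; a minimal sketch using only the classmethods shown above:

from docling.backend.noop_backend import NoOpBackend
from docling.datamodel.base_models import InputFormat

# The no-op backend claims every input format and no pagination;
# actual decoding is deferred to the selected pipeline.
assert NoOpBackend.supports_pagination() is False
assert InputFormat.AUDIO in NoOpBackend.supported_formats()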
docling/cli/main.py CHANGED
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    PipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_TRANSFORMERS,
     VlmModelType,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+    AudioFormatOption,
+    DocumentConverter,
+    FormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
    ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -450,12 +471,14 @@ def convert( # noqa: C901
         ),
     ] = None,
 ):
+    log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
+
     if verbose == 0:
-        logging.basicConfig(level=logging.WARNING)
+        logging.basicConfig(level=logging.WARNING, format=log_format)
     elif verbose == 1:
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=logging.INFO, format=log_format)
     else:
-        logging.basicConfig(level=logging.DEBUG)
+        logging.basicConfig(level=logging.DEBUG, format=log_format)
 
     settings.debug.visualize_cells = debug_visualize_cells
     settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
         ocr_options.lang = ocr_lang_list
 
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options: PaginatedPipelineOptions
+    # pipeline_options: PaginatedPipelineOptions
+    pipeline_options: PipelineOptions
+
+    format_options: Dict[InputFormat, FormatOption] = {}
 
-    if pipeline == PdfPipeline.STANDARD:
+    if pipeline == ProcessingPipeline.STANDARD:
        pipeline_options = PdfPipelineOptions(
             allow_external_plugins=allow_external_plugins,
             enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
             pipeline_options=pipeline_options,
             backend=backend,  # pdf_backend
         )
-    elif pipeline == PdfPipeline.VLM:
+
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.VLM:
         pipeline_options = VlmPipelineOptions(
             enable_remote_services=enable_remote_services,
         )
@@ -600,13 +632,48 @@ def convert( # noqa: C901
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
 
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.ASR:
+        pipeline_options = AsrPipelineOptions(
+            # enable_remote_services=enable_remote_services,
+            # artifacts_path = artifacts_path
+        )
+
+        if asr_model == AsrModelType.WHISPER_TINY:
+            pipeline_options.asr_options = WHISPER_TINY
+        elif asr_model == AsrModelType.WHISPER_SMALL:
+            pipeline_options.asr_options = WHISPER_SMALL
+        elif asr_model == AsrModelType.WHISPER_MEDIUM:
+            pipeline_options.asr_options = WHISPER_MEDIUM
+        elif asr_model == AsrModelType.WHISPER_BASE:
+            pipeline_options.asr_options = WHISPER_BASE
+        elif asr_model == AsrModelType.WHISPER_LARGE:
+            pipeline_options.asr_options = WHISPER_LARGE
+        elif asr_model == AsrModelType.WHISPER_TURBO:
+            pipeline_options.asr_options = WHISPER_TURBO
+        else:
+            _log.error(f"{asr_model} is not known")
+            raise ValueError(f"{asr_model} is not known")
+
+        _log.info(f"pipeline_options: {pipeline_options}")
+
+        audio_format_option = AudioFormatOption(
+            pipeline_cls=AsrPipeline,
+            pipeline_options=pipeline_options,
+        )
+
+        format_options = {
+            InputFormat.AUDIO: audio_format_option,
+        }
+
     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
+        # audio_pipeline_options.artifacts_path = artifacts_path
 
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: pdf_format_option,
-        InputFormat.IMAGE: pdf_format_option,
-    }
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
         format_options=format_options,
@@ -614,6 +681,7 @@ def convert( # noqa: C901
 
     start_time = time.time()
 
+    _log.info(f"paths: {input_doc_paths}")
     conv_results = doc_converter.convert_all(
         input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
     )
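
With this wiring, audio files can be routed through the ASR pipeline directly from the command line. A hypothetical invocation, assuming typer's default kebab-case flag names for the `pipeline` and `asr_model` parameters shown above:

docling --pipeline asr --asr-model whisper_turbo recording.mp3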
docling/datamodel/asr_model_specs.py ADDED
@@ -0,0 +1,92 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+    AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_asr_model import (
+    # AsrResponseFormat,
+    # ApiAsrOptions,
+    InferenceAsrFramework,
+    InlineAsrNativeWhisperOptions,
+    TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
+WHISPER_TINY = InlineAsrNativeWhisperOptions(
+    repo_id="tiny",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+    repo_id="small",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+    repo_id="medium",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_BASE = InlineAsrNativeWhisperOptions(
+    repo_id="base",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+    repo_id="large",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+    repo_id="turbo",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperature=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+
+class AsrModelType(str, Enum):
+    WHISPER_TINY = "whisper_tiny"
+    WHISPER_SMALL = "whisper_small"
+    WHISPER_MEDIUM = "whisper_medium"
+    WHISPER_BASE = "whisper_base"
+    WHISPER_LARGE = "whisper_large"
+    WHISPER_TURBO = "whisper_turbo"
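
Each spec is an ordinary pydantic model, so a per-call variant can be derived without mutating the shared default. A sketch assuming pydantic v2's `model_copy` and the `language` field defined on the whisper options below:

from docling.datamodel.asr_model_specs import WHISPER_TINY

# Derive a Spanish-language variant; the shared WHISPER_TINY spec stays untouched.
whisper_tiny_es = WHISPER_TINY.model_copy(update={"language": "es"})
print(whisper_tiny_es.repo_id)  # "tiny"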
docling/datamodel/base_models.py CHANGED
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
+    AUDIO = "audio"
 
 
 class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
+    InputFormat.AUDIO: ["wav", "mp3"],
 }
 
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
+    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }
 
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
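
These tables are what format detection consults (see the `MimeTypeToFormat.get` lookup in `_DocumentConversionInput` further below), so the new entries are enough to route `.wav`/`.mp3` inputs to `InputFormat.AUDIO`:

from docling.datamodel.base_models import FormatToExtensions, FormatToMimeType, InputFormat

assert "mp3" in FormatToExtensions[InputFormat.AUDIO]
assert "audio/wav" in FormatToMimeType[InputFormat.AUDIO]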
@@ -253,11 +256,18 @@ class Page(BaseModel):
         return []
 
     def get_image(
-        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+        self,
+        scale: float = 1.0,
+        max_size: Optional[int] = None,
+        cropbox: Optional[BoundingBox] = None,
     ) -> Optional[Image]:
         if self._backend is None:
             return self._image_cache.get(scale, None)
 
+        if max_size:
+            assert self.size is not None
+            scale = min(scale, max_size / max(self.size.as_tuple()))
+
         if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
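
The new `max_size` argument clamps the effective scale so the rendered image's longer edge never exceeds `max_size` pixels. A worked sketch of the arithmetic above (page dimensions are hypothetical):

# US-Letter page in points; requested scale 2.0, capped at 1024 px
scale, max_size = 2.0, 1024
page_size = (612.0, 792.0)
scale = min(scale, max_size / max(page_size))  # -> ~1.293
# 1.293 * 792 ≈ 1024, so the long edge lands exactly at the cap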
@@ -291,7 +301,7 @@ class OpenAiChatMessage(BaseModel):
 class OpenAiResponseChoice(BaseModel):
     index: int
     message: OpenAiChatMessage
-    finish_reason: str
+    finish_reason: Optional[str]
 
 
 class OpenAiResponseUsage(BaseModel):
docling/datamodel/document.py CHANGED
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
         backend: Type[AbstractDocumentBackend]
         if format not in format_options.keys():
             _log.error(
-                f"Input document {obj.name} does not match any allowed format."
+                f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
             )
             backend = _DummyBackend
         else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
+        _log.info(f"detected formats: {formats}")
+
         if formats:
             if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]
docling/datamodel/pipeline_options.py CHANGED
@@ -11,8 +11,13 @@ from pydantic import (
 )
 from typing_extensions import deprecated
 
+from docling.datamodel import asr_model_specs
+
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -202,7 +207,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
 
 # GraniteVision
 granite_picture_description = PictureDescriptionVlmOptions(
-    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+    repo_id="ibm-granite/granite-vision-3.2-2b-preview",
     prompt="What is shown in this image?",
 )
 
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
 
 
+class AsrPipelineOptions(PipelineOptions):
+    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
+    artifacts_path: Optional[Union[Path, str]] = None
+
+
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )
 
 
-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
+    ASR = "asr"
docling/datamodel/pipeline_options_asr_model.py ADDED
@@ -0,0 +1,57 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import (
+    # InferenceFramework,
+    TransformersModelType,
+)
+
+
+class BaseAsrOptions(BaseModel):
+    kind: str
+    # prompt: str
+
+
+class InferenceAsrFramework(str, Enum):
+    # MLX = "mlx"  # disabled for now
+    # TRANSFORMERS = "transformers"  # disabled for now
+    WHISPER = "whisper"
+
+
+class InlineAsrOptions(BaseAsrOptions):
+    kind: Literal["inline_model_options"] = "inline_model_options"
+
+    repo_id: str
+
+    verbose: bool = False
+    timestamps: bool = True
+
+    temperature: float = 0.0
+    max_new_tokens: int = 256
+    max_time_chunk: float = 30.0
+
+    torch_dtype: Optional[str] = None
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+class InlineAsrNativeWhisperOptions(InlineAsrOptions):
+    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
+
+    language: str = "en"
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+    ]
+    word_timestamps: bool = True
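
`repo_cache_folder` flattens a Hugging Face-style repo id into a filesystem-safe directory name. The repo id below is hypothetical (the bundled specs use bare names like "tiny"):

from docling.datamodel.pipeline_options_asr_model import InlineAsrNativeWhisperOptions

opts = InlineAsrNativeWhisperOptions(repo_id="openai/whisper-tiny")
print(opts.repo_cache_folder)  # "openai--whisper-tiny"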
docling/datamodel/pipeline_options_vlm_model.py CHANGED
@@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
     kind: str
     prompt: str
+    scale: float = 2.0
+    max_size: Optional[int] = None
 
 
 class ResponseFormat(str, Enum):
@@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    scale: float = 2.0
-
     temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
@@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
     ) # Default to ollama
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
-    scale: float = 2.0
     timeout: float = 60
     concurrency: int = 1
     response_format: ResponseFormat
docling/document_converter.py CHANGED
@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
     settings,
 )
 from docling.exceptions import ConversionError
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
 
 
+class AudioFormatOption(FormatOption):
+    pipeline_cls: Type = AsrPipeline
+    backend: Type[AbstractDocumentBackend] = NoOpBackend
+
+
 def _get_default_option(format: InputFormat) -> FormatOption:
     format_to_default_options = {
         InputFormat.CSV: FormatOption(
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.JSON_DOCLING: FormatOption(
             pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
         ),
+        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options
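
`AudioFormatOption` already defaults to `AsrPipeline` plus `NoOpBackend`, so programmatic use mirrors the CLI wiring above. A minimal sketch (the input path is hypothetical):

from docling.datamodel.base_models import InputFormat
from docling.document_converter import AudioFormatOption, DocumentConverter

# Route AUDIO inputs through the ASR pipeline via the no-op backend.
converter = DocumentConverter(
    format_options={InputFormat.AUDIO: AudioFormatOption()}
)
result = converter.convert("recording.mp3")  # transcribed by the ASR pipeline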
@@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
         with TimeRecorder(conv_res, "vlm"):
             assert page.size is not None
 
-            hi_res_image = page.get_image(scale=self.vlm_options.scale)
+            hi_res_image = page.get_image(
+                scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+            )
             assert hi_res_image is not None
             if hi_res_image:
                 if hi_res_image.mode != "RGB":
@@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
             coord_origin=bbox.coord_origin,
         )
 
-        page_ix = element_prov.page_no - 1
+        page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
         cropped_image = conv_res.pages[page_ix].get_image(
             scale=self.images_scale, cropbox=expanded_bbox
         )
@@ -124,7 +124,7 @@ class ReadingOrderModel:
         page_no = page.page_no + 1
         size = page.size
 
-        assert size is not None
+        assert size is not None, "Page size is not initialized."
 
         out_doc.add_page(page_no=page_no, size=size)
 
@@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
         with TimeRecorder(conv_res, "vlm"):
             assert page.size is not None
 
-            hi_res_image = page.get_image(scale=self.vlm_options.scale)
+            hi_res_image = page.get_image(
+                scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+            )
 
             # Define prompt structure
             prompt = self.formulate_prompt()
@@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
             assert page.size is not None
 
-            hi_res_image = page.get_image(scale=self.vlm_options.scale)
+            hi_res_image = page.get_image(
+                scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+            )
             if hi_res_image is not None:
                 im_width, im_height = hi_res_image.size