docling 2.27.0__py3-none-any.whl → 2.28.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,23 +112,30 @@ class DoclingParseV4PageBackend(PdfPageBackend):
112
112
  padbox.r = page_size.width - padbox.r
113
113
  padbox.t = page_size.height - padbox.t
114
114
 
115
- image = (
116
- self._ppage.render(
117
- scale=scale * 1.5,
118
- rotation=0, # no additional rotation
119
- crop=padbox.as_tuple(),
120
- )
121
- .to_pil()
122
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
123
- ) # We resize the image from 1.5x the given scale to make it sharper.
115
+ with pypdfium2_lock:
116
+ image = (
117
+ self._ppage.render(
118
+ scale=scale * 1.5,
119
+ rotation=0, # no additional rotation
120
+ crop=padbox.as_tuple(),
121
+ )
122
+ .to_pil()
123
+ .resize(
124
+ size=(round(cropbox.width * scale), round(cropbox.height * scale))
125
+ )
126
+ ) # We resize the image from 1.5x the given scale to make it sharper.
124
127
 
125
128
  return image
126
129
 
127
130
  def get_size(self) -> Size:
128
- return Size(
129
- width=self._dpage.dimension.width,
130
- height=self._dpage.dimension.height,
131
- )
131
+ with pypdfium2_lock:
132
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
133
+
134
+ # TODO: Take width and height from docling-parse.
135
+ # return Size(
136
+ # width=self._dpage.dimension.width,
137
+ # height=self._dpage.dimension.height,
138
+ # )
132
139
 
133
140
  def unload(self):
134
141
  self._ppage = None
@@ -16,6 +16,7 @@ from docling_core.types.doc import (
16
16
  TableCell,
17
17
  TableData,
18
18
  )
19
+ from docling_core.types.doc.document import ContentLayer
19
20
  from PIL import Image, UnidentifiedImageError
20
21
  from pptx import Presentation
21
22
  from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
421
422
  for shape in slide.shapes:
422
423
  handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
423
424
 
425
+ # Handle notes slide
426
+ if slide.has_notes_slide:
427
+ notes_slide = slide.notes_slide
428
+ notes_text = notes_slide.notes_text_frame.text.strip()
429
+ if notes_text:
430
+ bbox = BoundingBox(l=0, t=0, r=0, b=0)
431
+ prov = ProvenanceItem(
432
+ page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
433
+ )
434
+ doc.add_text(
435
+ label=DocItemLabel.TEXT,
436
+ parent=parent_slide,
437
+ text=notes_text,
438
+ prov=prov,
439
+ content_layer=ContentLayer.FURNITURE,
440
+ )
441
+
424
442
  return doc
@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
53
53
  self.max_levels: int = 10
54
54
  self.level_at_new_list: Optional[int] = None
55
55
  self.parents: dict[int, Optional[NodeItem]] = {}
56
+ self.numbered_headers: dict[int, int] = {}
56
57
  for i in range(-1, self.max_levels):
57
58
  self.parents[i] = None
58
59
 
@@ -275,8 +276,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
275
276
  only_equations.append(latex_equation)
276
277
  texts_and_equations.append(latex_equation)
277
278
 
278
- if "".join(only_texts) != text:
279
- return text
279
+ if "".join(only_texts).strip() != text.strip():
280
+ # If we are not able to reconstruct the initial raw text
281
+ # do not try to parse equations and return the original
282
+ return text, []
280
283
 
281
284
  return "".join(texts_and_equations), only_equations
282
285
 
@@ -344,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
344
347
  parent=None, label=DocItemLabel.TITLE, text=text
345
348
  )
346
349
  elif "Heading" in p_style_id:
347
- self.add_header(doc, p_level, text)
350
+ style_element = getattr(paragraph.style, "element", None)
351
+ if style_element:
352
+ is_numbered_style = (
353
+ "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
354
+ )
355
+ else:
356
+ is_numbered_style = False
357
+ self.add_header(doc, p_level, text, is_numbered_style)
348
358
 
349
359
  elif len(equations) > 0:
350
360
  if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
@@ -365,6 +375,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
365
375
  for eq in equations:
366
376
  if len(text_tmp) == 0:
367
377
  break
378
+
368
379
  pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
369
380
  text_tmp = text_tmp.split(eq, maxsplit=1)[1]
370
381
  if len(pre_eq_text) > 0:
@@ -412,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
412
423
  return
413
424
 
414
425
  def add_header(
415
- self, doc: DoclingDocument, curr_level: Optional[int], text: str
426
+ self,
427
+ doc: DoclingDocument,
428
+ curr_level: Optional[int],
429
+ text: str,
430
+ is_numbered_style: bool = False,
416
431
  ) -> None:
417
432
  level = self.get_level()
418
433
  if isinstance(curr_level, int):
@@ -430,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
430
445
  if key >= curr_level:
431
446
  self.parents[key] = None
432
447
 
433
- self.parents[curr_level] = doc.add_heading(
434
- parent=self.parents[curr_level - 1],
435
- text=text,
436
- level=curr_level,
437
- )
448
+ current_level = curr_level
449
+ parent_level = curr_level - 1
450
+ add_level = curr_level
438
451
  else:
439
- self.parents[self.level] = doc.add_heading(
440
- parent=self.parents[self.level - 1],
441
- text=text,
442
- level=1,
443
- )
452
+ current_level = self.level
453
+ parent_level = self.level - 1
454
+ add_level = 1
455
+
456
+ if is_numbered_style:
457
+ if add_level in self.numbered_headers:
458
+ self.numbered_headers[add_level] += 1
459
+ else:
460
+ self.numbered_headers[add_level] = 1
461
+ text = f"{self.numbered_headers[add_level]} {text}"
462
+
463
+ # Reset deeper levels
464
+ next_level = add_level + 1
465
+ while next_level in self.numbered_headers:
466
+ self.numbered_headers[next_level] = 0
467
+ next_level += 1
468
+
469
+ # Scan upper levels
470
+ previous_level = add_level - 1
471
+ while previous_level in self.numbered_headers:
472
+ # MSWord convention: no empty sublevels
473
+ # I.e., sub-sub section (2.0.1) without a sub-section (2.1)
474
+ # is processed as 2.1.1
475
+ if self.numbered_headers[previous_level] == 0:
476
+ self.numbered_headers[previous_level] += 1
477
+
478
+ text = f"{self.numbered_headers[previous_level]}.{text}"
479
+ previous_level -= 1
480
+
481
+ self.parents[current_level] = doc.add_heading(
482
+ parent=self.parents[parent_level],
483
+ text=text,
484
+ level=add_level,
485
+ )
444
486
  return
445
487
 
446
488
  def add_listitem(
docling/cli/main.py CHANGED
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
32
32
  AcceleratorOptions,
33
33
  EasyOcrOptions,
34
34
  OcrOptions,
35
+ PaginatedPipelineOptions,
35
36
  PdfBackend,
37
+ PdfPipeline,
36
38
  PdfPipelineOptions,
37
39
  TableFormerMode,
40
+ VlmModelType,
41
+ VlmPipelineOptions,
42
+ granite_vision_vlm_conversion_options,
43
+ smoldocling_vlm_conversion_options,
44
+ smoldocling_vlm_mlx_conversion_options,
38
45
  )
39
46
  from docling.datamodel.settings import settings
40
47
  from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
41
48
  from docling.models.factories import get_ocr_factory
49
+ from docling.pipeline.vlm_pipeline import VlmPipeline
42
50
 
43
51
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
44
52
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -200,6 +208,14 @@ def convert(
200
208
  help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
201
209
  ),
202
210
  ] = ImageRefMode.EMBEDDED,
211
+ pipeline: Annotated[
212
+ PdfPipeline,
213
+ typer.Option(..., help="Choose the pipeline to process PDF or image files."),
214
+ ] = PdfPipeline.STANDARD,
215
+ vlm_model: Annotated[
216
+ VlmModelType,
217
+ typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
218
+ ] = VlmModelType.SMOLDOCLING,
203
219
  ocr: Annotated[
204
220
  bool,
205
221
  typer.Option(
@@ -420,50 +436,77 @@ def convert(
420
436
  ocr_options.lang = ocr_lang_list
421
437
 
422
438
  accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
423
- pipeline_options = PdfPipelineOptions(
424
- allow_external_plugins=allow_external_plugins,
425
- enable_remote_services=enable_remote_services,
426
- accelerator_options=accelerator_options,
427
- do_ocr=ocr,
428
- ocr_options=ocr_options,
429
- do_table_structure=True,
430
- do_code_enrichment=enrich_code,
431
- do_formula_enrichment=enrich_formula,
432
- do_picture_description=enrich_picture_description,
433
- do_picture_classification=enrich_picture_classes,
434
- document_timeout=document_timeout,
435
- )
436
- pipeline_options.table_structure_options.do_cell_matching = (
437
- True # do_cell_matching
438
- )
439
- pipeline_options.table_structure_options.mode = table_mode
439
+ pipeline_options: PaginatedPipelineOptions
440
+
441
+ if pipeline == PdfPipeline.STANDARD:
442
+ pipeline_options = PdfPipelineOptions(
443
+ allow_external_plugins=allow_external_plugins,
444
+ enable_remote_services=enable_remote_services,
445
+ accelerator_options=accelerator_options,
446
+ do_ocr=ocr,
447
+ ocr_options=ocr_options,
448
+ do_table_structure=True,
449
+ do_code_enrichment=enrich_code,
450
+ do_formula_enrichment=enrich_formula,
451
+ do_picture_description=enrich_picture_description,
452
+ do_picture_classification=enrich_picture_classes,
453
+ document_timeout=document_timeout,
454
+ )
455
+ pipeline_options.table_structure_options.do_cell_matching = (
456
+ True # do_cell_matching
457
+ )
458
+ pipeline_options.table_structure_options.mode = table_mode
459
+
460
+ if image_export_mode != ImageRefMode.PLACEHOLDER:
461
+ pipeline_options.generate_page_images = True
462
+ pipeline_options.generate_picture_images = (
463
+ True # FIXME: to be deprecated in verson 3
464
+ )
465
+ pipeline_options.images_scale = 2
466
+
467
+ backend: Type[PdfDocumentBackend]
468
+ if pdf_backend == PdfBackend.DLPARSE_V1:
469
+ backend = DoclingParseDocumentBackend
470
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
471
+ backend = DoclingParseV2DocumentBackend
472
+ elif pdf_backend == PdfBackend.DLPARSE_V4:
473
+ backend = DoclingParseV4DocumentBackend # type: ignore
474
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
475
+ backend = PyPdfiumDocumentBackend # type: ignore
476
+ else:
477
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
478
+
479
+ pdf_format_option = PdfFormatOption(
480
+ pipeline_options=pipeline_options,
481
+ backend=backend, # pdf_backend
482
+ )
483
+ elif pipeline == PdfPipeline.VLM:
484
+ pipeline_options = VlmPipelineOptions()
485
+
486
+ if vlm_model == VlmModelType.GRANITE_VISION:
487
+ pipeline_options.vlm_options = granite_vision_vlm_conversion_options
488
+ elif vlm_model == VlmModelType.SMOLDOCLING:
489
+ pipeline_options.vlm_options = smoldocling_vlm_conversion_options
490
+ if sys.platform == "darwin":
491
+ try:
492
+ import mlx_vlm
493
+
494
+ pipeline_options.vlm_options = (
495
+ smoldocling_vlm_mlx_conversion_options
496
+ )
497
+ except ImportError:
498
+ _log.warning(
499
+ "To run SmolDocling faster, please install mlx-vlm:\n"
500
+ "pip install mlx-vlm"
501
+ )
440
502
 
441
- if image_export_mode != ImageRefMode.PLACEHOLDER:
442
- pipeline_options.generate_page_images = True
443
- pipeline_options.generate_picture_images = (
444
- True # FIXME: to be deprecated in verson 3
503
+ pdf_format_option = PdfFormatOption(
504
+ pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
445
505
  )
446
- pipeline_options.images_scale = 2
447
506
 
448
507
  if artifacts_path is not None:
449
508
  pipeline_options.artifacts_path = artifacts_path
450
509
 
451
- backend: Type[PdfDocumentBackend]
452
- if pdf_backend == PdfBackend.DLPARSE_V1:
453
- backend = DoclingParseDocumentBackend
454
- elif pdf_backend == PdfBackend.DLPARSE_V2:
455
- backend = DoclingParseV2DocumentBackend
456
- elif pdf_backend == PdfBackend.DLPARSE_V4:
457
- backend = DoclingParseV4DocumentBackend # type: ignore
458
- elif pdf_backend == PdfBackend.PYPDFIUM2:
459
- backend = PyPdfiumDocumentBackend # type: ignore
460
- else:
461
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
462
-
463
- pdf_format_option = PdfFormatOption(
464
- pipeline_options=pipeline_options,
465
- backend=backend, # pdf_backend
466
- )
467
510
  format_options: Dict[InputFormat, FormatOption] = {
468
511
  InputFormat.PDF: pdf_format_option,
469
512
  InputFormat.IMAGE: pdf_format_option,
@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
263
263
  MARKDOWN = "markdown"
264
264
 
265
265
 
266
+ class InferenceFramework(str, Enum):
267
+ MLX = "mlx"
268
+ TRANSFORMERS = "transformers"
269
+
270
+
266
271
  class HuggingFaceVlmOptions(BaseVlmOptions):
267
272
  kind: Literal["hf_model_options"] = "hf_model_options"
268
273
 
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
271
276
  llm_int8_threshold: float = 6.0
272
277
  quantized: bool = False
273
278
 
279
+ inference_framework: InferenceFramework
274
280
  response_format: ResponseFormat
275
281
 
276
282
  @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
278
284
  return self.repo_id.replace("/", "--")
279
285
 
280
286
 
287
+ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
288
+ repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
289
+ prompt="Convert this page to docling.",
290
+ response_format=ResponseFormat.DOCTAGS,
291
+ inference_framework=InferenceFramework.MLX,
292
+ )
293
+
294
+
281
295
  smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
282
296
  repo_id="ds4sd/SmolDocling-256M-preview",
283
297
  prompt="Convert this page to docling.",
284
298
  response_format=ResponseFormat.DOCTAGS,
299
+ inference_framework=InferenceFramework.TRANSFORMERS,
285
300
  )
286
301
 
287
302
  granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
289
304
  # prompt="OCR the full page to markdown.",
290
305
  prompt="OCR this image.",
291
306
  response_format=ResponseFormat.MARKDOWN,
307
+ inference_framework=InferenceFramework.TRANSFORMERS,
292
308
  )
293
309
 
294
310
 
311
+ class VlmModelType(str, Enum):
312
+ SMOLDOCLING = "smoldocling"
313
+ GRANITE_VISION = "granite_vision"
314
+
315
+
295
316
  # Define an enum for the backend options
296
317
  class PdfBackend(str, Enum):
297
318
  """Enum of valid PDF backends."""
@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):
327
348
 
328
349
 
329
350
  class PaginatedPipelineOptions(PipelineOptions):
351
+ artifacts_path: Optional[Union[Path, str]] = None
352
+
330
353
  images_scale: float = 1.0
331
354
  generate_page_images: bool = False
332
355
  generate_picture_images: bool = False
333
356
 
334
357
 
335
358
  class VlmPipelineOptions(PaginatedPipelineOptions):
336
- artifacts_path: Optional[Union[Path, str]] = None
337
359
 
338
360
  generate_page_images: bool = True
339
361
  force_backend_text: bool = (
@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
346
368
  class PdfPipelineOptions(PaginatedPipelineOptions):
347
369
  """Options for the PDF pipeline."""
348
370
 
349
- artifacts_path: Optional[Union[Path, str]] = None
350
371
  do_table_structure: bool = True # True: perform table structure extraction
351
372
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
352
373
  do_code_enrichment: bool = False # True: perform code OCR
@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
377
398
  )
378
399
 
379
400
  generate_parsed_pages: bool = False
401
+
402
+
403
+ class PdfPipeline(str, Enum):
404
+ STANDARD = "standard"
405
+ VLM = "vlm"
@@ -1,3 +1,4 @@
1
+ import hashlib
1
2
  import logging
2
3
  import math
3
4
  import sys
@@ -181,7 +182,14 @@ class DocumentConverter:
181
182
  )
182
183
  for format in self.allowed_formats
183
184
  }
184
- self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
185
+ self.initialized_pipelines: Dict[
186
+ Tuple[Type[BasePipeline], str], BasePipeline
187
+ ] = {}
188
+
189
+ def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
190
+ """Generate a hash of pipeline options to use as part of the cache key."""
191
+ options_str = str(pipeline_options.model_dump())
192
+ return hashlib.md5(options_str.encode("utf-8")).hexdigest()
185
193
 
186
194
  def initialize_pipeline(self, format: InputFormat):
187
195
  """Initialize the conversion pipeline for the selected format."""
@@ -279,31 +287,36 @@ class DocumentConverter:
279
287
  yield item
280
288
 
281
289
  def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
290
+ """Retrieve or initialize a pipeline, reusing instances based on class and options."""
282
291
  fopt = self.format_to_options.get(doc_format)
283
292
 
284
- if fopt is None:
293
+ if fopt is None or fopt.pipeline_options is None:
285
294
  return None
286
- else:
287
- pipeline_class = fopt.pipeline_cls
288
- pipeline_options = fopt.pipeline_options
289
295
 
290
- if pipeline_options is None:
291
- return None
292
- # TODO this will ignore if different options have been defined for the same pipeline class.
293
- if (
294
- pipeline_class not in self.initialized_pipelines
295
- or self.initialized_pipelines[pipeline_class].pipeline_options
296
- != pipeline_options
297
- ):
298
- self.initialized_pipelines[pipeline_class] = pipeline_class(
296
+ pipeline_class = fopt.pipeline_cls
297
+ pipeline_options = fopt.pipeline_options
298
+ options_hash = self._get_pipeline_options_hash(pipeline_options)
299
+
300
+ # Use a composite key to cache pipelines
301
+ cache_key = (pipeline_class, options_hash)
302
+
303
+ if cache_key not in self.initialized_pipelines:
304
+ _log.info(
305
+ f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
306
+ )
307
+ self.initialized_pipelines[cache_key] = pipeline_class(
299
308
  pipeline_options=pipeline_options
300
309
  )
301
- return self.initialized_pipelines[pipeline_class]
310
+ else:
311
+ _log.debug(
312
+ f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
313
+ )
314
+
315
+ return self.initialized_pipelines[cache_key]
302
316
 
303
317
  def _process_document(
304
318
  self, in_doc: InputDocument, raises_on_error: bool
305
319
  ) -> ConversionResult:
306
-
307
320
  valid = (
308
321
  self.allowed_formats is not None and in_doc.format in self.allowed_formats
309
322
  )
@@ -345,7 +358,6 @@ class DocumentConverter:
345
358
  else:
346
359
  if raises_on_error:
347
360
  raise ConversionError(f"Input document {in_doc.file} is not valid.")
348
-
349
361
  else:
350
362
  # invalid doc or not of desired format
351
363
  conv_res = ConversionResult(
@@ -0,0 +1,137 @@
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional
5
+
6
+ from docling.datamodel.base_models import Page, VlmPrediction
7
+ from docling.datamodel.document import ConversionResult
8
+ from docling.datamodel.pipeline_options import (
9
+ AcceleratorDevice,
10
+ AcceleratorOptions,
11
+ HuggingFaceVlmOptions,
12
+ )
13
+ from docling.datamodel.settings import settings
14
+ from docling.models.base_model import BasePageModel
15
+ from docling.utils.accelerator_utils import decide_device
16
+ from docling.utils.profiling import TimeRecorder
17
+
18
+ _log = logging.getLogger(__name__)
19
+
20
+
21
+ class HuggingFaceMlxModel(BasePageModel):
22
+
23
+ def __init__(
24
+ self,
25
+ enabled: bool,
26
+ artifacts_path: Optional[Path],
27
+ accelerator_options: AcceleratorOptions,
28
+ vlm_options: HuggingFaceVlmOptions,
29
+ ):
30
+ self.enabled = enabled
31
+
32
+ self.vlm_options = vlm_options
33
+
34
+ if self.enabled:
35
+
36
+ try:
37
+ from mlx_vlm import generate, load # type: ignore
38
+ from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
39
+ from mlx_vlm.utils import load_config, stream_generate # type: ignore
40
+ except ImportError:
41
+ raise ImportError(
42
+ "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
43
+ )
44
+
45
+ repo_cache_folder = vlm_options.repo_id.replace("/", "--")
46
+ self.apply_chat_template = apply_chat_template
47
+ self.stream_generate = stream_generate
48
+
49
+ # PARAMETERS:
50
+ if artifacts_path is None:
51
+ artifacts_path = self.download_models(self.vlm_options.repo_id)
52
+ elif (artifacts_path / repo_cache_folder).exists():
53
+ artifacts_path = artifacts_path / repo_cache_folder
54
+
55
+ self.param_question = vlm_options.prompt # "Perform Layout Analysis."
56
+
57
+ ## Load the model
58
+ self.vlm_model, self.processor = load(artifacts_path)
59
+ self.config = load_config(artifacts_path)
60
+
61
+ @staticmethod
62
+ def download_models(
63
+ repo_id: str,
64
+ local_dir: Optional[Path] = None,
65
+ force: bool = False,
66
+ progress: bool = False,
67
+ ) -> Path:
68
+ from huggingface_hub import snapshot_download
69
+ from huggingface_hub.utils import disable_progress_bars
70
+
71
+ if not progress:
72
+ disable_progress_bars()
73
+ download_path = snapshot_download(
74
+ repo_id=repo_id,
75
+ force_download=force,
76
+ local_dir=local_dir,
77
+ # revision="v0.0.1",
78
+ )
79
+
80
+ return Path(download_path)
81
+
82
+ def __call__(
83
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
84
+ ) -> Iterable[Page]:
85
+ for page in page_batch:
86
+ assert page._backend is not None
87
+ if not page._backend.is_valid():
88
+ yield page
89
+ else:
90
+ with TimeRecorder(conv_res, "vlm"):
91
+ assert page.size is not None
92
+
93
+ hi_res_image = page.get_image(scale=2.0) # 144dpi
94
+ # hi_res_image = page.get_image(scale=1.0) # 72dpi
95
+
96
+ if hi_res_image is not None:
97
+ im_width, im_height = hi_res_image.size
98
+
99
+ # populate page_tags with predicted doc tags
100
+ page_tags = ""
101
+
102
+ if hi_res_image:
103
+ if hi_res_image.mode != "RGB":
104
+ hi_res_image = hi_res_image.convert("RGB")
105
+
106
+ prompt = self.apply_chat_template(
107
+ self.processor, self.config, self.param_question, num_images=1
108
+ )
109
+
110
+ start_time = time.time()
111
+ # Call model to generate:
112
+ output = ""
113
+ for token in self.stream_generate(
114
+ self.vlm_model,
115
+ self.processor,
116
+ prompt,
117
+ [hi_res_image],
118
+ max_tokens=4096,
119
+ verbose=False,
120
+ ):
121
+ output += token.text
122
+ if "</doctag>" in token.text:
123
+ break
124
+
125
+ generation_time = time.time() - start_time
126
+ page_tags = output
127
+
128
+ # inference_time = time.time() - start_time
129
+ # tokens_per_second = num_tokens / generation_time
130
+ # print("")
131
+ # print(f"Page Inference Time: {inference_time:.2f} seconds")
132
+ # print(f"Total tokens on page: {num_tokens:.2f}")
133
+ # print(f"Tokens/sec: {tokens_per_second:.2f}")
134
+ # print("")
135
+ page.predictions.vlm_response = VlmPrediction(text=page_tags)
136
+
137
+ yield page
@@ -63,7 +63,13 @@ class PagePreprocessingModel(BasePageModel):
63
63
  def draw_text_boxes(image, cells, show: bool = False):
64
64
  draw = ImageDraw.Draw(image)
65
65
  for c in cells:
66
- x0, y0, x1, y1 = c.bbox.as_tuple()
66
+ x0, y0, x1, y1 = (
67
+ c.to_bounding_box().l,
68
+ c.to_bounding_box().t,
69
+ c.to_bounding_box().r,
70
+ c.to_bounding_box().b,
71
+ )
72
+
67
73
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
68
74
  if show:
69
75
  image.show()