docling-2.36.1-py3-none-any.whl → docling-2.38.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. docling/backend/asciidoc_backend.py +39 -18
  2. docling/backend/docling_parse_backend.py +61 -59
  3. docling/backend/docling_parse_v2_backend.py +72 -62
  4. docling/backend/docling_parse_v4_backend.py +21 -19
  5. docling/backend/md_backend.py +101 -81
  6. docling/backend/mspowerpoint_backend.py +72 -113
  7. docling/backend/msword_backend.py +99 -80
  8. docling/backend/noop_backend.py +51 -0
  9. docling/backend/pypdfium2_backend.py +127 -53
  10. docling/cli/main.py +82 -14
  11. docling/datamodel/asr_model_specs.py +92 -0
  12. docling/datamodel/base_models.py +21 -4
  13. docling/datamodel/document.py +3 -1
  14. docling/datamodel/pipeline_options.py +15 -2
  15. docling/datamodel/pipeline_options_asr_model.py +57 -0
  16. docling/datamodel/pipeline_options_vlm_model.py +4 -4
  17. docling/document_converter.py +8 -0
  18. docling/models/api_vlm_model.py +3 -1
  19. docling/models/base_model.py +1 -1
  20. docling/models/base_ocr_model.py +33 -11
  21. docling/models/easyocr_model.py +1 -1
  22. docling/models/layout_model.py +2 -3
  23. docling/models/ocr_mac_model.py +1 -1
  24. docling/models/page_preprocessing_model.py +3 -6
  25. docling/models/rapid_ocr_model.py +1 -1
  26. docling/models/readingorder_model.py +3 -3
  27. docling/models/tesseract_ocr_cli_model.py +4 -3
  28. docling/models/tesseract_ocr_model.py +1 -1
  29. docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
  30. docling/models/vlm_models_inline/mlx_model.py +3 -1
  31. docling/pipeline/asr_pipeline.py +253 -0
  32. docling/pipeline/base_pipeline.py +11 -0
  33. docling/pipeline/standard_pdf_pipeline.py +0 -1
  34. docling/utils/layout_postprocessor.py +11 -6
  35. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
  36. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
  37. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
  38. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
  39. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
  40. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0
docling/cli/main.py CHANGED
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
29
29
  from docling.backend.pdf_backend import PdfDocumentBackend
30
30
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
31
31
  from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
32
+ from docling.datamodel.asr_model_specs import (
33
+ WHISPER_BASE,
34
+ WHISPER_LARGE,
35
+ WHISPER_MEDIUM,
36
+ WHISPER_SMALL,
37
+ WHISPER_TINY,
38
+ WHISPER_TURBO,
39
+ AsrModelType,
40
+ )
32
41
  from docling.datamodel.base_models import (
33
42
  ConversionStatus,
34
43
  FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
37
46
  )
38
47
  from docling.datamodel.document import ConversionResult
39
48
  from docling.datamodel.pipeline_options import (
49
+ AsrPipelineOptions,
40
50
  EasyOcrOptions,
41
51
  OcrOptions,
42
52
  PaginatedPipelineOptions,
43
53
  PdfBackend,
44
- PdfPipeline,
45
54
  PdfPipelineOptions,
55
+ PipelineOptions,
56
+ ProcessingPipeline,
46
57
  TableFormerMode,
47
58
  VlmPipelineOptions,
48
59
  )
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
54
65
  SMOLDOCLING_TRANSFORMERS,
55
66
  VlmModelType,
56
67
  )
57
- from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
68
+ from docling.document_converter import (
69
+ AudioFormatOption,
70
+ DocumentConverter,
71
+ FormatOption,
72
+ PdfFormatOption,
73
+ )
58
74
  from docling.models.factories import get_ocr_factory
75
+ from docling.pipeline.asr_pipeline import AsrPipeline
59
76
  from docling.pipeline.vlm_pipeline import VlmPipeline
60
77
 
61
78
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
296
313
  ),
297
314
  ] = ImageRefMode.EMBEDDED,
298
315
  pipeline: Annotated[
299
- PdfPipeline,
316
+ ProcessingPipeline,
300
317
  typer.Option(..., help="Choose the pipeline to process PDF or image files."),
301
- ] = PdfPipeline.STANDARD,
318
+ ] = ProcessingPipeline.STANDARD,
302
319
  vlm_model: Annotated[
303
320
  VlmModelType,
304
321
  typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
305
322
  ] = VlmModelType.SMOLDOCLING,
323
+ asr_model: Annotated[
324
+ AsrModelType,
325
+ typer.Option(..., help="Choose the ASR model to use with audio/video files."),
326
+ ] = AsrModelType.WHISPER_TINY,
306
327
  ocr: Annotated[
307
328
  bool,
308
329
  typer.Option(
@@ -450,12 +471,14 @@ def convert( # noqa: C901
450
471
  ),
451
472
  ] = None,
452
473
  ):
474
+ log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
475
+
453
476
  if verbose == 0:
454
- logging.basicConfig(level=logging.WARNING)
477
+ logging.basicConfig(level=logging.WARNING, format=log_format)
455
478
  elif verbose == 1:
456
- logging.basicConfig(level=logging.INFO)
479
+ logging.basicConfig(level=logging.INFO, format=log_format)
457
480
  else:
458
- logging.basicConfig(level=logging.DEBUG)
481
+ logging.basicConfig(level=logging.DEBUG, format=log_format)
459
482
 
460
483
  settings.debug.visualize_cells = debug_visualize_cells
461
484
  settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
530
553
  ocr_options.lang = ocr_lang_list
531
554
 
532
555
  accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
533
- pipeline_options: PaginatedPipelineOptions
556
+ # pipeline_options: PaginatedPipelineOptions
557
+ pipeline_options: PipelineOptions
558
+
559
+ format_options: Dict[InputFormat, FormatOption] = {}
534
560
 
535
- if pipeline == PdfPipeline.STANDARD:
561
+ if pipeline == ProcessingPipeline.STANDARD:
536
562
  pipeline_options = PdfPipelineOptions(
537
563
  allow_external_plugins=allow_external_plugins,
538
564
  enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
574
600
  pipeline_options=pipeline_options,
575
601
  backend=backend, # pdf_backend
576
602
  )
577
- elif pipeline == PdfPipeline.VLM:
603
+
604
+ format_options = {
605
+ InputFormat.PDF: pdf_format_option,
606
+ InputFormat.IMAGE: pdf_format_option,
607
+ }
608
+
609
+ elif pipeline == ProcessingPipeline.VLM:
578
610
  pipeline_options = VlmPipelineOptions(
579
611
  enable_remote_services=enable_remote_services,
580
612
  )
@@ -600,13 +632,48 @@ def convert( # noqa: C901
600
632
  pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
601
633
  )
602
634
 
635
+ format_options = {
636
+ InputFormat.PDF: pdf_format_option,
637
+ InputFormat.IMAGE: pdf_format_option,
638
+ }
639
+
640
+ elif pipeline == ProcessingPipeline.ASR:
641
+ pipeline_options = AsrPipelineOptions(
642
+ # enable_remote_services=enable_remote_services,
643
+ # artifacts_path = artifacts_path
644
+ )
645
+
646
+ if asr_model == AsrModelType.WHISPER_TINY:
647
+ pipeline_options.asr_options = WHISPER_TINY
648
+ elif asr_model == AsrModelType.WHISPER_SMALL:
649
+ pipeline_options.asr_options = WHISPER_SMALL
650
+ elif asr_model == AsrModelType.WHISPER_MEDIUM:
651
+ pipeline_options.asr_options = WHISPER_MEDIUM
652
+ elif asr_model == AsrModelType.WHISPER_BASE:
653
+ pipeline_options.asr_options = WHISPER_BASE
654
+ elif asr_model == AsrModelType.WHISPER_LARGE:
655
+ pipeline_options.asr_options = WHISPER_LARGE
656
+ elif asr_model == AsrModelType.WHISPER_TURBO:
657
+ pipeline_options.asr_options = WHISPER_TURBO
658
+ else:
659
+ _log.error(f"{asr_model} is not known")
660
+ raise ValueError(f"{asr_model} is not known")
661
+
662
+ _log.info(f"pipeline_options: {pipeline_options}")
663
+
664
+ audio_format_option = AudioFormatOption(
665
+ pipeline_cls=AsrPipeline,
666
+ pipeline_options=pipeline_options,
667
+ )
668
+
669
+ format_options = {
670
+ InputFormat.AUDIO: audio_format_option,
671
+ }
672
+
603
673
  if artifacts_path is not None:
604
674
  pipeline_options.artifacts_path = artifacts_path
675
+ # audio_pipeline_options.artifacts_path = artifacts_path
605
676
 
606
- format_options: Dict[InputFormat, FormatOption] = {
607
- InputFormat.PDF: pdf_format_option,
608
- InputFormat.IMAGE: pdf_format_option,
609
- }
610
677
  doc_converter = DocumentConverter(
611
678
  allowed_formats=from_formats,
612
679
  format_options=format_options,
@@ -614,6 +681,7 @@ def convert( # noqa: C901
614
681
 
615
682
  start_time = time.time()
616
683
 
684
+ _log.info(f"paths: {input_doc_paths}")
617
685
  conv_results = doc_converter.convert_all(
618
686
  input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
619
687
  )
docling/datamodel/asr_model_specs.py ADDED
@@ -0,0 +1,92 @@
1
+ import logging
2
+ from enum import Enum
3
+
4
+ from pydantic import (
5
+ AnyUrl,
6
+ )
7
+
8
+ from docling.datamodel.accelerator_options import AcceleratorDevice
9
+ from docling.datamodel.pipeline_options_asr_model import (
10
+ # AsrResponseFormat,
11
+ # ApiAsrOptions,
12
+ InferenceAsrFramework,
13
+ InlineAsrNativeWhisperOptions,
14
+ TransformersModelType,
15
+ )
16
+
17
+ _log = logging.getLogger(__name__)
18
+
19
+ WHISPER_TINY = InlineAsrNativeWhisperOptions(
20
+ repo_id="tiny",
21
+ inference_framework=InferenceAsrFramework.WHISPER,
22
+ verbose=True,
23
+ timestamps=True,
24
+ word_timestamps=True,
25
+ temperatue=0.0,
26
+ max_new_tokens=256,
27
+ max_time_chunk=30.0,
28
+ )
29
+
30
+ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
31
+ repo_id="small",
32
+ inference_framework=InferenceAsrFramework.WHISPER,
33
+ verbose=True,
34
+ timestamps=True,
35
+ word_timestamps=True,
36
+ temperatue=0.0,
37
+ max_new_tokens=256,
38
+ max_time_chunk=30.0,
39
+ )
40
+
41
+ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
42
+ repo_id="medium",
43
+ inference_framework=InferenceAsrFramework.WHISPER,
44
+ verbose=True,
45
+ timestamps=True,
46
+ word_timestamps=True,
47
+ temperatue=0.0,
48
+ max_new_tokens=256,
49
+ max_time_chunk=30.0,
50
+ )
51
+
52
+ WHISPER_BASE = InlineAsrNativeWhisperOptions(
53
+ repo_id="base",
54
+ inference_framework=InferenceAsrFramework.WHISPER,
55
+ verbose=True,
56
+ timestamps=True,
57
+ word_timestamps=True,
58
+ temperatue=0.0,
59
+ max_new_tokens=256,
60
+ max_time_chunk=30.0,
61
+ )
62
+
63
+ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
64
+ repo_id="large",
65
+ inference_framework=InferenceAsrFramework.WHISPER,
66
+ verbose=True,
67
+ timestamps=True,
68
+ word_timestamps=True,
69
+ temperatue=0.0,
70
+ max_new_tokens=256,
71
+ max_time_chunk=30.0,
72
+ )
73
+
74
+ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
75
+ repo_id="turbo",
76
+ inference_framework=InferenceAsrFramework.WHISPER,
77
+ verbose=True,
78
+ timestamps=True,
79
+ word_timestamps=True,
80
+ temperatue=0.0,
81
+ max_new_tokens=256,
82
+ max_time_chunk=30.0,
83
+ )
84
+
85
+
86
+ class AsrModelType(str, Enum):
87
+ WHISPER_TINY = "whisper_tiny"
88
+ WHISPER_SMALL = "whisper_small"
89
+ WHISPER_MEDIUM = "whisper_medium"
90
+ WHISPER_BASE = "whisper_base"
91
+ WHISPER_LARGE = "whisper_large"
92
+ WHISPER_TURBO = "whisper_turbo"
docling/datamodel/base_models.py CHANGED
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
49
49
  XML_USPTO = "xml_uspto"
50
50
  XML_JATS = "xml_jats"
51
51
  JSON_DOCLING = "json_docling"
52
+ AUDIO = "audio"
52
53
 
53
54
 
54
55
  class OutputFormat(str, Enum):
@@ -67,12 +68,13 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
67
68
  InputFormat.MD: ["md"],
68
69
  InputFormat.HTML: ["html", "htm", "xhtml"],
69
70
  InputFormat.XML_JATS: ["xml", "nxml"],
70
- InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
71
+ InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
71
72
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
72
73
  InputFormat.CSV: ["csv"],
73
- InputFormat.XLSX: ["xlsx"],
74
+ InputFormat.XLSX: ["xlsx", "xlsm"],
74
75
  InputFormat.XML_USPTO: ["xml", "txt"],
75
76
  InputFormat.JSON_DOCLING: ["json"],
77
+ InputFormat.AUDIO: ["wav", "mp3"],
76
78
  }
77
79
 
78
80
  FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
104
106
  ],
105
107
  InputFormat.XML_USPTO: ["application/xml", "text/plain"],
106
108
  InputFormat.JSON_DOCLING: ["application/json"],
109
+ InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
107
110
  }
108
111
 
109
112
  MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -232,7 +235,6 @@ class Page(BaseModel):
232
235
  page_no: int
233
236
  # page_hash: Optional[str] = None
234
237
  size: Optional[Size] = None
235
- cells: List[TextCell] = []
236
238
  parsed_page: Optional[SegmentedPdfPage] = None
237
239
  predictions: PagePredictions = PagePredictions()
238
240
  assembled: Optional[AssembledUnit] = None
@@ -245,12 +247,27 @@ class Page(BaseModel):
245
247
  float, Image
246
248
  ] = {} # Cache of images in different scales. By default it is cleared during assembling.
247
249
 
250
+ @property
251
+ def cells(self) -> List[TextCell]:
252
+ """Return text cells as a read-only view of parsed_page.textline_cells."""
253
+ if self.parsed_page is not None:
254
+ return self.parsed_page.textline_cells
255
+ else:
256
+ return []
257
+
248
258
  def get_image(
249
- self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
259
+ self,
260
+ scale: float = 1.0,
261
+ max_size: Optional[int] = None,
262
+ cropbox: Optional[BoundingBox] = None,
250
263
  ) -> Optional[Image]:
251
264
  if self._backend is None:
252
265
  return self._image_cache.get(scale, None)
253
266
 
267
+ if max_size:
268
+ assert self.size is not None
269
+ scale = min(scale, max_size / max(self.size.as_tuple()))
270
+
254
271
  if scale not in self._image_cache:
255
272
  if cropbox is None:
256
273
  self._image_cache[scale] = self._backend.get_page_image(scale=scale)
docling/datamodel/document.py CHANGED
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
249
249
  backend: Type[AbstractDocumentBackend]
250
250
  if format not in format_options.keys():
251
251
  _log.error(
252
- f"Input document {obj.name} does not match any allowed format."
252
+ f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
253
253
  )
254
254
  backend = _DummyBackend
255
255
  else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
318
318
  mime = mime or _DocumentConversionInput._detect_csv(content)
319
319
  mime = mime or "text/plain"
320
320
  formats = MimeTypeToFormat.get(mime, [])
321
+ _log.info(f"detected formats: {formats}")
322
+
321
323
  if formats:
322
324
  if len(formats) == 1 and mime not in ("text/plain"):
323
325
  return formats[0]
docling/datamodel/pipeline_options.py CHANGED
@@ -11,8 +11,13 @@ from pydantic import (
11
11
  )
12
12
  from typing_extensions import deprecated
13
13
 
14
+ from docling.datamodel import asr_model_specs
15
+
14
16
  # Import the following for backwards compatibility
15
17
  from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
18
+ from docling.datamodel.pipeline_options_asr_model import (
19
+ InlineAsrOptions,
20
+ )
16
21
  from docling.datamodel.pipeline_options_vlm_model import (
17
22
  ApiVlmOptions,
18
23
  InferenceFramework,
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
260
265
  )
261
266
 
262
267
 
268
+ class AsrPipelineOptions(PipelineOptions):
269
+ asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
270
+ artifacts_path: Optional[Union[Path, str]] = None
271
+
272
+
263
273
  class PdfPipelineOptions(PaginatedPipelineOptions):
264
274
  """Options for the PDF pipeline."""
265
275
 
@@ -292,9 +302,12 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
292
302
  ),
293
303
  )
294
304
 
295
- generate_parsed_pages: bool = False
305
+ generate_parsed_pages: Literal[True] = (
306
+ True # Always True since parsed_page is now mandatory
307
+ )
296
308
 
297
309
 
298
- class PdfPipeline(str, Enum):
310
+ class ProcessingPipeline(str, Enum):
299
311
  STANDARD = "standard"
300
312
  VLM = "vlm"
313
+ ASR = "asr"
docling/datamodel/pipeline_options_asr_model.py ADDED
@@ -0,0 +1,57 @@
1
+ from enum import Enum
2
+ from typing import Any, Dict, List, Literal, Optional, Union
3
+
4
+ from pydantic import AnyUrl, BaseModel
5
+ from typing_extensions import deprecated
6
+
7
+ from docling.datamodel.accelerator_options import AcceleratorDevice
8
+ from docling.datamodel.pipeline_options_vlm_model import (
9
+ # InferenceFramework,
10
+ TransformersModelType,
11
+ )
12
+
13
+
14
+ class BaseAsrOptions(BaseModel):
15
+ kind: str
16
+ # prompt: str
17
+
18
+
19
+ class InferenceAsrFramework(str, Enum):
20
+ # MLX = "mlx" # disabled for now
21
+ # TRANSFORMERS = "transformers" # disabled for now
22
+ WHISPER = "whisper"
23
+
24
+
25
+ class InlineAsrOptions(BaseAsrOptions):
26
+ kind: Literal["inline_model_options"] = "inline_model_options"
27
+
28
+ repo_id: str
29
+
30
+ verbose: bool = False
31
+ timestamps: bool = True
32
+
33
+ temperature: float = 0.0
34
+ max_new_tokens: int = 256
35
+ max_time_chunk: float = 30.0
36
+
37
+ torch_dtype: Optional[str] = None
38
+ supported_devices: List[AcceleratorDevice] = [
39
+ AcceleratorDevice.CPU,
40
+ AcceleratorDevice.CUDA,
41
+ AcceleratorDevice.MPS,
42
+ ]
43
+
44
+ @property
45
+ def repo_cache_folder(self) -> str:
46
+ return self.repo_id.replace("/", "--")
47
+
48
+
49
+ class InlineAsrNativeWhisperOptions(InlineAsrOptions):
50
+ inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
51
+
52
+ language: str = "en"
53
+ supported_devices: List[AcceleratorDevice] = [
54
+ AcceleratorDevice.CPU,
55
+ AcceleratorDevice.CUDA,
56
+ ]
57
+ word_timestamps: bool = True
docling/datamodel/pipeline_options_vlm_model.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from enum import Enum
2
- from typing import Any, Dict, List, Literal
2
+ from typing import Any, Dict, List, Literal, Optional, Union
3
3
 
4
4
  from pydantic import AnyUrl, BaseModel
5
5
  from typing_extensions import deprecated
@@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
10
10
  class BaseVlmOptions(BaseModel):
11
11
  kind: str
12
12
  prompt: str
13
+ scale: float = 2.0
14
+ max_size: Optional[int] = None
13
15
 
14
16
 
15
17
  class ResponseFormat(str, Enum):
@@ -42,14 +44,13 @@ class InlineVlmOptions(BaseVlmOptions):
42
44
  transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
43
45
  response_format: ResponseFormat
44
46
 
47
+ torch_dtype: Optional[str] = None
45
48
  supported_devices: List[AcceleratorDevice] = [
46
49
  AcceleratorDevice.CPU,
47
50
  AcceleratorDevice.CUDA,
48
51
  AcceleratorDevice.MPS,
49
52
  ]
50
53
 
51
- scale: float = 2.0
52
-
53
54
  temperature: float = 0.0
54
55
  stop_strings: List[str] = []
55
56
  extra_generation_config: Dict[str, Any] = {}
@@ -75,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
75
76
  ) # Default to ollama
76
77
  headers: Dict[str, str] = {}
77
78
  params: Dict[str, Any] = {}
78
- scale: float = 2.0
79
79
  timeout: float = 60
80
80
  concurrency: int = 1
81
81
  response_format: ResponseFormat
docling/document_converter.py CHANGED
@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
19
19
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
20
20
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
21
21
  from docling.backend.msword_backend import MsWordDocumentBackend
22
+ from docling.backend.noop_backend import NoOpBackend
22
23
  from docling.backend.xml.jats_backend import JatsDocumentBackend
23
24
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
24
25
  from docling.datamodel.base_models import (
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
41
42
  settings,
42
43
  )
43
44
  from docling.exceptions import ConversionError
45
+ from docling.pipeline.asr_pipeline import AsrPipeline
44
46
  from docling.pipeline.base_pipeline import BasePipeline
45
47
  from docling.pipeline.simple_pipeline import SimplePipeline
46
48
  from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
118
120
  backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
119
121
 
120
122
 
123
+ class AudioFormatOption(FormatOption):
124
+ pipeline_cls: Type = AsrPipeline
125
+ backend: Type[AbstractDocumentBackend] = NoOpBackend
126
+
127
+
121
128
  def _get_default_option(format: InputFormat) -> FormatOption:
122
129
  format_to_default_options = {
123
130
  InputFormat.CSV: FormatOption(
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
156
163
  InputFormat.JSON_DOCLING: FormatOption(
157
164
  pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
158
165
  ),
166
+ InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
159
167
  }
160
168
  if (options := format_to_default_options.get(format)) is not None:
161
169
  return options
docling/models/api_vlm_model.py CHANGED
@@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
48
48
  with TimeRecorder(conv_res, "vlm"):
49
49
  assert page.size is not None
50
50
 
51
- hi_res_image = page.get_image(scale=self.vlm_options.scale)
51
+ hi_res_image = page.get_image(
52
+ scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
53
+ )
52
54
  assert hi_res_image is not None
53
55
  if hi_res_image:
54
56
  if hi_res_image.mode != "RGB":
docling/models/base_model.py CHANGED
@@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
86
86
  coord_origin=bbox.coord_origin,
87
87
  )
88
88
 
89
- page_ix = element_prov.page_no - 1
89
+ page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
90
90
  cropped_image = conv_res.pages[page_ix].get_image(
91
91
  scale=self.images_scale, cropbox=expanded_bbox
92
92
  )
docling/models/base_ocr_model.py CHANGED
@@ -7,6 +7,7 @@ from typing import List, Optional, Type
7
7
 
8
8
  import numpy as np
9
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
10
+ from docling_core.types.doc.page import TextCell
10
11
  from PIL import Image, ImageDraw
11
12
  from rtree import index
12
13
  from scipy.ndimage import binary_dilation, find_objects, label
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
107
108
  return []
108
109
 
109
110
  # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
110
- def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
111
+ def _filter_ocr_cells(
112
+ self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
113
+ ) -> List[TextCell]:
111
114
  # Create R-tree index for programmatic cells
112
115
  p = index.Property()
113
116
  p.dimension = 2
@@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
130
133
  ]
131
134
  return filtered_ocr_cells
132
135
 
133
- def post_process_cells(self, ocr_cells, programmatic_cells):
136
+ def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
134
137
  r"""
135
- Post-process the ocr and programmatic cells and return the final list of of cells
138
+ Post-process the OCR cells and update the page object.
139
+ Updates parsed_page.textline_cells directly since page.cells is now read-only.
136
140
  """
141
+ # Get existing cells from the read-only property
142
+ existing_cells = page.cells
143
+
144
+ # Combine existing and OCR cells with overlap filtering
145
+ final_cells = self._combine_cells(existing_cells, ocr_cells)
146
+
147
+ assert page.parsed_page is not None
148
+
149
+ # Update parsed_page.textline_cells directly
150
+ page.parsed_page.textline_cells = final_cells
151
+ page.parsed_page.has_lines = len(final_cells) > 0
152
+
153
+ def _combine_cells(
154
+ self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
155
+ ) -> List[TextCell]:
156
+ """Combine existing and OCR cells with filtering and re-indexing."""
137
157
  if self.options.force_full_page_ocr:
138
- # If a full page OCR is forced, use only the OCR cells
139
- cells = ocr_cells
140
- return cells
141
-
142
- ## Remove OCR cells which overlap with programmatic cells.
143
- filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
144
- programmatic_cells.extend(filtered_ocr_cells)
145
- return programmatic_cells
158
+ combined = ocr_cells
159
+ else:
160
+ filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
161
+ combined = list(existing_cells) + filtered_ocr_cells
162
+
163
+ # Re-index in-place
164
+ for i, cell in enumerate(combined):
165
+ cell.index = i
166
+
167
+ return combined
146
168
 
147
169
  def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
148
170
  image = copy.deepcopy(page.image)
docling/models/easyocr_model.py CHANGED
@@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
177
177
  all_ocr_cells.extend(cells)
178
178
 
179
179
  # Post-process the cells
180
- page.cells = self.post_process_cells(all_ocr_cells, page.cells)
180
+ self.post_process_cells(all_ocr_cells, page)
181
181
 
182
182
  # DEBUG code:
183
183
  if settings.debug.visualize_ocr:
docling/models/layout_model.py CHANGED
@@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
176
176
  # Apply postprocessing
177
177
 
178
178
  processed_clusters, processed_cells = LayoutPostprocessor(
179
- page.cells, clusters, page.size
179
+ page, clusters
180
180
  ).postprocess()
181
- # processed_clusters, processed_cells = clusters, page.cells
181
+ # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
182
182
 
183
183
  with warnings.catch_warnings():
184
184
  warnings.filterwarnings(
@@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
198
198
  )
199
199
  )
200
200
 
201
- page.cells = processed_cells
202
201
  page.predictions.layout = LayoutPrediction(
203
202
  clusters=processed_clusters
204
203
  )
docling/models/ocr_mac_model.py CHANGED
@@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
132
132
  all_ocr_cells.extend(cells)
133
133
 
134
134
  # Post-process the cells
135
- page.cells = self.post_process_cells(all_ocr_cells, page.cells)
135
+ self.post_process_cells(all_ocr_cells, page)
136
136
 
137
137
  # DEBUG code:
138
138
  if settings.debug.visualize_ocr: