docling 2.37.0__py3-none-any.whl → 2.38.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/md_backend.py +185 -80
- docling/backend/msword_backend.py +76 -63
- docling/backend/noop_backend.py +51 -0
- docling/cli/main.py +82 -14
- docling/datamodel/asr_model_specs.py +92 -0
- docling/datamodel/base_models.py +12 -2
- docling/datamodel/document.py +3 -1
- docling/datamodel/pipeline_options.py +13 -2
- docling/datamodel/pipeline_options_asr_model.py +57 -0
- docling/datamodel/pipeline_options_vlm_model.py +2 -3
- docling/document_converter.py +8 -0
- docling/models/api_vlm_model.py +3 -1
- docling/models/base_model.py +1 -1
- docling/models/readingorder_model.py +1 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +3 -1
- docling/models/vlm_models_inline/mlx_model.py +3 -1
- docling/pipeline/asr_pipeline.py +253 -0
- docling/pipeline/base_pipeline.py +11 -0
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/METADATA +7 -4
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/RECORD +24 -20
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/WHEEL +0 -0
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/entry_points.txt +0 -0
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/licenses/LICENSE +0 -0
- {docling-2.37.0.dist-info → docling-2.38.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Set, Union
|
5
|
+
|
6
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
7
|
+
from docling.datamodel.base_models import InputFormat
|
8
|
+
from docling.datamodel.document import InputDocument
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class NoOpBackend(AbstractDocumentBackend):
    """
    A no-op backend that only validates input existence.

    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
    The backend never parses the input; it only records whether the input looks
    present: a non-empty ``BytesIO`` stream or an existing filesystem path.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Initialise the backend and record whether the input is present.

        Args:
            in_doc: The input document this backend is attached to.
            path_or_stream: Either an in-memory stream or a filesystem path.
        """
        super().__init__(in_doc, path_or_stream)

        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")

        # Start from "invalid" so `is_valid()` is well defined on every code path,
        # including unsupported input types.
        self.valid = False

        # Validate input
        try:
            if isinstance(self.path_or_stream, BytesIO):
                # `getvalue()` copies the whole buffer; measure through a
                # zero-copy view instead. The view is released right away so the
                # stream stays resizable afterwards.
                with self.path_or_stream.getbuffer() as view:
                    stream_length = view.nbytes
                self.valid = stream_length > 0
                _log.debug(f"BytesIO stream length: {stream_length}")
            elif isinstance(self.path_or_stream, Path):
                # Check if file exists
                self.valid = self.path_or_stream.exists()
                _log.debug(f"File exists: {self.valid}")
        except Exception as e:
            # Deliberately best-effort: a validation failure marks the input as
            # invalid instead of aborting backend construction.
            _log.error(f"NoOpBackend validation failed: {e}")
            self.valid = False

    def is_valid(self) -> bool:
        """Return whether the input passed the presence check done in ``__init__``."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``False``; this backend does not expose pages."""
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return every member of ``InputFormat``; the backend is format agnostic."""
        return set(InputFormat)
|
docling/cli/main.py
CHANGED
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
|
29
29
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
30
30
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
31
31
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
32
|
+
from docling.datamodel.asr_model_specs import (
|
33
|
+
WHISPER_BASE,
|
34
|
+
WHISPER_LARGE,
|
35
|
+
WHISPER_MEDIUM,
|
36
|
+
WHISPER_SMALL,
|
37
|
+
WHISPER_TINY,
|
38
|
+
WHISPER_TURBO,
|
39
|
+
AsrModelType,
|
40
|
+
)
|
32
41
|
from docling.datamodel.base_models import (
|
33
42
|
ConversionStatus,
|
34
43
|
FormatToExtensions,
|
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
|
|
37
46
|
)
|
38
47
|
from docling.datamodel.document import ConversionResult
|
39
48
|
from docling.datamodel.pipeline_options import (
|
49
|
+
AsrPipelineOptions,
|
40
50
|
EasyOcrOptions,
|
41
51
|
OcrOptions,
|
42
52
|
PaginatedPipelineOptions,
|
43
53
|
PdfBackend,
|
44
|
-
PdfPipeline,
|
45
54
|
PdfPipelineOptions,
|
55
|
+
PipelineOptions,
|
56
|
+
ProcessingPipeline,
|
46
57
|
TableFormerMode,
|
47
58
|
VlmPipelineOptions,
|
48
59
|
)
|
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
|
|
54
65
|
SMOLDOCLING_TRANSFORMERS,
|
55
66
|
VlmModelType,
|
56
67
|
)
|
57
|
-
from docling.document_converter import
|
68
|
+
from docling.document_converter import (
|
69
|
+
AudioFormatOption,
|
70
|
+
DocumentConverter,
|
71
|
+
FormatOption,
|
72
|
+
PdfFormatOption,
|
73
|
+
)
|
58
74
|
from docling.models.factories import get_ocr_factory
|
75
|
+
from docling.pipeline.asr_pipeline import AsrPipeline
|
59
76
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
60
77
|
|
61
78
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
@@ -296,13 +313,17 @@ def convert( # noqa: C901
|
|
296
313
|
),
|
297
314
|
] = ImageRefMode.EMBEDDED,
|
298
315
|
pipeline: Annotated[
|
299
|
-
|
316
|
+
ProcessingPipeline,
|
300
317
|
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
|
301
|
-
] =
|
318
|
+
] = ProcessingPipeline.STANDARD,
|
302
319
|
vlm_model: Annotated[
|
303
320
|
VlmModelType,
|
304
321
|
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
305
322
|
] = VlmModelType.SMOLDOCLING,
|
323
|
+
asr_model: Annotated[
|
324
|
+
AsrModelType,
|
325
|
+
typer.Option(..., help="Choose the ASR model to use with audio/video files."),
|
326
|
+
] = AsrModelType.WHISPER_TINY,
|
306
327
|
ocr: Annotated[
|
307
328
|
bool,
|
308
329
|
typer.Option(
|
@@ -450,12 +471,14 @@ def convert( # noqa: C901
|
|
450
471
|
),
|
451
472
|
] = None,
|
452
473
|
):
|
474
|
+
log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
|
475
|
+
|
453
476
|
if verbose == 0:
|
454
|
-
logging.basicConfig(level=logging.WARNING)
|
477
|
+
logging.basicConfig(level=logging.WARNING, format=log_format)
|
455
478
|
elif verbose == 1:
|
456
|
-
logging.basicConfig(level=logging.INFO)
|
479
|
+
logging.basicConfig(level=logging.INFO, format=log_format)
|
457
480
|
else:
|
458
|
-
logging.basicConfig(level=logging.DEBUG)
|
481
|
+
logging.basicConfig(level=logging.DEBUG, format=log_format)
|
459
482
|
|
460
483
|
settings.debug.visualize_cells = debug_visualize_cells
|
461
484
|
settings.debug.visualize_layout = debug_visualize_layout
|
@@ -530,9 +553,12 @@ def convert( # noqa: C901
|
|
530
553
|
ocr_options.lang = ocr_lang_list
|
531
554
|
|
532
555
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
533
|
-
pipeline_options: PaginatedPipelineOptions
|
556
|
+
# pipeline_options: PaginatedPipelineOptions
|
557
|
+
pipeline_options: PipelineOptions
|
558
|
+
|
559
|
+
format_options: Dict[InputFormat, FormatOption] = {}
|
534
560
|
|
535
|
-
if pipeline ==
|
561
|
+
if pipeline == ProcessingPipeline.STANDARD:
|
536
562
|
pipeline_options = PdfPipelineOptions(
|
537
563
|
allow_external_plugins=allow_external_plugins,
|
538
564
|
enable_remote_services=enable_remote_services,
|
@@ -574,7 +600,13 @@ def convert( # noqa: C901
|
|
574
600
|
pipeline_options=pipeline_options,
|
575
601
|
backend=backend, # pdf_backend
|
576
602
|
)
|
577
|
-
|
603
|
+
|
604
|
+
format_options = {
|
605
|
+
InputFormat.PDF: pdf_format_option,
|
606
|
+
InputFormat.IMAGE: pdf_format_option,
|
607
|
+
}
|
608
|
+
|
609
|
+
elif pipeline == ProcessingPipeline.VLM:
|
578
610
|
pipeline_options = VlmPipelineOptions(
|
579
611
|
enable_remote_services=enable_remote_services,
|
580
612
|
)
|
@@ -600,13 +632,48 @@ def convert( # noqa: C901
|
|
600
632
|
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
601
633
|
)
|
602
634
|
|
635
|
+
format_options = {
|
636
|
+
InputFormat.PDF: pdf_format_option,
|
637
|
+
InputFormat.IMAGE: pdf_format_option,
|
638
|
+
}
|
639
|
+
|
640
|
+
elif pipeline == ProcessingPipeline.ASR:
|
641
|
+
pipeline_options = AsrPipelineOptions(
|
642
|
+
# enable_remote_services=enable_remote_services,
|
643
|
+
# artifacts_path = artifacts_path
|
644
|
+
)
|
645
|
+
|
646
|
+
if asr_model == AsrModelType.WHISPER_TINY:
|
647
|
+
pipeline_options.asr_options = WHISPER_TINY
|
648
|
+
elif asr_model == AsrModelType.WHISPER_SMALL:
|
649
|
+
pipeline_options.asr_options = WHISPER_SMALL
|
650
|
+
elif asr_model == AsrModelType.WHISPER_MEDIUM:
|
651
|
+
pipeline_options.asr_options = WHISPER_MEDIUM
|
652
|
+
elif asr_model == AsrModelType.WHISPER_BASE:
|
653
|
+
pipeline_options.asr_options = WHISPER_BASE
|
654
|
+
elif asr_model == AsrModelType.WHISPER_LARGE:
|
655
|
+
pipeline_options.asr_options = WHISPER_LARGE
|
656
|
+
elif asr_model == AsrModelType.WHISPER_TURBO:
|
657
|
+
pipeline_options.asr_options = WHISPER_TURBO
|
658
|
+
else:
|
659
|
+
_log.error(f"{asr_model} is not known")
|
660
|
+
raise ValueError(f"{asr_model} is not known")
|
661
|
+
|
662
|
+
_log.info(f"pipeline_options: {pipeline_options}")
|
663
|
+
|
664
|
+
audio_format_option = AudioFormatOption(
|
665
|
+
pipeline_cls=AsrPipeline,
|
666
|
+
pipeline_options=pipeline_options,
|
667
|
+
)
|
668
|
+
|
669
|
+
format_options = {
|
670
|
+
InputFormat.AUDIO: audio_format_option,
|
671
|
+
}
|
672
|
+
|
603
673
|
if artifacts_path is not None:
|
604
674
|
pipeline_options.artifacts_path = artifacts_path
|
675
|
+
# audio_pipeline_options.artifacts_path = artifacts_path
|
605
676
|
|
606
|
-
format_options: Dict[InputFormat, FormatOption] = {
|
607
|
-
InputFormat.PDF: pdf_format_option,
|
608
|
-
InputFormat.IMAGE: pdf_format_option,
|
609
|
-
}
|
610
677
|
doc_converter = DocumentConverter(
|
611
678
|
allowed_formats=from_formats,
|
612
679
|
format_options=format_options,
|
@@ -614,6 +681,7 @@ def convert( # noqa: C901
|
|
614
681
|
|
615
682
|
start_time = time.time()
|
616
683
|
|
684
|
+
_log.info(f"paths: {input_doc_paths}")
|
617
685
|
conv_results = doc_converter.convert_all(
|
618
686
|
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
|
619
687
|
)
|
@@ -0,0 +1,92 @@
|
|
1
|
+
import logging
|
2
|
+
from enum import Enum
|
3
|
+
|
4
|
+
from pydantic import (
|
5
|
+
AnyUrl,
|
6
|
+
)
|
7
|
+
|
8
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
9
|
+
from docling.datamodel.pipeline_options_asr_model import (
|
10
|
+
# AsrResponseFormat,
|
11
|
+
# ApiAsrOptions,
|
12
|
+
InferenceAsrFramework,
|
13
|
+
InlineAsrNativeWhisperOptions,
|
14
|
+
TransformersModelType,
|
15
|
+
)
|
16
|
+
|
17
|
+
_log = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
def _native_whisper_options(repo_id: str) -> InlineAsrNativeWhisperOptions:
    """Build the option preset shared by all native Whisper model sizes.

    Args:
        repo_id: Whisper model size identifier (e.g. ``"tiny"``, ``"turbo"``).

    Returns:
        A fresh options object; each preset below gets its own instance.
    """
    return InlineAsrNativeWhisperOptions(
        repo_id=repo_id,
        inference_framework=InferenceAsrFramework.WHISPER,
        verbose=True,
        timestamps=True,
        word_timestamps=True,
        # The keyword was previously misspelled ("temperatue"), so the value
        # never reached the `temperature` field declared on the options model.
        temperature=0.0,
        max_new_tokens=256,
        max_time_chunk=30.0,
    )


# One preset per supported Whisper model size; they differ only in `repo_id`.
WHISPER_TINY = _native_whisper_options("tiny")

WHISPER_SMALL = _native_whisper_options("small")

WHISPER_MEDIUM = _native_whisper_options("medium")

WHISPER_BASE = _native_whisper_options("base")

WHISPER_LARGE = _native_whisper_options("large")

WHISPER_TURBO = _native_whisper_options("turbo")
|
84
|
+
|
85
|
+
|
86
|
+
class AsrModelType(str, Enum):
    """String-valued identifiers for the selectable ASR model presets."""

    # Member order is preserved because it determines iteration order.
    WHISPER_TINY = "whisper_tiny"
    WHISPER_SMALL = "whisper_small"
    WHISPER_MEDIUM = "whisper_medium"
    WHISPER_BASE = "whisper_base"
    WHISPER_LARGE = "whisper_large"
    WHISPER_TURBO = "whisper_turbo"
|
docling/datamodel/base_models.py
CHANGED
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
|
|
49
49
|
XML_USPTO = "xml_uspto"
|
50
50
|
XML_JATS = "xml_jats"
|
51
51
|
JSON_DOCLING = "json_docling"
|
52
|
+
AUDIO = "audio"
|
52
53
|
|
53
54
|
|
54
55
|
class OutputFormat(str, Enum):
|
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
73
74
|
InputFormat.XLSX: ["xlsx", "xlsm"],
|
74
75
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
75
76
|
InputFormat.JSON_DOCLING: ["json"],
|
77
|
+
InputFormat.AUDIO: ["wav", "mp3"],
|
76
78
|
}
|
77
79
|
|
78
80
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
104
106
|
],
|
105
107
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
106
108
|
InputFormat.JSON_DOCLING: ["application/json"],
|
109
|
+
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
|
107
110
|
}
|
108
111
|
|
109
112
|
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
@@ -253,11 +256,18 @@ class Page(BaseModel):
|
|
253
256
|
return []
|
254
257
|
|
255
258
|
def get_image(
|
256
|
-
self,
|
259
|
+
self,
|
260
|
+
scale: float = 1.0,
|
261
|
+
max_size: Optional[int] = None,
|
262
|
+
cropbox: Optional[BoundingBox] = None,
|
257
263
|
) -> Optional[Image]:
|
258
264
|
if self._backend is None:
|
259
265
|
return self._image_cache.get(scale, None)
|
260
266
|
|
267
|
+
if max_size:
|
268
|
+
assert self.size is not None
|
269
|
+
scale = min(scale, max_size / max(self.size.as_tuple()))
|
270
|
+
|
261
271
|
if scale not in self._image_cache:
|
262
272
|
if cropbox is None:
|
263
273
|
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
@@ -291,7 +301,7 @@ class OpenAiChatMessage(BaseModel):
|
|
291
301
|
class OpenAiResponseChoice(BaseModel):
|
292
302
|
index: int
|
293
303
|
message: OpenAiChatMessage
|
294
|
-
finish_reason: str
|
304
|
+
finish_reason: Optional[str]
|
295
305
|
|
296
306
|
|
297
307
|
class OpenAiResponseUsage(BaseModel):
|
docling/datamodel/document.py
CHANGED
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
|
|
249
249
|
backend: Type[AbstractDocumentBackend]
|
250
250
|
if format not in format_options.keys():
|
251
251
|
_log.error(
|
252
|
-
f"Input document {obj.name} does not match any allowed format."
|
252
|
+
f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
|
253
253
|
)
|
254
254
|
backend = _DummyBackend
|
255
255
|
else:
|
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
|
|
318
318
|
mime = mime or _DocumentConversionInput._detect_csv(content)
|
319
319
|
mime = mime or "text/plain"
|
320
320
|
formats = MimeTypeToFormat.get(mime, [])
|
321
|
+
_log.info(f"detected formats: {formats}")
|
322
|
+
|
321
323
|
if formats:
|
322
324
|
if len(formats) == 1 and mime not in ("text/plain"):
|
323
325
|
return formats[0]
|
@@ -11,8 +11,13 @@ from pydantic import (
|
|
11
11
|
)
|
12
12
|
from typing_extensions import deprecated
|
13
13
|
|
14
|
+
from docling.datamodel import asr_model_specs
|
15
|
+
|
14
16
|
# Import the following for backwards compatibility
|
15
17
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
18
|
+
from docling.datamodel.pipeline_options_asr_model import (
|
19
|
+
InlineAsrOptions,
|
20
|
+
)
|
16
21
|
from docling.datamodel.pipeline_options_vlm_model import (
|
17
22
|
ApiVlmOptions,
|
18
23
|
InferenceFramework,
|
@@ -202,7 +207,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
|
|
202
207
|
|
203
208
|
# GraniteVision
|
204
209
|
granite_picture_description = PictureDescriptionVlmOptions(
|
205
|
-
repo_id="ibm-granite/granite-vision-3.
|
210
|
+
repo_id="ibm-granite/granite-vision-3.2-2b-preview",
|
206
211
|
prompt="What is shown in this image?",
|
207
212
|
)
|
208
213
|
|
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
260
265
|
)
|
261
266
|
|
262
267
|
|
268
|
+
class AsrPipelineOptions(PipelineOptions):
    """Options for the ASR pipeline."""

    # Model configuration used for transcription; defaults to the tiny Whisper preset.
    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
    # Optional location of model artifacts; None leaves it unset.
    artifacts_path: Optional[Union[Path, str]] = None
|
271
|
+
|
272
|
+
|
263
273
|
class PdfPipelineOptions(PaginatedPipelineOptions):
|
264
274
|
"""Options for the PDF pipeline."""
|
265
275
|
|
@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
297
307
|
)
|
298
308
|
|
299
309
|
|
300
|
-
class
|
310
|
+
class ProcessingPipeline(str, Enum):
    """String-valued identifiers for the selectable processing pipelines."""

    STANDARD = "standard"
    VLM = "vlm"
    ASR = "asr"
|
@@ -0,0 +1,57 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
3
|
+
|
4
|
+
from pydantic import AnyUrl, BaseModel
|
5
|
+
from typing_extensions import deprecated
|
6
|
+
|
7
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
8
|
+
from docling.datamodel.pipeline_options_vlm_model import (
|
9
|
+
# InferenceFramework,
|
10
|
+
TransformersModelType,
|
11
|
+
)
|
12
|
+
|
13
|
+
|
14
|
+
class BaseAsrOptions(BaseModel):
    """Base model for ASR option sets."""

    # Required tag naming the kind of option set; subclasses may narrow it.
    kind: str
    # prompt: str
|
17
|
+
|
18
|
+
|
19
|
+
class InferenceAsrFramework(str, Enum):
    """String-valued identifiers for the ASR inference frameworks."""

    # MLX = "mlx"  # disabled for now
    # TRANSFORMERS = "transformers"  # disabled for now
    WHISPER = "whisper"
|
23
|
+
|
24
|
+
|
25
|
+
class InlineAsrOptions(BaseAsrOptions):
    """Options for an ASR model identified by ``repo_id``."""

    kind: Literal["inline_model_options"] = "inline_model_options"

    # Identifier of the model to use.
    repo_id: str

    # Output controls.
    verbose: bool = False
    timestamps: bool = True

    # Generation / chunking parameters.
    temperature: float = 0.0
    max_new_tokens: int = 256
    max_time_chunk: float = 30.0

    torch_dtype: Optional[str] = None
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``/`` replaced by ``--``."""
        segments = self.repo_id.split("/")
        return "--".join(segments)
|
47
|
+
|
48
|
+
|
49
|
+
class InlineAsrNativeWhisperOptions(InlineAsrOptions):
    """Inline ASR options specialised for the Whisper inference framework."""

    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER

    # Language setting for transcription; defaults to "en".
    language: str = "en"
    # Overrides the parent default: MPS is not listed here.
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
    ]
    word_timestamps: bool = True
|
@@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
10
10
|
class BaseVlmOptions(BaseModel):
|
11
11
|
kind: str
|
12
12
|
prompt: str
|
13
|
+
scale: float = 2.0
|
14
|
+
max_size: Optional[int] = None
|
13
15
|
|
14
16
|
|
15
17
|
class ResponseFormat(str, Enum):
|
@@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
49
51
|
AcceleratorDevice.MPS,
|
50
52
|
]
|
51
53
|
|
52
|
-
scale: float = 2.0
|
53
|
-
|
54
54
|
temperature: float = 0.0
|
55
55
|
stop_strings: List[str] = []
|
56
56
|
extra_generation_config: Dict[str, Any] = {}
|
@@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
|
|
76
76
|
) # Default to ollama
|
77
77
|
headers: Dict[str, str] = {}
|
78
78
|
params: Dict[str, Any] = {}
|
79
|
-
scale: float = 2.0
|
80
79
|
timeout: float = 60
|
81
80
|
concurrency: int = 1
|
82
81
|
response_format: ResponseFormat
|
docling/document_converter.py
CHANGED
@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
19
19
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
20
20
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
21
21
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
22
|
+
from docling.backend.noop_backend import NoOpBackend
|
22
23
|
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
23
24
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
24
25
|
from docling.datamodel.base_models import (
|
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
|
|
41
42
|
settings,
|
42
43
|
)
|
43
44
|
from docling.exceptions import ConversionError
|
45
|
+
from docling.pipeline.asr_pipeline import AsrPipeline
|
44
46
|
from docling.pipeline.base_pipeline import BasePipeline
|
45
47
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
46
48
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
|
|
118
120
|
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
119
121
|
|
120
122
|
|
123
|
+
class AudioFormatOption(FormatOption):
    """Format option for audio input: ASR pipeline with the no-op backend."""

    # Pipeline class used for this format by default.
    pipeline_cls: Type = AsrPipeline
    # Backend class used for this format by default.
    backend: Type[AbstractDocumentBackend] = NoOpBackend
|
126
|
+
|
127
|
+
|
121
128
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
122
129
|
format_to_default_options = {
|
123
130
|
InputFormat.CSV: FormatOption(
|
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
156
163
|
InputFormat.JSON_DOCLING: FormatOption(
|
157
164
|
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
158
165
|
),
|
166
|
+
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
|
159
167
|
}
|
160
168
|
if (options := format_to_default_options.get(format)) is not None:
|
161
169
|
return options
|
docling/models/api_vlm_model.py
CHANGED
@@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
|
|
48
48
|
with TimeRecorder(conv_res, "vlm"):
|
49
49
|
assert page.size is not None
|
50
50
|
|
51
|
-
hi_res_image = page.get_image(
|
51
|
+
hi_res_image = page.get_image(
|
52
|
+
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
53
|
+
)
|
52
54
|
assert hi_res_image is not None
|
53
55
|
if hi_res_image:
|
54
56
|
if hi_res_image.mode != "RGB":
|
docling/models/base_model.py
CHANGED
@@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
|
|
86
86
|
coord_origin=bbox.coord_origin,
|
87
87
|
)
|
88
88
|
|
89
|
-
page_ix = element_prov.page_no - 1
|
89
|
+
page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
|
90
90
|
cropped_image = conv_res.pages[page_ix].get_image(
|
91
91
|
scale=self.images_scale, cropbox=expanded_bbox
|
92
92
|
)
|
@@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
123
123
|
with TimeRecorder(conv_res, "vlm"):
|
124
124
|
assert page.size is not None
|
125
125
|
|
126
|
-
hi_res_image = page.get_image(
|
126
|
+
hi_res_image = page.get_image(
|
127
|
+
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
128
|
+
)
|
127
129
|
|
128
130
|
# Define prompt structure
|
129
131
|
prompt = self.formulate_prompt()
|
@@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
73
73
|
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
|
74
74
|
assert page.size is not None
|
75
75
|
|
76
|
-
hi_res_image = page.get_image(
|
76
|
+
hi_res_image = page.get_image(
|
77
|
+
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
78
|
+
)
|
77
79
|
if hi_res_image is not None:
|
78
80
|
im_width, im_height = hi_res_image.size
|
79
81
|
|