docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +39 -18
- docling/backend/docling_parse_backend.py +61 -59
- docling/backend/docling_parse_v2_backend.py +72 -62
- docling/backend/docling_parse_v4_backend.py +21 -19
- docling/backend/md_backend.py +101 -81
- docling/backend/mspowerpoint_backend.py +72 -113
- docling/backend/msword_backend.py +99 -80
- docling/backend/noop_backend.py +51 -0
- docling/backend/pypdfium2_backend.py +127 -53
- docling/cli/main.py +82 -14
- docling/datamodel/asr_model_specs.py +92 -0
- docling/datamodel/base_models.py +21 -4
- docling/datamodel/document.py +3 -1
- docling/datamodel/pipeline_options.py +15 -2
- docling/datamodel/pipeline_options_asr_model.py +57 -0
- docling/datamodel/pipeline_options_vlm_model.py +4 -4
- docling/document_converter.py +8 -0
- docling/models/api_vlm_model.py +3 -1
- docling/models/base_model.py +1 -1
- docling/models/base_ocr_model.py +33 -11
- docling/models/easyocr_model.py +1 -1
- docling/models/layout_model.py +2 -3
- docling/models/ocr_mac_model.py +1 -1
- docling/models/page_preprocessing_model.py +3 -6
- docling/models/rapid_ocr_model.py +1 -1
- docling/models/readingorder_model.py +3 -3
- docling/models/tesseract_ocr_cli_model.py +4 -3
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
- docling/models/vlm_models_inline/mlx_model.py +3 -1
- docling/pipeline/asr_pipeline.py +253 -0
- docling/pipeline/base_pipeline.py +11 -0
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/utils/layout_postprocessor.py +11 -6
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0
docling/cli/main.py
CHANGED
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
|
29
29
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
30
30
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
31
31
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
32
|
+
from docling.datamodel.asr_model_specs import (
|
33
|
+
WHISPER_BASE,
|
34
|
+
WHISPER_LARGE,
|
35
|
+
WHISPER_MEDIUM,
|
36
|
+
WHISPER_SMALL,
|
37
|
+
WHISPER_TINY,
|
38
|
+
WHISPER_TURBO,
|
39
|
+
AsrModelType,
|
40
|
+
)
|
32
41
|
from docling.datamodel.base_models import (
|
33
42
|
ConversionStatus,
|
34
43
|
FormatToExtensions,
|
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
|
|
37
46
|
)
|
38
47
|
from docling.datamodel.document import ConversionResult
|
39
48
|
from docling.datamodel.pipeline_options import (
|
49
|
+
AsrPipelineOptions,
|
40
50
|
EasyOcrOptions,
|
41
51
|
OcrOptions,
|
42
52
|
PaginatedPipelineOptions,
|
43
53
|
PdfBackend,
|
44
|
-
PdfPipeline,
|
45
54
|
PdfPipelineOptions,
|
55
|
+
PipelineOptions,
|
56
|
+
ProcessingPipeline,
|
46
57
|
TableFormerMode,
|
47
58
|
VlmPipelineOptions,
|
48
59
|
)
|
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
|
|
54
65
|
SMOLDOCLING_TRANSFORMERS,
|
55
66
|
VlmModelType,
|
56
67
|
)
|
57
|
-
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
68
|
+
from docling.document_converter import (
|
69
|
+
AudioFormatOption,
|
70
|
+
DocumentConverter,
|
71
|
+
FormatOption,
|
72
|
+
PdfFormatOption,
|
73
|
+
)
|
58
74
|
from docling.models.factories import get_ocr_factory
|
75
|
+
from docling.pipeline.asr_pipeline import AsrPipeline
|
59
76
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
60
77
|
|
61
78
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
@@ -296,13 +313,17 @@ def convert( # noqa: C901
|
|
296
313
|
),
|
297
314
|
] = ImageRefMode.EMBEDDED,
|
298
315
|
pipeline: Annotated[
|
299
|
-
PdfPipeline,
|
316
|
+
ProcessingPipeline,
|
300
317
|
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
|
301
|
-
] = PdfPipeline.STANDARD,
|
318
|
+
] = ProcessingPipeline.STANDARD,
|
302
319
|
vlm_model: Annotated[
|
303
320
|
VlmModelType,
|
304
321
|
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
305
322
|
] = VlmModelType.SMOLDOCLING,
|
323
|
+
asr_model: Annotated[
|
324
|
+
AsrModelType,
|
325
|
+
typer.Option(..., help="Choose the ASR model to use with audio/video files."),
|
326
|
+
] = AsrModelType.WHISPER_TINY,
|
306
327
|
ocr: Annotated[
|
307
328
|
bool,
|
308
329
|
typer.Option(
|
@@ -450,12 +471,14 @@ def convert( # noqa: C901
|
|
450
471
|
),
|
451
472
|
] = None,
|
452
473
|
):
|
474
|
+
log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
|
475
|
+
|
453
476
|
if verbose == 0:
|
454
|
-
logging.basicConfig(level=logging.WARNING)
|
477
|
+
logging.basicConfig(level=logging.WARNING, format=log_format)
|
455
478
|
elif verbose == 1:
|
456
|
-
logging.basicConfig(level=logging.INFO)
|
479
|
+
logging.basicConfig(level=logging.INFO, format=log_format)
|
457
480
|
else:
|
458
|
-
logging.basicConfig(level=logging.DEBUG)
|
481
|
+
logging.basicConfig(level=logging.DEBUG, format=log_format)
|
459
482
|
|
460
483
|
settings.debug.visualize_cells = debug_visualize_cells
|
461
484
|
settings.debug.visualize_layout = debug_visualize_layout
|
@@ -530,9 +553,12 @@ def convert( # noqa: C901
|
|
530
553
|
ocr_options.lang = ocr_lang_list
|
531
554
|
|
532
555
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
533
|
-
pipeline_options: PaginatedPipelineOptions
|
556
|
+
# pipeline_options: PaginatedPipelineOptions
|
557
|
+
pipeline_options: PipelineOptions
|
558
|
+
|
559
|
+
format_options: Dict[InputFormat, FormatOption] = {}
|
534
560
|
|
535
|
-
if pipeline == PdfPipeline.STANDARD:
|
561
|
+
if pipeline == ProcessingPipeline.STANDARD:
|
536
562
|
pipeline_options = PdfPipelineOptions(
|
537
563
|
allow_external_plugins=allow_external_plugins,
|
538
564
|
enable_remote_services=enable_remote_services,
|
@@ -574,7 +600,13 @@ def convert( # noqa: C901
|
|
574
600
|
pipeline_options=pipeline_options,
|
575
601
|
backend=backend, # pdf_backend
|
576
602
|
)
|
577
|
-
|
603
|
+
|
604
|
+
format_options = {
|
605
|
+
InputFormat.PDF: pdf_format_option,
|
606
|
+
InputFormat.IMAGE: pdf_format_option,
|
607
|
+
}
|
608
|
+
|
609
|
+
elif pipeline == ProcessingPipeline.VLM:
|
578
610
|
pipeline_options = VlmPipelineOptions(
|
579
611
|
enable_remote_services=enable_remote_services,
|
580
612
|
)
|
@@ -600,13 +632,48 @@ def convert( # noqa: C901
|
|
600
632
|
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
601
633
|
)
|
602
634
|
|
635
|
+
format_options = {
|
636
|
+
InputFormat.PDF: pdf_format_option,
|
637
|
+
InputFormat.IMAGE: pdf_format_option,
|
638
|
+
}
|
639
|
+
|
640
|
+
elif pipeline == ProcessingPipeline.ASR:
|
641
|
+
pipeline_options = AsrPipelineOptions(
|
642
|
+
# enable_remote_services=enable_remote_services,
|
643
|
+
# artifacts_path = artifacts_path
|
644
|
+
)
|
645
|
+
|
646
|
+
if asr_model == AsrModelType.WHISPER_TINY:
|
647
|
+
pipeline_options.asr_options = WHISPER_TINY
|
648
|
+
elif asr_model == AsrModelType.WHISPER_SMALL:
|
649
|
+
pipeline_options.asr_options = WHISPER_SMALL
|
650
|
+
elif asr_model == AsrModelType.WHISPER_MEDIUM:
|
651
|
+
pipeline_options.asr_options = WHISPER_MEDIUM
|
652
|
+
elif asr_model == AsrModelType.WHISPER_BASE:
|
653
|
+
pipeline_options.asr_options = WHISPER_BASE
|
654
|
+
elif asr_model == AsrModelType.WHISPER_LARGE:
|
655
|
+
pipeline_options.asr_options = WHISPER_LARGE
|
656
|
+
elif asr_model == AsrModelType.WHISPER_TURBO:
|
657
|
+
pipeline_options.asr_options = WHISPER_TURBO
|
658
|
+
else:
|
659
|
+
_log.error(f"{asr_model} is not known")
|
660
|
+
raise ValueError(f"{asr_model} is not known")
|
661
|
+
|
662
|
+
_log.info(f"pipeline_options: {pipeline_options}")
|
663
|
+
|
664
|
+
audio_format_option = AudioFormatOption(
|
665
|
+
pipeline_cls=AsrPipeline,
|
666
|
+
pipeline_options=pipeline_options,
|
667
|
+
)
|
668
|
+
|
669
|
+
format_options = {
|
670
|
+
InputFormat.AUDIO: audio_format_option,
|
671
|
+
}
|
672
|
+
|
603
673
|
if artifacts_path is not None:
|
604
674
|
pipeline_options.artifacts_path = artifacts_path
|
675
|
+
# audio_pipeline_options.artifacts_path = artifacts_path
|
605
676
|
|
606
|
-
format_options: Dict[InputFormat, FormatOption] = {
|
607
|
-
InputFormat.PDF: pdf_format_option,
|
608
|
-
InputFormat.IMAGE: pdf_format_option,
|
609
|
-
}
|
610
677
|
doc_converter = DocumentConverter(
|
611
678
|
allowed_formats=from_formats,
|
612
679
|
format_options=format_options,
|
@@ -614,6 +681,7 @@ def convert( # noqa: C901
|
|
614
681
|
|
615
682
|
start_time = time.time()
|
616
683
|
|
684
|
+
_log.info(f"paths: {input_doc_paths}")
|
617
685
|
conv_results = doc_converter.convert_all(
|
618
686
|
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
|
619
687
|
)
|
docling/datamodel/asr_model_specs.py
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
import logging
|
2
|
+
from enum import Enum
|
3
|
+
|
4
|
+
from pydantic import (
|
5
|
+
AnyUrl,
|
6
|
+
)
|
7
|
+
|
8
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
9
|
+
from docling.datamodel.pipeline_options_asr_model import (
|
10
|
+
# AsrResponseFormat,
|
11
|
+
# ApiAsrOptions,
|
12
|
+
InferenceAsrFramework,
|
13
|
+
InlineAsrNativeWhisperOptions,
|
14
|
+
TransformersModelType,
|
15
|
+
)
|
16
|
+
|
17
|
+
_log = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
WHISPER_TINY = InlineAsrNativeWhisperOptions(
|
20
|
+
repo_id="tiny",
|
21
|
+
inference_framework=InferenceAsrFramework.WHISPER,
|
22
|
+
verbose=True,
|
23
|
+
timestamps=True,
|
24
|
+
word_timestamps=True,
|
25
|
+
temperatue=0.0,
|
26
|
+
max_new_tokens=256,
|
27
|
+
max_time_chunk=30.0,
|
28
|
+
)
|
29
|
+
|
30
|
+
WHISPER_SMALL = InlineAsrNativeWhisperOptions(
|
31
|
+
repo_id="small",
|
32
|
+
inference_framework=InferenceAsrFramework.WHISPER,
|
33
|
+
verbose=True,
|
34
|
+
timestamps=True,
|
35
|
+
word_timestamps=True,
|
36
|
+
temperatue=0.0,
|
37
|
+
max_new_tokens=256,
|
38
|
+
max_time_chunk=30.0,
|
39
|
+
)
|
40
|
+
|
41
|
+
WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
|
42
|
+
repo_id="medium",
|
43
|
+
inference_framework=InferenceAsrFramework.WHISPER,
|
44
|
+
verbose=True,
|
45
|
+
timestamps=True,
|
46
|
+
word_timestamps=True,
|
47
|
+
temperatue=0.0,
|
48
|
+
max_new_tokens=256,
|
49
|
+
max_time_chunk=30.0,
|
50
|
+
)
|
51
|
+
|
52
|
+
WHISPER_BASE = InlineAsrNativeWhisperOptions(
|
53
|
+
repo_id="base",
|
54
|
+
inference_framework=InferenceAsrFramework.WHISPER,
|
55
|
+
verbose=True,
|
56
|
+
timestamps=True,
|
57
|
+
word_timestamps=True,
|
58
|
+
temperatue=0.0,
|
59
|
+
max_new_tokens=256,
|
60
|
+
max_time_chunk=30.0,
|
61
|
+
)
|
62
|
+
|
63
|
+
WHISPER_LARGE = InlineAsrNativeWhisperOptions(
|
64
|
+
repo_id="large",
|
65
|
+
inference_framework=InferenceAsrFramework.WHISPER,
|
66
|
+
verbose=True,
|
67
|
+
timestamps=True,
|
68
|
+
word_timestamps=True,
|
69
|
+
temperatue=0.0,
|
70
|
+
max_new_tokens=256,
|
71
|
+
max_time_chunk=30.0,
|
72
|
+
)
|
73
|
+
|
74
|
+
WHISPER_TURBO = InlineAsrNativeWhisperOptions(
|
75
|
+
repo_id="turbo",
|
76
|
+
inference_framework=InferenceAsrFramework.WHISPER,
|
77
|
+
verbose=True,
|
78
|
+
timestamps=True,
|
79
|
+
word_timestamps=True,
|
80
|
+
temperatue=0.0,
|
81
|
+
max_new_tokens=256,
|
82
|
+
max_time_chunk=30.0,
|
83
|
+
)
|
84
|
+
|
85
|
+
|
86
|
+
class AsrModelType(str, Enum):
|
87
|
+
WHISPER_TINY = "whisper_tiny"
|
88
|
+
WHISPER_SMALL = "whisper_small"
|
89
|
+
WHISPER_MEDIUM = "whisper_medium"
|
90
|
+
WHISPER_BASE = "whisper_base"
|
91
|
+
WHISPER_LARGE = "whisper_large"
|
92
|
+
WHISPER_TURBO = "whisper_turbo"
|
docling/datamodel/base_models.py
CHANGED
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
|
|
49
49
|
XML_USPTO = "xml_uspto"
|
50
50
|
XML_JATS = "xml_jats"
|
51
51
|
JSON_DOCLING = "json_docling"
|
52
|
+
AUDIO = "audio"
|
52
53
|
|
53
54
|
|
54
55
|
class OutputFormat(str, Enum):
|
@@ -67,12 +68,13 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
67
68
|
InputFormat.MD: ["md"],
|
68
69
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
69
70
|
InputFormat.XML_JATS: ["xml", "nxml"],
|
70
|
-
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
71
|
+
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
|
71
72
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
72
73
|
InputFormat.CSV: ["csv"],
|
73
|
-
InputFormat.XLSX: ["xlsx"],
|
74
|
+
InputFormat.XLSX: ["xlsx", "xlsm"],
|
74
75
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
75
76
|
InputFormat.JSON_DOCLING: ["json"],
|
77
|
+
InputFormat.AUDIO: ["wav", "mp3"],
|
76
78
|
}
|
77
79
|
|
78
80
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
104
106
|
],
|
105
107
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
106
108
|
InputFormat.JSON_DOCLING: ["application/json"],
|
109
|
+
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
|
107
110
|
}
|
108
111
|
|
109
112
|
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
@@ -232,7 +235,6 @@ class Page(BaseModel):
|
|
232
235
|
page_no: int
|
233
236
|
# page_hash: Optional[str] = None
|
234
237
|
size: Optional[Size] = None
|
235
|
-
cells: List[TextCell] = []
|
236
238
|
parsed_page: Optional[SegmentedPdfPage] = None
|
237
239
|
predictions: PagePredictions = PagePredictions()
|
238
240
|
assembled: Optional[AssembledUnit] = None
|
@@ -245,12 +247,27 @@ class Page(BaseModel):
|
|
245
247
|
float, Image
|
246
248
|
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
247
249
|
|
250
|
+
@property
|
251
|
+
def cells(self) -> List[TextCell]:
|
252
|
+
"""Return text cells as a read-only view of parsed_page.textline_cells."""
|
253
|
+
if self.parsed_page is not None:
|
254
|
+
return self.parsed_page.textline_cells
|
255
|
+
else:
|
256
|
+
return []
|
257
|
+
|
248
258
|
def get_image(
|
249
|
-
self,
|
259
|
+
self,
|
260
|
+
scale: float = 1.0,
|
261
|
+
max_size: Optional[int] = None,
|
262
|
+
cropbox: Optional[BoundingBox] = None,
|
250
263
|
) -> Optional[Image]:
|
251
264
|
if self._backend is None:
|
252
265
|
return self._image_cache.get(scale, None)
|
253
266
|
|
267
|
+
if max_size:
|
268
|
+
assert self.size is not None
|
269
|
+
scale = min(scale, max_size / max(self.size.as_tuple()))
|
270
|
+
|
254
271
|
if scale not in self._image_cache:
|
255
272
|
if cropbox is None:
|
256
273
|
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
docling/datamodel/document.py
CHANGED
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
|
|
249
249
|
backend: Type[AbstractDocumentBackend]
|
250
250
|
if format not in format_options.keys():
|
251
251
|
_log.error(
|
252
|
-
f"Input document {obj.name} does not match any allowed format."
|
252
|
+
f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
|
253
253
|
)
|
254
254
|
backend = _DummyBackend
|
255
255
|
else:
|
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
|
|
318
318
|
mime = mime or _DocumentConversionInput._detect_csv(content)
|
319
319
|
mime = mime or "text/plain"
|
320
320
|
formats = MimeTypeToFormat.get(mime, [])
|
321
|
+
_log.info(f"detected formats: {formats}")
|
322
|
+
|
321
323
|
if formats:
|
322
324
|
if len(formats) == 1 and mime not in ("text/plain"):
|
323
325
|
return formats[0]
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -11,8 +11,13 @@ from pydantic import (
|
|
11
11
|
)
|
12
12
|
from typing_extensions import deprecated
|
13
13
|
|
14
|
+
from docling.datamodel import asr_model_specs
|
15
|
+
|
14
16
|
# Import the following for backwards compatibility
|
15
17
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
18
|
+
from docling.datamodel.pipeline_options_asr_model import (
|
19
|
+
InlineAsrOptions,
|
20
|
+
)
|
16
21
|
from docling.datamodel.pipeline_options_vlm_model import (
|
17
22
|
ApiVlmOptions,
|
18
23
|
InferenceFramework,
|
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
260
265
|
)
|
261
266
|
|
262
267
|
|
268
|
+
class AsrPipelineOptions(PipelineOptions):
|
269
|
+
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
270
|
+
artifacts_path: Optional[Union[Path, str]] = None
|
271
|
+
|
272
|
+
|
263
273
|
class PdfPipelineOptions(PaginatedPipelineOptions):
|
264
274
|
"""Options for the PDF pipeline."""
|
265
275
|
|
@@ -292,9 +302,12 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
292
302
|
),
|
293
303
|
)
|
294
304
|
|
295
|
-
generate_parsed_pages:
|
305
|
+
generate_parsed_pages: Literal[True] = (
|
306
|
+
True # Always True since parsed_page is now mandatory
|
307
|
+
)
|
296
308
|
|
297
309
|
|
298
|
-
class PdfPipeline(str, Enum):
|
310
|
+
class ProcessingPipeline(str, Enum):
|
299
311
|
STANDARD = "standard"
|
300
312
|
VLM = "vlm"
|
313
|
+
ASR = "asr"
|
docling/datamodel/pipeline_options_asr_model.py
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
from enum import Enum
|
2
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
3
|
+
|
4
|
+
from pydantic import AnyUrl, BaseModel
|
5
|
+
from typing_extensions import deprecated
|
6
|
+
|
7
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice
|
8
|
+
from docling.datamodel.pipeline_options_vlm_model import (
|
9
|
+
# InferenceFramework,
|
10
|
+
TransformersModelType,
|
11
|
+
)
|
12
|
+
|
13
|
+
|
14
|
+
class BaseAsrOptions(BaseModel):
|
15
|
+
kind: str
|
16
|
+
# prompt: str
|
17
|
+
|
18
|
+
|
19
|
+
class InferenceAsrFramework(str, Enum):
|
20
|
+
# MLX = "mlx" # disabled for now
|
21
|
+
# TRANSFORMERS = "transformers" # disabled for now
|
22
|
+
WHISPER = "whisper"
|
23
|
+
|
24
|
+
|
25
|
+
class InlineAsrOptions(BaseAsrOptions):
|
26
|
+
kind: Literal["inline_model_options"] = "inline_model_options"
|
27
|
+
|
28
|
+
repo_id: str
|
29
|
+
|
30
|
+
verbose: bool = False
|
31
|
+
timestamps: bool = True
|
32
|
+
|
33
|
+
temperature: float = 0.0
|
34
|
+
max_new_tokens: int = 256
|
35
|
+
max_time_chunk: float = 30.0
|
36
|
+
|
37
|
+
torch_dtype: Optional[str] = None
|
38
|
+
supported_devices: List[AcceleratorDevice] = [
|
39
|
+
AcceleratorDevice.CPU,
|
40
|
+
AcceleratorDevice.CUDA,
|
41
|
+
AcceleratorDevice.MPS,
|
42
|
+
]
|
43
|
+
|
44
|
+
@property
|
45
|
+
def repo_cache_folder(self) -> str:
|
46
|
+
return self.repo_id.replace("/", "--")
|
47
|
+
|
48
|
+
|
49
|
+
class InlineAsrNativeWhisperOptions(InlineAsrOptions):
|
50
|
+
inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
|
51
|
+
|
52
|
+
language: str = "en"
|
53
|
+
supported_devices: List[AcceleratorDevice] = [
|
54
|
+
AcceleratorDevice.CPU,
|
55
|
+
AcceleratorDevice.CUDA,
|
56
|
+
]
|
57
|
+
word_timestamps: bool = True
|
docling/datamodel/pipeline_options_vlm_model.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
from typing import Any, Dict, List, Literal
|
2
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
3
3
|
|
4
4
|
from pydantic import AnyUrl, BaseModel
|
5
5
|
from typing_extensions import deprecated
|
@@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
10
10
|
class BaseVlmOptions(BaseModel):
|
11
11
|
kind: str
|
12
12
|
prompt: str
|
13
|
+
scale: float = 2.0
|
14
|
+
max_size: Optional[int] = None
|
13
15
|
|
14
16
|
|
15
17
|
class ResponseFormat(str, Enum):
|
@@ -42,14 +44,13 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
42
44
|
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
43
45
|
response_format: ResponseFormat
|
44
46
|
|
47
|
+
torch_dtype: Optional[str] = None
|
45
48
|
supported_devices: List[AcceleratorDevice] = [
|
46
49
|
AcceleratorDevice.CPU,
|
47
50
|
AcceleratorDevice.CUDA,
|
48
51
|
AcceleratorDevice.MPS,
|
49
52
|
]
|
50
53
|
|
51
|
-
scale: float = 2.0
|
52
|
-
|
53
54
|
temperature: float = 0.0
|
54
55
|
stop_strings: List[str] = []
|
55
56
|
extra_generation_config: Dict[str, Any] = {}
|
@@ -75,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
|
|
75
76
|
) # Default to ollama
|
76
77
|
headers: Dict[str, str] = {}
|
77
78
|
params: Dict[str, Any] = {}
|
78
|
-
scale: float = 2.0
|
79
79
|
timeout: float = 60
|
80
80
|
concurrency: int = 1
|
81
81
|
response_format: ResponseFormat
|
docling/document_converter.py
CHANGED
@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
19
19
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
20
20
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
21
21
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
22
|
+
from docling.backend.noop_backend import NoOpBackend
|
22
23
|
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
23
24
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
24
25
|
from docling.datamodel.base_models import (
|
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
|
|
41
42
|
settings,
|
42
43
|
)
|
43
44
|
from docling.exceptions import ConversionError
|
45
|
+
from docling.pipeline.asr_pipeline import AsrPipeline
|
44
46
|
from docling.pipeline.base_pipeline import BasePipeline
|
45
47
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
46
48
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
|
|
118
120
|
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
119
121
|
|
120
122
|
|
123
|
+
class AudioFormatOption(FormatOption):
|
124
|
+
pipeline_cls: Type = AsrPipeline
|
125
|
+
backend: Type[AbstractDocumentBackend] = NoOpBackend
|
126
|
+
|
127
|
+
|
121
128
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
122
129
|
format_to_default_options = {
|
123
130
|
InputFormat.CSV: FormatOption(
|
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
156
163
|
InputFormat.JSON_DOCLING: FormatOption(
|
157
164
|
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
158
165
|
),
|
166
|
+
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
|
159
167
|
}
|
160
168
|
if (options := format_to_default_options.get(format)) is not None:
|
161
169
|
return options
|
docling/models/api_vlm_model.py
CHANGED
@@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
|
|
48
48
|
with TimeRecorder(conv_res, "vlm"):
|
49
49
|
assert page.size is not None
|
50
50
|
|
51
|
-
hi_res_image = page.get_image(
|
51
|
+
hi_res_image = page.get_image(
|
52
|
+
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
53
|
+
)
|
52
54
|
assert hi_res_image is not None
|
53
55
|
if hi_res_image:
|
54
56
|
if hi_res_image.mode != "RGB":
|
docling/models/base_model.py
CHANGED
@@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
|
|
86
86
|
coord_origin=bbox.coord_origin,
|
87
87
|
)
|
88
88
|
|
89
|
-
page_ix = element_prov.page_no - 1
|
89
|
+
page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
|
90
90
|
cropped_image = conv_res.pages[page_ix].get_image(
|
91
91
|
scale=self.images_scale, cropbox=expanded_bbox
|
92
92
|
)
|
docling/models/base_ocr_model.py
CHANGED
@@ -7,6 +7,7 @@ from typing import List, Optional, Type
|
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
10
|
+
from docling_core.types.doc.page import TextCell
|
10
11
|
from PIL import Image, ImageDraw
|
11
12
|
from rtree import index
|
12
13
|
from scipy.ndimage import binary_dilation, find_objects, label
|
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|
107
108
|
return []
|
108
109
|
|
109
110
|
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
110
|
-
def _filter_ocr_cells(
|
111
|
+
def _filter_ocr_cells(
|
112
|
+
self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
|
113
|
+
) -> List[TextCell]:
|
111
114
|
# Create R-tree index for programmatic cells
|
112
115
|
p = index.Property()
|
113
116
|
p.dimension = 2
|
@@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|
130
133
|
]
|
131
134
|
return filtered_ocr_cells
|
132
135
|
|
133
|
-
def post_process_cells(self, ocr_cells,
|
136
|
+
def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
|
134
137
|
r"""
|
135
|
-
Post-process the
|
138
|
+
Post-process the OCR cells and update the page object.
|
139
|
+
Updates parsed_page.textline_cells directly since page.cells is now read-only.
|
136
140
|
"""
|
141
|
+
# Get existing cells from the read-only property
|
142
|
+
existing_cells = page.cells
|
143
|
+
|
144
|
+
# Combine existing and OCR cells with overlap filtering
|
145
|
+
final_cells = self._combine_cells(existing_cells, ocr_cells)
|
146
|
+
|
147
|
+
assert page.parsed_page is not None
|
148
|
+
|
149
|
+
# Update parsed_page.textline_cells directly
|
150
|
+
page.parsed_page.textline_cells = final_cells
|
151
|
+
page.parsed_page.has_lines = len(final_cells) > 0
|
152
|
+
|
153
|
+
def _combine_cells(
|
154
|
+
self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
|
155
|
+
) -> List[TextCell]:
|
156
|
+
"""Combine existing and OCR cells with filtering and re-indexing."""
|
137
157
|
if self.options.force_full_page_ocr:
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
158
|
+
combined = ocr_cells
|
159
|
+
else:
|
160
|
+
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
|
161
|
+
combined = list(existing_cells) + filtered_ocr_cells
|
162
|
+
|
163
|
+
# Re-index in-place
|
164
|
+
for i, cell in enumerate(combined):
|
165
|
+
cell.index = i
|
166
|
+
|
167
|
+
return combined
|
146
168
|
|
147
169
|
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
148
170
|
image = copy.deepcopy(page.image)
|
docling/models/easyocr_model.py
CHANGED
@@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
|
|
177
177
|
all_ocr_cells.extend(cells)
|
178
178
|
|
179
179
|
# Post-process the cells
|
180
|
-
|
180
|
+
self.post_process_cells(all_ocr_cells, page)
|
181
181
|
|
182
182
|
# DEBUG code:
|
183
183
|
if settings.debug.visualize_ocr:
|
docling/models/layout_model.py
CHANGED
@@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
|
|
176
176
|
# Apply postprocessing
|
177
177
|
|
178
178
|
processed_clusters, processed_cells = LayoutPostprocessor(
|
179
|
-
page
|
179
|
+
page, clusters
|
180
180
|
).postprocess()
|
181
|
-
#
|
181
|
+
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
|
182
182
|
|
183
183
|
with warnings.catch_warnings():
|
184
184
|
warnings.filterwarnings(
|
@@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
|
|
198
198
|
)
|
199
199
|
)
|
200
200
|
|
201
|
-
page.cells = processed_cells
|
202
201
|
page.predictions.layout = LayoutPrediction(
|
203
202
|
clusters=processed_clusters
|
204
203
|
)
|
docling/models/ocr_mac_model.py
CHANGED
@@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
|
|
132
132
|
all_ocr_cells.extend(cells)
|
133
133
|
|
134
134
|
# Post-process the cells
|
135
|
-
|
135
|
+
self.post_process_cells(all_ocr_cells, page)
|
136
136
|
|
137
137
|
# DEBUG code:
|
138
138
|
if settings.debug.visualize_ocr:
|