docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
docling/cli/main.py ADDED
@@ -0,0 +1,974 @@
1
+ import datetime
2
+ import importlib
3
+ import logging
4
+ import platform
5
+ import re
6
+ import sys
7
+ import tempfile
8
+ import time
9
+ import warnings
10
+ from collections.abc import Iterable
11
+ from pathlib import Path
12
+ from typing import Annotated, Dict, List, Optional, Type
13
+
14
+ import rich.table
15
+ import typer
16
+ from docling_core.transforms.serializer.html import (
17
+ HTMLDocSerializer,
18
+ HTMLOutputStyle,
19
+ HTMLParams,
20
+ )
21
+ from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
22
+ from docling_core.types.doc import ImageRefMode
23
+ from docling_core.utils.file import resolve_source_to_path
24
+ from pydantic import TypeAdapter
25
+ from rich.console import Console
26
+
27
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
28
+ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
29
+ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
30
+ from docling.backend.image_backend import ImageDocumentBackend
31
+ from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
32
+ from docling.backend.pdf_backend import PdfDocumentBackend
33
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
34
+ from docling.datamodel import vlm_model_specs
35
+ from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
36
+ from docling.datamodel.asr_model_specs import (
37
+ WHISPER_BASE,
38
+ WHISPER_BASE_MLX,
39
+ WHISPER_BASE_NATIVE,
40
+ WHISPER_LARGE,
41
+ WHISPER_LARGE_MLX,
42
+ WHISPER_LARGE_NATIVE,
43
+ WHISPER_MEDIUM,
44
+ WHISPER_MEDIUM_MLX,
45
+ WHISPER_MEDIUM_NATIVE,
46
+ WHISPER_SMALL,
47
+ WHISPER_SMALL_MLX,
48
+ WHISPER_SMALL_NATIVE,
49
+ WHISPER_TINY,
50
+ WHISPER_TINY_MLX,
51
+ WHISPER_TINY_NATIVE,
52
+ WHISPER_TURBO,
53
+ WHISPER_TURBO_MLX,
54
+ WHISPER_TURBO_NATIVE,
55
+ AsrModelType,
56
+ )
57
+ from docling.datamodel.backend_options import PdfBackendOptions
58
+ from docling.datamodel.base_models import (
59
+ ConversionStatus,
60
+ FormatToExtensions,
61
+ InputFormat,
62
+ OutputFormat,
63
+ )
64
+ from docling.datamodel.document import ConversionResult, DoclingVersion
65
+ from docling.datamodel.pipeline_options import (
66
+ AsrPipelineOptions,
67
+ ConvertPipelineOptions,
68
+ OcrAutoOptions,
69
+ OcrOptions,
70
+ PaginatedPipelineOptions,
71
+ PdfBackend,
72
+ PdfPipelineOptions,
73
+ PipelineOptions,
74
+ ProcessingPipeline,
75
+ TableFormerMode,
76
+ TableStructureOptions,
77
+ TesseractCliOcrOptions,
78
+ TesseractOcrOptions,
79
+ VlmPipelineOptions,
80
+ )
81
+ from docling.datamodel.settings import settings
82
+ from docling.datamodel.vlm_model_specs import VlmModelType
83
+ from docling.document_converter import (
84
+ AudioFormatOption,
85
+ DocumentConverter,
86
+ ExcelFormatOption,
87
+ FormatOption,
88
+ HTMLFormatOption,
89
+ MarkdownFormatOption,
90
+ PdfFormatOption,
91
+ PowerpointFormatOption,
92
+ WordFormatOption,
93
+ )
94
+ from docling.models.factories import (
95
+ get_layout_factory,
96
+ get_ocr_factory,
97
+ get_table_structure_factory,
98
+ )
99
+ from docling.models.factories.base_factory import BaseFactory
100
+ from docling.pipeline.asr_pipeline import AsrPipeline
101
+ from docling.pipeline.vlm_pipeline import VlmPipeline
102
+ from docling.utils.profiling import ProfilingItem
103
+
104
# Silence warning categories coming from third-party dependencies so they do
# not clutter the CLI output.
warnings.filterwarnings(
    category=UserWarning,
    module="pydantic|torch",
    action="ignore",
)
warnings.filterwarnings(
    category=FutureWarning,
    module="easyocr",
    action="ignore",
)

# Module-level logger; its level is configured by the ``convert`` command.
_log = logging.getLogger(__name__)

# Rich consoles: one for regular output, one bound to stderr for errors.
console = Console()
err_console = Console(stderr=True)

# OCR factory restricted to built-in engines (external plugins disabled); the
# derived enum supplies the engine names shown in the ``--ocr-engine`` help.
ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
ocr_engines_enum_internal = ocr_factory_internal.get_enum()
114
+
115
+ DOCLING_ASCII_ART = r"""
116
+ ████ ██████
117
+ ███░░██░░░░░██████
118
+ ████████░░░░░░░░████████████
119
+ ████████░░░░░░░░░░░░░░░░░░████████
120
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░██████
121
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
122
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
123
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
124
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
125
+ ██████░░░░░░░ ░░░░░░░░░░░░░░░░░░░░░░ ░░░░░░░██████
126
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
127
+ ██████░░░░░░ ░░░░░░░░░░░░░░░ ░░░░░░██████
128
+ ███▒██░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒███
129
+ ███▒██░░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒████
130
+ ███▒██░░░░░░ ██ ██ ░░░░░░░░░░░░ ██ ██ ░░░░░██▒▒███
131
+ ███▒███░░░░░ ██ ░░░░████░░░░ ██ ░░░░░██▒▒███
132
+ ████▒▒██░░░░░░ ░░░███▒▒▒▒███░░░ ░░░░░░░██▒▒████
133
+ ████▒▒██░░░░░░░░░░░░░░░░░█▒▒▒▒▒▒▒▒▒▒█░░░░░░░░░░░░░░░░███▒▒████
134
+ ████▒▒▒██░░░░░░░░░░░░█████ ▒▒▒▒▒▒ ██████░░░░░░░░░░░██▒▒▒████
135
+ ███▒▒▒▒██░░░░░░░░███▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒███░░░░░░░░██▒▒▒▒███
136
+ ███▒▒▒▒▒███░░░░░░██▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒██░░░░░░███▒▒▒▒▒███
137
+ ████▒▒▒▒▒████░░░░░░██████████████████████░░░░░░████▒▒▒▒▒████
138
+ ███▒▒▒▒▒▒▒▒████░░░░░░░░░░░░░░░░░░░░░░░░░░░████▒▒▒▒▒▒▒▒▒███
139
+ ████▒▒▒▒▒▒▒▒███░░░░░████████████████████████▒▒▒▒▒▒▒▒▒████
140
+ ████▒▒▒▒▒▒██░░░░░░█ █░░░░░██▒▒▒▒▒▒████
141
+ ████▒▒▒▒█░░░░░░░█ D O C L I N G █░░░░░░░░██▒▒▒████
142
+ ████▒▒██░░░░░░█ █░░░░░░░░░░█▒▒████
143
+ ██████░░░░░░█ D O C L I N G █░░░░░░░░░░░██████
144
+ ████░░░░░█ █░░░░░░░░░░░░████
145
+ █████░░█ D O C L I N G █░░░░░░░░░░░█████
146
+ █████ █░░░░░░░░████████
147
+ ██ D O C L I N G █░░░░░░░░█████
148
+ █ █░░░████████
149
+ █████████████████████████████
150
+ """
151
+
152
+
153
# Root Typer application object for the Docling command-line interface.
app = typer.Typer(
    name="Docling",
    # Typer's pretty exception rendering and shell-completion options are
    # switched off for this CLI.
    pretty_exceptions_enable=False,
    add_completion=False,
    # Invoking the program without arguments shows the help text.
    no_args_is_help=True,
)
159
+
160
+
161
def logo_callback(value: bool):
    """Eager Typer callback for ``--logo``: print the ASCII-art logo and exit.

    Args:
        value: Parsed flag value; nothing happens when it is falsy.

    Raises:
        typer.Exit: After the logo has been printed.
    """
    if not value:
        return
    print(DOCLING_ASCII_ART)
    raise typer.Exit()
165
+
166
+
167
def version_callback(value: bool):
    """Eager Typer callback for ``--version``: print version details and exit.

    Args:
        value: Parsed flag value; nothing happens when it is falsy.

    Raises:
        typer.Exit: After the version report has been printed.
    """
    if not value:
        return

    info = DoclingVersion()
    report_lines = (
        f"Docling version: {info.docling_version}",
        f"Docling Core version: {info.docling_core_version}",
        f"Docling IBM Models version: {info.docling_ibm_models_version}",
        f"Docling Parse version: {info.docling_parse_version}",
        f"Python: {info.py_impl_version} ({info.py_lang_version})",
        f"Platform: {info.platform_str}",
    )
    for line in report_lines:
        print(line)
    raise typer.Exit()
177
+
178
+
179
def show_external_plugins_callback(value: bool):
    """Eager Typer callback for ``--show-external-plugins``.

    When the flag is set, one table per factory (OCR, layout, table
    structure) is printed, listing the registered engines whose implementing
    module does not start with ``docling.``; the program then exits.

    Args:
        value: Parsed flag value; nothing happens when it is falsy.

    Raises:
        typer.Exit: After the tables have been printed.
    """
    if not value:
        return

    # All factories are created (with external plugins enabled) before any
    # table is rendered.
    factories = (
        (get_ocr_factory(allow_external_plugins=True), "OCR"),
        (get_layout_factory(allow_external_plugins=True), "layout"),
        (get_table_structure_factory(allow_external_plugins=True), "table"),
    )

    for factory, label in factories:
        listing = rich.table.Table(title=f"Available {label} engines")
        listing.add_column("Name", justify="right")
        listing.add_column("Plugin")
        listing.add_column("Package")

        external_entries = (
            entry
            for entry in factory.registered_meta.values()
            if not entry.module.startswith("docling.")
        )
        for entry in external_entries:
            # The package column shows the top-level package of the module.
            package = entry.module.split(".")[0]
            listing.add_row(
                f"[bold]{entry.kind}[/bold]", entry.plugin_name, package
            )

        rich.print(listing)

    raise typer.Exit()
204
+
205
+
206
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_yaml: bool,
    export_html: bool,
    export_html_split_page: bool,
    show_layout: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
    print_timings: bool,
    export_timings: bool,
    image_export_mode: ImageRefMode,
):
    """Write every successfully converted document in the requested formats.

    Output files are named after the stem of the input file and placed in
    ``output_dir``. Failed conversions are logged and counted; a summary is
    logged at INFO level once all results have been consumed.

    Args:
        conv_results: Conversion results to export; consumed once.
        output_dir: Directory that receives the exported files.
        export_json: Write ``<stem>.json``.
        export_yaml: Write ``<stem>.yaml``.
        export_html: Write ``<stem>.html`` (single-page view).
        export_html_split_page: Write ``<stem>.html`` (split-page view).
        show_layout: For the split-page view, serialize with a layout
            visualizer instead of the plain ``save_as_html`` call.
        export_md: Write ``<stem>.md``.
        export_txt: Write ``<stem>.txt`` (strict-text Markdown export with
            placeholder images).
        export_doctags: Write ``<stem>.doctags``.
        print_timings: Print a profiling summary table per document.
        export_timings: Dump the profiling timings to a timestamped JSON file.
        image_export_mode: Image reference mode passed to the JSON, YAML,
            HTML and Markdown exporters.
    """
    # Column order of the profiling summary table (loop-invariant).
    metric_columns = [
        "Stage",
        "count",
        "total",
        "mean",
        "median",
        "min",
        "max",
        "0.1 percentile",
        "0.9 percentile",
    ]
    # The serializer for timings is only needed when they are exported, and
    # is the same for every document.
    timings_adapter = (
        TypeAdapter(dict[str, ProfilingItem]) if export_timings else None
    )

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            # Per-error details are only assembled when INFO logging is on.
            if _log.isEnabledFor(logging.INFO):
                for err in conv_res.errors:
                    _log.info(
                        f"  [Failure Detail] Component: {err.component_type}, "
                        f"Module: {err.module_name}, Message: {err.error_message}"
                    )
            failure_count += 1
            continue

        success_count += 1
        doc_filename = conv_res.input.file.stem

        # Export JSON format:
        if export_json:
            fname = output_dir / f"{doc_filename}.json"
            _log.info(f"writing JSON output to {fname}")
            conv_res.document.save_as_json(
                filename=fname, image_mode=image_export_mode
            )

        # Export YAML format:
        if export_yaml:
            fname = output_dir / f"{doc_filename}.yaml"
            _log.info(f"writing YAML output to {fname}")
            conv_res.document.save_as_yaml(
                filename=fname, image_mode=image_export_mode
            )

        # Export HTML format (single-page view):
        if export_html:
            fname = output_dir / f"{doc_filename}.html"
            _log.info(f"writing HTML output to {fname}")
            conv_res.document.save_as_html(
                filename=fname, image_mode=image_export_mode, split_page_view=False
            )

        # Export HTML format (split-page view).
        # NOTE: this targets the same path as the single-page export above,
        # so when both formats are requested the split-page file wins.
        if export_html_split_page:
            fname = output_dir / f"{doc_filename}.html"
            _log.info(f"writing HTML output to {fname}")
            if show_layout:
                ser = HTMLDocSerializer(
                    doc=conv_res.document,
                    params=HTMLParams(
                        image_mode=image_export_mode,
                        output_style=HTMLOutputStyle.SPLIT_PAGE,
                    ),
                )
                visualizer = LayoutVisualizer()
                visualizer.params.show_label = False
                ser_res = ser.serialize(
                    visualizer=visualizer,
                )
                # Explicit UTF-8: without it the platform's locale encoding
                # is used, which can fail on (or mis-encode) non-ASCII text.
                with open(fname, "w", encoding="utf-8") as fw:
                    fw.write(ser_res.text)
            else:
                conv_res.document.save_as_html(
                    filename=fname,
                    image_mode=image_export_mode,
                    split_page_view=True,
                )

        # Export Text format:
        if export_txt:
            fname = output_dir / f"{doc_filename}.txt"
            _log.info(f"writing TXT output to {fname}")
            conv_res.document.save_as_markdown(
                filename=fname,
                strict_text=True,
                image_mode=ImageRefMode.PLACEHOLDER,
            )

        # Export Markdown format:
        if export_md:
            fname = output_dir / f"{doc_filename}.md"
            _log.info(f"writing Markdown output to {fname}")
            conv_res.document.save_as_markdown(
                filename=fname, image_mode=image_export_mode
            )

        # Export Document Tags format:
        if export_doctags:
            fname = output_dir / f"{doc_filename}.doctags"
            _log.info(f"writing Doc Tags output to {fname}")
            conv_res.document.save_as_doctags(filename=fname)

        # Print profiling timings
        if print_timings:
            table = rich.table.Table(title=f"Profiling Summary, {doc_filename}")
            for col in metric_columns:
                table.add_column(col, style="bold")
            for stage_key, item in conv_res.timings.items():
                col_dict = {
                    "Stage": stage_key,
                    "count": item.count,
                    "total": item.total(),
                    "mean": item.avg(),
                    "median": item.percentile(0.5),
                    "min": item.percentile(0.0),
                    "max": item.percentile(1.0),
                    "0.1 percentile": item.percentile(0.1),
                    "0.9 percentile": item.percentile(0.9),
                }
                row_values = [str(col_dict[col]) for col in metric_columns]
                table.add_row(*row_values)

            console.print(table)

        # Export profiling timings
        if timings_adapter is not None:
            now = datetime.datetime.now()
            timings_file = (
                output_dir / f"{doc_filename}-timings-{now:%Y-%m-%d_%H-%M-%S}.json"
            )
            # dump_json returns bytes, hence the binary file mode.
            with timings_file.open("wb") as fp:
                fp.write(timings_adapter.dump_json(conv_res.timings, indent=2))

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )
360
+
361
+
362
+ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
363
+ if raw is None:
364
+ return None
365
+ return re.split(r"[;,]", raw)
366
+
367
+
368
+ @app.command(no_args_is_help=True)
369
+ def convert( # noqa: C901
370
+ input_sources: Annotated[
371
+ List[str],
372
+ typer.Argument(
373
+ ...,
374
+ metavar="source",
375
+ help="PDF files to convert. Can be local file / directory paths or URL.",
376
+ ),
377
+ ],
378
+ from_formats: List[InputFormat] = typer.Option(
379
+ None,
380
+ "--from",
381
+ help="Specify input formats to convert from. Defaults to all formats.",
382
+ ),
383
+ to_formats: List[OutputFormat] = typer.Option(
384
+ None, "--to", help="Specify output formats. Defaults to Markdown."
385
+ ),
386
+ show_layout: Annotated[
387
+ bool,
388
+ typer.Option(
389
+ ...,
390
+ help="If enabled, the page images will show the bounding-boxes of the items.",
391
+ ),
392
+ ] = False,
393
+ headers: str = typer.Option(
394
+ None,
395
+ "--headers",
396
+ help="Specify http request headers used when fetching url input sources in the form of a JSON string",
397
+ ),
398
+ image_export_mode: Annotated[
399
+ ImageRefMode,
400
+ typer.Option(
401
+ ...,
402
+ help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
403
+ ),
404
+ ] = ImageRefMode.EMBEDDED,
405
+ pipeline: Annotated[
406
+ ProcessingPipeline,
407
+ typer.Option(..., help="Choose the pipeline to process PDF or image files."),
408
+ ] = ProcessingPipeline.STANDARD,
409
+ vlm_model: Annotated[
410
+ VlmModelType,
411
+ typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
412
+ ] = VlmModelType.GRANITEDOCLING,
413
+ asr_model: Annotated[
414
+ AsrModelType,
415
+ typer.Option(..., help="Choose the ASR model to use with audio/video files."),
416
+ ] = AsrModelType.WHISPER_TINY,
417
+ ocr: Annotated[
418
+ bool,
419
+ typer.Option(
420
+ ..., help="If enabled, the bitmap content will be processed using OCR."
421
+ ),
422
+ ] = True,
423
+ force_ocr: Annotated[
424
+ bool,
425
+ typer.Option(
426
+ ...,
427
+ help="Replace any existing text with OCR generated text over the full content.",
428
+ ),
429
+ ] = False,
430
+ tables: Annotated[
431
+ bool,
432
+ typer.Option(
433
+ ...,
434
+ help="If enabled, the table structure model will be used to extract table information.",
435
+ ),
436
+ ] = True,
437
+ ocr_engine: Annotated[
438
+ str,
439
+ typer.Option(
440
+ ...,
441
+ help=(
442
+ f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
443
+ f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
444
+ f"Use the option --show-external-plugins to see the options allowed with external plugins."
445
+ ),
446
+ ),
447
+ ] = OcrAutoOptions.kind,
448
+ ocr_lang: Annotated[
449
+ Optional[str],
450
+ typer.Option(
451
+ ...,
452
+ help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
453
+ ),
454
+ ] = None,
455
+ psm: Annotated[
456
+ Optional[int],
457
+ typer.Option(
458
+ ...,
459
+ help="Page Segmentation Mode for the OCR engine (0-13).",
460
+ ),
461
+ ] = None,
462
+ pdf_backend: Annotated[
463
+ PdfBackend, typer.Option(..., help="The PDF backend to use.")
464
+ ] = PdfBackend.DLPARSE_V4,
465
+ pdf_password: Annotated[
466
+ Optional[str], typer.Option(..., help="Password for protected PDF documents")
467
+ ] = None,
468
+ table_mode: Annotated[
469
+ TableFormerMode,
470
+ typer.Option(..., help="The mode to use in the table structure model."),
471
+ ] = TableFormerMode.ACCURATE,
472
+ enrich_code: Annotated[
473
+ bool,
474
+ typer.Option(..., help="Enable the code enrichment model in the pipeline."),
475
+ ] = False,
476
+ enrich_formula: Annotated[
477
+ bool,
478
+ typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
479
+ ] = False,
480
+ enrich_picture_classes: Annotated[
481
+ bool,
482
+ typer.Option(
483
+ ...,
484
+ help="Enable the picture classification enrichment model in the pipeline.",
485
+ ),
486
+ ] = False,
487
+ enrich_picture_description: Annotated[
488
+ bool,
489
+ typer.Option(..., help="Enable the picture description model in the pipeline."),
490
+ ] = False,
491
+ artifacts_path: Annotated[
492
+ Optional[Path],
493
+ typer.Option(..., help="If provided, the location of the model artifacts."),
494
+ ] = None,
495
+ enable_remote_services: Annotated[
496
+ bool,
497
+ typer.Option(
498
+ ..., help="Must be enabled when using models connecting to remote services."
499
+ ),
500
+ ] = False,
501
+ allow_external_plugins: Annotated[
502
+ bool,
503
+ typer.Option(
504
+ ..., help="Must be enabled for loading modules from third-party plugins."
505
+ ),
506
+ ] = False,
507
+ show_external_plugins: Annotated[
508
+ bool,
509
+ typer.Option(
510
+ ...,
511
+ help="List the third-party plugins which are available when the option --allow-external-plugins is set.",
512
+ callback=show_external_plugins_callback,
513
+ is_eager=True,
514
+ ),
515
+ ] = False,
516
+ abort_on_error: Annotated[
517
+ bool,
518
+ typer.Option(
519
+ ...,
520
+ "--abort-on-error/--no-abort-on-error",
521
+ help="If enabled, the processing will be aborted when the first error is encountered.",
522
+ ),
523
+ ] = False,
524
+ output: Annotated[
525
+ Path, typer.Option(..., help="Output directory where results are saved.")
526
+ ] = Path("."),
527
+ verbose: Annotated[
528
+ int,
529
+ typer.Option(
530
+ "--verbose",
531
+ "-v",
532
+ count=True,
533
+ help="Set the verbosity level. -v for info logging, -vv for debug logging.",
534
+ ),
535
+ ] = 0,
536
+ debug_visualize_cells: Annotated[
537
+ bool,
538
+ typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
539
+ ] = False,
540
+ debug_visualize_ocr: Annotated[
541
+ bool,
542
+ typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
543
+ ] = False,
544
+ debug_visualize_layout: Annotated[
545
+ bool,
546
+ typer.Option(
547
+ ..., help="Enable debug output which visualizes the layour clusters"
548
+ ),
549
+ ] = False,
550
+ debug_visualize_tables: Annotated[
551
+ bool,
552
+ typer.Option(..., help="Enable debug output which visualizes the table cells"),
553
+ ] = False,
554
+ version: Annotated[
555
+ Optional[bool],
556
+ typer.Option(
557
+ "--version",
558
+ callback=version_callback,
559
+ is_eager=True,
560
+ help="Show version information.",
561
+ ),
562
+ ] = None,
563
+ document_timeout: Annotated[
564
+ Optional[float],
565
+ typer.Option(
566
+ ...,
567
+ help="The timeout for processing each document, in seconds.",
568
+ ),
569
+ ] = None,
570
+ num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
571
+ device: Annotated[
572
+ AcceleratorDevice, typer.Option(..., help="Accelerator device")
573
+ ] = AcceleratorDevice.AUTO,
574
+ docling_logo: Annotated[
575
+ Optional[bool],
576
+ typer.Option(
577
+ "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
578
+ ),
579
+ ] = None,
580
+ page_batch_size: Annotated[
581
+ int,
582
+ typer.Option(
583
+ "--page-batch-size",
584
+ help=f"Number of pages processed in one batch. Default: {settings.perf.page_batch_size}",
585
+ ),
586
+ ] = settings.perf.page_batch_size,
587
+ profiling: Annotated[
588
+ bool,
589
+ typer.Option(
590
+ ...,
591
+ help="If enabled, it summarizes profiling details for all conversion stages.",
592
+ ),
593
+ ] = False,
594
+ save_profiling: Annotated[
595
+ bool,
596
+ typer.Option(
597
+ ...,
598
+ help="If enabled, it saves the profiling summaries to json.",
599
+ ),
600
+ ] = False,
601
+ ):
602
+ log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
603
+
604
+ if verbose == 0:
605
+ logging.basicConfig(level=logging.WARNING, format=log_format)
606
+ elif verbose == 1:
607
+ logging.basicConfig(level=logging.INFO, format=log_format)
608
+ else:
609
+ logging.basicConfig(level=logging.DEBUG, format=log_format)
610
+
611
+ settings.debug.visualize_cells = debug_visualize_cells
612
+ settings.debug.visualize_layout = debug_visualize_layout
613
+ settings.debug.visualize_tables = debug_visualize_tables
614
+ settings.debug.visualize_ocr = debug_visualize_ocr
615
+ settings.perf.page_batch_size = page_batch_size
616
+
617
+ if from_formats is None:
618
+ from_formats = list(InputFormat)
619
+
620
+ parsed_headers: Optional[Dict[str, str]] = None
621
+ if headers is not None:
622
+ headers_t = TypeAdapter(Dict[str, str])
623
+ parsed_headers = headers_t.validate_json(headers)
624
+
625
+ if profiling or save_profiling:
626
+ settings.debug.profile_pipeline_timings = True
627
+
628
+ with tempfile.TemporaryDirectory() as tempdir:
629
+ input_doc_paths: List[Path] = []
630
+ for src in input_sources:
631
+ try:
632
+ # check if we can fetch some remote url
633
+ source = resolve_source_to_path(
634
+ source=src, headers=parsed_headers, workdir=Path(tempdir)
635
+ )
636
+ input_doc_paths.append(source)
637
+ except FileNotFoundError:
638
+ err_console.print(
639
+ f"[red]Error: The input file {src} does not exist.[/red]"
640
+ )
641
+ raise typer.Abort()
642
+ except IsADirectoryError:
643
+ # if the input matches to a file or a folder
644
+ try:
645
+ local_path = TypeAdapter(Path).validate_python(src)
646
+ if local_path.exists() and local_path.is_dir():
647
+ for fmt in from_formats:
648
+ for ext in FormatToExtensions[fmt]:
649
+ for path in local_path.glob(f"**/*.{ext}"):
650
+ if path.name.startswith("~$") and ext == "docx":
651
+ _log.info(
652
+ f"Ignoring temporary Word file: {path}"
653
+ )
654
+ continue
655
+ input_doc_paths.append(path)
656
+
657
+ for path in local_path.glob(f"**/*.{ext.upper()}"):
658
+ if path.name.startswith("~$") and ext == "docx":
659
+ _log.info(
660
+ f"Ignoring temporary Word file: {path}"
661
+ )
662
+ continue
663
+ input_doc_paths.append(path)
664
+ elif local_path.exists():
665
+ if not local_path.name.startswith("~$") and ext == "docx":
666
+ _log.info(f"Ignoring temporary Word file: {path}")
667
+ continue
668
+ input_doc_paths.append(local_path)
669
+ else:
670
+ err_console.print(
671
+ f"[red]Error: The input file {src} does not exist.[/red]"
672
+ )
673
+ raise typer.Abort()
674
+ except Exception as err:
675
+ err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
676
+ _log.info(err) # will print more details if verbose is activated
677
+ raise typer.Abort()
678
+
679
+ if to_formats is None:
680
+ to_formats = [OutputFormat.MARKDOWN]
681
+
682
+ export_json = OutputFormat.JSON in to_formats
683
+ export_yaml = OutputFormat.YAML in to_formats
684
+ export_html = OutputFormat.HTML in to_formats
685
+ export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
686
+ export_md = OutputFormat.MARKDOWN in to_formats
687
+ export_txt = OutputFormat.TEXT in to_formats
688
+ export_doctags = OutputFormat.DOCTAGS in to_formats
689
+
690
+ ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
691
+ ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
692
+ kind=ocr_engine,
693
+ force_full_page_ocr=force_ocr,
694
+ )
695
+
696
+ ocr_lang_list = _split_list(ocr_lang)
697
+ if ocr_lang_list is not None:
698
+ ocr_options.lang = ocr_lang_list
699
+ if psm is not None and isinstance(
700
+ ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
701
+ ):
702
+ ocr_options.psm = psm
703
+
704
+ accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
705
+
706
+ # pipeline_options: PaginatedPipelineOptions
707
+ pipeline_options: PipelineOptions
708
+
709
+ format_options: Dict[InputFormat, FormatOption] = {}
710
+ pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
711
+ password=pdf_password
712
+ )
713
+
714
+ if pipeline == ProcessingPipeline.STANDARD:
715
+ pipeline_options = PdfPipelineOptions(
716
+ allow_external_plugins=allow_external_plugins,
717
+ enable_remote_services=enable_remote_services,
718
+ accelerator_options=accelerator_options,
719
+ do_ocr=ocr,
720
+ ocr_options=ocr_options,
721
+ do_table_structure=tables,
722
+ do_code_enrichment=enrich_code,
723
+ do_formula_enrichment=enrich_formula,
724
+ do_picture_description=enrich_picture_description,
725
+ do_picture_classification=enrich_picture_classes,
726
+ document_timeout=document_timeout,
727
+ )
728
+ if isinstance(
729
+ pipeline_options.table_structure_options, TableStructureOptions
730
+ ):
731
+ pipeline_options.table_structure_options.do_cell_matching = (
732
+ True # do_cell_matching
733
+ )
734
+ pipeline_options.table_structure_options.mode = table_mode
735
+
736
+ if image_export_mode != ImageRefMode.PLACEHOLDER:
737
+ pipeline_options.generate_page_images = True
738
+ pipeline_options.generate_picture_images = (
739
+ True # FIXME: to be deprecated in version 3
740
+ )
741
+ pipeline_options.images_scale = 2
742
+
743
+ backend: Type[PdfDocumentBackend]
744
+ if pdf_backend == PdfBackend.DLPARSE_V1:
745
+ backend = DoclingParseDocumentBackend
746
+ pdf_backend_options = None
747
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
748
+ backend = DoclingParseV2DocumentBackend
749
+ pdf_backend_options = None
750
+ elif pdf_backend == PdfBackend.DLPARSE_V4:
751
+ backend = DoclingParseV4DocumentBackend # type: ignore
752
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
753
+ backend = PyPdfiumDocumentBackend # type: ignore
754
+ else:
755
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
756
+
757
+ pdf_format_option = PdfFormatOption(
758
+ pipeline_options=pipeline_options,
759
+ backend=backend, # pdf_backend
760
+ backend_options=pdf_backend_options,
761
+ )
762
+
763
+ # METS GBS options
764
+ mets_gbs_options = pipeline_options.model_copy()
765
+ mets_gbs_options.do_ocr = False
766
+ mets_gbs_format_option = PdfFormatOption(
767
+ pipeline_options=mets_gbs_options,
768
+ backend=MetsGbsDocumentBackend,
769
+ )
770
+
771
+ # SimplePipeline options
772
+ simple_format_option = ConvertPipelineOptions(
773
+ do_picture_description=enrich_picture_description,
774
+ do_picture_classification=enrich_picture_classes,
775
+ )
776
+ if artifacts_path is not None:
777
+ simple_format_option.artifacts_path = artifacts_path
778
+
779
+ # Use image-native backend for IMAGE to avoid pypdfium2 locking
780
+ image_format_option = PdfFormatOption(
781
+ pipeline_options=pipeline_options,
782
+ backend=ImageDocumentBackend,
783
+ backend_options=pdf_backend_options,
784
+ )
785
+
786
+ format_options = {
787
+ InputFormat.PDF: pdf_format_option,
788
+ InputFormat.IMAGE: image_format_option,
789
+ InputFormat.METS_GBS: mets_gbs_format_option,
790
+ InputFormat.DOCX: WordFormatOption(
791
+ pipeline_options=simple_format_option
792
+ ),
793
+ InputFormat.PPTX: PowerpointFormatOption(
794
+ pipeline_options=simple_format_option
795
+ ),
796
+ InputFormat.XLSX: ExcelFormatOption(
797
+ pipeline_options=simple_format_option
798
+ ),
799
+ InputFormat.HTML: HTMLFormatOption(
800
+ pipeline_options=simple_format_option
801
+ ),
802
+ InputFormat.MD: MarkdownFormatOption(
803
+ pipeline_options=simple_format_option
804
+ ),
805
+ }
806
+
807
+ elif pipeline == ProcessingPipeline.VLM:
808
+ pipeline_options = VlmPipelineOptions(
809
+ enable_remote_services=enable_remote_services,
810
+ )
811
+
812
+ if vlm_model == VlmModelType.GRANITE_VISION:
813
+ pipeline_options.vlm_options = (
814
+ vlm_model_specs.GRANITE_VISION_TRANSFORMERS
815
+ )
816
+ elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
817
+ pipeline_options.vlm_options = vlm_model_specs.GRANITE_VISION_OLLAMA
818
+ elif vlm_model == VlmModelType.GOT_OCR_2:
819
+ pipeline_options.vlm_options = vlm_model_specs.GOT2_TRANSFORMERS
820
+ elif vlm_model == VlmModelType.SMOLDOCLING:
821
+ pipeline_options.vlm_options = vlm_model_specs.SMOLDOCLING_TRANSFORMERS
822
+ if sys.platform == "darwin":
823
+ try:
824
+ import mlx_vlm
825
+
826
+ pipeline_options.vlm_options = vlm_model_specs.SMOLDOCLING_MLX
827
+ except ImportError:
828
+ _log.warning(
829
+ "To run SmolDocling faster, please install mlx-vlm:\n"
830
+ "pip install mlx-vlm"
831
+ )
832
+
833
+ elif vlm_model == VlmModelType.GRANITEDOCLING:
834
+ pipeline_options.vlm_options = (
835
+ vlm_model_specs.GRANITEDOCLING_TRANSFORMERS
836
+ )
837
+ if sys.platform == "darwin":
838
+ try:
839
+ import mlx_vlm
840
+
841
+ pipeline_options.vlm_options = (
842
+ vlm_model_specs.GRANITEDOCLING_MLX
843
+ )
844
+ except ImportError:
845
+ _log.warning(
846
+ "To run GraniteDocling faster, please install mlx-vlm:\n"
847
+ "pip install mlx-vlm"
848
+ )
849
+
850
+ elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
851
+ pipeline_options.vlm_options = vlm_model_specs.SMOLDOCLING_VLLM
852
+
853
+ elif vlm_model == VlmModelType.GRANITEDOCLING_VLLM:
854
+ pipeline_options.vlm_options = vlm_model_specs.GRANITEDOCLING_VLLM
855
+
856
+ elif vlm_model == VlmModelType.DEEPSEEKOCR_OLLAMA:
857
+ pipeline_options.vlm_options = vlm_model_specs.DEEPSEEKOCR_OLLAMA
858
+
859
+ pdf_format_option = PdfFormatOption(
860
+ pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
861
+ )
862
+
863
+ format_options = {
864
+ InputFormat.PDF: pdf_format_option,
865
+ InputFormat.IMAGE: pdf_format_option,
866
+ }
867
+
868
+ # Set ASR options
869
+ asr_pipeline_options = AsrPipelineOptions(
870
+ accelerator_options=AcceleratorOptions(
871
+ device=device,
872
+ num_threads=num_threads,
873
+ ),
874
+ # enable_remote_services=enable_remote_services,
875
+ # artifacts_path = artifacts_path
876
+ )
877
+
878
+ # Auto-selecting models (choose best implementation for hardware)
879
+ if asr_model == AsrModelType.WHISPER_TINY:
880
+ asr_pipeline_options.asr_options = WHISPER_TINY
881
+ elif asr_model == AsrModelType.WHISPER_SMALL:
882
+ asr_pipeline_options.asr_options = WHISPER_SMALL
883
+ elif asr_model == AsrModelType.WHISPER_MEDIUM:
884
+ asr_pipeline_options.asr_options = WHISPER_MEDIUM
885
+ elif asr_model == AsrModelType.WHISPER_BASE:
886
+ asr_pipeline_options.asr_options = WHISPER_BASE
887
+ elif asr_model == AsrModelType.WHISPER_LARGE:
888
+ asr_pipeline_options.asr_options = WHISPER_LARGE
889
+ elif asr_model == AsrModelType.WHISPER_TURBO:
890
+ asr_pipeline_options.asr_options = WHISPER_TURBO
891
+
892
+ # Explicit MLX models (force MLX implementation)
893
+ elif asr_model == AsrModelType.WHISPER_TINY_MLX:
894
+ asr_pipeline_options.asr_options = WHISPER_TINY_MLX
895
+ elif asr_model == AsrModelType.WHISPER_SMALL_MLX:
896
+ asr_pipeline_options.asr_options = WHISPER_SMALL_MLX
897
+ elif asr_model == AsrModelType.WHISPER_MEDIUM_MLX:
898
+ asr_pipeline_options.asr_options = WHISPER_MEDIUM_MLX
899
+ elif asr_model == AsrModelType.WHISPER_BASE_MLX:
900
+ asr_pipeline_options.asr_options = WHISPER_BASE_MLX
901
+ elif asr_model == AsrModelType.WHISPER_LARGE_MLX:
902
+ asr_pipeline_options.asr_options = WHISPER_LARGE_MLX
903
+ elif asr_model == AsrModelType.WHISPER_TURBO_MLX:
904
+ asr_pipeline_options.asr_options = WHISPER_TURBO_MLX
905
+
906
+ # Explicit Native models (force native implementation)
907
+ elif asr_model == AsrModelType.WHISPER_TINY_NATIVE:
908
+ asr_pipeline_options.asr_options = WHISPER_TINY_NATIVE
909
+ elif asr_model == AsrModelType.WHISPER_SMALL_NATIVE:
910
+ asr_pipeline_options.asr_options = WHISPER_SMALL_NATIVE
911
+ elif asr_model == AsrModelType.WHISPER_MEDIUM_NATIVE:
912
+ asr_pipeline_options.asr_options = WHISPER_MEDIUM_NATIVE
913
+ elif asr_model == AsrModelType.WHISPER_BASE_NATIVE:
914
+ asr_pipeline_options.asr_options = WHISPER_BASE_NATIVE
915
+ elif asr_model == AsrModelType.WHISPER_LARGE_NATIVE:
916
+ asr_pipeline_options.asr_options = WHISPER_LARGE_NATIVE
917
+ elif asr_model == AsrModelType.WHISPER_TURBO_NATIVE:
918
+ asr_pipeline_options.asr_options = WHISPER_TURBO_NATIVE
919
+
920
+ else:
921
+ _log.error(f"{asr_model} is not known")
922
+ raise ValueError(f"{asr_model} is not known")
923
+
924
+ _log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
925
+
926
+ audio_format_option = AudioFormatOption(
927
+ pipeline_cls=AsrPipeline,
928
+ pipeline_options=asr_pipeline_options,
929
+ )
930
+ format_options[InputFormat.AUDIO] = audio_format_option
931
+
932
+ # Common options for all pipelines
933
+ if artifacts_path is not None:
934
+ pipeline_options.artifacts_path = artifacts_path
935
+ asr_pipeline_options.artifacts_path = artifacts_path
936
+
937
+ doc_converter = DocumentConverter(
938
+ allowed_formats=from_formats,
939
+ format_options=format_options,
940
+ )
941
+
942
+ start_time = time.time()
943
+
944
+ _log.info(f"paths: {input_doc_paths}")
945
+ conv_results = doc_converter.convert_all(
946
+ input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
947
+ )
948
+
949
+ output.mkdir(parents=True, exist_ok=True)
950
+ export_documents(
951
+ conv_results,
952
+ output_dir=output,
953
+ export_json=export_json,
954
+ export_yaml=export_yaml,
955
+ export_html=export_html,
956
+ export_html_split_page=export_html_split_page,
957
+ show_layout=show_layout,
958
+ export_md=export_md,
959
+ export_txt=export_txt,
960
+ export_doctags=export_doctags,
961
+ print_timings=profiling,
962
+ export_timings=save_profiling,
963
+ image_export_mode=image_export_mode,
964
+ )
965
+
966
+ end_time = time.time() - start_time
967
+
968
+ _log.info(f"All documents were converted in {end_time:.2f} seconds.")
969
+
970
+
971
+ click_app = typer.main.get_command(app)  # Click command object derived from the Typer `app`; presumably exposed for Click-based tooling - TODO confirm against consumers
972
+
973
+ if __name__ == "__main__":  # true only when this module is executed as a script, not when it is imported
974
+ app()  # invoke the Typer application, i.e. run the command-line interface