docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. docling/backend/asciidoc_backend.py +39 -18
  2. docling/backend/docling_parse_backend.py +61 -59
  3. docling/backend/docling_parse_v2_backend.py +72 -62
  4. docling/backend/docling_parse_v4_backend.py +21 -19
  5. docling/backend/md_backend.py +101 -81
  6. docling/backend/mspowerpoint_backend.py +72 -113
  7. docling/backend/msword_backend.py +99 -80
  8. docling/backend/noop_backend.py +51 -0
  9. docling/backend/pypdfium2_backend.py +127 -53
  10. docling/cli/main.py +82 -14
  11. docling/datamodel/asr_model_specs.py +92 -0
  12. docling/datamodel/base_models.py +21 -4
  13. docling/datamodel/document.py +3 -1
  14. docling/datamodel/pipeline_options.py +15 -2
  15. docling/datamodel/pipeline_options_asr_model.py +57 -0
  16. docling/datamodel/pipeline_options_vlm_model.py +4 -4
  17. docling/document_converter.py +8 -0
  18. docling/models/api_vlm_model.py +3 -1
  19. docling/models/base_model.py +1 -1
  20. docling/models/base_ocr_model.py +33 -11
  21. docling/models/easyocr_model.py +1 -1
  22. docling/models/layout_model.py +2 -3
  23. docling/models/ocr_mac_model.py +1 -1
  24. docling/models/page_preprocessing_model.py +3 -6
  25. docling/models/rapid_ocr_model.py +1 -1
  26. docling/models/readingorder_model.py +3 -3
  27. docling/models/tesseract_ocr_cli_model.py +4 -3
  28. docling/models/tesseract_ocr_model.py +1 -1
  29. docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
  30. docling/models/vlm_models_inline/mlx_model.py +3 -1
  31. docling/pipeline/asr_pipeline.py +253 -0
  32. docling/pipeline/base_pipeline.py +11 -0
  33. docling/pipeline/standard_pdf_pipeline.py +0 -1
  34. docling/utils/layout_postprocessor.py +11 -6
  35. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
  36. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
  37. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
  38. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
  39. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
  40. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ import re
2
2
  import warnings
3
3
  from collections.abc import Iterable
4
4
  from pathlib import Path
5
- from typing import Optional
5
+ from typing import Literal, Optional
6
6
 
7
7
  import numpy as np
8
8
  from PIL import ImageDraw
@@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
17
17
 
18
18
  class PagePreprocessingOptions(BaseModel):
19
19
  images_scale: Optional[float]
20
- create_parsed_page: bool
21
20
 
22
21
 
23
22
  class PagePreprocessingModel(BasePageModel):
@@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
66
65
  def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
67
66
  assert page._backend is not None
68
67
 
69
- page.cells = list(page._backend.get_text_cells())
70
-
71
- if self.options.create_parsed_page:
72
- page.parsed_page = page._backend.get_segmented_page()
68
+ page.parsed_page = page._backend.get_segmented_page()
69
+ assert page.parsed_page is not None
73
70
 
74
71
  # Rate the text quality from the PDF parser, and aggregate on page
75
72
  text_scores = []
@@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
134
134
  all_ocr_cells.extend(cells)
135
135
 
136
136
  # Post-process the cells
137
- page.cells = self.post_process_cells(all_ocr_cells, page.cells)
137
+ self.post_process_cells(all_ocr_cells, page)
138
138
 
139
139
  # DEBUG code:
140
140
  if settings.debug.visualize_ocr:
@@ -124,7 +124,7 @@ class ReadingOrderModel:
124
124
  page_no = page.page_no + 1
125
125
  size = page.size
126
126
 
127
- assert size is not None
127
+ assert size is not None, "Page size is not initialized."
128
128
 
129
129
  out_doc.add_page(page_no=page_no, size=size)
130
130
 
@@ -334,12 +334,12 @@ class ReadingOrderModel:
334
334
  "Labels of merged elements must match."
335
335
  )
336
336
  prov = ProvenanceItem(
337
- page_no=element.page_no + 1,
337
+ page_no=merged_elem.page_no + 1,
338
338
  charspan=(
339
339
  len(new_item.text) + 1,
340
340
  len(new_item.text) + 1 + len(merged_elem.text),
341
341
  ),
342
- bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
342
+ bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
343
343
  )
344
344
  new_item.text += f" {merged_elem.text}"
345
345
  new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
@@ -99,12 +99,12 @@ class TesseractOcrCliModel(BaseOcrModel):
99
99
 
100
100
  return name, version
101
101
 
102
- def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
102
+ def _run_tesseract(self, ifilename: str, osd: Optional[pd.DataFrame]):
103
103
  r"""
104
104
  Run tesseract CLI
105
105
  """
106
106
  cmd = [self.options.tesseract_cmd]
107
- if self._is_auto:
107
+ if self._is_auto and osd is not None:
108
108
  lang = self._parse_language(osd)
109
109
  if lang is not None:
110
110
  cmd.append("-l")
@@ -231,6 +231,7 @@ class TesseractOcrCliModel(BaseOcrModel):
231
231
  fname = image_file.name
232
232
  high_res_image.save(image_file)
233
233
  doc_orientation = 0
234
+ df_osd: Optional[pd.DataFrame] = None
234
235
  try:
235
236
  df_osd = self._perform_osd(fname)
236
237
  doc_orientation = _parse_orientation(df_osd)
@@ -305,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
305
306
  all_ocr_cells.append(cell)
306
307
 
307
308
  # Post-process the cells
308
- page.cells = self.post_process_cells(all_ocr_cells, page.cells)
309
+ self.post_process_cells(all_ocr_cells, page)
309
310
 
310
311
  # DEBUG code:
311
312
  if settings.debug.visualize_ocr:
@@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
235
235
  all_ocr_cells.extend(cells)
236
236
 
237
237
  # Post-process the cells
238
- page.cells = self.post_process_cells(all_ocr_cells, page.cells)
238
+ self.post_process_cells(all_ocr_cells, page)
239
239
 
240
240
  # DEBUG code:
241
241
  if settings.debug.visualize_ocr:
@@ -99,6 +99,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
99
99
  self.vlm_model = model_cls.from_pretrained(
100
100
  artifacts_path,
101
101
  device_map=self.device,
102
+ torch_dtype=self.vlm_options.torch_dtype,
102
103
  _attn_implementation=(
103
104
  "flash_attention_2"
104
105
  if self.device.startswith("cuda")
@@ -122,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
122
123
  with TimeRecorder(conv_res, "vlm"):
123
124
  assert page.size is not None
124
125
 
125
- hi_res_image = page.get_image(scale=self.vlm_options.scale)
126
+ hi_res_image = page.get_image(
127
+ scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
128
+ )
126
129
 
127
130
  # Define prompt structure
128
131
  prompt = self.formulate_prompt()
@@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
73
73
  with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
74
74
  assert page.size is not None
75
75
 
76
- hi_res_image = page.get_image(scale=self.vlm_options.scale)
76
+ hi_res_image = page.get_image(
77
+ scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
78
+ )
77
79
  if hi_res_image is not None:
78
80
  im_width, im_height = hi_res_image.size
79
81
 
@@ -0,0 +1,253 @@
1
+ import logging
2
+ import os
3
+ import re
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import List, Optional, Union, cast
7
+
8
+ from docling_core.types.doc import DoclingDocument, DocumentOrigin
9
+
10
+ # import whisper # type: ignore
11
+ # import librosa
12
+ # import numpy as np
13
+ # import soundfile as sf # type: ignore
14
+ from docling_core.types.doc.labels import DocItemLabel
15
+ from pydantic import BaseModel, Field, validator
16
+
17
+ from docling.backend.abstract_backend import AbstractDocumentBackend
18
+ from docling.backend.noop_backend import NoOpBackend
19
+
20
+ # from pydub import AudioSegment # type: ignore
21
+ # from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
22
+ from docling.datamodel.accelerator_options import (
23
+ AcceleratorOptions,
24
+ )
25
+ from docling.datamodel.base_models import (
26
+ ConversionStatus,
27
+ FormatToMimeType,
28
+ )
29
+ from docling.datamodel.document import ConversionResult, InputDocument
30
+ from docling.datamodel.pipeline_options import (
31
+ AsrPipelineOptions,
32
+ )
33
+ from docling.datamodel.pipeline_options_asr_model import (
34
+ InlineAsrNativeWhisperOptions,
35
+ # AsrResponseFormat,
36
+ InlineAsrOptions,
37
+ )
38
+ from docling.datamodel.pipeline_options_vlm_model import (
39
+ InferenceFramework,
40
+ )
41
+ from docling.datamodel.settings import settings
42
+ from docling.pipeline.base_pipeline import BasePipeline
43
+ from docling.utils.accelerator_utils import decide_device
44
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
45
+
46
+ _log = logging.getLogger(__name__)
47
+
48
+
49
class _ConversationWord(BaseModel):
    """A single transcribed word together with its optional time span."""

    # The transcribed token text.
    text: str
    # Both timestamps default to None; only `end_time` carries the `ge=0` bound.
    start_time: Optional[float] = Field(
        default=None,
        description="Start time in seconds from video start",
    )
    end_time: Optional[float] = Field(
        default=None,
        ge=0,
        description="End time in seconds from video start",
    )
57
+
58
+
59
class _ConversationItem(BaseModel):
    """One transcribed segment of a conversation.

    Ordering (``<``) and equality (``==``) look at ``start_time`` only, so a
    list of items can be sorted chronologically. Because ``__eq__`` is
    overridden without ``__hash__``, instances are not hashable.
    """

    text: str
    start_time: Optional[float] = Field(
        None, description="Start time in seconds from video start"
    )
    end_time: Optional[float] = Field(
        None, ge=0, description="End time in seconds from video start"
    )
    speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
    speaker: Optional[str] = Field(
        None, description="Speaker name, defaults to speaker-{speaker_id}"
    )
    words: Optional[list[_ConversationWord]] = Field(
        None, description="Individual words with time-stamps"
    )

    def __lt__(self, other):
        """Return True when this item starts before ``other``.

        ``start_time`` is optional, and ``None < float`` raises ``TypeError``.
        Items without a start time are therefore ordered before every timed
        item, and two untimed items are not less than each other.
        """
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        if self.start_time is None:
            return other.start_time is not None
        if other.start_time is None:
            return False
        return self.start_time < other.start_time

    def __eq__(self, other):
        """Return True when both items have the same ``start_time``."""
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        return self.start_time == other.start_time

    def to_string(self) -> str:
        """Format the conversation entry as a string.

        The time prefix is emitted only when both timestamps are set, and the
        speaker prefix only when a speaker name is set; the text always
        follows.
        """
        result = ""
        if (self.start_time is not None) and (self.end_time is not None):
            result += f"[time: {self.start_time}-{self.end_time}] "

        if self.speaker is not None:
            result += f"[speaker:{self.speaker}] "

        result += self.text
        return result
96
+
97
+
98
class _NativeWhisperModel:
    """Transcribes audio files with a model loaded through the `whisper` package."""

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        asr_options: InlineAsrNativeWhisperOptions,
    ):
        """
        Transcriber using native Whisper.

        Args:
            enabled: When true, `whisper` is imported and the model is loaded.
            artifacts_path: Optional folder handed to Whisper as `download_root`.
            accelerator_options: Supplies the requested device.
            asr_options: Model name (`repo_id`), supported devices, decoding
                and timestamp settings.

        Raises:
            ImportError: If `enabled` is true and `whisper` cannot be imported.
        """
        self.enabled = enabled

        _log.info(f"artifacts-path: {artifacts_path}")
        _log.info(f"accelerator_options: {accelerator_options}")

        if self.enabled:
            try:
                import whisper  # type: ignore
            except ImportError as exc:
                # Chain the original error so the real import failure stays visible.
                raise ImportError(
                    "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
                ) from exc
            self.asr_options = asr_options
            self.max_tokens = asr_options.max_new_tokens
            self.temperature = asr_options.temperature

            self.device = decide_device(
                accelerator_options.device,
                supported_devices=asr_options.supported_devices,
            )
            _log.info(f"Available device for Whisper: {self.device}")

            self.model_name = asr_options.repo_id
            _log.info(f"loading _NativeWhisperModel({self.model_name})")
            if artifacts_path is not None:
                _log.info(f"loading {self.model_name} from {artifacts_path}")
                self.model = whisper.load_model(
                    name=self.model_name,
                    device=self.device,
                    download_root=str(artifacts_path),
                )
            else:
                self.model = whisper.load_model(
                    name=self.model_name, device=self.device
                )

        # Transcription settings read by `transcribe`; kept available
        # regardless of `enabled`.
        self.verbose = asr_options.verbose
        self.timestamps = asr_options.timestamps
        self.word_timestamps = asr_options.word_timestamps

    def run(self, conv_res: ConversionResult) -> ConversionResult:
        """Transcribe the input file and store the result on `conv_res`.

        On success a new `DoclingDocument` with one TEXT item per transcribed
        segment is attached and the status is set to SUCCESS. Any exception is
        logged and turned into a FAILURE status instead of propagating.
        """
        audio_path: Path = Path(conv_res.input.file).resolve()

        try:
            conversation = self.transcribe(audio_path)

            # Ensure we have a proper DoclingDocument
            origin = DocumentOrigin(
                filename=conv_res.input.file.name or "audio.wav",
                # NOTE(review): the mimetype is fixed to WAV whatever the
                # input file's extension is — confirm this is intended.
                mimetype="audio/x-wav",
                binary_hash=conv_res.input.document_hash,
            )
            conv_res.document = DoclingDocument(
                name=conv_res.input.file.stem or "audio.wav", origin=origin
            )

            for citem in conversation:
                conv_res.document.add_text(
                    label=DocItemLabel.TEXT, text=citem.to_string()
                )

            conv_res.status = ConversionStatus.SUCCESS
            return conv_res

        except Exception as exc:
            # `exception` logs at ERROR level and keeps the traceback.
            _log.exception(f"Audio transcription has an error: {exc}")

        conv_res.status = ConversionStatus.FAILURE
        return conv_res

    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
        """Run the model on `fpath` and return one item per result segment.

        Word-level entries are filled in only when word timestamps are enabled
        and the segment carries a "words" entry; otherwise `words` stays an
        empty list.
        """
        result = self.model.transcribe(
            str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
        )

        convo: list[_ConversationItem] = []
        for segment in result["segments"]:
            item = _ConversationItem(
                start_time=segment["start"],
                end_time=segment["end"],
                text=segment["text"],
                words=[],
            )
            if "words" in segment and self.word_timestamps:
                item.words = [
                    _ConversationWord(
                        start_time=word["start"],
                        end_time=word["end"],
                        text=word["word"],
                    )
                    for word in segment["words"]
                ]
            convo.append(item)

        return convo
202
+
203
+
204
class AsrPipeline(BasePipeline):
    """Pipeline that builds a document from audio input through an ASR model."""

    def __init__(self, pipeline_options: AsrPipelineOptions):
        """Resolve the artifacts folder and create the ASR model.

        Args:
            pipeline_options: ASR pipeline configuration; its `asr_options`
                decide which model wrapper is created.

        Raises:
            RuntimeError: If an artifacts path is configured but is not an
                existing directory.
        """
        super().__init__(pipeline_options)
        self.keep_backend = True

        self.pipeline_options: AsrPipelineOptions = pipeline_options

        # Pipeline-level setting wins over the global settings value.
        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

        # Stays None when the configured options have no matching model, so
        # `_build_document` can report that clearly instead of hitting a
        # missing attribute.
        self._model: Optional[_NativeWhisperModel] = None
        if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
            asr_options: InlineAsrNativeWhisperOptions = (
                self.pipeline_options.asr_options
            )
            self._model = _NativeWhisperModel(
                enabled=True,  # must be always enabled for this pipeline to make sense.
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                asr_options=asr_options,
            )
        else:
            _log.error(f"No model support for {self.pipeline_options.asr_options}")

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        """Return the final status for `conv_res`.

        `_NativeWhisperModel.run` marks the result as FAILURE when
        transcription raises, so that outcome is reported here rather than an
        unconditional SUCCESS.
        """
        # NOTE(review): presumably the base pipeline assigns this return value
        # to the result's status after the build step — confirm against
        # BasePipeline.
        if conv_res.status == ConversionStatus.FAILURE:
            return ConversionStatus.FAILURE
        return ConversionStatus.SUCCESS

    @classmethod
    def get_default_options(cls) -> AsrPipelineOptions:
        """Return a default-constructed `AsrPipelineOptions`."""
        return AsrPipelineOptions()

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Run the ASR model on the input and return the updated result.

        Raises:
            RuntimeError: If no model was created for the configured
                `asr_options`.
        """
        _log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
        if self._model is None:
            raise RuntimeError(
                f"No model support for {self.pipeline_options.asr_options}"
            )
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            self._model.run(conv_res=conv_res)

        return conv_res

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return True only for `NoOpBackend` instances."""
        return isinstance(backend, NoOpBackend)
@@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
193
193
  )
194
194
  raise e
195
195
 
196
+ # Filter out uninitialized pages (those with size=None) that may remain
197
+ # after timeout or processing failures to prevent assertion errors downstream
198
+ initial_page_count = len(conv_res.pages)
199
+ conv_res.pages = [page for page in conv_res.pages if page.size is not None]
200
+
201
+ if len(conv_res.pages) < initial_page_count:
202
+ _log.info(
203
+ f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
204
+ f"due to timeout or processing failures"
205
+ )
206
+
196
207
  return conv_res
197
208
 
198
209
  def _unload(self, conv_res: ConversionResult) -> ConversionResult:
@@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
72
72
  PagePreprocessingModel(
73
73
  options=PagePreprocessingOptions(
74
74
  images_scale=pipeline_options.images_scale,
75
- create_parsed_page=pipeline_options.generate_parsed_pages,
76
75
  )
77
76
  ),
78
77
  # OCR
@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
8
8
  from docling_core.types.doc.page import TextCell
9
9
  from rtree import index
10
10
 
11
- from docling.datamodel.base_models import BoundingBox, Cluster
11
+ from docling.datamodel.base_models import BoundingBox, Cluster, Page
12
12
 
13
13
  _log = logging.getLogger(__name__)
14
14
 
@@ -194,11 +194,11 @@ class LayoutPostprocessor:
194
194
  DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
195
195
  }
196
196
 
197
- def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
198
- """Initialize processor with cells and clusters."""
199
- """Initialize processor with cells and spatial indices."""
200
- self.cells = cells
201
- self.page_size = page_size
197
+ def __init__(self, page: Page, clusters: List[Cluster]) -> None:
198
+ """Initialize processor with page and clusters."""
199
+ self.cells = page.cells
200
+ self.page = page
201
+ self.page_size = page.size
202
202
  self.all_clusters = clusters
203
203
  self.regular_clusters = [
204
204
  c for c in clusters if c.label not in self.SPECIAL_TYPES
@@ -240,6 +240,10 @@ class LayoutPostprocessor:
240
240
  for child in cluster.children:
241
241
  child.cells = self._sort_cells(child.cells)
242
242
 
243
+ assert self.page.parsed_page is not None
244
+ self.page.parsed_page.textline_cells = self.cells
245
+ self.page.parsed_page.has_lines = len(self.cells) > 0
246
+
243
247
  return final_clusters, self.cells
244
248
 
245
249
  def _process_regular_clusters(self) -> List[Cluster]:
@@ -301,6 +305,7 @@ class LayoutPostprocessor:
301
305
  special_clusters = self._handle_cross_type_overlaps(special_clusters)
302
306
 
303
307
  # Calculate page area from known page size
308
+ assert self.page_size is not None
304
309
  page_area = self.page_size.width * self.page_size.height
305
310
  if page_area > 0:
306
311
  # Filter out full-page pictures
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.36.1
3
+ Version: 2.38.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -61,6 +61,8 @@ Requires-Dist: mlx-vlm>=0.1.22; (python_version >= "3.10" and sys_platform == "d
61
61
  Provides-Extra: rapidocr
62
62
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
63
63
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
64
+ Provides-Extra: asr
65
+ Requires-Dist: openai-whisper>=20240930; extra == "asr"
64
66
  Dynamic: license-file
65
67
 
66
68
  <p align="center">
@@ -93,14 +95,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
93
95
 
94
96
  ## Features
95
97
 
96
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
98
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
97
99
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
98
100
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
99
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
101
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
100
102
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
101
103
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
102
104
  * 🔍 Extensive OCR support for scanned PDFs and images
103
- * 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
105
+ * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
106
+ * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
104
107
  * 💻 Simple and convenient CLI
105
108
 
106
109
  ### Coming soon
@@ -1,21 +1,22 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/document_converter.py,sha256=bnUA9k1LCuCfNwCsneGQiGCvFdnX8W-vbpnu6U_fuuI,14003
2
+ docling/document_converter.py,sha256=3jWywP_TLy-1PMvjJBUlnTM9FNzpBLRCHYA1RKFvGR4,14333
3
3
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
4
4
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
5
5
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
7
- docling/backend/asciidoc_backend.py,sha256=W-4MRcID6AU9Ax23q8FwDwGG-OOCrBoqcNf2Ch_WPUc,14041
7
+ docling/backend/asciidoc_backend.py,sha256=RDNLrPJHxROiM7-NQdZn3DdvAyiPAndbSWcZo9PbCKU,14417
8
8
  docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
9
- docling/backend/docling_parse_backend.py,sha256=bVSPmmiVXdCVfe-eLtDhbPQKBjkFR8rZJoRxdWIMdYU,7998
10
- docling/backend/docling_parse_v2_backend.py,sha256=R4YPCEs72GYg-Xc9VfizPv8QjtGmKOsQzVPNAU2RIK0,9376
11
- docling/backend/docling_parse_v4_backend.py,sha256=aWh-fd-lnuRGVGC_DG17QUptIsArv5V1gJo8QFbB5Ys,6263
9
+ docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
10
+ docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
11
+ docling/backend/docling_parse_v4_backend.py,sha256=7tQvpCwpYoq98PNszDkrXaFhy5eWmQqMP4RjWWPLPgw,6197
12
12
  docling/backend/html_backend.py,sha256=3K-l5SUAAyqISNEb7nPst_I51xzYOVOkgmwXh3lv9sw,21063
13
- docling/backend/md_backend.py,sha256=JkY1qTvQFXjKSZGfD-83d-fZelorUG_l6mpJdYGqvX8,17210
13
+ docling/backend/md_backend.py,sha256=ghIU_NSaENKrRu49Dn5GvjYtcAgEU7ZHbf-TeYg49nY,17673
14
14
  docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
15
- docling/backend/mspowerpoint_backend.py,sha256=RwqfvvzrtM56L9uf7PR9lvlHJ-LyYGpkS1iVxkTl72Q,17203
16
- docling/backend/msword_backend.py,sha256=iB2yRg8hXtET2-Wjkv5pq0p9Y1SGQYIVCcWtOtXUILU,44621
15
+ docling/backend/mspowerpoint_backend.py,sha256=0lsb8ZeQFxbDt7jZpSQyk5wYHYa3SP2T2y2dMI-o30o,15216
16
+ docling/backend/msword_backend.py,sha256=C4qs4mQEt1JzonCg5v6_yUxdngzcTzSO9k1ik8_DW5Q,44855
17
+ docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
17
18
  docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
18
- docling/backend/pypdfium2_backend.py,sha256=fUGRBupwTYftEgdIDWKphA2zdfb-SrUoUGENK6j-q-0,11002
19
+ docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
19
20
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
21
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
22
  docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
@@ -27,36 +28,38 @@ docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e
27
28
  docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
28
29
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
29
30
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- docling/cli/main.py,sha256=fDGjepShl6KO_BdA6qUNyNBoCjqZUKRnmmkzesGtvVU,27202
31
+ docling/cli/main.py,sha256=D2gEoArnQ2yQ9BesH9CkxZbYQyhZRGgjjNWYqmRRUtU,29617
31
32
  docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
32
33
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
33
34
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
35
  docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
35
- docling/datamodel/base_models.py,sha256=bkooSG4brZy2jt2dndkin3DHvfZ5HFp0C94yBGmCWeI,10568
36
- docling/datamodel/document.py,sha256=vPwiVU5zWCKbVYMq-TSmb7LTjijrqJq0FyAgDBa0XGA,16154
37
- docling/datamodel/pipeline_options.py,sha256=iMuwsa77hkAgjJWXBRAFEQGw9tGNMDQrPnSvE5mirNs,9081
38
- docling/datamodel/pipeline_options_vlm_model.py,sha256=-ZPAp2uSKMatDbjZPv9chT587B1aftfDVmi_FDb2aw8,1997
36
+ docling/datamodel/asr_model_specs.py,sha256=L7ETXsUKVbPsVcPLhEIMxQjd4UzMGZBVsy74CLsZBkU,2181
37
+ docling/datamodel/base_models.py,sha256=L35qXLmADZQNEzBC0M6K2xrfLyqrTqDlbPD6E6DkWMc,11146
38
+ docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
39
+ docling/datamodel/pipeline_options.py,sha256=N7my7hmvuX6EzlujHeF6RObPSrG_HjN_nfPzILTqP-E,9479
40
+ docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
41
+ docling/datamodel/pipeline_options_vlm_model.py,sha256=rtDMVtKFZbgQD269w8FvHMXEhdRBrsA4rVYk6A-M-b4,2063
39
42
  docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
40
43
  docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
41
44
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- docling/models/api_vlm_model.py,sha256=w3P1wOsr3JvZsawbK1Z4uwnD5ehUMbcKGkyhcX83Okc,2738
43
- docling/models/base_model.py,sha256=Zx_nByGYkubTvvYiQxwiB6P8lc7wOD4ZTC2QIw6vCEg,2950
44
- docling/models/base_ocr_model.py,sha256=c6a2QzZnAMfQECQDz1JASecl_Z2F3i6P3ax6kHWcz6o,7221
45
+ docling/models/api_vlm_model.py,sha256=GDDJGAia4SJjK7JFxsZy5oEU-D8yQo8Kb3NvvPbTvT0,2820
46
+ docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
47
+ docling/models/base_ocr_model.py,sha256=HtrefTq9Zy4UnUInMchPv0tbobiA7CQU5VUauKJD7IU,8006
45
48
  docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
46
49
  docling/models/document_picture_classifier.py,sha256=fkJLV7pMy3v6iNwOzVb6zdBU1dGtBM1ARHLIRPfoAG4,6124
47
- docling/models/easyocr_model.py,sha256=bTK-AQYc-WTzX8SRoMRwVjqlMigaJKGloaLUcH6RCKU,7406
48
- docling/models/layout_model.py,sha256=KdGhS4EMWKP6BwlhUJ0mdbhk2Fc78qwzqEZbTxyrbFM,8508
49
- docling/models/ocr_mac_model.py,sha256=CJOwz9h84crvZd3kQMLxYntpXz-1w2eLDjhGUnGIwMQ,5415
50
+ docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
51
+ docling/models/layout_model.py,sha256=EJuRXW0rFdnNPS5AifdEsr812EATUqAioeMCVjw8PL0,8460
52
+ docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
50
53
  docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
51
- docling/models/page_preprocessing_model.py,sha256=8cdhR9n3zcC8JxDen8WdPBx_GNk_5VICeHJo1-kP518,5186
54
+ docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
52
55
  docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
53
56
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
54
57
  docling/models/picture_description_vlm_model.py,sha256=7LeCx9ZdPxsmWJ468OtxCdAkH48A1HD0iwH9cs_7-1Q,3800
55
- docling/models/rapid_ocr_model.py,sha256=miTPn1YTWKtXUuddiVv0SjgkuNWHXCW3CZ6epDUmKjI,5935
56
- docling/models/readingorder_model.py,sha256=S9ru2ApY9sE-Uue3hptWHmbmElwo36bUbAikxCFpHYs,14574
58
+ docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
59
+ docling/models/readingorder_model.py,sha256=QHb5fyiqmxU8lg4W5IzdukqHPh6V7rNw_57O4-z-Az4,14615
57
60
  docling/models/table_structure_model.py,sha256=dQf6u_zn5fHCkHzmTwYfCbRtZCBddsyAM0WNVBUUQzk,12473
58
- docling/models/tesseract_ocr_cli_model.py,sha256=oQZVWXQ6wRrFonRFwbWeW9nJ9FLQZdzSWErOp0mEff0,12698
59
- docling/models/tesseract_ocr_model.py,sha256=AjrZNwgVbV0IbzBJwI35YP0KxvqWJWJE0v_lgHJiQrk,10606
61
+ docling/models/tesseract_ocr_cli_model.py,sha256=qcM3-n7Z_dm1CGBhVUcNr2XT41iXnU32zk4RqKHBl9I,12775
62
+ docling/models/tesseract_ocr_model.py,sha256=9DPAE7XP7smej7HYhr7mdwpuxSjAcv_GPrYZG3bb1RA,10587
60
63
  docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
61
64
  docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
62
65
  docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
@@ -66,19 +69,20 @@ docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurH
66
69
  docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
70
  docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
68
71
  docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
- docling/models/vlm_models_inline/hf_transformers_model.py,sha256=SXSu6spu8zNCsrD32RU_irLs59ltF6PqbLVfpjDujmE,7285
70
- docling/models/vlm_models_inline/mlx_model.py,sha256=CFe1UNxQufZd5K4iaOW3HsplQBPb_1cENf3KIwWUSWw,5702
72
+ docling/models/vlm_models_inline/hf_transformers_model.py,sha256=w9_N4ccjmYYK5yYQou0LSMGaj6gs8l0hULvXbkfYXSQ,7425
73
+ docling/models/vlm_models_inline/mlx_model.py,sha256=qpyi6fGHm0vPqW2yeTsRBKOTTshNJ1LAPbH1SBDp8Y8,5784
71
74
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
- docling/pipeline/base_pipeline.py,sha256=DnuxAf7EQusdSRae0QUVth-0f2mSff8JZjX-2vazk00,8751
75
+ docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
76
+ docling/pipeline/base_pipeline.py,sha256=14yQrDjsojl4RgbBjKFSEfVBYR_sULZfBI1uDzFLi8Y,9331
73
77
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
74
- docling/pipeline/standard_pdf_pipeline.py,sha256=itCZPj7nMFAQtAlStfmWthpCIHZFUm9W5uTgvVi6PkQ,12738
78
+ docling/pipeline/standard_pdf_pipeline.py,sha256=2Hqg2wnAXfbZbLUOQrRus8PMEuZ549jR1mfR86-CAB4,12659
75
79
  docling/pipeline/vlm_pipeline.py,sha256=IrjDbajCPmUPep_jATKNiABST4tQ8mvpkQz9mtBQ8qQ,15279
76
80
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
81
  docling/utils/accelerator_utils.py,sha256=Fww4UiTiuIB91iuPgUZTy-DYpCGRMI8YuCYKhFb0gjA,2905
78
82
  docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
79
83
  docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
80
84
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
81
- docling/utils/layout_postprocessor.py,sha256=3WCmkPsPJ80xfWzAUeWb5L9BmuwJ79ztctvbbUs8AfI,24068
85
+ docling/utils/layout_postprocessor.py,sha256=laTPGGj-hv16Zh1TRcn8NK0POKs7d3jeaV1pRR_TjIU,24228
82
86
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
83
87
  docling/utils/model_downloader.py,sha256=6TDxFOvMRYT8JyYyaQS_wXMJzNga61ImY3sFdks66qM,4004
84
88
  docling/utils/ocr_utils.py,sha256=AOaDAHr5S74d-IRVR_LKhKynUTIurAwLJ3wNeY58gPA,2326
@@ -86,9 +90,9 @@ docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,
86
90
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
87
91
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
88
92
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
89
- docling-2.36.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
90
- docling-2.36.1.dist-info/METADATA,sha256=0Sl0LfCopUXdEd6mm2kqRpMgFoq6nvZBUXlIKeIqY_E,10036
91
- docling-2.36.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
92
- docling-2.36.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
93
- docling-2.36.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
94
- docling-2.36.1.dist-info/RECORD,,
93
+ docling-2.38.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
94
+ docling-2.38.0.dist-info/METADATA,sha256=vT8Zko4wD8iyKUjLAJ83Cm7ntscjEk5ojHvcJXlvT5A,10273
95
+ docling-2.38.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
+ docling-2.38.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
97
+ docling-2.38.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
98
+ docling-2.38.0.dist-info/RECORD,,