docling 2.57.0__py3-none-any.whl → 2.58.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

@@ -4,7 +4,7 @@ import re
4
4
  import tempfile
5
5
  from io import BytesIO
6
6
  from pathlib import Path
7
- from typing import List, Optional, Union, cast
7
+ from typing import TYPE_CHECKING, List, Optional, Union, cast
8
8
 
9
9
  from docling_core.types.doc import DoclingDocument, DocumentOrigin
10
10
 
@@ -32,6 +32,7 @@ from docling.datamodel.pipeline_options import (
32
32
  AsrPipelineOptions,
33
33
  )
34
34
  from docling.datamodel.pipeline_options_asr_model import (
35
+ InlineAsrMlxWhisperOptions,
35
36
  InlineAsrNativeWhisperOptions,
36
37
  # AsrResponseFormat,
37
38
  InlineAsrOptions,
@@ -228,22 +229,157 @@ class _NativeWhisperModel:
228
229
  return convo
229
230
 
230
231
 
232
class _MlxWhisperModel:
    """Transcriber using MLX Whisper for Apple Silicon optimization.

    The ``mlx_whisper`` package is imported lazily in ``__init__`` and only
    when the model is enabled, so the module can be imported on systems where
    the package is not installed.
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        asr_options: InlineAsrMlxWhisperOptions,
    ):
        """
        Transcriber using MLX Whisper for Apple Silicon optimization.

        Args:
            enabled: When False, nothing is imported and none of the
                MLX-specific attributes below are set.
            artifacts_path: Only logged here; it is not used to locate the
                model.
            accelerator_options: Its ``device`` is handed to ``decide_device``.
            asr_options: Source of the repo id and the transcription settings.

        Raises:
            ImportError: If the model is enabled but ``mlx_whisper`` cannot be
                imported.
        """
        self.enabled = enabled

        _log.info(f"artifacts-path: {artifacts_path}")
        _log.info(f"accelerator_options: {accelerator_options}")

        if self.enabled:
            try:
                import mlx_whisper  # type: ignore
            except ImportError as exc:
                # Chain the original error so the real import failure (e.g. a
                # broken transitive dependency) stays visible in the traceback.
                raise ImportError(
                    "mlx-whisper is not installed. Please install it via `pip install mlx-whisper` or do `uv sync --extra asr`."
                ) from exc
            self.asr_options = asr_options
            self.mlx_whisper = mlx_whisper

            # The chosen device is recorded and logged; within this class it is
            # not passed on to the transcribe call.
            self.device = decide_device(
                accelerator_options.device,
                supported_devices=asr_options.supported_devices,
            )
            _log.info(f"Available device for MLX Whisper: {self.device}")

            self.model_name = asr_options.repo_id
            _log.info(f"loading _MlxWhisperModel({self.model_name})")

            # MLX Whisper models are loaded differently - they use HuggingFace repos
            self.model_path = self.model_name

            # Store MLX-specific options
            self.language = asr_options.language
            self.task = asr_options.task
            self.word_timestamps = asr_options.word_timestamps
            self.no_speech_threshold = asr_options.no_speech_threshold
            self.logprob_threshold = asr_options.logprob_threshold
            self.compression_ratio_threshold = asr_options.compression_ratio_threshold

    def run(self, conv_res: ConversionResult) -> ConversionResult:
        """Transcribe the input file of ``conv_res`` and attach the result.

        On success ``conv_res.document`` is replaced by a new
        ``DoclingDocument`` holding one text item per conversation item and the
        status is set to ``SUCCESS``. Any exception raised while transcribing
        or building the document is logged and reported as ``FAILURE``; it is
        not re-raised.

        Args:
            conv_res: Conversion result whose ``input.file`` is transcribed.

        Returns:
            The same ``conv_res`` object, updated in place.
        """
        audio_path: Path = Path(conv_res.input.file).resolve()

        try:
            conversation = self.transcribe(audio_path)

            # Ensure we have a proper DoclingDocument
            origin = DocumentOrigin(
                filename=conv_res.input.file.name or "audio.wav",
                mimetype="audio/x-wav",
                binary_hash=conv_res.input.document_hash,
            )
            conv_res.document = DoclingDocument(
                name=conv_res.input.file.stem or "audio.wav", origin=origin
            )

            for citem in conversation:
                conv_res.document.add_text(
                    label=DocItemLabel.TEXT, text=citem.to_string()
                )

            conv_res.status = ConversionStatus.SUCCESS
            return conv_res

        except Exception as exc:
            # Deliberate best-effort boundary: report failure through the
            # status instead of raising, but keep the traceback in the log.
            _log.exception(f"MLX Audio transcription has an error: {exc}")

        conv_res.status = ConversionStatus.FAILURE
        return conv_res

    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
        """
        Transcribe audio using MLX Whisper.

        Args:
            fpath: Path to audio file

        Returns:
            List of conversation items with timestamps. Word-level entries are
            filled in only when ``word_timestamps`` is enabled and the segment
            carries a ``"words"`` key.
        """
        result = self.mlx_whisper.transcribe(
            str(fpath),
            path_or_hf_repo=self.model_path,
            language=self.language,
            task=self.task,
            word_timestamps=self.word_timestamps,
            no_speech_threshold=self.no_speech_threshold,
            logprob_threshold=self.logprob_threshold,
            compression_ratio_threshold=self.compression_ratio_threshold,
        )

        convo: list[_ConversationItem] = []

        # MLX Whisper returns segments similar to native Whisper
        for segment in result.get("segments", []):
            item = _ConversationItem(
                start_time=segment.get("start"),
                end_time=segment.get("end"),
                text=segment.get("text", "").strip(),
                words=[],
            )

            # Add word-level timestamps if available
            if self.word_timestamps and "words" in segment:
                item.words = [
                    _ConversationWord(
                        start_time=word_data.get("start"),
                        end_time=word_data.get("end"),
                        text=word_data.get("word", ""),
                    )
                    for word_data in segment["words"]
                ]
            convo.append(item)

        return convo
354
+
355
+
231
356
  class AsrPipeline(BasePipeline):
232
357
  def __init__(self, pipeline_options: AsrPipelineOptions):
233
358
  super().__init__(pipeline_options)
234
359
  self.keep_backend = True
235
360
 
236
361
  self.pipeline_options: AsrPipelineOptions = pipeline_options
362
+ self._model: Union[_NativeWhisperModel, _MlxWhisperModel]
237
363
 
238
364
  if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
239
- asr_options: InlineAsrNativeWhisperOptions = (
365
+ native_asr_options: InlineAsrNativeWhisperOptions = (
240
366
  self.pipeline_options.asr_options
241
367
  )
242
368
  self._model = _NativeWhisperModel(
243
369
  enabled=True, # must be always enabled for this pipeline to make sense.
244
370
  artifacts_path=self.artifacts_path,
245
371
  accelerator_options=pipeline_options.accelerator_options,
246
- asr_options=asr_options,
372
+ asr_options=native_asr_options,
373
+ )
374
+ elif isinstance(self.pipeline_options.asr_options, InlineAsrMlxWhisperOptions):
375
+ mlx_asr_options: InlineAsrMlxWhisperOptions = (
376
+ self.pipeline_options.asr_options
377
+ )
378
+ self._model = _MlxWhisperModel(
379
+ enabled=True, # must be always enabled for this pipeline to make sense.
380
+ artifacts_path=self.artifacts_path,
381
+ accelerator_options=pipeline_options.accelerator_options,
382
+ asr_options=mlx_asr_options,
247
383
  )
248
384
  else:
249
385
  _log.error(f"No model support for {self.pipeline_options.asr_options}")
@@ -2,7 +2,7 @@ import base64
2
2
  import json
3
3
  import logging
4
4
  from io import BytesIO
5
- from typing import Dict, List, Optional
5
+ from typing import Optional
6
6
 
7
7
  import requests
8
8
  from PIL import Image
@@ -19,7 +19,7 @@ def api_image_request(
19
19
  prompt: str,
20
20
  url: AnyUrl,
21
21
  timeout: float = 20,
22
- headers: Optional[Dict[str, str]] = None,
22
+ headers: Optional[dict[str, str]] = None,
23
23
  **params,
24
24
  ) -> str:
25
25
  img_io = BytesIO()
@@ -69,8 +69,8 @@ def api_image_request_streaming(
69
69
  url: AnyUrl,
70
70
  *,
71
71
  timeout: float = 20,
72
- headers: Optional[Dict[str, str]] = None,
73
- generation_stoppers: List[GenerationStopper] = [],
72
+ headers: Optional[dict[str, str]] = None,
73
+ generation_stoppers: list[GenerationStopper] = [],
74
74
  **params,
75
75
  ) -> str:
76
76
  """
@@ -2,7 +2,6 @@ import bisect
2
2
  import logging
3
3
  import sys
4
4
  from collections import defaultdict
5
- from typing import Dict, List, Set, Tuple
6
5
 
7
6
  from docling_core.types.doc import DocItemLabel, Size
8
7
  from docling_core.types.doc.page import TextCell
@@ -39,7 +38,7 @@ class UnionFind:
39
38
  self.parent[root_y] = root_x
40
39
  self.rank[root_x] += 1
41
40
 
42
- def get_groups(self) -> Dict[int, List[int]]:
41
+ def get_groups(self) -> dict[int, list[int]]:
43
42
  """Returns groups as {root: [elements]}."""
44
43
  groups = defaultdict(list)
45
44
  for elem in self.parent:
@@ -50,13 +49,13 @@ class UnionFind:
50
49
  class SpatialClusterIndex:
51
50
  """Efficient spatial indexing for clusters using R-tree and interval trees."""
52
51
 
53
- def __init__(self, clusters: List[Cluster]):
52
+ def __init__(self, clusters: list[Cluster]):
54
53
  p = index.Property()
55
54
  p.dimension = 2
56
55
  self.spatial_index = index.Index(properties=p)
57
56
  self.x_intervals = IntervalTree()
58
57
  self.y_intervals = IntervalTree()
59
- self.clusters_by_id: Dict[int, Cluster] = {}
58
+ self.clusters_by_id: dict[int, Cluster] = {}
60
59
 
61
60
  for cluster in clusters:
62
61
  self.add_cluster(cluster)
@@ -72,7 +71,7 @@ class SpatialClusterIndex:
72
71
  self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
73
72
  del self.clusters_by_id[cluster.id]
74
73
 
75
- def find_candidates(self, bbox: BoundingBox) -> Set[int]:
74
+ def find_candidates(self, bbox: BoundingBox) -> set[int]:
76
75
  """Find potential overlapping cluster IDs using all indexes."""
77
76
  spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
78
77
  x_candidates = self.x_intervals.find_containing(
@@ -123,13 +122,13 @@ class IntervalTree:
123
122
  """Memory-efficient interval tree for 1D overlap queries."""
124
123
 
125
124
  def __init__(self):
126
- self.intervals: List[Interval] = [] # Sorted by min_val
125
+ self.intervals: list[Interval] = [] # Sorted by min_val
127
126
 
128
127
  def insert(self, min_val: float, max_val: float, id: int):
129
128
  interval = Interval(min_val, max_val, id)
130
129
  bisect.insort(self.intervals, interval)
131
130
 
132
- def find_containing(self, point: float) -> Set[int]:
131
+ def find_containing(self, point: float) -> set[int]:
133
132
  """Find all intervals containing the point."""
134
133
  pos = bisect.bisect_left(self.intervals, point)
135
134
  result = set()
@@ -196,7 +195,7 @@ class LayoutPostprocessor:
196
195
  }
197
196
 
198
197
  def __init__(
199
- self, page: Page, clusters: List[Cluster], options: LayoutOptions
198
+ self, page: Page, clusters: list[Cluster], options: LayoutOptions
200
199
  ) -> None:
201
200
  """Initialize processor with page and clusters."""
202
201
 
@@ -219,7 +218,7 @@ class LayoutPostprocessor:
219
218
  [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
220
219
  )
221
220
 
222
- def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
221
+ def postprocess(self) -> tuple[list[Cluster], list[TextCell]]:
223
222
  """Main processing pipeline."""
224
223
  self.regular_clusters = self._process_regular_clusters()
225
224
  self.special_clusters = self._process_special_clusters()
@@ -254,7 +253,7 @@ class LayoutPostprocessor:
254
253
 
255
254
  return final_clusters, self.cells
256
255
 
257
- def _process_regular_clusters(self) -> List[Cluster]:
256
+ def _process_regular_clusters(self) -> list[Cluster]:
258
257
  """Process regular clusters with iterative refinement."""
259
258
  clusters = [
260
259
  c
@@ -311,7 +310,7 @@ class LayoutPostprocessor:
311
310
 
312
311
  return clusters
313
312
 
314
- def _process_special_clusters(self) -> List[Cluster]:
313
+ def _process_special_clusters(self) -> list[Cluster]:
315
314
  special_clusters = [
316
315
  c
317
316
  for c in self.special_clusters
@@ -381,7 +380,7 @@ class LayoutPostprocessor:
381
380
 
382
381
  return picture_clusters + wrapper_clusters
383
382
 
384
- def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
383
+ def _handle_cross_type_overlaps(self, special_clusters) -> list[Cluster]:
385
384
  """Handle overlaps between regular and wrapper clusters before child assignment.
386
385
 
387
386
  In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
@@ -454,7 +453,7 @@ class LayoutPostprocessor:
454
453
 
455
454
  def _select_best_cluster_from_group(
456
455
  self,
457
- group_clusters: List[Cluster],
456
+ group_clusters: list[Cluster],
458
457
  params: dict,
459
458
  ) -> Cluster:
460
459
  """Select best cluster from a group of overlapping clusters based on all rules."""
@@ -487,11 +486,11 @@ class LayoutPostprocessor:
487
486
 
488
487
  def _remove_overlapping_clusters(
489
488
  self,
490
- clusters: List[Cluster],
489
+ clusters: list[Cluster],
491
490
  cluster_type: str,
492
491
  overlap_threshold: float = 0.8,
493
492
  containment_threshold: float = 0.8,
494
- ) -> List[Cluster]:
493
+ ) -> list[Cluster]:
495
494
  if not clusters:
496
495
  return []
497
496
 
@@ -544,7 +543,7 @@ class LayoutPostprocessor:
544
543
 
545
544
  def _select_best_cluster(
546
545
  self,
547
- clusters: List[Cluster],
546
+ clusters: list[Cluster],
548
547
  area_threshold: float,
549
548
  conf_threshold: float,
550
549
  ) -> Cluster:
@@ -572,7 +571,7 @@ class LayoutPostprocessor:
572
571
 
573
572
  return current_best if current_best else clusters[0]
574
573
 
575
- def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
574
+ def _deduplicate_cells(self, cells: list[TextCell]) -> list[TextCell]:
576
575
  """Ensure each cell appears only once, maintaining order of first appearance."""
577
576
  seen_ids = set()
578
577
  unique_cells = []
@@ -583,8 +582,8 @@ class LayoutPostprocessor:
583
582
  return unique_cells
584
583
 
585
584
  def _assign_cells_to_clusters(
586
- self, clusters: List[Cluster], min_overlap: float = 0.2
587
- ) -> List[Cluster]:
585
+ self, clusters: list[Cluster], min_overlap: float = 0.2
586
+ ) -> list[Cluster]:
588
587
  """Assign cells to best overlapping cluster."""
589
588
  for cluster in clusters:
590
589
  cluster.cells = []
@@ -616,7 +615,7 @@ class LayoutPostprocessor:
616
615
 
617
616
  return clusters
618
617
 
619
- def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
618
+ def _find_unassigned_cells(self, clusters: list[Cluster]) -> list[TextCell]:
620
619
  """Find cells not assigned to any cluster."""
621
620
  assigned = {cell.index for cluster in clusters for cell in cluster.cells}
622
621
  return [
@@ -625,7 +624,7 @@ class LayoutPostprocessor:
625
624
  if cell.index not in assigned and cell.text.strip()
626
625
  ]
627
626
 
628
- def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
627
+ def _adjust_cluster_bboxes(self, clusters: list[Cluster]) -> list[Cluster]:
629
628
  """Adjust cluster bounding boxes to contain their cells."""
630
629
  for cluster in clusters:
631
630
  if not cluster.cells:
@@ -651,13 +650,13 @@ class LayoutPostprocessor:
651
650
 
652
651
  return clusters
653
652
 
654
- def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
653
+ def _sort_cells(self, cells: list[TextCell]) -> list[TextCell]:
655
654
  """Sort cells in native reading order."""
656
655
  return sorted(cells, key=lambda c: (c.index))
657
656
 
658
657
  def _sort_clusters(
659
- self, clusters: List[Cluster], mode: str = "id"
660
- ) -> List[Cluster]:
658
+ self, clusters: list[Cluster], mode: str = "id"
659
+ ) -> list[Cluster]:
661
660
  """Sort clusters in reading order (top-to-bottom, left-to-right)."""
662
661
  if mode == "id": # sort in the order the cells are printed in the PDF.
663
662
  return sorted(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.57.0
3
+ Version: 2.58.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.2
30
- Requires-Dist: docling-parse<5.0.0,>=4.4.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.7.0
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -69,6 +69,7 @@ Provides-Extra: rapidocr
69
69
  Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
70
70
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
71
71
  Provides-Extra: asr
72
+ Requires-Dist: mlx-whisper>=0.4.3; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "asr"
72
73
  Requires-Dist: openai-whisper>=20250625; extra == "asr"
73
74
  Dynamic: license-file
74
75
 
@@ -96,6 +97,7 @@ Dynamic: license-file
96
97
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
97
98
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
98
99
  [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
100
+ [![Discord](https://img.shields.io/discord/1399788921306746971?color=6A7EC2&logo=discord&logoColor=ffffff)](https://docling.ai/discord)
99
101
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
100
102
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
101
103
 
@@ -1,24 +1,24 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- docling/document_converter.py,sha256=gPyBrNegMgeBGxN7iebrjqEDm7zQQOmFNm8hVi-pFEQ,16013
3
- docling/document_extractor.py,sha256=-RbQRvLWLXF15HYqBbV_lJhh08Zl487UEQKhP-_FR8k,11969
2
+ docling/document_converter.py,sha256=_P3f4eZ8Gssv3P3l8xX2RrgzS8WhafY7-x6rWaWOeN4,15511
3
+ docling/document_extractor.py,sha256=Jk1a4hgPxjLkp4UoZR_pdEMid9-jhNiND5_NlPHGy6c,11965
4
4
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
5
5
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
6
6
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
8
- docling/backend/asciidoc_backend.py,sha256=ARpMBzrNCV-x6g2I0KytDM3pGLac3z4ql3hDKi3FI04,14403
7
+ docling/backend/abstract_backend.py,sha256=_xKSjLpR-ia93Kz0dto0yyVsaeIqEepUhVEGo18MuWw,2169
8
+ docling/backend/asciidoc_backend.py,sha256=DR8AUTNvy_SCHkieMpqZXg_NLRTy4roEqa0V8sILPWk,14400
9
9
  docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
10
10
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
11
11
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
12
- docling/backend/docling_parse_v4_backend.py,sha256=xCBbaaXjNNrOaod9tmBuCbe5mL_ipmTNG2XOxVbGG3w,7891
13
- docling/backend/html_backend.py,sha256=iuRyYztUduyP214X0SyDvl1dP_h0eccp5RkuM72rV8o,48664
14
- docling/backend/md_backend.py,sha256=TWboEPHl93pqI_Go1a3XpP-KpzI3d17xo5ZW42Ul0kY,22764
12
+ docling/backend/docling_parse_v4_backend.py,sha256=tBJR0BbKFOIDKSngjVDu0BrzTj7qUZAhFdRT8GvAJ18,8232
13
+ docling/backend/html_backend.py,sha256=m91kRxMhQ1w-7G6MHA9l01dgF8-YQNn8ZNx9lwG467M,52935
14
+ docling/backend/md_backend.py,sha256=_0ToiecsGwU4H4BBso4ar9TGJi8OTwSXjgmi66vSJVQ,23513
15
15
  docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
16
- docling/backend/msexcel_backend.py,sha256=GOuA-MlShpzFmCmJq3-Z28iquwWUg4k8v-AT4O-aAQI,19305
17
- docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
18
- docling/backend/msword_backend.py,sha256=L44vFoSHOtbX-S_lSb8EKW-nzwL_ptVPhNV74ydmwqE,57457
16
+ docling/backend/msexcel_backend.py,sha256=-iWLdIonMZl2FCfPAXFQKIQzFOJn5InpH6KDAJ_L64o,22760
17
+ docling/backend/mspowerpoint_backend.py,sha256=71W_iV31Rggqn9UcMzXmsZ3QKMRpsBT8fCwdjsIIKAs,15109
18
+ docling/backend/msword_backend.py,sha256=zNJy-KM3Ia-L8IQ4sjYxATW4owFxbg2CK0rzke8y-7w,57451
19
19
  docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
20
- docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
21
- docling/backend/pypdfium2_backend.py,sha256=AYhWs9S8W_TkAK0-OkRmUNf4HUZl26FP7-XYjwU5zDk,14209
20
+ docling/backend/pdf_backend.py,sha256=UovGV3RJG6qllzMPYzhDB6GID7buGV6w1uxl5dOAEw4,3563
21
+ docling/backend/pypdfium2_backend.py,sha256=tx0FnUW87zPsyafCvOuLcls2k5QdpPKWweyjNTfclNc,14509
22
22
  docling/backend/webvtt_backend.py,sha256=9xPcfWVLuqhEAFrkv8aU36qHnSgjeINZAXT_C9C6XJA,19165
23
23
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  docling/backend/docx/drawingml/utils.py,sha256=E9Iq8_052eEV5L1IN3ZqFX9eBidH56DKNlh6Tk7Do0I,3640
@@ -32,18 +32,19 @@ docling/backend/xml/jats_backend.py,sha256=_BWpQQg3SlsHAOOj0v2qRJoVqaQzL91GqN1tK
32
32
  docling/backend/xml/uspto_backend.py,sha256=Tv4CE7V5_QwxTNJPl90CAd_mAbwaLGy8S6s6evh1Xow,70910
33
33
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
34
34
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
- docling/cli/main.py,sha256=cvDS6CTME2B2Mrm4l9yNynOUDVsZ9ZTlA6mM_jsa5jU,34258
35
+ docling/cli/main.py,sha256=x_mPS3g3Zw60_9bL_oo9OfPBmuSd-aJV7oKTPD0GjS4,36772
36
36
  docling/cli/models.py,sha256=zZBFQJAD7C5sespnYy5M__4qC_GyqAZ-QpfWtgPRDB0,6343
37
37
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
38
38
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
39
  docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
40
- docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
41
- docling/datamodel/base_models.py,sha256=CQ6eThPzVeVD2Gq7BNz9Q5RDLwhe4NgMzk7tdLtk1c8,12382
42
- docling/datamodel/document.py,sha256=HyO3kdJcXIJ3wL95sPoL3zvsO4Rww3-qHH6IkL4I0q4,17483
40
+ docling/datamodel/asr_model_specs.py,sha256=gQJkW7DaSPiOuW_0QoI5OzR1_DQGRkw7yQlrVJ4hyo0,14473
41
+ docling/datamodel/backend_options.py,sha256=2zSbJRtBmJ6Twywj8pLOKaHhklY85XaGXUmSLX_SfgQ,2473
42
+ docling/datamodel/base_models.py,sha256=pC9CvVxMzcujKAG0TTObkYznKp8gIFdzDMDmgk5FjMQ,12697
43
+ docling/datamodel/document.py,sha256=T9OogC1kIm0VDSC2ZFcFgWdcOjXzw5JvGr2y2hMlx3s,18795
43
44
  docling/datamodel/extraction.py,sha256=7dgvtK5SuvgfB8LHAwS1FwrW1kcMQJuJG0ol8uAQgoQ,1323
44
45
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
45
46
  docling/datamodel/pipeline_options.py,sha256=dklSaA7P6VkjbBB-Pz2OyzO2SQuV9y0I8VVr9XHJusw,11692
46
- docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
47
+ docling/datamodel/pipeline_options_asr_model.py,sha256=cLqtRHBr2kbTNXRJ1ZhFGiXIK7Nl9RFmz2Wd7tJF2Jg,2172
47
48
  docling/datamodel/pipeline_options_vlm_model.py,sha256=Szdq5_MhqQ8xBCvOUkdn_LLV29ZMQJcF4xnItYlkmXQ,3090
48
49
  docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
49
50
  docling/datamodel/vlm_model_specs.py,sha256=9TTmihDEFcI-TY1jJ2GTnTcrGa3bLg0e6anN4gPtFgU,10035
@@ -63,7 +64,7 @@ docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCr
63
64
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
64
65
  docling/models/picture_description_vlm_model.py,sha256=Uja_BQSk7F-U1J2hm4yeLguirUzKYv1K8zRyw1IYomY,4150
65
66
  docling/models/rapid_ocr_model.py,sha256=JGeed1aNO64SYFgxlOifdut4fynUJyBuyyQrfuSno-4,13182
66
- docling/models/readingorder_model.py,sha256=-j-UuvnsYWqZvY0gByKz0bjcBwOhWQTHerCopig_jVs,17266
67
+ docling/models/readingorder_model.py,sha256=gnRFfJAXH-zKtQJws5Zb1_KCVvu_dAq9pgaDYQKCt9s,17236
67
68
  docling/models/table_structure_model.py,sha256=7g_mFf1YzfF8PXQfefNu6XYZu7TzJAn86zKb6IEUdCg,12518
68
69
  docling/models/tesseract_ocr_cli_model.py,sha256=KuO4rXc-88C2-cAymvcr41TqFi3hNg4gerEzoI3Z6m4,13039
69
70
  docling/models/tesseract_ocr_model.py,sha256=W_476USwExjSfhelXG8B9eNIVXXlm_dNFA60TZ5rq7E,11216
@@ -82,7 +83,7 @@ docling/models/vlm_models_inline/mlx_model.py,sha256=ae7hDMgBsMLkqulmbKDamGSSrLJ
82
83
  docling/models/vlm_models_inline/nuextract_transformers_model.py,sha256=jLNtlkMDheUyWot7Oqq-GHQIYzJ0fZrbReq5xCnYb9E,10506
83
84
  docling/models/vlm_models_inline/vllm_model.py,sha256=vXClayYxPGX1jzQ1Rvf3vvwtW9khgApGvcRz4Qbyu7I,10293
84
85
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
- docling/pipeline/asr_pipeline.py,sha256=oRluG28no3ezjbtL7nJLpDcxxxJuuULNXheq1W-qklM,10629
86
+ docling/pipeline/asr_pipeline.py,sha256=44lweVOCkFe8KikgXJjqDtfHewIotYvc242Xvgl9fV0,15744
86
87
  docling/pipeline/base_extraction_pipeline.py,sha256=GYrEz83IXv-tdIHjtNWxMBNczFwL8SZyf9vnPJ3STaI,2627
87
88
  docling/pipeline/base_pipeline.py,sha256=NPMQDTyis-LgQ4SybY2f5AESZl5PxogF-FRQuCDckXg,12748
88
89
  docling/pipeline/extraction_vlm_pipeline.py,sha256=veUOTe8nGdnduZKaGn1RRb-NfU1H6t_EN4QAsb022Zg,8260
@@ -92,10 +93,10 @@ docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=i67G5AOW7PIFCe5JS2sdBm
92
93
  docling/pipeline/vlm_pipeline.py,sha256=HSbSoGZyy4eIK8eOL2g_NymrHg8r-DrB2buggJQAqHU,16189
93
94
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
95
  docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
95
- docling/utils/api_image_request.py,sha256=kQDmTvQT6M2IgXnGYeoNflI6sLUG6WTCcEft94CRwWg,5379
96
+ docling/utils/api_image_request.py,sha256=xrn4O8ax8wdQPkLgbRhX22qWUangCXwaOzIXy_86LCs,5367
96
97
  docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
97
98
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
98
- docling/utils/layout_postprocessor.py,sha256=sE9UR3Nv4iOk26uoIsN3bFioE7ScfAjj0orDBDneLXg,25166
99
+ docling/utils/layout_postprocessor.py,sha256=bwDIhgUg5rKianzccGPTotTjqjkWtIQSoZwgKio8YC4,25124
99
100
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
100
101
  docling/utils/model_downloader.py,sha256=qrkL5NTpwk6yF4bcipcUtLRxl0Tqh7zoSa_WtLsMySA,5325
101
102
  docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
@@ -103,9 +104,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
103
104
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
104
105
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
105
106
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
106
- docling-2.57.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
107
- docling-2.57.0.dist-info/METADATA,sha256=oDfwFunLJTLSDVastMVq9JkUpIgeKOOVX1MZb6rtqcE,11364
108
- docling-2.57.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
109
- docling-2.57.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
110
- docling-2.57.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
111
- docling-2.57.0.dist-info/RECORD,,
107
+ docling-2.58.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
108
+ docling-2.58.0.dist-info/METADATA,sha256=py9js2V38fIWft1SmMe_iD_trav0WEwojgwxlHMsNv4,11642
109
+ docling-2.58.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
110
+ docling-2.58.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
111
+ docling-2.58.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
112
+ docling-2.58.0.dist-info/RECORD,,