docling 2.55.0__py3-none-any.whl → 2.56.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

@@ -272,9 +272,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
272
272
  for br in content("br"):
273
273
  br.replace_with(NavigableString("\n"))
274
274
  # set default content layer
275
- headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
275
+
276
+ # Furniture before the first heading rule, except for headers in tables
277
+ header = None
278
+ # Find all headers first
279
+ all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
280
+ # Keep only those that do NOT have a <table> in a parent chain
281
+ clean_headers = [h for h in all_headers if not h.find_parent("table")]
282
+ # Pick the first header from the remaining
283
+ if len(clean_headers):
284
+ header = clean_headers[0]
285
+ # Set starting content layer
276
286
  self.content_layer = (
277
- ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
287
+ ContentLayer.BODY if header is None else ContentLayer.FURNITURE
278
288
  )
279
289
  # reset context
280
290
  self.ctx = _Context()
@@ -309,9 +319,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
309
319
  group_name: str,
310
320
  doc: DoclingDocument,
311
321
  docling_table: TableItem,
312
- ) -> tuple[bool, RefItem]:
322
+ ) -> tuple[bool, Union[RefItem, None]]:
313
323
  rich_table_cell = False
314
- ref_for_rich_cell = provs_in_cell[0]
324
+ ref_for_rich_cell = None
325
+ if len(provs_in_cell) > 0:
326
+ ref_for_rich_cell = provs_in_cell[0]
315
327
  if len(provs_in_cell) > 1:
316
328
  # Cell has multiple elements, we need to group them
317
329
  rich_table_cell = True
@@ -324,7 +336,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
324
336
  if isinstance(pr_item, TextItem):
325
337
  # Cell has only one element and it's just a text
326
338
  rich_table_cell = False
327
- doc.delete_items(node_items=[pr_item])
339
+ try:
340
+ doc.delete_items(node_items=[pr_item])
341
+ except Exception as e:
342
+ _log.error(f"Error while making rich table: {e}.")
328
343
  else:
329
344
  rich_table_cell = True
330
345
  ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
@@ -391,17 +406,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
391
406
 
392
407
  provs_in_cell: list[RefItem] = []
393
408
  # Parse table cell sub-tree for Rich Cells content:
409
+ table_level = self.level
394
410
  provs_in_cell = self._walk(html_cell, doc)
411
+ # After walking sub-tree in cell, restore previously set level
412
+ self.level = table_level
395
413
 
396
414
  rich_table_cell = False
397
415
  ref_for_rich_cell = None
398
- if len(provs_in_cell) > 0:
399
- group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
400
- rich_table_cell, ref_for_rich_cell = (
401
- HTMLDocumentBackend.process_rich_table_cells(
402
- provs_in_cell, group_name, doc, docling_table
403
- )
416
+ group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
417
+ rich_table_cell, ref_for_rich_cell = (
418
+ HTMLDocumentBackend.process_rich_table_cells(
419
+ provs_in_cell, group_name, doc, docling_table
404
420
  )
421
+ )
405
422
 
406
423
  # Extracting text
407
424
  text = self.get_text(html_cell).strip()
@@ -774,13 +791,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
774
791
  for key in self.parents.keys():
775
792
  self.parents[key] = None
776
793
  self.level = 0
777
- docling_title = self.parents[self.level + 1] = doc.add_title(
794
+ self.parents[self.level + 1] = doc.add_title(
778
795
  text_clean,
779
796
  content_layer=self.content_layer,
780
797
  formatting=annotated_text.formatting,
781
798
  hyperlink=annotated_text.hyperlink,
782
799
  )
783
- added_ref = [docling_title.get_ref()]
800
+ p1 = self.parents[self.level + 1]
801
+ if p1 is not None:
802
+ added_ref = [p1.get_ref()]
784
803
  # the other levels need to be lowered by 1 if a title was set
785
804
  else:
786
805
  level -= 1
@@ -802,7 +821,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
802
821
  _log.debug(f"Remove the tail of level {key}")
803
822
  self.parents[key] = None
804
823
  self.level = level
805
- docling_heading = self.parents[self.level + 1] = doc.add_heading(
824
+ self.parents[self.level + 1] = doc.add_heading(
806
825
  parent=self.parents[self.level],
807
826
  text=text_clean,
808
827
  orig=annotated_text.text,
@@ -811,7 +830,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
811
830
  formatting=annotated_text.formatting,
812
831
  hyperlink=annotated_text.hyperlink,
813
832
  )
814
- added_ref = [docling_heading.get_ref()]
833
+ p2 = self.parents[self.level + 1]
834
+ if p2 is not None:
835
+ added_ref = [p2.get_ref()]
815
836
  self.level += 1
816
837
  for img_tag in tag("img"):
817
838
  if isinstance(img_tag, Tag):
@@ -249,7 +249,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
249
249
 
250
250
  # Iterates over all elements in the AST
251
251
  # Check for different element types and process relevant details
252
- if isinstance(element, marko.block.Heading) and len(element.children) > 0:
252
+ if (
253
+ isinstance(element, marko.block.Heading)
254
+ or isinstance(element, marko.block.SetextHeading)
255
+ ) and len(element.children) > 0:
253
256
  self._close_table(doc)
254
257
  _log.debug(
255
258
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
@@ -18,6 +18,7 @@ from docling_core.types.doc import (
18
18
  TableData,
19
19
  )
20
20
  from openpyxl import load_workbook
21
+ from openpyxl.chartsheet.chartsheet import Chartsheet
21
22
  from openpyxl.drawing.image import Image
22
23
  from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
23
24
  from openpyxl.worksheet.worksheet import Worksheet
@@ -186,18 +187,18 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
186
187
 
187
188
  if self.workbook is not None:
188
189
  # Iterate over all sheets
189
- for sheet_name in self.workbook.sheetnames:
190
- _log.info(f"Processing sheet: {sheet_name}")
190
+ for idx, name in enumerate(self.workbook.sheetnames):
191
+ _log.info(f"Processing sheet {idx}: {name}")
191
192
 
192
- sheet = self.workbook[sheet_name]
193
- page_no = self.workbook.index(sheet) + 1
193
+ sheet = self.workbook[name]
194
+ page_no = idx + 1
194
195
  # do not rely on sheet.max_column, sheet.max_row if there are images
195
196
  page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
196
197
 
197
198
  self.parents[0] = doc.add_group(
198
199
  parent=None,
199
200
  label=GroupLabel.SECTION,
200
- name=f"sheet: {sheet_name}",
201
+ name=f"sheet: {name}",
201
202
  content_layer=self._get_sheet_content_layer(sheet),
202
203
  )
203
204
  doc = self._convert_sheet(doc, sheet)
@@ -208,7 +209,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
208
209
 
209
210
  return doc
210
211
 
211
- def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
212
+ def _convert_sheet(
213
+ self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet]
214
+ ) -> DoclingDocument:
212
215
  """Parse an Excel worksheet and attach its structure to a DoclingDocument
213
216
 
214
217
  Args:
@@ -218,10 +221,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
218
221
  Returns:
219
222
  The updated DoclingDocument.
220
223
  """
224
+ if isinstance(sheet, Worksheet):
225
+ doc = self._find_tables_in_sheet(doc, sheet)
226
+ doc = self._find_images_in_sheet(doc, sheet)
221
227
 
222
- doc = self._find_tables_in_sheet(doc, sheet)
223
-
224
- doc = self._find_images_in_sheet(doc, sheet)
228
+ # TODO: parse charts in sheet
225
229
 
226
230
  return doc
227
231
 
docling/cli/main.py CHANGED
@@ -49,7 +49,7 @@ from docling.datamodel.document import ConversionResult
49
49
  from docling.datamodel.pipeline_options import (
50
50
  AsrPipelineOptions,
51
51
  ConvertPipelineOptions,
52
- EasyOcrOptions,
52
+ OcrAutoOptions,
53
53
  OcrOptions,
54
54
  PaginatedPipelineOptions,
55
55
  PdfBackend,
@@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import (
57
57
  PipelineOptions,
58
58
  ProcessingPipeline,
59
59
  TableFormerMode,
60
+ TesseractCliOcrOptions,
61
+ TesseractOcrOptions,
60
62
  VlmPipelineOptions,
61
63
  )
62
64
  from docling.datamodel.settings import settings
@@ -355,6 +357,13 @@ def convert( # noqa: C901
355
357
  help="Replace any existing text with OCR generated text over the full content.",
356
358
  ),
357
359
  ] = False,
360
+ tables: Annotated[
361
+ bool,
362
+ typer.Option(
363
+ ...,
364
+ help="If enabled, the table structure model will be used to extract table information.",
365
+ ),
366
+ ] = True,
358
367
  ocr_engine: Annotated[
359
368
  str,
360
369
  typer.Option(
@@ -365,7 +374,7 @@ def convert( # noqa: C901
365
374
  f"Use the option --show-external-plugins to see the options allowed with external plugins."
366
375
  ),
367
376
  ),
368
- ] = EasyOcrOptions.kind,
377
+ ] = OcrAutoOptions.kind,
369
378
  ocr_lang: Annotated[
370
379
  Optional[str],
371
380
  typer.Option(
@@ -373,6 +382,13 @@ def convert( # noqa: C901
373
382
  help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
374
383
  ),
375
384
  ] = None,
385
+ psm: Annotated[
386
+ Optional[int],
387
+ typer.Option(
388
+ ...,
389
+ help="Page Segmentation Mode for the OCR engine (0-13).",
390
+ ),
391
+ ] = None,
376
392
  pdf_backend: Annotated[
377
393
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
378
394
  ] = PdfBackend.DLPARSE_V2,
@@ -540,13 +556,25 @@ def convert( # noqa: C901
540
556
  if local_path.exists() and local_path.is_dir():
541
557
  for fmt in from_formats:
542
558
  for ext in FormatToExtensions[fmt]:
543
- input_doc_paths.extend(
544
- list(local_path.glob(f"**/*.{ext}"))
545
- )
546
- input_doc_paths.extend(
547
- list(local_path.glob(f"**/*.{ext.upper()}"))
548
- )
559
+ for path in local_path.glob(f"**/*.{ext}"):
560
+ if path.name.startswith("~$") and ext == "docx":
561
+ _log.info(
562
+ f"Ignoring temporary Word file: {path}"
563
+ )
564
+ continue
565
+ input_doc_paths.append(path)
566
+
567
+ for path in local_path.glob(f"**/*.{ext.upper()}"):
568
+ if path.name.startswith("~$") and ext == "docx":
569
+ _log.info(
570
+ f"Ignoring temporary Word file: {path}"
571
+ )
572
+ continue
573
+ input_doc_paths.append(path)
549
574
  elif local_path.exists():
575
+ if not local_path.name.startswith("~$") and ext == "docx":
576
+ _log.info(f"Ignoring temporary Word file: {path}")
577
+ continue
550
578
  input_doc_paths.append(local_path)
551
579
  else:
552
580
  err_console.print(
@@ -577,6 +605,10 @@ def convert( # noqa: C901
577
605
  ocr_lang_list = _split_list(ocr_lang)
578
606
  if ocr_lang_list is not None:
579
607
  ocr_options.lang = ocr_lang_list
608
+ if psm is not None and isinstance(
609
+ ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
610
+ ):
611
+ ocr_options.psm = psm
580
612
 
581
613
  accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
582
614
  # pipeline_options: PaginatedPipelineOptions
@@ -591,7 +623,7 @@ def convert( # noqa: C901
591
623
  accelerator_options=accelerator_options,
592
624
  do_ocr=ocr,
593
625
  ocr_options=ocr_options,
594
- do_table_structure=True,
626
+ do_table_structure=tables,
595
627
  do_code_enrichment=enrich_code,
596
628
  do_formula_enrichment=enrich_formula,
597
629
  do_picture_description=enrich_picture_description,
docling/cli/models.py CHANGED
@@ -38,6 +38,7 @@ class _AvailableModels(str, Enum):
38
38
  SMOLDOCLING = "smoldocling"
39
39
  SMOLDOCLING_MLX = "smoldocling_mlx"
40
40
  GRANITE_VISION = "granite_vision"
41
+ RAPIDOCR = "rapidocr"
41
42
  EASYOCR = "easyocr"
42
43
 
43
44
 
@@ -46,7 +47,7 @@ _default_models = [
46
47
  _AvailableModels.TABLEFORMER,
47
48
  _AvailableModels.CODE_FORMULA,
48
49
  _AvailableModels.PICTURE_CLASSIFIER,
49
- _AvailableModels.EASYOCR,
50
+ _AvailableModels.RAPIDOCR,
50
51
  ]
51
52
 
52
53
 
@@ -115,6 +116,7 @@ def download(
115
116
  with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
116
117
  with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
117
118
  with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
119
+ with_rapidocr=_AvailableModels.RAPIDOCR in to_download,
118
120
  with_easyocr=_AvailableModels.EASYOCR in to_download,
119
121
  )
120
122
 
@@ -81,6 +81,13 @@ class OcrOptions(BaseOptions):
81
81
  )
82
82
 
83
83
 
84
+ class OcrAutoOptions(OcrOptions):
85
+ """Options for pick OCR engine automatically."""
86
+
87
+ kind: ClassVar[Literal["auto"]] = "auto"
88
+ lang: List[str] = []
89
+
90
+
84
91
  class RapidOcrOptions(OcrOptions):
85
92
  """Options for the RapidOCR engine."""
86
93
 
@@ -154,6 +161,9 @@ class TesseractCliOcrOptions(OcrOptions):
154
161
  lang: List[str] = ["fra", "deu", "spa", "eng"]
155
162
  tesseract_cmd: str = "tesseract"
156
163
  path: Optional[str] = None
164
+ psm: Optional[int] = (
165
+ None # Page Segmentation Mode (0-13), defaults to tesseract's default
166
+ )
157
167
 
158
168
  model_config = ConfigDict(
159
169
  extra="forbid",
@@ -166,6 +176,9 @@ class TesseractOcrOptions(OcrOptions):
166
176
  kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
167
177
  lang: List[str] = ["fra", "deu", "spa", "eng"]
168
178
  path: Optional[str] = None
179
+ psm: Optional[int] = (
180
+ None # Page Segmentation Mode (0-13), defaults to tesseract's default
181
+ )
169
182
 
170
183
  model_config = ConfigDict(
171
184
  extra="forbid",
@@ -249,6 +262,7 @@ class PdfBackend(str, Enum):
249
262
  class OcrEngine(str, Enum):
250
263
  """Enum of valid OCR engines."""
251
264
 
265
+ AUTO = "auto"
252
266
  EASYOCR = "easyocr"
253
267
  TESSERACT_CLI = "tesseract_cli"
254
268
  TESSERACT = "tesseract"
@@ -330,7 +344,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
330
344
  # If True, text from backend will be used instead of generated text
331
345
 
332
346
  table_structure_options: TableStructureOptions = TableStructureOptions()
333
- ocr_options: OcrOptions = EasyOcrOptions()
347
+ ocr_options: OcrOptions = OcrAutoOptions()
334
348
  layout_options: LayoutOptions = LayoutOptions()
335
349
 
336
350
  images_scale: float = 1.0
@@ -0,0 +1,132 @@
1
+ import logging
2
+ import sys
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import Optional, Type
6
+
7
+ from docling.datamodel.accelerator_options import AcceleratorOptions
8
+ from docling.datamodel.base_models import Page
9
+ from docling.datamodel.document import ConversionResult
10
+ from docling.datamodel.pipeline_options import (
11
+ EasyOcrOptions,
12
+ OcrAutoOptions,
13
+ OcrMacOptions,
14
+ OcrOptions,
15
+ RapidOcrOptions,
16
+ )
17
+ from docling.models.base_ocr_model import BaseOcrModel
18
+ from docling.models.easyocr_model import EasyOcrModel
19
+ from docling.models.ocr_mac_model import OcrMacModel
20
+ from docling.models.rapid_ocr_model import RapidOcrModel
21
+
22
+ _log = logging.getLogger(__name__)
23
+
24
+
25
+ class OcrAutoModel(BaseOcrModel):
26
+ def __init__(
27
+ self,
28
+ enabled: bool,
29
+ artifacts_path: Optional[Path],
30
+ options: OcrAutoOptions,
31
+ accelerator_options: AcceleratorOptions,
32
+ ):
33
+ super().__init__(
34
+ enabled=enabled,
35
+ artifacts_path=artifacts_path,
36
+ options=options,
37
+ accelerator_options=accelerator_options,
38
+ )
39
+ self.options: OcrAutoOptions
40
+
41
+ self._engine: Optional[BaseOcrModel] = None
42
+ if self.enabled:
43
+ if "darwin" == sys.platform:
44
+ try:
45
+ from ocrmac import ocrmac
46
+
47
+ self._engine = OcrMacModel(
48
+ enabled=self.enabled,
49
+ artifacts_path=artifacts_path,
50
+ options=OcrMacOptions(
51
+ bitmap_area_threshold=self.options.bitmap_area_threshold,
52
+ force_full_page_ocr=self.options.force_full_page_ocr,
53
+ ),
54
+ accelerator_options=accelerator_options,
55
+ )
56
+ _log.info("Auto OCR model selected ocrmac.")
57
+ except ImportError:
58
+ _log.info("ocrmac cannot be used because ocrmac is not installed.")
59
+
60
+ if self._engine is None:
61
+ try:
62
+ import onnxruntime
63
+ from rapidocr import EngineType, RapidOCR # type: ignore
64
+
65
+ self._engine = RapidOcrModel(
66
+ enabled=self.enabled,
67
+ artifacts_path=artifacts_path,
68
+ options=RapidOcrOptions(
69
+ backend="onnxruntime",
70
+ bitmap_area_threshold=self.options.bitmap_area_threshold,
71
+ force_full_page_ocr=self.options.force_full_page_ocr,
72
+ ),
73
+ accelerator_options=accelerator_options,
74
+ )
75
+ _log.info("Auto OCR model selected rapidocr with onnxruntime.")
76
+ except ImportError:
77
+ _log.info(
78
+ "rapidocr cannot be used because onnxruntime is not installed."
79
+ )
80
+
81
+ if self._engine is None:
82
+ try:
83
+ import easyocr
84
+
85
+ self._engine = EasyOcrModel(
86
+ enabled=self.enabled,
87
+ artifacts_path=artifacts_path,
88
+ options=EasyOcrOptions(
89
+ bitmap_area_threshold=self.options.bitmap_area_threshold,
90
+ force_full_page_ocr=self.options.force_full_page_ocr,
91
+ ),
92
+ accelerator_options=accelerator_options,
93
+ )
94
+ _log.info("Auto OCR model selected easyocr.")
95
+ except ImportError:
96
+ _log.info("easyocr cannot be used because it is not installed.")
97
+
98
+ if self._engine is None:
99
+ try:
100
+ import torch
101
+ from rapidocr import EngineType, RapidOCR # type: ignore
102
+
103
+ self._engine = RapidOcrModel(
104
+ enabled=self.enabled,
105
+ artifacts_path=artifacts_path,
106
+ options=RapidOcrOptions(
107
+ backend="torch",
108
+ bitmap_area_threshold=self.options.bitmap_area_threshold,
109
+ force_full_page_ocr=self.options.force_full_page_ocr,
110
+ ),
111
+ accelerator_options=accelerator_options,
112
+ )
113
+ _log.info("Auto OCR model selected rapidocr with torch.")
114
+ except ImportError:
115
+ _log.info(
116
+ "rapidocr cannot be used because rapidocr or torch is not installed."
117
+ )
118
+
119
+ if self._engine is None:
120
+ _log.warning("No OCR engine found. Please review the install details.")
121
+
122
+ def __call__(
123
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
124
+ ) -> Iterable[Page]:
125
+ if not self.enabled or self._engine is None:
126
+ yield from page_batch
127
+ return
128
+ yield from self._engine(conv_res, page_batch)
129
+
130
+ @classmethod
131
+ def get_options_type(cls) -> Type[OcrOptions]:
132
+ return OcrAutoOptions
@@ -173,11 +173,11 @@ class BaseItemAndImageEnrichmentModel(
173
173
  assert isinstance(element, DocItem)
174
174
 
175
175
  # Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
176
- if len(element.prov) == 0 and isinstance(element, PictureItem):
176
+ if isinstance(element, PictureItem):
177
177
  embedded_im = element.get_image(conv_res.document)
178
178
  if embedded_im is not None:
179
179
  return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
180
- else:
180
+ elif len(element.prov) == 0:
181
181
  return None
182
182
 
183
183
  # Crop the image form the page
@@ -1,4 +1,5 @@
1
1
  def ocr_engines():
2
+ from docling.models.auto_ocr_model import OcrAutoModel
2
3
  from docling.models.easyocr_model import EasyOcrModel
3
4
  from docling.models.ocr_mac_model import OcrMacModel
4
5
  from docling.models.rapid_ocr_model import RapidOcrModel
@@ -7,6 +8,7 @@ def ocr_engines():
7
8
 
8
9
  return {
9
10
  "ocr_engines": [
11
+ OcrAutoModel,
10
12
  EasyOcrModel,
11
13
  OcrMacModel,
12
14
  RapidOcrModel,
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from collections.abc import Iterable
3
3
  from pathlib import Path
4
- from typing import Optional, Type
4
+ from typing import Literal, Optional, Type, TypedDict
5
5
 
6
6
  import numpy
7
7
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -18,11 +18,67 @@ from docling.datamodel.settings import settings
18
18
  from docling.models.base_ocr_model import BaseOcrModel
19
19
  from docling.utils.accelerator_utils import decide_device
20
20
  from docling.utils.profiling import TimeRecorder
21
+ from docling.utils.utils import download_url_with_progress
21
22
 
22
23
  _log = logging.getLogger(__name__)
23
24
 
25
+ _ModelPathEngines = Literal["onnxruntime", "torch"]
26
+ _ModelPathTypes = Literal[
27
+ "det_model_path", "cls_model_path", "rec_model_path", "rec_keys_path"
28
+ ]
29
+
30
+
31
+ class _ModelPathDetail(TypedDict):
32
+ url: str
33
+ path: str
34
+
24
35
 
25
36
  class RapidOcrModel(BaseOcrModel):
37
+ _model_repo_folder = "RapidOcr"
38
+ # from https://github.com/RapidAI/RapidOCR/blob/main/python/rapidocr/default_models.yaml
39
+ # matching the default config in https://github.com/RapidAI/RapidOCR/blob/main/python/rapidocr/config.yaml
40
+ # and naming f"{file_info.engine_type.value}.{file_info.ocr_version.value}.{file_info.task_type.value}"
41
+ _default_models: dict[
42
+ _ModelPathEngines, dict[_ModelPathTypes, _ModelPathDetail]
43
+ ] = {
44
+ "onnxruntime": {
45
+ "det_model_path": {
46
+ "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
47
+ "path": "onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
48
+ },
49
+ "cls_model_path": {
50
+ "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
51
+ "path": "onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
52
+ },
53
+ "rec_model_path": {
54
+ "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.onnx",
55
+ "path": "onnx/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.onnx",
56
+ },
57
+ "rec_keys_path": {
58
+ "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v2.0.7/paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
59
+ "path": "paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
60
+ },
61
+ },
62
+ "torch": {
63
+ "det_model_path": {
64
+ "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth",
65
+ "path": "torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth",
66
+ },
67
+ "cls_model_path": {
68
+ "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth",
69
+ "path": "torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth",
70
+ },
71
+ "rec_model_path": {
72
+ "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth",
73
+ "path": "torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth",
74
+ },
75
+ "rec_keys_path": {
76
+ "url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
77
+ "path": "paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
78
+ },
79
+ },
80
+ }
81
+
26
82
  def __init__(
27
83
  self,
28
84
  enabled: bool,
@@ -62,25 +118,66 @@ class RapidOcrModel(BaseOcrModel):
62
118
  }
63
119
  backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)
64
120
 
121
+ det_model_path = self.options.det_model_path
122
+ cls_model_path = self.options.cls_model_path
123
+ rec_model_path = self.options.rec_model_path
124
+ rec_keys_path = self.options.rec_keys_path
125
+ if artifacts_path is not None:
126
+ det_model_path = (
127
+ det_model_path
128
+ or artifacts_path
129
+ / self._model_repo_folder
130
+ / self._default_models[backend_enum.value]["det_model_path"]["path"]
131
+ )
132
+ cls_model_path = (
133
+ cls_model_path
134
+ or artifacts_path
135
+ / self._model_repo_folder
136
+ / self._default_models[backend_enum.value]["cls_model_path"]["path"]
137
+ )
138
+ rec_model_path = (
139
+ rec_model_path
140
+ or artifacts_path
141
+ / self._model_repo_folder
142
+ / self._default_models[backend_enum.value]["rec_model_path"]["path"]
143
+ )
144
+ rec_keys_path = (
145
+ rec_keys_path
146
+ or artifacts_path
147
+ / self._model_repo_folder
148
+ / self._default_models[backend_enum.value]["rec_keys_path"]["path"]
149
+ )
150
+
151
+ for model_path in (
152
+ rec_keys_path,
153
+ cls_model_path,
154
+ rec_model_path,
155
+ rec_keys_path,
156
+ ):
157
+ if model_path is None:
158
+ continue
159
+ if not Path(model_path).exists():
160
+ _log.warning(f"The provided model path {model_path} is not found.")
161
+
65
162
  params = {
66
163
  # Global settings (these are still correct)
67
164
  "Global.text_score": self.options.text_score,
68
165
  "Global.font_path": self.options.font_path,
69
166
  # "Global.verbose": self.options.print_verbose,
70
167
  # Detection model settings
71
- "Det.model_path": self.options.det_model_path,
168
+ "Det.model_path": det_model_path,
72
169
  "Det.use_cuda": use_cuda,
73
170
  "Det.use_dml": use_dml,
74
171
  "Det.intra_op_num_threads": intra_op_num_threads,
75
172
  # Classification model settings
76
- "Cls.model_path": self.options.cls_model_path,
173
+ "Cls.model_path": cls_model_path,
77
174
  "Cls.use_cuda": use_cuda,
78
175
  "Cls.use_dml": use_dml,
79
176
  "Cls.intra_op_num_threads": intra_op_num_threads,
80
177
  # Recognition model settings
81
- "Rec.model_path": self.options.rec_model_path,
178
+ "Rec.model_path": rec_model_path,
82
179
  "Rec.font_path": self.options.rec_font_path,
83
- "Rec.keys_path": self.options.rec_keys_path,
180
+ "Rec.keys_path": rec_keys_path,
84
181
  "Rec.use_cuda": use_cuda,
85
182
  "Rec.use_dml": use_dml,
86
183
  "Rec.intra_op_num_threads": intra_op_num_threads,
@@ -102,6 +199,30 @@ class RapidOcrModel(BaseOcrModel):
102
199
  params=params,
103
200
  )
104
201
 
202
+ @staticmethod
203
+ def download_models(
204
+ backend: _ModelPathEngines,
205
+ local_dir: Optional[Path] = None,
206
+ force: bool = False,
207
+ progress: bool = False,
208
+ ) -> Path:
209
+ if local_dir is None:
210
+ local_dir = settings.cache_dir / "models" / RapidOcrModel._model_repo_folder
211
+
212
+ local_dir.mkdir(parents=True, exist_ok=True)
213
+
214
+ # Download models
215
+ for model_type, model_details in RapidOcrModel._default_models[backend].items():
216
+ output_path = local_dir / model_details["path"]
217
+ if output_path.exists() and not force:
218
+ continue
219
+ output_path.parent.mkdir(exist_ok=True, parents=True)
220
+ buf = download_url_with_progress(model_details["url"], progress=progress)
221
+ with output_path.open("wb") as fw:
222
+ fw.write(buf.read())
223
+
224
+ return local_dir
225
+
105
226
  def __call__(
106
227
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
107
228
  ) -> Iterable[Page]:
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
9
9
  NodeItem,
10
10
  ProvenanceItem,
11
11
  RefItem,
12
+ RichTableCell,
12
13
  TableData,
13
14
  )
14
15
  from docling_core.types.doc.document import ContentLayer
@@ -103,6 +104,22 @@ class ReadingOrderModel:
103
104
  else:
104
105
  doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
105
106
 
107
+ def _create_rich_cell_group(
108
+ self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
109
+ ) -> RefItem:
110
+ """Create a group containing all child elements for a rich table cell."""
111
+ group_name = f"rich_cell_group_{len(doc.tables)}_0_0"
112
+ group_element = doc.add_group(
113
+ label=GroupLabel.UNSPECIFIED,
114
+ name=group_name,
115
+ parent=table_item,
116
+ )
117
+
118
+ # Add all child elements to the group
119
+ self._add_child_elements(element, group_element, doc)
120
+
121
+ return group_element.get_ref()
122
+
106
123
  def _readingorder_elements_to_docling_doc(
107
124
  self,
108
125
  conv_res: ConversionResult,
@@ -197,11 +214,21 @@ class ReadingOrderModel:
197
214
  )
198
215
 
199
216
  elif isinstance(element, Table):
200
- tbl_data = TableData(
201
- num_rows=element.num_rows,
202
- num_cols=element.num_cols,
203
- table_cells=element.table_cells,
204
- )
217
+ # Check if table has no structure prediction
218
+ if element.num_rows == 0 and element.num_cols == 0:
219
+ # Only create 1x1 table if there are children to put in it
220
+ if element.cluster.children:
221
+ # Create minimal 1x1 table with rich cell containing all children
222
+ tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
223
+ else:
224
+ # Create empty table with no structure
225
+ tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
226
+ else:
227
+ tbl_data = TableData(
228
+ num_rows=element.num_rows,
229
+ num_cols=element.num_cols,
230
+ table_cells=element.table_cells,
231
+ )
205
232
 
206
233
  prov = ProvenanceItem(
207
234
  page_no=element.page_no + 1,
@@ -231,6 +258,30 @@ class ReadingOrderModel:
231
258
 
232
259
  tbl.footnotes.append(new_footnote_item.get_ref())
233
260
 
261
+ # Handle case where table has no structure prediction but has children
262
+ if (
263
+ element.num_rows == 0
264
+ and element.num_cols == 0
265
+ and element.cluster.children
266
+ ):
267
+ # Create rich cell containing all child elements
268
+ rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
269
+
270
+ # Create rich table cell spanning the entire 1x1 table
271
+ rich_cell = RichTableCell(
272
+ text="", # Empty text since content is in the group
273
+ row_span=1,
274
+ col_span=1,
275
+ start_row_offset_idx=0,
276
+ end_row_offset_idx=1,
277
+ start_col_offset_idx=0,
278
+ end_col_offset_idx=1,
279
+ column_header=False,
280
+ row_header=False,
281
+ ref=rich_cell_ref,
282
+ )
283
+ out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
284
+
234
285
  # TODO: Consider adding children of Table.
235
286
 
236
287
  elif isinstance(element, FigureElement):
@@ -117,6 +117,10 @@ class TesseractOcrCliModel(BaseOcrModel):
117
117
  cmd.append("--tessdata-dir")
118
118
  cmd.append(self.options.path)
119
119
 
120
+ # Add PSM option if specified in the configuration
121
+ if self.options.psm is not None:
122
+ cmd.extend(["--psm", str(self.options.psm)])
123
+
120
124
  cmd += [ifilename, "stdout", "tsv"]
121
125
  _log.info("command: {}".format(" ".join(cmd)))
122
126
 
@@ -86,7 +86,6 @@ class TesseractOcrModel(BaseOcrModel):
86
86
  self.script_prefix = ""
87
87
 
88
88
  tesserocr_kwargs = {
89
- "psm": tesserocr.PSM.AUTO,
90
89
  "init": True,
91
90
  "oem": tesserocr.OEM.DEFAULT,
92
91
  }
@@ -96,14 +95,23 @@ class TesseractOcrModel(BaseOcrModel):
96
95
  if self.options.path is not None:
97
96
  tesserocr_kwargs["path"] = self.options.path
98
97
 
98
+ # Set main OCR reader with configurable PSM
99
+ main_psm = (
100
+ tesserocr.PSM(self.options.psm)
101
+ if self.options.psm is not None
102
+ else tesserocr.PSM.AUTO
103
+ )
99
104
  if lang == "auto":
100
- self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
105
+ self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs)
101
106
  else:
102
107
  self.reader = tesserocr.PyTessBaseAPI(
103
- **{"lang": lang} | tesserocr_kwargs,
108
+ lang=lang,
109
+ psm=main_psm,
110
+ **tesserocr_kwargs,
104
111
  )
112
+ # OSD reader must use PSM.OSD_ONLY for orientation detection
105
113
  self.osd_reader = tesserocr.PyTessBaseAPI(
106
- **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
114
+ lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs
107
115
  )
108
116
  self.reader_RIL = tesserocr.RIL
109
117
 
@@ -187,7 +195,9 @@ class TesseractOcrModel(BaseOcrModel):
187
195
  tesserocr.PyTessBaseAPI(
188
196
  path=self.reader.GetDatapath(),
189
197
  lang=lang,
190
- psm=tesserocr.PSM.AUTO,
198
+ psm=tesserocr.PSM(self.options.psm)
199
+ if self.options.psm is not None
200
+ else tesserocr.PSM.AUTO,
191
201
  init=True,
192
202
  oem=tesserocr.OEM.DEFAULT,
193
203
  )
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import os
3
3
  import re
4
+ import tempfile
4
5
  from io import BytesIO
5
6
  from pathlib import Path
6
7
  from typing import List, Optional, Union, cast
@@ -147,7 +148,25 @@ class _NativeWhisperModel:
147
148
  self.word_timestamps = asr_options.word_timestamps
148
149
 
149
150
  def run(self, conv_res: ConversionResult) -> ConversionResult:
150
- audio_path: Path = Path(conv_res.input.file).resolve()
151
+ # Access the file path from the backend, similar to how other pipelines handle it
152
+ path_or_stream = conv_res.input._backend.path_or_stream
153
+
154
+ # Handle both Path and BytesIO inputs
155
+ temp_file_path: Optional[Path] = None
156
+
157
+ if isinstance(path_or_stream, BytesIO):
158
+ # For BytesIO, write to a temporary file since whisper requires a file path
159
+ suffix = Path(conv_res.input.file.name).suffix or ".wav"
160
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
161
+ tmp_file.write(path_or_stream.getvalue())
162
+ temp_file_path = Path(tmp_file.name)
163
+ audio_path = temp_file_path
164
+ elif isinstance(path_or_stream, Path):
165
+ audio_path = path_or_stream
166
+ else:
167
+ raise RuntimeError(
168
+ f"ASR pipeline requires a file path or BytesIO stream, but got {type(path_or_stream)}"
169
+ )
151
170
 
152
171
  try:
153
172
  conversation = self.transcribe(audio_path)
@@ -167,14 +186,22 @@ class _NativeWhisperModel:
167
186
  label=DocItemLabel.TEXT, text=citem.to_string()
168
187
  )
169
188
 
170
- conv_res.status = ConversionStatus.SUCCESS
171
189
  return conv_res
172
190
 
173
191
  except Exception as exc:
174
192
  _log.error(f"Audio tranciption has an error: {exc}")
193
+ conv_res.status = ConversionStatus.FAILURE
194
+ return conv_res
175
195
 
176
- conv_res.status = ConversionStatus.FAILURE
177
- return conv_res
196
+ finally:
197
+ # Clean up temporary file if created
198
+ if temp_file_path is not None and temp_file_path.exists():
199
+ try:
200
+ temp_file_path.unlink()
201
+ except Exception as e:
202
+ _log.warning(
203
+ f"Failed to delete temporary file {temp_file_path}: {e}"
204
+ )
178
205
 
179
206
  def transcribe(self, fpath: Path) -> list[_ConversationItem]:
180
207
  result = self.model.transcribe(
@@ -221,9 +248,29 @@ class AsrPipeline(BasePipeline):
221
248
  else:
222
249
  _log.error(f"No model support for {self.pipeline_options.asr_options}")
223
250
 
251
+ def _has_text(self, document: "DoclingDocument") -> bool:
252
+ """
253
+ Helper method to check if the document contains any transcribed text.
254
+ A transcription is considered non-empty if the .texts list contains items with actual, non whitespace content.
255
+ """
256
+ if not document or not document.texts:
257
+ return False
258
+ for item in document.texts:
259
+ if item.text and item.text.strip():
260
+ return True
261
+ return False
262
+
224
263
  def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
225
- status = ConversionStatus.SUCCESS
226
- return status
264
+ """Determines the final status of ASR Conversion based on its result."""
265
+ if conv_res.status == ConversionStatus.FAILURE or conv_res.errors:
266
+ return ConversionStatus.FAILURE
267
+ if not self._has_text(conv_res.document):
268
+ _log.warning(
269
+ "ASR conversion resulted in an empty document."
270
+ f"File: {conv_res.input.file.name}"
271
+ )
272
+ return ConversionStatus.PARTIAL_SUCCESS
273
+ return ConversionStatus.SUCCESS
227
274
 
228
275
  @classmethod
229
276
  def get_default_options(cls) -> AsrPipelineOptions:
@@ -20,6 +20,7 @@ from docling.models.document_picture_classifier import DocumentPictureClassifier
20
20
  from docling.models.easyocr_model import EasyOcrModel
21
21
  from docling.models.layout_model import LayoutModel
22
22
  from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
23
+ from docling.models.rapid_ocr_model import RapidOcrModel
23
24
  from docling.models.table_structure_model import TableStructureModel
24
25
  from docling.models.utils.hf_model_download import download_hf_model
25
26
 
@@ -41,6 +42,7 @@ def download_models(
41
42
  with_smoldocling: bool = False,
42
43
  with_smoldocling_mlx: bool = False,
43
44
  with_granite_vision: bool = False,
45
+ with_rapidocr: bool = True,
44
46
  with_easyocr: bool = True,
45
47
  ):
46
48
  if output_dir is None:
@@ -135,6 +137,16 @@ def download_models(
135
137
  progress=progress,
136
138
  )
137
139
 
140
+ if with_rapidocr:
141
+ for backend in ("torch", "onnxruntime"):
142
+ _log.info(f"Downloading rapidocr {backend} models...")
143
+ RapidOcrModel.download_models(
144
+ backend=backend,
145
+ local_dir=output_dir / RapidOcrModel._model_repo_folder,
146
+ force=force,
147
+ progress=progress,
148
+ )
149
+
138
150
  if with_easyocr:
139
151
  _log.info("Downloading easyocr models...")
140
152
  EasyOcrModel.download_models(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.55.0
3
+ Version: 2.56.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -34,7 +34,8 @@ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
35
35
  Requires-Dist: huggingface_hub<1,>=0.23
36
36
  Requires-Dist: requests<3.0.0,>=2.32.2
37
- Requires-Dist: easyocr<2.0,>=1.7
37
+ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin"
38
+ Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14"
38
39
  Requires-Dist: certifi>=2024.7.4
39
40
  Requires-Dist: rtree<2.0.0,>=1.3.0
40
41
  Requires-Dist: typer<0.20.0,>=0.12.5
@@ -52,6 +53,8 @@ Requires-Dist: pylatexenc<3.0,>=2.10
52
53
  Requires-Dist: scipy<2.0.0,>=1.6.0
53
54
  Requires-Dist: accelerate<2,>=1.0.0
54
55
  Requires-Dist: polyfactory>=2.22.2
56
+ Provides-Extra: easyocr
57
+ Requires-Dist: easyocr<2.0,>=1.7; extra == "easyocr"
55
58
  Provides-Extra: tesserocr
56
59
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
57
60
  Provides-Extra: ocrmac
@@ -65,7 +68,6 @@ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
65
68
  Provides-Extra: rapidocr
66
69
  Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
67
70
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
68
- Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
69
71
  Provides-Extra: asr
70
72
  Requires-Dist: openai-whisper>=20250625; extra == "asr"
71
73
  Dynamic: license-file
@@ -10,10 +10,10 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
10
10
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
11
11
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
12
12
  docling/backend/docling_parse_v4_backend.py,sha256=xCBbaaXjNNrOaod9tmBuCbe5mL_ipmTNG2XOxVbGG3w,7891
13
- docling/backend/html_backend.py,sha256=r2m3aIKwwr8Vv2Fxri1FaZFvd4EWvTQlmSPwXeD79zg,47796
14
- docling/backend/md_backend.py,sha256=zrOUYoIYudUfigwnXRQocb_M4G_ptYfblNgr6BNTYQw,22678
13
+ docling/backend/html_backend.py,sha256=iuRyYztUduyP214X0SyDvl1dP_h0eccp5RkuM72rV8o,48664
14
+ docling/backend/md_backend.py,sha256=TWboEPHl93pqI_Go1a3XpP-KpzI3d17xo5ZW42Ul0kY,22764
15
15
  docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
16
- docling/backend/msexcel_backend.py,sha256=5JRbPwOjR1r45AMeIts1rj6InbOgLBf_CtAhvNPVmsQ,19157
16
+ docling/backend/msexcel_backend.py,sha256=GOuA-MlShpzFmCmJq3-Z28iquwWUg4k8v-AT4O-aAQI,19305
17
17
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
18
18
  docling/backend/msword_backend.py,sha256=Jfd57hzG8iFVAzqsOAHe5jG8LCHAIBXJhQCW0tESnMM,54405
19
19
  docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
@@ -31,8 +31,8 @@ docling/backend/xml/jats_backend.py,sha256=_BWpQQg3SlsHAOOj0v2qRJoVqaQzL91GqN1tK
31
31
  docling/backend/xml/uspto_backend.py,sha256=Tv4CE7V5_QwxTNJPl90CAd_mAbwaLGy8S6s6evh1Xow,70910
32
32
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
33
33
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
- docling/cli/main.py,sha256=UX-5fRGVP_yGxTQez0x1PNnaNKRgWdcXGoPCHy-0uFM,32887
35
- docling/cli/models.py,sha256=rw_2JfeJ-k_iOLpz3JfgL1QbJY__W9nE23nHdov6VfU,6252
34
+ docling/cli/main.py,sha256=cvDS6CTME2B2Mrm4l9yNynOUDVsZ9ZTlA6mM_jsa5jU,34258
35
+ docling/cli/models.py,sha256=zZBFQJAD7C5sespnYy5M__4qC_GyqAZ-QpfWtgPRDB0,6343
36
36
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
37
37
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
@@ -41,14 +41,15 @@ docling/datamodel/base_models.py,sha256=CQ6eThPzVeVD2Gq7BNz9Q5RDLwhe4NgMzk7tdLtk
41
41
  docling/datamodel/document.py,sha256=HyO3kdJcXIJ3wL95sPoL3zvsO4Rww3-qHH6IkL4I0q4,17483
42
42
  docling/datamodel/extraction.py,sha256=7dgvtK5SuvgfB8LHAwS1FwrW1kcMQJuJG0ol8uAQgoQ,1323
43
43
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
44
- docling/datamodel/pipeline_options.py,sha256=28opZ3woXA8IKaG2-BHM-lmmi-gyuScCMHGxhlxGOsk,11290
44
+ docling/datamodel/pipeline_options.py,sha256=dklSaA7P6VkjbBB-Pz2OyzO2SQuV9y0I8VVr9XHJusw,11692
45
45
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
46
46
  docling/datamodel/pipeline_options_vlm_model.py,sha256=Szdq5_MhqQ8xBCvOUkdn_LLV29ZMQJcF4xnItYlkmXQ,3090
47
47
  docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
48
48
  docling/datamodel/vlm_model_specs.py,sha256=9TTmihDEFcI-TY1jJ2GTnTcrGa3bLg0e6anN4gPtFgU,10035
49
49
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
50
  docling/models/api_vlm_model.py,sha256=iNQ9LiT031Mch-LHn8O2CskVXYkr4weEetZPxynU_9U,4236
51
- docling/models/base_model.py,sha256=LSaJWkSaDyLBVB4Fv9fkw6kmJ67QnG0t32iGn_u2WjE,7256
51
+ docling/models/auto_ocr_model.py,sha256=nn_eQfNdGUclXKrB0nodHmCqgMUNUJzG3dLq0lhlNAI,5188
52
+ docling/models/base_model.py,sha256=QEbglxu3kT6aNq3x_5jY8T_KcD_Hhv9zr0-A4Mizhco,7252
52
53
  docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
53
54
  docling/models/code_formula_model.py,sha256=XRugm4EwifLRc-TrAk-glKlktJP-nAPneKh2EOovkJU,11308
54
55
  docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
@@ -60,17 +61,17 @@ docling/models/page_preprocessing_model.py,sha256=EmusNexws5ZmR93js_saVU0BedqZ_H
60
61
  docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
61
62
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
62
63
  docling/models/picture_description_vlm_model.py,sha256=Uja_BQSk7F-U1J2hm4yeLguirUzKYv1K8zRyw1IYomY,4150
63
- docling/models/rapid_ocr_model.py,sha256=anUVUwaj9Wubgu4FnHdYMuOVkQP_hJiLY1qRToelBoc,7700
64
- docling/models/readingorder_model.py,sha256=_usJdpM4GMWeGGneEwLLxa9grIGQb0XnNMugV72jGbY,14911
64
+ docling/models/rapid_ocr_model.py,sha256=JGeed1aNO64SYFgxlOifdut4fynUJyBuyyQrfuSno-4,13182
65
+ docling/models/readingorder_model.py,sha256=-j-UuvnsYWqZvY0gByKz0bjcBwOhWQTHerCopig_jVs,17266
65
66
  docling/models/table_structure_model.py,sha256=7g_mFf1YzfF8PXQfefNu6XYZu7TzJAn86zKb6IEUdCg,12518
66
- docling/models/tesseract_ocr_cli_model.py,sha256=I3Gn28Y-LD8OfvyCElN9fLiNgpo2sT0uMkVt258253s,12881
67
- docling/models/tesseract_ocr_model.py,sha256=GdI5Cjfi87qcehVbM3wdKRvKkl_F9A4bwTUbjXZCJYA,10745
67
+ docling/models/tesseract_ocr_cli_model.py,sha256=KuO4rXc-88C2-cAymvcr41TqFi3hNg4gerEzoI3Z6m4,13039
68
+ docling/models/tesseract_ocr_model.py,sha256=W_476USwExjSfhelXG8B9eNIVXXlm_dNFA60TZ5rq7E,11216
68
69
  docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
69
70
  docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
70
71
  docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
71
72
  docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
72
73
  docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
- docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8cDucU4,886
74
+ docling/models/plugins/defaults.py,sha256=ZJq_hDg_HTmRNvM6siLBqgtHNb-oHzj3dQU_RVAbyYM,971
74
75
  docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
75
76
  docling/models/utils/generation_utils.py,sha256=0ZfMBMbolHAWjdbMza8FbD4_jQ4VY6ReUa4gqVLwMoU,5365
76
77
  docling/models/utils/hf_model_download.py,sha256=VlKna9tLIVOGQkIRQBXfDimPIIyeRV7cFCbuOVmFQiU,1092
@@ -80,7 +81,7 @@ docling/models/vlm_models_inline/mlx_model.py,sha256=ae7hDMgBsMLkqulmbKDamGSSrLJ
80
81
  docling/models/vlm_models_inline/nuextract_transformers_model.py,sha256=jLNtlkMDheUyWot7Oqq-GHQIYzJ0fZrbReq5xCnYb9E,10506
81
82
  docling/models/vlm_models_inline/vllm_model.py,sha256=vXClayYxPGX1jzQ1Rvf3vvwtW9khgApGvcRz4Qbyu7I,10293
82
83
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
- docling/pipeline/asr_pipeline.py,sha256=S55VHLoX3Mgauen1YP-PSUlI0LA1bgTgTkU-eC4U-dg,8481
84
+ docling/pipeline/asr_pipeline.py,sha256=oRluG28no3ezjbtL7nJLpDcxxxJuuULNXheq1W-qklM,10629
84
85
  docling/pipeline/base_extraction_pipeline.py,sha256=GYrEz83IXv-tdIHjtNWxMBNczFwL8SZyf9vnPJ3STaI,2627
85
86
  docling/pipeline/base_pipeline.py,sha256=NPMQDTyis-LgQ4SybY2f5AESZl5PxogF-FRQuCDckXg,12748
86
87
  docling/pipeline/extraction_vlm_pipeline.py,sha256=veUOTe8nGdnduZKaGn1RRb-NfU1H6t_EN4QAsb022Zg,8260
@@ -95,15 +96,15 @@ docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
95
96
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
96
97
  docling/utils/layout_postprocessor.py,sha256=sE9UR3Nv4iOk26uoIsN3bFioE7ScfAjj0orDBDneLXg,25166
97
98
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
98
- docling/utils/model_downloader.py,sha256=kFIxr5KUQbisQH0h8yP9GZMqsRJD3Xo1uOIiLiB1T78,4869
99
+ docling/utils/model_downloader.py,sha256=NjVn6ZhGcRwuLU93NYblRQpXOD8dB3pb1WC1bLEbF_E,5324
99
100
  docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
100
101
  docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,1842
101
102
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
102
103
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
103
104
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
104
- docling-2.55.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
105
- docling-2.55.0.dist-info/METADATA,sha256=e1RK_bATZ2Q_Ie9kC6uHFCj99D7pkW678jxk_l0CHxk,11252
106
- docling-2.55.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
107
- docling-2.55.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
108
- docling-2.55.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
109
- docling-2.55.0.dist-info/RECORD,,
105
+ docling-2.56.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
106
+ docling-2.56.0.dist-info/METADATA,sha256=jNEpaC8pNgpI_qbjYnBaBMHBoDRtBbKeXgMKhBEo_Xk,11364
107
+ docling-2.56.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
108
+ docling-2.56.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
109
+ docling-2.56.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
110
+ docling-2.56.0.dist-info/RECORD,,