docling 2.55.0__py3-none-any.whl → 2.56.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/backend/html_backend.py +36 -15
- docling/backend/md_backend.py +4 -1
- docling/backend/msexcel_backend.py +13 -9
- docling/cli/main.py +41 -9
- docling/cli/models.py +3 -1
- docling/datamodel/pipeline_options.py +15 -1
- docling/models/auto_ocr_model.py +132 -0
- docling/models/base_model.py +2 -2
- docling/models/plugins/defaults.py +2 -0
- docling/models/rapid_ocr_model.py +126 -5
- docling/models/readingorder_model.py +56 -5
- docling/models/tesseract_ocr_cli_model.py +4 -0
- docling/models/tesseract_ocr_model.py +15 -5
- docling/pipeline/asr_pipeline.py +53 -6
- docling/utils/model_downloader.py +12 -0
- {docling-2.55.0.dist-info → docling-2.56.0.dist-info}/METADATA +5 -3
- {docling-2.55.0.dist-info → docling-2.56.0.dist-info}/RECORD +21 -20
- {docling-2.55.0.dist-info → docling-2.56.0.dist-info}/WHEEL +0 -0
- {docling-2.55.0.dist-info → docling-2.56.0.dist-info}/entry_points.txt +0 -0
- {docling-2.55.0.dist-info → docling-2.56.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.55.0.dist-info → docling-2.56.0.dist-info}/top_level.txt +0 -0
docling/backend/html_backend.py
CHANGED
|
@@ -272,9 +272,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
272
272
|
for br in content("br"):
|
|
273
273
|
br.replace_with(NavigableString("\n"))
|
|
274
274
|
# set default content layer
|
|
275
|
-
|
|
275
|
+
|
|
276
|
+
# Furniture before the first heading rule, except for headers in tables
|
|
277
|
+
header = None
|
|
278
|
+
# Find all headers first
|
|
279
|
+
all_headers = content.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
|
|
280
|
+
# Keep only those that do NOT have a <table> in a parent chain
|
|
281
|
+
clean_headers = [h for h in all_headers if not h.find_parent("table")]
|
|
282
|
+
# Pick the first header from the remaining
|
|
283
|
+
if len(clean_headers):
|
|
284
|
+
header = clean_headers[0]
|
|
285
|
+
# Set starting content layer
|
|
276
286
|
self.content_layer = (
|
|
277
|
-
ContentLayer.BODY if
|
|
287
|
+
ContentLayer.BODY if header is None else ContentLayer.FURNITURE
|
|
278
288
|
)
|
|
279
289
|
# reset context
|
|
280
290
|
self.ctx = _Context()
|
|
@@ -309,9 +319,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
309
319
|
group_name: str,
|
|
310
320
|
doc: DoclingDocument,
|
|
311
321
|
docling_table: TableItem,
|
|
312
|
-
) -> tuple[bool, RefItem]:
|
|
322
|
+
) -> tuple[bool, Union[RefItem, None]]:
|
|
313
323
|
rich_table_cell = False
|
|
314
|
-
ref_for_rich_cell =
|
|
324
|
+
ref_for_rich_cell = None
|
|
325
|
+
if len(provs_in_cell) > 0:
|
|
326
|
+
ref_for_rich_cell = provs_in_cell[0]
|
|
315
327
|
if len(provs_in_cell) > 1:
|
|
316
328
|
# Cell has multiple elements, we need to group them
|
|
317
329
|
rich_table_cell = True
|
|
@@ -324,7 +336,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
324
336
|
if isinstance(pr_item, TextItem):
|
|
325
337
|
# Cell has only one element and it's just a text
|
|
326
338
|
rich_table_cell = False
|
|
327
|
-
|
|
339
|
+
try:
|
|
340
|
+
doc.delete_items(node_items=[pr_item])
|
|
341
|
+
except Exception as e:
|
|
342
|
+
_log.error(f"Error while making rich table: {e}.")
|
|
328
343
|
else:
|
|
329
344
|
rich_table_cell = True
|
|
330
345
|
ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
|
|
@@ -391,17 +406,19 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
391
406
|
|
|
392
407
|
provs_in_cell: list[RefItem] = []
|
|
393
408
|
# Parse table cell sub-tree for Rich Cells content:
|
|
409
|
+
table_level = self.level
|
|
394
410
|
provs_in_cell = self._walk(html_cell, doc)
|
|
411
|
+
# After walking sub-tree in cell, restore previously set level
|
|
412
|
+
self.level = table_level
|
|
395
413
|
|
|
396
414
|
rich_table_cell = False
|
|
397
415
|
ref_for_rich_cell = None
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
provs_in_cell, group_name, doc, docling_table
|
|
403
|
-
)
|
|
416
|
+
group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
|
|
417
|
+
rich_table_cell, ref_for_rich_cell = (
|
|
418
|
+
HTMLDocumentBackend.process_rich_table_cells(
|
|
419
|
+
provs_in_cell, group_name, doc, docling_table
|
|
404
420
|
)
|
|
421
|
+
)
|
|
405
422
|
|
|
406
423
|
# Extracting text
|
|
407
424
|
text = self.get_text(html_cell).strip()
|
|
@@ -774,13 +791,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
774
791
|
for key in self.parents.keys():
|
|
775
792
|
self.parents[key] = None
|
|
776
793
|
self.level = 0
|
|
777
|
-
|
|
794
|
+
self.parents[self.level + 1] = doc.add_title(
|
|
778
795
|
text_clean,
|
|
779
796
|
content_layer=self.content_layer,
|
|
780
797
|
formatting=annotated_text.formatting,
|
|
781
798
|
hyperlink=annotated_text.hyperlink,
|
|
782
799
|
)
|
|
783
|
-
|
|
800
|
+
p1 = self.parents[self.level + 1]
|
|
801
|
+
if p1 is not None:
|
|
802
|
+
added_ref = [p1.get_ref()]
|
|
784
803
|
# the other levels need to be lowered by 1 if a title was set
|
|
785
804
|
else:
|
|
786
805
|
level -= 1
|
|
@@ -802,7 +821,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
802
821
|
_log.debug(f"Remove the tail of level {key}")
|
|
803
822
|
self.parents[key] = None
|
|
804
823
|
self.level = level
|
|
805
|
-
|
|
824
|
+
self.parents[self.level + 1] = doc.add_heading(
|
|
806
825
|
parent=self.parents[self.level],
|
|
807
826
|
text=text_clean,
|
|
808
827
|
orig=annotated_text.text,
|
|
@@ -811,7 +830,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
|
811
830
|
formatting=annotated_text.formatting,
|
|
812
831
|
hyperlink=annotated_text.hyperlink,
|
|
813
832
|
)
|
|
814
|
-
|
|
833
|
+
p2 = self.parents[self.level + 1]
|
|
834
|
+
if p2 is not None:
|
|
835
|
+
added_ref = [p2.get_ref()]
|
|
815
836
|
self.level += 1
|
|
816
837
|
for img_tag in tag("img"):
|
|
817
838
|
if isinstance(img_tag, Tag):
|
docling/backend/md_backend.py
CHANGED
|
@@ -249,7 +249,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
|
249
249
|
|
|
250
250
|
# Iterates over all elements in the AST
|
|
251
251
|
# Check for different element types and process relevant details
|
|
252
|
-
if
|
|
252
|
+
if (
|
|
253
|
+
isinstance(element, marko.block.Heading)
|
|
254
|
+
or isinstance(element, marko.block.SetextHeading)
|
|
255
|
+
) and len(element.children) > 0:
|
|
253
256
|
self._close_table(doc)
|
|
254
257
|
_log.debug(
|
|
255
258
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
docling/backend/msexcel_backend.py
CHANGED
|
@@ -18,6 +18,7 @@ from docling_core.types.doc import (
|
|
|
18
18
|
TableData,
|
|
19
19
|
)
|
|
20
20
|
from openpyxl import load_workbook
|
|
21
|
+
from openpyxl.chartsheet.chartsheet import Chartsheet
|
|
21
22
|
from openpyxl.drawing.image import Image
|
|
22
23
|
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
|
|
23
24
|
from openpyxl.worksheet.worksheet import Worksheet
|
|
@@ -186,18 +187,18 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
186
187
|
|
|
187
188
|
if self.workbook is not None:
|
|
188
189
|
# Iterate over all sheets
|
|
189
|
-
for
|
|
190
|
-
_log.info(f"Processing sheet: {
|
|
190
|
+
for idx, name in enumerate(self.workbook.sheetnames):
|
|
191
|
+
_log.info(f"Processing sheet {idx}: {name}")
|
|
191
192
|
|
|
192
|
-
sheet = self.workbook[
|
|
193
|
-
page_no =
|
|
193
|
+
sheet = self.workbook[name]
|
|
194
|
+
page_no = idx + 1
|
|
194
195
|
# do not rely on sheet.max_column, sheet.max_row if there are images
|
|
195
196
|
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
|
|
196
197
|
|
|
197
198
|
self.parents[0] = doc.add_group(
|
|
198
199
|
parent=None,
|
|
199
200
|
label=GroupLabel.SECTION,
|
|
200
|
-
name=f"sheet: {
|
|
201
|
+
name=f"sheet: {name}",
|
|
201
202
|
content_layer=self._get_sheet_content_layer(sheet),
|
|
202
203
|
)
|
|
203
204
|
doc = self._convert_sheet(doc, sheet)
|
|
@@ -208,7 +209,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
208
209
|
|
|
209
210
|
return doc
|
|
210
211
|
|
|
211
|
-
def _convert_sheet(
|
|
212
|
+
def _convert_sheet(
|
|
213
|
+
self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet]
|
|
214
|
+
) -> DoclingDocument:
|
|
212
215
|
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
|
|
213
216
|
|
|
214
217
|
Args:
|
|
@@ -218,10 +221,11 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
218
221
|
Returns:
|
|
219
222
|
The updated DoclingDocument.
|
|
220
223
|
"""
|
|
224
|
+
if isinstance(sheet, Worksheet):
|
|
225
|
+
doc = self._find_tables_in_sheet(doc, sheet)
|
|
226
|
+
doc = self._find_images_in_sheet(doc, sheet)
|
|
221
227
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
doc = self._find_images_in_sheet(doc, sheet)
|
|
228
|
+
# TODO: parse charts in sheet
|
|
225
229
|
|
|
226
230
|
return doc
|
|
227
231
|
|
docling/cli/main.py
CHANGED
|
@@ -49,7 +49,7 @@ from docling.datamodel.document import ConversionResult
|
|
|
49
49
|
from docling.datamodel.pipeline_options import (
|
|
50
50
|
AsrPipelineOptions,
|
|
51
51
|
ConvertPipelineOptions,
|
|
52
|
-
|
|
52
|
+
OcrAutoOptions,
|
|
53
53
|
OcrOptions,
|
|
54
54
|
PaginatedPipelineOptions,
|
|
55
55
|
PdfBackend,
|
|
@@ -57,6 +57,8 @@ from docling.datamodel.pipeline_options import (
|
|
|
57
57
|
PipelineOptions,
|
|
58
58
|
ProcessingPipeline,
|
|
59
59
|
TableFormerMode,
|
|
60
|
+
TesseractCliOcrOptions,
|
|
61
|
+
TesseractOcrOptions,
|
|
60
62
|
VlmPipelineOptions,
|
|
61
63
|
)
|
|
62
64
|
from docling.datamodel.settings import settings
|
|
@@ -355,6 +357,13 @@ def convert( # noqa: C901
|
|
|
355
357
|
help="Replace any existing text with OCR generated text over the full content.",
|
|
356
358
|
),
|
|
357
359
|
] = False,
|
|
360
|
+
tables: Annotated[
|
|
361
|
+
bool,
|
|
362
|
+
typer.Option(
|
|
363
|
+
...,
|
|
364
|
+
help="If enabled, the table structure model will be used to extract table information.",
|
|
365
|
+
),
|
|
366
|
+
] = True,
|
|
358
367
|
ocr_engine: Annotated[
|
|
359
368
|
str,
|
|
360
369
|
typer.Option(
|
|
@@ -365,7 +374,7 @@ def convert( # noqa: C901
|
|
|
365
374
|
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
|
366
375
|
),
|
|
367
376
|
),
|
|
368
|
-
] =
|
|
377
|
+
] = OcrAutoOptions.kind,
|
|
369
378
|
ocr_lang: Annotated[
|
|
370
379
|
Optional[str],
|
|
371
380
|
typer.Option(
|
|
@@ -373,6 +382,13 @@ def convert( # noqa: C901
|
|
|
373
382
|
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
|
|
374
383
|
),
|
|
375
384
|
] = None,
|
|
385
|
+
psm: Annotated[
|
|
386
|
+
Optional[int],
|
|
387
|
+
typer.Option(
|
|
388
|
+
...,
|
|
389
|
+
help="Page Segmentation Mode for the OCR engine (0-13).",
|
|
390
|
+
),
|
|
391
|
+
] = None,
|
|
376
392
|
pdf_backend: Annotated[
|
|
377
393
|
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
|
378
394
|
] = PdfBackend.DLPARSE_V2,
|
|
@@ -540,13 +556,25 @@ def convert( # noqa: C901
|
|
|
540
556
|
if local_path.exists() and local_path.is_dir():
|
|
541
557
|
for fmt in from_formats:
|
|
542
558
|
for ext in FormatToExtensions[fmt]:
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
559
|
+
for path in local_path.glob(f"**/*.{ext}"):
|
|
560
|
+
if path.name.startswith("~$") and ext == "docx":
|
|
561
|
+
_log.info(
|
|
562
|
+
f"Ignoring temporary Word file: {path}"
|
|
563
|
+
)
|
|
564
|
+
continue
|
|
565
|
+
input_doc_paths.append(path)
|
|
566
|
+
|
|
567
|
+
for path in local_path.glob(f"**/*.{ext.upper()}"):
|
|
568
|
+
if path.name.startswith("~$") and ext == "docx":
|
|
569
|
+
_log.info(
|
|
570
|
+
f"Ignoring temporary Word file: {path}"
|
|
571
|
+
)
|
|
572
|
+
continue
|
|
573
|
+
input_doc_paths.append(path)
|
|
549
574
|
elif local_path.exists():
|
|
575
|
+
if not local_path.name.startswith("~$") and ext == "docx":
|
|
576
|
+
_log.info(f"Ignoring temporary Word file: {path}")
|
|
577
|
+
continue
|
|
550
578
|
input_doc_paths.append(local_path)
|
|
551
579
|
else:
|
|
552
580
|
err_console.print(
|
|
@@ -577,6 +605,10 @@ def convert( # noqa: C901
|
|
|
577
605
|
ocr_lang_list = _split_list(ocr_lang)
|
|
578
606
|
if ocr_lang_list is not None:
|
|
579
607
|
ocr_options.lang = ocr_lang_list
|
|
608
|
+
if psm is not None and isinstance(
|
|
609
|
+
ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
|
|
610
|
+
):
|
|
611
|
+
ocr_options.psm = psm
|
|
580
612
|
|
|
581
613
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
|
582
614
|
# pipeline_options: PaginatedPipelineOptions
|
|
@@ -591,7 +623,7 @@ def convert( # noqa: C901
|
|
|
591
623
|
accelerator_options=accelerator_options,
|
|
592
624
|
do_ocr=ocr,
|
|
593
625
|
ocr_options=ocr_options,
|
|
594
|
-
do_table_structure=
|
|
626
|
+
do_table_structure=tables,
|
|
595
627
|
do_code_enrichment=enrich_code,
|
|
596
628
|
do_formula_enrichment=enrich_formula,
|
|
597
629
|
do_picture_description=enrich_picture_description,
|
docling/cli/models.py
CHANGED
|
@@ -38,6 +38,7 @@ class _AvailableModels(str, Enum):
|
|
|
38
38
|
SMOLDOCLING = "smoldocling"
|
|
39
39
|
SMOLDOCLING_MLX = "smoldocling_mlx"
|
|
40
40
|
GRANITE_VISION = "granite_vision"
|
|
41
|
+
RAPIDOCR = "rapidocr"
|
|
41
42
|
EASYOCR = "easyocr"
|
|
42
43
|
|
|
43
44
|
|
|
@@ -46,7 +47,7 @@ _default_models = [
|
|
|
46
47
|
_AvailableModels.TABLEFORMER,
|
|
47
48
|
_AvailableModels.CODE_FORMULA,
|
|
48
49
|
_AvailableModels.PICTURE_CLASSIFIER,
|
|
49
|
-
_AvailableModels.
|
|
50
|
+
_AvailableModels.RAPIDOCR,
|
|
50
51
|
]
|
|
51
52
|
|
|
52
53
|
|
|
@@ -115,6 +116,7 @@ def download(
|
|
|
115
116
|
with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
|
|
116
117
|
with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
|
|
117
118
|
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
|
|
119
|
+
with_rapidocr=_AvailableModels.RAPIDOCR in to_download,
|
|
118
120
|
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
|
119
121
|
)
|
|
120
122
|
|
docling/datamodel/pipeline_options.py
CHANGED
|
@@ -81,6 +81,13 @@ class OcrOptions(BaseOptions):
|
|
|
81
81
|
)
|
|
82
82
|
|
|
83
83
|
|
|
84
|
+
class OcrAutoOptions(OcrOptions):
|
|
85
|
+
"""Options for pick OCR engine automatically."""
|
|
86
|
+
|
|
87
|
+
kind: ClassVar[Literal["auto"]] = "auto"
|
|
88
|
+
lang: List[str] = []
|
|
89
|
+
|
|
90
|
+
|
|
84
91
|
class RapidOcrOptions(OcrOptions):
|
|
85
92
|
"""Options for the RapidOCR engine."""
|
|
86
93
|
|
|
@@ -154,6 +161,9 @@ class TesseractCliOcrOptions(OcrOptions):
|
|
|
154
161
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
|
155
162
|
tesseract_cmd: str = "tesseract"
|
|
156
163
|
path: Optional[str] = None
|
|
164
|
+
psm: Optional[int] = (
|
|
165
|
+
None # Page Segmentation Mode (0-13), defaults to tesseract's default
|
|
166
|
+
)
|
|
157
167
|
|
|
158
168
|
model_config = ConfigDict(
|
|
159
169
|
extra="forbid",
|
|
@@ -166,6 +176,9 @@ class TesseractOcrOptions(OcrOptions):
|
|
|
166
176
|
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
|
|
167
177
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
|
168
178
|
path: Optional[str] = None
|
|
179
|
+
psm: Optional[int] = (
|
|
180
|
+
None # Page Segmentation Mode (0-13), defaults to tesseract's default
|
|
181
|
+
)
|
|
169
182
|
|
|
170
183
|
model_config = ConfigDict(
|
|
171
184
|
extra="forbid",
|
|
@@ -249,6 +262,7 @@ class PdfBackend(str, Enum):
|
|
|
249
262
|
class OcrEngine(str, Enum):
|
|
250
263
|
"""Enum of valid OCR engines."""
|
|
251
264
|
|
|
265
|
+
AUTO = "auto"
|
|
252
266
|
EASYOCR = "easyocr"
|
|
253
267
|
TESSERACT_CLI = "tesseract_cli"
|
|
254
268
|
TESSERACT = "tesseract"
|
|
@@ -330,7 +344,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
|
330
344
|
# If True, text from backend will be used instead of generated text
|
|
331
345
|
|
|
332
346
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
|
333
|
-
ocr_options: OcrOptions =
|
|
347
|
+
ocr_options: OcrOptions = OcrAutoOptions()
|
|
334
348
|
layout_options: LayoutOptions = LayoutOptions()
|
|
335
349
|
|
|
336
350
|
images_scale: float = 1.0
|
docling/models/auto_ocr_model.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import sys
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Type
|
|
6
|
+
|
|
7
|
+
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
8
|
+
from docling.datamodel.base_models import Page
|
|
9
|
+
from docling.datamodel.document import ConversionResult
|
|
10
|
+
from docling.datamodel.pipeline_options import (
|
|
11
|
+
EasyOcrOptions,
|
|
12
|
+
OcrAutoOptions,
|
|
13
|
+
OcrMacOptions,
|
|
14
|
+
OcrOptions,
|
|
15
|
+
RapidOcrOptions,
|
|
16
|
+
)
|
|
17
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
|
18
|
+
from docling.models.easyocr_model import EasyOcrModel
|
|
19
|
+
from docling.models.ocr_mac_model import OcrMacModel
|
|
20
|
+
from docling.models.rapid_ocr_model import RapidOcrModel
|
|
21
|
+
|
|
22
|
+
_log = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class OcrAutoModel(BaseOcrModel):
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
enabled: bool,
|
|
29
|
+
artifacts_path: Optional[Path],
|
|
30
|
+
options: OcrAutoOptions,
|
|
31
|
+
accelerator_options: AcceleratorOptions,
|
|
32
|
+
):
|
|
33
|
+
super().__init__(
|
|
34
|
+
enabled=enabled,
|
|
35
|
+
artifacts_path=artifacts_path,
|
|
36
|
+
options=options,
|
|
37
|
+
accelerator_options=accelerator_options,
|
|
38
|
+
)
|
|
39
|
+
self.options: OcrAutoOptions
|
|
40
|
+
|
|
41
|
+
self._engine: Optional[BaseOcrModel] = None
|
|
42
|
+
if self.enabled:
|
|
43
|
+
if "darwin" == sys.platform:
|
|
44
|
+
try:
|
|
45
|
+
from ocrmac import ocrmac
|
|
46
|
+
|
|
47
|
+
self._engine = OcrMacModel(
|
|
48
|
+
enabled=self.enabled,
|
|
49
|
+
artifacts_path=artifacts_path,
|
|
50
|
+
options=OcrMacOptions(
|
|
51
|
+
bitmap_area_threshold=self.options.bitmap_area_threshold,
|
|
52
|
+
force_full_page_ocr=self.options.force_full_page_ocr,
|
|
53
|
+
),
|
|
54
|
+
accelerator_options=accelerator_options,
|
|
55
|
+
)
|
|
56
|
+
_log.info("Auto OCR model selected ocrmac.")
|
|
57
|
+
except ImportError:
|
|
58
|
+
_log.info("ocrmac cannot be used because ocrmac is not installed.")
|
|
59
|
+
|
|
60
|
+
if self._engine is None:
|
|
61
|
+
try:
|
|
62
|
+
import onnxruntime
|
|
63
|
+
from rapidocr import EngineType, RapidOCR # type: ignore
|
|
64
|
+
|
|
65
|
+
self._engine = RapidOcrModel(
|
|
66
|
+
enabled=self.enabled,
|
|
67
|
+
artifacts_path=artifacts_path,
|
|
68
|
+
options=RapidOcrOptions(
|
|
69
|
+
backend="onnxruntime",
|
|
70
|
+
bitmap_area_threshold=self.options.bitmap_area_threshold,
|
|
71
|
+
force_full_page_ocr=self.options.force_full_page_ocr,
|
|
72
|
+
),
|
|
73
|
+
accelerator_options=accelerator_options,
|
|
74
|
+
)
|
|
75
|
+
_log.info("Auto OCR model selected rapidocr with onnxruntime.")
|
|
76
|
+
except ImportError:
|
|
77
|
+
_log.info(
|
|
78
|
+
"rapidocr cannot be used because onnxruntime is not installed."
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
if self._engine is None:
|
|
82
|
+
try:
|
|
83
|
+
import easyocr
|
|
84
|
+
|
|
85
|
+
self._engine = EasyOcrModel(
|
|
86
|
+
enabled=self.enabled,
|
|
87
|
+
artifacts_path=artifacts_path,
|
|
88
|
+
options=EasyOcrOptions(
|
|
89
|
+
bitmap_area_threshold=self.options.bitmap_area_threshold,
|
|
90
|
+
force_full_page_ocr=self.options.force_full_page_ocr,
|
|
91
|
+
),
|
|
92
|
+
accelerator_options=accelerator_options,
|
|
93
|
+
)
|
|
94
|
+
_log.info("Auto OCR model selected easyocr.")
|
|
95
|
+
except ImportError:
|
|
96
|
+
_log.info("easyocr cannot be used because it is not installed.")
|
|
97
|
+
|
|
98
|
+
if self._engine is None:
|
|
99
|
+
try:
|
|
100
|
+
import torch
|
|
101
|
+
from rapidocr import EngineType, RapidOCR # type: ignore
|
|
102
|
+
|
|
103
|
+
self._engine = RapidOcrModel(
|
|
104
|
+
enabled=self.enabled,
|
|
105
|
+
artifacts_path=artifacts_path,
|
|
106
|
+
options=RapidOcrOptions(
|
|
107
|
+
backend="torch",
|
|
108
|
+
bitmap_area_threshold=self.options.bitmap_area_threshold,
|
|
109
|
+
force_full_page_ocr=self.options.force_full_page_ocr,
|
|
110
|
+
),
|
|
111
|
+
accelerator_options=accelerator_options,
|
|
112
|
+
)
|
|
113
|
+
_log.info("Auto OCR model selected rapidocr with torch.")
|
|
114
|
+
except ImportError:
|
|
115
|
+
_log.info(
|
|
116
|
+
"rapidocr cannot be used because rapidocr or torch is not installed."
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
if self._engine is None:
|
|
120
|
+
_log.warning("No OCR engine found. Please review the install details.")
|
|
121
|
+
|
|
122
|
+
def __call__(
|
|
123
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
|
124
|
+
) -> Iterable[Page]:
|
|
125
|
+
if not self.enabled or self._engine is None:
|
|
126
|
+
yield from page_batch
|
|
127
|
+
return
|
|
128
|
+
yield from self._engine(conv_res, page_batch)
|
|
129
|
+
|
|
130
|
+
@classmethod
|
|
131
|
+
def get_options_type(cls) -> Type[OcrOptions]:
|
|
132
|
+
return OcrAutoOptions
|
docling/models/base_model.py
CHANGED
|
@@ -173,11 +173,11 @@ class BaseItemAndImageEnrichmentModel(
|
|
|
173
173
|
assert isinstance(element, DocItem)
|
|
174
174
|
|
|
175
175
|
# Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
|
|
176
|
-
if
|
|
176
|
+
if isinstance(element, PictureItem):
|
|
177
177
|
embedded_im = element.get_image(conv_res.document)
|
|
178
178
|
if embedded_im is not None:
|
|
179
179
|
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
|
|
180
|
-
|
|
180
|
+
elif len(element.prov) == 0:
|
|
181
181
|
return None
|
|
182
182
|
|
|
183
183
|
# Crop the image form the page
|
docling/models/plugins/defaults.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
def ocr_engines():
|
|
2
|
+
from docling.models.auto_ocr_model import OcrAutoModel
|
|
2
3
|
from docling.models.easyocr_model import EasyOcrModel
|
|
3
4
|
from docling.models.ocr_mac_model import OcrMacModel
|
|
4
5
|
from docling.models.rapid_ocr_model import RapidOcrModel
|
|
@@ -7,6 +8,7 @@ def ocr_engines():
|
|
|
7
8
|
|
|
8
9
|
return {
|
|
9
10
|
"ocr_engines": [
|
|
11
|
+
OcrAutoModel,
|
|
10
12
|
EasyOcrModel,
|
|
11
13
|
OcrMacModel,
|
|
12
14
|
RapidOcrModel,
|
docling/models/rapid_ocr_model.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from collections.abc import Iterable
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Optional, Type
|
|
4
|
+
from typing import Literal, Optional, Type, TypedDict
|
|
5
5
|
|
|
6
6
|
import numpy
|
|
7
7
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
@@ -18,11 +18,67 @@ from docling.datamodel.settings import settings
|
|
|
18
18
|
from docling.models.base_ocr_model import BaseOcrModel
|
|
19
19
|
from docling.utils.accelerator_utils import decide_device
|
|
20
20
|
from docling.utils.profiling import TimeRecorder
|
|
21
|
+
from docling.utils.utils import download_url_with_progress
|
|
21
22
|
|
|
22
23
|
_log = logging.getLogger(__name__)
|
|
23
24
|
|
|
25
|
+
_ModelPathEngines = Literal["onnxruntime", "torch"]
|
|
26
|
+
_ModelPathTypes = Literal[
|
|
27
|
+
"det_model_path", "cls_model_path", "rec_model_path", "rec_keys_path"
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class _ModelPathDetail(TypedDict):
|
|
32
|
+
url: str
|
|
33
|
+
path: str
|
|
34
|
+
|
|
24
35
|
|
|
25
36
|
class RapidOcrModel(BaseOcrModel):
|
|
37
|
+
_model_repo_folder = "RapidOcr"
|
|
38
|
+
# from https://github.com/RapidAI/RapidOCR/blob/main/python/rapidocr/default_models.yaml
|
|
39
|
+
# matching the default config in https://github.com/RapidAI/RapidOCR/blob/main/python/rapidocr/config.yaml
|
|
40
|
+
# and naming f"{file_info.engine_type.value}.{file_info.ocr_version.value}.{file_info.task_type.value}"
|
|
41
|
+
_default_models: dict[
|
|
42
|
+
_ModelPathEngines, dict[_ModelPathTypes, _ModelPathDetail]
|
|
43
|
+
] = {
|
|
44
|
+
"onnxruntime": {
|
|
45
|
+
"det_model_path": {
|
|
46
|
+
"url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
|
|
47
|
+
"path": "onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
|
|
48
|
+
},
|
|
49
|
+
"cls_model_path": {
|
|
50
|
+
"url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
|
|
51
|
+
"path": "onnx/PP-OCRv4/cls/ch_ppocr_mobile_v2.0_cls_infer.onnx",
|
|
52
|
+
},
|
|
53
|
+
"rec_model_path": {
|
|
54
|
+
"url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/onnx/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.onnx",
|
|
55
|
+
"path": "onnx/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.onnx",
|
|
56
|
+
},
|
|
57
|
+
"rec_keys_path": {
|
|
58
|
+
"url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v2.0.7/paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
|
|
59
|
+
"path": "paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
|
|
60
|
+
},
|
|
61
|
+
},
|
|
62
|
+
"torch": {
|
|
63
|
+
"det_model_path": {
|
|
64
|
+
"url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth",
|
|
65
|
+
"path": "torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth",
|
|
66
|
+
},
|
|
67
|
+
"cls_model_path": {
|
|
68
|
+
"url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth",
|
|
69
|
+
"path": "torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls_infer.pth",
|
|
70
|
+
},
|
|
71
|
+
"rec_model_path": {
|
|
72
|
+
"url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth",
|
|
73
|
+
"path": "torch/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer.pth",
|
|
74
|
+
},
|
|
75
|
+
"rec_keys_path": {
|
|
76
|
+
"url": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
|
|
77
|
+
"path": "paddle/PP-OCRv4/rec/ch_PP-OCRv4_rec_infer/ppocr_keys_v1.txt",
|
|
78
|
+
},
|
|
79
|
+
},
|
|
80
|
+
}
|
|
81
|
+
|
|
26
82
|
def __init__(
|
|
27
83
|
self,
|
|
28
84
|
enabled: bool,
|
|
@@ -62,25 +118,66 @@ class RapidOcrModel(BaseOcrModel):
|
|
|
62
118
|
}
|
|
63
119
|
backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)
|
|
64
120
|
|
|
121
|
+
det_model_path = self.options.det_model_path
|
|
122
|
+
cls_model_path = self.options.cls_model_path
|
|
123
|
+
rec_model_path = self.options.rec_model_path
|
|
124
|
+
rec_keys_path = self.options.rec_keys_path
|
|
125
|
+
if artifacts_path is not None:
|
|
126
|
+
det_model_path = (
|
|
127
|
+
det_model_path
|
|
128
|
+
or artifacts_path
|
|
129
|
+
/ self._model_repo_folder
|
|
130
|
+
/ self._default_models[backend_enum.value]["det_model_path"]["path"]
|
|
131
|
+
)
|
|
132
|
+
cls_model_path = (
|
|
133
|
+
cls_model_path
|
|
134
|
+
or artifacts_path
|
|
135
|
+
/ self._model_repo_folder
|
|
136
|
+
/ self._default_models[backend_enum.value]["cls_model_path"]["path"]
|
|
137
|
+
)
|
|
138
|
+
rec_model_path = (
|
|
139
|
+
rec_model_path
|
|
140
|
+
or artifacts_path
|
|
141
|
+
/ self._model_repo_folder
|
|
142
|
+
/ self._default_models[backend_enum.value]["rec_model_path"]["path"]
|
|
143
|
+
)
|
|
144
|
+
rec_keys_path = (
|
|
145
|
+
rec_keys_path
|
|
146
|
+
or artifacts_path
|
|
147
|
+
/ self._model_repo_folder
|
|
148
|
+
/ self._default_models[backend_enum.value]["rec_keys_path"]["path"]
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
for model_path in (
|
|
152
|
+
rec_keys_path,
|
|
153
|
+
cls_model_path,
|
|
154
|
+
rec_model_path,
|
|
155
|
+
rec_keys_path,
|
|
156
|
+
):
|
|
157
|
+
if model_path is None:
|
|
158
|
+
continue
|
|
159
|
+
if not Path(model_path).exists():
|
|
160
|
+
_log.warning(f"The provided model path {model_path} is not found.")
|
|
161
|
+
|
|
65
162
|
params = {
|
|
66
163
|
# Global settings (these are still correct)
|
|
67
164
|
"Global.text_score": self.options.text_score,
|
|
68
165
|
"Global.font_path": self.options.font_path,
|
|
69
166
|
# "Global.verbose": self.options.print_verbose,
|
|
70
167
|
# Detection model settings
|
|
71
|
-
"Det.model_path":
|
|
168
|
+
"Det.model_path": det_model_path,
|
|
72
169
|
"Det.use_cuda": use_cuda,
|
|
73
170
|
"Det.use_dml": use_dml,
|
|
74
171
|
"Det.intra_op_num_threads": intra_op_num_threads,
|
|
75
172
|
# Classification model settings
|
|
76
|
-
"Cls.model_path":
|
|
173
|
+
"Cls.model_path": cls_model_path,
|
|
77
174
|
"Cls.use_cuda": use_cuda,
|
|
78
175
|
"Cls.use_dml": use_dml,
|
|
79
176
|
"Cls.intra_op_num_threads": intra_op_num_threads,
|
|
80
177
|
# Recognition model settings
|
|
81
|
-
"Rec.model_path":
|
|
178
|
+
"Rec.model_path": rec_model_path,
|
|
82
179
|
"Rec.font_path": self.options.rec_font_path,
|
|
83
|
-
"Rec.keys_path":
|
|
180
|
+
"Rec.keys_path": rec_keys_path,
|
|
84
181
|
"Rec.use_cuda": use_cuda,
|
|
85
182
|
"Rec.use_dml": use_dml,
|
|
86
183
|
"Rec.intra_op_num_threads": intra_op_num_threads,
|
|
@@ -102,6 +199,30 @@ class RapidOcrModel(BaseOcrModel):
|
|
|
102
199
|
params=params,
|
|
103
200
|
)
|
|
104
201
|
|
|
202
|
+
@staticmethod
|
|
203
|
+
def download_models(
|
|
204
|
+
backend: _ModelPathEngines,
|
|
205
|
+
local_dir: Optional[Path] = None,
|
|
206
|
+
force: bool = False,
|
|
207
|
+
progress: bool = False,
|
|
208
|
+
) -> Path:
|
|
209
|
+
if local_dir is None:
|
|
210
|
+
local_dir = settings.cache_dir / "models" / RapidOcrModel._model_repo_folder
|
|
211
|
+
|
|
212
|
+
local_dir.mkdir(parents=True, exist_ok=True)
|
|
213
|
+
|
|
214
|
+
# Download models
|
|
215
|
+
for model_type, model_details in RapidOcrModel._default_models[backend].items():
|
|
216
|
+
output_path = local_dir / model_details["path"]
|
|
217
|
+
if output_path.exists() and not force:
|
|
218
|
+
continue
|
|
219
|
+
output_path.parent.mkdir(exist_ok=True, parents=True)
|
|
220
|
+
buf = download_url_with_progress(model_details["url"], progress=progress)
|
|
221
|
+
with output_path.open("wb") as fw:
|
|
222
|
+
fw.write(buf.read())
|
|
223
|
+
|
|
224
|
+
return local_dir
|
|
225
|
+
|
|
105
226
|
def __call__(
|
|
106
227
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
|
107
228
|
) -> Iterable[Page]:
|
docling/models/readingorder_model.py
CHANGED
|
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
|
|
|
9
9
|
NodeItem,
|
|
10
10
|
ProvenanceItem,
|
|
11
11
|
RefItem,
|
|
12
|
+
RichTableCell,
|
|
12
13
|
TableData,
|
|
13
14
|
)
|
|
14
15
|
from docling_core.types.doc.document import ContentLayer
|
|
@@ -103,6 +104,22 @@ class ReadingOrderModel:
|
|
|
103
104
|
else:
|
|
104
105
|
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
|
105
106
|
|
|
107
|
+
def _create_rich_cell_group(
|
|
108
|
+
self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
|
|
109
|
+
) -> RefItem:
|
|
110
|
+
"""Create a group containing all child elements for a rich table cell."""
|
|
111
|
+
group_name = f"rich_cell_group_{len(doc.tables)}_0_0"
|
|
112
|
+
group_element = doc.add_group(
|
|
113
|
+
label=GroupLabel.UNSPECIFIED,
|
|
114
|
+
name=group_name,
|
|
115
|
+
parent=table_item,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# Add all child elements to the group
|
|
119
|
+
self._add_child_elements(element, group_element, doc)
|
|
120
|
+
|
|
121
|
+
return group_element.get_ref()
|
|
122
|
+
|
|
106
123
|
def _readingorder_elements_to_docling_doc(
|
|
107
124
|
self,
|
|
108
125
|
conv_res: ConversionResult,
|
|
@@ -197,11 +214,21 @@ class ReadingOrderModel:
|
|
|
197
214
|
)
|
|
198
215
|
|
|
199
216
|
elif isinstance(element, Table):
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
217
|
+
# Check if table has no structure prediction
|
|
218
|
+
if element.num_rows == 0 and element.num_cols == 0:
|
|
219
|
+
# Only create 1x1 table if there are children to put in it
|
|
220
|
+
if element.cluster.children:
|
|
221
|
+
# Create minimal 1x1 table with rich cell containing all children
|
|
222
|
+
tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
|
|
223
|
+
else:
|
|
224
|
+
# Create empty table with no structure
|
|
225
|
+
tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
|
|
226
|
+
else:
|
|
227
|
+
tbl_data = TableData(
|
|
228
|
+
num_rows=element.num_rows,
|
|
229
|
+
num_cols=element.num_cols,
|
|
230
|
+
table_cells=element.table_cells,
|
|
231
|
+
)
|
|
205
232
|
|
|
206
233
|
prov = ProvenanceItem(
|
|
207
234
|
page_no=element.page_no + 1,
|
|
@@ -231,6 +258,30 @@ class ReadingOrderModel:
|
|
|
231
258
|
|
|
232
259
|
tbl.footnotes.append(new_footnote_item.get_ref())
|
|
233
260
|
|
|
261
|
+
# Handle case where table has no structure prediction but has children
|
|
262
|
+
if (
|
|
263
|
+
element.num_rows == 0
|
|
264
|
+
and element.num_cols == 0
|
|
265
|
+
and element.cluster.children
|
|
266
|
+
):
|
|
267
|
+
# Create rich cell containing all child elements
|
|
268
|
+
rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
|
|
269
|
+
|
|
270
|
+
# Create rich table cell spanning the entire 1x1 table
|
|
271
|
+
rich_cell = RichTableCell(
|
|
272
|
+
text="", # Empty text since content is in the group
|
|
273
|
+
row_span=1,
|
|
274
|
+
col_span=1,
|
|
275
|
+
start_row_offset_idx=0,
|
|
276
|
+
end_row_offset_idx=1,
|
|
277
|
+
start_col_offset_idx=0,
|
|
278
|
+
end_col_offset_idx=1,
|
|
279
|
+
column_header=False,
|
|
280
|
+
row_header=False,
|
|
281
|
+
ref=rich_cell_ref,
|
|
282
|
+
)
|
|
283
|
+
out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
|
|
284
|
+
|
|
234
285
|
# TODO: Consider adding children of Table.
|
|
235
286
|
|
|
236
287
|
elif isinstance(element, FigureElement):
|
|
@@ -117,6 +117,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
|
117
117
|
cmd.append("--tessdata-dir")
|
|
118
118
|
cmd.append(self.options.path)
|
|
119
119
|
|
|
120
|
+
# Add PSM option if specified in the configuration
|
|
121
|
+
if self.options.psm is not None:
|
|
122
|
+
cmd.extend(["--psm", str(self.options.psm)])
|
|
123
|
+
|
|
120
124
|
cmd += [ifilename, "stdout", "tsv"]
|
|
121
125
|
_log.info("command: {}".format(" ".join(cmd)))
|
|
122
126
|
|
|
@@ -86,7 +86,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|
|
86
86
|
self.script_prefix = ""
|
|
87
87
|
|
|
88
88
|
tesserocr_kwargs = {
|
|
89
|
-
"psm": tesserocr.PSM.AUTO,
|
|
90
89
|
"init": True,
|
|
91
90
|
"oem": tesserocr.OEM.DEFAULT,
|
|
92
91
|
}
|
|
@@ -96,14 +95,23 @@ class TesseractOcrModel(BaseOcrModel):
|
|
|
96
95
|
if self.options.path is not None:
|
|
97
96
|
tesserocr_kwargs["path"] = self.options.path
|
|
98
97
|
|
|
98
|
+
# Set main OCR reader with configurable PSM
|
|
99
|
+
main_psm = (
|
|
100
|
+
tesserocr.PSM(self.options.psm)
|
|
101
|
+
if self.options.psm is not None
|
|
102
|
+
else tesserocr.PSM.AUTO
|
|
103
|
+
)
|
|
99
104
|
if lang == "auto":
|
|
100
|
-
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
|
105
|
+
self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs)
|
|
101
106
|
else:
|
|
102
107
|
self.reader = tesserocr.PyTessBaseAPI(
|
|
103
|
-
|
|
108
|
+
lang=lang,
|
|
109
|
+
psm=main_psm,
|
|
110
|
+
**tesserocr_kwargs,
|
|
104
111
|
)
|
|
112
|
+
# OSD reader must use PSM.OSD_ONLY for orientation detection
|
|
105
113
|
self.osd_reader = tesserocr.PyTessBaseAPI(
|
|
106
|
-
|
|
114
|
+
lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs
|
|
107
115
|
)
|
|
108
116
|
self.reader_RIL = tesserocr.RIL
|
|
109
117
|
|
|
@@ -187,7 +195,9 @@ class TesseractOcrModel(BaseOcrModel):
|
|
|
187
195
|
tesserocr.PyTessBaseAPI(
|
|
188
196
|
path=self.reader.GetDatapath(),
|
|
189
197
|
lang=lang,
|
|
190
|
-
psm=tesserocr.PSM.
|
|
198
|
+
psm=tesserocr.PSM(self.options.psm)
|
|
199
|
+
if self.options.psm is not None
|
|
200
|
+
else tesserocr.PSM.AUTO,
|
|
191
201
|
init=True,
|
|
192
202
|
oem=tesserocr.OEM.DEFAULT,
|
|
193
203
|
)
|
docling/pipeline/asr_pipeline.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import os
|
|
3
3
|
import re
|
|
4
|
+
import tempfile
|
|
4
5
|
from io import BytesIO
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import List, Optional, Union, cast
|
|
@@ -147,7 +148,25 @@ class _NativeWhisperModel:
|
|
|
147
148
|
self.word_timestamps = asr_options.word_timestamps
|
|
148
149
|
|
|
149
150
|
def run(self, conv_res: ConversionResult) -> ConversionResult:
|
|
150
|
-
|
|
151
|
+
# Access the file path from the backend, similar to how other pipelines handle it
|
|
152
|
+
path_or_stream = conv_res.input._backend.path_or_stream
|
|
153
|
+
|
|
154
|
+
# Handle both Path and BytesIO inputs
|
|
155
|
+
temp_file_path: Optional[Path] = None
|
|
156
|
+
|
|
157
|
+
if isinstance(path_or_stream, BytesIO):
|
|
158
|
+
# For BytesIO, write to a temporary file since whisper requires a file path
|
|
159
|
+
suffix = Path(conv_res.input.file.name).suffix or ".wav"
|
|
160
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
|
|
161
|
+
tmp_file.write(path_or_stream.getvalue())
|
|
162
|
+
temp_file_path = Path(tmp_file.name)
|
|
163
|
+
audio_path = temp_file_path
|
|
164
|
+
elif isinstance(path_or_stream, Path):
|
|
165
|
+
audio_path = path_or_stream
|
|
166
|
+
else:
|
|
167
|
+
raise RuntimeError(
|
|
168
|
+
f"ASR pipeline requires a file path or BytesIO stream, but got {type(path_or_stream)}"
|
|
169
|
+
)
|
|
151
170
|
|
|
152
171
|
try:
|
|
153
172
|
conversation = self.transcribe(audio_path)
|
|
@@ -167,14 +186,22 @@ class _NativeWhisperModel:
|
|
|
167
186
|
label=DocItemLabel.TEXT, text=citem.to_string()
|
|
168
187
|
)
|
|
169
188
|
|
|
170
|
-
conv_res.status = ConversionStatus.SUCCESS
|
|
171
189
|
return conv_res
|
|
172
190
|
|
|
173
191
|
except Exception as exc:
|
|
174
192
|
_log.error(f"Audio tranciption has an error: {exc}")
|
|
193
|
+
conv_res.status = ConversionStatus.FAILURE
|
|
194
|
+
return conv_res
|
|
175
195
|
|
|
176
|
-
|
|
177
|
-
|
|
196
|
+
finally:
|
|
197
|
+
# Clean up temporary file if created
|
|
198
|
+
if temp_file_path is not None and temp_file_path.exists():
|
|
199
|
+
try:
|
|
200
|
+
temp_file_path.unlink()
|
|
201
|
+
except Exception as e:
|
|
202
|
+
_log.warning(
|
|
203
|
+
f"Failed to delete temporary file {temp_file_path}: {e}"
|
|
204
|
+
)
|
|
178
205
|
|
|
179
206
|
def transcribe(self, fpath: Path) -> list[_ConversationItem]:
|
|
180
207
|
result = self.model.transcribe(
|
|
@@ -221,9 +248,29 @@ class AsrPipeline(BasePipeline):
|
|
|
221
248
|
else:
|
|
222
249
|
_log.error(f"No model support for {self.pipeline_options.asr_options}")
|
|
223
250
|
|
|
251
|
+
def _has_text(self, document: "DoclingDocument") -> bool:
|
|
252
|
+
"""
|
|
253
|
+
Helper method to check if the document contains any transcribed text.
|
|
254
|
+
A transcription is considered non-empty if the .texts list contains items with actual, non whitespace content.
|
|
255
|
+
"""
|
|
256
|
+
if not document or not document.texts:
|
|
257
|
+
return False
|
|
258
|
+
for item in document.texts:
|
|
259
|
+
if item.text and item.text.strip():
|
|
260
|
+
return True
|
|
261
|
+
return False
|
|
262
|
+
|
|
224
263
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
|
225
|
-
status
|
|
226
|
-
|
|
264
|
+
"""Determines the final status of ASR Conversion based on its result."""
|
|
265
|
+
if conv_res.status == ConversionStatus.FAILURE or conv_res.errors:
|
|
266
|
+
return ConversionStatus.FAILURE
|
|
267
|
+
if not self._has_text(conv_res.document):
|
|
268
|
+
_log.warning(
|
|
269
|
+
"ASR conversion resulted in an empty document."
|
|
270
|
+
f"File: {conv_res.input.file.name}"
|
|
271
|
+
)
|
|
272
|
+
return ConversionStatus.PARTIAL_SUCCESS
|
|
273
|
+
return ConversionStatus.SUCCESS
|
|
227
274
|
|
|
228
275
|
@classmethod
|
|
229
276
|
def get_default_options(cls) -> AsrPipelineOptions:
|
|
@@ -20,6 +20,7 @@ from docling.models.document_picture_classifier import DocumentPictureClassifier
|
|
|
20
20
|
from docling.models.easyocr_model import EasyOcrModel
|
|
21
21
|
from docling.models.layout_model import LayoutModel
|
|
22
22
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
|
23
|
+
from docling.models.rapid_ocr_model import RapidOcrModel
|
|
23
24
|
from docling.models.table_structure_model import TableStructureModel
|
|
24
25
|
from docling.models.utils.hf_model_download import download_hf_model
|
|
25
26
|
|
|
@@ -41,6 +42,7 @@ def download_models(
|
|
|
41
42
|
with_smoldocling: bool = False,
|
|
42
43
|
with_smoldocling_mlx: bool = False,
|
|
43
44
|
with_granite_vision: bool = False,
|
|
45
|
+
with_rapidocr: bool = True,
|
|
44
46
|
with_easyocr: bool = True,
|
|
45
47
|
):
|
|
46
48
|
if output_dir is None:
|
|
@@ -135,6 +137,16 @@ def download_models(
|
|
|
135
137
|
progress=progress,
|
|
136
138
|
)
|
|
137
139
|
|
|
140
|
+
if with_rapidocr:
|
|
141
|
+
for backend in ("torch", "onnxruntime"):
|
|
142
|
+
_log.info(f"Downloading rapidocr {backend} models...")
|
|
143
|
+
RapidOcrModel.download_models(
|
|
144
|
+
backend=backend,
|
|
145
|
+
local_dir=output_dir / RapidOcrModel._model_repo_folder,
|
|
146
|
+
force=force,
|
|
147
|
+
progress=progress,
|
|
148
|
+
)
|
|
149
|
+
|
|
138
150
|
if with_easyocr:
|
|
139
151
|
_log.info("Downloading easyocr models...")
|
|
140
152
|
EasyOcrModel.download_models(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.56.0
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -34,7 +34,8 @@ Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
|
|
34
34
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
|
35
35
|
Requires-Dist: huggingface_hub<1,>=0.23
|
|
36
36
|
Requires-Dist: requests<3.0.0,>=2.32.2
|
|
37
|
-
Requires-Dist:
|
|
37
|
+
Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin"
|
|
38
|
+
Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14"
|
|
38
39
|
Requires-Dist: certifi>=2024.7.4
|
|
39
40
|
Requires-Dist: rtree<2.0.0,>=1.3.0
|
|
40
41
|
Requires-Dist: typer<0.20.0,>=0.12.5
|
|
@@ -52,6 +53,8 @@ Requires-Dist: pylatexenc<3.0,>=2.10
|
|
|
52
53
|
Requires-Dist: scipy<2.0.0,>=1.6.0
|
|
53
54
|
Requires-Dist: accelerate<2,>=1.0.0
|
|
54
55
|
Requires-Dist: polyfactory>=2.22.2
|
|
56
|
+
Provides-Extra: easyocr
|
|
57
|
+
Requires-Dist: easyocr<2.0,>=1.7; extra == "easyocr"
|
|
55
58
|
Provides-Extra: tesserocr
|
|
56
59
|
Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
|
|
57
60
|
Provides-Extra: ocrmac
|
|
@@ -65,7 +68,6 @@ Requires-Dist: qwen-vl-utils>=0.0.11; extra == "vlm"
|
|
|
65
68
|
Provides-Extra: rapidocr
|
|
66
69
|
Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
|
|
67
70
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
|
68
|
-
Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
|
|
69
71
|
Provides-Extra: asr
|
|
70
72
|
Requires-Dist: openai-whisper>=20250625; extra == "asr"
|
|
71
73
|
Dynamic: license-file
|
|
@@ -10,10 +10,10 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
|
|
|
10
10
|
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
|
11
11
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
|
12
12
|
docling/backend/docling_parse_v4_backend.py,sha256=xCBbaaXjNNrOaod9tmBuCbe5mL_ipmTNG2XOxVbGG3w,7891
|
|
13
|
-
docling/backend/html_backend.py,sha256=
|
|
14
|
-
docling/backend/md_backend.py,sha256=
|
|
13
|
+
docling/backend/html_backend.py,sha256=iuRyYztUduyP214X0SyDvl1dP_h0eccp5RkuM72rV8o,48664
|
|
14
|
+
docling/backend/md_backend.py,sha256=TWboEPHl93pqI_Go1a3XpP-KpzI3d17xo5ZW42Ul0kY,22764
|
|
15
15
|
docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
|
|
16
|
-
docling/backend/msexcel_backend.py,sha256=
|
|
16
|
+
docling/backend/msexcel_backend.py,sha256=GOuA-MlShpzFmCmJq3-Z28iquwWUg4k8v-AT4O-aAQI,19305
|
|
17
17
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
|
18
18
|
docling/backend/msword_backend.py,sha256=Jfd57hzG8iFVAzqsOAHe5jG8LCHAIBXJhQCW0tESnMM,54405
|
|
19
19
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
|
@@ -31,8 +31,8 @@ docling/backend/xml/jats_backend.py,sha256=_BWpQQg3SlsHAOOj0v2qRJoVqaQzL91GqN1tK
|
|
|
31
31
|
docling/backend/xml/uspto_backend.py,sha256=Tv4CE7V5_QwxTNJPl90CAd_mAbwaLGy8S6s6evh1Xow,70910
|
|
32
32
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
|
33
33
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
|
-
docling/cli/main.py,sha256=
|
|
35
|
-
docling/cli/models.py,sha256=
|
|
34
|
+
docling/cli/main.py,sha256=cvDS6CTME2B2Mrm4l9yNynOUDVsZ9ZTlA6mM_jsa5jU,34258
|
|
35
|
+
docling/cli/models.py,sha256=zZBFQJAD7C5sespnYy5M__4qC_GyqAZ-QpfWtgPRDB0,6343
|
|
36
36
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
|
37
37
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
38
38
|
docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
|
|
@@ -41,14 +41,15 @@ docling/datamodel/base_models.py,sha256=CQ6eThPzVeVD2Gq7BNz9Q5RDLwhe4NgMzk7tdLtk
|
|
|
41
41
|
docling/datamodel/document.py,sha256=HyO3kdJcXIJ3wL95sPoL3zvsO4Rww3-qHH6IkL4I0q4,17483
|
|
42
42
|
docling/datamodel/extraction.py,sha256=7dgvtK5SuvgfB8LHAwS1FwrW1kcMQJuJG0ol8uAQgoQ,1323
|
|
43
43
|
docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
|
|
44
|
-
docling/datamodel/pipeline_options.py,sha256=
|
|
44
|
+
docling/datamodel/pipeline_options.py,sha256=dklSaA7P6VkjbBB-Pz2OyzO2SQuV9y0I8VVr9XHJusw,11692
|
|
45
45
|
docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
|
|
46
46
|
docling/datamodel/pipeline_options_vlm_model.py,sha256=Szdq5_MhqQ8xBCvOUkdn_LLV29ZMQJcF4xnItYlkmXQ,3090
|
|
47
47
|
docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
|
|
48
48
|
docling/datamodel/vlm_model_specs.py,sha256=9TTmihDEFcI-TY1jJ2GTnTcrGa3bLg0e6anN4gPtFgU,10035
|
|
49
49
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
50
|
docling/models/api_vlm_model.py,sha256=iNQ9LiT031Mch-LHn8O2CskVXYkr4weEetZPxynU_9U,4236
|
|
51
|
-
docling/models/
|
|
51
|
+
docling/models/auto_ocr_model.py,sha256=nn_eQfNdGUclXKrB0nodHmCqgMUNUJzG3dLq0lhlNAI,5188
|
|
52
|
+
docling/models/base_model.py,sha256=QEbglxu3kT6aNq3x_5jY8T_KcD_Hhv9zr0-A4Mizhco,7252
|
|
52
53
|
docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
|
|
53
54
|
docling/models/code_formula_model.py,sha256=XRugm4EwifLRc-TrAk-glKlktJP-nAPneKh2EOovkJU,11308
|
|
54
55
|
docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
|
|
@@ -60,17 +61,17 @@ docling/models/page_preprocessing_model.py,sha256=EmusNexws5ZmR93js_saVU0BedqZ_H
|
|
|
60
61
|
docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
|
|
61
62
|
docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
|
|
62
63
|
docling/models/picture_description_vlm_model.py,sha256=Uja_BQSk7F-U1J2hm4yeLguirUzKYv1K8zRyw1IYomY,4150
|
|
63
|
-
docling/models/rapid_ocr_model.py,sha256=
|
|
64
|
-
docling/models/readingorder_model.py,sha256
|
|
64
|
+
docling/models/rapid_ocr_model.py,sha256=JGeed1aNO64SYFgxlOifdut4fynUJyBuyyQrfuSno-4,13182
|
|
65
|
+
docling/models/readingorder_model.py,sha256=-j-UuvnsYWqZvY0gByKz0bjcBwOhWQTHerCopig_jVs,17266
|
|
65
66
|
docling/models/table_structure_model.py,sha256=7g_mFf1YzfF8PXQfefNu6XYZu7TzJAn86zKb6IEUdCg,12518
|
|
66
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
|
67
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
|
67
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=KuO4rXc-88C2-cAymvcr41TqFi3hNg4gerEzoI3Z6m4,13039
|
|
68
|
+
docling/models/tesseract_ocr_model.py,sha256=W_476USwExjSfhelXG8B9eNIVXXlm_dNFA60TZ5rq7E,11216
|
|
68
69
|
docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
|
|
69
70
|
docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
|
|
70
71
|
docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
|
|
71
72
|
docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
|
|
72
73
|
docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
73
|
-
docling/models/plugins/defaults.py,sha256=
|
|
74
|
+
docling/models/plugins/defaults.py,sha256=ZJq_hDg_HTmRNvM6siLBqgtHNb-oHzj3dQU_RVAbyYM,971
|
|
74
75
|
docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
75
76
|
docling/models/utils/generation_utils.py,sha256=0ZfMBMbolHAWjdbMza8FbD4_jQ4VY6ReUa4gqVLwMoU,5365
|
|
76
77
|
docling/models/utils/hf_model_download.py,sha256=VlKna9tLIVOGQkIRQBXfDimPIIyeRV7cFCbuOVmFQiU,1092
|
|
@@ -80,7 +81,7 @@ docling/models/vlm_models_inline/mlx_model.py,sha256=ae7hDMgBsMLkqulmbKDamGSSrLJ
|
|
|
80
81
|
docling/models/vlm_models_inline/nuextract_transformers_model.py,sha256=jLNtlkMDheUyWot7Oqq-GHQIYzJ0fZrbReq5xCnYb9E,10506
|
|
81
82
|
docling/models/vlm_models_inline/vllm_model.py,sha256=vXClayYxPGX1jzQ1Rvf3vvwtW9khgApGvcRz4Qbyu7I,10293
|
|
82
83
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
83
|
-
docling/pipeline/asr_pipeline.py,sha256=
|
|
84
|
+
docling/pipeline/asr_pipeline.py,sha256=oRluG28no3ezjbtL7nJLpDcxxxJuuULNXheq1W-qklM,10629
|
|
84
85
|
docling/pipeline/base_extraction_pipeline.py,sha256=GYrEz83IXv-tdIHjtNWxMBNczFwL8SZyf9vnPJ3STaI,2627
|
|
85
86
|
docling/pipeline/base_pipeline.py,sha256=NPMQDTyis-LgQ4SybY2f5AESZl5PxogF-FRQuCDckXg,12748
|
|
86
87
|
docling/pipeline/extraction_vlm_pipeline.py,sha256=veUOTe8nGdnduZKaGn1RRb-NfU1H6t_EN4QAsb022Zg,8260
|
|
@@ -95,15 +96,15 @@ docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
|
|
|
95
96
|
docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
|
|
96
97
|
docling/utils/layout_postprocessor.py,sha256=sE9UR3Nv4iOk26uoIsN3bFioE7ScfAjj0orDBDneLXg,25166
|
|
97
98
|
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
|
98
|
-
docling/utils/model_downloader.py,sha256=
|
|
99
|
+
docling/utils/model_downloader.py,sha256=NjVn6ZhGcRwuLU93NYblRQpXOD8dB3pb1WC1bLEbF_E,5324
|
|
99
100
|
docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
|
|
100
101
|
docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,1842
|
|
101
102
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
|
102
103
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
|
103
104
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
|
104
|
-
docling-2.
|
|
105
|
-
docling-2.
|
|
106
|
-
docling-2.
|
|
107
|
-
docling-2.
|
|
108
|
-
docling-2.
|
|
109
|
-
docling-2.
|
|
105
|
+
docling-2.56.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
106
|
+
docling-2.56.0.dist-info/METADATA,sha256=jNEpaC8pNgpI_qbjYnBaBMHBoDRtBbKeXgMKhBEo_Xk,11364
|
|
107
|
+
docling-2.56.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
108
|
+
docling-2.56.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
|
109
|
+
docling-2.56.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
|
110
|
+
docling-2.56.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|