docling 2.8.1__py3-none-any.whl → 2.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/cli/main.py +88 -84
- docling/datamodel/base_models.py +5 -8
- docling/datamodel/document.py +26 -12
- docling/datamodel/pipeline_options.py +20 -0
- docling/document_converter.py +103 -83
- docling/exceptions.py +6 -0
- docling/models/tesseract_ocr_cli_model.py +12 -7
- {docling-2.8.1.dist-info → docling-2.8.3.dist-info}/METADATA +2 -2
- {docling-2.8.1.dist-info → docling-2.8.3.dist-info}/RECORD +12 -11
- {docling-2.8.1.dist-info → docling-2.8.3.dist-info}/LICENSE +0 -0
- {docling-2.8.1.dist-info → docling-2.8.3.dist-info}/WHEEL +0 -0
- {docling-2.8.1.dist-info → docling-2.8.3.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -2,6 +2,7 @@ import importlib
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import re
|
5
|
+
import tempfile
|
5
6
|
import time
|
6
7
|
import warnings
|
7
8
|
from enum import Enum
|
@@ -9,7 +10,7 @@ from pathlib import Path
|
|
9
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
10
11
|
|
11
12
|
import typer
|
12
|
-
from docling_core.utils.file import
|
13
|
+
from docling_core.utils.file import resolve_source_to_path
|
13
14
|
|
14
15
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
15
16
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
@@ -256,95 +257,98 @@ def convert(
|
|
256
257
|
if from_formats is None:
|
257
258
|
from_formats = [e for e in InputFormat]
|
258
259
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
for
|
270
|
-
|
271
|
-
|
260
|
+
with tempfile.TemporaryDirectory() as tempdir:
|
261
|
+
input_doc_paths: List[Path] = []
|
262
|
+
for src in input_sources:
|
263
|
+
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
264
|
+
if not source.exists():
|
265
|
+
err_console.print(
|
266
|
+
f"[red]Error: The input file {source} does not exist.[/red]"
|
267
|
+
)
|
268
|
+
raise typer.Abort()
|
269
|
+
elif source.is_dir():
|
270
|
+
for fmt in from_formats:
|
271
|
+
for ext in FormatToExtensions[fmt]:
|
272
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
273
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
274
|
+
else:
|
275
|
+
input_doc_paths.append(source)
|
276
|
+
|
277
|
+
if to_formats is None:
|
278
|
+
to_formats = [OutputFormat.MARKDOWN]
|
279
|
+
|
280
|
+
export_json = OutputFormat.JSON in to_formats
|
281
|
+
export_md = OutputFormat.MARKDOWN in to_formats
|
282
|
+
export_txt = OutputFormat.TEXT in to_formats
|
283
|
+
export_doctags = OutputFormat.DOCTAGS in to_formats
|
284
|
+
|
285
|
+
if ocr_engine == OcrEngine.EASYOCR:
|
286
|
+
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
287
|
+
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
288
|
+
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
289
|
+
elif ocr_engine == OcrEngine.TESSERACT:
|
290
|
+
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
291
|
+
elif ocr_engine == OcrEngine.OCRMAC:
|
292
|
+
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
293
|
+
elif ocr_engine == OcrEngine.RAPIDOCR:
|
294
|
+
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
272
295
|
else:
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
if ocr_engine == OcrEngine.EASYOCR:
|
284
|
-
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
285
|
-
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
286
|
-
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
287
|
-
elif ocr_engine == OcrEngine.TESSERACT:
|
288
|
-
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
289
|
-
elif ocr_engine == OcrEngine.OCRMAC:
|
290
|
-
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
291
|
-
elif ocr_engine == OcrEngine.RAPIDOCR:
|
292
|
-
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
293
|
-
else:
|
294
|
-
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
295
|
-
|
296
|
-
ocr_lang_list = _split_list(ocr_lang)
|
297
|
-
if ocr_lang_list is not None:
|
298
|
-
ocr_options.lang = ocr_lang_list
|
299
|
-
|
300
|
-
pipeline_options = PdfPipelineOptions(
|
301
|
-
do_ocr=ocr,
|
302
|
-
ocr_options=ocr_options,
|
303
|
-
do_table_structure=True,
|
304
|
-
)
|
305
|
-
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
306
|
-
pipeline_options.table_structure_options.mode = table_mode
|
307
|
-
|
308
|
-
if artifacts_path is not None:
|
309
|
-
pipeline_options.artifacts_path = artifacts_path
|
310
|
-
|
311
|
-
if pdf_backend == PdfBackend.DLPARSE_V1:
|
312
|
-
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
313
|
-
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
314
|
-
backend = DoclingParseV2DocumentBackend
|
315
|
-
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
316
|
-
backend = PyPdfiumDocumentBackend
|
317
|
-
else:
|
318
|
-
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
319
|
-
|
320
|
-
format_options: Dict[InputFormat, FormatOption] = {
|
321
|
-
InputFormat.PDF: PdfFormatOption(
|
322
|
-
pipeline_options=pipeline_options,
|
323
|
-
backend=backend, # pdf_backend
|
296
|
+
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
297
|
+
|
298
|
+
ocr_lang_list = _split_list(ocr_lang)
|
299
|
+
if ocr_lang_list is not None:
|
300
|
+
ocr_options.lang = ocr_lang_list
|
301
|
+
|
302
|
+
pipeline_options = PdfPipelineOptions(
|
303
|
+
do_ocr=ocr,
|
304
|
+
ocr_options=ocr_options,
|
305
|
+
do_table_structure=True,
|
324
306
|
)
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
)
|
307
|
+
pipeline_options.table_structure_options.do_cell_matching = (
|
308
|
+
True # do_cell_matching
|
309
|
+
)
|
310
|
+
pipeline_options.table_structure_options.mode = table_mode
|
330
311
|
|
331
|
-
|
312
|
+
if artifacts_path is not None:
|
313
|
+
pipeline_options.artifacts_path = artifacts_path
|
332
314
|
|
333
|
-
|
334
|
-
|
335
|
-
|
315
|
+
if pdf_backend == PdfBackend.DLPARSE_V1:
|
316
|
+
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
317
|
+
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
318
|
+
backend = DoclingParseV2DocumentBackend
|
319
|
+
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
320
|
+
backend = PyPdfiumDocumentBackend
|
321
|
+
else:
|
322
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
336
323
|
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
324
|
+
format_options: Dict[InputFormat, FormatOption] = {
|
325
|
+
InputFormat.PDF: PdfFormatOption(
|
326
|
+
pipeline_options=pipeline_options,
|
327
|
+
backend=backend, # pdf_backend
|
328
|
+
)
|
329
|
+
}
|
330
|
+
doc_converter = DocumentConverter(
|
331
|
+
allowed_formats=from_formats,
|
332
|
+
format_options=format_options,
|
333
|
+
)
|
334
|
+
|
335
|
+
start_time = time.time()
|
336
|
+
|
337
|
+
conv_results = doc_converter.convert_all(
|
338
|
+
input_doc_paths, raises_on_error=abort_on_error
|
339
|
+
)
|
340
|
+
|
341
|
+
output.mkdir(parents=True, exist_ok=True)
|
342
|
+
export_documents(
|
343
|
+
conv_results,
|
344
|
+
output_dir=output,
|
345
|
+
export_json=export_json,
|
346
|
+
export_md=export_md,
|
347
|
+
export_txt=export_txt,
|
348
|
+
export_doctags=export_doctags,
|
349
|
+
)
|
346
350
|
|
347
|
-
|
351
|
+
end_time = time.time() - start_time
|
348
352
|
|
349
353
|
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
350
354
|
|
docling/datamodel/base_models.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
from enum import Enum, auto
|
2
|
-
from io import BytesIO
|
3
2
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
3
|
|
5
4
|
from docling_core.types.doc import (
|
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
|
9
8
|
Size,
|
10
9
|
TableCell,
|
11
10
|
)
|
11
|
+
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
12
|
+
DocumentStream,
|
13
|
+
)
|
12
14
|
from PIL.Image import Image
|
13
15
|
from pydantic import BaseModel, ConfigDict
|
14
16
|
|
@@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
|
|
22
24
|
FAILURE = auto()
|
23
25
|
SUCCESS = auto()
|
24
26
|
PARTIAL_SUCCESS = auto()
|
27
|
+
SKIPPED = auto()
|
25
28
|
|
26
29
|
|
27
30
|
class InputFormat(str, Enum):
|
@@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
|
|
93
96
|
DOCUMENT_BACKEND = auto()
|
94
97
|
MODEL = auto()
|
95
98
|
DOC_ASSEMBLER = auto()
|
99
|
+
USER_INPUT = auto()
|
96
100
|
|
97
101
|
|
98
102
|
class ErrorItem(BaseModel):
|
@@ -207,10 +211,3 @@ class Page(BaseModel):
|
|
207
211
|
@property
|
208
212
|
def image(self) -> Optional[Image]:
|
209
213
|
return self.get_image(scale=self._default_image_scale)
|
210
|
-
|
211
|
-
|
212
|
-
class DocumentStream(BaseModel):
|
213
|
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
214
|
-
|
215
|
-
name: str
|
216
|
-
stream: BytesIO
|
docling/datamodel/document.py
CHANGED
@@ -3,7 +3,7 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
7
7
|
|
8
8
|
import filetype
|
9
9
|
from docling_core.types.doc import (
|
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
|
32
32
|
)
|
33
33
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
34
34
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
35
|
-
from docling_core.utils.file import
|
35
|
+
from docling_core.utils.file import resolve_source_to_stream
|
36
36
|
from pydantic import BaseModel
|
37
37
|
from typing_extensions import deprecated
|
38
38
|
|
@@ -164,12 +164,6 @@ class InputDocument(BaseModel):
|
|
164
164
|
backend: Type[AbstractDocumentBackend],
|
165
165
|
path_or_stream: Union[BytesIO, Path],
|
166
166
|
) -> None:
|
167
|
-
if backend is None:
|
168
|
-
raise RuntimeError(
|
169
|
-
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
170
|
-
f"Please check your format configuration on DocumentConverter."
|
171
|
-
)
|
172
|
-
|
173
167
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
174
168
|
if not self._backend.is_valid():
|
175
169
|
self.valid = False
|
@@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
|
|
450
444
|
return ds_doc
|
451
445
|
|
452
446
|
|
447
|
+
class _DummyBackend(AbstractDocumentBackend):
|
448
|
+
def __init__(self, *args, **kwargs):
|
449
|
+
super().__init__(*args, **kwargs)
|
450
|
+
|
451
|
+
def is_valid(self) -> bool:
|
452
|
+
return False
|
453
|
+
|
454
|
+
@classmethod
|
455
|
+
def supported_formats(cls) -> Set[InputFormat]:
|
456
|
+
return set()
|
457
|
+
|
458
|
+
@classmethod
|
459
|
+
def supports_pagination(cls) -> bool:
|
460
|
+
return False
|
461
|
+
|
462
|
+
def unload(self):
|
463
|
+
return super().unload()
|
464
|
+
|
465
|
+
|
453
466
|
class _DocumentConversionInput(BaseModel):
|
454
467
|
|
455
468
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
@@ -459,13 +472,14 @@ class _DocumentConversionInput(BaseModel):
|
|
459
472
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
460
473
|
) -> Iterable[InputDocument]:
|
461
474
|
for item in self.path_or_stream_iterator:
|
462
|
-
obj =
|
475
|
+
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
463
476
|
format = self._guess_format(obj)
|
477
|
+
backend: Type[AbstractDocumentBackend]
|
464
478
|
if format not in format_options.keys():
|
465
|
-
_log.
|
466
|
-
f"
|
479
|
+
_log.error(
|
480
|
+
f"Input document {obj.name} does not match any allowed format."
|
467
481
|
)
|
468
|
-
|
482
|
+
backend = _DummyBackend
|
469
483
|
else:
|
470
484
|
backend = format_options[format].backend
|
471
485
|
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
6
6
|
|
7
7
|
|
8
8
|
class TableFormerMode(str, Enum):
|
9
|
+
"""Modes for the TableFormer model."""
|
10
|
+
|
9
11
|
FAST = "fast"
|
10
12
|
ACCURATE = "accurate"
|
11
13
|
|
12
14
|
|
13
15
|
class TableStructureOptions(BaseModel):
|
16
|
+
"""Options for the table structure."""
|
17
|
+
|
14
18
|
do_cell_matching: bool = (
|
15
19
|
True
|
16
20
|
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
|
|
21
25
|
|
22
26
|
|
23
27
|
class OcrOptions(BaseModel):
|
28
|
+
"""OCR options."""
|
29
|
+
|
24
30
|
kind: str
|
25
31
|
lang: List[str]
|
26
32
|
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
|
|
30
36
|
|
31
37
|
|
32
38
|
class RapidOcrOptions(OcrOptions):
|
39
|
+
"""Options for the RapidOCR engine."""
|
40
|
+
|
33
41
|
kind: Literal["rapidocr"] = "rapidocr"
|
34
42
|
|
35
43
|
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
|
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
|
|
66
74
|
|
67
75
|
|
68
76
|
class EasyOcrOptions(OcrOptions):
|
77
|
+
"""Options for the EasyOCR engine."""
|
78
|
+
|
69
79
|
kind: Literal["easyocr"] = "easyocr"
|
70
80
|
lang: List[str] = ["fr", "de", "es", "en"]
|
71
81
|
use_gpu: bool = True # same default as easyocr.Reader
|
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
|
|
79
89
|
|
80
90
|
|
81
91
|
class TesseractCliOcrOptions(OcrOptions):
|
92
|
+
"""Options for the TesseractCli engine."""
|
93
|
+
|
82
94
|
kind: Literal["tesseract"] = "tesseract"
|
83
95
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
84
96
|
tesseract_cmd: str = "tesseract"
|
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
|
|
90
102
|
|
91
103
|
|
92
104
|
class TesseractOcrOptions(OcrOptions):
|
105
|
+
"""Options for the Tesseract engine."""
|
106
|
+
|
93
107
|
kind: Literal["tesserocr"] = "tesserocr"
|
94
108
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
95
109
|
path: Optional[str] = None
|
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
|
|
100
114
|
|
101
115
|
|
102
116
|
class OcrMacOptions(OcrOptions):
|
117
|
+
"""Options for the Mac OCR engine."""
|
118
|
+
|
103
119
|
kind: Literal["ocrmac"] = "ocrmac"
|
104
120
|
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
105
121
|
recognition: str = "accurate"
|
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
|
|
111
127
|
|
112
128
|
|
113
129
|
class PipelineOptions(BaseModel):
|
130
|
+
"""Base pipeline options."""
|
131
|
+
|
114
132
|
create_legacy_output: bool = (
|
115
133
|
True # This default will be set to False in a future version of docling
|
116
134
|
)
|
117
135
|
|
118
136
|
|
119
137
|
class PdfPipelineOptions(PipelineOptions):
|
138
|
+
"""Options for the PDF pipeline."""
|
139
|
+
|
120
140
|
artifacts_path: Optional[Union[Path, str]] = None
|
121
141
|
do_table_structure: bool = True # True: perform table structure extraction
|
122
142
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
docling/document_converter.py
CHANGED
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
15
15
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
16
16
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
17
17
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
18
|
-
from docling.datamodel.base_models import
|
18
|
+
from docling.datamodel.base_models import (
|
19
|
+
ConversionStatus,
|
20
|
+
DoclingComponentType,
|
21
|
+
DocumentStream,
|
22
|
+
ErrorItem,
|
23
|
+
InputFormat,
|
24
|
+
)
|
19
25
|
from docling.datamodel.document import (
|
20
26
|
ConversionResult,
|
21
27
|
InputDocument,
|
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
|
|
23
29
|
)
|
24
30
|
from docling.datamodel.pipeline_options import PipelineOptions
|
25
31
|
from docling.datamodel.settings import DocumentLimits, settings
|
32
|
+
from docling.exceptions import ConversionError
|
26
33
|
from docling.pipeline.base_pipeline import BasePipeline
|
27
34
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
28
35
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
|
|
85
92
|
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
86
93
|
|
87
94
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
95
|
+
def _get_default_option(format: InputFormat) -> FormatOption:
|
96
|
+
format_to_default_options = {
|
97
|
+
InputFormat.XLSX: FormatOption(
|
98
|
+
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
99
|
+
),
|
100
|
+
InputFormat.DOCX: FormatOption(
|
101
|
+
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
102
|
+
),
|
103
|
+
InputFormat.PPTX: FormatOption(
|
104
|
+
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
105
|
+
),
|
106
|
+
InputFormat.MD: FormatOption(
|
107
|
+
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
108
|
+
),
|
109
|
+
InputFormat.ASCIIDOC: FormatOption(
|
110
|
+
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
111
|
+
),
|
112
|
+
InputFormat.HTML: FormatOption(
|
113
|
+
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
114
|
+
),
|
115
|
+
InputFormat.IMAGE: FormatOption(
|
116
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
117
|
+
),
|
118
|
+
InputFormat.PDF: FormatOption(
|
119
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
120
|
+
),
|
121
|
+
}
|
122
|
+
if (options := format_to_default_options.get(format)) is not None:
|
123
|
+
return options
|
124
|
+
else:
|
125
|
+
raise RuntimeError(f"No default options configured for {format}")
|
114
126
|
|
115
127
|
|
116
128
|
class DocumentConverter:
|
@@ -121,36 +133,26 @@ class DocumentConverter:
|
|
121
133
|
allowed_formats: Optional[List[InputFormat]] = None,
|
122
134
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
123
135
|
):
|
124
|
-
self.allowed_formats =
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
else:
|
136
|
-
for f in self.allowed_formats:
|
137
|
-
if f not in self.format_to_options.keys():
|
138
|
-
_log.debug(f"Requested format {f} will use default options.")
|
139
|
-
self.format_to_options[f] = _format_to_default_options[f]
|
140
|
-
|
141
|
-
remove_keys = []
|
142
|
-
for f in self.format_to_options.keys():
|
143
|
-
if f not in self.allowed_formats:
|
144
|
-
remove_keys.append(f)
|
145
|
-
|
146
|
-
for f in remove_keys:
|
147
|
-
self.format_to_options.pop(f)
|
148
|
-
|
136
|
+
self.allowed_formats = (
|
137
|
+
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
138
|
+
)
|
139
|
+
self.format_to_options = {
|
140
|
+
format: (
|
141
|
+
_get_default_option(format=format)
|
142
|
+
if (custom_option := (format_options or {}).get(format)) is None
|
143
|
+
else custom_option
|
144
|
+
)
|
145
|
+
for format in self.allowed_formats
|
146
|
+
}
|
149
147
|
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
150
148
|
|
151
149
|
def initialize_pipeline(self, format: InputFormat):
|
152
150
|
"""Initialize the conversion pipeline for the selected format."""
|
153
|
-
self._get_pipeline(doc_format=format)
|
151
|
+
pipeline = self._get_pipeline(doc_format=format)
|
152
|
+
if pipeline is None:
|
153
|
+
raise ConversionError(
|
154
|
+
f"No pipeline could be initialized for format {format}"
|
155
|
+
)
|
154
156
|
|
155
157
|
@validate_call(config=ConfigDict(strict=True))
|
156
158
|
def convert(
|
@@ -186,22 +188,28 @@ class DocumentConverter:
|
|
186
188
|
limits=limits,
|
187
189
|
)
|
188
190
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
191
|
+
|
192
|
+
had_result = False
|
189
193
|
for conv_res in conv_res_iter:
|
194
|
+
had_result = True
|
190
195
|
if raises_on_error and conv_res.status not in {
|
191
196
|
ConversionStatus.SUCCESS,
|
192
197
|
ConversionStatus.PARTIAL_SUCCESS,
|
193
198
|
}:
|
194
|
-
raise
|
199
|
+
raise ConversionError(
|
195
200
|
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
196
201
|
)
|
197
202
|
else:
|
198
203
|
yield conv_res
|
199
204
|
|
205
|
+
if not had_result and raises_on_error:
|
206
|
+
raise ConversionError(
|
207
|
+
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
208
|
+
)
|
209
|
+
|
200
210
|
def _convert(
|
201
211
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
202
212
|
) -> Iterator[ConversionResult]:
|
203
|
-
assert self.format_to_options is not None
|
204
|
-
|
205
213
|
start_time = time.monotonic()
|
206
214
|
|
207
215
|
for input_batch in chunkify(
|
@@ -223,27 +231,22 @@ class DocumentConverter:
|
|
223
231
|
):
|
224
232
|
elapsed = time.monotonic() - start_time
|
225
233
|
start_time = time.monotonic()
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
)
|
231
|
-
yield item
|
232
|
-
else:
|
233
|
-
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
234
|
+
_log.info(
|
235
|
+
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
236
|
+
)
|
237
|
+
yield item
|
234
238
|
|
235
239
|
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
236
|
-
assert self.format_to_options is not None
|
237
|
-
|
238
240
|
fopt = self.format_to_options.get(doc_format)
|
239
241
|
|
240
242
|
if fopt is None:
|
241
|
-
|
243
|
+
return None
|
242
244
|
else:
|
243
245
|
pipeline_class = fopt.pipeline_cls
|
244
246
|
pipeline_options = fopt.pipeline_options
|
245
247
|
|
246
|
-
|
248
|
+
if pipeline_options is None:
|
249
|
+
return None
|
247
250
|
# TODO this will ignore if different options have been defined for the same pipeline class.
|
248
251
|
if (
|
249
252
|
pipeline_class not in self.initialized_pipelines
|
@@ -257,11 +260,26 @@ class DocumentConverter:
|
|
257
260
|
|
258
261
|
def _process_document(
|
259
262
|
self, in_doc: InputDocument, raises_on_error: bool
|
260
|
-
) ->
|
261
|
-
assert self.allowed_formats is not None
|
262
|
-
assert in_doc.format in self.allowed_formats
|
263
|
+
) -> ConversionResult:
|
263
264
|
|
264
|
-
|
265
|
+
valid = (
|
266
|
+
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
267
|
+
)
|
268
|
+
if valid:
|
269
|
+
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
270
|
+
else:
|
271
|
+
error_message = f"File format not allowed: {in_doc.file}"
|
272
|
+
if raises_on_error:
|
273
|
+
raise ConversionError(error_message)
|
274
|
+
else:
|
275
|
+
error_item = ErrorItem(
|
276
|
+
component_type=DoclingComponentType.USER_INPUT,
|
277
|
+
module_name="",
|
278
|
+
error_message=error_message,
|
279
|
+
)
|
280
|
+
conv_res = ConversionResult(
|
281
|
+
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
282
|
+
)
|
265
283
|
|
266
284
|
return conv_res
|
267
285
|
|
@@ -270,26 +288,28 @@ class DocumentConverter:
|
|
270
288
|
) -> ConversionResult:
|
271
289
|
if in_doc.valid:
|
272
290
|
pipeline = self._get_pipeline(in_doc.format)
|
273
|
-
if pipeline is None:
|
291
|
+
if pipeline is not None:
|
292
|
+
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
293
|
+
else:
|
274
294
|
if raises_on_error:
|
275
|
-
raise
|
295
|
+
raise ConversionError(
|
276
296
|
f"No pipeline could be initialized for {in_doc.file}."
|
277
297
|
)
|
278
298
|
else:
|
279
|
-
conv_res = ConversionResult(
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
284
|
-
|
299
|
+
conv_res = ConversionResult(
|
300
|
+
input=in_doc,
|
301
|
+
status=ConversionStatus.FAILURE,
|
302
|
+
)
|
285
303
|
else:
|
286
304
|
if raises_on_error:
|
287
|
-
raise
|
305
|
+
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
288
306
|
|
289
307
|
else:
|
290
308
|
# invalid doc or not of desired format
|
291
|
-
conv_res = ConversionResult(
|
292
|
-
|
309
|
+
conv_res = ConversionResult(
|
310
|
+
input=in_doc,
|
311
|
+
status=ConversionStatus.FAILURE,
|
312
|
+
)
|
293
313
|
# TODO add error log why it failed.
|
294
314
|
|
295
315
|
return conv_res
|
docling/exceptions.py
ADDED
docling/models/tesseract_ocr_cli_model.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
+
import csv
|
1
2
|
import io
|
2
3
|
import logging
|
4
|
+
import os
|
3
5
|
import tempfile
|
4
6
|
from subprocess import DEVNULL, PIPE, Popen
|
5
7
|
from typing import Iterable, Optional, Tuple
|
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
95
97
|
# _log.info(decoded_data)
|
96
98
|
|
97
99
|
# Read the TSV file generated by Tesseract
|
98
|
-
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
100
|
+
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
99
101
|
|
100
102
|
# Display the dataframe (optional)
|
101
103
|
# _log.info("df: ", df.head())
|
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
130
132
|
high_res_image = page._backend.get_page_image(
|
131
133
|
scale=self.scale, cropbox=ocr_rect
|
132
134
|
)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
135
|
+
try:
|
136
|
+
with tempfile.NamedTemporaryFile(
|
137
|
+
suffix=".png", mode="w+b", delete=False
|
138
|
+
) as image_file:
|
139
|
+
fname = image_file.name
|
140
|
+
high_res_image.save(image_file)
|
139
141
|
|
140
142
|
df = self._run_tesseract(fname)
|
143
|
+
finally:
|
144
|
+
if os.path.exists(fname):
|
145
|
+
os.remove(fname)
|
141
146
|
|
142
147
|
# _log.info(df)
|
143
148
|
|
{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.8.
|
3
|
+
Version: 2.8.3
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
29
|
-
Requires-Dist: docling-core (>=2.
|
29
|
+
Requires-Dist: docling-core (>=2.6.1,<3.0.0)
|
30
30
|
Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/RECORD
CHANGED
@@ -12,13 +12,14 @@ docling/backend/msword_backend.py,sha256=VFHPr-gCak7w3NJToc5Cs-JaTb4Vm3a1JnnRIfJ
|
|
12
12
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
13
13
|
docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
|
14
14
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
docling/cli/main.py,sha256=
|
15
|
+
docling/cli/main.py,sha256=R9ao2zCv1GZQIATOqg9b64O7AOUCWLwjJ-2FIpW8m0I,12236
|
16
16
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
docling/datamodel/base_models.py,sha256=
|
18
|
-
docling/datamodel/document.py,sha256=
|
19
|
-
docling/datamodel/pipeline_options.py,sha256=
|
17
|
+
docling/datamodel/base_models.py,sha256=mJ4h2haE0cOYz_eLd7QlRKU1y7u4yccMGk0tiZNICkQ,5542
|
18
|
+
docling/datamodel/document.py,sha256=Y0NEFphwz44VxIaRaDRhtmw6rifzSC7MqyaDBzaR0lM,20902
|
19
|
+
docling/datamodel/pipeline_options.py,sha256=K65nEZ52aRfF8hWIzl0zVvRQj-3XVwoBbxTacGS6jEg,4960
|
20
20
|
docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
|
21
|
-
docling/document_converter.py,sha256=
|
21
|
+
docling/document_converter.py,sha256=bsXGQCUrbL2LmaqaaEmlkfSANl2XwBBx8HDLwFrqhFY,11570
|
22
|
+
docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
|
22
23
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
24
|
docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
|
24
25
|
docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
|
@@ -30,7 +31,7 @@ docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th
|
|
30
31
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
31
32
|
docling/models/rapid_ocr_model.py,sha256=VQ0jaFmOzB9f-1JaqZ6d0o_El55Lr-nsFHfTNubMAuc,6005
|
32
33
|
docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
|
33
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
34
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
|
34
35
|
docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
|
35
36
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
37
|
docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
|
@@ -41,8 +42,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
|
41
42
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
42
43
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
43
44
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
44
|
-
docling-2.8.
|
45
|
-
docling-2.8.
|
46
|
-
docling-2.8.
|
47
|
-
docling-2.8.
|
48
|
-
docling-2.8.
|
45
|
+
docling-2.8.3.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
46
|
+
docling-2.8.3.dist-info/METADATA,sha256=TKraAUApw0vLlToJ37cBQPNyJwoPmdWMIn73hYwq4Y8,7682
|
47
|
+
docling-2.8.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
48
|
+
docling-2.8.3.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
49
|
+
docling-2.8.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|