docling 2.3.1__py3-none-any.whl → 2.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/cli/main.py +38 -8
- docling/datamodel/pipeline_options.py +3 -3
- {docling-2.3.1.dist-info → docling-2.4.0.dist-info}/METADATA +3 -3
- {docling-2.3.1.dist-info → docling-2.4.0.dist-info}/RECORD +7 -7
- {docling-2.3.1.dist-info → docling-2.4.0.dist-info}/LICENSE +0 -0
- {docling-2.3.1.dist-info → docling-2.4.0.dist-info}/WHEEL +0 -0
- {docling-2.3.1.dist-info → docling-2.4.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -5,12 +5,15 @@ import time
|
|
5
5
|
import warnings
|
6
6
|
from enum import Enum
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import Annotated, Dict, Iterable, List, Optional
|
8
|
+
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
9
9
|
|
10
10
|
import typer
|
11
11
|
from docling_core.utils.file import resolve_file_source
|
12
12
|
|
13
13
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
|
+
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
15
|
+
from docling.backend.pdf_backend import PdfDocumentBackend
|
16
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
14
17
|
from docling.datamodel.base_models import (
|
15
18
|
ConversionStatus,
|
16
19
|
FormatToExtensions,
|
@@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
|
|
22
25
|
EasyOcrOptions,
|
23
26
|
OcrOptions,
|
24
27
|
PdfPipelineOptions,
|
28
|
+
TableFormerMode,
|
25
29
|
TesseractCliOcrOptions,
|
26
30
|
TesseractOcrOptions,
|
27
31
|
)
|
@@ -58,9 +62,10 @@ def version_callback(value: bool):
|
|
58
62
|
|
59
63
|
|
60
64
|
# Define an enum for the backend options
|
61
|
-
class
|
65
|
+
class PdfBackend(str, Enum):
|
62
66
|
PYPDFIUM2 = "pypdfium2"
|
63
|
-
|
67
|
+
DLPARSE_V1 = "dlparse_v1"
|
68
|
+
DLPARSE_V2 = "dlparse_v2"
|
64
69
|
|
65
70
|
|
66
71
|
# Define an enum for the ocr engines
|
@@ -90,28 +95,28 @@ def export_documents(
|
|
90
95
|
# Export Deep Search document JSON format:
|
91
96
|
if export_json:
|
92
97
|
fname = output_dir / f"{doc_filename}.json"
|
93
|
-
with fname.open("w") as fp:
|
98
|
+
with fname.open("w", encoding="utf8") as fp:
|
94
99
|
_log.info(f"writing JSON output to {fname}")
|
95
100
|
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
96
101
|
|
97
102
|
# Export Text format:
|
98
103
|
if export_txt:
|
99
104
|
fname = output_dir / f"{doc_filename}.txt"
|
100
|
-
with fname.open("w") as fp:
|
105
|
+
with fname.open("w", encoding="utf8") as fp:
|
101
106
|
_log.info(f"writing Text output to {fname}")
|
102
107
|
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
103
108
|
|
104
109
|
# Export Markdown format:
|
105
110
|
if export_md:
|
106
111
|
fname = output_dir / f"{doc_filename}.md"
|
107
|
-
with fname.open("w") as fp:
|
112
|
+
with fname.open("w", encoding="utf8") as fp:
|
108
113
|
_log.info(f"writing Markdown output to {fname}")
|
109
114
|
fp.write(conv_res.document.export_to_markdown())
|
110
115
|
|
111
116
|
# Export Document Tags format:
|
112
117
|
if export_doctags:
|
113
118
|
fname = output_dir / f"{doc_filename}.doctags"
|
114
|
-
with fname.open("w") as fp:
|
119
|
+
with fname.open("w", encoding="utf8") as fp:
|
115
120
|
_log.info(f"writing Doc Tags output to {fname}")
|
116
121
|
fp.write(conv_res.document.export_to_document_tokens())
|
117
122
|
|
@@ -151,6 +156,17 @@ def convert(
|
|
151
156
|
ocr_engine: Annotated[
|
152
157
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
153
158
|
] = OcrEngine.EASYOCR,
|
159
|
+
pdf_backend: Annotated[
|
160
|
+
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
161
|
+
] = PdfBackend.DLPARSE_V1,
|
162
|
+
table_mode: Annotated[
|
163
|
+
TableFormerMode,
|
164
|
+
typer.Option(..., help="The mode to use in the table structure model."),
|
165
|
+
] = TableFormerMode.FAST,
|
166
|
+
artifacts_path: Annotated[
|
167
|
+
Optional[Path],
|
168
|
+
typer.Option(..., help="If provided, the location of the model artifacts."),
|
169
|
+
] = None,
|
154
170
|
abort_on_error: Annotated[
|
155
171
|
bool,
|
156
172
|
typer.Option(
|
@@ -217,11 +233,25 @@ def convert(
|
|
217
233
|
do_table_structure=True,
|
218
234
|
)
|
219
235
|
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
236
|
+
pipeline_options.table_structure_options.mode = table_mode
|
237
|
+
|
238
|
+
if artifacts_path is not None:
|
239
|
+
pipeline_options.artifacts_path = artifacts_path
|
240
|
+
|
241
|
+
match pdf_backend:
|
242
|
+
case PdfBackend.DLPARSE_V1:
|
243
|
+
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
244
|
+
case PdfBackend.DLPARSE_V2:
|
245
|
+
backend = DoclingParseV2DocumentBackend
|
246
|
+
case PdfBackend.PYPDFIUM2:
|
247
|
+
backend = PyPdfiumDocumentBackend
|
248
|
+
case _:
|
249
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
220
250
|
|
221
251
|
format_options: Dict[InputFormat, FormatOption] = {
|
222
252
|
InputFormat.PDF: PdfFormatOption(
|
223
253
|
pipeline_options=pipeline_options,
|
224
|
-
backend=
|
254
|
+
backend=backend, # pdf_backend
|
225
255
|
)
|
226
256
|
}
|
227
257
|
doc_converter = DocumentConverter(
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from enum import Enum
|
1
|
+
from enum import Enum
|
2
2
|
from pathlib import Path
|
3
3
|
from typing import List, Literal, Optional, Union
|
4
4
|
|
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
|
|
6
6
|
|
7
7
|
|
8
8
|
class TableFormerMode(str, Enum):
|
9
|
-
FAST =
|
10
|
-
ACCURATE =
|
9
|
+
FAST = "fast"
|
10
|
+
ACCURATE = "accurate"
|
11
11
|
|
12
12
|
|
13
13
|
class TableStructureOptions(BaseModel):
|
{docling-2.3.1.dist-info → docling-2.4.0.dist-info}/METADATA
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
4
|
-
Summary:
|
3
|
+
Version: 2.4.0
|
4
|
+
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
7
|
-
Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
|
7
|
+
Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
|
8
8
|
Author: Christoph Auer
|
9
9
|
Author-email: cau@zurich.ibm.com
|
10
10
|
Requires-Python: >=3.10,<4.0
|
{docling-2.3.1.dist-info → docling-2.4.0.dist-info}/RECORD
CHANGED
@@ -11,11 +11,11 @@ docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpR
|
|
11
11
|
docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
|
12
12
|
docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
|
13
13
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
docling/cli/main.py,sha256=
|
14
|
+
docling/cli/main.py,sha256=IOeIpGoK_5AeE_6LYTU_nfZjqpZ5xeGaTCB8Vfsama0,9334
|
15
15
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
16
|
docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
|
17
17
|
docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
|
18
|
-
docling/datamodel/pipeline_options.py,sha256=
|
18
|
+
docling/datamodel/pipeline_options.py,sha256=PqQ4VjMDN16oWZSUYtskQEH366504OZmnjinCaOWmMc,2444
|
19
19
|
docling/datamodel/settings.py,sha256=2-sYEnKLV_giGygUlBtiBd4CJYN5T9-3BdL6NpWkUYw,1155
|
20
20
|
docling/document_converter.py,sha256=U52_rZQDm2wzrnsuUrvsfX2MnmOWFFhjBzfS8tEvt6Y,10595
|
21
21
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -38,8 +38,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
|
38
38
|
docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
|
39
39
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
40
40
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
41
|
-
docling-2.
|
42
|
-
docling-2.
|
43
|
-
docling-2.
|
44
|
-
docling-2.
|
45
|
-
docling-2.
|
41
|
+
docling-2.4.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
42
|
+
docling-2.4.0.dist-info/METADATA,sha256=9o2Nd020wn0UeQ7d0ABRQt6UnVagPxTFson9bDzcbEA,6116
|
43
|
+
docling-2.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
44
|
+
docling-2.4.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
|
45
|
+
docling-2.4.0.dist-info/RECORD,,
|
{docling-2.3.1.dist-info → docling-2.4.0.dist-info}/LICENSE: File without changes
|
{docling-2.3.1.dist-info → docling-2.4.0.dist-info}/WHEEL: File without changes
|
{docling-2.3.1.dist-info → docling-2.4.0.dist-info}/entry_points.txt: File without changes
|