docling 1.6.2__py3-none-any.whl → 1.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +17 -8
- docling/backend/docling_parse_backend.py +42 -26
- docling/backend/pypdfium2_backend.py +33 -11
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +253 -0
- docling/datamodel/base_models.py +39 -27
- docling/datamodel/document.py +115 -17
- docling/datamodel/pipeline_options.py +67 -0
- docling/document_converter.py +65 -44
- docling/models/base_ocr_model.py +4 -4
- docling/models/ds_glm_model.py +11 -7
- docling/models/easyocr_model.py +19 -4
- docling/models/layout_model.py +3 -3
- docling/models/table_structure_model.py +18 -2
- docling/models/tesseract_ocr_cli_model.py +167 -0
- docling/models/tesseract_ocr_model.py +122 -0
- docling/pipeline/base_model_pipeline.py +4 -3
- docling/pipeline/standard_model_pipeline.py +36 -8
- docling/utils/export.py +145 -0
- {docling-1.6.2.dist-info → docling-1.19.1.dist-info}/LICENSE +1 -1
- docling-1.19.1.dist-info/METADATA +380 -0
- docling-1.19.1.dist-info/RECORD +34 -0
- docling-1.19.1.dist-info/entry_points.txt +3 -0
- docling-1.6.2.dist-info/METADATA +0 -192
- docling-1.6.2.dist-info/RECORD +0 -27
- {docling-1.6.2.dist-info → docling-1.19.1.dist-info}/WHEEL +0 -0
@@ -1,14 +1,15 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Any, Iterable, Optional, Union
|
4
|
+
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
|
5
5
|
|
6
6
|
from PIL import Image
|
7
7
|
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
|
10
|
+
|
8
11
|
|
9
12
|
class PdfPageBackend(ABC):
|
10
|
-
def __init__(self, page_obj: Any) -> object:
|
11
|
-
pass
|
12
13
|
|
13
14
|
@abstractmethod
|
14
15
|
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
|
@@ -19,12 +20,12 @@ class PdfPageBackend(ABC):
|
|
19
20
|
pass
|
20
21
|
|
21
22
|
@abstractmethod
|
22
|
-
def get_bitmap_rects(self,
|
23
|
+
def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
|
23
24
|
pass
|
24
25
|
|
25
26
|
@abstractmethod
|
26
27
|
def get_page_image(
|
27
|
-
self, scale:
|
28
|
+
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
|
28
29
|
) -> Image.Image:
|
29
30
|
pass
|
30
31
|
|
@@ -32,6 +33,10 @@ class PdfPageBackend(ABC):
|
|
32
33
|
def get_size(self) -> "PageSize":
|
33
34
|
pass
|
34
35
|
|
36
|
+
@abstractmethod
|
37
|
+
def is_valid(self) -> bool:
|
38
|
+
pass
|
39
|
+
|
35
40
|
@abstractmethod
|
36
41
|
def unload(self):
|
37
42
|
pass
|
@@ -39,8 +44,9 @@ class PdfPageBackend(ABC):
|
|
39
44
|
|
40
45
|
class PdfDocumentBackend(ABC):
|
41
46
|
@abstractmethod
|
42
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
43
|
-
|
47
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
48
|
+
self.path_or_stream = path_or_stream
|
49
|
+
self.document_hash = document_hash
|
44
50
|
|
45
51
|
@abstractmethod
|
46
52
|
def load_page(self, page_no: int) -> PdfPageBackend:
|
@@ -56,4 +62,7 @@ class PdfDocumentBackend(ABC):
|
|
56
62
|
|
57
63
|
@abstractmethod
|
58
64
|
def unload(self):
|
59
|
-
|
65
|
+
if isinstance(self.path_or_stream, BytesIO):
|
66
|
+
self.path_or_stream.close()
|
67
|
+
|
68
|
+
self.path_or_stream = None
|
@@ -1,9 +1,8 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
-
import time
|
4
3
|
from io import BytesIO
|
5
4
|
from pathlib import Path
|
6
|
-
from typing import Iterable, Optional, Union
|
5
|
+
from typing import Iterable, List, Optional, Union
|
7
6
|
|
8
7
|
import pypdfium2 as pdfium
|
9
8
|
from docling_parse.docling_parse import pdf_parser
|
@@ -17,13 +16,26 @@ _log = logging.getLogger(__name__)
|
|
17
16
|
|
18
17
|
|
19
18
|
class DoclingParsePageBackend(PdfPageBackend):
|
20
|
-
def __init__(
|
21
|
-
|
19
|
+
def __init__(
|
20
|
+
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
|
21
|
+
):
|
22
22
|
self._ppage = page_obj
|
23
|
-
|
24
|
-
|
23
|
+
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
|
24
|
+
|
25
|
+
self.valid = "pages" in parsed_page
|
26
|
+
if self.valid:
|
27
|
+
self._dpage = parsed_page["pages"][0]
|
28
|
+
else:
|
29
|
+
_log.info(
|
30
|
+
f"An error occured when loading page {page_no} of document {document_hash}."
|
31
|
+
)
|
32
|
+
|
33
|
+
def is_valid(self) -> bool:
|
34
|
+
return self.valid
|
25
35
|
|
26
36
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
37
|
+
if not self.valid:
|
38
|
+
return ""
|
27
39
|
# Find intersecting cells on the page
|
28
40
|
text_piece = ""
|
29
41
|
page_size = self.get_size()
|
@@ -55,9 +67,12 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
55
67
|
return text_piece
|
56
68
|
|
57
69
|
def get_text_cells(self) -> Iterable[Cell]:
|
58
|
-
cells = []
|
70
|
+
cells: List[Cell] = []
|
59
71
|
cell_counter = 0
|
60
72
|
|
73
|
+
if not self.valid:
|
74
|
+
return cells
|
75
|
+
|
61
76
|
page_size = self.get_size()
|
62
77
|
|
63
78
|
parser_width = self._dpage["width"]
|
@@ -114,7 +129,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
114
129
|
|
115
130
|
return cells
|
116
131
|
|
117
|
-
def get_bitmap_rects(self, scale:
|
132
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
118
133
|
AREA_THRESHOLD = 32 * 32
|
119
134
|
|
120
135
|
for i in range(len(self._dpage["images"])):
|
@@ -129,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
129
144
|
yield cropbox
|
130
145
|
|
131
146
|
def get_page_image(
|
132
|
-
self, scale:
|
147
|
+
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
133
148
|
) -> Image.Image:
|
134
149
|
|
135
150
|
page_size = self.get_size()
|
@@ -168,40 +183,41 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
168
183
|
def unload(self):
|
169
184
|
self._ppage = None
|
170
185
|
self._dpage = None
|
171
|
-
self.text_page = None
|
172
186
|
|
173
187
|
|
174
188
|
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
175
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
176
|
-
super().__init__(path_or_stream)
|
177
|
-
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
178
|
-
# Parsing cells with docling_parser call
|
179
|
-
parser = pdf_parser()
|
189
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
190
|
+
super().__init__(path_or_stream, document_hash)
|
180
191
|
|
181
|
-
|
192
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
193
|
+
self.parser = pdf_parser()
|
182
194
|
|
195
|
+
success = False
|
183
196
|
if isinstance(path_or_stream, BytesIO):
|
184
|
-
|
185
|
-
|
186
|
-
|
197
|
+
success = self.parser.load_document_from_bytesio(
|
198
|
+
document_hash, path_or_stream
|
199
|
+
)
|
200
|
+
elif isinstance(path_or_stream, Path):
|
201
|
+
success = self.parser.load_document(document_hash, str(path_or_stream))
|
187
202
|
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
203
|
+
if not success:
|
204
|
+
raise RuntimeError(
|
205
|
+
f"docling-parse could not load document {document_hash}."
|
206
|
+
)
|
192
207
|
|
193
208
|
def page_count(self) -> int:
|
194
|
-
return len(self.
|
209
|
+
return len(self._pdoc) # To be replaced with docling-parse API
|
195
210
|
|
196
211
|
def load_page(self, page_no: int) -> DoclingParsePageBackend:
|
197
212
|
return DoclingParsePageBackend(
|
198
|
-
self.
|
213
|
+
self.parser, self.document_hash, page_no, self._pdoc[page_no]
|
199
214
|
)
|
200
215
|
|
201
216
|
def is_valid(self) -> bool:
|
202
217
|
return self.page_count() > 0
|
203
218
|
|
204
219
|
def unload(self):
|
220
|
+
super().unload()
|
221
|
+
self.parser.unload_document(self.document_hash)
|
205
222
|
self._pdoc.close()
|
206
223
|
self._pdoc = None
|
207
|
-
self._parser_doc = None
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import logging
|
1
2
|
import random
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
@@ -6,19 +7,34 @@ from typing import Iterable, List, Optional, Union
|
|
6
7
|
import pypdfium2 as pdfium
|
7
8
|
import pypdfium2.raw as pdfium_c
|
8
9
|
from PIL import Image, ImageDraw
|
9
|
-
from pypdfium2 import PdfPage
|
10
|
+
from pypdfium2 import PdfPage, PdfTextPage
|
11
|
+
from pypdfium2._helpers.misc import PdfiumError
|
10
12
|
|
11
13
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
12
14
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
13
15
|
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
14
18
|
|
15
19
|
class PyPdfiumPageBackend(PdfPageBackend):
|
16
|
-
def __init__(
|
17
|
-
|
18
|
-
|
19
|
-
self.
|
20
|
+
def __init__(
|
21
|
+
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
|
22
|
+
):
|
23
|
+
self.valid = True # No better way to tell from pypdfium.
|
24
|
+
try:
|
25
|
+
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
26
|
+
except PdfiumError as e:
|
27
|
+
_log.info(
|
28
|
+
f"An exception occured when loading page {page_no} of document {document_hash}.",
|
29
|
+
exc_info=True,
|
30
|
+
)
|
31
|
+
self.valid = False
|
32
|
+
self.text_page: Optional[PdfTextPage] = None
|
33
|
+
|
34
|
+
def is_valid(self) -> bool:
|
35
|
+
return self.valid
|
20
36
|
|
21
|
-
def get_bitmap_rects(self, scale:
|
37
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
22
38
|
AREA_THRESHOLD = 32 * 32
|
23
39
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
24
40
|
pos = obj.get_pos()
|
@@ -173,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
173
189
|
return cells
|
174
190
|
|
175
191
|
def get_page_image(
|
176
|
-
self, scale:
|
192
|
+
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
177
193
|
) -> Image.Image:
|
178
194
|
|
179
195
|
page_size = self.get_size()
|
@@ -215,19 +231,25 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
215
231
|
|
216
232
|
|
217
233
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
218
|
-
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
219
|
-
super().__init__(path_or_stream)
|
220
|
-
|
234
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
235
|
+
super().__init__(path_or_stream, document_hash)
|
236
|
+
try:
|
237
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
238
|
+
except PdfiumError as e:
|
239
|
+
raise RuntimeError(
|
240
|
+
f"pypdfium could not load document {document_hash}"
|
241
|
+
) from e
|
221
242
|
|
222
243
|
def page_count(self) -> int:
|
223
244
|
return len(self._pdoc)
|
224
245
|
|
225
246
|
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
226
|
-
return PyPdfiumPageBackend(self._pdoc
|
247
|
+
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
|
227
248
|
|
228
249
|
def is_valid(self) -> bool:
|
229
250
|
return self.page_count() > 0
|
230
251
|
|
231
252
|
def unload(self):
|
253
|
+
super().unload()
|
232
254
|
self._pdoc.close()
|
233
255
|
self._pdoc = None
|
docling/cli/__init__.py
ADDED
File without changes
|
docling/cli/main.py
ADDED
@@ -0,0 +1,253 @@
|
|
1
|
+
import importlib
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import time
|
5
|
+
import warnings
|
6
|
+
from enum import Enum
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Annotated, Iterable, List, Optional
|
9
|
+
|
10
|
+
import typer
|
11
|
+
from docling_core.utils.file import resolve_file_source
|
12
|
+
|
13
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
|
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
15
|
+
from docling.datamodel.base_models import ConversionStatus
|
16
|
+
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
17
|
+
from docling.datamodel.pipeline_options import (
|
18
|
+
EasyOcrOptions,
|
19
|
+
PipelineOptions,
|
20
|
+
TesseractCliOcrOptions,
|
21
|
+
TesseractOcrOptions,
|
22
|
+
)
|
23
|
+
from docling.document_converter import DocumentConverter
|
24
|
+
|
25
|
+
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
26
|
+
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
27
|
+
|
28
|
+
_log = logging.getLogger(__name__)
|
29
|
+
from rich.console import Console
|
30
|
+
|
31
|
+
err_console = Console(stderr=True)
|
32
|
+
|
33
|
+
|
34
|
+
app = typer.Typer(
|
35
|
+
name="Docling",
|
36
|
+
no_args_is_help=True,
|
37
|
+
add_completion=False,
|
38
|
+
pretty_exceptions_enable=False,
|
39
|
+
)
|
40
|
+
|
41
|
+
|
42
|
+
def version_callback(value: bool):
|
43
|
+
if value:
|
44
|
+
docling_version = importlib.metadata.version("docling")
|
45
|
+
docling_core_version = importlib.metadata.version("docling-core")
|
46
|
+
docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
|
47
|
+
docling_parse_version = importlib.metadata.version("docling-parse")
|
48
|
+
print(f"Docling version: {docling_version}")
|
49
|
+
print(f"Docling Core version: {docling_core_version}")
|
50
|
+
print(f"Docling IBM Models version: {docling_ibm_models_version}")
|
51
|
+
print(f"Docling Parse version: {docling_parse_version}")
|
52
|
+
raise typer.Exit()
|
53
|
+
|
54
|
+
|
55
|
+
# Define an enum for the backend options
|
56
|
+
class Backend(str, Enum):
|
57
|
+
PYPDFIUM2 = "pypdfium2"
|
58
|
+
DOCLING = "docling"
|
59
|
+
|
60
|
+
|
61
|
+
# Define an enum for the ocr engines
|
62
|
+
class OcrEngine(str, Enum):
|
63
|
+
EASYOCR = "easyocr"
|
64
|
+
TESSERACT_CLI = "tesseract_cli"
|
65
|
+
TESSERACT = "tesseract"
|
66
|
+
|
67
|
+
|
68
|
+
def export_documents(
|
69
|
+
conv_results: Iterable[ConversionResult],
|
70
|
+
output_dir: Path,
|
71
|
+
export_json: bool,
|
72
|
+
export_md: bool,
|
73
|
+
export_txt: bool,
|
74
|
+
export_doctags: bool,
|
75
|
+
):
|
76
|
+
|
77
|
+
success_count = 0
|
78
|
+
failure_count = 0
|
79
|
+
|
80
|
+
for conv_res in conv_results:
|
81
|
+
if conv_res.status == ConversionStatus.SUCCESS:
|
82
|
+
success_count += 1
|
83
|
+
doc_filename = conv_res.input.file.stem
|
84
|
+
|
85
|
+
# Export Deep Search document JSON format:
|
86
|
+
if export_json:
|
87
|
+
fname = output_dir / f"{doc_filename}.json"
|
88
|
+
with fname.open("w") as fp:
|
89
|
+
_log.info(f"writing JSON output to {fname}")
|
90
|
+
fp.write(json.dumps(conv_res.render_as_dict()))
|
91
|
+
|
92
|
+
# Export Text format:
|
93
|
+
if export_txt:
|
94
|
+
fname = output_dir / f"{doc_filename}.txt"
|
95
|
+
with fname.open("w") as fp:
|
96
|
+
_log.info(f"writing Text output to {fname}")
|
97
|
+
fp.write(conv_res.render_as_text())
|
98
|
+
|
99
|
+
# Export Markdown format:
|
100
|
+
if export_md:
|
101
|
+
fname = output_dir / f"{doc_filename}.md"
|
102
|
+
with fname.open("w") as fp:
|
103
|
+
_log.info(f"writing Markdown output to {fname}")
|
104
|
+
fp.write(conv_res.render_as_markdown())
|
105
|
+
|
106
|
+
# Export Document Tags format:
|
107
|
+
if export_doctags:
|
108
|
+
fname = output_dir / f"{doc_filename}.doctags"
|
109
|
+
with fname.open("w") as fp:
|
110
|
+
_log.info(f"writing Doc Tags output to {fname}")
|
111
|
+
fp.write(conv_res.render_as_doctags())
|
112
|
+
|
113
|
+
else:
|
114
|
+
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
115
|
+
failure_count += 1
|
116
|
+
|
117
|
+
_log.info(
|
118
|
+
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
119
|
+
)
|
120
|
+
|
121
|
+
|
122
|
+
@app.command(no_args_is_help=True)
|
123
|
+
def convert(
|
124
|
+
input_sources: Annotated[
|
125
|
+
List[str],
|
126
|
+
typer.Argument(
|
127
|
+
...,
|
128
|
+
metavar="source",
|
129
|
+
help="PDF files to convert. Can be local file / directory paths or URL.",
|
130
|
+
),
|
131
|
+
],
|
132
|
+
export_json: Annotated[
|
133
|
+
bool,
|
134
|
+
typer.Option(
|
135
|
+
..., "--json/--no-json", help="If enabled the document is exported as JSON."
|
136
|
+
),
|
137
|
+
] = False,
|
138
|
+
export_md: Annotated[
|
139
|
+
bool,
|
140
|
+
typer.Option(
|
141
|
+
..., "--md/--no-md", help="If enabled the document is exported as Markdown."
|
142
|
+
),
|
143
|
+
] = True,
|
144
|
+
export_txt: Annotated[
|
145
|
+
bool,
|
146
|
+
typer.Option(
|
147
|
+
..., "--txt/--no-txt", help="If enabled the document is exported as Text."
|
148
|
+
),
|
149
|
+
] = False,
|
150
|
+
export_doctags: Annotated[
|
151
|
+
bool,
|
152
|
+
typer.Option(
|
153
|
+
...,
|
154
|
+
"--doctags/--no-doctags",
|
155
|
+
help="If enabled the document is exported as Doc Tags.",
|
156
|
+
),
|
157
|
+
] = False,
|
158
|
+
ocr: Annotated[
|
159
|
+
bool,
|
160
|
+
typer.Option(
|
161
|
+
..., help="If enabled, the bitmap content will be processed using OCR."
|
162
|
+
),
|
163
|
+
] = True,
|
164
|
+
backend: Annotated[
|
165
|
+
Backend, typer.Option(..., help="The PDF backend to use.")
|
166
|
+
] = Backend.DOCLING,
|
167
|
+
ocr_engine: Annotated[
|
168
|
+
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
169
|
+
] = OcrEngine.EASYOCR,
|
170
|
+
output: Annotated[
|
171
|
+
Path, typer.Option(..., help="Output directory where results are saved.")
|
172
|
+
] = Path("."),
|
173
|
+
version: Annotated[
|
174
|
+
Optional[bool],
|
175
|
+
typer.Option(
|
176
|
+
"--version",
|
177
|
+
callback=version_callback,
|
178
|
+
is_eager=True,
|
179
|
+
help="Show version information.",
|
180
|
+
),
|
181
|
+
] = None,
|
182
|
+
):
|
183
|
+
logging.basicConfig(level=logging.INFO)
|
184
|
+
|
185
|
+
input_doc_paths: List[Path] = []
|
186
|
+
for src in input_sources:
|
187
|
+
source = resolve_file_source(source=src)
|
188
|
+
if not source.exists():
|
189
|
+
err_console.print(
|
190
|
+
f"[red]Error: The input file {source} does not exist.[/red]"
|
191
|
+
)
|
192
|
+
raise typer.Abort()
|
193
|
+
elif source.is_dir():
|
194
|
+
input_doc_paths.extend(list(source.glob("**/*.pdf")))
|
195
|
+
input_doc_paths.extend(list(source.glob("**/*.PDF")))
|
196
|
+
else:
|
197
|
+
input_doc_paths.append(source)
|
198
|
+
|
199
|
+
match backend:
|
200
|
+
case Backend.PYPDFIUM2:
|
201
|
+
do_cell_matching = ocr # only do cell matching when OCR enabled
|
202
|
+
pdf_backend = PyPdfiumDocumentBackend
|
203
|
+
case Backend.DOCLING:
|
204
|
+
do_cell_matching = True
|
205
|
+
pdf_backend = DoclingParseDocumentBackend
|
206
|
+
case _:
|
207
|
+
raise RuntimeError(f"Unexpected backend type {backend}")
|
208
|
+
|
209
|
+
match ocr_engine:
|
210
|
+
case OcrEngine.EASYOCR:
|
211
|
+
ocr_options = EasyOcrOptions()
|
212
|
+
case OcrEngine.TESSERACT_CLI:
|
213
|
+
ocr_options = TesseractCliOcrOptions()
|
214
|
+
case OcrEngine.TESSERACT:
|
215
|
+
ocr_options = TesseractOcrOptions()
|
216
|
+
case _:
|
217
|
+
raise RuntimeError(f"Unexpected backend type {backend}")
|
218
|
+
|
219
|
+
pipeline_options = PipelineOptions(
|
220
|
+
do_ocr=ocr,
|
221
|
+
ocr_options=ocr_options,
|
222
|
+
do_table_structure=True,
|
223
|
+
)
|
224
|
+
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
225
|
+
doc_converter = DocumentConverter(
|
226
|
+
pipeline_options=pipeline_options,
|
227
|
+
pdf_backend=pdf_backend,
|
228
|
+
)
|
229
|
+
|
230
|
+
# Define input files
|
231
|
+
input = DocumentConversionInput.from_paths(input_doc_paths)
|
232
|
+
|
233
|
+
start_time = time.time()
|
234
|
+
|
235
|
+
conv_results = doc_converter.convert(input)
|
236
|
+
|
237
|
+
output.mkdir(parents=True, exist_ok=True)
|
238
|
+
export_documents(
|
239
|
+
conv_results,
|
240
|
+
output_dir=output,
|
241
|
+
export_json=export_json,
|
242
|
+
export_md=export_md,
|
243
|
+
export_txt=export_txt,
|
244
|
+
export_doctags=export_doctags,
|
245
|
+
)
|
246
|
+
|
247
|
+
end_time = time.time() - start_time
|
248
|
+
|
249
|
+
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
250
|
+
|
251
|
+
|
252
|
+
if __name__ == "__main__":
|
253
|
+
app()
|
docling/datamodel/base_models.py
CHANGED
@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
9
9
|
from typing_extensions import Self
|
10
10
|
|
11
11
|
from docling.backend.abstract_backend import PdfPageBackend
|
12
|
+
from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
|
13
|
+
PipelineOptions,
|
14
|
+
TableStructureOptions,
|
15
|
+
)
|
12
16
|
|
13
17
|
|
14
18
|
class ConversionStatus(str, Enum):
|
@@ -16,7 +20,7 @@ class ConversionStatus(str, Enum):
|
|
16
20
|
STARTED = auto()
|
17
21
|
FAILURE = auto()
|
18
22
|
SUCCESS = auto()
|
19
|
-
|
23
|
+
PARTIAL_SUCCESS = auto()
|
20
24
|
|
21
25
|
|
22
26
|
class DocInputType(str, Enum):
|
@@ -29,6 +33,18 @@ class CoordOrigin(str, Enum):
|
|
29
33
|
BOTTOMLEFT = auto()
|
30
34
|
|
31
35
|
|
36
|
+
class DoclingComponentType(str, Enum):
|
37
|
+
PDF_BACKEND = auto()
|
38
|
+
MODEL = auto()
|
39
|
+
DOC_ASSEMBLER = auto()
|
40
|
+
|
41
|
+
|
42
|
+
class ErrorItem(BaseModel):
|
43
|
+
component_type: DoclingComponentType
|
44
|
+
module_name: str
|
45
|
+
error_message: str
|
46
|
+
|
47
|
+
|
32
48
|
class PageSize(BaseModel):
|
33
49
|
width: float = 0.0
|
34
50
|
height: float = 0.0
|
@@ -59,6 +75,15 @@ class BoundingBox(BaseModel):
|
|
59
75
|
|
60
76
|
return out_bbox
|
61
77
|
|
78
|
+
def normalized(self, page_size: PageSize) -> "BoundingBox":
|
79
|
+
out_bbox = copy.deepcopy(self)
|
80
|
+
out_bbox.l /= page_size.width
|
81
|
+
out_bbox.r /= page_size.width
|
82
|
+
out_bbox.t /= page_size.height
|
83
|
+
out_bbox.b /= page_size.height
|
84
|
+
|
85
|
+
return out_bbox
|
86
|
+
|
62
87
|
def as_tuple(self):
|
63
88
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
64
89
|
return (self.l, self.t, self.r, self.b)
|
@@ -66,7 +91,7 @@ class BoundingBox(BaseModel):
|
|
66
91
|
return (self.l, self.b, self.r, self.t)
|
67
92
|
|
68
93
|
@classmethod
|
69
|
-
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
|
94
|
+
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
70
95
|
if origin == CoordOrigin.TOPLEFT:
|
71
96
|
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
72
97
|
if r < l:
|
@@ -85,7 +110,10 @@ class BoundingBox(BaseModel):
|
|
85
110
|
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
86
111
|
|
87
112
|
def area(self) -> float:
|
88
|
-
|
113
|
+
area = (self.r - self.l) * (self.b - self.t)
|
114
|
+
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
115
|
+
area = -area
|
116
|
+
return area
|
89
117
|
|
90
118
|
def intersection_area_with(self, other: "BoundingBox") -> float:
|
91
119
|
# Calculate intersection coordinates
|
@@ -225,19 +253,19 @@ class EquationPrediction(BaseModel):
|
|
225
253
|
|
226
254
|
|
227
255
|
class PagePredictions(BaseModel):
|
228
|
-
layout: LayoutPrediction = None
|
229
|
-
tablestructure: TableStructurePrediction = None
|
230
|
-
figures_classification: FigureClassificationPrediction = None
|
231
|
-
equations_prediction: EquationPrediction = None
|
256
|
+
layout: Optional[LayoutPrediction] = None
|
257
|
+
tablestructure: Optional[TableStructurePrediction] = None
|
258
|
+
figures_classification: Optional[FigureClassificationPrediction] = None
|
259
|
+
equations_prediction: Optional[EquationPrediction] = None
|
232
260
|
|
233
261
|
|
234
262
|
PageElement = Union[TextElement, TableElement, FigureElement]
|
235
263
|
|
236
264
|
|
237
265
|
class AssembledUnit(BaseModel):
|
238
|
-
elements: List[PageElement]
|
239
|
-
body: List[PageElement]
|
240
|
-
headers: List[PageElement]
|
266
|
+
elements: List[PageElement] = []
|
267
|
+
body: List[PageElement] = []
|
268
|
+
headers: List[PageElement] = []
|
241
269
|
|
242
270
|
|
243
271
|
class Page(BaseModel):
|
@@ -246,7 +274,7 @@ class Page(BaseModel):
|
|
246
274
|
page_no: int
|
247
275
|
page_hash: Optional[str] = None
|
248
276
|
size: Optional[PageSize] = None
|
249
|
-
cells: List[Cell] =
|
277
|
+
cells: List[Cell] = []
|
250
278
|
predictions: PagePredictions = PagePredictions()
|
251
279
|
assembled: Optional[AssembledUnit] = None
|
252
280
|
|
@@ -277,22 +305,6 @@ class DocumentStream(BaseModel):
|
|
277
305
|
stream: BytesIO
|
278
306
|
|
279
307
|
|
280
|
-
class TableStructureOptions(BaseModel):
|
281
|
-
do_cell_matching: bool = (
|
282
|
-
True
|
283
|
-
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
284
|
-
# are merged across table columns.
|
285
|
-
# False: Let table structure model define the text cells, ignore PDF cells.
|
286
|
-
)
|
287
|
-
|
288
|
-
|
289
|
-
class PipelineOptions(BaseModel):
|
290
|
-
do_table_structure: bool = True # True: perform table structure extraction
|
291
|
-
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
292
|
-
|
293
|
-
table_structure_options: TableStructureOptions = TableStructureOptions()
|
294
|
-
|
295
|
-
|
296
308
|
class AssembleOptions(BaseModel):
|
297
309
|
keep_page_images: Annotated[
|
298
310
|
bool,
|