docling 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +240 -0
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +379 -324
- docling/datamodel/pipeline_options.py +16 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +19 -6
- docling/models/ds_glm_model.py +220 -22
- docling/models/easyocr_model.py +45 -40
- docling/models/layout_model.py +130 -114
- docling/models/page_assemble_model.py +119 -95
- docling/models/page_preprocessing_model.py +61 -0
- docling/models/table_structure_model.py +122 -111
- docling/models/tesseract_ocr_cli_model.py +63 -56
- docling/models/tesseract_ocr_model.py +58 -50
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.1.0.dist-info/METADATA +149 -0
- docling-2.1.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
docling/datamodel/pipeline_options.py CHANGED
@@ -1,4 +1,5 @@
 from enum import Enum, auto
+from pathlib import Path
 from typing import List, Literal, Optional, Union

 from pydantic import BaseModel, ConfigDict, Field
@@ -21,6 +22,9 @@ class TableStructureOptions(BaseModel):

 class OcrOptions(BaseModel):
     kind: str
+    bitmap_area_threshold: float = (
+        0.05  # percentage of the area for a bitmap to processed with OCR
+    )


 class EasyOcrOptions(OcrOptions):
@@ -58,6 +62,13 @@ class TesseractOcrOptions(OcrOptions):


 class PipelineOptions(BaseModel):
+    create_legacy_output: bool = (
+        True  # This defautl will be set to False on a future version of docling
+    )
+
+
+class PdfPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

@@ -65,3 +76,8 @@ class PipelineOptions(BaseModel):
     ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
         Field(EasyOcrOptions(), discriminator="kind")
     )
+
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+    generate_table_images: bool = False
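
The new `PdfPipelineOptions` fields above control page and element image generation. A minimal sketch of how they might be set; the option names come straight from this diff, while the 2x scale value is only an illustration:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

# Configure the image-generation options added in 2.1.0.
opts = PdfPipelineOptions()
opts.images_scale = 2.0             # render images at 2x the default scale
opts.generate_page_images = True    # keep a rendered image of each page
opts.generate_table_images = True   # keep cropped images of detected tables
```

These options are wired into a converter through the per-format `FormatOption` objects introduced in `docling/document_converter.py` below.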
docling/datamodel/settings.py CHANGED
docling/document_converter.py CHANGED
@@ -1,297 +1,260 @@
-import functools
 import logging
-import
+import sys
 import time
-import
+from functools import partial
 from pathlib import Path
-from typing import Iterable, Optional, Type
-
-import
-
-from
-
-from docling.backend.
-from docling.
-
-
-    ConversionStatus,
-    DoclingComponentType,
-    ErrorItem,
-    Page,
-)
+from typing import Dict, Iterable, Iterator, List, Optional, Type
+
+from pydantic import BaseModel, ConfigDict, model_validator, validate_call
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
     ConversionResult,
-    DocumentConversionInput,
     InputDocument,
+    _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
-from docling.
-from docling.
-from docling.pipeline.
-from docling.
-from docling.utils.utils import chunkify, create_hash
+from docling.datamodel.settings import DocumentLimits, settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)


-class
-
+class FormatOption(BaseModel):
+    pipeline_cls: Type[BasePipeline]
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]

-
-        self,
-        artifacts_path: Optional[Union[Path, str]] = None,
-        pipeline_options: PipelineOptions = PipelineOptions(),
-        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-        assemble_options: AssembleOptions = AssembleOptions(),
-    ):
-        if not artifacts_path:
-            artifacts_path = self.download_models_hf()
+    model_config = ConfigDict(arbitrary_types_allowed=True)

-
+    @model_validator(mode="after")
+    def set_optional_field_default(self) -> "FormatOption":
+        if self.pipeline_options is None:
+            self.pipeline_options = self.pipeline_cls.get_default_options()
+        return self

-        self.model_pipeline = pipeline_cls(
-            artifacts_path=artifacts_path, pipeline_options=pipeline_options
-        )

-
-
-
-        self.assemble_options = assemble_options
-
-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-
-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.0.0",
-        )
+class WordFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend

-        return Path(download_path)

-
+class PowerpointFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend

-        for input_batch in chunkify(
-            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
-        ):
-            _log.info(f"Going to convert document batch...")
-            # parallel processing only within input_batch
-            # with ThreadPoolExecutor(
-            #    max_workers=settings.perf.doc_batch_concurrency
-            # ) as pool:
-            #   yield from pool.map(self.process_document, input_batch)

-
-
-
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
-        """Convert a single document.
-
-        Args:
-            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
-
-        Raises:
-            ValueError: If source is of unexpected type.
-            RuntimeError: If conversion fails.
-
-        Returns:
-            ConversionResult: The conversion result object.
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            try:
-                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                res = requests.get(http_url, stream=True)
-                res.raise_for_status()
-                fname = None
-                # try to get filename from response header
-                if cont_disp := res.headers.get("Content-Disposition"):
-                    for par in cont_disp.strip().split(";"):
-                        # currently only handling directive "filename" (not "*filename")
-                        if (split := par.split("=")) and split[0].strip() == "filename":
-                            fname = "=".join(split[1:]).strip().strip("'\"") or None
-                            break
-                # otherwise, use name from URL:
-                if fname is None:
-                    fname = Path(http_url.path).name or self._default_download_filename
-                local_path = Path(temp_dir) / fname
-                with open(local_path, "wb") as f:
-                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                        f.write(chunk)
-            except ValidationError:
-                try:
-                    local_path = TypeAdapter(Path).validate_python(source)
-                except ValidationError:
-                    raise ValueError(
-                        f"Unexpected file path type encountered: {type(source)}"
-                    )
-            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert(conv_inp)
-            conv_res: ConversionResult = next(conv_res_iter)
-            if conv_res.status not in {
-                ConversionStatus.SUCCESS,
-                ConversionStatus.PARTIAL_SUCCESS,
-            }:
-                raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-            return conv_res
+class HTMLFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend

-    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-        start_doc_time = time.time()
-        conv_res = ConversionResult(input=in_doc)

-
+class PdfFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend

-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res

-
-
+class ImageFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend

-        all_assembled_pages = []

-
-
-
-
-
+_format_to_default_options = {
+    InputFormat.DOCX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+    ),
+    InputFormat.PPTX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+    ),
+    InputFormat.HTML: FormatOption(
+        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+    ),
+    InputFormat.IMAGE: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+    InputFormat.PDF: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+}

-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self._initialize_page, in_doc), page_batch
-                )

-
-
-                    functools.partial(self._populate_page_images, in_doc), init_pages
-                )
+class DocumentConverter:
+    _default_download_filename = "file"

-
-
-
-
+    def __init__(
+        self,
+        allowed_formats: Optional[List[InputFormat]] = None,
+        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
+    ):
+        self.allowed_formats = allowed_formats
+        self.format_to_options = format_options
+
+        if self.allowed_formats is None:
+            # if self.format_to_options is not None:
+            #    self.allowed_formats = self.format_to_options.keys()
+            # else:
+            self.allowed_formats = [e for e in InputFormat]  # all formats
+
+        if self.format_to_options is None:
+            self.format_to_options = _format_to_default_options
+        else:
+            for f in self.allowed_formats:
+                if f not in self.format_to_options.keys():
+                    _log.debug(f"Requested format {f} will use default options.")
+                    self.format_to_options[f] = _format_to_default_options[f]
+
+            remove_keys = []
+            for f in self.format_to_options.keys():
+                if f not in self.allowed_formats:
+                    remove_keys.append(f)
+
+            for f in remove_keys:
+                self.format_to_options.pop(f)
+
+        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+
+    @validate_call(config=ConfigDict(strict=True))
+    def convert(
+        self,
+        source: Path | str | DocumentStream,  # TODO review naming
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> ConversionResult:
+
+        all_res = self.convert_all(
+            source=[source],
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        return next(all_res)
+
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_all(
+        self,
+        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> Iterator[ConversionResult]:
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source,
+            limit=limits,
+        )
+        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+        for conv_res in conv_res_iter:
+            if raises_on_error and conv_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise RuntimeError(
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                 )
+            else:
+                yield conv_res

-
-
+    def _convert(
+        self, conv_input: _DocumentConversionInput, raises_on_error: bool
+    ) -> Iterator[ConversionResult]:
+        assert self.format_to_options is not None

-
-
+        for input_batch in chunkify(
+            conv_input.docs(self.format_to_options),
+            settings.perf.doc_batch_size,  # pass format_options
+        ):
+            _log.info(f"Going to convert document batch...")
+            # parallel processing only within input_batch
+            # with ThreadPoolExecutor(
+            #    max_workers=settings.perf.doc_batch_concurrency
+            # ) as pool:
+            #   yield from pool.map(self.process_document, input_batch)

-
-
-
+            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+            for item in map(
+                partial(self._process_document, raises_on_error=raises_on_error),
+                input_batch,
+            ):
+                if item is not None:
+                    yield item
+
+    def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
+        assert self.format_to_options is not None
+
+        fopt = self.format_to_options.get(doc.format)
+
+        if fopt is None:
+            raise RuntimeError(f"Could not get pipeline for document {doc.file}")
+        else:
+            pipeline_class = fopt.pipeline_cls
+            pipeline_options = fopt.pipeline_options
+
+        assert pipeline_options is not None
+        # TODO this will ignore if different options have been defined for the same pipeline class.
+        if (
+            pipeline_class not in self.initialized_pipelines
+            or self.initialized_pipelines[pipeline_class].pipeline_options
+            != pipeline_options
+        ):
+            self.initialized_pipelines[pipeline_class] = pipeline_class(
+                pipeline_options=pipeline_options
+            )
+        return self.initialized_pipelines[pipeline_class]

-
-
-
+    def _process_document(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> Optional[ConversionResult]:
+        assert self.allowed_formats is not None
+        assert in_doc.format in self.allowed_formats

-
-            assembled_page._backend.unload()
+        start_doc_time = time.time()

-
+        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

-
-
+        end_doc_time = time.time() - start_doc_time
+        _log.info(
+            f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
+        )

-
-        self._assemble_doc(conv_res)
+        return conv_res

-
-
-
-
-
-
-
-
-
+    def _execute_pipeline(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> ConversionResult:
+        if in_doc.valid:
+            pipeline = self._get_pipeline(in_doc)
+            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                if raises_on_error:
+                    raise RuntimeError(
+                        f"No pipeline could be initialized for {in_doc.file}."
                     )
-
+                else:
+                    conv_res = ConversionResult(input=in_doc)
+                    conv_res.status = ConversionStatus.FAILURE
+                    return conv_res

-            conv_res.
+            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)

-
-
-
-                _log.info(
-                    f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                    f"{trace}"
-                )
+        else:
+            if raises_on_error:
+                raise RuntimeError(f"Input document {in_doc.file} is not valid.")

-
-
-
-
-
-        end_doc_time = time.time() - start_doc_time
-        _log.info(
-            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-        )
+            else:
+                # invalid doc or not of desired format
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                # TODO add error log why it failed.

         return conv_res
-
-    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)
-        page.size = page._backend.get_size()
-        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
-
-        return page
-
-    # Generate the page image and store it in the page object
-    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        # default scale
-        page.get_image(
-            scale=1.0
-        )  # puts the page image on the image cache at default scale
-
-        # user requested scales
-        if self.assemble_options.images_scale is not None:
-            page._default_image_scale = self.assemble_options.images_scale
-            page.get_image(
-                scale=self.assemble_options.images_scale
-            )  # this will trigger storing the image in the internal cache
-
-        return page
-
-    # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-        page.cells = page._backend.get_text_cells()
-
-        # DEBUG code:
-        def draw_text_boxes(image, cells):
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
-
-        # draw_text_boxes(page.get_image(scale=1.0), cells)
-
-        return page
-
-    def _assemble_doc(self, conv_res: ConversionResult):
-        all_elements = []
-        all_headers = []
-        all_body = []
-
-        for p in conv_res.pages:
-
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
-
-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )
-
-        conv_res.output = self.glm_model(conv_res)
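
The rewrite above replaces the 1.x `convert_single()` / `DocumentConversionInput` flow with per-format options and a `convert()` that returns a single `ConversionResult`. A minimal migration sketch using only the classes shown in this diff (input file names are hypothetical):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=PdfPipelineOptions())
    },
)

# 1.x: conv_res = next(converter.convert(DocumentConversionInput.from_paths([...])))
# 2.x: convert() takes one source and returns the ConversionResult directly.
result = converter.convert("report.pdf")

# convert_all() takes an iterable of sources and yields results lazily.
for res in converter.convert_all(["a.pdf", "b.docx"], raises_on_error=False):
    print(res.status)
```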
docling/models/base_model.py ADDED
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import Any, Iterable
+
+from docling_core.types.doc import DoclingDocument, NodeItem
+
+from docling.datamodel.base_models import Page
+
+
+class BasePageModel(ABC):
+    @abstractmethod
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        pass
+
+
+class BaseEnrichmentModel(ABC):
+
+    @abstractmethod
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        pass
+
+    @abstractmethod
+    def __call__(
+        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
+    ) -> Iterable[Any]:
+        pass
docling/models/base_ocr_model.py CHANGED
@@ -1,14 +1,15 @@
 import copy
 import logging
 from abc import abstractmethod
-from typing import Iterable, List
+from typing import Iterable, List

 import numpy as np
+from docling_core.types.doc import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label

-from docling.datamodel.base_models import
+from docling.datamodel.base_models import OcrCell, Page
 from docling.datamodel.pipeline_options import OcrOptions

 _log = logging.getLogger(__name__)
@@ -20,8 +21,9 @@ class BaseOcrModel:
         self.options = options

     # Computes the optimum amount and coordinates of rectangles to OCR on a given page
-    def get_ocr_rects(self, page: Page) ->
+    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
         BITMAP_COVERAGE_TRESHOLD = 0.75
+        assert page.size is not None

         def find_ocr_rects(size, bitmap_rects):
             image = Image.new(
@@ -60,11 +62,14 @@ class BaseOcrModel:

             return (area_frac, bounding_boxes)  # fraction covered  # boxes

-
+        if page._backend is not None:
+            bitmap_rects = page._backend.get_bitmap_rects()
+        else:
+            bitmap_rects = []
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

         # return full-page rectangle if sufficiently covered with bitmaps
-        if coverage > BITMAP_COVERAGE_TRESHOLD:
+        if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
             return [
                 BoundingBox(
                     l=0,
@@ -75,7 +80,15 @@ class BaseOcrModel:
                 )
             ]
         # return individual rectangles if the bitmap coverage is smaller
-
+        else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
+
+            # skip OCR if the bitmap area on the page is smaller than the options threshold
+            ocr_rects = [
+                rect
+                for rect in ocr_rects
+                if rect.area() / (page.size.width * page.size.height)
+                > self.options.bitmap_area_threshold
+            ]
         return ocr_rects

     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
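
For reference, the arithmetic behind the new `bitmap_area_threshold` filter in `get_ocr_rects`: a bitmap rectangle is kept for OCR only when its area exceeds the threshold fraction (0.05 by default, from `OcrOptions`) of the page area. A worked example with hypothetical numbers:

```python
# Hypothetical US Letter page in points and one candidate bitmap rectangle.
page_w, page_h = 612.0, 792.0
rect_area = 20_000.0
bitmap_area_threshold = 0.05  # default from OcrOptions in this release

frac = rect_area / (page_w * page_h)  # ~0.041
if frac > bitmap_area_threshold:
    print("rectangle is OCRed")
else:
    print("rectangle skipped: below bitmap_area_threshold")  # this branch runs
```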