docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -1,12 +1,13 @@
|
|
1
|
-
from enum import Enum
|
1
|
+
from enum import Enum
|
2
|
+
from pathlib import Path
|
2
3
|
from typing import List, Literal, Optional, Union
|
3
4
|
|
4
5
|
from pydantic import BaseModel, ConfigDict, Field
|
5
6
|
|
6
7
|
|
7
8
|
class TableFormerMode(str, Enum):
|
8
|
-
FAST =
|
9
|
-
ACCURATE =
|
9
|
+
FAST = "fast"
|
10
|
+
ACCURATE = "accurate"
|
10
11
|
|
11
12
|
|
12
13
|
class TableStructureOptions(BaseModel):
|
@@ -21,6 +22,9 @@ class TableStructureOptions(BaseModel):
|
|
21
22
|
|
22
23
|
class OcrOptions(BaseModel):
|
23
24
|
kind: str
|
25
|
+
bitmap_area_threshold: float = (
|
26
|
+
0.05 # percentage of the area for a bitmap to be processed with OCR
|
27
|
+
)
|
24
28
|
|
25
29
|
|
26
30
|
class EasyOcrOptions(OcrOptions):
|
@@ -58,6 +62,13 @@ class TesseractOcrOptions(OcrOptions):
|
|
58
62
|
|
59
63
|
|
60
64
|
class PipelineOptions(BaseModel):
|
65
|
+
create_legacy_output: bool = (
|
66
|
+
True # This default will be set to False in a future version of docling
|
67
|
+
)
|
68
|
+
|
69
|
+
|
70
|
+
class PdfPipelineOptions(PipelineOptions):
|
71
|
+
artifacts_path: Optional[Union[Path, str]] = None
|
61
72
|
do_table_structure: bool = True # True: perform table structure extraction
|
62
73
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
63
74
|
|
@@ -65,3 +76,8 @@ class PipelineOptions(BaseModel):
|
|
65
76
|
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
|
66
77
|
Field(EasyOcrOptions(), discriminator="kind")
|
67
78
|
)
|
79
|
+
|
80
|
+
images_scale: float = 1.0
|
81
|
+
generate_page_images: bool = False
|
82
|
+
generate_picture_images: bool = False
|
83
|
+
generate_table_images: bool = False
|
docling/datamodel/settings.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import sys
|
2
|
+
from pathlib import Path
|
2
3
|
|
3
4
|
from pydantic import BaseModel
|
4
5
|
from pydantic_settings import BaseSettings
|
@@ -14,6 +15,7 @@ class BatchConcurrencySettings(BaseModel):
|
|
14
15
|
doc_batch_concurrency: int = 2
|
15
16
|
page_batch_size: int = 4
|
16
17
|
page_batch_concurrency: int = 2
|
18
|
+
elements_batch_size: int = 16
|
17
19
|
|
18
20
|
# doc_batch_size: int = 1
|
19
21
|
# doc_batch_concurrency: int = 1
|
@@ -25,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
|
|
25
27
|
# To force models into single core: export OMP_NUM_THREADS=1
|
26
28
|
|
27
29
|
|
30
|
+
class DebugSettings(BaseModel):
|
31
|
+
visualize_cells: bool = False
|
32
|
+
visualize_ocr: bool = False
|
33
|
+
visualize_layout: bool = False
|
34
|
+
visualize_tables: bool = False
|
35
|
+
|
36
|
+
profile_pipeline_timings: bool = False
|
37
|
+
|
38
|
+
# Path used to output debug information.
|
39
|
+
debug_output_path: str = str(Path.cwd() / "debug")
|
40
|
+
|
41
|
+
|
28
42
|
class AppSettings(BaseSettings):
|
29
43
|
perf: BatchConcurrencySettings
|
44
|
+
debug: DebugSettings
|
30
45
|
|
31
46
|
|
32
|
-
settings = AppSettings(perf=BatchConcurrencySettings())
|
47
|
+
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
docling/document_converter.py
CHANGED
@@ -1,297 +1,286 @@
|
|
1
|
-
import functools
|
2
1
|
import logging
|
3
|
-
import
|
2
|
+
import sys
|
4
3
|
import time
|
5
|
-
import
|
4
|
+
from functools import partial
|
6
5
|
from pathlib import Path
|
7
|
-
from typing import Iterable, Optional, Type
|
8
|
-
|
9
|
-
import
|
10
|
-
|
11
|
-
from
|
12
|
-
|
13
|
-
from docling.backend.
|
14
|
-
from docling.
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
ErrorItem,
|
20
|
-
Page,
|
21
|
-
)
|
6
|
+
from typing import Dict, Iterable, Iterator, List, Optional, Type
|
7
|
+
|
8
|
+
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
9
|
+
|
10
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
11
|
+
from docling.backend.asciidoc_backend import AsciiDocBackend
|
12
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
13
|
+
from docling.backend.html_backend import HTMLDocumentBackend
|
14
|
+
from docling.backend.md_backend import MarkdownDocumentBackend
|
15
|
+
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
16
|
+
from docling.backend.msword_backend import MsWordDocumentBackend
|
17
|
+
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
22
18
|
from docling.datamodel.document import (
|
23
19
|
ConversionResult,
|
24
|
-
DocumentConversionInput,
|
25
20
|
InputDocument,
|
21
|
+
_DocumentConversionInput,
|
26
22
|
)
|
27
23
|
from docling.datamodel.pipeline_options import PipelineOptions
|
28
|
-
from docling.datamodel.settings import settings
|
29
|
-
from docling.
|
30
|
-
from docling.
|
31
|
-
from docling.pipeline.
|
32
|
-
from docling.
|
33
|
-
from docling.utils.utils import chunkify, create_hash
|
24
|
+
from docling.datamodel.settings import DocumentLimits, settings
|
25
|
+
from docling.pipeline.base_pipeline import BasePipeline
|
26
|
+
from docling.pipeline.simple_pipeline import SimplePipeline
|
27
|
+
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
28
|
+
from docling.utils.utils import chunkify
|
34
29
|
|
35
30
|
_log = logging.getLogger(__name__)
|
36
31
|
|
37
32
|
|
38
|
-
class
|
39
|
-
|
33
|
+
class FormatOption(BaseModel):
|
34
|
+
pipeline_cls: Type[BasePipeline]
|
35
|
+
pipeline_options: Optional[PipelineOptions] = None
|
36
|
+
backend: Type[AbstractDocumentBackend]
|
40
37
|
|
41
|
-
|
42
|
-
self,
|
43
|
-
artifacts_path: Optional[Union[Path, str]] = None,
|
44
|
-
pipeline_options: PipelineOptions = PipelineOptions(),
|
45
|
-
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
|
46
|
-
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
|
47
|
-
assemble_options: AssembleOptions = AssembleOptions(),
|
48
|
-
):
|
49
|
-
if not artifacts_path:
|
50
|
-
artifacts_path = self.download_models_hf()
|
38
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
51
39
|
|
52
|
-
|
40
|
+
@model_validator(mode="after")
|
41
|
+
def set_optional_field_default(self) -> "FormatOption":
|
42
|
+
if self.pipeline_options is None:
|
43
|
+
self.pipeline_options = self.pipeline_cls.get_default_options()
|
44
|
+
return self
|
53
45
|
|
54
|
-
self.model_pipeline = pipeline_cls(
|
55
|
-
artifacts_path=artifacts_path, pipeline_options=pipeline_options
|
56
|
-
)
|
57
46
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
self.assemble_options = assemble_options
|
62
|
-
|
63
|
-
@staticmethod
|
64
|
-
def download_models_hf(
|
65
|
-
local_dir: Optional[Path] = None, force: bool = False
|
66
|
-
) -> Path:
|
67
|
-
from huggingface_hub import snapshot_download
|
68
|
-
|
69
|
-
download_path = snapshot_download(
|
70
|
-
repo_id="ds4sd/docling-models",
|
71
|
-
force_download=force,
|
72
|
-
local_dir=local_dir,
|
73
|
-
revision="v2.0.0",
|
74
|
-
)
|
47
|
+
class WordFormatOption(FormatOption):
|
48
|
+
pipeline_cls: Type = SimplePipeline
|
49
|
+
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
75
50
|
|
76
|
-
return Path(download_path)
|
77
51
|
|
78
|
-
|
52
|
+
class PowerpointFormatOption(FormatOption):
|
53
|
+
pipeline_cls: Type = SimplePipeline
|
54
|
+
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
79
55
|
|
80
|
-
for input_batch in chunkify(
|
81
|
-
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
|
82
|
-
):
|
83
|
-
_log.info(f"Going to convert document batch...")
|
84
|
-
# parallel processing only within input_batch
|
85
|
-
# with ThreadPoolExecutor(
|
86
|
-
# max_workers=settings.perf.doc_batch_concurrency
|
87
|
-
# ) as pool:
|
88
|
-
# yield from pool.map(self.process_document, input_batch)
|
89
56
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
94
|
-
"""Convert a single document.
|
95
|
-
|
96
|
-
Args:
|
97
|
-
source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
|
98
|
-
|
99
|
-
Raises:
|
100
|
-
ValueError: If source is of unexpected type.
|
101
|
-
RuntimeError: If conversion fails.
|
102
|
-
|
103
|
-
Returns:
|
104
|
-
ConversionResult: The conversion result object.
|
105
|
-
"""
|
106
|
-
with tempfile.TemporaryDirectory() as temp_dir:
|
107
|
-
try:
|
108
|
-
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
109
|
-
res = requests.get(http_url, stream=True)
|
110
|
-
res.raise_for_status()
|
111
|
-
fname = None
|
112
|
-
# try to get filename from response header
|
113
|
-
if cont_disp := res.headers.get("Content-Disposition"):
|
114
|
-
for par in cont_disp.strip().split(";"):
|
115
|
-
# currently only handling directive "filename" (not "*filename")
|
116
|
-
if (split := par.split("=")) and split[0].strip() == "filename":
|
117
|
-
fname = "=".join(split[1:]).strip().strip("'\"") or None
|
118
|
-
break
|
119
|
-
# otherwise, use name from URL:
|
120
|
-
if fname is None:
|
121
|
-
fname = Path(http_url.path).name or self._default_download_filename
|
122
|
-
local_path = Path(temp_dir) / fname
|
123
|
-
with open(local_path, "wb") as f:
|
124
|
-
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
125
|
-
f.write(chunk)
|
126
|
-
except ValidationError:
|
127
|
-
try:
|
128
|
-
local_path = TypeAdapter(Path).validate_python(source)
|
129
|
-
except ValidationError:
|
130
|
-
raise ValueError(
|
131
|
-
f"Unexpected file path type encountered: {type(source)}"
|
132
|
-
)
|
133
|
-
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
134
|
-
conv_res_iter = self.convert(conv_inp)
|
135
|
-
conv_res: ConversionResult = next(conv_res_iter)
|
136
|
-
if conv_res.status not in {
|
137
|
-
ConversionStatus.SUCCESS,
|
138
|
-
ConversionStatus.PARTIAL_SUCCESS,
|
139
|
-
}:
|
140
|
-
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
141
|
-
return conv_res
|
57
|
+
class MarkdownFormatOption(FormatOption):
|
58
|
+
pipeline_cls: Type = SimplePipeline
|
59
|
+
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
142
60
|
|
143
|
-
def _process_document(self, in_doc: InputDocument) -> ConversionResult:
|
144
|
-
start_doc_time = time.time()
|
145
|
-
conv_res = ConversionResult(input=in_doc)
|
146
61
|
|
147
|
-
|
62
|
+
class AsciiDocFormatOption(FormatOption):
|
63
|
+
pipeline_cls: Type = SimplePipeline
|
64
|
+
backend: Type[AbstractDocumentBackend] = AsciiDocBackend
|
148
65
|
|
149
|
-
if not in_doc.valid:
|
150
|
-
conv_res.status = ConversionStatus.FAILURE
|
151
|
-
return conv_res
|
152
66
|
|
153
|
-
|
154
|
-
|
67
|
+
class HTMLFormatOption(FormatOption):
|
68
|
+
pipeline_cls: Type = SimplePipeline
|
69
|
+
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
155
70
|
|
156
|
-
all_assembled_pages = []
|
157
71
|
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
start_pb_time = time.time()
|
162
|
-
# Pipeline
|
72
|
+
class PdfFormatOption(FormatOption):
|
73
|
+
pipeline_cls: Type = StandardPdfPipeline
|
74
|
+
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
163
75
|
|
164
|
-
# 1. Initialise the page resources
|
165
|
-
init_pages = map(
|
166
|
-
functools.partial(self._initialize_page, in_doc), page_batch
|
167
|
-
)
|
168
76
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
)
|
77
|
+
class ImageFormatOption(FormatOption):
|
78
|
+
pipeline_cls: Type = StandardPdfPipeline
|
79
|
+
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
173
80
|
|
174
|
-
# 3. Populate programmatic page cells
|
175
|
-
pages_with_cells = map(
|
176
|
-
functools.partial(self._parse_page_cells, in_doc),
|
177
|
-
pages_with_images,
|
178
|
-
)
|
179
81
|
|
180
|
-
|
181
|
-
|
82
|
+
_format_to_default_options = {
|
83
|
+
InputFormat.DOCX: FormatOption(
|
84
|
+
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
85
|
+
),
|
86
|
+
InputFormat.PPTX: FormatOption(
|
87
|
+
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
88
|
+
),
|
89
|
+
InputFormat.MD: FormatOption(
|
90
|
+
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
91
|
+
),
|
92
|
+
InputFormat.ASCIIDOC: FormatOption(
|
93
|
+
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
94
|
+
),
|
95
|
+
InputFormat.HTML: FormatOption(
|
96
|
+
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
97
|
+
),
|
98
|
+
InputFormat.IMAGE: FormatOption(
|
99
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
100
|
+
),
|
101
|
+
InputFormat.PDF: FormatOption(
|
102
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
103
|
+
),
|
104
|
+
}
|
182
105
|
|
183
|
-
# 5. Assemble page elements (per page)
|
184
|
-
assembled_pages = self.page_assemble_model(pipeline_pages)
|
185
106
|
|
186
|
-
|
187
|
-
|
188
|
-
# Free up mem resources before moving on with next batch
|
107
|
+
class DocumentConverter:
|
108
|
+
_default_download_filename = "file"
|
189
109
|
|
190
|
-
|
191
|
-
|
192
|
-
|
110
|
+
def __init__(
|
111
|
+
self,
|
112
|
+
allowed_formats: Optional[List[InputFormat]] = None,
|
113
|
+
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
114
|
+
):
|
115
|
+
self.allowed_formats = allowed_formats
|
116
|
+
self.format_to_options = format_options
|
117
|
+
|
118
|
+
if self.allowed_formats is None:
|
119
|
+
# if self.format_to_options is not None:
|
120
|
+
# self.allowed_formats = self.format_to_options.keys()
|
121
|
+
# else:
|
122
|
+
self.allowed_formats = [e for e in InputFormat] # all formats
|
123
|
+
|
124
|
+
if self.format_to_options is None:
|
125
|
+
self.format_to_options = _format_to_default_options
|
126
|
+
else:
|
127
|
+
for f in self.allowed_formats:
|
128
|
+
if f not in self.format_to_options.keys():
|
129
|
+
_log.debug(f"Requested format {f} will use default options.")
|
130
|
+
self.format_to_options[f] = _format_to_default_options[f]
|
131
|
+
|
132
|
+
remove_keys = []
|
133
|
+
for f in self.format_to_options.keys():
|
134
|
+
if f not in self.allowed_formats:
|
135
|
+
remove_keys.append(f)
|
136
|
+
|
137
|
+
for f in remove_keys:
|
138
|
+
self.format_to_options.pop(f)
|
139
|
+
|
140
|
+
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
141
|
+
|
142
|
+
def initialize_pipeline(self, format: InputFormat):
|
143
|
+
"""Initialize the conversion pipeline for the selected format."""
|
144
|
+
self._get_pipeline(doc_format=format)
|
145
|
+
|
146
|
+
@validate_call(config=ConfigDict(strict=True))
|
147
|
+
def convert(
|
148
|
+
self,
|
149
|
+
source: Path | str | DocumentStream, # TODO review naming
|
150
|
+
raises_on_error: bool = True,
|
151
|
+
max_num_pages: int = sys.maxsize,
|
152
|
+
max_file_size: int = sys.maxsize,
|
153
|
+
) -> ConversionResult:
|
154
|
+
|
155
|
+
all_res = self.convert_all(
|
156
|
+
source=[source],
|
157
|
+
raises_on_error=raises_on_error,
|
158
|
+
max_num_pages=max_num_pages,
|
159
|
+
max_file_size=max_file_size,
|
160
|
+
)
|
161
|
+
return next(all_res)
|
193
162
|
|
194
|
-
|
195
|
-
|
163
|
+
@validate_call(config=ConfigDict(strict=True))
|
164
|
+
def convert_all(
|
165
|
+
self,
|
166
|
+
source: Iterable[Path | str | DocumentStream], # TODO review naming
|
167
|
+
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
168
|
+
max_num_pages: int = sys.maxsize,
|
169
|
+
max_file_size: int = sys.maxsize,
|
170
|
+
) -> Iterator[ConversionResult]:
|
171
|
+
limits = DocumentLimits(
|
172
|
+
max_num_pages=max_num_pages,
|
173
|
+
max_file_size=max_file_size,
|
174
|
+
)
|
175
|
+
conv_input = _DocumentConversionInput(
|
176
|
+
path_or_stream_iterator=source,
|
177
|
+
limit=limits,
|
178
|
+
)
|
179
|
+
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
180
|
+
for conv_res in conv_res_iter:
|
181
|
+
if raises_on_error and conv_res.status not in {
|
182
|
+
ConversionStatus.SUCCESS,
|
183
|
+
ConversionStatus.PARTIAL_SUCCESS,
|
184
|
+
}:
|
185
|
+
raise RuntimeError(
|
186
|
+
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
187
|
+
)
|
188
|
+
else:
|
189
|
+
yield conv_res
|
196
190
|
|
197
|
-
|
191
|
+
def _convert(
|
192
|
+
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
193
|
+
) -> Iterator[ConversionResult]:
|
194
|
+
assert self.format_to_options is not None
|
198
195
|
|
199
|
-
|
200
|
-
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
196
|
+
start_time = time.monotonic()
|
201
197
|
|
202
|
-
|
203
|
-
self.
|
198
|
+
for input_batch in chunkify(
|
199
|
+
conv_input.docs(self.format_to_options),
|
200
|
+
settings.perf.doc_batch_size, # pass format_options
|
201
|
+
):
|
202
|
+
_log.info(f"Going to convert document batch...")
|
204
203
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
204
|
+
# parallel processing only within input_batch
|
205
|
+
# with ThreadPoolExecutor(
|
206
|
+
# max_workers=settings.perf.doc_batch_concurrency
|
207
|
+
# ) as pool:
|
208
|
+
# yield from pool.map(self.process_document, input_batch)
|
209
|
+
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
210
|
+
|
211
|
+
for item in map(
|
212
|
+
partial(self._process_document, raises_on_error=raises_on_error),
|
213
|
+
input_batch,
|
214
|
+
):
|
215
|
+
elapsed = time.monotonic() - start_time
|
216
|
+
start_time = time.monotonic()
|
217
|
+
|
218
|
+
if item is not None:
|
219
|
+
_log.info(
|
220
|
+
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
214
221
|
)
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
222
|
+
yield item
|
223
|
+
else:
|
224
|
+
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
225
|
+
|
226
|
+
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
227
|
+
assert self.format_to_options is not None
|
228
|
+
|
229
|
+
fopt = self.format_to_options.get(doc_format)
|
230
|
+
|
231
|
+
if fopt is None:
|
232
|
+
raise RuntimeError(f"Could not get pipeline for {doc_format}")
|
233
|
+
else:
|
234
|
+
pipeline_class = fopt.pipeline_cls
|
235
|
+
pipeline_options = fopt.pipeline_options
|
236
|
+
|
237
|
+
assert pipeline_options is not None
|
238
|
+
# TODO: only one pipeline instance is cached per pipeline class, so differing options for the same pipeline class replace the cached instance instead of coexisting.
|
239
|
+
if (
|
240
|
+
pipeline_class not in self.initialized_pipelines
|
241
|
+
or self.initialized_pipelines[pipeline_class].pipeline_options
|
242
|
+
!= pipeline_options
|
243
|
+
):
|
244
|
+
self.initialized_pipelines[pipeline_class] = pipeline_class(
|
245
|
+
pipeline_options=pipeline_options
|
225
246
|
)
|
247
|
+
return self.initialized_pipelines[pipeline_class]
|
226
248
|
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
249
|
+
def _process_document(
|
250
|
+
self, in_doc: InputDocument, raises_on_error: bool
|
251
|
+
) -> Optional[ConversionResult]:
|
252
|
+
assert self.allowed_formats is not None
|
253
|
+
assert in_doc.format in self.allowed_formats
|
231
254
|
|
232
|
-
|
233
|
-
_log.info(
|
234
|
-
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
|
235
|
-
)
|
255
|
+
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
236
256
|
|
237
257
|
return conv_res
|
238
258
|
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
if self.assemble_options.images_scale is not None:
|
256
|
-
page._default_image_scale = self.assemble_options.images_scale
|
257
|
-
page.get_image(
|
258
|
-
scale=self.assemble_options.images_scale
|
259
|
-
) # this will trigger storing the image in the internal cache
|
260
|
-
|
261
|
-
return page
|
262
|
-
|
263
|
-
# Extract and populate the page cells and store it in the page object
|
264
|
-
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
|
265
|
-
page.cells = page._backend.get_text_cells()
|
266
|
-
|
267
|
-
# DEBUG code:
|
268
|
-
def draw_text_boxes(image, cells):
|
269
|
-
draw = ImageDraw.Draw(image)
|
270
|
-
for c in cells:
|
271
|
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
272
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
273
|
-
image.show()
|
274
|
-
|
275
|
-
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
276
|
-
|
277
|
-
return page
|
278
|
-
|
279
|
-
def _assemble_doc(self, conv_res: ConversionResult):
|
280
|
-
all_elements = []
|
281
|
-
all_headers = []
|
282
|
-
all_body = []
|
283
|
-
|
284
|
-
for p in conv_res.pages:
|
285
|
-
|
286
|
-
for el in p.assembled.body:
|
287
|
-
all_body.append(el)
|
288
|
-
for el in p.assembled.headers:
|
289
|
-
all_headers.append(el)
|
290
|
-
for el in p.assembled.elements:
|
291
|
-
all_elements.append(el)
|
292
|
-
|
293
|
-
conv_res.assembled = AssembledUnit(
|
294
|
-
elements=all_elements, headers=all_headers, body=all_body
|
295
|
-
)
|
259
|
+
def _execute_pipeline(
|
260
|
+
self, in_doc: InputDocument, raises_on_error: bool
|
261
|
+
) -> ConversionResult:
|
262
|
+
if in_doc.valid:
|
263
|
+
pipeline = self._get_pipeline(in_doc.format)
|
264
|
+
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
265
|
+
if raises_on_error:
|
266
|
+
raise RuntimeError(
|
267
|
+
f"No pipeline could be initialized for {in_doc.file}."
|
268
|
+
)
|
269
|
+
else:
|
270
|
+
conv_res = ConversionResult(input=in_doc)
|
271
|
+
conv_res.status = ConversionStatus.FAILURE
|
272
|
+
return conv_res
|
273
|
+
|
274
|
+
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
296
275
|
|
297
|
-
|
276
|
+
else:
|
277
|
+
if raises_on_error:
|
278
|
+
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
279
|
+
|
280
|
+
else:
|
281
|
+
# invalid doc or not of desired format
|
282
|
+
conv_res = ConversionResult(input=in_doc)
|
283
|
+
conv_res.status = ConversionStatus.FAILURE
|
284
|
+
# TODO add error log why it failed.
|
285
|
+
|
286
|
+
return conv_res
|
@@ -0,0 +1,28 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Any, Iterable
|
3
|
+
|
4
|
+
from docling_core.types.doc import DoclingDocument, NodeItem
|
5
|
+
|
6
|
+
from docling.datamodel.base_models import Page
|
7
|
+
from docling.datamodel.document import ConversionResult
|
8
|
+
|
9
|
+
|
10
|
+
class BasePageModel(ABC):
|
11
|
+
@abstractmethod
|
12
|
+
def __call__(
|
13
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
14
|
+
) -> Iterable[Page]:
|
15
|
+
pass
|
16
|
+
|
17
|
+
|
18
|
+
class BaseEnrichmentModel(ABC):
|
19
|
+
|
20
|
+
@abstractmethod
|
21
|
+
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
22
|
+
pass
|
23
|
+
|
24
|
+
@abstractmethod
|
25
|
+
def __call__(
|
26
|
+
self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
|
27
|
+
) -> Iterable[Any]:
|
28
|
+
pass
|