docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -1,43 +1,58 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import Iterable
|
3
3
|
|
4
|
-
import
|
4
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
5
|
|
6
|
-
from docling.datamodel.base_models import
|
7
|
-
from docling.datamodel.
|
6
|
+
from docling.datamodel.base_models import OcrCell, Page
|
7
|
+
from docling.datamodel.document import ConversionResult
|
8
|
+
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
9
|
+
from docling.datamodel.settings import settings
|
8
10
|
from docling.models.base_ocr_model import BaseOcrModel
|
11
|
+
from docling.utils.profiling import TimeRecorder
|
9
12
|
|
10
13
|
_log = logging.getLogger(__name__)
|
11
14
|
|
12
15
|
|
13
16
|
class TesseractOcrModel(BaseOcrModel):
|
14
|
-
def __init__(self, enabled: bool, options:
|
17
|
+
def __init__(self, enabled: bool, options: TesseractOcrOptions):
|
15
18
|
super().__init__(enabled=enabled, options=options)
|
16
|
-
self.options:
|
19
|
+
self.options: TesseractOcrOptions
|
17
20
|
|
18
21
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
19
22
|
self.reader = None
|
20
23
|
|
21
24
|
if self.enabled:
|
22
|
-
|
25
|
+
install_errmsg = (
|
23
26
|
"tesserocr is not correctly installed. "
|
24
27
|
"Please install it via `pip install tesserocr` to use this OCR engine. "
|
25
|
-
"Note that tesserocr might have to be manually compiled for working with"
|
28
|
+
"Note that tesserocr might have to be manually compiled for working with "
|
26
29
|
"your Tesseract installation. The Docling documentation provides examples for it. "
|
27
|
-
"Alternatively, Docling has support for other OCR engines. See the documentation
|
30
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation: "
|
31
|
+
"https://ds4sd.github.io/docling/installation/"
|
28
32
|
)
|
33
|
+
missing_langs_errmsg = (
|
34
|
+
"tesserocr is not correctly configured. No language models have been detected. "
|
35
|
+
"Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
|
36
|
+
"You can find more information how to setup other OCR engines in Docling "
|
37
|
+
"documentation: "
|
38
|
+
"https://ds4sd.github.io/docling/installation/"
|
39
|
+
)
|
40
|
+
|
29
41
|
try:
|
30
42
|
import tesserocr
|
31
43
|
except ImportError:
|
32
|
-
raise ImportError(
|
33
|
-
|
44
|
+
raise ImportError(install_errmsg)
|
34
45
|
try:
|
35
46
|
tesseract_version = tesserocr.tesseract_version()
|
36
|
-
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
37
47
|
except:
|
38
|
-
raise ImportError(
|
48
|
+
raise ImportError(install_errmsg)
|
49
|
+
|
50
|
+
_, tesserocr_languages = tesserocr.get_languages()
|
51
|
+
if not tesserocr_languages:
|
52
|
+
raise ImportError(missing_langs_errmsg)
|
39
53
|
|
40
54
|
# Initialize the tesseractAPI
|
55
|
+
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
41
56
|
lang = "+".join(self.options.lang)
|
42
57
|
if self.options.path is not None:
|
43
58
|
self.reader = tesserocr.PyTessBaseAPI(
|
@@ -61,62 +76,79 @@ class TesseractOcrModel(BaseOcrModel):
|
|
61
76
|
# Finalize the tesseractAPI
|
62
77
|
self.reader.End()
|
63
78
|
|
64
|
-
def __call__(
|
79
|
+
def __call__(
|
80
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
81
|
+
) -> Iterable[Page]:
|
65
82
|
|
66
83
|
if not self.enabled:
|
67
84
|
yield from page_batch
|
68
85
|
return
|
69
86
|
|
70
87
|
for page in page_batch:
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
if ocr_rect.area() == 0:
|
77
|
-
continue
|
78
|
-
high_res_image = page._backend.get_page_image(
|
79
|
-
scale=self.scale, cropbox=ocr_rect
|
80
|
-
)
|
88
|
+
assert page._backend is not None
|
89
|
+
if not page._backend.is_valid():
|
90
|
+
yield page
|
91
|
+
else:
|
92
|
+
with TimeRecorder(conv_res, "ocr"):
|
81
93
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
confidence = self.reader.MeanTextConf()
|
94
|
-
left = box["x"] / self.scale
|
95
|
-
bottom = box["y"] / self.scale
|
96
|
-
right = (box["x"] + box["w"]) / self.scale
|
97
|
-
top = (box["y"] + box["h"]) / self.scale
|
98
|
-
|
99
|
-
cells.append(
|
100
|
-
OcrCell(
|
101
|
-
id=ix,
|
102
|
-
text=text,
|
103
|
-
confidence=confidence,
|
104
|
-
bbox=BoundingBox.from_tuple(
|
105
|
-
coord=(left, top, right, bottom),
|
106
|
-
origin=CoordOrigin.TOPLEFT,
|
107
|
-
),
|
94
|
+
assert self.reader is not None
|
95
|
+
|
96
|
+
ocr_rects = self.get_ocr_rects(page)
|
97
|
+
|
98
|
+
all_ocr_cells = []
|
99
|
+
for ocr_rect in ocr_rects:
|
100
|
+
# Skip zero area boxes
|
101
|
+
if ocr_rect.area() == 0:
|
102
|
+
continue
|
103
|
+
high_res_image = page._backend.get_page_image(
|
104
|
+
scale=self.scale, cropbox=ocr_rect
|
108
105
|
)
|
109
|
-
)
|
110
106
|
|
111
|
-
|
112
|
-
|
107
|
+
# Retrieve text snippets with their bounding boxes
|
108
|
+
self.reader.SetImage(high_res_image)
|
109
|
+
boxes = self.reader.GetComponentImages(
|
110
|
+
self.reader_RIL.TEXTLINE, True
|
111
|
+
)
|
113
112
|
|
114
|
-
|
115
|
-
|
113
|
+
cells = []
|
114
|
+
for ix, (im, box, _, _) in enumerate(boxes):
|
115
|
+
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
116
|
+
self.reader.SetRectangle(
|
117
|
+
box["x"], box["y"], box["w"], box["h"]
|
118
|
+
)
|
119
|
+
|
120
|
+
# Extract text within the bounding box
|
121
|
+
text = self.reader.GetUTF8Text().strip()
|
122
|
+
confidence = self.reader.MeanTextConf()
|
123
|
+
left = box["x"] / self.scale
|
124
|
+
bottom = box["y"] / self.scale
|
125
|
+
right = (box["x"] + box["w"]) / self.scale
|
126
|
+
top = (box["y"] + box["h"]) / self.scale
|
127
|
+
|
128
|
+
cells.append(
|
129
|
+
OcrCell(
|
130
|
+
id=ix,
|
131
|
+
text=text,
|
132
|
+
confidence=confidence,
|
133
|
+
bbox=BoundingBox.from_tuple(
|
134
|
+
coord=(left, top, right, bottom),
|
135
|
+
origin=CoordOrigin.TOPLEFT,
|
136
|
+
),
|
137
|
+
)
|
138
|
+
)
|
139
|
+
|
140
|
+
# del high_res_image
|
141
|
+
all_ocr_cells.extend(cells)
|
142
|
+
|
143
|
+
## Remove OCR cells which overlap with programmatic cells.
|
144
|
+
filtered_ocr_cells = self.filter_ocr_cells(
|
145
|
+
all_ocr_cells, page.cells
|
146
|
+
)
|
116
147
|
|
117
|
-
|
148
|
+
page.cells.extend(filtered_ocr_cells)
|
118
149
|
|
119
|
-
|
120
|
-
|
150
|
+
# DEBUG code:
|
151
|
+
if settings.debug.visualize_ocr:
|
152
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
121
153
|
|
122
|
-
|
154
|
+
yield page
|
@@ -0,0 +1,189 @@
|
|
1
|
+
import functools
|
2
|
+
import logging
|
3
|
+
import time
|
4
|
+
import traceback
|
5
|
+
from abc import ABC, abstractmethod
|
6
|
+
from typing import Callable, Iterable, List
|
7
|
+
|
8
|
+
from docling_core.types.doc import DoclingDocument, NodeItem
|
9
|
+
|
10
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
11
|
+
from docling.backend.pdf_backend import PdfDocumentBackend
|
12
|
+
from docling.datamodel.base_models import (
|
13
|
+
ConversionStatus,
|
14
|
+
DoclingComponentType,
|
15
|
+
ErrorItem,
|
16
|
+
Page,
|
17
|
+
)
|
18
|
+
from docling.datamodel.document import ConversionResult, InputDocument
|
19
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
20
|
+
from docling.datamodel.settings import settings
|
21
|
+
from docling.models.base_model import BaseEnrichmentModel
|
22
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
23
|
+
from docling.utils.utils import chunkify
|
24
|
+
|
25
|
+
_log = logging.getLogger(__name__)
|
26
|
+
|
27
|
+
|
28
|
+
class BasePipeline(ABC):
|
29
|
+
def __init__(self, pipeline_options: PipelineOptions):
|
30
|
+
self.pipeline_options = pipeline_options
|
31
|
+
self.build_pipe: List[Callable] = []
|
32
|
+
self.enrichment_pipe: List[BaseEnrichmentModel] = []
|
33
|
+
|
34
|
+
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
|
35
|
+
conv_res = ConversionResult(input=in_doc)
|
36
|
+
|
37
|
+
_log.info(f"Processing document {in_doc.file.name}")
|
38
|
+
try:
|
39
|
+
with TimeRecorder(
|
40
|
+
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
|
41
|
+
):
|
42
|
+
# These steps are building and assembling the structure of the
|
43
|
+
# output DoclingDocument
|
44
|
+
conv_res = self._build_document(conv_res)
|
45
|
+
conv_res = self._assemble_document(conv_res)
|
46
|
+
# From this stage, all operations should rely only on conv_res.output
|
47
|
+
conv_res = self._enrich_document(conv_res)
|
48
|
+
conv_res.status = self._determine_status(conv_res)
|
49
|
+
except Exception as e:
|
50
|
+
conv_res.status = ConversionStatus.FAILURE
|
51
|
+
if raises_on_error:
|
52
|
+
raise e
|
53
|
+
|
54
|
+
return conv_res
|
55
|
+
|
56
|
+
@abstractmethod
|
57
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
58
|
+
pass
|
59
|
+
|
60
|
+
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
61
|
+
return conv_res
|
62
|
+
|
63
|
+
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
64
|
+
|
65
|
+
def _filter_elements(
|
66
|
+
doc: DoclingDocument, model: BaseEnrichmentModel
|
67
|
+
) -> Iterable[NodeItem]:
|
68
|
+
for element, _level in doc.iterate_items():
|
69
|
+
if model.is_processable(doc=doc, element=element):
|
70
|
+
yield element
|
71
|
+
|
72
|
+
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
|
73
|
+
for model in self.enrichment_pipe:
|
74
|
+
for element_batch in chunkify(
|
75
|
+
_filter_elements(conv_res.document, model),
|
76
|
+
settings.perf.elements_batch_size,
|
77
|
+
):
|
78
|
+
# TODO: currently we assume the element itself is modified, because
|
79
|
+
# we don't have an interface to save the element back to the document
|
80
|
+
for element in model(
|
81
|
+
doc=conv_res.document, element_batch=element_batch
|
82
|
+
): # Must exhaust!
|
83
|
+
pass
|
84
|
+
|
85
|
+
return conv_res
|
86
|
+
|
87
|
+
@abstractmethod
|
88
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
89
|
+
pass
|
90
|
+
|
91
|
+
@classmethod
|
92
|
+
@abstractmethod
|
93
|
+
def get_default_options(cls) -> PipelineOptions:
|
94
|
+
pass
|
95
|
+
|
96
|
+
@classmethod
|
97
|
+
@abstractmethod
|
98
|
+
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
99
|
+
pass
|
100
|
+
|
101
|
+
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
|
102
|
+
# for model in self.build_pipe:
|
103
|
+
# element_batch = model(element_batch)
|
104
|
+
#
|
105
|
+
# yield from element_batch
|
106
|
+
|
107
|
+
|
108
|
+
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
109
|
+
|
110
|
+
def _apply_on_pages(
|
111
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
112
|
+
) -> Iterable[Page]:
|
113
|
+
for model in self.build_pipe:
|
114
|
+
page_batch = model(conv_res, page_batch)
|
115
|
+
|
116
|
+
yield from page_batch
|
117
|
+
|
118
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
119
|
+
|
120
|
+
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
121
|
+
raise RuntimeError(
|
122
|
+
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
123
|
+
f"Can not convert this with a PDF pipeline. "
|
124
|
+
f"Please check your format configuration on DocumentConverter."
|
125
|
+
)
|
126
|
+
# conv_res.status = ConversionStatus.FAILURE
|
127
|
+
# return conv_res
|
128
|
+
|
129
|
+
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
130
|
+
|
131
|
+
for i in range(0, conv_res.input.page_count):
|
132
|
+
conv_res.pages.append(Page(page_no=i))
|
133
|
+
|
134
|
+
try:
|
135
|
+
# Iterate batches of pages (page_batch_size) in the doc
|
136
|
+
for page_batch in chunkify(
|
137
|
+
conv_res.pages, settings.perf.page_batch_size
|
138
|
+
):
|
139
|
+
start_pb_time = time.time()
|
140
|
+
|
141
|
+
# 1. Initialise the page resources
|
142
|
+
init_pages = map(
|
143
|
+
functools.partial(self.initialize_page, conv_res), page_batch
|
144
|
+
)
|
145
|
+
|
146
|
+
# 2. Run pipeline stages
|
147
|
+
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
148
|
+
|
149
|
+
for p in pipeline_pages: # Must exhaust!
|
150
|
+
pass
|
151
|
+
|
152
|
+
end_pb_time = time.time() - start_pb_time
|
153
|
+
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
154
|
+
|
155
|
+
except Exception as e:
|
156
|
+
conv_res.status = ConversionStatus.FAILURE
|
157
|
+
trace = "\n".join(traceback.format_exception(e))
|
158
|
+
_log.warning(
|
159
|
+
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
160
|
+
f"{trace}"
|
161
|
+
)
|
162
|
+
raise e
|
163
|
+
|
164
|
+
finally:
|
165
|
+
# Always unload the PDF backend, even in case of failure
|
166
|
+
if conv_res.input._backend:
|
167
|
+
conv_res.input._backend.unload()
|
168
|
+
|
169
|
+
return conv_res
|
170
|
+
|
171
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
172
|
+
status = ConversionStatus.SUCCESS
|
173
|
+
for page in conv_res.pages:
|
174
|
+
if page._backend is None or not page._backend.is_valid():
|
175
|
+
conv_res.errors.append(
|
176
|
+
ErrorItem(
|
177
|
+
component_type=DoclingComponentType.DOCUMENT_BACKEND,
|
178
|
+
module_name=type(page._backend).__name__,
|
179
|
+
error_message=f"Page {page.page_no} failed to parse.",
|
180
|
+
)
|
181
|
+
)
|
182
|
+
status = ConversionStatus.PARTIAL_SUCCESS
|
183
|
+
|
184
|
+
return status
|
185
|
+
|
186
|
+
# Initialise and load resources for a page
|
187
|
+
@abstractmethod
|
188
|
+
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
189
|
+
pass
|
@@ -0,0 +1,56 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from docling.backend.abstract_backend import (
|
4
|
+
AbstractDocumentBackend,
|
5
|
+
DeclarativeDocumentBackend,
|
6
|
+
)
|
7
|
+
from docling.datamodel.base_models import ConversionStatus
|
8
|
+
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
10
|
+
from docling.pipeline.base_pipeline import BasePipeline
|
11
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
12
|
+
|
13
|
+
_log = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
class SimplePipeline(BasePipeline):
|
17
|
+
"""SimpleModelPipeline.
|
18
|
+
|
19
|
+
This class is used at the moment for formats / backends
|
20
|
+
which produce straight DoclingDocument output.
|
21
|
+
"""
|
22
|
+
|
23
|
+
def __init__(self, pipeline_options: PipelineOptions):
|
24
|
+
super().__init__(pipeline_options)
|
25
|
+
|
26
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
27
|
+
|
28
|
+
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
29
|
+
raise RuntimeError(
|
30
|
+
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|
31
|
+
f"Can not convert this with simple pipeline. "
|
32
|
+
f"Please check your format configuration on DocumentConverter."
|
33
|
+
)
|
34
|
+
# conv_res.status = ConversionStatus.FAILURE
|
35
|
+
# return conv_res
|
36
|
+
|
37
|
+
# Instead of running a page-level pipeline to build up the document structure,
|
38
|
+
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
39
|
+
# a DoclingDocument straight.
|
40
|
+
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
41
|
+
conv_res.document = conv_res.input._backend.convert()
|
42
|
+
return conv_res
|
43
|
+
|
44
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
45
|
+
# This is called only if the previous steps didn't raise.
|
46
|
+
# Since we don't have anything else to evaluate, we can
|
47
|
+
# safely return SUCCESS.
|
48
|
+
return ConversionStatus.SUCCESS
|
49
|
+
|
50
|
+
@classmethod
|
51
|
+
def get_default_options(cls) -> PipelineOptions:
|
52
|
+
return PipelineOptions()
|
53
|
+
|
54
|
+
@classmethod
|
55
|
+
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
56
|
+
return isinstance(backend, DeclarativeDocumentBackend)
|
@@ -0,0 +1,201 @@
|
|
1
|
+
import logging
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
6
|
+
|
7
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
8
|
+
from docling.backend.pdf_backend import PdfDocumentBackend
|
9
|
+
from docling.datamodel.base_models import AssembledUnit, Page
|
10
|
+
from docling.datamodel.document import ConversionResult
|
11
|
+
from docling.datamodel.pipeline_options import (
|
12
|
+
EasyOcrOptions,
|
13
|
+
PdfPipelineOptions,
|
14
|
+
TesseractCliOcrOptions,
|
15
|
+
TesseractOcrOptions,
|
16
|
+
)
|
17
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
18
|
+
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
19
|
+
from docling.models.easyocr_model import EasyOcrModel
|
20
|
+
from docling.models.layout_model import LayoutModel
|
21
|
+
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
22
|
+
from docling.models.page_preprocessing_model import (
|
23
|
+
PagePreprocessingModel,
|
24
|
+
PagePreprocessingOptions,
|
25
|
+
)
|
26
|
+
from docling.models.table_structure_model import TableStructureModel
|
27
|
+
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
28
|
+
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
29
|
+
from docling.pipeline.base_pipeline import PaginatedPipeline
|
30
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
31
|
+
|
32
|
+
_log = logging.getLogger(__name__)
|
33
|
+
|
34
|
+
|
35
|
+
class StandardPdfPipeline(PaginatedPipeline):
|
36
|
+
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
37
|
+
_table_model_path = "model_artifacts/tableformer"
|
38
|
+
|
39
|
+
def __init__(self, pipeline_options: PdfPipelineOptions):
|
40
|
+
super().__init__(pipeline_options)
|
41
|
+
self.pipeline_options: PdfPipelineOptions
|
42
|
+
|
43
|
+
if pipeline_options.artifacts_path is None:
|
44
|
+
self.artifacts_path = self.download_models_hf()
|
45
|
+
else:
|
46
|
+
self.artifacts_path = Path(pipeline_options.artifacts_path)
|
47
|
+
|
48
|
+
keep_images = (
|
49
|
+
self.pipeline_options.generate_page_images
|
50
|
+
or self.pipeline_options.generate_picture_images
|
51
|
+
or self.pipeline_options.generate_table_images
|
52
|
+
)
|
53
|
+
|
54
|
+
self.glm_model = GlmModel(options=GlmOptions())
|
55
|
+
|
56
|
+
if (ocr_model := self.get_ocr_model()) is None:
|
57
|
+
raise RuntimeError(
|
58
|
+
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
59
|
+
)
|
60
|
+
|
61
|
+
self.build_pipe = [
|
62
|
+
# Pre-processing
|
63
|
+
PagePreprocessingModel(
|
64
|
+
options=PagePreprocessingOptions(
|
65
|
+
images_scale=pipeline_options.images_scale
|
66
|
+
)
|
67
|
+
),
|
68
|
+
# OCR
|
69
|
+
ocr_model,
|
70
|
+
# Layout model
|
71
|
+
LayoutModel(
|
72
|
+
artifacts_path=self.artifacts_path
|
73
|
+
/ StandardPdfPipeline._layout_model_path
|
74
|
+
),
|
75
|
+
# Table structure model
|
76
|
+
TableStructureModel(
|
77
|
+
enabled=pipeline_options.do_table_structure,
|
78
|
+
artifacts_path=self.artifacts_path
|
79
|
+
/ StandardPdfPipeline._table_model_path,
|
80
|
+
options=pipeline_options.table_structure_options,
|
81
|
+
),
|
82
|
+
# Page assemble
|
83
|
+
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
84
|
+
]
|
85
|
+
|
86
|
+
self.enrichment_pipe = [
|
87
|
+
# Other models working on `NodeItem` elements in the DoclingDocument
|
88
|
+
]
|
89
|
+
|
90
|
+
@staticmethod
|
91
|
+
def download_models_hf(
|
92
|
+
local_dir: Optional[Path] = None, force: bool = False
|
93
|
+
) -> Path:
|
94
|
+
from huggingface_hub import snapshot_download
|
95
|
+
|
96
|
+
download_path = snapshot_download(
|
97
|
+
repo_id="ds4sd/docling-models",
|
98
|
+
force_download=force,
|
99
|
+
local_dir=local_dir,
|
100
|
+
revision="v2.0.1",
|
101
|
+
)
|
102
|
+
|
103
|
+
return Path(download_path)
|
104
|
+
|
105
|
+
def get_ocr_model(self) -> Optional[BaseOcrModel]:
|
106
|
+
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
|
107
|
+
return EasyOcrModel(
|
108
|
+
enabled=self.pipeline_options.do_ocr,
|
109
|
+
options=self.pipeline_options.ocr_options,
|
110
|
+
)
|
111
|
+
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
112
|
+
return TesseractOcrCliModel(
|
113
|
+
enabled=self.pipeline_options.do_ocr,
|
114
|
+
options=self.pipeline_options.ocr_options,
|
115
|
+
)
|
116
|
+
elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
|
117
|
+
return TesseractOcrModel(
|
118
|
+
enabled=self.pipeline_options.do_ocr,
|
119
|
+
options=self.pipeline_options.ocr_options,
|
120
|
+
)
|
121
|
+
return None
|
122
|
+
|
123
|
+
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
124
|
+
with TimeRecorder(conv_res, "page_init"):
|
125
|
+
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
|
126
|
+
if page._backend is not None and page._backend.is_valid():
|
127
|
+
page.size = page._backend.get_size()
|
128
|
+
|
129
|
+
return page
|
130
|
+
|
131
|
+
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
132
|
+
all_elements = []
|
133
|
+
all_headers = []
|
134
|
+
all_body = []
|
135
|
+
|
136
|
+
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
137
|
+
for p in conv_res.pages:
|
138
|
+
if p.assembled is not None:
|
139
|
+
for el in p.assembled.body:
|
140
|
+
all_body.append(el)
|
141
|
+
for el in p.assembled.headers:
|
142
|
+
all_headers.append(el)
|
143
|
+
for el in p.assembled.elements:
|
144
|
+
all_elements.append(el)
|
145
|
+
|
146
|
+
conv_res.assembled = AssembledUnit(
|
147
|
+
elements=all_elements, headers=all_headers, body=all_body
|
148
|
+
)
|
149
|
+
|
150
|
+
conv_res.document = self.glm_model(conv_res)
|
151
|
+
|
152
|
+
# Generate page images in the output
|
153
|
+
if self.pipeline_options.generate_page_images:
|
154
|
+
for page in conv_res.pages:
|
155
|
+
assert page.image is not None
|
156
|
+
page_no = page.page_no + 1
|
157
|
+
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
158
|
+
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
159
|
+
)
|
160
|
+
|
161
|
+
# Generate images of the requested element types
|
162
|
+
if (
|
163
|
+
self.pipeline_options.generate_picture_images
|
164
|
+
or self.pipeline_options.generate_table_images
|
165
|
+
):
|
166
|
+
scale = self.pipeline_options.images_scale
|
167
|
+
for element, _level in conv_res.document.iterate_items():
|
168
|
+
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
169
|
+
continue
|
170
|
+
if (
|
171
|
+
isinstance(element, PictureItem)
|
172
|
+
and self.pipeline_options.generate_picture_images
|
173
|
+
) or (
|
174
|
+
isinstance(element, TableItem)
|
175
|
+
and self.pipeline_options.generate_table_images
|
176
|
+
):
|
177
|
+
page_ix = element.prov[0].page_no - 1
|
178
|
+
page = conv_res.pages[page_ix]
|
179
|
+
assert page.size is not None
|
180
|
+
assert page.image is not None
|
181
|
+
|
182
|
+
crop_bbox = (
|
183
|
+
element.prov[0]
|
184
|
+
.bbox.scaled(scale=scale)
|
185
|
+
.to_top_left_origin(page_height=page.size.height * scale)
|
186
|
+
)
|
187
|
+
|
188
|
+
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
189
|
+
element.image = ImageRef.from_pil(
|
190
|
+
cropped_im, dpi=int(72 * scale)
|
191
|
+
)
|
192
|
+
|
193
|
+
return conv_res
|
194
|
+
|
195
|
+
@classmethod
|
196
|
+
def get_default_options(cls) -> PdfPipelineOptions:
|
197
|
+
return PdfPipelineOptions()
|
198
|
+
|
199
|
+
@classmethod
|
200
|
+
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
201
|
+
return isinstance(backend, PdfDocumentBackend)
|
docling/utils/export.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import Any, Dict, Iterable, List, Tuple, Union
|
3
3
|
|
4
|
-
from docling_core.types.doc
|
4
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
|
+
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
5
6
|
|
6
|
-
from docling.datamodel.base_models import
|
7
|
+
from docling.datamodel.base_models import OcrCell
|
7
8
|
from docling.datamodel.document import ConversionResult, Page
|
8
9
|
|
9
10
|
_log = logging.getLogger(__name__)
|
@@ -40,7 +41,7 @@ def generate_multimodal_pages(
|
|
40
41
|
end_ix = 0
|
41
42
|
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
|
42
43
|
|
43
|
-
doc = doc_result.
|
44
|
+
doc = doc_result.legacy_document
|
44
45
|
|
45
46
|
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
|
46
47
|
segments = []
|