docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +0 -4
- docling/backend/html_backend.py +25 -25
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/document.py +3 -1
- docling/datamodel/settings.py +15 -1
- docling/document_converter.py +12 -8
- docling/models/base_model.py +4 -1
- docling/models/base_ocr_model.py +21 -4
- docling/models/ds_glm_model.py +27 -11
- docling/models/easyocr_model.py +49 -39
- docling/models/layout_model.py +87 -61
- docling/models/page_assemble_model.py +102 -100
- docling/models/page_preprocessing_model.py +25 -7
- docling/models/table_structure_model.py +125 -90
- docling/models/tesseract_ocr_cli_model.py +62 -52
- docling/models/tesseract_ocr_model.py +57 -45
- docling/pipeline/base_pipeline.py +68 -69
- docling/pipeline/simple_pipeline.py +8 -11
- docling/pipeline/standard_pdf_pipeline.py +59 -56
- docling/utils/profiling.py +62 -0
- {docling-2.2.1.dist-info → docling-2.3.0.dist-info}/METADATA +5 -4
- docling-2.3.0.dist-info/RECORD +45 -0
- docling-2.2.1.dist-info/RECORD +0 -44
- {docling-2.2.1.dist-info → docling-2.3.0.dist-info}/LICENSE +0 -0
- {docling-2.2.1.dist-info → docling-2.3.0.dist-info}/WHEEL +0 -0
- {docling-2.2.1.dist-info → docling-2.3.0.dist-info}/entry_points.txt +0 -0
@@ -8,8 +8,11 @@ import pandas as pd
|
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
9
|
|
10
10
|
from docling.datamodel.base_models import OcrCell, Page
|
11
|
+
from docling.datamodel.document import ConversionResult
|
11
12
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
13
|
+
from docling.datamodel.settings import settings
|
12
14
|
from docling.models.base_ocr_model import BaseOcrModel
|
15
|
+
from docling.utils.profiling import TimeRecorder
|
13
16
|
|
14
17
|
_log = logging.getLogger(__name__)
|
15
18
|
|
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
102
105
|
|
103
106
|
return df_filtered
|
104
107
|
|
105
|
-
def __call__(
|
108
|
+
def __call__(
|
109
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
110
|
+
) -> Iterable[Page]:
|
106
111
|
|
107
112
|
if not self.enabled:
|
108
113
|
yield from page_batch
|
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
113
118
|
if not page._backend.is_valid():
|
114
119
|
yield page
|
115
120
|
else:
|
116
|
-
|
117
|
-
|
118
|
-
all_ocr_cells = []
|
119
|
-
for ocr_rect in ocr_rects:
|
120
|
-
# Skip zero area boxes
|
121
|
-
if ocr_rect.area() == 0:
|
122
|
-
continue
|
123
|
-
high_res_image = page._backend.get_page_image(
|
124
|
-
scale=self.scale, cropbox=ocr_rect
|
125
|
-
)
|
121
|
+
with TimeRecorder(conv_res, "ocr"):
|
126
122
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
# Print relevant columns (bounding box and text)
|
138
|
-
for ix, row in df.iterrows():
|
139
|
-
text = row["text"]
|
140
|
-
conf = row["conf"]
|
141
|
-
|
142
|
-
l = float(row["left"])
|
143
|
-
b = float(row["top"])
|
144
|
-
w = float(row["width"])
|
145
|
-
h = float(row["height"])
|
146
|
-
|
147
|
-
t = b + h
|
148
|
-
r = l + w
|
149
|
-
|
150
|
-
cell = OcrCell(
|
151
|
-
id=ix,
|
152
|
-
text=text,
|
153
|
-
confidence=conf / 100.0,
|
154
|
-
bbox=BoundingBox.from_tuple(
|
155
|
-
coord=(
|
156
|
-
(l / self.scale) + ocr_rect.l,
|
157
|
-
(b / self.scale) + ocr_rect.t,
|
158
|
-
(r / self.scale) + ocr_rect.l,
|
159
|
-
(t / self.scale) + ocr_rect.t,
|
160
|
-
),
|
161
|
-
origin=CoordOrigin.TOPLEFT,
|
162
|
-
),
|
123
|
+
ocr_rects = self.get_ocr_rects(page)
|
124
|
+
|
125
|
+
all_ocr_cells = []
|
126
|
+
for ocr_rect in ocr_rects:
|
127
|
+
# Skip zero area boxes
|
128
|
+
if ocr_rect.area() == 0:
|
129
|
+
continue
|
130
|
+
high_res_image = page._backend.get_page_image(
|
131
|
+
scale=self.scale, cropbox=ocr_rect
|
163
132
|
)
|
164
|
-
all_ocr_cells.append(cell)
|
165
133
|
|
166
|
-
|
167
|
-
|
134
|
+
with tempfile.NamedTemporaryFile(
|
135
|
+
suffix=".png", mode="w"
|
136
|
+
) as image_file:
|
137
|
+
fname = image_file.name
|
138
|
+
high_res_image.save(fname)
|
139
|
+
|
140
|
+
df = self._run_tesseract(fname)
|
141
|
+
|
142
|
+
# _log.info(df)
|
143
|
+
|
144
|
+
# Print relevant columns (bounding box and text)
|
145
|
+
for ix, row in df.iterrows():
|
146
|
+
text = row["text"]
|
147
|
+
conf = row["conf"]
|
148
|
+
|
149
|
+
l = float(row["left"])
|
150
|
+
b = float(row["top"])
|
151
|
+
w = float(row["width"])
|
152
|
+
h = float(row["height"])
|
153
|
+
|
154
|
+
t = b + h
|
155
|
+
r = l + w
|
156
|
+
|
157
|
+
cell = OcrCell(
|
158
|
+
id=ix,
|
159
|
+
text=text,
|
160
|
+
confidence=conf / 100.0,
|
161
|
+
bbox=BoundingBox.from_tuple(
|
162
|
+
coord=(
|
163
|
+
(l / self.scale) + ocr_rect.l,
|
164
|
+
(b / self.scale) + ocr_rect.t,
|
165
|
+
(r / self.scale) + ocr_rect.l,
|
166
|
+
(t / self.scale) + ocr_rect.t,
|
167
|
+
),
|
168
|
+
origin=CoordOrigin.TOPLEFT,
|
169
|
+
),
|
170
|
+
)
|
171
|
+
all_ocr_cells.append(cell)
|
172
|
+
|
173
|
+
## Remove OCR cells which overlap with programmatic cells.
|
174
|
+
filtered_ocr_cells = self.filter_ocr_cells(
|
175
|
+
all_ocr_cells, page.cells
|
176
|
+
)
|
168
177
|
|
169
|
-
|
178
|
+
page.cells.extend(filtered_ocr_cells)
|
170
179
|
|
171
180
|
# DEBUG code:
|
172
|
-
|
181
|
+
if settings.debug.visualize_ocr:
|
182
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
173
183
|
|
174
184
|
yield page
|
@@ -4,8 +4,11 @@ from typing import Iterable
|
|
4
4
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
5
|
|
6
6
|
from docling.datamodel.base_models import OcrCell, Page
|
7
|
+
from docling.datamodel.document import ConversionResult
|
7
8
|
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
9
|
+
from docling.datamodel.settings import settings
|
8
10
|
from docling.models.base_ocr_model import BaseOcrModel
|
11
|
+
from docling.utils.profiling import TimeRecorder
|
9
12
|
|
10
13
|
_log = logging.getLogger(__name__)
|
11
14
|
|
@@ -61,7 +64,9 @@ class TesseractOcrModel(BaseOcrModel):
|
|
61
64
|
# Finalize the tesseractAPI
|
62
65
|
self.reader.End()
|
63
66
|
|
64
|
-
def __call__(
|
67
|
+
def __call__(
|
68
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
69
|
+
) -> Iterable[Page]:
|
65
70
|
|
66
71
|
if not self.enabled:
|
67
72
|
yield from page_batch
|
@@ -72,59 +77,66 @@ class TesseractOcrModel(BaseOcrModel):
|
|
72
77
|
if not page._backend.is_valid():
|
73
78
|
yield page
|
74
79
|
else:
|
75
|
-
|
80
|
+
with TimeRecorder(conv_res, "ocr"):
|
76
81
|
|
77
|
-
|
82
|
+
assert self.reader is not None
|
78
83
|
|
79
|
-
|
80
|
-
for ocr_rect in ocr_rects:
|
81
|
-
# Skip zero area boxes
|
82
|
-
if ocr_rect.area() == 0:
|
83
|
-
continue
|
84
|
-
high_res_image = page._backend.get_page_image(
|
85
|
-
scale=self.scale, cropbox=ocr_rect
|
86
|
-
)
|
84
|
+
ocr_rects = self.get_ocr_rects(page)
|
87
85
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
86
|
+
all_ocr_cells = []
|
87
|
+
for ocr_rect in ocr_rects:
|
88
|
+
# Skip zero area boxes
|
89
|
+
if ocr_rect.area() == 0:
|
90
|
+
continue
|
91
|
+
high_res_image = page._backend.get_page_image(
|
92
|
+
scale=self.scale, cropbox=ocr_rect
|
93
|
+
)
|
93
94
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
# Extract text within the bounding box
|
100
|
-
text = self.reader.GetUTF8Text().strip()
|
101
|
-
confidence = self.reader.MeanTextConf()
|
102
|
-
left = box["x"] / self.scale
|
103
|
-
bottom = box["y"] / self.scale
|
104
|
-
right = (box["x"] + box["w"]) / self.scale
|
105
|
-
top = (box["y"] + box["h"]) / self.scale
|
106
|
-
|
107
|
-
cells.append(
|
108
|
-
OcrCell(
|
109
|
-
id=ix,
|
110
|
-
text=text,
|
111
|
-
confidence=confidence,
|
112
|
-
bbox=BoundingBox.from_tuple(
|
113
|
-
coord=(left, top, right, bottom),
|
114
|
-
origin=CoordOrigin.TOPLEFT,
|
115
|
-
),
|
116
|
-
)
|
95
|
+
# Retrieve text snippets with their bounding boxes
|
96
|
+
self.reader.SetImage(high_res_image)
|
97
|
+
boxes = self.reader.GetComponentImages(
|
98
|
+
self.reader_RIL.TEXTLINE, True
|
117
99
|
)
|
118
100
|
|
119
|
-
|
120
|
-
|
101
|
+
cells = []
|
102
|
+
for ix, (im, box, _, _) in enumerate(boxes):
|
103
|
+
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
104
|
+
self.reader.SetRectangle(
|
105
|
+
box["x"], box["y"], box["w"], box["h"]
|
106
|
+
)
|
121
107
|
|
122
|
-
|
123
|
-
|
108
|
+
# Extract text within the bounding box
|
109
|
+
text = self.reader.GetUTF8Text().strip()
|
110
|
+
confidence = self.reader.MeanTextConf()
|
111
|
+
left = box["x"] / self.scale
|
112
|
+
bottom = box["y"] / self.scale
|
113
|
+
right = (box["x"] + box["w"]) / self.scale
|
114
|
+
top = (box["y"] + box["h"]) / self.scale
|
115
|
+
|
116
|
+
cells.append(
|
117
|
+
OcrCell(
|
118
|
+
id=ix,
|
119
|
+
text=text,
|
120
|
+
confidence=confidence,
|
121
|
+
bbox=BoundingBox.from_tuple(
|
122
|
+
coord=(left, top, right, bottom),
|
123
|
+
origin=CoordOrigin.TOPLEFT,
|
124
|
+
),
|
125
|
+
)
|
126
|
+
)
|
127
|
+
|
128
|
+
# del high_res_image
|
129
|
+
all_ocr_cells.extend(cells)
|
130
|
+
|
131
|
+
## Remove OCR cells which overlap with programmatic cells.
|
132
|
+
filtered_ocr_cells = self.filter_ocr_cells(
|
133
|
+
all_ocr_cells, page.cells
|
134
|
+
)
|
124
135
|
|
125
|
-
|
136
|
+
page.cells.extend(filtered_ocr_cells)
|
126
137
|
|
127
138
|
# DEBUG code:
|
128
|
-
|
139
|
+
if settings.debug.visualize_ocr:
|
140
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
129
141
|
|
130
142
|
yield page
|
@@ -19,6 +19,7 @@ from docling.datamodel.document import ConversionResult, InputDocument
|
|
19
19
|
from docling.datamodel.pipeline_options import PipelineOptions
|
20
20
|
from docling.datamodel.settings import settings
|
21
21
|
from docling.models.base_model import BaseEnrichmentModel
|
22
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
22
23
|
from docling.utils.utils import chunkify
|
23
24
|
|
24
25
|
_log = logging.getLogger(__name__)
|
@@ -35,13 +36,16 @@ class BasePipeline(ABC):
|
|
35
36
|
|
36
37
|
_log.info(f"Processing document {in_doc.file.name}")
|
37
38
|
try:
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
39
|
+
with TimeRecorder(
|
40
|
+
conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
|
41
|
+
):
|
42
|
+
# These steps are building and assembling the structure of the
|
43
|
+
# output DoclingDocument
|
44
|
+
conv_res = self._build_document(conv_res)
|
45
|
+
conv_res = self._assemble_document(conv_res)
|
46
|
+
# From this stage, all operations should rely only on conv_res.output
|
47
|
+
conv_res = self._enrich_document(conv_res)
|
48
|
+
conv_res.status = self._determine_status(conv_res)
|
45
49
|
except Exception as e:
|
46
50
|
conv_res.status = ConversionStatus.FAILURE
|
47
51
|
if raises_on_error:
|
@@ -50,19 +54,13 @@ class BasePipeline(ABC):
|
|
50
54
|
return conv_res
|
51
55
|
|
52
56
|
@abstractmethod
|
53
|
-
def _build_document(
|
54
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
55
|
-
) -> ConversionResult:
|
57
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
56
58
|
pass
|
57
59
|
|
58
|
-
def _assemble_document(
|
59
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
60
|
-
) -> ConversionResult:
|
60
|
+
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
61
61
|
return conv_res
|
62
62
|
|
63
|
-
def _enrich_document(
|
64
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
65
|
-
) -> ConversionResult:
|
63
|
+
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
66
64
|
|
67
65
|
def _filter_elements(
|
68
66
|
doc: DoclingDocument, model: BaseEnrichmentModel
|
@@ -71,24 +69,23 @@ class BasePipeline(ABC):
|
|
71
69
|
if model.is_processable(doc=doc, element=element):
|
72
70
|
yield element
|
73
71
|
|
74
|
-
|
75
|
-
for
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
72
|
+
with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
|
73
|
+
for model in self.enrichment_pipe:
|
74
|
+
for element_batch in chunkify(
|
75
|
+
_filter_elements(conv_res.document, model),
|
76
|
+
settings.perf.elements_batch_size,
|
77
|
+
):
|
78
|
+
# TODO: currently we assume the element itself is modified, because
|
79
|
+
# we don't have an interface to save the element back to the document
|
80
|
+
for element in model(
|
81
|
+
doc=conv_res.document, element_batch=element_batch
|
82
|
+
): # Must exhaust!
|
83
|
+
pass
|
85
84
|
|
86
85
|
return conv_res
|
87
86
|
|
88
87
|
@abstractmethod
|
89
|
-
def _determine_status(
|
90
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
91
|
-
) -> ConversionStatus:
|
88
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
92
89
|
pass
|
93
90
|
|
94
91
|
@classmethod
|
@@ -110,66 +107,68 @@ class BasePipeline(ABC):
|
|
110
107
|
|
111
108
|
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
112
109
|
|
113
|
-
def _apply_on_pages(
|
110
|
+
def _apply_on_pages(
|
111
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
112
|
+
) -> Iterable[Page]:
|
114
113
|
for model in self.build_pipe:
|
115
|
-
page_batch = model(page_batch)
|
114
|
+
page_batch = model(conv_res, page_batch)
|
116
115
|
|
117
116
|
yield from page_batch
|
118
117
|
|
119
|
-
def _build_document(
|
120
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
121
|
-
) -> ConversionResult:
|
118
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
122
119
|
|
123
|
-
if not isinstance(
|
120
|
+
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
124
121
|
raise RuntimeError(
|
125
|
-
f"The selected backend {type(
|
122
|
+
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
126
123
|
f"Can not convert this with a PDF pipeline. "
|
127
124
|
f"Please check your format configuration on DocumentConverter."
|
128
125
|
)
|
129
126
|
# conv_res.status = ConversionStatus.FAILURE
|
130
127
|
# return conv_res
|
131
128
|
|
132
|
-
|
133
|
-
conv_res.pages.append(Page(page_no=i))
|
129
|
+
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
134
130
|
|
135
|
-
|
136
|
-
|
137
|
-
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
138
|
-
start_pb_time = time.time()
|
131
|
+
for i in range(0, conv_res.input.page_count):
|
132
|
+
conv_res.pages.append(Page(page_no=i))
|
139
133
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
134
|
+
try:
|
135
|
+
# Iterate batches of pages (page_batch_size) in the doc
|
136
|
+
for page_batch in chunkify(
|
137
|
+
conv_res.pages, settings.perf.page_batch_size
|
138
|
+
):
|
139
|
+
start_pb_time = time.time()
|
144
140
|
|
145
|
-
|
146
|
-
|
141
|
+
# 1. Initialise the page resources
|
142
|
+
init_pages = map(
|
143
|
+
functools.partial(self.initialize_page, conv_res), page_batch
|
144
|
+
)
|
147
145
|
|
148
|
-
|
149
|
-
|
146
|
+
# 2. Run pipeline stages
|
147
|
+
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
150
148
|
|
151
|
-
|
152
|
-
|
149
|
+
for p in pipeline_pages: # Must exhaust!
|
150
|
+
pass
|
153
151
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
152
|
+
end_pb_time = time.time() - start_pb_time
|
153
|
+
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
154
|
+
|
155
|
+
except Exception as e:
|
156
|
+
conv_res.status = ConversionStatus.FAILURE
|
157
|
+
trace = "\n".join(traceback.format_exception(e))
|
158
|
+
_log.warning(
|
159
|
+
f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
|
160
|
+
f"{trace}"
|
161
|
+
)
|
162
|
+
raise e
|
162
163
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
164
|
+
finally:
|
165
|
+
# Always unload the PDF backend, even in case of failure
|
166
|
+
if conv_res.input._backend:
|
167
|
+
conv_res.input._backend.unload()
|
167
168
|
|
168
169
|
return conv_res
|
169
170
|
|
170
|
-
def _determine_status(
|
171
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
172
|
-
) -> ConversionStatus:
|
171
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
173
172
|
status = ConversionStatus.SUCCESS
|
174
173
|
for page in conv_res.pages:
|
175
174
|
if page._backend is None or not page._backend.is_valid():
|
@@ -186,5 +185,5 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
186
185
|
|
187
186
|
# Initialise and load resources for a page
|
188
187
|
@abstractmethod
|
189
|
-
def initialize_page(self,
|
188
|
+
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
190
189
|
pass
|
@@ -5,9 +5,10 @@ from docling.backend.abstract_backend import (
|
|
5
5
|
DeclarativeDocumentBackend,
|
6
6
|
)
|
7
7
|
from docling.datamodel.base_models import ConversionStatus
|
8
|
-
from docling.datamodel.document import ConversionResult
|
8
|
+
from docling.datamodel.document import ConversionResult
|
9
9
|
from docling.datamodel.pipeline_options import PipelineOptions
|
10
10
|
from docling.pipeline.base_pipeline import BasePipeline
|
11
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
11
12
|
|
12
13
|
_log = logging.getLogger(__name__)
|
13
14
|
|
@@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline):
|
|
22
23
|
def __init__(self, pipeline_options: PipelineOptions):
|
23
24
|
super().__init__(pipeline_options)
|
24
25
|
|
25
|
-
def _build_document(
|
26
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
27
|
-
) -> ConversionResult:
|
26
|
+
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
28
27
|
|
29
|
-
if not isinstance(
|
28
|
+
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
30
29
|
raise RuntimeError(
|
31
|
-
f"The selected backend {type(
|
30
|
+
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|
32
31
|
f"Can not convert this with simple pipeline. "
|
33
32
|
f"Please check your format configuration on DocumentConverter."
|
34
33
|
)
|
@@ -38,13 +37,11 @@ class SimplePipeline(BasePipeline):
|
|
38
37
|
# Instead of running a page-level pipeline to build up the document structure,
|
39
38
|
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
40
39
|
# a DoclingDocument straight.
|
41
|
-
|
42
|
-
|
40
|
+
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
41
|
+
conv_res.document = conv_res.input._backend.convert()
|
43
42
|
return conv_res
|
44
43
|
|
45
|
-
def _determine_status(
|
46
|
-
self, in_doc: InputDocument, conv_res: ConversionResult
|
47
|
-
) -> ConversionStatus:
|
44
|
+
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
48
45
|
# This is called only if the previous steps didn't raise.
|
49
46
|
# Since we don't have anything else to evaluate, we can
|
50
47
|
# safely return SUCCESS.
|