docling 1.19.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +240 -0
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +379 -324
- docling/datamodel/pipeline_options.py +16 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +19 -6
- docling/models/ds_glm_model.py +220 -22
- docling/models/easyocr_model.py +45 -40
- docling/models/layout_model.py +130 -114
- docling/models/page_assemble_model.py +119 -95
- docling/models/page_preprocessing_model.py +61 -0
- docling/models/table_structure_model.py +122 -111
- docling/models/tesseract_ocr_cli_model.py +63 -56
- docling/models/tesseract_ocr_model.py +58 -50
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.1.0.dist-info/METADATA +149 -0
- docling-2.1.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
docling/models/layout_model.py
CHANGED
@@ -2,8 +2,10 @@ import copy
|
|
2
2
|
import logging
|
3
3
|
import random
|
4
4
|
import time
|
5
|
+
from pathlib import Path
|
5
6
|
from typing import Iterable, List
|
6
7
|
|
8
|
+
from docling_core.types.doc import CoordOrigin, DocItemLabel
|
7
9
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
8
10
|
from PIL import ImageDraw
|
9
11
|
|
@@ -11,74 +13,73 @@ from docling.datamodel.base_models import (
|
|
11
13
|
BoundingBox,
|
12
14
|
Cell,
|
13
15
|
Cluster,
|
14
|
-
CoordOrigin,
|
15
16
|
LayoutPrediction,
|
16
17
|
Page,
|
17
18
|
)
|
19
|
+
from docling.models.base_model import BasePageModel
|
18
20
|
from docling.utils import layout_utils as lu
|
19
21
|
|
20
22
|
_log = logging.getLogger(__name__)
|
21
23
|
|
22
24
|
|
23
|
-
class LayoutModel:
|
25
|
+
class LayoutModel(BasePageModel):
|
24
26
|
|
25
27
|
TEXT_ELEM_LABELS = [
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
# "Title"
|
28
|
+
DocItemLabel.TEXT,
|
29
|
+
DocItemLabel.FOOTNOTE,
|
30
|
+
DocItemLabel.CAPTION,
|
31
|
+
DocItemLabel.CHECKBOX_UNSELECTED,
|
32
|
+
DocItemLabel.CHECKBOX_SELECTED,
|
33
|
+
DocItemLabel.SECTION_HEADER,
|
34
|
+
DocItemLabel.PAGE_HEADER,
|
35
|
+
DocItemLabel.PAGE_FOOTER,
|
36
|
+
DocItemLabel.CODE,
|
37
|
+
DocItemLabel.LIST_ITEM,
|
37
38
|
# "Formula",
|
38
39
|
]
|
39
|
-
PAGE_HEADER_LABELS = [
|
40
|
+
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
|
40
41
|
|
41
|
-
TABLE_LABEL =
|
42
|
-
FIGURE_LABEL =
|
43
|
-
FORMULA_LABEL =
|
42
|
+
TABLE_LABEL = DocItemLabel.TABLE
|
43
|
+
FIGURE_LABEL = DocItemLabel.PICTURE
|
44
|
+
FORMULA_LABEL = DocItemLabel.FORMULA
|
44
45
|
|
45
|
-
def __init__(self,
|
46
|
-
self.
|
47
|
-
self.layout_predictor = LayoutPredictor(
|
48
|
-
config["artifacts_path"]
|
49
|
-
) # TODO temporary
|
46
|
+
def __init__(self, artifacts_path: Path):
|
47
|
+
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
|
50
48
|
|
51
|
-
def postprocess(self,
|
49
|
+
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
|
52
50
|
MIN_INTERSECTION = 0.2
|
53
51
|
CLASS_THRESHOLDS = {
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
52
|
+
DocItemLabel.CAPTION: 0.35,
|
53
|
+
DocItemLabel.FOOTNOTE: 0.35,
|
54
|
+
DocItemLabel.FORMULA: 0.35,
|
55
|
+
DocItemLabel.LIST_ITEM: 0.35,
|
56
|
+
DocItemLabel.PAGE_FOOTER: 0.35,
|
57
|
+
DocItemLabel.PAGE_HEADER: 0.35,
|
58
|
+
DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
|
59
|
+
DocItemLabel.SECTION_HEADER: 0.45,
|
60
|
+
DocItemLabel.TABLE: 0.35,
|
61
|
+
DocItemLabel.TEXT: 0.45,
|
62
|
+
DocItemLabel.TITLE: 0.45,
|
63
|
+
DocItemLabel.DOCUMENT_INDEX: 0.45,
|
64
|
+
DocItemLabel.CODE: 0.45,
|
65
|
+
DocItemLabel.CHECKBOX_SELECTED: 0.45,
|
66
|
+
DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
|
67
|
+
DocItemLabel.FORM: 0.45,
|
68
|
+
DocItemLabel.KEY_VALUE_REGION: 0.45,
|
71
69
|
}
|
72
70
|
|
73
|
-
CLASS_REMAPPINGS = {
|
71
|
+
CLASS_REMAPPINGS = {
|
72
|
+
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
|
73
|
+
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
74
|
+
}
|
74
75
|
|
75
76
|
_log.debug("================= Start postprocess function ====================")
|
76
77
|
start_time = time.time()
|
77
78
|
# Apply Confidence Threshold to cluster predictions
|
78
79
|
# confidence = self.conf_threshold
|
79
|
-
|
80
|
+
clusters_mod = []
|
80
81
|
|
81
|
-
for cluster in
|
82
|
+
for cluster in clusters_in:
|
82
83
|
confidence = CLASS_THRESHOLDS[cluster.label]
|
83
84
|
if cluster.confidence >= confidence:
|
84
85
|
# annotation["created_by"] = "high_conf_pred"
|
@@ -86,10 +87,10 @@ class LayoutModel:
|
|
86
87
|
# Remap class labels where needed.
|
87
88
|
if cluster.label in CLASS_REMAPPINGS.keys():
|
88
89
|
cluster.label = CLASS_REMAPPINGS[cluster.label]
|
89
|
-
|
90
|
+
clusters_mod.append(cluster)
|
90
91
|
|
91
92
|
# map to dictionary clusters and cells, with bottom left origin
|
92
|
-
|
93
|
+
clusters_orig = [
|
93
94
|
{
|
94
95
|
"id": c.id,
|
95
96
|
"bbox": list(
|
@@ -99,7 +100,7 @@ class LayoutModel:
|
|
99
100
|
"cell_ids": [],
|
100
101
|
"type": c.label,
|
101
102
|
}
|
102
|
-
for c in
|
103
|
+
for c in clusters_in
|
103
104
|
]
|
104
105
|
|
105
106
|
clusters_out = [
|
@@ -113,9 +114,11 @@ class LayoutModel:
|
|
113
114
|
"cell_ids": [],
|
114
115
|
"type": c.label,
|
115
116
|
}
|
116
|
-
for c in
|
117
|
+
for c in clusters_mod
|
117
118
|
]
|
118
119
|
|
120
|
+
del clusters_mod
|
121
|
+
|
119
122
|
raw_cells = [
|
120
123
|
{
|
121
124
|
"id": c.id,
|
@@ -149,7 +152,7 @@ class LayoutModel:
|
|
149
152
|
|
150
153
|
# Assign orphan cells with lower confidence predictions
|
151
154
|
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
|
152
|
-
clusters_out,
|
155
|
+
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
|
153
156
|
)
|
154
157
|
|
155
158
|
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
|
@@ -178,7 +181,7 @@ class LayoutModel:
|
|
178
181
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
179
182
|
|
180
183
|
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
|
181
|
-
clusters_out,
|
184
|
+
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
|
182
185
|
)
|
183
186
|
|
184
187
|
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
|
@@ -237,92 +240,105 @@ class LayoutModel:
|
|
237
240
|
end_time = time.time() - start_time
|
238
241
|
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
|
239
242
|
|
240
|
-
|
243
|
+
cells_out_new = [
|
241
244
|
Cell(
|
242
|
-
id=c["id"],
|
245
|
+
id=c["id"], # type: ignore
|
243
246
|
bbox=BoundingBox.from_tuple(
|
244
|
-
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
247
|
+
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
|
245
248
|
).to_top_left_origin(page_height),
|
246
|
-
text=c["text"],
|
249
|
+
text=c["text"], # type: ignore
|
247
250
|
)
|
248
251
|
for c in cells_out
|
249
252
|
]
|
253
|
+
|
254
|
+
del cells_out
|
255
|
+
|
250
256
|
clusters_out_new = []
|
251
257
|
for c in clusters_out:
|
252
|
-
cluster_cells = [
|
258
|
+
cluster_cells = [
|
259
|
+
ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore
|
260
|
+
]
|
253
261
|
c_new = Cluster(
|
254
|
-
id=c["id"],
|
262
|
+
id=c["id"], # type: ignore
|
255
263
|
bbox=BoundingBox.from_tuple(
|
256
|
-
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
264
|
+
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
|
257
265
|
).to_top_left_origin(page_height),
|
258
|
-
confidence=c["confidence"],
|
259
|
-
label=c["type"],
|
266
|
+
confidence=c["confidence"], # type: ignore
|
267
|
+
label=DocItemLabel(c["type"]),
|
260
268
|
cells=cluster_cells,
|
261
269
|
)
|
262
270
|
clusters_out_new.append(c_new)
|
263
271
|
|
264
|
-
return clusters_out_new,
|
272
|
+
return clusters_out_new, cells_out_new
|
265
273
|
|
266
274
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
267
275
|
for page in page_batch:
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
overlap_frac = 0.0
|
288
|
-
else:
|
289
|
-
overlap_frac = (
|
290
|
-
cell.bbox.intersection_area_with(cluster.bbox)
|
291
|
-
/ cell.bbox.area()
|
292
|
-
)
|
293
|
-
|
294
|
-
if overlap_frac > 0.5:
|
295
|
-
cluster.cells.append(cell)
|
296
|
-
|
297
|
-
# Pre-sort clusters
|
298
|
-
# clusters = self.sort_clusters_by_cell_order(clusters)
|
299
|
-
|
300
|
-
# DEBUG code:
|
301
|
-
def draw_clusters_and_cells():
|
302
|
-
image = copy.deepcopy(page.image)
|
303
|
-
draw = ImageDraw.Draw(image)
|
304
|
-
for c in clusters:
|
305
|
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
306
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
307
|
-
|
308
|
-
cell_color = (
|
309
|
-
random.randint(30, 140),
|
310
|
-
random.randint(30, 140),
|
311
|
-
random.randint(30, 140),
|
276
|
+
assert page._backend is not None
|
277
|
+
if not page._backend.is_valid():
|
278
|
+
yield page
|
279
|
+
else:
|
280
|
+
assert page.size is not None
|
281
|
+
|
282
|
+
clusters = []
|
283
|
+
for ix, pred_item in enumerate(
|
284
|
+
self.layout_predictor.predict(page.get_image(scale=1.0))
|
285
|
+
):
|
286
|
+
label = DocItemLabel(
|
287
|
+
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
|
288
|
+
) # Temporary, until docling-ibm-model uses docling-core types
|
289
|
+
cluster = Cluster(
|
290
|
+
id=ix,
|
291
|
+
label=label,
|
292
|
+
confidence=pred_item["confidence"],
|
293
|
+
bbox=BoundingBox.model_validate(pred_item),
|
294
|
+
cells=[],
|
312
295
|
)
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
296
|
+
clusters.append(cluster)
|
297
|
+
|
298
|
+
# Map cells to clusters
|
299
|
+
# TODO: Remove, postprocess should take care of it anyway.
|
300
|
+
for cell in page.cells:
|
301
|
+
for cluster in clusters:
|
302
|
+
if not cell.bbox.area() > 0:
|
303
|
+
overlap_frac = 0.0
|
304
|
+
else:
|
305
|
+
overlap_frac = (
|
306
|
+
cell.bbox.intersection_area_with(cluster.bbox)
|
307
|
+
/ cell.bbox.area()
|
308
|
+
)
|
309
|
+
|
310
|
+
if overlap_frac > 0.5:
|
311
|
+
cluster.cells.append(cell)
|
312
|
+
|
313
|
+
# Pre-sort clusters
|
314
|
+
# clusters = self.sort_clusters_by_cell_order(clusters)
|
315
|
+
|
316
|
+
# DEBUG code:
|
317
|
+
def draw_clusters_and_cells():
|
318
|
+
image = copy.deepcopy(page.image)
|
319
|
+
draw = ImageDraw.Draw(image)
|
320
|
+
for c in clusters:
|
321
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
322
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
323
|
+
|
324
|
+
cell_color = (
|
325
|
+
random.randint(30, 140),
|
326
|
+
random.randint(30, 140),
|
327
|
+
random.randint(30, 140),
|
328
|
+
)
|
329
|
+
for tc in c.cells: # [:1]:
|
330
|
+
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
331
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
332
|
+
image.show()
|
317
333
|
|
318
|
-
|
334
|
+
# draw_clusters_and_cells()
|
319
335
|
|
320
|
-
|
321
|
-
|
322
|
-
|
336
|
+
clusters, page.cells = self.postprocess(
|
337
|
+
clusters, page.cells, page.size.height
|
338
|
+
)
|
323
339
|
|
324
|
-
|
340
|
+
# draw_clusters_and_cells()
|
325
341
|
|
326
|
-
|
342
|
+
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
327
343
|
|
328
|
-
|
344
|
+
yield page
|
docling/models/page_assemble_model.py
CHANGED
@@ -2,22 +2,29 @@ import logging
|
|
2
2
|
import re
|
3
3
|
from typing import Iterable, List
|
4
4
|
|
5
|
+
from pydantic import BaseModel
|
6
|
+
|
5
7
|
from docling.datamodel.base_models import (
|
6
8
|
AssembledUnit,
|
7
9
|
FigureElement,
|
8
10
|
Page,
|
9
11
|
PageElement,
|
10
|
-
|
12
|
+
Table,
|
11
13
|
TextElement,
|
12
14
|
)
|
15
|
+
from docling.models.base_model import BasePageModel
|
13
16
|
from docling.models.layout_model import LayoutModel
|
14
17
|
|
15
18
|
_log = logging.getLogger(__name__)
|
16
19
|
|
17
20
|
|
18
|
-
class
|
19
|
-
|
20
|
-
|
21
|
+
class PageAssembleOptions(BaseModel):
|
22
|
+
keep_images: bool = False
|
23
|
+
|
24
|
+
|
25
|
+
class PageAssembleModel(BasePageModel):
|
26
|
+
def __init__(self, options: PageAssembleOptions):
|
27
|
+
self.options = options
|
21
28
|
|
22
29
|
def sanitize_text(self, lines):
|
23
30
|
if len(lines) <= 1:
|
@@ -46,103 +53,120 @@ class PageAssembleModel:
|
|
46
53
|
|
47
54
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
48
55
|
for page in page_batch:
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
for cluster in page.predictions.layout.clusters:
|
56
|
-
# _log.info("Cluster label seen:", cluster.label)
|
57
|
-
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
58
|
-
|
59
|
-
textlines = [
|
60
|
-
cell.text.replace("\x02", "-").strip()
|
61
|
-
for cell in cluster.cells
|
62
|
-
if len(cell.text.strip()) > 0
|
63
|
-
]
|
64
|
-
text = self.sanitize_text(textlines)
|
65
|
-
text_el = TextElement(
|
66
|
-
label=cluster.label,
|
67
|
-
id=cluster.id,
|
68
|
-
text=text,
|
69
|
-
page_no=page.page_no,
|
70
|
-
cluster=cluster,
|
71
|
-
)
|
72
|
-
elements.append(text_el)
|
73
|
-
|
74
|
-
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
75
|
-
headers.append(text_el)
|
76
|
-
else:
|
77
|
-
body.append(text_el)
|
78
|
-
elif cluster.label == LayoutModel.TABLE_LABEL:
|
79
|
-
tbl = None
|
80
|
-
if page.predictions.tablestructure:
|
81
|
-
tbl = page.predictions.tablestructure.table_map.get(
|
82
|
-
cluster.id, None
|
83
|
-
)
|
84
|
-
if (
|
85
|
-
not tbl
|
86
|
-
): # fallback: add table without structure, if it isn't present
|
87
|
-
tbl = TableElement(
|
88
|
-
label=cluster.label,
|
89
|
-
id=cluster.id,
|
90
|
-
text="",
|
91
|
-
otsl_seq=[],
|
92
|
-
table_cells=[],
|
93
|
-
cluster=cluster,
|
94
|
-
page_no=page.page_no,
|
95
|
-
)
|
56
|
+
assert page._backend is not None
|
57
|
+
if not page._backend.is_valid():
|
58
|
+
yield page
|
59
|
+
else:
|
60
|
+
assert page.predictions.layout is not None
|
96
61
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
if
|
106
|
-
|
107
|
-
|
108
|
-
|
62
|
+
# assembles some JSON output page by page.
|
63
|
+
|
64
|
+
elements: List[PageElement] = []
|
65
|
+
headers: List[PageElement] = []
|
66
|
+
body: List[PageElement] = []
|
67
|
+
|
68
|
+
for cluster in page.predictions.layout.clusters:
|
69
|
+
# _log.info("Cluster label seen:", cluster.label)
|
70
|
+
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
71
|
+
|
72
|
+
textlines = [
|
73
|
+
cell.text.replace("\x02", "-").strip()
|
74
|
+
for cell in cluster.cells
|
75
|
+
if len(cell.text.strip()) > 0
|
76
|
+
]
|
77
|
+
text = self.sanitize_text(textlines)
|
78
|
+
text_el = TextElement(
|
109
79
|
label=cluster.label,
|
110
80
|
id=cluster.id,
|
111
|
-
text=
|
112
|
-
data=None,
|
113
|
-
cluster=cluster,
|
81
|
+
text=text,
|
114
82
|
page_no=page.page_no,
|
83
|
+
cluster=cluster,
|
115
84
|
)
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
85
|
+
elements.append(text_el)
|
86
|
+
|
87
|
+
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
88
|
+
headers.append(text_el)
|
89
|
+
else:
|
90
|
+
body.append(text_el)
|
91
|
+
elif cluster.label == LayoutModel.TABLE_LABEL:
|
92
|
+
tbl = None
|
93
|
+
if page.predictions.tablestructure:
|
94
|
+
tbl = page.predictions.tablestructure.table_map.get(
|
123
95
|
cluster.id, None
|
124
96
|
)
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
97
|
+
if (
|
98
|
+
not tbl
|
99
|
+
): # fallback: add table without structure, if it isn't present
|
100
|
+
tbl = Table(
|
101
|
+
label=cluster.label,
|
102
|
+
id=cluster.id,
|
103
|
+
text="",
|
104
|
+
otsl_seq=[],
|
105
|
+
table_cells=[],
|
106
|
+
cluster=cluster,
|
107
|
+
page_no=page.page_no,
|
108
|
+
)
|
109
|
+
|
110
|
+
elements.append(tbl)
|
111
|
+
body.append(tbl)
|
112
|
+
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
113
|
+
fig = None
|
114
|
+
if page.predictions.figures_classification:
|
115
|
+
fig = (
|
116
|
+
page.predictions.figures_classification.figure_map.get(
|
117
|
+
cluster.id, None
|
118
|
+
)
|
119
|
+
)
|
120
|
+
if (
|
121
|
+
not fig
|
122
|
+
): # fallback: add figure without classification, if it isn't present
|
123
|
+
fig = FigureElement(
|
124
|
+
label=cluster.label,
|
125
|
+
id=cluster.id,
|
126
|
+
text="",
|
127
|
+
data=None,
|
128
|
+
cluster=cluster,
|
129
|
+
page_no=page.page_no,
|
130
|
+
)
|
131
|
+
elements.append(fig)
|
132
|
+
body.append(fig)
|
133
|
+
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
134
|
+
equation = None
|
135
|
+
if page.predictions.equations_prediction:
|
136
|
+
equation = (
|
137
|
+
page.predictions.equations_prediction.equation_map.get(
|
138
|
+
cluster.id, None
|
139
|
+
)
|
140
|
+
)
|
141
|
+
if (
|
142
|
+
not equation
|
143
|
+
): # fallback: add empty formula, if it isn't present
|
144
|
+
text = self.sanitize_text(
|
145
|
+
[
|
146
|
+
cell.text.replace("\x02", "-").strip()
|
147
|
+
for cell in cluster.cells
|
148
|
+
if len(cell.text.strip()) > 0
|
149
|
+
]
|
150
|
+
)
|
151
|
+
equation = TextElement(
|
152
|
+
label=cluster.label,
|
153
|
+
id=cluster.id,
|
154
|
+
cluster=cluster,
|
155
|
+
page_no=page.page_no,
|
156
|
+
text=text,
|
157
|
+
)
|
158
|
+
elements.append(equation)
|
159
|
+
body.append(equation)
|
160
|
+
|
161
|
+
page.assembled = AssembledUnit(
|
162
|
+
elements=elements, headers=headers, body=body
|
163
|
+
)
|
164
|
+
|
165
|
+
# Remove page images (can be disabled)
|
166
|
+
if not self.options.keep_images:
|
167
|
+
page._image_cache = {}
|
143
168
|
|
144
|
-
|
145
|
-
|
146
|
-
)
|
169
|
+
# Unload backend
|
170
|
+
page._backend.unload()
|
147
171
|
|
148
|
-
|
172
|
+
yield page
|
docling/models/page_preprocessing_model.py
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
from typing import Iterable, Optional
|
2
|
+
|
3
|
+
from PIL import ImageDraw
|
4
|
+
from pydantic import BaseModel
|
5
|
+
|
6
|
+
from docling.datamodel.base_models import Page
|
7
|
+
from docling.models.base_model import BasePageModel
|
8
|
+
|
9
|
+
|
10
|
+
class PagePreprocessingOptions(BaseModel):
|
11
|
+
images_scale: Optional[float]
|
12
|
+
|
13
|
+
|
14
|
+
class PagePreprocessingModel(BasePageModel):
|
15
|
+
def __init__(self, options: PagePreprocessingOptions):
|
16
|
+
self.options = options
|
17
|
+
|
18
|
+
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
19
|
+
for page in page_batch:
|
20
|
+
assert page._backend is not None
|
21
|
+
if not page._backend.is_valid():
|
22
|
+
yield page
|
23
|
+
else:
|
24
|
+
page = self._populate_page_images(page)
|
25
|
+
page = self._parse_page_cells(page)
|
26
|
+
yield page
|
27
|
+
|
28
|
+
# Generate the page image and store it in the page object
|
29
|
+
def _populate_page_images(self, page: Page) -> Page:
|
30
|
+
# default scale
|
31
|
+
page.get_image(
|
32
|
+
scale=1.0
|
33
|
+
) # puts the page image on the image cache at default scale
|
34
|
+
|
35
|
+
images_scale = self.options.images_scale
|
36
|
+
# user requested scales
|
37
|
+
if images_scale is not None:
|
38
|
+
page._default_image_scale = images_scale
|
39
|
+
page.get_image(
|
40
|
+
scale=images_scale
|
41
|
+
) # this will trigger storing the image in the internal cache
|
42
|
+
|
43
|
+
return page
|
44
|
+
|
45
|
+
# Extract and populate the page cells and store it in the page object
|
46
|
+
def _parse_page_cells(self, page: Page) -> Page:
|
47
|
+
assert page._backend is not None
|
48
|
+
|
49
|
+
page.cells = list(page._backend.get_text_cells())
|
50
|
+
|
51
|
+
# DEBUG code:
|
52
|
+
def draw_text_boxes(image, cells):
|
53
|
+
draw = ImageDraw.Draw(image)
|
54
|
+
for c in cells:
|
55
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
56
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
57
|
+
image.show()
|
58
|
+
|
59
|
+
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
60
|
+
|
61
|
+
return page
|