docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/models/layout_model.py
CHANGED
@@ -2,8 +2,10 @@ import copy
|
|
2
2
|
import logging
|
3
3
|
import random
|
4
4
|
import time
|
5
|
+
from pathlib import Path
|
5
6
|
from typing import Iterable, List
|
6
7
|
|
8
|
+
from docling_core.types.doc import CoordOrigin, DocItemLabel
|
7
9
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
8
10
|
from PIL import ImageDraw
|
9
11
|
|
@@ -11,74 +13,76 @@ from docling.datamodel.base_models import (
|
|
11
13
|
BoundingBox,
|
12
14
|
Cell,
|
13
15
|
Cluster,
|
14
|
-
CoordOrigin,
|
15
16
|
LayoutPrediction,
|
16
17
|
Page,
|
17
18
|
)
|
19
|
+
from docling.datamodel.document import ConversionResult
|
20
|
+
from docling.datamodel.settings import settings
|
21
|
+
from docling.models.base_model import BasePageModel
|
18
22
|
from docling.utils import layout_utils as lu
|
23
|
+
from docling.utils.profiling import TimeRecorder
|
19
24
|
|
20
25
|
_log = logging.getLogger(__name__)
|
21
26
|
|
22
27
|
|
23
|
-
class LayoutModel:
|
28
|
+
class LayoutModel(BasePageModel):
|
24
29
|
|
25
30
|
TEXT_ELEM_LABELS = [
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
# "Title"
|
31
|
+
DocItemLabel.TEXT,
|
32
|
+
DocItemLabel.FOOTNOTE,
|
33
|
+
DocItemLabel.CAPTION,
|
34
|
+
DocItemLabel.CHECKBOX_UNSELECTED,
|
35
|
+
DocItemLabel.CHECKBOX_SELECTED,
|
36
|
+
DocItemLabel.SECTION_HEADER,
|
37
|
+
DocItemLabel.PAGE_HEADER,
|
38
|
+
DocItemLabel.PAGE_FOOTER,
|
39
|
+
DocItemLabel.CODE,
|
40
|
+
DocItemLabel.LIST_ITEM,
|
37
41
|
# "Formula",
|
38
42
|
]
|
39
|
-
PAGE_HEADER_LABELS = [
|
43
|
+
PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
|
40
44
|
|
41
|
-
TABLE_LABEL =
|
42
|
-
FIGURE_LABEL =
|
43
|
-
FORMULA_LABEL =
|
45
|
+
TABLE_LABEL = DocItemLabel.TABLE
|
46
|
+
FIGURE_LABEL = DocItemLabel.PICTURE
|
47
|
+
FORMULA_LABEL = DocItemLabel.FORMULA
|
44
48
|
|
45
|
-
def __init__(self,
|
46
|
-
self.
|
47
|
-
self.layout_predictor = LayoutPredictor(
|
48
|
-
config["artifacts_path"]
|
49
|
-
) # TODO temporary
|
49
|
+
def __init__(self, artifacts_path: Path):
|
50
|
+
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
|
50
51
|
|
51
|
-
def postprocess(self,
|
52
|
+
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
|
52
53
|
MIN_INTERSECTION = 0.2
|
53
54
|
CLASS_THRESHOLDS = {
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
55
|
+
DocItemLabel.CAPTION: 0.35,
|
56
|
+
DocItemLabel.FOOTNOTE: 0.35,
|
57
|
+
DocItemLabel.FORMULA: 0.35,
|
58
|
+
DocItemLabel.LIST_ITEM: 0.35,
|
59
|
+
DocItemLabel.PAGE_FOOTER: 0.35,
|
60
|
+
DocItemLabel.PAGE_HEADER: 0.35,
|
61
|
+
DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
|
62
|
+
DocItemLabel.SECTION_HEADER: 0.45,
|
63
|
+
DocItemLabel.TABLE: 0.35,
|
64
|
+
DocItemLabel.TEXT: 0.45,
|
65
|
+
DocItemLabel.TITLE: 0.45,
|
66
|
+
DocItemLabel.DOCUMENT_INDEX: 0.45,
|
67
|
+
DocItemLabel.CODE: 0.45,
|
68
|
+
DocItemLabel.CHECKBOX_SELECTED: 0.45,
|
69
|
+
DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
|
70
|
+
DocItemLabel.FORM: 0.45,
|
71
|
+
DocItemLabel.KEY_VALUE_REGION: 0.45,
|
71
72
|
}
|
72
73
|
|
73
|
-
CLASS_REMAPPINGS = {
|
74
|
+
CLASS_REMAPPINGS = {
|
75
|
+
DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
|
76
|
+
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
77
|
+
}
|
74
78
|
|
75
79
|
_log.debug("================= Start postprocess function ====================")
|
76
80
|
start_time = time.time()
|
77
81
|
# Apply Confidence Threshold to cluster predictions
|
78
82
|
# confidence = self.conf_threshold
|
79
|
-
|
83
|
+
clusters_mod = []
|
80
84
|
|
81
|
-
for cluster in
|
85
|
+
for cluster in clusters_in:
|
82
86
|
confidence = CLASS_THRESHOLDS[cluster.label]
|
83
87
|
if cluster.confidence >= confidence:
|
84
88
|
# annotation["created_by"] = "high_conf_pred"
|
@@ -86,10 +90,10 @@ class LayoutModel:
|
|
86
90
|
# Remap class labels where needed.
|
87
91
|
if cluster.label in CLASS_REMAPPINGS.keys():
|
88
92
|
cluster.label = CLASS_REMAPPINGS[cluster.label]
|
89
|
-
|
93
|
+
clusters_mod.append(cluster)
|
90
94
|
|
91
95
|
# map to dictionary clusters and cells, with bottom left origin
|
92
|
-
|
96
|
+
clusters_orig = [
|
93
97
|
{
|
94
98
|
"id": c.id,
|
95
99
|
"bbox": list(
|
@@ -99,7 +103,7 @@ class LayoutModel:
|
|
99
103
|
"cell_ids": [],
|
100
104
|
"type": c.label,
|
101
105
|
}
|
102
|
-
for c in
|
106
|
+
for c in clusters_in
|
103
107
|
]
|
104
108
|
|
105
109
|
clusters_out = [
|
@@ -113,9 +117,11 @@ class LayoutModel:
|
|
113
117
|
"cell_ids": [],
|
114
118
|
"type": c.label,
|
115
119
|
}
|
116
|
-
for c in
|
120
|
+
for c in clusters_mod
|
117
121
|
]
|
118
122
|
|
123
|
+
del clusters_mod
|
124
|
+
|
119
125
|
raw_cells = [
|
120
126
|
{
|
121
127
|
"id": c.id,
|
@@ -149,7 +155,7 @@ class LayoutModel:
|
|
149
155
|
|
150
156
|
# Assign orphan cells with lower confidence predictions
|
151
157
|
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
|
152
|
-
clusters_out,
|
158
|
+
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
|
153
159
|
)
|
154
160
|
|
155
161
|
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
|
@@ -178,7 +184,7 @@ class LayoutModel:
|
|
178
184
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
179
185
|
|
180
186
|
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
|
181
|
-
clusters_out,
|
187
|
+
clusters_out, clusters_orig, raw_cells, orphan_cell_indices
|
182
188
|
)
|
183
189
|
|
184
190
|
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
|
@@ -237,92 +243,128 @@ class LayoutModel:
|
|
237
243
|
end_time = time.time() - start_time
|
238
244
|
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
|
239
245
|
|
240
|
-
|
246
|
+
cells_out_new = [
|
241
247
|
Cell(
|
242
|
-
id=c["id"],
|
248
|
+
id=c["id"], # type: ignore
|
243
249
|
bbox=BoundingBox.from_tuple(
|
244
|
-
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
250
|
+
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
|
245
251
|
).to_top_left_origin(page_height),
|
246
|
-
text=c["text"],
|
252
|
+
text=c["text"], # type: ignore
|
247
253
|
)
|
248
254
|
for c in cells_out
|
249
255
|
]
|
256
|
+
|
257
|
+
del cells_out
|
258
|
+
|
250
259
|
clusters_out_new = []
|
251
260
|
for c in clusters_out:
|
252
|
-
cluster_cells = [
|
261
|
+
cluster_cells = [
|
262
|
+
ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore
|
263
|
+
]
|
253
264
|
c_new = Cluster(
|
254
|
-
id=c["id"],
|
265
|
+
id=c["id"], # type: ignore
|
255
266
|
bbox=BoundingBox.from_tuple(
|
256
|
-
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
267
|
+
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
|
257
268
|
).to_top_left_origin(page_height),
|
258
|
-
confidence=c["confidence"],
|
259
|
-
label=c["type"],
|
269
|
+
confidence=c["confidence"], # type: ignore
|
270
|
+
label=DocItemLabel(c["type"]),
|
260
271
|
cells=cluster_cells,
|
261
272
|
)
|
262
273
|
clusters_out_new.append(c_new)
|
263
274
|
|
264
|
-
return clusters_out_new,
|
275
|
+
return clusters_out_new, cells_out_new
|
276
|
+
|
277
|
+
def __call__(
|
278
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
279
|
+
) -> Iterable[Page]:
|
265
280
|
|
266
|
-
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
267
281
|
for page in page_batch:
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
/ cell.bbox.area()
|
282
|
+
assert page._backend is not None
|
283
|
+
if not page._backend.is_valid():
|
284
|
+
yield page
|
285
|
+
else:
|
286
|
+
with TimeRecorder(conv_res, "layout"):
|
287
|
+
assert page.size is not None
|
288
|
+
|
289
|
+
clusters = []
|
290
|
+
for ix, pred_item in enumerate(
|
291
|
+
self.layout_predictor.predict(page.get_image(scale=1.0))
|
292
|
+
):
|
293
|
+
label = DocItemLabel(
|
294
|
+
pred_item["label"]
|
295
|
+
.lower()
|
296
|
+
.replace(" ", "_")
|
297
|
+
.replace("-", "_")
|
298
|
+
) # Temporary, until docling-ibm-model uses docling-core types
|
299
|
+
cluster = Cluster(
|
300
|
+
id=ix,
|
301
|
+
label=label,
|
302
|
+
confidence=pred_item["confidence"],
|
303
|
+
bbox=BoundingBox.model_validate(pred_item),
|
304
|
+
cells=[],
|
292
305
|
)
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
306
|
+
clusters.append(cluster)
|
307
|
+
|
308
|
+
# Map cells to clusters
|
309
|
+
# TODO: Remove, postprocess should take care of it anyway.
|
310
|
+
for cell in page.cells:
|
311
|
+
for cluster in clusters:
|
312
|
+
if not cell.bbox.area() > 0:
|
313
|
+
overlap_frac = 0.0
|
314
|
+
else:
|
315
|
+
overlap_frac = (
|
316
|
+
cell.bbox.intersection_area_with(cluster.bbox)
|
317
|
+
/ cell.bbox.area()
|
318
|
+
)
|
319
|
+
|
320
|
+
if overlap_frac > 0.5:
|
321
|
+
cluster.cells.append(cell)
|
322
|
+
|
323
|
+
# Pre-sort clusters
|
324
|
+
# clusters = self.sort_clusters_by_cell_order(clusters)
|
325
|
+
|
326
|
+
# DEBUG code:
|
327
|
+
def draw_clusters_and_cells(show: bool = False):
|
328
|
+
image = copy.deepcopy(page.image)
|
329
|
+
if image is not None:
|
330
|
+
draw = ImageDraw.Draw(image)
|
331
|
+
for c in clusters:
|
332
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
333
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
334
|
+
|
335
|
+
cell_color = (
|
336
|
+
random.randint(30, 140),
|
337
|
+
random.randint(30, 140),
|
338
|
+
random.randint(30, 140),
|
339
|
+
)
|
340
|
+
for tc in c.cells: # [:1]:
|
341
|
+
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
342
|
+
draw.rectangle(
|
343
|
+
[(x0, y0), (x1, y1)], outline=cell_color
|
344
|
+
)
|
345
|
+
if show:
|
346
|
+
image.show()
|
347
|
+
else:
|
348
|
+
out_path: Path = (
|
349
|
+
Path(settings.debug.debug_output_path)
|
350
|
+
/ f"debug_{conv_res.input.file.stem}"
|
351
|
+
)
|
352
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
353
|
+
|
354
|
+
out_file = (
|
355
|
+
out_path / f"layout_page_{page.page_no:05}.png"
|
356
|
+
)
|
357
|
+
image.save(str(out_file), format="png")
|
358
|
+
|
359
|
+
# draw_clusters_and_cells()
|
360
|
+
|
361
|
+
clusters, page.cells = self.postprocess(
|
362
|
+
clusters, page.cells, page.size.height
|
312
363
|
)
|
313
|
-
for tc in c.cells: # [:1]:
|
314
|
-
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
315
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
316
|
-
image.show()
|
317
|
-
|
318
|
-
# draw_clusters_and_cells()
|
319
|
-
|
320
|
-
clusters, page.cells = self.postprocess(
|
321
|
-
clusters, page.cells, page.size.height
|
322
|
-
)
|
323
364
|
|
324
|
-
|
365
|
+
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
325
366
|
|
326
|
-
|
367
|
+
if settings.debug.visualize_layout:
|
368
|
+
draw_clusters_and_cells()
|
327
369
|
|
328
|
-
|
370
|
+
yield page
|
@@ -2,22 +2,31 @@ import logging
|
|
2
2
|
import re
|
3
3
|
from typing import Iterable, List
|
4
4
|
|
5
|
+
from pydantic import BaseModel
|
6
|
+
|
5
7
|
from docling.datamodel.base_models import (
|
6
8
|
AssembledUnit,
|
7
9
|
FigureElement,
|
8
10
|
Page,
|
9
11
|
PageElement,
|
10
|
-
|
12
|
+
Table,
|
11
13
|
TextElement,
|
12
14
|
)
|
15
|
+
from docling.datamodel.document import ConversionResult
|
16
|
+
from docling.models.base_model import BasePageModel
|
13
17
|
from docling.models.layout_model import LayoutModel
|
18
|
+
from docling.utils.profiling import TimeRecorder
|
14
19
|
|
15
20
|
_log = logging.getLogger(__name__)
|
16
21
|
|
17
22
|
|
18
|
-
class
|
19
|
-
|
20
|
-
|
23
|
+
class PageAssembleOptions(BaseModel):
|
24
|
+
keep_images: bool = False
|
25
|
+
|
26
|
+
|
27
|
+
class PageAssembleModel(BasePageModel):
|
28
|
+
def __init__(self, options: PageAssembleOptions):
|
29
|
+
self.options = options
|
21
30
|
|
22
31
|
def sanitize_text(self, lines):
|
23
32
|
if len(lines) <= 1:
|
@@ -44,105 +53,122 @@ class PageAssembleModel:
|
|
44
53
|
|
45
54
|
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
46
55
|
|
47
|
-
def __call__(
|
56
|
+
def __call__(
|
57
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
58
|
+
) -> Iterable[Page]:
|
48
59
|
for page in page_batch:
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
text=text,
|
69
|
-
page_no=page.page_no,
|
70
|
-
cluster=cluster,
|
71
|
-
)
|
72
|
-
elements.append(text_el)
|
73
|
-
|
74
|
-
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
75
|
-
headers.append(text_el)
|
76
|
-
else:
|
77
|
-
body.append(text_el)
|
78
|
-
elif cluster.label == LayoutModel.TABLE_LABEL:
|
79
|
-
tbl = None
|
80
|
-
if page.predictions.tablestructure:
|
81
|
-
tbl = page.predictions.tablestructure.table_map.get(
|
82
|
-
cluster.id, None
|
83
|
-
)
|
84
|
-
if (
|
85
|
-
not tbl
|
86
|
-
): # fallback: add table without structure, if it isn't present
|
87
|
-
tbl = TableElement(
|
88
|
-
label=cluster.label,
|
89
|
-
id=cluster.id,
|
90
|
-
text="",
|
91
|
-
otsl_seq=[],
|
92
|
-
table_cells=[],
|
93
|
-
cluster=cluster,
|
94
|
-
page_no=page.page_no,
|
95
|
-
)
|
96
|
-
|
97
|
-
elements.append(tbl)
|
98
|
-
body.append(tbl)
|
99
|
-
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
100
|
-
fig = None
|
101
|
-
if page.predictions.figures_classification:
|
102
|
-
fig = page.predictions.figures_classification.figure_map.get(
|
103
|
-
cluster.id, None
|
104
|
-
)
|
105
|
-
if (
|
106
|
-
not fig
|
107
|
-
): # fallback: add figure without classification, if it isn't present
|
108
|
-
fig = FigureElement(
|
109
|
-
label=cluster.label,
|
110
|
-
id=cluster.id,
|
111
|
-
text="",
|
112
|
-
data=None,
|
113
|
-
cluster=cluster,
|
114
|
-
page_no=page.page_no,
|
115
|
-
)
|
116
|
-
elements.append(fig)
|
117
|
-
body.append(fig)
|
118
|
-
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
119
|
-
equation = None
|
120
|
-
if page.predictions.equations_prediction:
|
121
|
-
equation = (
|
122
|
-
page.predictions.equations_prediction.equation_map.get(
|
123
|
-
cluster.id, None
|
124
|
-
)
|
125
|
-
)
|
126
|
-
if not equation: # fallback: add empty formula, if it isn't present
|
127
|
-
text = self.sanitize_text(
|
128
|
-
[
|
60
|
+
assert page._backend is not None
|
61
|
+
if not page._backend.is_valid():
|
62
|
+
yield page
|
63
|
+
else:
|
64
|
+
with TimeRecorder(conv_res, "page_assemble"):
|
65
|
+
|
66
|
+
assert page.predictions.layout is not None
|
67
|
+
|
68
|
+
# assembles some JSON output page by page.
|
69
|
+
|
70
|
+
elements: List[PageElement] = []
|
71
|
+
headers: List[PageElement] = []
|
72
|
+
body: List[PageElement] = []
|
73
|
+
|
74
|
+
for cluster in page.predictions.layout.clusters:
|
75
|
+
# _log.info("Cluster label seen:", cluster.label)
|
76
|
+
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
77
|
+
|
78
|
+
textlines = [
|
129
79
|
cell.text.replace("\x02", "-").strip()
|
130
80
|
for cell in cluster.cells
|
131
81
|
if len(cell.text.strip()) > 0
|
132
82
|
]
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
83
|
+
text = self.sanitize_text(textlines)
|
84
|
+
text_el = TextElement(
|
85
|
+
label=cluster.label,
|
86
|
+
id=cluster.id,
|
87
|
+
text=text,
|
88
|
+
page_no=page.page_no,
|
89
|
+
cluster=cluster,
|
90
|
+
)
|
91
|
+
elements.append(text_el)
|
92
|
+
|
93
|
+
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
94
|
+
headers.append(text_el)
|
95
|
+
else:
|
96
|
+
body.append(text_el)
|
97
|
+
elif cluster.label == LayoutModel.TABLE_LABEL:
|
98
|
+
tbl = None
|
99
|
+
if page.predictions.tablestructure:
|
100
|
+
tbl = page.predictions.tablestructure.table_map.get(
|
101
|
+
cluster.id, None
|
102
|
+
)
|
103
|
+
if (
|
104
|
+
not tbl
|
105
|
+
): # fallback: add table without structure, if it isn't present
|
106
|
+
tbl = Table(
|
107
|
+
label=cluster.label,
|
108
|
+
id=cluster.id,
|
109
|
+
text="",
|
110
|
+
otsl_seq=[],
|
111
|
+
table_cells=[],
|
112
|
+
cluster=cluster,
|
113
|
+
page_no=page.page_no,
|
114
|
+
)
|
115
|
+
|
116
|
+
elements.append(tbl)
|
117
|
+
body.append(tbl)
|
118
|
+
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
119
|
+
fig = None
|
120
|
+
if page.predictions.figures_classification:
|
121
|
+
fig = page.predictions.figures_classification.figure_map.get(
|
122
|
+
cluster.id, None
|
123
|
+
)
|
124
|
+
if (
|
125
|
+
not fig
|
126
|
+
): # fallback: add figure without classification, if it isn't present
|
127
|
+
fig = FigureElement(
|
128
|
+
label=cluster.label,
|
129
|
+
id=cluster.id,
|
130
|
+
text="",
|
131
|
+
data=None,
|
132
|
+
cluster=cluster,
|
133
|
+
page_no=page.page_no,
|
134
|
+
)
|
135
|
+
elements.append(fig)
|
136
|
+
body.append(fig)
|
137
|
+
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
138
|
+
equation = None
|
139
|
+
if page.predictions.equations_prediction:
|
140
|
+
equation = page.predictions.equations_prediction.equation_map.get(
|
141
|
+
cluster.id, None
|
142
|
+
)
|
143
|
+
if (
|
144
|
+
not equation
|
145
|
+
): # fallback: add empty formula, if it isn't present
|
146
|
+
text = self.sanitize_text(
|
147
|
+
[
|
148
|
+
cell.text.replace("\x02", "-").strip()
|
149
|
+
for cell in cluster.cells
|
150
|
+
if len(cell.text.strip()) > 0
|
151
|
+
]
|
152
|
+
)
|
153
|
+
equation = TextElement(
|
154
|
+
label=cluster.label,
|
155
|
+
id=cluster.id,
|
156
|
+
cluster=cluster,
|
157
|
+
page_no=page.page_no,
|
158
|
+
text=text,
|
159
|
+
)
|
160
|
+
elements.append(equation)
|
161
|
+
body.append(equation)
|
162
|
+
|
163
|
+
page.assembled = AssembledUnit(
|
164
|
+
elements=elements, headers=headers, body=body
|
165
|
+
)
|
166
|
+
|
167
|
+
# Remove page images (can be disabled)
|
168
|
+
if not self.options.keep_images:
|
169
|
+
page._image_cache = {}
|
170
|
+
|
171
|
+
# Unload backend
|
172
|
+
page._backend.unload()
|
173
|
+
|
174
|
+
yield page
|