docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -2,8 +2,10 @@ import copy
2
2
  import logging
3
3
  import random
4
4
  import time
5
+ from pathlib import Path
5
6
  from typing import Iterable, List
6
7
 
8
+ from docling_core.types.doc import CoordOrigin, DocItemLabel
7
9
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
8
10
  from PIL import ImageDraw
9
11
 
@@ -11,74 +13,76 @@ from docling.datamodel.base_models import (
11
13
  BoundingBox,
12
14
  Cell,
13
15
  Cluster,
14
- CoordOrigin,
15
16
  LayoutPrediction,
16
17
  Page,
17
18
  )
19
+ from docling.datamodel.document import ConversionResult
20
+ from docling.datamodel.settings import settings
21
+ from docling.models.base_model import BasePageModel
18
22
  from docling.utils import layout_utils as lu
23
+ from docling.utils.profiling import TimeRecorder
19
24
 
20
25
  _log = logging.getLogger(__name__)
21
26
 
22
27
 
23
- class LayoutModel:
28
+ class LayoutModel(BasePageModel):
24
29
 
25
30
  TEXT_ELEM_LABELS = [
26
- "Text",
27
- "Footnote",
28
- "Caption",
29
- "Checkbox-Unselected",
30
- "Checkbox-Selected",
31
- "Section-header",
32
- "Page-header",
33
- "Page-footer",
34
- "Code",
35
- "List-item",
36
- # "Title"
31
+ DocItemLabel.TEXT,
32
+ DocItemLabel.FOOTNOTE,
33
+ DocItemLabel.CAPTION,
34
+ DocItemLabel.CHECKBOX_UNSELECTED,
35
+ DocItemLabel.CHECKBOX_SELECTED,
36
+ DocItemLabel.SECTION_HEADER,
37
+ DocItemLabel.PAGE_HEADER,
38
+ DocItemLabel.PAGE_FOOTER,
39
+ DocItemLabel.CODE,
40
+ DocItemLabel.LIST_ITEM,
37
41
  # "Formula",
38
42
  ]
39
- PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
43
+ PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
40
44
 
41
- TABLE_LABEL = "Table"
42
- FIGURE_LABEL = "Picture"
43
- FORMULA_LABEL = "Formula"
45
+ TABLE_LABEL = DocItemLabel.TABLE
46
+ FIGURE_LABEL = DocItemLabel.PICTURE
47
+ FORMULA_LABEL = DocItemLabel.FORMULA
44
48
 
45
- def __init__(self, config):
46
- self.config = config
47
- self.layout_predictor = LayoutPredictor(
48
- config["artifacts_path"]
49
- ) # TODO temporary
49
+ def __init__(self, artifacts_path: Path):
50
+ self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
50
51
 
51
- def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
52
+ def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
52
53
  MIN_INTERSECTION = 0.2
53
54
  CLASS_THRESHOLDS = {
54
- "Caption": 0.35,
55
- "Footnote": 0.35,
56
- "Formula": 0.35,
57
- "List-item": 0.35,
58
- "Page-footer": 0.35,
59
- "Page-header": 0.35,
60
- "Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
61
- "Section-header": 0.45,
62
- "Table": 0.35,
63
- "Text": 0.45,
64
- "Title": 0.45,
65
- "Document Index": 0.45,
66
- "Code": 0.45,
67
- "Checkbox-Selected": 0.45,
68
- "Checkbox-Unselected": 0.45,
69
- "Form": 0.45,
70
- "Key-Value Region": 0.45,
55
+ DocItemLabel.CAPTION: 0.35,
56
+ DocItemLabel.FOOTNOTE: 0.35,
57
+ DocItemLabel.FORMULA: 0.35,
58
+ DocItemLabel.LIST_ITEM: 0.35,
59
+ DocItemLabel.PAGE_FOOTER: 0.35,
60
+ DocItemLabel.PAGE_HEADER: 0.35,
61
+ DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
62
+ DocItemLabel.SECTION_HEADER: 0.45,
63
+ DocItemLabel.TABLE: 0.35,
64
+ DocItemLabel.TEXT: 0.45,
65
+ DocItemLabel.TITLE: 0.45,
66
+ DocItemLabel.DOCUMENT_INDEX: 0.45,
67
+ DocItemLabel.CODE: 0.45,
68
+ DocItemLabel.CHECKBOX_SELECTED: 0.45,
69
+ DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
70
+ DocItemLabel.FORM: 0.45,
71
+ DocItemLabel.KEY_VALUE_REGION: 0.45,
71
72
  }
72
73
 
73
- CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
74
+ CLASS_REMAPPINGS = {
75
+ DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
76
+ DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
77
+ }
74
78
 
75
79
  _log.debug("================= Start postprocess function ====================")
76
80
  start_time = time.time()
77
81
  # Apply Confidence Threshold to cluster predictions
78
82
  # confidence = self.conf_threshold
79
- clusters_out = []
83
+ clusters_mod = []
80
84
 
81
- for cluster in clusters:
85
+ for cluster in clusters_in:
82
86
  confidence = CLASS_THRESHOLDS[cluster.label]
83
87
  if cluster.confidence >= confidence:
84
88
  # annotation["created_by"] = "high_conf_pred"
@@ -86,10 +90,10 @@ class LayoutModel:
86
90
  # Remap class labels where needed.
87
91
  if cluster.label in CLASS_REMAPPINGS.keys():
88
92
  cluster.label = CLASS_REMAPPINGS[cluster.label]
89
- clusters_out.append(cluster)
93
+ clusters_mod.append(cluster)
90
94
 
91
95
  # map to dictionary clusters and cells, with bottom left origin
92
- clusters = [
96
+ clusters_orig = [
93
97
  {
94
98
  "id": c.id,
95
99
  "bbox": list(
@@ -99,7 +103,7 @@ class LayoutModel:
99
103
  "cell_ids": [],
100
104
  "type": c.label,
101
105
  }
102
- for c in clusters
106
+ for c in clusters_in
103
107
  ]
104
108
 
105
109
  clusters_out = [
@@ -113,9 +117,11 @@ class LayoutModel:
113
117
  "cell_ids": [],
114
118
  "type": c.label,
115
119
  }
116
- for c in clusters_out
120
+ for c in clusters_mod
117
121
  ]
118
122
 
123
+ del clusters_mod
124
+
119
125
  raw_cells = [
120
126
  {
121
127
  "id": c.id,
@@ -149,7 +155,7 @@ class LayoutModel:
149
155
 
150
156
  # Assign orphan cells with lower confidence predictions
151
157
  clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
152
- clusters_out, clusters, raw_cells, orphan_cell_indices
158
+ clusters_out, clusters_orig, raw_cells, orphan_cell_indices
153
159
  )
154
160
 
155
161
  # Refresh the cell_ids assignment, after creating new clusters using low conf predictions
@@ -178,7 +184,7 @@ class LayoutModel:
178
184
  ) = lu.cell_id_state_map(clusters_out, cell_count)
179
185
 
180
186
  clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
181
- clusters_out, clusters, raw_cells, orphan_cell_indices
187
+ clusters_out, clusters_orig, raw_cells, orphan_cell_indices
182
188
  )
183
189
 
184
190
  _log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
@@ -237,92 +243,128 @@ class LayoutModel:
237
243
  end_time = time.time() - start_time
238
244
  _log.debug(f"Finished post processing in seconds={end_time:.3f}")
239
245
 
240
- cells_out = [
246
+ cells_out_new = [
241
247
  Cell(
242
- id=c["id"],
248
+ id=c["id"], # type: ignore
243
249
  bbox=BoundingBox.from_tuple(
244
- coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
250
+ coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
245
251
  ).to_top_left_origin(page_height),
246
- text=c["text"],
252
+ text=c["text"], # type: ignore
247
253
  )
248
254
  for c in cells_out
249
255
  ]
256
+
257
+ del cells_out
258
+
250
259
  clusters_out_new = []
251
260
  for c in clusters_out:
252
- cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
261
+ cluster_cells = [
262
+ ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore
263
+ ]
253
264
  c_new = Cluster(
254
- id=c["id"],
265
+ id=c["id"], # type: ignore
255
266
  bbox=BoundingBox.from_tuple(
256
- coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
267
+ coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
257
268
  ).to_top_left_origin(page_height),
258
- confidence=c["confidence"],
259
- label=c["type"],
269
+ confidence=c["confidence"], # type: ignore
270
+ label=DocItemLabel(c["type"]),
260
271
  cells=cluster_cells,
261
272
  )
262
273
  clusters_out_new.append(c_new)
263
274
 
264
- return clusters_out_new, cells_out
275
+ return clusters_out_new, cells_out_new
276
+
277
+ def __call__(
278
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
279
+ ) -> Iterable[Page]:
265
280
 
266
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
267
281
  for page in page_batch:
268
- clusters = []
269
- for ix, pred_item in enumerate(
270
- self.layout_predictor.predict(page.get_image(scale=1.0))
271
- ):
272
- cluster = Cluster(
273
- id=ix,
274
- label=pred_item["label"],
275
- confidence=pred_item["confidence"],
276
- bbox=BoundingBox.model_validate(pred_item),
277
- cells=[],
278
- )
279
-
280
- clusters.append(cluster)
281
-
282
- # Map cells to clusters
283
- # TODO: Remove, postprocess should take care of it anyway.
284
- for cell in page.cells:
285
- for cluster in clusters:
286
- if not cell.bbox.area() > 0:
287
- overlap_frac = 0.0
288
- else:
289
- overlap_frac = (
290
- cell.bbox.intersection_area_with(cluster.bbox)
291
- / cell.bbox.area()
282
+ assert page._backend is not None
283
+ if not page._backend.is_valid():
284
+ yield page
285
+ else:
286
+ with TimeRecorder(conv_res, "layout"):
287
+ assert page.size is not None
288
+
289
+ clusters = []
290
+ for ix, pred_item in enumerate(
291
+ self.layout_predictor.predict(page.get_image(scale=1.0))
292
+ ):
293
+ label = DocItemLabel(
294
+ pred_item["label"]
295
+ .lower()
296
+ .replace(" ", "_")
297
+ .replace("-", "_")
298
+ ) # Temporary, until docling-ibm-model uses docling-core types
299
+ cluster = Cluster(
300
+ id=ix,
301
+ label=label,
302
+ confidence=pred_item["confidence"],
303
+ bbox=BoundingBox.model_validate(pred_item),
304
+ cells=[],
292
305
  )
293
-
294
- if overlap_frac > 0.5:
295
- cluster.cells.append(cell)
296
-
297
- # Pre-sort clusters
298
- # clusters = self.sort_clusters_by_cell_order(clusters)
299
-
300
- # DEBUG code:
301
- def draw_clusters_and_cells():
302
- image = copy.deepcopy(page.image)
303
- draw = ImageDraw.Draw(image)
304
- for c in clusters:
305
- x0, y0, x1, y1 = c.bbox.as_tuple()
306
- draw.rectangle([(x0, y0), (x1, y1)], outline="green")
307
-
308
- cell_color = (
309
- random.randint(30, 140),
310
- random.randint(30, 140),
311
- random.randint(30, 140),
306
+ clusters.append(cluster)
307
+
308
+ # Map cells to clusters
309
+ # TODO: Remove, postprocess should take care of it anyway.
310
+ for cell in page.cells:
311
+ for cluster in clusters:
312
+ if not cell.bbox.area() > 0:
313
+ overlap_frac = 0.0
314
+ else:
315
+ overlap_frac = (
316
+ cell.bbox.intersection_area_with(cluster.bbox)
317
+ / cell.bbox.area()
318
+ )
319
+
320
+ if overlap_frac > 0.5:
321
+ cluster.cells.append(cell)
322
+
323
+ # Pre-sort clusters
324
+ # clusters = self.sort_clusters_by_cell_order(clusters)
325
+
326
+ # DEBUG code:
327
+ def draw_clusters_and_cells(show: bool = False):
328
+ image = copy.deepcopy(page.image)
329
+ if image is not None:
330
+ draw = ImageDraw.Draw(image)
331
+ for c in clusters:
332
+ x0, y0, x1, y1 = c.bbox.as_tuple()
333
+ draw.rectangle([(x0, y0), (x1, y1)], outline="green")
334
+
335
+ cell_color = (
336
+ random.randint(30, 140),
337
+ random.randint(30, 140),
338
+ random.randint(30, 140),
339
+ )
340
+ for tc in c.cells: # [:1]:
341
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
342
+ draw.rectangle(
343
+ [(x0, y0), (x1, y1)], outline=cell_color
344
+ )
345
+ if show:
346
+ image.show()
347
+ else:
348
+ out_path: Path = (
349
+ Path(settings.debug.debug_output_path)
350
+ / f"debug_{conv_res.input.file.stem}"
351
+ )
352
+ out_path.mkdir(parents=True, exist_ok=True)
353
+
354
+ out_file = (
355
+ out_path / f"layout_page_{page.page_no:05}.png"
356
+ )
357
+ image.save(str(out_file), format="png")
358
+
359
+ # draw_clusters_and_cells()
360
+
361
+ clusters, page.cells = self.postprocess(
362
+ clusters, page.cells, page.size.height
312
363
  )
313
- for tc in c.cells: # [:1]:
314
- x0, y0, x1, y1 = tc.bbox.as_tuple()
315
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
316
- image.show()
317
-
318
- # draw_clusters_and_cells()
319
-
320
- clusters, page.cells = self.postprocess(
321
- clusters, page.cells, page.size.height
322
- )
323
364
 
324
- # draw_clusters_and_cells()
365
+ page.predictions.layout = LayoutPrediction(clusters=clusters)
325
366
 
326
- page.predictions.layout = LayoutPrediction(clusters=clusters)
367
+ if settings.debug.visualize_layout:
368
+ draw_clusters_and_cells()
327
369
 
328
- yield page
370
+ yield page
@@ -2,22 +2,31 @@ import logging
2
2
  import re
3
3
  from typing import Iterable, List
4
4
 
5
+ from pydantic import BaseModel
6
+
5
7
  from docling.datamodel.base_models import (
6
8
  AssembledUnit,
7
9
  FigureElement,
8
10
  Page,
9
11
  PageElement,
10
- TableElement,
12
+ Table,
11
13
  TextElement,
12
14
  )
15
+ from docling.datamodel.document import ConversionResult
16
+ from docling.models.base_model import BasePageModel
13
17
  from docling.models.layout_model import LayoutModel
18
+ from docling.utils.profiling import TimeRecorder
14
19
 
15
20
  _log = logging.getLogger(__name__)
16
21
 
17
22
 
18
- class PageAssembleModel:
19
- def __init__(self, config):
20
- self.config = config
23
+ class PageAssembleOptions(BaseModel):
24
+ keep_images: bool = False
25
+
26
+
27
+ class PageAssembleModel(BasePageModel):
28
+ def __init__(self, options: PageAssembleOptions):
29
+ self.options = options
21
30
 
22
31
  def sanitize_text(self, lines):
23
32
  if len(lines) <= 1:
@@ -44,105 +53,122 @@ class PageAssembleModel:
44
53
 
45
54
  return sanitized_text.strip() # Strip any leading or trailing whitespace
46
55
 
47
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
56
+ def __call__(
57
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
58
+ ) -> Iterable[Page]:
48
59
  for page in page_batch:
49
- # assembles some JSON output page by page.
50
-
51
- elements: List[PageElement] = []
52
- headers: List[PageElement] = []
53
- body: List[PageElement] = []
54
-
55
- for cluster in page.predictions.layout.clusters:
56
- # _log.info("Cluster label seen:", cluster.label)
57
- if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
58
-
59
- textlines = [
60
- cell.text.replace("\x02", "-").strip()
61
- for cell in cluster.cells
62
- if len(cell.text.strip()) > 0
63
- ]
64
- text = self.sanitize_text(textlines)
65
- text_el = TextElement(
66
- label=cluster.label,
67
- id=cluster.id,
68
- text=text,
69
- page_no=page.page_no,
70
- cluster=cluster,
71
- )
72
- elements.append(text_el)
73
-
74
- if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
75
- headers.append(text_el)
76
- else:
77
- body.append(text_el)
78
- elif cluster.label == LayoutModel.TABLE_LABEL:
79
- tbl = None
80
- if page.predictions.tablestructure:
81
- tbl = page.predictions.tablestructure.table_map.get(
82
- cluster.id, None
83
- )
84
- if (
85
- not tbl
86
- ): # fallback: add table without structure, if it isn't present
87
- tbl = TableElement(
88
- label=cluster.label,
89
- id=cluster.id,
90
- text="",
91
- otsl_seq=[],
92
- table_cells=[],
93
- cluster=cluster,
94
- page_no=page.page_no,
95
- )
96
-
97
- elements.append(tbl)
98
- body.append(tbl)
99
- elif cluster.label == LayoutModel.FIGURE_LABEL:
100
- fig = None
101
- if page.predictions.figures_classification:
102
- fig = page.predictions.figures_classification.figure_map.get(
103
- cluster.id, None
104
- )
105
- if (
106
- not fig
107
- ): # fallback: add figure without classification, if it isn't present
108
- fig = FigureElement(
109
- label=cluster.label,
110
- id=cluster.id,
111
- text="",
112
- data=None,
113
- cluster=cluster,
114
- page_no=page.page_no,
115
- )
116
- elements.append(fig)
117
- body.append(fig)
118
- elif cluster.label == LayoutModel.FORMULA_LABEL:
119
- equation = None
120
- if page.predictions.equations_prediction:
121
- equation = (
122
- page.predictions.equations_prediction.equation_map.get(
123
- cluster.id, None
124
- )
125
- )
126
- if not equation: # fallback: add empty formula, if it isn't present
127
- text = self.sanitize_text(
128
- [
60
+ assert page._backend is not None
61
+ if not page._backend.is_valid():
62
+ yield page
63
+ else:
64
+ with TimeRecorder(conv_res, "page_assemble"):
65
+
66
+ assert page.predictions.layout is not None
67
+
68
+ # assembles some JSON output page by page.
69
+
70
+ elements: List[PageElement] = []
71
+ headers: List[PageElement] = []
72
+ body: List[PageElement] = []
73
+
74
+ for cluster in page.predictions.layout.clusters:
75
+ # _log.info("Cluster label seen:", cluster.label)
76
+ if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
77
+
78
+ textlines = [
129
79
  cell.text.replace("\x02", "-").strip()
130
80
  for cell in cluster.cells
131
81
  if len(cell.text.strip()) > 0
132
82
  ]
133
- )
134
- equation = TextElement(
135
- label=cluster.label,
136
- id=cluster.id,
137
- cluster=cluster,
138
- page_no=page.page_no,
139
- text=text,
140
- )
141
- elements.append(equation)
142
- body.append(equation)
143
-
144
- page.assembled = AssembledUnit(
145
- elements=elements, headers=headers, body=body
146
- )
147
-
148
- yield page
83
+ text = self.sanitize_text(textlines)
84
+ text_el = TextElement(
85
+ label=cluster.label,
86
+ id=cluster.id,
87
+ text=text,
88
+ page_no=page.page_no,
89
+ cluster=cluster,
90
+ )
91
+ elements.append(text_el)
92
+
93
+ if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
94
+ headers.append(text_el)
95
+ else:
96
+ body.append(text_el)
97
+ elif cluster.label == LayoutModel.TABLE_LABEL:
98
+ tbl = None
99
+ if page.predictions.tablestructure:
100
+ tbl = page.predictions.tablestructure.table_map.get(
101
+ cluster.id, None
102
+ )
103
+ if (
104
+ not tbl
105
+ ): # fallback: add table without structure, if it isn't present
106
+ tbl = Table(
107
+ label=cluster.label,
108
+ id=cluster.id,
109
+ text="",
110
+ otsl_seq=[],
111
+ table_cells=[],
112
+ cluster=cluster,
113
+ page_no=page.page_no,
114
+ )
115
+
116
+ elements.append(tbl)
117
+ body.append(tbl)
118
+ elif cluster.label == LayoutModel.FIGURE_LABEL:
119
+ fig = None
120
+ if page.predictions.figures_classification:
121
+ fig = page.predictions.figures_classification.figure_map.get(
122
+ cluster.id, None
123
+ )
124
+ if (
125
+ not fig
126
+ ): # fallback: add figure without classification, if it isn't present
127
+ fig = FigureElement(
128
+ label=cluster.label,
129
+ id=cluster.id,
130
+ text="",
131
+ data=None,
132
+ cluster=cluster,
133
+ page_no=page.page_no,
134
+ )
135
+ elements.append(fig)
136
+ body.append(fig)
137
+ elif cluster.label == LayoutModel.FORMULA_LABEL:
138
+ equation = None
139
+ if page.predictions.equations_prediction:
140
+ equation = page.predictions.equations_prediction.equation_map.get(
141
+ cluster.id, None
142
+ )
143
+ if (
144
+ not equation
145
+ ): # fallback: add empty formula, if it isn't present
146
+ text = self.sanitize_text(
147
+ [
148
+ cell.text.replace("\x02", "-").strip()
149
+ for cell in cluster.cells
150
+ if len(cell.text.strip()) > 0
151
+ ]
152
+ )
153
+ equation = TextElement(
154
+ label=cluster.label,
155
+ id=cluster.id,
156
+ cluster=cluster,
157
+ page_no=page.page_no,
158
+ text=text,
159
+ )
160
+ elements.append(equation)
161
+ body.append(equation)
162
+
163
+ page.assembled = AssembledUnit(
164
+ elements=elements, headers=headers, body=body
165
+ )
166
+
167
+ # Remove page images (can be disabled)
168
+ if not self.options.keep_images:
169
+ page._image_cache = {}
170
+
171
+ # Unload backend
172
+ page._backend.unload()
173
+
174
+ yield page