docling 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,20 @@
1
1
  import logging
2
- import os
3
2
  import re
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
5
  from typing import Set, Union
7
6
 
8
7
  from docling_core.types.doc import (
9
- DocItem,
10
8
  DocItemLabel,
11
9
  DoclingDocument,
12
10
  DocumentOrigin,
13
11
  GroupItem,
14
12
  GroupLabel,
15
13
  ImageRef,
16
- NodeItem,
17
14
  Size,
18
15
  TableCell,
19
16
  TableData,
20
17
  )
21
- from pydantic import AnyUrl
22
18
 
23
19
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
24
20
  from docling.datamodel.base_models import InputFormat
@@ -179,31 +179,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
179
179
  self.parents[self.level] = doc.add_text(
180
180
  parent=self.parents[0], label=DocItemLabel.TITLE, text=text
181
181
  )
182
-
183
- elif hlevel > self.level:
184
-
185
- # add invisible group
186
- for i in range(self.level + 1, hlevel):
187
- self.parents[i] = doc.add_group(
188
- name=f"header-{i}",
189
- label=GroupLabel.SECTION,
190
- parent=self.parents[i - 1],
191
- )
192
- self.level = hlevel
193
-
194
- elif hlevel < self.level:
195
-
196
- # remove the tail
197
- for key, val in self.parents.items():
198
- if key > hlevel:
199
- self.parents[key] = None
200
- self.level = hlevel
201
-
202
- self.parents[hlevel] = doc.add_heading(
203
- parent=self.parents[hlevel - 1],
204
- text=text,
205
- level=hlevel,
206
- )
182
+ else:
183
+ if hlevel > self.level:
184
+
185
+ # add invisible group
186
+ for i in range(self.level + 1, hlevel):
187
+ self.parents[i] = doc.add_group(
188
+ name=f"header-{i}",
189
+ label=GroupLabel.SECTION,
190
+ parent=self.parents[i - 1],
191
+ )
192
+ self.level = hlevel
193
+
194
+ elif hlevel < self.level:
195
+
196
+ # remove the tail
197
+ for key, val in self.parents.items():
198
+ if key > hlevel:
199
+ self.parents[key] = None
200
+ self.level = hlevel
201
+
202
+ self.parents[hlevel] = doc.add_heading(
203
+ parent=self.parents[hlevel - 1],
204
+ text=text,
205
+ level=hlevel,
206
+ )
207
207
 
208
208
  def handle_paragraph(self, element, idx, doc):
209
209
  """Handles paragraph tags (p)."""
@@ -1,6 +1,6 @@
1
1
  from enum import Enum, auto
2
2
  from io import BytesIO
3
- from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
4
 
5
5
  from docling_core.types.doc import (
6
6
  BoundingBox,
@@ -3,7 +3,7 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
7
7
 
8
8
  import filetype
9
9
  from docling_core.types.doc import (
@@ -52,6 +52,7 @@ from docling.datamodel.base_models import (
52
52
  Page,
53
53
  )
54
54
  from docling.datamodel.settings import DocumentLimits
55
+ from docling.utils.profiling import ProfilingItem
55
56
  from docling.utils.utils import create_file_hash, create_hash
56
57
 
57
58
  if TYPE_CHECKING:
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):
187
188
 
188
189
  pages: List[Page] = []
189
190
  assembled: AssembledUnit = AssembledUnit()
191
+ timings: Dict[str, ProfilingItem] = {}
190
192
 
191
193
  document: DoclingDocument = _EMPTY_DOCLING_DOC
192
194
 
@@ -1,4 +1,5 @@
1
1
  import sys
2
+ from pathlib import Path
2
3
 
3
4
  from pydantic import BaseModel
4
5
  from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
26
27
  # To force models into single core: export OMP_NUM_THREADS=1
27
28
 
28
29
 
30
+ class DebugSettings(BaseModel):
31
+ visualize_cells: bool = False
32
+ visualize_ocr: bool = False
33
+ visualize_layout: bool = False
34
+ visualize_tables: bool = False
35
+
36
+ profile_pipeline_timings: bool = False
37
+
38
+ # Path used to output debug information.
39
+ debug_output_path: str = str(Path.cwd() / "debug")
40
+
41
+
29
42
  class AppSettings(BaseSettings):
30
43
  perf: BatchConcurrencySettings
44
+ debug: DebugSettings
31
45
 
32
46
 
33
- settings = AppSettings(perf=BatchConcurrencySettings())
47
+ settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
@@ -189,24 +189,35 @@ class DocumentConverter:
189
189
  ) -> Iterator[ConversionResult]:
190
190
  assert self.format_to_options is not None
191
191
 
192
+ start_time = time.monotonic()
193
+
192
194
  for input_batch in chunkify(
193
195
  conv_input.docs(self.format_to_options),
194
196
  settings.perf.doc_batch_size, # pass format_options
195
197
  ):
196
198
  _log.info(f"Going to convert document batch...")
199
+
197
200
  # parallel processing only within input_batch
198
201
  # with ThreadPoolExecutor(
199
202
  # max_workers=settings.perf.doc_batch_concurrency
200
203
  # ) as pool:
201
204
  # yield from pool.map(self.process_document, input_batch)
202
-
203
205
  # Note: PDF backends are not thread-safe, thread pool usage was disabled.
206
+
204
207
  for item in map(
205
208
  partial(self._process_document, raises_on_error=raises_on_error),
206
209
  input_batch,
207
210
  ):
211
+ elapsed = time.monotonic() - start_time
212
+ start_time = time.monotonic()
213
+
208
214
  if item is not None:
215
+ _log.info(
216
+ f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
217
+ )
209
218
  yield item
219
+ else:
220
+ _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
210
221
 
211
222
  def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
212
223
  assert self.format_to_options is not None
@@ -237,15 +248,8 @@ class DocumentConverter:
237
248
  assert self.allowed_formats is not None
238
249
  assert in_doc.format in self.allowed_formats
239
250
 
240
- start_doc_time = time.time()
241
-
242
251
  conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
243
252
 
244
- end_doc_time = time.time() - start_doc_time
245
- _log.info(
246
- f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
247
- )
248
-
249
253
  return conv_res
250
254
 
251
255
  def _execute_pipeline(
@@ -4,11 +4,14 @@ from typing import Any, Iterable
4
4
  from docling_core.types.doc import DoclingDocument, NodeItem
5
5
 
6
6
  from docling.datamodel.base_models import Page
7
+ from docling.datamodel.document import ConversionResult
7
8
 
8
9
 
9
10
  class BasePageModel(ABC):
10
11
  @abstractmethod
11
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
12
+ def __call__(
13
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
14
+ ) -> Iterable[Page]:
12
15
  pass
13
16
 
14
17
 
@@ -1,6 +1,7 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
+ from pathlib import Path
4
5
  from typing import Iterable, List
5
6
 
6
7
  import numpy as np
@@ -10,12 +11,15 @@ from rtree import index
10
11
  from scipy.ndimage import find_objects, label
11
12
 
12
13
  from docling.datamodel.base_models import OcrCell, Page
14
+ from docling.datamodel.document import ConversionResult
13
15
  from docling.datamodel.pipeline_options import OcrOptions
16
+ from docling.datamodel.settings import settings
17
+ from docling.models.base_model import BasePageModel
14
18
 
15
19
  _log = logging.getLogger(__name__)
16
20
 
17
21
 
18
- class BaseOcrModel:
22
+ class BaseOcrModel(BasePageModel):
19
23
  def __init__(self, enabled: bool, options: OcrOptions):
20
24
  self.enabled = enabled
21
25
  self.options = options
@@ -113,7 +117,7 @@ class BaseOcrModel:
113
117
  ]
114
118
  return filtered_ocr_cells
115
119
 
116
- def draw_ocr_rects_and_cells(self, page, ocr_rects):
120
+ def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
117
121
  image = copy.deepcopy(page.image)
118
122
  draw = ImageDraw.Draw(image, "RGBA")
119
123
 
@@ -130,8 +134,21 @@ class BaseOcrModel:
130
134
  if isinstance(tc, OcrCell):
131
135
  color = "magenta"
132
136
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
133
- image.show()
137
+
138
+ if show:
139
+ image.show()
140
+ else:
141
+ out_path: Path = (
142
+ Path(settings.debug.debug_output_path)
143
+ / f"debug_{conv_res.input.file.stem}"
144
+ )
145
+ out_path.mkdir(parents=True, exist_ok=True)
146
+
147
+ out_file = out_path / f"ocr_page_{page.page_no:05}.png"
148
+ image.save(str(out_file), format="png")
134
149
 
135
150
  @abstractmethod
136
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
151
+ def __call__(
152
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
153
+ ) -> Iterable[Page]:
137
154
  pass
@@ -1,5 +1,6 @@
1
1
  import copy
2
2
  import random
3
+ from pathlib import Path
3
4
  from typing import List, Union
4
5
 
5
6
  from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
27
28
 
28
29
  from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
29
30
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
+ from docling.datamodel.settings import settings
32
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
30
33
  from docling.utils.utils import create_hash
31
34
 
32
35
 
@@ -226,23 +229,24 @@ class GlmModel:
226
229
  return ds_doc
227
230
 
228
231
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
229
- ds_doc = self._to_legacy_document(conv_res)
230
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
232
+ with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
233
+ ds_doc = self._to_legacy_document(conv_res)
234
+ ds_doc_dict = ds_doc.model_dump(by_alias=True)
231
235
 
232
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
236
+ glm_doc = self.model.apply_on_doc(ds_doc_dict)
233
237
 
234
- docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
238
+ docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
235
239
 
236
240
  # DEBUG code:
237
- def draw_clusters_and_cells(ds_document, page_no):
241
+ def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
238
242
  clusters_to_draw = []
239
243
  image = copy.deepcopy(conv_res.pages[page_no].image)
240
244
  for ix, elem in enumerate(ds_document.main_text):
241
245
  if isinstance(elem, BaseText):
242
- prov = elem.prov[0]
246
+ prov = elem.prov[0] # type: ignore
243
247
  elif isinstance(elem, Ref):
244
248
  _, arr, index = elem.ref.split("/")
245
- index = int(index)
249
+ index = int(index) # type: ignore
246
250
  if arr == "tables":
247
251
  prov = ds_document.tables[index].prov[0]
248
252
  elif arr == "figures":
@@ -256,7 +260,7 @@ class GlmModel:
256
260
  id=ix,
257
261
  label=elem.name,
258
262
  bbox=BoundingBox.from_tuple(
259
- coord=prov.bbox,
263
+ coord=prov.bbox, # type: ignore
260
264
  origin=CoordOrigin.BOTTOMLEFT,
261
265
  ).to_top_left_origin(conv_res.pages[page_no].size.height),
262
266
  )
@@ -276,9 +280,21 @@ class GlmModel:
276
280
  for tc in c.cells: # [:1]:
277
281
  x0, y0, x1, y1 = tc.bbox.as_tuple()
278
282
  draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
279
- image.show()
280
283
 
281
- # draw_clusters_and_cells(ds_doc, 0)
282
- # draw_clusters_and_cells(exported_doc, 0)
284
+ if show:
285
+ image.show()
286
+ else:
287
+ out_path: Path = (
288
+ Path(settings.debug.debug_output_path)
289
+ / f"debug_{conv_res.input.file.stem}"
290
+ )
291
+ out_path.mkdir(parents=True, exist_ok=True)
292
+
293
+ out_file = out_path / f"doc_page_{page_no:05}.png"
294
+ image.save(str(out_file), format="png")
295
+
296
+ # for item in ds_doc.page_dimensions:
297
+ # page_no = item.page
298
+ # draw_clusters_and_cells(ds_doc, page_no)
283
299
 
284
300
  return docling_doc
@@ -5,8 +5,11 @@ import numpy
5
5
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
6
 
7
7
  from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import EasyOcrOptions
10
+ from docling.datamodel.settings import settings
9
11
  from docling.models.base_ocr_model import BaseOcrModel
12
+ from docling.utils.profiling import TimeRecorder
10
13
 
11
14
  _log = logging.getLogger(__name__)
12
15
 
@@ -33,58 +36,65 @@ class EasyOcrModel(BaseOcrModel):
33
36
  download_enabled=self.options.download_enabled,
34
37
  )
35
38
 
36
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
39
+ def __call__(
40
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
41
+ ) -> Iterable[Page]:
37
42
 
38
43
  if not self.enabled:
39
44
  yield from page_batch
40
45
  return
41
46
 
42
47
  for page in page_batch:
48
+
43
49
  assert page._backend is not None
44
50
  if not page._backend.is_valid():
45
51
  yield page
46
52
  else:
47
- ocr_rects = self.get_ocr_rects(page)
48
-
49
- all_ocr_cells = []
50
- for ocr_rect in ocr_rects:
51
- # Skip zero area boxes
52
- if ocr_rect.area() == 0:
53
- continue
54
- high_res_image = page._backend.get_page_image(
55
- scale=self.scale, cropbox=ocr_rect
56
- )
57
- im = numpy.array(high_res_image)
58
- result = self.reader.readtext(im)
59
-
60
- del high_res_image
61
- del im
62
-
63
- cells = [
64
- OcrCell(
65
- id=ix,
66
- text=line[1],
67
- confidence=line[2],
68
- bbox=BoundingBox.from_tuple(
69
- coord=(
70
- (line[0][0][0] / self.scale) + ocr_rect.l,
71
- (line[0][0][1] / self.scale) + ocr_rect.t,
72
- (line[0][2][0] / self.scale) + ocr_rect.l,
73
- (line[0][2][1] / self.scale) + ocr_rect.t,
74
- ),
75
- origin=CoordOrigin.TOPLEFT,
76
- ),
53
+ with TimeRecorder(conv_res, "ocr"):
54
+ ocr_rects = self.get_ocr_rects(page)
55
+
56
+ all_ocr_cells = []
57
+ for ocr_rect in ocr_rects:
58
+ # Skip zero area boxes
59
+ if ocr_rect.area() == 0:
60
+ continue
61
+ high_res_image = page._backend.get_page_image(
62
+ scale=self.scale, cropbox=ocr_rect
77
63
  )
78
- for ix, line in enumerate(result)
79
- ]
80
- all_ocr_cells.extend(cells)
81
-
82
- ## Remove OCR cells which overlap with programmatic cells.
83
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
64
+ im = numpy.array(high_res_image)
65
+ result = self.reader.readtext(im)
66
+
67
+ del high_res_image
68
+ del im
69
+
70
+ cells = [
71
+ OcrCell(
72
+ id=ix,
73
+ text=line[1],
74
+ confidence=line[2],
75
+ bbox=BoundingBox.from_tuple(
76
+ coord=(
77
+ (line[0][0][0] / self.scale) + ocr_rect.l,
78
+ (line[0][0][1] / self.scale) + ocr_rect.t,
79
+ (line[0][2][0] / self.scale) + ocr_rect.l,
80
+ (line[0][2][1] / self.scale) + ocr_rect.t,
81
+ ),
82
+ origin=CoordOrigin.TOPLEFT,
83
+ ),
84
+ )
85
+ for ix, line in enumerate(result)
86
+ ]
87
+ all_ocr_cells.extend(cells)
88
+
89
+ ## Remove OCR cells which overlap with programmatic cells.
90
+ filtered_ocr_cells = self.filter_ocr_cells(
91
+ all_ocr_cells, page.cells
92
+ )
84
93
 
85
- page.cells.extend(filtered_ocr_cells)
94
+ page.cells.extend(filtered_ocr_cells)
86
95
 
87
96
  # DEBUG code:
88
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
97
+ if settings.debug.visualize_ocr:
98
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
89
99
 
90
100
  yield page
@@ -16,8 +16,11 @@ from docling.datamodel.base_models import (
16
16
  LayoutPrediction,
17
17
  Page,
18
18
  )
19
+ from docling.datamodel.document import ConversionResult
20
+ from docling.datamodel.settings import settings
19
21
  from docling.models.base_model import BasePageModel
20
22
  from docling.utils import layout_utils as lu
23
+ from docling.utils.profiling import TimeRecorder
21
24
 
22
25
  _log = logging.getLogger(__name__)
23
26
 
@@ -271,74 +274,97 @@ class LayoutModel(BasePageModel):
271
274
 
272
275
  return clusters_out_new, cells_out_new
273
276
 
274
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
277
+ def __call__(
278
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
279
+ ) -> Iterable[Page]:
280
+
275
281
  for page in page_batch:
276
282
  assert page._backend is not None
277
283
  if not page._backend.is_valid():
278
284
  yield page
279
285
  else:
280
- assert page.size is not None
281
-
282
- clusters = []
283
- for ix, pred_item in enumerate(
284
- self.layout_predictor.predict(page.get_image(scale=1.0))
285
- ):
286
- label = DocItemLabel(
287
- pred_item["label"].lower().replace(" ", "_").replace("-", "_")
288
- ) # Temporary, until docling-ibm-model uses docling-core types
289
- cluster = Cluster(
290
- id=ix,
291
- label=label,
292
- confidence=pred_item["confidence"],
293
- bbox=BoundingBox.model_validate(pred_item),
294
- cells=[],
295
- )
296
- clusters.append(cluster)
297
-
298
- # Map cells to clusters
299
- # TODO: Remove, postprocess should take care of it anyway.
300
- for cell in page.cells:
301
- for cluster in clusters:
302
- if not cell.bbox.area() > 0:
303
- overlap_frac = 0.0
304
- else:
305
- overlap_frac = (
306
- cell.bbox.intersection_area_with(cluster.bbox)
307
- / cell.bbox.area()
308
- )
309
-
310
- if overlap_frac > 0.5:
311
- cluster.cells.append(cell)
312
-
313
- # Pre-sort clusters
314
- # clusters = self.sort_clusters_by_cell_order(clusters)
315
-
316
- # DEBUG code:
317
- def draw_clusters_and_cells():
318
- image = copy.deepcopy(page.image)
319
- draw = ImageDraw.Draw(image)
320
- for c in clusters:
321
- x0, y0, x1, y1 = c.bbox.as_tuple()
322
- draw.rectangle([(x0, y0), (x1, y1)], outline="green")
323
-
324
- cell_color = (
325
- random.randint(30, 140),
326
- random.randint(30, 140),
327
- random.randint(30, 140),
286
+ with TimeRecorder(conv_res, "layout"):
287
+ assert page.size is not None
288
+
289
+ clusters = []
290
+ for ix, pred_item in enumerate(
291
+ self.layout_predictor.predict(page.get_image(scale=1.0))
292
+ ):
293
+ label = DocItemLabel(
294
+ pred_item["label"]
295
+ .lower()
296
+ .replace(" ", "_")
297
+ .replace("-", "_")
298
+ ) # Temporary, until docling-ibm-model uses docling-core types
299
+ cluster = Cluster(
300
+ id=ix,
301
+ label=label,
302
+ confidence=pred_item["confidence"],
303
+ bbox=BoundingBox.model_validate(pred_item),
304
+ cells=[],
328
305
  )
329
- for tc in c.cells: # [:1]:
330
- x0, y0, x1, y1 = tc.bbox.as_tuple()
331
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
332
- image.show()
333
-
334
- # draw_clusters_and_cells()
335
-
336
- clusters, page.cells = self.postprocess(
337
- clusters, page.cells, page.size.height
338
- )
306
+ clusters.append(cluster)
307
+
308
+ # Map cells to clusters
309
+ # TODO: Remove, postprocess should take care of it anyway.
310
+ for cell in page.cells:
311
+ for cluster in clusters:
312
+ if not cell.bbox.area() > 0:
313
+ overlap_frac = 0.0
314
+ else:
315
+ overlap_frac = (
316
+ cell.bbox.intersection_area_with(cluster.bbox)
317
+ / cell.bbox.area()
318
+ )
319
+
320
+ if overlap_frac > 0.5:
321
+ cluster.cells.append(cell)
322
+
323
+ # Pre-sort clusters
324
+ # clusters = self.sort_clusters_by_cell_order(clusters)
325
+
326
+ # DEBUG code:
327
+ def draw_clusters_and_cells(show: bool = False):
328
+ image = copy.deepcopy(page.image)
329
+ if image is not None:
330
+ draw = ImageDraw.Draw(image)
331
+ for c in clusters:
332
+ x0, y0, x1, y1 = c.bbox.as_tuple()
333
+ draw.rectangle([(x0, y0), (x1, y1)], outline="green")
334
+
335
+ cell_color = (
336
+ random.randint(30, 140),
337
+ random.randint(30, 140),
338
+ random.randint(30, 140),
339
+ )
340
+ for tc in c.cells: # [:1]:
341
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
342
+ draw.rectangle(
343
+ [(x0, y0), (x1, y1)], outline=cell_color
344
+ )
345
+ if show:
346
+ image.show()
347
+ else:
348
+ out_path: Path = (
349
+ Path(settings.debug.debug_output_path)
350
+ / f"debug_{conv_res.input.file.stem}"
351
+ )
352
+ out_path.mkdir(parents=True, exist_ok=True)
353
+
354
+ out_file = (
355
+ out_path / f"layout_page_{page.page_no:05}.png"
356
+ )
357
+ image.save(str(out_file), format="png")
358
+
359
+ # draw_clusters_and_cells()
360
+
361
+ clusters, page.cells = self.postprocess(
362
+ clusters, page.cells, page.size.height
363
+ )
339
364
 
340
- # draw_clusters_and_cells()
365
+ page.predictions.layout = LayoutPrediction(clusters=clusters)
341
366
 
342
- page.predictions.layout = LayoutPrediction(clusters=clusters)
367
+ if settings.debug.visualize_layout:
368
+ draw_clusters_and_cells()
343
369
 
344
370
  yield page