docling 2.2.1__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,20 @@
1
1
  import logging
2
- import os
3
2
  import re
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
5
  from typing import Set, Union
7
6
 
8
7
  from docling_core.types.doc import (
9
- DocItem,
10
8
  DocItemLabel,
11
9
  DoclingDocument,
12
10
  DocumentOrigin,
13
11
  GroupItem,
14
12
  GroupLabel,
15
13
  ImageRef,
16
- NodeItem,
17
14
  Size,
18
15
  TableCell,
19
16
  TableData,
20
17
  )
21
- from pydantic import AnyUrl
22
18
 
23
19
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
24
20
  from docling.datamodel.base_models import InputFormat
@@ -179,31 +179,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
179
179
  self.parents[self.level] = doc.add_text(
180
180
  parent=self.parents[0], label=DocItemLabel.TITLE, text=text
181
181
  )
182
-
183
- elif hlevel > self.level:
184
-
185
- # add invisible group
186
- for i in range(self.level + 1, hlevel):
187
- self.parents[i] = doc.add_group(
188
- name=f"header-{i}",
189
- label=GroupLabel.SECTION,
190
- parent=self.parents[i - 1],
191
- )
192
- self.level = hlevel
193
-
194
- elif hlevel < self.level:
195
-
196
- # remove the tail
197
- for key, val in self.parents.items():
198
- if key > hlevel:
199
- self.parents[key] = None
200
- self.level = hlevel
201
-
202
- self.parents[hlevel] = doc.add_heading(
203
- parent=self.parents[hlevel - 1],
204
- text=text,
205
- level=hlevel,
206
- )
182
+ else:
183
+ if hlevel > self.level:
184
+
185
+ # add invisible group
186
+ for i in range(self.level + 1, hlevel):
187
+ self.parents[i] = doc.add_group(
188
+ name=f"header-{i}",
189
+ label=GroupLabel.SECTION,
190
+ parent=self.parents[i - 1],
191
+ )
192
+ self.level = hlevel
193
+
194
+ elif hlevel < self.level:
195
+
196
+ # remove the tail
197
+ for key, val in self.parents.items():
198
+ if key > hlevel:
199
+ self.parents[key] = None
200
+ self.level = hlevel
201
+
202
+ self.parents[hlevel] = doc.add_heading(
203
+ parent=self.parents[hlevel - 1],
204
+ text=text,
205
+ level=hlevel,
206
+ )
207
207
 
208
208
  def handle_paragraph(self, element, idx, doc):
209
209
  """Handles paragraph tags (p)."""
@@ -1,6 +1,6 @@
1
1
  from enum import Enum, auto
2
2
  from io import BytesIO
3
- from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
4
 
5
5
  from docling_core.types.doc import (
6
6
  BoundingBox,
@@ -3,7 +3,7 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
7
7
 
8
8
  import filetype
9
9
  from docling_core.types.doc import (
@@ -52,6 +52,7 @@ from docling.datamodel.base_models import (
52
52
  Page,
53
53
  )
54
54
  from docling.datamodel.settings import DocumentLimits
55
+ from docling.utils.profiling import ProfilingItem
55
56
  from docling.utils.utils import create_file_hash, create_hash
56
57
 
57
58
  if TYPE_CHECKING:
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):
187
188
 
188
189
  pages: List[Page] = []
189
190
  assembled: AssembledUnit = AssembledUnit()
191
+ timings: Dict[str, ProfilingItem] = {}
190
192
 
191
193
  document: DoclingDocument = _EMPTY_DOCLING_DOC
192
194
 
@@ -1,4 +1,5 @@
1
1
  import sys
2
+ from pathlib import Path
2
3
 
3
4
  from pydantic import BaseModel
4
5
  from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
26
27
  # To force models into single core: export OMP_NUM_THREADS=1
27
28
 
28
29
 
30
+ class DebugSettings(BaseModel):
31
+ visualize_cells: bool = False
32
+ visualize_ocr: bool = False
33
+ visualize_layout: bool = False
34
+ visualize_tables: bool = False
35
+
36
+ profile_pipeline_timings: bool = False
37
+
38
+ # Path used to output debug information.
39
+ debug_output_path: str = str(Path.cwd() / "debug")
40
+
41
+
29
42
  class AppSettings(BaseSettings):
30
43
  perf: BatchConcurrencySettings
44
+ debug: DebugSettings
31
45
 
32
46
 
33
- settings = AppSettings(perf=BatchConcurrencySettings())
47
+ settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
@@ -139,6 +139,10 @@ class DocumentConverter:
139
139
 
140
140
  self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
141
141
 
142
+ def initialize_pipeline(self, format: InputFormat):
143
+ """Initialize the conversion pipeline for the selected format."""
144
+ self._get_pipeline(doc_format=format)
145
+
142
146
  @validate_call(config=ConfigDict(strict=True))
143
147
  def convert(
144
148
  self,
@@ -189,32 +193,43 @@ class DocumentConverter:
189
193
  ) -> Iterator[ConversionResult]:
190
194
  assert self.format_to_options is not None
191
195
 
196
+ start_time = time.monotonic()
197
+
192
198
  for input_batch in chunkify(
193
199
  conv_input.docs(self.format_to_options),
194
200
  settings.perf.doc_batch_size, # pass format_options
195
201
  ):
196
202
  _log.info(f"Going to convert document batch...")
203
+
197
204
  # parallel processing only within input_batch
198
205
  # with ThreadPoolExecutor(
199
206
  # max_workers=settings.perf.doc_batch_concurrency
200
207
  # ) as pool:
201
208
  # yield from pool.map(self.process_document, input_batch)
202
-
203
209
  # Note: PDF backends are not thread-safe, thread pool usage was disabled.
210
+
204
211
  for item in map(
205
212
  partial(self._process_document, raises_on_error=raises_on_error),
206
213
  input_batch,
207
214
  ):
215
+ elapsed = time.monotonic() - start_time
216
+ start_time = time.monotonic()
217
+
208
218
  if item is not None:
219
+ _log.info(
220
+ f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
221
+ )
209
222
  yield item
223
+ else:
224
+ _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
210
225
 
211
- def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
226
+ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
212
227
  assert self.format_to_options is not None
213
228
 
214
- fopt = self.format_to_options.get(doc.format)
229
+ fopt = self.format_to_options.get(doc_format)
215
230
 
216
231
  if fopt is None:
217
- raise RuntimeError(f"Could not get pipeline for document {doc.file}")
232
+ raise RuntimeError(f"Could not get pipeline for {doc_format}")
218
233
  else:
219
234
  pipeline_class = fopt.pipeline_cls
220
235
  pipeline_options = fopt.pipeline_options
@@ -237,22 +252,15 @@ class DocumentConverter:
237
252
  assert self.allowed_formats is not None
238
253
  assert in_doc.format in self.allowed_formats
239
254
 
240
- start_doc_time = time.time()
241
-
242
255
  conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
243
256
 
244
- end_doc_time = time.time() - start_doc_time
245
- _log.info(
246
- f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
247
- )
248
-
249
257
  return conv_res
250
258
 
251
259
  def _execute_pipeline(
252
260
  self, in_doc: InputDocument, raises_on_error: bool
253
261
  ) -> ConversionResult:
254
262
  if in_doc.valid:
255
- pipeline = self._get_pipeline(in_doc)
263
+ pipeline = self._get_pipeline(in_doc.format)
256
264
  if pipeline is None: # Can't find a default pipeline. Should this raise?
257
265
  if raises_on_error:
258
266
  raise RuntimeError(
@@ -4,11 +4,14 @@ from typing import Any, Iterable
4
4
  from docling_core.types.doc import DoclingDocument, NodeItem
5
5
 
6
6
  from docling.datamodel.base_models import Page
7
+ from docling.datamodel.document import ConversionResult
7
8
 
8
9
 
9
10
  class BasePageModel(ABC):
10
11
  @abstractmethod
11
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
12
+ def __call__(
13
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
14
+ ) -> Iterable[Page]:
12
15
  pass
13
16
 
14
17
 
@@ -1,6 +1,7 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
+ from pathlib import Path
4
5
  from typing import Iterable, List
5
6
 
6
7
  import numpy as np
@@ -10,12 +11,15 @@ from rtree import index
10
11
  from scipy.ndimage import find_objects, label
11
12
 
12
13
  from docling.datamodel.base_models import OcrCell, Page
14
+ from docling.datamodel.document import ConversionResult
13
15
  from docling.datamodel.pipeline_options import OcrOptions
16
+ from docling.datamodel.settings import settings
17
+ from docling.models.base_model import BasePageModel
14
18
 
15
19
  _log = logging.getLogger(__name__)
16
20
 
17
21
 
18
- class BaseOcrModel:
22
+ class BaseOcrModel(BasePageModel):
19
23
  def __init__(self, enabled: bool, options: OcrOptions):
20
24
  self.enabled = enabled
21
25
  self.options = options
@@ -113,7 +117,7 @@ class BaseOcrModel:
113
117
  ]
114
118
  return filtered_ocr_cells
115
119
 
116
- def draw_ocr_rects_and_cells(self, page, ocr_rects):
120
+ def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
117
121
  image = copy.deepcopy(page.image)
118
122
  draw = ImageDraw.Draw(image, "RGBA")
119
123
 
@@ -130,8 +134,21 @@ class BaseOcrModel:
130
134
  if isinstance(tc, OcrCell):
131
135
  color = "magenta"
132
136
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
133
- image.show()
137
+
138
+ if show:
139
+ image.show()
140
+ else:
141
+ out_path: Path = (
142
+ Path(settings.debug.debug_output_path)
143
+ / f"debug_{conv_res.input.file.stem}"
144
+ )
145
+ out_path.mkdir(parents=True, exist_ok=True)
146
+
147
+ out_file = out_path / f"ocr_page_{page.page_no:05}.png"
148
+ image.save(str(out_file), format="png")
134
149
 
135
150
  @abstractmethod
136
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
151
+ def __call__(
152
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
153
+ ) -> Iterable[Page]:
137
154
  pass
@@ -1,5 +1,6 @@
1
1
  import copy
2
2
  import random
3
+ from pathlib import Path
3
4
  from typing import List, Union
4
5
 
5
6
  from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
27
28
 
28
29
  from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
29
30
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
+ from docling.datamodel.settings import settings
32
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
30
33
  from docling.utils.utils import create_hash
31
34
 
32
35
 
@@ -226,23 +229,24 @@ class GlmModel:
226
229
  return ds_doc
227
230
 
228
231
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
229
- ds_doc = self._to_legacy_document(conv_res)
230
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
232
+ with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
233
+ ds_doc = self._to_legacy_document(conv_res)
234
+ ds_doc_dict = ds_doc.model_dump(by_alias=True)
231
235
 
232
- glm_doc = self.model.apply_on_doc(ds_doc_dict)
236
+ glm_doc = self.model.apply_on_doc(ds_doc_dict)
233
237
 
234
- docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
238
+ docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
235
239
 
236
240
  # DEBUG code:
237
- def draw_clusters_and_cells(ds_document, page_no):
241
+ def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
238
242
  clusters_to_draw = []
239
243
  image = copy.deepcopy(conv_res.pages[page_no].image)
240
244
  for ix, elem in enumerate(ds_document.main_text):
241
245
  if isinstance(elem, BaseText):
242
- prov = elem.prov[0]
246
+ prov = elem.prov[0] # type: ignore
243
247
  elif isinstance(elem, Ref):
244
248
  _, arr, index = elem.ref.split("/")
245
- index = int(index)
249
+ index = int(index) # type: ignore
246
250
  if arr == "tables":
247
251
  prov = ds_document.tables[index].prov[0]
248
252
  elif arr == "figures":
@@ -256,7 +260,7 @@ class GlmModel:
256
260
  id=ix,
257
261
  label=elem.name,
258
262
  bbox=BoundingBox.from_tuple(
259
- coord=prov.bbox,
263
+ coord=prov.bbox, # type: ignore
260
264
  origin=CoordOrigin.BOTTOMLEFT,
261
265
  ).to_top_left_origin(conv_res.pages[page_no].size.height),
262
266
  )
@@ -276,9 +280,21 @@ class GlmModel:
276
280
  for tc in c.cells: # [:1]:
277
281
  x0, y0, x1, y1 = tc.bbox.as_tuple()
278
282
  draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
279
- image.show()
280
283
 
281
- # draw_clusters_and_cells(ds_doc, 0)
282
- # draw_clusters_and_cells(exported_doc, 0)
284
+ if show:
285
+ image.show()
286
+ else:
287
+ out_path: Path = (
288
+ Path(settings.debug.debug_output_path)
289
+ / f"debug_{conv_res.input.file.stem}"
290
+ )
291
+ out_path.mkdir(parents=True, exist_ok=True)
292
+
293
+ out_file = out_path / f"doc_page_{page_no:05}.png"
294
+ image.save(str(out_file), format="png")
295
+
296
+ # for item in ds_doc.page_dimensions:
297
+ # page_no = item.page
298
+ # draw_clusters_and_cells(ds_doc, page_no)
283
299
 
284
300
  return docling_doc
@@ -5,8 +5,11 @@ import numpy
5
5
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
6
 
7
7
  from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.document import ConversionResult
8
9
  from docling.datamodel.pipeline_options import EasyOcrOptions
10
+ from docling.datamodel.settings import settings
9
11
  from docling.models.base_ocr_model import BaseOcrModel
12
+ from docling.utils.profiling import TimeRecorder
10
13
 
11
14
  _log = logging.getLogger(__name__)
12
15
 
@@ -33,58 +36,65 @@ class EasyOcrModel(BaseOcrModel):
33
36
  download_enabled=self.options.download_enabled,
34
37
  )
35
38
 
36
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
39
+ def __call__(
40
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
41
+ ) -> Iterable[Page]:
37
42
 
38
43
  if not self.enabled:
39
44
  yield from page_batch
40
45
  return
41
46
 
42
47
  for page in page_batch:
48
+
43
49
  assert page._backend is not None
44
50
  if not page._backend.is_valid():
45
51
  yield page
46
52
  else:
47
- ocr_rects = self.get_ocr_rects(page)
48
-
49
- all_ocr_cells = []
50
- for ocr_rect in ocr_rects:
51
- # Skip zero area boxes
52
- if ocr_rect.area() == 0:
53
- continue
54
- high_res_image = page._backend.get_page_image(
55
- scale=self.scale, cropbox=ocr_rect
56
- )
57
- im = numpy.array(high_res_image)
58
- result = self.reader.readtext(im)
59
-
60
- del high_res_image
61
- del im
62
-
63
- cells = [
64
- OcrCell(
65
- id=ix,
66
- text=line[1],
67
- confidence=line[2],
68
- bbox=BoundingBox.from_tuple(
69
- coord=(
70
- (line[0][0][0] / self.scale) + ocr_rect.l,
71
- (line[0][0][1] / self.scale) + ocr_rect.t,
72
- (line[0][2][0] / self.scale) + ocr_rect.l,
73
- (line[0][2][1] / self.scale) + ocr_rect.t,
74
- ),
75
- origin=CoordOrigin.TOPLEFT,
76
- ),
53
+ with TimeRecorder(conv_res, "ocr"):
54
+ ocr_rects = self.get_ocr_rects(page)
55
+
56
+ all_ocr_cells = []
57
+ for ocr_rect in ocr_rects:
58
+ # Skip zero area boxes
59
+ if ocr_rect.area() == 0:
60
+ continue
61
+ high_res_image = page._backend.get_page_image(
62
+ scale=self.scale, cropbox=ocr_rect
77
63
  )
78
- for ix, line in enumerate(result)
79
- ]
80
- all_ocr_cells.extend(cells)
81
-
82
- ## Remove OCR cells which overlap with programmatic cells.
83
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
64
+ im = numpy.array(high_res_image)
65
+ result = self.reader.readtext(im)
66
+
67
+ del high_res_image
68
+ del im
69
+
70
+ cells = [
71
+ OcrCell(
72
+ id=ix,
73
+ text=line[1],
74
+ confidence=line[2],
75
+ bbox=BoundingBox.from_tuple(
76
+ coord=(
77
+ (line[0][0][0] / self.scale) + ocr_rect.l,
78
+ (line[0][0][1] / self.scale) + ocr_rect.t,
79
+ (line[0][2][0] / self.scale) + ocr_rect.l,
80
+ (line[0][2][1] / self.scale) + ocr_rect.t,
81
+ ),
82
+ origin=CoordOrigin.TOPLEFT,
83
+ ),
84
+ )
85
+ for ix, line in enumerate(result)
86
+ ]
87
+ all_ocr_cells.extend(cells)
88
+
89
+ ## Remove OCR cells which overlap with programmatic cells.
90
+ filtered_ocr_cells = self.filter_ocr_cells(
91
+ all_ocr_cells, page.cells
92
+ )
84
93
 
85
- page.cells.extend(filtered_ocr_cells)
94
+ page.cells.extend(filtered_ocr_cells)
86
95
 
87
96
  # DEBUG code:
88
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
97
+ if settings.debug.visualize_ocr:
98
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
89
99
 
90
100
  yield page
@@ -16,8 +16,11 @@ from docling.datamodel.base_models import (
16
16
  LayoutPrediction,
17
17
  Page,
18
18
  )
19
+ from docling.datamodel.document import ConversionResult
20
+ from docling.datamodel.settings import settings
19
21
  from docling.models.base_model import BasePageModel
20
22
  from docling.utils import layout_utils as lu
23
+ from docling.utils.profiling import TimeRecorder
21
24
 
22
25
  _log = logging.getLogger(__name__)
23
26
 
@@ -271,74 +274,97 @@ class LayoutModel(BasePageModel):
271
274
 
272
275
  return clusters_out_new, cells_out_new
273
276
 
274
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
277
+ def __call__(
278
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
279
+ ) -> Iterable[Page]:
280
+
275
281
  for page in page_batch:
276
282
  assert page._backend is not None
277
283
  if not page._backend.is_valid():
278
284
  yield page
279
285
  else:
280
- assert page.size is not None
281
-
282
- clusters = []
283
- for ix, pred_item in enumerate(
284
- self.layout_predictor.predict(page.get_image(scale=1.0))
285
- ):
286
- label = DocItemLabel(
287
- pred_item["label"].lower().replace(" ", "_").replace("-", "_")
288
- ) # Temporary, until docling-ibm-model uses docling-core types
289
- cluster = Cluster(
290
- id=ix,
291
- label=label,
292
- confidence=pred_item["confidence"],
293
- bbox=BoundingBox.model_validate(pred_item),
294
- cells=[],
295
- )
296
- clusters.append(cluster)
297
-
298
- # Map cells to clusters
299
- # TODO: Remove, postprocess should take care of it anyway.
300
- for cell in page.cells:
301
- for cluster in clusters:
302
- if not cell.bbox.area() > 0:
303
- overlap_frac = 0.0
304
- else:
305
- overlap_frac = (
306
- cell.bbox.intersection_area_with(cluster.bbox)
307
- / cell.bbox.area()
308
- )
309
-
310
- if overlap_frac > 0.5:
311
- cluster.cells.append(cell)
312
-
313
- # Pre-sort clusters
314
- # clusters = self.sort_clusters_by_cell_order(clusters)
315
-
316
- # DEBUG code:
317
- def draw_clusters_and_cells():
318
- image = copy.deepcopy(page.image)
319
- draw = ImageDraw.Draw(image)
320
- for c in clusters:
321
- x0, y0, x1, y1 = c.bbox.as_tuple()
322
- draw.rectangle([(x0, y0), (x1, y1)], outline="green")
323
-
324
- cell_color = (
325
- random.randint(30, 140),
326
- random.randint(30, 140),
327
- random.randint(30, 140),
286
+ with TimeRecorder(conv_res, "layout"):
287
+ assert page.size is not None
288
+
289
+ clusters = []
290
+ for ix, pred_item in enumerate(
291
+ self.layout_predictor.predict(page.get_image(scale=1.0))
292
+ ):
293
+ label = DocItemLabel(
294
+ pred_item["label"]
295
+ .lower()
296
+ .replace(" ", "_")
297
+ .replace("-", "_")
298
+ ) # Temporary, until docling-ibm-model uses docling-core types
299
+ cluster = Cluster(
300
+ id=ix,
301
+ label=label,
302
+ confidence=pred_item["confidence"],
303
+ bbox=BoundingBox.model_validate(pred_item),
304
+ cells=[],
328
305
  )
329
- for tc in c.cells: # [:1]:
330
- x0, y0, x1, y1 = tc.bbox.as_tuple()
331
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
332
- image.show()
333
-
334
- # draw_clusters_and_cells()
335
-
336
- clusters, page.cells = self.postprocess(
337
- clusters, page.cells, page.size.height
338
- )
306
+ clusters.append(cluster)
307
+
308
+ # Map cells to clusters
309
+ # TODO: Remove, postprocess should take care of it anyway.
310
+ for cell in page.cells:
311
+ for cluster in clusters:
312
+ if not cell.bbox.area() > 0:
313
+ overlap_frac = 0.0
314
+ else:
315
+ overlap_frac = (
316
+ cell.bbox.intersection_area_with(cluster.bbox)
317
+ / cell.bbox.area()
318
+ )
319
+
320
+ if overlap_frac > 0.5:
321
+ cluster.cells.append(cell)
322
+
323
+ # Pre-sort clusters
324
+ # clusters = self.sort_clusters_by_cell_order(clusters)
325
+
326
+ # DEBUG code:
327
+ def draw_clusters_and_cells(show: bool = False):
328
+ image = copy.deepcopy(page.image)
329
+ if image is not None:
330
+ draw = ImageDraw.Draw(image)
331
+ for c in clusters:
332
+ x0, y0, x1, y1 = c.bbox.as_tuple()
333
+ draw.rectangle([(x0, y0), (x1, y1)], outline="green")
334
+
335
+ cell_color = (
336
+ random.randint(30, 140),
337
+ random.randint(30, 140),
338
+ random.randint(30, 140),
339
+ )
340
+ for tc in c.cells: # [:1]:
341
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
342
+ draw.rectangle(
343
+ [(x0, y0), (x1, y1)], outline=cell_color
344
+ )
345
+ if show:
346
+ image.show()
347
+ else:
348
+ out_path: Path = (
349
+ Path(settings.debug.debug_output_path)
350
+ / f"debug_{conv_res.input.file.stem}"
351
+ )
352
+ out_path.mkdir(parents=True, exist_ok=True)
353
+
354
+ out_file = (
355
+ out_path / f"layout_page_{page.page_no:05}.png"
356
+ )
357
+ image.save(str(out_file), format="png")
358
+
359
+ # draw_clusters_and_cells()
360
+
361
+ clusters, page.cells = self.postprocess(
362
+ clusters, page.cells, page.size.height
363
+ )
339
364
 
340
- # draw_clusters_and_cells()
365
+ page.predictions.layout = LayoutPrediction(clusters=clusters)
341
366
 
342
- page.predictions.layout = LayoutPrediction(clusters=clusters)
367
+ if settings.debug.visualize_layout:
368
+ draw_clusters_and_cells()
343
369
 
344
370
  yield page