docling 2.26.0__py3-none-any.whl → 2.27.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +185 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +1 -1
  14. docling/backend/msword_backend.py +65 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +60 -21
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +26 -30
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/ocr_mac_model.py +39 -11
  31. docling/models/page_preprocessing_model.py +4 -0
  32. docling/models/picture_description_api_model.py +20 -3
  33. docling/models/picture_description_base_model.py +19 -3
  34. docling/models/picture_description_vlm_model.py +14 -2
  35. docling/models/plugins/__init__.py +0 -0
  36. docling/models/plugins/defaults.py +28 -0
  37. docling/models/rapid_ocr_model.py +34 -13
  38. docling/models/table_structure_model.py +13 -4
  39. docling/models/tesseract_ocr_cli_model.py +40 -15
  40. docling/models/tesseract_ocr_model.py +37 -12
  41. docling/pipeline/standard_pdf_pipeline.py +25 -78
  42. docling/utils/export.py +8 -6
  43. docling/utils/layout_postprocessor.py +26 -23
  44. docling/utils/visualization.py +1 -1
  45. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/METADATA +48 -19
  46. docling-2.27.0.dist-info/RECORD +83 -0
  47. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/entry_points.txt +3 -0
  48. docling-2.26.0.dist-info/RECORD +0 -72
  49. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/LICENSE +0 -0
  50. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/WHEEL +0 -0
@@ -1,14 +1,17 @@
1
1
  import logging
2
- from typing import Iterable
2
+ from pathlib import Path
3
+ from typing import Iterable, Optional, Type
3
4
 
4
5
  import numpy
5
6
  from docling_core.types.doc import BoundingBox, CoordOrigin
7
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
6
8
 
7
- from docling.datamodel.base_models import OcrCell, Page
9
+ from docling.datamodel.base_models import Page
8
10
  from docling.datamodel.document import ConversionResult
9
11
  from docling.datamodel.pipeline_options import (
10
12
  AcceleratorDevice,
11
13
  AcceleratorOptions,
14
+ OcrOptions,
12
15
  RapidOcrOptions,
13
16
  )
14
17
  from docling.datamodel.settings import settings
@@ -23,10 +26,16 @@ class RapidOcrModel(BaseOcrModel):
23
26
  def __init__(
24
27
  self,
25
28
  enabled: bool,
29
+ artifacts_path: Optional[Path],
26
30
  options: RapidOcrOptions,
27
31
  accelerator_options: AcceleratorOptions,
28
32
  ):
29
- super().__init__(enabled=enabled, options=options)
33
+ super().__init__(
34
+ enabled=enabled,
35
+ artifacts_path=artifacts_path,
36
+ options=options,
37
+ accelerator_options=accelerator_options,
38
+ )
30
39
  self.options: RapidOcrOptions
31
40
 
32
41
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -100,18 +109,26 @@ class RapidOcrModel(BaseOcrModel):
100
109
 
101
110
  if result is not None:
102
111
  cells = [
103
- OcrCell(
104
- id=ix,
112
+ TextCell(
113
+ index=ix,
105
114
  text=line[1],
115
+ orig=line[1],
106
116
  confidence=line[2],
107
- bbox=BoundingBox.from_tuple(
108
- coord=(
109
- (line[0][0][0] / self.scale) + ocr_rect.l,
110
- (line[0][0][1] / self.scale) + ocr_rect.t,
111
- (line[0][2][0] / self.scale) + ocr_rect.l,
112
- (line[0][2][1] / self.scale) + ocr_rect.t,
113
- ),
114
- origin=CoordOrigin.TOPLEFT,
117
+ from_ocr=True,
118
+ rect=BoundingRectangle.from_bounding_box(
119
+ BoundingBox.from_tuple(
120
+ coord=(
121
+ (line[0][0][0] / self.scale)
122
+ + ocr_rect.l,
123
+ (line[0][0][1] / self.scale)
124
+ + ocr_rect.t,
125
+ (line[0][2][0] / self.scale)
126
+ + ocr_rect.l,
127
+ (line[0][2][1] / self.scale)
128
+ + ocr_rect.t,
129
+ ),
130
+ origin=CoordOrigin.TOPLEFT,
131
+ )
115
132
  ),
116
133
  )
117
134
  for ix, line in enumerate(result)
@@ -126,3 +143,7 @@ class RapidOcrModel(BaseOcrModel):
126
143
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
127
144
 
128
145
  yield page
146
+
147
+ @classmethod
148
+ def get_options_type(cls) -> Type[OcrOptions]:
149
+ return RapidOcrOptions
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
5
5
 
6
6
  import numpy
7
7
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
8
+ from docling_core.types.doc.page import BoundingRectangle
8
9
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
9
10
  from PIL import ImageDraw
10
11
 
@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
129
130
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
130
131
 
131
132
  for cell in table_element.cluster.cells:
132
- x0, y0, x1, y1 = cell.bbox.as_tuple()
133
+ x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
133
134
  x0 *= scale_x
134
135
  x1 *= scale_x
135
136
  y0 *= scale_x
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
223
224
  # Only allow non empty stings (spaces) into the cells of a table
224
225
  if len(c.text.strip()) > 0:
225
226
  new_cell = copy.deepcopy(c)
226
- new_cell.bbox = new_cell.bbox.scaled(
227
- scale=self.scale
227
+ new_cell.rect = BoundingRectangle.from_bounding_box(
228
+ new_cell.rect.to_bounding_box().scaled(
229
+ scale=self.scale
230
+ )
228
231
  )
229
232
 
230
- tokens.append(new_cell.model_dump())
233
+ tokens.append(
234
+ {
235
+ "id": new_cell.index,
236
+ "text": new_cell.text,
237
+ "bbox": new_cell.rect.to_bounding_box().model_dump(),
238
+ }
239
+ )
231
240
  page_input["tokens"] = tokens
232
241
 
233
242
  tf_output = self.tf_predictor.multi_table_predict(
@@ -3,15 +3,21 @@ import io
3
3
  import logging
4
4
  import os
5
5
  import tempfile
6
+ from pathlib import Path
6
7
  from subprocess import DEVNULL, PIPE, Popen
7
- from typing import Iterable, List, Optional, Tuple
8
+ from typing import Iterable, List, Optional, Tuple, Type
8
9
 
9
10
  import pandas as pd
10
11
  from docling_core.types.doc import BoundingBox, CoordOrigin
12
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
11
13
 
12
- from docling.datamodel.base_models import Cell, OcrCell, Page
14
+ from docling.datamodel.base_models import Page
13
15
  from docling.datamodel.document import ConversionResult
14
- from docling.datamodel.pipeline_options import TesseractCliOcrOptions
16
+ from docling.datamodel.pipeline_options import (
17
+ AcceleratorOptions,
18
+ OcrOptions,
19
+ TesseractCliOcrOptions,
20
+ )
15
21
  from docling.datamodel.settings import settings
16
22
  from docling.models.base_ocr_model import BaseOcrModel
17
23
  from docling.utils.ocr_utils import map_tesseract_script
@@ -21,8 +27,19 @@ _log = logging.getLogger(__name__)
21
27
 
22
28
 
23
29
  class TesseractOcrCliModel(BaseOcrModel):
24
- def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
25
- super().__init__(enabled=enabled, options=options)
30
+ def __init__(
31
+ self,
32
+ enabled: bool,
33
+ artifacts_path: Optional[Path],
34
+ options: TesseractCliOcrOptions,
35
+ accelerator_options: AcceleratorOptions,
36
+ ):
37
+ super().__init__(
38
+ enabled=enabled,
39
+ artifacts_path=artifacts_path,
40
+ options=options,
41
+ accelerator_options=accelerator_options,
42
+ )
26
43
  self.options: TesseractCliOcrOptions
27
44
 
28
45
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -228,18 +245,22 @@ class TesseractOcrCliModel(BaseOcrModel):
228
245
  t = b + h
229
246
  r = l + w
230
247
 
231
- cell = OcrCell(
232
- id=ix,
248
+ cell = TextCell(
249
+ index=ix,
233
250
  text=text,
251
+ orig=text,
252
+ from_ocr=True,
234
253
  confidence=conf / 100.0,
235
- bbox=BoundingBox.from_tuple(
236
- coord=(
237
- (l / self.scale) + ocr_rect.l,
238
- (b / self.scale) + ocr_rect.t,
239
- (r / self.scale) + ocr_rect.l,
240
- (t / self.scale) + ocr_rect.t,
241
- ),
242
- origin=CoordOrigin.TOPLEFT,
254
+ rect=BoundingRectangle.from_bounding_box(
255
+ BoundingBox.from_tuple(
256
+ coord=(
257
+ (l / self.scale) + ocr_rect.l,
258
+ (b / self.scale) + ocr_rect.t,
259
+ (r / self.scale) + ocr_rect.l,
260
+ (t / self.scale) + ocr_rect.t,
261
+ ),
262
+ origin=CoordOrigin.TOPLEFT,
263
+ )
243
264
  ),
244
265
  )
245
266
  all_ocr_cells.append(cell)
@@ -252,3 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
252
273
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
253
274
 
254
275
  yield page
276
+
277
+ @classmethod
278
+ def get_options_type(cls) -> Type[OcrOptions]:
279
+ return TesseractCliOcrOptions
@@ -1,11 +1,17 @@
1
1
  import logging
2
- from typing import Iterable
2
+ from pathlib import Path
3
+ from typing import Iterable, Optional, Type
3
4
 
4
5
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
5
7
 
6
- from docling.datamodel.base_models import Cell, OcrCell, Page
8
+ from docling.datamodel.base_models import Page
7
9
  from docling.datamodel.document import ConversionResult
8
- from docling.datamodel.pipeline_options import TesseractOcrOptions
10
+ from docling.datamodel.pipeline_options import (
11
+ AcceleratorOptions,
12
+ OcrOptions,
13
+ TesseractOcrOptions,
14
+ )
9
15
  from docling.datamodel.settings import settings
10
16
  from docling.models.base_ocr_model import BaseOcrModel
11
17
  from docling.utils.ocr_utils import map_tesseract_script
@@ -15,8 +21,19 @@ _log = logging.getLogger(__name__)
15
21
 
16
22
 
17
23
  class TesseractOcrModel(BaseOcrModel):
18
- def __init__(self, enabled: bool, options: TesseractOcrOptions):
19
- super().__init__(enabled=enabled, options=options)
24
+ def __init__(
25
+ self,
26
+ enabled: bool,
27
+ artifacts_path: Optional[Path],
28
+ options: TesseractOcrOptions,
29
+ accelerator_options: AcceleratorOptions,
30
+ ):
31
+ super().__init__(
32
+ enabled=enabled,
33
+ artifacts_path=artifacts_path,
34
+ options=options,
35
+ accelerator_options=accelerator_options,
36
+ )
20
37
  self.options: TesseractOcrOptions
21
38
 
22
39
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -31,14 +48,14 @@ class TesseractOcrModel(BaseOcrModel):
31
48
  "Note that tesserocr might have to be manually compiled for working with "
32
49
  "your Tesseract installation. The Docling documentation provides examples for it. "
33
50
  "Alternatively, Docling has support for other OCR engines. See the documentation: "
34
- "https://ds4sd.github.io/docling/installation/"
51
+ "https://docling-project.github.io/docling/installation/"
35
52
  )
36
53
  missing_langs_errmsg = (
37
54
  "tesserocr is not correctly configured. No language models have been detected. "
38
55
  "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
39
56
  "You can find more information how to setup other OCR engines in Docling "
40
57
  "documentation: "
41
- "https://ds4sd.github.io/docling/installation/"
58
+ "https://docling-project.github.io/docling/installation/"
42
59
  )
43
60
 
44
61
  try:
@@ -173,13 +190,17 @@ class TesseractOcrModel(BaseOcrModel):
173
190
  top = (box["y"] + box["h"]) / self.scale
174
191
 
175
192
  cells.append(
176
- OcrCell(
177
- id=ix,
193
+ TextCell(
194
+ index=ix,
178
195
  text=text,
196
+ orig=text,
197
+ from_ocr=True,
179
198
  confidence=confidence,
180
- bbox=BoundingBox.from_tuple(
181
- coord=(left, top, right, bottom),
182
- origin=CoordOrigin.TOPLEFT,
199
+ rect=BoundingRectangle.from_bounding_box(
200
+ BoundingBox.from_tuple(
201
+ coord=(left, top, right, bottom),
202
+ origin=CoordOrigin.TOPLEFT,
203
+ ),
183
204
  ),
184
205
  )
185
206
  )
@@ -195,3 +216,7 @@ class TesseractOcrModel(BaseOcrModel):
195
216
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
196
217
 
197
218
  yield page
219
+
220
+ @classmethod
221
+ def get_options_type(cls) -> Type[OcrOptions]:
222
+ return TesseractOcrOptions
@@ -10,16 +10,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
10
10
  from docling.backend.pdf_backend import PdfDocumentBackend
11
11
  from docling.datamodel.base_models import AssembledUnit, Page
12
12
  from docling.datamodel.document import ConversionResult
13
- from docling.datamodel.pipeline_options import (
14
- EasyOcrOptions,
15
- OcrMacOptions,
16
- PdfPipelineOptions,
17
- PictureDescriptionApiOptions,
18
- PictureDescriptionVlmOptions,
19
- RapidOcrOptions,
20
- TesseractCliOcrOptions,
21
- TesseractOcrOptions,
22
- )
13
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
23
14
  from docling.datamodel.settings import settings
24
15
  from docling.models.base_ocr_model import BaseOcrModel
25
16
  from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
@@ -27,22 +18,16 @@ from docling.models.document_picture_classifier import (
27
18
  DocumentPictureClassifier,
28
19
  DocumentPictureClassifierOptions,
29
20
  )
30
- from docling.models.easyocr_model import EasyOcrModel
21
+ from docling.models.factories import get_ocr_factory, get_picture_description_factory
31
22
  from docling.models.layout_model import LayoutModel
32
- from docling.models.ocr_mac_model import OcrMacModel
33
23
  from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
34
24
  from docling.models.page_preprocessing_model import (
35
25
  PagePreprocessingModel,
36
26
  PagePreprocessingOptions,
37
27
  )
38
- from docling.models.picture_description_api_model import PictureDescriptionApiModel
39
28
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
40
- from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
41
- from docling.models.rapid_ocr_model import RapidOcrModel
42
29
  from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
43
30
  from docling.models.table_structure_model import TableStructureModel
44
- from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
45
- from docling.models.tesseract_ocr_model import TesseractOcrModel
46
31
  from docling.pipeline.base_pipeline import PaginatedPipeline
47
32
  from docling.utils.model_downloader import download_models
48
33
  from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -78,16 +63,14 @@ class StandardPdfPipeline(PaginatedPipeline):
78
63
 
79
64
  self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
80
65
 
81
- if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
82
- raise RuntimeError(
83
- f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
84
- )
66
+ ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
85
67
 
86
68
  self.build_pipe = [
87
69
  # Pre-processing
88
70
  PagePreprocessingModel(
89
71
  options=PagePreprocessingOptions(
90
- images_scale=pipeline_options.images_scale
72
+ images_scale=pipeline_options.images_scale,
73
+ create_parsed_page=pipeline_options.generate_parsed_pages,
91
74
  )
92
75
  ),
93
76
  # OCR
@@ -163,66 +146,30 @@ class StandardPdfPipeline(PaginatedPipeline):
163
146
  output_dir = download_models(output_dir=local_dir, force=force, progress=False)
164
147
  return output_dir
165
148
 
166
- def get_ocr_model(
167
- self, artifacts_path: Optional[Path] = None
168
- ) -> Optional[BaseOcrModel]:
169
- if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
170
- return EasyOcrModel(
171
- enabled=self.pipeline_options.do_ocr,
172
- artifacts_path=artifacts_path,
173
- options=self.pipeline_options.ocr_options,
174
- accelerator_options=self.pipeline_options.accelerator_options,
175
- )
176
- elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
177
- return TesseractOcrCliModel(
178
- enabled=self.pipeline_options.do_ocr,
179
- options=self.pipeline_options.ocr_options,
180
- )
181
- elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
182
- return TesseractOcrModel(
183
- enabled=self.pipeline_options.do_ocr,
184
- options=self.pipeline_options.ocr_options,
185
- )
186
- elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
187
- return RapidOcrModel(
188
- enabled=self.pipeline_options.do_ocr,
189
- options=self.pipeline_options.ocr_options,
190
- accelerator_options=self.pipeline_options.accelerator_options,
191
- )
192
- elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
193
- if "darwin" != sys.platform:
194
- raise RuntimeError(
195
- f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
196
- )
197
- return OcrMacModel(
198
- enabled=self.pipeline_options.do_ocr,
199
- options=self.pipeline_options.ocr_options,
200
- )
201
- return None
149
+ def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
150
+ factory = get_ocr_factory(
151
+ allow_external_plugins=self.pipeline_options.allow_external_plugins
152
+ )
153
+ return factory.create_instance(
154
+ options=self.pipeline_options.ocr_options,
155
+ enabled=self.pipeline_options.do_ocr,
156
+ artifacts_path=artifacts_path,
157
+ accelerator_options=self.pipeline_options.accelerator_options,
158
+ )
202
159
 
203
160
  def get_picture_description_model(
204
161
  self, artifacts_path: Optional[Path] = None
205
162
  ) -> Optional[PictureDescriptionBaseModel]:
206
- if isinstance(
207
- self.pipeline_options.picture_description_options,
208
- PictureDescriptionApiOptions,
209
- ):
210
- return PictureDescriptionApiModel(
211
- enabled=self.pipeline_options.do_picture_description,
212
- enable_remote_services=self.pipeline_options.enable_remote_services,
213
- options=self.pipeline_options.picture_description_options,
214
- )
215
- elif isinstance(
216
- self.pipeline_options.picture_description_options,
217
- PictureDescriptionVlmOptions,
218
- ):
219
- return PictureDescriptionVlmModel(
220
- enabled=self.pipeline_options.do_picture_description,
221
- artifacts_path=artifacts_path,
222
- options=self.pipeline_options.picture_description_options,
223
- accelerator_options=self.pipeline_options.accelerator_options,
224
- )
225
- return None
163
+ factory = get_picture_description_factory(
164
+ allow_external_plugins=self.pipeline_options.allow_external_plugins
165
+ )
166
+ return factory.create_instance(
167
+ options=self.pipeline_options.picture_description_options,
168
+ enabled=self.pipeline_options.do_picture_description,
169
+ enable_remote_services=self.pipeline_options.enable_remote_services,
170
+ artifacts_path=artifacts_path,
171
+ accelerator_options=self.pipeline_options.accelerator_options,
172
+ )
226
173
 
227
174
  def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
228
175
  with TimeRecorder(conv_res, "page_init"):
docling/utils/export.py CHANGED
@@ -2,9 +2,9 @@ import logging
2
2
  from typing import Any, Dict, Iterable, List, Tuple, Union
3
3
 
4
4
  from docling_core.types.doc import BoundingBox, CoordOrigin
5
+ from docling_core.types.doc.page import TextCell
5
6
  from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
6
7
 
7
- from docling.datamodel.base_models import OcrCell
8
8
  from docling.datamodel.document import ConversionResult, Page
9
9
 
10
10
  _log = logging.getLogger(__name__)
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
86
86
  if page.size is None:
87
87
  return cells
88
88
  for cell in page.cells:
89
- new_bbox = cell.bbox.to_top_left_origin(
90
- page_height=page.size.height
91
- ).normalized(page_size=page.size)
92
- is_ocr = isinstance(cell, OcrCell)
93
- ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
89
+ new_bbox = (
90
+ cell.rect.to_bounding_box()
91
+ .to_top_left_origin(page_height=page.size.height)
92
+ .normalized(page_size=page.size)
93
+ )
94
+ is_ocr = cell.from_ocr
95
+ ocr_confidence = cell.confidence
94
96
  cells.append(
95
97
  {
96
98
  "text": cell.text,
@@ -5,9 +5,10 @@ from collections import defaultdict
5
5
  from typing import Dict, List, Set, Tuple
6
6
 
7
7
  from docling_core.types.doc import DocItemLabel, Size
8
+ from docling_core.types.doc.page import TextCell
8
9
  from rtree import index
9
10
 
10
- from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
11
+ from docling.datamodel.base_models import BoundingBox, Cluster
11
12
 
12
13
  _log = logging.getLogger(__name__)
13
14
 
@@ -198,7 +199,7 @@ class LayoutPostprocessor:
198
199
  DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
199
200
  }
200
201
 
201
- def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
202
+ def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
202
203
  """Initialize processor with cells and clusters."""
203
204
  """Initialize processor with cells and spatial indices."""
204
205
  self.cells = cells
@@ -218,7 +219,7 @@ class LayoutPostprocessor:
218
219
  [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
219
220
  )
220
221
 
221
- def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
222
+ def postprocess(self) -> Tuple[List[Cluster], List[TextCell]]:
222
223
  """Main processing pipeline."""
223
224
  self.regular_clusters = self._process_regular_clusters()
224
225
  self.special_clusters = self._process_special_clusters()
@@ -271,15 +272,13 @@ class LayoutPostprocessor:
271
272
  next_id = max((c.id for c in self.all_clusters), default=0) + 1
272
273
  orphan_clusters = []
273
274
  for i, cell in enumerate(unassigned):
274
- conf = 1.0
275
- if isinstance(cell, OcrCell):
276
- conf = cell.confidence
275
+ conf = cell.confidence
277
276
 
278
277
  orphan_clusters.append(
279
278
  Cluster(
280
279
  id=next_id + i,
281
280
  label=DocItemLabel.TEXT,
282
- bbox=cell.bbox,
281
+ bbox=cell.to_bounding_box(),
283
282
  confidence=conf,
284
283
  cells=[cell],
285
284
  )
@@ -557,13 +556,13 @@ class LayoutPostprocessor:
557
556
 
558
557
  return current_best if current_best else clusters[0]
559
558
 
560
- def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
559
+ def _deduplicate_cells(self, cells: List[TextCell]) -> List[TextCell]:
561
560
  """Ensure each cell appears only once, maintaining order of first appearance."""
562
561
  seen_ids = set()
563
562
  unique_cells = []
564
563
  for cell in cells:
565
- if cell.id not in seen_ids:
566
- seen_ids.add(cell.id)
564
+ if cell.index not in seen_ids:
565
+ seen_ids.add(cell.index)
567
566
  unique_cells.append(cell)
568
567
  return unique_cells
569
568
 
@@ -582,11 +581,13 @@ class LayoutPostprocessor:
582
581
  best_cluster = None
583
582
 
584
583
  for cluster in clusters:
585
- if cell.bbox.area() <= 0:
584
+ if cell.rect.to_bounding_box().area() <= 0:
586
585
  continue
587
586
 
588
- overlap = cell.bbox.intersection_area_with(cluster.bbox)
589
- overlap_ratio = overlap / cell.bbox.area()
587
+ overlap = cell.rect.to_bounding_box().intersection_area_with(
588
+ cluster.bbox
589
+ )
590
+ overlap_ratio = overlap / cell.rect.to_bounding_box().area()
590
591
 
591
592
  if overlap_ratio > best_overlap:
592
593
  best_overlap = overlap_ratio
@@ -601,11 +602,13 @@ class LayoutPostprocessor:
601
602
 
602
603
  return clusters
603
604
 
604
- def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
605
+ def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[TextCell]:
605
606
  """Find cells not assigned to any cluster."""
606
- assigned = {cell.id for cluster in clusters for cell in cluster.cells}
607
+ assigned = {cell.index for cluster in clusters for cell in cluster.cells}
607
608
  return [
608
- cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
609
+ cell
610
+ for cell in self.cells
611
+ if cell.index not in assigned and cell.text.strip()
609
612
  ]
610
613
 
611
614
  def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
@@ -615,10 +618,10 @@ class LayoutPostprocessor:
615
618
  continue
616
619
 
617
620
  cells_bbox = BoundingBox(
618
- l=min(cell.bbox.l for cell in cluster.cells),
619
- t=min(cell.bbox.t for cell in cluster.cells),
620
- r=max(cell.bbox.r for cell in cluster.cells),
621
- b=max(cell.bbox.b for cell in cluster.cells),
621
+ l=min(cell.rect.to_bounding_box().l for cell in cluster.cells),
622
+ t=min(cell.rect.to_bounding_box().t for cell in cluster.cells),
623
+ r=max(cell.rect.to_bounding_box().r for cell in cluster.cells),
624
+ b=max(cell.rect.to_bounding_box().b for cell in cluster.cells),
622
625
  )
623
626
 
624
627
  if cluster.label == DocItemLabel.TABLE:
@@ -634,9 +637,9 @@ class LayoutPostprocessor:
634
637
 
635
638
  return clusters
636
639
 
637
- def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
640
+ def _sort_cells(self, cells: List[TextCell]) -> List[TextCell]:
638
641
  """Sort cells in native reading order."""
639
- return sorted(cells, key=lambda c: (c.id))
642
+ return sorted(cells, key=lambda c: (c.index))
640
643
 
641
644
  def _sort_clusters(
642
645
  self, clusters: List[Cluster], mode: str = "id"
@@ -647,7 +650,7 @@ class LayoutPostprocessor:
647
650
  clusters,
648
651
  key=lambda cluster: (
649
652
  (
650
- min(cell.id for cell in cluster.cells)
653
+ min(cell.index for cell in cluster.cells)
651
654
  if cluster.cells
652
655
  else sys.maxsize
653
656
  ),
@@ -25,7 +25,7 @@ def draw_clusters(
25
25
  # Draw cells first (underneath)
26
26
  cell_color = (0, 0, 0, 40) # Transparent black for cells
27
27
  for tc in c.cells:
28
- cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
28
+ cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
29
29
  cx0 *= scale_x
30
30
  cx1 *= scale_x
31
31
  cy0 *= scale_x