docling 2.32.0__py3-none-any.whl → 2.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import csv
2
2
  import io
3
3
  import logging
4
4
  import os
5
+ import subprocess
5
6
  import tempfile
6
7
  from collections.abc import Iterable
7
8
  from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
10
11
 
11
12
  import pandas as pd
12
13
  from docling_core.types.doc import BoundingBox, CoordOrigin
13
- from docling_core.types.doc.page import BoundingRectangle, TextCell
14
+ from docling_core.types.doc.page import TextCell
14
15
 
15
16
  from docling.datamodel.base_models import Page
16
17
  from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
21
22
  )
22
23
  from docling.datamodel.settings import settings
23
24
  from docling.models.base_ocr_model import BaseOcrModel
24
- from docling.utils.ocr_utils import map_tesseract_script
25
+ from docling.utils.ocr_utils import (
26
+ map_tesseract_script,
27
+ parse_tesseract_orientation,
28
+ tesseract_box_to_bounding_rectangle,
29
+ )
25
30
  from docling.utils.profiling import TimeRecorder
26
31
 
27
32
  _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
49
54
  self._version: Optional[str] = None
50
55
  self._tesseract_languages: Optional[List[str]] = None
51
56
  self._script_prefix: Optional[str] = None
57
+ self._is_auto: bool = "auto" in self.options.lang
52
58
 
53
59
  if self.enabled:
54
60
  try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
93
99
 
94
100
  return name, version
95
101
 
96
- def _run_tesseract(self, ifilename: str):
102
+ def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
97
103
  r"""
98
104
  Run tesseract CLI
99
105
  """
100
106
  cmd = [self.options.tesseract_cmd]
101
-
102
- if "auto" in self.options.lang:
103
- lang = self._detect_language(ifilename)
107
+ if self._is_auto:
108
+ lang = self._parse_language(osd)
104
109
  if lang is not None:
105
110
  cmd.append("-l")
106
111
  cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
115
120
  cmd += [ifilename, "stdout", "tsv"]
116
121
  _log.info("command: {}".format(" ".join(cmd)))
117
122
 
118
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
119
- output, _ = proc.communicate()
123
+ output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
120
124
 
121
125
  # _log.info(output)
122
126
 
123
127
  # Decode the byte string to a regular string
124
- decoded_data = output.decode("utf-8")
128
+ decoded_data = output.stdout.decode("utf-8")
125
129
  # _log.info(decoded_data)
126
130
 
127
131
  # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
139
143
 
140
144
  return df_filtered
141
145
 
142
- def _detect_language(self, ifilename: str):
146
+ def _perform_osd(self, ifilename: str) -> pd.DataFrame:
143
147
  r"""
144
148
  Run tesseract in PSM 0 mode to detect the language
145
149
  """
146
- assert self._tesseract_languages is not None
147
150
 
148
151
  cmd = [self.options.tesseract_cmd]
149
152
  cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
150
153
  _log.info("command: {}".format(" ".join(cmd)))
151
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
152
- output, _ = proc.communicate()
153
- decoded_data = output.decode("utf-8")
154
+ output = subprocess.run(cmd, capture_output=True, check=True)
155
+ decoded_data = output.stdout.decode("utf-8")
154
156
  df_detected = pd.read_csv(
155
157
  io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
156
158
  )
157
- scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
159
+ return df_detected
160
+
161
+ def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
162
+ assert self._tesseract_languages is not None
163
+ scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
158
164
  if len(scripts) == 0:
159
165
  _log.warning("Tesseract cannot detect the script of the page")
160
166
  return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
182
188
  cmd = [self.options.tesseract_cmd]
183
189
  cmd.append("--list-langs")
184
190
  _log.info("command: {}".format(" ".join(cmd)))
185
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
186
- output, _ = proc.communicate()
187
- decoded_data = output.decode("utf-8")
191
+ output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
192
+ decoded_data = output.stdout.decode("utf-8")
188
193
  df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
189
194
  self._tesseract_languages = df_list[0].tolist()[1:]
190
195
 
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
203
208
  yield from page_batch
204
209
  return
205
210
 
206
- for page in page_batch:
211
+ for page_i, page in enumerate(page_batch):
207
212
  assert page._backend is not None
208
213
  if not page._backend.is_valid():
209
214
  yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
212
217
  ocr_rects = self.get_ocr_rects(page)
213
218
 
214
219
  all_ocr_cells = []
215
- for ocr_rect in ocr_rects:
220
+ for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
216
221
  # Skip zero area boxes
217
222
  if ocr_rect.area() == 0:
218
223
  continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
225
230
  ) as image_file:
226
231
  fname = image_file.name
227
232
  high_res_image.save(image_file)
228
-
229
- df_result = self._run_tesseract(fname)
233
+ doc_orientation = 0
234
+ try:
235
+ df_osd = self._perform_osd(fname)
236
+ doc_orientation = _parse_orientation(df_osd)
237
+ except subprocess.CalledProcessError as exc:
238
+ _log.error(
239
+ "OSD failed (doc %s, page: %s, "
240
+ "OCR rectangle: %s, processed image file %s):\n %s",
241
+ conv_res.input.file,
242
+ page_i,
243
+ ocr_rect_i,
244
+ image_file,
245
+ exc.stderr,
246
+ )
247
+ # Skipping if OSD fail when in auto mode, otherwise proceed
248
+ # to OCR in the hope OCR will succeed while OSD failed
249
+ if self._is_auto:
250
+ continue
251
+ if doc_orientation != 0:
252
+ high_res_image = high_res_image.rotate(
253
+ -doc_orientation, expand=True
254
+ )
255
+ high_res_image.save(fname)
256
+ try:
257
+ df_result = self._run_tesseract(fname, df_osd)
258
+ except subprocess.CalledProcessError as exc:
259
+ _log.error(
260
+ "tesseract OCR failed (doc %s, page: %s, "
261
+ "OCR rectangle: %s, processed image file %s):\n %s",
262
+ conv_res.input.file,
263
+ page_i,
264
+ ocr_rect_i,
265
+ image_file,
266
+ exc.stderr,
267
+ )
268
+ continue
230
269
  finally:
231
270
  if os.path.exists(fname):
232
271
  os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
238
277
  text = row["text"]
239
278
  conf = row["conf"]
240
279
 
241
- l = float(row["left"]) # noqa: E741
242
- b = float(row["top"])
243
- w = float(row["width"])
244
- h = float(row["height"])
245
-
246
- t = b + h
247
- r = l + w
248
-
280
+ left, top = float(row["left"]), float(row["top"])
281
+ right = left + float(row["width"])
282
+ bottom = top + row["height"]
283
+ bbox = BoundingBox(
284
+ l=left,
285
+ t=top,
286
+ r=right,
287
+ b=bottom,
288
+ coord_origin=CoordOrigin.TOPLEFT,
289
+ )
290
+ rect = tesseract_box_to_bounding_rectangle(
291
+ bbox,
292
+ original_offset=ocr_rect,
293
+ scale=self.scale,
294
+ orientation=doc_orientation,
295
+ im_size=high_res_image.size,
296
+ )
249
297
  cell = TextCell(
250
298
  index=ix,
251
299
  text=str(text),
252
300
  orig=str(text),
253
301
  from_ocr=True,
254
302
  confidence=conf / 100.0,
255
- rect=BoundingRectangle.from_bounding_box(
256
- BoundingBox.from_tuple(
257
- coord=(
258
- (l / self.scale) + ocr_rect.l,
259
- (b / self.scale) + ocr_rect.t,
260
- (r / self.scale) + ocr_rect.l,
261
- (t / self.scale) + ocr_rect.t,
262
- ),
263
- origin=CoordOrigin.TOPLEFT,
264
- )
265
- ),
303
+ rect=rect,
266
304
  )
267
305
  all_ocr_cells.append(cell)
268
306
 
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
278
316
  @classmethod
279
317
  def get_options_type(cls) -> Type[OcrOptions]:
280
318
  return TesseractCliOcrOptions
319
+
320
+
321
+ def _parse_orientation(df_osd: pd.DataFrame) -> int:
322
+ orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
323
+ orientation = parse_tesseract_orientation(orientations[0].strip())
324
+ return orientation
@@ -1,12 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
- from collections.abc import Iterable
5
4
  from pathlib import Path
6
- from typing import Optional, Type
5
+ from typing import Iterable, Optional, Type
7
6
 
8
7
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_core.types.doc.page import BoundingRectangle, TextCell
8
+ from docling_core.types.doc.page import TextCell
10
9
 
11
10
  from docling.datamodel.base_models import Page
12
11
  from docling.datamodel.document import ConversionResult
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
17
16
  )
18
17
  from docling.datamodel.settings import settings
19
18
  from docling.models.base_ocr_model import BaseOcrModel
20
- from docling.utils.ocr_utils import map_tesseract_script
19
+ from docling.utils.ocr_utils import (
20
+ map_tesseract_script,
21
+ parse_tesseract_orientation,
22
+ tesseract_box_to_bounding_rectangle,
23
+ )
21
24
  from docling.utils.profiling import TimeRecorder
22
25
 
23
26
  _log = logging.getLogger(__name__)
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
38
41
  accelerator_options=accelerator_options,
39
42
  )
40
43
  self.options: TesseractOcrOptions
41
-
44
+ self._is_auto: bool = "auto" in self.options.lang
42
45
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
43
46
  self.reader = None
44
47
  self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
95
98
 
96
99
  if lang == "auto":
97
100
  self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
98
- self.osd_reader = tesserocr.PyTessBaseAPI(
99
- **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
100
- )
101
101
  else:
102
102
  self.reader = tesserocr.PyTessBaseAPI(
103
103
  **{"lang": lang} | tesserocr_kwargs,
104
104
  )
105
+ self.osd_reader = tesserocr.PyTessBaseAPI(
106
+ **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
107
+ )
105
108
  self.reader_RIL = tesserocr.RIL
106
109
 
107
110
  def __del__(self):
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
118
121
  yield from page_batch
119
122
  return
120
123
 
121
- for page in page_batch:
124
+ for page_i, page in enumerate(page_batch):
122
125
  assert page._backend is not None
123
126
  if not page._backend.is_valid():
124
127
  yield page
125
128
  else:
126
129
  with TimeRecorder(conv_res, "ocr"):
127
130
  assert self.reader is not None
131
+ assert self.osd_reader is not None
128
132
  assert self._tesserocr_languages is not None
129
133
 
130
134
  ocr_rects = self.get_ocr_rects(page)
131
135
 
132
136
  all_ocr_cells = []
133
- for ocr_rect in ocr_rects:
137
+ for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
134
138
  # Skip zero area boxes
135
139
  if ocr_rect.area() == 0:
136
140
  continue
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
139
143
  )
140
144
 
141
145
  local_reader = self.reader
142
- if "auto" in self.options.lang:
143
- assert self.osd_reader is not None
144
-
145
- self.osd_reader.SetImage(high_res_image)
146
- osd = self.osd_reader.DetectOrientationScript()
147
-
148
- # No text, probably
149
- if osd is None:
146
+ self.osd_reader.SetImage(high_res_image)
147
+ osd = self.osd_reader.DetectOrientationScript()
148
+ # No text, or Orientation and Script detection failure
149
+ if osd is None:
150
+ _log.error(
151
+ "OSD failed for doc (doc %s, page: %s, "
152
+ "OCR rectangle: %s)",
153
+ conv_res.input.file,
154
+ page_i,
155
+ ocr_rect_i,
156
+ )
157
+ # Skipping if OSD fail when in auto mode, otherwise proceed
158
+ # to OCR in the hope OCR will succeed while OSD failed
159
+ if self._is_auto:
150
160
  continue
151
-
161
+ doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
162
+ if doc_orientation != 0:
163
+ high_res_image = high_res_image.rotate(
164
+ -doc_orientation, expand=True
165
+ )
166
+ if self._is_auto:
152
167
  script = osd["script_name"]
153
168
  script = map_tesseract_script(script)
154
169
  lang = f"{self.script_prefix}{script}"
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
188
203
  # Extract text within the bounding box
189
204
  text = local_reader.GetUTF8Text().strip()
190
205
  confidence = local_reader.MeanTextConf()
191
- left = box["x"] / self.scale
192
- bottom = box["y"] / self.scale
193
- right = (box["x"] + box["w"]) / self.scale
194
- top = (box["y"] + box["h"]) / self.scale
195
-
206
+ left, top = box["x"], box["y"]
207
+ right = left + box["w"]
208
+ bottom = top + box["h"]
209
+ bbox = BoundingBox(
210
+ l=left,
211
+ t=top,
212
+ r=right,
213
+ b=bottom,
214
+ coord_origin=CoordOrigin.TOPLEFT,
215
+ )
216
+ rect = tesseract_box_to_bounding_rectangle(
217
+ bbox,
218
+ original_offset=ocr_rect,
219
+ scale=self.scale,
220
+ orientation=doc_orientation,
221
+ im_size=high_res_image.size,
222
+ )
196
223
  cells.append(
197
224
  TextCell(
198
225
  index=ix,
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
200
227
  orig=text,
201
228
  from_ocr=True,
202
229
  confidence=confidence,
203
- rect=BoundingRectangle.from_bounding_box(
204
- BoundingBox.from_tuple(
205
- coord=(left, top, right, bottom),
206
- origin=CoordOrigin.TOPLEFT,
207
- ),
208
- ),
230
+ rect=rect,
209
231
  )
210
232
  )
211
233
 
@@ -3,11 +3,12 @@ import warnings
3
3
  from pathlib import Path
4
4
  from typing import Optional, cast
5
5
 
6
+ import numpy as np
6
7
  from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
7
8
 
8
9
  from docling.backend.abstract_backend import AbstractDocumentBackend
9
10
  from docling.backend.pdf_backend import PdfDocumentBackend
10
- from docling.datamodel.base_models import AssembledUnit, Page
11
+ from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
11
12
  from docling.datamodel.document import ConversionResult
12
13
  from docling.datamodel.pipeline_options import PdfPipelineOptions
13
14
  from docling.datamodel.settings import settings
@@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
60
61
  or self.pipeline_options.generate_table_images
61
62
  )
62
63
 
63
- self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
64
+ self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
64
65
 
65
66
  ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
66
67
 
@@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
197
198
  elements=all_elements, headers=all_headers, body=all_body
198
199
  )
199
200
 
200
- conv_res.document = self.glm_model(conv_res)
201
+ conv_res.document = self.reading_order_model(conv_res)
201
202
 
202
203
  # Generate page images in the output
203
204
  if self.pipeline_options.generate_page_images:
@@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
244
245
  cropped_im, dpi=int(72 * scale)
245
246
  )
246
247
 
248
+ # Aggregate confidence values for document:
249
+ if len(conv_res.pages) > 0:
250
+ conv_res.confidence.layout_score = float(
251
+ np.nanmean(
252
+ [c.layout_score for c in conv_res.confidence.pages.values()]
253
+ )
254
+ )
255
+ conv_res.confidence.parse_score = float(
256
+ np.nanquantile(
257
+ [c.parse_score for c in conv_res.confidence.pages.values()],
258
+ q=0.1, # parse score should relate to worst 10% of pages.
259
+ )
260
+ )
261
+ conv_res.confidence.table_score = float(
262
+ np.nanmean(
263
+ [c.table_score for c in conv_res.confidence.pages.values()]
264
+ )
265
+ )
266
+ conv_res.confidence.ocr_score = float(
267
+ np.nanmean(
268
+ [c.ocr_score for c in conv_res.confidence.pages.values()]
269
+ )
270
+ )
271
+
247
272
  return conv_res
248
273
 
249
274
  @classmethod
@@ -3,7 +3,7 @@ from io import BytesIO
3
3
  from pathlib import Path
4
4
  from typing import List, Optional, Union, cast
5
5
 
6
- # from docling_core.types import DoclingDocument
6
+ from docling_core.types import DoclingDocument
7
7
  from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
8
8
  from docling_core.types.doc.document import DocTagsDocument
9
9
  from PIL import Image as PILImage
@@ -133,28 +133,26 @@ class VlmPipeline(PaginatedPipeline):
133
133
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
134
134
  doctags_list_c, image_list_c
135
135
  )
136
- conv_res.document.load_from_doctags(doctags_doc)
136
+ conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
137
137
 
138
138
  # If forced backend text, replace model predicted text with backend one
139
- if page.size:
140
- if self.force_backend_text:
141
- scale = self.pipeline_options.images_scale
142
- for element, _level in conv_res.document.iterate_items():
143
- if (
144
- not isinstance(element, TextItem)
145
- or len(element.prov) == 0
146
- ):
147
- continue
148
- crop_bbox = (
149
- element.prov[0]
150
- .bbox.scaled(scale=scale)
151
- .to_top_left_origin(
152
- page_height=page.size.height * scale
153
- )
154
- )
155
- txt = self.extract_text_from_backend(page, crop_bbox)
156
- element.text = txt
157
- element.orig = txt
139
+ if self.force_backend_text:
140
+ scale = self.pipeline_options.images_scale
141
+ for element, _level in conv_res.document.iterate_items():
142
+ if not isinstance(element, TextItem) or len(element.prov) == 0:
143
+ continue
144
+ page_ix = element.prov[0].page_no - 1
145
+ page = conv_res.pages[page_ix]
146
+ if not page.size:
147
+ continue
148
+ crop_bbox = (
149
+ element.prov[0]
150
+ .bbox.scaled(scale=scale)
151
+ .to_top_left_origin(page_height=page.size.height * scale)
152
+ )
153
+ txt = self.extract_text_from_backend(page, crop_bbox)
154
+ element.text = txt
155
+ element.orig = txt
158
156
  elif (
159
157
  self.pipeline_options.vlm_options.response_format
160
158
  == ResponseFormat.MARKDOWN
@@ -90,17 +90,12 @@ class SpatialClusterIndex:
90
90
  containment_threshold: float,
91
91
  ) -> bool:
92
92
  """Check if two bboxes overlap sufficiently."""
93
- area1, area2 = bbox1.area(), bbox2.area()
94
- if area1 <= 0 or area2 <= 0:
93
+ if bbox1.area() <= 0 or bbox2.area() <= 0:
95
94
  return False
96
95
 
97
- overlap_area = bbox1.intersection_area_with(bbox2)
98
- if overlap_area <= 0:
99
- return False
100
-
101
- iou = overlap_area / (area1 + area2 - overlap_area)
102
- containment1 = overlap_area / area1
103
- containment2 = overlap_area / area2
96
+ iou = bbox1.intersection_over_union(bbox2)
97
+ containment1 = bbox1.intersection_over_self(bbox2)
98
+ containment2 = bbox2.intersection_over_self(bbox1)
104
99
 
105
100
  return (
106
101
  iou > overlap_threshold
@@ -321,11 +316,9 @@ class LayoutPostprocessor:
321
316
  for special in special_clusters:
322
317
  contained = []
323
318
  for cluster in self.regular_clusters:
324
- overlap = cluster.bbox.intersection_area_with(special.bbox)
325
- if overlap > 0:
326
- containment = overlap / cluster.bbox.area()
327
- if containment > 0.8:
328
- contained.append(cluster)
319
+ containment = cluster.bbox.intersection_over_self(special.bbox)
320
+ if containment > 0.8:
321
+ contained.append(cluster)
329
322
 
330
323
  if contained:
331
324
  # Sort contained clusters by minimum cell ID:
@@ -379,9 +372,7 @@ class LayoutPostprocessor:
379
372
  for regular in self.regular_clusters:
380
373
  if regular.label == DocItemLabel.TABLE:
381
374
  # Calculate overlap
382
- overlap = regular.bbox.intersection_area_with(wrapper.bbox)
383
- wrapper_area = wrapper.bbox.area()
384
- overlap_ratio = overlap / wrapper_area
375
+ overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
385
376
 
386
377
  conf_diff = wrapper.confidence - regular.confidence
387
378
 
@@ -421,8 +412,7 @@ class LayoutPostprocessor:
421
412
  # Rule 2: CODE vs others
422
413
  if candidate.label == DocItemLabel.CODE:
423
414
  # Calculate how much of the other cluster is contained within the CODE cluster
424
- overlap = other.bbox.intersection_area_with(candidate.bbox)
425
- containment = overlap / other.bbox.area()
415
+ containment = other.bbox.intersection_over_self(candidate.bbox)
426
416
  if containment > 0.8: # other is 80% contained within CODE
427
417
  return True
428
418
 
@@ -586,11 +576,9 @@ class LayoutPostprocessor:
586
576
  if cell.rect.to_bounding_box().area() <= 0:
587
577
  continue
588
578
 
589
- overlap = cell.rect.to_bounding_box().intersection_area_with(
579
+ overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
590
580
  cluster.bbox
591
581
  )
592
- overlap_ratio = overlap / cell.rect.to_bounding_box().area()
593
-
594
582
  if overlap_ratio > best_overlap:
595
583
  best_overlap = overlap_ratio
596
584
  best_cluster = cluster
@@ -1,3 +1,11 @@
1
+ from typing import Optional, Tuple
2
+
3
+ from docling_core.types.doc import BoundingBox, CoordOrigin
4
+ from docling_core.types.doc.page import BoundingRectangle
5
+
6
+ from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
7
+
8
+
1
9
  def map_tesseract_script(script: str) -> str:
2
10
  r""" """
3
11
  if script == "Katakana" or script == "Hiragana":
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
7
15
  elif script == "Korean":
8
16
  script = "Hangul"
9
17
  return script
18
+
19
+
20
def parse_tesseract_orientation(orientation: str) -> int:
    """Convert a Tesseract orientation into a counterclockwise angle.

    Tesseract reports orientation as one of [0, 90, 180, 270] measured
    clockwise, while bounding rectangle angles are counterclockwise in
    [0, 360[. The input is converted with ``int()``, negated, and wrapped
    modulo 360.

    Args:
        orientation: The orientation value reported by Tesseract.

    Returns:
        The equivalent counterclockwise angle in [0, 360[.

    Raises:
        ValueError: if the parsed value is not in ``CLIPPED_ORIENTATIONS``
            (or if ``int()`` cannot parse the input).
    """
    clockwise = int(orientation)
    if clockwise in CLIPPED_ORIENTATIONS:
        # Flip the rotation direction, then normalise into [0, 360[.
        return (-clockwise) % 360
    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
33
+
34
+
35
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Map a Tesseract box on the processed image to a bounding rectangle.

    The box is rotated by ``-orientation`` with ``rotate_bounding_box``, its
    four corners are divided by ``scale``, and, when ``original_offset`` is
    given, shifted by that box's left/top coordinates.

    Args:
        bbox: Box reported by Tesseract.
        original_offset: Optional box whose ``l``/``t`` are added to every
            corner; it must use a top-left coordinate origin.
        scale: Divisor applied to every corner coordinate.
        orientation: Angle whose negation is passed to ``rotate_bounding_box``.
        im_size: Image size forwarded to ``rotate_bounding_box``.

    Returns:
        The resulting rectangle, tagged with a top-left coordinate origin.

    Raises:
        ValueError: if ``original_offset`` does not use a top-left origin.
    """
    rotated = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
    corner_names = [f"r_{axis}{i}" for i in range(4) for axis in ("x", "y")]
    rect = BoundingRectangle(
        coord_origin=CoordOrigin.TOPLEFT,
        **{name: getattr(rotated, name) / scale for name in corner_names},
    )
    if original_offset is None:
        return rect
    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        raise ValueError(
            f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        )
    # Translate every corner by the offset box's left/top coordinates.
    for i in range(4):
        x_name, y_name = f"r_x{i}", f"r_y{i}"
        setattr(rect, x_name, getattr(rect, x_name) + original_offset.l)
        setattr(rect, y_name, getattr(rect, y_name) + original_offset.t)
    return rect