docling 2.33.0__py3-none-any.whl → 2.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
60
60
  coord_origin=CoordOrigin.BOTTOMLEFT,
61
61
  ).to_top_left_origin(page_height=page_size.height * scale)
62
62
 
63
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
63
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
64
64
 
65
65
  if overlap_frac > 0.5:
66
66
  if len(text_piece) > 0:
@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
71
71
  coord_origin=CoordOrigin.BOTTOMLEFT,
72
72
  ).to_top_left_origin(page_height=page_size.height * scale)
73
73
 
74
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
74
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
75
75
 
76
76
  if overlap_frac > 0.5:
77
77
  if len(text_piece) > 0:
@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
46
46
  .scaled(scale)
47
47
  )
48
48
 
49
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
49
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
50
50
 
51
51
  if overlap_frac > 0.5:
52
52
  if len(text_piece) > 0:
@@ -1,6 +1,9 @@
1
+ import math
2
+ from collections import defaultdict
1
3
  from enum import Enum
2
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
+ from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
3
5
 
6
+ import numpy as np
4
7
  from docling_core.types.doc import (
5
8
  BoundingBox,
6
9
  DocItemLabel,
@@ -16,7 +19,7 @@ from docling_core.types.io import (
16
19
  DocumentStream,
17
20
  )
18
21
  from PIL.Image import Image
19
- from pydantic import BaseModel, ConfigDict
22
+ from pydantic import BaseModel, ConfigDict, Field, computed_field
20
23
 
21
24
  if TYPE_CHECKING:
22
25
  from docling.backend.pdf_backend import PdfPageBackend
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
298
301
  choices: List[OpenAiResponseChoice]
299
302
  created: int
300
303
  usage: OpenAiResponseUsage
304
+
305
+
306
+ # Create a type alias for score values
307
+ ScoreValue = float
308
+
309
+
310
# Coarse quality bucket derived from a confidence score.
# Members are also ``str`` instances, so they compare equal to their
# lowercase string values.
class QualityGrade(str, Enum):
    POOR = "poor"
    FAIR = "fair"
    GOOD = "good"
    EXCELLENT = "excellent"
    # Fallback grade for scores that fall into none of the buckets above.
    UNSPECIFIED = "unspecified"
316
+
317
+
318
# Confidence scores for a single page, plus aggregate scores and grades
# derived from them. Every score defaults to NaN ("not measured"); NaN scores
# are left out of the aggregates, and an aggregate over no measured score is
# itself NaN.
class PageConfidenceScores(BaseModel):
    parse_score: ScoreValue = np.nan
    layout_score: ScoreValue = np.nan
    table_score: ScoreValue = np.nan
    ocr_score: ScoreValue = np.nan

    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
        """Map a score to a grade; a NaN score maps to ``UNSPECIFIED``."""
        # NaN compares false against every threshold, so handle it explicitly
        # instead of relying on falling through the comparison chain.
        if math.isnan(score):
            return QualityGrade.UNSPECIFIED
        if score < 0.5:
            return QualityGrade.POOR
        if score < 0.8:
            return QualityGrade.FAIR
        if score < 0.9:
            return QualityGrade.GOOD
        return QualityGrade.EXCELLENT

    def _measured_scores(self) -> List[ScoreValue]:
        """Return the individual scores that are not NaN."""
        candidates = (
            self.ocr_score,
            self.table_score,
            self.layout_score,
            self.parse_score,
        )
        return [score for score in candidates if not math.isnan(score)]

    # Grade of ``mean_score``.
    @computed_field  # type: ignore
    @property
    def mean_grade(self) -> QualityGrade:
        return self._score_to_grade(self.mean_score)

    # Grade of ``low_score``.
    @computed_field  # type: ignore
    @property
    def low_grade(self) -> QualityGrade:
        return self._score_to_grade(self.low_score)

    # Mean of the measured scores; NaN when nothing was measured.
    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        measured = self._measured_scores()
        if not measured:
            # np.nanmean would return NaN here too, but only after emitting a
            # "Mean of empty slice" RuntimeWarning on every access.
            return ScoreValue(np.nan)
        return ScoreValue(np.mean(measured))

    # 5% quantile of the measured scores; NaN when nothing was measured.
    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        measured = self._measured_scores()
        if not measured:
            # Same reasoning as mean_score: skip the "All-NaN slice"
            # RuntimeWarning that np.nanquantile would emit.
            return ScoreValue(np.nan)
        return ScoreValue(np.quantile(measured, q=0.05))
374
+
375
+
376
# Document-level confidence report: the inherited score fields hold the
# document scores, ``pages`` holds one PageConfidenceScores per page number.
# The aggregate scores here are computed from the per-page aggregates.
class ConfidenceReport(PageConfidenceScores):
    # defaultdict so that a page entry is created on first access.
    pages: Dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

    # Mean of the per-page mean scores, ignoring NaN pages; NaN when there
    # are no pages or no page has a measured score.
    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        measured = [
            score
            for score in (c.mean_score for c in self.pages.values())
            if not math.isnan(score)
        ]
        if not measured:
            # np.nanmean would also return NaN, but with a
            # "Mean of empty slice" RuntimeWarning on every access.
            return ScoreValue(np.nan)
        return ScoreValue(np.mean(measured))

    # Mean of the per-page low scores, ignoring NaN pages; NaN when there
    # are no pages or no page has a measured score.
    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        measured = [
            score
            for score in (c.low_score for c in self.pages.values())
            if not math.isnan(score)
        ]
        if not measured:
            return ScoreValue(np.nan)
        return ScoreValue(np.mean(measured))
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
47
47
  )
48
48
  from docling_core.utils.file import resolve_source_to_stream
49
49
  from docling_core.utils.legacy import docling_document_to_legacy
50
- from pydantic import BaseModel
50
+ from pydantic import BaseModel, Field
51
51
  from typing_extensions import deprecated
52
52
 
53
53
  from docling.backend.abstract_backend import (
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
56
56
  )
57
57
  from docling.datamodel.base_models import (
58
58
  AssembledUnit,
59
+ ConfidenceReport,
59
60
  ConversionStatus,
60
61
  DocumentStream,
61
62
  ErrorItem,
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
201
202
  pages: List[Page] = []
202
203
  assembled: AssembledUnit = AssembledUnit()
203
204
  timings: Dict[str, ProfilingItem] = {}
205
+ confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
204
206
 
205
207
  document: DoclingDocument = _EMPTY_DOCLING_DOC
206
208
 
@@ -5,6 +5,7 @@ from collections.abc import Iterable
5
5
  from pathlib import Path
6
6
  from typing import Optional
7
7
 
8
+ import numpy as np
8
9
  from docling_core.types.doc import DocItemLabel
9
10
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
10
11
  from PIL import Image
@@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
184
185
  ).postprocess()
185
186
  # processed_clusters, processed_cells = clusters, page.cells
186
187
 
188
+ conv_res.confidence.pages[page.page_no].layout_score = float(
189
+ np.mean([c.confidence for c in processed_clusters])
190
+ )
191
+
192
+ conv_res.confidence.pages[page.page_no].ocr_score = float(
193
+ np.mean([c.confidence for c in processed_cells if c.from_ocr])
194
+ )
195
+
187
196
  page.cells = processed_cells
188
197
  page.predictions.layout = LayoutPrediction(
189
198
  clusters=processed_clusters
@@ -3,6 +3,7 @@ import re
3
3
  from collections.abc import Iterable
4
4
  from typing import List
5
5
 
6
+ import numpy as np
6
7
  from pydantic import BaseModel
7
8
 
8
9
  from docling.datamodel.base_models import (
@@ -1,11 +1,13 @@
1
+ import re
1
2
  from collections.abc import Iterable
2
3
  from pathlib import Path
3
4
  from typing import Optional
4
5
 
6
+ import numpy as np
5
7
  from PIL import ImageDraw
6
8
  from pydantic import BaseModel
7
9
 
8
- from docling.datamodel.base_models import Page
10
+ from docling.datamodel.base_models import Page, ScoreValue
9
11
  from docling.datamodel.document import ConversionResult
10
12
  from docling.datamodel.settings import settings
11
13
  from docling.models.base_model import BasePageModel
@@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
21
23
  def __init__(self, options: PagePreprocessingOptions):
22
24
  self.options = options
23
25
 
26
+ # Pre-compiled regex patterns for efficiency
27
+ self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
28
+ self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
29
+ self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
30
+ self.SLASH_NUMBER_GARBAGE_RE = re.compile(
31
+ r"(?:/\w+\s*){2,}"
32
+ ) # Two or more "/token " sequences
33
+
24
34
  def __call__(
25
35
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
26
36
  ) -> Iterable[Page]:
@@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
60
70
  if self.options.create_parsed_page:
61
71
  page.parsed_page = page._backend.get_segmented_page()
62
72
 
73
+ # Rate the text quality from the PDF parser, and aggregate on page
74
+ text_scores = []
75
+ for c in page.cells:
76
+ score = self.rate_text_quality(c.text)
77
+ text_scores.append(score)
78
+
79
+ conv_res.confidence.pages[page.page_no].parse_score = float(
80
+ np.nanquantile(
81
+ text_scores, q=0.10
82
+ ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
83
+ )
84
+
63
85
  # DEBUG code:
64
86
  def draw_text_boxes(image, cells, show: bool = False):
65
87
  draw = ImageDraw.Draw(image)
@@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
88
110
  draw_text_boxes(page.get_image(scale=1.0), page.cells)
89
111
 
90
112
  return page
113
+
114
def rate_text_quality(self, text: str) -> float:
    """Rate the quality of a piece of parsed text on a 0.0-1.0 scale.

    Args:
        text: Text of a single cell as returned by the PDF parser.

    Returns:
        0.0 if the text contains a hard-error marker, otherwise 1.0 minus
        a penalty for fragmented-word patterns, clamped at 0.0.
    """
    # Hard errors: if any of these patterns are found, return 0.0 immediately.
    blacklist_chars = ("\ufffd",)  # U+FFFD REPLACEMENT CHARACTER
    if (
        any(c in text for c in blacklist_chars)
        or self.GLYPH_RE.search(text)
        or self.SLASH_G_RE.search(text)
        # match() anchors at the start of the string, so only text that
        # *begins* with repeated "/token" sequences is rejected here.
        or self.SLASH_NUMBER_GARBAGE_RE.match(text)
    ):
        return 0.0

    # Soft error: apply a penalty only if the fragmented-words pattern
    # occurs at least three times; each occurrence then costs 0.1.
    frag_count = len(self.FRAG_RE.findall(text))
    penalty = 0.1 * frag_count if frag_count >= 3 else 0.0

    return max(1.0 - penalty, 0.0)
@@ -2,6 +2,7 @@ import csv
2
2
  import io
3
3
  import logging
4
4
  import os
5
+ import subprocess
5
6
  import tempfile
6
7
  from collections.abc import Iterable
7
8
  from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
10
11
 
11
12
  import pandas as pd
12
13
  from docling_core.types.doc import BoundingBox, CoordOrigin
13
- from docling_core.types.doc.page import BoundingRectangle, TextCell
14
+ from docling_core.types.doc.page import TextCell
14
15
 
15
16
  from docling.datamodel.base_models import Page
16
17
  from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
21
22
  )
22
23
  from docling.datamodel.settings import settings
23
24
  from docling.models.base_ocr_model import BaseOcrModel
24
- from docling.utils.ocr_utils import map_tesseract_script
25
+ from docling.utils.ocr_utils import (
26
+ map_tesseract_script,
27
+ parse_tesseract_orientation,
28
+ tesseract_box_to_bounding_rectangle,
29
+ )
25
30
  from docling.utils.profiling import TimeRecorder
26
31
 
27
32
  _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
49
54
  self._version: Optional[str] = None
50
55
  self._tesseract_languages: Optional[List[str]] = None
51
56
  self._script_prefix: Optional[str] = None
57
+ self._is_auto: bool = "auto" in self.options.lang
52
58
 
53
59
  if self.enabled:
54
60
  try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
93
99
 
94
100
  return name, version
95
101
 
96
- def _run_tesseract(self, ifilename: str):
102
+ def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
97
103
  r"""
98
104
  Run tesseract CLI
99
105
  """
100
106
  cmd = [self.options.tesseract_cmd]
101
-
102
- if "auto" in self.options.lang:
103
- lang = self._detect_language(ifilename)
107
+ if self._is_auto:
108
+ lang = self._parse_language(osd)
104
109
  if lang is not None:
105
110
  cmd.append("-l")
106
111
  cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
115
120
  cmd += [ifilename, "stdout", "tsv"]
116
121
  _log.info("command: {}".format(" ".join(cmd)))
117
122
 
118
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
119
- output, _ = proc.communicate()
123
+ output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
120
124
 
121
125
  # _log.info(output)
122
126
 
123
127
  # Decode the byte string to a regular string
124
- decoded_data = output.decode("utf-8")
128
+ decoded_data = output.stdout.decode("utf-8")
125
129
  # _log.info(decoded_data)
126
130
 
127
131
  # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
139
143
 
140
144
  return df_filtered
141
145
 
142
- def _detect_language(self, ifilename: str):
146
+ def _perform_osd(self, ifilename: str) -> pd.DataFrame:
143
147
  r"""
144
148
  Run tesseract in PSM 0 mode to detect the language
145
149
  """
146
- assert self._tesseract_languages is not None
147
150
 
148
151
  cmd = [self.options.tesseract_cmd]
149
152
  cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
150
153
  _log.info("command: {}".format(" ".join(cmd)))
151
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
152
- output, _ = proc.communicate()
153
- decoded_data = output.decode("utf-8")
154
+ output = subprocess.run(cmd, capture_output=True, check=True)
155
+ decoded_data = output.stdout.decode("utf-8")
154
156
  df_detected = pd.read_csv(
155
157
  io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
156
158
  )
157
- scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
159
+ return df_detected
160
+
161
+ def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
162
+ assert self._tesseract_languages is not None
163
+ scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
158
164
  if len(scripts) == 0:
159
165
  _log.warning("Tesseract cannot detect the script of the page")
160
166
  return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
182
188
  cmd = [self.options.tesseract_cmd]
183
189
  cmd.append("--list-langs")
184
190
  _log.info("command: {}".format(" ".join(cmd)))
185
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
186
- output, _ = proc.communicate()
187
- decoded_data = output.decode("utf-8")
191
+ output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
192
+ decoded_data = output.stdout.decode("utf-8")
188
193
  df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
189
194
  self._tesseract_languages = df_list[0].tolist()[1:]
190
195
 
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
203
208
  yield from page_batch
204
209
  return
205
210
 
206
- for page in page_batch:
211
+ for page_i, page in enumerate(page_batch):
207
212
  assert page._backend is not None
208
213
  if not page._backend.is_valid():
209
214
  yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
212
217
  ocr_rects = self.get_ocr_rects(page)
213
218
 
214
219
  all_ocr_cells = []
215
- for ocr_rect in ocr_rects:
220
+ for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
216
221
  # Skip zero area boxes
217
222
  if ocr_rect.area() == 0:
218
223
  continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
225
230
  ) as image_file:
226
231
  fname = image_file.name
227
232
  high_res_image.save(image_file)
228
-
229
- df_result = self._run_tesseract(fname)
233
+ doc_orientation = 0
234
+ try:
235
+ df_osd = self._perform_osd(fname)
236
+ doc_orientation = _parse_orientation(df_osd)
237
+ except subprocess.CalledProcessError as exc:
238
+ _log.error(
239
+ "OSD failed (doc %s, page: %s, "
240
+ "OCR rectangle: %s, processed image file %s):\n %s",
241
+ conv_res.input.file,
242
+ page_i,
243
+ ocr_rect_i,
244
+ image_file,
245
+ exc.stderr,
246
+ )
247
+ # Skipping if OSD fail when in auto mode, otherwise proceed
248
+ # to OCR in the hope OCR will succeed while OSD failed
249
+ if self._is_auto:
250
+ continue
251
+ if doc_orientation != 0:
252
+ high_res_image = high_res_image.rotate(
253
+ -doc_orientation, expand=True
254
+ )
255
+ high_res_image.save(fname)
256
+ try:
257
+ df_result = self._run_tesseract(fname, df_osd)
258
+ except subprocess.CalledProcessError as exc:
259
+ _log.error(
260
+ "tesseract OCR failed (doc %s, page: %s, "
261
+ "OCR rectangle: %s, processed image file %s):\n %s",
262
+ conv_res.input.file,
263
+ page_i,
264
+ ocr_rect_i,
265
+ image_file,
266
+ exc.stderr,
267
+ )
268
+ continue
230
269
  finally:
231
270
  if os.path.exists(fname):
232
271
  os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
238
277
  text = row["text"]
239
278
  conf = row["conf"]
240
279
 
241
- l = float(row["left"]) # noqa: E741
242
- b = float(row["top"])
243
- w = float(row["width"])
244
- h = float(row["height"])
245
-
246
- t = b + h
247
- r = l + w
248
-
280
+ left, top = float(row["left"]), float(row["top"])
281
+ right = left + float(row["width"])
282
+ bottom = top + row["height"]
283
+ bbox = BoundingBox(
284
+ l=left,
285
+ t=top,
286
+ r=right,
287
+ b=bottom,
288
+ coord_origin=CoordOrigin.TOPLEFT,
289
+ )
290
+ rect = tesseract_box_to_bounding_rectangle(
291
+ bbox,
292
+ original_offset=ocr_rect,
293
+ scale=self.scale,
294
+ orientation=doc_orientation,
295
+ im_size=high_res_image.size,
296
+ )
249
297
  cell = TextCell(
250
298
  index=ix,
251
299
  text=str(text),
252
300
  orig=str(text),
253
301
  from_ocr=True,
254
302
  confidence=conf / 100.0,
255
- rect=BoundingRectangle.from_bounding_box(
256
- BoundingBox.from_tuple(
257
- coord=(
258
- (l / self.scale) + ocr_rect.l,
259
- (b / self.scale) + ocr_rect.t,
260
- (r / self.scale) + ocr_rect.l,
261
- (t / self.scale) + ocr_rect.t,
262
- ),
263
- origin=CoordOrigin.TOPLEFT,
264
- )
265
- ),
303
+ rect=rect,
266
304
  )
267
305
  all_ocr_cells.append(cell)
268
306
 
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
278
316
  @classmethod
279
317
  def get_options_type(cls) -> Type[OcrOptions]:
280
318
  return TesseractCliOcrOptions
319
+
320
+
321
def _parse_orientation(df_osd: pd.DataFrame) -> int:
    """Extract the document orientation from parsed tesseract OSD output.

    Args:
        df_osd: OSD output as a two-column ("key", "value") frame.

    Returns:
        The angle produced by ``parse_tesseract_orientation`` for the
        "Orientation in degrees" row, or 0 when that row is absent.
    """
    orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
    if not orientations:
        # Without this guard a missing row raises IndexError, which the OCR
        # loop does not catch. Fall back to "no rotation", in the same way a
        # missing script is tolerated when the language is parsed.
        logging.getLogger(__name__).warning(
            "Tesseract cannot detect the orientation of the page"
        )
        return 0
    return parse_tesseract_orientation(orientations[0].strip())
@@ -1,12 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
- from collections.abc import Iterable
5
4
  from pathlib import Path
6
- from typing import Optional, Type
5
+ from typing import Iterable, Optional, Type
7
6
 
8
7
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_core.types.doc.page import BoundingRectangle, TextCell
8
+ from docling_core.types.doc.page import TextCell
10
9
 
11
10
  from docling.datamodel.base_models import Page
12
11
  from docling.datamodel.document import ConversionResult
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
17
16
  )
18
17
  from docling.datamodel.settings import settings
19
18
  from docling.models.base_ocr_model import BaseOcrModel
20
- from docling.utils.ocr_utils import map_tesseract_script
19
+ from docling.utils.ocr_utils import (
20
+ map_tesseract_script,
21
+ parse_tesseract_orientation,
22
+ tesseract_box_to_bounding_rectangle,
23
+ )
21
24
  from docling.utils.profiling import TimeRecorder
22
25
 
23
26
  _log = logging.getLogger(__name__)
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
38
41
  accelerator_options=accelerator_options,
39
42
  )
40
43
  self.options: TesseractOcrOptions
41
-
44
+ self._is_auto: bool = "auto" in self.options.lang
42
45
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
43
46
  self.reader = None
44
47
  self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
95
98
 
96
99
  if lang == "auto":
97
100
  self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
98
- self.osd_reader = tesserocr.PyTessBaseAPI(
99
- **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
100
- )
101
101
  else:
102
102
  self.reader = tesserocr.PyTessBaseAPI(
103
103
  **{"lang": lang} | tesserocr_kwargs,
104
104
  )
105
+ self.osd_reader = tesserocr.PyTessBaseAPI(
106
+ **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
107
+ )
105
108
  self.reader_RIL = tesserocr.RIL
106
109
 
107
110
  def __del__(self):
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
118
121
  yield from page_batch
119
122
  return
120
123
 
121
- for page in page_batch:
124
+ for page_i, page in enumerate(page_batch):
122
125
  assert page._backend is not None
123
126
  if not page._backend.is_valid():
124
127
  yield page
125
128
  else:
126
129
  with TimeRecorder(conv_res, "ocr"):
127
130
  assert self.reader is not None
131
+ assert self.osd_reader is not None
128
132
  assert self._tesserocr_languages is not None
129
133
 
130
134
  ocr_rects = self.get_ocr_rects(page)
131
135
 
132
136
  all_ocr_cells = []
133
- for ocr_rect in ocr_rects:
137
+ for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
134
138
  # Skip zero area boxes
135
139
  if ocr_rect.area() == 0:
136
140
  continue
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
139
143
  )
140
144
 
141
145
  local_reader = self.reader
142
- if "auto" in self.options.lang:
143
- assert self.osd_reader is not None
144
-
145
- self.osd_reader.SetImage(high_res_image)
146
- osd = self.osd_reader.DetectOrientationScript()
147
-
148
- # No text, probably
149
- if osd is None:
146
+ self.osd_reader.SetImage(high_res_image)
147
+ osd = self.osd_reader.DetectOrientationScript()
148
+ # No text, or Orientation and Script detection failure
149
+ if osd is None:
150
+ _log.error(
151
+ "OSD failed for doc (doc %s, page: %s, "
152
+ "OCR rectangle: %s)",
153
+ conv_res.input.file,
154
+ page_i,
155
+ ocr_rect_i,
156
+ )
157
+ # Skipping if OSD fail when in auto mode, otherwise proceed
158
+ # to OCR in the hope OCR will succeed while OSD failed
159
+ if self._is_auto:
150
160
  continue
151
-
161
+ doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
162
+ if doc_orientation != 0:
163
+ high_res_image = high_res_image.rotate(
164
+ -doc_orientation, expand=True
165
+ )
166
+ if self._is_auto:
152
167
  script = osd["script_name"]
153
168
  script = map_tesseract_script(script)
154
169
  lang = f"{self.script_prefix}{script}"
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
188
203
  # Extract text within the bounding box
189
204
  text = local_reader.GetUTF8Text().strip()
190
205
  confidence = local_reader.MeanTextConf()
191
- left = box["x"] / self.scale
192
- bottom = box["y"] / self.scale
193
- right = (box["x"] + box["w"]) / self.scale
194
- top = (box["y"] + box["h"]) / self.scale
195
-
206
+ left, top = box["x"], box["y"]
207
+ right = left + box["w"]
208
+ bottom = top + box["h"]
209
+ bbox = BoundingBox(
210
+ l=left,
211
+ t=top,
212
+ r=right,
213
+ b=bottom,
214
+ coord_origin=CoordOrigin.TOPLEFT,
215
+ )
216
+ rect = tesseract_box_to_bounding_rectangle(
217
+ bbox,
218
+ original_offset=ocr_rect,
219
+ scale=self.scale,
220
+ orientation=doc_orientation,
221
+ im_size=high_res_image.size,
222
+ )
196
223
  cells.append(
197
224
  TextCell(
198
225
  index=ix,
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
200
227
  orig=text,
201
228
  from_ocr=True,
202
229
  confidence=confidence,
203
- rect=BoundingRectangle.from_bounding_box(
204
- BoundingBox.from_tuple(
205
- coord=(left, top, right, bottom),
206
- origin=CoordOrigin.TOPLEFT,
207
- ),
208
- ),
230
+ rect=rect,
209
231
  )
210
232
  )
211
233
 
@@ -3,11 +3,12 @@ import warnings
3
3
  from pathlib import Path
4
4
  from typing import Optional, cast
5
5
 
6
+ import numpy as np
6
7
  from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
7
8
 
8
9
  from docling.backend.abstract_backend import AbstractDocumentBackend
9
10
  from docling.backend.pdf_backend import PdfDocumentBackend
10
- from docling.datamodel.base_models import AssembledUnit, Page
11
+ from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
11
12
  from docling.datamodel.document import ConversionResult
12
13
  from docling.datamodel.pipeline_options import PdfPipelineOptions
13
14
  from docling.datamodel.settings import settings
@@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
60
61
  or self.pipeline_options.generate_table_images
61
62
  )
62
63
 
63
- self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
64
+ self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
64
65
 
65
66
  ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
66
67
 
@@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
197
198
  elements=all_elements, headers=all_headers, body=all_body
198
199
  )
199
200
 
200
- conv_res.document = self.glm_model(conv_res)
201
+ conv_res.document = self.reading_order_model(conv_res)
201
202
 
202
203
  # Generate page images in the output
203
204
  if self.pipeline_options.generate_page_images:
@@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
244
245
  cropped_im, dpi=int(72 * scale)
245
246
  )
246
247
 
248
+ # Aggregate confidence values for document:
249
+ if len(conv_res.pages) > 0:
250
+ conv_res.confidence.layout_score = float(
251
+ np.nanmean(
252
+ [c.layout_score for c in conv_res.confidence.pages.values()]
253
+ )
254
+ )
255
+ conv_res.confidence.parse_score = float(
256
+ np.nanquantile(
257
+ [c.parse_score for c in conv_res.confidence.pages.values()],
258
+ q=0.1, # parse score should relate to worst 10% of pages.
259
+ )
260
+ )
261
+ conv_res.confidence.table_score = float(
262
+ np.nanmean(
263
+ [c.table_score for c in conv_res.confidence.pages.values()]
264
+ )
265
+ )
266
+ conv_res.confidence.ocr_score = float(
267
+ np.nanmean(
268
+ [c.ocr_score for c in conv_res.confidence.pages.values()]
269
+ )
270
+ )
271
+
247
272
  return conv_res
248
273
 
249
274
  @classmethod
@@ -90,17 +90,12 @@ class SpatialClusterIndex:
90
90
  containment_threshold: float,
91
91
  ) -> bool:
92
92
  """Check if two bboxes overlap sufficiently."""
93
- area1, area2 = bbox1.area(), bbox2.area()
94
- if area1 <= 0 or area2 <= 0:
93
+ if bbox1.area() <= 0 or bbox2.area() <= 0:
95
94
  return False
96
95
 
97
- overlap_area = bbox1.intersection_area_with(bbox2)
98
- if overlap_area <= 0:
99
- return False
100
-
101
- iou = overlap_area / (area1 + area2 - overlap_area)
102
- containment1 = overlap_area / area1
103
- containment2 = overlap_area / area2
96
+ iou = bbox1.intersection_over_union(bbox2)
97
+ containment1 = bbox1.intersection_over_self(bbox2)
98
+ containment2 = bbox2.intersection_over_self(bbox1)
104
99
 
105
100
  return (
106
101
  iou > overlap_threshold
@@ -321,11 +316,9 @@ class LayoutPostprocessor:
321
316
  for special in special_clusters:
322
317
  contained = []
323
318
  for cluster in self.regular_clusters:
324
- overlap = cluster.bbox.intersection_area_with(special.bbox)
325
- if overlap > 0:
326
- containment = overlap / cluster.bbox.area()
327
- if containment > 0.8:
328
- contained.append(cluster)
319
+ containment = cluster.bbox.intersection_over_self(special.bbox)
320
+ if containment > 0.8:
321
+ contained.append(cluster)
329
322
 
330
323
  if contained:
331
324
  # Sort contained clusters by minimum cell ID:
@@ -379,9 +372,7 @@ class LayoutPostprocessor:
379
372
  for regular in self.regular_clusters:
380
373
  if regular.label == DocItemLabel.TABLE:
381
374
  # Calculate overlap
382
- overlap = regular.bbox.intersection_area_with(wrapper.bbox)
383
- wrapper_area = wrapper.bbox.area()
384
- overlap_ratio = overlap / wrapper_area
375
+ overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
385
376
 
386
377
  conf_diff = wrapper.confidence - regular.confidence
387
378
 
@@ -421,8 +412,7 @@ class LayoutPostprocessor:
421
412
  # Rule 2: CODE vs others
422
413
  if candidate.label == DocItemLabel.CODE:
423
414
  # Calculate how much of the other cluster is contained within the CODE cluster
424
- overlap = other.bbox.intersection_area_with(candidate.bbox)
425
- containment = overlap / other.bbox.area()
415
+ containment = other.bbox.intersection_over_self(candidate.bbox)
426
416
  if containment > 0.8: # other is 80% contained within CODE
427
417
  return True
428
418
 
@@ -586,11 +576,9 @@ class LayoutPostprocessor:
586
576
  if cell.rect.to_bounding_box().area() <= 0:
587
577
  continue
588
578
 
589
- overlap = cell.rect.to_bounding_box().intersection_area_with(
579
+ overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
590
580
  cluster.bbox
591
581
  )
592
- overlap_ratio = overlap / cell.rect.to_bounding_box().area()
593
-
594
582
  if overlap_ratio > best_overlap:
595
583
  best_overlap = overlap_ratio
596
584
  best_cluster = cluster
@@ -1,3 +1,11 @@
1
+ from typing import Optional, Tuple
2
+
3
+ from docling_core.types.doc import BoundingBox, CoordOrigin
4
+ from docling_core.types.doc.page import BoundingRectangle
5
+
6
+ from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
7
+
8
+
1
9
  def map_tesseract_script(script: str) -> str:
2
10
  r""" """
3
11
  if script == "Katakana" or script == "Hiragana":
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
7
15
  elif script == "Korean":
8
16
  script = "Hangul"
9
17
  return script
18
+
19
+
20
def parse_tesseract_orientation(orientation: str) -> int:
    """Convert a tesseract orientation into a bounding-rectangle angle.

    Tesseract orientation is one of [0, 90, 180, 270] clockwise, while
    bounding rectangle angles are in [0, 360[ counterclockwise, so the value
    is negated and wrapped.

    Raises:
        ValueError: If the orientation is not one of ``CLIPPED_ORIENTATIONS``
            (or cannot be converted to ``int``).
    """
    clockwise_degrees = int(orientation)
    if clockwise_degrees in CLIPPED_ORIENTATIONS:
        return (-clockwise_degrees) % 360
    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
33
+
34
+
35
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Turn a tesseract box into a bounding rectangle.

    The box is rotated by ``-orientation`` within an image of ``im_size``,
    every corner is divided by ``scale``, and, when ``original_offset`` is
    given, its left/top are added to the x/y coordinates.

    Raises:
        ValueError: If ``original_offset`` is not in TOPLEFT coordinates.
    """
    rotated = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
    x_fields = ("r_x0", "r_x1", "r_x2", "r_x3")
    y_fields = ("r_y0", "r_y1", "r_y2", "r_y3")

    # Scale every corner of the rotated rectangle.
    scaled_corners = {
        field: getattr(rotated, field) / scale for field in x_fields + y_fields
    }
    rect = BoundingRectangle(coord_origin=CoordOrigin.TOPLEFT, **scaled_corners)

    if original_offset is None:
        return rect

    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        raise ValueError(
            f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        )

    # Shift the rectangle by the offset's left/top.
    for field in x_fields:
        setattr(rect, field, getattr(rect, field) + original_offset.l)
    for field in y_fields:
        setattr(rect, field, getattr(rect, field) + original_offset.t)
    return rect
@@ -0,0 +1,71 @@
1
+ from typing import Tuple
2
+
3
+ from docling_core.types.doc import BoundingBox, CoordOrigin
4
+ from docling_core.types.doc.page import BoundingRectangle
5
+
6
# Rotation angles, in degrees, that rotate_bounding_box accepts; it lists
# these values in the error message raised for any other angle.
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
7
+
8
+
9
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Rotate an axis-aligned box by a multiple of 90 degrees.

    The box is first converted to a top-left origin. The resulting rectangle
    starts with ``r_0`` at the box's bottom-left corner and lists the other
    corners going counterclockwise around the box.

    Args:
        bbox: Box to rotate (left, top, width, height are read after the
            conversion to TOPLEFT coordinates).
        angle: Rotation in degrees. It is reduced modulo 360 and must then be
            one of ``CLIPPED_ORIENTATIONS``.
        im_size: Image size as ``(width, height)``; the second element is the
            page height used for the origin conversion.
            NOTE(review): whether this is the size before or after rotation
            is not visible from this function - confirm against callers.

    Returns:
        A ``BoundingRectangle`` in TOPLEFT coordinates.

    Raises:
        ValueError: If ``angle % 360`` is not 0, 90, 180 or 270.
    """
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
    # im_size[1] is the page height (see the conversion above), so the tuple
    # is (width, height). The names now reflect that; values are unchanged.
    im_w, im_h = im_size
    angle = angle % 360
    if angle == 0:
        r_x0, r_y0 = left, top + height
        r_x1, r_y1 = r_x0 + width, r_y0
        r_x2, r_y2 = r_x0 + width, r_y0 - height
        r_x3, r_y3 = r_x0, r_y0 - height
    elif angle == 90:
        r_x0, r_y0 = im_h - (top + height), left
        r_x1, r_y1 = r_x0, r_y0 + width
        r_x2, r_y2 = r_x0 + height, r_y0 + width
        # Fourth corner closes the rectangle next to r_0. It previously
        # duplicated r_1, which collapsed the rectangle.
        r_x3, r_y3 = r_x0 + height, r_y0
    elif angle == 180:
        r_x0, r_y0 = im_w - left, im_h - (top + height)
        r_x1, r_y1 = r_x0 - width, r_y0
        r_x2, r_y2 = r_x0 - width, r_y0 + height
        r_x3, r_y3 = r_x0, r_y0 + height
    elif angle == 270:
        r_x0, r_y0 = top + height, im_w - left
        r_x1, r_y1 = r_x0, r_y0 - width
        r_x2, r_y2 = r_x0 - height, r_y0 - width
        r_x3, r_y3 = r_x0 - height, r_y0
    else:
        msg = (
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )
        raise ValueError(msg)
    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.33.0
3
+ Version: 2.34.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -3,9 +3,9 @@ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
4
4
  docling/backend/asciidoc_backend.py,sha256=W-4MRcID6AU9Ax23q8FwDwGG-OOCrBoqcNf2Ch_WPUc,14041
5
5
  docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
6
- docling/backend/docling_parse_backend.py,sha256=V_CsUdN5RkGQBBq7A_ReAiUW4CQVh0-1Ur157Ozurdg,8017
7
- docling/backend/docling_parse_v2_backend.py,sha256=6fokgqb1hMbZua33gL46EFamrwPTC7ms6ZuEHw-Dv28,9395
8
- docling/backend/docling_parse_v4_backend.py,sha256=-WJZs0IsdN6blhkvTS1eh_qhujYLyJ3XcOMqS6AaXxg,6282
6
+ docling/backend/docling_parse_backend.py,sha256=bVSPmmiVXdCVfe-eLtDhbPQKBjkFR8rZJoRxdWIMdYU,7998
7
+ docling/backend/docling_parse_v2_backend.py,sha256=R4YPCEs72GYg-Xc9VfizPv8QjtGmKOsQzVPNAU2RIK0,9376
8
+ docling/backend/docling_parse_v4_backend.py,sha256=aWh-fd-lnuRGVGC_DG17QUptIsArv5V1gJo8QFbB5Ys,6263
9
9
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
@@ -28,8 +28,8 @@ docling/cli/main.py,sha256=D7WEY4x6pQCVFRy3peK9KUDOb0Y5IVc-vTDqPnHPK00,26138
28
28
  docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
29
29
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
30
30
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- docling/datamodel/base_models.py,sha256=3BmGoV2HLXOWFuRHFAa42YnWceh-JEpcLXzfFz9AD9Y,7943
32
- docling/datamodel/document.py,sha256=rxu5SqUppgNtYWwIbBoLlZvYAn_w2cGn5uq4AsvouSc,15907
31
+ docling/datamodel/base_models.py,sha256=QJlzGJKUAO0kqM6DO2RZKlFi-lL2MpY8qt3Wdm02Slw,10460
32
+ docling/datamodel/document.py,sha256=lvdCw36iykfSHqapvwRVD2pdnR9vmnYRfrGFNJuwbug,16011
33
33
  docling/datamodel/pipeline_options.py,sha256=uwjBvK4egrgcF1_w4B5EDxpGnl4IgBzmxP7dJ7zm394,13400
34
34
  docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
35
35
  docling/document_converter.py,sha256=PRRr65nigQ3LZDl4G2fBMkOtJyswT7xyGt7fpUeDO3w,13849
@@ -47,10 +47,10 @@ docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0
47
47
  docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
48
48
  docling/models/hf_mlx_model.py,sha256=B_B4hFU-jU0g_DQtQD8w4Ejorn10mkDuFI93wR_WhGk,4897
49
49
  docling/models/hf_vlm_model.py,sha256=SiPMTLghMUjJ66dA2yN4UujpLO6PiOhLEPInWtXV_5s,6912
50
- docling/models/layout_model.py,sha256=0fiJXJ4aPmcMsYY7rbN9LJ2mZ0_8G0ODY9kyNTAN3Ws,7823
50
+ docling/models/layout_model.py,sha256=0Ro7IStAF8ACZLuKu7Gi9Cu96_TvGdxoHSYpz05nHVo,8212
51
51
  docling/models/ocr_mac_model.py,sha256=A3TlEbvvwhkWiq9YARos3Y9yNcpPYQ7JGc_4hFtAK-8,5370
52
- docling/models/page_assemble_model.py,sha256=GO7JI1D6T6EkSW94cLQobPGNQUahkxQqTPRwj5CnmFE,6304
53
- docling/models/page_preprocessing_model.py,sha256=6pOGXiFQ-oz06UmJdcaYMdVyfZ0YVLWS6efGcx7Mxws,3105
52
+ docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
53
+ docling/models/page_preprocessing_model.py,sha256=FiPDMmkC1EWTxDjTGbJZH0ZMyXxIOCZDN4qHfoOEfuw,4998
54
54
  docling/models/picture_description_api_model.py,sha256=kCuAFOGEuI5QsRul7Pc1LccxWN7WIvIUhXEmSICYegw,2332
55
55
  docling/models/picture_description_base_model.py,sha256=FbBVXzAOB87xpJN28tuGCxoAdcf6mZNUOqJR7ljUg5g,2946
56
56
  docling/models/picture_description_vlm_model.py,sha256=DiTjnehVy1n0N04xPUvZl8rx4TiNHzHn9Cnzy_ePGts,4177
@@ -59,12 +59,12 @@ docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurH
59
59
  docling/models/rapid_ocr_model.py,sha256=Tq_1Egu5Hjx7Y69Vox17QTtRXztSyflB1fhN08CWQwY,5894
60
60
  docling/models/readingorder_model.py,sha256=S9ru2ApY9sE-Uue3hptWHmbmElwo36bUbAikxCFpHYs,14574
61
61
  docling/models/table_structure_model.py,sha256=1gxLaooK0IKMrnmS8nT1BItKqt1GAKghfpmLKb3i53g,12566
62
- docling/models/tesseract_ocr_cli_model.py,sha256=LXYUCMQAPxQA2pY3zs9wcPSrAHHorTffSmIIWgltoaw,10234
63
- docling/models/tesseract_ocr_model.py,sha256=72009TJL_7tXTEnhlsGRiw_KibrQ0LjZlCBtW8NtwUc,9339
62
+ docling/models/tesseract_ocr_cli_model.py,sha256=e55MkaDdsseKcfX5lxIt0iv5jR6pDFBzWBZHTvl2Jws,12653
63
+ docling/models/tesseract_ocr_model.py,sha256=vS4And5NHe_uLNb6ZBi2CQzWUITBdc1E1zlsojrSZpM,10561
64
64
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
65
  docling/pipeline/base_pipeline.py,sha256=DnuxAf7EQusdSRae0QUVth-0f2mSff8JZjX-2vazk00,8751
66
66
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
67
- docling/pipeline/standard_pdf_pipeline.py,sha256=iNZMMGiHTwV6I4u_jjqXhVJ_DiPn_O9qnnee3PQxidc,10773
67
+ docling/pipeline/standard_pdf_pipeline.py,sha256=wCq0zq8xkiOdNnAEkSuJeELnBjOkLBASD9iQ5mVsSfc,11869
68
68
  docling/pipeline/vlm_pipeline.py,sha256=ZW1WGd6jeLqTCWR0S0cj6H_qVMUXELaFCrJVpvZp6Co,9684
69
69
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
70
70
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,15 +72,16 @@ docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExT
72
72
  docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
73
73
  docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
74
74
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
75
- docling/utils/layout_postprocessor.py,sha256=x7exVG3HYzV9M_O78FfyoG43Y2L7PPMMydvSNwjqh8s,24528
75
+ docling/utils/layout_postprocessor.py,sha256=3WCmkPsPJ80xfWzAUeWb5L9BmuwJ79ztctvbbUs8AfI,24068
76
76
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
77
77
  docling/utils/model_downloader.py,sha256=ocvud3G3qlBQhzMo69Q3RJMnvq5HPZ2DwNbMuEp8RCs,4142
78
- docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
78
+ docling/utils/ocr_utils.py,sha256=AOaDAHr5S74d-IRVR_LKhKynUTIurAwLJ3wNeY58gPA,2326
79
+ docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,2011
79
80
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
80
81
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
81
82
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
82
- docling-2.33.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
83
- docling-2.33.0.dist-info/METADATA,sha256=FXLSejKn7Fc1pY2Fl8YvLP1PYvldoQ76d6zcupzghDo,10138
84
- docling-2.33.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
85
- docling-2.33.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
86
- docling-2.33.0.dist-info/RECORD,,
83
+ docling-2.34.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
84
+ docling-2.34.0.dist-info/METADATA,sha256=s1PANBKtKOaJPgUhrSpeiN0z-8Jx5VvplXLo-7z0sfs,10138
85
+ docling-2.34.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
86
+ docling-2.34.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
87
+ docling-2.34.0.dist-info/RECORD,,