docling 2.33.0__py3-none-any.whl → 2.35.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/backend/docling_parse_backend.py CHANGED
@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
  coord_origin=CoordOrigin.BOTTOMLEFT,
  ).to_top_left_origin(page_height=page_size.height * scale)
 
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
 
  if overlap_frac > 0.5:
  if len(text_piece) > 0:
docling/backend/docling_parse_v2_backend.py CHANGED
@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
  coord_origin=CoordOrigin.BOTTOMLEFT,
  ).to_top_left_origin(page_height=page_size.height * scale)
 
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
 
  if overlap_frac > 0.5:
  if len(text_piece) > 0:
docling/backend/docling_parse_v4_backend.py CHANGED
@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
  .scaled(scale)
  )
 
- overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
 
  if overlap_frac > 0.5:
  if len(text_piece) > 0:
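
Note on the change above: all three docling-parse backends drop the hand-written ratio in favour of docling-core's intersection_over_self, which requires docling-core >= 2.31.2 as pinned in METADATA below. A minimal sketch of the assumed equivalence (intersection_over_self is taken to be the intersection area divided by the area of the box it is called on; this equivalence is an assumption based on the name, not stated in the diff):

    # Hedged sketch: assumed equivalence of the old and new overlap computation.
    from docling_core.types.doc import BoundingBox, CoordOrigin

    cell_bbox = BoundingBox(l=0, t=0, r=10, b=10, coord_origin=CoordOrigin.TOPLEFT)
    bbox = BoundingBox(l=5, t=0, r=20, b=10, coord_origin=CoordOrigin.TOPLEFT)

    # 2.33.0 behaviour: explicit ratio of intersection area to the cell's own area.
    old_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
    # 2.35.0 behaviour: same ratio delegated to docling-core.
    new_frac = cell_bbox.intersection_over_self(bbox)

    print(old_frac, new_frac)  # both expected to print 0.5 for this pair of boxes
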
docling/cli/main.py CHANGED
@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
  import rich.table
  import typer
+ from docling_core.transforms.serializer.html import (
+ HTMLDocSerializer,
+ HTMLOutputStyle,
+ HTMLParams,
+ )
+ from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
  from docling_core.types.doc import ImageRefMode
  from docling_core.utils.file import resolve_source_to_path
  from pydantic import TypeAdapter
@@ -156,6 +162,7 @@ def export_documents(
  export_json: bool,
  export_html: bool,
  export_html_split_page: bool,
+ show_layout: bool,
  export_md: bool,
  export_txt: bool,
  export_doctags: bool,
@@ -189,9 +196,27 @@ def export_documents(
  if export_html_split_page:
  fname = output_dir / f"{doc_filename}.html"
  _log.info(f"writing HTML output to {fname}")
- conv_res.document.save_as_html(
- filename=fname, image_mode=image_export_mode, split_page_view=True
- )
+ if show_layout:
+ ser = HTMLDocSerializer(
+ doc=conv_res.document,
+ params=HTMLParams(
+ image_mode=image_export_mode,
+ output_style=HTMLOutputStyle.SPLIT_PAGE,
+ ),
+ )
+ visualizer = LayoutVisualizer()
+ visualizer.params.show_label = False
+ ser_res = ser.serialize(
+ visualizer=visualizer,
+ )
+ with open(fname, "w") as fw:
+ fw.write(ser_res.text)
+ else:
+ conv_res.document.save_as_html(
+ filename=fname,
+ image_mode=image_export_mode,
+ split_page_view=True,
+ )
 
  # Export Text format:
  if export_txt:
@@ -250,6 +275,13 @@ def convert( # noqa: C901
  to_formats: List[OutputFormat] = typer.Option(
  None, "--to", help="Specify output formats. Defaults to Markdown."
  ),
+ show_layout: Annotated[
+ bool,
+ typer.Option(
+ ...,
+ help="If enabled, the page images will show the bounding-boxes of the items.",
+ ),
+ ] = False,
  headers: str = typer.Option(
  None,
  "--headers",
@@ -596,6 +628,7 @@ def convert( # noqa: C901
  export_json=export_json,
  export_html=export_html,
  export_html_split_page=export_html_split_page,
+ show_layout=show_layout,
  export_md=export_md,
  export_txt=export_txt,
  export_doctags=export_doctags,
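
The new show_layout option routes the split-page HTML export through docling-core's HTML serializer with a layout visualizer instead of save_as_html. A hedged sketch of the equivalent programmatic call, mirroring the added CLI branch (the input file name and the embedded image mode are illustrative choices, not part of the diff):

    from docling.document_converter import DocumentConverter
    from docling_core.transforms.serializer.html import (
        HTMLDocSerializer,
        HTMLOutputStyle,
        HTMLParams,
    )
    from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
    from docling_core.types.doc import ImageRefMode

    conv_res = DocumentConverter().convert("example.pdf")  # hypothetical input

    ser = HTMLDocSerializer(
        doc=conv_res.document,
        params=HTMLParams(
            image_mode=ImageRefMode.EMBEDDED,
            output_style=HTMLOutputStyle.SPLIT_PAGE,
        ),
    )
    visualizer = LayoutVisualizer()
    visualizer.params.show_label = False  # draw bounding boxes without label text

    with open("example.html", "w") as fw:
        fw.write(ser.serialize(visualizer=visualizer).text)
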
docling/datamodel/base_models.py CHANGED
@@ -1,6 +1,9 @@
+ import math
+ from collections import defaultdict
  from enum import Enum
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
 
+ import numpy as np
  from docling_core.types.doc import (
  BoundingBox,
  DocItemLabel,
@@ -16,7 +19,7 @@ from docling_core.types.io import (
  DocumentStream,
  )
  from PIL.Image import Image
- from pydantic import BaseModel, ConfigDict
+ from pydantic import BaseModel, ConfigDict, Field, computed_field
 
  if TYPE_CHECKING:
  from docling.backend.pdf_backend import PdfPageBackend
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
  choices: List[OpenAiResponseChoice]
  created: int
  usage: OpenAiResponseUsage
+
+
+ # Create a type alias for score values
+ ScoreValue = float
+
+
+ class QualityGrade(str, Enum):
+ POOR = "poor"
+ FAIR = "fair"
+ GOOD = "good"
+ EXCELLENT = "excellent"
+ UNSPECIFIED = "unspecified"
+
+
+ class PageConfidenceScores(BaseModel):
+ parse_score: ScoreValue = np.nan
+ layout_score: ScoreValue = np.nan
+ table_score: ScoreValue = np.nan
+ ocr_score: ScoreValue = np.nan
+
+ def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
+ if score < 0.5:
+ return QualityGrade.POOR
+ elif score < 0.8:
+ return QualityGrade.FAIR
+ elif score < 0.9:
+ return QualityGrade.GOOD
+ elif score >= 0.9:
+ return QualityGrade.EXCELLENT
+
+ return QualityGrade.UNSPECIFIED
+
+ @computed_field # type: ignore
+ @property
+ def mean_grade(self) -> QualityGrade:
+ return self._score_to_grade(self.mean_score)
+
+ @computed_field # type: ignore
+ @property
+ def low_grade(self) -> QualityGrade:
+ return self._score_to_grade(self.low_score)
+
+ @computed_field # type: ignore
+ @property
+ def mean_score(self) -> ScoreValue:
+ return ScoreValue(
+ np.nanmean(
+ [
+ self.ocr_score,
+ self.table_score,
+ self.layout_score,
+ self.parse_score,
+ ]
+ )
+ )
+
+ @computed_field # type: ignore
+ @property
+ def low_score(self) -> ScoreValue:
+ return ScoreValue(
+ np.nanquantile(
+ [
+ self.ocr_score,
+ self.table_score,
+ self.layout_score,
+ self.parse_score,
+ ],
+ q=0.05,
+ )
+ )
+
+
+ class ConfidenceReport(PageConfidenceScores):
+ pages: Dict[int, PageConfidenceScores] = Field(
+ default_factory=lambda: defaultdict(PageConfidenceScores)
+ )
+
+ @computed_field # type: ignore
+ @property
+ def mean_score(self) -> ScoreValue:
+ return ScoreValue(
+ np.nanmean(
+ [c.mean_score for c in self.pages.values()],
+ )
+ )
+
+ @computed_field # type: ignore
+ @property
+ def low_score(self) -> ScoreValue:
+ return ScoreValue(
+ np.nanmean(
+ [c.low_score for c in self.pages.values()],
+ )
+ )
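
The new classes feed a per-document confidence report that is attached to every conversion result (see the ConversionResult change in docling/datamodel/document.py below). A short usage sketch, assuming a standard PDF conversion with an illustrative input file:

    from docling.document_converter import DocumentConverter

    conv_res = DocumentConverter().convert("example.pdf")  # hypothetical input

    print(conv_res.confidence.mean_grade)  # e.g. QualityGrade.GOOD
    print(conv_res.confidence.low_grade)   # grade derived from the low (5% quantile) score
    for page_no, scores in conv_res.confidence.pages.items():
        print(page_no, scores.parse_score, scores.layout_score, scores.ocr_score)
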
docling/datamodel/document.py CHANGED
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
  )
  from docling_core.utils.file import resolve_source_to_stream
  from docling_core.utils.legacy import docling_document_to_legacy
- from pydantic import BaseModel
+ from pydantic import BaseModel, Field
  from typing_extensions import deprecated
 
  from docling.backend.abstract_backend import (
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
  )
  from docling.datamodel.base_models import (
  AssembledUnit,
+ ConfidenceReport,
  ConversionStatus,
  DocumentStream,
  ErrorItem,
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
  pages: List[Page] = []
  assembled: AssembledUnit = AssembledUnit()
  timings: Dict[str, ProfilingItem] = {}
+ confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
 
  document: DoclingDocument = _EMPTY_DOCLING_DOC
 
@@ -332,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
  ) -> Optional[InputFormat]:
  """Guess the input format of a document by checking part of its content."""
  input_format: Optional[InputFormat] = None
- content_str = content.decode("utf-8")
 
  if mime == "application/xml":
+ content_str = content.decode("utf-8")
  match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
  if match_doctype:
  xml_doctype = match_doctype.group()
@@ -356,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
  input_format = InputFormat.XML_JATS
 
  elif mime == "text/plain":
+ content_str = content.decode("utf-8")
  if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
  input_format = InputFormat.XML_USPTO
 
@@ -409,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
  else:
  return "application/xml"
 
- if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+ if re.match(
+ r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
+ content_str,
+ re.DOTALL,
+ ):
  return "text/html"
 
  p = re.compile(
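
The relaxed HTML sniffing above now tolerates a leading <script> element before the usual HTML markers. A small standalone illustration of the two patterns (the sample string is made up):

    import re

    old_pattern = r"<!doctype\s+html|<html|<head|<body"
    new_pattern = r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)"
    sample = "<script>var x = 1;</script><!doctype html><html><body>hi</body></html>"

    print(bool(re.match(old_pattern, sample)))              # False: leading script blocks the match
    print(bool(re.match(new_pattern, sample, re.DOTALL)))   # True: recognized as text/html
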
docling/models/layout_model.py CHANGED
@@ -5,6 +5,7 @@ from collections.abc import Iterable
  from pathlib import Path
  from typing import Optional
 
+ import numpy as np
  from docling_core.types.doc import DocItemLabel
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
  from PIL import Image
@@ -184,6 +185,24 @@ class LayoutModel(BasePageModel):
  ).postprocess()
  # processed_clusters, processed_cells = clusters, page.cells
 
+ with warnings.catch_warnings():
+ warnings.filterwarnings(
+ "ignore",
+ "Mean of empty slice|invalid value encountered in scalar divide",
+ RuntimeWarning,
+ "numpy",
+ )
+
+ conv_res.confidence.pages[page.page_no].layout_score = float(
+ np.mean([c.confidence for c in processed_clusters])
+ )
+
+ conv_res.confidence.pages[page.page_no].ocr_score = float(
+ np.mean(
+ [c.confidence for c in processed_cells if c.from_ocr]
+ )
+ )
+
  page.cells = processed_cells
  page.predictions.layout = LayoutPrediction(
  clusters=processed_clusters
docling/models/page_assemble_model.py CHANGED
@@ -3,6 +3,7 @@ import re
  from collections.abc import Iterable
  from typing import List
 
+ import numpy as np
  from pydantic import BaseModel
 
  from docling.datamodel.base_models import (
docling/models/page_preprocessing_model.py CHANGED
@@ -1,7 +1,10 @@
+ import re
+ import warnings
  from collections.abc import Iterable
  from pathlib import Path
  from typing import Optional
 
+ import numpy as np
  from PIL import ImageDraw
  from pydantic import BaseModel
 
@@ -21,6 +24,14 @@ class PagePreprocessingModel(BasePageModel):
  def __init__(self, options: PagePreprocessingOptions):
  self.options = options
 
+ # Pre-compiled regex patterns for efficiency
+ self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+ self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
+ self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
+ self.SLASH_NUMBER_GARBAGE_RE = re.compile(
+ r"(?:/\w+\s*){2,}"
+ ) # Two or more "/token " sequences
+
  def __call__(
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
  ) -> Iterable[Page]:
@@ -60,6 +71,22 @@ class PagePreprocessingModel(BasePageModel):
  if self.options.create_parsed_page:
  page.parsed_page = page._backend.get_segmented_page()
 
+ # Rate the text quality from the PDF parser, and aggregate on page
+ text_scores = []
+ for c in page.cells:
+ score = self.rate_text_quality(c.text)
+ text_scores.append(score)
+
+ with warnings.catch_warnings():
+ warnings.filterwarnings(
+ "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
+ )
+ conv_res.confidence.pages[page.page_no].parse_score = float(
+ np.nanquantile(
+ text_scores, q=0.10
+ ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
+ )
+
  # DEBUG code:
  def draw_text_boxes(image, cells, show: bool = False):
  draw = ImageDraw.Draw(image)
@@ -88,3 +115,30 @@ class PagePreprocessingModel(BasePageModel):
  draw_text_boxes(page.get_image(scale=1.0), page.cells)
 
  return page
+
+ def rate_text_quality(self, text: str) -> float:
+ # Hard errors: if any of these patterns are found, return 0.0 immediately.
+ blacklist_chars = ["�"]
+ if (
+ any(text.find(c) >= 0 for c in blacklist_chars)
+ or self.GLYPH_RE.search(text)
+ or self.SLASH_G_RE.search(text)
+ or self.SLASH_NUMBER_GARBAGE_RE.match(
+ text
+ ) # Check if text is mostly slash-number pattern
+ ):
+ return 0.0
+
+ penalty = 0.0
+
+ # Apply a penalty only if the fragmented words pattern occurs at least three times.
+ frag_matches = self.FRAG_RE.findall(text)
+ if len(frag_matches) >= 3:
+ penalty += 0.1 * len(frag_matches)
+
+ # Additional heuristic: if the average token length is below 2, add a penalty.
+ # tokens = text.split()
+ # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
+ # penalty += 0.2
+
+ return max(1.0 - penalty, 0.0)
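
A hedged illustration of the heuristic above. It assumes PagePreprocessingOptions can be instantiated directly with the fields shown here (the pipeline normally constructs it internally), so treat this strictly as a sketch:

    from docling.datamodel.pipeline_options import PagePreprocessingOptions
    from docling.models.page_preprocessing_model import PagePreprocessingModel

    model = PagePreprocessingModel(
        options=PagePreprocessingOptions(images_scale=1.0, create_parsed_page=False)
    )
    print(model.rate_text_quality("A clean, ordinary sentence."))  # 1.0, no penalty applies
    print(model.rate_text_quality("GLYPH<0041>GLYPH<0042>"))       # 0.0, hard error (GLYPH codes)
    print(model.rate_text_quality("broken \ufffd character"))       # 0.0, replacement character found
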
docling/models/tesseract_ocr_cli_model.py CHANGED
@@ -2,6 +2,7 @@ import csv
  import io
  import logging
  import os
+ import subprocess
  import tempfile
  from collections.abc import Iterable
  from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
 
  import pandas as pd
  from docling_core.types.doc import BoundingBox, CoordOrigin
- from docling_core.types.doc.page import BoundingRectangle, TextCell
+ from docling_core.types.doc.page import TextCell
 
  from docling.datamodel.base_models import Page
  from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
  )
  from docling.datamodel.settings import settings
  from docling.models.base_ocr_model import BaseOcrModel
- from docling.utils.ocr_utils import map_tesseract_script
+ from docling.utils.ocr_utils import (
+ map_tesseract_script,
+ parse_tesseract_orientation,
+ tesseract_box_to_bounding_rectangle,
+ )
  from docling.utils.profiling import TimeRecorder
 
  _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
  self._version: Optional[str] = None
  self._tesseract_languages: Optional[List[str]] = None
  self._script_prefix: Optional[str] = None
+ self._is_auto: bool = "auto" in self.options.lang
 
  if self.enabled:
  try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
 
  return name, version
 
- def _run_tesseract(self, ifilename: str):
+ def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
  r"""
  Run tesseract CLI
  """
  cmd = [self.options.tesseract_cmd]
-
- if "auto" in self.options.lang:
- lang = self._detect_language(ifilename)
+ if self._is_auto:
+ lang = self._parse_language(osd)
  if lang is not None:
  cmd.append("-l")
  cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
  cmd += [ifilename, "stdout", "tsv"]
  _log.info("command: {}".format(" ".join(cmd)))
 
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
- output, _ = proc.communicate()
+ output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
 
  # _log.info(output)
 
  # Decode the byte string to a regular string
- decoded_data = output.decode("utf-8")
+ decoded_data = output.stdout.decode("utf-8")
  # _log.info(decoded_data)
 
  # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
 
  return df_filtered
 
- def _detect_language(self, ifilename: str):
+ def _perform_osd(self, ifilename: str) -> pd.DataFrame:
  r"""
  Run tesseract in PSM 0 mode to detect the language
  """
- assert self._tesseract_languages is not None
 
  cmd = [self.options.tesseract_cmd]
  cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
  _log.info("command: {}".format(" ".join(cmd)))
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
- output, _ = proc.communicate()
- decoded_data = output.decode("utf-8")
+ output = subprocess.run(cmd, capture_output=True, check=True)
+ decoded_data = output.stdout.decode("utf-8")
  df_detected = pd.read_csv(
  io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
  )
- scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
+ return df_detected
+
+ def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
+ assert self._tesseract_languages is not None
+ scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
  if len(scripts) == 0:
  _log.warning("Tesseract cannot detect the script of the page")
  return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
  cmd = [self.options.tesseract_cmd]
  cmd.append("--list-langs")
  _log.info("command: {}".format(" ".join(cmd)))
- proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
- output, _ = proc.communicate()
- decoded_data = output.decode("utf-8")
+ output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
+ decoded_data = output.stdout.decode("utf-8")
  df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
  self._tesseract_languages = df_list[0].tolist()[1:]
 
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
  yield from page_batch
  return
 
- for page in page_batch:
+ for page_i, page in enumerate(page_batch):
  assert page._backend is not None
  if not page._backend.is_valid():
  yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
  ocr_rects = self.get_ocr_rects(page)
 
  all_ocr_cells = []
- for ocr_rect in ocr_rects:
+ for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
  # Skip zero area boxes
  if ocr_rect.area() == 0:
  continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
  ) as image_file:
  fname = image_file.name
  high_res_image.save(image_file)
-
- df_result = self._run_tesseract(fname)
+ doc_orientation = 0
+ try:
+ df_osd = self._perform_osd(fname)
+ doc_orientation = _parse_orientation(df_osd)
+ except subprocess.CalledProcessError as exc:
+ _log.error(
+ "OSD failed (doc %s, page: %s, "
+ "OCR rectangle: %s, processed image file %s):\n %s",
+ conv_res.input.file,
+ page_i,
+ ocr_rect_i,
+ image_file,
+ exc.stderr,
+ )
+ # Skipping if OSD fail when in auto mode, otherwise proceed
+ # to OCR in the hope OCR will succeed while OSD failed
+ if self._is_auto:
+ continue
+ if doc_orientation != 0:
+ high_res_image = high_res_image.rotate(
+ -doc_orientation, expand=True
+ )
+ high_res_image.save(fname)
+ try:
+ df_result = self._run_tesseract(fname, df_osd)
+ except subprocess.CalledProcessError as exc:
+ _log.error(
+ "tesseract OCR failed (doc %s, page: %s, "
+ "OCR rectangle: %s, processed image file %s):\n %s",
+ conv_res.input.file,
+ page_i,
+ ocr_rect_i,
+ image_file,
+ exc.stderr,
+ )
+ continue
  finally:
  if os.path.exists(fname):
  os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
  text = row["text"]
  conf = row["conf"]
 
- l = float(row["left"]) # noqa: E741
- b = float(row["top"])
- w = float(row["width"])
- h = float(row["height"])
-
- t = b + h
- r = l + w
-
+ left, top = float(row["left"]), float(row["top"])
+ right = left + float(row["width"])
+ bottom = top + row["height"]
+ bbox = BoundingBox(
+ l=left,
+ t=top,
+ r=right,
+ b=bottom,
+ coord_origin=CoordOrigin.TOPLEFT,
+ )
+ rect = tesseract_box_to_bounding_rectangle(
+ bbox,
+ original_offset=ocr_rect,
+ scale=self.scale,
+ orientation=doc_orientation,
+ im_size=high_res_image.size,
+ )
  cell = TextCell(
  index=ix,
  text=str(text),
  orig=str(text),
  from_ocr=True,
  confidence=conf / 100.0,
- rect=BoundingRectangle.from_bounding_box(
- BoundingBox.from_tuple(
- coord=(
- (l / self.scale) + ocr_rect.l,
- (b / self.scale) + ocr_rect.t,
- (r / self.scale) + ocr_rect.l,
- (t / self.scale) + ocr_rect.t,
- ),
- origin=CoordOrigin.TOPLEFT,
- )
- ),
+ rect=rect,
  )
  all_ocr_cells.append(cell)
 
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
  @classmethod
  def get_options_type(cls) -> Type[OcrOptions]:
  return TesseractCliOcrOptions
+
+
+ def _parse_orientation(df_osd: pd.DataFrame) -> int:
+ orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
+ orientation = parse_tesseract_orientation(orientations[0].strip())
+ return orientation
docling/models/tesseract_ocr_model.py CHANGED
@@ -1,12 +1,11 @@
  from __future__ import annotations
 
  import logging
- from collections.abc import Iterable
  from pathlib import Path
- from typing import Optional, Type
+ from typing import Iterable, Optional, Type
 
  from docling_core.types.doc import BoundingBox, CoordOrigin
- from docling_core.types.doc.page import BoundingRectangle, TextCell
+ from docling_core.types.doc.page import TextCell
 
  from docling.datamodel.base_models import Page
  from docling.datamodel.document import ConversionResult
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
  )
  from docling.datamodel.settings import settings
  from docling.models.base_ocr_model import BaseOcrModel
- from docling.utils.ocr_utils import map_tesseract_script
+ from docling.utils.ocr_utils import (
+ map_tesseract_script,
+ parse_tesseract_orientation,
+ tesseract_box_to_bounding_rectangle,
+ )
  from docling.utils.profiling import TimeRecorder
 
  _log = logging.getLogger(__name__)
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
  accelerator_options=accelerator_options,
  )
  self.options: TesseractOcrOptions
-
+ self._is_auto: bool = "auto" in self.options.lang
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
  self.reader = None
  self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
 
  if lang == "auto":
  self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
- self.osd_reader = tesserocr.PyTessBaseAPI(
- **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
- )
  else:
  self.reader = tesserocr.PyTessBaseAPI(
  **{"lang": lang} | tesserocr_kwargs,
  )
+ self.osd_reader = tesserocr.PyTessBaseAPI(
+ **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
+ )
  self.reader_RIL = tesserocr.RIL
 
  def __del__(self):
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
  yield from page_batch
  return
 
- for page in page_batch:
+ for page_i, page in enumerate(page_batch):
  assert page._backend is not None
  if not page._backend.is_valid():
  yield page
  else:
  with TimeRecorder(conv_res, "ocr"):
  assert self.reader is not None
+ assert self.osd_reader is not None
  assert self._tesserocr_languages is not None
 
  ocr_rects = self.get_ocr_rects(page)
 
  all_ocr_cells = []
- for ocr_rect in ocr_rects:
+ for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
  # Skip zero area boxes
  if ocr_rect.area() == 0:
  continue
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
  )
 
  local_reader = self.reader
- if "auto" in self.options.lang:
- assert self.osd_reader is not None
-
- self.osd_reader.SetImage(high_res_image)
- osd = self.osd_reader.DetectOrientationScript()
-
- # No text, probably
- if osd is None:
+ self.osd_reader.SetImage(high_res_image)
+ osd = self.osd_reader.DetectOrientationScript()
+ # No text, or Orientation and Script detection failure
+ if osd is None:
+ _log.error(
+ "OSD failed for doc (doc %s, page: %s, "
+ "OCR rectangle: %s)",
+ conv_res.input.file,
+ page_i,
+ ocr_rect_i,
+ )
+ # Skipping if OSD fail when in auto mode, otherwise proceed
+ # to OCR in the hope OCR will succeed while OSD failed
+ if self._is_auto:
  continue
-
+ doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
+ if doc_orientation != 0:
+ high_res_image = high_res_image.rotate(
+ -doc_orientation, expand=True
+ )
+ if self._is_auto:
  script = osd["script_name"]
  script = map_tesseract_script(script)
  lang = f"{self.script_prefix}{script}"
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
  # Extract text within the bounding box
  text = local_reader.GetUTF8Text().strip()
  confidence = local_reader.MeanTextConf()
- left = box["x"] / self.scale
- bottom = box["y"] / self.scale
- right = (box["x"] + box["w"]) / self.scale
- top = (box["y"] + box["h"]) / self.scale
-
+ left, top = box["x"], box["y"]
+ right = left + box["w"]
+ bottom = top + box["h"]
+ bbox = BoundingBox(
+ l=left,
+ t=top,
+ r=right,
+ b=bottom,
+ coord_origin=CoordOrigin.TOPLEFT,
+ )
+ rect = tesseract_box_to_bounding_rectangle(
+ bbox,
+ original_offset=ocr_rect,
+ scale=self.scale,
+ orientation=doc_orientation,
+ im_size=high_res_image.size,
+ )
  cells.append(
  TextCell(
  index=ix,
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
  orig=text,
  from_ocr=True,
  confidence=confidence,
- rect=BoundingRectangle.from_bounding_box(
- BoundingBox.from_tuple(
- coord=(left, top, right, bottom),
- origin=CoordOrigin.TOPLEFT,
- ),
- ),
+ rect=rect,
  )
  )
 
docling/pipeline/standard_pdf_pipeline.py CHANGED
@@ -3,6 +3,7 @@ import warnings
  from pathlib import Path
  from typing import Optional, cast
 
+ import numpy as np
  from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 
  from docling.backend.abstract_backend import AbstractDocumentBackend
@@ -54,13 +55,15 @@ class StandardPdfPipeline(PaginatedPipeline):
  "When defined, it must point to a folder containing all models required by the pipeline."
  )
 
- self.keep_images = (
- self.pipeline_options.generate_page_images
- or self.pipeline_options.generate_picture_images
- or self.pipeline_options.generate_table_images
- )
+ with warnings.catch_warnings(): # deprecated generate_table_images
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ self.keep_images = (
+ self.pipeline_options.generate_page_images
+ or self.pipeline_options.generate_picture_images
+ or self.pipeline_options.generate_table_images
+ )
 
- self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
+ self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
 
  ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
 
@@ -197,7 +200,7 @@ class StandardPdfPipeline(PaginatedPipeline):
  elements=all_elements, headers=all_headers, body=all_body
  )
 
- conv_res.document = self.glm_model(conv_res)
+ conv_res.document = self.reading_order_model(conv_res)
 
  # Generate page images in the output
  if self.pipeline_options.generate_page_images:
@@ -209,40 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
  )
 
  # Generate images of the requested element types
- if (
- self.pipeline_options.generate_picture_images
- or self.pipeline_options.generate_table_images
- ):
- scale = self.pipeline_options.images_scale
- for element, _level in conv_res.document.iterate_items():
- if not isinstance(element, DocItem) or len(element.prov) == 0:
- continue
- if (
- isinstance(element, PictureItem)
- and self.pipeline_options.generate_picture_images
- ) or (
- isinstance(element, TableItem)
- and self.pipeline_options.generate_table_images
- ):
- page_ix = element.prov[0].page_no - 1
- page = next(
- (p for p in conv_res.pages if p.page_no == page_ix),
- cast("Page", None),
+ with warnings.catch_warnings(): # deprecated generate_table_images
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ if (
+ self.pipeline_options.generate_picture_images
+ or self.pipeline_options.generate_table_images
+ ):
+ scale = self.pipeline_options.images_scale
+ for element, _level in conv_res.document.iterate_items():
+ if not isinstance(element, DocItem) or len(element.prov) == 0:
+ continue
+ if (
+ isinstance(element, PictureItem)
+ and self.pipeline_options.generate_picture_images
+ ) or (
+ isinstance(element, TableItem)
+ and self.pipeline_options.generate_table_images
+ ):
+ page_ix = element.prov[0].page_no - 1
+ page = next(
+ (p for p in conv_res.pages if p.page_no == page_ix),
+ cast("Page", None),
+ )
+ assert page is not None
+ assert page.size is not None
+ assert page.image is not None
+
+ crop_bbox = (
+ element.prov[0]
+ .bbox.scaled(scale=scale)
+ .to_top_left_origin(
+ page_height=page.size.height * scale
+ )
+ )
+
+ cropped_im = page.image.crop(crop_bbox.as_tuple())
+ element.image = ImageRef.from_pil(
+ cropped_im, dpi=int(72 * scale)
+ )
+
+ # Aggregate confidence values for document:
+ if len(conv_res.pages) > 0:
+ with warnings.catch_warnings():
+ warnings.filterwarnings(
+ "ignore",
+ category=RuntimeWarning,
+ message="Mean of empty slice|All-NaN slice encountered",
+ )
+ conv_res.confidence.layout_score = float(
+ np.nanmean(
+ [c.layout_score for c in conv_res.confidence.pages.values()]
  )
- assert page is not None
- assert page.size is not None
- assert page.image is not None
-
- crop_bbox = (
- element.prov[0]
- .bbox.scaled(scale=scale)
- .to_top_left_origin(page_height=page.size.height * scale)
+ )
+ conv_res.confidence.parse_score = float(
+ np.nanquantile(
+ [c.parse_score for c in conv_res.confidence.pages.values()],
+ q=0.1, # parse score should relate to worst 10% of pages.
  )
-
- cropped_im = page.image.crop(crop_bbox.as_tuple())
- element.image = ImageRef.from_pil(
- cropped_im, dpi=int(72 * scale)
+ )
+ conv_res.confidence.table_score = float(
+ np.nanmean(
+ [c.table_score for c in conv_res.confidence.pages.values()]
  )
+ )
+ conv_res.confidence.ocr_score = float(
+ np.nanmean(
+ [c.ocr_score for c in conv_res.confidence.pages.values()]
+ )
+ )
 
  return conv_res
 
docling/utils/layout_postprocessor.py CHANGED
@@ -90,17 +90,12 @@ class SpatialClusterIndex:
  containment_threshold: float,
  ) -> bool:
  """Check if two bboxes overlap sufficiently."""
- area1, area2 = bbox1.area(), bbox2.area()
- if area1 <= 0 or area2 <= 0:
+ if bbox1.area() <= 0 or bbox2.area() <= 0:
  return False
 
- overlap_area = bbox1.intersection_area_with(bbox2)
- if overlap_area <= 0:
- return False
-
- iou = overlap_area / (area1 + area2 - overlap_area)
- containment1 = overlap_area / area1
- containment2 = overlap_area / area2
+ iou = bbox1.intersection_over_union(bbox2)
+ containment1 = bbox1.intersection_over_self(bbox2)
+ containment2 = bbox2.intersection_over_self(bbox1)
 
  return (
  iou > overlap_threshold
@@ -321,11 +316,9 @@ class LayoutPostprocessor:
  for special in special_clusters:
  contained = []
  for cluster in self.regular_clusters:
- overlap = cluster.bbox.intersection_area_with(special.bbox)
- if overlap > 0:
- containment = overlap / cluster.bbox.area()
- if containment > 0.8:
- contained.append(cluster)
+ containment = cluster.bbox.intersection_over_self(special.bbox)
+ if containment > 0.8:
+ contained.append(cluster)
 
  if contained:
  # Sort contained clusters by minimum cell ID:
@@ -379,9 +372,7 @@ class LayoutPostprocessor:
  for regular in self.regular_clusters:
  if regular.label == DocItemLabel.TABLE:
  # Calculate overlap
- overlap = regular.bbox.intersection_area_with(wrapper.bbox)
- wrapper_area = wrapper.bbox.area()
- overlap_ratio = overlap / wrapper_area
+ overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
 
  conf_diff = wrapper.confidence - regular.confidence
 
@@ -421,8 +412,7 @@ class LayoutPostprocessor:
  # Rule 2: CODE vs others
  if candidate.label == DocItemLabel.CODE:
  # Calculate how much of the other cluster is contained within the CODE cluster
- overlap = other.bbox.intersection_area_with(candidate.bbox)
- containment = overlap / other.bbox.area()
+ containment = other.bbox.intersection_over_self(candidate.bbox)
  if containment > 0.8: # other is 80% contained within CODE
  return True
 
@@ -586,11 +576,9 @@ class LayoutPostprocessor:
  if cell.rect.to_bounding_box().area() <= 0:
  continue
 
- overlap = cell.rect.to_bounding_box().intersection_area_with(
+ overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
  cluster.bbox
  )
- overlap_ratio = overlap / cell.rect.to_bounding_box().area()
-
  if overlap_ratio > best_overlap:
  best_overlap = overlap_ratio
  best_cluster = cluster
docling/utils/ocr_utils.py CHANGED
@@ -1,3 +1,11 @@
+ from typing import Optional, Tuple
+
+ from docling_core.types.doc import BoundingBox, CoordOrigin
+ from docling_core.types.doc.page import BoundingRectangle
+
+ from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
+
+
  def map_tesseract_script(script: str) -> str:
  r""" """
  if script == "Katakana" or script == "Hiragana":
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
  elif script == "Korean":
  script = "Hangul"
  return script
+
+
+ def parse_tesseract_orientation(orientation: str) -> int:
+ # Tesseract orientation is [0, 90, 180, 270] clockwise, bounding rectangle angles
+ # are [0, 360[ counterclockwise
+ parsed = int(orientation)
+ if parsed not in CLIPPED_ORIENTATIONS:
+ msg = (
+ f"invalid tesseract document orientation {orientation}, "
+ f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
+ )
+ raise ValueError(msg)
+ parsed = -parsed
+ parsed %= 360
+ return parsed
+
+
+ def tesseract_box_to_bounding_rectangle(
+ bbox: BoundingBox,
+ *,
+ original_offset: Optional[BoundingBox] = None,
+ scale: float,
+ orientation: int,
+ im_size: Tuple[int, int],
+ ) -> BoundingRectangle:
+ # box is in the top, left, height, width format, top left coordinates
+ rect = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
+ rect = BoundingRectangle(
+ r_x0=rect.r_x0 / scale,
+ r_y0=rect.r_y0 / scale,
+ r_x1=rect.r_x1 / scale,
+ r_y1=rect.r_y1 / scale,
+ r_x2=rect.r_x2 / scale,
+ r_y2=rect.r_y2 / scale,
+ r_x3=rect.r_x3 / scale,
+ r_y3=rect.r_y3 / scale,
+ coord_origin=CoordOrigin.TOPLEFT,
+ )
+ if original_offset is not None:
+ if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
+ msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
+ raise ValueError(msg)
+ if original_offset is not None:
+ rect.r_x0 += original_offset.l
+ rect.r_x1 += original_offset.l
+ rect.r_x2 += original_offset.l
+ rect.r_x3 += original_offset.l
+ rect.r_y0 += original_offset.t
+ rect.r_y1 += original_offset.t
+ rect.r_y2 += original_offset.t
+ rect.r_y3 += original_offset.t
+ return rect
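
As the comment in parse_tesseract_orientation notes, Tesseract reports clockwise degrees while the bounding-rectangle convention is counterclockwise in [0, 360). A few concrete values, following directly from the function body above:

    from docling.utils.ocr_utils import parse_tesseract_orientation

    print(parse_tesseract_orientation("0"))    # 0
    print(parse_tesseract_orientation("90"))   # 270 (i.e. -90 modulo 360)
    print(parse_tesseract_orientation("180"))  # 180
    print(parse_tesseract_orientation("270"))  # 90
    # Any other value, e.g. "45", raises ValueError.
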
docling/utils/orientation.py ADDED
@@ -0,0 +1,71 @@
+ from typing import Tuple
+
+ from docling_core.types.doc import BoundingBox, CoordOrigin
+ from docling_core.types.doc.page import BoundingRectangle
+
+ CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
+
+
+ def rotate_bounding_box(
+ bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
+ ) -> BoundingRectangle:
+ # The box is left top width height in TOPLEFT coordinates
+ # Bounding rectangle start with r_0 at the bottom left whatever the
+ # coordinate system. Then other corners are found rotating counterclockwise
+ bbox = bbox.to_top_left_origin(im_size[1])
+ left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
+ im_h, im_w = im_size
+ angle = angle % 360
+ if angle == 0:
+ r_x0 = left
+ r_y0 = top + height
+ r_x1 = r_x0 + width
+ r_y1 = r_y0
+ r_x2 = r_x0 + width
+ r_y2 = r_y0 - height
+ r_x3 = r_x0
+ r_y3 = r_y0 - height
+ elif angle == 90:
+ r_x0 = im_w - (top + height)
+ r_y0 = left
+ r_x1 = r_x0
+ r_y1 = r_y0 + width
+ r_x2 = r_x0 + height
+ r_y2 = r_y0 + width
+ r_x3 = r_x0
+ r_y3 = r_y0 + width
+ elif angle == 180:
+ r_x0 = im_h - left
+ r_y0 = im_w - (top + height)
+ r_x1 = r_x0 - width
+ r_y1 = r_y0
+ r_x2 = r_x0 - width
+ r_y2 = r_y0 + height
+ r_x3 = r_x0
+ r_y3 = r_y0 + height
+ elif angle == 270:
+ r_x0 = top + height
+ r_y0 = im_h - left
+ r_x1 = r_x0
+ r_y1 = r_y0 - width
+ r_x2 = r_x0 - height
+ r_y2 = r_y0 - width
+ r_x3 = r_x0 - height
+ r_y3 = r_y0
+ else:
+ msg = (
+ f"invalid orientation {angle}, expected values in:"
+ f" {sorted(CLIPPED_ORIENTATIONS)}"
+ )
+ raise ValueError(msg)
+ return BoundingRectangle(
+ r_x0=r_x0,
+ r_y0=r_y0,
+ r_x1=r_x1,
+ r_y1=r_y1,
+ r_x2=r_x2,
+ r_y2=r_y2,
+ r_x3=r_x3,
+ r_y3=r_y3,
+ coord_origin=CoordOrigin.TOPLEFT,
+ )
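
For the unrotated case the helper simply enumerates the four corners counterclockwise starting at the bottom-left; non-zero angles additionally remap the corners into the rotated image frame. A worked example in top-left coordinates, with values read off the angle == 0 branch above (the box and image size are illustrative):

    from docling_core.types.doc import BoundingBox, CoordOrigin
    from docling.utils.orientation import rotate_bounding_box

    box = BoundingBox(l=10, t=20, r=40, b=30, coord_origin=CoordOrigin.TOPLEFT)
    rect = rotate_bounding_box(box, angle=0, im_size=(200, 100))

    # r_0 = (10, 30) bottom-left, r_1 = (40, 30) bottom-right,
    # r_2 = (40, 20) top-right,   r_3 = (10, 20) top-left
    print(rect.r_x0, rect.r_y0, rect.r_x1, rect.r_y1)
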
docling-2.33.0.dist-info/METADATA → docling-2.35.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: docling
- Version: 2.33.0
+ Version: 2.35.0
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
  Home-page: https://github.com/docling-project/docling
  License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
  Requires-Dist: certifi (>=2024.7.4)
  Requires-Dist: click (<8.2.0)
- Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
+ Requires-Dist: docling-core[chunking] (>=2.31.2,<3.0.0)
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
  Requires-Dist: easyocr (>=1.7,<2.0)
docling-2.33.0.dist-info/RECORD → docling-2.35.0.dist-info/RECORD RENAMED
@@ -3,9 +3,9 @@ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
  docling/backend/asciidoc_backend.py,sha256=W-4MRcID6AU9Ax23q8FwDwGG-OOCrBoqcNf2Ch_WPUc,14041
  docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
- docling/backend/docling_parse_backend.py,sha256=V_CsUdN5RkGQBBq7A_ReAiUW4CQVh0-1Ur157Ozurdg,8017
- docling/backend/docling_parse_v2_backend.py,sha256=6fokgqb1hMbZua33gL46EFamrwPTC7ms6ZuEHw-Dv28,9395
- docling/backend/docling_parse_v4_backend.py,sha256=-WJZs0IsdN6blhkvTS1eh_qhujYLyJ3XcOMqS6AaXxg,6282
+ docling/backend/docling_parse_backend.py,sha256=bVSPmmiVXdCVfe-eLtDhbPQKBjkFR8rZJoRxdWIMdYU,7998
+ docling/backend/docling_parse_v2_backend.py,sha256=R4YPCEs72GYg-Xc9VfizPv8QjtGmKOsQzVPNAU2RIK0,9376
+ docling/backend/docling_parse_v4_backend.py,sha256=aWh-fd-lnuRGVGC_DG17QUptIsArv5V1gJo8QFbB5Ys,6263
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
@@ -24,12 +24,12 @@ docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e
  docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- docling/cli/main.py,sha256=D7WEY4x6pQCVFRy3peK9KUDOb0Y5IVc-vTDqPnHPK00,26138
+ docling/cli/main.py,sha256=KARZ1OJx4HvHb1D_95GPIAhKaIlhcYYSBa0t4PM-Xfk,27339
  docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- docling/datamodel/base_models.py,sha256=3BmGoV2HLXOWFuRHFAa42YnWceh-JEpcLXzfFz9AD9Y,7943
- docling/datamodel/document.py,sha256=rxu5SqUppgNtYWwIbBoLlZvYAn_w2cGn5uq4AsvouSc,15907
+ docling/datamodel/base_models.py,sha256=QJlzGJKUAO0kqM6DO2RZKlFi-lL2MpY8qt3Wdm02Slw,10460
+ docling/datamodel/document.py,sha256=vPwiVU5zWCKbVYMq-TSmb7LTjijrqJq0FyAgDBa0XGA,16154
  docling/datamodel/pipeline_options.py,sha256=uwjBvK4egrgcF1_w4B5EDxpGnl4IgBzmxP7dJ7zm394,13400
  docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
  docling/document_converter.py,sha256=PRRr65nigQ3LZDl4G2fBMkOtJyswT7xyGt7fpUeDO3w,13849
@@ -47,10 +47,10 @@ docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0
  docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
  docling/models/hf_mlx_model.py,sha256=B_B4hFU-jU0g_DQtQD8w4Ejorn10mkDuFI93wR_WhGk,4897
  docling/models/hf_vlm_model.py,sha256=SiPMTLghMUjJ66dA2yN4UujpLO6PiOhLEPInWtXV_5s,6912
- docling/models/layout_model.py,sha256=0fiJXJ4aPmcMsYY7rbN9LJ2mZ0_8G0ODY9kyNTAN3Ws,7823
+ docling/models/layout_model.py,sha256=1LLDS3hBfdJXA16L_PrjA_1rM_A2r5rNFkHVbLBCl_8,8639
  docling/models/ocr_mac_model.py,sha256=A3TlEbvvwhkWiq9YARos3Y9yNcpPYQ7JGc_4hFtAK-8,5370
- docling/models/page_assemble_model.py,sha256=GO7JI1D6T6EkSW94cLQobPGNQUahkxQqTPRwj5CnmFE,6304
- docling/models/page_preprocessing_model.py,sha256=6pOGXiFQ-oz06UmJdcaYMdVyfZ0YVLWS6efGcx7Mxws,3105
+ docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
+ docling/models/page_preprocessing_model.py,sha256=8cdhR9n3zcC8JxDen8WdPBx_GNk_5VICeHJo1-kP518,5186
  docling/models/picture_description_api_model.py,sha256=kCuAFOGEuI5QsRul7Pc1LccxWN7WIvIUhXEmSICYegw,2332
  docling/models/picture_description_base_model.py,sha256=FbBVXzAOB87xpJN28tuGCxoAdcf6mZNUOqJR7ljUg5g,2946
  docling/models/picture_description_vlm_model.py,sha256=DiTjnehVy1n0N04xPUvZl8rx4TiNHzHn9Cnzy_ePGts,4177
@@ -59,12 +59,12 @@ docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurH
  docling/models/rapid_ocr_model.py,sha256=Tq_1Egu5Hjx7Y69Vox17QTtRXztSyflB1fhN08CWQwY,5894
  docling/models/readingorder_model.py,sha256=S9ru2ApY9sE-Uue3hptWHmbmElwo36bUbAikxCFpHYs,14574
  docling/models/table_structure_model.py,sha256=1gxLaooK0IKMrnmS8nT1BItKqt1GAKghfpmLKb3i53g,12566
- docling/models/tesseract_ocr_cli_model.py,sha256=LXYUCMQAPxQA2pY3zs9wcPSrAHHorTffSmIIWgltoaw,10234
- docling/models/tesseract_ocr_model.py,sha256=72009TJL_7tXTEnhlsGRiw_KibrQ0LjZlCBtW8NtwUc,9339
+ docling/models/tesseract_ocr_cli_model.py,sha256=e55MkaDdsseKcfX5lxIt0iv5jR6pDFBzWBZHTvl2Jws,12653
+ docling/models/tesseract_ocr_model.py,sha256=vS4And5NHe_uLNb6ZBi2CQzWUITBdc1E1zlsojrSZpM,10561
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  docling/pipeline/base_pipeline.py,sha256=DnuxAf7EQusdSRae0QUVth-0f2mSff8JZjX-2vazk00,8751
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
- docling/pipeline/standard_pdf_pipeline.py,sha256=iNZMMGiHTwV6I4u_jjqXhVJ_DiPn_O9qnnee3PQxidc,10773
+ docling/pipeline/standard_pdf_pipeline.py,sha256=itCZPj7nMFAQtAlStfmWthpCIHZFUm9W5uTgvVi6PkQ,12738
  docling/pipeline/vlm_pipeline.py,sha256=ZW1WGd6jeLqTCWR0S0cj6H_qVMUXELaFCrJVpvZp6Co,9684
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,15 +72,16 @@ docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExT
  docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
  docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
- docling/utils/layout_postprocessor.py,sha256=x7exVG3HYzV9M_O78FfyoG43Y2L7PPMMydvSNwjqh8s,24528
+ docling/utils/layout_postprocessor.py,sha256=3WCmkPsPJ80xfWzAUeWb5L9BmuwJ79ztctvbbUs8AfI,24068
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
  docling/utils/model_downloader.py,sha256=ocvud3G3qlBQhzMo69Q3RJMnvq5HPZ2DwNbMuEp8RCs,4142
- docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
+ docling/utils/ocr_utils.py,sha256=AOaDAHr5S74d-IRVR_LKhKynUTIurAwLJ3wNeY58gPA,2326
+ docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,2011
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
- docling-2.33.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
- docling-2.33.0.dist-info/METADATA,sha256=FXLSejKn7Fc1pY2Fl8YvLP1PYvldoQ76d6zcupzghDo,10138
- docling-2.33.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- docling-2.33.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
- docling-2.33.0.dist-info/RECORD,,
+ docling-2.35.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+ docling-2.35.0.dist-info/METADATA,sha256=SMuVHjV5ouB773e4tFnu7fqpvEdygq3ksNbESerk0Ao,10138
+ docling-2.35.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ docling-2.35.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
+ docling-2.35.0.dist-info/RECORD,,