docling 2.33.0__py3-none-any.whl → 2.35.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +1 -1
- docling/backend/docling_parse_v2_backend.py +1 -1
- docling/backend/docling_parse_v4_backend.py +1 -1
- docling/cli/main.py +36 -3
- docling/datamodel/base_models.py +99 -2
- docling/datamodel/document.py +10 -3
- docling/models/layout_model.py +19 -0
- docling/models/page_assemble_model.py +1 -0
- docling/models/page_preprocessing_model.py +54 -0
- docling/models/tesseract_ocr_cli_model.py +85 -41
- docling/models/tesseract_ocr_model.py +52 -30
- docling/pipeline/standard_pdf_pipeline.py +75 -38
- docling/utils/layout_postprocessor.py +10 -22
- docling/utils/ocr_utils.py +60 -0
- docling/utils/orientation.py +71 -0
- {docling-2.33.0.dist-info → docling-2.35.0.dist-info}/METADATA +2 -2
- {docling-2.33.0.dist-info → docling-2.35.0.dist-info}/RECORD +20 -19
- {docling-2.33.0.dist-info → docling-2.35.0.dist-info}/LICENSE +0 -0
- {docling-2.33.0.dist-info → docling-2.35.0.dist-info}/WHEEL +0 -0
- {docling-2.33.0.dist-info → docling-2.35.0.dist-info}/entry_points.txt +0 -0
@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
60
60
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
61
61
|
).to_top_left_origin(page_height=page_size.height * scale)
|
62
62
|
|
63
|
-
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
63
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
64
64
|
|
65
65
|
if overlap_frac > 0.5:
|
66
66
|
if len(text_piece) > 0:
|
@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
71
71
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
72
72
|
).to_top_left_origin(page_height=page_size.height * scale)
|
73
73
|
|
74
|
-
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
74
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
75
75
|
|
76
76
|
if overlap_frac > 0.5:
|
77
77
|
if len(text_piece) > 0:
|
@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
46
46
|
.scaled(scale)
|
47
47
|
)
|
48
48
|
|
49
|
-
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
49
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
50
50
|
|
51
51
|
if overlap_frac > 0.5:
|
52
52
|
if len(text_piece) > 0:
|
docling/cli/main.py
CHANGED
@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
|
|
12
12
|
|
13
13
|
import rich.table
|
14
14
|
import typer
|
15
|
+
from docling_core.transforms.serializer.html import (
|
16
|
+
HTMLDocSerializer,
|
17
|
+
HTMLOutputStyle,
|
18
|
+
HTMLParams,
|
19
|
+
)
|
20
|
+
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
15
21
|
from docling_core.types.doc import ImageRefMode
|
16
22
|
from docling_core.utils.file import resolve_source_to_path
|
17
23
|
from pydantic import TypeAdapter
|
@@ -156,6 +162,7 @@ def export_documents(
|
|
156
162
|
export_json: bool,
|
157
163
|
export_html: bool,
|
158
164
|
export_html_split_page: bool,
|
165
|
+
show_layout: bool,
|
159
166
|
export_md: bool,
|
160
167
|
export_txt: bool,
|
161
168
|
export_doctags: bool,
|
@@ -189,9 +196,27 @@ def export_documents(
|
|
189
196
|
if export_html_split_page:
|
190
197
|
fname = output_dir / f"{doc_filename}.html"
|
191
198
|
_log.info(f"writing HTML output to {fname}")
|
192
|
-
|
193
|
-
|
194
|
-
|
199
|
+
if show_layout:
|
200
|
+
ser = HTMLDocSerializer(
|
201
|
+
doc=conv_res.document,
|
202
|
+
params=HTMLParams(
|
203
|
+
image_mode=image_export_mode,
|
204
|
+
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
205
|
+
),
|
206
|
+
)
|
207
|
+
visualizer = LayoutVisualizer()
|
208
|
+
visualizer.params.show_label = False
|
209
|
+
ser_res = ser.serialize(
|
210
|
+
visualizer=visualizer,
|
211
|
+
)
|
212
|
+
with open(fname, "w") as fw:
|
213
|
+
fw.write(ser_res.text)
|
214
|
+
else:
|
215
|
+
conv_res.document.save_as_html(
|
216
|
+
filename=fname,
|
217
|
+
image_mode=image_export_mode,
|
218
|
+
split_page_view=True,
|
219
|
+
)
|
195
220
|
|
196
221
|
# Export Text format:
|
197
222
|
if export_txt:
|
@@ -250,6 +275,13 @@ def convert( # noqa: C901
|
|
250
275
|
to_formats: List[OutputFormat] = typer.Option(
|
251
276
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
252
277
|
),
|
278
|
+
show_layout: Annotated[
|
279
|
+
bool,
|
280
|
+
typer.Option(
|
281
|
+
...,
|
282
|
+
help="If enabled, the page images will show the bounding-boxes of the items.",
|
283
|
+
),
|
284
|
+
] = False,
|
253
285
|
headers: str = typer.Option(
|
254
286
|
None,
|
255
287
|
"--headers",
|
@@ -596,6 +628,7 @@ def convert( # noqa: C901
|
|
596
628
|
export_json=export_json,
|
597
629
|
export_html=export_html,
|
598
630
|
export_html_split_page=export_html_split_page,
|
631
|
+
show_layout=show_layout,
|
599
632
|
export_md=export_md,
|
600
633
|
export_txt=export_txt,
|
601
634
|
export_doctags=export_doctags,
|
docling/datamodel/base_models.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
import math
|
2
|
+
from collections import defaultdict
|
1
3
|
from enum import Enum
|
2
|
-
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
|
+
from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
|
3
5
|
|
6
|
+
import numpy as np
|
4
7
|
from docling_core.types.doc import (
|
5
8
|
BoundingBox,
|
6
9
|
DocItemLabel,
|
@@ -16,7 +19,7 @@ from docling_core.types.io import (
|
|
16
19
|
DocumentStream,
|
17
20
|
)
|
18
21
|
from PIL.Image import Image
|
19
|
-
from pydantic import BaseModel, ConfigDict
|
22
|
+
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
20
23
|
|
21
24
|
if TYPE_CHECKING:
|
22
25
|
from docling.backend.pdf_backend import PdfPageBackend
|
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
|
|
298
301
|
choices: List[OpenAiResponseChoice]
|
299
302
|
created: int
|
300
303
|
usage: OpenAiResponseUsage
|
304
|
+
|
305
|
+
|
306
|
+
# Create a type alias for score values
|
307
|
+
ScoreValue = float
|
308
|
+
|
309
|
+
|
310
|
+
class QualityGrade(str, Enum):
|
311
|
+
POOR = "poor"
|
312
|
+
FAIR = "fair"
|
313
|
+
GOOD = "good"
|
314
|
+
EXCELLENT = "excellent"
|
315
|
+
UNSPECIFIED = "unspecified"
|
316
|
+
|
317
|
+
|
318
|
+
class PageConfidenceScores(BaseModel):
|
319
|
+
parse_score: ScoreValue = np.nan
|
320
|
+
layout_score: ScoreValue = np.nan
|
321
|
+
table_score: ScoreValue = np.nan
|
322
|
+
ocr_score: ScoreValue = np.nan
|
323
|
+
|
324
|
+
def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
|
325
|
+
if score < 0.5:
|
326
|
+
return QualityGrade.POOR
|
327
|
+
elif score < 0.8:
|
328
|
+
return QualityGrade.FAIR
|
329
|
+
elif score < 0.9:
|
330
|
+
return QualityGrade.GOOD
|
331
|
+
elif score >= 0.9:
|
332
|
+
return QualityGrade.EXCELLENT
|
333
|
+
|
334
|
+
return QualityGrade.UNSPECIFIED
|
335
|
+
|
336
|
+
@computed_field # type: ignore
|
337
|
+
@property
|
338
|
+
def mean_grade(self) -> QualityGrade:
|
339
|
+
return self._score_to_grade(self.mean_score)
|
340
|
+
|
341
|
+
@computed_field # type: ignore
|
342
|
+
@property
|
343
|
+
def low_grade(self) -> QualityGrade:
|
344
|
+
return self._score_to_grade(self.low_score)
|
345
|
+
|
346
|
+
@computed_field # type: ignore
|
347
|
+
@property
|
348
|
+
def mean_score(self) -> ScoreValue:
|
349
|
+
return ScoreValue(
|
350
|
+
np.nanmean(
|
351
|
+
[
|
352
|
+
self.ocr_score,
|
353
|
+
self.table_score,
|
354
|
+
self.layout_score,
|
355
|
+
self.parse_score,
|
356
|
+
]
|
357
|
+
)
|
358
|
+
)
|
359
|
+
|
360
|
+
@computed_field # type: ignore
|
361
|
+
@property
|
362
|
+
def low_score(self) -> ScoreValue:
|
363
|
+
return ScoreValue(
|
364
|
+
np.nanquantile(
|
365
|
+
[
|
366
|
+
self.ocr_score,
|
367
|
+
self.table_score,
|
368
|
+
self.layout_score,
|
369
|
+
self.parse_score,
|
370
|
+
],
|
371
|
+
q=0.05,
|
372
|
+
)
|
373
|
+
)
|
374
|
+
|
375
|
+
|
376
|
+
class ConfidenceReport(PageConfidenceScores):
|
377
|
+
pages: Dict[int, PageConfidenceScores] = Field(
|
378
|
+
default_factory=lambda: defaultdict(PageConfidenceScores)
|
379
|
+
)
|
380
|
+
|
381
|
+
@computed_field # type: ignore
|
382
|
+
@property
|
383
|
+
def mean_score(self) -> ScoreValue:
|
384
|
+
return ScoreValue(
|
385
|
+
np.nanmean(
|
386
|
+
[c.mean_score for c in self.pages.values()],
|
387
|
+
)
|
388
|
+
)
|
389
|
+
|
390
|
+
@computed_field # type: ignore
|
391
|
+
@property
|
392
|
+
def low_score(self) -> ScoreValue:
|
393
|
+
return ScoreValue(
|
394
|
+
np.nanmean(
|
395
|
+
[c.low_score for c in self.pages.values()],
|
396
|
+
)
|
397
|
+
)
|
docling/datamodel/document.py
CHANGED
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
|
|
47
47
|
)
|
48
48
|
from docling_core.utils.file import resolve_source_to_stream
|
49
49
|
from docling_core.utils.legacy import docling_document_to_legacy
|
50
|
-
from pydantic import BaseModel
|
50
|
+
from pydantic import BaseModel, Field
|
51
51
|
from typing_extensions import deprecated
|
52
52
|
|
53
53
|
from docling.backend.abstract_backend import (
|
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
|
|
56
56
|
)
|
57
57
|
from docling.datamodel.base_models import (
|
58
58
|
AssembledUnit,
|
59
|
+
ConfidenceReport,
|
59
60
|
ConversionStatus,
|
60
61
|
DocumentStream,
|
61
62
|
ErrorItem,
|
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
|
|
201
202
|
pages: List[Page] = []
|
202
203
|
assembled: AssembledUnit = AssembledUnit()
|
203
204
|
timings: Dict[str, ProfilingItem] = {}
|
205
|
+
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
204
206
|
|
205
207
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
206
208
|
|
@@ -332,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
|
|
332
334
|
) -> Optional[InputFormat]:
|
333
335
|
"""Guess the input format of a document by checking part of its content."""
|
334
336
|
input_format: Optional[InputFormat] = None
|
335
|
-
content_str = content.decode("utf-8")
|
336
337
|
|
337
338
|
if mime == "application/xml":
|
339
|
+
content_str = content.decode("utf-8")
|
338
340
|
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
339
341
|
if match_doctype:
|
340
342
|
xml_doctype = match_doctype.group()
|
@@ -356,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
|
|
356
358
|
input_format = InputFormat.XML_JATS
|
357
359
|
|
358
360
|
elif mime == "text/plain":
|
361
|
+
content_str = content.decode("utf-8")
|
359
362
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
360
363
|
input_format = InputFormat.XML_USPTO
|
361
364
|
|
@@ -409,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
|
|
409
412
|
else:
|
410
413
|
return "application/xml"
|
411
414
|
|
412
|
-
if re.match(
|
415
|
+
if re.match(
|
416
|
+
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
|
417
|
+
content_str,
|
418
|
+
re.DOTALL,
|
419
|
+
):
|
413
420
|
return "text/html"
|
414
421
|
|
415
422
|
p = re.compile(
|
docling/models/layout_model.py
CHANGED
@@ -5,6 +5,7 @@ from collections.abc import Iterable
|
|
5
5
|
from pathlib import Path
|
6
6
|
from typing import Optional
|
7
7
|
|
8
|
+
import numpy as np
|
8
9
|
from docling_core.types.doc import DocItemLabel
|
9
10
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
10
11
|
from PIL import Image
|
@@ -184,6 +185,24 @@ class LayoutModel(BasePageModel):
|
|
184
185
|
).postprocess()
|
185
186
|
# processed_clusters, processed_cells = clusters, page.cells
|
186
187
|
|
188
|
+
with warnings.catch_warnings():
|
189
|
+
warnings.filterwarnings(
|
190
|
+
"ignore",
|
191
|
+
"Mean of empty slice|invalid value encountered in scalar divide",
|
192
|
+
RuntimeWarning,
|
193
|
+
"numpy",
|
194
|
+
)
|
195
|
+
|
196
|
+
conv_res.confidence.pages[page.page_no].layout_score = float(
|
197
|
+
np.mean([c.confidence for c in processed_clusters])
|
198
|
+
)
|
199
|
+
|
200
|
+
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
201
|
+
np.mean(
|
202
|
+
[c.confidence for c in processed_cells if c.from_ocr]
|
203
|
+
)
|
204
|
+
)
|
205
|
+
|
187
206
|
page.cells = processed_cells
|
188
207
|
page.predictions.layout = LayoutPrediction(
|
189
208
|
clusters=processed_clusters
|
@@ -1,7 +1,10 @@
|
|
1
|
+
import re
|
2
|
+
import warnings
|
1
3
|
from collections.abc import Iterable
|
2
4
|
from pathlib import Path
|
3
5
|
from typing import Optional
|
4
6
|
|
7
|
+
import numpy as np
|
5
8
|
from PIL import ImageDraw
|
6
9
|
from pydantic import BaseModel
|
7
10
|
|
@@ -21,6 +24,14 @@ class PagePreprocessingModel(BasePageModel):
|
|
21
24
|
def __init__(self, options: PagePreprocessingOptions):
|
22
25
|
self.options = options
|
23
26
|
|
27
|
+
# Pre-compiled regex patterns for efficiency
|
28
|
+
self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
|
29
|
+
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
|
30
|
+
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
|
31
|
+
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
|
32
|
+
r"(?:/\w+\s*){2,}"
|
33
|
+
) # Two or more "/token " sequences
|
34
|
+
|
24
35
|
def __call__(
|
25
36
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
26
37
|
) -> Iterable[Page]:
|
@@ -60,6 +71,22 @@ class PagePreprocessingModel(BasePageModel):
|
|
60
71
|
if self.options.create_parsed_page:
|
61
72
|
page.parsed_page = page._backend.get_segmented_page()
|
62
73
|
|
74
|
+
# Rate the text quality from the PDF parser, and aggregate on page
|
75
|
+
text_scores = []
|
76
|
+
for c in page.cells:
|
77
|
+
score = self.rate_text_quality(c.text)
|
78
|
+
text_scores.append(score)
|
79
|
+
|
80
|
+
with warnings.catch_warnings():
|
81
|
+
warnings.filterwarnings(
|
82
|
+
"ignore", "Mean of empty slice", RuntimeWarning, "numpy"
|
83
|
+
)
|
84
|
+
conv_res.confidence.pages[page.page_no].parse_score = float(
|
85
|
+
np.nanquantile(
|
86
|
+
text_scores, q=0.10
|
87
|
+
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
88
|
+
)
|
89
|
+
|
63
90
|
# DEBUG code:
|
64
91
|
def draw_text_boxes(image, cells, show: bool = False):
|
65
92
|
draw = ImageDraw.Draw(image)
|
@@ -88,3 +115,30 @@ class PagePreprocessingModel(BasePageModel):
|
|
88
115
|
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
89
116
|
|
90
117
|
return page
|
118
|
+
|
119
|
+
def rate_text_quality(self, text: str) -> float:
|
120
|
+
# Hard errors: if any of these patterns are found, return 0.0 immediately.
|
121
|
+
blacklist_chars = ["�"]
|
122
|
+
if (
|
123
|
+
any(text.find(c) >= 0 for c in blacklist_chars)
|
124
|
+
or self.GLYPH_RE.search(text)
|
125
|
+
or self.SLASH_G_RE.search(text)
|
126
|
+
or self.SLASH_NUMBER_GARBAGE_RE.match(
|
127
|
+
text
|
128
|
+
) # Check if text is mostly slash-number pattern
|
129
|
+
):
|
130
|
+
return 0.0
|
131
|
+
|
132
|
+
penalty = 0.0
|
133
|
+
|
134
|
+
# Apply a penalty only if the fragmented words pattern occurs at least three times.
|
135
|
+
frag_matches = self.FRAG_RE.findall(text)
|
136
|
+
if len(frag_matches) >= 3:
|
137
|
+
penalty += 0.1 * len(frag_matches)
|
138
|
+
|
139
|
+
# Additional heuristic: if the average token length is below 2, add a penalty.
|
140
|
+
# tokens = text.split()
|
141
|
+
# if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
|
142
|
+
# penalty += 0.2
|
143
|
+
|
144
|
+
return max(1.0 - penalty, 0.0)
|
@@ -2,6 +2,7 @@ import csv
|
|
2
2
|
import io
|
3
3
|
import logging
|
4
4
|
import os
|
5
|
+
import subprocess
|
5
6
|
import tempfile
|
6
7
|
from collections.abc import Iterable
|
7
8
|
from pathlib import Path
|
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
|
|
10
11
|
|
11
12
|
import pandas as pd
|
12
13
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
13
|
-
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
14
|
+
from docling_core.types.doc.page import TextCell
|
14
15
|
|
15
16
|
from docling.datamodel.base_models import Page
|
16
17
|
from docling.datamodel.document import ConversionResult
|
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
|
|
21
22
|
)
|
22
23
|
from docling.datamodel.settings import settings
|
23
24
|
from docling.models.base_ocr_model import BaseOcrModel
|
24
|
-
from docling.utils.ocr_utils import map_tesseract_script
|
25
|
+
from docling.utils.ocr_utils import (
|
26
|
+
map_tesseract_script,
|
27
|
+
parse_tesseract_orientation,
|
28
|
+
tesseract_box_to_bounding_rectangle,
|
29
|
+
)
|
25
30
|
from docling.utils.profiling import TimeRecorder
|
26
31
|
|
27
32
|
_log = logging.getLogger(__name__)
|
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
49
54
|
self._version: Optional[str] = None
|
50
55
|
self._tesseract_languages: Optional[List[str]] = None
|
51
56
|
self._script_prefix: Optional[str] = None
|
57
|
+
self._is_auto: bool = "auto" in self.options.lang
|
52
58
|
|
53
59
|
if self.enabled:
|
54
60
|
try:
|
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
93
99
|
|
94
100
|
return name, version
|
95
101
|
|
96
|
-
def _run_tesseract(self, ifilename: str):
|
102
|
+
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
|
97
103
|
r"""
|
98
104
|
Run tesseract CLI
|
99
105
|
"""
|
100
106
|
cmd = [self.options.tesseract_cmd]
|
101
|
-
|
102
|
-
|
103
|
-
lang = self._detect_language(ifilename)
|
107
|
+
if self._is_auto:
|
108
|
+
lang = self._parse_language(osd)
|
104
109
|
if lang is not None:
|
105
110
|
cmd.append("-l")
|
106
111
|
cmd.append(lang)
|
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
115
120
|
cmd += [ifilename, "stdout", "tsv"]
|
116
121
|
_log.info("command: {}".format(" ".join(cmd)))
|
117
122
|
|
118
|
-
|
119
|
-
output, _ = proc.communicate()
|
123
|
+
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
120
124
|
|
121
125
|
# _log.info(output)
|
122
126
|
|
123
127
|
# Decode the byte string to a regular string
|
124
|
-
decoded_data = output.decode("utf-8")
|
128
|
+
decoded_data = output.stdout.decode("utf-8")
|
125
129
|
# _log.info(decoded_data)
|
126
130
|
|
127
131
|
# Read the TSV file generated by Tesseract
|
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
139
143
|
|
140
144
|
return df_filtered
|
141
145
|
|
142
|
-
def _detect_language(self, ifilename: str):
|
146
|
+
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
|
143
147
|
r"""
|
144
148
|
Run tesseract in PSM 0 mode to detect the language
|
145
149
|
"""
|
146
|
-
assert self._tesseract_languages is not None
|
147
150
|
|
148
151
|
cmd = [self.options.tesseract_cmd]
|
149
152
|
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
150
153
|
_log.info("command: {}".format(" ".join(cmd)))
|
151
|
-
|
152
|
-
|
153
|
-
decoded_data = output.decode("utf-8")
|
154
|
+
output = subprocess.run(cmd, capture_output=True, check=True)
|
155
|
+
decoded_data = output.stdout.decode("utf-8")
|
154
156
|
df_detected = pd.read_csv(
|
155
157
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
156
158
|
)
|
157
|
-
|
159
|
+
return df_detected
|
160
|
+
|
161
|
+
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
|
162
|
+
assert self._tesseract_languages is not None
|
163
|
+
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
|
158
164
|
if len(scripts) == 0:
|
159
165
|
_log.warning("Tesseract cannot detect the script of the page")
|
160
166
|
return None
|
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
182
188
|
cmd = [self.options.tesseract_cmd]
|
183
189
|
cmd.append("--list-langs")
|
184
190
|
_log.info("command: {}".format(" ".join(cmd)))
|
185
|
-
|
186
|
-
|
187
|
-
decoded_data = output.decode("utf-8")
|
191
|
+
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
192
|
+
decoded_data = output.stdout.decode("utf-8")
|
188
193
|
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
189
194
|
self._tesseract_languages = df_list[0].tolist()[1:]
|
190
195
|
|
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
203
208
|
yield from page_batch
|
204
209
|
return
|
205
210
|
|
206
|
-
for page in page_batch:
|
211
|
+
for page_i, page in enumerate(page_batch):
|
207
212
|
assert page._backend is not None
|
208
213
|
if not page._backend.is_valid():
|
209
214
|
yield page
|
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
212
217
|
ocr_rects = self.get_ocr_rects(page)
|
213
218
|
|
214
219
|
all_ocr_cells = []
|
215
|
-
for ocr_rect in ocr_rects:
|
220
|
+
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
|
216
221
|
# Skip zero area boxes
|
217
222
|
if ocr_rect.area() == 0:
|
218
223
|
continue
|
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
225
230
|
) as image_file:
|
226
231
|
fname = image_file.name
|
227
232
|
high_res_image.save(image_file)
|
228
|
-
|
229
|
-
|
233
|
+
doc_orientation = 0
|
234
|
+
try:
|
235
|
+
df_osd = self._perform_osd(fname)
|
236
|
+
doc_orientation = _parse_orientation(df_osd)
|
237
|
+
except subprocess.CalledProcessError as exc:
|
238
|
+
_log.error(
|
239
|
+
"OSD failed (doc %s, page: %s, "
|
240
|
+
"OCR rectangle: %s, processed image file %s):\n %s",
|
241
|
+
conv_res.input.file,
|
242
|
+
page_i,
|
243
|
+
ocr_rect_i,
|
244
|
+
image_file,
|
245
|
+
exc.stderr,
|
246
|
+
)
|
247
|
+
# Skipping if OSD fail when in auto mode, otherwise proceed
|
248
|
+
# to OCR in the hope OCR will succeed while OSD failed
|
249
|
+
if self._is_auto:
|
250
|
+
continue
|
251
|
+
if doc_orientation != 0:
|
252
|
+
high_res_image = high_res_image.rotate(
|
253
|
+
-doc_orientation, expand=True
|
254
|
+
)
|
255
|
+
high_res_image.save(fname)
|
256
|
+
try:
|
257
|
+
df_result = self._run_tesseract(fname, df_osd)
|
258
|
+
except subprocess.CalledProcessError as exc:
|
259
|
+
_log.error(
|
260
|
+
"tesseract OCR failed (doc %s, page: %s, "
|
261
|
+
"OCR rectangle: %s, processed image file %s):\n %s",
|
262
|
+
conv_res.input.file,
|
263
|
+
page_i,
|
264
|
+
ocr_rect_i,
|
265
|
+
image_file,
|
266
|
+
exc.stderr,
|
267
|
+
)
|
268
|
+
continue
|
230
269
|
finally:
|
231
270
|
if os.path.exists(fname):
|
232
271
|
os.remove(fname)
|
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
238
277
|
text = row["text"]
|
239
278
|
conf = row["conf"]
|
240
279
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
280
|
+
left, top = float(row["left"]), float(row["top"])
|
281
|
+
right = left + float(row["width"])
|
282
|
+
bottom = top + row["height"]
|
283
|
+
bbox = BoundingBox(
|
284
|
+
l=left,
|
285
|
+
t=top,
|
286
|
+
r=right,
|
287
|
+
b=bottom,
|
288
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
289
|
+
)
|
290
|
+
rect = tesseract_box_to_bounding_rectangle(
|
291
|
+
bbox,
|
292
|
+
original_offset=ocr_rect,
|
293
|
+
scale=self.scale,
|
294
|
+
orientation=doc_orientation,
|
295
|
+
im_size=high_res_image.size,
|
296
|
+
)
|
249
297
|
cell = TextCell(
|
250
298
|
index=ix,
|
251
299
|
text=str(text),
|
252
300
|
orig=str(text),
|
253
301
|
from_ocr=True,
|
254
302
|
confidence=conf / 100.0,
|
255
|
-
rect=BoundingRectangle.from_bounding_box(
|
256
|
-
BoundingBox.from_tuple(
|
257
|
-
coord=(
|
258
|
-
(l / self.scale) + ocr_rect.l,
|
259
|
-
(b / self.scale) + ocr_rect.t,
|
260
|
-
(r / self.scale) + ocr_rect.l,
|
261
|
-
(t / self.scale) + ocr_rect.t,
|
262
|
-
),
|
263
|
-
origin=CoordOrigin.TOPLEFT,
|
264
|
-
)
|
265
|
-
),
|
303
|
+
rect=rect,
|
266
304
|
)
|
267
305
|
all_ocr_cells.append(cell)
|
268
306
|
|
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
278
316
|
@classmethod
|
279
317
|
def get_options_type(cls) -> Type[OcrOptions]:
|
280
318
|
return TesseractCliOcrOptions
|
319
|
+
|
320
|
+
|
321
|
+
def _parse_orientation(df_osd: pd.DataFrame) -> int:
|
322
|
+
orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
|
323
|
+
orientation = parse_tesseract_orientation(orientations[0].strip())
|
324
|
+
return orientation
|
@@ -1,12 +1,11 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import logging
|
4
|
-
from collections.abc import Iterable
|
5
4
|
from pathlib import Path
|
6
|
-
from typing import Optional, Type
|
5
|
+
from typing import Iterable, Optional, Type
|
7
6
|
|
8
7
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
-
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
8
|
+
from docling_core.types.doc.page import TextCell
|
10
9
|
|
11
10
|
from docling.datamodel.base_models import Page
|
12
11
|
from docling.datamodel.document import ConversionResult
|
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
|
|
17
16
|
)
|
18
17
|
from docling.datamodel.settings import settings
|
19
18
|
from docling.models.base_ocr_model import BaseOcrModel
|
20
|
-
from docling.utils.ocr_utils import map_tesseract_script
|
19
|
+
from docling.utils.ocr_utils import (
|
20
|
+
map_tesseract_script,
|
21
|
+
parse_tesseract_orientation,
|
22
|
+
tesseract_box_to_bounding_rectangle,
|
23
|
+
)
|
21
24
|
from docling.utils.profiling import TimeRecorder
|
22
25
|
|
23
26
|
_log = logging.getLogger(__name__)
|
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
38
41
|
accelerator_options=accelerator_options,
|
39
42
|
)
|
40
43
|
self.options: TesseractOcrOptions
|
41
|
-
|
44
|
+
self._is_auto: bool = "auto" in self.options.lang
|
42
45
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
43
46
|
self.reader = None
|
44
47
|
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
|
|
95
98
|
|
96
99
|
if lang == "auto":
|
97
100
|
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
98
|
-
self.osd_reader = tesserocr.PyTessBaseAPI(
|
99
|
-
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
100
|
-
)
|
101
101
|
else:
|
102
102
|
self.reader = tesserocr.PyTessBaseAPI(
|
103
103
|
**{"lang": lang} | tesserocr_kwargs,
|
104
104
|
)
|
105
|
+
self.osd_reader = tesserocr.PyTessBaseAPI(
|
106
|
+
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
107
|
+
)
|
105
108
|
self.reader_RIL = tesserocr.RIL
|
106
109
|
|
107
110
|
def __del__(self):
|
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
|
|
118
121
|
yield from page_batch
|
119
122
|
return
|
120
123
|
|
121
|
-
for page in page_batch:
|
124
|
+
for page_i, page in enumerate(page_batch):
|
122
125
|
assert page._backend is not None
|
123
126
|
if not page._backend.is_valid():
|
124
127
|
yield page
|
125
128
|
else:
|
126
129
|
with TimeRecorder(conv_res, "ocr"):
|
127
130
|
assert self.reader is not None
|
131
|
+
assert self.osd_reader is not None
|
128
132
|
assert self._tesserocr_languages is not None
|
129
133
|
|
130
134
|
ocr_rects = self.get_ocr_rects(page)
|
131
135
|
|
132
136
|
all_ocr_cells = []
|
133
|
-
for ocr_rect in ocr_rects:
|
137
|
+
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
|
134
138
|
# Skip zero area boxes
|
135
139
|
if ocr_rect.area() == 0:
|
136
140
|
continue
|
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
|
|
139
143
|
)
|
140
144
|
|
141
145
|
local_reader = self.reader
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
146
|
+
self.osd_reader.SetImage(high_res_image)
|
147
|
+
osd = self.osd_reader.DetectOrientationScript()
|
148
|
+
# No text, or Orientation and Script detection failure
|
149
|
+
if osd is None:
|
150
|
+
_log.error(
|
151
|
+
"OSD failed for doc (doc %s, page: %s, "
|
152
|
+
"OCR rectangle: %s)",
|
153
|
+
conv_res.input.file,
|
154
|
+
page_i,
|
155
|
+
ocr_rect_i,
|
156
|
+
)
|
157
|
+
# Skipping if OSD fail when in auto mode, otherwise proceed
|
158
|
+
# to OCR in the hope OCR will succeed while OSD failed
|
159
|
+
if self._is_auto:
|
150
160
|
continue
|
151
|
-
|
161
|
+
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
|
162
|
+
if doc_orientation != 0:
|
163
|
+
high_res_image = high_res_image.rotate(
|
164
|
+
-doc_orientation, expand=True
|
165
|
+
)
|
166
|
+
if self._is_auto:
|
152
167
|
script = osd["script_name"]
|
153
168
|
script = map_tesseract_script(script)
|
154
169
|
lang = f"{self.script_prefix}{script}"
|
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
|
|
188
203
|
# Extract text within the bounding box
|
189
204
|
text = local_reader.GetUTF8Text().strip()
|
190
205
|
confidence = local_reader.MeanTextConf()
|
191
|
-
left = box["x"]
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
206
|
+
left, top = box["x"], box["y"]
|
207
|
+
right = left + box["w"]
|
208
|
+
bottom = top + box["h"]
|
209
|
+
bbox = BoundingBox(
|
210
|
+
l=left,
|
211
|
+
t=top,
|
212
|
+
r=right,
|
213
|
+
b=bottom,
|
214
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
215
|
+
)
|
216
|
+
rect = tesseract_box_to_bounding_rectangle(
|
217
|
+
bbox,
|
218
|
+
original_offset=ocr_rect,
|
219
|
+
scale=self.scale,
|
220
|
+
orientation=doc_orientation,
|
221
|
+
im_size=high_res_image.size,
|
222
|
+
)
|
196
223
|
cells.append(
|
197
224
|
TextCell(
|
198
225
|
index=ix,
|
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
200
227
|
orig=text,
|
201
228
|
from_ocr=True,
|
202
229
|
confidence=confidence,
|
203
|
-
rect=BoundingRectangle.from_bounding_box(
|
204
|
-
BoundingBox.from_tuple(
|
205
|
-
coord=(left, top, right, bottom),
|
206
|
-
origin=CoordOrigin.TOPLEFT,
|
207
|
-
),
|
208
|
-
),
|
230
|
+
rect=rect,
|
209
231
|
)
|
210
232
|
)
|
211
233
|
|
@@ -3,6 +3,7 @@ import warnings
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import Optional, cast
|
5
5
|
|
6
|
+
import numpy as np
|
6
7
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
7
8
|
|
8
9
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
@@ -54,13 +55,15 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
54
55
|
"When defined, it must point to a folder containing all models required by the pipeline."
|
55
56
|
)
|
56
57
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
58
|
+
with warnings.catch_warnings(): # deprecated generate_table_images
|
59
|
+
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
60
|
+
self.keep_images = (
|
61
|
+
self.pipeline_options.generate_page_images
|
62
|
+
or self.pipeline_options.generate_picture_images
|
63
|
+
or self.pipeline_options.generate_table_images
|
64
|
+
)
|
62
65
|
|
63
|
-
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
66
|
+
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
64
67
|
|
65
68
|
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
66
69
|
|
@@ -197,7 +200,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
197
200
|
elements=all_elements, headers=all_headers, body=all_body
|
198
201
|
)
|
199
202
|
|
200
|
-
conv_res.document = self.glm_model(conv_res)
|
203
|
+
conv_res.document = self.reading_order_model(conv_res)
|
201
204
|
|
202
205
|
# Generate page images in the output
|
203
206
|
if self.pipeline_options.generate_page_images:
|
@@ -209,40 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
209
212
|
)
|
210
213
|
|
211
214
|
# Generate images of the requested element types
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
215
|
+
with warnings.catch_warnings(): # deprecated generate_table_images
|
216
|
+
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
217
|
+
if (
|
218
|
+
self.pipeline_options.generate_picture_images
|
219
|
+
or self.pipeline_options.generate_table_images
|
220
|
+
):
|
221
|
+
scale = self.pipeline_options.images_scale
|
222
|
+
for element, _level in conv_res.document.iterate_items():
|
223
|
+
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
224
|
+
continue
|
225
|
+
if (
|
226
|
+
isinstance(element, PictureItem)
|
227
|
+
and self.pipeline_options.generate_picture_images
|
228
|
+
) or (
|
229
|
+
isinstance(element, TableItem)
|
230
|
+
and self.pipeline_options.generate_table_images
|
231
|
+
):
|
232
|
+
page_ix = element.prov[0].page_no - 1
|
233
|
+
page = next(
|
234
|
+
(p for p in conv_res.pages if p.page_no == page_ix),
|
235
|
+
cast("Page", None),
|
236
|
+
)
|
237
|
+
assert page is not None
|
238
|
+
assert page.size is not None
|
239
|
+
assert page.image is not None
|
240
|
+
|
241
|
+
crop_bbox = (
|
242
|
+
element.prov[0]
|
243
|
+
.bbox.scaled(scale=scale)
|
244
|
+
.to_top_left_origin(
|
245
|
+
page_height=page.size.height * scale
|
246
|
+
)
|
247
|
+
)
|
248
|
+
|
249
|
+
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
250
|
+
element.image = ImageRef.from_pil(
|
251
|
+
cropped_im, dpi=int(72 * scale)
|
252
|
+
)
|
253
|
+
|
254
|
+
# Aggregate confidence values for document:
|
255
|
+
if len(conv_res.pages) > 0:
|
256
|
+
with warnings.catch_warnings():
|
257
|
+
warnings.filterwarnings(
|
258
|
+
"ignore",
|
259
|
+
category=RuntimeWarning,
|
260
|
+
message="Mean of empty slice|All-NaN slice encountered",
|
261
|
+
)
|
262
|
+
conv_res.confidence.layout_score = float(
|
263
|
+
np.nanmean(
|
264
|
+
[c.layout_score for c in conv_res.confidence.pages.values()]
|
231
265
|
)
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
element.prov[0]
|
238
|
-
.bbox.scaled(scale=scale)
|
239
|
-
.to_top_left_origin(page_height=page.size.height * scale)
|
266
|
+
)
|
267
|
+
conv_res.confidence.parse_score = float(
|
268
|
+
np.nanquantile(
|
269
|
+
[c.parse_score for c in conv_res.confidence.pages.values()],
|
270
|
+
q=0.1, # parse score should relate to worst 10% of pages.
|
240
271
|
)
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
272
|
+
)
|
273
|
+
conv_res.confidence.table_score = float(
|
274
|
+
np.nanmean(
|
275
|
+
[c.table_score for c in conv_res.confidence.pages.values()]
|
245
276
|
)
|
277
|
+
)
|
278
|
+
conv_res.confidence.ocr_score = float(
|
279
|
+
np.nanmean(
|
280
|
+
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
281
|
+
)
|
282
|
+
)
|
246
283
|
|
247
284
|
return conv_res
|
248
285
|
|
@@ -90,17 +90,12 @@ class SpatialClusterIndex:
|
|
90
90
|
containment_threshold: float,
|
91
91
|
) -> bool:
|
92
92
|
"""Check if two bboxes overlap sufficiently."""
|
93
|
-
|
94
|
-
if area1 <= 0 or area2 <= 0:
|
93
|
+
if bbox1.area() <= 0 or bbox2.area() <= 0:
|
95
94
|
return False
|
96
95
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
iou = overlap_area / (area1 + area2 - overlap_area)
|
102
|
-
containment1 = overlap_area / area1
|
103
|
-
containment2 = overlap_area / area2
|
96
|
+
iou = bbox1.intersection_over_union(bbox2)
|
97
|
+
containment1 = bbox1.intersection_over_self(bbox2)
|
98
|
+
containment2 = bbox2.intersection_over_self(bbox1)
|
104
99
|
|
105
100
|
return (
|
106
101
|
iou > overlap_threshold
|
@@ -321,11 +316,9 @@ class LayoutPostprocessor:
|
|
321
316
|
for special in special_clusters:
|
322
317
|
contained = []
|
323
318
|
for cluster in self.regular_clusters:
|
324
|
-
|
325
|
-
if
|
326
|
-
|
327
|
-
if containment > 0.8:
|
328
|
-
contained.append(cluster)
|
319
|
+
containment = cluster.bbox.intersection_over_self(special.bbox)
|
320
|
+
if containment > 0.8:
|
321
|
+
contained.append(cluster)
|
329
322
|
|
330
323
|
if contained:
|
331
324
|
# Sort contained clusters by minimum cell ID:
|
@@ -379,9 +372,7 @@ class LayoutPostprocessor:
|
|
379
372
|
for regular in self.regular_clusters:
|
380
373
|
if regular.label == DocItemLabel.TABLE:
|
381
374
|
# Calculate overlap
|
382
|
-
|
383
|
-
wrapper_area = wrapper.bbox.area()
|
384
|
-
overlap_ratio = overlap / wrapper_area
|
375
|
+
overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
|
385
376
|
|
386
377
|
conf_diff = wrapper.confidence - regular.confidence
|
387
378
|
|
@@ -421,8 +412,7 @@ class LayoutPostprocessor:
|
|
421
412
|
# Rule 2: CODE vs others
|
422
413
|
if candidate.label == DocItemLabel.CODE:
|
423
414
|
# Calculate how much of the other cluster is contained within the CODE cluster
|
424
|
-
|
425
|
-
containment = overlap / other.bbox.area()
|
415
|
+
containment = other.bbox.intersection_over_self(candidate.bbox)
|
426
416
|
if containment > 0.8: # other is 80% contained within CODE
|
427
417
|
return True
|
428
418
|
|
@@ -586,11 +576,9 @@ class LayoutPostprocessor:
|
|
586
576
|
if cell.rect.to_bounding_box().area() <= 0:
|
587
577
|
continue
|
588
578
|
|
589
|
-
|
579
|
+
overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
|
590
580
|
cluster.bbox
|
591
581
|
)
|
592
|
-
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
|
593
|
-
|
594
582
|
if overlap_ratio > best_overlap:
|
595
583
|
best_overlap = overlap_ratio
|
596
584
|
best_cluster = cluster
|
docling/utils/ocr_utils.py
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
from typing import Optional, Tuple
|
2
|
+
|
3
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
4
|
+
from docling_core.types.doc.page import BoundingRectangle
|
5
|
+
|
6
|
+
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
|
7
|
+
|
8
|
+
|
1
9
|
def map_tesseract_script(script: str) -> str:
|
2
10
|
r""" """
|
3
11
|
if script == "Katakana" or script == "Hiragana":
|
@@ -7,3 +15,55 @@ def map_tesseract_script(script: str) -> str:
|
|
7
15
|
elif script == "Korean":
|
8
16
|
script = "Hangul"
|
9
17
|
return script
|
18
|
+
|
19
|
+
|
20
|
+
def parse_tesseract_orientation(orientation: str) -> int:
    """Convert a Tesseract orientation string to a counterclockwise angle.

    Tesseract reports the document orientation as one of 0, 90, 180 or 270
    degrees measured clockwise, whereas bounding rectangle angles are
    expressed in [0, 360[ counterclockwise. The parsed value is therefore
    negated and reduced modulo 360.

    Args:
        orientation: Orientation as text, e.g. ``"90"``.

    Returns:
        The equivalent counterclockwise angle in degrees, in [0, 360[.

    Raises:
        ValueError: If the parsed integer is not in ``CLIPPED_ORIENTATIONS``.
    """
    clockwise = int(orientation)
    if clockwise in CLIPPED_ORIENTATIONS:
        # Flip the rotation direction and normalise into [0, 360[.
        return (-clockwise) % 360
    msg = (
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
    raise ValueError(msg)
|
33
|
+
|
34
|
+
|
35
|
+
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Turn a Tesseract box into a scaled, optionally shifted rectangle.

    The box is rotated by ``-orientation`` with ``rotate_bounding_box``, every
    corner coordinate is divided by ``scale``, and, when ``original_offset``
    is given, the corners are translated by its left/top values.

    Args:
        bbox: Box to convert.
            # NOTE(review): the original comment describes it as "top, left,
            # height, width format, top left coordinates" - confirm with caller.
        original_offset: Optional box whose ``l`` and ``t`` are added to the
            x and y coordinates respectively. Must use a TOPLEFT origin.
        scale: Divisor applied to every corner coordinate.
        orientation: Orientation in degrees; its negation is passed on as the
            rotation angle.
        im_size: Image size forwarded unchanged to ``rotate_bounding_box``.

    Returns:
        A ``BoundingRectangle`` with ``coord_origin`` set to TOPLEFT.

    Raises:
        ValueError: If ``original_offset`` is given with a non-TOPLEFT origin.
    """
    x_fields = ("r_x0", "r_x1", "r_x2", "r_x3")
    y_fields = ("r_y0", "r_y1", "r_y2", "r_y3")

    rotated = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
    scaled = {
        name: getattr(rotated, name) / scale for name in x_fields + y_fields
    }
    rect = BoundingRectangle(coord_origin=CoordOrigin.TOPLEFT, **scaled)

    # Without an offset there is nothing left to translate.
    if original_offset is None:
        return rect

    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        raise ValueError(msg)

    # Shift the rectangle by the offset's left/top values.
    for name in x_fields:
        setattr(rect, name, getattr(rect, name) + original_offset.l)
    for name in y_fields:
        setattr(rect, name, getattr(rect, name) + original_offset.t)
    return rect
|
@@ -0,0 +1,71 @@
|
|
1
|
+
from typing import Tuple
|
2
|
+
|
3
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
4
|
+
from docling_core.types.doc.page import BoundingRectangle
|
5
|
+
|
6
|
+
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
7
|
+
|
8
|
+
|
9
|
+
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Map an axis-aligned box onto the corners of a rotated rectangle.

    The box is converted to TOPLEFT coordinates and read as
    (left, top, width, height). The bounding rectangle starts with ``r_0`` at
    the bottom left of the box, and the other corners are found by rotating
    counterclockwise.

    Args:
        bbox: Box to rotate.
        angle: Rotation in degrees. It is reduced modulo 360 and must then be
            one of ``CLIPPED_ORIENTATIONS``.
        im_size: Image size. ``im_size[1]`` is used as the page height for the
            origin conversion, so the tuple is read as ``(width, height)``.
            # NOTE(review): presumably a PIL ``Image.size`` - confirm at the
            # call sites.

    Returns:
        A ``BoundingRectangle`` in TOPLEFT coordinates.

    Raises:
        ValueError: If the normalised angle is not 0, 90, 180 or 270.
    """
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
    # Unpack as (width, height) so the names agree with the use of
    # im_size[1] as the page height on the line above.
    im_w, im_h = im_size
    angle = angle % 360
    if angle == 0:
        r_x0 = left
        r_y0 = top + height
        r_x1 = r_x0 + width
        r_y1 = r_y0
        r_x2 = r_x0 + width
        r_y2 = r_y0 - height
        r_x3 = r_x0
        r_y3 = r_y0 - height
    elif angle == 90:
        r_x0 = im_h - (top + height)
        r_y0 = left
        r_x1 = r_x0
        r_y1 = r_y0 + width
        r_x2 = r_x0 + height
        r_y2 = r_y0 + width
        # Fourth corner closes the rectangle; it must not repeat corner 1,
        # which would collapse the rectangle into a degenerate shape.
        r_x3 = r_x0 + height
        r_y3 = r_y0
    elif angle == 180:
        r_x0 = im_w - left
        r_y0 = im_h - (top + height)
        r_x1 = r_x0 - width
        r_y1 = r_y0
        r_x2 = r_x0 - width
        r_y2 = r_y0 + height
        r_x3 = r_x0
        r_y3 = r_y0 + height
    elif angle == 270:
        r_x0 = top + height
        r_y0 = im_w - left
        r_x1 = r_x0
        r_y1 = r_y0 - width
        r_x2 = r_x0 - height
        r_y2 = r_y0 - width
        r_x3 = r_x0 - height
        r_y3 = r_y0
    else:
        msg = (
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )
        raise ValueError(msg)
    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.35.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
|
|
29
29
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
30
30
|
Requires-Dist: certifi (>=2024.7.4)
|
31
31
|
Requires-Dist: click (<8.2.0)
|
32
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
32
|
+
Requires-Dist: docling-core[chunking] (>=2.31.2,<3.0.0)
|
33
33
|
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
34
34
|
Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
|
35
35
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -3,9 +3,9 @@ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=W-4MRcID6AU9Ax23q8FwDwGG-OOCrBoqcNf2Ch_WPUc,14041
|
5
5
|
docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
|
6
|
-
docling/backend/docling_parse_backend.py,sha256=
|
7
|
-
docling/backend/docling_parse_v2_backend.py,sha256=
|
8
|
-
docling/backend/docling_parse_v4_backend.py,sha256
|
6
|
+
docling/backend/docling_parse_backend.py,sha256=bVSPmmiVXdCVfe-eLtDhbPQKBjkFR8rZJoRxdWIMdYU,7998
|
7
|
+
docling/backend/docling_parse_v2_backend.py,sha256=R4YPCEs72GYg-Xc9VfizPv8QjtGmKOsQzVPNAU2RIK0,9376
|
8
|
+
docling/backend/docling_parse_v4_backend.py,sha256=aWh-fd-lnuRGVGC_DG17QUptIsArv5V1gJo8QFbB5Ys,6263
|
9
9
|
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
|
@@ -24,12 +24,12 @@ docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e
|
|
24
24
|
docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
|
25
25
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
26
26
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
docling/cli/main.py,sha256=
|
27
|
+
docling/cli/main.py,sha256=KARZ1OJx4HvHb1D_95GPIAhKaIlhcYYSBa0t4PM-Xfk,27339
|
28
28
|
docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
|
29
29
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
30
30
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
docling/datamodel/base_models.py,sha256=
|
32
|
-
docling/datamodel/document.py,sha256=
|
31
|
+
docling/datamodel/base_models.py,sha256=QJlzGJKUAO0kqM6DO2RZKlFi-lL2MpY8qt3Wdm02Slw,10460
|
32
|
+
docling/datamodel/document.py,sha256=vPwiVU5zWCKbVYMq-TSmb7LTjijrqJq0FyAgDBa0XGA,16154
|
33
33
|
docling/datamodel/pipeline_options.py,sha256=uwjBvK4egrgcF1_w4B5EDxpGnl4IgBzmxP7dJ7zm394,13400
|
34
34
|
docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
|
35
35
|
docling/document_converter.py,sha256=PRRr65nigQ3LZDl4G2fBMkOtJyswT7xyGt7fpUeDO3w,13849
|
@@ -47,10 +47,10 @@ docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0
|
|
47
47
|
docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
|
48
48
|
docling/models/hf_mlx_model.py,sha256=B_B4hFU-jU0g_DQtQD8w4Ejorn10mkDuFI93wR_WhGk,4897
|
49
49
|
docling/models/hf_vlm_model.py,sha256=SiPMTLghMUjJ66dA2yN4UujpLO6PiOhLEPInWtXV_5s,6912
|
50
|
-
docling/models/layout_model.py,sha256=
|
50
|
+
docling/models/layout_model.py,sha256=1LLDS3hBfdJXA16L_PrjA_1rM_A2r5rNFkHVbLBCl_8,8639
|
51
51
|
docling/models/ocr_mac_model.py,sha256=A3TlEbvvwhkWiq9YARos3Y9yNcpPYQ7JGc_4hFtAK-8,5370
|
52
|
-
docling/models/page_assemble_model.py,sha256=
|
53
|
-
docling/models/page_preprocessing_model.py,sha256=
|
52
|
+
docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
|
53
|
+
docling/models/page_preprocessing_model.py,sha256=8cdhR9n3zcC8JxDen8WdPBx_GNk_5VICeHJo1-kP518,5186
|
54
54
|
docling/models/picture_description_api_model.py,sha256=kCuAFOGEuI5QsRul7Pc1LccxWN7WIvIUhXEmSICYegw,2332
|
55
55
|
docling/models/picture_description_base_model.py,sha256=FbBVXzAOB87xpJN28tuGCxoAdcf6mZNUOqJR7ljUg5g,2946
|
56
56
|
docling/models/picture_description_vlm_model.py,sha256=DiTjnehVy1n0N04xPUvZl8rx4TiNHzHn9Cnzy_ePGts,4177
|
@@ -59,12 +59,12 @@ docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurH
|
|
59
59
|
docling/models/rapid_ocr_model.py,sha256=Tq_1Egu5Hjx7Y69Vox17QTtRXztSyflB1fhN08CWQwY,5894
|
60
60
|
docling/models/readingorder_model.py,sha256=S9ru2ApY9sE-Uue3hptWHmbmElwo36bUbAikxCFpHYs,14574
|
61
61
|
docling/models/table_structure_model.py,sha256=1gxLaooK0IKMrnmS8nT1BItKqt1GAKghfpmLKb3i53g,12566
|
62
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
63
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
62
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=e55MkaDdsseKcfX5lxIt0iv5jR6pDFBzWBZHTvl2Jws,12653
|
63
|
+
docling/models/tesseract_ocr_model.py,sha256=vS4And5NHe_uLNb6ZBi2CQzWUITBdc1E1zlsojrSZpM,10561
|
64
64
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
65
65
|
docling/pipeline/base_pipeline.py,sha256=DnuxAf7EQusdSRae0QUVth-0f2mSff8JZjX-2vazk00,8751
|
66
66
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
67
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
67
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=itCZPj7nMFAQtAlStfmWthpCIHZFUm9W5uTgvVi6PkQ,12738
|
68
68
|
docling/pipeline/vlm_pipeline.py,sha256=ZW1WGd6jeLqTCWR0S0cj6H_qVMUXELaFCrJVpvZp6Co,9684
|
69
69
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
70
70
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -72,15 +72,16 @@ docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExT
|
|
72
72
|
docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
|
73
73
|
docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
|
74
74
|
docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
|
75
|
-
docling/utils/layout_postprocessor.py,sha256=
|
75
|
+
docling/utils/layout_postprocessor.py,sha256=3WCmkPsPJ80xfWzAUeWb5L9BmuwJ79ztctvbbUs8AfI,24068
|
76
76
|
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
77
77
|
docling/utils/model_downloader.py,sha256=ocvud3G3qlBQhzMo69Q3RJMnvq5HPZ2DwNbMuEp8RCs,4142
|
78
|
-
docling/utils/ocr_utils.py,sha256=
|
78
|
+
docling/utils/ocr_utils.py,sha256=AOaDAHr5S74d-IRVR_LKhKynUTIurAwLJ3wNeY58gPA,2326
|
79
|
+
docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,2011
|
79
80
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
80
81
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
81
82
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
82
|
-
docling-2.
|
83
|
-
docling-2.
|
84
|
-
docling-2.
|
85
|
-
docling-2.
|
86
|
-
docling-2.
|
83
|
+
docling-2.35.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
84
|
+
docling-2.35.0.dist-info/METADATA,sha256=SMuVHjV5ouB773e4tFnu7fqpvEdygq3ksNbESerk0Ao,10138
|
85
|
+
docling-2.35.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
86
|
+
docling-2.35.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
|
87
|
+
docling-2.35.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|