docling 2.33.0__tar.gz → 2.34.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.33.0 → docling-2.34.0}/PKG-INFO +1 -1
- {docling-2.33.0 → docling-2.34.0}/docling/backend/docling_parse_backend.py +1 -1
- {docling-2.33.0 → docling-2.34.0}/docling/backend/docling_parse_v2_backend.py +1 -1
- {docling-2.33.0 → docling-2.34.0}/docling/backend/docling_parse_v4_backend.py +1 -1
- {docling-2.33.0 → docling-2.34.0}/docling/datamodel/base_models.py +99 -2
- {docling-2.33.0 → docling-2.34.0}/docling/datamodel/document.py +3 -1
- {docling-2.33.0 → docling-2.34.0}/docling/models/layout_model.py +9 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/page_assemble_model.py +1 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/page_preprocessing_model.py +50 -1
- {docling-2.33.0 → docling-2.34.0}/docling/models/tesseract_ocr_cli_model.py +85 -41
- {docling-2.33.0 → docling-2.34.0}/docling/models/tesseract_ocr_model.py +52 -30
- {docling-2.33.0 → docling-2.34.0}/docling/pipeline/standard_pdf_pipeline.py +28 -3
- {docling-2.33.0 → docling-2.34.0}/docling/utils/layout_postprocessor.py +10 -22
- docling-2.34.0/docling/utils/ocr_utils.py +69 -0
- docling-2.34.0/docling/utils/orientation.py +71 -0
- {docling-2.33.0 → docling-2.34.0}/pyproject.toml +1 -1
- docling-2.33.0/docling/utils/ocr_utils.py +0 -9
- {docling-2.33.0 → docling-2.34.0}/LICENSE +0 -0
- {docling-2.33.0 → docling-2.34.0}/README.md +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/html_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/md_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/chunking/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/cli/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/cli/main.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/cli/models.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/cli/tools.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/datamodel/settings.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/document_converter.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/exceptions.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/base_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/hf_mlx_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/hf_vlm_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/py.typed +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/__init__.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/export.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/locks.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/profiling.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/utils.py +0 -0
- {docling-2.33.0 → docling-2.34.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.33.0
|
3
|
+
Version: 2.34.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
60
60
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
61
61
|
).to_top_left_origin(page_height=page_size.height * scale)
|
62
62
|
|
63
|
-
overlap_frac = cell_bbox.
|
63
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
64
64
|
|
65
65
|
if overlap_frac > 0.5:
|
66
66
|
if len(text_piece) > 0:
|
@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
71
71
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
72
72
|
).to_top_left_origin(page_height=page_size.height * scale)
|
73
73
|
|
74
|
-
overlap_frac = cell_bbox.
|
74
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
75
75
|
|
76
76
|
if overlap_frac > 0.5:
|
77
77
|
if len(text_piece) > 0:
|
@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
46
46
|
.scaled(scale)
|
47
47
|
)
|
48
48
|
|
49
|
-
overlap_frac = cell_bbox.
|
49
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
50
50
|
|
51
51
|
if overlap_frac > 0.5:
|
52
52
|
if len(text_piece) > 0:
|
@@ -1,6 +1,9 @@
|
|
1
|
+
import math
|
2
|
+
from collections import defaultdict
|
1
3
|
from enum import Enum
|
2
|
-
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
|
+
from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
|
3
5
|
|
6
|
+
import numpy as np
|
4
7
|
from docling_core.types.doc import (
|
5
8
|
BoundingBox,
|
6
9
|
DocItemLabel,
|
@@ -16,7 +19,7 @@ from docling_core.types.io import (
|
|
16
19
|
DocumentStream,
|
17
20
|
)
|
18
21
|
from PIL.Image import Image
|
19
|
-
from pydantic import BaseModel, ConfigDict
|
22
|
+
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
20
23
|
|
21
24
|
if TYPE_CHECKING:
|
22
25
|
from docling.backend.pdf_backend import PdfPageBackend
|
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
|
|
298
301
|
choices: List[OpenAiResponseChoice]
|
299
302
|
created: int
|
300
303
|
usage: OpenAiResponseUsage
|
304
|
+
|
305
|
+
|
306
|
+
# Create a type alias for score values
|
307
|
+
ScoreValue = float
|
308
|
+
|
309
|
+
|
310
|
+
class QualityGrade(str, Enum):
|
311
|
+
POOR = "poor"
|
312
|
+
FAIR = "fair"
|
313
|
+
GOOD = "good"
|
314
|
+
EXCELLENT = "excellent"
|
315
|
+
UNSPECIFIED = "unspecified"
|
316
|
+
|
317
|
+
|
318
|
+
class PageConfidenceScores(BaseModel):
|
319
|
+
parse_score: ScoreValue = np.nan
|
320
|
+
layout_score: ScoreValue = np.nan
|
321
|
+
table_score: ScoreValue = np.nan
|
322
|
+
ocr_score: ScoreValue = np.nan
|
323
|
+
|
324
|
+
def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
|
325
|
+
if score < 0.5:
|
326
|
+
return QualityGrade.POOR
|
327
|
+
elif score < 0.8:
|
328
|
+
return QualityGrade.FAIR
|
329
|
+
elif score < 0.9:
|
330
|
+
return QualityGrade.GOOD
|
331
|
+
elif score >= 0.9:
|
332
|
+
return QualityGrade.EXCELLENT
|
333
|
+
|
334
|
+
return QualityGrade.UNSPECIFIED
|
335
|
+
|
336
|
+
@computed_field # type: ignore
|
337
|
+
@property
|
338
|
+
def mean_grade(self) -> QualityGrade:
|
339
|
+
return self._score_to_grade(self.mean_score)
|
340
|
+
|
341
|
+
@computed_field # type: ignore
|
342
|
+
@property
|
343
|
+
def low_grade(self) -> QualityGrade:
|
344
|
+
return self._score_to_grade(self.low_score)
|
345
|
+
|
346
|
+
@computed_field # type: ignore
|
347
|
+
@property
|
348
|
+
def mean_score(self) -> ScoreValue:
|
349
|
+
return ScoreValue(
|
350
|
+
np.nanmean(
|
351
|
+
[
|
352
|
+
self.ocr_score,
|
353
|
+
self.table_score,
|
354
|
+
self.layout_score,
|
355
|
+
self.parse_score,
|
356
|
+
]
|
357
|
+
)
|
358
|
+
)
|
359
|
+
|
360
|
+
@computed_field # type: ignore
|
361
|
+
@property
|
362
|
+
def low_score(self) -> ScoreValue:
|
363
|
+
return ScoreValue(
|
364
|
+
np.nanquantile(
|
365
|
+
[
|
366
|
+
self.ocr_score,
|
367
|
+
self.table_score,
|
368
|
+
self.layout_score,
|
369
|
+
self.parse_score,
|
370
|
+
],
|
371
|
+
q=0.05,
|
372
|
+
)
|
373
|
+
)
|
374
|
+
|
375
|
+
|
376
|
+
class ConfidenceReport(PageConfidenceScores):
|
377
|
+
pages: Dict[int, PageConfidenceScores] = Field(
|
378
|
+
default_factory=lambda: defaultdict(PageConfidenceScores)
|
379
|
+
)
|
380
|
+
|
381
|
+
@computed_field # type: ignore
|
382
|
+
@property
|
383
|
+
def mean_score(self) -> ScoreValue:
|
384
|
+
return ScoreValue(
|
385
|
+
np.nanmean(
|
386
|
+
[c.mean_score for c in self.pages.values()],
|
387
|
+
)
|
388
|
+
)
|
389
|
+
|
390
|
+
@computed_field # type: ignore
|
391
|
+
@property
|
392
|
+
def low_score(self) -> ScoreValue:
|
393
|
+
return ScoreValue(
|
394
|
+
np.nanmean(
|
395
|
+
[c.low_score for c in self.pages.values()],
|
396
|
+
)
|
397
|
+
)
|
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
|
|
47
47
|
)
|
48
48
|
from docling_core.utils.file import resolve_source_to_stream
|
49
49
|
from docling_core.utils.legacy import docling_document_to_legacy
|
50
|
-
from pydantic import BaseModel
|
50
|
+
from pydantic import BaseModel, Field
|
51
51
|
from typing_extensions import deprecated
|
52
52
|
|
53
53
|
from docling.backend.abstract_backend import (
|
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
|
|
56
56
|
)
|
57
57
|
from docling.datamodel.base_models import (
|
58
58
|
AssembledUnit,
|
59
|
+
ConfidenceReport,
|
59
60
|
ConversionStatus,
|
60
61
|
DocumentStream,
|
61
62
|
ErrorItem,
|
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
|
|
201
202
|
pages: List[Page] = []
|
202
203
|
assembled: AssembledUnit = AssembledUnit()
|
203
204
|
timings: Dict[str, ProfilingItem] = {}
|
205
|
+
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
204
206
|
|
205
207
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
206
208
|
|
@@ -5,6 +5,7 @@ from collections.abc import Iterable
|
|
5
5
|
from pathlib import Path
|
6
6
|
from typing import Optional
|
7
7
|
|
8
|
+
import numpy as np
|
8
9
|
from docling_core.types.doc import DocItemLabel
|
9
10
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
10
11
|
from PIL import Image
|
@@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
|
|
184
185
|
).postprocess()
|
185
186
|
# processed_clusters, processed_cells = clusters, page.cells
|
186
187
|
|
188
|
+
conv_res.confidence.pages[page.page_no].layout_score = float(
|
189
|
+
np.mean([c.confidence for c in processed_clusters])
|
190
|
+
)
|
191
|
+
|
192
|
+
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
193
|
+
np.mean([c.confidence for c in processed_cells if c.from_ocr])
|
194
|
+
)
|
195
|
+
|
187
196
|
page.cells = processed_cells
|
188
197
|
page.predictions.layout = LayoutPrediction(
|
189
198
|
clusters=processed_clusters
|
@@ -1,11 +1,13 @@
|
|
1
|
+
import re
|
1
2
|
from collections.abc import Iterable
|
2
3
|
from pathlib import Path
|
3
4
|
from typing import Optional
|
4
5
|
|
6
|
+
import numpy as np
|
5
7
|
from PIL import ImageDraw
|
6
8
|
from pydantic import BaseModel
|
7
9
|
|
8
|
-
from docling.datamodel.base_models import Page
|
10
|
+
from docling.datamodel.base_models import Page, ScoreValue
|
9
11
|
from docling.datamodel.document import ConversionResult
|
10
12
|
from docling.datamodel.settings import settings
|
11
13
|
from docling.models.base_model import BasePageModel
|
@@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
|
|
21
23
|
def __init__(self, options: PagePreprocessingOptions):
|
22
24
|
self.options = options
|
23
25
|
|
26
|
+
# Pre-compiled regex patterns for efficiency
|
27
|
+
self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
|
28
|
+
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
|
29
|
+
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
|
30
|
+
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
|
31
|
+
r"(?:/\w+\s*){2,}"
|
32
|
+
) # Two or more "/token " sequences
|
33
|
+
|
24
34
|
def __call__(
|
25
35
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
26
36
|
) -> Iterable[Page]:
|
@@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
|
|
60
70
|
if self.options.create_parsed_page:
|
61
71
|
page.parsed_page = page._backend.get_segmented_page()
|
62
72
|
|
73
|
+
# Rate the text quality from the PDF parser, and aggregate on page
|
74
|
+
text_scores = []
|
75
|
+
for c in page.cells:
|
76
|
+
score = self.rate_text_quality(c.text)
|
77
|
+
text_scores.append(score)
|
78
|
+
|
79
|
+
conv_res.confidence.pages[page.page_no].parse_score = float(
|
80
|
+
np.nanquantile(
|
81
|
+
text_scores, q=0.10
|
82
|
+
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
83
|
+
)
|
84
|
+
|
63
85
|
# DEBUG code:
|
64
86
|
def draw_text_boxes(image, cells, show: bool = False):
|
65
87
|
draw = ImageDraw.Draw(image)
|
@@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
|
|
88
110
|
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
89
111
|
|
90
112
|
return page
|
113
|
+
|
114
|
+
def rate_text_quality(self, text: str) -> float:
|
115
|
+
# Hard errors: if any of these patterns are found, return 0.0 immediately.
|
116
|
+
blacklist_chars = ["�"]
|
117
|
+
if (
|
118
|
+
any(text.find(c) >= 0 for c in blacklist_chars)
|
119
|
+
or self.GLYPH_RE.search(text)
|
120
|
+
or self.SLASH_G_RE.search(text)
|
121
|
+
or self.SLASH_NUMBER_GARBAGE_RE.match(
|
122
|
+
text
|
123
|
+
) # Check if text is mostly slash-number pattern
|
124
|
+
):
|
125
|
+
return 0.0
|
126
|
+
|
127
|
+
penalty = 0.0
|
128
|
+
|
129
|
+
# Apply a penalty only if the fragmented words pattern occurs at least three times.
|
130
|
+
frag_matches = self.FRAG_RE.findall(text)
|
131
|
+
if len(frag_matches) >= 3:
|
132
|
+
penalty += 0.1 * len(frag_matches)
|
133
|
+
|
134
|
+
# Additional heuristic: if the average token length is below 2, add a penalty.
|
135
|
+
# tokens = text.split()
|
136
|
+
# if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
|
137
|
+
# penalty += 0.2
|
138
|
+
|
139
|
+
return max(1.0 - penalty, 0.0)
|
@@ -2,6 +2,7 @@ import csv
|
|
2
2
|
import io
|
3
3
|
import logging
|
4
4
|
import os
|
5
|
+
import subprocess
|
5
6
|
import tempfile
|
6
7
|
from collections.abc import Iterable
|
7
8
|
from pathlib import Path
|
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
|
|
10
11
|
|
11
12
|
import pandas as pd
|
12
13
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
13
|
-
from docling_core.types.doc.page import
|
14
|
+
from docling_core.types.doc.page import TextCell
|
14
15
|
|
15
16
|
from docling.datamodel.base_models import Page
|
16
17
|
from docling.datamodel.document import ConversionResult
|
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
|
|
21
22
|
)
|
22
23
|
from docling.datamodel.settings import settings
|
23
24
|
from docling.models.base_ocr_model import BaseOcrModel
|
24
|
-
from docling.utils.ocr_utils import
|
25
|
+
from docling.utils.ocr_utils import (
|
26
|
+
map_tesseract_script,
|
27
|
+
parse_tesseract_orientation,
|
28
|
+
tesseract_box_to_bounding_rectangle,
|
29
|
+
)
|
25
30
|
from docling.utils.profiling import TimeRecorder
|
26
31
|
|
27
32
|
_log = logging.getLogger(__name__)
|
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
49
54
|
self._version: Optional[str] = None
|
50
55
|
self._tesseract_languages: Optional[List[str]] = None
|
51
56
|
self._script_prefix: Optional[str] = None
|
57
|
+
self._is_auto: bool = "auto" in self.options.lang
|
52
58
|
|
53
59
|
if self.enabled:
|
54
60
|
try:
|
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
93
99
|
|
94
100
|
return name, version
|
95
101
|
|
96
|
-
def _run_tesseract(self, ifilename: str):
|
102
|
+
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
|
97
103
|
r"""
|
98
104
|
Run tesseract CLI
|
99
105
|
"""
|
100
106
|
cmd = [self.options.tesseract_cmd]
|
101
|
-
|
102
|
-
|
103
|
-
lang = self._detect_language(ifilename)
|
107
|
+
if self._is_auto:
|
108
|
+
lang = self._parse_language(osd)
|
104
109
|
if lang is not None:
|
105
110
|
cmd.append("-l")
|
106
111
|
cmd.append(lang)
|
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
115
120
|
cmd += [ifilename, "stdout", "tsv"]
|
116
121
|
_log.info("command: {}".format(" ".join(cmd)))
|
117
122
|
|
118
|
-
|
119
|
-
output, _ = proc.communicate()
|
123
|
+
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
120
124
|
|
121
125
|
# _log.info(output)
|
122
126
|
|
123
127
|
# Decode the byte string to a regular string
|
124
|
-
decoded_data = output.decode("utf-8")
|
128
|
+
decoded_data = output.stdout.decode("utf-8")
|
125
129
|
# _log.info(decoded_data)
|
126
130
|
|
127
131
|
# Read the TSV file generated by Tesseract
|
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
139
143
|
|
140
144
|
return df_filtered
|
141
145
|
|
142
|
-
def _detect_language(self, ifilename: str):
|
146
|
+
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
|
143
147
|
r"""
|
144
148
|
Run tesseract in PSM 0 mode to detect the language
|
145
149
|
"""
|
146
|
-
assert self._tesseract_languages is not None
|
147
150
|
|
148
151
|
cmd = [self.options.tesseract_cmd]
|
149
152
|
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
150
153
|
_log.info("command: {}".format(" ".join(cmd)))
|
151
|
-
|
152
|
-
|
153
|
-
decoded_data = output.decode("utf-8")
|
154
|
+
output = subprocess.run(cmd, capture_output=True, check=True)
|
155
|
+
decoded_data = output.stdout.decode("utf-8")
|
154
156
|
df_detected = pd.read_csv(
|
155
157
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
156
158
|
)
|
157
|
-
|
159
|
+
return df_detected
|
160
|
+
|
161
|
+
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
|
162
|
+
assert self._tesseract_languages is not None
|
163
|
+
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
|
158
164
|
if len(scripts) == 0:
|
159
165
|
_log.warning("Tesseract cannot detect the script of the page")
|
160
166
|
return None
|
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
182
188
|
cmd = [self.options.tesseract_cmd]
|
183
189
|
cmd.append("--list-langs")
|
184
190
|
_log.info("command: {}".format(" ".join(cmd)))
|
185
|
-
|
186
|
-
|
187
|
-
decoded_data = output.decode("utf-8")
|
191
|
+
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
192
|
+
decoded_data = output.stdout.decode("utf-8")
|
188
193
|
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
189
194
|
self._tesseract_languages = df_list[0].tolist()[1:]
|
190
195
|
|
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
203
208
|
yield from page_batch
|
204
209
|
return
|
205
210
|
|
206
|
-
for page in page_batch:
|
211
|
+
for page_i, page in enumerate(page_batch):
|
207
212
|
assert page._backend is not None
|
208
213
|
if not page._backend.is_valid():
|
209
214
|
yield page
|
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
212
217
|
ocr_rects = self.get_ocr_rects(page)
|
213
218
|
|
214
219
|
all_ocr_cells = []
|
215
|
-
for ocr_rect in ocr_rects:
|
220
|
+
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
|
216
221
|
# Skip zero area boxes
|
217
222
|
if ocr_rect.area() == 0:
|
218
223
|
continue
|
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
225
230
|
) as image_file:
|
226
231
|
fname = image_file.name
|
227
232
|
high_res_image.save(image_file)
|
228
|
-
|
229
|
-
|
233
|
+
doc_orientation = 0
|
234
|
+
try:
|
235
|
+
df_osd = self._perform_osd(fname)
|
236
|
+
doc_orientation = _parse_orientation(df_osd)
|
237
|
+
except subprocess.CalledProcessError as exc:
|
238
|
+
_log.error(
|
239
|
+
"OSD failed (doc %s, page: %s, "
|
240
|
+
"OCR rectangle: %s, processed image file %s):\n %s",
|
241
|
+
conv_res.input.file,
|
242
|
+
page_i,
|
243
|
+
ocr_rect_i,
|
244
|
+
image_file,
|
245
|
+
exc.stderr,
|
246
|
+
)
|
247
|
+
# Skipping if OSD fail when in auto mode, otherwise proceed
|
248
|
+
# to OCR in the hope OCR will succeed while OSD failed
|
249
|
+
if self._is_auto:
|
250
|
+
continue
|
251
|
+
if doc_orientation != 0:
|
252
|
+
high_res_image = high_res_image.rotate(
|
253
|
+
-doc_orientation, expand=True
|
254
|
+
)
|
255
|
+
high_res_image.save(fname)
|
256
|
+
try:
|
257
|
+
df_result = self._run_tesseract(fname, df_osd)
|
258
|
+
except subprocess.CalledProcessError as exc:
|
259
|
+
_log.error(
|
260
|
+
"tesseract OCR failed (doc %s, page: %s, "
|
261
|
+
"OCR rectangle: %s, processed image file %s):\n %s",
|
262
|
+
conv_res.input.file,
|
263
|
+
page_i,
|
264
|
+
ocr_rect_i,
|
265
|
+
image_file,
|
266
|
+
exc.stderr,
|
267
|
+
)
|
268
|
+
continue
|
230
269
|
finally:
|
231
270
|
if os.path.exists(fname):
|
232
271
|
os.remove(fname)
|
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
238
277
|
text = row["text"]
|
239
278
|
conf = row["conf"]
|
240
279
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
280
|
+
left, top = float(row["left"]), float(row["top"])
|
281
|
+
right = left + float(row["width"])
|
282
|
+
bottom = top + row["height"]
|
283
|
+
bbox = BoundingBox(
|
284
|
+
l=left,
|
285
|
+
t=top,
|
286
|
+
r=right,
|
287
|
+
b=bottom,
|
288
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
289
|
+
)
|
290
|
+
rect = tesseract_box_to_bounding_rectangle(
|
291
|
+
bbox,
|
292
|
+
original_offset=ocr_rect,
|
293
|
+
scale=self.scale,
|
294
|
+
orientation=doc_orientation,
|
295
|
+
im_size=high_res_image.size,
|
296
|
+
)
|
249
297
|
cell = TextCell(
|
250
298
|
index=ix,
|
251
299
|
text=str(text),
|
252
300
|
orig=str(text),
|
253
301
|
from_ocr=True,
|
254
302
|
confidence=conf / 100.0,
|
255
|
-
rect=
|
256
|
-
BoundingBox.from_tuple(
|
257
|
-
coord=(
|
258
|
-
(l / self.scale) + ocr_rect.l,
|
259
|
-
(b / self.scale) + ocr_rect.t,
|
260
|
-
(r / self.scale) + ocr_rect.l,
|
261
|
-
(t / self.scale) + ocr_rect.t,
|
262
|
-
),
|
263
|
-
origin=CoordOrigin.TOPLEFT,
|
264
|
-
)
|
265
|
-
),
|
303
|
+
rect=rect,
|
266
304
|
)
|
267
305
|
all_ocr_cells.append(cell)
|
268
306
|
|
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
278
316
|
@classmethod
|
279
317
|
def get_options_type(cls) -> Type[OcrOptions]:
|
280
318
|
return TesseractCliOcrOptions
|
319
|
+
|
320
|
+
|
321
|
+
def _parse_orientation(df_osd: pd.DataFrame) -> int:
|
322
|
+
orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
|
323
|
+
orientation = parse_tesseract_orientation(orientations[0].strip())
|
324
|
+
return orientation
|
@@ -1,12 +1,11 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import logging
|
4
|
-
from collections.abc import Iterable
|
5
4
|
from pathlib import Path
|
6
|
-
from typing import Optional, Type
|
5
|
+
from typing import Iterable, Optional, Type
|
7
6
|
|
8
7
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
-
from docling_core.types.doc.page import
|
8
|
+
from docling_core.types.doc.page import TextCell
|
10
9
|
|
11
10
|
from docling.datamodel.base_models import Page
|
12
11
|
from docling.datamodel.document import ConversionResult
|
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
|
|
17
16
|
)
|
18
17
|
from docling.datamodel.settings import settings
|
19
18
|
from docling.models.base_ocr_model import BaseOcrModel
|
20
|
-
from docling.utils.ocr_utils import
|
19
|
+
from docling.utils.ocr_utils import (
|
20
|
+
map_tesseract_script,
|
21
|
+
parse_tesseract_orientation,
|
22
|
+
tesseract_box_to_bounding_rectangle,
|
23
|
+
)
|
21
24
|
from docling.utils.profiling import TimeRecorder
|
22
25
|
|
23
26
|
_log = logging.getLogger(__name__)
|
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
38
41
|
accelerator_options=accelerator_options,
|
39
42
|
)
|
40
43
|
self.options: TesseractOcrOptions
|
41
|
-
|
44
|
+
self._is_auto: bool = "auto" in self.options.lang
|
42
45
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
43
46
|
self.reader = None
|
44
47
|
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
|
|
95
98
|
|
96
99
|
if lang == "auto":
|
97
100
|
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
|
98
|
-
self.osd_reader = tesserocr.PyTessBaseAPI(
|
99
|
-
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
100
|
-
)
|
101
101
|
else:
|
102
102
|
self.reader = tesserocr.PyTessBaseAPI(
|
103
103
|
**{"lang": lang} | tesserocr_kwargs,
|
104
104
|
)
|
105
|
+
self.osd_reader = tesserocr.PyTessBaseAPI(
|
106
|
+
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
|
107
|
+
)
|
105
108
|
self.reader_RIL = tesserocr.RIL
|
106
109
|
|
107
110
|
def __del__(self):
|
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
|
|
118
121
|
yield from page_batch
|
119
122
|
return
|
120
123
|
|
121
|
-
for page in page_batch:
|
124
|
+
for page_i, page in enumerate(page_batch):
|
122
125
|
assert page._backend is not None
|
123
126
|
if not page._backend.is_valid():
|
124
127
|
yield page
|
125
128
|
else:
|
126
129
|
with TimeRecorder(conv_res, "ocr"):
|
127
130
|
assert self.reader is not None
|
131
|
+
assert self.osd_reader is not None
|
128
132
|
assert self._tesserocr_languages is not None
|
129
133
|
|
130
134
|
ocr_rects = self.get_ocr_rects(page)
|
131
135
|
|
132
136
|
all_ocr_cells = []
|
133
|
-
for ocr_rect in ocr_rects:
|
137
|
+
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
|
134
138
|
# Skip zero area boxes
|
135
139
|
if ocr_rect.area() == 0:
|
136
140
|
continue
|
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
|
|
139
143
|
)
|
140
144
|
|
141
145
|
local_reader = self.reader
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
146
|
+
self.osd_reader.SetImage(high_res_image)
|
147
|
+
osd = self.osd_reader.DetectOrientationScript()
|
148
|
+
# No text, or Orientation and Script detection failure
|
149
|
+
if osd is None:
|
150
|
+
_log.error(
|
151
|
+
"OSD failed for doc (doc %s, page: %s, "
|
152
|
+
"OCR rectangle: %s)",
|
153
|
+
conv_res.input.file,
|
154
|
+
page_i,
|
155
|
+
ocr_rect_i,
|
156
|
+
)
|
157
|
+
# Skipping if OSD fail when in auto mode, otherwise proceed
|
158
|
+
# to OCR in the hope OCR will succeed while OSD failed
|
159
|
+
if self._is_auto:
|
150
160
|
continue
|
151
|
-
|
161
|
+
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
|
162
|
+
if doc_orientation != 0:
|
163
|
+
high_res_image = high_res_image.rotate(
|
164
|
+
-doc_orientation, expand=True
|
165
|
+
)
|
166
|
+
if self._is_auto:
|
152
167
|
script = osd["script_name"]
|
153
168
|
script = map_tesseract_script(script)
|
154
169
|
lang = f"{self.script_prefix}{script}"
|
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
|
|
188
203
|
# Extract text within the bounding box
|
189
204
|
text = local_reader.GetUTF8Text().strip()
|
190
205
|
confidence = local_reader.MeanTextConf()
|
191
|
-
left = box["x"]
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
206
|
+
left, top = box["x"], box["y"]
|
207
|
+
right = left + box["w"]
|
208
|
+
bottom = top + box["h"]
|
209
|
+
bbox = BoundingBox(
|
210
|
+
l=left,
|
211
|
+
t=top,
|
212
|
+
r=right,
|
213
|
+
b=bottom,
|
214
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
215
|
+
)
|
216
|
+
rect = tesseract_box_to_bounding_rectangle(
|
217
|
+
bbox,
|
218
|
+
original_offset=ocr_rect,
|
219
|
+
scale=self.scale,
|
220
|
+
orientation=doc_orientation,
|
221
|
+
im_size=high_res_image.size,
|
222
|
+
)
|
196
223
|
cells.append(
|
197
224
|
TextCell(
|
198
225
|
index=ix,
|
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
200
227
|
orig=text,
|
201
228
|
from_ocr=True,
|
202
229
|
confidence=confidence,
|
203
|
-
rect=
|
204
|
-
BoundingBox.from_tuple(
|
205
|
-
coord=(left, top, right, bottom),
|
206
|
-
origin=CoordOrigin.TOPLEFT,
|
207
|
-
),
|
208
|
-
),
|
230
|
+
rect=rect,
|
209
231
|
)
|
210
232
|
)
|
211
233
|
|
@@ -3,11 +3,12 @@ import warnings
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import Optional, cast
|
5
5
|
|
6
|
+
import numpy as np
|
6
7
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
7
8
|
|
8
9
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
9
10
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
10
|
-
from docling.datamodel.base_models import AssembledUnit, Page
|
11
|
+
from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
|
11
12
|
from docling.datamodel.document import ConversionResult
|
12
13
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
13
14
|
from docling.datamodel.settings import settings
|
@@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
60
61
|
or self.pipeline_options.generate_table_images
|
61
62
|
)
|
62
63
|
|
63
|
-
self.
|
64
|
+
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
64
65
|
|
65
66
|
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
66
67
|
|
@@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
197
198
|
elements=all_elements, headers=all_headers, body=all_body
|
198
199
|
)
|
199
200
|
|
200
|
-
conv_res.document = self.
|
201
|
+
conv_res.document = self.reading_order_model(conv_res)
|
201
202
|
|
202
203
|
# Generate page images in the output
|
203
204
|
if self.pipeline_options.generate_page_images:
|
@@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
244
245
|
cropped_im, dpi=int(72 * scale)
|
245
246
|
)
|
246
247
|
|
248
|
+
# Aggregate confidence values for document:
|
249
|
+
if len(conv_res.pages) > 0:
|
250
|
+
conv_res.confidence.layout_score = float(
|
251
|
+
np.nanmean(
|
252
|
+
[c.layout_score for c in conv_res.confidence.pages.values()]
|
253
|
+
)
|
254
|
+
)
|
255
|
+
conv_res.confidence.parse_score = float(
|
256
|
+
np.nanquantile(
|
257
|
+
[c.parse_score for c in conv_res.confidence.pages.values()],
|
258
|
+
q=0.1, # parse score should relate to worst 10% of pages.
|
259
|
+
)
|
260
|
+
)
|
261
|
+
conv_res.confidence.table_score = float(
|
262
|
+
np.nanmean(
|
263
|
+
[c.table_score for c in conv_res.confidence.pages.values()]
|
264
|
+
)
|
265
|
+
)
|
266
|
+
conv_res.confidence.ocr_score = float(
|
267
|
+
np.nanmean(
|
268
|
+
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
269
|
+
)
|
270
|
+
)
|
271
|
+
|
247
272
|
return conv_res
|
248
273
|
|
249
274
|
@classmethod
|
@@ -90,17 +90,12 @@ class SpatialClusterIndex:
|
|
90
90
|
containment_threshold: float,
|
91
91
|
) -> bool:
|
92
92
|
"""Check if two bboxes overlap sufficiently."""
|
93
|
-
|
94
|
-
if area1 <= 0 or area2 <= 0:
|
93
|
+
if bbox1.area() <= 0 or bbox2.area() <= 0:
|
95
94
|
return False
|
96
95
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
iou = overlap_area / (area1 + area2 - overlap_area)
|
102
|
-
containment1 = overlap_area / area1
|
103
|
-
containment2 = overlap_area / area2
|
96
|
+
iou = bbox1.intersection_over_union(bbox2)
|
97
|
+
containment1 = bbox1.intersection_over_self(bbox2)
|
98
|
+
containment2 = bbox2.intersection_over_self(bbox1)
|
104
99
|
|
105
100
|
return (
|
106
101
|
iou > overlap_threshold
|
@@ -321,11 +316,9 @@ class LayoutPostprocessor:
|
|
321
316
|
for special in special_clusters:
|
322
317
|
contained = []
|
323
318
|
for cluster in self.regular_clusters:
|
324
|
-
|
325
|
-
if
|
326
|
-
|
327
|
-
if containment > 0.8:
|
328
|
-
contained.append(cluster)
|
319
|
+
containment = cluster.bbox.intersection_over_self(special.bbox)
|
320
|
+
if containment > 0.8:
|
321
|
+
contained.append(cluster)
|
329
322
|
|
330
323
|
if contained:
|
331
324
|
# Sort contained clusters by minimum cell ID:
|
@@ -379,9 +372,7 @@ class LayoutPostprocessor:
|
|
379
372
|
for regular in self.regular_clusters:
|
380
373
|
if regular.label == DocItemLabel.TABLE:
|
381
374
|
# Calculate overlap
|
382
|
-
|
383
|
-
wrapper_area = wrapper.bbox.area()
|
384
|
-
overlap_ratio = overlap / wrapper_area
|
375
|
+
overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
|
385
376
|
|
386
377
|
conf_diff = wrapper.confidence - regular.confidence
|
387
378
|
|
@@ -421,8 +412,7 @@ class LayoutPostprocessor:
|
|
421
412
|
# Rule 2: CODE vs others
|
422
413
|
if candidate.label == DocItemLabel.CODE:
|
423
414
|
# Calculate how much of the other cluster is contained within the CODE cluster
|
424
|
-
|
425
|
-
containment = overlap / other.bbox.area()
|
415
|
+
containment = other.bbox.intersection_over_self(candidate.bbox)
|
426
416
|
if containment > 0.8: # other is 80% contained within CODE
|
427
417
|
return True
|
428
418
|
|
@@ -586,11 +576,9 @@ class LayoutPostprocessor:
|
|
586
576
|
if cell.rect.to_bounding_box().area() <= 0:
|
587
577
|
continue
|
588
578
|
|
589
|
-
|
579
|
+
overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
|
590
580
|
cluster.bbox
|
591
581
|
)
|
592
|
-
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
|
593
|
-
|
594
582
|
if overlap_ratio > best_overlap:
|
595
583
|
best_overlap = overlap_ratio
|
596
584
|
best_cluster = cluster
|
@@ -0,0 +1,69 @@
|
|
1
|
+
from typing import Optional, Tuple
|
2
|
+
|
3
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
4
|
+
from docling_core.types.doc.page import BoundingRectangle
|
5
|
+
|
6
|
+
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
|
7
|
+
|
8
|
+
|
9
|
+
def map_tesseract_script(script: str) -> str:
    """Rename selected script names, leaving every other value untouched.

    ``Katakana`` and ``Hiragana`` both map to ``Japanese``, ``Han`` maps to
    ``HanS`` and ``Korean`` maps to ``Hangul``.

    Args:
        script: script name to translate.

    Returns:
        The replacement name when one is defined, otherwise ``script`` itself.
    """
    replacements = {
        "Katakana": "Japanese",
        "Hiragana": "Japanese",
        "Han": "HanS",
        "Korean": "Hangul",
    }
    return replacements.get(script, script)
|
18
|
+
|
19
|
+
|
20
|
+
def parse_tesseract_orientation(orientation: str) -> int:
    """Turn a Tesseract orientation string into a bounding-rectangle angle.

    Tesseract orientation is one of [0, 90, 180, 270] measured clockwise,
    whereas bounding rectangle angles lie in [0, 360[ and are measured
    counterclockwise. The parsed value is therefore negated and wrapped
    modulo 360.

    Args:
        orientation: orientation value as text, parsed with ``int``.

    Returns:
        The negated orientation reduced modulo 360.

    Raises:
        ValueError: if the parsed integer is not one of
            ``CLIPPED_ORIENTATIONS`` (``int`` itself also raises
            ``ValueError`` for text it cannot parse).
    """
    clockwise = int(orientation)
    if clockwise in CLIPPED_ORIENTATIONS:
        # Flip the direction of rotation, then bring the result back to [0, 360[.
        return (-clockwise) % 360
    msg = (
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
    raise ValueError(msg)
|
33
|
+
|
34
|
+
|
35
|
+
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Convert a box into a rotated, rescaled and optionally shifted rectangle.

    The box is first passed through ``rotate_bounding_box`` with the negated
    ``orientation``, every corner coordinate is then divided by ``scale``,
    and finally, when ``original_offset`` is given, its ``l`` / ``t`` values
    are added to the x / y coordinates of every corner.

    Args:
        bbox: box to convert (the original note describes it as being in
            top-left coordinates).
        original_offset: optional box whose left/top edge is added back to
            the result; it must use the ``TOPLEFT`` coordinate origin.
        scale: divisor applied to every corner coordinate.
        orientation: angle whose negation is forwarded to
            ``rotate_bounding_box``.
        im_size: image size forwarded unchanged to ``rotate_bounding_box``.

    Returns:
        A ``BoundingRectangle`` with ``TOPLEFT`` coordinate origin.

    Raises:
        ValueError: if ``original_offset`` is given with a coordinate origin
            other than ``TOPLEFT``.
    """
    x_fields = ("r_x0", "r_x1", "r_x2", "r_x3")
    y_fields = ("r_y0", "r_y1", "r_y2", "r_y3")

    rotated = rotate_bounding_box(bbox, angle=-orientation, im_size=im_size)
    result = BoundingRectangle(
        **{name: getattr(rotated, name) / scale for name in x_fields + y_fields},
        coord_origin=CoordOrigin.TOPLEFT,
    )

    if original_offset is None:
        return result

    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        raise ValueError(msg)

    # Shift every corner by the left/top edge of the offset box.
    for name in x_fields:
        setattr(result, name, getattr(result, name) + original_offset.l)
    for name in y_fields:
        setattr(result, name, getattr(result, name) + original_offset.t)
    return result
|
@@ -0,0 +1,71 @@
|
|
1
|
+
from typing import Tuple
|
2
|
+
|
3
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
4
|
+
from docling_core.types.doc.page import BoundingRectangle
|
5
|
+
|
6
|
+
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
7
|
+
|
8
|
+
|
9
|
+
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Express an axis-aligned box as a rectangle after a right-angle rotation.

    The box is converted to top-left origin, then each of its corners is
    mapped according to ``angle``. Corner ``r_0`` is the image of the box's
    bottom-left corner ``(left, top + height)``; ``r_1``, ``r_2`` and ``r_3``
    are the images of the bottom-right, top-right and top-left corners.

    With ``X, Y = im_size`` the point maps applied are:

    * ``0``:   ``(x, y) -> (x, y)``
    * ``90``:  ``(x, y) -> (Y - y, x)``
    * ``180``: ``(x, y) -> (X - x, Y - y)``
    * ``270``: ``(x, y) -> (y, X - x)``

    Args:
        bbox: box to rotate.
        angle: rotation in degrees; reduced modulo 360 before use.
        im_size: two extents; ``im_size[0]`` is used against horizontal
            coordinates and ``im_size[1]`` against vertical ones.
            NOTE(review): this matches a ``(width, height)`` tuple such as
            an image ``.size`` -- confirm against callers.

    Returns:
        A ``BoundingRectangle`` with ``TOPLEFT`` coordinate origin.

    Raises:
        ValueError: if ``angle % 360`` is not one of ``CLIPPED_ORIENTATIONS``.
    """
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
    # im_size[0] bounds horizontal coordinates, im_size[1] vertical ones.
    extent_x, extent_y = im_size
    angle = angle % 360
    if angle == 0:
        r_x0 = left
        r_y0 = top + height
        r_x1 = r_x0 + width
        r_y1 = r_y0
        r_x2 = r_x0 + width
        r_y2 = r_y0 - height
        r_x3 = r_x0
        r_y3 = r_y0 - height
    elif angle == 90:
        r_x0 = extent_y - (top + height)
        r_y0 = left
        r_x1 = r_x0
        r_y1 = r_y0 + width
        r_x2 = r_x0 + height
        r_y2 = r_y0 + width
        # Corner 3 is the image of the top-left corner: (Y - top, left).
        # It previously duplicated corner 1, collapsing the rectangle.
        r_x3 = r_x0 + height
        r_y3 = r_y0
    elif angle == 180:
        r_x0 = extent_x - left
        r_y0 = extent_y - (top + height)
        r_x1 = r_x0 - width
        r_y1 = r_y0
        r_x2 = r_x0 - width
        r_y2 = r_y0 + height
        r_x3 = r_x0
        r_y3 = r_y0 + height
    elif angle == 270:
        r_x0 = top + height
        r_y0 = extent_x - left
        r_x1 = r_x0
        r_y1 = r_y0 - width
        r_x2 = r_x0 - height
        r_y2 = r_y0 - width
        r_x3 = r_x0 - height
        r_y3 = r_y0
    else:
        msg = (
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )
        raise ValueError(msg)
    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.33.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.34.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = [
|
6
6
|
"Christoph Auer <cau@zurich.ibm.com>",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|