docling 2.33.0__tar.gz → 2.35.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.33.0 → docling-2.35.0}/PKG-INFO +2 -2
- {docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_backend.py +1 -1
- {docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_v2_backend.py +1 -1
- {docling-2.33.0 → docling-2.35.0}/docling/backend/docling_parse_v4_backend.py +1 -1
- {docling-2.33.0 → docling-2.35.0}/docling/cli/main.py +36 -3
- {docling-2.33.0 → docling-2.35.0}/docling/datamodel/base_models.py +99 -2
- {docling-2.33.0 → docling-2.35.0}/docling/datamodel/document.py +10 -3
- {docling-2.33.0 → docling-2.35.0}/docling/models/layout_model.py +19 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/page_assemble_model.py +1 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/page_preprocessing_model.py +54 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/tesseract_ocr_cli_model.py +85 -41
- {docling-2.33.0 → docling-2.35.0}/docling/models/tesseract_ocr_model.py +52 -30
- {docling-2.33.0 → docling-2.35.0}/docling/pipeline/standard_pdf_pipeline.py +75 -38
- {docling-2.33.0 → docling-2.35.0}/docling/utils/layout_postprocessor.py +10 -22
- docling-2.35.0/docling/utils/ocr_utils.py +69 -0
- docling-2.35.0/docling/utils/orientation.py +71 -0
- {docling-2.33.0 → docling-2.35.0}/pyproject.toml +2 -2
- docling-2.33.0/docling/utils/ocr_utils.py +0 -9
- {docling-2.33.0 → docling-2.35.0}/LICENSE +0 -0
- {docling-2.33.0 → docling-2.35.0}/README.md +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/html_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/md_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/chunking/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/cli/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/cli/models.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/cli/tools.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/datamodel/settings.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/document_converter.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/exceptions.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/base_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/hf_mlx_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/hf_vlm_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/pipeline/vlm_pipeline.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/py.typed +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/__init__.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/export.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/locks.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/profiling.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/utils.py +0 -0
- {docling-2.33.0 → docling-2.35.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.33.0
|
3
|
+
Version: 2.35.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
|
|
29
29
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
30
30
|
Requires-Dist: certifi (>=2024.7.4)
|
31
31
|
Requires-Dist: click (<8.2.0)
|
32
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
32
|
+
Requires-Dist: docling-core[chunking] (>=2.31.2,<3.0.0)
|
33
33
|
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
34
34
|
Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
|
35
35
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
60
60
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
61
61
|
).to_top_left_origin(page_height=page_size.height * scale)
|
62
62
|
|
63
|
-
overlap_frac = cell_bbox.
|
63
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
64
64
|
|
65
65
|
if overlap_frac > 0.5:
|
66
66
|
if len(text_piece) > 0:
|
@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
71
71
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
72
72
|
).to_top_left_origin(page_height=page_size.height * scale)
|
73
73
|
|
74
|
-
overlap_frac = cell_bbox.
|
74
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
75
75
|
|
76
76
|
if overlap_frac > 0.5:
|
77
77
|
if len(text_piece) > 0:
|
@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
46
46
|
.scaled(scale)
|
47
47
|
)
|
48
48
|
|
49
|
-
overlap_frac = cell_bbox.
|
49
|
+
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
50
50
|
|
51
51
|
if overlap_frac > 0.5:
|
52
52
|
if len(text_piece) > 0:
|
@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
|
|
12
12
|
|
13
13
|
import rich.table
|
14
14
|
import typer
|
15
|
+
from docling_core.transforms.serializer.html import (
|
16
|
+
HTMLDocSerializer,
|
17
|
+
HTMLOutputStyle,
|
18
|
+
HTMLParams,
|
19
|
+
)
|
20
|
+
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
15
21
|
from docling_core.types.doc import ImageRefMode
|
16
22
|
from docling_core.utils.file import resolve_source_to_path
|
17
23
|
from pydantic import TypeAdapter
|
@@ -156,6 +162,7 @@ def export_documents(
|
|
156
162
|
export_json: bool,
|
157
163
|
export_html: bool,
|
158
164
|
export_html_split_page: bool,
|
165
|
+
show_layout: bool,
|
159
166
|
export_md: bool,
|
160
167
|
export_txt: bool,
|
161
168
|
export_doctags: bool,
|
@@ -189,9 +196,27 @@ def export_documents(
|
|
189
196
|
if export_html_split_page:
|
190
197
|
fname = output_dir / f"{doc_filename}.html"
|
191
198
|
_log.info(f"writing HTML output to {fname}")
|
192
|
-
|
193
|
-
|
194
|
-
|
199
|
+
if show_layout:
|
200
|
+
ser = HTMLDocSerializer(
|
201
|
+
doc=conv_res.document,
|
202
|
+
params=HTMLParams(
|
203
|
+
image_mode=image_export_mode,
|
204
|
+
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
205
|
+
),
|
206
|
+
)
|
207
|
+
visualizer = LayoutVisualizer()
|
208
|
+
visualizer.params.show_label = False
|
209
|
+
ser_res = ser.serialize(
|
210
|
+
visualizer=visualizer,
|
211
|
+
)
|
212
|
+
with open(fname, "w") as fw:
|
213
|
+
fw.write(ser_res.text)
|
214
|
+
else:
|
215
|
+
conv_res.document.save_as_html(
|
216
|
+
filename=fname,
|
217
|
+
image_mode=image_export_mode,
|
218
|
+
split_page_view=True,
|
219
|
+
)
|
195
220
|
|
196
221
|
# Export Text format:
|
197
222
|
if export_txt:
|
@@ -250,6 +275,13 @@ def convert( # noqa: C901
|
|
250
275
|
to_formats: List[OutputFormat] = typer.Option(
|
251
276
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
252
277
|
),
|
278
|
+
show_layout: Annotated[
|
279
|
+
bool,
|
280
|
+
typer.Option(
|
281
|
+
...,
|
282
|
+
help="If enabled, the page images will show the bounding-boxes of the items.",
|
283
|
+
),
|
284
|
+
] = False,
|
253
285
|
headers: str = typer.Option(
|
254
286
|
None,
|
255
287
|
"--headers",
|
@@ -596,6 +628,7 @@ def convert( # noqa: C901
|
|
596
628
|
export_json=export_json,
|
597
629
|
export_html=export_html,
|
598
630
|
export_html_split_page=export_html_split_page,
|
631
|
+
show_layout=show_layout,
|
599
632
|
export_md=export_md,
|
600
633
|
export_txt=export_txt,
|
601
634
|
export_doctags=export_doctags,
|
@@ -1,6 +1,9 @@
|
|
1
|
+
import math
|
2
|
+
from collections import defaultdict
|
1
3
|
from enum import Enum
|
2
|
-
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
|
+
from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
|
3
5
|
|
6
|
+
import numpy as np
|
4
7
|
from docling_core.types.doc import (
|
5
8
|
BoundingBox,
|
6
9
|
DocItemLabel,
|
@@ -16,7 +19,7 @@ from docling_core.types.io import (
|
|
16
19
|
DocumentStream,
|
17
20
|
)
|
18
21
|
from PIL.Image import Image
|
19
|
-
from pydantic import BaseModel, ConfigDict
|
22
|
+
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
20
23
|
|
21
24
|
if TYPE_CHECKING:
|
22
25
|
from docling.backend.pdf_backend import PdfPageBackend
|
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
|
|
298
301
|
choices: List[OpenAiResponseChoice]
|
299
302
|
created: int
|
300
303
|
usage: OpenAiResponseUsage
|
304
|
+
|
305
|
+
|
306
|
+
# Create a type alias for score values
|
307
|
+
ScoreValue = float
|
308
|
+
|
309
|
+
|
310
|
+
class QualityGrade(str, Enum):
|
311
|
+
POOR = "poor"
|
312
|
+
FAIR = "fair"
|
313
|
+
GOOD = "good"
|
314
|
+
EXCELLENT = "excellent"
|
315
|
+
UNSPECIFIED = "unspecified"
|
316
|
+
|
317
|
+
|
318
|
+
class PageConfidenceScores(BaseModel):
|
319
|
+
parse_score: ScoreValue = np.nan
|
320
|
+
layout_score: ScoreValue = np.nan
|
321
|
+
table_score: ScoreValue = np.nan
|
322
|
+
ocr_score: ScoreValue = np.nan
|
323
|
+
|
324
|
+
def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
|
325
|
+
if score < 0.5:
|
326
|
+
return QualityGrade.POOR
|
327
|
+
elif score < 0.8:
|
328
|
+
return QualityGrade.FAIR
|
329
|
+
elif score < 0.9:
|
330
|
+
return QualityGrade.GOOD
|
331
|
+
elif score >= 0.9:
|
332
|
+
return QualityGrade.EXCELLENT
|
333
|
+
|
334
|
+
return QualityGrade.UNSPECIFIED
|
335
|
+
|
336
|
+
@computed_field # type: ignore
|
337
|
+
@property
|
338
|
+
def mean_grade(self) -> QualityGrade:
|
339
|
+
return self._score_to_grade(self.mean_score)
|
340
|
+
|
341
|
+
@computed_field # type: ignore
|
342
|
+
@property
|
343
|
+
def low_grade(self) -> QualityGrade:
|
344
|
+
return self._score_to_grade(self.low_score)
|
345
|
+
|
346
|
+
@computed_field # type: ignore
|
347
|
+
@property
|
348
|
+
def mean_score(self) -> ScoreValue:
|
349
|
+
return ScoreValue(
|
350
|
+
np.nanmean(
|
351
|
+
[
|
352
|
+
self.ocr_score,
|
353
|
+
self.table_score,
|
354
|
+
self.layout_score,
|
355
|
+
self.parse_score,
|
356
|
+
]
|
357
|
+
)
|
358
|
+
)
|
359
|
+
|
360
|
+
@computed_field # type: ignore
|
361
|
+
@property
|
362
|
+
def low_score(self) -> ScoreValue:
|
363
|
+
return ScoreValue(
|
364
|
+
np.nanquantile(
|
365
|
+
[
|
366
|
+
self.ocr_score,
|
367
|
+
self.table_score,
|
368
|
+
self.layout_score,
|
369
|
+
self.parse_score,
|
370
|
+
],
|
371
|
+
q=0.05,
|
372
|
+
)
|
373
|
+
)
|
374
|
+
|
375
|
+
|
376
|
+
class ConfidenceReport(PageConfidenceScores):
|
377
|
+
pages: Dict[int, PageConfidenceScores] = Field(
|
378
|
+
default_factory=lambda: defaultdict(PageConfidenceScores)
|
379
|
+
)
|
380
|
+
|
381
|
+
@computed_field # type: ignore
|
382
|
+
@property
|
383
|
+
def mean_score(self) -> ScoreValue:
|
384
|
+
return ScoreValue(
|
385
|
+
np.nanmean(
|
386
|
+
[c.mean_score for c in self.pages.values()],
|
387
|
+
)
|
388
|
+
)
|
389
|
+
|
390
|
+
@computed_field # type: ignore
|
391
|
+
@property
|
392
|
+
def low_score(self) -> ScoreValue:
|
393
|
+
return ScoreValue(
|
394
|
+
np.nanmean(
|
395
|
+
[c.low_score for c in self.pages.values()],
|
396
|
+
)
|
397
|
+
)
|
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
|
|
47
47
|
)
|
48
48
|
from docling_core.utils.file import resolve_source_to_stream
|
49
49
|
from docling_core.utils.legacy import docling_document_to_legacy
|
50
|
-
from pydantic import BaseModel
|
50
|
+
from pydantic import BaseModel, Field
|
51
51
|
from typing_extensions import deprecated
|
52
52
|
|
53
53
|
from docling.backend.abstract_backend import (
|
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
|
|
56
56
|
)
|
57
57
|
from docling.datamodel.base_models import (
|
58
58
|
AssembledUnit,
|
59
|
+
ConfidenceReport,
|
59
60
|
ConversionStatus,
|
60
61
|
DocumentStream,
|
61
62
|
ErrorItem,
|
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
|
|
201
202
|
pages: List[Page] = []
|
202
203
|
assembled: AssembledUnit = AssembledUnit()
|
203
204
|
timings: Dict[str, ProfilingItem] = {}
|
205
|
+
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
204
206
|
|
205
207
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
206
208
|
|
@@ -332,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
|
|
332
334
|
) -> Optional[InputFormat]:
|
333
335
|
"""Guess the input format of a document by checking part of its content."""
|
334
336
|
input_format: Optional[InputFormat] = None
|
335
|
-
content_str = content.decode("utf-8")
|
336
337
|
|
337
338
|
if mime == "application/xml":
|
339
|
+
content_str = content.decode("utf-8")
|
338
340
|
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
339
341
|
if match_doctype:
|
340
342
|
xml_doctype = match_doctype.group()
|
@@ -356,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
|
|
356
358
|
input_format = InputFormat.XML_JATS
|
357
359
|
|
358
360
|
elif mime == "text/plain":
|
361
|
+
content_str = content.decode("utf-8")
|
359
362
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
360
363
|
input_format = InputFormat.XML_USPTO
|
361
364
|
|
@@ -409,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
|
|
409
412
|
else:
|
410
413
|
return "application/xml"
|
411
414
|
|
412
|
-
if re.match(
|
415
|
+
if re.match(
|
416
|
+
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
|
417
|
+
content_str,
|
418
|
+
re.DOTALL,
|
419
|
+
):
|
413
420
|
return "text/html"
|
414
421
|
|
415
422
|
p = re.compile(
|
@@ -5,6 +5,7 @@ from collections.abc import Iterable
|
|
5
5
|
from pathlib import Path
|
6
6
|
from typing import Optional
|
7
7
|
|
8
|
+
import numpy as np
|
8
9
|
from docling_core.types.doc import DocItemLabel
|
9
10
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
10
11
|
from PIL import Image
|
@@ -184,6 +185,24 @@ class LayoutModel(BasePageModel):
|
|
184
185
|
).postprocess()
|
185
186
|
# processed_clusters, processed_cells = clusters, page.cells
|
186
187
|
|
188
|
+
with warnings.catch_warnings():
|
189
|
+
warnings.filterwarnings(
|
190
|
+
"ignore",
|
191
|
+
"Mean of empty slice|invalid value encountered in scalar divide",
|
192
|
+
RuntimeWarning,
|
193
|
+
"numpy",
|
194
|
+
)
|
195
|
+
|
196
|
+
conv_res.confidence.pages[page.page_no].layout_score = float(
|
197
|
+
np.mean([c.confidence for c in processed_clusters])
|
198
|
+
)
|
199
|
+
|
200
|
+
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
201
|
+
np.mean(
|
202
|
+
[c.confidence for c in processed_cells if c.from_ocr]
|
203
|
+
)
|
204
|
+
)
|
205
|
+
|
187
206
|
page.cells = processed_cells
|
188
207
|
page.predictions.layout = LayoutPrediction(
|
189
208
|
clusters=processed_clusters
|
@@ -1,7 +1,10 @@
|
|
1
|
+
import re
|
2
|
+
import warnings
|
1
3
|
from collections.abc import Iterable
|
2
4
|
from pathlib import Path
|
3
5
|
from typing import Optional
|
4
6
|
|
7
|
+
import numpy as np
|
5
8
|
from PIL import ImageDraw
|
6
9
|
from pydantic import BaseModel
|
7
10
|
|
@@ -21,6 +24,14 @@ class PagePreprocessingModel(BasePageModel):
|
|
21
24
|
def __init__(self, options: PagePreprocessingOptions):
|
22
25
|
self.options = options
|
23
26
|
|
27
|
+
# Pre-compiled regex patterns for efficiency
|
28
|
+
self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
|
29
|
+
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
|
30
|
+
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
|
31
|
+
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
|
32
|
+
r"(?:/\w+\s*){2,}"
|
33
|
+
) # Two or more "/token " sequences
|
34
|
+
|
24
35
|
def __call__(
|
25
36
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
26
37
|
) -> Iterable[Page]:
|
@@ -60,6 +71,22 @@ class PagePreprocessingModel(BasePageModel):
|
|
60
71
|
if self.options.create_parsed_page:
|
61
72
|
page.parsed_page = page._backend.get_segmented_page()
|
62
73
|
|
74
|
+
# Rate the text quality from the PDF parser, and aggregate on page
|
75
|
+
text_scores = []
|
76
|
+
for c in page.cells:
|
77
|
+
score = self.rate_text_quality(c.text)
|
78
|
+
text_scores.append(score)
|
79
|
+
|
80
|
+
with warnings.catch_warnings():
|
81
|
+
warnings.filterwarnings(
|
82
|
+
"ignore", "Mean of empty slice", RuntimeWarning, "numpy"
|
83
|
+
)
|
84
|
+
conv_res.confidence.pages[page.page_no].parse_score = float(
|
85
|
+
np.nanquantile(
|
86
|
+
text_scores, q=0.10
|
87
|
+
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
88
|
+
)
|
89
|
+
|
63
90
|
# DEBUG code:
|
64
91
|
def draw_text_boxes(image, cells, show: bool = False):
|
65
92
|
draw = ImageDraw.Draw(image)
|
@@ -88,3 +115,30 @@ class PagePreprocessingModel(BasePageModel):
|
|
88
115
|
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
89
116
|
|
90
117
|
return page
|
118
|
+
|
119
|
+
def rate_text_quality(self, text: str) -> float:
|
120
|
+
# Hard errors: if any of these patterns are found, return 0.0 immediately.
|
121
|
+
blacklist_chars = ["�"]
|
122
|
+
if (
|
123
|
+
any(text.find(c) >= 0 for c in blacklist_chars)
|
124
|
+
or self.GLYPH_RE.search(text)
|
125
|
+
or self.SLASH_G_RE.search(text)
|
126
|
+
or self.SLASH_NUMBER_GARBAGE_RE.match(
|
127
|
+
text
|
128
|
+
) # Check if text is mostly slash-number pattern
|
129
|
+
):
|
130
|
+
return 0.0
|
131
|
+
|
132
|
+
penalty = 0.0
|
133
|
+
|
134
|
+
# Apply a penalty only if the fragmented words pattern occurs at least three times.
|
135
|
+
frag_matches = self.FRAG_RE.findall(text)
|
136
|
+
if len(frag_matches) >= 3:
|
137
|
+
penalty += 0.1 * len(frag_matches)
|
138
|
+
|
139
|
+
# Additional heuristic: if the average token length is below 2, add a penalty.
|
140
|
+
# tokens = text.split()
|
141
|
+
# if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
|
142
|
+
# penalty += 0.2
|
143
|
+
|
144
|
+
return max(1.0 - penalty, 0.0)
|
@@ -2,6 +2,7 @@ import csv
|
|
2
2
|
import io
|
3
3
|
import logging
|
4
4
|
import os
|
5
|
+
import subprocess
|
5
6
|
import tempfile
|
6
7
|
from collections.abc import Iterable
|
7
8
|
from pathlib import Path
|
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
|
|
10
11
|
|
11
12
|
import pandas as pd
|
12
13
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
13
|
-
from docling_core.types.doc.page import
|
14
|
+
from docling_core.types.doc.page import TextCell
|
14
15
|
|
15
16
|
from docling.datamodel.base_models import Page
|
16
17
|
from docling.datamodel.document import ConversionResult
|
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
|
|
21
22
|
)
|
22
23
|
from docling.datamodel.settings import settings
|
23
24
|
from docling.models.base_ocr_model import BaseOcrModel
|
24
|
-
from docling.utils.ocr_utils import map_tesseract_script
|
25
|
+
from docling.utils.ocr_utils import (
|
26
|
+
map_tesseract_script,
|
27
|
+
parse_tesseract_orientation,
|
28
|
+
tesseract_box_to_bounding_rectangle,
|
29
|
+
)
|
25
30
|
from docling.utils.profiling import TimeRecorder
|
26
31
|
|
27
32
|
_log = logging.getLogger(__name__)
|
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
49
54
|
self._version: Optional[str] = None
|
50
55
|
self._tesseract_languages: Optional[List[str]] = None
|
51
56
|
self._script_prefix: Optional[str] = None
|
57
|
+
self._is_auto: bool = "auto" in self.options.lang
|
52
58
|
|
53
59
|
if self.enabled:
|
54
60
|
try:
|
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
93
99
|
|
94
100
|
return name, version
|
95
101
|
|
96
|
-
def _run_tesseract(self, ifilename: str):
|
102
|
+
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
|
97
103
|
r"""
|
98
104
|
Run tesseract CLI
|
99
105
|
"""
|
100
106
|
cmd = [self.options.tesseract_cmd]
|
101
|
-
|
102
|
-
|
103
|
-
lang = self._detect_language(ifilename)
|
107
|
+
if self._is_auto:
|
108
|
+
lang = self._parse_language(osd)
|
104
109
|
if lang is not None:
|
105
110
|
cmd.append("-l")
|
106
111
|
cmd.append(lang)
|
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
115
120
|
cmd += [ifilename, "stdout", "tsv"]
|
116
121
|
_log.info("command: {}".format(" ".join(cmd)))
|
117
122
|
|
118
|
-
|
119
|
-
output, _ = proc.communicate()
|
123
|
+
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
120
124
|
|
121
125
|
# _log.info(output)
|
122
126
|
|
123
127
|
# Decode the byte string to a regular string
|
124
|
-
decoded_data = output.decode("utf-8")
|
128
|
+
decoded_data = output.stdout.decode("utf-8")
|
125
129
|
# _log.info(decoded_data)
|
126
130
|
|
127
131
|
# Read the TSV file generated by Tesseract
|
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
139
143
|
|
140
144
|
return df_filtered
|
141
145
|
|
142
|
-
def
|
146
|
+
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
|
143
147
|
r"""
|
144
148
|
Run tesseract in PSM 0 mode to detect the language
|
145
149
|
"""
|
146
|
-
assert self._tesseract_languages is not None
|
147
150
|
|
148
151
|
cmd = [self.options.tesseract_cmd]
|
149
152
|
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
150
153
|
_log.info("command: {}".format(" ".join(cmd)))
|
151
|
-
|
152
|
-
|
153
|
-
decoded_data = output.decode("utf-8")
|
154
|
+
output = subprocess.run(cmd, capture_output=True, check=True)
|
155
|
+
decoded_data = output.stdout.decode("utf-8")
|
154
156
|
df_detected = pd.read_csv(
|
155
157
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
156
158
|
)
|
157
|
-
|
159
|
+
return df_detected
|
160
|
+
|
161
|
+
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
|
162
|
+
assert self._tesseract_languages is not None
|
163
|
+
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
|
158
164
|
if len(scripts) == 0:
|
159
165
|
_log.warning("Tesseract cannot detect the script of the page")
|
160
166
|
return None
|
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
182
188
|
cmd = [self.options.tesseract_cmd]
|
183
189
|
cmd.append("--list-langs")
|
184
190
|
_log.info("command: {}".format(" ".join(cmd)))
|
185
|
-
|
186
|
-
|
187
|
-
decoded_data = output.decode("utf-8")
|
191
|
+
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
192
|
+
decoded_data = output.stdout.decode("utf-8")
|
188
193
|
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
189
194
|
self._tesseract_languages = df_list[0].tolist()[1:]
|
190
195
|
|
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
203
208
|
yield from page_batch
|
204
209
|
return
|
205
210
|
|
206
|
-
for page in page_batch:
|
211
|
+
for page_i, page in enumerate(page_batch):
|
207
212
|
assert page._backend is not None
|
208
213
|
if not page._backend.is_valid():
|
209
214
|
yield page
|
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
212
217
|
ocr_rects = self.get_ocr_rects(page)
|
213
218
|
|
214
219
|
all_ocr_cells = []
|
215
|
-
for ocr_rect in ocr_rects:
|
220
|
+
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
|
216
221
|
# Skip zero area boxes
|
217
222
|
if ocr_rect.area() == 0:
|
218
223
|
continue
|
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
225
230
|
) as image_file:
|
226
231
|
fname = image_file.name
|
227
232
|
high_res_image.save(image_file)
|
228
|
-
|
229
|
-
|
233
|
+
doc_orientation = 0
|
234
|
+
try:
|
235
|
+
df_osd = self._perform_osd(fname)
|
236
|
+
doc_orientation = _parse_orientation(df_osd)
|
237
|
+
except subprocess.CalledProcessError as exc:
|
238
|
+
_log.error(
|
239
|
+
"OSD failed (doc %s, page: %s, "
|
240
|
+
"OCR rectangle: %s, processed image file %s):\n %s",
|
241
|
+
conv_res.input.file,
|
242
|
+
page_i,
|
243
|
+
ocr_rect_i,
|
244
|
+
image_file,
|
245
|
+
exc.stderr,
|
246
|
+
)
|
247
|
+
# Skipping if OSD fail when in auto mode, otherwise proceed
|
248
|
+
# to OCR in the hope OCR will succeed while OSD failed
|
249
|
+
if self._is_auto:
|
250
|
+
continue
|
251
|
+
if doc_orientation != 0:
|
252
|
+
high_res_image = high_res_image.rotate(
|
253
|
+
-doc_orientation, expand=True
|
254
|
+
)
|
255
|
+
high_res_image.save(fname)
|
256
|
+
try:
|
257
|
+
df_result = self._run_tesseract(fname, df_osd)
|
258
|
+
except subprocess.CalledProcessError as exc:
|
259
|
+
_log.error(
|
260
|
+
"tesseract OCR failed (doc %s, page: %s, "
|
261
|
+
"OCR rectangle: %s, processed image file %s):\n %s",
|
262
|
+
conv_res.input.file,
|
263
|
+
page_i,
|
264
|
+
ocr_rect_i,
|
265
|
+
image_file,
|
266
|
+
exc.stderr,
|
267
|
+
)
|
268
|
+
continue
|
230
269
|
finally:
|
231
270
|
if os.path.exists(fname):
|
232
271
|
os.remove(fname)
|
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
238
277
|
text = row["text"]
|
239
278
|
conf = row["conf"]
|
240
279
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
280
|
+
left, top = float(row["left"]), float(row["top"])
|
281
|
+
right = left + float(row["width"])
|
282
|
+
bottom = top + row["height"]
|
283
|
+
bbox = BoundingBox(
|
284
|
+
l=left,
|
285
|
+
t=top,
|
286
|
+
r=right,
|
287
|
+
b=bottom,
|
288
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
289
|
+
)
|
290
|
+
rect = tesseract_box_to_bounding_rectangle(
|
291
|
+
bbox,
|
292
|
+
original_offset=ocr_rect,
|
293
|
+
scale=self.scale,
|
294
|
+
orientation=doc_orientation,
|
295
|
+
im_size=high_res_image.size,
|
296
|
+
)
|
249
297
|
cell = TextCell(
|
250
298
|
index=ix,
|
251
299
|
text=str(text),
|
252
300
|
orig=str(text),
|
253
301
|
from_ocr=True,
|
254
302
|
confidence=conf / 100.0,
|
255
|
-
rect=
|
256
|
-
BoundingBox.from_tuple(
|
257
|
-
coord=(
|
258
|
-
(l / self.scale) + ocr_rect.l,
|
259
|
-
(b / self.scale) + ocr_rect.t,
|
260
|
-
(r / self.scale) + ocr_rect.l,
|
261
|
-
(t / self.scale) + ocr_rect.t,
|
262
|
-
),
|
263
|
-
origin=CoordOrigin.TOPLEFT,
|
264
|
-
)
|
265
|
-
),
|
303
|
+
rect=rect,
|
266
304
|
)
|
267
305
|
all_ocr_cells.append(cell)
|
268
306
|
|
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
278
316
|
@classmethod
|
279
317
|
def get_options_type(cls) -> Type[OcrOptions]:
|
280
318
|
return TesseractCliOcrOptions
|
319
|
+
|
320
|
+
|
321
|
+
def _parse_orientation(df_osd: pd.DataFrame) -> int:
|
322
|
+
orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
|
323
|
+
orientation = parse_tesseract_orientation(orientations[0].strip())
|
324
|
+
return orientation
|