docling 2.32.0__py3-none-any.whl → 2.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +1 -1
- docling/backend/docling_parse_v2_backend.py +1 -1
- docling/backend/docling_parse_v4_backend.py +1 -1
- docling/backend/msword_backend.py +269 -12
- docling/backend/pypdfium2_backend.py +6 -1
- docling/datamodel/base_models.py +99 -2
- docling/datamodel/document.py +11 -2
- docling/models/layout_model.py +9 -0
- docling/models/page_assemble_model.py +1 -0
- docling/models/page_preprocessing_model.py +50 -1
- docling/models/tesseract_ocr_cli_model.py +85 -41
- docling/models/tesseract_ocr_model.py +52 -30
- docling/pipeline/standard_pdf_pipeline.py +28 -3
- docling/pipeline/vlm_pipeline.py +19 -21
- docling/utils/layout_postprocessor.py +10 -22
- docling/utils/ocr_utils.py +60 -0
- docling/utils/orientation.py +71 -0
- {docling-2.32.0.dist-info → docling-2.34.0.dist-info}/METADATA +2 -2
- {docling-2.32.0.dist-info → docling-2.34.0.dist-info}/RECORD +22 -21
- {docling-2.32.0.dist-info → docling-2.34.0.dist-info}/LICENSE +0 -0
- {docling-2.32.0.dist-info → docling-2.34.0.dist-info}/WHEEL +0 -0
- {docling-2.32.0.dist-info → docling-2.34.0.dist-info}/entry_points.txt +0 -0
docling/backend/docling_parse_backend.py CHANGED
@@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 coord_origin=CoordOrigin.BOTTOMLEFT,
             ).to_top_left_origin(page_height=page_size.height * scale)

-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)

             if overlap_frac > 0.5:
                 if len(text_piece) > 0:
docling/backend/docling_parse_v2_backend.py CHANGED
@@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                 coord_origin=CoordOrigin.BOTTOMLEFT,
             ).to_top_left_origin(page_height=page_size.height * scale)

-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)

             if overlap_frac > 0.5:
                 if len(text_piece) > 0:
docling/backend/docling_parse_v4_backend.py CHANGED
@@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
                 .scaled(scale)
             )

-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)

             if overlap_frac > 0.5:
                 if len(text_piece) > 0:
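All three parse backends swap the hand-rolled overlap ratio for docling-core's `intersection_over_self`: the intersection area divided by the area of the cell's own box, so a text cell is kept once more than half of it lies inside the query rectangle. A minimal standalone sketch of that ratio, using plain `(l, t, r, b)` tuples in place of docling-core's `BoundingBox` (the helper below is illustrative, not the library API):

```python
# Minimal sketch of the intersection-over-self ratio the backends now rely on.
# Boxes are (l, t, r, b) tuples in a top-left origin, for illustration only.

def intersection_over_self(own, other):
    """Area of overlap divided by the area of `own` (0.0 if degenerate)."""
    l = max(own[0], other[0])
    t = max(own[1], other[1])
    r = min(own[2], other[2])
    b = min(own[3], other[3])
    inter = max(0.0, r - l) * max(0.0, b - t)
    own_area = (own[2] - own[0]) * (own[3] - own[1])
    return inter / own_area if own_area > 0 else 0.0

# A cell half-covered by the query rectangle sits exactly at the 0.5 threshold:
cell = (0.0, 0.0, 10.0, 10.0)
query = (5.0, 0.0, 20.0, 10.0)
assert intersection_over_self(cell, query) == 0.5
```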
docling/backend/msword_backend.py CHANGED
@@ -2,7 +2,7 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, List, Optional, Union

 from docling_core.types.doc import (
     DocItemLabel,
@@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
 from docx.text.paragraph import Paragraph
 from docx.text.run import Run
 from lxml import etree
-from lxml.etree import XPath
 from PIL import Image, UnidentifiedImageError
 from pydantic import AnyUrl
 from typing_extensions import override
@@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.parents: dict[int, Optional[NodeItem]] = {}
         self.numbered_headers: dict[int, int] = {}
         self.equation_bookends: str = "<eq>{EQ}</eq>"
+        # Track processed textbox elements to avoid duplication
+        self.processed_textbox_elements: List[int] = []
+        # Track content hash of processed paragraphs to avoid duplicate content
+        self.processed_paragraph_content: List[str] = []
+
         for i in range(-1, self.max_levels):
             self.parents[i] = None

@@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
             "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
             "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
+            "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
+            "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
+            "v": "urn:schemas-microsoft-com:vml",
+            "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
+            "w10": "urn:schemas-microsoft-com:office:word",
+            "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
         }
-        xpath_expr = XPath(".//a:blip", namespaces=namespaces)
+        xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
         drawing_blip = xpath_expr(element)

+        # Check for textbox content - check multiple textbox formats
+        # Only process if the element hasn't been processed before
+        element_id = id(element)
+        if element_id not in self.processed_textbox_elements:
+            # Modern Word textboxes
+            txbx_xpath = etree.XPath(
+                ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
+            )
+            textbox_elements = txbx_xpath(element)
+
+            # No modern textboxes found, check for alternate/legacy textbox formats
+            if not textbox_elements and tag_name in ["drawing", "pict"]:
+                # Additional checks for textboxes in DrawingML and VML formats
+                alt_txbx_xpath = etree.XPath(
+                    ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
+                    namespaces=namespaces,
+                )
+                textbox_elements = alt_txbx_xpath(element)
+
+            # Check for shape text that's not in a standard textbox
+            if not textbox_elements:
+                shape_text_xpath = etree.XPath(
+                    ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
+                    namespaces=namespaces,
+                )
+                shape_text_elements = shape_text_xpath(element)
+                if shape_text_elements:
+                    # Create custom text elements from shape text
+                    text_content = " ".join(
+                        [t.text for t in shape_text_elements if t.text]
+                    )
+                    if text_content.strip():
+                        _log.debug(f"Found shape text: {text_content[:50]}...")
+                        # Create a paragraph-like element to process with standard handler
+                        level = self._get_level()
+                        shape_group = doc.add_group(
+                            label=GroupLabel.SECTION,
+                            parent=self.parents[level - 1],
+                            name="shape-text",
+                        )
+                        doc.add_text(
+                            label=DocItemLabel.PARAGRAPH,
+                            parent=shape_group,
+                            text=text_content,
+                        )
+
+            if textbox_elements:
+                # Mark the parent element as processed
+                self.processed_textbox_elements.append(element_id)
+                # Also mark all found textbox elements as processed
+                for tb_element in textbox_elements:
+                    self.processed_textbox_elements.append(id(tb_element))
+
+                _log.debug(
+                    f"Found textbox content with {len(textbox_elements)} elements"
+                )
+                self._handle_textbox_content(textbox_elements, docx_obj, doc)
+
         # Check for Tables
         if element.tag.endswith("tbl"):
             try:
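The detection above is a cascade of namespace-qualified XPath probes: `w:txbxContent` for modern DrawingML textboxes, `v:textbox//w:p` for legacy VML, then broader fallbacks. A small self-contained sketch of the first probe against an invented XML fragment (the real docx markup nests these elements much deeper; the snippet only shows the lxml mechanics):

```python
# Sketch of the namespace-aware XPath probe used above, run against a tiny
# hand-made fragment. The XML snippet is invented for illustration.
from lxml import etree

namespaces = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "v": "urn:schemas-microsoft-com:vml",
}
xml = b"""
<w:drawing xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
           xmlns:v="urn:schemas-microsoft-com:vml">
  <w:txbxContent><w:p><w:r><w:t>Inside a textbox</w:t></w:r></w:p></w:txbxContent>
</w:drawing>
"""
element = etree.fromstring(xml)
txbx_xpath = etree.XPath(".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces)
print(len(txbx_xpath(element)))  # -> 1, the txbxContent container
```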
@@ -291,15 +359,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

     @classmethod
     def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
-        has_any_formatting = run.bold or run.italic or run.underline
-        return (
-            Formatting(
-                bold=run.bold or False,
-                italic=run.italic or False,
-                underline=run.underline or False,
-            )
-            if has_any_formatting
-            else None
+        # The .bold and .italic properties are booleans, but .underline can be an enum
+        # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
+        has_bold = run.bold or False
+        has_italic = run.italic or False
+        # Convert any non-None underline value to True
+        has_underline = bool(run.underline is not None and run.underline)
+
+        return Formatting(
+            bold=has_bold,
+            italic=has_italic,
+            underline=has_underline,
         )

     def _get_paragraph_elements(self, paragraph: Paragraph):
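The fix matters because python-docx's `Run.underline` is not a plain boolean: it can be `None`, `True`, `False`, or a `WD_UNDERLINE` style member, and feeding an enum into a strict boolean field breaks. A standalone sketch of the same normalization, with a stand-in enum so it runs without python-docx installed:

```python
# Sketch of the underline normalization: run.underline may be None, True,
# False, or a WD_UNDERLINE enum member. WD_UNDERLINE here is a stand-in
# for docx.enum.text.WD_UNDERLINE, shown for illustration only.
from enum import IntEnum

class WD_UNDERLINE(IntEnum):
    SINGLE = 1
    THICK = 6

def normalize_underline(value) -> bool:
    # Any non-None truthy value (True or an underline style) becomes True.
    return bool(value is not None and value)

for v in (None, False, True, WD_UNDERLINE.THICK):
    print(v, "->", normalize_underline(v))
# None -> False, False -> False, True -> True, WD_UNDERLINE.THICK -> True
```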
@@ -355,6 +425,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

         return paragraph_elements

+    def _get_paragraph_position(self, paragraph_element):
+        """Extract vertical position information from paragraph element."""
+        # First try to directly get the index from w:p element that has an order-related attribute
+        if (
+            hasattr(paragraph_element, "getparent")
+            and paragraph_element.getparent() is not None
+        ):
+            parent = paragraph_element.getparent()
+            # Get all paragraph siblings
+            paragraphs = [
+                p for p in parent.getchildren() if etree.QName(p).localname == "p"
+            ]
+            # Find index of current paragraph within its siblings
+            try:
+                paragraph_index = paragraphs.index(paragraph_element)
+                return paragraph_index  # Use index as position for consistent ordering
+            except ValueError:
+                pass
+
+        # Look for position hints in element attributes and ancestor elements
+        for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
+            # Check for direct position attributes
+            for attr_name in ["y", "top", "positionY", "y-position", "position"]:
+                value = elem.get(attr_name)
+                if value:
+                    try:
+                        # Remove any non-numeric characters (like 'pt', 'px', etc.)
+                        clean_value = re.sub(r"[^0-9.]", "", value)
+                        if clean_value:
+                            return float(clean_value)
+                    except (ValueError, TypeError):
+                        pass
+
+            # Check for position in transform attribute
+            transform = elem.get("transform")
+            if transform:
+                # Extract translation component from transform matrix
+                match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
+                if match:
+                    try:
+                        return float(match.group(1))
+                    except ValueError:
+                        pass
+
+            # Check for anchors or relative position indicators in Word format
+            # 'dist' attributes can indicate relative positioning
+            for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
+                if elem.get(attr_name) is not None:
+                    return elem.sourceline  # Use the XML source line number as fallback
+
+        # For VML shapes, look for specific attributes
+        for ns_uri in paragraph_element.nsmap.values():
+            if "vml" in ns_uri:
+                # Try to extract position from style attribute
+                style = paragraph_element.get("style")
+                if style:
+                    match = re.search(r"top:([0-9.]+)pt", style)
+                    if match:
+                        try:
+                            return float(match.group(1))
+                        except ValueError:
+                            pass
+
+        # If no better position indicator found, use XML source line number as proxy for order
+        return (
+            paragraph_element.sourceline
+            if hasattr(paragraph_element, "sourceline")
+            else None
+        )
+
+    def _collect_textbox_paragraphs(self, textbox_elements):
+        """Collect and organize paragraphs from textbox elements."""
+        processed_paragraphs = []
+        container_paragraphs = {}
+
+        for element in textbox_elements:
+            element_id = id(element)
+            # Skip if we've already processed this exact element
+            if element_id in processed_paragraphs:
+                continue
+
+            tag_name = etree.QName(element).localname
+            processed_paragraphs.append(element_id)
+
+            # Handle paragraphs directly found (VML textboxes)
+            if tag_name == "p":
+                # Find the containing textbox or shape element
+                container_id = None
+                for ancestor in element.iterancestors():
+                    if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
+                        container_id = id(ancestor)
+                        break
+
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+                container_paragraphs[container_id].append(
+                    (element, self._get_paragraph_position(element))
+                )
+
+            # Handle txbxContent elements (Word DrawingML textboxes)
+            elif tag_name == "txbxContent":
+                paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
+                container_id = id(element)
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+
+                for p in paragraphs:
+                    p_id = id(p)
+                    if p_id not in processed_paragraphs:
+                        processed_paragraphs.append(p_id)
+                        container_paragraphs[container_id].append(
+                            (p, self._get_paragraph_position(p))
+                        )
+            else:
+                # Try to extract any paragraphs from unknown elements
+                paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
+                container_id = id(element)
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+
+                for p in paragraphs:
+                    p_id = id(p)
+                    if p_id not in processed_paragraphs:
+                        processed_paragraphs.append(p_id)
+                        container_paragraphs[container_id].append(
+                            (p, self._get_paragraph_position(p))
+                        )
+
+        return container_paragraphs
+
+    def _handle_textbox_content(
+        self,
+        textbox_elements: list,
+        docx_obj: DocxDocument,
+        doc: DoclingDocument,
+    ) -> None:
+        """Process textbox content and add it to the document structure."""
+        level = self._get_level()
+        # Create a textbox group to contain all text from the textbox
+        textbox_group = doc.add_group(
+            label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
+        )
+
+        # Set this as the current parent to ensure textbox content
+        # is properly nested in document structure
+        original_parent = self.parents[level]
+        self.parents[level] = textbox_group
+
+        # Collect and organize paragraphs
+        container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)
+
+        # Process all paragraphs
+        all_paragraphs = []
+
+        # Sort paragraphs within each container, then process containers
+        for container_id, paragraphs in container_paragraphs.items():
+            # Sort by vertical position within each container
+            sorted_container_paragraphs = sorted(
+                paragraphs,
+                key=lambda x: (
+                    x[1] is None,
+                    x[1] if x[1] is not None else float("inf"),
+                ),
+            )
+
+            # Add the sorted paragraphs to our processing list
+            all_paragraphs.extend(sorted_container_paragraphs)
+
+        # Process all the paragraphs
+        for p, _ in all_paragraphs:
+            self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
+
+        # Restore original parent
+        self.parents[level] = original_parent
+        return
+
     def _handle_equations_in_text(self, element, text):
         only_texts = []
         only_equations = []
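The ordering logic in `_handle_textbox_content` uses a two-part sort key so that paragraphs with a recoverable vertical position come first in ascending order, while paragraphs whose position could not be determined (`None`) keep a stable place at the end instead of raising a comparison error. A standalone illustration with invented data:

```python
# The (is_None, value) sort key used for textbox paragraphs: positioned
# entries sort numerically, unpositioned ones (None) sink to the end.
paragraphs = [("p1", 120.0), ("p2", None), ("p3", 15.5), ("p4", 40.0)]

ordered = sorted(
    paragraphs,
    key=lambda x: (x[1] is None, x[1] if x[1] is not None else float("inf")),
)
print([name for name, _ in ordered])  # ['p3', 'p4', 'p1', 'p2']
```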
@@ -423,10 +669,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
         doc: DoclingDocument,
+        is_from_textbox: bool = False,
     ) -> None:
         paragraph = Paragraph(element, docx_obj)

+        # Skip if from a textbox and this exact paragraph content was already processed
         raw_text = paragraph.text
+        if is_from_textbox and raw_text:
+            # Create a simple hash of content to detect duplicates
+            content_hash = f"{len(raw_text)}:{raw_text[:50]}"
+            if content_hash in self.processed_paragraph_content:
+                _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
+                return
+            self.processed_paragraph_content.append(content_hash)
+
         text, equations = self._handle_equations_in_text(element=element, text=raw_text)

         if text is None:
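The dedup key is deliberately cheap: text length plus the first 50 characters, rather than a cryptographic hash. Two textbox paragraphs sharing both are treated as duplicates, trading a small collision risk for speed. A minimal sketch of the same check in isolation:

```python
# Sketch of the length-plus-prefix dedup key used for textbox paragraphs.
processed: list = []

def is_duplicate(raw_text: str) -> bool:
    content_hash = f"{len(raw_text)}:{raw_text[:50]}"
    if content_hash in processed:
        return True
    processed.append(content_hash)
    return False

print(is_duplicate("Quarterly results"))   # False, first sighting
print(is_duplicate("Quarterly results"))   # True, same length and prefix
print(is_duplicate("Quarterly results!"))  # False, length differs
```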
docling/backend/pypdfium2_backend.py CHANGED
@@ -175,13 +175,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
         if len(group) == 1:
             return group[0]

-        merged_text = "".join(cell.text for cell in group)
         merged_bbox = BoundingBox(
             l=min(cell.rect.to_bounding_box().l for cell in group),
             t=min(cell.rect.to_bounding_box().t for cell in group),
             r=max(cell.rect.to_bounding_box().r for cell in group),
             b=max(cell.rect.to_bounding_box().b for cell in group),
         )
+
+        assert self._ppage is not None
+        self.text_page = self._ppage.get_textpage()
+        bbox = merged_bbox.to_bottom_left_origin(page_size.height)
+        merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
+
         return TextCell(
             index=group[0].index,
             text=merged_text,
docling/datamodel/base_models.py CHANGED
@@ -1,6 +1,9 @@
+import math
+from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union

+import numpy as np
 from docling_core.types.doc import (
     BoundingBox,
     DocItemLabel,
@@ -16,7 +19,7 @@ from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field, computed_field

 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
@@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
     choices: List[OpenAiResponseChoice]
     created: int
     usage: OpenAiResponseUsage
+
+
+# Create a type alias for score values
+ScoreValue = float
+
+
+class QualityGrade(str, Enum):
+    POOR = "poor"
+    FAIR = "fair"
+    GOOD = "good"
+    EXCELLENT = "excellent"
+    UNSPECIFIED = "unspecified"
+
+
+class PageConfidenceScores(BaseModel):
+    parse_score: ScoreValue = np.nan
+    layout_score: ScoreValue = np.nan
+    table_score: ScoreValue = np.nan
+    ocr_score: ScoreValue = np.nan
+
+    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
+        if score < 0.5:
+            return QualityGrade.POOR
+        elif score < 0.8:
+            return QualityGrade.FAIR
+        elif score < 0.9:
+            return QualityGrade.GOOD
+        elif score >= 0.9:
+            return QualityGrade.EXCELLENT
+
+        return QualityGrade.UNSPECIFIED
+
+    @computed_field  # type: ignore
+    @property
+    def mean_grade(self) -> QualityGrade:
+        return self._score_to_grade(self.mean_score)
+
+    @computed_field  # type: ignore
+    @property
+    def low_grade(self) -> QualityGrade:
+        return self._score_to_grade(self.low_score)
+
+    @computed_field  # type: ignore
+    @property
+    def mean_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [
+                    self.ocr_score,
+                    self.table_score,
+                    self.layout_score,
+                    self.parse_score,
+                ]
+            )
+        )
+
+    @computed_field  # type: ignore
+    @property
+    def low_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanquantile(
+                [
+                    self.ocr_score,
+                    self.table_score,
+                    self.layout_score,
+                    self.parse_score,
+                ],
+                q=0.05,
+            )
+        )
+
+
+class ConfidenceReport(PageConfidenceScores):
+    pages: Dict[int, PageConfidenceScores] = Field(
+        default_factory=lambda: defaultdict(PageConfidenceScores)
+    )
+
+    @computed_field  # type: ignore
+    @property
+    def mean_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [c.mean_score for c in self.pages.values()],
+            )
+        )
+
+    @computed_field  # type: ignore
+    @property
+    def low_score(self) -> ScoreValue:
+        return ScoreValue(
+            np.nanmean(
+                [c.low_score for c in self.pages.values()],
+            )
+        )
docling/datamodel/document.py CHANGED
@@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from typing_extensions import deprecated

 from docling.backend.abstract_backend import (
@@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
 )
 from docling.datamodel.base_models import (
     AssembledUnit,
+    ConfidenceReport,
     ConversionStatus,
     DocumentStream,
     ErrorItem,
@@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
     pages: List[Page] = []
     assembled: AssembledUnit = AssembledUnit()
     timings: Dict[str, ProfilingItem] = {}
+    confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)

     document: DoclingDocument = _EMPTY_DOCLING_DOC

@@ -302,7 +304,7 @@ class _DocumentConversionInput(BaseModel):
             if ("." in obj.name and not obj.name.startswith("."))
             else ""
         )
-        mime = _DocumentConversionInput._mime_from_extension(ext)
+        mime = _DocumentConversionInput._mime_from_extension(ext.lower())
         if mime is not None and mime.lower() == "application/zip":
             objname = obj.name.lower()
             if objname.endswith(".xlsx"):
@@ -376,6 +378,13 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
         elif ext in FormatToExtensions[InputFormat.PDF]:
             mime = FormatToMimeType[InputFormat.PDF][0]
+        elif ext in FormatToExtensions[InputFormat.DOCX]:
+            mime = FormatToMimeType[InputFormat.DOCX][0]
+        elif ext in FormatToExtensions[InputFormat.PPTX]:
+            mime = FormatToMimeType[InputFormat.PPTX][0]
+        elif ext in FormatToExtensions[InputFormat.XLSX]:
+            mime = FormatToMimeType[InputFormat.XLSX][0]
+
         return mime

     @staticmethod
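The `ext.lower()` change plus the new DOCX/PPTX/XLSX branches matter for uppercase filenames such as REPORT.DOCX, where content sniffing may come up empty and the extension fallback is all that's left. A sketch of the case-folding fix in isolation, with a hypothetical lookup table standing in for docling's `FormatToExtensions`/`FormatToMimeType`:

```python
# Sketch of the extension fallback with case folding. EXT_TO_MIME is a
# hypothetical stand-in for docling's format tables; the .lower() call
# is the point.
from typing import Optional

EXT_TO_MIME = {
    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "pdf": "application/pdf",
}

def mime_from_extension(ext: str) -> Optional[str]:
    return EXT_TO_MIME.get(ext)  # pre-fix, "DOCX" would miss the table

for name in ("report.docx", "REPORT.DOCX"):
    ext = name.rsplit(".", 1)[-1]
    print(name, "->", mime_from_extension(ext.lower()))  # both resolve now
```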
docling/models/layout_model.py CHANGED
@@ -5,6 +5,7 @@ from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional

+import numpy as np
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import Image
@@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
                 ).postprocess()
                 # processed_clusters, processed_cells = clusters, page.cells

+                conv_res.confidence.pages[page.page_no].layout_score = float(
+                    np.mean([c.confidence for c in processed_clusters])
+                )
+
+                conv_res.confidence.pages[page.page_no].ocr_score = float(
+                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                )
+
                 page.cells = processed_cells
                 page.predictions.layout = LayoutPrediction(
                     clusters=processed_clusters
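The layout score is the plain mean of the predicted clusters' confidences, while the OCR score averages only cells that originated from OCR. With no OCR cells on a page, `np.mean([])` yields NaN (with a RuntimeWarning), which the nan-aware report aggregation in base_models.py then ignores. A small sketch with invented values:

```python
# Sketch of the per-page aggregation: mean cluster confidence for
# layout_score, mean over OCR-originated cells for ocr_score.
import numpy as np

cluster_confidences = [0.98, 0.91, 0.77]
cells = [
    {"confidence": 0.90, "from_ocr": True},
    {"confidence": 1.00, "from_ocr": False},  # programmatic text, not OCR
]

layout_score = float(np.mean(cluster_confidences))
ocr_score = float(np.mean([c["confidence"] for c in cells if c["from_ocr"]]))
print(round(layout_score, 4), ocr_score)  # 0.8867 0.9
```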
docling/models/page_preprocessing_model.py CHANGED
@@ -1,11 +1,13 @@
+import re
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional

+import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel

-from docling.datamodel.base_models import Page
+from docling.datamodel.base_models import Page, ScoreValue
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
     def __init__(self, options: PagePreprocessingOptions):
         self.options = options

+        # Pre-compiled regex patterns for efficiency
+        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
+        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
+        self.SLASH_NUMBER_GARBAGE_RE = re.compile(
+            r"(?:/\w+\s*){2,}"
+        )  # Two or more "/token " sequences
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
             if self.options.create_parsed_page:
                 page.parsed_page = page._backend.get_segmented_page()

+            # Rate the text quality from the PDF parser, and aggregate on page
+            text_scores = []
+            for c in page.cells:
+                score = self.rate_text_quality(c.text)
+                text_scores.append(score)
+
+            conv_res.confidence.pages[page.page_no].parse_score = float(
+                np.nanquantile(
+                    text_scores, q=0.10
+                )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
+            )
+
             # DEBUG code:
             def draw_text_boxes(image, cells, show: bool = False):
                 draw = ImageDraw.Draw(image)
@@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
                 draw_text_boxes(page.get_image(scale=1.0), page.cells)

         return page
+
+    def rate_text_quality(self, text: str) -> float:
+        # Hard errors: if any of these patterns are found, return 0.0 immediately.
+        blacklist_chars = ["�"]
+        if (
+            any(text.find(c) >= 0 for c in blacklist_chars)
+            or self.GLYPH_RE.search(text)
+            or self.SLASH_G_RE.search(text)
+            or self.SLASH_NUMBER_GARBAGE_RE.match(
+                text
+            )  # Check if text is mostly slash-number pattern
+        ):
+            return 0.0
+
+        penalty = 0.0
+
+        # Apply a penalty only if the fragmented words pattern occurs at least three times.
+        frag_matches = self.FRAG_RE.findall(text)
+        if len(frag_matches) >= 3:
+            penalty += 0.1 * len(frag_matches)
+
+        # Additional heuristic: if the average token length is below 2, add a penalty.
+        # tokens = text.split()
+        # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
+        #     penalty += 0.2
+
+        return max(1.0 - penalty, 0.0)
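The heuristic returns 0.0 outright on hard failures (Unicode replacement characters, `GLYPH<..>` codes from unmapped fonts, runs of raw `/G123` glyph references) and otherwise subtracts 0.1 per fragmented-word match once at least three appear. A standalone check of the hard-failure patterns, regexes copied from the hunk above:

```python
# Standalone check of the garbled-text detectors, regexes copied from above.
import re

GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")

def is_hard_garbage(text: str) -> bool:
    return (
        "�" in text
        or bool(GLYPH_RE.search(text))
        or bool(SLASH_G_RE.search(text))
    )

print(is_hard_garbage("Normal sentence."))        # False
print(is_hard_garbage("GLYPH<0041>GLYPH<0042>"))  # True, unmapped glyph IDs
print(is_hard_garbage("/G12/G34/G56"))            # True, raw /G glyph runs
```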