docling 2.36.0__py3-none-any.whl → 2.37.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +39 -18
- docling/backend/docling_parse_backend.py +61 -59
- docling/backend/docling_parse_v2_backend.py +72 -62
- docling/backend/docling_parse_v4_backend.py +21 -19
- docling/backend/mspowerpoint_backend.py +72 -113
- docling/backend/msword_backend.py +28 -18
- docling/backend/pypdfium2_backend.py +127 -53
- docling/datamodel/base_models.py +10 -3
- docling/datamodel/pipeline_options.py +3 -1
- docling/datamodel/pipeline_options_vlm_model.py +2 -1
- docling/models/base_ocr_model.py +33 -11
- docling/models/easyocr_model.py +1 -1
- docling/models/layout_model.py +2 -3
- docling/models/ocr_mac_model.py +1 -1
- docling/models/page_preprocessing_model.py +3 -6
- docling/models/rapid_ocr_model.py +1 -1
- docling/models/readingorder_model.py +2 -2
- docling/models/tesseract_ocr_cli_model.py +4 -3
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +1 -0
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/utils/layout_postprocessor.py +11 -6
- {docling-2.36.0.dist-info → docling-2.37.0.dist-info}/METADATA +2 -3
- {docling-2.36.0.dist-info → docling-2.37.0.dist-info}/RECORD +28 -28
- {docling-2.36.0.dist-info → docling-2.37.0.dist-info}/WHEEL +0 -0
- {docling-2.36.0.dist-info → docling-2.37.0.dist-info}/entry_points.txt +0 -0
- {docling-2.36.0.dist-info → docling-2.37.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.36.0.dist-info → docling-2.37.0.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
|
|
20
20
|
from PIL import Image, UnidentifiedImageError
|
21
21
|
from pptx import Presentation
|
22
22
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
23
|
+
from pptx.oxml.text import CT_TextLineBreak
|
23
24
|
|
24
25
|
from docling.backend.abstract_backend import (
|
25
26
|
DeclarativeDocumentBackend,
|
@@ -120,136 +121,91 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
120
121
|
|
121
122
|
return prov
|
122
123
|
|
123
|
-
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
124
|
-
is_a_list = False
|
124
|
+
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
125
125
|
is_list_group_created = False
|
126
126
|
enum_list_item_value = 0
|
127
127
|
new_list = None
|
128
|
-
bullet_type = "None"
|
129
|
-
list_label = GroupLabel.LIST
|
130
128
|
doc_label = DocItemLabel.LIST_ITEM
|
131
129
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
132
130
|
|
133
|
-
|
134
|
-
|
135
|
-
# Check if paragraph is a bullet point using the `element` XML
|
131
|
+
def is_list_item(paragraph):
|
132
|
+
"""Check if the paragraph is a list item."""
|
136
133
|
p = paragraph._element
|
137
134
|
if (
|
138
135
|
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
|
139
136
|
is not None
|
140
137
|
):
|
141
|
-
|
142
|
-
is_a_list = True
|
138
|
+
return (True, "Bullet")
|
143
139
|
elif (
|
144
140
|
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
|
145
141
|
is not None
|
146
142
|
):
|
147
|
-
|
148
|
-
|
149
|
-
else:
|
150
|
-
is_a_list = False
|
151
|
-
|
152
|
-
if paragraph.level > 0:
|
143
|
+
return (True, "Numbered")
|
144
|
+
elif paragraph.level > 0:
|
153
145
|
# Most likely a sub-list
|
154
|
-
|
155
|
-
|
156
|
-
if is_a_list:
|
157
|
-
# Determine if this is an unordered list or an ordered list.
|
158
|
-
# Set GroupLabel.ORDERED_LIST when it fits.
|
159
|
-
if bullet_type == "Numbered":
|
160
|
-
list_label = GroupLabel.ORDERED_LIST
|
161
|
-
|
162
|
-
if is_a_list:
|
163
|
-
_log.debug("LIST DETECTED!")
|
146
|
+
return (True, "None")
|
164
147
|
else:
|
165
|
-
|
166
|
-
|
167
|
-
# If there is a list inside of the shape, create a new docling list to assign list items to
|
168
|
-
# if is_a_list:
|
169
|
-
# new_list = doc.add_group(
|
170
|
-
# label=list_label, name=f"list", parent=parent_slide
|
171
|
-
# )
|
148
|
+
return (False, "None")
|
172
149
|
|
173
150
|
# Iterate through paragraphs to build up text
|
174
151
|
for paragraph in shape.text_frame.paragraphs:
|
175
|
-
|
152
|
+
is_a_list, bullet_type = is_list_item(paragraph)
|
176
153
|
p = paragraph._element
|
177
|
-
enum_list_item_value += 1
|
178
|
-
inline_paragraph_text = ""
|
179
|
-
inline_list_item_text = ""
|
180
|
-
|
181
|
-
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
|
182
|
-
if len(e.text.strip()) > 0:
|
183
|
-
e_is_a_list_item = False
|
184
|
-
is_numbered = False
|
185
|
-
if (
|
186
|
-
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
|
187
|
-
is not None
|
188
|
-
):
|
189
|
-
bullet_type = "Bullet"
|
190
|
-
e_is_a_list_item = True
|
191
|
-
elif (
|
192
|
-
p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
|
193
|
-
is not None
|
194
|
-
):
|
195
|
-
bullet_type = "Numbered"
|
196
|
-
is_numbered = True
|
197
|
-
e_is_a_list_item = True
|
198
|
-
else:
|
199
|
-
e_is_a_list_item = False
|
200
|
-
|
201
|
-
if e_is_a_list_item:
|
202
|
-
if len(inline_paragraph_text) > 0:
|
203
|
-
# output accumulated inline text:
|
204
|
-
doc.add_text(
|
205
|
-
label=doc_label,
|
206
|
-
parent=parent_slide,
|
207
|
-
text=inline_paragraph_text,
|
208
|
-
prov=prov,
|
209
|
-
)
|
210
|
-
# Set marker and enumerated arguments if this is an enumeration element.
|
211
|
-
inline_list_item_text += e.text
|
212
|
-
# print(e.text)
|
213
|
-
else:
|
214
|
-
# Assign proper label to the text, depending if it's a Title or Section Header
|
215
|
-
# For other types of text, assign - PARAGRAPH
|
216
|
-
doc_label = DocItemLabel.PARAGRAPH
|
217
|
-
if shape.is_placeholder:
|
218
|
-
placeholder_type = shape.placeholder_format.type
|
219
|
-
if placeholder_type in [
|
220
|
-
PP_PLACEHOLDER.CENTER_TITLE,
|
221
|
-
PP_PLACEHOLDER.TITLE,
|
222
|
-
]:
|
223
|
-
# It's a title
|
224
|
-
doc_label = DocItemLabel.TITLE
|
225
|
-
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
|
226
|
-
DocItemLabel.SECTION_HEADER
|
227
|
-
enum_list_item_value = 0
|
228
|
-
inline_paragraph_text += e.text
|
229
|
-
|
230
|
-
if len(inline_paragraph_text) > 0:
|
231
|
-
# output accumulated inline text:
|
232
|
-
doc.add_text(
|
233
|
-
label=doc_label,
|
234
|
-
parent=parent_slide,
|
235
|
-
text=inline_paragraph_text,
|
236
|
-
prov=prov,
|
237
|
-
)
|
238
154
|
|
239
|
-
|
155
|
+
# Convert line breaks to spaces and accumulate text
|
156
|
+
p_text = ""
|
157
|
+
for e in p.content_children:
|
158
|
+
if isinstance(e, CT_TextLineBreak):
|
159
|
+
p_text += " "
|
160
|
+
else:
|
161
|
+
p_text += e.text
|
162
|
+
|
163
|
+
if is_a_list:
|
240
164
|
enum_marker = ""
|
241
|
-
|
242
|
-
|
165
|
+
enumerated = bullet_type == "Numbered"
|
166
|
+
|
243
167
|
if not is_list_group_created:
|
244
168
|
new_list = doc.add_group(
|
245
|
-
label=
|
169
|
+
label=GroupLabel.ORDERED_LIST
|
170
|
+
if enumerated
|
171
|
+
else GroupLabel.LIST,
|
172
|
+
name="list",
|
173
|
+
parent=parent_slide,
|
246
174
|
)
|
247
175
|
is_list_group_created = True
|
176
|
+
enum_list_item_value = 0
|
177
|
+
|
178
|
+
if enumerated:
|
179
|
+
enum_list_item_value += 1
|
180
|
+
enum_marker = str(enum_list_item_value) + "."
|
181
|
+
|
248
182
|
doc.add_list_item(
|
249
183
|
marker=enum_marker,
|
250
|
-
enumerated=
|
184
|
+
enumerated=enumerated,
|
251
185
|
parent=new_list,
|
252
|
-
text=
|
186
|
+
text=p_text,
|
187
|
+
prov=prov,
|
188
|
+
)
|
189
|
+
else: # is paragraph not a list item
|
190
|
+
# Assign proper label to the text, depending if it's a Title or Section Header
|
191
|
+
# For other types of text, assign - PARAGRAPH
|
192
|
+
doc_label = DocItemLabel.PARAGRAPH
|
193
|
+
if shape.is_placeholder:
|
194
|
+
placeholder_type = shape.placeholder_format.type
|
195
|
+
if placeholder_type in [
|
196
|
+
PP_PLACEHOLDER.CENTER_TITLE,
|
197
|
+
PP_PLACEHOLDER.TITLE,
|
198
|
+
]:
|
199
|
+
# It's a title
|
200
|
+
doc_label = DocItemLabel.TITLE
|
201
|
+
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
|
202
|
+
DocItemLabel.SECTION_HEADER
|
203
|
+
|
204
|
+
# output accumulated inline text:
|
205
|
+
doc.add_text(
|
206
|
+
label=doc_label,
|
207
|
+
parent=parent_slide,
|
208
|
+
text=p_text,
|
253
209
|
prov=prov,
|
254
210
|
)
|
255
211
|
return
|
@@ -423,18 +379,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
423
379
|
# Handle notes slide
|
424
380
|
if slide.has_notes_slide:
|
425
381
|
notes_slide = slide.notes_slide
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
382
|
+
if notes_slide.notes_text_frame is not None:
|
383
|
+
notes_text = notes_slide.notes_text_frame.text.strip()
|
384
|
+
if notes_text:
|
385
|
+
bbox = BoundingBox(l=0, t=0, r=0, b=0)
|
386
|
+
prov = ProvenanceItem(
|
387
|
+
page_no=slide_ind + 1,
|
388
|
+
charspan=[0, len(notes_text)],
|
389
|
+
bbox=bbox,
|
390
|
+
)
|
391
|
+
doc.add_text(
|
392
|
+
label=DocItemLabel.TEXT,
|
393
|
+
parent=parent_slide,
|
394
|
+
text=notes_text,
|
395
|
+
prov=prov,
|
396
|
+
content_layer=ContentLayer.FURNITURE,
|
397
|
+
)
|
439
398
|
|
440
399
|
return doc
|
@@ -60,8 +60,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
60
60
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
61
61
|
# Track processed textbox elements to avoid duplication
|
62
62
|
self.processed_textbox_elements: List[int] = []
|
63
|
-
# Track content hash of processed paragraphs to avoid duplicate content
|
64
|
-
self.processed_paragraph_content: List[str] = []
|
65
63
|
|
66
64
|
for i in range(-1, self.max_levels):
|
67
65
|
self.parents[i] = None
|
@@ -593,9 +591,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
593
591
|
# Add the sorted paragraphs to our processing list
|
594
592
|
all_paragraphs.extend(sorted_container_paragraphs)
|
595
593
|
|
594
|
+
# Track processed paragraphs to avoid duplicates (same content and position)
|
595
|
+
processed_paragraphs = set()
|
596
|
+
|
596
597
|
# Process all the paragraphs
|
597
|
-
for p,
|
598
|
-
|
598
|
+
for p, position in all_paragraphs:
|
599
|
+
# Create paragraph object to get text content
|
600
|
+
paragraph = Paragraph(p, docx_obj)
|
601
|
+
text_content = paragraph.text
|
602
|
+
|
603
|
+
# Create a unique identifier based on content and position
|
604
|
+
paragraph_id = (text_content, position)
|
605
|
+
|
606
|
+
# Skip if this paragraph (same content and position) was already processed
|
607
|
+
if paragraph_id in processed_paragraphs:
|
608
|
+
_log.debug(
|
609
|
+
f"Skipping duplicate paragraph: content='{text_content[:50]}...', position={position}"
|
610
|
+
)
|
611
|
+
continue
|
612
|
+
|
613
|
+
# Mark this paragraph as processed
|
614
|
+
processed_paragraphs.add(paragraph_id)
|
615
|
+
|
616
|
+
self._handle_text_elements(p, docx_obj, doc)
|
599
617
|
|
600
618
|
# Restore original parent
|
601
619
|
self.parents[level] = original_parent
|
@@ -669,22 +687,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
669
687
|
element: BaseOxmlElement,
|
670
688
|
docx_obj: DocxDocument,
|
671
689
|
doc: DoclingDocument,
|
672
|
-
is_from_textbox: bool = False,
|
673
690
|
) -> None:
|
674
691
|
paragraph = Paragraph(element, docx_obj)
|
675
692
|
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
if is_from_textbox and raw_text:
|
680
|
-
# Create a simple hash of content to detect duplicates
|
681
|
-
content_hash = f"{len(raw_text)}:{raw_text[:50]}"
|
682
|
-
if content_hash in self.processed_paragraph_content:
|
683
|
-
_log.debug(f"Skipping duplicate paragraph content: {content_hash}")
|
684
|
-
return
|
685
|
-
self.processed_paragraph_content.append(content_hash)
|
686
|
-
|
687
|
-
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
693
|
+
text, equations = self._handle_equations_in_text(
|
694
|
+
element=element, text=paragraph.text
|
695
|
+
)
|
688
696
|
|
689
697
|
if text is None:
|
690
698
|
return
|
@@ -750,7 +758,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
750
758
|
self._add_header(doc, p_level, text, is_numbered_style)
|
751
759
|
|
752
760
|
elif len(equations) > 0:
|
753
|
-
if (
|
761
|
+
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
|
762
|
+
text
|
763
|
+
) > 0:
|
754
764
|
# Standalone equation
|
755
765
|
level = self._get_level()
|
756
766
|
doc.add_text(
|
@@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
|
|
8
8
|
import pypdfium2 as pdfium
|
9
9
|
import pypdfium2.raw as pdfium_c
|
10
10
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
11
|
-
from docling_core.types.doc.page import
|
11
|
+
from docling_core.types.doc.page import (
|
12
|
+
BoundingRectangle,
|
13
|
+
PdfPageBoundaryType,
|
14
|
+
PdfPageGeometry,
|
15
|
+
SegmentedPdfPage,
|
16
|
+
TextCell,
|
17
|
+
)
|
12
18
|
from PIL import Image, ImageDraw
|
13
19
|
from pypdfium2 import PdfTextPage
|
14
20
|
from pypdfium2._helpers.misc import PdfiumError
|
@@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
|
|
16
22
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
17
23
|
from docling.utils.locks import pypdfium2_lock
|
18
24
|
|
25
|
+
|
26
|
+
def get_pdf_page_geometry(
|
27
|
+
ppage: pdfium.PdfPage,
|
28
|
+
angle: float = 0.0,
|
29
|
+
boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
|
30
|
+
) -> PdfPageGeometry:
|
31
|
+
"""
|
32
|
+
Create PdfPageGeometry from a pypdfium2 PdfPage object.
|
33
|
+
|
34
|
+
Args:
|
35
|
+
ppage: pypdfium2 PdfPage object
|
36
|
+
angle: Page rotation angle in degrees (default: 0.0)
|
37
|
+
boundary_type: The boundary type for the page (default: CROP_BOX)
|
38
|
+
|
39
|
+
Returns:
|
40
|
+
PdfPageGeometry with all the different bounding boxes properly set
|
41
|
+
"""
|
42
|
+
with pypdfium2_lock:
|
43
|
+
# Get the main bounding box (intersection of crop_box and media_box)
|
44
|
+
bbox_tuple = ppage.get_bbox()
|
45
|
+
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
|
46
|
+
|
47
|
+
# Get all the different page boxes from pypdfium2
|
48
|
+
media_box_tuple = ppage.get_mediabox()
|
49
|
+
crop_box_tuple = ppage.get_cropbox()
|
50
|
+
art_box_tuple = ppage.get_artbox()
|
51
|
+
bleed_box_tuple = ppage.get_bleedbox()
|
52
|
+
trim_box_tuple = ppage.get_trimbox()
|
53
|
+
|
54
|
+
# Convert to BoundingBox objects using existing from_tuple method
|
55
|
+
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
|
56
|
+
# Use bbox as fallback when specific box types are not defined
|
57
|
+
media_bbox = (
|
58
|
+
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
|
59
|
+
if media_box_tuple
|
60
|
+
else bbox
|
61
|
+
)
|
62
|
+
crop_bbox = (
|
63
|
+
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
|
64
|
+
if crop_box_tuple
|
65
|
+
else bbox
|
66
|
+
)
|
67
|
+
art_bbox = (
|
68
|
+
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
|
69
|
+
if art_box_tuple
|
70
|
+
else bbox
|
71
|
+
)
|
72
|
+
bleed_bbox = (
|
73
|
+
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
|
74
|
+
if bleed_box_tuple
|
75
|
+
else bbox
|
76
|
+
)
|
77
|
+
trim_bbox = (
|
78
|
+
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
|
79
|
+
if trim_box_tuple
|
80
|
+
else bbox
|
81
|
+
)
|
82
|
+
|
83
|
+
return PdfPageGeometry(
|
84
|
+
angle=angle,
|
85
|
+
rect=BoundingRectangle.from_bounding_box(bbox),
|
86
|
+
boundary_type=boundary_type,
|
87
|
+
art_bbox=art_bbox,
|
88
|
+
bleed_bbox=bleed_bbox,
|
89
|
+
crop_bbox=crop_bbox,
|
90
|
+
media_bbox=media_bbox,
|
91
|
+
trim_bbox=trim_bbox,
|
92
|
+
)
|
93
|
+
|
94
|
+
|
19
95
|
if TYPE_CHECKING:
|
20
96
|
from docling.datamodel.document import InputDocument
|
21
97
|
|
@@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
41
117
|
def is_valid(self) -> bool:
|
42
118
|
return self.valid
|
43
119
|
|
44
|
-
def
|
45
|
-
|
46
|
-
page_size = self.get_size()
|
47
|
-
with pypdfium2_lock:
|
48
|
-
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
49
|
-
pos = obj.get_pos()
|
50
|
-
cropbox = BoundingBox.from_tuple(
|
51
|
-
pos, origin=CoordOrigin.BOTTOMLEFT
|
52
|
-
).to_top_left_origin(page_height=page_size.height)
|
53
|
-
|
54
|
-
if cropbox.area() > AREA_THRESHOLD:
|
55
|
-
cropbox = cropbox.scaled(scale=scale)
|
56
|
-
|
57
|
-
yield cropbox
|
58
|
-
|
59
|
-
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
60
|
-
with pypdfium2_lock:
|
61
|
-
if not self.text_page:
|
62
|
-
self.text_page = self._ppage.get_textpage()
|
63
|
-
|
64
|
-
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
65
|
-
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
66
|
-
|
67
|
-
with pypdfium2_lock:
|
68
|
-
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
69
|
-
|
70
|
-
return text_piece
|
71
|
-
|
72
|
-
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
73
|
-
return None
|
74
|
-
|
75
|
-
def get_text_cells(self) -> Iterable[TextCell]:
|
120
|
+
def _compute_text_cells(self) -> List[TextCell]:
|
121
|
+
"""Compute text cells from pypdfium."""
|
76
122
|
with pypdfium2_lock:
|
77
123
|
if not self.text_page:
|
78
124
|
self.text_page = self._ppage.get_textpage()
|
@@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
203
249
|
|
204
250
|
return merged_cells
|
205
251
|
|
206
|
-
|
207
|
-
image = (
|
208
|
-
self.get_page_image()
|
209
|
-
) # make new image to avoid drawing on the saved ones
|
210
|
-
draw = ImageDraw.Draw(image)
|
211
|
-
for c in cells:
|
212
|
-
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
213
|
-
cell_color = (
|
214
|
-
random.randint(30, 140),
|
215
|
-
random.randint(30, 140),
|
216
|
-
random.randint(30, 140),
|
217
|
-
)
|
218
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
219
|
-
image.show()
|
252
|
+
return merge_horizontal_cells(cells)
|
220
253
|
|
221
|
-
|
222
|
-
#
|
254
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
255
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
256
|
+
page_size = self.get_size()
|
257
|
+
with pypdfium2_lock:
|
258
|
+
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
259
|
+
pos = obj.get_pos()
|
260
|
+
cropbox = BoundingBox.from_tuple(
|
261
|
+
pos, origin=CoordOrigin.BOTTOMLEFT
|
262
|
+
).to_top_left_origin(page_height=page_size.height)
|
223
263
|
|
224
|
-
|
264
|
+
if cropbox.area() > AREA_THRESHOLD:
|
265
|
+
cropbox = cropbox.scaled(scale=scale)
|
225
266
|
|
226
|
-
|
227
|
-
# draw_clusters_and_cells()
|
267
|
+
yield cropbox
|
228
268
|
|
229
|
-
|
269
|
+
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
270
|
+
with pypdfium2_lock:
|
271
|
+
if not self.text_page:
|
272
|
+
self.text_page = self._ppage.get_textpage()
|
273
|
+
|
274
|
+
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
275
|
+
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
276
|
+
|
277
|
+
with pypdfium2_lock:
|
278
|
+
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
279
|
+
|
280
|
+
return text_piece
|
281
|
+
|
282
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
283
|
+
if not self.valid:
|
284
|
+
return None
|
285
|
+
|
286
|
+
text_cells = self._compute_text_cells()
|
287
|
+
|
288
|
+
# Get the PDF page geometry from pypdfium2
|
289
|
+
dimension = get_pdf_page_geometry(self._ppage)
|
290
|
+
|
291
|
+
# Create SegmentedPdfPage
|
292
|
+
return SegmentedPdfPage(
|
293
|
+
dimension=dimension,
|
294
|
+
textline_cells=text_cells,
|
295
|
+
char_cells=[],
|
296
|
+
word_cells=[],
|
297
|
+
has_textlines=len(text_cells) > 0,
|
298
|
+
has_words=False,
|
299
|
+
has_chars=False,
|
300
|
+
)
|
301
|
+
|
302
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
303
|
+
return self._compute_text_cells()
|
230
304
|
|
231
305
|
def get_page_image(
|
232
306
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
docling/datamodel/base_models.py
CHANGED
@@ -67,10 +67,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
67
67
|
InputFormat.MD: ["md"],
|
68
68
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
69
69
|
InputFormat.XML_JATS: ["xml", "nxml"],
|
70
|
-
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
70
|
+
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
|
71
71
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
72
72
|
InputFormat.CSV: ["csv"],
|
73
|
-
InputFormat.XLSX: ["xlsx"],
|
73
|
+
InputFormat.XLSX: ["xlsx", "xlsm"],
|
74
74
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
75
75
|
InputFormat.JSON_DOCLING: ["json"],
|
76
76
|
}
|
@@ -232,7 +232,6 @@ class Page(BaseModel):
|
|
232
232
|
page_no: int
|
233
233
|
# page_hash: Optional[str] = None
|
234
234
|
size: Optional[Size] = None
|
235
|
-
cells: List[TextCell] = []
|
236
235
|
parsed_page: Optional[SegmentedPdfPage] = None
|
237
236
|
predictions: PagePredictions = PagePredictions()
|
238
237
|
assembled: Optional[AssembledUnit] = None
|
@@ -245,6 +244,14 @@ class Page(BaseModel):
|
|
245
244
|
float, Image
|
246
245
|
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
247
246
|
|
247
|
+
@property
|
248
|
+
def cells(self) -> List[TextCell]:
|
249
|
+
"""Return text cells as a read-only view of parsed_page.textline_cells."""
|
250
|
+
if self.parsed_page is not None:
|
251
|
+
return self.parsed_page.textline_cells
|
252
|
+
else:
|
253
|
+
return []
|
254
|
+
|
248
255
|
def get_image(
|
249
256
|
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
250
257
|
) -> Optional[Image]:
|
@@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
292
292
|
),
|
293
293
|
)
|
294
294
|
|
295
|
-
generate_parsed_pages:
|
295
|
+
generate_parsed_pages: Literal[True] = (
|
296
|
+
True # Always True since parsed_page is now mandatory
|
297
|
+
)
|
296
298
|
|
297
299
|
|
298
300
|
class PdfPipeline(str, Enum):
|
@@ -1,5 +1,5 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
from typing import Any, Dict, List, Literal
|
2
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
3
3
|
|
4
4
|
from pydantic import AnyUrl, BaseModel
|
5
5
|
from typing_extensions import deprecated
|
@@ -42,6 +42,7 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
42
42
|
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
43
43
|
response_format: ResponseFormat
|
44
44
|
|
45
|
+
torch_dtype: Optional[str] = None
|
45
46
|
supported_devices: List[AcceleratorDevice] = [
|
46
47
|
AcceleratorDevice.CPU,
|
47
48
|
AcceleratorDevice.CUDA,
|
docling/models/base_ocr_model.py
CHANGED
@@ -7,6 +7,7 @@ from typing import List, Optional, Type
|
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
10
|
+
from docling_core.types.doc.page import TextCell
|
10
11
|
from PIL import Image, ImageDraw
|
11
12
|
from rtree import index
|
12
13
|
from scipy.ndimage import binary_dilation, find_objects, label
|
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|
107
108
|
return []
|
108
109
|
|
109
110
|
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
110
|
-
def _filter_ocr_cells(
|
111
|
+
def _filter_ocr_cells(
|
112
|
+
self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
|
113
|
+
) -> List[TextCell]:
|
111
114
|
# Create R-tree index for programmatic cells
|
112
115
|
p = index.Property()
|
113
116
|
p.dimension = 2
|
@@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|
130
133
|
]
|
131
134
|
return filtered_ocr_cells
|
132
135
|
|
133
|
-
def post_process_cells(self, ocr_cells,
|
136
|
+
def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
|
134
137
|
r"""
|
135
|
-
Post-process the
|
138
|
+
Post-process the OCR cells and update the page object.
|
139
|
+
Updates parsed_page.textline_cells directly since page.cells is now read-only.
|
136
140
|
"""
|
141
|
+
# Get existing cells from the read-only property
|
142
|
+
existing_cells = page.cells
|
143
|
+
|
144
|
+
# Combine existing and OCR cells with overlap filtering
|
145
|
+
final_cells = self._combine_cells(existing_cells, ocr_cells)
|
146
|
+
|
147
|
+
assert page.parsed_page is not None
|
148
|
+
|
149
|
+
# Update parsed_page.textline_cells directly
|
150
|
+
page.parsed_page.textline_cells = final_cells
|
151
|
+
page.parsed_page.has_lines = len(final_cells) > 0
|
152
|
+
|
153
|
+
def _combine_cells(
|
154
|
+
self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
|
155
|
+
) -> List[TextCell]:
|
156
|
+
"""Combine existing and OCR cells with filtering and re-indexing."""
|
137
157
|
if self.options.force_full_page_ocr:
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
158
|
+
combined = ocr_cells
|
159
|
+
else:
|
160
|
+
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
|
161
|
+
combined = list(existing_cells) + filtered_ocr_cells
|
162
|
+
|
163
|
+
# Re-index in-place
|
164
|
+
for i, cell in enumerate(combined):
|
165
|
+
cell.index = i
|
166
|
+
|
167
|
+
return combined
|
146
168
|
|
147
169
|
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
148
170
|
image = copy.deepcopy(page.image)
|
docling/models/easyocr_model.py
CHANGED
@@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
|
|
177
177
|
all_ocr_cells.extend(cells)
|
178
178
|
|
179
179
|
# Post-process the cells
|
180
|
-
|
180
|
+
self.post_process_cells(all_ocr_cells, page)
|
181
181
|
|
182
182
|
# DEBUG code:
|
183
183
|
if settings.debug.visualize_ocr:
|