docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +39 -18
- docling/backend/docling_parse_backend.py +61 -59
- docling/backend/docling_parse_v2_backend.py +72 -62
- docling/backend/docling_parse_v4_backend.py +21 -19
- docling/backend/md_backend.py +101 -81
- docling/backend/mspowerpoint_backend.py +72 -113
- docling/backend/msword_backend.py +99 -80
- docling/backend/noop_backend.py +51 -0
- docling/backend/pypdfium2_backend.py +127 -53
- docling/cli/main.py +82 -14
- docling/datamodel/asr_model_specs.py +92 -0
- docling/datamodel/base_models.py +21 -4
- docling/datamodel/document.py +3 -1
- docling/datamodel/pipeline_options.py +15 -2
- docling/datamodel/pipeline_options_asr_model.py +57 -0
- docling/datamodel/pipeline_options_vlm_model.py +4 -4
- docling/document_converter.py +8 -0
- docling/models/api_vlm_model.py +3 -1
- docling/models/base_model.py +1 -1
- docling/models/base_ocr_model.py +33 -11
- docling/models/easyocr_model.py +1 -1
- docling/models/layout_model.py +2 -3
- docling/models/ocr_mac_model.py +1 -1
- docling/models/page_preprocessing_model.py +3 -6
- docling/models/rapid_ocr_model.py +1 -1
- docling/models/readingorder_model.py +3 -3
- docling/models/tesseract_ocr_cli_model.py +4 -3
- docling/models/tesseract_ocr_model.py +1 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
- docling/models/vlm_models_inline/mlx_model.py +3 -1
- docling/pipeline/asr_pipeline.py +253 -0
- docling/pipeline/base_pipeline.py +11 -0
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/utils/layout_postprocessor.py +11 -6
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
|
14
14
|
TableCell,
|
15
15
|
TableData,
|
16
16
|
)
|
17
|
-
from docling_core.types.doc.document import Formatting
|
17
|
+
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
18
18
|
from docx import Document
|
19
19
|
from docx.document import Document as DocxDocument
|
20
20
|
from docx.oxml.table import CT_Tc
|
@@ -60,8 +60,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
60
60
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
61
61
|
# Track processed textbox elements to avoid duplication
|
62
62
|
self.processed_textbox_elements: List[int] = []
|
63
|
-
# Track content hash of processed paragraphs to avoid duplicate content
|
64
|
-
self.processed_paragraph_content: List[str] = []
|
65
63
|
|
66
64
|
for i in range(-1, self.max_levels):
|
67
65
|
self.parents[i] = None
|
@@ -86,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
86
84
|
self.valid = True
|
87
85
|
except Exception as e:
|
88
86
|
raise RuntimeError(
|
89
|
-
f"
|
87
|
+
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
90
88
|
) from e
|
91
89
|
|
92
90
|
@override
|
@@ -253,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
253
251
|
self._handle_tables(element, docx_obj, doc)
|
254
252
|
except Exception:
|
255
253
|
_log.debug("could not parse a table, broken docx table")
|
256
|
-
|
254
|
+
# Check for Image
|
257
255
|
elif drawing_blip:
|
258
256
|
self._handle_pictures(docx_obj, drawing_blip, doc)
|
257
|
+
# Check for Text after the Image
|
258
|
+
if (
|
259
|
+
tag_name in ["p"]
|
260
|
+
and element.find(".//w:t", namespaces=namespaces) is not None
|
261
|
+
):
|
262
|
+
self._handle_text_elements(element, docx_obj, doc)
|
259
263
|
# Check for the sdt containers, like table of contents
|
260
264
|
elif tag_name in ["sdt"]:
|
261
265
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
@@ -270,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
270
274
|
self._handle_text_elements(element, docx_obj, doc)
|
271
275
|
else:
|
272
276
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
277
|
+
|
273
278
|
return doc
|
274
279
|
|
275
280
|
def _str_to_int(
|
@@ -580,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
580
585
|
all_paragraphs = []
|
581
586
|
|
582
587
|
# Sort paragraphs within each container, then process containers
|
583
|
-
for
|
588
|
+
for paragraphs in container_paragraphs.values():
|
584
589
|
# Sort by vertical position within each container
|
585
590
|
sorted_container_paragraphs = sorted(
|
586
591
|
paragraphs,
|
@@ -593,9 +598,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
593
598
|
# Add the sorted paragraphs to our processing list
|
594
599
|
all_paragraphs.extend(sorted_container_paragraphs)
|
595
600
|
|
601
|
+
# Track processed paragraphs to avoid duplicates (same content and position)
|
602
|
+
processed_paragraphs = set()
|
603
|
+
|
596
604
|
# Process all the paragraphs
|
597
|
-
for p,
|
598
|
-
|
605
|
+
for p, position in all_paragraphs:
|
606
|
+
# Create paragraph object to get text content
|
607
|
+
paragraph = Paragraph(p, docx_obj)
|
608
|
+
text_content = paragraph.text
|
609
|
+
|
610
|
+
# Create a unique identifier based on content and position
|
611
|
+
paragraph_id = (text_content, position)
|
612
|
+
|
613
|
+
# Skip if this paragraph (same content and position) was already processed
|
614
|
+
if paragraph_id in processed_paragraphs:
|
615
|
+
_log.debug(
|
616
|
+
f"Skipping duplicate paragraph: content='{text_content[:50]}...', position={position}"
|
617
|
+
)
|
618
|
+
continue
|
619
|
+
|
620
|
+
# Mark this paragraph as processed
|
621
|
+
processed_paragraphs.add(paragraph_id)
|
622
|
+
|
623
|
+
self._handle_text_elements(p, docx_obj, doc)
|
599
624
|
|
600
625
|
# Restore original parent
|
601
626
|
self.parents[level] = original_parent
|
@@ -669,26 +694,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
669
694
|
element: BaseOxmlElement,
|
670
695
|
docx_obj: DocxDocument,
|
671
696
|
doc: DoclingDocument,
|
672
|
-
is_from_textbox: bool = False,
|
673
697
|
) -> None:
|
674
698
|
paragraph = Paragraph(element, docx_obj)
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
if is_from_textbox and raw_text:
|
680
|
-
# Create a simple hash of content to detect duplicates
|
681
|
-
content_hash = f"{len(raw_text)}:{raw_text[:50]}"
|
682
|
-
if content_hash in self.processed_paragraph_content:
|
683
|
-
_log.debug(f"Skipping duplicate paragraph content: {content_hash}")
|
684
|
-
return
|
685
|
-
self.processed_paragraph_content.append(content_hash)
|
686
|
-
|
687
|
-
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
699
|
+
paragraph_elements = self._get_paragraph_elements(paragraph)
|
700
|
+
text, equations = self._handle_equations_in_text(
|
701
|
+
element=element, text=paragraph.text
|
702
|
+
)
|
688
703
|
|
689
704
|
if text is None:
|
690
705
|
return
|
691
|
-
paragraph_elements = self._get_paragraph_elements(paragraph)
|
692
706
|
text = text.strip()
|
693
707
|
|
694
708
|
# Common styles for bullet and numbered lists.
|
@@ -750,7 +764,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
750
764
|
self._add_header(doc, p_level, text, is_numbered_style)
|
751
765
|
|
752
766
|
elif len(equations) > 0:
|
753
|
-
if (
|
767
|
+
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
|
768
|
+
text
|
769
|
+
) > 0:
|
754
770
|
# Standalone equation
|
755
771
|
level = self._get_level()
|
756
772
|
doc.add_text(
|
@@ -902,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
902
918
|
)
|
903
919
|
return
|
904
920
|
|
921
|
+
def _add_formatted_list_item(
|
922
|
+
self,
|
923
|
+
doc: DoclingDocument,
|
924
|
+
elements: list,
|
925
|
+
marker: str,
|
926
|
+
enumerated: bool,
|
927
|
+
level: int,
|
928
|
+
) -> None:
|
929
|
+
# This should not happen by construction
|
930
|
+
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
931
|
+
return
|
932
|
+
if len(elements) == 1:
|
933
|
+
text, format, hyperlink = elements[0]
|
934
|
+
doc.add_list_item(
|
935
|
+
marker=marker,
|
936
|
+
enumerated=enumerated,
|
937
|
+
parent=self.parents[level],
|
938
|
+
text=text,
|
939
|
+
formatting=format,
|
940
|
+
hyperlink=hyperlink,
|
941
|
+
)
|
942
|
+
else:
|
943
|
+
new_item = doc.add_list_item(
|
944
|
+
marker=marker,
|
945
|
+
enumerated=enumerated,
|
946
|
+
parent=self.parents[level],
|
947
|
+
text="",
|
948
|
+
)
|
949
|
+
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
950
|
+
for text, format, hyperlink in elements:
|
951
|
+
doc.add_text(
|
952
|
+
label=DocItemLabel.TEXT,
|
953
|
+
parent=new_parent,
|
954
|
+
text=text,
|
955
|
+
formatting=format,
|
956
|
+
hyperlink=hyperlink,
|
957
|
+
)
|
958
|
+
|
905
959
|
def _add_list_item(
|
906
960
|
self,
|
907
961
|
*,
|
@@ -911,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
911
965
|
elements: list,
|
912
966
|
is_numbered: bool = False,
|
913
967
|
) -> None:
|
968
|
+
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
969
|
+
if not elements:
|
970
|
+
return None
|
914
971
|
enum_marker = ""
|
915
972
|
|
916
973
|
level = self._get_level()
|
@@ -927,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
927
984
|
if is_numbered:
|
928
985
|
enum_marker = str(self.listIter) + "."
|
929
986
|
is_numbered = True
|
930
|
-
|
931
|
-
doc
|
932
|
-
prev_parent=self.parents[level],
|
933
|
-
paragraph_elements=elements,
|
987
|
+
self._add_formatted_list_item(
|
988
|
+
doc, elements, enum_marker, is_numbered, level
|
934
989
|
)
|
935
|
-
for text, format, hyperlink in elements:
|
936
|
-
doc.add_list_item(
|
937
|
-
marker=enum_marker,
|
938
|
-
enumerated=is_numbered,
|
939
|
-
parent=new_parent,
|
940
|
-
text=text,
|
941
|
-
formatting=format,
|
942
|
-
hyperlink=hyperlink,
|
943
|
-
)
|
944
|
-
|
945
990
|
elif (
|
946
991
|
self._prev_numid() == numid
|
947
992
|
and self.level_at_new_list is not None
|
@@ -971,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
971
1016
|
if is_numbered:
|
972
1017
|
enum_marker = str(self.listIter) + "."
|
973
1018
|
is_numbered = True
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
1019
|
+
self._add_formatted_list_item(
|
1020
|
+
doc,
|
1021
|
+
elements,
|
1022
|
+
enum_marker,
|
1023
|
+
is_numbered,
|
1024
|
+
self.level_at_new_list + ilevel,
|
979
1025
|
)
|
980
|
-
for text, format, hyperlink in elements:
|
981
|
-
doc.add_list_item(
|
982
|
-
marker=enum_marker,
|
983
|
-
enumerated=is_numbered,
|
984
|
-
parent=new_parent,
|
985
|
-
text=text,
|
986
|
-
formatting=format,
|
987
|
-
hyperlink=hyperlink,
|
988
|
-
)
|
989
1026
|
elif (
|
990
1027
|
self._prev_numid() == numid
|
991
1028
|
and self.level_at_new_list is not None
|
992
1029
|
and prev_indent is not None
|
993
1030
|
and ilevel < prev_indent
|
994
1031
|
): # Close list
|
995
|
-
for k
|
1032
|
+
for k in self.parents:
|
996
1033
|
if k > self.level_at_new_list + ilevel:
|
997
1034
|
self.parents[k] = None
|
998
1035
|
|
@@ -1001,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1001
1038
|
if is_numbered:
|
1002
1039
|
enum_marker = str(self.listIter) + "."
|
1003
1040
|
is_numbered = True
|
1004
|
-
|
1005
|
-
doc
|
1006
|
-
|
1007
|
-
|
1041
|
+
self._add_formatted_list_item(
|
1042
|
+
doc,
|
1043
|
+
elements,
|
1044
|
+
enum_marker,
|
1045
|
+
is_numbered,
|
1046
|
+
self.level_at_new_list + ilevel,
|
1008
1047
|
)
|
1009
|
-
for text, format, hyperlink in elements:
|
1010
|
-
doc.add_list_item(
|
1011
|
-
marker=enum_marker,
|
1012
|
-
enumerated=is_numbered,
|
1013
|
-
parent=new_parent,
|
1014
|
-
text=text,
|
1015
|
-
formatting=format,
|
1016
|
-
hyperlink=hyperlink,
|
1017
|
-
)
|
1018
1048
|
self.listIter = 0
|
1019
1049
|
|
1020
1050
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
@@ -1023,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1023
1053
|
if is_numbered:
|
1024
1054
|
enum_marker = str(self.listIter) + "."
|
1025
1055
|
is_numbered = True
|
1026
|
-
|
1027
|
-
doc
|
1028
|
-
prev_parent=self.parents[level - 1],
|
1029
|
-
paragraph_elements=elements,
|
1056
|
+
self._add_formatted_list_item(
|
1057
|
+
doc, elements, enum_marker, is_numbered, level - 1
|
1030
1058
|
)
|
1031
|
-
|
1032
|
-
# Add the list item to the parent group
|
1033
|
-
doc.add_list_item(
|
1034
|
-
marker=enum_marker,
|
1035
|
-
enumerated=is_numbered,
|
1036
|
-
parent=new_parent,
|
1037
|
-
text=text,
|
1038
|
-
formatting=format,
|
1039
|
-
hyperlink=hyperlink,
|
1040
|
-
)
|
1059
|
+
|
1041
1060
|
return
|
1042
1061
|
|
1043
1062
|
def _handle_tables(
|
@@ -0,0 +1,51 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Set, Union
|
5
|
+
|
6
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
7
|
+
from docling.datamodel.base_models import InputFormat
|
8
|
+
from docling.datamodel.document import InputDocument
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class NoOpBackend(AbstractDocumentBackend):
|
14
|
+
"""
|
15
|
+
A no-op backend that only validates input existence.
|
16
|
+
Used e.g. for audio files where actual processing is handled by the ASR pipeline.
|
17
|
+
"""
|
18
|
+
|
19
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
20
|
+
super().__init__(in_doc, path_or_stream)
|
21
|
+
|
22
|
+
_log.debug(f"NoOpBackend initialized for: {path_or_stream}")
|
23
|
+
|
24
|
+
# Validate input
|
25
|
+
try:
|
26
|
+
if isinstance(self.path_or_stream, BytesIO):
|
27
|
+
# Check if stream has content
|
28
|
+
self.valid = len(self.path_or_stream.getvalue()) > 0
|
29
|
+
_log.debug(
|
30
|
+
f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
|
31
|
+
)
|
32
|
+
elif isinstance(self.path_or_stream, Path):
|
33
|
+
# Check if file exists
|
34
|
+
self.valid = self.path_or_stream.exists()
|
35
|
+
_log.debug(f"File exists: {self.valid}")
|
36
|
+
else:
|
37
|
+
self.valid = False
|
38
|
+
except Exception as e:
|
39
|
+
_log.error(f"NoOpBackend validation failed: {e}")
|
40
|
+
self.valid = False
|
41
|
+
|
42
|
+
def is_valid(self) -> bool:
|
43
|
+
return self.valid
|
44
|
+
|
45
|
+
@classmethod
|
46
|
+
def supports_pagination(cls) -> bool:
|
47
|
+
return False
|
48
|
+
|
49
|
+
@classmethod
|
50
|
+
def supported_formats(cls) -> Set[InputFormat]:
|
51
|
+
return set(InputFormat)
|
@@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
|
|
8
8
|
import pypdfium2 as pdfium
|
9
9
|
import pypdfium2.raw as pdfium_c
|
10
10
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
11
|
-
from docling_core.types.doc.page import
|
11
|
+
from docling_core.types.doc.page import (
|
12
|
+
BoundingRectangle,
|
13
|
+
PdfPageBoundaryType,
|
14
|
+
PdfPageGeometry,
|
15
|
+
SegmentedPdfPage,
|
16
|
+
TextCell,
|
17
|
+
)
|
12
18
|
from PIL import Image, ImageDraw
|
13
19
|
from pypdfium2 import PdfTextPage
|
14
20
|
from pypdfium2._helpers.misc import PdfiumError
|
@@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
|
|
16
22
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
17
23
|
from docling.utils.locks import pypdfium2_lock
|
18
24
|
|
25
|
+
|
26
|
+
def get_pdf_page_geometry(
|
27
|
+
ppage: pdfium.PdfPage,
|
28
|
+
angle: float = 0.0,
|
29
|
+
boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
|
30
|
+
) -> PdfPageGeometry:
|
31
|
+
"""
|
32
|
+
Create PdfPageGeometry from a pypdfium2 PdfPage object.
|
33
|
+
|
34
|
+
Args:
|
35
|
+
ppage: pypdfium2 PdfPage object
|
36
|
+
angle: Page rotation angle in degrees (default: 0.0)
|
37
|
+
boundary_type: The boundary type for the page (default: CROP_BOX)
|
38
|
+
|
39
|
+
Returns:
|
40
|
+
PdfPageGeometry with all the different bounding boxes properly set
|
41
|
+
"""
|
42
|
+
with pypdfium2_lock:
|
43
|
+
# Get the main bounding box (intersection of crop_box and media_box)
|
44
|
+
bbox_tuple = ppage.get_bbox()
|
45
|
+
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
|
46
|
+
|
47
|
+
# Get all the different page boxes from pypdfium2
|
48
|
+
media_box_tuple = ppage.get_mediabox()
|
49
|
+
crop_box_tuple = ppage.get_cropbox()
|
50
|
+
art_box_tuple = ppage.get_artbox()
|
51
|
+
bleed_box_tuple = ppage.get_bleedbox()
|
52
|
+
trim_box_tuple = ppage.get_trimbox()
|
53
|
+
|
54
|
+
# Convert to BoundingBox objects using existing from_tuple method
|
55
|
+
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
|
56
|
+
# Use bbox as fallback when specific box types are not defined
|
57
|
+
media_bbox = (
|
58
|
+
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
|
59
|
+
if media_box_tuple
|
60
|
+
else bbox
|
61
|
+
)
|
62
|
+
crop_bbox = (
|
63
|
+
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
|
64
|
+
if crop_box_tuple
|
65
|
+
else bbox
|
66
|
+
)
|
67
|
+
art_bbox = (
|
68
|
+
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
|
69
|
+
if art_box_tuple
|
70
|
+
else bbox
|
71
|
+
)
|
72
|
+
bleed_bbox = (
|
73
|
+
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
|
74
|
+
if bleed_box_tuple
|
75
|
+
else bbox
|
76
|
+
)
|
77
|
+
trim_bbox = (
|
78
|
+
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
|
79
|
+
if trim_box_tuple
|
80
|
+
else bbox
|
81
|
+
)
|
82
|
+
|
83
|
+
return PdfPageGeometry(
|
84
|
+
angle=angle,
|
85
|
+
rect=BoundingRectangle.from_bounding_box(bbox),
|
86
|
+
boundary_type=boundary_type,
|
87
|
+
art_bbox=art_bbox,
|
88
|
+
bleed_bbox=bleed_bbox,
|
89
|
+
crop_bbox=crop_bbox,
|
90
|
+
media_bbox=media_bbox,
|
91
|
+
trim_bbox=trim_bbox,
|
92
|
+
)
|
93
|
+
|
94
|
+
|
19
95
|
if TYPE_CHECKING:
|
20
96
|
from docling.datamodel.document import InputDocument
|
21
97
|
|
@@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
41
117
|
def is_valid(self) -> bool:
|
42
118
|
return self.valid
|
43
119
|
|
44
|
-
def
|
45
|
-
|
46
|
-
page_size = self.get_size()
|
47
|
-
with pypdfium2_lock:
|
48
|
-
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
49
|
-
pos = obj.get_pos()
|
50
|
-
cropbox = BoundingBox.from_tuple(
|
51
|
-
pos, origin=CoordOrigin.BOTTOMLEFT
|
52
|
-
).to_top_left_origin(page_height=page_size.height)
|
53
|
-
|
54
|
-
if cropbox.area() > AREA_THRESHOLD:
|
55
|
-
cropbox = cropbox.scaled(scale=scale)
|
56
|
-
|
57
|
-
yield cropbox
|
58
|
-
|
59
|
-
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
60
|
-
with pypdfium2_lock:
|
61
|
-
if not self.text_page:
|
62
|
-
self.text_page = self._ppage.get_textpage()
|
63
|
-
|
64
|
-
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
65
|
-
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
66
|
-
|
67
|
-
with pypdfium2_lock:
|
68
|
-
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
69
|
-
|
70
|
-
return text_piece
|
71
|
-
|
72
|
-
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
73
|
-
return None
|
74
|
-
|
75
|
-
def get_text_cells(self) -> Iterable[TextCell]:
|
120
|
+
def _compute_text_cells(self) -> List[TextCell]:
|
121
|
+
"""Compute text cells from pypdfium."""
|
76
122
|
with pypdfium2_lock:
|
77
123
|
if not self.text_page:
|
78
124
|
self.text_page = self._ppage.get_textpage()
|
@@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
203
249
|
|
204
250
|
return merged_cells
|
205
251
|
|
206
|
-
|
207
|
-
image = (
|
208
|
-
self.get_page_image()
|
209
|
-
) # make new image to avoid drawing on the saved ones
|
210
|
-
draw = ImageDraw.Draw(image)
|
211
|
-
for c in cells:
|
212
|
-
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
213
|
-
cell_color = (
|
214
|
-
random.randint(30, 140),
|
215
|
-
random.randint(30, 140),
|
216
|
-
random.randint(30, 140),
|
217
|
-
)
|
218
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
219
|
-
image.show()
|
252
|
+
return merge_horizontal_cells(cells)
|
220
253
|
|
221
|
-
|
222
|
-
#
|
254
|
+
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
255
|
+
AREA_THRESHOLD = 0 # 32 * 32
|
256
|
+
page_size = self.get_size()
|
257
|
+
with pypdfium2_lock:
|
258
|
+
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
259
|
+
pos = obj.get_pos()
|
260
|
+
cropbox = BoundingBox.from_tuple(
|
261
|
+
pos, origin=CoordOrigin.BOTTOMLEFT
|
262
|
+
).to_top_left_origin(page_height=page_size.height)
|
223
263
|
|
224
|
-
|
264
|
+
if cropbox.area() > AREA_THRESHOLD:
|
265
|
+
cropbox = cropbox.scaled(scale=scale)
|
225
266
|
|
226
|
-
|
227
|
-
# draw_clusters_and_cells()
|
267
|
+
yield cropbox
|
228
268
|
|
229
|
-
|
269
|
+
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
270
|
+
with pypdfium2_lock:
|
271
|
+
if not self.text_page:
|
272
|
+
self.text_page = self._ppage.get_textpage()
|
273
|
+
|
274
|
+
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
275
|
+
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
276
|
+
|
277
|
+
with pypdfium2_lock:
|
278
|
+
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
279
|
+
|
280
|
+
return text_piece
|
281
|
+
|
282
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
283
|
+
if not self.valid:
|
284
|
+
return None
|
285
|
+
|
286
|
+
text_cells = self._compute_text_cells()
|
287
|
+
|
288
|
+
# Get the PDF page geometry from pypdfium2
|
289
|
+
dimension = get_pdf_page_geometry(self._ppage)
|
290
|
+
|
291
|
+
# Create SegmentedPdfPage
|
292
|
+
return SegmentedPdfPage(
|
293
|
+
dimension=dimension,
|
294
|
+
textline_cells=text_cells,
|
295
|
+
char_cells=[],
|
296
|
+
word_cells=[],
|
297
|
+
has_textlines=len(text_cells) > 0,
|
298
|
+
has_words=False,
|
299
|
+
has_chars=False,
|
300
|
+
)
|
301
|
+
|
302
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
303
|
+
return self._compute_text_cells()
|
230
304
|
|
231
305
|
def get_page_image(
|
232
306
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|