natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -4,454 +4,533 @@ Element Manager for natural-pdf.
|
|
4
4
|
This class handles the loading, creation, and management of PDF elements like
|
5
5
|
characters, words, rectangles, and lines extracted from a page.
|
6
6
|
"""
|
7
|
+
|
7
8
|
import logging
|
8
|
-
from typing import List, Dict, Any, Optional, Union, Tuple
|
9
|
-
from itertools import groupby
|
10
9
|
import re
|
10
|
+
from itertools import groupby
|
11
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
12
|
+
|
13
|
+
from pdfplumber.utils.text import WordExtractor
|
11
14
|
|
12
|
-
from natural_pdf.elements.text import TextElement
|
13
|
-
from natural_pdf.elements.rect import RectangleElement
|
14
15
|
from natural_pdf.elements.line import LineElement
|
16
|
+
from natural_pdf.elements.rect import RectangleElement
|
17
|
+
from natural_pdf.elements.text import TextElement
|
15
18
|
|
16
19
|
logger = logging.getLogger(__name__)
|
17
20
|
|
21
|
+
|
22
|
+
class NaturalWordExtractor(WordExtractor):
|
23
|
+
"""
|
24
|
+
Custom WordExtractor that splits words based on specified character attributes
|
25
|
+
in addition to pdfplumber's default spatial logic.
|
26
|
+
"""
|
27
|
+
|
28
|
+
def __init__(self, word_split_attributes: List[str], extra_attrs: List[str], *args, **kwargs):
|
29
|
+
"""
|
30
|
+
Initialize the extractor.
|
31
|
+
|
32
|
+
Args:
|
33
|
+
word_split_attributes: List of character attributes (keys in char dict)
|
34
|
+
that should trigger a word split if they differ
|
35
|
+
between adjacent characters.
|
36
|
+
extra_attrs: List of character attributes (keys in char dict)
|
37
|
+
to copy from the first char of a word into the
|
38
|
+
resulting word dictionary.
|
39
|
+
*args: Positional arguments passed to WordExtractor parent.
|
40
|
+
**kwargs: Keyword arguments passed to WordExtractor parent.
|
41
|
+
"""
|
42
|
+
self.word_split_attributes = word_split_attributes or []
|
43
|
+
# Remove our custom arg before passing to parent
|
44
|
+
# (Though WordExtractor likely ignores unknown kwargs)
|
45
|
+
# Ensure it's removed if it exists in kwargs
|
46
|
+
if "word_split_attributes" in kwargs:
|
47
|
+
del kwargs["word_split_attributes"]
|
48
|
+
# Pass extra_attrs to the parent constructor
|
49
|
+
kwargs["extra_attrs"] = extra_attrs
|
50
|
+
super().__init__(*args, **kwargs)
|
51
|
+
|
52
|
+
def char_begins_new_word(
|
53
|
+
self,
|
54
|
+
prev_char: Dict[str, Any],
|
55
|
+
curr_char: Dict[str, Any],
|
56
|
+
direction: str,
|
57
|
+
x_tolerance: float,
|
58
|
+
y_tolerance: float,
|
59
|
+
) -> bool:
|
60
|
+
"""
|
61
|
+
Determine if curr_char begins a new word, considering spatial and
|
62
|
+
attribute differences.
|
63
|
+
"""
|
64
|
+
# 1. Check pdfplumber's spatial logic first
|
65
|
+
spatial_split = super().char_begins_new_word(
|
66
|
+
prev_char, curr_char, direction, x_tolerance, y_tolerance
|
67
|
+
)
|
68
|
+
if spatial_split:
|
69
|
+
return True
|
70
|
+
|
71
|
+
# 2. Check for differences in specified attributes
|
72
|
+
if self.word_split_attributes:
|
73
|
+
for attr in self.word_split_attributes:
|
74
|
+
# Use .get() for safety, although _prepare_char_dicts should ensure presence
|
75
|
+
if prev_char.get(attr) != curr_char.get(attr):
|
76
|
+
logger.debug(
|
77
|
+
f"Splitting word due to attribute mismatch on '{attr}': {prev_char.get(attr)} != {curr_char.get(attr)}"
|
78
|
+
)
|
79
|
+
return True # Attribute mismatch forces a new word
|
80
|
+
|
81
|
+
# If both spatial and attribute checks pass, it's the same word
|
82
|
+
return False
|
83
|
+
|
84
|
+
|
18
85
|
class ElementManager:
|
19
86
|
"""
|
20
87
|
Manages the loading, creation, and retrieval of elements from a PDF page.
|
21
|
-
|
88
|
+
|
22
89
|
This class centralizes the element management functionality previously
|
23
90
|
contained in the Page class, providing better separation of concerns.
|
24
91
|
"""
|
25
|
-
|
92
|
+
|
26
93
|
def __init__(self, page, font_attrs=None):
|
27
94
|
"""
|
28
95
|
Initialize the ElementManager.
|
29
|
-
|
96
|
+
|
30
97
|
Args:
|
31
98
|
page: The parent Page object
|
32
99
|
font_attrs: Font attributes to consider when grouping characters into words.
|
33
|
-
Default: ['fontname', 'size'
|
100
|
+
Default: ['fontname', 'size', 'bold', 'italic']
|
34
101
|
None: Only consider spatial relationships
|
35
|
-
List: Custom attributes to consider
|
102
|
+
List: Custom attributes to consider
|
36
103
|
"""
|
37
104
|
self._page = page
|
38
105
|
self._elements = None # Lazy-loaded
|
39
|
-
# Default to
|
40
|
-
|
41
|
-
|
106
|
+
# Default to splitting by fontname, size, bold, italic if not specified
|
107
|
+
# Renamed internal variable for clarity
|
108
|
+
self._word_split_attributes = (
|
109
|
+
["fontname", "size", "bold", "italic"] if font_attrs is None else font_attrs
|
110
|
+
)
|
111
|
+
|
42
112
|
def load_elements(self):
|
43
113
|
"""
|
44
114
|
Load all elements from the page (lazy loading).
|
115
|
+
Uses NaturalWordExtractor for word grouping.
|
45
116
|
"""
|
46
|
-
if self._elements is None:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
#
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
words = self._process_line_into_words(line_chars, keep_spaces, font_attrs)
|
135
|
-
line_groups.extend(words)
|
136
|
-
|
137
|
-
return line_groups
|
138
|
-
|
139
|
-
def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
|
140
|
-
"""
|
141
|
-
Process a single line of characters into words.
|
142
|
-
|
143
|
-
Args:
|
144
|
-
line_chars: List of characters in the line
|
145
|
-
keep_spaces: Whether to keep spaces in words
|
146
|
-
font_attrs: Font attributes to consider for word breaks
|
147
|
-
|
148
|
-
Returns:
|
149
|
-
List of TextElement word objects for this line
|
150
|
-
"""
|
151
|
-
words = []
|
152
|
-
current_word = []
|
153
|
-
|
154
|
-
for i, char in enumerate(line_chars):
|
155
|
-
# Handle whitespace characters differently based on keep_spaces setting
|
156
|
-
if char['text'].isspace():
|
157
|
-
if keep_spaces:
|
158
|
-
# Include spaces in words when keep_spaces is enabled
|
159
|
-
if current_word:
|
160
|
-
current_word.append(char)
|
161
|
-
else:
|
162
|
-
# Skip leading spaces at the start of a line
|
163
|
-
continue
|
164
|
-
else:
|
165
|
-
# Original behavior: Skip whitespace and close current word
|
166
|
-
if current_word:
|
167
|
-
# Create word and add to words list
|
168
|
-
word = self._create_word_element(current_word, font_attrs)
|
169
|
-
words.append(word)
|
170
|
-
current_word = []
|
171
|
-
continue
|
172
|
-
|
173
|
-
# If this is a new word, start it
|
174
|
-
if not current_word:
|
175
|
-
current_word.append(char)
|
176
|
-
else:
|
177
|
-
# Check if this char is part of the current word or a new word
|
178
|
-
prev_char = current_word[-1]
|
179
|
-
|
180
|
-
# Check if font attributes match for this character
|
181
|
-
font_attrs_match = self._check_font_attributes_match(char, prev_char, font_attrs)
|
182
|
-
|
183
|
-
# If font attributes don't match, it's a new word
|
184
|
-
if not font_attrs_match:
|
185
|
-
# Complete current word
|
186
|
-
word = self._create_word_element(current_word, font_attrs)
|
187
|
-
words.append(word)
|
188
|
-
current_word = [char]
|
189
|
-
# If the gap between chars is larger than a threshold, it's a new word
|
190
|
-
# Use a wider threshold when keep_spaces is enabled to allow for natural spaces
|
191
|
-
elif char['x0'] - prev_char['x1'] > prev_char['width'] * (1.5 if keep_spaces else 0.5):
|
192
|
-
# Complete current word
|
193
|
-
word = self._create_word_element(current_word, font_attrs)
|
194
|
-
words.append(word)
|
195
|
-
current_word = [char]
|
196
|
-
else:
|
197
|
-
# Continue current word
|
198
|
-
current_word.append(char)
|
199
|
-
|
200
|
-
# Handle the last word if there is one
|
201
|
-
if current_word:
|
202
|
-
word = self._create_word_element(current_word, font_attrs)
|
203
|
-
words.append(word)
|
204
|
-
|
205
|
-
return words
|
206
|
-
|
207
|
-
def _check_font_attributes_match(self, char, prev_char, font_attrs):
|
117
|
+
if self._elements is not None:
|
118
|
+
return
|
119
|
+
|
120
|
+
logger.debug(f"Page {self._page.number}: Loading elements...")
|
121
|
+
|
122
|
+
# 1. Prepare character dictionaries (native + OCR) with necessary attributes
|
123
|
+
prepared_char_dicts = self._prepare_char_dicts()
|
124
|
+
logger.debug(
|
125
|
+
f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
|
126
|
+
)
|
127
|
+
|
128
|
+
# 2. Instantiate the custom word extractor
|
129
|
+
# Get config settings from the parent PDF or use defaults
|
130
|
+
pdf_config = getattr(self._page._parent, "_config", {})
|
131
|
+
xt = pdf_config.get("x_tolerance", 3)
|
132
|
+
yt = pdf_config.get("y_tolerance", 3)
|
133
|
+
use_flow = pdf_config.get("use_text_flow", False)
|
134
|
+
|
135
|
+
# Define which attributes to preserve on the merged word object
|
136
|
+
# Should include split attributes + any others needed for filtering (like color)
|
137
|
+
attributes_to_preserve = list(set(self._word_split_attributes + ["non_stroking_color"]))
|
138
|
+
|
139
|
+
# Pass our configured attributes for splitting
|
140
|
+
extractor = NaturalWordExtractor(
|
141
|
+
word_split_attributes=self._word_split_attributes,
|
142
|
+
extra_attrs=attributes_to_preserve,
|
143
|
+
x_tolerance=xt,
|
144
|
+
y_tolerance=yt,
|
145
|
+
keep_blank_chars=True,
|
146
|
+
use_text_flow=use_flow,
|
147
|
+
# Assuming default directions are okay, configure if needed
|
148
|
+
# line_dir=..., char_dir=...
|
149
|
+
)
|
150
|
+
|
151
|
+
# 3. Generate words using the extractor
|
152
|
+
generated_words = []
|
153
|
+
if prepared_char_dicts:
|
154
|
+
# Sort chars primarily by upright status, then page reading order
|
155
|
+
# Grouping by upright is crucial for WordExtractor's direction logic
|
156
|
+
sorted_chars_for_extraction = sorted(
|
157
|
+
prepared_char_dicts,
|
158
|
+
key=lambda c: (c.get("upright", True), round(c.get("top", 0)), c.get("x0", 0)),
|
159
|
+
)
|
160
|
+
|
161
|
+
word_tuples = extractor.iter_extract_tuples(sorted_chars_for_extraction)
|
162
|
+
for word_dict, char_list in word_tuples:
|
163
|
+
# Convert the generated word_dict to a TextElement
|
164
|
+
word_dict["_char_dicts"] = char_list
|
165
|
+
word_element = self._create_word_element(word_dict)
|
166
|
+
generated_words.append(word_element)
|
167
|
+
logger.debug(
|
168
|
+
f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
|
169
|
+
)
|
170
|
+
|
171
|
+
# 4. Load other elements (rects, lines)
|
172
|
+
rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
|
173
|
+
line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
|
174
|
+
logger.debug(
|
175
|
+
f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines."
|
176
|
+
)
|
177
|
+
|
178
|
+
# 5. Create the final elements dictionary
|
179
|
+
self._elements = {
|
180
|
+
# Store original char elements if needed (e.g., for visualization/debugging)
|
181
|
+
# We re-create them here from the prepared dicts
|
182
|
+
"chars": [TextElement(c_dict, self._page) for c_dict in prepared_char_dicts],
|
183
|
+
"words": generated_words,
|
184
|
+
"rects": rect_elements,
|
185
|
+
"lines": line_elements,
|
186
|
+
}
|
187
|
+
|
188
|
+
# Add regions if they exist
|
189
|
+
if hasattr(self._page, "_regions") and (
|
190
|
+
"detected" in self._page._regions or "named" in self._page._regions
|
191
|
+
):
|
192
|
+
regions = []
|
193
|
+
if "detected" in self._page._regions:
|
194
|
+
regions.extend(self._page._regions["detected"])
|
195
|
+
if "named" in self._page._regions:
|
196
|
+
regions.extend(self._page._regions["named"].values())
|
197
|
+
self._elements["regions"] = regions
|
198
|
+
logger.debug(f"Page {self._page.number}: Added {len(regions)} regions.")
|
199
|
+
else:
|
200
|
+
self._elements["regions"] = [] # Ensure key exists
|
201
|
+
|
202
|
+
logger.debug(f"Page {self._page.number}: Element loading complete.")
|
203
|
+
|
204
|
+
def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
|
208
205
|
"""
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
font_attrs: List of font attributes to check
|
215
|
-
|
206
|
+
Prepares a list of character dictionaries from native PDF characters,
|
207
|
+
augmenting them with necessary attributes like bold/italic flags.
|
208
|
+
This method focuses ONLY on native characters. OCR results are
|
209
|
+
handled separately by create_text_elements_from_ocr.
|
210
|
+
|
216
211
|
Returns:
|
217
|
-
|
212
|
+
List of augmented native character dictionaries.
|
218
213
|
"""
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
214
|
+
prepared_dicts = []
|
215
|
+
processed_native_ids = set() # To track processed native chars
|
216
|
+
|
217
|
+
# 1. Process Native PDF Characters
|
218
|
+
native_chars = self._page._page.chars or []
|
219
|
+
logger.debug(f"Page {self._page.number}: Preparing {len(native_chars)} native char dicts.")
|
220
|
+
for i, char_dict in enumerate(native_chars):
|
221
|
+
# Create a temporary TextElement for analysis ONLY
|
222
|
+
# We need to ensure the char_dict has necessary keys first
|
223
|
+
if not all(k in char_dict for k in ["x0", "top", "x1", "bottom", "text"]):
|
224
|
+
logger.warning(f"Skipping native char dict due to missing keys: {char_dict}")
|
225
|
+
continue
|
226
|
+
|
227
|
+
temp_element = TextElement(char_dict, self._page)
|
228
|
+
|
229
|
+
# Augment the original dictionary
|
230
|
+
augmented_dict = char_dict.copy() # Work on a copy
|
231
|
+
augmented_dict["bold"] = temp_element.bold
|
232
|
+
augmented_dict["italic"] = temp_element.italic
|
233
|
+
augmented_dict["source"] = "native"
|
234
|
+
# Copy color if it exists
|
235
|
+
if "non_stroking_color" in char_dict:
|
236
|
+
augmented_dict["non_stroking_color"] = char_dict["non_stroking_color"]
|
237
|
+
# Ensure basic required keys are present
|
238
|
+
augmented_dict.setdefault("upright", True)
|
239
|
+
augmented_dict.setdefault("fontname", "Unknown")
|
240
|
+
augmented_dict.setdefault("size", 0)
|
241
|
+
|
242
|
+
prepared_dicts.append(augmented_dict)
|
243
|
+
# Use a unique identifier if available (e.g., tuple of key properties)
|
244
|
+
# Simple approach: use index for now, assuming list order is stable here
|
245
|
+
processed_native_ids.add(i)
|
246
|
+
|
247
|
+
# 2. Remove OCR Processing from this method
|
248
|
+
# OCR results will be added later via create_text_elements_from_ocr
|
249
|
+
|
250
|
+
logger.debug(
|
251
|
+
f"Page {self._page.number}: Total prepared native char dicts: {len(prepared_dicts)}"
|
252
|
+
)
|
253
|
+
return prepared_dicts
|
254
|
+
|
255
|
+
def _create_word_element(self, word_dict: Dict[str, Any]) -> TextElement:
|
232
256
|
"""
|
233
|
-
Create a word
|
234
|
-
|
257
|
+
Create a TextElement (type 'word') from a word dictionary generated
|
258
|
+
by NaturalWordExtractor/pdfplumber.
|
259
|
+
|
235
260
|
Args:
|
236
|
-
|
237
|
-
|
238
|
-
|
261
|
+
word_dict: Dictionary representing the word, including geometry,
|
262
|
+
text, and attributes copied from the first char
|
263
|
+
(e.g., fontname, size, bold, italic).
|
264
|
+
|
239
265
|
Returns:
|
240
|
-
TextElement representing the word
|
266
|
+
TextElement representing the word.
|
241
267
|
"""
|
242
|
-
#
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
def create_text_elements_from_ocr(self, ocr_results, image_width=None, image_height=None):
|
268
|
+
# word_dict already contains calculated geometry (x0, top, x1, bottom, etc.)
|
269
|
+
# and text content. We just need to ensure our required fields exist
|
270
|
+
# and potentially set the source.
|
271
|
+
|
272
|
+
# Start with a copy of the word_dict
|
273
|
+
element_data = word_dict.copy()
|
274
|
+
|
275
|
+
# Ensure required TextElement fields are present or add defaults
|
276
|
+
element_data.setdefault("object_type", "word") # Set type to 'word'
|
277
|
+
element_data.setdefault("page_number", self._page.number)
|
278
|
+
# Determine source based on attributes present (e.g., if 'confidence' exists, it's likely OCR)
|
279
|
+
# This assumes the word_dict carries over some hint from its chars.
|
280
|
+
# A simpler approach: assume 'native' unless fontname is 'OCR'.
|
281
|
+
element_data.setdefault(
|
282
|
+
"source", "ocr" if element_data.get("fontname") == "OCR" else "native"
|
283
|
+
)
|
284
|
+
element_data.setdefault(
|
285
|
+
"confidence", 1.0 if element_data["source"] == "native" else 0.0
|
286
|
+
) # Default confidence
|
287
|
+
|
288
|
+
# Bold/italic should already be in word_dict if they were split attributes,
|
289
|
+
# copied from the first (representative) char by pdfplumber's merge_chars.
|
290
|
+
# Ensure they exist for TextElement initialization.
|
291
|
+
element_data.setdefault("bold", False)
|
292
|
+
element_data.setdefault("italic", False)
|
293
|
+
|
294
|
+
# Ensure fontname and size exist
|
295
|
+
element_data.setdefault("fontname", "Unknown")
|
296
|
+
element_data.setdefault("size", 0)
|
297
|
+
|
298
|
+
# Store the constituent char dicts (passed alongside word_dict from extractor)
|
299
|
+
# We need to modify the caller (load_elements) to pass this.
|
300
|
+
# For now, assume it might be passed in word_dict for placeholder.
|
301
|
+
element_data["_char_dicts"] = word_dict.get("_char_dicts", []) # Store char list
|
302
|
+
|
303
|
+
return TextElement(element_data, self._page)
|
304
|
+
|
305
|
+
def create_text_elements_from_ocr(self, ocr_results, scale_x=None, scale_y=None):
|
281
306
|
"""
|
282
|
-
Convert OCR results to TextElement objects
|
283
|
-
|
307
|
+
Convert OCR results to TextElement objects AND adds them to the manager's
|
308
|
+
'words' and 'chars' lists.
|
309
|
+
|
310
|
+
This method should be called AFTER initial elements (native) might have
|
311
|
+
been loaded, as it appends to the existing lists.
|
312
|
+
|
284
313
|
Args:
|
285
|
-
ocr_results: List of OCR results with text, bbox,
|
286
|
-
|
287
|
-
|
288
|
-
|
314
|
+
ocr_results: List of OCR results dictionaries with 'text', 'bbox', 'confidence'.
|
315
|
+
scale_x: Factor to convert image x-coordinates to PDF coordinates.
|
316
|
+
scale_y: Factor to convert image y-coordinates to PDF coordinates.
|
317
|
+
|
289
318
|
Returns:
|
290
|
-
List of created TextElement objects
|
319
|
+
List of created TextElement word objects that were added.
|
291
320
|
"""
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
321
|
+
added_word_elements = []
|
322
|
+
if self._elements is None:
|
323
|
+
# Trigger loading of native elements if not already done
|
324
|
+
logger.debug(
|
325
|
+
f"Page {self._page.number}: create_text_elements_from_ocr triggering initial load_elements."
|
326
|
+
)
|
327
|
+
self.load_elements()
|
328
|
+
|
329
|
+
# Ensure scales are valid numbers
|
330
|
+
scale_x = float(scale_x) if scale_x is not None else 1.0
|
331
|
+
scale_y = float(scale_y) if scale_y is not None else 1.0
|
332
|
+
|
333
|
+
logger.debug(
|
334
|
+
f"Page {self._page.number}: Adding {len(ocr_results)} OCR results as elements. Scale: x={scale_x:.2f}, y={scale_y:.2f}"
|
335
|
+
)
|
336
|
+
|
337
|
+
# Ensure the target lists exist in the _elements dict
|
338
|
+
if self._elements is None:
|
339
|
+
logger.error(
|
340
|
+
f"Page {self._page.number}: _elements dictionary is None after load_elements call in create_text_elements_from_ocr. Cannot add OCR elements."
|
341
|
+
)
|
342
|
+
return [] # Cannot proceed
|
343
|
+
|
344
|
+
if "words" not in self._elements:
|
345
|
+
self._elements["words"] = []
|
346
|
+
if "chars" not in self._elements:
|
347
|
+
self._elements["chars"] = []
|
348
|
+
|
303
349
|
for result in ocr_results:
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
350
|
+
try:
|
351
|
+
x0_img, top_img, x1_img, bottom_img = map(float, result["bbox"])
|
352
|
+
height_img = bottom_img - top_img
|
353
|
+
pdf_x0 = x0_img * scale_x
|
354
|
+
pdf_top = top_img * scale_y
|
355
|
+
pdf_x1 = x1_img * scale_x
|
356
|
+
pdf_bottom = bottom_img * scale_y
|
357
|
+
pdf_height = (bottom_img - top_img) * scale_y
|
358
|
+
|
359
|
+
# Create the TextElement for the word
|
360
|
+
word_element_data = {
|
361
|
+
"text": result["text"],
|
362
|
+
"x0": pdf_x0,
|
363
|
+
"top": pdf_top,
|
364
|
+
"x1": pdf_x1,
|
365
|
+
"bottom": pdf_bottom,
|
366
|
+
"width": (x1_img - x0_img) * scale_x,
|
367
|
+
"height": pdf_height,
|
368
|
+
"object_type": "word", # Treat OCR results as whole words
|
369
|
+
"source": "ocr",
|
370
|
+
"confidence": float(result.get("confidence", 0.0)),
|
371
|
+
"fontname": "OCR", # Use consistent OCR fontname
|
372
|
+
"size": (
|
373
|
+
round(pdf_height) if pdf_height > 0 else 10.0
|
374
|
+
), # Use calculated PDF height for size
|
375
|
+
"page_number": self._page.number,
|
376
|
+
"bold": False,
|
377
|
+
"italic": False,
|
378
|
+
"upright": True,
|
379
|
+
"doctop": pdf_top + self._page._page.initial_doctop,
|
380
|
+
}
|
381
|
+
|
382
|
+
# Create the representative char dict for this OCR word
|
383
|
+
ocr_char_dict = word_element_data.copy()
|
384
|
+
ocr_char_dict["object_type"] = "char"
|
385
|
+
ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
|
386
|
+
|
387
|
+
# Add the char dict list to the word data before creating TextElement
|
388
|
+
word_element_data["_char_dicts"] = [ocr_char_dict]
|
389
|
+
|
390
|
+
word_elem = TextElement(word_element_data, self._page)
|
391
|
+
added_word_elements.append(word_elem)
|
392
|
+
|
393
|
+
# Append the word element to the manager's list
|
394
|
+
self._elements["words"].append(word_elem)
|
395
|
+
|
396
|
+
# Also create and append a representative character dictionary
|
397
|
+
# for consistency if someone iterates through manager.chars later.
|
398
|
+
# This char dict represents the entire OCR word as a single 'char'.
|
399
|
+
char_dict_data = ocr_char_dict # Use the one we already created
|
400
|
+
char_dict_data["object_type"] = "char" # Mark as char type
|
401
|
+
# pdfplumber char dicts don't typically have width/height/doctop,
|
402
|
+
# but keeping them won't hurt WordExtractor if it encounters them.
|
403
|
+
char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
|
404
|
+
|
405
|
+
self._elements["chars"].append(char_dict_data) # Append the dictionary
|
406
|
+
|
407
|
+
except (KeyError, ValueError, TypeError) as e:
|
408
|
+
logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)
|
409
|
+
continue
|
410
|
+
|
411
|
+
logger.info(
|
412
|
+
f"Page {self._page.number}: Appended {len(added_word_elements)} TextElements (words) and corresponding char dicts from OCR results."
|
413
|
+
)
|
414
|
+
return added_word_elements
|
415
|
+
|
416
|
+
def add_element(self, element, element_type="words"):
|
342
417
|
"""
|
343
418
|
Add an element to the managed elements.
|
344
|
-
|
419
|
+
|
345
420
|
Args:
|
346
421
|
element: The element to add
|
347
422
|
element_type: The type of element ('words', 'chars', etc.)
|
348
|
-
|
423
|
+
|
349
424
|
Returns:
|
350
425
|
True if added successfully, False otherwise
|
351
426
|
"""
|
352
427
|
# Load elements if not already loaded
|
353
428
|
self.load_elements()
|
354
|
-
|
429
|
+
|
355
430
|
# Add to the appropriate list
|
356
431
|
if element_type in self._elements:
|
357
|
-
|
358
|
-
|
359
|
-
|
432
|
+
# Avoid adding duplicates
|
433
|
+
if element not in self._elements[element_type]:
|
434
|
+
self._elements[element_type].append(element)
|
435
|
+
return True
|
436
|
+
else:
|
437
|
+
# logger.debug(f"Element already exists in {element_type}: {element}")
|
438
|
+
return False # Indicate it wasn't newly added
|
439
|
+
|
360
440
|
return False
|
361
|
-
|
441
|
+
|
362
442
|
def add_region(self, region, name=None):
|
363
443
|
"""
|
364
444
|
Add a region to the managed elements.
|
365
|
-
|
445
|
+
|
366
446
|
Args:
|
367
447
|
region: The region to add
|
368
448
|
name: Optional name for the region
|
369
|
-
|
449
|
+
|
370
450
|
Returns:
|
371
451
|
True if added successfully, False otherwise
|
372
452
|
"""
|
373
453
|
# Load elements if not already loaded
|
374
454
|
self.load_elements()
|
375
|
-
|
455
|
+
|
376
456
|
# Make sure regions is in _elements
|
377
|
-
if
|
378
|
-
self._elements[
|
379
|
-
|
457
|
+
if "regions" not in self._elements:
|
458
|
+
self._elements["regions"] = []
|
459
|
+
|
380
460
|
# Add to elements for selector queries
|
381
|
-
if region not in self._elements[
|
382
|
-
self._elements[
|
461
|
+
if region not in self._elements["regions"]:
|
462
|
+
self._elements["regions"].append(region)
|
383
463
|
return True
|
384
|
-
|
464
|
+
|
385
465
|
return False
|
386
|
-
|
466
|
+
|
387
467
|
def get_elements(self, element_type=None):
|
388
468
|
"""
|
389
469
|
Get all elements of the specified type, or all elements if type is None.
|
390
|
-
|
470
|
+
|
391
471
|
Args:
|
392
|
-
element_type: Optional element type ('words', 'chars', 'rects', 'lines', etc.)
|
393
|
-
|
472
|
+
element_type: Optional element type ('words', 'chars', 'rects', 'lines', 'regions' etc.)
|
473
|
+
|
394
474
|
Returns:
|
395
475
|
List of elements
|
396
476
|
"""
|
397
477
|
# Load elements if not already loaded
|
398
478
|
self.load_elements()
|
399
|
-
|
479
|
+
|
400
480
|
if element_type:
|
401
481
|
return self._elements.get(element_type, [])
|
402
|
-
|
482
|
+
|
403
483
|
# Combine all element types
|
404
484
|
all_elements = []
|
405
485
|
for elements in self._elements.values():
|
406
486
|
all_elements.extend(elements)
|
407
|
-
|
487
|
+
|
408
488
|
return all_elements
|
409
|
-
|
489
|
+
|
410
490
|
def get_all_elements(self):
|
411
491
|
"""
|
412
492
|
Get all elements from all types.
|
413
|
-
|
493
|
+
|
414
494
|
Returns:
|
415
495
|
List of all elements
|
416
496
|
"""
|
417
497
|
# Load elements if not already loaded
|
418
498
|
self.load_elements()
|
419
|
-
|
499
|
+
|
420
500
|
# Combine all element types
|
421
501
|
all_elements = []
|
422
|
-
|
423
|
-
|
424
|
-
|
502
|
+
if self._elements: # Ensure _elements is not None
|
503
|
+
for elements in self._elements.values():
|
504
|
+
if isinstance(elements, list): # Ensure we only extend lists
|
505
|
+
all_elements.extend(elements)
|
425
506
|
return all_elements
|
426
|
-
|
507
|
+
|
427
508
|
@property
|
428
509
|
def chars(self):
|
429
510
|
"""Get all character elements."""
|
430
511
|
self.load_elements()
|
431
|
-
return self._elements[
|
432
|
-
|
512
|
+
return self._elements.get("chars", [])
|
513
|
+
|
433
514
|
@property
|
434
515
|
def words(self):
|
435
516
|
"""Get all word elements."""
|
436
517
|
self.load_elements()
|
437
|
-
return self._elements[
|
438
|
-
|
518
|
+
return self._elements.get("words", [])
|
519
|
+
|
439
520
|
@property
|
440
521
|
def rects(self):
|
441
522
|
"""Get all rectangle elements."""
|
442
523
|
self.load_elements()
|
443
|
-
return self._elements[
|
444
|
-
|
524
|
+
return self._elements.get("rects", [])
|
525
|
+
|
445
526
|
@property
|
446
527
|
def lines(self):
|
447
528
|
"""Get all line elements."""
|
448
529
|
self.load_elements()
|
449
|
-
return self._elements[
|
450
|
-
|
530
|
+
return self._elements.get("lines", [])
|
531
|
+
|
451
532
|
@property
|
452
533
|
def regions(self):
|
453
534
|
"""Get all region elements."""
|
454
535
|
self.load_elements()
|
455
|
-
|
456
|
-
self._elements['regions'] = []
|
457
|
-
return self._elements['regions']
|
536
|
+
return self._elements.get("regions", [])
|