natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,457 @@
|
|
1
|
+
"""
|
2
|
+
Element Manager for natural-pdf.
|
3
|
+
|
4
|
+
This class handles the loading, creation, and management of PDF elements like
|
5
|
+
characters, words, rectangles, and lines extracted from a page.
|
6
|
+
"""
|
7
|
+
import logging
|
8
|
+
from typing import List, Dict, Any, Optional, Union, Tuple
|
9
|
+
from itertools import groupby
|
10
|
+
import re
|
11
|
+
|
12
|
+
from natural_pdf.elements.text import TextElement
|
13
|
+
from natural_pdf.elements.rect import RectangleElement
|
14
|
+
from natural_pdf.elements.line import LineElement
|
15
|
+
|
16
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
class ElementManager:
    """
    Manages the loading, creation, and retrieval of elements from a PDF page.

    Elements are built lazily on first access and cached in a dict keyed by
    element type: 'chars', 'words', 'rects', 'lines' and (when present)
    'regions'. Every public accessor triggers the lazy load, so callers never
    need to call ``load_elements()`` themselves.
    """

    def __init__(self, page, font_attrs: Optional[List[str]] = None):
        """
        Initialize the ElementManager.

        Args:
            page: The parent Page object.
            font_attrs: Font attributes that must match for two adjacent
                characters to belong to the same word.
                None (default): use ['fontname', 'size'].
                Empty list: ignore font attributes; only spatial gaps
                    (and whitespace) split words.
                List: custom attributes (e.g. ['fontname', 'size', 'color']).
        """
        self._page = page
        self._elements = None  # Populated on first call to load_elements()
        self._font_attrs = ['fontname', 'size'] if font_attrs is None else font_attrs

    def load_elements(self) -> None:
        """
        Load all elements from the page (lazy loading).

        Does nothing if the elements were already loaded.
        """
        if self._elements is not None:
            return

        # Characters first: this also annotates the raw char dicts (source,
        # real_fontname) that word grouping reads afterwards.
        chars = self._create_char_elements()

        # keep_spaces comes from the parent PDF's config and defaults to True.
        keep_spaces = self._page._parent._config.get('keep_spaces', True)
        words = self._group_chars_into_words(keep_spaces, self._font_attrs)

        raw_page = self._page._page
        self._elements = {
            'chars': chars,
            'words': words,
            'rects': [RectangleElement(rect, self._page) for rect in raw_page.rects],
            'lines': [LineElement(line, self._page) for line in raw_page.lines],
            # Add other element types as needed
        }

        # Expose regions already registered on the page, if there are any.
        # A missing or None `_regions` attribute is treated as "no regions".
        page_regions = getattr(self._page, '_regions', None)
        if page_regions is not None and ('detected' in page_regions or 'named' in page_regions):
            regions = []
            if 'detected' in page_regions:
                regions.extend(page_regions['detected'])
            if 'named' in page_regions:
                regions.extend(page_regions['named'].values())
            self._elements['regions'] = regions

    def _create_char_elements(self):
        """
        Create TextElement objects from page characters with enhanced font information.

        The raw character dicts are modified in place: 'source' is set to
        'native' and 'real_fontname' is added when it can be resolved.

        Returns:
            List of TextElement objects for characters
        """
        chars = []
        for char_dict in self._page._page.chars:
            self._process_font_information(char_dict)
            # Mark as native (as opposed to OCR-derived) text.
            char_dict['source'] = 'native'
            chars.append(TextElement(char_dict, self._page))
        return chars

    def _process_font_information(self, char_dict) -> None:
        """
        Process font information for a character dict, adding real_fontname when possible.

        Only short font references (a name starting with 'F' and at most three
        characters long, e.g. 'F0', 'F12') are looked up. Missing or
        non-string font names are ignored rather than raising.

        Args:
            char_dict: Character dictionary to process (modified in place)
        """
        font_ref = char_dict.get('fontname')
        # Guard against None / bytes / other non-string values, which would
        # otherwise raise on .startswith() before the lookup is attempted.
        if not isinstance(font_ref, str):
            return
        if not font_ref.startswith('F') or len(font_ref) > 3:
            return

        # Best-effort lookup: any structural mismatch in the resources simply
        # leaves the character without a 'real_fontname'.
        # NOTE(review): this assumes page_obj supports dict-style .get(); if it
        # does not, the AttributeError below silently disables the mapping --
        # confirm against the PDF backend in use.
        try:
            fonts = self._page._page.page_obj.get('Resources', {}).get('Font', {})
            if fonts and font_ref in fonts:
                base_font = fonts[font_ref].get('BaseFont')
                if base_font:
                    char_dict['real_fontname'] = base_font
        except (KeyError, AttributeError, TypeError):
            pass

    def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
        """
        Group characters into words based on font attributes and spatial relationships.

        Characters are bucketed into lines by their rounded 'top' coordinate
        and ordered left to right by 'x0' within each line.

        Args:
            keep_spaces: Whether to keep spaces in words or use them as word separators
            font_attrs: Font attributes to consider when grouping characters

        Returns:
            List of TextElement word objects
        """
        def line_key(char):
            return round(char['top'])

        # groupby() needs its input sorted by the same key it groups on.
        ordered_chars = sorted(
            self._page._page.chars,
            key=lambda c: (line_key(c), c['x0']),
        )

        words = []
        for _, line_chars in groupby(ordered_chars, key=line_key):
            words.extend(
                self._process_line_into_words(list(line_chars), keep_spaces, font_attrs)
            )
        return words

    def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
        """
        Process a single line of characters into words.

        A new word starts when the font attributes change or when the
        horizontal gap to the previous character exceeds a multiple of that
        character's width (1.5x with keep_spaces, 0.5x without).

        Args:
            line_chars: List of character dicts in the line, ordered by x0
            keep_spaces: Whether to keep spaces in words
            font_attrs: Font attributes to consider for word breaks

        Returns:
            List of TextElement word objects for this line
        """
        words = []
        current_word = []
        # Wider tolerance when spaces are kept, so natural spacing stays in-word.
        gap_factor = 1.5 if keep_spaces else 0.5

        for char in line_chars:
            if char['text'].isspace():
                if keep_spaces:
                    # Keep the space inside the running word; leading
                    # whitespace (no word started yet) is dropped.
                    if current_word:
                        current_word.append(char)
                elif current_word:
                    # Whitespace acts as a separator: close the running word.
                    words.append(self._create_word_element(current_word, font_attrs))
                    current_word = []
                # Whitespace never goes through the font/gap checks below.
                continue

            if not current_word:
                current_word.append(char)
                continue

            prev_char = current_word[-1]
            starts_new_word = (
                not self._check_font_attributes_match(char, prev_char, font_attrs)
                or char['x0'] - prev_char['x1'] > prev_char['width'] * gap_factor
            )
            if starts_new_word:
                words.append(self._create_word_element(current_word, font_attrs))
                current_word = [char]
            else:
                current_word.append(char)

        # Flush whatever is left at the end of the line.
        if current_word:
            words.append(self._create_word_element(current_word, font_attrs))

        return words

    def _check_font_attributes_match(self, char, prev_char, font_attrs) -> bool:
        """
        Check if two characters have matching font attributes.

        Args:
            char: Current character dict
            prev_char: Previous character dict
            font_attrs: List of font attributes to check

        Returns:
            True when no attributes are given, or when every attribute is
            present in both characters with equal values; False otherwise.
        """
        if not font_attrs:
            return True

        # An attribute missing from either character counts as a mismatch.
        return not any(
            attr not in char or attr not in prev_char or char[attr] != prev_char[attr]
            for attr in font_attrs
        )

    def _create_word_element(self, chars, font_attrs):
        """
        Create a word element from a list of character dictionaries.

        Font name, size, colour and the requested font attributes are taken
        from the first character; the bounding box spans all characters.

        Args:
            chars: Non-empty list of character dictionaries
            font_attrs: Font attributes to copy to the word

        Returns:
            TextElement representing the word
        """
        first = chars[0]

        # Join the characters, collapsing any whitespace run to one space.
        text = re.sub(r'\s+', ' ', ''.join(c['text'] for c in chars))

        word_obj = {
            'text': text,
            'x0': min(c['x0'] for c in chars),
            'x1': max(c['x1'] for c in chars),
            'top': min(c['top'] for c in chars),
            'bottom': max(c['bottom'] for c in chars),
            'fontname': first.get('fontname', ''),
            'size': first.get('size', 0),
            'object_type': 'word',
            'page_number': first['page_number'],
        }

        # Optional attributes are copied only when the first char has them.
        for optional_key in ('real_fontname', 'non_stroking_color'):
            if optional_key in first:
                word_obj[optional_key] = first[optional_key]

        # Copy any additional font attributes used for grouping.
        for attr in font_attrs or ():
            if attr in first:
                word_obj[attr] = first[attr]

        # Mark as native (as opposed to OCR-derived) text.
        word_obj['source'] = 'native'

        return TextElement(word_obj, self._page)

    def create_text_elements_from_ocr(self, ocr_results, image_width=None, image_height=None):
        """
        Convert OCR results to TextElement objects.

        If the elements are already loaded, each new element is also appended
        to the 'words' list; if they are not loaded yet, the elements are only
        returned.

        Args:
            ocr_results: Iterable of dicts with 'text', 'bbox'
                (x0, top, x1, bottom) and 'confidence'
            image_width: Width of the source image (for coordinate scaling)
            image_height: Height of the source image (for coordinate scaling)

        Returns:
            List of created TextElement objects
        """
        # Scale image coordinates to PDF coordinates. Without both image
        # dimensions the coordinates are assumed to already be in PDF space.
        scale_x = 1.0
        scale_y = 1.0
        if image_width and image_height:
            scale_x = self._page.width / image_width
            scale_y = self._page.height / image_height

        elements = []
        for result in ocr_results:
            bbox = result['bbox']
            # float() also converts numpy integer types.
            x0 = float(bbox[0]) * scale_x
            top = float(bbox[1]) * scale_y
            x1 = float(bbox[2]) * scale_x
            bottom = float(bbox[3]) * scale_y

            element_data = {
                'text': result['text'],
                'x0': x0,
                'top': top,
                'x1': x1,
                'bottom': bottom,
                'width': x1 - x0,
                'height': bottom - top,
                'object_type': 'text',
                'source': 'ocr',
                'confidence': result['confidence'],
                # Placeholder font information for code expecting these keys
                'fontname': 'OCR-detected',
                'size': 10.0,
                'page_number': self._page.number,
            }

            elem = TextElement(element_data, self._page)
            elements.append(elem)

            # Make the element reachable through the standard words API.
            if self._elements is not None:
                self._elements.setdefault('words', []).append(elem)

        return elements

    def add_element(self, element, element_type='words') -> bool:
        """
        Add an element to the managed elements.

        Args:
            element: The element to add
            element_type: The type of element ('words', 'chars', etc.)

        Returns:
            True if added successfully, False if element_type is not an
            existing element category
        """
        self.load_elements()

        if element_type in self._elements:
            self._elements[element_type].append(element)
            return True

        return False

    def add_region(self, region, name=None) -> bool:
        """
        Add a region to the managed elements.

        Args:
            region: The region to add
            name: Optional name for the region (accepted for API
                compatibility; not used by this method)

        Returns:
            True if the region was added, False if it was already present
        """
        self.load_elements()

        regions = self._elements.setdefault('regions', [])
        if region not in regions:
            regions.append(region)
            return True

        return False

    def get_elements(self, element_type=None):
        """
        Get all elements of the specified type, or all elements if type is None.

        Args:
            element_type: Optional element type ('words', 'chars', 'rects', 'lines', etc.)

        Returns:
            List of elements (an empty list for an unknown type)
        """
        self.load_elements()

        if element_type:
            return self._elements.get(element_type, [])

        return self.get_all_elements()

    def get_all_elements(self):
        """
        Get all elements from all types.

        Returns:
            New list containing the elements of every category, in the
            order the categories were created
        """
        self.load_elements()

        all_elements = []
        for elements in self._elements.values():
            all_elements.extend(elements)

        return all_elements

    @property
    def chars(self):
        """Get all character elements."""
        self.load_elements()
        return self._elements['chars']

    @property
    def words(self):
        """Get all word elements."""
        self.load_elements()
        return self._elements['words']

    @property
    def rects(self):
        """Get all rectangle elements."""
        self.load_elements()
        return self._elements['rects']

    @property
    def lines(self):
        """Get all line elements."""
        self.load_elements()
        return self._elements['lines']

    @property
    def regions(self):
        """Get all region elements (creating the list on first access)."""
        self.load_elements()
        return self._elements.setdefault('regions', [])
|