natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,457 @@
1
+ """
2
+ Element Manager for natural-pdf.
3
+
4
+ This class handles the loading, creation, and management of PDF elements like
5
+ characters, words, rectangles, and lines extracted from a page.
6
+ """
7
+ import logging
8
+ from typing import List, Dict, Any, Optional, Union, Tuple
9
+ from itertools import groupby
10
+ import re
11
+
12
+ from natural_pdf.elements.text import TextElement
13
+ from natural_pdf.elements.rect import RectangleElement
14
+ from natural_pdf.elements.line import LineElement
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
class ElementManager:
    """
    Manages the loading, creation, and retrieval of elements from a PDF page.

    Elements are built lazily on first access and cached in a dictionary keyed
    by element type ('chars', 'words', 'rects', 'lines' and, when present,
    'regions'). This class centralizes the element management functionality
    previously contained in the Page class.
    """

    def __init__(self, page, font_attrs=None):
        """
        Initialize the ElementManager.

        Args:
            page: The parent Page object
            font_attrs: Font attributes to consider when grouping characters into words.
                       None (default): group by ['fontname', 'size']
                       Empty list: only consider spatial relationships
                       List: custom attributes to consider (e.g., ['fontname', 'size', 'color'])
        """
        self._page = page
        self._elements = None  # Lazy-loaded by load_elements()
        # None selects the default grouping; an explicit empty list is kept
        # as-is and disables font-based word splitting.
        self._font_attrs = ['fontname', 'size'] if font_attrs is None else font_attrs

    def load_elements(self):
        """
        Load all elements from the page (lazy loading).

        Does nothing if the elements have already been loaded.
        """
        if self._elements is not None:
            return

        # Create character elements with font information
        chars = self._create_char_elements()

        # Get keep_spaces setting from PDF config or default to True
        keep_spaces = self._page._parent._config.get('keep_spaces', True)

        # Group characters into words
        words = self._group_chars_into_words(keep_spaces, self._font_attrs)

        pdf_page = self._page._page
        self._elements = {
            'chars': chars,
            'words': words,
            'rects': [RectangleElement(rect, self._page) for rect in pdf_page.rects],
            'lines': [LineElement(line, self._page) for line in pdf_page.lines],
        }

        # Add regions if the page already has detected and/or named regions
        if hasattr(self._page, '_regions'):
            page_regions = self._page._regions
            if 'detected' in page_regions or 'named' in page_regions:
                regions = []
                if 'detected' in page_regions:
                    regions.extend(page_regions['detected'])
                if 'named' in page_regions:
                    regions.extend(page_regions['named'].values())
                self._elements['regions'] = regions

    def _create_char_elements(self):
        """
        Create TextElement objects from page characters with enhanced font information.

        Note that the underlying character dictionaries are modified in place:
        each gets a 'source' key and, when resolvable, a 'real_fontname' key.

        Returns:
            List of TextElement objects for characters
        """
        chars = []
        for char_dict in self._page._page.chars:
            # Process font reference information
            self._process_font_information(char_dict)

            # Add source attribute for native text elements
            char_dict['source'] = 'native'
            chars.append(TextElement(char_dict, self._page))

        return chars

    def _process_font_information(self, char_dict):
        """
        Process font information for a character dict, adding real_fontname when possible.

        Only short font references (a string starting with 'F' and at most
        three characters long, e.g. 'F0', 'F12') are looked up. The lookup is
        best-effort: any failure leaves char_dict unchanged.

        Args:
            char_dict: Character dictionary to process (modified in place)
        """
        font_ref = char_dict.get('fontname')
        # A missing, None or non-string fontname cannot be a font reference.
        if not isinstance(font_ref, str):
            return
        if not font_ref.startswith('F') or len(font_ref) > 3:
            return

        try:
            # Try to get font info from the page resources
            fonts = self._page._page.page_obj.get('Resources', {}).get('Font', {})
            if fonts and font_ref in fonts:
                base_font = fonts[font_ref].get('BaseFont')
                if base_font:
                    char_dict['real_fontname'] = base_font
        except (KeyError, AttributeError, TypeError):
            # Deliberately best-effort: the resource structure is not
            # guaranteed to be dict-like, so fall back to the raw reference.
            pass

    def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
        """
        Group characters into words based on font attributes and spatial relationships.

        Characters are bucketed into lines by their rounded 'top' coordinate
        and ordered left to right within each line.

        Args:
            keep_spaces: Whether to keep spaces in words or use them as word separators
            font_attrs: Font attributes to consider when grouping characters

        Returns:
            List of TextElement word objects
        """
        # Sort chars by y-position (line) and then x-position
        sorted_chars = sorted(
            self._page._page.chars,
            key=lambda c: (round(c['top']), c['x0']),
        )

        # groupby relies on the sort above using the same rounded-top key
        words = []
        for _, line_chars in groupby(sorted_chars, key=lambda c: round(c['top'])):
            words.extend(
                self._process_line_into_words(list(line_chars), keep_spaces, font_attrs)
            )

        return words

    def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
        """
        Process a single line of characters into words.

        A new word starts when the font attributes change or when the
        horizontal gap to the previous character exceeds a multiple of that
        character's width. With keep_spaces enabled, whitespace characters
        that follow a started word are kept inside it (including trailing
        ones); otherwise whitespace closes the current word.

        Args:
            line_chars: List of characters in the line
            keep_spaces: Whether to keep spaces in words
            font_attrs: Font attributes to consider for word breaks

        Returns:
            List of TextElement word objects for this line
        """
        words = []
        current_word = []
        # Use a wider threshold when keep_spaces is enabled to allow for natural spaces
        gap_factor = 1.5 if keep_spaces else 0.5

        for char in line_chars:
            if char['text'].isspace():
                if keep_spaces:
                    # Keep the space inside a started word; skip leading spaces
                    if current_word:
                        current_word.append(char)
                elif current_word:
                    # Whitespace acts as a separator and closes the current word
                    words.append(self._create_word_element(current_word, font_attrs))
                    current_word = []
                continue

            if not current_word:
                # First character of a new word
                current_word.append(char)
                continue

            prev_char = current_word[-1]
            same_font = self._check_font_attributes_match(char, prev_char, font_attrs)

            # The gap test is only evaluated when the fonts match
            if not same_font or char['x0'] - prev_char['x1'] > prev_char['width'] * gap_factor:
                words.append(self._create_word_element(current_word, font_attrs))
                current_word = [char]
            else:
                current_word.append(char)

        # Handle the last word if there is one
        if current_word:
            words.append(self._create_word_element(current_word, font_attrs))

        return words

    def _check_font_attributes_match(self, char, prev_char, font_attrs):
        """
        Check if two characters have matching font attributes.

        Args:
            char: Current character
            prev_char: Previous character
            font_attrs: List of font attributes to check

        Returns:
            True if font_attrs is empty/None, or if every listed attribute is
            present in both characters with equal values; False otherwise
        """
        # Default to match if no font attributes specified
        if not font_attrs:
            return True

        return all(
            attr in char and attr in prev_char and char[attr] == prev_char[attr]
            for attr in font_attrs
        )

    def _create_word_element(self, chars, font_attrs):
        """
        Create a word element from a list of character dictionaries.

        The bounding box is the union of the character boxes; font name, size,
        color and page number are taken from the first character.

        Args:
            chars: Non-empty list of character dictionaries
            font_attrs: Font attributes to copy to the word

        Returns:
            TextElement representing the word
        """
        first = chars[0]

        # Combine text from characters and collapse whitespace runs to one space
        text = re.sub(r'\s+', ' ', ''.join(c['text'] for c in chars))

        # Create a combined word object
        word_obj = {
            'text': text,
            'x0': min(c['x0'] for c in chars),
            'x1': max(c['x1'] for c in chars),
            'top': min(c['top'] for c in chars),
            'bottom': max(c['bottom'] for c in chars),
            'fontname': first.get('fontname', ''),
            'size': first.get('size', 0),
            'object_type': 'word',
            'page_number': first['page_number'],
        }

        # Carry over optional attributes from the first character when present
        for optional_key in ('real_fontname', 'non_stroking_color'):
            if optional_key in first:
                word_obj[optional_key] = first[optional_key]

        # Copy any additional font attributes
        if font_attrs:
            for attr in font_attrs:
                if attr in first:
                    word_obj[attr] = first[attr]

        # Add source attribute for native text elements
        word_obj['source'] = 'native'

        return TextElement(word_obj, self._page)

    def create_text_elements_from_ocr(self, ocr_results, image_width=None, image_height=None):
        """
        Convert OCR results to TextElement objects and register them as words.

        Args:
            ocr_results: List of OCR results with 'text', 'bbox' and 'confidence'
            image_width: Width of the source image (for coordinate scaling)
            image_height: Height of the source image (for coordinate scaling)

        Returns:
            List of created TextElement objects
        """
        # Load first (as add_element/add_region do) so the OCR elements are
        # not lost when a later lazy load rebuilds the element dictionary.
        self.load_elements()

        # Scale factors convert image coordinates to PDF coordinates.
        # Default to 1.0 if not provided (assume coordinates are already in PDF space)
        scale_x = 1.0
        scale_y = 1.0
        if image_width and image_height:
            scale_x = self._page.width / image_width
            scale_y = self._page.height / image_height

        elements = []
        for result in ocr_results:
            bbox = result['bbox']
            # Convert numpy int32 to float if needed and scale to PDF coordinates
            x0 = float(bbox[0]) * scale_x
            top = float(bbox[1]) * scale_y
            x1 = float(bbox[2]) * scale_x
            bottom = float(bbox[3]) * scale_y

            # Create a TextElement object with additional required fields for highlighting
            element_data = {
                'text': result['text'],
                'x0': x0,
                'top': top,
                'x1': x1,
                'bottom': bottom,
                'width': x1 - x0,
                'height': bottom - top,
                'object_type': 'text',
                'source': 'ocr',
                'confidence': result['confidence'],
                # Add default font information to work with existing expectations
                'fontname': 'OCR-detected',
                'size': 10.0,
                'page_number': self._page.number,
            }

            elem = TextElement(element_data, self._page)
            elements.append(elem)

            # Add to words list to make it accessible via standard API
            self._elements.setdefault('words', []).append(elem)

        return elements

    def add_element(self, element, element_type='words'):
        """
        Add an element to the managed elements.

        Args:
            element: The element to add
            element_type: The type of element ('words', 'chars', etc.)

        Returns:
            True if added successfully, False if element_type is not a known type
        """
        # Load elements if not already loaded
        self.load_elements()

        if element_type not in self._elements:
            return False

        self._elements[element_type].append(element)
        return True

    def add_region(self, region, name=None):
        """
        Add a region to the managed elements.

        Args:
            region: The region to add
            name: Accepted for API compatibility; currently not used by this method

        Returns:
            True if added successfully, False if the region was already present
        """
        # Load elements if not already loaded
        self.load_elements()

        # Make sure regions is in _elements
        regions = self._elements.setdefault('regions', [])

        # Add to elements for selector queries, avoiding duplicates
        if region in regions:
            return False

        regions.append(region)
        return True

    def get_elements(self, element_type=None):
        """
        Get all elements of the specified type, or all elements if type is None.

        Args:
            element_type: Optional element type ('words', 'chars', 'rects', 'lines', etc.)

        Returns:
            List of elements (empty if the requested type is unknown)
        """
        if not element_type:
            return self.get_all_elements()

        # Load elements if not already loaded
        self.load_elements()
        return self._elements.get(element_type, [])

    def get_all_elements(self):
        """
        Get all elements from all types.

        Returns:
            New list containing the elements of every type, in the order the
            types were stored
        """
        # Load elements if not already loaded
        self.load_elements()

        all_elements = []
        for elements in self._elements.values():
            all_elements.extend(elements)

        return all_elements

    @property
    def chars(self):
        """Get all character elements."""
        self.load_elements()
        return self._elements['chars']

    @property
    def words(self):
        """Get all word elements."""
        self.load_elements()
        return self._elements['words']

    @property
    def rects(self):
        """Get all rectangle elements."""
        self.load_elements()
        return self._elements['rects']

    @property
    def lines(self):
        """Get all line elements."""
        self.load_elements()
        return self._elements['lines']

    @property
    def regions(self):
        """Get all region elements, creating an empty list if none exist yet."""
        self.load_elements()
        return self._elements.setdefault('regions', [])