natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
natural_pdf/core/page.py
ADDED
@@ -0,0 +1,2376 @@
|
|
1
|
+
import pdfplumber
|
2
|
+
import os
|
3
|
+
import tempfile
|
4
|
+
from typing import List, Optional, Union, Any, Dict, Callable, TYPE_CHECKING, Tuple
|
5
|
+
from PIL import Image
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
import pdfplumber
|
9
|
+
from natural_pdf.core.pdf import PDF
|
10
|
+
from natural_pdf.elements.collections import ElementCollection
|
11
|
+
from natural_pdf.utils.highlighting import HighlightManager
|
12
|
+
from natural_pdf.elements.base import Element
|
13
|
+
|
14
|
+
from natural_pdf.elements.region import Region
|
15
|
+
from natural_pdf.elements.text import TextElement
|
16
|
+
from natural_pdf.analyzers.document_layout import (
|
17
|
+
YOLODocLayoutDetector,
|
18
|
+
TableTransformerDetector,
|
19
|
+
PaddleLayoutDetector,
|
20
|
+
convert_to_regions
|
21
|
+
)
|
22
|
+
from natural_pdf.utils.ocr import OCRManager
|
23
|
+
|
24
|
+
# Import OCR engines
|
25
|
+
try:
|
26
|
+
from natural_pdf.ocr import OCREngine, EasyOCREngine, PaddleOCREngine
|
27
|
+
HAS_OCR_ENGINES = True
|
28
|
+
except ImportError:
|
29
|
+
# Fallback if the OCR engines are not available
|
30
|
+
HAS_OCR_ENGINES = False
|
31
|
+
|
32
|
+
|
33
|
+
class Page:
|
34
|
+
"""
|
35
|
+
Enhanced Page wrapper built on top of pdfplumber.Page.
|
36
|
+
|
37
|
+
This class provides a fluent interface for working with PDF pages,
|
38
|
+
with improved selection, navigation, extraction, and question-answering capabilities.
|
39
|
+
"""
|
40
|
+
|
41
|
+
def __init__(self, page: 'pdfplumber.page.Page', parent: 'PDF', index: int, font_attrs=None):
|
42
|
+
"""
|
43
|
+
Initialize a page wrapper.
|
44
|
+
|
45
|
+
Args:
|
46
|
+
page: pdfplumber page object
|
47
|
+
parent: Parent PDF object
|
48
|
+
index: Index of this page in the PDF (0-based)
|
49
|
+
font_attrs: Font attributes to consider when grouping characters into words.
|
50
|
+
Default: ['fontname', 'size'] (Group by font name and size)
|
51
|
+
None: Only consider spatial relationships
|
52
|
+
List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
|
53
|
+
"""
|
54
|
+
self._page = page
|
55
|
+
self._parent = parent
|
56
|
+
self._index = index
|
57
|
+
self._elements = None # Lazy-loaded
|
58
|
+
self._highlight_manager = None # Lazy-loaded
|
59
|
+
self._text_styles = None # Lazy-loaded text style analyzer results
|
60
|
+
self._exclusions = [] # List to store exclusion functions/regions
|
61
|
+
|
62
|
+
# Region management
|
63
|
+
self._regions = {
|
64
|
+
'detected': [], # Layout detection results
|
65
|
+
'named': {}, # Named regions (name -> region)
|
66
|
+
}
|
67
|
+
|
68
|
+
# Default to grouping by fontname and size if not specified
|
69
|
+
self._font_attrs = ['fontname', 'size'] if font_attrs is None else font_attrs
|
70
|
+
|
71
|
+
@property
|
72
|
+
def number(self) -> int:
|
73
|
+
"""Get page number (1-based)."""
|
74
|
+
return self._page.page_number
|
75
|
+
|
76
|
+
@property
|
77
|
+
def index(self) -> int:
|
78
|
+
"""Get page index (0-based)."""
|
79
|
+
return self._index
|
80
|
+
|
81
|
+
@property
|
82
|
+
def width(self) -> float:
|
83
|
+
"""Get page width."""
|
84
|
+
return self._page.width
|
85
|
+
|
86
|
+
@property
|
87
|
+
def height(self) -> float:
|
88
|
+
"""Get page height."""
|
89
|
+
return self._page.height
|
90
|
+
|
91
|
+
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region]) -> 'Page':
|
92
|
+
"""
|
93
|
+
Add an exclusion to the page. Text from these regions will be excluded from extraction.
|
94
|
+
|
95
|
+
Args:
|
96
|
+
exclusion_func_or_region: Either a Region object or a function that takes a Page
|
97
|
+
and returns a Region to exclude
|
98
|
+
|
99
|
+
Returns:
|
100
|
+
Self for method chaining
|
101
|
+
"""
|
102
|
+
self._exclusions.append(exclusion_func_or_region)
|
103
|
+
return self
|
104
|
+
|
105
|
+
def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
    """
    Register a region on this page.

    Args:
        region: The Region to register.
        name: Optional name. Named regions are stored in the name lookup
            (replacing any region already stored under that name);
            unnamed regions are appended to the detected-regions list.

    Returns:
        This page, so calls can be chained.

    Raises:
        TypeError: If ``region`` is not a Region instance.
    """
    if not isinstance(region, Region):
        raise TypeError("region must be a Region object")

    # Every region registered through this method is tagged the same way
    region.source = 'named'

    if not name:
        # Unnamed, but still registered
        self._regions['detected'].append(region)
    else:
        region.name = name
        self._regions['named'][name] = region

    # Only touch the element cache if it has already been loaded
    loaded = self._elements
    if loaded is not None:
        known_regions = loaded.setdefault('regions', [])
        if region not in known_regions:
            known_regions.append(region)

    return self
|
141
|
+
|
142
|
+
def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> 'Page':
|
143
|
+
"""
|
144
|
+
Add multiple regions to the page.
|
145
|
+
|
146
|
+
Args:
|
147
|
+
regions: List of Region objects to add
|
148
|
+
prefix: Optional prefix for automatic naming (regions will be named prefix_1, prefix_2, etc.)
|
149
|
+
|
150
|
+
Returns:
|
151
|
+
Self for method chaining
|
152
|
+
"""
|
153
|
+
if prefix:
|
154
|
+
# Add with automatic sequential naming
|
155
|
+
for i, region in enumerate(regions):
|
156
|
+
self.add_region(region, name=f"{prefix}_{i+1}")
|
157
|
+
else:
|
158
|
+
# Add without names
|
159
|
+
for region in regions:
|
160
|
+
self.add_region(region)
|
161
|
+
|
162
|
+
return self
|
163
|
+
|
164
|
+
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
165
|
+
"""
|
166
|
+
Get all exclusion regions for this page.
|
167
|
+
|
168
|
+
Args:
|
169
|
+
include_callable: Whether to evaluate callable exclusion functions
|
170
|
+
debug: Enable verbose debug logging for exclusion evaluation
|
171
|
+
|
172
|
+
Returns:
|
173
|
+
List of Region objects to exclude
|
174
|
+
"""
|
175
|
+
regions = []
|
176
|
+
|
177
|
+
# Track exclusion results for debugging
|
178
|
+
if debug:
|
179
|
+
print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
|
180
|
+
|
181
|
+
for i, exclusion in enumerate(self._exclusions):
|
182
|
+
# Get exclusion label if it's a tuple from PDF level
|
183
|
+
exclusion_label = f"exclusion {i}"
|
184
|
+
original_exclusion = exclusion
|
185
|
+
|
186
|
+
# Check if it's a tuple from PDF.add_exclusion
|
187
|
+
if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
|
188
|
+
# This is likely from PDF.add_exclusion with (func, label)
|
189
|
+
exclusion_func, label = exclusion
|
190
|
+
if label:
|
191
|
+
exclusion_label = label
|
192
|
+
exclusion = exclusion_func
|
193
|
+
|
194
|
+
# Process callable exclusion functions
|
195
|
+
if callable(exclusion) and include_callable:
|
196
|
+
# It's a function, call it with this page
|
197
|
+
try:
|
198
|
+
if debug:
|
199
|
+
print(f" - Evaluating callable {exclusion_label}...")
|
200
|
+
|
201
|
+
# Create a temporary copy of exclusions to avoid recursion
|
202
|
+
original_exclusions = self._exclusions
|
203
|
+
self._exclusions = [] # Temporarily clear exclusions
|
204
|
+
|
205
|
+
# Call the function
|
206
|
+
region = exclusion(self)
|
207
|
+
|
208
|
+
# Restore exclusions
|
209
|
+
self._exclusions = original_exclusions
|
210
|
+
|
211
|
+
if region:
|
212
|
+
regions.append(region)
|
213
|
+
if debug:
|
214
|
+
print(f" ✓ Added region: {region}")
|
215
|
+
else:
|
216
|
+
if debug:
|
217
|
+
print(f" ✗ Function returned None, no region added")
|
218
|
+
|
219
|
+
except Exception as e:
|
220
|
+
error_msg = f"Error in {exclusion_label} for page {self.index}: {e}"
|
221
|
+
print(error_msg)
|
222
|
+
# Print more detailed traceback for debugging
|
223
|
+
import traceback
|
224
|
+
print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
|
225
|
+
|
226
|
+
# Process direct Region objects
|
227
|
+
elif not callable(exclusion):
|
228
|
+
# It's already a Region object
|
229
|
+
regions.append(exclusion)
|
230
|
+
if debug:
|
231
|
+
print(f" - Added direct region: {exclusion}")
|
232
|
+
|
233
|
+
if debug:
|
234
|
+
print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
|
235
|
+
|
236
|
+
return regions
|
237
|
+
|
238
|
+
def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
    """
    Return the first element on this page that matches a selector.

    Args:
        selector: CSS-like selector string.
        apply_exclusions: Drop matches that fall inside exclusion regions (default: True).
        regex: Treat :contains text as a regular expression (default: False).
        case: Match text case-sensitively (default: True).
        **kwargs: Extra filter parameters forwarded to the selector machinery.

    Returns:
        The first matching element, or None when nothing matches.
    """
    # find_all performs the same parse / match / exclusion-filter steps
    matches = self.find_all(
        selector,
        apply_exclusions=apply_exclusions,
        regex=regex,
        case=case,
        **kwargs,
    )
    if not matches:
        return None
    return matches.first
|
272
|
+
|
273
|
+
def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> 'ElementCollection':
    """
    Return every element on this page that matches a selector.

    Args:
        selector: CSS-like selector string.
        apply_exclusions: Drop matches that fall inside exclusion regions (default: True).
        regex: Treat :contains text as a regular expression (default: False).
        case: Match text case-sensitively (default: True).
        **kwargs: Extra filter parameters forwarded to the selector machinery.

    Returns:
        ElementCollection holding the matching elements.
    """
    from natural_pdf.selectors.parser import parse_selector

    parsed = parse_selector(selector)

    # The text-matching flags travel with the other filter parameters
    kwargs.update(regex=regex, case=case)

    # Match first; exclusions are applied afterwards on the result
    matches = self._apply_selector(parsed, **kwargs)

    if not (apply_exclusions and self._exclusions and matches):
        return matches

    # Resolve exclusions, including callable ones
    excluded = self._get_exclusion_regions(include_callable=True)
    return matches.exclude_regions(excluded) if excluded else matches
|
307
|
+
|
308
|
+
def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, **kwargs) -> 'ElementCollection':
    """
    Apply a parsed selector to the elements loaded for this page.

    Args:
        selector_obj: Parsed selector dictionary (reads 'type' and 'pseudo_classes').
        apply_exclusions: Accepted for interface compatibility; exclusions
            are NOT applied here (callers filter the returned collection),
            because doing so would recurse into exclusion evaluation.
        **kwargs: Extra filter parameters passed to selector_to_filter_func.
            Also read here: 'near_threshold' (default 50) for :near and
            'reading_order' (default True) to sort results.

    Returns:
        ElementCollection of matching elements. Empty if a spatial
        pseudo-class refers to a selector that matches nothing.
    """
    from natural_pdf.elements.collections import ElementCollection
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    # Load all elements if not already loaded
    self._load_elements()

    element_type = selector_obj.get('type', 'any').lower()

    # Selector type -> key in self._elements for the simple one-to-one cases
    direct_keys = {
        'char': 'chars',
        'word': 'words',
        'rect': 'rects',
        'rectangle': 'rects',
        'line': 'lines',
    }

    if element_type == 'text':
        # Prefer word elements over character elements for text
        text_key = 'words' if 'words' in self._elements else 'chars'
        elements_to_search = self._elements.get(text_key, [])
    elif element_type in direct_keys:
        elements_to_search = self._elements.get(direct_keys[element_type], [])
    elif element_type == 'region':
        elements_to_search = list(self._elements.get('regions') or [])
        # Fall back to the registered regions when the cache has none
        if not elements_to_search:
            elements_to_search.extend(self._regions['detected'])
            elements_to_search.extend(self._regions['named'].values())
    else:
        # 'any' and unrecognised types search every category, skipping
        # chars when words exist so text is not matched twice
        has_words = 'words' in self._elements
        elements_to_search = []
        for key, elements_list in self._elements.items():
            if key == 'chars' and has_words:
                continue
            elements_to_search.extend(elements_list)

    filter_func = selector_to_filter_func(selector_obj, **kwargs)
    matching_elements = [element for element in elements_to_search if filter_func(element)]

    def center_distance(el1, el2):
        """Euclidean distance between the centers of two elements."""
        dx = (el1.x0 + el1.x1) / 2 - (el2.x0 + el2.x1) / 2
        dy = (el1.top + el1.bottom) / 2 - (el2.top + el2.bottom) / 2
        return (dx ** 2 + dy ** 2) ** 0.5

    # Handle spatial pseudo-classes that require relationship checking
    for pseudo in selector_obj.get('pseudo_classes', []):
        name = pseudo.get('name')
        if name not in ('above', 'below', 'near', 'left-of', 'right-of'):
            continue

        # Resolve the reference element the relationship is measured against
        args = pseudo.get('args', '')
        ref_selector = parse_selector(args) if isinstance(args, str) else args
        ref_elements = self._apply_selector(ref_selector)

        if not ref_elements:
            # No reference elements found, so no matches
            return ElementCollection([])

        # Use the first reference element for now.
        # `first` is read as an attribute, matching its use in find();
        # calling it would attempt to call the element itself.
        # TODO: Improve this to consider all reference elements
        ref_element = ref_elements.first

        if name == 'above':
            matching_elements = [el for el in matching_elements if el.bottom <= ref_element.top]
        elif name == 'below':
            matching_elements = [el for el in matching_elements if el.top >= ref_element.bottom]
        elif name == 'left-of':
            matching_elements = [el for el in matching_elements if el.x1 <= ref_element.x0]
        elif name == 'right-of':
            matching_elements = [el for el in matching_elements if el.x0 >= ref_element.x1]
        elif name == 'near':
            threshold = kwargs.get('near_threshold', 50)  # Default 50 points
            matching_elements = [
                el for el in matching_elements
                if center_distance(el, ref_element) <= threshold
            ]

    # Sort elements in reading order if requested
    if kwargs.get('reading_order', True):
        # TODO: Implement proper reading order sorting
        # For now, simple top-to-bottom, left-to-right ordering
        matching_elements.sort(key=lambda el: (el.top, el.x0))

    return ElementCollection(matching_elements)
|
436
|
+
|
437
|
+
def create_region(self, x0: float, top: float, x1: float, bottom: float) -> Any:
    """
    Build a Region on this page from explicit coordinates.

    Args:
        x0: Left x-coordinate.
        top: Top y-coordinate.
        x1: Right x-coordinate.
        bottom: Bottom y-coordinate.

    Returns:
        Region covering (x0, top, x1, bottom) on this page.
    """
    from natural_pdf.elements.region import Region

    bbox = (x0, top, x1, bottom)
    return Region(self, bbox)
|
452
|
+
|
453
|
+
def region(self, left: float = None, top: float = None, right: float = None, bottom: float = None,
|
454
|
+
width: str = "full") -> Any:
|
455
|
+
"""
|
456
|
+
Create a region on this page with more intuitive named parameters.
|
457
|
+
|
458
|
+
Args:
|
459
|
+
left: Left x-coordinate (default: 0)
|
460
|
+
top: Top y-coordinate (default: 0)
|
461
|
+
right: Right x-coordinate (default: page width)
|
462
|
+
bottom: Bottom y-coordinate (default: page height)
|
463
|
+
width: Width mode - "full" for full page width or "element" for element width
|
464
|
+
|
465
|
+
Returns:
|
466
|
+
Region object for the specified coordinates
|
467
|
+
|
468
|
+
Examples:
|
469
|
+
>>> page.region(top=100, bottom=200) # Full width from y=100 to y=200
|
470
|
+
>>> page.region(left=50, right=150, top=100, bottom=200) # Specific rectangle
|
471
|
+
"""
|
472
|
+
# Handle defaults
|
473
|
+
left = 0 if left is None else left
|
474
|
+
top = 0 if top is None else top
|
475
|
+
right = self.width if right is None else right
|
476
|
+
bottom = self.height if bottom is None else bottom
|
477
|
+
|
478
|
+
# Handle width parameter
|
479
|
+
if width == "full":
|
480
|
+
left = 0
|
481
|
+
right = self.width
|
482
|
+
elif width != "element":
|
483
|
+
raise ValueError("Width must be 'full' or 'element'")
|
484
|
+
|
485
|
+
from natural_pdf.elements.region import Region
|
486
|
+
region = Region(self, (left, top, right, bottom))
|
487
|
+
return region
|
488
|
+
|
489
|
+
def get_elements(self, apply_exclusions=True) -> List['Element']:
    """
    Collect the word, rect and line elements of this page.

    Args:
        apply_exclusions: When True and exclusions are registered, drop
            every element that lies in at least one exclusion region.

    Returns:
        List of elements: words first, then rects, then lines.
    """
    # Make sure the element cache is populated
    self._load_elements()

    collected = [*self.words, *self.rects, *self.lines]

    if not (apply_exclusions and self._exclusions):
        return collected

    zones = self._get_exclusion_regions(include_callable=True)
    if not zones:
        return collected

    # Keep only elements that no exclusion region claims
    return [
        element for element in collected
        if not any(zone._is_element_in_region(element) for zone in zones)
    ]
|
526
|
+
|
527
|
+
def filter_elements(self, elements: List['Element'], selector: str, **kwargs) -> List['Element']:
    """
    Filter a list of elements based on a selector.

    Args:
        elements: List of elements to filter
        selector: CSS-like selector string
        **kwargs: Additional filter parameters. They are forwarded to the
            selector filter (as _apply_selector does), so options such as
            ``regex`` and ``case`` take effect. ``reading_order``
            (default True) controls whether the result is sorted.

    Returns:
        New list of the elements that match the selector; the input list
        is not modified.
    """
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    selector_obj = parse_selector(selector)

    # Forward kwargs so text-matching flags are not silently ignored
    filter_func = selector_to_filter_func(selector_obj, **kwargs)

    matching_elements = [element for element in elements if filter_func(element)]

    # Simple top-to-bottom, left-to-right ordering unless disabled
    if kwargs.get('reading_order', True):
        matching_elements.sort(key=lambda el: (el.top, el.x0))

    return matching_elements
|
555
|
+
|
556
|
+
def until(self, selector: str, include_endpoint: bool = True, **kwargs) -> Any:
    """
    Build a full-width region from the top of the page down to a match.

    Args:
        selector: CSS-like selector string locating the endpoint element.
        include_endpoint: When True the region ends at the endpoint's
            bottom edge; otherwise it ends at the endpoint's top edge.
        **kwargs: Extra parameters forwarded to find().

    Returns:
        Region from the page top to the endpoint, with ``end_element`` set
        to the matched element. If nothing matches, a region covering the
        whole page is returned (without ``end_element`` being set).

    Examples:
        >>> page.until('text:contains("Conclusion")')  # Select from top to conclusion
        >>> page.until('line[width>=2]', include_endpoint=False)  # Select up to thick line
    """
    endpoint = self.find(selector, **kwargs)

    from natural_pdf.elements.region import Region

    if not endpoint:
        # Nothing matched: fall back to the entire page
        return Region(self, (0, 0, self.width, self.height))

    lower_edge = endpoint.bottom if include_endpoint else endpoint.top
    selection = Region(self, (0, 0, self.width, lower_edge))
    selection.end_element = endpoint
    return selection
|
590
|
+
|
591
|
+
# Alias for backward compatibility
|
592
|
+
def select_until(self, selector: str, include_target: bool = True, **kwargs) -> Any:
|
593
|
+
"""
|
594
|
+
DEPRECATED: Use until() instead.
|
595
|
+
Select content from this point until matching selector.
|
596
|
+
|
597
|
+
Args:
|
598
|
+
selector: CSS-like selector string
|
599
|
+
include_target: Whether to include the target element in the region
|
600
|
+
**kwargs: Additional selection parameters
|
601
|
+
|
602
|
+
Returns:
|
603
|
+
Region object representing the selected content
|
604
|
+
"""
|
605
|
+
import warnings
|
606
|
+
warnings.warn(
|
607
|
+
"select_until() is deprecated and will be removed in a future version. Use until() instead.",
|
608
|
+
DeprecationWarning,
|
609
|
+
stacklevel=2
|
610
|
+
)
|
611
|
+
return self.until(selector, include_endpoint=include_target, **kwargs)
|
612
|
+
|
613
|
+
def crop(self, bbox=None, **kwargs) -> Any:
|
614
|
+
"""
|
615
|
+
Crop the page to the specified bounding box.
|
616
|
+
|
617
|
+
This is a direct wrapper around pdfplumber's crop method.
|
618
|
+
|
619
|
+
Args:
|
620
|
+
bbox: Bounding box (x0, top, x1, bottom) or None
|
621
|
+
**kwargs: Additional parameters (top, bottom, left, right)
|
622
|
+
|
623
|
+
Returns:
|
624
|
+
Cropped page object
|
625
|
+
"""
|
626
|
+
# TODO: Create proper wrapper for cropped page
|
627
|
+
return self._page.crop(bbox, **kwargs)
|
628
|
+
|
629
|
+
def extract_text(self,
|
630
|
+
preserve_whitespace=True,
|
631
|
+
use_exclusions=True,
|
632
|
+
debug_exclusions=False, ocr=None, **kwargs) -> str:
|
633
|
+
"""
|
634
|
+
Extract text from this page, respecting any exclusion regions.
|
635
|
+
|
636
|
+
Args:
|
637
|
+
preserve_whitespace: Whether to keep blank characters (default: True)
|
638
|
+
use_exclusions: Whether to apply exclusion regions (default: True)
|
639
|
+
debug_exclusions: Whether to output detailed exclusion debugging info (default: False)
|
640
|
+
ocr: OCR configuration. If None, uses PDF settings
|
641
|
+
**kwargs: Additional extraction parameters
|
642
|
+
|
643
|
+
Returns:
|
644
|
+
Extracted text as string
|
645
|
+
"""
|
646
|
+
if not self._exclusions or not use_exclusions:
|
647
|
+
# If no exclusions or exclusions disabled, use regular extraction
|
648
|
+
if debug_exclusions:
|
649
|
+
print(f"Page {self.index}: No exclusions to apply or use_exclusions=False")
|
650
|
+
# Note: pdfplumber still uses keep_blank_chars parameter
|
651
|
+
return self._page.extract_text(keep_blank_chars=preserve_whitespace, **kwargs)
|
652
|
+
|
653
|
+
# Get all exclusion regions
|
654
|
+
if debug_exclusions:
|
655
|
+
print(f"Page {self.index}: Getting exclusion regions with debugging enabled")
|
656
|
+
|
657
|
+
# Important: We need to evaluate lambda functions from PDF level
|
658
|
+
# These functions are stored directly in _exclusions and not as tuples
|
659
|
+
exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug_exclusions)
|
660
|
+
|
661
|
+
if not exclusion_regions:
|
662
|
+
if debug_exclusions:
|
663
|
+
print(f"Page {self.index}: No valid exclusion regions were found")
|
664
|
+
# Note: pdfplumber still uses keep_blank_chars parameter
|
665
|
+
return self._page.extract_text(keep_blank_chars=preserve_whitespace, **kwargs)
|
666
|
+
|
667
|
+
if debug_exclusions:
|
668
|
+
print(f"Page {self.index}: Found {len(exclusion_regions)} exclusion regions to apply")
|
669
|
+
|
670
|
+
# Find all text elements
|
671
|
+
all_text = self.find_all('text')
|
672
|
+
|
673
|
+
if debug_exclusions:
|
674
|
+
print(f"Page {self.index}: Found {len(all_text)} text elements before exclusion filtering")
|
675
|
+
|
676
|
+
# Filter out elements in excluded regions
|
677
|
+
filtered_elements = []
|
678
|
+
excluded_count = 0
|
679
|
+
|
680
|
+
for element in all_text:
|
681
|
+
exclude = False
|
682
|
+
for region in exclusion_regions:
|
683
|
+
if region._is_element_in_region(element):
|
684
|
+
exclude = True
|
685
|
+
excluded_count += 1
|
686
|
+
break
|
687
|
+
if not exclude:
|
688
|
+
filtered_elements.append(element)
|
689
|
+
|
690
|
+
if debug_exclusions:
|
691
|
+
print(f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}")
|
692
|
+
|
693
|
+
# Extract text from filtered elements
|
694
|
+
from natural_pdf.elements.collections import ElementCollection
|
695
|
+
collection = ElementCollection(filtered_elements)
|
696
|
+
result = collection.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
|
697
|
+
|
698
|
+
# Apply OCR if explicitly requested
|
699
|
+
use_ocr = ocr is True or (ocr is not None and isinstance(ocr, dict) and ocr.get('enabled', False))
|
700
|
+
if use_ocr:
|
701
|
+
# Process OCR parameter into normalized config
|
702
|
+
ocr_config = self._get_ocr_config(ocr)
|
703
|
+
|
704
|
+
# Apply OCR if explicitly enabled or in auto mode and no text found
|
705
|
+
if ocr_config.get('enabled') is True or ocr is True or (
|
706
|
+
ocr_config.get('enabled') == 'auto' and not result.strip()
|
707
|
+
):
|
708
|
+
print(f"Using OCR for extract_text")
|
709
|
+
# Get existing OCR elements or run OCR
|
710
|
+
if any(elem.source == 'ocr' for elem in filtered_elements):
|
711
|
+
# We already have OCR elements, just re-extract from them
|
712
|
+
ocr_elements = [elem for elem in filtered_elements if elem.source == 'ocr']
|
713
|
+
ocr_collection = ElementCollection(ocr_elements)
|
714
|
+
ocr_text = ocr_collection.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
|
715
|
+
|
716
|
+
if ocr_text.strip():
|
717
|
+
result = ocr_text
|
718
|
+
else:
|
719
|
+
# Run OCR and get text from OCR elements
|
720
|
+
ocr_elements = self.apply_ocr(**ocr_config)
|
721
|
+
|
722
|
+
if ocr_elements:
|
723
|
+
# Filter OCR elements by exclusions
|
724
|
+
if use_exclusions:
|
725
|
+
filtered_ocr = []
|
726
|
+
for element in ocr_elements:
|
727
|
+
exclude = False
|
728
|
+
for region in exclusion_regions:
|
729
|
+
if region._is_element_in_region(element):
|
730
|
+
exclude = True
|
731
|
+
break
|
732
|
+
if not exclude:
|
733
|
+
filtered_ocr.append(element)
|
734
|
+
else:
|
735
|
+
filtered_ocr = ocr_elements
|
736
|
+
|
737
|
+
ocr_collection = ElementCollection(filtered_ocr)
|
738
|
+
ocr_text = ocr_collection.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
|
739
|
+
|
740
|
+
# Use OCR text if it's not empty
|
741
|
+
if ocr_text.strip():
|
742
|
+
result = ocr_text
|
743
|
+
|
744
|
+
if debug_exclusions:
|
745
|
+
print(f"Page {self.index}: Extracted {len(result)} characters of text with exclusions applied")
|
746
|
+
|
747
|
+
return result
|
748
|
+
|
749
|
+
def extract_table(self, table_settings=None) -> List[Any]:
    """
    Extract the largest table from this page.

    The call is forwarded directly to the underlying pdfplumber page.

    Args:
        table_settings: Optional dict of extraction parameters passed on to
            pdfplumber. When omitted, a fresh empty dict is used.

    Returns:
        Whatever the pdfplumber page's ``extract_table`` returns
        (presumably the rows of the largest table, or None when no table
        is found - confirm against the installed pdfplumber version).
    """
    # Build a new dict per call: a ``{}`` default in the signature would be
    # one shared object that any callee mutation leaks into later calls.
    settings = {} if table_settings is None else table_settings

    # For now, directly use pdfplumber's extraction
    return self._page.extract_table(settings)
|
761
|
+
|
762
|
+
def extract_tables(self, table_settings=None) -> List[Any]:
    """
    Extract tables from this page.

    The call is forwarded directly to the underlying pdfplumber page.

    Args:
        table_settings: Optional dict of extraction parameters passed on to
            pdfplumber. When omitted, a fresh empty dict is used.

    Returns:
        List of extracted tables, as returned by the pdfplumber page
    """
    # Build a new dict per call: a ``{}`` default in the signature would be
    # one shared object that any callee mutation leaks into later calls.
    settings = {} if table_settings is None else table_settings

    # For now, directly use pdfplumber's extraction
    return self._page.extract_tables(settings)
|
774
|
+
|
775
|
+
def _load_elements(self, include_ocr=None):
    """
    Load all elements from the page (lazy loading).

    The first call builds character, word, rectangle and line elements from
    the underlying pdfplumber page and stores them in ``self._elements``.
    Calls made after that return immediately without doing any work.

    Words are built by sorting characters by rounded ``top`` and ``x0``,
    grouping them into lines by rounded ``top``, and splitting each line
    wherever the font attributes change or the horizontal gap exceeds a
    multiple of the previous character's width.

    Args:
        include_ocr: Whether to include OCR text elements. If None, uses PDF settings.
    """
    if self._elements is not None:
        return

    import re
    from itertools import groupby

    from natural_pdf.elements.text import TextElement
    from natural_pdf.elements.rect import RectangleElement
    from natural_pdf.elements.line import LineElement

    # Font attributes that must match for two chars to share a word.
    # Falling back to an empty list keeps the attribute loops safe when
    # no attributes are configured.
    font_attrs = self._font_attrs or []

    # Get keep_spaces setting from PDF config or default to True (new behavior)
    keep_spaces = self._parent._config.get('keep_spaces', True)

    # Use a wider gap threshold when spaces are kept inside words.
    gap_factor = 1.5 if keep_spaces else 0.5
    whitespace_run = re.compile(r'\s+')

    def annotate_real_fontname(char):
        """Map a short font reference (F0, F1, ...) to its BaseFont name."""
        font_ref = char.get('fontname', '')
        if not (font_ref.startswith('F') and len(font_ref) <= 3):
            return
        try:
            fonts = self._page.page_obj.get('Resources', {}).get('Font', {})
            if fonts and font_ref in fonts:
                base_font = fonts[font_ref].get('BaseFont')
                if base_font:
                    char['real_fontname'] = base_font
        except (KeyError, AttributeError, TypeError):
            # Best effort: leave the char unannotated if the lookup fails.
            pass

    def build_word(word_chars):
        """Merge a run of char dicts into a single word-level TextElement."""
        first = word_chars[0]
        joined = ''.join(c['text'] for c in word_chars)
        word_obj = {
            # Collapse multiple consecutive spaces into a single space
            'text': whitespace_run.sub(' ', joined),
            'x0': min(c['x0'] for c in word_chars),
            'x1': max(c['x1'] for c in word_chars),
            'top': min(c['top'] for c in word_chars),
            'bottom': max(c['bottom'] for c in word_chars),
            'fontname': first.get('fontname', ''),
            'size': first.get('size', 0),
            'object_type': 'word',
            'page_number': first['page_number'],
        }
        # Real font name and color come from the word's first character.
        for key in ('real_fontname', 'non_stroking_color'):
            if key in first:
                word_obj[key] = first[key]
        # Copy any additional font attributes
        for attr in font_attrs:
            if attr in first:
                word_obj[attr] = first[attr]
        return TextElement(word_obj, self)

    def starts_new_word(prev_char, char):
        """Return True when ``char`` cannot continue the word ending in ``prev_char``."""
        for attr in font_attrs:
            # A missing or differing attribute breaks the word.
            if attr not in char or attr not in prev_char or char[attr] != prev_char[attr]:
                return True
        return char['x0'] - prev_char['x1'] > prev_char['width'] * gap_factor

    # Process characters, annotating with font information
    page_chars = self._page.chars
    chars = []
    for c in page_chars:
        annotate_real_fontname(c)
        chars.append(TextElement(c, self))

    # Sort chars by y-position (line) and then x-position
    sorted_chars = sorted(page_chars, key=lambda c: (round(c['top']), c['x0']))

    words = []
    # Group chars by line (similar y-position)
    for _, line_chars in groupby(sorted_chars, key=lambda c: round(c['top'])):
        current_word = []
        for char in line_chars:
            if char['text'].isspace():
                if keep_spaces:
                    # Keep spaces inside a word; skip leading spaces on a line.
                    if current_word:
                        current_word.append(char)
                elif current_word:
                    # Original behavior: whitespace closes the current word.
                    words.append(build_word(current_word))
                    current_word = []
                continue

            if current_word and starts_new_word(current_word[-1], char):
                words.append(build_word(current_word))
                current_word = []
            current_word.append(char)

        # Handle the last word of the line if there is one
        if current_word:
            words.append(build_word(current_word))

    self._elements = {
        'chars': chars,
        'words': words,
        'rects': [RectangleElement(r, self) for r in self._page.rects],
        'lines': [LineElement(l, self) for l in self._page.lines],
        # Add other element types as needed
    }

    # Check if we should run OCR
    if include_ocr is True:
        # OCR is explicitly requested
        apply_ocr = True
    elif include_ocr is None and self._parent._ocr_config.get('enabled') == 'auto':
        # In auto mode, apply OCR if few or no text elements found
        apply_ocr = len(words) < 5  # Arbitrary threshold
    else:
        apply_ocr = False

    if apply_ocr:
        # OCR elements are already added to self._elements in apply_ocr()
        self.apply_ocr()
|
1036
|
+
|
1037
|
+
@property
def chars(self) -> List[Any]:
    """Return the character elements of this page, loading elements first."""
    self._load_elements()
    char_elements = self._elements['chars']
    return char_elements
|
1042
|
+
|
1043
|
+
@property
def words(self) -> List[Any]:
    """Return the word elements of this page, loading elements first."""
    self._load_elements()
    word_elements = self._elements['words']
    return word_elements
|
1048
|
+
|
1049
|
+
@property
def rects(self) -> List[Any]:
    """Return the rectangle elements of this page, loading elements first."""
    self._load_elements()
    rect_elements = self._elements['rects']
    return rect_elements
|
1054
|
+
|
1055
|
+
@property
def lines(self) -> List[Any]:
    """Return the line elements of this page, loading elements first."""
    self._load_elements()
    line_elements = self._elements['lines']
    return line_elements
|
1060
|
+
|
1061
|
+
@property
def _highlight_mgr(self) -> 'HighlightManager':
    """Return the page's highlight manager, creating it on first access."""
    manager = self._highlight_manager
    if manager is not None:
        return manager

    # Imported lazily, only when a manager actually has to be built.
    from natural_pdf.utils.highlighting import HighlightManager
    self._highlight_manager = HighlightManager(self)
    return self._highlight_manager
|
1068
|
+
|
1069
|
+
def highlight(self,
              color: Optional[Tuple[int, int, int, int]] = None,
              label: Optional[str] = None) -> 'Page':
    """
    Add a highlight that covers the whole page.

    Args:
        color: RGBA color tuple for the highlight, or None to use the next color
        label: Optional label for the highlight

    Returns:
        Self for method chaining
    """
    manager = self._highlight_mgr
    # The highlighted box runs from the origin to the page's width and height.
    page_bbox = (0, 0, self.width, self.height)
    manager.add_highlight(page_bbox, color, label)
    return self
|
1087
|
+
|
1088
|
+
def show(self,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = 'right',
         render_ocr: bool = False) -> Image.Image:
    """
    Render the page, including any highlights, as an image.

    This is a thin wrapper that forwards every argument to ``to_image``.

    Args:
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        PIL Image of the page with highlights
    """
    render_options = {
        'scale': scale,
        'width': width,
        'labels': labels,
        'legend_position': legend_position,
        'render_ocr': render_ocr,
    }
    return self.to_image(**render_options)
|
1115
|
+
|
1116
|
+
|
1117
|
+
|
1118
|
+
def save_image(self,
               filename: str,
               scale: float = 2.0,
               width: Optional[int] = None,
               labels: bool = True,
               legend_position: str = 'right',
               render_ocr: bool = False) -> 'Page':
    """
    Write the page, including any highlights, to an image file.

    The rendering and saving are delegated to ``to_image`` with
    ``path=filename``.

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        Self for method chaining
    """
    render_options = {
        'path': filename,
        'scale': scale,
        'width': width,
        'labels': labels,
        'legend_position': legend_position,
        'render_ocr': render_ocr,
    }
    # The image returned by to_image is not needed here; only the file matters.
    self.to_image(**render_options)
    return self
|
1149
|
+
|
1150
|
+
def debug_ocr(self, output_path):
    """
    Generate an interactive HTML debug report for OCR results.

    The report itself is produced by ``debug_ocr_to_html``, which is
    called with a one-element list holding this page. According to the
    original description it is a single-file HTML report with:
    - Side-by-side view of image regions and OCR text
    - Confidence scores with color coding
    - Editable correction fields
    - Filtering and sorting options
    - Export functionality for corrected text

    Args:
        output_path: Path to save the HTML report

    Returns:
        Path to the generated HTML file
    """
    from natural_pdf.utils.ocr import debug_ocr_to_html

    pages = [self]
    report_path = debug_ocr_to_html(pages, output_path)
    return report_path
|
1169
|
+
|
1170
|
+
def save(self,
         filename: str,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = False,
         legend_position: str = 'right') -> 'Page':
    """
    DEPRECATED: Use to_image() instead.
    Save the page with any highlights to an image file.

    Emits a DeprecationWarning and then delegates to ``to_image``.

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels (default: False)
        legend_position: Position of the legend

    Returns:
        Self for method chaining
    """
    import warnings
    warnings.warn(
        "save() is deprecated and will be removed in a future version. Use to_image() instead.",
        DeprecationWarning,
        stacklevel=2
    )
    self.to_image(
        path=filename,
        scale=scale,
        width=width,
        # to_image names this parameter ``labels``. The previous
        # ``show_labels`` keyword never reached it, so the caller's choice
        # was ignored and the stray keyword fell into **kwargs.
        labels=labels,
        legend_position=legend_position
    )
    return self
|
1194
|
+
|
1195
|
+
def clear_highlights(self) -> 'Page':
    """
    Remove every highlight from the page via its highlight manager.

    Returns:
        Self for method chaining
    """
    manager = self._highlight_mgr
    manager.clear_highlights()
    return self
|
1204
|
+
|
1205
|
+
def analyze_text_styles(self) -> Dict[str, 'ElementCollection']:
    """
    Group the page's text elements by style using ``TextStyleAnalyzer``.

    The result is also cached on the page as ``self._text_styles``.

    Returns:
        Dictionary mapping style labels to element collections
    """
    from natural_pdf.analyzers.text_structure import TextStyleAnalyzer

    # Run a fresh analyzer over this page and remember what it found.
    self._text_styles = TextStyleAnalyzer().analyze(self)
    return self._text_styles
|
1223
|
+
|
1224
|
+
def highlight_text_styles(self) -> 'Page':
    """
    Highlight text elements, one highlight group per detected style.

    If no style analysis has been stored on the page yet, it is run first.

    Returns:
        Self for method chaining
    """
    styles = self._text_styles
    if styles is None:
        # Nothing cached yet: analyze now, then read the stored result.
        self.analyze_text_styles()
        styles = self._text_styles

    # Each style group is highlighted under its own label.
    for style_label, style_elements in styles.items():
        style_elements.highlight(label=style_label)

    return self
|
1242
|
+
|
1243
|
+
def highlight_all(self,
                  include_types: Optional[List[str]] = None,
                  include_text_styles: bool = False,
                  include_layout_regions: bool = False,
                  apply_exclusions: bool = True,
                  use_color_cycling: bool = True,
                  layout_confidence: float = 0.2) -> 'Page':
    """
    Highlight all elements on the page, grouped by type or style.

    Each element type or style gets its own color and label in the legend.

    NOTE: when include_text_styles is True the method returns as soon as
    the style groups and the remaining element types are highlighted, so
    include_layout_regions has no effect in that mode.

    Args:
        include_types: Optional list of element types to include
                      (e.g., ['text', 'line', 'rect'])
                      If None, all available types will be included
        include_text_styles: Whether to highlight text by style groups
                            (font, size, etc.) instead of as a single group
        include_layout_regions: Whether to include detected layout regions
                               (will run layout detection if not already done)
                               Layout regions will be grouped by model and type
        apply_exclusions: Whether to respect exclusion zones (default: True)
        use_color_cycling: Whether to use different colors for each type (default: True)
        layout_confidence: Confidence threshold for layout regions (default: 0.2)
                          If True is passed, all regions will be included regardless of confidence

    Returns:
        Self for method chaining
    """
    # Load all elements if not already loaded
    self._load_elements()

    # Exclusion regions stay empty unless exclusions are requested and
    # defined, so a plain truthiness check is enough further down.
    exclusion_regions = []
    if apply_exclusions and self._exclusions:
        # Get exclusion regions using callable functions when appropriate
        exclusion_regions = self._get_exclusion_regions(include_callable=True)

    # Define all available element types
    all_types = {
        'text': self.words,
        'char': self.chars,
        'rect': self.rects,
        'line': self.lines,
        # Add other types as they become available
    }

    def highlight_type(element_type):
        """Highlight every element of one type under a shared legend label."""
        elements = all_types.get(element_type)
        # Skip unknown types and empty collections
        if not elements:
            return

        # Create an ElementCollection if needed
        from natural_pdf.elements.collections import ElementCollection
        if not isinstance(elements, ElementCollection):
            elements = ElementCollection(elements)

        if exclusion_regions:
            elements = elements.exclude_regions(exclusion_regions)

        # Format label (e.g., "text" -> "Text Elements")
        elements.highlight(
            label=f"{element_type.capitalize()} Elements",
            use_color_cycling=use_color_cycling
        )

    # Highlight by text styles if requested
    # This takes precedence over normal text highlighting
    if include_text_styles:
        styles = self.analyze_text_styles()
        for label, elements in styles.items():
            if exclusion_regions:
                elements = elements.exclude_regions(exclusion_regions)
            elements.highlight(label=label, use_color_cycling=use_color_cycling)

        # Text is already covered by the style groups. With an explicit
        # list every other requested type is highlighted; otherwise all
        # types except 'text' and 'char' are.
        if include_types:
            remaining_types = [t for t in include_types if t != 'text']
        else:
            remaining_types = [t for t in all_types if t not in ('text', 'char')]

        for element_type in remaining_types:
            # This call site previously passed an undefined ``cycle_colors``
            # name and raised NameError; the helper uses use_color_cycling.
            highlight_type(element_type)

        return self

    # Normal highlight_all behavior (by element type)
    types_to_highlight = include_types if include_types else all_types.keys()
    for element_type in types_to_highlight:
        highlight_type(element_type)

    # Include layout regions if requested
    if include_layout_regions:
        def stored_layout_regions():
            """Return regions from detected_layout_regions or _regions['detected']."""
            if hasattr(self, 'detected_layout_regions') and self.detected_layout_regions:
                return self.detected_layout_regions
            if 'detected' in self._regions and self._regions['detected']:
                return self._regions['detected']
            return []

        # Run layout detection if not already done
        if not stored_layout_regions():
            self.analyze_layout(confidence=layout_confidence)
        layout_regions = stored_layout_regions()

        # Filter regions by confidence; a bool (e.g. True) disables filtering
        if not isinstance(layout_confidence, bool):
            layout_regions = [
                r for r in layout_regions
                if hasattr(r, 'confidence') and r.confidence >= layout_confidence
            ]

        # Fixed colors for the table-structure types of the 'tatr' model
        tatr_colors = {
            'table': (1, 0, 0, 0.3),                # Red for tables
            'table row': (0, 1, 0, 0.3),            # Green for rows
            'table column': (0, 0, 1, 0.3),         # Blue for columns
            'table column header': (0, 1, 1, 0.3),  # Cyan for column headers
        }

        # Group regions by model and type for better visualization
        models = set(r.model for r in layout_regions if hasattr(r, 'model'))
        for model in models:
            model_regions = [
                r for r in layout_regions
                if hasattr(r, 'model') and r.model == model
            ]
            region_types = set(
                r.region_type for r in model_regions if hasattr(r, 'region_type')
            )

            for region_type in region_types:
                color = tatr_colors.get(region_type) if model == 'tatr' else None
                label = f"Layout ({model}): {region_type}"

                # Highlight each region on its own instead of through a
                # collection, as the original code did.
                for region in model_regions:
                    if hasattr(region, 'region_type') and region.region_type == region_type:
                        region.highlight(
                            label=label,
                            color=color,
                            use_color_cycling=use_color_cycling
                            # No include_attrs by default - user must explicitly request it
                        )

    return self
|
1458
|
+
|
1459
|
+
def to_image(self,
             path: Optional[str] = None,
             scale: float = 2.0,
             width: Optional[int] = None,
             labels: bool = True,
             legend_position: str = 'right',
             render_ocr: bool = False,
             resolution: float = None,
             include_highlights: bool = True,
             **kwargs) -> 'Image.Image':
    """
    Render this page as a PIL image, optionally resizing and saving it.

    When ``include_highlights`` is true and the page has a ``_highlight_mgr``,
    the image comes from the highlight manager (which receives ``scale``,
    ``labels``, ``legend_position`` and ``render_ocr``). Otherwise the plain
    page is rendered through the wrapped pdfplumber page at ``resolution`` DPI.

    Args:
        path: If given (and non-empty), the final image is saved to this path
        scale: Scale factor passed to the highlight manager (default: 2.0)
        width: If given and positive, the image is resized to this width in
            pixels; the height follows the aspect ratio
        labels: Whether the highlighted image should carry a legend
        legend_position: Where the legend goes (default: 'right')
        render_ocr: Whether OCR text is rendered on the highlighted image
        resolution: DPI for the plain page render (default: scale * 72).
            Not used by the highlighted render.
        include_highlights: Whether to use the highlighted render at all
        **kwargs: Extra keyword arguments forwarded to pdfplumber's to_image
            (plain render only)

    Returns:
        PIL Image of the page

    Examples:
        >>> # Plain page, no highlights
        >>> img = page.to_image(include_highlights=False)
        >>>
        >>> # Highlights without a legend
        >>> img = page.to_image(labels=False)
        >>>
        >>> # Save a fixed-width rendering
        >>> page.to_image(path="output.png", width=800)
    """
    # 72 DPI corresponds to scale 1.0; an explicit resolution takes priority.
    dpi = scale * 72 if resolution is None else resolution

    use_overlay = include_highlights and hasattr(self, '_highlight_mgr')
    if use_overlay:
        rendered = self._highlight_mgr.get_highlighted_image(
            scale, labels, legend_position, render_ocr
        )
    else:
        # `.annotated` is the PIL image held by pdfplumber's PageImage
        rendered = self._page.to_image(resolution=dpi, **kwargs).annotated

    if width is not None and width > 0:
        # Keep the aspect ratio: derive the height from the requested width
        target_height = int(width * (rendered.height / rendered.width))
        rendered = rendered.resize((width, target_height), Image.LANCZOS)

    if path:
        rendered.save(path)

    return rendered
|
1520
|
+
|
1521
|
+
def _get_ocr_config(self, ocr_params: Optional[Union[bool, str, List, Dict]] = None) -> Dict[str, Any]:
|
1522
|
+
"""
|
1523
|
+
Get the OCR configuration by merging defaults, PDF settings, and provided params.
|
1524
|
+
|
1525
|
+
Args:
|
1526
|
+
ocr_params: OCR parameters to override defaults
|
1527
|
+
|
1528
|
+
Returns:
|
1529
|
+
Merged OCR configuration
|
1530
|
+
"""
|
1531
|
+
if HAS_OCR_ENGINES and hasattr(self._parent, '_ocr_engine') and self._parent._ocr_engine:
|
1532
|
+
# Use new OCR engine system
|
1533
|
+
engine = self._parent._ocr_engine
|
1534
|
+
|
1535
|
+
# Get normalized PDF-level config
|
1536
|
+
pdf_config = self._parent._ocr_config
|
1537
|
+
|
1538
|
+
# Special case: If ocr_params is boolean True, convert to config with enabled=True
|
1539
|
+
if ocr_params is True:
|
1540
|
+
ocr_params = {"enabled": True}
|
1541
|
+
|
1542
|
+
# Normalize provided config
|
1543
|
+
if ocr_params is not None:
|
1544
|
+
provided_config = engine.normalize_config(ocr_params)
|
1545
|
+
|
1546
|
+
# If provided config explicitly sets enabled, respect that
|
1547
|
+
if "enabled" in provided_config:
|
1548
|
+
# Always merge configs to get language settings etc. from PDF-level config
|
1549
|
+
result_config = engine.merge_configs(pdf_config, provided_config)
|
1550
|
+
# Only print status if verbose mode is not explicitly disabled
|
1551
|
+
if provided_config.get('verbose', True):
|
1552
|
+
print(f"OCR enabled status from provided params: {provided_config.get('enabled')}")
|
1553
|
+
return result_config
|
1554
|
+
else:
|
1555
|
+
# Merge configs and keep PDF-level enabled status
|
1556
|
+
result_config = engine.merge_configs(pdf_config, provided_config)
|
1557
|
+
# Only print status if verbose mode is not explicitly disabled
|
1558
|
+
if provided_config.get('verbose', True):
|
1559
|
+
print(f"OCR enabled status from PDF config: {pdf_config.get('enabled')}")
|
1560
|
+
return result_config
|
1561
|
+
else:
|
1562
|
+
# Use PDF-level config
|
1563
|
+
# Only print status if verbose mode is not explicitly disabled
|
1564
|
+
if ocr_params is None or not isinstance(ocr_params, dict) or ocr_params.get('verbose', True):
|
1565
|
+
print(f"Using PDF-level OCR config: {pdf_config}")
|
1566
|
+
return pdf_config
|
1567
|
+
else:
|
1568
|
+
# Fallback to legacy OCR manager
|
1569
|
+
ocr_manager = OCRManager.get_instance()
|
1570
|
+
|
1571
|
+
# Get normalized PDF-level config
|
1572
|
+
pdf_config = self._parent._ocr_config
|
1573
|
+
|
1574
|
+
# Special case: If ocr_params is boolean True, convert to config with enabled=True
|
1575
|
+
if ocr_params is True:
|
1576
|
+
ocr_params = {"enabled": True}
|
1577
|
+
|
1578
|
+
# Normalize provided config
|
1579
|
+
if ocr_params is not None:
|
1580
|
+
provided_config = ocr_manager.normalize_config(ocr_params)
|
1581
|
+
|
1582
|
+
# If provided config explicitly sets enabled, respect that
|
1583
|
+
if "enabled" in provided_config:
|
1584
|
+
# Always merge configs to get language settings etc. from PDF-level config
|
1585
|
+
result_config = ocr_manager.merge_configs(pdf_config, provided_config)
|
1586
|
+
print(f"OCR enabled status from provided params: {provided_config.get('enabled')}")
|
1587
|
+
return result_config
|
1588
|
+
else:
|
1589
|
+
# Merge configs and keep PDF-level enabled status
|
1590
|
+
result_config = ocr_manager.merge_configs(pdf_config, provided_config)
|
1591
|
+
print(f"OCR enabled status from PDF config: {pdf_config.get('enabled')}")
|
1592
|
+
return result_config
|
1593
|
+
else:
|
1594
|
+
# Use PDF-level config
|
1595
|
+
print(f"Using PDF-level OCR config: {pdf_config}")
|
1596
|
+
return pdf_config
|
1597
|
+
|
1598
|
+
def _create_text_elements_from_ocr(self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None) -> List[TextElement]:
|
1599
|
+
"""
|
1600
|
+
Convert OCR results to TextElement objects.
|
1601
|
+
|
1602
|
+
Args:
|
1603
|
+
ocr_results: List of OCR results with text, bbox, and confidence
|
1604
|
+
image_width: Width of the source image (for coordinate scaling)
|
1605
|
+
image_height: Height of the source image (for coordinate scaling)
|
1606
|
+
|
1607
|
+
Returns:
|
1608
|
+
List of created TextElement objects
|
1609
|
+
"""
|
1610
|
+
elements = []
|
1611
|
+
|
1612
|
+
# Calculate scale factors to convert from image coordinates to PDF coordinates
|
1613
|
+
# Default to 1.0 if not provided (assume coordinates are already in PDF space)
|
1614
|
+
scale_x = 1.0
|
1615
|
+
scale_y = 1.0
|
1616
|
+
|
1617
|
+
if image_width and image_height:
|
1618
|
+
scale_x = self.width / image_width
|
1619
|
+
scale_y = self.height / image_height
|
1620
|
+
|
1621
|
+
for result in ocr_results:
|
1622
|
+
# Convert numpy int32 to float if needed and scale to PDF coordinates
|
1623
|
+
x0 = float(result['bbox'][0]) * scale_x
|
1624
|
+
top = float(result['bbox'][1]) * scale_y
|
1625
|
+
x1 = float(result['bbox'][2]) * scale_x
|
1626
|
+
bottom = float(result['bbox'][3]) * scale_y
|
1627
|
+
|
1628
|
+
# Create a TextElement object with additional required fields for highlighting
|
1629
|
+
element_data = {
|
1630
|
+
'text': result['text'],
|
1631
|
+
'x0': x0,
|
1632
|
+
'top': top,
|
1633
|
+
'x1': x1,
|
1634
|
+
'bottom': bottom,
|
1635
|
+
'width': x1 - x0,
|
1636
|
+
'height': bottom - top,
|
1637
|
+
'object_type': 'text',
|
1638
|
+
'source': 'ocr',
|
1639
|
+
'confidence': result['confidence'],
|
1640
|
+
# Add default font information to work with existing expectations
|
1641
|
+
'fontname': 'OCR-detected',
|
1642
|
+
'size': 10.0,
|
1643
|
+
'page_number': self.number
|
1644
|
+
}
|
1645
|
+
|
1646
|
+
elem = TextElement(element_data, self)
|
1647
|
+
elements.append(elem)
|
1648
|
+
|
1649
|
+
# Add to page's elements
|
1650
|
+
if hasattr(self, '_elements') and self._elements is not None:
|
1651
|
+
# Add to words list to make it accessible via standard API
|
1652
|
+
if 'words' in self._elements:
|
1653
|
+
self._elements['words'].append(elem)
|
1654
|
+
else:
|
1655
|
+
self._elements['words'] = [elem]
|
1656
|
+
|
1657
|
+
return elements
|
1658
|
+
|
1659
|
+
def apply_ocr(self, **ocr_params) -> List['TextElement']:
    """
    Apply OCR to this page and register results as text elements.

    The page is rendered without highlight overlays, passed to the parent's
    OCR engine (or the legacy ``OCRManager`` when no engine is configured),
    and the results are converted to text elements via
    ``_create_text_elements_from_ocr``, which also adds them to the page.

    Args:
        **ocr_params: OCR parameters to override defaults. ``enabled`` is
            always forced to True for this explicit call.

    Returns:
        List of created text elements
    """
    # An explicit OCR call must run even if OCR is disabled at the PDF level
    ocr_params["enabled"] = True
    config = self._get_ocr_config(ocr_params)

    # Safety net in case the merge dropped the override
    if not config.get('enabled'):
        print("OCR is disabled in config despite override - forcing enabled=True")
        config["enabled"] = True

    # Render the clean page: highlight overlays (and a legend, if any) would
    # otherwise end up in the OCR input and could distort the image-to-PDF
    # scaling computed from the image size below.
    print(f"Rendering page {self.number} to image for OCR...")
    image = self.to_image(include_highlights=False)
    print(f"Image size: {image.width}x{image.height}")

    # No debug copy of the image is written to disk here; callers that want
    # one can save the result of to_image() themselves.

    print("Processing image with OCR engine...")
    if HAS_OCR_ENGINES and hasattr(self._parent, '_ocr_engine') and self._parent._ocr_engine:
        # Use new OCR engine system
        engine = self._parent._ocr_engine
        print(f"Using OCR engine: {engine.__class__.__name__}")
        results = engine.process_image(image, config)
    else:
        # Fallback to legacy OCR manager
        print("Using legacy OCR manager")
        results = OCRManager.get_instance().detect_and_recognize(image, config)

    print(f"OCR returned {len(results)} results")

    # Image dimensions are passed along so bboxes get scaled to PDF coordinates
    return self._create_text_elements_from_ocr(results, image.width, image.height)
|
1718
|
+
|
1719
|
+
def extract_ocr_elements(self, **ocr_params) -> List['TextElement']:
    """
    Extract text elements using OCR.

    This method applies OCR to the page and returns the resulting text elements
    without modifying the page's elements list. The page is rendered once,
    without highlight overlays, and processed by the parent's OCR engine (or
    the legacy ``OCRManager`` when no engine is configured).

    Args:
        **ocr_params: OCR parameters to override defaults. ``enabled`` is
            always forced to True for this explicit call.

    Returns:
        List of text elements created from OCR; an empty list if the OCR
        back-end raised an exception (the error is printed)
    """
    print("=" * 40)
    print(f"Page.extract_ocr_elements called with params: {ocr_params}")

    # An explicit OCR call must run even if OCR is disabled at the PDF level
    ocr_params["enabled"] = True
    config = self._get_ocr_config(ocr_params)
    print(f"OCR config after normalization: {config}")

    # Safety net in case the merge dropped the override
    if not config.get('enabled'):
        print("OCR is disabled in config despite override - forcing enabled=True")
        config["enabled"] = True

    # Render the clean page exactly once. Highlight overlays would pollute the
    # OCR input and could distort the image-to-PDF scaling derived from the
    # image size. No throwaway "direct" EasyOCR/PaddleOCR passes are run and
    # no debug image is written: they never influenced the returned elements.
    print(f"Rendering page {self.number} to image for OCR...")
    image = self.to_image(include_highlights=False)
    print(f"Image size: {image.width}x{image.height}")

    print("Processing image with OCR engine...")
    try:
        if HAS_OCR_ENGINES and hasattr(self._parent, '_ocr_engine') and self._parent._ocr_engine:
            # Use new OCR engine system
            engine = self._parent._ocr_engine
            print(f"Using OCR engine: {engine.__class__.__name__}")
            results = engine.process_image(image, config)
            print(f"Engine returned {len(results)} results")
        else:
            # Fallback to legacy OCR manager
            print("Using legacy OCR manager")
            results = OCRManager.get_instance().detect_and_recognize(image, config)
            print(f"OCR manager returned {len(results)} results")
    except Exception as e:
        # Best-effort API: report the failure and hand back nothing
        print(f"Error during OCR processing: {e}")
        import traceback
        traceback.print_exc()
        return []

    print(f"OCR returned {len(results)} results")
    if len(results) > 0:
        print(f"First result: {results[0]}")

    # _create_text_elements_from_ocr registers elements on the page whenever
    # self._elements is not None, so hide the registry while it runs.
    original_elements = None
    if hasattr(self, '_elements'):
        original_elements = self._elements
        self._elements = None

    try:
        print(f"Creating text elements from {len(results)} OCR results...")
        elements = self._create_text_elements_from_ocr(results, image.width, image.height)
        print(f"Created {len(elements)} text elements")
    finally:
        # Restore even if element creation raised, so the page never stays
        # stuck with _elements = None
        if original_elements is not None:
            self._elements = original_elements

    print(f"Returning {len(elements)} OCR elements")
    print("=" * 40)
    return elements
|
1854
|
+
|
1855
|
+
def analyze_layout(self,
                   model: str = "yolo",
                   confidence: float = 0.2,
                   classes: Optional[List[str]] = None,
                   exclude_classes: Optional[List[str]] = None,
                   device: str = "cpu",
                   existing: str = "replace",
                   model_params: Optional[Dict[str, Any]] = None,
                   # Legacy parameters for backward compatibility
                   model_path: Optional[str] = None,
                   image_size: int = 1024,
                   create_cells: bool = False) -> 'Page':
    """
    Analyze the page layout using a machine learning model.

    The page is rendered at 150 DPI (without highlights) into a temporary
    file, the selected detector runs on that file, and each detection is
    converted into a Region in PDF coordinates. The regions are stored in
    ``self._regions['detected']``, ``self._elements['regions']`` and
    ``self.detected_layout_regions``.

    Args:
        model: Model type to use ('yolo', 'tatr', or 'paddle');
            'table-transformer' is accepted as an alias for the TATR detector
        confidence: Minimum confidence threshold for detections
        classes: Specific classes to detect (None for all supported classes)
        exclude_classes: Classes to exclude from detection
        device: Device to use for inference ('cpu' or 'cuda:0'/'gpu')
        existing: How to handle existing regions: 'replace' (default) or 'append'
        model_params: Dictionary of model-specific parameters (not modified):
            - YOLO: {"model_path": "...", "image_size": 1024}
            - TATR: {"model_path": "...", "create_cells": False}
            - Paddle: {"lang": "en", "use_angle_cls": False, "enable_table": True}
        model_path: (Legacy) Optional path to custom model file
        image_size: (Legacy) Size to resize the image to before detection (YOLO only)
        create_cells: (Legacy) Whether to create cell regions for TATR table regions

    Returns:
        Self for method chaining

    Raises:
        ValueError: If ``model`` is not one of the supported model types
    """
    # Work on a private copy so legacy-parameter folding never leaks into the
    # dict the caller handed in.
    model_params = dict(model_params) if model_params else {}
    model_key = model.lower()
    append_mode = existing.lower() == 'append'

    # Fold legacy keyword arguments into model_params
    if model_path is not None:
        model_params['model_path'] = model_path
    if model_key == "yolo" and image_size != 1024:
        model_params['image_size'] = image_size
    if model_key == "tatr" and create_cells:
        model_params['create_cells'] = create_cells

    # The detectors take a file path, so the page image goes through a
    # temporary directory that is removed (with its contents) on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_image_path = os.path.join(temp_dir, f"page_{self.index}.png")

        # include_highlights=False guarantees the original page image
        page_image = self.to_image(resolution=150.0, include_highlights=False)
        page_image.save(temp_image_path)

        # Initialize the appropriate detector based on the model type
        if model_key == "yolo":
            detector = YOLODocLayoutDetector(
                model_file=model_params.get('model_path', "doclayout_yolo_docstructbench_imgsz1024.pt"),
                device=device
            )
            detections = detector.detect(
                temp_image_path,
                confidence=confidence,
                classes=classes,
                exclude_classes=exclude_classes,
                image_size=model_params.get('image_size', 1024)
            )

        elif model_key == "tatr" or model_key == "table-transformer":
            tatr_model_path = model_params.get('model_path')
            detector = TableTransformerDetector(
                detection_model="microsoft/table-transformer-detection" if tatr_model_path is None else tatr_model_path,
                device=device
            )
            detections = detector.detect(
                temp_image_path,
                confidence=confidence,
                classes=classes,
                exclude_classes=exclude_classes
            )

        elif model_key == "paddle":
            detector = PaddleLayoutDetector(
                lang=model_params.get('lang', 'en'),
                use_angle_cls=model_params.get('use_angle_cls', False),
                # Translate the torch-style device name into 'gpu'
                device='gpu' if device.startswith('cuda') else device,
                enable_table=model_params.get('enable_table', True),
                show_log=model_params.get('show_log', False)
            )
            detections = detector.detect(
                temp_image_path,
                confidence=confidence,
                classes=classes,
                exclude_classes=exclude_classes
            )

        else:
            raise ValueError(f"Unsupported model type: {model}. Currently supported: 'yolo', 'tatr', 'paddle'")

        # Scale factors from rendered-image pixels to PDF coordinates
        scale_x = self.width / page_image.width
        scale_y = self.height / page_image.height

        # Convert detections to regions
        layout_regions = []
        for detection in detections:
            x_min, y_min, x_max, y_max = detection['bbox']

            region = Region(self, (x_min * scale_x, y_min * scale_y,
                                   x_max * scale_x, y_max * scale_y))
            region.region_type = detection['class']
            region.normalized_type = detection['normalized_class']
            region.confidence = detection['confidence']
            region.model = model  # Store which model detected this region
            region.source = 'detected'  # Set the source for selectors

            layout_regions.append(region)

        # Handle existing regions based on mode
        if append_mode:
            if 'detected' not in self._regions:
                self._regions['detected'] = []
            self._regions['detected'].extend(layout_regions)
        else:
            # Store a copy: layout_regions grows below when cells are created,
            # and sharing one list would register every cell twice.
            self._regions['detected'] = list(layout_regions)

        # Make sure elements is initialized
        self._load_elements()

        # Update elements collection for selectors
        if 'regions' not in self._elements:
            self._elements['regions'] = []

        if append_mode:
            # Only add new regions that aren't already in the collection
            for region in layout_regions:
                if region not in self._elements['regions']:
                    self._elements['regions'].append(region)
        else:
            # Keep named regions, replace everything else with the new detections
            named_regions = [r for r in self._elements['regions'] if r.source == 'named']
            self._elements['regions'] = named_regions + layout_regions

        # Create cells for table regions if requested and using TATR
        if model_key == 'tatr' and model_params.get('create_cells', create_cells):
            # Snapshot the tables first; layout_regions is extended in the loop
            tables = [r for r in layout_regions if r.region_type == 'table']
            print(f"Creating cells for {len(tables)} table regions")

            cell_count = 0
            for table in tables:
                try:
                    cells = table.create_cells()
                    cell_count += len(cells)

                    # Track the cells everywhere the detected regions live
                    layout_regions.extend(cells)
                    if 'regions' in self._elements:
                        self._elements['regions'].extend(cells)
                    self._regions['detected'].extend(cells)

                except Exception as e:
                    # Best effort: one failing table must not abort the analysis
                    print(f"Error creating cells for table: {e}")

            print(f"Created {cell_count} cells in total")

        # Keep the regions reachable after the method returns
        self.detected_layout_regions = layout_regions
        return self
|
2071
|
+
|
2072
|
+
def highlight_layout(self,
                     layout_regions: Optional[List['Region']] = None,
                     confidence: float = 0.2,
                     label_format: str = "{type} ({conf:.2f}){model}") -> 'Page':
    """
    Highlight detected layout regions on the page.

    Regions are taken from the first non-empty source among: the
    ``layout_regions`` argument, ``self.detected_layout_regions``,
    ``self._regions['detected']``. If all are empty, ``analyze_layout`` is
    run with the given confidence and its results are used.

    Args:
        layout_regions: List of regions to highlight
        confidence: Minimum confidence threshold for highlighting regions
        label_format: Format string for region labels; receives ``type``,
            ``conf`` and ``model`` (" (<model>)" or an empty string)

    Returns:
        Self for method chaining
    """
    # Fixed colors for Table Transformer structure classes; any other TATR
    # class maps to None.
    tatr_palette = {
        'table': (1, 0, 0, 0.3),                # Red for tables
        'table row': (0, 1, 0, 0.3),            # Green for rows
        'table column': (0, 0, 1, 0.3),         # Blue for columns
        'table column header': (0, 1, 1, 0.3),  # Cyan for column headers
    }

    # Pick the first source that actually holds regions
    if layout_regions:
        candidates = layout_regions
    elif getattr(self, 'detected_layout_regions', None):
        candidates = self.detected_layout_regions
    elif 'detected' in self._regions and self._regions['detected']:
        candidates = self._regions['detected']
    else:
        self.analyze_layout(confidence=confidence)
        candidates = self.detected_layout_regions

    for candidate in candidates:
        # Skip regions below confidence threshold
        if candidate.confidence < confidence:
            continue

        # No model filtering here - use selectors for that
        knows_model = hasattr(candidate, 'model')
        label = label_format.format(
            type=candidate.region_type,
            conf=candidate.confidence,
            model=f" ({candidate.model})" if knows_model else ""
        )

        if knows_model and candidate.model == 'tatr':
            candidate.highlight(label=label, color=tatr_palette.get(candidate.region_type))
        else:
            # Non-TATR regions are highlighted without an explicit color
            candidate.highlight(label=label)

    return self
|
2133
|
+
|
2134
|
+
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> 'Region':
    """
    Get a section between two elements on this page.

    The work is delegated to a region spanning the whole page
    (0, 0, width, height).

    Args:
        start_element: Element marking the start of the section
        end_element: Element marking the end of the section
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'

    Returns:
        Region representing the section between elements
    """
    whole_page = self.create_region(0, 0, self.width, self.height)
    section_args = {
        'start_element': start_element,
        'end_element': end_element,
        'boundary_inclusion': boundary_inclusion,
    }
    return whole_page.get_section_between(**section_args)
|
2155
|
+
|
2156
|
+
def get_sections(self,
                 start_elements=None,
                 end_elements=None,
                 boundary_inclusion='both',
                 y_threshold=5.0,
                 bounding_box=None):
    """
    Get sections of a page defined by start/end elements.

    Start and end markers are sorted by their ``top`` coordinate and merged
    into lines: a marker joins the current line when its ``top`` is within
    ``y_threshold`` of the previously processed marker's ``top``. A line
    counts as a start if it contains at least one start marker, and is
    represented by its leftmost element (smallest ``x0``).

    Sections are then produced as follows:
      * start followed by end   -> one section, ``is_end_next_start`` False
      * start followed by start -> one section ending at the second start,
        ``is_end_next_start`` True (the second start also opens the next one)
      * trailing start          -> one section running down to the bottom
        bound, with ``end_element`` set to None
      * an end with no open start is ignored

    Args:
        start_elements: Elements or selector string that mark the start of sections
        end_elements: Elements or selector string that mark the end of sections
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
        y_threshold: Maximum vertical difference to consider elements on same line
        bounding_box: Optional tuple (x0, top, x1, bottom) to limit the section area.
            Only x0 and x1 (horizontal extent of every section) and bottom
            (lower edge of a trailing, unterminated section) are used; the
            tuple's top value is not consulted.

    Returns:
        List of Region objects representing the sections. Each region has
        ``start_element``, ``end_element`` and ``is_end_next_start`` set.

    Raises:
        ValueError: If ``boundary_inclusion`` is not one of the accepted values.
    """
    # Selector strings are resolved to elements through the page's own search.
    if isinstance(start_elements, str):
        start_elements = self.find_all(start_elements)

    if isinstance(end_elements, str):
        end_elements = self.find_all(end_elements)

    valid_inclusions = ['start', 'end', 'both', 'none']
    if boundary_inclusion not in valid_inclusions:
        raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")

    # Without any start marker there is nothing to section.
    if not start_elements:
        return []

    # Horizontal extent and lower limit shared by every section.
    if bounding_box:
        left, right, lower_limit = bounding_box[0], bounding_box[2], bounding_box[3]
    else:
        left, right, lower_limit = 0, self.width, self.height

    include_start = boundary_inclusion in ('start', 'both')
    include_end = boundary_inclusion in ('end', 'both')

    def make_region(start, end, end_is_next_start):
        """Build one section region from ``start`` down to ``end`` (or the lower limit)."""
        top = start.top if include_start else start.bottom
        if end is None:
            bottom = lower_limit
        elif include_end:
            bottom = end.bottom
        else:
            bottom = end.top

        region = self.create_region(left, top, right, bottom)
        region.start_element = start
        region.end_element = end
        region.is_end_next_start = end_is_next_start
        return region

    # Tag every marker with its role, then order top-to-bottom. The sort is
    # stable, so start markers stay ahead of end markers with an equal top.
    markers = [(element, 'start') for element in start_elements]
    if end_elements:
        markers.extend((element, 'end') for element in end_elements)
    markers.sort(key=lambda pair: pair[0].top)

    # Merge markers that sit on the same visual line into a single boundary.
    boundaries = []
    line, line_type, last_top = [], None, None
    for element, kind in markers:
        if line and abs(element.top - last_top) <= y_threshold:
            # Mixed line: a start marker takes priority over end markers.
            if kind == 'start':
                line_type = 'start'
            line.append(element)
        else:
            if line:
                boundaries.append((min(line, key=lambda e: e.x0), line_type))
            line, line_type = [element], kind
        # The reference Y follows the most recent marker, not the line's first.
        last_top = element.top

    if line:
        boundaries.append((min(line, key=lambda e: e.x0), line_type))

    # Walk the boundaries and pair each open start with whatever closes it.
    regions = []
    open_start = None
    for element, kind in boundaries:
        if kind == 'start':
            if open_start is not None:
                # A new start implicitly terminates the previous section.
                regions.append(make_region(open_start, element, True))
            open_start = element
        elif open_start is not None:
            regions.append(make_region(open_start, element, False))
            # Reset so the same start is not reused by a later end marker.
            open_start = None

    # A start that was never closed runs to the lower limit.
    if open_start is not None:
        regions.append(make_region(open_start, None, False))

    return regions
|
2332
|
+
|
2333
|
+
def __repr__(self) -> str:
    """Return a short debug string showing the page's number and index."""
    number, index = self.number, self.index
    return f"<Page number={number} index={index}>"
|
2336
|
+
|
2337
|
+
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Answer a question about this page with the document QA engine.

    The QA engine is imported lazily. The question, together with this page,
    is handed to the engine's ``ask_pdf_page`` and its result is returned
    unchanged. If an ImportError is raised while importing or running the
    engine, a warning is logged and a fallback dictionary is returned.

    Args:
        question: The question to ask about the page content
        min_confidence: Minimum confidence threshold for answers (0.0-1.0)
        model: Optional model name to use for QA (if None or empty, the
            engine is requested without a model name)
        debug: Forwarded to the QA engine's ``ask_pdf_page`` as ``debug``
        **kwargs: Additional parameters to pass to the QA engine

    Returns:
        Dictionary with answer details: {
            "answer": extracted text,
            "confidence": confidence score,
            "found": whether an answer was found,
            "page_num": page number,
            "source_elements": list of elements that contain the answer (if found)
        }
        When QA is unavailable, the dictionary holds only "answer" (empty),
        "confidence" (0.0), "error" and "found" (False).
    """
    try:
        from natural_pdf.qa.document_qa import get_qa_engine

        # Only name a model when the caller actually supplied one.
        engine_options = {"model_name": model} if model else {}
        engine = get_qa_engine(**engine_options)

        return engine.ask_pdf_page(
            self,
            question,
            min_confidence=min_confidence,
            debug=debug,
            **kwargs
        )
    except ImportError as exc:
        import logging

        logging.getLogger("natural_pdf.core.page").warning(f"QA functionality not available: {exc}")

        # Fallback payload signalling that no answer could be produced.
        fallback = {}
        fallback["answer"] = ""
        fallback["confidence"] = 0.0
        fallback["error"] = "QA functionality not available"
        fallback["found"] = False
        return fallback
|