natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1,51 +1,63 @@
|
|
1
|
-
import pdfplumber
|
2
|
-
import os
|
3
|
-
import logging
|
4
|
-
import tempfile
|
5
|
-
from typing import List, Optional, Union, Any, Dict, Callable, TYPE_CHECKING, Tuple
|
6
|
-
from PIL import Image
|
7
1
|
import base64
|
2
|
+
import hashlib
|
8
3
|
import io
|
9
4
|
import json
|
5
|
+
import logging
|
6
|
+
import os
|
10
7
|
import re
|
11
|
-
import
|
8
|
+
import tempfile
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
11
|
+
|
12
|
+
import pdfplumber
|
13
|
+
from PIL import Image
|
12
14
|
|
13
15
|
from natural_pdf.elements.collections import ElementCollection
|
14
16
|
from natural_pdf.elements.region import Region
|
15
17
|
|
16
18
|
if TYPE_CHECKING:
|
17
19
|
import pdfplumber
|
18
|
-
|
19
|
-
from natural_pdf.elements.collections import ElementCollection
|
20
|
+
|
20
21
|
from natural_pdf.core.highlighting_service import HighlightingService
|
22
|
+
from natural_pdf.core.pdf import PDF
|
21
23
|
from natural_pdf.elements.base import Element
|
24
|
+
from natural_pdf.elements.collections import ElementCollection
|
22
25
|
|
23
|
-
|
26
|
+
# New Imports
|
27
|
+
import itertools
|
28
|
+
|
29
|
+
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
30
|
+
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
31
|
+
|
32
|
+
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
24
33
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
25
34
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
26
|
-
from natural_pdf.ocr import OCROptions
|
27
|
-
from natural_pdf.ocr import OCRManager
|
28
|
-
from natural_pdf.core.element_manager import ElementManager
|
29
|
-
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
30
|
-
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
31
35
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
36
|
+
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
37
|
+
from natural_pdf.core.element_manager import ElementManager
|
38
|
+
from natural_pdf.elements.text import TextElement
|
39
|
+
from natural_pdf.ocr import OCRManager, OCROptions
|
40
|
+
|
41
|
+
# Import new utils
|
42
|
+
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
32
43
|
from natural_pdf.widgets import InteractiveViewerWidget
|
33
|
-
from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
|
44
|
+
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
34
45
|
|
35
46
|
logger = logging.getLogger(__name__)
|
36
47
|
|
48
|
+
|
37
49
|
class Page:
|
38
50
|
"""
|
39
51
|
Enhanced Page wrapper built on top of pdfplumber.Page.
|
40
|
-
|
52
|
+
|
41
53
|
This class provides a fluent interface for working with PDF pages,
|
42
54
|
with improved selection, navigation, extraction, and question-answering capabilities.
|
43
55
|
"""
|
44
|
-
|
45
|
-
def __init__(self, page:
|
56
|
+
|
57
|
+
def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
|
46
58
|
"""
|
47
59
|
Initialize a page wrapper.
|
48
|
-
|
60
|
+
|
49
61
|
Args:
|
50
62
|
page: pdfplumber page object
|
51
63
|
parent: Parent PDF object
|
@@ -57,39 +69,51 @@ class Page:
|
|
57
69
|
self._index = index
|
58
70
|
self._text_styles = None # Lazy-loaded text style analyzer results
|
59
71
|
self._exclusions = [] # List to store exclusion functions/regions
|
60
|
-
|
72
|
+
|
61
73
|
# Region management
|
62
74
|
self._regions = {
|
63
|
-
|
64
|
-
|
75
|
+
"detected": [], # Layout detection results
|
76
|
+
"named": {}, # Named regions (name -> region)
|
65
77
|
}
|
66
|
-
|
78
|
+
|
67
79
|
# Initialize ElementManager
|
68
80
|
self._element_mgr = ElementManager(self, font_attrs)
|
69
81
|
|
70
82
|
# --- Get OCR Manager Instance ---
|
71
|
-
if
|
83
|
+
if (
|
84
|
+
OCRManager
|
85
|
+
and hasattr(parent, "_ocr_manager")
|
86
|
+
and isinstance(parent._ocr_manager, OCRManager)
|
87
|
+
):
|
72
88
|
self._ocr_manager = parent._ocr_manager
|
73
89
|
logger.debug(f"Page {self.number}: Using OCRManager instance from parent PDF.")
|
74
90
|
else:
|
75
91
|
self._ocr_manager = None
|
76
92
|
if OCRManager:
|
77
|
-
|
93
|
+
logger.warning(
|
94
|
+
f"Page {self.number}: OCRManager instance not found on parent PDF object."
|
95
|
+
)
|
78
96
|
|
79
97
|
# --- Get Layout Manager Instance ---
|
80
|
-
if
|
98
|
+
if (
|
99
|
+
LayoutManager
|
100
|
+
and hasattr(parent, "_layout_manager")
|
101
|
+
and isinstance(parent._layout_manager, LayoutManager)
|
102
|
+
):
|
81
103
|
self._layout_manager = parent._layout_manager
|
82
104
|
logger.debug(f"Page {self.number}: Using LayoutManager instance from parent PDF.")
|
83
105
|
else:
|
84
106
|
self._layout_manager = None
|
85
107
|
if LayoutManager:
|
86
|
-
|
108
|
+
logger.warning(
|
109
|
+
f"Page {self.number}: LayoutManager instance not found on parent PDF object. Layout analysis will fail."
|
110
|
+
)
|
87
111
|
|
88
112
|
# Initialize the internal variable with a single underscore
|
89
|
-
self._layout_analyzer = None
|
113
|
+
self._layout_analyzer = None
|
90
114
|
|
91
115
|
@property
|
92
|
-
def pdf(self) ->
|
116
|
+
def pdf(self) -> "PDF":
|
93
117
|
"""Provides public access to the parent PDF object."""
|
94
118
|
return self._parent
|
95
119
|
|
@@ -97,7 +121,7 @@ class Page:
|
|
97
121
|
def number(self) -> int:
|
98
122
|
"""Get page number (1-based)."""
|
99
123
|
return self._page.page_number
|
100
|
-
|
124
|
+
|
101
125
|
@property
|
102
126
|
def page_number(self) -> int:
|
103
127
|
"""Get page number (1-based)."""
|
@@ -107,12 +131,12 @@ class Page:
|
|
107
131
|
def index(self) -> int:
|
108
132
|
"""Get page index (0-based)."""
|
109
133
|
return self._index
|
110
|
-
|
134
|
+
|
111
135
|
@property
|
112
136
|
def width(self) -> float:
|
113
137
|
"""Get page width."""
|
114
138
|
return self._page.width
|
115
|
-
|
139
|
+
|
116
140
|
@property
|
117
141
|
def height(self) -> float:
|
118
142
|
"""Get page height."""
|
@@ -120,107 +144,125 @@ class Page:
|
|
120
144
|
|
121
145
|
# --- Highlighting Service Accessor ---
|
122
146
|
@property
|
123
|
-
def _highlighter(self) ->
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
147
|
+
def _highlighter(self) -> "HighlightingService":
|
148
|
+
"""Provides access to the parent PDF's HighlightingService."""
|
149
|
+
if not hasattr(self._parent, "highlighter"):
|
150
|
+
# This should ideally not happen if PDF.__init__ works correctly
|
151
|
+
raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
|
152
|
+
return self._parent.highlighter
|
129
153
|
|
130
|
-
def clear_exclusions(self) ->
|
154
|
+
def clear_exclusions(self) -> "Page":
|
131
155
|
"""
|
132
156
|
Clear all exclusions from the page.
|
133
157
|
"""
|
134
158
|
self._exclusions = []
|
135
159
|
return self
|
136
160
|
|
137
|
-
def add_exclusion(
|
161
|
+
def add_exclusion(
|
162
|
+
self,
|
163
|
+
exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
|
164
|
+
label: Optional[str] = None,
|
165
|
+
) -> "Page":
|
138
166
|
"""
|
139
167
|
Add an exclusion to the page. Text from these regions will be excluded from extraction.
|
140
168
|
Ensures non-callable items are stored as Region objects if possible.
|
141
|
-
|
169
|
+
|
142
170
|
Args:
|
143
171
|
exclusion_func_or_region: Either a callable function returning a Region,
|
144
172
|
a Region object, or another object with a valid .bbox attribute.
|
145
173
|
label: Optional label for this exclusion (e.g., 'header', 'footer').
|
146
|
-
|
174
|
+
|
147
175
|
Returns:
|
148
176
|
Self for method chaining
|
149
|
-
|
177
|
+
|
150
178
|
Raises:
|
151
179
|
TypeError: If a non-callable, non-Region object without a valid bbox is provided.
|
152
180
|
"""
|
153
|
-
exclusion_data = None
|
181
|
+
exclusion_data = None # Initialize exclusion data
|
154
182
|
|
155
183
|
if callable(exclusion_func_or_region):
|
156
184
|
# Store callable functions along with their label
|
157
185
|
exclusion_data = (exclusion_func_or_region, label)
|
158
|
-
logger.debug(
|
186
|
+
logger.debug(
|
187
|
+
f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}"
|
188
|
+
)
|
159
189
|
elif isinstance(exclusion_func_or_region, Region):
|
160
190
|
# Store Region objects directly, assigning the label
|
161
|
-
exclusion_func_or_region.label = label
|
162
|
-
exclusion_data = (exclusion_func_or_region, label)
|
163
|
-
logger.debug(
|
164
|
-
|
191
|
+
exclusion_func_or_region.label = label # Assign label
|
192
|
+
exclusion_data = (exclusion_func_or_region, label) # Store as tuple for consistency
|
193
|
+
logger.debug(
|
194
|
+
f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}"
|
195
|
+
)
|
196
|
+
elif (
|
197
|
+
hasattr(exclusion_func_or_region, "bbox")
|
198
|
+
and isinstance(getattr(exclusion_func_or_region, "bbox", None), (tuple, list))
|
199
|
+
and len(exclusion_func_or_region.bbox) == 4
|
200
|
+
):
|
165
201
|
# Convert objects with a valid bbox to a Region before storing
|
166
202
|
try:
|
167
203
|
bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
|
168
204
|
# Pass the label to the Region constructor
|
169
205
|
region_to_add = Region(self, bbox_coords, label=label)
|
170
|
-
exclusion_data = (region_to_add, label)
|
171
|
-
logger.debug(
|
206
|
+
exclusion_data = (region_to_add, label) # Store as tuple
|
207
|
+
logger.debug(
|
208
|
+
f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}"
|
209
|
+
)
|
172
210
|
except (ValueError, TypeError, Exception) as e:
|
173
211
|
# Raise an error if conversion fails
|
174
|
-
raise TypeError(
|
212
|
+
raise TypeError(
|
213
|
+
f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
|
214
|
+
) from e
|
175
215
|
else:
|
176
216
|
# Reject invalid types
|
177
|
-
raise TypeError(
|
217
|
+
raise TypeError(
|
218
|
+
f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
|
219
|
+
)
|
178
220
|
|
179
221
|
# Append the stored data (tuple of object/callable and label)
|
180
222
|
if exclusion_data:
|
181
223
|
self._exclusions.append(exclusion_data)
|
182
224
|
|
183
225
|
return self
|
184
|
-
|
185
|
-
def add_region(self, region: Region, name: Optional[str] = None) ->
|
226
|
+
|
227
|
+
def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
|
186
228
|
"""
|
187
229
|
Add a region to the page.
|
188
|
-
|
230
|
+
|
189
231
|
Args:
|
190
232
|
region: Region object to add
|
191
233
|
name: Optional name for the region
|
192
|
-
|
234
|
+
|
193
235
|
Returns:
|
194
236
|
Self for method chaining
|
195
237
|
"""
|
196
238
|
# Check if it's actually a Region object
|
197
239
|
if not isinstance(region, Region):
|
198
240
|
raise TypeError("region must be a Region object")
|
199
|
-
|
241
|
+
|
200
242
|
# Set the source and name
|
201
|
-
region.source =
|
202
|
-
|
243
|
+
region.source = "named"
|
244
|
+
|
203
245
|
if name:
|
204
246
|
region.name = name
|
205
247
|
# Add to named regions dictionary (overwriting if name already exists)
|
206
|
-
self._regions[
|
248
|
+
self._regions["named"][name] = region
|
207
249
|
else:
|
208
250
|
# Add to detected regions list (unnamed but registered)
|
209
|
-
self._regions[
|
210
|
-
|
251
|
+
self._regions["detected"].append(region)
|
252
|
+
|
211
253
|
# Add to element manager for selector queries
|
212
254
|
self._element_mgr.add_region(region)
|
213
|
-
|
255
|
+
|
214
256
|
return self
|
215
|
-
|
216
|
-
def add_regions(self, regions: List[Region], prefix: Optional[str] = None) ->
|
257
|
+
|
258
|
+
def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
|
217
259
|
"""
|
218
260
|
Add multiple regions to the page.
|
219
|
-
|
261
|
+
|
220
262
|
Args:
|
221
263
|
regions: List of Region objects to add
|
222
264
|
prefix: Optional prefix for automatic naming (regions will be named prefix_1, prefix_2, etc.)
|
223
|
-
|
265
|
+
|
224
266
|
Returns:
|
225
267
|
Self for method chaining
|
226
268
|
"""
|
@@ -232,23 +274,23 @@ class Page:
|
|
232
274
|
# Add without names
|
233
275
|
for region in regions:
|
234
276
|
self.add_region(region)
|
235
|
-
|
277
|
+
|
236
278
|
return self
|
237
|
-
|
279
|
+
|
238
280
|
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
239
281
|
"""
|
240
282
|
Get all exclusion regions for this page.
|
241
283
|
Assumes self._exclusions contains tuples of (callable/Region, label).
|
242
|
-
|
284
|
+
|
243
285
|
Args:
|
244
286
|
include_callable: Whether to evaluate callable exclusion functions
|
245
287
|
debug: Enable verbose debug logging for exclusion evaluation
|
246
|
-
|
288
|
+
|
247
289
|
Returns:
|
248
290
|
List of Region objects to exclude, with labels assigned.
|
249
291
|
"""
|
250
292
|
regions = []
|
251
|
-
|
293
|
+
|
252
294
|
if debug:
|
253
295
|
print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
|
254
296
|
|
@@ -280,32 +322,39 @@ class Page:
|
|
280
322
|
if debug:
|
281
323
|
print(f" ✓ Added region from callable '{label}': {region_result}")
|
282
324
|
elif region_result:
|
283
|
-
|
284
|
-
|
285
|
-
|
325
|
+
logger.warning(
|
326
|
+
f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
|
327
|
+
)
|
328
|
+
if debug:
|
329
|
+
print(f" ✗ Callable returned non-Region/None: {type(region_result)}")
|
286
330
|
else:
|
287
331
|
if debug:
|
288
|
-
print(
|
332
|
+
print(
|
333
|
+
f" ✗ Callable '{exclusion_label}' returned None, no region added"
|
334
|
+
)
|
289
335
|
|
290
336
|
except Exception as e:
|
291
337
|
error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
|
292
338
|
print(error_msg)
|
293
339
|
import traceback
|
340
|
+
|
294
341
|
print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
|
295
342
|
|
296
343
|
# Process direct Region objects (label was assigned in add_exclusion)
|
297
344
|
elif isinstance(exclusion_item, Region):
|
298
|
-
regions.append(exclusion_item)
|
345
|
+
regions.append(exclusion_item) # Label is already on the Region object
|
299
346
|
if debug:
|
300
347
|
print(f" - Added direct region '{label}': {exclusion_item}")
|
301
348
|
# No else needed, add_exclusion should prevent invalid types
|
302
|
-
|
349
|
+
|
303
350
|
if debug:
|
304
351
|
print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
|
305
|
-
|
352
|
+
|
306
353
|
return regions
|
307
354
|
|
308
|
-
def _filter_elements_by_exclusions(
|
355
|
+
def _filter_elements_by_exclusions(
|
356
|
+
self, elements: List["Element"], debug_exclusions: bool = False
|
357
|
+
) -> List["Element"]:
|
309
358
|
"""
|
310
359
|
Filters a list of elements, removing those within the page's exclusion regions.
|
311
360
|
|
@@ -318,19 +367,27 @@ class Page:
|
|
318
367
|
"""
|
319
368
|
if not self._exclusions:
|
320
369
|
if debug_exclusions:
|
321
|
-
print(
|
370
|
+
print(
|
371
|
+
f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
|
372
|
+
)
|
322
373
|
return elements
|
323
374
|
|
324
375
|
# Get all exclusion regions, including evaluating callable functions
|
325
|
-
exclusion_regions = self._get_exclusion_regions(
|
376
|
+
exclusion_regions = self._get_exclusion_regions(
|
377
|
+
include_callable=True, debug=debug_exclusions
|
378
|
+
)
|
326
379
|
|
327
380
|
if not exclusion_regions:
|
328
381
|
if debug_exclusions:
|
329
|
-
print(
|
382
|
+
print(
|
383
|
+
f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements."
|
384
|
+
)
|
330
385
|
return elements
|
331
386
|
|
332
387
|
if debug_exclusions:
|
333
|
-
print(
|
388
|
+
print(
|
389
|
+
f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements."
|
390
|
+
)
|
334
391
|
|
335
392
|
filtered_elements = []
|
336
393
|
excluded_count = 0
|
@@ -346,7 +403,9 @@ class Page:
|
|
346
403
|
filtered_elements.append(element)
|
347
404
|
|
348
405
|
if debug_exclusions:
|
349
|
-
print(
|
406
|
+
print(
|
407
|
+
f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}."
|
408
|
+
)
|
350
409
|
|
351
410
|
return filtered_elements
|
352
411
|
|
@@ -365,15 +424,18 @@ class Page:
|
|
365
424
|
Element object or None if not found
|
366
425
|
"""
|
367
426
|
from natural_pdf.selectors.parser import parse_selector
|
427
|
+
|
368
428
|
selector_obj = parse_selector(selector)
|
369
|
-
|
429
|
+
|
370
430
|
# Pass regex and case flags to selector function
|
371
|
-
kwargs[
|
372
|
-
kwargs[
|
373
|
-
|
431
|
+
kwargs["regex"] = regex
|
432
|
+
kwargs["case"] = case
|
433
|
+
|
374
434
|
# First get all matching elements without applying exclusions initially within _apply_selector
|
375
|
-
results_collection = self._apply_selector(
|
376
|
-
|
435
|
+
results_collection = self._apply_selector(
|
436
|
+
selector_obj, **kwargs
|
437
|
+
) # _apply_selector doesn't filter
|
438
|
+
|
377
439
|
# Filter the results based on exclusions if requested
|
378
440
|
if apply_exclusions and self._exclusions and results_collection:
|
379
441
|
filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
|
@@ -385,7 +447,9 @@ class Page:
|
|
385
447
|
else:
|
386
448
|
return None
|
387
449
|
|
388
|
-
def find_all(
|
450
|
+
def find_all(
|
451
|
+
self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
|
452
|
+
) -> "ElementCollection":
|
389
453
|
"""
|
390
454
|
Find all elements on this page matching selector.
|
391
455
|
|
@@ -395,20 +459,23 @@ class Page:
|
|
395
459
|
regex: Whether to use regex for text search in :contains (default: False)
|
396
460
|
case: Whether to do case-sensitive text search (default: True)
|
397
461
|
**kwargs: Additional filter parameters
|
398
|
-
|
462
|
+
|
399
463
|
Returns:
|
400
464
|
ElementCollection with matching elements
|
401
465
|
"""
|
402
466
|
from natural_pdf.selectors.parser import parse_selector
|
467
|
+
|
403
468
|
selector_obj = parse_selector(selector)
|
404
|
-
|
469
|
+
|
405
470
|
# Pass regex and case flags to selector function
|
406
|
-
kwargs[
|
407
|
-
kwargs[
|
408
|
-
|
471
|
+
kwargs["regex"] = regex
|
472
|
+
kwargs["case"] = case
|
473
|
+
|
409
474
|
# First get all matching elements without applying exclusions initially within _apply_selector
|
410
|
-
results_collection = self._apply_selector(
|
411
|
-
|
475
|
+
results_collection = self._apply_selector(
|
476
|
+
selector_obj, **kwargs
|
477
|
+
) # _apply_selector doesn't filter
|
478
|
+
|
412
479
|
# Filter the results based on exclusions if requested
|
413
480
|
if apply_exclusions and self._exclusions and results_collection:
|
414
481
|
filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
|
@@ -416,208 +483,348 @@ class Page:
|
|
416
483
|
else:
|
417
484
|
# Return the unfiltered collection
|
418
485
|
return results_collection
|
419
|
-
|
420
|
-
def _apply_selector(
|
486
|
+
|
487
|
+
def _apply_selector(
|
488
|
+
self, selector_obj: Dict, **kwargs
|
489
|
+
) -> "ElementCollection": # Removed apply_exclusions arg
|
421
490
|
"""
|
422
491
|
Apply selector to page elements.
|
423
492
|
Exclusions are now handled by the calling methods (find, find_all) if requested.
|
424
|
-
|
493
|
+
|
425
494
|
Args:
|
426
495
|
selector_obj: Parsed selector dictionary
|
427
496
|
**kwargs: Additional filter parameters including 'regex' and 'case'
|
428
|
-
|
497
|
+
|
429
498
|
Returns:
|
430
499
|
ElementCollection of matching elements (unfiltered by exclusions)
|
431
500
|
"""
|
432
501
|
from natural_pdf.selectors.parser import selector_to_filter_func
|
433
|
-
|
502
|
+
|
434
503
|
# Get element type to filter
|
435
|
-
element_type = selector_obj.get(
|
436
|
-
|
504
|
+
element_type = selector_obj.get("type", "any").lower()
|
505
|
+
|
437
506
|
# Determine which elements to search based on element type
|
438
507
|
elements_to_search = []
|
439
|
-
if element_type ==
|
508
|
+
if element_type == "any":
|
440
509
|
elements_to_search = self._element_mgr.get_all_elements()
|
441
|
-
elif element_type ==
|
510
|
+
elif element_type == "text":
|
442
511
|
elements_to_search = self._element_mgr.words
|
443
|
-
elif element_type ==
|
512
|
+
elif element_type == "char":
|
444
513
|
elements_to_search = self._element_mgr.chars
|
445
|
-
elif element_type ==
|
514
|
+
elif element_type == "word":
|
446
515
|
elements_to_search = self._element_mgr.words
|
447
|
-
elif element_type ==
|
516
|
+
elif element_type == "rect" or element_type == "rectangle":
|
448
517
|
elements_to_search = self._element_mgr.rects
|
449
|
-
elif element_type ==
|
518
|
+
elif element_type == "line":
|
450
519
|
elements_to_search = self._element_mgr.lines
|
451
|
-
elif element_type ==
|
520
|
+
elif element_type == "region":
|
452
521
|
elements_to_search = self._element_mgr.regions
|
453
522
|
else:
|
454
523
|
elements_to_search = self._element_mgr.get_all_elements()
|
455
|
-
|
524
|
+
|
456
525
|
# Create filter function from selector, passing any additional parameters
|
457
526
|
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
458
|
-
|
527
|
+
|
459
528
|
# Apply the filter to matching elements
|
460
529
|
matching_elements = [element for element in elements_to_search if filter_func(element)]
|
461
|
-
|
530
|
+
|
462
531
|
# Handle spatial pseudo-classes that require relationship checking
|
463
|
-
for pseudo in selector_obj.get(
|
464
|
-
name = pseudo.get(
|
465
|
-
args = pseudo.get(
|
466
|
-
|
467
|
-
if name in (
|
532
|
+
for pseudo in selector_obj.get("pseudo_classes", []):
|
533
|
+
name = pseudo.get("name")
|
534
|
+
args = pseudo.get("args", "")
|
535
|
+
|
536
|
+
if name in ("above", "below", "near", "left-of", "right-of"):
|
468
537
|
# Find the reference element first
|
469
538
|
from natural_pdf.selectors.parser import parse_selector
|
539
|
+
|
470
540
|
ref_selector = parse_selector(args) if isinstance(args, str) else args
|
471
541
|
# Recursively call _apply_selector for reference element (exclusions handled later)
|
472
|
-
ref_elements = self._apply_selector(ref_selector, **kwargs)
|
473
|
-
|
542
|
+
ref_elements = self._apply_selector(ref_selector, **kwargs)
|
543
|
+
|
474
544
|
if not ref_elements:
|
475
545
|
return ElementCollection([])
|
476
|
-
|
546
|
+
|
477
547
|
ref_element = ref_elements.first
|
478
|
-
if not ref_element:
|
479
|
-
|
548
|
+
if not ref_element:
|
549
|
+
continue
|
550
|
+
|
480
551
|
# Filter elements based on spatial relationship
|
481
|
-
if name ==
|
482
|
-
matching_elements = [
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
elif name ==
|
552
|
+
if name == "above":
|
553
|
+
matching_elements = [
|
554
|
+
el
|
555
|
+
for el in matching_elements
|
556
|
+
if hasattr(el, "bottom")
|
557
|
+
and hasattr(ref_element, "top")
|
558
|
+
and el.bottom <= ref_element.top
|
559
|
+
]
|
560
|
+
elif name == "below":
|
561
|
+
matching_elements = [
|
562
|
+
el
|
563
|
+
for el in matching_elements
|
564
|
+
if hasattr(el, "top")
|
565
|
+
and hasattr(ref_element, "bottom")
|
566
|
+
and el.top >= ref_element.bottom
|
567
|
+
]
|
568
|
+
elif name == "left-of":
|
569
|
+
matching_elements = [
|
570
|
+
el
|
571
|
+
for el in matching_elements
|
572
|
+
if hasattr(el, "x1")
|
573
|
+
and hasattr(ref_element, "x0")
|
574
|
+
and el.x1 <= ref_element.x0
|
575
|
+
]
|
576
|
+
elif name == "right-of":
|
577
|
+
matching_elements = [
|
578
|
+
el
|
579
|
+
for el in matching_elements
|
580
|
+
if hasattr(el, "x0")
|
581
|
+
and hasattr(ref_element, "x1")
|
582
|
+
and el.x0 >= ref_element.x1
|
583
|
+
]
|
584
|
+
elif name == "near":
|
585
|
+
|
490
586
|
def distance(el1, el2):
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
587
|
+
if not (
|
588
|
+
hasattr(el1, "x0")
|
589
|
+
and hasattr(el1, "x1")
|
590
|
+
and hasattr(el1, "top")
|
591
|
+
and hasattr(el1, "bottom")
|
592
|
+
and hasattr(el2, "x0")
|
593
|
+
and hasattr(el2, "x1")
|
594
|
+
and hasattr(el2, "top")
|
595
|
+
and hasattr(el2, "bottom")
|
596
|
+
):
|
597
|
+
return float("inf") # Cannot calculate distance
|
598
|
+
el1_center_x = (el1.x0 + el1.x1) / 2
|
599
|
+
el1_center_y = (el1.top + el1.bottom) / 2
|
600
|
+
el2_center_x = (el2.x0 + el2.x1) / 2
|
601
|
+
el2_center_y = (el2.top + el2.bottom) / 2
|
602
|
+
return (
|
603
|
+
(el1_center_x - el2_center_x) ** 2 + (el1_center_y - el2_center_y) ** 2
|
604
|
+
) ** 0.5
|
605
|
+
|
606
|
+
threshold = kwargs.get("near_threshold", 50)
|
607
|
+
matching_elements = [
|
608
|
+
el for el in matching_elements if distance(el, ref_element) <= threshold
|
609
|
+
]
|
610
|
+
|
503
611
|
# Sort elements in reading order if requested
|
504
|
-
if kwargs.get(
|
505
|
-
if all(hasattr(el,
|
506
|
-
|
612
|
+
if kwargs.get("reading_order", True):
|
613
|
+
if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
|
614
|
+
matching_elements.sort(key=lambda el: (el.top, el.x0))
|
507
615
|
else:
|
508
|
-
|
509
|
-
|
616
|
+
logger.warning(
|
617
|
+
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
618
|
+
)
|
619
|
+
|
510
620
|
# Create result collection - exclusions are handled by the calling methods (find, find_all)
|
511
621
|
result = ElementCollection(matching_elements)
|
512
|
-
|
622
|
+
|
513
623
|
return result
|
514
624
|
|
515
625
|
def create_region(self, x0: float, top: float, x1: float, bottom: float) -> Any:
    """
    Build a Region on this page from explicit bounding-box coordinates.

    Args:
        x0: Left x-coordinate
        top: Top y-coordinate
        x1: Right x-coordinate
        bottom: Bottom y-coordinate

    Returns:
        Region object bound to this page, covering (x0, top, x1, bottom)
    """
    # Function-scope import, as in the other Region-producing methods here.
    from natural_pdf.elements.region import Region

    bbox = (x0, top, x1, bottom)
    return Region(self, bbox)
|
530
|
-
|
531
|
-
def region(
|
532
|
-
|
641
|
+
|
642
|
+
def region(
    self,
    left: Optional[float] = None,
    top: Optional[float] = None,
    right: Optional[float] = None,
    bottom: Optional[float] = None,
    width: Union[str, float, None] = None,
    height: Optional[float] = None,
) -> Any:
    """
    Create a region on this page with more intuitive named parameters,
    allowing definition by coordinates or by coordinate + dimension.

    Args:
        left: Left x-coordinate (default: 0 if width not used).
        top: Top y-coordinate (default: 0 if height not used).
        right: Right x-coordinate (default: page width if width not used).
        bottom: Bottom y-coordinate (default: page height if height not used).
        width: Width definition. Can be:
            - Numeric: The width of the region in points. Cannot be used with both left and right.
            - String 'full': Sets region width to full page width (overrides left/right).
            - String 'element' or None (default): Uses provided/calculated left/right,
              defaulting to page width if neither are specified.
        height: Numeric height of the region. Cannot be used with both top and bottom.

    Returns:
        Region object for the resolved coordinates. Every edge is clamped into
        the page box (0..page width, 0..page height); if an edge pair ends up
        inverted it is swapped and a warning is logged.

    Raises:
        ValueError: If conflicting arguments are provided (e.g., top, bottom, and height)
                    or if width is an invalid string.

    Examples:
        >>> page.region(top=100, height=50)  # Region from y=100 to y=150, default width
        >>> page.region(left=50, width=100)   # Region from x=50 to x=150, default height
        >>> page.region(bottom=500, height=50) # Region from y=450 to y=500
        >>> page.region(right=200, width=50)  # Region from x=150 to x=200
        >>> page.region(top=100, bottom=200, width="full") # Explicit full width
    """
    # --- Type checking and basic validation ---
    is_width_numeric = isinstance(width, (int, float))
    width_mode = "element"  # Default mode

    if height is not None and top is not None and bottom is not None:
        raise ValueError("Cannot specify top, bottom, and height simultaneously.")
    if is_width_numeric and left is not None and right is not None:
        raise ValueError("Cannot specify left, right, and a numeric width simultaneously.")
    if isinstance(width, str):
        width_mode = width.lower()
        if width_mode not in ("full", "element"):
            raise ValueError("String width argument must be 'full' or 'element'.")

    # --- Calculate coordinates from the explicit arguments ---
    final_left, final_top, final_right, final_bottom = left, top, right, bottom

    # Height: anchor on top if given, else on bottom, else start at the page top.
    if height is not None:
        if top is not None:
            final_bottom = top + height
        elif bottom is not None:
            final_top = bottom - height
        else:
            final_top, final_bottom = 0, height

    # Numeric width: same anchoring rule, horizontally.
    if is_width_numeric:
        if left is not None:
            final_right = left + width
        elif right is not None:
            final_left = right - width
        else:
            final_left, final_right = 0, width

    page_width, page_height = self.width, self.height

    # --- Apply defaults for edges nothing above has set ---
    # An edge can only still be None here when the matching dimension was not
    # supplied, so the page box is always the right fallback.
    if final_top is None:
        final_top = 0
    if final_bottom is None:
        final_bottom = page_height
    if final_left is None:
        final_left = 0
    if final_right is None:
        final_right = page_width

    # --- Handle width_mode == 'full' (overrides left/right) ---
    if width_mode == "full":
        final_left, final_right = 0, page_width

    # --- Final validation & creation ---
    # Clamp BOTH ends of every edge into the page box. Clamping only one side
    # per edge let out-of-page values survive (e.g. left=-100, width=50 ended
    # up as the box (-50, 0) after the swap below).
    final_left = min(max(0, final_left), page_width)
    final_right = min(max(0, final_right), page_width)
    final_top = min(max(0, final_top), page_height)
    final_bottom = min(max(0, final_bottom), page_height)

    # Ensure valid box (x0<=x1, top<=bottom)
    if final_left > final_right:
        logger.warning(f"Calculated left ({final_left}) > right ({final_right}). Swapping.")
        final_left, final_right = final_right, final_left
    if final_top > final_bottom:
        logger.warning(f"Calculated top ({final_top}) > bottom ({final_bottom}). Swapping.")
        final_top, final_bottom = final_bottom, final_top

    from natural_pdf.elements.region import Region

    return Region(self, (final_left, final_top, final_right, final_bottom))
|
566
|
-
|
567
|
-
def get_elements(
|
763
|
+
|
764
|
+
def get_elements(
    self, apply_exclusions=True, debug_exclusions: bool = False
) -> List["Element"]:
    """
    Return the page's elements, filtered by exclusions when applicable.

    Args:
        apply_exclusions: Whether to apply exclusion regions (default: True).
        debug_exclusions: Whether to output detailed exclusion debugging info (default: False).

    Returns:
        The ElementManager's full element list, or the result of
        `_filter_elements_by_exclusions` when exclusions are requested and
        the page has any.
    """
    all_elements = self._element_mgr.get_all_elements()

    # Unfiltered path: exclusions were not requested, or none are registered.
    if not (apply_exclusions and self._exclusions):
        if debug_exclusions:
            print(
                f"Page {self.index}: get_elements returning all {len(all_elements)} elements (exclusions not applied)."
            )
        return all_elements

    return self._filter_elements_by_exclusions(all_elements, debug_exclusions=debug_exclusions)
|
588
|
-
|
589
|
-
def filter_elements(
|
791
|
+
|
792
|
+
def filter_elements(
    self, elements: List["Element"], selector: str, **kwargs
) -> List["Element"]:
    """
    Keep only the elements that match a selector.

    Args:
        elements: List of elements to filter
        selector: CSS-like selector string
        **kwargs: Additional filter parameters; forwarded to
            `selector_to_filter_func`. `reading_order` (default True) also
            controls whether the matches are sorted by (top, x0).

    Returns:
        New list of the elements that match the selector
    """
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    # Parse the selector and turn it into a predicate in one step.
    predicate = selector_to_filter_func(parse_selector(selector), **kwargs)
    matches = list(filter(predicate, elements))

    if not kwargs.get("reading_order", True):
        return matches

    # Reading order needs both sort keys on every match.
    sortable = all(hasattr(item, "top") and hasattr(item, "x0") for item in matches)
    if sortable:
        matches.sort(key=lambda item: (item.top, item.x0))
    else:
        logger.warning(
            "Cannot sort elements in reading order: Missing required attributes (top, x0)."
        )

    return matches
|
620
|
-
|
827
|
+
|
621
828
|
def until(self, selector: str, include_endpoint: bool = True, **kwargs) -> Any:
|
622
829
|
"""
|
623
830
|
Select content from the top of the page until matching selector.
|
@@ -626,26 +833,28 @@ class Page:
|
|
626
833
|
selector: CSS-like selector string
|
627
834
|
include_endpoint: Whether to include the endpoint element in the region
|
628
835
|
**kwargs: Additional selection parameters
|
629
|
-
|
836
|
+
|
630
837
|
Returns:
|
631
838
|
Region object representing the selected content
|
632
|
-
|
839
|
+
|
633
840
|
Examples:
|
634
841
|
>>> page.until('text:contains("Conclusion")') # Select from top to conclusion
|
635
842
|
>>> page.until('line[width>=2]', include_endpoint=False) # Select up to thick line
|
636
843
|
"""
|
637
|
-
# Find the target element
|
844
|
+
# Find the target element
|
638
845
|
target = self.find(selector, **kwargs)
|
639
846
|
if not target:
|
640
847
|
# If target not found, return a default region (full page)
|
641
848
|
from natural_pdf.elements.region import Region
|
849
|
+
|
642
850
|
return Region(self, (0, 0, self.width, self.height))
|
643
|
-
|
851
|
+
|
644
852
|
# Create a region from the top of the page to the target
|
645
853
|
from natural_pdf.elements.region import Region
|
854
|
+
|
646
855
|
# Ensure target has positional attributes before using them
|
647
|
-
target_top = getattr(target,
|
648
|
-
target_bottom = getattr(target,
|
856
|
+
target_top = getattr(target, "top", 0)
|
857
|
+
target_bottom = getattr(target, "bottom", self.height)
|
649
858
|
|
650
859
|
if include_endpoint:
|
651
860
|
# Include the target element
|
@@ -653,17 +862,16 @@ class Page:
|
|
653
862
|
else:
|
654
863
|
# Up to the target element
|
655
864
|
region = Region(self, (0, 0, self.width, target_top))
|
656
|
-
|
865
|
+
|
657
866
|
region.end_element = target
|
658
867
|
return region
|
659
868
|
|
660
|
-
|
661
869
|
def crop(self, bbox=None, **kwargs) -> Any:
|
662
870
|
"""
|
663
871
|
Crop the page to the specified bounding box.
|
664
872
|
|
665
873
|
This is a direct wrapper around pdfplumber's crop method.
|
666
|
-
|
874
|
+
|
667
875
|
Args:
|
668
876
|
bbox: Bounding box (x0, top, x1, bottom) or None
|
669
877
|
**kwargs: Additional parameters (top, bottom, left, right)
|
@@ -674,59 +882,82 @@ class Page:
|
|
674
882
|
# Returns the pdfplumber page object, not a natural-pdf Page
|
675
883
|
return self._page.crop(bbox, **kwargs)
|
676
884
|
|
677
|
-
def extract_text(
|
678
|
-
|
679
|
-
|
680
|
-
debug_exclusions=False, **kwargs) -> str:
|
885
|
+
def extract_text(
    self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, **kwargs
) -> str:
    """
    Extract text from this page, respecting exclusions and using pdfplumber's
    layout engine (chars_to_textmap) if layout arguments are provided or default.

    Args:
        preserve_whitespace: Accepted for backward compatibility; this
                             implementation does not read it.
        use_exclusions: Whether to apply exclusion regions (default: True).
                        Pass False to extract text without exclusion filtering.
        debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
        **kwargs: Additional layout parameters passed directly to pdfplumber's
                  `chars_to_textmap` function. Common parameters include:
                  - layout (bool): If True (default), inserts spaces/newlines.
                  - x_density (float): Pixels per character horizontally.
                  - y_density (float): Pixels per line vertically.
                  - x_tolerance (float): Tolerance for horizontal character grouping.
                  - y_tolerance (float): Tolerance for vertical character grouping.
                  - line_dir (str): 'ttb', 'btt', 'ltr', 'rtl'
                  - char_dir (str): 'ttb', 'btt', 'ltr', 'rtl'
                  See pdfplumber documentation for more.
                  A `debug` key, if present, overrides `debug_exclusions`.

    Returns:
        Extracted text as string, potentially with layout-based spacing.
        Empty string when the page has no word elements.
    """
    logger.debug(f"Page {self.number}: extract_text called with kwargs: {kwargs}")
    debug = kwargs.get("debug", debug_exclusions)  # Allow 'debug' kwarg

    # 1. Get Word Elements (triggers load_elements if needed)
    word_elements = self.words
    if not word_elements:
        logger.debug(f"Page {self.number}: No word elements found.")
        return ""

    # 2. Get Exclusions
    # `use_exclusions` is a named parameter, so it is never present in
    # **kwargs; it must be read directly. Looking it up in kwargs always
    # fell back to True and ignored the caller's choice.
    exclusion_regions = []
    if use_exclusions and self._exclusions:
        exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
        if debug:
            logger.debug(f"Page {self.number}: Applying {len(exclusion_regions)} exclusions.")
    elif debug:
        logger.debug(f"Page {self.number}: Not applying exclusions.")

    # 3. Collect All Character Dictionaries from Word Elements
    all_char_dicts = []
    for word in word_elements:
        all_char_dicts.extend(getattr(word, "_char_dicts", []))

    # 4. Spatially Filter Characters
    filtered_chars = filter_chars_spatially(
        char_dicts=all_char_dicts,
        exclusion_regions=exclusion_regions,
        target_region=None,  # No target region for full page extraction
        debug=debug,
    )

    # 5. Generate Text Layout using Utility
    # Pass page bbox as layout context
    page_bbox = (0, 0, self.width, self.height)
    result = generate_text_layout(
        char_dicts=filtered_chars,
        layout_context_bbox=page_bbox,
        user_kwargs=kwargs,  # Pass original user kwargs
    )

    logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
    return result
|
722
953
|
|
723
954
|
def extract_table(self, table_settings={}) -> List[Any]:
|
724
955
|
"""
|
725
956
|
Extract the largest table from this page.
|
726
|
-
|
957
|
+
|
727
958
|
Args:
|
728
959
|
table_settings: Additional extraction parameters
|
729
|
-
|
960
|
+
|
730
961
|
Returns:
|
731
962
|
List of extracted tables (or None if no table found)
|
732
963
|
"""
|
@@ -736,10 +967,10 @@ class Page:
|
|
736
967
|
def extract_tables(self, table_settings={}) -> List[Any]:
|
737
968
|
"""
|
738
969
|
Extract tables from this page.
|
739
|
-
|
970
|
+
|
740
971
|
Args:
|
741
972
|
table_settings: Additional extraction parameters
|
742
|
-
|
973
|
+
|
743
974
|
Returns:
|
744
975
|
List of extracted tables
|
745
976
|
"""
|
@@ -749,33 +980,33 @@ class Page:
|
|
749
980
|
def _load_elements(self):
    """Ask this page's ElementManager to load all of the page's elements."""
    manager = self._element_mgr
    manager.load_elements()
|
752
|
-
|
983
|
+
|
753
984
|
def _create_char_elements(self):
    """DEPRECATED: Use self._element_mgr.chars

    Logs a deprecation warning, then returns the ElementManager's chars.
    """
    logger.warning("_create_char_elements is deprecated. Access via self._element_mgr.chars.")
    char_elements = self._element_mgr.chars  # Delegate
    return char_elements
|
757
988
|
|
758
989
|
def _process_font_information(self, char_dict):
    """DEPRECATED: Handled by ElementManager

    Kept as a no-op shim: logs a deprecation warning and returns None.
    `char_dict` is ignored.
    """
    logger.warning("_process_font_information is deprecated. Handled by ElementManager.")
    return None
|
763
994
|
|
764
995
|
def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
    """DEPRECATED: Use self._element_mgr.words

    Logs a deprecation warning, then returns the ElementManager's words.
    `keep_spaces` and `font_attrs` are ignored.
    """
    logger.warning("_group_chars_into_words is deprecated. Access via self._element_mgr.words.")
    word_elements = self._element_mgr.words  # Delegate
    return word_elements
|
768
999
|
|
769
1000
|
def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
    """DEPRECATED: Handled by ElementManager

    No-op shim: logs a deprecation warning and returns None; all arguments
    are ignored.
    """
    logger.warning("_process_line_into_words is deprecated. Handled by ElementManager.")
    return None
|
773
|
-
|
1004
|
+
|
774
1005
|
def _check_font_attributes_match(self, char, prev_char, font_attrs):
    """DEPRECATED: Handled by ElementManager

    No-op shim: logs a deprecation warning and returns None; all arguments
    are ignored.
    """
    logger.warning("_check_font_attributes_match is deprecated. Handled by ElementManager.")
    return None
|
778
|
-
|
1009
|
+
|
779
1010
|
def _create_word_element(self, chars, font_attrs):
|
780
1011
|
"""DEPRECATED: Handled by ElementManager"""
|
781
1012
|
logger.warning("_create_word_element is deprecated. Handled by ElementManager.")
|
@@ -785,34 +1016,36 @@ class Page:
|
|
785
1016
|
def chars(self) -> List[Any]:
|
786
1017
|
"""Get all character elements on this page."""
|
787
1018
|
return self._element_mgr.chars
|
788
|
-
|
1019
|
+
|
789
1020
|
@property
def words(self) -> List[Any]:
    """Word elements of this page, as held by the ElementManager."""
    manager = self._element_mgr
    return manager.words
|
793
|
-
|
1024
|
+
|
794
1025
|
@property
def rects(self) -> List[Any]:
    """Rectangle elements of this page, as held by the ElementManager."""
    manager = self._element_mgr
    return manager.rects
|
798
|
-
|
1029
|
+
|
799
1030
|
@property
def lines(self) -> List[Any]:
    """Line elements of this page, as held by the ElementManager."""
    manager = self._element_mgr
    return manager.lines
|
803
|
-
|
804
|
-
def highlight(
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
1034
|
+
|
1035
|
+
def highlight(
|
1036
|
+
self,
|
1037
|
+
bbox: Optional[Tuple[float, float, float, float]] = None,
|
1038
|
+
color: Optional[Union[Tuple, str]] = None,
|
1039
|
+
label: Optional[str] = None,
|
1040
|
+
use_color_cycling: bool = False,
|
1041
|
+
element: Optional[Any] = None,
|
1042
|
+
include_attrs: Optional[List[str]] = None,
|
1043
|
+
existing: str = "append",
|
1044
|
+
) -> "Page":
|
812
1045
|
"""
|
813
1046
|
Highlight a bounding box or the entire page.
|
814
1047
|
Delegates to the central HighlightingService.
|
815
|
-
|
1048
|
+
|
816
1049
|
Args:
|
817
1050
|
bbox: Bounding box (x0, top, x1, bottom). If None, highlight entire page.
|
818
1051
|
color: RGBA color tuple/string for the highlight.
|
@@ -834,23 +1067,24 @@ class Page:
|
|
834
1067
|
use_color_cycling=use_color_cycling,
|
835
1068
|
element=element,
|
836
1069
|
include_attrs=include_attrs,
|
837
|
-
existing=existing
|
1070
|
+
existing=existing,
|
838
1071
|
)
|
839
1072
|
return self
|
840
1073
|
|
841
1074
|
def highlight_polygon(
|
842
|
-
self,
|
1075
|
+
self,
|
843
1076
|
polygon: List[Tuple[float, float]],
|
844
|
-
color: Optional[Union[Tuple, str]] = None,
|
1077
|
+
color: Optional[Union[Tuple, str]] = None,
|
845
1078
|
label: Optional[str] = None,
|
846
1079
|
use_color_cycling: bool = False,
|
847
1080
|
element: Optional[Any] = None,
|
848
1081
|
include_attrs: Optional[List[str]] = None,
|
849
|
-
existing: str =
|
1082
|
+
existing: str = "append",
|
1083
|
+
) -> "Page":
|
850
1084
|
"""
|
851
1085
|
Highlight a polygon shape on the page.
|
852
1086
|
Delegates to the central HighlightingService.
|
853
|
-
|
1087
|
+
|
854
1088
|
Args:
|
855
1089
|
polygon: List of (x, y) points defining the polygon.
|
856
1090
|
color: RGBA color tuple/string for the highlight.
|
@@ -871,51 +1105,55 @@ class Page:
|
|
871
1105
|
use_color_cycling=use_color_cycling,
|
872
1106
|
element=element,
|
873
1107
|
include_attrs=include_attrs,
|
874
|
-
existing=existing
|
1108
|
+
existing=existing,
|
875
1109
|
)
|
876
1110
|
return self
|
877
|
-
|
878
|
-
def show(
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
1111
|
+
|
1112
|
+
def show(
    self,
    scale: float = 2.0,
    width: Optional[int] = None,
    labels: bool = True,
    legend_position: str = "right",
    render_ocr: bool = False,
) -> Optional[Image.Image]:
    """
    Render this page to an image with its persistent highlights drawn.

    Thin wrapper over `to_image` that always requests highlights.

    Args:
        scale: Scale factor for rendering.
        width: Optional width for the output image.
        labels: Whether to include a legend for labels.
        legend_position: Position of the legend.
        render_ocr: Whether to render OCR text.

    Returns:
        PIL Image object of the page with highlights, or None if rendering fails.
    """
    render_options = {
        "scale": scale,
        "width": width,
        "labels": labels,
        "legend_position": legend_position,
        "render_ocr": render_ocr,
        "include_highlights": True,  # Ensure highlights are requested
    }
    return self.to_image(**render_options)
|
905
|
-
|
906
|
-
def save_image(
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
|
1141
|
+
|
1142
|
+
def save_image(
|
1143
|
+
self,
|
1144
|
+
filename: str,
|
1145
|
+
scale: float = 2.0,
|
1146
|
+
width: Optional[int] = None,
|
1147
|
+
labels: bool = True,
|
1148
|
+
legend_position: str = "right",
|
1149
|
+
render_ocr: bool = False,
|
1150
|
+
include_highlights: bool = True, # Allow saving without highlights
|
1151
|
+
resolution: Optional[float] = None,
|
1152
|
+
**kwargs,
|
1153
|
+
) -> "Page":
|
916
1154
|
"""
|
917
1155
|
Save the page image to a file, rendering highlights via HighlightingService.
|
918
|
-
|
1156
|
+
|
919
1157
|
Args:
|
920
1158
|
filename: Path to save the image to.
|
921
1159
|
scale: Scale factor for rendering highlights.
|
@@ -926,7 +1164,7 @@ class Page:
|
|
926
1164
|
include_highlights: Whether to render highlights.
|
927
1165
|
resolution: Resolution for base image rendering.
|
928
1166
|
**kwargs: Additional args for pdfplumber's to_image.
|
929
|
-
|
1167
|
+
|
930
1168
|
Returns:
|
931
1169
|
Self for method chaining.
|
932
1170
|
"""
|
@@ -935,25 +1173,25 @@ class Page:
|
|
935
1173
|
path=filename,
|
936
1174
|
scale=scale,
|
937
1175
|
width=width,
|
938
|
-
labels=labels,
|
1176
|
+
labels=labels,
|
939
1177
|
legend_position=legend_position,
|
940
1178
|
render_ocr=render_ocr,
|
941
1179
|
include_highlights=include_highlights,
|
942
1180
|
resolution=resolution,
|
943
|
-
**kwargs
|
1181
|
+
**kwargs,
|
944
1182
|
)
|
945
1183
|
return self
|
946
|
-
|
947
|
-
def clear_highlights(self) ->
|
1184
|
+
|
1185
|
+
def clear_highlights(self) -> "Page":
    """
    Remove the highlights of *this page only* via the HighlightingService.

    Returns:
        Self for method chaining
    """
    highlighter = self._highlighter
    highlighter.clear_page(self.index)
    return self
|
956
|
-
|
1194
|
+
|
957
1195
|
def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
|
958
1196
|
"""
|
959
1197
|
Analyze text elements by style, adding attributes directly to elements.
|
@@ -982,19 +1220,21 @@ class Page:
|
|
982
1220
|
# Return the collection of elements which now have style attributes
|
983
1221
|
return processed_elements_collection
|
984
1222
|
|
985
|
-
def to_image(
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
994
|
-
|
1223
|
+
def to_image(
|
1224
|
+
self,
|
1225
|
+
path: Optional[str] = None,
|
1226
|
+
scale: float = 2.0,
|
1227
|
+
width: Optional[int] = None,
|
1228
|
+
labels: bool = True,
|
1229
|
+
legend_position: str = "right",
|
1230
|
+
render_ocr: bool = False,
|
1231
|
+
resolution: Optional[float] = None,
|
1232
|
+
include_highlights: bool = True,
|
1233
|
+
**kwargs,
|
1234
|
+
) -> Optional[Image.Image]:
|
995
1235
|
"""
|
996
1236
|
Generate a PIL image of the page, using HighlightingService if needed.
|
997
|
-
|
1237
|
+
|
998
1238
|
Args:
|
999
1239
|
path: Optional path to save the image to.
|
1000
1240
|
scale: Scale factor for rendering highlights.
|
@@ -1005,7 +1245,7 @@ class Page:
|
|
1005
1245
|
resolution: Resolution in DPI for base page image (default: scale * 72).
|
1006
1246
|
include_highlights: Whether to render highlights.
|
1007
1247
|
**kwargs: Additional parameters for pdfplumber.to_image.
|
1008
|
-
|
1248
|
+
|
1009
1249
|
Returns:
|
1010
1250
|
PIL Image of the page, or None if rendering fails.
|
1011
1251
|
"""
|
@@ -1020,7 +1260,7 @@ class Page:
|
|
1020
1260
|
legend_position=legend_position,
|
1021
1261
|
render_ocr=render_ocr,
|
1022
1262
|
resolution=resolution,
|
1023
|
-
**kwargs
|
1263
|
+
**kwargs,
|
1024
1264
|
)
|
1025
1265
|
else:
|
1026
1266
|
# Get the base page image directly from pdfplumber if no highlights needed
|
@@ -1028,26 +1268,36 @@ class Page:
|
|
1028
1268
|
# Use the underlying pdfplumber page object
|
1029
1269
|
img_object = self._page.to_image(resolution=render_resolution, **kwargs)
|
1030
1270
|
# Access the PIL image directly (assuming pdfplumber structure)
|
1031
|
-
image =
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1271
|
+
image = (
|
1272
|
+
img_object.annotated
|
1273
|
+
if hasattr(img_object, "annotated")
|
1274
|
+
else img_object._repr_png_()
|
1275
|
+
)
|
1276
|
+
if isinstance(image, bytes): # Handle cases where it returns bytes
|
1277
|
+
from io import BytesIO
|
1278
|
+
|
1279
|
+
image = Image.open(BytesIO(image)).convert(
|
1280
|
+
"RGB"
|
1281
|
+
) # Convert to RGB for consistency
|
1282
|
+
|
1036
1283
|
except Exception as e:
|
1037
1284
|
logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
|
1038
|
-
return None
|
1285
|
+
return None # Return None on error
|
1039
1286
|
|
1040
|
-
if image is None:
|
1287
|
+
if image is None:
|
1288
|
+
return None
|
1041
1289
|
|
1042
1290
|
# Resize the final image if width is provided
|
1043
1291
|
if width is not None and width > 0 and image.width > 0:
|
1044
1292
|
aspect_ratio = image.height / image.width
|
1045
1293
|
height = int(width * aspect_ratio)
|
1046
1294
|
try:
|
1047
|
-
image = image.resize(
|
1295
|
+
image = image.resize(
|
1296
|
+
(width, height), Image.Resampling.LANCZOS
|
1297
|
+
) # Use modern resampling
|
1048
1298
|
except Exception as resize_error:
|
1049
|
-
|
1050
|
-
|
1299
|
+
logger.warning(f"Could not resize image: {resize_error}")
|
1300
|
+
|
1051
1301
|
# Save the image if path is provided
|
1052
1302
|
if path:
|
1053
1303
|
try:
|
@@ -1056,15 +1306,21 @@ class Page:
|
|
1056
1306
|
image.save(path)
|
1057
1307
|
logger.debug(f"Saved page image to: {path}")
|
1058
1308
|
except Exception as save_error:
|
1059
|
-
|
1060
|
-
|
1309
|
+
logger.error(f"Failed to save image to {path}: {save_error}")
|
1310
|
+
|
1061
1311
|
return image
|
1062
|
-
|
1063
|
-
def _create_text_elements_from_ocr(
|
1312
|
+
|
1313
|
+
def _create_text_elements_from_ocr(
    self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
) -> List[TextElement]:
    """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr

    Logs a deprecation warning, then forwards the three arguments
    positionally to the ElementManager method and returns its result.
    """
    logger.warning(
        "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
    )
    build_elements = self._element_mgr.create_text_elements_from_ocr
    return build_elements(ocr_results, image_width, image_height)
|
1323
|
+
|
1068
1324
|
def apply_ocr(
|
1069
1325
|
self,
|
1070
1326
|
engine: Optional[str] = None,
|
@@ -1072,35 +1328,40 @@ class Page:
|
|
1072
1328
|
languages: Optional[List[str]] = None,
|
1073
1329
|
min_confidence: Optional[float] = None,
|
1074
1330
|
device: Optional[str] = None,
|
1075
|
-
) ->
|
1331
|
+
) -> "Page":
|
1076
1332
|
"""
|
1077
|
-
Apply OCR to THIS page and add results to page elements via PDF.
|
1078
|
-
|
1333
|
+
Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
|
1334
|
+
|
1079
1335
|
Returns:
|
1080
1336
|
List of created TextElements derived from OCR results for this page.
|
1081
1337
|
"""
|
1082
|
-
if not hasattr(self._parent,
|
1083
|
-
|
1084
|
-
|
1338
|
+
if not hasattr(self._parent, "apply_ocr"):
|
1339
|
+
logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
|
1340
|
+
return []
|
1085
1341
|
|
1086
|
-
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.
|
1342
|
+
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1087
1343
|
try:
|
1088
1344
|
# Delegate to parent PDF, targeting only this page's index
|
1089
|
-
self._parent.
|
1345
|
+
self._parent.apply_ocr(
|
1090
1346
|
pages=[self.index],
|
1091
|
-
engine=engine,
|
1092
|
-
|
1347
|
+
engine=engine,
|
1348
|
+
options=options,
|
1349
|
+
languages=languages,
|
1350
|
+
min_confidence=min_confidence,
|
1351
|
+
device=device,
|
1093
1352
|
)
|
1094
1353
|
except Exception as e:
|
1095
|
-
|
1096
|
-
|
1354
|
+
logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
|
1355
|
+
return []
|
1097
1356
|
|
1098
1357
|
# Return the OCR elements specifically added to this page
|
1099
1358
|
# Use element manager to retrieve them
|
1100
|
-
ocr_elements = [el for el in self.words if getattr(el,
|
1101
|
-
logger.debug(
|
1102
|
-
|
1103
|
-
|
1359
|
+
ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
|
1360
|
+
logger.debug(
|
1361
|
+
f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
|
1362
|
+
)
|
1363
|
+
return self
|
1364
|
+
|
1104
1365
|
def extract_ocr_elements(
|
1105
1366
|
self,
|
1106
1367
|
engine: Optional[str] = None,
|
@@ -1114,42 +1375,55 @@ class Page:
|
|
1114
1375
|
Uses the shared OCRManager instance.
|
1115
1376
|
"""
|
1116
1377
|
if not self._ocr_manager:
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1378
|
+
logger.error(
|
1379
|
+
f"Page {self.number}: OCRManager not available. Cannot extract OCR elements."
|
1380
|
+
)
|
1381
|
+
return []
|
1382
|
+
|
1120
1383
|
logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
|
1121
1384
|
try:
|
1122
|
-
ocr_scale = getattr(self._parent,
|
1385
|
+
ocr_scale = getattr(self._parent, "_config", {}).get("ocr_image_scale", 2.0)
|
1123
1386
|
# Get base image without highlights
|
1124
1387
|
image = self.to_image(scale=ocr_scale, include_highlights=False)
|
1125
1388
|
if not image:
|
1126
|
-
|
1127
|
-
|
1389
|
+
logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
|
1390
|
+
return []
|
1128
1391
|
logger.debug(f" Rendered image size: {image.width}x{image.height}")
|
1129
1392
|
except Exception as e:
|
1130
1393
|
logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
|
1131
1394
|
return []
|
1132
|
-
|
1133
|
-
manager_args = {
|
1134
|
-
if languages is not None:
|
1135
|
-
|
1136
|
-
if
|
1137
|
-
|
1138
|
-
|
1395
|
+
|
1396
|
+
manager_args = {"images": image, "options": options, "engine": engine}
|
1397
|
+
if languages is not None:
|
1398
|
+
manager_args["languages"] = languages
|
1399
|
+
if min_confidence is not None:
|
1400
|
+
manager_args["min_confidence"] = min_confidence
|
1401
|
+
if device is not None:
|
1402
|
+
manager_args["device"] = device
|
1403
|
+
|
1404
|
+
logger.debug(
|
1405
|
+
f" Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }"
|
1406
|
+
)
|
1139
1407
|
try:
|
1140
1408
|
# apply_ocr now returns List[List[Dict]] or List[Dict]
|
1141
1409
|
results_list = self._ocr_manager.apply_ocr(**manager_args)
|
1142
1410
|
# If it returned a list of lists (batch mode), take the first list
|
1143
|
-
results =
|
1144
|
-
|
1411
|
+
results = (
|
1412
|
+
results_list[0]
|
1413
|
+
if isinstance(results_list, list)
|
1414
|
+
and results_list
|
1415
|
+
and isinstance(results_list[0], list)
|
1416
|
+
else results_list
|
1417
|
+
)
|
1418
|
+
|
1145
1419
|
if not isinstance(results, list):
|
1146
|
-
|
1147
|
-
|
1420
|
+
logger.error(f" OCR Manager returned unexpected type: {type(results)}")
|
1421
|
+
results = []
|
1148
1422
|
logger.info(f" OCR Manager returned {len(results)} results for extraction.")
|
1149
1423
|
except Exception as e:
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1424
|
+
logger.error(f" OCR processing failed during extraction: {e}", exc_info=True)
|
1425
|
+
return []
|
1426
|
+
|
1153
1427
|
# Convert results but DO NOT add to ElementManager
|
1154
1428
|
logger.debug(f" Converting OCR results to TextElements (extract only)...")
|
1155
1429
|
# Use a temporary method to create elements without adding them globally
|
@@ -1157,29 +1431,36 @@ class Page:
|
|
1157
1431
|
scale_x = self.width / image.width if image.width else 1
|
1158
1432
|
scale_y = self.height / image.height if image.height else 1
|
1159
1433
|
for result in results:
|
1160
|
-
x0, top, x1, bottom = [float(c) for c in result[
|
1434
|
+
x0, top, x1, bottom = [float(c) for c in result["bbox"]]
|
1161
1435
|
elem_data = {
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1436
|
+
"text": result["text"],
|
1437
|
+
"confidence": result["confidence"],
|
1438
|
+
"x0": x0 * scale_x,
|
1439
|
+
"top": top * scale_y,
|
1440
|
+
"x1": x1 * scale_x,
|
1441
|
+
"bottom": bottom * scale_y,
|
1442
|
+
"width": (x1 - x0) * scale_x,
|
1443
|
+
"height": (bottom - top) * scale_y,
|
1444
|
+
"object_type": "text",
|
1445
|
+
"source": "ocr",
|
1446
|
+
"fontname": "OCR-temp",
|
1447
|
+
"size": 10.0,
|
1448
|
+
"page_number": self.number,
|
1168
1449
|
}
|
1169
1450
|
temp_elements.append(TextElement(elem_data, self))
|
1170
1451
|
|
1171
1452
|
logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
|
1172
1453
|
return temp_elements
|
1173
|
-
|
1454
|
+
|
1174
1455
|
@property
|
1175
1456
|
def layout_analyzer(self) -> LayoutAnalyzer:
|
1176
1457
|
"""Get or create the layout analyzer for this page."""
|
1177
|
-
if self._layout_analyzer is None:
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
return self._layout_analyzer
|
1458
|
+
if self._layout_analyzer is None:
|
1459
|
+
if not self._layout_manager:
|
1460
|
+
logger.warning("LayoutManager not available, cannot create LayoutAnalyzer.")
|
1461
|
+
return None
|
1462
|
+
self._layout_analyzer = LayoutAnalyzer(self)
|
1463
|
+
return self._layout_analyzer
|
1183
1464
|
|
1184
1465
|
def analyze_layout(
|
1185
1466
|
self,
|
@@ -1189,7 +1470,7 @@ class Page:
|
|
1189
1470
|
classes: Optional[List[str]] = None,
|
1190
1471
|
exclude_classes: Optional[List[str]] = None,
|
1191
1472
|
device: Optional[str] = None,
|
1192
|
-
existing: str = "replace"
|
1473
|
+
existing: str = "replace",
|
1193
1474
|
) -> ElementCollection[Region]:
|
1194
1475
|
"""
|
1195
1476
|
Analyze the page layout using the configured LayoutManager.
|
@@ -1200,8 +1481,10 @@ class Page:
|
|
1200
1481
|
"""
|
1201
1482
|
analyzer = self.layout_analyzer
|
1202
1483
|
if not analyzer:
|
1203
|
-
|
1204
|
-
|
1484
|
+
logger.error(
|
1485
|
+
"Layout analysis failed: LayoutAnalyzer not initialized (is LayoutManager available?)."
|
1486
|
+
)
|
1487
|
+
return ElementCollection([]) # Return empty collection
|
1205
1488
|
|
1206
1489
|
# The analyzer's analyze_layout method already adds regions to the page
|
1207
1490
|
# and its element manager. We just need to retrieve them.
|
@@ -1212,17 +1495,20 @@ class Page:
|
|
1212
1495
|
classes=classes,
|
1213
1496
|
exclude_classes=exclude_classes,
|
1214
1497
|
device=device,
|
1215
|
-
existing=existing
|
1498
|
+
existing=existing,
|
1216
1499
|
)
|
1217
1500
|
|
1218
1501
|
# Retrieve the detected regions from the element manager
|
1219
1502
|
# Filter regions based on source='detected' and potentially the model used if available
|
1220
|
-
detected_regions = [
|
1221
|
-
|
1503
|
+
detected_regions = [
|
1504
|
+
r
|
1505
|
+
for r in self._element_mgr.regions
|
1506
|
+
if r.source == "detected" and (not engine or getattr(r, "model", None) == engine)
|
1507
|
+
]
|
1222
1508
|
|
1223
1509
|
return ElementCollection(detected_regions)
|
1224
1510
|
|
1225
|
-
def clear_detected_layout_regions(self) ->
|
1511
|
+
def clear_detected_layout_regions(self) -> "Page":
|
1226
1512
|
"""
|
1227
1513
|
Removes all regions from this page that were added by layout analysis
|
1228
1514
|
(i.e., regions where `source` attribute is 'detected').
|
@@ -1233,47 +1519,61 @@ class Page:
|
|
1233
1519
|
Returns:
|
1234
1520
|
Self for method chaining.
|
1235
1521
|
"""
|
1236
|
-
if
|
1237
|
-
|
1238
|
-
|
1239
|
-
|
1522
|
+
if (
|
1523
|
+
not hasattr(self._element_mgr, "regions")
|
1524
|
+
or not hasattr(self._element_mgr, "_elements")
|
1525
|
+
or "regions" not in self._element_mgr._elements
|
1526
|
+
):
|
1527
|
+
logger.debug(
|
1528
|
+
f"Page {self.index}: No regions found in ElementManager, nothing to clear."
|
1529
|
+
)
|
1530
|
+
self._regions["detected"] = [] # Ensure page's list is also clear
|
1531
|
+
return self
|
1240
1532
|
|
1241
1533
|
# Filter ElementManager's list to keep only non-detected regions
|
1242
1534
|
original_count = len(self._element_mgr.regions)
|
1243
|
-
self._element_mgr._elements[
|
1535
|
+
self._element_mgr._elements["regions"] = [
|
1536
|
+
r for r in self._element_mgr.regions if getattr(r, "source", None) != "detected"
|
1537
|
+
]
|
1244
1538
|
new_count = len(self._element_mgr.regions)
|
1245
1539
|
removed_count = original_count - new_count
|
1246
1540
|
|
1247
1541
|
# Clear the page's specific list of detected regions
|
1248
|
-
self._regions[
|
1542
|
+
self._regions["detected"] = []
|
1249
1543
|
|
1250
1544
|
logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
|
1251
1545
|
return self
|
1252
1546
|
|
1253
|
-
def get_section_between(
|
1547
|
+
def get_section_between(
|
1548
|
+
self, start_element=None, end_element=None, boundary_inclusion="both"
|
1549
|
+
) -> Optional[Region]: # Return Optional
|
1254
1550
|
"""
|
1255
1551
|
Get a section between two elements on this page.
|
1256
1552
|
"""
|
1257
1553
|
# Create a full-page region to operate within
|
1258
1554
|
page_region = self.create_region(0, 0, self.width, self.height)
|
1259
|
-
|
1555
|
+
|
1260
1556
|
# Delegate to the region's method
|
1261
1557
|
try:
|
1262
1558
|
return page_region.get_section_between(
|
1263
1559
|
start_element=start_element,
|
1264
1560
|
end_element=end_element,
|
1265
|
-
boundary_inclusion=boundary_inclusion
|
1561
|
+
boundary_inclusion=boundary_inclusion,
|
1266
1562
|
)
|
1267
1563
|
except Exception as e:
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1275
|
-
|
1276
|
-
|
1564
|
+
logger.error(
|
1565
|
+
f"Error getting section between elements on page {self.index}: {e}", exc_info=True
|
1566
|
+
)
|
1567
|
+
return None
|
1568
|
+
|
1569
|
+
def get_sections(
|
1570
|
+
self,
|
1571
|
+
start_elements=None,
|
1572
|
+
end_elements=None,
|
1573
|
+
boundary_inclusion="both",
|
1574
|
+
y_threshold=5.0,
|
1575
|
+
bounding_box=None,
|
1576
|
+
) -> "ElementCollection[Region]": # Updated type hint
|
1277
1577
|
"""
|
1278
1578
|
Get sections of a page defined by start/end elements.
|
1279
1579
|
Uses the page-level implementation.
|
@@ -1281,6 +1581,7 @@ class Page:
|
|
1281
1581
|
Returns:
|
1282
1582
|
An ElementCollection containing the found Region objects.
|
1283
1583
|
"""
|
1584
|
+
|
1284
1585
|
# Helper function to get bounds from bounding_box parameter
|
1285
1586
|
def get_bounds():
|
1286
1587
|
if bounding_box:
|
@@ -1289,130 +1590,180 @@ class Page:
|
|
1289
1590
|
return max(0, x0), max(0, top), min(self.width, x1), min(self.height, bottom)
|
1290
1591
|
else:
|
1291
1592
|
return 0, 0, self.width, self.height
|
1292
|
-
|
1593
|
+
|
1293
1594
|
regions = []
|
1294
|
-
|
1595
|
+
|
1295
1596
|
# Handle cases where elements are provided as strings (selectors)
|
1296
1597
|
if isinstance(start_elements, str):
|
1297
|
-
start_elements = self.find_all(start_elements).elements
|
1298
|
-
elif hasattr(start_elements,
|
1299
|
-
|
1300
|
-
|
1598
|
+
start_elements = self.find_all(start_elements).elements # Get list of elements
|
1599
|
+
elif hasattr(start_elements, "elements"): # Handle ElementCollection input
|
1600
|
+
start_elements = start_elements.elements
|
1601
|
+
|
1301
1602
|
if isinstance(end_elements, str):
|
1302
1603
|
end_elements = self.find_all(end_elements).elements
|
1303
|
-
elif hasattr(end_elements,
|
1304
|
-
|
1604
|
+
elif hasattr(end_elements, "elements"):
|
1605
|
+
end_elements = end_elements.elements
|
1305
1606
|
|
1306
1607
|
# Ensure start_elements is a list
|
1307
|
-
if start_elements is None:
|
1308
|
-
|
1608
|
+
if start_elements is None:
|
1609
|
+
start_elements = []
|
1610
|
+
if end_elements is None:
|
1611
|
+
end_elements = []
|
1309
1612
|
|
1310
|
-
valid_inclusions = [
|
1613
|
+
valid_inclusions = ["start", "end", "both", "none"]
|
1311
1614
|
if boundary_inclusion not in valid_inclusions:
|
1312
1615
|
raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")
|
1313
|
-
|
1616
|
+
|
1314
1617
|
if not start_elements:
|
1315
1618
|
# Return an empty ElementCollection if no start elements
|
1316
1619
|
return ElementCollection([])
|
1317
|
-
|
1620
|
+
|
1318
1621
|
# Combine start and end elements with their type
|
1319
1622
|
all_boundaries = []
|
1320
|
-
for el in start_elements:
|
1321
|
-
|
1322
|
-
|
1623
|
+
for el in start_elements:
|
1624
|
+
all_boundaries.append((el, "start"))
|
1625
|
+
for el in end_elements:
|
1626
|
+
all_boundaries.append((el, "end"))
|
1627
|
+
|
1323
1628
|
# Sort all boundary elements primarily by top, then x0
|
1324
1629
|
try:
|
1325
|
-
|
1630
|
+
all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
|
1326
1631
|
except AttributeError as e:
|
1327
|
-
|
1328
|
-
|
1632
|
+
logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
|
1633
|
+
return ElementCollection([]) # Cannot proceed if elements lack position
|
1329
1634
|
|
1330
1635
|
# Process sorted boundaries to find sections
|
1331
1636
|
current_start_element = None
|
1332
1637
|
active_section_started = False
|
1333
1638
|
|
1334
1639
|
for element, element_type in all_boundaries:
|
1335
|
-
if element_type ==
|
1640
|
+
if element_type == "start":
|
1336
1641
|
# If we have an active section, this start implicitly ends it
|
1337
1642
|
if active_section_started:
|
1338
|
-
end_boundary_el = element
|
1643
|
+
end_boundary_el = element # Use this start as the end boundary
|
1339
1644
|
# Determine region boundaries
|
1340
|
-
sec_top =
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1645
|
+
sec_top = (
|
1646
|
+
current_start_element.top
|
1647
|
+
if boundary_inclusion in ["start", "both"]
|
1648
|
+
else current_start_element.bottom
|
1649
|
+
)
|
1650
|
+
sec_bottom = (
|
1651
|
+
end_boundary_el.top
|
1652
|
+
if boundary_inclusion not in ["end", "both"]
|
1653
|
+
else end_boundary_el.bottom
|
1654
|
+
)
|
1655
|
+
|
1656
|
+
if sec_top < sec_bottom: # Ensure valid region
|
1344
1657
|
x0, _, x1, _ = get_bounds()
|
1345
1658
|
region = self.create_region(x0, sec_top, x1, sec_bottom)
|
1346
1659
|
region.start_element = current_start_element
|
1347
|
-
region.end_element = end_boundary_el
|
1348
|
-
region.is_end_next_start = True
|
1660
|
+
region.end_element = end_boundary_el # Mark the element that ended it
|
1661
|
+
region.is_end_next_start = True # Mark how it ended
|
1349
1662
|
regions.append(region)
|
1350
|
-
active_section_started = False
|
1351
|
-
|
1663
|
+
active_section_started = False # Reset for the new start
|
1664
|
+
|
1352
1665
|
# Set this as the potential start of the next section
|
1353
1666
|
current_start_element = element
|
1354
1667
|
active_section_started = True
|
1355
1668
|
|
1356
|
-
elif element_type ==
|
1669
|
+
elif element_type == "end" and active_section_started:
|
1357
1670
|
# We found an explicit end for the current section
|
1358
1671
|
end_boundary_el = element
|
1359
|
-
sec_top =
|
1360
|
-
|
1361
|
-
|
1362
|
-
|
1672
|
+
sec_top = (
|
1673
|
+
current_start_element.top
|
1674
|
+
if boundary_inclusion in ["start", "both"]
|
1675
|
+
else current_start_element.bottom
|
1676
|
+
)
|
1677
|
+
sec_bottom = (
|
1678
|
+
end_boundary_el.bottom
|
1679
|
+
if boundary_inclusion in ["end", "both"]
|
1680
|
+
else end_boundary_el.top
|
1681
|
+
)
|
1682
|
+
|
1683
|
+
if sec_top < sec_bottom: # Ensure valid region
|
1363
1684
|
x0, _, x1, _ = get_bounds()
|
1364
1685
|
region = self.create_region(x0, sec_top, x1, sec_bottom)
|
1365
1686
|
region.start_element = current_start_element
|
1366
1687
|
region.end_element = end_boundary_el
|
1367
1688
|
region.is_end_next_start = False
|
1368
1689
|
regions.append(region)
|
1369
|
-
|
1690
|
+
|
1370
1691
|
# Reset: section ended explicitly
|
1371
1692
|
current_start_element = None
|
1372
1693
|
active_section_started = False
|
1373
|
-
|
1694
|
+
|
1374
1695
|
# Handle the last section if it was started but never explicitly ended
|
1375
1696
|
if active_section_started:
|
1376
|
-
sec_top =
|
1697
|
+
sec_top = (
|
1698
|
+
current_start_element.top
|
1699
|
+
if boundary_inclusion in ["start", "both"]
|
1700
|
+
else current_start_element.bottom
|
1701
|
+
)
|
1377
1702
|
x0, _, x1, page_bottom = get_bounds()
|
1378
1703
|
if sec_top < page_bottom:
|
1379
|
-
|
1380
|
-
|
1381
|
-
|
1382
|
-
|
1383
|
-
|
1384
|
-
|
1704
|
+
region = self.create_region(x0, sec_top, x1, page_bottom)
|
1705
|
+
region.start_element = current_start_element
|
1706
|
+
region.end_element = None # Ended by page end
|
1707
|
+
region.is_end_next_start = False
|
1708
|
+
regions.append(region)
|
1709
|
+
|
1385
1710
|
# Return the list wrapped in an ElementCollection
|
1386
1711
|
return ElementCollection(regions)
|
1387
|
-
|
1712
|
+
|
1388
1713
|
def __repr__(self) -> str:
|
1389
1714
|
"""String representation of the page."""
|
1390
1715
|
return f"<Page number={self.number} index={self.index}>"
|
1391
|
-
|
1392
|
-
def ask(
|
1716
|
+
|
1717
|
+
def ask(
|
1718
|
+
self,
|
1719
|
+
question: str,
|
1720
|
+
min_confidence: float = 0.1,
|
1721
|
+
model: str = None,
|
1722
|
+
debug: bool = False,
|
1723
|
+
**kwargs,
|
1724
|
+
) -> Dict[str, Any]:
|
1393
1725
|
"""
|
1394
1726
|
Ask a question about the page content using document QA.
|
1395
1727
|
"""
|
1396
1728
|
try:
|
1397
|
-
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1401
|
-
|
1729
|
+
from natural_pdf.qa.document_qa import get_qa_engine
|
1730
|
+
|
1731
|
+
# Get or initialize QA engine with specified model
|
1732
|
+
qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
|
1733
|
+
# Ask the question using the QA engine
|
1734
|
+
return qa_engine.ask_pdf_page(
|
1735
|
+
self, question, min_confidence=min_confidence, debug=debug, **kwargs
|
1736
|
+
)
|
1402
1737
|
except ImportError:
|
1403
|
-
|
1404
|
-
|
1738
|
+
logger.error(
|
1739
|
+
"Question answering requires the 'natural_pdf.qa' module. Please install necessary dependencies."
|
1740
|
+
)
|
1741
|
+
return {
|
1742
|
+
"answer": None,
|
1743
|
+
"confidence": 0.0,
|
1744
|
+
"found": False,
|
1745
|
+
"page_num": self.number,
|
1746
|
+
"source_elements": [],
|
1747
|
+
}
|
1405
1748
|
except Exception as e:
|
1406
|
-
|
1407
|
-
|
1749
|
+
logger.error(f"Error during page.ask: {e}", exc_info=True)
|
1750
|
+
return {
|
1751
|
+
"answer": None,
|
1752
|
+
"confidence": 0.0,
|
1753
|
+
"found": False,
|
1754
|
+
"page_num": self.number,
|
1755
|
+
"source_elements": [],
|
1756
|
+
}
|
1408
1757
|
|
1409
|
-
def show_preview(
|
1410
|
-
|
1411
|
-
|
1412
|
-
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1758
|
+
def show_preview(
|
1759
|
+
self,
|
1760
|
+
temporary_highlights: List[Dict],
|
1761
|
+
scale: float = 2.0,
|
1762
|
+
width: Optional[int] = None,
|
1763
|
+
labels: bool = True,
|
1764
|
+
legend_position: str = "right",
|
1765
|
+
render_ocr: bool = False,
|
1766
|
+
) -> Optional[Image.Image]:
|
1416
1767
|
"""
|
1417
1768
|
Generates and returns a non-stateful preview image containing only
|
1418
1769
|
the provided temporary highlights.
|
@@ -1437,13 +1788,16 @@ class Page:
|
|
1437
1788
|
scale=scale,
|
1438
1789
|
labels=labels,
|
1439
1790
|
legend_position=legend_position,
|
1440
|
-
render_ocr=render_ocr
|
1791
|
+
render_ocr=render_ocr,
|
1441
1792
|
)
|
1442
1793
|
except AttributeError:
|
1443
1794
|
logger.error(f"HighlightingService does not have the required 'render_preview' method.")
|
1444
1795
|
return None
|
1445
1796
|
except Exception as e:
|
1446
|
-
logger.error(
|
1797
|
+
logger.error(
|
1798
|
+
f"Error calling highlighter.render_preview for page {self.index}: {e}",
|
1799
|
+
exc_info=True,
|
1800
|
+
)
|
1447
1801
|
return None
|
1448
1802
|
|
1449
1803
|
# Return the rendered image directly
|
@@ -1451,7 +1805,7 @@ class Page:
|
|
1451
1805
|
|
1452
1806
|
@property
|
1453
1807
|
def text_style_labels(self) -> List[str]:
|
1454
|
-
"""
|
1808
|
+
"""
|
1455
1809
|
Get a sorted list of unique text style labels found on the page.
|
1456
1810
|
|
1457
1811
|
Runs text style analysis with default options if it hasn't been run yet.
|
@@ -1461,52 +1815,66 @@ class Page:
|
|
1461
1815
|
A sorted list of unique style label strings.
|
1462
1816
|
"""
|
1463
1817
|
# Check if the summary attribute exists from a previous run
|
1464
|
-
if not hasattr(self,
|
1818
|
+
if not hasattr(self, "_text_styles_summary") or not self._text_styles_summary:
|
1465
1819
|
# If not, run the analysis with default options
|
1466
1820
|
logger.debug(f"Page {self.number}: Running default text style analysis to get labels.")
|
1467
|
-
self.analyze_text_styles()
|
1821
|
+
self.analyze_text_styles() # Use default options
|
1468
1822
|
|
1469
1823
|
# Extract labels from the summary dictionary
|
1470
|
-
if hasattr(self,
|
1824
|
+
if hasattr(self, "_text_styles_summary") and self._text_styles_summary:
|
1471
1825
|
# The summary maps style_key -> {'label': ..., 'properties': ...}
|
1472
|
-
labels = {style_info[
|
1826
|
+
labels = {style_info["label"] for style_info in self._text_styles_summary.values()}
|
1473
1827
|
return sorted(list(labels))
|
1474
1828
|
else:
|
1475
1829
|
# Fallback if summary wasn't created for some reason (e.g., no text elements)
|
1476
|
-
|
1477
|
-
|
1830
|
+
logger.warning(f"Page {self.number}: Text style summary not found after analysis.")
|
1831
|
+
return []
|
1478
1832
|
|
1479
|
-
def viewer(
|
1480
|
-
|
1481
|
-
|
1482
|
-
|
1833
|
+
def viewer(
|
1834
|
+
self,
|
1835
|
+
# elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
|
1836
|
+
# include_element_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
|
1837
|
+
) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
|
1483
1838
|
"""
|
1484
1839
|
Creates and returns an interactive ipywidget for exploring elements on this page.
|
1485
1840
|
|
1486
1841
|
Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.
|
1487
1842
|
|
1488
1843
|
Returns:
|
1489
|
-
A SimpleInteractiveViewerWidget instance ready for display in Jupyter
|
1844
|
+
A SimpleInteractiveViewerWidget instance ready for display in Jupyter,
|
1845
|
+
or None if ipywidgets is not installed or widget creation fails.
|
1490
1846
|
|
1491
1847
|
Raises:
|
1492
|
-
|
1848
|
+
# Optional: Could raise ImportError instead of returning None
|
1849
|
+
# ImportError: If required dependencies (ipywidgets) are missing.
|
1493
1850
|
ValueError: If image rendering or data preparation fails within from_page.
|
1494
1851
|
"""
|
1495
|
-
#
|
1852
|
+
# Check for availability using the imported flag and class variable
|
1853
|
+
if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
|
1854
|
+
logger.error(
|
1855
|
+
"Interactive viewer requires optional dependencies ('ipywidgets'). "
|
1856
|
+
"Install with `pip install natural-pdf[interactive]`"
|
1857
|
+
)
|
1858
|
+
# raise ImportError("ipywidgets not found.") # Option 1: Raise error
|
1859
|
+
return None # Option 2: Return None gracefully
|
1860
|
+
|
1861
|
+
# If we reach here, SimpleInteractiveViewerWidget should be the actual class
|
1496
1862
|
try:
|
1497
|
-
|
1498
|
-
|
1499
|
-
|
1500
|
-
|
1501
|
-
|
1502
|
-
|
1503
|
-
|
1863
|
+
# Pass self (the Page object) to the factory method
|
1864
|
+
return SimpleInteractiveViewerWidget.from_page(self)
|
1865
|
+
except Exception as e:
|
1866
|
+
# Catch potential errors during widget creation (e.g., image rendering)
|
1867
|
+
logger.error(
|
1868
|
+
f"Error creating viewer widget from page {self.number}: {e}", exc_info=True
|
1869
|
+
)
|
1870
|
+
# raise # Option 1: Re-raise error (might include ValueError from from_page)
|
1871
|
+
return None # Option 2: Return None on creation error
|
1504
1872
|
|
1505
1873
|
# --- Indexable Protocol Methods ---
|
1506
1874
|
def get_id(self) -> str:
|
1507
1875
|
"""Returns a unique identifier for the page (required by Indexable protocol)."""
|
1508
1876
|
# Ensure path is safe for use in IDs (replace problematic chars)
|
1509
|
-
safe_path = re.sub(r
|
1877
|
+
safe_path = re.sub(r"[^a-zA-Z0-9_-]", "_", str(self.pdf.path))
|
1510
1878
|
return f"pdf_{safe_path}_page_{self.page_number}"
|
1511
1879
|
|
1512
1880
|
def get_metadata(self) -> Dict[str, Any]:
|
@@ -1517,21 +1885,47 @@ class Page:
|
|
1517
1885
|
"page_number": self.page_number,
|
1518
1886
|
"width": self.width,
|
1519
1887
|
"height": self.height,
|
1520
|
-
"content_hash": self.get_content_hash()
|
1888
|
+
"content_hash": self.get_content_hash(), # Include the hash
|
1521
1889
|
}
|
1522
1890
|
return metadata
|
1523
1891
|
|
1524
|
-
def get_content(self) ->
|
1892
|
+
def get_content(self) -> "Page":
|
1525
1893
|
"""
|
1526
1894
|
Returns the primary content object (self) for indexing (required by Indexable protocol).
|
1527
1895
|
SearchService implementations decide how to process this (e.g., call extract_text).
|
1528
1896
|
"""
|
1529
|
-
return self
|
1897
|
+
return self # Return the Page object itself
|
1530
1898
|
|
1531
1899
|
def get_content_hash(self) -> str:
|
1532
1900
|
"""Returns a SHA256 hash of the extracted text content (required by Indexable for sync)."""
|
1533
1901
|
# Hash the extracted text (without exclusions for consistency)
|
1534
1902
|
# Consider if exclusions should be part of the hash? For now, hash raw text.
|
1535
1903
|
# Using extract_text directly might be slow if called repeatedly. Cache? TODO: Optimization
|
1536
|
-
text_content = self.extract_text(
|
1537
|
-
|
1904
|
+
text_content = self.extract_text(
|
1905
|
+
use_exclusions=False, preserve_whitespace=False
|
1906
|
+
) # Normalize whitespace?
|
1907
|
+
return hashlib.sha256(text_content.encode("utf-8")).hexdigest()
|
1908
|
+
|
1909
|
+
# --- New Method: save_searchable ---
|
1910
|
+
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
1911
|
+
"""
|
1912
|
+
Saves the PDF page with an OCR text layer, making content searchable.
|
1913
|
+
|
1914
|
+
Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
|
1915
|
+
|
1916
|
+
Note: OCR must have been applied to the pages beforehand
|
1917
|
+
(e.g., using pdf.apply_ocr()).
|
1918
|
+
|
1919
|
+
Args:
|
1920
|
+
output_path: Path to save the searchable PDF.
|
1921
|
+
dpi: Resolution for rendering and OCR overlay (default 300).
|
1922
|
+
**kwargs: Additional keyword arguments passed to the exporter.
|
1923
|
+
"""
|
1924
|
+
# Import moved here, assuming it's always available now
|
1925
|
+
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
1926
|
+
|
1927
|
+
# Convert pathlib.Path to string if necessary
|
1928
|
+
output_path_str = str(output_path)
|
1929
|
+
|
1930
|
+
create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
|
1931
|
+
logger.info(f"Searchable PDF saved to: {output_path_str}")
|