natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1,51 +1,66 @@
-import pdfplumber
-import os
-import logging
-import tempfile
-from typing import List, Optional, Union, Any, Dict, Callable, TYPE_CHECKING, Tuple
-from PIL import Image
 import base64
+import hashlib
 import io
 import json
+import logging
+import os
 import re
-import
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+
+import pdfplumber
+from PIL import Image, ImageDraw
 
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
 
 if TYPE_CHECKING:
     import pdfplumber
-
-    from natural_pdf.elements.collections import ElementCollection
+
     from natural_pdf.core.highlighting_service import HighlightingService
+    from natural_pdf.core.pdf import PDF
     from natural_pdf.elements.base import Element
+    from natural_pdf.elements.collections import ElementCollection
 
-
+# New Imports
+import itertools
+
+from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
+from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+
+from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
 from natural_pdf.analyzers.layout.layout_options import LayoutOptions
-from natural_pdf.ocr import OCROptions
-from natural_pdf.ocr import OCRManager
-from natural_pdf.core.element_manager import ElementManager
-from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
-from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
 from natural_pdf.analyzers.text_options import TextStyleOptions
+from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
+from natural_pdf.core.element_manager import ElementManager
+from natural_pdf.elements.text import TextElement
+from natural_pdf.ocr import OCRManager, OCROptions
+
+# Import new utils
+from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
 from natural_pdf.widgets import InteractiveViewerWidget
-from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
+from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
+
+from natural_pdf.qa import DocumentQA, get_qa_engine
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 
 logger = logging.getLogger(__name__)
 
+
 class Page:
     """
     Enhanced Page wrapper built on top of pdfplumber.Page.
-
+
     This class provides a fluent interface for working with PDF pages,
     with improved selection, navigation, extraction, and question-answering capabilities.
     """
-
-    def __init__(self, page:
+
+    def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
        """
        Initialize a page wrapper.
-
+
        Args:
            page: pdfplumber page object
            parent: Parent PDF object
@@ -57,39 +72,51 @@ class Page:
        self._index = index
        self._text_styles = None  # Lazy-loaded text style analyzer results
        self._exclusions = []  # List to store exclusion functions/regions
-
+
        # Region management
        self._regions = {
-
-
+            "detected": [],  # Layout detection results
+            "named": {},  # Named regions (name -> region)
        }
-
+
        # Initialize ElementManager
        self._element_mgr = ElementManager(self, font_attrs)

        # --- Get OCR Manager Instance ---
-        if
+        if (
+            OCRManager
+            and hasattr(parent, "_ocr_manager")
+            and isinstance(parent._ocr_manager, OCRManager)
+        ):
            self._ocr_manager = parent._ocr_manager
            logger.debug(f"Page {self.number}: Using OCRManager instance from parent PDF.")
        else:
            self._ocr_manager = None
            if OCRManager:
-
+                logger.warning(
+                    f"Page {self.number}: OCRManager instance not found on parent PDF object."
+                )

        # --- Get Layout Manager Instance ---
-        if
+        if (
+            LayoutManager
+            and hasattr(parent, "_layout_manager")
+            and isinstance(parent._layout_manager, LayoutManager)
+        ):
            self._layout_manager = parent._layout_manager
            logger.debug(f"Page {self.number}: Using LayoutManager instance from parent PDF.")
        else:
            self._layout_manager = None
            if LayoutManager:
-
+                logger.warning(
+                    f"Page {self.number}: LayoutManager instance not found on parent PDF object. Layout analysis will fail."
+                )

        # Initialize the internal variable with a single underscore
-        self._layout_analyzer = None
+        self._layout_analyzer = None

    @property
-    def pdf(self) ->
+    def pdf(self) -> "PDF":
        """Provides public access to the parent PDF object."""
        return self._parent

@@ -97,7 +124,7 @@ class Page:
    def number(self) -> int:
        """Get page number (1-based)."""
        return self._page.page_number
-
+
    @property
    def page_number(self) -> int:
        """Get page number (1-based)."""
@@ -107,12 +134,12 @@
    def index(self) -> int:
        """Get page index (0-based)."""
        return self._index
-
+
    @property
    def width(self) -> float:
        """Get page width."""
        return self._page.width
-
+
    @property
    def height(self) -> float:
        """Get page height."""
@@ -120,107 +147,125 @@

    # --- Highlighting Service Accessor ---
    @property
-    def _highlighter(self) ->
-
-
-
-
-
+    def _highlighter(self) -> "HighlightingService":
+        """Provides access to the parent PDF's HighlightingService."""
+        if not hasattr(self._parent, "highlighter"):
+            # This should ideally not happen if PDF.__init__ works correctly
+            raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
+        return self._parent.highlighter

-    def clear_exclusions(self) ->
+    def clear_exclusions(self) -> "Page":
        """
        Clear all exclusions from the page.
        """
        self._exclusions = []
        return self

-    def add_exclusion(
+    def add_exclusion(
+        self,
+        exclusion_func_or_region: Union[Callable[["Page"], Region], Region, Any],
+        label: Optional[str] = None,
+    ) -> "Page":
        """
        Add an exclusion to the page. Text from these regions will be excluded from extraction.
        Ensures non-callable items are stored as Region objects if possible.
-
+
        Args:
            exclusion_func_or_region: Either a callable function returning a Region,
                a Region object, or another object with a valid .bbox attribute.
            label: Optional label for this exclusion (e.g., 'header', 'footer').
-
+
        Returns:
            Self for method chaining
-
+
        Raises:
            TypeError: If a non-callable, non-Region object without a valid bbox is provided.
        """
-        exclusion_data = None
+        exclusion_data = None  # Initialize exclusion data

        if callable(exclusion_func_or_region):
            # Store callable functions along with their label
            exclusion_data = (exclusion_func_or_region, label)
-            logger.debug(
+            logger.debug(
+                f"Page {self.index}: Added callable exclusion '{label}': {exclusion_func_or_region}"
+            )
        elif isinstance(exclusion_func_or_region, Region):
            # Store Region objects directly, assigning the label
-            exclusion_func_or_region.label = label
-            exclusion_data = (exclusion_func_or_region, label)
-            logger.debug(
-
+            exclusion_func_or_region.label = label  # Assign label
+            exclusion_data = (exclusion_func_or_region, label)  # Store as tuple for consistency
+            logger.debug(
+                f"Page {self.index}: Added Region exclusion '{label}': {exclusion_func_or_region}"
+            )
+        elif (
+            hasattr(exclusion_func_or_region, "bbox")
+            and isinstance(getattr(exclusion_func_or_region, "bbox", None), (tuple, list))
+            and len(exclusion_func_or_region.bbox) == 4
+        ):
            # Convert objects with a valid bbox to a Region before storing
            try:
                bbox_coords = tuple(float(v) for v in exclusion_func_or_region.bbox)
                # Pass the label to the Region constructor
                region_to_add = Region(self, bbox_coords, label=label)
-                exclusion_data = (region_to_add, label)
-                logger.debug(
+                exclusion_data = (region_to_add, label)  # Store as tuple
+                logger.debug(
+                    f"Page {self.index}: Added exclusion '{label}' converted to Region from {type(exclusion_func_or_region)}: {region_to_add}"
+                )
            except (ValueError, TypeError, Exception) as e:
                # Raise an error if conversion fails
-                raise TypeError(
+                raise TypeError(
+                    f"Failed to convert exclusion object {exclusion_func_or_region} with bbox {getattr(exclusion_func_or_region, 'bbox', 'N/A')} to Region: {e}"
+                ) from e
        else:
            # Reject invalid types
-            raise TypeError(
+            raise TypeError(
+                f"Invalid exclusion type: {type(exclusion_func_or_region)}. Must be callable, Region, or have a valid .bbox attribute."
+            )

        # Append the stored data (tuple of object/callable and label)
        if exclusion_data:
            self._exclusions.append(exclusion_data)

        return self
-
-    def add_region(self, region: Region, name: Optional[str] = None) ->
+
+    def add_region(self, region: Region, name: Optional[str] = None) -> "Page":
        """
        Add a region to the page.
-
+
        Args:
            region: Region object to add
            name: Optional name for the region
-
+
        Returns:
            Self for method chaining
        """
        # Check if it's actually a Region object
        if not isinstance(region, Region):
            raise TypeError("region must be a Region object")
-
+
        # Set the source and name
-        region.source =
-
+        region.source = "named"
+
        if name:
            region.name = name
            # Add to named regions dictionary (overwriting if name already exists)
-            self._regions[
+            self._regions["named"][name] = region
        else:
            # Add to detected regions list (unnamed but registered)
-            self._regions[
-
+            self._regions["detected"].append(region)
+
        # Add to element manager for selector queries
        self._element_mgr.add_region(region)
-
+
        return self
-
-    def add_regions(self, regions: List[Region], prefix: Optional[str] = None) ->
+
+    def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> "Page":
        """
        Add multiple regions to the page.
-
+
        Args:
            regions: List of Region objects to add
            prefix: Optional prefix for automatic naming (regions will be named prefix_1, prefix_2, etc.)
-
+
        Returns:
            Self for method chaining
        """
@@ -232,23 +277,23 @@
            # Add without names
            for region in regions:
                self.add_region(region)
-
+
        return self
-
+
    def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
        """
        Get all exclusion regions for this page.
        Assumes self._exclusions contains tuples of (callable/Region, label).
-
+
        Args:
            include_callable: Whether to evaluate callable exclusion functions
            debug: Enable verbose debug logging for exclusion evaluation
-
+
        Returns:
            List of Region objects to exclude, with labels assigned.
        """
        regions = []
-
+
        if debug:
            print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")

@@ -280,32 +325,39 @@
                        if debug:
                            print(f"  ✓ Added region from callable '{label}': {region_result}")
                    elif region_result:
-
-
-
+                        logger.warning(
+                            f"Callable exclusion '{exclusion_label}' returned non-Region object: {type(region_result)}. Skipping."
+                        )
+                        if debug:
+                            print(f"  ✗ Callable returned non-Region/None: {type(region_result)}")
                    else:
                        if debug:
-                            print(
+                            print(
+                                f"  ✗ Callable '{exclusion_label}' returned None, no region added"
+                            )

                except Exception as e:
                    error_msg = f"Error evaluating callable exclusion '{exclusion_label}' for page {self.index}: {e}"
                    print(error_msg)
                    import traceback
+
                    print(f"  Traceback: {traceback.format_exc().splitlines()[-3:]}")

            # Process direct Region objects (label was assigned in add_exclusion)
            elif isinstance(exclusion_item, Region):
-                regions.append(exclusion_item)
+                regions.append(exclusion_item)  # Label is already on the Region object
                if debug:
                    print(f"  - Added direct region '{label}': {exclusion_item}")
            # No else needed, add_exclusion should prevent invalid types
-
+
        if debug:
            print(f"Page {self.index}: Found {len(regions)} valid exclusion regions to apply")
-
+
        return regions

-    def _filter_elements_by_exclusions(
+    def _filter_elements_by_exclusions(
+        self, elements: List["Element"], debug_exclusions: bool = False
+    ) -> List["Element"]:
        """
        Filters a list of elements, removing those within the page's exclusion regions.

@@ -318,19 +370,27 @@
        """
        if not self._exclusions:
            if debug_exclusions:
-                print(
+                print(
+                    f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
+                )
            return elements

        # Get all exclusion regions, including evaluating callable functions
-        exclusion_regions = self._get_exclusion_regions(
+        exclusion_regions = self._get_exclusion_regions(
+            include_callable=True, debug=debug_exclusions
+        )

        if not exclusion_regions:
            if debug_exclusions:
-                print(
+                print(
+                    f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements."
+                )
            return elements

        if debug_exclusions:
-            print(
+            print(
+                f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements."
+            )

        filtered_elements = []
        excluded_count = 0
@@ -346,7 +406,9 @@
                filtered_elements.append(element)

        if debug_exclusions:
-            print(
+            print(
+                f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}."
+            )

        return filtered_elements

@@ -365,15 +427,18 @@
            Element object or None if not found
        """
        from natural_pdf.selectors.parser import parse_selector
+
        selector_obj = parse_selector(selector)
-
+
        # Pass regex and case flags to selector function
-        kwargs[
-        kwargs[
-
+        kwargs["regex"] = regex
+        kwargs["case"] = case
+
        # First get all matching elements without applying exclusions initially within _apply_selector
-        results_collection = self._apply_selector(
-
+        results_collection = self._apply_selector(
+            selector_obj, **kwargs
+        )  # _apply_selector doesn't filter
+
        # Filter the results based on exclusions if requested
        if apply_exclusions and self._exclusions and results_collection:
            filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
@@ -385,7 +450,9 @@
        else:
            return None

-    def find_all(
+    def find_all(
+        self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
+    ) -> "ElementCollection":
        """
        Find all elements on this page matching selector.

@@ -395,20 +462,23 @@
            regex: Whether to use regex for text search in :contains (default: False)
            case: Whether to do case-sensitive text search (default: True)
            **kwargs: Additional filter parameters
-
+
        Returns:
            ElementCollection with matching elements
        """
        from natural_pdf.selectors.parser import parse_selector
+
        selector_obj = parse_selector(selector)
-
+
        # Pass regex and case flags to selector function
-        kwargs[
-        kwargs[
-
+        kwargs["regex"] = regex
+        kwargs["case"] = case
+
        # First get all matching elements without applying exclusions initially within _apply_selector
-        results_collection = self._apply_selector(
-
+        results_collection = self._apply_selector(
+            selector_obj, **kwargs
+        )  # _apply_selector doesn't filter
+
        # Filter the results based on exclusions if requested
        if apply_exclusions and self._exclusions and results_collection:
            filtered_elements = self._filter_elements_by_exclusions(results_collection.elements)
@@ -416,208 +486,348 @@
        else:
            # Return the unfiltered collection
            return results_collection
-
-    def _apply_selector(
+
+    def _apply_selector(
+        self, selector_obj: Dict, **kwargs
+    ) -> "ElementCollection":  # Removed apply_exclusions arg
        """
        Apply selector to page elements.
        Exclusions are now handled by the calling methods (find, find_all) if requested.
-
+
        Args:
            selector_obj: Parsed selector dictionary
            **kwargs: Additional filter parameters including 'regex' and 'case'
-
+
        Returns:
            ElementCollection of matching elements (unfiltered by exclusions)
        """
        from natural_pdf.selectors.parser import selector_to_filter_func
-
+
        # Get element type to filter
-        element_type = selector_obj.get(
-
+        element_type = selector_obj.get("type", "any").lower()
+
        # Determine which elements to search based on element type
        elements_to_search = []
-        if element_type ==
+        if element_type == "any":
            elements_to_search = self._element_mgr.get_all_elements()
-        elif element_type ==
+        elif element_type == "text":
            elements_to_search = self._element_mgr.words
-        elif element_type ==
+        elif element_type == "char":
            elements_to_search = self._element_mgr.chars
-        elif element_type ==
+        elif element_type == "word":
            elements_to_search = self._element_mgr.words
-        elif element_type ==
+        elif element_type == "rect" or element_type == "rectangle":
            elements_to_search = self._element_mgr.rects
-        elif element_type ==
+        elif element_type == "line":
            elements_to_search = self._element_mgr.lines
-        elif element_type ==
+        elif element_type == "region":
            elements_to_search = self._element_mgr.regions
        else:
            elements_to_search = self._element_mgr.get_all_elements()
-
+
        # Create filter function from selector, passing any additional parameters
        filter_func = selector_to_filter_func(selector_obj, **kwargs)
-
+
        # Apply the filter to matching elements
        matching_elements = [element for element in elements_to_search if filter_func(element)]
-
+
        # Handle spatial pseudo-classes that require relationship checking
-        for pseudo in selector_obj.get(
-            name = pseudo.get(
-            args = pseudo.get(
-
-            if name in (
+        for pseudo in selector_obj.get("pseudo_classes", []):
+            name = pseudo.get("name")
+            args = pseudo.get("args", "")
+
+            if name in ("above", "below", "near", "left-of", "right-of"):
                # Find the reference element first
                from natural_pdf.selectors.parser import parse_selector
+
                ref_selector = parse_selector(args) if isinstance(args, str) else args
                # Recursively call _apply_selector for reference element (exclusions handled later)
-                ref_elements = self._apply_selector(ref_selector, **kwargs)
-
+                ref_elements = self._apply_selector(ref_selector, **kwargs)
+
                if not ref_elements:
                    return ElementCollection([])
-
+
                ref_element = ref_elements.first
-                if not ref_element:
-
+                if not ref_element:
+                    continue
+
                # Filter elements based on spatial relationship
-                if name ==
-                    matching_elements = [
-
-
-
-
-
-
-                elif name ==
+                if name == "above":
+                    matching_elements = [
+                        el
+                        for el in matching_elements
+                        if hasattr(el, "bottom")
+                        and hasattr(ref_element, "top")
+                        and el.bottom <= ref_element.top
+                    ]
+                elif name == "below":
+                    matching_elements = [
+                        el
+                        for el in matching_elements
+                        if hasattr(el, "top")
+                        and hasattr(ref_element, "bottom")
+                        and el.top >= ref_element.bottom
+                    ]
+                elif name == "left-of":
+                    matching_elements = [
+                        el
+                        for el in matching_elements
+                        if hasattr(el, "x1")
+                        and hasattr(ref_element, "x0")
+                        and el.x1 <= ref_element.x0
+                    ]
+                elif name == "right-of":
+                    matching_elements = [
+                        el
+                        for el in matching_elements
+                        if hasattr(el, "x0")
+                        and hasattr(ref_element, "x1")
+                        and el.x0 >= ref_element.x1
+                    ]
+                elif name == "near":
+
                    def distance(el1, el2):
-
-
-
-
-
-
-
-
-
-
-
-
+                        if not (
+                            hasattr(el1, "x0")
+                            and hasattr(el1, "x1")
+                            and hasattr(el1, "top")
+                            and hasattr(el1, "bottom")
+                            and hasattr(el2, "x0")
+                            and hasattr(el2, "x1")
+                            and hasattr(el2, "top")
+                            and hasattr(el2, "bottom")
+                        ):
+                            return float("inf")  # Cannot calculate distance
+                        el1_center_x = (el1.x0 + el1.x1) / 2
+                        el1_center_y = (el1.top + el1.bottom) / 2
+                        el2_center_x = (el2.x0 + el2.x1) / 2
+                        el2_center_y = (el2.top + el2.bottom) / 2
+                        return (
+                            (el1_center_x - el2_center_x) ** 2 + (el1_center_y - el2_center_y) ** 2
+                        ) ** 0.5
+
+                    threshold = kwargs.get("near_threshold", 50)
+                    matching_elements = [
+                        el for el in matching_elements if distance(el, ref_element) <= threshold
+                    ]
+
        # Sort elements in reading order if requested
-        if kwargs.get(
-            if all(hasattr(el,
-
+        if kwargs.get("reading_order", True):
+            if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
+                matching_elements.sort(key=lambda el: (el.top, el.x0))
            else:
-
-
+                logger.warning(
+                    "Cannot sort elements in reading order: Missing required attributes (top, x0)."
+                )
+
        # Create result collection - exclusions are handled by the calling methods (find, find_all)
        result = ElementCollection(matching_elements)
-
+
        return result

    def create_region(self, x0: float, top: float, x1: float, bottom: float) -> Any:
        """
        Create a region on this page with the specified coordinates.
-
+
        Args:
            x0: Left x-coordinate
            top: Top y-coordinate
            x1: Right x-coordinate
            bottom: Bottom y-coordinate
-
+
        Returns:
            Region object for the specified coordinates
        """
        from natural_pdf.elements.region import Region
+
        return Region(self, (x0, top, x1, bottom))
-
-    def region(
-
+
+    def region(
+        self,
+        left: float = None,
+        top: float = None,
+        right: float = None,
+        bottom: float = None,
+        width: Union[str, float, None] = None,
+        height: Optional[float] = None,
+    ) -> Any:
        """
-        Create a region on this page with more intuitive named parameters
-
+        Create a region on this page with more intuitive named parameters,
+        allowing definition by coordinates or by coordinate + dimension.
+
        Args:
-            left: Left x-coordinate (default: 0)
-            top: Top y-coordinate (default: 0)
-            right: Right x-coordinate (default: page width)
-            bottom: Bottom y-coordinate (default: page height)
-            width: Width
-
+            left: Left x-coordinate (default: 0 if width not used).
+            top: Top y-coordinate (default: 0 if height not used).
+            right: Right x-coordinate (default: page width if width not used).
+            bottom: Bottom y-coordinate (default: page height if height not used).
+            width: Width definition. Can be:
+                - Numeric: The width of the region in points. Cannot be used with both left and right.
+                - String 'full': Sets region width to full page width (overrides left/right).
+                - String 'element' or None (default): Uses provided/calculated left/right,
+                  defaulting to page width if neither are specified.
+            height: Numeric height of the region. Cannot be used with both top and bottom.
+
        Returns:
            Region object for the specified coordinates
-
+
+        Raises:
+            ValueError: If conflicting arguments are provided (e.g., top, bottom, and height)
+                or if width is an invalid string.
+
        Examples:
-            >>> page.region(top=100,
-            >>> page.region(left=50,
-
-
-
-
-
-
-
-            #
-
-
-
-
-            raise ValueError("
-
+            >>> page.region(top=100, height=50)  # Region from y=100 to y=150, default width
+            >>> page.region(left=50, width=100)  # Region from x=50 to x=150, default height
+            >>> page.region(bottom=500, height=50)  # Region from y=450 to y=500
+            >>> page.region(right=200, width=50)  # Region from x=150 to x=200
+            >>> page.region(top=100, bottom=200, width="full")  # Explicit full width
+        """
+        # --- Type checking and basic validation ---
+        is_width_numeric = isinstance(width, (int, float))
+        is_width_string = isinstance(width, str)
+        width_mode = "element"  # Default mode
+
+        if height is not None and top is not None and bottom is not None:
+            raise ValueError("Cannot specify top, bottom, and height simultaneously.")
+        if is_width_numeric and left is not None and right is not None:
+            raise ValueError("Cannot specify left, right, and a numeric width simultaneously.")
+        if is_width_string:
+            width_lower = width.lower()
+            if width_lower not in ["full", "element"]:
+                raise ValueError("String width argument must be 'full' or 'element'.")
+            width_mode = width_lower
+
+        # --- Calculate Coordinates ---
+        final_top = top
+        final_bottom = bottom
+        final_left = left
+        final_right = right
+
+        # Height calculations
+        if height is not None:
+            if top is not None:
+                final_bottom = top + height
+            elif bottom is not None:
+                final_top = bottom - height
+            else:  # Neither top nor bottom provided, default top to 0
+                final_top = 0
+                final_bottom = height
+
+        # Width calculations (numeric only)
+        if is_width_numeric:
+            if left is not None:
+                final_right = left + width
+            elif right is not None:
+                final_left = right - width
+            else:  # Neither left nor right provided, default left to 0
+                final_left = 0
+                final_right = width
+
+        # --- Apply Defaults for Unset Coordinates ---
+        # Only default coordinates if they weren't set by dimension calculation
+        if final_top is None:
+            final_top = 0
+        if final_bottom is None:
+            # Check if bottom should have been set by height calc
+            if height is None or top is None:
+                final_bottom = self.height
+
+        if final_left is None:
+            final_left = 0
+        if final_right is None:
+            # Check if right should have been set by width calc
+            if not is_width_numeric or left is None:
+                final_right = self.width
+
+        # --- Handle width_mode == 'full' ---
+        if width_mode == "full":
+            # Override left/right if mode is full
+            final_left = 0
+            final_right = self.width
+
+        # --- Final Validation & Creation ---
+        # Ensure coordinates are within page bounds (clamp)
+        final_left = max(0, final_left)
+        final_top = max(0, final_top)
+        final_right = min(self.width, final_right)
+        final_bottom = min(self.height, final_bottom)
+
+        # Ensure valid box (x0<=x1, top<=bottom)
+        if final_left > final_right:
+            logger.warning(f"Calculated left ({final_left}) > right ({final_right}). Swapping.")
+            final_left, final_right = final_right, final_left
+        if final_top > final_bottom:
+            logger.warning(f"Calculated top ({final_top}) > bottom ({final_bottom}). Swapping.")
+            final_top, final_bottom = final_bottom, final_top
+
        from natural_pdf.elements.region import Region
-
+
+        region = Region(self, (final_left, final_top, final_right, final_bottom))
        return region
-
-    def get_elements(
+
+    def get_elements(
+        self, apply_exclusions=True, debug_exclusions: bool = False
+    ) -> List["Element"]:
        """
        Get all elements on this page.
-
+
        Args:
            apply_exclusions: Whether to apply exclusion regions (default: True).
            debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
-
+
        Returns:
            List of all elements on the page, potentially filtered by exclusions.
        """
        # Get all elements from the element manager
        all_elements = self._element_mgr.get_all_elements()
-
+
        # Apply exclusions if requested
        if apply_exclusions and self._exclusions:
-            return self._filter_elements_by_exclusions(
+            return self._filter_elements_by_exclusions(
+                all_elements, debug_exclusions=debug_exclusions
+            )
        else:
            if debug_exclusions:
-
+                print(
+                    f"Page {self.index}: get_elements returning all {len(all_elements)} elements (exclusions not applied)."
+                )
            return all_elements
-
-    def filter_elements(
+
+    def filter_elements(
+        self, elements: List["Element"], selector: str, **kwargs
+    ) -> List["Element"]:
        """
        Filter a list of elements based on a selector.
-
+
        Args:
            elements: List of elements to filter
            selector: CSS-like selector string
            **kwargs: Additional filter parameters
-
+
        Returns:
            List of elements that match the selector
        """
        from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
-
+
        # Parse the selector
        selector_obj = parse_selector(selector)
-
+
        # Create filter function from selector
        filter_func = selector_to_filter_func(selector_obj, **kwargs)
-
+
        # Apply the filter to the elements
        matching_elements = [element for element in elements if filter_func(element)]
-
+
        # Sort elements in reading order if requested
-        if kwargs.get(
-            if all(hasattr(el,
-
+        if kwargs.get("reading_order", True):
+            if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
+                matching_elements.sort(key=lambda el: (el.top, el.x0))
            else:
-
-
+                logger.warning(
+                    "Cannot sort elements in reading order: Missing required attributes (top, x0)."
+                )
+
        return matching_elements
-
+
    def until(self, selector: str, include_endpoint: bool = True, **kwargs) -> Any:
        """
        Select content from the top of the page until matching selector.
@@ -626,26 +836,28 @@
            selector: CSS-like selector string
            include_endpoint: Whether to include the endpoint element in the region
            **kwargs: Additional selection parameters
-
+
        Returns:
            Region object representing the selected content
-
+
        Examples:
            >>> page.until('text:contains("Conclusion")')  # Select from top to conclusion
            >>> page.until('line[width>=2]', include_endpoint=False)  # Select up to thick line
        """
-        # Find the target element
+        # Find the target element
        target = self.find(selector, **kwargs)
        if not target:
            # If target not found, return a default region (full page)
            from natural_pdf.elements.region import Region
+
            return Region(self, (0, 0, self.width, self.height))
-
+
        # Create a region from the top of the page to the target
        from natural_pdf.elements.region import Region
+
        # Ensure target has positional attributes before using them
-        target_top = getattr(target,
-        target_bottom = getattr(target,
+        target_top = getattr(target, "top", 0)
+        target_bottom = getattr(target, "bottom", self.height)

        if include_endpoint:
            # Include the target element
@@ -653,17 +865,16 @@
        else:
            # Up to the target element
            region = Region(self, (0, 0, self.width, target_top))
-
+
        region.end_element = target
        return region

-
    def crop(self, bbox=None, **kwargs) -> Any:
        """
        Crop the page to the specified bounding box.

        This is a direct wrapper around pdfplumber's crop method.
-
+
        Args:
            bbox: Bounding box (x0, top, x1, bottom) or None
            **kwargs: Additional parameters (top, bottom, left, right)
@@ -674,59 +885,82 @@
        # Returns the pdfplumber page object, not a natural-pdf Page
        return self._page.crop(bbox, **kwargs)

-    def extract_text(
-
-
-        debug_exclusions=False, **kwargs) -> str:
+    def extract_text(
+        self, preserve_whitespace=True, use_exclusions=True, debug_exclusions=False, **kwargs
+    ) -> str:
        """
-        Extract text from this page, respecting
-
+        Extract text from this page, respecting exclusions and using pdfplumber's
+        layout engine (chars_to_textmap) if layout arguments are provided or default.
+
        Args:
-
-
-            debug_exclusions: Whether to output detailed exclusion debugging info (default: False)
-            **kwargs: Additional
-
+            use_exclusions: Whether to apply exclusion regions (default: True).
+                Note: Filtering logic is now always applied if exclusions exist.
+            debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
+            **kwargs: Additional layout parameters passed directly to pdfplumber's
+                `chars_to_textmap` function. Common parameters include:
+                - layout (bool): If True (default), inserts spaces/newlines.
+                - x_density (float): Pixels per character horizontally.
+                - y_density (float): Pixels per line vertically.
+                - x_tolerance (float): Tolerance for horizontal character grouping.
+                - y_tolerance (float): Tolerance for vertical character grouping.
+                - line_dir (str): 'ttb', 'btt', 'ltr', 'rtl'
+                - char_dir (str): 'ttb', 'btt', 'ltr', 'rtl'
+                See pdfplumber documentation for more.
+
        Returns:
-            Extracted text as string
-        """
-
-
-
-
-
-
-
-
-
-
-
-
+            Extracted text as string, potentially with layout-based spacing.
+        """
+        logger.debug(f"Page {self.number}: extract_text called with kwargs: {kwargs}")
+        debug = kwargs.get("debug", debug_exclusions)  # Allow 'debug' kwarg
+
+        # 1. Get Word Elements (triggers load_elements if needed)
+        word_elements = self.words
+        if not word_elements:
+            logger.debug(f"Page {self.number}: No word elements found.")
+            return ""
+
+        # 2. Get Exclusions
+        apply_exclusions_flag = kwargs.get("use_exclusions", True)
+        exclusion_regions = []
+        if apply_exclusions_flag and self._exclusions:
+            exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
+            if debug:
+                logger.debug(f"Page {self.number}: Applying {len(exclusion_regions)} exclusions.")
+        elif debug:
+            logger.debug(f"Page {self.number}: Not applying exclusions.")
+
+        # 3. Collect All Character Dictionaries from Word Elements
+        all_char_dicts = []
+        for word in word_elements:
+            all_char_dicts.extend(getattr(word, "_char_dicts", []))
+
+        # 4. Spatially Filter Characters
+        filtered_chars = filter_chars_spatially(
+            char_dicts=all_char_dicts,
+            exclusion_regions=exclusion_regions,
+            target_region=None,  # No target region for full page extraction
+            debug=debug,
+        )

-        #
-
+        # 5. Generate Text Layout using Utility
+        # Pass page bbox as layout context
+        page_bbox = (0, 0, self.width, self.height)
+        result = generate_text_layout(
+            char_dicts=filtered_chars,
+            layout_context_bbox=page_bbox,
+            user_kwargs=kwargs,  # Pass original user kwargs
+        )

-
-        collection = ElementCollection(filtered_elements)
-        # Ensure elements are sorted for logical text flow (might be redundant if self.words is sorted)
-        if all(hasattr(el, 'top') and hasattr(el, 'x0') for el in collection.elements):
-            collection.sort(key=lambda el: (el.top, el.x0))
-
-        # Join text, handling potential missing text attributes gracefully
-        result = " ".join(getattr(el, 'text', '') for el in collection.elements)
-
-        if debug_exclusions:
-            print(f"Page {self.index}: Extracted {len(result)} characters of text with exclusions applied.")
-
+        logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
        return result

    def extract_table(self, table_settings={}) -> List[Any]:
        """
        Extract the largest table from this page.
-
+
        Args:
            table_settings: Additional extraction parameters
-
+
        Returns:
            List of extracted tables (or None if no table found)
        """
@@ -736,10 +970,10 @@
    def extract_tables(self, table_settings={}) -> List[Any]:
        """
        Extract tables from this page.
-
+
        Args:
            table_settings: Additional extraction parameters
-
+
        Returns:
            List of extracted tables
        """
@@ -749,33 +983,33 @@
    def _load_elements(self):
        """Load all elements from the page via ElementManager."""
        self._element_mgr.load_elements()
-
+
    def _create_char_elements(self):
        """DEPRECATED: Use self._element_mgr.chars"""
        logger.warning("_create_char_elements is deprecated. Access via self._element_mgr.chars.")
-        return self._element_mgr.chars
+        return self._element_mgr.chars  # Delegate

    def _process_font_information(self, char_dict):
-
-
-
-
+        """DEPRECATED: Handled by ElementManager"""
+        logger.warning("_process_font_information is deprecated. Handled by ElementManager.")
+        # ElementManager handles this internally
+        pass

    def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
        """DEPRECATED: Use self._element_mgr.words"""
        logger.warning("_group_chars_into_words is deprecated. Access via self._element_mgr.words.")
-        return self._element_mgr.words
+        return self._element_mgr.words  # Delegate

    def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
        """DEPRECATED: Handled by ElementManager"""
        logger.warning("_process_line_into_words is deprecated. Handled by ElementManager.")
        pass
-
+
    def _check_font_attributes_match(self, char, prev_char, font_attrs):
        """DEPRECATED: Handled by ElementManager"""
        logger.warning("_check_font_attributes_match is deprecated. Handled by ElementManager.")
        pass
-
+
    def _create_word_element(self, chars, font_attrs):
        """DEPRECATED: Handled by ElementManager"""
        logger.warning("_create_word_element is deprecated. Handled by ElementManager.")
@@ -785,34 +1019,36 @@
    def chars(self) -> List[Any]:
        """Get all character elements on this page."""
        return self._element_mgr.chars
-
+
    @property
    def words(self) -> List[Any]:
        """Get all word elements on this page."""
        return self._element_mgr.words
-
+
    @property
    def rects(self) -> List[Any]:
        """Get all rectangle elements on this page."""
        return self._element_mgr.rects
-
+
    @property
    def lines(self) -> List[Any]:
        """Get all line elements on this page."""
        return self._element_mgr.lines
-
-    def highlight(
-
-
-
-
-
-
-
+
+
+    def highlight(
+        self,
+        bbox: Optional[Tuple[float, float, float, float]] = None,
+        color: Optional[Union[Tuple, str]] = None,
+        label: Optional[str] = None,
+        use_color_cycling: bool = False,
+        element: Optional[Any] = None,
+        include_attrs: Optional[List[str]] = None,
+        existing: str = "append",
+    ) -> "Page":
        """
        Highlight a bounding box or the entire page.
        Delegates to the central HighlightingService.
-
+
        Args:
            bbox: Bounding box (x0, top, x1, bottom). If None, highlight entire page.
            color: RGBA color tuple/string for the highlight.
@@ -834,23 +1070,24 @@
            use_color_cycling=use_color_cycling,
            element=element,
            include_attrs=include_attrs,
-            existing=existing
+            existing=existing,
        )
        return self

    def highlight_polygon(
-        self,
+        self,
        polygon: List[Tuple[float, float]],
-        color: Optional[Union[Tuple, str]] = None,
+        color: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False,
        element: Optional[Any] = None,
        include_attrs: Optional[List[str]] = None,
-        existing: str =
+        existing: str = "append",
+    ) -> "Page":
        """
        Highlight a polygon shape on the page.
        Delegates to the central HighlightingService.
-
+
        Args:
            polygon: List of (x, y) points defining the polygon.
            color: RGBA color tuple/string for the highlight.
@@ -871,51 +1108,55 @@
            use_color_cycling=use_color_cycling,
            element=element,
            include_attrs=include_attrs,
-            existing=existing
+            existing=existing,
        )
        return self
-
-    def show(
-
-
-
-
-
+
+    def show(
+        self,
+        scale: float = 2.0,
+        width: Optional[int] = None,
+        labels: bool = True,
+        legend_position: str = "right",
+        render_ocr: bool = False,
+    ) -> Optional[Image.Image]:
        """
        Generates and returns an image of the page with persistent highlights rendered.
-
+
        Args:
            scale: Scale factor for rendering.
            width: Optional width for the output image.
            labels: Whether to include a legend for labels.
            legend_position: Position of the legend.
            render_ocr: Whether to render OCR text.
-
+
        Returns:
            PIL Image object of the page with highlights, or None if rendering fails.
        """
        return self.to_image(
            scale=scale,
            width=width,
-            labels=labels,
-            legend_position=legend_position,
+            labels=labels,
+            legend_position=legend_position,
            render_ocr=render_ocr,
-            include_highlights=True
+            include_highlights=True,  # Ensure highlights are requested
        )
-
-    def save_image(
-
-
-
-
-
-
-
-
-
+
+    def save_image(
+        self,
+        filename: str,
+        scale: float = 2.0,
+        width: Optional[int] = None,
+        labels: bool = True,
+        legend_position: str = "right",
+        render_ocr: bool = False,
+        include_highlights: bool = True,  # Allow saving without highlights
+        resolution: Optional[float] = None,
+        **kwargs,
+    ) -> "Page":
        """
        Save the page image to a file, rendering highlights via HighlightingService.
-
+
        Args:
            filename: Path to save the image to.
            scale: Scale factor for rendering highlights.
@@ -926,7 +1167,7 @@ class Page:
             include_highlights: Whether to render highlights.
             resolution: Resolution for base image rendering.
             **kwargs: Additional args for pdfplumber's to_image.
-
+
         Returns:
             Self for method chaining.
         """
@@ -935,25 +1176,25 @@ class Page:
             path=filename,
             scale=scale,
             width=width,
-            labels=labels,
+            labels=labels,
             legend_position=legend_position,
             render_ocr=render_ocr,
             include_highlights=include_highlights,
             resolution=resolution,
-            **kwargs
+            **kwargs,
         )
         return self
-
-    def clear_highlights(self) ->
+
+    def clear_highlights(self) -> "Page":
         """
         Clear all highlights *from this specific page* via HighlightingService.
-
+
         Returns:
             Self for method chaining
         """
         self._highlighter.clear_page(self.index)
         return self
-
+
     def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
         """
         Analyze text elements by style, adding attributes directly to elements.
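`show()` and `save_image()` are thin wrappers around `to_image()`, and both return values that chain naturally. A short sketch based only on the signatures above; the output file name is a placeholder:

```python
# Render the page (with any persistent highlights) to a PIL Image.
img = page.show(scale=2.0, labels=True, legend_position="right")

# Save to disk, then clear this page's highlights in one chain.
page.save_image("page_highlights.png", include_highlights=True).clear_highlights()
```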
@@ -982,19 +1223,22 @@ class Page:
         # Return the collection of elements which now have style attributes
         return processed_elements_collection

-    def to_image(
-
-
-
-
-
-
-
-
-
+    def to_image(
+        self,
+        path: Optional[str] = None,
+        scale: float = 2.0,
+        width: Optional[int] = None,
+        labels: bool = True,
+        legend_position: str = "right",
+        render_ocr: bool = False,
+        resolution: Optional[float] = None,
+        include_highlights: bool = True,
+        exclusions: Optional[str] = None,  # New parameter
+        **kwargs,
+    ) -> Optional[Image.Image]:
         """
         Generate a PIL image of the page, using HighlightingService if needed.
-
+
         Args:
             path: Optional path to save the image to.
             scale: Scale factor for rendering highlights.
@@ -1004,50 +1248,104 @@ class Page:
             render_ocr: Whether to render OCR text on highlights.
             resolution: Resolution in DPI for base page image (default: scale * 72).
             include_highlights: Whether to render highlights.
+            exclusions: If 'mask', excluded regions will be whited out on the image.
+                        (default: None).
             **kwargs: Additional parameters for pdfplumber.to_image.
-
+
         Returns:
             PIL Image of the page, or None if rendering fails.
         """
         image = None
+        render_resolution = resolution if resolution is not None else scale * 72
         try:
             if include_highlights:
                 # Delegate rendering to the central service
                 image = self._highlighter.render_page(
                     page_index=self.index,
-                    scale=scale,
+                    scale=scale,  # Note: scale is used by highlighter internally for drawing
                     labels=labels,
                     legend_position=legend_position,
                     render_ocr=render_ocr,
-                    resolution=resolution
-                    **kwargs
+                    resolution=render_resolution,  # Pass the calculated resolution
+                    **kwargs,
                 )
             else:
                 # Get the base page image directly from pdfplumber if no highlights needed
-                render_resolution = resolution if resolution is not None else scale * 72
                 # Use the underlying pdfplumber page object
                 img_object = self._page.to_image(resolution=render_resolution, **kwargs)
                 # Access the PIL image directly (assuming pdfplumber structure)
-                image =
-
-
-
-
+                image = (
+                    img_object.annotated
+                    if hasattr(img_object, "annotated")
+                    else img_object._repr_png_()
+                )
+                if isinstance(image, bytes):  # Handle cases where it returns bytes
+                    from io import BytesIO
+
+                    image = Image.open(BytesIO(image)).convert(
+                        "RGB"
+                    )  # Convert to RGB for consistency
+
         except Exception as e:
             logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
-            return None
+            return None  # Return None on error

-        if image is None:
+        if image is None:
+            return None
+
+        # --- Apply exclusion masking if requested ---
+        if exclusions == "mask" and self._exclusions:
+            try:
+                # Ensure image is mutable (RGB or RGBA)
+                if image.mode not in ("RGB", "RGBA"):
+                    image = image.convert("RGB")
+
+                exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
+                if exclusion_regions:
+                    draw = ImageDraw.Draw(image)
+                    # Calculate the scaling factor used for the image
+                    # Base image was rendered at render_resolution (DPI)
+                    # pdfplumber default is 72 DPI
+                    # Scale factor = (pixels / inch) / (points / inch) = DPI / 72
+                    img_scale = render_resolution / 72.0
+
+                    for region in exclusion_regions:
+                        # Convert PDF points (x0, top, x1, bottom) to image pixels
+                        img_x0 = region.x0 * img_scale
+                        img_top = region.top * img_scale
+                        img_x1 = region.x1 * img_scale
+                        img_bottom = region.bottom * img_scale
+
+                        # Draw a white rectangle over the excluded area
+                        # Ensure coordinates are within image bounds (though region should be)
+                        img_coords = (
+                            max(0, img_x0),
+                            max(0, img_top),
+                            min(image.width, img_x1),
+                            min(image.height, img_bottom)
+                        )
+                        if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
+                            draw.rectangle(img_coords, fill="white")
+                        else:
+                            logger.warning(f"Skipping invalid exclusion rect for masking: {img_coords}")
+
+                    del draw  # Release drawing context
+            except Exception as mask_error:
+                logger.error(f"Error applying exclusion mask to page {self.index}: {mask_error}", exc_info=True)
+                # Decide if you want to return None or continue without mask
+                # For now, continue without mask

         # Resize the final image if width is provided
         if width is not None and width > 0 and image.width > 0:
             aspect_ratio = image.height / image.width
             height = int(width * aspect_ratio)
             try:
-                image = image.resize(
+                image = image.resize(
+                    (width, height), Image.Resampling.LANCZOS
+                )  # Use modern resampling
             except Exception as resize_error:
-
-
+                logger.warning(f"Could not resize image: {resize_error}")
+
         # Save the image if path is provided
         if path:
             try:
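The new `exclusions="mask"` option whites out excluded regions in the rendered image. A sketch, assuming `add_exclusion()` accepts a region as in the exclusion tutorials; the 60-point header band is an arbitrary example:

```python
# Exclude a band at the top of the page, then render it with that area masked.
page.add_exclusion(page.create_region(0, 0, page.width, 60))
masked = page.to_image(resolution=150, include_highlights=False, exclusions="mask")
if masked:
    masked.save("masked_page.png")
```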
@@ -1056,15 +1354,21 @@ class Page:
                 image.save(path)
                 logger.debug(f"Saved page image to: {path}")
             except Exception as save_error:
-
-
+                logger.error(f"Failed to save image to {path}: {save_error}")
+
         return image
-
-    def _create_text_elements_from_ocr(
+
+    def _create_text_elements_from_ocr(
+        self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
+    ) -> List[TextElement]:
         """DEPRECATED: Use self._element_mgr.create_text_elements_from_ocr"""
-        logger.warning(
-
-
+        logger.warning(
+            "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
+        )
+        return self._element_mgr.create_text_elements_from_ocr(
+            ocr_results, image_width, image_height
+        )
+
     def apply_ocr(
         self,
         engine: Optional[str] = None,
@@ -1072,35 +1376,58 @@ class Page:
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
-
+        resolution: Optional[int] = None,
+        detect_only: bool = False,
+        apply_exclusions: bool = True,
+    ) -> "Page":
         """
         Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
-
+
+        Args:
+            engine: Name of the OCR engine.
+            options: Engine-specific options object or dict.
+            languages: List of engine-specific language codes.
+            min_confidence: Minimum confidence threshold.
+            device: Device to run OCR on.
+            resolution: DPI resolution for rendering page image before OCR.
+            apply_exclusions: If True (default), render page image for OCR
+                              with excluded areas masked (whited out).
+
         Returns:
             List of created TextElements derived from OCR results for this page.
         """
-        if not hasattr(self._parent,
-
-
+        if not hasattr(self._parent, "apply_ocr"):
+            logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
+            return []  # Return empty list for consistency

         logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
         try:
             # Delegate to parent PDF, targeting only this page's index
+            # Pass all relevant parameters through, including apply_exclusions
             self._parent.apply_ocr(
                 pages=[self.index],
-                engine=engine,
-
+                engine=engine,
+                options=options,
+                languages=languages,
+                min_confidence=min_confidence,
+                device=device,
+                resolution=resolution,
+                detect_only=detect_only,
+                apply_exclusions=apply_exclusions,
             )
         except Exception as e:
-
-
+            logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
+            return []

         # Return the OCR elements specifically added to this page
-
-
-
-
-
+        ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
+        logger.debug(
+            f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
+        )
+        # Note: The method is typed to return Page for chaining, but the log indicates
+        # finding elements. Let's stick to returning self for chaining consistency.
+        return self
+
     def extract_ocr_elements(
         self,
         engine: Optional[str] = None,
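`apply_ocr` now forwards `resolution`, `detect_only`, and `apply_exclusions` to the parent PDF and returns the page for chaining. A sketch; `"easyocr"` is assumed to be one of the configured OCR engines:

```python
page.apply_ocr(
    engine="easyocr",        # assumed engine name; any configured engine works
    languages=["en"],
    min_confidence=0.5,
    resolution=300,
    apply_exclusions=True,   # mask excluded areas before OCR
)
ocr_words = [w for w in page.words if getattr(w, "source", None) == "ocr"]
print(f"Added {len(ocr_words)} OCR words")
```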
@@ -1108,78 +1435,118 @@ class Page:
         languages: Optional[List[str]] = None,
         min_confidence: Optional[float] = None,
         device: Optional[str] = None,
+        resolution: Optional[int] = None,
     ) -> List[TextElement]:
         """
         Extract text elements using OCR *without* adding them to the page's elements.
         Uses the shared OCRManager instance.
+
+        Args:
+            engine: Name of the OCR engine.
+            options: Engine-specific options object or dict.
+            languages: List of engine-specific language codes.
+            min_confidence: Minimum confidence threshold.
+            device: Device to run OCR on.
+            resolution: DPI resolution for rendering page image before OCR.
+
+        Returns:
+            List of created TextElement objects derived from OCR results for this page.
         """
         if not self._ocr_manager:
-
-
-
+            logger.error(
+                f"Page {self.number}: OCRManager not available. Cannot extract OCR elements."
+            )
+            return []
+
         logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
+
+        # Determine rendering resolution
+        final_resolution = resolution if resolution is not None else 150  # Default to 150 DPI
+        logger.debug(f"  Using rendering resolution: {final_resolution} DPI")
+
         try:
-
-
-            image = self.to_image(scale=ocr_scale, include_highlights=False)
+            # Get base image without highlights using the determined resolution
+            image = self.to_image(resolution=final_resolution, include_highlights=False)
             if not image:
-
-
+                logger.error(f"  Failed to render page {self.number} to image for OCR extraction.")
+                return []
             logger.debug(f"  Rendered image size: {image.width}x{image.height}")
         except Exception as e:
             logger.error(f"  Failed to render page {self.number} to image: {e}", exc_info=True)
             return []
-
-
-
-
-
-
-
+
+        # Prepare arguments for the OCR Manager call
+        manager_args = {
+            "images": image,
+            "engine": engine,
+            "languages": languages,
+            "min_confidence": min_confidence,
+            "device": device,
+            "options": options
+        }
+        manager_args = {k: v for k, v in manager_args.items() if v is not None}
+
+        logger.debug(
+            f"  Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }"
+        )
         try:
             # apply_ocr now returns List[List[Dict]] or List[Dict]
             results_list = self._ocr_manager.apply_ocr(**manager_args)
             # If it returned a list of lists (batch mode), take the first list
-            results =
-
+            results = (
+                results_list[0]
+                if isinstance(results_list, list)
+                and results_list
+                and isinstance(results_list[0], list)
+                else results_list
+            )
             if not isinstance(results, list):
-
-
+                logger.error(f"  OCR Manager returned unexpected type: {type(results)}")
+                results = []
             logger.info(f"  OCR Manager returned {len(results)} results for extraction.")
         except Exception as e:
-
-
-
+            logger.error(f"  OCR processing failed during extraction: {e}", exc_info=True)
+            return []
+
         # Convert results but DO NOT add to ElementManager
         logger.debug(f"  Converting OCR results to TextElements (extract only)...")
-        # Use a temporary method to create elements without adding them globally
         temp_elements = []
         scale_x = self.width / image.width if image.width else 1
         scale_y = self.height / image.height if image.height else 1
         for result in results:
-
-
-
-
-
-
-
-
-
-
+            try:  # Added try-except around result processing
+                x0, top, x1, bottom = [float(c) for c in result["bbox"]]
+                elem_data = {
+                    "text": result["text"],
+                    "confidence": result["confidence"],
+                    "x0": x0 * scale_x,
+                    "top": top * scale_y,
+                    "x1": x1 * scale_x,
+                    "bottom": bottom * scale_y,
+                    "width": (x1 - x0) * scale_x,
+                    "height": (bottom - top) * scale_y,
+                    "object_type": "text",  # Using text for temporary elements
+                    "source": "ocr",
+                    "fontname": "OCR-extract",  # Different name for clarity
+                    "size": 10.0,
+                    "page_number": self.number,
+                }
+                temp_elements.append(TextElement(elem_data, self))
+            except (KeyError, ValueError, TypeError) as convert_err:
+                logger.warning(f"  Skipping invalid OCR result during conversion: {result}. Error: {convert_err}")

         logger.info(f"  Created {len(temp_elements)} TextElements from OCR (extract only).")
         return temp_elements
-
+
     @property
     def layout_analyzer(self) -> LayoutAnalyzer:
         """Get or create the layout analyzer for this page."""
-        if self._layout_analyzer is None:
-
-
-
-
-        return self._layout_analyzer
+        if self._layout_analyzer is None:
+            if not self._layout_manager:
+                logger.warning("LayoutManager not available, cannot create LayoutAnalyzer.")
+                return None
+            self._layout_analyzer = LayoutAnalyzer(self)
+        return self._layout_analyzer

     def analyze_layout(
         self,
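Unlike `apply_ocr`, `extract_ocr_elements` returns the recognized TextElements without registering them on the page, and it renders at 150 DPI unless `resolution` is passed. A minimal sketch, assuming the elements expose the `text` and `confidence` fields shown in the dict above:

```python
candidates = page.extract_ocr_elements(min_confidence=0.3, resolution=200)
for el in candidates[:5]:
    print(round(el.confidence, 2), el.text)
```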
@@ -1189,7 +1556,7 @@ class Page:
         classes: Optional[List[str]] = None,
         exclude_classes: Optional[List[str]] = None,
         device: Optional[str] = None,
-        existing: str = "replace"
+        existing: str = "replace",
     ) -> ElementCollection[Region]:
         """
         Analyze the page layout using the configured LayoutManager.
@@ -1200,8 +1567,10 @@ class Page:
         """
         analyzer = self.layout_analyzer
         if not analyzer:
-
-
+            logger.error(
+                "Layout analysis failed: LayoutAnalyzer not initialized (is LayoutManager available?)."
+            )
+            return ElementCollection([])  # Return empty collection

         # The analyzer's analyze_layout method already adds regions to the page
         # and its element manager. We just need to retrieve them.
@@ -1212,17 +1581,20 @@ class Page:
             classes=classes,
             exclude_classes=exclude_classes,
             device=device,
-            existing=existing
+            existing=existing,
         )

         # Retrieve the detected regions from the element manager
         # Filter regions based on source='detected' and potentially the model used if available
-        detected_regions = [
-
+        detected_regions = [
+            r
+            for r in self._element_mgr.regions
+            if r.source == "detected" and (not engine or getattr(r, "model", None) == engine)
+        ]

         return ElementCollection(detected_regions)

-    def clear_detected_layout_regions(self) ->
+    def clear_detected_layout_regions(self) -> "Page":
         """
         Removes all regions from this page that were added by layout analysis
         (i.e., regions where `source` attribute is 'detected').
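`analyze_layout` returns an ElementCollection of the regions it detected (source `'detected'`), and `clear_detected_layout_regions` removes them again. A sketch that uses whatever default engine the LayoutManager is configured with; the `region_type` attribute read here is an assumption about the Region API:

```python
regions = page.analyze_layout(existing="replace")
for region in regions:
    print(getattr(region, "region_type", "?"), region.bbox)

page.clear_detected_layout_regions()  # drop the source='detected' regions again
```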
@@ -1233,47 +1605,61 @@ class Page:
         Returns:
             Self for method chaining.
         """
-        if
-
-
-
+        if (
+            not hasattr(self._element_mgr, "regions")
+            or not hasattr(self._element_mgr, "_elements")
+            or "regions" not in self._element_mgr._elements
+        ):
+            logger.debug(
+                f"Page {self.index}: No regions found in ElementManager, nothing to clear."
+            )
+            self._regions["detected"] = []  # Ensure page's list is also clear
+            return self

         # Filter ElementManager's list to keep only non-detected regions
         original_count = len(self._element_mgr.regions)
-        self._element_mgr._elements[
+        self._element_mgr._elements["regions"] = [
+            r for r in self._element_mgr.regions if getattr(r, "source", None) != "detected"
+        ]
         new_count = len(self._element_mgr.regions)
         removed_count = original_count - new_count

         # Clear the page's specific list of detected regions
-        self._regions[
+        self._regions["detected"] = []

         logger.info(f"Page {self.index}: Cleared {removed_count} detected layout regions.")
         return self

-    def get_section_between(
+    def get_section_between(
+        self, start_element=None, end_element=None, boundary_inclusion="both"
+    ) -> Optional[Region]:  # Return Optional
         """
         Get a section between two elements on this page.
         """
         # Create a full-page region to operate within
         page_region = self.create_region(0, 0, self.width, self.height)
-
+
         # Delegate to the region's method
         try:
             return page_region.get_section_between(
                 start_element=start_element,
                 end_element=end_element,
-                boundary_inclusion=boundary_inclusion
+                boundary_inclusion=boundary_inclusion,
             )
         except Exception as e:
-
-
-
-
-
-
-
-
-
+            logger.error(
+                f"Error getting section between elements on page {self.index}: {e}", exc_info=True
+            )
+            return None
+
+    def get_sections(
+        self,
+        start_elements=None,
+        end_elements=None,
+        boundary_inclusion="both",
+        y_threshold=5.0,
+        bounding_box=None,
+    ) -> "ElementCollection[Region]":  # Updated type hint
         """
         Get sections of a page defined by start/end elements.
         Uses the page-level implementation.
@@ -1281,6 +1667,7 @@ class Page:
         Returns:
             An ElementCollection containing the found Region objects.
         """
+
         # Helper function to get bounds from bounding_box parameter
         def get_bounds():
             if bounding_box:
@@ -1289,130 +1676,180 @@ class Page:
                 return max(0, x0), max(0, top), min(self.width, x1), min(self.height, bottom)
             else:
                 return 0, 0, self.width, self.height
-
+
         regions = []
-
+
         # Handle cases where elements are provided as strings (selectors)
         if isinstance(start_elements, str):
-            start_elements = self.find_all(start_elements).elements
-        elif hasattr(start_elements,
-
-
+            start_elements = self.find_all(start_elements).elements  # Get list of elements
+        elif hasattr(start_elements, "elements"):  # Handle ElementCollection input
+            start_elements = start_elements.elements
+
         if isinstance(end_elements, str):
             end_elements = self.find_all(end_elements).elements
-        elif hasattr(end_elements,
-
+        elif hasattr(end_elements, "elements"):
+            end_elements = end_elements.elements

         # Ensure start_elements is a list
-        if start_elements is None:
-
+        if start_elements is None:
+            start_elements = []
+        if end_elements is None:
+            end_elements = []

-        valid_inclusions = [
+        valid_inclusions = ["start", "end", "both", "none"]
         if boundary_inclusion not in valid_inclusions:
             raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")
-
+
         if not start_elements:
             # Return an empty ElementCollection if no start elements
             return ElementCollection([])
-
+
         # Combine start and end elements with their type
         all_boundaries = []
-        for el in start_elements:
-
-
+        for el in start_elements:
+            all_boundaries.append((el, "start"))
+        for el in end_elements:
+            all_boundaries.append((el, "end"))
+
         # Sort all boundary elements primarily by top, then x0
         try:
-
+            all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
         except AttributeError as e:
-
-
+            logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
+            return ElementCollection([])  # Cannot proceed if elements lack position

         # Process sorted boundaries to find sections
         current_start_element = None
         active_section_started = False

         for element, element_type in all_boundaries:
-            if element_type ==
+            if element_type == "start":
                 # If we have an active section, this start implicitly ends it
                 if active_section_started:
-                    end_boundary_el = element
+                    end_boundary_el = element  # Use this start as the end boundary
                     # Determine region boundaries
-                    sec_top =
-
-
-
+                    sec_top = (
+                        current_start_element.top
+                        if boundary_inclusion in ["start", "both"]
+                        else current_start_element.bottom
+                    )
+                    sec_bottom = (
+                        end_boundary_el.top
+                        if boundary_inclusion not in ["end", "both"]
+                        else end_boundary_el.bottom
+                    )
+
+                    if sec_top < sec_bottom:  # Ensure valid region
                         x0, _, x1, _ = get_bounds()
                         region = self.create_region(x0, sec_top, x1, sec_bottom)
                         region.start_element = current_start_element
-                        region.end_element = end_boundary_el
-                        region.is_end_next_start = True
+                        region.end_element = end_boundary_el  # Mark the element that ended it
+                        region.is_end_next_start = True  # Mark how it ended
                         regions.append(region)
-                        active_section_started = False
-
+                        active_section_started = False  # Reset for the new start
+
                 # Set this as the potential start of the next section
                 current_start_element = element
                 active_section_started = True

-            elif element_type ==
+            elif element_type == "end" and active_section_started:
                 # We found an explicit end for the current section
                 end_boundary_el = element
-                sec_top =
-
-
-
+                sec_top = (
+                    current_start_element.top
+                    if boundary_inclusion in ["start", "both"]
+                    else current_start_element.bottom
+                )
+                sec_bottom = (
+                    end_boundary_el.bottom
+                    if boundary_inclusion in ["end", "both"]
+                    else end_boundary_el.top
+                )
+
+                if sec_top < sec_bottom:  # Ensure valid region
                     x0, _, x1, _ = get_bounds()
                     region = self.create_region(x0, sec_top, x1, sec_bottom)
                     region.start_element = current_start_element
                     region.end_element = end_boundary_el
                     region.is_end_next_start = False
                     regions.append(region)
-
+
                 # Reset: section ended explicitly
                 current_start_element = None
                 active_section_started = False
-
+
         # Handle the last section if it was started but never explicitly ended
         if active_section_started:
-            sec_top =
+            sec_top = (
+                current_start_element.top
+                if boundary_inclusion in ["start", "both"]
+                else current_start_element.bottom
+            )
            x0, _, x1, page_bottom = get_bounds()
            if sec_top < page_bottom:
-
-
-
-
-
-
+                region = self.create_region(x0, sec_top, x1, page_bottom)
+                region.start_element = current_start_element
+                region.end_element = None  # Ended by page end
+                region.is_end_next_start = False
+                regions.append(region)
+
         # Return the list wrapped in an ElementCollection
         return ElementCollection(regions)
-
+
     def __repr__(self) -> str:
         """String representation of the page."""
         return f"<Page number={self.number} index={self.index}>"
-
-    def ask(
+
+    def ask(
+        self,
+        question: str,
+        min_confidence: float = 0.1,
+        model: str = None,
+        debug: bool = False,
+        **kwargs,
+    ) -> Dict[str, Any]:
         """
         Ask a question about the page content using document QA.
         """
         try:
-
-
-
-
-
+            from natural_pdf.qa.document_qa import get_qa_engine
+
+            # Get or initialize QA engine with specified model
+            qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
+            # Ask the question using the QA engine
+            return qa_engine.ask_pdf_page(
+                self, question, min_confidence=min_confidence, debug=debug, **kwargs
+            )
         except ImportError:
-
-
+            logger.error(
+                "Question answering requires the 'natural_pdf.qa' module. Please install necessary dependencies."
+            )
+            return {
+                "answer": None,
+                "confidence": 0.0,
+                "found": False,
+                "page_num": self.number,
+                "source_elements": [],
+            }
         except Exception as e:
-
-
+            logger.error(f"Error during page.ask: {e}", exc_info=True)
+            return {
+                "answer": None,
+                "confidence": 0.0,
+                "found": False,
+                "page_num": self.number,
+                "source_elements": [],
+            }

-    def show_preview(
-
-
-
-
-
-
+    def show_preview(
+        self,
+        temporary_highlights: List[Dict],
+        scale: float = 2.0,
+        width: Optional[int] = None,
+        labels: bool = True,
+        legend_position: str = "right",
+        render_ocr: bool = False,
+    ) -> Optional[Image.Image]:
         """
         Generates and returns a non-stateful preview image containing only
         the provided temporary highlights.
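`get_sections` accepts selector strings for its boundary elements, and `ask` wraps document QA. A combined sketch; the selector and the question are placeholders, and the size-based selector follows the element-selection docs:

```python
sections = page.get_sections(
    start_elements="text[size>=14]",   # hypothetical heading selector
    boundary_inclusion="start",
)
for section in sections:
    print(section.extract_text()[:80])

result = page.ask("What is the report date?", min_confidence=0.2)
if result.get("found"):
    print(result["answer"], result["confidence"])
```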
@@ -1437,13 +1874,16 @@ class Page:
                 scale=scale,
                 labels=labels,
                 legend_position=legend_position,
-                render_ocr=render_ocr
+                render_ocr=render_ocr,
             )
         except AttributeError:
             logger.error(f"HighlightingService does not have the required 'render_preview' method.")
             return None
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Error calling highlighter.render_preview for page {self.index}: {e}",
+                exc_info=True,
+            )
             return None

         # Return the rendered image directly
@@ -1451,7 +1891,7 @@ class Page:

     @property
     def text_style_labels(self) -> List[str]:
-        """
+        """
         Get a sorted list of unique text style labels found on the page.

         Runs text style analysis with default options if it hasn't been run yet.
@@ -1461,52 +1901,66 @@ class Page:
             A sorted list of unique style label strings.
         """
         # Check if the summary attribute exists from a previous run
-        if not hasattr(self,
+        if not hasattr(self, "_text_styles_summary") or not self._text_styles_summary:
             # If not, run the analysis with default options
             logger.debug(f"Page {self.number}: Running default text style analysis to get labels.")
-            self.analyze_text_styles()
+            self.analyze_text_styles()  # Use default options

         # Extract labels from the summary dictionary
-        if hasattr(self,
+        if hasattr(self, "_text_styles_summary") and self._text_styles_summary:
             # The summary maps style_key -> {'label': ..., 'properties': ...}
-            labels = {style_info[
+            labels = {style_info["label"] for style_info in self._text_styles_summary.values()}
             return sorted(list(labels))
         else:
             # Fallback if summary wasn't created for some reason (e.g., no text elements)
-
-
+            logger.warning(f"Page {self.number}: Text style summary not found after analysis.")
+            return []

-    def viewer(
-
-
-
+    def viewer(
+        self,
+        # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
+        # include_element_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
+    ) -> Optional["SimpleInteractiveViewerWidget"]:  # Return type hint updated
         """
         Creates and returns an interactive ipywidget for exploring elements on this page.

         Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.

         Returns:
-            A SimpleInteractiveViewerWidget instance ready for display in Jupyter
+            A SimpleInteractiveViewerWidget instance ready for display in Jupyter,
+            or None if ipywidgets is not installed or widget creation fails.

         Raises:
-
+            # Optional: Could raise ImportError instead of returning None
+            # ImportError: If required dependencies (ipywidgets) are missing.
             ValueError: If image rendering or data preparation fails within from_page.
         """
-        #
+        # Check for availability using the imported flag and class variable
+        if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
+            logger.error(
+                "Interactive viewer requires optional dependencies ('ipywidgets'). "
+                "Install with `pip install natural-pdf[interactive]`"
+            )
+            # raise ImportError("ipywidgets not found.") # Option 1: Raise error
+            return None  # Option 2: Return None gracefully
+
+        # If we reach here, SimpleInteractiveViewerWidget should be the actual class
         try:
-
-
-
-
-
-
-
+            # Pass self (the Page object) to the factory method
+            return SimpleInteractiveViewerWidget.from_page(self)
+        except Exception as e:
+            # Catch potential errors during widget creation (e.g., image rendering)
+            logger.error(
+                f"Error creating viewer widget from page {self.number}: {e}", exc_info=True
+            )
+            # raise # Option 1: Re-raise error (might include ValueError from from_page)
+            return None  # Option 2: Return None on creation error

     # --- Indexable Protocol Methods ---
     def get_id(self) -> str:
         """Returns a unique identifier for the page (required by Indexable protocol)."""
         # Ensure path is safe for use in IDs (replace problematic chars)
-        safe_path = re.sub(r
+        safe_path = re.sub(r"[^a-zA-Z0-9_-]", "_", str(self.pdf.path))
         return f"pdf_{safe_path}_page_{self.page_number}"

     def get_metadata(self) -> Dict[str, Any]:
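`viewer()` returns None unless the optional `ipywidgets` dependency is installed. A sketch for a Jupyter cell (`display` is the IPython display helper):

```python
widget = page.viewer()  # needs: pip install "natural-pdf[interactive]"
if widget is not None:
    display(widget)
else:
    print("Install the interactive extra to use the viewer.")
```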
@@ -1517,21 +1971,90 @@ class Page:
             "page_number": self.page_number,
             "width": self.width,
             "height": self.height,
-            "content_hash": self.get_content_hash()
+            "content_hash": self.get_content_hash(),  # Include the hash
         }
         return metadata

-    def get_content(self) ->
+    def get_content(self) -> "Page":
         """
         Returns the primary content object (self) for indexing (required by Indexable protocol).
         SearchService implementations decide how to process this (e.g., call extract_text).
         """
-        return self
+        return self  # Return the Page object itself

     def get_content_hash(self) -> str:
         """Returns a SHA256 hash of the extracted text content (required by Indexable for sync)."""
         # Hash the extracted text (without exclusions for consistency)
         # Consider if exclusions should be part of the hash? For now, hash raw text.
         # Using extract_text directly might be slow if called repeatedly. Cache? TODO: Optimization
-        text_content = self.extract_text(
-
+        text_content = self.extract_text(
+            use_exclusions=False, preserve_whitespace=False
+        )  # Normalize whitespace?
+        return hashlib.sha256(text_content.encode("utf-8")).hexdigest()
+
+    # --- New Method: save_searchable ---
+    def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
+        """
+        Saves the PDF page with an OCR text layer, making content searchable.
+
+        Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
+
+        Note: OCR must have been applied to the pages beforehand
+              (e.g., pdf.apply_ocr()).
+
+        Args:
+            output_path: Path to save the searchable PDF.
+            dpi: Resolution for rendering and OCR overlay (default 300).
+            **kwargs: Additional keyword arguments passed to the exporter.
+        """
+        # Import moved here, assuming it's always available now
+        from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
+
+        # Convert pathlib.Path to string if necessary
+        output_path_str = str(output_path)
+
+        create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
+        logger.info(f"Searchable PDF saved to: {output_path_str}")
+
+    # --- Added correct_ocr method ---
+    def correct_ocr(
+        self,
+        correction_callback: Callable[[Any], Optional[str]],
+    ) -> "Page":  # Return self for chaining
+        """
+        Applies corrections to OCR-generated text elements on this page
+        using a user-provided callback function.
+
+        Finds text elements on this page whose 'source' attribute starts
+        with 'ocr' and calls the `correction_callback` for each, passing the
+        element itself.
+
+        The `correction_callback` should contain the logic to:
+        1. Determine if the element needs correction.
+        2. Perform the correction (e.g., call an LLM).
+        3. Return the new text (`str`) or `None`.
+
+        If the callback returns a string, the element's `.text` is updated.
+        Metadata updates (source, confidence, etc.) should happen within the callback.
+
+        Args:
+            correction_callback: A function accepting an element and returning
+                                 `Optional[str]` (new text or None).
+
+        Returns:
+            Self for method chaining.
+        """
+        logger.info(f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'")
+
+        # Find OCR elements specifically on this page
+        # Note: We typically want to correct even if the element falls in an excluded area
+        target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
+
+        # Delegate to the utility function
+        _apply_ocr_correction_to_elements(
+            elements=target_elements,  # Pass the ElementCollection directly
+            correction_callback=correction_callback,
+            caller_info=f"Page({self.number})",  # Pass caller info
+        )
+
+        return self  # Return self for chaining
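The two methods added at the end of this hunk pair naturally: OCR the page, post-correct the recognized text with a callback, then export a searchable PDF. A sketch; the correction logic is a deliberately trivial placeholder (a real callback might call an LLM), and the engine name and output path are assumptions:

```python
def fix_common_ocr_errors(element):
    # Return corrected text, or None to leave the element unchanged.
    text = element.text or ""
    cleaned = text.replace("|", "I").strip()
    return cleaned if cleaned != text else None

page.apply_ocr(engine="easyocr")             # assumed engine name
page.correct_ocr(fix_common_ocr_errors)
page.save_searchable("searchable_page.pdf")  # needs natural-pdf[ocr-save]
```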