natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +159 -3
- natural_pdf/core/highlighting_service.py +8 -8
- natural_pdf/core/page.py +135 -4
- natural_pdf/core/page_collection.py +37 -0
- natural_pdf/core/page_groupby.py +229 -0
- natural_pdf/core/render_spec.py +18 -4
- natural_pdf/elements/base.py +54 -6
- natural_pdf/elements/element_collection.py +1 -0
- natural_pdf/elements/region.py +2 -2
- natural_pdf/elements/text.py +5 -0
- natural_pdf/extraction/manager.py +8 -14
- natural_pdf/extraction/mixin.py +35 -21
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +37 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/METADATA +2 -2
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/RECORD +22 -21
- optimization/performance_analysis.py +1 -1
- tools/bad_pdf_eval/analyser.py +1 -1
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.1.dev0.dist-info → natural_pdf-0.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,229 @@
|
|
1
|
+
"""
|
2
|
+
PageGroupBy class for grouping pages by selector text or callable results.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from collections import defaultdict
|
6
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
|
7
|
+
|
8
|
+
from tqdm.auto import tqdm
|
9
|
+
|
10
|
+
if TYPE_CHECKING:
|
11
|
+
from natural_pdf.core.page import Page
|
12
|
+
from natural_pdf.core.page_collection import PageCollection
|
13
|
+
|
14
|
+
|
15
|
+
class PageGroupBy:
    """
    A groupby object for PageCollections that supports both iteration and dict-like access.

    This class provides pandas-like groupby functionality for natural-pdf PageCollections.
    Pages are grouped by the result of applying a selector string or callable function.

    Groups are computed lazily on first access and then cached on the instance.
    Group order is the order in which each key was first encountered while
    walking the pages.

    Supports:
    - Direct iteration: for key, pages in grouped:
    - Dict-like access: grouped.get(key), grouped.get_group(key)
    - Positional or key access: grouped[0], grouped['some key']
    - Batch operations: grouped.apply(func)
    """

    def __init__(
        self,
        page_collection: "PageCollection",
        by: Union[str, Callable],
        *,
        show_progress: bool = True,
    ):
        """
        Initialize the PageGroupBy object.

        No grouping work happens here; it is deferred until the groups are
        first needed.

        Args:
            page_collection: The PageCollection to group
            by: CSS selector string or callable function for grouping
            show_progress: Whether to show progress bar during computation (default: True)
        """
        self.page_collection = page_collection
        self.by = by
        self.show_progress = show_progress
        # Cache of key -> PageCollection; None means "not computed yet".
        self._groups: Optional[Dict[Any, "PageCollection"]] = None

    def _compute_groups(self) -> Dict[Any, "PageCollection"]:
        """
        Compute the groups by applying the selector/callable to each page.

        The result is cached, so the pages are only walked once per instance.

        For a callable ``by`` the key is whatever the callable returns for the
        page (it must be hashable, since it is used as a dict key). For a
        selector string the key is the text of the element returned by
        ``page.find(selector)``, or ``None`` when nothing is found.

        Returns:
            Dictionary mapping group keys to PageCollection objects
        """
        if self._groups is not None:
            return self._groups

        groups = defaultdict(list)

        # Setup progress bar if enabled and collection is large enough
        pages_iterator = self.page_collection.pages
        total_pages = len(self.page_collection)

        if self.show_progress and total_pages > 1:  # Show progress for more than 1 page
            desc = f"Grouping by {'selector' if isinstance(self.by, str) else 'function'}"
            pages_iterator = tqdm(pages_iterator, desc=desc, unit="pages", total=total_pages)

        # The grouping mode does not change between pages, so decide it once.
        by_is_callable = callable(self.by)

        for page in pages_iterator:
            if by_is_callable:
                # Apply callable function
                key = self.by(page)
            else:
                # Apply selector string
                element = page.find(self.by)
                # NOTE(review): this relies on the truthiness of the found
                # element; confirm elements never evaluate as falsy.
                if element:
                    key = element.extract_text()
                else:
                    key = None

            groups[key].append(page)

        # Convert lists to PageCollections.
        # Imported locally rather than at module level (the module only imports
        # it under TYPE_CHECKING) - presumably to avoid a circular import.
        from natural_pdf.core.page_collection import PageCollection

        self._groups = {key: PageCollection(pages) for key, pages in groups.items()}

        return self._groups

    def __iter__(self) -> Iterator[Tuple[Any, "PageCollection"]]:
        """
        Support direct iteration: for key, pages in grouped:

        Yields:
            Tuples of (group_key, PageCollection)
        """
        return iter(self._compute_groups().items())

    def get(
        self, key: Any, default: Optional["PageCollection"] = None
    ) -> Optional["PageCollection"]:
        """
        Dict-like access to get a specific group.

        Args:
            key: The group key to look up
            default: Value to return if key is not found

        Returns:
            PageCollection for the group, or default if not found
        """
        return self._compute_groups().get(key, default)

    def get_group(self, key: Any) -> "PageCollection":
        """
        Pandas-style access to get a specific group.

        Unlike ``grouped[key]``, the key is never interpreted as a position,
        so this is the way to look up groups whose keys are integers.

        Args:
            key: The group key to look up

        Returns:
            PageCollection for the group

        Raises:
            KeyError: If the group key is not found
        """
        groups = self._compute_groups()
        if key not in groups:
            raise KeyError(f"Group key '{key}' not found")
        return groups[key]

    def keys(self) -> List[Any]:
        """
        Get all group keys.

        Returns:
            List of all group keys, in first-seen order
        """
        return list(self._compute_groups())

    def __getitem__(self, index: Union[int, Any]) -> "PageCollection":
        """
        Access groups by index or key.

        Plain integers are treated as positions (negative values count from
        the end). Every other value - including booleans - is treated as a
        group key. Booleans are excluded from positional access because
        ``bool`` is a subclass of ``int``: without the exclusion, grouping by a
        predicate and asking for ``grouped[True]`` would silently return
        whichever group happened to be second. Use :meth:`get_group` to look
        up groups whose keys are integers.

        Args:
            index: Integer index (0-based) or group key

        Returns:
            PageCollection for the specified group

        Raises:
            IndexError: If an integer index is out of range
            KeyError: If a group key is not found

        Examples:
            grouped = pages.groupby('text[size=16]')

            # Access by index (useful for quick exploration)
            first_group = grouped[0]      # First group by order
            second_group = grouped[1]     # Second group
            last_group = grouped[-1]      # Last group

            # Access by key (same as .get_group())
            madison = grouped['CITY OF MADISON']
        """
        is_position = isinstance(index, int) and not isinstance(index, bool)
        if not is_position:
            # Access by key (same as get_group)
            return self.get_group(index)

        groups = self._compute_groups()
        keys_list = list(groups)
        # Normalise negative positions but report the caller's original value.
        position = index + len(keys_list) if index < 0 else index
        if not (0 <= position < len(keys_list)):
            raise IndexError(f"Group index {index} out of range")
        return groups[keys_list[position]]

    def apply(self, func: Callable[["PageCollection"], Any]) -> Dict[Any, Any]:
        """
        Apply a function to each group.

        Args:
            func: Function to apply to each PageCollection group

        Returns:
            Dictionary mapping group keys to function results
        """
        groups = self._compute_groups()
        return {key: func(pages) for key, pages in groups.items()}

    def show(self, **kwargs):
        """
        Show each group separately with headers.

        A header line is printed for every group before its ``show()`` is called.

        Args:
            **kwargs: Arguments passed to each group's show() method
        """
        groups = self._compute_groups()
        for key, pages in groups.items():
            print(f"\n--- Group: {key} ({len(pages)} pages) ---")
            pages.show(**kwargs)

    def __len__(self) -> int:
        """Return the number of groups."""
        return len(self._compute_groups())

    def info(self) -> None:
        """
        Print information about all groups.

        Prints one line per group with its position, key and page count.
        Useful for quick exploration of group structure.
        """
        groups = self._compute_groups()
        print(f"PageGroupBy with {len(groups)} groups:")
        print("-" * 40)

        for i, (key, pages) in enumerate(groups.items()):
            # None is shown bare so it is distinguishable from the string 'None'.
            key_display = f"'{key}'" if key is not None else "None"
            print(f"[{i}] {key_display}: {len(pages)} pages")

    def __repr__(self) -> str:
        """String representation showing group count (computes the groups if needed)."""
        groups = self._compute_groups()
        return f"<PageGroupBy(groups={len(groups)})>"
|
natural_pdf/core/render_spec.py
CHANGED
@@ -146,10 +146,11 @@ class Visualizable:
|
|
146
146
|
legend_position: str = "right",
|
147
147
|
annotate: Optional[Union[str, List[str]]] = None,
|
148
148
|
# Layout options for multi-page/region
|
149
|
-
layout: Literal["stack", "grid", "single"] =
|
149
|
+
layout: Optional[Literal["stack", "grid", "single"]] = None,
|
150
150
|
stack_direction: Literal["vertical", "horizontal"] = "vertical",
|
151
151
|
gap: int = 5,
|
152
|
-
columns: Optional[int] =
|
152
|
+
columns: Optional[int] = 6, # For grid layout, defaults to 6 columns
|
153
|
+
limit: Optional[int] = 30, # Max pages to show (default 30)
|
153
154
|
# Cropping options
|
154
155
|
crop: Union[bool, Literal["content"]] = False,
|
155
156
|
crop_bbox: Optional[Tuple[float, float, float, float]] = None,
|
@@ -169,10 +170,11 @@ class Visualizable:
|
|
169
170
|
highlights: Additional highlight groups to show
|
170
171
|
legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
|
171
172
|
annotate: Attribute name(s) to display on highlights (string or list)
|
172
|
-
layout: How to arrange multiple pages/regions
|
173
|
+
layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
|
173
174
|
stack_direction: Direction for stack layout
|
174
175
|
gap: Pixels between stacked images
|
175
|
-
columns: Number of columns for grid layout
|
176
|
+
columns: Number of columns for grid layout (defaults to 6)
|
177
|
+
limit: Maximum number of pages to display (default 30, None for all)
|
176
178
|
crop: Whether to crop (True, False, or 'content' for bbox of elements)
|
177
179
|
crop_bbox: Explicit crop bounds
|
178
180
|
**kwargs: Additional parameters passed to rendering
|
@@ -184,6 +186,10 @@ class Visualizable:
|
|
184
186
|
if isinstance(annotate, str):
|
185
187
|
annotate = [annotate]
|
186
188
|
|
189
|
+
# Pass limit as max_pages to _get_render_specs
|
190
|
+
if limit is not None:
|
191
|
+
kwargs["max_pages"] = limit
|
192
|
+
|
187
193
|
specs = self._get_render_specs(
|
188
194
|
mode="show",
|
189
195
|
color=color,
|
@@ -198,6 +204,14 @@ class Visualizable:
|
|
198
204
|
logger.warning(f"{self.__class__.__name__}.show() generated no render specs")
|
199
205
|
return None
|
200
206
|
|
207
|
+
# Determine default layout based on content and parameters
|
208
|
+
if layout is None:
|
209
|
+
# For PDFs and multi-page collections, default to grid with 6 columns
|
210
|
+
if len(specs) > 1:
|
211
|
+
layout = "grid"
|
212
|
+
else:
|
213
|
+
layout = "single"
|
214
|
+
|
201
215
|
highlighter = self._get_highlighter()
|
202
216
|
return highlighter.unified_render(
|
203
217
|
specs=specs,
|
natural_pdf/elements/base.py
CHANGED
@@ -260,7 +260,7 @@ class DirectionalMixin:
|
|
260
260
|
|
261
261
|
Args:
|
262
262
|
height: Height of the region above, in points
|
263
|
-
width: Width mode - "full" for full page width or "element" for element width
|
263
|
+
width: Width mode - "full" (default) for full page width or "element" for element width
|
264
264
|
include_source: Whether to include this element/region in the result (default: False)
|
265
265
|
until: Optional selector string to specify an upper boundary element
|
266
266
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
@@ -268,6 +268,18 @@ class DirectionalMixin:
|
|
268
268
|
|
269
269
|
Returns:
|
270
270
|
Region object representing the area above
|
271
|
+
|
272
|
+
Examples:
|
273
|
+
```python
|
274
|
+
# Default: full page width
|
275
|
+
signature.above() # Gets everything above across full page width
|
276
|
+
|
277
|
+
# Match element width
|
278
|
+
signature.above(width='element') # Gets region above matching signature width
|
279
|
+
|
280
|
+
# Stop at specific element
|
281
|
+
signature.above(until='text:contains("Date")') # Region from date to signature
|
282
|
+
```
|
271
283
|
"""
|
272
284
|
return self._direction(
|
273
285
|
direction="above",
|
@@ -293,7 +305,7 @@ class DirectionalMixin:
|
|
293
305
|
|
294
306
|
Args:
|
295
307
|
height: Height of the region below, in points
|
296
|
-
width: Width mode - "full" for full page width or "element" for element width
|
308
|
+
width: Width mode - "full" (default) for full page width or "element" for element width
|
297
309
|
include_source: Whether to include this element/region in the result (default: False)
|
298
310
|
until: Optional selector string to specify a lower boundary element
|
299
311
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
@@ -301,6 +313,18 @@ class DirectionalMixin:
|
|
301
313
|
|
302
314
|
Returns:
|
303
315
|
Region object representing the area below
|
316
|
+
|
317
|
+
Examples:
|
318
|
+
```python
|
319
|
+
# Default: full page width
|
320
|
+
header.below() # Gets everything below across full page width
|
321
|
+
|
322
|
+
# Match element width
|
323
|
+
header.below(width='element') # Gets region below matching header width
|
324
|
+
|
325
|
+
# Limited height
|
326
|
+
header.below(height=200) # Gets 200pt tall region below header
|
327
|
+
```
|
304
328
|
"""
|
305
329
|
return self._direction(
|
306
330
|
direction="below",
|
@@ -315,7 +339,7 @@ class DirectionalMixin:
|
|
315
339
|
def left(
|
316
340
|
self,
|
317
341
|
width: Optional[float] = None,
|
318
|
-
height: str = "
|
342
|
+
height: str = "element",
|
319
343
|
include_source: bool = False,
|
320
344
|
until: Optional[str] = None,
|
321
345
|
include_endpoint: bool = True,
|
@@ -326,7 +350,7 @@ class DirectionalMixin:
|
|
326
350
|
|
327
351
|
Args:
|
328
352
|
width: Width of the region to the left, in points
|
329
|
-
height: Height mode - "
|
353
|
+
height: Height mode - "element" (default) for element height or "full" for full page height
|
330
354
|
include_source: Whether to include this element/region in the result (default: False)
|
331
355
|
until: Optional selector string to specify a left boundary element
|
332
356
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
@@ -334,6 +358,18 @@ class DirectionalMixin:
|
|
334
358
|
|
335
359
|
Returns:
|
336
360
|
Region object representing the area to the left
|
361
|
+
|
362
|
+
Examples:
|
363
|
+
```python
|
364
|
+
# Default: matches element height
|
365
|
+
table.left() # Gets region to the left at same height as table
|
366
|
+
|
367
|
+
# Full page height
|
368
|
+
table.left(height='full') # Gets entire left side of page
|
369
|
+
|
370
|
+
# Custom height
|
371
|
+
table.left(height=100) # Gets 100pt tall region to the left
|
372
|
+
```
|
337
373
|
"""
|
338
374
|
return self._direction(
|
339
375
|
direction="left",
|
@@ -348,7 +384,7 @@ class DirectionalMixin:
|
|
348
384
|
def right(
|
349
385
|
self,
|
350
386
|
width: Optional[float] = None,
|
351
|
-
height: str = "
|
387
|
+
height: str = "element",
|
352
388
|
include_source: bool = False,
|
353
389
|
until: Optional[str] = None,
|
354
390
|
include_endpoint: bool = True,
|
@@ -359,7 +395,7 @@ class DirectionalMixin:
|
|
359
395
|
|
360
396
|
Args:
|
361
397
|
width: Width of the region to the right, in points
|
362
|
-
height: Height mode - "
|
398
|
+
height: Height mode - "element" (default) for element height or "full" for full page height
|
363
399
|
include_source: Whether to include this element/region in the result (default: False)
|
364
400
|
until: Optional selector string to specify a right boundary element
|
365
401
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
@@ -367,6 +403,18 @@ class DirectionalMixin:
|
|
367
403
|
|
368
404
|
Returns:
|
369
405
|
Region object representing the area to the right
|
406
|
+
|
407
|
+
Examples:
|
408
|
+
```python
|
409
|
+
# Default: matches element height
|
410
|
+
label.right() # Gets region to the right at same height as label
|
411
|
+
|
412
|
+
# Full page height
|
413
|
+
label.right(height='full') # Gets entire right side of page
|
414
|
+
|
415
|
+
# Custom height
|
416
|
+
label.right(height=50) # Gets 50pt tall region to the right
|
417
|
+
```
|
370
418
|
"""
|
371
419
|
return self._direction(
|
372
420
|
direction="right",
|
@@ -891,6 +891,7 @@ class ElementCollection(
|
|
891
891
|
label_format: Optional[str] = None,
|
892
892
|
annotate: Optional[List[str]] = None,
|
893
893
|
bins: Optional[Union[int, List[float]]] = None,
|
894
|
+
**kwargs,
|
894
895
|
) -> List[Dict]:
|
895
896
|
"""
|
896
897
|
Determines the parameters for highlighting each element based on the strategy.
|
natural_pdf/elements/region.py
CHANGED
@@ -960,7 +960,7 @@ class Region(
|
|
960
960
|
right_content_col = min(width - 1, content_col_indices[-1] + padding)
|
961
961
|
|
962
962
|
# Convert trimmed pixel coordinates back to PDF coordinates
|
963
|
-
scale_factor = resolution / 72.0 # Scale factor used in
|
963
|
+
scale_factor = resolution / 72.0 # Scale factor used in render()
|
964
964
|
|
965
965
|
# Calculate new PDF coordinates and ensure they are Python floats
|
966
966
|
trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
|
@@ -3437,7 +3437,7 @@ class Region(
|
|
3437
3437
|
r_idx = int(cell.metadata.get("row_index"))
|
3438
3438
|
c_idx = int(cell.metadata.get("col_index"))
|
3439
3439
|
text_val = cell.extract_text(
|
3440
|
-
layout=False, apply_exclusions=
|
3440
|
+
layout=False, apply_exclusions=True, content_filter=content_filter
|
3441
3441
|
).strip()
|
3442
3442
|
table_grid[r_idx][c_idx] = text_val if text_val else None
|
3443
3443
|
except Exception as _err:
|
natural_pdf/elements/text.py
CHANGED
@@ -215,6 +215,11 @@ class TextElement(Element):
|
|
215
215
|
if isinstance(color, (int, float)):
|
216
216
|
return (color, color, color)
|
217
217
|
|
218
|
+
# If it's a single-value tuple (grayscale), treat as grayscale
|
219
|
+
if isinstance(color, tuple) and len(color) == 1:
|
220
|
+
gray = color[0]
|
221
|
+
return (gray, gray, gray)
|
222
|
+
|
218
223
|
# If it's a tuple of 3 values, treat as RGB
|
219
224
|
if isinstance(color, tuple) and len(color) == 3:
|
220
225
|
return color
|
@@ -119,17 +119,11 @@ class StructuredDataManager:
|
|
119
119
|
)
|
120
120
|
messages = self._prepare_llm_messages(content, prompt, using, schema)
|
121
121
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
)
|
131
|
-
except Exception as e:
|
132
|
-
logger.error(f"Extraction failed: {str(e)}")
|
133
|
-
return StructuredDataResult(
|
134
|
-
data=None, success=False, error_message=str(e), model_used=selected_model
|
135
|
-
)
|
122
|
+
logger.debug(f"Extracting with model '{selected_model}'")
|
123
|
+
completion = client.beta.chat.completions.parse(
|
124
|
+
model=selected_model, messages=messages, response_format=schema, **kwargs
|
125
|
+
)
|
126
|
+
parsed_data = completion.choices[0].message.parsed
|
127
|
+
return StructuredDataResult(
|
128
|
+
data=parsed_data, success=True, error_message=None, model_used=selected_model
|
129
|
+
)
|
natural_pdf/extraction/mixin.py
CHANGED
@@ -35,7 +35,7 @@ class ExtractionMixin(ABC):
|
|
35
35
|
|
36
36
|
Host class requirements:
|
37
37
|
- Must implement extract_text(**kwargs) -> str
|
38
|
-
- Must implement
|
38
|
+
- Must implement render(**kwargs) -> PIL.Image
|
39
39
|
- Must have access to StructuredDataManager (usually via parent PDF)
|
40
40
|
|
41
41
|
Example:
|
@@ -72,25 +72,24 @@ class ExtractionMixin(ABC):
|
|
72
72
|
|
73
73
|
Args:
|
74
74
|
using: 'text' or 'vision'
|
75
|
-
**kwargs: Additional arguments passed to extract_text or
|
75
|
+
**kwargs: Additional arguments passed to extract_text or render
|
76
76
|
|
77
77
|
Returns:
|
78
78
|
str: Extracted text if using='text'
|
79
79
|
PIL.Image.Image: Rendered image if using='vision'
|
80
80
|
None: If content cannot be retrieved
|
81
81
|
"""
|
82
|
-
if not hasattr(self, "extract_text") or not callable(self.extract_text):
|
83
|
-
logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
|
84
|
-
return None
|
85
|
-
if not hasattr(self, "to_image") or not callable(self.to_image):
|
86
|
-
logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
|
87
|
-
return None
|
88
|
-
|
89
82
|
try:
|
90
83
|
if using == "text":
|
84
|
+
if not hasattr(self, "extract_text") or not callable(self.extract_text):
|
85
|
+
logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
|
86
|
+
return None
|
91
87
|
layout = kwargs.pop("layout", True)
|
92
88
|
return self.extract_text(layout=layout, **kwargs)
|
93
89
|
elif using == "vision":
|
90
|
+
if not hasattr(self, "render") or not callable(self.render):
|
91
|
+
logger.error(f"ExtractionMixin requires 'render' method on {self!r}")
|
92
|
+
return None
|
94
93
|
resolution = kwargs.pop("resolution", 72)
|
95
94
|
include_highlights = kwargs.pop("include_highlights", False)
|
96
95
|
labels = kwargs.pop("labels", False)
|
@@ -102,8 +101,13 @@ class ExtractionMixin(ABC):
|
|
102
101
|
logger.error(f"Unsupported value for 'using': {using}")
|
103
102
|
return None
|
104
103
|
except Exception as e:
|
105
|
-
|
106
|
-
|
104
|
+
import warnings
|
105
|
+
|
106
|
+
warnings.warn(
|
107
|
+
f"Error getting {using} content from {self!r}: {e}",
|
108
|
+
RuntimeWarning,
|
109
|
+
)
|
110
|
+
raise
|
107
111
|
|
108
112
|
def extract(
|
109
113
|
self: Any,
|
@@ -275,10 +279,7 @@ class ExtractionMixin(ABC):
|
|
275
279
|
raise RuntimeError("StructuredDataManager is not available")
|
276
280
|
|
277
281
|
# Get content
|
278
|
-
|
279
|
-
content = self._get_extraction_content(
|
280
|
-
using=using, layout=layout_for_text, **kwargs
|
281
|
-
) # Pass kwargs
|
282
|
+
content = self._get_extraction_content(using=using, **kwargs) # Pass kwargs
|
282
283
|
|
283
284
|
if content is None or (
|
284
285
|
using == "text" and isinstance(content, str) and not content.strip()
|
@@ -359,10 +360,11 @@ class ExtractionMixin(ABC):
|
|
359
360
|
)
|
360
361
|
|
361
362
|
if not result.success:
|
362
|
-
|
363
|
-
|
364
|
-
f"
|
363
|
+
# Return None for failed extractions to allow batch processing to continue
|
364
|
+
logger.warning(
|
365
|
+
f"Extraction '{target_key}' failed: {result.error_message}. Returning None."
|
365
366
|
)
|
367
|
+
return None
|
366
368
|
|
367
369
|
if result.data is None:
|
368
370
|
# This case might occur if success=True but data is somehow None
|
@@ -591,16 +593,28 @@ class ExtractionMixin(ABC):
|
|
591
593
|
raise RuntimeError("StructuredDataManager is not available")
|
592
594
|
|
593
595
|
# Content preparation
|
594
|
-
|
595
|
-
|
596
|
+
content = self._get_extraction_content(using=using, **kwargs)
|
597
|
+
|
598
|
+
import warnings
|
596
599
|
|
597
600
|
if content is None or (
|
598
601
|
using == "text" and isinstance(content, str) and not content.strip()
|
599
602
|
):
|
603
|
+
preview = None
|
604
|
+
if isinstance(content, str):
|
605
|
+
preview = content[:120]
|
606
|
+
msg = (
|
607
|
+
f"No content available for extraction (using='{using}'). "
|
608
|
+
"Ensure the page has a text layer or render() returns an image. "
|
609
|
+
"For scanned PDFs run apply_ocr() or switch to using='vision'. "
|
610
|
+
f"Content preview: {preview!r}"
|
611
|
+
)
|
612
|
+
warnings.warn(msg, RuntimeWarning)
|
613
|
+
|
600
614
|
result = StructuredDataResult(
|
601
615
|
data=None,
|
602
616
|
success=False,
|
603
|
-
error_message=
|
617
|
+
error_message=msg,
|
604
618
|
model_used=model,
|
605
619
|
)
|
606
620
|
else:
|
natural_pdf/selectors/parser.py
CHANGED
@@ -721,8 +721,8 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
721
721
|
# Start with a base name, modify for specifics like :not
|
722
722
|
filter_name = f"pseudo-class :{name}"
|
723
723
|
|
724
|
-
# Relational pseudo-classes are handled separately by the caller
|
725
|
-
if name in ("above", "below", "near", "left-of", "right-of"):
|
724
|
+
# Relational pseudo-classes and collection-level pseudo-classes are handled separately by the caller
|
725
|
+
if name in ("above", "below", "near", "left-of", "right-of", "first", "last"):
|
726
726
|
continue
|
727
727
|
|
728
728
|
# --- Handle :not() ---
|
natural_pdf/tables/result.py
CHANGED
@@ -44,6 +44,7 @@ class TableResult(Sequence):
|
|
44
44
|
header: Union[str, int, List[int], None] = "first",
|
45
45
|
index_col=None,
|
46
46
|
skip_repeating_headers=None,
|
47
|
+
keep_blank: bool = False,
|
47
48
|
**kwargs,
|
48
49
|
):
|
49
50
|
"""Convert to *pandas* DataFrame.
|
@@ -52,11 +53,22 @@ class TableResult(Sequence):
|
|
52
53
|
----------
|
53
54
|
header : "first" | int | list[int] | None, default "first"
|
54
55
|
• "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
|
56
|
+
|
57
|
+
Note: If the header row has a different number of columns than the
|
58
|
+
body rows, the method will automatically fall back to header=None
|
59
|
+
to prevent pandas errors. This commonly occurs when headers are
|
60
|
+
merged into a single cell during PDF extraction.
|
61
|
+
|
55
62
|
index_col : same semantics as pandas, forwarded.
|
56
63
|
skip_repeating_headers : bool, optional
|
57
64
|
Whether to remove body rows that exactly match the header row(s).
|
58
65
|
Defaults to True when header is truthy, False otherwise.
|
59
66
|
Useful for PDFs where headers repeat throughout the table body.
|
67
|
+
keep_blank : bool, default False
|
68
|
+
Whether to preserve empty strings ('') as-is in the DataFrame.
|
69
|
+
When False (default), empty cells become pd.NA for better pandas integration
|
70
|
+
with numerical operations and missing data functions (.dropna(), .fillna(), etc.).
|
71
|
+
When True, empty strings are preserved as empty strings.
|
60
72
|
**kwargs : forwarded to :pyclass:`pandas.DataFrame`.
|
61
73
|
"""
|
62
74
|
try:
|
@@ -112,7 +124,32 @@ class TableResult(Sequence):
|
|
112
124
|
# Could add logging here if desired
|
113
125
|
pass
|
114
126
|
|
127
|
+
# Check for header/body column count mismatch and fallback to no header
|
128
|
+
if hdr is not None and body:
|
129
|
+
# Get the maximum number of columns from all body rows
|
130
|
+
# This handles cases where some rows have different column counts
|
131
|
+
max_cols = max(len(row) for row in body) if body else 0
|
132
|
+
|
133
|
+
# Check if header matches the maximum column count
|
134
|
+
header_cols = 0
|
135
|
+
if isinstance(hdr, list) and not isinstance(hdr[0], list):
|
136
|
+
# Single header row
|
137
|
+
header_cols = len(hdr)
|
138
|
+
elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
|
139
|
+
# Multi-row header - check first header row
|
140
|
+
header_cols = len(hdr[0])
|
141
|
+
|
142
|
+
if header_cols != max_cols:
|
143
|
+
# Column count mismatch - fallback to no header
|
144
|
+
hdr = None
|
145
|
+
body = self._rows # Use all rows as body
|
146
|
+
|
115
147
|
df = pd.DataFrame(body, columns=hdr)
|
148
|
+
|
149
|
+
# Convert empty strings to NaN by default
|
150
|
+
if not keep_blank:
|
151
|
+
df = df.replace("", pd.NA)
|
152
|
+
|
116
153
|
if index_col is not None and not df.empty:
|
117
154
|
df.set_index(
|
118
155
|
df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True
|