natural-pdf 0.2.1.dev0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
+ """
2
+ PageGroupBy class for grouping pages by selector text or callable results.
3
+ """
4
+
5
+ from collections import defaultdict
6
+ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
7
+
8
+ from tqdm.auto import tqdm
9
+
10
+ if TYPE_CHECKING:
11
+ from natural_pdf.core.page import Page
12
+ from natural_pdf.core.page_collection import PageCollection
13
+
14
+
15
class PageGroupBy:
    """
    A groupby object for PageCollections that supports both iteration and dict-like access.

    This class provides pandas-like groupby functionality for natural-pdf PageCollections.
    Pages are grouped by the result of applying a selector string or callable function.
    Grouping is computed lazily on first access and cached on the instance.

    Supports:
    - Direct iteration: for key, pages in grouped:
    - Dict-like access: grouped.get(key), grouped.get_group(key), key in grouped
    - Positional or keyed indexing: grouped[0], grouped['CITY OF MADISON']
    - Batch operations: grouped.apply(func)
    """

    def __init__(
        self,
        page_collection: "PageCollection",
        by: Union[str, Callable],
        *,
        show_progress: bool = True,
    ):
        """
        Initialize the PageGroupBy object.

        No grouping work happens here; groups are built on first use.

        Args:
            page_collection: The PageCollection to group
            by: CSS selector string or callable function for grouping
            show_progress: Whether to show progress bar during computation (default: True)
        """
        self.page_collection = page_collection
        self.by = by
        self.show_progress = show_progress
        # Cache of key -> PageCollection; None means "not computed yet".
        self._groups: Optional[Dict[Any, "PageCollection"]] = None

    def _compute_groups(self) -> Dict[Any, "PageCollection"]:
        """
        Compute the groups by applying the selector/callable to each page.

        The result is cached, so the selector/callable runs at most once per page.
        For a callable `by`, the key is whatever the callable returns. For a
        selector string, the key is the text of the element returned by
        `page.find(by)`, or None when that result is falsy.

        Returns:
            Dictionary mapping group keys to PageCollection objects, in order of
            first appearance of each key
        """
        if self._groups is not None:
            return self._groups

        groups = defaultdict(list)

        pages_iterator = self.page_collection.pages
        total_pages = len(self.page_collection)

        # A progress bar is only shown when there is more than one page.
        if self.show_progress and total_pages > 1:
            desc = f"Grouping by {'selector' if isinstance(self.by, str) else 'function'}"
            pages_iterator = tqdm(pages_iterator, desc=desc, unit="pages", total=total_pages)

        for page in pages_iterator:
            if callable(self.by):
                key = self.by(page)
            else:
                element = page.find(self.by)
                key = element.extract_text() if element else None

            groups[key].append(page)

        # Imported locally; the module only imports PageCollection under
        # TYPE_CHECKING (presumably to avoid a circular import - confirm).
        from natural_pdf.core.page_collection import PageCollection

        self._groups = {key: PageCollection(pages) for key, pages in groups.items()}

        return self._groups

    def __iter__(self) -> Iterator[Tuple[Any, "PageCollection"]]:
        """
        Support direct iteration: for key, pages in grouped:

        Yields:
            Tuples of (group_key, PageCollection)
        """
        return iter(self._compute_groups().items())

    def __contains__(self, key: Any) -> bool:
        """
        Support membership tests on group keys: `key in grouped`.

        Without this method Python would fall back to `__iter__`, which yields
        (key, PageCollection) tuples, so a bare key would never be found.

        Args:
            key: The group key to look for

        Returns:
            True if a group with this key exists, False otherwise
        """
        groups = self._compute_groups()
        try:
            return key in groups
        except TypeError:
            # Unhashable objects can never be dictionary keys.
            return False

    def get(
        self, key: Any, default: Optional["PageCollection"] = None
    ) -> Optional["PageCollection"]:
        """
        Dict-like access to get a specific group.

        Args:
            key: The group key to look up
            default: Value to return if key is not found

        Returns:
            PageCollection for the group, or default if not found
        """
        return self._compute_groups().get(key, default)

    def get_group(self, key: Any) -> "PageCollection":
        """
        Pandas-style access to get a specific group.

        Args:
            key: The group key to look up

        Returns:
            PageCollection for the group

        Raises:
            KeyError: If the group key is not found
        """
        groups = self._compute_groups()
        try:
            return groups[key]
        except KeyError:
            raise KeyError(f"Group key '{key}' not found") from None

    def keys(self) -> List[Any]:
        """
        Get all group keys.

        Returns:
            List of all group keys, in order of first appearance
        """
        return list(self._compute_groups())

    def __getitem__(self, index: Union[int, Any]) -> "PageCollection":
        """
        Access groups by index or key.

        Integers are always treated as positions in group order. If the group
        keys themselves are integers, use get_group() or get() to look them up.
        Booleans are treated as keys, not positions, so that grouping by a
        predicate and indexing with True/False returns the matching group.

        Args:
            index: Integer index (0-based, negative allowed) or group key

        Returns:
            PageCollection for the specified group

        Raises:
            IndexError: If an integer index is out of range
            KeyError: If a group key is not found

        Examples:
            grouped = pages.groupby('text[size=16]')

            # Access by index (useful for quick exploration)
            first_group = grouped[0]      # First group by order
            second_group = grouped[1]     # Second group
            last_group = grouped[-1]      # Last group

            # Access by key (same as .get_group())
            madison = grouped['CITY OF MADISON']
        """
        # bool is a subclass of int; exclude it so True/False stay group keys.
        if isinstance(index, int) and not isinstance(index, bool):
            groups = self._compute_groups()
            keys_list = list(groups)
            try:
                # Native list indexing already handles negative positions.
                return groups[keys_list[index]]
            except IndexError:
                raise IndexError(f"Group index {index} out of range") from None

        return self.get_group(index)

    def apply(self, func: Callable[["PageCollection"], Any]) -> Dict[Any, Any]:
        """
        Apply a function to each group.

        Args:
            func: Function to apply to each PageCollection group

        Returns:
            Dictionary mapping group keys to function results
        """
        return {key: func(pages) for key, pages in self._compute_groups().items()}

    def show(self, **kwargs):
        """
        Show each group separately with headers.

        Prints a header line per group and then calls that group's show().

        Args:
            **kwargs: Arguments passed to each group's show() method
        """
        for key, pages in self._compute_groups().items():
            print(f"\n--- Group: {key} ({len(pages)} pages) ---")
            pages.show(**kwargs)

    def __len__(self) -> int:
        """Return the number of groups."""
        return len(self._compute_groups())

    def info(self) -> None:
        """
        Print information about all groups.

        Useful for quick exploration of group structure. The bracketed number
        on each line is the position accepted by integer indexing.
        """
        groups = self._compute_groups()
        print(f"PageGroupBy with {len(groups)} groups:")
        print("-" * 40)

        for i, (key, pages) in enumerate(groups.items()):
            key_display = f"'{key}'" if key is not None else "None"
            print(f"[{i}] {key_display}: {len(pages)} pages")

    def __repr__(self) -> str:
        """String representation showing group count (computes groups if needed)."""
        return f"<PageGroupBy(groups={len(self._compute_groups())})>"
@@ -146,10 +146,11 @@ class Visualizable:
146
146
  legend_position: str = "right",
147
147
  annotate: Optional[Union[str, List[str]]] = None,
148
148
  # Layout options for multi-page/region
149
- layout: Literal["stack", "grid", "single"] = "stack",
149
+ layout: Optional[Literal["stack", "grid", "single"]] = None,
150
150
  stack_direction: Literal["vertical", "horizontal"] = "vertical",
151
151
  gap: int = 5,
152
- columns: Optional[int] = None, # For grid layout
152
+ columns: Optional[int] = 6, # For grid layout, defaults to 6 columns
153
+ limit: Optional[int] = 30, # Max pages to show (default 30)
153
154
  # Cropping options
154
155
  crop: Union[bool, Literal["content"]] = False,
155
156
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
@@ -169,10 +170,11 @@ class Visualizable:
169
170
  highlights: Additional highlight groups to show
170
171
  legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
171
172
  annotate: Attribute name(s) to display on highlights (string or list)
172
- layout: How to arrange multiple pages/regions
173
+ layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
173
174
  stack_direction: Direction for stack layout
174
175
  gap: Pixels between stacked images
175
- columns: Number of columns for grid layout
176
+ columns: Number of columns for grid layout (defaults to 6)
177
+ limit: Maximum number of pages to display (default 30, None for all)
176
178
  crop: Whether to crop (True, False, or 'content' for bbox of elements)
177
179
  crop_bbox: Explicit crop bounds
178
180
  **kwargs: Additional parameters passed to rendering
@@ -184,6 +186,10 @@ class Visualizable:
184
186
  if isinstance(annotate, str):
185
187
  annotate = [annotate]
186
188
 
189
+ # Pass limit as max_pages to _get_render_specs
190
+ if limit is not None:
191
+ kwargs["max_pages"] = limit
192
+
187
193
  specs = self._get_render_specs(
188
194
  mode="show",
189
195
  color=color,
@@ -198,6 +204,14 @@ class Visualizable:
198
204
  logger.warning(f"{self.__class__.__name__}.show() generated no render specs")
199
205
  return None
200
206
 
207
+ # Determine default layout based on content and parameters
208
+ if layout is None:
209
+ # For PDFs and multi-page collections, default to grid with 6 columns
210
+ if len(specs) > 1:
211
+ layout = "grid"
212
+ else:
213
+ layout = "single"
214
+
201
215
  highlighter = self._get_highlighter()
202
216
  return highlighter.unified_render(
203
217
  specs=specs,
@@ -260,7 +260,7 @@ class DirectionalMixin:
260
260
 
261
261
  Args:
262
262
  height: Height of the region above, in points
263
- width: Width mode - "full" for full page width or "element" for element width
263
+ width: Width mode - "full" (default) for full page width or "element" for element width
264
264
  include_source: Whether to include this element/region in the result (default: False)
265
265
  until: Optional selector string to specify an upper boundary element
266
266
  include_endpoint: Whether to include the boundary element in the region (default: True)
@@ -268,6 +268,18 @@ class DirectionalMixin:
268
268
 
269
269
  Returns:
270
270
  Region object representing the area above
271
+
272
+ Examples:
273
+ ```python
274
+ # Default: full page width
275
+ signature.above() # Gets everything above across full page width
276
+
277
+ # Match element width
278
+ signature.above(width='element') # Gets region above matching signature width
279
+
280
+ # Stop at specific element
281
+ signature.above(until='text:contains("Date")') # Region from date to signature
282
+ ```
271
283
  """
272
284
  return self._direction(
273
285
  direction="above",
@@ -293,7 +305,7 @@ class DirectionalMixin:
293
305
 
294
306
  Args:
295
307
  height: Height of the region below, in points
296
- width: Width mode - "full" for full page width or "element" for element width
308
+ width: Width mode - "full" (default) for full page width or "element" for element width
297
309
  include_source: Whether to include this element/region in the result (default: False)
298
310
  until: Optional selector string to specify a lower boundary element
299
311
  include_endpoint: Whether to include the boundary element in the region (default: True)
@@ -301,6 +313,18 @@ class DirectionalMixin:
301
313
 
302
314
  Returns:
303
315
  Region object representing the area below
316
+
317
+ Examples:
318
+ ```python
319
+ # Default: full page width
320
+ header.below() # Gets everything below across full page width
321
+
322
+ # Match element width
323
+ header.below(width='element') # Gets region below matching header width
324
+
325
+ # Limited height
326
+ header.below(height=200) # Gets 200pt tall region below header
327
+ ```
304
328
  """
305
329
  return self._direction(
306
330
  direction="below",
@@ -315,7 +339,7 @@ class DirectionalMixin:
315
339
  def left(
316
340
  self,
317
341
  width: Optional[float] = None,
318
- height: str = "full",
342
+ height: str = "element",
319
343
  include_source: bool = False,
320
344
  until: Optional[str] = None,
321
345
  include_endpoint: bool = True,
@@ -326,7 +350,7 @@ class DirectionalMixin:
326
350
 
327
351
  Args:
328
352
  width: Width of the region to the left, in points
329
- height: Height mode - "full" for full page height or "element" for element height
353
+ height: Height mode - "element" (default) for element height or "full" for full page height
330
354
  include_source: Whether to include this element/region in the result (default: False)
331
355
  until: Optional selector string to specify a left boundary element
332
356
  include_endpoint: Whether to include the boundary element in the region (default: True)
@@ -334,6 +358,18 @@ class DirectionalMixin:
334
358
 
335
359
  Returns:
336
360
  Region object representing the area to the left
361
+
362
+ Examples:
363
+ ```python
364
+ # Default: matches element height
365
+ table.left() # Gets region to the left at same height as table
366
+
367
+ # Full page height
368
+ table.left(height='full') # Gets entire left side of page
369
+
370
+ # Custom height
371
+ table.left(height=100) # Gets 100pt tall region to the left
372
+ ```
337
373
  """
338
374
  return self._direction(
339
375
  direction="left",
@@ -348,7 +384,7 @@ class DirectionalMixin:
348
384
  def right(
349
385
  self,
350
386
  width: Optional[float] = None,
351
- height: str = "full",
387
+ height: str = "element",
352
388
  include_source: bool = False,
353
389
  until: Optional[str] = None,
354
390
  include_endpoint: bool = True,
@@ -359,7 +395,7 @@ class DirectionalMixin:
359
395
 
360
396
  Args:
361
397
  width: Width of the region to the right, in points
362
- height: Height mode - "full" for full page height or "element" for element height
398
+ height: Height mode - "element" (default) for element height or "full" for full page height
363
399
  include_source: Whether to include this element/region in the result (default: False)
364
400
  until: Optional selector string to specify a right boundary element
365
401
  include_endpoint: Whether to include the boundary element in the region (default: True)
@@ -367,6 +403,18 @@ class DirectionalMixin:
367
403
 
368
404
  Returns:
369
405
  Region object representing the area to the right
406
+
407
+ Examples:
408
+ ```python
409
+ # Default: matches element height
410
+ label.right() # Gets region to the right at same height as label
411
+
412
+ # Full page height
413
+ label.right(height='full') # Gets entire right side of page
414
+
415
+ # Custom height
416
+ label.right(height=50) # Gets 50pt tall region to the right
417
+ ```
370
418
  """
371
419
  return self._direction(
372
420
  direction="right",
@@ -891,6 +891,7 @@ class ElementCollection(
891
891
  label_format: Optional[str] = None,
892
892
  annotate: Optional[List[str]] = None,
893
893
  bins: Optional[Union[int, List[float]]] = None,
894
+ **kwargs,
894
895
  ) -> List[Dict]:
895
896
  """
896
897
  Determines the parameters for highlighting each element based on the strategy.
@@ -960,7 +960,7 @@ class Region(
960
960
  right_content_col = min(width - 1, content_col_indices[-1] + padding)
961
961
 
962
962
  # Convert trimmed pixel coordinates back to PDF coordinates
963
- scale_factor = resolution / 72.0 # Scale factor used in to_image()
963
+ scale_factor = resolution / 72.0 # Scale factor used in render()
964
964
 
965
965
  # Calculate new PDF coordinates and ensure they are Python floats
966
966
  trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
@@ -3437,7 +3437,7 @@ class Region(
3437
3437
  r_idx = int(cell.metadata.get("row_index"))
3438
3438
  c_idx = int(cell.metadata.get("col_index"))
3439
3439
  text_val = cell.extract_text(
3440
- layout=False, apply_exclusions=False, content_filter=content_filter
3440
+ layout=False, apply_exclusions=True, content_filter=content_filter
3441
3441
  ).strip()
3442
3442
  table_grid[r_idx][c_idx] = text_val if text_val else None
3443
3443
  except Exception as _err:
@@ -215,6 +215,11 @@ class TextElement(Element):
215
215
  if isinstance(color, (int, float)):
216
216
  return (color, color, color)
217
217
 
218
+ # If it's a single-value tuple (grayscale), treat as grayscale
219
+ if isinstance(color, tuple) and len(color) == 1:
220
+ gray = color[0]
221
+ return (gray, gray, gray)
222
+
218
223
  # If it's a tuple of 3 values, treat as RGB
219
224
  if isinstance(color, tuple) and len(color) == 3:
220
225
  return color
@@ -119,17 +119,11 @@ class StructuredDataManager:
119
119
  )
120
120
  messages = self._prepare_llm_messages(content, prompt, using, schema)
121
121
 
122
- try:
123
- logger.debug(f"Extracting with model '{selected_model}'")
124
- completion = client.beta.chat.completions.parse(
125
- model=selected_model, messages=messages, response_format=schema, **kwargs
126
- )
127
- parsed_data = completion.choices[0].message.parsed
128
- return StructuredDataResult(
129
- data=parsed_data, success=True, error_message=None, model_used=selected_model
130
- )
131
- except Exception as e:
132
- logger.error(f"Extraction failed: {str(e)}")
133
- return StructuredDataResult(
134
- data=None, success=False, error_message=str(e), model_used=selected_model
135
- )
122
+ logger.debug(f"Extracting with model '{selected_model}'")
123
+ completion = client.beta.chat.completions.parse(
124
+ model=selected_model, messages=messages, response_format=schema, **kwargs
125
+ )
126
+ parsed_data = completion.choices[0].message.parsed
127
+ return StructuredDataResult(
128
+ data=parsed_data, success=True, error_message=None, model_used=selected_model
129
+ )
@@ -35,7 +35,7 @@ class ExtractionMixin(ABC):
35
35
 
36
36
  Host class requirements:
37
37
  - Must implement extract_text(**kwargs) -> str
38
- - Must implement to_image(**kwargs) -> PIL.Image
38
+ - Must implement render(**kwargs) -> PIL.Image
39
39
  - Must have access to StructuredDataManager (usually via parent PDF)
40
40
 
41
41
  Example:
@@ -72,25 +72,24 @@ class ExtractionMixin(ABC):
72
72
 
73
73
  Args:
74
74
  using: 'text' or 'vision'
75
- **kwargs: Additional arguments passed to extract_text or to_image
75
+ **kwargs: Additional arguments passed to extract_text or render
76
76
 
77
77
  Returns:
78
78
  str: Extracted text if using='text'
79
79
  PIL.Image.Image: Rendered image if using='vision'
80
80
  None: If content cannot be retrieved
81
81
  """
82
- if not hasattr(self, "extract_text") or not callable(self.extract_text):
83
- logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
84
- return None
85
- if not hasattr(self, "to_image") or not callable(self.to_image):
86
- logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
87
- return None
88
-
89
82
  try:
90
83
  if using == "text":
84
+ if not hasattr(self, "extract_text") or not callable(self.extract_text):
85
+ logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
86
+ return None
91
87
  layout = kwargs.pop("layout", True)
92
88
  return self.extract_text(layout=layout, **kwargs)
93
89
  elif using == "vision":
90
+ if not hasattr(self, "render") or not callable(self.render):
91
+ logger.error(f"ExtractionMixin requires 'render' method on {self!r}")
92
+ return None
94
93
  resolution = kwargs.pop("resolution", 72)
95
94
  include_highlights = kwargs.pop("include_highlights", False)
96
95
  labels = kwargs.pop("labels", False)
@@ -102,8 +101,13 @@ class ExtractionMixin(ABC):
102
101
  logger.error(f"Unsupported value for 'using': {using}")
103
102
  return None
104
103
  except Exception as e:
105
- logger.error(f"Error getting {using} content from {self!r}: {e}")
106
- return None
104
+ import warnings
105
+
106
+ warnings.warn(
107
+ f"Error getting {using} content from {self!r}: {e}",
108
+ RuntimeWarning,
109
+ )
110
+ raise
107
111
 
108
112
  def extract(
109
113
  self: Any,
@@ -275,10 +279,7 @@ class ExtractionMixin(ABC):
275
279
  raise RuntimeError("StructuredDataManager is not available")
276
280
 
277
281
  # Get content
278
- layout_for_text = kwargs.pop("layout", True)
279
- content = self._get_extraction_content(
280
- using=using, layout=layout_for_text, **kwargs
281
- ) # Pass kwargs
282
+ content = self._get_extraction_content(using=using, **kwargs) # Pass kwargs
282
283
 
283
284
  if content is None or (
284
285
  using == "text" and isinstance(content, str) and not content.strip()
@@ -359,10 +360,11 @@ class ExtractionMixin(ABC):
359
360
  )
360
361
 
361
362
  if not result.success:
362
- raise ValueError(
363
- f"Stored result for '{target_key}' indicates a failed extraction attempt. "
364
- f"Error: {result.error_message}"
363
+ # Return None for failed extractions to allow batch processing to continue
364
+ logger.warning(
365
+ f"Extraction '{target_key}' failed: {result.error_message}. Returning None."
365
366
  )
367
+ return None
366
368
 
367
369
  if result.data is None:
368
370
  # This case might occur if success=True but data is somehow None
@@ -591,16 +593,28 @@ class ExtractionMixin(ABC):
591
593
  raise RuntimeError("StructuredDataManager is not available")
592
594
 
593
595
  # Content preparation
594
- layout_for_text = kwargs.pop("layout", True)
595
- content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs)
596
+ content = self._get_extraction_content(using=using, **kwargs)
597
+
598
+ import warnings
596
599
 
597
600
  if content is None or (
598
601
  using == "text" and isinstance(content, str) and not content.strip()
599
602
  ):
603
+ preview = None
604
+ if isinstance(content, str):
605
+ preview = content[:120]
606
+ msg = (
607
+ f"No content available for extraction (using='{using}'). "
608
+ "Ensure the page has a text layer or render() returns an image. "
609
+ "For scanned PDFs run apply_ocr() or switch to using='vision'. "
610
+ f"Content preview: {preview!r}"
611
+ )
612
+ warnings.warn(msg, RuntimeWarning)
613
+
600
614
  result = StructuredDataResult(
601
615
  data=None,
602
616
  success=False,
603
- error_message=f"No content available for extraction (using='{using}')",
617
+ error_message=msg,
604
618
  model_used=model,
605
619
  )
606
620
  else:
@@ -721,8 +721,8 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
721
721
  # Start with a base name, modify for specifics like :not
722
722
  filter_name = f"pseudo-class :{name}"
723
723
 
724
- # Relational pseudo-classes are handled separately by the caller
725
- if name in ("above", "below", "near", "left-of", "right-of"):
724
+ # Relational pseudo-classes and collection-level pseudo-classes are handled separately by the caller
725
+ if name in ("above", "below", "near", "left-of", "right-of", "first", "last"):
726
726
  continue
727
727
 
728
728
  # --- Handle :not() ---
@@ -44,6 +44,7 @@ class TableResult(Sequence):
44
44
  header: Union[str, int, List[int], None] = "first",
45
45
  index_col=None,
46
46
  skip_repeating_headers=None,
47
+ keep_blank: bool = False,
47
48
  **kwargs,
48
49
  ):
49
50
  """Convert to *pandas* DataFrame.
@@ -52,11 +53,22 @@ class TableResult(Sequence):
52
53
  ----------
53
54
  header : "first" | int | list[int] | None, default "first"
54
55
  • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
56
+
57
+ Note: If the header row has a different number of columns than the
58
+ body rows, the method will automatically fall back to header=None
59
+ to prevent pandas errors. This commonly occurs when headers are
60
+ merged into a single cell during PDF extraction.
61
+
55
62
  index_col : same semantics as pandas, forwarded.
56
63
  skip_repeating_headers : bool, optional
57
64
  Whether to remove body rows that exactly match the header row(s).
58
65
  Defaults to True when header is truthy, False otherwise.
59
66
  Useful for PDFs where headers repeat throughout the table body.
67
+ keep_blank : bool, default False
68
+ Whether to preserve empty strings ('') as-is in the DataFrame.
69
+ When False (default), empty cells become pd.NA for better pandas integration
70
+ with numerical operations and missing data functions (.dropna(), .fillna(), etc.).
71
+ When True, empty strings are preserved as empty strings.
60
72
  **kwargs : forwarded to :pyclass:`pandas.DataFrame`.
61
73
  """
62
74
  try:
@@ -112,7 +124,32 @@ class TableResult(Sequence):
112
124
  # Could add logging here if desired
113
125
  pass
114
126
 
127
+ # Check for header/body column count mismatch and fallback to no header
128
+ if hdr is not None and body:
129
+ # Get the maximum number of columns from all body rows
130
+ # This handles cases where some rows have different column counts
131
+ max_cols = max(len(row) for row in body) if body else 0
132
+
133
+ # Check if header matches the maximum column count
134
+ header_cols = 0
135
+ if isinstance(hdr, list) and not isinstance(hdr[0], list):
136
+ # Single header row
137
+ header_cols = len(hdr)
138
+ elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
139
+ # Multi-row header - check first header row
140
+ header_cols = len(hdr[0])
141
+
142
+ if header_cols != max_cols:
143
+ # Column count mismatch - fallback to no header
144
+ hdr = None
145
+ body = self._rows # Use all rows as body
146
+
115
147
  df = pd.DataFrame(body, columns=hdr)
148
+
149
+ # Convert empty strings to NaN by default
150
+ if not keep_blank:
151
+ df = df.replace("", pd.NA)
152
+
116
153
  if index_col is not None and not df.empty:
117
154
  df.set_index(
118
155
  df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True