natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +45 -1
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/layout/yolo.py +2 -2
  9. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  10. natural_pdf/classification/manager.py +67 -0
  11. natural_pdf/core/element_manager.py +556 -25
  12. natural_pdf/core/highlighting_service.py +98 -43
  13. natural_pdf/core/page.py +86 -20
  14. natural_pdf/core/pdf.py +0 -2
  15. natural_pdf/describe/base.py +40 -9
  16. natural_pdf/describe/elements.py +11 -6
  17. natural_pdf/elements/base.py +134 -20
  18. natural_pdf/elements/collections.py +43 -11
  19. natural_pdf/elements/image.py +43 -0
  20. natural_pdf/elements/region.py +64 -19
  21. natural_pdf/elements/text.py +89 -11
  22. natural_pdf/flows/collections.py +4 -4
  23. natural_pdf/flows/region.py +17 -2
  24. natural_pdf/ocr/engine_paddle.py +1 -1
  25. natural_pdf/ocr/ocr_factory.py +8 -8
  26. natural_pdf/ocr/ocr_manager.py +51 -1
  27. natural_pdf/selectors/parser.py +27 -7
  28. natural_pdf/tables/__init__.py +5 -0
  29. natural_pdf/tables/result.py +101 -0
  30. natural_pdf/utils/bidi_mirror.py +36 -0
  31. natural_pdf/utils/visualization.py +15 -1
  32. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
  33. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
  34. natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
  35. optimization/memory_comparison.py +172 -0
  36. optimization/pdf_analyzer.py +410 -0
  37. optimization/performance_analysis.py +397 -0
  38. optimization/test_cleanup_methods.py +155 -0
  39. optimization/test_memory_fix.py +162 -0
  40. tools/bad_pdf_eval/__init__.py +1 -0
  41. tools/bad_pdf_eval/analyser.py +302 -0
  42. tools/bad_pdf_eval/collate_summaries.py +130 -0
  43. tools/bad_pdf_eval/eval_suite.py +116 -0
  44. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  45. tools/bad_pdf_eval/llm_enrich.py +273 -0
  46. tools/bad_pdf_eval/reporter.py +17 -0
  47. tools/bad_pdf_eval/utils.py +127 -0
  48. tools/rtl_smoke_test.py +80 -0
  49. natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
  50. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
  51. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
  52. {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
@@ -32,10 +32,34 @@ class TextElement(Element):
32
32
  obj["object_type"] = "text"
33
33
 
34
34
  super().__init__(obj, page)
35
- # Explicitly store constituent characters if provided
36
- # (Pop from obj to avoid storing it twice if super() stores _obj by ref)
35
+
36
+ # Memory optimization: Store character indices instead of full dictionaries
37
+ # This reduces memory usage by ~50% by avoiding character data duplication
38
+ self._char_indices = obj.pop("_char_indices", [])
39
+
40
+ # Backward compatibility: Keep _char_dicts for existing code
41
+ # But prefer _char_indices when available to save memory
37
42
  self._char_dicts = obj.pop("_char_dicts", [])
38
43
 
44
@property
def chars(self):
    """Return the character elements that make up this text element.

    When ``_char_indices`` is populated and the page exposes an
    ``_element_mgr``, the characters are looked up by position in the
    page-level ``'chars'`` element list, so no per-word copy of the
    character data is needed.  Otherwise the legacy ``_char_dicts`` are
    wrapped in fresh ``TextElement`` objects.

    Returns:
        A list of character elements; empty when neither indices nor
        character dictionaries are available.
    """
    indices = self._char_indices
    if indices and hasattr(self.page, "_element_mgr"):
        page_chars = self.page._element_mgr.get_elements("chars")
        total = len(page_chars)
        # Drop indices outside the list.  The lower bound matters: a negative
        # index would otherwise wrap around and silently return an unrelated
        # character from the end of the page.
        return [page_chars[i] for i in indices if 0 <= i < total]

    # Backward compatibility: build elements from the raw char dictionaries.
    if self._char_dicts:
        return [TextElement(char_dict, self.page) for char_dict in self._char_dicts]

    return []
62
+
39
63
  @property
40
64
  def text(self) -> str:
41
65
  """Get the text content."""
@@ -43,17 +67,22 @@ class TextElement(Element):
43
67
 
44
68
  @text.setter
45
69
  def text(self, value: str):
46
- """Set the text content and synchronise underlying char dictionaries (if any)."""
70
+ """Set the text content and synchronise underlying char dictionaries/indices (if any)."""
47
71
  # Update the primary text value stored on the object itself
48
72
  self._obj["text"] = value
49
73
 
50
- # --- Keep _char_dicts in sync so downstream utilities (e.g. extract_text)
51
- # that rely on the raw character dictionaries see the corrected text.
52
- # For OCR-generated words we usually have a single representative char
53
- # dict; for native words there may be one per character.
54
- # ---------------------------------------------------------------------
74
+ # --- Sync character data for both memory-efficient and legacy approaches
55
75
  try:
56
- if hasattr(self, "_char_dicts") and isinstance(self._char_dicts, list):
76
+ # If using memory-efficient character indices, update the referenced chars
77
+ if hasattr(self, "_char_indices") and self._char_indices:
78
+ if hasattr(self.page, '_element_mgr'):
79
+ char_elements = self.page._element_mgr.get_elements('chars')
80
+ for idx, char_idx in enumerate(self._char_indices):
81
+ if char_idx < len(char_elements) and idx < len(value):
82
+ char_elements[char_idx].text = value[idx]
83
+
84
+ # Legacy _char_dicts synchronization for backward compatibility
85
+ elif hasattr(self, "_char_dicts") and isinstance(self._char_dicts, list):
57
86
  if not self._char_dicts:
58
87
  return # Nothing to update
59
88
 
@@ -93,7 +122,7 @@ class TextElement(Element):
93
122
  # Keep failures silent but logged; better to have outdated chars than crash.
94
123
  import logging
95
124
  logger = logging.getLogger(__name__)
96
- logger.debug(f"TextElement: Failed to sync _char_dicts after text update: {sync_err}")
125
+ logger.debug(f"TextElement: Failed to sync char data after text update: {sync_err}")
97
126
 
98
127
  @property
99
128
  def source(self) -> str:
@@ -331,6 +360,45 @@ class TextElement(Element):
331
360
 
332
361
  return False
333
362
 
363
@property
def strike(self) -> bool:  # alias: struck
    """Report whether this element (word/char) carries a strikethrough mark.

    The flag is read from the raw object dict first (``"strike"`` key) and,
    failing that, from ``metadata["decoration"]["strike"]``.
    """
    if self._obj.get("strike"):
        return True
    decoration = self.metadata.get("decoration", {})
    return bool(decoration.get("strike"))
369
+
370
+ # Back-compat alias
371
@property
def struck(self) -> bool:  # noqa: D401
    """Backward-compatible alias that returns the value of ``strike``."""
    is_struck = self.strike
    return is_struck
374
+
375
+ # -----------------------------
376
+ # Underline decoration
377
+ # -----------------------------
378
+
379
@property
def underline(self) -> bool:
    """Report whether this element is underlined.

    Checks the raw object dict (``"underline"`` key) first, then
    ``metadata["decoration"]["underline"]``.
    """
    if self._obj.get("underline"):
        return True
    decoration = self.metadata.get("decoration", {})
    return bool(decoration.get("underline"))
383
+
384
+ # -----------------------------
385
+ # Highlight decoration
386
+ # -----------------------------
387
+
388
@property
def is_highlighted(self) -> bool:
    """Report whether this element (char/word) is flagged as highlighted.

    Any truthy value under the raw object's ``"highlight"`` or
    ``"is_highlighted"`` keys counts; otherwise
    ``metadata["decoration"]["highlight"]`` decides.
    """
    raw = self._obj
    if raw.get("highlight") or raw.get("is_highlighted"):
        return True
    decoration = self.metadata.get("decoration", {})
    return bool(decoration.get("highlight"))
396
+
397
@property
def highlight_color(self):
    """Return the stored highlight colour, if any.

    The raw object's ``"highlight_color"`` value wins when it is truthy;
    otherwise the value under ``metadata["decoration"]["highlight_color"]``
    is returned (``None`` when absent).
    """
    own_colour = self._obj.get("highlight_color")
    if own_colour:
        return own_colour
    decoration = self.metadata.get("decoration", {})
    return decoration.get("highlight_color")
401
+
334
402
  def __repr__(self) -> str:
335
403
  """String representation of the text element."""
336
404
  if self.text:
@@ -342,6 +410,12 @@ class TextElement(Element):
342
410
  font_style.append("bold")
343
411
  if self.italic:
344
412
  font_style.append("italic")
413
+ if self.strike:
414
+ font_style.append("strike")
415
+ if self.underline:
416
+ font_style.append("underline")
417
+ if self.is_highlighted:
418
+ font_style.append("highlight")
345
419
  style_str = f", style={font_style}" if font_style else ""
346
420
 
347
421
  # Use font_family for display but include raw fontname and variant
@@ -353,7 +427,11 @@ class TextElement(Element):
353
427
  base_font = self.fontname.split("+", 1)[1]
354
428
  font_display = f"{font_display} ({base_font})"
355
429
 
356
- return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str} bbox={self.bbox}>"
430
+ color_info = ""
431
+ if self.is_highlighted and self.highlight_color is not None:
432
+ color_info = f", highlight_color={self.highlight_color}"
433
+
434
+ return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str}{color_info} bbox={self.bbox}>"
357
435
 
358
436
  def font_info(self) -> dict:
359
437
  """
@@ -164,7 +164,7 @@ class FlowElementCollection(MutableSequence[T_FEC]):
164
164
 
165
165
  def show(
166
166
  self,
167
- scale: float = 2.0,
167
+ resolution: Optional[float] = None,
168
168
  labels: bool = True,
169
169
  legend_position: str = "right",
170
170
  default_color: Optional[Union[Tuple, str]] = "orange", # A distinct color for FEC show
@@ -273,7 +273,7 @@ class FlowElementCollection(MutableSequence[T_FEC]):
273
273
  else getattr(page_obj, "page_number", 1) - 1
274
274
  ),
275
275
  temporary_highlights=temp_highlights_for_page,
276
- scale=scale,
276
+ resolution=resolution,
277
277
  width=width,
278
278
  labels=labels,
279
279
  legend_position=legend_position,
@@ -480,7 +480,7 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
480
480
 
481
481
  def show(
482
482
  self,
483
- scale: float = 2.0,
483
+ resolution: Optional[float] = None,
484
484
  labels: bool = True,
485
485
  legend_position: str = "right",
486
486
  default_color: Optional[Union[Tuple, str]] = "darkviolet", # A distinct color for FRC show
@@ -565,7 +565,7 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
565
565
  else getattr(page_obj, "page_number", 1) - 1
566
566
  ),
567
567
  temporary_highlights=temp_highlights_for_page,
568
- scale=scale,
568
+ resolution=resolution,
569
569
  width=width,
570
570
  labels=labels,
571
571
  legend_position=legend_position,
@@ -244,7 +244,7 @@ class FlowRegion:
244
244
 
245
245
  def show(
246
246
  self,
247
- scale: float = 2.0,
247
+ resolution: Optional[float] = None,
248
248
  labels: bool = True,
249
249
  legend_position: str = "right",
250
250
  color: Optional[Union[Tuple, str]] = "fuchsia",
@@ -258,6 +258,21 @@ class FlowRegion:
258
258
  """
259
259
  Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
260
260
  If multiple pages are involved, they are stacked into a single image.
261
+
262
+ Args:
263
+ resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
264
+ labels: Whether to include a legend for highlights.
265
+ legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
266
+ color: Color for highlighting the constituent regions.
267
+ label_prefix: Prefix for region labels (e.g., 'FlowPart').
268
+ width: Optional width for the output image (overrides resolution).
269
+ stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
270
+ stack_gap: Gap in pixels between stacked pages.
271
+ stack_background_color: RGB background color for the stacked image.
272
+ **kwargs: Additional arguments passed to the underlying rendering methods.
273
+
274
+ Returns:
275
+ PIL Image of the rendered pages with highlighted regions, or None if rendering fails.
261
276
  """
262
277
  if not self.constituent_regions:
263
278
  logger.info("FlowRegion.show() called with no constituent regions.")
@@ -350,7 +365,7 @@ class FlowRegion:
350
365
  else getattr(page_obj, "page_number", 1) - 1
351
366
  ),
352
367
  temporary_highlights=temp_highlights_for_page,
353
- scale=scale,
368
+ resolution=resolution,
354
369
  width=width,
355
370
  labels=labels, # Pass through labels
356
371
  legend_position=legend_position,
@@ -127,7 +127,7 @@ class PaddleOCREngine(OCREngine):
127
127
  except ImportError as e:
128
128
  self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
129
129
  raise RuntimeError(
130
- "paddleocr is not available. Install via: natural-pdf install paddle"
130
+ "paddleocr is not available. Install via: npdf install paddle"
131
131
  ) from e
132
132
 
133
133
  paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
@@ -32,7 +32,7 @@ class OCRFactory:
32
32
  return SuryaOCREngine(**kwargs)
33
33
  except ImportError:
34
34
  raise ImportError(
35
- "Surya engine requires additional dependencies. " "Install with: natural-pdf install surya"
35
+ "Surya engine requires additional dependencies. " "Install with: npdf install surya"
36
36
  )
37
37
  elif engine_type == "easyocr":
38
38
  try:
@@ -42,7 +42,7 @@ class OCRFactory:
42
42
  except ImportError:
43
43
  raise ImportError(
44
44
  "EasyOCR engine requires the 'easyocr' package. "
45
- "Install with: pip install easyocr (or natural-pdf install easyocr when available)"
45
+ "Install with: pip install easyocr (or npdf install easyocr when available)"
46
46
  )
47
47
  elif engine_type == "paddle":
48
48
  try:
@@ -52,7 +52,7 @@ class OCRFactory:
52
52
  except ImportError:
53
53
  raise ImportError(
54
54
  "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
55
- "Install with: natural-pdf install paddle"
55
+ "Install with: npdf install paddle"
56
56
  )
57
57
  elif engine_type == "doctr":
58
58
  try:
@@ -137,9 +137,9 @@ class OCRFactory:
137
137
 
138
138
  # If we get here, no engines are available
139
139
  raise ImportError(
140
- "No OCR engines are installed. You can add one via the natural-pdf installer, e.g.:\n"
141
- " natural-pdf install easyocr # fastest to set up\n"
142
- " natural-pdf install paddle # best Asian-language accuracy\n"
143
- " natural-pdf install surya # Surya OCR engine\n"
144
- " natural-pdf install yolo # Layout detection (YOLO)\n"
140
+ "No OCR engines are installed. You can add one via the npdf installer, e.g.:\n"
141
+ " npdf install easyocr # fastest to set up\n"
142
+ " npdf install paddle # best Asian-language accuracy\n"
143
+ " npdf install surya # Surya OCR engine\n"
144
+ " npdf install yolo # Layout detection (YOLO)\n"
145
145
  )
@@ -94,7 +94,7 @@ class OCRManager:
94
94
  engine_instance = engine_class() # Instantiate first
95
95
  if not engine_instance.is_available():
96
96
  # Check availability before storing
97
- install_hint = f"natural-pdf install {engine_name}"
97
+ install_hint = f"npdf install {engine_name}"
98
98
 
99
99
  raise RuntimeError(
100
100
  f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
@@ -295,3 +295,53 @@ class OCRManager:
295
295
  ) # Log check failures at debug level
296
296
  pass # Ignore engines that fail to instantiate or check
297
297
  return available
298
+
299
def cleanup_engine(self, engine_name: Optional[str] = None) -> int:
    """Drop cached OCR engine instances so their memory can be reclaimed.

    Args:
        engine_name: Name of a single engine to drop (matched
            case-insensitively).  A falsy value drops every cached engine.

    Returns:
        The number of engine instances removed from the cache.
    """

    def _invoke_cleanup(label, instance) -> None:
        # Engines are not required to provide cleanup(); when one does and it
        # raises, the failure is logged at debug level and not propagated.
        if not hasattr(instance, "cleanup"):
            return
        try:
            instance.cleanup()
        except Exception as e:
            logger.debug(f"Engine {label} cleanup method failed: {e}")

    if not engine_name:
        # Iterate over a snapshot so a cleanup hook cannot disturb the loop.
        for label, instance in list(self._engine_instances.items()):
            _invoke_cleanup(label, instance)

        removed = len(self._engine_instances)
        self._engine_instances.clear()
        self._engine_locks.clear()
        self._engine_inference_locks.clear()

        if removed > 0:
            logger.info(f"Cleaned up {removed} OCR engines")
        return removed

    engine_name = engine_name.lower()
    if engine_name not in self._engine_instances:
        return 0

    _invoke_cleanup(engine_name, self._engine_instances.pop(engine_name))

    # The per-engine locks are discarded together with the instance.
    self._engine_locks.pop(engine_name, None)
    self._engine_inference_locks.pop(engine_name, None)

    logger.info(f"Cleaned up OCR engine: {engine_name}")
    return 1
@@ -224,6 +224,18 @@ def parse_selector(selector: str) -> Dict[str, Any]:
224
224
 
225
225
  selector = selector.strip()
226
226
 
227
+ # ------------------------------------------------------------------
228
+ # Handle wildcard selector (leading "*")
229
+ # ------------------------------------------------------------------
230
+ # A selector can start with "*" to denote "any element type", optionally
231
+ # followed by attribute blocks or pseudo-classes – e.g. *[width>100].
232
+ # We strip the asterisk but keep the remainder so the normal attribute
233
+ # / pseudo-class parsing logic can proceed.
234
+
235
+ if selector.startswith("*"):
236
+ # Keep everything *after* the asterisk (attributes, pseudos, etc.).
237
+ selector = selector[1:].strip()
238
+
227
239
  # --- Handle OR operators first (| or ,) ---
228
240
  # Check if selector contains OR operators at the top level only
229
241
  # (not inside quotes, parentheses, or brackets)
@@ -253,13 +265,6 @@ def parse_selector(selector: str) -> Dict[str, Any]:
253
265
 
254
266
  # --- Continue with single selector parsing (existing logic) ---
255
267
 
256
- # --- Handle wildcard selector explicitly ---
257
- if selector == "*":
258
- # Wildcard matches any type, already the default.
259
- # Clear selector so the loop doesn't run and error out.
260
- selector = ""
261
- # --- END NEW ---
262
-
263
268
  # 1. Extract type (optional, at the beginning)
264
269
  # Only run if selector wasn't '*'
265
270
  if selector:
@@ -741,6 +746,21 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
741
746
  elif name == "vertical":
742
747
  filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
743
748
 
749
+ # --- New: :strike / :strikethrough / :strikeout pseudo-classes --- #
750
+ elif name in ("strike", "strikethrough", "strikeout"):
751
+ filter_lambda = lambda el: hasattr(el, "strike") and bool(getattr(el, "strike"))
752
+ filter_name = f"pseudo-class :{name}"
753
+ elif name in ("underline", "underlined"):
754
+ filter_lambda = lambda el: hasattr(el, "underline") and bool(getattr(el, "underline"))
755
+ filter_name = f"pseudo-class :{name}"
756
+ elif name in ("highlight", "highlighted"):
757
+ # Match only if the element exposes an `is_highlighted` boolean flag.
758
+ # We deliberately avoid looking at the generic `.highlight()` method on
759
+ # Element, because it is a callable present on every element and would
760
+ # incorrectly mark everything as highlighted.
761
+ filter_lambda = lambda el: bool(getattr(el, "is_highlighted", False))
762
+ filter_name = f"pseudo-class :{name}"
763
+
744
764
  # Check predefined lambda functions (e.g., :first-child, :empty)
745
765
  elif name in PSEUDO_CLASS_FUNCTIONS:
746
766
  filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
@@ -0,0 +1,5 @@
1
+ # new file
2
+ # Re-export for convenient import
3
+ from .result import TableResult
4
+
5
+ __all__ = ["TableResult"]
@@ -0,0 +1,101 @@
1
+ """Sequence wrapper for table data with convenient DataFrame helpers."""
2
+ from __future__ import annotations
3
+
4
+ from collections.abc import Sequence
5
+ from typing import Any, List, Iterator, Optional, Union
6
+
7
+
8
class TableResult(Sequence):
    """List-of-rows plus `.df` / `.to_df()` helpers.

    The object behaves like an immutable sequence of rows (each row is a
    list of cell values) but offers an easy hand-off to *pandas*.
    """

    _IMMUTABLE_MESSAGE = (
        "TableResult is read-only; convert to list(result) if you need to mutate"
    )

    def __init__(self, rows: Optional[List[List[Any]]] = None) -> None:
        # Copy the outer container so Sequence operations work on a real list.
        # The row objects themselves are shared with the caller, not copied.
        self._rows: List[List[Any]] = list(rows or [])

    # ---------------------------------------------------------------------
    # Sequence API
    # ---------------------------------------------------------------------
    def __getitem__(self, index):  # type: ignore[override]
        return self._rows[index]

    def __len__(self) -> int:  # type: ignore[override]
        return len(self._rows)

    def __iter__(self) -> Iterator[List[Any]]:  # type: ignore[override]
        return iter(self._rows)

    # ------------------------------------------------------------------
    # Convenience helpers
    # ------------------------------------------------------------------
    @property
    def df(self):
        """Quick property alias → calls :py:meth:`to_df` with default args."""
        return self.to_df()

    def to_df(self, header: Union[str, int, List[int], None] = "first", index_col=None, **kwargs):
        """Convert to *pandas* DataFrame.

        Parameters
        ----------
        header : "first" | int | list[int] | None, default "first"
            - ``"first"`` – use row 0 as column names.
            - ``int`` – use that row index (negative values count from the end).
            - ``list[int]`` – multi-row header.
            - ``None`` / ``False`` – no header.
        index_col : int or column label, optional
            An ``int`` selects the column at that position; any other value
            is passed to ``DataFrame.set_index`` as-is.  Ignored when the
            resulting frame is empty.
        **kwargs : forwarded to :py:class:`pandas.DataFrame`.

        Returns
        -------
        pandas.DataFrame
            An empty frame when the table has no rows.

        Raises
        ------
        ImportError
            If *pandas* is not installed.
        IndexError
            If a header index is out of range.
        ValueError
            If ``header`` is of an unsupported type.
        """
        try:
            import pandas as pd  # type: ignore
        except ModuleNotFoundError as exc:
            raise ImportError(
                "pandas is required for TableResult.to_df(); install via `pip install pandas`."
            ) from exc

        rows = self._rows
        if not rows:
            return pd.DataFrame()

        n_rows = len(rows)

        # Determine header rows and body rows
        body = rows
        hdr = None
        if header == "first":
            hdr = rows[0]
            body = rows[1:]
        elif header is None or header is False:
            hdr = None
        elif isinstance(header, int):
            hdr = rows[header]  # raises IndexError when out of range
            # Normalise negative indices before slicing; slicing with the raw
            # value (e.g. -1) would duplicate rows in the body.
            pos = header % n_rows
            body = rows[:pos] + rows[pos + 1 :]
        elif isinstance(header, (list, tuple)):
            hdr = [rows[i] for i in header]
            # Compare against normalised positions so negative header indices
            # are excluded from the body as well.
            header_positions = {i % n_rows for i in header}
            body = [r for idx, r in enumerate(rows) if idx not in header_positions]
        else:
            raise ValueError("Invalid value for header parameter")

        df = pd.DataFrame(body, columns=hdr)
        if index_col is not None and not df.empty:
            df.set_index(df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True)

        if kwargs:
            df = pd.DataFrame(df, **kwargs)
        return df

    # ------------------------------------------------------------------
    # Block mutating operations to keep result read-only
    # ------------------------------------------------------------------
    def _readonly(self, *args, **kwargs):
        raise TypeError(self._IMMUTABLE_MESSAGE)

    append = extend = insert = __setitem__ = __delitem__ = clear = pop = remove = _readonly  # type: ignore

    # Nice repr in notebooks
    def __repr__(self) -> str:  # noqa: D401 (simple)
        preview = "…" if len(self._rows) > 5 else ""
        return f"TableResult(rows={len(self._rows)}{preview})"
@@ -0,0 +1,36 @@
1
+ """Light-weight bracket mirroring for RTL text.
2
+
3
+ This module provides `mirror_brackets`, a fast pure-python helper that
4
+ replaces each bracket/parenthesis character with its Unicode-defined pair.
5
+
6
+ For everyday PDFs the six ASCII pairs are enough, but the mapping can be
7
+ extended easily from Unicode's BidiBrackets.txt.
8
+ """
9
+ from typing import Dict
10
+
11
+ # Minimal mapping – ( ) [ ] { }
12
+ _ASCII_MIRROR: Dict[int, str] = {
13
+ 0x0028: ")", # ( -> )
14
+ 0x0029: "(", # ) -> (
15
+ 0x005B: "]", # [ -> ]
16
+ 0x005D: "[", # ] -> [
17
+ 0x007B: "}", # { -> }
18
+ 0x007D: "{", # } -> {
19
+ }
20
+
21
+
22
+ def mirror_brackets(text: str) -> str: # pragma: no cover
23
+ """Return *text* with each bracket replaced by its mirror partner.
24
+
25
+ The function is context-free: it blindly flips every character found in
26
+ the mapping, which is sufficient once the string is already in visual
27
+ order (e.g., after `bidi.algorithm.get_display`).
28
+ """
29
+ if not text:
30
+ return text
31
+ # Fast path: only allocate when needed
32
+ out_chars = []
33
+ append = out_chars.append
34
+ for ch in text:
35
+ append(_ASCII_MIRROR.get(ord(ch), ch))
36
+ return "".join(out_chars)
@@ -235,12 +235,26 @@ def merge_images_with_legend(
235
235
 
236
236
 
237
237
  def render_plain_page(page, resolution):
238
+ """
239
+ Render a page to PIL Image using the specified resolution.
240
+
241
+ Args:
242
+ page: Page object to render
243
+ resolution: DPI resolution for rendering
244
+
245
+ Returns:
246
+ PIL Image of the rendered page
247
+ """
238
248
  doc = pypdfium2.PdfDocument(page._page.pdf.stream)
239
249
 
240
250
  pdf_page = doc[page.index]
241
251
 
252
+ # Convert resolution (DPI) to scale factor for pypdfium2
253
+ # PDF standard is 72 DPI, so scale = resolution / 72
254
+ scale_factor = resolution / 72.0
255
+
242
256
  bitmap = pdf_page.render(
243
- scale=resolution / 72,
257
+ scale=scale_factor,
244
258
  )
245
259
  image = bitmap.to_pil().convert("RGB")
246
260
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.27
3
+ Version: 0.1.30
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -24,6 +24,7 @@ Requires-Dist: pydantic
24
24
  Requires-Dist: jenkspy
25
25
  Requires-Dist: scipy
26
26
  Requires-Dist: ipywidgets>=7.0.0
27
+ Requires-Dist: python-bidi
27
28
  Provides-Extra: test
28
29
  Requires-Dist: pytest; extra == "test"
29
30
  Requires-Dist: pytest-xdist; extra == "test"