natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +113 -22
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,16 @@
1
- """
2
- Element Manager for natural-pdf.
3
-
4
- This class handles the loading, creation, and management of PDF elements like
5
- characters, words, rectangles, and lines extracted from a page.
1
+ """Element Manager for natural-pdf.
2
+
3
+ This module handles the loading, creation, and management of PDF elements like
4
+ characters, words, rectangles, lines, and images extracted from a page. The
5
+ ElementManager class serves as the central coordinator for element lifecycle
6
+ management and provides enhanced word extraction capabilities.
7
+
8
+ The module includes:
9
+ - Element creation and caching for performance
10
+ - Custom word extraction that respects font boundaries
11
+ - OCR coordinate transformation and integration
12
+ - Text decoration detection (underline, strikethrough, highlights)
13
+ - Performance optimizations for bulk text processing
6
14
  """
7
15
 
8
16
  import logging
@@ -13,10 +21,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union
13
21
 
14
22
  from pdfplumber.utils.text import WordExtractor
15
23
 
24
+ from natural_pdf.elements.image import ImageElement
16
25
  from natural_pdf.elements.line import LineElement
17
26
  from natural_pdf.elements.rect import RectangleElement
18
27
  from natural_pdf.elements.text import TextElement
19
- from natural_pdf.elements.image import ImageElement
20
28
 
21
29
  logger = logging.getLogger(__name__)
22
30
 
@@ -25,8 +33,8 @@ logger = logging.getLogger(__name__)
25
33
  # ------------------------------------------------------------------
26
34
 
27
35
  STRIKE_DEFAULTS = {
28
- "thickness_tol": 1.5, # pt ; max height of line/rect to be considered strike
29
- "horiz_tol": 1.0, # pt ; vertical tolerance for horizontality
36
+ "thickness_tol": 1.5, # pt ; max height of line/rect to be considered strike
37
+ "horiz_tol": 1.0, # pt ; vertical tolerance for horizontality
30
38
  "coverage_ratio": 0.7, # proportion of glyph width to be overlapped
31
39
  "band_top_frac": 0.35, # fraction of glyph height above top baseline band
32
40
  "band_bottom_frac": 0.65, # fraction below top (same used internally)
@@ -36,48 +44,90 @@ UNDERLINE_DEFAULTS = {
36
44
  "thickness_tol": 1.5,
37
45
  "horiz_tol": 1.0,
38
46
  "coverage_ratio": 0.8,
39
- "band_frac": 0.25, # height fraction above baseline
40
- "below_pad": 0.7, # pt ; pad below baseline
47
+ "band_frac": 0.25, # height fraction above baseline
48
+ "below_pad": 0.7, # pt ; pad below baseline
41
49
  }
42
50
 
43
51
  HIGHLIGHT_DEFAULTS = {
44
52
  "height_min_ratio": 0.6, # rect height relative to char height lower bound
45
53
  "height_max_ratio": 2.0, # upper bound
46
- "coverage_ratio": 0.6, # horizontal overlap with glyph
54
+ "coverage_ratio": 0.6, # horizontal overlap with glyph
47
55
  "color_saturation_min": 0.4, # HSV S >
48
- "color_value_min": 0.4, # HSV V >
56
+ "color_value_min": 0.4, # HSV V >
49
57
  }
50
58
 
51
59
 
52
60
  @contextmanager
53
61
  def disable_text_sync():
54
- """
55
- Temporarily disable text synchronization for performance.
56
-
57
- This is used when bulk-updating text content where character-level
58
- synchronization is not needed, such as during bidi processing.
59
- Fixes exponential recursion issue with Arabic/RTL text processing.
62
+ """Temporarily disable text synchronization for performance.
63
+
64
+ This context manager is used when bulk-updating text content where character-level
65
+ synchronization is not needed, such as during bidi processing or large-scale
66
+ text transformations. It prevents exponential recursion issues with Arabic/RTL
67
+ text processing by bypassing the normal text property setter.
68
+
69
+ Yields:
70
+ None: The context where text synchronization is disabled.
71
+
72
+ Example:
73
+ ```python
74
+ with disable_text_sync():
75
+ for element in text_elements:
76
+ element.text = process_arabic_text(element.text)
77
+ # Text sync automatically restored after the block
78
+ ```
79
+
80
+ Note:
81
+ This optimization is critical for performance when processing documents
82
+ with complex text layouts or right-to-left scripts that would otherwise
83
+ trigger expensive character synchronization operations.
60
84
  """
61
85
  # Save original setter
62
86
  original_setter = TextElement.text.fset
63
-
87
+
64
88
  # Create a fast setter that skips sync
65
89
  def fast_setter(self, value):
66
90
  self._obj["text"] = value
67
91
  # Skip character synchronization for performance
68
-
92
+
69
93
  # Apply fast setter
70
94
  TextElement.text = property(TextElement.text.fget, fast_setter)
71
-
95
+
72
96
  try:
73
97
  yield
74
98
  finally:
75
99
  # Restore original setter
76
100
  TextElement.text = property(TextElement.text.fget, original_setter)
77
101
 
102
+
78
103
  class NaturalWordExtractor(WordExtractor):
79
- """
80
- Custom WordExtractor that splits words based on specified character attributes
104
+ """Custom WordExtractor that splits words based on specified character attributes.
105
+
106
+ This class extends pdfplumber's WordExtractor to provide more intelligent word
107
+ segmentation that respects font boundaries and other character attributes.
108
+ It prevents words from spanning across different fonts, sizes, or styles,
109
+ which is essential for maintaining semantic meaning in document analysis.
110
+
111
+ The extractor considers multiple character attributes when determining word
112
+ boundaries, ensuring that visually distinct text elements (like bold headers
113
+ mixed with regular text) are properly separated into distinct words.
114
+
115
+ Attributes:
116
+ font_attrs: List of character attributes to consider for word boundaries.
117
+ Common attributes include 'fontname', 'size', 'flags', etc.
118
+
119
+ Example:
120
+ ```python
121
+ # Create extractor that splits on font and size changes
122
+ extractor = NaturalWordExtractor(['fontname', 'size'])
123
+
124
+ # Extract words with font-aware boundaries
125
+ words = extractor.extract_words(page_chars)
126
+
127
+ # Each word will have consistent font properties
128
+ for word in words:
129
+ print(f"'{word['text']}' in {word['fontname']} size {word['size']}")
130
+ ```
81
131
  These attribute-based splits apply in addition to pdfplumber's default spatial logic.
82
132
  """
83
133
 
@@ -198,7 +248,9 @@ class ElementManager:
198
248
  if self._load_text and prepared_char_dicts:
199
249
  try:
200
250
  self._mark_strikethrough_chars(prepared_char_dicts)
201
- except Exception as strike_err: # pragma: no cover – strike detection must never crash loading
251
+ except (
252
+ Exception
253
+ ) as strike_err: # pragma: no cover – strike detection must never crash loading
202
254
  logger.warning(
203
255
  f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
204
256
  exc_info=True,
@@ -244,16 +296,16 @@ class ElementManager:
244
296
  # 2. Instantiate the custom word extractor
245
297
  # Prefer page-level config over PDF-level for tolerance lookup
246
298
  word_elements: List[TextElement] = []
247
-
299
+
248
300
  # Get config objects (needed for auto_text_tolerance check)
249
301
  page_config = getattr(self._page, "_config", {})
250
302
  pdf_config = getattr(self._page._parent, "_config", {})
251
-
303
+
252
304
  # Initialize tolerance variables
253
305
  xt = None
254
306
  yt = None
255
307
  use_flow = pdf_config.get("use_text_flow", False)
256
-
308
+
257
309
  if self._load_text and prepared_char_dicts:
258
310
  # Start with any explicitly supplied tolerances (may be None)
259
311
  xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
@@ -275,7 +327,7 @@ class ElementManager:
275
327
  # Record back to page config for downstream users
276
328
  page_config["x_tolerance"] = xt
277
329
  if yt is None:
278
- yt = 0.6 * median_size # ~line spacing fraction
330
+ yt = 0.6 * median_size # ~line spacing fraction
279
331
  page_config["y_tolerance"] = yt
280
332
 
281
333
  # Warn users when the page's font size is extremely small –
@@ -364,7 +416,8 @@ class ElementManager:
364
416
  char_dir = "ltr"
365
417
 
366
418
  extractor = NaturalWordExtractor(
367
- word_split_attributes=self._word_split_attributes + ["strike", "underline", "highlight"],
419
+ word_split_attributes=self._word_split_attributes
420
+ + ["strike", "underline", "highlight"],
368
421
  extra_attrs=attributes_to_preserve,
369
422
  x_tolerance=xt,
370
423
  y_tolerance=yt,
@@ -413,12 +466,13 @@ class ElementManager:
413
466
  # Convert from visual order (from PDF) to logical order using bidi
414
467
  try:
415
468
  from bidi.algorithm import get_display # type: ignore
469
+
416
470
  from natural_pdf.utils.bidi_mirror import mirror_brackets
417
471
 
418
472
  with disable_text_sync():
419
473
  # word_element.text is currently in visual order (from PDF)
420
474
  # Convert to logical order using bidi with an explicit LTR base direction (base_dir="L")
421
- logical_text = get_display(word_element.text, base_dir='L')
475
+ logical_text = get_display(word_element.text, base_dir="L")
422
476
  # Apply bracket mirroring for logical order
423
477
  word_element.text = mirror_brackets(logical_text)
424
478
  except Exception:
@@ -495,7 +549,11 @@ class ElementManager:
495
549
  if color_counts:
496
550
  dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
497
551
  try:
498
- w._obj["highlight_color"] = tuple(dominant_color) if isinstance(dominant_color, (list, tuple)) else dominant_color
552
+ w._obj["highlight_color"] = (
553
+ tuple(dominant_color)
554
+ if isinstance(dominant_color, (list, tuple))
555
+ else dominant_color
556
+ )
499
557
  except Exception:
500
558
  w._obj["highlight_color"] = dominant_color
501
559
 
@@ -998,12 +1056,16 @@ class ElementManager:
998
1056
  # Strikethrough detection (horizontal strike-out lines)
999
1057
  # ------------------------------------------------------------------
1000
1058
 
1001
- def _mark_strikethrough_chars(self, char_dicts: List[Dict[str, Any]], *,
1002
- thickness_tol: float = 1.5,
1003
- horiz_tol: float = 1.0,
1004
- coverage_ratio: float = 0.7,
1005
- band_top: float = 0.35,
1006
- band_bottom: float = 0.65) -> None:
1059
+ def _mark_strikethrough_chars(
1060
+ self,
1061
+ char_dicts: List[Dict[str, Any]],
1062
+ *,
1063
+ thickness_tol: float = 1.5,
1064
+ horiz_tol: float = 1.0,
1065
+ coverage_ratio: float = 0.7,
1066
+ band_top: float = 0.35,
1067
+ band_bottom: float = 0.65,
1068
+ ) -> None:
1007
1069
  """Annotate character dictionaries with a boolean ``strike`` flag.
1008
1070
 
1009
1071
  Args
@@ -1102,11 +1164,31 @@ class ElementManager:
1102
1164
  # Allow user overrides via PDF._config["underline_detection"]
1103
1165
  pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
1104
1166
 
1105
- thickness_tol = thickness_tol if thickness_tol is not None else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
1106
- horiz_tol = horiz_tol if horiz_tol is not None else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
1107
- coverage_ratio= coverage_ratio if coverage_ratio is not None else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
1108
- band_frac = band_frac if band_frac is not None else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
1109
- below_pad = below_pad if below_pad is not None else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
1167
+ thickness_tol = (
1168
+ thickness_tol
1169
+ if thickness_tol is not None
1170
+ else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
1171
+ )
1172
+ horiz_tol = (
1173
+ horiz_tol
1174
+ if horiz_tol is not None
1175
+ else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
1176
+ )
1177
+ coverage_ratio = (
1178
+ coverage_ratio
1179
+ if coverage_ratio is not None
1180
+ else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
1181
+ )
1182
+ band_frac = (
1183
+ band_frac
1184
+ if band_frac is not None
1185
+ else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
1186
+ )
1187
+ below_pad = (
1188
+ below_pad
1189
+ if below_pad is not None
1190
+ else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
1191
+ )
1110
1192
 
1111
1193
  raw_lines = list(getattr(self._page._page, "lines", []))
1112
1194
  raw_rects = list(getattr(self._page._page, "rects", []))
@@ -1148,7 +1230,7 @@ class ElementManager:
1148
1230
  table_y = {k for k, v in y_groups.items() if v >= 3}
1149
1231
 
1150
1232
  # filter out candidates on those y values
1151
- filtered_candidates = [c for c in candidates if int((c[1]+c[3])/2) not in table_y]
1233
+ filtered_candidates = [c for c in candidates if int((c[1] + c[3]) / 2) not in table_y]
1152
1234
 
1153
1235
  # annotate chars
1154
1236
  for ch in char_dicts:
@@ -1205,7 +1287,9 @@ class ElementManager:
1205
1287
  y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
1206
1288
  y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
1207
1289
  rheight = y1_rect - y0_rect
1208
- highlight_rects.append((rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col))
1290
+ highlight_rects.append(
1291
+ (rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col)
1292
+ )
1209
1293
 
1210
1294
  if not highlight_rects:
1211
1295
  for ch in char_dicts:
@@ -1238,7 +1322,9 @@ class ElementManager:
1238
1322
  if overlap > 0 and (overlap / width) >= coverage_ratio:
1239
1323
  ch["highlight"] = True
1240
1324
  try:
1241
- ch["highlight_color"] = tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
1325
+ ch["highlight_color"] = (
1326
+ tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
1327
+ )
1242
1328
  except Exception:
1243
1329
  ch["highlight_color"] = rcolor
1244
1330
  break
@@ -98,7 +98,9 @@ class HighlightRenderer:
98
98
  scaled_bbox = None
99
99
 
100
100
  if highlight.is_polygon:
101
- scaled_polygon = [(p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon]
101
+ scaled_polygon = [
102
+ (p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon
103
+ ]
102
104
  # Draw polygon fill and border
103
105
  draw.polygon(
104
106
  scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
@@ -597,7 +599,7 @@ class HighlightingService:
597
599
  if page_index in self._highlights_by_page:
598
600
  del self._highlights_by_page[page_index]
599
601
  logger.debug(f"Cleared highlights for page {page_index}.")
600
-
602
+
601
603
  # Also clear any cached rendered images for this page so the next render
602
604
  # reflects the removal of highlights.
603
605
  try:
@@ -683,7 +685,6 @@ class HighlightingService:
683
685
  )
684
686
 
685
687
  try:
686
- # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
687
688
  img_object = page_obj._page.to_image(**to_image_args)
688
689
  base_image_pil = (
689
690
  img_object.annotated
@@ -929,9 +930,7 @@ class HighlightingService:
929
930
  right_px = max(left_px + 1, min(right_px, rendered_image.width))
930
931
  bottom_px = max(top_px + 1, min(bottom_px, rendered_image.height))
931
932
 
932
- rendered_image = rendered_image.crop(
933
- (left_px, top_px, right_px, bottom_px)
934
- )
933
+ rendered_image = rendered_image.crop((left_px, top_px, right_px, bottom_px))
935
934
 
936
935
  legend = None
937
936
  if labels:
natural_pdf/core/page.py CHANGED
@@ -77,7 +77,6 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
77
77
  # --- End Classification Imports --- #
78
78
 
79
79
 
80
-
81
80
  # --- End Shape Detection Mixin --- #
82
81
 
83
82
 
@@ -94,23 +93,107 @@ logger = logging.getLogger(__name__)
94
93
 
95
94
 
96
95
  class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
97
- """
98
- Enhanced Page wrapper built on top of pdfplumber.Page.
96
+ """Enhanced Page wrapper built on top of pdfplumber.Page.
99
97
 
100
98
  This class provides a fluent interface for working with PDF pages,
101
99
  with improved selection, navigation, extraction, and question-answering capabilities.
100
+ It integrates multiple analysis capabilities through mixins and provides spatial
101
+ navigation with CSS-like selectors.
102
+
103
+ The Page class serves as the primary interface for document analysis, offering:
104
+ - Element selection and spatial navigation
105
+ - OCR and layout analysis integration
106
+ - Table detection and extraction
107
+ - AI-powered classification and data extraction
108
+ - Visual debugging with highlighting and cropping
109
+ - Text style analysis and structure detection
110
+
111
+ Attributes:
112
+ index: Zero-based index of this page in the PDF.
113
+ number: One-based page number (index + 1).
114
+ width: Page width in points.
115
+ height: Page height in points.
116
+ bbox: Bounding box tuple (x0, top, x1, bottom) of the page.
117
+ chars: Collection of character elements on the page.
118
+ words: Collection of word elements on the page.
119
+ lines: Collection of line elements on the page.
120
+ rects: Collection of rectangle elements on the page.
121
+ images: Collection of image elements on the page.
122
+ metadata: Dictionary for storing analysis results and custom data.
123
+
124
+ Example:
125
+ Basic usage:
126
+ ```python
127
+ pdf = npdf.PDF("document.pdf")
128
+ page = pdf.pages[0]
129
+
130
+ # Find elements with CSS-like selectors
131
+ headers = page.find_all('text[size>12]:bold')
132
+ summaries = page.find('text:contains("Summary")')
133
+
134
+ # Spatial navigation
135
+ content_below = summaries.below(until='text[size>12]:bold')
136
+
137
+ # Table extraction
138
+ table = page.extract_table()
139
+ ```
140
+
141
+ Advanced usage:
142
+ ```python
143
+ # Apply OCR if needed
144
+ page.apply_ocr(engine='easyocr', resolution=300)
145
+
146
+ # Layout analysis
147
+ page.analyze_layout(engine='yolo')
148
+
149
+ # AI-powered extraction
150
+ data = page.extract_structured_data(MySchema)
151
+
152
+ # Visual debugging
153
+ page.find('text:contains("Important")').show()
154
+ ```
102
155
  """
103
156
 
104
- def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None, load_text: bool = True):
105
- """
106
- Initialize a page wrapper.
157
+ def __init__(
158
+ self,
159
+ page: "pdfplumber.page.Page",
160
+ parent: "PDF",
161
+ index: int,
162
+ font_attrs=None,
163
+ load_text: bool = True,
164
+ ):
165
+ """Initialize a page wrapper.
166
+
167
+ Creates an enhanced Page object that wraps a pdfplumber page with additional
168
+ functionality for spatial navigation, analysis, and AI-powered extraction.
107
169
 
108
170
  Args:
109
- page: pdfplumber page object
110
- parent: Parent PDF object
111
- index: Index of this page in the PDF (0-based)
112
- font_attrs: Font attributes to consider when grouping characters into words.
113
- load_text: Whether to load text elements from the PDF (default: True).
171
+ page: The underlying pdfplumber page object that provides raw PDF data.
172
+ parent: Parent PDF object that contains this page and provides access
173
+ to managers and global settings.
174
+ index: Zero-based index of this page in the PDF document.
175
+ font_attrs: List of font attributes to consider when grouping characters
176
+ into words. Common attributes include ['fontname', 'size', 'flags'].
177
+ If None, uses default character-to-word grouping rules.
178
+ load_text: If True, load and process text elements from the PDF's text layer.
179
+ If False, skip text layer processing (useful for OCR-only workflows).
180
+
181
+ Note:
182
+ This constructor is typically called automatically when accessing pages
183
+ through the PDF.pages collection. Direct instantiation is rarely needed.
184
+
185
+ Example:
186
+ ```python
187
+ # Pages are usually accessed through the PDF object
188
+ pdf = npdf.PDF("document.pdf")
189
+ page = pdf.pages[0] # Page object created automatically
190
+
191
+ # Direct construction (advanced usage)
192
+ import pdfplumber
193
+ with pdfplumber.open("document.pdf") as plumber_pdf:
194
+ plumber_page = plumber_pdf.pages[0]
195
+ page = Page(plumber_page, pdf, 0, load_text=True)
196
+ ```
114
197
  """
115
198
  self._page = page
116
199
  self._parent = parent
@@ -1190,6 +1273,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1190
1273
  if _contains_rtl(result):
1191
1274
  try:
1192
1275
  from bidi.algorithm import get_display # type: ignore
1276
+
1193
1277
  from natural_pdf.utils.bidi_mirror import mirror_brackets
1194
1278
 
1195
1279
  result = "\n".join(
@@ -1199,8 +1283,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1199
1283
  base_dir=(
1200
1284
  "R"
1201
1285
  if any(
1202
- unicodedata.bidirectional(ch)
1203
- in ("R", "AL", "AN")
1286
+ unicodedata.bidirectional(ch) in ("R", "AL", "AN")
1204
1287
  for ch in line
1205
1288
  )
1206
1289
  else "L"
@@ -1396,11 +1479,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1396
1479
  table_settings.setdefault("text_y_tolerance", y_tol)
1397
1480
 
1398
1481
  # pdfplumber's text strategy benefits from a tight snap tolerance.
1399
- if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
1482
+ if (
1483
+ "snap_tolerance" not in table_settings
1484
+ and "snap_x_tolerance" not in table_settings
1485
+ ):
1400
1486
  # Derive from y_tol if available, else default 1
1401
1487
  snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
1402
1488
  table_settings.setdefault("snap_tolerance", snap)
1403
- if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
1489
+ if (
1490
+ "join_tolerance" not in table_settings
1491
+ and "join_x_tolerance" not in table_settings
1492
+ ):
1404
1493
  join = table_settings.get("snap_tolerance", 1)
1405
1494
  table_settings.setdefault("join_tolerance", join)
1406
1495
  table_settings.setdefault("join_x_tolerance", join)
@@ -2998,29 +3087,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2998
3087
  InspectionSummary with element tables showing coordinates,
2999
3088
  properties, and other details for each element
3000
3089
  """
3001
- return self.find_all('*').inspect(limit=limit)
3090
+ return self.find_all("*").inspect(limit=limit)
3002
3091
 
3003
3092
  def remove_text_layer(self) -> "Page":
3004
3093
  """
3005
3094
  Remove all text elements from this page.
3006
-
3095
+
3007
3096
  This removes all text elements (words and characters) from the page,
3008
3097
  effectively clearing the text layer.
3009
-
3098
+
3010
3099
  Returns:
3011
3100
  Self for method chaining
3012
3101
  """
3013
3102
  logger.info(f"Page {self.number}: Removing all text elements...")
3014
-
3103
+
3015
3104
  # Remove all words and chars from the element manager
3016
3105
  removed_words = len(self._element_mgr.words)
3017
3106
  removed_chars = len(self._element_mgr.chars)
3018
-
3107
+
3019
3108
  # Clear the lists
3020
3109
  self._element_mgr._elements["words"] = []
3021
3110
  self._element_mgr._elements["chars"] = []
3022
-
3023
- logger.info(f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters")
3111
+
3112
+ logger.info(
3113
+ f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters"
3114
+ )
3024
3115
  return self
3025
3116
 
3026
3117
  @property