natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py CHANGED
@@ -77,7 +77,6 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
77
77
  # --- End Classification Imports --- #
78
78
 
79
79
 
80
-
81
80
  # --- End Shape Detection Mixin --- #
82
81
 
83
82
 
@@ -94,26 +93,112 @@ logger = logging.getLogger(__name__)
94
93
 
95
94
 
96
95
  class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
97
- """
98
- Enhanced Page wrapper built on top of pdfplumber.Page.
96
+ """Enhanced Page wrapper built on top of pdfplumber.Page.
99
97
 
100
98
  This class provides a fluent interface for working with PDF pages,
101
99
  with improved selection, navigation, extraction, and question-answering capabilities.
100
+ It integrates multiple analysis capabilities through mixins and provides spatial
101
+ navigation with CSS-like selectors.
102
+
103
+ The Page class serves as the primary interface for document analysis, offering:
104
+ - Element selection and spatial navigation
105
+ - OCR and layout analysis integration
106
+ - Table detection and extraction
107
+ - AI-powered classification and data extraction
108
+ - Visual debugging with highlighting and cropping
109
+ - Text style analysis and structure detection
110
+
111
+ Attributes:
112
+ index: Zero-based index of this page in the PDF.
113
+ number: One-based page number (index + 1).
114
+ width: Page width in points.
115
+ height: Page height in points.
116
+ bbox: Bounding box tuple (x0, top, x1, bottom) of the page.
117
+ chars: Collection of character elements on the page.
118
+ words: Collection of word elements on the page.
119
+ lines: Collection of line elements on the page.
120
+ rects: Collection of rectangle elements on the page.
121
+ images: Collection of image elements on the page.
122
+ metadata: Dictionary for storing analysis results and custom data.
123
+
124
+ Example:
125
+ Basic usage:
126
+ ```python
127
+ pdf = npdf.PDF("document.pdf")
128
+ page = pdf.pages[0]
129
+
130
+ # Find elements with CSS-like selectors
131
+ headers = page.find_all('text[size>12]:bold')
132
+ summaries = page.find('text:contains("Summary")')
133
+
134
+ # Spatial navigation
135
+ content_below = summaries.below(until='text[size>12]:bold')
136
+
137
+ # Table extraction
138
+ tables = page.extract_table()
139
+ ```
140
+
141
+ Advanced usage:
142
+ ```python
143
+ # Apply OCR if needed
144
+ page.apply_ocr(engine='easyocr', resolution=300)
145
+
146
+ # Layout analysis
147
+ page.analyze_layout(engine='yolo')
148
+
149
+ # AI-powered extraction
150
+ data = page.extract_structured_data(MySchema)
151
+
152
+ # Visual debugging
153
+ page.find('text:contains("Important")').show()
154
+ ```
102
155
  """
103
156
 
104
- def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
105
- """
106
- Initialize a page wrapper.
157
+ def __init__(
158
+ self,
159
+ page: "pdfplumber.page.Page",
160
+ parent: "PDF",
161
+ index: int,
162
+ font_attrs=None,
163
+ load_text: bool = True,
164
+ ):
165
+ """Initialize a page wrapper.
166
+
167
+ Creates an enhanced Page object that wraps a pdfplumber page with additional
168
+ functionality for spatial navigation, analysis, and AI-powered extraction.
107
169
 
108
170
  Args:
109
- page: pdfplumber page object
110
- parent: Parent PDF object
111
- index: Index of this page in the PDF (0-based)
112
- font_attrs: Font attributes to consider when grouping characters into words.
171
+ page: The underlying pdfplumber page object that provides raw PDF data.
172
+ parent: Parent PDF object that contains this page and provides access
173
+ to managers and global settings.
174
+ index: Zero-based index of this page in the PDF document.
175
+ font_attrs: List of font attributes to consider when grouping characters
176
+ into words. Common attributes include ['fontname', 'size', 'flags'].
177
+ If None, uses default character-to-word grouping rules.
178
+ load_text: If True, load and process text elements from the PDF's text layer.
179
+ If False, skip text layer processing (useful for OCR-only workflows).
180
+
181
+ Note:
182
+ This constructor is typically called automatically when accessing pages
183
+ through the PDF.pages collection. Direct instantiation is rarely needed.
184
+
185
+ Example:
186
+ ```python
187
+ # Pages are usually accessed through the PDF object
188
+ pdf = npdf.PDF("document.pdf")
189
+ page = pdf.pages[0] # Page object created automatically
190
+
191
+ # Direct construction (advanced usage)
192
+ import pdfplumber
193
+ with pdfplumber.open("document.pdf") as plumber_pdf:
194
+ plumber_page = plumber_pdf.pages[0]
195
+ page = Page(plumber_page, pdf, 0, load_text=True)
196
+ ```
113
197
  """
114
198
  self._page = page
115
199
  self._parent = parent
116
200
  self._index = index
201
+ self._load_text = load_text
117
202
  self._text_styles = None # Lazy-loaded text style analyzer results
118
203
  self._exclusions = [] # List to store exclusion functions/regions
119
204
  self._skew_angle: Optional[float] = None # Stores detected skew angle
@@ -136,7 +221,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
136
221
  self._config = dict(getattr(self._parent, "_config", {}))
137
222
 
138
223
  # Initialize ElementManager, passing font_attrs
139
- self._element_mgr = ElementManager(self, font_attrs=font_attrs)
224
+ self._element_mgr = ElementManager(self, font_attrs=font_attrs, load_text=self._load_text)
140
225
  # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
141
226
  # --- NEW --- Central registry for analysis results
142
227
  self.analyses: Dict[str, Any] = {}
@@ -1188,6 +1273,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1188
1273
  if _contains_rtl(result):
1189
1274
  try:
1190
1275
  from bidi.algorithm import get_display # type: ignore
1276
+
1191
1277
  from natural_pdf.utils.bidi_mirror import mirror_brackets
1192
1278
 
1193
1279
  result = "\n".join(
@@ -1197,8 +1283,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1197
1283
  base_dir=(
1198
1284
  "R"
1199
1285
  if any(
1200
- unicodedata.bidirectional(ch)
1201
- in ("R", "AL", "AN")
1286
+ unicodedata.bidirectional(ch) in ("R", "AL", "AN")
1202
1287
  for ch in line
1203
1288
  )
1204
1289
  else "L"
@@ -1394,11 +1479,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1394
1479
  table_settings.setdefault("text_y_tolerance", y_tol)
1395
1480
 
1396
1481
  # pdfplumber's text strategy benefits from a tight snap tolerance.
1397
- if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
1482
+ if (
1483
+ "snap_tolerance" not in table_settings
1484
+ and "snap_x_tolerance" not in table_settings
1485
+ ):
1398
1486
  # Derive from y_tol if available, else default 1
1399
1487
  snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
1400
1488
  table_settings.setdefault("snap_tolerance", snap)
1401
- if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
1489
+ if (
1490
+ "join_tolerance" not in table_settings
1491
+ and "join_x_tolerance" not in table_settings
1492
+ ):
1402
1493
  join = table_settings.get("snap_tolerance", 1)
1403
1494
  table_settings.setdefault("join_tolerance", join)
1404
1495
  table_settings.setdefault("join_x_tolerance", join)
@@ -2996,7 +3087,32 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2996
3087
  InspectionSummary with element tables showing coordinates,
2997
3088
  properties, and other details for each element
2998
3089
  """
2999
- return self.find_all('*').inspect(limit=limit)
3090
+ return self.find_all("*").inspect(limit=limit)
3091
+
3092
+ def remove_text_layer(self) -> "Page":
3093
+ """
3094
+ Remove all text elements from this page.
3095
+
3096
+ This removes all text elements (words and characters) from the page,
3097
+ effectively clearing the text layer.
3098
+
3099
+ Returns:
3100
+ Self for method chaining
3101
+ """
3102
+ logger.info(f"Page {self.number}: Removing all text elements...")
3103
+
3104
+ # Remove all words and chars from the element manager
3105
+ removed_words = len(self._element_mgr.words)
3106
+ removed_chars = len(self._element_mgr.chars)
3107
+
3108
+ # Clear the lists
3109
+ self._element_mgr._elements["words"] = []
3110
+ self._element_mgr._elements["chars"] = []
3111
+
3112
+ logger.info(
3113
+ f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters"
3114
+ )
3115
+ return self
3000
3116
 
3001
3117
  @property
3002
3118
  def lines(self) -> List[Any]: