natural-pdf 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -146,7 +146,7 @@ class ElementManager:
146
146
  contained in the Page class, providing better separation of concerns.
147
147
  """
148
148
 
149
- def __init__(self, page, font_attrs=None):
149
+ def __init__(self, page, font_attrs=None, load_text: bool = True):
150
150
  """
151
151
  Initialize the ElementManager.
152
152
 
@@ -156,9 +156,11 @@ class ElementManager:
156
156
  Default: ['fontname', 'size', 'bold', 'italic']
157
157
  None: Only consider spatial relationships
158
158
  List: Custom attributes to consider
159
+ load_text: Whether to load text elements from the PDF (default: True).
159
160
  """
160
161
  self._page = page
161
162
  self._elements = None # Lazy-loaded
163
+ self._load_text = load_text
162
164
  # Default to splitting by fontname, size, bold, italic if not specified
163
165
  # Renamed internal variable for clarity
164
166
  self._word_split_attributes = (
@@ -175,11 +177,15 @@ class ElementManager:
175
177
 
176
178
  logger.debug(f"Page {self._page.number}: Loading elements...")
177
179
 
178
- # 1. Prepare character dictionaries (native + OCR) with necessary attributes
179
- prepared_char_dicts = self._prepare_char_dicts()
180
- logger.debug(
181
- f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
182
- )
180
+ # 1. Prepare character dictionaries only if loading text
181
+ if self._load_text:
182
+ prepared_char_dicts = self._prepare_char_dicts()
183
+ logger.debug(
184
+ f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
185
+ )
186
+ else:
187
+ prepared_char_dicts = []
188
+ logger.debug(f"Page {self._page.number}: Skipping text loading (load_text=False)")
183
189
 
184
190
  # -------------------------------------------------------------
185
191
  # Detect strikethrough (horizontal strike-out lines) on raw
@@ -189,52 +195,105 @@ class ElementManager:
189
195
  # belong to the same word.
190
196
  # -------------------------------------------------------------
191
197
 
192
- try:
193
- self._mark_strikethrough_chars(prepared_char_dicts)
194
- except Exception as strike_err: # pragma: no cover – strike detection must never crash loading
195
- logger.warning(
196
- f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
197
- exc_info=True,
198
- )
198
+ if self._load_text and prepared_char_dicts:
199
+ try:
200
+ self._mark_strikethrough_chars(prepared_char_dicts)
201
+ except Exception as strike_err: # pragma: no cover – strike detection must never crash loading
202
+ logger.warning(
203
+ f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
204
+ exc_info=True,
205
+ )
199
206
 
200
207
  # -------------------------------------------------------------
201
208
  # Detect underlines on raw characters (must come after strike so
202
209
  # both attributes are present before word grouping).
203
210
  # -------------------------------------------------------------
204
211
 
205
- try:
206
- self._mark_underline_chars(prepared_char_dicts)
207
- except Exception as u_err: # pragma: no cover
208
- logger.warning(
209
- f"Page {self._page.number}: Underline detection failed – {u_err}",
210
- exc_info=True,
211
- )
212
+ if self._load_text and prepared_char_dicts:
213
+ try:
214
+ self._mark_underline_chars(prepared_char_dicts)
215
+ except Exception as u_err: # pragma: no cover
216
+ logger.warning(
217
+ f"Page {self._page.number}: Underline detection failed – {u_err}",
218
+ exc_info=True,
219
+ )
212
220
 
213
221
  # Detect highlights
214
- try:
215
- self._mark_highlight_chars(prepared_char_dicts)
216
- except Exception as h_err:
217
- logger.warning(
218
- f"Page {self._page.number}: Highlight detection failed – {h_err}",
219
- exc_info=True,
220
- )
222
+ if self._load_text and prepared_char_dicts:
223
+ try:
224
+ self._mark_highlight_chars(prepared_char_dicts)
225
+ except Exception as h_err:
226
+ logger.warning(
227
+ f"Page {self._page.number}: Highlight detection failed – {h_err}",
228
+ exc_info=True,
229
+ )
221
230
 
222
231
  # Create a mapping from character dict to index for efficient lookup
223
- char_to_index = {}
224
- for idx, char_dict in enumerate(prepared_char_dicts):
225
- key = (
226
- char_dict.get("x0", 0),
227
- char_dict.get("top", 0),
228
- char_dict.get("text", ""),
229
- )
230
- char_to_index[key] = idx
232
+ if self._load_text:
233
+ char_to_index = {}
234
+ for idx, char_dict in enumerate(prepared_char_dicts):
235
+ key = (
236
+ char_dict.get("x0", 0),
237
+ char_dict.get("top", 0),
238
+ char_dict.get("text", ""),
239
+ )
240
+ char_to_index[key] = idx
241
+ else:
242
+ char_to_index = {}
231
243
 
232
244
  # 2. Instantiate the custom word extractor
233
- # Get config settings from the parent PDF or use defaults
245
+ # Prefer page-level config over PDF-level for tolerance lookup
246
+ word_elements: List[TextElement] = []
247
+
248
+ # Get config objects (needed for auto_text_tolerance check)
249
+ page_config = getattr(self._page, "_config", {})
234
250
  pdf_config = getattr(self._page._parent, "_config", {})
235
- xt = pdf_config.get("x_tolerance", 3)
236
- yt = pdf_config.get("y_tolerance", 3)
251
+
252
+ # Initialize tolerance variables
253
+ xt = None
254
+ yt = None
237
255
  use_flow = pdf_config.get("use_text_flow", False)
256
+
257
+ if self._load_text and prepared_char_dicts:
258
+ # Start with any explicitly supplied tolerances (may be None)
259
+ xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
260
+ yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
261
+
262
+ # ------------------------------------------------------------------
263
+ # Auto-adaptive tolerance: scale based on median character size when
264
+ # requested and explicit values are absent.
265
+ # ------------------------------------------------------------------
266
+ if self._load_text and pdf_config.get("auto_text_tolerance", True):
267
+ import statistics
268
+
269
+ sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
270
+ median_size = None
271
+ if sizes:
272
+ median_size = statistics.median(sizes)
273
+ if xt is None:
274
+ xt = 0.25 * median_size # ~kerning width
275
+ # Record back to page config for downstream users
276
+ page_config["x_tolerance"] = xt
277
+ if yt is None:
278
+ yt = 0.6 * median_size # ~line spacing fraction
279
+ page_config["y_tolerance"] = yt
280
+
281
+ # Warn users when the page's font size is extremely small –
282
+ # this is often the root cause of merged-row/column issues.
283
+ if median_size and median_size < 6: # 6 pt is unusually small
284
+ logger.warning(
285
+ f"Page {self._page.number}: Median font size is only {median_size:.1f} pt; "
286
+ f"auto-set x_tolerance={xt:.2f}, y_tolerance={yt:.2f}. "
287
+ "If the output looks wrong you can override these values via "
288
+ "PDF(..., text_tolerance={'x_tolerance': X, 'y_tolerance': Y}, "
289
+ "auto_text_tolerance=False)."
290
+ )
291
+
292
+ # Fallback to pdfplumber defaults if still None
293
+ if xt is None:
294
+ xt = 3
295
+ if yt is None:
296
+ yt = 3
238
297
 
239
298
  # List of attributes to preserve on word objects
240
299
  attributes_to_preserve = list(
@@ -284,7 +343,6 @@ class ElementManager:
284
343
  current_line_key = line_key
285
344
  lines[-1].append(char_dict)
286
345
 
287
- word_elements: List[TextElement] = []
288
346
  # Process each line separately with direction detection
289
347
  for line_chars in lines:
290
348
  if not line_chars:
@@ -441,7 +499,8 @@ class ElementManager:
441
499
  except Exception:
442
500
  w._obj["highlight_color"] = dominant_color
443
501
 
444
- generated_words = word_elements
502
+ # generated_words defaults to empty list if text loading is disabled
503
+ generated_words = word_elements if self._load_text else []
445
504
  logger.debug(
446
505
  f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
447
506
  )
natural_pdf/core/page.py CHANGED
@@ -101,7 +101,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
101
101
  with improved selection, navigation, extraction, and question-answering capabilities.
102
102
  """
103
103
 
104
- def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None):
104
+ def __init__(self, page: "pdfplumber.page.Page", parent: "PDF", index: int, font_attrs=None, load_text: bool = True):
105
105
  """
106
106
  Initialize a page wrapper.
107
107
 
@@ -110,10 +110,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
110
110
  parent: Parent PDF object
111
111
  index: Index of this page in the PDF (0-based)
112
112
  font_attrs: Font attributes to consider when grouping characters into words.
113
+ load_text: Whether to load text elements from the PDF (default: True).
113
114
  """
114
115
  self._page = page
115
116
  self._parent = parent
116
117
  self._index = index
118
+ self._load_text = load_text
117
119
  self._text_styles = None # Lazy-loaded text style analyzer results
118
120
  self._exclusions = [] # List to store exclusion functions/regions
119
121
  self._skew_angle: Optional[float] = None # Stores detected skew angle
@@ -128,8 +130,15 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
128
130
  "named": {}, # Named regions (name -> region)
129
131
  }
130
132
 
133
+ # -------------------------------------------------------------
134
+ # Page-scoped configuration begins as a shallow copy of the parent
135
+ # PDF-level configuration so that auto-computed tolerances or other
136
+ # page-specific values do not overwrite siblings.
137
+ # -------------------------------------------------------------
138
+ self._config = dict(getattr(self._parent, "_config", {}))
139
+
131
140
  # Initialize ElementManager, passing font_attrs
132
- self._element_mgr = ElementManager(self, font_attrs=font_attrs)
141
+ self._element_mgr = ElementManager(self, font_attrs=font_attrs, load_text=self._load_text)
133
142
  # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
134
143
  # --- NEW --- Central registry for analysis results
135
144
  self.analyses: Dict[str, Any] = {}
@@ -1153,10 +1162,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1153
1162
  # 5. Generate Text Layout using Utility
1154
1163
  # Pass page bbox as layout context
1155
1164
  page_bbox = (0, 0, self.width, self.height)
1165
+ # Merge PDF-level default tolerances if caller did not override
1166
+ merged_kwargs = dict(kwargs)
1167
+ tol_keys = ["x_tolerance", "x_tolerance_ratio", "y_tolerance"]
1168
+ for k in tol_keys:
1169
+ if k not in merged_kwargs:
1170
+ if k in self._config:
1171
+ merged_kwargs[k] = self._config[k]
1172
+ elif k in getattr(self._parent, "_config", {}):
1173
+ merged_kwargs[k] = self._parent._config[k]
1174
+
1156
1175
  result = generate_text_layout(
1157
1176
  char_dicts=filtered_chars,
1158
1177
  layout_context_bbox=page_bbox,
1159
- user_kwargs=kwargs, # Pass original user kwargs
1178
+ user_kwargs=merged_kwargs,
1160
1179
  )
1161
1180
 
1162
1181
  # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
@@ -1356,6 +1375,37 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1356
1375
 
1357
1376
  # Use the selected method
1358
1377
  if effective_method == "pdfplumber":
1378
+ # ---------------------------------------------------------
1379
+ # Inject auto-computed or user-specified text tolerances so
1380
+ # pdfplumber uses the same numbers we used for word grouping
1381
+ # whenever the table algorithm relies on word positions.
1382
+ # ---------------------------------------------------------
1383
+ if "text" in (
1384
+ table_settings.get("vertical_strategy"),
1385
+ table_settings.get("horizontal_strategy"),
1386
+ ):
1387
+ print("SETTING IT UP")
1388
+ pdf_cfg = getattr(self, "_config", getattr(self._parent, "_config", {}))
1389
+ if "text_x_tolerance" not in table_settings and "x_tolerance" not in table_settings:
1390
+ x_tol = pdf_cfg.get("x_tolerance")
1391
+ if x_tol is not None:
1392
+ table_settings.setdefault("text_x_tolerance", x_tol)
1393
+ if "text_y_tolerance" not in table_settings and "y_tolerance" not in table_settings:
1394
+ y_tol = pdf_cfg.get("y_tolerance")
1395
+ if y_tol is not None:
1396
+ table_settings.setdefault("text_y_tolerance", y_tol)
1397
+
1398
+ # pdfplumber's text strategy benefits from a tight snap tolerance.
1399
+ if "snap_tolerance" not in table_settings and "snap_x_tolerance" not in table_settings:
1400
+ # Derive from y_tol if available, else default 1
1401
+ snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
1402
+ table_settings.setdefault("snap_tolerance", snap)
1403
+ if "join_tolerance" not in table_settings and "join_x_tolerance" not in table_settings:
1404
+ join = table_settings.get("snap_tolerance", 1)
1405
+ table_settings.setdefault("join_tolerance", join)
1406
+ table_settings.setdefault("join_x_tolerance", join)
1407
+ table_settings.setdefault("join_y_tolerance", join)
1408
+
1359
1409
  return self._page.extract_tables(table_settings)
1360
1410
  else:
1361
1411
  raise ValueError(
@@ -2950,6 +3000,29 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2950
3000
  """
2951
3001
  return self.find_all('*').inspect(limit=limit)
2952
3002
 
def remove_text_layer(self) -> "Page":
    """
    Discard every word and character element held for this page.

    The current number of words and characters is read from the element
    manager first, then the manager's ``"words"`` and ``"chars"`` entries
    are replaced with fresh empty lists. Both steps are logged at INFO
    level.

    Returns:
        This page, so that calls can be chained.
    """
    logger.info(f"Page {self.number}: Removing all text elements...")

    # Take the counts before anything is emptied so the summary log is accurate.
    word_total = len(self._element_mgr.words)
    char_total = len(self._element_mgr.chars)

    # Each key gets its own new list (never a shared one).
    for bucket in ("words", "chars"):
        self._element_mgr._elements[bucket] = []

    logger.info(f"Page {self.number}: Removed {word_total} words and {char_total} characters")
    return self
3025
+
2953
3026
  @property
2954
3027
  def lines(self) -> List[Any]:
2955
3028
  """Get all line elements on this page."""
natural_pdf/core/pdf.py CHANGED
@@ -108,12 +108,13 @@ class _LazyPageList(Sequence):
108
108
  also supported and will materialise pages on demand.
109
109
  """
110
110
 
111
- def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
111
+ def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True):
112
112
  self._parent_pdf = parent_pdf
113
113
  self._plumber_pdf = plumber_pdf
114
114
  self._font_attrs = font_attrs
115
115
  # One slot per pdfplumber page – initially all None
116
116
  self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
117
+ self._load_text = load_text
117
118
 
118
119
  # Internal helper -----------------------------------------------------
119
120
  def _create_page(self, index: int) -> "Page":
@@ -123,7 +124,7 @@ class _LazyPageList(Sequence):
123
124
  from natural_pdf.core.page import Page
124
125
 
125
126
  plumber_page = self._plumber_pdf.pages[index]
126
- cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
127
+ cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs, load_text=self._load_text)
127
128
  self._cache[index] = cached
128
129
  return cached
129
130
 
@@ -168,6 +169,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
168
169
  reading_order: bool = True,
169
170
  font_attrs: Optional[List[str]] = None,
170
171
  keep_spaces: bool = True,
172
+ text_tolerance: Optional[dict] = None,
173
+ auto_text_tolerance: bool = True,
174
+ text_layer: bool = True,
171
175
  ):
172
176
  """
173
177
  Initialize the enhanced PDF object.
@@ -177,11 +181,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
177
181
  reading_order: Whether to use natural reading order
178
182
  font_attrs: Font attributes for grouping characters into words
179
183
  keep_spaces: Whether to include spaces in word elements
184
+ text_tolerance: PDFplumber-style tolerance settings
185
+ auto_text_tolerance: Whether to automatically scale text tolerance
186
+ text_layer: Whether to keep the existing text layer from the PDF (default: True).
187
+ If False, removes all existing text elements during initialization.
180
188
  """
181
189
  self._original_path_or_stream = path_or_url_or_stream
182
190
  self._temp_file = None
183
191
  self._resolved_path = None
184
192
  self._is_stream = False
193
+ self._text_layer = text_layer
185
194
  stream_to_open = None
186
195
 
187
196
  if hasattr(path_or_url_or_stream, "read"): # Check if it's file-like
@@ -253,7 +262,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
253
262
  self._manager_registry = {}
254
263
 
255
264
  # Lazily instantiate pages only when accessed
256
- self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
265
+ self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer)
257
266
 
258
267
  self._element_cache = {}
259
268
  self._exclusions = []
@@ -263,6 +272,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
263
272
 
264
273
  self._initialize_managers()
265
274
  self._initialize_highlighter()
275
+
276
+ # Remove text layer if requested
277
+ if not self._text_layer:
278
+ logger.info("Removing text layer as requested (text_layer=False)")
279
+ # Text layer is not loaded when text_layer=False, so no need to remove
280
+ pass
281
+
266
282
  # Analysis results accessed via self.analyses property (see below)
267
283
 
268
284
  # --- Automatic cleanup when object is garbage-collected ---
@@ -274,6 +290,24 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
274
290
  getattr(self, "_is_stream", False),
275
291
  )
276
292
 
293
+ # --- Text tolerance settings ------------------------------------
294
+ # Users can pass pdfplumber-style keys (x_tolerance, x_tolerance_ratio,
295
+ # y_tolerance, etc.) via *text_tolerance*. We also keep a flag that
296
+ # enables automatic tolerance scaling when explicit values are not
297
+ # supplied.
298
+ self._config["auto_text_tolerance"] = bool(auto_text_tolerance)
299
+ if text_tolerance:
300
+ # Only copy recognised primitives (numbers / None); ignore junk.
301
+ allowed = {
302
+ "x_tolerance",
303
+ "x_tolerance_ratio",
304
+ "y_tolerance",
305
+ "keep_blank_chars", # passthrough convenience
306
+ }
307
+ for k, v in text_tolerance.items():
308
+ if k in allowed:
309
+ self._config[k] = v
310
+
277
311
  def _initialize_managers(self):
278
312
  """Set up manager factories for lazy instantiation."""
279
313
  # Store factories/classes for each manager key
@@ -1441,6 +1475,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1441
1475
  reading_order=self._reading_order,
1442
1476
  font_attrs=self._font_attrs,
1443
1477
  keep_spaces=self._config.get("keep_spaces", True),
1478
+ text_layer=self._text_layer,
1444
1479
  )
1445
1480
  return new_pdf
1446
1481
  except Exception as e:
@@ -1901,7 +1901,68 @@ class ElementCollection(
1901
1901
  )
1902
1902
  )
1903
1903
 
1904
+ # ------------------------------------------------------------------
1905
+ # NEW METHOD: apply_ocr for collections (supports custom function)
1906
+ # ------------------------------------------------------------------
def apply_ocr(
    self,
    *,
    function: Optional[Callable[["Region"], Optional[str]]] = None,
    show_progress: bool = True,
    **kwargs,
) -> "ElementCollection":
    """Run ``apply_ocr`` on each element of this collection.

    The work is delegated to every element's own ``apply_ocr`` method; this
    wrapper only decides which keyword arguments are forwarded:

    * If a callable is supplied through ``function`` (or through the legacy
      ``ocr_function`` keyword when ``function`` is not given), it is passed
      to each element as ``function=...`` together with the remaining
      ``kwargs``.
    * Otherwise only ``kwargs`` are forwarded, unchanged.

    Elements that have no ``apply_ocr`` attribute are left untouched and a
    warning is logged for each of them.

    Parameters
    ----------
    function : callable, optional
        Custom OCR callable handed to each element's ``apply_ocr``.
    show_progress : bool, default True
        Forwarded to this collection's ``apply`` helper.
    **kwargs
        Extra keyword arguments forwarded to each element's ``apply_ocr``.

    Returns
    -------
    ElementCollection
        This collection, for fluent chaining.
    """
    # Legacy alias: only consulted (and consumed) when ``function`` is absent.
    if function is None:
        function = kwargs.pop("ocr_function", None)

    def _process(el):
        # Guard clause: skip anything that cannot be OCR'd, but keep it in place.
        if not hasattr(el, "apply_ocr"):
            logger.warning(
                f"Element of type {type(el).__name__} does not support apply_ocr. Skipping."
            )
            return el
        if function is None:
            return el.apply_ocr(**kwargs)
        return el.apply_ocr(function=function, **kwargs)

    # The collection's apply helper drives the iteration.
    self.apply(_process, show_progress=show_progress)
    return self
1904
1964
 
1965
+ # ------------------------------------------------------------------
1905
1966
 
1906
1967
 
1907
1968
  class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):