natural-pdf 0.1.34__py3-none-any.whl → 0.1.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,17 +2,29 @@
2
2
  Analyzers for natural_pdf.
3
3
  """
4
4
 
5
+ # Import these directly as they don't depend on Region
5
6
  from natural_pdf.analyzers.guides import Guides
6
- from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
7
- from natural_pdf.analyzers.layout.layout_manager import LayoutManager
8
- from natural_pdf.analyzers.layout.layout_options import LayoutOptions
9
7
  from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
10
8
  from natural_pdf.analyzers.text_options import TextStyleOptions
11
9
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
12
10
 
11
+ # Lazy imports to avoid circular dependencies
12
+ # These will be imported when actually accessed
13
+ def __getattr__(name):
14
+ if name == "LayoutAnalyzer":
15
+ from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
16
+ return LayoutAnalyzer
17
+ elif name == "LayoutManager":
18
+ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
19
+ return LayoutManager
20
+ elif name == "LayoutOptions":
21
+ from natural_pdf.analyzers.layout.layout_options import LayoutOptions
22
+ return LayoutOptions
23
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
24
+
13
25
  __all__ = [
14
26
  "LayoutAnalyzer",
15
- "LayoutManager",
27
+ "LayoutManager",
16
28
  "LayoutOptions",
17
29
  "ShapeDetectionMixin",
18
30
  "TextStyleOptions",
@@ -119,6 +119,8 @@ class GuidesList(UserList):
119
119
  align: Literal["left", "right", "center", "between"] = "left",
120
120
  outer: bool = True,
121
121
  tolerance: float = 5,
122
+ *,
123
+ append: bool = False,
122
124
  ) -> "Guides":
123
125
  """
124
126
  Create guides from content markers and add to this axis.
@@ -154,11 +156,17 @@ class GuidesList(UserList):
154
156
  tolerance=tolerance,
155
157
  )
156
158
 
157
- # Add to our list
158
- if self._axis == "vertical":
159
- self.extend(new_guides.vertical)
159
+ # Replace or append based on parameter
160
+ if append:
161
+ if self._axis == "vertical":
162
+ self.extend(new_guides.vertical)
163
+ else:
164
+ self.extend(new_guides.horizontal)
160
165
  else:
161
- self.extend(new_guides.horizontal)
166
+ if self._axis == "vertical":
167
+ self.data = list(new_guides.vertical)
168
+ else:
169
+ self.data = list(new_guides.horizontal)
162
170
 
163
171
  # Remove duplicates while preserving order
164
172
  seen = set()
@@ -183,6 +191,7 @@ class GuidesList(UserList):
183
191
  *,
184
192
  n: Optional[int] = None,
185
193
  min_gap: Optional[int] = None,
194
+ append: bool = False,
186
195
  **detect_kwargs,
187
196
  ) -> "Guides":
188
197
  """
@@ -241,11 +250,17 @@ class GuidesList(UserList):
241
250
  **detect_kwargs,
242
251
  )
243
252
 
244
- # Add to our list
245
- if self._axis == "vertical":
246
- self.extend(new_guides.vertical)
253
+ # Replace or append based on parameter
254
+ if append:
255
+ if self._axis == "vertical":
256
+ self.extend(new_guides.vertical)
257
+ else:
258
+ self.extend(new_guides.horizontal)
247
259
  else:
248
- self.extend(new_guides.horizontal)
260
+ if self._axis == "vertical":
261
+ self.data = list(new_guides.vertical)
262
+ else:
263
+ self.data = list(new_guides.horizontal)
249
264
 
250
265
  # Remove duplicates
251
266
  seen = set()
@@ -259,7 +274,8 @@ class GuidesList(UserList):
259
274
  return self._parent
260
275
 
261
276
  def from_whitespace(
262
- self, obj: Optional[Union["Page", "Region"]] = None, min_gap: float = 10
277
+ self, obj: Optional[Union["Page", "Region"]] = None, min_gap: float = 10,
278
+ *, append: bool = False
263
279
  ) -> "Guides":
264
280
  """
265
281
  Create guides from whitespace gaps.
@@ -278,11 +294,17 @@ class GuidesList(UserList):
278
294
  # Create guides for this axis
279
295
  new_guides = Guides.from_whitespace(obj=target_obj, axis=self._axis, min_gap=min_gap)
280
296
 
281
- # Add to our list
282
- if self._axis == "vertical":
283
- self.extend(new_guides.vertical)
297
+ # Replace or append
298
+ if append:
299
+ if self._axis == "vertical":
300
+ self.extend(new_guides.vertical)
301
+ else:
302
+ self.extend(new_guides.horizontal)
284
303
  else:
285
- self.extend(new_guides.horizontal)
304
+ if self._axis == "vertical":
305
+ self.data = list(new_guides.vertical)
306
+ else:
307
+ self.data = list(new_guides.horizontal)
286
308
 
287
309
  # Remove duplicates
288
310
  seen = set()
@@ -313,11 +335,11 @@ class GuidesList(UserList):
313
335
  # Create guides using divide
314
336
  new_guides = Guides.divide(obj=target_obj, n=n, axis=self._axis)
315
337
 
316
- # Add to our list
338
+ # Replace existing guides instead of extending (no append option here)
317
339
  if self._axis == "vertical":
318
- self.extend(new_guides.vertical)
340
+ self.data = list(new_guides.vertical)
319
341
  else:
320
- self.extend(new_guides.horizontal)
342
+ self.data = list(new_guides.horizontal)
321
343
 
322
344
  # Remove duplicates
323
345
  seen = set()
@@ -1982,6 +2004,53 @@ class Guides:
1982
2004
  row_boundaries = sorted(list(set(row_boundaries)))
1983
2005
  col_boundaries = sorted(list(set(col_boundaries)))
1984
2006
 
2007
+ # ------------------------------------------------------------------
2008
+ # Clean-up: remove any previously created grid regions (table, rows,
2009
+ # columns, cells) that were generated by the same `source` label and
2010
+ # overlap the area we are about to populate. This prevents the page's
2011
+ # `ElementManager` from accumulating stale/duplicate regions when the
2012
+ # user rebuilds the grid multiple times.
2013
+ # ------------------------------------------------------------------
2014
+ try:
2015
+ # Bounding box of the grid we are about to create
2016
+ if row_boundaries and col_boundaries:
2017
+ grid_bbox = (
2018
+ col_boundaries[0], # x0
2019
+ row_boundaries[0], # top
2020
+ col_boundaries[-1], # x1
2021
+ row_boundaries[-1], # bottom
2022
+ )
2023
+
2024
+ def _bbox_overlap(b1, b2):
2025
+ """Return True if two (x0, top, x1, bottom) bboxes overlap."""
2026
+ return not (
2027
+ b1[2] <= b2[0] # b1 right ≤ b2 left
2028
+ or b1[0] >= b2[2] # b1 left ≥ b2 right
2029
+ or b1[3] <= b2[1] # b1 bottom ≤ b2 top
2030
+ or b1[1] >= b2[3] # b1 top ≥ b2 bottom
2031
+ )
2032
+
2033
+ # Collect existing regions that match the source & region types
2034
+ regions_to_remove = [
2035
+ r
2036
+ for r in element_manager.regions
2037
+ if getattr(r, "source", None) == source
2038
+ and getattr(r, "region_type", None)
2039
+ in {"table", "table_row", "table_column", "table_cell"}
2040
+ and hasattr(r, "bbox")
2041
+ and _bbox_overlap(r.bbox, grid_bbox)
2042
+ ]
2043
+
2044
+ for r in regions_to_remove:
2045
+ element_manager.remove_element(r, element_type="regions")
2046
+
2047
+ if regions_to_remove:
2048
+ logger.debug(
2049
+ f"Removed {len(regions_to_remove)} existing grid region(s) prior to rebuild"
2050
+ )
2051
+ except Exception as cleanup_err: # pragma: no cover – cleanup must never crash
2052
+ logger.warning(f"Grid cleanup failed: {cleanup_err}")
2053
+
1985
2054
  logger.debug(
1986
2055
  f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries"
1987
2056
  )
natural_pdf/core/page.py CHANGED
@@ -1780,8 +1780,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1780
1780
  # Apply global options as defaults, but allow explicit parameters to override
1781
1781
  import natural_pdf
1782
1782
 
1783
+ # Determine if this is likely a computational use (OCR, analysis, etc.)
1784
+ # If resolution is explicitly provided but width is not, assume computational use
1785
+ # and don't apply global display width settings
1786
+ is_computational_use = (resolution is not None and width is None and
1787
+ kwargs.get('include_highlights', True) is False)
1788
+
1783
1789
  # Use global options if parameters are not explicitly set
1784
- if width is None:
1790
+ if width is None and not is_computational_use:
1785
1791
  width = natural_pdf.options.image.width
1786
1792
  if resolution is None:
1787
1793
  if natural_pdf.options.image.resolution is not None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.34
3
+ Version: 0.1.36
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
+ Requires-Dist: scikit-learn
14
15
  Requires-Dist: markdown
15
16
  Requires-Dist: pandas
16
17
  Requires-Dist: pdfplumber
@@ -1,7 +1,7 @@
1
1
  natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
2
2
  natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
3
- natural_pdf/analyzers/__init__.py,sha256=IPu_PMKFviDeEIeiC8_2KdeqH7z8OQ6q2v980hkByFY,672
4
- natural_pdf/analyzers/guides.py,sha256=76gEVJ3EGeIBq8KP-BZtEn__qHQfrQ-DTwenup54vI4,89165
3
+ natural_pdf/analyzers/__init__.py,sha256=M5oD4oEsIBWrzgTaeg4uBYrC4OPd7Mp7tz8ootKN_l8,1134
4
+ natural_pdf/analyzers/guides.py,sha256=5Lqc51trtqmLvjxLjDS__mgeyviRrjV-CIIT69RmEt4,92327
5
5
  natural_pdf/analyzers/shape_detection_mixin.py,sha256=Ef1o73QYVXQ2QcQMM_W9XRwY6vaIQHgxzD7etJ6LbiM,62820
6
6
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
7
7
  natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
@@ -27,7 +27,7 @@ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw
27
27
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
28
  natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
29
29
  natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
30
- natural_pdf/core/page.py,sha256=obo8Fkomz39cvr-s8a2CAyhAFmYp-3-jENotfhxcc98,132790
30
+ natural_pdf/core/page.py,sha256=Jw5SDshnHesqoC4yhtKEokeV08wMHuWZyWs5kDMOAjo,133204
31
31
  natural_pdf/core/pdf.py,sha256=9t8Ks-AZp3yjH_lRkFZAyIkjUQoCTRbmXK7vSi1e4UE,92415
32
32
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
33
33
  natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
@@ -97,7 +97,7 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
97
97
  natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
98
98
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
99
99
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
100
- natural_pdf-0.1.34.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
100
+ natural_pdf-0.1.36.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
101
101
  optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
102
102
  optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
103
103
  optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
@@ -114,8 +114,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
114
114
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
115
115
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
116
116
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
117
- natural_pdf-0.1.34.dist-info/METADATA,sha256=zFL9gCQDfqhmApuwRXOcaOCENRPmeJ0C9Lt4uXRzKG0,6711
118
- natural_pdf-0.1.34.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
119
- natural_pdf-0.1.34.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
120
- natural_pdf-0.1.34.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
121
- natural_pdf-0.1.34.dist-info/RECORD,,
117
+ natural_pdf-0.1.36.dist-info/METADATA,sha256=bAjoKpKPZW76v_QVBA0HgyXvA9ZP2uFrrD50mJq051M,6739
118
+ natural_pdf-0.1.36.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
119
+ natural_pdf-0.1.36.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
120
+ natural_pdf-0.1.36.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
121
+ natural_pdf-0.1.36.dist-info/RECORD,,