natural-pdf 0.1.34__py3-none-any.whl → 0.1.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +16 -4
- natural_pdf/analyzers/guides.py +85 -16
- natural_pdf/core/page.py +7 -1
- {natural_pdf-0.1.34.dist-info → natural_pdf-0.1.36.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.34.dist-info → natural_pdf-0.1.36.dist-info}/RECORD +9 -9
- {natural_pdf-0.1.34.dist-info → natural_pdf-0.1.36.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.34.dist-info → natural_pdf-0.1.36.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.34.dist-info → natural_pdf-0.1.36.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.34.dist-info → natural_pdf-0.1.36.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,29 @@
|
|
2
2
|
Analyzers for natural_pdf.
|
3
3
|
"""
|
4
4
|
|
5
|
+
# Import these directly as they don't depend on Region
|
5
6
|
from natural_pdf.analyzers.guides import Guides
|
6
|
-
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
7
|
-
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
8
|
-
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
9
7
|
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
10
8
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
11
9
|
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
12
10
|
|
11
|
+
# Lazy imports to avoid circular dependencies
|
12
|
+
# These will be imported when actually accessed
|
13
|
+
def __getattr__(name):
|
14
|
+
if name == "LayoutAnalyzer":
|
15
|
+
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
16
|
+
return LayoutAnalyzer
|
17
|
+
elif name == "LayoutManager":
|
18
|
+
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
19
|
+
return LayoutManager
|
20
|
+
elif name == "LayoutOptions":
|
21
|
+
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
22
|
+
return LayoutOptions
|
23
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
24
|
+
|
13
25
|
__all__ = [
|
14
26
|
"LayoutAnalyzer",
|
15
|
-
"LayoutManager",
|
27
|
+
"LayoutManager",
|
16
28
|
"LayoutOptions",
|
17
29
|
"ShapeDetectionMixin",
|
18
30
|
"TextStyleOptions",
|
natural_pdf/analyzers/guides.py
CHANGED
@@ -119,6 +119,8 @@ class GuidesList(UserList):
|
|
119
119
|
align: Literal["left", "right", "center", "between"] = "left",
|
120
120
|
outer: bool = True,
|
121
121
|
tolerance: float = 5,
|
122
|
+
*,
|
123
|
+
append: bool = False,
|
122
124
|
) -> "Guides":
|
123
125
|
"""
|
124
126
|
Create guides from content markers and add to this axis.
|
@@ -154,11 +156,17 @@ class GuidesList(UserList):
|
|
154
156
|
tolerance=tolerance,
|
155
157
|
)
|
156
158
|
|
157
|
-
#
|
158
|
-
if
|
159
|
-
self.
|
159
|
+
# Replace or append based on parameter
|
160
|
+
if append:
|
161
|
+
if self._axis == "vertical":
|
162
|
+
self.extend(new_guides.vertical)
|
163
|
+
else:
|
164
|
+
self.extend(new_guides.horizontal)
|
160
165
|
else:
|
161
|
-
self.
|
166
|
+
if self._axis == "vertical":
|
167
|
+
self.data = list(new_guides.vertical)
|
168
|
+
else:
|
169
|
+
self.data = list(new_guides.horizontal)
|
162
170
|
|
163
171
|
# Remove duplicates while preserving order
|
164
172
|
seen = set()
|
@@ -183,6 +191,7 @@ class GuidesList(UserList):
|
|
183
191
|
*,
|
184
192
|
n: Optional[int] = None,
|
185
193
|
min_gap: Optional[int] = None,
|
194
|
+
append: bool = False,
|
186
195
|
**detect_kwargs,
|
187
196
|
) -> "Guides":
|
188
197
|
"""
|
@@ -241,11 +250,17 @@ class GuidesList(UserList):
|
|
241
250
|
**detect_kwargs,
|
242
251
|
)
|
243
252
|
|
244
|
-
#
|
245
|
-
if
|
246
|
-
self.
|
253
|
+
# Replace or append based on parameter
|
254
|
+
if append:
|
255
|
+
if self._axis == "vertical":
|
256
|
+
self.extend(new_guides.vertical)
|
257
|
+
else:
|
258
|
+
self.extend(new_guides.horizontal)
|
247
259
|
else:
|
248
|
-
self.
|
260
|
+
if self._axis == "vertical":
|
261
|
+
self.data = list(new_guides.vertical)
|
262
|
+
else:
|
263
|
+
self.data = list(new_guides.horizontal)
|
249
264
|
|
250
265
|
# Remove duplicates
|
251
266
|
seen = set()
|
@@ -259,7 +274,8 @@ class GuidesList(UserList):
|
|
259
274
|
return self._parent
|
260
275
|
|
261
276
|
def from_whitespace(
|
262
|
-
self, obj: Optional[Union["Page", "Region"]] = None, min_gap: float = 10
|
277
|
+
self, obj: Optional[Union["Page", "Region"]] = None, min_gap: float = 10,
|
278
|
+
*, append: bool = False
|
263
279
|
) -> "Guides":
|
264
280
|
"""
|
265
281
|
Create guides from whitespace gaps.
|
@@ -278,11 +294,17 @@ class GuidesList(UserList):
|
|
278
294
|
# Create guides for this axis
|
279
295
|
new_guides = Guides.from_whitespace(obj=target_obj, axis=self._axis, min_gap=min_gap)
|
280
296
|
|
281
|
-
#
|
282
|
-
if
|
283
|
-
self.
|
297
|
+
# Replace or append
|
298
|
+
if append:
|
299
|
+
if self._axis == "vertical":
|
300
|
+
self.extend(new_guides.vertical)
|
301
|
+
else:
|
302
|
+
self.extend(new_guides.horizontal)
|
284
303
|
else:
|
285
|
-
self.
|
304
|
+
if self._axis == "vertical":
|
305
|
+
self.data = list(new_guides.vertical)
|
306
|
+
else:
|
307
|
+
self.data = list(new_guides.horizontal)
|
286
308
|
|
287
309
|
# Remove duplicates
|
288
310
|
seen = set()
|
@@ -313,11 +335,11 @@ class GuidesList(UserList):
|
|
313
335
|
# Create guides using divide
|
314
336
|
new_guides = Guides.divide(obj=target_obj, n=n, axis=self._axis)
|
315
337
|
|
316
|
-
#
|
338
|
+
# Replace existing guides instead of extending (no append option here)
|
317
339
|
if self._axis == "vertical":
|
318
|
-
self.
|
340
|
+
self.data = list(new_guides.vertical)
|
319
341
|
else:
|
320
|
-
self.
|
342
|
+
self.data = list(new_guides.horizontal)
|
321
343
|
|
322
344
|
# Remove duplicates
|
323
345
|
seen = set()
|
@@ -1982,6 +2004,53 @@ class Guides:
|
|
1982
2004
|
row_boundaries = sorted(list(set(row_boundaries)))
|
1983
2005
|
col_boundaries = sorted(list(set(col_boundaries)))
|
1984
2006
|
|
2007
|
+
# ------------------------------------------------------------------
|
2008
|
+
# Clean-up: remove any previously created grid regions (table, rows,
|
2009
|
+
# columns, cells) that were generated by the same `source` label and
|
2010
|
+
# overlap the area we are about to populate. This prevents the page's
|
2011
|
+
# `ElementManager` from accumulating stale/duplicate regions when the
|
2012
|
+
# user rebuilds the grid multiple times.
|
2013
|
+
# ------------------------------------------------------------------
|
2014
|
+
try:
|
2015
|
+
# Bounding box of the grid we are about to create
|
2016
|
+
if row_boundaries and col_boundaries:
|
2017
|
+
grid_bbox = (
|
2018
|
+
col_boundaries[0], # x0
|
2019
|
+
row_boundaries[0], # top
|
2020
|
+
col_boundaries[-1], # x1
|
2021
|
+
row_boundaries[-1], # bottom
|
2022
|
+
)
|
2023
|
+
|
2024
|
+
def _bbox_overlap(b1, b2):
|
2025
|
+
"""Return True if two (x0, top, x1, bottom) bboxes overlap."""
|
2026
|
+
return not (
|
2027
|
+
b1[2] <= b2[0] # b1 right ≤ b2 left
|
2028
|
+
or b1[0] >= b2[2] # b1 left ≥ b2 right
|
2029
|
+
or b1[3] <= b2[1] # b1 bottom ≤ b2 top
|
2030
|
+
or b1[1] >= b2[3] # b1 top ≥ b2 bottom
|
2031
|
+
)
|
2032
|
+
|
2033
|
+
# Collect existing regions that match the source & region types
|
2034
|
+
regions_to_remove = [
|
2035
|
+
r
|
2036
|
+
for r in element_manager.regions
|
2037
|
+
if getattr(r, "source", None) == source
|
2038
|
+
and getattr(r, "region_type", None)
|
2039
|
+
in {"table", "table_row", "table_column", "table_cell"}
|
2040
|
+
and hasattr(r, "bbox")
|
2041
|
+
and _bbox_overlap(r.bbox, grid_bbox)
|
2042
|
+
]
|
2043
|
+
|
2044
|
+
for r in regions_to_remove:
|
2045
|
+
element_manager.remove_element(r, element_type="regions")
|
2046
|
+
|
2047
|
+
if regions_to_remove:
|
2048
|
+
logger.debug(
|
2049
|
+
f"Removed {len(regions_to_remove)} existing grid region(s) prior to rebuild"
|
2050
|
+
)
|
2051
|
+
except Exception as cleanup_err: # pragma: no cover – cleanup must never crash
|
2052
|
+
logger.warning(f"Grid cleanup failed: {cleanup_err}")
|
2053
|
+
|
1985
2054
|
logger.debug(
|
1986
2055
|
f"Building grid with {len(row_boundaries)} row and {len(col_boundaries)} col boundaries"
|
1987
2056
|
)
|
natural_pdf/core/page.py
CHANGED
@@ -1780,8 +1780,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1780
1780
|
# Apply global options as defaults, but allow explicit parameters to override
|
1781
1781
|
import natural_pdf
|
1782
1782
|
|
1783
|
+
# Determine if this is likely a computational use (OCR, analysis, etc.)
|
1784
|
+
# If resolution is explicitly provided, width is not, and highlights are disabled, assume computational use
|
1785
|
+
# and don't apply global display width settings
|
1786
|
+
is_computational_use = (resolution is not None and width is None and
|
1787
|
+
kwargs.get('include_highlights', True) is False)
|
1788
|
+
|
1783
1789
|
# Use global options if parameters are not explicitly set
|
1784
|
-
if width is None:
|
1790
|
+
if width is None and not is_computational_use:
|
1785
1791
|
width = natural_pdf.options.image.width
|
1786
1792
|
if resolution is None:
|
1787
1793
|
if natural_pdf.options.image.resolution is not None:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.34
|
3
|
+
Version: 0.1.36
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
|
|
11
11
|
Requires-Python: >=3.9
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
|
+
Requires-Dist: scikit-learn
|
14
15
|
Requires-Dist: markdown
|
15
16
|
Requires-Dist: pandas
|
16
17
|
Requires-Dist: pdfplumber
|
@@ -1,7 +1,7 @@
|
|
1
1
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
2
2
|
natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
|
3
|
-
natural_pdf/analyzers/__init__.py,sha256=
|
4
|
-
natural_pdf/analyzers/guides.py,sha256=
|
3
|
+
natural_pdf/analyzers/__init__.py,sha256=M5oD4oEsIBWrzgTaeg4uBYrC4OPd7Mp7tz8ootKN_l8,1134
|
4
|
+
natural_pdf/analyzers/guides.py,sha256=5Lqc51trtqmLvjxLjDS__mgeyviRrjV-CIIT69RmEt4,92327
|
5
5
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=Ef1o73QYVXQ2QcQMM_W9XRwY6vaIQHgxzD7etJ6LbiM,62820
|
6
6
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
7
7
|
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
@@ -27,7 +27,7 @@ natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw
|
|
27
27
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
28
|
natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
|
29
29
|
natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
|
30
|
-
natural_pdf/core/page.py,sha256=
|
30
|
+
natural_pdf/core/page.py,sha256=Jw5SDshnHesqoC4yhtKEokeV08wMHuWZyWs5kDMOAjo,133204
|
31
31
|
natural_pdf/core/pdf.py,sha256=9t8Ks-AZp3yjH_lRkFZAyIkjUQoCTRbmXK7vSi1e4UE,92415
|
32
32
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
33
33
|
natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
|
@@ -97,7 +97,7 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
|
|
97
97
|
natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
|
98
98
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
99
99
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
100
|
-
natural_pdf-0.1.34.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
100
|
+
natural_pdf-0.1.36.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
101
101
|
optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
|
102
102
|
optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
|
103
103
|
optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
|
@@ -114,8 +114,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
114
114
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
115
115
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
116
116
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
117
|
-
natural_pdf-0.1.
|
118
|
-
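natural_pdf-0.1.34.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
119
|
-
natural_pdf-0.1.34.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
120
|
-
natural_pdf-0.1.34.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
121
|
-
natural_pdf-0.1.34.dist-info/RECORD,,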
|
117
|
+
natural_pdf-0.1.36.dist-info/METADATA,sha256=bAjoKpKPZW76v_QVBA0HgyXvA9ZP2uFrrD50mJq051M,6739
|
118
|
+
natural_pdf-0.1.36.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
119
|
+
natural_pdf-0.1.36.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
120
|
+
natural_pdf-0.1.36.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
121
|
+
natural_pdf-0.1.36.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|