natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +751 -607
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +120 -23
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
- natural_pdf-0.1.35.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,16 @@
|
|
1
|
-
"""
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
1
|
+
"""Element Manager for natural-pdf.
|
2
|
+
|
3
|
+
This module handles the loading, creation, and management of PDF elements like
|
4
|
+
characters, words, rectangles, lines, and images extracted from a page. The
|
5
|
+
ElementManager class serves as the central coordinator for element lifecycle
|
6
|
+
management and provides enhanced word extraction capabilities.
|
7
|
+
|
8
|
+
The module includes:
|
9
|
+
- Element creation and caching for performance
|
10
|
+
- Custom word extraction that respects font boundaries
|
11
|
+
- OCR coordinate transformation and integration
|
12
|
+
- Text decoration detection (underline, strikethrough, highlights)
|
13
|
+
- Performance optimizations for bulk text processing
|
6
14
|
"""
|
7
15
|
|
8
16
|
import logging
|
@@ -13,10 +21,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
13
21
|
|
14
22
|
from pdfplumber.utils.text import WordExtractor
|
15
23
|
|
24
|
+
from natural_pdf.elements.image import ImageElement
|
16
25
|
from natural_pdf.elements.line import LineElement
|
17
26
|
from natural_pdf.elements.rect import RectangleElement
|
18
27
|
from natural_pdf.elements.text import TextElement
|
19
|
-
from natural_pdf.elements.image import ImageElement
|
20
28
|
|
21
29
|
logger = logging.getLogger(__name__)
|
22
30
|
|
@@ -25,8 +33,8 @@ logger = logging.getLogger(__name__)
|
|
25
33
|
# ------------------------------------------------------------------
|
26
34
|
|
27
35
|
STRIKE_DEFAULTS = {
|
28
|
-
"thickness_tol": 1.5,
|
29
|
-
"horiz_tol": 1.0,
|
36
|
+
"thickness_tol": 1.5, # pt ; max height of line/rect to be considered strike
|
37
|
+
"horiz_tol": 1.0, # pt ; vertical tolerance for horizontality
|
30
38
|
"coverage_ratio": 0.7, # proportion of glyph width to be overlapped
|
31
39
|
"band_top_frac": 0.35, # fraction of glyph height above top baseline band
|
32
40
|
"band_bottom_frac": 0.65, # fraction below top (same used internally)
|
@@ -36,48 +44,90 @@ UNDERLINE_DEFAULTS = {
|
|
36
44
|
"thickness_tol": 1.5,
|
37
45
|
"horiz_tol": 1.0,
|
38
46
|
"coverage_ratio": 0.8,
|
39
|
-
"band_frac": 0.25,
|
40
|
-
"below_pad": 0.7,
|
47
|
+
"band_frac": 0.25, # height fraction above baseline
|
48
|
+
"below_pad": 0.7, # pt ; pad below baseline
|
41
49
|
}
|
42
50
|
|
43
51
|
HIGHLIGHT_DEFAULTS = {
|
44
52
|
"height_min_ratio": 0.6, # rect height relative to char height lower bound
|
45
53
|
"height_max_ratio": 2.0, # upper bound
|
46
|
-
"coverage_ratio": 0.6,
|
54
|
+
"coverage_ratio": 0.6, # horizontal overlap with glyph
|
47
55
|
"color_saturation_min": 0.4, # HSV S >
|
48
|
-
"color_value_min": 0.4,
|
56
|
+
"color_value_min": 0.4, # HSV V >
|
49
57
|
}
|
50
58
|
|
51
59
|
|
52
60
|
@contextmanager
|
53
61
|
def disable_text_sync():
|
54
|
-
"""
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
62
|
+
"""Temporarily disable text synchronization for performance.
|
63
|
+
|
64
|
+
This context manager is used when bulk-updating text content where character-level
|
65
|
+
synchronization is not needed, such as during bidi processing or large-scale
|
66
|
+
text transformations. It prevents exponential recursion issues with Arabic/RTL
|
67
|
+
text processing by bypassing the normal text property setter.
|
68
|
+
|
69
|
+
Yields:
|
70
|
+
None: The context where text synchronization is disabled.
|
71
|
+
|
72
|
+
Example:
|
73
|
+
```python
|
74
|
+
with disable_text_sync():
|
75
|
+
for element in text_elements:
|
76
|
+
element.text = process_arabic_text(element.text)
|
77
|
+
# Text sync automatically restored after the block
|
78
|
+
```
|
79
|
+
|
80
|
+
Note:
|
81
|
+
This optimization is critical for performance when processing documents
|
82
|
+
with complex text layouts or right-to-left scripts that would otherwise
|
83
|
+
trigger expensive character synchronization operations.
|
60
84
|
"""
|
61
85
|
# Save original setter
|
62
86
|
original_setter = TextElement.text.fset
|
63
|
-
|
87
|
+
|
64
88
|
# Create a fast setter that skips sync
|
65
89
|
def fast_setter(self, value):
|
66
90
|
self._obj["text"] = value
|
67
91
|
# Skip character synchronization for performance
|
68
|
-
|
92
|
+
|
69
93
|
# Apply fast setter
|
70
94
|
TextElement.text = property(TextElement.text.fget, fast_setter)
|
71
|
-
|
95
|
+
|
72
96
|
try:
|
73
97
|
yield
|
74
98
|
finally:
|
75
99
|
# Restore original setter
|
76
100
|
TextElement.text = property(TextElement.text.fget, original_setter)
|
77
101
|
|
102
|
+
|
78
103
|
class NaturalWordExtractor(WordExtractor):
|
79
|
-
"""
|
80
|
-
|
104
|
+
"""Custom WordExtractor that splits words based on specified character attributes.
|
105
|
+
|
106
|
+
This class extends pdfplumber's WordExtractor to provide more intelligent word
|
107
|
+
segmentation that respects font boundaries and other character attributes.
|
108
|
+
It prevents words from spanning across different fonts, sizes, or styles,
|
109
|
+
which is essential for maintaining semantic meaning in document analysis.
|
110
|
+
|
111
|
+
The extractor considers multiple character attributes when determining word
|
112
|
+
boundaries, ensuring that visually distinct text elements (like bold headers
|
113
|
+
mixed with regular text) are properly separated into distinct words.
|
114
|
+
|
115
|
+
Attributes:
|
116
|
+
font_attrs: List of character attributes to consider for word boundaries.
|
117
|
+
Common attributes include 'fontname', 'size', 'flags', etc.
|
118
|
+
|
119
|
+
Example:
|
120
|
+
```python
|
121
|
+
# Create extractor that splits on font and size changes
|
122
|
+
extractor = NaturalWordExtractor(['fontname', 'size'])
|
123
|
+
|
124
|
+
# Extract words with font-aware boundaries
|
125
|
+
words = extractor.extract_words(page_chars)
|
126
|
+
|
127
|
+
# Each word will have consistent font properties
|
128
|
+
for word in words:
|
129
|
+
print(f"'{word['text']}' in {word['fontname']} size {word['size']}")
|
130
|
+
```
|
81
131
|
in addition to pdfplumber's default spatial logic.
|
82
132
|
"""
|
83
133
|
|
@@ -198,7 +248,9 @@ class ElementManager:
|
|
198
248
|
if self._load_text and prepared_char_dicts:
|
199
249
|
try:
|
200
250
|
self._mark_strikethrough_chars(prepared_char_dicts)
|
201
|
-
except
|
251
|
+
except (
|
252
|
+
Exception
|
253
|
+
) as strike_err: # pragma: no cover – strike detection must never crash loading
|
202
254
|
logger.warning(
|
203
255
|
f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
|
204
256
|
exc_info=True,
|
@@ -244,16 +296,16 @@ class ElementManager:
|
|
244
296
|
# 2. Instantiate the custom word extractor
|
245
297
|
# Prefer page-level config over PDF-level for tolerance lookup
|
246
298
|
word_elements: List[TextElement] = []
|
247
|
-
|
299
|
+
|
248
300
|
# Get config objects (needed for auto_text_tolerance check)
|
249
301
|
page_config = getattr(self._page, "_config", {})
|
250
302
|
pdf_config = getattr(self._page._parent, "_config", {})
|
251
|
-
|
303
|
+
|
252
304
|
# Initialize tolerance variables
|
253
305
|
xt = None
|
254
306
|
yt = None
|
255
307
|
use_flow = pdf_config.get("use_text_flow", False)
|
256
|
-
|
308
|
+
|
257
309
|
if self._load_text and prepared_char_dicts:
|
258
310
|
# Start with any explicitly supplied tolerances (may be None)
|
259
311
|
xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
|
@@ -275,7 +327,7 @@ class ElementManager:
|
|
275
327
|
# Record back to page config for downstream users
|
276
328
|
page_config["x_tolerance"] = xt
|
277
329
|
if yt is None:
|
278
|
-
yt = 0.6 * median_size
|
330
|
+
yt = 0.6 * median_size # ~line spacing fraction
|
279
331
|
page_config["y_tolerance"] = yt
|
280
332
|
|
281
333
|
# Warn users when the page's font size is extremely small –
|
@@ -364,7 +416,8 @@ class ElementManager:
|
|
364
416
|
char_dir = "ltr"
|
365
417
|
|
366
418
|
extractor = NaturalWordExtractor(
|
367
|
-
word_split_attributes=self._word_split_attributes
|
419
|
+
word_split_attributes=self._word_split_attributes
|
420
|
+
+ ["strike", "underline", "highlight"],
|
368
421
|
extra_attrs=attributes_to_preserve,
|
369
422
|
x_tolerance=xt,
|
370
423
|
y_tolerance=yt,
|
@@ -413,12 +466,13 @@ class ElementManager:
|
|
413
466
|
# Convert from visual order (from PDF) to logical order using bidi
|
414
467
|
try:
|
415
468
|
from bidi.algorithm import get_display # type: ignore
|
469
|
+
|
416
470
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
417
471
|
|
418
472
|
with disable_text_sync():
|
419
473
|
# word_element.text is currently in visual order (from PDF)
|
420
474
|
# Convert to logical order using bidi with auto direction detection
|
421
|
-
logical_text = get_display(word_element.text, base_dir=
|
475
|
+
logical_text = get_display(word_element.text, base_dir="L")
|
422
476
|
# Apply bracket mirroring for logical order
|
423
477
|
word_element.text = mirror_brackets(logical_text)
|
424
478
|
except Exception:
|
@@ -495,7 +549,11 @@ class ElementManager:
|
|
495
549
|
if color_counts:
|
496
550
|
dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
|
497
551
|
try:
|
498
|
-
w._obj["highlight_color"] =
|
552
|
+
w._obj["highlight_color"] = (
|
553
|
+
tuple(dominant_color)
|
554
|
+
if isinstance(dominant_color, (list, tuple))
|
555
|
+
else dominant_color
|
556
|
+
)
|
499
557
|
except Exception:
|
500
558
|
w._obj["highlight_color"] = dominant_color
|
501
559
|
|
@@ -998,12 +1056,16 @@ class ElementManager:
|
|
998
1056
|
# Strikethrough detection (horizontal strike-out lines)
|
999
1057
|
# ------------------------------------------------------------------
|
1000
1058
|
|
1001
|
-
def _mark_strikethrough_chars(
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1059
|
+
def _mark_strikethrough_chars(
|
1060
|
+
self,
|
1061
|
+
char_dicts: List[Dict[str, Any]],
|
1062
|
+
*,
|
1063
|
+
thickness_tol: float = 1.5,
|
1064
|
+
horiz_tol: float = 1.0,
|
1065
|
+
coverage_ratio: float = 0.7,
|
1066
|
+
band_top: float = 0.35,
|
1067
|
+
band_bottom: float = 0.65,
|
1068
|
+
) -> None:
|
1007
1069
|
"""Annotate character dictionaries with a boolean ``strike`` flag.
|
1008
1070
|
|
1009
1071
|
Args
|
@@ -1102,11 +1164,31 @@ class ElementManager:
|
|
1102
1164
|
# Allow user overrides via PDF._config["underline_detection"]
|
1103
1165
|
pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
|
1104
1166
|
|
1105
|
-
thickness_tol =
|
1106
|
-
|
1107
|
-
|
1108
|
-
|
1109
|
-
|
1167
|
+
thickness_tol = (
|
1168
|
+
thickness_tol
|
1169
|
+
if thickness_tol is not None
|
1170
|
+
else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
|
1171
|
+
)
|
1172
|
+
horiz_tol = (
|
1173
|
+
horiz_tol
|
1174
|
+
if horiz_tol is not None
|
1175
|
+
else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
|
1176
|
+
)
|
1177
|
+
coverage_ratio = (
|
1178
|
+
coverage_ratio
|
1179
|
+
if coverage_ratio is not None
|
1180
|
+
else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
|
1181
|
+
)
|
1182
|
+
band_frac = (
|
1183
|
+
band_frac
|
1184
|
+
if band_frac is not None
|
1185
|
+
else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
|
1186
|
+
)
|
1187
|
+
below_pad = (
|
1188
|
+
below_pad
|
1189
|
+
if below_pad is not None
|
1190
|
+
else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
|
1191
|
+
)
|
1110
1192
|
|
1111
1193
|
raw_lines = list(getattr(self._page._page, "lines", []))
|
1112
1194
|
raw_rects = list(getattr(self._page._page, "rects", []))
|
@@ -1148,7 +1230,7 @@ class ElementManager:
|
|
1148
1230
|
table_y = {k for k, v in y_groups.items() if v >= 3}
|
1149
1231
|
|
1150
1232
|
# filter out candidates on those y values
|
1151
|
-
filtered_candidates = [c for c in candidates if int((c[1]+c[3])/2) not in table_y]
|
1233
|
+
filtered_candidates = [c for c in candidates if int((c[1] + c[3]) / 2) not in table_y]
|
1152
1234
|
|
1153
1235
|
# annotate chars
|
1154
1236
|
for ch in char_dicts:
|
@@ -1205,7 +1287,9 @@ class ElementManager:
|
|
1205
1287
|
y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
|
1206
1288
|
y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
|
1207
1289
|
rheight = y1_rect - y0_rect
|
1208
|
-
highlight_rects.append(
|
1290
|
+
highlight_rects.append(
|
1291
|
+
(rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col)
|
1292
|
+
)
|
1209
1293
|
|
1210
1294
|
if not highlight_rects:
|
1211
1295
|
for ch in char_dicts:
|
@@ -1238,7 +1322,9 @@ class ElementManager:
|
|
1238
1322
|
if overlap > 0 and (overlap / width) >= coverage_ratio:
|
1239
1323
|
ch["highlight"] = True
|
1240
1324
|
try:
|
1241
|
-
ch["highlight_color"] =
|
1325
|
+
ch["highlight_color"] = (
|
1326
|
+
tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
|
1327
|
+
)
|
1242
1328
|
except Exception:
|
1243
1329
|
ch["highlight_color"] = rcolor
|
1244
1330
|
break
|
@@ -98,7 +98,9 @@ class HighlightRenderer:
|
|
98
98
|
scaled_bbox = None
|
99
99
|
|
100
100
|
if highlight.is_polygon:
|
101
|
-
scaled_polygon = [
|
101
|
+
scaled_polygon = [
|
102
|
+
(p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon
|
103
|
+
]
|
102
104
|
# Draw polygon fill and border
|
103
105
|
draw.polygon(
|
104
106
|
scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
|
@@ -597,7 +599,7 @@ class HighlightingService:
|
|
597
599
|
if page_index in self._highlights_by_page:
|
598
600
|
del self._highlights_by_page[page_index]
|
599
601
|
logger.debug(f"Cleared highlights for page {page_index}.")
|
600
|
-
|
602
|
+
|
601
603
|
# Also clear any cached rendered images for this page so the next render
|
602
604
|
# reflects the removal of highlights.
|
603
605
|
try:
|
@@ -683,7 +685,6 @@ class HighlightingService:
|
|
683
685
|
)
|
684
686
|
|
685
687
|
try:
|
686
|
-
# base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
|
687
688
|
img_object = page_obj._page.to_image(**to_image_args)
|
688
689
|
base_image_pil = (
|
689
690
|
img_object.annotated
|
@@ -929,9 +930,7 @@ class HighlightingService:
|
|
929
930
|
right_px = max(left_px + 1, min(right_px, rendered_image.width))
|
930
931
|
bottom_px = max(top_px + 1, min(bottom_px, rendered_image.height))
|
931
932
|
|
932
|
-
rendered_image = rendered_image.crop(
|
933
|
-
(left_px, top_px, right_px, bottom_px)
|
934
|
-
)
|
933
|
+
rendered_image = rendered_image.crop((left_px, top_px, right_px, bottom_px))
|
935
934
|
|
936
935
|
legend = None
|
937
936
|
if labels:
|
natural_pdf/core/page.py
CHANGED
@@ -77,7 +77,6 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
|
|
77
77
|
# --- End Classification Imports --- #
|
78
78
|
|
79
79
|
|
80
|
-
|
81
80
|
# --- End Shape Detection Mixin --- #
|
82
81
|
|
83
82
|
|
@@ -94,23 +93,107 @@ logger = logging.getLogger(__name__)
|
|
94
93
|
|
95
94
|
|
96
95
|
class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
97
|
-
"""
|
98
|
-
Enhanced Page wrapper built on top of pdfplumber.Page.
|
96
|
+
"""Enhanced Page wrapper built on top of pdfplumber.Page.
|
99
97
|
|
100
98
|
This class provides a fluent interface for working with PDF pages,
|
101
99
|
with improved selection, navigation, extraction, and question-answering capabilities.
|
100
|
+
It integrates multiple analysis capabilities through mixins and provides spatial
|
101
|
+
navigation with CSS-like selectors.
|
102
|
+
|
103
|
+
The Page class serves as the primary interface for document analysis, offering:
|
104
|
+
- Element selection and spatial navigation
|
105
|
+
- OCR and layout analysis integration
|
106
|
+
- Table detection and extraction
|
107
|
+
- AI-powered classification and data extraction
|
108
|
+
- Visual debugging with highlighting and cropping
|
109
|
+
- Text style analysis and structure detection
|
110
|
+
|
111
|
+
Attributes:
|
112
|
+
index: Zero-based index of this page in the PDF.
|
113
|
+
number: One-based page number (index + 1).
|
114
|
+
width: Page width in points.
|
115
|
+
height: Page height in points.
|
116
|
+
bbox: Bounding box tuple (x0, top, x1, bottom) of the page.
|
117
|
+
chars: Collection of character elements on the page.
|
118
|
+
words: Collection of word elements on the page.
|
119
|
+
lines: Collection of line elements on the page.
|
120
|
+
rects: Collection of rectangle elements on the page.
|
121
|
+
images: Collection of image elements on the page.
|
122
|
+
metadata: Dictionary for storing analysis results and custom data.
|
123
|
+
|
124
|
+
Example:
|
125
|
+
Basic usage:
|
126
|
+
```python
|
127
|
+
pdf = npdf.PDF("document.pdf")
|
128
|
+
page = pdf.pages[0]
|
129
|
+
|
130
|
+
# Find elements with CSS-like selectors
|
131
|
+
headers = page.find_all('text[size>12]:bold')
|
132
|
+
summaries = page.find('text:contains("Summary")')
|
133
|
+
|
134
|
+
# Spatial navigation
|
135
|
+
content_below = summaries.below(until='text[size>12]:bold')
|
136
|
+
|
137
|
+
# Table extraction
|
138
|
+
tables = page.extract_table()
|
139
|
+
```
|
140
|
+
|
141
|
+
Advanced usage:
|
142
|
+
```python
|
143
|
+
# Apply OCR if needed
|
144
|
+
page.apply_ocr(engine='easyocr', resolution=300)
|
145
|
+
|
146
|
+
# Layout analysis
|
147
|
+
page.analyze_layout(engine='yolo')
|
148
|
+
|
149
|
+
# AI-powered extraction
|
150
|
+
data = page.extract_structured_data(MySchema)
|
151
|
+
|
152
|
+
# Visual debugging
|
153
|
+
page.find('text:contains("Important")').show()
|
154
|
+
```
|
102
155
|
"""
|
103
156
|
|
104
|
-
def __init__(
|
105
|
-
|
106
|
-
|
157
|
+
def __init__(
|
158
|
+
self,
|
159
|
+
page: "pdfplumber.page.Page",
|
160
|
+
parent: "PDF",
|
161
|
+
index: int,
|
162
|
+
font_attrs=None,
|
163
|
+
load_text: bool = True,
|
164
|
+
):
|
165
|
+
"""Initialize a page wrapper.
|
166
|
+
|
167
|
+
Creates an enhanced Page object that wraps a pdfplumber page with additional
|
168
|
+
functionality for spatial navigation, analysis, and AI-powered extraction.
|
107
169
|
|
108
170
|
Args:
|
109
|
-
page: pdfplumber page object
|
110
|
-
parent: Parent PDF object
|
111
|
-
|
112
|
-
|
113
|
-
|
171
|
+
page: The underlying pdfplumber page object that provides raw PDF data.
|
172
|
+
parent: Parent PDF object that contains this page and provides access
|
173
|
+
to managers and global settings.
|
174
|
+
index: Zero-based index of this page in the PDF document.
|
175
|
+
font_attrs: List of font attributes to consider when grouping characters
|
176
|
+
into words. Common attributes include ['fontname', 'size', 'flags'].
|
177
|
+
If None, uses default character-to-word grouping rules.
|
178
|
+
load_text: If True, load and process text elements from the PDF's text layer.
|
179
|
+
If False, skip text layer processing (useful for OCR-only workflows).
|
180
|
+
|
181
|
+
Note:
|
182
|
+
This constructor is typically called automatically when accessing pages
|
183
|
+
through the PDF.pages collection. Direct instantiation is rarely needed.
|
184
|
+
|
185
|
+
Example:
|
186
|
+
```python
|
187
|
+
# Pages are usually accessed through the PDF object
|
188
|
+
pdf = npdf.PDF("document.pdf")
|
189
|
+
page = pdf.pages[0] # Page object created automatically
|
190
|
+
|
191
|
+
# Direct construction (advanced usage)
|
192
|
+
import pdfplumber
|
193
|
+
with pdfplumber.open("document.pdf") as plumber_pdf:
|
194
|
+
plumber_page = plumber_pdf.pages[0]
|
195
|
+
page = Page(plumber_page, pdf, 0, load_text=True)
|
196
|
+
```
|
114
197
|
"""
|
115
198
|
self._page = page
|
116
199
|
self._parent = parent
|
@@ -1190,6 +1273,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1190
1273
|
if _contains_rtl(result):
|
1191
1274
|
try:
|
1192
1275
|
from bidi.algorithm import get_display # type: ignore
|
1276
|
+
|
1193
1277
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
1194
1278
|
|
1195
1279
|
result = "\n".join(
|
@@ -1199,8 +1283,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1199
1283
|
base_dir=(
|
1200
1284
|
"R"
|
1201
1285
|
if any(
|
1202
|
-
unicodedata.bidirectional(ch)
|
1203
|
-
in ("R", "AL", "AN")
|
1286
|
+
unicodedata.bidirectional(ch) in ("R", "AL", "AN")
|
1204
1287
|
for ch in line
|
1205
1288
|
)
|
1206
1289
|
else "L"
|
@@ -1396,11 +1479,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1396
1479
|
table_settings.setdefault("text_y_tolerance", y_tol)
|
1397
1480
|
|
1398
1481
|
# pdfplumber's text strategy benefits from a tight snap tolerance.
|
1399
|
-
if
|
1482
|
+
if (
|
1483
|
+
"snap_tolerance" not in table_settings
|
1484
|
+
and "snap_x_tolerance" not in table_settings
|
1485
|
+
):
|
1400
1486
|
# Derive from y_tol if available, else default 1
|
1401
1487
|
snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
|
1402
1488
|
table_settings.setdefault("snap_tolerance", snap)
|
1403
|
-
if
|
1489
|
+
if (
|
1490
|
+
"join_tolerance" not in table_settings
|
1491
|
+
and "join_x_tolerance" not in table_settings
|
1492
|
+
):
|
1404
1493
|
join = table_settings.get("snap_tolerance", 1)
|
1405
1494
|
table_settings.setdefault("join_tolerance", join)
|
1406
1495
|
table_settings.setdefault("join_x_tolerance", join)
|
@@ -1691,8 +1780,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1691
1780
|
# Apply global options as defaults, but allow explicit parameters to override
|
1692
1781
|
import natural_pdf
|
1693
1782
|
|
1783
|
+
# Determine if this is likely a computational use (OCR, analysis, etc.)
|
1784
|
+
# If resolution is explicitly provided but width is not, assume computational use
|
1785
|
+
# and don't apply global display width settings
|
1786
|
+
is_computational_use = (resolution is not None and width is None and
|
1787
|
+
kwargs.get('include_highlights', True) is False)
|
1788
|
+
|
1694
1789
|
# Use global options if parameters are not explicitly set
|
1695
|
-
if width is None:
|
1790
|
+
if width is None and not is_computational_use:
|
1696
1791
|
width = natural_pdf.options.image.width
|
1697
1792
|
if resolution is None:
|
1698
1793
|
if natural_pdf.options.image.resolution is not None:
|
@@ -2998,29 +3093,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2998
3093
|
InspectionSummary with element tables showing coordinates,
|
2999
3094
|
properties, and other details for each element
|
3000
3095
|
"""
|
3001
|
-
return self.find_all(
|
3096
|
+
return self.find_all("*").inspect(limit=limit)
|
3002
3097
|
|
3003
3098
|
def remove_text_layer(self) -> "Page":
|
3004
3099
|
"""
|
3005
3100
|
Remove all text elements from this page.
|
3006
|
-
|
3101
|
+
|
3007
3102
|
This removes all text elements (words and characters) from the page,
|
3008
3103
|
effectively clearing the text layer.
|
3009
|
-
|
3104
|
+
|
3010
3105
|
Returns:
|
3011
3106
|
Self for method chaining
|
3012
3107
|
"""
|
3013
3108
|
logger.info(f"Page {self.number}: Removing all text elements...")
|
3014
|
-
|
3109
|
+
|
3015
3110
|
# Remove all words and chars from the element manager
|
3016
3111
|
removed_words = len(self._element_mgr.words)
|
3017
3112
|
removed_chars = len(self._element_mgr.chars)
|
3018
|
-
|
3113
|
+
|
3019
3114
|
# Clear the lists
|
3020
3115
|
self._element_mgr._elements["words"] = []
|
3021
3116
|
self._element_mgr._elements["chars"] = []
|
3022
|
-
|
3023
|
-
logger.info(
|
3117
|
+
|
3118
|
+
logger.info(
|
3119
|
+
f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters"
|
3120
|
+
)
|
3024
3121
|
return self
|
3025
3122
|
|
3026
3123
|
@property
|