natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +751 -607
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +120 -23
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
- natural_pdf-0.1.35.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
@@ -10,10 +10,11 @@ This script compares memory usage before and after the optimization by:
|
|
10
10
|
|
11
11
|
import gc
|
12
12
|
import os
|
13
|
-
import psutil
|
14
13
|
import sys
|
15
14
|
from pathlib import Path
|
16
15
|
|
16
|
+
import psutil
|
17
|
+
|
17
18
|
import natural_pdf as npdf
|
18
19
|
|
19
20
|
|
@@ -22,9 +23,9 @@ def get_detailed_memory_info():
|
|
22
23
|
process = psutil.Process()
|
23
24
|
memory_info = process.memory_info()
|
24
25
|
return {
|
25
|
-
|
26
|
-
|
27
|
-
|
26
|
+
"rss_mb": memory_info.rss / 1024 / 1024,
|
27
|
+
"vms_mb": memory_info.vms / 1024 / 1024,
|
28
|
+
"python_objects": len(gc.get_objects()),
|
28
29
|
}
|
29
30
|
|
30
31
|
|
@@ -32,77 +33,83 @@ def analyze_character_storage(page):
|
|
32
33
|
"""Analyze how characters are stored in the page"""
|
33
34
|
# Force element loading
|
34
35
|
text_elements = page.find_all("text")
|
35
|
-
|
36
|
+
|
36
37
|
total_char_indices = 0
|
37
38
|
total_char_dicts = 0
|
38
39
|
total_chars_in_words = 0
|
39
40
|
memory_efficient_words = 0
|
40
41
|
legacy_words = 0
|
41
|
-
|
42
|
+
|
42
43
|
for element in text_elements:
|
43
|
-
if hasattr(element,
|
44
|
+
if hasattr(element, "_char_indices") and element._char_indices:
|
44
45
|
memory_efficient_words += 1
|
45
46
|
total_char_indices += len(element._char_indices)
|
46
47
|
total_chars_in_words += len(element._char_indices)
|
47
|
-
|
48
|
-
if hasattr(element,
|
48
|
+
|
49
|
+
if hasattr(element, "_char_dicts") and element._char_dicts:
|
49
50
|
total_char_dicts += len(element._char_dicts)
|
50
|
-
if not (hasattr(element,
|
51
|
+
if not (hasattr(element, "_char_indices") and element._char_indices):
|
51
52
|
legacy_words += 1
|
52
53
|
total_chars_in_words += len(element._char_dicts)
|
53
|
-
|
54
|
+
|
54
55
|
# Get individual character elements
|
55
56
|
char_elements = []
|
56
|
-
if hasattr(page,
|
57
|
-
char_elements = page._element_mgr.get_elements(
|
58
|
-
|
57
|
+
if hasattr(page, "_element_mgr"):
|
58
|
+
char_elements = page._element_mgr.get_elements("chars")
|
59
|
+
|
59
60
|
return {
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
61
|
+
"total_words": len(text_elements),
|
62
|
+
"memory_efficient_words": memory_efficient_words,
|
63
|
+
"legacy_words": legacy_words,
|
64
|
+
"total_char_elements": len(char_elements),
|
65
|
+
"total_char_indices": total_char_indices,
|
66
|
+
"total_char_dicts": total_char_dicts,
|
67
|
+
"total_chars_in_words": total_chars_in_words,
|
68
|
+
"estimated_duplication_ratio": total_char_dicts / max(len(char_elements), 1),
|
68
69
|
}
|
69
70
|
|
70
71
|
|
71
72
|
def test_memory_optimization():
|
72
73
|
"""Test the memory optimization with a real PDF"""
|
73
|
-
|
74
|
+
|
74
75
|
# Test with the practice PDF
|
75
76
|
test_pdf = Path("pdfs/01-practice.pdf")
|
76
77
|
if not test_pdf.exists():
|
77
78
|
print(f"Test PDF not found: {test_pdf}")
|
78
79
|
return
|
79
|
-
|
80
|
+
|
80
81
|
print("=" * 60)
|
81
82
|
print("MEMORY OPTIMIZATION ANALYSIS")
|
82
83
|
print("=" * 60)
|
83
|
-
|
84
|
+
|
84
85
|
# Baseline memory
|
85
86
|
gc.collect()
|
86
87
|
baseline_memory = get_detailed_memory_info()
|
87
|
-
print(
|
88
|
-
|
88
|
+
print(
|
89
|
+
f"Baseline memory: {baseline_memory['rss_mb']:.2f} MB RSS, {baseline_memory['python_objects']:,} objects"
|
90
|
+
)
|
91
|
+
|
89
92
|
# Load PDF
|
90
93
|
pdf = npdf.PDF(str(test_pdf))
|
91
94
|
page = pdf.pages[0]
|
92
|
-
|
95
|
+
|
93
96
|
post_load_memory = get_detailed_memory_info()
|
94
|
-
print(
|
95
|
-
|
97
|
+
print(
|
98
|
+
f"After PDF load: {post_load_memory['rss_mb']:.2f} MB RSS, {post_load_memory['python_objects']:,} objects"
|
99
|
+
)
|
100
|
+
|
96
101
|
# Analyze character storage
|
97
102
|
storage_analysis = analyze_character_storage(page)
|
98
|
-
|
103
|
+
|
99
104
|
final_memory = get_detailed_memory_info()
|
100
|
-
print(
|
101
|
-
|
105
|
+
print(
|
106
|
+
f"After element load: {final_memory['rss_mb']:.2f} MB RSS, {final_memory['python_objects']:,} objects"
|
107
|
+
)
|
108
|
+
|
102
109
|
print("\n" + "=" * 40)
|
103
110
|
print("CHARACTER STORAGE ANALYSIS")
|
104
111
|
print("=" * 40)
|
105
|
-
|
112
|
+
|
106
113
|
print(f"Total words: {storage_analysis['total_words']}")
|
107
114
|
print(f"Memory-efficient words: {storage_analysis['memory_efficient_words']}")
|
108
115
|
print(f"Legacy words: {storage_analysis['legacy_words']}")
|
@@ -110,63 +117,71 @@ def test_memory_optimization():
|
|
110
117
|
print(f"Character indices used: {storage_analysis['total_char_indices']}")
|
111
118
|
print(f"Character dicts stored: {storage_analysis['total_char_dicts']}")
|
112
119
|
print(f"Characters referenced by words: {storage_analysis['total_chars_in_words']}")
|
113
|
-
|
120
|
+
|
114
121
|
# Calculate optimization metrics
|
115
|
-
duplication_ratio = storage_analysis[
|
116
|
-
optimization_percentage =
|
117
|
-
|
122
|
+
duplication_ratio = storage_analysis["estimated_duplication_ratio"]
|
123
|
+
optimization_percentage = (
|
124
|
+
storage_analysis["memory_efficient_words"] / max(storage_analysis["total_words"], 1) * 100
|
125
|
+
)
|
126
|
+
|
118
127
|
print(f"\nOptimization metrics:")
|
119
128
|
print(f"- Duplication ratio: {duplication_ratio:.2f}x")
|
120
129
|
print(f"- Words using optimization: {optimization_percentage:.1f}%")
|
121
|
-
|
130
|
+
|
122
131
|
# Memory savings estimation
|
123
|
-
memory_used = final_memory[
|
124
|
-
chars_total = storage_analysis[
|
125
|
-
|
132
|
+
memory_used = final_memory["rss_mb"] - baseline_memory["rss_mb"]
|
133
|
+
chars_total = storage_analysis["total_char_elements"]
|
134
|
+
|
126
135
|
if chars_total > 0:
|
127
136
|
memory_per_char = memory_used / chars_total * 1024 # KB per char
|
128
137
|
print(f"- Memory per character: {memory_per_char:.2f} KB")
|
129
|
-
|
138
|
+
|
130
139
|
# Estimate savings from eliminating _char_dicts duplication
|
131
|
-
duplicated_chars = storage_analysis[
|
140
|
+
duplicated_chars = storage_analysis["total_char_dicts"]
|
132
141
|
if duplicated_chars > 0:
|
133
142
|
estimated_wasted_memory = duplicated_chars * memory_per_char / 1024 # MB
|
134
143
|
print(f"- Estimated memory saved by optimization: {estimated_wasted_memory:.2f} MB")
|
135
|
-
print(
|
136
|
-
|
144
|
+
print(
|
145
|
+
f"- Memory efficiency improvement: {estimated_wasted_memory / memory_used * 100:.1f}%"
|
146
|
+
)
|
147
|
+
|
137
148
|
print(f"\nTotal memory used for page processing: {memory_used:.2f} MB")
|
138
|
-
|
149
|
+
|
139
150
|
# Test functionality
|
140
151
|
print("\n" + "=" * 40)
|
141
152
|
print("FUNCTIONALITY VERIFICATION")
|
142
153
|
print("=" * 40)
|
143
|
-
|
154
|
+
|
144
155
|
# Test character access
|
145
156
|
test_elements = page.find_all("text")[:3]
|
146
157
|
for i, element in enumerate(test_elements):
|
147
158
|
print(f"\nWord {i+1}: '{element.text[:30]}{'...' if len(element.text) > 30 else ''}'")
|
148
|
-
|
149
|
-
if hasattr(element,
|
159
|
+
|
160
|
+
if hasattr(element, "_char_indices") and element._char_indices:
|
150
161
|
chars = element.chars
|
151
|
-
print(
|
162
|
+
print(
|
163
|
+
f" - Uses character indices: {len(element._char_indices)} indices -> {len(chars)} chars"
|
164
|
+
)
|
152
165
|
print(f" - Memory optimization: ACTIVE")
|
153
|
-
|
166
|
+
|
154
167
|
# Verify character access works
|
155
168
|
if chars:
|
156
169
|
first_char = chars[0]
|
157
|
-
print(
|
158
|
-
|
159
|
-
|
170
|
+
print(
|
171
|
+
f" - First char: '{first_char.text}' at ({first_char.x0:.1f}, {first_char.top:.1f})"
|
172
|
+
)
|
173
|
+
|
174
|
+
elif hasattr(element, "_char_dicts") and element._char_dicts:
|
160
175
|
print(f" - Uses character dicts: {len(element._char_dicts)} dicts")
|
161
176
|
print(f" - Memory optimization: LEGACY MODE")
|
162
|
-
|
177
|
+
|
163
178
|
else:
|
164
179
|
print(f" - No character data available")
|
165
|
-
|
180
|
+
|
166
181
|
print("\n" + "=" * 60)
|
167
182
|
print("✅ MEMORY OPTIMIZATION ANALYSIS COMPLETE")
|
168
183
|
print("=" * 60)
|
169
184
|
|
170
185
|
|
171
186
|
if __name__ == "__main__":
|
172
|
-
test_memory_optimization()
|
187
|
+
test_memory_optimization()
|