natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +751 -607
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +120 -23
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
- natural_pdf-0.1.35.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
optimization/performance_analysis.py

@@ -9,15 +9,16 @@ operations using real large PDFs to inform memory management decisions.
 import gc
 import json
 import os
-import psutil
 import sys
 import time
 import tracemalloc
-from dataclasses import
+from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import
-
+from typing import Any, Callable, Dict, List, Optional
+
 import matplotlib.pyplot as plt
+import pandas as pd
+import psutil

 import natural_pdf as npdf

@@ -25,6 +26,7 @@ import natural_pdf as npdf
 @dataclass
 class MemorySnapshot:
     """Snapshot of memory usage at a point in time"""
+
     timestamp: float
     rss_mb: float  # Resident Set Size
     vms_mb: float  # Virtual Memory Size
@@ -37,26 +39,27 @@ class MemorySnapshot:

 class PerformanceProfiler:
     """Profiles memory usage and performance of Natural PDF operations"""
-
+
     def __init__(self, output_dir: str = "performance_results"):
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(exist_ok=True)
-
+
         self.snapshots: List[MemorySnapshot] = []
         self.process = psutil.Process()
         self.start_time = time.time()
-
+
         # Start tracemalloc for detailed Python memory tracking
         tracemalloc.start()
-
-    def take_snapshot(
-
+
+    def take_snapshot(
+        self, operation: str, page_count: int = 0, pdf_name: str = "", **additional_info
+    ):
         """Take a memory usage snapshot"""
         gc.collect()  # Force garbage collection for accurate measurement
-
+
         memory_info = self.process.memory_info()
         python_objects = len(gc.get_objects())
-
+
         snapshot = MemorySnapshot(
             timestamp=time.time() - self.start_time,
             rss_mb=memory_info.rss / 1024 / 1024,
@@ -65,108 +68,108 @@ class PerformanceProfiler:
             operation=operation,
             page_count=page_count,
             pdf_name=pdf_name,
-            additional_info=additional_info
+            additional_info=additional_info,
         )
-
+
         self.snapshots.append(snapshot)
-        print(
-
+        print(
+            f"[{snapshot.timestamp:.1f}s] {operation}: {snapshot.rss_mb:.1f}MB RSS, {python_objects} objects"
+        )
+
     def save_results(self, test_name: str):
         """Save results to JSON and CSV"""
         # Convert to list of dicts for JSON serialization
         data = [asdict(s) for s in self.snapshots]
-
+
         # Save JSON
         json_path = self.output_dir / f"{test_name}_snapshots.json"
-        with open(json_path,
+        with open(json_path, "w") as f:
             json.dump(data, f, indent=2)
-
+
         # Save CSV for easy analysis
         df = pd.DataFrame(data)
         csv_path = self.output_dir / f"{test_name}_snapshots.csv"
         df.to_csv(csv_path, index=False)
-
+
         print(f"Results saved to {json_path} and {csv_path}")
         return df


 class PDFPerformanceTester:
     """Tests specific PDF operations and measures their performance"""
-
+
     def __init__(self, pdf_path: str, profiler: PerformanceProfiler):
         self.pdf_path = Path(pdf_path)
         self.pdf_name = self.pdf_path.stem
         self.profiler = profiler
         self.pdf = None
-
+
     def test_load_pdf(self):
         """Test just loading the PDF"""
         self.profiler.take_snapshot("before_load", pdf_name=self.pdf_name)
-
+
         self.pdf = npdf.PDF(str(self.pdf_path))
-
-        self.profiler.take_snapshot(
-
-
+
+        self.profiler.take_snapshot(
+            "after_load", pdf_name=self.pdf_name, total_pages=len(self.pdf.pages)
+        )
+
     def test_page_access(self, max_pages: int = 10):
         """Test accessing pages sequentially"""
         if not self.pdf:
             self.test_load_pdf()
-
+
         pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
         for i in range(pages_to_test):
             page = self.pdf.pages[i]
-
+
             # Just access the page to trigger lazy loading
             _ = page.width, page.height
-
+
             self.profiler.take_snapshot(
-                f"page_access_{i+1}",
-                page_count=i+1,
+                f"page_access_{i+1}",
+                page_count=i + 1,
                 pdf_name=self.pdf_name,
                 page_width=page.width,
-                page_height=page.height
+                page_height=page.height,
             )
-
+
     def test_describe_pages(self, max_pages: int = 5):
         """Test using .describe() on pages"""
         if not self.pdf:
             self.test_load_pdf()
-
+
         pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
         for i in range(pages_to_test):
             page = self.pdf.pages[i]
-
+
             # Use describe to understand page content
             try:
                 description = page.describe()
-
+
                 self.profiler.take_snapshot(
                     f"describe_{i+1}",
-                    page_count=i+1,
+                    page_count=i + 1,
                     pdf_name=self.pdf_name,
-                    description_length=len(description) if description else 0
+                    description_length=len(description) if description else 0,
                 )
             except Exception as e:
                 self.profiler.take_snapshot(
-                    f"describe_{i+1}_error",
-                    page_count=i+1,
-                    pdf_name=self.pdf_name,
-                    error=str(e)
+                    f"describe_{i+1}_error", page_count=i + 1, pdf_name=self.pdf_name, error=str(e)
                 )
-
+
     def test_element_collections(self, max_pages: int = 5):
         """Test find_all operations that create element collections"""
         if not self.pdf:
             self.test_load_pdf()
-
+
         pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
         for i in range(pages_to_test):
             page = self.pdf.pages[i]
-
+
             # Test different element collection operations
             operations = [
                 ("words", lambda p: p.find_all("words")),
@@ -174,121 +177,118 @@ class PDFPerformanceTester:
                 ("rects", lambda p: p.find_all("rect")),
                 ("large_text", lambda p: p.find_all("text[size>12]")),
             ]
-
+
             for op_name, operation in operations:
                 try:
                     elements = operation(page)
                     element_count = len(elements) if elements else 0
-
+
                     self.profiler.take_snapshot(
                         f"{op_name}_{i+1}",
-                        page_count=i+1,
+                        page_count=i + 1,
                         pdf_name=self.pdf_name,
                         operation_type=op_name,
-                        element_count=element_count
+                        element_count=element_count,
                     )
                 except Exception as e:
                     self.profiler.take_snapshot(
                         f"{op_name}_{i+1}_error",
-                        page_count=i+1,
+                        page_count=i + 1,
                         pdf_name=self.pdf_name,
                         operation_type=op_name,
-                        error=str(e)
+                        error=str(e),
                     )
-
+
     def test_image_generation(self, max_pages: int = 3, resolutions: List[int] = [72, 144, 216]):
         """Test image generation at different resolutions"""
         if not self.pdf:
             self.test_load_pdf()
-
+
         pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
         for i in range(pages_to_test):
             page = self.pdf.pages[i]
-
+
             for resolution in resolutions:
                 try:
                     img = page.to_image(resolution=resolution)
-
+
                     self.profiler.take_snapshot(
                         f"image_{resolution}dpi_{i+1}",
-                        page_count=i+1,
+                        page_count=i + 1,
                         pdf_name=self.pdf_name,
                         resolution=resolution,
-                        image_size=f"{img.width}x{img.height}" if img else "None"
+                        image_size=f"{img.width}x{img.height}" if img else "None",
                     )
-
+
                     # Clean up image immediately to test memory release
                     del img
-
+
                 except Exception as e:
                     self.profiler.take_snapshot(
                         f"image_{resolution}dpi_{i+1}_error",
-                        page_count=i+1,
+                        page_count=i + 1,
                         pdf_name=self.pdf_name,
                         resolution=resolution,
-                        error=str(e)
+                        error=str(e),
                     )
-
+
     def test_ocr(self, max_pages: int = 2):
         """Test OCR operations (expensive!)"""
         if not self.pdf:
             self.test_load_pdf()
-
+
         pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
         for i in range(pages_to_test):
             page = self.pdf.pages[i]
-
+
             try:
                 # Run OCR
                 page.apply_ocr(engine="easyocr")  # Default engine
-
+
                 self.profiler.take_snapshot(
-                    f"ocr_{i+1}",
-                    page_count=i+1,
-                    pdf_name=self.pdf_name,
-                    operation_type="ocr"
+                    f"ocr_{i+1}", page_count=i + 1, pdf_name=self.pdf_name, operation_type="ocr"
                 )
-
+
             except Exception as e:
                 self.profiler.take_snapshot(
                     f"ocr_{i+1}_error",
-                    page_count=i+1,
+                    page_count=i + 1,
                     pdf_name=self.pdf_name,
                     operation_type="ocr",
-                    error=str(e)
+                    error=str(e),
                 )
-
+
     def test_layout_analysis(self, max_pages: int = 3):
         """Test layout analysis operations"""
         if not self.pdf:
             self.test_load_pdf()
-
+
         pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
         for i in range(pages_to_test):
             page = self.pdf.pages[i]
-
+
             try:
                 # Run layout analysis
                 layout_result = page.analyze_layout()
-
+
                 self.profiler.take_snapshot(
                     f"layout_{i+1}",
-                    page_count=i+1,
+                    page_count=i + 1,
                     pdf_name=self.pdf_name,
                     operation_type="layout",
-                    layout_regions=len(layout_result) if layout_result else 0
+                    layout_regions=len(layout_result) if layout_result else 0,
                 )
-
+
             except Exception as e:
                 self.profiler.take_snapshot(
                     f"layout_{i+1}_error",
-                    page_count=i+1,
+                    page_count=i + 1,
                     pdf_name=self.pdf_name,
                     operation_type="layout",
-                    error=str(e)
+                    error=str(e),
                 )


@@ -298,43 +298,43 @@ def run_comprehensive_test(pdf_path: str, test_name: str):
     print(f"COMPREHENSIVE TEST: {test_name}")
     print(f"PDF: {pdf_path}")
     print(f"{'='*60}")
-
+
     profiler = PerformanceProfiler()
     tester = PDFPerformanceTester(pdf_path, profiler)
-
+
     # Initial baseline
     profiler.take_snapshot("baseline_start", pdf_name=Path(pdf_path).stem)
-
+
     # Test sequence
     print("\n1. Testing PDF Load...")
     tester.test_load_pdf()
-
+
     print("\n2. Testing Page Access...")
     tester.test_page_access(max_pages=10)
-
+
     print("\n3. Testing Describe Operations...")
     tester.test_describe_pages(max_pages=5)
-
+
     print("\n4. Testing Element Collections...")
     tester.test_element_collections(max_pages=5)
-
+
     print("\n5. Testing Image Generation...")
     tester.test_image_generation(max_pages=3)
-
+
     print("\n6. Testing Layout Analysis...")
     tester.test_layout_analysis(max_pages=3)
-
+
     # OCR test (only for image-heavy PDFs)
     if "OCR" in pdf_path or "image" in test_name.lower():
         print("\n7. Testing OCR (Image-heavy PDF)...")
         tester.test_ocr(max_pages=2)
-
+
     # Final snapshot
     profiler.take_snapshot("test_complete", pdf_name=Path(pdf_path).stem)
-
+
     # Save results
     df = profiler.save_results(test_name)
-
+
     # Quick analysis
     print(f"\n{'-'*40}")
     print("QUICK ANALYSIS:")
@@ -342,7 +342,7 @@ def run_comprehensive_test(pdf_path: str, test_name: str):
     print(f"Memory Growth: {df['rss_mb'].iloc[-1] - df['rss_mb'].iloc[0]:.1f} MB")
     print(f"Peak Objects: {df['python_objects'].max():,}")
     print(f"Total Time: {df['timestamp'].iloc[-1]:.1f} seconds")
-
+
     return df


@@ -350,22 +350,23 @@ def main():
     """Main test runner"""
     print("Natural PDF Performance Analysis Micro-Suite")
     print("=" * 50)
-
+
     # Find test PDFs
     large_pdfs_dir = Path("pdfs/hidden/large")
     if not large_pdfs_dir.exists():
         print(f"Error: {large_pdfs_dir} not found")
         print("Please ensure large test PDFs are available")
         return
-
+
     # Expected test PDFs
     test_pdfs = {
         "text_heavy": large_pdfs_dir / "appendix_fy2026.pdf",
-        "image_heavy": large_pdfs_dir
+        "image_heavy": large_pdfs_dir
+        / "OCR 0802030-56.2022.8.14.0060_Cópia integral_Fazenda Marrocos.pdf",
     }
-
+
     results = {}
-
+
     for test_name, pdf_path in test_pdfs.items():
         if pdf_path.exists():
             try:
@@ -375,23 +376,23 @@ def main():
                 traceback.print_exc()
         else:
             print(f"Warning: {pdf_path} not found, skipping {test_name} test")
-
+
     # Generate comparison report
     if results:
         print(f"\n{'='*60}")
         print("COMPARISON SUMMARY")
         print(f"{'='*60}")
-
+
         for test_name, df in results.items():
             print(f"\n{test_name.upper()}:")
            print(f" Peak Memory: {df['rss_mb'].max():.1f} MB")
             print(f" Memory Growth: {df['rss_mb'].iloc[-1] - df['rss_mb'].iloc[0]:.1f} MB")
             print(f" Peak Objects: {df['python_objects'].max():,}")
             print(f" Duration: {df['timestamp'].iloc[-1]:.1f}s")
-
+
     print(f"\nResults saved to performance_results/ directory")
     print("Use the CSV files for detailed analysis")


 if __name__ == "__main__":
-    main()
+    main()
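For orientation, the profiler that these hunks reformat is driven roughly as follows. This is a minimal sketch, assuming optimization/performance_analysis.py is importable from a repository checkout and that the placeholder path "pdfs/sample.pdf" points at a real PDF; it only uses classes and methods visible in the hunks above.

```python
# Hypothetical driver for the classes shown in the hunks above; the import
# path and the PDF path are placeholders, not part of the published package.
from performance_analysis import PDFPerformanceTester, PerformanceProfiler

profiler = PerformanceProfiler(output_dir="performance_results")
tester = PDFPerformanceTester("pdfs/sample.pdf", profiler)

tester.test_load_pdf()                 # snapshots taken before and after PDF load
tester.test_page_access(max_pages=10)  # one snapshot per lazily loaded page

df = profiler.save_results("sample_run")  # writes *_snapshots.json / .csv, returns a DataFrame
print(df[["timestamp", "rss_mb", "operation"]].tail())
```

The output file naming and the DataFrame columns follow the save_results method and MemorySnapshot dataclass shown in the diff.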
optimization/test_cleanup_methods.py

@@ -12,124 +12,135 @@ import gc
 import os
 import sys
 from pathlib import Path
+
 import pytest

 import natural_pdf as npdf
-from natural_pdf.ocr.ocr_manager import OCRManager
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
 from natural_pdf.classification.manager import ClassificationManager
+from natural_pdf.ocr.ocr_manager import OCRManager


 class TestCleanupMethods:
     """Test suite for manager cleanup methods"""
-
+
     def test_ocr_manager_cleanup_empty(self):
         """Test OCR manager cleanup when no engines are loaded"""
         manager = OCRManager()
-
+
         # Test cleanup when nothing is loaded
         count = manager.cleanup_engine()
         assert count == 0, "Should return 0 when no engines loaded"
-
+
         # Test cleanup of specific non-existent engine
         count = manager.cleanup_engine("nonexistent")
         assert count == 0, "Should return 0 when engine doesn't exist"
-
+
     def test_layout_manager_cleanup_empty(self):
         """Test Layout manager cleanup when no detectors are loaded"""
         manager = LayoutManager()
-
+
         # Test cleanup when nothing is loaded
         count = manager.cleanup_detector()
         assert count == 0, "Should return 0 when no detectors loaded"
-
+
         # Test cleanup of specific non-existent detector
         count = manager.cleanup_detector("nonexistent")
         assert count == 0, "Should return 0 when detector doesn't exist"
-
+
     def test_classification_manager_cleanup_empty(self):
         """Test Classification manager cleanup when no models are loaded"""
         try:
             manager = ClassificationManager()
-
+
             # Test cleanup when nothing is loaded
             count = manager.cleanup_models()
             assert count == 0, "Should return 0 when no models loaded"
-
+
             # Test cleanup of specific non-existent model
             count = manager.cleanup_models("nonexistent/model")
             assert count == 0, "Should return 0 when model doesn't exist"
-
+
         except ImportError:
             pytest.skip("Classification dependencies not available")
-
+
     def test_ocr_manager_cleanup_with_engine(self):
         """Test OCR manager cleanup after loading an engine"""
         manager = OCRManager()
-
+
         # Check if any OCR engines are available
         available_engines = manager.get_available_engines()
         if not available_engines:
             pytest.skip("No OCR engines available for testing")
-
+
         engine_name = available_engines[0]
         print(f"Testing with OCR engine: {engine_name}")
-
+
         # Load an engine by accessing it
         try:
             engine_instance = manager._get_engine_instance(engine_name)
             assert engine_name in manager._engine_instances, "Engine should be cached"
-
+
             # Test cleanup of specific engine
             count = manager.cleanup_engine(engine_name)
             assert count == 1, f"Should return 1 after cleaning up {engine_name}"
-            assert
-
+            assert (
+                engine_name not in manager._engine_instances
+            ), "Engine should be removed from cache"
+
         except Exception as e:
             pytest.skip(f"Could not load {engine_name} engine: {e}")
-
+
     def test_layout_manager_cleanup_with_detector(self):
         """Test Layout manager cleanup after loading a detector"""
         manager = LayoutManager()
-
+
         # Check if any layout engines are available
         available_engines = manager.get_available_engines()
         if not available_engines:
             pytest.skip("No layout engines available for testing")
-
+
         engine_name = available_engines[0]
         print(f"Testing with layout engine: {engine_name}")
-
+
         # Load a detector by accessing it
         try:
             detector_instance = manager._get_engine_instance(engine_name)
             assert engine_name in manager._detector_instances, "Detector should be cached"
-
+
             # Test cleanup of specific detector
             count = manager.cleanup_detector(engine_name)
             assert count == 1, f"Should return 1 after cleaning up {engine_name}"
-            assert
-
+            assert (
+                engine_name not in manager._detector_instances
+            ), "Detector should be removed from cache"
+
         except Exception as e:
             pytest.skip(f"Could not load {engine_name} detector: {e}")
-
+
     def test_methods_exist(self):
         """Test that all cleanup methods exist and are callable"""
         # Test OCRManager
         manager = OCRManager()
-        assert hasattr(manager,
+        assert hasattr(manager, "cleanup_engine"), "OCRManager should have cleanup_engine method"
         assert callable(manager.cleanup_engine), "cleanup_engine should be callable"
-
+
         # Test LayoutManager
         layout_manager = LayoutManager()
-        assert hasattr(
+        assert hasattr(
+            layout_manager, "cleanup_detector"
+        ), "LayoutManager should have cleanup_detector method"
         assert callable(layout_manager.cleanup_detector), "cleanup_detector should be callable"
-
+
         # Test ClassificationManager (if available)
         try:
             classification_manager = ClassificationManager()
-            assert hasattr(
-
+            assert hasattr(
+                classification_manager, "cleanup_models"
+            ), "ClassificationManager should have cleanup_models method"
+            assert callable(
+                classification_manager.cleanup_models
+            ), "cleanup_models should be callable"
         except ImportError:
             print("Classification dependencies not available, skipping ClassificationManager test")

@@ -137,19 +148,19 @@ class TestCleanupMethods:
 def main():
     """Run the cleanup method tests"""
     print("Testing manager cleanup methods...")
-
+
     # Run pytest on just this file
     exit_code = pytest.main([__file__, "-v", "-s"])
-
+
     if exit_code == 0:
         print("\n✅ All cleanup method tests passed!")
         print("The memory management methods are working correctly.")
     else:
         print("\n❌ Some tests failed!")
         print("The cleanup methods need investigation.")
-
+
     return exit_code


 if __name__ == "__main__":
-    exit(main())
+    exit(main())
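The test changes above exercise the cleanup hooks exposed by the manager classes. Below is a minimal sketch of that API, assuming only what the assertions in the diff establish: each cleanup_* call returns how many cached instances it released, and 0 when nothing has been loaded.

```python
# Sketch of the cleanup API exercised by the tests above; return values follow
# the assertions in the diff (0 when no engine/detector/model has been cached).
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
from natural_pdf.ocr.ocr_manager import OCRManager

released = OCRManager().cleanup_engine()        # 0: no OCR engine cached yet
released += LayoutManager().cleanup_detector()  # 0: no layout detector cached yet

try:
    from natural_pdf.classification.manager import ClassificationManager

    released += ClassificationManager().cleanup_models()  # 0 when no models are loaded
except ImportError:
    pass  # classification extras not installed, mirroring the test's skip path

print(f"Released {released} cached engine/detector/model instances")
```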