natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
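
For orientation, a listing like the one below can be reproduced locally by unpacking both wheels and diffing their members. A minimal sketch, assuming both wheels have already been downloaded (for example with `pip download natural-pdf==0.1.32 --no-deps -d wheels/`, and the same for 0.1.34); the local paths are illustrative, not part of the package:

```python
# Minimal sketch: diff the members of two locally downloaded wheels.
import difflib
import zipfile

OLD = "wheels/natural_pdf-0.1.32-py3-none-any.whl"  # illustrative paths
NEW = "wheels/natural_pdf-0.1.34-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    old_names, new_names = set(old_whl.namelist()), set(new_whl.namelist())
    for name in sorted(old_names | new_names):
        old_text = old_whl.read(name).decode("utf-8", "replace").splitlines() if name in old_names else []
        new_text = new_whl.read(name).decode("utf-8", "replace").splitlines() if name in new_names else []
        diff = list(difflib.unified_diff(old_text, new_text, f"a/{name}", f"b/{name}", lineterm=""))
        if diff:
            print("\n".join(diff))
```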
@@ -9,15 +9,16 @@ operations using real large PDFs to inform memory management decisions.
  import gc
  import json
  import os
- import psutil
  import sys
  import time
  import tracemalloc
- from dataclasses import dataclass, asdict
+ from dataclasses import asdict, dataclass
  from pathlib import Path
- from typing import Dict, List, Optional, Any, Callable
- import pandas as pd
+ from typing import Any, Callable, Dict, List, Optional
+
  import matplotlib.pyplot as plt
+ import pandas as pd
+ import psutil

  import natural_pdf as npdf

@@ -25,6 +26,7 @@ import natural_pdf as npdf
  @dataclass
  class MemorySnapshot:
  """Snapshot of memory usage at a point in time"""
+
  timestamp: float
  rss_mb: float # Resident Set Size
  vms_mb: float # Virtual Memory Size
@@ -37,26 +39,27 @@ class MemorySnapshot:

  class PerformanceProfiler:
  """Profiles memory usage and performance of Natural PDF operations"""
-
+
  def __init__(self, output_dir: str = "performance_results"):
  self.output_dir = Path(output_dir)
  self.output_dir.mkdir(exist_ok=True)
-
+
  self.snapshots: List[MemorySnapshot] = []
  self.process = psutil.Process()
  self.start_time = time.time()
-
+
  # Start tracemalloc for detailed Python memory tracking
  tracemalloc.start()
-
- def take_snapshot(self, operation: str, page_count: int = 0,
- pdf_name: str = "", **additional_info):
+
+ def take_snapshot(
+ self, operation: str, page_count: int = 0, pdf_name: str = "", **additional_info
+ ):
  """Take a memory usage snapshot"""
  gc.collect() # Force garbage collection for accurate measurement
-
+
  memory_info = self.process.memory_info()
  python_objects = len(gc.get_objects())
-
+
  snapshot = MemorySnapshot(
  timestamp=time.time() - self.start_time,
  rss_mb=memory_info.rss / 1024 / 1024,
@@ -65,108 +68,108 @@ class PerformanceProfiler:
  operation=operation,
  page_count=page_count,
  pdf_name=pdf_name,
- additional_info=additional_info
+ additional_info=additional_info,
  )
-
+
  self.snapshots.append(snapshot)
- print(f"[{snapshot.timestamp:.1f}s] {operation}: {snapshot.rss_mb:.1f}MB RSS, {python_objects} objects")
-
+ print(
+ f"[{snapshot.timestamp:.1f}s] {operation}: {snapshot.rss_mb:.1f}MB RSS, {python_objects} objects"
+ )
+
  def save_results(self, test_name: str):
  """Save results to JSON and CSV"""
  # Convert to list of dicts for JSON serialization
  data = [asdict(s) for s in self.snapshots]
-
+
  # Save JSON
  json_path = self.output_dir / f"{test_name}_snapshots.json"
- with open(json_path, 'w') as f:
+ with open(json_path, "w") as f:
  json.dump(data, f, indent=2)
-
+
  # Save CSV for easy analysis
  df = pd.DataFrame(data)
  csv_path = self.output_dir / f"{test_name}_snapshots.csv"
  df.to_csv(csv_path, index=False)
-
+
  print(f"Results saved to {json_path} and {csv_path}")
  return df


  class PDFPerformanceTester:
  """Tests specific PDF operations and measures their performance"""
-
+
  def __init__(self, pdf_path: str, profiler: PerformanceProfiler):
  self.pdf_path = Path(pdf_path)
  self.pdf_name = self.pdf_path.stem
  self.profiler = profiler
  self.pdf = None
-
+
  def test_load_pdf(self):
  """Test just loading the PDF"""
  self.profiler.take_snapshot("before_load", pdf_name=self.pdf_name)
-
+
  self.pdf = npdf.PDF(str(self.pdf_path))
-
- self.profiler.take_snapshot("after_load", pdf_name=self.pdf_name,
- total_pages=len(self.pdf.pages))
-
+
+ self.profiler.take_snapshot(
+ "after_load", pdf_name=self.pdf_name, total_pages=len(self.pdf.pages)
+ )
+
  def test_page_access(self, max_pages: int = 10):
  """Test accessing pages sequentially"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  # Just access the page to trigger lazy loading
  _ = page.width, page.height
-
+
  self.profiler.take_snapshot(
- f"page_access_{i+1}",
- page_count=i+1,
+ f"page_access_{i+1}",
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  page_width=page.width,
- page_height=page.height
+ page_height=page.height,
  )
-
+
  def test_describe_pages(self, max_pages: int = 5):
  """Test using .describe() on pages"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  # Use describe to understand page content
  try:
  description = page.describe()
-
+
  self.profiler.take_snapshot(
  f"describe_{i+1}",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
- description_length=len(description) if description else 0
+ description_length=len(description) if description else 0,
  )
  except Exception as e:
  self.profiler.take_snapshot(
- f"describe_{i+1}_error",
- page_count=i+1,
- pdf_name=self.pdf_name,
- error=str(e)
+ f"describe_{i+1}_error", page_count=i + 1, pdf_name=self.pdf_name, error=str(e)
  )
-
+
  def test_element_collections(self, max_pages: int = 5):
  """Test find_all operations that create element collections"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  # Test different element collection operations
  operations = [
  ("words", lambda p: p.find_all("words")),
@@ -174,121 +177,118 @@ class PDFPerformanceTester:
  ("rects", lambda p: p.find_all("rect")),
  ("large_text", lambda p: p.find_all("text[size>12]")),
  ]
-
+
  for op_name, operation in operations:
  try:
  elements = operation(page)
  element_count = len(elements) if elements else 0
-
+
  self.profiler.take_snapshot(
  f"{op_name}_{i+1}",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type=op_name,
- element_count=element_count
+ element_count=element_count,
  )
  except Exception as e:
  self.profiler.take_snapshot(
  f"{op_name}_{i+1}_error",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type=op_name,
- error=str(e)
+ error=str(e),
  )
-
+
  def test_image_generation(self, max_pages: int = 3, resolutions: List[int] = [72, 144, 216]):
  """Test image generation at different resolutions"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  for resolution in resolutions:
  try:
  img = page.to_image(resolution=resolution)
-
+
  self.profiler.take_snapshot(
  f"image_{resolution}dpi_{i+1}",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  resolution=resolution,
- image_size=f"{img.width}x{img.height}" if img else "None"
+ image_size=f"{img.width}x{img.height}" if img else "None",
  )
-
+
  # Clean up image immediately to test memory release
  del img
-
+
  except Exception as e:
  self.profiler.take_snapshot(
  f"image_{resolution}dpi_{i+1}_error",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  resolution=resolution,
- error=str(e)
+ error=str(e),
  )
-
+
  def test_ocr(self, max_pages: int = 2):
  """Test OCR operations (expensive!)"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  try:
  # Run OCR
  page.apply_ocr(engine="easyocr") # Default engine
-
+
  self.profiler.take_snapshot(
- f"ocr_{i+1}",
- page_count=i+1,
- pdf_name=self.pdf_name,
- operation_type="ocr"
+ f"ocr_{i+1}", page_count=i + 1, pdf_name=self.pdf_name, operation_type="ocr"
  )
-
+
  except Exception as e:
  self.profiler.take_snapshot(
  f"ocr_{i+1}_error",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type="ocr",
- error=str(e)
+ error=str(e),
  )
-
+
  def test_layout_analysis(self, max_pages: int = 3):
  """Test layout analysis operations"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  try:
  # Run layout analysis
  layout_result = page.analyze_layout()
-
+
  self.profiler.take_snapshot(
  f"layout_{i+1}",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type="layout",
- layout_regions=len(layout_result) if layout_result else 0
+ layout_regions=len(layout_result) if layout_result else 0,
  )
-
+
  except Exception as e:
  self.profiler.take_snapshot(
  f"layout_{i+1}_error",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type="layout",
- error=str(e)
+ error=str(e),
  )

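
The methods in the hunks above exercise the page-level rendering, OCR, and layout APIs; the remaining hunks of this file cover the test driver. A short usage sketch of those page-level calls, using only the calls and argument values visible in the diff (resolutions 72/144/216 dpi and the "easyocr" engine); the PDF path is hypothetical:

```python
# Sketch of the page-level operations profiled above; illustrative only.
import natural_pdf as npdf

pdf = npdf.PDF("some_large_scanned.pdf")  # hypothetical input
page = pdf.pages[0]

for resolution in (72, 144, 216):
    img = page.to_image(resolution=resolution)
    print(f"{resolution} dpi -> {img.width}x{img.height}")
    del img  # dropped immediately, as the memory test does

page.apply_ocr(engine="easyocr")   # expensive; the profiler limits OCR to 2 pages
regions = page.analyze_layout()    # layout analysis, returns detected regions
print(f"layout regions: {len(regions) if regions else 0}")
```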
@@ -298,43 +298,43 @@ def run_comprehensive_test(pdf_path: str, test_name: str):
  print(f"COMPREHENSIVE TEST: {test_name}")
  print(f"PDF: {pdf_path}")
  print(f"{'='*60}")
-
+
  profiler = PerformanceProfiler()
  tester = PDFPerformanceTester(pdf_path, profiler)
-
+
  # Initial baseline
  profiler.take_snapshot("baseline_start", pdf_name=Path(pdf_path).stem)
-
+
  # Test sequence
  print("\n1. Testing PDF Load...")
  tester.test_load_pdf()
-
+
  print("\n2. Testing Page Access...")
  tester.test_page_access(max_pages=10)
-
+
  print("\n3. Testing Describe Operations...")
  tester.test_describe_pages(max_pages=5)
-
+
  print("\n4. Testing Element Collections...")
  tester.test_element_collections(max_pages=5)
-
+
  print("\n5. Testing Image Generation...")
  tester.test_image_generation(max_pages=3)
-
+
  print("\n6. Testing Layout Analysis...")
  tester.test_layout_analysis(max_pages=3)
-
+
  # OCR test (only for image-heavy PDFs)
  if "OCR" in pdf_path or "image" in test_name.lower():
  print("\n7. Testing OCR (Image-heavy PDF)...")
  tester.test_ocr(max_pages=2)
-
+
  # Final snapshot
  profiler.take_snapshot("test_complete", pdf_name=Path(pdf_path).stem)
-
+
  # Save results
  df = profiler.save_results(test_name)
-
+
  # Quick analysis
  print(f"\n{'-'*40}")
  print("QUICK ANALYSIS:")
@@ -342,7 +342,7 @@ def run_comprehensive_test(pdf_path: str, test_name: str):
  print(f"Memory Growth: {df['rss_mb'].iloc[-1] - df['rss_mb'].iloc[0]:.1f} MB")
  print(f"Peak Objects: {df['python_objects'].max():,}")
  print(f"Total Time: {df['timestamp'].iloc[-1]:.1f} seconds")
-
+
  return df

@@ -350,22 +350,23 @@ def main():
  """Main test runner"""
  print("Natural PDF Performance Analysis Micro-Suite")
  print("=" * 50)
-
+
  # Find test PDFs
  large_pdfs_dir = Path("pdfs/hidden/large")
  if not large_pdfs_dir.exists():
  print(f"Error: {large_pdfs_dir} not found")
  print("Please ensure large test PDFs are available")
  return
-
+
  # Expected test PDFs
  test_pdfs = {
  "text_heavy": large_pdfs_dir / "appendix_fy2026.pdf",
- "image_heavy": large_pdfs_dir / "OCR 0802030-56.2022.8.14.0060_Cópia integral_Fazenda Marrocos.pdf"
+ "image_heavy": large_pdfs_dir
+ / "OCR 0802030-56.2022.8.14.0060_Cópia integral_Fazenda Marrocos.pdf",
  }
-
+
  results = {}
-
+
  for test_name, pdf_path in test_pdfs.items():
  if pdf_path.exists():
  try:
@@ -375,23 +376,23 @@ def main():
  traceback.print_exc()
  else:
  print(f"Warning: {pdf_path} not found, skipping {test_name} test")
-
+
  # Generate comparison report
  if results:
  print(f"\n{'='*60}")
  print("COMPARISON SUMMARY")
  print(f"{'='*60}")
-
+
  for test_name, df in results.items():
  print(f"\n{test_name.upper()}:")
  print(f" Peak Memory: {df['rss_mb'].max():.1f} MB")
  print(f" Memory Growth: {df['rss_mb'].iloc[-1] - df['rss_mb'].iloc[0]:.1f} MB")
  print(f" Peak Objects: {df['python_objects'].max():,}")
  print(f" Duration: {df['timestamp'].iloc[-1]:.1f}s")
-
+
  print(f"\nResults saved to performance_results/ directory")
  print("Use the CSV files for detailed analysis")


  if __name__ == "__main__":
- main()
+ main()
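
The hunks above for the performance-analysis script are largely mechanical reformatting (import ordering, quote style, trailing commas, whitespace stripping); the measurement pattern they wrap is unchanged. A condensed, hedged sketch of that pattern, using only calls that appear in the diff (the PDF path is the "text_heavy" test file named above; the real script wraps this in PerformanceProfiler.take_snapshot). The remaining hunks below belong to the manager-cleanup test module.

```python
# Condensed sketch of the profiling loop shown in the diff above (illustrative,
# not the exact PerformanceProfiler API).
import gc
import time
import tracemalloc

import psutil

import natural_pdf as npdf

tracemalloc.start()
process = psutil.Process()
start = time.time()

pdf = npdf.PDF("pdfs/hidden/large/appendix_fy2026.pdf")  # "text_heavy" test PDF from the diff
for i in range(min(5, len(pdf.pages))):
    page = pdf.pages[i]
    _ = page.width, page.height     # touch the page to trigger lazy loading
    words = page.find_all("words")  # one of the element-collection operations profiled
    gc.collect()                    # force collection before measuring, as take_snapshot() does
    rss_mb = process.memory_info().rss / 1024 / 1024
    print(
        f"[{time.time() - start:.1f}s] page {i + 1}: "
        f"{rss_mb:.1f}MB RSS, {len(words)} words, {len(gc.get_objects())} objects"
    )
```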
@@ -12,124 +12,135 @@ import gc
  import os
  import sys
  from pathlib import Path
+
  import pytest

  import natural_pdf as npdf
- from natural_pdf.ocr.ocr_manager import OCRManager
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
  from natural_pdf.classification.manager import ClassificationManager
+ from natural_pdf.ocr.ocr_manager import OCRManager


  class TestCleanupMethods:
  """Test suite for manager cleanup methods"""
-
+
  def test_ocr_manager_cleanup_empty(self):
  """Test OCR manager cleanup when no engines are loaded"""
  manager = OCRManager()
-
+
  # Test cleanup when nothing is loaded
  count = manager.cleanup_engine()
  assert count == 0, "Should return 0 when no engines loaded"
-
+
  # Test cleanup of specific non-existent engine
  count = manager.cleanup_engine("nonexistent")
  assert count == 0, "Should return 0 when engine doesn't exist"
-
+
  def test_layout_manager_cleanup_empty(self):
  """Test Layout manager cleanup when no detectors are loaded"""
  manager = LayoutManager()
-
+
  # Test cleanup when nothing is loaded
  count = manager.cleanup_detector()
  assert count == 0, "Should return 0 when no detectors loaded"
-
+
  # Test cleanup of specific non-existent detector
  count = manager.cleanup_detector("nonexistent")
  assert count == 0, "Should return 0 when detector doesn't exist"
-
+
  def test_classification_manager_cleanup_empty(self):
  """Test Classification manager cleanup when no models are loaded"""
  try:
  manager = ClassificationManager()
-
+
  # Test cleanup when nothing is loaded
  count = manager.cleanup_models()
  assert count == 0, "Should return 0 when no models loaded"
-
+
  # Test cleanup of specific non-existent model
  count = manager.cleanup_models("nonexistent/model")
  assert count == 0, "Should return 0 when model doesn't exist"
-
+
  except ImportError:
  pytest.skip("Classification dependencies not available")
-
+
  def test_ocr_manager_cleanup_with_engine(self):
  """Test OCR manager cleanup after loading an engine"""
  manager = OCRManager()
-
+
  # Check if any OCR engines are available
  available_engines = manager.get_available_engines()
  if not available_engines:
  pytest.skip("No OCR engines available for testing")
-
+
  engine_name = available_engines[0]
  print(f"Testing with OCR engine: {engine_name}")
-
+
  # Load an engine by accessing it
  try:
  engine_instance = manager._get_engine_instance(engine_name)
  assert engine_name in manager._engine_instances, "Engine should be cached"
-
+
  # Test cleanup of specific engine
  count = manager.cleanup_engine(engine_name)
  assert count == 1, f"Should return 1 after cleaning up {engine_name}"
- assert engine_name not in manager._engine_instances, "Engine should be removed from cache"
-
+ assert (
+ engine_name not in manager._engine_instances
+ ), "Engine should be removed from cache"
+
  except Exception as e:
  pytest.skip(f"Could not load {engine_name} engine: {e}")
-
+
  def test_layout_manager_cleanup_with_detector(self):
  """Test Layout manager cleanup after loading a detector"""
  manager = LayoutManager()
-
+
  # Check if any layout engines are available
  available_engines = manager.get_available_engines()
  if not available_engines:
  pytest.skip("No layout engines available for testing")
-
+
  engine_name = available_engines[0]
  print(f"Testing with layout engine: {engine_name}")
-
+
  # Load a detector by accessing it
  try:
  detector_instance = manager._get_engine_instance(engine_name)
  assert engine_name in manager._detector_instances, "Detector should be cached"
-
+
  # Test cleanup of specific detector
  count = manager.cleanup_detector(engine_name)
  assert count == 1, f"Should return 1 after cleaning up {engine_name}"
- assert engine_name not in manager._detector_instances, "Detector should be removed from cache"
-
+ assert (
+ engine_name not in manager._detector_instances
+ ), "Detector should be removed from cache"
+
  except Exception as e:
  pytest.skip(f"Could not load {engine_name} detector: {e}")
-
+
  def test_methods_exist(self):
  """Test that all cleanup methods exist and are callable"""
  # Test OCRManager
  manager = OCRManager()
- assert hasattr(manager, 'cleanup_engine'), "OCRManager should have cleanup_engine method"
+ assert hasattr(manager, "cleanup_engine"), "OCRManager should have cleanup_engine method"
  assert callable(manager.cleanup_engine), "cleanup_engine should be callable"
-
+
  # Test LayoutManager
  layout_manager = LayoutManager()
- assert hasattr(layout_manager, 'cleanup_detector'), "LayoutManager should have cleanup_detector method"
+ assert hasattr(
+ layout_manager, "cleanup_detector"
+ ), "LayoutManager should have cleanup_detector method"
  assert callable(layout_manager.cleanup_detector), "cleanup_detector should be callable"
-
+
  # Test ClassificationManager (if available)
  try:
  classification_manager = ClassificationManager()
- assert hasattr(classification_manager, 'cleanup_models'), "ClassificationManager should have cleanup_models method"
- assert callable(classification_manager.cleanup_models), "cleanup_models should be callable"
+ assert hasattr(
+ classification_manager, "cleanup_models"
+ ), "ClassificationManager should have cleanup_models method"
+ assert callable(
+ classification_manager.cleanup_models
+ ), "cleanup_models should be callable"
  except ImportError:
  print("Classification dependencies not available, skipping ClassificationManager test")

@@ -137,19 +148,19 @@ class TestCleanupMethods:
  def main():
  """Run the cleanup method tests"""
  print("Testing manager cleanup methods...")
-
+
  # Run pytest on just this file
  exit_code = pytest.main([__file__, "-v", "-s"])
-
+
  if exit_code == 0:
  print("\n✅ All cleanup method tests passed!")
  print("The memory management methods are working correctly.")
  else:
  print("\n❌ Some tests failed!")
  print("The cleanup methods need investigation.")
-
+
  return exit_code


  if __name__ == "__main__":
- exit(main())
+ exit(main())
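
The cleanup methods exercised by these tests can also be called directly when reclaiming memory between jobs. A short sketch based only on the calls visible in the tests above (per the assertions, each call returns a count of released engines, detectors, or models); the ImportError handling mirrors the tests' skip path:

```python
# Sketch of the manager cleanup calls exercised by the tests above; illustrative only.
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
from natural_pdf.ocr.ocr_manager import OCRManager

ocr = OCRManager()
layout = LayoutManager()

released_engines = ocr.cleanup_engine()         # all cached OCR engines; returns a count
released_detectors = layout.cleanup_detector()  # all cached layout detectors; returns a count
print(f"released {released_engines} OCR engine(s), {released_detectors} layout detector(s)")

try:
    from natural_pdf.classification.manager import ClassificationManager

    released_models = ClassificationManager().cleanup_models()
    print(f"released {released_models} classification model(s)")
except ImportError:
    print("classification dependencies not installed; nothing to clean up")
```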