ptdu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ptdu/scanner.py ADDED
@@ -0,0 +1,490 @@
1
+ """Directory scanning functionality for PTDU."""
2
+
3
from __future__ import annotations

import fnmatch
import os
import platform
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Optional
10
+
11
+ if TYPE_CHECKING:
12
+ from ptdu.cache import ScanCache
13
+
14
+
15
@dataclass
class ScanResult:
    """One scanned filesystem entry (file or directory).

    Attributes:
        path: Location of the entry.
        name: Entry name as reported by the scan.
        size: Size in bytes; the scanner stores 0 for directories.
        is_dir: True when the entry is a directory.
        error: Message describing why the entry could not be read, or None
            when it was read without problems.
    """

    # Where the entry lives and what it is called.
    path: Path
    name: str

    # What was measured about it.
    size: int
    is_dir: bool

    # Populated only when reading the entry failed.
    error: Optional[str] = None
24
+
25
+
26
class Scanner:
    """Directory scanner using os.scandir() for optimal performance.

    Paths are filtered through ``skip_patterns`` and the exclude patterns
    (see :meth:`should_skip` for the matching rules), dotfiles can be hidden,
    and symbolic links are only followed when ``follow_symlinks`` is set.
    """

    # Default patterns to skip (system directories and common VCS/tool caches).
    # "*.egg-info" covers "<name>.egg-info" directories now that name patterns
    # are matched per path component instead of as raw substrings.
    DEFAULT_SKIP_PATTERNS: tuple[str, ...] = (
        "/proc",
        "/sys",
        "/dev",
        "/run",
        "/boot",
        ".git",
        ".svn",
        ".hg",
        "__pycache__",
        ".pytest_cache",
        ".mypy_cache",
        ".egg-info",
        "*.egg-info",
        ".tox",
        "node_modules",
        ".aider*",
    )

    # Characters that make a name pattern a shell-style glob.
    _GLOB_CHARS = frozenset("*?[")

    def __init__(
        self,
        skip_patterns: Optional[tuple[str, ...]] = None,
        follow_symlinks: bool = False,
        show_hidden: bool = True,
        exclude_patterns: Optional[tuple[str, ...]] = None,
        cache: Optional[ScanCache] = None,
    ) -> None:
        """Initialize scanner with configuration.

        Args:
            skip_patterns: Tuple of path patterns to skip
                (DEFAULT_SKIP_PATTERNS is used when None or empty)
            follow_symlinks: Whether to follow symbolic links
            show_hidden: Whether to show hidden files (dotfiles)
            exclude_patterns: Additional patterns to exclude from scans
            cache: Optional ScanCache for caching results
        """
        self.skip_patterns: tuple[str, ...] = (
            skip_patterns or self.DEFAULT_SKIP_PATTERNS
        )
        self.follow_symlinks: bool = follow_symlinks
        self._show_hidden: bool = show_hidden
        self._exclude_patterns: tuple[str, ...] = exclude_patterns or ()
        self._cache: Optional[ScanCache] = cache
        self._use_cache: bool = cache is not None

    @staticmethod
    def _pattern_matches(
        pattern: str, path_str: str, parts: tuple[str, ...]
    ) -> bool:
        """Return True if a single skip/exclude pattern matches a path.

        Args:
            pattern: Pattern to test
            path_str: ``str()`` of the path being tested
            parts: ``Path.parts`` of the path being tested

        Returns:
            True if the pattern matches
        """
        if "/" not in pattern:
            if not pattern:
                # An empty pattern must not match every path.
                return False
            if Scanner._GLOB_CHARS.isdisjoint(pattern):
                # Plain name: must equal a whole component, so ".git" does
                # not match ".github" or ".gitignore".
                return pattern in parts
            return any(fnmatch.fnmatchcase(part, pattern) for part in parts)

        if pattern.startswith("/"):
            # Absolute prefix: match the directory itself or anything below
            # it, but not siblings sharing the prefix ("/dev" vs "/devel").
            prefix = pattern.rstrip("/")
            return path_str == prefix or path_str.startswith(prefix + "/")

        # Relative multi-component pattern (e.g. "build/output"): match a
        # contiguous run of path components.
        wanted = [piece for piece in pattern.split("/") if piece]
        if not wanted:
            return False
        span = len(wanted)
        return any(
            all(
                fnmatch.fnmatchcase(part, piece)
                for part, piece in zip(parts[start:start + span], wanted)
            )
            for start in range(len(parts) - span + 1)
        )

    def should_skip(self, path: Path) -> bool:
        """Check if a path should be skipped based on skip/exclude patterns.

        Matching rules:
            * Patterns starting with "/" are absolute prefixes: they match
              that path and everything beneath it.
            * Other patterns are case-sensitive shell-style globs
              (``*``, ``?``, ``[...]``) compared against each path component,
              so a match at any level of the path skips it.
            * Relative patterns containing "/" match a contiguous run of
              components.

        Args:
            path: Path to check

        Returns:
            True if path should be skipped
        """
        path_str = str(path)
        parts = path.parts
        for patterns in (self.skip_patterns, self._exclude_patterns):
            for pattern in patterns:
                if self._pattern_matches(pattern, path_str, parts):
                    return True
        return False

    def set_exclude_patterns(self, patterns: tuple[str, ...]) -> None:
        """Set exclude patterns.

        Args:
            patterns: Patterns to exclude (replaces the current ones)
        """
        self._exclude_patterns = patterns

    def add_exclude_pattern(self, pattern: str) -> None:
        """Add an exclude pattern.

        Args:
            pattern: Pattern to add
        """
        self._exclude_patterns = self._exclude_patterns + (pattern,)

    def set_cache(self, cache: Optional[ScanCache]) -> None:
        """Set cache for scan results.

        Args:
            cache: ScanCache instance or None to disable caching
        """
        self._cache = cache
        self._use_cache = cache is not None

    def try_load_from_cache(self, path: Path) -> Optional[list[ScanResult]]:
        """Try to load scan results from cache.

        Args:
            path: Directory path

        Returns:
            List of ScanResult or None if not cached/invalid
        """
        if not self._use_cache or self._cache is None:
            return None

        try:
            entries = self._cache.get(path)
            if entries is None:
                return None

            return [
                ScanResult(
                    path=Path(e.path),
                    name=e.name,
                    size=e.size,
                    is_dir=e.is_dir,
                )
                for e in entries
            ]
        except Exception:
            # Deliberately best-effort: any cache failure is treated as a
            # cache miss so the caller falls back to a real scan.
            return None

    def store_in_cache(self, path: Path, results: list[ScanResult]) -> None:
        """Store scan results in cache.

        Args:
            path: Directory path
            results: Scan results to store
        """
        if not self._use_cache or self._cache is None:
            return

        try:
            self._cache.store(path, results)
        except Exception:
            # Cache errors are non-critical
            pass

    def is_hidden(self, path: Path) -> bool:
        """Check if a path is a hidden file (name starts with ".").

        The special name ".." is not considered hidden, so a parent
        directory reference can still be scanned when hidden files are off.

        Args:
            path: Path to check

        Returns:
            True if path is hidden
        """
        name = path.name
        return name.startswith(".") and name != ".."

    def set_show_hidden(self, show_hidden: bool) -> None:
        """Set whether to show hidden files.

        Args:
            show_hidden: True to show hidden files, False to hide them
        """
        self._show_hidden = show_hidden

    def should_show(self, path: Path) -> bool:
        """Check if a path should be shown based on filters.

        Args:
            path: Path to check

        Returns:
            True if path is neither hidden (when hiding is on) nor skipped
        """
        if not self._show_hidden and self.is_hidden(path):
            return False
        return not self.should_skip(path)

    @staticmethod
    def _error_result(
        path: Path, name: str, is_dir: bool, message: str
    ) -> ScanResult:
        """Build a zero-sized ScanResult carrying an error message."""
        return ScanResult(
            path=path, name=name, size=0, is_dir=is_dir, error=message
        )

    def scan_directory(
        self,
        path: Path,
        progress_callback: Optional[Callable[[ScanResult], None]] = None,
    ) -> list[ScanResult]:
        """Scan a directory (non-recursively) and return results.

        Failures never raise OSError to the caller: an unreadable entry or
        directory is reported as a ScanResult with ``error`` set.

        Args:
            path: Directory path to scan
            progress_callback: Optional callback called for each entry found

        Returns:
            List of ScanResult objects; empty if ``path`` itself is filtered
            out by :meth:`should_show`
        """
        results: list[ScanResult] = []

        # should_show() already includes the should_skip() check.
        if not self.should_show(path):
            return results

        try:
            with os.scandir(path) as entries:
                for entry in entries:
                    # Skip hidden files if not showing them
                    if not self._show_hidden and entry.name.startswith("."):
                        continue
                    try:
                        result = self._process_entry(entry)
                        if result is None:
                            continue
                        results.append(result)
                        if progress_callback is not None:
                            progress_callback(result)
                    except OSError as e:
                        # Record the failure but continue scanning
                        error_result = self._error_result(
                            Path(entry.path), entry.name, False, str(e)
                        )
                        results.append(error_result)
                        if progress_callback is not None:
                            progress_callback(error_result)

        except PermissionError:
            # Cannot access directory
            results.append(
                self._error_result(path, path.name, True, "Permission denied")
            )
        except FileNotFoundError:
            # Directory was deleted during scan
            results.append(
                self._error_result(path, path.name, True, "Directory not found")
            )
        except OSError as e:
            # Other OS errors
            results.append(self._error_result(path, path.name, True, str(e)))

        return results

    def _process_entry(self, entry: os.DirEntry[str]) -> Optional[ScanResult]:
        """Process a single directory entry.

        Args:
            entry: os.DirEntry object

        Returns:
            ScanResult or None if entry should be skipped
        """
        path = Path(entry.path)

        # Check skip patterns
        if self.should_skip(path):
            return None

        # Check hidden files
        if not self._show_hidden and entry.name.startswith("."):
            return None

        try:
            if entry.is_symlink() and not self.follow_symlinks:
                # Unfollowed symlinks are reported as zero-sized files.
                return ScanResult(
                    path=path,
                    name=entry.name,
                    size=0,
                    is_dir=False,
                )

            is_dir = entry.is_dir(follow_symlinks=self.follow_symlinks)

            # Directory sizes are computed from children, so only files need
            # stat(); this avoids a possible extra system call per directory.
            if is_dir:
                size = 0
            else:
                size = entry.stat(follow_symlinks=self.follow_symlinks).st_size

            return ScanResult(
                path=path,
                name=entry.name,
                size=size,
                is_dir=is_dir,
            )

        except OSError:
            # Broken symlink or permission issue (PermissionError is an
            # OSError subclass, so it is covered here too).
            return ScanResult(
                path=path,
                name=entry.name,
                size=0,
                is_dir=False,
                error="Cannot access entry",
            )

    @staticmethod
    def _mark_visited(path: Path, visited: set[tuple[int, int]]) -> bool:
        """Record a directory's identity; report whether it is new.

        Used to break symlink cycles when following links.

        Args:
            path: Directory about to be scanned
            visited: Set of (st_dev, st_ino) pairs already scanned; updated
                in place

        Returns:
            False if this directory was already visited, True otherwise
            (including when its identity cannot be determined)
        """
        try:
            info = os.stat(path)
        except OSError:
            # Let the actual scan report the error for this directory.
            return True
        if info.st_ino == 0:
            # No usable inode number; cannot de-duplicate safely.
            return True
        identity = (info.st_dev, info.st_ino)
        if identity in visited:
            return False
        visited.add(identity)
        return True

    def scan_recursive(
        self,
        path: Path,
        max_depth: Optional[int] = None,
        current_depth: int = 0,
        progress_callback: Optional[Callable[[ScanResult], None]] = None,
        _visited: Optional[set[tuple[int, int]]] = None,
    ) -> list[ScanResult]:
        """Recursively scan a directory.

        Results are returned depth-first: a directory's own entries come
        first, followed by the results of each of its subdirectories.
        When following symlinks, each real directory is scanned at most once
        so that link cycles terminate.

        Args:
            path: Directory path to scan
            max_depth: Maximum recursion depth (None for unlimited)
            current_depth: Current depth (used internally)
            progress_callback: Optional callback for progress updates
            _visited: Internal bookkeeping for symlink-cycle detection;
                callers should not pass it

        Returns:
            List of all ScanResult objects
        """
        if max_depth is not None and current_depth >= max_depth:
            return []

        if self.should_skip(path):
            return []

        if self.follow_symlinks:
            # Without following links the tree cannot loop, so the extra
            # stat() is only paid when it is needed.
            if _visited is None:
                _visited = set()
            if not self._mark_visited(path, _visited):
                return []

        results = self.scan_directory(path, progress_callback)

        # Snapshot the subdirectories before extending `results`.
        subdirs = [r for r in results if r.is_dir and r.error is None]
        for subdir in subdirs:
            results.extend(
                self.scan_recursive(
                    subdir.path,
                    max_depth=max_depth,
                    current_depth=current_depth + 1,
                    progress_callback=progress_callback,
                    _visited=_visited,
                )
            )

        return results

    def get_directory_size(self, path: Path) -> int:
        """Calculate total size of a directory recursively.

        Entries matching the skip/exclude patterns are not counted, unfollowed
        symlinks are ignored, and inaccessible entries or directories
        contribute 0. The walk is iterative, so deep trees cannot exhaust the
        recursion limit, and followed symlink cycles are visited only once.

        Args:
            path: Directory path

        Returns:
            Total size in bytes
        """
        if self.should_skip(path):
            return 0

        total_size = 0
        visited: set[tuple[int, int]] = set()
        pending: list[Path] = [path]

        while pending:
            current = pending.pop()
            if self.follow_symlinks and not self._mark_visited(current, visited):
                continue

            try:
                with os.scandir(current) as entries:
                    for entry in entries:
                        try:
                            if entry.is_symlink() and not self.follow_symlinks:
                                continue

                            entry_path = Path(entry.path)
                            # Apply the same filters as the scan methods so
                            # totals agree with scanned results.
                            if self.should_skip(entry_path):
                                continue

                            if entry.is_dir(follow_symlinks=self.follow_symlinks):
                                pending.append(entry_path)
                            else:
                                stat_info = entry.stat(
                                    follow_symlinks=self.follow_symlinks
                                )
                                total_size += stat_info.st_size
                        except OSError:
                            # Skip inaccessible entries
                            continue
            except OSError:
                # Cannot access directory
                continue

        return total_size
421
+
422
+
423
class MemoryMonitor:
    """Monitors memory usage and provides warnings."""

    def __init__(self, warning_threshold_mb: int = 500) -> None:
        """Initialize memory monitor.

        The critical threshold is fixed at twice the warning threshold.

        Args:
            warning_threshold_mb: Memory threshold for warnings (MB)
        """
        self._warning_threshold: int = warning_threshold_mb * 1024 * 1024
        self._critical_threshold: int = warning_threshold_mb * 2 * 1024 * 1024

    def get_memory_usage(self) -> dict[str, int]:
        """Get memory usage of the current process.

        The value comes from ``getrusage().ru_maxrss``, which is the *peak*
        resident set size reached so far, not the instantaneous one.

        Returns:
            Dictionary with memory stats in bytes: "rss" (peak resident set
            size, 0 when the ``resource`` module is unavailable) and "vms"
            (always 0; not collected)
        """
        try:
            import resource
        except ImportError:
            # Fallback for non-Unix systems
            return {"rss": 0, "vms": 0}

        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux;
        # scaling unconditionally would inflate macOS figures 1024-fold.
        bytes_per_unit = 1 if platform.system() == "Darwin" else 1024
        return {
            "rss": peak * bytes_per_unit,
            "vms": 0,  # Not available on all platforms
        }

    def check_memory(self) -> tuple[bool, bool]:
        """Check memory usage against thresholds.

        Returns:
            Tuple of (warning_triggered, critical_triggered); each is True
            when the reported RSS strictly exceeds the respective threshold
        """
        usage = self.get_memory_usage()
        rss = usage.get("rss", 0)

        warning = rss > self._warning_threshold
        critical = rss > self._critical_threshold

        return warning, critical
466
+
467
+
468
def get_system_info() -> dict[str, str | int]:
    """Collect basic facts about the host for debugging output.

    Returns:
        Dictionary with the platform string, Python version, processor and
        machine type; where the ``resource`` module is usable it also holds
        the soft and hard open-file limits.
    """
    details: dict[str, str | int] = {}
    details["platform"] = platform.platform()
    details["python_version"] = platform.python_version()
    details["processor"] = platform.processor()
    details["machine"] = platform.machine()

    try:
        import resource

        limits = resource.getrlimit(resource.RLIMIT_NOFILE)
    except (ImportError, OSError):
        # No resource module or the limit cannot be read: report only the
        # platform facts gathered above.
        return details

    details["max_open_files_soft"], details["max_open_files_hard"] = limits
    return details