additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,1119 +0,0 @@
1
- # enhanced_cache_manager.py
2
- # Content-hash based cache manager for enhanced expressions system
3
-
4
- import os
5
- import json
6
- import shutil
7
- import hashlib
8
- import threading
9
- from typing import Dict, List, Optional, Set, Any, Tuple
10
- from dataclasses import dataclass, field
11
- from datetime import datetime, timedelta
12
- from pathlib import Path
13
-
14
- from .logging import log_info, log_warning
15
- from .integrity_manager import IntegrityManager, SecurityError
16
- from .namespace_manager import NamespaceManager
17
- from .enhanced_version_manager import EnhancedVersionManager
18
-
19
-
20
- @dataclass
21
- class CacheEntry:
22
- """Represents a cached expression entry"""
23
- expression_name: str
24
- version: str
25
- namespace: str
26
- source_path: str
27
- cached_path: str
28
- content_hash: str
29
- cached_at: datetime
30
- last_accessed: datetime
31
- access_count: int = 0
32
- file_size: int = 0
33
- integrity_verified: bool = False
34
- metadata: Dict[str, Any] = field(default_factory=dict)
35
-
36
-
37
- @dataclass
38
- class CacheStats:
39
- """Cache statistics and metrics"""
40
- total_entries: int = 0
41
- cache_hits: int = 0
42
- cache_misses: int = 0
43
- integrity_failures: int = 0
44
- cache_size_bytes: int = 0
45
- last_cleanup: Optional[datetime] = None
46
- corruption_recoveries: int = 0
47
-
48
-
49
- class CacheCorruptionError(Exception):
50
- """Raised when cache corruption is detected"""
51
- pass
52
-
53
-
54
- class CacheValidationError(Exception):
55
- """Raised when cache validation fails"""
56
- pass
57
-
58
-
59
- class EnhancedCacheManager:
60
- """Content-hash based cache manager with namespace separation"""
61
-
62
- def __init__(self):
63
- # Cache paths for different namespaces
64
- self.cache_paths = {
65
- "builtin": os.path.expanduser("~/.additory/cache/expressions/core/"),
66
- "user": os.path.expanduser("~/.additory/cache/expressions/user/")
67
- }
68
-
69
- # Component managers
70
- self.integrity_manager = IntegrityManager()
71
- self.namespace_manager = NamespaceManager()
72
- self.version_manager = EnhancedVersionManager()
73
-
74
- # Cache state
75
- self.cache_entries: Dict[str, Dict[str, CacheEntry]] = {
76
- "builtin": {},
77
- "user": {}
78
- }
79
-
80
- # Statistics
81
- self.stats = CacheStats()
82
-
83
- # Thread safety
84
- self._cache_lock = threading.RLock()
85
-
86
- # Configuration
87
- self.max_cache_age_days = 30
88
- self.max_cache_size_mb = 500
89
- self.auto_cleanup_enabled = True
90
-
91
- # Initialize cache directories and load existing cache
92
- self._initialize_cache()
93
-
94
- log_info("[cache_manager] Enhanced Cache Manager initialized")
95
-
96
- def _initialize_cache(self):
97
- """Initialize cache directories and load existing cache metadata"""
98
- try:
99
- # Ensure cache directories exist
100
- for namespace, cache_path in self.cache_paths.items():
101
- os.makedirs(cache_path, exist_ok=True)
102
- log_info(f"[cache_manager] Ensured cache directory: {cache_path}")
103
-
104
- # Load existing cache metadata
105
- self._load_cache_metadata()
106
-
107
- except Exception as e:
108
- log_warning(f"[cache_manager] Failed to initialize cache: {e}")
109
-
110
- def _load_cache_metadata(self):
111
- """Load cache metadata from all namespaces"""
112
- with self._cache_lock:
113
- for namespace in self.cache_paths:
114
- try:
115
- metadata_path = os.path.join(self.cache_paths[namespace], "metadata.json")
116
-
117
- if os.path.exists(metadata_path):
118
- with open(metadata_path, 'r', encoding='utf-8') as f:
119
- metadata = json.load(f)
120
-
121
- # Load cache entries
122
- expressions = metadata.get("expressions", {})
123
- for filename, entry_data in expressions.items():
124
- cache_entry = self._parse_cache_entry(entry_data, namespace)
125
- if cache_entry:
126
- key = f"{cache_entry.expression_name}_{cache_entry.version}"
127
- self.cache_entries[namespace][key] = cache_entry
128
-
129
- log_info(f"[cache_manager] Loaded {len(expressions)} cache entries for {namespace}")
130
-
131
- except Exception as e:
132
- log_warning(f"[cache_manager] Failed to load cache metadata for {namespace}: {e}")
133
-
134
- def _parse_cache_entry(self, entry_data: dict, namespace: str) -> Optional[CacheEntry]:
135
- """Parse cache entry from metadata"""
136
- try:
137
- return CacheEntry(
138
- expression_name=entry_data.get("expression_name", ""),
139
- version=entry_data.get("version", ""),
140
- namespace=namespace,
141
- source_path=entry_data.get("source_path", ""),
142
- cached_path=entry_data.get("cached_path", ""),
143
- content_hash=entry_data.get("content_hash", ""),
144
- cached_at=datetime.fromisoformat(entry_data.get("cached_at", datetime.now().isoformat())),
145
- last_accessed=datetime.fromisoformat(entry_data.get("last_accessed", datetime.now().isoformat())),
146
- access_count=entry_data.get("access_count", 0),
147
- file_size=entry_data.get("file_size", 0),
148
- integrity_verified=entry_data.get("integrity_verified", False),
149
- metadata=entry_data.get("metadata", {})
150
- )
151
- except Exception as e:
152
- log_warning(f"[cache_manager] Failed to parse cache entry: {e}")
153
- return None
154
-
155
- def cache_expression(self, source_path: str, namespace: str,
156
- expression_name: str, version: str) -> bool:
157
- """
158
- Cache an expression file with content-hash validation
159
-
160
- Args:
161
- source_path: Path to source expression file
162
- namespace: Namespace (builtin or user)
163
- expression_name: Name of the expression
164
- version: Version of the expression
165
-
166
- Returns:
167
- True if caching was successful
168
-
169
- Raises:
170
- CacheValidationError: If validation fails
171
- """
172
- with self._cache_lock:
173
- try:
174
- # Validate inputs
175
- if namespace not in self.cache_paths:
176
- raise CacheValidationError(f"Invalid namespace: {namespace}")
177
-
178
- if not os.path.exists(source_path):
179
- raise CacheValidationError(f"Source file not found: {source_path}")
180
-
181
- # Generate cache filename and path
182
- cache_filename = f"{expression_name}_{version}.add"
183
- cache_path = self.cache_paths[namespace]
184
- cached_file_path = os.path.join(cache_path, cache_filename)
185
-
186
- # Calculate content hash before copying
187
- source_content_hash = self._calculate_content_hash(source_path)
188
-
189
- # Copy file to cache
190
- shutil.copy2(source_path, cached_file_path)
191
-
192
- # Validate integrity of cached file with namespace policy
193
- try:
194
- self.integrity_manager.validate_integrity_with_policy(cached_file_path, namespace)
195
- integrity_verified = True
196
- except SecurityError as e:
197
- # For built-in namespace, re-raise the error (strict policy)
198
- if namespace == "builtin":
199
- # Clean up the cached file
200
- if os.path.exists(cached_file_path):
201
- os.remove(cached_file_path)
202
- raise CacheValidationError(f"Built-in expression integrity validation failed: {e}")
203
- # For user namespace, log warning and continue (flexible policy)
204
- log_warning(f"[cache_manager] Integrity validation failed for {expression_name}: {e}")
205
- integrity_verified = False
206
-
207
- # Create cache entry
208
- cache_entry = CacheEntry(
209
- expression_name=expression_name,
210
- version=version,
211
- namespace=namespace,
212
- source_path=source_path,
213
- cached_path=cached_file_path,
214
- content_hash=source_content_hash,
215
- cached_at=datetime.now(),
216
- last_accessed=datetime.now(),
217
- access_count=0,
218
- file_size=os.path.getsize(cached_file_path),
219
- integrity_verified=integrity_verified
220
- )
221
-
222
- # Store cache entry
223
- key = f"{expression_name}_{version}"
224
- self.cache_entries[namespace][key] = cache_entry
225
-
226
- # Update metadata
227
- self._update_cache_metadata(namespace)
228
-
229
- # Update statistics
230
- self.stats.total_entries += 1
231
- self.stats.cache_size_bytes += cache_entry.file_size
232
-
233
- log_info(f"[cache_manager] Cached expression {expression_name} v{version} in {namespace}")
234
- return True
235
-
236
- except Exception as e:
237
- # Clean up on failure
238
- if 'cached_file_path' in locals() and os.path.exists(cached_file_path):
239
- try:
240
- os.remove(cached_file_path)
241
- except Exception:
242
- pass
243
-
244
- raise CacheValidationError(f"Failed to cache expression {expression_name}: {e}")
245
-
246
- def get_cached_expression(self, expression_name: str, version: str,
247
- namespace: str) -> Optional[str]:
248
- """
249
- Get cached expression file path with validation
250
-
251
- Args:
252
- expression_name: Name of the expression
253
- version: Version of the expression
254
- namespace: Namespace to search in
255
-
256
- Returns:
257
- Path to cached file if valid, None otherwise
258
- """
259
- with self._cache_lock:
260
- try:
261
- key = f"{expression_name}_{version}"
262
-
263
- if namespace not in self.cache_entries:
264
- self.stats.cache_misses += 1
265
- return None
266
-
267
- if key not in self.cache_entries[namespace]:
268
- self.stats.cache_misses += 1
269
- return None
270
-
271
- cache_entry = self.cache_entries[namespace][key]
272
-
273
- # Check if cached file exists
274
- if not os.path.exists(cache_entry.cached_path):
275
- log_warning(f"[cache_manager] Cached file missing: {cache_entry.cached_path}")
276
- self._remove_cache_entry(namespace, key)
277
- self.stats.cache_misses += 1
278
- return None
279
-
280
- # Validate content hash
281
- if not self._validate_content_hash(cache_entry):
282
- log_warning(f"[cache_manager] Content hash validation failed for {expression_name}")
283
- self._remove_cache_entry(namespace, key)
284
- self.stats.integrity_failures += 1
285
- return None
286
-
287
- # Validate integrity if required with namespace policy
288
- if cache_entry.integrity_verified:
289
- try:
290
- self.integrity_manager.validate_integrity_with_policy(cache_entry.cached_path, namespace)
291
- except SecurityError as e:
292
- log_warning(f"[cache_manager] Integrity validation failed: {e}")
293
- self._remove_cache_entry(namespace, key)
294
- self.stats.integrity_failures += 1
295
- return None
296
-
297
- # Update access statistics
298
- cache_entry.last_accessed = datetime.now()
299
- cache_entry.access_count += 1
300
- self.stats.cache_hits += 1
301
-
302
- log_info(f"[cache_manager] Cache hit for {expression_name} v{version}")
303
- return cache_entry.cached_path
304
-
305
- except Exception as e:
306
- log_warning(f"[cache_manager] Failed to get cached expression: {e}")
307
- self.stats.cache_misses += 1
308
- return None
309
-
310
- def _calculate_content_hash(self, file_path: str) -> str:
311
- """Calculate SHA256 hash of file content"""
312
- try:
313
- with open(file_path, 'rb') as f:
314
- content = f.read()
315
- return f"sha256:{hashlib.sha256(content).hexdigest()}"
316
- except Exception as e:
317
- log_warning(f"[cache_manager] Failed to calculate content hash: {e}")
318
- return ""
319
-
320
- def _validate_content_hash(self, cache_entry: CacheEntry) -> bool:
321
- """Validate content hash of cached file"""
322
- try:
323
- current_hash = self._calculate_content_hash(cache_entry.cached_path)
324
- return current_hash == cache_entry.content_hash
325
- except Exception:
326
- return False
327
-
328
- def _remove_cache_entry(self, namespace: str, key: str):
329
- """Remove cache entry and associated file"""
330
- try:
331
- if key in self.cache_entries[namespace]:
332
- cache_entry = self.cache_entries[namespace][key]
333
-
334
- # Remove file
335
- if os.path.exists(cache_entry.cached_path):
336
- os.remove(cache_entry.cached_path)
337
-
338
- # Update statistics
339
- self.stats.cache_size_bytes -= cache_entry.file_size
340
- self.stats.total_entries -= 1
341
-
342
- # Remove from cache
343
- del self.cache_entries[namespace][key]
344
-
345
- log_info(f"[cache_manager] Removed cache entry: {key}")
346
- except Exception as e:
347
- log_warning(f"[cache_manager] Failed to remove cache entry {key}: {e}")
348
-
349
- def _update_cache_metadata(self, namespace: str):
350
- """Update cache metadata file for namespace"""
351
- try:
352
- metadata_path = os.path.join(self.cache_paths[namespace], "metadata.json")
353
-
354
- # Prepare metadata
355
- metadata = {
356
- "cache_version": "2.0",
357
- "namespace": namespace,
358
- "created_at": datetime.now().isoformat(),
359
- "last_updated": datetime.now().isoformat(),
360
- "expressions": {}
361
- }
362
-
363
- # Add cache entries
364
- for key, cache_entry in self.cache_entries[namespace].items():
365
- filename = os.path.basename(cache_entry.cached_path)
366
- metadata["expressions"][filename] = {
367
- "expression_name": cache_entry.expression_name,
368
- "version": cache_entry.version,
369
- "source_path": cache_entry.source_path,
370
- "cached_path": cache_entry.cached_path,
371
- "content_hash": cache_entry.content_hash,
372
- "cached_at": cache_entry.cached_at.isoformat(),
373
- "last_accessed": cache_entry.last_accessed.isoformat(),
374
- "access_count": cache_entry.access_count,
375
- "file_size": cache_entry.file_size,
376
- "integrity_verified": cache_entry.integrity_verified,
377
- "metadata": cache_entry.metadata
378
- }
379
-
380
- # Write metadata
381
- with open(metadata_path, 'w', encoding='utf-8') as f:
382
- json.dump(metadata, f, indent=2, ensure_ascii=False)
383
-
384
- except Exception as e:
385
- log_warning(f"[cache_manager] Failed to update cache metadata for {namespace}: {e}")
386
-
387
- def validate_cache_integrity(self, namespace: str) -> bool:
388
- """
389
- Validate integrity of all cached expressions in namespace
390
-
391
- Args:
392
- namespace: Namespace to validate
393
-
394
- Returns:
395
- True if all entries are valid
396
- """
397
- with self._cache_lock:
398
- try:
399
- if namespace not in self.cache_entries:
400
- return True
401
-
402
- invalid_entries = []
403
-
404
- for key, cache_entry in self.cache_entries[namespace].items():
405
- # Check file exists
406
- if not os.path.exists(cache_entry.cached_path):
407
- invalid_entries.append(key)
408
- continue
409
-
410
- # Validate content hash
411
- if not self._validate_content_hash(cache_entry):
412
- invalid_entries.append(key)
413
- continue
414
-
415
- # Validate integrity if required with namespace policy
416
- if cache_entry.integrity_verified:
417
- try:
418
- self.integrity_manager.validate_integrity_with_policy(cache_entry.cached_path, namespace)
419
- except SecurityError:
420
- invalid_entries.append(key)
421
- continue
422
-
423
- # Remove invalid entries
424
- for key in invalid_entries:
425
- self._remove_cache_entry(namespace, key)
426
-
427
- if invalid_entries:
428
- log_warning(f"[cache_manager] Removed {len(invalid_entries)} invalid cache entries from {namespace}")
429
- self._update_cache_metadata(namespace)
430
-
431
- return len(invalid_entries) == 0
432
-
433
- except Exception as e:
434
- log_warning(f"[cache_manager] Cache integrity validation failed for {namespace}: {e}")
435
- return False
436
-
437
- def refresh_cache(self, namespace: str = None) -> Dict[str, int]:
438
- """
439
- Refresh cache by reloading from source files
440
-
441
- Args:
442
- namespace: Specific namespace to refresh, or None for all
443
-
444
- Returns:
445
- Dictionary with refresh statistics
446
- """
447
- with self._cache_lock:
448
- stats = {"refreshed": 0, "failed": 0, "removed": 0}
449
-
450
- namespaces = [namespace] if namespace else list(self.cache_paths.keys())
451
-
452
- for ns in namespaces:
453
- try:
454
- log_info(f"[cache_manager] Refreshing cache for namespace: {ns}")
455
-
456
- # Get current cache entries
457
- current_entries = list(self.cache_entries[ns].items())
458
-
459
- for key, cache_entry in current_entries:
460
- try:
461
- # Check if source file still exists
462
- if not os.path.exists(cache_entry.source_path):
463
- self._remove_cache_entry(ns, key)
464
- stats["removed"] += 1
465
- continue
466
-
467
- # Check if source has changed
468
- source_hash = self._calculate_content_hash(cache_entry.source_path)
469
- if source_hash != cache_entry.content_hash:
470
- # Re-cache the expression
471
- self._remove_cache_entry(ns, key)
472
-
473
- if self.cache_expression(
474
- cache_entry.source_path,
475
- ns,
476
- cache_entry.expression_name,
477
- cache_entry.version
478
- ):
479
- stats["refreshed"] += 1
480
- else:
481
- stats["failed"] += 1
482
-
483
- except Exception as e:
484
- log_warning(f"[cache_manager] Failed to refresh {key}: {e}")
485
- stats["failed"] += 1
486
-
487
- # Update metadata
488
- self._update_cache_metadata(ns)
489
-
490
- except Exception as e:
491
- log_warning(f"[cache_manager] Failed to refresh namespace {ns}: {e}")
492
-
493
- log_info(f"[cache_manager] Cache refresh completed: {stats}")
494
- return stats
495
-
496
- def cleanup_cache(self, max_age_days: int = None, max_size_mb: int = None) -> Dict[str, int]:
497
- """
498
- Clean up old or excessive cache entries
499
-
500
- Args:
501
- max_age_days: Maximum age in days (uses default if None)
502
- max_size_mb: Maximum cache size in MB (uses default if None)
503
-
504
- Returns:
505
- Cleanup statistics
506
- """
507
- with self._cache_lock:
508
- max_age = max_age_days or self.max_cache_age_days
509
- max_size = max_size_mb or self.max_cache_size_mb
510
-
511
- stats = {"removed_old": 0, "removed_excess": 0, "bytes_freed": 0}
512
- cutoff_date = datetime.now() - timedelta(days=max_age)
513
-
514
- for namespace in self.cache_paths:
515
- # Remove old entries
516
- old_entries = []
517
- for key, cache_entry in self.cache_entries[namespace].items():
518
- if cache_entry.last_accessed < cutoff_date:
519
- old_entries.append(key)
520
-
521
- for key in old_entries:
522
- cache_entry = self.cache_entries[namespace][key]
523
- stats["bytes_freed"] += cache_entry.file_size
524
- self._remove_cache_entry(namespace, key)
525
- stats["removed_old"] += 1
526
-
527
- # Remove excess entries if cache is too large
528
- if self.stats.cache_size_bytes > max_size * 1024 * 1024:
529
- # Sort by last accessed (oldest first)
530
- entries_by_access = sorted(
531
- self.cache_entries[namespace].items(),
532
- key=lambda x: x[1].last_accessed
533
- )
534
-
535
- while (self.stats.cache_size_bytes > max_size * 1024 * 1024 and
536
- entries_by_access):
537
- key, cache_entry = entries_by_access.pop(0)
538
- stats["bytes_freed"] += cache_entry.file_size
539
- self._remove_cache_entry(namespace, key)
540
- stats["removed_excess"] += 1
541
-
542
- # Update metadata
543
- self._update_cache_metadata(namespace)
544
-
545
- self.stats.last_cleanup = datetime.now()
546
- log_info(f"[cache_manager] Cache cleanup completed: {stats}")
547
- return stats
548
-
549
- def get_cache_stats(self) -> Dict[str, Any]:
550
- """Get comprehensive cache statistics"""
551
- with self._cache_lock:
552
- namespace_stats = {}
553
-
554
- for namespace in self.cache_paths:
555
- entries = self.cache_entries[namespace]
556
- total_size = sum(entry.file_size for entry in entries.values())
557
-
558
- namespace_stats[namespace] = {
559
- "entry_count": len(entries),
560
- "total_size_bytes": total_size,
561
- "total_size_mb": total_size / (1024 * 1024),
562
- "cache_path": self.cache_paths[namespace],
563
- "most_accessed": self._get_most_accessed_entry(namespace),
564
- "oldest_entry": self._get_oldest_entry(namespace),
565
- "newest_entry": self._get_newest_entry(namespace)
566
- }
567
-
568
- return {
569
- "global_stats": {
570
- "total_entries": self.stats.total_entries,
571
- "cache_hits": self.stats.cache_hits,
572
- "cache_misses": self.stats.cache_misses,
573
- "hit_rate": self._calculate_hit_rate(),
574
- "integrity_failures": self.stats.integrity_failures,
575
- "cache_size_bytes": self.stats.cache_size_bytes,
576
- "cache_size_mb": self.stats.cache_size_bytes / (1024 * 1024),
577
- "last_cleanup": self.stats.last_cleanup.isoformat() if self.stats.last_cleanup else None,
578
- "corruption_recoveries": self.stats.corruption_recoveries
579
- },
580
- "namespace_stats": namespace_stats,
581
- "configuration": {
582
- "max_cache_age_days": self.max_cache_age_days,
583
- "max_cache_size_mb": self.max_cache_size_mb,
584
- "auto_cleanup_enabled": self.auto_cleanup_enabled
585
- }
586
- }
587
-
588
- def _calculate_hit_rate(self) -> float:
589
- """Calculate cache hit rate"""
590
- total_requests = self.stats.cache_hits + self.stats.cache_misses
591
- if total_requests == 0:
592
- return 0.0
593
- return (self.stats.cache_hits / total_requests) * 100.0
594
-
595
- def _get_most_accessed_entry(self, namespace: str) -> Optional[Dict[str, Any]]:
596
- """Get most accessed cache entry in namespace"""
597
- if not self.cache_entries[namespace]:
598
- return None
599
-
600
- most_accessed = max(
601
- self.cache_entries[namespace].values(),
602
- key=lambda x: x.access_count
603
- )
604
-
605
- return {
606
- "expression_name": most_accessed.expression_name,
607
- "version": most_accessed.version,
608
- "access_count": most_accessed.access_count
609
- }
610
-
611
- def _get_oldest_entry(self, namespace: str) -> Optional[Dict[str, Any]]:
612
- """Get oldest cache entry in namespace"""
613
- if not self.cache_entries[namespace]:
614
- return None
615
-
616
- oldest = min(
617
- self.cache_entries[namespace].values(),
618
- key=lambda x: x.cached_at
619
- )
620
-
621
- return {
622
- "expression_name": oldest.expression_name,
623
- "version": oldest.version,
624
- "cached_at": oldest.cached_at.isoformat()
625
- }
626
-
627
- def _get_newest_entry(self, namespace: str) -> Optional[Dict[str, Any]]:
628
- """Get newest cache entry in namespace"""
629
- if not self.cache_entries[namespace]:
630
- return None
631
-
632
- newest = max(
633
- self.cache_entries[namespace].values(),
634
- key=lambda x: x.cached_at
635
- )
636
-
637
- return {
638
- "expression_name": newest.expression_name,
639
- "version": newest.version,
640
- "cached_at": newest.cached_at.isoformat()
641
- }
642
-
643
- def clear_cache(self, namespace: str = None) -> int:
644
- """
645
- Clear cache entries
646
-
647
- Args:
648
- namespace: Specific namespace to clear, or None for all
649
-
650
- Returns:
651
- Number of entries removed
652
- """
653
- with self._cache_lock:
654
- removed_count = 0
655
-
656
- namespaces = [namespace] if namespace else list(self.cache_paths.keys())
657
-
658
- for ns in namespaces:
659
- # Remove all entries
660
- entries_to_remove = list(self.cache_entries[ns].keys())
661
- for key in entries_to_remove:
662
- self._remove_cache_entry(ns, key)
663
- removed_count += 1
664
-
665
- # Update metadata
666
- self._update_cache_metadata(ns)
667
-
668
- log_info(f"[cache_manager] Cleared {removed_count} cache entries")
669
- return removed_count
670
-
671
- def handle_cache_corruption(self, namespace: str, expression_name: str,
672
- version: str) -> bool:
673
- """
674
- Handle cache corruption by attempting recovery
675
-
676
- Args:
677
- namespace: Affected namespace
678
- expression_name: Expression name
679
- version: Expression version
680
-
681
- Returns:
682
- True if recovery was successful
683
- """
684
- with self._cache_lock:
685
- try:
686
- key = f"{expression_name}_{version}"
687
-
688
- if key not in self.cache_entries[namespace]:
689
- return False
690
-
691
- cache_entry = self.cache_entries[namespace][key]
692
-
693
- log_warning(f"[cache_manager] Handling cache corruption for {expression_name} v{version}")
694
-
695
- # Remove corrupted entry
696
- self._remove_cache_entry(namespace, key)
697
-
698
- # Attempt to re-cache from source if available
699
- if os.path.exists(cache_entry.source_path):
700
- success = self.cache_expression(
701
- cache_entry.source_path,
702
- namespace,
703
- expression_name,
704
- version
705
- )
706
-
707
- if success:
708
- self.stats.corruption_recoveries += 1
709
- log_info(f"[cache_manager] Successfully recovered corrupted cache entry")
710
- return True
711
-
712
- log_warning(f"[cache_manager] Failed to recover corrupted cache entry")
713
- return False
714
-
715
- except Exception as e:
716
- log_warning(f"[cache_manager] Cache corruption recovery failed: {e}")
717
- return False
718
-
719
- def set_configuration(self, **kwargs):
720
- """Update cache configuration"""
721
- for key, value in kwargs.items():
722
- if hasattr(self, key):
723
- setattr(self, key, value)
724
- log_info(f"[cache_manager] Updated configuration {key} = {value}")
725
- else:
726
- log_warning(f"[cache_manager] Unknown configuration key: {key}")
727
-
728
- def reset_stats(self):
729
- """Reset cache statistics"""
730
- self.stats = CacheStats()
731
- log_info("[cache_manager] Cache statistics reset")
732
-
733
- # ===== TASK 3.2: Cache Invalidation and Refresh Methods =====
734
-
735
- def invalidate_expression(self, namespace: str, expression_name: str, version: str) -> bool:
736
- """
737
- Invalidate a specific expression from cache
738
-
739
- Args:
740
- namespace: Namespace (builtin or user)
741
- expression_name: Name of the expression
742
- version: Version of the expression
743
-
744
- Returns:
745
- True if expression was invalidated
746
- """
747
- with self._cache_lock:
748
- try:
749
- key = f"{expression_name}_{version}"
750
-
751
- if namespace not in self.cache_entries:
752
- log_warning(f"[cache_manager] Invalid namespace for invalidation: {namespace}")
753
- return False
754
-
755
- if key not in self.cache_entries[namespace]:
756
- log_info(f"[cache_manager] Expression {expression_name} v{version} not in cache")
757
- return False
758
-
759
- # Remove the cache entry
760
- self._remove_cache_entry(namespace, key)
761
-
762
- # Update metadata
763
- self._update_cache_metadata(namespace)
764
-
765
- log_info(f"[cache_manager] Invalidated {expression_name} v{version} from {namespace}")
766
- return True
767
-
768
- except Exception as e:
769
- log_warning(f"[cache_manager] Failed to invalidate expression {expression_name}: {e}")
770
- return False
771
-
772
- def invalidate_version(self, namespace: str, version: str) -> int:
773
- """
774
- Invalidate all expressions of a specific version from cache
775
-
776
- Args:
777
- namespace: Namespace (builtin or user)
778
- version: Version to invalidate
779
-
780
- Returns:
781
- Number of expressions invalidated
782
- """
783
- with self._cache_lock:
784
- try:
785
- if namespace not in self.cache_entries:
786
- log_warning(f"[cache_manager] Invalid namespace for version invalidation: {namespace}")
787
- return 0
788
-
789
- # Find all entries with the specified version
790
- entries_to_invalidate = []
791
- for key, cache_entry in self.cache_entries[namespace].items():
792
- if cache_entry.version == version:
793
- entries_to_invalidate.append(key)
794
-
795
- # Remove all matching entries
796
- invalidated_count = 0
797
- for key in entries_to_invalidate:
798
- self._remove_cache_entry(namespace, key)
799
- invalidated_count += 1
800
-
801
- # Update metadata if any entries were removed
802
- if invalidated_count > 0:
803
- self._update_cache_metadata(namespace)
804
-
805
- log_info(f"[cache_manager] Invalidated {invalidated_count} expressions of version {version} from {namespace}")
806
- return invalidated_count
807
-
808
- except Exception as e:
809
- log_warning(f"[cache_manager] Failed to invalidate version {version}: {e}")
810
- return 0
811
-
812
- def invalidate_namespace(self, namespace: str) -> int:
813
- """
814
- Invalidate all expressions in a namespace
815
-
816
- Args:
817
- namespace: Namespace to invalidate
818
-
819
- Returns:
820
- Number of expressions invalidated
821
- """
822
- with self._cache_lock:
823
- try:
824
- if namespace not in self.cache_entries:
825
- log_warning(f"[cache_manager] Invalid namespace for invalidation: {namespace}")
826
- return 0
827
-
828
- # Get count before clearing
829
- invalidated_count = len(self.cache_entries[namespace])
830
-
831
- # Remove all entries
832
- entries_to_remove = list(self.cache_entries[namespace].keys())
833
- for key in entries_to_remove:
834
- self._remove_cache_entry(namespace, key)
835
-
836
- # Update metadata
837
- self._update_cache_metadata(namespace)
838
-
839
- log_info(f"[cache_manager] Invalidated {invalidated_count} expressions from {namespace}")
840
- return invalidated_count
841
-
842
- except Exception as e:
843
- log_warning(f"[cache_manager] Failed to invalidate namespace {namespace}: {e}")
844
- return 0
845
-
846
- def cleanup_expired_cache(self, max_age_days: int = None) -> Dict[str, int]:
847
- """
848
- Clean up expired cache entries based on age
849
-
850
- Args:
851
- max_age_days: Maximum age in days (uses default if None)
852
-
853
- Returns:
854
- Cleanup statistics
855
- """
856
- max_age = max_age_days or self.max_cache_age_days
857
- return self.cleanup_cache(max_age_days=max_age, max_size_mb=None)
858
-
859
- def cleanup_orphaned_cache(self) -> Dict[str, int]:
860
- """
861
- Clean up orphaned cache files (cached files without source files)
862
-
863
- Returns:
864
- Cleanup statistics
865
- """
866
- with self._cache_lock:
867
- stats = {"removed_orphaned": 0, "bytes_freed": 0}
868
-
869
- for namespace in self.cache_paths:
870
- orphaned_entries = []
871
-
872
- # Find entries where source file no longer exists
873
- for key, cache_entry in self.cache_entries[namespace].items():
874
- if not os.path.exists(cache_entry.source_path):
875
- orphaned_entries.append(key)
876
-
877
- # Remove orphaned entries
878
- for key in orphaned_entries:
879
- cache_entry = self.cache_entries[namespace][key]
880
- stats["bytes_freed"] += cache_entry.file_size
881
- self._remove_cache_entry(namespace, key)
882
- stats["removed_orphaned"] += 1
883
-
884
- # Update metadata if any entries were removed
885
- if orphaned_entries:
886
- self._update_cache_metadata(namespace)
887
-
888
- log_info(f"[cache_manager] Cleaned up {stats['removed_orphaned']} orphaned cache entries")
889
- return stats
890
-
891
    def get_cache_status(self) -> Dict[str, Any]:
        """
        Get comprehensive cache status and health information

        Aggregates per-namespace entry counts, orphaned/expired/corrupted
        entry tallies, on-disk usage, and integrity-check results into a
        single report dict, with human-readable issues/recommendations.

        Returns:
            Detailed cache status information
        """
        with self._cache_lock:
            status = {
                "health": "healthy",
                "issues": [],
                "recommendations": [],
                "statistics": self.get_cache_stats(),
                "namespace_health": {},
                "disk_usage": {},
                "integrity_status": {}
            }

            # Check each namespace
            for namespace in self.cache_paths:
                namespace_status = {
                    "entry_count": len(self.cache_entries[namespace]),
                    "integrity_valid": True,
                    "orphaned_entries": 0,
                    "expired_entries": 0,
                    "corrupted_entries": 0
                }

                # Check for orphaned entries
                orphaned_count = 0
                expired_count = 0
                corrupted_count = 0
                # Entries last accessed before this cutoff count as expired.
                cutoff_date = datetime.now() - timedelta(days=self.max_cache_age_days)

                for key, cache_entry in self.cache_entries[namespace].items():
                    # Check if source exists (orphaned)
                    if not os.path.exists(cache_entry.source_path):
                        orphaned_count += 1

                    # Check if expired
                    if cache_entry.last_accessed < cutoff_date:
                        expired_count += 1

                    # Check if cached file exists and is valid.
                    # Corrupted = cached file missing OR its content hash
                    # fails validation; each entry adds at most 1 here.
                    if not os.path.exists(cache_entry.cached_path):
                        corrupted_count += 1
                    elif not self._validate_content_hash(cache_entry):
                        corrupted_count += 1

                namespace_status["orphaned_entries"] = orphaned_count
                namespace_status["expired_entries"] = expired_count
                namespace_status["corrupted_entries"] = corrupted_count

                # Determine namespace health
                if corrupted_count > 0:
                    namespace_status["integrity_valid"] = False
                    status["issues"].append(f"{namespace}: {corrupted_count} corrupted entries")

                if orphaned_count > 0:
                    status["issues"].append(f"{namespace}: {orphaned_count} orphaned entries")
                    status["recommendations"].append(f"Run cleanup_orphaned_cache() to remove orphaned entries")

                if expired_count > 0:
                    status["recommendations"].append(f"Run cleanup_expired_cache() to remove {expired_count} expired entries")

                status["namespace_health"][namespace] = namespace_status

                # Get disk usage for namespace (best-effort; a failure here
                # only omits the disk_usage entry for this namespace).
                try:
                    cache_path = self.cache_paths[namespace]
                    if os.path.exists(cache_path):
                        total_size = 0
                        file_count = 0
                        for root, dirs, files in os.walk(cache_path):
                            for file in files:
                                file_path = os.path.join(root, file)
                                # Re-check existence: files can disappear
                                # between os.walk listing and getsize.
                                if os.path.exists(file_path):
                                    total_size += os.path.getsize(file_path)
                                    file_count += 1

                        status["disk_usage"][namespace] = {
                            "total_size_bytes": total_size,
                            "total_size_mb": total_size / (1024 * 1024),
                            "file_count": file_count
                        }
                except Exception as e:
                    log_warning(f"[cache_manager] Failed to get disk usage for {namespace}: {e}")

            # Overall health assessment: any issue degrades health;
            # 5+ distinct issues escalate to "unhealthy".
            if status["issues"]:
                status["health"] = "degraded" if len(status["issues"]) < 5 else "unhealthy"

            # Add integrity status (full validation pass per namespace;
            # NOTE(review): this runs while holding the cache lock and may
            # be slow for large caches — confirm acceptable).
            for namespace in self.cache_paths:
                try:
                    integrity_valid = self.validate_cache_integrity(namespace)
                    status["integrity_status"][namespace] = {
                        "valid": integrity_valid,
                        "last_checked": datetime.now().isoformat()
                    }
                except Exception as e:
                    status["integrity_status"][namespace] = {
                        "valid": False,
                        "error": str(e),
                        "last_checked": datetime.now().isoformat()
                    }

            return status
999
-
1000
- def get_expression_cache_info(self, expression_name: str, version: str, namespace: str) -> Optional[Dict[str, Any]]:
1001
- """
1002
- Get detailed cache information for a specific expression
1003
-
1004
- Args:
1005
- expression_name: Name of the expression
1006
- version: Version of the expression
1007
- namespace: Namespace to search in
1008
-
1009
- Returns:
1010
- Detailed cache information or None if not cached
1011
- """
1012
- with self._cache_lock:
1013
- try:
1014
- key = f"{expression_name}_{version}"
1015
-
1016
- if namespace not in self.cache_entries or key not in self.cache_entries[namespace]:
1017
- return None
1018
-
1019
- cache_entry = self.cache_entries[namespace][key]
1020
-
1021
- # Check file status
1022
- cached_file_exists = os.path.exists(cache_entry.cached_path)
1023
- source_file_exists = os.path.exists(cache_entry.source_path)
1024
- content_hash_valid = self._validate_content_hash(cache_entry) if cached_file_exists else False
1025
-
1026
- # Calculate age
1027
- age_days = (datetime.now() - cache_entry.cached_at).days
1028
- last_access_days = (datetime.now() - cache_entry.last_accessed).days
1029
-
1030
- return {
1031
- "expression_name": cache_entry.expression_name,
1032
- "version": cache_entry.version,
1033
- "namespace": cache_entry.namespace,
1034
- "source_path": cache_entry.source_path,
1035
- "cached_path": cache_entry.cached_path,
1036
- "content_hash": cache_entry.content_hash,
1037
- "cached_at": cache_entry.cached_at.isoformat(),
1038
- "last_accessed": cache_entry.last_accessed.isoformat(),
1039
- "access_count": cache_entry.access_count,
1040
- "file_size": cache_entry.file_size,
1041
- "file_size_mb": cache_entry.file_size / (1024 * 1024),
1042
- "integrity_verified": cache_entry.integrity_verified,
1043
- "age_days": age_days,
1044
- "last_access_days": last_access_days,
1045
- "status": {
1046
- "cached_file_exists": cached_file_exists,
1047
- "source_file_exists": source_file_exists,
1048
- "content_hash_valid": content_hash_valid,
1049
- "is_orphaned": not source_file_exists,
1050
- "is_expired": age_days > self.max_cache_age_days,
1051
- "is_corrupted": not (cached_file_exists and content_hash_valid)
1052
- },
1053
- "metadata": cache_entry.metadata
1054
- }
1055
-
1056
- except Exception as e:
1057
- log_warning(f"[cache_manager] Failed to get cache info for {expression_name}: {e}")
1058
- return None
1059
-
1060
- def force_refresh_expression(self, expression_name: str, version: str, namespace: str) -> bool:
1061
- """
1062
- Force refresh a specific expression from its source file
1063
-
1064
- Args:
1065
- expression_name: Name of the expression
1066
- version: Version of the expression
1067
- namespace: Namespace of the expression
1068
-
1069
- Returns:
1070
- True if refresh was successful
1071
- """
1072
- with self._cache_lock:
1073
- try:
1074
- key = f"{expression_name}_{version}"
1075
-
1076
- if namespace not in self.cache_entries or key not in self.cache_entries[namespace]:
1077
- log_warning(f"[cache_manager] Expression {expression_name} v{version} not in cache")
1078
- return False
1079
-
1080
- cache_entry = self.cache_entries[namespace][key]
1081
-
1082
- # Check if source file exists
1083
- if not os.path.exists(cache_entry.source_path):
1084
- log_warning(f"[cache_manager] Source file not found for refresh: {cache_entry.source_path}")
1085
- return False
1086
-
1087
- # Remove current cache entry
1088
- self._remove_cache_entry(namespace, key)
1089
-
1090
- # Re-cache from source
1091
- success = self.cache_expression(
1092
- cache_entry.source_path,
1093
- namespace,
1094
- expression_name,
1095
- version
1096
- )
1097
-
1098
- if success:
1099
- log_info(f"[cache_manager] Force refreshed {expression_name} v{version}")
1100
- else:
1101
- log_warning(f"[cache_manager] Failed to force refresh {expression_name} v{version}")
1102
-
1103
- return success
1104
-
1105
- except Exception as e:
1106
- log_warning(f"[cache_manager] Force refresh failed for {expression_name}: {e}")
1107
- return False
1108
-
1109
-
1110
# Module-level singleton; created lazily on first access.
_global_cache_manager = None


def get_cache_manager() -> EnhancedCacheManager:
    """Return the process-wide EnhancedCacheManager, creating it on first use."""
    global _global_cache_manager
    manager = _global_cache_manager
    if manager is None:
        manager = EnhancedCacheManager()
        _global_cache_manager = manager
    return manager