alma-memory 0.5.0-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff shows the contents of publicly available package versions as published to a supported registry; it is provided for informational purposes only.
Files changed (111)
  1. alma/__init__.py +296 -194
  2. alma/compression/__init__.py +33 -0
  3. alma/compression/pipeline.py +980 -0
  4. alma/confidence/__init__.py +47 -47
  5. alma/confidence/engine.py +540 -540
  6. alma/confidence/types.py +351 -351
  7. alma/config/loader.py +157 -157
  8. alma/consolidation/__init__.py +23 -23
  9. alma/consolidation/engine.py +678 -678
  10. alma/consolidation/prompts.py +84 -84
  11. alma/core.py +1189 -322
  12. alma/domains/__init__.py +30 -30
  13. alma/domains/factory.py +359 -359
  14. alma/domains/schemas.py +448 -448
  15. alma/domains/types.py +272 -272
  16. alma/events/__init__.py +75 -75
  17. alma/events/emitter.py +285 -284
  18. alma/events/storage_mixin.py +246 -246
  19. alma/events/types.py +126 -126
  20. alma/events/webhook.py +425 -425
  21. alma/exceptions.py +49 -49
  22. alma/extraction/__init__.py +31 -31
  23. alma/extraction/auto_learner.py +265 -264
  24. alma/extraction/extractor.py +420 -420
  25. alma/graph/__init__.py +106 -81
  26. alma/graph/backends/__init__.py +32 -18
  27. alma/graph/backends/kuzu.py +624 -0
  28. alma/graph/backends/memgraph.py +432 -0
  29. alma/graph/backends/memory.py +236 -236
  30. alma/graph/backends/neo4j.py +417 -417
  31. alma/graph/base.py +159 -159
  32. alma/graph/extraction.py +198 -198
  33. alma/graph/store.py +860 -860
  34. alma/harness/__init__.py +35 -35
  35. alma/harness/base.py +386 -386
  36. alma/harness/domains.py +705 -705
  37. alma/initializer/__init__.py +37 -37
  38. alma/initializer/initializer.py +418 -418
  39. alma/initializer/types.py +250 -250
  40. alma/integration/__init__.py +62 -62
  41. alma/integration/claude_agents.py +444 -432
  42. alma/integration/helena.py +423 -423
  43. alma/integration/victor.py +471 -471
  44. alma/learning/__init__.py +101 -86
  45. alma/learning/decay.py +878 -0
  46. alma/learning/forgetting.py +1446 -1446
  47. alma/learning/heuristic_extractor.py +390 -390
  48. alma/learning/protocols.py +374 -374
  49. alma/learning/validation.py +346 -346
  50. alma/mcp/__init__.py +123 -45
  51. alma/mcp/__main__.py +156 -156
  52. alma/mcp/resources.py +122 -122
  53. alma/mcp/server.py +955 -591
  54. alma/mcp/tools.py +3254 -511
  55. alma/observability/__init__.py +91 -0
  56. alma/observability/config.py +302 -0
  57. alma/observability/guidelines.py +170 -0
  58. alma/observability/logging.py +424 -0
  59. alma/observability/metrics.py +583 -0
  60. alma/observability/tracing.py +440 -0
  61. alma/progress/__init__.py +21 -21
  62. alma/progress/tracker.py +607 -607
  63. alma/progress/types.py +250 -250
  64. alma/retrieval/__init__.py +134 -53
  65. alma/retrieval/budget.py +525 -0
  66. alma/retrieval/cache.py +1304 -1061
  67. alma/retrieval/embeddings.py +202 -202
  68. alma/retrieval/engine.py +850 -366
  69. alma/retrieval/modes.py +365 -0
  70. alma/retrieval/progressive.py +560 -0
  71. alma/retrieval/scoring.py +344 -344
  72. alma/retrieval/trust_scoring.py +637 -0
  73. alma/retrieval/verification.py +797 -0
  74. alma/session/__init__.py +19 -19
  75. alma/session/manager.py +442 -399
  76. alma/session/types.py +288 -288
  77. alma/storage/__init__.py +101 -61
  78. alma/storage/archive.py +233 -0
  79. alma/storage/azure_cosmos.py +1259 -1048
  80. alma/storage/base.py +1083 -525
  81. alma/storage/chroma.py +1443 -1443
  82. alma/storage/constants.py +103 -0
  83. alma/storage/file_based.py +614 -619
  84. alma/storage/migrations/__init__.py +21 -0
  85. alma/storage/migrations/base.py +321 -0
  86. alma/storage/migrations/runner.py +323 -0
  87. alma/storage/migrations/version_stores.py +337 -0
  88. alma/storage/migrations/versions/__init__.py +11 -0
  89. alma/storage/migrations/versions/v1_0_0.py +373 -0
  90. alma/storage/migrations/versions/v1_1_0_workflow_context.py +551 -0
  91. alma/storage/pinecone.py +1080 -1080
  92. alma/storage/postgresql.py +1948 -1452
  93. alma/storage/qdrant.py +1306 -1306
  94. alma/storage/sqlite_local.py +3041 -1358
  95. alma/testing/__init__.py +46 -0
  96. alma/testing/factories.py +301 -0
  97. alma/testing/mocks.py +389 -0
  98. alma/types.py +292 -264
  99. alma/utils/__init__.py +19 -0
  100. alma/utils/tokenizer.py +521 -0
  101. alma/workflow/__init__.py +83 -0
  102. alma/workflow/artifacts.py +170 -0
  103. alma/workflow/checkpoint.py +311 -0
  104. alma/workflow/context.py +228 -0
  105. alma/workflow/outcomes.py +189 -0
  106. alma/workflow/reducers.py +393 -0
  107. {alma_memory-0.5.0.dist-info → alma_memory-0.7.0.dist-info}/METADATA +244 -72
  108. alma_memory-0.7.0.dist-info/RECORD +112 -0
  109. alma_memory-0.5.0.dist-info/RECORD +0 -76
  110. {alma_memory-0.5.0.dist-info → alma_memory-0.7.0.dist-info}/WHEEL +0 -0
  111. {alma_memory-0.5.0.dist-info → alma_memory-0.7.0.dist-info}/top_level.txt +0 -0
alma/learning/forgetting.py
@@ -1,1446 +1,1446 @@
"""
ALMA Forgetting Mechanism.

Implements intelligent memory pruning to prevent bloat and maintain relevance.

Features:
- Confidence decay over time (exponential, linear, step functions)
- Staleness detection based on last_validated timestamps
- Automated cleanup job scheduling
- Memory growth monitoring and alerting
"""

import logging
import threading
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Any, Callable, Dict, List, Optional

from alma.storage.base import StorageBackend

logger = logging.getLogger(__name__)

class PruneReason(Enum):
    """Reason for pruning a memory item."""

    STALE = "stale"  # Too old without validation
    LOW_CONFIDENCE = "low_confidence"  # Below confidence threshold
    LOW_SUCCESS_RATE = "low_success"  # Too many failures
    SUPERSEDED = "superseded"  # Replaced by better heuristic
    DUPLICATE = "duplicate"  # Duplicate of another item
    QUOTA_EXCEEDED = "quota"  # Agent memory quota exceeded


@dataclass
class PruneResult:
    """Result of a prune operation."""

    reason: PruneReason
    item_type: str
    item_id: str
    agent: str
    project_id: str
    details: str = ""


@dataclass
class PruneSummary:
    """Summary of a complete prune operation."""

    outcomes_pruned: int = 0
    heuristics_pruned: int = 0
    knowledge_pruned: int = 0
    anti_patterns_pruned: int = 0
    total_pruned: int = 0
    pruned_items: List[PruneResult] = field(default_factory=list)
    execution_time_ms: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "outcomes_pruned": self.outcomes_pruned,
            "heuristics_pruned": self.heuristics_pruned,
            "knowledge_pruned": self.knowledge_pruned,
            "anti_patterns_pruned": self.anti_patterns_pruned,
            "total_pruned": self.total_pruned,
            "execution_time_ms": self.execution_time_ms,
        }


@dataclass
class PrunePolicy:
    """
    Configuration for memory pruning behavior.

    Defines thresholds and quotas for different memory types.
    """

    # Age-based pruning
    outcome_max_age_days: int = 90
    knowledge_max_age_days: int = 180
    anti_pattern_max_age_days: int = 365

    # Confidence thresholds
    heuristic_min_confidence: float = 0.3
    knowledge_min_confidence: float = 0.5

    # Success rate thresholds
    heuristic_min_success_rate: float = 0.4
    min_occurrences_before_prune: int = 5  # Don't prune until enough data

    # Quota limits (per agent)
    max_heuristics_per_agent: int = 100
    max_outcomes_per_agent: int = 500
    max_knowledge_per_agent: int = 200
    max_anti_patterns_per_agent: int = 50

    # Staleness (time since last validation)
    heuristic_stale_days: int = 60
    knowledge_stale_days: int = 90

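Every threshold above is an ordinary dataclass field with a default, so a deployment can tighten or relax the policy per project without subclassing. A minimal sketch (the values here are illustrative, not recommendations):

    # Hypothetical policy that forgets faster than the defaults.
    aggressive_policy = PrunePolicy(
        outcome_max_age_days=30,       # default: 90
        heuristic_min_confidence=0.5,  # default: 0.3
        max_heuristics_per_agent=50,   # default: 100
    )
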
class ForgettingEngine:
    """
    Manages memory pruning and forgetting.

    Implements multiple strategies:
    - Age-based decay (old memories are pruned)
    - Confidence-based pruning (low confidence items removed)
    - Success-rate based pruning (unsuccessful patterns removed)
    - Quota enforcement (prevents memory bloat)
    - Staleness detection (unvalidated memories removed)
    """

    def __init__(
        self,
        storage: StorageBackend,
        policy: Optional[PrunePolicy] = None,
    ):
        """
        Initialize forgetting engine.

        Args:
            storage: Storage backend to prune
            policy: Pruning policy configuration
        """
        self.storage = storage
        self.policy = policy or PrunePolicy()

    def prune(
        self,
        project_id: str,
        agent: Optional[str] = None,
        dry_run: bool = False,
    ) -> PruneSummary:
        """
        Run a complete prune operation.

        Args:
            project_id: Project to prune
            agent: Specific agent or None for all
            dry_run: If True, only report what would be pruned

        Returns:
            PruneSummary with details
        """
        start_time = time.time()

        summary = PruneSummary()

        # Prune each memory type
        summary.outcomes_pruned = self._prune_stale_outcomes(
            project_id, agent, dry_run, summary.pruned_items
        )

        summary.heuristics_pruned = self._prune_heuristics(
            project_id, agent, dry_run, summary.pruned_items
        )

        summary.knowledge_pruned = self._prune_domain_knowledge(
            project_id, agent, dry_run, summary.pruned_items
        )

        summary.anti_patterns_pruned = self._prune_anti_patterns(
            project_id, agent, dry_run, summary.pruned_items
        )

        # Enforce quotas
        quota_pruned = self._enforce_quotas(
            project_id, agent, dry_run, summary.pruned_items
        )
        summary.heuristics_pruned += quota_pruned.get("heuristics", 0)
        summary.outcomes_pruned += quota_pruned.get("outcomes", 0)

        summary.total_pruned = (
            summary.outcomes_pruned
            + summary.heuristics_pruned
            + summary.knowledge_pruned
            + summary.anti_patterns_pruned
        )

        summary.execution_time_ms = int((time.time() - start_time) * 1000)

        action = "Would prune" if dry_run else "Pruned"
        logger.info(
            f"{action} {summary.total_pruned} items for project={project_id}, "
            f"agent={agent or 'all'}"
        )

        return summary

    def _prune_stale_outcomes(
        self,
        project_id: str,
        agent: Optional[str],
        dry_run: bool,
        results: List[PruneResult],
    ) -> int:
        """Prune outcomes older than max age."""
        cutoff = datetime.now(timezone.utc) - timedelta(
            days=self.policy.outcome_max_age_days
        )

        if dry_run:
            # Get count of outcomes to prune
            outcomes = self.storage.get_outcomes(
                project_id=project_id,
                agent=agent,
                top_k=10000,
                success_only=False,
            )
            count = sum(1 for o in outcomes if o.timestamp < cutoff)
            for o in outcomes:
                if o.timestamp < cutoff:
                    results.append(
                        PruneResult(
                            reason=PruneReason.STALE,
                            item_type="outcome",
                            item_id=o.id,
                            agent=o.agent,
                            project_id=project_id,
                            details=f"Older than {self.policy.outcome_max_age_days} days",
                        )
                    )
            return count
        else:
            return self.storage.delete_outcomes_older_than(
                project_id=project_id,
                older_than=cutoff,
                agent=agent,
            )

    def _prune_heuristics(
        self,
        project_id: str,
        agent: Optional[str],
        dry_run: bool,
        results: List[PruneResult],
    ) -> int:
        """Prune heuristics based on confidence and success rate."""
        heuristics = self.storage.get_heuristics(
            project_id=project_id,
            agent=agent,
            top_k=10000,
            min_confidence=0.0,
        )

        to_delete = []
        now = datetime.now(timezone.utc)
        stale_cutoff = now - timedelta(days=self.policy.heuristic_stale_days)

        for h in heuristics:
            reason = None
            details = ""

            # Check confidence
            if h.confidence < self.policy.heuristic_min_confidence:
                reason = PruneReason.LOW_CONFIDENCE
                details = f"Confidence {h.confidence:.2f} < {self.policy.heuristic_min_confidence}"

            # Check success rate (only if enough occurrences)
            elif (
                h.occurrence_count >= self.policy.min_occurrences_before_prune
                and h.success_rate < self.policy.heuristic_min_success_rate
            ):
                reason = PruneReason.LOW_SUCCESS_RATE
                details = f"Success rate {h.success_rate:.2f} < {self.policy.heuristic_min_success_rate}"

            # Check staleness
            elif h.last_validated < stale_cutoff:
                reason = PruneReason.STALE
                details = f"Not validated since {h.last_validated.date()}"

            if reason:
                to_delete.append(h)
                results.append(
                    PruneResult(
                        reason=reason,
                        item_type="heuristic",
                        item_id=h.id,
                        agent=h.agent,
                        project_id=project_id,
                        details=details,
                    )
                )

        if not dry_run:
            for h in to_delete:
                self.storage.delete_heuristic(h.id)

        return len(to_delete)

    def _prune_domain_knowledge(
        self,
        project_id: str,
        agent: Optional[str],
        dry_run: bool,
        results: List[PruneResult],
    ) -> int:
        """Prune old or low-confidence domain knowledge."""
        knowledge = self.storage.get_domain_knowledge(
            project_id=project_id,
            agent=agent,
            top_k=10000,
        )

        to_delete = []
        now = datetime.now(timezone.utc)
        age_cutoff = now - timedelta(days=self.policy.knowledge_max_age_days)
        stale_cutoff = now - timedelta(days=self.policy.knowledge_stale_days)

        for dk in knowledge:
            reason = None
            details = ""

            # Check confidence
            if dk.confidence < self.policy.knowledge_min_confidence:
                reason = PruneReason.LOW_CONFIDENCE
                details = f"Confidence {dk.confidence:.2f} < {self.policy.knowledge_min_confidence}"

            # Check age
            elif dk.last_verified < age_cutoff:
                reason = PruneReason.STALE
                details = f"Older than {self.policy.knowledge_max_age_days} days"

            # Check staleness
            elif dk.last_verified < stale_cutoff:
                reason = PruneReason.STALE
                details = f"Not verified since {dk.last_verified.date()}"

            if reason:
                to_delete.append(dk)
                results.append(
                    PruneResult(
                        reason=reason,
                        item_type="domain_knowledge",
                        item_id=dk.id,
                        agent=dk.agent,
                        project_id=project_id,
                        details=details,
                    )
                )

        if not dry_run:
            for dk in to_delete:
                self.storage.delete_domain_knowledge(dk.id)

        return len(to_delete)

    def _prune_anti_patterns(
        self,
        project_id: str,
        agent: Optional[str],
        dry_run: bool,
        results: List[PruneResult],
    ) -> int:
        """Prune old anti-patterns."""
        anti_patterns = self.storage.get_anti_patterns(
            project_id=project_id,
            agent=agent,
            top_k=10000,
        )

        to_delete = []
        now = datetime.now(timezone.utc)
        age_cutoff = now - timedelta(days=self.policy.anti_pattern_max_age_days)

        for ap in anti_patterns:
            if ap.last_seen < age_cutoff:
                to_delete.append(ap)
                results.append(
                    PruneResult(
                        reason=PruneReason.STALE,
                        item_type="anti_pattern",
                        item_id=ap.id,
                        agent=ap.agent,
                        project_id=project_id,
                        details=f"Not seen since {ap.last_seen.date()}",
                    )
                )

        if not dry_run:
            for ap in to_delete:
                self.storage.delete_anti_pattern(ap.id)

        return len(to_delete)

    def _enforce_quotas(
        self,
        project_id: str,
        agent: Optional[str],
        dry_run: bool,
        results: List[PruneResult],
    ) -> Dict[str, int]:
        """Enforce per-agent memory quotas."""
        pruned = {"heuristics": 0, "outcomes": 0}

        if agent:
            agents = [agent]
        else:
            # Get all agents with data
            stats = self.storage.get_stats(project_id=project_id)
            agents = stats.get("agents", [])

        for ag in agents:
            # Check heuristic quota
            heuristics = self.storage.get_heuristics(
                project_id=project_id,
                agent=ag,
                top_k=self.policy.max_heuristics_per_agent + 100,
                min_confidence=0.0,
            )

            if len(heuristics) > self.policy.max_heuristics_per_agent:
                # Sort by confidence (lowest first)
                sorted_h = sorted(heuristics, key=lambda x: x.confidence)
                to_remove = len(heuristics) - self.policy.max_heuristics_per_agent

                for h in sorted_h[:to_remove]:
                    results.append(
                        PruneResult(
                            reason=PruneReason.QUOTA_EXCEEDED,
                            item_type="heuristic",
                            item_id=h.id,
                            agent=ag,
                            project_id=project_id,
                            details=f"Exceeded quota of {self.policy.max_heuristics_per_agent}",
                        )
                    )
                    if not dry_run:
                        self.storage.delete_heuristic(h.id)
                    pruned["heuristics"] += 1

            # Check outcome quota
            outcomes = self.storage.get_outcomes(
                project_id=project_id,
                agent=ag,
                top_k=self.policy.max_outcomes_per_agent + 100,
                success_only=False,
            )

            if len(outcomes) > self.policy.max_outcomes_per_agent:
                # Sort by timestamp (oldest first)
                sorted_o = sorted(outcomes, key=lambda x: x.timestamp)
                to_remove = len(outcomes) - self.policy.max_outcomes_per_agent

                for o in sorted_o[:to_remove]:
                    results.append(
                        PruneResult(
                            reason=PruneReason.QUOTA_EXCEEDED,
                            item_type="outcome",
                            item_id=o.id,
                            agent=ag,
                            project_id=project_id,
                            details=f"Exceeded quota of {self.policy.max_outcomes_per_agent}",
                        )
                    )
                    if not dry_run:
                        self.storage.delete_outcome(o.id)
                    pruned["outcomes"] += 1

        return pruned

    def compute_decay_score(
        self,
        item_age_days: float,
        confidence: float,
        success_rate: float,
        occurrence_count: int,
    ) -> float:
        """
        Compute a decay score for an item (lower = more likely to forget).

        Factors:
        - Recency (newer = higher)
        - Confidence (higher = higher)
        - Success rate (higher = higher)
        - Validation frequency (more = higher)

        Returns:
            Score between 0 and 1
        """
        # Age decay (half-life of 30 days)
        age_score = 0.5 ** (item_age_days / 30.0)

        # Normalize occurrence count (cap at 20)
        occurrence_score = min(occurrence_count / 20.0, 1.0)

        # Weighted combination
        return (
            0.3 * age_score
            + 0.3 * confidence
            + 0.2 * success_rate
            + 0.2 * occurrence_score
        )

    def identify_candidates(
        self,
        project_id: str,
        agent: Optional[str] = None,
        max_candidates: int = 20,
    ) -> List[Dict[str, Any]]:
        """
        Identify memory items that are candidates for pruning.

        Returns items with lowest decay scores.

        Args:
            project_id: Project to analyze
            agent: Specific agent or None for all
            max_candidates: Maximum candidates to return

        Returns:
            List of candidate items with scores
        """
        candidates = []
        now = datetime.now(timezone.utc)

        # Analyze heuristics
        heuristics = self.storage.get_heuristics(
            project_id=project_id,
            agent=agent,
            top_k=1000,
            min_confidence=0.0,
        )

        for h in heuristics:
            age_days = (now - h.created_at).total_seconds() / (24 * 60 * 60)
            score = self.compute_decay_score(
                item_age_days=age_days,
                confidence=h.confidence,
                success_rate=h.success_rate,
                occurrence_count=h.occurrence_count,
            )
            candidates.append(
                {
                    "type": "heuristic",
                    "id": h.id,
                    "agent": h.agent,
                    "score": score,
                    "age_days": int(age_days),
                    "confidence": h.confidence,
                    "summary": h.strategy[:50],
                }
            )

        # Sort by score (lowest first = best candidates for pruning)
        candidates.sort(key=lambda x: x["score"])

        return candidates[:max_candidates]

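A typical call pattern for the engine is a dry run first, then a real prune once the preview looks sane. A sketch, assuming `storage` is any configured StorageBackend instance and that these names are importable from alma.learning.forgetting (the module this hunk covers):

    from alma.learning.forgetting import ForgettingEngine, PrunePolicy

    engine = ForgettingEngine(storage, policy=PrunePolicy())

    preview = engine.prune(project_id="my-project", dry_run=True)
    for item in preview.pruned_items[:5]:
        print(item.reason.value, item.item_type, item.details)

    if preview.total_pruned < 100:  # illustrative safety check
        engine.prune(project_id="my-project", dry_run=False)

The decay-score weights are easy to check by hand: a 30-day-old heuristic with confidence 0.8, success rate 0.75, and 10 occurrences scores 0.3*0.5 + 0.3*0.8 + 0.2*0.75 + 0.2*0.5 = 0.64, since the age term has fallen through exactly one half-life.
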
# ==================== DECAY FUNCTIONS ====================


class DecayFunction(ABC):
    """Abstract base class for confidence decay functions."""

    @abstractmethod
    def compute_decay(self, days_since_validation: float) -> float:
        """
        Compute decay multiplier for a given time since validation.

        Args:
            days_since_validation: Days since last validation

        Returns:
            Multiplier between 0 and 1 to apply to confidence
        """
        pass

    @abstractmethod
    def get_name(self) -> str:
        """Return the name of this decay function."""
        pass

class ExponentialDecay(DecayFunction):
    """
    Exponential decay with configurable half-life.

    Confidence = original * 0.5^(days/half_life)
    """

    def __init__(self, half_life_days: float = 30.0):
        """
        Initialize exponential decay.

        Args:
            half_life_days: Days until confidence halves
        """
        self.half_life_days = half_life_days

    def compute_decay(self, days_since_validation: float) -> float:
        """Compute exponential decay multiplier."""
        return 0.5 ** (days_since_validation / self.half_life_days)

    def get_name(self) -> str:
        return f"exponential(half_life={self.half_life_days}d)"

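With the default 30-day half-life, the multiplier follows directly from the formula in the docstring:

    decay = ExponentialDecay(half_life_days=30.0)
    decay.compute_decay(0)   # 1.0  (just validated)
    decay.compute_decay(30)  # 0.5  (one half-life)
    decay.compute_decay(60)  # 0.25 (two half-lives)
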
class LinearDecay(DecayFunction):
    """
    Linear decay to a fixed floor over a specified period.

    Confidence decreases linearly from 1 to min_value over decay_period.
    """

    def __init__(
        self,
        decay_period_days: float = 90.0,
        min_value: float = 0.1,
    ):
        """
        Initialize linear decay.

        Args:
            decay_period_days: Days until confidence reaches min_value
            min_value: Minimum confidence value (floor)
        """
        self.decay_period_days = decay_period_days
        self.min_value = min_value

    def compute_decay(self, days_since_validation: float) -> float:
        """Compute linear decay multiplier."""
        decay = 1.0 - (days_since_validation / self.decay_period_days) * (
            1.0 - self.min_value
        )
        return max(self.min_value, decay)

    def get_name(self) -> str:
        return f"linear(period={self.decay_period_days}d, min={self.min_value})"

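The linear variant falls at a constant rate, then clamps at the floor. With the defaults (90-day period, 0.1 floor):

    decay = LinearDecay(decay_period_days=90.0, min_value=0.1)
    decay.compute_decay(45)   # 0.55 (halfway from 1.0 down to 0.1)
    decay.compute_decay(90)   # 0.1  (floor reached)
    decay.compute_decay(365)  # 0.1  (clamped; never drops below the floor)
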
class StepDecay(DecayFunction):
    """
    Step-wise decay with configurable thresholds.

    Confidence drops at specific day thresholds.
    """

    def __init__(
        self,
        steps: Optional[List[tuple]] = None,
    ):
        """
        Initialize step decay.

        Args:
            steps: List of (days, multiplier) tuples, sorted by days ascending
                Default: [(30, 0.9), (60, 0.7), (90, 0.5), (180, 0.3)]
        """
        self.steps = steps or [
            (30, 0.9),
            (60, 0.7),
            (90, 0.5),
            (180, 0.3),
        ]
        # Ensure sorted
        self.steps = sorted(self.steps, key=lambda x: x[0])

    def compute_decay(self, days_since_validation: float) -> float:
        """Compute step decay multiplier."""
        multiplier = 1.0
        for threshold_days, step_multiplier in self.steps:
            if days_since_validation >= threshold_days:
                multiplier = step_multiplier
            else:
                break
        return multiplier

    def get_name(self) -> str:
        return f"step({len(self.steps)} steps)"

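Because the loop keeps advancing while the threshold is met, the returned multiplier is the one attached to the last threshold reached. With the default steps:

    decay = StepDecay()       # steps: (30, 0.9), (60, 0.7), (90, 0.5), (180, 0.3)
    decay.compute_decay(15)   # 1.0 (no threshold reached yet)
    decay.compute_decay(45)   # 0.9 (past 30 days, not yet 60)
    decay.compute_decay(200)  # 0.3 (past the final 180-day step)
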
class NoDecay(DecayFunction):
    """No decay - confidence remains constant."""

    def compute_decay(self, days_since_validation: float) -> float:
        return 1.0

    def get_name(self) -> str:
        return "none"

# ==================== CONFIDENCE DECAYER ====================


@dataclass
class DecayResult:
    """Result of applying confidence decay."""

    items_processed: int = 0
    items_updated: int = 0
    items_pruned: int = 0
    avg_decay_applied: float = 0.0
    execution_time_ms: int = 0


class ConfidenceDecayer:
    """
    Applies confidence decay to memories based on time since validation.

    Unlike pruning (which removes items), decay reduces confidence over time,
    making items less likely to be retrieved while preserving them for potential
    revalidation.
    """

    def __init__(
        self,
        storage: StorageBackend,
        decay_function: Optional[DecayFunction] = None,
        prune_below_confidence: float = 0.1,
    ):
        """
        Initialize confidence decayer.

        Args:
            storage: Storage backend to update
            decay_function: Function to compute decay (default: ExponentialDecay)
            prune_below_confidence: Auto-prune items that decay below this threshold
        """
        self.storage = storage
        self.decay_function = decay_function or ExponentialDecay(half_life_days=30.0)
        self.prune_below_confidence = prune_below_confidence

    def apply_decay(
        self,
        project_id: str,
        agent: Optional[str] = None,
        dry_run: bool = False,
    ) -> DecayResult:
        """
        Apply confidence decay to all eligible memories.

        Args:
            project_id: Project to process
            agent: Specific agent or None for all
            dry_run: If True, calculate but don't update

        Returns:
            DecayResult with statistics
        """
        start_time = time.time()
        result = DecayResult()
        now = datetime.now(timezone.utc)
        total_decay = 0.0

        # Process heuristics
        heuristics = self.storage.get_heuristics(
            project_id=project_id,
            agent=agent,
            top_k=10000,
            min_confidence=0.0,
        )

        for h in heuristics:
            result.items_processed += 1
            days_since = (now - h.last_validated).total_seconds() / (24 * 60 * 60)
            decay_multiplier = self.decay_function.compute_decay(days_since)

            new_confidence = h.confidence * decay_multiplier
            total_decay += 1.0 - decay_multiplier

            if new_confidence != h.confidence:
                if new_confidence < self.prune_below_confidence:
                    # Below threshold - prune
                    if not dry_run:
                        self.storage.delete_heuristic(h.id)
                    result.items_pruned += 1
                else:
                    # Update confidence
                    if not dry_run:
                        self.storage.update_heuristic_confidence(h.id, new_confidence)
                    result.items_updated += 1

        # Process domain knowledge
        knowledge = self.storage.get_domain_knowledge(
            project_id=project_id,
            agent=agent,
            top_k=10000,
        )

        for dk in knowledge:
            result.items_processed += 1
            days_since = (now - dk.last_verified).total_seconds() / (24 * 60 * 60)
            decay_multiplier = self.decay_function.compute_decay(days_since)

            new_confidence = dk.confidence * decay_multiplier
            total_decay += 1.0 - decay_multiplier

            if new_confidence != dk.confidence:
                if new_confidence < self.prune_below_confidence:
                    if not dry_run:
                        self.storage.delete_domain_knowledge(dk.id)
                    result.items_pruned += 1
                else:
                    if not dry_run:
                        self.storage.update_knowledge_confidence(dk.id, new_confidence)
                    result.items_updated += 1

        result.execution_time_ms = int((time.time() - start_time) * 1000)
        if result.items_processed > 0:
            result.avg_decay_applied = total_decay / result.items_processed

        action = "Would apply" if dry_run else "Applied"
        logger.info(
            f"{action} decay to {result.items_processed} items: "
            f"{result.items_updated} updated, {result.items_pruned} pruned "
            f"(avg decay: {result.avg_decay_applied:.2%})"
        )

        return result

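The decayer composes with any DecayFunction above. A sketch, again assuming a configured `storage` backend (the step function and threshold are illustrative choices):

    decayer = ConfidenceDecayer(
        storage,
        decay_function=StepDecay(),  # or ExponentialDecay / LinearDecay
        prune_below_confidence=0.15,
    )
    result = decayer.apply_decay(project_id="my-project", dry_run=True)
    print(result.items_processed, result.items_updated, result.items_pruned)
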
# ==================== MEMORY HEALTH MONITOR ====================


@dataclass
class MemoryHealthMetrics:
    """Metrics about memory health and growth."""

    total_items: int = 0
    heuristic_count: int = 0
    outcome_count: int = 0
    knowledge_count: int = 0
    anti_pattern_count: int = 0
    avg_heuristic_confidence: float = 0.0
    avg_heuristic_age_days: float = 0.0
    stale_heuristic_count: int = 0
    low_confidence_count: int = 0
    storage_bytes: int = 0
    agents_count: int = 0
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "total_items": self.total_items,
            "heuristic_count": self.heuristic_count,
            "outcome_count": self.outcome_count,
            "knowledge_count": self.knowledge_count,
            "anti_pattern_count": self.anti_pattern_count,
            "avg_heuristic_confidence": round(self.avg_heuristic_confidence, 3),
            "avg_heuristic_age_days": round(self.avg_heuristic_age_days, 1),
            "stale_heuristic_count": self.stale_heuristic_count,
            "low_confidence_count": self.low_confidence_count,
            "storage_bytes": self.storage_bytes,
            "agents_count": self.agents_count,
            "timestamp": self.timestamp.isoformat(),
        }


@dataclass
class HealthAlert:
    """An alert about memory health issues."""

    level: str  # "warning", "critical"
    category: str
    message: str
    current_value: Any
    threshold: Any
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))


@dataclass
class HealthThresholds:
    """Thresholds for health monitoring alerts."""

    # Warning thresholds
    max_total_items_warning: int = 5000
    max_stale_percentage_warning: float = 0.3
    min_avg_confidence_warning: float = 0.5
    max_agent_items_warning: int = 500

    # Critical thresholds
    max_total_items_critical: int = 10000
    max_stale_percentage_critical: float = 0.5
    min_avg_confidence_critical: float = 0.3
    max_storage_bytes_critical: int = 100 * 1024 * 1024  # 100MB


class MemoryHealthMonitor:
    """
    Monitors memory health and growth, generating alerts when thresholds exceeded.

    Tracks:
    - Total memory item counts
    - Average confidence levels
    - Staleness ratios
    - Storage size
    - Per-agent statistics
    """

    def __init__(
        self,
        storage: StorageBackend,
        thresholds: Optional[HealthThresholds] = None,
        stale_days: int = 60,
        low_confidence_threshold: float = 0.3,
    ):
        """
        Initialize health monitor.

        Args:
            storage: Storage backend to monitor
            thresholds: Alert thresholds
            stale_days: Days since validation to consider stale
            low_confidence_threshold: Confidence below which to count as low
        """
        self.storage = storage
        self.thresholds = thresholds or HealthThresholds()
        self.stale_days = stale_days
        self.low_confidence_threshold = low_confidence_threshold

        # History for trend analysis
        self._metrics_history: List[MemoryHealthMetrics] = []
        self._max_history = 100

        # Alert callbacks
        self._alert_handlers: List[Callable[[HealthAlert], None]] = []

    def add_alert_handler(self, handler: Callable[[HealthAlert], None]) -> None:
        """Add a callback to be called when alerts are generated."""
        self._alert_handlers.append(handler)

    def collect_metrics(self, project_id: str) -> MemoryHealthMetrics:
        """
        Collect current memory health metrics.

        Args:
            project_id: Project to analyze

        Returns:
            MemoryHealthMetrics snapshot
        """
        now = datetime.now(timezone.utc)
        stale_cutoff = now - timedelta(days=self.stale_days)

        metrics = MemoryHealthMetrics()

        # Get all heuristics
        heuristics = self.storage.get_heuristics(
            project_id=project_id,
            top_k=10000,
            min_confidence=0.0,
        )
        metrics.heuristic_count = len(heuristics)

        if heuristics:
            total_confidence = 0.0
            total_age = 0.0
            for h in heuristics:
                total_confidence += h.confidence
                age_days = (now - h.created_at).total_seconds() / (24 * 60 * 60)
                total_age += age_days
                if h.last_validated < stale_cutoff:
                    metrics.stale_heuristic_count += 1
                if h.confidence < self.low_confidence_threshold:
                    metrics.low_confidence_count += 1

            metrics.avg_heuristic_confidence = total_confidence / len(heuristics)
            metrics.avg_heuristic_age_days = total_age / len(heuristics)

        # Get other counts
        outcomes = self.storage.get_outcomes(
            project_id=project_id,
            top_k=10000,
            success_only=False,
        )
        metrics.outcome_count = len(outcomes)

        knowledge = self.storage.get_domain_knowledge(
            project_id=project_id,
            top_k=10000,
        )
        metrics.knowledge_count = len(knowledge)

        anti_patterns = self.storage.get_anti_patterns(
            project_id=project_id,
            top_k=10000,
        )
        metrics.anti_pattern_count = len(anti_patterns)

        metrics.total_items = (
            metrics.heuristic_count
            + metrics.outcome_count
            + metrics.knowledge_count
            + metrics.anti_pattern_count
        )

        # Get agent count
        stats = self.storage.get_stats(project_id=project_id)
        metrics.agents_count = len(stats.get("agents", []))

        # Estimate storage size (rough approximation)
        # Average ~500 bytes per item
        metrics.storage_bytes = metrics.total_items * 500

        # Store in history
        self._metrics_history.append(metrics)
        if len(self._metrics_history) > self._max_history:
            self._metrics_history = self._metrics_history[-self._max_history :]

        return metrics

    def check_health(self, project_id: str) -> List[HealthAlert]:
        """
        Check memory health and generate alerts if thresholds exceeded.

        Args:
            project_id: Project to check

        Returns:
            List of health alerts (empty if healthy)
        """
        metrics = self.collect_metrics(project_id)
        alerts: List[HealthAlert] = []
        t = self.thresholds

        # Check total items
        if metrics.total_items >= t.max_total_items_critical:
            alerts.append(
                HealthAlert(
                    level="critical",
                    category="total_items",
                    message="Memory item count critically high",
                    current_value=metrics.total_items,
                    threshold=t.max_total_items_critical,
                )
            )
        elif metrics.total_items >= t.max_total_items_warning:
            alerts.append(
                HealthAlert(
                    level="warning",
                    category="total_items",
                    message="Memory item count approaching limit",
                    current_value=metrics.total_items,
                    threshold=t.max_total_items_warning,
                )
            )

        # Check staleness
        if metrics.heuristic_count > 0:
            stale_percentage = metrics.stale_heuristic_count / metrics.heuristic_count
            if stale_percentage >= t.max_stale_percentage_critical:
                alerts.append(
                    HealthAlert(
                        level="critical",
                        category="staleness",
                        message="Too many stale heuristics",
                        current_value=f"{stale_percentage:.0%}",
                        threshold=f"{t.max_stale_percentage_critical:.0%}",
                    )
                )
            elif stale_percentage >= t.max_stale_percentage_warning:
                alerts.append(
                    HealthAlert(
                        level="warning",
                        category="staleness",
                        message="Many heuristics are stale",
                        current_value=f"{stale_percentage:.0%}",
                        threshold=f"{t.max_stale_percentage_warning:.0%}",
                    )
                )

        # Check average confidence
        if metrics.heuristic_count > 0:
            if metrics.avg_heuristic_confidence < t.min_avg_confidence_critical:
                alerts.append(
                    HealthAlert(
                        level="critical",
                        category="confidence",
                        message="Average heuristic confidence critically low",
                        current_value=f"{metrics.avg_heuristic_confidence:.2f}",
                        threshold=f"{t.min_avg_confidence_critical:.2f}",
                    )
                )
            elif metrics.avg_heuristic_confidence < t.min_avg_confidence_warning:
                alerts.append(
                    HealthAlert(
                        level="warning",
                        category="confidence",
                        message="Average heuristic confidence is low",
                        current_value=f"{metrics.avg_heuristic_confidence:.2f}",
                        threshold=f"{t.min_avg_confidence_warning:.2f}",
                    )
                )

        # Check storage size
        if metrics.storage_bytes >= t.max_storage_bytes_critical:
            alerts.append(
                HealthAlert(
                    level="critical",
                    category="storage",
                    message="Memory storage size critically high",
                    current_value=f"{metrics.storage_bytes / (1024 * 1024):.1f}MB",
                    threshold=f"{t.max_storage_bytes_critical / (1024 * 1024):.1f}MB",
                )
            )

        # Notify handlers
        for alert in alerts:
            for handler in self._alert_handlers:
                try:
                    handler(alert)
                except Exception as e:
                    logger.error(f"Alert handler error: {e}")

        return alerts

    def get_growth_trend(self, project_id: str) -> Dict[str, Any]:
        """
        Analyze memory growth trend from history.

        Args:
            project_id: Project to analyze

        Returns:
            Trend analysis
        """
        if len(self._metrics_history) < 2:
            return {
                "status": "insufficient_data",
                "samples": len(self._metrics_history),
            }

        first = self._metrics_history[0]
        last = self._metrics_history[-1]

        time_span = (last.timestamp - first.timestamp).total_seconds()
        if time_span <= 0:
            return {"status": "insufficient_time_span"}

        days_span = time_span / (24 * 60 * 60)
        item_growth = last.total_items - first.total_items
        growth_per_day = item_growth / days_span if days_span > 0 else 0

        return {
            "status": "ok",
            "samples": len(self._metrics_history),
            "time_span_days": round(days_span, 1),
            "total_growth": item_growth,
            "growth_per_day": round(growth_per_day, 2),
            "first_total": first.total_items,
            "last_total": last.total_items,
            "confidence_trend": round(
                last.avg_heuristic_confidence - first.avg_heuristic_confidence, 3
            ),
        }

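Alert handlers are plain callables, so the monitor can feed logs, metrics, or a pager without subclassing. A sketch with a logging handler (the tightened thresholds are illustrative):

    monitor = MemoryHealthMonitor(
        storage,
        thresholds=HealthThresholds(max_total_items_warning=2000),
        stale_days=45,
    )
    monitor.add_alert_handler(
        lambda alert: logger.warning(
            "%s/%s: %s (current=%s, threshold=%s)",
            alert.level, alert.category, alert.message,
            alert.current_value, alert.threshold,
        )
    )
    alerts = monitor.check_health(project_id="my-project")
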
# ==================== CLEANUP SCHEDULER ====================


@dataclass
class CleanupJob:
    """Configuration for a scheduled cleanup job."""

    name: str
    project_id: str
    interval_hours: float
    agent: Optional[str] = None
    policy: Optional[PrunePolicy] = None
    apply_decay: bool = True
    last_run: Optional[datetime] = None
    next_run: Optional[datetime] = None
    enabled: bool = True


@dataclass
class CleanupResult:
    """Result of a cleanup job execution."""

    job_name: str
    project_id: str
    started_at: datetime
    completed_at: datetime
    prune_summary: Optional[PruneSummary] = None
    decay_result: Optional[DecayResult] = None
    alerts: List[HealthAlert] = field(default_factory=list)
    success: bool = True
    error: Optional[str] = None


class CleanupScheduler:
    """
    Schedules and executes automated memory cleanup jobs.

    Features:
    - Configurable job intervals
    - Prune + decay in single operation
    - Health check integration
    - Job execution history
    - Thread-safe operation
    """

    def __init__(
        self,
        storage: StorageBackend,
        forgetting_engine: Optional[ForgettingEngine] = None,
        decayer: Optional[ConfidenceDecayer] = None,
        health_monitor: Optional[MemoryHealthMonitor] = None,
    ):
        """
        Initialize cleanup scheduler.

        Args:
            storage: Storage backend
            forgetting_engine: Engine for pruning (created if not provided)
            decayer: Engine for decay (created if not provided)
            health_monitor: Health monitor (created if not provided)
        """
        self.storage = storage
        self.forgetting_engine = forgetting_engine or ForgettingEngine(storage)
        self.decayer = decayer or ConfidenceDecayer(storage)
        self.health_monitor = health_monitor or MemoryHealthMonitor(storage)

        self._jobs: Dict[str, CleanupJob] = {}
        self._history: List[CleanupResult] = []
        self._max_history = 50
        self._lock = threading.RLock()
        self._running = False
        self._thread: Optional[threading.Thread] = None

    def register_job(self, job: CleanupJob) -> None:
        """
        Register a cleanup job.

        Args:
            job: Job configuration
        """
        with self._lock:
            now = datetime.now(timezone.utc)
            job.next_run = now + timedelta(hours=job.interval_hours)
            self._jobs[job.name] = job
            logger.info(
                f"Registered cleanup job '{job.name}' for project {job.project_id}"
            )

    def unregister_job(self, name: str) -> bool:
        """
        Unregister a cleanup job.

        Args:
            name: Job name

        Returns:
            True if job was found and removed
        """
        with self._lock:
            if name in self._jobs:
                del self._jobs[name]
                logger.info(f"Unregistered cleanup job '{name}'")
                return True
            return False

    def run_job(self, name: str, dry_run: bool = False) -> CleanupResult:
        """
        Manually run a specific job.

        Args:
            name: Job name
            dry_run: If True, don't actually modify data

        Returns:
            CleanupResult with execution details
        """
        with self._lock:
            if name not in self._jobs:
                raise ValueError(f"Job '{name}' not found")
            job = self._jobs[name]

        return self._execute_job(job, dry_run)

    def run_all_due(self) -> List[CleanupResult]:
        """
        Run all jobs that are due.

        Returns:
            List of results for executed jobs
        """
        results = []
        now = datetime.now(timezone.utc)

        with self._lock:
            due_jobs = [
                job
                for job in self._jobs.values()
                if job.enabled and job.next_run and job.next_run <= now
            ]

        for job in due_jobs:
            try:
                result = self._execute_job(job)
                results.append(result)
            except Exception as e:
                logger.error(f"Error running job '{job.name}': {e}")
                results.append(
                    CleanupResult(
                        job_name=job.name,
                        project_id=job.project_id,
                        started_at=now,
                        completed_at=datetime.now(timezone.utc),
                        success=False,
                        error=str(e),
                    )
                )

        return results

    def _execute_job(self, job: CleanupJob, dry_run: bool = False) -> CleanupResult:
        """Execute a cleanup job."""
        started_at = datetime.now(timezone.utc)
        result = CleanupResult(
            job_name=job.name,
            project_id=job.project_id,
            started_at=started_at,
            completed_at=started_at,
        )

        try:
            # Run prune
            engine = ForgettingEngine(
                self.storage,
                job.policy or self.forgetting_engine.policy,
            )
            result.prune_summary = engine.prune(
                project_id=job.project_id,
                agent=job.agent,
                dry_run=dry_run,
            )

            # Run decay if enabled
            if job.apply_decay:
                result.decay_result = self.decayer.apply_decay(
                    project_id=job.project_id,
                    agent=job.agent,
                    dry_run=dry_run,
                )

            # Check health
            result.alerts = self.health_monitor.check_health(job.project_id)

            # Update job timing
            with self._lock:
                now = datetime.now(timezone.utc)
                job.last_run = now
                job.next_run = now + timedelta(hours=job.interval_hours)

            result.success = True

        except Exception as e:
            result.success = False
            result.error = str(e)
            logger.error(f"Cleanup job '{job.name}' failed: {e}")

        result.completed_at = datetime.now(timezone.utc)

        # Store in history
        with self._lock:
            self._history.append(result)
            if len(self._history) > self._max_history:
                self._history = self._history[-self._max_history :]

        return result

    def start_background(self, check_interval_seconds: int = 60) -> None:
        """
        Start background job execution thread.

        Args:
            check_interval_seconds: How often to check for due jobs
        """
        if self._running:
            logger.warning("Scheduler already running")
            return

        self._running = True

        def run():
            while self._running:
                try:
                    self.run_all_due()
                except Exception as e:
                    logger.error(f"Scheduler error: {e}")
                time.sleep(check_interval_seconds)

        self._thread = threading.Thread(target=run, daemon=True)
        self._thread.start()
        logger.info(f"Cleanup scheduler started (interval: {check_interval_seconds}s)")

    def stop_background(self) -> None:
        """Stop the background execution thread."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        logger.info("Cleanup scheduler stopped")

    def get_jobs(self) -> List[Dict[str, Any]]:
        """Get all registered jobs."""
        with self._lock:
            return [
                {
                    "name": job.name,
                    "project_id": job.project_id,
                    "interval_hours": job.interval_hours,
                    "agent": job.agent,
                    "enabled": job.enabled,
                    "last_run": job.last_run.isoformat() if job.last_run else None,
                    "next_run": job.next_run.isoformat() if job.next_run else None,
                }
                for job in self._jobs.values()
            ]

    def get_history(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get recent job execution history."""
        with self._lock:
            recent = self._history[-limit:]
            return [
                {
                    "job_name": r.job_name,
                    "project_id": r.project_id,
                    "started_at": r.started_at.isoformat(),
                    "completed_at": r.completed_at.isoformat(),
                    "duration_ms": int(
                        (r.completed_at - r.started_at).total_seconds() * 1000
                    ),
                    "success": r.success,
                    "items_pruned": (
                        r.prune_summary.total_pruned if r.prune_summary else 0
                    ),
                    "items_decayed": (
                        r.decay_result.items_updated if r.decay_result else 0
                    ),
                    "alerts": len(r.alerts),
                    "error": r.error,
                }
                for r in reversed(recent)
            ]
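End to end, the scheduler ties the three engines together: register a job, start the background thread, and poll history. A sketch under the same `storage` assumption (names and intervals illustrative):

    scheduler = CleanupScheduler(storage)
    scheduler.register_job(
        CleanupJob(
            name="nightly",
            project_id="my-project",
            interval_hours=24.0,
            apply_decay=True,
        )
    )
    scheduler.start_background(check_interval_seconds=300)

    # later, e.g. from an admin endpoint:
    for run in scheduler.get_history(limit=5):
        print(run["job_name"], run["success"], run["items_pruned"])
    scheduler.stop_background()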
1
+ """
2
+ ALMA Forgetting Mechanism.
3
+
4
+ Implements intelligent memory pruning to prevent bloat and maintain relevance.
5
+
6
+ Features:
7
+ - Confidence decay over time (exponential, linear, step functions)
8
+ - Staleness detection based on last_validated timestamps
9
+ - Automated cleanup job scheduling
10
+ - Memory growth monitoring and alerting
11
+ """
12
+
13
+ import logging
14
+ import threading
15
+ import time
16
+ from abc import ABC, abstractmethod
17
+ from dataclasses import dataclass, field
18
+ from datetime import datetime, timedelta, timezone
19
+ from enum import Enum
20
+ from typing import Any, Callable, Dict, List, Optional
21
+
22
+ from alma.storage.base import StorageBackend
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class PruneReason(Enum):
28
+ """Reason for pruning a memory item."""
29
+
30
+ STALE = "stale" # Too old without validation
31
+ LOW_CONFIDENCE = "low_confidence" # Below confidence threshold
32
+ LOW_SUCCESS_RATE = "low_success" # Too many failures
33
+ SUPERSEDED = "superseded" # Replaced by better heuristic
34
+ DUPLICATE = "duplicate" # Duplicate of another item
35
+ QUOTA_EXCEEDED = "quota" # Agent memory quota exceeded
36
+
37
+
38
+ @dataclass
39
+ class PruneResult:
40
+ """Result of a prune operation."""
41
+
42
+ reason: PruneReason
43
+ item_type: str
44
+ item_id: str
45
+ agent: str
46
+ project_id: str
47
+ details: str = ""
48
+
49
+
50
+ @dataclass
51
+ class PruneSummary:
52
+ """Summary of a complete prune operation."""
53
+
54
+ outcomes_pruned: int = 0
55
+ heuristics_pruned: int = 0
56
+ knowledge_pruned: int = 0
57
+ anti_patterns_pruned: int = 0
58
+ total_pruned: int = 0
59
+ pruned_items: List[PruneResult] = field(default_factory=list)
60
+ execution_time_ms: int = 0
61
+
62
+ def to_dict(self) -> Dict[str, Any]:
63
+ """Convert to dictionary."""
64
+ return {
65
+ "outcomes_pruned": self.outcomes_pruned,
66
+ "heuristics_pruned": self.heuristics_pruned,
67
+ "knowledge_pruned": self.knowledge_pruned,
68
+ "anti_patterns_pruned": self.anti_patterns_pruned,
69
+ "total_pruned": self.total_pruned,
70
+ "execution_time_ms": self.execution_time_ms,
71
+ }
72
+
73
+
74
+ @dataclass
75
+ class PrunePolicy:
76
+ """
77
+ Configuration for memory pruning behavior.
78
+
79
+ Defines thresholds and quotas for different memory types.
80
+ """
81
+
82
+ # Age-based pruning
83
+ outcome_max_age_days: int = 90
84
+ knowledge_max_age_days: int = 180
85
+ anti_pattern_max_age_days: int = 365
86
+
87
+ # Confidence thresholds
88
+ heuristic_min_confidence: float = 0.3
89
+ knowledge_min_confidence: float = 0.5
90
+
91
+ # Success rate thresholds
92
+ heuristic_min_success_rate: float = 0.4
93
+ min_occurrences_before_prune: int = 5 # Don't prune until enough data
94
+
95
+ # Quota limits (per agent)
96
+ max_heuristics_per_agent: int = 100
97
+ max_outcomes_per_agent: int = 500
98
+ max_knowledge_per_agent: int = 200
99
+ max_anti_patterns_per_agent: int = 50
100
+
101
+ # Staleness (time since last validation)
102
+ heuristic_stale_days: int = 60
103
+ knowledge_stale_days: int = 90
104
+
105
+
106
+ class ForgettingEngine:
107
+ """
108
+ Manages memory pruning and forgetting.
109
+
110
+ Implements multiple strategies:
111
+ - Age-based decay (old memories are pruned)
112
+ - Confidence-based pruning (low confidence items removed)
113
+ - Success-rate based pruning (unsuccessful patterns removed)
114
+ - Quota enforcement (prevents memory bloat)
115
+ - Staleness detection (unvalidated memories removed)
116
+ """
117
+
118
+ def __init__(
119
+ self,
120
+ storage: StorageBackend,
121
+ policy: Optional[PrunePolicy] = None,
122
+ ):
123
+ """
124
+ Initialize forgetting engine.
125
+
126
+ Args:
127
+ storage: Storage backend to prune
128
+ policy: Pruning policy configuration
129
+ """
130
+ self.storage = storage
131
+ self.policy = policy or PrunePolicy()
132
+
133
+ def prune(
134
+ self,
135
+ project_id: str,
136
+ agent: Optional[str] = None,
137
+ dry_run: bool = False,
138
+ ) -> PruneSummary:
139
+ """
140
+ Run a complete prune operation.
141
+
142
+ Args:
143
+ project_id: Project to prune
144
+ agent: Specific agent or None for all
145
+ dry_run: If True, only report what would be pruned
146
+
147
+ Returns:
148
+ PruneSummary with details
149
+ """
150
+ import time
151
+
152
+ start_time = time.time()
153
+
154
+ summary = PruneSummary()
155
+
156
+ # Prune each memory type
157
+ summary.outcomes_pruned = self._prune_stale_outcomes(
158
+ project_id, agent, dry_run, summary.pruned_items
159
+ )
160
+
161
+ summary.heuristics_pruned = self._prune_heuristics(
162
+ project_id, agent, dry_run, summary.pruned_items
163
+ )
164
+
165
+ summary.knowledge_pruned = self._prune_domain_knowledge(
166
+ project_id, agent, dry_run, summary.pruned_items
167
+ )
168
+
169
+ summary.anti_patterns_pruned = self._prune_anti_patterns(
170
+ project_id, agent, dry_run, summary.pruned_items
171
+ )
172
+
173
+ # Enforce quotas
174
+ quota_pruned = self._enforce_quotas(
175
+ project_id, agent, dry_run, summary.pruned_items
176
+ )
177
+ summary.heuristics_pruned += quota_pruned.get("heuristics", 0)
178
+ summary.outcomes_pruned += quota_pruned.get("outcomes", 0)
179
+
180
+ summary.total_pruned = (
181
+ summary.outcomes_pruned
182
+ + summary.heuristics_pruned
183
+ + summary.knowledge_pruned
184
+ + summary.anti_patterns_pruned
185
+ )
186
+
187
+ summary.execution_time_ms = int((time.time() - start_time) * 1000)
188
+
189
+ action = "Would prune" if dry_run else "Pruned"
190
+ logger.info(
191
+ f"{action} {summary.total_pruned} items for project={project_id}, "
192
+ f"agent={agent or 'all'}"
193
+ )
194
+
195
+ return summary
196
+
197
+ def _prune_stale_outcomes(
198
+ self,
199
+ project_id: str,
200
+ agent: Optional[str],
201
+ dry_run: bool,
202
+ results: List[PruneResult],
203
+ ) -> int:
204
+ """Prune outcomes older than max age."""
205
+ cutoff = datetime.now(timezone.utc) - timedelta(
206
+ days=self.policy.outcome_max_age_days
207
+ )
208
+
209
+ if dry_run:
210
+ # Get count of outcomes to prune
211
+ outcomes = self.storage.get_outcomes(
212
+ project_id=project_id,
213
+ agent=agent,
214
+ top_k=10000,
215
+ success_only=False,
216
+ )
217
+ count = sum(1 for o in outcomes if o.timestamp < cutoff)
218
+ for o in outcomes:
219
+ if o.timestamp < cutoff:
220
+ results.append(
221
+ PruneResult(
222
+ reason=PruneReason.STALE,
223
+ item_type="outcome",
224
+ item_id=o.id,
225
+ agent=o.agent,
226
+ project_id=project_id,
227
+ details=f"Older than {self.policy.outcome_max_age_days} days",
228
+ )
229
+ )
230
+ return count
231
+ else:
232
+ return self.storage.delete_outcomes_older_than(
233
+ project_id=project_id,
234
+ older_than=cutoff,
235
+ agent=agent,
236
+ )
237
+
238
+ def _prune_heuristics(
239
+ self,
240
+ project_id: str,
241
+ agent: Optional[str],
242
+ dry_run: bool,
243
+ results: List[PruneResult],
244
+ ) -> int:
245
+ """Prune heuristics based on confidence and success rate."""
246
+ heuristics = self.storage.get_heuristics(
247
+ project_id=project_id,
248
+ agent=agent,
249
+ top_k=10000,
250
+ min_confidence=0.0,
251
+ )
252
+
253
+ to_delete = []
254
+ now = datetime.now(timezone.utc)
255
+ stale_cutoff = now - timedelta(days=self.policy.heuristic_stale_days)
256
+
257
+ for h in heuristics:
258
+ reason = None
259
+ details = ""
260
+
261
+ # Check confidence
262
+ if h.confidence < self.policy.heuristic_min_confidence:
263
+ reason = PruneReason.LOW_CONFIDENCE
264
+ details = f"Confidence {h.confidence:.2f} < {self.policy.heuristic_min_confidence}"
265
+
266
+ # Check success rate (only if enough occurrences)
267
+ elif (
268
+ h.occurrence_count >= self.policy.min_occurrences_before_prune
269
+ and h.success_rate < self.policy.heuristic_min_success_rate
270
+ ):
271
+ reason = PruneReason.LOW_SUCCESS_RATE
272
+ details = f"Success rate {h.success_rate:.2f} < {self.policy.heuristic_min_success_rate}"
273
+
274
+ # Check staleness
275
+ elif h.last_validated < stale_cutoff:
276
+ reason = PruneReason.STALE
277
+ details = f"Not validated since {h.last_validated.date()}"
278
+
279
+ if reason:
280
+ to_delete.append(h)
281
+ results.append(
282
+ PruneResult(
283
+ reason=reason,
284
+ item_type="heuristic",
285
+ item_id=h.id,
286
+ agent=h.agent,
287
+ project_id=project_id,
288
+ details=details,
289
+ )
290
+ )
291
+
292
+ if not dry_run:
293
+ for h in to_delete:
294
+ self.storage.delete_heuristic(h.id)
295
+
296
+ return len(to_delete)
297
+
+    def _prune_domain_knowledge(
+        self,
+        project_id: str,
+        agent: Optional[str],
+        dry_run: bool,
+        results: List[PruneResult],
+    ) -> int:
+        """Prune old or low-confidence domain knowledge."""
+        knowledge = self.storage.get_domain_knowledge(
+            project_id=project_id,
+            agent=agent,
+            top_k=10000,
+        )
+
+        to_delete = []
+        now = datetime.now(timezone.utc)
+        age_cutoff = now - timedelta(days=self.policy.knowledge_max_age_days)
+        stale_cutoff = now - timedelta(days=self.policy.knowledge_stale_days)
+
+        for dk in knowledge:
+            reason = None
+            details = ""
+
+            # Check confidence
+            if dk.confidence < self.policy.knowledge_min_confidence:
+                reason = PruneReason.LOW_CONFIDENCE
+                details = f"Confidence {dk.confidence:.2f} < {self.policy.knowledge_min_confidence}"
+
+            # Check age (hard cap on time since last verification)
+            elif dk.last_verified < age_cutoff:
+                reason = PruneReason.STALE
+                details = f"Older than {self.policy.knowledge_max_age_days} days"
+
+            # Check staleness (softer window, also on last verification)
+            elif dk.last_verified < stale_cutoff:
+                reason = PruneReason.STALE
+                details = f"Not verified since {dk.last_verified.date()}"
+
+            if reason:
+                to_delete.append(dk)
+                results.append(
+                    PruneResult(
+                        reason=reason,
+                        item_type="domain_knowledge",
+                        item_id=dk.id,
+                        agent=dk.agent,
+                        project_id=project_id,
+                        details=details,
+                    )
+                )
+
+        if not dry_run:
+            for dk in to_delete:
+                self.storage.delete_domain_knowledge(dk.id)
+
+        return len(to_delete)
+
+    def _prune_anti_patterns(
+        self,
+        project_id: str,
+        agent: Optional[str],
+        dry_run: bool,
+        results: List[PruneResult],
+    ) -> int:
+        """Prune old anti-patterns."""
+        anti_patterns = self.storage.get_anti_patterns(
+            project_id=project_id,
+            agent=agent,
+            top_k=10000,
+        )
+
+        to_delete = []
+        now = datetime.now(timezone.utc)
+        age_cutoff = now - timedelta(days=self.policy.anti_pattern_max_age_days)
+
+        for ap in anti_patterns:
+            if ap.last_seen < age_cutoff:
+                to_delete.append(ap)
+                results.append(
+                    PruneResult(
+                        reason=PruneReason.STALE,
+                        item_type="anti_pattern",
+                        item_id=ap.id,
+                        agent=ap.agent,
+                        project_id=project_id,
+                        details=f"Not seen since {ap.last_seen.date()}",
+                    )
+                )
+
+        if not dry_run:
+            for ap in to_delete:
+                self.storage.delete_anti_pattern(ap.id)
+
+        return len(to_delete)
+
+    def _enforce_quotas(
+        self,
+        project_id: str,
+        agent: Optional[str],
+        dry_run: bool,
+        results: List[PruneResult],
+    ) -> Dict[str, int]:
+        """Enforce per-agent memory quotas."""
+        pruned = {"heuristics": 0, "outcomes": 0}
+
+        if agent:
+            agents = [agent]
+        else:
+            # Get all agents with data
+            stats = self.storage.get_stats(project_id=project_id)
+            agents = stats.get("agents", [])
+
+        for ag in agents:
+            # Check heuristic quota
+            heuristics = self.storage.get_heuristics(
+                project_id=project_id,
+                agent=ag,
+                top_k=self.policy.max_heuristics_per_agent + 100,
+                min_confidence=0.0,
+            )
+
+            if len(heuristics) > self.policy.max_heuristics_per_agent:
+                # Sort by confidence (lowest first)
+                sorted_h = sorted(heuristics, key=lambda x: x.confidence)
+                to_remove = len(heuristics) - self.policy.max_heuristics_per_agent
+
+                for h in sorted_h[:to_remove]:
+                    results.append(
+                        PruneResult(
+                            reason=PruneReason.QUOTA_EXCEEDED,
+                            item_type="heuristic",
+                            item_id=h.id,
+                            agent=ag,
+                            project_id=project_id,
+                            details=f"Exceeded quota of {self.policy.max_heuristics_per_agent}",
+                        )
+                    )
+                    if not dry_run:
+                        self.storage.delete_heuristic(h.id)
+                    pruned["heuristics"] += 1
+
+            # Check outcome quota
+            outcomes = self.storage.get_outcomes(
+                project_id=project_id,
+                agent=ag,
+                top_k=self.policy.max_outcomes_per_agent + 100,
+                success_only=False,
+            )
+
+            if len(outcomes) > self.policy.max_outcomes_per_agent:
+                # Sort by timestamp (oldest first)
+                sorted_o = sorted(outcomes, key=lambda x: x.timestamp)
+                to_remove = len(outcomes) - self.policy.max_outcomes_per_agent
+
+                for o in sorted_o[:to_remove]:
+                    results.append(
+                        PruneResult(
+                            reason=PruneReason.QUOTA_EXCEEDED,
+                            item_type="outcome",
+                            item_id=o.id,
+                            agent=ag,
+                            project_id=project_id,
+                            details=f"Exceeded quota of {self.policy.max_outcomes_per_agent}",
+                        )
+                    )
+                    if not dry_run:
+                        self.storage.delete_outcome(o.id)
+                    pruned["outcomes"] += 1
+
+        return pruned
+
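# Illustrative dry run over the full pipeline, assuming `storage` is a
# configured StorageBackend; with dry_run=True, prune() only reports what
# the helpers above would remove.
engine = ForgettingEngine(storage, conservative_policy)  # policy from the sketch above
summary = engine.prune(project_id="my-project", dry_run=True)
print(f"Would prune {summary.total_pruned} items")
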
+    def compute_decay_score(
+        self,
+        item_age_days: float,
+        confidence: float,
+        success_rate: float,
+        occurrence_count: int,
+    ) -> float:
+        """
+        Compute a decay score for an item (lower = more likely to forget).
+
+        Factors:
+        - Recency (newer = higher)
+        - Confidence (higher = higher)
+        - Success rate (higher = higher)
+        - Validation frequency (more = higher)
+
+        Returns:
+            Score between 0 and 1
+        """
+        # Age decay (half-life of 30 days)
+        age_score = 0.5 ** (item_age_days / 30.0)
+
+        # Normalize occurrence count (cap at 20)
+        occurrence_score = min(occurrence_count / 20.0, 1.0)
+
+        # Weighted combination
+        return (
+            0.3 * age_score
+            + 0.3 * confidence
+            + 0.2 * success_rate
+            + 0.2 * occurrence_score
+        )
+
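# Worked example of the weighting above: a 30-day-old heuristic with
# confidence 0.8, success rate 0.6, seen 10 times.
#   age_score        = 0.5 ** (30 / 30)  = 0.5
#   occurrence_score = min(10 / 20, 1.0) = 0.5
#   score = 0.3*0.5 + 0.3*0.8 + 0.2*0.6 + 0.2*0.5 = 0.61
score = ForgettingEngine(storage).compute_decay_score(  # `storage` assumed as above
    item_age_days=30.0, confidence=0.8, success_rate=0.6, occurrence_count=10
)
assert abs(score - 0.61) < 1e-9
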
+    def identify_candidates(
+        self,
+        project_id: str,
+        agent: Optional[str] = None,
+        max_candidates: int = 20,
+    ) -> List[Dict[str, Any]]:
+        """
+        Identify memory items that are candidates for pruning.
+
+        Returns items with lowest decay scores.
+
+        Args:
+            project_id: Project to analyze
+            agent: Specific agent or None for all
+            max_candidates: Maximum candidates to return
+
+        Returns:
+            List of candidate items with scores
+        """
+        candidates = []
+        now = datetime.now(timezone.utc)
+
+        # Analyze heuristics
+        heuristics = self.storage.get_heuristics(
+            project_id=project_id,
+            agent=agent,
+            top_k=1000,
+            min_confidence=0.0,
+        )
+
+        for h in heuristics:
+            age_days = (now - h.created_at).total_seconds() / (24 * 60 * 60)
+            score = self.compute_decay_score(
+                item_age_days=age_days,
+                confidence=h.confidence,
+                success_rate=h.success_rate,
+                occurrence_count=h.occurrence_count,
+            )
+            candidates.append(
+                {
+                    "type": "heuristic",
+                    "id": h.id,
+                    "agent": h.agent,
+                    "score": score,
+                    "age_days": int(age_days),
+                    "confidence": h.confidence,
+                    "summary": h.strategy[:50],
+                }
+            )
+
+        # Sort by score (lowest first = best candidates for pruning)
+        candidates.sort(key=lambda x: x["score"])
+
+        return candidates[:max_candidates]
+
+
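# Hypothetical review pass: list the ten weakest heuristics (lowest decay
# score) before deleting anything; the field names match the dicts built above.
for c in ForgettingEngine(storage).identify_candidates(
    project_id="my-project", max_candidates=10
):
    print(f"{c['score']:.2f} {c['type']} {c['id']} ({c['age_days']}d): {c['summary']}")
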
+# ==================== DECAY FUNCTIONS ====================
+
+
+class DecayFunction(ABC):
+    """Abstract base class for confidence decay functions."""
+
+    @abstractmethod
+    def compute_decay(self, days_since_validation: float) -> float:
+        """
+        Compute decay multiplier for a given time since validation.
+
+        Args:
+            days_since_validation: Days since last validation
+
+        Returns:
+            Multiplier between 0 and 1 to apply to confidence
+        """
+        pass
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Return the name of this decay function."""
+        pass
+
+
+class ExponentialDecay(DecayFunction):
+    """
+    Exponential decay with configurable half-life.
+
+    Confidence = original * 0.5^(days/half_life)
+    """
+
+    def __init__(self, half_life_days: float = 30.0):
+        """
+        Initialize exponential decay.
+
+        Args:
+            half_life_days: Days until confidence halves
+        """
+        self.half_life_days = half_life_days
+
+    def compute_decay(self, days_since_validation: float) -> float:
+        """Compute exponential decay multiplier."""
+        return 0.5 ** (days_since_validation / self.half_life_days)
+
+    def get_name(self) -> str:
+        return f"exponential(half_life={self.half_life_days}d)"
+
+
+class LinearDecay(DecayFunction):
+    """
+    Linear decay to a configurable floor over a specified period.
+
+    Confidence decreases linearly from 1 to min_value over decay_period.
+    """
+
+    def __init__(
+        self,
+        decay_period_days: float = 90.0,
+        min_value: float = 0.1,
+    ):
+        """
+        Initialize linear decay.
+
+        Args:
+            decay_period_days: Days until confidence reaches min_value
+            min_value: Minimum confidence value (floor)
+        """
+        self.decay_period_days = decay_period_days
+        self.min_value = min_value
+
+    def compute_decay(self, days_since_validation: float) -> float:
+        """Compute linear decay multiplier."""
+        decay = 1.0 - (days_since_validation / self.decay_period_days) * (
+            1.0 - self.min_value
+        )
+        return max(self.min_value, decay)
+
+    def get_name(self) -> str:
+        return f"linear(period={self.decay_period_days}d, min={self.min_value})"
+
+
+class StepDecay(DecayFunction):
+    """
+    Step-wise decay with configurable thresholds.
+
+    Confidence drops at specific day thresholds.
+    """
+
+    def __init__(
+        self,
+        steps: Optional[List[tuple]] = None,
+    ):
+        """
+        Initialize step decay.
+
+        Args:
+            steps: List of (days, multiplier) tuples, sorted by days ascending
+                Default: [(30, 0.9), (60, 0.7), (90, 0.5), (180, 0.3)]
+        """
+        self.steps = steps or [
+            (30, 0.9),
+            (60, 0.7),
+            (90, 0.5),
+            (180, 0.3),
+        ]
+        # Ensure sorted
+        self.steps = sorted(self.steps, key=lambda x: x[0])
+
+    def compute_decay(self, days_since_validation: float) -> float:
+        """Compute step decay multiplier."""
+        multiplier = 1.0
+        for threshold_days, step_multiplier in self.steps:
+            if days_since_validation >= threshold_days:
+                multiplier = step_multiplier
+            else:
+                break
+        return multiplier
+
+    def get_name(self) -> str:
+        return f"step({len(self.steps)} steps)"
+
+
+class NoDecay(DecayFunction):
+    """No decay - confidence remains constant."""
+
+    def compute_decay(self, days_since_validation: float) -> float:
+        return 1.0
+
+    def get_name(self) -> str:
+        return "none"
+
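# Side-by-side sketch of the four decay functions at 45 days since the last
# validation; the values follow directly from the formulas above.
for fn in (ExponentialDecay(30.0), LinearDecay(90.0, 0.1), StepDecay(), NoDecay()):
    print(f"{fn.get_name():32s} {fn.compute_decay(45.0):.3f}")
# exponential(half_life=30.0d)     0.354   (0.5 ** 1.5)
# linear(period=90.0d, min=0.1)    0.550   (1 - 0.5 * 0.9)
# step(4 steps)                    0.900   (past the 30d step, before 60d)
# none                             1.000
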
+
+# ==================== CONFIDENCE DECAYER ====================
+
+
+@dataclass
+class DecayResult:
+    """Result of applying confidence decay."""
+
+    items_processed: int = 0
+    items_updated: int = 0
+    items_pruned: int = 0
+    avg_decay_applied: float = 0.0
+    execution_time_ms: int = 0
+
+
+class ConfidenceDecayer:
+    """
+    Applies confidence decay to memories based on time since validation.
+
+    Unlike pruning (which removes items), decay reduces confidence over time,
+    making items less likely to be retrieved while preserving them for potential
+    revalidation.
+    """
+
+    def __init__(
+        self,
+        storage: StorageBackend,
+        decay_function: Optional[DecayFunction] = None,
+        prune_below_confidence: float = 0.1,
+    ):
+        """
+        Initialize confidence decayer.
+
+        Args:
+            storage: Storage backend to update
+            decay_function: Function to compute decay (default: ExponentialDecay)
+            prune_below_confidence: Auto-prune items that decay below this threshold
+        """
+        self.storage = storage
+        self.decay_function = decay_function or ExponentialDecay(half_life_days=30.0)
+        self.prune_below_confidence = prune_below_confidence
+
+    def apply_decay(
+        self,
+        project_id: str,
+        agent: Optional[str] = None,
+        dry_run: bool = False,
+    ) -> DecayResult:
+        """
+        Apply confidence decay to all eligible memories.
+
+        Args:
+            project_id: Project to process
+            agent: Specific agent or None for all
+            dry_run: If True, calculate but don't update
+
+        Returns:
+            DecayResult with statistics
+        """
+        start_time = time.time()
+        result = DecayResult()
+        now = datetime.now(timezone.utc)
+        total_decay = 0.0
+
+        # Process heuristics
+        heuristics = self.storage.get_heuristics(
+            project_id=project_id,
+            agent=agent,
+            top_k=10000,
+            min_confidence=0.0,
+        )
+
+        for h in heuristics:
+            result.items_processed += 1
+            days_since = (now - h.last_validated).total_seconds() / (24 * 60 * 60)
+            decay_multiplier = self.decay_function.compute_decay(days_since)
+
+            new_confidence = h.confidence * decay_multiplier
+            total_decay += 1.0 - decay_multiplier
+
+            if new_confidence != h.confidence:
+                if new_confidence < self.prune_below_confidence:
+                    # Below threshold - prune
+                    if not dry_run:
+                        self.storage.delete_heuristic(h.id)
+                    result.items_pruned += 1
+                else:
+                    # Update confidence
+                    if not dry_run:
+                        self.storage.update_heuristic_confidence(h.id, new_confidence)
+                    result.items_updated += 1
+
+        # Process domain knowledge
+        knowledge = self.storage.get_domain_knowledge(
+            project_id=project_id,
+            agent=agent,
+            top_k=10000,
+        )
+
+        for dk in knowledge:
+            result.items_processed += 1
+            days_since = (now - dk.last_verified).total_seconds() / (24 * 60 * 60)
+            decay_multiplier = self.decay_function.compute_decay(days_since)
+
+            new_confidence = dk.confidence * decay_multiplier
+            total_decay += 1.0 - decay_multiplier
+
+            if new_confidence != dk.confidence:
+                if new_confidence < self.prune_below_confidence:
+                    if not dry_run:
+                        self.storage.delete_domain_knowledge(dk.id)
+                    result.items_pruned += 1
+                else:
+                    if not dry_run:
+                        self.storage.update_knowledge_confidence(dk.id, new_confidence)
+                    result.items_updated += 1
+
+        result.execution_time_ms = int((time.time() - start_time) * 1000)
+        if result.items_processed > 0:
+            result.avg_decay_applied = total_decay / result.items_processed
+
+        action = "Would apply" if dry_run else "Applied"
+        logger.info(
+            f"{action} decay to {result.items_processed} items: "
+            f"{result.items_updated} updated, {result.items_pruned} pruned "
+            f"(avg decay: {result.avg_decay_applied:.2%})"
+        )
+
+        return result
+
+
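# A minimal decay pass, assuming `storage` is a configured StorageBackend:
# 14-day half-life, auto-pruning anything that decays below 0.05 confidence.
decayer = ConfidenceDecayer(
    storage,
    decay_function=ExponentialDecay(half_life_days=14.0),
    prune_below_confidence=0.05,
)
result = decayer.apply_decay(project_id="my-project", dry_run=True)
print(result.items_updated, result.items_pruned, f"{result.avg_decay_applied:.1%}")
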
+# ==================== MEMORY HEALTH MONITOR ====================
+
+
+@dataclass
+class MemoryHealthMetrics:
+    """Metrics about memory health and growth."""
+
+    total_items: int = 0
+    heuristic_count: int = 0
+    outcome_count: int = 0
+    knowledge_count: int = 0
+    anti_pattern_count: int = 0
+    avg_heuristic_confidence: float = 0.0
+    avg_heuristic_age_days: float = 0.0
+    stale_heuristic_count: int = 0
+    low_confidence_count: int = 0
+    storage_bytes: int = 0
+    agents_count: int = 0
+    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary."""
+        return {
+            "total_items": self.total_items,
+            "heuristic_count": self.heuristic_count,
+            "outcome_count": self.outcome_count,
+            "knowledge_count": self.knowledge_count,
+            "anti_pattern_count": self.anti_pattern_count,
+            "avg_heuristic_confidence": round(self.avg_heuristic_confidence, 3),
+            "avg_heuristic_age_days": round(self.avg_heuristic_age_days, 1),
+            "stale_heuristic_count": self.stale_heuristic_count,
+            "low_confidence_count": self.low_confidence_count,
+            "storage_bytes": self.storage_bytes,
+            "agents_count": self.agents_count,
+            "timestamp": self.timestamp.isoformat(),
+        }
+
+
+@dataclass
+class HealthAlert:
+    """An alert about memory health issues."""
+
+    level: str  # "warning", "critical"
+    category: str
+    message: str
+    current_value: Any
+    threshold: Any
+    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+
+@dataclass
+class HealthThresholds:
+    """Thresholds for health monitoring alerts."""
+
+    # Warning thresholds
+    max_total_items_warning: int = 5000
+    max_stale_percentage_warning: float = 0.3
+    min_avg_confidence_warning: float = 0.5
+    max_agent_items_warning: int = 500
+
+    # Critical thresholds
+    max_total_items_critical: int = 10000
+    max_stale_percentage_critical: float = 0.5
+    min_avg_confidence_critical: float = 0.3
+    max_storage_bytes_critical: int = 100 * 1024 * 1024  # 100MB
+
+
+class MemoryHealthMonitor:
+    """
+    Monitors memory health and growth, generating alerts when thresholds are exceeded.
+
+    Tracks:
+    - Total memory item counts
+    - Average confidence levels
+    - Staleness ratios
+    - Storage size
+    - Per-agent statistics
+    """
+
+    def __init__(
+        self,
+        storage: StorageBackend,
+        thresholds: Optional[HealthThresholds] = None,
+        stale_days: int = 60,
+        low_confidence_threshold: float = 0.3,
+    ):
+        """
+        Initialize health monitor.
+
+        Args:
+            storage: Storage backend to monitor
+            thresholds: Alert thresholds
+            stale_days: Days since validation to consider stale
+            low_confidence_threshold: Confidence below which to count as low
+        """
+        self.storage = storage
+        self.thresholds = thresholds or HealthThresholds()
+        self.stale_days = stale_days
+        self.low_confidence_threshold = low_confidence_threshold
+
+        # History for trend analysis
+        self._metrics_history: List[MemoryHealthMetrics] = []
+        self._max_history = 100
+
+        # Alert callbacks
+        self._alert_handlers: List[Callable[[HealthAlert], None]] = []
+
+    def add_alert_handler(self, handler: Callable[[HealthAlert], None]) -> None:
+        """Add a callback to be called when alerts are generated."""
+        self._alert_handlers.append(handler)
+
+    def collect_metrics(self, project_id: str) -> MemoryHealthMetrics:
+        """
+        Collect current memory health metrics.
+
+        Args:
+            project_id: Project to analyze
+
+        Returns:
+            MemoryHealthMetrics snapshot
+        """
+        now = datetime.now(timezone.utc)
+        stale_cutoff = now - timedelta(days=self.stale_days)
+
+        metrics = MemoryHealthMetrics()
+
+        # Get all heuristics
+        heuristics = self.storage.get_heuristics(
+            project_id=project_id,
+            top_k=10000,
+            min_confidence=0.0,
+        )
+        metrics.heuristic_count = len(heuristics)
+
+        if heuristics:
+            total_confidence = 0.0
+            total_age = 0.0
+            for h in heuristics:
+                total_confidence += h.confidence
+                age_days = (now - h.created_at).total_seconds() / (24 * 60 * 60)
+                total_age += age_days
+                if h.last_validated < stale_cutoff:
+                    metrics.stale_heuristic_count += 1
+                if h.confidence < self.low_confidence_threshold:
+                    metrics.low_confidence_count += 1
+
+            metrics.avg_heuristic_confidence = total_confidence / len(heuristics)
+            metrics.avg_heuristic_age_days = total_age / len(heuristics)
+
+        # Get other counts
+        outcomes = self.storage.get_outcomes(
+            project_id=project_id,
+            top_k=10000,
+            success_only=False,
+        )
+        metrics.outcome_count = len(outcomes)
+
+        knowledge = self.storage.get_domain_knowledge(
+            project_id=project_id,
+            top_k=10000,
+        )
+        metrics.knowledge_count = len(knowledge)
+
+        anti_patterns = self.storage.get_anti_patterns(
+            project_id=project_id,
+            top_k=10000,
+        )
+        metrics.anti_pattern_count = len(anti_patterns)
+
+        metrics.total_items = (
+            metrics.heuristic_count
+            + metrics.outcome_count
+            + metrics.knowledge_count
+            + metrics.anti_pattern_count
+        )
+
+        # Get agent count
+        stats = self.storage.get_stats(project_id=project_id)
+        metrics.agents_count = len(stats.get("agents", []))
+
+        # Estimate storage size (rough approximation)
+        # Average ~500 bytes per item
+        metrics.storage_bytes = metrics.total_items * 500
+
+        # Store in history
+        self._metrics_history.append(metrics)
+        if len(self._metrics_history) > self._max_history:
+            self._metrics_history = self._metrics_history[-self._max_history :]
+
+        return metrics
+
+    def check_health(self, project_id: str) -> List[HealthAlert]:
+        """
+        Check memory health and generate alerts if thresholds are exceeded.
+
+        Args:
+            project_id: Project to check
+
+        Returns:
+            List of health alerts (empty if healthy)
+        """
+        metrics = self.collect_metrics(project_id)
+        alerts: List[HealthAlert] = []
+        t = self.thresholds
+
+        # Check total items
+        if metrics.total_items >= t.max_total_items_critical:
+            alerts.append(
+                HealthAlert(
+                    level="critical",
+                    category="total_items",
+                    message="Memory item count critically high",
+                    current_value=metrics.total_items,
+                    threshold=t.max_total_items_critical,
+                )
+            )
+        elif metrics.total_items >= t.max_total_items_warning:
+            alerts.append(
+                HealthAlert(
+                    level="warning",
+                    category="total_items",
+                    message="Memory item count approaching limit",
+                    current_value=metrics.total_items,
+                    threshold=t.max_total_items_warning,
+                )
+            )
+
+        # Check staleness
+        if metrics.heuristic_count > 0:
+            stale_percentage = metrics.stale_heuristic_count / metrics.heuristic_count
+            if stale_percentage >= t.max_stale_percentage_critical:
+                alerts.append(
+                    HealthAlert(
+                        level="critical",
+                        category="staleness",
+                        message="Too many stale heuristics",
+                        current_value=f"{stale_percentage:.0%}",
+                        threshold=f"{t.max_stale_percentage_critical:.0%}",
+                    )
+                )
+            elif stale_percentage >= t.max_stale_percentage_warning:
+                alerts.append(
+                    HealthAlert(
+                        level="warning",
+                        category="staleness",
+                        message="Many heuristics are stale",
+                        current_value=f"{stale_percentage:.0%}",
+                        threshold=f"{t.max_stale_percentage_warning:.0%}",
+                    )
+                )
+
+        # Check average confidence
+        if metrics.heuristic_count > 0:
+            if metrics.avg_heuristic_confidence < t.min_avg_confidence_critical:
+                alerts.append(
+                    HealthAlert(
+                        level="critical",
+                        category="confidence",
+                        message="Average heuristic confidence critically low",
+                        current_value=f"{metrics.avg_heuristic_confidence:.2f}",
+                        threshold=f"{t.min_avg_confidence_critical:.2f}",
+                    )
+                )
+            elif metrics.avg_heuristic_confidence < t.min_avg_confidence_warning:
+                alerts.append(
+                    HealthAlert(
+                        level="warning",
+                        category="confidence",
+                        message="Average heuristic confidence is low",
+                        current_value=f"{metrics.avg_heuristic_confidence:.2f}",
+                        threshold=f"{t.min_avg_confidence_warning:.2f}",
+                    )
+                )
+
+        # Check storage size
+        if metrics.storage_bytes >= t.max_storage_bytes_critical:
+            alerts.append(
+                HealthAlert(
+                    level="critical",
+                    category="storage",
+                    message="Memory storage size critically high",
+                    current_value=f"{metrics.storage_bytes / (1024 * 1024):.1f}MB",
+                    threshold=f"{t.max_storage_bytes_critical / (1024 * 1024):.1f}MB",
+                )
+            )
+
+        # Notify handlers
+        for alert in alerts:
+            for handler in self._alert_handlers:
+                try:
+                    handler(alert)
+                except Exception as e:
+                    logger.error(f"Alert handler error: {e}")
+
+        return alerts
+
+    def get_growth_trend(self, project_id: str) -> Dict[str, Any]:
+        """
+        Analyze memory growth trend from history.
+
+        Args:
+            project_id: Project to analyze
+
+        Returns:
+            Trend analysis
+        """
+        if len(self._metrics_history) < 2:
+            return {
+                "status": "insufficient_data",
+                "samples": len(self._metrics_history),
+            }
+
+        first = self._metrics_history[0]
+        last = self._metrics_history[-1]
+
+        time_span = (last.timestamp - first.timestamp).total_seconds()
+        if time_span <= 0:
+            return {"status": "insufficient_time_span"}
+
+        days_span = time_span / (24 * 60 * 60)
+        item_growth = last.total_items - first.total_items
+        growth_per_day = item_growth / days_span if days_span > 0 else 0
+
+        return {
+            "status": "ok",
+            "samples": len(self._metrics_history),
+            "time_span_days": round(days_span, 1),
+            "total_growth": item_growth,
+            "growth_per_day": round(growth_per_day, 2),
+            "first_total": first.total_items,
+            "last_total": last.total_items,
+            "confidence_trend": round(
+                last.avg_heuristic_confidence - first.avg_heuristic_confidence, 3
+            ),
+        }
+
+
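# Sketch of wiring up monitoring, with `storage` assumed as above. Handlers
# are invoked once per HealthAlert raised by check_health().
monitor = MemoryHealthMonitor(storage, stale_days=45)
monitor.add_alert_handler(lambda a: print(f"[{a.level}] {a.category}: {a.message}"))
monitor.check_health(project_id="my-project")
print(monitor.get_growth_trend(project_id="my-project"))  # needs >= 2 samples
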
+# ==================== CLEANUP SCHEDULER ====================
+
+
+@dataclass
+class CleanupJob:
+    """Configuration for a scheduled cleanup job."""
+
+    name: str
+    project_id: str
+    interval_hours: float
+    agent: Optional[str] = None
+    policy: Optional[PrunePolicy] = None
+    apply_decay: bool = True
+    last_run: Optional[datetime] = None
+    next_run: Optional[datetime] = None
+    enabled: bool = True
+
+
+@dataclass
+class CleanupResult:
+    """Result of a cleanup job execution."""
+
+    job_name: str
+    project_id: str
+    started_at: datetime
+    completed_at: datetime
+    prune_summary: Optional[PruneSummary] = None
+    decay_result: Optional[DecayResult] = None
+    alerts: List[HealthAlert] = field(default_factory=list)
+    success: bool = True
+    error: Optional[str] = None
+
+
+class CleanupScheduler:
+    """
+    Schedules and executes automated memory cleanup jobs.
+
+    Features:
+    - Configurable job intervals
+    - Prune + decay in single operation
+    - Health check integration
+    - Job execution history
+    - Thread-safe operation
+    """
+
+    def __init__(
+        self,
+        storage: StorageBackend,
+        forgetting_engine: Optional[ForgettingEngine] = None,
+        decayer: Optional[ConfidenceDecayer] = None,
+        health_monitor: Optional[MemoryHealthMonitor] = None,
+    ):
+        """
+        Initialize cleanup scheduler.
+
+        Args:
+            storage: Storage backend
+            forgetting_engine: Engine for pruning (created if not provided)
+            decayer: Engine for decay (created if not provided)
+            health_monitor: Health monitor (created if not provided)
+        """
+        self.storage = storage
+        self.forgetting_engine = forgetting_engine or ForgettingEngine(storage)
+        self.decayer = decayer or ConfidenceDecayer(storage)
+        self.health_monitor = health_monitor or MemoryHealthMonitor(storage)
+
+        self._jobs: Dict[str, CleanupJob] = {}
+        self._history: List[CleanupResult] = []
+        self._max_history = 50
+        self._lock = threading.RLock()
+        self._running = False
+        self._thread: Optional[threading.Thread] = None
+
+    def register_job(self, job: CleanupJob) -> None:
+        """
+        Register a cleanup job.
+
+        Args:
+            job: Job configuration
+        """
+        with self._lock:
+            now = datetime.now(timezone.utc)
+            job.next_run = now + timedelta(hours=job.interval_hours)
+            self._jobs[job.name] = job
+            logger.info(
+                f"Registered cleanup job '{job.name}' for project {job.project_id}"
+            )
+
+    def unregister_job(self, name: str) -> bool:
+        """
+        Unregister a cleanup job.
+
+        Args:
+            name: Job name
+
+        Returns:
+            True if job was found and removed
+        """
+        with self._lock:
+            if name in self._jobs:
+                del self._jobs[name]
+                logger.info(f"Unregistered cleanup job '{name}'")
+                return True
+            return False
+
+    def run_job(self, name: str, dry_run: bool = False) -> CleanupResult:
+        """
+        Manually run a specific job.
+
+        Args:
+            name: Job name
+            dry_run: If True, don't actually modify data
+
+        Returns:
+            CleanupResult with execution details
+        """
+        with self._lock:
+            if name not in self._jobs:
+                raise ValueError(f"Job '{name}' not found")
+            job = self._jobs[name]
+
+        return self._execute_job(job, dry_run)
+
+    def run_all_due(self) -> List[CleanupResult]:
+        """
+        Run all jobs that are due.
+
+        Returns:
+            List of results for executed jobs
+        """
+        results = []
+        now = datetime.now(timezone.utc)
+
+        with self._lock:
+            due_jobs = [
+                job
+                for job in self._jobs.values()
+                if job.enabled and job.next_run and job.next_run <= now
+            ]
+
+        for job in due_jobs:
+            try:
+                result = self._execute_job(job)
+                results.append(result)
+            except Exception as e:
+                logger.error(f"Error running job '{job.name}': {e}")
+                results.append(
+                    CleanupResult(
+                        job_name=job.name,
+                        project_id=job.project_id,
+                        started_at=now,
+                        completed_at=datetime.now(timezone.utc),
+                        success=False,
+                        error=str(e),
+                    )
+                )
+
+        return results
+
+    def _execute_job(self, job: CleanupJob, dry_run: bool = False) -> CleanupResult:
+        """Execute a cleanup job."""
+        started_at = datetime.now(timezone.utc)
+        result = CleanupResult(
+            job_name=job.name,
+            project_id=job.project_id,
+            started_at=started_at,
+            completed_at=started_at,
+        )
+
+        try:
+            # Run prune
+            engine = ForgettingEngine(
+                self.storage,
+                job.policy or self.forgetting_engine.policy,
+            )
+            result.prune_summary = engine.prune(
+                project_id=job.project_id,
+                agent=job.agent,
+                dry_run=dry_run,
+            )
+
+            # Run decay if enabled
+            if job.apply_decay:
+                result.decay_result = self.decayer.apply_decay(
+                    project_id=job.project_id,
+                    agent=job.agent,
+                    dry_run=dry_run,
+                )
+
+            # Check health
+            result.alerts = self.health_monitor.check_health(job.project_id)
+
+            # Update job timing
+            with self._lock:
+                now = datetime.now(timezone.utc)
+                job.last_run = now
+                job.next_run = now + timedelta(hours=job.interval_hours)
+
+            result.success = True
+
+        except Exception as e:
+            result.success = False
+            result.error = str(e)
+            logger.error(f"Cleanup job '{job.name}' failed: {e}")
+
+        result.completed_at = datetime.now(timezone.utc)
+
+        # Store in history
+        with self._lock:
+            self._history.append(result)
+            if len(self._history) > self._max_history:
+                self._history = self._history[-self._max_history :]
+
+        return result
+
+    def start_background(self, check_interval_seconds: int = 60) -> None:
+        """
+        Start background job execution thread.
+
+        Args:
+            check_interval_seconds: How often to check for due jobs
+        """
+        if self._running:
+            logger.warning("Scheduler already running")
+            return
+
+        self._running = True
+
+        def run():
+            while self._running:
+                try:
+                    self.run_all_due()
+                except Exception as e:
+                    logger.error(f"Scheduler error: {e}")
+                time.sleep(check_interval_seconds)
+
+        self._thread = threading.Thread(target=run, daemon=True)
+        self._thread.start()
+        logger.info(f"Cleanup scheduler started (interval: {check_interval_seconds}s)")
+
+    def stop_background(self) -> None:
+        """Stop the background execution thread."""
+        self._running = False
+        if self._thread:
+            self._thread.join(timeout=5)
+            self._thread = None
+        logger.info("Cleanup scheduler stopped")
+
+    def get_jobs(self) -> List[Dict[str, Any]]:
+        """Get all registered jobs."""
+        with self._lock:
+            return [
+                {
+                    "name": job.name,
+                    "project_id": job.project_id,
+                    "interval_hours": job.interval_hours,
+                    "agent": job.agent,
+                    "enabled": job.enabled,
+                    "last_run": job.last_run.isoformat() if job.last_run else None,
+                    "next_run": job.next_run.isoformat() if job.next_run else None,
+                }
+                for job in self._jobs.values()
+            ]
+
+    def get_history(self, limit: int = 10) -> List[Dict[str, Any]]:
+        """Get recent job execution history."""
+        with self._lock:
+            recent = self._history[-limit:]
+            return [
+                {
+                    "job_name": r.job_name,
+                    "project_id": r.project_id,
+                    "started_at": r.started_at.isoformat(),
+                    "completed_at": r.completed_at.isoformat(),
+                    "duration_ms": int(
+                        (r.completed_at - r.started_at).total_seconds() * 1000
+                    ),
+                    "success": r.success,
+                    "items_pruned": (
+                        r.prune_summary.total_pruned if r.prune_summary else 0
+                    ),
+                    "items_decayed": (
+                        r.decay_result.items_updated if r.decay_result else 0
+                    ),
+                    "alerts": len(r.alerts),
+                    "error": r.error,
+                }
+                for r in reversed(recent)
+            ]
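
# End-to-end sketch: nightly prune + decay for one project, executed by the
# background thread; `storage` assumed as above.
scheduler = CleanupScheduler(storage)
scheduler.register_job(
    CleanupJob(name="nightly", project_id="my-project", interval_hours=24.0)
)
scheduler.start_background(check_interval_seconds=300)
# ... later:
for entry in scheduler.get_history(limit=5):
    print(entry["job_name"], entry["success"], entry["items_pruned"])
scheduler.stop_background()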