superlocalmemory 2.8.2 → 2.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/README.md +7 -5
  2. package/api_server.py +5 -0
  3. package/bin/slm.bat +3 -3
  4. package/docs/SECURITY-QUICK-REFERENCE.md +214 -0
  5. package/install.ps1 +11 -11
  6. package/mcp_server.py +3 -3
  7. package/package.json +2 -2
  8. package/requirements-core.txt +16 -18
  9. package/requirements-learning.txt +8 -8
  10. package/requirements.txt +9 -7
  11. package/scripts/prepack.js +33 -0
  12. package/scripts/verify-v27.ps1 +301 -0
  13. package/src/agent_registry.py +32 -28
  14. package/src/auto_backup.py +12 -6
  15. package/src/cache_manager.py +2 -2
  16. package/src/compression/__init__.py +25 -0
  17. package/src/compression/cli.py +150 -0
  18. package/src/compression/cold_storage.py +217 -0
  19. package/src/compression/config.py +72 -0
  20. package/src/compression/orchestrator.py +133 -0
  21. package/src/compression/tier2_compressor.py +228 -0
  22. package/src/compression/tier3_compressor.py +153 -0
  23. package/src/compression/tier_classifier.py +148 -0
  24. package/src/db_connection_manager.py +5 -5
  25. package/src/event_bus.py +24 -22
  26. package/src/hnsw_index.py +3 -3
  27. package/src/learning/__init__.py +5 -4
  28. package/src/learning/adaptive_ranker.py +14 -265
  29. package/src/learning/bootstrap/__init__.py +69 -0
  30. package/src/learning/bootstrap/constants.py +93 -0
  31. package/src/learning/bootstrap/db_queries.py +316 -0
  32. package/src/learning/bootstrap/sampling.py +82 -0
  33. package/src/learning/bootstrap/text_utils.py +71 -0
  34. package/src/learning/cross_project_aggregator.py +58 -57
  35. package/src/learning/db/__init__.py +40 -0
  36. package/src/learning/db/constants.py +44 -0
  37. package/src/learning/db/schema.py +279 -0
  38. package/src/learning/learning_db.py +15 -234
  39. package/src/learning/ranking/__init__.py +33 -0
  40. package/src/learning/ranking/constants.py +84 -0
  41. package/src/learning/ranking/helpers.py +278 -0
  42. package/src/learning/source_quality_scorer.py +66 -65
  43. package/src/learning/synthetic_bootstrap.py +28 -310
  44. package/src/memory/__init__.py +36 -0
  45. package/src/memory/cli.py +205 -0
  46. package/src/memory/constants.py +39 -0
  47. package/src/memory/helpers.py +28 -0
  48. package/src/memory/schema.py +166 -0
  49. package/src/memory-profiles.py +94 -86
  50. package/src/memory-reset.py +187 -185
  51. package/src/memory_compression.py +2 -2
  52. package/src/memory_store_v2.py +34 -354
  53. package/src/migrate_v1_to_v2.py +11 -10
  54. package/src/patterns/analyzers.py +104 -100
  55. package/src/patterns/learner.py +17 -13
  56. package/src/patterns/scoring.py +25 -21
  57. package/src/patterns/store.py +40 -38
  58. package/src/patterns/terminology.py +53 -51
  59. package/src/provenance_tracker.py +2 -2
  60. package/src/qualixar_attribution.py +1 -1
  61. package/src/search/engine.py +16 -14
  62. package/src/search/index_loader.py +13 -11
  63. package/src/setup_validator.py +160 -158
  64. package/src/subscription_manager.py +20 -18
  65. package/src/tree/builder.py +66 -64
  66. package/src/tree/nodes.py +103 -97
  67. package/src/tree/queries.py +142 -137
  68. package/src/tree/schema.py +46 -42
  69. package/src/webhook_dispatcher.py +3 -3
  70. package/ui_server.py +7 -4
@@ -0,0 +1,84 @@
1
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
"""
Constants for AdaptiveRanker.

Includes phase thresholds, rule-based boost multipliers, and LightGBM
training parameters.
"""

from pathlib import Path

# ============================================================================
# Paths
# ============================================================================

# Model artifacts live under the user's home directory so the ranker is
# per-user, not per-project.
MODELS_DIR = Path.home() / ".claude-memory" / "models"
MODEL_PATH = MODELS_DIR / "ranker.txt"

# ============================================================================
# Phase Thresholds
# ============================================================================

# Phase thresholds — how many feedback signals to trigger each phase.
# The ranker escalates: no re-ranking -> rule-based boosting -> trained ML.
PHASE_THRESHOLDS = {
    'baseline': 0,       # 0 feedback samples -> no re-ranking
    'rule_based': 20,    # 20+ feedback -> rule-based boosting
    'ml_model': 200,     # 200+ feedback across 50+ unique queries -> ML
}

# Minimum unique queries required for ML phase (prevents overfitting
# to a small number of repeated queries)
MIN_UNIQUE_QUERIES_FOR_ML = 50

# ============================================================================
# Rule-Based Boost Multipliers (Phase 1)
# ============================================================================

# These are conservative — they nudge the ranking without flipping order.
# Values > 1.0 boost a memory's score; values < 1.0 penalize it.
RULE_BOOST = {
    'tech_match_strong': 1.3,      # Memory matches 2+ preferred techs
    'tech_match_weak': 1.1,        # Memory matches 1 preferred tech
    'project_match': 1.5,          # Memory from current project
    'project_unknown': 1.0,        # No project context — no boost
    'project_mismatch': 0.9,       # Memory from different project
    'source_quality_high': 1.2,    # Source quality > 0.7
    'source_quality_low': 0.85,    # Source quality < 0.3
    'recency_boost_max': 1.2,      # Recent memory (< 7 days)
    'recency_penalty_max': 0.8,    # Old memory (> 365 days)
    'high_importance': 1.15,       # Importance >= 8
    'high_access': 1.1,            # Accessed 5+ times
    # v2.8: Lifecycle + behavioral boosts
    'lifecycle_active': 1.0,       # Active memories keep full score
    'lifecycle_warm': 0.85,        # Warm memories mildly demoted
    'lifecycle_cold': 0.6,         # Cold memories strongly demoted
    'outcome_success_high': 1.3,   # Memory associated with successful outcomes
    'outcome_failure_high': 0.7,   # Memory associated with failed outcomes
    'behavioral_match_strong': 1.25,
    'cross_project_boost': 1.15,
    'high_trust_creator': 1.1,
    'low_trust_creator': 0.8,
}

# ============================================================================
# LightGBM Training Parameters
# ============================================================================

# LightGBM training parameters — tuned for small, personal datasets.
# Aggressive regularization prevents overfitting on < 10K samples.
# NOTE(review): 'dart' boosting with only 50 estimators is unusual —
# presumably chosen to further regularize tiny datasets; confirm intent.
TRAINING_PARAMS = {
    'objective': 'lambdarank',     # Pairwise learning-to-rank objective
    'metric': 'ndcg',
    'ndcg_eval_at': [5, 10],       # Evaluate NDCG at top-5 and top-10
    'learning_rate': 0.05,
    'num_leaves': 16,              # Small trees — low model capacity
    'max_depth': 4,
    'min_child_samples': 10,
    'subsample': 0.8,
    'reg_alpha': 0.1,              # L1 regularization
    'reg_lambda': 1.0,             # L2 regularization
    'boosting_type': 'dart',
    'n_estimators': 50,
    'verbose': -1,                 # Silence LightGBM logging
}
@@ -0,0 +1,278 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
3
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
4
+ """
5
+ Helper functions for AdaptiveRanker.
6
+
7
+ Extracted from adaptive_ranker.py to reduce file size while maintaining
8
+ backward compatibility.
9
+ """
10
+
11
+ import logging
12
+ import sqlite3
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional
15
+
16
+ from .constants import RULE_BOOST
17
+
18
+ logger = logging.getLogger("superlocalmemory.learning.ranking.helpers")
19
+
20
# NumPy is optional — used for feature matrix construction.
# Fall back gracefully so rule-based ranking still works without it.
try:
    import numpy as np
except ImportError:
    np = None

HAS_NUMPY = np is not None
27
+
28
+
29
def calculate_rule_boost(features: List[float]) -> float:
    """
    Compute the Phase-1 rule-based boost multiplier for one memory.

    Collects the applicable multipliers from RULE_BOOST (plus a few
    inline v2.7.4 feedback factors) and folds them into a single
    product, keeping the main rerank method readable.

    Args:
        features: Feature vector extracted for a memory. Indices [2]-[8]
            are assumed present; indices [10]+ (v2.7.4/v2.8 features)
            are applied only when the vector is long enough.

    Returns:
        Boost multiplier (typically 0.5 to 2.0).
    """
    n = len(features)
    factors: List[float] = []

    # [2] tech_match — overlap with the user's preferred technologies
    tech = features[2]
    if tech >= 0.8:
        factors.append(RULE_BOOST['tech_match_strong'])
    elif tech >= 0.4:
        factors.append(RULE_BOOST['tech_match_weak'])

    # [3] project_match — same vs. different project as the query
    proj = features[3]
    if proj >= 0.9:
        factors.append(RULE_BOOST['project_match'])
    elif proj <= 0.35:
        factors.append(RULE_BOOST['project_mismatch'])

    # [5] source_quality
    quality = features[5]
    if quality >= 0.7:
        factors.append(RULE_BOOST['source_quality_high'])
    elif quality < 0.3:
        factors.append(RULE_BOOST['source_quality_low'])

    # [7] recency_score — linear blend between the penalty floor and
    # the boost ceiling (recency is already a 0..1 decay score)
    floor = RULE_BOOST['recency_penalty_max']
    ceiling = RULE_BOOST['recency_boost_max']
    factors.append(floor + features[7] * (ceiling - floor))

    # [6] importance_norm
    if features[6] >= 0.8:
        factors.append(RULE_BOOST['high_importance'])

    # [8] access_frequency
    if features[8] >= 0.5:
        factors.append(RULE_BOOST['high_access'])

    # [10] signal_count (v2.7.4 — feedback volume)
    if n > 10 and features[10] >= 0.3:  # 3+ signals
        factors.append(1.1)  # Mild boost for well-known memories

    # [11] avg_signal_value (v2.7.4 — feedback quality)
    if n > 11:
        avg_signal = features[11]
        if avg_signal >= 0.7:
            factors.append(1.15)  # Boost memories with positive feedback
        elif 0.0 < avg_signal < 0.3:
            factors.append(0.85)  # Penalize memories with negative feedback

    # [12] lifecycle_state (v2.8)
    # NOTE(review): states below 0.3 get no multiplier at all, leaving
    # them ranked above 'cold' (0.6) — confirm this is intended.
    if n > 12:
        lifecycle = features[12]
        if lifecycle >= 0.9:
            factors.append(RULE_BOOST.get('lifecycle_active', 1.0))
        elif lifecycle >= 0.6:
            factors.append(RULE_BOOST.get('lifecycle_warm', 0.85))
        elif lifecycle >= 0.3:
            factors.append(RULE_BOOST.get('lifecycle_cold', 0.6))

    # [13] outcome_success_rate (v2.8)
    if n > 13:
        success = features[13]
        if success >= 0.8:
            factors.append(RULE_BOOST.get('outcome_success_high', 1.3))
        elif success <= 0.2:
            factors.append(RULE_BOOST.get('outcome_failure_high', 0.7))

    # [15] behavioral_match (v2.8)
    if n > 15 and features[15] >= 0.7:
        factors.append(RULE_BOOST.get('behavioral_match_strong', 1.25))

    # [16] cross_project_score (v2.8)
    if n > 16 and features[16] >= 0.5:
        factors.append(RULE_BOOST.get('cross_project_boost', 1.15))

    # [18] trust_at_creation (v2.8)
    if n > 18:
        trust = features[18]
        if trust >= 0.9:
            factors.append(RULE_BOOST.get('high_trust_creator', 1.1))
        elif trust <= 0.3:
            factors.append(RULE_BOOST.get('low_trust_creator', 0.8))

    # Fold left in the same order the rules fired so the float product
    # is bit-identical to multiplying in place.
    boost = 1.0
    for factor in factors:
        boost *= factor
    return boost
140
+
141
+
142
def prepare_training_data_internal(
    feedback: List[dict],
    feature_extractor,
) -> Optional[tuple]:
    """
    Prepare training data from feedback records.

    For each unique query (grouped by query_hash):
    - Fetch all feedback entries for that query
    - Look up the corresponding memory from memory.db
    - Extract features for each memory
    - Use signal_value as the relevance label

    Args:
        feedback: List of feedback records from LearningDB. Each record
            is expected to carry 'query_hash', 'memory_id',
            'signal_value', and optionally 'query_keywords'.
        feature_extractor: FeatureExtractor instance with context set.

    Returns:
        Tuple of (X, y, groups) for LGBMRanker, or None if insufficient.
        X: numpy array (n_samples, NUM_FEATURES)
        y: numpy array (n_samples,) — relevance labels
        groups: list of ints — samples per query group
    """
    # Training requires numpy for the feature matrix; bail out early
    # rather than crash mid-pipeline.
    if not HAS_NUMPY:
        logger.warning("NumPy not available for training data preparation")
        return None

    if not feedback:
        return None

    # Group feedback by query_hash
    query_groups: Dict[str, List[dict]] = {}
    for entry in feedback:
        qh = entry['query_hash']
        if qh not in query_groups:
            query_groups[qh] = []
        query_groups[qh].append(entry)

    # Filter: only keep groups with 2+ items (ranking requires pairs)
    query_groups = {
        qh: entries for qh, entries in query_groups.items()
        if len(entries) >= 2
    }

    if not query_groups:
        logger.info("No query groups with 2+ feedback entries")
        return None

    # Collect memory IDs we need to look up
    memory_ids_needed = set()
    for entries in query_groups.values():
        for entry in entries:
            memory_ids_needed.add(entry['memory_id'])

    # Fetch memories from memory.db (the main store, separate from the
    # learning DB the feedback came from)
    memory_db_path = Path.home() / ".claude-memory" / "memory.db"
    if not memory_db_path.exists():
        logger.warning("memory.db not found at %s", memory_db_path)
        return None

    memories_by_id = {}
    try:
        # Short timeout: training is a background task and should not
        # block behind a busy writer.
        conn = sqlite3.connect(str(memory_db_path), timeout=5)
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            # Batch fetch memories (in chunks to avoid SQLite variable limit)
            id_list = list(memory_ids_needed)
            chunk_size = 500
            for i in range(0, len(id_list), chunk_size):
                chunk = id_list[i:i + chunk_size]
                placeholders = ','.join('?' for _ in chunk)
                cursor.execute(f'''
                    SELECT id, content, summary, project_path, project_name,
                           tags, category, memory_type, importance, created_at,
                           last_accessed, access_count
                    FROM memories
                    WHERE id IN ({placeholders})
                ''', chunk)
                for row in cursor.fetchall():
                    memories_by_id[row['id']] = dict(row)
        finally:
            # Always release the connection, even on a failed query.
            conn.close()
    except Exception as e:
        # Best-effort: a broken store means no training this round,
        # not a crash.
        logger.error("Failed to fetch memories for training: %s", e)
        return None

    # Build feature matrix and labels
    all_features = []
    all_labels = []
    groups = []

    # Set a neutral context for training (we don't have query-time context)
    feature_extractor.set_context()

    for qh, entries in query_groups.items():
        group_features = []
        group_labels = []

        for entry in entries:
            mid = entry['memory_id']
            memory = memories_by_id.get(mid)
            if memory is None:
                continue  # Memory may have been deleted

            # Use query_keywords as proxy for query text
            query_text = entry.get('query_keywords', '') or ''

            features = feature_extractor.extract_features(
                memory, query_text
            )
            group_features.append(features)
            # NOTE(review): lambdarank relevance labels come straight
            # from signal_value — assumed non-negative; confirm upstream.
            group_labels.append(float(entry['signal_value']))

        # Only include groups with 2+ valid entries
        if len(group_features) >= 2:
            all_features.extend(group_features)
            all_labels.extend(group_labels)
            groups.append(len(group_features))

    # Require at least 4 samples overall — below that LightGBM cannot
    # learn anything meaningful.
    if not groups or len(all_features) < 4:
        logger.info(
            "Insufficient valid training data: %d features, %d groups",
            len(all_features), len(groups)
        )
        return None

    X = np.array(all_features, dtype=np.float64)
    y = np.array(all_labels, dtype=np.float64)

    logger.info(
        "Prepared training data: %d samples, %d groups, %d features",
        X.shape[0], len(groups), X.shape[1]
    )

    return X, y, groups
@@ -274,38 +274,39 @@ class SourceQualityScorer:
274
274
 
275
275
  try:
276
276
  conn = sqlite3.connect(str(self.memory_db_path), timeout=10)
277
- conn.execute("PRAGMA busy_timeout=5000")
278
- cursor = conn.cursor()
279
-
280
- # Check if created_by column exists
281
- cursor.execute("PRAGMA table_info(memories)")
282
- columns = {row[1] for row in cursor.fetchall()}
283
-
284
- if "created_by" in columns:
285
- cursor.execute("""
286
- SELECT
287
- COALESCE(created_by, 'unknown') AS source,
288
- COUNT(*) AS cnt
289
- FROM memories
290
- GROUP BY source
291
- ORDER BY cnt DESC
292
- """)
293
- for row in cursor.fetchall():
294
- source_id = row[0] if row[0] else "unknown"
295
- counts[source_id] = row[1]
296
- else:
297
- # Column doesn't exist — count all as 'unknown'
298
- cursor.execute("SELECT COUNT(*) FROM memories")
299
- total = cursor.fetchone()[0]
300
- if total > 0:
301
- counts["unknown"] = total
302
- logger.debug(
303
- "created_by column not in memory.db — "
304
- "all %d memories grouped as 'unknown'.",
305
- total,
306
- )
307
-
308
- conn.close()
277
+ try:
278
+ conn.execute("PRAGMA busy_timeout=5000")
279
+ cursor = conn.cursor()
280
+
281
+ # Check if created_by column exists
282
+ cursor.execute("PRAGMA table_info(memories)")
283
+ columns = {row[1] for row in cursor.fetchall()}
284
+
285
+ if "created_by" in columns:
286
+ cursor.execute("""
287
+ SELECT
288
+ COALESCE(created_by, 'unknown') AS source,
289
+ COUNT(*) AS cnt
290
+ FROM memories
291
+ GROUP BY source
292
+ ORDER BY cnt DESC
293
+ """)
294
+ for row in cursor.fetchall():
295
+ source_id = row[0] if row[0] else "unknown"
296
+ counts[source_id] = row[1]
297
+ else:
298
+ # Column doesn't exist — count all as 'unknown'
299
+ cursor.execute("SELECT COUNT(*) FROM memories")
300
+ total = cursor.fetchone()[0]
301
+ if total > 0:
302
+ counts["unknown"] = total
303
+ logger.debug(
304
+ "created_by column not in memory.db "
305
+ "all %d memories grouped as 'unknown'.",
306
+ total,
307
+ )
308
+ finally:
309
+ conn.close()
309
310
 
310
311
  except sqlite3.OperationalError as e:
311
312
  logger.warning("Error reading memory counts by source: %s", e)
@@ -361,40 +362,40 @@ class SourceQualityScorer:
361
362
  # Step 2: Look up created_by for each feedback memory_id in memory.db
362
363
  try:
363
364
  conn = sqlite3.connect(str(self.memory_db_path), timeout=10)
364
- conn.execute("PRAGMA busy_timeout=5000")
365
- cursor = conn.cursor()
366
-
367
- # Check if created_by column exists
368
- cursor.execute("PRAGMA table_info(memories)")
369
- columns = {row[1] for row in cursor.fetchall()}
370
-
371
- if "created_by" not in columns:
372
- # All positives go to 'unknown'
373
- total_positives = sum(feedback_memory_ids.values())
374
- if total_positives > 0:
375
- positives["unknown"] = total_positives
365
+ try:
366
+ conn.execute("PRAGMA busy_timeout=5000")
367
+ cursor = conn.cursor()
368
+
369
+ # Check if created_by column exists
370
+ cursor.execute("PRAGMA table_info(memories)")
371
+ columns = {row[1] for row in cursor.fetchall()}
372
+
373
+ if "created_by" not in columns:
374
+ # All positives go to 'unknown'
375
+ total_positives = sum(feedback_memory_ids.values())
376
+ if total_positives > 0:
377
+ positives["unknown"] = total_positives
378
+ return positives
379
+
380
+ # Batch lookup in chunks to avoid SQLite variable limit
381
+ mem_ids = list(feedback_memory_ids.keys())
382
+ chunk_size = 500 # SQLite max variables is 999
383
+
384
+ for i in range(0, len(mem_ids), chunk_size):
385
+ chunk = mem_ids[i:i + chunk_size]
386
+ placeholders = ",".join("?" * len(chunk))
387
+ cursor.execute(
388
+ "SELECT id, COALESCE(created_by, 'unknown') "
389
+ "FROM memories WHERE id IN (%s)" % placeholders,
390
+ chunk,
391
+ )
392
+ for row in cursor.fetchall():
393
+ mem_id = row[0]
394
+ source_id = row[1] if row[1] else "unknown"
395
+ count = feedback_memory_ids.get(mem_id, 0)
396
+ positives[source_id] = positives.get(source_id, 0) + count
397
+ finally:
376
398
  conn.close()
377
- return positives
378
-
379
- # Batch lookup in chunks to avoid SQLite variable limit
380
- mem_ids = list(feedback_memory_ids.keys())
381
- chunk_size = 500 # SQLite max variables is 999
382
-
383
- for i in range(0, len(mem_ids), chunk_size):
384
- chunk = mem_ids[i:i + chunk_size]
385
- placeholders = ",".join("?" * len(chunk))
386
- cursor.execute(
387
- "SELECT id, COALESCE(created_by, 'unknown') "
388
- "FROM memories WHERE id IN (%s)" % placeholders,
389
- chunk,
390
- )
391
- for row in cursor.fetchall():
392
- mem_id = row[0]
393
- source_id = row[1] if row[1] else "unknown"
394
- count = feedback_memory_ids.get(mem_id, 0)
395
- positives[source_id] = positives.get(source_id, 0) + count
396
-
397
- conn.close()
398
399
 
399
400
  except sqlite3.OperationalError as e:
400
401
  logger.warning("Error looking up memory sources: %s", e)