superlocalmemory 2.8.2 → 2.8.5

This diff shows the changes between publicly released versions of this package on a supported registry. The information is provided for informational purposes only and reflects the differences between package versions as published in their respective public registries.
Files changed (73)
  1. package/ATTRIBUTION.md +1 -1
  2. package/CHANGELOG.md +17 -0
  3. package/README.md +7 -5
  4. package/api_server.py +5 -0
  5. package/bin/slm +35 -0
  6. package/bin/slm.bat +3 -3
  7. package/docs/SECURITY-QUICK-REFERENCE.md +214 -0
  8. package/install.ps1 +11 -11
  9. package/mcp_server.py +78 -10
  10. package/package.json +2 -2
  11. package/requirements-core.txt +16 -18
  12. package/requirements-learning.txt +8 -8
  13. package/requirements.txt +9 -7
  14. package/scripts/prepack.js +33 -0
  15. package/scripts/verify-v27.ps1 +301 -0
  16. package/src/agent_registry.py +32 -28
  17. package/src/auto_backup.py +12 -6
  18. package/src/cache_manager.py +2 -2
  19. package/src/compression/__init__.py +25 -0
  20. package/src/compression/cli.py +150 -0
  21. package/src/compression/cold_storage.py +217 -0
  22. package/src/compression/config.py +72 -0
  23. package/src/compression/orchestrator.py +133 -0
  24. package/src/compression/tier2_compressor.py +228 -0
  25. package/src/compression/tier3_compressor.py +153 -0
  26. package/src/compression/tier_classifier.py +148 -0
  27. package/src/db_connection_manager.py +5 -5
  28. package/src/event_bus.py +24 -22
  29. package/src/hnsw_index.py +3 -3
  30. package/src/learning/__init__.py +5 -4
  31. package/src/learning/adaptive_ranker.py +14 -265
  32. package/src/learning/bootstrap/__init__.py +69 -0
  33. package/src/learning/bootstrap/constants.py +93 -0
  34. package/src/learning/bootstrap/db_queries.py +316 -0
  35. package/src/learning/bootstrap/sampling.py +82 -0
  36. package/src/learning/bootstrap/text_utils.py +71 -0
  37. package/src/learning/cross_project_aggregator.py +58 -57
  38. package/src/learning/db/__init__.py +40 -0
  39. package/src/learning/db/constants.py +44 -0
  40. package/src/learning/db/schema.py +279 -0
  41. package/src/learning/learning_db.py +15 -234
  42. package/src/learning/ranking/__init__.py +33 -0
  43. package/src/learning/ranking/constants.py +84 -0
  44. package/src/learning/ranking/helpers.py +278 -0
  45. package/src/learning/source_quality_scorer.py +66 -65
  46. package/src/learning/synthetic_bootstrap.py +28 -310
  47. package/src/memory/__init__.py +36 -0
  48. package/src/memory/cli.py +205 -0
  49. package/src/memory/constants.py +39 -0
  50. package/src/memory/helpers.py +28 -0
  51. package/src/memory/schema.py +166 -0
  52. package/src/memory-profiles.py +94 -86
  53. package/src/memory-reset.py +187 -185
  54. package/src/memory_compression.py +2 -2
  55. package/src/memory_store_v2.py +44 -354
  56. package/src/migrate_v1_to_v2.py +11 -10
  57. package/src/patterns/analyzers.py +104 -100
  58. package/src/patterns/learner.py +17 -13
  59. package/src/patterns/scoring.py +25 -21
  60. package/src/patterns/store.py +40 -38
  61. package/src/patterns/terminology.py +53 -51
  62. package/src/provenance_tracker.py +2 -2
  63. package/src/qualixar_attribution.py +1 -1
  64. package/src/search/engine.py +16 -14
  65. package/src/search/index_loader.py +13 -11
  66. package/src/setup_validator.py +160 -158
  67. package/src/subscription_manager.py +20 -18
  68. package/src/tree/builder.py +66 -64
  69. package/src/tree/nodes.py +103 -97
  70. package/src/tree/queries.py +142 -137
  71. package/src/tree/schema.py +46 -42
  72. package/src/webhook_dispatcher.py +3 -3
  73. package/ui_server.py +7 -4
@@ -63,67 +63,17 @@ from .feature_extractor import FeatureExtractor, FEATURE_NAMES, NUM_FEATURES
63
63
 
64
64
  logger = logging.getLogger("superlocalmemory.learning.adaptive_ranker")
65
65
 
66
- # ============================================================================
67
- # Constants
68
- # ============================================================================
69
-
70
- MODELS_DIR = Path.home() / ".claude-memory" / "models"
71
- MODEL_PATH = MODELS_DIR / "ranker.txt"
72
-
73
- # Phase thresholds — how many feedback signals to trigger each phase
74
- PHASE_THRESHOLDS = {
75
- 'baseline': 0, # 0 feedback samples -> no re-ranking
76
- 'rule_based': 20, # 20+ feedback -> rule-based boosting
77
- 'ml_model': 200, # 200+ feedback across 50+ unique queries -> ML
78
- }
79
-
80
- # Minimum unique queries required for ML phase (prevents overfitting
81
- # to a small number of repeated queries)
82
- MIN_UNIQUE_QUERIES_FOR_ML = 50
83
-
84
- # Rule-based boost multipliers (Phase 1)
85
- # These are conservative — they nudge the ranking without flipping order
86
- _RULE_BOOST = {
87
- 'tech_match_strong': 1.3, # Memory matches 2+ preferred techs
88
- 'tech_match_weak': 1.1, # Memory matches 1 preferred tech
89
- 'project_match': 1.5, # Memory from current project
90
- 'project_unknown': 1.0, # No project context — no boost
91
- 'project_mismatch': 0.9, # Memory from different project
92
- 'source_quality_high': 1.2, # Source quality > 0.7
93
- 'source_quality_low': 0.85, # Source quality < 0.3
94
- 'recency_boost_max': 1.2, # Recent memory (< 7 days)
95
- 'recency_penalty_max': 0.8, # Old memory (> 365 days)
96
- 'high_importance': 1.15, # Importance >= 8
97
- 'high_access': 1.1, # Accessed 5+ times
98
- # v2.8: Lifecycle + behavioral boosts
99
- 'lifecycle_active': 1.0,
100
- 'lifecycle_warm': 0.85,
101
- 'lifecycle_cold': 0.6,
102
- 'outcome_success_high': 1.3,
103
- 'outcome_failure_high': 0.7,
104
- 'behavioral_match_strong': 1.25,
105
- 'cross_project_boost': 1.15,
106
- 'high_trust_creator': 1.1,
107
- 'low_trust_creator': 0.8,
108
- }
109
-
110
- # LightGBM training parameters — tuned for small, personal datasets
111
- # Aggressive regularization prevents overfitting on < 10K samples
112
- TRAINING_PARAMS = {
113
- 'objective': 'lambdarank',
114
- 'metric': 'ndcg',
115
- 'ndcg_eval_at': [5, 10],
116
- 'learning_rate': 0.05,
117
- 'num_leaves': 16,
118
- 'max_depth': 4,
119
- 'min_child_samples': 10,
120
- 'subsample': 0.8,
121
- 'reg_alpha': 0.1,
122
- 'reg_lambda': 1.0,
123
- 'boosting_type': 'dart',
124
- 'n_estimators': 50,
125
- 'verbose': -1,
126
- }
66
+ # Import constants and helpers from ranking subpackage
67
+ from .ranking import (
68
+ MODELS_DIR,
69
+ MODEL_PATH,
70
+ PHASE_THRESHOLDS,
71
+ MIN_UNIQUE_QUERIES_FOR_ML,
72
+ RULE_BOOST,
73
+ TRAINING_PARAMS,
74
+ calculate_rule_boost,
75
+ prepare_training_data_internal,
76
+ )
127
77
 
128
78
 
129
79
  class AdaptiveRanker:
@@ -373,102 +323,7 @@ class AdaptiveRanker:
373
323
  continue
374
324
 
375
325
  features = feature_vectors[i]
376
- boost = 1.0
377
-
378
- # Feature [2]: tech_match
379
- tech_match = features[2]
380
- if tech_match >= 0.8:
381
- boost *= _RULE_BOOST['tech_match_strong']
382
- elif tech_match >= 0.4:
383
- boost *= _RULE_BOOST['tech_match_weak']
384
-
385
- # Feature [3]: project_match
386
- project_match = features[3]
387
- if project_match >= 0.9:
388
- boost *= _RULE_BOOST['project_match']
389
- elif project_match <= 0.35:
390
- boost *= _RULE_BOOST['project_mismatch']
391
-
392
- # Feature [5]: source_quality
393
- source_quality = features[5]
394
- if source_quality >= 0.7:
395
- boost *= _RULE_BOOST['source_quality_high']
396
- elif source_quality < 0.3:
397
- boost *= _RULE_BOOST['source_quality_low']
398
-
399
- # Feature [7]: recency_score (exponential decay)
400
- recency = features[7]
401
- # Linear interpolation between penalty and boost
402
- recency_factor = (
403
- _RULE_BOOST['recency_penalty_max']
404
- + recency * (
405
- _RULE_BOOST['recency_boost_max']
406
- - _RULE_BOOST['recency_penalty_max']
407
- )
408
- )
409
- boost *= recency_factor
410
-
411
- # Feature [6]: importance_norm
412
- importance_norm = features[6]
413
- if importance_norm >= 0.8:
414
- boost *= _RULE_BOOST['high_importance']
415
-
416
- # Feature [8]: access_frequency
417
- access_freq = features[8]
418
- if access_freq >= 0.5:
419
- boost *= _RULE_BOOST['high_access']
420
-
421
- # Feature [10]: signal_count (v2.7.4 — feedback volume)
422
- if len(features) > 10:
423
- signal_count = features[10]
424
- if signal_count >= 0.3: # 3+ signals
425
- boost *= 1.1 # Mild boost for well-known memories
426
-
427
- # Feature [11]: avg_signal_value (v2.7.4 — feedback quality)
428
- if len(features) > 11:
429
- avg_signal = features[11]
430
- if avg_signal >= 0.7:
431
- boost *= 1.15 # Boost memories with positive feedback
432
- elif avg_signal < 0.3 and avg_signal > 0.0:
433
- boost *= 0.85 # Penalize memories with negative feedback
434
-
435
- # Feature [12]: lifecycle_state (v2.8)
436
- if len(features) > 12:
437
- lifecycle_state = features[12]
438
- if lifecycle_state >= 0.9:
439
- boost *= _RULE_BOOST.get('lifecycle_active', 1.0)
440
- elif lifecycle_state >= 0.6:
441
- boost *= _RULE_BOOST.get('lifecycle_warm', 0.85)
442
- elif lifecycle_state >= 0.3:
443
- boost *= _RULE_BOOST.get('lifecycle_cold', 0.6)
444
-
445
- # Feature [13]: outcome_success_rate (v2.8)
446
- if len(features) > 13:
447
- success_rate = features[13]
448
- if success_rate >= 0.8:
449
- boost *= _RULE_BOOST.get('outcome_success_high', 1.3)
450
- elif success_rate <= 0.2:
451
- boost *= _RULE_BOOST.get('outcome_failure_high', 0.7)
452
-
453
- # Feature [15]: behavioral_match (v2.8)
454
- if len(features) > 15:
455
- behavioral = features[15]
456
- if behavioral >= 0.7:
457
- boost *= _RULE_BOOST.get('behavioral_match_strong', 1.25)
458
-
459
- # Feature [16]: cross_project_score (v2.8)
460
- if len(features) > 16:
461
- cross_project = features[16]
462
- if cross_project >= 0.5:
463
- boost *= _RULE_BOOST.get('cross_project_boost', 1.15)
464
-
465
- # Feature [18]: trust_at_creation (v2.8)
466
- if len(features) > 18:
467
- trust = features[18]
468
- if trust >= 0.9:
469
- boost *= _RULE_BOOST.get('high_trust_creator', 1.1)
470
- elif trust <= 0.3:
471
- boost *= _RULE_BOOST.get('low_trust_creator', 0.8)
326
+ boost = calculate_rule_boost(features)
472
327
 
473
328
  # Apply boost to score
474
329
  result['score'] = base_score * boost
@@ -799,12 +654,10 @@ class AdaptiveRanker:
799
654
 
800
655
  Returns:
801
656
  Tuple of (X, y, groups) for LGBMRanker, or None if insufficient.
802
- X: numpy array (n_samples, 9)
657
+ X: numpy array (n_samples, NUM_FEATURES)
803
658
  y: numpy array (n_samples,) — relevance labels
804
659
  groups: list of ints — samples per query group
805
660
  """
806
- import sqlite3
807
-
808
661
  ldb = self._get_learning_db()
809
662
  if ldb is None:
810
663
  return None
@@ -813,111 +666,7 @@ class AdaptiveRanker:
813
666
  if not feedback:
814
667
  return None
815
668
 
816
- # Group feedback by query_hash
817
- query_groups: Dict[str, List[dict]] = {}
818
- for entry in feedback:
819
- qh = entry['query_hash']
820
- if qh not in query_groups:
821
- query_groups[qh] = []
822
- query_groups[qh].append(entry)
823
-
824
- # Filter: only keep groups with 2+ items (ranking requires pairs)
825
- query_groups = {
826
- qh: entries for qh, entries in query_groups.items()
827
- if len(entries) >= 2
828
- }
829
-
830
- if not query_groups:
831
- logger.info("No query groups with 2+ feedback entries")
832
- return None
833
-
834
- # Collect memory IDs we need to look up
835
- memory_ids_needed = set()
836
- for entries in query_groups.values():
837
- for entry in entries:
838
- memory_ids_needed.add(entry['memory_id'])
839
-
840
- # Fetch memories from memory.db
841
- memory_db_path = Path.home() / ".claude-memory" / "memory.db"
842
- if not memory_db_path.exists():
843
- logger.warning("memory.db not found at %s", memory_db_path)
844
- return None
845
-
846
- memories_by_id = {}
847
- try:
848
- conn = sqlite3.connect(str(memory_db_path), timeout=5)
849
- conn.row_factory = sqlite3.Row
850
- cursor = conn.cursor()
851
-
852
- # Batch fetch memories (in chunks to avoid SQLite variable limit)
853
- id_list = list(memory_ids_needed)
854
- chunk_size = 500
855
- for i in range(0, len(id_list), chunk_size):
856
- chunk = id_list[i:i + chunk_size]
857
- placeholders = ','.join('?' for _ in chunk)
858
- cursor.execute(f'''
859
- SELECT id, content, summary, project_path, project_name,
860
- tags, category, memory_type, importance, created_at,
861
- last_accessed, access_count
862
- FROM memories
863
- WHERE id IN ({placeholders})
864
- ''', chunk)
865
- for row in cursor.fetchall():
866
- memories_by_id[row['id']] = dict(row)
867
- conn.close()
868
- except Exception as e:
869
- logger.error("Failed to fetch memories for training: %s", e)
870
- return None
871
-
872
- # Build feature matrix and labels
873
- all_features = []
874
- all_labels = []
875
- groups = []
876
-
877
- # Set a neutral context for training (we don't have query-time context)
878
- self._feature_extractor.set_context()
879
-
880
- for qh, entries in query_groups.items():
881
- group_features = []
882
- group_labels = []
883
-
884
- for entry in entries:
885
- mid = entry['memory_id']
886
- memory = memories_by_id.get(mid)
887
- if memory is None:
888
- continue # Memory may have been deleted
889
-
890
- # Use query_keywords as proxy for query text
891
- query_text = entry.get('query_keywords', '') or ''
892
-
893
- features = self._feature_extractor.extract_features(
894
- memory, query_text
895
- )
896
- group_features.append(features)
897
- group_labels.append(float(entry['signal_value']))
898
-
899
- # Only include groups with 2+ valid entries
900
- if len(group_features) >= 2:
901
- all_features.extend(group_features)
902
- all_labels.extend(group_labels)
903
- groups.append(len(group_features))
904
-
905
- if not groups or len(all_features) < 4:
906
- logger.info(
907
- "Insufficient valid training data: %d features, %d groups",
908
- len(all_features), len(groups)
909
- )
910
- return None
911
-
912
- X = np.array(all_features, dtype=np.float64)
913
- y = np.array(all_labels, dtype=np.float64)
914
-
915
- logger.info(
916
- "Prepared training data: %d samples, %d groups, %d features",
917
- X.shape[0], len(groups), X.shape[1]
918
- )
919
-
920
- return X, y, groups
669
+ return prepare_training_data_internal(feedback, self._feature_extractor)
921
670
 
922
671
 
923
672
  # ============================================================================
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
3
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
4
+ """
5
+ Bootstrap utilities package.
6
+
7
+ Re-exports all constants, functions, and utilities used by SyntheticBootstrapper.
8
+ """
9
+
10
+ # Constants
11
+ from .constants import (
12
+ MEMORY_DB_PATH,
13
+ MODELS_DIR,
14
+ MODEL_PATH,
15
+ MIN_MEMORIES_FOR_BOOTSTRAP,
16
+ BOOTSTRAP_CONFIG,
17
+ BOOTSTRAP_PARAMS,
18
+ STOPWORDS,
19
+ MIN_KEYWORD_LENGTH,
20
+ )
21
+
22
+ # Text utilities
23
+ from .text_utils import (
24
+ extract_keywords,
25
+ clean_fts_query,
26
+ )
27
+
28
+ # Database queries
29
+ from .db_queries import (
30
+ get_memory_count,
31
+ get_memories_by_access,
32
+ get_memories_by_importance,
33
+ get_recent_memories,
34
+ get_learned_patterns,
35
+ search_memories,
36
+ find_negative_memories,
37
+ )
38
+
39
+ # Sampling utilities
40
+ from .sampling import (
41
+ diverse_sample,
42
+ count_sources,
43
+ )
44
+
45
+ __all__ = [
46
+ # Constants
47
+ 'MEMORY_DB_PATH',
48
+ 'MODELS_DIR',
49
+ 'MODEL_PATH',
50
+ 'MIN_MEMORIES_FOR_BOOTSTRAP',
51
+ 'BOOTSTRAP_CONFIG',
52
+ 'BOOTSTRAP_PARAMS',
53
+ 'STOPWORDS',
54
+ 'MIN_KEYWORD_LENGTH',
55
+ # Text utilities
56
+ 'extract_keywords',
57
+ 'clean_fts_query',
58
+ # Database queries
59
+ 'get_memory_count',
60
+ 'get_memories_by_access',
61
+ 'get_memories_by_importance',
62
+ 'get_recent_memories',
63
+ 'get_learned_patterns',
64
+ 'search_memories',
65
+ 'find_negative_memories',
66
+ # Sampling
67
+ 'diverse_sample',
68
+ 'count_sources',
69
+ ]
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
3
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
4
+ """
5
+ Bootstrap constants and configuration.
6
+
7
+ All constant values, configuration dicts, and static data used
8
+ by SyntheticBootstrapper are defined here.
9
+ """
10
+
11
+ from pathlib import Path
12
+
13
+ # ============================================================================
14
+ # Paths
15
+ # ============================================================================
16
+
17
+ MEMORY_DB_PATH = Path.home() / ".claude-memory" / "memory.db"
18
+ MODELS_DIR = Path.home() / ".claude-memory" / "models"
19
+ MODEL_PATH = MODELS_DIR / "ranker.txt"
20
+
21
+ # ============================================================================
22
+ # Bootstrap Configuration
23
+ # ============================================================================
24
+
25
+ # Minimum memories needed before bootstrap makes sense
26
+ MIN_MEMORIES_FOR_BOOTSTRAP = 50
27
+
28
+ # Tiered config — bootstrap model complexity scales with data size
29
+ BOOTSTRAP_CONFIG = {
30
+ 'small': {
31
+ 'min_memories': 50,
32
+ 'max_memories': 499,
33
+ 'target_samples': 200,
34
+ 'n_estimators': 30,
35
+ 'max_depth': 3,
36
+ },
37
+ 'medium': {
38
+ 'min_memories': 500,
39
+ 'max_memories': 4999,
40
+ 'target_samples': 1000,
41
+ 'n_estimators': 50,
42
+ 'max_depth': 4,
43
+ },
44
+ 'large': {
45
+ 'min_memories': 5000,
46
+ 'max_memories': float('inf'),
47
+ 'target_samples': 2000,
48
+ 'n_estimators': 100,
49
+ 'max_depth': 6,
50
+ },
51
+ }
52
+
53
+ # ============================================================================
54
+ # LightGBM Parameters
55
+ # ============================================================================
56
+
57
+ # LightGBM bootstrap parameters — MORE aggressive regularization than
58
+ # real training because synthetic data has systematic biases
59
+ BOOTSTRAP_PARAMS = {
60
+ 'objective': 'lambdarank',
61
+ 'metric': 'ndcg',
62
+ 'ndcg_eval_at': [5, 10],
63
+ 'learning_rate': 0.1,
64
+ 'num_leaves': 8,
65
+ 'max_depth': 3,
66
+ 'min_child_samples': 5,
67
+ 'subsample': 0.7,
68
+ 'reg_alpha': 0.5,
69
+ 'reg_lambda': 2.0,
70
+ 'boosting_type': 'dart',
71
+ 'verbose': -1,
72
+ }
73
+
74
+ # ============================================================================
75
+ # Text Processing
76
+ # ============================================================================
77
+
78
+ # English stopwords for keyword extraction (no external deps)
79
+ STOPWORDS = frozenset({
80
+ 'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
81
+ 'of', 'with', 'by', 'from', 'is', 'it', 'this', 'that', 'was', 'are',
82
+ 'be', 'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would',
83
+ 'could', 'should', 'may', 'might', 'can', 'not', 'no', 'if', 'then',
84
+ 'so', 'as', 'up', 'out', 'about', 'into', 'over', 'after', 'before',
85
+ 'when', 'where', 'how', 'what', 'which', 'who', 'whom', 'why',
86
+ 'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other',
87
+ 'some', 'such', 'than', 'too', 'very', 'just', 'also', 'now',
88
+ 'here', 'there', 'use', 'used', 'using', 'make', 'made',
89
+ 'need', 'needed', 'get', 'got', 'set', 'new', 'old', 'one', 'two',
90
+ })
91
+
92
+ # Minimum word length for keyword extraction
93
+ MIN_KEYWORD_LENGTH = 3