npm - superlocalmemory - Versions diffs - 2.8.2 → 2.8.3 - Mend

superlocalmemory 2.8.2 → 2.8.3

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (70)

package/README.md +7 -5
package/api_server.py +5 -0
package/bin/slm.bat +3 -3
package/docs/SECURITY-QUICK-REFERENCE.md +214 -0
package/install.ps1 +11 -11
package/mcp_server.py +3 -3
package/package.json +2 -2
package/requirements-core.txt +16 -18
package/requirements-learning.txt +8 -8
package/requirements.txt +9 -7
package/scripts/prepack.js +33 -0
package/scripts/verify-v27.ps1 +301 -0
package/src/agent_registry.py +32 -28
package/src/auto_backup.py +12 -6
package/src/cache_manager.py +2 -2
package/src/compression/__init__.py +25 -0
package/src/compression/cli.py +150 -0
package/src/compression/cold_storage.py +217 -0
package/src/compression/config.py +72 -0
package/src/compression/orchestrator.py +133 -0
package/src/compression/tier2_compressor.py +228 -0
package/src/compression/tier3_compressor.py +153 -0
package/src/compression/tier_classifier.py +148 -0
package/src/db_connection_manager.py +5 -5
package/src/event_bus.py +24 -22
package/src/hnsw_index.py +3 -3
package/src/learning/__init__.py +5 -4
package/src/learning/adaptive_ranker.py +14 -265
package/src/learning/bootstrap/__init__.py +69 -0
package/src/learning/bootstrap/constants.py +93 -0
package/src/learning/bootstrap/db_queries.py +316 -0
package/src/learning/bootstrap/sampling.py +82 -0
package/src/learning/bootstrap/text_utils.py +71 -0
package/src/learning/cross_project_aggregator.py +58 -57
package/src/learning/db/__init__.py +40 -0
package/src/learning/db/constants.py +44 -0
package/src/learning/db/schema.py +279 -0
package/src/learning/learning_db.py +15 -234
package/src/learning/ranking/__init__.py +33 -0
package/src/learning/ranking/constants.py +84 -0
package/src/learning/ranking/helpers.py +278 -0
package/src/learning/source_quality_scorer.py +66 -65
package/src/learning/synthetic_bootstrap.py +28 -310
package/src/memory/__init__.py +36 -0
package/src/memory/cli.py +205 -0
package/src/memory/constants.py +39 -0
package/src/memory/helpers.py +28 -0
package/src/memory/schema.py +166 -0
package/src/memory-profiles.py +94 -86
package/src/memory-reset.py +187 -185
package/src/memory_compression.py +2 -2
package/src/memory_store_v2.py +34 -354
package/src/migrate_v1_to_v2.py +11 -10
package/src/patterns/analyzers.py +104 -100
package/src/patterns/learner.py +17 -13
package/src/patterns/scoring.py +25 -21
package/src/patterns/store.py +40 -38
package/src/patterns/terminology.py +53 -51
package/src/provenance_tracker.py +2 -2
package/src/qualixar_attribution.py +1 -1
package/src/search/engine.py +16 -14
package/src/search/index_loader.py +13 -11
package/src/setup_validator.py +160 -158
package/src/subscription_manager.py +20 -18
package/src/tree/builder.py +66 -64
package/src/tree/nodes.py +103 -97
package/src/tree/queries.py +142 -137
package/src/tree/schema.py +46 -42
package/src/webhook_dispatcher.py +3 -3
package/ui_server.py +7 -4

package/src/learning/adaptive_ranker.py CHANGED Viewed

@@ -63,67 +63,17 @@ from .feature_extractor import FeatureExtractor, FEATURE_NAMES, NUM_FEATURES
 logger = logging.getLogger("superlocalmemory.learning.adaptive_ranker")
-# ============================================================================
-# Constants
-# ============================================================================
-MODELS_DIR = Path.home() / ".claude-memory" / "models"
-MODEL_PATH = MODELS_DIR / "ranker.txt"
-# Phase thresholds — how many feedback signals to trigger each phase
-PHASE_THRESHOLDS = {
-    'baseline': 0,       # 0 feedback samples -> no re-ranking
-    'rule_based': 20,    # 20+ feedback -> rule-based boosting
-    'ml_model': 200,     # 200+ feedback across 50+ unique queries -> ML
-}
-# Minimum unique queries required for ML phase (prevents overfitting
-# to a small number of repeated queries)
-MIN_UNIQUE_QUERIES_FOR_ML = 50
-# Rule-based boost multipliers (Phase 1)
-# These are conservative — they nudge the ranking without flipping order
-_RULE_BOOST = {
-    'tech_match_strong': 1.3,      # Memory matches 2+ preferred techs
-    'tech_match_weak': 1.1,        # Memory matches 1 preferred tech
-    'project_match': 1.5,          # Memory from current project
-    'project_unknown': 1.0,        # No project context — no boost
-    'project_mismatch': 0.9,       # Memory from different project
-    'source_quality_high': 1.2,    # Source quality > 0.7
-    'source_quality_low': 0.85,    # Source quality < 0.3
-    'recency_boost_max': 1.2,      # Recent memory (< 7 days)
-    'recency_penalty_max': 0.8,    # Old memory (> 365 days)
-    'high_importance': 1.15,       # Importance >= 8
-    'high_access': 1.1,            # Accessed 5+ times
-    # v2.8: Lifecycle + behavioral boosts
-    'lifecycle_active': 1.0,
-    'lifecycle_warm': 0.85,
-    'lifecycle_cold': 0.6,
-    'outcome_success_high': 1.3,
-    'outcome_failure_high': 0.7,
-    'behavioral_match_strong': 1.25,
-    'cross_project_boost': 1.15,
-    'high_trust_creator': 1.1,
-    'low_trust_creator': 0.8,
-}
-# LightGBM training parameters — tuned for small, personal datasets
-# Aggressive regularization prevents overfitting on < 10K samples
-TRAINING_PARAMS = {
-    'objective': 'lambdarank',
-    'metric': 'ndcg',
-    'ndcg_eval_at': [5, 10],
-    'learning_rate': 0.05,
-    'num_leaves': 16,
-    'max_depth': 4,
-    'min_child_samples': 10,
-    'subsample': 0.8,
-    'reg_alpha': 0.1,
-    'reg_lambda': 1.0,
-    'boosting_type': 'dart',
-    'n_estimators': 50,
-    'verbose': -1,
-}
+# Import constants and helpers from ranking subpackage
+from .ranking import (
+    MODELS_DIR,
+    MODEL_PATH,
+    PHASE_THRESHOLDS,
+    MIN_UNIQUE_QUERIES_FOR_ML,
+    RULE_BOOST,
+    TRAINING_PARAMS,
+    calculate_rule_boost,
+    prepare_training_data_internal,
+)
 class AdaptiveRanker:
@@ -373,102 +323,7 @@ class AdaptiveRanker:
                 continue
             features = feature_vectors[i]
-            boost = 1.0
-            # Feature [2]: tech_match
-            tech_match = features[2]
-            if tech_match >= 0.8:
-                boost *= _RULE_BOOST['tech_match_strong']
-            elif tech_match >= 0.4:
-                boost *= _RULE_BOOST['tech_match_weak']
-            # Feature [3]: project_match
-            project_match = features[3]
-            if project_match >= 0.9:
-                boost *= _RULE_BOOST['project_match']
-            elif project_match <= 0.35:
-                boost *= _RULE_BOOST['project_mismatch']
-            # Feature [5]: source_quality
-            source_quality = features[5]
-            if source_quality >= 0.7:
-                boost *= _RULE_BOOST['source_quality_high']
-            elif source_quality < 0.3:
-                boost *= _RULE_BOOST['source_quality_low']
-            # Feature [7]: recency_score (exponential decay)
-            recency = features[7]
-            # Linear interpolation between penalty and boost
-            recency_factor = (
-                _RULE_BOOST['recency_penalty_max']
-                + recency * (
-                    _RULE_BOOST['recency_boost_max']
-                    - _RULE_BOOST['recency_penalty_max']
-                )
-            )
-            boost *= recency_factor
-            # Feature [6]: importance_norm
-            importance_norm = features[6]
-            if importance_norm >= 0.8:
-                boost *= _RULE_BOOST['high_importance']
-            # Feature [8]: access_frequency
-            access_freq = features[8]
-            if access_freq >= 0.5:
-                boost *= _RULE_BOOST['high_access']
-            # Feature [10]: signal_count (v2.7.4 — feedback volume)
-            if len(features) > 10:
-                signal_count = features[10]
-                if signal_count >= 0.3:  # 3+ signals
-                    boost *= 1.1  # Mild boost for well-known memories
-            # Feature [11]: avg_signal_value (v2.7.4 — feedback quality)
-            if len(features) > 11:
-                avg_signal = features[11]
-                if avg_signal >= 0.7:
-                    boost *= 1.15  # Boost memories with positive feedback
-                elif avg_signal < 0.3 and avg_signal > 0.0:
-                    boost *= 0.85  # Penalize memories with negative feedback
-            # Feature [12]: lifecycle_state (v2.8)
-            if len(features) > 12:
-                lifecycle_state = features[12]
-                if lifecycle_state >= 0.9:
-                    boost *= _RULE_BOOST.get('lifecycle_active', 1.0)
-                elif lifecycle_state >= 0.6:
-                    boost *= _RULE_BOOST.get('lifecycle_warm', 0.85)
-                elif lifecycle_state >= 0.3:
-                    boost *= _RULE_BOOST.get('lifecycle_cold', 0.6)
-            # Feature [13]: outcome_success_rate (v2.8)
-            if len(features) > 13:
-                success_rate = features[13]
-                if success_rate >= 0.8:
-                    boost *= _RULE_BOOST.get('outcome_success_high', 1.3)
-                elif success_rate <= 0.2:
-                    boost *= _RULE_BOOST.get('outcome_failure_high', 0.7)
-            # Feature [15]: behavioral_match (v2.8)
-            if len(features) > 15:
-                behavioral = features[15]
-                if behavioral >= 0.7:
-                    boost *= _RULE_BOOST.get('behavioral_match_strong', 1.25)
-            # Feature [16]: cross_project_score (v2.8)
-            if len(features) > 16:
-                cross_project = features[16]
-                if cross_project >= 0.5:
-                    boost *= _RULE_BOOST.get('cross_project_boost', 1.15)
-            # Feature [18]: trust_at_creation (v2.8)
-            if len(features) > 18:
-                trust = features[18]
-                if trust >= 0.9:
-                    boost *= _RULE_BOOST.get('high_trust_creator', 1.1)
-                elif trust <= 0.3:
-                    boost *= _RULE_BOOST.get('low_trust_creator', 0.8)
+            boost = calculate_rule_boost(features)
             # Apply boost to score
             result['score'] = base_score * boost
@@ -799,12 +654,10 @@ class AdaptiveRanker:
         Returns:
             Tuple of (X, y, groups) for LGBMRanker, or None if insufficient.
-            X: numpy array (n_samples, 9)
+            X: numpy array (n_samples, NUM_FEATURES)
             y: numpy array (n_samples,) — relevance labels
             groups: list of ints — samples per query group
         """
-        import sqlite3
         ldb = self._get_learning_db()
         if ldb is None:
             return None
@@ -813,111 +666,7 @@ class AdaptiveRanker:
         if not feedback:
             return None
-        # Group feedback by query_hash
-        query_groups: Dict[str, List[dict]] = {}
-        for entry in feedback:
-            qh = entry['query_hash']
-            if qh not in query_groups:
-                query_groups[qh] = []
-            query_groups[qh].append(entry)
-        # Filter: only keep groups with 2+ items (ranking requires pairs)
-        query_groups = {
-            qh: entries for qh, entries in query_groups.items()
-            if len(entries) >= 2
-        }
-        if not query_groups:
-            logger.info("No query groups with 2+ feedback entries")
-            return None
-        # Collect memory IDs we need to look up
-        memory_ids_needed = set()
-        for entries in query_groups.values():
-            for entry in entries:
-                memory_ids_needed.add(entry['memory_id'])
-        # Fetch memories from memory.db
-        memory_db_path = Path.home() / ".claude-memory" / "memory.db"
-        if not memory_db_path.exists():
-            logger.warning("memory.db not found at %s", memory_db_path)
-            return None
-        memories_by_id = {}
-        try:
-            conn = sqlite3.connect(str(memory_db_path), timeout=5)
-            conn.row_factory = sqlite3.Row
-            cursor = conn.cursor()
-            # Batch fetch memories (in chunks to avoid SQLite variable limit)
-            id_list = list(memory_ids_needed)
-            chunk_size = 500
-            for i in range(0, len(id_list), chunk_size):
-                chunk = id_list[i:i + chunk_size]
-                placeholders = ','.join('?' for _ in chunk)
-                cursor.execute(f'''
-                    SELECT id, content, summary, project_path, project_name,
-                           tags, category, memory_type, importance, created_at,
-                           last_accessed, access_count
-                    FROM memories
-                    WHERE id IN ({placeholders})
-                ''', chunk)
-                for row in cursor.fetchall():
-                    memories_by_id[row['id']] = dict(row)
-            conn.close()
-        except Exception as e:
-            logger.error("Failed to fetch memories for training: %s", e)
-            return None
-        # Build feature matrix and labels
-        all_features = []
-        all_labels = []
-        groups = []
-        # Set a neutral context for training (we don't have query-time context)
-        self._feature_extractor.set_context()
-        for qh, entries in query_groups.items():
-            group_features = []
-            group_labels = []
-            for entry in entries:
-                mid = entry['memory_id']
-                memory = memories_by_id.get(mid)
-                if memory is None:
-                    continue  # Memory may have been deleted
-                # Use query_keywords as proxy for query text
-                query_text = entry.get('query_keywords', '') or ''
-                features = self._feature_extractor.extract_features(
-                    memory, query_text
-                )
-                group_features.append(features)
-                group_labels.append(float(entry['signal_value']))
-            # Only include groups with 2+ valid entries
-            if len(group_features) >= 2:
-                all_features.extend(group_features)
-                all_labels.extend(group_labels)
-                groups.append(len(group_features))
-        if not groups or len(all_features) < 4:
-            logger.info(
-                "Insufficient valid training data: %d features, %d groups",
-                len(all_features), len(groups)
-            )
-            return None
-        X = np.array(all_features, dtype=np.float64)
-        y = np.array(all_labels, dtype=np.float64)
-        logger.info(
-            "Prepared training data: %d samples, %d groups, %d features",
-            X.shape[0], len(groups), X.shape[1]
-        )
-        return X, y, groups
+        return prepare_training_data_internal(feedback, self._feature_extractor)
 # ============================================================================

package/src/learning/bootstrap/__init__.py ADDED Viewed

@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
+"""
+Bootstrap utilities package.
+Re-exports all constants, functions, and utilities used by SyntheticBootstrapper.
+"""
+# Constants
+from .constants import (
+    MEMORY_DB_PATH,
+    MODELS_DIR,
+    MODEL_PATH,
+    MIN_MEMORIES_FOR_BOOTSTRAP,
+    BOOTSTRAP_CONFIG,
+    BOOTSTRAP_PARAMS,
+    STOPWORDS,
+    MIN_KEYWORD_LENGTH,
+)
+# Text utilities
+from .text_utils import (
+    extract_keywords,
+    clean_fts_query,
+)
+# Database queries
+from .db_queries import (
+    get_memory_count,
+    get_memories_by_access,
+    get_memories_by_importance,
+    get_recent_memories,
+    get_learned_patterns,
+    search_memories,
+    find_negative_memories,
+)
+# Sampling utilities
+from .sampling import (
+    diverse_sample,
+    count_sources,
+)
+__all__ = [
+    # Constants
+    'MEMORY_DB_PATH',
+    'MODELS_DIR',
+    'MODEL_PATH',
+    'MIN_MEMORIES_FOR_BOOTSTRAP',
+    'BOOTSTRAP_CONFIG',
+    'BOOTSTRAP_PARAMS',
+    'STOPWORDS',
+    'MIN_KEYWORD_LENGTH',
+    # Text utilities
+    'extract_keywords',
+    'clean_fts_query',
+    # Database queries
+    'get_memory_count',
+    'get_memories_by_access',
+    'get_memories_by_importance',
+    'get_recent_memories',
+    'get_learned_patterns',
+    'search_memories',
+    'find_negative_memories',
+    # Sampling
+    'diverse_sample',
+    'count_sources',
+]

package/src/learning/bootstrap/constants.py ADDED Viewed

@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
+"""
+Bootstrap constants and configuration.
+All constant values, configuration dicts, and static data used
+by SyntheticBootstrapper are defined here.
+"""
+from pathlib import Path
+# ============================================================================
+# Paths
+# ============================================================================
+MEMORY_DB_PATH = Path.home() / ".claude-memory" / "memory.db"
+MODELS_DIR = Path.home() / ".claude-memory" / "models"
+MODEL_PATH = MODELS_DIR / "ranker.txt"
+# ============================================================================
+# Bootstrap Configuration
+# ============================================================================
+# Minimum memories needed before bootstrap makes sense
+MIN_MEMORIES_FOR_BOOTSTRAP = 50
+# Tiered config — bootstrap model complexity scales with data size
+BOOTSTRAP_CONFIG = {
+    'small': {
+        'min_memories': 50,
+        'max_memories': 499,
+        'target_samples': 200,
+        'n_estimators': 30,
+        'max_depth': 3,
+    },
+    'medium': {
+        'min_memories': 500,
+        'max_memories': 4999,
+        'target_samples': 1000,
+        'n_estimators': 50,
+        'max_depth': 4,
+    },
+    'large': {
+        'min_memories': 5000,
+        'max_memories': float('inf'),
+        'target_samples': 2000,
+        'n_estimators': 100,
+        'max_depth': 6,
+    },
+}
+# ============================================================================
+# LightGBM Parameters
+# ============================================================================
+# LightGBM bootstrap parameters — MORE aggressive regularization than
+# real training because synthetic data has systematic biases
+BOOTSTRAP_PARAMS = {
+    'objective': 'lambdarank',
+    'metric': 'ndcg',
+    'ndcg_eval_at': [5, 10],
+    'learning_rate': 0.1,
+    'num_leaves': 8,
+    'max_depth': 3,
+    'min_child_samples': 5,
+    'subsample': 0.7,
+    'reg_alpha': 0.5,
+    'reg_lambda': 2.0,
+    'boosting_type': 'dart',
+    'verbose': -1,
+}
+# ============================================================================
+# Text Processing
+# ============================================================================
+# English stopwords for keyword extraction (no external deps)
+STOPWORDS = frozenset({
+    'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+    'of', 'with', 'by', 'from', 'is', 'it', 'this', 'that', 'was', 'are',
+    'be', 'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would',
+    'could', 'should', 'may', 'might', 'can', 'not', 'no', 'if', 'then',
+    'so', 'as', 'up', 'out', 'about', 'into', 'over', 'after', 'before',
+    'when', 'where', 'how', 'what', 'which', 'who', 'whom', 'why',
+    'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other',
+    'some', 'such', 'than', 'too', 'very', 'just', 'also', 'now',
+    'here', 'there', 'use', 'used', 'using', 'make', 'made',
+    'need', 'needed', 'get', 'got', 'set', 'new', 'old', 'one', 'two',
+})
+# Minimum word length for keyword extraction
+MIN_KEYWORD_LENGTH = 3