superlocalmemory 2.8.2 → 2.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/README.md +7 -5
  2. package/api_server.py +5 -0
  3. package/bin/slm.bat +3 -3
  4. package/docs/SECURITY-QUICK-REFERENCE.md +214 -0
  5. package/install.ps1 +11 -11
  6. package/mcp_server.py +3 -3
  7. package/package.json +2 -2
  8. package/requirements-core.txt +16 -18
  9. package/requirements-learning.txt +8 -8
  10. package/requirements.txt +9 -7
  11. package/scripts/prepack.js +33 -0
  12. package/scripts/verify-v27.ps1 +301 -0
  13. package/src/agent_registry.py +32 -28
  14. package/src/auto_backup.py +12 -6
  15. package/src/cache_manager.py +2 -2
  16. package/src/compression/__init__.py +25 -0
  17. package/src/compression/cli.py +150 -0
  18. package/src/compression/cold_storage.py +217 -0
  19. package/src/compression/config.py +72 -0
  20. package/src/compression/orchestrator.py +133 -0
  21. package/src/compression/tier2_compressor.py +228 -0
  22. package/src/compression/tier3_compressor.py +153 -0
  23. package/src/compression/tier_classifier.py +148 -0
  24. package/src/db_connection_manager.py +5 -5
  25. package/src/event_bus.py +24 -22
  26. package/src/hnsw_index.py +3 -3
  27. package/src/learning/__init__.py +5 -4
  28. package/src/learning/adaptive_ranker.py +14 -265
  29. package/src/learning/bootstrap/__init__.py +69 -0
  30. package/src/learning/bootstrap/constants.py +93 -0
  31. package/src/learning/bootstrap/db_queries.py +316 -0
  32. package/src/learning/bootstrap/sampling.py +82 -0
  33. package/src/learning/bootstrap/text_utils.py +71 -0
  34. package/src/learning/cross_project_aggregator.py +58 -57
  35. package/src/learning/db/__init__.py +40 -0
  36. package/src/learning/db/constants.py +44 -0
  37. package/src/learning/db/schema.py +279 -0
  38. package/src/learning/learning_db.py +15 -234
  39. package/src/learning/ranking/__init__.py +33 -0
  40. package/src/learning/ranking/constants.py +84 -0
  41. package/src/learning/ranking/helpers.py +278 -0
  42. package/src/learning/source_quality_scorer.py +66 -65
  43. package/src/learning/synthetic_bootstrap.py +28 -310
  44. package/src/memory/__init__.py +36 -0
  45. package/src/memory/cli.py +205 -0
  46. package/src/memory/constants.py +39 -0
  47. package/src/memory/helpers.py +28 -0
  48. package/src/memory/schema.py +166 -0
  49. package/src/memory-profiles.py +94 -86
  50. package/src/memory-reset.py +187 -185
  51. package/src/memory_compression.py +2 -2
  52. package/src/memory_store_v2.py +34 -354
  53. package/src/migrate_v1_to_v2.py +11 -10
  54. package/src/patterns/analyzers.py +104 -100
  55. package/src/patterns/learner.py +17 -13
  56. package/src/patterns/scoring.py +25 -21
  57. package/src/patterns/store.py +40 -38
  58. package/src/patterns/terminology.py +53 -51
  59. package/src/provenance_tracker.py +2 -2
  60. package/src/qualixar_attribution.py +1 -1
  61. package/src/search/engine.py +16 -14
  62. package/src/search/index_loader.py +13 -11
  63. package/src/setup_validator.py +160 -158
  64. package/src/subscription_manager.py +20 -18
  65. package/src/tree/builder.py +66 -64
  66. package/src/tree/nodes.py +103 -97
  67. package/src/tree/queries.py +142 -137
  68. package/src/tree/schema.py +46 -42
  69. package/src/webhook_dispatcher.py +3 -3
  70. package/ui_server.py +7 -4
@@ -0,0 +1,84 @@
1
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
"""
Constants for AdaptiveRanker.

Includes phase thresholds, rule-based boost multipliers, and LightGBM
training parameters.
"""

from pathlib import Path

# ============================================================================
# Paths
# ============================================================================

# Model artifacts live under the user's home directory so the ranker is
# per-user, not per-project.
MODELS_DIR = Path.home() / ".claude-memory" / "models"
MODEL_PATH = MODELS_DIR / "ranker.txt"

# ============================================================================
# Phase Thresholds
# ============================================================================

# Phase thresholds — how many feedback signals to trigger each phase.
# The ranker escalates: no re-ranking -> rule-based boosting -> trained ML.
PHASE_THRESHOLDS = {
    'baseline': 0,       # 0 feedback samples -> no re-ranking
    'rule_based': 20,    # 20+ feedback -> rule-based boosting
    'ml_model': 200,     # 200+ feedback across 50+ unique queries -> ML
}

# Minimum unique queries required for ML phase (prevents overfitting
# to a small number of repeated queries)
MIN_UNIQUE_QUERIES_FOR_ML = 50

# ============================================================================
# Rule-Based Boost Multipliers (Phase 1)
# ============================================================================

# These are conservative — they nudge the ranking without flipping order.
# Values > 1.0 boost a memory's score; values < 1.0 penalize it.
RULE_BOOST = {
    'tech_match_strong': 1.3,      # Memory matches 2+ preferred techs
    'tech_match_weak': 1.1,        # Memory matches 1 preferred tech
    'project_match': 1.5,          # Memory from current project
    'project_unknown': 1.0,        # No project context — no boost
    'project_mismatch': 0.9,       # Memory from different project
    'source_quality_high': 1.2,    # Source quality > 0.7
    'source_quality_low': 0.85,    # Source quality < 0.3
    'recency_boost_max': 1.2,      # Recent memory (< 7 days)
    'recency_penalty_max': 0.8,    # Old memory (> 365 days)
    'high_importance': 1.15,       # Importance >= 8
    'high_access': 1.1,            # Accessed 5+ times
    # v2.8: Lifecycle + behavioral boosts
    'lifecycle_active': 1.0,       # Active memories keep full score
    'lifecycle_warm': 0.85,        # Warm memories mildly demoted
    'lifecycle_cold': 0.6,         # Cold memories strongly demoted
    'outcome_success_high': 1.3,   # Memory associated with successful outcomes
    'outcome_failure_high': 0.7,   # Memory associated with failed outcomes
    'behavioral_match_strong': 1.25,
    'cross_project_boost': 1.15,
    'high_trust_creator': 1.1,
    'low_trust_creator': 0.8,
}

# ============================================================================
# LightGBM Training Parameters
# ============================================================================

# LightGBM training parameters — tuned for small, personal datasets.
# Aggressive regularization prevents overfitting on < 10K samples.
# NOTE(review): 'dart' boosting with only 50 estimators is unusual —
# presumably chosen to further regularize tiny datasets; confirm intent.
TRAINING_PARAMS = {
    'objective': 'lambdarank',     # Pairwise learning-to-rank objective
    'metric': 'ndcg',
    'ndcg_eval_at': [5, 10],       # Evaluate NDCG at top-5 and top-10
    'learning_rate': 0.05,
    'num_leaves': 16,              # Small trees — low model capacity
    'max_depth': 4,
    'min_child_samples': 10,
    'subsample': 0.8,
    'reg_alpha': 0.1,              # L1 regularization
    'reg_lambda': 1.0,             # L2 regularization
    'boosting_type': 'dart',
    'n_estimators': 50,
    'verbose': -1,                 # Silence LightGBM logging
}
@@ -0,0 +1,278 @@
1
+ #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
3
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
4
+ """
5
+ Helper functions for AdaptiveRanker.
6
+
7
+ Extracted from adaptive_ranker.py to reduce file size while maintaining
8
+ backward compatibility.
9
+ """
10
+
11
+ import logging
12
+ import sqlite3
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional
15
+
16
+ from .constants import RULE_BOOST
17
+
18
+ logger = logging.getLogger("superlocalmemory.learning.ranking.helpers")
19
+
20
# NumPy is optional — used for feature matrix construction.
# Fall back gracefully so rule-based ranking still works without it.
try:
    import numpy as np
except ImportError:
    np = None

HAS_NUMPY = np is not None
27
+
28
+
29
def calculate_rule_boost(features: List[float]) -> float:
    """
    Compute the Phase-1 rule-based boost multiplier for one memory.

    Collects the applicable multipliers from RULE_BOOST (plus a few
    inline v2.7.4 feedback factors) and folds them into a single
    product, keeping the main rerank method readable.

    Args:
        features: Feature vector extracted for a memory. Indices [2]-[8]
            are assumed present; indices [10]+ (v2.7.4/v2.8 features)
            are applied only when the vector is long enough.

    Returns:
        Boost multiplier (typically 0.5 to 2.0).
    """
    n = len(features)
    factors: List[float] = []

    # [2] tech_match — overlap with the user's preferred technologies
    tech = features[2]
    if tech >= 0.8:
        factors.append(RULE_BOOST['tech_match_strong'])
    elif tech >= 0.4:
        factors.append(RULE_BOOST['tech_match_weak'])

    # [3] project_match — same vs. different project as the query
    proj = features[3]
    if proj >= 0.9:
        factors.append(RULE_BOOST['project_match'])
    elif proj <= 0.35:
        factors.append(RULE_BOOST['project_mismatch'])

    # [5] source_quality
    quality = features[5]
    if quality >= 0.7:
        factors.append(RULE_BOOST['source_quality_high'])
    elif quality < 0.3:
        factors.append(RULE_BOOST['source_quality_low'])

    # [7] recency_score — linear blend between the penalty floor and
    # the boost ceiling (recency is already a 0..1 decay score)
    floor = RULE_BOOST['recency_penalty_max']
    ceiling = RULE_BOOST['recency_boost_max']
    factors.append(floor + features[7] * (ceiling - floor))

    # [6] importance_norm
    if features[6] >= 0.8:
        factors.append(RULE_BOOST['high_importance'])

    # [8] access_frequency
    if features[8] >= 0.5:
        factors.append(RULE_BOOST['high_access'])

    # [10] signal_count (v2.7.4 — feedback volume)
    if n > 10 and features[10] >= 0.3:  # 3+ signals
        factors.append(1.1)  # Mild boost for well-known memories

    # [11] avg_signal_value (v2.7.4 — feedback quality)
    if n > 11:
        avg_signal = features[11]
        if avg_signal >= 0.7:
            factors.append(1.15)  # Boost memories with positive feedback
        elif 0.0 < avg_signal < 0.3:
            factors.append(0.85)  # Penalize memories with negative feedback

    # [12] lifecycle_state (v2.8)
    # NOTE(review): states below 0.3 get no multiplier at all, leaving
    # them ranked above 'cold' (0.6) — confirm this is intended.
    if n > 12:
        lifecycle = features[12]
        if lifecycle >= 0.9:
            factors.append(RULE_BOOST.get('lifecycle_active', 1.0))
        elif lifecycle >= 0.6:
            factors.append(RULE_BOOST.get('lifecycle_warm', 0.85))
        elif lifecycle >= 0.3:
            factors.append(RULE_BOOST.get('lifecycle_cold', 0.6))

    # [13] outcome_success_rate (v2.8)
    if n > 13:
        success = features[13]
        if success >= 0.8:
            factors.append(RULE_BOOST.get('outcome_success_high', 1.3))
        elif success <= 0.2:
            factors.append(RULE_BOOST.get('outcome_failure_high', 0.7))

    # [15] behavioral_match (v2.8)
    if n > 15 and features[15] >= 0.7:
        factors.append(RULE_BOOST.get('behavioral_match_strong', 1.25))

    # [16] cross_project_score (v2.8)
    if n > 16 and features[16] >= 0.5:
        factors.append(RULE_BOOST.get('cross_project_boost', 1.15))

    # [18] trust_at_creation (v2.8)
    if n > 18:
        trust = features[18]
        if trust >= 0.9:
            factors.append(RULE_BOOST.get('high_trust_creator', 1.1))
        elif trust <= 0.3:
            factors.append(RULE_BOOST.get('low_trust_creator', 0.8))

    # Fold left in the same order the rules fired so the float product
    # is bit-identical to multiplying in place.
    boost = 1.0
    for factor in factors:
        boost *= factor
    return boost
140
+
141
+
142
def prepare_training_data_internal(
    feedback: List[dict],
    feature_extractor,
) -> Optional[tuple]:
    """
    Prepare training data from feedback records.

    For each unique query (grouped by query_hash):
    - Fetch all feedback entries for that query
    - Look up the corresponding memory from memory.db
    - Extract features for each memory
    - Use signal_value as the relevance label

    Args:
        feedback: List of feedback records from LearningDB. Each record
            is expected to carry 'query_hash', 'memory_id',
            'signal_value', and optionally 'query_keywords'.
        feature_extractor: FeatureExtractor instance with context set.

    Returns:
        Tuple of (X, y, groups) for LGBMRanker, or None if insufficient.
        X: numpy array (n_samples, NUM_FEATURES)
        y: numpy array (n_samples,) — relevance labels
        groups: list of ints — samples per query group
    """
    # Training requires numpy for the feature matrix; bail out early
    # rather than crash mid-pipeline.
    if not HAS_NUMPY:
        logger.warning("NumPy not available for training data preparation")
        return None

    if not feedback:
        return None

    # Group feedback by query_hash
    query_groups: Dict[str, List[dict]] = {}
    for entry in feedback:
        qh = entry['query_hash']
        if qh not in query_groups:
            query_groups[qh] = []
        query_groups[qh].append(entry)

    # Filter: only keep groups with 2+ items (ranking requires pairs)
    query_groups = {
        qh: entries for qh, entries in query_groups.items()
        if len(entries) >= 2
    }

    if not query_groups:
        logger.info("No query groups with 2+ feedback entries")
        return None

    # Collect memory IDs we need to look up
    memory_ids_needed = set()
    for entries in query_groups.values():
        for entry in entries:
            memory_ids_needed.add(entry['memory_id'])

    # Fetch memories from memory.db (the main store, separate from the
    # learning DB the feedback came from)
    memory_db_path = Path.home() / ".claude-memory" / "memory.db"
    if not memory_db_path.exists():
        logger.warning("memory.db not found at %s", memory_db_path)
        return None

    memories_by_id = {}
    try:
        # Short timeout: training is a background task and should not
        # block behind a busy writer.
        conn = sqlite3.connect(str(memory_db_path), timeout=5)
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            # Batch fetch memories (in chunks to avoid SQLite variable limit)
            id_list = list(memory_ids_needed)
            chunk_size = 500
            for i in range(0, len(id_list), chunk_size):
                chunk = id_list[i:i + chunk_size]
                placeholders = ','.join('?' for _ in chunk)
                cursor.execute(f'''
                    SELECT id, content, summary, project_path, project_name,
                           tags, category, memory_type, importance, created_at,
                           last_accessed, access_count
                    FROM memories
                    WHERE id IN ({placeholders})
                ''', chunk)
                for row in cursor.fetchall():
                    memories_by_id[row['id']] = dict(row)
        finally:
            # Always release the connection, even on a failed query.
            conn.close()
    except Exception as e:
        # Best-effort: a broken store means no training this round,
        # not a crash.
        logger.error("Failed to fetch memories for training: %s", e)
        return None

    # Build feature matrix and labels
    all_features = []
    all_labels = []
    groups = []

    # Set a neutral context for training (we don't have query-time context)
    feature_extractor.set_context()

    for qh, entries in query_groups.items():
        group_features = []
        group_labels = []

        for entry in entries:
            mid = entry['memory_id']
            memory = memories_by_id.get(mid)
            if memory is None:
                continue  # Memory may have been deleted

            # Use query_keywords as proxy for query text
            query_text = entry.get('query_keywords', '') or ''

            features = feature_extractor.extract_features(
                memory, query_text
            )
            group_features.append(features)
            # NOTE(review): lambdarank relevance labels come straight
            # from signal_value — assumed non-negative; confirm upstream.
            group_labels.append(float(entry['signal_value']))

        # Only include groups with 2+ valid entries
        if len(group_features) >= 2:
            all_features.extend(group_features)
            all_labels.extend(group_labels)
            groups.append(len(group_features))

    # Require at least 4 samples overall — below that LightGBM cannot
    # learn anything meaningful.
    if not groups or len(all_features) < 4:
        logger.info(
            "Insufficient valid training data: %d features, %d groups",
            len(all_features), len(groups)
        )
        return None

    X = np.array(all_features, dtype=np.float64)
    y = np.array(all_labels, dtype=np.float64)

    logger.info(
        "Prepared training data: %d samples, %d groups, %d features",
        X.shape[0], len(groups), X.shape[1]
    )

    return X, y, groups
@@ -274,38 +274,39 @@ class SourceQualityScorer:
274
274
 
275
275
  try:
276
276
  conn = sqlite3.connect(str(self.memory_db_path), timeout=10)
277
- conn.execute("PRAGMA busy_timeout=5000")
278
- cursor = conn.cursor()
279
-
280
- # Check if created_by column exists
281
- cursor.execute("PRAGMA table_info(memories)")
282
- columns = {row[1] for row in cursor.fetchall()}
283
-
284
- if "created_by" in columns:
285
- cursor.execute("""
286
- SELECT
287
- COALESCE(created_by, 'unknown') AS source,
288
- COUNT(*) AS cnt
289
- FROM memories
290
- GROUP BY source
291
- ORDER BY cnt DESC
292
- """)
293
- for row in cursor.fetchall():
294
- source_id = row[0] if row[0] else "unknown"
295
- counts[source_id] = row[1]
296
- else:
297
- # Column doesn't exist — count all as 'unknown'
298
- cursor.execute("SELECT COUNT(*) FROM memories")
299
- total = cursor.fetchone()[0]
300
- if total > 0:
301
- counts["unknown"] = total
302
- logger.debug(
303
- "created_by column not in memory.db — "
304
- "all %d memories grouped as 'unknown'.",
305
- total,
306
- )
307
-
308
- conn.close()
277
+ try:
278
+ conn.execute("PRAGMA busy_timeout=5000")
279
+ cursor = conn.cursor()
280
+
281
+ # Check if created_by column exists
282
+ cursor.execute("PRAGMA table_info(memories)")
283
+ columns = {row[1] for row in cursor.fetchall()}
284
+
285
+ if "created_by" in columns:
286
+ cursor.execute("""
287
+ SELECT
288
+ COALESCE(created_by, 'unknown') AS source,
289
+ COUNT(*) AS cnt
290
+ FROM memories
291
+ GROUP BY source
292
+ ORDER BY cnt DESC
293
+ """)
294
+ for row in cursor.fetchall():
295
+ source_id = row[0] if row[0] else "unknown"
296
+ counts[source_id] = row[1]
297
+ else:
298
+ # Column doesn't exist — count all as 'unknown'
299
+ cursor.execute("SELECT COUNT(*) FROM memories")
300
+ total = cursor.fetchone()[0]
301
+ if total > 0:
302
+ counts["unknown"] = total
303
+ logger.debug(
304
+ "created_by column not in memory.db "
305
+ "all %d memories grouped as 'unknown'.",
306
+ total,
307
+ )
308
+ finally:
309
+ conn.close()
309
310
 
310
311
  except sqlite3.OperationalError as e:
311
312
  logger.warning("Error reading memory counts by source: %s", e)
@@ -361,40 +362,40 @@ class SourceQualityScorer:
361
362
  # Step 2: Look up created_by for each feedback memory_id in memory.db
362
363
  try:
363
364
  conn = sqlite3.connect(str(self.memory_db_path), timeout=10)
364
- conn.execute("PRAGMA busy_timeout=5000")
365
- cursor = conn.cursor()
366
-
367
- # Check if created_by column exists
368
- cursor.execute("PRAGMA table_info(memories)")
369
- columns = {row[1] for row in cursor.fetchall()}
370
-
371
- if "created_by" not in columns:
372
- # All positives go to 'unknown'
373
- total_positives = sum(feedback_memory_ids.values())
374
- if total_positives > 0:
375
- positives["unknown"] = total_positives
365
+ try:
366
+ conn.execute("PRAGMA busy_timeout=5000")
367
+ cursor = conn.cursor()
368
+
369
+ # Check if created_by column exists
370
+ cursor.execute("PRAGMA table_info(memories)")
371
+ columns = {row[1] for row in cursor.fetchall()}
372
+
373
+ if "created_by" not in columns:
374
+ # All positives go to 'unknown'
375
+ total_positives = sum(feedback_memory_ids.values())
376
+ if total_positives > 0:
377
+ positives["unknown"] = total_positives
378
+ return positives
379
+
380
+ # Batch lookup in chunks to avoid SQLite variable limit
381
+ mem_ids = list(feedback_memory_ids.keys())
382
+ chunk_size = 500 # SQLite max variables is 999
383
+
384
+ for i in range(0, len(mem_ids), chunk_size):
385
+ chunk = mem_ids[i:i + chunk_size]
386
+ placeholders = ",".join("?" * len(chunk))
387
+ cursor.execute(
388
+ "SELECT id, COALESCE(created_by, 'unknown') "
389
+ "FROM memories WHERE id IN (%s)" % placeholders,
390
+ chunk,
391
+ )
392
+ for row in cursor.fetchall():
393
+ mem_id = row[0]
394
+ source_id = row[1] if row[1] else "unknown"
395
+ count = feedback_memory_ids.get(mem_id, 0)
396
+ positives[source_id] = positives.get(source_id, 0) + count
397
+ finally:
376
398
  conn.close()
377
- return positives
378
-
379
- # Batch lookup in chunks to avoid SQLite variable limit
380
- mem_ids = list(feedback_memory_ids.keys())
381
- chunk_size = 500 # SQLite max variables is 999
382
-
383
- for i in range(0, len(mem_ids), chunk_size):
384
- chunk = mem_ids[i:i + chunk_size]
385
- placeholders = ",".join("?" * len(chunk))
386
- cursor.execute(
387
- "SELECT id, COALESCE(created_by, 'unknown') "
388
- "FROM memories WHERE id IN (%s)" % placeholders,
389
- chunk,
390
- )
391
- for row in cursor.fetchall():
392
- mem_id = row[0]
393
- source_id = row[1] if row[1] else "unknown"
394
- count = feedback_memory_ids.get(mem_id, 0)
395
- positives[source_id] = positives.get(source_id, 0) + count
396
-
397
- conn.close()
398
399
 
399
400
  except sqlite3.OperationalError as e:
400
401
  logger.warning("Error looking up memory sources: %s", e)