superlocalmemory 2.6.0 → 2.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,461 @@
1
#!/usr/bin/env python3
"""
SuperLocalMemory V2 - Feature Extractor (v2.7)
Copyright (c) 2026 Varun Pratap Bhardwaj
Licensed under MIT License

Repository: https://github.com/varun369/SuperLocalMemoryV2
Author: Varun Pratap Bhardwaj (Solution Architect)

NOTICE: This software is protected by MIT License.
Attribution must be preserved in all copies or derivatives.

FeatureExtractor — Extracts 9-dimensional feature vectors for candidate memories.

Each memory retrieved during recall gets a feature vector that feeds into
the AdaptiveRanker. In Phase 1 (rule-based), features drive boosting weights.
In Phase 2 (ML), features become LightGBM input columns.

Feature Vector (9 dimensions):
    [0] bm25_score       — Existing retrieval score from search results
    [1] tfidf_score      — TF-IDF cosine similarity from search results
    [2] tech_match       — Does memory match user's tech preferences?
    [3] project_match    — Is memory from the current project?
    [4] workflow_fit     — Does memory fit current workflow phase?
    [5] source_quality   — Quality score of the source that created this memory
    [6] importance_norm  — Normalized importance (importance / 10.0)
    [7] recency_score    — Exponential decay based on age (180-day half-life)
    [8] access_frequency — How often this memory was accessed (capped at 1.0)

Design Principles:
- All features normalized to [0.0, 1.0] range for ML compatibility
- Graceful defaults when data is missing (0.5 = "unknown/neutral")
- No external API calls — everything computed locally
- Context (tech preferences, current project) set once per recall batch
- Thread-safe: no shared mutable state after set_context()
"""
39
+
40
import logging
import math
import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

logger = logging.getLogger("superlocalmemory.learning.feature_extractor")

# ============================================================================
# Feature Name Registry
# ============================================================================

# Ordered registry: index i names dimension i of every extracted vector.
FEATURE_NAMES = [
    'bm25_score',        # 0: Existing retrieval score (from search results)
    'tfidf_score',       # 1: TF-IDF cosine similarity (from search results)
    'tech_match',        # 2: Does memory match user's tech preferences?
    'project_match',     # 3: Is memory from the current project?
    'workflow_fit',      # 4: Does memory fit current workflow phase?
    'source_quality',    # 5: Quality score of the source that created this memory
    'importance_norm',   # 6: Normalized importance (importance / 10.0)
    'recency_score',     # 7: Exponential decay based on age
    'access_frequency',  # 8: How often this memory was accessed (capped at 1.0)
]

NUM_FEATURES = len(FEATURE_NAMES)

# Workflow phase keywords — maps workflow phase to content signals
_WORKFLOW_PHASE_KEYWORDS = {
    'planning': [
        'architecture', 'design', 'plan', 'roadmap', 'decision',
        'approach', 'strategy', 'requirement', 'spec', 'rfc',
    ],
    'coding': [
        'implement', 'function', 'class', 'method', 'api',
        'code', 'module', 'refactor', 'pattern', 'library',
    ],
    'testing': [
        'test', 'assert', 'mock', 'fixture', 'coverage',
        'pytest', 'jest', 'spec', 'validation', 'regression',
    ],
    'debugging': [
        'bug', 'error', 'fix', 'issue', 'traceback',
        'debug', 'crash', 'exception', 'stack', 'log',
    ],
    'deployment': [
        'deploy', 'docker', 'kubernetes', 'ci/cd', 'pipeline',
        'release', 'production', 'staging', 'env', 'config',
    ],
    'review': [
        'review', 'pr', 'merge', 'feedback', 'comment',
        'approve', 'change', 'diff', 'suggestion', 'lint',
    ],
}

# Half-life for recency decay (in days)
_RECENCY_HALF_LIFE_DAYS = 180.0

# Maximum access count before capping to 1.0
_MAX_ACCESS_COUNT = 10


class FeatureExtractor:
    """
    Extracts 9-dimensional feature vectors for candidate memories.

    Usage:
        extractor = FeatureExtractor()
        extractor.set_context(
            source_scores={'claude-desktop': 0.8, 'cursor': 0.6},
            tech_preferences={'python': {'confidence': 0.9}, 'react': {'confidence': 0.7}},
            current_project='SuperLocalMemoryV2',
            workflow_phase='testing',
        )
        features = extractor.extract_batch(memories, query="search optimization")
        # features is List[List[float]], shape (n_memories, 9)
    """

    # Re-exported on the class for convenience; same object as the module list.
    FEATURE_NAMES = FEATURE_NAMES

    def __init__(self):
        """Initialize FeatureExtractor with empty context."""
        self._source_scores: Dict[str, float] = {}
        self._tech_preferences: Dict[str, dict] = {}
        self._tech_keywords_lower: List[str] = []
        self._current_project: Optional[str] = None
        self._current_project_lower: Optional[str] = None
        self._workflow_phase: Optional[str] = None
        self._workflow_keywords: List[str] = []

    def set_context(
        self,
        source_scores: Optional[Dict[str, float]] = None,
        tech_preferences: Optional[Dict[str, dict]] = None,
        current_project: Optional[str] = None,
        workflow_phase: Optional[str] = None,
    ):
        """
        Set context for feature extraction. Called once per recall query.

        These values are expensive to compute (require DB lookups in learning_db),
        so they are set once and reused across all candidate memories in a batch.

        Args:
            source_scores: Map of source_id -> quality score (0.0-1.0).
                From learning_db.get_source_scores().
            tech_preferences: Map of tech_name -> {confidence, evidence_count, ...}.
                From cross_project_aggregator or pattern_learner.
            current_project: Name of the currently active project (if detected).
            workflow_phase: Current workflow phase (planning, coding, testing, etc).
        """
        self._source_scores = source_scores or {}
        self._tech_preferences = tech_preferences or {}

        # Pre-compute lowercased tech keywords for faster matching
        self._tech_keywords_lower = [
            k.lower() for k in self._tech_preferences
        ]

        self._current_project = current_project
        self._current_project_lower = (
            current_project.lower() if current_project else None
        )

        self._workflow_phase = workflow_phase
        self._workflow_keywords = (
            _WORKFLOW_PHASE_KEYWORDS.get(workflow_phase, [])
            if workflow_phase else []
        )

    def extract_features(self, memory: dict, query: str) -> List[float]:
        """
        Extract 9-dimensional feature vector for a single memory.

        Args:
            memory: Memory dict from search results. Expected keys:
                id, content, score, match_type, importance, created_at,
                access_count, project_name, tags, created_by (optional).
            query: The recall query string. Reserved for future
                query-dependent features; not used by the current ones.

        Returns:
            List of 9 floats in [0.0, 1.0] range, one per feature,
            ordered as FEATURE_NAMES.
        """
        return [
            self._compute_bm25_score(memory),
            self._compute_tfidf_score(memory),
            self._compute_tech_match(memory),
            self._compute_project_match(memory),
            self._compute_workflow_fit(memory),
            self._compute_source_quality(memory),
            self._compute_importance_norm(memory),
            self._compute_recency_score(memory),
            self._compute_access_frequency(memory),
        ]

    def extract_batch(
        self,
        memories: List[dict],
        query: str,
    ) -> List[List[float]]:
        """
        Extract feature vectors for all candidate memories.

        Args:
            memories: List of memory dicts from search results.
            query: The recall query string.

        Returns:
            List of feature vectors (List[List[float]]), shape (n, 9).
            Returns empty list if memories is empty.
        """
        if not memories:
            return []

        return [self.extract_features(m, query) for m in memories]

    # ========================================================================
    # Individual Feature Computations
    # ========================================================================

    @staticmethod
    def _clamp01(value: Any) -> float:
        """Coerce *value* to float and clamp into [0.0, 1.0]; 0.0 on failure."""
        try:
            return max(0.0, min(float(value), 1.0))
        except (TypeError, ValueError):
            # None or non-numeric score field — treat as "no signal"
            return 0.0

    def _compute_bm25_score(self, memory: dict) -> float:
        """
        Use 'score' field from search results for keyword-based retrieval.

        BM25/FTS5 rank scores are not naturally bounded to [0,1], so we
        apply a simple normalization. For keyword matches, score is
        typically set to 0.5 by MemoryStoreV2._row_to_dict(). For semantic
        matches, score is already in [0,1] from cosine similarity.

        We use match_type to distinguish: 'keyword' -> treat as BM25 signal,
        'semantic'/'hnsw' -> set to 0.0 (not a BM25 signal).
        """
        if memory.get('match_type', '') == 'keyword':
            # FTS5 keyword match — score field is already mapped into
            # [0,1] upstream; clamp defensively (None-safe).
            return self._clamp01(memory.get('score', 0.0))
        # Not a keyword match — no BM25 signal
        return 0.0

    def _compute_tfidf_score(self, memory: dict) -> float:
        """
        Use cosine similarity score from TF-IDF semantic search.

        For semantic matches, the score field contains the cosine
        similarity (already in [0,1]). For keyword-only matches,
        this returns 0.0.
        """
        if memory.get('match_type', '') in ('semantic', 'hnsw'):
            return self._clamp01(memory.get('score', 0.0))
        return 0.0

    def _compute_tech_match(self, memory: dict) -> float:
        """
        Check if memory content mentions user's preferred technologies.

        Returns:
            1.0 if strong match (2+ tech keywords found)
            0.5 if weak match (1 tech keyword found), or if no tech
                preferences are set (neutral — nothing to match against)
            0.0 if preferences exist but nothing matches, or content is empty
        """
        if not self._tech_keywords_lower:
            return 0.5  # No preferences known — neutral

        content = memory.get('content', '')
        if not content:
            return 0.0

        content_lower = content.lower()
        tags_str = ''
        tags = memory.get('tags', [])
        if isinstance(tags, list):
            # str() guards against non-string tags (e.g. ints from bad data)
            tags_str = ' '.join(str(t).lower() for t in tags)
        elif isinstance(tags, str):
            tags_str = tags.lower()

        searchable = content_lower + ' ' + tags_str
        match_count = 0

        for tech_kw in self._tech_keywords_lower:
            # Word-boundary check for short keywords to avoid false positives
            # e.g., "go" matching "google" — require word boundary
            if len(tech_kw) <= 3:
                if re.search(r'\b' + re.escape(tech_kw) + r'\b', searchable):
                    match_count += 1
            else:
                if tech_kw in searchable:
                    match_count += 1

        if match_count >= 2:
            return 1.0
        elif match_count == 1:
            return 0.5
        return 0.0

    def _compute_project_match(self, memory: dict) -> float:
        """
        Check if memory belongs to the currently active project.

        Returns:
            1.0 if memory's project_name matches current_project
            0.6 if no current project detected (neutral — don't penalize)
            0.5 if memory has no project_name (unknown)
            0.3 if memory is from a different project
        """
        if self._current_project_lower is None:
            # No current project context — neutral for all
            return 0.6

        memory_project = memory.get('project_name', '')
        if not memory_project:
            return 0.5  # Memory has no project — slightly neutral

        if memory_project.lower() == self._current_project_lower:
            return 1.0
        return 0.3

    def _compute_workflow_fit(self, memory: dict) -> float:
        """
        Check if memory content aligns with the current workflow phase.

        Returns:
            0.8 if strong fit (3+ keywords match)
            0.6 if moderate fit (1-2 keywords match)
            0.5 if unknown workflow phase (neutral)
            0.3 if no fit at all
        """
        if not self._workflow_keywords:
            return 0.5  # No workflow phase known — neutral

        content = memory.get('content', '')
        if not content:
            return 0.3

        content_lower = content.lower()
        match_count = sum(
            1 for kw in self._workflow_keywords
            if kw in content_lower
        )

        if match_count >= 3:
            return 0.8
        elif match_count >= 1:
            return 0.6
        return 0.3

    def _compute_source_quality(self, memory: dict) -> float:
        """
        Look up source quality from cached scores.

        Returns:
            The source's quality score if known (0.0-1.0)
            0.5 for unknown sources (neutral default)
        """
        # Try created_by first (v2.5+ provenance), then source_tool
        source_id = memory.get('created_by') or memory.get('source_tool', '')
        if not source_id:
            return 0.5  # Unknown source — neutral

        return self._source_scores.get(source_id, 0.5)

    def _compute_importance_norm(self, memory: dict) -> float:
        """
        Normalize importance to [0.0, 1.0].

        importance is stored as 1-10 integer in memory.db.
        Dividing by 10.0 gives clean normalization. Missing, None, or
        malformed values default to 5 (mid-scale).
        """
        importance = memory.get('importance', 5)
        if importance is None:
            importance = 5
        try:
            importance = int(importance)
        except (ValueError, TypeError):
            importance = 5
        # Clamp to valid range before normalizing
        importance = max(1, min(importance, 10))
        return importance / 10.0

    def _compute_recency_score(self, memory: dict) -> float:
        """
        Exponential decay based on memory age.

        Formula: exp(-age_days / half_life)
        With 180-day half-life:
            - 0 days old   -> 1.0
            - 30 days old  -> ~0.85
            - 90 days old  -> ~0.61
            - 180 days old -> ~0.37
            - 365 days old -> ~0.13

        Handles missing, None, or malformed created_at gracefully.
        """
        created_at = memory.get('created_at')
        if not created_at:
            return 0.5  # Unknown age — neutral

        try:
            # Parse the timestamp — handle multiple formats
            if isinstance(created_at, str):
                # Try ISO format first (most common in SQLite)
                created_at = created_at.replace('Z', '+00:00')
                try:
                    created_dt = datetime.fromisoformat(created_at)
                except ValueError:
                    # Fallback: try common SQLite format
                    created_dt = datetime.strptime(
                        created_at, '%Y-%m-%d %H:%M:%S'
                    )
            elif isinstance(created_at, (int, float)):
                created_dt = datetime.fromtimestamp(created_at)
            else:
                return 0.5

            if created_dt.tzinfo is not None:
                # Aware timestamp: compare aware-to-aware in UTC.
                # (Stripping tzinfo and comparing against local now()
                # would skew the age by the local UTC offset.)
                now = datetime.now(timezone.utc)
            else:
                # Naive timestamp — assume local time.
                # NOTE(review): if the store writes UTC (SQLite
                # CURRENT_TIMESTAMP is UTC), naive rows are skewed by the
                # UTC offset — negligible against a 180-day half-life,
                # but worth confirming against the writer.
                now = datetime.now()

            age_days = max(0.0, (now - created_dt).total_seconds() / 86400.0)

            # Exponential decay: e^(-age / half_life)
            score = math.exp(-age_days / _RECENCY_HALF_LIFE_DAYS)
            return max(0.0, min(score, 1.0))

        except (ValueError, TypeError, OverflowError, OSError) as e:
            logger.debug("Failed to parse created_at for recency: %s", e)
            return 0.5  # Parse failure — neutral

    def _compute_access_frequency(self, memory: dict) -> float:
        """
        Normalize access_count to [0.0, 1.0], capped at _MAX_ACCESS_COUNT.

        access_count tracks how many times a memory has been recalled.
        Capping prevents frequently-accessed memories from dominating;
        negative or malformed counts clamp to 0.0.
        """
        access_count = memory.get('access_count', 0)
        if access_count is None:
            access_count = 0
        try:
            access_count = int(access_count)
        except (ValueError, TypeError):
            access_count = 0

        # Clamp below as well so bad data cannot leave the [0,1] contract
        return max(0.0, min(access_count / float(_MAX_ACCESS_COUNT), 1.0))
448
+
449
+
450
+ # ============================================================================
451
+ # Module-level convenience functions
452
+ # ============================================================================
453
+
454
def get_feature_names() -> List[str]:
    """Return a fresh ordered list of feature names (position i labels vector index i)."""
    return FEATURE_NAMES.copy()
457
+
458
+
459
def get_num_features() -> int:
    """Return the dimensionality of each extracted feature vector."""
    return NUM_FEATURES