superlocalmemory 2.6.0 → 2.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,461 @@
1
#!/usr/bin/env python3
"""
SuperLocalMemory V2 - Feature Extractor (v2.7)
Copyright (c) 2026 Varun Pratap Bhardwaj
Licensed under MIT License

Repository: https://github.com/varun369/SuperLocalMemoryV2
Author: Varun Pratap Bhardwaj (Solution Architect)

NOTICE: This software is protected by MIT License.
Attribution must be preserved in all copies or derivatives.

FeatureExtractor — Extracts 9-dimensional feature vectors for candidate memories.

Each memory retrieved during recall gets a feature vector that feeds into
the AdaptiveRanker. In Phase 1 (rule-based), features drive boosting weights.
In Phase 2 (ML), features become LightGBM input columns.

Feature Vector (9 dimensions):
    [0] bm25_score       — Existing retrieval score from search results
    [1] tfidf_score      — TF-IDF cosine similarity from search results
    [2] tech_match       — Does memory match user's tech preferences?
    [3] project_match    — Is memory from the current project?
    [4] workflow_fit     — Does memory fit current workflow phase?
    [5] source_quality   — Quality score of the source that created this memory
    [6] importance_norm  — Normalized importance (importance / 10.0)
    [7] recency_score    — Exponential decay based on age (180-day half-life)
    [8] access_frequency — How often this memory was accessed (capped at 1.0)

Design Principles:
- All features normalized to [0.0, 1.0] range for ML compatibility
- Graceful defaults when data is missing (0.5 = "unknown/neutral")
- No external API calls — everything computed locally
- Context (tech preferences, current project) set once per recall batch
- Thread-safe: no shared mutable state after set_context()
"""
39
+
40
import logging
import math
import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

logger = logging.getLogger("superlocalmemory.learning.feature_extractor")

# ============================================================================
# Feature Name Registry
# ============================================================================

# Ordered registry: index i names dimension i of every extracted vector.
FEATURE_NAMES = [
    'bm25_score',        # 0: Existing retrieval score (from search results)
    'tfidf_score',       # 1: TF-IDF cosine similarity (from search results)
    'tech_match',        # 2: Does memory match user's tech preferences?
    'project_match',     # 3: Is memory from the current project?
    'workflow_fit',      # 4: Does memory fit current workflow phase?
    'source_quality',    # 5: Quality score of the source that created this memory
    'importance_norm',   # 6: Normalized importance (importance / 10.0)
    'recency_score',     # 7: Exponential decay based on age
    'access_frequency',  # 8: How often this memory was accessed (capped at 1.0)
]

NUM_FEATURES = len(FEATURE_NAMES)

# Workflow phase keywords — maps workflow phase to content signals
_WORKFLOW_PHASE_KEYWORDS = {
    'planning': [
        'architecture', 'design', 'plan', 'roadmap', 'decision',
        'approach', 'strategy', 'requirement', 'spec', 'rfc',
    ],
    'coding': [
        'implement', 'function', 'class', 'method', 'api',
        'code', 'module', 'refactor', 'pattern', 'library',
    ],
    'testing': [
        'test', 'assert', 'mock', 'fixture', 'coverage',
        'pytest', 'jest', 'spec', 'validation', 'regression',
    ],
    'debugging': [
        'bug', 'error', 'fix', 'issue', 'traceback',
        'debug', 'crash', 'exception', 'stack', 'log',
    ],
    'deployment': [
        'deploy', 'docker', 'kubernetes', 'ci/cd', 'pipeline',
        'release', 'production', 'staging', 'env', 'config',
    ],
    'review': [
        'review', 'pr', 'merge', 'feedback', 'comment',
        'approve', 'change', 'diff', 'suggestion', 'lint',
    ],
}

# Half-life for recency decay (in days)
_RECENCY_HALF_LIFE_DAYS = 180.0

# Maximum access count before capping to 1.0
_MAX_ACCESS_COUNT = 10


class FeatureExtractor:
    """
    Extracts 9-dimensional feature vectors for candidate memories.

    Usage:
        extractor = FeatureExtractor()
        extractor.set_context(
            source_scores={'claude-desktop': 0.8, 'cursor': 0.6},
            tech_preferences={'python': {'confidence': 0.9}, 'react': {'confidence': 0.7}},
            current_project='SuperLocalMemoryV2',
            workflow_phase='testing',
        )
        features = extractor.extract_batch(memories, query="search optimization")
        # features is List[List[float]], shape (n_memories, 9)
    """

    # Re-exported on the class for convenience; same object as the module list.
    FEATURE_NAMES = FEATURE_NAMES

    def __init__(self):
        """Initialize FeatureExtractor with empty context."""
        self._source_scores: Dict[str, float] = {}
        self._tech_preferences: Dict[str, dict] = {}
        self._tech_keywords_lower: List[str] = []
        self._current_project: Optional[str] = None
        self._current_project_lower: Optional[str] = None
        self._workflow_phase: Optional[str] = None
        self._workflow_keywords: List[str] = []

    def set_context(
        self,
        source_scores: Optional[Dict[str, float]] = None,
        tech_preferences: Optional[Dict[str, dict]] = None,
        current_project: Optional[str] = None,
        workflow_phase: Optional[str] = None,
    ):
        """
        Set context for feature extraction. Called once per recall query.

        These values are expensive to compute (require DB lookups in learning_db),
        so they are set once and reused across all candidate memories in a batch.

        Args:
            source_scores: Map of source_id -> quality score (0.0-1.0).
                From learning_db.get_source_scores().
            tech_preferences: Map of tech_name -> {confidence, evidence_count, ...}.
                From cross_project_aggregator or pattern_learner.
            current_project: Name of the currently active project (if detected).
            workflow_phase: Current workflow phase (planning, coding, testing, etc).
        """
        self._source_scores = source_scores or {}
        self._tech_preferences = tech_preferences or {}

        # Pre-compute lowercased tech keywords for faster matching
        self._tech_keywords_lower = [
            k.lower() for k in self._tech_preferences
        ]

        self._current_project = current_project
        self._current_project_lower = (
            current_project.lower() if current_project else None
        )

        self._workflow_phase = workflow_phase
        self._workflow_keywords = (
            _WORKFLOW_PHASE_KEYWORDS.get(workflow_phase, [])
            if workflow_phase else []
        )

    def extract_features(self, memory: dict, query: str) -> List[float]:
        """
        Extract 9-dimensional feature vector for a single memory.

        Args:
            memory: Memory dict from search results. Expected keys:
                id, content, score, match_type, importance, created_at,
                access_count, project_name, tags, created_by (optional).
            query: The recall query string. Reserved for future
                query-dependent features; not used by the current ones.

        Returns:
            List of 9 floats in [0.0, 1.0] range, one per feature,
            ordered as FEATURE_NAMES.
        """
        return [
            self._compute_bm25_score(memory),
            self._compute_tfidf_score(memory),
            self._compute_tech_match(memory),
            self._compute_project_match(memory),
            self._compute_workflow_fit(memory),
            self._compute_source_quality(memory),
            self._compute_importance_norm(memory),
            self._compute_recency_score(memory),
            self._compute_access_frequency(memory),
        ]

    def extract_batch(
        self,
        memories: List[dict],
        query: str,
    ) -> List[List[float]]:
        """
        Extract feature vectors for all candidate memories.

        Args:
            memories: List of memory dicts from search results.
            query: The recall query string.

        Returns:
            List of feature vectors (List[List[float]]), shape (n, 9).
            Returns empty list if memories is empty.
        """
        if not memories:
            return []

        return [self.extract_features(m, query) for m in memories]

    # ========================================================================
    # Individual Feature Computations
    # ========================================================================

    @staticmethod
    def _clamp01(value: Any) -> float:
        """Coerce *value* to float and clamp into [0.0, 1.0]; 0.0 on failure."""
        try:
            return max(0.0, min(float(value), 1.0))
        except (TypeError, ValueError):
            # None or non-numeric score field — treat as "no signal"
            return 0.0

    def _compute_bm25_score(self, memory: dict) -> float:
        """
        Use 'score' field from search results for keyword-based retrieval.

        BM25/FTS5 rank scores are not naturally bounded to [0,1], so we
        apply a simple normalization. For keyword matches, score is
        typically set to 0.5 by MemoryStoreV2._row_to_dict(). For semantic
        matches, score is already in [0,1] from cosine similarity.

        We use match_type to distinguish: 'keyword' -> treat as BM25 signal,
        'semantic'/'hnsw' -> set to 0.0 (not a BM25 signal).
        """
        if memory.get('match_type', '') == 'keyword':
            # FTS5 keyword match — score field is already mapped into
            # [0,1] upstream; clamp defensively (None-safe).
            return self._clamp01(memory.get('score', 0.0))
        # Not a keyword match — no BM25 signal
        return 0.0

    def _compute_tfidf_score(self, memory: dict) -> float:
        """
        Use cosine similarity score from TF-IDF semantic search.

        For semantic matches, the score field contains the cosine
        similarity (already in [0,1]). For keyword-only matches,
        this returns 0.0.
        """
        if memory.get('match_type', '') in ('semantic', 'hnsw'):
            return self._clamp01(memory.get('score', 0.0))
        return 0.0

    def _compute_tech_match(self, memory: dict) -> float:
        """
        Check if memory content mentions user's preferred technologies.

        Returns:
            1.0 if strong match (2+ tech keywords found)
            0.5 if weak match (1 tech keyword found), or if no tech
                preferences are set (neutral — nothing to match against)
            0.0 if preferences exist but nothing matches, or content is empty
        """
        if not self._tech_keywords_lower:
            return 0.5  # No preferences known — neutral

        content = memory.get('content', '')
        if not content:
            return 0.0

        content_lower = content.lower()
        tags_str = ''
        tags = memory.get('tags', [])
        if isinstance(tags, list):
            # str() guards against non-string tags (e.g. ints from bad data)
            tags_str = ' '.join(str(t).lower() for t in tags)
        elif isinstance(tags, str):
            tags_str = tags.lower()

        searchable = content_lower + ' ' + tags_str
        match_count = 0

        for tech_kw in self._tech_keywords_lower:
            # Word-boundary check for short keywords to avoid false positives
            # e.g., "go" matching "google" — require word boundary
            if len(tech_kw) <= 3:
                if re.search(r'\b' + re.escape(tech_kw) + r'\b', searchable):
                    match_count += 1
            else:
                if tech_kw in searchable:
                    match_count += 1

        if match_count >= 2:
            return 1.0
        elif match_count == 1:
            return 0.5
        return 0.0

    def _compute_project_match(self, memory: dict) -> float:
        """
        Check if memory belongs to the currently active project.

        Returns:
            1.0 if memory's project_name matches current_project
            0.6 if no current project detected (neutral — don't penalize)
            0.5 if memory has no project_name (unknown)
            0.3 if memory is from a different project
        """
        if self._current_project_lower is None:
            # No current project context — neutral for all
            return 0.6

        memory_project = memory.get('project_name', '')
        if not memory_project:
            return 0.5  # Memory has no project — slightly neutral

        if memory_project.lower() == self._current_project_lower:
            return 1.0
        return 0.3

    def _compute_workflow_fit(self, memory: dict) -> float:
        """
        Check if memory content aligns with the current workflow phase.

        Returns:
            0.8 if strong fit (3+ keywords match)
            0.6 if moderate fit (1-2 keywords match)
            0.5 if unknown workflow phase (neutral)
            0.3 if no fit at all
        """
        if not self._workflow_keywords:
            return 0.5  # No workflow phase known — neutral

        content = memory.get('content', '')
        if not content:
            return 0.3

        content_lower = content.lower()
        match_count = sum(
            1 for kw in self._workflow_keywords
            if kw in content_lower
        )

        if match_count >= 3:
            return 0.8
        elif match_count >= 1:
            return 0.6
        return 0.3

    def _compute_source_quality(self, memory: dict) -> float:
        """
        Look up source quality from cached scores.

        Returns:
            The source's quality score if known (0.0-1.0)
            0.5 for unknown sources (neutral default)
        """
        # Try created_by first (v2.5+ provenance), then source_tool
        source_id = memory.get('created_by') or memory.get('source_tool', '')
        if not source_id:
            return 0.5  # Unknown source — neutral

        return self._source_scores.get(source_id, 0.5)

    def _compute_importance_norm(self, memory: dict) -> float:
        """
        Normalize importance to [0.0, 1.0].

        importance is stored as 1-10 integer in memory.db.
        Dividing by 10.0 gives clean normalization. Missing, None, or
        malformed values default to 5 (mid-scale).
        """
        importance = memory.get('importance', 5)
        if importance is None:
            importance = 5
        try:
            importance = int(importance)
        except (ValueError, TypeError):
            importance = 5
        # Clamp to valid range before normalizing
        importance = max(1, min(importance, 10))
        return importance / 10.0

    def _compute_recency_score(self, memory: dict) -> float:
        """
        Exponential decay based on memory age.

        Formula: exp(-age_days / half_life)
        With 180-day half-life:
            - 0 days old   -> 1.0
            - 30 days old  -> ~0.85
            - 90 days old  -> ~0.61
            - 180 days old -> ~0.37
            - 365 days old -> ~0.13

        Handles missing, None, or malformed created_at gracefully.
        """
        created_at = memory.get('created_at')
        if not created_at:
            return 0.5  # Unknown age — neutral

        try:
            # Parse the timestamp — handle multiple formats
            if isinstance(created_at, str):
                # Try ISO format first (most common in SQLite)
                created_at = created_at.replace('Z', '+00:00')
                try:
                    created_dt = datetime.fromisoformat(created_at)
                except ValueError:
                    # Fallback: try common SQLite format
                    created_dt = datetime.strptime(
                        created_at, '%Y-%m-%d %H:%M:%S'
                    )
            elif isinstance(created_at, (int, float)):
                created_dt = datetime.fromtimestamp(created_at)
            else:
                return 0.5

            if created_dt.tzinfo is not None:
                # Aware timestamp: compare aware-to-aware in UTC.
                # (Stripping tzinfo and comparing against local now()
                # would skew the age by the local UTC offset.)
                now = datetime.now(timezone.utc)
            else:
                # Naive timestamp — assume local time.
                # NOTE(review): if the store writes UTC (SQLite
                # CURRENT_TIMESTAMP is UTC), naive rows are skewed by the
                # UTC offset — negligible against a 180-day half-life,
                # but worth confirming against the writer.
                now = datetime.now()

            age_days = max(0.0, (now - created_dt).total_seconds() / 86400.0)

            # Exponential decay: e^(-age / half_life)
            score = math.exp(-age_days / _RECENCY_HALF_LIFE_DAYS)
            return max(0.0, min(score, 1.0))

        except (ValueError, TypeError, OverflowError, OSError) as e:
            logger.debug("Failed to parse created_at for recency: %s", e)
            return 0.5  # Parse failure — neutral

    def _compute_access_frequency(self, memory: dict) -> float:
        """
        Normalize access_count to [0.0, 1.0], capped at _MAX_ACCESS_COUNT.

        access_count tracks how many times a memory has been recalled.
        Capping prevents frequently-accessed memories from dominating;
        negative or malformed counts clamp to 0.0.
        """
        access_count = memory.get('access_count', 0)
        if access_count is None:
            access_count = 0
        try:
            access_count = int(access_count)
        except (ValueError, TypeError):
            access_count = 0

        # Clamp below as well so bad data cannot leave the [0,1] contract
        return max(0.0, min(access_count / float(_MAX_ACCESS_COUNT), 1.0))
448
+
449
+
450
+ # ============================================================================
451
+ # Module-level convenience functions
452
+ # ============================================================================
453
+
454
def get_feature_names() -> List[str]:
    """Return a fresh ordered list of feature names (position i labels vector index i)."""
    return FEATURE_NAMES.copy()
457
+
458
+
459
def get_num_features() -> int:
    """Return the dimensionality of each extracted feature vector."""
    return NUM_FEATURES