superlocalmemory 2.6.5 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1047 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SuperLocalMemory V2 - Synthetic Bootstrap (v2.7)
4
+ Copyright (c) 2026 Varun Pratap Bhardwaj
5
+ Licensed under MIT License
6
+
7
+ Repository: https://github.com/varun369/SuperLocalMemoryV2
8
+ Author: Varun Pratap Bhardwaj (Solution Architect)
9
+
10
+ NOTICE: This software is protected by MIT License.
11
+ Attribution must be preserved in all copies or derivatives.
12
+ """
13
+
14
+ """
15
+ SyntheticBootstrapper — Bootstrap ML model from existing data patterns.
16
+
17
+ PROBLEM: LightGBM needs 200+ feedback signals across 50+ unique queries
18
+ to activate ML ranking (Phase 2). A new user has zero feedback. Without
19
+ bootstrap, users must endure ~200 recalls before getting personalization.
20
+ That's weeks of usage with no benefit. Users abandon before reaching Phase 2.
21
+
22
+ SOLUTION: Generate synthetic (query, memory, relevance_label) tuples from
23
+ EXISTING data patterns in memory.db. These aren't real user feedback, but
24
+ they encode reasonable assumptions:
25
+ - Frequently accessed memories are probably relevant to their keywords
26
+ - High-importance memories should rank higher for their topics
27
+ - Learned patterns (from pattern_learner.py) encode real preferences
28
+ - Recent memories should generally outrank older ones
29
+
30
+ Four Strategies:
31
+ 1. Access-based: Memories accessed 5+ times -> positive for their keywords
32
+ 2. Importance-based: Importance >= 8 -> positive for their tags
33
+ 3. Pattern-based: Learned identity_patterns -> positive for matching memories
34
+ 4. Recency decay: For any synthetic query, recent memories rank higher
35
+
36
+ The bootstrap model uses MORE aggressive regularization than the real model
37
+ (fewer trees, smaller depth, higher reg_lambda) to prevent overfitting
38
+ on synthetic data. Once real feedback accumulates, the model is retrained
39
+ with continued learning (init_model), gradually replacing synthetic signal
40
+ with real signal.
41
+
42
+ Research Backing:
43
+ - FCS LREC 2024: Cold-start mitigation via synthetic bootstrap
44
+ - eKNOW 2025: BM25 -> re-ranker pipeline effectiveness
45
+ """
46
+
47
+ import hashlib
48
+ import logging
49
+ import re
50
+ import sqlite3
51
+ from collections import Counter
52
+ from datetime import datetime
53
+ from pathlib import Path
54
+ from typing import Any, Dict, List, Optional, Set
55
+
56
+ # LightGBM is OPTIONAL — bootstrap only works when LightGBM is installed
57
+ try:
58
+ import lightgbm as lgb
59
+ HAS_LIGHTGBM = True
60
+ except ImportError:
61
+ lgb = None
62
+ HAS_LIGHTGBM = False
63
+
64
+ try:
65
+ import numpy as np
66
+ HAS_NUMPY = True
67
+ except ImportError:
68
+ np = None
69
+ HAS_NUMPY = False
70
+
71
+ from .feature_extractor import FeatureExtractor, FEATURE_NAMES, NUM_FEATURES
72
+
73
+ logger = logging.getLogger("superlocalmemory.learning.synthetic_bootstrap")
74
+
75
# ============================================================================
# Constants
# ============================================================================

# Default location of the user's memory database (bootstrap reads it only).
MEMORY_DB_PATH = Path.home() / ".claude-memory" / "memory.db"
# Directory where trained ranking models live.
MODELS_DIR = Path.home() / ".claude-memory" / "models"
# The bootstrap model is saved to the SAME path as the real ranker, so the
# downstream ranker loads it transparently (see bootstrap_model()).
MODEL_PATH = MODELS_DIR / "ranker.txt"

# Minimum memories needed before bootstrap makes sense
MIN_MEMORIES_FOR_BOOTSTRAP = 50

# Tiered config — bootstrap model complexity scales with data size.
# Each tier maps a memory-count range to a synthetic-sample target and
# LightGBM capacity (n_estimators / max_depth) suited to that volume.
BOOTSTRAP_CONFIG = {
    'small': {
        'min_memories': 50,
        'max_memories': 499,
        'target_samples': 200,
        'n_estimators': 30,
        'max_depth': 3,
    },
    'medium': {
        'min_memories': 500,
        'max_memories': 4999,
        'target_samples': 1000,
        'n_estimators': 50,
        'max_depth': 4,
    },
    'large': {
        'min_memories': 5000,
        'max_memories': float('inf'),   # open-ended top tier
        'target_samples': 2000,
        'n_estimators': 100,
        'max_depth': 6,
    },
}

# LightGBM bootstrap parameters — MORE aggressive regularization than
# real training because synthetic data has systematic biases
BOOTSTRAP_PARAMS = {
    'objective': 'lambdarank',   # listwise learning-to-rank objective
    'metric': 'ndcg',
    'ndcg_eval_at': [5, 10],
    'learning_rate': 0.1,
    'num_leaves': 8,             # small trees resist overfitting synthetic data
    'max_depth': 3,              # overridden per tier in bootstrap_model()
    'min_child_samples': 5,
    'subsample': 0.7,
    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'boosting_type': 'dart',
    'verbose': -1,
}

# English stopwords for keyword extraction (no external deps)
_STOPWORDS = frozenset({
    'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'is', 'it', 'this', 'that', 'was', 'are',
    'be', 'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would',
    'could', 'should', 'may', 'might', 'can', 'not', 'no', 'if', 'then',
    'so', 'as', 'up', 'out', 'about', 'into', 'over', 'after', 'before',
    'when', 'where', 'how', 'what', 'which', 'who', 'whom', 'why',
    'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other',
    'some', 'such', 'than', 'too', 'very', 'just', 'also', 'now',
    'here', 'there', 'use', 'used', 'using', 'make', 'made',
    'need', 'needed', 'get', 'got', 'set', 'new', 'old', 'one', 'two',
})

# Minimum word length for keyword extraction
_MIN_KEYWORD_LENGTH = 3
144
+
145
+
146
+ class SyntheticBootstrapper:
147
+ """
148
+ Generates synthetic training data and bootstraps the ML ranking model.
149
+
150
+ Usage:
151
+ bootstrapper = SyntheticBootstrapper()
152
+ if bootstrapper.should_bootstrap():
153
+ result = bootstrapper.bootstrap_model()
154
+ if result:
155
+ print(f"Bootstrapped with {result['training_samples']} samples")
156
+
157
+ The bootstrapped model is saved to the same path as the real model.
158
+ When real feedback accumulates, AdaptiveRanker.train() uses
159
+ continued learning (init_model) to incrementally replace synthetic
160
+ signal with real signal.
161
+ """
162
+
163
+ MIN_MEMORIES_FOR_BOOTSTRAP = MIN_MEMORIES_FOR_BOOTSTRAP
164
+ BOOTSTRAP_CONFIG = BOOTSTRAP_CONFIG
165
+
166
+ def __init__(
167
+ self,
168
+ memory_db_path: Optional[Path] = None,
169
+ learning_db=None,
170
+ ):
171
+ """
172
+ Initialize SyntheticBootstrapper.
173
+
174
+ Args:
175
+ memory_db_path: Path to memory.db (defaults to ~/.claude-memory/memory.db).
176
+ learning_db: Optional LearningDB instance for recording metadata.
177
+ """
178
+ self._memory_db = Path(memory_db_path) if memory_db_path else MEMORY_DB_PATH
179
+ self._learning_db = learning_db
180
+ self._feature_extractor = FeatureExtractor()
181
+
182
+ # ========================================================================
183
+ # LearningDB Access
184
+ # ========================================================================
185
+
186
+ def _get_learning_db(self):
187
+ """Get or create the LearningDB instance."""
188
+ if self._learning_db is None:
189
+ try:
190
+ from .learning_db import LearningDB
191
+ self._learning_db = LearningDB()
192
+ except Exception as e:
193
+ logger.warning("Cannot access LearningDB: %s", e)
194
+ return None
195
+ return self._learning_db
196
+
197
+ # ========================================================================
198
+ # Pre-flight Checks
199
+ # ========================================================================
200
+
201
+ def should_bootstrap(self) -> bool:
202
+ """
203
+ Check if synthetic bootstrap is needed and possible.
204
+
205
+ Returns True if:
206
+ 1. LightGBM + NumPy are available
207
+ 2. No existing model file (or forced rebuild)
208
+ 3. At least MIN_MEMORIES_FOR_BOOTSTRAP memories exist in memory.db
209
+ """
210
+ if not HAS_LIGHTGBM or not HAS_NUMPY:
211
+ logger.debug("Bootstrap unavailable: LightGBM=%s, NumPy=%s",
212
+ HAS_LIGHTGBM, HAS_NUMPY)
213
+ return False
214
+
215
+ if MODEL_PATH.exists():
216
+ logger.debug("Model already exists at %s — skipping bootstrap",
217
+ MODEL_PATH)
218
+ return False
219
+
220
+ memory_count = self._get_memory_count()
221
+ if memory_count < MIN_MEMORIES_FOR_BOOTSTRAP:
222
+ logger.debug(
223
+ "Not enough memories for bootstrap: %d (need %d)",
224
+ memory_count, MIN_MEMORIES_FOR_BOOTSTRAP
225
+ )
226
+ return False
227
+
228
+ return True
229
+
230
+ def get_tier(self) -> Optional[str]:
231
+ """
232
+ Determine bootstrap tier based on memory count.
233
+
234
+ Returns:
235
+ 'small', 'medium', 'large', or None if < MIN_MEMORIES.
236
+ """
237
+ count = self._get_memory_count()
238
+ for tier_name, config in BOOTSTRAP_CONFIG.items():
239
+ if config['min_memories'] <= count <= config['max_memories']:
240
+ return tier_name
241
+ return None
242
+
243
+ def _get_memory_count(self) -> int:
244
+ """Count total memories in memory.db."""
245
+ if not self._memory_db.exists():
246
+ return 0
247
+ try:
248
+ conn = sqlite3.connect(str(self._memory_db), timeout=5)
249
+ cursor = conn.cursor()
250
+ cursor.execute('SELECT COUNT(*) FROM memories')
251
+ count = cursor.fetchone()[0]
252
+ conn.close()
253
+ return count
254
+ except Exception as e:
255
+ logger.warning("Failed to count memories: %s", e)
256
+ return 0
257
+
258
+ # ========================================================================
259
+ # Synthetic Data Generation
260
+ # ========================================================================
261
+
262
+ def generate_synthetic_training_data(self) -> List[dict]:
263
+ """
264
+ Generate synthetic (query, memory, label, features) records.
265
+
266
+ Combines four strategies to produce training data from existing
267
+ memory patterns. Each record contains:
268
+ - query: Synthetic query string (extracted keywords)
269
+ - memory_id: ID of the memory in memory.db
270
+ - label: Relevance label (0.0 = irrelevant, 1.0 = highly relevant)
271
+ - source: Which strategy generated this record
272
+ - features: 9-dimensional feature vector
273
+
274
+ Returns:
275
+ List of training record dicts. May be empty if insufficient data.
276
+ """
277
+ records = []
278
+
279
+ # Strategy 1: Access-based pseudo-labels
280
+ access_records = self._generate_access_based()
281
+ records.extend(access_records)
282
+ logger.info("Strategy 1 (access): %d records", len(access_records))
283
+
284
+ # Strategy 2: Importance-based pseudo-labels
285
+ importance_records = self._generate_importance_based()
286
+ records.extend(importance_records)
287
+ logger.info("Strategy 2 (importance): %d records",
288
+ len(importance_records))
289
+
290
+ # Strategy 3: Pattern-based synthetic queries
291
+ pattern_records = self._generate_pattern_based()
292
+ records.extend(pattern_records)
293
+ logger.info("Strategy 3 (patterns): %d records", len(pattern_records))
294
+
295
+ # Strategy 4: Recency decay pseudo-labels
296
+ recency_records = self._generate_recency_based()
297
+ records.extend(recency_records)
298
+ logger.info("Strategy 4 (recency): %d records", len(recency_records))
299
+
300
+ logger.info("Total synthetic records: %d", len(records))
301
+ return records
302
+
303
+ def _generate_access_based(self) -> List[dict]:
304
+ """
305
+ Strategy 1: Memories accessed 5+ times are relevant for their keywords.
306
+
307
+ Logic: If a user keeps coming back to a memory via certain searches,
308
+ the keywords in that memory are relevant queries for it.
309
+ """
310
+ records = []
311
+ high_access_memories = self._get_memories_by_access(min_access=5)
312
+
313
+ for memory in high_access_memories:
314
+ keywords = self._extract_keywords(memory.get('content', ''))
315
+ if not keywords:
316
+ continue
317
+
318
+ query = ' '.join(keywords)
319
+
320
+ # Positive: This memory is relevant to its own keywords
321
+ records.append(self._build_record(
322
+ query=query,
323
+ memory=memory,
324
+ label=1.0,
325
+ source='access_positive',
326
+ ))
327
+
328
+ # Find some non-matching memories as negatives
329
+ negatives = self._find_negative_memories(
330
+ memory, exclude_ids={memory['id']}, limit=2
331
+ )
332
+ for neg_memory in negatives:
333
+ records.append(self._build_record(
334
+ query=query,
335
+ memory=neg_memory,
336
+ label=0.0,
337
+ source='access_negative',
338
+ ))
339
+
340
+ return records
341
+
342
+ def _generate_importance_based(self) -> List[dict]:
343
+ """
344
+ Strategy 2: High-importance memories (>= 8) are positive for their tags.
345
+
346
+ Logic: User explicitly rated these memories as important. Their tags
347
+ represent topics the user cares about.
348
+ """
349
+ records = []
350
+ important_memories = self._get_memories_by_importance(min_importance=8)
351
+
352
+ for memory in important_memories:
353
+ # Use tags as synthetic query, fall back to content keywords
354
+ tags = memory.get('tags', '')
355
+ if isinstance(tags, str):
356
+ try:
357
+ import json
358
+ tags_list = json.loads(tags)
359
+ except (ValueError, TypeError):
360
+ tags_list = [t.strip() for t in tags.split(',') if t.strip()]
361
+ elif isinstance(tags, list):
362
+ tags_list = tags
363
+ else:
364
+ tags_list = []
365
+
366
+ if tags_list:
367
+ query = ' '.join(tags_list[:5])
368
+ else:
369
+ keywords = self._extract_keywords(memory.get('content', ''))
370
+ query = ' '.join(keywords) if keywords else ''
371
+
372
+ if not query:
373
+ continue
374
+
375
+ # Positive: High-importance memory matches its tags
376
+ records.append(self._build_record(
377
+ query=query,
378
+ memory=memory,
379
+ label=1.0,
380
+ source='importance_positive',
381
+ ))
382
+
383
+ # Find some negatives
384
+ negatives = self._find_negative_memories(
385
+ memory, exclude_ids={memory['id']}, limit=2
386
+ )
387
+ for neg_memory in negatives:
388
+ records.append(self._build_record(
389
+ query=query,
390
+ memory=neg_memory,
391
+ label=0.0,
392
+ source='importance_negative',
393
+ ))
394
+
395
+ return records
396
+
397
+ def _generate_pattern_based(self) -> List[dict]:
398
+ """
399
+ Strategy 3: Use learned identity_patterns to create synthetic queries.
400
+
401
+ Logic: Pattern learner has already identified user's tech preferences,
402
+ coding style, etc. Use these as queries and find matching memories.
403
+ """
404
+ records = []
405
+ patterns = self._get_learned_patterns(min_confidence=0.7)
406
+
407
+ if not patterns:
408
+ return records
409
+
410
+ for pattern in patterns:
411
+ # Build query from pattern key + value
412
+ query_parts = []
413
+ key = pattern.get('key', '')
414
+ value = pattern.get('value', '')
415
+ if key:
416
+ query_parts.append(key)
417
+ if value and value != key:
418
+ query_parts.append(value)
419
+
420
+ query = ' '.join(query_parts)
421
+ if not query or len(query) < 3:
422
+ continue
423
+
424
+ # Search for memories matching this pattern
425
+ matching = self._search_memories(query, limit=10)
426
+
427
+ if len(matching) < 2:
428
+ continue
429
+
430
+ # Top results are positive, bottom results are weak negatives
431
+ for i, memory in enumerate(matching):
432
+ if i < 3:
433
+ label = 1.0 # Top matches are relevant
434
+ elif i < 6:
435
+ label = 0.5 # Middle matches are weakly relevant
436
+ else:
437
+ label = 0.1 # Bottom matches are marginal
438
+
439
+ records.append(self._build_record(
440
+ query=query,
441
+ memory=memory,
442
+ label=label,
443
+ source='pattern',
444
+ ))
445
+
446
+ return records
447
+
448
+ def _generate_recency_based(self) -> List[dict]:
449
+ """
450
+ Strategy 4: Recency decay — for shared-topic queries, recent wins.
451
+
452
+ Logic: For memories about the same topic, more recent memories
453
+ should generally rank higher (fresher context, more current).
454
+ Generates pairs where newer = positive, older = weak negative.
455
+ """
456
+ records = []
457
+
458
+ # Get a sample of recent and old memories
459
+ recent = self._get_recent_memories(limit=30)
460
+ if len(recent) < 4:
461
+ return records
462
+
463
+ # Take pairs: for each recent memory's keywords, create a query
464
+ # then the recent memory is positive and older memories are negative
465
+ processed_queries: Set[str] = set()
466
+
467
+ for memory in recent[:15]:
468
+ keywords = self._extract_keywords(memory.get('content', ''))
469
+ query = ' '.join(keywords) if keywords else ''
470
+ if not query or query in processed_queries:
471
+ continue
472
+ processed_queries.add(query)
473
+
474
+ # This recent memory is positive
475
+ records.append(self._build_record(
476
+ query=query,
477
+ memory=memory,
478
+ label=0.8, # Good but not perfect (it's synthetic)
479
+ source='recency_positive',
480
+ ))
481
+
482
+ # Find older memories about similar topic
483
+ similar_old = self._search_memories(query, limit=5)
484
+ for old_mem in similar_old:
485
+ if old_mem['id'] == memory['id']:
486
+ continue
487
+ # Older memories get lower label
488
+ records.append(self._build_record(
489
+ query=query,
490
+ memory=old_mem,
491
+ label=0.3,
492
+ source='recency_negative',
493
+ ))
494
+
495
+ return records
496
+
497
+ # ========================================================================
498
+ # Record Building
499
+ # ========================================================================
500
+
501
+ def _build_record(
502
+ self,
503
+ query: str,
504
+ memory: dict,
505
+ label: float,
506
+ source: str,
507
+ ) -> dict:
508
+ """
509
+ Build a training record with features.
510
+
511
+ For synthetic data, we use simplified context:
512
+ - No tech preferences (unknown at bootstrap time)
513
+ - No current project
514
+ - No workflow phase
515
+ Focus on measurable features: importance, recency, access_frequency.
516
+ """
517
+ # Set neutral context (no query-time info for synthetic data)
518
+ # Context is already set externally or defaults to neutral
519
+ features = self._feature_extractor.extract_features(memory, query)
520
+
521
+ return {
522
+ 'query': query,
523
+ 'query_hash': hashlib.sha256(query.encode()).hexdigest()[:16],
524
+ 'memory_id': memory.get('id', 0),
525
+ 'label': label,
526
+ 'source': source,
527
+ 'features': features,
528
+ }
529
+
530
+ # ========================================================================
531
+ # Model Training
532
+ # ========================================================================
533
+
534
+ def bootstrap_model(self) -> Optional[Dict[str, Any]]:
535
+ """
536
+ Generate synthetic data and train the bootstrap model.
537
+
538
+ Steps:
539
+ 1. Generate synthetic training data
540
+ 2. Build feature matrix and label vectors
541
+ 3. Train LightGBM with aggressive regularization
542
+ 4. Save model to ~/.claude-memory/models/ranker.txt
543
+ 5. Record metadata in learning_db
544
+ 6. Return metadata
545
+
546
+ Returns:
547
+ Training metadata dict, or None if bootstrap not possible.
548
+ """
549
+ if not HAS_LIGHTGBM or not HAS_NUMPY:
550
+ logger.warning("Bootstrap requires LightGBM and NumPy")
551
+ return None
552
+
553
+ tier = self.get_tier()
554
+ if tier is None:
555
+ logger.info("Not enough memories for bootstrap")
556
+ return None
557
+
558
+ config = BOOTSTRAP_CONFIG[tier]
559
+ logger.info(
560
+ "Starting bootstrap (tier=%s, target=%d samples)",
561
+ tier, config['target_samples']
562
+ )
563
+
564
+ # Set neutral context for feature extraction
565
+ self._feature_extractor.set_context()
566
+
567
+ # Generate synthetic data
568
+ records = self.generate_synthetic_training_data()
569
+ if not records:
570
+ logger.warning("No synthetic records generated")
571
+ return None
572
+
573
+ # Trim to target sample count if needed
574
+ if len(records) > config['target_samples']:
575
+ # Keep a diverse sample across sources
576
+ records = self._diverse_sample(records, config['target_samples'])
577
+
578
+ # Group by query_hash for LGBMRanker
579
+ query_groups: Dict[str, List[dict]] = {}
580
+ for record in records:
581
+ qh = record['query_hash']
582
+ if qh not in query_groups:
583
+ query_groups[qh] = []
584
+ query_groups[qh].append(record)
585
+
586
+ # Filter: only keep groups with 2+ items
587
+ query_groups = {
588
+ qh: recs for qh, recs in query_groups.items()
589
+ if len(recs) >= 2
590
+ }
591
+
592
+ if not query_groups:
593
+ logger.warning("No valid query groups (need 2+ records per group)")
594
+ return None
595
+
596
+ # Build matrices
597
+ all_features = []
598
+ all_labels = []
599
+ groups = []
600
+
601
+ for qh, group_records in query_groups.items():
602
+ group_size = 0
603
+ for record in group_records:
604
+ all_features.append(record['features'])
605
+ all_labels.append(record['label'])
606
+ group_size += 1
607
+ groups.append(group_size)
608
+
609
+ X = np.array(all_features, dtype=np.float64)
610
+ y = np.array(all_labels, dtype=np.float64)
611
+ total_samples = X.shape[0]
612
+
613
+ if total_samples < 10:
614
+ logger.warning("Too few samples after grouping: %d", total_samples)
615
+ return None
616
+
617
+ logger.info(
618
+ "Training bootstrap model: %d samples, %d groups, tier=%s",
619
+ total_samples, len(groups), tier
620
+ )
621
+
622
+ # Create LightGBM dataset
623
+ train_dataset = lgb.Dataset(
624
+ X, label=y, group=groups,
625
+ feature_name=list(FEATURE_NAMES),
626
+ free_raw_data=False,
627
+ )
628
+
629
+ # Use tiered n_estimators and max_depth
630
+ params = dict(BOOTSTRAP_PARAMS)
631
+ params['max_depth'] = config['max_depth']
632
+ n_estimators = config['n_estimators']
633
+
634
+ # Train
635
+ try:
636
+ booster = lgb.train(
637
+ params,
638
+ train_dataset,
639
+ num_boost_round=n_estimators,
640
+ valid_sets=[train_dataset],
641
+ valid_names=['train'],
642
+ callbacks=[lgb.log_evaluation(period=0)], # Silent
643
+ )
644
+ except Exception as e:
645
+ logger.error("Bootstrap training failed: %s", e)
646
+ return None
647
+
648
+ # Save model
649
+ MODELS_DIR.mkdir(parents=True, exist_ok=True)
650
+ try:
651
+ booster.save_model(str(MODEL_PATH))
652
+ logger.info("Bootstrap model saved to %s", MODEL_PATH)
653
+ except Exception as e:
654
+ logger.error("Failed to save bootstrap model: %s", e)
655
+ return None
656
+
657
+ # Extract NDCG@10 from training evaluation
658
+ ndcg_at_10 = None
659
+ try:
660
+ eval_results = booster.eval_train(
661
+ lgb.Dataset(X, label=y, group=groups)
662
+ )
663
+ for name, _dataset_name, value, _is_higher_better in eval_results:
664
+ if 'ndcg@10' in name:
665
+ ndcg_at_10 = value
666
+ break
667
+ except Exception:
668
+ pass
669
+
670
+ # Record metadata in learning_db
671
+ model_version = f"bootstrap_{tier}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
672
+ ldb = self._get_learning_db()
673
+ if ldb:
674
+ try:
675
+ ldb.record_model_training(
676
+ model_version=model_version,
677
+ training_samples=total_samples,
678
+ synthetic_samples=total_samples,
679
+ real_samples=0,
680
+ ndcg_at_10=ndcg_at_10,
681
+ model_path=str(MODEL_PATH),
682
+ )
683
+ except Exception as e:
684
+ logger.warning("Failed to record bootstrap metadata: %s", e)
685
+
686
+ metadata = {
687
+ 'model_version': model_version,
688
+ 'tier': tier,
689
+ 'training_samples': total_samples,
690
+ 'synthetic_samples': total_samples,
691
+ 'query_groups': len(groups),
692
+ 'n_estimators': n_estimators,
693
+ 'max_depth': config['max_depth'],
694
+ 'ndcg_at_10': ndcg_at_10,
695
+ 'model_path': str(MODEL_PATH),
696
+ 'source_breakdown': self._count_sources(records),
697
+ 'created_at': datetime.now().isoformat(),
698
+ }
699
+ logger.info("Bootstrap complete: %s", metadata)
700
+ return metadata
701
+
702
+ # ========================================================================
703
+ # Memory Database Queries (READ-ONLY on memory.db)
704
+ # ========================================================================
705
+
706
+ def _get_memories_by_access(self, min_access: int = 5) -> List[dict]:
707
+ """
708
+ Fetch memories with access_count >= min_access from memory.db.
709
+
710
+ These are memories the user keeps coming back to — strong positive signal.
711
+ """
712
+ if not self._memory_db.exists():
713
+ return []
714
+ try:
715
+ conn = sqlite3.connect(str(self._memory_db), timeout=5)
716
+ conn.row_factory = sqlite3.Row
717
+ cursor = conn.cursor()
718
+ cursor.execute('''
719
+ SELECT id, content, summary, project_name, tags,
720
+ category, importance, created_at, access_count
721
+ FROM memories
722
+ WHERE access_count >= ?
723
+ ORDER BY access_count DESC
724
+ LIMIT 100
725
+ ''', (min_access,))
726
+ results = [dict(row) for row in cursor.fetchall()]
727
+ conn.close()
728
+ return results
729
+ except Exception as e:
730
+ logger.warning("Failed to fetch high-access memories: %s", e)
731
+ return []
732
+
733
+ def _get_memories_by_importance(self, min_importance: int = 8) -> List[dict]:
734
+ """
735
+ Fetch memories with importance >= min_importance from memory.db.
736
+
737
+ High importance = user explicitly rated these as valuable.
738
+ """
739
+ if not self._memory_db.exists():
740
+ return []
741
+ try:
742
+ conn = sqlite3.connect(str(self._memory_db), timeout=5)
743
+ conn.row_factory = sqlite3.Row
744
+ cursor = conn.cursor()
745
+ cursor.execute('''
746
+ SELECT id, content, summary, project_name, tags,
747
+ category, importance, created_at, access_count
748
+ FROM memories
749
+ WHERE importance >= ?
750
+ ORDER BY importance DESC
751
+ LIMIT 100
752
+ ''', (min_importance,))
753
+ results = [dict(row) for row in cursor.fetchall()]
754
+ conn.close()
755
+ return results
756
+ except Exception as e:
757
+ logger.warning("Failed to fetch high-importance memories: %s", e)
758
+ return []
759
+
760
+ def _get_recent_memories(self, limit: int = 30) -> List[dict]:
761
+ """Fetch the N most recently created memories."""
762
+ if not self._memory_db.exists():
763
+ return []
764
+ try:
765
+ conn = sqlite3.connect(str(self._memory_db), timeout=5)
766
+ conn.row_factory = sqlite3.Row
767
+ cursor = conn.cursor()
768
+ cursor.execute('''
769
+ SELECT id, content, summary, project_name, tags,
770
+ category, importance, created_at, access_count
771
+ FROM memories
772
+ ORDER BY created_at DESC
773
+ LIMIT ?
774
+ ''', (limit,))
775
+ results = [dict(row) for row in cursor.fetchall()]
776
+ conn.close()
777
+ return results
778
+ except Exception as e:
779
+ logger.warning("Failed to fetch recent memories: %s", e)
780
+ return []
781
+
782
+ def _get_learned_patterns(
783
+ self,
784
+ min_confidence: float = 0.7,
785
+ ) -> List[dict]:
786
+ """
787
+ Fetch high-confidence identity_patterns from memory.db.
788
+
789
+ These are patterns detected by pattern_learner.py (Layer 4) —
790
+ tech preferences, coding style, terminology, etc.
791
+
792
+ Returns empty list if identity_patterns table doesn't exist
793
+ (backward compatible with pre-v2.3 databases).
794
+ """
795
+ if not self._memory_db.exists():
796
+ return []
797
+ try:
798
+ conn = sqlite3.connect(str(self._memory_db), timeout=5)
799
+ conn.row_factory = sqlite3.Row
800
+ cursor = conn.cursor()
801
+
802
+ # Check if table exists (backward compatibility)
803
+ cursor.execute('''
804
+ SELECT name FROM sqlite_master
805
+ WHERE type='table' AND name='identity_patterns'
806
+ ''')
807
+ if cursor.fetchone() is None:
808
+ conn.close()
809
+ return []
810
+
811
+ cursor.execute('''
812
+ SELECT id, pattern_type, key, value, confidence,
813
+ evidence_count, category
814
+ FROM identity_patterns
815
+ WHERE confidence >= ?
816
+ ORDER BY confidence DESC
817
+ LIMIT 50
818
+ ''', (min_confidence,))
819
+ results = [dict(row) for row in cursor.fetchall()]
820
+ conn.close()
821
+ return results
822
+ except Exception as e:
823
+ logger.warning("Failed to fetch learned patterns: %s", e)
824
+ return []
825
+
826
+ def _search_memories(self, query: str, limit: int = 20) -> List[dict]:
827
+ """
828
+ Simple FTS5 search in memory.db.
829
+
830
+ Used to find memories matching synthetic query terms.
831
+ This is a lightweight search — no TF-IDF, no HNSW, just FTS5.
832
+ """
833
+ if not self._memory_db.exists():
834
+ return []
835
+ if not query or not query.strip():
836
+ return []
837
+
838
+ try:
839
+ conn = sqlite3.connect(str(self._memory_db), timeout=5)
840
+ conn.row_factory = sqlite3.Row
841
+ cursor = conn.cursor()
842
+
843
+ # Clean query for FTS5 (same approach as memory_store_v2.search)
844
+ fts_tokens = re.findall(r'\w+', query)
845
+ if not fts_tokens:
846
+ conn.close()
847
+ return []
848
+ fts_query = ' OR '.join(fts_tokens)
849
+
850
+ cursor.execute('''
851
+ SELECT m.id, m.content, m.summary, m.project_name, m.tags,
852
+ m.category, m.importance, m.created_at, m.access_count
853
+ FROM memories m
854
+ JOIN memories_fts fts ON m.id = fts.rowid
855
+ WHERE memories_fts MATCH ?
856
+ ORDER BY rank
857
+ LIMIT ?
858
+ ''', (fts_query, limit))
859
+ results = [dict(row) for row in cursor.fetchall()]
860
+ conn.close()
861
+ return results
862
+ except Exception as e:
863
+ logger.debug("FTS5 search failed (may not exist yet): %s", e)
864
+ return []
865
+
866
+ def _find_negative_memories(
867
+ self,
868
+ anchor_memory: dict,
869
+ exclude_ids: Optional[Set[int]] = None,
870
+ limit: int = 2,
871
+ ) -> List[dict]:
872
+ """
873
+ Find memories dissimilar to the anchor (for negative examples).
874
+
875
+ Simple heuristic: pick memories from a different category or project.
876
+ Falls back to random sample if no structured differences available.
877
+ """
878
+ if not self._memory_db.exists():
879
+ return []
880
+ exclude_ids = exclude_ids or set()
881
+
882
+ try:
883
+ conn = sqlite3.connect(str(self._memory_db), timeout=5)
884
+ conn.row_factory = sqlite3.Row
885
+ cursor = conn.cursor()
886
+
887
+ anchor_project = anchor_memory.get('project_name', '')
888
+ anchor_category = anchor_memory.get('category', '')
889
+
890
+ # Try to find memories from different project or category
891
+ conditions = []
892
+ params: list = []
893
+
894
+ if anchor_project:
895
+ conditions.append('project_name != ?')
896
+ params.append(anchor_project)
897
+ if anchor_category:
898
+ conditions.append('category != ?')
899
+ params.append(anchor_category)
900
+
901
+ # Exclude specified IDs
902
+ if exclude_ids:
903
+ placeholders = ','.join('?' for _ in exclude_ids)
904
+ conditions.append(f'id NOT IN ({placeholders})')
905
+ params.extend(exclude_ids)
906
+
907
+ where_clause = ' AND '.join(conditions) if conditions else '1=1'
908
+
909
+ cursor.execute(f'''
910
+ SELECT id, content, summary, project_name, tags,
911
+ category, importance, created_at, access_count
912
+ FROM memories
913
+ WHERE {where_clause}
914
+ ORDER BY RANDOM()
915
+ LIMIT ?
916
+ ''', (*params, limit))
917
+ results = [dict(row) for row in cursor.fetchall()]
918
+ conn.close()
919
+ return results
920
+ except Exception as e:
921
+ logger.debug("Failed to find negative memories: %s", e)
922
+ return []
923
+
924
+ # ========================================================================
925
+ # Text Processing
926
+ # ========================================================================
927
+
928
+ def _extract_keywords(self, content: str, top_n: int = 3) -> List[str]:
929
+ """
930
+ Extract meaningful keywords from memory content.
931
+
932
+ Simple frequency-based extraction:
933
+ 1. Tokenize (alphanumeric words)
934
+ 2. Remove stopwords and short words
935
+ 3. Return top N by frequency
936
+
937
+ No external NLP dependencies — just regex + counter.
938
+ """
939
+ if not content:
940
+ return []
941
+
942
+ # Tokenize: extract alphanumeric words
943
+ words = re.findall(r'[a-zA-Z][a-zA-Z0-9_.-]*[a-zA-Z0-9]|[a-zA-Z]', content.lower())
944
+
945
+ # Filter stopwords and short words
946
+ meaningful = [
947
+ w for w in words
948
+ if w not in _STOPWORDS and len(w) >= _MIN_KEYWORD_LENGTH
949
+ ]
950
+
951
+ if not meaningful:
952
+ return []
953
+
954
+ # Count and return top N
955
+ counter = Counter(meaningful)
956
+ return [word for word, _count in counter.most_common(top_n)]
957
+
958
+ # ========================================================================
959
+ # Utility
960
+ # ========================================================================
961
+
962
+ def _diverse_sample(
963
+ self,
964
+ records: List[dict],
965
+ target: int,
966
+ ) -> List[dict]:
967
+ """
968
+ Sample records while maintaining source diversity.
969
+
970
+ Takes proportional samples from each source strategy to ensure
971
+ the training data isn't dominated by one strategy.
972
+ """
973
+ if len(records) <= target:
974
+ return records
975
+
976
+ # Group by source
977
+ by_source: Dict[str, List[dict]] = {}
978
+ for r in records:
979
+ src = r.get('source', 'unknown')
980
+ if src not in by_source:
981
+ by_source[src] = []
982
+ by_source[src].append(r)
983
+
984
+ # Proportional allocation
985
+ n_sources = len(by_source)
986
+ if n_sources == 0:
987
+ return records[:target]
988
+
989
+ per_source = max(1, target // n_sources)
990
+ sampled = []
991
+
992
+ for source, source_records in by_source.items():
993
+ # Take up to per_source from each, or all if fewer
994
+ take = min(len(source_records), per_source)
995
+ sampled.extend(source_records[:take])
996
+
997
+ # If under target, fill from remaining
998
+ if len(sampled) < target:
999
+ used_ids = {(r['query_hash'], r['memory_id']) for r in sampled}
1000
+ for r in records:
1001
+ if len(sampled) >= target:
1002
+ break
1003
+ key = (r['query_hash'], r['memory_id'])
1004
+ if key not in used_ids:
1005
+ sampled.append(r)
1006
+ used_ids.add(key)
1007
+
1008
+ return sampled[:target]
1009
+
1010
+ def _count_sources(self, records: List[dict]) -> Dict[str, int]:
1011
+ """Count records by source strategy."""
1012
+ counts: Dict[str, int] = {}
1013
+ for r in records:
1014
+ src = r.get('source', 'unknown')
1015
+ counts[src] = counts.get(src, 0) + 1
1016
+ return counts
1017
+
1018
+
1019
+ # ============================================================================
1020
+ # Module-level convenience
1021
+ # ============================================================================
1022
+
1023
def should_bootstrap(memory_db_path: Optional[Path] = None) -> bool:
    """Quick check if bootstrap is needed (creates temporary bootstrapper)."""
    try:
        return SyntheticBootstrapper(
            memory_db_path=memory_db_path
        ).should_bootstrap()
    except Exception:
        # Any failure (bad path, unreadable DB, ...) means "don't bootstrap".
        return False
1030
+
1031
+
1032
def run_bootstrap(
    memory_db_path: Optional[Path] = None,
    learning_db=None,
) -> Optional[Dict[str, Any]]:
    """Run bootstrap and return metadata (convenience function)."""
    try:
        bootstrapper = SyntheticBootstrapper(
            memory_db_path=memory_db_path,
            learning_db=learning_db,
        )
        if not bootstrapper.should_bootstrap():
            return None
        return bootstrapper.bootstrap_model()
    except Exception as e:
        logger.error("Bootstrap failed: %s", e)
        return None