superlocalmemory 2.6.5 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -0
- package/README.md +96 -13
- package/bin/slm +179 -3
- package/bin/superlocalmemoryv2:learning +4 -0
- package/bin/superlocalmemoryv2:patterns +4 -0
- package/docs/ARCHITECTURE.md +12 -6
- package/docs/MCP-MANUAL-SETUP.md +14 -4
- package/install.sh +99 -3
- package/mcp_server.py +291 -1
- package/package.json +2 -1
- package/requirements-learning.txt +12 -0
- package/scripts/verify-v27.sh +233 -0
- package/skills/slm-show-patterns/SKILL.md +224 -0
- package/src/learning/synthetic_bootstrap.py +1047 -0
- package/src/learning/tests/__init__.py +0 -0
- package/src/learning/tests/test_adaptive_ranker.py +328 -0
- package/src/learning/tests/test_aggregator.py +309 -0
- package/src/learning/tests/test_feedback_collector.py +295 -0
- package/src/learning/tests/test_learning_db.py +606 -0
- package/src/learning/tests/test_project_context.py +296 -0
- package/src/learning/tests/test_source_quality.py +355 -0
- package/src/learning/tests/test_synthetic_bootstrap.py +433 -0
- package/src/learning/tests/test_workflow_miner.py +322 -0
|
@@ -0,0 +1,1047 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SuperLocalMemory V2 - Synthetic Bootstrap (v2.7)
|
|
4
|
+
Copyright (c) 2026 Varun Pratap Bhardwaj
|
|
5
|
+
Licensed under MIT License
|
|
6
|
+
|
|
7
|
+
Repository: https://github.com/varun369/SuperLocalMemoryV2
|
|
8
|
+
Author: Varun Pratap Bhardwaj (Solution Architect)
|
|
9
|
+
|
|
10
|
+
NOTICE: This software is protected by MIT License.
|
|
11
|
+
Attribution must be preserved in all copies or derivatives.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
"""
|
|
15
|
+
SyntheticBootstrapper — Bootstrap ML model from existing data patterns.
|
|
16
|
+
|
|
17
|
+
PROBLEM: LightGBM needs 200+ feedback signals across 50+ unique queries
|
|
18
|
+
to activate ML ranking (Phase 2). A new user has zero feedback. Without
|
|
19
|
+
bootstrap, users must endure ~200 recalls before getting personalization.
|
|
20
|
+
That's weeks of usage with no benefit. Users abandon before reaching Phase 2.
|
|
21
|
+
|
|
22
|
+
SOLUTION: Generate synthetic (query, memory, relevance_label) tuples from
|
|
23
|
+
EXISTING data patterns in memory.db. These aren't real user feedback, but
|
|
24
|
+
they encode reasonable assumptions:
|
|
25
|
+
- Frequently accessed memories are probably relevant to their keywords
|
|
26
|
+
- High-importance memories should rank higher for their topics
|
|
27
|
+
- Learned patterns (from pattern_learner.py) encode real preferences
|
|
28
|
+
- Recent memories should generally outrank older ones
|
|
29
|
+
|
|
30
|
+
Four Strategies:
|
|
31
|
+
1. Access-based: Memories accessed 5+ times -> positive for their keywords
|
|
32
|
+
2. Importance-based: Importance >= 8 -> positive for their tags
|
|
33
|
+
3. Pattern-based: Learned identity_patterns -> positive for matching memories
|
|
34
|
+
4. Recency decay: For any synthetic query, recent memories rank higher
|
|
35
|
+
|
|
36
|
+
The bootstrap model uses MORE aggressive regularization than the real model
|
|
37
|
+
(fewer trees, smaller depth, higher reg_lambda) to prevent overfitting
|
|
38
|
+
on synthetic data. Once real feedback accumulates, the model is retrained
|
|
39
|
+
with continued learning (init_model), gradually replacing synthetic signal
|
|
40
|
+
with real signal.
|
|
41
|
+
|
|
42
|
+
Research Backing:
|
|
43
|
+
- FCS LREC 2024: Cold-start mitigation via synthetic bootstrap
|
|
44
|
+
- eKNOW 2025: BM25 -> re-ranker pipeline effectiveness
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
import hashlib
|
|
48
|
+
import logging
|
|
49
|
+
import re
|
|
50
|
+
import sqlite3
|
|
51
|
+
from collections import Counter
|
|
52
|
+
from datetime import datetime
|
|
53
|
+
from pathlib import Path
|
|
54
|
+
from typing import Any, Dict, List, Optional, Set
|
|
55
|
+
|
|
56
|
+
# LightGBM is OPTIONAL — bootstrap only works when LightGBM is installed
|
|
57
|
+
try:
|
|
58
|
+
import lightgbm as lgb
|
|
59
|
+
HAS_LIGHTGBM = True
|
|
60
|
+
except ImportError:
|
|
61
|
+
lgb = None
|
|
62
|
+
HAS_LIGHTGBM = False
|
|
63
|
+
|
|
64
|
+
try:
|
|
65
|
+
import numpy as np
|
|
66
|
+
HAS_NUMPY = True
|
|
67
|
+
except ImportError:
|
|
68
|
+
np = None
|
|
69
|
+
HAS_NUMPY = False
|
|
70
|
+
|
|
71
|
+
from .feature_extractor import FeatureExtractor, FEATURE_NAMES, NUM_FEATURES
|
|
72
|
+
|
|
73
|
+
logger = logging.getLogger("superlocalmemory.learning.synthetic_bootstrap")
|
|
74
|
+
|
|
75
|
+
# ============================================================================
# Constants
# ============================================================================

# All learning artifacts live beside the main memory store under ~/.claude-memory
MEMORY_DB_PATH = Path.home() / ".claude-memory" / "memory.db"
MODELS_DIR = Path.home() / ".claude-memory" / "models"
MODEL_PATH = MODELS_DIR / "ranker.txt"

# Minimum memories needed before bootstrap makes sense
MIN_MEMORIES_FOR_BOOTSTRAP = 50

# Tiered config — bootstrap model complexity scales with data size.
# Each tier covers a memory-count range [min_memories, max_memories] and
# sets both the synthetic sample budget and the model capacity for it.
BOOTSTRAP_CONFIG = {
    'small': {
        'min_memories': 50,
        'max_memories': 499,
        'target_samples': 200,
        'n_estimators': 30,
        'max_depth': 3,
    },
    'medium': {
        'min_memories': 500,
        'max_memories': 4999,
        'target_samples': 1000,
        'n_estimators': 50,
        'max_depth': 4,
    },
    'large': {
        'min_memories': 5000,
        'max_memories': float('inf'),
        'target_samples': 2000,
        'n_estimators': 100,
        'max_depth': 6,
    },
}

# LightGBM bootstrap parameters — MORE aggressive regularization than
# real training because synthetic data has systematic biases
BOOTSTRAP_PARAMS = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [5, 10],
    'learning_rate': 0.1,
    'num_leaves': 8,
    'max_depth': 3,  # overridden per tier in bootstrap_model()
    'min_child_samples': 5,
    'subsample': 0.7,
    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'boosting_type': 'dart',  # DART drops trees while boosting — extra regularization
    'verbose': -1,
}

# English stopwords for keyword extraction (no external deps)
_STOPWORDS = frozenset({
    'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'is', 'it', 'this', 'that', 'was', 'are',
    'be', 'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would',
    'could', 'should', 'may', 'might', 'can', 'not', 'no', 'if', 'then',
    'so', 'as', 'up', 'out', 'about', 'into', 'over', 'after', 'before',
    'when', 'where', 'how', 'what', 'which', 'who', 'whom', 'why',
    'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other',
    'some', 'such', 'than', 'too', 'very', 'just', 'also', 'now',
    'here', 'there', 'use', 'used', 'using', 'make', 'made',
    'need', 'needed', 'get', 'got', 'set', 'new', 'old', 'one', 'two',
})

# Minimum word length for keyword extraction
_MIN_KEYWORD_LENGTH = 3
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class SyntheticBootstrapper:
|
|
147
|
+
"""
|
|
148
|
+
Generates synthetic training data and bootstraps the ML ranking model.
|
|
149
|
+
|
|
150
|
+
Usage:
|
|
151
|
+
bootstrapper = SyntheticBootstrapper()
|
|
152
|
+
if bootstrapper.should_bootstrap():
|
|
153
|
+
result = bootstrapper.bootstrap_model()
|
|
154
|
+
if result:
|
|
155
|
+
print(f"Bootstrapped with {result['training_samples']} samples")
|
|
156
|
+
|
|
157
|
+
The bootstrapped model is saved to the same path as the real model.
|
|
158
|
+
When real feedback accumulates, AdaptiveRanker.train() uses
|
|
159
|
+
continued learning (init_model) to incrementally replace synthetic
|
|
160
|
+
signal with real signal.
|
|
161
|
+
"""
|
|
162
|
+
|
|
163
|
+
MIN_MEMORIES_FOR_BOOTSTRAP = MIN_MEMORIES_FOR_BOOTSTRAP
|
|
164
|
+
BOOTSTRAP_CONFIG = BOOTSTRAP_CONFIG
|
|
165
|
+
|
|
166
|
+
def __init__(
|
|
167
|
+
self,
|
|
168
|
+
memory_db_path: Optional[Path] = None,
|
|
169
|
+
learning_db=None,
|
|
170
|
+
):
|
|
171
|
+
"""
|
|
172
|
+
Initialize SyntheticBootstrapper.
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
memory_db_path: Path to memory.db (defaults to ~/.claude-memory/memory.db).
|
|
176
|
+
learning_db: Optional LearningDB instance for recording metadata.
|
|
177
|
+
"""
|
|
178
|
+
self._memory_db = Path(memory_db_path) if memory_db_path else MEMORY_DB_PATH
|
|
179
|
+
self._learning_db = learning_db
|
|
180
|
+
self._feature_extractor = FeatureExtractor()
|
|
181
|
+
|
|
182
|
+
# ========================================================================
|
|
183
|
+
# LearningDB Access
|
|
184
|
+
# ========================================================================
|
|
185
|
+
|
|
186
|
+
def _get_learning_db(self):
|
|
187
|
+
"""Get or create the LearningDB instance."""
|
|
188
|
+
if self._learning_db is None:
|
|
189
|
+
try:
|
|
190
|
+
from .learning_db import LearningDB
|
|
191
|
+
self._learning_db = LearningDB()
|
|
192
|
+
except Exception as e:
|
|
193
|
+
logger.warning("Cannot access LearningDB: %s", e)
|
|
194
|
+
return None
|
|
195
|
+
return self._learning_db
|
|
196
|
+
|
|
197
|
+
# ========================================================================
|
|
198
|
+
# Pre-flight Checks
|
|
199
|
+
# ========================================================================
|
|
200
|
+
|
|
201
|
+
def should_bootstrap(self) -> bool:
|
|
202
|
+
"""
|
|
203
|
+
Check if synthetic bootstrap is needed and possible.
|
|
204
|
+
|
|
205
|
+
Returns True if:
|
|
206
|
+
1. LightGBM + NumPy are available
|
|
207
|
+
2. No existing model file (or forced rebuild)
|
|
208
|
+
3. At least MIN_MEMORIES_FOR_BOOTSTRAP memories exist in memory.db
|
|
209
|
+
"""
|
|
210
|
+
if not HAS_LIGHTGBM or not HAS_NUMPY:
|
|
211
|
+
logger.debug("Bootstrap unavailable: LightGBM=%s, NumPy=%s",
|
|
212
|
+
HAS_LIGHTGBM, HAS_NUMPY)
|
|
213
|
+
return False
|
|
214
|
+
|
|
215
|
+
if MODEL_PATH.exists():
|
|
216
|
+
logger.debug("Model already exists at %s — skipping bootstrap",
|
|
217
|
+
MODEL_PATH)
|
|
218
|
+
return False
|
|
219
|
+
|
|
220
|
+
memory_count = self._get_memory_count()
|
|
221
|
+
if memory_count < MIN_MEMORIES_FOR_BOOTSTRAP:
|
|
222
|
+
logger.debug(
|
|
223
|
+
"Not enough memories for bootstrap: %d (need %d)",
|
|
224
|
+
memory_count, MIN_MEMORIES_FOR_BOOTSTRAP
|
|
225
|
+
)
|
|
226
|
+
return False
|
|
227
|
+
|
|
228
|
+
return True
|
|
229
|
+
|
|
230
|
+
def get_tier(self) -> Optional[str]:
|
|
231
|
+
"""
|
|
232
|
+
Determine bootstrap tier based on memory count.
|
|
233
|
+
|
|
234
|
+
Returns:
|
|
235
|
+
'small', 'medium', 'large', or None if < MIN_MEMORIES.
|
|
236
|
+
"""
|
|
237
|
+
count = self._get_memory_count()
|
|
238
|
+
for tier_name, config in BOOTSTRAP_CONFIG.items():
|
|
239
|
+
if config['min_memories'] <= count <= config['max_memories']:
|
|
240
|
+
return tier_name
|
|
241
|
+
return None
|
|
242
|
+
|
|
243
|
+
def _get_memory_count(self) -> int:
|
|
244
|
+
"""Count total memories in memory.db."""
|
|
245
|
+
if not self._memory_db.exists():
|
|
246
|
+
return 0
|
|
247
|
+
try:
|
|
248
|
+
conn = sqlite3.connect(str(self._memory_db), timeout=5)
|
|
249
|
+
cursor = conn.cursor()
|
|
250
|
+
cursor.execute('SELECT COUNT(*) FROM memories')
|
|
251
|
+
count = cursor.fetchone()[0]
|
|
252
|
+
conn.close()
|
|
253
|
+
return count
|
|
254
|
+
except Exception as e:
|
|
255
|
+
logger.warning("Failed to count memories: %s", e)
|
|
256
|
+
return 0
|
|
257
|
+
|
|
258
|
+
# ========================================================================
|
|
259
|
+
# Synthetic Data Generation
|
|
260
|
+
# ========================================================================
|
|
261
|
+
|
|
262
|
+
def generate_synthetic_training_data(self) -> List[dict]:
|
|
263
|
+
"""
|
|
264
|
+
Generate synthetic (query, memory, label, features) records.
|
|
265
|
+
|
|
266
|
+
Combines four strategies to produce training data from existing
|
|
267
|
+
memory patterns. Each record contains:
|
|
268
|
+
- query: Synthetic query string (extracted keywords)
|
|
269
|
+
- memory_id: ID of the memory in memory.db
|
|
270
|
+
- label: Relevance label (0.0 = irrelevant, 1.0 = highly relevant)
|
|
271
|
+
- source: Which strategy generated this record
|
|
272
|
+
- features: 9-dimensional feature vector
|
|
273
|
+
|
|
274
|
+
Returns:
|
|
275
|
+
List of training record dicts. May be empty if insufficient data.
|
|
276
|
+
"""
|
|
277
|
+
records = []
|
|
278
|
+
|
|
279
|
+
# Strategy 1: Access-based pseudo-labels
|
|
280
|
+
access_records = self._generate_access_based()
|
|
281
|
+
records.extend(access_records)
|
|
282
|
+
logger.info("Strategy 1 (access): %d records", len(access_records))
|
|
283
|
+
|
|
284
|
+
# Strategy 2: Importance-based pseudo-labels
|
|
285
|
+
importance_records = self._generate_importance_based()
|
|
286
|
+
records.extend(importance_records)
|
|
287
|
+
logger.info("Strategy 2 (importance): %d records",
|
|
288
|
+
len(importance_records))
|
|
289
|
+
|
|
290
|
+
# Strategy 3: Pattern-based synthetic queries
|
|
291
|
+
pattern_records = self._generate_pattern_based()
|
|
292
|
+
records.extend(pattern_records)
|
|
293
|
+
logger.info("Strategy 3 (patterns): %d records", len(pattern_records))
|
|
294
|
+
|
|
295
|
+
# Strategy 4: Recency decay pseudo-labels
|
|
296
|
+
recency_records = self._generate_recency_based()
|
|
297
|
+
records.extend(recency_records)
|
|
298
|
+
logger.info("Strategy 4 (recency): %d records", len(recency_records))
|
|
299
|
+
|
|
300
|
+
logger.info("Total synthetic records: %d", len(records))
|
|
301
|
+
return records
|
|
302
|
+
|
|
303
|
+
def _generate_access_based(self) -> List[dict]:
|
|
304
|
+
"""
|
|
305
|
+
Strategy 1: Memories accessed 5+ times are relevant for their keywords.
|
|
306
|
+
|
|
307
|
+
Logic: If a user keeps coming back to a memory via certain searches,
|
|
308
|
+
the keywords in that memory are relevant queries for it.
|
|
309
|
+
"""
|
|
310
|
+
records = []
|
|
311
|
+
high_access_memories = self._get_memories_by_access(min_access=5)
|
|
312
|
+
|
|
313
|
+
for memory in high_access_memories:
|
|
314
|
+
keywords = self._extract_keywords(memory.get('content', ''))
|
|
315
|
+
if not keywords:
|
|
316
|
+
continue
|
|
317
|
+
|
|
318
|
+
query = ' '.join(keywords)
|
|
319
|
+
|
|
320
|
+
# Positive: This memory is relevant to its own keywords
|
|
321
|
+
records.append(self._build_record(
|
|
322
|
+
query=query,
|
|
323
|
+
memory=memory,
|
|
324
|
+
label=1.0,
|
|
325
|
+
source='access_positive',
|
|
326
|
+
))
|
|
327
|
+
|
|
328
|
+
# Find some non-matching memories as negatives
|
|
329
|
+
negatives = self._find_negative_memories(
|
|
330
|
+
memory, exclude_ids={memory['id']}, limit=2
|
|
331
|
+
)
|
|
332
|
+
for neg_memory in negatives:
|
|
333
|
+
records.append(self._build_record(
|
|
334
|
+
query=query,
|
|
335
|
+
memory=neg_memory,
|
|
336
|
+
label=0.0,
|
|
337
|
+
source='access_negative',
|
|
338
|
+
))
|
|
339
|
+
|
|
340
|
+
return records
|
|
341
|
+
|
|
342
|
+
def _generate_importance_based(self) -> List[dict]:
|
|
343
|
+
"""
|
|
344
|
+
Strategy 2: High-importance memories (>= 8) are positive for their tags.
|
|
345
|
+
|
|
346
|
+
Logic: User explicitly rated these memories as important. Their tags
|
|
347
|
+
represent topics the user cares about.
|
|
348
|
+
"""
|
|
349
|
+
records = []
|
|
350
|
+
important_memories = self._get_memories_by_importance(min_importance=8)
|
|
351
|
+
|
|
352
|
+
for memory in important_memories:
|
|
353
|
+
# Use tags as synthetic query, fall back to content keywords
|
|
354
|
+
tags = memory.get('tags', '')
|
|
355
|
+
if isinstance(tags, str):
|
|
356
|
+
try:
|
|
357
|
+
import json
|
|
358
|
+
tags_list = json.loads(tags)
|
|
359
|
+
except (ValueError, TypeError):
|
|
360
|
+
tags_list = [t.strip() for t in tags.split(',') if t.strip()]
|
|
361
|
+
elif isinstance(tags, list):
|
|
362
|
+
tags_list = tags
|
|
363
|
+
else:
|
|
364
|
+
tags_list = []
|
|
365
|
+
|
|
366
|
+
if tags_list:
|
|
367
|
+
query = ' '.join(tags_list[:5])
|
|
368
|
+
else:
|
|
369
|
+
keywords = self._extract_keywords(memory.get('content', ''))
|
|
370
|
+
query = ' '.join(keywords) if keywords else ''
|
|
371
|
+
|
|
372
|
+
if not query:
|
|
373
|
+
continue
|
|
374
|
+
|
|
375
|
+
# Positive: High-importance memory matches its tags
|
|
376
|
+
records.append(self._build_record(
|
|
377
|
+
query=query,
|
|
378
|
+
memory=memory,
|
|
379
|
+
label=1.0,
|
|
380
|
+
source='importance_positive',
|
|
381
|
+
))
|
|
382
|
+
|
|
383
|
+
# Find some negatives
|
|
384
|
+
negatives = self._find_negative_memories(
|
|
385
|
+
memory, exclude_ids={memory['id']}, limit=2
|
|
386
|
+
)
|
|
387
|
+
for neg_memory in negatives:
|
|
388
|
+
records.append(self._build_record(
|
|
389
|
+
query=query,
|
|
390
|
+
memory=neg_memory,
|
|
391
|
+
label=0.0,
|
|
392
|
+
source='importance_negative',
|
|
393
|
+
))
|
|
394
|
+
|
|
395
|
+
return records
|
|
396
|
+
|
|
397
|
+
def _generate_pattern_based(self) -> List[dict]:
|
|
398
|
+
"""
|
|
399
|
+
Strategy 3: Use learned identity_patterns to create synthetic queries.
|
|
400
|
+
|
|
401
|
+
Logic: Pattern learner has already identified user's tech preferences,
|
|
402
|
+
coding style, etc. Use these as queries and find matching memories.
|
|
403
|
+
"""
|
|
404
|
+
records = []
|
|
405
|
+
patterns = self._get_learned_patterns(min_confidence=0.7)
|
|
406
|
+
|
|
407
|
+
if not patterns:
|
|
408
|
+
return records
|
|
409
|
+
|
|
410
|
+
for pattern in patterns:
|
|
411
|
+
# Build query from pattern key + value
|
|
412
|
+
query_parts = []
|
|
413
|
+
key = pattern.get('key', '')
|
|
414
|
+
value = pattern.get('value', '')
|
|
415
|
+
if key:
|
|
416
|
+
query_parts.append(key)
|
|
417
|
+
if value and value != key:
|
|
418
|
+
query_parts.append(value)
|
|
419
|
+
|
|
420
|
+
query = ' '.join(query_parts)
|
|
421
|
+
if not query or len(query) < 3:
|
|
422
|
+
continue
|
|
423
|
+
|
|
424
|
+
# Search for memories matching this pattern
|
|
425
|
+
matching = self._search_memories(query, limit=10)
|
|
426
|
+
|
|
427
|
+
if len(matching) < 2:
|
|
428
|
+
continue
|
|
429
|
+
|
|
430
|
+
# Top results are positive, bottom results are weak negatives
|
|
431
|
+
for i, memory in enumerate(matching):
|
|
432
|
+
if i < 3:
|
|
433
|
+
label = 1.0 # Top matches are relevant
|
|
434
|
+
elif i < 6:
|
|
435
|
+
label = 0.5 # Middle matches are weakly relevant
|
|
436
|
+
else:
|
|
437
|
+
label = 0.1 # Bottom matches are marginal
|
|
438
|
+
|
|
439
|
+
records.append(self._build_record(
|
|
440
|
+
query=query,
|
|
441
|
+
memory=memory,
|
|
442
|
+
label=label,
|
|
443
|
+
source='pattern',
|
|
444
|
+
))
|
|
445
|
+
|
|
446
|
+
return records
|
|
447
|
+
|
|
448
|
+
def _generate_recency_based(self) -> List[dict]:
|
|
449
|
+
"""
|
|
450
|
+
Strategy 4: Recency decay — for shared-topic queries, recent wins.
|
|
451
|
+
|
|
452
|
+
Logic: For memories about the same topic, more recent memories
|
|
453
|
+
should generally rank higher (fresher context, more current).
|
|
454
|
+
Generates pairs where newer = positive, older = weak negative.
|
|
455
|
+
"""
|
|
456
|
+
records = []
|
|
457
|
+
|
|
458
|
+
# Get a sample of recent and old memories
|
|
459
|
+
recent = self._get_recent_memories(limit=30)
|
|
460
|
+
if len(recent) < 4:
|
|
461
|
+
return records
|
|
462
|
+
|
|
463
|
+
# Take pairs: for each recent memory's keywords, create a query
|
|
464
|
+
# then the recent memory is positive and older memories are negative
|
|
465
|
+
processed_queries: Set[str] = set()
|
|
466
|
+
|
|
467
|
+
for memory in recent[:15]:
|
|
468
|
+
keywords = self._extract_keywords(memory.get('content', ''))
|
|
469
|
+
query = ' '.join(keywords) if keywords else ''
|
|
470
|
+
if not query or query in processed_queries:
|
|
471
|
+
continue
|
|
472
|
+
processed_queries.add(query)
|
|
473
|
+
|
|
474
|
+
# This recent memory is positive
|
|
475
|
+
records.append(self._build_record(
|
|
476
|
+
query=query,
|
|
477
|
+
memory=memory,
|
|
478
|
+
label=0.8, # Good but not perfect (it's synthetic)
|
|
479
|
+
source='recency_positive',
|
|
480
|
+
))
|
|
481
|
+
|
|
482
|
+
# Find older memories about similar topic
|
|
483
|
+
similar_old = self._search_memories(query, limit=5)
|
|
484
|
+
for old_mem in similar_old:
|
|
485
|
+
if old_mem['id'] == memory['id']:
|
|
486
|
+
continue
|
|
487
|
+
# Older memories get lower label
|
|
488
|
+
records.append(self._build_record(
|
|
489
|
+
query=query,
|
|
490
|
+
memory=old_mem,
|
|
491
|
+
label=0.3,
|
|
492
|
+
source='recency_negative',
|
|
493
|
+
))
|
|
494
|
+
|
|
495
|
+
return records
|
|
496
|
+
|
|
497
|
+
# ========================================================================
|
|
498
|
+
# Record Building
|
|
499
|
+
# ========================================================================
|
|
500
|
+
|
|
501
|
+
def _build_record(
|
|
502
|
+
self,
|
|
503
|
+
query: str,
|
|
504
|
+
memory: dict,
|
|
505
|
+
label: float,
|
|
506
|
+
source: str,
|
|
507
|
+
) -> dict:
|
|
508
|
+
"""
|
|
509
|
+
Build a training record with features.
|
|
510
|
+
|
|
511
|
+
For synthetic data, we use simplified context:
|
|
512
|
+
- No tech preferences (unknown at bootstrap time)
|
|
513
|
+
- No current project
|
|
514
|
+
- No workflow phase
|
|
515
|
+
Focus on measurable features: importance, recency, access_frequency.
|
|
516
|
+
"""
|
|
517
|
+
# Set neutral context (no query-time info for synthetic data)
|
|
518
|
+
# Context is already set externally or defaults to neutral
|
|
519
|
+
features = self._feature_extractor.extract_features(memory, query)
|
|
520
|
+
|
|
521
|
+
return {
|
|
522
|
+
'query': query,
|
|
523
|
+
'query_hash': hashlib.sha256(query.encode()).hexdigest()[:16],
|
|
524
|
+
'memory_id': memory.get('id', 0),
|
|
525
|
+
'label': label,
|
|
526
|
+
'source': source,
|
|
527
|
+
'features': features,
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
# ========================================================================
|
|
531
|
+
# Model Training
|
|
532
|
+
# ========================================================================
|
|
533
|
+
|
|
534
|
+
def bootstrap_model(self) -> Optional[Dict[str, Any]]:
|
|
535
|
+
"""
|
|
536
|
+
Generate synthetic data and train the bootstrap model.
|
|
537
|
+
|
|
538
|
+
Steps:
|
|
539
|
+
1. Generate synthetic training data
|
|
540
|
+
2. Build feature matrix and label vectors
|
|
541
|
+
3. Train LightGBM with aggressive regularization
|
|
542
|
+
4. Save model to ~/.claude-memory/models/ranker.txt
|
|
543
|
+
5. Record metadata in learning_db
|
|
544
|
+
6. Return metadata
|
|
545
|
+
|
|
546
|
+
Returns:
|
|
547
|
+
Training metadata dict, or None if bootstrap not possible.
|
|
548
|
+
"""
|
|
549
|
+
if not HAS_LIGHTGBM or not HAS_NUMPY:
|
|
550
|
+
logger.warning("Bootstrap requires LightGBM and NumPy")
|
|
551
|
+
return None
|
|
552
|
+
|
|
553
|
+
tier = self.get_tier()
|
|
554
|
+
if tier is None:
|
|
555
|
+
logger.info("Not enough memories for bootstrap")
|
|
556
|
+
return None
|
|
557
|
+
|
|
558
|
+
config = BOOTSTRAP_CONFIG[tier]
|
|
559
|
+
logger.info(
|
|
560
|
+
"Starting bootstrap (tier=%s, target=%d samples)",
|
|
561
|
+
tier, config['target_samples']
|
|
562
|
+
)
|
|
563
|
+
|
|
564
|
+
# Set neutral context for feature extraction
|
|
565
|
+
self._feature_extractor.set_context()
|
|
566
|
+
|
|
567
|
+
# Generate synthetic data
|
|
568
|
+
records = self.generate_synthetic_training_data()
|
|
569
|
+
if not records:
|
|
570
|
+
logger.warning("No synthetic records generated")
|
|
571
|
+
return None
|
|
572
|
+
|
|
573
|
+
# Trim to target sample count if needed
|
|
574
|
+
if len(records) > config['target_samples']:
|
|
575
|
+
# Keep a diverse sample across sources
|
|
576
|
+
records = self._diverse_sample(records, config['target_samples'])
|
|
577
|
+
|
|
578
|
+
# Group by query_hash for LGBMRanker
|
|
579
|
+
query_groups: Dict[str, List[dict]] = {}
|
|
580
|
+
for record in records:
|
|
581
|
+
qh = record['query_hash']
|
|
582
|
+
if qh not in query_groups:
|
|
583
|
+
query_groups[qh] = []
|
|
584
|
+
query_groups[qh].append(record)
|
|
585
|
+
|
|
586
|
+
# Filter: only keep groups with 2+ items
|
|
587
|
+
query_groups = {
|
|
588
|
+
qh: recs for qh, recs in query_groups.items()
|
|
589
|
+
if len(recs) >= 2
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
if not query_groups:
|
|
593
|
+
logger.warning("No valid query groups (need 2+ records per group)")
|
|
594
|
+
return None
|
|
595
|
+
|
|
596
|
+
# Build matrices
|
|
597
|
+
all_features = []
|
|
598
|
+
all_labels = []
|
|
599
|
+
groups = []
|
|
600
|
+
|
|
601
|
+
for qh, group_records in query_groups.items():
|
|
602
|
+
group_size = 0
|
|
603
|
+
for record in group_records:
|
|
604
|
+
all_features.append(record['features'])
|
|
605
|
+
all_labels.append(record['label'])
|
|
606
|
+
group_size += 1
|
|
607
|
+
groups.append(group_size)
|
|
608
|
+
|
|
609
|
+
X = np.array(all_features, dtype=np.float64)
|
|
610
|
+
y = np.array(all_labels, dtype=np.float64)
|
|
611
|
+
total_samples = X.shape[0]
|
|
612
|
+
|
|
613
|
+
if total_samples < 10:
|
|
614
|
+
logger.warning("Too few samples after grouping: %d", total_samples)
|
|
615
|
+
return None
|
|
616
|
+
|
|
617
|
+
logger.info(
|
|
618
|
+
"Training bootstrap model: %d samples, %d groups, tier=%s",
|
|
619
|
+
total_samples, len(groups), tier
|
|
620
|
+
)
|
|
621
|
+
|
|
622
|
+
# Create LightGBM dataset
|
|
623
|
+
train_dataset = lgb.Dataset(
|
|
624
|
+
X, label=y, group=groups,
|
|
625
|
+
feature_name=list(FEATURE_NAMES),
|
|
626
|
+
free_raw_data=False,
|
|
627
|
+
)
|
|
628
|
+
|
|
629
|
+
# Use tiered n_estimators and max_depth
|
|
630
|
+
params = dict(BOOTSTRAP_PARAMS)
|
|
631
|
+
params['max_depth'] = config['max_depth']
|
|
632
|
+
n_estimators = config['n_estimators']
|
|
633
|
+
|
|
634
|
+
# Train
|
|
635
|
+
try:
|
|
636
|
+
booster = lgb.train(
|
|
637
|
+
params,
|
|
638
|
+
train_dataset,
|
|
639
|
+
num_boost_round=n_estimators,
|
|
640
|
+
valid_sets=[train_dataset],
|
|
641
|
+
valid_names=['train'],
|
|
642
|
+
callbacks=[lgb.log_evaluation(period=0)], # Silent
|
|
643
|
+
)
|
|
644
|
+
except Exception as e:
|
|
645
|
+
logger.error("Bootstrap training failed: %s", e)
|
|
646
|
+
return None
|
|
647
|
+
|
|
648
|
+
# Save model
|
|
649
|
+
MODELS_DIR.mkdir(parents=True, exist_ok=True)
|
|
650
|
+
try:
|
|
651
|
+
booster.save_model(str(MODEL_PATH))
|
|
652
|
+
logger.info("Bootstrap model saved to %s", MODEL_PATH)
|
|
653
|
+
except Exception as e:
|
|
654
|
+
logger.error("Failed to save bootstrap model: %s", e)
|
|
655
|
+
return None
|
|
656
|
+
|
|
657
|
+
# Extract NDCG@10 from training evaluation
|
|
658
|
+
ndcg_at_10 = None
|
|
659
|
+
try:
|
|
660
|
+
eval_results = booster.eval_train(
|
|
661
|
+
lgb.Dataset(X, label=y, group=groups)
|
|
662
|
+
)
|
|
663
|
+
for name, _dataset_name, value, _is_higher_better in eval_results:
|
|
664
|
+
if 'ndcg@10' in name:
|
|
665
|
+
ndcg_at_10 = value
|
|
666
|
+
break
|
|
667
|
+
except Exception:
|
|
668
|
+
pass
|
|
669
|
+
|
|
670
|
+
# Record metadata in learning_db
|
|
671
|
+
model_version = f"bootstrap_{tier}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
|
672
|
+
ldb = self._get_learning_db()
|
|
673
|
+
if ldb:
|
|
674
|
+
try:
|
|
675
|
+
ldb.record_model_training(
|
|
676
|
+
model_version=model_version,
|
|
677
|
+
training_samples=total_samples,
|
|
678
|
+
synthetic_samples=total_samples,
|
|
679
|
+
real_samples=0,
|
|
680
|
+
ndcg_at_10=ndcg_at_10,
|
|
681
|
+
model_path=str(MODEL_PATH),
|
|
682
|
+
)
|
|
683
|
+
except Exception as e:
|
|
684
|
+
logger.warning("Failed to record bootstrap metadata: %s", e)
|
|
685
|
+
|
|
686
|
+
metadata = {
|
|
687
|
+
'model_version': model_version,
|
|
688
|
+
'tier': tier,
|
|
689
|
+
'training_samples': total_samples,
|
|
690
|
+
'synthetic_samples': total_samples,
|
|
691
|
+
'query_groups': len(groups),
|
|
692
|
+
'n_estimators': n_estimators,
|
|
693
|
+
'max_depth': config['max_depth'],
|
|
694
|
+
'ndcg_at_10': ndcg_at_10,
|
|
695
|
+
'model_path': str(MODEL_PATH),
|
|
696
|
+
'source_breakdown': self._count_sources(records),
|
|
697
|
+
'created_at': datetime.now().isoformat(),
|
|
698
|
+
}
|
|
699
|
+
logger.info("Bootstrap complete: %s", metadata)
|
|
700
|
+
return metadata
|
|
701
|
+
|
|
702
|
+
# ========================================================================
|
|
703
|
+
# Memory Database Queries (READ-ONLY on memory.db)
|
|
704
|
+
# ========================================================================
|
|
705
|
+
|
|
706
|
+
def _get_memories_by_access(self, min_access: int = 5) -> List[dict]:
|
|
707
|
+
"""
|
|
708
|
+
Fetch memories with access_count >= min_access from memory.db.
|
|
709
|
+
|
|
710
|
+
These are memories the user keeps coming back to — strong positive signal.
|
|
711
|
+
"""
|
|
712
|
+
if not self._memory_db.exists():
|
|
713
|
+
return []
|
|
714
|
+
try:
|
|
715
|
+
conn = sqlite3.connect(str(self._memory_db), timeout=5)
|
|
716
|
+
conn.row_factory = sqlite3.Row
|
|
717
|
+
cursor = conn.cursor()
|
|
718
|
+
cursor.execute('''
|
|
719
|
+
SELECT id, content, summary, project_name, tags,
|
|
720
|
+
category, importance, created_at, access_count
|
|
721
|
+
FROM memories
|
|
722
|
+
WHERE access_count >= ?
|
|
723
|
+
ORDER BY access_count DESC
|
|
724
|
+
LIMIT 100
|
|
725
|
+
''', (min_access,))
|
|
726
|
+
results = [dict(row) for row in cursor.fetchall()]
|
|
727
|
+
conn.close()
|
|
728
|
+
return results
|
|
729
|
+
except Exception as e:
|
|
730
|
+
logger.warning("Failed to fetch high-access memories: %s", e)
|
|
731
|
+
return []
|
|
732
|
+
|
|
733
|
+
def _get_memories_by_importance(self, min_importance: int = 8) -> List[dict]:
|
|
734
|
+
"""
|
|
735
|
+
Fetch memories with importance >= min_importance from memory.db.
|
|
736
|
+
|
|
737
|
+
High importance = user explicitly rated these as valuable.
|
|
738
|
+
"""
|
|
739
|
+
if not self._memory_db.exists():
|
|
740
|
+
return []
|
|
741
|
+
try:
|
|
742
|
+
conn = sqlite3.connect(str(self._memory_db), timeout=5)
|
|
743
|
+
conn.row_factory = sqlite3.Row
|
|
744
|
+
cursor = conn.cursor()
|
|
745
|
+
cursor.execute('''
|
|
746
|
+
SELECT id, content, summary, project_name, tags,
|
|
747
|
+
category, importance, created_at, access_count
|
|
748
|
+
FROM memories
|
|
749
|
+
WHERE importance >= ?
|
|
750
|
+
ORDER BY importance DESC
|
|
751
|
+
LIMIT 100
|
|
752
|
+
''', (min_importance,))
|
|
753
|
+
results = [dict(row) for row in cursor.fetchall()]
|
|
754
|
+
conn.close()
|
|
755
|
+
return results
|
|
756
|
+
except Exception as e:
|
|
757
|
+
logger.warning("Failed to fetch high-importance memories: %s", e)
|
|
758
|
+
return []
|
|
759
|
+
|
|
760
|
+
def _get_recent_memories(self, limit: int = 30) -> List[dict]:
|
|
761
|
+
"""Fetch the N most recently created memories."""
|
|
762
|
+
if not self._memory_db.exists():
|
|
763
|
+
return []
|
|
764
|
+
try:
|
|
765
|
+
conn = sqlite3.connect(str(self._memory_db), timeout=5)
|
|
766
|
+
conn.row_factory = sqlite3.Row
|
|
767
|
+
cursor = conn.cursor()
|
|
768
|
+
cursor.execute('''
|
|
769
|
+
SELECT id, content, summary, project_name, tags,
|
|
770
|
+
category, importance, created_at, access_count
|
|
771
|
+
FROM memories
|
|
772
|
+
ORDER BY created_at DESC
|
|
773
|
+
LIMIT ?
|
|
774
|
+
''', (limit,))
|
|
775
|
+
results = [dict(row) for row in cursor.fetchall()]
|
|
776
|
+
conn.close()
|
|
777
|
+
return results
|
|
778
|
+
except Exception as e:
|
|
779
|
+
logger.warning("Failed to fetch recent memories: %s", e)
|
|
780
|
+
return []
|
|
781
|
+
|
|
782
|
+
def _get_learned_patterns(
|
|
783
|
+
self,
|
|
784
|
+
min_confidence: float = 0.7,
|
|
785
|
+
) -> List[dict]:
|
|
786
|
+
"""
|
|
787
|
+
Fetch high-confidence identity_patterns from memory.db.
|
|
788
|
+
|
|
789
|
+
These are patterns detected by pattern_learner.py (Layer 4) —
|
|
790
|
+
tech preferences, coding style, terminology, etc.
|
|
791
|
+
|
|
792
|
+
Returns empty list if identity_patterns table doesn't exist
|
|
793
|
+
(backward compatible with pre-v2.3 databases).
|
|
794
|
+
"""
|
|
795
|
+
if not self._memory_db.exists():
|
|
796
|
+
return []
|
|
797
|
+
try:
|
|
798
|
+
conn = sqlite3.connect(str(self._memory_db), timeout=5)
|
|
799
|
+
conn.row_factory = sqlite3.Row
|
|
800
|
+
cursor = conn.cursor()
|
|
801
|
+
|
|
802
|
+
# Check if table exists (backward compatibility)
|
|
803
|
+
cursor.execute('''
|
|
804
|
+
SELECT name FROM sqlite_master
|
|
805
|
+
WHERE type='table' AND name='identity_patterns'
|
|
806
|
+
''')
|
|
807
|
+
if cursor.fetchone() is None:
|
|
808
|
+
conn.close()
|
|
809
|
+
return []
|
|
810
|
+
|
|
811
|
+
cursor.execute('''
|
|
812
|
+
SELECT id, pattern_type, key, value, confidence,
|
|
813
|
+
evidence_count, category
|
|
814
|
+
FROM identity_patterns
|
|
815
|
+
WHERE confidence >= ?
|
|
816
|
+
ORDER BY confidence DESC
|
|
817
|
+
LIMIT 50
|
|
818
|
+
''', (min_confidence,))
|
|
819
|
+
results = [dict(row) for row in cursor.fetchall()]
|
|
820
|
+
conn.close()
|
|
821
|
+
return results
|
|
822
|
+
except Exception as e:
|
|
823
|
+
logger.warning("Failed to fetch learned patterns: %s", e)
|
|
824
|
+
return []
|
|
825
|
+
|
|
826
|
+
def _search_memories(self, query: str, limit: int = 20) -> List[dict]:
|
|
827
|
+
"""
|
|
828
|
+
Simple FTS5 search in memory.db.
|
|
829
|
+
|
|
830
|
+
Used to find memories matching synthetic query terms.
|
|
831
|
+
This is a lightweight search — no TF-IDF, no HNSW, just FTS5.
|
|
832
|
+
"""
|
|
833
|
+
if not self._memory_db.exists():
|
|
834
|
+
return []
|
|
835
|
+
if not query or not query.strip():
|
|
836
|
+
return []
|
|
837
|
+
|
|
838
|
+
try:
|
|
839
|
+
conn = sqlite3.connect(str(self._memory_db), timeout=5)
|
|
840
|
+
conn.row_factory = sqlite3.Row
|
|
841
|
+
cursor = conn.cursor()
|
|
842
|
+
|
|
843
|
+
# Clean query for FTS5 (same approach as memory_store_v2.search)
|
|
844
|
+
fts_tokens = re.findall(r'\w+', query)
|
|
845
|
+
if not fts_tokens:
|
|
846
|
+
conn.close()
|
|
847
|
+
return []
|
|
848
|
+
fts_query = ' OR '.join(fts_tokens)
|
|
849
|
+
|
|
850
|
+
cursor.execute('''
|
|
851
|
+
SELECT m.id, m.content, m.summary, m.project_name, m.tags,
|
|
852
|
+
m.category, m.importance, m.created_at, m.access_count
|
|
853
|
+
FROM memories m
|
|
854
|
+
JOIN memories_fts fts ON m.id = fts.rowid
|
|
855
|
+
WHERE memories_fts MATCH ?
|
|
856
|
+
ORDER BY rank
|
|
857
|
+
LIMIT ?
|
|
858
|
+
''', (fts_query, limit))
|
|
859
|
+
results = [dict(row) for row in cursor.fetchall()]
|
|
860
|
+
conn.close()
|
|
861
|
+
return results
|
|
862
|
+
except Exception as e:
|
|
863
|
+
logger.debug("FTS5 search failed (may not exist yet): %s", e)
|
|
864
|
+
return []
|
|
865
|
+
|
|
866
|
+
def _find_negative_memories(
|
|
867
|
+
self,
|
|
868
|
+
anchor_memory: dict,
|
|
869
|
+
exclude_ids: Optional[Set[int]] = None,
|
|
870
|
+
limit: int = 2,
|
|
871
|
+
) -> List[dict]:
|
|
872
|
+
"""
|
|
873
|
+
Find memories dissimilar to the anchor (for negative examples).
|
|
874
|
+
|
|
875
|
+
Simple heuristic: pick memories from a different category or project.
|
|
876
|
+
Falls back to random sample if no structured differences available.
|
|
877
|
+
"""
|
|
878
|
+
if not self._memory_db.exists():
|
|
879
|
+
return []
|
|
880
|
+
exclude_ids = exclude_ids or set()
|
|
881
|
+
|
|
882
|
+
try:
|
|
883
|
+
conn = sqlite3.connect(str(self._memory_db), timeout=5)
|
|
884
|
+
conn.row_factory = sqlite3.Row
|
|
885
|
+
cursor = conn.cursor()
|
|
886
|
+
|
|
887
|
+
anchor_project = anchor_memory.get('project_name', '')
|
|
888
|
+
anchor_category = anchor_memory.get('category', '')
|
|
889
|
+
|
|
890
|
+
# Try to find memories from different project or category
|
|
891
|
+
conditions = []
|
|
892
|
+
params: list = []
|
|
893
|
+
|
|
894
|
+
if anchor_project:
|
|
895
|
+
conditions.append('project_name != ?')
|
|
896
|
+
params.append(anchor_project)
|
|
897
|
+
if anchor_category:
|
|
898
|
+
conditions.append('category != ?')
|
|
899
|
+
params.append(anchor_category)
|
|
900
|
+
|
|
901
|
+
# Exclude specified IDs
|
|
902
|
+
if exclude_ids:
|
|
903
|
+
placeholders = ','.join('?' for _ in exclude_ids)
|
|
904
|
+
conditions.append(f'id NOT IN ({placeholders})')
|
|
905
|
+
params.extend(exclude_ids)
|
|
906
|
+
|
|
907
|
+
where_clause = ' AND '.join(conditions) if conditions else '1=1'
|
|
908
|
+
|
|
909
|
+
cursor.execute(f'''
|
|
910
|
+
SELECT id, content, summary, project_name, tags,
|
|
911
|
+
category, importance, created_at, access_count
|
|
912
|
+
FROM memories
|
|
913
|
+
WHERE {where_clause}
|
|
914
|
+
ORDER BY RANDOM()
|
|
915
|
+
LIMIT ?
|
|
916
|
+
''', (*params, limit))
|
|
917
|
+
results = [dict(row) for row in cursor.fetchall()]
|
|
918
|
+
conn.close()
|
|
919
|
+
return results
|
|
920
|
+
except Exception as e:
|
|
921
|
+
logger.debug("Failed to find negative memories: %s", e)
|
|
922
|
+
return []
|
|
923
|
+
|
|
924
|
+
# ========================================================================
|
|
925
|
+
# Text Processing
|
|
926
|
+
# ========================================================================
|
|
927
|
+
|
|
928
|
+
def _extract_keywords(self, content: str, top_n: int = 3) -> List[str]:
    """
    Pull the top_n most frequent meaningful words out of *content*.

    Pipeline: regex tokenize (lowercased, alphanumeric-ish words) →
    drop stopwords and too-short tokens → rank by frequency.
    Pure stdlib — no external NLP dependencies.
    """
    if not content:
        return []

    # Tokenize: words must start with a letter; dots/dashes/underscores
    # allowed inside, and single letters are accepted too.
    tokens = re.findall(
        r'[a-zA-Z][a-zA-Z0-9_.-]*[a-zA-Z0-9]|[a-zA-Z]',
        content.lower(),
    )

    # Keep only tokens that survive the stopword and length filters.
    candidates = [
        tok for tok in tokens
        if tok not in _STOPWORDS and len(tok) >= _MIN_KEYWORD_LENGTH
    ]
    if not candidates:
        return []

    # Frequency ranking via Counter; drop the counts on the way out.
    ranked = Counter(candidates).most_common(top_n)
    return [word for word, _ in ranked]
|
|
957
|
+
|
|
958
|
+
# ========================================================================
|
|
959
|
+
# Utility
|
|
960
|
+
# ========================================================================
|
|
961
|
+
|
|
962
|
+
def _diverse_sample(
|
|
963
|
+
self,
|
|
964
|
+
records: List[dict],
|
|
965
|
+
target: int,
|
|
966
|
+
) -> List[dict]:
|
|
967
|
+
"""
|
|
968
|
+
Sample records while maintaining source diversity.
|
|
969
|
+
|
|
970
|
+
Takes proportional samples from each source strategy to ensure
|
|
971
|
+
the training data isn't dominated by one strategy.
|
|
972
|
+
"""
|
|
973
|
+
if len(records) <= target:
|
|
974
|
+
return records
|
|
975
|
+
|
|
976
|
+
# Group by source
|
|
977
|
+
by_source: Dict[str, List[dict]] = {}
|
|
978
|
+
for r in records:
|
|
979
|
+
src = r.get('source', 'unknown')
|
|
980
|
+
if src not in by_source:
|
|
981
|
+
by_source[src] = []
|
|
982
|
+
by_source[src].append(r)
|
|
983
|
+
|
|
984
|
+
# Proportional allocation
|
|
985
|
+
n_sources = len(by_source)
|
|
986
|
+
if n_sources == 0:
|
|
987
|
+
return records[:target]
|
|
988
|
+
|
|
989
|
+
per_source = max(1, target // n_sources)
|
|
990
|
+
sampled = []
|
|
991
|
+
|
|
992
|
+
for source, source_records in by_source.items():
|
|
993
|
+
# Take up to per_source from each, or all if fewer
|
|
994
|
+
take = min(len(source_records), per_source)
|
|
995
|
+
sampled.extend(source_records[:take])
|
|
996
|
+
|
|
997
|
+
# If under target, fill from remaining
|
|
998
|
+
if len(sampled) < target:
|
|
999
|
+
used_ids = {(r['query_hash'], r['memory_id']) for r in sampled}
|
|
1000
|
+
for r in records:
|
|
1001
|
+
if len(sampled) >= target:
|
|
1002
|
+
break
|
|
1003
|
+
key = (r['query_hash'], r['memory_id'])
|
|
1004
|
+
if key not in used_ids:
|
|
1005
|
+
sampled.append(r)
|
|
1006
|
+
used_ids.add(key)
|
|
1007
|
+
|
|
1008
|
+
return sampled[:target]
|
|
1009
|
+
|
|
1010
|
+
def _count_sources(self, records: List[dict]) -> Dict[str, int]:
|
|
1011
|
+
"""Count records by source strategy."""
|
|
1012
|
+
counts: Dict[str, int] = {}
|
|
1013
|
+
for r in records:
|
|
1014
|
+
src = r.get('source', 'unknown')
|
|
1015
|
+
counts[src] = counts.get(src, 0) + 1
|
|
1016
|
+
return counts
|
|
1017
|
+
|
|
1018
|
+
|
|
1019
|
+
# ============================================================================
|
|
1020
|
+
# Module-level convenience
|
|
1021
|
+
# ============================================================================
|
|
1022
|
+
|
|
1023
|
+
def should_bootstrap(memory_db_path: Optional[Path] = None) -> bool:
    """
    Quick check whether bootstrap is needed.

    Spins up a throwaway SyntheticBootstrapper and asks it; any failure
    along the way is treated as "no bootstrap needed".
    """
    try:
        return SyntheticBootstrapper(memory_db_path=memory_db_path).should_bootstrap()
    except Exception:
        return False
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
def run_bootstrap(
    memory_db_path: Optional[Path] = None,
    learning_db=None,
) -> Optional[Dict[str, Any]]:
    """
    Convenience wrapper: bootstrap the model if needed.

    Returns the bootstrap metadata dict, or None when bootstrapping is
    unnecessary or fails.
    """
    try:
        bootstrapper = SyntheticBootstrapper(
            memory_db_path=memory_db_path,
            learning_db=learning_db,
        )
        # Guard clause: bail out early when no bootstrap is required.
        if not bootstrapper.should_bootstrap():
            return None
        return bootstrapper.bootstrap_model()
    except Exception as e:
        logger.error("Bootstrap failed: %s", e)
        return None
|