superlocalmemory 2.6.5 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,433 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SuperLocalMemory V2 - Tests for SyntheticBootstrapper (v2.7)
4
+ Copyright (c) 2026 Varun Pratap Bhardwaj
5
+ Licensed under MIT License
6
+ """
7
+
8
+ import sqlite3
9
+ from pathlib import Path
10
+
11
+ import pytest
12
+
13
+ # Detect optional dependencies at import time
14
+ try:
15
+ import lightgbm
16
+ HAS_LIGHTGBM = True
17
+ except ImportError:
18
+ HAS_LIGHTGBM = False
19
+
20
+ try:
21
+ import numpy as np
22
+ HAS_NUMPY = True
23
+ except ImportError:
24
+ np = None
25
+ HAS_NUMPY = False
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Fixtures
30
+ # ---------------------------------------------------------------------------
31
+
32
@pytest.fixture(autouse=True)
def reset_singleton():
    """Call ``LearningDB.reset_instance()`` before and after every test.

    The fixture is autouse, so it wraps each test in this module without
    being requested explicitly.
    """
    from src.learning.learning_db import LearningDB as _LearningDB

    reset = _LearningDB.reset_instance
    reset()  # before the test body runs
    yield
    reset()  # after the test body has finished
38
+
39
+
40
@pytest.fixture
def learning_db(tmp_path):
    """Return a ``LearningDB`` whose file is ``learning.db`` under ``tmp_path``."""
    from src.learning.learning_db import LearningDB

    return LearningDB(db_path=tmp_path / "learning.db")
45
+
46
+
47
@pytest.fixture
def memory_db(tmp_path):
    """Create a memory.db with FTS5 and identity_patterns tables.

    Creates three objects in a fresh SQLite file under ``tmp_path``:
    the ``memories`` table, the ``memories_fts`` FTS5 virtual table
    (external content backed by ``memories``) and ``identity_patterns``.

    Returns:
        Path of the created database file.
    """
    db_path = tmp_path / "memory.db"
    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS memories (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                content TEXT NOT NULL,
                summary TEXT,
                project_path TEXT,
                project_name TEXT,
                tags TEXT DEFAULT '[]',
                category TEXT,
                parent_id INTEGER,
                tree_path TEXT DEFAULT '/',
                depth INTEGER DEFAULT 0,
                memory_type TEXT DEFAULT 'session',
                importance INTEGER DEFAULT 5,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                last_accessed TIMESTAMP,
                access_count INTEGER DEFAULT 0,
                content_hash TEXT,
                cluster_id INTEGER,
                profile TEXT DEFAULT 'default',
                created_by TEXT,
                source_protocol TEXT,
                trust_score REAL DEFAULT 1.0
            )
        ''')
        cursor.execute('''
            CREATE VIRTUAL TABLE IF NOT EXISTS memories_fts
            USING fts5(content, summary, tags, content='memories', content_rowid='id')
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS identity_patterns (
                id INTEGER PRIMARY KEY,
                pattern_type TEXT,
                pattern_key TEXT,
                pattern_value TEXT,
                confidence REAL DEFAULT 0.0,
                frequency INTEGER DEFAULT 1,
                last_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Release the handle even when a statement fails (for example when
        # the SQLite build lacks FTS5), so the temp file is not left open.
        conn.close()
    return db_path
97
+
98
+
99
def _insert_memories(db_path, memories):
    """Insert rows into the ``memories`` table of the database at ``db_path``.

    Args:
        db_path: Path of the SQLite database file.
        memories: Iterable of dicts describing rows. Missing keys fall back
            to: ``content='test'``, ``tags='[]'``, ``importance=5``,
            ``access_count=0``, ``profile='default'``,
            ``created_at='2026-02-16 10:00:00'``; ``project_name``,
            ``project_path``, ``created_by``, ``source_protocol`` and
            ``category`` default to ``None``.

    All rows are committed together. The connection is closed even when an
    insert raises, in which case nothing from this call is committed.
    """
    rows = [
        (
            m.get('content', 'test'),
            m.get('tags', '[]'),
            m.get('project_name'),
            m.get('project_path'),
            m.get('importance', 5),
            m.get('access_count', 0),
            m.get('profile', 'default'),
            m.get('created_by'),
            m.get('source_protocol'),
            m.get('created_at', '2026-02-16 10:00:00'),
            m.get('category'),
        )
        for m in memories
    ]
    conn = sqlite3.connect(str(db_path))
    try:
        conn.executemany('''
            INSERT INTO memories (content, tags, project_name, project_path,
                                  importance, access_count, profile, created_by,
                                  source_protocol, created_at, category)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', rows)
        conn.commit()
    finally:
        # Always release the handle; closing without commit discards the
        # partially written batch.
        conn.close()
123
+
124
+
125
def _insert_patterns(db_path, patterns):
    """Insert rows into ``identity_patterns`` of the database at ``db_path``.

    Args:
        db_path: Path of the SQLite database file.
        patterns: Iterable of dicts. The dict keys are ``pattern_type``
            (default ``'tech'``), ``key`` (default ``'unknown'``, stored in
            the ``pattern_key`` column), ``value`` (default ``'unknown'``,
            stored in ``pattern_value``) and ``confidence`` (default 0.8).

    All rows are committed together. The connection is closed even when an
    insert raises.
    """
    rows = [
        (
            p.get('pattern_type', 'tech'),
            p.get('key', 'unknown'),
            p.get('value', 'unknown'),
            p.get('confidence', 0.8),
        )
        for p in patterns
    ]
    conn = sqlite3.connect(str(db_path))
    try:
        conn.executemany('''
            INSERT INTO identity_patterns (pattern_type, pattern_key,
                                           pattern_value, confidence)
            VALUES (?, ?, ?, ?)
        ''', rows)
        conn.commit()
    finally:
        # Always release the handle, including on a failed insert.
        conn.close()
141
+
142
+
143
@pytest.fixture
def bootstrapper(memory_db, learning_db):
    """Return a SyntheticBootstrapper built from the ``memory_db`` and
    ``learning_db`` fixtures; no memory rows are inserted here."""
    from src.learning import synthetic_bootstrap

    wiring = {"memory_db_path": memory_db, "learning_db": learning_db}
    return synthetic_bootstrap.SyntheticBootstrapper(**wiring)
150
+
151
+
152
@pytest.fixture
def bootstrapper_with_data(memory_db, learning_db):
    """Bootstrapper with 60 memories (above MIN_MEMORIES_FOR_BOOTSTRAP=50).

    The seeded rows vary project name, importance, access count, creation
    day and category by index so they are not all identical.
    """
    seed_rows = [
        {
            "content": f"Memory about python fastapi development topic {i} implementing features",
            "tags": '["python", "fastapi"]',
            "project_name": "TestProject" if i % 3 == 0 else "OtherProject",
            "importance": 8 if i % 5 == 0 else 5,
            "access_count": 10 if i % 4 == 0 else 1,
            "created_at": f"2026-02-{(i % 28) + 1:02d} 10:00:00",
            "category": "development" if i % 2 == 0 else "architecture",
        }
        for i in range(60)
    ]
    _insert_memories(memory_db, seed_rows)

    from src.learning.synthetic_bootstrap import SyntheticBootstrapper

    return SyntheticBootstrapper(memory_db_path=memory_db, learning_db=learning_db)
173
+
174
+
175
+ # ---------------------------------------------------------------------------
176
+ # should_bootstrap
177
+ # ---------------------------------------------------------------------------
178
+
179
class TestShouldBootstrap:
    """Tests for ``SyntheticBootstrapper.should_bootstrap()``.

    Two tests have to manipulate the file at the module-level ``MODEL_PATH``.
    Whatever was there before the test is stashed and put back afterwards so
    running the suite does not destroy an existing model.
    """

    @staticmethod
    def _stash_model(model_path):
        """Remove the file at ``model_path`` and return its bytes (None if absent)."""
        if not model_path.exists():
            return None
        saved = model_path.read_bytes()
        model_path.unlink()
        return saved

    @staticmethod
    def _restore_model(model_path, saved):
        """Return ``model_path`` to the state ``_stash_model`` observed."""
        if model_path.exists():
            model_path.unlink()
        if saved is not None:
            model_path.parent.mkdir(parents=True, exist_ok=True)
            model_path.write_bytes(saved)

    def test_returns_false_below_50_memories(self, bootstrapper, memory_db):
        """With fewer than 50 memories, bootstrap should not run."""
        _insert_memories(memory_db, [
            {"content": f"Memory {i}"} for i in range(10)
        ])
        assert bootstrapper.should_bootstrap() is False

    @pytest.mark.skipif(not HAS_LIGHTGBM or not HAS_NUMPY,
                        reason="LightGBM/NumPy required")
    def test_returns_true_above_50(self, bootstrapper_with_data, tmp_path):
        """With 50+ memories, LightGBM, and no existing model, should be True."""
        from src.learning.synthetic_bootstrap import MODEL_PATH
        # Ensure no model file exists for the duration of the check.
        saved = self._stash_model(MODEL_PATH)
        try:
            assert bootstrapper_with_data.should_bootstrap() is True
        finally:
            self._restore_model(MODEL_PATH, saved)

    def test_returns_false_without_lightgbm(self, bootstrapper_with_data, monkeypatch):
        """Without LightGBM, bootstrap should be False."""
        from src.learning import synthetic_bootstrap as sb_module
        # monkeypatch restores the original flag automatically at teardown.
        monkeypatch.setattr(sb_module, "HAS_LIGHTGBM", False)
        assert bootstrapper_with_data.should_bootstrap() is False

    def test_returns_false_with_existing_model(self, bootstrapper_with_data, tmp_path):
        """If a model file exists, bootstrap should be False."""
        from src.learning.synthetic_bootstrap import MODEL_PATH
        saved = self._stash_model(MODEL_PATH)
        try:
            MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
            MODEL_PATH.write_text("dummy model")
            assert bootstrapper_with_data.should_bootstrap() is False
        finally:
            # Removes the dummy file and reinstates any pre-existing model.
            self._restore_model(MODEL_PATH, saved)
217
+
218
+
219
+ # ---------------------------------------------------------------------------
220
+ # get_tier
221
+ # ---------------------------------------------------------------------------
222
+
223
class TestGetTier:
    """``get_tier()`` is checked against databases of 10, 60, 600 and 5100 rows."""

    @staticmethod
    def _seed(db_path, count):
        """Insert ``count`` minimal memories with content ``m0`` .. ``m<count-1>``."""
        _insert_memories(db_path, [{"content": f"m{n}"} for n in range(count)])

    def test_none_below_50(self, bootstrapper, memory_db):
        self._seed(memory_db, 10)
        tier = bootstrapper.get_tier()
        assert tier is None

    def test_small_tier(self, bootstrapper, memory_db):
        self._seed(memory_db, 60)
        tier = bootstrapper.get_tier()
        assert tier == "small"

    def test_medium_tier(self, bootstrapper, memory_db):
        self._seed(memory_db, 600)
        tier = bootstrapper.get_tier()
        assert tier == "medium"

    def test_large_tier(self, bootstrapper, memory_db):
        self._seed(memory_db, 5100)
        tier = bootstrapper.get_tier()
        assert tier == "large"
239
+
240
+
241
+ # ---------------------------------------------------------------------------
242
+ # Synthetic Data Generation
243
+ # ---------------------------------------------------------------------------
244
+
245
class TestGenerateSyntheticData:
    """Tests for ``SyntheticBootstrapper.generate_synthetic_training_data()``."""

    def test_generates_records(self, bootstrapper_with_data):
        """Should produce non-empty list of training records."""
        records = bootstrapper_with_data.generate_synthetic_training_data()
        assert len(records) > 0

    def test_record_structure(self, bootstrapper_with_data):
        """The first record carries every expected key and a 9-value feature vector."""
        records = bootstrapper_with_data.generate_synthetic_training_data()
        # Fail loudly instead of silently passing when nothing was generated;
        # the same fixture is required to yield records in test_generates_records.
        assert records, "expected at least one synthetic training record"
        r = records[0]
        for key in ("query", "query_hash", "memory_id", "label", "source", "features"):
            assert key in r
        assert len(r["features"]) == 9  # 9-dimensional feature vector

    def test_labels_in_range(self, bootstrapper_with_data):
        """Every label lies in the closed interval [0.0, 1.0]."""
        records = bootstrapper_with_data.generate_synthetic_training_data()
        for r in records:
            assert 0.0 <= r["label"] <= 1.0, f"Label out of range: {r['label']}"

    def test_multiple_sources(self, bootstrapper_with_data):
        """At least one source strategy should contribute records."""
        records = bootstrapper_with_data.generate_synthetic_training_data()
        sources = {r["source"] for r in records}
        # Only a single contributing strategy is required here
        # (access_based or importance_based at minimum).
        assert len(sources) >= 1

    def test_with_identity_patterns(self, memory_db, learning_db):
        """Generation must not crash when identity_patterns rows exist."""
        _insert_memories(memory_db, [
            {
                "content": f"Using python and fastapi for backend development {i}",
                "importance": 8 if i % 3 == 0 else 5,
                "access_count": 6 if i % 4 == 0 else 1,
            }
            for i in range(60)
        ])
        _insert_patterns(memory_db, [
            {"pattern_type": "tech", "key": "language", "value": "python",
             "confidence": 0.9},
            {"pattern_type": "tech", "key": "framework", "value": "fastapi",
             "confidence": 0.85},
        ])

        from src.learning.synthetic_bootstrap import SyntheticBootstrapper
        bs = SyntheticBootstrapper(
            memory_db_path=memory_db,
            learning_db=learning_db,
        )
        records = bs.generate_synthetic_training_data()
        pattern_records = [r for r in records if r["source"] == "pattern"]
        # Pattern-based records may or may not be generated depending on FTS5
        # The important thing is no crash
        assert isinstance(pattern_records, list)
302
+
303
+
304
+ # ---------------------------------------------------------------------------
305
+ # Keyword Extraction
306
+ # ---------------------------------------------------------------------------
307
+
308
class TestExtractKeywords:
    """Tests for the private ``_extract_keywords`` helper of the bootstrapper."""

    def test_basic_extraction(self, bootstrapper):
        keywords = bootstrapper._extract_keywords("python fastapi deployment docker")
        assert len(keywords) <= 3
        assert len(keywords) > 0
        for keyword in keywords:
            assert isinstance(keyword, str)

    def test_stopword_removal(self, bootstrapper):
        keywords = bootstrapper._extract_keywords("the and or but python")
        for stopword in ("the", "and"):
            assert stopword not in keywords
        assert "python" in keywords

    def test_empty_content(self, bootstrapper):
        extracted = bootstrapper._extract_keywords("")
        assert extracted == []

    def test_only_stopwords(self, bootstrapper):
        extracted = bootstrapper._extract_keywords("the and or but is are")
        assert extracted == []

    def test_short_words_filtered(self, bootstrapper):
        """Words shorter than MIN_KEYWORD_LENGTH (3) should be filtered."""
        keywords = bootstrapper._extract_keywords("a by python")
        for short_word in ("a", "by"):
            assert short_word not in keywords

    def test_frequency_based(self, bootstrapper):
        """Most frequent word should appear first."""
        text = "python python python fastapi fastapi docker"
        keywords = bootstrapper._extract_keywords(text)
        assert keywords[0] == "python"
339
+
340
+
341
+ # ---------------------------------------------------------------------------
342
+ # bootstrap_model (LightGBM required)
343
+ # ---------------------------------------------------------------------------
344
+
345
class TestBootstrapModel:
    """Tests for ``SyntheticBootstrapper.bootstrap_model()``."""

    @pytest.mark.skipif(not HAS_LIGHTGBM or not HAS_NUMPY,
                        reason="LightGBM/NumPy required for bootstrap training")
    def test_bootstrap_with_sufficient_data(self, bootstrapper_with_data, tmp_path):
        """Full bootstrap should produce a model file and return metadata.

        Metadata is only inspected when ``bootstrap_model()`` returns a
        result; a ``None`` return is tolerated by this test.
        """
        from src.learning.synthetic_bootstrap import MODEL_PATH

        # Stash any pre-existing model so the run starts without one and the
        # file can be put back afterwards.
        saved = MODEL_PATH.read_bytes() if MODEL_PATH.exists() else None
        if saved is not None:
            MODEL_PATH.unlink()

        try:
            result = bootstrapper_with_data.bootstrap_model()
            if result is not None:
                assert "model_version" in result
                assert "training_samples" in result
                assert result["training_samples"] > 0
                assert "bootstrap" in result["model_version"]
                assert "tier" in result
                assert result["tier"] == "small"  # 60 memories = small tier
        finally:
            # Clean up even when an assertion above fails, then reinstate
            # whatever model was present before the test.
            if MODEL_PATH.exists():
                MODEL_PATH.unlink()
            if saved is not None:
                MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
                MODEL_PATH.write_bytes(saved)

    def test_bootstrap_without_lightgbm(self, bootstrapper_with_data, monkeypatch):
        """Should return None gracefully when LightGBM not available."""
        from src.learning import synthetic_bootstrap as sb_module
        # monkeypatch restores the original flag automatically at teardown.
        monkeypatch.setattr(sb_module, "HAS_LIGHTGBM", False)
        result = bootstrapper_with_data.bootstrap_model()
        assert result is None

    def test_bootstrap_below_minimum(self, bootstrapper, memory_db):
        """Should return None with too few memories."""
        _insert_memories(memory_db, [{"content": f"m{i}"} for i in range(10)])
        result = bootstrapper.bootstrap_model()
        assert result is None
384
+
385
+
386
+ # ---------------------------------------------------------------------------
387
+ # Diverse Sample
388
+ # ---------------------------------------------------------------------------
389
+
390
class TestDiverseSample:
    """Tests for the private ``_diverse_sample(records, target)`` helper."""

    def test_under_target(self, bootstrapper):
        # Five references to one record, with a target larger than the input.
        record = {"source": "a", "query_hash": "q1", "memory_id": 1}
        sampled = bootstrapper._diverse_sample([record] * 5, 10)
        assert len(sampled) == 5

    def test_over_target_proportional(self, bootstrapper):
        from_a = [
            {"source": "a", "query_hash": f"qa{i}", "memory_id": i}
            for i in range(50)
        ]
        from_b = [
            {"source": "b", "query_hash": f"qb{i}", "memory_id": i + 50}
            for i in range(50)
        ]
        sampled = bootstrapper._diverse_sample(from_a + from_b, 20)
        assert len(sampled) == 20
        seen_sources = {rec["source"] for rec in sampled}
        assert len(seen_sources) == 2  # Both sources represented
405
+
406
+
407
+ # ---------------------------------------------------------------------------
408
+ # Count Sources
409
+ # ---------------------------------------------------------------------------
410
+
411
class TestCountSources:
    """Tests for the private ``_count_sources(records)`` helper."""

    def test_count(self, bootstrapper):
        labels = (
            "access_positive",
            "access_positive",
            "importance_positive",
            "recency_positive",
        )
        counts = bootstrapper._count_sources([{"source": label} for label in labels])
        expected = {
            "access_positive": 2,
            "importance_positive": 1,
            "recency_positive": 1,
        }
        for label, how_many in expected.items():
            assert counts[label] == how_many
423
+
424
+
425
+ # ---------------------------------------------------------------------------
426
+ # Module-level convenience
427
+ # ---------------------------------------------------------------------------
428
+
429
class TestModuleLevel:
    """Tests for the module-level convenience function ``should_bootstrap``."""

    def test_should_bootstrap_function(self, memory_db):
        from src.learning import synthetic_bootstrap

        outcome = synthetic_bootstrap.should_bootstrap(memory_db_path=memory_db)
        assert isinstance(outcome, bool)