claude-mem-lite 2.87.0 → 2.88.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,7 +10,7 @@
10
10
  "plugins": [
11
11
  {
12
12
  "name": "claude-mem-lite",
13
- "version": "2.87.0",
13
+ "version": "2.88.0",
14
14
  "source": "./",
15
15
  "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. Alternative to claude-mem with 600x lower cost."
16
16
  }
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-mem-lite",
3
- "version": "2.87.0",
3
+ "version": "2.88.0",
4
4
  "description": "Persistent long-term memory for Claude Code via MCP — captures coding decisions, bugfixes, and context across sessions. Hybrid FTS5 + TF-IDF search with episode batching. Single SQLite DB, no external services. Alternative to claude-mem with 600x lower cost.",
5
5
  "author": {
6
6
  "name": "sdsrss"
package/hook.mjs CHANGED
@@ -26,7 +26,7 @@ import {
26
26
  truncate, inferProject, detectBashSignificance,
27
27
  extractErrorKeywords, extractFilePaths, isRelatedToEpisode,
28
28
  makeEntryDesc, scrubSecrets, stripPrivate, EDIT_TOOLS, debugCatch, debugLog,
29
- COMPRESSED_AUTO, COMPRESSED_PENDING_PURGE, isoWeekKey, OBS_BM25,
29
+ COMPRESSED_AUTO, COMPRESSED_PENDING_PURGE, OBS_BM25,
30
30
  computeMinHash, estimateJaccardFromMinHash, jaccardSimilarity,
31
31
  } from './utils.mjs';
32
32
  import {
@@ -45,6 +45,8 @@ import {
45
45
  } from './hook-shared.mjs';
46
46
  import { handleLLMEpisode, handleLLMSummary, saveObservation, buildImmediateObservation } from './hook-llm.mjs';
47
47
  import { scrubRecord } from './lib/scrub-record.mjs';
48
+ import { selectCompressionCandidates, groupByProjectWeek, compressGroup } from './lib/compress-core.mjs';
49
+ import { cleanupBroken, decayAndMarkIdle, boostAccessed } from './lib/maintain-core.mjs';
48
50
  import {
49
51
  extractCitationsFromTranscript,
50
52
  extractAllInjected,
@@ -819,65 +821,19 @@ async function handleSessionStart() {
819
821
  `).run(Date.now() - 37 * 86400000);
820
822
  if (purged.changes > 0) debugLog('DEBUG', 'auto-maintain', `purged ${purged.changes} stale observations`);
821
823
 
822
- // Cleanup: remove broken observations (no title AND no narrative)
823
- const cleaned = db.prepare(`
824
- DELETE FROM observations WHERE id IN (
825
- SELECT id FROM observations
826
- WHERE COALESCE(compressed_into, 0) = 0
827
- AND (title IS NULL OR title = '') AND (narrative IS NULL OR narrative = '')
828
- LIMIT ${OP_CAP}
829
- )
830
- `).run();
831
- if (cleaned.changes > 0) debugLog('DEBUG', 'auto-maintain', `cleaned ${cleaned.changes} broken observations`);
832
-
833
- // Decay: reduce importance of old, never-accessed observations
834
- // v2.56.0 #4: injection_count is a separate engagement signal
835
- // hook-memory.mjs bumps it when the obs is auto-injected into Claude's
836
- // context. Pre-v2.56 only checked access_count, so an obs auto-injected
837
- // 8x (proven contextually relevant) still got decayed/marked. Adding
838
- // `injection_count = 0` treats injection as first-class engagement.
839
- const decayed = db.prepare(`
840
- UPDATE observations SET importance = MAX(1, COALESCE(importance, 1) - 1)
841
- WHERE id IN (
842
- SELECT id FROM observations
843
- WHERE COALESCE(compressed_into, 0) = 0
844
- AND COALESCE(importance, 1) > 1
845
- AND COALESCE(access_count, 0) = 0
846
- AND COALESCE(injection_count, 0) = 0
847
- AND created_at_epoch < ?
848
- LIMIT ${OP_CAP}
849
- )
850
- `).run(STALE_AGE);
851
- if (decayed.changes > 0) debugLog('DEBUG', 'auto-maintain', `decayed ${decayed.changes} stale observations`);
852
-
853
- // Mark idle: importance=1, never-accessed, never-injected, old → pending-purge
854
- // (will be purged next cycle). v2.56.0 #4: injection_count protects.
855
- const idleMarked = db.prepare(`
856
- UPDATE observations SET compressed_into = ${COMPRESSED_PENDING_PURGE}
857
- WHERE id IN (
858
- SELECT id FROM observations
859
- WHERE COALESCE(compressed_into, 0) = 0
860
- AND COALESCE(importance, 1) = 1
861
- AND COALESCE(access_count, 0) = 0
862
- AND COALESCE(injection_count, 0) = 0
863
- AND created_at_epoch < ?
864
- LIMIT ${OP_CAP}
865
- )
866
- `).run(STALE_AGE);
867
- if (idleMarked.changes > 0) debugLog('DEBUG', 'auto-maintain', `marked ${idleMarked.changes} idle as pending-purge`);
868
-
869
- // Boost: increase importance of frequently-accessed observations
870
- const boosted = db.prepare(`
871
- UPDATE observations SET importance = MIN(3, COALESCE(importance, 1) + 1)
872
- WHERE id IN (
873
- SELECT id FROM observations
874
- WHERE COALESCE(compressed_into, 0) = 0
875
- AND COALESCE(access_count, 0) > 3
876
- AND COALESCE(importance, 1) < 3
877
- LIMIT ${OP_CAP}
878
- )
879
- `).run();
880
- if (boosted.changes > 0) debugLog('DEBUG', 'auto-maintain', `boosted ${boosted.changes} frequently-accessed observations`);
824
+ // cleanup / decay+mark-idle / boost via maintain-core (shared with CLI + MCP).
825
+ // injection_count>0 protection lives in decayAndMarkIdle. Whole-DB, cap 500.
826
+ const mctx = { projectFilter: '', baseParams: [], staleAge: STALE_AGE, opCap: OP_CAP };
827
+
828
+ const cleaned = cleanupBroken(db, mctx);
829
+ if (cleaned > 0) debugLog('DEBUG', 'auto-maintain', `cleaned ${cleaned} broken observations`);
830
+
831
+ const { decayed, idleMarked } = decayAndMarkIdle(db, mctx);
832
+ if (decayed > 0) debugLog('DEBUG', 'auto-maintain', `decayed ${decayed} stale observations`);
833
+ if (idleMarked > 0) debugLog('DEBUG', 'auto-maintain', `marked ${idleMarked} idle as pending-purge`);
834
+
835
+ const boosted = boostAccessed(db, mctx);
836
+ if (boosted > 0) debugLog('DEBUG', 'auto-maintain', `boosted ${boosted} frequently-accessed observations`);
881
837
 
882
838
  // Auto-dedup (exact): merge identical-title observations within 1h.
883
839
  // Catches rapid duplicate writes (same hook firing twice, race conditions).
@@ -1361,57 +1317,16 @@ function handleAutoCompress() {
1361
1317
 
1362
1318
  try {
1363
1319
  const compressCutoff = Date.now() - 60 * 86400000; // 60 days
1364
- const compressCandidates = db.prepare(`
1365
- SELECT id, project, type, title, created_at_epoch
1366
- FROM observations
1367
- WHERE COALESCE(importance, 1) = 1 AND COALESCE(access_count, 0) = 0
1368
- AND created_at_epoch < ?
1369
- AND (compressed_into IS NULL OR compressed_into = ${COMPRESSED_AUTO})
1370
- ORDER BY project, created_at_epoch
1371
- `).all(compressCutoff);
1320
+ const compressCandidates = selectCompressionCandidates(db, { cutoff: compressCutoff, includeAutoMarked: true });
1372
1321
  if (compressCandidates.length < 3) return;
1373
1322
 
1374
- const groups = new Map();
1375
- for (const c of compressCandidates) {
1376
- const key = `${c.project}::${isoWeekKey(c.created_at_epoch)}`;
1377
- if (!groups.has(key)) groups.set(key, []);
1378
- groups.get(key).push(c);
1379
- }
1380
- // Transact each group to prevent orphan summaries on crash
1381
- const compressGroup = db.transaction((proj, obs) => {
1382
- const types = {};
1383
- for (const o of obs) types[o.type] = (types[o.type] || 0) + 1;
1384
- const dominantType = Object.entries(types).sort((a, b) => b[1] - a[1])[0][0];
1385
- const title = `Weekly summary: ${obs.length} ${dominantType} observations`;
1386
- const narrative = obs.map(o => `- ${o.title || '(untitled)'}`).join('\n');
1387
- const sortedEpochs = obs.map(o => o.created_at_epoch).sort((a, b) => a - b);
1388
- const medianEpoch = sortedEpochs[Math.floor(sortedEpochs.length / 2)];
1389
- const sessionId = `compress-${proj}`;
1390
- const now = new Date();
1391
- db.prepare(`INSERT OR IGNORE INTO sdk_sessions
1392
- (content_session_id, memory_session_id, project, started_at, started_at_epoch, status)
1393
- VALUES (?,?,?,?,?,'active')`
1394
- ).run(sessionId, sessionId, proj, now.toISOString(), now.getTime());
1395
- // Defense-in-depth: title/narrative are derived from already-stored
1396
- // obs.title, but those rows pre-date the central scrub policy in some
1397
- // cases. Re-scrub at the persistence boundary.
1398
- const safe = scrubRecord('observations', { text: narrative, title, narrative });
1399
- const summaryResult = db.prepare(`INSERT INTO observations
1400
- (memory_session_id, project, text, type, title, subtitle, narrative, concepts, facts,
1401
- files_read, files_modified, importance, created_at, created_at_epoch)
1402
- VALUES (?,?,?,?,?,'',?,'','','[]','[]',2,?,?)`
1403
- ).run(sessionId, proj, safe.text, dominantType, safe.title, safe.narrative, new Date(medianEpoch).toISOString(), medianEpoch);
1404
- const summaryId = Number(summaryResult.lastInsertRowid);
1405
- const obsIds = obs.map(o => o.id);
1406
- db.prepare(`UPDATE observations SET compressed_into = ? WHERE id IN (${obsIds.map(() => '?').join(',')})`)
1407
- .run(summaryId, ...obsIds);
1408
- return obs.length;
1409
- });
1323
+ const groups = groupByProjectWeek(compressCandidates);
1324
+ // Transact each group to prevent orphan summaries on crash (CLI/MCP wrap all groups in one).
1325
+ const compressGroupTxn = db.transaction((proj, obs) => compressGroup(db, proj, obs).compressed);
1410
1326
  let totalCompressed = 0;
1411
1327
  for (const [key, obs] of groups) {
1412
- if (obs.length < 3) continue;
1413
1328
  const [proj] = key.split('::');
1414
- totalCompressed += compressGroup(proj, obs);
1329
+ totalCompressed += compressGroupTxn(proj, obs);
1415
1330
  }
1416
1331
  if (totalCompressed > 0) {
1417
1332
  debugLog('DEBUG', 'auto-compress', `auto-compressed ${totalCompressed} observations into weekly summaries`);
@@ -0,0 +1,98 @@
1
+ // Shared "compress old low-value observations into weekly summaries" core.
2
+ //
3
+ // Single source of truth for cmdCompress (CLI), mem_compress (MCP), and
4
+ // handleAutoCompress (hook). Pre-extraction the candidate query, the
5
+ // project+ISO-week grouping, and the per-group summary INSERT + mark-compressed
6
+ // were copy-pasted across all three and kept in sync by hand-written "parity"
7
+ // comments — which is exactly how the TF-IDF-vector write drifted out of the
8
+ // compression path (audit ARCH-1). Call sites keep what legitimately differs:
9
+ // argument parsing, preview rendering, candidate-window params, and transaction
10
+ // granularity (CLI/MCP wrap all groups in one transaction; the hook transacts
11
+ // each group). They no longer re-implement the mutation.
12
+ //
13
+ // NOTE: the summary INSERT still omits the observation_vectors write, matching
14
+ // pre-extraction behavior. Fixing that (audit P5) is now a single change here
15
+ // instead of three — but it is a behavior change, intentionally NOT bundled.
16
+
17
+ import { isoWeekKey, COMPRESSED_AUTO } from '../utils.mjs';
18
+ import { scrubRecord } from './scrub-record.mjs';
19
+
20
/**
 * Fetch the observations eligible for weekly-summary compression.
 *
 * A row qualifies when its importance is 1 (NULL counts as 1), its access
 * count is 0 (NULL counts as 0), `created_at_epoch` is below `cutoff`, and
 * `compressed_into` is NULL. With `includeAutoMarked`, rows whose
 * `compressed_into` equals COMPRESSED_AUTO qualify as well.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {object} options
 * @param {number} options.cutoff - Exclusive upper bound for `created_at_epoch`.
 * @param {?string} [options.project=null] - Restrict to this project when truthy.
 * @param {boolean} [options.includeAutoMarked=false] - Also accept COMPRESSED_AUTO rows.
 * @returns {object[]} Matching rows, ordered by project and then creation epoch.
 */
export function selectCompressionCandidates(db, { cutoff, project = null, includeAutoMarked = false }) {
  let compressedFilter = 'AND compressed_into IS NULL';
  if (includeAutoMarked) {
    compressedFilter = `AND (compressed_into IS NULL OR compressed_into = ${COMPRESSED_AUTO})`;
  }

  // The project clause and its bound value are added together so the
  // placeholder count always matches the binding count.
  let projectFilter = '';
  const bindings = [cutoff];
  if (project) {
    projectFilter = 'AND project = ?';
    bindings.push(project);
  }

  const statement = db.prepare(`
    SELECT id, project, type, title, created_at, created_at_epoch
    FROM observations
    WHERE COALESCE(importance, 1) = 1
      AND COALESCE(access_count, 0) = 0
      AND created_at_epoch < ?
      ${compressedFilter}
      ${projectFilter}
    ORDER BY project, created_at_epoch
  `);
  return statement.all(...bindings);
}
42
+
43
/**
 * Bucket candidates under a `project::isoWeek` key and drop every bucket that
 * holds fewer than three observations.
 *
 * @param {Array<{project: string, created_at_epoch: number}>} candidates
 *   Rows to bucket; the week part of the key comes from `isoWeekKey(created_at_epoch)`.
 * @returns {Array<[string, object[]]>} `[key, observations]` pairs in
 *   first-seen key order.
 */
export function groupByProjectWeek(candidates) {
  const buckets = new Map();
  for (const candidate of candidates) {
    const bucketKey = `${candidate.project}::${isoWeekKey(candidate.created_at_epoch)}`;
    const bucket = buckets.get(bucketKey);
    if (bucket) {
      bucket.push(candidate);
    } else {
      buckets.set(bucketKey, [candidate]);
    }
  }
  return Array.from(buckets).filter(([, members]) => members.length >= 3);
}
57
+
58
/**
 * Collapse one group of observations into a single weekly-summary row.
 *
 * Ensures a synthetic `compress-<proj>` row exists in `sdk_sessions`, inserts
 * one summary observation (importance 2, typed after the most frequent source
 * type, timestamped at the group's median `created_at_epoch`), then points
 * every source row's `compressed_into` at the new summary. Only statements are
 * issued here — no transaction is opened, so atomicity is up to the caller.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {string} proj - Project the group belongs to.
 * @param {Array<{id: number, type: string, title: ?string, created_at_epoch: number}>} obs
 *   Source observations to fold into the summary.
 * @returns {{ summaryId: ?number, compressed: number }} Id of the new summary
 *   and the number of source rows folded into it. An empty `obs` writes
 *   nothing and yields `{ summaryId: null, compressed: 0 }`.
 */
export function compressGroup(db, proj, obs) {
  // Nothing to summarize: avoid inserting an empty summary (and the crash the
  // dominant-type lookup would hit on an empty tally).
  if (!obs || obs.length === 0) return { summaryId: null, compressed: 0 };

  // Tally types in a Map, not a plain object: a type named like an
  // Object.prototype member ("constructor", "toString") would otherwise pick
  // up the inherited value and corrupt the count. Keys are stringified to keep
  // the same type text a plain-object tally would have produced.
  const typeCounts = new Map();
  for (const o of obs) {
    const typeKey = String(o.type);
    typeCounts.set(typeKey, (typeCounts.get(typeKey) || 0) + 1);
  }
  // Strict `>` keeps the first-seen type on ties.
  let dominantType;
  let dominantCount = 0;
  for (const [typeKey, count] of typeCounts) {
    if (count > dominantCount) {
      dominantType = typeKey;
      dominantCount = count;
    }
  }

  const title = `Weekly summary: ${obs.length} ${dominantType} observations`;
  const narrative = obs.map((o) => `- ${o.title || '(untitled)'}`).join('\n');
  const sessionId = `compress-${proj}`;

  // Upper median for even-sized groups.
  const sortedEpochs = obs.map((o) => o.created_at_epoch).sort((a, b) => a - b);
  const medianEpoch = sortedEpochs[Math.floor(sortedEpochs.length / 2)];
  const medianDate = new Date(medianEpoch);

  const now = new Date();
  db.prepare(`
    INSERT OR IGNORE INTO sdk_sessions (content_session_id, memory_session_id, project, started_at, started_at_epoch, status)
    VALUES (?, ?, ?, ?, ?, 'active')
  `).run(sessionId, sessionId, proj, now.toISOString(), now.getTime());

  // Defense-in-depth: the narrative is assembled here from stored titles and
  // re-persisted, so it goes through scrubRecord again before the INSERT.
  const safe = scrubRecord('observations', { text: narrative, title, narrative });
  const summaryResult = db.prepare(`
    INSERT INTO observations (memory_session_id, project, text, type, title, subtitle, narrative, concepts, facts, files_read, files_modified, importance, created_at, created_at_epoch)
    VALUES (?, ?, ?, ?, ?, '', ?, '', '', '[]', '[]', 2, ?, ?)
  `).run(sessionId, proj, safe.text, dominantType, safe.title, safe.narrative, medianDate.toISOString(), medianEpoch);
  const summaryId = Number(summaryResult.lastInsertRowid);

  // Mark sources in bounded chunks so a very large group cannot exceed
  // SQLite's bind-variable limit for a single statement.
  const MAX_IDS_PER_UPDATE = 500;
  const obsIds = obs.map((o) => o.id);
  for (let start = 0; start < obsIds.length; start += MAX_IDS_PER_UPDATE) {
    const chunk = obsIds.slice(start, start + MAX_IDS_PER_UPDATE);
    const placeholders = chunk.map(() => '?').join(',');
    db.prepare(`UPDATE observations SET compressed_into = ? WHERE id IN (${placeholders})`).run(summaryId, ...chunk);
  }

  return { summaryId, compressed: obs.length };
}
@@ -0,0 +1,236 @@
1
+ // Shared maintenance operations — single source of truth for cmdMaintain (CLI),
2
+ // mem_maintain (MCP), and handleAutoMaintain (hook). Pre-extraction each
3
+ // operation's SQL was copy-pasted across the call sites and kept in sync by
4
+ // "parity" comments, which had already drifted: the CLI/hook `decay` and
5
+ // `mark-idle` protect injection_count>0 (v2.56.0 — an obs Claude was shown 8×
6
+ // is contextually proven), but the MCP copy never got that clause, so
7
+ // mem_maintain decayed/purged injected memories the other two paths preserve.
8
+ // Consolidating here UNIFIES decay/mark-idle on the protected (correct) form.
9
+ //
10
+ // Every mutation is statement-only — the CALLER owns the transaction boundary
11
+ // (CLI/MCP wrap the execute ops in one transaction; the hook runs them in its
12
+ // auto-maintain block). `ctx` carries the per-caller knobs:
13
+ // { projectFilter: 'AND project = ?' | '', baseParams: [project?] , staleAge, opCap }
14
+
15
+ import { COMPRESSED_PENDING_PURGE, computeMinHash, estimateJaccardFromMinHash, jaccardSimilarity } from '../utils.mjs';
16
+ import { rebuildVocabulary, computeVector, _resetVocabCache } from '../tfidf.mjs';
17
+
18
// 30 days in milliseconds. Not referenced by the functions in this module;
// presumably what callers subtract from "now" to build `staleAge` — confirm.
export const STALE_AGE_MS = 30 * 24 * 60 * 60 * 1000;

// Default `opCap`: the LIMIT applied to each capped mutation below.
export const OP_CAP = 1000;

// Defaults for findDuplicates: rows scanned, and pairs reported.
export const SCAN_LIMIT = 500;
export const DUPLICATE_LIMIT = 50;

// findDuplicates reports a pair only when its Jaccard similarity exceeds
// SIMILARITY_THRESHOLD; pairs whose MinHash estimate is below
// MINHASH_PRE_THRESHOLD are discarded before the exact comparison.
export const SIMILARITY_THRESHOLD = 0.7;
export const MINHASH_PRE_THRESHOLD = 0.5;

// A memory injected this many times with zero citations is "pinned noise" that
// the regular decay op can't touch (decay protects injection_count>0).
export const PINNED_INJ_THRESHOLD = 8;
27
+
28
/**
 * Delete not-yet-compressed observations that have neither a title nor a
 * narrative (NULL or empty string), at most `opCap` rows per call.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - Extra SQL predicate (`'AND project = ?'`) or `''`.
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum number of rows to delete.
 * @returns {number} Number of rows deleted.
 */
export function cleanupBroken(db, { projectFilter, baseParams, opCap = OP_CAP }) {
  const deleteBroken = db.prepare(`
    DELETE FROM observations WHERE id IN (
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND (title IS NULL OR title = '') AND (narrative IS NULL OR narrative = '')
        ${projectFilter} LIMIT ${opCap}
    )
  `);
  const outcome = deleteBroken.run(...baseParams);
  return outcome.changes;
}
39
+
40
/**
 * Two capped passes over old observations that were never accessed and never
 * injected (both counters 0 or NULL) and are not yet compressed:
 *
 *   1. decay — lower importance by one (floor 1) where importance is above 1;
 *   2. mark idle — set `compressed_into` to COMPRESSED_PENDING_PURGE where
 *      importance is 1.
 *
 * The decay statement runs first, so a row it lowers to importance 1 already
 * satisfies the mark-idle predicate within the same call.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - Extra SQL predicate (`'AND project = ?'`) or `''`.
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} ctx.staleAge - Exclusive upper bound for `created_at_epoch`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum rows touched by each pass.
 * @returns {{ decayed: number, idleMarked: number }} Rows changed by each pass.
 */
export function decayAndMarkIdle(db, { projectFilter, baseParams, staleAge, opCap = OP_CAP }) {
  // Both statements bind the age cutoff first, then the project parameters.
  const bindings = [staleAge, ...baseParams];

  const decayResult = db.prepare(`
    UPDATE observations SET importance = MAX(1, COALESCE(importance, 1) - 1)
    WHERE id IN (
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND COALESCE(importance, 1) > 1
        AND COALESCE(access_count, 0) = 0
        AND COALESCE(injection_count, 0) = 0
        AND created_at_epoch < ?
        ${projectFilter} LIMIT ${opCap}
    )
  `).run(...bindings);

  const idleResult = db.prepare(`
    UPDATE observations SET compressed_into = ${COMPRESSED_PENDING_PURGE}
    WHERE id IN (
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND COALESCE(importance, 1) = 1
        AND COALESCE(access_count, 0) = 0
        AND COALESCE(injection_count, 0) = 0
        AND created_at_epoch < ?
        ${projectFilter} LIMIT ${opCap}
    )
  `).run(...bindings);

  return { decayed: decayResult.changes, idleMarked: idleResult.changes };
}
74
+
75
/**
 * Raise importance by one (ceiling 3) for not-yet-compressed observations whose
 * access count exceeds 3 and whose importance is still below 3.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - Extra SQL predicate (`'AND project = ?'`) or `''`.
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum number of rows to boost.
 * @returns {number} Number of rows boosted.
 */
export function boostAccessed(db, { projectFilter, baseParams, opCap = OP_CAP }) {
  const boost = db.prepare(`
    UPDATE observations SET importance = MIN(3, COALESCE(importance, 1) + 1)
    WHERE id IN (
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND COALESCE(access_count, 0) > 3
        AND COALESCE(importance, 1) < 3
        ${projectFilter} LIMIT ${opCap}
    )
  `);
  const { changes } = boost.run(...baseParams);
  return changes;
}
88
+
89
/**
 * Reset importance to 1 for not-yet-compressed observations that were injected
 * at least PINNED_INJ_THRESHOLD times, were never cited, and still have
 * importance above 1. Rows are demoted straight to 1 rather than stepped down,
 * and are never marked for purge here.
 *
 * Why a dedicated op: the decay pass skips anything with injection_count > 0,
 * so heavily injected but never-cited rows would otherwise keep their
 * importance indefinitely.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - Extra SQL predicate (`'AND project = ?'`) or `''`.
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum number of rows to demote.
 * @returns {number} Number of rows demoted.
 */
export function demotePinned(db, { projectFilter, baseParams, opCap = OP_CAP }) {
  const demote = db.prepare(`
    UPDATE observations SET importance = 1
    WHERE id IN (
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND COALESCE(injection_count, 0) >= ${PINNED_INJ_THRESHOLD}
        AND COALESCE(cited_count, 0) = 0
        AND COALESCE(importance, 1) > 1
        ${projectFilter} LIMIT ${opCap}
    )
  `);
  const { changes } = demote.run(...baseParams);
  return changes;
}
108
+
109
/**
 * Merge explicit duplicate groups. Each group is `[keepId, removeId, …]`: every
 * `removeId` row that is not already compressed gets `compressed_into = keepId`.
 *
 * Groups that are missing or have fewer than two ids are ignored. A `removeId`
 * equal to its group's `keepId` is skipped — marking a row as compressed into
 * itself would hide the only remaining copy instead of merging anything.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {Array<Array<number|string>>} groups - Duplicate groups, keeper first.
 *   Callers parse their own input (CLI string / MCP array).
 * @returns {number} Number of rows merged.
 */
export function mergeDuplicates(db, groups) {
  let merged = 0;
  const mergeStmt = db.prepare('UPDATE observations SET compressed_into = ? WHERE id = ? AND COALESCE(compressed_into, 0) = 0');
  for (const group of groups) {
    if (!group || group.length < 2) continue;
    const [keepId, ...removeIds] = group;
    for (const removeId of removeIds) {
      // Compare as strings so 5 and '5' (CLI-parsed vs numeric input) are
      // recognized as the same row.
      if (String(removeId) === String(keepId)) continue;
      merged += mergeStmt.run(keepId, removeId).changes;
    }
  }
  return merged;
}
124
+
125
/**
 * Report what purgeStale would remove, without deleting anything: the count of
 * pending-purge observations created before `retainCutoff`, plus the smallest
 * and largest `created_at_epoch` among them.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - Extra SQL predicate (`'AND project = ?'`) or `''`.
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} retainCutoff - Exclusive upper bound for `created_at_epoch`.
 * @returns {{ candidates: number, oldest: ?number, newest: ?number }} Aggregate row.
 */
export function purgeStalePreview(db, { projectFilter, baseParams }, retainCutoff) {
  const preview = db.prepare(`
    SELECT COUNT(*) AS candidates, MIN(created_at_epoch) AS oldest, MAX(created_at_epoch) AS newest
    FROM observations
    WHERE compressed_into = ${COMPRESSED_PENDING_PURGE} AND created_at_epoch < ? ${projectFilter}
  `);
  const bindings = [retainCutoff, ...baseParams];
  return preview.get(...bindings);
}
133
+
134
/**
 * Delete pending-purge observations created before `retainCutoff`, at most
 * `opCap` rows per call.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - Extra SQL predicate (`'AND project = ?'`) or `''`.
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum number of rows to delete.
 * @param {number} retainCutoff - Exclusive upper bound for `created_at_epoch`.
 * @returns {number} Number of rows deleted.
 */
export function purgeStale(db, { projectFilter, baseParams, opCap = OP_CAP }, retainCutoff) {
  const purge = db.prepare(`
    DELETE FROM observations WHERE id IN (
      SELECT id FROM observations
      WHERE compressed_into = ${COMPRESSED_PENDING_PURGE} AND created_at_epoch < ?
        ${projectFilter} LIMIT ${opCap}
    )
  `);
  const bindings = [retainCutoff, ...baseParams];
  return purge.run(...bindings).changes;
}
144
+
145
/**
 * Find near-duplicate titles among the most recently created, not-yet-compressed
 * observations.
 *
 * Each pair is first screened with the MinHash estimate (pairs below
 * MINHASH_PRE_THRESHOLD are dropped), then confirmed with the exact Jaccard
 * similarity, which must exceed SIMILARITY_THRESHOLD. Rows with a blank title
 * are never compared. Scanning stops once `dupLimit` pairs are collected.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - Extra SQL predicate (`'AND project = ?'`) or `''`.
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.limit=SCAN_LIMIT] - How many recent rows to scan.
 * @param {number} [ctx.dupLimit=DUPLICATE_LIMIT] - Maximum pairs to return.
 * @returns {Array<{a: object, b: object, similarity: string}>} Pairs of
 *   `{ id, title, importance }` with the similarity formatted to two decimals.
 */
export function findDuplicates(db, { projectFilter, baseParams, limit = SCAN_LIMIT, dupLimit = DUPLICATE_LIMIT }) {
  const rows = db.prepare(`
    SELECT id, title, project, importance, access_count, created_at_epoch
    FROM observations
    WHERE COALESCE(compressed_into, 0) = 0 ${projectFilter}
    ORDER BY created_at_epoch DESC LIMIT ${limit}
  `).all(...baseParams);

  // Trim once and hash once per row; blank titles get no signature.
  const cleanTitles = rows.map((row) => (row.title || '').trim());
  const signatures = cleanTitles.map((text) => (text ? computeMinHash(text) : null));
  const comparable = (index) => Boolean(cleanTitles[index]) && Boolean(signatures[index]);
  const describe = ({ id, title, importance }) => ({ id, title, importance });

  const found = [];
  for (let i = 0; i < rows.length; i++) {
    if (found.length >= dupLimit) break;
    if (!comparable(i)) continue;

    for (let j = i + 1; j < rows.length; j++) {
      if (!comparable(j)) continue;

      // Cheap screen before the exact comparison.
      const estimate = estimateJaccardFromMinHash(signatures[i], signatures[j]);
      if (estimate < MINHASH_PRE_THRESHOLD) continue;

      const sim = jaccardSimilarity(cleanTitles[i], cleanTitles[j]);
      if (sim <= SIMILARITY_THRESHOLD) continue;

      found.push({ a: describe(rows[i]), b: describe(rows[j]), similarity: sim.toFixed(2) });
      if (found.length >= dupLimit) break;
    }
  }
  return found;
}
178
+
179
/**
 * Gather maintenance counters: one aggregate query over the not-yet-compressed
 * observations, plus a separate count of pending-purge rows.
 *
 * Counters returned:
 *   - `total`        — not-yet-compressed observations;
 *   - `stale`        — importance 1, never accessed, created before `staleAge`;
 *   - `broken`       — neither title nor narrative;
 *   - `boostable`    — access count above 3 with importance below 3;
 *   - `pinned`       — injected at least PINNED_INJ_THRESHOLD times, never
 *                      cited, importance above 1;
 *   - `pendingPurge` — rows whose `compressed_into` is COMPRESSED_PENDING_PURGE.
 *
 * NOTE(review): `stale` does not look at injection_count, so it can include
 * rows that decay/mark-idle skip — confirm whether that is intended.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - Extra SQL predicate (`'AND project = ?'`) or `''`.
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} ctx.staleAge - Exclusive upper bound for `created_at_epoch` in `stale`.
 * @returns {object} The counters listed above.
 */
export function maintenanceStats(db, { projectFilter, baseParams, staleAge }) {
  const counters = db.prepare(`
    SELECT
      COUNT(*) as total,
      COALESCE(SUM(CASE WHEN COALESCE(importance, 1) = 1 AND COALESCE(access_count, 0) = 0
        AND created_at_epoch < ? THEN 1 ELSE 0 END), 0) as stale,
      COALESCE(SUM(CASE WHEN (title IS NULL OR title = '') AND (narrative IS NULL OR narrative = '')
        THEN 1 ELSE 0 END), 0) as broken,
      COALESCE(SUM(CASE WHEN COALESCE(access_count, 0) > 3 AND COALESCE(importance, 1) < 3
        THEN 1 ELSE 0 END), 0) as boostable,
      COALESCE(SUM(CASE WHEN COALESCE(injection_count, 0) >= ${PINNED_INJ_THRESHOLD}
        AND COALESCE(cited_count, 0) = 0 AND COALESCE(importance, 1) > 1
        THEN 1 ELSE 0 END), 0) as pinned
    FROM observations
    WHERE COALESCE(compressed_into, 0) = 0 ${projectFilter}
  `).get(staleAge, ...baseParams);

  // Pending-purge rows are excluded by the filter above, so count them separately.
  const { count } = db.prepare(
    `SELECT COUNT(*) as count FROM observations WHERE compressed_into = ${COMPRESSED_PENDING_PURGE} ${projectFilter}`
  ).get(...baseParams);

  return { ...counters, pendingPurge: count };
}
201
+
202
/**
 * Rebuild the TF-IDF vocabulary and then every stored observation vector.
 *
 * The vocabulary cache is reset and the vocabulary rebuilt first. Then, inside
 * one transaction opened here, `observation_vectors` is emptied and a vector
 * is written for each observation that is neither compressed nor superseded.
 * Observations for which `computeVector` yields nothing are skipped.
 *
 * @param {object} db - Database handle exposing `prepare` and `transaction`.
 * @returns {{ ok: false, reason: string } |
 *           { ok: true, terms: number, updated: number, total: number }}
 *   `ok: false` when no vocabulary could be built; otherwise the vocabulary
 *   size, the number of vectors written, and the number of observations scanned.
 */
export function rebuildVectors(db) {
  _resetVocabCache();
  const vocab = rebuildVocabulary(db);
  if (!vocab) return { ok: false, reason: 'no observations to build vocabulary from' };

  const allObs = db.prepare(`
    SELECT id, title, narrative, concepts FROM observations
    WHERE COALESCE(compressed_into, 0) = 0 AND superseded_at IS NULL
  `).all();
  const insertStmt = db.prepare('INSERT OR REPLACE INTO observation_vectors (observation_id, vector, vocab_version, created_at_epoch) VALUES (?, ?, ?, ?)');
  const now = Date.now();
  let updated = 0;

  db.transaction(() => {
    db.prepare('DELETE FROM observation_vectors').run();
    for (const obs of allObs) {
      const text = [obs.title || '', obs.narrative || '', obs.concepts || ''].filter(Boolean).join(' ');
      const vec = computeVector(text, vocab);
      if (!vec) continue;
      // Serialize only the bytes this vector covers. `vec.buffer` alone is the
      // whole backing ArrayBuffer, which is larger than the vector whenever
      // `vec` is a view (subarray / pooled allocation).
      const blob = Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength);
      insertStmt.run(obs.id, blob, vocab.version, now);
      updated++;
    }
  })();

  return { ok: true, terms: vocab.terms.size, updated, total: allObs.length };
}
227
+
228
/**
 * VACUUM the whole database and report how much freelist space was reclaimed.
 * Must run OUTSIDE any transaction.
 *
 * @param {object} db - Database handle exposing `pragma(name, options)` and `exec(sql)`.
 * @returns {{ reclaimedMB: string, freeBefore: number, freeAfter: number }}
 *   Freelist page counts before and after, and the difference converted to
 *   megabytes as a one-decimal string (never negative).
 */
export function vacuum(db) {
  const readPragma = (name) => db.pragma(name, { simple: true });

  const pageSize = readPragma('page_size');
  const freeBefore = readPragma('freelist_count');
  db.exec('VACUUM');
  const freeAfter = readPragma('freelist_count');

  // Clamp at zero so a freelist that grew is reported as "nothing reclaimed".
  const freedPages = Math.max(0, freeBefore - freeAfter);
  const reclaimedMB = ((freedPages * pageSize) / 1048576).toFixed(1);
  return { reclaimedMB, freeBefore, freeAfter };
}