claude-mem-lite 2.87.0 → 2.89.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +11 -9
- package/README.zh-CN.md +8 -8
- package/haiku-client.mjs +20 -10
- package/hook-llm.mjs +4 -3
- package/hook-optimize.mjs +7 -3
- package/hook.mjs +31 -110
- package/lib/citation-tracker.mjs +61 -1
- package/lib/cite-back-hint.mjs +39 -1
- package/lib/compress-core.mjs +118 -0
- package/lib/dedup-constants.mjs +35 -0
- package/lib/maintain-core.mjs +239 -0
- package/lib/save-observation.mjs +1 -1
- package/mem-cli.mjs +51 -249
- package/package.json +5 -2
- package/schema.mjs +25 -2
- package/search-engine.mjs +2 -1
- package/server.mjs +41 -253
- package/source-files.mjs +14 -0
- package/tfidf.mjs +12 -8
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
// Shared "compress old low-value observations into weekly summaries" core.
|
|
2
|
+
//
|
|
3
|
+
// Single source of truth for cmdCompress (CLI), mem_compress (MCP), and
|
|
4
|
+
// handleAutoCompress (hook). Before extraction, the candidate query, the
|
|
5
|
+
// project+ISO-week grouping, and the per-group summary INSERT + mark-compressed
|
|
6
|
+
// were copy-pasted across all three and kept in sync by hand-written "parity"
|
|
7
|
+
// comments — which is exactly how the TF-IDF-vector write drifted out of the
|
|
8
|
+
// compression path (audit ARCH-1). Call sites keep what legitimately differs:
|
|
9
|
+
// argument parsing, preview rendering, candidate-window params, and transaction
|
|
10
|
+
// granularity (CLI/MCP wrap all groups in one transaction; the hook transacts
|
|
11
|
+
// each group). They no longer re-implement the mutation.
|
|
12
|
+
//
|
|
13
|
+
// The summary INSERT also writes its TF-IDF observation_vectors row in the same
|
|
14
|
+
// (caller-owned) transaction — fixed once here rather than in all three call
|
|
15
|
+
// sites. Without it, FTS-miss queries that fall back to vector recall (CJK /
|
|
16
|
+
// concept / paraphrase) could never reach compressed summaries; the LLM
|
|
17
|
+
// smart-compress path already wrote vectors, so the deterministic path was the
|
|
18
|
+
// sole gap (audit P6).
|
|
19
|
+
|
|
20
|
+
import { isoWeekKey, COMPRESSED_AUTO, debugCatch } from '../utils.mjs';
|
|
21
|
+
import { getVocabulary, computeVector } from '../tfidf.mjs';
|
|
22
|
+
import { scrubRecord } from './scrub-record.mjs';
|
|
23
|
+
|
|
24
|
+
/**
 * Fetch the observations that are eligible for weekly compression.
 *
 * A row qualifies when its importance is 1 (NULL counts as 1), it has never
 * been accessed (NULL access_count counts as 0), it was created before
 * `cutoff`, and it is not yet compressed.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {object} options
 * @param {number} options.cutoff - Exclusive upper bound for `created_at_epoch`.
 * @param {?string} [options.project=null] - Restrict to one project when truthy.
 * @param {boolean} [options.includeAutoMarked=false] - Also return rows whose
 *   `compressed_into` equals the COMPRESSED_AUTO sentinel.
 * @returns {Array<object>} Rows (id, project, type, title, created_at,
 *   created_at_epoch) ordered by project, then creation time.
 */
export function selectCompressionCandidates(db, { cutoff, project = null, includeAutoMarked = false }) {
  let compressedFilter = 'AND compressed_into IS NULL';
  if (includeAutoMarked) {
    compressedFilter = `AND (compressed_into IS NULL OR compressed_into = ${COMPRESSED_AUTO})`;
  }

  // The bound parameters must line up with the `?` placeholders below:
  // cutoff first, then the optional project.
  const bindings = [cutoff];
  let projectFilter = '';
  if (project) {
    projectFilter = 'AND project = ?';
    bindings.push(project);
  }

  const statement = db.prepare(`
    SELECT id, project, type, title, created_at, created_at_epoch
    FROM observations
    WHERE COALESCE(importance, 1) = 1
      AND COALESCE(access_count, 0) = 0
      AND created_at_epoch < ?
      ${compressedFilter}
      ${projectFilter}
    ORDER BY project, created_at_epoch
  `);
  return statement.all(...bindings);
}
|
|
46
|
+
|
|
47
|
+
/**
 * Bucket candidates by project and ISO week, dropping buckets too small to be
 * worth summarizing.
 *
 * @param {Iterable<object>} candidates - Rows carrying `project` and
 *   `created_at_epoch`.
 * @returns {Array<[string, object[]]>} `[key, rows]` pairs in first-seen order,
 *   where `key` is `"<project>::<isoWeek>"` and `rows` holds at least 3 entries.
 */
export function groupByProjectWeek(candidates) {
  const buckets = new Map();
  for (const candidate of candidates) {
    const bucketKey = `${candidate.project}::${isoWeekKey(candidate.created_at_epoch)}`;
    const members = buckets.get(bucketKey);
    if (members === undefined) {
      buckets.set(bucketKey, [candidate]);
    } else {
      members.push(candidate);
    }
  }
  // Only groups with three or more observations are returned.
  return Array.from(buckets).filter((entry) => entry[1].length >= 3);
}
|
|
61
|
+
|
|
62
|
+
/**
 * Compress one group of observations into a single weekly-summary observation.
 *
 * Steps, all issued as plain statements (the CALLER owns the transaction
 * boundary — all groups in one transaction for CLI/MCP, one per group for the
 * hook):
 *   1. ensure a synthetic `compress-<proj>` session row exists,
 *   2. insert the summary (importance 2, dominant type, median timestamp so it
 *      sorts correctly in recency/timeline views),
 *   3. best-effort insert of the summary's TF-IDF vector,
 *   4. mark every source row as compressed into the summary.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {string} proj - Project the group belongs to.
 * @param {Array<object>} obs - Source rows (`id`, `type`, `title`,
 *   `created_at_epoch`).
 * @returns {{ summaryId: (number|null), compressed: number }} `summaryId` is
 *   `null` (and nothing is written) when `obs` is empty or missing.
 */
export function compressGroup(db, proj, obs) {
  // An empty group has no dominant type and no median; bail out before any
  // write instead of failing with an opaque TypeError further down.
  if (!obs || obs.length === 0) return { summaryId: null, compressed: 0 };

  // Count types in a Map rather than a plain object so a type string such as
  // "__proto__" cannot collide with Object.prototype keys. Ties keep the type
  // that was seen first.
  const typeCounts = new Map();
  for (const o of obs) {
    const typeKey = String(o.type);
    typeCounts.set(typeKey, (typeCounts.get(typeKey) || 0) + 1);
  }
  let dominantType = '';
  let dominantCount = 0;
  for (const [typeKey, count] of typeCounts) {
    if (count > dominantCount) {
      dominantType = typeKey;
      dominantCount = count;
    }
  }

  const title = `Weekly summary: ${obs.length} ${dominantType} observations`;
  const narrative = obs.map((o) => `- ${o.title || '(untitled)'}`).join('\n');
  const sessionId = `compress-${proj}`;

  // Upper median for even-sized groups.
  const sortedEpochs = obs.map((o) => o.created_at_epoch).sort((a, b) => a - b);
  const medianEpoch = sortedEpochs[Math.floor(sortedEpochs.length / 2)];
  const medianDate = new Date(medianEpoch);

  const now = new Date();
  db.prepare(`
    INSERT OR IGNORE INTO sdk_sessions (content_session_id, memory_session_id, project, started_at, started_at_epoch, status)
    VALUES (?, ?, ?, ?, ?, 'active')
  `).run(sessionId, sessionId, proj, now.toISOString(), now.getTime());

  // Defense-in-depth: source rows were scrubbed at ingest, but the new narrative
  // is constructed here and re-persisted.
  const safe = scrubRecord('observations', { text: narrative, title, narrative });
  const summaryResult = db.prepare(`
    INSERT INTO observations (memory_session_id, project, text, type, title, subtitle, narrative, concepts, facts, files_read, files_modified, importance, created_at, created_at_epoch)
    VALUES (?, ?, ?, ?, ?, '', ?, '', '', '[]', '[]', 2, ?, ?)
  `).run(sessionId, proj, safe.text, dominantType, safe.title, safe.narrative, medianDate.toISOString(), medianEpoch);
  const summaryId = Number(summaryResult.lastInsertRowid);

  // TF-IDF vector for the summary so it is reachable by vector recall (parity
  // with save-observation.mjs and the LLM smart-compress path). Best-effort:
  // vocab may be uninitialized on a fresh DB — a failure here must not abort the
  // compression the caller is transacting.
  try {
    const vocab = getVocabulary(db);
    if (vocab) {
      const vec = computeVector(`${safe.title} ${safe.narrative}`, vocab);
      if (vec) {
        db.prepare(
          'INSERT OR REPLACE INTO observation_vectors (observation_id, vector, vocab_version, created_at_epoch) VALUES (?, ?, ?, ?)'
        ).run(summaryId, Buffer.from(vec.buffer), vocab.version, medianEpoch);
      }
    }
  } catch (e) { debugCatch(e, 'compress-vector'); }

  // One placeholder per source id keeps the UPDATE fully parameterized.
  const obsIds = obs.map((o) => o.id);
  const obsPh = obsIds.map(() => '?').join(',');
  db.prepare(`UPDATE observations SET compressed_into = ? WHERE id IN (${obsPh})`).run(summaryId, ...obsIds);

  return { summaryId, compressed: obs.length };
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
// Dedup / merge similarity thresholds — single source of truth (P10).
|
|
2
|
+
//
|
|
3
|
+
// All values are Jaccard-space (word-set overlap, 0..1) unless noted. They were
|
|
4
|
+
// scattered as bare literals and duplicate local consts across save-observation,
|
|
5
|
+
// maintain-core, hook-llm, hook-optimize, mem-cli, server, and hook; converging
|
|
6
|
+
// them here removes the drift risk and gives the P7 benchmark named knobs.
|
|
7
|
+
// Vector-side constants (VOCAB_DIM / MIN_COSINE_SIMILARITY / RRF_K) deliberately
|
|
8
|
+
// stay in tfidf.mjs next to the search engine that consumes them.
|
|
9
|
+
//
|
|
10
|
+
// Pure constants only — no imports, so nothing can import-cycle through this.
|
|
11
|
+
|
|
12
|
+
// 0.7: near-duplicate cutoff for save-time dedup (5-min window, lib/save-observation)
|
|
13
|
+
// and the hook-llm tier-1 title dedup. Catches "Modified X" / "Fixed X" restatements
|
|
14
|
+
// (~70% word overlap) without collapsing distinct-but-related observations.
|
|
15
|
+
/** @type {number} Jaccard word-set overlap cutoff, on the 0..1 scale, for near-duplicate detection. */
export const DEDUP_JACCARD_THRESHOLD = 0.7;
|
|
16
|
+
|
|
17
|
+
// 0.85: high-confidence auto-merge cutoff (maintain + optimize cluster-merge, CLI/MCP
|
|
18
|
+
// dedup preview). Pairs at or above this merge without an LLM merge-decision call.
|
|
19
|
+
/** @type {number} Jaccard cutoff (0..1) for auto-merge; also the exclusive upper end of the LLM-review band that starts at MERGE_JACCARD_LOW. */
export const AUTO_MERGE_THRESHOLD = 0.85;
|
|
20
|
+
|
|
21
|
+
// 0.4: low bound of the LLM-review merge band [0.4, 0.85) in hook-optimize. Below it,
|
|
22
|
+
// a pair is too dissimilar to be worth a merge-decision call.
|
|
23
|
+
/** @type {number} Inclusive lower end (Jaccard, 0..1) of the LLM-review merge band; the band stops just below AUTO_MERGE_THRESHOLD. */
export const MERGE_JACCARD_LOW = 0.4;
|
|
24
|
+
|
|
25
|
+
// 0.5: MinHash estimated-Jaccard pre-filter for the maintain O(n²) scan — skip the
|
|
26
|
+
// exact-Jaccard compare when the cheap signature estimate is already below this.
|
|
27
|
+
/** @type {number} MinHash-estimated Jaccard floor for the maintain scan. Not to be confused with MINHASH_PREFILTER, which serves the hook path. */
export const MINHASH_PRE_THRESHOLD = 0.5;
|
|
28
|
+
|
|
29
|
+
// 0.7: MinHash pre-filter for the hook post-inject fuzzy-dedup pass. Stricter than
|
|
30
|
+
// maintain's 0.5 to keep the inline inject path cheap (it runs in the hot Stop path).
|
|
31
|
+
/** @type {number} MinHash-estimated Jaccard floor for the hook post-inject pass. Not to be confused with MINHASH_PRE_THRESHOLD, which serves the maintain scan. */
export const MINHASH_PREFILTER = 0.7;
|
|
32
|
+
|
|
33
|
+
// 0.95: strict title-Jaccard cutoff for the hook post-inject fuzzy-dedup pass — only
|
|
34
|
+
// collapse near-identical titles inline; anything softer waits for the maintain sweep.
|
|
35
|
+
/** @type {number} Exact title-Jaccard cutoff (0..1) for the hook post-inject fuzzy-dedup pass. */
export const FUZZY_DEDUP_THRESHOLD = 0.95;
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
// Shared maintenance operations — single source of truth for cmdMaintain (CLI),
|
|
2
|
+
// mem_maintain (MCP), and handleAutoMaintain (hook). Before extraction, each
|
|
3
|
+
// operation's SQL was copy-pasted across the call sites and kept in sync by
|
|
4
|
+
// "parity" comments, which had already drifted: the CLI/hook `decay` and
|
|
5
|
+
// `mark-idle` protect injection_count>0 (v2.56.0 — an obs Claude was shown 8×
|
|
6
|
+
// is contextually proven), but the MCP copy never got that clause, so
|
|
7
|
+
// mem_maintain decayed/purged injected memories the other two paths preserve.
|
|
8
|
+
// Consolidating here UNIFIES decay/mark-idle on the protected (correct) form.
|
|
9
|
+
//
|
|
10
|
+
// Every mutation is statement-only — the CALLER owns the transaction boundary
|
|
11
|
+
// (CLI/MCP wrap the execute ops in one transaction; the hook runs them in its
|
|
12
|
+
// auto-maintain block). `ctx` carries the per-caller knobs:
|
|
13
|
+
// { projectFilter: 'AND project = ?' | '', baseParams: [project?] , staleAge, opCap }
|
|
14
|
+
|
|
15
|
+
import { COMPRESSED_PENDING_PURGE, computeMinHash, estimateJaccardFromMinHash, jaccardSimilarity } from '../utils.mjs';
|
|
16
|
+
import { rebuildVocabulary, computeVector, _resetVocabCache } from '../tfidf.mjs';
|
|
17
|
+
import { DEDUP_JACCARD_THRESHOLD, MINHASH_PRE_THRESHOLD as MINHASH_PRE_THRESHOLD_SRC } from './dedup-constants.mjs';
|
|
18
|
+
|
|
19
|
+
/**
 * Thirty days expressed in milliseconds.
 * NOTE(review): presumably callers derive the `staleAge` cutoff from this —
 * confirm at the call sites.
 */
export const STALE_AGE_MS = 30 * 24 * 60 * 60 * 1000;

/** Default `opCap`: the most rows a single maintenance statement may touch. */
export const OP_CAP = 1000;

/** Default `limit` for findDuplicates: how many recent rows are scanned. */
export const SCAN_LIMIT = 500;

/** Default `dupLimit` for findDuplicates: the most pairs it reports. */
export const DUPLICATE_LIMIT = 50;
|
|
23
|
+
// Back-compat: maintain-core historically exported these names; both now source
|
|
24
|
+
// their value from the single canonical lib/dedup-constants.mjs.
|
|
25
|
+
/** @type {number} Alias of DEDUP_JACCARD_THRESHOLD; findDuplicates reports pairs whose exact Jaccard is strictly greater than this. */
export const SIMILARITY_THRESHOLD = DEDUP_JACCARD_THRESHOLD;
|
|
26
|
+
/** @type {number} Re-export of the dedup-constants value; findDuplicates skips pairs whose MinHash estimate falls below it. */
export const MINHASH_PRE_THRESHOLD = MINHASH_PRE_THRESHOLD_SRC;
|
|
27
|
+
// A memory injected this many times with zero citations is "pinned noise" that
|
|
28
|
+
// the regular decay op can't touch (decay protects injection_count>0).
|
|
29
|
+
export const PINNED_INJ_THRESHOLD = 8;
|
|
30
|
+
|
|
31
|
+
/**
 * Remove "broken" active observations: rows with neither a title nor a
 * narrative (NULL or empty string for both).
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - SQL fragment (`'AND project = ?'` or `''`).
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum rows deleted by this call.
 * @returns {number} Number of rows deleted.
 */
export function cleanupBroken(db, { projectFilter, baseParams, opCap = OP_CAP }) {
  const sql = `
    DELETE FROM observations WHERE id IN (
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND (title IS NULL OR title = '') AND (narrative IS NULL OR narrative = '')
        ${projectFilter} LIMIT ${opCap}
    )
  `;
  const { changes } = db.prepare(sql).run(...baseParams);
  return changes;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Two-step ageing pass over active observations that are older than the
 * `staleAge` cutoff, were never accessed, and were never injected:
 *   1. rows with importance above 1 lose one importance level (floor 1);
 *   2. rows at importance 1 are flagged with the COMPRESSED_PENDING_PURGE sentinel.
 *
 * The second statement runs after the first, so a row that step 1 lowers to
 * importance 1 also satisfies step 2's predicate within the same call.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - SQL fragment (`'AND project = ?'` or `''`).
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} ctx.staleAge - Compared as `created_at_epoch < staleAge`, i.e.
 *   it is used as a cutoff timestamp rather than a duration.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum rows touched per statement.
 * @returns {{ decayed: number, idleMarked: number }} Rows changed by each step.
 */
export function decayAndMarkIdle(db, { projectFilter, baseParams, staleAge, opCap = OP_CAP }) {
  // Both statements select from the same population and differ only in the
  // importance predicate.
  const idleIds = (importancePredicate) => `
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND ${importancePredicate}
        AND COALESCE(access_count, 0) = 0
        AND COALESCE(injection_count, 0) = 0
        AND created_at_epoch < ?
        ${projectFilter} LIMIT ${opCap}`;
  const bindings = [staleAge, ...baseParams];

  const decayOutcome = db.prepare(`
    UPDATE observations SET importance = MAX(1, COALESCE(importance, 1) - 1)
    WHERE id IN (${idleIds('COALESCE(importance, 1) > 1')}
    )
  `).run(...bindings);

  const markOutcome = db.prepare(`
    UPDATE observations SET compressed_into = ${COMPRESSED_PENDING_PURGE}
    WHERE id IN (${idleIds('COALESCE(importance, 1) = 1')}
    )
  `).run(...bindings);

  return { decayed: decayOutcome.changes, idleMarked: markOutcome.changes };
}
|
|
77
|
+
|
|
78
|
+
/**
 * Raise importance by one level (capped at 3) for active observations that
 * have been accessed more than three times and are still below importance 3.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - SQL fragment (`'AND project = ?'` or `''`).
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum rows boosted by this call.
 * @returns {number} Number of rows boosted.
 */
export function boostAccessed(db, { projectFilter, baseParams, opCap = OP_CAP }) {
  const sql = `
    UPDATE observations SET importance = MIN(3, COALESCE(importance, 1) + 1)
    WHERE id IN (
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND COALESCE(access_count, 0) > 3
        AND COALESCE(importance, 1) < 3
        ${projectFilter} LIMIT ${opCap}
    )
  `;
  const outcome = db.prepare(sql).run(...baseParams);
  return outcome.changes;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Demote "pinned noise": active observations injected at least
 * PINNED_INJ_THRESHOLD times, never cited, and still above importance 1 are
 * set straight to importance 1. Rows are demoted, never deleted.
 *
 * These rows are outside the reach of the decay step, which skips anything
 * with injection_count > 0.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - SQL fragment (`'AND project = ?'` or `''`).
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum rows demoted by this call.
 * @returns {number} Number of rows demoted.
 */
export function demotePinned(db, { projectFilter, baseParams, opCap = OP_CAP }) {
  const sql = `
    UPDATE observations SET importance = 1
    WHERE id IN (
      SELECT id FROM observations
      WHERE COALESCE(compressed_into, 0) = 0
        AND COALESCE(injection_count, 0) >= ${PINNED_INJ_THRESHOLD}
        AND COALESCE(cited_count, 0) = 0
        AND COALESCE(importance, 1) > 1
        ${projectFilter} LIMIT ${opCap}
    )
  `;
  const outcome = db.prepare(sql).run(...baseParams);
  return outcome.changes;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Merge explicit duplicate groups. Each group is `[keepId, removeId, …]`: every
 * removeId that is still active (not already compressed) is marked as
 * compressed into keepId. Callers parse their own input (CLI string / MCP array).
 *
 * Missing groups and groups with fewer than two ids are ignored. A removeId
 * equal to the group's keepId is skipped, because merging a row into itself
 * would take the keeper out of the active set with nothing left to represent it.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {Iterable<Array<number|string>>} groups - Duplicate groups, keeper first.
 * @returns {number} Number of rows merged.
 */
export function mergeDuplicates(db, groups) {
  const mergeStmt = db.prepare('UPDATE observations SET compressed_into = ? WHERE id = ? AND COALESCE(compressed_into, 0) = 0');
  let merged = 0;
  for (const group of groups) {
    if (!group || group.length < 2) continue;
    const [keepId, ...removeIds] = group;
    for (const removeId of removeIds) {
      // Compare as strings so 5 and '5' (which address the same row) both count
      // as a self-reference.
      if (String(removeId) === String(keepId)) continue;
      merged += mergeStmt.run(keepId, removeId).changes;
    }
  }
  return merged;
}
|
|
127
|
+
|
|
128
|
+
/**
 * Dry-run counterpart of purgeStale: summarize the pending-purge rows created
 * before `retainCutoff` without deleting anything.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - SQL fragment (`'AND project = ?'` or `''`).
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} retainCutoff - Exclusive upper bound for `created_at_epoch`.
 * @returns {{ candidates: number, oldest: ?number, newest: ?number }} The single
 *   aggregate row produced by the query.
 */
export function purgeStalePreview(db, { projectFilter, baseParams }, retainCutoff) {
  const sql = `
    SELECT COUNT(*) AS candidates, MIN(created_at_epoch) AS oldest, MAX(created_at_epoch) AS newest
    FROM observations
    WHERE compressed_into = ${COMPRESSED_PENDING_PURGE} AND created_at_epoch < ? ${projectFilter}
  `;
  const summary = db.prepare(sql).get(retainCutoff, ...baseParams);
  return summary;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Permanently delete pending-purge observations created before `retainCutoff`.
 *
 * @param {object} db - Database handle exposing `prepare(sql).run(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - SQL fragment (`'AND project = ?'` or `''`).
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.opCap=OP_CAP] - Maximum rows deleted by this call.
 * @param {number} retainCutoff - Exclusive upper bound for `created_at_epoch`.
 * @returns {number} Number of rows deleted.
 */
export function purgeStale(db, { projectFilter, baseParams, opCap = OP_CAP }, retainCutoff) {
  const sql = `
    DELETE FROM observations WHERE id IN (
      SELECT id FROM observations
      WHERE compressed_into = ${COMPRESSED_PENDING_PURGE} AND created_at_epoch < ?
        ${projectFilter} LIMIT ${opCap}
    )
  `;
  const { changes } = db.prepare(sql).run(retainCutoff, ...baseParams);
  return changes;
}
|
|
147
|
+
|
|
148
|
+
/**
 * Detect near-duplicate titles among the most recent active observations.
 *
 * Every pair is first screened with the MinHash estimate; only pairs that reach
 * MINHASH_PRE_THRESHOLD get the exact Jaccard comparison, and only those
 * strictly above SIMILARITY_THRESHOLD are reported. Rows whose title is empty
 * after trimming are ignored. The scan stops once `dupLimit` pairs are collected.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - SQL fragment (`'AND project = ?'` or `''`).
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} [ctx.limit=SCAN_LIMIT] - How many recent rows to scan.
 * @param {number} [ctx.dupLimit=DUPLICATE_LIMIT] - Maximum pairs to return.
 * @returns {Array<{a: object, b: object, similarity: string}>} Pairs of
 *   `{ id, title, importance }` with the similarity formatted to two decimals.
 */
export function findDuplicates(db, { projectFilter, baseParams, limit = SCAN_LIMIT, dupLimit = DUPLICATE_LIMIT }) {
  const rows = db.prepare(`
    SELECT id, title, project, importance, access_count, created_at_epoch
    FROM observations
    WHERE COALESCE(compressed_into, 0) = 0 ${projectFilter}
    ORDER BY created_at_epoch DESC LIMIT ${limit}
  `).all(...baseParams);

  // Normalize each title once and compute its signature up front so the
  // pairwise loop below stays cheap.
  const prepared = rows.map((row) => {
    const title = (row.title || '').trim();
    return { row, title, signature: title ? computeMinHash(title) : null };
  });
  const describe = ({ row }) => ({ id: row.id, title: row.title, importance: row.importance });

  const pairs = [];
  for (let i = 0; i < prepared.length && pairs.length < dupLimit; i += 1) {
    const left = prepared[i];
    if (!left.title || !left.signature) continue;

    for (let j = i + 1; j < prepared.length; j += 1) {
      const right = prepared[j];
      if (!right.title || !right.signature) continue;
      if (estimateJaccardFromMinHash(left.signature, right.signature) < MINHASH_PRE_THRESHOLD) continue;

      const sim = jaccardSimilarity(left.title, right.title);
      if (sim > SIMILARITY_THRESHOLD) {
        pairs.push({ a: describe(left), b: describe(right), similarity: sim.toFixed(2) });
      }
      if (pairs.length >= dupLimit) break;
    }
  }
  return pairs;
}
|
|
181
|
+
|
|
182
|
+
/**
 * Collect maintenance counters for active observations in one aggregate scan,
 * plus a separate count of rows already flagged pending-purge.
 *
 * Counters: `total`, `stale` (importance 1, never accessed, created before the
 * `staleAge` cutoff), `broken` (no title and no narrative), `boostable`
 * (accessed more than 3 times, importance below 3) and `pinned` (injected at
 * least PINNED_INJ_THRESHOLD times, never cited, importance above 1).
 *
 * NOTE(review): `stale` does not exclude rows with injection_count > 0, whereas
 * the mark-idle statement does, so this counter can exceed what a maintenance
 * run would actually mark — confirm whether that is intended.
 *
 * @param {object} db - Database handle exposing `prepare(sql).get(...params)`.
 * @param {object} ctx
 * @param {string} ctx.projectFilter - SQL fragment (`'AND project = ?'` or `''`).
 * @param {Array} ctx.baseParams - Values bound to the placeholders in `projectFilter`.
 * @param {number} ctx.staleAge - Compared as `created_at_epoch < staleAge`.
 * @returns {object} The counters above plus `pendingPurge`.
 */
export function maintenanceStats(db, { projectFilter, baseParams, staleAge }) {
  const countersSql = `
    SELECT
      COUNT(*) as total,
      COALESCE(SUM(CASE WHEN COALESCE(importance, 1) = 1 AND COALESCE(access_count, 0) = 0
        AND created_at_epoch < ? THEN 1 ELSE 0 END), 0) as stale,
      COALESCE(SUM(CASE WHEN (title IS NULL OR title = '') AND (narrative IS NULL OR narrative = '')
        THEN 1 ELSE 0 END), 0) as broken,
      COALESCE(SUM(CASE WHEN COALESCE(access_count, 0) > 3 AND COALESCE(importance, 1) < 3
        THEN 1 ELSE 0 END), 0) as boostable,
      COALESCE(SUM(CASE WHEN COALESCE(injection_count, 0) >= ${PINNED_INJ_THRESHOLD}
        AND COALESCE(cited_count, 0) = 0 AND COALESCE(importance, 1) > 1
        THEN 1 ELSE 0 END), 0) as pinned
    FROM observations
    WHERE COALESCE(compressed_into, 0) = 0 ${projectFilter}
  `;
  const pendingSql =
    `SELECT COUNT(*) as count FROM observations WHERE compressed_into = ${COMPRESSED_PENDING_PURGE} ${projectFilter}`;

  const counters = db.prepare(countersSql).get(staleAge, ...baseParams);
  const { count } = db.prepare(pendingSql).get(...baseParams);
  return Object.assign({}, counters, { pendingPurge: count });
}
|
|
204
|
+
|
|
205
|
+
/**
 * Rebuild the TF-IDF vocabulary and then regenerate the vector of every active,
 * non-superseded observation.
 *
 * The vocabulary cache is reset and the vocabulary rebuilt first; the vector
 * table is then emptied and refilled inside a single `db.transaction`. Rows for
 * which `computeVector` returns nothing are skipped.
 *
 * @param {object} db - Database handle exposing `prepare` and `transaction`.
 * @returns {{ ok: false, reason: string } |
 *   { ok: true, terms: number, updated: number, total: number }}
 *   `updated` counts vectors written; `total` counts rows considered.
 */
export function rebuildVectors(db) {
  _resetVocabCache();
  const vocab = rebuildVocabulary(db);
  if (!vocab) {
    return { ok: false, reason: 'no observations to build vocabulary from' };
  }

  const activeRows = db.prepare(`
    SELECT id, title, narrative, concepts FROM observations
    WHERE COALESCE(compressed_into, 0) = 0 AND superseded_at IS NULL
  `).all();
  const upsert = db.prepare('INSERT OR REPLACE INTO observation_vectors (observation_id, vector, vocab_version, created_at_epoch) VALUES (?, ?, ?, ?)');
  // Every vector written by this rebuild carries the same timestamp.
  const stampedAt = Date.now();
  let written = 0;

  const refill = db.transaction(() => {
    db.prepare('DELETE FROM observation_vectors').run();
    for (const { id, title, narrative, concepts } of activeRows) {
      // Empty or missing fields are left out of the joined text.
      const text = [title, narrative, concepts].filter(Boolean).join(' ');
      const vec = computeVector(text, vocab);
      if (!vec) continue;
      upsert.run(id, Buffer.from(vec.buffer), vocab.version, stampedAt);
      written += 1;
    }
  });
  refill();

  return { ok: true, terms: vocab.terms.size, updated: written, total: activeRows.length };
}
|
|
230
|
+
|
|
231
|
+
/**
 * Run VACUUM on the database and report how much freelist space was reclaimed.
 * Must be called OUTSIDE any transaction.
 *
 * @param {object} db - Database handle exposing `pragma(name, opts)` and `exec(sql)`.
 * @returns {{ reclaimedMB: string, freeBefore: number, freeAfter: number }}
 *   `reclaimedMB` is the freed size in MiB formatted to one decimal (never
 *   negative); the other two are the freelist page counts before and after.
 */
export function vacuum(db) {
  const readPragma = (name) => db.pragma(name, { simple: true });

  const pageSize = readPragma('page_size');
  const freeBefore = readPragma('freelist_count');
  db.exec('VACUUM');
  const freeAfter = readPragma('freelist_count');

  // Clamp at zero so a freelist that grew is reported as nothing reclaimed.
  const reclaimedPages = Math.max(0, freeBefore - freeAfter);
  const reclaimedMB = ((reclaimedPages * pageSize) / 1048576).toFixed(1);
  return { reclaimedMB, freeBefore, freeAfter };
}
|
package/lib/save-observation.mjs
CHANGED
|
@@ -13,10 +13,10 @@
|
|
|
13
13
|
|
|
14
14
|
import { jaccardSimilarity, scrubSecrets, computeMinHash, cjkBigrams, getCurrentBranch, debugCatch } from '../utils.mjs';
|
|
15
15
|
import { getVocabulary, computeVector } from '../tfidf.mjs';
|
|
16
|
+
import { DEDUP_JACCARD_THRESHOLD } from './dedup-constants.mjs';
|
|
16
17
|
|
|
17
18
|
const DEDUP_WINDOW_MS = 5 * 60 * 1000;
|
|
18
19
|
const DEDUP_RECENT_LIMIT = 50;
|
|
19
|
-
const DEDUP_JACCARD_THRESHOLD = 0.7;
|
|
20
20
|
|
|
21
21
|
/**
|
|
22
22
|
* Save a new observation if it isn't a near-duplicate of one saved within the
|