claude-mem-lite 2.19.0 → 2.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +1 -1
- package/.claude-plugin/plugin.json +1 -1
- package/hook.mjs +4 -3
- package/install.mjs +14 -11
- package/nlp.mjs +187 -0
- package/package.json +6 -1
- package/scoring-sql.mjs +41 -0
- package/server-internals.mjs +3 -8
- package/server.mjs +13 -7
- package/stop-words.mjs +16 -0
- package/tfidf.mjs +425 -0
- package/tier.mjs +116 -0
- package/tool-schemas.mjs +1 -0
- package/utils.mjs +6 -2
package/hook.mjs
CHANGED
|
@@ -723,8 +723,9 @@ async function handleSessionStart() {
|
|
|
723
723
|
try { getVocabulary(db); } catch (e) { debugCatch(e, 'session-start-vocab'); }
|
|
724
724
|
|
|
725
725
|
// Auto-update check (24h throttle, 3s timeout, silent on failure)
|
|
726
|
-
//
|
|
727
|
-
|
|
726
|
+
// Awaited so process.exit(0) doesn't kill the promise before notification
|
|
727
|
+
try {
|
|
728
|
+
const updateResult = await checkForUpdate();
|
|
728
729
|
if (updateResult?.updated) {
|
|
729
730
|
process.stdout.write(`\n🔄 claude-mem-lite: v${updateResult.from} → v${updateResult.to} updated\n`);
|
|
730
731
|
} else if (updateResult?.updateAvailable) {
|
|
@@ -733,7 +734,7 @@ async function handleSessionStart() {
|
|
|
733
734
|
: '';
|
|
734
735
|
process.stdout.write(`\n📦 claude-mem-lite: v${updateResult.to} available (current: v${updateResult.from})${hint}\n`);
|
|
735
736
|
}
|
|
736
|
-
}
|
|
737
|
+
} catch (e) { debugCatch(e, 'session-start-update'); }
|
|
737
738
|
|
|
738
739
|
} finally {
|
|
739
740
|
db.close();
|
package/install.mjs
CHANGED
|
@@ -51,13 +51,13 @@ function reindexKnownResources(rdb) {
|
|
|
51
51
|
UPDATE resources SET
|
|
52
52
|
intent_tags = ?, domain_tags = ?,
|
|
53
53
|
capability_summary = ?, trigger_patterns = ?,
|
|
54
|
-
invocation_name = CASE WHEN ?
|
|
55
|
-
recommendation_mode = CASE WHEN ?
|
|
56
|
-
keywords = CASE WHEN ?
|
|
57
|
-
tech_stack = CASE WHEN ?
|
|
58
|
-
use_cases = CASE WHEN ?
|
|
54
|
+
invocation_name = CASE WHEN ? != '' THEN ? ELSE invocation_name END,
|
|
55
|
+
recommendation_mode = CASE WHEN ? != '' THEN ? ELSE recommendation_mode END,
|
|
56
|
+
keywords = CASE WHEN ? != '' THEN ? ELSE keywords END,
|
|
57
|
+
tech_stack = CASE WHEN ? != '' THEN ? ELSE tech_stack END,
|
|
58
|
+
use_cases = CASE WHEN ? != '' THEN ? ELSE use_cases END,
|
|
59
59
|
updated_at = datetime('now')
|
|
60
|
-
WHERE type = ?
|
|
60
|
+
WHERE type = ? AND name = ?
|
|
61
61
|
`);
|
|
62
62
|
|
|
63
63
|
rdb.transaction(() => {
|
|
@@ -68,14 +68,17 @@ function reindexKnownResources(rdb) {
|
|
|
68
68
|
const name = key.slice(sep + 1);
|
|
69
69
|
const invName = meta.invocation_name || deriveInvocationName(name);
|
|
70
70
|
const recMode = meta.recommendation_mode || '';
|
|
71
|
+
const kw = meta.keywords || '';
|
|
72
|
+
const ts = meta.tech_stack || '';
|
|
73
|
+
const uc = meta.use_cases || '';
|
|
71
74
|
update.run(
|
|
72
75
|
meta.intent_tags, meta.domain_tags,
|
|
73
76
|
meta.capability_summary, meta.trigger_patterns,
|
|
74
|
-
invName,
|
|
75
|
-
recMode,
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
77
|
+
invName, invName,
|
|
78
|
+
recMode, recMode,
|
|
79
|
+
kw, kw,
|
|
80
|
+
ts, ts,
|
|
81
|
+
uc, uc,
|
|
79
82
|
type, name
|
|
80
83
|
);
|
|
81
84
|
}
|
package/nlp.mjs
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
// nlp.mjs -- FTS5 query building, synonym expansion, CJK tokenization.
|
|
2
|
+
// Extracted from utils.mjs for focused module boundaries.
|
|
3
|
+
|
|
4
|
+
import { BASE_STOP_WORDS } from './stop-words.mjs';
|
|
5
|
+
import { SYNONYM_MAP, CJK_COMPOUNDS } from './synonyms.mjs';
|
|
6
|
+
|
|
7
|
+
// Re-export for backward compatibility (consumers import from nlp.mjs or utils.mjs)
|
|
8
|
+
export { SYNONYM_MAP, CJK_COMPOUNDS };
|
|
9
|
+
|
|
10
|
+
// ─── FTS5 Constants ──────────────────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
const FTS5_KEYWORDS = new Set(['AND', 'OR', 'NOT', 'NEAR']);
|
|
13
|
+
|
|
14
|
+
// Sort by length descending for greedy matching
|
|
15
|
+
const CJK_SORTED = [...CJK_COMPOUNDS].sort((a, b) => b.length - a.length);
|
|
16
|
+
|
|
17
|
+
/**
 * Generate search tokens from CJK text using dictionary-first tokenization.
 * Known compound words (CJK_SORTED, longest first) are emitted whole; any
 * character not starting a known compound falls back to a character bigram.
 * "修复了数据库崩溃" → "修复 数据库 崩溃" (3 clean tokens)
 * vs naive bigrams: "修复 复了 了数 数据 据库 库崩 崩溃" (7 noisy tokens)
 * @param {string} text Input text containing CJK characters
 * @returns {string} Space-separated, de-duplicated tokens ('' when none)
 */
export function cjkBigrams(text) {
  if (!text) return '';
  const out = [];
  // Only runs of 2+ CJK chars are tokenizable; everything else is ignored.
  const segments = text.match(/[\u4e00-\u9fff\u3400-\u4dbf]{2,}/g) || [];
  for (const segment of segments) {
    let pos = 0;
    while (pos < segment.length) {
      // Greedy dictionary match: CJK_SORTED is ordered longest-first.
      const hit = CJK_SORTED.find(
        (word) => pos + word.length <= segment.length && segment.startsWith(word, pos)
      );
      if (hit) {
        out.push(hit);
        pos += hit.length;
      } else {
        // Unknown character: emit a bigram with the next char (if any), advance by one.
        if (pos + 1 < segment.length) out.push(segment[pos] + segment[pos + 1]);
        pos += 1;
      }
    }
  }
  return [...new Set(out)].join(' ');
}
|
|
53
|
+
|
|
54
|
+
// ─── CJK Synonym Extraction ─────────────────────────────────────────────────
|
|
55
|
+
|
|
56
|
+
// Extract known CJK words (from SYNONYM_MAP) out of unsegmented CJK text.
|
|
57
|
+
// Greedy longest-match: "数据库的全文搜索" → ["数据库", "搜索"] (skips particles/unknown).
|
|
58
|
+
const _cjkSynonymKeys = [...SYNONYM_MAP.keys()]
|
|
59
|
+
.filter(k => /[\u4e00-\u9fff\u3400-\u4dbf]/.test(k))
|
|
60
|
+
.sort((a, b) => b.length - a.length); // longest first
|
|
61
|
+
|
|
62
|
+
/**
 * Extract known CJK vocabulary words (CJK keys of SYNONYM_MAP) from
 * unsegmented CJK text via greedy longest-first matching.
 * Characters that do not start a known word are skipped (particles, unknowns).
 * @param {string} text Unsegmented CJK text
 * @returns {string[]} Matched vocabulary words, in order of appearance
 */
export function extractCjkSynonymTokens(text) {
  const words = [];
  for (let pos = 0; pos < text.length; ) {
    // _cjkSynonymKeys is sorted longest-first, so find() is a greedy match.
    const hit = _cjkSynonymKeys.find((key) => text.startsWith(key, pos));
    if (hit) {
      words.push(hit);
      pos += hit.length;
    } else {
      pos += 1;
    }
  }
  return words;
}
|
|
79
|
+
|
|
80
|
+
// ─── FTS5 Token Formatting ──────────────────────────────────────────────────
|
|
81
|
+
|
|
82
|
+
// Format a single term for embedding in an FTS5 MATCH expression.
// Purely alphanumeric/CJK terms are safe to use bare; anything else
// (hyphens, dots, spaces, …) must be double-quoted with internal
// double quotes doubled, per FTS5 string syntax.
const BARE_FTS_TERM = /^[a-zA-Z0-9\u4e00-\u9fff\u3400-\u4dbf]+$/;
function ftsToken(term) {
  if (BARE_FTS_TERM.test(term)) return term;
  const escaped = term.replaceAll('"', '""');
  return `"${escaped}"`;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Expand a token into an FTS5 OR-group with its synonyms.
 * No synonyms registered → the formatted token alone;
 * otherwise "(token OR syn1 OR "multi word syn" …)".
 * @param {string} token Raw search token
 * @returns {string} FTS5-safe term or parenthesized OR group
 */
export function expandToken(token) {
  const syns = SYNONYM_MAP.get(token.toLowerCase());
  if (!syns?.size) return ftsToken(token);
  // Original token first, then each synonym, all FTS5-escaped.
  const group = [token, ...syns].map((t) => ftsToken(t)).join(' OR ');
  return `(${group})`;
}
|
|
99
|
+
|
|
100
|
+
// ─── Stop Words ──────────────────────────────────────────────────────────────
|
|
101
|
+
|
|
102
|
+
export const FTS_STOP_WORDS = new Set([...BASE_STOP_WORDS]);
|
|
103
|
+
|
|
104
|
+
// ─── FTS5 Query Sanitization ─────────────────────────────────────────────────
|
|
105
|
+
|
|
106
|
+
/**
 * Sanitize and expand a user query into a valid FTS5 query string.
 * Pipeline: strip FTS5 special characters → drop keywords/noise tokens →
 * remove stop words → segment CJK via the synonym dictionary → expand
 * synonyms → append CJK bigrams → join with AND (explicit when OR-groups
 * are present, implicit space otherwise).
 * @param {string} query Raw user search query
 * @returns {string|null} FTS5-safe query or null if nothing searchable remains
 */
export function sanitizeFtsQuery(query) {
  if (!query) return null;
  const cleaned = query
    // Strip FTS5 operators/special syntax chars; leading '-' (NOT prefix) removed per token.
    .replace(/[{}()[\]^~*:"\\]/g, ' ')
    .replace(/(^|\s)-/g, '$1')
    .trim();
  if (!cleaned) return null;
  // Drop empties, bare dashes, FTS5 keywords (AND/OR/NOT/NEAR), and NEAR/n forms.
  let tokens = cleaned.split(/\s+/).filter(t =>
    t && !/^-+$/.test(t) && !FTS5_KEYWORDS.has(t.toUpperCase()) && !/^NEAR\/\d+$/i.test(t)
    // Skip single ASCII-letter tokens — too noisy for FTS5 (CJK single chars handled separately below)
    && !(t.length === 1 && /^[a-zA-Z]$/.test(t))
  );
  // Filter stop words (but keep all if filtering would empty the query)
  const filtered = tokens.filter(t => !FTS_STOP_WORDS.has(t.toLowerCase()));
  if (filtered.length > 0) tokens = filtered;
  // Split unsegmented CJK tokens into known vocabulary words for synonym expansion.
  // e.g. "数据库的全文搜索" → ["数据库", "搜索"] (both have EN synonyms in SYNONYM_MAP)
  const expandedTokens = [];
  let cjkExtracted = false;
  for (const t of tokens) {
    if (/[\u4e00-\u9fff\u3400-\u4dbf]/.test(t) && t.length > 2) {
      const cjkWords = extractCjkSynonymTokens(t);
      if (cjkWords.length > 0) {
        expandedTokens.push(...cjkWords);
        cjkExtracted = true;
        continue;
      }
    }
    // Non-CJK token, or CJK token with no dictionary hits: keep as-is.
    expandedTokens.push(t);
  }
  tokens = expandedTokens;
  if (tokens.length === 0) return null;
  // Replace single CJK character tokens with bigrams for better phrase matching.
  // Individual CJK chars ("系","统") are too noisy; bigrams ("系统") capture compound words.
  // Skip bigrams when CJK synonym extraction already produced meaningful tokens —
  // bigrams joined with AND would make the query too restrictive.
  const bigrams = cjkExtracted ? null : cjkBigrams(cleaned);
  const bigramSet = new Set(bigrams ? bigrams.split(' ').filter(Boolean) : []);
  const hasBigrams = bigramSet.size > 0;
  const finalTokens = [];
  const seen = new Set();
  const rawTokensSeen = new Set(); // track raw tokens to prevent bigram duplicates
  for (const t of tokens) {
    // Skip single CJK characters when we have bigrams — they're subsumed by bigram tokens
    if (hasBigrams && /^[\u4e00-\u9fff\u3400-\u4dbf]$/.test(t)) continue;
    const expanded = expandToken(t);
    if (!seen.has(expanded)) { seen.add(expanded); rawTokensSeen.add(t); finalTokens.push(expanded); }
  }
  // Append bigrams not already present as raw or expanded tokens.
  for (const bg of bigramSet) {
    if (!seen.has(bg) && !rawTokensSeen.has(bg)) { seen.add(bg); finalTokens.push(bg); }
  }
  if (finalTokens.length === 0) return null;
  // FTS5 requires explicit AND after parenthesized OR groups
  const hasGroup = finalTokens.some(e => e.startsWith('('));
  return finalTokens.join(hasGroup ? ' AND ' : ' ');
}
|
|
168
|
+
|
|
169
|
+
/**
 * Relax an AND-joined FTS5 query into an OR-joined one for fallback search.
 * Only useful when the query has multiple tokens — a single-token query is
 * already as relaxed as possible.
 * @param {string} ftsQuery Original AND-joined FTS5 query from sanitizeFtsQuery
 * @returns {string|null} OR-joined query, or null when relaxation wouldn't help
 */
export function relaxFtsQueryToOr(ftsQuery) {
  if (!ftsQuery) return null;
  // Explicit " AND " joins (used when synonym OR-groups are present) → " OR ".
  const relaxed = ftsQuery.replaceAll(' AND ', ' OR ');
  if (relaxed !== ftsQuery) return relaxed;
  // No AND, but already contains OR (e.g. a lone synonym group) — nothing to widen.
  if (ftsQuery.includes(' OR ')) return null;
  // Implicit AND: plain space-joined tokens. One token can't be relaxed further.
  const tokens = ftsQuery.split(/\s+/);
  return tokens.length < 2 ? null : tokens.join(' OR ');
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-mem-lite",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.20.0",
|
|
4
4
|
"description": "Lightweight persistent memory system for Claude Code",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"engines": {
|
|
@@ -46,6 +46,11 @@
|
|
|
46
46
|
"skip-tools.mjs",
|
|
47
47
|
"tool-schemas.mjs",
|
|
48
48
|
"utils.mjs",
|
|
49
|
+
"nlp.mjs",
|
|
50
|
+
"scoring-sql.mjs",
|
|
51
|
+
"stop-words.mjs",
|
|
52
|
+
"tier.mjs",
|
|
53
|
+
"tfidf.mjs",
|
|
49
54
|
"project-utils.mjs",
|
|
50
55
|
"secret-scrub.mjs",
|
|
51
56
|
"format-utils.mjs",
|
package/scoring-sql.mjs
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// scoring-sql.mjs — SQL constants for BM25 scoring and temporal decay.
|
|
2
|
+
// Extracted from utils.mjs for focused module boundaries.
|
|
3
|
+
|
|
4
|
+
// ─── Type-Differentiated Recency Decay ──────────────────────────────────────
|
|
5
|
+
|
|
6
|
+
/** Recency half-life per observation type (in milliseconds) */
|
|
7
|
+
export const DECAY_HALF_LIFE_BY_TYPE = {
|
|
8
|
+
decision: 90 * 86400000, // 90 days — architectural decisions persist
|
|
9
|
+
discovery: 60 * 86400000, // 60 days — learned patterns last
|
|
10
|
+
feature: 30 * 86400000, // 30 days — feature work is mid-range
|
|
11
|
+
bugfix: 14 * 86400000, // 14 days — bugs are usually one-off
|
|
12
|
+
refactor: 14 * 86400000, // 14 days — code cleanup
|
|
13
|
+
change: 7 * 86400000, // 7 days — routine changes decay fast
|
|
14
|
+
};
|
|
15
|
+
export const DEFAULT_DECAY_HALF_LIFE_MS = 14 * 86400000;
|
|
16
|
+
|
|
17
|
+
// ─── BM25 Weight Constants ──────────────────────────────────────────────────
|
|
18
|
+
// Single source of truth for FTS5 BM25 weight expressions.
|
|
19
|
+
// Column order must match ensureFTS() calls in schema.mjs.
|
|
20
|
+
|
|
21
|
+
/** observations_fts BM25 weights: title=10, subtitle=5, narrative=5, text=3, facts=3, concepts=2, lesson_learned=8 */
|
|
22
|
+
export const OBS_BM25 = 'bm25(observations_fts, 10, 5, 5, 3, 3, 2, 8)';
|
|
23
|
+
|
|
24
|
+
/** session_summaries_fts BM25 weights: request=5, investigated=3, learned=3, completed=3, next_steps=2, notes=1, remaining_items=1 */
|
|
25
|
+
export const SESS_BM25 = 'bm25(session_summaries_fts, 5, 3, 3, 3, 2, 1, 1)';
|
|
26
|
+
|
|
27
|
+
/** FTS5 columns for observations (must match BM25 weight order) */
|
|
28
|
+
export const OBS_FTS_COLUMNS = ['title', 'subtitle', 'narrative', 'text', 'facts', 'concepts', 'lesson_learned'];
|
|
29
|
+
|
|
30
|
+
/** SQL CASE for type-differentiated recency decay half-lives (milliseconds) */
|
|
31
|
+
export const TYPE_DECAY_CASE = `(
|
|
32
|
+
CASE o.type
|
|
33
|
+
WHEN 'decision' THEN 7776000000.0
|
|
34
|
+
WHEN 'discovery' THEN 5184000000.0
|
|
35
|
+
WHEN 'feature' THEN 2592000000.0
|
|
36
|
+
WHEN 'bugfix' THEN 1209600000.0
|
|
37
|
+
WHEN 'refactor' THEN 1209600000.0
|
|
38
|
+
WHEN 'change' THEN 604800000.0
|
|
39
|
+
ELSE 1209600000.0
|
|
40
|
+
END
|
|
41
|
+
)`;
|
package/server-internals.mjs
CHANGED
|
@@ -76,8 +76,6 @@ export function reRankWithContext(db, results, project) {
|
|
|
76
76
|
*/
|
|
77
77
|
export function markSuperseded(db, results) {
|
|
78
78
|
if (!results || results.length === 0) return;
|
|
79
|
-
const now = Date.now();
|
|
80
|
-
const updateStmt = db ? db.prepare('UPDATE observations SET superseded_at = ?, superseded_by = ? WHERE id = ? AND superseded_at IS NULL') : null;
|
|
81
79
|
// Build map: file → [result objects], only for obs with files
|
|
82
80
|
const fileMap = new Map();
|
|
83
81
|
for (const r of results) {
|
|
@@ -89,7 +87,9 @@ export function markSuperseded(db, results) {
|
|
|
89
87
|
fileMap.get(f).push(r);
|
|
90
88
|
}
|
|
91
89
|
}
|
|
92
|
-
// For each file with 2+ observations: mark older lower-importance as superseded
|
|
90
|
+
// For each file with 2+ observations: mark older lower-importance as superseded (in-memory only)
|
|
91
|
+
// Note: DB persistence removed — search is a read operation and should not write.
|
|
92
|
+
// Persistent superseding belongs in mem_maintain/mem_compress write paths.
|
|
93
93
|
for (const [, obsForFile] of fileMap) {
|
|
94
94
|
if (obsForFile.length < 2) continue;
|
|
95
95
|
obsForFile.sort((a, b) => (b.date || '').localeCompare(a.date || ''));
|
|
@@ -97,11 +97,6 @@ export function markSuperseded(db, results) {
|
|
|
97
97
|
for (let i = 1; i < obsForFile.length; i++) {
|
|
98
98
|
if ((obsForFile[i].importance ?? 1) <= (newest.importance ?? 1)) {
|
|
99
99
|
obsForFile[i].superseded = true;
|
|
100
|
-
if (db) {
|
|
101
|
-
try {
|
|
102
|
-
updateStmt.run(now, newest.id, obsForFile[i].id);
|
|
103
|
-
} catch (e) { debugCatch(e, 'markSuperseded-persist'); }
|
|
104
|
-
}
|
|
105
100
|
}
|
|
106
101
|
}
|
|
107
102
|
}
|
package/server.mjs
CHANGED
|
@@ -729,7 +729,7 @@ server.registerTool(
|
|
|
729
729
|
// Auto-boost importance for frequently accessed observations
|
|
730
730
|
autoBoostIfNeeded(db, args.ids);
|
|
731
731
|
rows = db.prepare(`SELECT * FROM observations WHERE id IN (${placeholders}) ORDER BY created_at_epoch ASC`).all(...args.ids);
|
|
732
|
-
allFields = ['id', 'type', 'title', 'subtitle', 'narrative', 'text', 'facts', 'concepts', 'files_read', 'files_modified', 'project', 'created_at', 'memory_session_id', 'prompt_number', 'importance', 'related_ids', 'access_count', 'branch', 'superseded_at', 'superseded_by', 'last_accessed_at'];
|
|
732
|
+
allFields = ['id', 'type', 'title', 'subtitle', 'narrative', 'text', 'facts', 'concepts', 'lesson_learned', 'search_aliases', 'files_read', 'files_modified', 'project', 'created_at', 'memory_session_id', 'prompt_number', 'importance', 'related_ids', 'access_count', 'branch', 'superseded_at', 'superseded_by', 'last_accessed_at'];
|
|
733
733
|
prefix = '#';
|
|
734
734
|
}
|
|
735
735
|
|
|
@@ -1531,7 +1531,10 @@ server.registerTool(
|
|
|
1531
1531
|
const updates = [];
|
|
1532
1532
|
const params = [];
|
|
1533
1533
|
for (const [key, col] of [['title','title'],['narrative','narrative'],['type','type'],['importance','importance'],['lesson_learned','lesson_learned'],['concepts','concepts']]) {
|
|
1534
|
-
if (args[key] !== undefined) {
|
|
1534
|
+
if (args[key] !== undefined) {
|
|
1535
|
+
updates.push(`${col} = ?`);
|
|
1536
|
+
params.push(typeof args[key] === 'string' ? scrubSecrets(args[key]) : args[key]);
|
|
1537
|
+
}
|
|
1535
1538
|
}
|
|
1536
1539
|
if (updates.length === 0) return { content: [{ type: 'text', text: 'No fields to update' }], isError: true };
|
|
1537
1540
|
|
|
@@ -1541,9 +1544,11 @@ server.registerTool(
|
|
|
1541
1544
|
db.transaction(() => {
|
|
1542
1545
|
db.prepare(`UPDATE observations SET ${updates.join(', ')} WHERE id = ?`).run(...params);
|
|
1543
1546
|
|
|
1544
|
-
// Rebuild FTS text field
|
|
1545
|
-
const row = db.prepare('SELECT title, subtitle, narrative, concepts, facts FROM observations WHERE id = ?').get(args.id);
|
|
1546
|
-
const
|
|
1547
|
+
// Rebuild FTS text field (must include CJK bigrams + search_aliases to match mem_save/hook-llm)
|
|
1548
|
+
const row = db.prepare('SELECT title, subtitle, narrative, concepts, facts, lesson_learned, search_aliases FROM observations WHERE id = ?').get(args.id);
|
|
1549
|
+
const base = [row.title, row.subtitle, row.narrative, row.concepts, row.facts, row.lesson_learned, row.search_aliases].filter(Boolean).join(' ');
|
|
1550
|
+
const bigrams = cjkBigrams((row.title || '') + ' ' + (row.narrative || ''));
|
|
1551
|
+
const textField = bigrams ? base + ' ' + bigrams : base;
|
|
1547
1552
|
db.prepare('UPDATE observations SET text = ? WHERE id = ?').run(textField, args.id);
|
|
1548
1553
|
|
|
1549
1554
|
// Re-vectorize (non-critical — catch to avoid rollback)
|
|
@@ -1589,7 +1594,8 @@ server.registerTool(
|
|
|
1589
1594
|
}
|
|
1590
1595
|
|
|
1591
1596
|
const where = wheres.length > 0 ? 'WHERE ' + wheres.join(' AND ') : '';
|
|
1592
|
-
const
|
|
1597
|
+
const exportLimit = Math.min(args.limit ?? 200, 1000);
|
|
1598
|
+
const rows = db.prepare(`SELECT id, project, type, title, subtitle, narrative, concepts, facts, lesson_learned, importance, files_modified, created_at, created_at_epoch FROM observations ${where} ORDER BY created_at_epoch DESC LIMIT ?`).all(...params, exportLimit);
|
|
1593
1599
|
|
|
1594
1600
|
if (rows.length === 0) return { content: [{ type: 'text', text: 'No observations found matching the criteria.' }] };
|
|
1595
1601
|
|
|
@@ -1597,7 +1603,7 @@ server.registerTool(
|
|
|
1597
1603
|
? rows.map(r => JSON.stringify(r)).join('\n')
|
|
1598
1604
|
: JSON.stringify(rows, null, 2);
|
|
1599
1605
|
|
|
1600
|
-
const cap = rows.length >=
|
|
1606
|
+
const cap = rows.length >= exportLimit ? `\nNote: Results capped at ${exportLimit}. Use date_from/date_to or increase limit (max 1000) to export more.` : '';
|
|
1601
1607
|
return { content: [{ type: 'text', text: `Exported ${rows.length} observations:${cap}\n${output}` }] };
|
|
1602
1608
|
})
|
|
1603
1609
|
);
|
package/stop-words.mjs
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
// stop-words.mjs — Shared base stop-word set for all NLP/search modules.
|
|
2
|
+
// Single source of truth: consumers extend with domain-specific extras.
|
|
3
|
+
|
|
4
|
+
/** Common English stop words shared across FTS, TF-IDF, PRF, and registry search. */
|
|
5
|
+
export const BASE_STOP_WORDS = new Set([
|
|
6
|
+
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
|
|
7
|
+
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
|
|
8
|
+
'should', 'may', 'might', 'can', 'shall', 'to', 'of', 'in', 'for',
|
|
9
|
+
'on', 'with', 'at', 'by', 'from', 'as', 'into', 'about', 'between',
|
|
10
|
+
'after', 'before', 'above', 'below', 'and', 'or', 'but', 'not', 'no',
|
|
11
|
+
'this', 'that', 'these', 'those', 'it', 'its', 'my', 'your', 'his',
|
|
12
|
+
'her', 'our', 'their', 'me', 'him', 'us', 'them', 'i', 'you', 'he',
|
|
13
|
+
'she', 'we', 'they', 'what', 'which', 'who', 'when', 'where', 'how',
|
|
14
|
+
'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some',
|
|
15
|
+
'such', 'than', 'too', 'very', 'just', 'also', 'then', 'so', 'if',
|
|
16
|
+
]);
|
package/tfidf.mjs
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
// tfidf.mjs — TF-IDF vector search engine
|
|
2
|
+
// Pure JS implementation, zero external dependencies.
|
|
3
|
+
// Provides tokenization, vocabulary building, vector computation,
|
|
4
|
+
// cosine similarity, vector search, and RRF merging.
|
|
5
|
+
|
|
6
|
+
import { cjkBigrams } from './utils.mjs';
|
|
7
|
+
import { BASE_STOP_WORDS } from './stop-words.mjs';
|
|
8
|
+
import { createHash } from 'crypto';
|
|
9
|
+
|
|
10
|
+
export const VOCAB_DIM = 512;
|
|
11
|
+
export const MIN_COSINE_SIMILARITY = 0.05;
|
|
12
|
+
export const VECTOR_SCAN_LIMIT = 500;
|
|
13
|
+
|
|
14
|
+
const VOCAB_STOP_WORDS = new Set([
|
|
15
|
+
...BASE_STOP_WORDS,
|
|
16
|
+
'now','only','still','here','there','up','out','am',
|
|
17
|
+
]);
|
|
18
|
+
|
|
19
|
+
// ─── Porter Stemmer ──────────────────────────────────────────────────────────
|
|
20
|
+
// Minimal Porter stemmer (1980) aligned with SQLite FTS5's built-in porter tokenizer.
|
|
21
|
+
|
|
22
|
+
// Step-2 suffix → replacement rewrites (applied only when measure(stem) > 0).
const step2map = {
  ational:'ate', tional:'tion', enci:'ence', anci:'ance', izer:'ize',
  abli:'able', alli:'al', entli:'ent', eli:'e', ousli:'ous', ization:'ize',
  ation:'ate', ator:'ate', alism:'al', iveness:'ive', fulness:'ful',
  ousness:'ous', aliti:'al', iviti:'ive', biliti:'ble', logi:'log',
};
// Step-3 suffix → replacement rewrites (applied only when measure(stem) > 0).
const step3map = {
  icate:'ic', ative:'', alize:'al', iciti:'ic', ical:'ic', ful:'', ness:'',
};

// True when the character at idx acts as a consonant; 'y' counts as a
// consonant at position 0 or directly after a vowel.
function consonant(str, idx) {
  const ch = str[idx];
  if (/[aeiou]/.test(ch)) return false;
  if (ch === 'y') return idx === 0 || !/[aeiou]/.test(str[idx - 1]);
  return true;
}

// Approximate Porter "measure": counts consonant→vowel transitions
// (i.e. the number of vowel runs in the word).
function measure(str) {
  let count = 0;
  let prevWasConsonant = true; // start assuming consonant context
  for (let idx = 0; idx < str.length; idx++) {
    const isCons = consonant(str, idx);
    if (!isCons && prevWasConsonant) count++;
    prevWasConsonant = isCons;
  }
  return count;
}

// Does the word contain at least one vowel?
function hasVowel(str) {
  for (let idx = 0; idx < str.length; idx++) {
    if (!consonant(str, idx)) return true;
  }
  return false;
}

// Does the word end in a doubled consonant (e.g. "tt", "ss")?
function endsDouble(str) {
  const n = str.length;
  return n >= 2 && str[n - 1] === str[n - 2] && consonant(str, n - 1);
}

// Does the word end consonant-vowel-consonant, where the final consonant
// is not 'w', 'x', or 'y'? (Porter's *o condition.)
function cvc(str) {
  const n = str.length;
  if (n < 3) return false;
  if (!consonant(str, n - 1) || consonant(str, n - 2) || !consonant(str, n - 3)) return false;
  return !/[wxy]/.test(str[n - 1]);
}

/**
 * Minimal Porter stemmer (1980), aligned with SQLite FTS5's built-in
 * porter tokenizer so TF-IDF terms agree with FTS5-indexed terms.
 * @param {string} w Lowercase word
 * @returns {string} Stemmed word (words of length <= 2 returned unchanged)
 */
export function porterStem(w) {
  if (w.length <= 2) return w;
  let s = w;

  // Step 1a: plural endings (-sses, -ies, trailing -s).
  if (s.endsWith('sses')) s = s.slice(0, -2);
  else if (s.endsWith('ies')) s = s.slice(0, -2);
  else if (!s.endsWith('ss') && s.endsWith('s')) s = s.slice(0, -1);

  // Step 1b: -eed / -ed / -ing.
  let strippedEdIng = false;
  if (s.endsWith('eed')) {
    if (measure(s.slice(0, -3)) > 0) s = s.slice(0, -1);
  } else if (s.endsWith('ed') && hasVowel(s.slice(0, -2))) {
    s = s.slice(0, -2);
    strippedEdIng = true;
  } else if (s.endsWith('ing') && hasVowel(s.slice(0, -3))) {
    s = s.slice(0, -3);
    strippedEdIng = true;
  }
  if (strippedEdIng) {
    // Repair after stripping -ed/-ing: restore 'e' or undouble a consonant.
    if (s.endsWith('at') || s.endsWith('bl') || s.endsWith('iz')) {
      s += 'e';
    } else if (endsDouble(s) && !/[lsz]/.test(s[s.length - 1])) {
      s = s.slice(0, -1);
    } else if (measure(s) === 1 && cvc(s)) {
      s += 'e';
    }
  }

  // Step 1c: terminal 'y' → 'i' when a vowel precedes it.
  if (s.endsWith('y') && hasVowel(s.slice(0, -1))) {
    s = s.slice(0, -1) + 'i';
  }

  // Steps 2 and 3: derivational suffix rewrites.
  // Each table applies at most one rule — the first suffix that matches.
  for (const table of [step2map, step3map]) {
    for (const [suffix, replacement] of Object.entries(table)) {
      if (s.endsWith(suffix)) {
        const stem = s.slice(0, -suffix.length);
        if (measure(stem) > 0) s = stem + replacement;
        break;
      }
    }
  }

  // Step 4: strip remaining derivational suffixes when measure(stem) > 1.
  const step4suffixes = ['al','ance','ence','er','ic','able','ible','ant','ement','ment',
                         'ent','ion','ou','ism','ate','iti','ous','ive','ize'];
  for (const suffix of step4suffixes) {
    if (s.endsWith(suffix)) {
      const stem = s.slice(0, -suffix.length);
      if (measure(stem) > 1) {
        // '-ion' only drops when the stem ends in 's' or 't' (e.g. "adoption" → "adopt").
        if (suffix === 'ion') {
          if (stem.length > 0 && /[st]$/.test(stem)) s = stem;
        } else {
          s = stem;
        }
      }
      break;
    }
  }

  // Step 5a: drop a terminal 'e' when the stem is long enough.
  if (s.endsWith('e')) {
    const stem = s.slice(0, -1);
    if (measure(stem) > 1 || (measure(stem) === 1 && !cvc(stem))) s = stem;
  }

  // Step 5b: undouble a final 'll' on longer words.
  if (measure(s) > 1 && endsDouble(s) && s.endsWith('l')) {
    s = s.slice(0, -1);
  }

  return s;
}
|
|
139
|
+
|
|
140
|
+
// A term is noise when it is a stop word or consists only of digits.
function isNoiseTerm(term) {
  return VOCAB_STOP_WORDS.has(term) || /^\d+$/.test(term);
}
|
|
145
|
+
|
|
146
|
+
// ─── Tokenization ───────────────────────────────────────────────────────────
|
|
147
|
+
|
|
148
|
+
const CJK_RANGE = /[\u4e00-\u9fff\u3400-\u4dbf]/;

/**
 * Tokenize text into terms for TF-IDF.
 * ASCII segments: lowercase, split on non-alphanumerics, Porter-stem
 * (aligned with FTS5's porter tokenizer).
 * CJK segments: cjkBigrams() tokens, for consistency with FTS5 indexing.
 * Tokens shorter than 2 characters are dropped.
 * @param {string} text Input text (coerced to string; falsy → [])
 * @returns {string[]} Term list
 */
export function tokenize(text) {
  if (!text) return [];
  const lowered = String(text).toLowerCase();
  const terms = [];
  // The capture group keeps CJK runs in the split output, so the result
  // alternates ASCII segments and CJK segments.
  for (const segment of lowered.split(/([\u4e00-\u9fff\u3400-\u4dbf]+)/)) {
    if (CJK_RANGE.test(segment)) {
      const bigrams = cjkBigrams(segment);
      if (!bigrams) continue;
      for (const tok of bigrams.split(/\s+/)) {
        if (tok.length >= 2) terms.push(tok);
      }
    } else {
      for (const tok of segment.split(/[^a-z0-9]+/)) {
        if (tok.length >= 2) terms.push(porterStem(tok));
      }
    }
  }
  return terms;
}
|
|
182
|
+
|
|
183
|
+
// ─── Vocabulary ─────────────────────────────────────────────────────────────
|
|
184
|
+
|
|
185
|
+
let _vocabCache = null;
|
|
186
|
+
|
|
187
|
+
/** Reset vocabulary cache (for testing). */
|
|
188
|
+
export function _resetVocabCache() { _vocabCache = null; }
|
|
189
|
+
|
|
190
|
+
/**
 * Build global vocabulary (IDF table) from all active observations.
 * Active = not compressed away and not superseded. Terms come from
 * title + narrative + concepts; noise terms and hapax legomena (df < 2)
 * are dropped, and the VOCAB_DIM highest-IDF (rarest, most discriminative)
 * terms are kept. The result is stored in the module-level cache.
 * @param {object} db - better-sqlite3 database
 * @returns {{ terms: Map<string, {index: number, idf: number}>, version: string, dim: number } | null}
 *          null when there are no active observations
 */
export function buildVocabulary(db) {
  const rows = db.prepare(`
    SELECT title, narrative, concepts FROM observations
    WHERE COALESCE(compressed_into, 0) = 0 AND superseded_at IS NULL
  `).all();

  const N = rows.length;
  if (N === 0) return null;

  // Document frequency per term: each document counts a term at most once.
  const df = new Map();
  for (const row of rows) {
    const text = [row.title || '', row.narrative || '', row.concepts || ''].join(' ');
    const docTerms = new Set(tokenize(text));
    for (const term of docTerms) {
      df.set(term, (df.get(term) || 0) + 1);
    }
  }

  // Smoothed IDF, sorted descending (rare discriminative terms first).
  // Fix: the previous version also computed `ig: freq * idf(freq)` and kept
  // `df: freq` on every entry but never read them — dead work removed.
  const idf = (freq) => Math.log(1 + N / (1 + freq));
  const sortedTerms = [...df.entries()]
    .filter(([term, freq]) => !isNoiseTerm(term) && freq >= 2)
    .map(([term, freq]) => ({ term, idf: idf(freq) }))
    .sort((a, b) => b.idf - a.idf)
    .slice(0, VOCAB_DIM);

  // Assign stable vector indices in IDF order.
  const terms = new Map();
  sortedTerms.forEach((entry, index) => {
    terms.set(entry.term, { index, idf: entry.idf });
  });

  // Short content hash of the ordered term list, used for staleness detection.
  const termList = sortedTerms.map(e => e.term).join(',');
  const version = createHash('md5').update(termList).digest('hex').slice(0, 12);

  const vocab = { terms, version, dim: VOCAB_DIM };
  _vocabCache = vocab;
  return vocab;
}
|
|
236
|
+
|
|
237
|
+
/**
 * Rebuild vocabulary from corpus AND persist to vocab_state table.
 * The previous persisted vocabulary is replaced inside one transaction.
 * @param {object} db - better-sqlite3 database
 * @returns {object|null} The new vocabulary, or null if the corpus is empty
 */
export function rebuildVocabulary(db) {
  const vocab = buildVocabulary(db);
  if (vocab === null) return null;

  const now = Date.now();
  const insert = db.prepare(
    'INSERT INTO vocab_state (term, term_index, idf, version, created_at_epoch) VALUES (?, ?, ?, ?, ?)'
  );
  // Delete + re-insert atomically so readers never see a partial vocabulary.
  const persist = db.transaction(() => {
    db.prepare('DELETE FROM vocab_state').run();
    for (const [term, { index, idf }] of vocab.terms) {
      insert.run(term, index, idf, vocab.version, now);
    }
  });
  persist();

  _vocabCache = vocab;
  return vocab;
}
|
|
260
|
+
|
|
261
|
+
/**
 * Get the vocabulary: in-memory cache first, then the persisted vocab_state
 * table, finally a full rebuild from the corpus as a last resort.
 * @param {object} db - better-sqlite3 database
 * @returns {object|null} vocabulary, or null if the corpus is empty
 */
export function getVocabulary(db) {
  if (_vocabCache) return _vocabCache;

  // Attempt to restore a previously persisted vocabulary.
  try {
    const rows = db.prepare(
      'SELECT term, term_index, idf, version FROM vocab_state ORDER BY term_index'
    ).all();
    if (rows.length > 0) {
      const terms = new Map(
        rows.map((r) => [r.term, { index: r.term_index, idf: r.idf }])
      );
      _vocabCache = { terms, version: rows[0].version, dim: VOCAB_DIM };
      return _vocabCache;
    }
  } catch { /* table may not exist in old/test DBs */ }

  // First run (or empty table): compute from corpus and persist.
  return rebuildVocabulary(db);
}
|
|
288
|
+
|
|
289
|
+
// ─── Vector Computation ─────────────────────────────────────────────────────

/**
 * Compute TF-IDF vector for a text string.
 * Uses sublinear TF (1 + log tf) weighted by each term's IDF, then
 * L2-normalizes so that dot products are cosine similarities.
 * @param {string} text
 * @param {object} vocab - vocabulary from getVocabulary()
 * @returns {Float32Array | null} L2-normalized vector, or null if empty/no matching terms
 */
export function computeVector(text, vocab) {
  if (!vocab || !text) return null;

  const tokens = tokenize(text);
  if (tokens.length === 0) return null;

  // Raw term frequencies.
  const counts = new Map();
  for (const token of tokens) {
    counts.set(token, (counts.get(token) || 0) + 1);
  }

  // Sublinear TF-IDF weight for every in-vocabulary term.
  const vec = new Float32Array(vocab.dim);
  let matched = false;
  for (const [term, count] of counts) {
    const entry = vocab.terms.get(term);
    if (!entry) continue;
    vec[entry.index] = (1 + Math.log(count)) * entry.idf;
    matched = true;
  }
  if (!matched) return null;

  // L2 normalization.
  let sumSq = 0;
  for (let i = 0; i < vec.length; i++) sumSq += vec[i] * vec[i];
  const norm = Math.sqrt(sumSq);
  if (norm === 0) return null;
  for (let i = 0; i < vec.length; i++) vec[i] /= norm;

  return vec;
}
|
|
329
|
+
|
|
330
|
+
// ─── Cosine Similarity ──────────────────────────────────────────────────────

/**
 * Dot product of two L2-normalized Float32Arrays = cosine similarity.
 * Tolerates a length mismatch by multiplying only the overlapping prefix.
 */
export function cosineSimilarity(a, b) {
  const n = a.length <= b.length ? a.length : b.length;
  let sum = 0;
  for (let i = 0; i < n; i++) {
    sum += a[i] * b[i];
  }
  return sum;
}
|
|
341
|
+
|
|
342
|
+
// ─── Vector Search ──────────────────────────────────────────────────────────

const VECTOR_TIME_WINDOW_MS = 90 * 24 * 60 * 60 * 1000; // 90 days
const VECTOR_MIN_RESULTS = 50; // fallback to full scan if time-window yields fewer
const VECTOR_TOP_K = 20; // max results returned to the caller

/**
 * Search observation_vectors by cosine similarity.
 * Prefilters to the last 90 days of observations; if that yields fewer than
 * VECTOR_MIN_RESULTS candidates, rescans without the time constraint.
 * @param {object} db - better-sqlite3 database
 * @param {Float32Array} queryVec - query vector
 * @param {object} opts - { project?, type?, vocabVersion, limit? }
 * @returns {{ id: number, similarity: number }[]} top matches, best first
 */
export function vectorSearch(db, queryVec, { project, type, vocabVersion, limit = VECTOR_SCAN_LIMIT }) {
  if (!queryVec) return [];

  const now = Date.now();

  const wheres = [
    'COALESCE(o.compressed_into, 0) = 0',
    'o.superseded_at IS NULL',
    'ov.vocab_version = ?',
  ];
  const params = [vocabVersion];

  if (project) { wheres.push('o.project = ?'); params.push(project); }
  if (type) { wheres.push('o.type = ?'); params.push(type); }

  // Time-window prefilter: try 90 days first, fallback to full if too few results
  const timeWheres = [...wheres, 'o.created_at_epoch > ?'];
  const timeParams = [...params, now - VECTOR_TIME_WINDOW_MS, limit];

  let rows = db.prepare(`
    SELECT ov.observation_id, ov.vector
    FROM observation_vectors ov
    JOIN observations o ON ov.observation_id = o.id
    WHERE ${timeWheres.join(' AND ')}
    ORDER BY o.created_at_epoch DESC
    LIMIT ?
  `).all(...timeParams);

  // Fallback: if time-window yields too few, scan without time constraint
  if (rows.length < VECTOR_MIN_RESULTS) {
    params.push(limit);
    rows = db.prepare(`
      SELECT ov.observation_id, ov.vector
      FROM observation_vectors ov
      JOIN observations o ON ov.observation_id = o.id
      WHERE ${wheres.join(' AND ')}
      ORDER BY o.created_at_epoch DESC
      LIMIT ?
    `).all(...params);
  }

  const results = [];
  for (const row of rows) {
    // FIX: Node Buffers may sit at a non-4-byte-aligned byteOffset inside the
    // shared Buffer pool; constructing a Float32Array view at an unaligned
    // offset throws RangeError. View when aligned, copy otherwise.
    const buf = row.vector;
    const vec = buf.byteOffset % 4 === 0
      ? new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4)
      : new Float32Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength));
    const sim = cosineSimilarity(queryVec, vec);
    if (sim > MIN_COSINE_SIMILARITY) results.push({ id: row.observation_id, similarity: sim });
  }
  results.sort((a, b) => b.similarity - a.similarity);
  return results.slice(0, VECTOR_TOP_K);
}
|
|
404
|
+
|
|
405
|
+
// ─── RRF Merge ──────────────────────────────────────────────────────────────

/**
 * Reciprocal Rank Fusion: merge two ranked result lists.
 * Each list contributes 1/(k + rank + 1) per id; ids present in both lists
 * accumulate both contributions.
 * @param {{ id: number }[]} bm25Results - FTS5 results (ranked by position)
 * @param {{ id: number }[]} vectorResults - Vector results (ranked by position)
 * @param {number} k - RRF constant (default 60)
 * @returns {{ id: number, rrfScore: number }[]} sorted by rrfScore descending
 */
export function rrfMerge(bm25Results, vectorResults, k = 60) {
  const scores = new Map();
  const accumulate = (list) => {
    list.forEach((result, rank) => {
      const prev = scores.get(result.id) ?? 0;
      scores.set(result.id, prev + 1 / (k + rank + 1));
    });
  };
  accumulate(bm25Results);
  accumulate(vectorResults);
  return [...scores.entries()]
    .sort((a, b) => b[1] - a[1])
    .map(([id, rrfScore]) => ({ id, rrfScore }));
}
|
package/tier.mjs
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
// tier.mjs — Virtual three-tier memory classification engine
|
|
2
|
+
// Computes tier (working/active/archive) from existing observation fields.
|
|
3
|
+
// No database dependencies — pure functions + SQL expression.
|
|
4
|
+
|
|
5
|
+
import { DECAY_HALF_LIFE_BY_TYPE } from './utils.mjs';
|
|
6
|
+
|
|
7
|
+
// ─── Constants ────────────────────────────────────────────────────────────────

// Recency cutoff used by the same-project "working" tier rules below.
const TWO_HOURS_MS = 2 * 3600000;

/** Active window = 2x decay half-life per type (ms) */
export const ACTIVE_WINDOWS = Object.fromEntries(
  Object.entries(DECAY_HALF_LIFE_BY_TYPE).map(([type, hl]) => [type, hl * 2])
);
// Fallback window for observation types without a DECAY_HALF_LIFE_BY_TYPE entry.
const DEFAULT_ACTIVE_WINDOW_MS = DECAY_HALF_LIFE_BY_TYPE.change * 2;
|
|
16
|
+
|
|
17
|
+
// ─── JavaScript Tier Classification ──────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
/**
 * Compute tier for a single observation row.
 * Rules, in priority order: compressed/superseded → archive; same session →
 * working; same project + (important & recently accessed, or recently
 * created) → working; within the type's active window → active; else archive.
 * @param {object} obs - Row from observations table
 * @param {object} ctx - { now, currentProject, currentSessionId }
 * @returns {'working' | 'active' | 'archive'}
 */
export function computeTier(obs, ctx) {
  const { now, currentProject, currentSessionId } = ctx;

  // Compressed or superseded observations are always archived.
  const isCompressed = (obs.compressed_into ?? 0) !== 0;
  const isSuperseded = obs.superseded_at !== null && obs.superseded_at !== undefined;
  if (isCompressed || isSuperseded) return 'archive';

  // Observations from the current session are always working memory.
  if (currentSessionId && obs.memory_session_id === currentSessionId) return 'working';

  const twoHoursAgo = now - TWO_HOURS_MS;

  // Same-project observations stay in working memory when important and
  // recently accessed, or when created within the last two hours.
  if (obs.project === currentProject) {
    if ((obs.importance ?? 1) >= 2 && obs.last_accessed_at >= twoHoursAgo) return 'working';
    if (obs.created_at_epoch >= twoHoursAgo) return 'working';
  }

  // Within the type-specific active window → active; otherwise archive.
  const activeWindow = ACTIVE_WINDOWS[obs.type] ?? DEFAULT_ACTIVE_WINDOW_MS;
  return now - obs.created_at_epoch < activeWindow ? 'active' : 'archive';
}
|
|
54
|
+
|
|
55
|
+
// ─── SQL CASE Expression ────────────────────────────────────────────────────

/**
 * SQL CASE expression for inline tier computation.
 * Mirrors computeTier() rule-for-rule so SQL and JS classification agree;
 * the per-type 'active' branches unroll ACTIVE_WINDOWS because SQLite has
 * no map lookup. Placeholder order must match tierSqlParams().
 * Params: see tierSqlParams().
 */
export const TIER_CASE_SQL = `(CASE
  WHEN COALESCE(compressed_into, 0) != 0 THEN 'archive'
  WHEN superseded_at IS NOT NULL THEN 'archive'
  WHEN memory_session_id = ? THEN 'working'
  WHEN project = ? AND COALESCE(importance, 1) >= 2 AND last_accessed_at >= ? THEN 'working'
  WHEN project = ? AND created_at_epoch >= ? THEN 'working'
  WHEN type = 'decision' AND created_at_epoch >= ? THEN 'active'
  WHEN type = 'discovery' AND created_at_epoch >= ? THEN 'active'
  WHEN type = 'feature' AND created_at_epoch >= ? THEN 'active'
  WHEN type = 'bugfix' AND created_at_epoch >= ? THEN 'active'
  WHEN type = 'refactor' AND created_at_epoch >= ? THEN 'active'
  WHEN type = 'change' AND created_at_epoch >= ? THEN 'active'
  WHEN created_at_epoch >= ? THEN 'active'
  ELSE 'archive'
END)`;
|
|
76
|
+
|
|
77
|
+
/**
 * Build params array for TIER_CASE_SQL.
 * Entries are positional and must stay in the same order as the `?`
 * placeholders in the CASE expression's WHEN clauses.
 * @param {object} ctx - { now, currentProject, currentSessionId }
 * @returns {any[]}
 */
export function tierSqlParams(ctx) {
  const { now, currentProject, currentSessionId } = ctx;
  const twoHoursAgo = now - TWO_HOURS_MS;
  const project = currentProject ?? '';
  return [
    currentSessionId ?? '',         // working: same session
    project,                        // working: important + recently accessed
    twoHoursAgo,
    project,                        // working: recently created
    twoHoursAgo,
    now - ACTIVE_WINDOWS.decision,  // active: per-type windows
    now - ACTIVE_WINDOWS.discovery,
    now - ACTIVE_WINDOWS.feature,
    now - ACTIVE_WINDOWS.bugfix,
    now - ACTIVE_WINDOWS.refactor,
    now - ACTIVE_WINDOWS.change,
    now - DEFAULT_ACTIVE_WINDOW_MS, // active: fallback window
  ];
}
|
|
100
|
+
|
|
101
|
+
// ─── Relative Time Formatting ───────────────────────────────────────────────

/**
 * Format epoch as relative time string (e.g., "5min ago", "3d ago").
 * Buckets: seconds (< 1min), minutes (< 1h), hours (< 1d), days (< 30d),
 * then months of 30 days each.
 * @param {number} epoch - Timestamp in milliseconds
 * @param {number} now - Current time in milliseconds
 * @returns {string}
 */
export function relativeTime(epoch, now) {
  const elapsed = now - epoch;
  const buckets = [
    [60000, 1000, 's'],
    [3600000, 60000, 'min'],
    [86400000, 3600000, 'h'],
    [30 * 86400000, 86400000, 'd'],
  ];
  for (const [limit, divisor, suffix] of buckets) {
    if (elapsed < limit) return `${Math.floor(elapsed / divisor)}${suffix} ago`;
  }
  return `${Math.floor(elapsed / (30 * 86400000))}mo ago`;
}
|
package/tool-schemas.mjs
CHANGED
|
@@ -111,6 +111,7 @@ export const memExportSchema = {
|
|
|
111
111
|
date_from: z.string().optional().describe('Start date (ISO 8601 or YYYY-MM-DD)'),
|
|
112
112
|
date_to: z.string().optional().describe('End date (ISO 8601 or YYYY-MM-DD)'),
|
|
113
113
|
include_compressed: coerceBool.optional().describe('Include compressed observations (default: false)'),
|
|
114
|
+
limit: coerceInt.optional().describe('Max observations to export (default: 200, max: 1000)'),
|
|
114
115
|
};
|
|
115
116
|
|
|
116
117
|
export const memFtsCheckSchema = {
|
package/utils.mjs
CHANGED
|
@@ -294,8 +294,11 @@ export function extractMatchKeywords(text, files) {
|
|
|
294
294
|
// ─── Git Branch Detection ──────────────────────────────────────────────────
|
|
295
295
|
|
|
296
296
|
let _cachedBranch;
|
|
297
|
+
let _branchCacheTime = 0;
|
|
298
|
+
const BRANCH_CACHE_TTL = 60000; // 60s TTL for long-running MCP server process
|
|
297
299
|
export function getCurrentBranch() {
|
|
298
|
-
|
|
300
|
+
const now = Date.now();
|
|
301
|
+
if (_cachedBranch !== undefined && (now - _branchCacheTime) < BRANCH_CACHE_TTL) return _cachedBranch;
|
|
299
302
|
try {
|
|
300
303
|
const result = execSync('git rev-parse --abbrev-ref HEAD', {
|
|
301
304
|
encoding: 'utf8', timeout: 2000, stdio: ['pipe', 'pipe', 'pipe'],
|
|
@@ -304,8 +307,9 @@ export function getCurrentBranch() {
|
|
|
304
307
|
} catch {
|
|
305
308
|
_cachedBranch = null;
|
|
306
309
|
}
|
|
310
|
+
_branchCacheTime = now;
|
|
307
311
|
return _cachedBranch;
|
|
308
312
|
}
|
|
309
313
|
|
|
310
314
|
/** Reset cached branch (for testing or after git checkout) */
|
|
311
|
-
export function _resetBranchCache() { _cachedBranch = undefined; }
|
|
315
|
+
export function _resetBranchCache() { _cachedBranch = undefined; _branchCacheTime = 0; }
|