claude-mem-lite 2.17.0 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +1 -1
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +21 -4
- package/hook-llm.mjs +20 -5
- package/hook-memory.mjs +31 -18
- package/hook.mjs +3 -3
- package/install.mjs +1 -0
- package/mem-cli.mjs +49 -9
- package/package.json +1 -1
- package/registry-retriever.mjs +3 -10
- package/schema.mjs +79 -16
- package/scripts/user-prompt-search.js +27 -75
- package/server-internals.mjs +28 -21
- package/server.mjs +33 -21
- package/utils.mjs +7 -323
package/README.md
CHANGED
|
@@ -53,7 +53,7 @@ The original sends **everything to the LLM and hopes it filters well**. claude-m
|
|
|
53
53
|
## Features
|
|
54
54
|
|
|
55
55
|
- **Automatic capture** -- Hooks into Claude Code lifecycle (PostToolUse, SessionStart, Stop, UserPromptSubmit) to record observations without manual effort
|
|
56
|
-
- **
|
|
56
|
+
- **Hybrid search** -- FTS5 BM25 + TF-IDF vector cosine similarity, merged via Reciprocal Rank Fusion (RRF). FTS5 handles keyword matching; 512-dim TF-IDF vectors capture semantic similarity for recall beyond exact terms
|
|
57
57
|
- **Timeline browsing** -- Navigate observations chronologically with anchor-based context windows
|
|
58
58
|
- **Episode batching** -- Groups related file operations into coherent episodes before LLM encoding
|
|
59
59
|
- **Error-triggered recall** -- Automatically searches memory when Bash errors occur, surfacing relevant past fixes
|
|
@@ -67,7 +67,10 @@ The original sends **everything to the LLM and hopes it filters well**. claude-m
|
|
|
67
67
|
- **Read file tracking** -- Tracks files read during sessions for richer episode context
|
|
68
68
|
- **Zero data loss** -- If LLM fails, observations are saved with degraded (inferred) metadata instead of being discarded
|
|
69
69
|
- **Two-tier dedup** -- Jaccard similarity (5-minute window) + MinHash signatures (7-day cross-session window) prevent duplicates
|
|
70
|
-
- **Synonym expansion** -- Abbreviations like `K8s`, `DB`, `auth` automatically expand to full forms in FTS5 search (
|
|
70
|
+
- **Synonym expansion** -- Abbreviations like `K8s`, `DB`, `auth` automatically expand to full forms in FTS5 search (100+ pairs including CJK↔EN cross-language mappings)
|
|
71
|
+
- **CJK synonym extraction** -- Unsegmented Chinese text is scanned for known vocabulary words (数据库→database, 搜索→search, etc.) enabling cross-language memory recall
|
|
72
|
+
- **Stop-word filtering** -- English stop words filtered from both TF-IDF vocabulary (reclaiming ~18% of vector dimensions) and FTS queries (preventing false negatives from noise terms like "how", "the", "does")
|
|
73
|
+
- **Persisted vocabulary** -- TF-IDF vocabulary persisted to `vocab_state` table, preventing vector staleness when document frequencies shift. Vectors stay valid until explicit rebuild
|
|
71
74
|
- **Pseudo-relevance feedback (PRF)** -- Top results seed expansion queries for broader recall
|
|
72
75
|
- **Concept co-occurrence** -- Shared concepts across observations expand search to related topics
|
|
73
76
|
- **Context-aware re-ranking** -- Active file overlap boosts relevance (exact match + directory-level half-weight)
|
|
@@ -88,6 +91,8 @@ The original sends **everything to the LLM and hopes it filters well**. claude-m
|
|
|
88
91
|
- **Exponential recency decay** -- Type-differentiated half-lives (decisions: 90d, discoveries: 60d, bugfixes: 14d, changes: 7d) consistently applied in all ranking paths
|
|
89
92
|
- **Prompt-time memory injection** -- UserPromptSubmit hook automatically searches and injects relevant past observations with recency and importance weighting
|
|
90
93
|
- **Dual injection dedup** -- `user-prompt-search.js` and `handleUserPrompt` coordinate via temp file to prevent duplicate memory injection
|
|
94
|
+
- **Result-dedup cooldown** -- User-prompt memory injection uses result-overlap detection (>80% ID overlap → skip) instead of time-based cooldown, allowing topic switches within seconds while preventing redundant injections
|
|
95
|
+
- **OR query fallback** -- When AND-joined FTS5 queries return zero results, automatically relaxes to OR-joined queries for broader recall (applied in both user-prompt-search and hook-memory paths)
|
|
91
96
|
- **Configurable LLM model** -- Switch between Haiku (fast/cheap) and Sonnet (deeper analysis) via `CLAUDE_MEM_MODEL` env var
|
|
92
97
|
- **DB auto-recovery** -- Detects and cleans corrupted WAL/SHM files on startup; periodic WAL checkpoints prevent unbounded growth
|
|
93
98
|
- **Schema auto-migration** -- Idempotent `ALTER TABLE` migrations run on every startup, safely adding new columns and indexes without data loss
|
|
@@ -202,7 +207,7 @@ rm -rf ~/claude-mem-lite/ # pre-v0.5 unhidden (if not auto-moved)
|
|
|
202
207
|
| `mem_stats` | View statistics: counts, type distribution, top projects, daily activity. |
|
|
203
208
|
| `mem_delete` | Delete observations by ID with preview/confirm workflow. FTS5 cleanup is automatic. |
|
|
204
209
|
| `mem_compress` | Compress old low-value observations into weekly summaries to reduce noise. |
|
|
205
|
-
| `mem_maintain` | Memory maintenance: scan for duplicates/stale/broken items, then execute cleanup/dedup operations. |
|
|
210
|
+
| `mem_maintain` | Memory maintenance: scan for duplicates/stale/broken items, then execute cleanup/dedup/rebuild_vectors operations. |
|
|
206
211
|
| `mem_registry` | Manage resource registry: search for skills/agents by need, list resources, view stats, import/remove tools, reindex. |
|
|
207
212
|
|
|
208
213
|
### Skill Commands (in Claude Code chat)
|
|
@@ -260,6 +265,16 @@ project, type, session_id, working_on, completed, unfinished,
|
|
|
260
265
|
key_files, key_decisions, match_keywords, created_at_epoch
|
|
261
266
|
```
|
|
262
267
|
|
|
268
|
+
**observation_vectors** -- TF-IDF vector embeddings for hybrid search
|
|
269
|
+
```
|
|
270
|
+
observation_id, vector (BLOB Float32Array), vocab_version, created_at_epoch
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
**vocab_state** -- Persisted TF-IDF vocabulary for stable vector indexing
|
|
274
|
+
```
|
|
275
|
+
term, term_index, idf, version, created_at_epoch
|
|
276
|
+
```
|
|
277
|
+
|
|
263
278
|
FTS5 indexes: `observations_fts` (title, subtitle, narrative, text, facts, concepts, lesson_learned), `session_summaries_fts`, `user_prompts_fts`
|
|
264
279
|
|
|
265
280
|
## How It Works
|
|
@@ -405,7 +420,9 @@ claude-mem-lite/
|
|
|
405
420
|
hook-semaphore.mjs # LLM concurrency control: file-based semaphore for background workers
|
|
406
421
|
schema.mjs # Database schema: single source of truth for tables, migrations, FTS5
|
|
407
422
|
tool-schemas.mjs # Shared Zod schemas for MCP tool validation
|
|
408
|
-
|
|
423
|
+
tfidf.mjs # TF-IDF vector engine: tokenization, vocabulary building, vector computation, cosine similarity, RRF merge
|
|
424
|
+
tier.mjs # Temporal tier system: activity-based time window classification
|
|
425
|
+
utils.mjs # Shared utilities: FTS5 query building, BM25 weight constants, MinHash dedup, secret scrubbing, CJK synonym extraction
|
|
409
426
|
# Resource registry
|
|
410
427
|
registry.mjs # Resource registry DB: schema, CRUD, FTS5, invocation tracking
|
|
411
428
|
registry-retriever.mjs # FTS5 retrieval with synonym expansion and composite scoring
|
package/hook-llm.mjs
CHANGED
|
@@ -27,6 +27,11 @@ function buildFtsTextField(obs) {
|
|
|
27
27
|
return { conceptsText, factsText, textField: [conceptsText, factsText, aliasesText, bigramText].filter(Boolean).join(' ') };
|
|
28
28
|
}
|
|
29
29
|
|
|
30
|
+
/**
|
|
31
|
+
* Save an observation to the database with three-tier dedup.
|
|
32
|
+
* @returns {number|null} The saved observation ID, or null if deduped.
|
|
33
|
+
* Throws on DB error (callers should catch if needed).
|
|
34
|
+
*/
|
|
30
35
|
export function saveObservation(obs, projectOverride, sessionIdOverride, externalDb) {
|
|
31
36
|
const db = externalDb || openDb();
|
|
32
37
|
if (!db) return null;
|
|
@@ -41,7 +46,7 @@ export function saveObservation(obs, projectOverride, sessionIdOverride, externa
|
|
|
41
46
|
VALUES (?, ?, ?, ?, ?, 'active')
|
|
42
47
|
`).run(sessionId, sessionId, project, now.toISOString(), now.getTime());
|
|
43
48
|
|
|
44
|
-
// Three-tier dedup
|
|
49
|
+
// Three-tier dedup — returns null (not throw) for dedup hits
|
|
45
50
|
// Tier 1 (fast): 5-min Jaccard on titles
|
|
46
51
|
const fiveMinAgo = now.getTime() - DEDUP_WINDOW_MS;
|
|
47
52
|
const recent = db.prepare(`
|
|
@@ -51,7 +56,7 @@ export function saveObservation(obs, projectOverride, sessionIdOverride, externa
|
|
|
51
56
|
`).all(project, fiveMinAgo);
|
|
52
57
|
|
|
53
58
|
if (obs.title && recent.some(r => jaccardSimilarity(r.title, obs.title) > 0.7)) {
|
|
54
|
-
return null;
|
|
59
|
+
return null; // dedup: Jaccard title match
|
|
55
60
|
}
|
|
56
61
|
|
|
57
62
|
// Tier 1.5: Extended title dedup for low-signal degraded titles
|
|
@@ -68,7 +73,7 @@ export function saveObservation(obs, projectOverride, sessionIdOverride, externa
|
|
|
68
73
|
WHERE project = ? AND title = ? AND created_at_epoch > ? AND created_at_epoch <= ?
|
|
69
74
|
LIMIT 1
|
|
70
75
|
`).get(project, obs.title, sevenDaysAgo, fiveMinAgo);
|
|
71
|
-
if (exactDup) return null;
|
|
76
|
+
if (exactDup) return null; // dedup: exact title match
|
|
72
77
|
// Phase 2: Jaccard similarity for near-duplicates (3-day window)
|
|
73
78
|
const extRecent = db.prepare(`
|
|
74
79
|
SELECT title FROM observations
|
|
@@ -76,7 +81,7 @@ export function saveObservation(obs, projectOverride, sessionIdOverride, externa
|
|
|
76
81
|
ORDER BY created_at_epoch DESC LIMIT 60
|
|
77
82
|
`).all(project, threeDaysAgo, fiveMinAgo);
|
|
78
83
|
if (extRecent.some(r => jaccardSimilarity(r.title, obs.title) > 0.85)) {
|
|
79
|
-
return null;
|
|
84
|
+
return null; // dedup: low-signal Jaccard match
|
|
80
85
|
}
|
|
81
86
|
}
|
|
82
87
|
|
|
@@ -91,7 +96,7 @@ export function saveObservation(obs, projectOverride, sessionIdOverride, externa
|
|
|
91
96
|
`).all(project, sevenDaysAgo);
|
|
92
97
|
|
|
93
98
|
if (recentSigs.some(r => estimateJaccardFromMinHash(minhashSig, r.minhash_sig) > 0.8)) {
|
|
94
|
-
return null;
|
|
99
|
+
return null; // dedup: MinHash similarity match
|
|
95
100
|
}
|
|
96
101
|
}
|
|
97
102
|
|
|
@@ -117,6 +122,16 @@ export function saveObservation(obs, projectOverride, sessionIdOverride, externa
|
|
|
117
122
|
);
|
|
118
123
|
const savedId = Number(result.lastInsertRowid);
|
|
119
124
|
|
|
125
|
+
// Populate observation_files junction table (non-critical)
|
|
126
|
+
if (savedId && obs.files && obs.files.length > 0) {
|
|
127
|
+
try {
|
|
128
|
+
const insertFile = db.prepare('INSERT OR IGNORE INTO observation_files (obs_id, filename) VALUES (?, ?)');
|
|
129
|
+
for (const f of obs.files) {
|
|
130
|
+
if (typeof f === 'string' && f.length > 0) insertFile.run(savedId, f);
|
|
131
|
+
}
|
|
132
|
+
} catch (e) { debugCatch(e, 'saveObservation-obsFiles'); }
|
|
133
|
+
}
|
|
134
|
+
|
|
120
135
|
// Write TF-IDF vector (non-critical)
|
|
121
136
|
try {
|
|
122
137
|
const vocab = getVocabulary(db);
|
package/hook-memory.mjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// claude-mem-lite — Semantic Memory Injection
|
|
2
2
|
// Search past observations for relevant memories to inject as context at user-prompt time.
|
|
3
3
|
|
|
4
|
-
import { sanitizeFtsQuery, debugCatch, OBS_BM25 } from './utils.mjs';
|
|
4
|
+
import { sanitizeFtsQuery, relaxFtsQueryToOr, debugCatch, OBS_BM25 } from './utils.mjs';
|
|
5
5
|
|
|
6
6
|
const MAX_MEMORY_INJECTIONS = 3;
|
|
7
7
|
const MEMORY_LOOKBACK_MS = 60 * 86400000; // 60 days
|
|
@@ -44,13 +44,21 @@ export function searchRelevantMemories(db, userPrompt, project, excludeIds = [])
|
|
|
44
44
|
ORDER BY ${OBS_BM25}
|
|
45
45
|
LIMIT 10
|
|
46
46
|
`);
|
|
47
|
-
|
|
47
|
+
let rows = selectStmt.all(ftsQuery, project, cutoff);
|
|
48
|
+
|
|
49
|
+
// OR fallback when AND returns nothing
|
|
50
|
+
if (rows.length === 0) {
|
|
51
|
+
const orQuery = relaxFtsQueryToOr(ftsQuery);
|
|
52
|
+
if (orQuery) {
|
|
53
|
+
try { rows = selectStmt.all(orQuery, project, cutoff); } catch {}
|
|
54
|
+
}
|
|
55
|
+
}
|
|
48
56
|
|
|
49
57
|
// Phase 2: Cross-project search for high-value decisions/discoveries
|
|
50
58
|
// These are transferable insights (debugging patterns, architectural reasons, gotchas)
|
|
51
59
|
let crossRows = [];
|
|
52
60
|
try {
|
|
53
|
-
|
|
61
|
+
const crossStmt = db.prepare(`
|
|
54
62
|
SELECT o.id, o.type, o.title, o.importance, o.lesson_learned, o.project,
|
|
55
63
|
${OBS_BM25} as relevance
|
|
56
64
|
FROM observations_fts
|
|
@@ -64,7 +72,14 @@ export function searchRelevantMemories(db, userPrompt, project, excludeIds = [])
|
|
|
64
72
|
AND o.superseded_at IS NULL
|
|
65
73
|
ORDER BY ${OBS_BM25}
|
|
66
74
|
LIMIT 5
|
|
67
|
-
`)
|
|
75
|
+
`);
|
|
76
|
+
crossRows = crossStmt.all(ftsQuery, project, cutoff);
|
|
77
|
+
if (crossRows.length === 0) {
|
|
78
|
+
const orQuery = relaxFtsQueryToOr(ftsQuery);
|
|
79
|
+
if (orQuery) {
|
|
80
|
+
try { crossRows = crossStmt.all(orQuery, project, cutoff); } catch {}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
68
83
|
} catch (e) { debugCatch(e, 'crossProjectSearch'); }
|
|
69
84
|
|
|
70
85
|
// Merge and score: same-project full weight, cross-project 0.7x
|
|
@@ -117,22 +132,20 @@ export function recallForFile(db, filePath, project) {
|
|
|
117
132
|
const cutoff = Date.now() - FILE_RECALL_LOOKBACK_MS;
|
|
118
133
|
// Escape SQL LIKE wildcards in filename to prevent injection
|
|
119
134
|
const escaped = basename.replace(/%/g, '\\%').replace(/_/g, '\\_');
|
|
120
|
-
|
|
121
|
-
// Two patterns avoid false positives: %/file.mjs"% won't match /webapp.mjs
|
|
122
|
-
const pathPattern = `%/${escaped}"%`;
|
|
123
|
-
const namePattern = `%"${escaped}"%`;
|
|
135
|
+
const likePattern = `%${escaped}`;
|
|
124
136
|
const rows = db.prepare(`
|
|
125
|
-
SELECT id, type, title, importance, lesson_learned
|
|
126
|
-
FROM observations
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
AND
|
|
130
|
-
AND
|
|
131
|
-
AND
|
|
132
|
-
AND
|
|
133
|
-
|
|
137
|
+
SELECT DISTINCT o.id, o.type, o.title, o.importance, o.lesson_learned
|
|
138
|
+
FROM observations o
|
|
139
|
+
JOIN observation_files of2 ON of2.obs_id = o.id
|
|
140
|
+
WHERE o.project = ?
|
|
141
|
+
AND o.importance >= 2
|
|
142
|
+
AND COALESCE(o.compressed_into, 0) = 0
|
|
143
|
+
AND o.superseded_at IS NULL
|
|
144
|
+
AND o.created_at_epoch > ?
|
|
145
|
+
AND (of2.filename = ? OR of2.filename LIKE ? ESCAPE '\\')
|
|
146
|
+
ORDER BY o.created_at_epoch DESC
|
|
134
147
|
LIMIT ?
|
|
135
|
-
`).all(project, cutoff,
|
|
148
|
+
`).all(project, cutoff, filePath, likePattern, MAX_FILE_RECALL);
|
|
136
149
|
const now = Date.now();
|
|
137
150
|
const updateStmt = db.prepare('UPDATE observations SET access_count = COALESCE(access_count, 0) + 1, last_accessed_at = ? WHERE id = ?');
|
|
138
151
|
for (const r of rows) updateStmt.run(now, r.id);
|
package/hook.mjs
CHANGED
|
@@ -32,7 +32,7 @@ import { searchRelevantMemories, recallForFile } from './hook-memory.mjs';
|
|
|
32
32
|
import { buildAndSaveHandoff, detectContinuationIntent, renderHandoffInjection, extractUnfinishedSummary } from './hook-handoff.mjs';
|
|
33
33
|
import { checkForUpdate } from './hook-update.mjs';
|
|
34
34
|
import { SKIP_TOOLS, SKIP_PREFIXES } from './skip-tools.mjs';
|
|
35
|
-
import {
|
|
35
|
+
import { getVocabulary } from './tfidf.mjs';
|
|
36
36
|
|
|
37
37
|
// Prevent recursive hooks from background claude -p calls
|
|
38
38
|
// Background workers (llm-episode, llm-summary) are exempt — they're ours
|
|
@@ -719,8 +719,8 @@ async function handleSessionStart() {
|
|
|
719
719
|
// CLAUDE.md: slim (summary + handoff state — observations already in stdout)
|
|
720
720
|
updateClaudeMd([...summaryLines, ...handoffLines].join('\n'));
|
|
721
721
|
|
|
722
|
-
// Pre-
|
|
723
|
-
try {
|
|
722
|
+
// Pre-load TF-IDF vocabulary cache for this session (from DB, ~1ms)
|
|
723
|
+
try { getVocabulary(db); } catch (e) { debugCatch(e, 'session-start-vocab'); }
|
|
724
724
|
|
|
725
725
|
// Auto-update check (24h throttle, 3s timeout, silent on failure)
|
|
726
726
|
// Fire-and-forget: don't block SessionStart for up to 3s network timeout
|
package/install.mjs
CHANGED
|
@@ -206,6 +206,7 @@ async function install() {
|
|
|
206
206
|
'registry.mjs', 'registry-scanner.mjs', 'registry-indexer.mjs',
|
|
207
207
|
'registry-retriever.mjs', 'resource-discovery.mjs',
|
|
208
208
|
'install-metadata.mjs', 'mem-cli.mjs', 'tier.mjs', 'tfidf.mjs',
|
|
209
|
+
'nlp.mjs', 'scoring-sql.mjs', 'stop-words.mjs',
|
|
209
210
|
];
|
|
210
211
|
|
|
211
212
|
if (IS_DEV) {
|
package/mem-cli.mjs
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
import { ensureDb, DB_PATH } from './schema.mjs';
|
|
6
6
|
import { sanitizeFtsQuery, relaxFtsQueryToOr, truncate, typeIcon, inferProject, jaccardSimilarity, computeMinHash, scrubSecrets, cjkBigrams, OBS_BM25, TYPE_DECAY_CASE, getCurrentBranch } from './utils.mjs';
|
|
7
7
|
import { TIER_CASE_SQL, tierSqlParams } from './tier.mjs';
|
|
8
|
-
import { getVocabulary, computeVector } from './tfidf.mjs';
|
|
8
|
+
import { getVocabulary, computeVector, vectorSearch, rrfMerge, VECTOR_SCAN_LIMIT } from './tfidf.mjs';
|
|
9
9
|
import { basename, join } from 'path';
|
|
10
10
|
import { readFileSync } from 'fs';
|
|
11
11
|
|
|
@@ -147,7 +147,7 @@ function searchFts(db, ftsQuery, { type, project, limit, dateFrom, dateTo, minIm
|
|
|
147
147
|
const params = [...whereParams, ...orderParams, limit];
|
|
148
148
|
|
|
149
149
|
// Scoring aligned with server.mjs: BM25 × type-decay × project_boost × importance × access_bonus
|
|
150
|
-
|
|
150
|
+
const ftsRows = db.prepare(`
|
|
151
151
|
SELECT o.id, o.type, o.title, o.subtitle, o.created_at, o.lesson_learned
|
|
152
152
|
FROM observations_fts
|
|
153
153
|
JOIN observations o ON observations_fts.rowid = o.id
|
|
@@ -159,6 +159,43 @@ function searchFts(db, ftsQuery, { type, project, limit, dateFrom, dateTo, minIm
|
|
|
159
159
|
* (1.0 + 0.1 * LN(1 + COALESCE(o.access_count, 0)))
|
|
160
160
|
LIMIT ?
|
|
161
161
|
`).all(...params);
|
|
162
|
+
|
|
163
|
+
// Hybrid: vector search + RRF merge (best-effort)
|
|
164
|
+
try {
|
|
165
|
+
const vocab = getVocabulary(db);
|
|
166
|
+
if (vocab) {
|
|
167
|
+
const queryText = ftsQuery.replace(/['"()]/g, ' ');
|
|
168
|
+
const queryVec = computeVector(queryText, vocab);
|
|
169
|
+
if (queryVec) {
|
|
170
|
+
const vecResults = vectorSearch(db, queryVec, {
|
|
171
|
+
project: project || null,
|
|
172
|
+
vocabVersion: vocab.version,
|
|
173
|
+
limit: VECTOR_SCAN_LIMIT,
|
|
174
|
+
});
|
|
175
|
+
if (vecResults.length > 0 && ftsRows.length > 0) {
|
|
176
|
+
const rrfRanking = rrfMerge(ftsRows, vecResults);
|
|
177
|
+
const rowMap = new Map(ftsRows.map(r => [r.id, r]));
|
|
178
|
+
for (const vr of vecResults) {
|
|
179
|
+
if (!rowMap.has(vr.id)) {
|
|
180
|
+
const obs = db.prepare('SELECT id, type, title, subtitle, created_at, lesson_learned FROM observations WHERE id = ?').get(vr.id);
|
|
181
|
+
if (obs) rowMap.set(vr.id, obs);
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
return rrfRanking
|
|
185
|
+
.filter(rr => rowMap.has(rr.id))
|
|
186
|
+
.map(rr => rowMap.get(rr.id))
|
|
187
|
+
.slice(0, limit);
|
|
188
|
+
} else if (vecResults.length > 0 && ftsRows.length === 0) {
|
|
189
|
+
return vecResults
|
|
190
|
+
.map(vr => db.prepare('SELECT id, type, title, subtitle, created_at, lesson_learned FROM observations WHERE id = ?').get(vr.id))
|
|
191
|
+
.filter(Boolean)
|
|
192
|
+
.slice(0, limit);
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
} catch { /* vector search is best-effort */ }
|
|
197
|
+
|
|
198
|
+
return ftsRows;
|
|
162
199
|
}
|
|
163
200
|
|
|
164
201
|
function cmdRecent(db, args) {
|
|
@@ -203,15 +240,18 @@ function cmdRecall(db, args) {
|
|
|
203
240
|
const filename = basename(file);
|
|
204
241
|
const limit = parseInt(flags.limit, 10) || 10;
|
|
205
242
|
|
|
206
|
-
// Search
|
|
243
|
+
// Search via observation_files junction table for indexed filename lookups
|
|
244
|
+
const escaped = filename.replace(/%/g, '\\%').replace(/_/g, '\\_');
|
|
245
|
+
const likePattern = `%${escaped}`;
|
|
207
246
|
const rows = db.prepare(`
|
|
208
|
-
SELECT id, type, title, lesson_learned, created_at
|
|
209
|
-
FROM observations
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
247
|
+
SELECT DISTINCT o.id, o.type, o.title, o.lesson_learned, o.created_at
|
|
248
|
+
FROM observations o
|
|
249
|
+
JOIN observation_files of2 ON of2.obs_id = o.id
|
|
250
|
+
WHERE COALESCE(o.compressed_into, 0) = 0
|
|
251
|
+
AND (of2.filename = ? OR of2.filename LIKE ? ESCAPE '\\')
|
|
252
|
+
ORDER BY o.created_at_epoch DESC
|
|
213
253
|
LIMIT ?
|
|
214
|
-
`).all(
|
|
254
|
+
`).all(filename, likePattern, limit);
|
|
215
255
|
|
|
216
256
|
if (rows.length === 0) {
|
|
217
257
|
out(`[mem] No history for "${filename}"`);
|
package/package.json
CHANGED
package/registry-retriever.mjs
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
// Tier 2 of the 3-tier dispatch intelligence architecture
|
|
3
3
|
|
|
4
4
|
import { debugCatch } from './utils.mjs';
|
|
5
|
+
import { BASE_STOP_WORDS } from './stop-words.mjs';
|
|
5
6
|
|
|
6
7
|
// ─── Domain Synonyms ─────────────────────────────────────────────────────────
|
|
7
8
|
|
|
@@ -227,16 +228,8 @@ export function buildEnhancedQuery(signals) {
|
|
|
227
228
|
* @returns {string|null} FTS5 query string or null
|
|
228
229
|
*/
|
|
229
230
|
const TEXT_QUERY_STOP_WORDS = new Set([
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
'should', 'may', 'might', 'can', 'shall', 'to', 'of', 'in', 'for',
|
|
233
|
-
'on', 'with', 'at', 'by', 'from', 'as', 'into', 'about', 'between',
|
|
234
|
-
'after', 'before', 'above', 'below', 'and', 'or', 'but', 'not', 'no',
|
|
235
|
-
'this', 'that', 'these', 'those', 'it', 'its', 'my', 'your', 'his',
|
|
236
|
-
'her', 'our', 'their', 'me', 'him', 'us', 'them', 'i', 'you', 'he',
|
|
237
|
-
'she', 'we', 'they', 'what', 'which', 'who', 'when', 'where', 'how',
|
|
238
|
-
'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some',
|
|
239
|
-
'such', 'than', 'too', 'very', 'just', 'also', 'then', 'so', 'if',
|
|
231
|
+
...BASE_STOP_WORDS,
|
|
232
|
+
// CJK stop words (particles, pronouns, common verbs)
|
|
240
233
|
'的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都',
|
|
241
234
|
'一', '一个', '上', '也', '这', '那', '你', '他', '她', '它', '们',
|
|
242
235
|
'把', '让', '给', '用', '来', '去', '做', '说', '要', '会', '能',
|
package/schema.mjs
CHANGED
|
@@ -12,6 +12,9 @@ export const DB_DIR = process.env.CLAUDE_MEM_DIR || join(homedir(), '.claude-mem
|
|
|
12
12
|
export const DB_PATH = join(DB_DIR, 'claude-mem-lite.db');
|
|
13
13
|
export const REGISTRY_DB_PATH = join(DB_DIR, 'resource-registry.db');
|
|
14
14
|
|
|
15
|
+
// Increment when schema changes (tables, columns, indexes, FTS, migrations)
|
|
16
|
+
export const CURRENT_SCHEMA_VERSION = 18;
|
|
17
|
+
|
|
15
18
|
const CORE_SCHEMA = `
|
|
16
19
|
CREATE TABLE IF NOT EXISTS sdk_sessions (
|
|
17
20
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -116,6 +119,12 @@ const MIGRATIONS = [
|
|
|
116
119
|
* The DB should have foreign_keys OFF before calling (enabled after dedup migration).
|
|
117
120
|
*/
|
|
118
121
|
export function initSchema(db) {
|
|
122
|
+
// Fast path: skip all migrations if schema is already at current version
|
|
123
|
+
try {
|
|
124
|
+
const row = db.prepare('SELECT version FROM schema_version LIMIT 1').get();
|
|
125
|
+
if (row && row.version === CURRENT_SCHEMA_VERSION) return db;
|
|
126
|
+
} catch { /* table may not exist yet */ }
|
|
127
|
+
|
|
119
128
|
// Create core tables
|
|
120
129
|
db.exec(CORE_SCHEMA);
|
|
121
130
|
|
|
@@ -136,23 +145,21 @@ export function initSchema(db) {
|
|
|
136
145
|
GROUP BY memory_session_id HAVING cnt > 1
|
|
137
146
|
`).all();
|
|
138
147
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
}
|
|
148
|
+
// Atomic: dedup + create unique index in one transaction
|
|
149
|
+
const dedupAndIndex = db.transaction(() => {
|
|
150
|
+
for (const { memory_session_id } of dupes) {
|
|
151
|
+
const rows = db.prepare(`
|
|
152
|
+
SELECT s.id FROM sdk_sessions s
|
|
153
|
+
WHERE s.memory_session_id = ?
|
|
154
|
+
ORDER BY s.id ASC
|
|
155
|
+
`).all(memory_session_id);
|
|
156
|
+
for (let i = 1; i < rows.length; i++) {
|
|
157
|
+
db.prepare('DELETE FROM sdk_sessions WHERE id = ?').run(rows[i].id);
|
|
150
158
|
}
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_sess_memory_sid ON sdk_sessions(memory_session_id)`);
|
|
159
|
+
}
|
|
160
|
+
db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_sess_memory_sid ON sdk_sessions(memory_session_id)`);
|
|
161
|
+
});
|
|
162
|
+
dedupAndIndex();
|
|
156
163
|
}
|
|
157
164
|
db.pragma('foreign_keys = ON');
|
|
158
165
|
|
|
@@ -190,6 +197,45 @@ export function initSchema(db) {
|
|
|
190
197
|
}
|
|
191
198
|
} catch { /* non-critical */ }
|
|
192
199
|
|
|
200
|
+
// Observation files junction table for normalized file lookups (replaces LIKE scans on files_modified JSON)
|
|
201
|
+
db.exec(`
|
|
202
|
+
CREATE TABLE IF NOT EXISTS observation_files (
|
|
203
|
+
obs_id INTEGER NOT NULL REFERENCES observations(id) ON DELETE CASCADE,
|
|
204
|
+
filename TEXT NOT NULL,
|
|
205
|
+
UNIQUE(obs_id, filename)
|
|
206
|
+
)
|
|
207
|
+
`);
|
|
208
|
+
db.exec(`CREATE INDEX IF NOT EXISTS idx_obsfiles_filename ON observation_files(filename)`);
|
|
209
|
+
|
|
210
|
+
// Data migration: populate observation_files from existing observations.files_modified JSON
|
|
211
|
+
// Only runs once: when observation_files is empty but observations has rows with files_modified
|
|
212
|
+
try {
|
|
213
|
+
const obsFilesCount = db.prepare('SELECT COUNT(*) as c FROM observation_files').get().c;
|
|
214
|
+
if (obsFilesCount === 0) {
|
|
215
|
+
const obsWithFiles = db.prepare(
|
|
216
|
+
`SELECT id, files_modified FROM observations WHERE files_modified IS NOT NULL AND files_modified != '[]'`
|
|
217
|
+
).all();
|
|
218
|
+
if (obsWithFiles.length > 0) {
|
|
219
|
+
const migrateFiles = db.transaction(() => {
|
|
220
|
+
const insertFile = db.prepare('INSERT OR IGNORE INTO observation_files (obs_id, filename) VALUES (?, ?)');
|
|
221
|
+
for (const row of obsWithFiles) {
|
|
222
|
+
try {
|
|
223
|
+
const files = JSON.parse(row.files_modified);
|
|
224
|
+
if (Array.isArray(files)) {
|
|
225
|
+
for (const f of files) {
|
|
226
|
+
if (typeof f === 'string' && f.length > 0) {
|
|
227
|
+
insertFile.run(row.id, f);
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
} catch { /* skip malformed JSON */ }
|
|
232
|
+
}
|
|
233
|
+
});
|
|
234
|
+
migrateFiles();
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
} catch { /* non-critical — migration can retry on next open */ }
|
|
238
|
+
|
|
193
239
|
// Observation vectors table for TF-IDF vector search
|
|
194
240
|
db.exec(`
|
|
195
241
|
CREATE TABLE IF NOT EXISTS observation_vectors (
|
|
@@ -201,6 +247,18 @@ export function initSchema(db) {
|
|
|
201
247
|
)
|
|
202
248
|
`);
|
|
203
249
|
|
|
250
|
+
// Persisted vocabulary for stable TF-IDF vector indexing
|
|
251
|
+
db.exec(`
|
|
252
|
+
CREATE TABLE IF NOT EXISTS vocab_state (
|
|
253
|
+
term TEXT NOT NULL,
|
|
254
|
+
term_index INTEGER NOT NULL,
|
|
255
|
+
idf REAL NOT NULL,
|
|
256
|
+
version TEXT NOT NULL,
|
|
257
|
+
created_at_epoch INTEGER NOT NULL
|
|
258
|
+
)
|
|
259
|
+
`);
|
|
260
|
+
db.exec('CREATE INDEX IF NOT EXISTS idx_vocab_state_version ON vocab_state(version)');
|
|
261
|
+
|
|
204
262
|
// Project name normalization: migrate short names ("mem") to canonical form ("projects--mem")
|
|
205
263
|
// Strategy: exact suffix match first, then substring match for package-name aliases
|
|
206
264
|
// Idempotent: only runs when short-name records exist
|
|
@@ -242,6 +300,11 @@ export function initSchema(db) {
|
|
|
242
300
|
}
|
|
243
301
|
} catch { /* non-critical — normalization can retry on next open */ }
|
|
244
302
|
|
|
303
|
+
// Record schema version for fast-path on subsequent calls
|
|
304
|
+
db.exec('CREATE TABLE IF NOT EXISTS schema_version (version INTEGER NOT NULL)');
|
|
305
|
+
db.exec('DELETE FROM schema_version');
|
|
306
|
+
db.prepare('INSERT INTO schema_version (version) VALUES (?)').run(CURRENT_SCHEMA_VERSION);
|
|
307
|
+
|
|
245
308
|
return db;
|
|
246
309
|
}
|
|
247
310
|
|
|
@@ -5,70 +5,15 @@
|
|
|
5
5
|
|
|
6
6
|
import { ensureDb } from '../schema.mjs';
|
|
7
7
|
import { sanitizeFtsQuery, relaxFtsQueryToOr, truncate, typeIcon, inferProject, OBS_BM25, TYPE_DECAY_CASE } from '../utils.mjs';
|
|
8
|
-
import {
|
|
8
|
+
import { writeFileSync, readFileSync } from 'fs';
|
|
9
|
+
import { shouldSkip, detectIntent, shouldSkipByDedup, extractFiles, DEDUP_STALE_MS } from './prompt-search-utils.mjs';
|
|
9
10
|
|
|
10
11
|
// ─── Constants ──────────────────────────────────────────────────────────────
|
|
11
12
|
|
|
12
|
-
const COOLDOWN_FILE = `/tmp/.claude-mem-prompt-ctx-${inferProject()}`;
|
|
13
13
|
const INJECTED_IDS_FILE = `/tmp/.claude-mem-injected-${inferProject()}`;
|
|
14
|
-
const COOLDOWN_MS = 60_000;
|
|
15
14
|
const MAX_RESULTS = 5;
|
|
16
15
|
const LOOKBACK_MS = 60 * 86400000; // 60 days
|
|
17
16
|
|
|
18
|
-
// ─── Skip Patterns ──────────────────────────────────────────────────────────
|
|
19
|
-
|
|
20
|
-
const CONFIRM_RE = /^(y(es)?|no?|ok|done|go|sure|lgtm|thanks?|ty|继续|确认|好的|是的|对|嗯|行|可以|没问题)$/i;
|
|
21
|
-
const SLASH_CMD_RE = /^\//;
|
|
22
|
-
const PURE_OP_RE = /^(git\s+(commit|push|merge)|npm\s+(publish|deploy))\b/i;
|
|
23
|
-
|
|
24
|
-
function shouldSkip(text) {
|
|
25
|
-
if (!text || text.length < 8) return true;
|
|
26
|
-
const trimmed = text.trim();
|
|
27
|
-
if (CONFIRM_RE.test(trimmed)) return true;
|
|
28
|
-
if (SLASH_CMD_RE.test(trimmed)) return true;
|
|
29
|
-
if (PURE_OP_RE.test(trimmed)) return true;
|
|
30
|
-
return false;
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
// ─── Cooldown ───────────────────────────────────────────────────────────────
|
|
34
|
-
|
|
35
|
-
function checkCooldown() {
|
|
36
|
-
try {
|
|
37
|
-
const stat = statSync(COOLDOWN_FILE);
|
|
38
|
-
return (Date.now() - stat.mtimeMs) < COOLDOWN_MS;
|
|
39
|
-
} catch { return false; }
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
function touchCooldown() {
|
|
43
|
-
try { writeFileSync(COOLDOWN_FILE, String(Date.now())); } catch {}
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
// ─── Intent Detection ───────────────────────────────────────────────────────
|
|
47
|
-
|
|
48
|
-
const INTENTS = [
|
|
49
|
-
// Error/debug intent
|
|
50
|
-
{ pattern: /error|bug|crash|broken|fail|fix|报错|出错|错误|崩溃|修复/i, type: 'bugfix', limit: 3 },
|
|
51
|
-
// Decision/architecture intent (before recall — "为什么...之前" is a decision question, not recall)
|
|
52
|
-
{ pattern: /why|decided|architecture|design|为什么|决定|架构|设计/i, type: 'decision', limit: 3 },
|
|
53
|
-
// Recall/history intent (catch-all temporal, lowest priority)
|
|
54
|
-
{ pattern: /before|previously|last time|remember|之前|上次|以前|记得/i, type: null, limit: 5, useRecent: true },
|
|
55
|
-
];
|
|
56
|
-
|
|
57
|
-
function detectIntent(text) {
|
|
58
|
-
for (const intent of INTENTS) {
|
|
59
|
-
if (intent.pattern.test(text)) return intent;
|
|
60
|
-
}
|
|
61
|
-
return null;
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
// ─── File Path Detection ─────────────────────────────────────────────────────
|
|
65
|
-
|
|
66
|
-
// Detect file paths in text
|
|
67
|
-
/**
 * Extract file-path-like tokens (anything ending in `.ext`, optionally with
 * directory components) from free text.
 *
 * URLs are removed before matching. Filtering matches that start with "http"
 * afterwards does not work: the path pattern cannot contain ':', so a URL
 * would surface as "//host/path.ext", while genuine files such as
 * "httpClient.js" would be dropped.
 *
 * @param {string} text Free-form prompt text
 * @returns {string[]} Path-like tokens in order of appearance (may repeat)
 */
function extractFiles(text) {
  if (typeof text !== 'string' || text === '') return [];
  const withoutUrls = text.replace(/\bhttps?:\/\/\S+/gi, ' ');
  return withoutUrls.match(/[\w./-]+\.\w{1,10}/g) ?? [];
}
|
|
71
|
-
|
|
72
17
|
// ─── DB Query Functions ─────────────────────────────────────────────────────
|
|
73
18
|
|
|
74
19
|
function searchByFts(db, queryText, project, limit, typeFilter) {
|
|
@@ -124,20 +69,20 @@ function searchByFile(db, files, project, limit) {
|
|
|
124
69
|
const basename = file.split('/').pop();
|
|
125
70
|
if (!basename || basename.length < 2) continue;
|
|
126
71
|
const escaped = basename.replace(/%/g, '\\%').replace(/_/g, '\\_');
|
|
127
|
-
const
|
|
128
|
-
const namePattern = `%"${escaped}"%`;
|
|
72
|
+
const likePattern = `%${escaped}`;
|
|
129
73
|
|
|
130
74
|
const rows = db.prepare(`
|
|
131
|
-
SELECT id, type, title, lesson_learned
|
|
132
|
-
FROM observations
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
AND
|
|
136
|
-
AND
|
|
137
|
-
AND
|
|
138
|
-
|
|
75
|
+
SELECT DISTINCT o.id, o.type, o.title, o.lesson_learned
|
|
76
|
+
FROM observations o
|
|
77
|
+
JOIN observation_files of2 ON of2.obs_id = o.id
|
|
78
|
+
WHERE o.project = ?
|
|
79
|
+
AND o.importance >= 1
|
|
80
|
+
AND COALESCE(o.compressed_into, 0) = 0
|
|
81
|
+
AND o.created_at_epoch > ?
|
|
82
|
+
AND (of2.filename = ? OR of2.filename LIKE ? ESCAPE '\\')
|
|
83
|
+
ORDER BY o.created_at_epoch DESC
|
|
139
84
|
LIMIT ?
|
|
140
|
-
`).all(project, cutoff,
|
|
85
|
+
`).all(project, cutoff, file, likePattern, limit);
|
|
141
86
|
|
|
142
87
|
results.push(...rows);
|
|
143
88
|
}
|
|
@@ -226,9 +171,6 @@ async function main() {
|
|
|
226
171
|
// Skip short/confirmation/slash-command/simple-op prompts
|
|
227
172
|
if (shouldSkip(promptText)) return;
|
|
228
173
|
|
|
229
|
-
// Cooldown check — avoid flooding context on rapid prompts
|
|
230
|
-
if (checkCooldown()) return;
|
|
231
|
-
|
|
232
174
|
let db;
|
|
233
175
|
try {
|
|
234
176
|
db = ensureDb();
|
|
@@ -264,14 +206,24 @@ async function main() {
|
|
|
264
206
|
rows = rows.slice(0, MAX_RESULTS);
|
|
265
207
|
}
|
|
266
208
|
|
|
209
|
+
const candidateIds = rows.map(r => r.id);
|
|
210
|
+
if (shouldSkipByDedup(candidateIds, INJECTED_IDS_FILE)) return;
|
|
211
|
+
|
|
267
212
|
const output = formatResults(rows);
|
|
268
213
|
if (output) {
|
|
269
214
|
process.stdout.write(output + '\n');
|
|
270
|
-
|
|
271
|
-
// Write injected IDs for dedup with hook.mjs handleUserPrompt
|
|
215
|
+
// Write injected IDs for dedup with hook.mjs handleUserPrompt + self-dedup
|
|
272
216
|
try {
|
|
273
|
-
|
|
274
|
-
|
|
217
|
+
let prevCount = 0;
|
|
218
|
+
try {
|
|
219
|
+
const prev = JSON.parse(readFileSync(INJECTED_IDS_FILE, 'utf8'));
|
|
220
|
+
if (prev.ts && Date.now() - prev.ts < DEDUP_STALE_MS) prevCount = prev.count || 0;
|
|
221
|
+
} catch {}
|
|
222
|
+
writeFileSync(INJECTED_IDS_FILE, JSON.stringify({
|
|
223
|
+
ids: candidateIds,
|
|
224
|
+
ts: Date.now(),
|
|
225
|
+
count: prevCount + 1,
|
|
226
|
+
}));
|
|
275
227
|
} catch {}
|
|
276
228
|
}
|
|
277
229
|
} catch {
|
package/server-internals.mjs
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
// Extracted from server.mjs for testability (server.mjs has top-level side effects)
|
|
3
3
|
|
|
4
4
|
import { debugCatch, COMPRESSED_AUTO, COMPRESSED_PENDING_PURGE, OBS_BM25 } from './utils.mjs';
|
|
5
|
+
import { BASE_STOP_WORDS } from './stop-words.mjs';
|
|
5
6
|
|
|
6
7
|
// ─── Search Re-ranking Helpers ────────────────────────────────────────────
|
|
7
8
|
|
|
@@ -14,21 +15,15 @@ import { debugCatch, COMPRESSED_AUTO, COMPRESSED_PENDING_PURGE, OBS_BM25 } from
|
|
|
14
15
|
*/
|
|
15
16
|
export function reRankWithContext(db, results, project) {
|
|
16
17
|
if (!results || results.length === 0) return;
|
|
17
|
-
// Get recently active files (last 2 hours, same project)
|
|
18
|
+
// Get recently active files (last 2 hours, same project) via observation_files junction table
|
|
18
19
|
const twoHoursAgo = Date.now() - 2 * 3600000;
|
|
19
|
-
const
|
|
20
|
-
SELECT
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
const recentFiles = db.prepare(`
|
|
21
|
+
SELECT DISTINCT of2.filename FROM observation_files of2
|
|
22
|
+
JOIN observations o ON o.id = of2.obs_id
|
|
23
|
+
WHERE o.project = ? AND o.created_at_epoch > ?
|
|
23
24
|
`).all(project, twoHoursAgo);
|
|
24
25
|
|
|
25
|
-
const activeFiles = new Set();
|
|
26
|
-
for (const r of recentObs) {
|
|
27
|
-
try {
|
|
28
|
-
const files = JSON.parse(r.files_modified || '[]');
|
|
29
|
-
for (const f of files) activeFiles.add(f);
|
|
30
|
-
} catch (e) { debugCatch(e, 'reRankWithContext-parse'); }
|
|
31
|
-
}
|
|
26
|
+
const activeFiles = new Set(recentFiles.map(r => r.filename));
|
|
32
27
|
if (activeFiles.size === 0) return;
|
|
33
28
|
|
|
34
29
|
// Pre-compute active directories for directory-level matching
|
|
@@ -38,11 +33,25 @@ export function reRankWithContext(db, results, project) {
|
|
|
38
33
|
if (lastSlash > 0) activeDirs.add(f.substring(0, lastSlash));
|
|
39
34
|
}
|
|
40
35
|
|
|
41
|
-
for
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
36
|
+
// Batch-fetch observation_files for all obs result IDs
|
|
37
|
+
const obsResults = results.filter(r => r.source === 'obs' && r.id);
|
|
38
|
+
if (obsResults.length === 0) return;
|
|
39
|
+
const obsIds = obsResults.map(r => r.id);
|
|
40
|
+
const placeholders = obsIds.map(() => '?').join(',');
|
|
41
|
+
const fileRows = db.prepare(
|
|
42
|
+
`SELECT obs_id, filename FROM observation_files WHERE obs_id IN (${placeholders})`
|
|
43
|
+
).all(...obsIds);
|
|
44
|
+
|
|
45
|
+
// Build map: obs_id → [filenames]
|
|
46
|
+
const obsFileMap = new Map();
|
|
47
|
+
for (const row of fileRows) {
|
|
48
|
+
if (!obsFileMap.has(row.obs_id)) obsFileMap.set(row.obs_id, []);
|
|
49
|
+
obsFileMap.get(row.obs_id).push(row.filename);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
for (const result of obsResults) {
|
|
53
|
+
const resultFiles = obsFileMap.get(result.id);
|
|
54
|
+
if (!resultFiles || resultFiles.length === 0) continue;
|
|
46
55
|
const exactMatches = resultFiles.filter(f => activeFiles.has(f)).length;
|
|
47
56
|
// Directory-level: same parent dir but different file (half weight)
|
|
48
57
|
const dirMatches = resultFiles.filter(f => {
|
|
@@ -104,10 +113,8 @@ export function markSuperseded(db, results) {
|
|
|
104
113
|
|
|
105
114
|
/** @type {Set<string>} Common words excluded from PRF term extraction */
|
|
106
115
|
export const PRF_STOP_WORDS = new Set([
|
|
107
|
-
|
|
108
|
-
'
|
|
109
|
-
'which', 'their', 'will', 'would', 'could', 'should', 'also', 'than',
|
|
110
|
-
'then', 'its', 'use', 'used', 'using', 'some', 'new', 'added', 'updated',
|
|
116
|
+
...BASE_STOP_WORDS,
|
|
117
|
+
'use', 'used', 'using', 'new', 'added', 'updated',
|
|
111
118
|
'file', 'files', 'code', 'change', 'changed', 'changes',
|
|
112
119
|
]);
|
|
113
120
|
|
package/server.mjs
CHANGED
|
@@ -4,14 +4,14 @@
|
|
|
4
4
|
|
|
5
5
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
6
6
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
7
|
-
import { jaccardSimilarity, truncate, typeIcon, sanitizeFtsQuery, relaxFtsQueryToOr, inferProject, computeMinHash, estimateJaccardFromMinHash, scrubSecrets, cjkBigrams, fmtDate, isoWeekKey, debugLog, debugCatch, COMPRESSED_PENDING_PURGE, OBS_BM25, SESS_BM25, TYPE_DECAY_CASE, getCurrentBranch } from './utils.mjs';
|
|
7
|
+
import { jaccardSimilarity, truncate, typeIcon, sanitizeFtsQuery, relaxFtsQueryToOr, inferProject, computeMinHash, estimateJaccardFromMinHash, scrubSecrets, cjkBigrams, fmtDate, isoWeekKey, debugLog, debugCatch, COMPRESSED_PENDING_PURGE, OBS_BM25, SESS_BM25, TYPE_DECAY_CASE, getCurrentBranch, DEFAULT_DECAY_HALF_LIFE_MS } from './utils.mjs';
|
|
8
8
|
import { ensureDb, DB_PATH, REGISTRY_DB_PATH } from './schema.mjs';
|
|
9
9
|
import { reRankWithContext, markSuperseded, extractPRFTerms, expandQueryByConcepts, autoBoostIfNeeded, runIdleCleanup } from './server-internals.mjs';
|
|
10
10
|
import { computeTier, TIER_CASE_SQL, tierSqlParams } from './tier.mjs';
|
|
11
11
|
import { memSearchSchema, memTimelineSchema, memGetSchema, memDeleteSchema, memSaveSchema, memStatsSchema, memCompressSchema, memMaintainSchema, memRegistrySchema } from './tool-schemas.mjs';
|
|
12
12
|
import { ensureRegistryDb, upsertResource } from './registry.mjs';
|
|
13
13
|
import { searchResources } from './registry-retriever.mjs';
|
|
14
|
-
import { getVocabulary,
|
|
14
|
+
import { getVocabulary, rebuildVocabulary, _resetVocabCache, computeVector, vectorSearch, rrfMerge } from './tfidf.mjs';
|
|
15
15
|
import { createRequire } from 'module';
|
|
16
16
|
|
|
17
17
|
const require = createRequire(import.meta.url);
|
|
@@ -102,7 +102,7 @@ function resolveProject(name) {
|
|
|
102
102
|
// Access bonus: 1 + 0.1 × ln(1 + access_count)
|
|
103
103
|
|
|
104
104
|
// OBS_BM25, SESS_BM25, TYPE_DECAY_CASE imported from utils.mjs
|
|
105
|
-
const RECENCY_HALF_LIFE_MS =
|
|
105
|
+
const RECENCY_HALF_LIFE_MS = DEFAULT_DECAY_HALF_LIFE_MS;
|
|
106
106
|
|
|
107
107
|
// ─── MCP Server ─────────────────────────────────────────────────────────────
|
|
108
108
|
|
|
@@ -883,22 +883,28 @@ server.registerTool(
|
|
|
883
883
|
const bigramText = cjkBigrams(safeTitle + ' ' + safeContent);
|
|
884
884
|
const textField = bigramText ? safeContent + ' ' + bigramText : safeContent;
|
|
885
885
|
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
const
|
|
896
|
-
if (
|
|
897
|
-
|
|
898
|
-
|
|
886
|
+
    // Insert observation + TF-IDF vector in one transaction (vector write is best-effort: errors go to debugCatch and the observation is still saved)
|
|
887
|
+
const saveTx = db.transaction(() => {
|
|
888
|
+
const result = db.prepare(`
|
|
889
|
+
INSERT INTO observations (memory_session_id, project, text, type, title, narrative, concepts, facts, files_read, files_modified, importance, minhash_sig, branch, created_at, created_at_epoch)
|
|
890
|
+
VALUES (?, ?, ?, ?, ?, ?, '', '', '[]', '[]', ?, ?, ?, ?, ?)
|
|
891
|
+
`).run(sessionId, project, textField, type, safeTitle, safeContent, args.importance ?? 1, minhashSig, getCurrentBranch(), now.toISOString(), now.getTime());
|
|
892
|
+
|
|
893
|
+
// Write TF-IDF vector
|
|
894
|
+
try {
|
|
895
|
+
const vocab = getVocabulary(db);
|
|
896
|
+
if (vocab) {
|
|
897
|
+
const vec = computeVector(safeTitle + ' ' + safeContent, vocab);
|
|
898
|
+
if (vec) {
|
|
899
|
+
db.prepare('INSERT OR REPLACE INTO observation_vectors (observation_id, vector, vocab_version, created_at_epoch) VALUES (?, ?, ?, ?)')
|
|
900
|
+
.run(Number(result.lastInsertRowid), Buffer.from(vec.buffer), vocab.version, Date.now());
|
|
901
|
+
}
|
|
899
902
|
}
|
|
900
|
-
}
|
|
901
|
-
|
|
903
|
+
} catch (e) { debugCatch(e, 'mem_save-vector'); }
|
|
904
|
+
|
|
905
|
+
return result;
|
|
906
|
+
});
|
|
907
|
+
const result = saveTx();
|
|
902
908
|
|
|
903
909
|
return { content: [{ type: 'text', text: `Saved as observation #${result.lastInsertRowid} [${type}] in project "${project}".` }] };
|
|
904
910
|
})
|
|
@@ -1314,12 +1320,18 @@ server.registerTool(
|
|
|
1314
1320
|
for (const group of args.merge_ids) {
|
|
1315
1321
|
if (group.length < 2) continue;
|
|
1316
1322
|
const [keepId, ...removeIds] = group;
|
|
1317
|
-
for (const removeId of removeIds)
|
|
1318
|
-
|
|
1323
|
+
for (const removeId of removeIds) {
|
|
1324
|
+
const result = mergeStmt.run(keepId, removeId);
|
|
1325
|
+
totalMerged += result.changes;
|
|
1326
|
+
}
|
|
1319
1327
|
}
|
|
1320
1328
|
results.push(`Merged ${totalMerged} duplicate observations`);
|
|
1321
1329
|
}
|
|
1322
1330
|
|
|
1331
|
+
if (!ops.includes('dedup') && args.merge_ids) {
|
|
1332
|
+
results.push('Warning: merge_ids provided but "dedup" not in operations — merge_ids ignored');
|
|
1333
|
+
}
|
|
1334
|
+
|
|
1323
1335
|
if (ops.includes('purge_stale')) {
|
|
1324
1336
|
// Delete observations previously marked as pending-purge by idle cleanup.
|
|
1325
1337
|
// Requires user confirmation via /mem:update or /mem:mem.
|
|
@@ -1345,7 +1357,7 @@ server.registerTool(
|
|
|
1345
1357
|
if (ops.includes('rebuild_vectors')) {
|
|
1346
1358
|
try {
|
|
1347
1359
|
_resetVocabCache();
|
|
1348
|
-
const vocab =
|
|
1360
|
+
const vocab = rebuildVocabulary(db);
|
|
1349
1361
|
if (!vocab) {
|
|
1350
1362
|
results.push('Vectors: no observations to build vocabulary from');
|
|
1351
1363
|
} else {
|
package/utils.mjs
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
// claude-mem-lite shared utilities
|
|
2
2
|
// Used by server.mjs, hook.mjs, and tests
|
|
3
3
|
|
|
4
|
+
|
|
4
5
|
import { basename, dirname } from 'path';
|
|
5
6
|
import { execSync } from 'child_process';
|
|
6
7
|
|
|
8
|
+
// ─── Re-exports from extracted modules ──────────────────────────────────────
|
|
9
|
+
// Backward compatibility: all consumers import from utils.mjs
|
|
10
|
+
|
|
11
|
+
export { DECAY_HALF_LIFE_BY_TYPE, DEFAULT_DECAY_HALF_LIFE_MS, OBS_BM25, SESS_BM25, TYPE_DECAY_CASE, OBS_FTS_COLUMNS } from './scoring-sql.mjs';
|
|
12
|
+
export { cjkBigrams, extractCjkSynonymTokens, SYNONYM_MAP, expandToken, sanitizeFtsQuery, relaxFtsQueryToOr, FTS_STOP_WORDS, CJK_COMPOUNDS } from './nlp.mjs';
|
|
13
|
+
|
|
7
14
|
// ─── Sentinel Values ────────────────────────────────────────────────────────
|
|
8
15
|
|
|
9
16
|
/** compressed_into sentinel: auto-compressed without merge target */
|
|
@@ -11,45 +18,6 @@ export const COMPRESSED_AUTO = -1;
|
|
|
11
18
|
/** compressed_into sentinel: pending user-confirmed purge (marked by idle cleanup) */
|
|
12
19
|
export const COMPRESSED_PENDING_PURGE = -2;
|
|
13
20
|
|
|
14
|
-
// ─── Type-Differentiated Recency Decay ──────────────────────────────────────
|
|
15
|
-
|
|
16
|
-
/** Recency half-life per observation type (in milliseconds) */
|
|
17
|
-
export const DECAY_HALF_LIFE_BY_TYPE = {
|
|
18
|
-
decision: 90 * 86400000, // 90 days — architectural decisions persist
|
|
19
|
-
discovery: 60 * 86400000, // 60 days — learned patterns last
|
|
20
|
-
feature: 30 * 86400000, // 30 days — feature work is mid-range
|
|
21
|
-
bugfix: 14 * 86400000, // 14 days — bugs are usually one-off
|
|
22
|
-
refactor: 14 * 86400000, // 14 days — code cleanup
|
|
23
|
-
change: 7 * 86400000, // 7 days — routine changes decay fast
|
|
24
|
-
};
|
|
25
|
-
export const DEFAULT_DECAY_HALF_LIFE_MS = 14 * 86400000;
|
|
26
|
-
|
|
27
|
-
// ─── BM25 Weight Constants ──────────────────────────────────────────────────
|
|
28
|
-
// Single source of truth for FTS5 BM25 weight expressions.
|
|
29
|
-
// Column order must match ensureFTS() calls in schema.mjs.
|
|
30
|
-
|
|
31
|
-
/** observations_fts BM25 weights: title=10, subtitle=5, narrative=5, text=3, facts=3, concepts=2, lesson_learned=8 */
|
|
32
|
-
export const OBS_BM25 = 'bm25(observations_fts, 10, 5, 5, 3, 3, 2, 8)';
|
|
33
|
-
|
|
34
|
-
/** session_summaries_fts BM25 weights: request=5, investigated=3, learned=3, completed=3, next_steps=2, notes=1, remaining_items=1 */
|
|
35
|
-
export const SESS_BM25 = 'bm25(session_summaries_fts, 5, 3, 3, 3, 2, 1, 1)';
|
|
36
|
-
|
|
37
|
-
/** FTS5 columns for observations (must match BM25 weight order) */
|
|
38
|
-
export const OBS_FTS_COLUMNS = ['title', 'subtitle', 'narrative', 'text', 'facts', 'concepts', 'lesson_learned'];
|
|
39
|
-
|
|
40
|
-
/** SQL CASE for type-differentiated recency decay half-lives (milliseconds) */
|
|
41
|
-
export const TYPE_DECAY_CASE = `(
|
|
42
|
-
CASE o.type
|
|
43
|
-
WHEN 'decision' THEN 7776000000.0
|
|
44
|
-
WHEN 'discovery' THEN 5184000000.0
|
|
45
|
-
WHEN 'feature' THEN 2592000000.0
|
|
46
|
-
WHEN 'bugfix' THEN 1209600000.0
|
|
47
|
-
WHEN 'refactor' THEN 1209600000.0
|
|
48
|
-
WHEN 'change' THEN 604800000.0
|
|
49
|
-
ELSE 1209600000.0
|
|
50
|
-
END
|
|
51
|
-
)`;
|
|
52
|
-
|
|
53
21
|
// ─── String Utilities ────────────────────────────────────────────────────────
|
|
54
22
|
|
|
55
23
|
/**
|
|
@@ -229,223 +197,6 @@ export function typeIcon(type) {
|
|
|
229
197
|
return icons[type] || '⚪';
|
|
230
198
|
}
|
|
231
199
|
|
|
232
|
-
// ─── FTS5 ────────────────────────────────────────────────────────────────────
|
|
233
|
-
|
|
234
|
-
const FTS5_KEYWORDS = new Set(['AND', 'OR', 'NOT', 'NEAR']);
|
|
235
|
-
|
|
236
|
-
// Synonym/abbreviation map: query abbreviation → expanded full forms
|
|
237
|
-
// Bidirectional: both directions are registered so "K8s" finds "Kubernetes" and vice versa
|
|
238
|
-
const SYNONYM_MAP = new Map();
|
|
239
|
-
const SYNONYM_PAIRS = [
|
|
240
|
-
// Abbreviation ↔ full form
|
|
241
|
-
['k8s', 'kubernetes'],
|
|
242
|
-
['db', 'database'],
|
|
243
|
-
['js', 'javascript'],
|
|
244
|
-
['ts', 'typescript'],
|
|
245
|
-
['py', 'python'],
|
|
246
|
-
['ci', 'continuous integration'],
|
|
247
|
-
['cd', 'continuous deployment'],
|
|
248
|
-
['ws', 'websocket'],
|
|
249
|
-
['auth', 'authentication'],
|
|
250
|
-
['authn', 'authentication'],
|
|
251
|
-
['authz', 'authorization'],
|
|
252
|
-
['config', 'configuration'],
|
|
253
|
-
['deps', 'dependencies'],
|
|
254
|
-
['env', 'environment'],
|
|
255
|
-
['infra', 'infrastructure'],
|
|
256
|
-
['msg', 'message'],
|
|
257
|
-
['pkg', 'package'],
|
|
258
|
-
['repo', 'repository'],
|
|
259
|
-
['req', 'request'],
|
|
260
|
-
['res', 'response'],
|
|
261
|
-
['ml', 'machine learning'],
|
|
262
|
-
['ai', 'artificial intelligence'],
|
|
263
|
-
['api', 'application programming interface'],
|
|
264
|
-
['ui', 'user interface'],
|
|
265
|
-
['ux', 'user experience'],
|
|
266
|
-
['fe', 'frontend'],
|
|
267
|
-
['be', 'backend'],
|
|
268
|
-
['gql', 'graphql'],
|
|
269
|
-
['tf', 'terraform'],
|
|
270
|
-
['cdk', 'cloud development kit'],
|
|
271
|
-
['iac', 'infrastructure as code'],
|
|
272
|
-
['e2e', 'end to end'],
|
|
273
|
-
['perf', 'performance'],
|
|
274
|
-
['impl', 'implementation'],
|
|
275
|
-
['fn', 'function'],
|
|
276
|
-
['util', 'utility'],
|
|
277
|
-
['utils', 'utilities'],
|
|
278
|
-
['err', 'error'],
|
|
279
|
-
['src', 'source'],
|
|
280
|
-
['lib', 'library'],
|
|
281
|
-
['dev', 'development'],
|
|
282
|
-
['prod', 'production'],
|
|
283
|
-
['async', 'asynchronous'],
|
|
284
|
-
['sync', 'synchronous'],
|
|
285
|
-
// Semantic equivalents — precise synonyms only (overly broad bridges removed)
|
|
286
|
-
['login', 'signin'],
|
|
287
|
-
['bug', 'error'],
|
|
288
|
-
['bug', 'defect'],
|
|
289
|
-
['crash', 'panic'],
|
|
290
|
-
['crash', 'segfault'],
|
|
291
|
-
['slow', 'latency'],
|
|
292
|
-
['remove', 'delete'],
|
|
293
|
-
['setup', 'install'],
|
|
294
|
-
['deploy', 'release'],
|
|
295
|
-
['deploy', 'publish'],
|
|
296
|
-
['refactor', 'restructure'],
|
|
297
|
-
['test', 'spec'],
|
|
298
|
-
['cache', 'caching'],
|
|
299
|
-
['cache', 'memoize'],
|
|
300
|
-
['optimize', 'optimization'],
|
|
301
|
-
['fix', 'bugfix'],
|
|
302
|
-
['fix', 'patch'],
|
|
303
|
-
['debug', 'debugging'],
|
|
304
|
-
['debug', 'troubleshoot'],
|
|
305
|
-
['error', 'failure'],
|
|
306
|
-
['migrate', 'migration'],
|
|
307
|
-
// ─── CJK ↔ EN cross-language synonyms ───
|
|
308
|
-
// Authentication & Authorization
|
|
309
|
-
['认证', 'auth'], ['认证', 'authentication'], ['登录', 'login'], ['登录', 'auth'],
|
|
310
|
-
['授权', 'authorization'], ['权限', 'permission'],
|
|
311
|
-
// Deployment & Operations
|
|
312
|
-
['部署', 'deploy'], ['部署', 'deployment'], ['发布', 'release'], ['发布', 'publish'],
|
|
313
|
-
// Data & Storage
|
|
314
|
-
['缓存', 'cache'], ['缓存', 'caching'],
|
|
315
|
-
['数据库', 'database'], ['数据库', 'db'],
|
|
316
|
-
// Testing & Debugging
|
|
317
|
-
['测试', 'test'], ['测试', 'testing'],
|
|
318
|
-
['调试', 'debug'], ['调试', 'debugging'],
|
|
319
|
-
['修复', 'fix'], ['修复', 'bugfix'],
|
|
320
|
-
// Code Quality
|
|
321
|
-
['重构', 'refactor'], ['重构', 'refactoring'],
|
|
322
|
-
['配置', 'config'], ['配置', 'configuration'],
|
|
323
|
-
// API & Networking
|
|
324
|
-
['接口', 'api'], ['接口', 'endpoint'],
|
|
325
|
-
['路由', 'route'], ['路由', 'routing'],
|
|
326
|
-
['中间件', 'middleware'],
|
|
327
|
-
// UI & Components
|
|
328
|
-
['组件', 'component'], ['模板', 'template'],
|
|
329
|
-
// Database Operations
|
|
330
|
-
['迁移', 'migration'], ['迁移', 'migrate'],
|
|
331
|
-
['索引', 'index'], ['查询', 'query'], ['查询', 'search'],
|
|
332
|
-
['排序', 'sort'], ['分页', 'pagination'],
|
|
333
|
-
// Validation & Security
|
|
334
|
-
['验证', 'validate'], ['验证', 'validation'],
|
|
335
|
-
['加密', 'encrypt'], ['加密', 'encryption'],
|
|
336
|
-
['会话', 'session'], ['令牌', 'token'],
|
|
337
|
-
// Patterns & Architecture
|
|
338
|
-
['钩子', 'hook'], ['回调', 'callback'],
|
|
339
|
-
['异步', 'async'], ['同步', 'sync'],
|
|
340
|
-
['并发', 'concurrent'], ['线程', 'thread'],
|
|
341
|
-
// Performance
|
|
342
|
-
['性能', 'performance'], ['性能', 'perf'],
|
|
343
|
-
['内存', 'memory'], ['泄漏', 'leak'],
|
|
344
|
-
['超时', 'timeout'], ['重试', 'retry'],
|
|
345
|
-
// Observability
|
|
346
|
-
['日志', 'log'], ['日志', 'logging'],
|
|
347
|
-
['监控', 'monitor'], ['告警', 'alert'],
|
|
348
|
-
// Build & Dependencies
|
|
349
|
-
['依赖', 'dependency'], ['构建', 'build'], ['构建', 'compile'],
|
|
350
|
-
['打包', 'bundle'], ['类型', 'type'], ['类型', 'typescript'],
|
|
351
|
-
// Errors
|
|
352
|
-
['错误', 'error'], ['异常', 'exception'],
|
|
353
|
-
// Infrastructure
|
|
354
|
-
['容器', 'container'], ['容器', 'docker'],
|
|
355
|
-
['集群', 'cluster'], ['集群', 'kubernetes'],
|
|
356
|
-
['网关', 'gateway'], ['负载', 'load balancing'],
|
|
357
|
-
['队列', 'queue'], ['序列化', 'serialize'],
|
|
358
|
-
];
|
|
359
|
-
// Build bidirectional lookup (case-insensitive)
|
|
360
|
-
for (const [abbr, full] of SYNONYM_PAIRS) {
|
|
361
|
-
const aLow = abbr.toLowerCase();
|
|
362
|
-
const fLow = full.toLowerCase();
|
|
363
|
-
if (!SYNONYM_MAP.has(aLow)) SYNONYM_MAP.set(aLow, new Set());
|
|
364
|
-
SYNONYM_MAP.get(aLow).add(fLow);
|
|
365
|
-
if (!SYNONYM_MAP.has(fLow)) SYNONYM_MAP.set(fLow, new Set());
|
|
366
|
-
SYNONYM_MAP.get(fLow).add(aLow);
|
|
367
|
-
}
|
|
368
|
-
|
|
369
|
-
/**
 * Render a single term as an FTS5 query token.
 *
 * Terms made only of ASCII letters, digits, or CJK ideographs are returned
 * bare. Anything else (spaces, hyphens, punctuation, ...) is wrapped in double
 * quotes, with embedded double quotes doubled.
 *
 * @param {string} term Term to format
 * @returns {string} Bare or quoted FTS5 token
 */
function ftsToken(term) {
  const needsQuoting = !/^[a-zA-Z0-9\u4e00-\u9fff\u3400-\u4dbf]+$/.test(term);
  if (needsQuoting) {
    const escaped = term.replace(/"/g, '""');
    return `"${escaped}"`;
  }
  return term;
}
|
|
375
|
-
|
|
376
|
-
/**
 * Expand a query token with its registered synonyms.
 *
 * The lookup in SYNONYM_MAP is case-insensitive. Without synonyms the token is
 * returned as a plain FTS5 token; otherwise the result is a parenthesized OR
 * group such as `(db OR database)`, with the original token first.
 *
 * @param {string} token Raw query token
 * @returns {string} FTS5 token or OR group
 */
function expandToken(token) {
  const related = SYNONYM_MAP.get(token.toLowerCase());
  const base = ftsToken(token);
  if (!related || related.size === 0) return base;
  const alternatives = [base, ...Array.from(related, (synonym) => ftsToken(synonym))];
  return `(${alternatives.join(' OR ')})`;
}
|
|
386
|
-
|
|
387
|
-
/**
 * Turn a raw user query into a valid FTS5 query string.
 *
 * Steps:
 *  1. Replace FTS5 special characters with spaces and strip a leading '-' from
 *     each word.
 *  2. Drop dash-only words, FTS5 keywords (including NEAR/n), and single ASCII
 *     letters.
 *  3. Expand each remaining word through expandToken, de-duplicating.
 *  4. Append CJK tokens from cjkBigrams; when any exist, single CJK characters
 *     are dropped in their favour.
 *  5. Join with ' AND ' when any synonym group is present, else with a space.
 *
 * @param {string} query Raw user search query
 * @returns {string|null} FTS5-safe query, or null when nothing usable remains
 */
export function sanitizeFtsQuery(query) {
  if (!query) return null;

  const cleaned = query
    .replace(/[{}()[\]^~*:"\\]/g, ' ')
    .replace(/(^|\s)-/g, '$1')
    .trim();
  if (!cleaned) return null;

  const isUsable = (word) => {
    if (!word) return false;
    if (/^-+$/.test(word)) return false;
    if (FTS5_KEYWORDS.has(word.toUpperCase())) return false;
    if (/^NEAR\/\d+$/i.test(word)) return false;
    // Single ASCII letters are too noisy for FTS5.
    return !(word.length === 1 && /^[a-zA-Z]$/.test(word));
  };
  const words = cleaned.split(/\s+/).filter(isUsable);
  if (words.length === 0) return null;

  const bigramText = cjkBigrams(cleaned);
  const bigramTokens = new Set(bigramText ? bigramText.split(' ').filter(Boolean) : []);
  const dropLoneCjk = bigramTokens.size > 0;

  const output = [];
  const emitted = new Set();
  // Raw words already emitted, so a bigram equal to one is not added twice.
  const rawEmitted = new Set();

  for (const word of words) {
    const isLoneCjk = /^[\u4e00-\u9fff\u3400-\u4dbf]$/.test(word);
    if (dropLoneCjk && isLoneCjk) continue;
    const expanded = expandToken(word);
    if (emitted.has(expanded)) continue;
    emitted.add(expanded);
    rawEmitted.add(word);
    output.push(expanded);
  }

  for (const bigram of bigramTokens) {
    if (emitted.has(bigram) || rawEmitted.has(bigram)) continue;
    emitted.add(bigram);
    output.push(bigram);
  }

  if (output.length === 0) return null;
  // FTS5 requires an explicit AND after parenthesized OR groups.
  const joiner = output.some((part) => part.startsWith('(')) ? ' AND ' : ' ';
  return output.join(joiner);
}
|
|
428
|
-
|
|
429
|
-
/**
 * Relax an AND-joined FTS5 query to an OR-joined one for fallback search.
 *
 * Explicit " AND " joins are rewritten to " OR ". When the query has no
 * explicit AND and no OR, its tokens are implicitly AND-ed by whitespace and
 * are re-joined with " OR ". Tokenization there is quote-aware, so a quoted
 * multi-word phrase stays a single token instead of being split into a
 * malformed query.
 *
 * @param {string} ftsQuery Original FTS5 query from sanitizeFtsQuery
 * @returns {string|null} OR-joined query, or null if relaxation wouldn't help
 *   (empty input, a single token, or a query that is already OR-only)
 */
export function relaxFtsQueryToOr(ftsQuery) {
  if (!ftsQuery) return null;

  const orQuery = ftsQuery.replace(/ AND /g, ' OR ');
  if (orQuery !== ftsQuery) return orQuery;

  // No explicit AND. If the query already uses OR there is nothing to relax.
  if (ftsQuery.includes(' OR ')) return null;

  // Implicit AND: split into tokens, keeping "quoted phrases" intact
  // (a doubled "" inside quotes is an escaped quote).
  const parts = ftsQuery.match(/"(?:[^"]|"")*"|\S+/g) ?? [];
  if (parts.length < 2) return null; // single token — OR won't help
  return parts.join(' OR ');
}
|
|
448
|
-
|
|
449
200
|
// ─── Importance ──────────────────────────────────────────────────────────────
|
|
450
201
|
|
|
451
202
|
/**
|
|
@@ -499,73 +250,6 @@ export function computeRuleImportance(episode) {
|
|
|
499
250
|
return importance;
|
|
500
251
|
}
|
|
501
252
|
|
|
502
|
-
/**
|
|
503
|
-
* Generate CJK bigrams from text for improved Chinese phrase matching in FTS5.
|
|
504
|
-
* "修复了系统崩溃" → "修复 系统 统崩 崩溃"
|
|
505
|
-
* @param {string} text Input text containing CJK characters
|
|
506
|
-
* @returns {string} Space-separated bigrams
|
|
507
|
-
*/
|
|
508
|
-
// Common CJK compound words (2-4 chars) — dictionary-first tokenization.
|
|
509
|
-
// When a compound word is found, it's emitted as a whole token instead of being
|
|
510
|
-
// split into overlapping bigrams. This dramatically reduces noise:
|
|
511
|
-
// "数据库" → "数据库" (1 token) instead of "数据 据库" (2 noisy tokens)
|
|
512
|
-
const CJK_COMPOUNDS = new Set([
|
|
513
|
-
// tech/programming
|
|
514
|
-
'数据库', '数据', '接口', '函数', '变量', '组件', '模块', '配置', '框架', '部署',
|
|
515
|
-
'测试', '调试', '编译', '打包', '构建', '缓存', '索引', '迁移', '回滚', '权限',
|
|
516
|
-
'认证', '授权', '加密', '解密', '序列', '并发', '异步', '同步', '线程', '进程',
|
|
517
|
-
'容器', '集群', '服务器', '中间件', '网关', '负载', '监控', '日志', '告警',
|
|
518
|
-
'前端', '后端', '全栈', '响应式', '路由', '状态', '渲染', '样式', '布局',
|
|
519
|
-
// actions
|
|
520
|
-
'修复', '重构', '优化', '升级', '安装', '卸载', '导入', '导出', '上传', '下载',
|
|
521
|
-
'提交', '推送', '合并', '发布', '上线', '回退', '审查', '审核', '评审',
|
|
522
|
-
// errors/issues
|
|
523
|
-
'报错', '崩溃', '泄露', '溢出', '死锁', '超时', '中断', '异常', '故障',
|
|
524
|
-
// architecture
|
|
525
|
-
'架构', '设计', '方案', '规划', '文档', '注释', '版本', '分支', '依赖',
|
|
526
|
-
'性能', '安全', '漏洞', '补丁',
|
|
527
|
-
]);
|
|
528
|
-
|
|
529
|
-
// Sort by length descending for greedy matching
|
|
530
|
-
const CJK_SORTED = [...CJK_COMPOUNDS].sort((a, b) => b.length - a.length);
|
|
531
|
-
|
|
532
|
-
/**
 * Generate search tokens from CJK text using dictionary-first tokenization.
 *
 * Each run of two or more CJK ideographs is scanned left to right. At every
 * position the first entry of CJK_SORTED that matches there is emitted whole
 * and the scan jumps past it. Otherwise the two-character bigram starting at
 * that position is emitted (if a next character exists) and the scan advances
 * by one. Duplicates are removed, keeping first-occurrence order.
 *
 * Example: "修复了数据库崩溃" → "修复 了数 数据库 崩溃" when the dictionary
 * contains 修复, 数据库 and 崩溃.
 *
 * @param {string} text Input text containing CJK characters
 * @returns {string} Space-separated tokens ('' when there are none)
 */
export function cjkBigrams(text) {
  if (!text) return '';
  const cjkRuns = text.match(/[\u4e00-\u9fff\u3400-\u4dbf]{2,}/g) || [];
  const found = new Set();
  for (const run of cjkRuns) {
    let pos = 0;
    while (pos < run.length) {
      const compound = CJK_SORTED.find((word) => run.startsWith(word, pos));
      if (compound !== undefined) {
        found.add(compound);
        pos += compound.length;
        continue;
      }
      // Unknown character: fall back to a bigram with its right neighbour.
      if (pos + 1 < run.length) found.add(run[pos] + run[pos + 1]);
      pos += 1;
    }
  }
  return [...found].join(' ');
}
|
|
568
|
-
|
|
569
253
|
// ─── Project Inference ───────────────────────────────────────────────────────
|
|
570
254
|
|
|
571
255
|
/**
|