claude-mem-lite 2.88.0 → 2.90.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/install.mjs CHANGED
@@ -10,8 +10,18 @@ import { createRequire } from 'node:module';
10
10
 
11
11
  const PROJECT_DIR = resolve(import.meta.dirname ?? dirname(fileURLToPath(import.meta.url)));
12
12
  const SETTINGS_PATH = join(homedir(), '.claude', 'settings.json');
13
+ // Plugin CODE / install location — ALWAYS homedir-rooted. Claude Code's
14
+ // settings.json + MCP registration bake ABSOLUTE paths to server.mjs / hooks here,
15
+ // and env vars are per-shell (the MCP launcher won't reliably inherit
16
+ // CLAUDE_MEM_DIR), so code must NOT follow the relocation env var.
13
17
  const DATA_DIR = join(homedir(), '.claude-mem-lite');
14
- const DB_PATH = join(DATA_DIR, 'claude-mem-lite.db');
18
+ // User DATA location — DB, managed resources, registry DB, runtime/. Honors
19
+ // CLAUDE_MEM_DIR exactly like schema.mjs DB_DIR so the installer WRITES data where
20
+ // the runtime/data layer READS it (pre-fix: installer wrote homedir, runtime read
21
+ // the relocated dir → preinstalled skills silently vanished, doctor read the wrong
22
+ // DB). Equals DATA_DIR when CLAUDE_MEM_DIR is unset (the common case).
23
+ const MEM_DATA_DIR = process.env.CLAUDE_MEM_DIR || DATA_DIR;
24
+ const DB_PATH = join(MEM_DATA_DIR, 'claude-mem-lite.db');
15
25
  const OLD_DATA_DIR = join(homedir(), '.claude-mem');
16
26
 
17
27
  // Detect ephemeral context (npx) — files won't persist after exit
@@ -319,6 +329,8 @@ async function install() {
319
329
  }
320
330
 
321
331
  if (!existsSync(DATA_DIR)) mkdirSync(DATA_DIR, { recursive: true });
332
+ // Under relocation the DB/managed/runtime live here, not in the code dir — create it too.
333
+ if (!existsSync(MEM_DATA_DIR)) mkdirSync(MEM_DATA_DIR, { recursive: true });
322
334
 
323
335
  if (IS_DEV) {
324
336
  log('Dev mode — creating symlinks in ~/.claude-mem-lite/...');
@@ -675,7 +687,7 @@ async function install() {
675
687
  // "no such column: memory_session_id". Rename to a timestamped backup
676
688
  // so the new install creates a fresh v28 DB.
677
689
  try {
678
- const r = migrateLegacyClaudeMemData(OLD_DATA_DIR, DATA_DIR);
690
+ const r = migrateLegacyClaudeMemData(OLD_DATA_DIR, MEM_DATA_DIR);
679
691
  if (r.action === 'backed-up') {
680
692
  ok(`Legacy ~/.claude-mem/ DB backed up to ${r.backupPath}`);
681
693
  log('New v28 DB will be created on first launch (legacy schema is incompatible).');
@@ -685,7 +697,7 @@ async function install() {
685
697
  }
686
698
 
687
699
  // 5b. Rename claude-mem.db → claude-mem-lite.db in same directory
688
- const oldDbInDir = join(DATA_DIR, 'claude-mem.db');
700
+ const oldDbInDir = join(MEM_DATA_DIR, 'claude-mem.db');
689
701
  if (existsSync(oldDbInDir) && !existsSync(DB_PATH)) {
690
702
  renameSync(oldDbInDir, DB_PATH);
691
703
  for (const ext of ['-wal', '-shm']) {
@@ -714,7 +726,7 @@ async function install() {
714
726
  const resources = manifest.resources || [];
715
727
 
716
728
  if (resources.length > 0) {
717
- const managedDir = join(DATA_DIR, 'managed');
729
+ const managedDir = join(MEM_DATA_DIR, 'managed');
718
730
 
719
731
  // 6a. Git shallow clone unique repos
720
732
  const repos = new Map();
@@ -805,7 +817,7 @@ async function install() {
805
817
 
806
818
  // 6b. Init registry DB and record preinstalled entries
807
819
  const { ensureRegistryDb } = await importFromInstall('registry.mjs');
808
- const regDbPath = join(DATA_DIR, 'resource-registry.db');
820
+ const regDbPath = join(MEM_DATA_DIR, 'resource-registry.db');
809
821
  const rdb = ensureRegistryDb(regDbPath);
810
822
 
811
823
  const insertPre = rdb.prepare(`
@@ -853,7 +865,7 @@ async function install() {
853
865
  // 6d. Scan and index resources (fallback-only, Haiku indexing deferred to first run)
854
866
  log(' Scanning resources...');
855
867
  const { scanAllResources, diffResources } = await importFromInstall('registry-scanner.mjs');
856
- const scanned = scanAllResources({ dataDir: DATA_DIR });
868
+ const scanned = scanAllResources({ dataDir: MEM_DATA_DIR });
857
869
 
858
870
  // Attach star counts and repo URLs
859
871
  for (const s of scanned) {
@@ -1063,15 +1075,26 @@ async function uninstall() {
1063
1075
 
1064
1076
  // 6. Purge data if requested
1065
1077
  if (flags.has('--purge')) {
1066
- const expectedPurgePath = join(homedir(), '.claude-mem-lite');
1067
- if (existsSync(DATA_DIR) && DATA_DIR === expectedPurgePath) {
1078
+ const homeDir = join(homedir(), '.claude-mem-lite');
1079
+ // Always remove the homedir code/install dir (guarded to the canonical path).
1080
+ if (existsSync(DATA_DIR) && DATA_DIR === homeDir) {
1068
1081
  rmSync(DATA_DIR, { recursive: true, force: true });
1069
1082
  ok('Data purged (~/.claude-mem-lite/)');
1070
1083
  } else if (existsSync(DATA_DIR)) {
1071
1084
  fail('DATA_DIR path mismatch, refusing to purge for safety: ' + DATA_DIR);
1072
1085
  }
1086
+ // Also remove the relocated data dir — but ONLY if it's genuinely our data dir
1087
+ // (contains claude-mem-lite.db), so a mistyped CLAUDE_MEM_DIR is never rm'd.
1088
+ if (MEM_DATA_DIR !== homeDir) {
1089
+ if (existsSync(join(MEM_DATA_DIR, 'claude-mem-lite.db'))) {
1090
+ rmSync(MEM_DATA_DIR, { recursive: true, force: true });
1091
+ ok(`Relocated data purged (${MEM_DATA_DIR})`);
1092
+ } else if (existsSync(MEM_DATA_DIR)) {
1093
+ warn(`CLAUDE_MEM_DIR (${MEM_DATA_DIR}) has no claude-mem-lite.db — left untouched. Remove manually if intended.`);
1094
+ }
1095
+ }
1073
1096
  } else {
1074
- log('Data preserved in ~/.claude-mem-lite/ (use --purge to remove)');
1097
+ log('Data preserved (use --purge to remove)');
1075
1098
  }
1076
1099
 
1077
1100
  console.log('\n Done!\n');
@@ -1383,7 +1406,7 @@ async function doctor() {
1383
1406
 
1384
1407
  // Update state
1385
1408
  try {
1386
- const stateFile = join(INSTALL_DIR, 'runtime', 'update-state.json');
1409
+ const stateFile = join(MEM_DATA_DIR, 'runtime', 'update-state.json');
1387
1410
  if (existsSync(stateFile)) {
1388
1411
  const state = JSON.parse(readFileSync(stateFile, 'utf8'));
1389
1412
  const parts = [];
@@ -1439,11 +1462,14 @@ async function doctor() {
1439
1462
 
1440
1463
  // Stale temp files
1441
1464
  try {
1442
- const runtimeDir = join(INSTALL_DIR, 'runtime');
1465
+ // hook-update + the episode workers write runtime/ + staging under DB_DIR
1466
+ // (= MEM_DATA_DIR, env-aware), NOT the homedir code dir — scan there so doctor
1467
+ // sees the real residue under relocation.
1468
+ const runtimeDir = join(MEM_DATA_DIR, 'runtime');
1443
1469
  let staleCount = 0;
1444
1470
  const stalePatterns = ['.update-staging-', '.update-backup-'];
1445
- if (existsSync(INSTALL_DIR)) {
1446
- for (const f of readdirSync(INSTALL_DIR)) {
1471
+ if (existsSync(MEM_DATA_DIR)) {
1472
+ for (const f of readdirSync(MEM_DATA_DIR)) {
1447
1473
  if (stalePatterns.some(p => f.startsWith(p))) staleCount++;
1448
1474
  }
1449
1475
  }
@@ -1712,10 +1738,11 @@ function cleanup() {
1712
1738
  console.log(`\nclaude-mem-lite cleanup${dryRun ? ' (--dry-run)' : ''}\n`);
1713
1739
  let removed = 0;
1714
1740
 
1715
- // Clean .update-staging-* / .update-backup-* in INSTALL_DIR
1741
+ // Clean .update-staging-* / .update-backup-* — hook-update writes these under
1742
+ // DB_DIR (= MEM_DATA_DIR, env-aware), so scan the data dir, not the homedir code dir.
1716
1743
  const stalePatterns = ['.update-staging-', '.update-backup-'];
1717
- if (existsSync(INSTALL_DIR)) {
1718
- for (const f of readdirSync(INSTALL_DIR)) {
1744
+ if (existsSync(MEM_DATA_DIR)) {
1745
+ for (const f of readdirSync(MEM_DATA_DIR)) {
1719
1746
  if (stalePatterns.some(p => f.startsWith(p))) {
1720
1747
  if (dryRun) {
1721
1748
  ok(`Would remove: ${f}`);
@@ -1723,7 +1750,7 @@ function cleanup() {
1723
1750
  continue;
1724
1751
  }
1725
1752
  try {
1726
- rmSync(join(INSTALL_DIR, f), { recursive: true, force: true });
1753
+ rmSync(join(MEM_DATA_DIR, f), { recursive: true, force: true });
1727
1754
  ok(`Removed: ${f}`);
1728
1755
  removed++;
1729
1756
  } catch (e) {
@@ -1733,8 +1760,8 @@ function cleanup() {
1733
1760
  }
1734
1761
  }
1735
1762
 
1736
- // Clean pending-* / ep-flush-* in runtime/
1737
- const runtimeDir = join(INSTALL_DIR, 'runtime');
1763
+ // Clean pending-* / ep-flush-* in runtime/ (under the env-aware data dir)
1764
+ const runtimeDir = join(MEM_DATA_DIR, 'runtime');
1738
1765
  if (existsSync(runtimeDir)) {
1739
1766
  for (const f of readdirSync(runtimeDir)) {
1740
1767
  if (f.startsWith('pending-') || f.startsWith('ep-flush-')) {
@@ -387,6 +387,42 @@ const IMPORTANCE_CAP = 3;
387
387
  const IMPORTANCE_FLOOR = 0;
388
388
  const UNCITED_STREAK_THRESHOLD = 3;
389
389
 
390
+ // Adoption-rate gate (P5 ②). A project's cite-rate is SUM(cited_count) /
391
+ // SUM(decay_seen_count) over its non-superseded observations: of every decay
392
+ // resolution this project has ever produced, what fraction were citations.
393
+ // Below ADOPTION_THRESHOLD with at least ADOPTION_MIN_SEEN resolutions on record,
394
+ // the project has demonstrably not adopted the #NN convention, so we suppress
395
+ // DEMOTION (never promotion) — see the construct-validity note on
396
+ // applyCitationDecay. MIN_SEEN keeps the gate dormant for low-data projects so
397
+ // the established behavior is preserved until there's enough signal to judge.
398
+ const ADOPTION_THRESHOLD = 0.02;
399
+ const ADOPTION_MIN_SEEN = 8;
400
+
401
/**
 * Take a read-only snapshot of how well a project has adopted citations.
 *
 * Sums `cited_count` and `decay_seen_count` across the project's observations
 * that have not been superseded, and reports the two totals together with
 * their ratio. Nothing is written, so callers can take the snapshot before
 * they start mutating rows.
 *
 * A falsy `db` or `project`, or any error raised while querying, yields the
 * all-zero snapshot; the error is handed to debugCatch rather than rethrown.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} project
 * @returns {{cited: number, seen: number, rate: number}} `rate` is
 *   `cited / seen`, or 0 when `seen` is 0.
 */
export function computeCitationAdoption(db, project) {
  const none = { cited: 0, seen: 0, rate: 0 };
  if (!db || !project) return none;

  try {
    const totalsStmt = db.prepare(`
      SELECT COALESCE(SUM(cited_count), 0) AS cited,
             COALESCE(SUM(decay_seen_count), 0) AS seen
      FROM observations
      WHERE project = ? AND superseded_at IS NULL
    `);
    const totals = totalsStmt.get(project);
    // A missing row or a falsy column value collapses to 0.
    const cited = totals?.cited || 0;
    const seen = totals?.seen || 0;
    // Guard the division so a project with no resolutions reports rate 0.
    const rate = seen > 0 ? cited / seen : 0;
    return { cited, seen, rate };
  } catch (err) {
    debugCatch(err, 'computeCitationAdoption');
    return none;
  }
}
425
+
390
426
  /**
391
427
  * Apply the citation-feedback loop for one session: for each injected obs id,
392
428
  * decide cited vs uncited and mutate importance/streak/cited_count per spec.
@@ -398,6 +434,20 @@ const UNCITED_STREAK_THRESHOLD = 3;
398
434
  * - cross-project IDs are silently ignored by the WHERE clause.
399
435
  * - MEM_DISABLE_CITATION_DECAY=1 disables all writes; returns zeros.
400
436
  *
437
+ * CONSTRUCT-VALIDITY ASSUMPTION (P5): a "citation" is operationally two signals,
438
+ * neither of which is ground-truth behavioral impact:
439
+ * 1. the literal `#NN` token appears in main-thread assistant text (citedIds), and
440
+ * 2. (cite-back) the agent edited a file a prior lesson #NN had warned about —
441
+ * unioned into citedIds by the Stop handler before this call.
442
+ * Signal 2 was added because signal 1 alone penalizes projects that act on a
443
+ * lesson without typing its id. Even so, both are proxies. For a project that has
444
+ * never cited anything (cite-rate below ADOPTION_THRESHOLD over ≥ADOPTION_MIN_SEEN
445
+ * resolutions), demotion is suppressed: absent any positive signal we cannot
446
+ * distinguish "useless lesson" from "useful lesson in a project that doesn't use
447
+ * the #NN convention," and a false demotion is the costlier error. The gate trades
448
+ * missed demotions (stale lessons linger) for avoided false demotions. Promotion
449
+ * is never gated — a single citation lifts the project's rate and re-enables decay.
450
+ *
401
451
  * @param {import('better-sqlite3').Database} db
402
452
  * @param {string} project
403
453
  * @param {Set<number>|Iterable<number>} injectedIds
@@ -413,6 +463,13 @@ export function applyCitationDecay(db, project, injectedIds, citedIds, sessionId
413
463
  if (injected.size === 0) return empty;
414
464
  const cited = citedIds instanceof Set ? citedIds : new Set(citedIds || []);
415
465
 
466
+ // Adoption gate (snapshot taken before any mutation this run). Suppress only
467
+ // demotion; promotion always proceeds. Threshold overridable via env.
468
+ const adoption = computeCitationAdoption(db, project);
469
+ const envThreshold = Number.parseFloat(process.env.CLAUDE_MEM_CITATION_ADOPTION_THRESHOLD);
470
+ const adoptionThreshold = Number.isFinite(envThreshold) && envThreshold >= 0 ? envThreshold : ADOPTION_THRESHOLD;
471
+ const suppressDemotion = adoption.seen >= ADOPTION_MIN_SEEN && adoption.rate < adoptionThreshold;
472
+
416
473
  const selectStmt = db.prepare(
417
474
  'SELECT id, importance, uncited_streak, last_decided_session_id FROM observations WHERE id = ? AND project = ?'
418
475
  );
@@ -457,7 +514,10 @@ export function applyCitationDecay(db, project, injectedIds, citedIds, sessionId
457
514
  promoted++;
458
515
  } else {
459
516
  const nextStreak = (row.uncited_streak || 0) + 1;
460
- if (nextStreak >= UNCITED_STREAK_THRESHOLD) {
517
+ // Demote only when the streak is up AND the project has demonstrably
518
+ // adopted citations. A non-adopting project advances the streak (idempotent
519
+ // bookkeeping) but never loses importance — see construct-validity note.
520
+ if (nextStreak >= UNCITED_STREAK_THRESHOLD && !suppressDemotion) {
461
521
  updateDemote.run(IMPORTANCE_FLOOR, sessionId, Date.now(), id);
462
522
  demoted++;
463
523
  } else {
@@ -17,6 +17,11 @@ import { EDIT_TOOLS } from '../utils.mjs';
17
17
 
18
18
  const MAX_FILES = 2;
19
19
 
20
+ // Leader literal for the cite-back hint. Shared by the builder (below) and the
21
+ // Stop-time signal extractor (extractCiteBackSignals) so the two can never drift
22
+ // — the extractor finds hint emissions by this exact prefix.
23
+ const CITE_BACK_HINT_LEADER = '[mem] ⚠ Cite-back:';
24
+
20
25
  export function buildCiteBackHint(episode, cooldown) {
21
26
  if (!episode || !cooldown) return null;
22
27
  const entries = episode.entries;
@@ -48,7 +53,7 @@ export function buildCiteBackHint(episode, cooldown) {
48
53
  // numeric framing is measurably harder to dismiss than a hedged hint.
49
54
  const totalLessons = matches.reduce((sum, m) => sum + m.ids.length, 0);
50
55
  const lines = [
51
- `[mem] ⚠ Cite-back: edited ${matches.length} file(s) with ${totalLessons} prior lesson(s) this session. Save now if any was the root cause:`,
56
+ `${CITE_BACK_HINT_LEADER} edited ${matches.length} file(s) with ${totalLessons} prior lesson(s) this session. Save now if any was the root cause:`,
52
57
  ];
53
58
  for (const m of matches) {
54
59
  const fname = basename(m.file);
@@ -242,3 +247,36 @@ export function loadCiteBackForEpisode(episode, runtimeDir) {
242
247
  }
243
248
  return buildCiteBackHint(episode, cooldown);
244
249
  }
250
+
251
// ─── extractCiteBackSignals (P5 ①) ──────────────────────────────────────────
// Stop-time positive-citation signal. Scans the transcript for cite-back hint
// emissions (PostToolUse attachment.stdout carrying CITE_BACK_HINT_LEADER — the
// same source countUnsavedBugfixShape reads) and collects the `#NN` lesson ids
// they name. Each id is an observation whose warned file the agent actually
// EDITED this session — a behavioral citation even when the agent never typed
// #NN. The Stop handler unions these into the cited set passed to
// applyCitationDecay (lib/citation-tracker.mjs), so acting on a lesson promotes
// it and lifts the project's adoption rate.

// `#` followed by 1–7 digits ending on a word boundary. Global so matchAll can
// walk every occurrence; matchAll works on a clone, so no lastIndex state leaks
// between calls.
const CITE_BACK_ID_RE = /#(\d{1,7})\b/g;

/**
 * Collect the lesson ids named by cite-back hints in a JSONL transcript.
 *
 * The transcript is read whole and split on newlines. A line contributes ids
 * only when it parses as JSON, has `type === 'attachment'`, and its
 * `attachment.stdout` is a string containing CITE_BACK_HINT_LEADER. Every
 * `#NN` token in that stdout is then collected.
 *
 * Best-effort by design: a missing or unreadable file, blank lines, lines that
 * are not JSON, JSON values that are not objects (e.g. a bare `null`), and
 * attachments whose stdout is absent or not a string are all skipped without
 * throwing.
 *
 * @param {string} transcriptPath Path to the session transcript (JSONL).
 * @returns {Set<number>} Positive integer ids; empty when nothing was found.
 */
export function extractCiteBackSignals(transcriptPath) {
  const ids = new Set();
  if (!transcriptPath || !existsSync(transcriptPath)) return ids;

  let raw;
  try { raw = readFileSync(transcriptPath, 'utf8'); } catch { return ids; }

  for (const line of raw.split('\n')) {
    if (!line.trim()) continue;

    let entry;
    try { entry = JSON.parse(line); } catch { continue; }

    // JSON.parse happily returns null or a scalar for lines like `null` / `42`;
    // optional chaining keeps such a line from throwing out of the whole scan.
    if (entry?.type !== 'attachment') continue;

    // Only a string can carry the hint; anything else (missing, number, object)
    // is treated as "no hint here" instead of crashing on `.includes`.
    const stdout = entry.attachment?.stdout;
    if (typeof stdout !== 'string' || !stdout.includes(CITE_BACK_HINT_LEADER)) continue;

    for (const match of stdout.matchAll(CITE_BACK_ID_RE)) {
      const id = Number(match[1]);
      if (Number.isInteger(id) && id > 0 && id < 1e7) ids.add(id);
    }
  }
  return ids;
}
package/lib/cli-flags.mjs CHANGED
@@ -19,6 +19,21 @@
19
19
 
20
20
  const DEFAULT_STDERR_WRITE = msg => process.stderr.write(msg);
21
21
 
22
/**
 * Report whether `raw` has the shape of a plain decimal number: an optional
 * leading minus, one or more digits, and optionally a dot followed by more
 * digits. Surrounding whitespace is ignored.
 *
 * The point is to reject shapes a bare parseInt would silently coerce —
 * trailing garbage ("2abc"→2), hex ("0x10"→0) and scientific notation
 * ("1e2"→1). Float literals ARE accepted (callers truncate via parseInt, the
 * deliberate #8277 decision). This is the single source of the strict-shape
 * rule shared by parseIntFlag and the reject-style numeric flags
 * (save/update --importance, defer --priority).
 *
 * @param {string|number} raw Flag value as captured by parseArgs.
 * @returns {boolean} true when the trimmed text is a clean integer or float token.
 */
export function isNumericToken(raw) {
  const token = String(raw).trim();
  const numericShape = /^-?\d+(\.\d+)?$/;
  return numericShape.test(token);
}
36
+
22
37
  /**
23
38
  * Validate and parse a CLI numeric flag with optional bounds.
24
39
  *
@@ -38,8 +53,15 @@ export function parseIntFlag(rawValue, opts) {
38
53
  return defaultValue;
39
54
  }
40
55
 
41
- const parsed = parseInt(rawValue, 10);
42
- if (!Number.isInteger(parsed) || parsed < min || parsed > max) {
56
+ // Reject trailing-garbage / hex / scientific tokens that bare parseInt would
57
+ // silently coerce by stopping at the first non-digit ("2abc"→2, "0x10"→0,
58
+ // "1e2"→1) — those slip past the Number.isInteger gate and violate the
59
+ // warn+default contract above. Float literals ("3.7"→3) stay ACCEPTED via
60
+ // parseInt truncation: that's the deliberate #8277 decision pinned by the
61
+ // 'rejects floats' case in cli-flags.test.mjs, so the shape check admits them.
62
+ const str = String(rawValue).trim();
63
+ const parsed = parseInt(str, 10);
64
+ if (!isNumericToken(str) || !Number.isInteger(parsed) || parsed < min || parsed > max) {
43
65
  const range = max === Number.MAX_SAFE_INTEGER ? `≥ ${min}` : `between ${min} and ${max}`;
44
66
  warn(`[mem] Invalid ${name} "${rawValue}" (must be an integer ${range}); using default ${defaultValue}\n`);
45
67
  return defaultValue;
@@ -10,11 +10,15 @@
10
10
  // granularity (CLI/MCP wrap all groups in one transaction; the hook transacts
11
11
  // each group). They no longer re-implement the mutation.
12
12
  //
13
- // NOTE: the summary INSERT still omits the observation_vectors write, matching
14
- // pre-extraction behavior. Fixing that (audit P5) is now a single change here
15
- // instead of three but it is a behavior change, intentionally NOT bundled.
13
+ // The summary INSERT also writes its TF-IDF observation_vectors row in the same
14
+ // (caller-owned) transaction — fixed once here rather than in all three call
15
+ // sites. Without it, FTS-miss queries that fall back to vector recall (CJK /
16
+ // concept / paraphrase) could never reach compressed summaries; the LLM
17
+ // smart-compress path already wrote vectors, so the deterministic path was the
18
+ // sole gap (audit P6).
16
19
 
17
- import { isoWeekKey, COMPRESSED_AUTO } from '../utils.mjs';
20
+ import { isoWeekKey, COMPRESSED_AUTO, debugCatch } from '../utils.mjs';
21
+ import { getVocabulary, computeVector } from '../tfidf.mjs';
18
22
  import { scrubRecord } from './scrub-record.mjs';
19
23
 
20
24
  /**
@@ -90,6 +94,22 @@ export function compressGroup(db, proj, obs) {
90
94
  `).run(sessionId, proj, safe.text, dominantType, safe.title, safe.narrative, medianDate.toISOString(), medianEpoch);
91
95
  const summaryId = Number(summaryResult.lastInsertRowid);
92
96
 
97
+ // TF-IDF vector for the summary so it is reachable by vector recall (parity
98
+ // with save-observation.mjs and the LLM smart-compress path). Best-effort:
99
+ // vocab may be uninitialized on a fresh DB — a failure here must not abort the
100
+ // compression the caller is transacting.
101
+ try {
102
+ const vocab = getVocabulary(db);
103
+ if (vocab) {
104
+ const vec = computeVector(`${safe.title} ${safe.narrative}`, vocab);
105
+ if (vec) {
106
+ db.prepare(
107
+ 'INSERT OR REPLACE INTO observation_vectors (observation_id, vector, vocab_version, created_at_epoch) VALUES (?, ?, ?, ?)'
108
+ ).run(summaryId, Buffer.from(vec.buffer), vocab.version, medianEpoch);
109
+ }
110
+ }
111
+ } catch (e) { debugCatch(e, 'compress-vector'); }
112
+
93
113
  const obsIds = obs.map((o) => o.id);
94
114
  const obsPh = obsIds.map(() => '?').join(',');
95
115
  db.prepare(`UPDATE observations SET compressed_into = ? WHERE id IN (${obsPh})`).run(summaryId, ...obsIds);
@@ -0,0 +1,35 @@
1
+ // Dedup / merge similarity thresholds — single source of truth (P10).
2
+ //
3
+ // All values are Jaccard-space (word-set overlap, 0..1) unless noted. They were
4
+ // scattered as bare literals and duplicate local consts across save-observation,
5
+ // maintain-core, hook-llm, hook-optimize, mem-cli, server, and hook; converging
6
+ // them here removes the drift risk and gives the P7 benchmark named knobs.
7
+ // Vector-side constants (VOCAB_DIM / MIN_COSINE_SIMILARITY / RRF_K) deliberately
8
+ // stay in tfidf.mjs next to the search engine that consumes them.
9
+ //
10
+ // Pure constants only — no imports, so nothing can import-cycle through this.
11
+
12
+ // 0.7: near-duplicate cutoff for save-time dedup (5-min window, lib/save-observation)
13
+ // and the hook-llm tier-1 title dedup. Catches "Modified X" / "Fixed X" restatements
14
+ // (~70% word overlap) without collapsing distinct-but-related observations.
15
+ export const DEDUP_JACCARD_THRESHOLD = 0.7;
16
+
17
+ // 0.85: high-confidence auto-merge cutoff (maintain + optimize cluster-merge, CLI/MCP
18
+ // dedup preview). Pairs at or above this merge without an LLM merge-decision call.
19
+ export const AUTO_MERGE_THRESHOLD = 0.85;
20
+
21
+ // 0.4: low bound of the LLM-review merge band [0.4, 0.85) in hook-optimize. Below it,
22
+ // a pair is too dissimilar to be worth a merge-decision call.
23
+ export const MERGE_JACCARD_LOW = 0.4;
24
+
25
+ // 0.5: MinHash estimated-Jaccard pre-filter for the maintain O(n²) scan — skip the
26
+ // exact-Jaccard compare when the cheap signature estimate is already below this.
27
+ export const MINHASH_PRE_THRESHOLD = 0.5;
28
+
29
+ // 0.7: MinHash pre-filter for the hook post-inject fuzzy-dedup pass. Stricter than
30
+ // maintain's 0.5 to keep the inline inject path cheap (it runs in the hot Stop path).
31
+ export const MINHASH_PREFILTER = 0.7;
32
+
33
+ // 0.95: strict title-Jaccard cutoff for the hook post-inject fuzzy-dedup pass — only
34
+ // collapse near-identical titles inline; anything softer waits for the maintain sweep.
35
+ export const FUZZY_DEDUP_THRESHOLD = 0.95;
@@ -14,13 +14,16 @@
14
14
 
15
15
  import { COMPRESSED_PENDING_PURGE, computeMinHash, estimateJaccardFromMinHash, jaccardSimilarity } from '../utils.mjs';
16
16
  import { rebuildVocabulary, computeVector, _resetVocabCache } from '../tfidf.mjs';
17
+ import { DEDUP_JACCARD_THRESHOLD, MINHASH_PRE_THRESHOLD as MINHASH_PRE_THRESHOLD_SRC } from './dedup-constants.mjs';
17
18
 
18
19
  export const STALE_AGE_MS = 30 * 86400000;
19
20
  export const OP_CAP = 1000;
20
21
  export const SCAN_LIMIT = 500;
21
22
  export const DUPLICATE_LIMIT = 50;
22
- export const SIMILARITY_THRESHOLD = 0.7;
23
- export const MINHASH_PRE_THRESHOLD = 0.5;
23
+ // Back-compat: maintain-core historically exported these names; both now source
24
+ // their value from the single canonical lib/dedup-constants.mjs.
25
+ export const SIMILARITY_THRESHOLD = DEDUP_JACCARD_THRESHOLD;
26
+ export const MINHASH_PRE_THRESHOLD = MINHASH_PRE_THRESHOLD_SRC;
24
27
  // A memory injected this many times with zero citations is "pinned noise" that
25
28
  // the regular decay op can't touch (decay protects injection_count>0).
26
29
  export const PINNED_INJ_THRESHOLD = 8;
@@ -13,10 +13,10 @@
13
13
 
14
14
  import { jaccardSimilarity, scrubSecrets, computeMinHash, cjkBigrams, getCurrentBranch, debugCatch } from '../utils.mjs';
15
15
  import { getVocabulary, computeVector } from '../tfidf.mjs';
16
+ import { DEDUP_JACCARD_THRESHOLD } from './dedup-constants.mjs';
16
17
 
17
18
  const DEDUP_WINDOW_MS = 5 * 60 * 1000;
18
19
  const DEDUP_RECENT_LIMIT = 50;
19
- const DEDUP_JACCARD_THRESHOLD = 0.7;
20
20
 
21
21
  /**
22
22
  * Save a new observation if it isn't a near-duplicate of one saved within the