@lh8ppl/claude-memory-kit 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
+ // Memory-health analysis — content quality, not plumbing (Task 144, D-130).
2
+ //
3
+ // Public boundary:
4
+ // analyzeMemoryHealth({projectRoot, now?, staleDays?, ...seams}) → report
5
+ // formatMemoryHealth(report) → string (the doctor's informational section)
6
+ //
7
+ // Read-only by contract: pure reads over the fact archive, the audit log,
8
+ // and the queue files. Never mutates, never logs, never affects the doctor
9
+ // exit code — the section is INFORMATIONAL ("42 facts: 3 old-and-untouched,
10
+ // 2 possible duplicates, 1 conflict pending"), making curation visible
11
+ // before Task 95 automates it. Candidates are SURFACED, never auto-acted
12
+ // (the reviewable-not-silent rule).
13
+ //
14
+ // SPEC DEVIATION (recorded in tasks.md 144): the task entry assumed "the
15
+ // audit log has every recall" — it does not. The audit log is
16
+ // MUTATIONS-only by design (glossary: "any mutating operation"); search /
17
+ // get / cite write nothing. "Stale" is therefore defined honestly as
18
+ // OLD-AND-UNTOUCHED — created > staleDays ago with no audit-trail mutation
19
+ // mentioning the fact's id since creation. True recall-tracking is parked
20
+ // for Task 95 (trigger: when curation automation needs recall frequency).
21
+ //
22
+ // Near-dup detection here is the LITERAL tier: normalized-token Jaccard
23
+ // over fact bodies (cheap, embedder-free). Task 143 adds the semantic
24
+ // tier at write time; this section is the batch view over what already
25
+ // landed.
26
+
27
+ import { existsSync, readFileSync, readdirSync } from 'node:fs';
28
+ import { join } from 'node:path';
29
+ import { parse as parseFrontmatter } from './frontmatter.mjs';
30
+ import { listConflictQueue } from './conflict-queue.mjs';
31
+ import { listReviewQueue } from './review-queue.mjs';
32
+ import { nowIso } from './audit-log.mjs';
33
+
// Milliseconds in one day (24 h × 60 min × 60 s × 1000 ms).
const DAY_MS = 86_400_000;
// Default age, in days, past which an unmutated fact is reported as stale.
const DEFAULT_STALE_DAYS = 60;
// Minimum Jaccard score for a pair to be listed as a "possible duplicate".
// Deliberately a little eager: the list is a candidate set a human reviews,
// meant to catch paraphrases that share most content words while leaving
// merely topically-adjacent facts alone.
const NEAR_DUP_JACCARD = 0.6;
// Bodies with fewer tokens than this are excluded from the pair scan —
// Jaccard over tiny token sets is too noisy to be useful.
const MIN_TOKENS_FOR_DUP = 4;
// Above this many facts the O(n²) pair scan is skipped entirely.
const PAIR_SCAN_CAP = 2000;
44
+
// Common English function words that carry no signal for duplicate
// detection; they are dropped before the Jaccard comparison.
const STOPWORDS = new Set([
  'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'to', 'of', 'in',
  'on', 'for', 'and', 'or', 'not', 'with', 'at', 'by', 'it', 'this',
  'that', 'we', 'you', 'always', 'never', 'use', 'from',
]);

/**
 * Turn free text into a normalized token set for literal near-dup scoring.
 *
 * The text is NFC-normalized, lower-cased, and split on every run of
 * characters that is not a Unicode letter, combining mark, or digit.
 * Single-character tokens and stopwords are discarded.
 *
 * The split is Unicode-aware on purpose: an ASCII-only `[^a-z0-9]` split
 * breaks accented words apart ("naïve" → "na", "ve") and produces NO tokens
 * at all for non-Latin scripts, which would silently exclude those facts
 * from duplicate detection. ASCII input tokenizes exactly as before.
 *
 * @param {unknown} text - coerced with String(), so null/undefined are safe.
 * @returns {Set<string>} the distinct content tokens.
 */
function tokenize(text) {
  return new Set(
    String(text)
      .normalize('NFC')
      .toLowerCase()
      .split(/[^\p{L}\p{M}\p{N}]+/u)
      .filter((t) => t.length > 1 && !STOPWORDS.has(t)),
  );
}
59
+
/**
 * Jaccard similarity of two token sets: |A ∩ B| / |A ∪ B|.
 * Two empty sets score 0 rather than dividing by zero.
 */
function jaccard(aSet, bSet) {
  const shared = [...aSet].filter((token) => bSet.has(token)).length;
  const unionSize = aSet.size + bSet.size - shared;
  if (unionSize === 0) return 0;
  return shared / unionSize;
}
66
+
/**
 * Load every fact file under `<projectRoot>/context/memory`.
 *
 * Only `*.md` files other than INDEX.md are considered, and a file is kept
 * only when its frontmatter carries an `id`. Files that cannot be read or
 * parsed are skipped silently — content health cannot read them, and that
 * class of problem is owned by HC-4/reindex.
 *
 * The directory listing itself is guarded too: if `context/memory` exists
 * but cannot be listed (it is a plain file, permissions, ...), the analysis
 * degrades to "no facts" instead of throwing out of a section that is
 * informational by contract. Names are sorted so the order of the report
 * does not depend on the filesystem's directory order.
 *
 * @param {string} projectRoot
 * @returns {Array<{slug: string, id: *, type: *, trust: *, createdAt: *, body: string}>}
 */
function readFacts(projectRoot) {
  const dir = join(projectRoot, 'context', 'memory');
  if (!existsSync(dir)) return [];

  let names;
  try {
    names = readdirSync(dir).sort();
  } catch {
    // not a directory / unreadable — degrade to an empty archive
    return [];
  }

  const facts = [];
  for (const name of names) {
    if (!name.endsWith('.md') || name === 'INDEX.md') continue;
    try {
      const { frontmatter, body } = parseFrontmatter(readFileSync(join(dir, name), 'utf8'));
      if (!frontmatter?.id) continue;
      facts.push({
        slug: name.replace(/\.md$/, ''),
        id: frontmatter.id,
        type: frontmatter.type ?? 'unknown',
        trust: frontmatter.trust ?? 'unknown',
        createdAt: frontmatter.created_at ?? null,
        body: String(body ?? ''),
      });
    } catch {
      // unparseable file — content health can't read it; HC-4/reindex own that class
    }
  }
  return facts;
}
90
+
/**
 * Map each fact id to the latest timestamp of a NON-creation audit entry
 * that mentions it. Any such mutation counts as "touched" (trust override,
 * merge, graduation, tombstone...).
 *
 * Creation-class actions are ignored: 'created', and 'import' (an imported
 * fact's own creation is logged as action:'import', so counting it would
 * make imported facts permanently un-stale; a later skipped-duplicate import
 * only proves the SOURCE still holds the text, not that anyone curated the
 * fact).
 *
 * Best-effort: a missing or unreadable log yields an empty map, and lines
 * that are not valid JSON (torn writes) are skipped.
 *
 * @param {string} projectRoot
 * @returns {Map<*, *>} id → latest mutation ts
 */
function readTouchedIds(projectRoot) {
  const latestById = new Map();
  try {
    const logFile = join(projectRoot, 'context', '.locks', 'audit.log');
    if (!existsSync(logFile)) return latestById;
    const rows = readFileSync(logFile, 'utf8').split(/\r?\n/);
    for (const row of rows) {
      if (row.trim() === '') continue;
      try {
        const entry = JSON.parse(row);
        const isCreation = entry.action === 'created' || entry.action === 'import';
        if (!entry.id || !entry.ts || isCreation) continue;
        const seen = latestById.get(entry.id);
        // keep the entry already recorded unless this one is strictly later
        if (seen && !(entry.ts > seen)) continue;
        latestById.set(entry.id, entry.ts);
      } catch {
        // torn line
      }
    }
  } catch {
    // unreadable log — degrade to "nothing touched"
  }
  return latestById;
}
120
+
/**
 * Read-only content-health analysis of the project tier.
 *
 * Produces fact counts (by type and trust), the old-and-untouched list,
 * literal-tier near-duplicate candidate pairs, and the pending queue sizes.
 * Queue read failures degrade to a count of zero.
 *
 * When the archive holds more than PAIR_SCAN_CAP facts the O(n²) near-dup
 * scan is skipped; the report says so via `nearDupScanSkipped: true`, so an
 * empty `nearDupPairs` is never mistaken for "no duplicates found".
 *
 * @param {object} opts
 * @param {string} opts.projectRoot
 * @param {string} [opts.now] - timestamp parsed with Date.parse; defaults to nowIso().
 * @param {number} [opts.staleDays] - age threshold for old-and-untouched.
 * @param {Function} [opts.listConflictsImpl] - test seam.
 * @param {Function} [opts.listReviewImpl] - test seam.
 * @returns {{
 *   facts: {total: number, byType: object, byTrust: object},
 *   oldUntouched: Array<{slug: string, id: *, createdAt: *}>,
 *   nearDupPairs: Array<{a: string, b: string, idA: *, idB: *, score: number}>,
 *   nearDupScanSkipped: boolean,
 *   queues: {conflicts: number, review: number},
 *   staleDays: number,
 * }}
 */
export function analyzeMemoryHealth({
  projectRoot,
  now,
  staleDays = DEFAULT_STALE_DAYS,
  listConflictsImpl,
  listReviewImpl,
} = {}) {
  const nowMs = Date.parse(now ?? nowIso());
  const facts = readFacts(projectRoot);

  const byType = {};
  const byTrust = {};
  for (const f of facts) {
    byType[f.type] = (byType[f.type] ?? 0) + 1;
    byTrust[f.trust] = (byTrust[f.trust] ?? 0) + 1;
  }

  // Old-and-untouched: created > staleDays ago, no non-creation audit entry.
  const touched = readTouchedIds(projectRoot);
  const oldUntouched = facts.filter((f) => {
    if (!f.createdAt) return false;
    const ageMs = nowMs - Date.parse(f.createdAt);
    // Written as !(a > b) so an unparseable date (NaN age) is never stale.
    if (!(ageMs > staleDays * DAY_MS)) return false;
    return !touched.has(f.id);
  });

  // Near-dup candidate pairs (literal tier). O(n²) pairwise scan, guarded
  // above PAIR_SCAN_CAP so a pathological archive can't stall the doctor.
  // Facts with too few tokens are dropped once up front instead of being
  // re-checked for every pair; the resulting pairs and their order are the
  // same as checking inside the loop.
  const nearDupScanSkipped = facts.length > PAIR_SCAN_CAP;
  const tokenized = nearDupScanSkipped
    ? []
    : facts
        .map((f) => ({ f, tokens: tokenize(f.body) }))
        .filter(({ tokens }) => tokens.size >= MIN_TOKENS_FOR_DUP);
  const nearDupPairs = [];
  for (let i = 0; i < tokenized.length; i++) {
    const { f: fa, tokens: ta } = tokenized[i];
    for (let j = i + 1; j < tokenized.length; j++) {
      const { f: fb, tokens: tb } = tokenized[j];
      const score = jaccard(ta, tb);
      if (score >= NEAR_DUP_JACCARD) {
        nearDupPairs.push({ a: fa.slug, b: fb.slug, idA: fa.id, idB: fb.id, score: Number(score.toFixed(2)) });
      }
    }
  }

  // The detected-contradiction surface = the pending queues.
  let conflicts = 0;
  let review = 0;
  try {
    conflicts = (listConflictsImpl ?? listConflictQueue)({ tier: 'P', projectRoot }).length;
  } catch {
    // queue unreadable — degrade to zero
  }
  try {
    review = (listReviewImpl ?? listReviewQueue)({ tier: 'P', projectRoot }).length;
  } catch {
    // queue unreadable — degrade to zero
  }

  return {
    facts: { total: facts.length, byType, byTrust },
    oldUntouched: oldUntouched.map((f) => ({ slug: f.slug, id: f.id, createdAt: f.createdAt })),
    nearDupPairs,
    nearDupScanSkipped,
    queues: { conflicts, review },
    staleDays,
  };
}
196
+
/**
 * Render the doctor's informational memory-health section as text.
 *
 * The headline is always present; each concern (old-and-untouched facts,
 * possible duplicate pairs, pending queue items) adds a line only when its
 * count is above zero, so a healthy memory produces a single quiet line.
 * Lists show at most three entries followed by an ellipsis.
 *
 * @param {object} report - the object returned by analyzeMemoryHealth.
 * @returns {string} newline-joined section text.
 */
export function formatMemoryHealth(report) {
  const { facts, oldUntouched, nearDupPairs, queues, staleDays } = report;

  // First three items rendered and joined, with a trailing "…" when truncated.
  const preview = (items, render, separator) => {
    const shown = items.slice(0, 3).map(render).join(separator);
    return items.length > 3 ? `${shown}${separator}…` : shown;
  };

  const trustSummary = Object.entries(facts.byTrust)
    .map(([level, count]) => `${count} ${level}`)
    .join(' · ');
  let headline = `Memory health (informational): ${facts.total} fact(s)`;
  if (trustSummary) headline += ` — trust: ${trustSummary}`;
  const out = [headline];

  if (oldUntouched.length > 0) {
    const slugs = preview(oldUntouched, (fact) => fact.slug, ', ');
    out.push(
      ` ${oldUntouched.length} old-and-untouched (> ${staleDays}d, no mutation since creation) — worth a skim: ${slugs}`,
    );
  }

  if (nearDupPairs.length > 0) {
    const pairs = preview(nearDupPairs, (pair) => `${pair.a} ↔ ${pair.b}`, '; ');
    out.push(` ${nearDupPairs.length} possible duplicate pair(s): ${pairs}`);
  }

  const pending = [
    [queues.conflicts, 'conflict(s)'],
    [queues.review, 'review item(s)'],
  ]
    .filter(([count]) => count > 0)
    .map(([count, label]) => `${count} ${label}`);
  if (pending.length > 0) out.push(` ${pending.join(' + ')} pending — cmk queue`);

  return out.join('\n');
}
@@ -43,8 +43,8 @@ import {
43
43
  mkdirSync,
44
44
  } from 'node:fs';
45
45
  import { join, dirname } from 'node:path';
46
- import { createHash } from 'node:crypto';
47
46
  import { generateId } from '@lh8ppl/cmk-canonicalize';
47
+ import { hashContent } from './content-hash.mjs';
48
48
  import {
49
49
  resolveTierRoot,
50
50
  resolveScratchpadPath,
@@ -58,6 +58,7 @@ import { parseBulletProvenance, isProvenanceCommentLine } from './provenance.mjs
58
58
  import { checkPoisonGuard, logPoisonGuardRejection } from './poison-guard.mjs';
59
59
  import { detectConflicts, writeConflictEntry } from './conflict-queue.mjs';
60
60
  import { sanitizeHomePaths } from './sanitize.mjs';
61
+ import { sanitizePrivacyTags } from './privacy.mjs';
61
62
 
62
63
  const VALID_ACTIONS = new Set(['add', 'replace', 'remove']);
63
64
 
@@ -253,15 +254,21 @@ function doAdd(opts) {
253
254
  if (errors.length > 0) {
254
255
  return errorResult({ category: ERROR_CATEGORIES.SCHEMA, errors });
255
256
  }
256
- // Privacy (write-path fix #1): abstract home-dir paths to `~` for
257
- // committed/shared tiers (P/U) BEFORE the bullet is screened, conflict-
258
- // checked, dedup-keyed, and written so a captured fact never ships the
259
- // local username and stays portable. Local tier (L) keeps machine paths
260
- // verbatim (its purpose). Everything downstream uses `addOpts`.
257
+ // Privacy: strip <private>…</private> FIRST, on EVERY tier (cut-gate
258
+ // v0.3.1 finding: the tag was honored only by the UserPromptSubmit hook,
259
+ // so `cmk remember`/`mk_remember` wrote the secret verbatim). Runs before
260
+ // home-path sanitization, Poison_Guard, conflict-check, dedup, and the
261
+ // write so the redacted text is what everything downstream sees, on
262
+ // committed AND local tiers (private content must not reach context.local
263
+ // either). The same single-safe-path philosophy as Poison_Guard.
264
+ const privacyStripped = sanitizePrivacyTags(opts.text);
265
+ // Then abstract home-dir paths to `~` for committed/shared tiers (P/U) so a
266
+ // captured fact never ships the local username + stays portable; local
267
+ // tier (L) keeps machine paths verbatim (its purpose).
261
268
  const sanitizedText =
262
269
  opts.tier === 'P' || opts.tier === 'U'
263
- ? sanitizeHomePaths(opts.text)
264
- : opts.text;
270
+ ? sanitizeHomePaths(privacyStripped)
271
+ : privacyStripped;
265
272
  const addOpts =
266
273
  sanitizedText === opts.text ? opts : { ...opts, text: sanitizedText };
267
274
 
@@ -293,6 +300,11 @@ function doAdd(opts) {
293
300
  newTrust,
294
301
  scratchpadPath,
295
302
  sectionTitle: opts.section,
303
+ // Task 143 (D-130): the async adapters may inject a semantic similarity
304
+ // fn (prepareSemanticSimilarity) + its threshold; absent → the literal
305
+ // tokenJaccard default (graceful degradation).
306
+ similarityFn: opts.similarityFn,
307
+ similarityThreshold: opts.similarityThreshold,
296
308
  });
297
309
  // Defensive guard against a future detectConflicts schema-error
298
310
  // path. Today the upstream validator catches bad opts before this
@@ -304,7 +316,17 @@ function doAdd(opts) {
304
316
  if (conflict.action === 'error') {
305
317
  return conflict;
306
318
  }
307
- if (conflict.conflict === true && conflict.action === 'queue') {
319
+ // Task 143 (D-130): near-dup proposals. The pre-143 contract queues only
320
+ // when new.trust < existing.trust — an EQUAL-trust paraphrase ("use uv not
321
+ // pip" twice) takes the 'supersede' action and APPENDS, which is exactly
322
+ // the memory-rot case. When the caller opts in (queueNearDups, set by the
323
+ // semantic-equipped adapters), ANY above-threshold match routes to the
324
+ // conflict queue as a reviewable proposal — never auto-dropped, never
325
+ // silently duplicated. Default behavior unchanged.
326
+ const routeToQueue =
327
+ conflict.conflict === true &&
328
+ (conflict.action === 'queue' || (opts.queueNearDups === true && conflict.action === 'supersede'));
329
+ if (routeToQueue) {
308
330
  // Compute the proposed ID using the same canonical-id derivation
309
331
  // appendScratchpadBullet would have used, then route to the queue.
310
332
  // (Task 25b fix: generateId is positional `(tier, text)`, not
@@ -333,7 +355,7 @@ function appendBulletGuarded(opts) {
333
355
  // Caller MUST have run Poison_Guard already. This is the inner
334
356
  // write step — delegates to the existing scratchpad writer which
335
357
  // handles dedup + cap + consolidation + audit + ID derivation.
336
- const sha1 = createHash('sha1').update(opts.text, 'utf8').digest('hex');
358
+ const sha1 = hashContent(opts.text);
337
359
  const ts = opts.now ?? nowIso();
338
360
  return appendScratchpadBullet({
339
361
  tier: opts.tier,
@@ -0,0 +1,142 @@
1
+ // Native-binding health probes (Task 141a, D-129/D-133).
2
+ //
3
+ // npm 12 (~July 2026) flips `allowScripts` OFF by default: dependency
4
+ // install scripts — including the IMPLICIT node-gyp build a binding.gyp
5
+ // package gets — silently don't run on a fresh `npm install -g`. The kit's
6
+ // two native deps are exactly that shape:
7
+ // - better-sqlite3 (core: the search index) — kit-level remedy
8
+ // - onnxruntime-node (inside the optional @huggingface/transformers
9
+ // embedder) — semantic-level remedy
10
+ //
11
+ // Without the binding the package LOOKS installed but `cmk search`/reindex
12
+ // crash at first use. These probes detect that state cheaply so:
13
+ // - `cmk install` can ask the user and fix INLINE (the primary UX — the
14
+ // user's 2026-06-12 steer: ask at install, not a secondary command);
15
+ // - `cmk doctor` HC-8 stays as the ongoing backstop;
16
+ // - the --with-semantic runner passes the allow flag itself.
17
+ //
18
+ // Remediation verified against the primary sources (2026-06-12): GitHub
19
+ // changelog "Upcoming breaking changes for npm v12" + npm v11 config docs —
20
+ // the `allow-scripts` CONFIG (comma-separated package list) is the
21
+ // documented path "for one-off and global contexts: npm exec, npx, and
22
+ // npm install -g"; the project-level `npm approve-scripts` allowlist in
23
+ // package.json does not apply to `-g` installs. Warnings (and the config
24
+ // key) exist from npm 11.16.0.
25
+
26
+ import { createRequire } from 'node:module';
27
+ import { spawnSync } from 'node:child_process';
28
+
// Copy-pasteable remediation commands returned by the probes below: each
// re-runs the global install with the one native package's scripts allowed.
export const KIT_BINDING_REMEDY = 'npm install -g @lh8ppl/claude-memory-kit --allow-scripts=better-sqlite3';
export const EMBEDDER_BINDING_REMEDY = 'npm install -g @huggingface/transformers --allow-scripts=onnxruntime-node';

// Lowest npm version [major, minor, patch] that knows the `allow-scripts`
// config key (it ships, as warnings + config, in 11.16.0).
const ALLOW_SCRIPTS_MIN = [11, 16, 0];

// CommonJS-style require anchored at this module, used for the synchronous
// better-sqlite3 probe.
const requireFromHere = createRequire(import.meta.url);
38
+
/**
 * Probe the kit's own native dependency (better-sqlite3).
 *
 * Requiring the package is not a sufficient check: better-sqlite3 v12 loads
 * its .node binding lazily inside `new Database(...)`, so on a
 * script-blocked install the require succeeds and only instantiation throws
 * ("Could not locate the bindings file"). The default probe therefore opens
 * and immediately closes an in-memory database. It is synchronous on purpose
 * so install and doctor can call it without changing their flow.
 *
 * @param {object} [opts] - { requireImpl } test seam; a throw means "broken".
 * @returns {{ok: true} | {ok: false, reason: string, remedy: string}}
 */
export function checkKitBinding({ requireImpl } = {}) {
  const openAndCloseMemoryDb = () => {
    const Database = requireFromHere('better-sqlite3');
    new Database(':memory:').close();
  };
  const probe = requireImpl ?? openAndCloseMemoryDb;

  try {
    probe();
  } catch (err) {
    return {
      ok: false,
      reason: err?.message ?? String(err),
      remedy: KIT_BINDING_REMEDY,
    };
  }
  return { ok: true };
}
72
+
/**
 * Probe the optional semantic embedder (@huggingface/transformers).
 *
 * Distinguishes NOT-INSTALLED (the normal opt-out state — `installed: false`,
 * reason 'not-installed') from INSTALLED-BUT-BROKEN (`installed: true`, with
 * the underlying error message as the reason).
 *
 * The not-installed decision looks only at the part of the error message
 * that names the MISSING module. Node's resolver message has the form
 * "Cannot find package 'X' imported from <importer path>"; when a dependency
 * of the embedder is the thing that is missing, the importer path itself
 * contains "@huggingface/transformers", and a whole-message substring test
 * would misreport an installed-but-broken embedder as not installed.
 *
 * Honest limitation: this is an IMPORT-level probe. A binding that loads
 * lazily can pass the import and only fail later, at pipeline construction.
 *
 * @param {object} [opts] - { importImpl } test seam (rejecting = broken/missing).
 * @returns {Promise<{ok: true} | {ok: false, installed: boolean, reason: string, remedy: string}>}
 */
export async function checkEmbedderBinding({ importImpl } = {}) {
  const imp = importImpl ?? (() => import('@huggingface/transformers'));
  try {
    await imp();
    return { ok: true };
  } catch (err) {
    // Coerced so `reason` is always a string, whatever was thrown.
    const message = String(err?.message ?? err);
    // Everything before " imported from " names what could not be found;
    // messages without that phrase are inspected whole, as before.
    const [missingPart] = message.split(' imported from ');
    const notInstalled =
      err?.code === 'ERR_MODULE_NOT_FOUND' && missingPart.includes('@huggingface/transformers');
    return {
      ok: false,
      installed: !notInstalled,
      reason: notInstalled ? 'not-installed' : message,
      remedy: EMBEDDER_BINDING_REMEDY,
    };
  }
}
110
+
/**
 * Report whether the host npm understands the `allow-scripts` config
 * (version ≥ ALLOW_SCRIPTS_MIN, i.e. 11.16.0).
 *
 * Conservative on any probe failure — a non-zero exit, empty output, an
 * unparseable version, or a thrown error all report "unsupported" so callers
 * never emit a flag the host npm would reject as unknown.
 *
 * @param {object} [opts] - { spawnSyncImpl } test seam.
 * @returns {{supported: boolean, version: string | null}}
 */
export function npmSupportsAllowScripts({ spawnSyncImpl = spawnSync } = {}) {
  const answer = (supported, version) => ({ supported, version });
  try {
    // Constant command under shell:true — npm is npm.cmd on Windows and the
    // shell resolves it on every platform.
    const result = spawnSyncImpl('npm --version', {
      encoding: 'utf8',
      shell: true,
      timeout: 30_000,
    });
    if (result.status !== 0 || !result.stdout) return answer(false, null);

    const version = String(result.stdout).trim();
    const parts = version.split('.').map((piece) => Number.parseInt(piece, 10));
    const malformed = parts.length < 3 || parts.some(Number.isNaN);
    if (malformed) return answer(false, version);

    // Compare major/minor/patch against the floor: the first component that
    // differs decides; none differing means the version equals the floor.
    const firstDiff = ALLOW_SCRIPTS_MIN.findIndex((floor, i) => parts[i] !== floor);
    const supported = firstDiff === -1 || parts[firstDiff] > ALLOW_SCRIPTS_MIN[firstDiff];
    return answer(supported, version);
  } catch {
    return answer(false, null);
  }
}
@@ -93,6 +93,61 @@ const SECRET_PATTERNS = [
93
93
  category: 'secret',
94
94
  re: /\bghp_[A-Za-z0-9]{36}/,
95
95
  },
96
+ // Task 134: the other GitHub token classes — OAuth (gho_), user-to-server
97
+ // (ghu_), server-to-server (ghs_), refresh (ghr_). Same ghp_ shape (prefix
98
+ // + 36 alnum); the 36-char floor is what keeps "ghost"/"ghs config" prose
99
+ // out (a real English word can't reach prefix+36 alphanumerics).
100
+ {
101
+ id: 'secret_github_token',
102
+ category: 'secret',
103
+ re: /\bgh[ousr]_[A-Za-z0-9]{36}/,
104
+ },
105
+ // GitHub fine-grained PAT: github_pat_ + 82 chars of [A-Za-z0-9_]
106
+ // (GitHub's documented detection regex; the token is the 11-char prefix +
107
+ // 82-char body = 93 total. The body's internal underscore placement is not
108
+ // contractually fixed, so match the whole-body class rather than a
109
+ // prefix_body split — verified against GitHub Docs + GitGuardian's
110
+ // detector, 2026-06-13).
111
+ {
112
+ id: 'secret_github_fine_grained_pat',
113
+ category: 'secret',
114
+ re: /\bgithub_pat_[A-Za-z0-9_]{82}/,
115
+ },
116
+ // Stripe secret keys: sk_live_ / rk_live_ (restricted) + 24+ alnum. The
117
+ // _live_ infix + the floor keep benign "sk_" / "Stripe" prose out.
118
+ {
119
+ id: 'secret_stripe_key',
120
+ category: 'secret',
121
+ re: /\b[sr]k_live_[A-Za-z0-9]{24,}/,
122
+ },
123
+ // Google API key: AIza + 35 of [A-Za-z0-9_-] (39 total — the documented
124
+ // length). The 4-char prefix alone is harmless prose ("AIza" mentioned);
125
+ // the 35-char body is the gate.
126
+ {
127
+ id: 'secret_google_api_key',
128
+ category: 'secret',
129
+ re: /\bAIza[A-Za-z0-9_-]{35}\b/,
130
+ },
131
+ // GitLab personal access token: glpat- + 20+ alnum/dash/underscore.
132
+ {
133
+ id: 'secret_gitlab_pat',
134
+ category: 'secret',
135
+ re: /\bglpat-[A-Za-z0-9_-]{20,}/,
136
+ },
137
+ // npm access token: npm_ + 36 alnum (the modern granular/automation shape).
138
+ {
139
+ id: 'secret_npm_token',
140
+ category: 'secret',
141
+ re: /\bnpm_[A-Za-z0-9]{36}/,
142
+ },
143
+ // Hugging Face access token: hf_ + 34+ alnum (kit-relevant — the semantic
144
+ // install pulls models from HF; a leaked hf_ token in a captured fact is
145
+ // a real risk). The 34-char floor keeps "hf"/"half" prose out.
146
+ {
147
+ id: 'secret_huggingface_token',
148
+ category: 'secret',
149
+ re: /\bhf_[A-Za-z0-9]{34,}/,
150
+ },
96
151
  // OpenAI / Anthropic style keys. sk- prefix + optional ant-/proj-
97
152
  // qualifier + ≥40 chars of alphanumeric/dash/underscore.
98
153
  {
@@ -17,7 +17,8 @@
17
17
  // to keep (design §10.1), not the core's.
18
18
 
19
19
  import { resolve as resolvePath } from 'node:path';
20
- import { createHash } from 'node:crypto';
20
+ import { hashContent } from './content-hash.mjs';
21
+ import { sanitizePrivacyTags } from './privacy.mjs';
21
22
  import { writeFact as defaultWriteFact } from './write-fact.mjs';
22
23
  import { buildRichFactBody, slugifyFact } from './rich-fact.mjs';
23
24
 
@@ -53,8 +54,17 @@ export function rememberRich(text, options = {}, deps = {}) {
53
54
  const projectRoot = deps.projectRoot ?? resolvePath(process.cwd());
54
55
  const write = deps.writeFact ?? defaultWriteFact;
55
56
 
56
- const headline = String(text).trim();
57
- const title = (options.title && String(options.title).trim()) || headline.split('\n')[0].slice(0, 80);
57
+ // Strip <private>…</private> BEFORE deriving/slicing the title (cut-gate
58
+ // v0.3.1 clean-build finding). writeFact also strips, but it receives a title
59
+ // already sliced to 80 chars — and an 80-char cut that lands inside a private
60
+ // span SEVERS the closing tag, so writeFact's `<private>…</private>` regex no
61
+ // longer matches and the secret survives in the frontmatter title + INDEX.md.
62
+ // Stripping the intact text here means the slice only ever sees redacted text.
63
+ const headline = sanitizePrivacyTags(String(text).trim());
64
+ const safeTitle = options.title
65
+ ? sanitizePrivacyTags(String(options.title).trim())
66
+ : '';
67
+ const title = safeTitle || headline.split('\n')[0].slice(0, 80);
58
68
  const body = buildRichFactBody({ text: headline, why: options.why, how: options.how });
59
69
  // `links` arrives as an ARRAY from the MCP tool (z.array) and as a
60
70
  // comma-STRING from the CLI flag — accept both. The old `String(links)` path
@@ -76,10 +86,10 @@ export function rememberRich(text, options = {}, deps = {}) {
76
86
  trust: options.trust ?? 'high',
77
87
  sourceFile: 'user-explicit',
78
88
  sourceLine: 1,
79
- // Content fingerprint for provenance/dedup — NOT a security context. Matches
80
- // the kit's sha1-of-content convention (memory-write.mjs, index-rebuild.mjs);
81
- // writeFact dedups by content-addressed id, this is the source_sha1 field. // NOSONAR
82
- sourceSha1: createHash('sha1').update(body).digest('hex'), // NOSONAR
89
+ // Content fingerprint for provenance/dedup — NOT a security context.
90
+ // Routes through the shared hashContent (SHA-256, D-149); writeFact dedups
91
+ // by content-addressed id, this is the source_sha1 metadata field.
92
+ sourceSha1: hashContent(body),
83
93
  related,
84
94
  projectRoot,
85
95
  });
@@ -87,5 +97,40 @@ export function rememberRich(text, options = {}, deps = {}) {
87
97
 
/**
 * The title rememberRich() will derive for `text`/`options` (for caller messages).
 *
 * An explicit `options.title` wins when it is still non-empty after trimming
 * and privacy-tag stripping; otherwise the title is the first line of the
 * stripped text, cut to 80 characters. Stripping happens BEFORE the cut so
 * the preview a caller echoes to the console never carries private content.
 */
export function richFactTitle(text, options = {}) {
  if (options.title) {
    const explicit = sanitizePrivacyTags(String(options.title).trim());
    if (explicit) return explicit;
  }
  const [firstLine] = sanitizePrivacyTags(String(text).trim()).split('\n');
  return firstLine.slice(0, 80);
}
105
+
/**
 * Task 143 (D-130): the write-time near-dup guard for the EXPLICIT terse
 * capture paths (cmk remember / mk_remember).
 *
 * Resolves to extra memoryWrite options —
 * `{similarityFn, similarityThreshold, queueNearDups: true}` — when the
 * project's resolved search mode is not 'keyword' AND preparing the semantic
 * similarity function succeeds; resolves to `{}` otherwise.
 *
 * Best-effort by contract: ANY failure (the backend module cannot be loaded,
 * the mode resolver or the preparation step throws) resolves to `{}` so the
 * capture proceeds on the literal pipeline — losing a capture to a
 * similarity upgrade would invert the kit's priorities.
 *
 * @param {object} opts - { projectRoot, text, prepareImpl?, resolveModeImpl? } (seams for tests).
 * @returns {Promise<object>} extra memoryWrite options (possibly empty).
 */
export async function prepareNearDupGuard({ projectRoot, text, prepareImpl, resolveModeImpl } = {}) {
  try {
    const backend = await import('./semantic-backend.mjs');

    const resolveMode = resolveModeImpl ?? backend.resolveDefaultSearchMode;
    if (resolveMode({ projectRoot }) === 'keyword') return {};

    const prepare = prepareImpl ?? backend.prepareSemanticSimilarity;
    const sem = await prepare({ projectRoot, newText: text });
    if (!sem.ok) return {};

    // Use the backend's own near-dup threshold rather than a generic default.
    return {
      similarityFn: sem.similarityFn,
      similarityThreshold: backend.SEMANTIC_NEARDUP_THRESHOLD,
      queueNearDups: true,
    };
  } catch {
    return {};
  }
}
package/src/repair.mjs CHANGED
@@ -159,12 +159,25 @@ function repairLocks({ projectRoot, userDir, staleLockMs, now, ts }) {
159
159
  * @param {Function} [opts.reindexer] test-injected reindex function; defaults to import('./index-rebuild.mjs').reindexFull
160
160
  */
161
161
  async function repairIndex({ projectRoot, userDir, reindexer }) {
162
+ // Production reindexFull requires a `db` (it calls db.exec) — repairIndex
163
+ // must open + own + close it, exactly like runReindex does. The earlier
164
+ // code called reindexFull({projectRoot,userDir}) with NO db, so
165
+ // `cmk repair --index`/`--all` threw "undefined (reading 'exec')" since
166
+ // Task 49 (cut-gate v0.3.1 finding — every test mocked the reindexer, so
167
+ // the real call-shape was never exercised). An injected reindexer (tests)
168
+ // takes whatever args it wants; we only open a db for the real one.
162
169
  let reindexFn = reindexer;
170
+ let db = null;
163
171
  if (!reindexFn) {
164
- const mod = await import('./index-rebuild.mjs');
165
- reindexFn = mod.reindexFull;
172
+ const [{ reindexFull }, { openIndexDb }] = await Promise.all([
173
+ import('./index-rebuild.mjs'),
174
+ import('./index-db.mjs'),
175
+ ]);
176
+ reindexFn = reindexFull;
177
+ db = openIndexDb({ projectRoot });
166
178
  }
167
179
  if (typeof reindexFn !== 'function') {
180
+ if (db) db.close();
168
181
  return {
169
182
  kind: 'index',
170
183
  changed: false,
@@ -172,7 +185,9 @@ async function repairIndex({ projectRoot, userDir, reindexer }) {
172
185
  };
173
186
  }
174
187
  try {
175
- const r = await reindexFn({ projectRoot, userDir });
188
+ const r = db
189
+ ? await reindexFn({ projectRoot, userDir, db })
190
+ : await reindexFn({ projectRoot, userDir });
176
191
  return {
177
192
  kind: 'index',
178
193
  changed: true,
@@ -184,6 +199,8 @@ async function repairIndex({ projectRoot, userDir, reindexer }) {
184
199
  changed: false,
185
200
  error: `reindex failed: ${err?.message ?? err}`,
186
201
  };
202
+ } finally {
203
+ if (db) db.close();
187
204
  }
188
205
  }
189
206