@ijfw/memory-server 1.5.5 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/bin/ijfw-dashboard +20 -1
  2. package/package.json +4 -3
  3. package/src/audit-roster.js +89 -12
  4. package/src/brain/tiered-llm.js +57 -7
  5. package/src/cross-orchestrator-cli.js +344 -4
  6. package/src/cross-project-search.js +39 -1
  7. package/src/dashboard-server.js +7 -1
  8. package/src/dream/runner.mjs +560 -8
  9. package/src/handlers/brain-handler.js +101 -1
  10. package/src/importers/discover.js +1 -1
  11. package/src/memory/bench-metrics.js +289 -0
  12. package/src/memory/benchmark.js +1 -1
  13. package/src/memory/search.js +53 -1
  14. package/src/orchestrator/plan-checker.js +1 -1
  15. package/src/profile/audit.js +671 -0
  16. package/src/profile/capture.js +871 -0
  17. package/src/profile/derive-dialectic.js +242 -0
  18. package/src/profile/derive-heuristic.js +733 -0
  19. package/src/profile/derive.js +156 -0
  20. package/src/profile/egress.js +306 -0
  21. package/src/profile/eval/build-real-probes.mjs +197 -0
  22. package/src/profile/eval/corpus-from-reddit.mjs +166 -0
  23. package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
  24. package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
  25. package/src/profile/eval/gate-b-behavior.mjs +420 -0
  26. package/src/profile/eval/gate-b-decision-run.mjs +171 -0
  27. package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
  28. package/src/profile/eval/gate-b-run.mjs +417 -0
  29. package/src/profile/eval/gate-b-run.test.mjs +204 -0
  30. package/src/profile/eval/gate-c-capture.mjs +323 -0
  31. package/src/profile/eval/harness.mjs +551 -0
  32. package/src/profile/eval/instrument-validation.mjs +248 -0
  33. package/src/profile/eval/instrument-validation.test.mjs +125 -0
  34. package/src/profile/eval/multi-subject-harness.mjs +106 -0
  35. package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
  36. package/src/profile/eval/personas.test.mjs +83 -0
  37. package/src/profile/eval/plumbing.test.mjs +69 -0
  38. package/src/profile/eval/prereg.mjs +130 -0
  39. package/src/profile/eval/prereg.test.mjs +78 -0
  40. package/src/profile/eval/real-corpus.test.mjs +103 -0
  41. package/src/profile/eval/real-personas.mjs +109 -0
  42. package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
  43. package/src/profile/eval/run-real-corpus.mjs +358 -0
  44. package/src/profile/eval/slug-quality.mjs +464 -0
  45. package/src/profile/eval/stylometry-features.js +85 -0
  46. package/src/profile/eval/stylometry-reference.js +16 -0
  47. package/src/profile/eval/stylometry.js +224 -0
  48. package/src/profile/eval/stylometry.test.mjs +103 -0
  49. package/src/profile/eval/synthetic-personas.js +91 -0
  50. package/src/profile/eval/verifier-features.mjs +170 -0
  51. package/src/profile/eval/verifier-logreg.mjs +74 -0
  52. package/src/profile/eval/verifier-pair.mjs +122 -0
  53. package/src/profile/eval/verifier-reference.mjs +68 -0
  54. package/src/profile/eval/verifier-scorer.mjs +30 -0
  55. package/src/profile/eval/wrong-target-control.mjs +168 -0
  56. package/src/profile/eval/wrong-target-control.test.mjs +124 -0
  57. package/src/profile/exemplar-capture.js +232 -0
  58. package/src/profile/exemplar-retrieve.js +138 -0
  59. package/src/profile/exemplar-store.js +314 -0
  60. package/src/profile/lock.js +64 -0
  61. package/src/profile/merge.js +624 -0
  62. package/src/profile/path-policy.js +213 -0
  63. package/src/profile/precision-stamp.mjs +151 -0
  64. package/src/profile/render-brief.js +717 -0
  65. package/src/profile/schema.js +244 -0
  66. package/src/profile/sensitivity.js +249 -0
  67. package/src/profile/serve.js +345 -0
  68. package/src/profile/store.js +261 -0
  69. package/src/profile/telemetry.js +289 -0
  70. package/src/recovery/checkpoint.js +7 -1
  71. package/src/server.js +185 -14
  72. package/src/.registry-meta-key.pem +0 -3
@@ -25,6 +25,9 @@ import { compileWikiPage, slugify } from '../brain/wiki-compiler.js';
25
25
  import { resolveCitations } from '../brain/citation-resolver.js';
26
26
  import { exportPageBundle, writeShareReadme } from '../brain/export.js';
27
27
  import { validateSafeRepoPath } from '../brain/path-guard.js';
28
+ import { profileGet, profileBrief } from '../profile/serve.js';
29
+ import { forgetAndWrite, listInferences } from '../profile/audit.js';
30
+ import { readProfile } from '../profile/store.js';
28
31
 
29
32
  const WIKI_TYPES = ['concepts', 'entities', 'decisions', 'milestones'];
30
33
 
@@ -336,7 +339,99 @@ function verbConflictResolve(db, repoRoot, args) {
336
339
  return { ok: true, resolved: true, winnerId: args.winnerId, supersededIds, validTo: chosenValidTo };
337
340
  }
338
341
 
339
- export async function handleIjfwBrain({ verb, args = {}, db, repoRoot, env: _env, opts = {} } = {}) {
342
+ // ---------------------------------------------------------------------------
343
+ // PHASE P4 — cross-system profile bus serving verbs (folded into ijfw_brain so
344
+ // the MCP tool cap stays at 13/13 — NO new top-level tool). Both verbs are
345
+ // ZERO-LLM by construction: they route into src/profile/serve.js, which imports
346
+ // only the store/render/egress/sensitivity modules (the P4.5 import-graph guard
347
+ // proves the serve path never reaches the LLM tier). `env` threads the host's
348
+ // env so per-host opt-in (IJFW_PROFILE_SHARE_SENSITIVE), redaction
349
+ // (IJFW_PROFILE_REDACT) and the kill-switch (IJFW_PROFILE_KILL) are honored.
350
+ //
351
+ // args (both verbs, all optional):
352
+ // tokenBudget number — cap brief output (brief only)
353
+ // context { overlay?, host?, session? } — overlay key + egress meta
354
+ // shareSensitive boolean — programmatic per-host opt-in (else env flag)
355
+ // ---------------------------------------------------------------------------
356
+
357
/**
 * Normalise wire args into the options object handed to the profile serve
 * functions. Fields with the wrong type are dropped rather than coerced.
 *
 * @param {object|null|undefined} args raw verb args
 * @param {object|undefined} env host environment; falls back to process.env
 * @returns {{tokenBudget: (number|undefined), context: object,
 *            shareSensitive: (boolean|undefined), env: object}}
 */
function profileServeOpts(args, env) {
  const { tokenBudget, context, shareSensitive } = args || {};
  const opts = {
    tokenBudget: undefined,
    context: {},
    shareSensitive: undefined,
    env: env || process.env,
  };
  // Only finite numbers count as a budget (rejects NaN, Infinity, strings).
  if (Number.isFinite(tokenBudget)) opts.tokenBudget = tokenBudget;
  // Any truthy object is forwarded as-is; everything else becomes {}.
  if (context && typeof context === 'object') opts.context = context;
  // Strict boolean only, so "true"/1 do not silently opt in.
  if (typeof shareSensitive === 'boolean') opts.shareSensitive = shareSensitive;
  return opts;
}
365
+
366
/**
 * `profile.get` verb: normalise the wire args and delegate to profileGet.
 *
 * @param {object} args raw verb args
 * @param {object} env host environment forwarded to the serve layer
 * @returns whatever profileGet returns
 */
function verbProfileGet(args, env) {
  const serveOpts = profileServeOpts(args, env);
  return profileGet(serveOpts);
}
369
+
370
/**
 * `profile.brief` verb: normalise the wire args and delegate to profileBrief.
 *
 * @param {object} args raw verb args (tokenBudget is forwarded when finite)
 * @param {object} env host environment forwarded to the serve layer
 * @returns whatever profileBrief returns
 */
function verbProfileBrief(args, env) {
  const serveOpts = profileServeOpts(args, env);
  return profileBrief(serveOpts);
}
373
+
374
+ // ---------------------------------------------------------------------------
375
+ // PHASE P4 — right-to-be-forgotten + audit INVOCATION SURFACE (audit M2). The
376
+ // audit module's forgetAndWrite / listInferences were previously reachable only
377
+ // from tests; these verbs give the user a real way to SEE what was inferred and
378
+ // to DELETE it (the egress purge rides along inside forgetAndWrite). Folded into
379
+ // ijfw_brain — NO new top-level tool, cap stays 13/13.
380
+ // ---------------------------------------------------------------------------
381
+
382
/**
 * Build a forget pattern from wire args.
 *
 * Accepted shapes, checked in this order:
 *   1. `args.id`      non-empty string -> returned verbatim as the pattern
 *   2. `args.pattern` non-empty string -> returned verbatim as the pattern
 *   3. `args.regex`   non-empty string (+ optional `args.flags`) -> compiled
 *      to a RegExp here.
 *
 * Flag handling: only `i`, `m`, `s`, `u` are kept and duplicates are
 * collapsed. `g` and `y` are dropped on purpose: they make a RegExp stateful
 * (`lastIndex` advances across `.test()`/`.exec()` calls), so a pattern tested
 * against several inference ids in a row could skip matches and leave data
 * behind. Neither flag changes WHETHER a string matches. Collapsing duplicates
 * stops inputs like "ii" from being rejected as an invalid regex.
 *
 * NOTE(review): per the original note, forgetAndWrite validates the compiled
 * pattern (ReDoS guard) before taking the profile lock — that code is outside
 * this block; confirm against the audit module.
 *
 * @param {object|null|undefined} args wire args
 * @returns {{ok: true, pattern: (string|RegExp)} |
 *           {ok: false, error: string, message?: string}}
 */
function buildForgetPattern(args) {
  if (args && typeof args.id === 'string' && args.id) return { ok: true, pattern: args.id };
  if (args && typeof args.pattern === 'string' && args.pattern) return { ok: true, pattern: args.pattern };
  if (args && args.regex && typeof args.regex === 'string') {
    const flags = typeof args.flags === 'string'
      ? [...new Set(args.flags.replace(/[^imsu]/g, ''))].join('')
      : '';
    try {
      return { ok: true, pattern: new RegExp(args.regex, flags) };
    } catch (e) {
      // Malformed source (e.g. an unbalanced group) is reported, not thrown.
      return { ok: false, error: 'invalid-regex', message: e.message };
    }
  }
  return { ok: false, error: 'missing-pattern' };
}
403
+
404
/**
 * `profile.forget` verb: build the pattern from wire args, hand it to
 * forgetAndWrite, and return a summary of what was removed.
 *
 * @param {object} args wire args (see buildForgetPattern for accepted shapes)
 * @returns {Promise<object>} on success `{ ok: true, removed, removedCount,
 *   egressRemoved }`, where each `removed` entry is trimmed to
 *   `{ id, kind, subject }`; on failure `{ ok: false, error, message }`.
 */
async function verbProfileForget(args) {
  const { ok, pattern, error, message } = buildForgetPattern(args);
  if (!ok) return { ok: false, error, message };

  const outcome = await forgetAndWrite(pattern);
  if (!outcome.ok) {
    // Fall back to a generic code when forgetAndWrite supplies none.
    return { ok: false, error: outcome.code || 'forget-failed', message: outcome.message };
  }

  const removedList = outcome.removed || [];
  return {
    ok: true,
    removed: removedList.map((inf) => ({ id: inf.id, kind: inf.kind, subject: inf.subject })),
    removedCount: removedList.length,
    egressRemoved: outcome.egressRemoved || 0,
  };
}
418
+
419
/**
 * `profile.audit` verb: list the inferences held in the current profile.
 *
 * A missing or unreadable profile (readProfile throws, or returns a non-ok
 * result) yields an empty list rather than an error.
 *
 * @returns {{ok: true, inferences: Array}}
 */
function verbProfileAudit() {
  const loadProfile = () => {
    try {
      const res = readProfile();
      return res && res.ok ? res.profile : null;
    } catch {
      // Deliberate best-effort: a read failure is treated as "no profile yet".
      return null;
    }
  };

  const profile = loadProfile();
  return { ok: true, inferences: profile ? listInferences(profile) : [] };
}
433
+
434
+ export async function handleIjfwBrain({ verb, args = {}, db, repoRoot, env = process.env, opts = {} } = {}) {
340
435
  if (!verb || typeof verb !== 'string') return { ok: false, error: 'missing-verb' };
341
436
  switch (verb) {
342
437
  case 'think': return verbThink(db, repoRoot, args, opts);
@@ -347,6 +442,10 @@ export async function handleIjfwBrain({ verb, args = {}, db, repoRoot, env: _env
347
442
  case 'wiki.export': return verbWikiExport(db, repoRoot, args);
348
443
  case 'wiki.shareReadme': return verbWikiShareReadme(db, repoRoot);
349
444
  case 'conflict.resolve': return verbConflictResolve(db, repoRoot, args);
445
+ case 'profile.get': return verbProfileGet(args, env);
446
+ case 'profile.brief': return verbProfileBrief(args, env);
447
+ case 'profile.forget': return verbProfileForget(args);
448
+ case 'profile.audit': return verbProfileAudit();
350
449
  default: return { ok: false, error: 'unknown-verb', verb };
351
450
  }
352
451
  }
@@ -355,4 +454,5 @@ export const IJFW_BRAIN_VERBS = [
355
454
  'think', 'links',
356
455
  'wiki.get', 'wiki.compile', 'wiki.promote', 'wiki.export', 'wiki.shareReadme',
357
456
  'conflict.resolve',
457
+ 'profile.get', 'profile.brief', 'profile.forget', 'profile.audit',
358
458
  ];
@@ -23,7 +23,7 @@ import { homedir } from 'node:os';
23
23
  const DEV_PARENTS = ['dev', 'Code', 'code', 'projects', 'repos', 'work', 'src'];
24
24
 
25
25
  // Decode Claude Code's path-encoded project directory name back to an absolute
26
- // path. Example: "-Users-seandonahoe-dev-pip" -> "/Users/seandonahoe/dev/pip".
26
+ // path. Example: "-Users-alice-dev-pip" -> "/Users/alice/dev/pip".
27
27
  // Encoding replaces `/` with `-`. Leading `-` becomes leading `/`.
28
28
  // Caveat: directories with literal `-` in their name become ambiguous on
29
29
  // decode; we verify by checking whether the decoded path exists.
@@ -0,0 +1,289 @@
1
+ // IJFW v1.6.0 -- benchmark metrics. Pure functions, no I/O, no LLM.
2
+ import { mulberry32 } from './benchmark.js';
3
+ //
4
+ // Retrieval metrics (free -- no model calls):
5
+ // recallAtK, precisionAtK, mrr, episodesPerQuery, latencyPercentile
6
+ // Answer metrics:
7
+ // normalizeAnswer + answerExactMatch (free string match). LLM-judged
8
+ // answer correctness is a separate paid path wired in P5, not here.
9
+ //
10
+ // All retrieval metrics operate on:
11
+ // retrievedIds : string[] -- ranked result ids (best first)
12
+ // relevantIds : string[] -- gold evidence ids (the relevant set)
13
+ // expressed in the SAME id space (the loader's job to align granularity).
14
+
15
/**
 * Recall@k: the share of gold ids that appear in the first k retrieved ids.
 *
 * @param {string[]} retrievedIds ranked result ids, best first
 * @param {string[]} relevantIds gold evidence ids
 * @param {number} k cut-off rank
 * @returns {number|null} null when there is no gold evidence (recall is
 *   undefined for such queries), otherwise a value in [0, 1]
 */
export function recallAtK(retrievedIds, relevantIds, k) {
  if (!relevantIds || relevantIds.length === 0) return null;
  const topK = new Set(retrievedIds.slice(0, k));
  const hits = relevantIds.reduce((count, id) => count + (topK.has(id) ? 1 : 0), 0);
  return hits / relevantIds.length;
}
23
+
24
/**
 * Precision@k over the results actually returned:
 *   |relevant ∩ retrieved[0:k]| / min(k, retrieved.length).
 *
 * The denominator is the size of the returned top-k window, not k itself, so
 * a result list shorter than k is not penalised for the missing slots.
 *
 * @param {string[]} retrievedIds ranked result ids, best first
 * @param {string[]|null|undefined} relevantIds gold ids (nullish = none)
 * @param {number} k cut-off rank
 * @returns {number|null} null when k <= 0; 0 when nothing was retrieved
 */
export function precisionAtK(retrievedIds, relevantIds, k) {
  if (k <= 0) return null;
  const relevant = new Set(relevantIds || []);
  const window = retrievedIds.slice(0, k);
  if (window.length === 0) return 0;
  const hits = window.filter((id) => relevant.has(id)).length;
  return hits / Math.min(k, window.length);
}
34
+
35
/**
 * Reciprocal rank for one query: 1 / (1-based rank of the first relevant
 * result), or 0 when no retrieved id is relevant. Averaging this over
 * queries gives MRR.
 *
 * @param {string[]} retrievedIds ranked result ids, best first
 * @param {string[]|null|undefined} relevantIds gold ids (nullish = none)
 * @returns {number}
 */
export function reciprocalRank(retrievedIds, relevantIds) {
  const relevant = new Set(relevantIds || []);
  const firstHit = retrievedIds.findIndex((id) => relevant.has(id));
  return firstHit === -1 ? 0 : 1 / (firstHit + 1);
}
43
+
44
/**
 * Arithmetic mean of a list, ignoring null, undefined and NaN entries.
 *
 * @param {Array<number|null|undefined>} values
 * @returns {number|null} null when no usable entry remains
 */
export function mean(values) {
  let total = 0;
  let count = 0;
  for (const v of values) {
    if (v === null || v === undefined || Number.isNaN(v)) continue;
    total += v;
    count += 1;
  }
  return count === 0 ? null : total / count;
}
50
+
51
/**
 * Percentile with linear interpolation between the two nearest ranks.
 * Non-number and NaN entries are ignored; the input array is not mutated.
 *
 * @param {Array} values samples
 * @param {number} p percentile, expected in [0, 100]
 * @returns {number|null} null when there are no numeric samples
 */
export function percentile(values, p) {
  const sorted = values.filter((v) => typeof v === 'number' && !Number.isNaN(v));
  sorted.sort((a, b) => a - b);

  const n = sorted.length;
  if (n === 0) return null;
  if (n === 1) return sorted[0];

  // Fractional position on the 0..n-1 index scale.
  const pos = (p / 100) * (n - 1);
  const below = Math.floor(pos);
  const above = Math.ceil(pos);
  if (below === above) return sorted[below];
  return sorted[below] + (sorted[above] - sorted[below]) * (pos - below);
}
62
+
63
/**
 * Canonicalise an answer string for exact-match comparison (SQuAD-style):
 * lowercase, drop the articles a/an/the, turn every character outside
 * [a-z0-9 ] into a space, collapse whitespace runs, and trim.
 *
 * @param {*} s answer; null/undefined are treated as the empty string
 * @returns {string}
 */
export function normalizeAnswer(s) {
  const lowered = String(s ?? '').toLowerCase();
  const withoutArticles = lowered.replace(/\b(a|an|the)\b/g, ' ');
  const alnumOnly = withoutArticles.replace(/[^a-z0-9 ]/g, ' ');
  const collapsed = alnumOnly.replace(/\s+/g, ' ');
  return collapsed.trim();
}
72
+
73
/**
 * Tell whether a ground-truth entry is flagged as an ABSTENTION (no-answer /
 * trap) gold.
 *
 * Only the canonical `is_abstention === true` flag counts; the surface form
 * of the gold answer is never inspected. Truthy non-boolean values such as
 * 1 or "true" do NOT qualify.
 *
 * @param {object|null|undefined} gt a ground_truth entry
 * @returns {boolean}
 */
export function isAbstentionGold(gt) {
  if (!gt) return false;
  return gt.is_abstention === true;
}
89
+
90
// Lexical cues that a prediction CORRECTLY abstained / declined (no confident
// claim). Rubric-only (no LLM), so the verdict is the same function whatever
// the dataset or gold surface form. Each cue may match anywhere inside a
// sentence, so every cue MUST contain an explicit negation — a cue that can
// match an affirmative phrase would score a hallucination as a correct decline.
const ABSTENTION_CUES = [
  /\bi (?:do not|don't|dont) (?:know|have)\b/,
  /\b(?:no|not any|don't have|do not have|lack) (?:information|info|record|data|details?)\b/,
  /\bno (?:information|mention|record|reference)\b/,
  /\bnot (?:mentioned|stated|specified|provided|available|found|present|discussed)\b/,
  /\b(?:cannot|can't|cant|could not|couldn't|unable to) (?:find|determine|tell|answer|say|locate)\b/,
  /\bisn't (?:any )?(?:information|mention|record)\b/,
  // Negation is mandatory: "there is no …", "there was no …", "there isn't
  // (any) …". The previous form also accepted a bare "there is <noun>", which
  // is an affirmative claim.
  /\bthere (?:(?:is|was) no|isn't(?: any)?) (?:information|mention|record|reference|data)\b/,
  /\bnot enough (?:information|context|detail)\b/,
];

// Bare cue TOKENS ("unknown", "unclear", "n/a") are genuine abstentions ONLY
// when they are essentially the WHOLE answer — a short decline. As a substring
// of a longer confident response ("Unknown Pleasures") they are a claim, not a
// decline, and must score 0. They are therefore anchored and matched against
// the whole (punctuation-stripped) answer, separately from the phrase cues.
const BARE_TOKEN_ABSTENTIONS = [
  /^unknown$/,
  /^unclear$/,
  /^n\/a$/,
  /^na$/,
];

/**
 * Score one ABSTENTION question under a single lexical rubric.
 *
 * Returns 1 when the prediction declines to answer (matches a negated phrase
 * cue, or consists solely of a bare decline token), 0 when it is empty or
 * makes any other statement. The gold answer is not consulted.
 *
 * Typographic apostrophes (U+2018 / U+2019) are folded to ASCII `'` first so
 * "don’t" / "can’t" / "isn’t" hit the same cues as their ASCII spellings.
 *
 * @param {string} predicted
 * @returns {1|0}
 */
export function scoreAbstentionMatch(predicted) {
  const t = String(predicted ?? '')
    .toLowerCase()
    .replace(/[\u2018\u2019]/g, "'")
    .trim();
  if (t === '') return 0; // an empty answer is not a stated abstention
  // Phrase-level cues may appear inside a sentence (a genuine NL decline).
  for (const re of ABSTENTION_CUES) {
    if (re.test(t)) return 1;
  }
  // Bare tokens count only as the whole answer: strip surrounding
  // punctuation/whitespace so "Unknown.", "n/a!" still match, while a sentence
  // that merely CONTAINS the token does not.
  const bare = t.replace(/^[\s"'.,!?-]+|[\s"'.,!?-]+$/g, '');
  for (const re of BARE_TOKEN_ABSTENTIONS) {
    if (re.test(bare)) return 1;
  }
  return 0;
}
148
+
149
/**
 * Free (no-LLM) answer-correctness signal based on normalised strings.
 *
 * Scores 1 when the normalised prediction equals the normalised gold, or when
 * either one contains the other as a substring (so "yes" matches
 * "yes, both American"). Containment is character-level, not token-level.
 *
 * @param {*} predicted model answer
 * @param {*} gold reference answer
 * @returns {1|0|null} null when the gold normalises to the empty string
 */
export function answerExactMatch(predicted, gold) {
  const pred = normalizeAnswer(predicted);
  const ref = normalizeAnswer(gold);
  if (!ref) return null;
  if (pred === ref) return 1;
  // An empty prediction is never counted as "contained" in the gold.
  const overlaps = pred !== '' && (pred.includes(ref) || ref.includes(pred));
  return overlaps ? 1 : 0;
}
159
+
160
/**
 * Collapse per-query records into the metrics block for one
 * (adapter, dataset) cell.
 *
 * Ratio metrics are rounded to 4 decimals and cost to 6. Missing token and
 * cost fields count as 0; missing `n_retrieved` / `hops` values are left out
 * of their means. `answer_match_mean` is null when no record carries an
 * `answerMatch` score.
 *
 * @param {Array<{retrievedIds, relevantIds, latency_ms, tokens_in, tokens_out, cost_usd, n_retrieved, answerMatch, hops}>} per
 * @returns {object} the metrics block
 */
export function aggregate(per) {
  const meanOf = (pick) => mean(per.map((q) => pick(q)));
  const recallAt = (k) => meanOf((q) => recallAtK(q.retrievedIds, q.relevantIds, k));

  const latencies = per.map((q) => q.latency_ms).filter((ms) => typeof ms === 'number');
  const answerScores = per.map((q) => q.answerMatch).filter((s) => s !== null && s !== undefined);

  return {
    n_queries: per.length,
    recall_at_1: round(recallAt(1)),
    recall_at_5: round(recallAt(5)),
    recall_at_10: round(recallAt(10)),
    precision_at_5: round(meanOf((q) => precisionAtK(q.retrievedIds, q.relevantIds, 5))),
    mrr: round(meanOf((q) => reciprocalRank(q.retrievedIds, q.relevantIds))),
    episodes_per_query_mean: round(meanOf((q) => q.n_retrieved)),
    latency_p50_ms: round(percentile(latencies, 50)),
    latency_p95_ms: round(percentile(latencies, 95)),
    tokens_per_query_mean: round(meanOf((q) => (q.tokens_in || 0) + (q.tokens_out || 0))),
    cost_per_query_usd: round6(meanOf((q) => q.cost_usd || 0)),
    answer_match_mean: answerScores.length ? round(mean(answerScores)) : null,
    hops_mean: round(meanOf((q) => q.hops)),
  };
}
190
+
191
/**
 * Bucket per-query records by their `dim` field and aggregate each bucket.
 * Records with a nullish `dim` land in the 'unknown' bucket. This is the
 * diagnostic view: a global aggregate hides WHERE a system fails, a
 * per-dimension one exposes it.
 *
 * @param {Array} per per-query records, each carrying a `dim` field
 * @returns {Object<string, object>} map of dimension value -> aggregate block,
 *   in order of first appearance
 */
export function aggregateByDimension(per) {
  const grouped = new Map();
  for (const row of per) {
    const dim = row.dim ?? 'unknown';
    const bucket = grouped.get(dim);
    if (bucket) {
      bucket.push(row);
    } else {
      grouped.set(dim, [row]);
    }
  }

  const result = {};
  for (const [dim, rows] of grouped) {
    result[dim] = aggregate(rows);
  }
  return result;
}
208
+
209
/** Round to 4 decimal places; null/undefined pass through as null. */
function round(x) {
  if (x == null) return null;
  return Math.round(x * 10000) / 10000;
}

/** Round to 6 decimal places; null/undefined pass through as null. */
function round6(x) {
  if (x == null) return null;
  return Math.round(x * 1e6) / 1e6;
}
211
+
212
/**
 * Bootstrap confidence interval for the mean of per-query numeric scores.
 *
 * Draws `iters` resamples of size n with replacement using a seeded PRNG
 * (mulberry32), so the result is deterministic for a given seed.
 *
 * @param {number[]} perQuery per-query numeric scores (e.g. 0/1 per query)
 * @param {{ iters?: number, alpha?: number, seed?: number }} opts
 * @returns {{ point: number, lo: number, hi: number }}
 *   point = mean(perQuery); lo/hi are the alpha/2 and 1-alpha/2 order
 *   statistics of the sorted bootstrap means. An empty input yields zeros.
 */
export function bootstrapCI(perQuery, { iters = 1000, alpha = 0.05, seed = 42 } = {}) {
  const n = perQuery.length;
  if (n === 0) return { point: 0, lo: 0, hi: 0 };

  const point = perQuery.reduce((a, b) => a + b, 0) / n;
  const rng = mulberry32(seed);

  // Mean of one resample of size n drawn with replacement.
  const resampleMean = () => {
    let total = 0;
    for (let draw = 0; draw < n; draw += 1) {
      total += perQuery[Math.floor(rng() * n)];
    }
    return total / n;
  };

  const boots = Array.from({ length: iters });
  for (let i = 0; i < iters; i += 1) {
    boots[i] = resampleMean();
  }
  boots.sort((a, b) => a - b);

  const loIdx = Math.floor((alpha / 2) * iters);
  // Clamp so alpha close to 0 cannot index one past the end.
  const hiIdx = Math.min(Math.floor((1 - alpha / 2) * iters), iters - 1);
  return { point, lo: boots[loIdx], hi: boots[hiIdx] };
}
245
+
246
/**
 * Paired McNemar test for two binary result arrays (0/1 per query).
 *
 * Uses the continuity-corrected statistic χ² = (|b−c| − 1)² / (b+c). The
 * correction is applied ONLY when b ≠ c. When the discordant counts are equal
 * (including the case of no discordant pairs) there is no evidence of a
 * difference, so the result is statistic 0, pValue 1. Applying the correction
 * there would give (−1)² / (b+c) > 0, i.e. a p-value below 1 for perfectly
 * balanced data.
 *
 * pValue is the χ²(1) survival function, P(χ² > x) = erfc(sqrt(x/2)),
 * evaluated with an erfc approximation.
 *
 * Pairs are read by index up to `before.length`; entries that are not exactly
 * 0 or 1 (including a missing `after[i]`) are not counted.
 *
 * @param {number[]} before binary array (0/1 per query)
 * @param {number[]} after binary array (0/1 per query)
 * @returns {{ b: number, c: number, statistic: number, pValue: number, significant: boolean }}
 *   b = count(before=0, after=1)  (after wins)
 *   c = count(before=1, after=0)  (before wins)
 *   significant = pValue < 0.05
 */
export function mcnemar(before, after) {
  let b = 0; // before=0, after=1
  let c = 0; // before=1, after=0
  for (let i = 0; i < before.length; i++) {
    if (before[i] === 0 && after[i] === 1) b++;
    else if (before[i] === 1 && after[i] === 0) c++;
  }

  // Balanced (or absent) discordant pairs: no difference to test.
  if (b === c) return { b, c, statistic: 0, pValue: 1, significant: false };

  // b !== c, so |b - c| >= 1 and the corrected numerator is never negative.
  const diff = Math.abs(b - c) - 1;
  const statistic = (diff * diff) / (b + c);
  const pValue = erfcApprox(Math.sqrt(statistic / 2));

  return { b, c, statistic, pValue, significant: pValue < 0.05 };
}

/**
 * Complementary error function approximation used for the χ²(1) p-value.
 * Abramowitz & Stegun 7.1.26 rational approximation (max |err| < 1.5e-7).
 *
 * @param {number} x
 * @returns {number} approximately erfc(x)
 */
function erfcApprox(x) {
  // The formula is defined for x >= 0; reflect negatives via erfc(-x) = 2 - erfc(x).
  if (x < 0) return 2 - erfcApprox(-x);
  const t = 1 / (1 + 0.3275911 * x);
  const poly = t * (0.254829592 + t * (-0.284496736 + t * (1.421413741 + t * (-1.453152027 + t * 1.061405429))));
  return poly * Math.exp(-x * x);
}
@@ -115,7 +115,7 @@ function mean(values) {
115
115
 
116
116
  // Deterministic PRNG (mulberry32) so the synthetic corpus is reproducible
117
117
  // across runs + machines. Same seed => same docs/queries/gold-mapping.
118
- function mulberry32(seed) {
118
+ export function mulberry32(seed) {
119
119
  let a = seed >>> 0;
120
120
  return function() {
121
121
  a = (a + 0x6d2b79f5) >>> 0;
@@ -45,6 +45,53 @@ const DB_FILENAME = 'memory.db';
45
45
  const INDEX_DIR_NAME = 'index';
46
46
  const IJFW_DIR_NAME = '.ijfw';
47
47
 
48
+ // --- W1.3 (v1.6.0): natural-language OR-query construction ------------------
49
+ //
50
+ // FTS5 treats a space-separated MATCH as implicit AND -- every token must
51
+ // co-occur in one indexed entry. A real natural-language recall ("what
52
+ // database did we pick for the auth service") almost never has all its tokens
53
+ // in a single entry, so the implicit-AND query starves and retrieves nothing.
54
+ // expandQuery() only OR-groups *synonyms* ("(db OR database) AND user"); the
55
+ // inter-token relation stays AND. The fix (proven by the v1.6.0 bench harness)
56
+ // is to OR the salient terms: drop stopwords + sub-3-char tokens, dedup, fold
57
+ // each surviving token's synonym group in, and OR-join. Single-token and
58
+ // exact-phrase queries are unaffected (one quoted term / one OR-group).
59
// Lowercase words dropped from natural-language queries before OR-joining.
// All entries are at least 3 characters long.
const FTS_STOPWORDS = new Set([
  // articles / conjunctions / copulas / common prepositions
  'the', 'and', 'for', 'are', 'was', 'were', 'with', 'that', 'this', 'from',
  // interrogatives
  'who', 'what', 'when', 'where', 'which', 'whom', 'whose', 'why', 'how',
  // auxiliaries and comparatives
  'did', 'does', 'has', 'had', 'have', 'been', 'being', 'into', 'than',
  // quantifiers / connectives / third-person plural
  'same', 'both', 'also', 'about', 'between', 'their', 'they', 'them',
  // remaining pronouns and possessives
  'his', 'her', 'its', 'our', 'your', 'you', 'she', 'him',
]);
66
+
67
// Reduce a raw query to ASCII alphanumerics, underscores and single spaces.
// Every other character (quotes, `*`, `:`, parentheses, hyphens, non-ASCII
// letters, ...) becomes a space, then whitespace runs are collapsed and the
// ends trimmed. Non-string input yields ''.
function sanitizeFtsQuery(q) {
  if (typeof q !== 'string') return '';
  const specialsToSpaces = q.replace(/[^a-zA-Z0-9_\s]/g, ' ');
  const singleSpaced = specialsToSpaces.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
74
+
75
// Turn a natural-language string into an FTS5 query that ORs its salient terms.
//
// Steps: sanitise, split on whitespace, drop tokens that are shorter than 3
// characters, are stopwords, or repeat an earlier token (compared lowercase);
// then pass each survivor through expandQuery. A token that expandQuery
// reports as `applied` contributes its `expanded` form; any other token is
// emitted as a double-quoted literal. Groups are joined with ' OR '.
//
// Returns '' when nothing salient survives, so the caller can fall back.
//
// NOTE(review): sanitizeFtsQuery replaces quotes and `*` with spaces, so a
// caller-supplied phrase ("auth service") or prefix (auth*) query is reduced
// to plain tokens here — confirm that is intended for every caller.
function buildOrQuery(q) {
  const cleaned = sanitizeFtsQuery(q);
  if (!cleaned) return '';

  const emitted = new Set();
  const orGroups = [];
  cleaned.split(/\s+/).forEach((tok) => {
    const key = tok.toLowerCase();
    const isSalient = key.length >= 3 && !FTS_STOPWORDS.has(key) && !emitted.has(key);
    if (!isSalient) return;
    emitted.add(key);
    // expandQuery receives the token in its original case.
    const { expanded, applied } = expandQuery(tok);
    orGroups.push(applied ? expanded : `"${tok}"`);
  });
  return orGroups.join(' OR ');
}
94
+
48
95
  // --- Driver bootstrap (top-level await; resolves once at module load) -----
49
96
 
50
97
  let DRIVER = null;
@@ -524,7 +571,12 @@ export function searchMemory(q, files, limit = MAX_RESULTS, options) {
524
571
  if (rowCount(db) === 0 && files.length > 0) {
525
572
  autoIndex(db, files);
526
573
  }
527
- const ftsQuery = applied ? expanded : q;
574
+ // W1.3: OR the salient terms so NL queries don't starve under FTS5's
575
+ // implicit AND. Falls back to the synonym-expanded (or raw) query when
576
+ // no salient term survives. Final catch retries the raw query so a
577
+ // malformed rewrite can never regress to fewer results than today.
578
+ const orQuery = buildOrQuery(q);
579
+ const ftsQuery = orQuery || (applied ? expanded : q);
528
580
  let rows;
529
581
  try {
530
582
  rows = searchFts5(db, ftsQuery, limit, tier_semantic, include_stale);
@@ -7,7 +7,7 @@
7
7
  * surfaced in the `ijfw-plan-check` skill as the deterministic pre-dispatch
8
8
  * gate.
9
9
  *
10
- * Distilled from /Users/seandonahoe/.claude/agents/gsd-plan-checker.md extracts
10
+ * Distilled from the gsd-plan-checker agent definition: extracts
11
11
  * the mechanically-checkable rules (the prose-reasoning ones stay in the skill).
12
12
  *
13
13
  * No I/O, no network — operates on plan text passed in by caller.