@ijfw/memory-server 1.5.5 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ijfw-dashboard +20 -1
- package/package.json +4 -3
- package/src/audit-roster.js +89 -12
- package/src/brain/tiered-llm.js +57 -7
- package/src/cross-orchestrator-cli.js +344 -4
- package/src/cross-project-search.js +39 -1
- package/src/dashboard-server.js +7 -1
- package/src/dream/runner.mjs +560 -8
- package/src/handlers/brain-handler.js +101 -1
- package/src/importers/discover.js +1 -1
- package/src/memory/bench-metrics.js +289 -0
- package/src/memory/benchmark.js +1 -1
- package/src/memory/search.js +53 -1
- package/src/orchestrator/plan-checker.js +1 -1
- package/src/profile/audit.js +671 -0
- package/src/profile/capture.js +871 -0
- package/src/profile/derive-dialectic.js +242 -0
- package/src/profile/derive-heuristic.js +733 -0
- package/src/profile/derive.js +156 -0
- package/src/profile/egress.js +306 -0
- package/src/profile/eval/build-real-probes.mjs +197 -0
- package/src/profile/eval/corpus-from-reddit.mjs +166 -0
- package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
- package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
- package/src/profile/eval/gate-b-behavior.mjs +420 -0
- package/src/profile/eval/gate-b-decision-run.mjs +171 -0
- package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
- package/src/profile/eval/gate-b-run.mjs +417 -0
- package/src/profile/eval/gate-b-run.test.mjs +204 -0
- package/src/profile/eval/gate-c-capture.mjs +323 -0
- package/src/profile/eval/harness.mjs +551 -0
- package/src/profile/eval/instrument-validation.mjs +248 -0
- package/src/profile/eval/instrument-validation.test.mjs +125 -0
- package/src/profile/eval/multi-subject-harness.mjs +106 -0
- package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
- package/src/profile/eval/personas.test.mjs +83 -0
- package/src/profile/eval/plumbing.test.mjs +69 -0
- package/src/profile/eval/prereg.mjs +130 -0
- package/src/profile/eval/prereg.test.mjs +78 -0
- package/src/profile/eval/real-corpus.test.mjs +103 -0
- package/src/profile/eval/real-personas.mjs +109 -0
- package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
- package/src/profile/eval/run-real-corpus.mjs +358 -0
- package/src/profile/eval/slug-quality.mjs +464 -0
- package/src/profile/eval/stylometry-features.js +85 -0
- package/src/profile/eval/stylometry-reference.js +16 -0
- package/src/profile/eval/stylometry.js +224 -0
- package/src/profile/eval/stylometry.test.mjs +103 -0
- package/src/profile/eval/synthetic-personas.js +91 -0
- package/src/profile/eval/verifier-features.mjs +170 -0
- package/src/profile/eval/verifier-logreg.mjs +74 -0
- package/src/profile/eval/verifier-pair.mjs +122 -0
- package/src/profile/eval/verifier-reference.mjs +68 -0
- package/src/profile/eval/verifier-scorer.mjs +30 -0
- package/src/profile/eval/wrong-target-control.mjs +168 -0
- package/src/profile/eval/wrong-target-control.test.mjs +124 -0
- package/src/profile/exemplar-capture.js +232 -0
- package/src/profile/exemplar-retrieve.js +138 -0
- package/src/profile/exemplar-store.js +314 -0
- package/src/profile/lock.js +64 -0
- package/src/profile/merge.js +624 -0
- package/src/profile/path-policy.js +213 -0
- package/src/profile/precision-stamp.mjs +151 -0
- package/src/profile/render-brief.js +717 -0
- package/src/profile/schema.js +244 -0
- package/src/profile/sensitivity.js +249 -0
- package/src/profile/serve.js +345 -0
- package/src/profile/store.js +261 -0
- package/src/profile/telemetry.js +289 -0
- package/src/recovery/checkpoint.js +7 -1
- package/src/server.js +185 -14
- package/src/.registry-meta-key.pem +0 -3
|
@@ -25,6 +25,9 @@ import { compileWikiPage, slugify } from '../brain/wiki-compiler.js';
|
|
|
25
25
|
import { resolveCitations } from '../brain/citation-resolver.js';
|
|
26
26
|
import { exportPageBundle, writeShareReadme } from '../brain/export.js';
|
|
27
27
|
import { validateSafeRepoPath } from '../brain/path-guard.js';
|
|
28
|
+
import { profileGet, profileBrief } from '../profile/serve.js';
|
|
29
|
+
import { forgetAndWrite, listInferences } from '../profile/audit.js';
|
|
30
|
+
import { readProfile } from '../profile/store.js';
|
|
28
31
|
|
|
29
32
|
const WIKI_TYPES = ['concepts', 'entities', 'decisions', 'milestones'];
|
|
30
33
|
|
|
@@ -336,7 +339,99 @@ function verbConflictResolve(db, repoRoot, args) {
|
|
|
336
339
|
return { ok: true, resolved: true, winnerId: args.winnerId, supersededIds, validTo: chosenValidTo };
|
|
337
340
|
}
|
|
338
341
|
|
|
339
|
-
|
|
342
|
+
// ---------------------------------------------------------------------------
|
|
343
|
+
// PHASE P4 — cross-system profile bus serving verbs (folded into ijfw_brain so
|
|
344
|
+
// the MCP tool cap stays at 13/13 — NO new top-level tool). Both verbs are
|
|
345
|
+
// ZERO-LLM by construction: they route into src/profile/serve.js, which imports
|
|
346
|
+
// only the store/render/egress/sensitivity modules (the P4.5 import-graph guard
|
|
347
|
+
// proves the serve path never reaches the LLM tier). `env` threads the host's
|
|
348
|
+
// env so per-host opt-in (IJFW_PROFILE_SHARE_SENSITIVE), redaction
|
|
349
|
+
// (IJFW_PROFILE_REDACT) and the kill-switch (IJFW_PROFILE_KILL) are honored.
|
|
350
|
+
//
|
|
351
|
+
// args (both verbs, all optional):
|
|
352
|
+
// tokenBudget number — cap brief output (brief only)
|
|
353
|
+
// context { overlay?, host?, session? } — overlay key + egress meta
|
|
354
|
+
// shareSensitive boolean — programmatic per-host opt-in (else env flag)
|
|
355
|
+
// ---------------------------------------------------------------------------
|
|
356
|
+
|
|
357
|
+
/**
 * Normalise wire-level verb arguments into the options object passed to the
 * profile serve functions.
 *
 * Each field is copied only when it has the expected runtime type; anything
 * else collapses to its neutral value (`undefined`, or `{}` for context).
 *
 * @param {object|null|undefined} args raw verb arguments
 * @param {object|null|undefined} env  host environment; falls back to
 *   `process.env` when falsy
 * @returns {{ tokenBudget: number|undefined, context: object,
 *             shareSensitive: boolean|undefined, env: object }}
 */
function profileServeOpts(args, env) {
  const wire = args || {};
  const serveOpts = {
    tokenBudget: undefined,
    context: {},
    shareSensitive: undefined,
    env: env || process.env,
  };

  // Only finite numbers are accepted as a budget (rejects NaN, Infinity, strings).
  if (Number.isFinite(wire.tokenBudget)) {
    serveOpts.tokenBudget = wire.tokenBudget;
  }
  // Any truthy object (arrays included) is passed through unchanged.
  if (wire.context && typeof wire.context === 'object') {
    serveOpts.context = wire.context;
  }
  // Strict boolean check so truthy/falsy non-booleans leave the opt-in undecided.
  if (typeof wire.shareSensitive === 'boolean') {
    serveOpts.shareSensitive = wire.shareSensitive;
  }
  return serveOpts;
}
|
|
365
|
+
|
|
366
|
+
/**
 * `profile.get` verb: normalise the wire args and delegate to `profileGet`
 * (imported from ../profile/serve.js). The delegate's result is returned as-is.
 *
 * @param {object} args raw verb arguments
 * @param {object} env  host environment
 */
function verbProfileGet(args, env) {
  const serveOpts = profileServeOpts(args, env);
  return profileGet(serveOpts);
}
|
|
369
|
+
|
|
370
|
+
/**
 * `profile.brief` verb: normalise the wire args and delegate to `profileBrief`
 * (imported from ../profile/serve.js). The delegate's result is returned as-is.
 *
 * @param {object} args raw verb arguments
 * @param {object} env  host environment
 */
function verbProfileBrief(args, env) {
  const serveOpts = profileServeOpts(args, env);
  return profileBrief(serveOpts);
}
|
|
373
|
+
|
|
374
|
+
// ---------------------------------------------------------------------------
|
|
375
|
+
// PHASE P4 — right-to-be-forgotten + audit INVOCATION SURFACE (audit M2). The
|
|
376
|
+
// audit module's forgetAndWrite / listInferences were previously reachable only
|
|
377
|
+
// from tests; these verbs give the user a real way to SEE what was inferred and
|
|
378
|
+
// to DELETE it (the egress purge rides along inside forgetAndWrite). Folded into
|
|
379
|
+
// ijfw_brain — NO new top-level tool, cap stays 13/13.
|
|
380
|
+
// ---------------------------------------------------------------------------
|
|
381
|
+
|
|
382
|
+
/**
 * Turn wire args into a forget pattern.
 *
 * Accepted shapes, checked in this order:
 *   1. `args.id`      — non-empty string, returned verbatim
 *   2. `args.pattern` — non-empty string, returned verbatim
 *   3. `args.regex`   — non-empty string compiled with `new RegExp`; `args.flags`
 *                       (if a string) is first reduced to the characters `gimsuy`
 *
 * A regex source (or flag combination) that fails to compile is reported as
 * `invalid-regex` instead of throwing. Anything else is `missing-pattern`.
 *
 * NOTE(review): the original comment states that string patterns are matched
 * by audit.matcherFor and that forgetAndWrite re-validates the compiled RegExp
 * (ReDoS guard) before taking the global lock. Neither is visible from this
 * function — confirm against ../profile/audit.js.
 *
 * @param {object|null|undefined} args
 * @returns {{ ok: true, pattern: string|RegExp }
 *          | { ok: false, error: string, message?: string }}
 */
function buildForgetPattern(args) {
  const wire = args || {};
  const isNonEmptyString = (value) => typeof value === 'string' && value !== '';

  if (isNonEmptyString(wire.id)) {
    return { ok: true, pattern: wire.id };
  }
  if (isNonEmptyString(wire.pattern)) {
    return { ok: true, pattern: wire.pattern };
  }
  if (!isNonEmptyString(wire.regex)) {
    return { ok: false, error: 'missing-pattern' };
  }

  // Drop every character that is not one of the whitelisted RegExp flags.
  let flags = '';
  if (typeof wire.flags === 'string') {
    flags = wire.flags.replace(/[^gimsuy]/g, '');
  }

  try {
    return { ok: true, pattern: new RegExp(wire.regex, flags) };
  } catch (e) {
    return { ok: false, error: 'invalid-regex', message: e.message };
  }
}
|
|
403
|
+
|
|
404
|
+
/**
 * `profile.forget` verb: build a pattern from the wire args, hand it to
 * `forgetAndWrite` (imported from ../profile/audit.js) and summarise the result.
 *
 * NOTE(review): per the original author's note, forgetAndWrite validates the
 * pattern before taking the global profile lock, runs read -> forget -> write
 * under that lock, and purges egress. That behaviour lives in audit.js and is
 * not visible here.
 *
 * @param {object} args wire args (see buildForgetPattern)
 * @returns {Promise<{ ok: true, removed: Array<{id, kind, subject}>,
 *                     removedCount: number, egressRemoved: number }
 *                  | { ok: false, error: string, message?: string }>}
 */
async function verbProfileForget(args) {
  const { ok, pattern, error, message } = buildForgetPattern(args);
  if (!ok) {
    return { ok: false, error, message };
  }

  const outcome = await forgetAndWrite(pattern);
  if (!outcome.ok) {
    return { ok: false, error: outcome.code || 'forget-failed', message: outcome.message };
  }

  // Expose only the identifying fields of each removed inference.
  const removedList = outcome.removed || [];
  const removed = removedList.map((inf) => ({ id: inf.id, kind: inf.kind, subject: inf.subject }));
  return {
    ok: true,
    removed,
    removedCount: removedList.length,
    egressRemoved: outcome.egressRemoved || 0,
  };
}
|
|
418
|
+
|
|
419
|
+
/**
 * `profile.audit` verb: list the inferences of the current profile.
 *
 * Reads the profile via `readProfile` (../profile/store.js) and passes it to
 * `listInferences` (../profile/audit.js). Any failure to obtain a profile —
 * a thrown error, a non-ok result, or a missing profile — is deliberately
 * reported as an empty list rather than an error.
 *
 * @returns {{ ok: true, inferences: Array }}
 */
function verbProfileAudit() {
  const nothingInferred = () => ({ ok: true, inferences: [] });

  let profile;
  try {
    const loaded = readProfile();
    if (!loaded || !loaded.ok) return nothingInferred();
    profile = loaded.profile;
  } catch {
    // Best-effort read: an unreadable profile is treated like a cold start.
    return nothingInferred();
  }

  if (!profile) return nothingInferred();
  return { ok: true, inferences: listInferences(profile) };
}
|
|
433
|
+
|
|
434
|
+
export async function handleIjfwBrain({ verb, args = {}, db, repoRoot, env = process.env, opts = {} } = {}) {
|
|
340
435
|
if (!verb || typeof verb !== 'string') return { ok: false, error: 'missing-verb' };
|
|
341
436
|
switch (verb) {
|
|
342
437
|
case 'think': return verbThink(db, repoRoot, args, opts);
|
|
@@ -347,6 +442,10 @@ export async function handleIjfwBrain({ verb, args = {}, db, repoRoot, env: _env
|
|
|
347
442
|
case 'wiki.export': return verbWikiExport(db, repoRoot, args);
|
|
348
443
|
case 'wiki.shareReadme': return verbWikiShareReadme(db, repoRoot);
|
|
349
444
|
case 'conflict.resolve': return verbConflictResolve(db, repoRoot, args);
|
|
445
|
+
case 'profile.get': return verbProfileGet(args, env);
|
|
446
|
+
case 'profile.brief': return verbProfileBrief(args, env);
|
|
447
|
+
case 'profile.forget': return verbProfileForget(args);
|
|
448
|
+
case 'profile.audit': return verbProfileAudit();
|
|
350
449
|
default: return { ok: false, error: 'unknown-verb', verb };
|
|
351
450
|
}
|
|
352
451
|
}
|
|
@@ -355,4 +454,5 @@ export const IJFW_BRAIN_VERBS = [
|
|
|
355
454
|
'think', 'links',
|
|
356
455
|
'wiki.get', 'wiki.compile', 'wiki.promote', 'wiki.export', 'wiki.shareReadme',
|
|
357
456
|
'conflict.resolve',
|
|
457
|
+
'profile.get', 'profile.brief', 'profile.forget', 'profile.audit',
|
|
358
458
|
];
|
|
@@ -23,7 +23,7 @@ import { homedir } from 'node:os';
|
|
|
23
23
|
const DEV_PARENTS = ['dev', 'Code', 'code', 'projects', 'repos', 'work', 'src'];
|
|
24
24
|
|
|
25
25
|
// Decode Claude Code's path-encoded project directory name back to an absolute
|
|
26
|
-
// path. Example: "-Users-
|
|
26
|
+
// path. Example: "-Users-alice-dev-pip" -> "/Users/alice/dev/pip".
|
|
27
27
|
// Encoding replaces `/` with `-`. Leading `-` becomes leading `/`.
|
|
28
28
|
// Caveat: directories with literal `-` in their name become ambiguous on
|
|
29
29
|
// decode; we verify by checking whether the decoded path exists.
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
// IJFW v1.6.0 -- benchmark metrics. Pure functions, no I/O, no LLM.
|
|
2
|
+
import { mulberry32 } from './benchmark.js';
|
|
3
|
+
//
|
|
4
|
+
// Retrieval metrics (free -- no model calls):
|
|
5
|
+
// recallAtK, precisionAtK, mrr, episodesPerQuery, latencyPercentile
|
|
6
|
+
// Answer metrics:
|
|
7
|
+
// normalizeAnswer + answerExactMatch (free string match). LLM-judged
|
|
8
|
+
// answer correctness is a separate paid path wired in P5, not here.
|
|
9
|
+
//
|
|
10
|
+
// All retrieval metrics operate on:
|
|
11
|
+
// retrievedIds : string[] -- ranked result ids (best first)
|
|
12
|
+
// relevantIds : string[] -- gold evidence ids (the relevant set)
|
|
13
|
+
// expressed in the SAME id space (the loader's job to align granularity).
|
|
14
|
+
|
|
15
|
+
/**
 * Recall@k: the fraction of relevant ids that appear among the first k
 * retrieved ids.
 *
 * @param {string[]} retrievedIds ranked result ids, best first
 * @param {string[]} relevantIds  gold ids
 * @param {number} k              cutoff (passed straight to Array#slice)
 * @returns {number|null} value in [0, 1], or null when there are no relevant
 *   ids (recall is undefined for such a query)
 */
export function recallAtK(retrievedIds, relevantIds, k) {
  const hasGold = Boolean(relevantIds) && relevantIds.length !== 0;
  if (!hasGold) return null;

  const topK = new Set(retrievedIds.slice(0, k));
  let found = 0;
  for (let i = 0; i < relevantIds.length; i += 1) {
    if (topK.has(relevantIds[i])) found += 1;
  }
  return found / relevantIds.length;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Precision@k: the fraction of the first k retrieved ids that are relevant.
 *
 * The denominator is min(k, number of ids actually retrieved in the top k),
 * so a system that returns fewer than k results is not penalised for the
 * missing slots.
 *
 * @param {string[]} retrievedIds ranked result ids, best first
 * @param {string[]|null|undefined} relevantIds gold ids (nullish = none)
 * @param {number} k cutoff
 * @returns {number|null} null when k <= 0, 0 when nothing was retrieved
 */
export function precisionAtK(retrievedIds, relevantIds, k) {
  if (k <= 0) return null;

  const gold = new Set(relevantIds || []);
  const window = retrievedIds.slice(0, k);
  if (window.length === 0) return 0;

  let relevantSeen = 0;
  for (let i = 0; i < window.length; i += 1) {
    if (gold.has(window[i])) relevantSeen += 1;
  }
  return relevantSeen / Math.min(k, window.length);
}
|
|
34
|
+
|
|
35
|
+
/**
 * Reciprocal rank for a single query: 1 / (1-based position of the first
 * relevant id in the ranking), or 0 when no retrieved id is relevant.
 * Averaging this value over queries gives MRR.
 *
 * @param {string[]} retrievedIds ranked result ids, best first
 * @param {string[]|null|undefined} relevantIds gold ids (nullish = none)
 * @returns {number}
 */
export function reciprocalRank(retrievedIds, relevantIds) {
  const gold = new Set(relevantIds || []);
  const firstHit = retrievedIds.findIndex((id) => gold.has(id));
  return firstHit === -1 ? 0 : 1 / (firstHit + 1);
}
|
|
43
|
+
|
|
44
|
+
/**
 * Arithmetic mean of a list of per-query numbers.
 * Entries that are null, undefined or NaN are ignored.
 *
 * @param {Array<number|null|undefined>} values
 * @returns {number|null} null when no usable entry remains
 */
export function mean(values) {
  let total = 0;
  let counted = 0;
  for (const v of values) {
    const usable = v !== null && v !== undefined && !Number.isNaN(v);
    if (usable) {
      total = total + v;
      counted += 1;
    }
  }
  return counted === 0 ? null : total / counted;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Percentile of a list of numbers using linear interpolation between the two
 * closest ranks.
 *
 * Non-number and NaN entries are ignored. `p` is clamped to [0, 100] so an
 * out-of-range request returns the minimum / maximum instead of indexing past
 * the ends of the sorted array (which previously produced undefined or NaN).
 *
 * @param {Array<*>} values candidate samples
 * @param {number} p        requested percentile, nominally in [0, 100]
 * @returns {number|null} null when there is no usable sample, or when `p` is
 *   not a finite number and more than one sample is present
 */
export function percentile(values, p) {
  // filter() already returns a fresh array, so sorting it cannot touch `values`.
  const xs = values
    .filter((v) => typeof v === 'number' && !Number.isNaN(v))
    .sort((a, b) => a - b);
  if (xs.length === 0) return null;
  if (xs.length === 1) return xs[0];
  if (typeof p !== 'number' || !Number.isFinite(p)) return null;

  const clamped = Math.min(100, Math.max(0, p));
  const rank = (clamped / 100) * (xs.length - 1);
  const lo = Math.floor(rank);
  const hi = Math.ceil(rank);
  if (lo === hi) return xs[lo];
  // Interpolate between the neighbouring order statistics.
  return xs[lo] + (xs[hi] - xs[lo]) * (rank - lo);
}
|
|
62
|
+
|
|
63
|
+
/**
 * Normalise an answer string for exact-match comparison (SQuAD-style):
 * lower-case, drop the articles a/an/the, replace every character outside
 * [a-z0-9 ] with a space, collapse whitespace and trim.
 *
 * Note that the character filter is ASCII-only, so accented or non-Latin
 * letters are removed.
 *
 * @param {*} s value to normalise; null/undefined become ''
 * @returns {string}
 */
export function normalizeAnswer(s) {
  const lowered = String(s ?? '').toLowerCase();
  const withoutArticles = lowered.replace(/\b(a|an|the)\b/g, ' ');
  const asciiAlnum = withoutArticles.replace(/[^a-z0-9 ]/g, ' ');
  const collapsed = asciiAlnum.replace(/\s+/g, ' ');
  return collapsed.trim();
}
|
|
72
|
+
|
|
73
|
+
/**
 * Is this ground-truth entry an ABSTENTION (no-answer / trap) gold?
 *
 * The decision is made purely on the canonical `is_abstention` flag, which
 * must be strictly `true`; the surface form of the gold answer is never
 * inspected. Pure; exported for unit tests.
 *
 * NOTE(review): the flag is expected to be stamped by the dataset loaders
 * (longmemeval / convomem per the original note) — those are not visible here.
 *
 * @param {object|null|undefined} gt a ground_truth entry
 * @returns {boolean}
 */
export function isAbstentionGold(gt) {
  if (!gt) return false;
  return gt.is_abstention === true;
}
|
|
89
|
+
|
|
90
|
+
// Lexical cues that a prediction CORRECTLY abstained / declined (no confident
// claim). Deliberately rubric-only (no LLM) so the abstention verdict is the
// SAME function regardless of dataset, judge mode, or gold surface form.
// These phrase cues are unanchored: they may appear anywhere in a sentence.
const ABSTENTION_CUES = [
  /\bi (?:do not|don't|dont) (?:know|have)\b/,
  /\b(?:no|not any|don't have|do not have|lack) (?:information|info|record|data|details?)\b/,
  /\bno (?:information|mention|record|reference)\b/,
  /\bnot (?:mentioned|stated|specified|provided|available|found|present|discussed)\b/,
  /\b(?:cannot|can't|cant|could not|couldn't|unable to) (?:find|determine|tell|answer|say|locate)\b/,
  /\bisn't (?:any )?(?:information|mention|record)\b/,
  // The "there ..." cues REQUIRE a negation. A single pattern with an optional
  // "no" also matched the affirmative "there is information ...", which is a
  // confident claim and must not be scored as an abstention.
  /\bthere (?:is|was) no (?:information|mention|record|reference|data)\b/,
  /\bthere isn't (?:any |no )?(?:information|mention|record|reference|data)\b/,
  /\bnot enough (?:information|context|detail)\b/,
];

// Bare cue TOKENS ("unknown", "unclear", "n/a") are genuine abstentions ONLY
// when they are essentially the WHOLE answer — a short decline. As a SUBSTRING
// of a longer confident response ("...the status is unknown to the team but
// the project shipped", "Unknown Pleasures") they are a hallucination on the
// trap, NOT a decline, and must score 0. Hence these are anchored and matched
// against the whole stripped answer, separately from the phrase cues above.
const BARE_TOKEN_ABSTENTIONS = [
  /^unknown$/,
  /^unclear$/,
  /^n\/a$/,
  /^na$/,
];

/**
 * Score one ABSTENTION question under ONE rubric.
 *
 * Returns 1 when the prediction abstains / declines (it matches a phrase cue,
 * or is nothing but a bare decline token), and 0 when it does not — including
 * the empty answer, which is not a *stated* abstention. The gold answer is not
 * consulted at all. Pure.
 *
 * Typographic quotes are folded to their ASCII forms first, so "don’t know"
 * and “Unknown” are recognised exactly like their ASCII spellings.
 *
 * @param {string} predicted
 * @returns {1|0}
 */
export function scoreAbstentionMatch(predicted) {
  const text = String(predicted ?? '')
    .replace(/[\u2018\u2019\u02BC]/g, "'")
    .replace(/[\u201C\u201D]/g, '"')
    .toLowerCase()
    .trim();
  if (text === '') return 0; // an empty answer is not a stated abstention (caller may skip earlier)

  // Phrase-level cues may appear inside a sentence (a genuine NL decline).
  if (ABSTENTION_CUES.some((cue) => cue.test(text))) return 1;

  // Bare tokens count only when they are the whole answer: strip surrounding
  // punctuation/whitespace so "Unknown.", "n/a!", " unclear " still match, but
  // a confident sentence that merely CONTAINS the token does not.
  const bare = text.replace(/^[\s"'.,!?-]+|[\s"'.,!?-]+$/g, '');
  return BARE_TOKEN_ABSTENTIONS.some((token) => token.test(bare)) ? 1 : 0;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Free (no-LLM) answer-correctness signal based on normalised strings.
 *
 * Scores 1 when the normalised prediction equals the normalised gold, or when
 * either one contains the other (so "yes" matches "yes, both American").
 * Scores 0 otherwise, including when the prediction normalises to ''.
 *
 * @param {*} predicted model answer
 * @param {*} gold      reference answer
 * @returns {1|0|null} null when the gold normalises to '' (nothing to compare)
 */
export function answerExactMatch(predicted, gold) {
  const predNorm = normalizeAnswer(predicted);
  const goldNorm = normalizeAnswer(gold);

  if (!goldNorm) return null;
  // An empty prediction can neither equal nor meaningfully contain the gold.
  if (!predNorm) return 0;

  const matches =
    predNorm === goldNorm ||
    predNorm.includes(goldNorm) ||
    goldNorm.includes(predNorm);
  return matches ? 1 : 0;
}
|
|
159
|
+
|
|
160
|
+
/**
 * Collapse per-query retrieval records into the metrics block for one
 * (adapter, dataset) cell.
 *
 * Ratio metrics are rounded to 4 decimals, cost to 6. A metric whose inputs
 * are all missing is reported as null.
 *
 * @param {Array<{retrievedIds, relevantIds, latency_ms, tokens_in, tokens_out,
 *   cost_usd, n_retrieved, answerMatch, hops}>} per per-query records
 * @returns {object} the aggregated metrics block
 */
export function aggregate(per) {
  // Mean of a per-query projection, rounded to 4 decimals.
  const meanOf = (project) => round(mean(per.map(project)));
  const recallAt = (k) => meanOf((q) => recallAtK(q.retrievedIds, q.relevantIds, k));

  const latencies = per
    .map((q) => q.latency_ms)
    .filter((x) => typeof x === 'number');
  const answerScores = per
    .map((q) => q.answerMatch)
    .filter((x) => x !== null && x !== undefined);

  return {
    n_queries: per.length,
    recall_at_1: recallAt(1),
    recall_at_5: recallAt(5),
    recall_at_10: recallAt(10),
    precision_at_5: meanOf((q) => precisionAtK(q.retrievedIds, q.relevantIds, 5)),
    mrr: meanOf((q) => reciprocalRank(q.retrievedIds, q.relevantIds)),
    episodes_per_query_mean: meanOf((q) => q.n_retrieved),
    latency_p50_ms: round(percentile(latencies, 50)),
    latency_p95_ms: round(percentile(latencies, 95)),
    // Missing token counts are treated as 0 for the sum.
    tokens_per_query_mean: meanOf((q) => (q.tokens_in || 0) + (q.tokens_out || 0)),
    cost_per_query_usd: round6(mean(per.map((q) => q.cost_usd || 0))),
    answer_match_mean: answerScores.length ? round(mean(answerScores)) : null,
    hops_mean: meanOf((q) => q.hops),
  };
}
|
|
190
|
+
|
|
191
|
+
/**
 * Aggregate per-query records separately for each value of their `dim` field
 * (e.g. question type). Records without a `dim` (null/undefined) are grouped
 * under 'unknown'.
 *
 * This is the diagnostic view: a global aggregate hides WHERE a system fails,
 * the per-dimension breakdown exposes it.
 *
 * @param {Array} per per-query records, each carrying a `dim` field
 * @returns {Object<string, object>} { [dimensionValue]: aggregateBlock }
 */
export function aggregateByDimension(per) {
  const rowsByDim = new Map();
  for (const row of per) {
    const dim = row.dim ?? 'unknown';
    const bucket = rowsByDim.get(dim);
    if (bucket) {
      bucket.push(row);
    } else {
      rowsByDim.set(dim, [row]);
    }
  }

  const result = {};
  rowsByDim.forEach((rows, dim) => {
    result[dim] = aggregate(rows);
  });
  return result;
}
|
|
208
|
+
|
|
209
|
+
// Round `x` to the precision implied by `scale` (10000 -> 4 decimals).
// null / undefined pass through as null so "no value" survives rounding.
function roundTo(x, scale) {
  if (x === null || x === undefined) return null;
  return Math.round(x * scale) / scale;
}

// 4-decimal rounding used for ratio metrics.
function round(x) { return roundTo(x, 10000); }

// 6-decimal rounding used for per-query cost.
function round6(x) { return roundTo(x, 1e6); }
|
|
211
|
+
|
|
212
|
+
/**
 * Bootstrap confidence interval for the mean of per-query numeric scores.
 *
 * Draws `iters` resamples (with replacement, same size as the input), takes
 * the mean of each, and reads the interval bounds off the sorted resample
 * means. The PRNG is seeded via mulberry32, so the result is deterministic
 * for a given seed.
 *
 * @param {number[]} perQuery per-query numeric scores (e.g. 0/1 per query)
 * @param {{ iters?: number, alpha?: number, seed?: number }} opts
 * @returns {{ point: number, lo: number, hi: number }}
 *   point = mean(perQuery); lo / hi are the alpha/2 and 1-alpha/2 percentiles
 *   of the bootstrap distribution. An empty input yields all zeros.
 */
export function bootstrapCI(perQuery, { iters = 1000, alpha = 0.05, seed = 42 } = {}) {
  const n = perQuery.length;
  if (n === 0) return { point: 0, lo: 0, hi: 0 };

  const point = perQuery.reduce((acc, score) => acc + score, 0) / n;
  const rng = mulberry32(seed);

  const resampleMeans = [];
  for (let iter = 0; iter < iters; iter++) {
    let total = 0;
    for (let draw = 0; draw < n; draw++) {
      const pick = Math.floor(rng() * n);
      total += perQuery[pick];
    }
    resampleMeans.push(total / n);
  }
  resampleMeans.sort((a, b) => a - b);

  const lowerIndex = Math.floor((alpha / 2) * iters);
  // The upper index can land one past the end (e.g. alpha = 0), so clamp it.
  const upperIndex = Math.min(Math.floor((1 - alpha / 2) * iters), iters - 1);
  return {
    point,
    lo: resampleMeans[lowerIndex],
    hi: resampleMeans[upperIndex],
  };
}
|
|
245
|
+
|
|
246
|
+
/**
 * Paired McNemar test for two binary result arrays (0/1 per query).
 *
 * Uses the continuity-corrected statistic
 *   χ² = max(0, |b - c| - 1)² / (b + c)
 * The correction is floored at 0: without the floor, a perfectly balanced
 * table (b === c) gives (-1)² / (b + c) > 0 and reports evidence of a
 * difference where there is none.
 *
 * pValue is the χ²(1) survival function, computed as erfc(sqrt(χ² / 2)) with
 * the erfc approximation below (so a statistic of 0 yields a p-value within
 * about 1e-7 of 1 rather than exactly 1).
 *
 * Only indices present in `before` are visited, and only exact 0 / 1 values
 * are counted; any other pair is ignored.
 *
 * @param {number[]} before binary array (0/1 per query)
 * @param {number[]} after  binary array (0/1 per query)
 * @returns {{ b: number, c: number, statistic: number, pValue: number, significant: boolean }}
 *   b = count(before=0, after=1)  (after wins)
 *   c = count(before=1, after=0)  (before wins)
 *   significant = pValue < 0.05
 */
export function mcnemar(before, after) {
  let b = 0; // before=0, after=1
  let c = 0; // before=1, after=0
  for (let i = 0; i < before.length; i++) {
    if (before[i] === 0 && after[i] === 1) b++;
    else if (before[i] === 1 && after[i] === 0) c++;
  }

  const discordant = b + c;
  // No discordant pairs: the test is undefined, report "no difference".
  if (discordant === 0) return { b, c, statistic: 0, pValue: 1, significant: false };

  // Floor the corrected difference at 0 so b === c yields a statistic of 0.
  const corrected = Math.max(0, Math.abs(b - c) - 1);
  const statistic = (corrected * corrected) / discordant;

  // χ²(1) survival function: P(χ² > x) = erfc(sqrt(x / 2)).
  const pValue = erfcApprox(Math.sqrt(statistic / 2));

  return { b, c, statistic, pValue, significant: pValue < 0.05 };
}

/**
 * Complementary error function approximation used for the χ²(1) p-value.
 * Abramowitz & Stegun 7.1.26 rational approximation (max |err| < 1.5e-7).
 * Negative arguments are handled through erfc(-x) = 2 - erfc(x).
 *
 * @param {number} x
 * @returns {number}
 */
function erfcApprox(x) {
  if (x < 0) return 2 - erfcApprox(-x);
  const t = 1 / (1 + 0.3275911 * x);
  const poly = t * (0.254829592 + t * (-0.284496736 + t * (1.421413741 + t * (-1.453152027 + t * 1.061405429))));
  return poly * Math.exp(-x * x);
}
|
package/src/memory/benchmark.js
CHANGED
|
@@ -115,7 +115,7 @@ function mean(values) {
|
|
|
115
115
|
|
|
116
116
|
// Deterministic PRNG (mulberry32) so the synthetic corpus is reproducible
|
|
117
117
|
// across runs + machines. Same seed => same docs/queries/gold-mapping.
|
|
118
|
-
function mulberry32(seed) {
|
|
118
|
+
export function mulberry32(seed) {
|
|
119
119
|
let a = seed >>> 0;
|
|
120
120
|
return function() {
|
|
121
121
|
a = (a + 0x6d2b79f5) >>> 0;
|
package/src/memory/search.js
CHANGED
|
@@ -45,6 +45,53 @@ const DB_FILENAME = 'memory.db';
|
|
|
45
45
|
const INDEX_DIR_NAME = 'index';
|
|
46
46
|
const IJFW_DIR_NAME = '.ijfw';
|
|
47
47
|
|
|
48
|
+
// --- W1.3 (v1.6.0): natural-language OR-query construction ------------------
|
|
49
|
+
//
|
|
50
|
+
// FTS5 treats a space-separated MATCH as implicit AND -- every token must
|
|
51
|
+
// co-occur in one indexed entry. A real natural-language recall ("what
|
|
52
|
+
// database did we pick for the auth service") almost never has all its tokens
|
|
53
|
+
// in a single entry, so the implicit-AND query starves and retrieves nothing.
|
|
54
|
+
// expandQuery() only OR-groups *synonyms* ("(db OR database) AND user"); the
|
|
55
|
+
// inter-token relation stays AND. The fix (proven by the v1.6.0 bench harness)
|
|
56
|
+
// is to OR the salient terms: drop stopwords + sub-3-char tokens, dedup, fold
|
|
57
|
+
// each surviving token's synonym group in, and OR-join. Single-token and
|
|
58
|
+
// exact-phrase queries are unaffected (one quoted term / one OR-group).
|
|
59
|
+
// Lower-case words that carry no retrieval signal in a natural-language query.
// Grouped by role for readability; membership is what matters, not order.
const FTS_STOPWORDS = new Set([
  // articles, conjunctions, prepositions, copulas
  'the', 'and', 'for', 'are', 'was', 'were', 'with', 'that', 'this', 'from',
  // interrogatives
  'who', 'what', 'when', 'where', 'which', 'whom', 'whose', 'why', 'how',
  // auxiliaries and comparatives
  'did', 'does', 'has', 'had', 'have', 'been', 'being', 'into', 'than',
  // filler adverbs / determiners
  'same', 'both', 'also', 'about', 'between', 'their', 'they', 'them',
  // pronouns
  'his', 'her', 'its', 'our', 'your', 'you', 'she', 'him',
]);
|
|
66
|
+
|
|
67
|
+
// Reduce a query to ASCII letters, digits and underscores separated by single
// spaces: every other character becomes a separator, runs of whitespace are
// collapsed, and leading/trailing whitespace is dropped. Non-string input
// yields ''. (Per the original note this mirrors the bench harness's
// sanitiser and is inlined to keep the search path uncoupled from bench code.)
function sanitizeFtsQuery(q) {
  if (typeof q !== 'string') return '';
  const separated = q.replace(/[^a-zA-Z0-9_\s]/g, ' ');
  // Splitting on whitespace runs and dropping empties == collapse + trim.
  return separated.split(/\s+/).filter(Boolean).join(' ');
}
|
|
74
|
+
|
|
75
|
+
// Build an OR-of-salient-terms FTS5 query from a natural-language string.
//
// The sanitised query is split into tokens; a token is kept when it has at
// least 3 characters, is not a stopword and has not been seen before (all
// compared case-insensitively). Each kept token is passed through expandQuery:
// when an expansion applies its expanded form is used, otherwise the token is
// emitted as a double-quoted literal. The pieces are joined with ' OR '.
//
// Returns '' when nothing salient survives, so the caller can fall back to
// another query form.
function buildOrQuery(q) {
  const cleaned = sanitizeFtsQuery(q);
  if (!cleaned) return '';

  const seenLower = new Set();
  const clauses = [];
  cleaned.split(/\s+/).forEach((word) => {
    const key = word.toLowerCase();
    const salient = key.length >= 3 && !FTS_STOPWORDS.has(key) && !seenLower.has(key);
    if (!salient) return;
    seenLower.add(key);

    // The original-case token is what gets expanded / quoted.
    const { expanded, applied } = expandQuery(word);
    clauses.push(applied ? expanded : `"${word}"`);
  });
  return clauses.join(' OR ');
}
|
|
94
|
+
|
|
48
95
|
// --- Driver bootstrap (top-level await; resolves once at module load) -----
|
|
49
96
|
|
|
50
97
|
let DRIVER = null;
|
|
@@ -524,7 +571,12 @@ export function searchMemory(q, files, limit = MAX_RESULTS, options) {
|
|
|
524
571
|
if (rowCount(db) === 0 && files.length > 0) {
|
|
525
572
|
autoIndex(db, files);
|
|
526
573
|
}
|
|
527
|
-
|
|
574
|
+
// W1.3: OR the salient terms so NL queries don't starve under FTS5's
|
|
575
|
+
// implicit AND. Falls back to the synonym-expanded (or raw) query when
|
|
576
|
+
// no salient term survives. Final catch retries the raw query so a
|
|
577
|
+
// malformed rewrite can never regress to fewer results than today.
|
|
578
|
+
const orQuery = buildOrQuery(q);
|
|
579
|
+
const ftsQuery = orQuery || (applied ? expanded : q);
|
|
528
580
|
let rows;
|
|
529
581
|
try {
|
|
530
582
|
rows = searchFts5(db, ftsQuery, limit, tier_semantic, include_stale);
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* surfaced in the `ijfw-plan-check` skill as the deterministic pre-dispatch
|
|
8
8
|
* gate.
|
|
9
9
|
*
|
|
10
|
-
* Distilled from
|
|
10
|
+
* Distilled from the gsd-plan-checker agent definition: extracts
|
|
11
11
|
* the mechanically-checkable rules (the prose-reasoning ones stay in the skill).
|
|
12
12
|
*
|
|
13
13
|
* No I/O, no network — operates on plan text passed in by caller.
|