ruvector-mragent 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Darwin Mode scorePolicy for the MRAgent graph-memory harness (ADR-269).
3
+ *
4
+ * `metaharness evolve` calls scoreVariant() after each mutation of the harness
5
+ * source (agent/harness.mjs). It evaluates the CURRENT genome over the frozen
6
+ * Cue-Tag-Content corpus and returns a fitness in [0, 1]:
7
+ *
8
+ * score = 0.40 × accuracy (helpfulness on answerable tasks)
9
+ * + 0.30 × riskScore (calibration: abstain, don't hallucinate)
10
+ * + 0.12 × (1 − avgLatencyMs / BASE_LATENCY).clamp(0,1)
11
+ * + 0.10 × (1 − avgContext / BASE_CONTEXT).clamp(0,1)
12
+ * + 0.08 × (1 − avgHops / BASE_HOPS).clamp(0,1)
13
+ *
14
+ * Helpfulness AND calibration both dominate (a confident wrong answer is worse
15
+ * than an honest abstention); the rest rewards cheaper reconstruction — the
16
+ * MRAgent "prune irrelevant paths" objective.
17
+ *
18
+ * This mirrors crates/ruvector-sota-bench/harness/scorePolicy.ts so the same
19
+ * Darwin tooling drives both the ANN benchmark and the MRAgent harness.
20
+ */
21
+
22
+ import * as fs from "node:fs";
23
+ import * as path from "node:path";
24
+ import { fileURLToPath } from "node:url";
25
+
26
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
27
+ const ROOT = path.resolve(__dirname, "..");
28
+
29
+ // Baselines (the shipped baselineGenome() over this corpus).
30
+ const BASE_LATENCY = 4.0;
31
+ const BASE_CONTEXT = 6.0;
32
+ const BASE_HOPS = 2.0;
33
+
34
/**
 * Aggregate metrics for one harness evaluation, as consumed by `fitness`.
 * Values are taken from the result of `harness.evaluate()`; only the fields
 * this file declares are listed.
 */
interface Metrics {
  /** Accuracy term (helpfulness on answerable tasks); weighted 0.40. */
  accuracy: number;
  /** Calibration term (abstain rather than hallucinate); weighted 0.30. */
  riskScore: number;
  /** Mean latency in milliseconds; compared against BASE_LATENCY. */
  avgLatencyMs: number;
  /** Mean hop count; compared against BASE_HOPS. */
  avgHops: number;
  /** Mean context size; compared against BASE_CONTEXT. */
  avgContext: number;
  /** Not read by `fitness`; presumably the number of evaluated tasks — TODO confirm. */
  n: number;
}
42
+
43
/**
 * Collapse evaluation metrics into the scalar Darwin fitness:
 * 0.40 × accuracy + 0.30 × riskScore, plus three cost-saving terms
 * (latency 0.12, context 0.10, hops 0.08) measured against the baselines.
 *
 * @param m Metrics of the variant being scored.
 * @returns The weighted sum; each cost-saving term is limited to [0, 1].
 */
function fitness(m: Metrics): number {
  // Fraction of the baseline cost that was saved, limited to [0, 1].
  const saving = (cost: number, baseline: number): number =>
    Math.max(0, Math.min(1, 1 - cost / baseline));

  const latencySaving = saving(m.avgLatencyMs, BASE_LATENCY);
  const contextSaving = saving(m.avgContext, BASE_CONTEXT);
  const hopSaving = saving(m.avgHops, BASE_HOPS);

  // Quality terms first, then the cost terms, added left to right.
  const quality = 0.4 * m.accuracy + 0.3 * m.riskScore;
  return quality + 0.12 * latencySaving + 0.1 * contextSaving + 0.08 * hopSaving;
}
49
+
50
/**
 * Score the current working-tree harness variant. Darwin mutates the genome
 * defaults inside agent/harness.mjs; every call re-imports that module under a
 * unique file:// URL so the ESM module cache cannot hand back a stale genome.
 *
 * @returns A fitness in [0, 1]. Any failure — import error, unreadable corpus,
 *          an exception during evaluation, or a non-finite fitness — scores 0
 *          (a broken mutation is unfit).
 */
export async function scoreVariant(): Promise<number> {
  try {
    // A bare absolute path is not a valid ESM specifier on every platform
    // (Windows drive letters are parsed as URL schemes), so import by file URL.
    const { pathToFileURL } = await import("node:url");
    const harnessUrl = pathToFileURL(path.join(ROOT, "agent", "harness.mjs"));
    // Modules are cached per URL; a unique query string forces re-evaluation
    // so a genome mutated on disk between two calls is actually picked up.
    harnessUrl.searchParams.set("v", `${Date.now()}-${Math.random().toString(36).slice(2)}`);
    const harness = await import(harnessUrl.href);

    const corpus = JSON.parse(
      fs.readFileSync(path.join(ROOT, "data", "eval-set.json"), "utf8"),
    );
    const store = new harness.MemoryStore(corpus.tasks);
    const metrics: Metrics = harness.evaluate(
      harness.baselineGenome(),
      store,
      corpus.tasks,
    );

    const score = fitness(metrics);
    // Math.max/Math.min propagate NaN, so a NaN metric would otherwise escape
    // the clamp below and violate the documented [0, 1] range.
    if (!Number.isFinite(score)) {
      throw new Error(`non-finite fitness (${score})`);
    }
    return Math.max(0, Math.min(1, score));
  } catch (e: unknown) {
    // Anything can be thrown; only Error instances carry a usable message.
    const reason = e instanceof Error ? e.message : String(e);
    console.error("[scorePolicy] variant failed to evaluate:", reason);
    return 0;
  }
}
73
+
74
+ export { fitness };
75
+ export default scoreVariant;
package/optimize.mjs ADDED
@@ -0,0 +1,304 @@
1
+ // MRAgent harness optimizer — Darwin Mode for graph-memory reconstruction.
2
+ //
3
+ // Principle (Meta-Harness / @metaharness/darwin): "freeze the model, evolve the
4
+ // harness." FROZEN MODEL = the RuVector Cue-Tag-Content memory substrate
5
+ // (agent/memory.mjs). EVOLVED HARNESS = the reconstruction genome in
6
+ // agent/harness.mjs (cue-k, efSearch, RRF alpha, traversal depth, fan-out, prune
7
+ // threshold, content limit, GNN rerank, prompt strategy).
8
+ //
9
+ // We use Darwin's `mapLimit` (bounded-concurrency evaluation) and `paretoFront`
10
+ // (multi-objective selection) when @metaharness/darwin is installed, and fall
11
+ // back to an equivalent in-process loop when it is not (ADR-150 invariant 3:
12
+ // graceful degradation — MODULE_NOT_FOUND must never crash the example).
13
+ //
14
+ // Run: npm run optimize
15
+
16
+ import fs from "node:fs";
17
+ import path from "node:path";
18
+ import { fileURLToPath } from "node:url";
19
+ import { MemoryStore, baselineGenome, mutate, evaluate, splitByClass, kFoldByClass, runReasoningLoop } from "./agent/harness.mjs";
20
+ import { consolidate } from "./agent/consolidate.mjs";
21
+ import { detectEndpoint, llmProposeGenomes } from "./agent/llmMutator.mjs";
22
+
23
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
24
+
25
+ // ── ADR-150 graceful degradation: optional Darwin primitives ────────────────
26
/**
 * Resolve the evolution primitives, preferring @metaharness/darwin when it is
 * installed and usable, and degrading to the in-file stand-ins otherwise.
 *
 * @returns {Promise<{mapLimit: Function, paretoFront: Function, available: boolean}>}
 *          `available` is true only when both primitives come from the package.
 * @throws Re-throws any import failure other than "module not found".
 */
async function loadDarwin() {
  const fallback = () => ({ mapLimit: localMapLimit, paretoFront: localParetoFront, available: false });

  let d;
  try {
    d = await import("@metaharness/darwin");
  } catch (e) {
    // Only a missing package is an expected condition; anything else is a real error.
    const code = e?.code;
    if (code !== "ERR_MODULE_NOT_FOUND" && code !== "MODULE_NOT_FOUND") throw e;
    console.warn("[darwin] @metaharness/darwin not installed — using built-in evolution loop");
    return fallback();
  }

  // An installed version without these exports would otherwise hand back
  // `undefined` primitives and crash later, at the first mapLimit call.
  if (typeof d.mapLimit !== "function" || typeof d.paretoFront !== "function") {
    console.warn("[darwin] @metaharness/darwin lacks mapLimit/paretoFront — using built-in evolution loop");
    return fallback();
  }

  console.log("[darwin] @metaharness/darwin loaded — using mapLimit + paretoFront");
  return { mapLimit: d.mapLimit, paretoFront: d.paretoFront, available: true };
}
37
+
38
+ // Minimal local stand-ins (identical contracts to the Darwin exports).
39
+ async function localMapLimit(items, _limit, fn) {
40
+ const out = [];
41
+ for (let i = 0; i < items.length; i++) out.push(await fn(items[i], i));
42
+ return out;
43
+ }
44
/**
 * Return the non-dominated subset of `items`.
 *
 * @param items Candidates to filter.
 * @param objFn Maps a candidate to its objective vector (every component is
 *              maximised).
 * @returns The candidates that no other candidate dominates, in input order.
 */
function localParetoFront(items, objFn) {
  const vectors = items.map(objFn);
  const front = [];
  for (let i = 0; i < items.length; i++) {
    // Stop scanning rivals as soon as one dominates candidate i.
    let beaten = false;
    for (let j = 0; j < items.length && !beaten; j++) {
      beaten = j !== i && dominates(vectors[j], vectors[i]);
    }
    if (!beaten) front.push(items[i]);
  }
  return front;
}
49
/**
 * Pareto dominance test for maximised objective vectors.
 *
 * @param a Candidate objective vector.
 * @param b Rival objective vector.
 * @returns true when `a` is at least as good as `b` in every component of `a`
 *          and strictly better in at least one.
 */
function dominates(a, b) {
  let hasStrictGain = false;
  for (let k = 0; k < a.length; k++) {
    const mine = a[k];
    const theirs = b[k];
    // Any worse component rules dominance out immediately.
    if (mine < theirs) return false;
    hasStrictGain = hasStrictGain || mine > theirs;
  }
  return hasStrictGain;
}
57
+
58
+ // ── Scoring — the Darwin fitness (see harness/scorePolicy.ts for the canonical
59
+ // version used by `metaharness evolve`). Helpfulness (accuracy) AND calibration
60
+ // (risk-adjusted utility — abstain instead of hallucinate) both dominate;
61
+ // reconstruction cost (latency, hops, context) is penalised vs the baseline. ──
62
+ const BASE = { latency: 4.0, hops: 2.0, context: 6.0 };
63
/**
 * Scalar Darwin fitness of a metrics object: 0.40 × accuracy + 0.30 ×
 * riskScore plus cost-saving terms for latency (0.12), context (0.10) and
 * hops (0.08), each measured against BASE.
 *
 * @param m Metrics with accuracy, riskScore, avgLatencyMs, avgHops, avgContext.
 * @returns The weighted sum.
 */
function scalar(m) {
  // Clamp each saving to [0, 1] on BOTH sides so this stays numerically
  // identical to the canonical fitness() in harness/scorePolicy.ts.
  const saving = (cost, base) => Math.max(0, Math.min(1, 1 - cost / base));
  const latTerm = saving(m.avgLatencyMs, BASE.latency);
  const hopTerm = saving(m.avgHops, BASE.hops);
  const ctxTerm = saving(m.avgContext, BASE.context);
  return 0.40 * m.accuracy + 0.30 * m.riskScore + 0.12 * latTerm + 0.10 * ctxTerm + 0.08 * hopTerm;
}
69
+ // Pareto maximises every component (negate minimised objectives).
70
/**
 * Objective vector for Pareto selection. Every component is maximised, so the
 * three cost metrics are negated.
 *
 * @param m Metrics object.
 * @returns [accuracy, riskScore, -avgLatencyMs, -avgHops, -avgContext]
 */
function objectives(m) {
  const { accuracy, riskScore, avgLatencyMs, avgHops, avgContext } = m;
  return [accuracy, riskScore, -avgLatencyMs, -avgHops, -avgContext];
}
73
+
74
+ // ── Run ─────────────────────────────────────────────────────────────────────
75
+ const { mapLimit, paretoFront, available } = await loadDarwin();
76
+
77
+ // ── GPU LLM write-layer (opt-in): a local code model proposes genome leaps from
78
+ // failure traces, the directed-search layer the random GA lacks (ADR-260).
79
+ // Disabled with MRAGENT_LLM=off; otherwise auto-detects a local endpoint. ──
80
+ const llm = process.env.MRAGENT_LLM === "off" ? null : await detectEndpoint();
81
+ if (llm) console.log(`[llm] GPU write-layer: ${llm.model} @ ${llm.url}`);
82
+ else console.log("[llm] no local LLM endpoint — GA-only (set MRAGENT_LLM_URL to enable)");
83
+ let llmProposed = 0, llmEnteredElite = 0;
84
+
85
+ const corpus = JSON.parse(fs.readFileSync(path.join(__dirname, "data", "eval-set.json"), "utf8"));
86
+ const tasks = corpus.tasks;
87
+ // ONE memory holds all nodes (full cross-task cue competition); we evolve on the
88
+ // TRAIN queries only and report held-out TEST to prove the genome generalizes.
89
+ const store = new MemoryStore(tasks);
90
+ const { train, test } = splitByClass(tasks, 0.6);
91
+ const folds = kFoldByClass(train, 3); // cross-validation folds over the train pool
92
+
93
+ // Cross-validated fitness: mean fold score MINUS the fold range. The penalty
94
+ // rejects genomes that win on one fold but collapse on another (e.g. a knife-edge
95
+ // abstainThreshold), which is exactly the overfit a single split hides.
96
/**
 * Cross-validated fitness of a genome over the train folds.
 *
 * @param genome Genome to evaluate.
 * @returns Mean fold score minus half the spread between the best and worst fold.
 */
function cvScore(genome) {
  const foldScores = [];
  for (const fold of folds) {
    foldScores.push(scalar(evaluate(genome, store, fold)));
  }
  const total = foldScores.reduce((sum, s) => sum + s, 0);
  const spread = Math.max(...foldScores) - Math.min(...foldScores);
  // The spread penalty rejects genomes that win one fold but collapse on another.
  return total / foldScores.length - 0.5 * spread;
}
102
+
103
+ // Compact failure trace for the LLM write-layer: tasks the genome gets wrong /
104
+ // hallucinates, with its confidence (so the model can reason about thresholds).
105
/**
 * Build a compact, newline-separated list of the genome's failures.
 *
 * @param genome Genome whose behaviour is traced.
 * @param tasks  Tasks to run, in order.
 * @param limit  Maximum number of failure lines to collect (default 6).
 * @returns One line per failure, or the string "none" when nothing failed.
 */
function failureTraces(genome, tasks, limit = 6) {
  const lines = [];
  for (const task of tasks) {
    if (lines.length >= limit) break;
    const result = runReasoningLoop(store.queryText(task.id), store, genome, task);

    // Classify the outcome; null means the task was handled correctly.
    let verdict = null;
    if (task.answerable === false) {
      if (!result.abstained) verdict = "hallucinated-on-unanswerable";
    } else if (!result.correct) {
      verdict = result.abstained ? "abstained-on-answerable" : "wrong";
    }

    if (verdict !== null) {
      lines.push(`${task.id}[${task.class ?? "?"}]: ${verdict} conf=${result.confidence.toFixed(2)}`);
    }
  }
  return lines.join("\n") || "none";
}
119
+ const llmGenomes = []; // every coerced LLM proposal, for end-of-run attribution
120
+
121
+ const POP = 16, GENERATIONS = 12, ELITE = 5, CONCURRENCY = 4;
122
+ const baseline = baselineGenome();
123
+ const baseMetrics = evaluate(baseline, store, train);
124
+
125
+ let population = [baseline, ...Array.from({ length: POP - 1 }, () => mutate(baseline))];
126
+ let best = { genome: baseline, metrics: baseMetrics, score: cvScore(baseline) };
127
+ const archive = [];
128
+ const history = [];
129
+
130
+ console.log("== MRAgent · Darwin harness optimizer (v2 — beyond MRAgent) ==");
131
+ console.log(`frozen model: RuVector Cue-Tag-Content graph (${tasks.length} tasks) | train ${train.length} / test ${test.length} (held out)`);
132
+ console.log(`baseline (train): acc ${(baseMetrics.accuracy * 100).toFixed(1)}% risk ${baseMetrics.riskScore.toFixed(3)} halluc ${baseMetrics.hallucinationRate.toFixed(2)}\n`);
133
+
134
+ for (let gen = 0; gen < GENERATIONS; gen++) {
135
+ const scored = await mapLimit(population, CONCURRENCY, async (genome) => {
136
+ const metrics = evaluate(genome, store, train);
137
+ return { genome, metrics, score: cvScore(genome) };
138
+ });
139
+ archive.push(...scored);
140
+
141
+ const front = paretoFront(scored, (e) => objectives(e.metrics));
142
+ const winner = scored.reduce((a, b) => (b.score > a.score ? b : a));
143
+ if (winner.score > best.score) best = winner;
144
+
145
+ history.push({
146
+ gen,
147
+ best: { accuracy: winner.metrics.accuracy, avgLatencyMs: winner.metrics.avgLatencyMs, score: winner.score },
148
+ frontSize: front.length,
149
+ });
150
+ console.log(
151
+ `gen ${gen}: acc ${(winner.metrics.accuracy * 100).toFixed(1)}% risk ${winner.metrics.riskScore.toFixed(3)} ` +
152
+ `halluc ${winner.metrics.hallucinationRate.toFixed(2)} lat ${winner.metrics.avgLatencyMs.toFixed(2)}ms hops ${winner.metrics.avgHops.toFixed(2)} ` +
153
+ `score ${winner.score.toFixed(4)} · pareto ${front.length}`
154
+ );
155
+
156
+ // Next generation: elites + mutated children + a couple of random restarts to
157
+ // keep diversity (when the opt-in LLM write-layer is off, nothing else proposes leaps).
158
+ const elites = [...scored].sort((a, b) => b.score - a.score).slice(0, ELITE).map((e) => e.genome);
159
+ const next = [...elites];
160
+
161
+ // GPU LLM write-layer: every 3rd generation, ask the local code model for
162
+ // directed genome leaps from the current winner's failure traces. Proposals
163
+ // are bounds-clamped in llmMutator, so they can only ever be safe genomes.
164
+ if (llm && gen % 3 === 0) {
165
+ const traces = failureTraces(winner.genome, train);
166
+ const props = await llmProposeGenomes({ url: llm.url, model: llm.model, baseline, current: winner.genome, failures: traces, n: 2 });
167
+ for (const g of props) {
168
+ llmGenomes.push(g);
169
+ llmProposed++;
170
+ if (next.length < POP) next.push(g);
171
+ }
172
+ if (props.length) console.log(` [llm] gen ${gen}: +${props.length} GPU-proposed genome(s) injected`);
173
+ }
174
+
175
+ const RESTARTS = 2;
176
+ for (let r = 0; r < RESTARTS && next.length < POP; r++) {
177
+ let g = baseline;
178
+ for (let m = 0; m < 6; m++) g = mutate(g); // heavy random walk
179
+ next.push(g);
180
+ }
181
+ while (next.length < POP) next.push(mutate(elites[Math.floor(Math.random() * elites.length)]));
182
+ population = next;
183
+ }
184
+
185
+ // Fold GPU-proposed genomes into the archive so they compete in polish +
186
+ // acceptance on equal footing with GA candidates.
187
// Score every LLM proposal, add it to the archive, and attribute wins.
let llmBest = -Infinity;
for (const g of llmGenomes) {
  const e = { genome: g, metrics: evaluate(g, store, train), score: cvScore(g) };
  archive.push(e);
  if (e.score > llmBest) llmBest = e.score;
  if (e.score > best.score) {
    best = e;
    llmEnteredElite++;
  } else if (best.genome === g) {
    // Injected proposals are scored inside the generation loop, so a winning
    // proposal is normally ALREADY `best` by the time we get here and the
    // strict comparison above cannot fire. Attribute it by genome identity.
    llmEnteredElite++;
  }
}
if (llm) {
  console.log(`\n[llm] GPU write-layer: ${llmProposed} genome(s) proposed, best cv-score ${llmBest > -Infinity ? llmBest.toFixed(4) : "n/a"}${llmEnteredElite ? `, became GA-best ${llmEnteredElite}×` : ""}`);
}
197
+
198
+ // ── Memetic polish — deterministic coordinate descent over each gene ─────────
199
+ // The GA explores broadly but the LLM-free fallback struggles with NARROW optima
200
+ // (e.g. the abstainThreshold band that catches hallucinations without abstaining
201
+ // on correct answers). A final hill-climb over a per-gene candidate grid finds
202
+ // them reliably and makes the shipped result reproducible. (The real Darwin
203
+ // write-layer proposes such leaps directly from failure traces — ADR-260.)
204
+ const GRID = {
205
+ cueK: [1, 2, 3, 4, 6, 8],
206
+ efSearch: [16, 24, 32, 48, 64, 96, 128],
207
+ hybridAlpha: [0, 0.2, 0.35, 0.5, 0.65, 0.8, 1],
208
+ fusion: ["rrf", "linear", "dbsf"],
209
+ traversalDepth: [1, 2, 3, 4],
210
+ tagFanout: [1, 2, 3, 4, 6, 8],
211
+ pruneThreshold: [0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4],
212
+ maxContent: [1, 2, 3, 4, 6, 8, 12],
213
+ haltConfidence: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
214
+ rerank: ["gnn", "none"],
215
+ promptStrategy: ["terse", "evidence-first", "prune-explicit"],
216
+ abstainThreshold: [0, 0.1, 0.2, 0.3, 0.34, 0.36, 0.38, 0.4, 0.45, 0.5],
217
+ };
218
/**
 * Greedy coordinate descent over the per-gene candidate GRID.
 *
 * Runs at most three passes; each pass tries every grid value of every gene
 * and keeps a change only if it raises the cross-validated score by more than
 * 1e-9. Stops early after a pass that changed nothing.
 *
 * @param genome Starting genome (not mutated; a shallow copy is refined).
 * @returns {{genome: object, score: number}} The refined genome and its cvScore.
 */
function localPolish(genome) {
  const MAX_PASSES = 3;
  let incumbent = { ...genome };
  let incumbentScore = cvScore(incumbent); // cross-validated over train folds — never see test

  for (let pass = 0; pass < MAX_PASSES; pass++) {
    let movedThisPass = false;
    for (const gene of Object.keys(GRID)) {
      for (const value of GRID[gene]) {
        if (incumbent[gene] === value) continue;
        const trial = { ...incumbent, [gene]: value };
        const trialScore = cvScore(trial);
        if (trialScore > incumbentScore + 1e-9) {
          incumbent = trial;
          incumbentScore = trialScore;
          movedThisPass = true;
        }
      }
    }
    if (!movedThisPass) break;
  }
  return { genome: incumbent, score: incumbentScore };
}
235
+ // Multi-start polish: greedy coordinate descent is start-dependent, so refine from
236
+ // several diverse seeds (GA winner + baseline + top archive elites) and keep the
237
+ // global best. This makes the calibrated optimum reproducible across runs.
238
// Seed list: GA winner, baseline, then the best archive genomes — up to six
// DISTINCT genomes. Elites are re-scored and re-archived every generation, so
// the top of the archive is usually the same genome repeated; without
// de-duplication the "multi-start" would polish one start several times.
const seeds = [];
const seedKeys = new Set();
const rankedArchive = [...archive].sort((a, b) => b.score - a.score).map((e) => e.genome);
for (const candidate of [best.genome, baseline, ...rankedArchive]) {
  if (seeds.length >= 6) break;
  const key = JSON.stringify(candidate); // genomes are serialised into the report the same way
  if (seedKeys.has(key)) continue;
  seedKeys.add(key);
  seeds.push(candidate);
}
for (const seed of seeds) {
  const polished = localPolish(seed);
  if (polished.score > best.score) best = { genome: polished.genome, metrics: evaluate(polished.genome, store, train), score: polished.score };
}
console.log(`\n[polish] multi-start coordinate-descent (train) → score ${best.score.toFixed(4)} (acc ${(best.metrics.accuracy * 100).toFixed(1)}% risk ${best.metrics.riskScore.toFixed(3)} halluc ${best.metrics.hallucinationRate.toFixed(2)})`);
244
+
245
+ // ── Acceptance gate over the whole archive ──────────────────────────────────
246
/**
 * Acceptance gate relative to the baseline's train metrics.
 *
 * @param m Metrics of the candidate.
 * @returns {{accGain: number, riskGain: number, noRegress: boolean, passed: boolean}}
 *          `noRegress` — neither accuracy nor riskScore fell (1e-9 tolerance);
 *          `passed` — no regression and a gain of at least 0.04 in one of them.
 */
const gate = (m) => {
  const TOLERANCE = 1e-9;
  const MIN_GAIN = 0.04;

  const accGain = m.accuracy - baseMetrics.accuracy;
  const riskGain = m.riskScore - baseMetrics.riskScore;

  const accuracyHeld = m.accuracy >= baseMetrics.accuracy - TOLERANCE;
  const riskHeld = m.riskScore >= baseMetrics.riskScore - TOLERANCE;
  const noRegress = accuracyHeld && riskHeld;

  const gainedEnough = accGain >= MIN_GAIN || riskGain >= MIN_GAIN;
  const passed = noRegress && gainedEnough;
  return { accGain, riskGain, noRegress, passed };
};
252
+ const passers = [best, ...archive]
253
+ .map((e) => ({ e, g: gate(e.metrics) }))
254
+ .filter((x) => x.g.passed)
255
+ .sort((a, b) => (b.e.score - a.e.score));
256
+ const accepted = passers[0]?.e ?? best;
257
+ const acc = gate(accepted.metrics);
258
+
259
+ console.log("\n-- acceptance gate (over archive) --");
260
+ console.log(`candidates evaluated: ${archive.length} | gate-passing: ${passers.length}`);
261
+ console.log(`accepted: acc ${(accepted.metrics.accuracy * 100).toFixed(1)}% (${acc.accGain >= 0 ? "+" : ""}${(acc.accGain * 100).toFixed(1)}pt) · risk ${accepted.metrics.riskScore.toFixed(3)} (${acc.riskGain >= 0 ? "+" : ""}${acc.riskGain.toFixed(3)}) · halluc ${accepted.metrics.hallucinationRate.toFixed(2)}`);
262
+ console.log(passers.length ? "PASS — Pareto-superior harness found (freeze model, evolve harness)" : "no gate-passing variant this run");
263
+
264
+ // ── Generalization: held-out TEST split (never seen during evolution) ────────
265
+ // Generalization criterion = does evolving on TRAIN improve UNSEEN test? (not an
266
+ // absolute accuracy bar — the synthetic toy embedding has per-instance noise, and
267
+ // a single global hybridAlpha cannot perfectly serve both dense- and sparse-keyed
268
+ // queries; the question that matters is whether optimization transfers.)
269
// Evaluate baseline and accepted genome on the held-out test split and report
// whether the train-time improvement transfers.
const baseTest = evaluate(baseline, store, test);
const evoTest = evaluate(accepted.genome, store, test);
// Transfer criterion: at least +0.10 accuracy with no increase in hallucination.
const generalizes = evoTest.accuracy >= baseTest.accuracy + 0.10 && evoTest.hallucinationRate <= baseTest.hallucinationRate + 1e-9;
const testAccGainPt = (evoTest.accuracy - baseTest.accuracy) * 100;
const testRiskGain = evoTest.riskScore - baseTest.riskScore;
console.log("\n-- generalization (held-out test split, never seen in evolution) --");
console.log(`baseline test: acc ${(baseTest.accuracy * 100).toFixed(1)}% risk ${baseTest.riskScore.toFixed(3)} halluc ${baseTest.hallucinationRate.toFixed(2)}`);
console.log(`evolved test: acc ${(evoTest.accuracy * 100).toFixed(1)}% risk ${evoTest.riskScore.toFixed(3)} halluc ${evoTest.hallucinationRate.toFixed(2)}`);
// Emit "+" only for non-negative deltas; a regression must not print as "+-3.0pt".
console.log(`gain: ${testAccGainPt >= 0 ? "+" : ""}${testAccGainPt.toFixed(1)}pt acc, ${testRiskGain >= 0 ? "+" : ""}${testRiskGain.toFixed(3)} risk on unseen tasks`);
console.log(generalizes ? "GENERALIZES — evolution transfers to unseen tasks (not overfit)" : "WARNING — evolved genome does not transfer");
277
+
278
+ // ── Replay/consolidation pass on the accepted genome (self-reorganizing memory) ─
279
// Evaluate the accepted genome on a fresh store before and after consolidate()
// and report the change in average hops.
const memAfter = new MemoryStore(tasks);
const evoMetricsPre = evaluate(accepted.genome, memAfter, tasks);
const consolidation = consolidate(memAfter, tasks, accepted.genome);
const evoMetricsPost = evaluate(accepted.genome, memAfter, tasks);
// Guard the percentage: a zero pre-consolidation hop count would print "NaN%".
const hopsSavedPct = evoMetricsPre.avgHops > 0
  ? ((evoMetricsPre.avgHops - evoMetricsPost.avgHops) / evoMetricsPre.avgHops) * 100
  : 0;
console.log(`\n-- consolidation (replay) on accepted genome --`);
console.log(`shortcuts laid: ${consolidation.consolidated} | avgHops ${evoMetricsPre.avgHops.toFixed(3)} -> ${evoMetricsPost.avgHops.toFixed(3)} (${hopsSavedPct.toFixed(1)}% fewer) at acc ${(evoMetricsPost.accuracy * 100).toFixed(1)}%`);
285
+
286
+ const report = {
287
+ tool: "metaharness/darwin",
288
+ philosophy: "freeze the model, evolve the harness",
289
+ frozenModel: "RuVector Cue-Tag-Content graph memory (agent/memory.mjs)",
290
+ darwinAvailable: available,
291
+ primitivesUsed: ["mapLimit", "paretoFront"],
292
+ gpuWriteLayer: llm
293
+ ? { endpoint: llm.url, model: llm.model, proposed: llmProposed, bestCvScore: llmBest > -Infinity ? llmBest : null, becameBest: llmEnteredElite }
294
+ : { enabled: false },
295
+ split: { train: train.length, test: test.length },
296
+ baseline: { trainMetrics: baseMetrics, testMetrics: baseTest },
297
+ evolved: { genome: accepted.genome, trainMetrics: accepted.metrics, testMetrics: evoTest, score: accepted.score },
298
+ generalizes,
299
+ consolidation: { shortcuts: consolidation.consolidated, avgHopsBefore: evoMetricsPre.avgHops, avgHopsAfter: evoMetricsPost.avgHops, metricsAfter: evoMetricsPost },
300
+ acceptance: acc,
301
+ history,
302
+ };
303
+ fs.writeFileSync(path.join(__dirname, "optimize.report.json"), JSON.stringify(report, null, 2));
304
+ console.log(`\nreport -> ${path.join(__dirname, "optimize.report.json")}`);
package/package.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "name": "ruvector-mragent",
3
+ "version": "0.1.0",
4
+ "private": false,
5
+ "type": "module",
6
+ "description": "MRAgent — Cue-Tag-Content graph memory over RuVector, with a Meta-Harness Darwin loop that evolves the reconstruction harness (freeze the model, evolve the harness).",
7
+ "homepage": "https://github.com/ruvnet/ruvector/tree/main/examples/mragent#readme",
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "git+https://github.com/ruvnet/ruvector.git",
11
+ "directory": "examples/mragent"
12
+ },
13
+ "files": [
14
+ "agent/",
15
+ "harness/",
16
+ "data/",
17
+ "tools/",
18
+ "test/",
19
+ "*.mjs",
20
+ "README.md",
21
+ "LICENSE"
22
+ ],
23
+ "publishConfig": {
24
+ "access": "public"
25
+ },
26
+ "scripts": {
27
+ "probe": "node probeDarwin.mjs",
28
+ "optimize": "node optimize.mjs",
29
+ "benchmark": "node benchmark.mjs",
30
+ "test": "node --test \"test/*.test.mjs\"",
31
+ "gen-corpus": "node tools/genCorpus.mjs"
32
+ },
33
+ "keywords": [
34
+ "ruvector",
35
+ "mragent",
36
+ "graph-memory",
37
+ "metaharness",
38
+ "darwin",
39
+ "rag",
40
+ "hnsw",
41
+ "cypher"
42
+ ],
43
+ "license": "MIT",
44
+ "optionalDependencies": {
45
+ "@metaharness/darwin": "^0.3.1",
46
+ "ruvector": "^2.1.0"
47
+ },
48
+ "peerDependencies": {
49
+ "@metaharness/darwin": "^0.3.1"
50
+ },
51
+ "peerDependenciesMeta": {
52
+ "@metaharness/darwin": {
53
+ "optional": true
54
+ }
55
+ }
56
+ }
@@ -0,0 +1,24 @@
1
+ // Probe the @metaharness/darwin surface so we wire to the real exports.
2
+ // Mirrors examples/sonic-ct/probeDarwin.mjs. Exits 0 (skip) when the optional
3
+ // package is absent — the example must run without it (ADR-150).
4
// Import the optional package, list its exports, and fail if a required
// primitive is not a function. A missing package is a skip, not a failure.
try {
  const darwin = await import("@metaharness/darwin");
  const exportNames = Object.keys(darwin).sort();
  console.log("exports", exportNames);

  // Report the first required primitive that is not callable.
  const missing = ["mapLimit", "paretoFront"].find((name) => typeof darwin[name] !== "function");
  if (missing !== undefined) {
    throw new Error(`Missing required export: ${missing}`);
  }

  console.log("pareto exports", exportNames.filter((name) => /pareto/i.test(name)));
  console.log("function exports:", exportNames.filter((name) => typeof darwin[name] === "function"));
  if (typeof darwin.evolve === "function") {
    console.log("evolve.length (arity):", darwin.evolve.length);
  }
} catch (err) {
  const packageAbsent = err.code === "ERR_MODULE_NOT_FOUND" || err.code === "MODULE_NOT_FOUND";
  if (!packageAbsent) throw err;
  console.warn("[probe] @metaharness/darwin not installed — skipping (optional dependency).");
  process.exit(0);
}
@@ -0,0 +1,134 @@
1
+ // MRAgent v2 acceptance gates. Deterministic — no network, no native deps.
2
+ // Every gene is proven load-bearing here. Run: npm test
3
+
4
+ import { test } from "node:test";
5
+ import assert from "node:assert/strict";
6
+ import fs from "node:fs";
7
+ import path from "node:path";
8
+ import { fileURLToPath } from "node:url";
9
+ import { MemoryStore, baselineGenome, evaluate, mutate, runReasoningLoop, splitByClass } from "../agent/harness.mjs";
10
+ import { embed, EMBED_DIM, tokenize } from "../agent/memory.mjs";
11
+ import { consolidate } from "../agent/consolidate.mjs";
12
+
13
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
14
+ const corpus = JSON.parse(fs.readFileSync(path.join(__dirname, "..", "data", "eval-set.json"), "utf8"));
15
+ const tasks = corpus.tasks;
16
+ const store = new MemoryStore(tasks);
17
+ const sub = (cls) => tasks.filter((t) => t.class === cls);
18
/**
 * Accuracy of `genome` on the answerable tasks of `subset`, measured against a
 * freshly built MemoryStore.
 *
 * @returns Fraction of answerable tasks answered correctly (0 when the subset
 *          has no answerable task).
 */
const accOn = (genome, subset) => {
  const fresh = new MemoryStore(tasks);
  // Tasks explicitly marked answerable === false are excluded from the ratio.
  const answerable = subset.filter((t) => t.answerable !== false);
  const hits = answerable.filter(
    (t) => runReasoningLoop(fresh.queryText(t.id), fresh, genome, t).correct,
  ).length;
  return hits / (answerable.length || 1);
};
24
+
25
+ test("embeddings are deterministic and L2-normalized", () => {
26
+ const a = embed("fast cold-boot");
27
+ assert.equal(a.length, EMBED_DIM);
28
+ assert.deepEqual([...a], [...embed("fast cold-boot")]);
29
+ let norm = 0; for (const x of a) norm += x * x;
30
+ assert.ok(Math.abs(Math.sqrt(norm) - 1) < 1e-5);
31
+ });
32
+
33
+ test("dense (concept) and sparse (token) signals are decoupled", () => {
34
+ const cos = (x, y) => { let d = 0; for (let i = 0; i < x.length; i++) d += x[i] * y[i]; return d; };
35
+ const overlap = (x, y) => { const A = new Set(tokenize(x)); let s = 0; for (const t of tokenize(y)) if (A.has(t)) s++; return s; };
36
+ // paraphrase: shared concepts, zero shared tokens → dense-close, sparse-zero
37
+ assert.ok(cos(embed("fast boot"), embed("rapid cold-start")) > 0.4);
38
+ assert.equal(overlap("fast boot", "rapid cold-start"), 0);
39
+ });
40
+
41
+ test("evaluation is reproducible for a fixed genome", () => {
42
+ const g = baselineGenome();
43
+ assert.deepEqual(evaluate(g, store, tasks), evaluate(g, store, tasks));
44
+ });
45
+
46
+ test("baseline answers a non-trivial share but is not perfect (headroom exists)", () => {
47
+ const m = evaluate(baselineGenome(), store, tasks);
48
+ assert.ok(m.accuracy >= 0.4 && m.accuracy < 0.9, `baseline accuracy ${m.accuracy}`);
49
+ });
50
+
51
+ test("hybridAlpha is load-bearing in BOTH directions (dense vs sparse)", () => {
52
+ const denseHeavy = { ...baselineGenome(), hybridAlpha: 1, cueK: 1, fusion: "linear" };
53
+ const sparseHeavy = { ...baselineGenome(), hybridAlpha: 0, cueK: 1, fusion: "linear" };
54
+ // semantic tasks need dense; lexical tasks need sparse
55
+ assert.ok(accOn(denseHeavy, sub("semantic")) > accOn(sparseHeavy, sub("semantic")), "semantic needs dense");
56
+ assert.ok(accOn(sparseHeavy, sub("lexical")) > accOn(denseHeavy, sub("lexical")), "lexical needs sparse");
57
+ });
58
+
59
+ test("traversalDepth is load-bearing: 2-hop-bridge tasks need depth>=3", () => {
60
+ const bridge2 = tasks.filter((t) => (t.bridges || 0) >= 2);
61
+ assert.ok(bridge2.length > 0);
62
+ assert.equal(accOn({ ...baselineGenome(), traversalDepth: 2 }, bridge2), 0, "depth 2 misses 2-hop bridges");
63
+ assert.equal(accOn({ ...baselineGenome(), traversalDepth: 3 }, bridge2), 1, "depth 3 resolves them");
64
+ });
65
+
66
+ test("abstention sharply cuts hallucination and raises risk-adjusted utility", () => {
67
+ const reckless = evaluate({ ...baselineGenome(), abstainThreshold: 0 }, store, tasks);
68
+ const calibrated = evaluate({ ...baselineGenome(), abstainThreshold: 0.4 }, store, tasks);
69
+ assert.ok(reckless.hallucinationRate > 0.1, "baseline hallucinates on unanswerable");
70
+ assert.ok(calibrated.hallucinationRate <= reckless.hallucinationRate / 2, "abstention at least halves hallucination");
71
+ assert.ok(calibrated.riskScore > reckless.riskScore + 0.1, "risk-adjusted utility improves materially");
72
+ });
73
+
74
+ test("corroboration (rerank=gnn) + fanout rescue distractor tasks under a terse window", () => {
75
+ const d = sub("distractor");
76
+ const none = accOn({ ...baselineGenome(), rerank: "none", promptStrategy: "terse", tagFanout: 3, maxContent: 8 }, d);
77
+ const gnn = accOn({ ...baselineGenome(), rerank: "gnn", promptStrategy: "terse", tagFanout: 3, maxContent: 8 }, d);
78
+ const gnnNoFan = accOn({ ...baselineGenome(), rerank: "gnn", promptStrategy: "terse", tagFanout: 1, maxContent: 8 }, d);
79
+ assert.equal(gnn, 1, "gnn corroboration + fanout resolves all distractor tasks");
80
+ assert.ok(gnn > none + 0.3, "corroboration beats no-rerank under a terse window");
81
+ assert.ok(gnn > gnnNoFan + 0.3, "corroboration needs fanout to reach the corroborating tag");
82
+ });
83
+
84
+ test("consolidation (replay) reduces hops at equal-or-better accuracy", () => {
85
+ const g = { ...baselineGenome(), traversalDepth: 3, fusion: "linear", haltConfidence: 0.5, abstainThreshold: 0.36 };
86
+ const s = new MemoryStore(tasks);
87
+ const before = evaluate(g, s, tasks);
88
+ consolidate(s, tasks, g);
89
+ const after = evaluate(g, s, tasks);
90
+ assert.ok(after.avgHops < before.avgHops, `hops ${before.avgHops} -> ${after.avgHops}`);
91
+ assert.ok(after.accuracy >= before.accuracy - 1e-9, "accuracy not regressed");
92
+ });
93
+
94
+ test("a calibrated genome reaches high accuracy with near-zero hallucination", () => {
95
+ const tuned = { ...baselineGenome(), fusion: "linear", traversalDepth: 3, tagFanout: 3, abstainThreshold: 0.4, maxContent: 6 };
96
+ const m = evaluate(tuned, store, tasks);
97
+ assert.ok(m.accuracy >= 0.8, `accuracy ${m.accuracy}`);
98
+ assert.ok(m.hallucinationRate <= 0.05, `halluc ${m.hallucinationRate}`);
99
+ });
100
+
101
// A hand-tuned genome must beat the baseline on the held-out split without
// hallucinating more, and stay confident on 2-hop bridge tasks.
test("evolved-style genome GENERALIZES: beats baseline on a held-out test split", () => {
  // Rename on destructure: a local called `test` would shadow the node:test
  // `test` import inside this callback, and the train half is not needed here.
  const { test: heldOut } = splitByClass(tasks, 0.6);
  assert.ok(heldOut.length >= 10, "test split is non-trivial");
  const tuned = { ...baselineGenome(), fusion: "linear", traversalDepth: 3, tagFanout: 3, abstainThreshold: 0.4, maxContent: 6 };
  const baseTest = evaluate(baselineGenome(), store, heldOut);
  const evoTest = evaluate(tuned, store, heldOut);
  assert.ok(evoTest.accuracy >= baseTest.accuracy + 0.1, `test acc ${baseTest.accuracy} -> ${evoTest.accuracy}`);
  assert.ok(evoTest.hallucinationRate <= baseTest.hallucinationRate, "no worse hallucination on unseen tasks");
  // depth-independent confidence: deep (2-hop) bridges are still confident
  const bridge2 = heldOut.filter((t) => (t.bridges || 0) >= 2);
  for (const t of bridge2) {
    const r = runReasoningLoop(store.queryText(t.id), store, tuned, t);
    assert.ok(r.confidence > 0.5, `2-hop bridge ${t.id} confidence ${r.confidence} should stay high`);
  }
});
116
+
117
+ test("mutate stays within declared genome bounds (all 12 genes)", () => {
118
+ let g = baselineGenome();
119
+ for (let i = 0; i < 300; i++) {
120
+ g = mutate(g);
121
+ assert.ok(g.cueK >= 1 && g.cueK <= 12);
122
+ assert.ok(g.efSearch >= 16 && g.efSearch <= 256);
123
+ assert.ok(g.hybridAlpha >= 0 && g.hybridAlpha <= 1);
124
+ assert.ok(["rrf", "linear", "dbsf"].includes(g.fusion));
125
+ assert.ok(g.traversalDepth >= 1 && g.traversalDepth <= 4);
126
+ assert.ok(g.tagFanout >= 1 && g.tagFanout <= 8);
127
+ assert.ok(g.pruneThreshold >= 0 && g.pruneThreshold <= 0.6);
128
+ assert.ok(g.maxContent >= 1 && g.maxContent <= 20);
129
+ assert.ok(g.haltConfidence >= 0.2 && g.haltConfidence <= 0.9);
130
+ assert.ok(["gnn", "none"].includes(g.rerank));
131
+ assert.ok(["terse", "evidence-first", "prune-explicit"].includes(g.promptStrategy));
132
+ assert.ok(g.abstainThreshold >= 0 && g.abstainThreshold <= 0.6);
133
+ }
134
+ });
@@ -0,0 +1,48 @@
1
+ // Tests for the GPU LLM write-layer's safety boundary: whatever a model returns,
2
+ // coerceGenome must produce a genome whose every gene is within declared bounds.
3
+ import { test } from "node:test";
4
+ import assert from "node:assert/strict";
5
+ import { coerceGenome } from "../agent/llmMutator.mjs";
6
+ import { baselineGenome } from "../agent/harness.mjs";
7
+
8
+ test("coerceGenome clamps out-of-range / wrong-type LLM output to safe genome", () => {
9
+ const base = baselineGenome();
10
+ const hostile = {
11
+ cueK: 9999, // way over max
12
+ efSearch: -50, // under min
13
+ hybridAlpha: 7.5, // over 1
14
+ traversalDepth: 0, // under min
15
+ tagFanout: "lots", // wrong type → ignored, keeps baseline
16
+ pruneThreshold: 2, // over max
17
+ maxContent: 1000,
18
+ haltConfidence: -1, // under min (0.2)
19
+ abstainThreshold: 5,
20
+ fusion: "telepathy", // invalid enum → baseline
21
+ rerank: "none", // valid enum → applied
22
+ promptStrategy: "evil",// invalid enum → baseline
23
+ injected: "ignore me", // unknown key → dropped
24
+ };
25
+ const g = coerceGenome(hostile, base);
26
+
27
+ assert.ok(g.cueK >= 1 && g.cueK <= 12);
28
+ assert.ok(g.efSearch >= 16 && g.efSearch <= 256);
29
+ assert.ok(g.hybridAlpha >= 0 && g.hybridAlpha <= 1);
30
+ assert.ok(g.traversalDepth >= 1 && g.traversalDepth <= 4);
31
+ assert.equal(g.tagFanout, base.tagFanout); // wrong type ignored
32
+ assert.ok(g.pruneThreshold >= 0 && g.pruneThreshold <= 0.6);
33
+ assert.ok(g.maxContent >= 1 && g.maxContent <= 20);
34
+ assert.ok(g.haltConfidence >= 0.2 && g.haltConfidence <= 0.9);
35
+ assert.ok(g.abstainThreshold >= 0 && g.abstainThreshold <= 0.6);
36
+ assert.equal(g.fusion, base.fusion); // invalid enum → baseline
37
+ assert.equal(g.rerank, "none"); // valid enum → applied
38
+ assert.equal(g.promptStrategy, base.promptStrategy);
39
+ assert.ok(!("injected" in g)); // unknown key dropped
40
+ assert.ok(Number.isInteger(g.cueK) && Number.isInteger(g.efSearch));
41
+ });
42
+
43
+ test("coerceGenome on junk returns the baseline untouched", () => {
44
+ const base = baselineGenome();
45
+ assert.deepEqual(coerceGenome(null, base), base);
46
+ assert.deepEqual(coerceGenome("not an object", base), base);
47
+ assert.deepEqual(coerceGenome(42, base), base);
48
+ });