ruvector-mragent 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +159 -0
- package/agent/concepts.mjs +71 -0
- package/agent/consolidate.mjs +55 -0
- package/agent/harness.mjs +204 -0
- package/agent/llmMutator.mjs +147 -0
- package/agent/memory.mjs +355 -0
- package/benchmark.mjs +63 -0
- package/data/eval-set.json +1685 -0
- package/harness/scorePolicy.ts +75 -0
- package/optimize.mjs +304 -0
- package/package.json +56 -0
- package/probeDarwin.mjs +24 -0
- package/test/harness.test.mjs +134 -0
- package/test/llmMutator.test.mjs +48 -0
- package/tools/genCorpus.mjs +74 -0
package/agent/memory.mjs
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
// MRAgent FROZEN MODEL — the Cue-Tag-Content associative memory substrate.
|
|
2
|
+
//
|
|
3
|
+
// Per the Meta-Harness invariant ("freeze the model, evolve the harness"), this
|
|
4
|
+
// file is NEVER mutated by Darwin. It is the RuVector-backed memory store. In
|
|
5
|
+
// production the nodes, embeddings, and edges live in a RuVector `.rvf` index and
|
|
6
|
+
// traversal is a Cypher query:
|
|
7
|
+
//
|
|
8
|
+
// MATCH (c:Cue)-[:LINKED_TO*1..N]->(t:Tag)-[:REFERENCES]->(m:Content)
|
|
9
|
+
// WHERE c.id IN $cueIds RETURN m
|
|
10
|
+
//
|
|
11
|
+
// To keep this example runnable with ZERO native dependencies (and fully
|
|
12
|
+
// deterministic for CI), the store is reimplemented in-process with the same
|
|
13
|
+
// semantics: hybrid (sparse+dense RRF) cue search and bounded-depth, prunable
|
|
14
|
+
// graph reconstruction. The real `ruvector` package is only probed for (its
|
|
15
|
+
// presence is exported as `usingRuVector`); embeddings here always use the deterministic hash. Either way
|
|
16
|
+
// the GRAPH SEMANTICS are identical, so the harness genome evolved here transfers
|
|
17
|
+
// to a live RuVector deployment unchanged.
|
|
18
|
+
|
|
19
|
+
import { createRequire } from "node:module";
|
|
20
|
+
import { NUM_CONCEPTS, conceptOf, syn } from "./concepts.mjs";
|
|
21
|
+
// CommonJS-style loader scoped to this module, so an optional dependency can
// be probed synchronously from ESM code.
const require = createRequire(import.meta.url);

// Optional production backend. Loading is best-effort: when the package cannot
// be loaded the binding stays null and the in-process store is used as-is.
let RuVector = null;
try {
  RuVector = require("ruvector");
} catch {
  // Not loadable — keep the deterministic fallback.
}
|
|
26
|
+
|
|
27
|
+
// Embedding layout: the first NUM_CONCEPTS dimensions are one slot per concept
// (so tokens sharing a concept coincide), followed by HASH_TAIL hashed slots
// that keep concept-less tokens distinguishable from one another.
const HASH_TAIL = 64;

/** Total embedding width: concept block plus hashed lexical tail. */
export const EMBED_DIM = NUM_CONCEPTS + HASH_TAIL;

/** True when the optional `ruvector` package was loadable at import time. */
export const usingRuVector = Boolean(RuVector);

// Function words dropped by tokenize() before any scoring or embedding.
const STOP = new Set([
  "the", "a", "an", "to", "of", "is", "are", "and", "in", "into",
  "does", "do", "what", "which", "how", "with", "from", "for", "that",
]);
|
|
35
|
+
|
|
36
|
+
/**
 * Break text into lowercase ASCII alphanumeric tokens.
 *
 * Every run of characters outside `a-z0-9` acts as a separator. Tokens of
 * length 0 or 1 and tokens listed in STOP are discarded.
 *
 * @param {*} text - Value to tokenize; coerced with `String()`.
 * @returns {string[]} Surviving tokens, in order of appearance.
 */
export function tokenize(text) {
  const normalized = String(text).toLowerCase().replace(/[^a-z0-9]+/g, " ");
  const kept = [];
  for (const word of normalized.split(" ")) {
    if (word.length <= 1 || STOP.has(word)) continue;
    kept.push(word);
  }
  return kept;
}
|
|
43
|
+
|
|
44
|
+
/**
 * 32-bit FNV-1a hash over the string's UTF-16 code units.
 *
 * Purely arithmetic (no randomness), so a given string always maps to the
 * same value.
 *
 * @param {string} str - Text to hash.
 * @returns {number} Unsigned 32-bit integer.
 */
function hash32(str) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let state = FNV_OFFSET_BASIS;
  for (let pos = 0, end = str.length; pos < end; pos += 1) {
    // XOR the code unit in, then multiply by the prime modulo 2^32.
    state = Math.imul(state ^ str.charCodeAt(pos), FNV_PRIME) >>> 0;
  }
  return state >>> 0;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Deterministic embedding of a text into an EMBED_DIM-wide Float32Array.
 *
 * Each token is looked up with `conceptOf`. A non-negative result is used as
 * the dimension to increment by 1, so tokens that resolve to the same concept
 * reinforce the same slot. Any other token is spread over two hashed slots in
 * the tail (weights 0.6 and 0.3), placed after the first NUM_CONCEPTS
 * dimensions. The vector is then scaled to unit L2 length; an all-zero vector
 * (no usable tokens) is returned unchanged.
 *
 * @param {*} text - Text to embed; tokenized with `tokenize`.
 * @returns {Float32Array} L2-normalized (or all-zero) vector.
 */
export function embed(text) {
  const vec = new Float32Array(EMBED_DIM);

  for (const token of tokenize(text)) {
    const concept = conceptOf(token);
    if (concept >= 0) {
      vec[concept] += 1;
      continue;
    }
    // Lexical-only token: two independent hash slots in the tail.
    const primarySlot = NUM_CONCEPTS + (hash32(token) % HASH_TAIL);
    const saltedSlot = NUM_CONCEPTS + (hash32(`salt:${token}`) % HASH_TAIL);
    vec[primarySlot] += 0.6;
    vec[saltedSlot] += 0.3;
  }

  let sumOfSquares = 0;
  for (const component of vec) sumOfSquares += component * component;

  // `|| 1` keeps a zero vector zero instead of dividing by zero.
  const length = Math.sqrt(sumOfSquares) || 1;
  for (let i = 0; i < vec.length; i += 1) vec[i] /= length;
  return vec;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Dot product of two equal-length numeric vectors.
 *
 * No normalization is applied here, so the result is the cosine similarity
 * only when both inputs already have unit length.
 *
 * @param {ArrayLike<number>} a - First vector; its length drives the loop.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} Sum of element-wise products.
 */
function cosine(a, b) {
  let total = 0;
  for (const [index, value] of a.entries()) {
    total += value * b[index];
  }
  return total;
}
|
|
83
|
+
|
|
84
|
+
// Sparse term-overlap score (BM25-lite): shared tokens / sqrt(len product).
|
|
85
|
+
/**
 * Term-overlap score between a query and a document.
 *
 * Counts every document token that also occurs in the query (repeated document
 * tokens each count) and divides by sqrt(|query| * |doc|). Because repeats
 * count, the score can exceed 1 when the document echoes query tokens.
 *
 * @param {string[]} queryToks - Query tokens.
 * @param {string[]} docToks - Document tokens.
 * @returns {number} 0 when either list is empty, otherwise the overlap score.
 */
function sparseScore(queryToks, docToks) {
  if (queryToks.length === 0 || docToks.length === 0) return 0;

  const queryVocab = new Set(queryToks);
  const overlap = docToks.filter((tok) => queryVocab.has(tok)).length;
  return overlap / Math.sqrt(queryToks.length * docToks.length);
}
|
|
92
|
+
|
|
93
|
+
// ── Graph builder ───────────────────────────────────────────────────────────
|
|
94
|
+
// Builds the Cue-Tag-Content graph from the eval corpus, plus cross-task
|
|
95
|
+
// distractor cues/contents so every gene is load-bearing.
|
|
96
|
+
//
|
|
97
|
+
// Texts are SYNTHESIZED from each task's structured signal spec (concept names +
|
|
98
|
+
// lexical identifiers) so that dense/sparse separation, ranking-distractors and
|
|
99
|
+
// multi-hop bridges are guaranteed, not dependent on fragile English wording.
|
|
100
|
+
//
|
|
101
|
+
// query = qConcepts(variant0) + qLex
|
|
102
|
+
// correct cue = cue.concepts(variant1) + cue.lex (same concepts, diff tokens)
|
|
103
|
+
// correct text = qConcepts(variant0) + expected_fact + cue.lex
|
|
104
|
+
// distractor = query echoed twice (out-ranks correct on raw sim, no fact)
|
|
105
|
+
// decoy cue = decoy.concepts/lex → wrong tag → wrong content
|
|
106
|
+
//
|
|
107
|
+
// Edge model:
|
|
108
|
+
// Cue -LINKED_TO-> [bridge0 -> … ->] { relevantTag, corroborateTag }
|
|
109
|
+
// Tag -REFERENCES-> Content
|
|
110
|
+
/**
 * Build a space-separated text from concept names and literal identifiers.
 *
 * Each concept is rendered through `syn(concept, variant)`; lexical identifiers
 * are appended verbatim after the concept words.
 *
 * `null` is accepted for either list and treated as empty. Default parameters
 * only cover `undefined`, and task specs loaded from JSON can carry explicit
 * nulls.
 *
 * @param {Array|null} [concepts=[]] - Concept identifiers passed to `syn`.
 * @param {string[]|null} [lex=[]] - Literal tokens appended as-is.
 * @param {number} [variant=0] - Surface-form variant forwarded to `syn`.
 * @returns {string} The joined text ("" when both lists are empty).
 */
function synth(concepts = [], lex = [], variant = 0) {
  const conceptWords = (concepts ?? []).map((c) => syn(c, variant));
  return [...conceptWords, ...(lex ?? [])].join(" ");
}
|
|
113
|
+
|
|
114
|
+
/**
 * Produce the query string for a task spec: its `qConcepts` rendered with
 * variant 0, followed by its `qLex` tokens. Missing (falsy) lists count as
 * empty.
 *
 * @param {object} spec - Task spec; `qConcepts` and `qLex` are read.
 * @returns {string} Query text issued at retrieval time.
 */
export function queryTextFor(spec) {
  const concepts = spec.qConcepts || [];
  const lexical = spec.qLex || [];
  return synth(concepts, lexical, 0);
}
|
|
118
|
+
|
|
119
|
+
/**
 * Build the Cue-Tag-Content graph for a list of task specs.
 *
 * For every spec this records the query text, creates either the answerable
 * sub-graph (correct content, relevant tag, optional distractor contents,
 * optional corroborating tag, optional bridge chain) or a single weak
 * tag/content pair, then adds the correct cue and one cue/tag/content triple
 * per decoy.
 *
 * @param {Iterable<object>} specs - Task specs. Fields read: `id`, `answerable`,
 *   `qConcepts`, `qLex`, `expected_fact`, `cue`, `distractors`, `corroborate`,
 *   `bridges`, `decoys`.
 * @returns {{cues: Map, tags: Map, content: Map, queries: Map}} Node maps keyed
 *   by node id, plus query text keyed by task id.
 */
export function buildGraph(specs) {
  const cues = new Map();
  const tags = new Map();
  const content = new Map();
  const queries = new Map();

  // Register a tag node; hyphens in the name become spaces in its text.
  function addTag(name) {
    const text = name.replace(/-/g, " ");
    const node = { id: `tag:${name}`, name, text, toks: tokenize(name), vec: embed(text), content: [], next: [] };
    tags.set(node.id, node);
    return node;
  }

  // Register a content node and hand its id back for linking.
  function addContent(id, text, taskId) {
    const node = { id, text, toks: tokenize(text), vec: embed(text), taskId };
    content.set(id, node);
    return id;
  }

  // Answerable task: returns the tags the correct cue links to directly.
  function wireAnswerable(spec) {
    // Correct content = query concepts + the expected fact + the cue's lexical ids.
    const correctId = `content:${spec.id}`;
    const correctText = [synth(spec.qConcepts, [], 0), spec.expected_fact, ...(spec.cue?.lex || [])].join(" ");
    addContent(correctId, correctText, spec.id);

    // The relevant tag references the correct content and every distractor.
    const relevant = addTag(`${spec.id}-rel`);
    relevant.content.push(correctId);
    let d = 0;
    while (d < (spec.distractors || 0)) {
      // A distractor repeats the query material but never carries the fact.
      const echoed = [
        synth(spec.qConcepts, spec.qLex, 0),
        synth(spec.qConcepts, [], 0),
        (spec.qLex || []).join(" "),
      ].join(" ");
      relevant.content.push(addContent(`content:${spec.id}:d${d}`, echoed, `${spec.id}-distractor`));
      d += 1;
    }

    // Optional second tag pointing at the same correct content.
    const tail = [relevant];
    if (spec.corroborate) {
      const corroborating = addTag(`${spec.id}-corr`);
      corroborating.content.push(correctId);
      tail.push(corroborating);
    }

    // Bridge chain: b0 -> b1 -> … -> every tail tag.
    const bridges = [];
    let b = 0;
    while (b < (spec.bridges || 0)) {
      bridges.push(addTag(`${spec.id}-b${b}`));
      b += 1;
    }
    bridges.forEach((bridge, i) => {
      const successors = i + 1 < bridges.length ? [bridges[i + 1]] : tail;
      bridge.next.push(...successors.map((t) => t.id));
    });

    return bridges.length ? [bridges[0]] : tail;
  }

  // Unanswerable task: only a weak tag referencing placeholder content.
  function wireUnanswerable(spec) {
    const weak = addTag(`${spec.id}-weak`);
    const placeholder = ["tangential unrelated note", spec.id].join(" ");
    weak.content.push(addContent(`content:${spec.id}:weak`, placeholder, `${spec.id}-none`));
    return [weak];
  }

  for (const spec of specs) {
    queries.set(spec.id, queryTextFor(spec));

    const answerable = spec.answerable !== false;
    const entry = answerable ? wireAnswerable(spec) : wireUnanswerable(spec);

    // Correct cue: the cue's own concepts rendered with variant 1, plus its lex.
    const cueText = synth(spec.cue?.concepts || [], spec.cue?.lex || [], 1);
    const cueTask = answerable ? spec.id : `${spec.id}-none`;
    mkCue(cues, `cue:${spec.id}:correct`, cueText, cueTask, entry.map((t) => t.id));

    // Decoys: each gets its own content, tag, and cue (variant-2 rendering).
    (spec.decoys || []).forEach((decoy, idx) => {
      const decoyText = synth(decoy.concepts || [], decoy.lex || [], 2);
      const decoyTask = `${spec.id}-decoy`;
      const wrongContent = addContent(`content:${spec.id}:w${idx}`, `wrong decoy ${decoyText}`, decoyTask);
      const wrongTag = addTag(`${spec.id}-w${idx}`);
      wrongTag.content.push(wrongContent);
      mkCue(cues, `cue:${spec.id}:decoy${idx}`, decoyText, decoyTask, [wrongTag.id]);
    });
  }

  return { cues, tags, content, queries };
}
|
|
205
|
+
|
|
206
|
+
/**
 * Create a cue node and store it in `cues` under its id.
 *
 * @param {Map} cues - Destination map (mutated).
 * @param {string} id - Cue id, also used as the map key.
 * @param {string} text - Cue text; tokenized and embedded for search.
 * @param {string} taskId - Task label carried on the node.
 * @param {string[]} links - Ids of the tags this cue points to.
 */
function mkCue(cues, id, text, taskId, links) {
  const node = { id, text, toks: tokenize(text), vec: embed(text), taskId, links };
  cues.set(id, node);
}
|
|
209
|
+
|
|
210
|
+
// ── MemoryStore: hybrid cue search + bounded-depth reconstruction ─────────────
|
|
211
|
+
/**
 * In-process memory store over a Cue-Tag-Content graph: hybrid cue search
 * followed by bounded-depth traversal to collect content.
 */
export class MemoryStore {
  /**
   * @param {object[]} tasks - Task specs; the graph is built from them once.
   */
  constructor(tasks) {
    this.tasks = tasks;
    this.graph = buildGraph(tasks);
    this.cueList = Array.from(this.graph.cues.values());
  }

  /**
   * Query text recorded for a task id.
   *
   * @param {string} taskId
   * @returns {string} The stored query, or "" when the id is unknown.
   */
  queryText(taskId) {
    const stored = this.graph.queries.get(taskId);
    return stored ?? "";
  }

  /**
   * Stage 1: pick entry cues by fusing a dense and a sparse ranking.
   *
   * Every cue is scored twice (vector dot product and term overlap). Each
   * ranking is cut to its top `efSearch` entries before fusion, so a cue
   * outside both pools cannot be returned.
   *
   * @param {string} queryText - Query to search with.
   * @param {object} [options]
   * @param {number} [options.cueK=5] - Number of cue ids to return (at least 1).
   * @param {number} [options.efSearch=64] - Pool size per ranking (at least 1).
   * @param {number} [options.hybridAlpha=0.5] - Weight of the dense ranking.
   * @param {string} [options.fusion="rrf"] - Fusion mode handed to `fuse`.
   * @returns {string[]} Cue ids, best first.
   */
  hybridSearch(queryText, { cueK = 5, efSearch = 64, hybridAlpha = 0.5, fusion = "rrf" } = {}) {
    const queryToks = tokenize(queryText);
    const queryVec = embed(queryText);
    const poolSize = Math.max(1, efSearch);

    // Score every cue, order best-first, keep only the candidate pool.
    const rankBy = (scoreOf) =>
      this.cueList
        .map((c) => ({ c, s: scoreOf(c) }))
        .sort((x, y) => y.s - x.s)
        .slice(0, poolSize);

    const dense = rankBy((cue) => cosine(queryVec, cue.vec));
    const sparse = rankBy((cue) => sparseScore(queryToks, cue.toks));

    const fused = fuse(dense, sparse, { hybridAlpha, fusion });
    const winners = fused.slice(0, Math.max(1, cueK));
    return winners.map((entry) => entry.c.id);
  }

  /**
   * Stage 2: walk from cues through tags and collect referenced content.
   *
   * The frontier starts at each cue's first `tagFanout` links and advances one
   * hop per iteration, for at most `traversalDepth` hops. Each tag is expanded
   * once. For every content a tag references, relevance to the query is
   * 0.6 * dot product + 0.4 * term overlap; the path score is that relevance
   * times `decay ** depth`, and paths scoring below `pruneThreshold` are
   * dropped. Traversal stops early once any collected content's raw relevance
   * reaches `haltConfidence`.
   *
   * @param {string} queryText - Query the content is scored against.
   * @param {Iterable<string>} cueIds - Entry cue ids; unknown ids are skipped.
   * @param {object} [options]
   * @param {number} [options.traversalDepth=2] - Maximum hops.
   * @param {number} [options.tagFanout=4] - Links followed per cue / per tag.
   * @param {number} [options.pruneThreshold=0.15] - Minimum path score kept.
   * @param {number} [options.maxContent=10] - Result cap (at least 1).
   * @param {number} [options.decay=0.7] - Per-hop multiplier on path strength.
   * @param {number} [options.haltConfidence=1.1] - Raw relevance that stops traversal.
   * @returns {{content: object[], stats: object}} Content sorted by path score,
   *   plus `{ hops, nodesVisited, candidates, halted, confidence }`, where
   *   `confidence` is the raw relevance of the top-ranked content (0 if none).
   */
  reconstruct(queryText, cueIds, { traversalDepth = 2, tagFanout = 4, pruneThreshold = 0.15, maxContent = 10, decay = 0.7, haltConfidence = 1.1 } = {}) {
    const queryVec = embed(queryText);
    const queryToks = tokenize(queryText);
    const { cues, tags, content } = this.graph;

    // contentId -> { best: top path score, sim: top raw relevance, paths: hit count }
    const found = new Map();
    const expandedTags = new Set();
    let nodesVisited = 0;
    let hops = 0;
    let halted = false;

    // Seed the frontier from each known cue's outgoing links.
    let frontier = [];
    for (const cueId of cueIds) {
      const cue = cues.get(cueId);
      if (cue) {
        frontier.push(...cue.links.slice(0, tagFanout).map((tagId) => ({ tagId, evidence: 1 })));
      }
    }

    let depth = 0;
    while (depth < traversalDepth && frontier.length) {
      hops = depth + 1;
      const hopDecay = decay ** depth;
      const upcoming = [];

      for (const { tagId, evidence } of frontier) {
        if (expandedTags.has(tagId)) continue;
        expandedTags.add(tagId);

        const tag = tags.get(tagId);
        if (!tag) continue;
        nodesVisited += 1;

        // Links are structural, so path strength is just the decayed evidence.
        const carried = evidence * hopDecay;

        for (const contentId of tag.content) {
          const node = content.get(contentId);
          if (!node) continue;

          const relevance = 0.6 * cosine(queryVec, node.vec) + 0.4 * sparseScore(queryToks, node.toks);
          const pathScore = carried * relevance;
          if (pathScore < pruneThreshold) continue;

          let record = found.get(contentId);
          if (record == null) {
            record = { best: 0, sim: 0, paths: 0 };
            found.set(contentId, record);
          }
          record.best = Math.max(record.best, pathScore); // decayed: used for ranking
          record.sim = Math.max(record.sim, relevance); // raw: used for halting/confidence
          record.paths += 1;
        }

        for (const successor of tag.next.slice(0, tagFanout)) {
          upcoming.push({ tagId: successor, evidence });
        }
      }
      frontier = upcoming;

      // Early halt is judged on raw relevance, not on the decayed path score.
      let strongest = 0;
      for (const { sim } of found.values()) strongest = Math.max(strongest, sim);
      if (strongest >= haltConfidence) {
        halted = true;
        break;
      }
      depth += 1;
    }

    const ranked = [];
    for (const [id, record] of found) {
      const node = content.get(id);
      ranked.push({ id, score: record.best, sim: record.sim, paths: record.paths, taskId: node?.taskId, text: node?.text });
    }
    ranked.sort((x, y) => y.score - x.score);
    const ordered = ranked.slice(0, Math.max(1, maxContent));

    // Confidence is the top result's raw relevance, independent of hop depth.
    const confidence = ordered.length ? ordered[0].sim : 0;
    return { content: ordered, stats: { hops, nodesVisited, candidates: found.size, halted, confidence } };
  }
}
|
|
324
|
+
|
|
325
|
+
/**
 * Fuse two ranked candidate lists into one list ordered by combined score.
 *
 * Every entry is `{ c, s }`, where `c.id` identifies the candidate and `s` is
 * its raw score. Lists are expected best-first; the "rrf" mode uses only the
 * position of each entry, not its score.
 *
 * Modes:
 *  - "linear": each list's scores are divided by that list's maximum (floored
 *    at 1e-9) and mixed as `alpha * dense + (1 - alpha) * sparse`.
 *  - "dbsf":   each list's scores are z-normalized (a standard deviation of 0
 *    is replaced by 1), then mixed with the same weights.
 *  - anything else ("rrf"): reciprocal rank `1 / (60 + rank)` with 1-based
 *    ranks, mixed with the same weights.
 *
 * A candidate present in only one list receives only that list's contribution.
 *
 * @param {Array<{c: {id: string}, s: number}>} dense - Dense ranking.
 * @param {Array<{c: {id: string}, s: number}>} sparse - Sparse ranking.
 * @param {object} [options]
 * @param {number} [options.hybridAlpha=0.5] - Weight of the dense list; the
 *   sparse list gets `1 - hybridAlpha`.
 * @param {string} [options.fusion="rrf"] - Fusion mode.
 * @returns {Array<{c: object, s: number}>} One entry per distinct candidate,
 *   sorted by fused score, highest first.
 */
function fuse(dense, sparse, { hybridAlpha = 0.5, fusion = "rrf" } = {}) {
  const RRF_K = 60;
  const denseWeight = hybridAlpha;
  const sparseWeight = 1 - hybridAlpha;

  const acc = new Map(); // candidate id -> { c, s }
  const add = (c, s) => {
    const e = acc.get(c.id) ?? { c, s: 0 };
    e.s += s;
    acc.set(c.id, e);
  };

  if (fusion === "linear") {
    // Folded rather than `Math.max(...scores)`: spreading a large pool into
    // call arguments can exceed the engine's argument limit and throw.
    const peak = (list) => list.reduce((max, e) => Math.max(max, e.s), 1e-9);
    const dMax = peak(dense);
    const sMax = peak(sparse);
    dense.forEach((e) => add(e.c, denseWeight * (e.s / dMax)));
    sparse.forEach((e) => add(e.c, sparseWeight * (e.s / sMax)));
  } else if (fusion === "dbsf") {
    // Distribution-based score fusion: z-normalize each list, then weight.
    const zScores = (list) => {
      const count = list.length || 1;
      const mean = list.reduce((sum, e) => sum + e.s, 0) / count;
      const sd = Math.sqrt(list.reduce((sum, e) => sum + (e.s - mean) ** 2, 0) / count) || 1;
      return new Map(list.map((e) => [e.c.id, (e.s - mean) / sd]));
    };
    const zDense = zScores(dense);
    const zSparse = zScores(sparse);
    dense.forEach((e) => add(e.c, denseWeight * (zDense.get(e.c.id) ?? 0)));
    sparse.forEach((e) => add(e.c, sparseWeight * (zSparse.get(e.c.id) ?? 0)));
  } else {
    // Reciprocal rank fusion (default): only positions matter.
    dense.forEach((e, i) => add(e.c, denseWeight * (1 / (RRF_K + i + 1))));
    sparse.forEach((e, i) => add(e.c, sparseWeight * (1 / (RRF_K + i + 1))));
  }

  return [...acc.values()].sort((a, b) => b.s - a.s);
}
|
package/benchmark.mjs
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
// MRAgent benchmark (v2): baseline vs Darwin-evolved harness over the frozen
|
|
2
|
+
// Cue-Tag-Content corpus, plus the consolidation (replay) pass. Reports the three
|
|
3
|
+
// beyond-SOTA dimensions: helpfulness (accuracy), calibration (risk + halluc), and
|
|
4
|
+
// reconstruction cost (latency/hops/context). Picks up the evolved genome from
|
|
5
|
+
// optimize.report.json if present.
|
|
6
|
+
//
|
|
7
|
+
// Run: npm run benchmark
|
|
8
|
+
|
|
9
|
+
import fs from "node:fs";
|
|
10
|
+
import path from "node:path";
|
|
11
|
+
import { fileURLToPath } from "node:url";
|
|
12
|
+
import { MemoryStore, baselineGenome, evaluate } from "./agent/harness.mjs";
|
|
13
|
+
import { consolidate } from "./agent/consolidate.mjs";
|
|
14
|
+
|
|
15
|
+
// Benchmark driver: evaluates the baseline genome and an evolved genome on the
// eval corpus, runs the consolidation pass on the evolved store, prints a
// comparison table, and writes benchmark.report.json next to this script.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const corpus = JSON.parse(fs.readFileSync(path.join(__dirname, "data", "eval-set.json"), "utf8"));
const tasks = corpus.tasks;

const baseline = baselineGenome();
// Evolved genome: from a prior `npm run optimize`, else a calibrated reference.
let evolved = { ...baseline, fusion: "linear", traversalDepth: 3, abstainThreshold: 0.4, haltConfidence: 0.5, maxContent: 6, tagFanout: 3 };
const reportPath = path.join(__dirname, "optimize.report.json");
if (fs.existsSync(reportPath)) {
  try {
    const rep = JSON.parse(fs.readFileSync(reportPath, "utf8"));
    if (rep?.evolved?.genome) evolved = rep.evolved.genome;
  } catch (err) {
    // Best-effort: an unreadable or corrupt report must not abort the run,
    // but say so — otherwise the table silently reflects the reference genome.
    console.warn(`warning: could not load ${reportPath} (${err?.message ?? err}); using the reference genome`);
  }
}

const base = evaluate(baseline, new MemoryStore(tasks), tasks);
const evoStore = new MemoryStore(tasks);
const evo = evaluate(evolved, evoStore, tasks);

// Consolidation pass (self-reorganizing memory) on the evolved harness.
// evoPre / evoPost bracket consolidate() on the same store, so the hop delta
// below measures the effect of consolidation alone.
// NOTE(review): evoPre repeats the `evo` evaluation — redundant if evaluate()
// has no side effects on the store; confirm before merging the two.
const evoPre = evaluate(evolved, evoStore, tasks);
const cons = consolidate(evoStore, tasks, evolved);
const evoPost = evaluate(evolved, evoStore, tasks);

console.log("== MRAgent benchmark (v2 — beyond MRAgent) ==");
console.log(`corpus: ${tasks.length} Cue-Tag-Content tasks (semantic/lexical/hybrid/bridge/distractor/unanswerable)\n`);
console.log("config            accuracy  risk   halluc  latency  hops  context");
// Print one metrics row of the comparison table.
const row = (name, m) =>
  console.log(`${name.padEnd(17)} ${(m.accuracy * 100).toFixed(1).padStart(5)}%   ${m.riskScore.toFixed(3)}  ${m.hallucinationRate.toFixed(2)}    ${m.avgLatencyMs.toFixed(2).padStart(5)}    ${m.avgHops.toFixed(2)}  ${m.avgContext.toFixed(1)}`);
row("baseline", base);
row("evolved", evo);
row("evolved+replay", evoPost);

const dAcc = (evo.accuracy - base.accuracy) * 100;
const dRisk = evo.riskScore - base.riskScore;
// Relative hop reduction from consolidation; the floor avoids division by zero.
const dHops = ((evoPre.avgHops - evoPost.avgHops) / Math.max(evoPre.avgHops, 1e-9)) * 100;
console.log(`\nevolved vs baseline: accuracy ${dAcc >= 0 ? "+" : ""}${dAcc.toFixed(1)}pt · risk ${dRisk >= 0 ? "+" : ""}${dRisk.toFixed(3)} · hallucination ${base.hallucinationRate.toFixed(2)} → ${evo.hallucinationRate.toFixed(2)}`);
console.log(`consolidation: ${cons.consolidated} shortcuts → ${dHops.toFixed(1)}% fewer hops at ${(evoPost.accuracy * 100).toFixed(1)}% accuracy`);

const report = {
  frozenModel: "RuVector Cue-Tag-Content graph (frozen)",
  corpusSize: tasks.length,
  baseline: { genome: baseline, metrics: base },
  evolved: { genome: evolved, metrics: evo },
  consolidated: { shortcuts: cons.consolidated, metrics: evoPost },
  deltas: { accuracyPoints: dAcc, riskDelta: dRisk, hopsReductionPct: dHops },
};
const benchmarkReportPath = path.join(__dirname, "benchmark.report.json");
fs.writeFileSync(benchmarkReportPath, JSON.stringify(report, null, 2));
console.log(`\nreport -> ${benchmarkReportPath}`);
|