ruvector-mragent 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,355 @@
1
+ // MRAgent FROZEN MODEL — the Cue-Tag-Content associative memory substrate.
2
+ //
3
+ // Per the Meta-Harness invariant ("freeze the model, evolve the harness"), this
4
+ // file is NEVER mutated by Darwin. It is the RuVector-backed memory store. In
5
+ // production the nodes, embeddings, and edges live in a RuVector `.rvf` index and
6
+ // traversal is a Cypher query:
7
+ //
8
+ // MATCH (c:Cue)-[:LINKED_TO*1..N]->(t:Tag)-[:REFERENCES]->(m:Content)
9
+ // WHERE c.id IN $cueIds RETURN m
10
+ //
11
+ // To keep this example runnable with ZERO native dependencies (and fully
12
+ // deterministic for CI), the store is reimplemented in-process with the same
13
+ // semantics: hybrid (sparse+dense RRF) cue search and bounded-depth, prunable
14
+ // graph reconstruction. If the real `ruvector` package is installed, this file only
15
+ // records that fact in `usingRuVector`; embeddings always come from the deterministic
16
+ // hashed `embed()` below. The GRAPH SEMANTICS are identical, so the harness genome evolved here transfers
17
+ // to a live RuVector deployment unchanged.
18
+
19
+ import { createRequire } from "node:module";
20
+ import { NUM_CONCEPTS, conceptOf, syn } from "./concepts.mjs";
21
+ const require = createRequire(import.meta.url);
22
+
23
// Optional production backend. The probe must never make the example fail, so
// any error while loading `ruvector` simply leaves the binding null.
const RuVector = (() => {
  try {
    return require("ruvector");
  } catch {
    return null; // package unavailable → deterministic fallback
  }
})();

// Embedding layout: the first NUM_CONCEPTS dimensions hold per-concept counts
// (so paraphrases that map to the same concepts are dense-close even with zero
// shared tokens); the trailing HASH_TAIL dimensions are hashed lexical buckets
// that keep tokens without a concept distinguishable.
const HASH_TAIL = 64;
export const EMBED_DIM = NUM_CONCEPTS + HASH_TAIL;

// True when the optional `ruvector` package could be loaded.
export const usingRuVector = Boolean(RuVector);
33
+
34
// Function words that carry no retrieval signal; dropped by tokenize().
const STOP = new Set(["the", "a", "an", "to", "of", "is", "are", "and", "in", "into", "does", "do", "what", "which", "how", "with", "from", "for", "that"]);

/**
 * Split text into lowercase tokens made of ASCII letters and digits.
 *
 * The input is stringified and lowercased, every run of other characters acts
 * as a separator, and tokens that are a single character or listed in STOP are
 * discarded.
 *
 * @param {*} text - value to tokenize (coerced with String()).
 * @returns {string[]} tokens in order of appearance (duplicates kept).
 */
export function tokenize(text) {
  const words = String(text).toLowerCase().match(/[a-z0-9]+/g) ?? [];
  return words.filter((word) => !(word.length <= 1 || STOP.has(word)));
}
43
+
44
/**
 * 32-bit FNV-1a hash over the UTF-16 code units of `str`.
 *
 * Purely arithmetic (no Math.random), so the result is stable across runs.
 *
 * @param {string} str - text to hash.
 * @returns {number} unsigned 32-bit integer.
 */
function hash32(str) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let state = FNV_OFFSET_BASIS;
  let pos = 0;
  while (pos < str.length) {
    // XOR in the next code unit, then multiply by the prime modulo 2^32.
    state = Math.imul(state ^ str.charCodeAt(pos), FNV_PRIME) >>> 0;
    pos += 1;
  }
  return state >>> 0;
}
53
+
54
/**
 * Deterministic concept-projected embedding of `text`.
 *
 * Each token for which conceptOf() returns a non-negative index adds 1 to that
 * dimension, so tokens sharing a concept land on the same dimension. Every
 * other token is spread over two hashed slots in the tail that starts at
 * NUM_CONCEPTS (weights 0.6 and 0.3). The vector is then L2-normalized; an
 * all-zero vector is returned unchanged.
 *
 * @param {*} text - text to embed (tokenized with tokenize()).
 * @returns {Float32Array} vector of length EMBED_DIM.
 */
export function embed(text) {
  const vec = new Float32Array(EMBED_DIM);

  // Adds `weight` to the hash-tail slot selected by hashing `key`.
  const bumpTail = (key, weight) => {
    vec[NUM_CONCEPTS + (hash32(key) % HASH_TAIL)] += weight;
  };

  for (const token of tokenize(text)) {
    const conceptIdx = conceptOf(token);
    if (!(conceptIdx >= 0)) {
      // Lexical-only token: two salted buckets after the concept block.
      bumpTail(token, 0.6);
      bumpTail(`salt:${token}`, 0.3);
      continue;
    }
    vec[conceptIdx] += 1; // concept dimension (dense semantics)
  }

  let sumSquares = 0;
  for (const x of vec) sumSquares += x * x;
  const length = Math.sqrt(sumSquares) || 1; // avoid dividing by zero
  for (let i = 0; i < vec.length; i++) vec[i] /= length;
  return vec;
}
77
+
78
/**
 * Dot product of `a` and `b`, taken over the indices of `a`.
 *
 * Callers pass L2-normalized vectors, for which the dot product equals the
 * cosine similarity; no normalization is performed here.
 *
 * @param {ArrayLike<number>} a - first vector (array or typed array).
 * @param {ArrayLike<number>} b - second vector, at least as long as `a`.
 * @returns {number} sum of a[i] * b[i].
 */
function cosine(a, b) {
  return a.reduce((sum, value, i) => sum + value * b[i], 0);
}
83
+
84
/**
 * Sparse term-overlap score: the number of document tokens that also occur in
 * the query, divided by sqrt(queryToks.length * docToks.length).
 *
 * Repeated document tokens are counted each time they match, so a document
 * that echoes the query can score above 1.
 *
 * @param {string[]} queryToks - query tokens.
 * @param {string[]} docToks - document tokens.
 * @returns {number} 0 when either list is empty, otherwise the overlap score.
 */
function sparseScore(queryToks, docToks) {
  if (!(queryToks.length && docToks.length)) return 0;
  const queryVocab = new Set(queryToks);
  const hits = docToks.reduce((count, tok) => (queryVocab.has(tok) ? count + 1 : count), 0);
  return hits / Math.sqrt(queryToks.length * docToks.length);
}
92
+
93
+ // ── Graph builder ───────────────────────────────────────────────────────────
94
+ // Builds the Cue-Tag-Content graph from the eval corpus, plus cross-task
95
+ // distractor cues/contents so every gene is load-bearing.
96
+ //
97
+ // Texts are SYNTHESIZED from each task's structured signal spec (concept names +
98
+ // lexical identifiers) so that dense/sparse separation, ranking-distractors and
99
+ // multi-hop bridges are guaranteed, not dependent on fragile English wording.
100
+ //
101
+ // query = qConcepts(variant0) + qLex
102
+ // correct cue = cue.concepts(variant1) + cue.lex (same concepts, diff tokens)
103
+ // correct text = qConcepts(variant0) + expected_fact + cue.lex
104
+ // distractor = query echoed twice (out-ranks correct on raw sim, no fact)
105
+ // decoy cue = decoy.concepts/lex → wrong tag → wrong content
106
+ //
107
+ // Edge model:
108
+ // Cue -LINKED_TO-> [bridge0 -> … ->] { relevantTag, corroborateTag }
109
+ // Tag -REFERENCES-> Content
110
/**
 * Build a space-separated text from concept surface forms plus lexical tokens.
 *
 * @param {Array} concepts - concepts, each rendered through syn(concept, variant).
 * @param {Iterable<string>} lex - lexical tokens appended verbatim after the concepts.
 * @param {number} variant - surface-form variant passed to syn().
 * @returns {string} the joined text.
 */
function synth(concepts = [], lex = [], variant = 0) {
  const surfaceForms = concepts.map((concept) => syn(concept, variant));
  const words = [...surfaceForms, ...lex];
  return words.join(" ");
}
113
+
114
/**
 * Synthesize the query string for a task spec (used at retrieval time).
 *
 * Uses the spec's `qConcepts` rendered with variant 0 followed by its `qLex`
 * tokens; a missing field is treated as an empty list.
 *
 * @param {object} spec - task spec.
 * @returns {string} the query text.
 */
export function queryTextFor(spec) {
  const concepts = spec.qConcepts || [];
  const lexical = spec.qLex || [];
  return synth(concepts, lexical, 0);
}
118
+
119
/**
 * Build the Cue-Tag-Content graph for a list of task specs.
 *
 * For every spec this records the synthesized query text, creates either the
 * answerable sub-graph (correct content, relevant tag, optional distractor
 * contents, optional corroborating tag, optional bridge chain) or a single weak
 * tag with placeholder content, then registers the correct cue and one
 * cue/tag/content triple per decoy.
 *
 * @param {Array<object>} specs - task specs.
 * @returns {{cues: Map, tags: Map, content: Map, queries: Map}} node maps keyed
 *   by id, plus the query text keyed by spec id.
 */
export function buildGraph(specs) {
  const cues = new Map();
  const tags = new Map();
  const content = new Map();
  const queries = new Map();

  // Create and register a tag node; dashes in the name become spaces in its text.
  function mkTag(name) {
    const spaced = name.replace(/-/g, " ");
    const node = { id: `tag:${name}`, name, text: spaced, toks: tokenize(name), vec: embed(spaced), content: [], next: [] };
    tags.set(node.id, node);
    return node;
  }

  // Create and register a content node; returns its id for linking.
  function mkContent(id, text, taskId) {
    const record = { id, text, toks: tokenize(text), vec: embed(text), taskId };
    content.set(id, record);
    return id;
  }

  // Answerable task: builds content + tags and returns the tags the cue links to.
  function wireAnswerable(spec) {
    // Correct content: the query concepts, the expected fact, and the cue's lexical tokens.
    const factId = `content:${spec.id}`;
    const factText = [synth(spec.qConcepts, [], 0), spec.expected_fact, ...(spec.cue?.lex || [])].join(" ");
    mkContent(factId, factText, spec.id);

    // Relevant tag references the correct content plus the ranking distractors.
    const relevant = mkTag(`${spec.id}-rel`);
    relevant.content.push(factId);
    const distractorCount = spec.distractors || 0;
    for (let d = 0; d < distractorCount; d++) {
      // A distractor repeats the query material but never contains expected_fact.
      const echo = [synth(spec.qConcepts, spec.qLex, 0), synth(spec.qConcepts, [], 0), (spec.qLex || []).join(" ")].join(" ");
      const distractorId = mkContent(`content:${spec.id}:d${d}`, echo, `${spec.id}-distractor`);
      relevant.content.push(distractorId);
    }

    // Optional corroborating tag: a second path to the same correct content.
    const tail = [relevant];
    if (spec.corroborate) {
      const corroborating = mkTag(`${spec.id}-corr`);
      corroborating.content.push(factId);
      tail.push(corroborating);
    }

    // Bridge chain: cue -> b0 -> ... -> tail; the last bridge fans out to every tail tag.
    const bridgeCount = spec.bridges || 0;
    const bridges = [];
    for (let b = 0; b < bridgeCount; b++) bridges.push(mkTag(`${spec.id}-b${b}`));
    bridges.forEach((bridge, b) => {
      const successors = b + 1 < bridges.length ? [bridges[b + 1]] : tail;
      for (const successor of successors) bridge.next.push(successor.id);
    });

    return bridges.length ? [bridges[0]] : tail;
  }

  // Unanswerable task: no correct content exists, only a weak tag with a placeholder note.
  function wireUnanswerable(spec) {
    const weak = mkTag(`${spec.id}-weak`);
    const placeholderId = mkContent(`content:${spec.id}:weak`, ["tangential unrelated note", spec.id].join(" "), `${spec.id}-none`);
    weak.content.push(placeholderId);
    return [weak];
  }

  for (const spec of specs) {
    queries.set(spec.id, queryTextFor(spec));

    // A spec is answerable unless it says `answerable: false` explicitly.
    const answerable = spec.answerable !== false;
    const entry = answerable ? wireAnswerable(spec) : wireUnanswerable(spec);

    // Correct cue: cue concepts rendered with variant 1, plus the cue's lexical tokens.
    const cueText = synth(spec.cue?.concepts || [], spec.cue?.lex || [], 1);
    const cueTaskId = answerable ? spec.id : `${spec.id}-none`;
    mkCue(cues, `cue:${spec.id}:correct`, cueText, cueTaskId, entry.map((tag) => tag.id));

    // Decoy cues: each links to its own wrong tag, which references wrong content.
    // Decoy texts use variant-2 surface forms.
    const decoys = spec.decoys || [];
    decoys.forEach((decoy, index) => {
      const wrongContentId = mkContent(
        `content:${spec.id}:w${index}`,
        ["wrong decoy", synth(decoy.concepts || [], decoy.lex || [], 2)].join(" "),
        `${spec.id}-decoy`,
      );
      const wrongTag = mkTag(`${spec.id}-w${index}`);
      wrongTag.content.push(wrongContentId);
      mkCue(cues, `cue:${spec.id}:decoy${index}`, synth(decoy.concepts || [], decoy.lex || [], 2), `${spec.id}-decoy`, [wrongTag.id]);
    });
  }

  return { cues, tags, content, queries };
}
205
+
206
/**
 * Register a cue node in `cues` under `id`.
 *
 * @param {Map} cues - cue map to insert into.
 * @param {string} id - cue id (also the map key).
 * @param {string} text - cue text; tokenized and embedded for search.
 * @param {string} taskId - task label stored on the cue.
 * @param {string[]} links - ids of the tags this cue links to.
 */
function mkCue(cues, id, text, taskId, links) {
  const toks = tokenize(text);
  const vec = embed(text);
  cues.set(id, { id, text, toks, vec, taskId, links });
}
209
+
210
+ // ── MemoryStore: hybrid cue search + bounded-depth reconstruction ─────────────
211
/**
 * In-process memory store over the Cue-Tag-Content graph: hybrid cue search
 * followed by bounded-depth reconstruction of content.
 */
export class MemoryStore {
  /**
   * @param {Array<object>} tasks - task specs; the graph is built from them once.
   */
  constructor(tasks) {
    this.tasks = tasks;
    this.graph = buildGraph(tasks);
    this.cueList = Array.from(this.graph.cues.values());
  }

  /**
   * Synthesized query string for a task id (the text actually issued at search).
   * @param {string} taskId
   * @returns {string} the query text, or "" when the id is unknown.
   */
  queryText(taskId) {
    return this.graph.queries.get(taskId) ?? "";
  }

  /**
   * Stage 1 — find entry cues with hybrid (sparse + dense) search and fusion.
   *
   * Every cue is scored densely (dot product of embeddings) and sparsely (token
   * overlap). Each ranking is cut to its top max(1, efSearch) entries before
   * fusion, so a small efSearch can drop the correct cue before fusion sees it.
   *
   * @param {string} queryText - query to search for.
   * @param {object} [options]
   * @param {number} [options.cueK=5] - number of cue ids returned (at least 1).
   * @param {number} [options.efSearch=64] - candidate pool size per ranking.
   * @param {number} [options.hybridAlpha=0.5] - weight of the dense ranking.
   * @param {string} [options.fusion="rrf"] - fusion mode passed to fuse().
   * @returns {string[]} cue ids, best first.
   */
  hybridSearch(queryText, { cueK = 5, efSearch = 64, hybridAlpha = 0.5, fusion = "rrf" } = {}) {
    const queryTokens = tokenize(queryText);
    const queryVec = embed(queryText);
    const poolSize = Math.max(1, efSearch);

    // Score every cue, sort descending, keep the candidate pool.
    const topBy = (scoreOf) =>
      this.cueList
        .map((c) => ({ c, s: scoreOf(c) }))
        .sort((x, y) => y.s - x.s)
        .slice(0, poolSize);

    const dense = topBy((cue) => cosine(queryVec, cue.vec));
    const sparse = topBy((cue) => sparseScore(queryTokens, cue.toks));

    return fuse(dense, sparse, { hybridAlpha, fusion })
      .slice(0, Math.max(1, cueK))
      .map((hit) => hit.c.id);
  }

  /**
   * Stage 2 — active reconstruction. Starting from the tags linked by the given
   * cues, walk `next` edges for up to `traversalDepth` hops (at most `tagFanout`
   * links per node), score each referenced content against the query with a
   * per-hop decay, drop paths scoring below `pruneThreshold`, and return the
   * best `maxContent` contents together with traversal statistics.
   *
   * @param {string} queryText - query the content is scored against.
   * @param {Iterable<string>} cueIds - entry cue ids; unknown ids are skipped.
   * @param {object} [options]
   * @param {number} [options.traversalDepth=2] - maximum number of hops.
   * @param {number} [options.tagFanout=4] - links followed per cue / per tag.
   * @param {number} [options.pruneThreshold=0.15] - minimum decayed path score.
   * @param {number} [options.maxContent=10] - result cap (at least 1).
   * @param {number} [options.decay=0.7] - per-hop multiplier on path strength.
   * @param {number} [options.haltConfidence=1.1] - stop early once any raw
   *   relevance reaches this value.
   * @returns {{content: Array<object>, stats: object}} ranked content plus
   *   { hops, nodesVisited, candidates, halted, confidence }.
   */
  reconstruct(queryText, cueIds, { traversalDepth = 2, tagFanout = 4, pruneThreshold = 0.15, maxContent = 10, decay = 0.7, haltConfidence = 1.1 } = {}) {
    const queryVec = embed(queryText);
    const queryTokens = tokenize(queryText);
    const { cues, tags, content } = this.graph;

    // contentId -> { best: top decayed path score, sim: top raw relevance, paths: path count }
    const found = new Map();
    const visitedTags = new Set();
    let nodesVisited = 0;
    let hops = 0;
    let halted = false;

    // Seed the frontier with the tags each known cue links to.
    let frontier = [];
    for (const cueId of cueIds) {
      const cue = cues.get(cueId);
      if (!cue) continue;
      for (const tagId of cue.links.slice(0, tagFanout)) frontier.push({ tagId, evidence: 1 });
    }

    let depth = 0;
    while (depth < traversalDepth && frontier.length) {
      hops = depth + 1;
      const upcoming = [];

      for (const step of frontier) {
        // Each tag is expanded at most once across the whole traversal.
        if (visitedTags.has(step.tagId)) continue;
        visitedTags.add(step.tagId);
        const tag = tags.get(step.tagId);
        if (!tag) continue;
        nodesVisited += 1;

        // Links are structural, so path strength is just the carried evidence,
        // decayed once per hop.
        const carried = step.evidence * decay ** depth;

        for (const contentId of tag.content) {
          const item = content.get(contentId);
          if (!item) continue;
          const contentSim = 0.6 * cosine(queryVec, item.vec) + 0.4 * sparseScore(queryTokens, item.toks);
          const pathScore = carried * contentSim;
          if (pathScore < pruneThreshold) continue; // prune irrelevant path
          const entry = found.get(contentId) ?? { best: 0, sim: 0, paths: 0 };
          entry.best = Math.max(entry.best, pathScore); // decayed — used for ranking
          entry.sim = Math.max(entry.sim, contentSim); // raw relevance — used for confidence
          entry.paths += 1; // another tag reached this content
          found.set(contentId, entry);
        }

        for (const nextTagId of tag.next.slice(0, tagFanout)) {
          upcoming.push({ tagId: nextTagId, evidence: step.evidence });
        }
      }
      frontier = upcoming;

      // Adaptive depth: stop as soon as some content's RAW relevance reaches
      // haltConfidence, so a deep-but-relevant answer can end the traversal.
      const topSim = [...found.values()].reduce((best, entry) => Math.max(best, entry.sim), 0);
      if (topSim >= haltConfidence) {
        halted = true;
        break;
      }
      depth += 1;
    }

    const ranked = Array.from(found, ([id, entry]) => {
      const item = content.get(id);
      return { id, score: entry.best, sim: entry.sim, paths: entry.paths, taskId: item?.taskId, text: item?.text };
    })
      .sort((x, y) => y.score - x.score)
      .slice(0, Math.max(1, maxContent));

    // Confidence is the top-ranked content's raw relevance, not its decayed score.
    const confidence = ranked.length ? ranked[0].sim : 0;
    const stats = { hops, nodesVisited, candidates: found.size, halted, confidence };
    return { content: ranked, stats };
  }
}
324
+
325
/**
 * Fuse two ranked candidate lists into a single ranking.
 *
 * Modes:
 *  - "linear": each list's scores are divided by that list's maximum (floored
 *    at 1e-9) and combined as alpha * dense + (1 - alpha) * sparse.
 *  - "dbsf": each list's scores are z-normalized (population mean / standard
 *    deviation, with a zero deviation treated as 1) and combined the same way.
 *  - anything else: Reciprocal Rank Fusion, contribution weight / (60 + rank)
 *    with 1-based ranks taken from list order.
 *
 * A candidate present in only one list receives only that list's contribution.
 *
 * @param {Array<{c: {id: *}, s: number}>} dense - dense ranking, best first.
 * @param {Array<{c: {id: *}, s: number}>} sparse - sparse ranking, best first.
 * @param {object} [options]
 * @param {number} [options.hybridAlpha=0.5] - weight of the dense list; the
 *   sparse list gets 1 - hybridAlpha.
 * @param {string} [options.fusion="rrf"] - "rrf", "linear" or "dbsf".
 * @returns {Array<{c: object, s: number}>} fused entries, highest score first.
 */
function fuse(dense, sparse, { hybridAlpha = 0.5, fusion = "rrf" } = {}) {
  const RRF_K = 60;
  const denseWeight = hybridAlpha;
  const sparseWeight = 1 - hybridAlpha;

  const acc = new Map(); // cue id -> { c, s }
  const add = (cue, score) => {
    const entry = acc.get(cue.id) ?? { c: cue, s: 0 };
    entry.s += score;
    acc.set(cue.id, entry);
  };

  if (fusion === "linear") {
    // Computed with reduce rather than Math.max(...scores): spreading a very
    // large list into call arguments can overflow the engine's argument limit.
    const peak = (list) => list.reduce((max, e) => Math.max(max, e.s), 1e-9);
    const denseMax = peak(dense);
    const sparseMax = peak(sparse);
    dense.forEach((e) => add(e.c, denseWeight * (e.s / denseMax)));
    sparse.forEach((e) => add(e.c, sparseWeight * (e.s / sparseMax)));
  } else if (fusion === "dbsf") {
    // Distribution-based score fusion: z-normalize each list, then weight.
    const zScores = (list) => {
      const n = list.length || 1;
      const mean = list.reduce((sum, e) => sum + e.s, 0) / n;
      const sd = Math.sqrt(list.reduce((sum, e) => sum + (e.s - mean) ** 2, 0) / n) || 1;
      return new Map(list.map((e) => [e.c.id, (e.s - mean) / sd]));
    };
    const denseZ = zScores(dense);
    const sparseZ = zScores(sparse);
    dense.forEach((e) => add(e.c, denseWeight * (denseZ.get(e.c.id) ?? 0)));
    sparse.forEach((e) => add(e.c, sparseWeight * (sparseZ.get(e.c.id) ?? 0)));
  } else {
    // rrf (default): only the rank within each list matters.
    dense.forEach((e, rank) => add(e.c, denseWeight * (1 / (RRF_K + rank + 1))));
    sparse.forEach((e, rank) => add(e.c, sparseWeight * (1 / (RRF_K + rank + 1))));
  }

  return [...acc.values()].sort((x, y) => y.s - x.s);
}
package/benchmark.mjs ADDED
@@ -0,0 +1,63 @@
1
+ // MRAgent benchmark (v2): baseline vs Darwin-evolved harness over the frozen
2
+ // Cue-Tag-Content corpus, plus the consolidation (replay) pass. Reports the three
3
+ // beyond-SOTA dimensions: helpfulness (accuracy), calibration (risk + halluc), and
4
+ // reconstruction cost (latency/hops/context). Picks up the evolved genome from
5
+ // optimize.report.json if present.
6
+ //
7
+ // Run: npm run benchmark
8
+
9
+ import fs from "node:fs";
10
+ import path from "node:path";
11
+ import { fileURLToPath } from "node:url";
12
+ import { MemoryStore, baselineGenome, evaluate } from "./agent/harness.mjs";
13
+ import { consolidate } from "./agent/consolidate.mjs";
14
+
15
// Benchmark driver: evaluates the baseline and evolved genomes over the eval
// corpus, runs the consolidation pass on the evolved store, prints a comparison
// table, and writes benchmark.report.json next to this script.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const corpus = JSON.parse(fs.readFileSync(path.join(__dirname, "data", "eval-set.json"), "utf8"));
const tasks = corpus.tasks;

const baseline = baselineGenome();
// Evolved genome: from a prior `npm run optimize`, else a calibrated reference.
let evolved = { ...baseline, fusion: "linear", traversalDepth: 3, abstainThreshold: 0.4, haltConfidence: 0.5, maxContent: 6, tagFanout: 3 };
const reportPath = path.join(__dirname, "optimize.report.json");
if (fs.existsSync(reportPath)) {
  try {
    const rep = JSON.parse(fs.readFileSync(reportPath, "utf8"));
    if (rep?.evolved?.genome) evolved = rep.evolved.genome;
  } catch (err) {
    // Best-effort: keep the reference genome, but say so. Otherwise a corrupt
    // report would silently be benchmarked as if it were the optimized genome.
    console.warn(`warning: could not load ${reportPath} (${err?.message ?? err}); using the reference evolved genome`);
  }
}

const base = evaluate(baseline, new MemoryStore(tasks), tasks);
const evoStore = new MemoryStore(tasks);
const evo = evaluate(evolved, evoStore, tasks);

// Consolidation pass (self-reorganizing memory) on the evolved harness.
// evoPre / evoPost bracket the pass on the same store so the hop reduction
// compares like with like.
const evoPre = evaluate(evolved, evoStore, tasks);
const cons = consolidate(evoStore, tasks, evolved);
const evoPost = evaluate(evolved, evoStore, tasks);

console.log("== MRAgent benchmark (v2 — beyond MRAgent) ==");
console.log(`corpus: ${tasks.length} Cue-Tag-Content tasks (semantic/lexical/hybrid/bridge/distractor/unanswerable)\n`);
console.log("config accuracy risk halluc latency hops context");
// Print one metrics row of the comparison table.
const row = (name, m) =>
  console.log(`${name.padEnd(17)} ${(m.accuracy * 100).toFixed(1).padStart(5)}% ${m.riskScore.toFixed(3)} ${m.hallucinationRate.toFixed(2)} ${m.avgLatencyMs.toFixed(2).padStart(5)} ${m.avgHops.toFixed(2)} ${m.avgContext.toFixed(1)}`);
row("baseline", base);
row("evolved", evo);
row("evolved+replay", evoPost);

const dAcc = (evo.accuracy - base.accuracy) * 100;
const dRisk = evo.riskScore - base.riskScore;
// Relative hop reduction from consolidation; the floor avoids dividing by zero.
const dHops = ((evoPre.avgHops - evoPost.avgHops) / Math.max(evoPre.avgHops, 1e-9)) * 100;
console.log(`\nevolved vs baseline: accuracy ${dAcc >= 0 ? "+" : ""}${dAcc.toFixed(1)}pt · risk ${dRisk >= 0 ? "+" : ""}${dRisk.toFixed(3)} · hallucination ${base.hallucinationRate.toFixed(2)} → ${evo.hallucinationRate.toFixed(2)}`);
console.log(`consolidation: ${cons.consolidated} shortcuts → ${dHops.toFixed(1)}% fewer hops at ${(evoPost.accuracy * 100).toFixed(1)}% accuracy`);

const report = {
  frozenModel: "RuVector Cue-Tag-Content graph (frozen)",
  corpusSize: tasks.length,
  baseline: { genome: baseline, metrics: base },
  evolved: { genome: evolved, metrics: evo },
  consolidated: { shortcuts: cons.consolidated, metrics: evoPost },
  deltas: { accuracyPoints: dAcc, riskDelta: dRisk, hopsReductionPct: dHops },
};
const outPath = path.join(__dirname, "benchmark.report.json");
fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
console.log(`\nreport -> ${outPath}`);