ruvector-mragent 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +159 -0
- package/agent/concepts.mjs +71 -0
- package/agent/consolidate.mjs +55 -0
- package/agent/harness.mjs +204 -0
- package/agent/llmMutator.mjs +147 -0
- package/agent/memory.mjs +355 -0
- package/benchmark.mjs +63 -0
- package/data/eval-set.json +1685 -0
- package/harness/scorePolicy.ts +75 -0
- package/optimize.mjs +304 -0
- package/package.json +56 -0
- package/probeDarwin.mjs +24 -0
- package/test/harness.test.mjs +134 -0
- package/test/llmMutator.test.mjs +48 -0
- package/tools/genCorpus.mjs +74 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
// Deterministic corpus generator for the MRAgent benchmark.
|
|
2
|
+
//
|
|
3
|
+
// Produces a large, class-balanced, DIFFICULTY-VARIED set of structured signal
|
|
4
|
+
// specs (see agent/memory.mjs for how they become Cue-Tag-Content nodes). Variety
|
|
5
|
+
// is the point: each class spans easy→hard instances (1-hop AND 2-hop bridges,
|
|
6
|
+
// 1–3 ranking-distractors, etc.) so a train/test split constrains every gene and
|
|
7
|
+
// the optimizer cannot overfit an under-constrained knob. No RNG — fully
|
|
8
|
+
// reproducible. Run: node tools/genCorpus.mjs (writes data/eval-set.json)
|
|
9
|
+
|
|
10
|
+
import fs from "node:fs";
|
|
11
|
+
import path from "node:path";
|
|
12
|
+
import { fileURLToPath } from "node:url";
|
|
13
|
+
import { CONCEPT_NAMES } from "../agent/concepts.mjs";
|
|
14
|
+
|
|
15
|
+
// ES modules do not provide __dirname; rebuild it from this module's own URL
// so the output path further down can be resolved relative to this script.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
16
|
+
// Short alias for the concept-name list imported from ../agent/concepts.mjs.
const C = CONCEPT_NAMES; // 15 real concepts
|
|
17
|
+
// Concept lookup with wrap-around: index i is reduced modulo C.length, so
// offsets past the end of the list cycle back to the start.
const c = (i) => {
  const wrapped = i % C.length;
  return C[wrapped];
};
|
|
18
|
+
// Number of generation rounds; the loop below pushes one task of each of the
// six classes per round, so the corpus size is 6 × PER_CLASS.
const PER_CLASS = 10; // 6 classes × 10 = 60 tasks
|
|
19
|
+
// Out-of-distribution words used as the concepts of the "unanswerable" tasks.
// Each such task pairs entry i with entry i + 6 (both modulo OOD.length).
const OOD = ["weather", "tariff", "opera", "sprint", "recipe", "planet", "ballet", "comet", "cuisine", "monsoon", "harpsichord", "tundra"];
|
|
20
|
+
|
|
21
|
+
// Accumulator for the generated task specs; serialized under the `tasks` key
// of the output JSON.
const tasks = [];
|
|
22
|
+
// Expected-fact label for a task: class prefix, the literal "fact", then the
// task index — e.g. fact("s", 3) yields "sfact3".
const fact = (cls, i) => {
  const label = `${cls}fact${i}`;
  return label;
};
|
|
23
|
+
|
|
24
|
+
// One round per index n. Every round appends exactly one task per class, always
// in the order semantic, lexical, hybrid, bridge, distractor, unanswerable.
// Property order inside each task is kept fixed because it determines the key
// order of the serialized JSON.
for (let n = 0; n < PER_CLASS; n += 1) {
  // Lexical tokens for this round.
  const semanticTok = `s${n}tok`;
  const lexicalTok = `lx-${n}`;
  const hybridTok = `hy-${n}`;
  const bridgeTok = `b${n}tok`;
  const distractorTok = `d${n}tok`;
  const unanswerableTok = `u${n}tok`;

  // Out-of-distribution concept pair for the unanswerable task.
  const oodFirst = OOD[n % OOD.length];
  const oodSecond = OOD[(n + 6) % OOD.length];

  // semantic — answerable only via DENSE (paraphrase, no shared tokens)
  const semantic = {
    id: `s${n}`,
    class: "semantic",
    prompt: `semantic ${n}`,
    expected_fact: fact("s", n),
    qConcepts: [c(n), c(n + 3)],
    qLex: [semanticTok],
    cue: { concepts: [c(n), c(n + 3)], lex: [] },
    decoys: [{ concepts: [], lex: [semanticTok] }],
  };

  // lexical — answerable only via SPARSE (rare id; concept is generic)
  const lexical = {
    id: `l${n}`,
    class: "lexical",
    prompt: `lexical ${n}`,
    expected_fact: fact("l", n),
    qConcepts: [c(n + 1)],
    qLex: [lexicalTok],
    cue: { concepts: [], lex: [lexicalTok] },
    decoys: [{ concepts: [c(n + 1)], lex: [] }],
  };

  // hybrid — needs BOTH signals (RRF)
  const hybrid = {
    id: `h${n}`,
    class: "hybrid",
    prompt: `hybrid ${n}`,
    expected_fact: fact("h", n),
    qConcepts: [c(n), c(n + 5)],
    qLex: [hybridTok],
    cue: { concepts: [c(n)], lex: [hybridTok] },
    decoys: [
      { concepts: [c(n), c(n + 5)], lex: [] },
      { concepts: [], lex: [hybridTok] },
    ],
  };

  // bridge — ALTERNATE 1-hop and 2-hop so train+test both contain each difficulty
  const bridge = {
    id: `b${n}`,
    class: "bridge",
    prompt: `bridge ${n}`,
    expected_fact: fact("b", n),
    qConcepts: [c(n + 2), c(n + 7)],
    qLex: [bridgeTok],
    cue: { concepts: [c(n + 2), c(n + 7)], lex: [bridgeTok] },
    bridges: 1 + (n % 2),
  };

  // distractor — VARY 1–3 ranking-distractors; corroborated correct content
  const distractor = {
    id: `d${n}`,
    class: "distractor",
    prompt: `distractor ${n}`,
    expected_fact: fact("d", n),
    qConcepts: [c(n + 4), c(n + 9)],
    qLex: [distractorTok],
    cue: { concepts: [c(n + 4), c(n + 9)], lex: [distractorTok] },
    distractors: 1 + (n % 3),
    corroborate: true,
  };

  // unanswerable — out-of-distribution concepts, no correct content → abstain
  const unanswerable = {
    id: `u${n}`,
    class: "unanswerable",
    answerable: false,
    prompt: `unanswerable ${n}`,
    expected_fact: "N/A",
    qConcepts: [oodFirst, oodSecond],
    qLex: [unanswerableTok],
    cue: { concepts: [oodFirst, oodSecond], lex: [unanswerableTok] },
    decoys: [{ concepts: [], lex: [unanswerableTok] }],
  };

  tasks.push(semantic, lexical, hybrid, bridge, distractor, unanswerable);
}
|
|
68
|
+
|
|
69
|
+
// Top-level JSON document written to data/eval-set.json. `_comment` is emitted
// into the generated file as a do-not-edit notice. Its task counts are
// interpolated from PER_CLASS and tasks.length rather than hard-coded, so the
// notice cannot drift from what was actually generated when PER_CLASS changes.
// The class count stays literal because it is fixed by the generation loop.
const out = {
  _comment: `GENERATED by tools/genCorpus.mjs — do not hand-edit. Class-balanced, difficulty-varied structured signal specs; agent/memory.mjs synthesizes Cue/Tag/Content node texts. 6 classes × ${PER_CLASS} = ${tasks.length} tasks. Bridges alternate 1-hop/2-hop; distractors vary 1–3. Regenerate: node tools/genCorpus.mjs`,
  tasks,
};
|
|
73
|
+
// Serialize the corpus as 2-space-indented JSON with a trailing newline and
// write it to ../data/eval-set.json relative to this script, then report the
// number of tasks written.
const evalSetPath = path.join(__dirname, "..", "data", "eval-set.json");
const serialized = JSON.stringify(out, null, 2) + "\n";
fs.writeFileSync(evalSetPath, serialized);
console.log(`wrote ${tasks.length} tasks to data/eval-set.json`);
|