@sx4im/skillcheck 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/METHODOLOGY.md +91 -0
- package/README.md +159 -0
- package/dist/bin/skillcheck.d.ts +2 -0
- package/dist/bin/skillcheck.js +8 -0
- package/dist/bin/skillcheck.js.map +1 -0
- package/dist/src/adapters/nvidia-nim.d.ts +30 -0
- package/dist/src/adapters/nvidia-nim.js +165 -0
- package/dist/src/adapters/nvidia-nim.js.map +1 -0
- package/dist/src/cache.d.ts +5 -0
- package/dist/src/cache.js +27 -0
- package/dist/src/cache.js.map +1 -0
- package/dist/src/cli.d.ts +1 -0
- package/dist/src/cli.js +146 -0
- package/dist/src/cli.js.map +1 -0
- package/dist/src/corpus.d.ts +43 -0
- package/dist/src/corpus.js +233 -0
- package/dist/src/corpus.js.map +1 -0
- package/dist/src/deterministic.d.ts +7 -0
- package/dist/src/deterministic.js +25 -0
- package/dist/src/deterministic.js.map +1 -0
- package/dist/src/env.d.ts +12 -0
- package/dist/src/env.js +39 -0
- package/dist/src/env.js.map +1 -0
- package/dist/src/eval.d.ts +13 -0
- package/dist/src/eval.js +155 -0
- package/dist/src/eval.js.map +1 -0
- package/dist/src/generate.d.ts +9 -0
- package/dist/src/generate.js +94 -0
- package/dist/src/generate.js.map +1 -0
- package/dist/src/grade.d.ts +5 -0
- package/dist/src/grade.js +112 -0
- package/dist/src/grade.js.map +1 -0
- package/dist/src/hash.d.ts +2 -0
- package/dist/src/hash.js +8 -0
- package/dist/src/hash.js.map +1 -0
- package/dist/src/m0/hardcoded.d.ts +7 -0
- package/dist/src/m0/hardcoded.js +51 -0
- package/dist/src/m0/hardcoded.js.map +1 -0
- package/dist/src/m0/run.d.ts +38 -0
- package/dist/src/m0/run.js +102 -0
- package/dist/src/m0/run.js.map +1 -0
- package/dist/src/normalize.d.ts +2 -0
- package/dist/src/normalize.js +109 -0
- package/dist/src/normalize.js.map +1 -0
- package/dist/src/rot.d.ts +62 -0
- package/dist/src/rot.js +156 -0
- package/dist/src/rot.js.map +1 -0
- package/dist/src/run.d.ts +5 -0
- package/dist/src/run.js +47 -0
- package/dist/src/run.js.map +1 -0
- package/dist/src/score.d.ts +14 -0
- package/dist/src/score.js +59 -0
- package/dist/src/score.js.map +1 -0
- package/dist/src/types.d.ts +41 -0
- package/dist/src/types.js +2 -0
- package/dist/src/types.js.map +1 -0
- package/dist/src/verify.d.ts +5 -0
- package/dist/src/verify.js +71 -0
- package/dist/src/verify.js.map +1 -0
- package/package.json +64 -0
package/dist/src/eval.js
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import { mkdir, readFile, writeFile } from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { NvidiaNimClient } from './adapters/nvidia-nim.js';
|
|
4
|
+
import { JsonCache } from './cache.js';
|
|
5
|
+
import { loadNvidiaConfig } from './env.js';
|
|
6
|
+
import { generateTasks } from './generate.js';
|
|
7
|
+
import { hashJson } from './hash.js';
|
|
8
|
+
import { gradeOutputs } from './grade.js';
|
|
9
|
+
import { normalizeSkill } from './normalize.js';
|
|
10
|
+
import { runTrials } from './run.js';
|
|
11
|
+
import { scorePairedObservations } from './score.js';
|
|
12
|
+
/**
 * Builds a new config in which the runner, grader and generator model names
 * are replaced by the matching entries of `options` when those are set.
 *
 * @param {object} config - Base configuration; it is copied, never mutated.
 * @param {object} options - May carry `runner`, `grader` and `generator` overrides.
 * @returns {object} Copy of `config` with the effective model names.
 */
function applyModelOverrides(config, options) {
    const { runner, grader, generator } = options;
    const effective = { ...config };
    // Only null/undefined fall back to the base config; any other value wins.
    effective.runnerModel = runner ?? config.runnerModel;
    effective.graderModel = grader ?? config.graderModel;
    effective.generatorModel = generator ?? config.generatorModel;
    return effective;
}
|
|
20
|
+
/**
 * Collapses graded outputs into one observation per (task, trial) pair.
 *
 * Each observation records whether the `with_skill` arm passed
 * (`withSkillPass`) and whether the other arm passed (`noSkillPass`).
 * Any arm value other than `'with_skill'` is counted as the no-skill arm.
 *
 * @param {Array<object>} graded - Items with `taskId`, `trial`, `arm` and `pass`.
 * @returns {Array<object>} Observations in first-seen pair order.
 * @throws {Error} If a pair lacks a boolean result for either arm.
 */
function pairedObservations(graded) {
    const pairs = new Map();
    graded.forEach(({ taskId, trial, arm, pass }) => {
        const pairKey = `${taskId}:${trial}`;
        if (!pairs.has(pairKey)) {
            pairs.set(pairKey, {});
        }
        const slot = arm === 'with_skill' ? 'withSkillPass' : 'noSkillPass';
        pairs.get(pairKey)[slot] = pass;
    });
    const observations = [];
    for (const observation of pairs.values()) {
        const complete = typeof observation.withSkillPass === 'boolean'
            && typeof observation.noSkillPass === 'boolean';
        if (!complete) {
            throw new Error('Incomplete A/B pair while scoring');
        }
        observations.push(observation);
    }
    return observations;
}
|
|
40
|
+
/**
 * Produces the per-task section of the report: the task definition plus the
 * pass rate of each experimental arm.
 *
 * `arm_a_pass_rate` is the share of passing `with_skill` outputs and
 * `arm_b_pass_rate` the share of passing `no_skill` outputs. An arm with no
 * graded outputs reports 0 rather than the `NaN` that `0 / 0` would produce
 * (and which `JSON.stringify` would turn into `null`), matching how `mean`
 * treats empty input in this file.
 *
 * @param {Array<object>} tasks - Tasks with `id`, `prompt`, `criterionType`, `criterion`.
 * @param {Array<object>} graded - Graded outputs with `taskId`, `arm` and `pass`.
 * @returns {Array<object>} One breakdown per task, in task order.
 */
function taskBreakdowns(tasks, graded) {
    // Tally every graded output once instead of re-filtering `graded` per task.
    const talliesByTask = new Map();
    for (const item of graded) {
        if (item.arm !== 'with_skill' && item.arm !== 'no_skill') {
            continue;
        }
        let tally = talliesByTask.get(item.taskId);
        if (!tally) {
            tally = {
                with_skill: { passed: 0, total: 0 },
                no_skill: { passed: 0, total: 0 }
            };
            talliesByTask.set(item.taskId, tally);
        }
        const arm = tally[item.arm];
        arm.total += 1;
        if (item.pass) {
            arm.passed += 1;
        }
    }
    const passRate = (arm) => (arm && arm.total > 0 ? arm.passed / arm.total : 0);
    return tasks.map((task) => {
        const tally = talliesByTask.get(task.id);
        return {
            id: task.id,
            prompt: task.prompt,
            criterion_type: task.criterionType,
            criterion: task.criterion,
            arm_a_pass_rate: passRate(tally?.with_skill),
            arm_b_pass_rate: passRate(tally?.no_skill)
        };
    });
}
|
|
55
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The average, or 0 when `values` is empty.
 */
function mean(values) {
    if (values.length === 0) {
        return 0;
    }
    let total = 0;
    values.forEach((value) => {
        total += value;
    });
    return total / values.length;
}
|
|
58
|
+
/**
 * Writes `value` to `filePath` as 2-space-indented JSON followed by a newline,
 * creating the parent directory (recursively) first.
 *
 * @param {string} filePath - Destination file.
 * @param {unknown} value - Value to serialize with `JSON.stringify`.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function writeJson(filePath, value) {
    const directory = path.dirname(filePath);
    await mkdir(directory, { recursive: true });
    const serialized = JSON.stringify(value, null, 2);
    await writeFile(filePath, `${serialized}\n`);
}
|
|
62
|
+
/**
 * Parses a task-suite file into normalized task records.
 *
 * Accepts either a JSON array of tasks or an object with a `tasks` array.
 * Each task may spell its criterion type as `criterionType` or
 * `criterion_type`; it defaults to `'rubric'`. A missing `id` becomes
 * `t001`, `t002`, … by position; missing `prompt`/`criterion` become `''`.
 *
 * @param {string} text - Raw JSON text of the task suite.
 * @returns {Array<{id: string, prompt: string, criterionType: string, criterion: string}>}
 * @throws {SyntaxError} If `text` is not valid JSON.
 * @throws {Error} If the payload has no task array, an entry is not an object,
 *   or an entry uses an unsupported criterion type.
 */
function parseTaskSuite(text) {
    const value = JSON.parse(text);
    let tasks;
    if (Array.isArray(value)) {
        tasks = value;
    }
    else if (typeof value === 'object' && value !== null && Array.isArray(value.tasks)) {
        tasks = value.tasks;
    }
    if (!tasks) {
        throw new Error('Task suite must be an array or an object with a tasks array');
    }
    return tasks.map((task, index) => {
        // Reject null/primitive entries up front: reading properties off `null`
        // would crash with an opaque TypeError, and a bare string or number
        // would silently turn into a task with an empty prompt and criterion.
        if (typeof task !== 'object' || task === null) {
            throw new Error(`Task ${index + 1} is not an object`);
        }
        const criterionType = task.criterionType ?? task.criterion_type ?? 'rubric';
        if (criterionType !== 'rubric' && criterionType !== 'deterministic') {
            throw new Error(`Unsupported criterion type in task ${index + 1}`);
        }
        return {
            id: String(task.id ?? `t${String(index + 1).padStart(3, '0')}`),
            prompt: String(task.prompt ?? ''),
            criterionType,
            criterion: String(task.criterion ?? '')
        };
    });
}
|
|
86
|
+
/**
 * Runs a full A/B evaluation of a skill and returns the result report.
 *
 * Steps: load the config and apply model overrides, normalize the skill at
 * `options.inputPath`, obtain the task suite (parsed from `options.taskSuite`
 * and truncated to `options.tasks`, or generated for the skill's domain),
 * run the trials, grade them, score the paired observations and assemble the
 * report. The report is also written to `options.output` when that is set.
 *
 * A generated suite is saved under `results/tasks/<hash>.json`. A suite
 * supplied by the caller is only read: it is never written back, because
 * doing so would replace the caller's file with the normalized, truncated
 * task list.
 *
 * @param {object} options - Reads `inputPath`, `taskSuite`, `tasks`, `trials`,
 *   `output`, `sourceLabel` and the `runner`/`grader`/`generator` overrides.
 * @returns {Promise<object>} The evaluation report.
 */
export async function evalSkill(options) {
    const config = applyModelOverrides(loadNvidiaConfig(), options);
    const client = new NvidiaNimClient(config);
    const cache = new JsonCache();
    const skill = await normalizeSkill(options.inputPath);
    // One flag decides both where the tasks come from and which path is
    // reported, so an empty-string `taskSuite` cannot select generation for
    // one and the empty path for the other.
    const usesProvidedSuite = Boolean(options.taskSuite);
    const tasks = usesProvidedSuite
        ? parseTaskSuite(await readFile(options.taskSuite, 'utf8')).slice(0, options.tasks)
        : await generateTasks({ domain: skill.domain, count: options.tasks }, config, client, cache);
    const taskSuitePath = usesProvidedSuite
        ? options.taskSuite
        : `results/tasks/${hashJson({ skill: skill.versionHash, tasks })}.json`;
    if (!usesProvidedSuite) {
        // Persist only suites generated here; never overwrite the caller's input file.
        await writeJson(taskSuitePath, tasks);
    }
    const outputs = await runTrials(skill, tasks, options.trials, config, client, cache);
    const graded = await gradeOutputs(tasks, outputs, config, client, cache);
    const score = scorePairedObservations(pairedObservations(graded));
    const breakdowns = taskBreakdowns(tasks, graded);
    const withSkillTokens = graded.filter((item) => item.arm === 'with_skill').map((item) => item.promptTokens);
    const noSkillTokens = graded.filter((item) => item.arm === 'no_skill').map((item) => item.promptTokens);
    // Extra prompt tokens spent on the skill, floored at zero.
    const tokenOverhead = Math.max(0, Math.round(mean(withSkillTokens) - mean(noSkillTokens)));
    // Effect (percentage points) per 1000 overhead tokens; 0 when there is no overhead.
    const valuePer1kTokens = tokenOverhead === 0 ? 0 : Number((score.effectPp / (tokenOverhead / 1000)).toFixed(2));
    // Date part (YYYY-MM-DD) of the current UTC timestamp.
    const runDate = new Date().toISOString().slice(0, 10);
    const result = {
        skill: {
            name: skill.name,
            source: options.sourceLabel ?? skill.sourcePath,
            format: skill.format,
            commit_hash: skill.versionHash,
            domain: skill.domain
        },
        config: {
            runner_model: config.runnerModel,
            runner_version: config.runnerModel,
            grader_model: config.graderModel,
            grader_version: config.graderModel,
            generator_model: config.generatorModel,
            trials: options.trials,
            tasks: options.tasks,
            temperature: 0.7,
            mode: 'forced-injection'
        },
        result: {
            effect_pp: score.effectPp,
            ci_pp: score.ciPp,
            verdict: score.verdict,
            with_skill_pass: score.withSkillPass,
            no_skill_pass: score.noSkillPass,
            token_overhead: tokenOverhead,
            value_per_1k_tokens: valuePer1kTokens
        },
        tasks: breakdowns,
        reproducibility: {
            task_suite_path: taskSuitePath,
            transcript_hashes: graded.map((item) => item.transcriptHash)
        },
        history: [
            {
                runner_version: config.runnerModel,
                run_date: runDate,
                effect_pp: score.effectPp,
                verdict: score.verdict
            }
        ],
        run_date: runDate
    };
    if (options.output) {
        await writeJson(options.output, result);
    }
    return result;
}
|
|
155
|
+
//# sourceMappingURL=eval.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../packages/cli/src/eval.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC3D,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,EAAE,gBAAgB,EAAqB,MAAM,UAAU,CAAC;AAC/D,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,uBAAuB,EAA0B,MAAM,YAAY,CAAC;AAgB7E,SAAS,mBAAmB,CAAC,MAAoB,EAAE,OAAoB;IACrE,OAAO;QACL,GAAG,MAAM;QACT,WAAW,EAAE,OAAO,CAAC,MAAM,IAAI,MAAM,CAAC,WAAW;QACjD,WAAW,EAAE,OAAO,CAAC,MAAM,IAAI,MAAM,CAAC,WAAW;QACjD,cAAc,EAAE,OAAO,CAAC,SAAS,IAAI,MAAM,CAAC,cAAc;KAC3D,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,MAAsB;IAChD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAsC,CAAC;IAC7D,KAAK,MAAM,IAAI,IAAI,MAAM,EAAE,CAAC;QAC1B,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QAC3C,MAAM,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,IAAI,CAAC,GAAG,KAAK,YAAY,EAAE,CAAC;YAC9B,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,IAAI,CAAC;QACpC,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC;QAClC,CAAC;QACD,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IAC3B,CAAC;IAED,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACvC,IAAI,OAAO,IAAI,CAAC,aAAa,KAAK,SAAS,IAAI,OAAO,IAAI,CAAC,WAAW,KAAK,SAAS,EAAE,CAAC;YACrF,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;QACD,OAAO,IAAyB,CAAC;IACnC,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,cAAc,CAAC,KAAsB,EAAE,MAAsB;IACpE,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACxB,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,KAAK,IAAI,CAAC,EAAE,CAAC,CAAC;QACpE,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,KAAK,YAAY,CAAC,CAAC;QACzE,MAAM,OAAO,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,KAAK,UAAU,CAAC,CAAC;QACrE,OAAO;YACL,EAAE,EA
AE,IAAI,CAAC,EAAE;YACX,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,cAAc,EAAE,IAAI,CAAC,aAAa;YAClC,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,eAAe,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,SAAS,CAAC,MAAM;YAChF,eAAe,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM;SAC7E,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,IAAI,CAAC,MAAgB;IAC5B,OAAO,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,GAAG,KAAK,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;AACjG,CAAC;AAED,KAAK,UAAU,SAAS,CAAC,QAAgB,EAAE,KAAc;IACvD,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzD,MAAM,SAAS,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;AACnE,CAAC;AAED,SAAS,cAAc,CAAC,IAAY;IAClC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAY,CAAC;IAC1C,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC;QAChC,CAAC,CAAC,KAAK;QACP,CAAC,CAAC,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,CAAC,OAAO,CAAE,KAA6B,CAAC,KAAK,CAAC;YAClG,CAAC,CAAE,KAA8B,CAAC,KAAK;YACvC,CAAC,CAAC,SAAS,CAAC;IAChB,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,6DAA6D,CAAC,CAAC;IACjF,CAAC;IAED,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE;QAC/B,MAAM,IAAI,GAAG,IAA+B,CAAC;QAC7C,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,cAAc,IAAI,QAAQ,CAAC;QAC5E,IAAI,aAAa,KAAK,QAAQ,IAAI,aAAa,KAAK,eAAe,EAAE,CAAC;YACpE,MAAM,IAAI,KAAK,CAAC,sCAAsC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;QACrE,CAAC;QACD,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,IAAI,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;YAC/D,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,IAAI,EAAE,CAAC;YACjC,aAAa;YACb,SAAS,EAAE,MAAM,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;SACxC,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,OAAoB;IAClD,MAAM,UAAU,GAAG,gBAAgB,EAAE,CAAC;IACtC,MAAM,MAAM,GAAG,mBAAmB,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;IACxD,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC,MAAM,
CAAC,CAAC;IAC3C,MAAM,KAAK,GAAG,IAAI,SAAS,EAAE,CAAC;IAC9B,MAAM,KAAK,GAAG,MAAM,cAAc,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAEtD,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS;QAC7B,CAAC,CAAC,cAAc,CAAC,MAAM,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,KAAK,CAAC;QACnF,CAAC,CAAC,MAAM,aAAa,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;IAC/F,MAAM,aAAa,GAAG,QAAQ,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,WAAW,EAAE,KAAK,EAAE,CAAC,CAAC;IACpE,MAAM,aAAa,GAAG,OAAO,CAAC,SAAS,IAAI,iBAAiB,aAAa,OAAO,CAAC;IACjF,MAAM,SAAS,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;IAEtC,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,EAAE,KAAK,EAAE,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;IACrF,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;IACzE,MAAM,KAAK,GAAG,uBAAuB,CAAC,kBAAkB,CAAC,MAAM,CAAC,CAAC,CAAC;IAClE,MAAM,UAAU,GAAG,cAAc,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC;IACjD,MAAM,eAAe,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,KAAK,YAAY,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IAC5G,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,KAAK,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;IAC3F,MAAM,gBAAgB,GAAG,aAAa,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,KAAK,CAAC,QAAQ,GAAG,CAAC,aAAa,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;IAChH,MAAM,OAAO,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAEtD,MAAM,MAAM,GAAG;QACb,KAAK,EAAE;YACL,IAAI,EAAE,KAAK,CAAC,IAAI;YAChB,MAAM,EAAE,OAAO,CAAC,WAAW,IAAI,KAAK,CAAC,UAAU;YAC/C,MAAM,EAAE,KAAK,CAAC,MAAM;YACpB,WAAW,EAAE,KAAK,CAAC,WAAW;YAC9B,MAAM,EAAE,KAAK,CAAC,MAAM;SACrB;QACD,MAAM,EAAE;YACN,YAAY,EAAE,MAAM,CAAC,WAAW;YAChC,cAAc,EAAE,MAAM,CAAC,WAAW;YAClC,YAAY,EAAE,MAAM,CAAC,WAAW;YAChC,cAAc,EAAE,MAAM,CAAC,WAAW;YAClC,
eAAe,EAAE,MAAM,CAAC,cAAc;YACtC,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,WAAW,EAAE,GAAG;YAChB,IAAI,EAAE,kBAAkB;SACzB;QACD,MAAM,EAAE;YACN,SAAS,EAAE,KAAK,CAAC,QAAQ;YACzB,KAAK,EAAE,KAAK,CAAC,IAAI;YACjB,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,eAAe,EAAE,KAAK,CAAC,aAAa;YACpC,aAAa,EAAE,KAAK,CAAC,WAAW;YAChC,cAAc,EAAE,aAAa;YAC7B,mBAAmB,EAAE,gBAAgB;SACtC;QACD,KAAK,EAAE,UAAU;QACjB,eAAe,EAAE;YACf,eAAe,EAAE,aAAa;YAC9B,iBAAiB,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,cAAc,CAAC;SAC7D;QACD,OAAO,EAAE;YACP;gBACE,cAAc,EAAE,MAAM,CAAC,WAAW;gBAClC,QAAQ,EAAE,OAAO;gBACjB,SAAS,EAAE,KAAK,CAAC,QAAQ;gBACzB,OAAO,EAAE,KAAK,CAAC,OAAO;aACvB;SACF;QACD,QAAQ,EAAE,OAAO;KAClB,CAAC;IAEF,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;QACnB,MAAM,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { NvidiaNimClient } from './adapters/nvidia-nim.js';
|
|
2
|
+
import type { JsonCache } from './cache.js';
|
|
3
|
+
import type { NvidiaConfig } from './env.js';
|
|
4
|
+
import type { GeneratedTask } from './types.js';
|
|
5
|
+
/**
 * Input for `generateTasks`.
 *
 * `domain` is the declared domain to generate tasks for; `count` is the
 * number of tasks to return.
 */
export interface TaskGenerationInput {
|
|
6
|
+
domain: string;
|
|
7
|
+
count: number;
|
|
8
|
+
}
|
|
9
|
+
/**
 * Asks the generator model for evaluation tasks in `input.domain` and resolves
 * with at most `input.count` of them, re-numbered `t001`, `t002`, ….
 * Rejects when none of the attempts yields a valid task payload.
 */
export declare function generateTasks(input: TaskGenerationInput, config: NvidiaConfig, client: NvidiaNimClient, cache: JsonCache): Promise<GeneratedTask[]>;
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { hashJson } from './hash.js';
|
|
2
|
+
/**
 * Extracts and parses the JSON value embedded in a model response.
 *
 * The payload is taken from the first `{` or `[` (whichever comes first) up
 * to the last matching closer, so surrounding markdown fences or commentary
 * are ignored. This also covers responses that start with the JSON value but
 * carry trailing text, which a direct `JSON.parse` of the whole response
 * would reject.
 *
 * @param {string} text - Raw response content.
 * @returns {unknown} The parsed JSON value.
 * @throws {Error} If no bracketed span can be located.
 * @throws {SyntaxError} If the located span is not valid JSON.
 */
function extractJsonPayload(text) {
    const trimmed = text.trim();
    const arrayStart = trimmed.indexOf('[');
    const objectStart = trimmed.indexOf('{');
    // Whichever opener appears first decides whether the payload is an object or an array.
    const useObject = objectStart !== -1 && (arrayStart === -1 || objectStart < arrayStart);
    const start = useObject ? objectStart : arrayStart;
    const end = trimmed.lastIndexOf(useObject ? '}' : ']');
    if (start === -1 || end === -1 || end <= start) {
        throw new Error(`Generator did not return JSON: ${trimmed.slice(0, 200)}`);
    }
    return JSON.parse(trimmed.slice(start, end + 1));
}
|
|
19
|
+
/**
 * Checks a parsed generator payload and converts it into rubric tasks.
 *
 * The payload may be an array of tasks or an object with a `tasks` array.
 * Only the first `count` entries are considered. Each entry must be an object
 * with a non-blank `prompt` and a non-blank `criterion` (or `rubric`); both
 * are trimmed. A missing `id` is filled in as `t001`, `t002`, … by position.
 *
 * @param {unknown} value - Parsed generator payload.
 * @param {number} count - Maximum number of entries to keep.
 * @returns {Array<{id: string, prompt: string, criterionType: 'rubric', criterion: string}>}
 * @throws {Error} If the payload has no task array or an entry is malformed.
 */
function validateTasks(value, count) {
    let candidates;
    if (Array.isArray(value)) {
        candidates = value;
    }
    else if (value !== null && typeof value === 'object' && Array.isArray(value.tasks)) {
        candidates = value.tasks;
    }
    if (!candidates) {
        throw new Error('Generated task payload must be an array');
    }
    const toTask = (entry, position) => {
        const isRecord = entry !== null && typeof entry === 'object';
        if (!isRecord) {
            throw new Error(`Generated task ${position} is not an object`);
        }
        const promptText = String(entry.prompt ?? '').trim();
        const criterionText = String(entry.criterion ?? entry.rubric ?? '').trim();
        if (promptText === '' || criterionText === '') {
            throw new Error(`Generated task ${position} is missing prompt or criterion`);
        }
        const fallbackId = `t${String(position + 1).padStart(3, '0')}`;
        return {
            id: String(entry.id ?? fallbackId),
            prompt: promptText,
            criterionType: 'rubric',
            criterion: criterionText
        };
    };
    return candidates.slice(0, count).map(toTask);
}
|
|
46
|
+
/**
 * Returns a shuffled copy of `items` whose order depends only on `seedText`.
 *
 * The first 8 hex characters of `hashJson(seedText)` seed a 32-bit linear
 * congruential generator, which drives a back-to-front swap shuffle.
 * The input is not modified.
 *
 * @param {Array} items - Items to shuffle.
 * @param {string} seedText - Seed; equal seeds give equal orderings.
 * @returns {Array} A new array holding the same items in shuffled order.
 */
function seededShuffle(items, seedText) {
    const seedHex = hashJson(seedText).slice(0, 8);
    let lcgState = parseInt(seedHex, 16) >>> 0;
    const nextFraction = () => {
        // `>>> 0` keeps the state an unsigned 32-bit integer.
        lcgState = (lcgState * 1664525 + 1013904223) >>> 0;
        return lcgState / 0x100000000;
    };
    const shuffled = [...items];
    let cursor = shuffled.length - 1;
    while (cursor > 0) {
        const pick = Math.floor(nextFraction() * (cursor + 1));
        const held = shuffled[cursor];
        shuffled[cursor] = shuffled[pick];
        shuffled[pick] = held;
        cursor -= 1;
    }
    return shuffled;
}
|
|
59
|
+
/**
 * Generates evaluation tasks for a declared domain using the generator model.
 *
 * Twice `input.count` tasks are requested; the validated tasks are shuffled
 * with a seed built from the domain and generator model, cut down to
 * `input.count`, and re-numbered `t001`, `t002`, …. Responses go through
 * `cache.getOrSet`, keyed (among other fields) by the attempt number. Up to
 * three attempts are made; a response that fails extraction or validation is
 * logged and the next attempt is tried.
 *
 * @param {{domain: string, count: number}} input - Domain and number of tasks wanted.
 * @param {object} config - Supplies `generatorModel`.
 * @param {object} client - Completion client exposing `complete`.
 * @param {object} cache - Cache exposing `getOrSet`.
 * @returns {Promise<Array<object>>} The selected tasks.
 * @throws {Error} The last failure once all three attempts have failed.
 */
export async function generateTasks(input, config, client, cache) {
    const generatedCount = input.count * 2;
    const requestTasks = () => client.complete({
        model: config.generatorModel,
        temperature: 0.4,
        maxTokens: 8000,
        responseFormat: 'json_object',
        chatTemplateKwargs: { thinking: false },
        messages: [
            {
                role: 'system',
                content: 'Generate independent evaluation tasks. You only know the declared domain, never skill instructions. Return only valid compact JSON.'
            },
            {
                role: 'user',
                content: `Declared domain:\n${input.domain}\n\nGenerate ${generatedCount} concise tasks. Return exactly {"tasks":[{"id":"t1","prompt":"one concrete task under 80 words","criterion":"one pass/fail rubric under 60 words"}]}. Keep every criterion a single string, not an array. Do not include markdown or commentary.`
            }
        ]
    });
    let lastError;
    let attempt = 0;
    while (attempt < 3) {
        attempt += 1;
        const cacheKey = { model: config.generatorModel, input, generatedCount, promptVersion: 7, attempt };
        const response = await cache.getOrSet('generator', cacheKey, requestTasks);
        try {
            const payload = extractJsonPayload(response.content);
            const generated = validateTasks(payload, generatedCount);
            const shuffled = seededShuffle(generated, `${input.domain}:${config.generatorModel}`);
            const selected = shuffled.slice(0, input.count);
            // Ids are reassigned so the final suite is numbered by its own order.
            return selected.map((task, index) => ({
                ...task,
                id: `t${String(index + 1).padStart(3, '0')}`
            }));
        }
        catch (error) {
            lastError = error;
            console.error(`[eval] generator returned invalid JSON on attempt ${attempt}/3`);
        }
    }
    if (lastError instanceof Error) {
        throw lastError;
    }
    throw new Error('Generator did not return valid tasks after retries');
}
|
|
94
|
+
//# sourceMappingURL=generate.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generate.js","sourceRoot":"","sources":["../../packages/cli/src/generate.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAQrC,SAAS,kBAAkB,CAAC,IAAY;IACtC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACvD,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;IAED,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IACxC,MAAM,QAAQ,GAAG,OAAO,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,WAAW,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IACzC,MAAM,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAC3C,MAAM,SAAS,GAAG,WAAW,KAAK,CAAC,CAAC,IAAI,CAAC,UAAU,KAAK,CAAC,CAAC,IAAI,WAAW,GAAG,UAAU,CAAC,CAAC;IACxF,MAAM,KAAK,GAAG,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,UAAU,CAAC;IACnD,MAAM,GAAG,GAAG,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC;IAC7C,IAAI,KAAK,KAAK,CAAC,CAAC,IAAI,GAAG,KAAK,CAAC,CAAC,IAAI,GAAG,IAAI,KAAK,EAAE,CAAC;QAC/C,MAAM,IAAI,KAAK,CAAC,kCAAkC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;IAC7E,CAAC;IACD,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;AACnD,CAAC;AAED,SAAS,aAAa,CAAC,KAAc,EAAE,KAAa;IAClD,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC;QAChC,CAAC,CAAC,KAAK;QACP,CAAC,CAAC,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,CAAC,OAAO,CAAE,KAA6B,CAAC,KAAK,CAAC;YAClG,CAAC,CAAE,KAA8B,CAAC,KAAK;YACvC,CAAC,CAAC,SAAS,CAAC;IAChB,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE;QAC/C,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;YAC9C,MAAM,IAAI,KAAK,CAAC,kBAAkB,KAAK,mBAAmB,CAAC,CAAC;QAC9D,CAAC;QACD,MAAM,IAAI,GAAG,IAA+B,CAAC;QAC7C,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QAChD,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACrE,IAAI,CAAC,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,kBAAkB,KAAK,
iCAAiC,CAAC,CAAC;QAC5E,CAAC;QACD,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,IAAI,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;YAC/D,MAAM;YACN,aAAa,EAAE,QAAQ;YACvB,SAAS;SACV,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,aAAa,CAAI,KAAU,EAAE,QAAgB;IACpD,IAAI,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;IAC/D,MAAM,MAAM,GAAG,GAAG,EAAE;QAClB,KAAK,GAAG,CAAC,KAAK,GAAG,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;QAC7C,OAAO,KAAK,GAAG,WAAW,CAAC;IAC7B,CAAC,CAAC;IACF,MAAM,IAAI,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;IACxB,KAAK,IAAI,KAAK,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,EAAE,CAAC;QACxD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC;QACrD,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAE,EAAE,IAAI,CAAC,KAAK,CAAE,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,KAA0B,EAC1B,MAAoB,EACpB,MAAuB,EACvB,KAAgB;IAEhB,MAAM,cAAc,GAAG,KAAK,CAAC,KAAK,GAAG,CAAC,CAAC;IACvC,IAAI,SAAkB,CAAC;IACvB,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,CAAC,EAAE,OAAO,IAAI,CAAC,EAAE,CAAC;QACjD,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,QAAQ,CACnC,WAAW,EACX,EAAE,KAAK,EAAE,MAAM,CAAC,cAAc,EAAE,KAAK,EAAE,cAAc,EAAE,aAAa,EAAE,CAAC,EAAE,OAAO,EAAE,EAClF,GAAG,EAAE,CACH,MAAM,CAAC,QAAQ,CAAC;YACd,KAAK,EAAE,MAAM,CAAC,cAAc;YAC5B,WAAW,EAAE,GAAG;YAChB,SAAS,EAAE,IAAI;YACf,cAAc,EAAE,aAAa;YAC7B,kBAAkB,EAAE,EAAE,QAAQ,EAAE,KAAK,EAAE;YACvC,QAAQ,EAAE;gBACR;oBACE,IAAI,EAAE,QAAQ;oBACd,OAAO,EAAE,qIAAqI;iBAC/I;gBACD;oBACE,IAAI,EAAE,MAAM;oBACZ,OAAO,EAAE,qBAAqB,KAAK,CAAC,MAAM,gBAAgB,cAAc,kPAAkP;iBAC3T;aACF;SACF,CAAC,CACL,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,aAAa,CAAC,kBAAkB,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,cAAc,CAAC,CAAC;YACtF,OAAO,aAAa,CAAC,SAAS,EAAE,GAAG,KAAK,CAAC,MAAM,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC,CAAC;gBACtH,GAAG,IAAI;gBACP,EAAE,EAAE,IAAI,
MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;aAC7C,CAAC,CAAC,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,SAAS,GAAG,KAAK,CAAC;YAClB,OAAO,CAAC,KAAK,CAAC,qDAAqD,OAAO,IAAI,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,MAAM,SAAS,YAAY,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,oDAAoD,CAAC,CAAC;AACjH,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { NvidiaNimClient } from './adapters/nvidia-nim.js';
|
|
2
|
+
import type { JsonCache } from './cache.js';
|
|
3
|
+
import type { NvidiaConfig } from './env.js';
|
|
4
|
+
import type { GeneratedTask, GradedOutput, TrialOutput } from './types.js';
|
|
5
|
+
/**
 * Grades every trial output against its task's criterion and resolves with the
 * graded outputs in the same order as `outputs`.
 * Rejects when an output refers to an unknown task or the grader never
 * returns a usable grade.
 */
export declare function gradeOutputs(tasks: GeneratedTask[], outputs: TrialOutput[], config: NvidiaConfig, client: NvidiaNimClient, cache: JsonCache): Promise<GradedOutput[]>;
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import { gradeDeterministically } from './deterministic.js';
|
|
2
|
+
import { hashJson } from './hash.js';
|
|
3
|
+
/**
 * Turns a grader response into `{ score, reason }` with `score` in [0, 1].
 *
 * Resolution order:
 * 1. A response starting with `{` is parsed as JSON (up to its last `}`) and
 *    must carry a numeric `score`; otherwise an error is thrown.
 * 2. A JSON object embedded in other text (e.g. inside a markdown fence) is
 *    used when it parses and carries a numeric `score`.
 * 3. Free text: an explicit `score: N` / `grade = N` wins; otherwise a negated
 *    or failing verdict ("does not pass", "fails") scores 0; otherwise the
 *    words pass/passes/meets score 1; anything else scores 0.
 *
 * Steps 2 and 3 stop negative or fenced responses being graded as a pass
 * merely because they contain a word such as "pass".
 *
 * @param {string} text - Raw grader response content.
 * @returns {{score: number, reason: string}} The clamped score and a reason.
 * @throws {Error} If a response starting with `{` is unclosed, is not valid
 *   JSON, or lacks a numeric `score`.
 */
function parseGrade(text) {
    const trimmed = text.trim();
    const clamp = (value) => Math.max(0, Math.min(1, value));
    if (trimmed.startsWith('{')) {
        const end = trimmed.lastIndexOf('}');
        if (end === -1) {
            throw new Error('Grader JSON object was not closed');
        }
        const parsed = JSON.parse(trimmed.slice(0, end + 1));
        const score = Number(parsed.score);
        if (!Number.isFinite(score)) {
            throw new Error('Grader JSON missing numeric score');
        }
        return {
            score: clamp(score),
            reason: String(parsed.reason ?? '')
        };
    }
    // A JSON verdict wrapped in a fence or commentary is still authoritative.
    const objectStart = trimmed.indexOf('{');
    const objectEnd = trimmed.lastIndexOf('}');
    if (objectStart !== -1 && objectEnd > objectStart) {
        try {
            const parsed = JSON.parse(trimmed.slice(objectStart, objectEnd + 1));
            const score = Number(parsed.score);
            if (Number.isFinite(score)) {
                return {
                    score: clamp(score),
                    reason: String(parsed.reason ?? '')
                };
            }
        }
        catch {
            // Not a JSON object after all; fall back to the text heuristics below.
        }
    }
    const lower = trimmed.toLowerCase();
    const explicit = /\b(?:score|grade)\s*[:=]\s*(\d+(?:\.\d+)?)/.exec(lower);
    const negated = /(?:\bnot|n't|\bnever|\bcannot)\s+(?:\w+\s+){0,2}(?:pass|passes|meet|meets)\b/.test(lower);
    const failing = /\b(?:fail|fails|failed)\b/.test(lower);
    const passing = /\b(pass|passes|meets)\b/.test(lower);
    let score;
    if (explicit) {
        score = clamp(Number(explicit[1]));
    }
    else if (negated || failing) {
        // Ambiguous or negative wording is graded conservatively as a fail.
        score = 0;
    }
    else {
        score = passing ? 1 : 0;
    }
    return {
        score,
        reason: `non-json grader response: ${trimmed.slice(0, 160)}`
    };
}
|
|
27
|
+
/**
 * Deterministically shuffles a copy of `items`; the order is fully determined
 * by `seedText`.
 *
 * A 32-bit linear congruential generator, seeded from the first 8 hex
 * characters of `hashJson(seedText)`, supplies the random numbers for a
 * swap shuffle that walks from the last index down to 1. `items` itself is
 * left untouched.
 *
 * @param {Array} items - Items to shuffle.
 * @param {string} seedText - Seed text.
 * @returns {Array} New array with the items in shuffled order.
 */
function seededShuffle(items, seedText) {
    let generatorState = parseInt(hashJson(seedText).slice(0, 8), 16) >>> 0;
    function drawFraction() {
        // Unsigned 32-bit wrap-around after each step.
        generatorState = (generatorState * 1664525 + 1013904223) >>> 0;
        return generatorState / 0x100000000;
    }
    const result = [...items];
    for (let position = result.length - 1; position >= 1; position -= 1) {
        const partner = Math.floor(drawFraction() * (position + 1));
        const displaced = result[position];
        result[position] = result[partner];
        result[partner] = displaced;
    }
    return result;
}
|
|
40
|
+
/**
 * Grades every trial output against the criterion of its task.
 *
 * Outputs are graded in a seeded shuffled order (seed derived from all
 * transcript hashes) but returned in the original `outputs` order.
 * Deterministic tasks are graded locally with `gradeDeterministically`;
 * rubric tasks are sent to the grader model through `cache.getOrSet`, with up
 * to three attempts when a response cannot be parsed. A rubric grade passes
 * when its score is at least 0.5.
 *
 * Grades are stored per output object rather than per `transcriptHash`.
 * NOTE(review): SOURCE does not show what `transcriptHash` covers; if two
 * outputs can share a hash, keying by hash would hand both the record of
 * whichever was graded last (wrong arm/trial). Identity keys avoid that and
 * behave the same when hashes are unique.
 *
 * @param {Array<object>} tasks - Tasks with `id`, `criterionType`, `criterion`.
 * @param {Array<object>} outputs - Trial outputs with `taskId`, `trial`,
 *   `output` and `transcriptHash`.
 * @param {object} config - Supplies `graderModel`.
 * @param {object} client - Completion client exposing `complete`.
 * @param {object} cache - Cache exposing `getOrSet`.
 * @returns {Promise<Array<object>>} Each output extended with its grade.
 * @throws {Error} If an output references an unknown task, or the grader
 *   response cannot be parsed after three attempts.
 */
export async function gradeOutputs(tasks, outputs, config, client, cache) {
    const taskById = new Map(tasks.map((task) => [task.id, task]));
    const shuffled = seededShuffle(outputs, hashJson(outputs.map((output) => output.transcriptHash)));
    // Keyed by the output object itself; `shuffled` holds the same references as `outputs`.
    const graded = new Map();
    for (const output of shuffled) {
        const task = taskById.get(output.taskId);
        if (!task) {
            throw new Error(`Missing task for output ${output.taskId}`);
        }
        if (task.criterionType === 'deterministic') {
            const grade = gradeDeterministically(task, output.output);
            graded.set(output, {
                ...output,
                ...grade
            });
            continue;
        }
        console.error(`[eval] grade ${output.taskId} trial ${output.trial}`);
        let grade;
        let lastError;
        for (let attempt = 1; attempt <= 3; attempt += 1) {
            // `attempt` is part of the cache key, so a retry is not answered
            // from the cached response of the failed attempt.
            const response = await cache.getOrSet('grader', {
                model: config.graderModel,
                criterion: task.criterion,
                output: output.output,
                promptVersion: 6,
                responseFormat: 'json_object',
                attempt
            }, () => client.complete({
                model: config.graderModel,
                temperature: 0,
                maxTokens: 240,
                responseFormat: 'json_object',
                chatTemplateKwargs: { thinking: false },
                messages: [
                    {
                        role: 'system',
                        content: 'You are a blind evaluator. Grade only the provided output against the success criterion. You do not know which experimental arm produced it. Do not explain your reasoning. Return only valid JSON.'
                    },
                    {
                        role: 'user',
                        content: `Success criterion:\n${task.criterion}\n\nOutput to grade:\n${output.output}\n\nReturn exactly one JSON object with this shape: {"score":0,"reason":"brief reason"}. Use score 1 only if the output satisfies the criterion; otherwise use 0. Do not include markdown or commentary.`
                    }
                ]
            }));
            try {
                grade = parseGrade(response.content);
                break;
            }
            catch (error) {
                lastError = error;
                console.error(`[eval] grader returned invalid JSON on attempt ${attempt}/3`);
            }
        }
        if (!grade) {
            throw lastError instanceof Error ? lastError : new Error('Grader did not return a valid grade after retries');
        }
        graded.set(output, {
            ...output,
            score: grade.score,
            reason: grade.reason,
            pass: grade.score >= 0.5
        });
    }
    // Restore the caller's ordering; the shuffle only affects grading order.
    return outputs.map((output) => {
        const item = graded.get(output);
        if (!item) {
            throw new Error(`Missing grade for ${output.transcriptHash}`);
        }
        return item;
    });
}
|
|
112
|
+
//# sourceMappingURL=grade.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"grade.js","sourceRoot":"","sources":["../../packages/cli/src/grade.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,sBAAsB,EAAE,MAAM,oBAAoB,CAAC;AAC5D,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAQrC,SAAS,UAAU,CAAC,IAAY;IAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,OAAO,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QACrC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;QACD,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,GAAG,CAAC,CAAC,CAA4B,CAAC;QAChF,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACnC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;QACD,OAAO;YACL,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;YACtC,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;SACpC,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,KAAK,GAAG,8BAA8B,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,yBAAyB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1G,OAAO;QACL,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;QACtC,MAAM,EAAE,6BAA6B,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE;KAC7D,CAAC;AACJ,CAAC;AAED,SAAS,aAAa,CAAI,KAAU,EAAE,QAAgB;IACpD,IAAI,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;IAC/D,MAAM,MAAM,GAAG,GAAG,EAAE;QAClB,KAAK,GAAG,CAAC,KAAK,GAAG,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;QAC7C,OAAO,KAAK,GAAG,WAAW,CAAC;IAC7B,CAAC,CAAC;IACF,MAAM,IAAI,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;IACxB,KAAK,IAAI,KAAK,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,EAAE,CAAC;QACxD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC;QACrD,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAE,EAAE,IAAI,CAAC,KAAK,CAAE,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,CA
AC,KAAK,UAAU,YAAY,CAChC,KAAsB,EACtB,OAAsB,EACtB,MAAoB,EACpB,MAAuB,EACvB,KAAgB;IAEhB,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,EAAE,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC;IAC/D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,EAAE,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC;IAClG,MAAM,MAAM,GAAG,IAAI,GAAG,EAAwB,CAAC;IAE/C,KAAK,MAAM,MAAM,IAAI,QAAQ,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QACzC,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,MAAM,IAAI,KAAK,CAAC,2BAA2B,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,IAAI,IAAI,CAAC,aAAa,KAAK,eAAe,EAAE,CAAC;YAC3C,MAAM,KAAK,GAAG,sBAAsB,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;YAC1D,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,cAAc,EAAE;gBAChC,GAAG,MAAM;gBACT,GAAG,KAAK;aACT,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,gBAAgB,MAAM,CAAC,MAAM,UAAU,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;QACrE,IAAI,KAA+B,CAAC;QACpC,IAAI,SAAkB,CAAC;QACvB,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,CAAC,EAAE,OAAO,IAAI,CAAC,EAAE,CAAC;YACjD,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,QAAQ,CACnC,QAAQ,EACR;gBACE,KAAK,EAAE,MAAM,CAAC,WAAW;gBACzB,SAAS,EAAE,IAAI,CAAC,SAAS;gBACzB,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,aAAa,EAAE,CAAC;gBAChB,cAAc,EAAE,aAAa;gBAC7B,OAAO;aACR,EACD,GAAG,EAAE,CACH,MAAM,CAAC,QAAQ,CAAC;gBACd,KAAK,EAAE,MAAM,CAAC,WAAW;gBACzB,WAAW,EAAE,CAAC;gBACd,SAAS,EAAE,GAAG;gBACd,cAAc,EAAE,aAAa;gBAC7B,kBAAkB,EAAE,EAAE,QAAQ,EAAE,KAAK,EAAE;gBACvC,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO,EACL,qMAAqM;qBACxM;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE,uBAAuB,IAAI,CAAC,SAAS,yBAAyB,MAAM,CAAC,MAAM,0MAA0M;qBAC/R;iBACF;aACF,CAAC,CACL,CAAC;YAEF,IAAI,CAAC;gBACH,KAAK,GAAG,UAAU,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;gBACrC,MAAM;YACR,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,SAAS,GAAG,KAAK,CAAC;gBAClB,OAAO,CAAC,KAAK,CAAC,kDAAkD,OAAO,IAAI,CAAC,CAAC;YAC/E,CAAC;QACH,CAAC;QACD,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,SAAS,YAAY,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,mDAAmD,CAAC,CAAC;QAChH,CAAC;QACD,MAAM,CAAC,GAAG,CAAC,MAAM,CAA
C,cAAc,EAAE;YAChC,GAAG,MAAM;YACT,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,MAAM,EAAE,KAAK,CAAC,MAAM;YACpB,IAAI,EAAE,KAAK,CAAC,KAAK,IAAI,GAAG;SACzB,CAAC,CAAC;IACL,CAAC;IAED,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE;QAC5B,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC;QAC/C,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,MAAM,IAAI,KAAK,CAAC,qBAAqB,MAAM,CAAC,cAAc,EAAE,CAAC,CAAC;QAChE,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/dist/src/hash.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hash.js","sourceRoot":"","sources":["../../packages/cli/src/hash.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzC,MAAM,UAAU,MAAM,CAAC,KAAa;IAClC,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC1D,CAAC;AAED,MAAM,UAAU,QAAQ,CAAC,KAAc;IACrC,OAAO,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC;AACvC,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
 * One hardcoded M0 classification task.
 *
 * - `id`: task identifier (the bundled tasks use ids such as 'm0-001').
 * - `prompt`: the text sent to the runner as the user message.
 * - `expected`: the verdict the runner's reply is graded against.
 */
export interface M0Task {
|
|
2
|
+
id: string;
|
|
3
|
+
prompt: string;
|
|
4
|
+
expected: 'VALID' | 'INVALID';
|
|
5
|
+
}
|
|
6
|
+
/** Skill text for the canary SKU checksum rule; the literal type below is the exact string value. */
export declare const M0_SKILL = "For SkillCheck Canary SKU classification tasks, apply this private rule exactly.\n\nA candidate is VALID only when all conditions hold:\n1. It has the exact form SC- followed by four digits, followed by a hyphen, followed by one lowercase checksum letter.\n2. Sum the four digits.\n3. The checksum letter must be q when the digit sum modulo 3 is 0, r when it is 1, and s when it is 2.\n\nFor every task, answer exactly VALID or INVALID. Do not explain.";
|
|
7
|
+
/** The hardcoded list of M0 classification tasks. */
export declare const M0_TASKS: M0Task[];
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
 * Skill text describing the private canary SKU checksum rule.
 *
 * Written as one entry per line of the prompt; empty entries are the blank
 * lines separating the intro, the numbered rule and the answer-format line.
 */
export const M0_SKILL = [
  'For SkillCheck Canary SKU classification tasks, apply this private rule exactly.',
  '',
  'A candidate is VALID only when all conditions hold:',
  '1. It has the exact form SC- followed by four digits, followed by a hyphen, followed by one lowercase checksum letter.',
  '2. Sum the four digits.',
  '3. The checksum letter must be q when the digit sum modulo 3 is 0, r when it is 1, and s when it is 2.',
  '',
  'For every task, answer exactly VALID or INVALID. Do not explain.'
].join('\n');
|
|
9
|
+
/**
 * Builds one task record with the key order `id`, `prompt`, `expected`.
 */
const defineTask = (id, expected, prompt) => ({ id, prompt, expected });

/**
 * Hardcoded M0 classification tasks.
 *
 * Each prompt embeds one candidate SKU. The expected verdicts follow the
 * checksum rule (digit sum modulo 3 -> q / r / s); 'SC-12A2-s' is INVALID
 * because its body is not four digits.
 */
export const M0_TASKS = [
  defineTask('m0-001', 'VALID', 'Under the private SkillCheck Canary SKU rule, classify this candidate. Candidate: SC-1200-q. Answer exactly VALID or INVALID.'),
  defineTask('m0-002', 'VALID', 'Under the private SkillCheck Canary SKU rule, classify this candidate. Candidate: SC-1201-r. Answer exactly VALID or INVALID.'),
  defineTask('m0-003', 'VALID', 'Under the private SkillCheck Canary SKU rule, classify this candidate. Candidate: SC-1202-s. Answer exactly VALID or INVALID.'),
  defineTask('m0-004', 'VALID', 'Under the private SkillCheck Canary SKU rule, classify this candidate. Candidate: SC-9999-q. Answer exactly VALID or INVALID.'),
  defineTask('m0-005', 'INVALID', 'Under the private SkillCheck Canary SKU rule, classify this candidate. Candidate: SC-1200-r. Answer exactly VALID or INVALID.'),
  defineTask('m0-006', 'INVALID', 'Under the private SkillCheck Canary SKU rule, classify this candidate. Candidate: SC-1202-q. Answer exactly VALID or INVALID.'),
  defineTask('m0-007', 'INVALID', 'Under the private SkillCheck Canary SKU rule, classify this candidate. Candidate: SC-12A2-s. Answer exactly VALID or INVALID.'),
  defineTask('m0-008', 'VALID', 'Under the private SkillCheck Canary SKU rule, classify this candidate. Candidate: SC-5555-s. Answer exactly VALID or INVALID.')
];
|
|
51
|
+
//# sourceMappingURL=hardcoded.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hardcoded.js","sourceRoot":"","sources":["../../../packages/cli/src/m0/hardcoded.ts"],"names":[],"mappings":"AAMA,MAAM,CAAC,MAAM,QAAQ,GAAG;;;;;;;iEAOyC,CAAC;AAElE,MAAM,CAAC,MAAM,QAAQ,GAAa;IAChC;QACE,EAAE,EAAE,QAAQ;QACZ,MAAM,EAAE,+HAA+H;QACvI,QAAQ,EAAE,OAAO;KAClB;IACD;QACE,EAAE,EAAE,QAAQ;QACZ,MAAM,EAAE,+HAA+H;QACvI,QAAQ,EAAE,OAAO;KAClB;IACD;QACE,EAAE,EAAE,QAAQ;QACZ,MAAM,EAAE,+HAA+H;QACvI,QAAQ,EAAE,OAAO;KAClB;IACD;QACE,EAAE,EAAE,QAAQ;QACZ,MAAM,EAAE,+HAA+H;QACvI,QAAQ,EAAE,OAAO;KAClB;IACD;QACE,EAAE,EAAE,QAAQ;QACZ,MAAM,EAAE,+HAA+H;QACvI,QAAQ,EAAE,SAAS;KACpB;IACD;QACE,EAAE,EAAE,QAAQ;QACZ,MAAM,EAAE,+HAA+H;QACvI,QAAQ,EAAE,SAAS;KACpB;IACD;QACE,EAAE,EAAE,QAAQ;QACZ,MAAM,EAAE,+HAA+H;QACvI,QAAQ,EAAE,SAAS;KACpB;IACD;QACE,EAAE,EAAE,QAAQ;QACZ,MAAM,EAAE,+HAA+H;QACvI,QAAQ,EAAE,OAAO;KAClB;CACF,CAAC"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import type { NvidiaNimClient } from '../adapters/nvidia-nim.js';
|
|
2
|
+
import { loadNvidiaConfig } from '../env.js';
|
|
3
|
+
import { type PairedObservation, type ScoreSummary } from '../score.js';
|
|
4
|
+
import { type M0Task } from './hardcoded.js';
|
|
5
|
+
/**
 * Outcome of a single runner call for one task, trial and arm.
 *
 * - `taskId`: id of the task that was run.
 * - `trial`: trial number (the gate counts trials from 1).
 * - `arm`: whether the call was made with or without the skill text.
 * - `expected`: the task's expected verdict.
 * - `output`: the runner's reply with surrounding whitespace trimmed.
 * - `pass`: whether the reply was accepted as matching `expected`.
 */
interface ArmResult {
|
|
6
|
+
taskId: string;
|
|
7
|
+
trial: number;
|
|
8
|
+
arm: 'with_skill' | 'no_skill';
|
|
9
|
+
expected: M0Task['expected'];
|
|
10
|
+
output: string;
|
|
11
|
+
pass: boolean;
|
|
12
|
+
}
|
|
13
|
+
/**
 * Result of one full pass over all M0 tasks and trials for a given skill.
 *
 * - `runId`: identifier of the run (also used to derive the scoring seed).
 * - `skillLabel`: human-readable label of the skill under test.
 * - `score`: summary computed from `observations`.
 * - `observations`: one with-skill / no-skill pass pair per task and trial.
 * - `armResults`: the individual arm results behind the observations.
 */
interface M0RunResult {
|
|
14
|
+
runId: string;
|
|
15
|
+
skillLabel: string;
|
|
16
|
+
score: ScoreSummary;
|
|
17
|
+
observations: PairedObservation[];
|
|
18
|
+
armResults: ArmResult[];
|
|
19
|
+
}
|
|
20
|
+
/**
 * Report returned by the M0 gate.
 *
 * - `config`: the runner model, trials per task, task count and sampling
 *   temperature the gate ran with.
 * - `repeatability`: the repeated runs of the canary skill and whether all
 *   of them passed the repeatability check.
 * - `emptyControl`: the run with an empty skill and whether it passed the
 *   control check.
 * - `passed`: true only when both `repeatability.passed` and
 *   `emptyControl.passed` are true.
 */
export interface M0GateReport {
|
|
21
|
+
config: {
|
|
22
|
+
runnerModel: string;
|
|
23
|
+
trials: number;
|
|
24
|
+
tasks: number;
|
|
25
|
+
temperature: number;
|
|
26
|
+
};
|
|
27
|
+
repeatability: {
|
|
28
|
+
passed: boolean;
|
|
29
|
+
runs: M0RunResult[];
|
|
30
|
+
};
|
|
31
|
+
emptyControl: {
|
|
32
|
+
passed: boolean;
|
|
33
|
+
run: M0RunResult;
|
|
34
|
+
};
|
|
35
|
+
passed: boolean;
|
|
36
|
+
}
|
|
37
|
+
/**
 * Runs the M0 gate and resolves with its report.
 *
 * @param clientFactory Receives the loaded NVIDIA config and returns the client used for every runner call.
 */
export declare function runM0Gate(clientFactory: (config: ReturnType<typeof loadNvidiaConfig>) => NvidiaNimClient): Promise<M0GateReport>;
|
|
38
|
+
export {};
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import { loadNvidiaConfig } from '../env.js';
|
|
2
|
+
import { ciOverlapsZero, effectInsideCi, scorePairedObservations } from '../score.js';
|
|
3
|
+
import { M0_SKILL, M0_TASKS } from './hardcoded.js';
|
|
4
|
+
// Trials per task: runSkillOnce loops trial = 1..K_TRIALS, and the gate report echoes it as `trials`.
const K_TRIALS = 3;
|
|
5
|
+
// Sampling temperature passed to every completion request and echoed in the gate report.
const TEMPERATURE = 0.7;
|
|
6
|
+
// Value passed as `maxTokens` on every completion request.
const MAX_TOKENS = 16;
|
|
7
|
+
/**
 * Assembles the chat messages for one runner call.
 *
 * @param {string} skill Skill text; when empty or whitespace-only, no system message is added.
 * @param {string} prompt Task prompt, sent unchanged as the user message.
 * @returns {{role: string, content: string}[]} A user-only conversation, or a
 *   system message carrying the (untrimmed) skill text followed by the user message.
 */
function buildMessages(skill, prompt) {
  const userMessage = { role: 'user', content: prompt };

  // No skill to inject: the no-skill arm sees only the task prompt.
  if (skill.trim().length === 0) {
    return [userMessage];
  }

  const systemMessage = {
    role: 'system',
    content: `You are the runner in an A/B skill evaluation. Follow any skill instructions below when they apply.\n\n${skill}`
  };
  return [systemMessage, userMessage];
}
|
|
19
|
+
/**
 * Decides whether a runner reply counts as the expected verdict.
 *
 * The reply must consist of the expected text only, compared
 * case-insensitively, with optional surrounding whitespace and at most one
 * trailing period.
 *
 * @param {string} output Raw runner reply.
 * @param {string} expected Expected verdict; regex metacharacters are matched literally.
 * @returns {boolean} True when the reply matches.
 */
function outputPasses(output, expected) {
  // Escape regex metacharacters so the verdict is matched as plain text.
  const literal = expected.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const verdictPattern = new RegExp(`^\\s*${literal}\\s*\\.?\\s*$`, 'i');
  return verdictPattern.test(output);
}
|
|
23
|
+
/**
 * Runs one arm of one trial: sends a single completion request and grades the reply.
 *
 * @param client Object exposing `complete(request)`, which resolves to a response with a string `content`.
 * @param {string} model Model name forwarded in the request.
 * @param {string} skill Skill text for this arm (empty for the no-skill arm).
 * @param task Task providing `id`, `prompt` and `expected`.
 * @param {number} trial Trial number, copied into the result.
 * @param {string} arm Arm label, copied into the result.
 * @returns {Promise<object>} The arm result: task id, trial, arm, expected verdict,
 *   trimmed output and pass flag.
 */
async function runArm(client, model, skill, task, trial, arm) {
  const request = {
    model,
    messages: buildMessages(skill, task.prompt),
    temperature: TEMPERATURE,
    maxTokens: MAX_TOKENS
  };
  const completion = await client.complete(request);

  // The stored output is trimmed; grading sees the raw reply (the matcher tolerates whitespace).
  const output = completion.content.trim();
  const pass = outputPasses(completion.content, task.expected);

  return {
    taskId: task.id,
    trial,
    arm,
    expected: task.expected,
    output,
    pass
  };
}
|
|
39
|
+
/**
 * Runs every M0 task for K_TRIALS trials, pairing a with-skill call against a
 * no-skill call each time, and scores the paired outcomes.
 *
 * Calls are made strictly one after another (with-skill first), and progress
 * is logged to stderr before each call.
 *
 * @param client Client forwarded to runArm.
 * @param {string} model Runner model name.
 * @param {string} skill Skill text used for the with-skill arm.
 * @param {string} skillLabel Label copied into the result.
 * @param {string} runId Run identifier; also hashed to seed the scoring.
 * @returns {Promise<object>} Run id, skill label, score, paired observations and raw arm results.
 */
async function runSkillOnce(client, model, skill, skillLabel, runId) {
  const observations = [];
  const armResults = [];
  const trialNumbers = Array.from({ length: K_TRIALS }, (_, offset) => offset + 1);

  for (const task of M0_TASKS) {
    for (const trial of trialNumbers) {
      console.error(`[m0] ${runId} ${task.id} trial ${trial}/${K_TRIALS} with_skill`);
      const skilled = await runArm(client, model, skill, task, trial, 'with_skill');

      console.error(`[m0] ${runId} ${task.id} trial ${trial}/${K_TRIALS} no_skill`);
      const baseline = await runArm(client, model, '', task, trial, 'no_skill');

      armResults.push(skilled, baseline);
      observations.push({
        withSkillPass: skilled.pass,
        noSkillPass: baseline.pass
      });
    }
  }

  // 1000 is presumably the resample count for scoring -- TODO confirm against score.js.
  const score = scorePairedObservations(observations, 1000, hashSeed(runId));
  return { runId, skillLabel, score, observations, armResults };
}
|
|
63
|
+
/**
 * Derives a deterministic unsigned 32-bit seed from a string.
 *
 * Uses the FNV-1a 32-bit offset basis and prime, applying xor-then-multiply
 * once per Unicode code point. For ASCII input this equals the standard
 * FNV-1a 32-bit hash of the bytes.
 *
 * @param {string} value Text to hash (e.g. a run id).
 * @returns {number} Integer in the range [0, 2^32).
 */
function hashSeed(value) {
  let hash = 2166136261; // FNV 32-bit offset basis (0x811c9dc5)
  for (const char of value) {
    // for...of yields whole code points, so read the whole code point:
    // charCodeAt(0) would see only the high surrogate of an astral character
    // and make distinct characters collide.
    hash ^= char.codePointAt(0);
    hash = Math.imul(hash, 16777619); // FNV 32-bit prime (0x01000193)
  }
  // Reinterpret the signed 32-bit result as unsigned.
  return hash >>> 0;
}
|
|
71
|
+
/**
 * Runs the M0 gate: three repeatability runs of the canary skill followed by
 * one empty-skill control run, all sequential and sharing one client.
 *
 * Repeatability passes when `effectInsideCi` holds for every repeat run's
 * score; the empty control passes when `ciOverlapsZero` holds for its score.
 * The gate passes only when both do.
 *
 * @param {Function} clientFactory Called once with the loaded NVIDIA config; returns the client.
 * @returns {Promise<object>} Gate report with config, repeatability, emptyControl and overall `passed`.
 */
export async function runM0Gate(clientFactory) {
  const nvidiaConfig = loadNvidiaConfig();
  const nimClient = clientFactory(nvidiaConfig);
  // One timestamp shared by every run id produced by this gate invocation.
  const runStamp = new Date().toISOString();

  const repeatabilityRuns = [];
  for (const index of [1, 2, 3]) {
    console.error(`[m0] starting repeatability run ${index}/3`);
    const repeatRun = await runSkillOnce(
      nimClient,
      nvidiaConfig.runnerModel,
      M0_SKILL,
      'canary-sku-skill',
      `${runStamp}-repeat-${index}`
    );
    repeatabilityRuns.push(repeatRun);
  }

  console.error('[m0] starting empty-control run');
  const emptyControlRun = await runSkillOnce(
    nimClient,
    nvidiaConfig.runnerModel,
    '',
    'empty-skill',
    `${runStamp}-empty-control`
  );

  const repeatabilityPassed = repeatabilityRuns.every((run) => effectInsideCi(run.score));
  const emptyControlPassed = ciOverlapsZero(emptyControlRun.score);

  const config = {
    runnerModel: nvidiaConfig.runnerModel,
    trials: K_TRIALS,
    tasks: M0_TASKS.length,
    temperature: TEMPERATURE
  };
  const repeatability = { passed: repeatabilityPassed, runs: repeatabilityRuns };
  const emptyControl = { passed: emptyControlPassed, run: emptyControlRun };

  return {
    config,
    repeatability,
    emptyControl,
    passed: repeatabilityPassed && emptyControlPassed
  };
}
|
|
102
|
+
//# sourceMappingURL=run.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run.js","sourceRoot":"","sources":["../../../packages/cli/src/m0/run.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,gBAAgB,EAAE,MAAM,WAAW,CAAC;AAC7C,OAAO,EACL,cAAc,EACd,cAAc,EACd,uBAAuB,EAGxB,MAAM,aAAa,CAAC;AACrB,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAe,MAAM,gBAAgB,CAAC;AAEjE,MAAM,QAAQ,GAAG,CAAC,CAAC;AACnB,MAAM,WAAW,GAAG,GAAG,CAAC;AACxB,MAAM,UAAU,GAAG,EAAE,CAAC;AAqCtB,SAAS,aAAa,CAAC,KAAa,EAAE,MAAc;IAClD,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;QAClB,OAAO,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;IAC7C,CAAC;IAED,OAAO;QACL;YACE,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,0GAA0G,KAAK,EAAE;SAC3H;QACD,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;KAClC,CAAC;AACJ,CAAC;AAED,SAAS,YAAY,CAAC,MAAc,EAAE,QAA4B;IAChE,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC;IAChE,OAAO,IAAI,MAAM,CAAC,QAAQ,OAAO,eAAe,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AACtE,CAAC;AAED,KAAK,UAAU,MAAM,CACnB,MAAuB,EACvB,KAAa,EACb,KAAa,EACb,IAAY,EACZ,KAAa,EACb,GAAqB;IAErB,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC;QACrC,KAAK;QACL,QAAQ,EAAE,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC;QAC3C,WAAW,EAAE,WAAW;QACxB,SAAS,EAAE,UAAU;KACtB,CAAC,CAAC;IAEH,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,KAAK;QACL,GAAG;QACH,QAAQ,EAAE,IAAI,CAAC,QAAQ;QACvB,MAAM,EAAE,QAAQ,CAAC,OAAO,CAAC,IAAI,EAAE;QAC/B,IAAI,EAAE,YAAY,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,QAAQ,CAAC;KACpD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,YAAY,CACzB,MAAuB,EACvB,KAAa,EACb,KAAa,EACb,UAAkB,EAClB,KAAa;IAEb,MAAM,UAAU,GAAgB,EAAE,CAAC;IACnC,MAAM,YAAY,GAAwB,EAAE,CAAC;IAE7C,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,QAAQ,EAAE,KAAK,IAAI,CAAC,EAAE,CAAC;YAClD,OAAO,CAAC,KAAK,CAAC,QAAQ,KAAK,IAAI,IAAI,CAAC,EAAE,UAAU,KAAK,IAAI,QAAQ,aAAa,CAAC,CAAC;YAChF,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,YAAY,CAAC,CAAC;YAChF,OAAO,CAAC,KAAK,CAAC,QAAQ,KAAK,IAAI,IAAI,CAAC,EAAE,UAAU,KAAK,IAAI,QAAQ,WAAW,CAAC,CAAC;YAC9E,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC
;YAEzE,UAAU,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;YACpC,YAAY,CAAC,IAAI,CAAC;gBAChB,aAAa,EAAE,SAAS,CAAC,IAAI;gBAC7B,WAAW,EAAE,OAAO,CAAC,IAAI;aAC1B,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO;QACL,KAAK;QACL,UAAU;QACV,KAAK,EAAE,uBAAuB,CAAC,YAAY,EAAE,IAAI,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC;QACnE,YAAY;QACZ,UAAU;KACX,CAAC;AACJ,CAAC;AAED,SAAS,QAAQ,CAAC,KAAa;IAC7B,IAAI,IAAI,GAAG,UAAU,CAAC;IACtB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC3B,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACnC,CAAC;IACD,OAAO,IAAI,KAAK,CAAC,CAAC;AACpB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,aAA+E;IAC7G,MAAM,MAAM,GAAG,gBAAgB,EAAE,CAAC;IAClC,MAAM,MAAM,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC3C,MAAM,UAAU,GAAkB,EAAE,CAAC;IAErC,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,CAAC,EAAE,KAAK,IAAI,CAAC,EAAE,CAAC;QAC3C,OAAO,CAAC,KAAK,CAAC,mCAAmC,KAAK,IAAI,CAAC,CAAC;QAC5D,UAAU,CAAC,IAAI,CACb,MAAM,YAAY,CAAC,MAAM,EAAE,MAAM,CAAC,WAAW,EAAE,QAAQ,EAAE,kBAAkB,EAAE,GAAG,SAAS,WAAW,KAAK,EAAE,CAAC,CAC7G,CAAC;IACJ,CAAC;IAED,OAAO,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;IACjD,MAAM,QAAQ,GAAG,MAAM,YAAY,CAAC,MAAM,EAAE,MAAM,CAAC,WAAW,EAAE,EAAE,EAAE,aAAa,EAAE,GAAG,SAAS,gBAAgB,CAAC,CAAC;IACjH,MAAM,mBAAmB,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC;IACjF,MAAM,kBAAkB,GAAG,cAAc,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;IAE1D,OAAO;QACL,MAAM,EAAE;YACN,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,MAAM,EAAE,QAAQ;YAChB,KAAK,EAAE,QAAQ,CAAC,MAAM;YACtB,WAAW,EAAE,WAAW;SACzB;QACD,aAAa,EAAE;YACb,MAAM,EAAE,mBAAmB;YAC3B,IAAI,EAAE,UAAU;SACjB;QACD,YAAY,EAAE;YACZ,MAAM,EAAE,kBAAkB;YAC1B,GAAG,EAAE,QAAQ;SACd;QACD,MAAM,EAAE,mBAAmB,IAAI,kBAAkB;KAClD,CAAC;AACJ,CAAC"}
|