@sx4im/skillcheck 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/LICENSE +21 -0
  2. package/METHODOLOGY.md +91 -0
  3. package/README.md +159 -0
  4. package/dist/bin/skillcheck.d.ts +2 -0
  5. package/dist/bin/skillcheck.js +8 -0
  6. package/dist/bin/skillcheck.js.map +1 -0
  7. package/dist/src/adapters/nvidia-nim.d.ts +30 -0
  8. package/dist/src/adapters/nvidia-nim.js +165 -0
  9. package/dist/src/adapters/nvidia-nim.js.map +1 -0
  10. package/dist/src/cache.d.ts +5 -0
  11. package/dist/src/cache.js +27 -0
  12. package/dist/src/cache.js.map +1 -0
  13. package/dist/src/cli.d.ts +1 -0
  14. package/dist/src/cli.js +146 -0
  15. package/dist/src/cli.js.map +1 -0
  16. package/dist/src/corpus.d.ts +43 -0
  17. package/dist/src/corpus.js +233 -0
  18. package/dist/src/corpus.js.map +1 -0
  19. package/dist/src/deterministic.d.ts +7 -0
  20. package/dist/src/deterministic.js +25 -0
  21. package/dist/src/deterministic.js.map +1 -0
  22. package/dist/src/env.d.ts +12 -0
  23. package/dist/src/env.js +39 -0
  24. package/dist/src/env.js.map +1 -0
  25. package/dist/src/eval.d.ts +13 -0
  26. package/dist/src/eval.js +155 -0
  27. package/dist/src/eval.js.map +1 -0
  28. package/dist/src/generate.d.ts +9 -0
  29. package/dist/src/generate.js +94 -0
  30. package/dist/src/generate.js.map +1 -0
  31. package/dist/src/grade.d.ts +5 -0
  32. package/dist/src/grade.js +112 -0
  33. package/dist/src/grade.js.map +1 -0
  34. package/dist/src/hash.d.ts +2 -0
  35. package/dist/src/hash.js +8 -0
  36. package/dist/src/hash.js.map +1 -0
  37. package/dist/src/m0/hardcoded.d.ts +7 -0
  38. package/dist/src/m0/hardcoded.js +51 -0
  39. package/dist/src/m0/hardcoded.js.map +1 -0
  40. package/dist/src/m0/run.d.ts +38 -0
  41. package/dist/src/m0/run.js +102 -0
  42. package/dist/src/m0/run.js.map +1 -0
  43. package/dist/src/normalize.d.ts +2 -0
  44. package/dist/src/normalize.js +109 -0
  45. package/dist/src/normalize.js.map +1 -0
  46. package/dist/src/rot.d.ts +62 -0
  47. package/dist/src/rot.js +156 -0
  48. package/dist/src/rot.js.map +1 -0
  49. package/dist/src/run.d.ts +5 -0
  50. package/dist/src/run.js +47 -0
  51. package/dist/src/run.js.map +1 -0
  52. package/dist/src/score.d.ts +14 -0
  53. package/dist/src/score.js +59 -0
  54. package/dist/src/score.js.map +1 -0
  55. package/dist/src/types.d.ts +41 -0
  56. package/dist/src/types.js +2 -0
  57. package/dist/src/types.js.map +1 -0
  58. package/dist/src/verify.d.ts +5 -0
  59. package/dist/src/verify.js +71 -0
  60. package/dist/src/verify.js.map +1 -0
  61. package/package.json +64 -0
@@ -0,0 +1,109 @@
1
+ import { readdir, readFile, stat } from 'node:fs/promises';
2
+ import path from 'node:path';
3
+ import { sha256 } from './hash.js';
4
// Skill file names that can be evaluated, listed in lookup priority order.
// Each entry's format label is identical to its file name.
const SUPPORTED_FILES = ['SKILL.md', 'AGENTS.md', 'CLAUDE.md', '.cursorrules'].map((file) => ({
    file,
    format: file
}));
10
/**
 * Resolve a user-supplied path to a concrete skill file and its format.
 *
 * A directory is searched for the first entry of SUPPORTED_FILES that exists
 * as a regular file; any other path must itself be named like a supported file.
 *
 * @param {string} inputPath - Path to a skill file or to a directory containing one.
 * @returns {Promise<{filePath: string, format: string}>} The resolved file and its format label.
 * @throws {Error} When the directory holds no supported file or the file name is unsupported.
 *   Errors from `stat` (including a missing `inputPath`) propagate unchanged.
 */
async function resolveSkillFile(inputPath) {
    const inputStats = await stat(inputPath);
    if (!inputStats.isDirectory()) {
        const fileName = path.basename(inputPath);
        const entry = SUPPORTED_FILES.find(({ file }) => file === fileName);
        if (!entry) {
            throw new Error(`Unsupported skill file: ${inputPath}`);
        }
        return { filePath: inputPath, format: entry.format };
    }
    for (const { file, format } of SUPPORTED_FILES) {
        const filePath = path.join(inputPath, file);
        let candidateStats;
        try {
            candidateStats = await stat(filePath);
        }
        catch (error) {
            // A missing candidate just means "try the next name"; anything else is a real failure.
            if (error.code === 'ENOENT') {
                continue;
            }
            throw error;
        }
        if (candidateStats.isFile()) {
            return { filePath, format };
        }
    }
    throw new Error(`No supported skill file found in ${inputPath}`);
}
36
/**
 * Parse simple `key: value` pairs from a leading `---` fenced front matter block.
 *
 * Only flat single-line scalars are understood; lines that do not look like
 * `key: value` are ignored. Keys are lower-cased. A value wrapped in one pair of
 * matching single or double quotes has that pair removed; unmatched quotes are
 * kept because they are part of the text.
 *
 * Handles LF and CRLF line endings and an optional leading byte-order mark.
 * The closing fence must be a line consisting of `---` (trailing whitespace
 * allowed), so an empty block (`---` directly followed by `---`) yields `{}`
 * instead of swallowing the document body up to the next horizontal rule.
 *
 * @param {string} text - Full file contents.
 * @returns {Record<string, string>} Parsed fields, or `{}` when there is no
 *   complete front matter block.
 */
function extractFrontMatter(text) {
    const lines = text.replace(/^\uFEFF/, '').split(/\r?\n/);
    if (lines[0] !== '---') {
        return {};
    }
    const end = lines.findIndex((line, index) => index > 0 && line.trimEnd() === '---');
    if (end === -1) {
        return {};
    }
    const fields = {};
    for (const line of lines.slice(1, end)) {
        const match = /^([A-Za-z0-9_-]+):\s*(.+)$/.exec(line.trim());
        if (!match) {
            continue;
        }
        const raw = match[2].trim();
        // Strip quotes only when the same quote character opens and closes the value.
        const quoted = /^(["'])(.*)\1$/.exec(raw);
        fields[match[1].toLowerCase()] = quoted ? quoted[2] : raw;
    }
    return fields;
}
53
/**
 * Find the text of the first level-one Markdown heading (`# Title`).
 *
 * @param {string} text - Markdown source.
 * @returns {string | undefined} Trimmed heading text, or `undefined` when no line matches.
 */
function firstHeading(text) {
    for (const rawLine of text.split('\n')) {
        const heading = /^#\s+(.+)$/.exec(rawLine.trim())?.[1]?.trim();
        if (heading) {
            return heading;
        }
    }
    return undefined;
}
59
/**
 * Fallback domain: the first level-one heading, or a generic label when there is none.
 *
 * @param {string} text - Skill file contents.
 * @returns {string} Heading text or `'general agent skill'`.
 */
function inferDomain(text) {
    const heading = firstHeading(text);
    return heading ? heading : 'general agent skill';
}
62
/**
 * Determine the domain a skill applies to.
 *
 * Lookup order: front matter (`domain`, `description`, `when_to_use`,
 * `when-to-use`), then a `description:` line anywhere in the text, then a
 * `when_to_use:` / `when to use:` / `when-to-use:` line, then the inferred domain.
 *
 * @param {string} text - Skill file contents.
 * @returns {string} The first non-empty domain description found.
 */
function extractDomain(text) {
    const meta = extractFrontMatter(text);
    const declared = [meta.domain, meta.description, meta.when_to_use, meta['when-to-use']].find(Boolean);
    if (declared) {
        return declared;
    }
    const descriptionMatch = /^description:\s*(.+)$/im.exec(text);
    const descriptionLine = descriptionMatch?.[1]?.trim();
    if (descriptionLine) {
        return descriptionLine;
    }
    const whenMatch = /^(when_to_use|when to use|when-to-use):\s*(.+)$/im.exec(text);
    const whenLine = whenMatch?.[2]?.trim();
    if (whenLine) {
        return whenLine;
    }
    return inferDomain(text);
}
81
/**
 * List the sibling entries (files and directories) that sit next to a skill file.
 *
 * The skill file itself and any entry whose name starts with `.git` are left
 * out. Names are returned in default sort order. Listing is best-effort: if the
 * directory cannot be read, an empty list is returned.
 *
 * @param {string} skillFilePath - Path to the skill file.
 * @returns {Promise<string[]>} Sorted entry names.
 */
async function listAssets(skillFilePath) {
    const folder = path.dirname(skillFilePath);
    const ownName = path.basename(skillFilePath);
    let dirents;
    try {
        dirents = await readdir(folder, { withFileTypes: true });
    }
    catch {
        return [];
    }
    const names = [];
    for (const { name } of dirents) {
        if (name === ownName || name.startsWith('.git')) {
            continue;
        }
        names.push(name);
    }
    names.sort();
    return names;
}
95
/**
 * Load a skill from a file or directory and return its normalized description.
 *
 * The name is the first level-one heading, falling back to the parent
 * directory name and then the file name. `versionHash` is the SHA-256 of the
 * raw instructions text.
 *
 * @param {string} inputPath - Path to a supported skill file or a directory containing one.
 * @returns {Promise<object>} `{ name, sourcePath, format, instructions, domain, assets, versionHash }`.
 */
export async function normalizeSkill(inputPath) {
    const resolved = await resolveSkillFile(inputPath);
    const instructions = await readFile(resolved.filePath, 'utf8');
    const parentDirName = path.basename(path.dirname(resolved.filePath));
    const name = firstHeading(instructions) || parentDirName || path.basename(resolved.filePath);
    const domain = extractDomain(instructions);
    const assets = await listAssets(resolved.filePath);
    return {
        name,
        sourcePath: resolved.filePath,
        format: resolved.format,
        instructions,
        domain,
        assets,
        versionHash: sha256(instructions)
    };
}
109
+ //# sourceMappingURL=normalize.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalize.js","sourceRoot":"","sources":["../../packages/cli/src/normalize.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,kBAAkB,CAAC;AAC3D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,EAAE,MAAM,WAAW,CAAC;AAGnC,MAAM,eAAe,GAAiD;IACpE,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,UAAU,EAAE;IACxC,EAAE,IAAI,EAAE,WAAW,EAAE,MAAM,EAAE,WAAW,EAAE;IAC1C,EAAE,IAAI,EAAE,WAAW,EAAE,MAAM,EAAE,WAAW,EAAE;IAC1C,EAAE,IAAI,EAAE,cAAc,EAAE,MAAM,EAAE,cAAc,EAAE;CACjD,CAAC;AAEF,KAAK,UAAU,gBAAgB,CAAC,SAAiB;IAC/C,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,CAAC;IACpC,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;QACxB,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;YACxC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC;YACtD,IAAI,CAAC;gBACH,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,CAAC;gBAC5C,IAAI,cAAc,CAAC,MAAM,EAAE,EAAE,CAAC;oBAC5B,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,MAAM,EAAE,CAAC;gBAChD,CAAC;YACH,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,IAAK,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;oBACvD,MAAM,KAAK,CAAC;gBACd,CAAC;YACH,CAAC;QACH,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,oCAAoC,SAAS,EAAE,CAAC,CAAC;IACnE,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;IAC1C,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;IACnF,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,IAAI,KAAK,CAAC,2BAA2B,SAAS,EAAE,CAAC,CAAC;IAC1D,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,CAAC,MAAM,EAAE,CAAC;AAC3D,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY;IACtC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC9B,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;IACrC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;QACf,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QAClD,MAAM,KAAK,GAAG,4BAA4B,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;QAC7D,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,WAAW,EAAE,CAAC,GAAG,KAAK,CAA
C,CAAC,CAAE,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;QACjF,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,YAAY,CAAC,IAAY;IAChC,OAAO,IAAI;SACR,KAAK,CAAC,IAAI,CAAC;SACX,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;SAC1D,IAAI,CAAC,OAAO,CAAC,CAAC;AACnB,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,OAAO,YAAY,CAAC,IAAI,CAAC,IAAI,qBAAqB,CAAC;AACrD,CAAC;AAED,SAAS,aAAa,CAAC,IAAY;IACjC,MAAM,WAAW,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GACZ,WAAW,CAAC,MAAM;QAClB,WAAW,CAAC,WAAW;QACvB,WAAW,CAAC,WAAW;QACvB,WAAW,CAAC,aAAa,CAAC,CAAC;IAC7B,IAAI,QAAQ,EAAE,CAAC;QACb,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,MAAM,eAAe,GAAG,yBAAyB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;IAC1E,IAAI,eAAe,EAAE,CAAC;QACpB,OAAO,eAAe,CAAC;IACzB,CAAC;IAED,MAAM,QAAQ,GAAG,mDAAmD,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;IAC7F,IAAI,QAAQ,EAAE,CAAC;QACb,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,OAAO,WAAW,CAAC,IAAI,CAAC,CAAC;AAC3B,CAAC;AAED,KAAK,UAAU,UAAU,CAAC,aAAqB;IAC7C,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC;IACxC,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC;IAC/C,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;QAC5D,OAAO,OAAO;aACX,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;aAC7E,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC;aAC1B,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,SAAiB;IACpD,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,MAAM,gBAAgB,CAAC,SAAS,CAAC,CAAC;IAC/D,MAAM,YAAY,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IACtD,MAAM,IAAI,GAAG,YAAY,CAAC,YAAY,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAE5G,OAAO;QACL,IAAI;QACJ,UAAU,EAAE,QAAQ;QACpB,MAAM;QACN,YAAY;QACZ,MAAM,EAAE,aAAa,CAAC,YAAY,CAAC;QACnC,MAAM,EAAE,MAAM,UAAU,CAAC,QAAQ,CAAC;QACl
C,WAAW,EAAE,MAAM,CAAC,YAAY,CAAC;KAClC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,62 @@
1
// Verdict stored with a result; score.js yields 'helps' when the CI lower bound is > 0,
// 'harms' when the CI upper bound is < 0, and 'placebo' otherwise.
+ type Verdict = 'helps' | 'placebo' | 'harms';
2
// Per-skill status from summarizeSkill: 'new' = a single history entry; 'rot' = an earlier
// entry was 'helps' and the latest is not; 'stable' = everything else.
+ type RotStatus = 'new' | 'stable' | 'rot';
3
// Shape a JSON file in the results directory must have to be included in a rot report.
+ interface StoredResult {
4
+ skill: {
5
+ name: string;
6
+ source: string;
7
+ format: string;
8
+ commit_hash: string;
9
+ domain: string;
10
+ };
11
+ config: {
12
+ runner_model: string;
13
+ runner_version?: string;
14
+ };
15
+ result: {
16
+ effect_pp: number;
17
+ ci_pp: [number, number];
18
+ verdict: Verdict;
19
+ };
20
+ run_date: string;
21
+ }
22
// One stored result flattened for the report. file_path is relative to the results
// directory; runner_version falls back to runner_model when the result has none.
+ export interface RotHistoryEntry {
23
+ result_id: string;
24
+ file_path: string;
25
+ runner_model: string;
26
+ runner_version: string;
27
+ run_date: string;
28
+ effect_pp: number;
29
+ ci_pp: [number, number];
30
+ verdict: Verdict;
31
+ }
32
// Report for one skill. key is `<slugified name>:<commit_hash>`; history is sorted by
// run_date (then file_path); changed_from is only present when status is 'rot' and is
// the most recent earlier entry whose verdict was 'helps'.
+ export interface RotSkillReport {
33
+ key: string;
34
+ skill: StoredResult['skill'];
35
+ status: RotStatus;
36
+ latest: RotHistoryEntry;
37
+ changed_from?: RotHistoryEntry;
38
+ history: RotHistoryEntry[];
39
+ }
40
// Whole report. results_dir echoes the caller's argument; model is only present when
// one was supplied; summary counts skills per status.
+ export interface RotReport {
41
+ generated_at: string;
42
+ results_dir: string;
43
+ model?: string;
44
+ summary: {
45
+ skills: number;
46
+ new: number;
47
+ stable: number;
48
+ rot: number;
49
+ };
50
+ skills: RotSkillReport[];
51
+ }
52
// Options for runRot. When corpus is set the corpus is re-run (with tasks/trials and
// model as runner) before the report is built; when output is set the report is
// written there as JSON.
+ export interface RotOptions {
53
+ resultsDir: string;
54
+ output?: string;
55
+ model?: string;
56
+ corpus?: string;
57
+ tasks: number;
58
+ trials: number;
59
+ }
60
// Builds a report from every valid result JSON found recursively under resultsDir.
+ export declare function buildRotReport(resultsDir: string, model?: string): Promise<RotReport>;
61
// Optionally re-runs the corpus, builds the report, optionally writes it, and returns it.
+ export declare function runRot(options: RotOptions): Promise<RotReport>;
62
+ export {};
@@ -0,0 +1,156 @@
1
+ import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises';
2
+ import path from 'node:path';
3
+ import { runCorpus } from './corpus.js';
4
/**
 * Turn arbitrary text into a lowercase, hyphen-separated slug.
 *
 * Every run of characters outside `a-z0-9` becomes a single hyphen, and no
 * hyphen is left at either end.
 *
 * @param {string} value - Text to slugify.
 * @returns {string} The slug (possibly empty).
 */
function slugify(value) {
    const words = value
        .toLowerCase()
        .split(/[^a-z0-9]+/)
        .filter(Boolean);
    return words.join('-');
}
10
/**
 * Grouping key for a stored result: slugified skill name plus commit hash.
 *
 * @param {object} result - Stored result with `skill.name` and `skill.commit_hash`.
 * @returns {string} `<slug>:<commit_hash>`.
 */
function resultKey(result) {
    const { name, commit_hash: commitHash } = result.skill;
    return `${slugify(name)}:${commitHash}`;
}
13
/**
 * Check whether a value is one of the three verdict literals.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for `'helps'`, `'placebo'` or `'harms'`.
 */
function isVerdict(value) {
    return value === 'helps' || value === 'placebo' || value === 'harms';
}
/**
 * Validate parsed JSON as a stored result.
 *
 * Returns the same object when every required field is present, and
 * `undefined` otherwise. Non-object input (including `null`, which
 * `JSON.parse` produces for a file containing `null`) is rejected instead of
 * throwing, so one stray file cannot abort a whole report. `ci_pp` must be a
 * two-element array of numbers because it is copied into the report as an
 * interval.
 *
 * @param {unknown} value - Parsed JSON of unknown shape.
 * @returns {object | undefined} The value itself when valid, else `undefined`.
 */
function asStoredResult(value) {
    if (value === null || typeof value !== 'object') {
        return undefined;
    }
    const candidate = value;
    const ciPp = candidate.result?.ci_pp;
    const hasInterval = Array.isArray(ciPp) &&
        ciPp.length === 2 &&
        ciPp.every((bound) => typeof bound === 'number');
    if (!candidate.skill?.name ||
        !candidate.skill.source ||
        !candidate.skill.format ||
        !candidate.skill.commit_hash ||
        !candidate.skill.domain ||
        !candidate.config?.runner_model ||
        !hasInterval ||
        !isVerdict(candidate.result.verdict) ||
        typeof candidate.result.effect_pp !== 'number' ||
        !candidate.run_date) {
        return undefined;
    }
    return candidate;
}
32
/**
 * Recursively collect the paths of all `.json` files below a directory.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Full paths of regular files ending in `.json`;
 *   empty when `dir` does not exist.
 * @throws Any `readdir` error other than `ENOENT`.
 */
async function listJsonFiles(dir) {
    const found = [];
    let dirents;
    try {
        dirents = await readdir(dir, { withFileTypes: true });
    }
    catch (error) {
        if (error.code !== 'ENOENT') {
            throw error;
        }
        return found;
    }
    // Subdirectories are walked concurrently; result order follows the directory listing.
    const pending = dirents.map(async (dirent) => {
        const fullPath = path.join(dir, dirent.name);
        if (dirent.isDirectory()) {
            return listJsonFiles(fullPath);
        }
        const isJsonFile = dirent.isFile() && dirent.name.endsWith('.json');
        return isJsonFile ? [fullPath] : [];
    });
    for (const chunk of await Promise.all(pending)) {
        found.push(...chunk);
    }
    return found;
}
52
/**
 * Read every JSON file under the results directory and keep the valid stored results.
 *
 * Files that cannot be read or parsed, and parsed values that fail validation,
 * are skipped.
 *
 * @param {string} resultsDir - Directory to scan recursively.
 * @returns {Promise<Array<{file: string, result: object}>>} Valid results with their source paths.
 */
async function readStoredResults(resultsDir) {
    const paths = await listJsonFiles(resultsDir);
    const collected = [];
    for (const file of paths) {
        let raw;
        try {
            raw = JSON.parse(await readFile(file, 'utf8'));
        }
        catch {
            // Unreadable or malformed JSON is not a result file; ignore it.
            continue;
        }
        const result = asStoredResult(raw);
        if (!result) {
            continue;
        }
        collected.push({ file, result });
    }
    return collected;
}
70
/**
 * Comparator that orders history entries oldest-first.
 *
 * Entries are ordered by parsed `run_date`, with `file_path` as the tie
 * breaker. An entry whose `run_date` cannot be parsed sorts before every
 * dated entry (and by `file_path` among other undated ones). Without this the
 * subtraction would yield NaN, giving `sort` an inconsistent comparator and
 * making the "latest" entry arbitrary.
 *
 * @param {{run_date: string, file_path: string}} a - First entry.
 * @param {{run_date: string, file_path: string}} b - Second entry.
 * @returns {number} Negative when `a` sorts first, positive when `b` does, 0 when equal.
 */
function compareHistory(a, b) {
    const timeA = Date.parse(a.run_date);
    const timeB = Date.parse(b.run_date);
    const validA = !Number.isNaN(timeA);
    const validB = !Number.isNaN(timeB);
    if (validA !== validB) {
        return validA ? 1 : -1;
    }
    const dateDelta = validA ? timeA - timeB : 0;
    return dateDelta === 0 ? a.file_path.localeCompare(b.file_path) : dateDelta;
}
74
/**
 * Flatten a stored result into a report history entry.
 *
 * `result_id` joins the slugified skill name, relative directory and file
 * base name with hyphens. `runner_version` falls back to `runner_model`.
 *
 * @param {string} resultsDir - Root directory results were read from.
 * @param {string} file - Path of the result file.
 * @param {object} result - Validated stored result.
 * @returns {object} History entry with `file_path` relative to `resultsDir`.
 */
function historyEntry(resultsDir, file, result) {
    const relativePath = path.relative(resultsDir, file);
    const { config, result: outcome, run_date: runDate, skill } = result;
    const idParts = [
        skill.name,
        path.dirname(relativePath),
        path.basename(relativePath, '.json')
    ].map((part) => slugify(part));
    return {
        result_id: idParts.join('-'),
        file_path: relativePath,
        runner_model: config.runner_model,
        runner_version: config.runner_version ?? config.runner_model,
        run_date: runDate,
        effect_pp: outcome.effect_pp,
        ci_pp: outcome.ci_pp,
        verdict: outcome.verdict
    };
}
87
/**
 * Build the per-skill report from its history entries.
 *
 * Status is `'new'` for a single entry, `'rot'` when some earlier entry had
 * verdict `'helps'` and the latest does not, and `'stable'` otherwise.
 * `changed_from` (the most recent earlier `'helps'` entry) is only included
 * for `'rot'`.
 *
 * @param {string} key - Grouping key of the skill.
 * @param {object} skill - Skill metadata copied into the report.
 * @param {object[]} history - Unsorted history entries; not mutated.
 * @returns {object} `{ key, skill, status, latest, changed_from?, history }`.
 * @throws {Error} When `history` is empty.
 */
function summarizeSkill(key, skill, history) {
    const timeline = history.slice().sort(compareHistory);
    const latest = timeline[timeline.length - 1];
    if (!latest) {
        throw new Error(`No history for ${key}`);
    }
    // Walk backwards from the entry just before the latest to find the last 'helps'.
    let lastHelpful;
    for (let index = timeline.length - 2; index >= 0; index -= 1) {
        if (timeline[index].verdict === 'helps') {
            lastHelpful = timeline[index];
            break;
        }
    }
    let status;
    if (timeline.length === 1) {
        status = 'new';
    }
    else if (lastHelpful && latest.verdict !== 'helps') {
        status = 'rot';
    }
    else {
        status = 'stable';
    }
    const report = { key, skill, status, latest };
    if (status === 'rot') {
        report.changed_from = lastHelpful;
    }
    report.history = timeline;
    return report;
}
104
/**
 * Build a rot report from all stored results under a directory.
 *
 * Results are grouped by skill key, summarized per skill, and sorted by skill
 * name. `results_dir` in the report is the argument as given (not resolved);
 * `model` is only included when truthy.
 *
 * @param {string} resultsDir - Directory containing result JSON files.
 * @param {string} [model] - Model label to record in the report.
 * @returns {Promise<object>} The report with `generated_at`, `summary` and `skills`.
 */
export async function buildRotReport(resultsDir, model) {
    const root = path.resolve(resultsDir);
    const groups = new Map();
    for (const { file, result } of await readStoredResults(root)) {
        const key = resultKey(result);
        let group = groups.get(key);
        if (group === undefined) {
            group = { skill: result.skill, history: [] };
            groups.set(key, group);
        }
        group.history.push(historyEntry(root, file, result));
    }
    const skills = [];
    for (const [key, { skill, history }] of groups) {
        skills.push(summarizeSkill(key, skill, history));
    }
    skills.sort((left, right) => left.skill.name.localeCompare(right.skill.name));
    const countWithStatus = (status) => skills.filter((entry) => entry.status === status).length;
    const report = {
        generated_at: new Date().toISOString(),
        results_dir: resultsDir
    };
    if (model) {
        report.model = model;
    }
    report.summary = {
        skills: skills.length,
        new: countWithStatus('new'),
        stable: countWithStatus('stable'),
        rot: countWithStatus('rot')
    };
    report.skills = skills;
    return report;
}
130
/**
 * Re-run the corpus when one is configured; otherwise do nothing.
 *
 * Output goes to `<resultsDir>/rot-runs/<timestamp>-<model slug>`, where the
 * timestamp is the current ISO time with `:` and `.` replaced by `-` and the
 * model slug defaults to `env-runner`.
 *
 * @param {object} options - Rot options (`corpus`, `resultsDir`, `model`, `tasks`, `trials`).
 * @returns {Promise<void>}
 */
async function rerunCorpus(options) {
    const { corpus, model, resultsDir, tasks, trials } = options;
    if (!corpus) {
        return;
    }
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    const runId = `${timestamp}-${slugify(model ?? 'env-runner')}`;
    await runCorpus({
        corpus,
        outputDir: path.join(resultsDir, 'rot-runs', runId),
        tasks,
        trials,
        concurrency: 2,
        runner: model
    });
}
144
/**
 * Write a value as pretty-printed JSON (2-space indent, trailing newline),
 * creating the parent directory first if needed.
 *
 * @param {string} filePath - Destination file.
 * @param {unknown} value - Value to serialize.
 * @returns {Promise<void>}
 */
async function writeJson(filePath, value) {
    const parentDir = path.dirname(filePath);
    await mkdir(parentDir, { recursive: true });
    const serialized = JSON.stringify(value, null, 2);
    await writeFile(filePath, `${serialized}\n`);
}
148
/**
 * Entry point for the rot check: optionally re-run the corpus, build the
 * report, and write it to `options.output` when that is set.
 *
 * @param {object} options - Rot options (`resultsDir`, `output`, `model`, `corpus`, `tasks`, `trials`).
 * @returns {Promise<object>} The generated report.
 */
export async function runRot(options) {
    await rerunCorpus(options);
    const { resultsDir, model, output } = options;
    const report = await buildRotReport(resultsDir, model);
    if (!output) {
        return report;
    }
    await writeJson(output, report);
    return report;
}
156
+ //# sourceMappingURL=rot.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rot.js","sourceRoot":"","sources":["../../packages/cli/src/rot.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACvE,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAmExC,SAAS,OAAO,CAAC,KAAa;IAC5B,OAAO,KAAK;SACT,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;AAC3B,CAAC;AAED,SAAS,SAAS,CAAC,MAAoB;IACrC,OAAO,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC;AACrE,CAAC;AAED,SAAS,SAAS,CAAC,KAAc;IAC/B,OAAO,KAAK,KAAK,OAAO,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,OAAO,CAAC;AACvE,CAAC;AAED,SAAS,cAAc,CAAC,KAAc;IACpC,MAAM,SAAS,GAAG,KAA8B,CAAC;IACjD,IACE,CAAC,SAAS,CAAC,KAAK,EAAE,IAAI;QACtB,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM;QACvB,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM;QACvB,CAAC,SAAS,CAAC,KAAK,CAAC,WAAW;QAC5B,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM;QACvB,CAAC,SAAS,CAAC,MAAM,EAAE,YAAY;QAC/B,CAAC,SAAS,CAAC,MAAM,EAAE,KAAK;QACxB,CAAC,SAAS,CAAC,SAAS,CAAC,MAAM,CAAC,OAAO,CAAC;QACpC,OAAO,SAAS,CAAC,MAAM,CAAC,SAAS,KAAK,QAAQ;QAC9C,CAAC,SAAS,CAAC,QAAQ,EACnB,CAAC;QACD,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,OAAO,SAAyB,CAAC;AACnC,CAAC;AAED,KAAK,UAAU,aAAa,CAAC,GAAW;IACtC,IAAI,OAAO,CAAC;IACZ,IAAI,CAAC;QACH,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;IACxD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAK,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YACvD,OAAO,EAAE,CAAC;QACZ,CAAC;QACD,MAAM,KAAK,CAAC;IACd,CAAC;IAED,MAAM,KAAK,GAAG,MAAM,OAAO,CAAC,GAAG,CAC7B,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC1B,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;QAC5C,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;YACxB,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAC;QACjC,CAAC;QACD,OAAO,KAAK,CAAC,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAC1E,CAAC,CAAC,CACH,CAAC;IACF,OAAO,KAAK,CAAC,IAAI,EAAE,CAAC;AACtB,CAAC;AAED,KAAK,UAAU,iBAAiB,CAAC,UAAkB;IACjD,MAAM,KAAK,GAAG,MAAM,aAAa,CAAC,UAAU,CAAC,CAAC;IAC9C,MAAM,OAAO,GAAkD
,EAAE,CAAC;IAElE,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,MAAe,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC,CAAY,CAAC;QAC/D,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QACD,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC;QACtC,IAAI,MAAM,EAAE,CAAC;YACX,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,cAAc,CAAC,CAAkB,EAAE,CAAkB;IAC5D,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAClE,OAAO,SAAS,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;AAC9E,CAAC;AAED,SAAS,YAAY,CAAC,UAAkB,EAAE,IAAY,EAAE,MAAoB;IAC1E,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;IACjD,OAAO;QACL,SAAS,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,EAAE;QAC1H,SAAS,EAAE,QAAQ;QACnB,YAAY,EAAE,MAAM,CAAC,MAAM,CAAC,YAAY;QACxC,cAAc,EAAE,MAAM,CAAC,MAAM,CAAC,cAAc,IAAI,MAAM,CAAC,MAAM,CAAC,YAAY;QAC1E,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,SAAS,EAAE,MAAM,CAAC,MAAM,CAAC,SAAS;QAClC,KAAK,EAAE,MAAM,CAAC,MAAM,CAAC,KAAK;QAC1B,OAAO,EAAE,MAAM,CAAC,MAAM,CAAC,OAAO;KAC/B,CAAC;AACJ,CAAC;AAED,SAAS,cAAc,CAAC,GAAW,EAAE,KAA4B,EAAE,OAA0B;IAC3F,MAAM,MAAM,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IACjD,MAAM,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7B,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,kBAAkB,GAAG,EAAE,CAAC,CAAC;IAC3C,CAAC;IACD,MAAM,WAAW,GAAG,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,EAAE,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,OAAO,KAAK,OAAO,CAAC,CAAC;IAClG,MAAM,MAAM,GAAc,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,WAAW,IAAI,MAAM,CAAC,OAAO,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC;IAErH,OAAO;QACL,GAAG;QACH,KAAK;QACL,MAAM;QACN,MAAM;QACN,GAAG,CAAC,MAAM,KAAK,KAAK,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,WAAW,
EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC1D,OAAO,EAAE,MAAM;KAChB,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,UAAkB,EAAE,KAAc;IACrE,MAAM,kBAAkB,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;IACpD,MAAM,aAAa,GAAG,MAAM,iBAAiB,CAAC,kBAAkB,CAAC,CAAC;IAClE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAwE,CAAC;IAEhG,KAAK,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,aAAa,EAAE,CAAC;QAC7C,MAAM,GAAG,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC;QAC9B,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC;QACzE,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,kBAAkB,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC,CAAC;QACrE,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IAC5B,CAAC;IAED,MAAM,MAAM,GAAG,CAAC,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;SAClC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,GAAG,EAAE,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;SACtE,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC;IAE5D,OAAO;QACL,YAAY,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACtC,WAAW,EAAE,UAAU;QACvB,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3B,OAAO,EAAE;YACP,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,GAAG,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,MAAM,KAAK,KAAK,CAAC,CAAC,MAAM;YAC5D,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,MAAM;YAClE,GAAG,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,MAAM,KAAK,KAAK,CAAC,CAAC,MAAM;SAC7D;QACD,MAAM;KACP,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,OAAmB;IAC5C,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;QACpB,OAAO;IACT,CAAC;IAED,MAAM,KAAK,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,IAAI,OAAO,CAAC,OAAO,CAAC,KAAK,IAAI,YAAY,CAAC,EAAE,CAAC;IAC5G,MAAM,SAAS,CAAC;QACd,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,UAAU,EAAE,KAAK,CAAC;QAC3D,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,WAAW,EAAE,CAAC;QACd,MAAM,EAAE,OAAO,CAAC,KAAK;KACtB,CAAC,CAAC;AACL,
CAAC;AAED,KAAK,UAAU,SAAS,CAAC,QAAgB,EAAE,KAAc;IACvD,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzD,MAAM,SAAS,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;AACnE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,MAAM,CAAC,OAAmB;IAC9C,MAAM,WAAW,CAAC,OAAO,CAAC,CAAC;IAC3B,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC;IACvE,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;QACnB,MAAM,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,5 @@
1
+ import type { NvidiaNimClient } from './adapters/nvidia-nim.js';
2
+ import type { JsonCache } from './cache.js';
3
+ import type { NvidiaConfig } from './env.js';
4
+ import type { GeneratedTask, NormalizedSkill, TrialOutput } from './types.js';
5
// Runs every task `trials` times, each trial once with the skill and once without,
// and resolves to all trial outputs in execution order.
+ export declare function runTrials(skill: NormalizedSkill, tasks: GeneratedTask[], trials: number, config: NvidiaConfig, client: NvidiaNimClient, cache: JsonCache): Promise<TrialOutput[]>;
@@ -0,0 +1,47 @@
1
+ import { hashJson } from './hash.js';
2
/**
 * Build the chat messages for one evaluation arm.
 *
 * The no-skill arm sends only the task prompt; the with-skill arm prepends a
 * system message carrying the skill instructions.
 *
 * @param {object} skill - Normalized skill; only `instructions` is read, and only when `withSkill` is true.
 * @param {object} task - Task whose `prompt` becomes the user message.
 * @param {boolean} withSkill - Whether to include the skill instructions.
 * @returns {Array<{role: string, content: string}>} Messages in send order.
 */
function messagesForArm(skill, task, withSkill) {
    const userMessage = { role: 'user', content: task.prompt };
    if (withSkill) {
        const systemMessage = {
            role: 'system',
            content: `You are completing an evaluation task. Apply the following skill instructions when relevant.\n\n${skill.instructions}`
        };
        return [systemMessage, userMessage];
    }
    return [userMessage];
}
14
/**
 * Run a single trial of one task in one arm and return its recorded output.
 *
 * The completion is fetched through the cache. The cache key includes the
 * trial number: the messages for a given task and arm are identical across
 * trials, so without it every trial after the first would resolve to the same
 * cached completion and repeated trials would not be independent samples.
 * NOTE(review): this assumes `cache.getOrSet` keys entries on the payload
 * passed as its second argument — confirm against the cache implementation.
 * Existing cache entries written without `trial` will no longer be hit.
 *
 * @param {object} skill - Normalized skill.
 * @param {object} task - Task with `id` and `prompt`.
 * @param {number} trial - 1-based trial number.
 * @param {'with_skill' | 'no_skill'} arm - Which arm to run.
 * @param {object} config - Runner configuration; `runnerModel` is read.
 * @param {object} client - Client exposing `complete(request)`.
 * @param {object} cache - Cache exposing `getOrSet(namespace, key, producer)`.
 * @returns {Promise<object>} Trial output including token usage (0 when absent) and a transcript hash.
 */
async function runOne(skill, task, trial, arm, config, client, cache) {
    const messages = messagesForArm(skill, task, arm === 'with_skill');
    const temperature = 0.7;
    const maxTokens = 1200;
    const cacheKey = { model: config.runnerModel, temperature, maxTokens, messages, trial };
    const response = await cache.getOrSet('runner', cacheKey, () => client.complete({
        model: config.runnerModel,
        messages,
        temperature,
        maxTokens
    }));
    const transcriptHash = `sha256:${hashJson({ taskId: task.id, trial, arm, messages, output: response.content })}`;
    return {
        taskId: task.id,
        trial,
        arm,
        output: response.content,
        model: response.model,
        promptTokens: response.usage?.promptTokens ?? 0,
        completionTokens: response.usage?.completionTokens ?? 0,
        totalTokens: response.usage?.totalTokens ?? 0,
        transcriptHash
    };
}
35
/**
 * Run all trials for all tasks, sequentially.
 *
 * For each task and each trial (1..trials) the with-skill arm runs first, then
 * the no-skill arm. Progress is logged to stderr before each run.
 *
 * @param {object} skill - Normalized skill.
 * @param {object[]} tasks - Tasks to run.
 * @param {number} trials - Number of trials per task.
 * @param {object} config - Runner configuration.
 * @param {object} client - Completion client.
 * @param {object} cache - Response cache.
 * @returns {Promise<object[]>} Trial outputs in execution order.
 */
export async function runTrials(skill, tasks, trials, config, client, cache) {
    const arms = ['with_skill', 'no_skill'];
    const outputs = [];
    for (const task of tasks) {
        for (let trial = 1; trial <= trials; trial += 1) {
            for (const arm of arms) {
                console.error(`[eval] run ${task.id} trial ${trial}/${trials} ${arm}`);
                const output = await runOne(skill, task, trial, arm, config, client, cache);
                outputs.push(output);
            }
        }
    }
    return outputs;
}
47
+ //# sourceMappingURL=run.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"run.js","sourceRoot":"","sources":["../../packages/cli/src/run.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAGrC,SAAS,cAAc,CAAC,KAAsB,EAAE,IAAmB,EAAE,SAAkB;IACrF,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;IAClD,CAAC;IAED,OAAO;QACL;YACE,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,mGAAmG,KAAK,CAAC,YAAY,EAAE;SACjI;QACD,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,MAAM,EAAE;KACvC,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,MAAM,CACnB,KAAsB,EACtB,IAAmB,EACnB,KAAa,EACb,GAAuB,EACvB,MAAoB,EACpB,MAAuB,EACvB,KAAgB;IAEhB,MAAM,QAAQ,GAAG,cAAc,CAAC,KAAK,EAAE,IAAI,EAAE,GAAG,KAAK,YAAY,CAAC,CAAC;IACnE,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,QAAQ,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,WAAW,EAAE,WAAW,EAAE,GAAG,EAAE,SAAS,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,GAAG,EAAE,CAC/H,MAAM,CAAC,QAAQ,CAAC;QACd,KAAK,EAAE,MAAM,CAAC,WAAW;QACzB,QAAQ;QACR,WAAW,EAAE,GAAG;QAChB,SAAS,EAAE,IAAI;KAChB,CAAC,CACH,CAAC;IACF,MAAM,cAAc,GAAG,UAAU,QAAQ,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,EAAE,KAAK,EAAE,GAAG,EAAE,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,OAAO,EAAE,CAAC,EAAE,CAAC;IAEjH,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,KAAK;QACL,GAAG;QACH,MAAM,EAAE,QAAQ,CAAC,OAAO;QACxB,KAAK,EAAE,QAAQ,CAAC,KAAK;QACrB,YAAY,EAAE,QAAQ,CAAC,KAAK,EAAE,YAAY,IAAI,CAAC;QAC/C,gBAAgB,EAAE,QAAQ,CAAC,KAAK,EAAE,gBAAgB,IAAI,CAAC;QACvD,WAAW,EAAE,QAAQ,CAAC,KAAK,EAAE,WAAW,IAAI,CAAC;QAC7C,cAAc;KACf,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,KAAsB,EACtB,KAAsB,EACtB,MAAc,EACd,MAAoB,EACpB,MAAuB,EACvB,KAAgB;IAEhB,MAAM,OAAO,GAAkB,EAAE,CAAC;IAClC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,IAAI,MAAM,EAAE,KAAK,IAAI,CAAC,EAAE,CAAC;YAChD,OAAO,CAAC,KAAK,CAAC,cAAc,IAAI,CAAC,EAAE,UAAU,KAAK,IAAI,MAAM,aAAa,CAAC,CAAC;YAC3E,OAAO,CAAC,IAAI,CAAC,MAAM,MAAM,CAAC,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,YAAY,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;YACpF,OAAO,CAAC,KAAK,CAAC,cAAc,IAAI,CAAC,EAAE,UAAU,KAAK,IAAI,MAAM,WAAW,CAAC,CAAC;YACzE,OAAO,CAAC,IAAI,CAAC,MAAM,MAAM,CAAC,KAAK,EAAE,IAAI,
EAAE,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;QACpF,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -0,0 +1,14 @@
1
// One paired outcome: whether the with-skill run and the no-skill run each passed.
+ export interface PairedObservation {
2
+ withSkillPass: boolean;
3
+ noSkillPass: boolean;
4
+ }
5
// Scoring result. effectPp and ciPp are in percentage points rounded to 2 decimals;
// ciPp is the 2.5%/97.5% bootstrap interval; withSkillPass/noSkillPass are pass
// rates (fractions) rounded to 4 decimals.
+ export interface ScoreSummary {
6
+ effectPp: number;
7
+ ciPp: [number, number];
8
+ verdict: 'helps' | 'placebo' | 'harms';
9
+ withSkillPass: number;
10
+ noSkillPass: number;
11
+ }
12
// Scores paired observations with a seeded bootstrap (defaults: 1000 iterations, seed 1337);
// throws when observations is empty.
+ export declare function scorePairedObservations(observations: PairedObservation[], iterations?: number, seed?: number): ScoreSummary;
13
// True when effectPp lies within ciPp (bounds inclusive).
+ export declare function effectInsideCi(summary: ScoreSummary): boolean;
14
// True when ciPp contains zero (bounds inclusive).
+ export declare function ciOverlapsZero(summary: ScoreSummary): boolean;
@@ -0,0 +1,59 @@
1
/**
 * Create a deterministic pseudo-random generator from a seed.
 *
 * Each call advances a 32-bit state as `state * 1664525 + 1013904223 (mod 2^32)`
 * and returns it scaled into `[0, 1)`.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {() => number} Generator function.
 */
function createSeededRandom(seed) {
    const MODULUS = 0x100000000;
    let state = seed >>> 0;
    return function next() {
        // Math.imul keeps the low 32 bits of the product, which is all the modulus needs.
        state = (Math.imul(state, 1664525) + 1013904223) >>> 0;
        return state / MODULUS;
    };
}
8
/**
 * Quantile of an ascending-sorted sample using linear interpolation between
 * the two nearest ranks.
 *
 * @param {number[]} sortedValues - Sample sorted in ascending order.
 * @param {number} q - Quantile in `[0, 1]`.
 * @returns {number} The interpolated quantile.
 * @throws {Error} When the sample is empty.
 */
function quantile(sortedValues, q) {
    const size = sortedValues.length;
    if (size === 0) {
        throw new Error('Cannot compute quantile of an empty sample');
    }
    const position = (size - 1) * q;
    const below = Math.floor(position);
    const above = Math.ceil(position);
    if (below !== above) {
        const fraction = position - below;
        return sortedValues[below] * (1 - fraction) + sortedValues[above] * fraction;
    }
    // The position falls exactly on a rank, so no interpolation is needed.
    return sortedValues[below];
}
21
/**
 * Convert a fraction to percentage points rounded to two decimals.
 *
 * @param {number} value - Fraction (e.g. `0.25`).
 * @returns {number} Percentage points (e.g. `25`).
 */
function toPp(value) {
    const percentagePoints = value * 100;
    return Number.parseFloat(percentagePoints.toFixed(2));
}
24
/**
 * Score paired pass/fail observations with a seeded bootstrap.
 *
 * The effect is the with-skill pass rate minus the no-skill pass rate. Each
 * bootstrap iteration resamples observations with replacement and records the
 * mean paired difference; the 2.5% and 97.5% quantiles form the interval. The
 * verdict is `'helps'` when the lower bound is above 0, `'harms'` when the
 * upper bound is below 0, and `'placebo'` otherwise.
 *
 * @param {Array<{withSkillPass: boolean, noSkillPass: boolean}>} observations - Paired outcomes.
 * @param {number} [iterations=1000] - Number of bootstrap resamples.
 * @param {number} [seed=1337] - Seed for the random generator.
 * @returns {object} `{ effectPp, ciPp, verdict, withSkillPass, noSkillPass }`.
 * @throws {Error} When `observations` is empty.
 */
export function scorePairedObservations(observations, iterations = 1000, seed = 1337) {
    const count = observations.length;
    if (count === 0) {
        throw new Error('Cannot score zero observations');
    }
    let withSkillPasses = 0;
    let noSkillPasses = 0;
    for (const observation of observations) {
        if (observation.withSkillPass) {
            withSkillPasses += 1;
        }
        if (observation.noSkillPass) {
            noSkillPasses += 1;
        }
    }
    const withSkillPass = withSkillPasses / count;
    const noSkillPass = noSkillPasses / count;
    const effect = withSkillPass - noSkillPass;
    const random = createSeededRandom(seed);
    const bootstrapEffects = [];
    while (bootstrapEffects.length < iterations) {
        let differenceSum = 0;
        for (let draw = 0; draw < count; draw += 1) {
            const picked = observations[Math.floor(random() * count)];
            differenceSum += Number(picked.withSkillPass) - Number(picked.noSkillPass);
        }
        bootstrapEffects.push(differenceSum / count);
    }
    bootstrapEffects.sort((left, right) => left - right);
    const lower = toPp(quantile(bootstrapEffects, 0.025));
    const upper = toPp(quantile(bootstrapEffects, 0.975));
    let verdict;
    if (lower > 0) {
        verdict = 'helps';
    }
    else if (upper < 0) {
        verdict = 'harms';
    }
    else {
        verdict = 'placebo';
    }
    return {
        effectPp: toPp(effect),
        ciPp: [lower, upper],
        verdict,
        withSkillPass: Number(withSkillPass.toFixed(4)),
        noSkillPass: Number(noSkillPass.toFixed(4))
    };
}
53
/**
 * Check that the point estimate lies within its confidence interval (inclusive).
 *
 * @param {{effectPp: number, ciPp: [number, number]}} summary - Score summary.
 * @returns {boolean} True when `ciPp[0] <= effectPp <= ciPp[1]`.
 */
export function effectInsideCi(summary) {
    const { effectPp, ciPp: [lower, upper] } = summary;
    return effectPp >= lower && effectPp <= upper;
}
56
/**
 * Check whether the confidence interval contains zero (inclusive).
 *
 * @param {{ciPp: [number, number]}} summary - Score summary.
 * @returns {boolean} True when `ciPp[0] <= 0 <= ciPp[1]`.
 */
export function ciOverlapsZero(summary) {
    const [lower, upper] = summary.ciPp;
    return lower <= 0 && upper >= 0;
}
59
+ //# sourceMappingURL=score.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"score.js","sourceRoot":"","sources":["../../packages/cli/src/score.ts"],"names":[],"mappings":"AAaA,SAAS,kBAAkB,CAAC,IAAY;IACtC,IAAI,KAAK,GAAG,IAAI,KAAK,CAAC,CAAC;IACvB,OAAO,GAAG,EAAE;QACV,KAAK,GAAG,CAAC,KAAK,GAAG,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;QAC7C,OAAO,KAAK,GAAG,WAAW,CAAC;IAC7B,CAAC,CAAC;AACJ,CAAC;AAED,SAAS,QAAQ,CAAC,YAAsB,EAAE,CAAS;IACjD,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CAAC,4CAA4C,CAAC,CAAC;IAChE,CAAC;IAED,MAAM,KAAK,GAAG,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;IAC5C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC/B,IAAI,KAAK,KAAK,KAAK,EAAE,CAAC;QACpB,OAAO,YAAY,CAAC,KAAK,CAAE,CAAC;IAC9B,CAAC;IAED,MAAM,MAAM,GAAG,KAAK,GAAG,KAAK,CAAC;IAC7B,OAAO,YAAY,CAAC,KAAK,CAAE,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,YAAY,CAAC,KAAK,CAAE,GAAG,MAAM,CAAC;AAC7E,CAAC;AAED,SAAS,IAAI,CAAC,KAAa;IACzB,OAAO,MAAM,CAAC,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;AAC1C,CAAC;AAED,MAAM,UAAU,uBAAuB,CACrC,YAAiC,EACjC,UAAU,GAAG,IAAI,EACjB,IAAI,GAAG,IAAI;IAEX,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,aAAa,GACjB,YAAY,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,EAAE,CAAC,WAAW,CAAC,aAAa,CAAC,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC;IAC/F,MAAM,WAAW,GACf,YAAY,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,EAAE,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC;IAC7F,MAAM,MAAM,GAAG,aAAa,GAAG,WAAW,CAAC;IAE3C,MAAM,MAAM,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAC;IACxC,MAAM,gBAAgB,GAAa,EAAE,CAAC;IAEtC,KAAK,IAAI,SAAS,GAAG,CAAC,EAAE,SAAS,GAAG,UAAU,EAAE,SAAS,IAAI,CAAC,EAAE,CAAC;QAC/D,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,YAAY,CAAC,MAAM,EAAE,KAAK,IAAI,CAAC,EAAE,CAAC;YAC5D,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,YAAY,CAAC,MAAM,CAAC,CAAE,CAAC;YACzE,KAAK,IAAI,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;QACrE,CAAC;QACD,gBAAgB,CAAC,IAAI,CAAC,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC;IACrD
,CAAC;IAED,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvC,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,gBAAgB,EAAE,KAAK,CAAC,CAAC,CAAC;IACtD,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,gBAAgB,EAAE,KAAK,CAAC,CAAC,CAAC;IACtD,MAAM,OAAO,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC;IAEtE,OAAO;QACL,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC;QACtB,IAAI,EAAE,CAAC,KAAK,EAAE,KAAK,CAAC;QACpB,OAAO;QACP,aAAa,EAAE,MAAM,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAC/C,WAAW,EAAE,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;KAC5C,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,OAAqB;IAClD,OAAO,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AACpF,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,OAAqB;IAClD,OAAO,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;AACtD,CAAC"}
@@ -0,0 +1,41 @@
1
// The set of skill formats recognised by the package; the literals look like
// instruction-file names. NOTE(review): presumably matched against the skill's
// file name during normalization — confirm in normalize.js.
+ export type SkillFormat = 'SKILL.md' | 'AGENTS.md' | '.cursorrules' | 'CLAUDE.md';
2
// How a task's criterion is classified. The task-suite parser in verify.js
// accepts only these two values and falls back to 'rubric' when none is given.
+ export type CriterionType = 'rubric' | 'deterministic';
3
// Record describing a skill. `format` is restricted to the SkillFormat
// literals; all other fields are plain strings (or a string list for `assets`).
// NOTE(review): presumably the value produced by normalizeSkill() in
// normalize.js, with `versionHash` derived from the skill content — neither is
// visible from this declaration file; confirm.
+ export interface NormalizedSkill {
4
+ name: string;
5
+ sourcePath: string;
6
+ format: SkillFormat;
7
+ instructions: string;
8
+ domain: string;
9
+ assets: string[];
10
+ versionHash: string;
11
+ }
12
// One evaluation task. The task-suite parser in verify.js builds records with
// exactly these four fields: `id` and `prompt` as strings, `criterionType` as
// one of the CriterionType literals, and `criterion` as a string.
// NOTE(review): how `criterion` is interpreted for each criterion type is
// decided elsewhere (grading code) — not visible here.
+ export interface GeneratedTask {
13
+ id: string;
14
+ prompt: string;
15
+ criterionType: CriterionType;
16
+ criterion: string;
17
+ }
18
// Record of one trial of one task in one arm. `arm` tells the two arms apart
// ('with_skill' vs 'no_skill'); verify.js pairs records that share the same
// `taskId` and `trial` across the two arms.
// NOTE(review): the three token counters presumably mirror the usage figures
// reported by the model API, and `transcriptHash` presumably identifies the
// exact exchange — confirm against the code that produces these records.
+ export interface TrialOutput {
19
+ taskId: string;
20
+ trial: number;
21
+ arm: 'with_skill' | 'no_skill';
22
+ output: string;
23
+ model: string;
24
+ promptTokens: number;
25
+ completionTokens: number;
26
+ totalTokens: number;
27
+ transcriptHash: string;
28
+ }
29
// A TrialOutput extended with its grading result. `pass` is the boolean that
// verify.js reads when it builds paired with-skill / no-skill observations.
// NOTE(review): the scale of `score` and how it relates to `pass` are decided
// by the grading code and are not visible here — confirm.
+ export interface GradedOutput extends TrialOutput {
30
+ score: number;
31
+ reason: string;
32
+ pass: boolean;
33
+ }
34
// Per-task summary carrying the task's id, prompt and criterion together with
// one pass rate per arm.
// NOTE(review): the snake_case keys suggest this is the shape written into the
// result JSON, and arm A / arm B presumably map to with_skill / no_skill —
// neither is established by this declaration file; confirm.
+ export interface TaskBreakdown {
35
+ id: string;
36
+ prompt: string;
37
+ criterion_type: CriterionType;
38
+ criterion: string;
39
+ arm_a_pass_rate: number;
40
+ arm_b_pass_rate: number;
41
+ }
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../packages/cli/src/types.ts"],"names":[],"mappings":""}
@@ -0,0 +1,5 @@
1
// Options accepted by verifyResult().
// - resultPath: path of the published result file; verifyResult reads it as
//   UTF-8 text and parses it as JSON.
// - sample: upper bound on how many tasks are re-run; verifyResult keeps the
//   first `sample` tasks of the suite.
+ export interface VerifyOptions {
2
+ resultPath: string;
3
+ sample: number;
4
+ }
5
// Re-runs a sample of a published evaluation. The implementation in verify.js
// resolves to an object with the keys `passed`, `sample`,
// `published_effect_pp`, `published_ci_pp`, `verify_effect_pp`, `verify_ci_pp`
// and `verify_verdict`; the declared type here is only `unknown`.
+ export declare function verifyResult(options: VerifyOptions): Promise<unknown>;
@@ -0,0 +1,71 @@
1
+ import { readFile } from 'node:fs/promises';
2
+ import { NvidiaNimClient } from './adapters/nvidia-nim.js';
3
+ import { JsonCache } from './cache.js';
4
+ import { loadNvidiaConfig } from './env.js';
5
+ import { gradeOutputs } from './grade.js';
6
+ import { normalizeSkill } from './normalize.js';
7
+ import { runTrials } from './run.js';
8
+ import { scorePairedObservations } from './score.js';
9
/**
 * Parse the JSON text of a task suite into normalized task records.
 *
 * The suite is either a bare array of tasks or an object whose `tasks`
 * property holds that array. A task may spell its criterion type as
 * `criterionType` or `criterion_type`; when both are absent it defaults to
 * `'rubric'`. Missing ids are generated from the 1-based position
 * (`t001`, `t002`, ...); missing prompts and criteria become empty strings.
 *
 * @param {string} text - Raw JSON text of the task suite.
 * @returns {{id: string, prompt: string, criterionType: 'rubric' | 'deterministic', criterion: string}[]}
 *   One normalized record per task, in suite order.
 * @throws {SyntaxError} If `text` is not valid JSON.
 * @throws {Error} If the suite contains no task array, a task entry is not an
 *   object, or a task uses an unsupported criterion type.
 */
function parseTasks(text) {
    const value = JSON.parse(text);
    // JSON.parse may legitimately return null (JSON text "null"); optional
    // chaining lets that case reach the descriptive error below instead of
    // failing with a TypeError on the property access.
    const tasks = Array.isArray(value) ? value : value?.tasks;
    if (!Array.isArray(tasks)) {
        throw new Error('Task suite must be an array or object with tasks');
    }
    return tasks.map((task, index) => {
        // A null entry would crash on property access and a primitive entry
        // would silently turn into an empty task, so reject both explicitly.
        if (task === null || typeof task !== 'object') {
            throw new Error(`Task ${index + 1} must be an object`);
        }
        const criterionType = task.criterionType ?? task.criterion_type ?? 'rubric';
        if (criterionType !== 'rubric' && criterionType !== 'deterministic') {
            throw new Error(`Unsupported criterion type in task ${index + 1}`);
        }
        return {
            id: String(task.id ?? `t${String(index + 1).padStart(3, '0')}`),
            prompt: String(task.prompt ?? ''),
            criterionType,
            criterion: String(task.criterion ?? '')
        };
    });
}
29
/**
 * Collapse graded trial outputs into one observation per (task, trial) pair.
 *
 * Each observation carries the pass flag of the `with_skill` arm as
 * `withSkillPass` and the pass flag of the other arm as `noSkillPass`.
 * Observations are returned in the order their pair was first seen.
 *
 * @param {Iterable<{taskId: string, trial: number, arm: string, pass: boolean}>} graded
 *   Graded outputs from both arms.
 * @returns {{withSkillPass: boolean, noSkillPass: boolean}[]} One entry per pair.
 * @throws {Error} If any pair lacks a boolean pass flag for either arm.
 */
function pairedObservations(graded) {
    const pairs = new Map();
    for (const { taskId, trial, arm, pass } of graded) {
        const pairKey = `${taskId}:${trial}`;
        let pair = pairs.get(pairKey);
        if (pair == null) {
            pair = {};
            pairs.set(pairKey, pair);
        }
        // Anything that is not the with-skill arm is recorded as the no-skill arm.
        const slot = arm === 'with_skill' ? 'withSkillPass' : 'noSkillPass';
        pair[slot] = pass;
    }

    const observations = [];
    for (const pair of pairs.values()) {
        const isComplete = typeof pair.withSkillPass === 'boolean' && typeof pair.noSkillPass === 'boolean';
        if (!isComplete) {
            throw new Error('Incomplete A/B pair while verifying');
        }
        observations.push(pair);
    }
    return observations;
}
49
/**
 * Re-run a sample of a published evaluation and report whether the freshly
 * measured effect falls inside the published interval (`result.ci_pp`).
 *
 * The published result file supplies the skill source, the task-suite path,
 * the trial count and the interval to compare against.
 *
 * @param {{resultPath: string, sample: number}} options - `resultPath` is the
 *   published result JSON file; `sample` caps how many tasks, taken from the
 *   start of the suite, are re-run.
 * @returns {Promise<object>} Summary with `passed`, `sample`,
 *   `published_effect_pp`, `published_ci_pp`, `verify_effect_pp`,
 *   `verify_ci_pp` and `verify_verdict`.
 * @throws {Error} If the published file lacks a `result.ci_pp` pair of finite
 *   numbers.
 */
export async function verifyResult(options) {
    const published = JSON.parse(await readFile(options.resultPath, 'utf8'));

    // Check the published interval up front. It is only needed for the final
    // comparison, but validating it after the trials would mean a malformed
    // result file fails with an opaque "not iterable" TypeError only once
    // runTrials/gradeOutputs have finished (both receive the model client, so
    // they presumably issue model calls). Non-numeric bounds such as null would
    // otherwise be coerced silently by the >= / <= comparison.
    const publishedInterval = published?.result?.ci_pp;
    const hasValidInterval = Array.isArray(publishedInterval)
        && publishedInterval.length >= 2
        && Number.isFinite(publishedInterval[0])
        && Number.isFinite(publishedInterval[1]);
    if (!hasValidInterval) {
        throw new Error('Published result must contain result.ci_pp as [lower, upper] numbers');
    }
    const [lower, upper] = publishedInterval;

    const config = loadNvidiaConfig();
    const client = new NvidiaNimClient(config);
    const cache = new JsonCache();
    const skill = await normalizeSkill(published.skill.source);
    const suiteText = await readFile(published.reproducibility.task_suite_path, 'utf8');
    // Only the first `sample` tasks of the suite are re-run.
    const tasks = parseTasks(suiteText).slice(0, options.sample);
    const outputs = await runTrials(skill, tasks, published.config.trials, config, client, cache);
    const graded = await gradeOutputs(tasks, outputs, config, client, cache);
    const score = scorePairedObservations(pairedObservations(graded));
    const passed = score.effectPp >= lower && score.effectPp <= upper;
    return {
        passed,
        sample: tasks.length,
        published_effect_pp: published.result.effect_pp,
        published_ci_pp: published.result.ci_pp,
        verify_effect_pp: score.effectPp,
        verify_ci_pp: score.ciPp,
        verify_verdict: score.verdict
    };
}
71
+ //# sourceMappingURL=verify.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"verify.js","sourceRoot":"","sources":["../../packages/cli/src/verify.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAC3D,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAC5C,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,uBAAuB,EAA0B,MAAM,YAAY,CAAC;AAQ7E,SAAS,UAAU,CAAC,IAAY;IAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAY,CAAC;IAC1C,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAE,KAA6B,CAAC,KAAK,CAAC;IAClF,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,kDAAkD,CAAC,CAAC;IACtE,CAAC;IACD,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE;QAC/B,MAAM,IAAI,GAAG,IAA+B,CAAC;QAC7C,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,cAAc,IAAI,QAAQ,CAAC;QAC5E,IAAI,aAAa,KAAK,QAAQ,IAAI,aAAa,KAAK,eAAe,EAAE,CAAC;YACpE,MAAM,IAAI,KAAK,CAAC,sCAAsC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;QACrE,CAAC;QACD,OAAO;YACL,EAAE,EAAE,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,IAAI,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;YAC/D,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,IAAI,EAAE,CAAC;YACjC,aAAa;YACb,SAAS,EAAE,MAAM,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;SACxC,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,kBAAkB,CAAC,MAAsB;IAChD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAsC,CAAC;IAC7D,KAAK,MAAM,IAAI,IAAI,MAAM,EAAE,CAAC;QAC1B,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;QAC3C,MAAM,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,IAAI,CAAC,GAAG,KAAK,YAAY,EAAE,CAAC;YAC9B,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,IAAI,CAAC;QACpC,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC;QAClC,CAAC;QACD,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IAC3B,CAAC;IACD,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACvC,IAAI,OAAO,IAAI,CAAC,aAAa,KAAK,SAAS,IAAI,OAAO,IAAI,CAAC,WAAW,KAAK,SAAS,EAAE,CAAC;YACrF,MA
AM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;QACzD,CAAC;QACD,OAAO,IAAyB,CAAC;IACnC,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,OAAsB;IACvD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,MAAM,CAAC,CAKtE,CAAC;IACF,MAAM,MAAM,GAAG,gBAAgB,EAAE,CAAC;IAClC,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC,MAAM,CAAC,CAAC;IAC3C,MAAM,KAAK,GAAG,IAAI,SAAS,EAAE,CAAC;IAC9B,MAAM,KAAK,GAAG,MAAM,cAAc,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC3D,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,QAAQ,CAAC,SAAS,CAAC,eAAe,CAAC,eAAe,EAAE,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;IACrH,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,EAAE,KAAK,EAAE,SAAS,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;IAC9F,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;IACzE,MAAM,KAAK,GAAG,uBAAuB,CAAC,kBAAkB,CAAC,MAAM,CAAC,CAAC,CAAC;IAClE,MAAM,CAAC,KAAK,EAAE,KAAK,CAAC,GAAG,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC;IAC9C,MAAM,MAAM,GAAG,KAAK,CAAC,QAAQ,IAAI,KAAK,IAAI,KAAK,CAAC,QAAQ,IAAI,KAAK,CAAC;IAElE,OAAO;QACL,MAAM;QACN,MAAM,EAAE,KAAK,CAAC,MAAM;QACpB,mBAAmB,EAAE,SAAS,CAAC,MAAM,CAAC,SAAS;QAC/C,eAAe,EAAE,SAAS,CAAC,MAAM,CAAC,KAAK;QACvC,gBAAgB,EAAE,KAAK,CAAC,QAAQ;QAChC,YAAY,EAAE,KAAK,CAAC,IAAI;QACxB,cAAc,EAAE,KAAK,CAAC,OAAO;KAC9B,CAAC;AACJ,CAAC"}