@alexanderzzlatkov/skilleval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +186 -0
- package/dist/config.d.ts +63 -0
- package/dist/config.js +42 -0
- package/dist/context-builder.d.ts +5 -0
- package/dist/context-builder.js +121 -0
- package/dist/evaluator.d.ts +4 -0
- package/dist/evaluator.js +239 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +113 -0
- package/dist/parser.d.ts +2 -0
- package/dist/parser.js +152 -0
- package/dist/providers.d.ts +4 -0
- package/dist/providers.js +33 -0
- package/dist/reporter.d.ts +5 -0
- package/dist/reporter.js +80 -0
- package/dist/runner.d.ts +2 -0
- package/dist/runner.js +53 -0
- package/dist/test-generator.d.ts +3 -0
- package/dist/test-generator.js +109 -0
- package/package.json +58 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import { generateText } from 'ai';
|
|
2
|
+
import { buildComplianceSystemPrompt, buildMockTools } from './context-builder.js';
|
|
3
|
+
// Delay between retry attempts against the same judge model.
const RETRY_DELAY_MS = 2000;
// Attempts per model before falling back to the next model in the chain.
const MAX_RETRIES = 3;
/**
 * Run `generateText` against each model in order, retrying each up to
 * MAX_RETRIES times with a fixed delay, and return the first successful
 * text completion.
 *
 * @param {Array} models - Judge models tried in order (fallback chain).
 * @param {{system: string, prompt: string, temperature: number}} options
 * @returns {Promise<string>} Text of the first successful generation.
 * @throws {Error} When every attempt on every model fails; the last
 *   underlying error is attached as `cause` so callers can diagnose why.
 */
async function generateWithRetry(models, options) {
    let lastError;
    for (let modelIdx = 0; modelIdx < models.length; modelIdx++) {
        for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
            try {
                const { text } = await generateText({
                    model: models[modelIdx],
                    system: options.system,
                    prompt: options.prompt,
                    temperature: options.temperature,
                });
                return text;
            }
            catch (err) {
                // Keep the most recent failure instead of discarding it —
                // previously all errors were swallowed, so the terminal
                // "All judge models failed" carried no diagnostic detail.
                lastError = err;
                if (attempt < MAX_RETRIES - 1) {
                    await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS));
                }
            }
        }
    }
    throw new Error('All judge models failed', { cause: lastError });
}
|
|
26
|
+
/**
 * Remove markdown code fences (``` or ```json) wrapping a judge response
 * and trim surrounding whitespace, leaving the bare payload.
 */
function stripCodeFences(text) {
    const withoutOpening = text.replace(/^```(?:json)?\s*\n?/gm, '');
    const withoutClosing = withoutOpening.replace(/\n?```\s*$/gm, '');
    return withoutClosing.trim();
}
|
|
29
|
+
/**
 * Parse a trigger-judge JSON response into a normalized
 * {triggered, correct, reason} record. Malformed responses yield a
 * fail-safe record rather than throwing.
 */
function safeParseTriggerEval(text) {
    try {
        const { triggered, correct, reason } = JSON.parse(stripCodeFences(text));
        return {
            triggered: Boolean(triggered),
            correct: Boolean(correct),
            reason: String(reason ?? ''),
        };
    }
    catch {
        return { triggered: false, correct: false, reason: 'Failed to parse judge response' };
    }
}
|
|
42
|
+
/**
 * Parse a compliance-judge JSON response into a normalized
 * {compliant, score, reason} record. The score is coerced to a number
 * (non-numeric becomes 0) and clamped to [0, 100]. Malformed responses
 * yield a fail-safe zero-score record rather than throwing.
 */
function safeParseComplianceEval(text) {
    try {
        const parsed = JSON.parse(stripCodeFences(text));
        const rawScore = Number(parsed.score) || 0;
        const clampedScore = Math.min(100, Math.max(0, rawScore));
        return {
            compliant: Boolean(parsed.compliant),
            score: clampedScore,
            reason: String(parsed.reason ?? ''),
        };
    }
    catch {
        return { compliant: false, score: 0, reason: 'Failed to parse judge response' };
    }
}
|
|
55
|
+
const TRIGGER_JUDGE_SYSTEM = `You are an evaluation judge. You assess whether an AI model correctly identified when to use a specific skill.
Respond ONLY with valid JSON, no other text.`;
/**
 * Build the user prompt for the trigger judge: given the skill, the test
 * prompt, whether the case is positive or negative, and the model's
 * response, ask the judge whether the model triggered correctly.
 */
function triggerJudgePrompt(skill, promptText, promptType, response) {
    // Spell out the expected behavior for this case type up front so the
    // template stays flat.
    const expectation = promptType === 'positive'
        ? 'The model SHOULD have recognized this skill as relevant and indicated it would use it.'
        : 'The model should NOT have mentioned or tried to use this skill.';
    return `## Skill Being Tested
Name: ${skill.name}
Description: ${skill.description}

## User Prompt
"${promptText}"

## Expected Behavior
This is a ${promptType} test case.
${expectation}

## Model Response
${response}

## Your Task
Did the model behave correctly? Evaluate whether it correctly identified (or correctly ignored) the "${skill.name}" skill.

Respond ONLY with JSON:
{"triggered": true/false, "correct": true/false, "reason": "brief explanation"}`;
}
|
|
80
|
+
const COMPLIANCE_JUDGE_SYSTEM = `You are an evaluation judge. You assess whether an AI model correctly followed a skill's instructions by calling the right tools and following the right workflow.
Respond ONLY with valid JSON, no other text.`;
/**
 * Build the user prompt for the compliance judge: given the full skill
 * body, the original test prompt, and the model's response (including a
 * tool-call summary), ask the judge to score instruction adherence.
 */
function complianceJudgePrompt(skill, promptText, response) {
    const judgePrompt = `## Skill Instructions
${skill.body}

## User Prompt
"${promptText}"

## Model Response and Tool Calls
${response}

## Your Task
Did the model correctly follow the skill's instructions? Evaluate:
1. Did it call the correct tools as described in the skill?
2. Did it pass reasonable arguments to those tools?
3. Did it follow the stated workflow/steps in the right order?
4. Did it stay within the scope of the skill?

Note: Tools returned mock results. Do not penalize for the quality of returned data — focus on whether the model called the right tools with the right parameters.

Respond ONLY with JSON:
{"compliant": true/false, "score": 0-100, "reason": "brief explanation"}`;
    return judgePrompt;
}
|
|
104
|
+
/**
 * Evaluate raw trigger-test results with LLM judges.
 *
 * For every test result: a trigger judge decides whether the model
 * correctly triggered (or correctly ignored) the skill. For positive
 * prompts judged as triggered, a second "compliance" pass re-runs the
 * prompt against the tested model with the full skill body and mock
 * tools, then a compliance judge scores adherence (0-100).
 *
 * Progress is streamed to stderr throughout.
 *
 * @param {object} skill - Parsed skill ({name, description, body, ...}).
 * @param {Array} testResults - Per-model trigger-test outcomes.
 * @param {Array} judgeModels - Judge model fallback chain.
 * @param {Array<{model: object, modelId: string}>} models - Tested models.
 * @param {boolean} verbose - Extra stderr diagnostics when true.
 * @returns {Promise<Array>} One record per test result:
 *   {modelId, prompt, response, trigger, compliance?}; `compliance` is
 *   undefined when the compliance pass did not run.
 */
export async function evaluateResults(skill, testResults, judgeModels, models, verbose) {
    const evalResults = [];
    const total = testResults.length;
    let completed = 0;
    const mockTools = buildMockTools();
    const toolNames = Object.keys(mockTools);
    const complianceSystemPrompt = buildComplianceSystemPrompt(skill);
    if (verbose) {
        process.stderr.write(`\n Compliance system prompt:\n ---\n${complianceSystemPrompt}\n ---\n`);
        if (toolNames.length > 0) {
            process.stderr.write(` Mock tools provided: ${toolNames.join(', ')}\n`);
        }
        process.stderr.write('\n');
    }
    for (const result of testResults) {
        completed++;
        const prefix = ` [${completed}/${total}] ${result.modelId} — ${result.prompt.type}: "${result.prompt.text.slice(0, 40)}..."`;
        // Trigger-test errors short-circuit: record a failed trigger and skip judging.
        if (result.error) {
            process.stderr.write(`\n${prefix} — skipped (error)\n`);
            evalResults.push({
                modelId: result.modelId,
                prompt: result.prompt,
                response: result.response,
                trigger: { triggered: false, correct: false, reason: `Error: ${result.error}` },
            });
            continue;
        }
        // Trigger evaluation
        process.stderr.write(`\n${prefix}\n Judging trigger...`);
        let trigger;
        try {
            const text = await generateWithRetry(judgeModels, {
                system: TRIGGER_JUDGE_SYSTEM,
                prompt: triggerJudgePrompt(skill, result.prompt.text, result.prompt.type, result.response),
                temperature: 0.1,
            });
            trigger = safeParseTriggerEval(text);
            process.stderr.write(` ${trigger.correct ? 'PASS' : 'FAIL'} (triggered: ${trigger.triggered})\n`);
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            trigger = { triggered: false, correct: false, reason: `Judge call failed: ${message}` };
            process.stderr.write(` FAILED: ${message}\n`);
        }
        // Compliance evaluation (only for positive prompts where skill was triggered)
        let compliance;
        if (result.prompt.type === 'positive' && trigger.triggered) {
            // Run compliance test: send the same prompt with full skill content and mock tools
            const modelEntry = models.find(m => m.modelId === result.modelId);
            if (modelEntry) {
                try {
                    process.stderr.write(` Running compliance test...`);
                    if (toolNames.length > 0) {
                        process.stderr.write(` (mock tools: ${toolNames.join(', ')})`);
                    }
                    // NOTE: tool calls are gathered from `steps` below, so the
                    // top-level `toolCalls` field (previously destructured but
                    // unused) is not needed here.
                    const { text: complianceResponse, steps } = await generateText({
                        model: modelEntry.model,
                        messages: [
                            { role: 'system', content: complianceSystemPrompt },
                            { role: 'user', content: result.prompt.text },
                        ],
                        tools: mockTools,
                        maxSteps: 10,
                        temperature: 0.3,
                    });
                    // Collect all tool calls across all steps
                    const allToolCalls = steps.flatMap(step => step.toolCalls ?? []);
                    const toolCallSummary = allToolCalls.length > 0
                        ? `\n\nTool calls made:\n${allToolCalls.map(tc => `- ${tc.toolName}(${JSON.stringify(tc.args)})`).join('\n')}`
                        : '\n\nNo tool calls were made.';
                    const fullResponse = complianceResponse + toolCallSummary;
                    process.stderr.write(` done (${allToolCalls.length} tool calls, ${steps.length} steps)\n`);
                    if (verbose) {
                        if (allToolCalls.length > 0) {
                            process.stderr.write(` Tool calls:\n`);
                            for (const tc of allToolCalls) {
                                process.stderr.write(` - ${tc.toolName}(${JSON.stringify(tc.args).slice(0, 100)})\n`);
                            }
                        }
                    }
                    process.stderr.write(` Judging compliance...`);
                    const judgeText = await generateWithRetry(judgeModels, {
                        system: COMPLIANCE_JUDGE_SYSTEM,
                        prompt: complianceJudgePrompt(skill, result.prompt.text, fullResponse),
                        temperature: 0.1,
                    });
                    compliance = safeParseComplianceEval(judgeText);
                    process.stderr.write(` ${compliance.compliant ? 'PASS' : 'FAIL'} (${compliance.score}/100)\n`);
                }
                catch (err) {
                    const message = err instanceof Error ? err.message : String(err);
                    compliance = { compliant: false, score: 0, reason: `Compliance evaluation failed: ${message}` };
                    process.stderr.write(` FAILED: ${message}\n`);
                }
            }
        }
        evalResults.push({
            modelId: result.modelId,
            prompt: result.prompt,
            response: result.response,
            trigger,
            compliance,
        });
        if (verbose) {
            process.stderr.write(` Reason: ${trigger.reason}\n`);
            if (compliance) {
                process.stderr.write(` Compliance reason: ${compliance.reason}\n`);
            }
        }
    }
    process.stderr.write('\n');
    return evalResults;
}
|
|
217
|
+
/**
 * Aggregate per-result evaluations into one report per model, sorted by
 * overall score (descending).
 *
 * Overall score weights: trigger accuracy 50%, compliance pass rate 30%,
 * average compliance score 20%. Results with no compliance record are
 * excluded from the compliance denominators.
 */
export function computeReport(evalResults, modelIds) {
    const reportFor = (modelId) => {
        const rows = evalResults.filter(r => r.modelId === modelId);
        const triggerTotal = rows.length;
        const triggerCorrect = rows.filter(r => r.trigger.correct).length;
        const withCompliance = rows.filter(r => r.compliance != null);
        const complianceTotal = withCompliance.length;
        const complianceCorrect = withCompliance.filter(r => r.compliance.compliant).length;
        let avgScore = 0;
        if (complianceTotal > 0) {
            const scoreSum = withCompliance.reduce((sum, r) => sum + r.compliance.score, 0);
            avgScore = scoreSum / complianceTotal;
        }
        const triggerAcc = triggerTotal > 0 ? triggerCorrect / triggerTotal : 0;
        const complianceAcc = complianceTotal > 0 ? complianceCorrect / complianceTotal : 0;
        const overall = Math.round(triggerAcc * 50 + complianceAcc * 30 + (avgScore / 100) * 20);
        return {
            modelId,
            triggerScore: { correct: triggerCorrect, total: triggerTotal },
            complianceScore: { correct: complianceCorrect, total: complianceTotal, avgScore: Math.round(avgScore) },
            overall,
        };
    };
    return modelIds.map(reportFor).sort((a, b) => b.overall - a.overall);
}
|
|
239
|
+
//# sourceMappingURL=evaluator.js.map
|
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import 'dotenv/config';
|
|
3
|
+
import { Command } from 'commander';
|
|
4
|
+
import chalk from 'chalk';
|
|
5
|
+
import { parseSkill } from './parser.js';
|
|
6
|
+
import { createModel, resolveApiKey } from './providers.js';
|
|
7
|
+
import { generateTestPrompts } from './test-generator.js';
|
|
8
|
+
import { runTests } from './runner.js';
|
|
9
|
+
import { evaluateResults, computeReport } from './evaluator.js';
|
|
10
|
+
import { printReport } from './reporter.js';
|
|
11
|
+
import { DEFAULT_FREE_MODELS, DEFAULT_GENERATOR_MODELS, DEFAULT_JUDGE_MODELS, PROVIDER_NAMES, } from './config.js';
|
|
12
|
+
// CLI entry point for skilleval. Pipeline: parse the SKILL.md source,
// generate test prompts, run trigger tests against each model, judge the
// results, then print a scored report. Exit code 0 only when every model
// scores >= 50 overall.
const program = new Command();
program
    .name('skilleval')
    .description('Evaluate how well AI models understand Agent Skills (SKILL.md files)')
    .version('0.1.0')
    .argument('<skill>', 'Path, URL, or GitHub shorthand (owner/repo) to a SKILL.md file')
    .option('-p, --provider <provider>', 'Provider: openrouter, anthropic, openai, google', 'openrouter')
    .option('-m, --models <models>', 'Comma-separated model IDs to test')
    .option('-k, --key <key>', 'API key (or use provider-specific env var)')
    .option('--generator-model <model>', 'Model for test prompt generation (comma-separated for fallbacks)')
    .option('--judge-model <model>', 'Model for evaluation judging (comma-separated for fallbacks)')
    .option('--json', 'Output results as JSON', false)
    .option('--verbose', 'Show detailed per-prompt results', false)
    .option('--prompts <path>', 'Path to JSON file with custom test prompts')
    .option('-s, --skill <name>', 'Skill name within the repo (looks for skills/<name>/SKILL.md)')
    .option('-n, --count <number>', 'Number of positive+negative test prompts (default: 5+5)', '5')
    .action(async (skillSource, opts) => {
    try {
        const provider = opts.provider;
        // Validate the provider name before spending any network calls.
        if (!PROVIDER_NAMES.includes(provider)) {
            console.error(chalk.red(`Invalid provider "${provider}". Must be one of: ${PROVIDER_NAMES.join(', ')}`));
            process.exit(1);
        }
        // Resolve API key for the test models
        let apiKey;
        try {
            apiKey = resolveApiKey(provider, opts.key);
        }
        catch (err) {
            console.error(chalk.red(err.message));
            process.exit(1);
        }
        // Resolve model IDs
        // Only the openrouter provider has a built-in default model list.
        const modelIds = opts.models
            ? opts.models.split(',').map((m) => m.trim())
            : (provider === 'openrouter' ? DEFAULT_FREE_MODELS : []);
        if (modelIds.length === 0) {
            console.error(chalk.red('No models specified. Use --models or default to openrouter provider for free models.'));
            process.exit(1);
        }
        // Resolve generator/judge keys (always OpenRouter for internal models)
        // When --prompts supplies custom prompts, a missing OpenRouter key is
        // tolerated (generation is skipped); the empty-string key is then
        // only a placeholder for judge-model construction.
        let internalApiKey;
        try {
            internalApiKey = resolveApiKey('openrouter', provider === 'openrouter' ? apiKey : undefined);
        }
        catch {
            if (!opts.prompts) {
                console.error(chalk.red('OPENROUTER_API_KEY is required for test generation and evaluation (uses free models).\n' +
                    'Set OPENROUTER_API_KEY env var, or provide custom prompts with --prompts.'));
                process.exit(1);
            }
            internalApiKey = '';
        }
        // Parse skill
        process.stderr.write(chalk.cyan('Parsing skill...\n'));
        const skill = await parseSkill(skillSource, opts.skill);
        // Human-readable header is suppressed in --json mode so stdout stays
        // machine-parseable.
        if (!opts.json) {
            console.log(`\n${chalk.bold('skilleval')} v0.1.0`);
            console.log(`${chalk.bold('Skill:')} ${skill.name}`);
            console.log(`${chalk.bold('Description:')} ${skill.description}`);
            console.log(`${chalk.bold('Provider:')} ${provider}`);
            console.log(`${chalk.bold('Models:')} ${modelIds.length}\n`);
        }
        // Create model instances
        const models = modelIds.map(id => ({
            model: createModel(provider, id, apiKey),
            modelId: id,
        }));
        // Generate test prompts
        process.stderr.write(chalk.cyan('Generating test prompts...\n'));
        const generatorModelIds = opts.generatorModel
            ? opts.generatorModel.split(',').map((m) => m.trim())
            : DEFAULT_GENERATOR_MODELS;
        const generatorModels = generatorModelIds.map(id => createModel('openrouter', id, internalApiKey));
        const count = parseInt(opts.count, 10);
        const prompts = await generateTestPrompts(skill, generatorModels, count, opts.prompts, opts.verbose);
        process.stderr.write(chalk.green(` Generated ${prompts.length} test prompts\n\n`));
        // Run trigger tests
        process.stderr.write(chalk.cyan('Running trigger tests...\n'));
        const testResults = await runTests(skill, prompts, models, opts.verbose);
        // Evaluate results
        process.stderr.write(chalk.cyan('Evaluating results...\n'));
        const judgeModelIds = opts.judgeModel
            ? opts.judgeModel.split(',').map((m) => m.trim())
            : DEFAULT_JUDGE_MODELS;
        const judgeModels = judgeModelIds.map(id => createModel('openrouter', id, internalApiKey));
        const evalResults = await evaluateResults(skill, testResults, judgeModels, models, opts.verbose);
        // Compute and print report
        const reports = computeReport(evalResults, modelIds);
        console.log('');
        printReport(reports, evalResults, { json: opts.json, verbose: opts.verbose });
        // Exit code based on scores
        const allPassing = reports.every(r => r.overall >= 50);
        process.exit(allPassing ? 0 : 1);
    }
    catch (err) {
        // Any unhandled failure in the pipeline ends the run with exit code 1.
        console.error(chalk.red(`Error: ${err.message}`));
        process.exit(1);
    }
});
program.parse();
|
|
113
|
+
//# sourceMappingURL=index.js.map
|
package/dist/parser.d.ts
ADDED
package/dist/parser.js
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
2
|
+
import { basename } from 'node:path';
|
|
3
|
+
import matter from 'gray-matter';
|
|
4
|
+
// Matches a GitHub "blob" URL; captures owner, repo, branch/ref, and file path.
const GITHUB_BLOB_RE = /^https?:\/\/github\.com\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)$/;
// Matches a bare GitHub repository URL; captures owner and repo.
const GITHUB_REPO_RE = /^https?:\/\/github\.com\/([^/]+)\/([^/]+)\/?$/;
// Matches "owner/repo" shorthand: exactly one slash, no whitespace on either side.
const GITHUB_SHORTHAND_RE = /^([^/\s]+)\/([^/\s]+)$/;
// Locations probed when auto-discovering a SKILL.md in a repo: the root
// file first, then directories whose entries are listed via the GitHub API.
const SKILL_SEARCH_PATHS = [
    'SKILL.md',
    '.claude/skills',
    'skills',
];
|
|
12
|
+
/** Fetch a URL and return its body as text; throws on any non-2xx status. */
async function fetchText(url) {
    const response = await fetch(url);
    if (!response.ok) {
        throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
    }
    return response.text();
}
|
|
18
|
+
/** Build a raw.githubusercontent.com URL for a file at a given ref. */
function toRawGitHubUrl(owner, repo, branch, path) {
    const suffix = [owner, repo, branch, path].join('/');
    return `https://raw.githubusercontent.com/${suffix}`;
}
|
|
21
|
+
/**
 * Locate a named skill's SKILL.md in a GitHub repo by HEAD-probing the
 * conventional locations on the `main` then `master` branch. Returns the
 * raw-content URL of the first hit; throws when nothing is found.
 */
async function discoverNamedSkill(owner, repo, skillName) {
    const searchPaths = [
        `skills/${skillName}/SKILL.md`,
        `.claude/skills/${skillName}/SKILL.md`,
        `${skillName}/SKILL.md`,
    ];
    for (const candidate of searchPaths) {
        for (const branch of ['main', 'master']) {
            const url = toRawGitHubUrl(owner, repo, branch, candidate);
            try {
                const head = await fetch(url, { method: 'HEAD' });
                if (head.ok) {
                    return url;
                }
            }
            catch { /* network error — try the next candidate */ }
        }
    }
    throw new Error(`Could not find skill "${skillName}" in ${owner}/${repo}. Searched ${searchPaths.join(', ')}.`);
}
|
|
40
|
+
/**
 * Auto-discover a SKILL.md in a GitHub repo when no skill name is given.
 *
 * Strategy: HEAD-probe a root SKILL.md on `main` then `master`; failing
 * that, list the conventional skill directories via the GitHub contents
 * API and return the first SKILL.md found directly or one level down.
 *
 * @returns {Promise<string>} Raw-content URL of the discovered SKILL.md.
 * @throws {Error} When no SKILL.md exists in any searched location.
 */
async function discoverSkillInRepo(owner, repo) {
    // Try root SKILL.md first, on each default-branch name in turn.
    for (const branch of ['main', 'master']) {
        const rootUrl = toRawGitHubUrl(owner, repo, branch, 'SKILL.md');
        try {
            const res = await fetch(rootUrl, { method: 'HEAD' });
            if (res.ok)
                return rootUrl;
        }
        catch { /* continue */ }
    }
    // Search via GitHub API for SKILL.md files in common directories
    for (const searchPath of SKILL_SEARCH_PATHS.slice(1)) {
        for (const branch of ['main', 'master']) {
            const apiUrl = `https://api.github.com/repos/${owner}/${repo}/contents/${searchPath}?ref=${branch}`;
            try {
                const res = await fetch(apiUrl);
                if (!res.ok)
                    continue;
                const items = await res.json();
                if (!Array.isArray(items))
                    continue;
                for (const item of items) {
                    if (item.name === 'SKILL.md') {
                        return toRawGitHubUrl(owner, repo, branch, item.path);
                    }
                    // Only directories can contain a nested SKILL.md; skipping
                    // files/symlinks avoids one contents-API request (and rate
                    // limit hit) per non-directory entry.
                    if (item.type !== 'dir')
                        continue;
                    // Check subdirectories (e.g., .claude/skills/my-skill/SKILL.md)
                    const subApiUrl = `https://api.github.com/repos/${owner}/${repo}/contents/${item.path}?ref=${branch}`;
                    try {
                        const subRes = await fetch(subApiUrl);
                        if (!subRes.ok)
                            continue;
                        const subItems = await subRes.json();
                        if (!Array.isArray(subItems))
                            continue;
                        const skillFile = subItems.find(s => s.name === 'SKILL.md');
                        if (skillFile) {
                            return toRawGitHubUrl(owner, repo, branch, skillFile.path);
                        }
                    }
                    catch { /* continue */ }
                }
            }
            catch { /* continue */ }
        }
    }
    throw new Error(`Could not find SKILL.md in ${owner}/${repo}. Searched root, .claude/skills/, and skills/ directories.`);
}
|
|
94
|
+
/**
 * Resolve a skill source string to its raw markdown content plus a
 * human-readable source name. Tried in order: GitHub blob URL, GitHub
 * repo URL, "owner/repo" shorthand, arbitrary http(s) URL, local file.
 *
 * @param {string} source - Path, URL, or GitHub shorthand.
 * @param {string} [skillName] - Optional skill name used for discovery
 *   inside a repo (skills/<name>/SKILL.md etc.).
 * @returns {Promise<{content: string, sourceName: string}>}
 * @throws When fetching/reading fails or no SKILL.md can be discovered.
 */
async function resolveSource(source, skillName) {
    // Case 1: GitHub blob URL
    const blobMatch = source.match(GITHUB_BLOB_RE);
    if (blobMatch) {
        const [, owner, repo, branch, path] = blobMatch;
        const rawUrl = toRawGitHubUrl(owner, repo, branch, path);
        return { content: await fetchText(rawUrl), sourceName: path };
    }
    // Case 2: GitHub repo URL
    const repoMatch = source.match(GITHUB_REPO_RE);
    if (repoMatch) {
        const [, owner, repo] = repoMatch;
        const rawUrl = skillName
            ? await discoverNamedSkill(owner, repo, skillName)
            : await discoverSkillInRepo(owner, repo);
        return { content: await fetchText(rawUrl), sourceName: skillName ?? `${owner}/${repo}` };
    }
    // Case 3: GitHub shorthand (owner/repo)
    // The guards exclude Windows-style paths ('\\') and anything with a
    // scheme or drive letter (':').
    // NOTE(review): a relative local path with exactly one slash (e.g.
    // "docs/SKILL.md") also matches this shorthand and will be treated as
    // a GitHub repo, never reaching Case 5 — confirm this is intended.
    const shorthandMatch = source.match(GITHUB_SHORTHAND_RE);
    if (shorthandMatch && !source.includes('\\') && !source.includes(':')) {
        const [, owner, repo] = shorthandMatch;
        const rawUrl = skillName
            ? await discoverNamedSkill(owner, repo, skillName)
            : await discoverSkillInRepo(owner, repo);
        return { content: await fetchText(rawUrl), sourceName: skillName ?? `${owner}/${repo}` };
    }
    // Case 4: URL
    if (source.startsWith('http://') || source.startsWith('https://')) {
        return { content: await fetchText(source), sourceName: source };
    }
    // Case 5: Local file path
    const content = await readFile(source, 'utf-8');
    return { content, sourceName: basename(source, '.md') };
}
|
|
128
|
+
/**
 * Resolve and parse a SKILL.md into {name, description, body, rawContent}.
 *
 * Name falls back from frontmatter `name` to the first H1 heading to the
 * source-derived name; description falls back from frontmatter
 * `description` to the first non-heading paragraph (truncated to 200
 * chars) to a placeholder.
 *
 * @throws {Error} When the markdown body is empty.
 */
export async function parseSkill(source, skillName) {
    const resolved = await resolveSource(source, skillName);
    const { content: rawContent, sourceName } = resolved;
    const { data, content: body } = matter(rawContent);
    if (!body || !body.trim()) {
        throw new Error('SKILL.md has no content body');
    }
    // Resolve the skill name (falsy frontmatter values fall through).
    let name = data.name;
    if (!name) {
        const heading = body.match(/^#\s+(.+)$/m);
        name = heading ? heading[1].trim() : sourceName;
    }
    // Resolve the description.
    let description = data.description;
    if (!description) {
        const firstParagraph = body
            .split('\n\n')
            .find(p => p.trim() && !p.trim().startsWith('#'));
        description = firstParagraph
            ? firstParagraph.trim().slice(0, 200)
            : 'No description available';
    }
    return { name, description, body: body.trim(), rawContent };
}
|
|
152
|
+
//# sourceMappingURL=parser.js.map
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { LanguageModel } from 'ai';
|
|
2
|
+
import { type ProviderName } from './config.js';
|
|
3
|
+
export declare function resolveApiKey(provider: ProviderName, cliKey?: string): string;
|
|
4
|
+
export declare function createModel(provider: ProviderName, modelId: string, apiKey: string): LanguageModel;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { createOpenAI } from '@ai-sdk/openai';
|
|
2
|
+
import { createAnthropic } from '@ai-sdk/anthropic';
|
|
3
|
+
import { createGoogleGenerativeAI } from '@ai-sdk/google';
|
|
4
|
+
import { PROVIDER_ENV_VARS } from './config.js';
|
|
5
|
+
/**
 * Resolve the API key for a provider: an explicit CLI key wins, then the
 * provider's environment variable; otherwise an Error is thrown telling
 * the user which env var to set.
 */
export function resolveApiKey(provider, cliKey) {
    if (cliKey) {
        return cliKey;
    }
    const envVar = PROVIDER_ENV_VARS[provider];
    const fromEnv = process.env[envVar];
    if (fromEnv) {
        return fromEnv;
    }
    throw new Error(`No API key found for ${provider}. Provide --key or set ${envVar} environment variable.`);
}
|
|
14
|
+
/**
 * Instantiate an AI SDK language model for the given provider.
 *
 * OpenRouter uses the OpenAI-compatible client pointed at the OpenRouter
 * API with attribution headers; the other providers use their dedicated
 * SDK factories.
 *
 * @param {string} provider - One of 'openrouter' | 'openai' | 'anthropic' | 'google'.
 * @param {string} modelId - Provider-specific model identifier.
 * @param {string} apiKey - API key for the provider.
 * @returns {LanguageModel}
 * @throws {Error} For an unrecognized provider (previously this fell
 *   through and silently returned undefined).
 */
export function createModel(provider, modelId, apiKey) {
    switch (provider) {
        case 'openrouter':
            return createOpenAI({
                baseURL: 'https://openrouter.ai/api/v1',
                apiKey,
                headers: {
                    'HTTP-Referer': 'https://github.com/zlatkov/skilleval',
                    'X-Title': 'skilleval',
                },
            })(modelId);
        case 'openai':
            return createOpenAI({ apiKey })(modelId);
        case 'anthropic':
            return createAnthropic({ apiKey })(modelId);
        case 'google':
            return createGoogleGenerativeAI({ apiKey })(modelId);
        default:
            throw new Error(`Unsupported provider: ${provider}`);
    }
}
|
|
33
|
+
//# sourceMappingURL=providers.js.map
|
package/dist/reporter.js
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import chalk from 'chalk';
|
|
2
|
+
/** Truncate or space-pad `str` so the result is exactly `len` characters. */
function padRight(str, len) {
    return str.slice(0, len).padEnd(len, ' ');
}
|
|
5
|
+
/** Pick a chalk color for a score: green >= 70, yellow >= 40, else red. */
function scoreColor(score) {
    if (score >= 70) {
        return chalk.green;
    }
    return score >= 40 ? chalk.yellow : chalk.red;
}
|
|
12
|
+
/**
 * Render the per-model score table with box-drawing characters, followed
 * by a best/worst summary. Assumes `reports` is already sorted by
 * overall score descending (computeReport's output order).
 */
function printTable(reports) {
    // Model column grows to the longest model ID (min 20) plus padding;
    // the other columns are fixed-width.
    const modelWidth = Math.max(20, ...reports.map(r => r.modelId.length)) + 2;
    const triggerWidth = 14;
    const complianceWidth = 16;
    const overallWidth = 9;
    // Builds a horizontal border row from corner/junction/fill characters.
    const line = (left, mid, right, fill) => left + fill.repeat(modelWidth) + mid + fill.repeat(triggerWidth) + mid + fill.repeat(complianceWidth) + mid + fill.repeat(overallWidth) + right;
    console.log(line('┌', '┬', '┐', '─'));
    console.log('│' + padRight(' Model', modelWidth) +
        '│' + padRight(' Trigger', triggerWidth) +
        '│' + padRight(' Compliance', complianceWidth) +
        '│' + padRight(' Overall', overallWidth) + '│');
    console.log(line('├', '┼', '┤', '─'));
    for (const report of reports) {
        const triggerStr = `${report.triggerScore.correct}/${report.triggerScore.total}`;
        // Compliance shows "correct/total (avg)" or N/A when no compliance
        // tests ran for this model.
        const complianceStr = report.complianceScore.total > 0
            ? `${report.complianceScore.correct}/${report.complianceScore.total} (${report.complianceScore.avgScore})`
            : 'N/A';
        const overallStr = `${report.overall}%`;
        // Only the Overall cell is colorized, by score band.
        const color = scoreColor(report.overall);
        console.log('│' + padRight(` ${report.modelId}`, modelWidth) +
            '│' + padRight(` ${triggerStr}`, triggerWidth) +
            '│' + padRight(` ${complianceStr}`, complianceWidth) +
            '│' + color(padRight(` ${overallStr}`, overallWidth)) + '│');
    }
    console.log(line('└', '┴', '┘', '─'));
    if (reports.length > 0) {
        const best = reports[0];
        const worst = reports[reports.length - 1];
        console.log(`\n${chalk.green('Best model:')} ${best.modelId} (${best.overall}%)`);
        if (reports.length > 1) {
            console.log(`${chalk.red('Worst model:')} ${worst.modelId} (${worst.overall}%)`);
        }
    }
}
|
|
46
|
+
/**
 * Print every individual evaluation result, grouped by model (in
 * first-seen order), with trigger pass/fail, the judge's reasoning, and
 * the compliance verdict where one exists.
 */
function printVerbose(evalResults) {
    // Group results per model while preserving encounter order.
    const byModel = new Map();
    for (const result of evalResults) {
        if (!byModel.has(result.modelId)) {
            byModel.set(result.modelId, []);
        }
        byModel.get(result.modelId).push(result);
    }
    for (const [modelId, results] of byModel) {
        console.log(`\n${chalk.bold(`--- ${modelId} ---`)}`);
        for (const result of results) {
            const status = result.trigger.correct ? chalk.green('PASS') : chalk.red('FAIL');
            console.log(` [${status}] ${result.prompt.type}: "${result.prompt.text.slice(0, 60)}"`);
            console.log(` ${result.trigger.reason}`);
            if (result.compliance) {
                const compStatus = result.compliance.compliant ? chalk.green('PASS') : chalk.red('FAIL');
                console.log(` Compliance: [${compStatus}] ${result.compliance.score}/100 — ${result.compliance.reason}`);
            }
        }
    }
}
|
|
66
|
+
/**
 * Top-level report printer. In --json mode, dump reports plus raw eval
 * results as JSON and return; otherwise print the score table, then
 * either the verbose per-result breakdown or hints about the flags.
 */
export function printReport(reports, evalResults, options) {
    if (options.json) {
        console.log(JSON.stringify({ reports, evalResults }, null, 2));
        return;
    }
    printTable(reports);
    if (!options.verbose) {
        console.log(`\nRun with ${chalk.cyan('--verbose')} to see individual test results.`);
        console.log(`Run with ${chalk.cyan('--json')} to get machine-readable output.`);
        return;
    }
    printVerbose(evalResults);
}
|
|
80
|
+
//# sourceMappingURL=reporter.js.map
|
package/dist/runner.d.ts
ADDED