@alexanderzzlatkov/skilleval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ import { generateText } from 'ai';
2
+ import { buildComplianceSystemPrompt, buildMockTools } from './context-builder.js';
3
// Delay between retry attempts when a judge-model call fails.
const RETRY_DELAY_MS = 2000;
// Attempts per model before falling through to the next fallback model.
const MAX_RETRIES = 3;
5
/**
 * Calls generateText against each model in order, retrying each model up to
 * MAX_RETRIES times with a fixed delay, and returns the first successful text.
 *
 * Throws only after every model/attempt combination has failed. Unlike a bare
 * failure message, the thrown error preserves the last underlying error as its
 * `cause` and includes its message, so callers can see *why* the judges failed.
 */
async function generateWithRetry(models, options) {
    let lastError;
    for (const model of models) {
        for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
            try {
                const { text } = await generateText({
                    model,
                    system: options.system,
                    prompt: options.prompt,
                    temperature: options.temperature,
                });
                return text;
            }
            catch (err) {
                lastError = err;
                // Back off before retrying; skip the delay after the final
                // attempt so we move to the next fallback model immediately.
                if (attempt < MAX_RETRIES - 1) {
                    await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS));
                }
            }
        }
    }
    const detail = lastError instanceof Error ? `: ${lastError.message}` : '';
    throw new Error(`All judge models failed${detail}`, { cause: lastError });
}
26
// Removes Markdown code-fence markers (``` or ```json) that judge models
// sometimes wrap around their JSON output, then trims surrounding whitespace.
function stripCodeFences(text) {
    const withoutOpening = text.replace(/^```(?:json)?\s*\n?/gm, '');
    const withoutClosing = withoutOpening.replace(/\n?```\s*$/gm, '');
    return withoutClosing.trim();
}
29
/**
 * Parses a trigger-judge JSON response into { triggered, correct, reason },
 * coercing each field to its expected type. Any parse or access failure
 * yields a safe "failed to parse" result instead of throwing.
 */
function safeParseTriggerEval(text) {
    const fallback = { triggered: false, correct: false, reason: 'Failed to parse judge response' };
    try {
        const parsed = JSON.parse(stripCodeFences(text));
        const triggered = Boolean(parsed.triggered);
        const correct = Boolean(parsed.correct);
        const reason = String(parsed.reason ?? '');
        return { triggered, correct, reason };
    }
    catch {
        return fallback;
    }
}
42
/**
 * Parses a compliance-judge JSON response into { compliant, score, reason }.
 * The score is coerced to a number (non-numeric -> 0) and clamped to 0..100.
 * Any parse or access failure yields a safe "failed to parse" result.
 */
function safeParseComplianceEval(text) {
    const fallback = { compliant: false, score: 0, reason: 'Failed to parse judge response' };
    try {
        const parsed = JSON.parse(stripCodeFences(text));
        const rawScore = Number(parsed.score) || 0;
        return {
            compliant: Boolean(parsed.compliant),
            score: Math.min(100, Math.max(0, rawScore)),
            reason: String(parsed.reason ?? ''),
        };
    }
    catch {
        return fallback;
    }
}
55
// System prompt for the judge that decides whether the model under test
// correctly triggered (or correctly ignored) the skill.
const TRIGGER_JUDGE_SYSTEM = `You are an evaluation judge. You assess whether an AI model correctly identified when to use a specific skill.
Respond ONLY with valid JSON, no other text.`;
57
/**
 * Builds the user prompt for the trigger judge: skill metadata, the test
 * prompt, the expected behavior for this prompt type, and the model's raw
 * response, ending with the required JSON output schema.
 */
function triggerJudgePrompt(skill, promptText, promptType, response) {
    // Positive cases expect the skill to be picked up; negative cases expect it ignored.
    const expectation = promptType === 'positive'
        ? 'The model SHOULD have recognized this skill as relevant and indicated it would use it.'
        : 'The model should NOT have mentioned or tried to use this skill.';
    return `## Skill Being Tested
Name: ${skill.name}
Description: ${skill.description}

## User Prompt
"${promptText}"

## Expected Behavior
This is a ${promptType} test case.
${expectation}

## Model Response
${response}

## Your Task
Did the model behave correctly? Evaluate whether it correctly identified (or correctly ignored) the "${skill.name}" skill.

Respond ONLY with JSON:
{"triggered": true/false, "correct": true/false, "reason": "brief explanation"}`;
}
80
// System prompt for the judge that checks whether the model followed the
// skill's instructions (tool choice, arguments, workflow order).
const COMPLIANCE_JUDGE_SYSTEM = `You are an evaluation judge. You assess whether an AI model correctly followed a skill's instructions by calling the right tools and following the right workflow.
Respond ONLY with valid JSON, no other text.`;
82
/**
 * Builds the user prompt for the compliance judge: the full skill body, the
 * test prompt, the model's response plus its tool-call transcript, and the
 * evaluation rubric, ending with the required JSON output schema.
 */
function complianceJudgePrompt(skill, promptText, response) {
    const sections = [
        `## Skill Instructions\n${skill.body}`,
        `## User Prompt\n"${promptText}"`,
        `## Model Response and Tool Calls\n${response}`,
        `## Your Task\nDid the model correctly follow the skill's instructions? Evaluate:\n1. Did it call the correct tools as described in the skill?\n2. Did it pass reasonable arguments to those tools?\n3. Did it follow the stated workflow/steps in the right order?\n4. Did it stay within the scope of the skill?`,
        `Note: Tools returned mock results. Do not penalize for the quality of returned data — focus on whether the model called the right tools with the right parameters.`,
        `Respond ONLY with JSON:\n{"compliant": true/false, "score": 0-100, "reason": "brief explanation"}`,
    ];
    // Sections are separated by exactly one blank line.
    return sections.join('\n\n');
}
104
/**
 * Evaluates raw test results with the judge models and returns one eval
 * record per test result: the trigger verdict, plus (for positive prompts
 * that triggered) a compliance verdict obtained by re-running the prompt
 * with the full skill content and mock tools.
 *
 * Errored test results are recorded as failed triggers and skipped.
 * Progress and verbose detail are streamed to stderr throughout.
 */
export async function evaluateResults(skill, testResults, judgeModels, models, verbose) {
    const evalResults = [];
    const total = testResults.length;
    let completed = 0;
    const mockTools = buildMockTools();
    const toolNames = Object.keys(mockTools);
    const complianceSystemPrompt = buildComplianceSystemPrompt(skill);
    if (verbose) {
        process.stderr.write(`\n Compliance system prompt:\n ---\n${complianceSystemPrompt}\n ---\n`);
        if (toolNames.length > 0) {
            process.stderr.write(` Mock tools provided: ${toolNames.join(', ')}\n`);
        }
        process.stderr.write('\n');
    }
    for (const result of testResults) {
        completed++;
        const prefix = ` [${completed}/${total}] ${result.modelId} — ${result.prompt.type}: "${result.prompt.text.slice(0, 40)}..."`;
        // Test runs that already failed are not judged; record the error as a failed trigger.
        if (result.error) {
            process.stderr.write(`\n${prefix} — skipped (error)\n`);
            evalResults.push({
                modelId: result.modelId,
                prompt: result.prompt,
                response: result.response,
                trigger: { triggered: false, correct: false, reason: `Error: ${result.error}` },
            });
            continue;
        }
        // Trigger evaluation
        process.stderr.write(`\n${prefix}\n Judging trigger...`);
        let trigger;
        try {
            const text = await generateWithRetry(judgeModels, {
                system: TRIGGER_JUDGE_SYSTEM,
                prompt: triggerJudgePrompt(skill, result.prompt.text, result.prompt.type, result.response),
                temperature: 0.1,
            });
            trigger = safeParseTriggerEval(text);
            process.stderr.write(` ${trigger.correct ? 'PASS' : 'FAIL'} (triggered: ${trigger.triggered})\n`);
        }
        catch (err) {
            // Judge failure is recorded on the result rather than aborting the whole run.
            const message = err instanceof Error ? err.message : String(err);
            trigger = { triggered: false, correct: false, reason: `Judge call failed: ${message}` };
            process.stderr.write(` FAILED: ${message}\n`);
        }
        // Compliance evaluation (only for positive prompts where skill was triggered)
        let compliance;
        if (result.prompt.type === 'positive' && trigger.triggered) {
            // Run compliance test: send the same prompt with full skill content and mock tools
            const modelEntry = models.find(m => m.modelId === result.modelId);
            if (modelEntry) {
                try {
                    process.stderr.write(` Running compliance test...`);
                    if (toolNames.length > 0) {
                        process.stderr.write(` (mock tools: ${toolNames.join(', ')})`);
                    }
                    // NOTE(review): `toolCalls` destructured here is unused — only the
                    // per-step tool calls flattened below are read. Consider dropping it.
                    const { text: complianceResponse, toolCalls, steps } = await generateText({
                        model: modelEntry.model,
                        messages: [
                            { role: 'system', content: complianceSystemPrompt },
                            { role: 'user', content: result.prompt.text },
                        ],
                        tools: mockTools,
                        maxSteps: 10,
                        temperature: 0.3,
                    });
                    // Collect all tool calls across all steps
                    const allToolCalls = steps.flatMap(step => step.toolCalls ?? []);
                    const toolCallSummary = allToolCalls.length > 0
                        ? `\n\nTool calls made:\n${allToolCalls.map(tc => `- ${tc.toolName}(${JSON.stringify(tc.args)})`).join('\n')}`
                        : '\n\nNo tool calls were made.';
                    const fullResponse = complianceResponse + toolCallSummary;
                    process.stderr.write(` done (${allToolCalls.length} tool calls, ${steps.length} steps)\n`);
                    if (verbose) {
                        if (allToolCalls.length > 0) {
                            process.stderr.write(` Tool calls:\n`);
                            for (const tc of allToolCalls) {
                                process.stderr.write(` - ${tc.toolName}(${JSON.stringify(tc.args).slice(0, 100)})\n`);
                            }
                        }
                    }
                    process.stderr.write(` Judging compliance...`);
                    const judgeText = await generateWithRetry(judgeModels, {
                        system: COMPLIANCE_JUDGE_SYSTEM,
                        prompt: complianceJudgePrompt(skill, result.prompt.text, fullResponse),
                        temperature: 0.1,
                    });
                    compliance = safeParseComplianceEval(judgeText);
                    process.stderr.write(` ${compliance.compliant ? 'PASS' : 'FAIL'} (${compliance.score}/100)\n`);
                }
                catch (err) {
                    const message = err instanceof Error ? err.message : String(err);
                    compliance = { compliant: false, score: 0, reason: `Compliance evaluation failed: ${message}` };
                    process.stderr.write(` FAILED: ${message}\n`);
                }
            }
        }
        // `compliance` stays undefined for negative prompts and untriggered positives.
        evalResults.push({
            modelId: result.modelId,
            prompt: result.prompt,
            response: result.response,
            trigger,
            compliance,
        });
        if (verbose) {
            process.stderr.write(` Reason: ${trigger.reason}\n`);
            if (compliance) {
                process.stderr.write(` Compliance reason: ${compliance.reason}\n`);
            }
        }
    }
    process.stderr.write('\n');
    return evalResults;
}
217
/**
 * Aggregates per-test evaluation results into one report per model and
 * returns them sorted best-first by overall score.
 *
 * Overall = trigger accuracy (50%) + compliance pass rate (30%)
 *         + average compliance score (20%), rounded to an integer percent.
 * Results without a compliance verdict are excluded from the compliance terms.
 */
export function computeReport(evalResults, modelIds) {
    const reports = modelIds.map(modelId => {
        const rows = evalResults.filter(r => r.modelId === modelId);
        const withCompliance = rows.filter(r => r.compliance != null);
        const triggerCorrect = rows.reduce((count, r) => count + (r.trigger.correct ? 1 : 0), 0);
        const complianceCorrect = withCompliance.reduce((count, r) => count + (r.compliance.compliant ? 1 : 0), 0);
        const scoreSum = withCompliance.reduce((sum, r) => sum + r.compliance.score, 0);
        const avgScore = withCompliance.length > 0 ? scoreSum / withCompliance.length : 0;
        const triggerAcc = rows.length > 0 ? triggerCorrect / rows.length : 0;
        const complianceAcc = withCompliance.length > 0 ? complianceCorrect / withCompliance.length : 0;
        return {
            modelId,
            triggerScore: { correct: triggerCorrect, total: rows.length },
            complianceScore: {
                correct: complianceCorrect,
                total: withCompliance.length,
                avgScore: Math.round(avgScore),
            },
            overall: Math.round(triggerAcc * 50 + complianceAcc * 30 + (avgScore / 100) * 20),
        };
    });
    return reports.sort((a, b) => b.overall - a.overall);
}
239
+ //# sourceMappingURL=evaluator.js.map
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ import 'dotenv/config';
package/dist/index.js ADDED
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env node
2
+ import 'dotenv/config';
3
+ import { Command } from 'commander';
4
+ import chalk from 'chalk';
5
+ import { parseSkill } from './parser.js';
6
+ import { createModel, resolveApiKey } from './providers.js';
7
+ import { generateTestPrompts } from './test-generator.js';
8
+ import { runTests } from './runner.js';
9
+ import { evaluateResults, computeReport } from './evaluator.js';
10
+ import { printReport } from './reporter.js';
11
+ import { DEFAULT_FREE_MODELS, DEFAULT_GENERATOR_MODELS, DEFAULT_JUDGE_MODELS, PROVIDER_NAMES, } from './config.js';
12
// CLI entry point: parses flags, resolves API keys, loads the skill, generates
// test prompts, runs them against each model, judges the results, and prints a
// report. Exits 0 only when every model scores >= 50 overall.
const program = new Command();
program
    .name('skilleval')
    .description('Evaluate how well AI models understand Agent Skills (SKILL.md files)')
    .version('0.1.0')
    .argument('<skill>', 'Path, URL, or GitHub shorthand (owner/repo) to a SKILL.md file')
    .option('-p, --provider <provider>', 'Provider: openrouter, anthropic, openai, google', 'openrouter')
    .option('-m, --models <models>', 'Comma-separated model IDs to test')
    .option('-k, --key <key>', 'API key (or use provider-specific env var)')
    .option('--generator-model <model>', 'Model for test prompt generation (comma-separated for fallbacks)')
    .option('--judge-model <model>', 'Model for evaluation judging (comma-separated for fallbacks)')
    .option('--json', 'Output results as JSON', false)
    .option('--verbose', 'Show detailed per-prompt results', false)
    .option('--prompts <path>', 'Path to JSON file with custom test prompts')
    .option('-s, --skill <name>', 'Skill name within the repo (looks for skills/<name>/SKILL.md)')
    .option('-n, --count <number>', 'Number of positive+negative test prompts (default: 5+5)', '5')
    .action(async (skillSource, opts) => {
    try {
        const provider = opts.provider;
        if (!PROVIDER_NAMES.includes(provider)) {
            console.error(chalk.red(`Invalid provider "${provider}". Must be one of: ${PROVIDER_NAMES.join(', ')}`));
            process.exit(1);
        }
        // Resolve API key for the test models
        let apiKey;
        try {
            apiKey = resolveApiKey(provider, opts.key);
        }
        catch (err) {
            console.error(chalk.red(err.message));
            process.exit(1);
        }
        // Resolve model IDs
        const modelIds = opts.models
            ? opts.models.split(',').map((m) => m.trim())
            : (provider === 'openrouter' ? DEFAULT_FREE_MODELS : []);
        if (modelIds.length === 0) {
            console.error(chalk.red('No models specified. Use --models or default to openrouter provider for free models.'));
            process.exit(1);
        }
        // Resolve generator/judge keys (always OpenRouter for internal models)
        let internalApiKey;
        try {
            internalApiKey = resolveApiKey('openrouter', provider === 'openrouter' ? apiKey : undefined);
        }
        catch {
            // Without an OpenRouter key we can still run if the user supplies
            // their own prompts; generation/judging models get an empty key.
            if (!opts.prompts) {
                console.error(chalk.red('OPENROUTER_API_KEY is required for test generation and evaluation (uses free models).\n' +
                    'Set OPENROUTER_API_KEY env var, or provide custom prompts with --prompts.'));
                process.exit(1);
            }
            internalApiKey = '';
        }
        // Parse skill
        process.stderr.write(chalk.cyan('Parsing skill...\n'));
        const skill = await parseSkill(skillSource, opts.skill);
        if (!opts.json) {
            console.log(`\n${chalk.bold('skilleval')} v0.1.0`);
            console.log(`${chalk.bold('Skill:')} ${skill.name}`);
            console.log(`${chalk.bold('Description:')} ${skill.description}`);
            console.log(`${chalk.bold('Provider:')} ${provider}`);
            console.log(`${chalk.bold('Models:')} ${modelIds.length}\n`);
        }
        // Create model instances
        const models = modelIds.map(id => ({
            model: createModel(provider, id, apiKey),
            modelId: id,
        }));
        // Generate test prompts
        process.stderr.write(chalk.cyan('Generating test prompts...\n'));
        const generatorModelIds = opts.generatorModel
            ? opts.generatorModel.split(',').map((m) => m.trim())
            : DEFAULT_GENERATOR_MODELS;
        const generatorModels = generatorModelIds.map(id => createModel('openrouter', id, internalApiKey));
        // NOTE(review): parseInt result is not validated — a non-numeric
        // --count yields NaN downstream; confirm generateTestPrompts handles it.
        const count = parseInt(opts.count, 10);
        const prompts = await generateTestPrompts(skill, generatorModels, count, opts.prompts, opts.verbose);
        process.stderr.write(chalk.green(` Generated ${prompts.length} test prompts\n\n`));
        // Run trigger tests
        process.stderr.write(chalk.cyan('Running trigger tests...\n'));
        const testResults = await runTests(skill, prompts, models, opts.verbose);
        // Evaluate results
        process.stderr.write(chalk.cyan('Evaluating results...\n'));
        const judgeModelIds = opts.judgeModel
            ? opts.judgeModel.split(',').map((m) => m.trim())
            : DEFAULT_JUDGE_MODELS;
        const judgeModels = judgeModelIds.map(id => createModel('openrouter', id, internalApiKey));
        const evalResults = await evaluateResults(skill, testResults, judgeModels, models, opts.verbose);
        // Compute and print report
        const reports = computeReport(evalResults, modelIds);
        console.log('');
        printReport(reports, evalResults, { json: opts.json, verbose: opts.verbose });
        // Exit code based on scores
        const allPassing = reports.every(r => r.overall >= 50);
        process.exit(allPassing ? 0 : 1);
    }
    catch (err) {
        console.error(chalk.red(`Error: ${err.message}`));
        process.exit(1);
    }
});
program.parse();
113
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,2 @@
1
import type { SkillDefinition } from './config.js';
/** Resolves a skill source (path, URL, or GitHub shorthand) and parses its SKILL.md. */
export declare function parseSkill(source: string, skillName?: string): Promise<SkillDefinition>;
package/dist/parser.js ADDED
@@ -0,0 +1,152 @@
1
import { access, readFile } from 'node:fs/promises';
import { basename } from 'node:path';
import matter from 'gray-matter';
4
// Matches a GitHub file URL: https://github.com/<owner>/<repo>/blob/<branch>/<path>
const GITHUB_BLOB_RE = /^https?:\/\/github\.com\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)$/;
// Matches a bare GitHub repository URL: https://github.com/<owner>/<repo>
const GITHUB_REPO_RE = /^https?:\/\/github\.com\/([^/]+)\/([^/]+)\/?$/;
// Matches an "owner/repo" shorthand: exactly one slash, no whitespace.
// NOTE(review): a relative path like "skills/SKILL.md" also matches this.
const GITHUB_SHORTHAND_RE = /^([^/\s]+)\/([^/\s]+)$/;
// Locations probed (in order) when discovering a SKILL.md inside a repo.
const SKILL_SEARCH_PATHS = [
    'SKILL.md',
    '.claude/skills',
    'skills',
];
12
// Fetches a URL and returns the response body as text; throws on non-2xx status.
async function fetchText(url) {
    const response = await fetch(url);
    if (response.ok) {
        return response.text();
    }
    throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
}
18
// Converts repo coordinates into a raw.githubusercontent.com download URL.
function toRawGitHubUrl(owner, repo, branch, path) {
    const segments = [owner, repo, branch, path];
    return `https://raw.githubusercontent.com/${segments.join('/')}`;
}
21
/**
 * Locates a named skill in a GitHub repo by HEAD-probing well-known paths
 * on the 'main' and 'master' branches, returning the first raw URL that
 * responds OK. Throws when no candidate location exists.
 */
async function discoverNamedSkill(owner, repo, skillName) {
    const searchPaths = [
        `skills/${skillName}/SKILL.md`,
        `.claude/skills/${skillName}/SKILL.md`,
        `${skillName}/SKILL.md`,
    ];
    const branches = ['main', 'master'];
    for (const path of searchPaths) {
        for (const branch of branches) {
            const url = toRawGitHubUrl(owner, repo, branch, path);
            // Network errors are treated the same as a missing file.
            const found = await fetch(url, { method: 'HEAD' }).then(res => res.ok, () => false);
            if (found) {
                return url;
            }
        }
    }
    throw new Error(`Could not find skill "${skillName}" in ${owner}/${repo}. Searched ${searchPaths.join(', ')}.`);
}
40
/**
 * Locates a SKILL.md inside a GitHub repo when no skill name was given:
 * 1. HEAD-checks the repo root on 'main', then 'master'.
 * 2. Falls back to the GitHub contents API, scanning the '.claude/skills'
 *    and 'skills' directories (and their immediate subdirectories) on both
 *    branches for a SKILL.md entry.
 * Returns the raw.githubusercontent.com URL of the first match; throws if
 * nothing is found. All fetch/parse failures are swallowed and treated as
 * "not here — keep looking".
 * NOTE(review): unauthenticated api.github.com calls are rate-limited;
 * throttled responses fall through silently to the final error.
 */
async function discoverSkillInRepo(owner, repo) {
    // Try root SKILL.md first
    const rootUrl = toRawGitHubUrl(owner, repo, 'main', 'SKILL.md');
    try {
        const res = await fetch(rootUrl, { method: 'HEAD' });
        if (res.ok)
            return rootUrl;
    }
    catch { /* continue */ }
    // Try 'master' branch
    const masterUrl = toRawGitHubUrl(owner, repo, 'master', 'SKILL.md');
    try {
        const res = await fetch(masterUrl, { method: 'HEAD' });
        if (res.ok)
            return masterUrl;
    }
    catch { /* continue */ }
    // Search via GitHub API for SKILL.md files in common directories
    for (const searchPath of SKILL_SEARCH_PATHS.slice(1)) {
        for (const branch of ['main', 'master']) {
            const apiUrl = `https://api.github.com/repos/${owner}/${repo}/contents/${searchPath}?ref=${branch}`;
            try {
                const res = await fetch(apiUrl);
                if (!res.ok)
                    continue;
                const items = await res.json();
                if (!Array.isArray(items))
                    continue;
                for (const item of items) {
                    if (item.name === 'SKILL.md') {
                        return toRawGitHubUrl(owner, repo, branch, item.path);
                    }
                    // Check subdirectories (e.g., .claude/skills/my-skill/SKILL.md)
                    const subApiUrl = `https://api.github.com/repos/${owner}/${repo}/contents/${item.path}?ref=${branch}`;
                    try {
                        const subRes = await fetch(subApiUrl);
                        if (!subRes.ok)
                            continue;
                        const subItems = await subRes.json();
                        if (!Array.isArray(subItems))
                            continue;
                        const skillFile = subItems.find(s => s.name === 'SKILL.md');
                        if (skillFile) {
                            return toRawGitHubUrl(owner, repo, branch, skillFile.path);
                        }
                    }
                    catch { /* continue */ }
                }
            }
            catch { /* continue */ }
        }
    }
    throw new Error(`Could not find SKILL.md in ${owner}/${repo}. Searched root, .claude/skills/, and skills/ directories.`);
}
94
/**
 * Resolves a skill source to its SKILL.md content plus a display name.
 * Accepted forms, tried in order: GitHub blob URL, GitHub repo URL,
 * "owner/repo" shorthand, any other http(s) URL, and finally a local path.
 *
 * Fix: GITHUB_SHORTHAND_RE also matches relative local paths such as
 * "skills/SKILL.md", which previously got misrouted to GitHub discovery.
 * An existing local file now takes precedence over the shorthand reading.
 */
async function resolveSource(source, skillName) {
    // Case 1: GitHub blob URL — rewrite to the raw-content URL.
    const blobMatch = source.match(GITHUB_BLOB_RE);
    if (blobMatch) {
        const [, owner, repo, branch, path] = blobMatch;
        const rawUrl = toRawGitHubUrl(owner, repo, branch, path);
        return { content: await fetchText(rawUrl), sourceName: path };
    }
    // Case 2: GitHub repo URL — discover the SKILL.md within the repo.
    const repoMatch = source.match(GITHUB_REPO_RE);
    if (repoMatch) {
        const [, owner, repo] = repoMatch;
        const rawUrl = skillName
            ? await discoverNamedSkill(owner, repo, skillName)
            : await discoverSkillInRepo(owner, repo);
        return { content: await fetchText(rawUrl), sourceName: skillName ?? `${owner}/${repo}` };
    }
    // Case 3: GitHub shorthand (owner/repo) — only when no local file of
    // that name exists, so relative paths keep resolving locally.
    const shorthandMatch = source.match(GITHUB_SHORTHAND_RE);
    if (shorthandMatch && !source.includes('\\') && !source.includes(':')) {
        const existsLocally = await access(source).then(() => true, () => false);
        if (!existsLocally) {
            const [, owner, repo] = shorthandMatch;
            const rawUrl = skillName
                ? await discoverNamedSkill(owner, repo, skillName)
                : await discoverSkillInRepo(owner, repo);
            return { content: await fetchText(rawUrl), sourceName: skillName ?? `${owner}/${repo}` };
        }
    }
    // Case 4: Any other URL — fetch directly.
    if (source.startsWith('http://') || source.startsWith('https://')) {
        return { content: await fetchText(source), sourceName: source };
    }
    // Case 5: Local file path.
    const content = await readFile(source, 'utf-8');
    return { content, sourceName: basename(source, '.md') };
}
128
/**
 * Loads a SKILL.md from any supported source and parses its frontmatter.
 * Returns { name, description, body, rawContent }:
 * - name: frontmatter `name`, else the first "# Heading", else the source name;
 * - description: frontmatter `description`, else the first non-heading
 *   paragraph truncated to 200 chars, else a placeholder.
 * Throws when the file has no content body below the frontmatter.
 */
export async function parseSkill(source, skillName) {
    const { content: rawContent, sourceName } = await resolveSource(source, skillName);
    const { data, content: body } = matter(rawContent);
    if (!body || !body.trim()) {
        throw new Error('SKILL.md has no content body');
    }
    // Name: frontmatter wins; otherwise first markdown heading, then source name.
    const headingMatch = body.match(/^#\s+(.+)$/m);
    const fallbackName = headingMatch ? headingMatch[1].trim() : sourceName;
    const name = data.name || fallbackName;
    // Description: frontmatter wins; otherwise first non-heading paragraph.
    const firstParagraph = body
        .split('\n\n')
        .find(p => p.trim() && !p.trim().startsWith('#'));
    const fallbackDescription = firstParagraph
        ? firstParagraph.trim().slice(0, 200)
        : 'No description available';
    const description = data.description || fallbackDescription;
    return { name, description, body: body.trim(), rawContent };
}
152
+ //# sourceMappingURL=parser.js.map
@@ -0,0 +1,4 @@
1
import type { LanguageModel } from 'ai';
import { type ProviderName } from './config.js';
/** Returns the API key for a provider from the CLI flag or its env var; throws if neither is set. */
export declare function resolveApiKey(provider: ProviderName, cliKey?: string): string;
/** Creates a LanguageModel instance for the given provider and model id. */
export declare function createModel(provider: ProviderName, modelId: string, apiKey: string): LanguageModel;
@@ -0,0 +1,33 @@
1
+ import { createOpenAI } from '@ai-sdk/openai';
2
+ import { createAnthropic } from '@ai-sdk/anthropic';
3
+ import { createGoogleGenerativeAI } from '@ai-sdk/google';
4
+ import { PROVIDER_ENV_VARS } from './config.js';
5
/**
 * Returns the API key for a provider: an explicitly passed CLI key wins,
 * then the provider's environment variable. Throws with guidance when
 * neither yields a non-empty value.
 */
export function resolveApiKey(provider, cliKey) {
    const envVar = PROVIDER_ENV_VARS[provider];
    const key = cliKey || process.env[envVar];
    if (!key) {
        throw new Error(`No API key found for ${provider}. Provide --key or set ${envVar} environment variable.`);
    }
    return key;
}
14
/**
 * Instantiates a language model for the given provider and model id.
 * OpenRouter reuses the OpenAI-compatible client with a custom base URL and
 * attribution headers; the other providers use their dedicated SDK factories.
 *
 * Fix: the switch had no default, so an unknown provider string (possible at
 * runtime in JS, where TypeScript's ProviderName check is erased) silently
 * returned undefined. It now fails loudly.
 */
export function createModel(provider, modelId, apiKey) {
    switch (provider) {
        case 'openrouter':
            return createOpenAI({
                baseURL: 'https://openrouter.ai/api/v1',
                apiKey,
                headers: {
                    'HTTP-Referer': 'https://github.com/zlatkov/skilleval',
                    'X-Title': 'skilleval',
                },
            })(modelId);
        case 'openai':
            return createOpenAI({ apiKey })(modelId);
        case 'anthropic':
            return createAnthropic({ apiKey })(modelId);
        case 'google':
            return createGoogleGenerativeAI({ apiKey })(modelId);
        default:
            throw new Error(`Unknown provider: ${provider}`);
    }
}
33
+ //# sourceMappingURL=providers.js.map
@@ -0,0 +1,5 @@
1
import type { EvalReport, EvalResult } from './config.js';
/** Prints the evaluation as JSON (when options.json) or as a formatted console table. */
export declare function printReport(reports: EvalReport[], evalResults: EvalResult[], options: {
    json: boolean;
    verbose: boolean;
}): void;
@@ -0,0 +1,80 @@
1
+ import chalk from 'chalk';
2
// Pads str with trailing spaces to exactly len characters, truncating if longer.
function padRight(str, len) {
    if (str.length > len) {
        return str.slice(0, len);
    }
    return str.padEnd(len, ' ');
}
5
// Maps an overall score to a chalk color: green >= 70, yellow >= 40, red below.
function scoreColor(score) {
    if (score < 40) {
        return chalk.red;
    }
    return score >= 70 ? chalk.green : chalk.yellow;
}
12
/**
 * Renders the per-model summary as a box-drawing table on stdout, then
 * highlights the best and (when more than one) worst models.
 * Assumes `reports` is already sorted best-first (computeReport sorts
 * descending by overall score).
 */
function printTable(reports) {
    // Model column grows to fit the longest id; the others are fixed-width.
    const modelWidth = Math.max(20, ...reports.map(r => r.modelId.length)) + 2;
    const triggerWidth = 14;
    const complianceWidth = 16;
    const overallWidth = 9;
    // Builds one horizontal border row from the given corner/junction chars.
    const line = (left, mid, right, fill) => left + fill.repeat(modelWidth) + mid + fill.repeat(triggerWidth) + mid + fill.repeat(complianceWidth) + mid + fill.repeat(overallWidth) + right;
    console.log(line('┌', '┬', '┐', '─'));
    console.log('│' + padRight(' Model', modelWidth) +
        '│' + padRight(' Trigger', triggerWidth) +
        '│' + padRight(' Compliance', complianceWidth) +
        '│' + padRight(' Overall', overallWidth) + '│');
    console.log(line('├', '┼', '┤', '─'));
    for (const report of reports) {
        const triggerStr = `${report.triggerScore.correct}/${report.triggerScore.total}`;
        // Compliance shows pass/total plus average score; N/A when never judged.
        const complianceStr = report.complianceScore.total > 0
            ? `${report.complianceScore.correct}/${report.complianceScore.total} (${report.complianceScore.avgScore})`
            : 'N/A';
        const overallStr = `${report.overall}%`;
        const color = scoreColor(report.overall);
        console.log('│' + padRight(` ${report.modelId}`, modelWidth) +
            '│' + padRight(` ${triggerStr}`, triggerWidth) +
            '│' + padRight(` ${complianceStr}`, complianceWidth) +
            '│' + color(padRight(` ${overallStr}`, overallWidth)) + '│');
    }
    console.log(line('└', '┴', '┘', '─'));
    if (reports.length > 0) {
        const best = reports[0];
        const worst = reports[reports.length - 1];
        console.log(`\n${chalk.green('Best model:')} ${best.modelId} (${best.overall}%)`);
        if (reports.length > 1) {
            console.log(`${chalk.red('Worst model:')} ${worst.modelId} (${worst.overall}%)`);
        }
    }
}
46
// Prints every individual evaluation result, grouped by model, preserving
// the order results were produced in.
function printVerbose(evalResults) {
    const byModel = new Map();
    for (const result of evalResults) {
        if (!byModel.has(result.modelId)) {
            byModel.set(result.modelId, []);
        }
        byModel.get(result.modelId).push(result);
    }
    for (const [modelId, results] of byModel) {
        console.log(`\n${chalk.bold(`--- ${modelId} ---`)}`);
        for (const result of results) {
            const status = result.trigger.correct ? chalk.green('PASS') : chalk.red('FAIL');
            console.log(` [${status}] ${result.prompt.type}: "${result.prompt.text.slice(0, 60)}"`);
            console.log(` ${result.trigger.reason}`);
            if (result.compliance) {
                const compStatus = result.compliance.compliant ? chalk.green('PASS') : chalk.red('FAIL');
                console.log(` Compliance: [${compStatus}] ${result.compliance.score}/100 — ${result.compliance.reason}`);
            }
        }
    }
}
66
/**
 * Output entry point: dumps reports and raw eval results as JSON when
 * --json was passed; otherwise prints the summary table followed by either
 * verbose per-test detail or a hint about the available flags.
 */
export function printReport(reports, evalResults, options) {
    if (options.json) {
        console.log(JSON.stringify({ reports, evalResults }, null, 2));
        return;
    }
    printTable(reports);
    if (!options.verbose) {
        console.log(`\nRun with ${chalk.cyan('--verbose')} to see individual test results.`);
        console.log(`Run with ${chalk.cyan('--json')} to get machine-readable output.`);
        return;
    }
    printVerbose(evalResults);
}
80
+ //# sourceMappingURL=reporter.js.map
@@ -0,0 +1,2 @@
1
import type { ModelWithId, SkillDefinition, TestPrompt, TestResult } from './config.js';
/** Runs the generated test prompts against the given models and collects raw results (implemented in runner.js). */
export declare function runTests(skill: SkillDefinition, prompts: TestPrompt[], models: ModelWithId[], verbose: boolean): Promise<TestResult[]>;