@alexanderzzlatkov/skilleval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +186 -0
- package/dist/config.d.ts +63 -0
- package/dist/config.js +42 -0
- package/dist/context-builder.d.ts +5 -0
- package/dist/context-builder.js +121 -0
- package/dist/evaluator.d.ts +4 -0
- package/dist/evaluator.js +239 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +113 -0
- package/dist/parser.d.ts +2 -0
- package/dist/parser.js +152 -0
- package/dist/providers.d.ts +4 -0
- package/dist/providers.js +33 -0
- package/dist/reporter.d.ts +5 -0
- package/dist/reporter.js +80 -0
- package/dist/runner.d.ts +2 -0
- package/dist/runner.js +53 -0
- package/dist/test-generator.d.ts +3 -0
- package/dist/test-generator.js +109 -0
- package/package.json +58 -0
package/dist/runner.js
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { generateText } from 'ai';
|
|
2
|
+
import { buildTriggerSystemPrompt } from './context-builder.js';
|
|
3
|
+
/**
 * Sends every test prompt to every model, one request at a time, and
 * collects what came back.
 *
 * Progress is always written to stderr; extra detail is written when
 * `verbose` is truthy.
 *
 * @param skill   Skill definition passed to buildTriggerSystemPrompt to
 *                produce the system message used for every request.
 * @param prompts Test prompts; each prompt's `text` becomes the user message.
 * @param models  Entries of the shape `{ model, modelId }`.
 * @param verbose When truthy, also prints the system prompt and a short
 *                preview of each exchange (or the error message).
 * @returns An array of `{ modelId, prompt, response, latencyMs }` records;
 *          a record produced by a failed call has an empty `response` and
 *          an additional `error` string.
 */
export async function runTests(skill, prompts, models, verbose) {
    const triggerPrompt = buildTriggerSystemPrompt(skill);
    const outcomes = [];
    const totalTests = models.length * prompts.length;
    if (verbose) {
        process.stderr.write(`\n Trigger system prompt:\n ---\n${triggerPrompt}\n ---\n\n`);
    }
    let finished = 0;
    for (const { model, modelId } of models) {
        for (const prompt of prompts) {
            finished += 1;
            process.stderr.write(`\r [${finished}/${totalTests}] Testing ${modelId}...`);
            const startedAt = performance.now();
            // Latency is measured at the moment the record is built, so it
            // covers the full round trip (or the time until the failure).
            const elapsedMs = () => Math.round(performance.now() - startedAt);
            try {
                const { text: response } = await generateText({
                    model,
                    messages: [
                        { role: 'system', content: triggerPrompt },
                        { role: 'user', content: prompt.text },
                    ],
                    temperature: 0.3,
                });
                outcomes.push({ modelId, prompt, response, latencyMs: elapsedMs() });
                if (verbose) {
                    process.stderr.write(`\n ${prompt.type}: "${prompt.text.slice(0, 60)}..." → ${response.slice(0, 80)}...\n`);
                }
            }
            catch (err) {
                // A failed call is recorded rather than rethrown so the
                // remaining tests still run.
                const reason = err instanceof Error ? err.message : String(err);
                outcomes.push({
                    modelId,
                    prompt,
                    response: '',
                    latencyMs: elapsedMs(),
                    error: reason,
                });
                if (verbose) {
                    process.stderr.write(`\n ERROR: ${reason}\n`);
                }
            }
        }
    }
    // Terminate the carriage-return progress line.
    process.stderr.write('\n');
    return outcomes;
}
|
|
53
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { type LanguageModel } from 'ai';
|
|
2
|
+
import type { SkillDefinition, TestPrompt } from './config.js';
|
|
3
|
+
/**
 * Produces the positive and negative test prompts used to evaluate a skill.
 *
 * When `customPromptsPath` is given, the prompts are read from that JSON file
 * and validated. Otherwise each generator model is asked in turn (several
 * attempts per model) to generate them, and a built-in fallback set is used
 * if every attempt fails.
 *
 * @param skill Skill whose name, description and body are used to build the generation request.
 * @param generatorModels Models to try, in order.
 * @param count Number of positive prompts and number of negative prompts expected.
 * @param customPromptsPath Optional path to a JSON file containing the prompts.
 * @param verbose When true, the chosen prompts are listed on stderr.
 * @returns The validated, generated, or fallback prompts.
 */
export declare function generateTestPrompts(skill: SkillDefinition, generatorModels: LanguageModel[], count: number, customPromptsPath?: string, verbose?: boolean): Promise<TestPrompt[]>;
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
2
|
+
import { generateText } from 'ai';
|
|
3
|
+
// Pause, in milliseconds, between failed prompt-generation attempts on the same model.
const RETRY_DELAY_MS = 2000;
|
|
4
|
+
/**
 * Extracts the payload from a model reply that may be wrapped in Markdown
 * code fences.
 *
 * If the reply contains a fenced block whose opening and closing fences each
 * sit on their own line, the contents of the first such block are returned.
 * This discards any prose the model added before or after the block and
 * accepts any info string after the opening fence (`json`, `JSON`,
 * `javascript`, ...).
 *
 * Otherwise (no fences, an unterminated fence, or fences that share a line
 * with content) stray fence markers are stripped in place, as before.
 *
 * @param text Raw reply text.
 * @returns The trimmed payload, ready to hand to JSON.parse.
 */
function stripCodeFences(text) {
    // Fences must be alone on their lines so that a literal ``` inside a JSON
    // string value is never mistaken for the end of the block.
    const fencedBlock = /^[ \t]*```[^\n`]*\n([\s\S]*?)\n[ \t]*```[ \t\r]*$/m.exec(text);
    if (fencedBlock) {
        return fencedBlock[1].trim();
    }
    return text.replace(/^```(?:json)?\s*\n?/gm, '').replace(/\n?```\s*$/gm, '').trim();
}
|
|
7
|
+
/**
 * Checks that `parsed` is a list of well-formed test prompts containing
 * exactly `count` positive and `count` negative entries.
 *
 * An entry is well-formed when it is a non-null object with a non-blank
 * string `text` and a `type` of 'positive' or 'negative'.
 *
 * @param parsed Value to validate (typically the result of JSON.parse).
 * @param count  Required number of prompts of each type.
 * @returns New `{ text, type }` objects with `text` trimmed, in input order.
 * @throws Error 'Expected JSON array' when `parsed` is not an array,
 *         'Invalid prompt format' at the first malformed entry, or a count
 *         mismatch message when the totals differ from `count`.
 */
function validatePrompts(parsed, count) {
    if (!Array.isArray(parsed)) {
        throw new Error('Expected JSON array');
    }
    const isWellFormed = (entry) => entry !== null
        && typeof entry === 'object'
        && typeof entry.text === 'string'
        && entry.text.trim().length > 0
        && (entry.type === 'positive' || entry.type === 'negative');
    const tally = { positive: 0, negative: 0 };
    const prompts = [];
    for (const entry of parsed) {
        if (!isWellFormed(entry)) {
            throw new Error('Invalid prompt format');
        }
        // Copy only the two known fields so extra properties are dropped.
        prompts.push({ text: entry.text.trim(), type: entry.type });
        tally[entry.type] += 1;
    }
    const balanced = tally.positive === count && tally.negative === count;
    if (!balanced) {
        throw new Error(`Expected ${count} positive and ${count} negative prompts, got ${tally.positive} and ${tally.negative}`);
    }
    return prompts;
}
|
|
26
|
+
// Generic requests that mention the skill by name or description. Each entry
// renders one "positive" prompt text for a given skill.
const ALL_FALLBACK_POSITIVE = [
    (skill) => 'I need help with ' + skill.name,
    (skill) => 'Can you use the ' + skill.name + ' skill for this task?',
    (skill) => 'Help me with something related to ' + skill.description.toLowerCase(),
    (skill) => "I'd like to " + skill.description.toLowerCase().slice(0, 50),
    (skill) => 'Please assist me with ' + skill.name + ' functionality',
];

// Fixed "negative" prompt texts that do not reference the skill at all.
const ALL_FALLBACK_NEGATIVE = [
    "What's the weather like today?",
    'Tell me a joke about programming',
    'How do I make a cup of coffee?',
    'What is the capital of France?',
    'Help me write a haiku about nature',
];

/**
 * Builds a canned prompt set without calling any model.
 *
 * @param skill Object providing `name` and `description` for the positive templates.
 * @param count Number of prompts to take from each list; each list holds
 *              five entries, so at most five of each type are returned.
 * @returns Positive prompts first, then negative prompts, as `{ text, type }` objects.
 */
function fallbackPrompts(skill, count) {
    const prompts = [];
    for (const render of ALL_FALLBACK_POSITIVE.slice(0, count)) {
        prompts.push({ text: render(skill), type: 'positive' });
    }
    for (const text of ALL_FALLBACK_NEGATIVE.slice(0, count)) {
        prompts.push({ text, type: 'negative' });
    }
    return prompts;
}
|
|
45
|
+
/**
 * Builds the user message asking a model to write test prompts for a skill.
 *
 * The message includes the skill's name, description and the first 500
 * characters of its body, and requests `count` positive plus `count`
 * negative prompts as a JSON array.
 *
 * @param skill Object providing `name`, `description` and `body`.
 * @param count Number of prompts requested for each type.
 * @returns The prompt text.
 */
const GENERATION_PROMPT = (skill, count) => [
    'Generate test prompts for this AI skill:',
    `Name: ${skill.name}`,
    `Description: ${skill.description}`,
    '',
    `Content summary (first 500 chars): ${skill.body.slice(0, 500)}`,
    '',
    `Generate exactly ${count * 2} prompts as a JSON array:`,
    `- ${count} "positive" prompts that SHOULD trigger this skill (realistic user requests, varied wording)`,
    `- ${count} "negative" prompts that should NOT trigger this skill (related but out of scope, or different topics)`,
    '',
    'Format: [{"text": "...", "type": "positive"}, {"text": "...", "type": "negative"}]',
    '',
    'Respond ONLY with valid JSON, no other text.',
].join('\n');
|
|
58
|
+
function logPrompts(prompts, source) {
|
|
59
|
+
process.stderr.write(`\n Test prompts (${source}):\n`);
|
|
60
|
+
for (const p of prompts) {
|
|
61
|
+
const label = p.type === 'positive' ? '+' : '-';
|
|
62
|
+
process.stderr.write(` [${label}] ${p.text}\n`);
|
|
63
|
+
}
|
|
64
|
+
process.stderr.write('\n');
|
|
65
|
+
}
|
|
66
|
+
/**
 * Produces the positive and negative test prompts for a skill.
 *
 * Sources, in order of precedence:
 *  1. `customPromptsPath` - a JSON file that is read, parsed and validated;
 *     any failure here rejects the returned promise.
 *  2. The generator models - tried in order, up to three attempts each, with
 *     a pause of RETRY_DELAY_MS between attempts on the same model. Every
 *     failure is reported on stderr as a warning.
 *  3. The built-in fallback prompts, when every attempt has failed.
 *
 * @param skill             Skill the prompts are generated for.
 * @param generatorModels   Models to ask, in order.
 * @param count             Number of prompts expected for each type.
 * @param customPromptsPath Optional path to a JSON file with the prompts.
 * @param verbose           When truthy, the chosen prompts are listed on stderr.
 * @returns The prompts from the first source that succeeds.
 */
export async function generateTestPrompts(skill, generatorModels, count, customPromptsPath, verbose) {
    if (customPromptsPath) {
        const fileContents = await readFile(customPromptsPath, 'utf-8');
        const customPrompts = validatePrompts(JSON.parse(fileContents), count);
        if (verbose) {
            logPrompts(customPrompts, 'custom');
        }
        return customPrompts;
    }
    const maxAttempts = 3;
    for (const [modelIdx, model] of generatorModels.entries()) {
        for (let attempt = 1; attempt <= maxAttempts; attempt++) {
            try {
                // From the second attempt on, remind the model to return JSON only.
                let request = GENERATION_PROMPT(skill, count);
                if (attempt > 1) {
                    request = `${request}\n\nYour previous response was invalid JSON. Return ONLY valid JSON.`;
                }
                const { text } = await generateText({
                    model,
                    system: 'You generate test prompts for evaluating AI skill detection. Respond ONLY with a JSON array. No markdown, no explanation.',
                    prompt: request,
                    temperature: 0.8,
                });
                const generated = validatePrompts(JSON.parse(stripCodeFences(text)), count);
                if (verbose) {
                    logPrompts(generated, 'generated');
                }
                return generated;
            }
            catch (err) {
                const reason = err instanceof Error ? err.message : String(err);
                process.stderr.write(`Warning: Model ${modelIdx + 1}/${generatorModels.length}, attempt ${attempt}/3 failed: ${reason}\n`);
                // No pause after a model's final attempt; move straight on.
                if (attempt < maxAttempts) {
                    await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS));
                }
            }
        }
    }
    process.stderr.write('Warning: All generator models failed, using fallback prompts.\n');
    const canned = fallbackPrompts(skill, count);
    if (verbose) {
        logPrompts(canned, 'fallback');
    }
    return canned;
}
|
|
109
|
+
//# sourceMappingURL=test-generator.js.map
|
package/package.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@alexanderzzlatkov/skilleval",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Evaluate how well AI models understand Agent Skills (SKILL.md files)",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"ai",
|
|
7
|
+
"agent",
|
|
8
|
+
"skills",
|
|
9
|
+
"eval",
|
|
10
|
+
"llm",
|
|
11
|
+
"claude-code",
|
|
12
|
+
"openrouter",
|
|
13
|
+
"skilleval"
|
|
14
|
+
],
|
|
15
|
+
"homepage": "https://github.com/zlatkov/skilleval#readme",
|
|
16
|
+
"bugs": {
|
|
17
|
+
"url": "https://github.com/zlatkov/skilleval/issues"
|
|
18
|
+
},
|
|
19
|
+
"repository": {
|
|
20
|
+
"type": "git",
|
|
21
|
+
"url": "git+https://github.com/zlatkov/skilleval.git"
|
|
22
|
+
},
|
|
23
|
+
"license": "MIT",
|
|
24
|
+
"author": "Alexander Zlatkov (https://github.com/zlatkov)",
|
|
25
|
+
"type": "module",
|
|
26
|
+
"main": "./dist/index.js",
|
|
27
|
+
"types": "./dist/index.d.ts",
|
|
28
|
+
"bin": {
|
|
29
|
+
"skilleval": "dist/index.js"
|
|
30
|
+
},
|
|
31
|
+
"files": [
|
|
32
|
+
"dist/**/*.js",
|
|
33
|
+
"dist/**/*.d.ts"
|
|
34
|
+
],
|
|
35
|
+
"scripts": {
|
|
36
|
+
"build": "tsc",
|
|
37
|
+
"dev": "tsx src/index.ts",
|
|
38
|
+
"prepublishOnly": "npm run build"
|
|
39
|
+
},
|
|
40
|
+
"dependencies": {
|
|
41
|
+
"@ai-sdk/anthropic": "^1.0.0",
|
|
42
|
+
"@ai-sdk/google": "^1.0.0",
|
|
43
|
+
"@ai-sdk/openai": "^1.0.0",
|
|
44
|
+
"ai": "^4.0.0",
|
|
45
|
+
"chalk": "^5.4.0",
|
|
46
|
+
"commander": "^13.0.0",
|
|
47
|
+
"dotenv": "^17.3.1",
|
|
48
|
+
"gray-matter": "^4.0.3"
|
|
49
|
+
},
|
|
50
|
+
"devDependencies": {
|
|
51
|
+
"@types/node": "^22.0.0",
|
|
52
|
+
"tsx": "^4.0.0",
|
|
53
|
+
"typescript": "^5.7.0"
|
|
54
|
+
},
|
|
55
|
+
"engines": {
|
|
56
|
+
"node": ">=18"
|
|
57
|
+
}
|
|
58
|
+
}
|