guild-agents 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -0
- package/bin/guild.js +73 -0
- package/package.json +5 -2
- package/src/commands/eval.js +225 -0
- package/src/commands/stats.js +147 -0
- package/src/commands/workspace.js +38 -1
- package/src/templates/skills/build-feature/evals/evals.json +53 -0
- package/src/templates/skills/build-feature/evals/triggers.json +16 -0
- package/src/templates/skills/council/SKILL.md +27 -6
- package/src/templates/skills/council/evals/evals.json +41 -0
- package/src/templates/skills/council/evals/triggers.json +16 -0
- package/src/templates/skills/create-pr/evals/evals.json +44 -0
- package/src/templates/skills/create-pr/evals/triggers.json +16 -0
- package/src/templates/skills/debug/SKILL.md +1 -1
- package/src/templates/skills/debug/evals/triggers.json +16 -0
- package/src/templates/skills/dev-flow/evals/evals.json +36 -0
- package/src/templates/skills/dev-flow/evals/triggers.json +16 -0
- package/src/templates/skills/guild-specialize/evals/evals.json +54 -0
- package/src/templates/skills/guild-specialize/evals/triggers.json +16 -0
- package/src/templates/skills/new-feature/evals/evals.json +41 -0
- package/src/templates/skills/new-feature/evals/triggers.json +16 -0
- package/src/templates/skills/qa-cycle/evals/evals.json +46 -0
- package/src/templates/skills/qa-cycle/evals/triggers.json +16 -0
- package/src/templates/skills/re-specialize/evals/evals.json +48 -0
- package/src/templates/skills/re-specialize/evals/triggers.json +16 -0
- package/src/templates/skills/review/evals/evals.json +43 -0
- package/src/templates/skills/review/evals/triggers.json +16 -0
- package/src/templates/skills/session-end/evals/evals.json +40 -0
- package/src/templates/skills/session-end/evals/triggers.json +16 -0
- package/src/templates/skills/session-start/evals/evals.json +50 -0
- package/src/templates/skills/session-start/evals/triggers.json +16 -0
- package/src/templates/skills/status/evals/evals.json +40 -0
- package/src/templates/skills/status/evals/triggers.json +16 -0
- package/src/templates/skills/tdd/evals/triggers.json +16 -0
- package/src/templates/skills/verify/evals/triggers.json +16 -0
- package/src/utils/accounting.js +139 -0
- package/src/utils/benchmark.js +128 -0
- package/src/utils/description-analyzer.js +92 -0
- package/src/utils/eval-runner.js +139 -0
- package/src/utils/pricing.js +28 -0
- package/src/utils/semantic-matcher.js +91 -0
- package/src/utils/trigger-matcher.js +64 -0
- package/src/utils/trigger-runner.js +132 -0
- package/src/utils/workspace.js +89 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "session-end",
|
|
3
|
+
"matcherType": "keyword",
|
|
4
|
+
"description": "Saves current state to SESSION.md",
|
|
5
|
+
"threshold": 0.3,
|
|
6
|
+
"tests": [
|
|
7
|
+
{ "prompt": "save the session state", "shouldTrigger": true },
|
|
8
|
+
{ "prompt": "end my session and save to SESSION.md", "shouldTrigger": true },
|
|
9
|
+
{ "prompt": "save current state to SESSION", "shouldTrigger": true },
|
|
10
|
+
{ "prompt": "I'm done for today, save my progress", "shouldTrigger": true, "keywordExpected": false },
|
|
11
|
+
{ "prompt": "start my session", "shouldTrigger": false },
|
|
12
|
+
{ "prompt": "create a pull request", "shouldTrigger": false },
|
|
13
|
+
{ "prompt": "review my code", "shouldTrigger": false },
|
|
14
|
+
{ "prompt": "debug this bug", "shouldTrigger": false }
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "session-start",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": "ss-has-core-steps",
|
|
6
|
+
"description": "Session start has load, detect, present, suggest, update steps",
|
|
7
|
+
"expectations": [
|
|
8
|
+
{ "text": "Has load-context step", "assertion": "step-exists:load-context" },
|
|
9
|
+
{ "text": "Has detect-resumable step", "assertion": "step-exists:detect-resumable" },
|
|
10
|
+
{ "text": "Has present-state step", "assertion": "step-exists:present-state" },
|
|
11
|
+
{ "text": "Has suggest-continuation step", "assertion": "step-exists:suggest-continuation" },
|
|
12
|
+
{ "text": "Has update-session step", "assertion": "step-exists:update-session" }
|
|
13
|
+
]
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"id": "ss-all-system",
|
|
17
|
+
"description": "All steps are system role",
|
|
18
|
+
"expectations": [
|
|
19
|
+
{ "text": "load-context is system", "assertion": "step-role:load-context:system" },
|
|
20
|
+
{ "text": "detect-resumable is system", "assertion": "step-role:detect-resumable:system" },
|
|
21
|
+
{ "text": "present-state is system", "assertion": "step-role:present-state:system" },
|
|
22
|
+
{ "text": "suggest-continuation is system", "assertion": "step-role:suggest-continuation:system" },
|
|
23
|
+
{ "text": "update-session is system", "assertion": "step-role:update-session:system" }
|
|
24
|
+
]
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"id": "ss-gates",
|
|
28
|
+
"description": "Gates at presentation, suggestion, and session update",
|
|
29
|
+
"expectations": [
|
|
30
|
+
{ "text": "present-state has gate", "assertion": "gate-exists:present-state" },
|
|
31
|
+
{ "text": "suggest-continuation has gate", "assertion": "gate-exists:suggest-continuation" },
|
|
32
|
+
{ "text": "update-session has gate", "assertion": "gate-exists:update-session" }
|
|
33
|
+
]
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
"id": "ss-detect-requires-session",
|
|
37
|
+
"description": "Detect-resumable requires session state",
|
|
38
|
+
"expectations": [
|
|
39
|
+
{ "text": "detect-resumable requires session-md", "assertion": "step-requires:detect-resumable:session-md" }
|
|
40
|
+
]
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"id": "ss-minimum-steps",
|
|
44
|
+
"description": "Has at least 5 steps",
|
|
45
|
+
"expectations": [
|
|
46
|
+
{ "text": "At least 5 steps", "assertion": "step-count:5" }
|
|
47
|
+
]
|
|
48
|
+
}
|
|
49
|
+
]
|
|
50
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "session-start",
|
|
3
|
+
"matcherType": "keyword",
|
|
4
|
+
"description": "Loads context and resumes work from SESSION.md",
|
|
5
|
+
"threshold": 0.3,
|
|
6
|
+
"tests": [
|
|
7
|
+
{ "prompt": "load my session context", "shouldTrigger": true },
|
|
8
|
+
{ "prompt": "resume work from SESSION.md", "shouldTrigger": true },
|
|
9
|
+
{ "prompt": "start session and load context", "shouldTrigger": true },
|
|
10
|
+
{ "prompt": "where did I leave off", "shouldTrigger": true, "keywordExpected": false },
|
|
11
|
+
{ "prompt": "save my progress", "shouldTrigger": false },
|
|
12
|
+
{ "prompt": "create a new feature", "shouldTrigger": false },
|
|
13
|
+
{ "prompt": "review my code", "shouldTrigger": false },
|
|
14
|
+
{ "prompt": "run the tests", "shouldTrigger": false }
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "status",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": "st-has-core-steps",
|
|
6
|
+
"description": "Status has read-state, scan-resources, present steps",
|
|
7
|
+
"expectations": [
|
|
8
|
+
{ "text": "Has read-state step", "assertion": "step-exists:read-state" },
|
|
9
|
+
{ "text": "Has scan-resources step", "assertion": "step-exists:scan-resources" },
|
|
10
|
+
{ "text": "Has present-status step", "assertion": "step-exists:present-status" }
|
|
11
|
+
]
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"id": "st-all-system",
|
|
15
|
+
"description": "All steps are system role",
|
|
16
|
+
"expectations": [
|
|
17
|
+
{ "text": "read-state is system", "assertion": "step-role:read-state:system" },
|
|
18
|
+
{ "text": "scan-resources is system", "assertion": "step-role:scan-resources:system" },
|
|
19
|
+
{ "text": "present-status is system", "assertion": "step-role:present-status:system" }
|
|
20
|
+
]
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"id": "st-presentation-gate",
|
|
24
|
+
"description": "Present-status has gate",
|
|
25
|
+
"expectations": [
|
|
26
|
+
{ "text": "present-status has gate", "assertion": "gate-exists:present-status" }
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": "st-dependencies",
|
|
31
|
+
"description": "Present-status requires project and session data",
|
|
32
|
+
"expectations": [
|
|
33
|
+
{ "text": "present-status requires project-md", "assertion": "step-requires:present-status:project-md" },
|
|
34
|
+
{ "text": "present-status requires session-md", "assertion": "step-requires:present-status:session-md" },
|
|
35
|
+
{ "text": "present-status requires agent-list", "assertion": "step-requires:present-status:agent-list" },
|
|
36
|
+
{ "text": "present-status requires skill-list", "assertion": "step-requires:present-status:skill-list" }
|
|
37
|
+
]
|
|
38
|
+
}
|
|
39
|
+
]
|
|
40
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "status",
|
|
3
|
+
"matcherType": "keyword",
|
|
4
|
+
"description": "Shows current project and session state",
|
|
5
|
+
"threshold": 0.3,
|
|
6
|
+
"tests": [
|
|
7
|
+
{ "prompt": "show the project status", "shouldTrigger": true },
|
|
8
|
+
{ "prompt": "show current session state", "shouldTrigger": true },
|
|
9
|
+
{ "prompt": "what is the current project state", "shouldTrigger": true },
|
|
10
|
+
{ "prompt": "how is the project going", "shouldTrigger": true },
|
|
11
|
+
{ "prompt": "create a pull request", "shouldTrigger": false },
|
|
12
|
+
{ "prompt": "review my code", "shouldTrigger": false },
|
|
13
|
+
{ "prompt": "debug this bug", "shouldTrigger": false },
|
|
14
|
+
{ "prompt": "save my session", "shouldTrigger": false }
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "tdd",
|
|
3
|
+
"matcherType": "keyword",
|
|
4
|
+
"description": "Discipline skill — TDD red-green-refactor cycle. Use when implementing any feature or bugfix, before writing implementation code.",
|
|
5
|
+
"threshold": 0.3,
|
|
6
|
+
"tests": [
|
|
7
|
+
{ "prompt": "use TDD for this implementation", "shouldTrigger": true },
|
|
8
|
+
{ "prompt": "red green refactor cycle", "shouldTrigger": true },
|
|
9
|
+
{ "prompt": "follow the TDD red-green-refactor discipline", "shouldTrigger": true },
|
|
10
|
+
{ "prompt": "implement this feature with TDD", "shouldTrigger": true },
|
|
11
|
+
{ "prompt": "create a pull request", "shouldTrigger": false },
|
|
12
|
+
{ "prompt": "review my code", "shouldTrigger": false },
|
|
13
|
+
{ "prompt": "save my session", "shouldTrigger": false },
|
|
14
|
+
{ "prompt": "show project status", "shouldTrigger": false }
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "verify",
|
|
3
|
+
"matcherType": "keyword",
|
|
4
|
+
"description": "Discipline skill — verification before completion. Use when about to claim work is complete, fixed, or passing, before committing or creating PRs.",
|
|
5
|
+
"threshold": 0.3,
|
|
6
|
+
"tests": [
|
|
7
|
+
{ "prompt": "verify before committing", "shouldTrigger": true },
|
|
8
|
+
{ "prompt": "run verification before completion", "shouldTrigger": true },
|
|
9
|
+
{ "prompt": "verify the work is complete", "shouldTrigger": true },
|
|
10
|
+
{ "prompt": "make sure everything passes before the PR", "shouldTrigger": true, "keywordExpected": false },
|
|
11
|
+
{ "prompt": "create a pull request", "shouldTrigger": false },
|
|
12
|
+
{ "prompt": "review my code", "shouldTrigger": false },
|
|
13
|
+
{ "prompt": "debug this bug", "shouldTrigger": false },
|
|
14
|
+
{ "prompt": "save my session", "shouldTrigger": false }
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* accounting.js — Token usage recording, persistence, and aggregation.
|
|
3
|
+
*
|
|
4
|
+
* Persists usage data to .claude/guild/usage.json.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
|
8
|
+
import { join, dirname } from 'path';
|
|
9
|
+
import { estimateCost } from './pricing.js';
|
|
10
|
+
|
|
11
|
+
// Usage ledger location; loadUsage/saveUsage join this onto the `root` they are given.
const USAGE_PATH = join('.claude', 'guild', 'usage.json');
|
|
12
|
+
|
|
13
|
+
/**
 * Builds a brand-new usage ledger with every counter zeroed.
 *
 * A fresh object (including fresh nested maps) is created on each call, so
 * callers may mutate the result freely.
 *
 * @returns {{ version: number, lastUpdated: string, entries: Array, totals: object }}
 */
export function emptyUsage() {
  const zeroedTotals = {
    totalTokens: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalCostUSD: 0,
    tokensByModel: {},
    tokensByTier: {},
    tokensByWorkflow: {},
    workflowCount: 0,
  };

  return {
    version: 1,
    lastUpdated: new Date().toISOString(),
    entries: [],
    totals: zeroedTotals,
  };
}
|
|
30
|
+
|
|
31
|
+
/**
 * Builds one usage-ledger entry for a single workflow step.
 *
 * The entry is stamped with the current time, carries the summed token count,
 * and stores the cost returned by `estimateCost` for the given model.
 *
 * @param {object} params
 * @param {string} params.workflow
 * @param {string} params.agent
 * @param {string} params.tier
 * @param {string} params.model
 * @param {number} params.inputTokens
 * @param {number} params.outputTokens
 * @returns {object} Entry ready to be appended to `usage.entries`
 */
export function createEntry({ workflow, agent, tier, model, inputTokens, outputTokens }) {
  return {
    timestamp: new Date().toISOString(),
    workflow,
    agent,
    tier,
    model,
    inputTokens,
    outputTokens,
    totalTokens: inputTokens + outputTokens,
    estimatedCostUSD: estimateCost(model, inputTokens, outputTokens),
  };
}
|
|
45
|
+
|
|
46
|
+
/**
 * Loads the usage ledger stored under `root`.
 *
 * Falls back to an empty ledger when the file is missing or cannot be read or
 * parsed. A file that parses but has the wrong shape (e.g. `null`, an array,
 * or an object without `entries`/`totals`) is normalised against the defaults
 * so callers can always rely on `usage.entries` being an array and
 * `usage.totals` carrying every counter.
 *
 * @param {string} root - Project root directory
 * @returns {object} Usage ledger in the shape produced by `emptyUsage()`
 */
export function loadUsage(root) {
  const filePath = join(root, USAGE_PATH);
  if (!existsSync(filePath)) return emptyUsage();

  let parsed;
  try {
    parsed = JSON.parse(readFileSync(filePath, 'utf8'));
  } catch {
    // Deliberate best-effort: an unreadable ledger must not block the caller.
    return emptyUsage();
  }

  const isRecord = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  const defaults = emptyUsage();
  if (!isRecord(parsed)) return defaults;

  // Defaults first so a well-formed file keeps its key order and values.
  return {
    ...defaults,
    ...parsed,
    entries: Array.isArray(parsed.entries) ? parsed.entries : defaults.entries,
    totals: {
      ...defaults.totals,
      ...(isRecord(parsed.totals) ? parsed.totals : {}),
    },
  };
}
|
|
55
|
+
|
|
56
|
+
/**
 * Writes the usage ledger to disk under `root`, creating parent directories
 * as needed.
 *
 * Note: `usage.lastUpdated` is overwritten on the passed-in object before
 * serialising, so the caller's object is mutated.
 *
 * @param {string} root - Project root directory
 * @param {object} usage - Ledger to persist
 */
export function saveUsage(root, usage) {
  const target = join(root, USAGE_PATH);
  const parentDir = dirname(target);

  mkdirSync(parentDir, { recursive: true });
  usage.lastUpdated = new Date().toISOString();

  const serialized = JSON.stringify(usage, null, 2);
  writeFileSync(target, serialized + '\n');
}
|
|
62
|
+
|
|
63
|
+
/**
 * Folds one ledger entry into a running totals object (mutated in place).
 *
 * Scalar counters are incremented, the entry's token count is added to the
 * per-model / per-tier / per-workflow buckets, and `workflowCount` goes up by
 * one for every entry folded in.
 *
 * @param {object} totals - Totals object in the shape used by `emptyUsage().totals`
 * @param {object} entry - Ledger entry as produced by `createEntry`
 */
function updateTotals(totals, entry) {
  const { totalTokens, inputTokens, outputTokens, estimatedCostUSD } = entry;

  totals.totalTokens += totalTokens;
  totals.totalInputTokens += inputTokens;
  totals.totalOutputTokens += outputTokens;
  totals.totalCostUSD += estimatedCostUSD;

  // Each bucket is keyed by a different attribute of the same entry.
  const bucketKeys = [
    ['tokensByModel', entry.model],
    ['tokensByTier', entry.tier],
    ['tokensByWorkflow', entry.workflow],
  ];
  for (const [bucketName, key] of bucketKeys) {
    const bucket = totals[bucketName];
    bucket[key] = (bucket[key] || 0) + totalTokens;
  }

  totals.workflowCount += 1;
}
|
|
73
|
+
|
|
74
|
+
/**
 * Records one workflow step: loads the ledger, appends a new entry built from
 * `params`, updates the running totals, and writes the ledger back to disk.
 *
 * @param {string} root - Project root directory
 * @param {object} params - Entry fields, forwarded unchanged to `createEntry`
 */
export function recordStep(root, params) {
  const ledger = loadUsage(root);
  const step = createEntry(params);

  ledger.entries.push(step);
  updateTotals(ledger.totals, step);

  saveUsage(root, ledger);
}
|
|
81
|
+
|
|
82
|
+
// Maps a profile name to the model assigned to each tier
// (reasoning / execution / routine). Consumed by estimateWithProfile.
const PROFILES = {
  max: {
    reasoning: 'claude-opus-4-6',
    execution: 'claude-sonnet-4-5',
    routine: 'claude-haiku-4-5',
  },
  pro: {
    reasoning: 'claude-sonnet-4-5',
    execution: 'claude-sonnet-4-5',
    routine: 'claude-haiku-4-5',
  },
  'all-opus': {
    reasoning: 'claude-opus-4-6',
    execution: 'claude-opus-4-6',
    routine: 'claude-opus-4-6',
  },
};
|
|
87
|
+
|
|
88
|
+
/**
 * Computes totals over the ledger entries that fall inside a time window.
 *
 * Supported periods:
 *   - 'today' — since local midnight
 *   - 'week'  — the last 7 days
 *   - 'month' — the last 30 days
 * Any other value (including undefined) covers every entry.
 *
 * @param {string} root - Project root directory
 * @param {string} [period] - Window selector described above
 * @returns {object} Totals in the same shape as `emptyUsage().totals`
 */
export function aggregate(root, period) {
  const usage = loadUsage(root);
  const now = new Date();

  // Returns a copy of `now` shifted back by the given number of days.
  const daysBack = (days) => {
    const shifted = new Date(now);
    shifted.setDate(shifted.getDate() - days);
    return shifted;
  };

  let cutoff;
  if (period === 'today') {
    cutoff = new Date(now.getFullYear(), now.getMonth(), now.getDate());
  } else if (period === 'week') {
    cutoff = daysBack(7);
  } else if (period === 'month') {
    cutoff = daysBack(30);
  } else {
    cutoff = new Date(0);
  }

  const totals = {
    totalTokens: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalCostUSD: 0,
    tokensByModel: {},
    tokensByTier: {},
    tokensByWorkflow: {},
    workflowCount: 0,
  };

  usage.entries
    .filter((entry) => new Date(entry.timestamp) >= cutoff)
    .forEach((entry) => updateTotals(totals, entry));

  return totals;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Estimates what the given entries would cost under a named model profile.
 *
 * Each entry is priced with the model the profile assigns to the entry's tier;
 * when the profile has no model for that tier the entry's own model is used.
 *
 * @param {Iterable<object>} entries - Ledger entries
 * @param {string} profileName - Key into PROFILES
 * @returns {number} Summed cost, or 0 when the profile is unknown
 */
export function estimateWithProfile(entries, profileName) {
  const profile = PROFILES[profileName];
  if (!profile) return 0;

  let total = 0;
  for (const { tier, model, inputTokens, outputTokens } of entries) {
    const pricedModel = profile[tier] || model;
    total += estimateCost(pricedModel, inputTokens, outputTokens);
  }
  return total;
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* benchmark.js — Records, reports, and detects regressions in eval benchmarks.
|
|
3
|
+
*
|
|
4
|
+
* Persists results to benchmarks/benchmark.json with 30-entry rotation.
|
|
5
|
+
* Generates benchmarks/benchmark.md as a human-readable report.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
|
|
9
|
+
import { dirname } from 'path';
|
|
10
|
+
|
|
11
|
+
// Maximum number of benchmark runs kept in benchmark.json; older runs are dropped.
const MAX_ENTRIES = 30;

/**
 * Appends a benchmark entry to the JSON history file, keeping only the most
 * recent MAX_ENTRIES entries.
 *
 * The parent directory is created when missing. If the existing file holds
 * invalid JSON or something other than an array, the history is restarted from
 * scratch rather than aborting the benchmark run. Other read errors (e.g.
 * permission problems) still propagate.
 *
 * @param {object} entry - Benchmark entry with timestamp, matcher, skills, aggregate
 * @param {string} filePath - Path to benchmark.json
 */
export function recordBenchmark(entry, filePath) {
  // `recursive: true` is a no-op for an existing directory, so no pre-check is needed.
  mkdirSync(dirname(filePath), { recursive: true });

  let history = [];
  if (existsSync(filePath)) {
    try {
      const parsed = JSON.parse(readFileSync(filePath, 'utf8'));
      if (Array.isArray(parsed)) history = parsed;
    } catch (err) {
      // Only a corrupt file is tolerated; real I/O failures must surface.
      if (!(err instanceof SyntaxError)) throw err;
    }
  }

  history.push(entry);

  // A negative slice start keeps the newest MAX_ENTRIES (or everything if fewer).
  writeFileSync(filePath, JSON.stringify(history.slice(-MAX_ENTRIES), null, 2));
}
|
|
37
|
+
|
|
38
|
+
/**
 * Renders a benchmark entry as a markdown report.
 *
 * The report has a title, a summary line, one table row per skill, and an
 * aggregate section. When `previous` is supplied, accuracy deltas of at least
 * 0.1 percentage points are shown, and per-skill drops of more than 5 points
 * are flagged with " !!".
 *
 * @param {object} current - Current benchmark entry
 * @param {object|null} previous - Previous entry for delta comparison
 * @returns {string} Markdown report
 */
export function generateReport(current, previous) {
  const asPercent = (ratio) => `${(ratio * 100).toFixed(1)}%`;
  const asSignedPercent = (points) => `${points > 0 ? '+' : ''}${points.toFixed(1)}%`;

  const modelSuffix = current.model ? ` (${current.model})` : '';

  const head = [
    `# Eval Benchmark — ${current.timestamp}`,
    `Matcher: ${current.matcher}${modelSuffix} | Skills: ${current.skills.length} | Total tests: ${current.aggregate.total}`,
    '',
    '| Skill | Accuracy | Precision | Recall | Delta |',
    '|-------|----------|-----------|--------|-------|',
  ];

  const rows = current.skills.map((skill) => {
    const earlier = previous
      ? previous.skills.find((candidate) => candidate.name === skill.name)
      : undefined;

    let deltaCell = '—';
    if (earlier) {
      const points = (skill.accuracy - earlier.accuracy) * 100;
      if (Math.abs(points) >= 0.1) {
        deltaCell = asSignedPercent(points) + (points < -5 ? ' !!' : '');
      }
    }

    return `| ${skill.name} | ${asPercent(skill.accuracy)} | ${asPercent(skill.precision)} | ${asPercent(skill.recall)} | ${deltaCell} |`;
  });

  let aggregateDelta = '';
  if (previous) {
    const points = (current.aggregate.accuracy - previous.aggregate.accuracy) * 100;
    if (Math.abs(points) >= 0.1) {
      aggregateDelta = ` (Delta ${asSignedPercent(points)})`;
    }
  }

  const tail = [
    '',
    '## Aggregate',
    `Accuracy: ${asPercent(current.aggregate.accuracy)}${aggregateDelta}`,
    `Precision: ${asPercent(current.aggregate.precision)}`,
    `Recall: ${asPercent(current.aggregate.recall)}`,
    '',
  ];

  return [...head, ...rows, ...tail].join('\n');
}
|
|
92
|
+
|
|
93
|
+
/**
 * Finds skills whose accuracy regressed between two benchmark entries.
 *
 * A skill is reported when its accuracy fell by 0.05 or more AND the number
 * of correct results (tp + tn) changed by at least 2. Skills absent from
 * `previous` are ignored.
 *
 * @param {object} current
 * @param {object|null} previous
 * @returns {Array<{ skill: string, currentAccuracy: number, previousAccuracy: number, delta: number, flippedTests: number }>}
 */
export function detectRegressions(current, previous) {
  if (!previous) return [];

  const found = [];

  current.skills.forEach((skill) => {
    const earlier = previous.skills.find((candidate) => candidate.name === skill.name);
    if (!earlier) return;

    const delta = skill.accuracy - earlier.accuracy;
    if (delta > -0.05) return;

    // Net change in the number of correctly classified tests.
    const flippedTests = Math.abs((skill.tp + skill.tn) - (earlier.tp + earlier.tn));
    if (flippedTests < 2) return;

    found.push({
      skill: skill.name,
      currentAccuracy: skill.accuracy,
      previousAccuracy: earlier.accuracy,
      delta,
      flippedTests,
    });
  });

  return found;
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* description-analyzer.js — Analyzes keyword gaps in skill descriptions.
|
|
3
|
+
*
|
|
4
|
+
* Uses token analysis to identify which keywords are missing from
|
|
5
|
+
* skill descriptions based on failed trigger tests. No LLM required.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { tokenize } from './trigger-matcher.js';
|
|
9
|
+
|
|
10
|
+
// Common filler words that are ignored when comparing prompt tokens with
// description tokens.
const STOP_WORDS = new Set([
  // articles, prepositions, conjunctions
  'the', 'is', 'at', 'in', 'on', 'to', 'of', 'for', 'and', 'or', 'an',
  // short pronouns and particles
  'it', 'by', 'as', 'be', 'do', 'if', 'no', 'so', 'up', 'we', 'my',
  // auxiliaries and connectives
  'use', 'when', 'with', 'from', 'this', 'that', 'will', 'can', 'has',
  'not', 'are', 'was', 'but', 'all', 'any', 'its', 'you', 'your',
  // request phrasing
  'want', 'need', 'just', 'let', 'get', 'make', 'help', 'me',
]);
|
|
17
|
+
|
|
18
|
+
/**
 * Tells whether a prompt token is covered by the description.
 *
 * A token counts as covered when it equals a description token, or when
 * either one contains the other as a substring.
 *
 * @param {string} token - Prompt token
 * @param {string[]} descTokens - Tokens taken from the skill description
 * @returns {boolean}
 */
function tokenMatchesDescription(token, descTokens) {
  return descTokens.some(
    (candidate) =>
      candidate === token || candidate.includes(token) || token.includes(candidate),
  );
}
|
|
29
|
+
|
|
30
|
+
/**
 * Collects prompt words that the description does not cover, looking only at
 * trigger tests that were expected to fire but did not.
 *
 * Stop words are dropped from both sides before comparison. Repeated words are
 * kept as repeats in `missingKeywords`, so the list reflects how often each
 * word showed up across the failed prompts.
 *
 * @param {Array} triggerResults - Results from runTriggerTests
 * @param {string} description - Skill description
 * @returns {{ missingKeywords: string[], failedPrompts: string[] }}
 */
export function analyzeGaps(triggerResults, description) {
  const misses = triggerResults.filter((result) => result.expected && !result.actual);
  if (misses.length === 0) {
    return { missingKeywords: [], failedPrompts: [] };
  }

  const isMeaningful = (word) => !STOP_WORDS.has(word);
  const descTokens = tokenize(description).filter(isMeaningful);

  const failedPrompts = [];
  const missingKeywords = [];

  for (const { prompt } of misses) {
    failedPrompts.push(prompt);

    const uncovered = tokenize(prompt)
      .filter(isMeaningful)
      .filter((token) => !tokenMatchesDescription(token, descTokens));
    missingKeywords.push(...uncovered);
  }

  return { missingKeywords, failedPrompts };
}
|
|
60
|
+
|
|
61
|
+
/**
 * Turns gap-analysis results into ranked keyword suggestions.
 *
 * Skills with no missing keywords are left out. For the rest, each distinct
 * missing word is listed once, most frequent first; a word seen two or more
 * times is rated 'high', otherwise 'medium'.
 *
 * @param {Array<{ skill: string, currentDescription: string, missingKeywords: string[], failedPrompts: string[] }>} gapsList
 * @returns {Array<{ skill: string, currentDescription: string, suggestedKeywords: Array<{ word: string, confidence: string }> }>}
 */
export function generateSuggestions(gapsList) {
  const output = [];

  for (const { skill, currentDescription, missingKeywords } of gapsList) {
    if (missingKeywords.length === 0) continue;

    // Tally occurrences; Map keeps first-seen order, which breaks ties below.
    const tally = new Map();
    for (const word of missingKeywords) {
      const seen = tally.get(word) || 0;
      tally.set(word, seen + 1);
    }

    const ranked = Array.from(tally.entries());
    ranked.sort(([, countA], [, countB]) => countB - countA);

    const suggestedKeywords = ranked.map(([word, count]) => {
      const confidence = count >= 2 ? 'high' : 'medium';
      return { word, confidence };
    });

    output.push({ skill, currentDescription, suggestedKeywords });
  }

  return output;
}
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* eval-runner.js — Skill evaluation framework for Guild.
|
|
3
|
+
*
|
|
4
|
+
* Runs assertions against parsed skill workflows to verify
|
|
5
|
+
* structural correctness. Compatible with anthropics/skills eval format.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFileSync, existsSync } from 'fs';
|
|
9
|
+
import { join, dirname } from 'path';
|
|
10
|
+
import { fileURLToPath } from 'url';
|
|
11
|
+
import { parseSkill } from './workflow-parser.js';
|
|
12
|
+
|
|
13
|
+
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Skill templates sit in the sibling `templates/skills` folder one level up.
const TEMPLATES_DIR = join(dirname(__dirname), 'templates', 'skills');
|
|
15
|
+
|
|
16
|
+
/**
 * Evaluates a single assertion against a parsed workflow.
 *
 * Supported assertion forms:
 *   - step-exists:<id>
 *   - step-role:<id>:<role>
 *   - step-model-tier:<id>:<tier>
 *   - step-requires:<id>:<dependency>
 *   - step-parallel:<id>
 *   - gate-exists:<id>
 *   - step-count:<minimum>
 *
 * An assertion that lacks a required part (no colon, no expected value for a
 * two-part form, or a non-numeric step count) fails as "Malformed assertion"
 * instead of being compared against `undefined`/`NaN`.
 *
 * @param {object} workflow - Parsed workflow with { version, steps[] }
 * @param {string} assertion - Assertion string (e.g. "step-exists:evaluate")
 * @returns {{ passed: boolean, evidence: string }}
 */
export function evaluateAssertion(workflow, assertion) {
  const malformed = () => ({ passed: false, evidence: `Malformed assertion: "${assertion}"` });
  const notFound = (id) => ({ passed: false, evidence: `Step "${id}" not found` });
  const findStep = (id) => workflow.steps.find((s) => s.id === id);

  const colonIdx = assertion.indexOf(':');
  if (colonIdx === -1) return malformed();

  const type = assertion.slice(0, colonIdx);
  const args = assertion.slice(colonIdx + 1);

  switch (type) {
    case 'step-exists': {
      return findStep(args)
        ? { passed: true, evidence: `Step "${args}" found` }
        : { passed: false, evidence: `Step "${args}" not found in ${workflow.steps.map((s) => s.id).join(', ')}` };
    }

    case 'step-role': {
      const [stepId, expectedRole] = args.split(':');
      // Without an expected role, a step with no role at all would wrongly pass.
      if (!expectedRole) return malformed();
      const step = findStep(stepId);
      if (!step) return notFound(stepId);
      return step.role === expectedRole
        ? { passed: true, evidence: `Step "${stepId}" has role "${expectedRole}"` }
        : { passed: false, evidence: `Step "${stepId}" has role "${step.role}", expected "${expectedRole}"` };
    }

    case 'step-model-tier': {
      const [stepId, expectedTier] = args.split(':');
      // Same guard as step-role: a missing tier must not match a missing modelTier.
      if (!expectedTier) return malformed();
      const step = findStep(stepId);
      if (!step) return notFound(stepId);
      return step.modelTier === expectedTier
        ? { passed: true, evidence: `Step "${stepId}" uses tier "${expectedTier}"` }
        : { passed: false, evidence: `Step "${stepId}" uses tier "${step.modelTier}", expected "${expectedTier}"` };
    }

    case 'step-requires': {
      const [stepId, dep] = args.split(':');
      if (!dep) return malformed();
      const step = findStep(stepId);
      if (!step) return notFound(stepId);
      // A step without a requires list simply has no dependencies.
      const requires = Array.isArray(step.requires) ? step.requires : [];
      return requires.includes(dep)
        ? { passed: true, evidence: `Step "${stepId}" requires "${dep}"` }
        : { passed: false, evidence: `Step "${stepId}" requires [${requires.join(', ')}], missing "${dep}"` };
    }

    case 'step-parallel': {
      const step = findStep(args);
      if (!step) return notFound(args);
      return step.parallel && step.parallel.length > 0
        ? { passed: true, evidence: `Step "${args}" is parallel with [${step.parallel.join(', ')}]` }
        : { passed: false, evidence: `Step "${args}" has no parallel group` };
    }

    case 'gate-exists': {
      const step = findStep(args);
      if (!step) return notFound(args);
      return step.gate === true
        ? { passed: true, evidence: `Step "${args}" has gate: true` }
        : { passed: false, evidence: `Step "${args}" has gate: ${step.gate}` };
    }

    case 'step-count': {
      const min = Number.parseInt(args, 10);
      if (Number.isNaN(min)) return malformed();
      const actual = workflow.steps.length;
      return actual >= min
        ? { passed: true, evidence: `Workflow has ${actual} steps (minimum ${min})` }
        : { passed: false, evidence: `Workflow has ${actual} steps, expected at least ${min}` };
    }

    default:
      return { passed: false, evidence: `Unknown assertion type: "${type}"` };
  }
}
|
|
94
|
+
|
|
95
|
+
/**
 * Reads and parses the bundled `evals/evals.json` of a skill template.
 *
 * @param {string} skillName - Skill directory name (e.g. 'build-feature')
 * @returns {object|null} Parsed evals object, or null when the file does not exist
 */
export function loadEvals(skillName) {
  const evalsPath = join(TEMPLATES_DIR, skillName, 'evals', 'evals.json');

  return existsSync(evalsPath)
    ? JSON.parse(readFileSync(evalsPath, 'utf8'))
    : null;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Runs every eval case defined for a skill template.
 *
 * Loads the skill's evals.json, parses its SKILL.md, and checks each
 * expectation's assertion against the parsed workflow. An eval case passes
 * only when all of its expectations pass.
 *
 * @param {string} skillName - Skill directory name
 * @returns {{ skill: string, results: Array<{ id: string, description: string, passed: boolean, expectations: Array }> }}
 * @throws {Error} When the skill has no evals or no workflow definition
 */
export function runEvals(skillName) {
  const evals = loadEvals(skillName);
  if (!evals) throw new Error(`No evals found for skill "${skillName}"`);

  const skillMarkdown = readFileSync(join(TEMPLATES_DIR, skillName, 'SKILL.md'), 'utf8');
  const skill = parseSkill(skillMarkdown);
  if (!skill.workflow) {
    throw new Error(`Skill "${skillName}" has no workflow definition`);
  }

  // Checks one expectation and attaches the verdict to its text/assertion.
  const check = ({ text, assertion }) => ({
    text,
    assertion,
    ...evaluateAssertion(skill.workflow, assertion),
  });

  const results = evals.evals.map(({ id, description, expectations: declared }) => {
    const expectations = declared.map((expectation) => check(expectation));
    return {
      id,
      description,
      passed: expectations.every((outcome) => outcome.passed),
      expectations,
    };
  });

  return { skill: skillName, results };
}
|