@skilljack/evals 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +327 -0
- package/action/action.yml +72 -0
- package/action/index.ts +78 -0
- package/dist/action/index.d.ts +8 -0
- package/dist/action/index.d.ts.map +1 -0
- package/dist/action/index.js +68 -0
- package/dist/action/index.js.map +1 -0
- package/dist/src/cli.d.ts +9 -0
- package/dist/src/cli.d.ts.map +1 -0
- package/dist/src/cli.js +264 -0
- package/dist/src/cli.js.map +1 -0
- package/dist/src/config.d.ts +52 -0
- package/dist/src/config.d.ts.map +1 -0
- package/dist/src/config.js +194 -0
- package/dist/src/config.js.map +1 -0
- package/dist/src/index.d.ts +24 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +28 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/parser.d.ts +22 -0
- package/dist/src/parser.d.ts.map +1 -0
- package/dist/src/parser.js +205 -0
- package/dist/src/parser.js.map +1 -0
- package/dist/src/pipeline.d.ts +53 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/pipeline.js +185 -0
- package/dist/src/pipeline.js.map +1 -0
- package/dist/src/report/github-summary.d.ts +15 -0
- package/dist/src/report/github-summary.d.ts.map +1 -0
- package/dist/src/report/github-summary.js +77 -0
- package/dist/src/report/github-summary.js.map +1 -0
- package/dist/src/report/report.d.ts +23 -0
- package/dist/src/report/report.d.ts.map +1 -0
- package/dist/src/report/report.js +216 -0
- package/dist/src/report/report.js.map +1 -0
- package/dist/src/runner/runner.d.ts +29 -0
- package/dist/src/runner/runner.d.ts.map +1 -0
- package/dist/src/runner/runner.js +211 -0
- package/dist/src/runner/runner.js.map +1 -0
- package/dist/src/runner/security.d.ts +26 -0
- package/dist/src/runner/security.d.ts.map +1 -0
- package/dist/src/runner/security.js +34 -0
- package/dist/src/runner/security.js.map +1 -0
- package/dist/src/runner/skill-setup.d.ts +19 -0
- package/dist/src/runner/skill-setup.d.ts.map +1 -0
- package/dist/src/runner/skill-setup.js +72 -0
- package/dist/src/runner/skill-setup.js.map +1 -0
- package/dist/src/scorer/deterministic.d.ts +12 -0
- package/dist/src/scorer/deterministic.d.ts.map +1 -0
- package/dist/src/scorer/deterministic.js +149 -0
- package/dist/src/scorer/deterministic.js.map +1 -0
- package/dist/src/scorer/judge.d.ts +34 -0
- package/dist/src/scorer/judge.d.ts.map +1 -0
- package/dist/src/scorer/judge.js +226 -0
- package/dist/src/scorer/judge.js.map +1 -0
- package/dist/src/scorer/scorer.d.ts +25 -0
- package/dist/src/scorer/scorer.d.ts.map +1 -0
- package/dist/src/scorer/scorer.js +149 -0
- package/dist/src/scorer/scorer.js.map +1 -0
- package/dist/src/session/session-logger.d.ts +30 -0
- package/dist/src/session/session-logger.d.ts.map +1 -0
- package/dist/src/session/session-logger.js +157 -0
- package/dist/src/session/session-logger.js.map +1 -0
- package/dist/src/types.d.ts +227 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +16 -0
- package/dist/src/types.js.map +1 -0
- package/package.json +44 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ola Hungerford
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
# skilljack-evals
|
|
2
|
+
|
|
3
|
+
CLI for evaluating AI agent skills. Tests how well agents discover, load, and execute [Agent Skills](https://agentskills.io/home) — measuring discoverability, instruction adherence, and output quality.
|
|
4
|
+
|
|
5
|
+
Runs standalone or as a GitHub Action.
|
|
6
|
+
|
|
7
|
+
## What are Agent Skills?
|
|
8
|
+
|
|
9
|
+
Agent Skills are a lightweight, open-source format for extending AI agent capabilities. Each skill is a folder containing a `SKILL.md` file with metadata and instructions that agents can discover and use. Learn more at [agentskills.io](https://agentskills.io/home).
|
|
10
|
+
|
|
11
|
+
## Requirements
|
|
12
|
+
|
|
13
|
+
- Node.js >= 20.0.0
|
|
14
|
+
- Anthropic API key (or AWS credentials for Bedrock)
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
npm install
|
|
20
|
+
npm run build
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Run the example greeting evaluation
|
|
27
|
+
skilljack-evals run evals/example-greeting/tasks.yaml --verbose
|
|
28
|
+
|
|
29
|
+
# Deterministic scoring only (no LLM judge, free)
|
|
30
|
+
skilljack-evals run evals/example-greeting/tasks.yaml --no-judge
|
|
31
|
+
|
|
32
|
+
# Validate a task file without running
|
|
33
|
+
skilljack-evals validate evals/example-greeting/tasks.yaml
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Configuration
|
|
37
|
+
|
|
38
|
+
### API Key
|
|
39
|
+
|
|
40
|
+
Set `ANTHROPIC_API_KEY` in your environment or a `.env` file (see `.env.example`).
|
|
41
|
+
|
|
42
|
+
### Bedrock
|
|
43
|
+
|
|
44
|
+
Set these environment variables — the Agent SDK handles the rest:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
CLAUDE_CODE_USE_BEDROCK=1
|
|
48
|
+
AWS_REGION=us-west-2
|
|
49
|
+
AWS_PROFILE=your-profile
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Config File
|
|
53
|
+
|
|
54
|
+
Create an `eval.config.yaml` in your project root (all fields optional):
|
|
55
|
+
|
|
56
|
+
```yaml
|
|
57
|
+
models:
|
|
58
|
+
agent: sonnet # EVAL_AGENT_MODEL
|
|
59
|
+
judge: haiku # EVAL_JUDGE_MODEL
|
|
60
|
+
|
|
61
|
+
scoring:
|
|
62
|
+
weights:
|
|
63
|
+
discovery: 0.3
|
|
64
|
+
adherence: 0.4
|
|
65
|
+
output: 0.3
|
|
66
|
+
|
|
67
|
+
thresholds:
|
|
68
|
+
discovery_rate: 0.8 # EVAL_DISCOVERY_THRESHOLD
|
|
69
|
+
avg_score: 4.0 # EVAL_SCORE_THRESHOLD
|
|
70
|
+
|
|
71
|
+
runner:
|
|
72
|
+
timeout_ms: 300000 # EVAL_TASK_TIMEOUT_MS
|
|
73
|
+
allowed_write_dirs:
|
|
74
|
+
- ./results/
|
|
75
|
+
- ./fixtures/
|
|
76
|
+
|
|
77
|
+
output:
|
|
78
|
+
dir: ./results # EVAL_OUTPUT_DIR
|
|
79
|
+
judge_truncation: 5000
|
|
80
|
+
report_truncation: 2000
|
|
81
|
+
|
|
82
|
+
ci:
|
|
83
|
+
exit_on_failure: true
|
|
84
|
+
github_summary: false
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**Precedence** (lowest to highest): YAML defaults → `eval.config.yaml` → environment variables (`EVAL_*`) → CLI flags.
|
|
88
|
+
|
|
89
|
+
## CLI Commands
|
|
90
|
+
|
|
91
|
+
### `run` — Full evaluation pipeline
|
|
92
|
+
|
|
93
|
+
Runs the agent against tasks, scores results, and generates reports.
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
skilljack-evals run evals/greeting/tasks.yaml \
|
|
97
|
+
--model sonnet --judge-model haiku \
|
|
98
|
+
--timeout 300000 \
|
|
99
|
+
--tasks gr-001,gr-002 \
|
|
100
|
+
--threshold-discovery 0.8 --threshold-score 4.0 \
|
|
101
|
+
--output-dir ./results \
|
|
102
|
+
--github-summary --verbose
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### `score` — Score existing results
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
skilljack-evals score results.json --judge-model haiku
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### `report` — Generate reports from scored results
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
skilljack-evals report results.json -o report.md --json report.json
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### `validate` — Check YAML syntax
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
skilljack-evals validate evals/greeting/tasks.yaml
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### `create-eval` — Generate task template
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
skilljack-evals create-eval greeting -o evals/greeting/tasks.yaml -n 10
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### `parse` — Parse YAML to JSON
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
skilljack-evals parse evals/greeting/tasks.yaml
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Architecture
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
YAML tasks → Config → Runner (Agent SDK) → Scorer (deterministic + LLM judge) → Report
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Pipeline
|
|
142
|
+
|
|
143
|
+
1. **Parse** — Load and validate task definitions from YAML
|
|
144
|
+
2. **Setup** — Copy skills to `.claude/skills/` in the working directory
|
|
145
|
+
3. **Run** — Execute agent against each task via the Claude Agent SDK
|
|
146
|
+
4. **Score** — Deterministic checks (free, fast) then optional LLM judge
|
|
147
|
+
5. **Report** — Generate markdown + JSON reports, check pass/fail thresholds
|
|
148
|
+
6. **Cleanup** — Remove copied skills
|
|
149
|
+
|
|
150
|
+
### Scoring
|
|
151
|
+
|
|
152
|
+
Two scoring methods that can run independently or together:
|
|
153
|
+
|
|
154
|
+
**Deterministic** (free, fast):
|
|
155
|
+
- Checks tool calls for skill activation
|
|
156
|
+
- Searches output for expected marker strings
|
|
157
|
+
- Validates expected/forbidden tool usage
|
|
158
|
+
- Binary pass/fail
|
|
159
|
+
|
|
160
|
+
**LLM Judge** (richer, ~$0.001/task):
|
|
161
|
+
- Discovery (0 or 1) — Did the agent load the expected skill?
|
|
162
|
+
- Adherence (1-5) — How well did the agent follow skill instructions?
|
|
163
|
+
- Output Quality (1-5) — Does the output meet task requirements?
|
|
164
|
+
- Failure categorization
|
|
165
|
+
|
|
166
|
+
**Combined score**: `w_d * discovery + w_a * ((adherence-1)/4) + w_o * ((outputQuality-1)/4)`
|
|
167
|
+
|
|
168
|
+
### Failure Categories
|
|
169
|
+
|
|
170
|
+
| Category | Meaning |
|
|
171
|
+
|----------|---------|
|
|
172
|
+
| `discovery_failure` | Agent didn't load the skill |
|
|
173
|
+
| `false_positive` | Agent loaded a skill it shouldn't have |
|
|
174
|
+
| `instruction_ambiguity` | Agent misinterpreted instructions |
|
|
175
|
+
| `missing_guidance` | Skill didn't cover the needed case |
|
|
176
|
+
| `agent_error` | Agent made a mistake despite guidance |
|
|
177
|
+
| `none` | No failure |
|
|
178
|
+
|
|
179
|
+
## Task File Format
|
|
180
|
+
|
|
181
|
+
```yaml
|
|
182
|
+
skill: greeting
|
|
183
|
+
version: "1.0"
|
|
184
|
+
|
|
185
|
+
defaults:
|
|
186
|
+
expected_skill_load: greeting
|
|
187
|
+
criteria:
|
|
188
|
+
discovery: { weight: 0.3 }
|
|
189
|
+
adherence: { weight: 0.4 }
|
|
190
|
+
output: { weight: 0.3 }
|
|
191
|
+
|
|
192
|
+
tasks:
|
|
193
|
+
- id: gr-001
|
|
194
|
+
prompt: "Hello! Please greet me using the greeting skill."
|
|
195
|
+
|
|
196
|
+
# Deterministic checks (optional, free)
|
|
197
|
+
deterministic:
|
|
198
|
+
expect_skill_activation: true
|
|
199
|
+
expect_marker: "GREETING_SUCCESS"
|
|
200
|
+
expect_tool_calls: []
|
|
201
|
+
expect_no_tool_calls: []
|
|
202
|
+
|
|
203
|
+
# LLM judge criteria (optional, costs API calls)
|
|
204
|
+
criteria:
|
|
205
|
+
discovery: { weight: 0.3, description: "Should load greeting skill" }
|
|
206
|
+
adherence: { weight: 0.4, description: "Should follow skill format" }
|
|
207
|
+
output: { weight: 0.3, description: "Greeting is friendly" }
|
|
208
|
+
golden_checklist:
|
|
209
|
+
- "Loaded the greeting skill"
|
|
210
|
+
- "Friendly tone"
|
|
211
|
+
|
|
212
|
+
# False positive test — skill should NOT activate
|
|
213
|
+
- id: gr-fp-001
|
|
214
|
+
prompt: "What are best practices for email greetings?"
|
|
215
|
+
expected_skill_load: none
|
|
216
|
+
deterministic:
|
|
217
|
+
expect_skill_activation: false
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Both `deterministic` and `criteria` blocks are optional. If both are present, the scorer runs both and merges results.
|
|
221
|
+
|
|
222
|
+
## GitHub Action
|
|
223
|
+
|
|
224
|
+
```yaml
|
|
225
|
+
- uses: olaservo/skilljack-evals@v1
|
|
226
|
+
with:
|
|
227
|
+
tasks: evals/commit/tasks.yaml
|
|
228
|
+
threshold-discovery: '0.8'
|
|
229
|
+
threshold-score: '4.0'
|
|
230
|
+
env:
|
|
231
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### Inputs
|
|
235
|
+
|
|
236
|
+
| Input | Required | Default | Description |
|
|
237
|
+
|-------|----------|---------|-------------|
|
|
238
|
+
| `tasks` | Yes | — | Path to tasks YAML file |
|
|
239
|
+
| `model` | No | `sonnet` | Agent model |
|
|
240
|
+
| `judge-model` | No | `haiku` | LLM judge model |
|
|
241
|
+
| `config` | No | — | Path to eval.config.yaml |
|
|
242
|
+
| `threshold-discovery` | No | `0.8` | Minimum discovery rate (0-1) |
|
|
243
|
+
| `threshold-score` | No | `4.0` | Minimum average score (1-5) |
|
|
244
|
+
| `timeout` | No | `300000` | Per-task timeout (ms) |
|
|
245
|
+
| `tasks-filter` | No | — | Comma-separated task IDs |
|
|
246
|
+
| `skills-dir` | No | — | Path to skills directory |
|
|
247
|
+
| `no-judge` | No | `false` | Skip LLM judge |
|
|
248
|
+
| `no-deterministic` | No | `false` | Skip deterministic scoring |
|
|
249
|
+
|
|
250
|
+
### Outputs
|
|
251
|
+
|
|
252
|
+
| Output | Description |
|
|
253
|
+
|--------|-------------|
|
|
254
|
+
| `passed` | Whether all thresholds were met |
|
|
255
|
+
| `discovery-rate` | Discovery rate achieved (0-1) |
|
|
256
|
+
| `avg-score` | Average weighted score |
|
|
257
|
+
| `report-path` | Path to markdown report |
|
|
258
|
+
| `json-path` | Path to JSON report |
|
|
259
|
+
|
|
260
|
+
The action writes a condensed summary to `$GITHUB_STEP_SUMMARY` and exits with code 1 if thresholds are not met.
|
|
261
|
+
|
|
262
|
+
## Library Usage
|
|
263
|
+
|
|
264
|
+
```typescript
|
|
265
|
+
import {
|
|
266
|
+
parseSkillEvaluation,
|
|
267
|
+
SkillJudge,
|
|
268
|
+
generateReport,
|
|
269
|
+
runPipeline,
|
|
270
|
+
scoreDeterministic,
|
|
271
|
+
loadConfig,
|
|
272
|
+
} from '@skilljack/evals';
|
|
273
|
+
|
|
274
|
+
// Full pipeline
|
|
275
|
+
const result = await runPipeline({ tasksFile: 'evals/greeting/tasks.yaml',
|
|
276
|
+
model: 'sonnet',
|
|
277
|
+
verbose: true,
|
|
278
|
+
});
|
|
279
|
+
|
|
280
|
+
// Or individual steps
|
|
281
|
+
const evaluation = await parseSkillEvaluation('path/to/tasks.yaml');
|
|
282
|
+
const judge = new SkillJudge({ model: 'haiku' });
|
|
283
|
+
const score = await judge.judgeResult(task, result);
|
|
284
|
+
const detScore = scoreDeterministic(task, result);
|
|
285
|
+
const report = generateReport(evaluation, results, scores);
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Development
|
|
289
|
+
|
|
290
|
+
```bash
|
|
291
|
+
npm run dev # Run CLI in dev mode (tsx)
|
|
292
|
+
npm run build # Compile TypeScript
|
|
293
|
+
npm run typecheck # Type check without emitting
|
|
294
|
+
npm run start # Run compiled CLI
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
## Project Structure
|
|
298
|
+
|
|
299
|
+
```
|
|
300
|
+
src/
|
|
301
|
+
cli.ts # CLI entry point (commander)
|
|
302
|
+
index.ts # Public API exports
|
|
303
|
+
types.ts # TypeScript interfaces
|
|
304
|
+
config.ts # Centralized config (file + env + CLI)
|
|
305
|
+
parser.ts # YAML parsing and validation
|
|
306
|
+
pipeline.ts # Full eval pipeline orchestrator
|
|
307
|
+
runner/
|
|
308
|
+
runner.ts # Agent SDK runner
|
|
309
|
+
skill-setup.ts # Skill file management
|
|
310
|
+
security.ts # Tool write restrictions
|
|
311
|
+
scorer/
|
|
312
|
+
scorer.ts # Score orchestrator (deterministic + judge)
|
|
313
|
+
deterministic.ts # Marker/tool-call checks
|
|
314
|
+
judge.ts # LLM-as-judge scoring
|
|
315
|
+
session/
|
|
316
|
+
session-logger.ts # Event capture and logging
|
|
317
|
+
report/
|
|
318
|
+
report.ts # Markdown + JSON report generation
|
|
319
|
+
github-summary.ts # GitHub Actions summary
|
|
320
|
+
action/
|
|
321
|
+
action.yml # GitHub Action metadata
|
|
322
|
+
index.ts # Action entry point
|
|
323
|
+
evals/
|
|
324
|
+
example-greeting/ # Example evaluation
|
|
325
|
+
tasks.yaml
|
|
326
|
+
skills/greeting/SKILL.md
|
|
327
|
+
```
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
name: 'Agent Skill Evaluation'
|
|
2
|
+
description: 'Run agent skill evaluations to test discoverability, adherence, and output quality'
|
|
3
|
+
branding:
|
|
4
|
+
icon: 'check-circle'
|
|
5
|
+
color: 'blue'
|
|
6
|
+
|
|
7
|
+
inputs:
|
|
8
|
+
tasks:
|
|
9
|
+
description: 'Path to tasks YAML file'
|
|
10
|
+
required: true
|
|
11
|
+
model:
|
|
12
|
+
description: 'Agent model for task execution'
|
|
13
|
+
required: false
|
|
14
|
+
default: 'sonnet'
|
|
15
|
+
judge-model:
|
|
16
|
+
description: 'Model for LLM-as-judge scoring'
|
|
17
|
+
required: false
|
|
18
|
+
default: 'haiku'
|
|
19
|
+
config:
|
|
20
|
+
description: 'Path to eval.config.yaml'
|
|
21
|
+
required: false
|
|
22
|
+
threshold-discovery:
|
|
23
|
+
description: 'Minimum discovery rate (0-1)'
|
|
24
|
+
required: false
|
|
25
|
+
default: '0.8'
|
|
26
|
+
threshold-score:
|
|
27
|
+
description: 'Minimum average score (1-5)'
|
|
28
|
+
required: false
|
|
29
|
+
default: '4.0'
|
|
30
|
+
timeout:
|
|
31
|
+
description: 'Per-task timeout in milliseconds'
|
|
32
|
+
required: false
|
|
33
|
+
default: '300000'
|
|
34
|
+
tasks-filter:
|
|
35
|
+
description: 'Comma-separated list of task IDs to run'
|
|
36
|
+
required: false
|
|
37
|
+
skills-dir:
|
|
38
|
+
description: 'Path to skills directory for local setup'
|
|
39
|
+
required: false
|
|
40
|
+
working-directory:
|
|
41
|
+
description: 'Working directory for agent execution'
|
|
42
|
+
required: false
|
|
43
|
+
default: '.'
|
|
44
|
+
no-judge:
|
|
45
|
+
description: 'Skip LLM judge scoring (deterministic only)'
|
|
46
|
+
required: false
|
|
47
|
+
default: 'false'
|
|
48
|
+
no-deterministic:
|
|
49
|
+
description: 'Skip deterministic scoring (LLM judge only)'
|
|
50
|
+
required: false
|
|
51
|
+
default: 'false'
|
|
52
|
+
anthropic-api-key:
|
|
53
|
+
description: 'Anthropic API key (or use ANTHROPIC_API_KEY env var)'
|
|
54
|
+
required: false
|
|
55
|
+
|
|
56
|
+
outputs:
|
|
57
|
+
passed:
|
|
58
|
+
description: 'Whether all thresholds were met (true/false)'
|
|
59
|
+
discovery-rate:
|
|
60
|
+
description: 'Discovery rate achieved (0-1)'
|
|
61
|
+
avg-score:
|
|
62
|
+
description: 'Average weighted score achieved'
|
|
63
|
+
report-path:
|
|
64
|
+
description: 'Path to generated markdown report'
|
|
65
|
+
json-path:
|
|
66
|
+
description: 'Path to generated JSON report'
|
|
67
|
+
|
|
68
|
+
runs:
|
|
69
|
+
using: 'node20'
|
|
70
|
+
main: '../dist/action/index.js'
|
|
71
|
+
# Note: `main` is resolved relative to the directory that contains this action.yml.
|
|
72
|
+
# If publishing to the GitHub Marketplace (action.yml at the repo root), the main path is: dist/action/index.js
|
package/action/index.ts
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Action entry point for skill evaluation.
|
|
3
|
+
*
|
|
4
|
+
* Reads inputs from the action.yml, runs the evaluation pipeline,
|
|
5
|
+
* and sets outputs + job summary.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import * as core from '@actions/core';
|
|
9
|
+
import { runPipeline } from '../src/pipeline.js';
|
|
10
|
+
import type { EvalConfig } from '../src/config.js';
|
|
11
|
+
|
|
12
|
+
/**
 * Runs the skill evaluation as a GitHub Action step.
 *
 * Reads the action inputs, forwards them to `runPipeline`, publishes the
 * result as step outputs and a job summary, and marks the step as failed when
 * the pipeline reports `passed === false` or when anything throws.
 *
 * @returns A promise that settles once outputs and summary have been written.
 *   Errors are reported through `core.setFailed` instead of being rethrown.
 */
async function run(): Promise<void> {
  try {
    // Read inputs. Optional inputs fall back to hard-coded defaults when empty.
    const tasks = core.getInput('tasks', { required: true });
    const model = core.getInput('model') || 'sonnet';
    const judgeModel = core.getInput('judge-model') || 'haiku';
    const configPath = core.getInput('config') || undefined;
    const thresholdDiscovery = Number.parseFloat(core.getInput('threshold-discovery') || '0.8');
    const thresholdScore = Number.parseFloat(core.getInput('threshold-score') || '4.0');
    const timeout = Number.parseInt(core.getInput('timeout') || '300000', 10);
    const tasksFilter = core.getInput('tasks-filter') || undefined;
    const skillsDir = core.getInput('skills-dir') || undefined;
    const cwd = core.getInput('working-directory') || process.cwd();
    const noJudge = core.getInput('no-judge') === 'true';
    const noDeterministic = core.getInput('no-deterministic') === 'true';

    // A non-numeric value parses to NaN, and every comparison against NaN is
    // false, so a typo in a threshold or timeout must be rejected here rather
    // than passed on. Throwing lands in the catch below, which fails the step.
    const numericInputs: Array<[string, number]> = [
      ['threshold-discovery', thresholdDiscovery],
      ['threshold-score', thresholdScore],
      ['timeout', timeout],
    ];
    for (const [inputName, value] of numericInputs) {
      if (Number.isNaN(value)) {
        throw new Error(`Input '${inputName}' must be a number`);
      }
    }

    // Handle API key: the explicit input wins over the environment variable.
    const apiKey = core.getInput('anthropic-api-key') || process.env.ANTHROPIC_API_KEY;
    if (apiKey) {
      process.env.ANTHROPIC_API_KEY = apiKey;
      // Register the key as a secret so the runner masks it in logs.
      core.setSecret(apiKey);
    }

    // Build config overrides
    const configOverrides: Partial<EvalConfig> = {
      defaultAgentModel: model,
      defaultJudgeModel: judgeModel,
      discoveryThreshold: thresholdDiscovery,
      scoreThreshold: thresholdScore,
      taskTimeoutMs: timeout,
      githubSummary: true,
    };

    // Run pipeline
    const result = await runPipeline({
      tasksFile: tasks,
      configPath,
      configOverrides,
      cwd,
      skillsDir,
      taskFilter: tasksFilter,
      noJudge,
      noDeterministic,
    });

    // Set outputs (stringified; missing report paths become '')
    core.setOutput('passed', String(result.passed));
    core.setOutput('discovery-rate', String(result.report.summary.discoveryAccuracy));
    core.setOutput('avg-score', String(result.report.summary.avgWeightedScore));
    core.setOutput('report-path', result.reportPath || '');
    core.setOutput('json-path', result.jsonPath || '');

    // Write job summary
    await core.summary.addRaw(result.markdownSummary).write();

    // Set exit status
    if (!result.passed) {
      core.setFailed(
        `Evaluation below thresholds: ${result.failureReasons.join(', ')}`
      );
    }
  } catch (error) {
    core.setFailed(error instanceof Error ? error.message : String(error));
  }
}
|
|
77
|
+
|
|
78
|
+
run();
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../action/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG"}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Action entry point for skill evaluation.
|
|
3
|
+
*
|
|
4
|
+
* Reads inputs from the action.yml, runs the evaluation pipeline,
|
|
5
|
+
* and sets outputs + job summary.
|
|
6
|
+
*/
|
|
7
|
+
import * as core from '@actions/core';
|
|
8
|
+
import { runPipeline } from '../src/pipeline.js';
|
|
9
|
+
/**
 * Runs the skill evaluation as a GitHub Action step.
 *
 * Reads the action inputs, forwards them to `runPipeline`, publishes the
 * result as step outputs and a job summary, and marks the step as failed when
 * the pipeline reports `passed === false` or when anything throws.
 *
 * @returns {Promise<void>} Settles once outputs and summary have been written.
 *   Errors are reported through `core.setFailed` instead of being rethrown.
 */
async function run() {
    try {
        // Read inputs. Optional inputs fall back to hard-coded defaults when empty.
        const tasks = core.getInput('tasks', { required: true });
        const model = core.getInput('model') || 'sonnet';
        const judgeModel = core.getInput('judge-model') || 'haiku';
        const configPath = core.getInput('config') || undefined;
        const thresholdDiscovery = Number.parseFloat(core.getInput('threshold-discovery') || '0.8');
        const thresholdScore = Number.parseFloat(core.getInput('threshold-score') || '4.0');
        const timeout = Number.parseInt(core.getInput('timeout') || '300000', 10);
        const tasksFilter = core.getInput('tasks-filter') || undefined;
        const skillsDir = core.getInput('skills-dir') || undefined;
        const cwd = core.getInput('working-directory') || process.cwd();
        const noJudge = core.getInput('no-judge') === 'true';
        const noDeterministic = core.getInput('no-deterministic') === 'true';
        // A non-numeric value parses to NaN, and every comparison against NaN is
        // false, so a typo in a threshold or timeout must be rejected here rather
        // than passed on. Throwing lands in the catch below, which fails the step.
        const numericInputs = [
            ['threshold-discovery', thresholdDiscovery],
            ['threshold-score', thresholdScore],
            ['timeout', timeout],
        ];
        for (const [inputName, value] of numericInputs) {
            if (Number.isNaN(value)) {
                throw new Error(`Input '${inputName}' must be a number`);
            }
        }
        // Handle API key: the explicit input wins over the environment variable.
        const apiKey = core.getInput('anthropic-api-key') || process.env.ANTHROPIC_API_KEY;
        if (apiKey) {
            process.env.ANTHROPIC_API_KEY = apiKey;
            // Register the key as a secret so the runner masks it in logs.
            core.setSecret(apiKey);
        }
        // Build config overrides
        const configOverrides = {
            defaultAgentModel: model,
            defaultJudgeModel: judgeModel,
            discoveryThreshold: thresholdDiscovery,
            scoreThreshold: thresholdScore,
            taskTimeoutMs: timeout,
            githubSummary: true,
        };
        // Run pipeline
        const result = await runPipeline({
            tasksFile: tasks,
            configPath,
            configOverrides,
            cwd,
            skillsDir,
            taskFilter: tasksFilter,
            noJudge,
            noDeterministic,
        });
        // Set outputs (stringified; missing report paths become '')
        core.setOutput('passed', String(result.passed));
        core.setOutput('discovery-rate', String(result.report.summary.discoveryAccuracy));
        core.setOutput('avg-score', String(result.report.summary.avgWeightedScore));
        core.setOutput('report-path', result.reportPath || '');
        core.setOutput('json-path', result.jsonPath || '');
        // Write job summary
        await core.summary.addRaw(result.markdownSummary).write();
        // Set exit status
        if (!result.passed) {
            core.setFailed(`Evaluation below thresholds: ${result.failureReasons.join(', ')}`);
        }
    }
    catch (error) {
        core.setFailed(error instanceof Error ? error.message : String(error));
    }
}
|
|
67
|
+
run();
|
|
68
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../action/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,IAAI,MAAM,eAAe,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAGjD,KAAK,UAAU,GAAG;IAChB,IAAI,CAAC;QACH,cAAc;QACd,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,QAAQ,CAAC;QACjD,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,OAAO,CAAC;QAC3D,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,SAAS,CAAC;QACxD,MAAM,kBAAkB,GAAG,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,qBAAqB,CAAC,IAAI,KAAK,CAAC,CAAC;QACrF,MAAM,cAAc,GAAG,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,iBAAiB,CAAC,IAAI,KAAK,CAAC,CAAC;QAC7E,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,EAAE,EAAE,CAAC,CAAC;QACnE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,SAAS,CAAC;QAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,IAAI,SAAS,CAAC;QAC3D,MAAM,GAAG,GAAG,IAAI,CAAC,QAAQ,CAAC,mBAAmB,CAAC,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;QAChE,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,MAAM,CAAC;QACrD,MAAM,eAAe,GAAG,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC,KAAK,MAAM,CAAC;QAErE,iBAAiB;QACjB,MAAM,MAAM,GAAG,IAAI,CAAC,QAAQ,CAAC,mBAAmB,CAAC,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACnF,IAAI,MAAM,EAAE,CAAC;YACX,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,MAAM,CAAC;YACvC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACzB,CAAC;QAED,yBAAyB;QACzB,MAAM,eAAe,GAAwB;YAC3C,iBAAiB,EAAE,KAAK;YACxB,iBAAiB,EAAE,UAAU;YAC7B,kBAAkB,EAAE,kBAAkB;YACtC,cAAc,EAAE,cAAc;YAC9B,aAAa,EAAE,OAAO;YACtB,aAAa,EAAE,IAAI;SACpB,CAAC;QAEF,eAAe;QACf,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC;YAC/B,SAAS,EAAE,KAAK;YAChB,UAAU;YACV,eAAe;YACf,GAAG;YACH,SAAS;YACT,UAAU,EAAE,WAAW;YACvB,OAAO;YACP,eAAe;SAChB,CAAC,CAAC;QAEH,cAAc;QACd,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;QAChD,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,iBAAiB,CAAC,CAAC,CAAC;QAClF,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC,CAAC;QAC5E,IAAI
,CAAC,SAAS,CAAC,aAAa,EAAE,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;QAEnD,oBAAoB;QACpB,MAAM,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,KAAK,EAAE,CAAC;QAE1D,kBAAkB;QAClB,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnB,IAAI,CAAC,SAAS,CACZ,gCAAgC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACnE,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,CAAC,SAAS,CAAC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;IACzE,CAAC;AACH,CAAC;AAED,GAAG,EAAE,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI for skill evaluation framework.
|
|
4
|
+
*
|
|
5
|
+
* Primary command: `skilljack-evals run` — runs the full evaluation pipeline.
|
|
6
|
+
* Also supports: score, report, create-eval, validate, parse.
|
|
7
|
+
*/
|
|
8
|
+
import 'dotenv/config';
|
|
9
|
+
//# sourceMappingURL=cli.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,eAAe,CAAC"}
|