@skilljack/evals 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +327 -0
  3. package/action/action.yml +72 -0
  4. package/action/index.ts +78 -0
  5. package/dist/action/index.d.ts +8 -0
  6. package/dist/action/index.d.ts.map +1 -0
  7. package/dist/action/index.js +68 -0
  8. package/dist/action/index.js.map +1 -0
  9. package/dist/src/cli.d.ts +9 -0
  10. package/dist/src/cli.d.ts.map +1 -0
  11. package/dist/src/cli.js +264 -0
  12. package/dist/src/cli.js.map +1 -0
  13. package/dist/src/config.d.ts +52 -0
  14. package/dist/src/config.d.ts.map +1 -0
  15. package/dist/src/config.js +194 -0
  16. package/dist/src/config.js.map +1 -0
  17. package/dist/src/index.d.ts +24 -0
  18. package/dist/src/index.d.ts.map +1 -0
  19. package/dist/src/index.js +28 -0
  20. package/dist/src/index.js.map +1 -0
  21. package/dist/src/parser.d.ts +22 -0
  22. package/dist/src/parser.d.ts.map +1 -0
  23. package/dist/src/parser.js +205 -0
  24. package/dist/src/parser.js.map +1 -0
  25. package/dist/src/pipeline.d.ts +53 -0
  26. package/dist/src/pipeline.d.ts.map +1 -0
  27. package/dist/src/pipeline.js +185 -0
  28. package/dist/src/pipeline.js.map +1 -0
  29. package/dist/src/report/github-summary.d.ts +15 -0
  30. package/dist/src/report/github-summary.d.ts.map +1 -0
  31. package/dist/src/report/github-summary.js +77 -0
  32. package/dist/src/report/github-summary.js.map +1 -0
  33. package/dist/src/report/report.d.ts +23 -0
  34. package/dist/src/report/report.d.ts.map +1 -0
  35. package/dist/src/report/report.js +216 -0
  36. package/dist/src/report/report.js.map +1 -0
  37. package/dist/src/runner/runner.d.ts +29 -0
  38. package/dist/src/runner/runner.d.ts.map +1 -0
  39. package/dist/src/runner/runner.js +211 -0
  40. package/dist/src/runner/runner.js.map +1 -0
  41. package/dist/src/runner/security.d.ts +26 -0
  42. package/dist/src/runner/security.d.ts.map +1 -0
  43. package/dist/src/runner/security.js +34 -0
  44. package/dist/src/runner/security.js.map +1 -0
  45. package/dist/src/runner/skill-setup.d.ts +19 -0
  46. package/dist/src/runner/skill-setup.d.ts.map +1 -0
  47. package/dist/src/runner/skill-setup.js +72 -0
  48. package/dist/src/runner/skill-setup.js.map +1 -0
  49. package/dist/src/scorer/deterministic.d.ts +12 -0
  50. package/dist/src/scorer/deterministic.d.ts.map +1 -0
  51. package/dist/src/scorer/deterministic.js +149 -0
  52. package/dist/src/scorer/deterministic.js.map +1 -0
  53. package/dist/src/scorer/judge.d.ts +34 -0
  54. package/dist/src/scorer/judge.d.ts.map +1 -0
  55. package/dist/src/scorer/judge.js +226 -0
  56. package/dist/src/scorer/judge.js.map +1 -0
  57. package/dist/src/scorer/scorer.d.ts +25 -0
  58. package/dist/src/scorer/scorer.d.ts.map +1 -0
  59. package/dist/src/scorer/scorer.js +149 -0
  60. package/dist/src/scorer/scorer.js.map +1 -0
  61. package/dist/src/session/session-logger.d.ts +30 -0
  62. package/dist/src/session/session-logger.d.ts.map +1 -0
  63. package/dist/src/session/session-logger.js +157 -0
  64. package/dist/src/session/session-logger.js.map +1 -0
  65. package/dist/src/types.d.ts +227 -0
  66. package/dist/src/types.d.ts.map +1 -0
  67. package/dist/src/types.js +16 -0
  68. package/dist/src/types.js.map +1 -0
  69. package/package.json +44 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ola Hungerford
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,327 @@
1
+ # skilljack-evals
2
+
3
+ CLI for evaluating AI agent skills. Tests how well agents discover, load, and execute [Agent Skills](https://agentskills.io/home) — measuring discoverability, instruction adherence, and output quality.
4
+
5
+ Runs standalone or as a GitHub Action.
6
+
7
+ ## What are Agent Skills?
8
+
9
+ Agent Skills are a lightweight, open-source format for extending AI agent capabilities. Each skill is a folder containing a `SKILL.md` file with metadata and instructions that agents can discover and use. Learn more at [agentskills.io](https://agentskills.io/home).
10
+
11
+ ## Requirements
12
+
13
+ - Node.js >= 20.0.0
14
+ - Anthropic API key (or AWS credentials for Bedrock)
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ npm install
20
+ npm run build
21
+ ```
22
+
23
+ ## Quick Start
24
+
25
+ ```bash
26
+ # Run the example greeting evaluation
27
+ skilljack-evals run evals/example-greeting/tasks.yaml --verbose
28
+
29
+ # Deterministic scoring only (no LLM judge, free)
30
+ skilljack-evals run evals/example-greeting/tasks.yaml --no-judge
31
+
32
+ # Validate a task file without running
33
+ skilljack-evals validate evals/example-greeting/tasks.yaml
34
+ ```
35
+
36
+ ## Configuration
37
+
38
+ ### API Key
39
+
40
+ Set `ANTHROPIC_API_KEY` in your environment or a `.env` file (see `.env.example`).
41
+
42
+ ### Bedrock
43
+
44
+ Set these environment variables — the Agent SDK handles the rest:
45
+
46
+ ```bash
47
+ CLAUDE_CODE_USE_BEDROCK=1
48
+ AWS_REGION=us-west-2
49
+ AWS_PROFILE=your-profile
50
+ ```
51
+
52
+ ### Config File
53
+
54
+ Create an `eval.config.yaml` in your project root (all fields optional):
55
+
56
+ ```yaml
57
+ models:
58
+ agent: sonnet # EVAL_AGENT_MODEL
59
+ judge: haiku # EVAL_JUDGE_MODEL
60
+
61
+ scoring:
62
+ weights:
63
+ discovery: 0.3
64
+ adherence: 0.4
65
+ output: 0.3
66
+
67
+ thresholds:
68
+ discovery_rate: 0.8 # EVAL_DISCOVERY_THRESHOLD
69
+ avg_score: 4.0 # EVAL_SCORE_THRESHOLD
70
+
71
+ runner:
72
+ timeout_ms: 300000 # EVAL_TASK_TIMEOUT_MS
73
+ allowed_write_dirs:
74
+ - ./results/
75
+ - ./fixtures/
76
+
77
+ output:
78
+ dir: ./results # EVAL_OUTPUT_DIR
79
+ judge_truncation: 5000
80
+ report_truncation: 2000
81
+
82
+ ci:
83
+ exit_on_failure: true
84
+ github_summary: false
85
+ ```
86
+
87
+ **Precedence** (lowest to highest): YAML defaults → `eval.config.yaml` → environment variables (`EVAL_*`) → CLI flags.
88
+
89
+ ## CLI Commands
90
+
91
+ ### `run` — Full evaluation pipeline
92
+
93
+ Runs the agent against tasks, scores results, and generates reports.
94
+
95
+ ```bash
96
+ skilljack-evals run evals/greeting/tasks.yaml \
97
+ --model sonnet --judge-model haiku \
98
+ --timeout 300000 \
99
+ --tasks gr-001,gr-002 \
100
+ --threshold-discovery 0.8 --threshold-score 4.0 \
101
+ --output-dir ./results \
102
+ --github-summary --verbose
103
+ ```
104
+
105
+ ### `score` — Score existing results
106
+
107
+ ```bash
108
+ skilljack-evals score results.json --judge-model haiku
109
+ ```
110
+
111
+ ### `report` — Generate reports from scored results
112
+
113
+ ```bash
114
+ skilljack-evals report results.json -o report.md --json report.json
115
+ ```
116
+
117
+ ### `validate` — Check YAML syntax
118
+
119
+ ```bash
120
+ skilljack-evals validate evals/greeting/tasks.yaml
121
+ ```
122
+
123
+ ### `create-eval` — Generate task template
124
+
125
+ ```bash
126
+ skilljack-evals create-eval greeting -o evals/greeting/tasks.yaml -n 10
127
+ ```
128
+
129
+ ### `parse` — Parse YAML to JSON
130
+
131
+ ```bash
132
+ skilljack-evals parse evals/greeting/tasks.yaml
133
+ ```
134
+
135
+ ## Architecture
136
+
137
+ ```
138
+ YAML tasks → Config → Runner (Agent SDK) → Scorer (deterministic + LLM judge) → Report
139
+ ```
140
+
141
+ ### Pipeline
142
+
143
+ 1. **Parse** — Load and validate task definitions from YAML
144
+ 2. **Setup** — Copy skills to `.claude/skills/` in the working directory
145
+ 3. **Run** — Execute agent against each task via the Claude Agent SDK
146
+ 4. **Score** — Deterministic checks (free, fast) then optional LLM judge
147
+ 5. **Report** — Generate markdown + JSON reports, check pass/fail thresholds
148
+ 6. **Cleanup** — Remove copied skills
149
+
150
+ ### Scoring
151
+
152
+ Two scoring methods that can run independently or together:
153
+
154
+ **Deterministic** (free, fast):
155
+ - Checks tool calls for skill activation
156
+ - Searches output for expected marker strings
157
+ - Validates expected/forbidden tool usage
158
+ - Binary pass/fail
159
+
160
+ **LLM Judge** (richer, ~$0.001/task):
161
+ - Discovery (0 or 1) — Did the agent load the expected skill?
162
+ - Adherence (1-5) — How well did the agent follow skill instructions?
163
+ - Output Quality (1-5) — Does the output meet task requirements?
164
+ - Failure categorization
165
+
166
+ **Combined score**: `w_d * discovery + w_a * ((adherence-1)/4) + w_o * ((outputQuality-1)/4)`
167
+
168
+ ### Failure Categories
169
+
170
+ | Category | Meaning |
171
+ |----------|---------|
172
+ | `discovery_failure` | Agent didn't load the skill |
173
+ | `false_positive` | Agent loaded a skill it shouldn't have |
174
+ | `instruction_ambiguity` | Agent misinterpreted instructions |
175
+ | `missing_guidance` | Skill didn't cover the needed case |
176
+ | `agent_error` | Agent made a mistake despite guidance |
177
+ | `none` | No failure |
178
+
179
+ ## Task File Format
180
+
181
+ ```yaml
182
+ skill: greeting
183
+ version: "1.0"
184
+
185
+ defaults:
186
+ expected_skill_load: greeting
187
+ criteria:
188
+ discovery: { weight: 0.3 }
189
+ adherence: { weight: 0.4 }
190
+ output: { weight: 0.3 }
191
+
192
+ tasks:
193
+ - id: gr-001
194
+ prompt: "Hello! Please greet me using the greeting skill."
195
+
196
+ # Deterministic checks (optional, free)
197
+ deterministic:
198
+ expect_skill_activation: true
199
+ expect_marker: "GREETING_SUCCESS"
200
+ expect_tool_calls: []
201
+ expect_no_tool_calls: []
202
+
203
+ # LLM judge criteria (optional, costs API calls)
204
+ criteria:
205
+ discovery: { weight: 0.3, description: "Should load greeting skill" }
206
+ adherence: { weight: 0.4, description: "Should follow skill format" }
207
+ output: { weight: 0.3, description: "Greeting is friendly" }
208
+ golden_checklist:
209
+ - "Loaded the greeting skill"
210
+ - "Friendly tone"
211
+
212
+ # False positive test — skill should NOT activate
213
+ - id: gr-fp-001
214
+ prompt: "What are best practices for email greetings?"
215
+ expected_skill_load: none
216
+ deterministic:
217
+ expect_skill_activation: false
218
+ ```
219
+
220
+ Both `deterministic` and `criteria` blocks are optional. If both are present, the scorer runs both and merges results.
221
+
222
+ ## GitHub Action
223
+
224
+ ```yaml
225
+ - uses: olaservo/skilljack-evals@v1
226
+ with:
227
+ tasks: evals/commit/tasks.yaml
228
+ threshold-discovery: '0.8'
229
+ threshold-score: '4.0'
230
+ env:
231
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
232
+ ```
233
+
234
+ ### Inputs
235
+
236
+ | Input | Required | Default | Description |
237
+ |-------|----------|---------|-------------|
238
+ | `tasks` | Yes | — | Path to tasks YAML file |
239
+ | `model` | No | `sonnet` | Agent model |
240
+ | `judge-model` | No | `haiku` | LLM judge model |
241
+ | `config` | No | — | Path to eval.config.yaml |
242
+ | `threshold-discovery` | No | `0.8` | Minimum discovery rate (0-1) |
243
+ | `threshold-score` | No | `4.0` | Minimum average score (1-5) |
244
+ | `timeout` | No | `300000` | Per-task timeout (ms) |
245
+ | `tasks-filter` | No | — | Comma-separated task IDs |
246
+ | `skills-dir` | No | — | Path to skills directory |
247
+ | `no-judge` | No | `false` | Skip LLM judge |
248
+ | `no-deterministic` | No | `false` | Skip deterministic scoring |
249
+
250
+ ### Outputs
251
+
252
+ | Output | Description |
253
+ |--------|-------------|
254
+ | `passed` | Whether all thresholds were met |
255
+ | `discovery-rate` | Discovery rate achieved (0-1) |
256
+ | `avg-score` | Average weighted score |
257
+ | `report-path` | Path to markdown report |
258
+ | `json-path` | Path to JSON report |
259
+
260
+ The action writes a condensed summary to `$GITHUB_STEP_SUMMARY` and exits with code 1 if thresholds are not met.
261
+
262
+ ## Library Usage
263
+
264
+ ```typescript
265
+ import {
266
+ parseSkillEvaluation,
267
+ SkillJudge,
268
+ generateReport,
269
+ runPipeline,
270
+ scoreDeterministic,
271
+ loadConfig,
272
+ } from '@skilljack/evals';
273
+
274
+ // Full pipeline
275
+ const result = await runPipeline('evals/greeting/tasks.yaml', {
276
+ model: 'sonnet',
277
+ verbose: true,
278
+ });
279
+
280
+ // Or individual steps
281
+ const evaluation = await parseSkillEvaluation('path/to/tasks.yaml');
282
+ const judge = new SkillJudge({ model: 'haiku' });
283
+ const score = await judge.judgeResult(task, result);
284
+ const detScore = scoreDeterministic(task, result);
285
+ const report = generateReport(evaluation, results, scores);
286
+ ```
287
+
288
+ ## Development
289
+
290
+ ```bash
291
+ npm run dev # Run CLI in dev mode (tsx)
292
+ npm run build # Compile TypeScript
293
+ npm run typecheck # Type check without emitting
294
+ npm run start # Run compiled CLI
295
+ ```
296
+
297
+ ## Project Structure
298
+
299
+ ```
300
+ src/
301
+ cli.ts # CLI entry point (commander)
302
+ index.ts # Public API exports
303
+ types.ts # TypeScript interfaces
304
+ config.ts # Centralized config (file + env + CLI)
305
+ parser.ts # YAML parsing and validation
306
+ pipeline.ts # Full eval pipeline orchestrator
307
+ runner/
308
+ runner.ts # Agent SDK runner
309
+ skill-setup.ts # Skill file management
310
+ security.ts # Tool write restrictions
311
+ scorer/
312
+ scorer.ts # Score orchestrator (deterministic + judge)
313
+ deterministic.ts # Marker/tool-call checks
314
+ judge.ts # LLM-as-judge scoring
315
+ session/
316
+ session-logger.ts # Event capture and logging
317
+ report/
318
+ report.ts # Markdown + JSON report generation
319
+ github-summary.ts # GitHub Actions summary
320
+ action/
321
+ action.yml # GitHub Action metadata
322
+ index.ts # Action entry point
323
+ evals/
324
+ example-greeting/ # Example evaluation
325
+ tasks.yaml
326
+ skills/greeting/SKILL.md
327
+ ```
package/action/action.yml ADDED
@@ -0,0 +1,72 @@
1
+ name: 'Agent Skill Evaluation'
2
+ description: 'Run agent skill evaluations to test discoverability, adherence, and output quality'
3
+ branding:
4
+ icon: 'check-circle'
5
+ color: 'blue'
6
+
7
+ inputs:
8
+ tasks:
9
+ description: 'Path to tasks YAML file'
10
+ required: true
11
+ model:
12
+ description: 'Agent model for task execution'
13
+ required: false
14
+ default: 'sonnet'
15
+ judge-model:
16
+ description: 'Model for LLM-as-judge scoring'
17
+ required: false
18
+ default: 'haiku'
19
+ config:
20
+ description: 'Path to eval.config.yaml'
21
+ required: false
22
+ threshold-discovery:
23
+ description: 'Minimum discovery rate (0-1)'
24
+ required: false
25
+ default: '0.8'
26
+ threshold-score:
27
+ description: 'Minimum average score (1-5)'
28
+ required: false
29
+ default: '4.0'
30
+ timeout:
31
+ description: 'Per-task timeout in milliseconds'
32
+ required: false
33
+ default: '300000'
34
+ tasks-filter:
35
+ description: 'Comma-separated list of task IDs to run'
36
+ required: false
37
+ skills-dir:
38
+ description: 'Path to skills directory for local setup'
39
+ required: false
40
+ working-directory:
41
+ description: 'Working directory for agent execution'
42
+ required: false
43
+ default: '.'
44
+ no-judge:
45
+ description: 'Skip LLM judge scoring (deterministic only)'
46
+ required: false
47
+ default: 'false'
48
+ no-deterministic:
49
+ description: 'Skip deterministic scoring (LLM judge only)'
50
+ required: false
51
+ default: 'false'
52
+ anthropic-api-key:
53
+ description: 'Anthropic API key (or use ANTHROPIC_API_KEY env var)'
54
+ required: false
55
+
56
+ outputs:
57
+ passed:
58
+ description: 'Whether all thresholds were met (true/false)'
59
+ discovery-rate:
60
+ description: 'Discovery rate achieved (0-1)'
61
+ avg-score:
62
+ description: 'Average weighted score achieved'
63
+ report-path:
64
+ description: 'Path to generated markdown report'
65
+ json-path:
66
+ description: 'Path to generated JSON report'
67
+
68
+ runs:
69
+ using: 'node20'
70
+ main: '../dist/action/index.js'
71
+ # Note: this path is resolved relative to the directory that contains this action.yml.
72
+ # If action.yml is moved to the repo root to publish as a GitHub Action, the main path is: dist/action/index.js
package/action/index.ts ADDED
@@ -0,0 +1,78 @@
1
+ /**
2
+ * GitHub Action entry point for skill evaluation.
3
+ *
4
+ * Reads inputs from the action.yml, runs the evaluation pipeline,
5
+ * and sets outputs + job summary.
6
+ */
7
+
8
+ import * as core from '@actions/core';
9
+ import { runPipeline } from '../src/pipeline.js';
10
+ import type { EvalConfig } from '../src/config.js';
11
+
12
+ async function run(): Promise<void> { // Action entry point: read inputs, run the pipeline, publish outputs and summary
13
+ try {
14
+ // Read inputs; an empty optional input falls back to the value after `||`
15
+ const tasks = core.getInput('tasks', { required: true });
16
+ const model = core.getInput('model') || 'sonnet';
17
+ const judgeModel = core.getInput('judge-model') || 'haiku';
18
+ const configPath = core.getInput('config') || undefined;
19
+ const thresholdDiscovery = parseFloat(core.getInput('threshold-discovery') || '0.8'); // NOTE(review): a non-numeric value parses to NaN and is forwarded unchecked
20
+ const thresholdScore = parseFloat(core.getInput('threshold-score') || '4.0'); // NOTE(review): same NaN caveat as above
21
+ const timeout = parseInt(core.getInput('timeout') || '300000', 10); // milliseconds per the action.yml description; NaN is not rejected here
22
+ const tasksFilter = core.getInput('tasks-filter') || undefined;
23
+ const skillsDir = core.getInput('skills-dir') || undefined;
24
+ const cwd = core.getInput('working-directory') || process.cwd();
25
+ const noJudge = core.getInput('no-judge') === 'true'; // only the exact string 'true' enables the flag
26
+ const noDeterministic = core.getInput('no-deterministic') === 'true'; // only the exact string 'true' enables the flag
27
+
28
+ // Handle API key: the action input wins over the ANTHROPIC_API_KEY env var
29
+ const apiKey = core.getInput('anthropic-api-key') || process.env.ANTHROPIC_API_KEY;
30
+ if (apiKey) {
31
+ process.env.ANTHROPIC_API_KEY = apiKey; // expose the key through the environment for the rest of the run
32
+ core.setSecret(apiKey); // register the value so the runner masks it in logs
33
+ }
34
+
35
+ // Build config overrides from the inputs; githubSummary is always enabled here
36
+ const configOverrides: Partial<EvalConfig> = {
37
+ defaultAgentModel: model,
38
+ defaultJudgeModel: judgeModel,
39
+ discoveryThreshold: thresholdDiscovery,
40
+ scoreThreshold: thresholdScore,
41
+ taskTimeoutMs: timeout,
42
+ githubSummary: true,
43
+ };
44
+
45
+ // Run pipeline with the task file, overrides and scoring flags
46
+ const result = await runPipeline({
47
+ tasksFile: tasks,
48
+ configPath,
49
+ configOverrides,
50
+ cwd,
51
+ skillsDir,
52
+ taskFilter: tasksFilter,
53
+ noJudge,
54
+ noDeterministic,
55
+ });
56
+
57
+ // Set outputs as strings; a missing report/json path is emitted as ''
58
+ core.setOutput('passed', String(result.passed));
59
+ core.setOutput('discovery-rate', String(result.report.summary.discoveryAccuracy));
60
+ core.setOutput('avg-score', String(result.report.summary.avgWeightedScore));
61
+ core.setOutput('report-path', result.reportPath || '');
62
+ core.setOutput('json-path', result.jsonPath || '');
63
+
64
+ // Write job summary using the markdown produced by the pipeline
65
+ await core.summary.addRaw(result.markdownSummary).write();
66
+
67
+ // Set exit status: fail the step and list the reasons when the pipeline did not pass
68
+ if (!result.passed) {
69
+ core.setFailed(
70
+ `Evaluation below thresholds: ${result.failureReasons.join(', ')}`
71
+ );
72
+ }
73
+ } catch (error) { // any thrown error fails the step with its message
74
+ core.setFailed(error instanceof Error ? error.message : String(error));
75
+ }
76
+ }
77
+
78
+ run(); // invoked immediately when the module is loaded
package/dist/action/index.d.ts ADDED
@@ -0,0 +1,8 @@
1
+ /**
2
+ * GitHub Action entry point for skill evaluation.
3
+ *
4
+ * Reads inputs from the action.yml, runs the evaluation pipeline,
5
+ * and sets outputs + job summary.
6
+ */
7
+ export {};
8
+ //# sourceMappingURL=index.d.ts.map
package/dist/action/index.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../action/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG"}
package/dist/action/index.js ADDED
@@ -0,0 +1,68 @@
1
+ /**
2
+ * GitHub Action entry point for skill evaluation.
3
+ *
4
+ * Reads inputs from the action.yml, runs the evaluation pipeline,
5
+ * and sets outputs + job summary.
6
+ */
7
+ import * as core from '@actions/core';
8
+ import { runPipeline } from '../src/pipeline.js';
9
+ async function run() { // Action entry point: read inputs, run the pipeline, publish outputs and summary
10
+ try {
11
+ // Read inputs; an empty optional input falls back to the value after `||`
12
+ const tasks = core.getInput('tasks', { required: true });
13
+ const model = core.getInput('model') || 'sonnet';
14
+ const judgeModel = core.getInput('judge-model') || 'haiku';
15
+ const configPath = core.getInput('config') || undefined;
16
+ const thresholdDiscovery = parseFloat(core.getInput('threshold-discovery') || '0.8'); // NOTE(review): a non-numeric value parses to NaN and is forwarded unchecked
17
+ const thresholdScore = parseFloat(core.getInput('threshold-score') || '4.0'); // NOTE(review): same NaN caveat as above
18
+ const timeout = parseInt(core.getInput('timeout') || '300000', 10); // NaN is not rejected here
19
+ const tasksFilter = core.getInput('tasks-filter') || undefined;
20
+ const skillsDir = core.getInput('skills-dir') || undefined;
21
+ const cwd = core.getInput('working-directory') || process.cwd();
22
+ const noJudge = core.getInput('no-judge') === 'true'; // only the exact string 'true' enables the flag
23
+ const noDeterministic = core.getInput('no-deterministic') === 'true'; // only the exact string 'true' enables the flag
24
+ // Handle API key: the action input wins over the ANTHROPIC_API_KEY env var
25
+ const apiKey = core.getInput('anthropic-api-key') || process.env.ANTHROPIC_API_KEY;
26
+ if (apiKey) {
27
+ process.env.ANTHROPIC_API_KEY = apiKey; // expose the key through the environment for the rest of the run
28
+ core.setSecret(apiKey); // register the value so the runner masks it in logs
29
+ }
30
+ // Build config overrides from the inputs; githubSummary is always enabled here
31
+ const configOverrides = {
32
+ defaultAgentModel: model,
33
+ defaultJudgeModel: judgeModel,
34
+ discoveryThreshold: thresholdDiscovery,
35
+ scoreThreshold: thresholdScore,
36
+ taskTimeoutMs: timeout,
37
+ githubSummary: true,
38
+ };
39
+ // Run pipeline with the task file, overrides and scoring flags
40
+ const result = await runPipeline({
41
+ tasksFile: tasks,
42
+ configPath,
43
+ configOverrides,
44
+ cwd,
45
+ skillsDir,
46
+ taskFilter: tasksFilter,
47
+ noJudge,
48
+ noDeterministic,
49
+ });
50
+ // Set outputs as strings; a missing report/json path is emitted as ''
51
+ core.setOutput('passed', String(result.passed));
52
+ core.setOutput('discovery-rate', String(result.report.summary.discoveryAccuracy));
53
+ core.setOutput('avg-score', String(result.report.summary.avgWeightedScore));
54
+ core.setOutput('report-path', result.reportPath || '');
55
+ core.setOutput('json-path', result.jsonPath || '');
56
+ // Write job summary using the markdown produced by the pipeline
57
+ await core.summary.addRaw(result.markdownSummary).write();
58
+ // Set exit status: fail the step and list the reasons when the pipeline did not pass
59
+ if (!result.passed) {
60
+ core.setFailed(`Evaluation below thresholds: ${result.failureReasons.join(', ')}`);
61
+ }
62
+ }
63
+ catch (error) { // any thrown error fails the step with its message
64
+ core.setFailed(error instanceof Error ? error.message : String(error));
65
+ }
66
+ }
67
+ run(); // invoked immediately when the module is loaded
68
+ //# sourceMappingURL=index.js.map
package/dist/action/index.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../action/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,IAAI,MAAM,eAAe,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAGjD,KAAK,UAAU,GAAG;IAChB,IAAI,CAAC;QACH,cAAc;QACd,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,QAAQ,CAAC;QACjD,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,OAAO,CAAC;QAC3D,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,SAAS,CAAC;QACxD,MAAM,kBAAkB,GAAG,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,qBAAqB,CAAC,IAAI,KAAK,CAAC,CAAC;QACrF,MAAM,cAAc,GAAG,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,iBAAiB,CAAC,IAAI,KAAK,CAAC,CAAC;QAC7E,MAAM,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,EAAE,EAAE,CAAC,CAAC;QACnE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,SAAS,CAAC;QAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,IAAI,SAAS,CAAC;QAC3D,MAAM,GAAG,GAAG,IAAI,CAAC,QAAQ,CAAC,mBAAmB,CAAC,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;QAChE,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,MAAM,CAAC;QACrD,MAAM,eAAe,GAAG,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC,KAAK,MAAM,CAAC;QAErE,iBAAiB;QACjB,MAAM,MAAM,GAAG,IAAI,CAAC,QAAQ,CAAC,mBAAmB,CAAC,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;QACnF,IAAI,MAAM,EAAE,CAAC;YACX,OAAO,CAAC,GAAG,CAAC,iBAAiB,GAAG,MAAM,CAAC;YACvC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACzB,CAAC;QAED,yBAAyB;QACzB,MAAM,eAAe,GAAwB;YAC3C,iBAAiB,EAAE,KAAK;YACxB,iBAAiB,EAAE,UAAU;YAC7B,kBAAkB,EAAE,kBAAkB;YACtC,cAAc,EAAE,cAAc;YAC9B,aAAa,EAAE,OAAO;YACtB,aAAa,EAAE,IAAI;SACpB,CAAC;QAEF,eAAe;QACf,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC;YAC/B,SAAS,EAAE,KAAK;YAChB,UAAU;YACV,eAAe;YACf,GAAG;YACH,SAAS;YACT,UAAU,EAAE,WAAW;YACvB,OAAO;YACP,eAAe;SAChB,CAAC,CAAC;QAEH,cAAc;QACd,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;QAChD,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,iBAAiB,CAAC,CAAC,CAAC;QAClF,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC,CAAC;QAC5E,IA
AI,CAAC,SAAS,CAAC,aAAa,EAAE,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC;QACvD,IAAI,CAAC,SAAS,CAAC,WAAW,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;QAEnD,oBAAoB;QACpB,MAAM,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC,KAAK,EAAE,CAAC;QAE1D,kBAAkB;QAClB,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnB,IAAI,CAAC,SAAS,CACZ,gCAAgC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACnE,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,CAAC,SAAS,CAAC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;IACzE,CAAC;AACH,CAAC;AAED,GAAG,EAAE,CAAC"}
package/dist/src/cli.d.ts ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * CLI for skill evaluation framework.
4
+ *
5
+ * Primary command: `skilljack-evals run` — runs the full evaluation pipeline.
6
+ * Also supports: score, report, create-eval, validate, parse.
7
+ */
8
+ import 'dotenv/config';
9
+ //# sourceMappingURL=cli.d.ts.map
package/dist/src/cli.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,eAAe,CAAC"}