@skilljack/evals 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +327 -0
- package/action/action.yml +72 -0
- package/action/index.ts +78 -0
- package/dist/action/index.d.ts +8 -0
- package/dist/action/index.d.ts.map +1 -0
- package/dist/action/index.js +68 -0
- package/dist/action/index.js.map +1 -0
- package/dist/src/cli.d.ts +9 -0
- package/dist/src/cli.d.ts.map +1 -0
- package/dist/src/cli.js +264 -0
- package/dist/src/cli.js.map +1 -0
- package/dist/src/config.d.ts +52 -0
- package/dist/src/config.d.ts.map +1 -0
- package/dist/src/config.js +194 -0
- package/dist/src/config.js.map +1 -0
- package/dist/src/index.d.ts +24 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +28 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/parser.d.ts +22 -0
- package/dist/src/parser.d.ts.map +1 -0
- package/dist/src/parser.js +205 -0
- package/dist/src/parser.js.map +1 -0
- package/dist/src/pipeline.d.ts +53 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/pipeline.js +185 -0
- package/dist/src/pipeline.js.map +1 -0
- package/dist/src/report/github-summary.d.ts +15 -0
- package/dist/src/report/github-summary.d.ts.map +1 -0
- package/dist/src/report/github-summary.js +77 -0
- package/dist/src/report/github-summary.js.map +1 -0
- package/dist/src/report/report.d.ts +23 -0
- package/dist/src/report/report.d.ts.map +1 -0
- package/dist/src/report/report.js +216 -0
- package/dist/src/report/report.js.map +1 -0
- package/dist/src/runner/runner.d.ts +29 -0
- package/dist/src/runner/runner.d.ts.map +1 -0
- package/dist/src/runner/runner.js +211 -0
- package/dist/src/runner/runner.js.map +1 -0
- package/dist/src/runner/security.d.ts +26 -0
- package/dist/src/runner/security.d.ts.map +1 -0
- package/dist/src/runner/security.js +34 -0
- package/dist/src/runner/security.js.map +1 -0
- package/dist/src/runner/skill-setup.d.ts +19 -0
- package/dist/src/runner/skill-setup.d.ts.map +1 -0
- package/dist/src/runner/skill-setup.js +72 -0
- package/dist/src/runner/skill-setup.js.map +1 -0
- package/dist/src/scorer/deterministic.d.ts +12 -0
- package/dist/src/scorer/deterministic.d.ts.map +1 -0
- package/dist/src/scorer/deterministic.js +149 -0
- package/dist/src/scorer/deterministic.js.map +1 -0
- package/dist/src/scorer/judge.d.ts +34 -0
- package/dist/src/scorer/judge.d.ts.map +1 -0
- package/dist/src/scorer/judge.js +226 -0
- package/dist/src/scorer/judge.js.map +1 -0
- package/dist/src/scorer/scorer.d.ts +25 -0
- package/dist/src/scorer/scorer.d.ts.map +1 -0
- package/dist/src/scorer/scorer.js +149 -0
- package/dist/src/scorer/scorer.js.map +1 -0
- package/dist/src/session/session-logger.d.ts +30 -0
- package/dist/src/session/session-logger.d.ts.map +1 -0
- package/dist/src/session/session-logger.js +157 -0
- package/dist/src/session/session-logger.js.map +1 -0
- package/dist/src/types.d.ts +227 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +16 -0
- package/dist/src/types.js.map +1 -0
- package/package.json +44 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YAML parser for skill evaluation task files.
|
|
3
|
+
*
|
|
4
|
+
* Supports enriched YAML schema with:
|
|
5
|
+
* - defaults block (shared criteria, expected_skill_load)
|
|
6
|
+
* - deterministic block (marker-based checks, tool call expectations)
|
|
7
|
+
* - fixture block (setup/teardown scripts per task)
|
|
8
|
+
*/
|
|
9
|
+
import yaml from 'js-yaml';
|
|
10
|
+
import * as fs from 'fs/promises';
|
|
11
|
+
// ============================================
|
|
12
|
+
// Parser
|
|
13
|
+
// ============================================
|
|
14
|
+
/**
|
|
15
|
+
* Parse a YAML evaluation file into a SkillEvaluation object.
|
|
16
|
+
*/
|
|
17
|
+
export async function parseEvalFile(filePath) {
|
|
18
|
+
const content = await fs.readFile(filePath, 'utf-8');
|
|
19
|
+
const raw = yaml.load(content);
|
|
20
|
+
if (!raw || !raw.skill) {
|
|
21
|
+
throw new Error(`Invalid evaluation file: missing 'skill' field`);
|
|
22
|
+
}
|
|
23
|
+
const defaults = raw.defaults ? parseDefaults(raw.defaults) : undefined;
|
|
24
|
+
const tasks = (raw.tasks || []).map((t) => parseTask(t, defaults));
|
|
25
|
+
return {
|
|
26
|
+
skillName: raw.skill,
|
|
27
|
+
version: raw.version,
|
|
28
|
+
defaults,
|
|
29
|
+
tasks,
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Parse the defaults block.
|
|
34
|
+
*/
|
|
35
|
+
function parseDefaults(raw) {
|
|
36
|
+
const defaults = {};
|
|
37
|
+
if (raw.expected_skill_load) {
|
|
38
|
+
defaults.expectedSkillLoad = raw.expected_skill_load;
|
|
39
|
+
}
|
|
40
|
+
if (raw.criteria) {
|
|
41
|
+
defaults.criteria = {};
|
|
42
|
+
for (const dim of ['discovery', 'adherence', 'output']) {
|
|
43
|
+
const rawCrit = raw.criteria[dim];
|
|
44
|
+
if (rawCrit) {
|
|
45
|
+
defaults.criteria[dim] = {
|
|
46
|
+
weight: rawCrit.weight,
|
|
47
|
+
description: rawCrit.description,
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
return defaults;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Parse a single task from raw YAML, merging with defaults.
|
|
56
|
+
*/
|
|
57
|
+
function parseTask(raw, defaults) {
|
|
58
|
+
// Merge expected skill load: task overrides defaults
|
|
59
|
+
const expectedSkillLoad = raw.expected_skill_load
|
|
60
|
+
?? defaults?.expectedSkillLoad
|
|
61
|
+
?? '';
|
|
62
|
+
// Merge criteria: task-level overrides default-level
|
|
63
|
+
const criteria = [];
|
|
64
|
+
const dimensions = ['discovery', 'adherence', 'output'];
|
|
65
|
+
for (const dim of dimensions) {
|
|
66
|
+
const taskCrit = raw.criteria?.[dim];
|
|
67
|
+
const defaultCrit = defaults?.criteria?.[dim];
|
|
68
|
+
if (taskCrit || defaultCrit) {
|
|
69
|
+
criteria.push({
|
|
70
|
+
dimension: dim,
|
|
71
|
+
weight: taskCrit?.weight ?? defaultCrit?.weight ?? 0.33,
|
|
72
|
+
description: taskCrit?.description ?? defaultCrit?.description ?? '',
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
// Parse deterministic block
|
|
77
|
+
let deterministic;
|
|
78
|
+
if (raw.deterministic) {
|
|
79
|
+
deterministic = {
|
|
80
|
+
expectSkillActivation: raw.deterministic.expect_skill_activation ?? true,
|
|
81
|
+
expectMarker: raw.deterministic.expect_marker,
|
|
82
|
+
expectToolCalls: raw.deterministic.expect_tool_calls,
|
|
83
|
+
expectNoToolCalls: raw.deterministic.expect_no_tool_calls,
|
|
84
|
+
};
|
|
85
|
+
}
|
|
86
|
+
// Parse fixture block
|
|
87
|
+
let fixture;
|
|
88
|
+
if (raw.fixture) {
|
|
89
|
+
fixture = {
|
|
90
|
+
state: raw.fixture.state ?? 'default',
|
|
91
|
+
setup: raw.fixture.setup,
|
|
92
|
+
teardown: raw.fixture.teardown,
|
|
93
|
+
};
|
|
94
|
+
}
|
|
95
|
+
return {
|
|
96
|
+
id: raw.id || '',
|
|
97
|
+
prompt: raw.prompt || '',
|
|
98
|
+
expectedSkillLoad,
|
|
99
|
+
criteria,
|
|
100
|
+
goldenChecklist: raw.golden_checklist || [],
|
|
101
|
+
deterministic,
|
|
102
|
+
fixture,
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
// ============================================
|
|
106
|
+
// Validation
|
|
107
|
+
// ============================================
|
|
108
|
+
/**
|
|
109
|
+
* Validate a YAML evaluation file and return any errors.
|
|
110
|
+
*/
|
|
111
|
+
export async function validateEvalFile(filePath) {
|
|
112
|
+
const errors = [];
|
|
113
|
+
let content;
|
|
114
|
+
try {
|
|
115
|
+
content = await fs.readFile(filePath, 'utf-8');
|
|
116
|
+
}
|
|
117
|
+
catch {
|
|
118
|
+
return [`Cannot read file: ${filePath}`];
|
|
119
|
+
}
|
|
120
|
+
let raw;
|
|
121
|
+
try {
|
|
122
|
+
raw = yaml.load(content);
|
|
123
|
+
}
|
|
124
|
+
catch (e) {
|
|
125
|
+
return [`Invalid YAML: ${e instanceof Error ? e.message : String(e)}`];
|
|
126
|
+
}
|
|
127
|
+
if (!raw) {
|
|
128
|
+
return ['File is empty'];
|
|
129
|
+
}
|
|
130
|
+
if (!raw.skill) {
|
|
131
|
+
errors.push("Missing required field: 'skill'");
|
|
132
|
+
}
|
|
133
|
+
if (!raw.tasks || !Array.isArray(raw.tasks)) {
|
|
134
|
+
errors.push("Missing or invalid 'tasks' array");
|
|
135
|
+
return errors;
|
|
136
|
+
}
|
|
137
|
+
const taskIds = new Set();
|
|
138
|
+
for (let i = 0; i < raw.tasks.length; i++) {
|
|
139
|
+
const task = raw.tasks[i];
|
|
140
|
+
const prefix = `tasks[${i}]`;
|
|
141
|
+
if (!task.id) {
|
|
142
|
+
errors.push(`${prefix}: Missing 'id'`);
|
|
143
|
+
}
|
|
144
|
+
else if (taskIds.has(task.id)) {
|
|
145
|
+
errors.push(`${prefix}: Duplicate task id '${task.id}'`);
|
|
146
|
+
}
|
|
147
|
+
else {
|
|
148
|
+
taskIds.add(task.id);
|
|
149
|
+
}
|
|
150
|
+
if (!task.prompt) {
|
|
151
|
+
errors.push(`${prefix}: Missing 'prompt'`);
|
|
152
|
+
}
|
|
153
|
+
// Validate criteria weights sum roughly to 1
|
|
154
|
+
if (task.criteria) {
|
|
155
|
+
const weights = Object.values(task.criteria)
|
|
156
|
+
.filter((c) => c !== undefined)
|
|
157
|
+
.map((c) => c.weight ?? 0.33);
|
|
158
|
+
const sum = weights.reduce((a, b) => a + b, 0);
|
|
159
|
+
if (weights.length > 0 && Math.abs(sum - 1) > 0.1) {
|
|
160
|
+
errors.push(`${prefix}: Criteria weights sum to ${sum.toFixed(2)}, expected ~1.0`);
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
return errors;
|
|
165
|
+
}
|
|
166
|
+
// ============================================
|
|
167
|
+
// Template Generation
|
|
168
|
+
// ============================================
|
|
169
|
+
/**
 * Generate a YAML template for a new skill evaluation.
 *
 * The template has a `defaults` block plus `numTasks` placeholder tasks. Task
 * ids are the first two characters of the skill name, lower-cased, followed by
 * a zero-padded three-digit counter (for example `pd-001`).
 *
 * @param skillName - Name written to `skill` and every `expected_skill_load`.
 * @param numTasks - Number of placeholder tasks to emit (default 5).
 * @returns The YAML document as a string, ending with a newline.
 */
export function createEvalTemplate(skillName, numTasks = 5) {
    const idPrefix = skillName.slice(0, 2).toLowerCase();
    // Render one placeholder task; `index` is zero-based, ids start at 001.
    const renderTask = (index) => {
        const sequence = String(index + 1).padStart(3, '0');
        return [
            `  - id: ${idPrefix}-${sequence}`,
            `    prompt: "TODO: Write a realistic prompt that should trigger ${skillName}"`,
            `    expected_skill_load: ${skillName}`,
            `    deterministic:`,
            `      expect_skill_activation: true`,
            `      # expect_marker: "OPTIONAL_MARKER_TEXT"`,
            `    criteria:`,
            `      discovery: { weight: 0.3, description: "Should load ${skillName} based on task context" }`,
            `      adherence: { weight: 0.4, description: "Should follow ${skillName} instructions" }`,
            `      output: { weight: 0.3, description: "Should produce quality output meeting requirements" }`,
            `    golden_checklist:`,
            `      - "TODO: Add expected behavior 1"`,
            `      - "TODO: Add expected behavior 2"`,
            `      - "TODO: Add expected behavior 3"`,
        ].join('\n');
    };
    const taskBlocks = Array.from({ length: numTasks }).map((_, index) => renderTask(index));
    const header = [
        `skill: ${skillName}`,
        `version: "1.0"`,
        ``,
        `defaults:`,
        `  expected_skill_load: ${skillName}`,
        `  criteria:`,
        `    discovery: { weight: 0.3 }`,
        `    adherence: { weight: 0.4 }`,
        `    output: { weight: 0.3 }`,
        ``,
        `tasks:`,
    ].join('\n');
    // Tasks are separated by a blank line; the document ends with one newline.
    return `${header}\n${taskBlocks.join('\n\n')}\n`;
}
|
|
205
|
+
//# sourceMappingURL=parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parser.js","sourceRoot":"","sources":["../../src/parser.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,IAAI,MAAM,SAAS,CAAC;AAC3B,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AA8DlC,+CAA+C;AAC/C,SAAS;AACT,+CAA+C;AAE/C;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,QAAgB;IAClD,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACrD,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAgB,CAAC;IAE9C,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAC;IACpE,CAAC;IAED,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,aAAa,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IACxE,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC;IAEnE,OAAO;QACL,SAAS,EAAE,GAAG,CAAC,KAAK;QACpB,OAAO,EAAE,GAAG,CAAC,OAAO;QACpB,QAAQ;QACR,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAAC,GAAgB;IACrC,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAElC,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;QAC5B,QAAQ,CAAC,iBAAiB,GAAG,GAAG,CAAC,mBAAmB,CAAC;IACvD,CAAC;IAED,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QACjB,QAAQ,CAAC,QAAQ,GAAG,EAAE,CAAC;QACvB,KAAK,MAAM,GAAG,IAAI,CAAC,WAAW,EAAE,WAAW,EAAE,QAAQ,CAAU,EAAE,CAAC;YAChE,MAAM,OAAO,GAAG,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,OAAO,EAAE,CAAC;gBACZ,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG;oBACvB,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,WAAW,EAAE,OAAO,CAAC,WAAW;iBACjC,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,SAAS,SAAS,CAAC,GAAY,EAAE,QAAuB;IACtD,qDAAqD;IACrD,MAAM,iBAAiB,GAAG,GAAG,CAAC,mBAAmB;WAC5C,QAAQ,EAAE,iBAAiB;WAC3B,EAAE,CAAC;IAER,qDAAqD;IACrD,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,MAAM,UAAU,GAAG,CAAC,WAAW,EAAE,WAAW,EAAE,QAAQ,CAAU,CAAC;IAEjE,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC,GAAG,CAAC,CAAC;QACrC,MAAM,WAAW,GAAG,QAAQ,EAAE,QAAQ,EAAE,CAAC,GAAG,CAAC,CAAC;QAE9C,IAAI,QAAQ,IAAI,WAAW,EAAE,CAAC;YAC5B,QAAQ,CAAC,IAAI,CAAC;gBACZ,SAAS,EAAE,GAAG;gBACd,MAAM,EAAE,QAAQ,EAAE,MAAM,IAAI,WAAW,EAAE,MAAM,IAAI,IAAI;gBACvD,WAA
W,EAAE,QAAQ,EAAE,WAAW,IAAI,WAAW,EAAE,WAAW,IAAI,EAAE;aACrE,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,4BAA4B;IAC5B,IAAI,aAA6C,CAAC;IAClD,IAAI,GAAG,CAAC,aAAa,EAAE,CAAC;QACtB,aAAa,GAAG;YACd,qBAAqB,EAAE,GAAG,CAAC,aAAa,CAAC,uBAAuB,IAAI,IAAI;YACxE,YAAY,EAAE,GAAG,CAAC,aAAa,CAAC,aAAa;YAC7C,eAAe,EAAE,GAAG,CAAC,aAAa,CAAC,iBAAiB;YACpD,iBAAiB,EAAE,GAAG,CAAC,aAAa,CAAC,oBAAoB;SAC1D,CAAC;IACJ,CAAC;IAED,sBAAsB;IACtB,IAAI,OAAkC,CAAC;IACvC,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;QAChB,OAAO,GAAG;YACR,KAAK,EAAE,GAAG,CAAC,OAAO,CAAC,KAAK,IAAI,SAAS;YACrC,KAAK,EAAE,GAAG,CAAC,OAAO,CAAC,KAAK;YACxB,QAAQ,EAAE,GAAG,CAAC,OAAO,CAAC,QAAQ;SAC/B,CAAC;IACJ,CAAC;IAED,OAAO;QACL,EAAE,EAAE,GAAG,CAAC,EAAE,IAAI,EAAE;QAChB,MAAM,EAAE,GAAG,CAAC,MAAM,IAAI,EAAE;QACxB,iBAAiB;QACjB,QAAQ;QACR,eAAe,EAAE,GAAG,CAAC,gBAAgB,IAAI,EAAE;QAC3C,aAAa;QACb,OAAO;KACR,CAAC;AACJ,CAAC;AAED,+CAA+C;AAC/C,aAAa;AACb,+CAA+C;AAE/C;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,QAAgB;IACrD,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,IAAI,OAAe,CAAC;IACpB,IAAI,CAAC;QACH,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACjD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,qBAAqB,QAAQ,EAAE,CAAC,CAAC;IAC3C,CAAC;IAED,IAAI,GAAgB,CAAC;IACrB,IAAI,CAAC;QACH,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAgB,CAAC;IAC1C,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,OAAO,CAAC,iBAAiB,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO,CAAC,eAAe,CAAC,CAAC;IAC3B,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC;QACf,MAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;IACjD,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;QAC5C,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC;QAChD,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC;QAE7B,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC;YACb,MAAM,CAAC,IAAI,CAAC,GAA
G,MAAM,gBAAgB,CAAC,CAAC;QACzC,CAAC;aAAM,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;YAChC,MAAM,CAAC,IAAI,CAAC,GAAG,MAAM,wBAAwB,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC;QAC3D,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACvB,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,GAAG,MAAM,oBAAoB,CAAC,CAAC;QAC7C,CAAC;QAED,6CAA6C;QAC7C,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC;iBACzC,MAAM,CAAC,CAAC,CAAC,EAAoB,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC;iBAChD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;YAChC,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;YAC/C,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,EAAE,CAAC;gBAClD,MAAM,CAAC,IAAI,CAAC,GAAG,MAAM,6BAA6B,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,iBAAiB,CAAC,CAAC;YACrF,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,+CAA+C;AAC/C,sBAAsB;AACtB,+CAA+C;AAE/C;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,SAAiB,EAAE,QAAQ,GAAG,CAAC;IAChE,MAAM,MAAM,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;IAEnD,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACtD,MAAM,MAAM,GAAG,GAAG,MAAM,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;QAC7D,OAAO,WAAW,MAAM;kEACsC,SAAS;2BAChD,SAAS;;;;;4DAKwB,SAAS;8DACP,SAAS;;;;;wCAK/B,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,OAAO,UAAU,SAAS;;;;yBAIH,SAAS;;;;;;;EAOhC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;CACnB,CAAC;AACF,CAAC"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation pipeline orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* Coordinates the full evaluation flow:
|
|
5
|
+
* parse tasks → setup skills → run agent → score → report → check thresholds
|
|
6
|
+
*/
|
|
7
|
+
import { type EvalConfig } from './config.js';
|
|
8
|
+
import type { SkillEvaluation, TaskResult, CombinedScore, EvaluationReport } from './types.js';
|
|
9
|
+
/**
 * Options accepted by `runPipeline`.
 */
export interface PipelineOptions {
    /** Path to the YAML file that defines the tasks. */
    tasksFile: string;
    /** Path to eval.config.yaml. */
    configPath?: string;
    /** Config values supplied by CLI flags; passed to the config loader together with `configPath`. */
    configOverrides?: Partial<EvalConfig>;
    /** Directory the agent runs in. */
    cwd?: string;
    /** Directory holding the skills to install locally before the run. */
    skillsDir?: string;
    /** Task IDs to run, separated by commas; leave empty to run every task. */
    taskFilter?: string;
    /** When true, deterministic scoring is skipped. */
    noDeterministic?: boolean;
    /** When true, LLM judge scoring is skipped. */
    noJudge?: boolean;
    /** Turns on verbose logging. */
    verbose?: boolean;
}
|
|
29
|
+
/**
 * Outcome returned by `runPipeline` and `scorePipeline`.
 */
export interface PipelineResult {
    /** Overall pass/fail verdict. */
    passed: boolean;
    /** Human-readable reasons for a failed verdict. */
    failureReasons: string[];
    /** The evaluation (skill name and tasks) that was run or scored. */
    evaluation: SkillEvaluation;
    /** One raw result per executed task. */
    results: TaskResult[];
    /** Scores computed for the results. */
    scores: CombinedScore[];
    /** The structured evaluation report. */
    report: EvaluationReport;
    /** Location of the generated markdown report. */
    reportPath?: string;
    /** Location of the generated JSON results file. */
    jsonPath?: string;
    /** Condensed markdown summary of the report. */
    markdownSummary: string;
}
|
|
40
|
+
/**
 * Execute the complete evaluation flow described in the file header:
 * parse tasks, set up skills, run the agent, score, report, check thresholds.
 *
 * @param options - Task file location plus optional configuration, filtering and scoring switches.
 * @returns A promise for the verdict, scores, report and output paths of the run.
 */
export declare function runPipeline(options: PipelineOptions): Promise<PipelineResult>;
|
|
44
|
+
/**
 * Score results that already exist on disk, without running the agent again.
 *
 * @param resultsPath - Path to a previously written results file.
 * @param options - Optional configuration location/overrides and switches that
 *   skip the judge or the deterministic scorer.
 * @returns A promise for the same result shape that `runPipeline` produces.
 */
export declare function scorePipeline(resultsPath: string, options?: {
    configPath?: string;
    configOverrides?: Partial<EvalConfig>;
    noJudge?: boolean;
    noDeterministic?: boolean;
}): Promise<PipelineResult>;
|
|
53
|
+
//# sourceMappingURL=pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/pipeline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAWH,OAAO,EAAc,KAAK,UAAU,EAAE,MAAM,aAAa,CAAC;AAC1D,OAAO,KAAK,EACV,eAAe,EACf,UAAU,EACV,aAAa,EACb,gBAAgB,EAGjB,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,sCAAsC;IACtC,eAAe,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC;IACtC,4CAA4C;IAC5C,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,uDAAuD;IACvD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,oDAAoD;IACpD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iCAAiC;IACjC,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,6BAA6B;IAC7B,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,6BAA6B;IAC7B,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,OAAO,CAAC;IAChB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,UAAU,EAAE,eAAe,CAAC;IAC5B,OAAO,EAAE,UAAU,EAAE,CAAC;IACtB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,MAAM,EAAE,gBAAgB,CAAC;IACzB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,eAAe,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC,CAuHnF;AAED;;GAEG;AACH,wBAAsB,aAAa,CACjC,WAAW,EAAE,MAAM,EACnB,OAAO,GAAE;IACP,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,eAAe,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC;IACtC,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,eAAe,CAAC,EAAE,OAAO,CAAC;CACtB,GACL,OAAO,CAAC,cAAc,CAAC,CA+CzB"}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation pipeline orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* Coordinates the full evaluation flow:
|
|
5
|
+
* parse tasks → setup skills → run agent → score → report → check thresholds
|
|
6
|
+
*/
|
|
7
|
+
import * as path from 'path';
|
|
8
|
+
import * as fs from 'fs/promises';
|
|
9
|
+
import { parseEvalFile } from './parser.js';
|
|
10
|
+
import { SkillEvalRunner } from './runner/runner.js';
|
|
11
|
+
import { setupLocalSkills, cleanupLocalSkills } from './runner/skill-setup.js';
|
|
12
|
+
import { scoreAll } from './scorer/scorer.js';
|
|
13
|
+
import { SessionLogger } from './session/session-logger.js';
|
|
14
|
+
import { generateReport, generateJsonResults } from './report/report.js';
|
|
15
|
+
import { generateGitHubSummary, writeGitHubSummary } from './report/github-summary.js';
|
|
16
|
+
import { loadConfig } from './config.js';
|
|
17
|
+
/**
 * Run the full evaluation pipeline.
 *
 * Steps, in order: load the config, parse (and optionally filter) the tasks,
 * install local skills, run the agent on every task, score the results, write
 * the markdown and JSON reports, optionally write the GitHub step summary, and
 * print a console summary. Skills installed for the run are cleaned up whether
 * or not the later steps succeed.
 *
 * @param options - Pipeline options (tasks file, config, cwd, skills dir, filters, scorer switches).
 * @returns The verdict, evaluation, raw results, scores, report, output paths and markdown summary.
 * @throws Error "No tasks to run" when parsing or filtering leaves no tasks.
 */
export async function runPipeline(options) {
    const { tasksFile, taskFilter } = options;
    const config = await loadConfig(options.configPath, options.configOverrides);
    const cwd = options.cwd || process.cwd();

    // 1. Parse tasks, then narrow to the requested ids when a filter is given.
    console.log(`Parsing tasks from: ${tasksFile}`);
    const parsed = await parseEvalFile(tasksFile);
    let evaluation = parsed;
    if (taskFilter) {
        const wanted = new Set();
        for (const piece of taskFilter.split(',')) {
            wanted.add(piece.trim());
        }
        const tasks = parsed.tasks.filter((task) => wanted.has(task.id));
        evaluation = { ...parsed, tasks };
        console.log(`Filtered to ${tasks.length} task(s): ${taskFilter}`);
    }
    const taskCount = evaluation.tasks.length;
    if (taskCount === 0) {
        throw new Error('No tasks to run');
    }
    console.log(`Running ${taskCount} task(s) for skill: ${evaluation.skillName}`);

    // 2. Set up local skills: an explicit directory wins, otherwise look for a
    //    `skills` directory next to the tasks file.
    const skillsDir = options.skillsDir || (await findSiblingSkillsDir(tasksFile));
    if (skillsDir) {
        console.log(`Setting up local skills from: ${skillsDir}`);
        const installed = await setupLocalSkills(skillsDir, cwd);
        console.log(`Skills configured: ${installed.join(', ')}`);
    }
    // Cleanup is only owed when setup ran.
    const mustCleanUp = Boolean(skillsDir);

    try {
        // 3. Run the agent against the tasks, one session log per task.
        console.log('\n--- Running Tasks ---\n');
        const logDir = path.join(config.outputDir, 'logs');
        const runner = new SkillEvalRunner({
            cwd,
            model: config.defaultAgentModel,
            parallel: false,
            allowedWriteDirs: config.allowedWriteDirs,
        });
        const results = await runner.runAll(evaluation, (task) => new SessionLogger(task.id, logDir));

        // 4. Score the results.
        console.log('\n--- Scoring ---\n');
        const scores = await scoreAll(evaluation.tasks, results, {
            noDeterministic: options.noDeterministic,
            noJudge: options.noJudge,
            judgeOptions: { model: config.defaultJudgeModel },
        });

        // 5. Write the reports; ':' and '.' in the ISO timestamp become '-'.
        console.log('\n--- Generating Reports ---\n');
        const stamp = new Date().toISOString().replace(/[:.]/g, '-');
        const baseName = `${evaluation.skillName}-${stamp}`;
        const reportPath = path.join(config.outputDir, `${baseName}.md`);
        const jsonPath = path.join(config.outputDir, `${baseName}.json`);
        const metadata = {
            skillPath: tasksFile,
            agentModel: config.defaultAgentModel,
            judgeModel: config.defaultJudgeModel,
        };
        await generateReport(evaluation, results, scores, reportPath, metadata);
        const report = await generateJsonResults(evaluation, results, scores, jsonPath, metadata);

        // 6. GitHub step summary, when enabled in the config.
        if (config.githubSummary && (await writeGitHubSummary(report))) {
            console.log('GitHub step summary written');
        }
        const markdownSummary = generateGitHubSummary(report);

        // 7. Console summary.
        printSummary(report);

        return {
            passed: report.passed,
            failureReasons: report.failureReasons,
            evaluation,
            results,
            scores,
            report,
            reportPath,
            jsonPath,
            markdownSummary,
        };
    }
    finally {
        if (mustCleanUp) {
            await cleanupLocalSkills(cwd);
        }
    }
}

/**
 * Look for a `skills` directory next to the tasks file.
 *
 * @returns Its path when it exists and is a directory, otherwise undefined.
 */
async function findSiblingSkillsDir(tasksFile) {
    const candidate = path.join(path.dirname(path.resolve(tasksFile)), 'skills');
    try {
        const info = await fs.stat(candidate);
        return info.isDirectory() ? candidate : undefined;
    }
    catch {
        // stat failed, so there is no usable skills/ directory; that's fine.
        return undefined;
    }
}
|
|
123
|
+
/**
|
|
124
|
+
* Score existing results (no runner).
|
|
125
|
+
*/
|
|
126
|
+
export async function scorePipeline(resultsPath, options = {}) {
|
|
127
|
+
const config = await loadConfig(options.configPath, options.configOverrides);
|
|
128
|
+
const data = JSON.parse(await fs.readFile(resultsPath, 'utf-8'));
|
|
129
|
+
const evaluation = {
|
|
130
|
+
skillName: data.skillName,
|
|
131
|
+
tasks: data.tasks,
|
|
132
|
+
};
|
|
133
|
+
const results = data.results;
|
|
134
|
+
const scorerOptions = {
|
|
135
|
+
noDeterministic: options.noDeterministic,
|
|
136
|
+
noJudge: options.noJudge,
|
|
137
|
+
judgeOptions: { model: config.defaultJudgeModel },
|
|
138
|
+
};
|
|
139
|
+
console.log(`Scoring ${results.length} result(s)...`);
|
|
140
|
+
const scores = await scoreAll(evaluation.tasks, results, scorerOptions);
|
|
141
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
142
|
+
const reportBaseName = `${evaluation.skillName}-scored-${timestamp}`;
|
|
143
|
+
const reportPath = path.join(config.outputDir, `${reportBaseName}.md`);
|
|
144
|
+
const jsonPath = path.join(config.outputDir, `${reportBaseName}.json`);
|
|
145
|
+
const metadata = {
|
|
146
|
+
skillPath: resultsPath,
|
|
147
|
+
agentModel: data.metadata?.agentModel || config.defaultAgentModel,
|
|
148
|
+
judgeModel: config.defaultJudgeModel,
|
|
149
|
+
};
|
|
150
|
+
await generateReport(evaluation, results, scores, reportPath, metadata);
|
|
151
|
+
const report = await generateJsonResults(evaluation, results, scores, jsonPath, metadata);
|
|
152
|
+
const markdownSummary = generateGitHubSummary(report);
|
|
153
|
+
printSummary(report);
|
|
154
|
+
return {
|
|
155
|
+
passed: report.passed,
|
|
156
|
+
failureReasons: report.failureReasons,
|
|
157
|
+
evaluation,
|
|
158
|
+
results,
|
|
159
|
+
scores,
|
|
160
|
+
report,
|
|
161
|
+
reportPath,
|
|
162
|
+
jsonPath,
|
|
163
|
+
markdownSummary,
|
|
164
|
+
};
|
|
165
|
+
}
|
|
166
|
+
function printSummary(report) {
|
|
167
|
+
const s = report.summary;
|
|
168
|
+
console.log('\n' + '='.repeat(50));
|
|
169
|
+
console.log(` Skill Evaluation: ${report.skillName}`);
|
|
170
|
+
console.log('='.repeat(50));
|
|
171
|
+
console.log(` Result: ${report.passed ? 'PASS' : 'FAIL'}`);
|
|
172
|
+
console.log(` Discovery: ${(s.discoveryAccuracy * 100).toFixed(0)}%`);
|
|
173
|
+
console.log(` Avg Adherence: ${s.avgAdherence.toFixed(2)}/5`);
|
|
174
|
+
console.log(` Avg Output Quality: ${s.avgOutputQuality.toFixed(2)}/5`);
|
|
175
|
+
console.log(` Weighted Score: ${s.avgWeightedScore.toFixed(2)}`);
|
|
176
|
+
console.log(` Duration: ${(s.totalDurationMs / 1000).toFixed(1)}s | Cost: $${s.totalCostUsd.toFixed(4)}`);
|
|
177
|
+
if (!report.passed && report.failureReasons.length > 0) {
|
|
178
|
+
console.log(`\n Failures:`);
|
|
179
|
+
for (const reason of report.failureReasons) {
|
|
180
|
+
console.log(` - ${reason}`);
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
console.log('='.repeat(50));
|
|
184
|
+
}
|
|
185
|
+
//# sourceMappingURL=pipeline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/pipeline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAC/E,OAAO,EAAE,QAAQ,EAAsB,MAAM,oBAAoB,CAAC;AAClE,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAkB,MAAM,oBAAoB,CAAC;AACzF,OAAO,EAAE,qBAAqB,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AACvF,OAAO,EAAE,UAAU,EAAmB,MAAM,aAAa,CAAC;AA2C1D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,OAAwB;IACxD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC;IAC7E,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IAEzC,iBAAiB;IACjB,OAAO,CAAC,GAAG,CAAC,uBAAuB,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC;IACxD,IAAI,UAAU,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAExD,4BAA4B;IAC5B,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;QACvB,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAC9E,UAAU,GAAG;YACX,GAAG,UAAU;YACb,KAAK,EAAE,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;SAC3D,CAAC;QACF,OAAO,CAAC,GAAG,CAAC,eAAe,UAAU,CAAC,KAAK,CAAC,MAAM,aAAa,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;IACvF,CAAC;IAED,IAAI,UAAU,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClC,MAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,CAAC;IACrC,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,WAAW,UAAU,CAAC,KAAK,CAAC,MAAM,uBAAuB,UAAU,CAAC,SAAS,EAAE,CAAC,CAAC;IAE7F,wBAAwB;IACxB,kFAAkF;IAClF,IAAI,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;IAClC,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC;QAC/D,MAAM,aAAa,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;QACpD,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YAC1C,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gBACvB,SAAS,GAAG,aAAa,CAAC;YAC5B,CAAC;QACH,CAAC;QAAC,MA
AM,CAAC;YACP,0CAA0C;QAC5C,CAAC;IACH,CAAC;IAED,IAAI,WAAW,GAAG,KAAK,CAAC;IACxB,IAAI,SAAS,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,iCAAiC,SAAS,EAAE,CAAC,CAAC;QAC1D,MAAM,UAAU,GAAG,MAAM,gBAAgB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;QAC1D,WAAW,GAAG,IAAI,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,sBAAsB,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,IAAI,CAAC;QACH,6BAA6B;QAC7B,OAAO,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC;YACjC,GAAG;YACH,KAAK,EAAE,MAAM,CAAC,iBAAiB;YAC/B,QAAQ,EAAE,KAAK;YACf,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;SAC1C,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;QACnD,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,MAAM,CACjC,UAAU,EACV,CAAC,IAAc,EAAE,EAAE,CAAC,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,EAAE,MAAM,CAAC,CACvD,CAAC;QAEF,mBAAmB;QACnB,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;QACnC,MAAM,aAAa,GAAkB;YACnC,eAAe,EAAE,OAAO,CAAC,eAAe;YACxC,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,YAAY,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,iBAAiB,EAAE;SAClD,CAAC;QACF,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,KAAK,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;QAExE,sBAAsB;QACtB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;QAC9C,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QACjE,MAAM,cAAc,GAAG,GAAG,UAAU,CAAC,SAAS,IAAI,SAAS,EAAE,CAAC;QAC9D,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,GAAG,cAAc,KAAK,CAAC,CAAC;QACvE,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,GAAG,cAAc,OAAO,CAAC,CAAC;QAEvE,MAAM,QAAQ,GAAmB;YAC/B,SAAS,EAAE,OAAO,CAAC,SAAS;YAC5B,UAAU,EAAE,MAAM,CAAC,iBAAiB;YACpC,UAAU,EAAE,MAAM,CAAC,iBAAiB;SACrC,CAAC;QAEF,MAAM,cAAc,CAAC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;QACxE,MAAM,MAAM,GAAG,MAAM,mBAAmB,CAAC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;QAE1F,oBAAoB;QACpB,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YACzB,MAAM,KAAK,GAAG,MAAM,kBAAkB,CAAC,MAAM,CAAC,CAAC;YAC/C,IAAI,KAAK,EAAE,CAAC;gBACV,OAAO,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC;YAC7C,CAAC;QACH,CAAC;QAED,MAAM,eAAe,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAC;QAEtD,mBAAmB;QACnB,YAAY,CAAC,MAAM,CAAC,C
AAC;QAErB,OAAO;YACL,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,cAAc,EAAE,MAAM,CAAC,cAAc;YACrC,UAAU;YACV,OAAO;YACP,MAAM;YACN,MAAM;YACN,UAAU;YACV,QAAQ;YACR,eAAe;SAChB,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,uBAAuB;QACvB,IAAI,WAAW,EAAE,CAAC;YAChB,MAAM,kBAAkB,CAAC,GAAG,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,WAAmB,EACnB,UAKI,EAAE;IAEN,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC;IAE7E,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC;IACjE,MAAM,UAAU,GAAoB;QAClC,SAAS,EAAE,IAAI,CAAC,SAAS;QACzB,KAAK,EAAE,IAAI,CAAC,KAAK;KAClB,CAAC;IACF,MAAM,OAAO,GAAiB,IAAI,CAAC,OAAO,CAAC;IAE3C,MAAM,aAAa,GAAkB;QACnC,eAAe,EAAE,OAAO,CAAC,eAAe;QACxC,OAAO,EAAE,OAAO,CAAC,OAAO;QACxB,YAAY,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,iBAAiB,EAAE;KAClD,CAAC;IAEF,OAAO,CAAC,GAAG,CAAC,WAAW,OAAO,CAAC,MAAM,eAAe,CAAC,CAAC;IACtD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,KAAK,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;IAExE,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;IACjE,MAAM,cAAc,GAAG,GAAG,UAAU,CAAC,SAAS,WAAW,SAAS,EAAE,CAAC;IACrE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,GAAG,cAAc,KAAK,CAAC,CAAC;IACvE,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,GAAG,cAAc,OAAO,CAAC,CAAC;IAEvE,MAAM,QAAQ,GAAmB;QAC/B,SAAS,EAAE,WAAW;QACtB,UAAU,EAAE,IAAI,CAAC,QAAQ,EAAE,UAAU,IAAI,MAAM,CAAC,iBAAiB;QACjE,UAAU,EAAE,MAAM,CAAC,iBAAiB;KACrC,CAAC;IAEF,MAAM,cAAc,CAAC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;IACxE,MAAM,MAAM,GAAG,MAAM,mBAAmB,CAAC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAE1F,MAAM,eAAe,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAC;IACtD,YAAY,CAAC,MAAM,CAAC,CAAC;IAErB,OAAO;QACL,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,cAAc,EAAE,MAAM,CAAC,cAAc;QACrC,UAAU;QACV,OAAO;QACP,MAAM;QACN,MAAM;QACN,UAAU;QACV,QAAQ;QACR,eAAe;KAChB,CAAC;AACJ,CAAC;AAED,SAAS,YAAY,CAAC,MAAwB;IAC5C,MAAM,CAAC,GAAG,MAAM,CAAC,OAAO,CAAC;IACzB,OAAO,CAAC,GAAG,CAAC,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,OAAO,CAAC,G
AAG,CAAC,uBAAuB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;IAC5D,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC,CAAC,iBAAiB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;IACvE,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAC/D,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAClE,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC3G,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvD,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC;QAC7B,KAAK,MAAM,MAAM,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;YAC3C,OAAO,CAAC,GAAG,CAAC,SAAS,MAAM,EAAE,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;AAC9B,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Actions job summary generation.
|
|
3
|
+
*
|
|
4
|
+
* Produces condensed markdown suitable for $GITHUB_STEP_SUMMARY.
|
|
5
|
+
*/
|
|
6
|
+
import type { EvaluationReport } from '../types.js';
|
|
7
|
+
/**
|
|
8
|
+
* Generate a condensed summary for GitHub Actions.
|
|
9
|
+
*/
|
|
10
|
+
// Pure formatter: returns the whole markdown summary as one newline-joined string; nothing is written to disk.
export declare function generateGitHubSummary(report: EvaluationReport): string;
|
|
11
|
+
/**
|
|
12
|
+
* Write summary to $GITHUB_STEP_SUMMARY if available.
|
|
13
|
+
*/
|
|
14
|
+
// Resolves to false (and writes nothing) when GITHUB_STEP_SUMMARY is unset or empty;
// otherwise appends the generated summary to that file and resolves to true.
export declare function writeGitHubSummary(report: EvaluationReport): Promise<boolean>;
|
|
15
|
+
//# sourceMappingURL=github-summary.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github-summary.d.ts","sourceRoot":"","sources":["../../../src/report/github-summary.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EACV,gBAAgB,EAIjB,MAAM,aAAa,CAAC;AAErB;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,gBAAgB,GAAG,MAAM,CAqDtE;AAED;;GAEG;AACH,wBAAsB,kBAAkB,CAAC,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,OAAO,CAAC,CAOnF"}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Actions job summary generation.
|
|
3
|
+
*
|
|
4
|
+
* Produces condensed markdown suitable for $GITHUB_STEP_SUMMARY.
|
|
5
|
+
*/
|
|
6
|
+
import * as fs from 'fs/promises';
|
|
7
|
+
/**
|
|
8
|
+
* Generate a condensed summary for GitHub Actions.
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Generate a condensed markdown summary for GitHub Actions.
 *
 * The document contains, in order: a headline with a pass/fail icon and the
 * skill name, a metric table, a table of failed tasks (only when at least one
 * task has a failure category other than 'none'), a collapsible table with
 * every task, and - for reports that did not pass - the joined failure reasons.
 *
 * Free-form text placed inside table cells (task ids and scorer reasoning) has
 * its line breaks collapsed to a single space and every `|` backslash-escaped,
 * because either one would otherwise terminate the cell/row and break the table.
 *
 * @param report Evaluation report to render.
 * @returns The markdown document as a single newline-joined string.
 */
export function generateGitHubSummary(report) {
    const { summary, tasks } = report;
    const lines = [];
    // Reasoning text longer than this is truncated and suffixed with '...'.
    const maxReasonLength = 80;
    // Table rows must stay on one line: collapse line breaks before anything else.
    const flatten = (value) => String(value).replace(/\s*[\r\n]+\s*/g, ' ');
    // Make arbitrary text safe for a markdown table cell.
    const toCell = (value) => flatten(value).replace(/\|/g, '\\|');

    const icon = report.passed ? ':white_check_mark:' : ':x:';
    lines.push(`## ${icon} Skill Evaluation: ${report.skillName}`);
    lines.push('');

    // Summary table. The PASS/FAIL thresholds (0.8 discovery, 4.0 scores) are fixed here.
    lines.push('| Metric | Value | Status |');
    lines.push('|--------|-------|--------|');
    lines.push(`| Discovery Rate | ${(summary.discoveryAccuracy * 100).toFixed(0)}% (${Math.round(summary.discoveryAccuracy * summary.totalTasks)}/${summary.totalTasks}) | ${summary.discoveryAccuracy >= 0.8 ? 'PASS' : 'FAIL'} |`);
    lines.push(`| Avg Adherence | ${summary.avgAdherence.toFixed(1)}/5 | ${summary.avgAdherence >= 4.0 ? 'PASS' : 'FAIL'} |`);
    lines.push(`| Avg Output Quality | ${summary.avgOutputQuality.toFixed(1)}/5 | ${summary.avgOutputQuality >= 4.0 ? 'PASS' : 'FAIL'} |`);
    lines.push(`| Weighted Score | ${summary.avgWeightedScore.toFixed(2)} | |`);
    lines.push(`| Duration | ${(summary.totalDurationMs / 1000).toFixed(1)}s | |`);
    lines.push(`| Cost | $${summary.totalCostUsd.toFixed(4)} | |`);
    lines.push('');

    // Failures
    const failures = tasks.filter((t) => t.score.failureCategory !== 'none');
    if (failures.length > 0) {
        lines.push(`### Failures (${failures.length})`);
        lines.push('');
        lines.push('| Task | Category | Details |');
        lines.push('|------|----------|---------|');
        for (const f of failures) {
            const cat = formatCategory(f.score.failureCategory);
            // Flatten first so the length limit applies to what is actually shown,
            // and escape last so truncation can never split a `\|` pair.
            const flat = flatten(f.score.reasoning);
            const reason = flat.length > maxReasonLength
                ? `${flat.slice(0, maxReasonLength)}...`
                : flat;
            lines.push(`| ${toCell(f.task.id)} | ${cat} | ${toCell(reason)} |`);
        }
        lines.push('');
    }

    // Per-task details in collapsible
    lines.push('<details><summary>All task results</summary>');
    lines.push('');
    lines.push('| Task | Discovery | Adherence | Output | Weighted | Status |');
    lines.push('|------|-----------|-----------|--------|----------|--------|');
    for (const t of tasks) {
        const s = t.score;
        const status = s.failureCategory === 'none' ? 'PASS' : 'FAIL';
        lines.push(`| ${toCell(t.task.id)} | ${s.discovery} | ${s.adherence}/5 | ${s.outputQuality}/5 | ${s.weightedScore.toFixed(2)} | ${status} |`);
    }
    lines.push('');
    lines.push('</details>');
    lines.push('');

    if (!report.passed && report.failureReasons.length > 0) {
        lines.push(`**Failure reasons:** ${report.failureReasons.join('; ')}`);
    }
    return lines.join('\n');
}
|
|
58
|
+
/**
|
|
59
|
+
* Write summary to $GITHUB_STEP_SUMMARY if available.
|
|
60
|
+
*/
|
|
61
|
+
/**
 * Append the condensed report to the file named by $GITHUB_STEP_SUMMARY.
 *
 * @param report Evaluation report to render.
 * @returns `false` when the variable is unset or empty (nothing is written),
 *          otherwise `true` once the append has completed.
 */
export async function writeGitHubSummary(report) {
    const target = process.env.GITHUB_STEP_SUMMARY;
    if (target) {
        const markdown = generateGitHubSummary(report);
        // Trailing newline keeps later appends to the same file on their own line.
        await fs.appendFile(target, `${markdown}\n`);
        return true;
    }
    return false;
}
|
|
69
|
+
/**
 * Turn a snake_case failure category into a display label.
 *
 * The sentinel 'none' maps to 'No Failure'; any other value is split on '_',
 * each piece gets its first character upper-cased, and the pieces are joined
 * with single spaces.
 */
function formatCategory(cat) {
    if (cat === 'none') {
        return 'No Failure';
    }
    const words = [];
    for (const piece of cat.split('_')) {
        const head = piece.charAt(0).toUpperCase();
        words.push(head + piece.slice(1));
    }
    return words.join(' ');
}
|
|
77
|
+
//# sourceMappingURL=github-summary.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github-summary.js","sourceRoot":"","sources":["../../../src/report/github-summary.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAQlC;;GAEG;AACH,MAAM,UAAU,qBAAqB,CAAC,MAAwB;IAC5D,MAAM,EAAE,OAAO,EAAE,gBAAgB,EAAE,KAAK,EAAE,GAAG,MAAM,CAAC;IACpD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,KAAK,CAAC;IAC1D,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,sBAAsB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IAC/D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,gBAAgB;IAChB,KAAK,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1C,KAAK,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1C,KAAK,CAAC,IAAI,CAAC,sBAAsB,CAAC,OAAO,CAAC,iBAAiB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,iBAAiB,GAAG,OAAO,CAAC,UAAU,CAAC,IAAI,OAAO,CAAC,UAAU,OAAO,OAAO,CAAC,iBAAiB,IAAI,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;IAClO,KAAK,CAAC,IAAI,CAAC,qBAAqB,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,OAAO,CAAC,YAAY,IAAI,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;IAC1H,KAAK,CAAC,IAAI,CAAC,0BAA0B,OAAO,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,OAAO,CAAC,gBAAgB,IAAI,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;IACvI,KAAK,CAAC,IAAI,CAAC,sBAAsB,OAAO,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC5E,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;IAC/E,KAAK,CAAC,IAAI,CAAC,aAAa,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC/D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,WAAW;IACX,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,eAAe,KAAK,MAAM,CAAC,CAAC;IACzE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,KAAK,CAAC,IAAI,CAAC,iBAAiB,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;QAChD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;QAC5C,KAAK,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;QAC5C,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YACzB,MAAM,GAAG,GAAG,cAAc,CAAC,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC;YACpD,MAAM,
MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAC7F,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,MAAM,IAAI,CAAC,CAAC;QACtD,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,kCAAkC;IAClC,KAAK,CAAC,IAAI,CAAC,8CAA8C,CAAC,CAAC;IAC3D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,+DAA+D,CAAC,CAAC;IAC5E,KAAK,CAAC,IAAI,CAAC,+DAA+D,CAAC,CAAC;IAC5E,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC;QAClB,MAAM,MAAM,GAAG,CAAC,CAAC,eAAe,KAAK,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9D,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC,CAAC,SAAS,MAAM,CAAC,CAAC,SAAS,QAAQ,CAAC,CAAC,aAAa,QAAQ,CAAC,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,MAAM,IAAI,CAAC,CAAC;IACxI,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IACzB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvD,KAAK,CAAC,IAAI,CAAC,wBAAwB,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,MAAwB;IAC/D,MAAM,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC;IACpD,IAAI,CAAC,WAAW;QAAE,OAAO,KAAK,CAAC;IAE/B,MAAM,OAAO,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAC;IAC9C,MAAM,EAAE,CAAC,UAAU,CAAC,WAAW,EAAE,OAAO,GAAG,IAAI,CAAC,CAAC;IACjD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,cAAc,CAAC,GAAW;IACjC,IAAI,GAAG,KAAK,MAAM;QAAE,OAAO,YAAY,CAAC;IACxC,OAAO,GAAG;SACP,KAAK,CAAC,GAAG,CAAC;SACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;SAClD,IAAI,CAAC,GAAG,CAAC,CAAC;AACf,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Report generation for skill evaluation results.
|
|
3
|
+
*
|
|
4
|
+
* Generates markdown and JSON reports from combined evaluation scores.
|
|
5
|
+
*/
|
|
6
|
+
import type { SkillEvaluation, TaskResult, CombinedScore, EvaluationReport, EvaluationSummary, FailureBreakdown, ReportMetadata } from '../types.js';
|
|
7
|
+
/**
|
|
8
|
+
* Generate a markdown report from evaluation results.
|
|
9
|
+
*/
|
|
10
|
+
// outputPath and metadata may be omitted; the promise resolves to a string.
// NOTE(review): presumably the resolved string is the markdown and it is also written to outputPath when given - confirm against report.js.
export declare function generateReport(evaluation: SkillEvaluation, results: TaskResult[], scores: CombinedScore[], outputPath?: string, metadata?: ReportMetadata): Promise<string>;
|
|
11
|
+
/**
|
|
12
|
+
* Generate JSON report for programmatic analysis.
|
|
13
|
+
*/
|
|
14
|
+
// Takes the same parameters as generateReport but resolves to the structured EvaluationReport object.
// NOTE(review): presumably the report is also serialized to outputPath when given - confirm against report.js.
export declare function generateJsonResults(evaluation: SkillEvaluation, results: TaskResult[], scores: CombinedScore[], outputPath?: string, metadata?: ReportMetadata): Promise<EvaluationReport>;
|
|
15
|
+
/**
|
|
16
|
+
* Compute summary statistics from combined scores.
|
|
17
|
+
*/
|
|
18
|
+
// Synchronous; returns a value rather than a promise.
// NOTE(review): results and scores presumably describe the same tasks in the same order - confirm against report.js.
export declare function computeSummary(results: TaskResult[], scores: CombinedScore[]): EvaluationSummary;
|
|
19
|
+
/**
|
|
20
|
+
* Compute failure category breakdown.
|
|
21
|
+
*/
|
|
22
|
+
// Synchronous; derived from the scores alone.
// NOTE(review): presumably yields one entry per failure category found in scores - confirm against report.js.
export declare function computeFailureBreakdown(scores: CombinedScore[]): FailureBreakdown[];
|
|
23
|
+
//# sourceMappingURL=report.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../../../src/report/report.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH,OAAO,KAAK,EACV,eAAe,EACf,UAAU,EACV,aAAa,EACb,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAEhB,cAAc,EACf,MAAM,aAAa,CAAC;AAGrB;;GAEG;AACH,wBAAsB,cAAc,CAClC,UAAU,EAAE,eAAe,EAC3B,OAAO,EAAE,UAAU,EAAE,EACrB,MAAM,EAAE,aAAa,EAAE,EACvB,UAAU,CAAC,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,cAAc,GACxB,OAAO,CAAC,MAAM,CAAC,CAsHjB;AAED;;GAEG;AACH,wBAAsB,mBAAmB,CACvC,UAAU,EAAE,eAAe,EAC3B,OAAO,EAAE,UAAU,EAAE,EACrB,MAAM,EAAE,aAAa,EAAE,EACvB,UAAU,CAAC,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,cAAc,GACxB,OAAO,CAAC,gBAAgB,CAAC,CAuD3B;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,OAAO,EAAE,UAAU,EAAE,EACrB,MAAM,EAAE,aAAa,EAAE,GACtB,iBAAiB,CAmBnB;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,aAAa,EAAE,GAAG,gBAAgB,EAAE,CAenF"}
|