@skilljack/evals 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +327 -0
  3. package/action/action.yml +72 -0
  4. package/action/index.ts +78 -0
  5. package/dist/action/index.d.ts +8 -0
  6. package/dist/action/index.d.ts.map +1 -0
  7. package/dist/action/index.js +68 -0
  8. package/dist/action/index.js.map +1 -0
  9. package/dist/src/cli.d.ts +9 -0
  10. package/dist/src/cli.d.ts.map +1 -0
  11. package/dist/src/cli.js +264 -0
  12. package/dist/src/cli.js.map +1 -0
  13. package/dist/src/config.d.ts +52 -0
  14. package/dist/src/config.d.ts.map +1 -0
  15. package/dist/src/config.js +194 -0
  16. package/dist/src/config.js.map +1 -0
  17. package/dist/src/index.d.ts +24 -0
  18. package/dist/src/index.d.ts.map +1 -0
  19. package/dist/src/index.js +28 -0
  20. package/dist/src/index.js.map +1 -0
  21. package/dist/src/parser.d.ts +22 -0
  22. package/dist/src/parser.d.ts.map +1 -0
  23. package/dist/src/parser.js +205 -0
  24. package/dist/src/parser.js.map +1 -0
  25. package/dist/src/pipeline.d.ts +53 -0
  26. package/dist/src/pipeline.d.ts.map +1 -0
  27. package/dist/src/pipeline.js +185 -0
  28. package/dist/src/pipeline.js.map +1 -0
  29. package/dist/src/report/github-summary.d.ts +15 -0
  30. package/dist/src/report/github-summary.d.ts.map +1 -0
  31. package/dist/src/report/github-summary.js +77 -0
  32. package/dist/src/report/github-summary.js.map +1 -0
  33. package/dist/src/report/report.d.ts +23 -0
  34. package/dist/src/report/report.d.ts.map +1 -0
  35. package/dist/src/report/report.js +216 -0
  36. package/dist/src/report/report.js.map +1 -0
  37. package/dist/src/runner/runner.d.ts +29 -0
  38. package/dist/src/runner/runner.d.ts.map +1 -0
  39. package/dist/src/runner/runner.js +211 -0
  40. package/dist/src/runner/runner.js.map +1 -0
  41. package/dist/src/runner/security.d.ts +26 -0
  42. package/dist/src/runner/security.d.ts.map +1 -0
  43. package/dist/src/runner/security.js +34 -0
  44. package/dist/src/runner/security.js.map +1 -0
  45. package/dist/src/runner/skill-setup.d.ts +19 -0
  46. package/dist/src/runner/skill-setup.d.ts.map +1 -0
  47. package/dist/src/runner/skill-setup.js +72 -0
  48. package/dist/src/runner/skill-setup.js.map +1 -0
  49. package/dist/src/scorer/deterministic.d.ts +12 -0
  50. package/dist/src/scorer/deterministic.d.ts.map +1 -0
  51. package/dist/src/scorer/deterministic.js +149 -0
  52. package/dist/src/scorer/deterministic.js.map +1 -0
  53. package/dist/src/scorer/judge.d.ts +34 -0
  54. package/dist/src/scorer/judge.d.ts.map +1 -0
  55. package/dist/src/scorer/judge.js +226 -0
  56. package/dist/src/scorer/judge.js.map +1 -0
  57. package/dist/src/scorer/scorer.d.ts +25 -0
  58. package/dist/src/scorer/scorer.d.ts.map +1 -0
  59. package/dist/src/scorer/scorer.js +149 -0
  60. package/dist/src/scorer/scorer.js.map +1 -0
  61. package/dist/src/session/session-logger.d.ts +30 -0
  62. package/dist/src/session/session-logger.d.ts.map +1 -0
  63. package/dist/src/session/session-logger.js +157 -0
  64. package/dist/src/session/session-logger.js.map +1 -0
  65. package/dist/src/types.d.ts +227 -0
  66. package/dist/src/types.d.ts.map +1 -0
  67. package/dist/src/types.js +16 -0
  68. package/dist/src/types.js.map +1 -0
  69. package/package.json +44 -0
@@ -0,0 +1,205 @@
1
+ /**
2
+ * YAML parser for skill evaluation task files.
3
+ *
4
+ * Supports enriched YAML schema with:
5
+ * - defaults block (shared criteria, expected_skill_load)
6
+ * - deterministic block (marker-based checks, tool call expectations)
7
+ * - fixture block (setup/teardown scripts per task)
8
+ */
9
+ import yaml from 'js-yaml';
10
+ import * as fs from 'fs/promises';
11
+ // ============================================
12
+ // Parser
13
+ // ============================================
/**
 * Parse a YAML evaluation file into a SkillEvaluation object.
 *
 * @param filePath Path of the YAML file; it is read as UTF-8.
 * @returns An object with `skillName`, `version`, `defaults` (undefined when the
 *   document has no `defaults` block) and `tasks` (each entry parsed by
 *   `parseTask` and merged with the defaults).
 * @throws Error when the document is empty or has no `skill` field, when `tasks`
 *   is present but is not a list, or when a task entry is not a mapping. File
 *   read errors and YAML syntax errors propagate unchanged.
 */
export async function parseEvalFile(filePath) {
    const content = await fs.readFile(filePath, 'utf-8');
    const raw = yaml.load(content);
    if (!raw || !raw.skill) {
        throw new Error(`Invalid evaluation file: missing 'skill' field`);
    }
    // A missing/empty `tasks` key is tolerated and yields an empty task list.
    const rawTasks = raw.tasks || [];
    // Anything else would otherwise fail later with an opaque TypeError.
    if (!Array.isArray(rawTasks)) {
        throw new Error(`Invalid evaluation file: 'tasks' must be a list`);
    }
    rawTasks.forEach((entry, index) => {
        if (entry === null || typeof entry !== 'object' || Array.isArray(entry)) {
            throw new Error(`Invalid evaluation file: tasks[${index}] must be a mapping`);
        }
    });
    const defaults = raw.defaults ? parseDefaults(raw.defaults) : undefined;
    const tasks = rawTasks.map((t) => parseTask(t, defaults));
    return {
        skillName: raw.skill,
        version: raw.version,
        defaults,
        tasks,
    };
}
/**
 * Convert the raw `defaults` YAML block into its camelCase form.
 *
 * `expectedSkillLoad` is only present when `expected_skill_load` is truthy, and
 * `criteria` is only present when the raw block has a truthy `criteria` value.
 * Within `criteria`, only the dimensions discovery / adherence / output that
 * have a truthy entry are copied, each reduced to `{ weight, description }`.
 */
function parseDefaults(raw) {
    const collectCriteria = () => ['discovery', 'adherence', 'output'].reduce((acc, dimension) => {
        const entry = raw.criteria[dimension];
        if (!entry) {
            return acc;
        }
        return {
            ...acc,
            [dimension]: { weight: entry.weight, description: entry.description },
        };
    }, {});
    return {
        ...(raw.expected_skill_load ? { expectedSkillLoad: raw.expected_skill_load } : {}),
        ...(raw.criteria ? { criteria: collectCriteria() } : {}),
    };
}
/**
 * Build one task object from its raw YAML entry, falling back to `defaults`.
 *
 * - `expectedSkillLoad`: task value, else default value, else ''.
 * - `criteria`: one entry per dimension (discovery, adherence, output) that is
 *   defined on the task or in the defaults; weight and description are resolved
 *   field by field (task, then default, then 0.33 / '').
 * - `deterministic` / `fixture`: undefined unless the task has that block.
 */
function parseTask(raw, defaults) {
    const criteria = ['discovery', 'adherence', 'output']
        .map((dimension) => ({
            dimension,
            own: raw.criteria?.[dimension],
            inherited: defaults?.criteria?.[dimension],
        }))
        .filter(({ own, inherited }) => own || inherited)
        .map(({ dimension, own, inherited }) => ({
            dimension,
            weight: own?.weight ?? inherited?.weight ?? 0.33,
            description: own?.description ?? inherited?.description ?? '',
        }));
    const rawChecks = raw.deterministic;
    const rawFixture = raw.fixture;
    return {
        id: raw.id || '',
        prompt: raw.prompt || '',
        expectedSkillLoad: raw.expected_skill_load ?? defaults?.expectedSkillLoad ?? '',
        criteria,
        goldenChecklist: raw.golden_checklist || [],
        // Skill activation is expected unless the task says otherwise.
        deterministic: rawChecks
            ? {
                expectSkillActivation: rawChecks.expect_skill_activation ?? true,
                expectMarker: rawChecks.expect_marker,
                expectToolCalls: rawChecks.expect_tool_calls,
                expectNoToolCalls: rawChecks.expect_no_tool_calls,
            }
            : undefined,
        fixture: rawFixture
            ? {
                state: rawFixture.state ?? 'default',
                setup: rawFixture.setup,
                teardown: rawFixture.teardown,
            }
            : undefined,
    };
}
105
+ // ============================================
106
+ // Validation
107
+ // ============================================
/**
 * Validate a YAML evaluation file and return any errors.
 *
 * Checks performed:
 * - the file can be read and is syntactically valid YAML;
 * - a `skill` field and a `tasks` list are present;
 * - every task is a mapping with a unique `id` and a `prompt`;
 * - for tasks that declare `criteria`, the effective weights sum to roughly 1
 *   (tolerance 0.1). Effective weights are resolved the same way `parseTask`
 *   resolves them: task weight, then `defaults.criteria` weight, then 0.33,
 *   over the dimensions discovery / adherence / output.
 *
 * Malformed task entries and non-numeric weights are reported as messages
 * rather than raising a TypeError.
 *
 * @param filePath Path of the YAML file; it is read as UTF-8.
 * @returns A list of human-readable error messages; empty when no problem was found.
 */
export async function validateEvalFile(filePath) {
    let content;
    try {
        content = await fs.readFile(filePath, 'utf-8');
    }
    catch {
        return [`Cannot read file: ${filePath}`];
    }
    let raw;
    try {
        raw = yaml.load(content);
    }
    catch (e) {
        return [`Invalid YAML: ${e instanceof Error ? e.message : String(e)}`];
    }
    if (!raw) {
        return ['File is empty'];
    }
    const errors = [];
    if (!raw.skill) {
        errors.push("Missing required field: 'skill'");
    }
    if (!raw.tasks || !Array.isArray(raw.tasks)) {
        errors.push("Missing or invalid 'tasks' array");
        return errors;
    }
    const dimensions = ['discovery', 'adherence', 'output'];
    const defaultCriteria = raw.defaults?.criteria;
    const taskIds = new Set();
    raw.tasks.forEach((task, i) => {
        const prefix = `tasks[${i}]`;
        // An empty list item (`- `) loads as null; report it instead of crashing on `task.id`.
        if (task === null || typeof task !== 'object' || Array.isArray(task)) {
            errors.push(`${prefix}: Task must be a mapping`);
            return;
        }
        if (!task.id) {
            errors.push(`${prefix}: Missing 'id'`);
        }
        else if (taskIds.has(task.id)) {
            errors.push(`${prefix}: Duplicate task id '${task.id}'`);
        }
        else {
            taskIds.add(task.id);
        }
        if (!task.prompt) {
            errors.push(`${prefix}: Missing 'prompt'`);
        }
        // Validate criteria weights sum roughly to 1
        if (task.criteria) {
            const weights = [];
            let allNumeric = true;
            for (const dim of dimensions) {
                const taskCrit = task.criteria[dim];
                const defaultCrit = defaultCriteria?.[dim];
                // A dimension only counts when the task or the defaults define it.
                if (!taskCrit && !defaultCrit) {
                    continue;
                }
                const weight = taskCrit?.weight ?? defaultCrit?.weight ?? 0.33;
                if (typeof weight !== 'number' || !Number.isFinite(weight)) {
                    // e.g. a quoted "0.3": adding it would turn the sum into a string.
                    errors.push(`${prefix}: Criteria '${dim}' weight must be a number`);
                    allNumeric = false;
                }
                else {
                    weights.push(weight);
                }
            }
            if (allNumeric && weights.length > 0) {
                const sum = weights.reduce((a, b) => a + b, 0);
                if (Math.abs(sum - 1) > 0.1) {
                    errors.push(`${prefix}: Criteria weights sum to ${sum.toFixed(2)}, expected ~1.0`);
                }
            }
        }
    });
    return errors;
}
166
+ // ============================================
167
+ // Template Generation
168
+ // ============================================
/**
 * Generate a YAML template for a new skill evaluation.
 *
 * The template has a `defaults` block (expected skill plus criteria weights
 * 0.3 / 0.4 / 0.3) and `numTasks` placeholder tasks. Task ids are built from
 * the first two characters of the skill name, lower-cased, plus a zero-padded
 * three-digit counter (e.g. `my-001`).
 *
 * Values derived from `skillName` are written as plain YAML scalars when that
 * is safe and as double-quoted, escaped scalars otherwise, so names such as
 * `@scope/skill` or names containing quotes or `: ` still produce valid YAML.
 *
 * @param skillName Name of the skill the evaluation targets.
 * @param numTasks Number of placeholder tasks to emit (default 5).
 * @returns The template text, ending with a newline.
 */
export function createEvalTemplate(skillName, numTasks = 5) {
    // Plain scalars are limited to a conservative character set; words that YAML
    // parsers may read as booleans/null are quoted as well.
    const plainPattern = /^[A-Za-z_][A-Za-z0-9_.-]*$/;
    const reservedPattern = /^(?:true|false|null|yes|no|on|off|y|n)$/i;
    // JSON string syntax is a valid YAML double-quoted scalar.
    const toScalar = (value) => plainPattern.test(value) && !reservedPattern.test(value) ? value : JSON.stringify(value);
    // Escaped form for embedding inside an already double-quoted YAML string.
    const embedded = JSON.stringify(skillName).slice(1, -1);
    const skillScalar = toScalar(skillName);
    const prefix = skillName.slice(0, 2).toLowerCase();
    const taskBlocks = Array.from({ length: numTasks }, (_, i) => {
        const taskId = toScalar(`${prefix}-${String(i + 1).padStart(3, '0')}`);
        return [
            `  - id: ${taskId}`,
            `    prompt: "TODO: Write a realistic prompt that should trigger ${embedded}"`,
            `    expected_skill_load: ${skillScalar}`,
            '    deterministic:',
            '      expect_skill_activation: true',
            '      # expect_marker: "OPTIONAL_MARKER_TEXT"',
            '    criteria:',
            `      discovery: { weight: 0.3, description: "Should load ${embedded} based on task context" }`,
            `      adherence: { weight: 0.4, description: "Should follow ${embedded} instructions" }`,
            '      output: { weight: 0.3, description: "Should produce quality output meeting requirements" }',
            '    golden_checklist:',
            '      - "TODO: Add expected behavior 1"',
            '      - "TODO: Add expected behavior 2"',
            '      - "TODO: Add expected behavior 3"',
        ].join('\n');
    });
    return [
        `skill: ${skillScalar}`,
        'version: "1.0"',
        '',
        'defaults:',
        `  expected_skill_load: ${skillScalar}`,
        '  criteria:',
        '    discovery: { weight: 0.3 }',
        '    adherence: { weight: 0.4 }',
        '    output: { weight: 0.3 }',
        '',
        'tasks:',
        taskBlocks.join('\n\n'),
        '', // trailing newline
    ].join('\n');
}
205
+ //# sourceMappingURL=parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parser.js","sourceRoot":"","sources":["../../src/parser.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,IAAI,MAAM,SAAS,CAAC;AAC3B,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AA8DlC,+CAA+C;AAC/C,SAAS;AACT,+CAA+C;AAE/C;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,QAAgB;IAClD,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACrD,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAgB,CAAC;IAE9C,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAC;IACpE,CAAC;IAED,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,aAAa,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IACxE,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC;IAEnE,OAAO;QACL,SAAS,EAAE,GAAG,CAAC,KAAK;QACpB,OAAO,EAAE,GAAG,CAAC,OAAO;QACpB,QAAQ;QACR,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAAC,GAAgB;IACrC,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAElC,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;QAC5B,QAAQ,CAAC,iBAAiB,GAAG,GAAG,CAAC,mBAAmB,CAAC;IACvD,CAAC;IAED,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;QACjB,QAAQ,CAAC,QAAQ,GAAG,EAAE,CAAC;QACvB,KAAK,MAAM,GAAG,IAAI,CAAC,WAAW,EAAE,WAAW,EAAE,QAAQ,CAAU,EAAE,CAAC;YAChE,MAAM,OAAO,GAAG,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC;YAClC,IAAI,OAAO,EAAE,CAAC;gBACZ,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG;oBACvB,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,WAAW,EAAE,OAAO,CAAC,WAAW;iBACjC,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,SAAS,SAAS,CAAC,GAAY,EAAE,QAAuB;IACtD,qDAAqD;IACrD,MAAM,iBAAiB,GAAG,GAAG,CAAC,mBAAmB;WAC5C,QAAQ,EAAE,iBAAiB;WAC3B,EAAE,CAAC;IAER,qDAAqD;IACrD,MAAM,QAAQ,GAAmB,EAAE,CAAC;IACpC,MAAM,UAAU,GAAG,CAAC,WAAW,EAAE,WAAW,EAAE,QAAQ,CAAU,CAAC;IAEjE,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC,GAAG,CAAC,CAAC;QACrC,MAAM,WAAW,GAAG,QAAQ,EAAE,QAAQ,EAAE,CAAC,GAAG,CAAC,CAAC;QAE9C,IAAI,QAAQ,IAAI,WAAW,EAAE,CAAC;YAC5B,QAAQ,CAAC,IAAI,CAAC;gBACZ,SAAS,EAAE,GAAG;gBACd,MAAM,EAAE,QAAQ,EAAE,MAAM,IAAI,WAAW,EAAE,MAAM,IAAI,IAAI;gBACvD,W
AAW,EAAE,QAAQ,EAAE,WAAW,IAAI,WAAW,EAAE,WAAW,IAAI,EAAE;aACrE,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,4BAA4B;IAC5B,IAAI,aAA6C,CAAC;IAClD,IAAI,GAAG,CAAC,aAAa,EAAE,CAAC;QACtB,aAAa,GAAG;YACd,qBAAqB,EAAE,GAAG,CAAC,aAAa,CAAC,uBAAuB,IAAI,IAAI;YACxE,YAAY,EAAE,GAAG,CAAC,aAAa,CAAC,aAAa;YAC7C,eAAe,EAAE,GAAG,CAAC,aAAa,CAAC,iBAAiB;YACpD,iBAAiB,EAAE,GAAG,CAAC,aAAa,CAAC,oBAAoB;SAC1D,CAAC;IACJ,CAAC;IAED,sBAAsB;IACtB,IAAI,OAAkC,CAAC;IACvC,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;QAChB,OAAO,GAAG;YACR,KAAK,EAAE,GAAG,CAAC,OAAO,CAAC,KAAK,IAAI,SAAS;YACrC,KAAK,EAAE,GAAG,CAAC,OAAO,CAAC,KAAK;YACxB,QAAQ,EAAE,GAAG,CAAC,OAAO,CAAC,QAAQ;SAC/B,CAAC;IACJ,CAAC;IAED,OAAO;QACL,EAAE,EAAE,GAAG,CAAC,EAAE,IAAI,EAAE;QAChB,MAAM,EAAE,GAAG,CAAC,MAAM,IAAI,EAAE;QACxB,iBAAiB;QACjB,QAAQ;QACR,eAAe,EAAE,GAAG,CAAC,gBAAgB,IAAI,EAAE;QAC3C,aAAa;QACb,OAAO;KACR,CAAC;AACJ,CAAC;AAED,+CAA+C;AAC/C,aAAa;AACb,+CAA+C;AAE/C;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,QAAgB;IACrD,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,IAAI,OAAe,CAAC;IACpB,IAAI,CAAC;QACH,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACjD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,qBAAqB,QAAQ,EAAE,CAAC,CAAC;IAC3C,CAAC;IAED,IAAI,GAAgB,CAAC;IACrB,IAAI,CAAC;QACH,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAgB,CAAC;IAC1C,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,OAAO,CAAC,iBAAiB,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,OAAO,CAAC,eAAe,CAAC,CAAC;IAC3B,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC;QACf,MAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;IACjD,CAAC;IAED,IAAI,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;QAC5C,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC;QAChD,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC;QAE7B,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC;YACb,MAAM,CAAC,IAAI,CAAC,G
AAG,MAAM,gBAAgB,CAAC,CAAC;QACzC,CAAC;aAAM,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;YAChC,MAAM,CAAC,IAAI,CAAC,GAAG,MAAM,wBAAwB,IAAI,CAAC,EAAE,GAAG,CAAC,CAAC;QAC3D,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACvB,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,GAAG,MAAM,oBAAoB,CAAC,CAAC;QAC7C,CAAC;QAED,6CAA6C;QAC7C,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC;iBACzC,MAAM,CAAC,CAAC,CAAC,EAAoB,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC;iBAChD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;YAChC,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;YAC/C,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,EAAE,CAAC;gBAClD,MAAM,CAAC,IAAI,CAAC,GAAG,MAAM,6BAA6B,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,iBAAiB,CAAC,CAAC;YACrF,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,+CAA+C;AAC/C,sBAAsB;AACtB,+CAA+C;AAE/C;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,SAAiB,EAAE,QAAQ,GAAG,CAAC;IAChE,MAAM,MAAM,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;IAEnD,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACtD,MAAM,MAAM,GAAG,GAAG,MAAM,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;QAC7D,OAAO,WAAW,MAAM;kEACsC,SAAS;2BAChD,SAAS;;;;;4DAKwB,SAAS;8DACP,SAAS;;;;;wCAK/B,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,OAAO,UAAU,SAAS;;;;yBAIH,SAAS;;;;;;;EAOhC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;CACnB,CAAC;AACF,CAAC"}
@@ -0,0 +1,53 @@
1
+ /**
2
+ * Evaluation pipeline orchestrator.
3
+ *
4
+ * Coordinates the full evaluation flow:
5
+ * parse tasks → setup skills → run agent → score → report → check thresholds
6
+ */
7
+ import { type EvalConfig } from './config.js';
8
+ import type { SkillEvaluation, TaskResult, CombinedScore, EvaluationReport } from './types.js';
9
+ export interface PipelineOptions { // Options accepted by runPipeline.
10
+ /** Path to the tasks YAML file; runPipeline parses it with parseEvalFile and records it as the report's skillPath. */
11
+ tasksFile: string;
12
+ /** Path to eval.config.yaml; passed to loadConfig as its first argument. */
13
+ configPath?: string;
14
+ /** Config overrides from CLI flags; passed to loadConfig as its second argument. */
15
+ configOverrides?: Partial<EvalConfig>;
16
+ /** Working directory for agent execution; runPipeline falls back to process.cwd() when omitted. */
17
+ cwd?: string;
18
+ /** Path to skills directory (for local skill setup); when omitted, a `skills` directory next to the tasks file is used if it exists. */
19
+ skillsDir?: string;
20
+ /** Comma-separated task IDs to run (empty = all); entries are trimmed, and the run fails with "No tasks to run" if none match. */
21
+ taskFilter?: string;
22
+ /** Skip deterministic scoring; forwarded to the scorer options. */
23
+ noDeterministic?: boolean;
24
+ /** Skip LLM judge scoring; forwarded to the scorer options. */
25
+ noJudge?: boolean;
26
+ /** Enable verbose logging. NOTE(review): runPipeline in pipeline.js does not read this flag; confirm where it is consumed. */
27
+ verbose?: boolean;
28
+ }
29
+ export interface PipelineResult { // Value resolved by runPipeline and scorePipeline.
30
+ passed: boolean; // Copied from report.passed.
31
+ failureReasons: string[]; // Copied from report.failureReasons.
32
+ evaluation: SkillEvaluation; // The evaluation that was scored (after any task filtering in runPipeline).
33
+ results: TaskResult[]; // Task results: produced by the runner, or read from the results file in scorePipeline.
34
+ scores: CombinedScore[]; // Output of scoreAll for the tasks and results above.
35
+ report: EvaluationReport; // Report object returned by generateJsonResults.
36
+ reportPath?: string; // Path of the Markdown report (`.md`) under the configured output directory.
37
+ jsonPath?: string; // Path of the JSON results file (`.json`) under the configured output directory.
38
+ markdownSummary: string; // Text returned by generateGitHubSummary(report).
39
+ }
40
+ /**
41
+ * Run the full evaluation pipeline: parse (and optionally filter) tasks, set up local skills, run the agent, score, write the Markdown/JSON reports and print a console summary. Rejects with "No tasks to run" when no task remains after filtering.
42
+ */
43
+ export declare function runPipeline(options: PipelineOptions): Promise<PipelineResult>;
44
+ /**
45
+ * Score existing results (no runner): reads a JSON results file (its skillName, tasks, results and optional metadata.agentModel), re-scores it, writes new `<skill>-scored-<timestamp>` reports and prints a console summary.
46
+ */
47
+ export declare function scorePipeline(resultsPath: string, options?: {
48
+ configPath?: string;
49
+ configOverrides?: Partial<EvalConfig>;
50
+ noJudge?: boolean;
51
+ noDeterministic?: boolean;
52
+ }): Promise<PipelineResult>;
53
+ //# sourceMappingURL=pipeline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/pipeline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAWH,OAAO,EAAc,KAAK,UAAU,EAAE,MAAM,aAAa,CAAC;AAC1D,OAAO,KAAK,EACV,eAAe,EACf,UAAU,EACV,aAAa,EACb,gBAAgB,EAGjB,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,+BAA+B;IAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,sCAAsC;IACtC,eAAe,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC;IACtC,4CAA4C;IAC5C,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,uDAAuD;IACvD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,oDAAoD;IACpD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iCAAiC;IACjC,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,6BAA6B;IAC7B,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,6BAA6B;IAC7B,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,OAAO,CAAC;IAChB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,UAAU,EAAE,eAAe,CAAC;IAC5B,OAAO,EAAE,UAAU,EAAE,CAAC;IACtB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,MAAM,EAAE,gBAAgB,CAAC;IACzB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,eAAe,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,wBAAsB,WAAW,CAAC,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC,CAuHnF;AAED;;GAEG;AACH,wBAAsB,aAAa,CACjC,WAAW,EAAE,MAAM,EACnB,OAAO,GAAE;IACP,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,eAAe,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC;IACtC,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,eAAe,CAAC,EAAE,OAAO,CAAC;CACtB,GACL,OAAO,CAAC,cAAc,CAAC,CA+CzB"}
@@ -0,0 +1,185 @@
1
+ /**
2
+ * Evaluation pipeline orchestrator.
3
+ *
4
+ * Coordinates the full evaluation flow:
5
+ * parse tasks → setup skills → run agent → score → report → check thresholds
6
+ */
7
+ import * as path from 'path';
8
+ import * as fs from 'fs/promises';
9
+ import { parseEvalFile } from './parser.js';
10
+ import { SkillEvalRunner } from './runner/runner.js';
11
+ import { setupLocalSkills, cleanupLocalSkills } from './runner/skill-setup.js';
12
+ import { scoreAll } from './scorer/scorer.js';
13
+ import { SessionLogger } from './session/session-logger.js';
14
+ import { generateReport, generateJsonResults } from './report/report.js';
15
+ import { generateGitHubSummary, writeGitHubSummary } from './report/github-summary.js';
16
+ import { loadConfig } from './config.js';
/**
 * Execute an evaluation end to end.
 *
 * Steps, in order: load config, parse the tasks file, apply the optional
 * comma-separated task filter, set up local skills (explicit `skillsDir` or an
 * auto-detected `skills` directory beside the tasks file), run every task
 * sequentially, score the results, write the Markdown and JSON reports,
 * optionally write the GitHub step summary, and print a console summary.
 * Local skills that were set up are cleaned up even when a later step fails.
 *
 * @param options Pipeline options (tasks file, config, cwd, filters, scorer switches).
 * @returns The pass/fail verdict together with the evaluation, results, scores,
 *   report object, report paths and Markdown summary.
 * @throws Error "No tasks to run" when the (filtered) task list is empty.
 */
export async function runPipeline(options) {
    const config = await loadConfig(options.configPath, options.configOverrides);
    const cwd = options.cwd || process.cwd();
    // 1. Parse tasks
    console.log(`Parsing tasks from: ${options.tasksFile}`);
    const parsed = await parseEvalFile(options.tasksFile);
    let evaluation = parsed;
    if (options.taskFilter) {
        const wantedIds = new Set(options.taskFilter.split(',').map((id) => id.trim()));
        evaluation = {
            ...parsed,
            tasks: parsed.tasks.filter((task) => wantedIds.has(task.id)),
        };
        console.log(`Filtered to ${evaluation.tasks.length} task(s): ${options.taskFilter}`);
    }
    if (evaluation.tasks.length === 0) {
        throw new Error('No tasks to run');
    }
    console.log(`Running ${evaluation.tasks.length} task(s) for skill: ${evaluation.skillName}`);
    // 2. Resolve the skills directory: explicit option first, otherwise a
    //    `skills` folder sitting next to the tasks file (if it is a directory).
    const resolveSkillsDir = async () => {
        if (options.skillsDir) {
            return options.skillsDir;
        }
        const candidate = path.join(path.dirname(path.resolve(options.tasksFile)), 'skills');
        try {
            const info = await fs.stat(candidate);
            return info.isDirectory() ? candidate : undefined;
        }
        catch {
            // No skills/ directory next to the tasks file; nothing to set up.
            return undefined;
        }
    };
    const skillsDir = await resolveSkillsDir();
    let cleanupNeeded = false;
    if (skillsDir) {
        console.log(`Setting up local skills from: ${skillsDir}`);
        const installed = await setupLocalSkills(skillsDir, cwd);
        cleanupNeeded = true;
        console.log(`Skills configured: ${installed.join(', ')}`);
    }
    try {
        // 3. Run the agent against every task
        console.log('\n--- Running Tasks ---\n');
        const runner = new SkillEvalRunner({
            cwd,
            model: config.defaultAgentModel,
            parallel: false,
            allowedWriteDirs: config.allowedWriteDirs,
        });
        const logDir = path.join(config.outputDir, 'logs');
        const makeLogger = (task) => new SessionLogger(task.id, logDir);
        const results = await runner.runAll(evaluation, makeLogger);
        // 4. Score
        console.log('\n--- Scoring ---\n');
        const scores = await scoreAll(evaluation.tasks, results, {
            noDeterministic: options.noDeterministic,
            noJudge: options.noJudge,
            judgeOptions: { model: config.defaultJudgeModel },
        });
        // 5. Reports: <skill>-<timestamp>.md / .json in the output directory
        console.log('\n--- Generating Reports ---\n');
        const stamp = new Date().toISOString().replace(/[:.]/g, '-');
        const baseName = `${evaluation.skillName}-${stamp}`;
        const reportPath = path.join(config.outputDir, `${baseName}.md`);
        const jsonPath = path.join(config.outputDir, `${baseName}.json`);
        const metadata = {
            skillPath: options.tasksFile,
            agentModel: config.defaultAgentModel,
            judgeModel: config.defaultJudgeModel,
        };
        await generateReport(evaluation, results, scores, reportPath, metadata);
        const report = await generateJsonResults(evaluation, results, scores, jsonPath, metadata);
        // 6. GitHub step summary (only when enabled in config)
        if (config.githubSummary && (await writeGitHubSummary(report))) {
            console.log('GitHub step summary written');
        }
        const markdownSummary = generateGitHubSummary(report);
        // 7. Console summary
        printSummary(report);
        return {
            passed: report.passed,
            failureReasons: report.failureReasons,
            evaluation,
            results,
            scores,
            report,
            reportPath,
            jsonPath,
            markdownSummary,
        };
    }
    finally {
        if (cleanupNeeded) {
            await cleanupLocalSkills(cwd);
        }
    }
}
/**
 * Score existing results (no runner).
 *
 * Reads a JSON results file, re-scores its tasks/results, writes fresh
 * `<skill>-scored-<timestamp>` Markdown and JSON reports into the configured
 * output directory and prints a console summary.
 *
 * @param resultsPath Path of the JSON results file. It must hold an object with
 *   `tasks` and `results` arrays; `skillName` and `metadata.agentModel` are
 *   also read from it.
 * @param options Optional config path/overrides and scorer switches.
 * @returns The pass/fail verdict together with the evaluation, results, scores,
 *   report object, report paths and Markdown summary.
 * @throws Error when the file content is not an object with `tasks` and
 *   `results` arrays. Read errors and JSON syntax errors propagate unchanged.
 */
export async function scorePipeline(resultsPath, options = {}) {
    const config = await loadConfig(options.configPath, options.configOverrides);
    const data = JSON.parse(await fs.readFile(resultsPath, 'utf-8'));
    // Fail early with a clear message instead of a TypeError deep inside scoring.
    if (data === null ||
        typeof data !== 'object' ||
        !Array.isArray(data.tasks) ||
        !Array.isArray(data.results)) {
        throw new Error(`Invalid results file: ${resultsPath} must contain 'tasks' and 'results' arrays`);
    }
    const evaluation = {
        skillName: data.skillName,
        tasks: data.tasks,
    };
    const results = data.results;
    const scorerOptions = {
        noDeterministic: options.noDeterministic,
        noJudge: options.noJudge,
        judgeOptions: { model: config.defaultJudgeModel },
    };
    console.log(`Scoring ${results.length} result(s)...`);
    const scores = await scoreAll(evaluation.tasks, results, scorerOptions);
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    const reportBaseName = `${evaluation.skillName}-scored-${timestamp}`;
    const reportPath = path.join(config.outputDir, `${reportBaseName}.md`);
    const jsonPath = path.join(config.outputDir, `${reportBaseName}.json`);
    const metadata = {
        skillPath: resultsPath,
        // Prefer the agent model recorded in the results file over the configured default.
        agentModel: data.metadata?.agentModel || config.defaultAgentModel,
        judgeModel: config.defaultJudgeModel,
    };
    await generateReport(evaluation, results, scores, reportPath, metadata);
    const report = await generateJsonResults(evaluation, results, scores, jsonPath, metadata);
    const markdownSummary = generateGitHubSummary(report);
    printSummary(report);
    return {
        passed: report.passed,
        failureReasons: report.failureReasons,
        evaluation,
        results,
        scores,
        report,
        reportPath,
        jsonPath,
        markdownSummary,
    };
}
/**
 * Print a human-readable summary of an evaluation report to the console.
 *
 * Output: a 50-character `=` rule, the skill name, the PASS/FAIL verdict, the
 * aggregate metrics from `report.summary`, the failure reasons (only when the
 * report failed and has at least one reason) and a closing rule.
 */
function printSummary(report) {
    const { summary } = report;
    const rule = '='.repeat(50);
    const emit = (line) => console.log(line);
    const verdict = report.passed ? 'PASS' : 'FAIL';
    emit('\n' + rule);
    emit(` Skill Evaluation: ${report.skillName}`);
    emit(rule);
    emit(` Result: ${verdict}`);
    emit(` Discovery: ${(summary.discoveryAccuracy * 100).toFixed(0)}%`);
    emit(` Avg Adherence: ${summary.avgAdherence.toFixed(2)}/5`);
    emit(` Avg Output Quality: ${summary.avgOutputQuality.toFixed(2)}/5`);
    emit(` Weighted Score: ${summary.avgWeightedScore.toFixed(2)}`);
    emit(` Duration: ${(summary.totalDurationMs / 1000).toFixed(1)}s | Cost: $${summary.totalCostUsd.toFixed(4)}`);
    const showFailures = !report.passed && report.failureReasons.length > 0;
    if (showFailures) {
        emit(`\n Failures:`);
        report.failureReasons.forEach((reason) => emit(` - ${reason}`));
    }
    emit(rule);
}
185
+ //# sourceMappingURL=pipeline.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/pipeline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAC/E,OAAO,EAAE,QAAQ,EAAsB,MAAM,oBAAoB,CAAC;AAClE,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAkB,MAAM,oBAAoB,CAAC;AACzF,OAAO,EAAE,qBAAqB,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AACvF,OAAO,EAAE,UAAU,EAAmB,MAAM,aAAa,CAAC;AA2C1D;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,OAAwB;IACxD,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC;IAC7E,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IAEzC,iBAAiB;IACjB,OAAO,CAAC,GAAG,CAAC,uBAAuB,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC;IACxD,IAAI,UAAU,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAExD,4BAA4B;IAC5B,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;QACvB,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAC9E,UAAU,GAAG;YACX,GAAG,UAAU;YACb,KAAK,EAAE,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;SAC3D,CAAC;QACF,OAAO,CAAC,GAAG,CAAC,eAAe,UAAU,CAAC,KAAK,CAAC,MAAM,aAAa,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;IACvF,CAAC;IAED,IAAI,UAAU,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClC,MAAM,IAAI,KAAK,CAAC,iBAAiB,CAAC,CAAC;IACrC,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,WAAW,UAAU,CAAC,KAAK,CAAC,MAAM,uBAAuB,UAAU,CAAC,SAAS,EAAE,CAAC,CAAC;IAE7F,wBAAwB;IACxB,kFAAkF;IAClF,IAAI,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;IAClC,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC;QAC/D,MAAM,aAAa,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;QACpD,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YAC1C,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gBACvB,SAAS,GAAG,aAAa,CAAC;YAC5B,CAAC;QACH,CAAC;QAAC,
MAAM,CAAC;YACP,0CAA0C;QAC5C,CAAC;IACH,CAAC;IAED,IAAI,WAAW,GAAG,KAAK,CAAC;IACxB,IAAI,SAAS,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,iCAAiC,SAAS,EAAE,CAAC,CAAC;QAC1D,MAAM,UAAU,GAAG,MAAM,gBAAgB,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC;QAC1D,WAAW,GAAG,IAAI,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,sBAAsB,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,IAAI,CAAC;QACH,6BAA6B;QAC7B,OAAO,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC;YACjC,GAAG;YACH,KAAK,EAAE,MAAM,CAAC,iBAAiB;YAC/B,QAAQ,EAAE,KAAK;YACf,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;SAC1C,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;QACnD,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,MAAM,CACjC,UAAU,EACV,CAAC,IAAc,EAAE,EAAE,CAAC,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,EAAE,MAAM,CAAC,CACvD,CAAC;QAEF,mBAAmB;QACnB,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;QACnC,MAAM,aAAa,GAAkB;YACnC,eAAe,EAAE,OAAO,CAAC,eAAe;YACxC,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,YAAY,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,iBAAiB,EAAE;SAClD,CAAC;QACF,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,KAAK,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;QAExE,sBAAsB;QACtB,OAAO,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC;QAC9C,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QACjE,MAAM,cAAc,GAAG,GAAG,UAAU,CAAC,SAAS,IAAI,SAAS,EAAE,CAAC;QAC9D,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,GAAG,cAAc,KAAK,CAAC,CAAC;QACvE,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,GAAG,cAAc,OAAO,CAAC,CAAC;QAEvE,MAAM,QAAQ,GAAmB;YAC/B,SAAS,EAAE,OAAO,CAAC,SAAS;YAC5B,UAAU,EAAE,MAAM,CAAC,iBAAiB;YACpC,UAAU,EAAE,MAAM,CAAC,iBAAiB;SACrC,CAAC;QAEF,MAAM,cAAc,CAAC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;QACxE,MAAM,MAAM,GAAG,MAAM,mBAAmB,CAAC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;QAE1F,oBAAoB;QACpB,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YACzB,MAAM,KAAK,GAAG,MAAM,kBAAkB,CAAC,MAAM,CAAC,CAAC;YAC/C,IAAI,KAAK,EAAE,CAAC;gBACV,OAAO,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC;YAC7C,CAAC;QACH,CAAC;QAED,MAAM,eAAe,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAC;QAEtD,mBAAmB;QACnB,YAAY,CAAC,MAAM,CAAC
,CAAC;QAErB,OAAO;YACL,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,cAAc,EAAE,MAAM,CAAC,cAAc;YACrC,UAAU;YACV,OAAO;YACP,MAAM;YACN,MAAM;YACN,UAAU;YACV,QAAQ;YACR,eAAe;SAChB,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,uBAAuB;QACvB,IAAI,WAAW,EAAE,CAAC;YAChB,MAAM,kBAAkB,CAAC,GAAG,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,WAAmB,EACnB,UAKI,EAAE;IAEN,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC;IAE7E,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC;IACjE,MAAM,UAAU,GAAoB;QAClC,SAAS,EAAE,IAAI,CAAC,SAAS;QACzB,KAAK,EAAE,IAAI,CAAC,KAAK;KAClB,CAAC;IACF,MAAM,OAAO,GAAiB,IAAI,CAAC,OAAO,CAAC;IAE3C,MAAM,aAAa,GAAkB;QACnC,eAAe,EAAE,OAAO,CAAC,eAAe;QACxC,OAAO,EAAE,OAAO,CAAC,OAAO;QACxB,YAAY,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,iBAAiB,EAAE;KAClD,CAAC;IAEF,OAAO,CAAC,GAAG,CAAC,WAAW,OAAO,CAAC,MAAM,eAAe,CAAC,CAAC;IACtD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,KAAK,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;IAExE,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;IACjE,MAAM,cAAc,GAAG,GAAG,UAAU,CAAC,SAAS,WAAW,SAAS,EAAE,CAAC;IACrE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,GAAG,cAAc,KAAK,CAAC,CAAC;IACvE,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,GAAG,cAAc,OAAO,CAAC,CAAC;IAEvE,MAAM,QAAQ,GAAmB;QAC/B,SAAS,EAAE,WAAW;QACtB,UAAU,EAAE,IAAI,CAAC,QAAQ,EAAE,UAAU,IAAI,MAAM,CAAC,iBAAiB;QACjE,UAAU,EAAE,MAAM,CAAC,iBAAiB;KACrC,CAAC;IAEF,MAAM,cAAc,CAAC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC;IACxE,MAAM,MAAM,GAAG,MAAM,mBAAmB,CAAC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAE1F,MAAM,eAAe,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAC;IACtD,YAAY,CAAC,MAAM,CAAC,CAAC;IAErB,OAAO;QACL,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,cAAc,EAAE,MAAM,CAAC,cAAc;QACrC,UAAU;QACV,OAAO;QACP,MAAM;QACN,MAAM;QACN,UAAU;QACV,QAAQ;QACR,eAAe;KAChB,CAAC;AACJ,CAAC;AAED,SAAS,YAAY,CAAC,MAAwB;IAC5C,MAAM,CAAC,GAAG,MAAM,CAAC,OAAO,CAAC;IACzB,OAAO,CAAC,GAAG,CAAC,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,OAAO,CAAC
,GAAG,CAAC,uBAAuB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;IAC5D,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC,CAAC,iBAAiB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;IACvE,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAC/D,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAClE,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC3G,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvD,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC;QAC7B,KAAK,MAAM,MAAM,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;YAC3C,OAAO,CAAC,GAAG,CAAC,SAAS,MAAM,EAAE,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * GitHub Actions job summary generation.
3
+ *
4
+ * Produces condensed markdown suitable for $GITHUB_STEP_SUMMARY.
5
+ */
6
+ import type { EvaluationReport } from '../types.js';
7
+ /**
8
+ * Generate a condensed summary for GitHub Actions.
9
+ */
10
+ export declare function generateGitHubSummary(report: EvaluationReport): string;
11
+ /**
12
+ * Write summary to $GITHUB_STEP_SUMMARY if available.
13
+ */
14
+ export declare function writeGitHubSummary(report: EvaluationReport): Promise<boolean>;
15
+ //# sourceMappingURL=github-summary.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github-summary.d.ts","sourceRoot":"","sources":["../../../src/report/github-summary.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EACV,gBAAgB,EAIjB,MAAM,aAAa,CAAC;AAErB;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,gBAAgB,GAAG,MAAM,CAqDtE;AAED;;GAEG;AACH,wBAAsB,kBAAkB,CAAC,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,OAAO,CAAC,CAOnF"}
@@ -0,0 +1,77 @@
1
+ /**
2
+ * GitHub Actions job summary generation.
3
+ *
4
+ * Produces condensed markdown suitable for $GITHUB_STEP_SUMMARY.
5
+ */
6
+ import * as fs from 'fs/promises';
7
+ /**
8
+ * Generate a condensed summary for GitHub Actions.
9
+ */
10
+ export function generateGitHubSummary(report) {
11
+ const { summary, failureBreakdown, tasks } = report;
12
+ const lines = [];
13
+ const icon = report.passed ? ':white_check_mark:' : ':x:';
14
+ lines.push(`## ${icon} Skill Evaluation: ${report.skillName}`);
15
+ lines.push('');
16
+ // Summary table
17
+ lines.push('| Metric | Value | Status |');
18
+ lines.push('|--------|-------|--------|');
19
+ lines.push(`| Discovery Rate | ${(summary.discoveryAccuracy * 100).toFixed(0)}% (${Math.round(summary.discoveryAccuracy * summary.totalTasks)}/${summary.totalTasks}) | ${summary.discoveryAccuracy >= 0.8 ? 'PASS' : 'FAIL'} |`);
20
+ lines.push(`| Avg Adherence | ${summary.avgAdherence.toFixed(1)}/5 | ${summary.avgAdherence >= 4.0 ? 'PASS' : 'FAIL'} |`);
21
+ lines.push(`| Avg Output Quality | ${summary.avgOutputQuality.toFixed(1)}/5 | ${summary.avgOutputQuality >= 4.0 ? 'PASS' : 'FAIL'} |`);
22
+ lines.push(`| Weighted Score | ${summary.avgWeightedScore.toFixed(2)} | |`);
23
+ lines.push(`| Duration | ${(summary.totalDurationMs / 1000).toFixed(1)}s | |`);
24
+ lines.push(`| Cost | $${summary.totalCostUsd.toFixed(4)} | |`);
25
+ lines.push('');
26
+ // Failures
27
+ const failures = tasks.filter((t) => t.score.failureCategory !== 'none');
28
+ if (failures.length > 0) {
29
+ lines.push(`### Failures (${failures.length})`);
30
+ lines.push('');
31
+ lines.push('| Task | Category | Details |');
32
+ lines.push('|------|----------|---------|');
33
+ for (const f of failures) {
34
+ const cat = formatCategory(f.score.failureCategory);
35
+ const reason = f.score.reasoning.slice(0, 80) + (f.score.reasoning.length > 80 ? '...' : '');
36
+ lines.push(`| ${f.task.id} | ${cat} | ${reason} |`);
37
+ }
38
+ lines.push('');
39
+ }
40
+ // Per-task details in collapsible
41
+ lines.push('<details><summary>All task results</summary>');
42
+ lines.push('');
43
+ lines.push('| Task | Discovery | Adherence | Output | Weighted | Status |');
44
+ lines.push('|------|-----------|-----------|--------|----------|--------|');
45
+ for (const t of tasks) {
46
+ const s = t.score;
47
+ const status = s.failureCategory === 'none' ? 'PASS' : 'FAIL';
48
+ lines.push(`| ${t.task.id} | ${s.discovery} | ${s.adherence}/5 | ${s.outputQuality}/5 | ${s.weightedScore.toFixed(2)} | ${status} |`);
49
+ }
50
+ lines.push('');
51
+ lines.push('</details>');
52
+ lines.push('');
53
+ if (!report.passed && report.failureReasons.length > 0) {
54
+ lines.push(`**Failure reasons:** ${report.failureReasons.join('; ')}`);
55
+ }
56
+ return lines.join('\n');
57
+ }
58
+ /**
59
+ * Write summary to $GITHUB_STEP_SUMMARY if available.
60
+ */
61
+ export async function writeGitHubSummary(report) {
62
+ const summaryPath = process.env.GITHUB_STEP_SUMMARY;
63
+ if (!summaryPath)
64
+ return false;
65
+ const summary = generateGitHubSummary(report);
66
+ await fs.appendFile(summaryPath, summary + '\n');
67
+ return true;
68
+ }
69
/**
 * Turn a snake_case failure category into a display label.
 *
 * `'none'` maps to `'No Failure'`; any other value is split on `_`, the first
 * character of each piece is upper-cased, and the pieces are joined with a
 * single space.
 *
 * @param {string} cat Failure category identifier.
 * @returns {string} Human-readable label.
 */
function formatCategory(cat) {
    if (cat === 'none') {
        return 'No Failure';
    }
    const words = [];
    for (const word of cat.split('_')) {
        const head = word.charAt(0).toUpperCase();
        words.push(head + word.substring(1));
    }
    return words.join(' ');
}
77
+ //# sourceMappingURL=github-summary.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"github-summary.js","sourceRoot":"","sources":["../../../src/report/github-summary.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAQlC;;GAEG;AACH,MAAM,UAAU,qBAAqB,CAAC,MAAwB;IAC5D,MAAM,EAAE,OAAO,EAAE,gBAAgB,EAAE,KAAK,EAAE,GAAG,MAAM,CAAC;IACpD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,KAAK,CAAC;IAC1D,KAAK,CAAC,IAAI,CAAC,MAAM,IAAI,sBAAsB,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IAC/D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,gBAAgB;IAChB,KAAK,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1C,KAAK,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1C,KAAK,CAAC,IAAI,CAAC,sBAAsB,CAAC,OAAO,CAAC,iBAAiB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,iBAAiB,GAAG,OAAO,CAAC,UAAU,CAAC,IAAI,OAAO,CAAC,UAAU,OAAO,OAAO,CAAC,iBAAiB,IAAI,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;IAClO,KAAK,CAAC,IAAI,CAAC,qBAAqB,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,OAAO,CAAC,YAAY,IAAI,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;IAC1H,KAAK,CAAC,IAAI,CAAC,0BAA0B,OAAO,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,OAAO,CAAC,gBAAgB,IAAI,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;IACvI,KAAK,CAAC,IAAI,CAAC,sBAAsB,OAAO,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC5E,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;IAC/E,KAAK,CAAC,IAAI,CAAC,aAAa,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAC/D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,WAAW;IACX,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,eAAe,KAAK,MAAM,CAAC,CAAC;IACzE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,KAAK,CAAC,IAAI,CAAC,iBAAiB,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC;QAChD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;QAC5C,KAAK,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;QAC5C,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YACzB,MAAM,GAAG,GAAG,cAAc,CAAC,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC;YACpD,MAA
M,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAC7F,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,MAAM,IAAI,CAAC,CAAC;QACtD,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,kCAAkC;IAClC,KAAK,CAAC,IAAI,CAAC,8CAA8C,CAAC,CAAC;IAC3D,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,+DAA+D,CAAC,CAAC;IAC5E,KAAK,CAAC,IAAI,CAAC,+DAA+D,CAAC,CAAC;IAC5E,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC;QAClB,MAAM,MAAM,GAAG,CAAC,CAAC,eAAe,KAAK,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9D,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC,CAAC,SAAS,MAAM,CAAC,CAAC,SAAS,QAAQ,CAAC,CAAC,aAAa,QAAQ,CAAC,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,MAAM,IAAI,CAAC,CAAC;IACxI,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IACzB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvD,KAAK,CAAC,IAAI,CAAC,wBAAwB,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,MAAwB;IAC/D,MAAM,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC;IACpD,IAAI,CAAC,WAAW;QAAE,OAAO,KAAK,CAAC;IAE/B,MAAM,OAAO,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAC;IAC9C,MAAM,EAAE,CAAC,UAAU,CAAC,WAAW,EAAE,OAAO,GAAG,IAAI,CAAC,CAAC;IACjD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,cAAc,CAAC,GAAW;IACjC,IAAI,GAAG,KAAK,MAAM;QAAE,OAAO,YAAY,CAAC;IACxC,OAAO,GAAG;SACP,KAAK,CAAC,GAAG,CAAC;SACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;SAClD,IAAI,CAAC,GAAG,CAAC,CAAC;AACf,CAAC"}
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Report generation for skill evaluation results.
3
+ *
4
+ * Generates markdown and JSON reports from combined evaluation scores.
5
+ */
6
+ import type { SkillEvaluation, TaskResult, CombinedScore, EvaluationReport, EvaluationSummary, FailureBreakdown, ReportMetadata } from '../types.js';
7
+ /**
8
+ * Generate a markdown report from evaluation results.
9
+ */
10
+ export declare function generateReport(evaluation: SkillEvaluation, results: TaskResult[], scores: CombinedScore[], outputPath?: string, metadata?: ReportMetadata): Promise<string>;
11
+ /**
12
+ * Generate JSON report for programmatic analysis.
13
+ */
14
+ export declare function generateJsonResults(evaluation: SkillEvaluation, results: TaskResult[], scores: CombinedScore[], outputPath?: string, metadata?: ReportMetadata): Promise<EvaluationReport>;
15
+ /**
16
+ * Compute summary statistics from combined scores.
17
+ */
18
+ export declare function computeSummary(results: TaskResult[], scores: CombinedScore[]): EvaluationSummary;
19
+ /**
20
+ * Compute failure category breakdown.
21
+ */
22
+ export declare function computeFailureBreakdown(scores: CombinedScore[]): FailureBreakdown[];
23
+ //# sourceMappingURL=report.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../../../src/report/report.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH,OAAO,KAAK,EACV,eAAe,EACf,UAAU,EACV,aAAa,EACb,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAEhB,cAAc,EACf,MAAM,aAAa,CAAC;AAGrB;;GAEG;AACH,wBAAsB,cAAc,CAClC,UAAU,EAAE,eAAe,EAC3B,OAAO,EAAE,UAAU,EAAE,EACrB,MAAM,EAAE,aAAa,EAAE,EACvB,UAAU,CAAC,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,cAAc,GACxB,OAAO,CAAC,MAAM,CAAC,CAsHjB;AAED;;GAEG;AACH,wBAAsB,mBAAmB,CACvC,UAAU,EAAE,eAAe,EAC3B,OAAO,EAAE,UAAU,EAAE,EACrB,MAAM,EAAE,aAAa,EAAE,EACvB,UAAU,CAAC,EAAE,MAAM,EACnB,QAAQ,CAAC,EAAE,cAAc,GACxB,OAAO,CAAC,gBAAgB,CAAC,CAuD3B;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,OAAO,EAAE,UAAU,EAAE,EACrB,MAAM,EAAE,aAAa,EAAE,GACtB,iBAAiB,CAmBnB;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,aAAa,EAAE,GAAG,gBAAgB,EAAE,CAenF"}