@skilljack/evals 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +327 -0
  3. package/action/action.yml +72 -0
  4. package/action/index.ts +78 -0
  5. package/dist/action/index.d.ts +8 -0
  6. package/dist/action/index.d.ts.map +1 -0
  7. package/dist/action/index.js +68 -0
  8. package/dist/action/index.js.map +1 -0
  9. package/dist/src/cli.d.ts +9 -0
  10. package/dist/src/cli.d.ts.map +1 -0
  11. package/dist/src/cli.js +264 -0
  12. package/dist/src/cli.js.map +1 -0
  13. package/dist/src/config.d.ts +52 -0
  14. package/dist/src/config.d.ts.map +1 -0
  15. package/dist/src/config.js +194 -0
  16. package/dist/src/config.js.map +1 -0
  17. package/dist/src/index.d.ts +24 -0
  18. package/dist/src/index.d.ts.map +1 -0
  19. package/dist/src/index.js +28 -0
  20. package/dist/src/index.js.map +1 -0
  21. package/dist/src/parser.d.ts +22 -0
  22. package/dist/src/parser.d.ts.map +1 -0
  23. package/dist/src/parser.js +205 -0
  24. package/dist/src/parser.js.map +1 -0
  25. package/dist/src/pipeline.d.ts +53 -0
  26. package/dist/src/pipeline.d.ts.map +1 -0
  27. package/dist/src/pipeline.js +185 -0
  28. package/dist/src/pipeline.js.map +1 -0
  29. package/dist/src/report/github-summary.d.ts +15 -0
  30. package/dist/src/report/github-summary.d.ts.map +1 -0
  31. package/dist/src/report/github-summary.js +77 -0
  32. package/dist/src/report/github-summary.js.map +1 -0
  33. package/dist/src/report/report.d.ts +23 -0
  34. package/dist/src/report/report.d.ts.map +1 -0
  35. package/dist/src/report/report.js +216 -0
  36. package/dist/src/report/report.js.map +1 -0
  37. package/dist/src/runner/runner.d.ts +29 -0
  38. package/dist/src/runner/runner.d.ts.map +1 -0
  39. package/dist/src/runner/runner.js +211 -0
  40. package/dist/src/runner/runner.js.map +1 -0
  41. package/dist/src/runner/security.d.ts +26 -0
  42. package/dist/src/runner/security.d.ts.map +1 -0
  43. package/dist/src/runner/security.js +34 -0
  44. package/dist/src/runner/security.js.map +1 -0
  45. package/dist/src/runner/skill-setup.d.ts +19 -0
  46. package/dist/src/runner/skill-setup.d.ts.map +1 -0
  47. package/dist/src/runner/skill-setup.js +72 -0
  48. package/dist/src/runner/skill-setup.js.map +1 -0
  49. package/dist/src/scorer/deterministic.d.ts +12 -0
  50. package/dist/src/scorer/deterministic.d.ts.map +1 -0
  51. package/dist/src/scorer/deterministic.js +149 -0
  52. package/dist/src/scorer/deterministic.js.map +1 -0
  53. package/dist/src/scorer/judge.d.ts +34 -0
  54. package/dist/src/scorer/judge.d.ts.map +1 -0
  55. package/dist/src/scorer/judge.js +226 -0
  56. package/dist/src/scorer/judge.js.map +1 -0
  57. package/dist/src/scorer/scorer.d.ts +25 -0
  58. package/dist/src/scorer/scorer.d.ts.map +1 -0
  59. package/dist/src/scorer/scorer.js +149 -0
  60. package/dist/src/scorer/scorer.js.map +1 -0
  61. package/dist/src/session/session-logger.d.ts +30 -0
  62. package/dist/src/session/session-logger.d.ts.map +1 -0
  63. package/dist/src/session/session-logger.js +157 -0
  64. package/dist/src/session/session-logger.js.map +1 -0
  65. package/dist/src/types.d.ts +227 -0
  66. package/dist/src/types.d.ts.map +1 -0
  67. package/dist/src/types.js +16 -0
  68. package/dist/src/types.js.map +1 -0
  69. package/package.json +44 -0
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Session logger for capturing evaluation run events.
3
+ *
4
+ * Captures tool calls, text output, metrics, and eval results
5
+ * for debugging and reporting purposes.
6
+ */
7
+ import type { SessionLogEntry, MetricsData } from '../types.js';
8
+ export declare class SessionLogger {
9
+ private log;
10
+ private logDir;
11
+ constructor(task: string, logDir?: string);
12
+ addEntry(type: SessionLogEntry['type'], data: unknown): void;
13
+ addTextMessage(text: string): void;
14
+ addToolUse(name: string, input: unknown): void;
15
+ addToolResult(name: string, success: boolean): void;
16
+ addAssistantMessage(content: unknown[]): void;
17
+ setMetrics(metrics: MetricsData): void;
18
+ markAsError(errorMessage: string): void;
19
+ getEntries(): SessionLogEntry[];
20
+ getSessionId(): string;
21
+ /**
22
+ * Save session log to disk as both JSON and human-readable markdown.
23
+ */
24
+ save(): Promise<{
25
+ jsonPath: string;
26
+ mdPath: string;
27
+ }>;
28
+ private generateReadableLog;
29
+ }
30
+ //# sourceMappingURL=session-logger.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"session-logger.d.ts","sourceRoot":"","sources":["../../../src/session/session-logger.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,eAAe,EAAc,WAAW,EAAE,MAAM,aAAa,CAAC;AAE5E,qBAAa,aAAa;IACxB,OAAO,CAAC,GAAG,CAAa;IACxB,OAAO,CAAC,MAAM,CAAS;gBAEX,IAAI,EAAE,MAAM,EAAE,MAAM,GAAE,MAAyB;IAY3D,QAAQ,CAAC,IAAI,EAAE,eAAe,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,OAAO,GAAG,IAAI;IAQ5D,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI;IAIlC,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,IAAI;IAI9C,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,GAAG,IAAI;IAInD,mBAAmB,CAAC,OAAO,EAAE,OAAO,EAAE,GAAG,IAAI;IAI7C,UAAU,CAAC,OAAO,EAAE,WAAW,GAAG,IAAI;IAItC,WAAW,CAAC,YAAY,EAAE,MAAM,GAAG,IAAI;IAKvC,UAAU,IAAI,eAAe,EAAE;IAI/B,YAAY,IAAI,MAAM;IAItB;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IAiB3D,OAAO,CAAC,mBAAmB;CAoF5B"}
@@ -0,0 +1,157 @@
1
+ /**
2
+ * Session logger for capturing evaluation run events.
3
+ *
4
+ * Captures tool calls, text output, metrics, and eval results
5
+ * for debugging and reporting purposes.
6
+ */
7
+ import * as fs from 'fs/promises';
8
+ import * as path from 'path';
9
/**
 * Collects the events of a single evaluation run and persists them.
 *
 * Entries are kept in memory until `save()` is called, which writes the raw
 * log as JSON and a human-readable markdown rendering next to it.
 */
export class SessionLogger {
    log;
    logDir;
    /**
     * @param task   Task identifier; embedded in the session id.
     * @param logDir Directory the log files are written to by `save()`.
     */
    constructor(task, logDir = './results/logs') {
        // One clock reading feeds both the id and startTime so they always agree.
        const startedAt = new Date().toISOString();
        this.logDir = logDir;
        this.log = {
            sessionId: `eval-${task}-${startedAt.replace(/[:.]/g, '-')}`,
            task,
            startTime: startedAt,
            status: 'success',
            entries: [],
        };
    }
    /** Appends an entry of the given type, stamped with the current time. */
    addEntry(type, data) {
        this.log.entries.push({
            timestamp: new Date().toISOString(),
            type,
            data,
        });
    }
    /** Records a plain text message as a `text` entry. */
    addTextMessage(text) {
        this.addEntry('text', { text });
    }
    /** Records a tool invocation (name and input) as a `tool_use` entry. */
    addToolUse(name, input) {
        this.addEntry('tool_use', { name, input });
    }
    /** Records whether a tool call succeeded as a `tool_result` entry. */
    addToolResult(name, success) {
        this.addEntry('tool_result', { name, success });
    }
    /** Records the content blocks of an assistant message as an `assistant` entry. */
    addAssistantMessage(content) {
        this.addEntry('assistant', { content });
    }
    /** Attaches run metrics; they are rendered in the markdown "Metrics" section. */
    setMetrics(metrics) {
        this.log.metrics = metrics;
    }
    /** Flags the session as failed; `save()` then prefixes file names with `FAILED__`. */
    markAsError(errorMessage) {
        this.log.status = 'error';
        this.log.errorMessage = errorMessage;
    }
    /** Returns the live entries array (not a copy). */
    getEntries() {
        return this.log.entries;
    }
    /** Returns the session id generated in the constructor. */
    getSessionId() {
        return this.log.sessionId;
    }
    /**
     * Save session log to disk as both JSON and human-readable markdown.
     *
     * The session id contains the caller-supplied task name, so characters that
     * are path separators or invalid in file names are replaced with `_` before
     * the id is used as a file name. The id stored inside the log is unchanged.
     *
     * @returns The paths of the two files that were written.
     */
    async save() {
        this.log.endTime = new Date().toISOString();
        await fs.mkdir(this.logDir, { recursive: true });
        const prefix = this.log.status === 'error' ? 'FAILED__' : '';
        const safeId = this.log.sessionId.replace(/[\\/:*?"<>|\u0000-\u001f]/g, '_');
        const baseName = `${prefix}${safeId}`;
        const jsonPath = path.join(this.logDir, `${baseName}.json`);
        const mdPath = path.join(this.logDir, `${baseName}.md`);
        // Render both documents before touching the disk so a rendering failure
        // cannot leave only one of the two files behind.
        const json = JSON.stringify(this.log, null, 2);
        const markdown = this.generateReadableLog();
        await Promise.all([
            fs.writeFile(jsonPath, json),
            fs.writeFile(mdPath, markdown),
        ]);
        return { jsonPath, mdPath };
    }
    /**
     * Renders the session as markdown: header, optional metrics, then one
     * section per entry. Long text and tool inputs are truncated.
     *
     * Entry payloads are typed `unknown` and `addEntry` is public, so every
     * payload field is read defensively instead of assuming its shape.
     */
    generateReadableLog() {
        const { log } = this;
        const lines = [
            `# Eval Session: ${log.sessionId}`,
            `**Task:** ${log.task}`,
            `**Start:** ${log.startTime}`,
            `**End:** ${log.endTime || 'In progress'}`,
            `**Status:** ${log.status === 'success' ? 'PASS' : 'FAIL'} (${log.status})`,
        ];
        if (log.errorMessage) {
            lines.push(`**Error:** ${log.errorMessage}`);
        }
        lines.push('');
        if (log.metrics) {
            lines.push('## Metrics');
            lines.push(`- **Duration:** ${formatDuration(log.metrics.timing.totalElapsedMs)}`);
            lines.push(`- **Cost:** $${log.metrics.cost.toFixed(6)}`);
            lines.push(`- **Turns:** ${log.metrics.turns}`);
            lines.push(`- **Tokens:** ${log.metrics.tokens.total.toLocaleString()}`);
            lines.push('');
        }
        lines.push('## Events');
        lines.push('');
        let toolCount = 0;
        for (const entry of log.entries) {
            // Keep only the HH:MM:SS part of the ISO timestamp.
            const time = entry.timestamp.split('T')[1]?.split('.')[0] || '';
            // A null/undefined payload must not break property access below.
            const data = entry.data ?? {};
            switch (entry.type) {
                case 'text': {
                    lines.push(`### [${time}] Text`);
                    lines.push('```');
                    lines.push(truncateForLog(textForLog(data.text), 500, '...'));
                    lines.push('```');
                    lines.push('');
                    break;
                }
                case 'tool_use': {
                    toolCount++;
                    lines.push(`### [${time}] Tool #${toolCount}: ${data.name}`);
                    lines.push('```json');
                    lines.push(truncateForLog(stringifyForLog(data.input), 1000, '\n...'));
                    lines.push('```');
                    lines.push('');
                    break;
                }
                case 'tool_result': {
                    lines.push(`- Tool result: ${data.name} - ${data.success ? 'Success' : 'Failed'}`);
                    lines.push('');
                    break;
                }
                case 'assistant': {
                    lines.push(`### [${time}] Assistant`);
                    const chunks = Array.isArray(data.content) ? data.content : [];
                    for (const chunk of chunks) {
                        if (typeof chunk !== 'object' || chunk === null) {
                            continue;
                        }
                        if (chunk.type === 'text' && chunk.text) {
                            lines.push('```');
                            lines.push(truncateForLog(textForLog(chunk.text), 300, '...'));
                            lines.push('```');
                        }
                        else if (chunk.type === 'tool_use' && chunk.name) {
                            lines.push(`Tool call: ${chunk.name}`);
                        }
                    }
                    lines.push('');
                    break;
                }
            }
        }
        lines.push('---');
        lines.push(`Total tool calls: ${toolCount}`);
        return lines.join('\n');
    }
}
/** Cuts `text` to `maxLength` characters, appending `marker` when it was cut. */
function truncateForLog(text, maxLength, marker) {
    return text.length > maxLength ? text.substring(0, maxLength) + marker : text;
}
/**
 * Pretty-prints a value as JSON. `JSON.stringify` returns `undefined` for
 * `undefined`, functions and symbols; those fall back to the literal text
 * `undefined` so callers always receive a string.
 */
function stringifyForLog(value) {
    return JSON.stringify(value, null, 2) ?? 'undefined';
}
/** Returns strings unchanged and renders anything else as JSON. */
function textForLog(value) {
    return typeof value === 'string' ? value : stringifyForLog(value);
}
148
/**
 * Formats a millisecond duration for display.
 *
 * - below one second: `"<ms>ms"`
 * - below one minute: seconds with two decimals, e.g. `"1.50s"`
 * - otherwise: whole minutes plus seconds with one decimal, e.g. `"1m 1.2s"`
 *
 * Rounding is carried into the next unit, so the result never shows
 * `60.00s` or `Nm 60.0s`.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration string.
 */
function formatDuration(ms) {
    if (ms < 1000)
        return `${ms}ms`;
    if (ms < 60000) {
        const seconds = (ms / 1000).toFixed(2);
        // Values just under a minute round up to "60.00"; render those as minutes.
        if (seconds !== '60.00')
            return `${seconds}s`;
    }
    // Round once to tenths of a second, then split, so a carry from the
    // seconds part reaches the minutes.
    const totalTenths = Math.round(ms / 100);
    const min = Math.floor(totalTenths / 600);
    const sec = ((totalTenths % 600) / 10).toFixed(1);
    return `${min}m ${sec}s`;
}
157
+ //# sourceMappingURL=session-logger.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"session-logger.js","sourceRoot":"","sources":["../../../src/session/session-logger.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAG7B,MAAM,OAAO,aAAa;IAChB,GAAG,CAAa;IAChB,MAAM,CAAS;IAEvB,YAAY,IAAY,EAAE,SAAiB,gBAAgB;QACzD,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QACjE,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,GAAG,GAAG;YACT,SAAS,EAAE,QAAQ,IAAI,IAAI,SAAS,EAAE;YACtC,IAAI;YACJ,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,MAAM,EAAE,SAAS;YACjB,OAAO,EAAE,EAAE;SACZ,CAAC;IACJ,CAAC;IAED,QAAQ,CAAC,IAA6B,EAAE,IAAa;QACnD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC;YACpB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,IAAI;YACJ,IAAI;SACL,CAAC,CAAC;IACL,CAAC;IAED,cAAc,CAAC,IAAY;QACzB,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;IAClC,CAAC;IAED,UAAU,CAAC,IAAY,EAAE,KAAc;QACrC,IAAI,CAAC,QAAQ,CAAC,UAAU,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;IAC7C,CAAC;IAED,aAAa,CAAC,IAAY,EAAE,OAAgB;QAC1C,IAAI,CAAC,QAAQ,CAAC,aAAa,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IAED,mBAAmB,CAAC,OAAkB;QACpC,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;IAC1C,CAAC;IAED,UAAU,CAAC,OAAoB;QAC7B,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,OAAO,CAAC;IAC7B,CAAC;IAED,WAAW,CAAC,YAAoB;QAC9B,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC;QAC1B,IAAI,CAAC,GAAG,CAAC,YAAY,GAAG,YAAY,CAAC;IACvC,CAAC;IAED,UAAU;QACR,OAAO,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC;IAC1B,CAAC;IAED,YAAY;QACV,OAAO,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC;IAC5B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAE5C,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAEjD,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,MAAM,QAAQ,GAAG,GAAG,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC;QAElD,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,QAAQ,OAAO,CAAC,CAAC;QAC5D,MAAM,MAAM,GAAG,IAAI,CAAC,IA
AI,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,QAAQ,KAAK,CAAC,CAAC;QAExD,MAAM,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QAChE,MAAM,EAAE,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,CAAC,mBAAmB,EAAE,CAAC,CAAC;QAEvD,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC;IAC9B,CAAC;IAEO,mBAAmB;QACzB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,CAAC,IAAI,CAAC,mBAAmB,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC;QACpD,KAAK,CAAC,IAAI,CAAC,aAAa,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACzC,KAAK,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC;QAC/C,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,GAAG,CAAC,OAAO,IAAI,aAAa,EAAE,CAAC,CAAC;QAC5D,KAAK,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,GAAG,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC;QAClG,IAAI,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC;YAC1B,KAAK,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,CAAC;QACpD,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAEf,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC;YACrB,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YACzB,KAAK,CAAC,IAAI,CAAC,mBAAmB,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC;YACxF,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAC/D,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;YACrD,KAAK,CAAC,IAAI,CAAC,iBAAiB,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;YAC9E,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjB,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAEf,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC;YACrC,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAEhE,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;gBACnB,KAAK,MAAM,CAAC,CAAC,CAAC;oBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAwB,CAAC;oBAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,SAA
S,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;oBACzF,KAAK,CAAC,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,CAAC;oBACjC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACpB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACf,MAAM;gBACR,CAAC;gBACD,KAAK,UAAU,CAAC,CAAC,CAAC;oBAChB,SAAS,EAAE,CAAC;oBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAwC,CAAC;oBAC5D,KAAK,CAAC,IAAI,CAAC,QAAQ,IAAI,WAAW,SAAS,KAAK,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;oBAC7D,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;oBACrD,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACtB,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;oBACtF,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACf,MAAM;gBACR,CAAC;gBACD,KAAK,aAAa,CAAC,CAAC,CAAC;oBACnB,MAAM,IAAI,GAAG,KAAK,CAAC,IAA0C,CAAC;oBAC9D,KAAK,CAAC,IAAI,CAAC,kBAAkB,IAAI,CAAC,IAAI,MAAM,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;oBACnF,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACf,MAAM;gBACR,CAAC;gBACD,KAAK,WAAW,CAAC,CAAC,CAAC;oBACjB,MAAM,IAAI,GAAG,KAAK,CAAC,IAA8B,CAAC;oBAClD,KAAK,CAAC,IAAI,CAAC,QAAQ,IAAI,aAAa,CAAC,CAAC;oBACtC,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;wBACjC,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;4BAChD,MAAM,CAAC,GAAG,KAAuD,CAAC;4BAClE,IAAI,CAAC,CAAC,IAAI,KAAK,MAAM,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gCAChC,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;gCAChF,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gCAClB,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gCACpB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;4BACpB,CAAC;iCAAM,IAAI,CAAC,CAAC,IAAI,KAAK,UAAU,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gCAC3C,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;4BACrC,CAAC;wBACH,CAAC;oBACH,CAAC;oBACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACf,MAAM;gBACR,
CAAC;YACH,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClB,KAAK,CAAC,IAAI,CAAC,qBAAqB,SAAS,EAAE,CAAC,CAAC;QAE7C,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;CACF;AAED,SAAS,cAAc,CAAC,EAAU;IAChC,IAAI,EAAE,GAAG,IAAI;QAAE,OAAO,GAAG,EAAE,IAAI,CAAC;IAChC,IAAI,EAAE,GAAG,KAAK;QAAE,OAAO,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;IACpD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,GAAG,KAAK,CAAC,CAAC;IACnC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,GAAG,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;IAC7C,OAAO,GAAG,GAAG,KAAK,GAAG,GAAG,CAAC;AAC3B,CAAC"}
@@ -0,0 +1,227 @@
1
+ /**
2
+ * Type definitions for the skill evaluation framework.
3
+ */
4
+ export interface EvalCriteria {
5
+ dimension: 'discovery' | 'adherence' | 'output';
6
+ weight: number;
7
+ description: string;
8
+ }
9
+ export interface DeterministicCheck {
10
+ expectSkillActivation: boolean;
11
+ expectMarker?: string;
12
+ expectToolCalls?: string[];
13
+ expectNoToolCalls?: string[];
14
+ }
15
+ export interface FixtureConfig {
16
+ state: string;
17
+ setup?: string;
18
+ teardown?: string;
19
+ }
20
+ export interface EvalTask {
21
+ id: string;
22
+ prompt: string;
23
+ expectedSkillLoad: string;
24
+ criteria: EvalCriteria[];
25
+ goldenChecklist: string[];
26
+ deterministic?: DeterministicCheck;
27
+ fixture?: FixtureConfig;
28
+ }
29
+ export interface EvalDefaults {
30
+ expectedSkillLoad?: string;
31
+ criteria?: Partial<Record<'discovery' | 'adherence' | 'output', {
32
+ weight?: number;
33
+ description?: string;
34
+ }>>;
35
+ }
36
+ export interface SkillEvaluation {
37
+ skillName: string;
38
+ version?: string;
39
+ defaults?: EvalDefaults;
40
+ tasks: EvalTask[];
41
+ }
42
+ export interface ToolCallRecord {
43
+ tool: string;
44
+ toolUseId: string;
45
+ timestamp: number;
46
+ input?: unknown;
47
+ }
48
+ export interface TaskResult {
49
+ taskId: string;
50
+ prompt: string;
51
+ output: string;
52
+ durationMs: number;
53
+ numTurns: number;
54
+ costUsd: number;
55
+ skillLoads: string[];
56
+ toolCalls: ToolCallRecord[];
57
+ isError: boolean;
58
+ errorMessage: string;
59
+ }
60
+ export interface RunnerOptions {
61
+ cwd?: string;
62
+ parallel?: boolean;
63
+ model?: string;
64
+ settingSources?: Array<'user' | 'project' | 'local'>;
65
+ /** Count Read calls to SKILL.md as skill discovery (default: false) */
66
+ countReadAsFallback?: boolean;
67
+ /** Directories the agent is allowed to write to */
68
+ allowedWriteDirs?: string[];
69
+ }
70
+ export interface DeterministicResult {
71
+ skillActivated: boolean;
72
+ skillName?: string;
73
+ markerFound: boolean | null;
74
+ expectedToolsCalled: boolean | null;
75
+ unexpectedToolsCalled: boolean | null;
76
+ passed: boolean;
77
+ details: string[];
78
+ }
79
+ export type FailureCategory = 'discovery_failure' | 'false_positive' | 'instruction_ambiguity' | 'missing_guidance' | 'agent_error' | 'none';
80
+ export interface JudgeScore {
81
+ taskId: string;
82
+ discovery: number;
83
+ adherence: number;
84
+ outputQuality: number;
85
+ weightedScore: number;
86
+ failureCategory: FailureCategory;
87
+ reasoning: string;
88
+ }
89
+ export interface JudgeOptions {
90
+ model?: string;
91
+ outputTruncation?: number;
92
+ }
93
+ export interface CombinedScore {
94
+ taskId: string;
95
+ deterministic: DeterministicResult | null;
96
+ judge: JudgeScore | null;
97
+ discovery: number;
98
+ adherence: number;
99
+ outputQuality: number;
100
+ weightedScore: number;
101
+ failureCategory: FailureCategory;
102
+ reasoning: string;
103
+ }
104
+ export interface SessionLogEntry {
105
+ timestamp: string;
106
+ type: 'text' | 'tool_use' | 'tool_result' | 'assistant';
107
+ data: unknown;
108
+ }
109
+ export interface MetricsData {
110
+ timestamp: string;
111
+ task: string;
112
+ timing: {
113
+ totalElapsedMs: number;
114
+ sdkDurationMs: number;
115
+ apiDurationMs: number;
116
+ overheadMs: number;
117
+ };
118
+ tokens: {
119
+ input: number;
120
+ output: number;
121
+ cacheRead: number;
122
+ cacheCreation: number;
123
+ total: number;
124
+ };
125
+ cost: number;
126
+ turns: number;
127
+ modelUsage: Record<string, {
128
+ inputTokens: number;
129
+ outputTokens: number;
130
+ costUSD: number;
131
+ }>;
132
+ }
133
+ export interface SessionLog {
134
+ sessionId: string;
135
+ task: string;
136
+ startTime: string;
137
+ endTime?: string;
138
+ status: 'success' | 'error';
139
+ errorMessage?: string;
140
+ entries: SessionLogEntry[];
141
+ metrics?: MetricsData;
142
+ }
143
+ export interface EvaluationSummary {
144
+ totalTasks: number;
145
+ discoveryAccuracy: number;
146
+ avgAdherence: number;
147
+ avgOutputQuality: number;
148
+ avgWeightedScore: number;
149
+ totalDurationMs: number;
150
+ totalCostUsd: number;
151
+ }
152
+ export interface FailureBreakdown {
153
+ category: FailureCategory;
154
+ count: number;
155
+ percentage: number;
156
+ }
157
+ export interface ReportMetadata {
158
+ skillPath: string;
159
+ gitCommit?: string;
160
+ gitBranch?: string;
161
+ version?: string;
162
+ agentModel: string;
163
+ judgeModel: string;
164
+ }
165
+ export interface EvaluationReport {
166
+ skillName: string;
167
+ timestamp: string;
168
+ passed: boolean;
169
+ failureReasons: string[];
170
+ metadata?: ReportMetadata;
171
+ summary: EvaluationSummary;
172
+ failureBreakdown: FailureBreakdown[];
173
+ tasks: Array<{
174
+ task: EvalTask;
175
+ result: TaskResult;
176
+ score: CombinedScore;
177
+ sessionLogPath?: string;
178
+ }>;
179
+ }
180
+ export interface SdkTextBlock {
181
+ type: 'text';
182
+ text: string;
183
+ }
184
+ export interface SdkToolUseBlock {
185
+ type: 'tool_use';
186
+ id: string;
187
+ name: string;
188
+ input: Record<string, unknown>;
189
+ }
190
+ export type SdkContentBlock = SdkTextBlock | SdkToolUseBlock | {
191
+ type: string;
192
+ [key: string]: unknown;
193
+ };
194
+ export interface SdkAssistantMessage {
195
+ type: 'assistant';
196
+ message: {
197
+ content: SdkContentBlock[];
198
+ };
199
+ }
200
+ export interface SdkResultMessage {
201
+ type: 'result';
202
+ result?: string;
203
+ duration_ms: number;
204
+ duration_api_ms: number;
205
+ num_turns: number;
206
+ total_cost_usd: number;
207
+ usage: {
208
+ input_tokens: number;
209
+ output_tokens: number;
210
+ cache_read_input_tokens: number;
211
+ cache_creation_input_tokens: number;
212
+ };
213
+ modelUsage: Record<string, {
214
+ inputTokens: number;
215
+ outputTokens: number;
216
+ costUSD: number;
217
+ }>;
218
+ }
219
+ export type SdkStreamMessage = SdkAssistantMessage | SdkResultMessage | {
220
+ type: string;
221
+ [key: string]: unknown;
222
+ };
223
+ export declare function isAssistantMessage(msg: unknown): msg is SdkAssistantMessage;
224
+ export declare function isResultMessage(msg: unknown): msg is SdkResultMessage;
225
+ export declare function isTextBlock(block: unknown): block is SdkTextBlock;
226
+ export declare function isToolUseBlock(block: unknown): block is SdkToolUseBlock;
227
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAMH,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,WAAW,GAAG,WAAW,GAAG,QAAQ,CAAC;IAChD,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,kBAAkB;IACjC,qBAAqB,EAAE,OAAO,CAAC;IAC/B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,MAAM,WAAW,aAAa;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,iBAAiB,EAAE,MAAM,CAAC;IAC1B,QAAQ,EAAE,YAAY,EAAE,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,aAAa,CAAC,EAAE,kBAAkB,CAAC;IACnC,OAAO,CAAC,EAAE,aAAa,CAAC;CACzB;AAED,MAAM,WAAW,YAAY;IAC3B,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,WAAW,GAAG,WAAW,GAAG,QAAQ,EAAE;QAAE,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,WAAW,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC,CAAC;CAC7G;AAED,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,YAAY,CAAC;IACxB,KAAK,EAAE,QAAQ,EAAE,CAAC;CACnB;AAMD,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,SAAS,EAAE,cAAc,EAAE,CAAC;IAC5B,OAAO,EAAE,OAAO,CAAC;IACjB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,aAAa;IAC5B,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,cAAc,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,SAAS,GAAG,OAAO,CAAC,CAAC;IACrD,uEAAuE;IACvE,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,mDAAmD;IACnD,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;CAC7B;AAMD,MAAM,WAAW,mBAAmB;IAClC,cAAc,EAAE,OAAO,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,OAAO,GAAG,IAAI,CAAC;IAC5B,mBAAmB,EAAE,OAAO,GAAG,IAAI,CAAC;IACpC,qBAAqB,EAAE,OAAO,GAAG,IAAI,CAAC;IAC
tC,MAAM,EAAE,OAAO,CAAC;IAChB,OAAO,EAAE,MAAM,EAAE,CAAC;CACnB;AAMD,MAAM,MAAM,eAAe,GACvB,mBAAmB,GACnB,gBAAgB,GAChB,uBAAuB,GACvB,kBAAkB,GAClB,aAAa,GACb,MAAM,CAAC;AAEX,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,MAAM,CAAC;IACtB,eAAe,EAAE,eAAe,CAAC;IACjC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,YAAY;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAMD,MAAM,WAAW,aAAa;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,EAAE,mBAAmB,GAAG,IAAI,CAAC;IAC1C,KAAK,EAAE,UAAU,GAAG,IAAI,CAAC;IAGzB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,MAAM,CAAC;IACtB,eAAe,EAAE,eAAe,CAAC;IACjC,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,GAAG,UAAU,GAAG,aAAa,GAAG,WAAW,CAAC;IACxD,IAAI,EAAE,OAAO,CAAC;CACf;AAED,MAAM,WAAW,WAAW;IAC1B,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE;QACN,cAAc,EAAE,MAAM,CAAC;QACvB,aAAa,EAAE,MAAM,CAAC;QACtB,aAAa,EAAE,MAAM,CAAC;QACtB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,MAAM,EAAE;QACN,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,SAAS,EAAE,MAAM,CAAC;QAClB,aAAa,EAAE,MAAM,CAAC;QACtB,KAAK,EAAE,MAAM,CAAC;KACf,CAAC;IACF,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE;QACzB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,OAAO,EAAE,MAAM,CAAC;KACjB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,SAAS,GAAG,OAAO,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,OAAO,CAAC,EAAE,WAAW,CAAC;CACvB;AAMD,MAAM,WAAW,iBAAiB;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,YAAY,EAAE,MAAM,CAAC;IACrB,gBAAgB,EAAE,MAAM,CAAC;IACzB,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,CAAC;IACxB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,eAAe,CAAC;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,W
AAW,cAAc;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,OAAO,CAAC;IAChB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,QAAQ,CAAC,EAAE,cAAc,CAAC;IAC1B,OAAO,EAAE,iBAAiB,CAAC;IAC3B,gBAAgB,EAAE,gBAAgB,EAAE,CAAC;IACrC,KAAK,EAAE,KAAK,CAAC;QACX,IAAI,EAAE,QAAQ,CAAC;QACf,MAAM,EAAE,UAAU,CAAC;QACnB,KAAK,EAAE,aAAa,CAAC;QACrB,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,CAAC,CAAC;CACJ;AAMD,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,UAAU,CAAC;IACjB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAChC;AAED,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,eAAe,GAAG;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CAAE,CAAC;AAExG,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,WAAW,CAAC;IAClB,OAAO,EAAE;QACP,OAAO,EAAE,eAAe,EAAE,CAAC;KAC5B,CAAC;CACH;AAED,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,QAAQ,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,cAAc,EAAE,MAAM,CAAC;IACvB,KAAK,EAAE;QACL,YAAY,EAAE,MAAM,CAAC;QACrB,aAAa,EAAE,MAAM,CAAC;QACtB,uBAAuB,EAAE,MAAM,CAAC;QAChC,2BAA2B,EAAE,MAAM,CAAC;KACrC,CAAC;IACF,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE;QACzB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,OAAO,EAAE,MAAM,CAAC;KACjB,CAAC,CAAC;CACJ;AAED,MAAM,MAAM,gBAAgB,GAAG,mBAAmB,GAAG,gBAAgB,GAAG;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CAAE,CAAC;AAEjH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,OAAO,GAAG,GAAG,IAAI,mBAAmB,CAE3E;AAED,wBAAgB,eAAe,CAAC,GAAG,EAAE,OAAO,GAAG,GAAG,IAAI,gBAAgB,CAErE;AAED,wBAAgB,WAAW,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,YAAY,CAEjE;AAED,wBAAgB,cAAc,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,eAAe,CAEvE"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Type definitions for the skill evaluation framework.
3
+ */
4
/**
 * Type guard for assistant stream messages.
 *
 * Besides the `type: 'assistant'` tag, this verifies that `message.content`
 * is an array, so callers can iterate the content blocks of a narrowed value
 * without a further check.
 *
 * @param msg Any value taken from the message stream.
 * @returns `true` when `msg` is tagged `'assistant'` and has an array at `message.content`.
 */
export function isAssistantMessage(msg) {
    if (typeof msg !== 'object' || msg === null || msg.type !== 'assistant') {
        return false;
    }
    const { message } = msg;
    return typeof message === 'object' && message !== null && Array.isArray(message.content);
}
7
/**
 * Type guard for result stream messages.
 *
 * Only the `type: 'result'` tag is inspected; the remaining fields of the
 * message are not validated here.
 *
 * @param msg Any value taken from the message stream.
 * @returns `true` when `msg` is a non-null object tagged `'result'`.
 */
export function isResultMessage(msg) {
    if (msg === null || typeof msg !== 'object') {
        return false;
    }
    return msg.type === 'result';
}
10
/**
 * Type guard for text content blocks.
 *
 * Besides the `type: 'text'` tag, this verifies that `text` is a string, so a
 * narrowed block can be read without producing `undefined` text.
 *
 * @param block Any value taken from a message's content array.
 * @returns `true` when `block` is tagged `'text'` and carries a string `text`.
 */
export function isTextBlock(block) {
    return (typeof block === 'object' &&
        block !== null &&
        block.type === 'text' &&
        typeof block.text === 'string');
}
13
/**
 * Type guard for tool-use content blocks.
 *
 * Besides the `type: 'tool_use'` tag, this verifies that `name` is a string,
 * so a narrowed block always identifies the tool that was called. The `id`
 * and `input` fields are not validated here.
 *
 * @param block Any value taken from a message's content array.
 * @returns `true` when `block` is tagged `'tool_use'` and carries a string `name`.
 */
export function isToolUseBlock(block) {
    return (typeof block === 'object' &&
        block !== null &&
        block.type === 'tool_use' &&
        typeof block.name === 'string');
}
16
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAuRH,MAAM,UAAU,kBAAkB,CAAC,GAAY;IAC7C,OAAO,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,KAAK,IAAI,IAAK,GAAyB,CAAC,IAAI,KAAK,WAAW,CAAC;AACpG,CAAC;AAED,MAAM,UAAU,eAAe,CAAC,GAAY;IAC1C,OAAO,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,KAAK,IAAI,IAAK,GAAyB,CAAC,IAAI,KAAK,QAAQ,CAAC;AACjG,CAAC;AAED,MAAM,UAAU,WAAW,CAAC,KAAc;IACxC,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAK,KAA2B,CAAC,IAAI,KAAK,MAAM,CAAC;AACrG,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,KAAc;IAC3C,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAK,KAA2B,CAAC,IAAI,KAAK,UAAU,CAAC;AACzG,CAAC"}
package/package.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "name": "@skilljack/evals",
3
+ "version": "1.0.0",
4
+ "type": "module",
5
+ "description": "CLI for evaluating AI agent skill discoverability, adherence, and output quality. Runs as standalone CLI or GitHub Action.",
6
+ "bin": {
7
+ "skilljack-evals": "./dist/src/cli.js"
8
+ },
9
+ "main": "./dist/src/index.js",
10
+ "types": "./dist/src/index.d.ts",
11
+ "exports": {
12
+ ".": {
13
+ "import": "./dist/src/index.js",
14
+ "types": "./dist/src/index.d.ts"
15
+ }
16
+ },
17
+ "scripts": {
18
+ "build": "tsc",
19
+ "dev": "tsx src/cli.ts",
20
+ "start": "node dist/src/cli.js",
21
+ "clean": "rm -rf dist",
22
+ "typecheck": "tsc --noEmit"
23
+ },
24
+ "dependencies": {
25
+ "@anthropic-ai/claude-agent-sdk": "^0.1.42",
26
+ "commander": "^12.0.0",
27
+ "dotenv": "^16.4.0",
28
+ "js-yaml": "^4.1.0"
29
+ },
30
+ "devDependencies": {
31
+ "@actions/core": "^1.10.0",
32
+ "@types/js-yaml": "^4.0.9",
33
+ "@types/node": "^20.0.0",
34
+ "tsx": "^4.0.0",
35
+ "typescript": "^5.0.0"
36
+ },
37
+ "engines": {
38
+ "node": ">=20.0.0"
39
+ },
40
+ "files": [
41
+ "dist",
42
+ "action"
43
+ ]
44
+ }