@skilljack/evals 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +327 -0
- package/action/action.yml +72 -0
- package/action/index.ts +78 -0
- package/dist/action/index.d.ts +8 -0
- package/dist/action/index.d.ts.map +1 -0
- package/dist/action/index.js +68 -0
- package/dist/action/index.js.map +1 -0
- package/dist/src/cli.d.ts +9 -0
- package/dist/src/cli.d.ts.map +1 -0
- package/dist/src/cli.js +264 -0
- package/dist/src/cli.js.map +1 -0
- package/dist/src/config.d.ts +52 -0
- package/dist/src/config.d.ts.map +1 -0
- package/dist/src/config.js +194 -0
- package/dist/src/config.js.map +1 -0
- package/dist/src/index.d.ts +24 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +28 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/parser.d.ts +22 -0
- package/dist/src/parser.d.ts.map +1 -0
- package/dist/src/parser.js +205 -0
- package/dist/src/parser.js.map +1 -0
- package/dist/src/pipeline.d.ts +53 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/pipeline.js +185 -0
- package/dist/src/pipeline.js.map +1 -0
- package/dist/src/report/github-summary.d.ts +15 -0
- package/dist/src/report/github-summary.d.ts.map +1 -0
- package/dist/src/report/github-summary.js +77 -0
- package/dist/src/report/github-summary.js.map +1 -0
- package/dist/src/report/report.d.ts +23 -0
- package/dist/src/report/report.d.ts.map +1 -0
- package/dist/src/report/report.js +216 -0
- package/dist/src/report/report.js.map +1 -0
- package/dist/src/runner/runner.d.ts +29 -0
- package/dist/src/runner/runner.d.ts.map +1 -0
- package/dist/src/runner/runner.js +211 -0
- package/dist/src/runner/runner.js.map +1 -0
- package/dist/src/runner/security.d.ts +26 -0
- package/dist/src/runner/security.d.ts.map +1 -0
- package/dist/src/runner/security.js +34 -0
- package/dist/src/runner/security.js.map +1 -0
- package/dist/src/runner/skill-setup.d.ts +19 -0
- package/dist/src/runner/skill-setup.d.ts.map +1 -0
- package/dist/src/runner/skill-setup.js +72 -0
- package/dist/src/runner/skill-setup.js.map +1 -0
- package/dist/src/scorer/deterministic.d.ts +12 -0
- package/dist/src/scorer/deterministic.d.ts.map +1 -0
- package/dist/src/scorer/deterministic.js +149 -0
- package/dist/src/scorer/deterministic.js.map +1 -0
- package/dist/src/scorer/judge.d.ts +34 -0
- package/dist/src/scorer/judge.d.ts.map +1 -0
- package/dist/src/scorer/judge.js +226 -0
- package/dist/src/scorer/judge.js.map +1 -0
- package/dist/src/scorer/scorer.d.ts +25 -0
- package/dist/src/scorer/scorer.d.ts.map +1 -0
- package/dist/src/scorer/scorer.js +149 -0
- package/dist/src/scorer/scorer.js.map +1 -0
- package/dist/src/session/session-logger.d.ts +30 -0
- package/dist/src/session/session-logger.d.ts.map +1 -0
- package/dist/src/session/session-logger.js +157 -0
- package/dist/src/session/session-logger.js.map +1 -0
- package/dist/src/types.d.ts +227 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +16 -0
- package/dist/src/types.js.map +1 -0
- package/package.json +44 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Session logger for capturing evaluation run events.
|
|
3
|
+
*
|
|
4
|
+
* Captures tool calls, text output, metrics, and eval results
|
|
5
|
+
* for debugging and reporting purposes.
|
|
6
|
+
*/
|
|
7
|
+
import type { SessionLogEntry, MetricsData } from '../types.js';
|
|
8
|
+
/**
 * Collects the events of a single evaluation run in memory and writes them
 * to disk on request.
 */
export declare class SessionLogger {
    /** In-memory session record (id, task, timestamps, status, entries, optional metrics). */
    private log;
    /** Directory that `save()` writes the log files into. */
    private logDir;
    /**
     * @param task Task identifier; it is stored on the log and embedded in the session id.
     * @param logDir Output directory for saved logs; the implementation falls back to `./results/logs`.
     */
    constructor(task: string, logDir?: string);
    /** Appends an entry of the given type, stamped with the current ISO time. */
    addEntry(type: SessionLogEntry['type'], data: unknown): void;
    /** Records a `text` entry carrying `{ text }`. */
    addTextMessage(text: string): void;
    /** Records a `tool_use` entry carrying `{ name, input }`. */
    addToolUse(name: string, input: unknown): void;
    /** Records a `tool_result` entry carrying `{ name, success }`. */
    addToolResult(name: string, success: boolean): void;
    /** Records an `assistant` entry carrying `{ content }`. */
    addAssistantMessage(content: unknown[]): void;
    /** Attaches run metrics; when present they are rendered in the markdown log. */
    setMetrics(metrics: MetricsData): void;
    /** Flags the session as failed and stores the error message. */
    markAsError(errorMessage: string): void;
    /** Returns the entries recorded so far (the logger's own array, not a copy). */
    getEntries(): SessionLogEntry[];
    /** Returns the session id generated in the constructor. */
    getSessionId(): string;
    /**
     * Save session log to disk as both JSON and human-readable markdown.
     *
     * @returns The paths of the two files that were written.
     */
    save(): Promise<{ jsonPath: string; mdPath: string }>;
    /** Builds the markdown rendering of the session. */
    private generateReadableLog;
}
|
|
30
|
+
//# sourceMappingURL=session-logger.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"session-logger.d.ts","sourceRoot":"","sources":["../../../src/session/session-logger.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,eAAe,EAAc,WAAW,EAAE,MAAM,aAAa,CAAC;AAE5E,qBAAa,aAAa;IACxB,OAAO,CAAC,GAAG,CAAa;IACxB,OAAO,CAAC,MAAM,CAAS;gBAEX,IAAI,EAAE,MAAM,EAAE,MAAM,GAAE,MAAyB;IAY3D,QAAQ,CAAC,IAAI,EAAE,eAAe,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,OAAO,GAAG,IAAI;IAQ5D,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,IAAI;IAIlC,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,IAAI;IAI9C,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,GAAG,IAAI;IAInD,mBAAmB,CAAC,OAAO,EAAE,OAAO,EAAE,GAAG,IAAI;IAI7C,UAAU,CAAC,OAAO,EAAE,WAAW,GAAG,IAAI;IAItC,WAAW,CAAC,YAAY,EAAE,MAAM,GAAG,IAAI;IAKvC,UAAU,IAAI,eAAe,EAAE;IAI/B,YAAY,IAAI,MAAM;IAItB;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IAiB3D,OAAO,CAAC,mBAAmB;CAoF5B"}
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Session logger for capturing evaluation run events.
|
|
3
|
+
*
|
|
4
|
+
* Captures tool calls, text output, metrics, and eval results
|
|
5
|
+
* for debugging and reporting purposes.
|
|
6
|
+
*/
|
|
7
|
+
import * as fs from 'fs/promises';
|
|
8
|
+
import * as path from 'path';
|
|
9
|
+
export class SessionLogger {
|
|
10
|
+
log;
|
|
11
|
+
logDir;
|
|
12
|
+
constructor(task, logDir = './results/logs') {
|
|
13
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
14
|
+
this.logDir = logDir;
|
|
15
|
+
this.log = {
|
|
16
|
+
sessionId: `eval-${task}-${timestamp}`,
|
|
17
|
+
task,
|
|
18
|
+
startTime: new Date().toISOString(),
|
|
19
|
+
status: 'success',
|
|
20
|
+
entries: [],
|
|
21
|
+
};
|
|
22
|
+
}
|
|
23
|
+
addEntry(type, data) {
|
|
24
|
+
this.log.entries.push({
|
|
25
|
+
timestamp: new Date().toISOString(),
|
|
26
|
+
type,
|
|
27
|
+
data,
|
|
28
|
+
});
|
|
29
|
+
}
|
|
30
|
+
addTextMessage(text) {
|
|
31
|
+
this.addEntry('text', { text });
|
|
32
|
+
}
|
|
33
|
+
addToolUse(name, input) {
|
|
34
|
+
this.addEntry('tool_use', { name, input });
|
|
35
|
+
}
|
|
36
|
+
addToolResult(name, success) {
|
|
37
|
+
this.addEntry('tool_result', { name, success });
|
|
38
|
+
}
|
|
39
|
+
addAssistantMessage(content) {
|
|
40
|
+
this.addEntry('assistant', { content });
|
|
41
|
+
}
|
|
42
|
+
setMetrics(metrics) {
|
|
43
|
+
this.log.metrics = metrics;
|
|
44
|
+
}
|
|
45
|
+
markAsError(errorMessage) {
|
|
46
|
+
this.log.status = 'error';
|
|
47
|
+
this.log.errorMessage = errorMessage;
|
|
48
|
+
}
|
|
49
|
+
getEntries() {
|
|
50
|
+
return this.log.entries;
|
|
51
|
+
}
|
|
52
|
+
getSessionId() {
|
|
53
|
+
return this.log.sessionId;
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* Save session log to disk as both JSON and human-readable markdown.
|
|
57
|
+
*/
|
|
58
|
+
async save() {
|
|
59
|
+
this.log.endTime = new Date().toISOString();
|
|
60
|
+
await fs.mkdir(this.logDir, { recursive: true });
|
|
61
|
+
const prefix = this.log.status === 'error' ? 'FAILED__' : '';
|
|
62
|
+
const baseName = `${prefix}${this.log.sessionId}`;
|
|
63
|
+
const jsonPath = path.join(this.logDir, `${baseName}.json`);
|
|
64
|
+
const mdPath = path.join(this.logDir, `${baseName}.md`);
|
|
65
|
+
await fs.writeFile(jsonPath, JSON.stringify(this.log, null, 2));
|
|
66
|
+
await fs.writeFile(mdPath, this.generateReadableLog());
|
|
67
|
+
return { jsonPath, mdPath };
|
|
68
|
+
}
|
|
69
|
+
generateReadableLog() {
|
|
70
|
+
const lines = [];
|
|
71
|
+
lines.push(`# Eval Session: ${this.log.sessionId}`);
|
|
72
|
+
lines.push(`**Task:** ${this.log.task}`);
|
|
73
|
+
lines.push(`**Start:** ${this.log.startTime}`);
|
|
74
|
+
lines.push(`**End:** ${this.log.endTime || 'In progress'}`);
|
|
75
|
+
lines.push(`**Status:** ${this.log.status === 'success' ? 'PASS' : 'FAIL'} (${this.log.status})`);
|
|
76
|
+
if (this.log.errorMessage) {
|
|
77
|
+
lines.push(`**Error:** ${this.log.errorMessage}`);
|
|
78
|
+
}
|
|
79
|
+
lines.push('');
|
|
80
|
+
if (this.log.metrics) {
|
|
81
|
+
lines.push('## Metrics');
|
|
82
|
+
lines.push(`- **Duration:** ${formatDuration(this.log.metrics.timing.totalElapsedMs)}`);
|
|
83
|
+
lines.push(`- **Cost:** $${this.log.metrics.cost.toFixed(6)}`);
|
|
84
|
+
lines.push(`- **Turns:** ${this.log.metrics.turns}`);
|
|
85
|
+
lines.push(`- **Tokens:** ${this.log.metrics.tokens.total.toLocaleString()}`);
|
|
86
|
+
lines.push('');
|
|
87
|
+
}
|
|
88
|
+
lines.push('## Events');
|
|
89
|
+
lines.push('');
|
|
90
|
+
let toolCount = 0;
|
|
91
|
+
for (const entry of this.log.entries) {
|
|
92
|
+
const time = entry.timestamp.split('T')[1]?.split('.')[0] || '';
|
|
93
|
+
switch (entry.type) {
|
|
94
|
+
case 'text': {
|
|
95
|
+
const data = entry.data;
|
|
96
|
+
const preview = data.text.length > 500 ? data.text.substring(0, 500) + '...' : data.text;
|
|
97
|
+
lines.push(`### [${time}] Text`);
|
|
98
|
+
lines.push('```');
|
|
99
|
+
lines.push(preview);
|
|
100
|
+
lines.push('```');
|
|
101
|
+
lines.push('');
|
|
102
|
+
break;
|
|
103
|
+
}
|
|
104
|
+
case 'tool_use': {
|
|
105
|
+
toolCount++;
|
|
106
|
+
const data = entry.data;
|
|
107
|
+
lines.push(`### [${time}] Tool #${toolCount}: ${data.name}`);
|
|
108
|
+
const inputStr = JSON.stringify(data.input, null, 2);
|
|
109
|
+
lines.push('```json');
|
|
110
|
+
lines.push(inputStr.length > 1000 ? inputStr.substring(0, 1000) + '\n...' : inputStr);
|
|
111
|
+
lines.push('```');
|
|
112
|
+
lines.push('');
|
|
113
|
+
break;
|
|
114
|
+
}
|
|
115
|
+
case 'tool_result': {
|
|
116
|
+
const data = entry.data;
|
|
117
|
+
lines.push(`- Tool result: ${data.name} - ${data.success ? 'Success' : 'Failed'}`);
|
|
118
|
+
lines.push('');
|
|
119
|
+
break;
|
|
120
|
+
}
|
|
121
|
+
case 'assistant': {
|
|
122
|
+
const data = entry.data;
|
|
123
|
+
lines.push(`### [${time}] Assistant`);
|
|
124
|
+
for (const chunk of data.content) {
|
|
125
|
+
if (typeof chunk === 'object' && chunk !== null) {
|
|
126
|
+
const c = chunk;
|
|
127
|
+
if (c.type === 'text' && c.text) {
|
|
128
|
+
const preview = c.text.length > 300 ? c.text.substring(0, 300) + '...' : c.text;
|
|
129
|
+
lines.push('```');
|
|
130
|
+
lines.push(preview);
|
|
131
|
+
lines.push('```');
|
|
132
|
+
}
|
|
133
|
+
else if (c.type === 'tool_use' && c.name) {
|
|
134
|
+
lines.push(`Tool call: ${c.name}`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
lines.push('');
|
|
139
|
+
break;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
lines.push('---');
|
|
144
|
+
lines.push(`Total tool calls: ${toolCount}`);
|
|
145
|
+
return lines.join('\n');
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
/**
 * Format a millisecond duration for display.
 *
 * - below one second: `"<ms>ms"`
 * - below one minute: seconds with two decimals, e.g. `"1.50s"`
 * - otherwise: whole minutes plus seconds with one decimal, e.g. `"1m 5.0s"`
 *
 * The value is rounded to the displayed precision *before* the unit is
 * chosen, so results such as `"60.00s"` or `"1m 60.0s"` can no longer occur.
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration.
 */
function formatDuration(ms) {
    // Non-finite values cannot be split into minutes and seconds; echo them as-is.
    if (!Number.isFinite(ms) || ms < 1000)
        return `${ms}ms`;
    const hundredths = Math.round(ms / 10);
    if (hundredths < 6000)
        return `${(hundredths / 100).toFixed(2)}s`;
    // Work in tenths of a second so the seconds part can never round up to 60.0.
    const tenths = Math.round(ms / 100);
    const min = Math.floor(tenths / 600);
    const sec = ((tenths % 600) / 10).toFixed(1);
    return `${min}m ${sec}s`;
}
|
|
157
|
+
//# sourceMappingURL=session-logger.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"session-logger.js","sourceRoot":"","sources":["../../../src/session/session-logger.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAG7B,MAAM,OAAO,aAAa;IAChB,GAAG,CAAa;IAChB,MAAM,CAAS;IAEvB,YAAY,IAAY,EAAE,SAAiB,gBAAgB;QACzD,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;QACjE,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,GAAG,GAAG;YACT,SAAS,EAAE,QAAQ,IAAI,IAAI,SAAS,EAAE;YACtC,IAAI;YACJ,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,MAAM,EAAE,SAAS;YACjB,OAAO,EAAE,EAAE;SACZ,CAAC;IACJ,CAAC;IAED,QAAQ,CAAC,IAA6B,EAAE,IAAa;QACnD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC;YACpB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,IAAI;YACJ,IAAI;SACL,CAAC,CAAC;IACL,CAAC;IAED,cAAc,CAAC,IAAY;QACzB,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;IAClC,CAAC;IAED,UAAU,CAAC,IAAY,EAAE,KAAc;QACrC,IAAI,CAAC,QAAQ,CAAC,UAAU,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;IAC7C,CAAC;IAED,aAAa,CAAC,IAAY,EAAE,OAAgB;QAC1C,IAAI,CAAC,QAAQ,CAAC,aAAa,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IAED,mBAAmB,CAAC,OAAkB;QACpC,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;IAC1C,CAAC;IAED,UAAU,CAAC,OAAoB;QAC7B,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,OAAO,CAAC;IAC7B,CAAC;IAED,WAAW,CAAC,YAAoB;QAC9B,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC;QAC1B,IAAI,CAAC,GAAG,CAAC,YAAY,GAAG,YAAY,CAAC;IACvC,CAAC;IAED,UAAU;QACR,OAAO,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC;IAC1B,CAAC;IAED,YAAY;QACV,OAAO,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC;IAC5B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,CAAC,GAAG,CAAC,OAAO,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAE5C,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAEjD,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,MAAM,QAAQ,GAAG,GAAG,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC;QAElD,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,QAAQ,OAAO,CAAC,CAAC;QAC5D,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI
,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,QAAQ,KAAK,CAAC,CAAC;QAExD,MAAM,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QAChE,MAAM,EAAE,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,CAAC,mBAAmB,EAAE,CAAC,CAAC;QAEvD,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC;IAC9B,CAAC;IAEO,mBAAmB;QACzB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,CAAC,IAAI,CAAC,mBAAmB,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC;QACpD,KAAK,CAAC,IAAI,CAAC,aAAa,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACzC,KAAK,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC;QAC/C,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,GAAG,CAAC,OAAO,IAAI,aAAa,EAAE,CAAC,CAAC;QAC5D,KAAK,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,GAAG,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,KAAK,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC;QAClG,IAAI,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC;YAC1B,KAAK,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,CAAC;QACpD,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAEf,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC;YACrB,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YACzB,KAAK,CAAC,IAAI,CAAC,mBAAmB,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC;YACxF,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAC/D,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;YACrD,KAAK,CAAC,IAAI,CAAC,iBAAiB,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;YAC9E,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjB,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAEf,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC;YACrC,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAEhE,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;gBACnB,KAAK,MAAM,CAAC,CAAC,CAAC;oBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAwB,CAAC;oBAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,
CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;oBACzF,KAAK,CAAC,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,CAAC;oBACjC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACpB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACf,MAAM;gBACR,CAAC;gBACD,KAAK,UAAU,CAAC,CAAC,CAAC;oBAChB,SAAS,EAAE,CAAC;oBACZ,MAAM,IAAI,GAAG,KAAK,CAAC,IAAwC,CAAC;oBAC5D,KAAK,CAAC,IAAI,CAAC,QAAQ,IAAI,WAAW,SAAS,KAAK,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;oBAC7D,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;oBACrD,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACtB,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;oBACtF,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACf,MAAM;gBACR,CAAC;gBACD,KAAK,aAAa,CAAC,CAAC,CAAC;oBACnB,MAAM,IAAI,GAAG,KAAK,CAAC,IAA0C,CAAC;oBAC9D,KAAK,CAAC,IAAI,CAAC,kBAAkB,IAAI,CAAC,IAAI,MAAM,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;oBACnF,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACf,MAAM;gBACR,CAAC;gBACD,KAAK,WAAW,CAAC,CAAC,CAAC;oBACjB,MAAM,IAAI,GAAG,KAAK,CAAC,IAA8B,CAAC;oBAClD,KAAK,CAAC,IAAI,CAAC,QAAQ,IAAI,aAAa,CAAC,CAAC;oBACtC,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;wBACjC,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;4BAChD,MAAM,CAAC,GAAG,KAAuD,CAAC;4BAClE,IAAI,CAAC,CAAC,IAAI,KAAK,MAAM,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gCAChC,MAAM,OAAO,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;gCAChF,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gCAClB,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gCACpB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;4BACpB,CAAC;iCAAM,IAAI,CAAC,CAAC,IAAI,KAAK,UAAU,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gCAC3C,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;4BACrC,CAAC;wBACH,CAAC;oBACH,CAAC;oBACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;oBACf,MAAM;gBACR,CA
AC;YACH,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClB,KAAK,CAAC,IAAI,CAAC,qBAAqB,SAAS,EAAE,CAAC,CAAC;QAE7C,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;CACF;AAED,SAAS,cAAc,CAAC,EAAU;IAChC,IAAI,EAAE,GAAG,IAAI;QAAE,OAAO,GAAG,EAAE,IAAI,CAAC;IAChC,IAAI,EAAE,GAAG,KAAK;QAAE,OAAO,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;IACpD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,GAAG,KAAK,CAAC,CAAC;IACnC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,GAAG,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;IAC7C,OAAO,GAAG,GAAG,KAAK,GAAG,GAAG,CAAC;AAC3B,CAAC"}
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type definitions for the skill evaluation framework.
|
|
3
|
+
*/
|
|
4
|
+
export interface EvalCriteria {
|
|
5
|
+
dimension: 'discovery' | 'adherence' | 'output';
|
|
6
|
+
weight: number;
|
|
7
|
+
description: string;
|
|
8
|
+
}
|
|
9
|
+
export interface DeterministicCheck {
|
|
10
|
+
expectSkillActivation: boolean;
|
|
11
|
+
expectMarker?: string;
|
|
12
|
+
expectToolCalls?: string[];
|
|
13
|
+
expectNoToolCalls?: string[];
|
|
14
|
+
}
|
|
15
|
+
export interface FixtureConfig {
|
|
16
|
+
state: string;
|
|
17
|
+
setup?: string;
|
|
18
|
+
teardown?: string;
|
|
19
|
+
}
|
|
20
|
+
export interface EvalTask {
|
|
21
|
+
id: string;
|
|
22
|
+
prompt: string;
|
|
23
|
+
expectedSkillLoad: string;
|
|
24
|
+
criteria: EvalCriteria[];
|
|
25
|
+
goldenChecklist: string[];
|
|
26
|
+
deterministic?: DeterministicCheck;
|
|
27
|
+
fixture?: FixtureConfig;
|
|
28
|
+
}
|
|
29
|
+
export interface EvalDefaults {
|
|
30
|
+
expectedSkillLoad?: string;
|
|
31
|
+
criteria?: Partial<Record<'discovery' | 'adherence' | 'output', {
|
|
32
|
+
weight?: number;
|
|
33
|
+
description?: string;
|
|
34
|
+
}>>;
|
|
35
|
+
}
|
|
36
|
+
export interface SkillEvaluation {
|
|
37
|
+
skillName: string;
|
|
38
|
+
version?: string;
|
|
39
|
+
defaults?: EvalDefaults;
|
|
40
|
+
tasks: EvalTask[];
|
|
41
|
+
}
|
|
42
|
+
export interface ToolCallRecord {
|
|
43
|
+
tool: string;
|
|
44
|
+
toolUseId: string;
|
|
45
|
+
timestamp: number;
|
|
46
|
+
input?: unknown;
|
|
47
|
+
}
|
|
48
|
+
export interface TaskResult {
|
|
49
|
+
taskId: string;
|
|
50
|
+
prompt: string;
|
|
51
|
+
output: string;
|
|
52
|
+
durationMs: number;
|
|
53
|
+
numTurns: number;
|
|
54
|
+
costUsd: number;
|
|
55
|
+
skillLoads: string[];
|
|
56
|
+
toolCalls: ToolCallRecord[];
|
|
57
|
+
isError: boolean;
|
|
58
|
+
errorMessage: string;
|
|
59
|
+
}
|
|
60
|
+
export interface RunnerOptions {
|
|
61
|
+
cwd?: string;
|
|
62
|
+
parallel?: boolean;
|
|
63
|
+
model?: string;
|
|
64
|
+
settingSources?: Array<'user' | 'project' | 'local'>;
|
|
65
|
+
/** Count Read calls to SKILL.md as skill discovery (default: false) */
|
|
66
|
+
countReadAsFallback?: boolean;
|
|
67
|
+
/** Directories the agent is allowed to write to */
|
|
68
|
+
allowedWriteDirs?: string[];
|
|
69
|
+
}
|
|
70
|
+
export interface DeterministicResult {
|
|
71
|
+
skillActivated: boolean;
|
|
72
|
+
skillName?: string;
|
|
73
|
+
markerFound: boolean | null;
|
|
74
|
+
expectedToolsCalled: boolean | null;
|
|
75
|
+
unexpectedToolsCalled: boolean | null;
|
|
76
|
+
passed: boolean;
|
|
77
|
+
details: string[];
|
|
78
|
+
}
|
|
79
|
+
export type FailureCategory = 'discovery_failure' | 'false_positive' | 'instruction_ambiguity' | 'missing_guidance' | 'agent_error' | 'none';
|
|
80
|
+
export interface JudgeScore {
|
|
81
|
+
taskId: string;
|
|
82
|
+
discovery: number;
|
|
83
|
+
adherence: number;
|
|
84
|
+
outputQuality: number;
|
|
85
|
+
weightedScore: number;
|
|
86
|
+
failureCategory: FailureCategory;
|
|
87
|
+
reasoning: string;
|
|
88
|
+
}
|
|
89
|
+
export interface JudgeOptions {
|
|
90
|
+
model?: string;
|
|
91
|
+
outputTruncation?: number;
|
|
92
|
+
}
|
|
93
|
+
export interface CombinedScore {
|
|
94
|
+
taskId: string;
|
|
95
|
+
deterministic: DeterministicResult | null;
|
|
96
|
+
judge: JudgeScore | null;
|
|
97
|
+
discovery: number;
|
|
98
|
+
adherence: number;
|
|
99
|
+
outputQuality: number;
|
|
100
|
+
weightedScore: number;
|
|
101
|
+
failureCategory: FailureCategory;
|
|
102
|
+
reasoning: string;
|
|
103
|
+
}
|
|
104
|
+
/** One timestamped event recorded by the session logger. */
export interface SessionLogEntry {
    /** ISO-8601 time at which the entry was recorded. */
    timestamp: string;
    /** Kind of event; decides how the entry is rendered in the markdown log. */
    type: 'text' | 'tool_use' | 'tool_result' | 'assistant';
    /** Payload for the event; its shape depends on `type` and is not enforced here. */
    data: unknown;
}
|
|
109
|
+
/** Timing, token, cost and turn figures attached to a session log. */
export interface MetricsData {
    timestamp: string;
    task: string;
    /** Durations in milliseconds. */
    timing: {
        /** Shown as the session's "Duration" in the markdown log. */
        totalElapsedMs: number;
        sdkDurationMs: number;
        apiDurationMs: number;
        overheadMs: number;
    };
    /** Token counts. */
    tokens: {
        input: number;
        output: number;
        cacheRead: number;
        cacheCreation: number;
        /** Shown as "Tokens" in the markdown log. */
        total: number;
    };
    /** Shown as "Cost" with a `$` prefix and six decimals in the markdown log. */
    cost: number;
    /** Shown as "Turns" in the markdown log. */
    turns: number;
    /** Per-key usage figures; keys are presumably model names — TODO confirm against the producer. */
    modelUsage: Record<string, {
        inputTokens: number;
        outputTokens: number;
        costUSD: number;
    }>;
}
|
|
133
|
+
/** Complete record of one evaluation session, as serialised to the JSON log. */
export interface SessionLog {
    /** Identifier of the form `eval-<task>-<timestamp>`. */
    sessionId: string;
    /** Task the session was created for. */
    task: string;
    /** ISO-8601 time at which the logger was constructed. */
    startTime: string;
    /** ISO-8601 time at which the log was saved; absent until then. */
    endTime?: string;
    /** Starts as `'success'`; becomes `'error'` once the session is marked as failed. */
    status: 'success' | 'error';
    /** Message supplied when the session was marked as failed. */
    errorMessage?: string;
    /** Events in the order they were recorded. */
    entries: SessionLogEntry[];
    /** Run metrics, when they have been attached. */
    metrics?: MetricsData;
}
|
|
143
|
+
export interface EvaluationSummary {
|
|
144
|
+
totalTasks: number;
|
|
145
|
+
discoveryAccuracy: number;
|
|
146
|
+
avgAdherence: number;
|
|
147
|
+
avgOutputQuality: number;
|
|
148
|
+
avgWeightedScore: number;
|
|
149
|
+
totalDurationMs: number;
|
|
150
|
+
totalCostUsd: number;
|
|
151
|
+
}
|
|
152
|
+
export interface FailureBreakdown {
|
|
153
|
+
category: FailureCategory;
|
|
154
|
+
count: number;
|
|
155
|
+
percentage: number;
|
|
156
|
+
}
|
|
157
|
+
export interface ReportMetadata {
|
|
158
|
+
skillPath: string;
|
|
159
|
+
gitCommit?: string;
|
|
160
|
+
gitBranch?: string;
|
|
161
|
+
version?: string;
|
|
162
|
+
agentModel: string;
|
|
163
|
+
judgeModel: string;
|
|
164
|
+
}
|
|
165
|
+
export interface EvaluationReport {
|
|
166
|
+
skillName: string;
|
|
167
|
+
timestamp: string;
|
|
168
|
+
passed: boolean;
|
|
169
|
+
failureReasons: string[];
|
|
170
|
+
metadata?: ReportMetadata;
|
|
171
|
+
summary: EvaluationSummary;
|
|
172
|
+
failureBreakdown: FailureBreakdown[];
|
|
173
|
+
tasks: Array<{
|
|
174
|
+
task: EvalTask;
|
|
175
|
+
result: TaskResult;
|
|
176
|
+
score: CombinedScore;
|
|
177
|
+
sessionLogPath?: string;
|
|
178
|
+
}>;
|
|
179
|
+
}
|
|
180
|
+
export interface SdkTextBlock {
|
|
181
|
+
type: 'text';
|
|
182
|
+
text: string;
|
|
183
|
+
}
|
|
184
|
+
export interface SdkToolUseBlock {
|
|
185
|
+
type: 'tool_use';
|
|
186
|
+
id: string;
|
|
187
|
+
name: string;
|
|
188
|
+
input: Record<string, unknown>;
|
|
189
|
+
}
|
|
190
|
+
export type SdkContentBlock = SdkTextBlock | SdkToolUseBlock | {
|
|
191
|
+
type: string;
|
|
192
|
+
[key: string]: unknown;
|
|
193
|
+
};
|
|
194
|
+
export interface SdkAssistantMessage {
|
|
195
|
+
type: 'assistant';
|
|
196
|
+
message: {
|
|
197
|
+
content: SdkContentBlock[];
|
|
198
|
+
};
|
|
199
|
+
}
|
|
200
|
+
export interface SdkResultMessage {
|
|
201
|
+
type: 'result';
|
|
202
|
+
result?: string;
|
|
203
|
+
duration_ms: number;
|
|
204
|
+
duration_api_ms: number;
|
|
205
|
+
num_turns: number;
|
|
206
|
+
total_cost_usd: number;
|
|
207
|
+
usage: {
|
|
208
|
+
input_tokens: number;
|
|
209
|
+
output_tokens: number;
|
|
210
|
+
cache_read_input_tokens: number;
|
|
211
|
+
cache_creation_input_tokens: number;
|
|
212
|
+
};
|
|
213
|
+
modelUsage: Record<string, {
|
|
214
|
+
inputTokens: number;
|
|
215
|
+
outputTokens: number;
|
|
216
|
+
costUSD: number;
|
|
217
|
+
}>;
|
|
218
|
+
}
|
|
219
|
+
export type SdkStreamMessage = SdkAssistantMessage | SdkResultMessage | {
|
|
220
|
+
type: string;
|
|
221
|
+
[key: string]: unknown;
|
|
222
|
+
};
|
|
223
|
+
/** True when `msg` is a non-null object whose `type` is `'assistant'`; no other field is checked. */
export declare function isAssistantMessage(msg: unknown): msg is SdkAssistantMessage;
/** True when `msg` is a non-null object whose `type` is `'result'`; no other field is checked. */
export declare function isResultMessage(msg: unknown): msg is SdkResultMessage;
/** True when `block` is a non-null object whose `type` is `'text'`; no other field is checked. */
export declare function isTextBlock(block: unknown): block is SdkTextBlock;
/** True when `block` is a non-null object whose `type` is `'tool_use'`; no other field is checked. */
export declare function isToolUseBlock(block: unknown): block is SdkToolUseBlock;
|
|
227
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAMH,MAAM,WAAW,YAAY;IAC3B,SAAS,EAAE,WAAW,GAAG,WAAW,GAAG,QAAQ,CAAC;IAChD,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,kBAAkB;IACjC,qBAAqB,EAAE,OAAO,CAAC;IAC/B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,iBAAiB,CAAC,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,MAAM,WAAW,aAAa;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,iBAAiB,EAAE,MAAM,CAAC;IAC1B,QAAQ,EAAE,YAAY,EAAE,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,aAAa,CAAC,EAAE,kBAAkB,CAAC;IACnC,OAAO,CAAC,EAAE,aAAa,CAAC;CACzB;AAED,MAAM,WAAW,YAAY;IAC3B,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,WAAW,GAAG,WAAW,GAAG,QAAQ,EAAE;QAAE,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,WAAW,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC,CAAC;CAC7G;AAED,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,YAAY,CAAC;IACxB,KAAK,EAAE,QAAQ,EAAE,CAAC;CACnB;AAMD,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,SAAS,EAAE,cAAc,EAAE,CAAC;IAC5B,OAAO,EAAE,OAAO,CAAC;IACjB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,aAAa;IAC5B,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,cAAc,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,SAAS,GAAG,OAAO,CAAC,CAAC;IACrD,uEAAuE;IACvE,mBAAmB,CAAC,EAAE,OAAO,CAAC;IAC9B,mDAAmD;IACnD,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;CAC7B;AAMD,MAAM,WAAW,mBAAmB;IAClC,cAAc,EAAE,OAAO,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,OAAO,GAAG,IAAI,CAAC;IAC5B,mBAAmB,EAAE,OAAO,GAAG,IAAI,CAAC;IACpC,qBAAqB,EAAE,OAAO,GAAG,IAAI,CAAC;IACtC
,MAAM,EAAE,OAAO,CAAC;IAChB,OAAO,EAAE,MAAM,EAAE,CAAC;CACnB;AAMD,MAAM,MAAM,eAAe,GACvB,mBAAmB,GACnB,gBAAgB,GAChB,uBAAuB,GACvB,kBAAkB,GAClB,aAAa,GACb,MAAM,CAAC;AAEX,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,MAAM,CAAC;IACtB,eAAe,EAAE,eAAe,CAAC;IACjC,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,YAAY;IAC3B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAMD,MAAM,WAAW,aAAa;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,aAAa,EAAE,mBAAmB,GAAG,IAAI,CAAC;IAC1C,KAAK,EAAE,UAAU,GAAG,IAAI,CAAC;IAGzB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,MAAM,CAAC;IACtB,eAAe,EAAE,eAAe,CAAC;IACjC,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,GAAG,UAAU,GAAG,aAAa,GAAG,WAAW,CAAC;IACxD,IAAI,EAAE,OAAO,CAAC;CACf;AAED,MAAM,WAAW,WAAW;IAC1B,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE;QACN,cAAc,EAAE,MAAM,CAAC;QACvB,aAAa,EAAE,MAAM,CAAC;QACtB,aAAa,EAAE,MAAM,CAAC;QACtB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,MAAM,EAAE;QACN,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,SAAS,EAAE,MAAM,CAAC;QAClB,aAAa,EAAE,MAAM,CAAC;QACtB,KAAK,EAAE,MAAM,CAAC;KACf,CAAC;IACF,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE;QACzB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,OAAO,EAAE,MAAM,CAAC;KACjB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,SAAS,GAAG,OAAO,CAAC;IAC5B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,OAAO,CAAC,EAAE,WAAW,CAAC;CACvB;AAMD,MAAM,WAAW,iBAAiB;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,YAAY,EAAE,MAAM,CAAC;IACrB,gBAAgB,EAAE,MAAM,CAAC;IACzB,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,CAAC;IACxB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,eAAe,CAAC;IAC1B,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAA
W,cAAc;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,OAAO,CAAC;IAChB,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,QAAQ,CAAC,EAAE,cAAc,CAAC;IAC1B,OAAO,EAAE,iBAAiB,CAAC;IAC3B,gBAAgB,EAAE,gBAAgB,EAAE,CAAC;IACrC,KAAK,EAAE,KAAK,CAAC;QACX,IAAI,EAAE,QAAQ,CAAC;QACf,MAAM,EAAE,UAAU,CAAC;QACnB,KAAK,EAAE,aAAa,CAAC;QACrB,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,CAAC,CAAC;CACJ;AAMD,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,UAAU,CAAC;IACjB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAChC;AAED,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,eAAe,GAAG;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CAAE,CAAC;AAExG,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,WAAW,CAAC;IAClB,OAAO,EAAE;QACP,OAAO,EAAE,eAAe,EAAE,CAAC;KAC5B,CAAC;CACH;AAED,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,QAAQ,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,cAAc,EAAE,MAAM,CAAC;IACvB,KAAK,EAAE;QACL,YAAY,EAAE,MAAM,CAAC;QACrB,aAAa,EAAE,MAAM,CAAC;QACtB,uBAAuB,EAAE,MAAM,CAAC;QAChC,2BAA2B,EAAE,MAAM,CAAC;KACrC,CAAC;IACF,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE;QACzB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,OAAO,EAAE,MAAM,CAAC;KACjB,CAAC,CAAC;CACJ;AAED,MAAM,MAAM,gBAAgB,GAAG,mBAAmB,GAAG,gBAAgB,GAAG;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CAAE,CAAC;AAEjH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,OAAO,GAAG,GAAG,IAAI,mBAAmB,CAE3E;AAED,wBAAgB,eAAe,CAAC,GAAG,EAAE,OAAO,GAAG,GAAG,IAAI,gBAAgB,CAErE;AAED,wBAAgB,WAAW,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,YAAY,CAEjE;AAED,wBAAgB,cAAc,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,eAAe,CAEvE"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type definitions and runtime type guards for the skill evaluation framework.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Runtime check for an assistant message.
 *
 * Only the `type` discriminant is inspected; the rest of the object is not validated.
 *
 * @param msg Value to test.
 * @returns `true` when `msg` is a non-null object with `type === 'assistant'`.
 */
export function isAssistantMessage(msg) {
    if (msg === null || typeof msg !== 'object') {
        return false;
    }
    return msg.type === 'assistant';
}
|
|
7
|
+
/**
 * Runtime check for a result message.
 *
 * Only the `type` discriminant is inspected; the rest of the object is not validated.
 *
 * @param msg Value to test.
 * @returns `true` when `msg` is a non-null object with `type === 'result'`.
 */
export function isResultMessage(msg) {
    if (msg === null || typeof msg !== 'object') {
        return false;
    }
    return msg.type === 'result';
}
|
|
10
|
+
/**
 * Runtime check for a text content block.
 *
 * Only the `type` discriminant is inspected; the presence of `text` is not validated.
 *
 * @param block Value to test.
 * @returns `true` when `block` is a non-null object with `type === 'text'`.
 */
export function isTextBlock(block) {
    if (block === null || typeof block !== 'object') {
        return false;
    }
    return block.type === 'text';
}
|
|
13
|
+
/**
 * Runtime check for a tool-use content block.
 *
 * Only the `type` discriminant is inspected; `id`, `name` and `input` are not validated.
 *
 * @param block Value to test.
 * @returns `true` when `block` is a non-null object with `type === 'tool_use'`.
 */
export function isToolUseBlock(block) {
    if (block === null || typeof block !== 'object') {
        return false;
    }
    return block.type === 'tool_use';
}
|
|
16
|
+
//# sourceMappingURL=types.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAuRH,MAAM,UAAU,kBAAkB,CAAC,GAAY;IAC7C,OAAO,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,KAAK,IAAI,IAAK,GAAyB,CAAC,IAAI,KAAK,WAAW,CAAC;AACpG,CAAC;AAED,MAAM,UAAU,eAAe,CAAC,GAAY;IAC1C,OAAO,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,KAAK,IAAI,IAAK,GAAyB,CAAC,IAAI,KAAK,QAAQ,CAAC;AACjG,CAAC;AAED,MAAM,UAAU,WAAW,CAAC,KAAc;IACxC,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAK,KAA2B,CAAC,IAAI,KAAK,MAAM,CAAC;AACrG,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,KAAc;IAC3C,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAK,KAA2B,CAAC,IAAI,KAAK,UAAU,CAAC;AACzG,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@skilljack/evals",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "CLI for evaluating AI agent skill discoverability, adherence, and output quality. Runs as standalone CLI or GitHub Action.",
|
|
6
|
+
"bin": {
|
|
7
|
+
"skilljack-evals": "./dist/src/cli.js"
|
|
8
|
+
},
|
|
9
|
+
"main": "./dist/src/index.js",
|
|
10
|
+
"types": "./dist/src/index.d.ts",
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"import": "./dist/src/index.js",
|
|
14
|
+
"types": "./dist/src/index.d.ts"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"scripts": {
|
|
18
|
+
"build": "tsc",
|
|
19
|
+
"dev": "tsx src/cli.ts",
|
|
20
|
+
"start": "node dist/src/cli.js",
|
|
21
|
+
"clean": "rm -rf dist",
|
|
22
|
+
"typecheck": "tsc --noEmit"
|
|
23
|
+
},
|
|
24
|
+
"dependencies": {
|
|
25
|
+
"@anthropic-ai/claude-agent-sdk": "^0.1.42",
|
|
26
|
+
"commander": "^12.0.0",
|
|
27
|
+
"dotenv": "^16.4.0",
|
|
28
|
+
"js-yaml": "^4.1.0"
|
|
29
|
+
},
|
|
30
|
+
"devDependencies": {
|
|
31
|
+
"@actions/core": "^1.10.0",
|
|
32
|
+
"@types/js-yaml": "^4.0.9",
|
|
33
|
+
"@types/node": "^20.0.0",
|
|
34
|
+
"tsx": "^4.0.0",
|
|
35
|
+
"typescript": "^5.0.0"
|
|
36
|
+
},
|
|
37
|
+
"engines": {
|
|
38
|
+
"node": ">=20.0.0"
|
|
39
|
+
},
|
|
40
|
+
"files": [
|
|
41
|
+
"dist",
|
|
42
|
+
"action"
|
|
43
|
+
]
|
|
44
|
+
}
|