snapeval 2.2.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -13
- package/bin/snapeval.ts +7 -44
- package/dist/bin/snapeval.d.ts +1 -1
- package/dist/bin/snapeval.js +6 -42
- package/dist/bin/snapeval.js.map +1 -1
- package/dist/src/adapters/copilot-sdk-client.d.ts +0 -4
- package/dist/src/adapters/copilot-sdk-client.js +2 -23
- package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
- package/dist/src/adapters/harness/copilot-cli.js +1 -0
- package/dist/src/adapters/harness/copilot-cli.js.map +1 -1
- package/dist/src/adapters/harness/copilot-sdk.js +6 -16
- package/dist/src/adapters/harness/copilot-sdk.js.map +1 -1
- package/dist/src/adapters/harness/resolve.js +1 -5
- package/dist/src/adapters/harness/resolve.js.map +1 -1
- package/dist/src/adapters/inference/copilot-sdk.d.ts +1 -1
- package/dist/src/adapters/inference/copilot-sdk.js +4 -2
- package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
- package/dist/src/adapters/inference/github-models.js +3 -0
- package/dist/src/adapters/inference/github-models.js.map +1 -1
- package/dist/src/adapters/inference/resolve.js +6 -32
- package/dist/src/adapters/inference/resolve.js.map +1 -1
- package/dist/src/commands/eval.d.ts +1 -0
- package/dist/src/commands/eval.js +8 -0
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/errors.d.ts +0 -6
- package/dist/src/errors.js +1 -13
- package/dist/src/errors.js.map +1 -1
- package/package.json +8 -11
- package/plugin.json +4 -2
- package/skills/create-evals/SKILL.md +152 -0
- package/skills/run-evals/SKILL.md +132 -0
- package/src/adapters/copilot-sdk-client.ts +2 -22
- package/src/adapters/harness/copilot-cli.ts +1 -0
- package/src/adapters/harness/copilot-sdk.ts +6 -17
- package/src/adapters/harness/resolve.ts +1 -8
- package/src/adapters/inference/copilot-sdk.ts +4 -2
- package/src/adapters/inference/github-models.ts +3 -0
- package/src/adapters/inference/resolve.ts +8 -43
- package/src/commands/eval.ts +14 -1
- package/src/errors.ts +1 -15
- package/assets/ideation-viewer.html +0 -469
- package/dist/src/adapters/inference/copilot.d.ts +0 -5
- package/dist/src/adapters/inference/copilot.js +0 -10
- package/dist/src/adapters/inference/copilot.js.map +0 -1
- package/dist/src/commands/review.d.ts +0 -8
- package/dist/src/commands/review.js +0 -32
- package/dist/src/commands/review.js.map +0 -1
- package/src/adapters/inference/copilot.ts +0 -12
- package/src/commands/review.ts +0 -46
|
@@ -1,19 +1,7 @@
|
|
|
1
|
-
import { execFileSync } from 'node:child_process';
|
|
2
1
|
import type { InferenceAdapter } from '../../types.js';
|
|
3
2
|
import { AdapterNotAvailableError } from '../../errors.js';
|
|
4
3
|
import { GitHubModelsInference } from './github-models.js';
|
|
5
|
-
import { CopilotInference } from './copilot.js';
|
|
6
4
|
import { CopilotSDKInference } from './copilot-sdk.js';
|
|
7
|
-
import { isSDKInstalled } from '../copilot-sdk-client.js';
|
|
8
|
-
|
|
9
|
-
function isCopilotAvailable(): boolean {
|
|
10
|
-
try {
|
|
11
|
-
execFileSync('copilot', ['--version'], { encoding: 'utf-8', stdio: 'pipe' });
|
|
12
|
-
return true;
|
|
13
|
-
} catch {
|
|
14
|
-
return false;
|
|
15
|
-
}
|
|
16
|
-
}
|
|
17
5
|
|
|
18
6
|
function isGitHubTokenAvailable(): boolean {
|
|
19
7
|
return Boolean(process.env.GITHUB_TOKEN);
|
|
@@ -21,31 +9,18 @@ function isGitHubTokenAvailable(): boolean {
|
|
|
21
9
|
|
|
22
10
|
export function resolveInference(preference: string): InferenceAdapter {
|
|
23
11
|
if (preference === 'auto') {
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
if (copilotAvailable) {
|
|
28
|
-
return new CopilotInference();
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
if (tokenAvailable) {
|
|
32
|
-
return new GitHubModelsInference();
|
|
33
|
-
}
|
|
12
|
+
return new CopilotSDKInference();
|
|
13
|
+
}
|
|
34
14
|
|
|
15
|
+
if (preference === 'copilot') {
|
|
35
16
|
throw new AdapterNotAvailableError(
|
|
36
|
-
'
|
|
37
|
-
'
|
|
17
|
+
'copilot',
|
|
18
|
+
'The copilot CLI inference adapter has been removed. Use --inference copilot-sdk instead.'
|
|
38
19
|
);
|
|
39
20
|
}
|
|
40
21
|
|
|
41
|
-
if (preference === 'copilot') {
|
|
42
|
-
|
|
43
|
-
throw new AdapterNotAvailableError(
|
|
44
|
-
'copilot',
|
|
45
|
-
'GitHub Copilot CLI is not available. Install with: npm install -g @github/copilot'
|
|
46
|
-
);
|
|
47
|
-
}
|
|
48
|
-
return new CopilotInference();
|
|
22
|
+
if (preference === 'copilot-sdk') {
|
|
23
|
+
return new CopilotSDKInference();
|
|
49
24
|
}
|
|
50
25
|
|
|
51
26
|
if (preference === 'github-models') {
|
|
@@ -58,18 +33,8 @@ export function resolveInference(preference: string): InferenceAdapter {
|
|
|
58
33
|
return new GitHubModelsInference();
|
|
59
34
|
}
|
|
60
35
|
|
|
61
|
-
if (preference === 'copilot-sdk') {
|
|
62
|
-
if (!isSDKInstalled()) {
|
|
63
|
-
throw new AdapterNotAvailableError(
|
|
64
|
-
'copilot-sdk',
|
|
65
|
-
'@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk'
|
|
66
|
-
);
|
|
67
|
-
}
|
|
68
|
-
return new CopilotSDKInference();
|
|
69
|
-
}
|
|
70
|
-
|
|
71
36
|
throw new AdapterNotAvailableError(
|
|
72
37
|
preference,
|
|
73
|
-
`Unknown inference adapter "${preference}". Valid options: auto, copilot
|
|
38
|
+
`Unknown inference adapter "${preference}". Valid options: auto, copilot-sdk, github-models.`
|
|
74
39
|
);
|
|
75
40
|
}
|
package/src/commands/eval.ts
CHANGED
|
@@ -7,6 +7,7 @@ import type {
|
|
|
7
7
|
EvalResults,
|
|
8
8
|
EvalRunResult,
|
|
9
9
|
GradingResult,
|
|
10
|
+
FeedbackData,
|
|
10
11
|
} from '../types.js';
|
|
11
12
|
import { WorkspaceManager } from '../engine/workspace.js';
|
|
12
13
|
import { runEval } from '../engine/runner.js';
|
|
@@ -86,7 +87,7 @@ export async function evalCommand(
|
|
|
86
87
|
skillPath: string,
|
|
87
88
|
harness: Harness,
|
|
88
89
|
inference: InferenceAdapter,
|
|
89
|
-
options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number }
|
|
90
|
+
options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number; feedback?: boolean }
|
|
90
91
|
): Promise<EvalResults> {
|
|
91
92
|
const evalsPath = path.join(skillPath, 'evals', 'evals.json');
|
|
92
93
|
if (!fs.existsSync(evalsPath)) {
|
|
@@ -224,6 +225,18 @@ export async function evalCommand(
|
|
|
224
225
|
typeof value === 'number' ? Math.round(value * 10000) / 10000 : value, 2)
|
|
225
226
|
);
|
|
226
227
|
|
|
228
|
+
// Write feedback template if requested
|
|
229
|
+
if (options.feedback) {
|
|
230
|
+
const feedback: FeedbackData = {};
|
|
231
|
+
for (const run of evalRuns) {
|
|
232
|
+
feedback[`eval-${run.slug}`] = '';
|
|
233
|
+
}
|
|
234
|
+
fs.writeFileSync(
|
|
235
|
+
path.join(iterationDir, 'feedback.json'),
|
|
236
|
+
JSON.stringify(feedback, null, 2)
|
|
237
|
+
);
|
|
238
|
+
}
|
|
239
|
+
|
|
227
240
|
// Check threshold if set (for CI gating)
|
|
228
241
|
if (options.threshold !== undefined) {
|
|
229
242
|
const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
|
package/src/errors.ts
CHANGED
|
@@ -35,21 +35,7 @@ export class AdapterNotAvailableError extends SnapevalError {
|
|
|
35
35
|
|
|
36
36
|
export class RateLimitError extends SnapevalError {
|
|
37
37
|
constructor(adapterName: string) {
|
|
38
|
-
super(`${adapterName} rate limit exceeded. Try again later or use a different adapter
|
|
38
|
+
super(`${adapterName} rate limit exceeded. Try again later or use a different adapter.`, 4);
|
|
39
39
|
this.name = 'RateLimitError';
|
|
40
40
|
}
|
|
41
41
|
}
|
|
42
|
-
|
|
43
|
-
export class TimeoutError extends SnapevalError {
|
|
44
|
-
constructor(evalId: number, timeoutMs: number) {
|
|
45
|
-
super(`Eval ${evalId} timed out after ${timeoutMs}ms.`, 4);
|
|
46
|
-
this.name = 'TimeoutError';
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
export class GradingError extends SnapevalError {
|
|
51
|
-
constructor(evalId: number, detail: string) {
|
|
52
|
-
super(`Grading failed for eval ${evalId}: ${detail}`, 4);
|
|
53
|
-
this.name = 'GradingError';
|
|
54
|
-
}
|
|
55
|
-
}
|
|
@@ -1,469 +0,0 @@
|
|
|
1
|
-
<!DOCTYPE html>
|
|
2
|
-
<html lang="en">
|
|
3
|
-
<head>
|
|
4
|
-
<meta charset="UTF-8">
|
|
5
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
-
<title>snapeval — Scenario Ideation</title>
|
|
7
|
-
<style>
|
|
8
|
-
/* === Reset & Base === */
|
|
9
|
-
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
|
10
|
-
body {
|
|
11
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
12
|
-
background: #faf9f6;
|
|
13
|
-
color: #1a1a1a;
|
|
14
|
-
line-height: 1.6;
|
|
15
|
-
padding: 2rem;
|
|
16
|
-
max-width: 1200px;
|
|
17
|
-
margin: 0 auto;
|
|
18
|
-
}
|
|
19
|
-
code, pre, .mono { font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace; }
|
|
20
|
-
|
|
21
|
-
/* === Colors (shared with eval viewer) === */
|
|
22
|
-
:root {
|
|
23
|
-
--accent: #2563eb;
|
|
24
|
-
--accent-light: #dbeafe;
|
|
25
|
-
--pass: #16a34a;
|
|
26
|
-
--pass-bg: #dcfce7;
|
|
27
|
-
--fail: #dc2626;
|
|
28
|
-
--fail-bg: #fee2e2;
|
|
29
|
-
--warn: #ca8a04;
|
|
30
|
-
--warn-bg: #fef9c3;
|
|
31
|
-
--gray: #6b7280;
|
|
32
|
-
--gray-light: #f3f4f6;
|
|
33
|
-
--gray-border: #e5e7eb;
|
|
34
|
-
--bg: #faf9f6;
|
|
35
|
-
--card-bg: #ffffff;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
/* === Layout === */
|
|
39
|
-
header { margin-bottom: 2rem; }
|
|
40
|
-
header h1 { font-size: 1.5rem; font-weight: 600; }
|
|
41
|
-
header .subtitle { color: var(--gray); font-size: 0.875rem; margin-top: 0.25rem; }
|
|
42
|
-
.stats { display: flex; gap: 1.5rem; margin-top: 1rem; flex-wrap: wrap; }
|
|
43
|
-
.stat { background: var(--card-bg); border: 1px solid var(--gray-border); border-radius: 8px; padding: 0.75rem 1rem; }
|
|
44
|
-
.stat-value { font-size: 1.25rem; font-weight: 600; }
|
|
45
|
-
.stat-label { font-size: 0.75rem; color: var(--gray); text-transform: uppercase; letter-spacing: 0.05em; }
|
|
46
|
-
|
|
47
|
-
/* === Sections === */
|
|
48
|
-
section { margin-bottom: 2rem; }
|
|
49
|
-
section h2 { font-size: 1.125rem; font-weight: 600; margin-bottom: 1rem; padding-bottom: 0.5rem; border-bottom: 1px solid var(--gray-border); }
|
|
50
|
-
section h3 { font-size: 0.875rem; font-weight: 600; color: var(--gray); text-transform: uppercase; letter-spacing: 0.05em; margin-bottom: 0.75rem; }
|
|
51
|
-
|
|
52
|
-
/* === Cards === */
|
|
53
|
-
.card-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 1rem; }
|
|
54
|
-
.card {
|
|
55
|
-
background: var(--card-bg);
|
|
56
|
-
border: 1px solid var(--gray-border);
|
|
57
|
-
border-radius: 8px;
|
|
58
|
-
padding: 1rem;
|
|
59
|
-
transition: border-color 0.15s;
|
|
60
|
-
}
|
|
61
|
-
.card:hover { border-color: var(--accent); }
|
|
62
|
-
.card-title { font-weight: 600; font-size: 0.9rem; margin-bottom: 0.5rem; }
|
|
63
|
-
.card-desc { font-size: 0.85rem; color: var(--gray); }
|
|
64
|
-
|
|
65
|
-
/* === Ambiguities === */
|
|
66
|
-
.ambiguity-card { border-left: 3px solid var(--warn); }
|
|
67
|
-
.ambiguity-why { font-size: 0.8rem; color: var(--gray); margin: 0.5rem 0; font-style: italic; }
|
|
68
|
-
.scope-toggle { display: flex; gap: 0.5rem; margin-top: 0.75rem; }
|
|
69
|
-
.scope-btn {
|
|
70
|
-
padding: 0.3rem 0.75rem;
|
|
71
|
-
border: 1px solid var(--gray-border);
|
|
72
|
-
border-radius: 4px;
|
|
73
|
-
background: var(--card-bg);
|
|
74
|
-
cursor: pointer;
|
|
75
|
-
font-size: 0.8rem;
|
|
76
|
-
transition: all 0.15s;
|
|
77
|
-
}
|
|
78
|
-
.scope-btn:hover { border-color: var(--accent); }
|
|
79
|
-
.scope-btn.active-in { background: var(--pass-bg); border-color: var(--pass); color: var(--pass); }
|
|
80
|
-
.scope-btn.active-out { background: var(--gray-light); border-color: var(--gray); color: var(--gray); }
|
|
81
|
-
|
|
82
|
-
/* === Scenario Cards === */
|
|
83
|
-
.scenario-card { position: relative; }
|
|
84
|
-
.scenario-card.disabled { opacity: 0.5; }
|
|
85
|
-
.scenario-toggle {
|
|
86
|
-
position: absolute;
|
|
87
|
-
top: 1rem;
|
|
88
|
-
right: 1rem;
|
|
89
|
-
width: 40px;
|
|
90
|
-
height: 22px;
|
|
91
|
-
background: var(--pass);
|
|
92
|
-
border-radius: 11px;
|
|
93
|
-
cursor: pointer;
|
|
94
|
-
border: none;
|
|
95
|
-
transition: background 0.2s;
|
|
96
|
-
}
|
|
97
|
-
.scenario-toggle.off { background: var(--gray); }
|
|
98
|
-
.scenario-toggle::after {
|
|
99
|
-
content: '';
|
|
100
|
-
position: absolute;
|
|
101
|
-
top: 2px;
|
|
102
|
-
left: 2px;
|
|
103
|
-
width: 18px;
|
|
104
|
-
height: 18px;
|
|
105
|
-
background: white;
|
|
106
|
-
border-radius: 50%;
|
|
107
|
-
transition: transform 0.2s;
|
|
108
|
-
}
|
|
109
|
-
.scenario-toggle.off::after { transform: translateX(0); }
|
|
110
|
-
.scenario-toggle:not(.off)::after { transform: translateX(18px); }
|
|
111
|
-
|
|
112
|
-
.scenario-prompt {
|
|
113
|
-
background: var(--gray-light);
|
|
114
|
-
border-radius: 4px;
|
|
115
|
-
padding: 0.75rem;
|
|
116
|
-
font-family: monospace;
|
|
117
|
-
font-size: 0.85rem;
|
|
118
|
-
margin: 0.75rem 0;
|
|
119
|
-
white-space: pre-wrap;
|
|
120
|
-
word-break: break-word;
|
|
121
|
-
}
|
|
122
|
-
.scenario-why { font-size: 0.8rem; color: var(--gray); margin-bottom: 0.5rem; }
|
|
123
|
-
.scenario-expected { font-size: 0.85rem; margin-top: 0.5rem; }
|
|
124
|
-
.scenario-expected strong { font-weight: 600; }
|
|
125
|
-
|
|
126
|
-
.editable {
|
|
127
|
-
border: 1px solid transparent;
|
|
128
|
-
border-radius: 4px;
|
|
129
|
-
padding: 0.25rem;
|
|
130
|
-
transition: border-color 0.15s;
|
|
131
|
-
cursor: text;
|
|
132
|
-
}
|
|
133
|
-
.editable:hover { border-color: var(--gray-border); }
|
|
134
|
-
.editable:focus { border-color: var(--accent); outline: none; background: var(--accent-light); }
|
|
135
|
-
|
|
136
|
-
/* === Add Scenario === */
|
|
137
|
-
.add-form {
|
|
138
|
-
background: var(--card-bg);
|
|
139
|
-
border: 2px dashed var(--gray-border);
|
|
140
|
-
border-radius: 8px;
|
|
141
|
-
padding: 1.25rem;
|
|
142
|
-
margin-top: 1rem;
|
|
143
|
-
}
|
|
144
|
-
.add-form label { display: block; font-size: 0.85rem; font-weight: 600; margin-bottom: 0.25rem; margin-top: 0.75rem; }
|
|
145
|
-
.add-form label:first-child { margin-top: 0; }
|
|
146
|
-
.add-form textarea, .add-form input[type="text"] {
|
|
147
|
-
width: 100%;
|
|
148
|
-
padding: 0.5rem;
|
|
149
|
-
border: 1px solid var(--gray-border);
|
|
150
|
-
border-radius: 4px;
|
|
151
|
-
font-family: inherit;
|
|
152
|
-
font-size: 0.85rem;
|
|
153
|
-
resize: vertical;
|
|
154
|
-
}
|
|
155
|
-
.add-form textarea:focus, .add-form input[type="text"]:focus { border-color: var(--accent); outline: none; }
|
|
156
|
-
|
|
157
|
-
/* === Notes === */
|
|
158
|
-
#user-notes {
|
|
159
|
-
width: 100%;
|
|
160
|
-
min-height: 80px;
|
|
161
|
-
padding: 0.75rem;
|
|
162
|
-
border: 1px solid var(--gray-border);
|
|
163
|
-
border-radius: 8px;
|
|
164
|
-
font-family: inherit;
|
|
165
|
-
font-size: 0.85rem;
|
|
166
|
-
resize: vertical;
|
|
167
|
-
}
|
|
168
|
-
#user-notes:focus { border-color: var(--accent); outline: none; }
|
|
169
|
-
|
|
170
|
-
/* === Buttons === */
|
|
171
|
-
.btn {
|
|
172
|
-
display: inline-flex;
|
|
173
|
-
align-items: center;
|
|
174
|
-
gap: 0.5rem;
|
|
175
|
-
padding: 0.6rem 1.25rem;
|
|
176
|
-
border: none;
|
|
177
|
-
border-radius: 6px;
|
|
178
|
-
font-size: 0.9rem;
|
|
179
|
-
font-weight: 500;
|
|
180
|
-
cursor: pointer;
|
|
181
|
-
transition: all 0.15s;
|
|
182
|
-
}
|
|
183
|
-
.btn-primary { background: var(--accent); color: white; }
|
|
184
|
-
.btn-primary:hover { background: #1d4ed8; }
|
|
185
|
-
.btn-secondary { background: var(--gray-light); color: var(--gray); border: 1px solid var(--gray-border); }
|
|
186
|
-
.btn-secondary:hover { background: var(--gray-border); }
|
|
187
|
-
.btn-add { background: var(--pass-bg); color: var(--pass); border: 1px solid var(--pass); }
|
|
188
|
-
.btn-add:hover { background: var(--pass); color: white; }
|
|
189
|
-
|
|
190
|
-
.actions { display: flex; gap: 1rem; margin-top: 2rem; padding-top: 1.5rem; border-top: 2px solid var(--gray-border); }
|
|
191
|
-
.actions .spacer { flex: 1; }
|
|
192
|
-
</style>
|
|
193
|
-
</head>
|
|
194
|
-
<body>
|
|
195
|
-
|
|
196
|
-
<script>
|
|
197
|
-
const DATA = __ANALYSIS_DATA_PLACEHOLDER__;
|
|
198
|
-
|
|
199
|
-
// State
|
|
200
|
-
const state = {
|
|
201
|
-
scenarios: DATA.scenarios.map(s => ({ ...s })),
|
|
202
|
-
ambiguityDecisions: DATA.ambiguities.map(a => ({ description: a.description, decision: a.in_scope === true ? 'in_scope' : a.in_scope === false ? 'out_of_scope' : null })),
|
|
203
|
-
customScenarios: [],
|
|
204
|
-
userNotes: '',
|
|
205
|
-
};
|
|
206
|
-
|
|
207
|
-
function render() {
|
|
208
|
-
document.getElementById('app').innerHTML = `
|
|
209
|
-
${renderHeader()}
|
|
210
|
-
${renderSkillMap()}
|
|
211
|
-
${renderAmbiguities()}
|
|
212
|
-
${renderScenarios()}
|
|
213
|
-
${renderAddForm()}
|
|
214
|
-
${renderNotes()}
|
|
215
|
-
${renderActions()}
|
|
216
|
-
`;
|
|
217
|
-
bindEvents();
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
function renderHeader() {
|
|
221
|
-
const enabled = state.scenarios.filter(s => s.enabled).length;
|
|
222
|
-
return `
|
|
223
|
-
<header>
|
|
224
|
-
<h1>snapeval — ${DATA.skill_name}</h1>
|
|
225
|
-
<div class="subtitle">Interactive Scenario Ideation</div>
|
|
226
|
-
<div class="stats">
|
|
227
|
-
<div class="stat"><div class="stat-value">${DATA.behaviors.length}</div><div class="stat-label">Behaviors</div></div>
|
|
228
|
-
<div class="stat"><div class="stat-value">${DATA.dimensions.length}</div><div class="stat-label">Dimensions</div></div>
|
|
229
|
-
<div class="stat"><div class="stat-value">${enabled} / ${state.scenarios.length + state.customScenarios.length}</div><div class="stat-label">Scenarios</div></div>
|
|
230
|
-
<div class="stat"><div class="stat-value">${DATA.ambiguities.length}</div><div class="stat-label">Ambiguities</div></div>
|
|
231
|
-
</div>
|
|
232
|
-
</header>
|
|
233
|
-
`;
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
function renderSkillMap() {
|
|
237
|
-
const behaviorCards = DATA.behaviors.map(b => `
|
|
238
|
-
<div class="card">
|
|
239
|
-
<div class="card-title">${esc(b.name)}</div>
|
|
240
|
-
<div class="card-desc">${esc(b.description)}</div>
|
|
241
|
-
</div>
|
|
242
|
-
`).join('');
|
|
243
|
-
|
|
244
|
-
const dimensionCards = DATA.dimensions.map(d => `
|
|
245
|
-
<div class="card">
|
|
246
|
-
<div class="card-title">${esc(d.name)}</div>
|
|
247
|
-
<div class="card-desc">${d.values.map(v => esc(v)).join(', ')}</div>
|
|
248
|
-
</div>
|
|
249
|
-
`).join('');
|
|
250
|
-
|
|
251
|
-
return `
|
|
252
|
-
<section>
|
|
253
|
-
<h2>Skill Map</h2>
|
|
254
|
-
<h3>Behaviors</h3>
|
|
255
|
-
<div class="card-grid">${behaviorCards}</div>
|
|
256
|
-
<h3 style="margin-top:1.5rem">Input Dimensions</h3>
|
|
257
|
-
<div class="card-grid">${dimensionCards}</div>
|
|
258
|
-
</section>
|
|
259
|
-
`;
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
function renderAmbiguities() {
|
|
263
|
-
if (DATA.ambiguities.length === 0) return '';
|
|
264
|
-
const cards = DATA.ambiguities.map((a, i) => {
|
|
265
|
-
const decision = state.ambiguityDecisions[i]?.decision;
|
|
266
|
-
return `
|
|
267
|
-
<div class="card ambiguity-card">
|
|
268
|
-
<div class="card-title">${esc(a.description)}</div>
|
|
269
|
-
<div class="ambiguity-why">${esc(a.why_it_matters)}</div>
|
|
270
|
-
<div class="scope-toggle">
|
|
271
|
-
<button class="scope-btn ${decision === 'in_scope' ? 'active-in' : ''}" data-amb-idx="${i}" data-decision="in_scope">In Scope</button>
|
|
272
|
-
<button class="scope-btn ${decision === 'out_of_scope' ? 'active-out' : ''}" data-amb-idx="${i}" data-decision="out_of_scope">Out of Scope</button>
|
|
273
|
-
</div>
|
|
274
|
-
</div>
|
|
275
|
-
`;
|
|
276
|
-
}).join('');
|
|
277
|
-
|
|
278
|
-
return `
|
|
279
|
-
<section>
|
|
280
|
-
<h2>Gaps & Ambiguities</h2>
|
|
281
|
-
<div class="card-grid">${cards}</div>
|
|
282
|
-
</section>
|
|
283
|
-
`;
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
function renderScenarios() {
|
|
287
|
-
const cards = state.scenarios.map((s, i) => `
|
|
288
|
-
<div class="card scenario-card ${s.enabled ? '' : 'disabled'}">
|
|
289
|
-
<button class="scenario-toggle ${s.enabled ? '' : 'off'}" data-scenario-idx="${i}" title="${s.enabled ? 'Enabled' : 'Disabled'}"></button>
|
|
290
|
-
<div class="card-title">Scenario ${s.id}</div>
|
|
291
|
-
<div class="scenario-why">${esc(s.why)}</div>
|
|
292
|
-
<div class="scenario-prompt editable" contenteditable="true" data-field="prompt" data-scenario-idx="${i}">${esc(s.prompt)}</div>
|
|
293
|
-
<div class="scenario-expected"><strong>Expected:</strong> <span class="editable" contenteditable="true" data-field="expected_behavior" data-scenario-idx="${i}">${esc(s.expected_behavior)}</span></div>
|
|
294
|
-
</div>
|
|
295
|
-
`).join('');
|
|
296
|
-
|
|
297
|
-
const customCards = state.customScenarios.map((s, i) => `
|
|
298
|
-
<div class="card scenario-card" style="border-color:var(--pass)">
|
|
299
|
-
<div class="card-title" style="color:var(--pass)">Custom #${i + 1} <button class="btn-secondary" style="font-size:0.7rem;padding:0.15rem 0.4rem;margin-left:0.5rem" data-remove-custom="${i}">Remove</button></div>
|
|
300
|
-
<div class="scenario-prompt">${esc(s.prompt)}</div>
|
|
301
|
-
<div class="scenario-expected"><strong>Expected:</strong> ${esc(s.expected_behavior)}</div>
|
|
302
|
-
</div>
|
|
303
|
-
`).join('');
|
|
304
|
-
|
|
305
|
-
return `
|
|
306
|
-
<section>
|
|
307
|
-
<h2>Proposed Scenarios</h2>
|
|
308
|
-
<div class="card-grid">${cards}${customCards}</div>
|
|
309
|
-
</section>
|
|
310
|
-
`;
|
|
311
|
-
}
|
|
312
|
-
|
|
313
|
-
function renderAddForm() {
|
|
314
|
-
return `
|
|
315
|
-
<section>
|
|
316
|
-
<h2>Add Custom Scenario</h2>
|
|
317
|
-
<div class="add-form">
|
|
318
|
-
<label for="custom-prompt">User Prompt</label>
|
|
319
|
-
<textarea id="custom-prompt" rows="3" placeholder="Type a realistic user prompt..."></textarea>
|
|
320
|
-
<label for="custom-expected">Expected Behavior</label>
|
|
321
|
-
<input type="text" id="custom-expected" placeholder="What should happen?" />
|
|
322
|
-
<div style="margin-top:0.75rem">
|
|
323
|
-
<button class="btn btn-add" id="add-scenario-btn">Add Scenario</button>
|
|
324
|
-
</div>
|
|
325
|
-
</div>
|
|
326
|
-
</section>
|
|
327
|
-
`;
|
|
328
|
-
}
|
|
329
|
-
|
|
330
|
-
function renderNotes() {
|
|
331
|
-
return `
|
|
332
|
-
<section>
|
|
333
|
-
<h2>Notes for AI</h2>
|
|
334
|
-
<textarea id="user-notes" placeholder="Add any context, constraints, or known issues you want the AI to consider...">${esc(state.userNotes)}</textarea>
|
|
335
|
-
</section>
|
|
336
|
-
`;
|
|
337
|
-
}
|
|
338
|
-
|
|
339
|
-
function renderActions() {
|
|
340
|
-
const enabledCount = state.scenarios.filter(s => s.enabled).length + state.customScenarios.length;
|
|
341
|
-
return `
|
|
342
|
-
<div class="actions">
|
|
343
|
-
<span style="color:var(--gray);font-size:0.85rem;align-self:center">${enabledCount} scenario${enabledCount !== 1 ? 's' : ''} will be exported</span>
|
|
344
|
-
<span class="spacer"></span>
|
|
345
|
-
<button class="btn btn-primary" id="confirm-btn">Confirm & Run</button>
|
|
346
|
-
</div>
|
|
347
|
-
`;
|
|
348
|
-
}
|
|
349
|
-
|
|
350
|
-
function bindEvents() {
|
|
351
|
-
// Scenario toggles
|
|
352
|
-
document.querySelectorAll('.scenario-toggle').forEach(btn => {
|
|
353
|
-
btn.addEventListener('click', () => {
|
|
354
|
-
const idx = parseInt(btn.dataset.scenarioIdx);
|
|
355
|
-
state.scenarios[idx].enabled = !state.scenarios[idx].enabled;
|
|
356
|
-
render();
|
|
357
|
-
});
|
|
358
|
-
});
|
|
359
|
-
|
|
360
|
-
// Ambiguity scope buttons
|
|
361
|
-
document.querySelectorAll('.scope-btn').forEach(btn => {
|
|
362
|
-
btn.addEventListener('click', () => {
|
|
363
|
-
const idx = parseInt(btn.dataset.ambIdx);
|
|
364
|
-
const decision = btn.dataset.decision;
|
|
365
|
-
const current = state.ambiguityDecisions[idx].decision;
|
|
366
|
-
state.ambiguityDecisions[idx].decision = current === decision ? null : decision;
|
|
367
|
-
render();
|
|
368
|
-
});
|
|
369
|
-
});
|
|
370
|
-
|
|
371
|
-
// Editable fields (blur saves)
|
|
372
|
-
document.querySelectorAll('.editable[data-scenario-idx]').forEach(el => {
|
|
373
|
-
el.addEventListener('blur', () => {
|
|
374
|
-
const idx = parseInt(el.dataset.scenarioIdx);
|
|
375
|
-
const field = el.dataset.field;
|
|
376
|
-
state.scenarios[idx][field] = el.textContent.trim();
|
|
377
|
-
});
|
|
378
|
-
});
|
|
379
|
-
|
|
380
|
-
// Remove custom scenario
|
|
381
|
-
document.querySelectorAll('[data-remove-custom]').forEach(btn => {
|
|
382
|
-
btn.addEventListener('click', () => {
|
|
383
|
-
const idx = parseInt(btn.dataset.removeCustom);
|
|
384
|
-
state.customScenarios.splice(idx, 1);
|
|
385
|
-
render();
|
|
386
|
-
});
|
|
387
|
-
});
|
|
388
|
-
|
|
389
|
-
// Add scenario
|
|
390
|
-
const addBtn = document.getElementById('add-scenario-btn');
|
|
391
|
-
if (addBtn) {
|
|
392
|
-
addBtn.addEventListener('click', () => {
|
|
393
|
-
const prompt = document.getElementById('custom-prompt').value.trim();
|
|
394
|
-
const expected = document.getElementById('custom-expected').value.trim();
|
|
395
|
-
if (!prompt) return;
|
|
396
|
-
state.customScenarios.push({ prompt, expected_behavior: expected || 'Not specified' });
|
|
397
|
-
render();
|
|
398
|
-
});
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
// Notes
|
|
402
|
-
const notes = document.getElementById('user-notes');
|
|
403
|
-
if (notes) {
|
|
404
|
-
notes.addEventListener('input', () => { state.userNotes = notes.value; });
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
// Confirm & Run
|
|
408
|
-
const confirmBtn = document.getElementById('confirm-btn');
|
|
409
|
-
if (confirmBtn) {
|
|
410
|
-
confirmBtn.addEventListener('click', exportPlan);
|
|
411
|
-
}
|
|
412
|
-
}
|
|
413
|
-
|
|
414
|
-
function exportPlan() {
|
|
415
|
-
const plan = {
|
|
416
|
-
version: 1,
|
|
417
|
-
confirmed_scenarios: state.scenarios
|
|
418
|
-
.filter(s => s.enabled)
|
|
419
|
-
.map(s => ({
|
|
420
|
-
id: s.id,
|
|
421
|
-
prompt: s.prompt,
|
|
422
|
-
expected_behavior: s.expected_behavior,
|
|
423
|
-
covers: s.covers,
|
|
424
|
-
why: s.why,
|
|
425
|
-
})),
|
|
426
|
-
custom_scenarios: state.customScenarios.map(s => ({
|
|
427
|
-
prompt: s.prompt,
|
|
428
|
-
expected_behavior: s.expected_behavior,
|
|
429
|
-
})),
|
|
430
|
-
ambiguity_decisions: state.ambiguityDecisions.filter(a => a.decision !== null),
|
|
431
|
-
user_notes: state.userNotes || '',
|
|
432
|
-
};
|
|
433
|
-
|
|
434
|
-
const blob = new Blob([JSON.stringify(plan, null, 2)], { type: 'application/json' });
|
|
435
|
-
const url = URL.createObjectURL(blob);
|
|
436
|
-
const a = document.createElement('a');
|
|
437
|
-
a.href = url;
|
|
438
|
-
a.download = 'scenario_plan.json';
|
|
439
|
-
document.body.appendChild(a);
|
|
440
|
-
a.click();
|
|
441
|
-
document.body.removeChild(a);
|
|
442
|
-
URL.revokeObjectURL(url);
|
|
443
|
-
|
|
444
|
-
const confirmBtn = document.getElementById('confirm-btn');
|
|
445
|
-
if (confirmBtn) {
|
|
446
|
-
confirmBtn.textContent = 'Exported! Return to your terminal.';
|
|
447
|
-
confirmBtn.disabled = true;
|
|
448
|
-
confirmBtn.style.background = 'var(--pass)';
|
|
449
|
-
}
|
|
450
|
-
}
|
|
451
|
-
|
|
452
|
-
function esc(str) {
|
|
453
|
-
if (!str) return '';
|
|
454
|
-
const div = document.createElement('div');
|
|
455
|
-
div.textContent = str;
|
|
456
|
-
return div.innerHTML;
|
|
457
|
-
}
|
|
458
|
-
|
|
459
|
-
// Boot
|
|
460
|
-
document.addEventListener('DOMContentLoaded', () => {
|
|
461
|
-
const app = document.createElement('div');
|
|
462
|
-
app.id = 'app';
|
|
463
|
-
document.body.appendChild(app);
|
|
464
|
-
render();
|
|
465
|
-
});
|
|
466
|
-
</script>
|
|
467
|
-
|
|
468
|
-
</body>
|
|
469
|
-
</html>
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
import { execFileSync } from 'node:child_process';
|
|
2
|
-
export class CopilotInference {
|
|
3
|
-
name = 'copilot';
|
|
4
|
-
async chat(messages, _options) {
|
|
5
|
-
const prompt = messages.map((m) => m.content).join('\n');
|
|
6
|
-
const result = execFileSync('copilot', ['-s', '--no-ask-user', '--model', 'gpt-4.1', '-p', prompt], { encoding: 'utf-8' });
|
|
7
|
-
return result.trim();
|
|
8
|
-
}
|
|
9
|
-
}
|
|
10
|
-
//# sourceMappingURL=copilot.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"copilot.js","sourceRoot":"","sources":["../../../../src/adapters/inference/copilot.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAGlD,MAAM,OAAO,gBAAgB;IAClB,IAAI,GAAG,SAAS,CAAC;IAE1B,KAAK,CAAC,IAAI,CAAC,QAAmB,EAAE,QAAsB;QACpD,MAAM,MAAM,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzD,MAAM,MAAM,GAAG,YAAY,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,eAAe,EAAE,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;QAC3H,OAAO,MAAM,CAAC,IAAI,EAAE,CAAC;IACvB,CAAC;CACF"}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
import type { Harness, InferenceAdapter } from '../types.js';
|
|
2
|
-
export declare function reviewCommand(skillPath: string, harness: Harness, inference: InferenceAdapter, options: {
|
|
3
|
-
workspace?: string;
|
|
4
|
-
runs?: number;
|
|
5
|
-
oldSkill?: string;
|
|
6
|
-
noOpen?: boolean;
|
|
7
|
-
concurrency?: number;
|
|
8
|
-
}): Promise<void>;
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
import { execFile } from 'node:child_process';
|
|
2
|
-
import * as fs from 'node:fs';
|
|
3
|
-
import * as path from 'node:path';
|
|
4
|
-
import * as process from 'node:process';
|
|
5
|
-
import { evalCommand } from './eval.js';
|
|
6
|
-
import { TerminalReporter } from '../adapters/report/terminal.js';
|
|
7
|
-
export async function reviewCommand(skillPath, harness, inference, options) {
|
|
8
|
-
const results = await evalCommand(skillPath, harness, inference, options);
|
|
9
|
-
const terminal = new TerminalReporter();
|
|
10
|
-
await terminal.report(results);
|
|
11
|
-
// feedback.json template
|
|
12
|
-
const feedback = {};
|
|
13
|
-
for (const run of results.evalRuns) {
|
|
14
|
-
feedback[`eval-${run.slug}`] = '';
|
|
15
|
-
}
|
|
16
|
-
fs.writeFileSync(path.join(results.iterationDir, 'feedback.json'), JSON.stringify(feedback, null, 2));
|
|
17
|
-
// Open in browser (placeholder - HTML reporter will be wired later)
|
|
18
|
-
if (!options.noOpen) {
|
|
19
|
-
const reportPath = path.join(results.iterationDir, 'benchmark.json');
|
|
20
|
-
openInBrowser(reportPath);
|
|
21
|
-
}
|
|
22
|
-
}
|
|
23
|
-
function openInBrowser(filePath) {
|
|
24
|
-
const cmd = process.platform === 'darwin' ? 'open' :
|
|
25
|
-
process.platform === 'win32' ? 'cmd' : 'xdg-open';
|
|
26
|
-
const args = process.platform === 'win32' ? ['/c', 'start', '', filePath] : [filePath];
|
|
27
|
-
execFile(cmd, args, (err) => {
|
|
28
|
-
if (err)
|
|
29
|
-
console.warn(`Could not open browser: ${err.message}`);
|
|
30
|
-
});
|
|
31
|
-
}
|
|
32
|
-
//# sourceMappingURL=review.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,OAAO,MAAM,cAAc,CAAC;AAExC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAElE,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAAyG;IAEzG,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAE1E,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE/B,yBAAyB;IACzB,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAClC,KAAK,MAAM,GAAG,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACnC,QAAQ,CAAC,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC,GAAG,EAAE,CAAC;IACpC,CAAC;IACD,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,eAAe,CAAC,EAChD,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAClC,CAAC;IAEF,oEAAoE;IACpE,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;QACpB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,gBAAgB,CAAC,CAAC;QACrE,aAAa,CAAC,UAAU,CAAC,CAAC;IAC5B,CAAC;AACH,CAAC;AAED,SAAS,aAAa,CAAC,QAAgB;IACrC,MAAM,GAAG,GACP,OAAO,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACxC,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;IACpD,MAAM,IAAI,GACR,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAC5E,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;QAC1B,IAAI,GAAG;YAAE,OAAO,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;AACL,CAAC"}
|