@bradtaylorsf/alpha-loop 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +80 -1
- package/dist/cli.js.map +1 -1
- package/dist/commands/eval.d.ts +53 -0
- package/dist/commands/eval.js +538 -0
- package/dist/commands/eval.js.map +1 -0
- package/dist/commands/evolve.d.ts +25 -0
- package/dist/commands/evolve.js +270 -0
- package/dist/commands/evolve.js.map +1 -0
- package/dist/commands/history.d.ts +1 -1
- package/dist/commands/history.js +4 -4
- package/dist/commands/history.js.map +1 -1
- package/dist/commands/run.js +139 -0
- package/dist/commands/run.js.map +1 -1
- package/dist/lib/agent.d.ts +8 -0
- package/dist/lib/agent.js +28 -3
- package/dist/lib/agent.js.map +1 -1
- package/dist/lib/config.d.ts +22 -0
- package/dist/lib/config.js +63 -0
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/eval-checks.d.ts +91 -0
- package/dist/lib/eval-checks.js +254 -0
- package/dist/lib/eval-checks.js.map +1 -0
- package/dist/lib/eval-runner.d.ts +29 -0
- package/dist/lib/eval-runner.js +439 -0
- package/dist/lib/eval-runner.js.map +1 -0
- package/dist/lib/eval.d.ts +170 -0
- package/dist/lib/eval.js +507 -0
- package/dist/lib/eval.js.map +1 -0
- package/dist/lib/pipeline.d.ts +9 -0
- package/dist/lib/pipeline.js +186 -10
- package/dist/lib/pipeline.js.map +1 -1
- package/dist/lib/prompts.d.ts +18 -0
- package/dist/lib/prompts.js +48 -0
- package/dist/lib/prompts.js.map +1 -1
- package/dist/lib/score.d.ts +80 -0
- package/dist/lib/score.js +172 -0
- package/dist/lib/score.js.map +1 -0
- package/dist/lib/session.d.ts +2 -1
- package/dist/lib/session.js +16 -0
- package/dist/lib/session.js.map +1 -1
- package/dist/lib/traces.d.ts +173 -0
- package/dist/lib/traces.js +272 -0
- package/dist/lib/traces.js.map +1 -0
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -8,7 +8,7 @@ import { syncCommand } from './commands/sync.js';
|
|
|
8
8
|
program
|
|
9
9
|
.name('alpha-loop')
|
|
10
10
|
.description('Agent-agnostic automated development loop')
|
|
11
|
-
.version('1.3.0');
|
|
11
|
+
.version('1.4.0');
|
|
12
12
|
program
|
|
13
13
|
.command('init')
|
|
14
14
|
.description('Full project onboarding: config, templates, vision, scan, sync')
|
|
@@ -76,5 +76,84 @@ program
|
|
|
76
76
|
const { reviewCommand } = await import('./commands/review.js');
|
|
77
77
|
await reviewCommand(options);
|
|
78
78
|
});
|
|
79
|
+
// Eval subcommands
|
|
80
|
+
const evalCmd = program
|
|
81
|
+
.command('eval')
|
|
82
|
+
.description('Run eval suite, capture failures, list cases, view scores');
|
|
83
|
+
evalCmd
|
|
84
|
+
.command('run', { isDefault: true })
|
|
85
|
+
.description('Run the eval suite and compute composite score')
|
|
86
|
+
.option('--tags <tags>', 'Filter by tags (comma-separated)')
|
|
87
|
+
.option('--suite <suite>', 'Run only a suite: step (fast) or e2e (slow)')
|
|
88
|
+
.option('--case <id>', 'Run a single eval case by ID prefix')
|
|
89
|
+
.option('--type <type>', 'Filter by type: full or step')
|
|
90
|
+
.option('--step <step>', 'Filter by pipeline step (plan, implement, test, review, verify)')
|
|
91
|
+
.option('--verbose', 'Show detailed output')
|
|
92
|
+
.action(async (options) => {
|
|
93
|
+
const { evalRunCommand } = await import('./commands/eval.js');
|
|
94
|
+
await evalRunCommand(options);
|
|
95
|
+
});
|
|
96
|
+
evalCmd
|
|
97
|
+
.command('capture [issue]')
|
|
98
|
+
.description('Capture failures as eval cases (interactive)')
|
|
99
|
+
.action(async (issue) => {
|
|
100
|
+
const { evalCaptureCommand } = await import('./commands/eval.js');
|
|
101
|
+
await evalCaptureCommand({ issue });
|
|
102
|
+
});
|
|
103
|
+
evalCmd
|
|
104
|
+
.command('list')
|
|
105
|
+
.description('Show eval cases and recent scores')
|
|
106
|
+
.action(async () => {
|
|
107
|
+
const { evalListCommand } = await import('./commands/eval.js');
|
|
108
|
+
evalListCommand();
|
|
109
|
+
});
|
|
110
|
+
evalCmd
|
|
111
|
+
.command('scores')
|
|
112
|
+
.description('Show score history over time')
|
|
113
|
+
.action(async () => {
|
|
114
|
+
const { evalScoresCommand } = await import('./commands/eval.js');
|
|
115
|
+
evalScoresCommand();
|
|
116
|
+
});
|
|
117
|
+
evalCmd
|
|
118
|
+
.command('search')
|
|
119
|
+
.description('Greedy search over model/agent configurations')
|
|
120
|
+
.option('--models <models>', 'Models to test (comma-separated)')
|
|
121
|
+
.option('--agents <agents>', 'Agents to test (comma-separated)')
|
|
122
|
+
.option('--max-runs <n>', 'Maximum number of eval runs')
|
|
123
|
+
.action(async (options) => {
|
|
124
|
+
const { evalSearchCommand } = await import('./commands/eval.js');
|
|
125
|
+
await evalSearchCommand(options);
|
|
126
|
+
});
|
|
127
|
+
evalCmd
|
|
128
|
+
.command('pareto')
|
|
129
|
+
.description('Show score/cost Pareto frontier')
|
|
130
|
+
.action(async () => {
|
|
131
|
+
const { evalParetoCommand } = await import('./commands/eval.js');
|
|
132
|
+
evalParetoCommand();
|
|
133
|
+
});
|
|
134
|
+
evalCmd
|
|
135
|
+
.command('compare <run1> <run2>')
|
|
136
|
+
.description('Compare two eval runs showing per-case changes')
|
|
137
|
+
.action(async (run1, run2) => {
|
|
138
|
+
const { evalCompareCommand } = await import('./commands/eval.js');
|
|
139
|
+
evalCompareCommand(run1, run2);
|
|
140
|
+
});
|
|
141
|
+
evalCmd
|
|
142
|
+
.command('import-swebench')
|
|
143
|
+
.description('Import eval cases from SWE-bench dataset')
|
|
144
|
+
.action(async () => {
|
|
145
|
+
const { evalImportSwebenchCommand } = await import('./commands/eval.js');
|
|
146
|
+
await evalImportSwebenchCommand();
|
|
147
|
+
});
|
|
148
|
+
program
|
|
149
|
+
.command('evolve')
|
|
150
|
+
.description('Meta-Harness-style automated optimization loop')
|
|
151
|
+
.option('--max-iterations <n>', 'Maximum optimization iterations (default: 5)')
|
|
152
|
+
.option('--dry-run', 'Preview without making changes')
|
|
153
|
+
.option('--verbose', 'Show detailed agent output')
|
|
154
|
+
.action(async (options) => {
|
|
155
|
+
const { evolveCommand } = await import('./commands/evolve.js');
|
|
156
|
+
await evolveCommand(options);
|
|
157
|
+
});
|
|
79
158
|
program.parse();
|
|
80
159
|
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,2CAA2C,CAAC;KACxD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,2EAA2E,CAAC;KACxF,MAAM,CAAC,WAAW,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,iBAAiB,CAAC;KAC5C,MAAM,CAAC,cAAc,EAAE,qBAAqB,CAAC;KAC7C,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,oBAAoB,EAAE,uCAAuC,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,QAAQ,EAAE,4BAA4B,CAAC;KAC9C,MAAM,CAAC,WAAW,EAAE,sCAAsC,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IACzD,IAAI,OAAO,CAAC,IAAI;QAAE,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IACxC,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,mBAAmB,CAAC;KAC5B,WAAW,CAAC,sBAAsB,CAAC;KACnC,MAAM,CAAC,MAAM,EAAE,+BAA+B,CAAC;KAC/C,MAAM,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC5C,MAAM,CAAC,cAAc,CAAC,CAAC;AAE1B,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,aAAa,CAAC,CAAC;AAEzB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,yDAAyD,CAAC;KACtE,MAAM,CAAC,SAAS,EAAE,iEAAiE,CAAC;KACpF,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,4DAA4D,CAAC;KACzE,MAAM,CAAC,eAAe,EAAE,qC
AAqC,CAAC;KAC9D,MAAM,CAAC,kBAAkB,EAAE,0CAA0C,CAAC;KACtE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,2FAA2F,CAAC;KACxG,MAAM,CAAC,SAAS,EAAE,4CAA4C,CAAC;KAC/D,MAAM,CAAC,kBAAkB,EAAE,gDAAgD,CAAC;KAC5E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,2CAA2C,CAAC;KACxD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,2EAA2E,CAAC;KACxF,MAAM,CAAC,WAAW,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,iBAAiB,CAAC;KAC5C,MAAM,CAAC,cAAc,EAAE,qBAAqB,CAAC;KAC7C,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,oBAAoB,EAAE,uCAAuC,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,QAAQ,EAAE,4BAA4B,CAAC;KAC9C,MAAM,CAAC,WAAW,EAAE,sCAAsC,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IACzD,IAAI,OAAO,CAAC,IAAI;QAAE,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IACxC,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,mBAAmB,CAAC;KAC5B,WAAW,CAAC,sBAAsB,CAAC;KACnC,MAAM,CAAC,MAAM,EAAE,+BAA+B,CAAC;KAC/C,MAAM,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC5C,MAAM,CAAC,cAAc,CAAC,CAAC;AAE1B,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,aAAa,CAAC,CAAC;AAEzB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,yDAAyD,CAAC;KACtE,MAAM,CAAC,SAAS,EAAE,iEAAiE,CAAC;KACpF,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,4DAA4D,CAAC;KACzE,MAAM,CAAC,eAAe,EAAE,qC
AAqC,CAAC;KAC9D,MAAM,CAAC,kBAAkB,EAAE,0CAA0C,CAAC;KACtE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,2FAA2F,CAAC;KACxG,MAAM,CAAC,SAAS,EAAE,4CAA4C,CAAC;KAC/D,MAAM,CAAC,kBAAkB,EAAE,gDAAgD,CAAC;KAC5E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,mBAAmB;AACnB,MAAM,OAAO,GAAG,OAAO;KACpB,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,2DAA2D,CAAC,CAAC;AAE5E,OAAO;KACJ,OAAO,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;KACnC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,eAAe,EAAE,kCAAkC,CAAC;KAC3D,MAAM,CAAC,iBAAiB,EAAE,6CAA6C,CAAC;KACxE,MAAM,CAAC,aAAa,EAAE,qCAAqC,CAAC;KAC5D,MAAM,CAAC,eAAe,EAAE,8BAA8B,CAAC;KACvD,MAAM,CAAC,eAAe,EAAE,iEAAiE,CAAC;KAC1F,MAAM,CAAC,WAAW,EAAE,sBAAsB,CAAC;KAC3C,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC9D,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;AAChC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,8CAA8C,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;IACtB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,mCAAmC,CAAC;KAChD,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC/D,eAAe,EAAE,CAAC;AACpB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,8BAA8B,CAAC;KAC3C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,+CAA+C,CAAC;KAC5D,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,gBAAgB,EAAE,6BAA6B,CAAC;KACvD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAA
M,CAAC,oBAAoB,CAAC,CAAC;IACjE,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;AACnC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,iCAAiC,CAAC;KAC9C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,uBAAuB,CAAC;KAChC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,IAAY,EAAE,EAAE;IAC3C,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;AACjC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,0CAA0C,CAAC;KACvD,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,yBAAyB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACzE,MAAM,yBAAyB,EAAE,CAAC;AACpC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,sBAAsB,EAAE,8CAA8C,CAAC;KAC9E,MAAM,CAAC,WAAW,EAAE,gCAAgC,CAAC;KACrD,MAAM,CAAC,WAAW,EAAE,4BAA4B,CAAC;KACjD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
export type EvalOptions = {
|
|
2
|
+
tags?: string;
|
|
3
|
+
suite?: string;
|
|
4
|
+
case?: string;
|
|
5
|
+
type?: 'full' | 'step';
|
|
6
|
+
step?: string;
|
|
7
|
+
verbose?: boolean;
|
|
8
|
+
};
|
|
9
|
+
export type EvalCaptureOptions = {
|
|
10
|
+
issue?: string;
|
|
11
|
+
};
|
|
12
|
+
export type EvalSearchOptions = {
|
|
13
|
+
models?: string;
|
|
14
|
+
agents?: string;
|
|
15
|
+
maxRuns?: string;
|
|
16
|
+
};
|
|
17
|
+
/**
|
|
18
|
+
* Run the eval suite.
|
|
19
|
+
*/
|
|
20
|
+
export declare function evalRunCommand(options: EvalOptions): Promise<void>;
|
|
21
|
+
/**
|
|
22
|
+
* Compare two eval runs.
|
|
23
|
+
*/
|
|
24
|
+
export declare function evalCompareCommand(run1: string, run2: string): void;
|
|
25
|
+
/**
|
|
26
|
+
* Capture a failure as an eval case — interactive walkthrough.
|
|
27
|
+
*
|
|
28
|
+
* Flow:
|
|
29
|
+
* 1. Show unannotated (auto-captured) skeleton cases first, prompt to annotate
|
|
30
|
+
* 2. Show recent session failures grouped by session
|
|
31
|
+
* 3. For each failure: show step, test/verify status, prompt for diagnosis
|
|
32
|
+
*/
|
|
33
|
+
export declare function evalCaptureCommand(options: EvalCaptureOptions): Promise<void>;
|
|
34
|
+
/**
|
|
35
|
+
* List eval cases and recent scores.
|
|
36
|
+
*/
|
|
37
|
+
export declare function evalListCommand(): void;
|
|
38
|
+
/**
|
|
39
|
+
* Show score history.
|
|
40
|
+
*/
|
|
41
|
+
export declare function evalScoresCommand(): void;
|
|
42
|
+
/**
|
|
43
|
+
* Show score/cost Pareto frontier.
|
|
44
|
+
*/
|
|
45
|
+
export declare function evalParetoCommand(): void;
|
|
46
|
+
/**
|
|
47
|
+
* Greedy search over model/agent configs.
|
|
48
|
+
*/
|
|
49
|
+
export declare function evalSearchCommand(options: EvalSearchOptions): Promise<void>;
|
|
50
|
+
/**
|
|
51
|
+
* Import SWE-bench cases (placeholder — requires HuggingFace download).
|
|
52
|
+
*/
|
|
53
|
+
export declare function evalImportSwebenchCommand(): Promise<void>;
|