@bradtaylorsf/alpha-loop 1.4.2 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +122 -1
- package/dist/cli.js +43 -5
- package/dist/cli.js.map +1 -1
- package/dist/commands/eval.d.ts +59 -4
- package/dist/commands/eval.js +370 -55
- package/dist/commands/eval.js.map +1 -1
- package/dist/commands/evolve.d.ts +43 -4
- package/dist/commands/evolve.js +444 -66
- package/dist/commands/evolve.js.map +1 -1
- package/dist/lib/config.d.ts +20 -0
- package/dist/lib/config.js +55 -0
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/eval-checks.d.ts +11 -1
- package/dist/lib/eval-checks.js +39 -0
- package/dist/lib/eval-checks.js.map +1 -1
- package/dist/lib/eval-fixtures.d.ts +55 -0
- package/dist/lib/eval-fixtures.js +172 -0
- package/dist/lib/eval-fixtures.js.map +1 -0
- package/dist/lib/eval-runner.d.ts +26 -2
- package/dist/lib/eval-runner.js +202 -17
- package/dist/lib/eval-runner.js.map +1 -1
- package/dist/lib/eval-skill-bridge.d.ts +53 -0
- package/dist/lib/eval-skill-bridge.js +121 -0
- package/dist/lib/eval-skill-bridge.js.map +1 -0
- package/dist/lib/eval-swebench.d.ts +68 -0
- package/dist/lib/eval-swebench.js +274 -0
- package/dist/lib/eval-swebench.js.map +1 -0
- package/dist/lib/eval.d.ts +9 -1
- package/dist/lib/eval.js +27 -7
- package/dist/lib/eval.js.map +1 -1
- package/dist/lib/score.d.ts +24 -2
- package/dist/lib/score.js +162 -3
- package/dist/lib/score.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -58,7 +58,11 @@ Alpha Loop implements a 12-step pipeline for each issue:
|
|
|
58
58
|
11. **Auto-Merge** — Merges the PR to the session branch (if enabled)
|
|
59
59
|
12. **Cleanup** — Removes the worktree
|
|
60
60
|
|
|
61
|
-
After all issues are processed, Alpha Loop
|
|
61
|
+
After all issues are processed, Alpha Loop:
|
|
62
|
+
1. **Auto-captures failures** as eval cases for regression testing
|
|
63
|
+
2. Generates a **session summary** aggregating learnings across issues
|
|
64
|
+
3. Runs a **post-session code review** on the full session diff to catch cross-issue integration problems
|
|
65
|
+
4. Creates the **session PR** with all findings included
|
|
62
66
|
|
|
63
67
|
### Milestone-Based Workflow
|
|
64
68
|
|
|
@@ -101,6 +105,65 @@ Run `alpha-loop review` to trigger the self-improvement loop. It reads all accum
|
|
|
101
105
|
|
|
102
106
|
Without `--apply`, proposals are saved to `learnings/proposed-updates/` for review. With `--apply`, changes are written and a draft PR is created.
|
|
103
107
|
|
|
108
|
+
### Eval System (`alpha-loop eval`)
|
|
109
|
+
|
|
110
|
+
Alpha Loop includes a self-improving eval system inspired by [Meta-Harness](https://arxiv.org/abs/2603.28052) (Lee et al., 2026). It captures real failures as eval cases and tracks composite scores over time to measure whether prompt/skill changes actually help.
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
# Capture failures from recent sessions as eval cases
|
|
114
|
+
alpha-loop eval capture
|
|
115
|
+
|
|
116
|
+
# Run the eval suite and compute composite score
|
|
117
|
+
alpha-loop eval run
|
|
118
|
+
|
|
119
|
+
# View score history, Pareto frontier, or compare runs
|
|
120
|
+
alpha-loop eval scores
|
|
121
|
+
alpha-loop eval pareto
|
|
122
|
+
alpha-loop eval compare 1 2
|
|
123
|
+
|
|
124
|
+
# Greedy search over model configurations per pipeline step
|
|
125
|
+
alpha-loop eval search --models "haiku,sonnet,opus"
|
|
126
|
+
|
|
127
|
+
# Estimate cost before running
|
|
128
|
+
alpha-loop eval estimate
|
|
129
|
+
|
|
130
|
+
# Compare two config files side-by-side
|
|
131
|
+
alpha-loop eval compare-configs config-a.yaml config-b.yaml
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Eval cases live in `.alpha-loop/evals/` and scores are appended to `scores.jsonl` (Git-friendly, append-only). The composite score formula is pass-rate primary with lightweight penalties for retries and duration. Real API costs (tokens, USD) are tracked per case from agent output and used for the Pareto frontier.
|
|
135
|
+
|
|
136
|
+
Step-level evals test individual pipeline stages (plan, implement, test, test-fix, review, learn, skill) and run in seconds using LLM-judge and keyword checks:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
# Run only step-level evals (fast, cheap)
|
|
140
|
+
alpha-loop eval --suite step
|
|
141
|
+
|
|
142
|
+
# Run evals for a specific step
|
|
143
|
+
alpha-loop eval --suite step --step review
|
|
144
|
+
|
|
145
|
+
# Convert between AlphaLoop and skill-creator eval formats
|
|
146
|
+
alpha-loop eval convert --direction to-skill
|
|
147
|
+
alpha-loop eval convert --direction from-skill --input path/to/evals.json
|
|
148
|
+
|
|
149
|
+
# Import SWE-bench cases from HuggingFace (requires Python + datasets)
|
|
150
|
+
alpha-loop eval import-swebench --count 10 --repo "django/django"
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Evolve (`alpha-loop evolve`)
|
|
154
|
+
|
|
155
|
+
The evolve command runs a Meta-Harness-style optimization loop: a proposer agent reads full execution traces, scores, and source code, then proposes targeted changes to prompts, skills, or config. Changes are evaluated against the eval suite — improvements are kept, regressions are reverted (autoresearch keep/discard pattern).
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
alpha-loop evolve # Run up to 5 iterations
|
|
159
|
+
alpha-loop evolve --max-iterations 10 # Run 10 iterations
|
|
160
|
+
alpha-loop evolve --continuous # Run until manually stopped (Ctrl-C)
|
|
161
|
+
alpha-loop evolve --surface prompts # Only modify agent prompts (safest)
|
|
162
|
+
alpha-loop evolve --surface all # Modify prompts + pipeline code (riskier)
|
|
163
|
+
alpha-loop evolve --resume # Resume from a previous evolve session
|
|
164
|
+
alpha-loop evolve --dry-run # Preview without changes
|
|
165
|
+
```
|
|
166
|
+
|
|
104
167
|
### Crash Recovery (`alpha-loop resume`)
|
|
105
168
|
|
|
106
169
|
If the loop hangs or crashes mid-session, work can be stranded on local branches with no PR. Run `alpha-loop resume` to recover:
|
|
@@ -135,6 +198,18 @@ During live verification, the agent takes screenshots at key states and saves th
|
|
|
135
198
|
| `alpha-loop resume --issue <N>` | Resume a specific issue |
|
|
136
199
|
| `alpha-loop review` | Analyze learnings and propose self-improvements |
|
|
137
200
|
| `alpha-loop review --apply` | Apply proposed improvements and create a draft PR |
|
|
201
|
+
| `alpha-loop eval` | Run the eval suite and compute composite score |
|
|
202
|
+
| `alpha-loop eval capture` | Capture failures as eval cases (interactive) |
|
|
203
|
+
| `alpha-loop eval list` | Show eval cases and recent scores |
|
|
204
|
+
| `alpha-loop eval scores` | Show score history over time |
|
|
205
|
+
| `alpha-loop eval pareto` | Show score/cost Pareto frontier |
|
|
206
|
+
| `alpha-loop eval compare <r1> <r2>` | Compare two eval runs |
|
|
207
|
+
| `alpha-loop eval search` | Greedy search over model configurations per pipeline step |
|
|
208
|
+
| `alpha-loop eval estimate` | Estimate cost of running the eval suite |
|
|
209
|
+
| `alpha-loop eval compare-configs <a> <b>` | Compare two YAML config files side-by-side |
|
|
210
|
+
| `alpha-loop eval convert` | Convert between AlphaLoop and skill-creator eval formats |
|
|
211
|
+
| `alpha-loop eval import-swebench` | Import eval cases from SWE-bench dataset |
|
|
212
|
+
| `alpha-loop evolve` | Meta-Harness-style automated optimization loop |
|
|
138
213
|
|
|
139
214
|
### Run Options
|
|
140
215
|
|
|
@@ -176,6 +251,15 @@ harnesses:
|
|
|
176
251
|
# Safety limits (0 = unlimited)
|
|
177
252
|
max_issues: 20
|
|
178
253
|
max_session_duration: 7200 # 2 hours in seconds
|
|
254
|
+
|
|
255
|
+
# Post-session review (runs after all issues, reviews full session diff)
|
|
256
|
+
post_session:
|
|
257
|
+
review: true
|
|
258
|
+
security_scan: true
|
|
259
|
+
|
|
260
|
+
# Eval system
|
|
261
|
+
auto_capture: true # capture failures as eval cases
|
|
262
|
+
eval_dir: .alpha-loop/evals
|
|
179
263
|
```
|
|
180
264
|
|
|
181
265
|
### Configuration Reference
|
|
@@ -210,6 +294,14 @@ max_session_duration: 7200 # 2 hours in seconds
|
|
|
210
294
|
| `run_full` | `false` | Run full pipeline without skipping any steps |
|
|
211
295
|
| `verbose` | `false` | Enable verbose agent output |
|
|
212
296
|
| `harnesses` | (auto from agent) | Coding harnesses to sync skills/agents to (e.g., `claude`, `codex`) |
|
|
297
|
+
| `eval_dir` | `.alpha-loop/evals` | Directory for eval cases and scores |
|
|
298
|
+
| `eval_model` | (agent default) | AI model for eval judging |
|
|
299
|
+
| `eval_timeout` | `300` | Timeout in seconds for eval case execution |
|
|
300
|
+
| `auto_capture` | `true` | Auto-capture failures as eval cases at end of session |
|
|
301
|
+
| `pipeline` | `{}` | Per-step agent/model overrides (see below) |
|
|
302
|
+
| `pricing` | (built-in) | Custom token pricing per model for cost tracking |
|
|
303
|
+
| `post_session.review` | `true` | Run holistic code review on full session diff |
|
|
304
|
+
| `post_session.security_scan` | `true` | Include security scanning in post-session review |
|
|
213
305
|
|
|
214
306
|
### Environment Variables
|
|
215
307
|
|
|
@@ -243,6 +335,12 @@ All config options can be set via environment variables (uppercase, same names):
|
|
|
243
335
|
| `MERGE_TO` | `merge_to` |
|
|
244
336
|
| `RUN_FULL` | `run_full` |
|
|
245
337
|
| `VERBOSE` | `verbose` |
|
|
338
|
+
| `EVAL_DIR` | `eval_dir` |
|
|
339
|
+
| `EVAL_MODEL` | `eval_model` |
|
|
340
|
+
| `EVAL_TIMEOUT` | `eval_timeout` |
|
|
341
|
+
| `AUTO_CAPTURE` | `auto_capture` |
|
|
342
|
+
| `SKIP_POST_SESSION_REVIEW` | `post_session.review` (inverted) |
|
|
343
|
+
| `SKIP_POST_SESSION_SECURITY` | `post_session.security_scan` (inverted) |
|
|
246
344
|
|
|
247
345
|
**Precedence:** CLI flags > environment variables > `.alpha-loop.yaml` > auto-detection > defaults
|
|
248
346
|
|
|
@@ -278,6 +376,27 @@ harnesses:
|
|
|
278
376
|
- claude # also sync to Claude for teammates using it
|
|
279
377
|
```
|
|
280
378
|
|
|
379
|
+
### Per-Step Pipeline Config
|
|
380
|
+
|
|
381
|
+
Use `pipeline` to assign different models to different pipeline stages. This lets you use cheaper models for simple steps and reserve expensive models for implementation:
|
|
382
|
+
|
|
383
|
+
```yaml
|
|
384
|
+
agent: claude
|
|
385
|
+
model: claude-sonnet-4-6 # default for all steps
|
|
386
|
+
|
|
387
|
+
pipeline:
|
|
388
|
+
plan:
|
|
389
|
+
model: claude-haiku-4-5 # cheap model for planning
|
|
390
|
+
implement:
|
|
391
|
+
model: claude-sonnet-4-6 # main model for coding
|
|
392
|
+
review:
|
|
393
|
+
model: claude-opus-4-6 # best model for review
|
|
394
|
+
learn:
|
|
395
|
+
model: claude-haiku-4-5 # cheap model for learning
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
Use `alpha-loop eval search` to automatically find the best model assignment per step via greedy coordinate descent over your eval suite.
|
|
399
|
+
|
|
281
400
|
## GitHub Setup
|
|
282
401
|
|
|
283
402
|
### Labels
|
|
@@ -331,6 +450,8 @@ What needs to be done.
|
|
|
331
450
|
| `.alpha-loop/vision.md` | Yes | Project vision document |
|
|
332
451
|
| `.alpha-loop/context.md` | Yes | Auto-generated project context |
|
|
333
452
|
| `.alpha-loop/learnings/` | Yes | Learning files, session manifests, and session summaries (shared with team) |
|
|
453
|
+
| `.alpha-loop/evals/` | Yes | Eval cases (YAML) and score history (`scores.jsonl`) |
|
|
454
|
+
| `.alpha-loop/traces/` | No (gitignored) | Meta-Harness style execution traces per session |
|
|
334
455
|
| `.alpha-loop/sessions/` | No (gitignored) | Local session logs, results JSON, screenshots |
|
|
335
456
|
| `.alpha-loop/auth/` | No (gitignored) | Saved browser auth state for verification |
|
|
336
457
|
| `.worktrees/` | No (gitignored) | Temporary git worktrees during processing |
|
package/dist/cli.js
CHANGED
|
@@ -8,7 +8,7 @@ import { syncCommand } from './commands/sync.js';
|
|
|
8
8
|
program
|
|
9
9
|
.name('alpha-loop')
|
|
10
10
|
.description('Agent-agnostic automated development loop')
|
|
11
|
-
.version('1.4.2');
|
|
11
|
+
.version('1.4.3');
|
|
12
12
|
program
|
|
13
13
|
.command('init')
|
|
14
14
|
.description('Full project onboarding: config, templates, vision, scan, sync')
|
|
@@ -87,7 +87,7 @@ evalCmd
|
|
|
87
87
|
.option('--suite <suite>', 'Run only a suite: step (fast) or e2e (slow)')
|
|
88
88
|
.option('--case <id>', 'Run a single eval case by ID prefix')
|
|
89
89
|
.option('--type <type>', 'Filter by type: full or step')
|
|
90
|
-
.option('--step <step>', 'Filter by pipeline step (plan, implement, test, review, verify)')
|
|
90
|
+
.option('--step <step>', 'Filter by pipeline step (plan, implement, test, test-fix, review, verify, learn, skill)')
|
|
91
91
|
.option('--verbose', 'Show detailed output')
|
|
92
92
|
.action(async (options) => {
|
|
93
93
|
const { evalRunCommand } = await import('./commands/eval.js');
|
|
@@ -116,10 +116,14 @@ evalCmd
|
|
|
116
116
|
});
|
|
117
117
|
evalCmd
|
|
118
118
|
.command('search')
|
|
119
|
-
.description('Greedy search over model/agent configurations')
|
|
119
|
+
.description('Greedy coordinate descent search over model/agent configurations')
|
|
120
120
|
.option('--models <models>', 'Models to test (comma-separated)')
|
|
121
121
|
.option('--agents <agents>', 'Agents to test (comma-separated)')
|
|
122
122
|
.option('--max-runs <n>', 'Maximum number of eval runs')
|
|
123
|
+
.option('--budget <n>', 'Maximum number of eval runs (alias for --max-runs)')
|
|
124
|
+
.option('--step <step>', 'Only search over this pipeline step')
|
|
125
|
+
.option('--min-score <score>', 'Minimum acceptable score threshold')
|
|
126
|
+
.option('--optimize <target>', 'Optimize for: cost or efficiency (default: efficiency)')
|
|
123
127
|
.action(async (options) => {
|
|
124
128
|
const { evalSearchCommand } = await import('./commands/eval.js');
|
|
125
129
|
await evalSearchCommand(options);
|
|
@@ -138,17 +142,51 @@ evalCmd
|
|
|
138
142
|
const { evalCompareCommand } = await import('./commands/eval.js');
|
|
139
143
|
evalCompareCommand(run1, run2);
|
|
140
144
|
});
|
|
145
|
+
evalCmd
|
|
146
|
+
.command('estimate')
|
|
147
|
+
.description('Estimate cost of running the eval suite with current or specified config')
|
|
148
|
+
.option('--config <path>', 'Path to a YAML config file to estimate')
|
|
149
|
+
.action(async (options) => {
|
|
150
|
+
const { evalEstimateCommand } = await import('./commands/eval.js');
|
|
151
|
+
evalEstimateCommand(options);
|
|
152
|
+
});
|
|
153
|
+
evalCmd
|
|
154
|
+
.command('compare-configs <configA> <configB>')
|
|
155
|
+
.description('Compare two YAML config files side-by-side')
|
|
156
|
+
.action(async (configA, configB) => {
|
|
157
|
+
const { evalCompareConfigsCommand } = await import('./commands/eval.js');
|
|
158
|
+
evalCompareConfigsCommand(configA, configB);
|
|
159
|
+
});
|
|
141
160
|
evalCmd
|
|
142
161
|
.command('import-swebench')
|
|
143
162
|
.description('Import eval cases from SWE-bench dataset')
|
|
144
|
-
.action(async () => {
|
|
163
|
+
.option('--dataset <path>', 'Path to a downloaded JSONL file (skips auto-download)')
|
|
164
|
+
.option('--dataset-id <id>', 'HuggingFace dataset ID (default: princeton-nlp/SWE-bench_Lite)')
|
|
165
|
+
.option('--count <n>', 'Maximum number of cases to import')
|
|
166
|
+
.option('--repo <owner/repo>', 'Filter by repository (e.g. django/django)')
|
|
167
|
+
.option('--ids <csv>', 'Import specific instance IDs (comma-separated)')
|
|
168
|
+
.option('--step <step>', 'Pipeline step to target (default: implement)')
|
|
169
|
+
.action(async (options) => {
|
|
145
170
|
const { evalImportSwebenchCommand } = await import('./commands/eval.js');
|
|
146
|
-
await evalImportSwebenchCommand();
|
|
171
|
+
await evalImportSwebenchCommand(options);
|
|
172
|
+
});
|
|
173
|
+
evalCmd
|
|
174
|
+
.command('convert')
|
|
175
|
+
.description('Convert between AlphaLoop eval format and skill-creator format')
|
|
176
|
+
.option('--direction <dir>', 'Conversion direction: to-skill or from-skill (default: to-skill)')
|
|
177
|
+
.option('--input <path>', 'Input file path (for from-skill)')
|
|
178
|
+
.option('--output <path>', 'Output file path')
|
|
179
|
+
.action(async (options) => {
|
|
180
|
+
const { evalConvertCommand } = await import('./commands/eval.js');
|
|
181
|
+
evalConvertCommand(options);
|
|
147
182
|
});
|
|
148
183
|
program
|
|
149
184
|
.command('evolve')
|
|
150
185
|
.description('Meta-Harness-style automated optimization loop')
|
|
151
186
|
.option('--max-iterations <n>', 'Maximum optimization iterations (default: 5)')
|
|
187
|
+
.option('--continuous', 'Run until manually stopped (SIGINT)')
|
|
188
|
+
.option('--surface <level>', 'Optimization surface: prompts, skills, config, all (default: prompts)')
|
|
189
|
+
.option('--resume', 'Resume from a previous evolve session')
|
|
152
190
|
.option('--dry-run', 'Preview without making changes')
|
|
153
191
|
.option('--verbose', 'Show detailed agent output')
|
|
154
192
|
.action(async (options) => {
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,2CAA2C,CAAC;KACxD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,2EAA2E,CAAC;KACxF,MAAM,CAAC,WAAW,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,iBAAiB,CAAC;KAC5C,MAAM,CAAC,cAAc,EAAE,qBAAqB,CAAC;KAC7C,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,oBAAoB,EAAE,uCAAuC,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,QAAQ,EAAE,4BAA4B,CAAC;KAC9C,MAAM,CAAC,WAAW,EAAE,sCAAsC,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IACzD,IAAI,OAAO,CAAC,IAAI;QAAE,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IACxC,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,mBAAmB,CAAC;KAC5B,WAAW,CAAC,sBAAsB,CAAC;KACnC,MAAM,CAAC,MAAM,EAAE,+BAA+B,CAAC;KAC/C,MAAM,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC5C,MAAM,CAAC,cAAc,CAAC,CAAC;AAE1B,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,aAAa,CAAC,CAAC;AAEzB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,yDAAyD,CAAC;KACtE,MAAM,CAAC,SAAS,EAAE,iEAAiE,CAAC;KACpF,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,4DAA4D,CAAC;KACzE,MAAM,CAAC,eAAe,EAAE,qC
AAqC,CAAC;KAC9D,MAAM,CAAC,kBAAkB,EAAE,0CAA0C,CAAC;KACtE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,2FAA2F,CAAC;KACxG,MAAM,CAAC,SAAS,EAAE,4CAA4C,CAAC;KAC/D,MAAM,CAAC,kBAAkB,EAAE,gDAAgD,CAAC;KAC5E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,mBAAmB;AACnB,MAAM,OAAO,GAAG,OAAO;KACpB,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,2DAA2D,CAAC,CAAC;AAE5E,OAAO;KACJ,OAAO,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;KACnC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,eAAe,EAAE,kCAAkC,CAAC;KAC3D,MAAM,CAAC,iBAAiB,EAAE,6CAA6C,CAAC;KACxE,MAAM,CAAC,aAAa,EAAE,qCAAqC,CAAC;KAC5D,MAAM,CAAC,eAAe,EAAE,8BAA8B,CAAC;KACvD,MAAM,CAAC,eAAe,EAAE,
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,2CAA2C,CAAC;KACxD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,2EAA2E,CAAC;KACxF,MAAM,CAAC,WAAW,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,iBAAiB,CAAC;KAC5C,MAAM,CAAC,cAAc,EAAE,qBAAqB,CAAC;KAC7C,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,oBAAoB,EAAE,uCAAuC,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,QAAQ,EAAE,4BAA4B,CAAC;KAC9C,MAAM,CAAC,WAAW,EAAE,sCAAsC,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IACzD,IAAI,OAAO,CAAC,IAAI;QAAE,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IACxC,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,mBAAmB,CAAC;KAC5B,WAAW,CAAC,sBAAsB,CAAC;KACnC,MAAM,CAAC,MAAM,EAAE,+BAA+B,CAAC;KAC/C,MAAM,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC5C,MAAM,CAAC,cAAc,CAAC,CAAC;AAE1B,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,aAAa,CAAC,CAAC;AAEzB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,yDAAyD,CAAC;KACtE,MAAM,CAAC,SAAS,EAAE,iEAAiE,CAAC;KACpF,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,4DAA4D,CAAC;KACzE,MAAM,CAAC,eAAe,EAAE,qC
AAqC,CAAC;KAC9D,MAAM,CAAC,kBAAkB,EAAE,0CAA0C,CAAC;KACtE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,2FAA2F,CAAC;KACxG,MAAM,CAAC,SAAS,EAAE,4CAA4C,CAAC;KAC/D,MAAM,CAAC,kBAAkB,EAAE,gDAAgD,CAAC;KAC5E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,mBAAmB;AACnB,MAAM,OAAO,GAAG,OAAO;KACpB,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,2DAA2D,CAAC,CAAC;AAE5E,OAAO;KACJ,OAAO,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;KACnC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,eAAe,EAAE,kCAAkC,CAAC;KAC3D,MAAM,CAAC,iBAAiB,EAAE,6CAA6C,CAAC;KACxE,MAAM,CAAC,aAAa,EAAE,qCAAqC,CAAC;KAC5D,MAAM,CAAC,eAAe,EAAE,8BAA8B,CAAC;KACvD,MAAM,CAAC,eAAe,EAAE,yFAAyF,CAAC;KAClH,MAAM,CAAC,WAAW,EAAE,sBAAsB,CAAC;KAC3C,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC9D,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;AAChC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,8CAA8C,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;IACtB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,mCAAmC,CAAC;KAChD,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC/D,eAAe,EAAE,CAAC;AACpB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,8BAA8B,CAAC;KAC3C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kEAAkE,CAAC;KAC/E,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,gBAAgB,EAAE,6BAA6B,CAAC;KACvD,MAAM,CAAC,cAAc,EAAE,oDAAoD,CAAC;KAC5E,MAAM,CAAC,eAAe,EAAE,qCAAqC,CAAC;KAC9D,
MAAM,CAAC,qBAAqB,EAAE,oCAAoC,CAAC;KACnE,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;AACnC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,iCAAiC,CAAC;KAC9C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,uBAAuB,CAAC;KAChC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,IAAY,EAAE,EAAE;IAC3C,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;AACjC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,UAAU,CAAC;KACnB,WAAW,CAAC,0EAA0E,CAAC;KACvF,MAAM,CAAC,iBAAiB,EAAE,wCAAwC,CAAC;KACnE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,mBAAmB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACnE,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,qCAAqC,CAAC;KAC9C,WAAW,CAAC,4CAA4C,CAAC;KACzD,MAAM,CAAC,KAAK,EAAE,OAAe,EAAE,OAAe,EAAE,EAAE;IACjD,MAAM,EAAE,yBAAyB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACzE,yBAAyB,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,0CAA0C,CAAC;KACvD,MAAM,CAAC,kBAAkB,EAAE,uDAAuD,CAAC;KACnF,MAAM,CAAC,mBAAmB,EAAE,gEAAgE,CAAC;KAC7F,MAAM,CAAC,aAAa,EAAE,mCAAmC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,2CAA2C,CAAC;KAC1E,MAAM,CAAC,aAAa,EAAE,gDAAgD,CAAC;KACvE,MAAM,CAAC,eAAe,EAAE,8CAA8C,CAAC;KACvE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,yBAAyB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACzE,MAAM,yBAAyB,CAAC,OAAO,CAAC,CAAC;AAC3C,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,SAAS,CAAC;KAClB,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,mBAAmB,EAAE,kEAAkE,CAAC;KAC/F,MAAM,CAAC,gBAAgB,EAAE,kCAAkC,CAAC;KAC5D,MAAM,CAAC,iBAAiB,EAAE,kBAAkB,CAAC;KAC7C,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC9B,CAAC,CAAC,CAA
C;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,sBAAsB,EAAE,8CAA8C,CAAC;KAC9E,MAAM,CAAC,cAAc,EAAE,qCAAqC,CAAC;KAC7D,MAAM,CAAC,mBAAmB,EAAE,uEAAuE,CAAC;KACpG,MAAM,CAAC,UAAU,EAAE,uCAAuC,CAAC;KAC3D,MAAM,CAAC,WAAW,EAAE,gCAAgC,CAAC;KACrD,MAAM,CAAC,WAAW,EAAE,4BAA4B,CAAC;KACjD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
package/dist/commands/eval.d.ts
CHANGED
|
@@ -13,6 +13,14 @@ export type EvalSearchOptions = {
|
|
|
13
13
|
models?: string;
|
|
14
14
|
agents?: string;
|
|
15
15
|
maxRuns?: string;
|
|
16
|
+
/** Only search over this pipeline step. */
|
|
17
|
+
step?: string;
|
|
18
|
+
/** Minimum acceptable score. */
|
|
19
|
+
minScore?: string;
|
|
20
|
+
/** What to optimize: 'cost' or 'efficiency'. */
|
|
21
|
+
optimize?: string;
|
|
22
|
+
/** Maximum number of eval runs (alias for maxRuns). */
|
|
23
|
+
budget?: string;
|
|
16
24
|
};
|
|
17
25
|
/**
|
|
18
26
|
* Run the eval suite.
|
|
@@ -40,14 +48,61 @@ export declare function evalListCommand(): void;
|
|
|
40
48
|
*/
|
|
41
49
|
export declare function evalScoresCommand(): void;
|
|
42
50
|
/**
|
|
43
|
-
* Show score/cost Pareto frontier.
|
|
51
|
+
* Show score/cost Pareto frontier with ASCII chart.
|
|
44
52
|
*/
|
|
45
53
|
export declare function evalParetoCommand(): void;
|
|
46
54
|
/**
|
|
47
|
-
* Greedy search over model/agent configs.
|
|
55
|
+
* Greedy coordinate descent search over model/agent configs.
|
|
56
|
+
*
|
|
57
|
+
* Strategy:
|
|
58
|
+
* 1. Establish baseline: run eval with current config → S₀, C₀
|
|
59
|
+
* 2. For each pipeline step, try alternative models (holding others fixed)
|
|
60
|
+
* 3. Keep Pareto-optimal changes (better score at ≤ cost, or same score at lower cost)
|
|
61
|
+
* 4. Repeat until no step can be improved or budget is exhausted
|
|
48
62
|
*/
|
|
49
63
|
export declare function evalSearchCommand(options: EvalSearchOptions): Promise<void>;
|
|
64
|
+
export type EvalImportSwebenchOptions = {
|
|
65
|
+
dataset?: string;
|
|
66
|
+
datasetId?: string;
|
|
67
|
+
count?: string;
|
|
68
|
+
repo?: string;
|
|
69
|
+
ids?: string;
|
|
70
|
+
step?: string;
|
|
71
|
+
};
|
|
72
|
+
/**
|
|
73
|
+
* Import SWE-bench cases from HuggingFace or a local JSONL file.
|
|
74
|
+
*
|
|
75
|
+
* Downloads entries from HuggingFace (requires Python + datasets library),
|
|
76
|
+
* converts each to a directory-based eval case under .alpha-loop/evals/cases/e2e/,
|
|
77
|
+
* and updates config.yaml with repo base commit mappings.
|
|
78
|
+
*/
|
|
79
|
+
export declare function evalImportSwebenchCommand(options?: EvalImportSwebenchOptions): Promise<void>;
|
|
80
|
+
export type EvalConvertOptions = {
|
|
81
|
+
direction?: string;
|
|
82
|
+
input?: string;
|
|
83
|
+
output?: string;
|
|
84
|
+
};
|
|
85
|
+
/**
|
|
86
|
+
* Convert between AlphaLoop eval format and skill-creator format.
|
|
87
|
+
*
|
|
88
|
+
* Directions:
|
|
89
|
+
* to-skill — Convert AlphaLoop eval case → skill-creator evals.json
|
|
90
|
+
* from-skill — Convert skill-creator evals.json → AlphaLoop eval cases
|
|
91
|
+
*/
|
|
92
|
+
export declare function evalConvertCommand(options: EvalConvertOptions): void;
|
|
93
|
+
export type EvalEstimateOptions = {
|
|
94
|
+
config?: string;
|
|
95
|
+
};
|
|
96
|
+
/**
|
|
97
|
+
* Estimate cost of running the eval suite with a given config.
|
|
98
|
+
* Shows per-step breakdown using pricing table and average token estimates.
|
|
99
|
+
*/
|
|
100
|
+
export declare function evalEstimateCommand(options: EvalEstimateOptions): void;
|
|
101
|
+
export type EvalCompareConfigsOptions = {
|
|
102
|
+
configA: string;
|
|
103
|
+
configB: string;
|
|
104
|
+
};
|
|
50
105
|
/**
|
|
51
|
-
*
|
|
106
|
+
* Compare two YAML config files side-by-side showing per-step model/agent differences.
|
|
52
107
|
*/
|
|
53
|
-
export declare function
|
|
108
|
+
export declare function evalCompareConfigsCommand(configAPath: string, configBPath: string): void;
|