@bradtaylorsf/alpha-loop 1.4.2 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +136 -3
- package/dist/cli.js +73 -6
- package/dist/cli.js.map +1 -1
- package/dist/commands/eval.d.ts +59 -4
- package/dist/commands/eval.js +370 -55
- package/dist/commands/eval.js.map +1 -1
- package/dist/commands/evolve.d.ts +43 -4
- package/dist/commands/evolve.js +444 -66
- package/dist/commands/evolve.js.map +1 -1
- package/dist/commands/init.js +3 -7
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/plan.d.ts +11 -0
- package/dist/commands/plan.js +298 -0
- package/dist/commands/plan.js.map +1 -0
- package/dist/commands/roadmap.d.ts +9 -0
- package/dist/commands/roadmap.js +202 -0
- package/dist/commands/roadmap.js.map +1 -0
- package/dist/commands/triage.d.ts +9 -0
- package/dist/commands/triage.js +226 -0
- package/dist/commands/triage.js.map +1 -0
- package/dist/commands/vision.js +1 -0
- package/dist/commands/vision.js.map +1 -1
- package/dist/lib/config.d.ts +20 -0
- package/dist/lib/config.js +55 -0
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/eval-checks.d.ts +11 -1
- package/dist/lib/eval-checks.js +39 -0
- package/dist/lib/eval-checks.js.map +1 -1
- package/dist/lib/eval-fixtures.d.ts +55 -0
- package/dist/lib/eval-fixtures.js +172 -0
- package/dist/lib/eval-fixtures.js.map +1 -0
- package/dist/lib/eval-runner.d.ts +26 -2
- package/dist/lib/eval-runner.js +202 -17
- package/dist/lib/eval-runner.js.map +1 -1
- package/dist/lib/eval-skill-bridge.d.ts +53 -0
- package/dist/lib/eval-skill-bridge.js +121 -0
- package/dist/lib/eval-skill-bridge.js.map +1 -0
- package/dist/lib/eval-swebench.d.ts +68 -0
- package/dist/lib/eval-swebench.js +274 -0
- package/dist/lib/eval-swebench.js.map +1 -0
- package/dist/lib/eval.d.ts +9 -1
- package/dist/lib/eval.js +27 -7
- package/dist/lib/eval.js.map +1 -1
- package/dist/lib/github.d.ts +46 -0
- package/dist/lib/github.js +179 -0
- package/dist/lib/github.js.map +1 -1
- package/dist/lib/pipeline.js +44 -2
- package/dist/lib/pipeline.js.map +1 -1
- package/dist/lib/planning.d.ts +91 -0
- package/dist/lib/planning.js +315 -0
- package/dist/lib/planning.js.map +1 -0
- package/dist/lib/prompts.d.ts +79 -0
- package/dist/lib/prompts.js +151 -2
- package/dist/lib/prompts.js.map +1 -1
- package/dist/lib/score.d.ts +24 -2
- package/dist/lib/score.js +162 -3
- package/dist/lib/score.js.map +1 -1
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -48,17 +48,23 @@ Alpha Loop implements a 12-step pipeline for each issue:
|
|
|
48
48
|
1. **Status Update** — Labels issue `in-progress`, assigns to you, updates project board
|
|
49
49
|
2. **Worktree** — Creates an isolated git worktree so work doesn't conflict with other issues
|
|
50
50
|
3. **Plan** — Agent analyzes the issue and enriches it with implementation details
|
|
51
|
-
|
|
51
|
+
3b. **Fetch Comments** — Loads issue comments so the agent has the full conversation context
|
|
52
|
+
4. **Implement** — Agent writes the code, guided by project vision, context, comments, and learnings from previous issues
|
|
52
53
|
5. **Test + Retry** — Runs your test command; if tests fail, agent fixes and retries (up to `max_test_retries`)
|
|
53
54
|
6. **Verify + Retry** — Starts your dev server, uses playwright-cli to test the feature like a real user, takes screenshots
|
|
54
55
|
7. **Review** — A review agent reads the diff, checks for gaps, security issues, and missing wiring — fixes what it can
|
|
55
56
|
8. **Create PR** — Opens a PR with test results, review summary, and verification status
|
|
57
|
+
8b. **Assumptions** — Agent summarizes assumptions and decisions made, posts as a comment on the issue for user validation
|
|
56
58
|
9. **Learn** — Extracts learnings (patterns, anti-patterns, what worked/failed) for future sessions
|
|
57
59
|
10. **Update Issue** — Posts results as a comment, updates labels
|
|
58
60
|
11. **Auto-Merge** — Merges the PR to the session branch (if enabled)
|
|
59
61
|
12. **Cleanup** — Removes the worktree
|
|
60
62
|
|
|
61
|
-
After all issues are processed, Alpha Loop
|
|
63
|
+
After all issues are processed, Alpha Loop:
|
|
64
|
+
1. **Auto-captures failures** as eval cases for regression testing
|
|
65
|
+
2. Generates a **session summary** aggregating learnings across issues
|
|
66
|
+
3. Runs a **post-session code review** on the full session diff to catch cross-issue integration problems
|
|
67
|
+
4. Creates the **session PR** with all findings included
|
|
62
68
|
|
|
63
69
|
### Milestone-Based Workflow
|
|
64
70
|
|
|
@@ -101,6 +107,65 @@ Run `alpha-loop review` to trigger the self-improvement loop. It reads all accum
|
|
|
101
107
|
|
|
102
108
|
Without `--apply`, proposals are saved to `learnings/proposed-updates/` for review. With `--apply`, changes are written and a draft PR is created.
|
|
103
109
|
|
|
110
|
+
### Eval System (`alpha-loop eval`)
|
|
111
|
+
|
|
112
|
+
Alpha Loop includes a self-improving eval system inspired by [Meta-Harness](https://arxiv.org/abs/2603.28052) (Lee et al., 2026). It captures real failures as eval cases and tracks composite scores over time to measure whether prompt/skill changes actually help.
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Capture failures from recent sessions as eval cases
|
|
116
|
+
alpha-loop eval capture
|
|
117
|
+
|
|
118
|
+
# Run the eval suite and compute composite score
|
|
119
|
+
alpha-loop eval run
|
|
120
|
+
|
|
121
|
+
# View score history, Pareto frontier, or compare runs
|
|
122
|
+
alpha-loop eval scores
|
|
123
|
+
alpha-loop eval pareto
|
|
124
|
+
alpha-loop eval compare 1 2
|
|
125
|
+
|
|
126
|
+
# Greedy search over model configurations per pipeline step
|
|
127
|
+
alpha-loop eval search --models "haiku,sonnet,opus"
|
|
128
|
+
|
|
129
|
+
# Estimate cost before running
|
|
130
|
+
alpha-loop eval estimate
|
|
131
|
+
|
|
132
|
+
# Compare two config files side-by-side
|
|
133
|
+
alpha-loop eval compare-configs config-a.yaml config-b.yaml
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Eval cases live in `.alpha-loop/evals/`, and scores are appended to `scores.jsonl` (Git-friendly, append-only). The composite score is driven primarily by pass rate, with lightweight penalties for retries and duration. Real API costs (tokens, USD) are tracked per case from agent output and used for the Pareto frontier.
|
|
137
|
+
|
|
138
|
+
Step-level evals test individual pipeline stages (plan, implement, test, test-fix, review, learn, skill) and run in seconds using LLM-judge and keyword checks:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Run only step-level evals (fast, cheap)
|
|
142
|
+
alpha-loop eval --suite step
|
|
143
|
+
|
|
144
|
+
# Run evals for a specific step
|
|
145
|
+
alpha-loop eval --suite step --step review
|
|
146
|
+
|
|
147
|
+
# Convert between AlphaLoop and skill-creator eval formats
|
|
148
|
+
alpha-loop eval convert --direction to-skill
|
|
149
|
+
alpha-loop eval convert --direction from-skill --input path/to/evals.json
|
|
150
|
+
|
|
151
|
+
# Import SWE-bench cases from HuggingFace (requires Python + datasets)
|
|
152
|
+
alpha-loop eval import-swebench --count 10 --repo "django/django"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Evolve (`alpha-loop evolve`)
|
|
156
|
+
|
|
157
|
+
The evolve command runs a Meta-Harness-style optimization loop: a proposer agent reads full execution traces, scores, and source code, then proposes targeted changes to prompts, skills, or config. Changes are evaluated against the eval suite — improvements are kept, regressions are reverted (autoresearch keep/discard pattern).
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
alpha-loop evolve # Run up to 5 iterations
|
|
161
|
+
alpha-loop evolve --max-iterations 10 # Run 10 iterations
|
|
162
|
+
alpha-loop evolve --continuous # Run until manually stopped (Ctrl-C)
|
|
163
|
+
alpha-loop evolve --surface prompts # Only modify agent prompts (safest)
|
|
164
|
+
alpha-loop evolve --surface all # Modify prompts + pipeline code (riskier)
|
|
165
|
+
alpha-loop evolve --resume # Resume from a previous evolve session
|
|
166
|
+
alpha-loop evolve --dry-run # Preview without changes
|
|
167
|
+
```
|
|
168
|
+
|
|
104
169
|
### Crash Recovery (`alpha-loop resume`)
|
|
105
170
|
|
|
106
171
|
If the loop hangs or crashes mid-session, work can be stranded on local branches with no PR. Run `alpha-loop resume` to recover:
|
|
@@ -124,7 +189,7 @@ During live verification, the agent takes screenshots at key states and saves th
|
|
|
124
189
|
| `alpha-loop run` | Fetch matching issues, process them all, then exit |
|
|
125
190
|
| `alpha-loop run --dry-run` | Preview without making changes |
|
|
126
191
|
| `alpha-loop scan` | Generate/refresh project context and instructions file |
|
|
127
|
-
| `alpha-loop vision` |
|
|
192
|
+
| `alpha-loop vision` | **(deprecated)** Use `alpha-loop plan` instead |
|
|
128
193
|
| `alpha-loop auth` | Save authenticated browser state for verification |
|
|
129
194
|
| `alpha-loop history` | View session history |
|
|
130
195
|
| `alpha-loop history <name>` | View a specific session |
|
|
@@ -135,6 +200,28 @@ During live verification, the agent takes screenshots at key states and saves th
|
|
|
135
200
|
| `alpha-loop resume --issue <N>` | Resume a specific issue |
|
|
136
201
|
| `alpha-loop review` | Analyze learnings and propose self-improvements |
|
|
137
202
|
| `alpha-loop review --apply` | Apply proposed improvements and create a draft PR |
|
|
203
|
+
| `alpha-loop eval` | Run the eval suite and compute composite score |
|
|
204
|
+
| `alpha-loop eval capture` | Capture failures as eval cases (interactive) |
|
|
205
|
+
| `alpha-loop eval list` | Show eval cases and recent scores |
|
|
206
|
+
| `alpha-loop eval scores` | Show score history over time |
|
|
207
|
+
| `alpha-loop eval pareto` | Show score/cost Pareto frontier |
|
|
208
|
+
| `alpha-loop eval compare <r1> <r2>` | Compare two eval runs |
|
|
209
|
+
| `alpha-loop eval search` | Greedy search over model configurations per pipeline step |
|
|
210
|
+
| `alpha-loop eval estimate` | Estimate cost of running the eval suite |
|
|
211
|
+
| `alpha-loop eval compare-configs <a> <b>` | Compare two YAML config files side-by-side |
|
|
212
|
+
| `alpha-loop eval convert` | Convert between AlphaLoop and skill-creator eval formats |
|
|
213
|
+
| `alpha-loop eval import-swebench` | Import eval cases from SWE-bench dataset |
|
|
214
|
+
| `alpha-loop evolve` | Meta-Harness-style automated optimization loop |
|
|
215
|
+
| `alpha-loop plan` | Generate a full project scope (milestones + issues) from seed inputs using AI |
|
|
216
|
+
| `alpha-loop plan --seed <file>` | Read seed description from a file instead of prompting |
|
|
217
|
+
| `alpha-loop plan --dry-run` | Display the plan without creating any GitHub resources |
|
|
218
|
+
| `alpha-loop plan --yes --seed <file>` | Non-interactive mode: accept all AI recommendations |
|
|
219
|
+
| `alpha-loop triage` | Analyze and improve existing issues (staleness, clarity, size, duplicates, support ticket enrichment) |
|
|
220
|
+
| `alpha-loop triage --dry-run` | Display findings without making changes |
|
|
221
|
+
| `alpha-loop triage --yes` | Non-interactive mode: apply all AI-recommended triage actions |
|
|
222
|
+
| `alpha-loop roadmap` | Organize open issues into milestones using AI analysis |
|
|
223
|
+
| `alpha-loop roadmap --dry-run` | Display proposed roadmap without making changes |
|
|
224
|
+
| `alpha-loop roadmap --yes` | Non-interactive mode: apply all AI-recommended assignments |
|
|
138
225
|
|
|
139
226
|
### Run Options
|
|
140
227
|
|
|
@@ -176,6 +263,15 @@ harnesses:
|
|
|
176
263
|
# Safety limits (0 = unlimited)
|
|
177
264
|
max_issues: 20
|
|
178
265
|
max_session_duration: 7200 # 2 hours in seconds
|
|
266
|
+
|
|
267
|
+
# Post-session review (runs after all issues, reviews full session diff)
|
|
268
|
+
post_session:
|
|
269
|
+
review: true
|
|
270
|
+
security_scan: true
|
|
271
|
+
|
|
272
|
+
# Eval system
|
|
273
|
+
auto_capture: true # capture failures as eval cases
|
|
274
|
+
eval_dir: .alpha-loop/evals
|
|
179
275
|
```
|
|
180
276
|
|
|
181
277
|
### Configuration Reference
|
|
@@ -210,6 +306,14 @@ max_session_duration: 7200 # 2 hours in seconds
|
|
|
210
306
|
| `run_full` | `false` | Run full pipeline without skipping any steps |
|
|
211
307
|
| `verbose` | `false` | Enable verbose agent output |
|
|
212
308
|
| `harnesses` | (auto from agent) | Coding harnesses to sync skills/agents to (e.g., `claude`, `codex`) |
|
|
309
|
+
| `eval_dir` | `.alpha-loop/evals` | Directory for eval cases and scores |
|
|
310
|
+
| `eval_model` | (agent default) | AI model for eval judging |
|
|
311
|
+
| `eval_timeout` | `300` | Timeout in seconds for eval case execution |
|
|
312
|
+
| `auto_capture` | `true` | Auto-capture failures as eval cases at end of session |
|
|
313
|
+
| `pipeline` | `{}` | Per-step agent/model overrides (see below) |
|
|
314
|
+
| `pricing` | (built-in) | Custom token pricing per model for cost tracking |
|
|
315
|
+
| `post_session.review` | `true` | Run holistic code review on full session diff |
|
|
316
|
+
| `post_session.security_scan` | `true` | Include security scanning in post-session review |
|
|
213
317
|
|
|
214
318
|
### Environment Variables
|
|
215
319
|
|
|
@@ -243,6 +347,12 @@ All config options can be set via environment variables (uppercase, same names):
|
|
|
243
347
|
| `MERGE_TO` | `merge_to` |
|
|
244
348
|
| `RUN_FULL` | `run_full` |
|
|
245
349
|
| `VERBOSE` | `verbose` |
|
|
350
|
+
| `EVAL_DIR` | `eval_dir` |
|
|
351
|
+
| `EVAL_MODEL` | `eval_model` |
|
|
352
|
+
| `EVAL_TIMEOUT` | `eval_timeout` |
|
|
353
|
+
| `AUTO_CAPTURE` | `auto_capture` |
|
|
354
|
+
| `SKIP_POST_SESSION_REVIEW` | `post_session.review` (inverted) |
|
|
355
|
+
| `SKIP_POST_SESSION_SECURITY` | `post_session.security_scan` (inverted) |
|
|
246
356
|
|
|
247
357
|
**Precedence:** CLI flags > environment variables > `.alpha-loop.yaml` > auto-detection > defaults
|
|
248
358
|
|
|
@@ -278,6 +388,27 @@ harnesses:
|
|
|
278
388
|
- claude # also sync to Claude for teammates using it
|
|
279
389
|
```
|
|
280
390
|
|
|
391
|
+
### Per-Step Pipeline Config
|
|
392
|
+
|
|
393
|
+
Use `pipeline` to assign different models to different pipeline stages. This lets you use cheaper models for simple steps and reserve expensive models for implementation:
|
|
394
|
+
|
|
395
|
+
```yaml
|
|
396
|
+
agent: claude
|
|
397
|
+
model: claude-sonnet-4-6 # default for all steps
|
|
398
|
+
|
|
399
|
+
pipeline:
|
|
400
|
+
plan:
|
|
401
|
+
model: claude-haiku-4-5 # cheap model for planning
|
|
402
|
+
implement:
|
|
403
|
+
model: claude-sonnet-4-6 # main model for coding
|
|
404
|
+
review:
|
|
405
|
+
model: claude-opus-4-6 # best model for review
|
|
406
|
+
learn:
|
|
407
|
+
model: claude-haiku-4-5 # cheap model for learning
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
Use `alpha-loop eval search` to automatically find the best model assignment per step via greedy coordinate descent over your eval suite.
|
|
411
|
+
|
|
281
412
|
## GitHub Setup
|
|
282
413
|
|
|
283
414
|
### Labels
|
|
@@ -331,6 +462,8 @@ What needs to be done.
|
|
|
331
462
|
| `.alpha-loop/vision.md` | Yes | Project vision document |
|
|
332
463
|
| `.alpha-loop/context.md` | Yes | Auto-generated project context |
|
|
333
464
|
| `.alpha-loop/learnings/` | Yes | Learning files, session manifests, and session summaries (shared with team) |
|
|
465
|
+
| `.alpha-loop/evals/` | Yes | Eval cases (YAML) and score history (`scores.jsonl`) |
|
|
466
|
+
| `.alpha-loop/traces/` | No (gitignored) | Meta-Harness style execution traces per session |
|
|
334
467
|
| `.alpha-loop/sessions/` | No (gitignored) | Local session logs, results JSON, screenshots |
|
|
335
468
|
| `.alpha-loop/auth/` | No (gitignored) | Saved browser auth state for verification |
|
|
336
469
|
| `.worktrees/` | No (gitignored) | Temporary git worktrees during processing |
|
package/dist/cli.js
CHANGED
|
@@ -8,7 +8,7 @@ import { syncCommand } from './commands/sync.js';
|
|
|
8
8
|
program
|
|
9
9
|
.name('alpha-loop')
|
|
10
10
|
.description('Agent-agnostic automated development loop')
|
|
11
|
-
.version('1.
|
|
11
|
+
.version('1.5.0');
|
|
12
12
|
program
|
|
13
13
|
.command('init')
|
|
14
14
|
.description('Full project onboarding: config, templates, vision, scan, sync')
|
|
@@ -47,7 +47,7 @@ program
|
|
|
47
47
|
.action(scanCommand);
|
|
48
48
|
program
|
|
49
49
|
.command('vision')
|
|
50
|
-
.description('Interactive project vision setup')
|
|
50
|
+
.description('(deprecated) Interactive project vision setup — use "plan" instead')
|
|
51
51
|
.action(visionCommand);
|
|
52
52
|
program
|
|
53
53
|
.command('auth')
|
|
@@ -58,6 +58,35 @@ program
|
|
|
58
58
|
.description('Sync .alpha-loop/templates/ to all configured harnesses')
|
|
59
59
|
.option('--check', 'Check for drift without syncing (exits non-zero if drift found)')
|
|
60
60
|
.action(syncCommand);
|
|
61
|
+
program
|
|
62
|
+
.command('plan')
|
|
63
|
+
.description('Generate a full project scope (milestones + issues) from seed inputs using AI')
|
|
64
|
+
.option('--seed <file>', 'Read seed description from a file instead of prompting')
|
|
65
|
+
.option('--no-vision', 'Skip vision generation even if no vision exists')
|
|
66
|
+
.option('--dry-run', 'Display the plan without creating any GitHub resources')
|
|
67
|
+
.option('-y, --yes', 'Skip interactive prompts, accept all AI recommendations')
|
|
68
|
+
.action(async (options) => {
|
|
69
|
+
const { planCommand } = await import('./commands/plan.js');
|
|
70
|
+
await planCommand(options);
|
|
71
|
+
});
|
|
72
|
+
program
|
|
73
|
+
.command('triage')
|
|
74
|
+
.description('Analyze and improve existing issues (staleness, clarity, size, duplicates)')
|
|
75
|
+
.option('--dry-run', 'Display findings without making changes')
|
|
76
|
+
.option('-y, --yes', 'Skip interactive prompts, accept all AI recommendations')
|
|
77
|
+
.action(async (options) => {
|
|
78
|
+
const { triageCommand } = await import('./commands/triage.js');
|
|
79
|
+
await triageCommand(options);
|
|
80
|
+
});
|
|
81
|
+
program
|
|
82
|
+
.command('roadmap')
|
|
83
|
+
.description('Organize open issues into milestones using AI analysis')
|
|
84
|
+
.option('--dry-run', 'Display proposed roadmap without making changes')
|
|
85
|
+
.option('-y, --yes', 'Skip interactive prompts, accept all AI recommendations')
|
|
86
|
+
.action(async (options) => {
|
|
87
|
+
const { roadmapCommand } = await import('./commands/roadmap.js');
|
|
88
|
+
await roadmapCommand(options);
|
|
89
|
+
});
|
|
61
90
|
program
|
|
62
91
|
.command('resume')
|
|
63
92
|
.description('Resume stranded work — push branches, run review, open PRs')
|
|
@@ -87,7 +116,7 @@ evalCmd
|
|
|
87
116
|
.option('--suite <suite>', 'Run only a suite: step (fast) or e2e (slow)')
|
|
88
117
|
.option('--case <id>', 'Run a single eval case by ID prefix')
|
|
89
118
|
.option('--type <type>', 'Filter by type: full or step')
|
|
90
|
-
.option('--step <step>', 'Filter by pipeline step (plan, implement, test, review, verify)')
|
|
119
|
+
.option('--step <step>', 'Filter by pipeline step (plan, implement, test, test-fix, review, verify, learn, skill)')
|
|
91
120
|
.option('--verbose', 'Show detailed output')
|
|
92
121
|
.action(async (options) => {
|
|
93
122
|
const { evalRunCommand } = await import('./commands/eval.js');
|
|
@@ -116,10 +145,14 @@ evalCmd
|
|
|
116
145
|
});
|
|
117
146
|
evalCmd
|
|
118
147
|
.command('search')
|
|
119
|
-
.description('Greedy search over model/agent configurations')
|
|
148
|
+
.description('Greedy coordinate descent search over model/agent configurations')
|
|
120
149
|
.option('--models <models>', 'Models to test (comma-separated)')
|
|
121
150
|
.option('--agents <agents>', 'Agents to test (comma-separated)')
|
|
122
151
|
.option('--max-runs <n>', 'Maximum number of eval runs')
|
|
152
|
+
.option('--budget <n>', 'Maximum number of eval runs (alias for --max-runs)')
|
|
153
|
+
.option('--step <step>', 'Only search over this pipeline step')
|
|
154
|
+
.option('--min-score <score>', 'Minimum acceptable score threshold')
|
|
155
|
+
.option('--optimize <target>', 'Optimize for: cost or efficiency (default: efficiency)')
|
|
123
156
|
.action(async (options) => {
|
|
124
157
|
const { evalSearchCommand } = await import('./commands/eval.js');
|
|
125
158
|
await evalSearchCommand(options);
|
|
@@ -138,17 +171,51 @@ evalCmd
|
|
|
138
171
|
const { evalCompareCommand } = await import('./commands/eval.js');
|
|
139
172
|
evalCompareCommand(run1, run2);
|
|
140
173
|
});
|
|
174
|
+
evalCmd
|
|
175
|
+
.command('estimate')
|
|
176
|
+
.description('Estimate cost of running the eval suite with current or specified config')
|
|
177
|
+
.option('--config <path>', 'Path to a YAML config file to estimate')
|
|
178
|
+
.action(async (options) => {
|
|
179
|
+
const { evalEstimateCommand } = await import('./commands/eval.js');
|
|
180
|
+
evalEstimateCommand(options);
|
|
181
|
+
});
|
|
182
|
+
evalCmd
|
|
183
|
+
.command('compare-configs <configA> <configB>')
|
|
184
|
+
.description('Compare two YAML config files side-by-side')
|
|
185
|
+
.action(async (configA, configB) => {
|
|
186
|
+
const { evalCompareConfigsCommand } = await import('./commands/eval.js');
|
|
187
|
+
evalCompareConfigsCommand(configA, configB);
|
|
188
|
+
});
|
|
141
189
|
evalCmd
|
|
142
190
|
.command('import-swebench')
|
|
143
191
|
.description('Import eval cases from SWE-bench dataset')
|
|
144
|
-
.
|
|
192
|
+
.option('--dataset <path>', 'Path to a downloaded JSONL file (skips auto-download)')
|
|
193
|
+
.option('--dataset-id <id>', 'HuggingFace dataset ID (default: princeton-nlp/SWE-bench_Lite)')
|
|
194
|
+
.option('--count <n>', 'Maximum number of cases to import')
|
|
195
|
+
.option('--repo <owner/repo>', 'Filter by repository (e.g. django/django)')
|
|
196
|
+
.option('--ids <csv>', 'Import specific instance IDs (comma-separated)')
|
|
197
|
+
.option('--step <step>', 'Pipeline step to target (default: implement)')
|
|
198
|
+
.action(async (options) => {
|
|
145
199
|
const { evalImportSwebenchCommand } = await import('./commands/eval.js');
|
|
146
|
-
await evalImportSwebenchCommand();
|
|
200
|
+
await evalImportSwebenchCommand(options);
|
|
201
|
+
});
|
|
202
|
+
evalCmd
|
|
203
|
+
.command('convert')
|
|
204
|
+
.description('Convert between AlphaLoop eval format and skill-creator format')
|
|
205
|
+
.option('--direction <dir>', 'Conversion direction: to-skill or from-skill (default: to-skill)')
|
|
206
|
+
.option('--input <path>', 'Input file path (for from-skill)')
|
|
207
|
+
.option('--output <path>', 'Output file path')
|
|
208
|
+
.action(async (options) => {
|
|
209
|
+
const { evalConvertCommand } = await import('./commands/eval.js');
|
|
210
|
+
evalConvertCommand(options);
|
|
147
211
|
});
|
|
148
212
|
program
|
|
149
213
|
.command('evolve')
|
|
150
214
|
.description('Meta-Harness-style automated optimization loop')
|
|
151
215
|
.option('--max-iterations <n>', 'Maximum optimization iterations (default: 5)')
|
|
216
|
+
.option('--continuous', 'Run until manually stopped (SIGINT)')
|
|
217
|
+
.option('--surface <level>', 'Optimization surface: prompts, skills, config, all (default: prompts)')
|
|
218
|
+
.option('--resume', 'Resume from a previous evolve session')
|
|
152
219
|
.option('--dry-run', 'Preview without making changes')
|
|
153
220
|
.option('--verbose', 'Show detailed agent output')
|
|
154
221
|
.action(async (options) => {
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,2CAA2C,CAAC;KACxD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,2EAA2E,CAAC;KACxF,MAAM,CAAC,WAAW,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,iBAAiB,CAAC;KAC5C,MAAM,CAAC,cAAc,EAAE,qBAAqB,CAAC;KAC7C,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,oBAAoB,EAAE,uCAAuC,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,QAAQ,EAAE,4BAA4B,CAAC;KAC9C,MAAM,CAAC,WAAW,EAAE,sCAAsC,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IACzD,IAAI,OAAO,CAAC,IAAI;QAAE,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IACxC,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,mBAAmB,CAAC;KAC5B,WAAW,CAAC,sBAAsB,CAAC;KACnC,MAAM,CAAC,MAAM,EAAE,+BAA+B,CAAC;KAC/C,MAAM,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC5C,MAAM,CAAC,cAAc,CAAC,CAAC;AAE1B,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,2CAA2C,CAAC;KACxD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,2EAA2E,CAAC;KACxF,MAAM,CAAC,WAAW,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,iBAAiB,CAAC;KAC5C,MAAM,CAAC,cAAc,EAAE,qBAAqB,CAAC;KAC7C,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,oBAAoB,EAAE,uCAAuC,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,QAAQ,EAAE,4BAA4B,CAAC;KAC9C,MAAM,CAAC,WAAW,EAAE,sCAAsC,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IACzD,IAAI,OAAO,CAAC,IAAI;QAAE,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IACxC,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,mBAAmB,CAAC;KAC5B,WAAW,CAAC,sBAAsB,CAAC;KACnC,MAAM,CAAC,MAAM,EAAE,+BAA+B,CAAC;KAC/C,MAAM,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC5C,MAAM,CAAC,cAAc,CAAC,CAAC;AAE1B,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,oEAAoE,CAAC;KACjF,MAAM,CAAC,aAAa,CAAC,CAAC;AAEzB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,yDAAyD,CAAC;KACtE,MAAM,CAAC,SAAS,EAAE,iEAAiE,CAAC;KACpF,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,+EAA+E,CAAC;KAC5F,MAAM,CAAC,eAAe,EAAE,wDA
AwD,CAAC;KACjF,MAAM,CAAC,aAAa,EAAE,iDAAiD,CAAC;KACxE,MAAM,CAAC,WAAW,EAAE,wDAAwD,CAAC;KAC7E,MAAM,CAAC,WAAW,EAAE,yDAAyD,CAAC;KAC9E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,4EAA4E,CAAC;KACzF,MAAM,CAAC,WAAW,EAAE,yCAAyC,CAAC;KAC9D,MAAM,CAAC,WAAW,EAAE,yDAAyD,CAAC;KAC9E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,SAAS,CAAC;KAClB,WAAW,CAAC,wDAAwD,CAAC;KACrE,MAAM,CAAC,WAAW,EAAE,iDAAiD,CAAC;KACtE,MAAM,CAAC,WAAW,EAAE,yDAAyD,CAAC;KAC9E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,uBAAuB,CAAC,CAAC;IACjE,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;AAChC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,4DAA4D,CAAC;KACzE,MAAM,CAAC,eAAe,EAAE,qCAAqC,CAAC;KAC9D,MAAM,CAAC,kBAAkB,EAAE,0CAA0C,CAAC;KACtE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,2FAA2F,CAAC;KACxG,MAAM,CAAC,SAAS,EAAE,4CAA4C,CAAC;KAC/D,MAAM,CAAC,kBAAkB,EAAE,gDAAgD,CAAC;KAC5E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,mBAAmB;AACnB,MAAM,OAAO,GAAG,OAAO;KACpB,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,2DAA2D,CAAC,CAAC;AAE5E,OAAO;KACJ,OAAO,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;KACnC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,eAAe,EAAE,kCAAkC,CAAC;KAC3D,MAAM,CAAC,iBAAiB,EAAE,6CAA6C,CAAC;KACxE,MAAM,CAAC,aAAa,EAAE,qCAAqC,CAAC;KAC5D,MAAM,CAAC,eAAe,EAAE,8BAA8B,CAAC;KACvD,MAAM,CAAC,eAAe,EAAE,yFAAyF,CAAC;KAClH,MAAM,CAAC,WAAW,EAAE,sBAAsB,CAAC;KAC3C,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAo
B,CAAC,CAAC;IAC9D,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;AAChC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,8CAA8C,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;IACtB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,mCAAmC,CAAC;KAChD,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC/D,eAAe,EAAE,CAAC;AACpB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,8BAA8B,CAAC;KAC3C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kEAAkE,CAAC;KAC/E,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,gBAAgB,EAAE,6BAA6B,CAAC;KACvD,MAAM,CAAC,cAAc,EAAE,oDAAoD,CAAC;KAC5E,MAAM,CAAC,eAAe,EAAE,qCAAqC,CAAC;KAC9D,MAAM,CAAC,qBAAqB,EAAE,oCAAoC,CAAC;KACnE,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;AACnC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,iCAAiC,CAAC;KAC9C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,uBAAuB,CAAC;KAChC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,IAAY,EAAE,EAAE;IAC3C,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;AACjC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,UAAU,CAAC;KACnB,WAAW,CAAC,0EAA0E,CAAC;KACvF,MAAM,CAAC,iBAAiB,EAAE,wCAAwC,CAAC;KACnE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,mBAAmB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACnE,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,qCAAqC,CAAC;KAC9C,WAAW,CAAC,4CAA4C,CAAC;KACzD,MAAM
,CAAC,KAAK,EAAE,OAAe,EAAE,OAAe,EAAE,EAAE;IACjD,MAAM,EAAE,yBAAyB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACzE,yBAAyB,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,0CAA0C,CAAC;KACvD,MAAM,CAAC,kBAAkB,EAAE,uDAAuD,CAAC;KACnF,MAAM,CAAC,mBAAmB,EAAE,gEAAgE,CAAC;KAC7F,MAAM,CAAC,aAAa,EAAE,mCAAmC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,2CAA2C,CAAC;KAC1E,MAAM,CAAC,aAAa,EAAE,gDAAgD,CAAC;KACvE,MAAM,CAAC,eAAe,EAAE,8CAA8C,CAAC;KACvE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,yBAAyB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACzE,MAAM,yBAAyB,CAAC,OAAO,CAAC,CAAC;AAC3C,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,SAAS,CAAC;KAClB,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,mBAAmB,EAAE,kEAAkE,CAAC;KAC/F,MAAM,CAAC,gBAAgB,EAAE,kCAAkC,CAAC;KAC5D,MAAM,CAAC,iBAAiB,EAAE,kBAAkB,CAAC;KAC7C,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC9B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,sBAAsB,EAAE,8CAA8C,CAAC;KAC9E,MAAM,CAAC,cAAc,EAAE,qCAAqC,CAAC;KAC7D,MAAM,CAAC,mBAAmB,EAAE,uEAAuE,CAAC;KACpG,MAAM,CAAC,UAAU,EAAE,uCAAuC,CAAC;KAC3D,MAAM,CAAC,WAAW,EAAE,gCAAgC,CAAC;KACrD,MAAM,CAAC,WAAW,EAAE,4BAA4B,CAAC;KACjD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
package/dist/commands/eval.d.ts
CHANGED
|
@@ -13,6 +13,14 @@ export type EvalSearchOptions = {
|
|
|
13
13
|
models?: string;
|
|
14
14
|
agents?: string;
|
|
15
15
|
maxRuns?: string;
|
|
16
|
+
/** Only search over this pipeline step. */
|
|
17
|
+
step?: string;
|
|
18
|
+
/** Minimum acceptable score. */
|
|
19
|
+
minScore?: string;
|
|
20
|
+
/** What to optimize: 'cost' or 'efficiency'. */
|
|
21
|
+
optimize?: string;
|
|
22
|
+
/** Maximum number of eval runs (alias for maxRuns). */
|
|
23
|
+
budget?: string;
|
|
16
24
|
};
|
|
17
25
|
/**
|
|
18
26
|
* Run the eval suite.
|
|
@@ -40,14 +48,61 @@ export declare function evalListCommand(): void;
|
|
|
40
48
|
*/
|
|
41
49
|
export declare function evalScoresCommand(): void;
|
|
42
50
|
/**
|
|
43
|
-
* Show score/cost Pareto frontier.
|
|
51
|
+
* Show score/cost Pareto frontier with ASCII chart.
|
|
44
52
|
*/
|
|
45
53
|
export declare function evalParetoCommand(): void;
|
|
46
54
|
/**
|
|
47
|
-
* Greedy search over model/agent configs.
|
|
55
|
+
* Greedy coordinate descent search over model/agent configs.
|
|
56
|
+
*
|
|
57
|
+
* Strategy:
|
|
58
|
+
* 1. Establish baseline: run eval with current config → S₀, C₀
|
|
59
|
+
* 2. For each pipeline step, try alternative models (holding others fixed)
|
|
60
|
+
* 3. Keep Pareto-optimal changes (better score at ≤ cost, or same score at lower cost)
|
|
61
|
+
* 4. Repeat until no step can be improved or budget is exhausted
|
|
48
62
|
*/
|
|
49
63
|
export declare function evalSearchCommand(options: EvalSearchOptions): Promise<void>;
|
|
64
|
+
export type EvalImportSwebenchOptions = {
|
|
65
|
+
dataset?: string;
|
|
66
|
+
datasetId?: string;
|
|
67
|
+
count?: string;
|
|
68
|
+
repo?: string;
|
|
69
|
+
ids?: string;
|
|
70
|
+
step?: string;
|
|
71
|
+
};
|
|
72
|
+
/**
|
|
73
|
+
* Import SWE-bench cases from HuggingFace or a local JSONL file.
|
|
74
|
+
*
|
|
75
|
+
* Downloads entries from HuggingFace (requires Python + datasets library),
|
|
76
|
+
* converts each to a directory-based eval case under .alpha-loop/evals/cases/e2e/,
|
|
77
|
+
* and updates config.yaml with repo base commit mappings.
|
|
78
|
+
*/
|
|
79
|
+
export declare function evalImportSwebenchCommand(options?: EvalImportSwebenchOptions): Promise<void>;
|
|
80
|
+
export type EvalConvertOptions = {
|
|
81
|
+
direction?: string;
|
|
82
|
+
input?: string;
|
|
83
|
+
output?: string;
|
|
84
|
+
};
|
|
85
|
+
/**
|
|
86
|
+
* Convert between AlphaLoop eval format and skill-creator format.
|
|
87
|
+
*
|
|
88
|
+
* Directions:
|
|
89
|
+
* to-skill — Convert AlphaLoop eval case → skill-creator evals.json
|
|
90
|
+
* from-skill — Convert skill-creator evals.json → AlphaLoop eval cases
|
|
91
|
+
*/
|
|
92
|
+
export declare function evalConvertCommand(options: EvalConvertOptions): void;
|
|
93
|
+
export type EvalEstimateOptions = {
|
|
94
|
+
config?: string;
|
|
95
|
+
};
|
|
96
|
+
/**
|
|
97
|
+
* Estimate cost of running the eval suite with a given config.
|
|
98
|
+
* Shows per-step breakdown using pricing table and average token estimates.
|
|
99
|
+
*/
|
|
100
|
+
export declare function evalEstimateCommand(options: EvalEstimateOptions): void;
|
|
101
|
+
export type EvalCompareConfigsOptions = {
|
|
102
|
+
configA: string;
|
|
103
|
+
configB: string;
|
|
104
|
+
};
|
|
50
105
|
/**
|
|
51
|
-
*
|
|
106
|
+
* Compare two YAML config files side-by-side showing per-step model/agent differences.
|
|
52
107
|
*/
|
|
53
|
-
export declare function
|
|
108
|
+
export declare function evalCompareConfigsCommand(configAPath: string, configBPath: string): void;
|