npm - @bradtaylorsf/alpha-loop - Versions diffs - 1.4.2 → 1.4.3 - Mend

@bradtaylorsf/alpha-loop 1.4.2 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/README.md +122 -1
package/dist/cli.js +43 -5
package/dist/cli.js.map +1 -1
package/dist/commands/eval.d.ts +59 -4
package/dist/commands/eval.js +370 -55
package/dist/commands/eval.js.map +1 -1
package/dist/commands/evolve.d.ts +43 -4
package/dist/commands/evolve.js +444 -66
package/dist/commands/evolve.js.map +1 -1
package/dist/lib/config.d.ts +20 -0
package/dist/lib/config.js +55 -0
package/dist/lib/config.js.map +1 -1
package/dist/lib/eval-checks.d.ts +11 -1
package/dist/lib/eval-checks.js +39 -0
package/dist/lib/eval-checks.js.map +1 -1
package/dist/lib/eval-fixtures.d.ts +55 -0
package/dist/lib/eval-fixtures.js +172 -0
package/dist/lib/eval-fixtures.js.map +1 -0
package/dist/lib/eval-runner.d.ts +26 -2
package/dist/lib/eval-runner.js +202 -17
package/dist/lib/eval-runner.js.map +1 -1
package/dist/lib/eval-skill-bridge.d.ts +53 -0
package/dist/lib/eval-skill-bridge.js +121 -0
package/dist/lib/eval-skill-bridge.js.map +1 -0
package/dist/lib/eval-swebench.d.ts +68 -0
package/dist/lib/eval-swebench.js +274 -0
package/dist/lib/eval-swebench.js.map +1 -0
package/dist/lib/eval.d.ts +9 -1
package/dist/lib/eval.js +27 -7
package/dist/lib/eval.js.map +1 -1
package/dist/lib/score.d.ts +24 -2
package/dist/lib/score.js +162 -3
package/dist/lib/score.js.map +1 -1
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -58,7 +58,11 @@ Alpha Loop implements a 12-step pipeline for each issue:
 11. **Auto-Merge** — Merges the PR to the session branch (if enabled)
 12. **Cleanup** — Removes the worktree
-After all issues are processed, Alpha Loop generates a **session summary** that aggregates learnings across issues and produces actionable recommendations.
+After all issues are processed, Alpha Loop:
+1. **Auto-captures failures** as eval cases for regression testing
+2. Generates a **session summary** aggregating learnings across issues
+3. Runs a **post-session code review** on the full session diff to catch cross-issue integration problems
+4. Creates the **session PR** with all findings included
 ### Milestone-Based Workflow
@@ -101,6 +105,65 @@ Run `alpha-loop review` to trigger the self-improvement loop. It reads all accum
 Without `--apply`, proposals are saved to `learnings/proposed-updates/` for review. With `--apply`, changes are written and a draft PR is created.
+### Eval System (`alpha-loop eval`)
+Alpha Loop includes a self-improving eval system inspired by [Meta-Harness](https://arxiv.org/abs/2603.28052) (Lee et al., 2026). It captures real failures as eval cases and tracks composite scores over time to measure whether prompt/skill changes actually help.
+```bash
+# Capture failures from recent sessions as eval cases
+alpha-loop eval capture
+# Run the eval suite and compute composite score
+alpha-loop eval run
+# View score history, Pareto frontier, or compare runs
+alpha-loop eval scores
+alpha-loop eval pareto
+alpha-loop eval compare 1 2
+# Greedy search over model configurations per pipeline step
+alpha-loop eval search --models "haiku,sonnet,opus"
+# Estimate cost before running
+alpha-loop eval estimate
+# Compare two config files side-by-side
+alpha-loop eval compare-configs config-a.yaml config-b.yaml
+```
+Eval cases live in `.alpha-loop/evals/`, and scores are appended to `scores.jsonl` (Git-friendly, append-only). The composite score is driven primarily by pass rate, with lightweight penalties for retries and duration. Real API costs (tokens, USD) are tracked per case from agent output and used for the Pareto frontier.
+Step-level evals test individual pipeline stages (plan, implement, test, test-fix, review, learn, skill) and run in seconds using LLM-judge and keyword checks:
+```bash
+# Run only step-level evals (fast, cheap)
+alpha-loop eval --suite step
+# Run evals for a specific step
+alpha-loop eval --suite step --step review
+# Convert between AlphaLoop and skill-creator eval formats
+alpha-loop eval convert --direction to-skill
+alpha-loop eval convert --direction from-skill --input path/to/evals.json
+# Import SWE-bench cases from HuggingFace (requires Python and the `datasets` package)
+alpha-loop eval import-swebench --count 10 --repo "django/django"
+```
+### Evolve (`alpha-loop evolve`)
+The evolve command runs a Meta-Harness-style optimization loop: a proposer agent reads full execution traces, scores, and source code, then proposes targeted changes to prompts, skills, or config. Changes are evaluated against the eval suite — improvements are kept, regressions are reverted (autoresearch keep/discard pattern).
+```bash
+alpha-loop evolve                         # Run up to 5 iterations
+alpha-loop evolve --max-iterations 10     # Run 10 iterations
+alpha-loop evolve --continuous            # Run until manually stopped (Ctrl-C)
+alpha-loop evolve --surface prompts       # Only modify agent prompts (safest)
+alpha-loop evolve --surface all           # Modify prompts + pipeline code (riskier)
+alpha-loop evolve --resume                # Resume from a previous evolve session
+alpha-loop evolve --dry-run               # Preview without changes
+```
 ### Crash Recovery (`alpha-loop resume`)
 If the loop hangs or crashes mid-session, work can be stranded on local branches with no PR. Run `alpha-loop resume` to recover:
@@ -135,6 +198,18 @@ During live verification, the agent takes screenshots at key states and saves th
 | `alpha-loop resume --issue <N>` | Resume a specific issue |
 | `alpha-loop review` | Analyze learnings and propose self-improvements |
 | `alpha-loop review --apply` | Apply proposed improvements and create a draft PR |
+| `alpha-loop eval` | Run the eval suite and compute composite score |
+| `alpha-loop eval capture` | Capture failures as eval cases (interactive) |
+| `alpha-loop eval list` | Show eval cases and recent scores |
+| `alpha-loop eval scores` | Show score history over time |
+| `alpha-loop eval pareto` | Show score/cost Pareto frontier |
+| `alpha-loop eval compare <r1> <r2>` | Compare two eval runs |
+| `alpha-loop eval search` | Greedy search over model configurations per pipeline step |
+| `alpha-loop eval estimate` | Estimate cost of running the eval suite |
+| `alpha-loop eval compare-configs <a> <b>` | Compare two YAML config files side-by-side |
+| `alpha-loop eval convert` | Convert between AlphaLoop and skill-creator eval formats |
+| `alpha-loop eval import-swebench` | Import eval cases from SWE-bench dataset |
+| `alpha-loop evolve` | Meta-Harness-style automated optimization loop |
 ### Run Options
@@ -176,6 +251,15 @@ harnesses:
 # Safety limits (0 = unlimited)
 max_issues: 20
 max_session_duration: 7200  # 2 hours in seconds
+# Post-session review (runs after all issues, reviews full session diff)
+post_session:
+  review: true
+  security_scan: true
+# Eval system
+auto_capture: true  # capture failures as eval cases
+eval_dir: .alpha-loop/evals
 ```
 ### Configuration Reference
@@ -210,6 +294,14 @@ max_session_duration: 7200  # 2 hours in seconds
 | `run_full` | `false` | Run full pipeline without skipping any steps |
 | `verbose` | `false` | Enable verbose agent output |
 | `harnesses` | (auto from agent) | Coding harnesses to sync skills/agents to (e.g., `claude`, `codex`) |
+| `eval_dir` | `.alpha-loop/evals` | Directory for eval cases and scores |
+| `eval_model` | (agent default) | AI model for eval judging |
+| `eval_timeout` | `300` | Timeout in seconds for eval case execution |
+| `auto_capture` | `true` | Auto-capture failures as eval cases at end of session |
+| `pipeline` | `{}` | Per-step agent/model overrides (see below) |
+| `pricing` | (built-in) | Custom token pricing per model for cost tracking |
+| `post_session.review` | `true` | Run holistic code review on full session diff |
+| `post_session.security_scan` | `true` | Include security scanning in post-session review |
 ### Environment Variables
@@ -243,6 +335,12 @@ All config options can be set via environment variables (uppercase, same names):
 | `MERGE_TO` | `merge_to` |
 | `RUN_FULL` | `run_full` |
 | `VERBOSE` | `verbose` |
+| `EVAL_DIR` | `eval_dir` |
+| `EVAL_MODEL` | `eval_model` |
+| `EVAL_TIMEOUT` | `eval_timeout` |
+| `AUTO_CAPTURE` | `auto_capture` |
+| `SKIP_POST_SESSION_REVIEW` | `post_session.review` (inverted) |
+| `SKIP_POST_SESSION_SECURITY` | `post_session.security_scan` (inverted) |
 **Precedence:** CLI flags > environment variables > `.alpha-loop.yaml` > auto-detection > defaults
@@ -278,6 +376,27 @@ harnesses:
   - claude  # also sync to Claude for teammates using it
 ```
+### Per-Step Pipeline Config
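+Use `pipeline` to assign different models to different pipeline stages. This lets you use cheaper models for simple steps (such as planning and learning) and reserve more capable, more expensive models for the steps that benefit most (such as implementation and review):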
+```yaml
+agent: claude
+model: claude-sonnet-4-6  # default for all steps
+pipeline:
+  plan:
+    model: claude-haiku-4-5       # cheap model for planning
+  implement:
+    model: claude-sonnet-4-6      # main model for coding
+  review:
+    model: claude-opus-4-6        # best model for review
+  learn:
+    model: claude-haiku-4-5       # cheap model for learning
+```
+Use `alpha-loop eval search` to automatically find the best model assignment per step via greedy coordinate descent over your eval suite.
 ## GitHub Setup
 ### Labels
@@ -331,6 +450,8 @@ What needs to be done.
 | `.alpha-loop/vision.md` | Yes | Project vision document |
 | `.alpha-loop/context.md` | Yes | Auto-generated project context |
 | `.alpha-loop/learnings/` | Yes | Learning files, session manifests, and session summaries (shared with team) |
+| `.alpha-loop/evals/` | Yes | Eval cases (YAML) and score history (`scores.jsonl`) |
+| `.alpha-loop/traces/` | No (gitignored) | Meta-Harness style execution traces per session |
 | `.alpha-loop/sessions/` | No (gitignored) | Local session logs, results JSON, screenshots |
 | `.alpha-loop/auth/` | No (gitignored) | Saved browser auth state for verification |
 | `.worktrees/` | No (gitignored) | Temporary git worktrees during processing |

package/dist/cli.js CHANGED Viewed

@@ -8,7 +8,7 @@ import { syncCommand } from './commands/sync.js';
 program
     .name('alpha-loop')
     .description('Agent-agnostic automated development loop')
-    .version('1.4.2');
+    .version('1.4.3');
 program
     .command('init')
     .description('Full project onboarding: config, templates, vision, scan, sync')
@@ -87,7 +87,7 @@ evalCmd
     .option('--suite <suite>', 'Run only a suite: step (fast) or e2e (slow)')
     .option('--case <id>', 'Run a single eval case by ID prefix')
     .option('--type <type>', 'Filter by type: full or step')
-    .option('--step <step>', 'Filter by pipeline step (plan, implement, test, review, verify)')
+    .option('--step <step>', 'Filter by pipeline step (plan, implement, test, test-fix, review, verify, learn, skill)')
     .option('--verbose', 'Show detailed output')
     .action(async (options) => {
     const { evalRunCommand } = await import('./commands/eval.js');
@@ -116,10 +116,14 @@ evalCmd
 });
 evalCmd
     .command('search')
-    .description('Greedy search over model/agent configurations')
+    .description('Greedy coordinate descent search over model/agent configurations')
     .option('--models <models>', 'Models to test (comma-separated)')
     .option('--agents <agents>', 'Agents to test (comma-separated)')
     .option('--max-runs <n>', 'Maximum number of eval runs')
+    .option('--budget <n>', 'Maximum number of eval runs (alias for --max-runs)')
+    .option('--step <step>', 'Only search over this pipeline step')
+    .option('--min-score <score>', 'Minimum acceptable score threshold')
+    .option('--optimize <target>', 'Optimize for: cost or efficiency (default: efficiency)')
     .action(async (options) => {
     const { evalSearchCommand } = await import('./commands/eval.js');
     await evalSearchCommand(options);
@@ -138,17 +142,51 @@ evalCmd
     const { evalCompareCommand } = await import('./commands/eval.js');
     evalCompareCommand(run1, run2);
 });
+evalCmd
+    .command('estimate')
+    .description('Estimate cost of running the eval suite with current or specified config')
+    .option('--config <path>', 'Path to a YAML config file to estimate')
+    .action(async (options) => {
+    const { evalEstimateCommand } = await import('./commands/eval.js');
+    evalEstimateCommand(options);
+});
+evalCmd
+    .command('compare-configs <configA> <configB>')
+    .description('Compare two YAML config files side-by-side')
+    .action(async (configA, configB) => {
+    const { evalCompareConfigsCommand } = await import('./commands/eval.js');
+    evalCompareConfigsCommand(configA, configB);
+});
 evalCmd
     .command('import-swebench')
     .description('Import eval cases from SWE-bench dataset')
-    .action(async () => {
+    .option('--dataset <path>', 'Path to a downloaded JSONL file (skips auto-download)')
+    .option('--dataset-id <id>', 'HuggingFace dataset ID (default: princeton-nlp/SWE-bench_Lite)')
+    .option('--count <n>', 'Maximum number of cases to import')
+    .option('--repo <owner/repo>', 'Filter by repository (e.g. django/django)')
+    .option('--ids <csv>', 'Import specific instance IDs (comma-separated)')
+    .option('--step <step>', 'Pipeline step to target (default: implement)')
+    .action(async (options) => {
     const { evalImportSwebenchCommand } = await import('./commands/eval.js');
-    await evalImportSwebenchCommand();
+    await evalImportSwebenchCommand(options);
+});
+evalCmd
+    .command('convert')
+    .description('Convert between AlphaLoop eval format and skill-creator format')
+    .option('--direction <dir>', 'Conversion direction: to-skill or from-skill (default: to-skill)')
+    .option('--input <path>', 'Input file path (for from-skill)')
+    .option('--output <path>', 'Output file path')
+    .action(async (options) => {
+    const { evalConvertCommand } = await import('./commands/eval.js');
+    evalConvertCommand(options);
 });
 program
     .command('evolve')
     .description('Meta-Harness-style automated optimization loop')
     .option('--max-iterations <n>', 'Maximum optimization iterations (default: 5)')
+    .option('--continuous', 'Run until manually stopped (SIGINT)')
+    .option('--surface <level>', 'Optimization surface: prompts, skills, config, all (default: prompts)')
+    .option('--resume', 'Resume from a previous evolve session')
     .option('--dry-run', 'Preview without making changes')
     .option('--verbose', 'Show detailed agent output')
     .action(async (options) => {

package/dist/cli.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,2CAA2C,CAAC;KACxD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,2EAA2E,CAAC;KACxF,MAAM,CAAC,WAAW,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,iBAAiB,CAAC;KAC5C,MAAM,CAAC,cAAc,EAAE,qBAAqB,CAAC;KAC7C,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,oBAAoB,EAAE,uCAAuC,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,QAAQ,EAAE,4BAA4B,CAAC;KAC9C,MAAM,CAAC,WAAW,EAAE,sCAAsC,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IACzD,IAAI,OAAO,CAAC,IAAI;QAAE,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IACxC,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,mBAAmB,CAAC;KAC5B,WAAW,CAAC,sBAAsB,CAAC;KACnC,MAAM,CAAC,MAAM,EAAE,+BAA+B,CAAC;KAC/C,MAAM,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC5C,MAAM,CAAC,cAAc,CAAC,CAAC;AAE1B,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,aAAa,CAAC,CAAC;AAEzB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,yDAAyD,CAAC;KACtE,MAAM,CAAC,SAAS,EAAE,iEAAiE,CAAC;KACpF,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,4DAA4D,CAAC;KACzE,MAAM,CAAC,eAAe,EAA
E,qCAAqC,CAAC;KAC9D,MAAM,CAAC,kBAAkB,EAAE,0CAA0C,CAAC;KACtE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,2FAA2F,CAAC;KACxG,MAAM,CAAC,SAAS,EAAE,4CAA4C,CAAC;KAC/D,MAAM,CAAC,kBAAkB,EAAE,gDAAgD,CAAC;KAC5E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,mBAAmB;AACnB,MAAM,OAAO,GAAG,OAAO;KACpB,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,2DAA2D,CAAC,CAAC;AAE5E,OAAO;KACJ,OAAO,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;KACnC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,eAAe,EAAE,kCAAkC,CAAC;KAC3D,MAAM,CAAC,iBAAiB,EAAE,6CAA6C,CAAC;KACxE,MAAM,CAAC,aAAa,EAAE,qCAAqC,CAAC;KAC5D,MAAM,CAAC,eAAe,EAAE,8BAA8B,CAAC;KACvD,MAAM,CAAC,eAAe,EAAE,~~iEAAiE~~,CAAC;~~KAC1F~~,MAAM,CAAC,WAAW,EAAE,sBAAsB,CAAC;KAC3C,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC9D,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;AAChC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,8CAA8C,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;IACtB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,mCAAmC,CAAC;KAChD,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC/D,eAAe,EAAE,CAAC;AACpB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,8BAA8B,CAAC;KAC3C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC~~,+CAA+C~~,CAAC;~~KAC5D~~,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,gBAAgB,EAAE,6BAA6B,CAAC;KACvD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,iBAAi
B,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;AACnC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,iCAAiC,CAAC;KAC9C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,uBAAuB,CAAC;KAChC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,IAAY,EAAE,EAAE;IAC3C,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;AACjC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,0CAA0C,CAAC;KACvD,MAAM,CAAC,KAAK,~~IAAI~~,EAAE;~~IACjB~~,MAAM,EAAE,yBAAyB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACzE,MAAM,yBAAyB,EAAE,CAAC;~~AACpC~~,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,sBAAsB,EAAE,8CAA8C,CAAC;KAC9E,MAAM,CAAC,WAAW,EAAE,gCAAgC,CAAC;KACrD,MAAM,CAAC,WAAW,EAAE,4BAA4B,CAAC;KACjD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
1	+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAEjD,OAAO;KACJ,IAAI,CAAC,YAAY,CAAC;KAClB,WAAW,CAAC,2CAA2C,CAAC;KACxD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC3D,MAAM,WAAW,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,2EAA2E,CAAC;KACxF,MAAM,CAAC,WAAW,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,iBAAiB,CAAC;KAC5C,MAAM,CAAC,cAAc,EAAE,qBAAqB,CAAC;KAC7C,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC;KAC3C,MAAM,CAAC,cAAc,EAAE,0BAA0B,CAAC;KAClD,MAAM,CAAC,oBAAoB,EAAE,uCAAuC,CAAC;KACrE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,QAAQ,EAAE,4BAA4B,CAAC;KAC9C,MAAM,CAAC,WAAW,EAAE,sCAAsC,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;IACzD,IAAI,OAAO,CAAC,IAAI;QAAE,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;IACxC,MAAM,UAAU,CAAC,OAAO,CAAC,CAAC;AAC5B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,mBAAmB,CAAC;KAC5B,WAAW,CAAC,sBAAsB,CAAC;KACnC,MAAM,CAAC,MAAM,EAAE,+BAA+B,CAAC;KAC/C,MAAM,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC5C,MAAM,CAAC,cAAc,CAAC,CAAC;AAE1B,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,aAAa,CAAC,CAAC;AAEzB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,kCAAkC,CAAC;KAC/C,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,yDAAyD,CAAC;KACtE,MAAM,CAAC,SAAS,EAAE,iEAAiE,CAAC;KACpF,MAAM,CAAC,WAAW,CAAC,CAAC;AAEvB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,4DAA4D,CAAC;KACzE,MAAM,CAAC,eAAe,EAA
E,qCAAqC,CAAC;KAC9D,MAAM,CAAC,kBAAkB,EAAE,0CAA0C,CAAC;KACtE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,2FAA2F,CAAC;KACxG,MAAM,CAAC,SAAS,EAAE,4CAA4C,CAAC;KAC/D,MAAM,CAAC,kBAAkB,EAAE,gDAAgD,CAAC;KAC5E,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,mBAAmB;AACnB,MAAM,OAAO,GAAG,OAAO;KACpB,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,2DAA2D,CAAC,CAAC;AAE5E,OAAO;KACJ,OAAO,CAAC,KAAK,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;KACnC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,eAAe,EAAE,kCAAkC,CAAC;KAC3D,MAAM,CAAC,iBAAiB,EAAE,6CAA6C,CAAC;KACxE,MAAM,CAAC,aAAa,EAAE,qCAAqC,CAAC;KAC5D,MAAM,CAAC,eAAe,EAAE,8BAA8B,CAAC;KACvD,MAAM,CAAC,eAAe,EAAE,yFAAyF,CAAC;KAClH,MAAM,CAAC,WAAW,EAAE,sBAAsB,CAAC;KAC3C,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC9D,MAAM,cAAc,CAAC,OAAO,CAAC,CAAC;AAChC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,8CAA8C,CAAC;KAC3D,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;IACtB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,MAAM,kBAAkB,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,mCAAmC,CAAC;KAChD,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAC/D,eAAe,EAAE,CAAC;AACpB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,8BAA8B,CAAC;KAC3C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,kEAAkE,CAAC;KAC/E,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,mBAAmB,EAAE,kCAAkC,CAAC;KAC/D,MAAM,CAAC,gBAAgB,EAAE,6BAA6B,CAAC;KACvD,MAAM,CAAC,cAAc,EAAE,oDAAoD,CAAC;KAC5E,MAAM,CAAC,eAAe,EAAE,qCAAqC,CAAC;KA
C9D,MAAM,CAAC,qBAAqB,EAAE,oCAAoC,CAAC;KACnE,MAAM,CAAC,qBAAqB,EAAE,wDAAwD,CAAC;KACvF,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;AACnC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,iCAAiC,CAAC;KAC9C,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,MAAM,EAAE,iBAAiB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACjE,iBAAiB,EAAE,CAAC;AACtB,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,uBAAuB,CAAC;KAChC,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,IAAY,EAAE,EAAE;IAC3C,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;AACjC,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,UAAU,CAAC;KACnB,WAAW,CAAC,0EAA0E,CAAC;KACvF,MAAM,CAAC,iBAAiB,EAAE,wCAAwC,CAAC;KACnE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,mBAAmB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACnE,mBAAmB,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,qCAAqC,CAAC;KAC9C,WAAW,CAAC,4CAA4C,CAAC;KACzD,MAAM,CAAC,KAAK,EAAE,OAAe,EAAE,OAAe,EAAE,EAAE;IACjD,MAAM,EAAE,yBAAyB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACzE,yBAAyB,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;AAC9C,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,iBAAiB,CAAC;KAC1B,WAAW,CAAC,0CAA0C,CAAC;KACvD,MAAM,CAAC,kBAAkB,EAAE,uDAAuD,CAAC;KACnF,MAAM,CAAC,mBAAmB,EAAE,gEAAgE,CAAC;KAC7F,MAAM,CAAC,aAAa,EAAE,mCAAmC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,2CAA2C,CAAC;KAC1E,MAAM,CAAC,aAAa,EAAE,gDAAgD,CAAC;KACvE,MAAM,CAAC,eAAe,EAAE,8CAA8C,CAAC;KACvE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,yBAAyB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IACzE,MAAM,yBAAyB,CAAC,OAAO,CAAC,CAAC;AAC3C,CAAC,CAAC,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,SAAS,CAAC;KAClB,WAAW,CAAC,gEAAgE,CAAC;KAC7E,MAAM,CAAC,mBAAmB,EAAE,kEAAkE,CAAC;KAC/F,MAAM,CAAC,gBAAgB,EAAE,kCAAkC,CAAC;KAC5D,MAAM,CAAC,iBAAiB,EAAE,kBAAkB,CAAC;KAC7C,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;IAClE,kBAAkB,CAAC,OAAO,CAAC,CAAC;AAC9B,CAAC,CAAC
,CAAC;AAEL,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,gDAAgD,CAAC;KAC7D,MAAM,CAAC,sBAAsB,EAAE,8CAA8C,CAAC;KAC9E,MAAM,CAAC,cAAc,EAAE,qCAAqC,CAAC;KAC7D,MAAM,CAAC,mBAAmB,EAAE,uEAAuE,CAAC;KACpG,MAAM,CAAC,UAAU,EAAE,uCAAuC,CAAC;KAC3D,MAAM,CAAC,WAAW,EAAE,gCAAgC,CAAC;KACrD,MAAM,CAAC,WAAW,EAAE,4BAA4B,CAAC;KACjD,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,MAAM,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;IAC/D,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;AAC/B,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}

package/dist/commands/eval.d.ts CHANGED Viewed

@@ -13,6 +13,14 @@ export type EvalSearchOptions = {
     models?: string;
     agents?: string;
     maxRuns?: string;
+    /** Only search over this pipeline step. */
+    step?: string;
+    /** Minimum acceptable score. */
+    minScore?: string;
+    /** What to optimize: 'cost' or 'efficiency'. */
+    optimize?: string;
+    /** Maximum number of eval runs (alias for maxRuns). */
+    budget?: string;
 };
 /**
  * Run the eval suite.
@@ -40,14 +48,61 @@ export declare function evalListCommand(): void;
  */
 export declare function evalScoresCommand(): void;
 /**
- * Show score/cost Pareto frontier.
+ * Show score/cost Pareto frontier with ASCII chart.
  */
 export declare function evalParetoCommand(): void;
 /**
- * Greedy search over model/agent configs.
+ * Greedy coordinate descent search over model/agent configs.
+ *
+ * Strategy:
+ *   1. Establish baseline: run eval with current config → S₀, C₀
+ *   2. For each pipeline step, try alternative models (holding others fixed)
+ *   3. Keep Pareto-optimal changes (better score at ≤ cost, or same score at lower cost)
+ *   4. Repeat until no step can be improved or budget is exhausted
  */
 export declare function evalSearchCommand(options: EvalSearchOptions): Promise<void>;
+export type EvalImportSwebenchOptions = {
+    dataset?: string;
+    datasetId?: string;
+    count?: string;
+    repo?: string;
+    ids?: string;
+    step?: string;
+};
+/**
+ * Import SWE-bench cases from HuggingFace or a local JSONL file.
+ *
+ * Downloads entries from HuggingFace (requires Python + datasets library),
+ * converts each to a directory-based eval case under .alpha-loop/evals/cases/e2e/,
+ * and updates config.yaml with repo base commit mappings.
+ */
+export declare function evalImportSwebenchCommand(options?: EvalImportSwebenchOptions): Promise<void>;
+export type EvalConvertOptions = {
+    direction?: string;
+    input?: string;
+    output?: string;
+};
+/**
+ * Convert between AlphaLoop eval format and skill-creator format.
+ *
+ * Directions:
+ *   to-skill     — Convert AlphaLoop eval case → skill-creator evals.json
+ *   from-skill   — Convert skill-creator evals.json → AlphaLoop eval cases
+ */
+export declare function evalConvertCommand(options: EvalConvertOptions): void;
+export type EvalEstimateOptions = {
+    config?: string;
+};
+/**
+ * Estimate cost of running the eval suite with a given config.
+ * Shows per-step breakdown using pricing table and average token estimates.
+ */
+export declare function evalEstimateCommand(options: EvalEstimateOptions): void;
+export type EvalCompareConfigsOptions = {
+    configA: string;
+    configB: string;
+};
 /**
- * Import SWE-bench cases (placeholder — requires HuggingFace download).
+ * Compare two YAML config files side-by-side showing per-step model/agent differences.
  */
-export declare function evalImportSwebenchCommand(): Promise<void>;
+export declare function evalCompareConfigsCommand(configAPath: string, configBPath: string): void;