@zhijiewang/openharness 2.39.0 → 2.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -44,6 +44,7 @@ AI coding agent in your terminal. Works with any LLM -- free local models or clo
44
44
  - [Providers](#providers)
45
45
  - [Auth](#auth)
46
46
  - [Update](#update)
47
+ - [Evals](#evals)
47
48
  - [FAQ](#faq)
48
49
  - [Install](#install)
49
50
  - [Development](#development)
@@ -859,6 +860,40 @@ Plugins are npm packages that bundle skills, hooks, and MCP servers:
859
860
 
860
861
  Save as `openharness-plugin.json` in your npm package root. Install with `npm install`, and openHarness discovers it automatically from `node_modules/`.
861
862
 
863
+ ## Evals
864
+
865
+ `oh evals` runs SWE-bench-Lite-compatible evaluations against any provider, locally, with mandatory cost caps. Useful for measuring real-world bug-fix performance instead of synthetic benchmarks.
866
+
867
+ ```bash
868
+ # Run a custom pack with a $5 total cap, 2 parallel agents
869
+ oh evals run my-pack --max-cost-usd 5 --concurrency 2
870
+
871
+ # Run a specific instance
872
+ oh evals run my-pack --max-cost-usd 1 --instance django__django-11551
873
+
874
+ # Random sample of 3
875
+ oh evals run my-pack --max-cost-usd 2 --sample 3
876
+
877
+ # Resume a partial run that hit its cost cap
878
+ oh evals run my-pack --max-cost-usd 10 --resume 2026-05-05T14-30-00
879
+
880
+ # List installed packs
881
+ oh evals list-packs
882
+
883
+ # Show summary of a past run
884
+ oh evals show 2026-05-05T14-30-00
885
+ ```
886
+
887
+ Output lives at `~/.oh/evals/runs/<run-id>/`:
888
+
889
+ - `results.json` — full per-task data with cost, turns, duration, tests_status, error_message.
890
+ - `predictions.json` — submittable to the SWE-bench leaderboard at https://www.swebench.com/.
891
+ - `transcripts/<instance_id>.jsonl` — verbatim subprocess `stream-json` output per task.
892
+
893
+ A pluggable pack contract (`pack.json` + `instances.jsonl` + `fixtures/<id>/`) lets you author packs against any test suite. The `scripts/build-evals-pack.mjs` helper bakes a SWE-bench-Lite-compatible repo at a given base_commit into a fixture; see [CONTRIBUTING.md](CONTRIBUTING.md#authoring-eval-packs).
894
+
895
+ A bundled `swe-bench-lite-mini` pack (10 cherry-picked instances, ready to run out-of-the-box) is shipping in v2.40.2.
896
+
862
897
  ## How It Works
863
898
 
864
899
  ```mermaid
package/README.zh-CN.md CHANGED
@@ -44,6 +44,7 @@
44
44
  - [模型提供商](#模型提供商)
45
45
  - [鉴权(Auth)](#鉴权auth)
46
46
  - [自动更新(Update)](#自动更新update)
47
+ - [评测(Evals)](#评测evals)
47
48
  - [常见问题](#常见问题)
48
49
  - [安装](#安装)
49
50
  - [开发](#开发)
@@ -858,6 +859,40 @@ Run the deploy script with health checks...
858
859
 
859
860
  把它命名为 `openharness-plugin.json` 放在 npm 包根目录。安装时 `npm install`,openHarness 会自动从 `node_modules/` 中发现它。
860
861
 
862
+ ## 评测(Evals)
863
+
864
+ `oh evals` 在本地针对任意 Provider 运行 SWE-bench-Lite 兼容的评测,并强制要求成本上限。用于衡量真实 Bug 修复表现,比合成基准更具参考价值。
865
+
866
+ ```bash
867
+ # 用 5 美元总上限、2 路并发跑一个自定义 pack
868
+ oh evals run my-pack --max-cost-usd 5 --concurrency 2
869
+
870
+ # 只跑指定 instance
871
+ oh evals run my-pack --max-cost-usd 1 --instance django__django-11551
872
+
873
+ # 随机抽取 3 个
874
+ oh evals run my-pack --max-cost-usd 2 --sample 3
875
+
876
+ # 续跑因成本上限中断的运行
877
+ oh evals run my-pack --max-cost-usd 10 --resume 2026-05-05T14-30-00
878
+
879
+ # 列出已安装的 pack
880
+ oh evals list-packs
881
+
882
+ # 查看历史运行的汇总
883
+ oh evals show 2026-05-05T14-30-00
884
+ ```
885
+
886
+ 输出位于 `~/.oh/evals/runs/<run-id>/`:
887
+
888
+ - `results.json` — 每个任务的完整数据:成本、轮次、耗时、tests_status、错误信息。
889
+ - `predictions.json` — 可直接提交到 SWE-bench 排行榜 https://www.swebench.com/。
890
+ - `transcripts/<instance_id>.jsonl` — 每个任务子进程的原始 `stream-json` 输出。
891
+
892
+ 可插拔的 pack 协议(`pack.json` + `instances.jsonl` + `fixtures/<id>/`)允许你针对任意测试套件编写 pack。`scripts/build-evals-pack.mjs` 工具可将 SWE-bench-Lite 兼容仓库在指定 base_commit 处烘焙为 fixture,详见 [CONTRIBUTING.md](CONTRIBUTING.md#authoring-eval-packs)。
893
+
894
+ 内置的 `swe-bench-lite-mini` pack(10 个精选 instance,开箱即跑)将在 v2.40.2 版本发布。
895
+
861
896
  ## 工作原理
862
897
 
863
898
  ```mermaid
@@ -793,18 +793,12 @@ export function registerInfoCommands(register, getCommandMap) {
793
793
  ];
794
794
  return { output: lines.join("\n"), handled: true };
795
795
  });
796
- register("benchmark", "Run SWE-bench benchmark suite", (args) => {
797
- const task = args.trim();
798
- if (!task) {
799
- return {
800
- output: "Usage: /benchmark <task-id or 'list'>\n\nExamples:\n /benchmark list List available tasks\n /benchmark django__django-1234 Run a specific task\n\nSee BENCHMARKS.md for results and methodology.",
801
- handled: true,
802
- };
803
- }
796
+ register("benchmark", "Run SWE-bench benchmark suite (deprecated — use 'oh evals')", () => {
804
797
  return {
805
- output: `[benchmark] ${task}`,
806
- handled: false,
807
- prependToPrompt: `You are running a SWE-bench benchmark task. Task: ${task}\n\nFollow the standard benchmark protocol: read the issue, understand the codebase, implement the fix, and verify with tests.`,
798
+ output: "/benchmark is replaced by the top-level command 'oh evals run <pack>'.\n" +
799
+ "Run from the shell, not from inside this REPL — evals spawn isolated subprocesses.\n" +
800
+ "See: oh evals --help",
801
+ handled: true,
808
802
  };
809
803
  });
810
804
  }
@@ -0,0 +1,22 @@
1
+ /**
2
+ * oh evals — CLI surface and terminal table renderer.
3
+ *
4
+ * Three subcommands: run, list-packs, show.
5
+ * Terminal output: ANSI-colored Unicode-tabular layout matching the project's
6
+ * existing /traces table style.
7
+ */
8
+ import type { Command } from "commander";
9
+ import type { RunArtifacts } from "./types.js";
10
+ export declare function renderResultsTable(artifacts: RunArtifacts): string;
11
+ export declare function defaultOutputDir(): string;
12
+ export declare function newRunDir(): string;
13
+ export declare function listRunDirs(): string[];
14
+ export declare function loadRunArtifacts(runDir: string): RunArtifacts;
15
+ /**
16
+ * Register the `oh evals` subcommand group on the root Commander program.
17
+ *
18
+ * Mounted from src/main.tsx alongside other top-level groups (auth, project,
19
+ * etc.).
20
+ */
21
+ export declare function registerEvalsCommand(program: Command): void;
22
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1,214 @@
1
+ /**
2
+ * oh evals — CLI surface and terminal table renderer.
3
+ *
4
+ * Three subcommands: run, list-packs, show.
5
+ * Terminal output: ANSI-colored Unicode-tabular layout matching the project's
6
+ * existing /traces table style.
7
+ */
8
+ import { existsSync, mkdirSync, readdirSync, readFileSync } from "node:fs";
9
+ import { cpus, homedir } from "node:os";
10
+ import { join } from "node:path";
11
+ import { RunOrchestrator } from "./orchestrator.js";
12
+ import { listAvailablePacks, loadPack, resolvePackDir } from "./pack-loader.js";
13
/**
 * Glyphs marking per-task status. Shared by the results table below and the
 * live per-task progress lines printed by registerEvalsCommand.
 */
const STATUS_GLYPH = {
    resolved: "✓",
    failed: "✗",
    error: "⚠",
    timeout: "⏱",
    budget_exceeded: "$",
    skipped: "⊘",
};
/**
 * Render a completed run's artifacts as a plain-text summary table.
 *
 * One row per task (glyph, id, turns, cost, duration, note), then an
 * aggregate pass-rate/cost/time line and, when the run was cut short by the
 * total cost cap, a halt warning.
 *
 * @param artifacts Parsed contents of a run's results.json.
 * @returns Multi-line string ready for console.log.
 */
export function renderResultsTable(artifacts) {
    const out = [`=== oh evals — ${artifacts.pack} ===`, ""];
    out.push(` pass task${" ".repeat(34)}turns cost time note`);
    for (const result of artifacts.results) {
        const icon = STATUS_GLYPH[result.status];
        // Fixed-width 38-char task column: pad short ids, clip long ones.
        const idColumn = result.instance_id.padEnd(38).slice(0, 38);
        const turnsColumn = String(result.turns_used).padStart(5);
        const costColumn = `$${result.cost_usd.toFixed(2)}`.padStart(8);
        const timeColumn = formatDuration(result.duration_ms).padStart(8);
        // Plain pass/fail rows carry no note; other statuses explain themselves.
        const plain = result.status === "resolved" || result.status === "failed";
        const note = plain ? "" : statusNote(result);
        out.push(` ${icon} ${idColumn} ${turnsColumn} ${costColumn} ${timeColumn} ${note}`);
    }
    out.push("");
    // Pass-rate denominator counts only tasks that ran to a verdict;
    // budget_exceeded and skipped tasks are excluded.
    const attempted = artifacts.resolved + artifacts.failed + artifacts.error + artifacts.timeout;
    const pct = attempted === 0 ? "0.0" : (artifacts.pass_rate * 100).toFixed(1);
    out.push(` ${artifacts.resolved}/${attempted} resolved (${pct}%) — total $${artifacts.total_cost_usd.toFixed(2)} — ${formatDuration(artifacts.total_duration_ms)} elapsed`);
    if (artifacts.partial) {
        out.push(` ⚠ run halted at task ${artifacts.results.length} — total cost cap of $${artifacts.max_cost_usd} reached`);
    }
    return out.join("\n");
}
/** Format a millisecond duration as "Nms", "Ns", or "MmSSs". */
function formatDuration(ms) {
    if (ms < 1000) {
        return `${ms}ms`;
    }
    const wholeSeconds = Math.round(ms / 1000);
    const minutes = Math.floor(wholeSeconds / 60);
    const seconds = wholeSeconds % 60;
    return minutes === 0
        ? `${seconds}s`
        : `${minutes}m${seconds.toString().padStart(2, "0")}s`;
}
/** Short annotation for statuses other than resolved/failed. */
function statusNote(r) {
    if (r.status === "error" || r.status === "skipped") {
        // Prefer the captured error message, clipped to keep rows narrow;
        // fall back to the bare status name.
        return r.error_message ? r.error_message.slice(0, 40) : r.status;
    }
    if (r.status === "timeout" || r.status === "budget_exceeded") {
        return r.status;
    }
    return "";
}
68
/** Root directory where eval runs are written: ~/.oh/evals/runs. */
export function defaultOutputDir() {
    return join(homedir(), ".oh", "evals", "runs");
}
/**
 * Build the path for a fresh run directory under the default output root,
 * named by a filesystem-safe UTC timestamp (colons → dashes, fractional
 * seconds dropped). The directory is not created here; callers mkdir it.
 */
export function newRunDir() {
    const stamp = new Date().toISOString().replace(/[:]/g, "-").replace(/\..+$/, "");
    return join(defaultOutputDir(), stamp);
}
/**
 * List run ids (directory names) under the default output root that contain
 * a results.json, newest first. Returns [] when the root does not exist yet.
 */
export function listRunDirs() {
    const root = defaultOutputDir();
    if (!existsSync(root)) {
        return [];
    }
    const complete = readdirSync(root).filter((name) => existsSync(join(root, name, "results.json")));
    complete.sort();
    complete.reverse();
    return complete;
}
/** Read and parse a run directory's results.json. */
export function loadRunArtifacts(runDir) {
    const raw = readFileSync(join(runDir, "results.json"), "utf-8");
    return JSON.parse(raw);
}
87
/**
 * Register the `oh evals` subcommand group on the root Commander program.
 *
 * Mounted from src/main.tsx alongside other top-level groups (auth, project,
 * etc.).
 *
 * Subcommands:
 *  - run [pack]    — execute an eval pack under a mandatory total cost cap
 *  - list-packs    — enumerate bundled and user-installed packs
 *  - show <run-id> — re-render the summary table of a past run
 */
export function registerEvalsCommand(program) {
    const evalsCmd = program.command("evals").description("Run eval packs against the agent");
    evalsCmd
        .command("run [pack]", { isDefault: false })
        .description("Run an eval pack (default: swe-bench-lite-mini)")
        .requiredOption("--max-cost-usd <amount>", "REQUIRED. Total cost cap for the run in USD.")
        .option("--max-task-cost-usd <amount>", "Per-task cap (default: max-cost-usd / num_tasks)")
        .option("--max-task-turns <n>", "Per-task tool-use cap", "50")
        .option("--task-timeout <seconds>", "Wall-clock per-task kill in seconds", "600")
        .option("--concurrency <n>", "Parallel subprocess agents", "1")
        .option("--model <model>", "Model under test")
        .option("--fallback-model <model>", "One-shot fallback model")
        .option("--instance <id>", "Run only this instance")
        .option("--sample <n>", "Random N instances")
        .option("--filter <regex>", "Run instances whose instance_id matches the regex")
        .option("--resume <run-id>", "Continue a partial run; skip already-completed instances")
        .option("--json", "Emit run summary as JSON to stdout (still writes files)")
        .option("--output-dir <path>", "Override default ~/.oh/evals/runs/")
        .action(async (packArg, opts) => {
        const packName = packArg ?? "swe-bench-lite-mini";
        const packDir = resolvePackDir(packName);
        if (!packDir) {
            console.error(`pack not found: ${packName}`);
            console.error(`available packs: ${listAvailablePacks().join(", ") || "(none)"}`);
            process.exit(2);
        }
        const { pack, tasks: allTasks } = loadPack(packDir);
        // Narrow the task list: --instance (exact id), then --filter (regex),
        // then --sample (random N).
        let tasks = allTasks;
        if (opts.instance) {
            tasks = tasks.filter((t) => t.instance_id === opts.instance);
            if (tasks.length === 0) {
                console.error(`instance not found in pack: ${opts.instance}`);
                process.exit(2);
            }
        }
        if (opts.filter) {
            const re = new RegExp(opts.filter);
            tasks = tasks.filter((t) => re.test(t.instance_id));
        }
        if (opts.sample) {
            const n = Number(opts.sample);
            // Shuffle a copy, then take the first n (source order untouched).
            tasks = [...tasks].sort(() => Math.random() - 0.5).slice(0, n);
        }
        if (tasks.length === 0) {
            console.error("no tasks selected after filters");
            process.exit(2);
        }
        // The cost cap is mandatory and must be a usable positive number.
        const maxCostUsd = Number(opts.maxCostUsd);
        if (!Number.isFinite(maxCostUsd) || maxCostUsd <= 0) {
            console.error(`--max-cost-usd must be a positive number, got '${opts.maxCostUsd}'`);
            process.exit(2);
        }
        // Clamp concurrency to [1, #cpus]. Fix: a non-numeric --concurrency
        // previously propagated NaN into the orchestrator (Math.max(1, NaN)
        // is NaN); fall back to 1 instead.
        const parsedConcurrency = Number(opts.concurrency);
        const concurrencyOpt = Number.isFinite(parsedConcurrency)
            ? Math.max(1, Math.min(parsedConcurrency, cpus().length))
            : 1;
        // Pick the run directory. Fix: --resume now takes precedence over
        // --output-dir (previously --output-dir won and a resumed run started
        // over in a fresh timestamped dir, so no completed instances were
        // skipped); a resumed run must append to the directory it resumes.
        const runDir = opts.resume
            ? join(opts.outputDir ?? defaultOutputDir(), opts.resume)
            : opts.outputDir
                ? join(opts.outputDir, isoSlug())
                : newRunDir();
        mkdirSync(runDir, { recursive: true });
        const orch = new RunOrchestrator({
            pack,
            packDir,
            tasks,
            model: opts.model ?? "claude-sonnet-4-6",
            fallbackModel: opts.fallbackModel,
            maxCostUsd,
            maxTaskCostUsd: opts.maxTaskCostUsd ? Number(opts.maxTaskCostUsd) : undefined,
            maxTaskTurns: Number(opts.maxTaskTurns),
            taskTimeoutMs: Number(opts.taskTimeout) * 1000,
            concurrency: concurrencyOpt,
            runDir,
            resumeFromRunId: opts.resume,
            onTaskStart: (t) => console.log(`▶ ${t.instance_id}`),
            onTaskComplete: (r) => console.log(` ${STATUS_GLYPH[r.status]} ${r.instance_id} ($${r.cost_usd.toFixed(2)}, ${r.turns_used} turns)`),
        });
        // Graceful shutdown: cancel() lets the orchestrator terminate its
        // running subprocesses instead of orphaning them.
        const stop = () => orch.cancel();
        process.on("SIGINT", stop);
        process.on("SIGTERM", stop);
        const artifacts = await orch.run();
        if (opts.json) {
            process.stdout.write(`${JSON.stringify(artifacts, null, 2)}\n`);
        }
        else {
            console.log("");
            console.log(renderResultsTable(artifacts));
            console.log("");
            console.log(`Detailed: ${join(runDir, "results.json")}`);
            console.log(`Submittable: ${join(runDir, "predictions.json")}`);
        }
    });
    evalsCmd
        .command("list-packs")
        .description("List bundled and user-installed eval packs")
        .action(() => {
        const packs = listAvailablePacks();
        if (packs.length === 0) {
            console.log("(no packs installed)");
            return;
        }
        for (const p of packs)
            console.log(p);
    });
    evalsCmd
        .command("show <run-id>")
        .description("Print summary table for a past run from ~/.oh/evals/runs/")
        .action((runId) => {
        const dir = join(defaultOutputDir(), runId);
        if (!existsSync(join(dir, "results.json"))) {
            console.error(`run not found: ${runId}`);
            console.error("available runs:");
            for (const r of listRunDirs())
                console.error(` ${r}`);
            process.exit(2);
        }
        console.log(renderResultsTable(loadRunArtifacts(dir)));
    });
}
211
/**
 * Filesystem-safe UTC timestamp slug: colons become dashes, fractional
 * seconds are dropped. Same shape as the ids newRunDir() produces.
 */
function isoSlug() {
    const iso = new Date().toISOString();
    return iso.replace(/[:]/g, "-").replace(/\..+$/, "");
}
214
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1,12 @@
1
/**
 * oh evals — public re-exports for the eval harness.
 *
 * Type-level surface: orchestrator, pack loader, run writer, scorer, and the
 * shared pack/result types.
 */
export type { OrchestratorOptions, TaskSpawnOpts } from "./orchestrator.js";
export { RunOrchestrator } from "./orchestrator.js";
export { listAvailablePacks, loadPack, resolveFixturePath, resolvePackDir, validatePack } from "./pack-loader.js";
export type { RunHeader } from "./run-writer.js";
export { RunWriter } from "./run-writer.js";
export type { ScoreResult, TestOutcome } from "./scorer.js";
export { parseJunitXml, scoreTask } from "./scorer.js";
export type { EvalsPack, EvalsResult, EvalsStatus, EvalsTask, RunArtifacts, TestsStatus, } from "./types.js";
//# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,8 @@
1
/**
 * oh evals — public re-exports for the eval harness.
 *
 * Runtime surface only; the corresponding type re-exports live in index.d.ts.
 */
export { RunOrchestrator } from "./orchestrator.js";
export { listAvailablePacks, loadPack, resolveFixturePath, resolvePackDir, validatePack } from "./pack-loader.js";
export { RunWriter } from "./run-writer.js";
export { parseJunitXml, scoreTask } from "./scorer.js";
//# sourceMappingURL=index.js.map
@@ -0,0 +1,64 @@
1
/**
 * oh evals — run orchestrator.
 *
 * Coordinates the full run lifecycle:
 * - manages a concurrency pool of N parallel task workers
 * - per task: extract repo tarball → setup.sh → spawn `oh run` subprocess
 *   → tee stdout to transcript file + parse stream-json → git diff →
 *   scoreTask → RunWriter.appendResult → cleanup worktree
 * - aggregates total cost; halts scheduling when total >= max_cost_usd
 * - resumability: skip instance_ids already in results.jsonl
 * - cancellation: cancel() sets flag, SIGTERMs running subs, then SIGKILL
 *
 * Subprocess command (no --working-dir flag — we use spawn's cwd option):
 *   node dist/main.js run --bare --output-format stream-json
 *   --no-session-persistence --max-budget-usd <cap> --max-turns <n>
 *   --model <model> "<problem_statement>"
 */
import type { EvalsPack, EvalsResult, EvalsTask, RunArtifacts } from "./types.js";
export type OrchestratorOptions = {
    /** Pack metadata as loaded by loadPack. */
    pack: EvalsPack;
    /** Directory the pack was resolved from (fixtures live beneath it). */
    packDir: string;
    /** Tasks to execute, already filtered/sampled by the CLI. */
    tasks: EvalsTask[];
    /** Model under test, passed to the subprocess via --model. */
    model: string;
    /** Optional one-shot fallback model. */
    fallbackModel?: string;
    /** Total run cost cap in USD; scheduling halts once reached. */
    maxCostUsd: number;
    /** Per-task cost cap; when omitted the CLI leaves it undefined. */
    maxTaskCostUsd?: number;
    /** Per-task tool-use (turn) cap. */
    maxTaskTurns: number;
    /** Wall-clock per-task kill, in milliseconds. */
    taskTimeoutMs: number;
    /** Number of parallel task workers. */
    concurrency: number;
    /** Directory where results.json / predictions.json / transcripts are written. */
    runDir: string;
    /** When set, instance_ids already recorded in this run are skipped. */
    resumeFromRunId?: string;
    /** Path to dist/main.js. Default = resolved from package root. Overridable for tests. */
    ohEntry?: string;
    /** Override the subprocess executable (default: process.execPath). Tests use the fake-oh-run stub. */
    subprocessExec?: string;
    /** Override the args (default = the `oh run` arg list). Tests use ["<stub>"]. */
    subprocessArgvBuilder?: (task: EvalsTask, opts: TaskSpawnOpts) => {
        exec: string;
        args: string[];
    };
    /** Progress hook fired when a task's worker starts. */
    onTaskStart?: (task: EvalsTask) => void;
    /** Progress hook fired with the scored result of a finished task. */
    onTaskComplete?: (result: EvalsResult) => void;
};
/** Per-spawn parameters handed to subprocessArgvBuilder. */
export type TaskSpawnOpts = {
    /** Isolated working tree the subprocess runs in (spawn cwd). */
    worktreeDir: string;
    /** Budget cap passed as --max-budget-usd. */
    perTaskCostCap: number;
    /** Turn cap passed as --max-turns. */
    maxTurns: number;
    /** Model passed as --model. */
    model: string;
};
export declare class RunOrchestrator {
    private readonly opts;
    private readonly writer;
    private readonly perTaskCap;
    // Set by cancel(); checked by the scheduler before starting new tasks.
    private cancelled;
    // Set when the total cost cap is reached; stops further scheduling.
    private halted;
    private totalCost;
    private readonly running;
    // instance_ids to skip when resuming a partial run.
    private readonly skipIds;
    constructor(opts: OrchestratorOptions);
    /** Request cancellation: stop scheduling and terminate running subprocesses. */
    cancel(): void;
    /** Execute the run to completion (or halt/cancel) and return its artifacts. */
    run(): Promise<RunArtifacts>;
    private runOneTask;
}
//# sourceMappingURL=orchestrator.d.ts.map
+ //# sourceMappingURL=orchestrator.d.ts.map