npm - @zhijiewang/openharness - Versions diffs - 2.38.0 → 2.40.0 - Mend

@zhijiewang/openharness 2.38.0 → 2.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/README.md +35 -0
package/README.zh-CN.md +35 -0
package/dist/commands/info.js +14 -15
package/dist/evals/cli.d.ts +22 -0
package/dist/evals/cli.js +214 -0
package/dist/evals/index.d.ts +12 -0
package/dist/evals/index.js +8 -0
package/dist/evals/orchestrator.d.ts +64 -0
package/dist/evals/orchestrator.js +391 -0
package/dist/evals/pack-loader.d.ts +29 -0
package/dist/evals/pack-loader.js +153 -0
package/dist/evals/run-writer.d.ts +35 -0
package/dist/evals/run-writer.js +94 -0
package/dist/evals/scorer.d.ts +34 -0
package/dist/evals/scorer.js +127 -0
package/dist/evals/types.d.ts +74 -0
package/dist/evals/types.js +10 -0
package/dist/harness/sandbox.d.ts +34 -0
package/dist/harness/sandbox.js +104 -0
package/dist/harness/traces.d.ts +25 -0
package/dist/harness/traces.js +168 -0
package/dist/main.js +3 -0
package/dist/tools/GrepTool/index.d.ts +4 -4
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -44,6 +44,7 @@ AI coding agent in your terminal. Works with any LLM -- free local models or clo
 - [Providers](#providers)
 - [Auth](#auth)
 - [Update](#update)
+- [Evals](#evals)
 - [FAQ](#faq)
 - [Install](#install)
 - [Development](#development)
@@ -859,6 +860,40 @@ Plugins are npm packages that bundle skills, hooks, and MCP servers:
 Save as `openharness-plugin.json` in your npm package root. Install with `npm install`, and openHarness discovers it automatically from `node_modules/`.
+## Evals
+`oh evals` runs SWE-bench-Lite-compatible evaluations against any provider, locally, with mandatory cost caps. Useful for measuring real-world bug-fix performance instead of synthetic benchmarks.
+```bash
+# Run a custom pack with a $5 total cap, 2 parallel agents
+oh evals run my-pack --max-cost-usd 5 --concurrency 2
+# Run a specific instance
+oh evals run my-pack --max-cost-usd 1 --instance django__django-11551
+# Random sample of 3
+oh evals run my-pack --max-cost-usd 2 --sample 3
+# Resume a partial run that hit its cost cap
+oh evals run my-pack --max-cost-usd 10 --resume 2026-05-05T14-30-00
+# List installed packs
+oh evals list-packs
+# Show summary of a past run
+oh evals show 2026-05-05T14-30-00
+```
+Output lives at `~/.oh/evals/runs/<run-id>/`:
+- `results.json` — full per-task data with cost, turns, duration, tests_status, error_message.
+- `predictions.json` — submittable to the SWE-bench leaderboard at https://www.swebench.com/.
+- `transcripts/<instance_id>.jsonl` — verbatim subprocess `stream-json` output per task.
+A pluggable pack contract (`pack.json` + `instances.jsonl` + `fixtures/<id>/`) lets you author packs against any test suite. The `scripts/build-evals-pack.mjs` helper bakes a SWE-bench-Lite-compatible repo at a given base_commit into a fixture; see [CONTRIBUTING.md](CONTRIBUTING.md#authoring-eval-packs).
+A bundled `swe-bench-lite-mini` pack (10 cherry-picked instances, ready to run out-of-the-box) will ship in v2.40.1.
 ## How It Works
 ```mermaid

package/README.zh-CN.md CHANGED Viewed

@@ -44,6 +44,7 @@
 - [模型提供商](#模型提供商)
 - [鉴权（Auth）](#鉴权auth)
 - [自动更新（Update）](#自动更新update)
+- [评测（Evals）](#评测evals)
 - [常见问题](#常见问题)
 - [安装](#安装)
 - [开发](#开发)
@@ -858,6 +859,40 @@ Run the deploy script with health checks...
 把它命名为 `openharness-plugin.json` 放在 npm 包根目录。安装时 `npm install`，openHarness 会自动从 `node_modules/` 中发现它。
+## 评测（Evals）
+`oh evals` 在本地针对任意 Provider 运行 SWE-bench-Lite 兼容的评测，并强制要求成本上限。用于衡量真实 Bug 修复表现，比合成基准更具参考价值。
+```bash
+# 用 5 美元总上限、2 路并发跑一个自定义 pack
+oh evals run my-pack --max-cost-usd 5 --concurrency 2
+# 只跑指定 instance
+oh evals run my-pack --max-cost-usd 1 --instance django__django-11551
+# 随机抽取 3 个
+oh evals run my-pack --max-cost-usd 2 --sample 3
+# 续跑因成本上限中断的运行
+oh evals run my-pack --max-cost-usd 10 --resume 2026-05-05T14-30-00
+# 列出已安装的 pack
+oh evals list-packs
+# 查看历史运行的汇总
+oh evals show 2026-05-05T14-30-00
+```
+输出位于 `~/.oh/evals/runs/<run-id>/`：
+- `results.json` — 每个任务的完整数据：成本、轮次、耗时、tests_status、错误信息。
+- `predictions.json` — 可直接提交到 SWE-bench 排行榜 https://www.swebench.com/。
+- `transcripts/<instance_id>.jsonl` — 每个任务子进程的原始 `stream-json` 输出。
+可插拔的 pack 协议（`pack.json` + `instances.jsonl` + `fixtures/<id>/`）允许你针对任意测试套件编写 pack。`scripts/build-evals-pack.mjs` 工具可将 SWE-bench-Lite 兼容仓库在指定 base_commit 处烘焙为 fixture，详见 [CONTRIBUTING.md](CONTRIBUTING.md#authoring-eval-packs)。
+内置的 `swe-bench-lite-mini` pack（10 个精选 instance，开箱即跑）将在 v2.40.1 版本发布。
 ## 工作原理
 ```mermaid

package/dist/commands/info.js CHANGED Viewed

@@ -10,7 +10,7 @@ import { estimateMessageTokens } from "../harness/context-warning.js";
 import { getContextWindow } from "../harness/cost.js";
 import { getHooks, invalidateHookCache } from "../harness/hooks.js";
 import { discoverPlugins, discoverSkills } from "../harness/plugins.js";
-import { formatTrace, listTracedSessions, loadTrace } from "../harness/traces.js";
+import { formatFlameGraph, formatTrace, listTracedSessions, loadTrace } from "../harness/traces.js";
 import { getVerificationConfig, invalidateVerificationCache } from "../harness/verification.js";
 import { normalizeMcpConfig } from "../mcp/config-normalize.js";
 import { connectedMcpServers, disconnectMcpClients, loadMcpTools } from "../mcp/loader.js";
@@ -358,13 +358,18 @@ export function registerInfoCommands(register, getCommandMap) {
     register("hooks", "List loaded hooks grouped by event", () => {
         return { output: formatHooksReport(getHooks()), handled: true };
     });
-    register("traces", "List sessions with persisted OTel-style traces (or show one with /traces <sessionId>)", (args) => {
-        const id = args.trim();
+    register("traces", "List sessions with persisted OTel-style traces (or show one with /traces <sessionId>; add --flame for a flame-graph view)", (args) => {
+        // Parse: `<sessionId>` for tree view, `<sessionId> --flame` (or `--flamegraph`)
+        // for the time-axis flame view. Order doesn't matter — accept the flag
+        // before or after the id.
+        const tokens = args.trim().split(/\s+/).filter(Boolean);
+        const flame = tokens.some((t) => t === "--flame" || t === "--flamegraph" || t === "--flame-graph");
+        const id = tokens.find((t) => !t.startsWith("--"));
         if (id) {
             const spans = loadTrace(id);
             if (spans.length === 0)
                 return { output: `No trace found for session ${id}.`, handled: true };
-            return { output: formatTrace(spans), handled: true };
+            return { output: flame ? formatFlameGraph(spans) : formatTrace(spans), handled: true };
         }
         const sessions = listTracedSessions();
         if (sessions.length === 0) {
@@ -788,18 +793,12 @@ export function registerInfoCommands(register, getCommandMap) {
         ];
         return { output: lines.join("\n"), handled: true };
     });
-    register("benchmark", "Run SWE-bench benchmark suite", (args) => {
-        const task = args.trim();
-        if (!task) {
-            return {
-                output: "Usage: /benchmark <task-id or 'list'>\n\nExamples:\n  /benchmark list              List available tasks\n  /benchmark django__django-1234  Run a specific task\n\nSee BENCHMARKS.md for results and methodology.",
-                handled: true,
-            };
-        }
+    register("benchmark", "Run SWE-bench benchmark suite (deprecated — use 'oh evals')", () => {
         return {
-            output: `[benchmark] ${task}`,
-            handled: false,
-            prependToPrompt: `You are running a SWE-bench benchmark task. Task: ${task}\n\nFollow the standard benchmark protocol: read the issue, understand the codebase, implement the fix, and verify with tests.`,
+            output: "/benchmark is replaced by the top-level command 'oh evals run <pack>'.\n" +
+                "Run from the shell, not from inside this REPL — evals spawn isolated subprocesses.\n" +
+                "See: oh evals --help",
+            handled: true,
         };
     });
 }

package/dist/evals/cli.d.ts ADDED Viewed

@@ -0,0 +1,22 @@
+/**
+ * oh evals — CLI surface and terminal table renderer.
+ *
+ * Three subcommands: run, list-packs, show.
+ * Terminal output: ANSI-colored Unicode-tabular layout matching the project's
+ * existing /traces table style.
+ */
+import type { Command } from "commander";
+import type { RunArtifacts } from "./types.js";
+/**
+ * Render a run as a multi-line plain-text table: a title line naming the
+ * pack, a column header, one row per result, and a summary line. A warning
+ * line is appended when `artifacts.partial` is set.
+ *
+ * @param artifacts Aggregated run data (results plus totals).
+ * @returns The table as a single newline-joined string.
+ */
+export declare function renderResultsTable(artifacts: RunArtifacts): string;
+/**
+ * Default root directory for stored runs: `<home>/.oh/evals/runs`.
+ */
+export declare function defaultOutputDir(): string;
+/**
+ * Build the path of a new run directory under {@link defaultOutputDir}, named
+ * after the current UTC timestamp (colons replaced by dashes, fractional
+ * seconds dropped). Only computes the path; the directory is not created here.
+ */
+export declare function newRunDir(): string;
+/**
+ * List the run ids found under {@link defaultOutputDir}, i.e. the entries
+ * that contain a `results.json`, sorted in descending order. Returns an empty
+ * array when the directory does not exist.
+ */
+export declare function listRunDirs(): string[];
+/**
+ * Read and JSON-parse `results.json` from the given run directory.
+ *
+ * @param runDir Directory of a single run.
+ * @returns The parsed content; it is not validated against `RunArtifacts`.
+ */
+export declare function loadRunArtifacts(runDir: string): RunArtifacts;
+/**
+ * Register the `oh evals` subcommand group on the root Commander program.
+ *
+ * Mounted from src/main.tsx alongside other top-level groups (auth, project,
+ * etc.).
+ */
+export declare function registerEvalsCommand(program: Command): void;
+//# sourceMappingURL=cli.d.ts.map

package/dist/evals/cli.js ADDED Viewed

@@ -0,0 +1,214 @@
+/**
+ * oh evals — CLI surface and terminal table renderer.
+ *
+ * Three subcommands: run, list-packs, show.
+ * Terminal output: ANSI-colored Unicode-tabular layout matching the project's
+ * existing /traces table style.
+ */
+import { existsSync, mkdirSync, readdirSync, readFileSync } from "node:fs";
+import { cpus, homedir } from "node:os";
+import { join } from "node:path";
+import { RunOrchestrator } from "./orchestrator.js";
+import { listAvailablePacks, loadPack, resolvePackDir } from "./pack-loader.js";
+/**
+ * One-character marker shown for each result status, keyed by the status
+ * string carried on a result (`r.status`).
+ */
+const STATUS_GLYPH = {
+    resolved: "✓",
+    failed: "✗",
+    error: "⚠",
+    timeout: "⏱",
+    budget_exceeded: "$",
+    skipped: "⊘",
+};
+/**
+ * Render a run as a plain-text table.
+ *
+ * Layout: title line, blank line, column header, one row per result, blank
+ * line, summary line, and, when the run is marked partial, a trailing
+ * warning naming the cost cap.
+ *
+ * @param artifacts Aggregated run data (`pack`, `results`, per-status counts,
+ *   `pass_rate`, totals, `partial`, `max_cost_usd`).
+ * @returns The table joined with newlines.
+ */
+export function renderResultsTable(artifacts) {
+    const heading = [
+        `=== oh evals — ${artifacts.pack} ===`,
+        "",
+        `  pass  task${" ".repeat(34)}turns  cost      time     note`,
+    ];
+    const rows = artifacts.results.map((result) => {
+        const mark = STATUS_GLYPH[result.status];
+        // Pad short ids and truncate long ones so the column is exactly 38 wide.
+        const task = result.instance_id.padEnd(38).slice(0, 38);
+        const turns = String(result.turns_used).padStart(5);
+        const cost = `$${result.cost_usd.toFixed(2)}`.padStart(8);
+        const elapsed = formatDuration(result.duration_ms).padStart(8);
+        // Scored outcomes speak for themselves; everything else gets a note.
+        const scored = result.status === "resolved" || result.status === "failed";
+        const remark = scored ? "" : statusNote(result);
+        return `  ${mark}     ${task} ${turns}  ${cost}  ${elapsed}  ${remark}`;
+    });
+    // Denominator counts only the four statuses summed here.
+    const attempted = artifacts.resolved + artifacts.failed + artifacts.error + artifacts.timeout;
+    const percent = attempted === 0 ? "0.0" : (artifacts.pass_rate * 100).toFixed(1);
+    const footer = [
+        "",
+        `  ${artifacts.resolved}/${attempted} resolved (${percent}%) — total $${artifacts.total_cost_usd.toFixed(2)} — ${formatDuration(artifacts.total_duration_ms)} elapsed`,
+    ];
+    if (artifacts.partial) {
+        footer.push(`  ⚠ run halted at task ${artifacts.results.length} — total cost cap of $${artifacts.max_cost_usd} reached`);
+    }
+    return [...heading, ...rows, ...footer].join("\n");
+}
+/**
+ * Format a millisecond duration for the results table.
+ *
+ *   - non-finite or missing input -> "n/a" (previously rendered "NaNmNaNs")
+ *   - below one second            -> "<n>ms", rounded to a whole millisecond
+ *   - below one minute            -> "<s>s"
+ *   - otherwise                   -> "<m>m<ss>s" with zero-padded seconds
+ *
+ * @param ms Duration in milliseconds.
+ * @returns A short human-readable duration.
+ */
+function formatDuration(ms) {
+    if (typeof ms !== "number" || !Number.isFinite(ms))
+        return "n/a";
+    // Round first so a value such as 999.6 is not printed as "999.6ms" or "1000ms".
+    const wholeMs = Math.round(ms);
+    if (wholeMs < 1000)
+        return `${wholeMs}ms`;
+    const totalSeconds = Math.round(ms / 1000);
+    const minutes = Math.floor(totalSeconds / 60);
+    const seconds = totalSeconds % 60;
+    if (minutes === 0)
+        return `${seconds}s`;
+    return `${minutes}m${seconds.toString().padStart(2, "0")}s`;
+}
+/**
+ * Text for the "note" column of a result row.
+ *
+ * `error` and `skipped` show the first 40 characters of `error_message` when
+ * one is present and fall back to the status name; `timeout` and
+ * `budget_exceeded` show the status name; any other status yields "".
+ *
+ * @param r A single task result (`status`, optional `error_message`).
+ * @returns The note text, possibly empty.
+ */
+function statusNote(r) {
+    const { status, error_message: message } = r;
+    if (status === "timeout" || status === "budget_exceeded")
+        return status;
+    if (status === "error" || status === "skipped")
+        return message ? message.slice(0, 40) : status;
+    return "";
+}
+/**
+ * Default root directory for stored runs: `<home>/.oh/evals/runs`.
+ *
+ * @returns The path, built from the current user's home directory.
+ */
+export function defaultOutputDir() {
+    const segments = [".oh", "evals", "runs"];
+    return join(homedir(), ...segments);
+}
+/**
+ * Path of a new run directory under the default output root, named after
+ * the current UTC time, e.g. `2026-05-05T14-30-00`. Only computes the path.
+ *
+ * @returns The run directory path.
+ */
+export function newRunDir() {
+    // toISOString() -> "YYYY-MM-DDTHH:mm:ss.sssZ"; keep everything before the
+    // fractional seconds and replace the colons with dashes.
+    const [untilSeconds] = new Date().toISOString().split(".");
+    const runId = untilSeconds.replace(/:/g, "-");
+    return join(defaultOutputDir(), runId);
+}
+/**
+ * List stored run ids, in descending order.
+ *
+ * An entry of the default output root counts as a run only when it contains
+ * a `results.json`.
+ *
+ * @returns Run directory names, or an empty array when the root is missing.
+ */
+export function listRunDirs() {
+    const root = defaultOutputDir();
+    if (!existsSync(root))
+        return [];
+    const runIds = [];
+    for (const entry of readdirSync(root)) {
+        if (existsSync(join(root, entry, "results.json")))
+            runIds.push(entry);
+    }
+    runIds.sort();
+    runIds.reverse();
+    return runIds;
+}
+/**
+ * Read and JSON-parse `results.json` from a run directory.
+ *
+ * The parsed value is returned as-is, without shape validation. A missing
+ * file or malformed JSON surfaces as the exception thrown by
+ * `readFileSync` / `JSON.parse`.
+ *
+ * @param runDir Directory of a single run.
+ * @returns The parsed content of `results.json`.
+ */
+export function loadRunArtifacts(runDir) {
+    const resultsPath = join(runDir, "results.json");
+    const raw = readFileSync(resultsPath, "utf-8");
+    return JSON.parse(raw);
+}
+/**
+ * Register the `oh evals` subcommand group on the root Commander program.
+ *
+ * Mounted from src/main.tsx alongside other top-level groups (auth, project,
+ * etc.).
+ *
+ * Adds three subcommands:
+ *   - `run [pack]`: selects tasks from a pack, runs them through a
+ *     RunOrchestrator and prints a table (or JSON with `--json`).
+ *   - `list-packs`: prints one pack name per line.
+ *   - `show <run-id>`: re-renders the table of a stored run.
+ *
+ * Every usage error (unknown pack/instance/run, malformed flag value, empty
+ * selection) is reported on stderr and terminates the process with status 2.
+ *
+ * @param program Root Commander program the `evals` group is attached to.
+ */
+export function registerEvalsCommand(program) {
+    const evalsCmd = program.command("evals").description("Run eval packs against the agent");
+    evalsCmd
+        .command("run [pack]", { isDefault: false })
+        .description("Run an eval pack (default: swe-bench-lite-mini)")
+        .requiredOption("--max-cost-usd <amount>", "REQUIRED. Total cost cap for the run in USD.")
+        .option("--max-task-cost-usd <amount>", "Per-task cap (default: max-cost-usd / num_tasks)")
+        .option("--max-task-turns <n>", "Per-task tool-use cap", "50")
+        .option("--task-timeout <seconds>", "Wall-clock per-task kill in seconds", "600")
+        .option("--concurrency <n>", "Parallel subprocess agents", "1")
+        .option("--model <model>", "Model under test")
+        .option("--fallback-model <model>", "One-shot fallback model")
+        .option("--instance <id>", "Run only this instance")
+        .option("--sample <n>", "Random N instances")
+        .option("--filter <regex>", "Run instances whose instance_id matches the regex")
+        .option("--resume <run-id>", "Continue a partial run; skip already-completed instances")
+        .option("--json", "Emit run summary as JSON to stdout (still writes files)")
+        .option("--output-dir <path>", "Override default ~/.oh/evals/runs/")
+        .action(async (packArg, opts) => {
+        const packName = packArg ?? "swe-bench-lite-mini";
+        const packDir = resolvePackDir(packName);
+        if (!packDir) {
+            exitWithUsageError(`pack not found: ${packName}`, `available packs: ${listAvailablePacks().join(", ") || "(none)"}`);
+        }
+        // Validate every numeric flag up front so a typo cannot reach the
+        // orchestrator as NaN (which would silently disable a cap).
+        const maxCostUsd = parsePositiveOption(opts.maxCostUsd, "--max-cost-usd");
+        const maxTaskCostUsd = opts.maxTaskCostUsd !== undefined
+            ? parsePositiveOption(opts.maxTaskCostUsd, "--max-task-cost-usd")
+            : undefined;
+        const maxTaskTurns = parsePositiveOption(opts.maxTaskTurns, "--max-task-turns", true);
+        const taskTimeoutSeconds = parsePositiveOption(opts.taskTimeout, "--task-timeout");
+        const sampleSize = opts.sample !== undefined ? parsePositiveOption(opts.sample, "--sample", true) : undefined;
+        const requestedConcurrency = Number(opts.concurrency);
+        if (!Number.isFinite(requestedConcurrency)) {
+            exitWithUsageError(`--concurrency must be a number, got '${opts.concurrency}'`);
+        }
+        // Clamp to [1, CPU count]; os.cpus() may report an empty list, hence
+        // the outer max.
+        const concurrency = Math.max(1, Math.min(Math.floor(requestedConcurrency), cpus().length));
+        const { pack, tasks: allTasks } = loadPack(packDir);
+        // Filter / sample.
+        let tasks = allTasks;
+        if (opts.instance) {
+            tasks = tasks.filter((t) => t.instance_id === opts.instance);
+            if (tasks.length === 0) {
+                exitWithUsageError(`instance not found in pack: ${opts.instance}`);
+            }
+        }
+        if (opts.filter) {
+            const re = compileInstanceFilter(opts.filter);
+            tasks = tasks.filter((t) => re.test(t.instance_id));
+        }
+        if (sampleSize !== undefined) {
+            tasks = sampleTasks(tasks, sampleSize);
+        }
+        if (tasks.length === 0) {
+            exitWithUsageError("no tasks selected after filters");
+        }
+        // --output-dir replaces the runs root; --resume names the run inside
+        // that root. Combining the two must reopen the existing run directory
+        // rather than start a fresh timestamped one.
+        const runsRoot = opts.outputDir || defaultOutputDir();
+        const runDir = join(runsRoot, opts.resume || isoSlug());
+        mkdirSync(runDir, { recursive: true });
+        const orch = new RunOrchestrator({
+            pack,
+            packDir,
+            tasks,
+            model: opts.model ?? "claude-sonnet-4-6",
+            fallbackModel: opts.fallbackModel,
+            maxCostUsd,
+            maxTaskCostUsd,
+            maxTaskTurns,
+            taskTimeoutMs: taskTimeoutSeconds * 1000,
+            concurrency,
+            runDir,
+            resumeFromRunId: opts.resume,
+            onTaskStart: (t) => console.log(`▶ ${t.instance_id}`),
+            onTaskComplete: (r) => console.log(`  ${STATUS_GLYPH[r.status]} ${r.instance_id} ($${r.cost_usd.toFixed(2)}, ${r.turns_used} turns)`),
+        });
+        const stop = () => orch.cancel();
+        process.on("SIGINT", stop);
+        process.on("SIGTERM", stop);
+        let artifacts;
+        try {
+            artifacts = await orch.run();
+        }
+        finally {
+            // Restore default signal behaviour once the run is over so a
+            // later Ctrl-C is not swallowed by a stale listener.
+            process.off("SIGINT", stop);
+            process.off("SIGTERM", stop);
+        }
+        if (opts.json) {
+            process.stdout.write(`${JSON.stringify(artifacts, null, 2)}\n`);
+        }
+        else {
+            console.log("");
+            console.log(renderResultsTable(artifacts));
+            console.log("");
+            console.log(`Detailed:    ${join(runDir, "results.json")}`);
+            console.log(`Submittable: ${join(runDir, "predictions.json")}`);
+        }
+    });
+    evalsCmd
+        .command("list-packs")
+        .description("List bundled and user-installed eval packs")
+        .action(() => {
+        const packs = listAvailablePacks();
+        if (packs.length === 0) {
+            console.log("(no packs installed)");
+            return;
+        }
+        for (const p of packs)
+            console.log(p);
+    });
+    evalsCmd
+        .command("show <run-id>")
+        .description("Print summary table for a past run from ~/.oh/evals/runs/")
+        .action((runId) => {
+        const dir = join(defaultOutputDir(), runId);
+        if (!existsSync(join(dir, "results.json"))) {
+            exitWithUsageError(`run not found: ${runId}`, "available runs:", ...listRunDirs().map((r) => `  ${r}`));
+        }
+        console.log(renderResultsTable(loadRunArtifacts(dir)));
+    });
+}
+/** Print each line to stderr, then exit with the usage-error status (2). */
+function exitWithUsageError(...lines) {
+    for (const line of lines)
+        console.error(line);
+    process.exit(2);
+}
+/** Parse a flag value as a positive number (optionally a whole number); exit with a usage error otherwise. */
+function parsePositiveOption(raw, flag, integer = false) {
+    const value = Number(raw);
+    const valid = Number.isFinite(value) && value > 0 && (!integer || Number.isInteger(value));
+    if (!valid) {
+        exitWithUsageError(`${flag} must be a positive ${integer ? "integer" : "number"}, got '${raw}'`);
+    }
+    return value;
+}
+/** Compile the --filter pattern, turning a syntax error into a usage error instead of an uncaught exception. */
+function compileInstanceFilter(pattern) {
+    try {
+        return new RegExp(pattern);
+    }
+    catch (err) {
+        const reason = err instanceof Error ? err.message : String(err);
+        exitWithUsageError(`--filter is not a valid regular expression: ${reason}`);
+        throw err;
+    }
+}
+/** Pick up to `count` tasks uniformly at random (partial Fisher–Yates shuffle) without mutating the input. */
+function sampleTasks(tasks, count) {
+    const pool = [...tasks];
+    const take = Math.min(count, pool.length);
+    for (let i = 0; i < take; i++) {
+        const j = i + Math.floor(Math.random() * (pool.length - i));
+        [pool[i], pool[j]] = [pool[j], pool[i]];
+    }
+    return pool.slice(0, take);
+}
+/**
+ * Current UTC time as a filesystem-friendly slug, e.g. `2026-05-05T14-30-00`:
+ * the ISO-8601 string with colons replaced by dashes and everything from the
+ * fractional seconds onward removed.
+ */
+function isoSlug() {
+    const [untilSeconds] = new Date().toISOString().split(".");
+    return untilSeconds.replace(/:/g, "-");
+}
+//# sourceMappingURL=cli.js.map

package/dist/evals/index.d.ts ADDED Viewed

@@ -0,0 +1,12 @@
+/**
+ * oh evals — public re-exports for the eval harness.
+ */
+export type { OrchestratorOptions, TaskSpawnOpts } from "./orchestrator.js";
+export { RunOrchestrator } from "./orchestrator.js";
+export { listAvailablePacks, loadPack, resolveFixturePath, resolvePackDir, validatePack } from "./pack-loader.js";
+export type { RunHeader } from "./run-writer.js";
+export { RunWriter } from "./run-writer.js";
+export type { ScoreResult, TestOutcome } from "./scorer.js";
+export { parseJunitXml, scoreTask } from "./scorer.js";
+export type { EvalsPack, EvalsResult, EvalsStatus, EvalsTask, RunArtifacts, TestsStatus, } from "./types.js";
+//# sourceMappingURL=index.d.ts.map

package/dist/evals/index.js ADDED Viewed

@@ -0,0 +1,8 @@
+/**
+ * oh evals — public re-exports for the eval harness.
+ */
+export { RunOrchestrator } from "./orchestrator.js";
+export { listAvailablePacks, loadPack, resolveFixturePath, resolvePackDir, validatePack } from "./pack-loader.js";
+export { RunWriter } from "./run-writer.js";
+export { parseJunitXml, scoreTask } from "./scorer.js";
+//# sourceMappingURL=index.js.map

package/dist/evals/orchestrator.d.ts ADDED Viewed

@@ -0,0 +1,64 @@
+/**
+ * oh evals — run orchestrator.
+ *
+ * Coordinates the full run lifecycle:
+ *   - manages a concurrency pool of N parallel task workers
+ *   - per task: extract repo tarball → setup.sh → spawn `oh run` subprocess
+ *     → tee stdout to transcript file + parse stream-json → git diff →
+ *     scoreTask → RunWriter.appendResult → cleanup worktree
+ *   - aggregates total cost; halts scheduling when total >= max_cost_usd
+ *   - resumability: skip instance_ids already in results.jsonl
+ *   - cancellation: cancel() sets flag, SIGTERMs running subs, then SIGKILL
+ *
+ * Subprocess command (no --working-dir flag — we use spawn's cwd option):
+ *   node dist/main.js run --bare --output-format stream-json
+ *     --no-session-persistence --max-budget-usd <cap> --max-turns <n>
+ *     --model <model> "<problem_statement>"
+ */
+import type { EvalsPack, EvalsResult, EvalsTask, RunArtifacts } from "./types.js";
+/**
+ * Configuration accepted by the RunOrchestrator constructor.
+ *
+ * Only the shape is declared here; the notes below are drawn from the
+ * lifecycle description at the top of this file and are hedged where the
+ * behaviour lives in the implementation.
+ */
+export type OrchestratorOptions = {
+    /** Pack metadata for the run. */
+    pack: EvalsPack;
+    /** Directory of the pack (presumably where its fixtures are resolved from — confirm in the implementation). */
+    packDir: string;
+    /** Tasks to run. */
+    tasks: EvalsTask[];
+    /** Model under test; passed to the subprocess as `--model`. */
+    model: string;
+    /** Optional fallback model. NOTE(review): how and when it is used is not visible in this declaration. */
+    fallbackModel?: string;
+    /** Total cost cap in USD; scheduling halts once the aggregated cost reaches it. */
+    maxCostUsd: number;
+    /** Optional per-task cost cap in USD. NOTE(review): the default when omitted is decided by the implementation. */
+    maxTaskCostUsd?: number;
+    /** Per-task turn cap; passed to the subprocess as `--max-turns`. */
+    maxTaskTurns: number;
+    /** Per-task time limit in milliseconds. */
+    taskTimeoutMs: number;
+    /** Number of parallel task workers. */
+    concurrency: number;
+    /** Directory the run's output is written to. */
+    runDir: string;
+    /** Id of a run being resumed; already-recorded instance ids are skipped. */
+    resumeFromRunId?: string;
+    /** Path to dist/main.js. Default = resolved from package root. Overridable for tests. */
+    ohEntry?: string;
+    /** Override the subprocess executable (default: process.execPath). Tests use the fake-oh-run stub. */
+    subprocessExec?: string;
+    /** Override the args (default = the `oh run` arg list). Tests use ["<stub>"]. */
+    subprocessArgvBuilder?: (task: EvalsTask, opts: TaskSpawnOpts) => {
+        exec: string;
+        args: string[];
+    };
+    /** Optional callback receiving a task (by its name, invoked when the task starts). */
+    onTaskStart?: (task: EvalsTask) => void;
+    /** Optional callback receiving a task's result (by its name, invoked when the task completes). */
+    onTaskComplete?: (result: EvalsResult) => void;
+};
+/**
+ * Per-task values handed to `subprocessArgvBuilder` when the command line
+ * for a task's subprocess is built.
+ */
+export type TaskSpawnOpts = {
+    /** Working directory for the task (presumably the extracted repo used as the subprocess cwd — confirm). */
+    worktreeDir: string;
+    /** Cost cap in USD applied to this single task. */
+    perTaskCostCap: number;
+    /** Turn cap applied to this single task. */
+    maxTurns: number;
+    /** Model the task is run with. */
+    model: string;
+};
+/**
+ * Drives one eval run end to end; the lifecycle is described in the comment
+ * at the top of this file. Construct with {@link OrchestratorOptions}, then
+ * await {@link RunOrchestrator.run}.
+ */
+export declare class RunOrchestrator {
+    private readonly opts;
+    private readonly writer;
+    private readonly perTaskCap;
+    /** Presumably the flag set by cancel() — confirm in the implementation. */
+    private cancelled;
+    /** Presumably set once scheduling stops because of the total cost cap — confirm. */
+    private halted;
+    /** Presumably the aggregated cost of the run so far — confirm. */
+    private totalCost;
+    private readonly running;
+    /** Presumably the instance ids skipped when resuming — confirm. */
+    private readonly skipIds;
+    constructor(opts: OrchestratorOptions);
+    /** Request cancellation of the run (per the file header: sets a flag and signals running subprocesses). */
+    cancel(): void;
+    /** Execute the run and resolve with the aggregated artifacts. */
+    run(): Promise<RunArtifacts>;
+    private runOneTask;
+}
+//# sourceMappingURL=orchestrator.d.ts.map