npm - @zhijiewang/openharness - Versions diffs - 2.39.0 → 2.40.0 - Mend

@zhijiewang/openharness 2.39.0 → 2.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/README.md +35 -0
package/README.zh-CN.md +35 -0
package/dist/commands/info.js +5 -11
package/dist/evals/cli.d.ts +22 -0
package/dist/evals/cli.js +214 -0
package/dist/evals/index.d.ts +12 -0
package/dist/evals/index.js +8 -0
package/dist/evals/orchestrator.d.ts +64 -0
package/dist/evals/orchestrator.js +391 -0
package/dist/evals/pack-loader.d.ts +29 -0
package/dist/evals/pack-loader.js +153 -0
package/dist/evals/run-writer.d.ts +35 -0
package/dist/evals/run-writer.js +94 -0
package/dist/evals/scorer.d.ts +34 -0
package/dist/evals/scorer.js +127 -0
package/dist/evals/types.d.ts +74 -0
package/dist/evals/types.js +10 -0
package/dist/harness/sandbox.d.ts +34 -0
package/dist/harness/sandbox.js +104 -0
package/dist/main.js +3 -0
package/dist/tools/GrepTool/index.d.ts +2 -2
package/package.json +1 -1

package/dist/evals/run-writer.js ADDED Viewed

@@ -0,0 +1,94 @@
+/**
+ * oh evals — run writer.
+ *
+ * Streams per-task results to disk atomically:
+ *  - results.jsonl   : append-only, one EvalsResult per line
+ *  - predictions.json: array, rewritten on each append, SWE-bench-submittable
+ *  - results.json    : merged + aggregates, written ONLY by finalize()
+ *
+ * Crash-safety: results.jsonl + predictions.json are valid up to the last
+ * successful append. `oh evals run --resume <run_id>` reads results.jsonl
+ * to determine completed instance_ids.
+ */
+import { appendFileSync, existsSync, mkdirSync, readFileSync, renameSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+export class RunWriter {
+    runDir;
+    header;
+    results = [];
+    constructor(runDir, header) {
+        this.runDir = runDir;
+        this.header = header;
+        mkdirSync(runDir, { recursive: true });
+        mkdirSync(join(runDir, "transcripts"), { recursive: true });
+    }
+    appendResult(result) {
+        this.results.push(result);
+        // results.jsonl — append a single line atomically.
+        const line = `${JSON.stringify(result)}\n`;
+        appendFileSync(join(this.runDir, "results.jsonl"), line);
+        // predictions.json — rewrite the array atomically (.tmp → rename).
+        const preds = this.results.map((r) => ({
+            instance_id: r.instance_id,
+            model_patch: r.model_patch,
+            model_name_or_path: this.header.model,
+        }));
+        const tmp = join(this.runDir, "predictions.json.tmp");
+        writeFileSync(tmp, JSON.stringify(preds, null, 2));
+        renameSync(tmp, join(this.runDir, "predictions.json"));
+    }
+    loadExistingResults() {
+        const path = join(this.runDir, "results.jsonl");
+        if (!existsSync(path))
+            return [];
+        return readFileSync(path, "utf-8")
+            .split("\n")
+            .filter((l) => l.trim().length > 0)
+            .map((l) => JSON.parse(l));
+    }
+    finalize(opts) {
+        const counts = {
+            resolved: 0,
+            failed: 0,
+            error: 0,
+            timeout: 0,
+            budget_exceeded: 0,
+            skipped: 0,
+        };
+        let totalCost = 0;
+        let totalDuration = 0;
+        for (const r of this.results) {
+            counts[r.status]++;
+            totalCost += r.cost_usd;
+            totalDuration += r.duration_ms;
+        }
+        const denom = counts.resolved + counts.failed + counts.error + counts.timeout;
+        const passRate = denom === 0 ? 0 : counts.resolved / denom;
+        const artifacts = {
+            run_id: this.header.run_id,
+            pack: this.header.pack,
+            pack_version: this.header.pack_version,
+            model: this.header.model,
+            harness_version: this.header.harness_version,
+            started_at: this.header.started_at,
+            finished_at: opts.finished_at,
+            total_cost_usd: totalCost,
+            max_cost_usd: this.header.max_cost_usd,
+            total_duration_ms: totalDuration,
+            resolved: counts.resolved,
+            failed: counts.failed,
+            error: counts.error,
+            timeout: counts.timeout,
+            budget_exceeded: counts.budget_exceeded,
+            skipped: counts.skipped,
+            pass_rate: passRate,
+            partial: opts.partial,
+            results: [...this.results],
+        };
+        const tmp = join(this.runDir, "results.json.tmp");
+        writeFileSync(tmp, JSON.stringify(artifacts, null, 2));
+        renameSync(tmp, join(this.runDir, "results.json"));
+        return artifacts;
+    }
+}
+//# sourceMappingURL=run-writer.js.map

package/dist/evals/scorer.d.ts ADDED Viewed

@@ -0,0 +1,34 @@
+/**
+ * oh evals — scorer.
+ *
+ * After the agent runs, we score the task by:
+ *  (1) Running an oracle script (oracle.sh / oracle.mjs) if one exists in
+ *      the fixture dir — exit 0 = pass.
+ *  (2) Else running the pack's default test command and parsing the
+ *      junit-xml output for FAIL_TO_PASS / PASS_TO_PASS test IDs.
+ *
+ * Test ID convention matches SWE-bench: "<classname>.<name>".
+ */
+import type { EvalsTask, TestsStatus } from "./types.js";
+/**
+ * Outcome recorded for one junit `<testcase>`: "fail" when it contains a
+ * `<failure>` or `<error>` child, "skip" when it contains `<skipped>`,
+ * otherwise "pass".
+ */
+export type TestOutcome = "pass" | "fail" | "skip";
+/**
+ * Minimal junit-xml parser. Returns a map of "<classname>.<name>" → outcome.
+ *
+ * We don't take a full XML parser dependency; pytest's junit-xml is
+ * well-formed and simple enough to extract testcase elements with regex.
+ */
+export declare function parseJunitXml(xml: string): Record<string, TestOutcome>;
+/** Result of scoring a single task. */
+export type ScoreResult = {
+    /**
+     * True when the oracle script exited 0, or — without an oracle — when every
+     * listed FAIL_TO_PASS and PASS_TO_PASS test has outcome "pass".
+     */
+    resolved: boolean;
+    /** Per-test buckets; both lists are empty when an oracle decided the result. */
+    tests_status: TestsStatus;
+    /** True when oracle.sh or oracle.mjs from the fixture dir decided the result. */
+    oracle_used: boolean;
+    /**
+     * Optional diagnostic text, e.g. the tail of the oracle's stderr or a note
+     * that the test command produced no junit-xml file.
+     */
+    error_message?: string;
+};
+/**
+ * Scores one task after the agent has run.
+ *
+ * Runs `oracle.sh` (or, failing that, `oracle.mjs`) from `fixtureDir` when one
+ * exists; otherwise runs `packDefaultTestCommand` in `worktreeDir` and reads
+ * `.oh-evals-results.xml` from that directory. `testTimeoutMs` bounds whichever
+ * process is spawned.
+ */
+export declare function scoreTask(args: {
+    task: EvalsTask;
+    worktreeDir: string;
+    fixtureDir: string;
+    packDefaultTestCommand: string;
+    testTimeoutMs: number;
+}): Promise<ScoreResult>;
+//# sourceMappingURL=scorer.d.ts.map

package/dist/evals/scorer.js ADDED Viewed

@@ -0,0 +1,127 @@
+/**
+ * oh evals — scorer.
+ *
+ * After the agent runs, we score the task by:
+ *  (1) Running an oracle script (oracle.sh / oracle.mjs) if one exists in
+ *      the fixture dir — exit 0 = pass.
+ *  (2) Else running the pack's default test command and parsing the
+ *      junit-xml output for FAIL_TO_PASS / PASS_TO_PASS test IDs.
+ *
+ * Test ID convention matches SWE-bench: "<classname>.<name>".
+ */
+import { spawnSync } from "node:child_process";
+import { existsSync, readFileSync } from "node:fs";
+import { join } from "node:path";
+/**
+ * Minimal junit-xml parser. Returns a map of "<classname>.<name>" → outcome.
+ *
+ * We don't take a full XML parser dependency; pytest's junit-xml is
+ * well-formed and simple enough to extract testcase elements with regex.
+ */
+export function parseJunitXml(xml) {
+    const out = {};
+    const testcaseRe = /<testcase\b([^>]*?)(?:\/>|>([\s\S]*?)<\/testcase>)/g;
+    let match = testcaseRe.exec(xml);
+    while (match !== null) {
+        const attrs = match[1];
+        const inner = match[2] ?? "";
+        const cn = /classname="([^"]*)"/.exec(attrs)?.[1];
+        const name = /\bname="([^"]*)"/.exec(attrs)?.[1];
+        if (cn && name) {
+            const id = `${cn}.${name}`;
+            if (/<failure\b/.test(inner) || /<error\b/.test(inner)) {
+                out[id] = "fail";
+            }
+            else if (/<skipped\b/.test(inner)) {
+                out[id] = "skip";
+            }
+            else {
+                out[id] = "pass";
+            }
+        }
+        match = testcaseRe.exec(xml);
+    }
+    return out;
+}
+const EMPTY_TESTS_STATUS = {
+    FAIL_TO_PASS: { success: [], failure: [] },
+    PASS_TO_PASS: { success: [], failure: [] },
+};
+export async function scoreTask(args) {
+    const { task, worktreeDir, fixtureDir, packDefaultTestCommand, testTimeoutMs } = args;
+    // (1) Oracle escape hatch.
+    const oracleSh = join(fixtureDir, "oracle.sh");
+    const oracleMjs = join(fixtureDir, "oracle.mjs");
+    if (existsSync(oracleSh)) {
+        const r = spawnSync(oracleSh, [], {
+            cwd: worktreeDir,
+            env: {
+                ...process.env,
+                INSTANCE_ID: task.instance_id,
+                WORKTREE_DIR: worktreeDir,
+                FIXTURE_DIR: fixtureDir,
+            },
+            timeout: testTimeoutMs,
+            shell: process.platform === "win32",
+        });
+        return {
+            resolved: r.status === 0,
+            tests_status: EMPTY_TESTS_STATUS,
+            oracle_used: true,
+            error_message: r.status === 0 ? undefined : (r.stderr?.toString().slice(-500) ?? ""),
+        };
+    }
+    if (existsSync(oracleMjs)) {
+        const r = spawnSync(process.execPath, [oracleMjs], {
+            cwd: worktreeDir,
+            env: {
+                ...process.env,
+                INSTANCE_ID: task.instance_id,
+                WORKTREE_DIR: worktreeDir,
+                FIXTURE_DIR: fixtureDir,
+            },
+            timeout: testTimeoutMs,
+        });
+        return {
+            resolved: r.status === 0,
+            tests_status: EMPTY_TESTS_STATUS,
+            oracle_used: true,
+            error_message: r.status === 0 ? undefined : (r.stderr?.toString().slice(-500) ?? ""),
+        };
+    }
+    // (2) Default test command.
+    const r = spawnSync(packDefaultTestCommand, {
+        cwd: worktreeDir,
+        shell: true,
+        timeout: testTimeoutMs,
+    });
+    const xmlPath = join(worktreeDir, ".oh-evals-results.xml");
+    if (!existsSync(xmlPath)) {
+        return {
+            resolved: false,
+            tests_status: structuredClone(EMPTY_TESTS_STATUS),
+            oracle_used: false,
+            error_message: `junit-xml not produced at ${xmlPath} (test command exit ${r.status}). stderr: ${r.stderr?.toString().slice(-500) ?? ""}`,
+        };
+    }
+    const outcomes = parseJunitXml(readFileSync(xmlPath, "utf-8"));
+    const tests_status = {
+        FAIL_TO_PASS: { success: [], failure: [] },
+        PASS_TO_PASS: { success: [], failure: [] },
+    };
+    for (const id of task.FAIL_TO_PASS) {
+        if (outcomes[id] === "pass")
+            tests_status.FAIL_TO_PASS.success.push(id);
+        else
+            tests_status.FAIL_TO_PASS.failure.push(id);
+    }
+    for (const id of task.PASS_TO_PASS) {
+        if (outcomes[id] === "pass")
+            tests_status.PASS_TO_PASS.success.push(id);
+        else
+            tests_status.PASS_TO_PASS.failure.push(id);
+    }
+    const resolved = tests_status.FAIL_TO_PASS.failure.length === 0 && tests_status.PASS_TO_PASS.failure.length === 0;
+    return { resolved, tests_status, oracle_used: false };
+}
+//# sourceMappingURL=scorer.js.map

package/dist/evals/types.d.ts ADDED Viewed

@@ -0,0 +1,74 @@
+/**
+ * oh evals — type definitions for the eval harness.
+ *
+ * Schema mirrors SWE-bench's evaluation contract so packs of cherry-picked
+ * SWE-bench Lite instances drop in unmodified. Our `EvalsResult` is a
+ * superset of SWE-bench's `results.json` per-instance shape, with cost,
+ * turns, duration, and transcript-path enrichments.
+ */
+/**
+ * One eval task.
+ *
+ * NOTE(review): `repo`, `base_commit`, `problem_statement` and `hints_text`
+ * presumably carry their SWE-bench meanings; they are not exercised by the
+ * scorer, so confirm against the orchestrator.
+ */
+export type EvalsTask = {
+    /** Task identifier; exported to oracle scripts as the INSTANCE_ID environment variable. */
+    instance_id: string;
+    repo: string;
+    base_commit: string;
+    problem_statement: string;
+    /** Test IDs ("<classname>.<name>") that must report "pass" for the task to be resolved. */
+    FAIL_TO_PASS: string[];
+    /** Test IDs ("<classname>.<name>") that must also report "pass" for the task to be resolved. */
+    PASS_TO_PASS: string[];
+    hints_text?: string;
+};
+/**
+ * Metadata describing a pack of eval tasks.
+ *
+ * NOTE(review): field semantics are defined by the pack loader, which is not
+ * part of this section; the notes below are assumptions to confirm.
+ */
+export type EvalsPack = {
+    name: string;
+    version: string;
+    description: string;
+    language: "python" | "javascript" | "typescript" | "polyglot";
+    runner_requirements: string[];
+    /** Presumably the command handed to scoreTask as `packDefaultTestCommand` — TODO confirm. */
+    default_test_command: string;
+    instance_count: number;
+    compatible_with?: string;
+};
+/** Final status of one task; each value has its own counter in RunArtifacts. */
+export type EvalsStatus = "resolved" | "failed" | "error" | "timeout" | "budget_exceeded" | "skipped";
+/**
+ * Per-test scoring buckets. For each list of expected test IDs, `success`
+ * holds the IDs whose junit outcome was "pass" and `failure` holds all others
+ * (failed, skipped, or absent from the report).
+ */
+export type TestsStatus = {
+    FAIL_TO_PASS: {
+        success: string[];
+        failure: string[];
+    };
+    PASS_TO_PASS: {
+        success: string[];
+        failure: string[];
+    };
+};
+/** Result of one task, stored as one line of results.jsonl. */
+export type EvalsResult = {
+    /** Copied into the matching predictions.json entry. */
+    instance_id: string;
+    /** Selects which RunArtifacts counter this result increments. */
+    status: EvalsStatus;
+    resolved: boolean;
+    /** Summed into RunArtifacts.total_cost_usd. */
+    cost_usd: number;
+    turns_used: number;
+    /** Summed into RunArtifacts.total_duration_ms. */
+    duration_ms: number;
+    /** Copied into the matching predictions.json entry. */
+    model_patch: string;
+    tests_status: TestsStatus;
+    transcript_path: string;
+    error_message?: string;
+    started_at: string;
+    finished_at: string;
+};
+/**
+ * Contents of results.json as written by RunWriter.finalize(): run metadata
+ * copied from the run header, aggregates over the results, and the results
+ * themselves.
+ */
+export type RunArtifacts = {
+    run_id: string;
+    pack: string;
+    pack_version: string;
+    model: string;
+    harness_version: string;
+    started_at: string;
+    /** Supplied by the caller of finalize(). */
+    finished_at: string;
+    /** Sum of `cost_usd` over all results. */
+    total_cost_usd: number;
+    max_cost_usd: number;
+    /** Sum of `duration_ms` over all results. */
+    total_duration_ms: number;
+    /** Number of results per status. */
+    resolved: number;
+    failed: number;
+    error: number;
+    timeout: number;
+    budget_exceeded: number;
+    skipped: number;
+    /**
+     * resolved / (resolved + failed + error + timeout); 0 when that denominator
+     * is 0. budget_exceeded and skipped results are not counted.
+     */
+    pass_rate: number;
+    /** Supplied by the caller of finalize(). */
+    partial: boolean;
+    results: EvalsResult[];
+};
+//# sourceMappingURL=types.d.ts.map

package/dist/evals/types.js ADDED Viewed

@@ -0,0 +1,10 @@
+/**
+ * oh evals — type definitions for the eval harness.
+ *
+ * Schema mirrors SWE-bench's evaluation contract so packs of cherry-picked
+ * SWE-bench Lite instances drop in unmodified. Our `EvalsResult` is a
+ * superset of SWE-bench's `results.json` per-instance shape, with cost,
+ * turns, duration, and transcript-path enrichments.
+ */
+export {};
+//# sourceMappingURL=types.js.map

package/dist/harness/sandbox.d.ts ADDED Viewed

@@ -0,0 +1,34 @@
+/**
+ * Sandbox — filesystem and network restrictions for tool execution.
+ *
+ * Limits what tools can access:
+ * - File tools: only write to allowed paths
+ * - Web tools: only access allowed domains
+ * - Bash: restricted commands (no curl/wget by default)
+ *
+ * Reduces permission prompts while maintaining security.
+ */
+/** Sandbox settings; every check returns "allowed" while `enabled` is false. */
+export type SandboxConfig = {
+    enabled: boolean;
+    /**
+     * Directories tools can write to, resolved relative to cwd. A path is
+     * allowed when it lies inside one of these directories; entries are matched
+     * by containment, not expanded as glob patterns.
+     */
+    allowedPaths: string[];
+    /**
+     * Domains WebFetch/WebSearch can access. A host matches an entry exactly or
+     * as a subdomain of it (case-insensitive). An empty list allows all domains.
+     */
+    allowedDomains: string[];
+    /** Block all network access */
+    blockNetwork: boolean;
+    /** Commands blocked in Bash (default: curl, wget) */
+    blockedCommands: string[];
+};
+/** Get the current sandbox config (cached after the first read) */
+export declare function getSandboxConfig(): SandboxConfig;
+/** Reset cached config so the next getSandboxConfig() re-reads it */
+export declare function invalidateSandboxCache(): void;
+/** Check if a file path is allowed for writing */
+export declare function isPathAllowed(filePath: string): boolean;
+/** Check if a domain is allowed for network access; an unparseable URL is rejected when an allow-list is set */
+export declare function isDomainAllowed(url: string): boolean;
+/** Check if a bash command is allowed */
+export declare function isCommandAllowed(command: string): boolean;
+/** Get a human-readable, multi-line sandbox status */
+export declare function sandboxStatus(): string;
+//# sourceMappingURL=sandbox.d.ts.map

package/dist/harness/sandbox.js ADDED Viewed

@@ -0,0 +1,104 @@
+/**
+ * Sandbox — filesystem and network restrictions for tool execution.
+ *
+ * Limits what tools can access:
+ * - File tools: only write to allowed paths
+ * - Web tools: only access allowed domains
+ * - Bash: restricted commands (no curl/wget by default)
+ *
+ * Reduces permission prompts while maintaining security.
+ */
+import { isAbsolute, relative, resolve, sep } from "node:path";
+import { readOhConfig } from "./config.js";
+const DEFAULT_SANDBOX = {
+    enabled: false,
+    allowedPaths: ["."], // current directory
+    allowedDomains: [], // empty = all allowed
+    blockNetwork: false,
+    blockedCommands: ["curl", "wget"],
+};
+// ── Sandbox Manager ──
+let _config = null;
+/** Get the current sandbox config */
+export function getSandboxConfig() {
+    if (_config)
+        return _config;
+    const ohConfig = readOhConfig();
+    if (ohConfig?.sandbox) {
+        _config = {
+            ...DEFAULT_SANDBOX,
+            ...ohConfig.sandbox,
+        };
+    }
+    else {
+        _config = DEFAULT_SANDBOX;
+    }
+    return _config;
+}
+/** Reset cached config */
+export function invalidateSandboxCache() {
+    _config = null;
+}
+/** Check if a file path is allowed for writing */
+export function isPathAllowed(filePath) {
+    const config = getSandboxConfig();
+    if (!config.enabled)
+        return true;
+    const resolved = resolve(filePath);
+    const cwd = process.cwd();
+    for (const allowed of config.allowedPaths) {
+        const allowedResolved = resolve(cwd, allowed);
+        // Check if the file is within the allowed directory
+        const rel = relative(allowedResolved, resolved);
+        if (!rel.startsWith("..") && !rel.startsWith("/"))
+            return true;
+    }
+    return false;
+}
+/** Check if a domain is allowed for network access */
+export function isDomainAllowed(url) {
+    const config = getSandboxConfig();
+    if (!config.enabled)
+        return true;
+    if (config.blockNetwork)
+        return false;
+    if (config.allowedDomains.length === 0)
+        return true;
+    try {
+        const hostname = new URL(url).hostname.toLowerCase();
+        return config.allowedDomains.some((d) => hostname === d.toLowerCase() || hostname.endsWith(`.${d.toLowerCase()}`));
+    }
+    catch {
+        return false;
+    }
+}
+/** Check if a bash command is allowed */
+export function isCommandAllowed(command) {
+    const config = getSandboxConfig();
+    if (!config.enabled)
+        return true;
+    const firstWord = command.trim().split(/\s+/)[0]?.toLowerCase() ?? "";
+    return !config.blockedCommands.includes(firstWord);
+}
+/** Get a human-readable sandbox status */
+export function sandboxStatus() {
+    const config = getSandboxConfig();
+    if (!config.enabled)
+        return "Sandbox: disabled";
+    const lines = ["Sandbox: enabled"];
+    lines.push(`  Allowed paths: ${config.allowedPaths.join(", ") || "none"}`);
+    if (config.blockNetwork) {
+        lines.push("  Network: blocked");
+    }
+    else if (config.allowedDomains.length > 0) {
+        lines.push(`  Allowed domains: ${config.allowedDomains.join(", ")}`);
+    }
+    else {
+        lines.push("  Network: unrestricted");
+    }
+    if (config.blockedCommands.length > 0) {
+        lines.push(`  Blocked commands: ${config.blockedCommands.join(", ")}`);
+    }
+    return lines.join("\n");
+}
+//# sourceMappingURL=sandbox.js.map

package/dist/main.js CHANGED Viewed

@@ -15,6 +15,7 @@ import { homedir } from "node:os";
 import { join } from "node:path";
 import { Command, Option } from "commander";
 import { render } from "ink";
+import { registerEvalsCommand } from "./evals/cli.js";
 import { parseSettingSources, readOhConfig } from "./harness/config.js";
 import { emitHook, setHookDecisionObserver } from "./harness/hooks.js";
 import { languageToPrompt } from "./harness/language.js";
@@ -1318,6 +1319,8 @@ program
     console.log(result.message);
     console.log();
 });
+// ── evals (oh evals run/list-packs/show) ──
+registerEvalsCommand(program);
 // ── sessions ──
 program
     .command("sessions")

package/dist/tools/GrepTool/index.d.ts CHANGED Viewed

@@ -20,6 +20,7 @@ declare const inputSchema: z.ZodObject<{
     path?: string | undefined;
     type?: string | undefined;
     "-i"?: boolean | undefined;
+    "-C"?: number | undefined;
     context?: number | undefined;
     glob?: string | undefined;
     offset?: number | undefined;
@@ -28,13 +29,13 @@ declare const inputSchema: z.ZodObject<{
     multiline?: boolean | undefined;
     "-A"?: number | undefined;
     "-B"?: number | undefined;
-    "-C"?: number | undefined;
     "-n"?: boolean | undefined;
 }, {
     pattern: string;
     path?: string | undefined;
     type?: string | undefined;
     "-i"?: boolean | undefined;
+    "-C"?: number | undefined;
     context?: number | undefined;
     glob?: string | undefined;
     offset?: number | undefined;
@@ -43,7 +44,6 @@ declare const inputSchema: z.ZodObject<{
     multiline?: boolean | undefined;
     "-A"?: number | undefined;
     "-B"?: number | undefined;
-    "-C"?: number | undefined;
     "-n"?: boolean | undefined;
 }>;
 export declare const GrepTool: Tool<typeof inputSchema>;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@zhijiewang/openharness",
-  "version": "2.39.0",
+  "version": "2.40.0",
   "description": "Open-source terminal coding agent. Works with any LLM.",
   "type": "module",
   "bin": {