npm - @zhijiewang/openharness - Versions diffs - 2.40.1 → 2.40.3 - Mend

@zhijiewang/openharness 2.40.1 → 2.40.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/dist/Tool.d.ts +2 -0
package/dist/Tool.js +9 -1
package/dist/commands/info.js +5 -2
package/dist/commands/skills.js +12 -6
package/dist/evals/orchestrator.js +124 -21
package/dist/evals/run-writer.d.ts +3 -0
package/dist/evals/run-writer.js +5 -0
package/dist/evals/scorer.js +72 -26
package/dist/harness/config.d.ts +9 -0
package/dist/harness/cost.js +12 -1
package/dist/harness/plugins.d.ts +3 -1
package/dist/harness/plugins.js +31 -4
package/dist/main.js +89 -4
package/dist/mcp/loader.d.ts +2 -0
package/dist/mcp/loader.js +7 -0
package/dist/providers/anthropic.js +16 -4
package/dist/query/index.js +1 -0
package/dist/query/types.d.ts +2 -0
package/dist/repl.js +1 -0
package/dist/tools/BashTool/index.js +2 -2
package/dist/utils/safe-env.js +2 -0
package/package.json +1 -1

package/dist/Tool.d.ts CHANGED Viewed

@@ -33,6 +33,8 @@ export type ToolContext = {
     tracer?: import("./harness/traces.js").SessionTracer;
     /** Optional parent span ID for the current tool execution (set by query loop). */
     parentSpanId?: string;
+    /** Session ID for the current query — injected into Bash subprocess env. */
+    sessionId?: string;
 };
 export type Tool<Input extends z.ZodType = z.ZodType> = {
     readonly name: string;

package/dist/Tool.js CHANGED Viewed

@@ -51,7 +51,15 @@ function zodToJsonSchema(schema) {
         return { type: "boolean" };
     if (def?.typeName === "ZodArray")
         return { type: "array", items: zodToJsonSchema(def.type) };
-    return { type: "string" }; // fallback
+    // ZodRecord (used by DeferredTool's permissive schema) → permissive object.
+    // Anthropic's tool-use API requires `type: "object"` for tool input_schema.
+    if (def?.typeName === "ZodRecord")
+        return { type: "object", additionalProperties: {} };
+    if (def?.typeName === "ZodUnknown" || def?.typeName === "ZodAny")
+        return {};
+    // Fallback: return permissive object so tool-use APIs that require object
+    // input schemas (Anthropic) don't reject the request.
+    return { type: "object", additionalProperties: {} };
 }
 /**
  * Find a tool by name from a list of tools.

package/dist/commands/info.js CHANGED Viewed

@@ -13,7 +13,7 @@ import { discoverPlugins, discoverSkills } from "../harness/plugins.js";
 import { formatFlameGraph, formatTrace, listTracedSessions, loadTrace } from "../harness/traces.js";
 import { getVerificationConfig, invalidateVerificationCache } from "../harness/verification.js";
 import { normalizeMcpConfig } from "../mcp/config-normalize.js";
-import { connectedMcpServers, disconnectMcpClients, loadMcpTools } from "../mcp/loader.js";
+import { connectedMcpServers, disconnectMcpClients, loadMcpTools, mcpServerToolCount } from "../mcp/loader.js";
 import { getAuthStatus } from "../mcp/oauth.js";
 import { formatRegistry, generateConfigBlock, MCP_REGISTRY, searchRegistry } from "../mcp/registry.js";
 import { getRouteSelection } from "../providers/router.js";
@@ -458,6 +458,8 @@ export function registerInfoCommands(register, getCommandMap) {
                 continue;
             }
             const kind = normalized.cfg.type;
+            const toolCount = mcpServerToolCount(name);
+            const toolsLabel = toolCount !== undefined ? `${toolCount} tool${toolCount === 1 ? "" : "s"}` : "";
             const status = await getAuthStatus(normalized.cfg, storageDir);
             let statusText;
             switch (status) {
@@ -474,7 +476,8 @@ export function registerInfoCommands(register, getCommandMap) {
                     statusText = "expired (re-authenticate with /mcp-login)";
                     break;
             }
-            lines.push(`  ${name.padEnd(20)}  ${kind.padEnd(6)}  ${statusText}`);
+            const toolsPart = toolsLabel ? `  ${toolsLabel.padEnd(9)}` : "             ";
+            lines.push(`  ${name.padEnd(20)}  ${kind.padEnd(6)}${toolsPart}  ${statusText}`);
         }
         lines.push("");
         lines.push("Run /mcp-registry to browse and add more servers.");

package/dist/commands/skills.js CHANGED Viewed

@@ -3,33 +3,39 @@
  */
 import { existsSync, mkdirSync, unlinkSync, writeFileSync } from "node:fs";
 import { join } from "node:path";
+import { readOhConfig } from "../harness/config.js";
 import { discoverSkills, findSkill } from "../harness/plugins.js";
 export function registerSkillCommands(register) {
     register("skills", "List all available skills", () => {
         const skills = discoverSkills();
-        if (skills.length === 0) {
+        const overrides = readOhConfig()?.skillOverrides ?? {};
+        // "off" skills are fully hidden from the user
+        const visible = skills.filter((s) => overrides[s.name] !== "off");
+        if (visible.length === 0) {
             return {
                 output: "No skills found. Create .oh/skills/*.md to add one, or run /skill-search to browse the registry.",
                 handled: true,
             };
         }
-        // Group by source for readability
         const lines = ["Available skills:"];
         const sourceLabel = {
             project: "[project]",
             global: "[global]",
             plugin: "[plugin]",
         };
-        // Sort: bundled-style (project, no path under .oh) first, then by source then name
-        const sorted = [...skills].sort((a, b) => {
+        const sorted = [...visible].sort((a, b) => {
             if (a.source !== b.source)
                 return a.source.localeCompare(b.source);
             return a.name.localeCompare(b.name);
         });
         for (const s of sorted) {
             const tag = sourceLabel[s.source] ?? `[${s.source}]`;
-            const desc = s.description ? `: ${s.description}` : "";
-            lines.push(`  - ${s.name} ${tag}${desc}`);
+            const ov = overrides[s.name];
+            // "user-invocable-only": show name but mark as not available to model
+            // "name-only": suppress description (mirrors model-side behaviour)
+            const descText = ov === "name-only" || !s.description ? "" : `: ${s.description}`;
+            const hint = ov === "user-invocable-only" ? " [user-only]" : "";
+            lines.push(`  - ${s.name} ${tag}${descText}${hint}`);
         }
         return { output: lines.join("\n"), handled: true };
     });

package/dist/evals/orchestrator.js CHANGED Viewed

@@ -16,7 +16,7 @@
  *     --model <model> "<problem_statement>"
  */
 import { execFileSync, spawn, spawnSync } from "node:child_process";
-import { createWriteStream, existsSync, mkdirSync, rmSync as nodeRmSync, readFileSync } from "node:fs";
+import { copyFileSync, createWriteStream, existsSync, mkdirSync, rmSync as nodeRmSync, readFileSync, unlinkSync, writeFileSync, } from "node:fs";
 import { join } from "node:path";
 import { isGitRepo, removeWorktree } from "../git/index.js";
 import { RunWriter } from "./run-writer.js";
@@ -48,6 +48,7 @@ export class RunOrchestrator {
             for (const r of prior) {
                 this.skipIds.add(r.instance_id);
                 this.totalCost += r.cost_usd;
+                this.writer.preloadResult(r);
             }
         }
     }
@@ -134,7 +135,7 @@ export class RunOrchestrator {
                         maxTurns: this.opts.maxTaskTurns,
                         model: this.opts.model,
                         fallbackModel: this.opts.fallbackModel,
-                        prompt: task.problem_statement,
+                        prompt: buildEvalPrompt(task.problem_statement),
                     }),
                 };
             const transcriptPath = join(this.opts.runDir, "transcripts", `${task.instance_id}.jsonl`);
@@ -251,7 +252,8 @@ export class RunOrchestrator {
         }
         finally {
             // Clean up worktree (best-effort; swallow errors so a leak doesn't stop a run).
-            if (worktreePath && existsSync(worktreePath)) {
+            // Set OH_EVALS_KEEP_WORKTREES=1 to skip cleanup for post-run debugging.
+            if (worktreePath && existsSync(worktreePath) && !process.env.OH_EVALS_KEEP_WORKTREES) {
                 try {
                     if (usedGitWorktree)
                         removeWorktree(worktreePath);
@@ -317,12 +319,20 @@ function parseStreamJsonResult(stdout) {
     return { cost_usd: 0, turns_used: 0, exit_reason: "ok", final_message: "" };
 }
 function captureGitDiff(worktreeDir) {
-    try {
-        return execFileSync("git", ["-C", worktreeDir, "diff", "HEAD"], { encoding: "utf-8" });
-    }
-    catch {
-        return "";
+    // setup.sh initialises the git repo at worktreeDir/repo/.git, so diff from
+    // that subdirectory. Fall back to worktreeDir for legacy fixtures that put
+    // .git at the worktree root.
+    for (const dir of [join(worktreeDir, "repo"), worktreeDir]) {
+        try {
+            const out = execFileSync("git", ["-C", dir, "diff", "HEAD"], { encoding: "utf-8" });
+            if (out)
+                return out;
+        }
+        catch {
+            /* try next */
+        }
     }
+    return "";
 }
 async function extractFixture(packDir, instanceId, dest) {
     const fxDir = join(packDir, "fixtures", instanceId);
@@ -340,14 +350,28 @@ async function extractFixture(packDir, instanceId, dest) {
             // handles initialization; we just ensure the dest dir exists.
             return;
         }
-        if (c.flag === "-xzf") {
-            execFileSync("tar", ["-xzf", c.path, "-C", dest], { stdio: ["ignore", "pipe", "pipe"] });
+        // Use cwd + relative archive name to avoid GNU tar treating Windows drive
+        // letters (e.g. "E:") as remote hostnames when passed as absolute paths.
+        const archiveName = c.flag === "-xzf" ? "_repo.tar.gz" : "_repo.tar.zst";
+        copyFileSync(c.path, join(dest, archiveName));
+        try {
+            if (c.flag === "-xzf") {
+                execFileSync("tar", ["-xzf", archiveName], { cwd: dest, stdio: ["ignore", "pipe", "pipe"] });
+            }
+            else {
+                execFileSync("tar", ["--use-compress-program=zstd -d", "-xf", archiveName], {
+                    cwd: dest,
+                    stdio: ["ignore", "pipe", "pipe"],
+                });
+            }
         }
-        else {
-            // Legacy .tar.zst path: requires the system `zstd` binary on PATH.
-            execFileSync("tar", ["--use-compress-program=zstd -d", "-xf", c.path, "-C", dest], {
-                stdio: ["ignore", "pipe", "pipe"],
-            });
+        finally {
+            try {
+                unlinkSync(join(dest, archiveName));
+            }
+            catch {
+                /* best-effort */
+            }
         }
         return;
     }
@@ -356,19 +380,98 @@ async function runSetupScript(packDir, instanceId, worktreeDir) {
     const setupPath = join(packDir, "fixtures", instanceId, "setup.sh");
     if (!existsSync(setupPath))
         return { ok: true }; // No setup needed.
-    const r = spawnSync(setupPath, [], {
-        cwd: worktreeDir,
-        shell: true, // works for both .sh on POSIX and bash-as-shell on Windows
-        encoding: "utf-8",
-    });
+    // Invoke sh/bash explicitly so the script runs without the execute bit.
+    // On Windows, use bash (Git Bash) and define python3 as a shell function
+    // that delegates to `python` — Python 3 on Windows ships as python.exe only.
+    let r;
+    if (process.platform === "win32") {
+        // Python 3 on Windows installs as python.exe only, and the WindowsApps stub
+        // for both `python` and `python3` appears first on Git Bash's PATH. We find
+        // the real interpreter via where.exe and use its absolute POSIX path directly.
+        const realPython = windowsRealPythonPosix();
+        // On Windows, `python3 -m venv` creates .venv/Scripts/activate, not .venv/bin/activate.
+        // Patch setup.sh to use the Windows path so sourcing works in Git Bash.
+        const original = readFileSync(setupPath, "utf-8");
+        const patched = original
+            .replace(/\bsource\s+\.venv\/bin\/activate\b/g, "source .venv/Scripts/activate")
+            .replace(/\. \.venv\/bin\/activate\b/g, ". .venv/Scripts/activate");
+        const tmpSetup = `${setupPath}.win.sh`;
+        try {
+            unlinkSync(tmpSetup);
+        }
+        catch {
+            /* ok */
+        }
+        writeFileSync(tmpSetup, patched, "utf-8");
+        const posixTmp = tmpSetup.replace(/\\/g, "/").replace(/^([A-Za-z]):/, (_, d) => `/${d.toLowerCase()}`);
+        // Only define python3 — pip must NOT be overridden because after venv activation
+        // the venv's pip.exe is on PATH and must be used (not system Python's pip).
+        const pyFn = realPython ? `python3() { "${realPython}" "$@"; }` : "";
+        r = spawnSync("bash", ["-c", `${pyFn}${pyFn ? "; " : ""}. "${posixTmp}"`], {
+            cwd: worktreeDir,
+            encoding: "utf-8",
+        });
+        try {
+            unlinkSync(tmpSetup);
+        }
+        catch {
+            /* best-effort */
+        }
+    }
+    else {
+        r = spawnSync("/bin/sh", [setupPath], { cwd: worktreeDir, encoding: "utf-8" });
+    }
     if (r.status !== 0) {
-        return { ok: false, error: (r.stderr ?? "").slice(-500) };
+        return { ok: false, error: String(r.stderr ?? "").slice(-500) };
     }
     return { ok: true };
 }
+/** Returns the POSIX path to the real Python interpreter on Windows,
+ *  skipping the WindowsApps stub which is a dead-end redirect. */
+function windowsRealPythonPosix() {
+    try {
+        const out = spawnSync("where.exe", ["python"], { encoding: "utf-8" }).stdout ?? "";
+        for (const line of out.split(/\r?\n/)) {
+            const p = line.trim();
+            if (p && !p.includes("WindowsApps")) {
+                return p.replace(/\\/g, "/").replace(/^([A-Za-z]):/, (_, d) => `/${d.toLowerCase()}`);
+            }
+        }
+    }
+    catch {
+        /* fall through */
+    }
+    return "";
+}
 function defaultOhEntry() {
     return join(process.cwd(), "dist", "main.js");
 }
+/** Wrap a SWE-bench problem statement with SWE-bench-style instructions:
+ *  the working tree is in ./repo, only that subtree is committed/diffed,
+ *  edit source files in place, don't create documentation/scratch files. */
+function buildEvalPrompt(problemStatement) {
+    return [
+        "You are an autonomous software engineer fixing a bug in an open-source Python project.",
+        "",
+        "WORKING DIRECTORY",
+        "- The repository source is in `./repo/` (relative to your current directory).",
+        "- A `.venv` next to it has the project installed editably; do NOT recreate it.",
+        "- Run all bash commands with `cd repo && …` or use absolute paths under `./repo/`.",
+        "",
+        "WHAT TO DO",
+        "- Read the problem statement below, locate the relevant source files in `./repo/`, and edit them in place to fix the bug.",
+        "- Use the existing test suite to verify (run with `cd repo && python -m pytest <file_or_pattern>`).",
+        "- Only changes inside `./repo/` are scored; the orchestrator runs `git diff HEAD` from `./repo/` to extract your patch.",
+        "",
+        "WHAT NOT TO DO",
+        "- Do NOT create README/SUMMARY/GUIDE/PATCH/SOLUTION/COMPLETION files. Edit the source.",
+        "- Do NOT write standalone scratch scripts at the worktree root — only edit files under `./repo/`.",
+        "- Do NOT modify `.venv/`, generated `_version.py` files, or anything outside `./repo/`.",
+        "",
+        "PROBLEM STATEMENT",
+        problemStatement,
+    ].join("\n");
+}
 function defaultRunArgs(opts) {
     const args = [
         opts.ohEntry,

package/dist/evals/run-writer.d.ts CHANGED Viewed

@@ -26,6 +26,9 @@ export declare class RunWriter {
     private readonly results;
     constructor(runDir: string, header: RunHeader);
     appendResult(result: EvalsResult): void;
+    /** Load a result that was written in a prior run into the in-memory array without
+     *  re-writing it to disk (used by the resume path so finalize() includes all results). */
+    preloadResult(result: EvalsResult): void;
     loadExistingResults(): EvalsResult[];
     finalize(opts: {
         partial: boolean;

package/dist/evals/run-writer.js CHANGED Viewed

@@ -37,6 +37,11 @@ export class RunWriter {
         writeFileSync(tmp, JSON.stringify(preds, null, 2));
         renameSync(tmp, join(this.runDir, "predictions.json"));
     }
+    /** Load a result that was written in a prior run into the in-memory array without
+     *  re-writing it to disk (used by the resume path so finalize() includes all results). */
+    preloadResult(result) {
+        this.results.push(result);
+    }
     loadExistingResults() {
         const path = join(this.runDir, "results.jsonl");
         if (!existsSync(path))

package/dist/evals/scorer.js CHANGED Viewed

@@ -12,6 +12,33 @@
 import { spawnSync } from "node:child_process";
 import { existsSync, readFileSync } from "node:fs";
 import { join } from "node:path";
+/** Convert pytest junit-xml classname/name (+ optional file= attr) into the
+ *  pytest-style id that SWE-bench uses: `path/to/file.py::[Class::]test_name`.
+ *  Returns null if a sensible id can't be built. */
+function pytestStyleId(cn, name, file) {
+    let fileNorm;
+    let classTail;
+    if (file) {
+        fileNorm = file.replace(/\\/g, "/");
+        const moduleFromFile = fileNorm.replace(/\.py$/, "").replace(/\//g, ".");
+        classTail = cn.startsWith(`${moduleFromFile}.`) ? cn.slice(moduleFromFile.length + 1) : "";
+    }
+    else {
+        // No `file=` attribute (older pytest / minimal junit-xml). Derive the
+        // path from classname: trailing PascalCase segments are class names,
+        // the rest is the dotted module path → file is module/path.py.
+        const parts = cn.split(".");
+        const classParts = [];
+        while (parts.length > 0 && /^[A-Z]/.test(parts[parts.length - 1] ?? "")) {
+            classParts.unshift(parts.pop());
+        }
+        if (parts.length === 0)
+            return null;
+        fileNorm = `${parts.join("/")}.py`;
+        classTail = classParts.join("::");
+    }
+    return classTail ? `${fileNorm}::${classTail}::${name}` : `${fileNorm}::${name}`;
+}
 /**
  * Minimal junit-xml parser. Returns a map of "<classname>.<name>" → outcome.
  *
@@ -27,17 +54,19 @@ export function parseJunitXml(xml) {
         const inner = match[2] ?? "";
         const cn = /classname="([^"]*)"/.exec(attrs)?.[1];
         const name = /\bname="([^"]*)"/.exec(attrs)?.[1];
+        const file = /\bfile="([^"]*)"/.exec(attrs)?.[1];
         if (cn && name) {
-            const id = `${cn}.${name}`;
-            if (/<failure\b/.test(inner) || /<error\b/.test(inner)) {
-                out[id] = "fail";
-            }
-            else if (/<skipped\b/.test(inner)) {
-                out[id] = "skip";
-            }
-            else {
-                out[id] = "pass";
-            }
+            let outcome = "pass";
+            if (/<failure\b/.test(inner) || /<error\b/.test(inner))
+                outcome = "fail";
+            else if (/<skipped\b/.test(inner))
+                outcome = "skip";
+            // Emit BOTH a dotted classname.name id (legacy) and pytest-style
+            // file::[Class::]name ids so SWE-bench-format expected IDs match.
+            out[`${cn}.${name}`] = outcome;
+            const ptid = pytestStyleId(cn, name, file);
+            if (ptid)
+                out[ptid] = outcome;
         }
         match = testcaseRe.exec(xml);
     }
@@ -53,22 +82,28 @@ export async function scoreTask(args) {
     const oracleSh = join(fixtureDir, "oracle.sh");
     const oracleMjs = join(fixtureDir, "oracle.mjs");
     if (existsSync(oracleSh)) {
-        const r = spawnSync(oracleSh, [], {
-            cwd: worktreeDir,
-            env: {
-                ...process.env,
-                INSTANCE_ID: task.instance_id,
-                WORKTREE_DIR: worktreeDir,
-                FIXTURE_DIR: fixtureDir,
-            },
-            timeout: testTimeoutMs,
-            shell: process.platform === "win32",
-        });
+        // Invoke /bin/sh explicitly so oracle.sh runs without the execute bit.
+        // Files committed from Windows or via writeFileSync default to mode 100644.
+        const r = process.platform === "win32"
+            ? spawnSync(oracleSh, [], {
+                cwd: worktreeDir,
+                env: { ...process.env, INSTANCE_ID: task.instance_id, WORKTREE_DIR: worktreeDir, FIXTURE_DIR: fixtureDir },
+                timeout: testTimeoutMs,
+                shell: true,
+            })
+            : spawnSync("/bin/sh", [oracleSh], {
+                cwd: worktreeDir,
+                env: { ...process.env, INSTANCE_ID: task.instance_id, WORKTREE_DIR: worktreeDir, FIXTURE_DIR: fixtureDir },
+                timeout: testTimeoutMs,
+            });
+        // Oracle exit code is the pass/fail signal — do NOT set error_message for a clean
+        // non-zero exit (that means "test failed", not "scoring errored"). Only flag when
+        // the process itself failed to run (killed, spawn error, etc.).
         return {
             resolved: r.status === 0,
             tests_status: EMPTY_TESTS_STATUS,
             oracle_used: true,
-            error_message: r.status === 0 ? undefined : (r.stderr?.toString().slice(-500) ?? ""),
+            error_message: r.status === null ? `oracle.sh did not exit cleanly: signal=${r.signal}` : undefined,
         };
     }
     if (existsSync(oracleMjs)) {
@@ -86,16 +121,27 @@ export async function scoreTask(args) {
             resolved: r.status === 0,
             tests_status: EMPTY_TESTS_STATUS,
             oracle_used: true,
-            error_message: r.status === 0 ? undefined : (r.stderr?.toString().slice(-500) ?? ""),
+            error_message: r.status === null ? `oracle.mjs did not exit cleanly: signal=${r.signal}` : undefined,
         };
     }
     // (2) Default test command.
-    const r = spawnSync(packDefaultTestCommand, {
+    // Run via bash so the venv is activated; cd into ./repo first if it exists
+    // (real SWE-bench packs put project source there). For synthetic packs
+    // without a repo/ subdir, run from the worktree root.
+    const hasRepo = existsSync(join(worktreeDir, "repo"));
+    const venvActivate = process.platform === "win32"
+        ? "[ -f .venv/Scripts/activate ] && source .venv/Scripts/activate"
+        : "[ -f .venv/bin/activate ] && source .venv/bin/activate";
+    const cdRepo = hasRepo ? "cd repo && " : "";
+    const r = spawnSync("bash", ["-c", `${venvActivate}; ${cdRepo}${packDefaultTestCommand}`], {
         cwd: worktreeDir,
-        shell: true,
         timeout: testTimeoutMs,
     });
-    const xmlPath = join(worktreeDir, ".oh-evals-results.xml");
+    // Test command writes junit-xml relative to its CWD. Prefer repo/ when it
+    // exists; fall back to worktree root for synthetic/legacy packs.
+    const xmlPathRepo = join(worktreeDir, "repo", ".oh-evals-results.xml");
+    const xmlPathRoot = join(worktreeDir, ".oh-evals-results.xml");
+    const xmlPath = existsSync(xmlPathRepo) ? xmlPathRepo : xmlPathRoot;
     if (!existsSync(xmlPath)) {
         return {
             resolved: false,

package/dist/harness/config.d.ts CHANGED Viewed

@@ -253,6 +253,15 @@ export type OhConfig = {
      * call-site that already uses `safeEnv()` picks this up automatically.
      */
     env?: Record<string, string>;
+    /**
+     * Per-skill visibility overrides. Keys are skill names (e.g. "my-skill" or
+     * "plugin:skill-name"). Values:
+     *   "off"                 — hidden from model AND from the slash picker
+     *   "user-invocable-only" — hidden from model, still shows in /skills + slash picker
+     *   "name-only"           — shown to model but description collapsed to name only
+     * Mirrors Claude Code's `skillOverrides` setting.
+     */
+    skillOverrides?: Record<string, "off" | "user-invocable-only" | "name-only">;
 };
 /** Clear cached config (call after writes or to force re-read) */
 export declare function invalidateConfigCache(): void;

package/dist/harness/cost.js CHANGED Viewed

@@ -60,7 +60,18 @@ export const MODEL_PRICING = {
     "qwen-turbo": [0.2, 0.6],
 };
 export function estimateCost(model, inputTokens, outputTokens) {
-    const pricing = MODEL_PRICING[model];
+    // Exact match first; otherwise prefix-match so dated model IDs like
+    // "claude-haiku-4-5-20251001" resolve to "claude-haiku-4-5".
+    let pricing = MODEL_PRICING[model];
+    if (!pricing) {
+        let bestKey = "";
+        for (const key of Object.keys(MODEL_PRICING)) {
+            if (model.startsWith(key) && key.length > bestKey.length)
+                bestKey = key;
+        }
+        if (bestKey)
+            pricing = MODEL_PRICING[bestKey];
+    }
     if (!pricing)
         return 0;
     return (inputTokens / 1_000_000) * pricing[0] + (outputTokens / 1_000_000) * pricing[1];

package/dist/harness/plugins.d.ts CHANGED Viewed

@@ -54,6 +54,8 @@ export type AgentTeamConfig = {
         tools?: string[];
     }>;
 };
+/** Register an extra plugin directory for this session (not persisted). */
+export declare function addExtraPluginDir(dir: string): void;
 /** Discover all available skills from bundled + project + global dirs + installed plugins */
 export declare function discoverSkills(): SkillMetadata[];
 /** Find a skill by name (case-insensitive) */
@@ -73,5 +75,5 @@ export declare function loadPluginManifest(dir: string): PluginManifest | null;
 /** Discover plugins from node_modules */
 export declare function discoverPlugins(): PluginManifest[];
 /** Build a prompt listing available skills for the LLM */
-export declare function skillsToPrompt(skills: SkillMetadata[]): string;
+export declare function skillsToPrompt(skills: SkillMetadata[], overrides?: Record<string, "off" | "user-invocable-only" | "name-only">): string;
 //# sourceMappingURL=plugins.d.ts.map

package/dist/harness/plugins.js CHANGED Viewed

@@ -15,6 +15,13 @@ import { homedir } from "node:os";
 import { dirname, join, relative } from "node:path";
 import { fileURLToPath } from "node:url";
 import { getInstalledPlugins } from "./marketplace.js";
+/** Session-scoped extra plugin directories registered via --plugin-dir / --plugin-url. */
+const extraPluginDirs = [];
+/** Register an extra plugin directory for this session (not persisted). */
+export function addExtraPluginDir(dir) {
+    if (!extraPluginDirs.includes(dir))
+        extraPluginDirs.push(dir);
+}
 const PROJECT_SKILLS_DIR = join(".oh", "skills");
 const GLOBAL_SKILLS_DIR = join(homedir(), ".oh", "skills");
 // Claude Code ecosystem mirror paths (Anthropic convention)
@@ -192,6 +199,17 @@ export function discoverSkills() {
     catch {
         /* marketplace module may not be loaded yet */
     }
+    // Session-scoped extra plugin dirs (--plugin-dir / --plugin-url)
+    for (const dir of extraPluginDirs) {
+        const pluginSkillsDir = join(dir, "skills");
+        const pluginSkills = loadSkillsFromDir(pluginSkillsDir, "plugin");
+        const manifest = loadPluginManifest(dir);
+        const pluginName = manifest?.name ?? dir.split(/[/\\]/).pop() ?? "extra";
+        for (const skill of pluginSkills) {
+            skill.name = `${pluginName}:${skill.name}`;
+        }
+        skills.push(...pluginSkills);
+    }
     // De-duplicate by name+filePath: if same skill appears in multiple paths (e.g. CC mirror), keep first.
     const seen = new Set();
     return skills.filter((s) => {
@@ -283,12 +301,21 @@ export function discoverPlugins() {
     return plugins;
 }
 /** Build a prompt listing available skills for the LLM */
-export function skillsToPrompt(skills) {
-    // Only include skills with invokeModel !== false (hidden skills excluded from prompt)
-    const visible = skills.filter((s) => s.invokeModel !== false);
+export function skillsToPrompt(skills, overrides) {
+    // invokeModel:false hides from model; "off" and "user-invocable-only" overrides also hide from model.
+    const visible = skills.filter((s) => {
+        if (s.invokeModel === false)
+            return false;
+        const ov = overrides?.[s.name];
+        return ov !== "off" && ov !== "user-invocable-only";
+    });
     if (visible.length === 0)
         return "";
-    const lines = visible.map((s) => `- ${s.name}: ${s.description}${s.trigger ? ` (auto-trigger: "${s.trigger}")` : ""}`);
+    const lines = visible.map((s) => {
+        const desc = overrides?.[s.name] === "name-only" ? "" : `: ${s.description}`;
+        const trigger = overrides?.[s.name] === "name-only" ? "" : s.trigger ? ` (auto-trigger: "${s.trigger}")` : "";
+        return `- ${s.name}${desc}${trigger}`;
+    });
     return `# Available Skills\nUse the Skill tool to invoke these:\n${lines.join("\n")}`;
 }
 //# sourceMappingURL=plugins.js.map

package/dist/main.js CHANGED Viewed

@@ -21,7 +21,7 @@ import { emitHook, setHookDecisionObserver } from "./harness/hooks.js";
 import { languageToPrompt } from "./harness/language.js";
 import { loadActiveMemories, memoriesToPrompt, userProfileToPrompt } from "./harness/memory.js";
 import { detectProject, projectContextToPrompt } from "./harness/onboarding.js";
-import { discoverSkills, skillsToPrompt } from "./harness/plugins.js";
+import { addExtraPluginDir, discoverSkills, skillsToPrompt } from "./harness/plugins.js";
 import { createRulesFile, loadRules, loadRulesAsPrompt } from "./harness/rules.js";
 import { listSessions } from "./harness/session.js";
 import { connectedMcpServers, disconnectMcpClients, getMcpInstructions, loadMcpPrompts, loadMcpTools, parseMcpConfigFile, } from "./mcp/loader.js";
@@ -164,7 +164,7 @@ function buildSystemPrompt(model, opts = {}) {
         parts.push(memoriesPrompt);
     // Available skills (Level 0 — names + descriptions only)
     const skills = discoverSkills();
-    const skillsPrompt = skillsToPrompt(skills);
+    const skillsPrompt = skillsToPrompt(skills, cfg?.skillOverrides);
     if (skillsPrompt)
         parts.push(skillsPrompt);
     // MCP server instructions (sandboxed — treat as untrusted)
@@ -366,7 +366,12 @@ program
     if (outputFormat === "stream-json") {
         console.log(JSON.stringify({ type: "turnStart", turnNumber: 0 }));
     }
-    for await (const event of query(prompt, config, priorMessages)) {
+    // Track cumulative cost + turn count so stream-json mode can emit a final
+    // `result` event (consumed by `oh evals` and SDK callers).
+    let cumulativeCost = 0;
+    let turnsCompleted = 0;
+    let lastTurnReason = "ok";
+    for await (const event of query(prompt, { ...config, sessionId }, priorMessages)) {
         if (event.type === "text_delta") {
             fullOutput += event.content;
             if (outputFormat === "text")
@@ -408,6 +413,7 @@ program
             }
         }
         else if (event.type === "cost_update") {
+            cumulativeCost += event.cost;
             if (outputFormat === "stream-json") {
                 console.log(JSON.stringify({
                     type: "cost_update",
@@ -419,6 +425,8 @@ program
             }
         }
         else if (event.type === "turn_complete") {
+            turnsCompleted += 1;
+            lastTurnReason = event.reason;
             if (outputFormat === "stream-json") {
                 console.log(JSON.stringify({ type: "turn_complete", reason: event.reason }));
             }
@@ -431,6 +439,15 @@ program
             }
         }
     }
+    if (outputFormat === "stream-json") {
+        console.log(JSON.stringify({
+            type: "result",
+            subtype: lastTurnReason,
+            total_cost_usd: cumulativeCost,
+            num_turns: turnsCompleted,
+            result: fullOutput,
+        }));
+    }
     if (outputFormat === "json") {
         console.log(JSON.stringify({ output: fullOutput, tools: toolResults }, null, 2));
     }
@@ -632,7 +649,7 @@ program
             permissionMode,
         });
         console.log(JSON.stringify({ id, type: "turnStart", turnNumber: turnIdx }));
-        for await (const event of query(prompt, config, conversation)) {
+        for await (const event of query(prompt, { ...config, sessionId }, conversation)) {
             if (event.type === "text_delta") {
                 assistantText += event.content;
                 console.log(JSON.stringify({ id, type: "text", content: event.content }));
@@ -1535,6 +1552,74 @@ program
     }, intervalMs);
     process.stderr.write(`[schedule] Running every ${opts.interval} minutes. Ctrl+C to stop.\n`);
 });
+// ── --plugin-dir / --plugin-url (session-scoped extra plugins) ──
+// Added as global options so they work with any subcommand (run, session, REPL).
+program
+    .option("--plugin-dir <path>", "Load a plugin from a local directory for this session (not persisted)")
+    .option("--plugin-url <url>", "Download a plugin .zip or .tar.gz from a URL and load it for this session");
+program.hook("preAction", async () => {
+    // Runs before any subcommand action and registers the session-scoped
+    // plugin directories requested via --plugin-dir / --plugin-url.
+    const opts = program.opts();
+    if (opts.pluginDir) {
+        addExtraPluginDir(opts.pluginDir);
+    }
+    if (opts.pluginUrl) {
+        const { get: httpsGet } = await import("node:https");
+        const { createWriteStream, mkdirSync: fsMkdir, mkdtempSync, readdirSync: fsReaddir, statSync: fsStat } = await import("node:fs");
+        const { tmpdir } = await import("node:os");
+        const { execFileSync: execFile } = await import("node:child_process");
+        const url = opts.pluginUrl;
+        const tmp = mkdtempSync(join(tmpdir(), "oh-plugin-"));
+        // Decide the archive type from the URL *path* only, so a query string or
+        // fragment ("plugin.zip?token=abc") or an upper-case extension does not
+        // hide the ".zip" suffix. Fall back to the raw string if it does not parse.
+        let archiveLocation = url;
+        try {
+            archiveLocation = new URL(url).pathname;
+        }
+        catch {
+            archiveLocation = url;
+        }
+        const isZip = archiveLocation.toLowerCase().endsWith(".zip");
+        const archiveName = isZip ? "plugin.zip" : "plugin.tar.gz";
+        const archivePath = join(tmp, archiveName);
+        // Download the archive to archivePath, following up to 5 redirects.
+        await new Promise((resolve, reject) => {
+            const redirectCodes = new Set([301, 302, 303, 307, 308]);
+            function follow(u, depth = 0) {
+                if (depth > 5) {
+                    reject(new Error("too many redirects"));
+                    return;
+                }
+                let req;
+                try {
+                    req = httpsGet(u, (res) => {
+                        const status = res.statusCode;
+                        if (status !== undefined && redirectCodes.has(status)) {
+                            // Drain the redirect response so its socket is released.
+                            res.resume();
+                            const location = res.headers.location;
+                            if (!location) {
+                                reject(new Error(`HTTP ${status} redirect without Location header from ${u}`));
+                                return;
+                            }
+                            let next;
+                            try {
+                                // Location may be relative; resolve it against the current URL.
+                                next = new URL(location, u).toString();
+                            }
+                            catch (err) {
+                                reject(err);
+                                return;
+                            }
+                            follow(next, depth + 1);
+                        }
+                        else if (status !== 200) {
+                            res.resume();
+                            reject(new Error(`HTTP ${res.statusCode} fetching plugin from ${u}`));
+                        }
+                        else {
+                            const out = createWriteStream(archivePath);
+                            // A connection dropped mid-download surfaces on the response stream.
+                            res.on("error", reject);
+                            res.pipe(out);
+                            out.on("finish", resolve);
+                            out.on("error", reject);
+                        }
+                    });
+                }
+                catch (err) {
+                    // httpsGet can throw synchronously (e.g. a redirect to a non-https URL).
+                    // Inside a response callback that would otherwise be an uncaught exception.
+                    reject(err);
+                    return;
+                }
+                req.on("error", reject);
+            }
+            follow(url);
+        });
+        const extractDir = join(tmp, "plugin");
+        fsMkdir(extractDir, { recursive: true });
+        // NOTE(review): the archive comes from a user-supplied URL and is unpacked
+        // without validating entry paths - confirm that relying on unzip/tar's own
+        // path-traversal protections is acceptable here.
+        if (isZip) {
+            execFile("unzip", ["-q", archivePath, "-d", extractDir]);
+        }
+        else {
+            execFile("tar", ["-xzf", archivePath], { cwd: extractDir });
+        }
+        // If the archive produced a single top-level dir, step into it (common convention).
+        const entries = fsReaddir(extractDir);
+        const singleDir = entries.length === 1 &&
+            (() => {
+                try {
+                    return fsStat(join(extractDir, entries[0])).isDirectory();
+                }
+                catch {
+                    return false;
+                }
+            })();
+        const pluginRoot = singleDir ? join(extractDir, entries[0]) : extractDir;
+        addExtraPluginDir(pluginRoot);
+    }
+});
 program.parseAsync(process.argv).catch((err) => {
     console.error(err instanceof Error ? err.message : String(err));
     process.exitCode = 1;

package/dist/mcp/loader.d.ts CHANGED Viewed

@@ -32,6 +32,8 @@ export declare function loadMcpTools(opts?: LoadMcpOptions): Promise<Tool[]>;
 export declare function disconnectMcpClients(): void;
 /** Names of connected MCP servers */
 export declare function connectedMcpServers(): string[];
+/** Tool count for a connected MCP server, or undefined if not connected. */
+export declare function mcpServerToolCount(name: string): number | undefined;
 export type McpPromptHandle = {
     /** `<server>:<prompt>` qualified name — the slash command is `/<server>:<prompt>`. */
     qualifiedName: string;

package/dist/mcp/loader.js CHANGED Viewed

@@ -48,6 +48,7 @@ export function parseMcpConfigFile(path) {
     return servers;
 }
 const connectedClients = [];
+const serverToolCount = new Map();
 let exitHandlerInstalled = false;
 function installExitHandler() {
     if (exitHandlerInstalled)
@@ -104,6 +105,7 @@ export async function loadMcpTools(opts = {}) {
         }
         const { client, defs, server } = result.value;
         connectedClients.push(client);
+        serverToolCount.set(server.name, defs.length);
         debug("mcp", "connected", { server: server.name, tools: defs.length, deferred: defs.length > DEFERRED_THRESHOLD });
         if (defs.length > DEFERRED_THRESHOLD) {
             for (const def of defs) {
@@ -129,11 +131,16 @@ export function disconnectMcpClients() {
         }
     }
     connectedClients.length = 0;
+    serverToolCount.clear();
 }
 /** Lists the `name` of every client currently held in `connectedClients`. */
 export function connectedMcpServers() {
     const serverNames = [];
     for (const client of connectedClients) {
         serverNames.push(client.name);
     }
     return serverNames;
 }
+/** Tool count recorded for the MCP server `name` (the number of tool definitions stored by loadMcpTools when it connected), or undefined if no count is recorded - i.e. the server never connected or disconnectMcpClients() cleared the map. */
+export function mcpServerToolCount(name) {
+    return serverToolCount.get(name); // Map#get yields undefined for unknown names
+}
 /**
  * Enumerate prompts on every already-connected MCP server. Servers that don't
  * implement the `prompts/list` capability return an empty list (handled

package/dist/providers/anthropic.js CHANGED Viewed

@@ -87,10 +87,11 @@ export class AnthropicProvider {
         // Prompt caching: send system prompt as content blocks with cache_control.
         // Anthropic caches matching prefixes — 90% cost reduction on repeat turns.
         const systemBlocks = [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }];
-        // Scale max_tokens and thinking budget based on model
+        // Scale max_tokens and thinking budget based on model.
+        // Anthropic requires max_tokens > thinking.budget_tokens.
         const isOpus = m.includes("opus");
-        const maxTokens = isOpus ? 16384 : 8192;
-        const thinkingBudget = isOpus ? 32000 : 10000;
+        const maxTokens = isOpus ? 32768 : 16384;
+        const thinkingBudget = isOpus ? 24576 : 8192;
         const body = {
             model: m,
             max_tokens: maxTokens,
@@ -293,7 +294,18 @@ export class AnthropicProvider {
         return createAssistantMessage(content, toolCalls.length ? toolCalls : undefined);
     }
     getModelInfo(id) {
-        return this.listModels().find((m) => m.id === id);
+        // Look the model up by exact ID; failing that, fall back to the known
+        // model whose ID is the longest prefix of `id`, so dated IDs such as
+        // "claude-haiku-4-5-20251001" resolve to "claude-haiku-4-5".
+        // Returns undefined when nothing matches.
+        const catalog = this.listModels();
+        const direct = catalog.find((entry) => entry.id === id);
+        if (direct)
+            return direct;
+        return catalog
+            .filter((entry) => id.startsWith(entry.id))
+            .reduce((winner, entry) => (!winner || entry.id.length > winner.id.length ? entry : winner), undefined);
     }
     listModels() {
         return [

package/dist/query/index.js CHANGED Viewed

@@ -58,6 +58,7 @@ export async function* query(userMessage, config, existingMessages = []) {
         gitCommitPerTool: config.gitCommitPerTool,
         tracer: config.tracer,
         parentSpanId: querySpanId,
+        sessionId: config.sessionId,
     };
     const estimateTokens = makeTokenEstimator(config.provider);
     const contextManager = new ContextManager(undefined, config.model);

package/dist/query/types.d.ts CHANGED Viewed

@@ -35,6 +35,8 @@ export type QueryConfig = {
     permissionPromptTool?: string;
     /** Optional session tracer. When set, query() emits `query` and `tool:<Name>` spans. */
     tracer?: SessionTracer;
+    /** Session ID injected into Bash subprocess env as OH_SESSION_ID. */
+    sessionId?: string;
 };
 export type TransitionReason = "next_turn" | "retry_network" | "retry_prompt_too_long" | "retry_max_output_tokens";
 export type QueryLoopState = {

package/dist/repl.js CHANGED Viewed

@@ -921,6 +921,7 @@ export async function startREPL(config) {
             model: currentModel || undefined,
             abortSignal: abortController.signal,
             tracer,
+            sessionId: session.id,
         };
         try {
             for await (const event of query(prompt, queryConfig, messages)) {

package/dist/tools/BashTool/index.js CHANGED Viewed

@@ -52,7 +52,7 @@ export const BashTool = {
             const bgId = Date.now().toString(36) + Math.random().toString(36).slice(2, 6);
             const proc = spawn(shell, shellArgs, {
                 cwd: context.workingDir,
-                env: safeEnv(),
+                env: safeEnv(context.sessionId ? { OH_SESSION_ID: context.sessionId } : undefined),
                 stdio: ["ignore", "pipe", "pipe"],
                 detached: false,
                 ...extraSpawnOpts,
@@ -98,7 +98,7 @@ export const BashTool = {
             let killed = false;
             const proc = spawn(shell, shellArgs, {
                 cwd: context.workingDir,
-                env: safeEnv(),
+                env: safeEnv(context.sessionId ? { OH_SESSION_ID: context.sessionId } : undefined),
                 stdio: ["ignore", "pipe", "pipe"],
                 ...extraSpawnOpts,
             });

package/dist/utils/safe-env.js CHANGED Viewed

@@ -19,6 +19,8 @@ const BLOCKED_PATTERNS = [
     /^DOCKER_.*TOKEN$/i,
     /^SSH_.*KEY$/i,
     /^OH_CREDENTIAL/i,
+    // Prevent subprocesses from inheriting the CLI's own OTLP endpoint.
+    /^OTEL_/i,
 ];
 /**
  * Filter process.env to remove credential-containing variables.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@zhijiewang/openharness",
-  "version": "2.40.1",
+  "version": "2.40.3",
   "description": "Open-source terminal coding agent. Works with any LLM.",
   "type": "module",
   "bin": {