@zhijiewang/openharness 2.40.1 → 2.40.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/Tool.d.ts CHANGED
@@ -33,6 +33,8 @@ export type ToolContext = {
33
33
  tracer?: import("./harness/traces.js").SessionTracer;
34
34
  /** Optional parent span ID for the current tool execution (set by query loop). */
35
35
  parentSpanId?: string;
36
+ /** Session ID for the current query — injected into Bash subprocess env. */
37
+ sessionId?: string;
36
38
  };
37
39
  export type Tool<Input extends z.ZodType = z.ZodType> = {
38
40
  readonly name: string;
package/dist/Tool.js CHANGED
@@ -51,7 +51,15 @@ function zodToJsonSchema(schema) {
51
51
  return { type: "boolean" };
52
52
  if (def?.typeName === "ZodArray")
53
53
  return { type: "array", items: zodToJsonSchema(def.type) };
54
- return { type: "string" }; // fallback
54
+ // ZodRecord (used by DeferredTool's permissive schema) → permissive object.
55
+ // Anthropic's tool-use API requires `type: "object"` for tool input_schema.
56
+ if (def?.typeName === "ZodRecord")
57
+ return { type: "object", additionalProperties: {} };
58
+ if (def?.typeName === "ZodUnknown" || def?.typeName === "ZodAny")
59
+ return {};
60
+ // Fallback: return permissive object so tool-use APIs that require object
61
+ // input schemas (Anthropic) don't reject the request.
62
+ return { type: "object", additionalProperties: {} };
55
63
  }
56
64
  /**
57
65
  * Find a tool by name from a list of tools.
@@ -13,7 +13,7 @@ import { discoverPlugins, discoverSkills } from "../harness/plugins.js";
13
13
  import { formatFlameGraph, formatTrace, listTracedSessions, loadTrace } from "../harness/traces.js";
14
14
  import { getVerificationConfig, invalidateVerificationCache } from "../harness/verification.js";
15
15
  import { normalizeMcpConfig } from "../mcp/config-normalize.js";
16
- import { connectedMcpServers, disconnectMcpClients, loadMcpTools } from "../mcp/loader.js";
16
+ import { connectedMcpServers, disconnectMcpClients, loadMcpTools, mcpServerToolCount } from "../mcp/loader.js";
17
17
  import { getAuthStatus } from "../mcp/oauth.js";
18
18
  import { formatRegistry, generateConfigBlock, MCP_REGISTRY, searchRegistry } from "../mcp/registry.js";
19
19
  import { getRouteSelection } from "../providers/router.js";
@@ -458,6 +458,8 @@ export function registerInfoCommands(register, getCommandMap) {
458
458
  continue;
459
459
  }
460
460
  const kind = normalized.cfg.type;
461
+ const toolCount = mcpServerToolCount(name);
462
+ const toolsLabel = toolCount !== undefined ? `${toolCount} tool${toolCount === 1 ? "" : "s"}` : "";
461
463
  const status = await getAuthStatus(normalized.cfg, storageDir);
462
464
  let statusText;
463
465
  switch (status) {
@@ -474,7 +476,8 @@ export function registerInfoCommands(register, getCommandMap) {
474
476
  statusText = "expired (re-authenticate with /mcp-login)";
475
477
  break;
476
478
  }
477
- lines.push(` ${name.padEnd(20)} ${kind.padEnd(6)} ${statusText}`);
479
+ const toolsPart = toolsLabel ? ` ${toolsLabel.padEnd(9)}` : " ";
480
+ lines.push(` ${name.padEnd(20)} ${kind.padEnd(6)}${toolsPart} ${statusText}`);
478
481
  }
479
482
  lines.push("");
480
483
  lines.push("Run /mcp-registry to browse and add more servers.");
@@ -3,33 +3,39 @@
3
3
  */
4
4
  import { existsSync, mkdirSync, unlinkSync, writeFileSync } from "node:fs";
5
5
  import { join } from "node:path";
6
+ import { readOhConfig } from "../harness/config.js";
6
7
  import { discoverSkills, findSkill } from "../harness/plugins.js";
7
8
  export function registerSkillCommands(register) {
8
9
  register("skills", "List all available skills", () => {
9
10
  const skills = discoverSkills();
10
- if (skills.length === 0) {
11
+ const overrides = readOhConfig()?.skillOverrides ?? {};
12
+ // "off" skills are fully hidden from the user
13
+ const visible = skills.filter((s) => overrides[s.name] !== "off");
14
+ if (visible.length === 0) {
11
15
  return {
12
16
  output: "No skills found. Create .oh/skills/*.md to add one, or run /skill-search to browse the registry.",
13
17
  handled: true,
14
18
  };
15
19
  }
16
- // Group by source for readability
17
20
  const lines = ["Available skills:"];
18
21
  const sourceLabel = {
19
22
  project: "[project]",
20
23
  global: "[global]",
21
24
  plugin: "[plugin]",
22
25
  };
23
- // Sort: bundled-style (project, no path under .oh) first, then by source then name
24
- const sorted = [...skills].sort((a, b) => {
26
+ const sorted = [...visible].sort((a, b) => {
25
27
  if (a.source !== b.source)
26
28
  return a.source.localeCompare(b.source);
27
29
  return a.name.localeCompare(b.name);
28
30
  });
29
31
  for (const s of sorted) {
30
32
  const tag = sourceLabel[s.source] ?? `[${s.source}]`;
31
- const desc = s.description ? `: ${s.description}` : "";
32
- lines.push(` - ${s.name} ${tag}${desc}`);
33
+ const ov = overrides[s.name];
34
+ // "user-invocable-only": show name but mark as not available to model
35
+ // "name-only": suppress description (mirrors model-side behaviour)
36
+ const descText = ov === "name-only" || !s.description ? "" : `: ${s.description}`;
37
+ const hint = ov === "user-invocable-only" ? " [user-only]" : "";
38
+ lines.push(` - ${s.name} ${tag}${descText}${hint}`);
33
39
  }
34
40
  return { output: lines.join("\n"), handled: true };
35
41
  });
@@ -16,7 +16,7 @@
16
16
  * --model <model> "<problem_statement>"
17
17
  */
18
18
  import { execFileSync, spawn, spawnSync } from "node:child_process";
19
- import { createWriteStream, existsSync, mkdirSync, rmSync as nodeRmSync, readFileSync } from "node:fs";
19
+ import { copyFileSync, createWriteStream, existsSync, mkdirSync, rmSync as nodeRmSync, readFileSync, unlinkSync, writeFileSync, } from "node:fs";
20
20
  import { join } from "node:path";
21
21
  import { isGitRepo, removeWorktree } from "../git/index.js";
22
22
  import { RunWriter } from "./run-writer.js";
@@ -48,6 +48,7 @@ export class RunOrchestrator {
48
48
  for (const r of prior) {
49
49
  this.skipIds.add(r.instance_id);
50
50
  this.totalCost += r.cost_usd;
51
+ this.writer.preloadResult(r);
51
52
  }
52
53
  }
53
54
  }
@@ -134,7 +135,7 @@ export class RunOrchestrator {
134
135
  maxTurns: this.opts.maxTaskTurns,
135
136
  model: this.opts.model,
136
137
  fallbackModel: this.opts.fallbackModel,
137
- prompt: task.problem_statement,
138
+ prompt: buildEvalPrompt(task.problem_statement),
138
139
  }),
139
140
  };
140
141
  const transcriptPath = join(this.opts.runDir, "transcripts", `${task.instance_id}.jsonl`);
@@ -251,7 +252,8 @@ export class RunOrchestrator {
251
252
  }
252
253
  finally {
253
254
  // Clean up worktree (best-effort; swallow errors so a leak doesn't stop a run).
254
- if (worktreePath && existsSync(worktreePath)) {
255
+ // Set OH_EVALS_KEEP_WORKTREES=1 to skip cleanup for post-run debugging.
256
+ if (worktreePath && existsSync(worktreePath) && !process.env.OH_EVALS_KEEP_WORKTREES) {
255
257
  try {
256
258
  if (usedGitWorktree)
257
259
  removeWorktree(worktreePath);
@@ -317,12 +319,20 @@ function parseStreamJsonResult(stdout) {
317
319
  return { cost_usd: 0, turns_used: 0, exit_reason: "ok", final_message: "" };
318
320
  }
319
321
  function captureGitDiff(worktreeDir) {
320
- try {
321
- return execFileSync("git", ["-C", worktreeDir, "diff", "HEAD"], { encoding: "utf-8" });
322
- }
323
- catch {
324
- return "";
322
+ // setup.sh initialises the git repo at worktreeDir/repo/.git, so diff from
323
+ // that subdirectory. Fall back to worktreeDir for legacy fixtures that put
324
+ // .git at the worktree root.
325
+ for (const dir of [join(worktreeDir, "repo"), worktreeDir]) {
326
+ try {
327
+ const out = execFileSync("git", ["-C", dir, "diff", "HEAD"], { encoding: "utf-8" });
328
+ if (out)
329
+ return out;
330
+ }
331
+ catch {
332
+ /* try next */
333
+ }
325
334
  }
335
+ return "";
326
336
  }
327
337
  async function extractFixture(packDir, instanceId, dest) {
328
338
  const fxDir = join(packDir, "fixtures", instanceId);
@@ -340,14 +350,28 @@ async function extractFixture(packDir, instanceId, dest) {
340
350
  // handles initialization; we just ensure the dest dir exists.
341
351
  return;
342
352
  }
343
- if (c.flag === "-xzf") {
344
- execFileSync("tar", ["-xzf", c.path, "-C", dest], { stdio: ["ignore", "pipe", "pipe"] });
353
+ // Use cwd + relative archive name to avoid GNU tar treating Windows drive
354
+ // letters (e.g. "E:") as remote hostnames when passed as absolute paths.
355
+ const archiveName = c.flag === "-xzf" ? "_repo.tar.gz" : "_repo.tar.zst";
356
+ copyFileSync(c.path, join(dest, archiveName));
357
+ try {
358
+ if (c.flag === "-xzf") {
359
+ execFileSync("tar", ["-xzf", archiveName], { cwd: dest, stdio: ["ignore", "pipe", "pipe"] });
360
+ }
361
+ else {
362
+ execFileSync("tar", ["--use-compress-program=zstd -d", "-xf", archiveName], {
363
+ cwd: dest,
364
+ stdio: ["ignore", "pipe", "pipe"],
365
+ });
366
+ }
345
367
  }
346
- else {
347
- // Legacy .tar.zst path: requires the system `zstd` binary on PATH.
348
- execFileSync("tar", ["--use-compress-program=zstd -d", "-xf", c.path, "-C", dest], {
349
- stdio: ["ignore", "pipe", "pipe"],
350
- });
368
+ finally {
369
+ try {
370
+ unlinkSync(join(dest, archiveName));
371
+ }
372
+ catch {
373
+ /* best-effort */
374
+ }
351
375
  }
352
376
  return;
353
377
  }
@@ -356,19 +380,98 @@ async function runSetupScript(packDir, instanceId, worktreeDir) {
356
380
  const setupPath = join(packDir, "fixtures", instanceId, "setup.sh");
357
381
  if (!existsSync(setupPath))
358
382
  return { ok: true }; // No setup needed.
359
- const r = spawnSync(setupPath, [], {
360
- cwd: worktreeDir,
361
- shell: true, // works for both .sh on POSIX and bash-as-shell on Windows
362
- encoding: "utf-8",
363
- });
383
+ // Invoke sh/bash explicitly so the script runs without the execute bit.
384
+ // On Windows, use bash (Git Bash) and define python3 as a shell function
385
+ // that delegates to `python` Python 3 on Windows ships as python.exe only.
386
+ let r;
387
+ if (process.platform === "win32") {
388
+ // Python 3 on Windows installs as python.exe only, and the WindowsApps stub
389
+ // for both `python` and `python3` appears first on Git Bash's PATH. We find
390
+ // the real interpreter via where.exe and use its absolute POSIX path directly.
391
+ const realPython = windowsRealPythonPosix();
392
+ // On Windows, `python3 -m venv` creates .venv/Scripts/activate, not .venv/bin/activate.
393
+ // Patch setup.sh to use the Windows path so sourcing works in Git Bash.
394
+ const original = readFileSync(setupPath, "utf-8");
395
+ const patched = original
396
+ .replace(/\bsource\s+\.venv\/bin\/activate\b/g, "source .venv/Scripts/activate")
397
+ .replace(/\. \.venv\/bin\/activate\b/g, ". .venv/Scripts/activate");
398
+ const tmpSetup = `${setupPath}.win.sh`;
399
+ try {
400
+ unlinkSync(tmpSetup);
401
+ }
402
+ catch {
403
+ /* ok */
404
+ }
405
+ writeFileSync(tmpSetup, patched, "utf-8");
406
+ const posixTmp = tmpSetup.replace(/\\/g, "/").replace(/^([A-Za-z]):/, (_, d) => `/${d.toLowerCase()}`);
407
+ // Only define python3 — pip must NOT be overridden because after venv activation
408
+ // the venv's pip.exe is on PATH and must be used (not system Python's pip).
409
+ const pyFn = realPython ? `python3() { "${realPython}" "$@"; }` : "";
410
+ r = spawnSync("bash", ["-c", `${pyFn}${pyFn ? "; " : ""}. "${posixTmp}"`], {
411
+ cwd: worktreeDir,
412
+ encoding: "utf-8",
413
+ });
414
+ try {
415
+ unlinkSync(tmpSetup);
416
+ }
417
+ catch {
418
+ /* best-effort */
419
+ }
420
+ }
421
+ else {
422
+ r = spawnSync("/bin/sh", [setupPath], { cwd: worktreeDir, encoding: "utf-8" });
423
+ }
364
424
  if (r.status !== 0) {
365
- return { ok: false, error: (r.stderr ?? "").slice(-500) };
425
+ return { ok: false, error: String(r.stderr ?? "").slice(-500) };
366
426
  }
367
427
  return { ok: true };
368
428
  }
429
+ /** Returns the POSIX path to the real Python interpreter on Windows,
430
+ * skipping the WindowsApps stub which is a dead-end redirect. */
431
+ function windowsRealPythonPosix() {
432
+ try {
433
+ const out = spawnSync("where.exe", ["python"], { encoding: "utf-8" }).stdout ?? "";
434
+ for (const line of out.split(/\r?\n/)) {
435
+ const p = line.trim();
436
+ if (p && !p.includes("WindowsApps")) {
437
+ return p.replace(/\\/g, "/").replace(/^([A-Za-z]):/, (_, d) => `/${d.toLowerCase()}`);
438
+ }
439
+ }
440
+ }
441
+ catch {
442
+ /* fall through */
443
+ }
444
+ return "";
445
+ }
369
446
  function defaultOhEntry() {
370
447
  return join(process.cwd(), "dist", "main.js");
371
448
  }
449
+ /** Wrap a SWE-bench problem statement with SWE-bench-style instructions:
450
+ * the working tree is in ./repo, only that subtree is committed/diffed,
451
+ * edit source files in place, don't create documentation/scratch files. */
452
+ function buildEvalPrompt(problemStatement) {
453
+ return [
454
+ "You are an autonomous software engineer fixing a bug in an open-source Python project.",
455
+ "",
456
+ "WORKING DIRECTORY",
457
+ "- The repository source is in `./repo/` (relative to your current directory).",
458
+ "- A `.venv` next to it has the project installed editably; do NOT recreate it.",
459
+ "- Run all bash commands with `cd repo && …` or use absolute paths under `./repo/`.",
460
+ "",
461
+ "WHAT TO DO",
462
+ "- Read the problem statement below, locate the relevant source files in `./repo/`, and edit them in place to fix the bug.",
463
+ "- Use the existing test suite to verify (run with `cd repo && python -m pytest <file_or_pattern>`).",
464
+ "- Only changes inside `./repo/` are scored; the orchestrator runs `git diff HEAD` from `./repo/` to extract your patch.",
465
+ "",
466
+ "WHAT NOT TO DO",
467
+ "- Do NOT create README/SUMMARY/GUIDE/PATCH/SOLUTION/COMPLETION files. Edit the source.",
468
+ "- Do NOT write standalone scratch scripts at the worktree root — only edit files under `./repo/`.",
469
+ "- Do NOT modify `.venv/`, generated `_version.py` files, or anything outside `./repo/`.",
470
+ "",
471
+ "PROBLEM STATEMENT",
472
+ problemStatement,
473
+ ].join("\n");
474
+ }
372
475
  function defaultRunArgs(opts) {
373
476
  const args = [
374
477
  opts.ohEntry,
@@ -26,6 +26,9 @@ export declare class RunWriter {
26
26
  private readonly results;
27
27
  constructor(runDir: string, header: RunHeader);
28
28
  appendResult(result: EvalsResult): void;
29
+ /** Load a result that was written in a prior run into the in-memory array without
30
+ * re-writing it to disk (used by the resume path so finalize() includes all results). */
31
+ preloadResult(result: EvalsResult): void;
29
32
  loadExistingResults(): EvalsResult[];
30
33
  finalize(opts: {
31
34
  partial: boolean;
@@ -37,6 +37,11 @@ export class RunWriter {
37
37
  writeFileSync(tmp, JSON.stringify(preds, null, 2));
38
38
  renameSync(tmp, join(this.runDir, "predictions.json"));
39
39
  }
40
+ /** Load a result that was written in a prior run into the in-memory array without
41
+ * re-writing it to disk (used by the resume path so finalize() includes all results). */
42
+ preloadResult(result) {
43
+ this.results.push(result);
44
+ }
40
45
  loadExistingResults() {
41
46
  const path = join(this.runDir, "results.jsonl");
42
47
  if (!existsSync(path))
@@ -12,6 +12,33 @@
12
12
  import { spawnSync } from "node:child_process";
13
13
  import { existsSync, readFileSync } from "node:fs";
14
14
  import { join } from "node:path";
15
+ /** Convert pytest junit-xml classname/name (+ optional file= attr) into the
16
+ * pytest-style id that SWE-bench uses: `path/to/file.py::[Class::]test_name`.
17
+ * Returns null if a sensible id can't be built. */
18
+ function pytestStyleId(cn, name, file) {
19
+ let fileNorm;
20
+ let classTail;
21
+ if (file) {
22
+ fileNorm = file.replace(/\\/g, "/");
23
+ const moduleFromFile = fileNorm.replace(/\.py$/, "").replace(/\//g, ".");
24
+ classTail = cn.startsWith(`${moduleFromFile}.`) ? cn.slice(moduleFromFile.length + 1) : "";
25
+ }
26
+ else {
27
+ // No `file=` attribute (older pytest / minimal junit-xml). Derive the
28
+ // path from classname: trailing PascalCase segments are class names,
29
+ // the rest is the dotted module path → file is module/path.py.
30
+ const parts = cn.split(".");
31
+ const classParts = [];
32
+ while (parts.length > 0 && /^[A-Z]/.test(parts[parts.length - 1] ?? "")) {
33
+ classParts.unshift(parts.pop());
34
+ }
35
+ if (parts.length === 0)
36
+ return null;
37
+ fileNorm = `${parts.join("/")}.py`;
38
+ classTail = classParts.join("::");
39
+ }
40
+ return classTail ? `${fileNorm}::${classTail}::${name}` : `${fileNorm}::${name}`;
41
+ }
15
42
  /**
16
43
  * Minimal junit-xml parser. Returns a map of "<classname>.<name>" → outcome.
17
44
  *
@@ -27,17 +54,19 @@ export function parseJunitXml(xml) {
27
54
  const inner = match[2] ?? "";
28
55
  const cn = /classname="([^"]*)"/.exec(attrs)?.[1];
29
56
  const name = /\bname="([^"]*)"/.exec(attrs)?.[1];
57
+ const file = /\bfile="([^"]*)"/.exec(attrs)?.[1];
30
58
  if (cn && name) {
31
- const id = `${cn}.${name}`;
32
- if (/<failure\b/.test(inner) || /<error\b/.test(inner)) {
33
- out[id] = "fail";
34
- }
35
- else if (/<skipped\b/.test(inner)) {
36
- out[id] = "skip";
37
- }
38
- else {
39
- out[id] = "pass";
40
- }
59
+ let outcome = "pass";
60
+ if (/<failure\b/.test(inner) || /<error\b/.test(inner))
61
+ outcome = "fail";
62
+ else if (/<skipped\b/.test(inner))
63
+ outcome = "skip";
64
+ // Emit BOTH a dotted classname.name id (legacy) and pytest-style
65
+ // file::[Class::]name ids so SWE-bench-format expected IDs match.
66
+ out[`${cn}.${name}`] = outcome;
67
+ const ptid = pytestStyleId(cn, name, file);
68
+ if (ptid)
69
+ out[ptid] = outcome;
41
70
  }
42
71
  match = testcaseRe.exec(xml);
43
72
  }
@@ -53,22 +82,28 @@ export async function scoreTask(args) {
53
82
  const oracleSh = join(fixtureDir, "oracle.sh");
54
83
  const oracleMjs = join(fixtureDir, "oracle.mjs");
55
84
  if (existsSync(oracleSh)) {
56
- const r = spawnSync(oracleSh, [], {
57
- cwd: worktreeDir,
58
- env: {
59
- ...process.env,
60
- INSTANCE_ID: task.instance_id,
61
- WORKTREE_DIR: worktreeDir,
62
- FIXTURE_DIR: fixtureDir,
63
- },
64
- timeout: testTimeoutMs,
65
- shell: process.platform === "win32",
66
- });
85
+ // Invoke /bin/sh explicitly so oracle.sh runs without the execute bit.
86
+ // Files committed from Windows or via writeFileSync default to mode 100644.
87
+ const r = process.platform === "win32"
88
+ ? spawnSync(oracleSh, [], {
89
+ cwd: worktreeDir,
90
+ env: { ...process.env, INSTANCE_ID: task.instance_id, WORKTREE_DIR: worktreeDir, FIXTURE_DIR: fixtureDir },
91
+ timeout: testTimeoutMs,
92
+ shell: true,
93
+ })
94
+ : spawnSync("/bin/sh", [oracleSh], {
95
+ cwd: worktreeDir,
96
+ env: { ...process.env, INSTANCE_ID: task.instance_id, WORKTREE_DIR: worktreeDir, FIXTURE_DIR: fixtureDir },
97
+ timeout: testTimeoutMs,
98
+ });
99
+ // Oracle exit code is the pass/fail signal — do NOT set error_message for a clean
100
+ // non-zero exit (that means "test failed", not "scoring errored"). Only flag when
101
+ // the process itself failed to run (killed, spawn error, etc.).
67
102
  return {
68
103
  resolved: r.status === 0,
69
104
  tests_status: EMPTY_TESTS_STATUS,
70
105
  oracle_used: true,
71
- error_message: r.status === 0 ? undefined : (r.stderr?.toString().slice(-500) ?? ""),
106
+ error_message: r.status === null ? `oracle.sh did not exit cleanly: signal=${r.signal}` : undefined,
72
107
  };
73
108
  }
74
109
  if (existsSync(oracleMjs)) {
@@ -86,16 +121,27 @@ export async function scoreTask(args) {
86
121
  resolved: r.status === 0,
87
122
  tests_status: EMPTY_TESTS_STATUS,
88
123
  oracle_used: true,
89
- error_message: r.status === 0 ? undefined : (r.stderr?.toString().slice(-500) ?? ""),
124
+ error_message: r.status === null ? `oracle.mjs did not exit cleanly: signal=${r.signal}` : undefined,
90
125
  };
91
126
  }
92
127
  // (2) Default test command.
93
- const r = spawnSync(packDefaultTestCommand, {
128
+ // Run via bash so the venv is activated; cd into ./repo first if it exists
129
+ // (real SWE-bench packs put project source there). For synthetic packs
130
+ // without a repo/ subdir, run from the worktree root.
131
+ const hasRepo = existsSync(join(worktreeDir, "repo"));
132
+ const venvActivate = process.platform === "win32"
133
+ ? "[ -f .venv/Scripts/activate ] && source .venv/Scripts/activate"
134
+ : "[ -f .venv/bin/activate ] && source .venv/bin/activate";
135
+ const cdRepo = hasRepo ? "cd repo && " : "";
136
+ const r = spawnSync("bash", ["-c", `${venvActivate}; ${cdRepo}${packDefaultTestCommand}`], {
94
137
  cwd: worktreeDir,
95
- shell: true,
96
138
  timeout: testTimeoutMs,
97
139
  });
98
- const xmlPath = join(worktreeDir, ".oh-evals-results.xml");
140
+ // Test command writes junit-xml relative to its CWD. Prefer repo/ when it
141
+ // exists; fall back to worktree root for synthetic/legacy packs.
142
+ const xmlPathRepo = join(worktreeDir, "repo", ".oh-evals-results.xml");
143
+ const xmlPathRoot = join(worktreeDir, ".oh-evals-results.xml");
144
+ const xmlPath = existsSync(xmlPathRepo) ? xmlPathRepo : xmlPathRoot;
99
145
  if (!existsSync(xmlPath)) {
100
146
  return {
101
147
  resolved: false,
@@ -253,6 +253,15 @@ export type OhConfig = {
253
253
  * call-site that already uses `safeEnv()` picks this up automatically.
254
254
  */
255
255
  env?: Record<string, string>;
256
+ /**
257
+ * Per-skill visibility overrides. Keys are skill names (e.g. "my-skill" or
258
+ * "plugin:skill-name"). Values:
259
+ * "off" — hidden from model AND from the slash picker
260
+ * "user-invocable-only" — hidden from model, still shows in /skills + slash picker
261
+ * "name-only" — shown to model but description collapsed to name only
262
+ * Mirrors Claude Code's `skillOverrides` setting.
263
+ */
264
+ skillOverrides?: Record<string, "off" | "user-invocable-only" | "name-only">;
256
265
  };
257
266
  /** Clear cached config (call after writes or to force re-read) */
258
267
  export declare function invalidateConfigCache(): void;
@@ -60,7 +60,18 @@ export const MODEL_PRICING = {
60
60
  "qwen-turbo": [0.2, 0.6],
61
61
  };
62
62
  export function estimateCost(model, inputTokens, outputTokens) {
63
- const pricing = MODEL_PRICING[model];
63
+ // Exact match first; otherwise prefix-match so dated model IDs like
64
+ // "claude-haiku-4-5-20251001" resolve to "claude-haiku-4-5".
65
+ let pricing = MODEL_PRICING[model];
66
+ if (!pricing) {
67
+ let bestKey = "";
68
+ for (const key of Object.keys(MODEL_PRICING)) {
69
+ if (model.startsWith(key) && key.length > bestKey.length)
70
+ bestKey = key;
71
+ }
72
+ if (bestKey)
73
+ pricing = MODEL_PRICING[bestKey];
74
+ }
64
75
  if (!pricing)
65
76
  return 0;
66
77
  return (inputTokens / 1_000_000) * pricing[0] + (outputTokens / 1_000_000) * pricing[1];
@@ -54,6 +54,8 @@ export type AgentTeamConfig = {
54
54
  tools?: string[];
55
55
  }>;
56
56
  };
57
+ /** Register an extra plugin directory for this session (not persisted). */
58
+ export declare function addExtraPluginDir(dir: string): void;
57
59
  /** Discover all available skills from bundled + project + global dirs + installed plugins */
58
60
  export declare function discoverSkills(): SkillMetadata[];
59
61
  /** Find a skill by name (case-insensitive) */
@@ -73,5 +75,5 @@ export declare function loadPluginManifest(dir: string): PluginManifest | null;
73
75
  /** Discover plugins from node_modules */
74
76
  export declare function discoverPlugins(): PluginManifest[];
75
77
  /** Build a prompt listing available skills for the LLM */
76
- export declare function skillsToPrompt(skills: SkillMetadata[]): string;
78
+ export declare function skillsToPrompt(skills: SkillMetadata[], overrides?: Record<string, "off" | "user-invocable-only" | "name-only">): string;
77
79
  //# sourceMappingURL=plugins.d.ts.map
@@ -15,6 +15,13 @@ import { homedir } from "node:os";
15
15
  import { dirname, join, relative } from "node:path";
16
16
  import { fileURLToPath } from "node:url";
17
17
  import { getInstalledPlugins } from "./marketplace.js";
18
+ /** Session-scoped extra plugin directories registered via --plugin-dir / --plugin-url. */
19
+ const extraPluginDirs = [];
20
+ /** Register an extra plugin directory for this session (not persisted). */
21
+ export function addExtraPluginDir(dir) {
22
+ if (!extraPluginDirs.includes(dir))
23
+ extraPluginDirs.push(dir);
24
+ }
18
25
  const PROJECT_SKILLS_DIR = join(".oh", "skills");
19
26
  const GLOBAL_SKILLS_DIR = join(homedir(), ".oh", "skills");
20
27
  // Claude Code ecosystem mirror paths (Anthropic convention)
@@ -192,6 +199,17 @@ export function discoverSkills() {
192
199
  catch {
193
200
  /* marketplace module may not be loaded yet */
194
201
  }
202
+ // Session-scoped extra plugin dirs (--plugin-dir / --plugin-url)
203
+ for (const dir of extraPluginDirs) {
204
+ const pluginSkillsDir = join(dir, "skills");
205
+ const pluginSkills = loadSkillsFromDir(pluginSkillsDir, "plugin");
206
+ const manifest = loadPluginManifest(dir);
207
+ const pluginName = manifest?.name ?? dir.split(/[/\\]/).pop() ?? "extra";
208
+ for (const skill of pluginSkills) {
209
+ skill.name = `${pluginName}:${skill.name}`;
210
+ }
211
+ skills.push(...pluginSkills);
212
+ }
195
213
  // De-duplicate by name+filePath: if same skill appears in multiple paths (e.g. CC mirror), keep first.
196
214
  const seen = new Set();
197
215
  return skills.filter((s) => {
@@ -283,12 +301,21 @@ export function discoverPlugins() {
283
301
  return plugins;
284
302
  }
285
303
  /** Build a prompt listing available skills for the LLM */
286
- export function skillsToPrompt(skills) {
287
- // Only include skills with invokeModel !== false (hidden skills excluded from prompt)
288
- const visible = skills.filter((s) => s.invokeModel !== false);
304
+ export function skillsToPrompt(skills, overrides) {
305
+ // invokeModel:false hides from model; "off" and "user-invocable-only" overrides also hide from model.
306
+ const visible = skills.filter((s) => {
307
+ if (s.invokeModel === false)
308
+ return false;
309
+ const ov = overrides?.[s.name];
310
+ return ov !== "off" && ov !== "user-invocable-only";
311
+ });
289
312
  if (visible.length === 0)
290
313
  return "";
291
- const lines = visible.map((s) => `- ${s.name}: ${s.description}${s.trigger ? ` (auto-trigger: "${s.trigger}")` : ""}`);
314
+ const lines = visible.map((s) => {
315
+ const desc = overrides?.[s.name] === "name-only" ? "" : `: ${s.description}`;
316
+ const trigger = overrides?.[s.name] === "name-only" ? "" : s.trigger ? ` (auto-trigger: "${s.trigger}")` : "";
317
+ return `- ${s.name}${desc}${trigger}`;
318
+ });
292
319
  return `# Available Skills\nUse the Skill tool to invoke these:\n${lines.join("\n")}`;
293
320
  }
294
321
  //# sourceMappingURL=plugins.js.map
package/dist/main.js CHANGED
@@ -21,7 +21,7 @@ import { emitHook, setHookDecisionObserver } from "./harness/hooks.js";
21
21
  import { languageToPrompt } from "./harness/language.js";
22
22
  import { loadActiveMemories, memoriesToPrompt, userProfileToPrompt } from "./harness/memory.js";
23
23
  import { detectProject, projectContextToPrompt } from "./harness/onboarding.js";
24
- import { discoverSkills, skillsToPrompt } from "./harness/plugins.js";
24
+ import { addExtraPluginDir, discoverSkills, skillsToPrompt } from "./harness/plugins.js";
25
25
  import { createRulesFile, loadRules, loadRulesAsPrompt } from "./harness/rules.js";
26
26
  import { listSessions } from "./harness/session.js";
27
27
  import { connectedMcpServers, disconnectMcpClients, getMcpInstructions, loadMcpPrompts, loadMcpTools, parseMcpConfigFile, } from "./mcp/loader.js";
@@ -164,7 +164,7 @@ function buildSystemPrompt(model, opts = {}) {
164
164
  parts.push(memoriesPrompt);
165
165
  // Available skills (Level 0 — names + descriptions only)
166
166
  const skills = discoverSkills();
167
- const skillsPrompt = skillsToPrompt(skills);
167
+ const skillsPrompt = skillsToPrompt(skills, cfg?.skillOverrides);
168
168
  if (skillsPrompt)
169
169
  parts.push(skillsPrompt);
170
170
  // MCP server instructions (sandboxed — treat as untrusted)
@@ -366,7 +366,12 @@ program
366
366
  if (outputFormat === "stream-json") {
367
367
  console.log(JSON.stringify({ type: "turnStart", turnNumber: 0 }));
368
368
  }
369
- for await (const event of query(prompt, config, priorMessages)) {
369
+ // Track cumulative cost + turn count so stream-json mode can emit a final
370
+ // `result` event (consumed by `oh evals` and SDK callers).
371
+ let cumulativeCost = 0;
372
+ let turnsCompleted = 0;
373
+ let lastTurnReason = "ok";
374
+ for await (const event of query(prompt, { ...config, sessionId }, priorMessages)) {
370
375
  if (event.type === "text_delta") {
371
376
  fullOutput += event.content;
372
377
  if (outputFormat === "text")
@@ -408,6 +413,7 @@ program
408
413
  }
409
414
  }
410
415
  else if (event.type === "cost_update") {
416
+ cumulativeCost += event.cost;
411
417
  if (outputFormat === "stream-json") {
412
418
  console.log(JSON.stringify({
413
419
  type: "cost_update",
@@ -419,6 +425,8 @@ program
419
425
  }
420
426
  }
421
427
  else if (event.type === "turn_complete") {
428
+ turnsCompleted += 1;
429
+ lastTurnReason = event.reason;
422
430
  if (outputFormat === "stream-json") {
423
431
  console.log(JSON.stringify({ type: "turn_complete", reason: event.reason }));
424
432
  }
@@ -431,6 +439,15 @@ program
431
439
  }
432
440
  }
433
441
  }
442
+ if (outputFormat === "stream-json") {
443
+ console.log(JSON.stringify({
444
+ type: "result",
445
+ subtype: lastTurnReason,
446
+ total_cost_usd: cumulativeCost,
447
+ num_turns: turnsCompleted,
448
+ result: fullOutput,
449
+ }));
450
+ }
434
451
  if (outputFormat === "json") {
435
452
  console.log(JSON.stringify({ output: fullOutput, tools: toolResults }, null, 2));
436
453
  }
@@ -632,7 +649,7 @@ program
632
649
  permissionMode,
633
650
  });
634
651
  console.log(JSON.stringify({ id, type: "turnStart", turnNumber: turnIdx }));
635
- for await (const event of query(prompt, config, conversation)) {
652
+ for await (const event of query(prompt, { ...config, sessionId }, conversation)) {
636
653
  if (event.type === "text_delta") {
637
654
  assistantText += event.content;
638
655
  console.log(JSON.stringify({ id, type: "text", content: event.content }));
@@ -1535,6 +1552,74 @@ program
1535
1552
  }, intervalMs);
1536
1553
  process.stderr.write(`[schedule] Running every ${opts.interval} minutes. Ctrl+C to stop.\n`);
1537
1554
  });
1555
// ── --plugin-dir / --plugin-url (session-scoped extra plugins) ──
// Registered as global options so they work with any subcommand (run, session, REPL).
program
    .option("--plugin-dir <path>", "Load a plugin from a local directory for this session (not persisted)")
    .option("--plugin-url <url>", "Download a plugin .zip or .tar.gz from a URL and load it for this session");
/**
 * preAction hook: registers the extra plugin directories requested on the
 * command line.
 *
 * - `--plugin-dir` is passed straight to `addExtraPluginDir`.
 * - `--plugin-url` is downloaded over HTTPS into a fresh temp directory,
 *   extracted with `unzip` (URL path ends in `.zip`) or `tar -xzf` (anything
 *   else), and the extracted root is passed to `addExtraPluginDir`.
 *
 * The returned promise rejects on: more than 5 redirects, a redirect without
 * a usable `Location`, any non-200 final status, a network/stream error, or a
 * failing extractor process.
 */
program.hook("preAction", async () => {
    const opts = program.opts();
    if (opts.pluginDir) {
        addExtraPluginDir(opts.pluginDir);
    }
    if (!opts.pluginUrl) {
        return;
    }
    const { get: httpsGet } = await import("node:https");
    const { createWriteStream, mkdirSync, mkdtempSync, readdirSync, statSync } = await import("node:fs");
    const { tmpdir } = await import("node:os");
    const { execFileSync } = await import("node:child_process");
    const MAX_REDIRECTS = 5;
    const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);
    const url = opts.pluginUrl;
    // Decide the archive format from the URL *path*, so a query string or
    // fragment ("plugin.zip?token=…") does not hide the extension.
    const isZipUrl = (raw) => {
        try {
            return new URL(raw).pathname.toLowerCase().endsWith(".zip");
        }
        catch {
            // Not parseable as a URL; fall back to the plain suffix check.
            return raw.endsWith(".zip");
        }
    };
    // NOTE(review): this temp directory is not removed here — presumably the
    // plugin is read from it for the rest of the session; confirm whether
    // cleanup on exit is wanted.
    const tmp = mkdtempSync(join(tmpdir(), "oh-plugin-"));
    const isZip = isZipUrl(url);
    const archivePath = join(tmp, isZip ? "plugin.zip" : "plugin.tar.gz");
    await new Promise((resolve, reject) => {
        function follow(u, depth = 0) {
            if (depth > MAX_REDIRECTS) {
                reject(new Error("too many redirects"));
                return;
            }
            let req;
            try {
                req = httpsGet(u, (res) => {
                    const status = res.statusCode ?? 0;
                    if (REDIRECT_STATUSES.has(status)) {
                        // Drain the redirect response so its socket is released.
                        res.resume();
                        const location = res.headers.location;
                        if (!location) {
                            reject(new Error(`HTTP ${status} redirect without Location header from ${u}`));
                            return;
                        }
                        let next;
                        try {
                            // Location may be relative; resolve it against the current URL.
                            next = new URL(location, u).toString();
                        }
                        catch (err) {
                            reject(err);
                            return;
                        }
                        follow(next, depth + 1);
                    }
                    else if (status !== 200) {
                        res.resume();
                        reject(new Error(`HTTP ${res.statusCode} fetching plugin from ${u}`));
                    }
                    else {
                        const out = createWriteStream(archivePath);
                        res.on("error", reject);
                        res.pipe(out);
                        out.on("finish", resolve);
                        out.on("error", reject);
                    }
                });
            }
            catch (err) {
                // https.get throws synchronously for unsupported protocols or
                // malformed URLs. Inside a redirect callback that would
                // otherwise surface as an uncaught exception.
                reject(err);
                return;
            }
            req.on("error", reject);
        }
        follow(url);
    });
    const extractDir = join(tmp, "plugin");
    mkdirSync(extractDir, { recursive: true });
    // NOTE(review): the archive comes from a user-supplied URL and is extracted
    // as-is; this relies on unzip/tar refusing entries that escape extractDir —
    // confirm that is acceptable for this option.
    if (isZip) {
        execFileSync("unzip", ["-q", archivePath, "-d", extractDir]);
    }
    else {
        execFileSync("tar", ["-xzf", archivePath], { cwd: extractDir });
    }
    // If the archive produced a single top-level dir, step into it (common convention).
    const entries = readdirSync(extractDir);
    let pluginRoot = extractDir;
    if (entries.length === 1) {
        const only = join(extractDir, entries[0]);
        try {
            if (statSync(only).isDirectory()) {
                pluginRoot = only;
            }
        }
        catch {
            // Entry could not be stat'ed; keep using extractDir itself.
        }
    }
    addExtraPluginDir(pluginRoot);
});
1538
1623
  program.parseAsync(process.argv).catch((err) => {
1539
1624
  console.error(err instanceof Error ? err.message : String(err));
1540
1625
  process.exitCode = 1;
@@ -32,6 +32,8 @@ export declare function loadMcpTools(opts?: LoadMcpOptions): Promise<Tool[]>;
32
32
  export declare function disconnectMcpClients(): void;
33
33
  /** Names of connected MCP servers */
34
34
  export declare function connectedMcpServers(): string[];
35
+ /** Tool count for a connected MCP server, or undefined if not connected. */
36
+ export declare function mcpServerToolCount(name: string): number | undefined;
35
37
  export type McpPromptHandle = {
36
38
  /** `<server>:<prompt>` qualified name — the slash command is `/<server>:<prompt>`. */
37
39
  qualifiedName: string;
@@ -48,6 +48,7 @@ export function parseMcpConfigFile(path) {
48
48
  return servers;
49
49
  }
50
50
  const connectedClients = [];
51
+ const serverToolCount = new Map();
51
52
  let exitHandlerInstalled = false;
52
53
  function installExitHandler() {
53
54
  if (exitHandlerInstalled)
@@ -104,6 +105,7 @@ export async function loadMcpTools(opts = {}) {
104
105
  }
105
106
  const { client, defs, server } = result.value;
106
107
  connectedClients.push(client);
108
+ serverToolCount.set(server.name, defs.length);
107
109
  debug("mcp", "connected", { server: server.name, tools: defs.length, deferred: defs.length > DEFERRED_THRESHOLD });
108
110
  if (defs.length > DEFERRED_THRESHOLD) {
109
111
  for (const def of defs) {
@@ -129,11 +131,16 @@ export function disconnectMcpClients() {
129
131
  }
130
132
  }
131
133
  connectedClients.length = 0;
134
+ serverToolCount.clear();
132
135
  }
133
136
  /** Names of connected MCP servers */
134
137
  export function connectedMcpServers() {
135
138
  return connectedClients.map((c) => c.name);
136
139
  }
140
/**
 * Look up the tool count recorded for an MCP server.
 *
 * @param name Server name used as the key in `serverToolCount`.
 * @returns The stored count, or `undefined` when nothing is stored under `name`.
 */
export function mcpServerToolCount(name) {
    const toolCount = serverToolCount.get(name);
    return toolCount;
}
137
144
  /**
138
145
  * Enumerate prompts on every already-connected MCP server. Servers that don't
139
146
  * implement the `prompts/list` capability return an empty list (handled
@@ -87,10 +87,11 @@ export class AnthropicProvider {
87
87
  // Prompt caching: send system prompt as content blocks with cache_control.
88
88
  // Anthropic caches matching prefixes — 90% cost reduction on repeat turns.
89
89
  const systemBlocks = [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }];
90
- // Scale max_tokens and thinking budget based on model
90
+ // Scale max_tokens and thinking budget based on model.
91
+ // Anthropic requires max_tokens > thinking.budget_tokens.
91
92
  const isOpus = m.includes("opus");
92
- const maxTokens = isOpus ? 16384 : 8192;
93
- const thinkingBudget = isOpus ? 32000 : 10000;
93
+ const maxTokens = isOpus ? 32768 : 16384;
94
+ const thinkingBudget = isOpus ? 24576 : 8192;
94
95
  const body = {
95
96
  model: m,
96
97
  max_tokens: maxTokens,
@@ -293,7 +294,18 @@ export class AnthropicProvider {
293
294
  return createAssistantMessage(content, toolCalls.length ? toolCalls : undefined);
294
295
  }
295
296
  getModelInfo(id) {
296
- return this.listModels().find((m) => m.id === id);
297
+ // Exact match first; otherwise prefix-match so dated model IDs like
298
+ // "claude-haiku-4-5-20251001" resolve to "claude-haiku-4-5".
299
+ const models = this.listModels();
300
+ const exact = models.find((m) => m.id === id);
301
+ if (exact)
302
+ return exact;
303
+ let best;
304
+ for (const m of models) {
305
+ if (id.startsWith(m.id) && (!best || m.id.length > best.id.length))
306
+ best = m;
307
+ }
308
+ return best;
297
309
  }
298
310
  listModels() {
299
311
  return [
@@ -58,6 +58,7 @@ export async function* query(userMessage, config, existingMessages = []) {
58
58
  gitCommitPerTool: config.gitCommitPerTool,
59
59
  tracer: config.tracer,
60
60
  parentSpanId: querySpanId,
61
+ sessionId: config.sessionId,
61
62
  };
62
63
  const estimateTokens = makeTokenEstimator(config.provider);
63
64
  const contextManager = new ContextManager(undefined, config.model);
@@ -35,6 +35,8 @@ export type QueryConfig = {
35
35
  permissionPromptTool?: string;
36
36
  /** Optional session tracer. When set, query() emits `query` and `tool:<Name>` spans. */
37
37
  tracer?: SessionTracer;
38
+ /** Session ID injected into Bash subprocess env as OH_SESSION_ID. */
39
+ sessionId?: string;
38
40
  };
39
41
  export type TransitionReason = "next_turn" | "retry_network" | "retry_prompt_too_long" | "retry_max_output_tokens";
40
42
  export type QueryLoopState = {
package/dist/repl.js CHANGED
@@ -921,6 +921,7 @@ export async function startREPL(config) {
921
921
  model: currentModel || undefined,
922
922
  abortSignal: abortController.signal,
923
923
  tracer,
924
+ sessionId: session.id,
924
925
  };
925
926
  try {
926
927
  for await (const event of query(prompt, queryConfig, messages)) {
@@ -52,7 +52,7 @@ export const BashTool = {
52
52
  const bgId = Date.now().toString(36) + Math.random().toString(36).slice(2, 6);
53
53
  const proc = spawn(shell, shellArgs, {
54
54
  cwd: context.workingDir,
55
- env: safeEnv(),
55
+ env: safeEnv(context.sessionId ? { OH_SESSION_ID: context.sessionId } : undefined),
56
56
  stdio: ["ignore", "pipe", "pipe"],
57
57
  detached: false,
58
58
  ...extraSpawnOpts,
@@ -98,7 +98,7 @@ export const BashTool = {
98
98
  let killed = false;
99
99
  const proc = spawn(shell, shellArgs, {
100
100
  cwd: context.workingDir,
101
- env: safeEnv(),
101
+ env: safeEnv(context.sessionId ? { OH_SESSION_ID: context.sessionId } : undefined),
102
102
  stdio: ["ignore", "pipe", "pipe"],
103
103
  ...extraSpawnOpts,
104
104
  });
@@ -19,6 +19,8 @@ const BLOCKED_PATTERNS = [
19
19
  /^DOCKER_.*TOKEN$/i,
20
20
  /^SSH_.*KEY$/i,
21
21
  /^OH_CREDENTIAL/i,
22
+ // Prevent subprocesses from inheriting the CLI's own OTLP endpoint.
23
+ /^OTEL_/i,
22
24
  ];
23
25
  /**
24
26
  * Filter process.env to remove credential-containing variables.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@zhijiewang/openharness",
3
- "version": "2.40.1",
3
+ "version": "2.40.3",
4
4
  "description": "Open-source terminal coding agent. Works with any LLM.",
5
5
  "type": "module",
6
6
  "bin": {