@zhijiewang/openharness 2.40.1 → 2.40.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Tool.d.ts +2 -0
- package/dist/Tool.js +9 -1
- package/dist/commands/info.js +5 -2
- package/dist/commands/skills.js +12 -6
- package/dist/evals/orchestrator.js +124 -21
- package/dist/evals/run-writer.d.ts +3 -0
- package/dist/evals/run-writer.js +5 -0
- package/dist/evals/scorer.js +72 -26
- package/dist/harness/config.d.ts +9 -0
- package/dist/harness/cost.js +12 -1
- package/dist/harness/plugins.d.ts +3 -1
- package/dist/harness/plugins.js +31 -4
- package/dist/main.js +89 -4
- package/dist/mcp/loader.d.ts +2 -0
- package/dist/mcp/loader.js +7 -0
- package/dist/providers/anthropic.js +16 -4
- package/dist/query/index.js +1 -0
- package/dist/query/types.d.ts +2 -0
- package/dist/repl.js +1 -0
- package/dist/tools/BashTool/index.js +2 -2
- package/dist/utils/safe-env.js +2 -0
- package/package.json +1 -1
package/dist/Tool.d.ts
CHANGED
|
@@ -33,6 +33,8 @@ export type ToolContext = {
|
|
|
33
33
|
tracer?: import("./harness/traces.js").SessionTracer;
|
|
34
34
|
/** Optional parent span ID for the current tool execution (set by query loop). */
|
|
35
35
|
parentSpanId?: string;
|
|
36
|
+
/** Session ID for the current query — injected into Bash subprocess env. */
|
|
37
|
+
sessionId?: string;
|
|
36
38
|
};
|
|
37
39
|
export type Tool<Input extends z.ZodType = z.ZodType> = {
|
|
38
40
|
readonly name: string;
|
package/dist/Tool.js
CHANGED
|
@@ -51,7 +51,15 @@ function zodToJsonSchema(schema) {
|
|
|
51
51
|
return { type: "boolean" };
|
|
52
52
|
if (def?.typeName === "ZodArray")
|
|
53
53
|
return { type: "array", items: zodToJsonSchema(def.type) };
|
|
54
|
-
|
|
54
|
+
// ZodRecord (used by DeferredTool's permissive schema) → permissive object.
|
|
55
|
+
// Anthropic's tool-use API requires `type: "object"` for tool input_schema.
|
|
56
|
+
if (def?.typeName === "ZodRecord")
|
|
57
|
+
return { type: "object", additionalProperties: {} };
|
|
58
|
+
if (def?.typeName === "ZodUnknown" || def?.typeName === "ZodAny")
|
|
59
|
+
return {};
|
|
60
|
+
// Fallback: return permissive object so tool-use APIs that require object
|
|
61
|
+
// input schemas (Anthropic) don't reject the request.
|
|
62
|
+
return { type: "object", additionalProperties: {} };
|
|
55
63
|
}
|
|
56
64
|
/**
|
|
57
65
|
* Find a tool by name from a list of tools.
|
package/dist/commands/info.js
CHANGED
|
@@ -13,7 +13,7 @@ import { discoverPlugins, discoverSkills } from "../harness/plugins.js";
|
|
|
13
13
|
import { formatFlameGraph, formatTrace, listTracedSessions, loadTrace } from "../harness/traces.js";
|
|
14
14
|
import { getVerificationConfig, invalidateVerificationCache } from "../harness/verification.js";
|
|
15
15
|
import { normalizeMcpConfig } from "../mcp/config-normalize.js";
|
|
16
|
-
import { connectedMcpServers, disconnectMcpClients, loadMcpTools } from "../mcp/loader.js";
|
|
16
|
+
import { connectedMcpServers, disconnectMcpClients, loadMcpTools, mcpServerToolCount } from "../mcp/loader.js";
|
|
17
17
|
import { getAuthStatus } from "../mcp/oauth.js";
|
|
18
18
|
import { formatRegistry, generateConfigBlock, MCP_REGISTRY, searchRegistry } from "../mcp/registry.js";
|
|
19
19
|
import { getRouteSelection } from "../providers/router.js";
|
|
@@ -458,6 +458,8 @@ export function registerInfoCommands(register, getCommandMap) {
|
|
|
458
458
|
continue;
|
|
459
459
|
}
|
|
460
460
|
const kind = normalized.cfg.type;
|
|
461
|
+
const toolCount = mcpServerToolCount(name);
|
|
462
|
+
const toolsLabel = toolCount !== undefined ? `${toolCount} tool${toolCount === 1 ? "" : "s"}` : "";
|
|
461
463
|
const status = await getAuthStatus(normalized.cfg, storageDir);
|
|
462
464
|
let statusText;
|
|
463
465
|
switch (status) {
|
|
@@ -474,7 +476,8 @@ export function registerInfoCommands(register, getCommandMap) {
|
|
|
474
476
|
statusText = "expired (re-authenticate with /mcp-login)";
|
|
475
477
|
break;
|
|
476
478
|
}
|
|
477
|
-
|
|
479
|
+
const toolsPart = toolsLabel ? ` ${toolsLabel.padEnd(9)}` : " ";
|
|
480
|
+
lines.push(` ${name.padEnd(20)} ${kind.padEnd(6)}${toolsPart} ${statusText}`);
|
|
478
481
|
}
|
|
479
482
|
lines.push("");
|
|
480
483
|
lines.push("Run /mcp-registry to browse and add more servers.");
|
package/dist/commands/skills.js
CHANGED
|
@@ -3,33 +3,39 @@
|
|
|
3
3
|
*/
|
|
4
4
|
import { existsSync, mkdirSync, unlinkSync, writeFileSync } from "node:fs";
|
|
5
5
|
import { join } from "node:path";
|
|
6
|
+
import { readOhConfig } from "../harness/config.js";
|
|
6
7
|
import { discoverSkills, findSkill } from "../harness/plugins.js";
|
|
7
8
|
export function registerSkillCommands(register) {
|
|
8
9
|
register("skills", "List all available skills", () => {
|
|
9
10
|
const skills = discoverSkills();
|
|
10
|
-
|
|
11
|
+
const overrides = readOhConfig()?.skillOverrides ?? {};
|
|
12
|
+
// "off" skills are fully hidden from the user
|
|
13
|
+
const visible = skills.filter((s) => overrides[s.name] !== "off");
|
|
14
|
+
if (visible.length === 0) {
|
|
11
15
|
return {
|
|
12
16
|
output: "No skills found. Create .oh/skills/*.md to add one, or run /skill-search to browse the registry.",
|
|
13
17
|
handled: true,
|
|
14
18
|
};
|
|
15
19
|
}
|
|
16
|
-
// Group by source for readability
|
|
17
20
|
const lines = ["Available skills:"];
|
|
18
21
|
const sourceLabel = {
|
|
19
22
|
project: "[project]",
|
|
20
23
|
global: "[global]",
|
|
21
24
|
plugin: "[plugin]",
|
|
22
25
|
};
|
|
23
|
-
|
|
24
|
-
const sorted = [...skills].sort((a, b) => {
|
|
26
|
+
const sorted = [...visible].sort((a, b) => {
|
|
25
27
|
if (a.source !== b.source)
|
|
26
28
|
return a.source.localeCompare(b.source);
|
|
27
29
|
return a.name.localeCompare(b.name);
|
|
28
30
|
});
|
|
29
31
|
for (const s of sorted) {
|
|
30
32
|
const tag = sourceLabel[s.source] ?? `[${s.source}]`;
|
|
31
|
-
const
|
|
32
|
-
|
|
33
|
+
const ov = overrides[s.name];
|
|
34
|
+
// "user-invocable-only": show name but mark as not available to model
|
|
35
|
+
// "name-only": suppress description (mirrors model-side behaviour)
|
|
36
|
+
const descText = ov === "name-only" || !s.description ? "" : `: ${s.description}`;
|
|
37
|
+
const hint = ov === "user-invocable-only" ? " [user-only]" : "";
|
|
38
|
+
lines.push(` - ${s.name} ${tag}${descText}${hint}`);
|
|
33
39
|
}
|
|
34
40
|
return { output: lines.join("\n"), handled: true };
|
|
35
41
|
});
|
|
package/dist/evals/orchestrator.js
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
* --model <model> "<problem_statement>"
|
|
17
17
|
*/
|
|
18
18
|
import { execFileSync, spawn, spawnSync } from "node:child_process";
|
|
19
|
-
import { createWriteStream, existsSync, mkdirSync, rmSync as nodeRmSync, readFileSync } from "node:fs";
|
|
19
|
+
import { copyFileSync, createWriteStream, existsSync, mkdirSync, rmSync as nodeRmSync, readFileSync, unlinkSync, writeFileSync, } from "node:fs";
|
|
20
20
|
import { join } from "node:path";
|
|
21
21
|
import { isGitRepo, removeWorktree } from "../git/index.js";
|
|
22
22
|
import { RunWriter } from "./run-writer.js";
|
|
@@ -48,6 +48,7 @@ export class RunOrchestrator {
|
|
|
48
48
|
for (const r of prior) {
|
|
49
49
|
this.skipIds.add(r.instance_id);
|
|
50
50
|
this.totalCost += r.cost_usd;
|
|
51
|
+
this.writer.preloadResult(r);
|
|
51
52
|
}
|
|
52
53
|
}
|
|
53
54
|
}
|
|
@@ -134,7 +135,7 @@ export class RunOrchestrator {
|
|
|
134
135
|
maxTurns: this.opts.maxTaskTurns,
|
|
135
136
|
model: this.opts.model,
|
|
136
137
|
fallbackModel: this.opts.fallbackModel,
|
|
137
|
-
prompt: task.problem_statement,
|
|
138
|
+
prompt: buildEvalPrompt(task.problem_statement),
|
|
138
139
|
}),
|
|
139
140
|
};
|
|
140
141
|
const transcriptPath = join(this.opts.runDir, "transcripts", `${task.instance_id}.jsonl`);
|
|
@@ -251,7 +252,8 @@ export class RunOrchestrator {
|
|
|
251
252
|
}
|
|
252
253
|
finally {
|
|
253
254
|
// Clean up worktree (best-effort; swallow errors so a leak doesn't stop a run).
|
|
254
|
-
|
|
255
|
+
// Set OH_EVALS_KEEP_WORKTREES=1 to skip cleanup for post-run debugging.
|
|
256
|
+
if (worktreePath && existsSync(worktreePath) && !process.env.OH_EVALS_KEEP_WORKTREES) {
|
|
255
257
|
try {
|
|
256
258
|
if (usedGitWorktree)
|
|
257
259
|
removeWorktree(worktreePath);
|
|
@@ -317,12 +319,20 @@ function parseStreamJsonResult(stdout) {
|
|
|
317
319
|
return { cost_usd: 0, turns_used: 0, exit_reason: "ok", final_message: "" };
|
|
318
320
|
}
|
|
319
321
|
function captureGitDiff(worktreeDir) {
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
322
|
+
// setup.sh initialises the git repo at worktreeDir/repo/.git, so diff from
|
|
323
|
+
// that subdirectory. Fall back to worktreeDir for legacy fixtures that put
|
|
324
|
+
// .git at the worktree root.
|
|
325
|
+
for (const dir of [join(worktreeDir, "repo"), worktreeDir]) {
|
|
326
|
+
try {
|
|
327
|
+
const out = execFileSync("git", ["-C", dir, "diff", "HEAD"], { encoding: "utf-8" });
|
|
328
|
+
if (out)
|
|
329
|
+
return out;
|
|
330
|
+
}
|
|
331
|
+
catch {
|
|
332
|
+
/* try next */
|
|
333
|
+
}
|
|
325
334
|
}
|
|
335
|
+
return "";
|
|
326
336
|
}
|
|
327
337
|
async function extractFixture(packDir, instanceId, dest) {
|
|
328
338
|
const fxDir = join(packDir, "fixtures", instanceId);
|
|
@@ -340,14 +350,28 @@ async function extractFixture(packDir, instanceId, dest) {
|
|
|
340
350
|
// handles initialization; we just ensure the dest dir exists.
|
|
341
351
|
return;
|
|
342
352
|
}
|
|
343
|
-
|
|
344
|
-
|
|
353
|
+
// Use cwd + relative archive name to avoid GNU tar treating Windows drive
|
|
354
|
+
// letters (e.g. "E:") as remote hostnames when passed as absolute paths.
|
|
355
|
+
const archiveName = c.flag === "-xzf" ? "_repo.tar.gz" : "_repo.tar.zst";
|
|
356
|
+
copyFileSync(c.path, join(dest, archiveName));
|
|
357
|
+
try {
|
|
358
|
+
if (c.flag === "-xzf") {
|
|
359
|
+
execFileSync("tar", ["-xzf", archiveName], { cwd: dest, stdio: ["ignore", "pipe", "pipe"] });
|
|
360
|
+
}
|
|
361
|
+
else {
|
|
362
|
+
execFileSync("tar", ["--use-compress-program=zstd -d", "-xf", archiveName], {
|
|
363
|
+
cwd: dest,
|
|
364
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
365
|
+
});
|
|
366
|
+
}
|
|
345
367
|
}
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
368
|
+
finally {
|
|
369
|
+
try {
|
|
370
|
+
unlinkSync(join(dest, archiveName));
|
|
371
|
+
}
|
|
372
|
+
catch {
|
|
373
|
+
/* best-effort */
|
|
374
|
+
}
|
|
351
375
|
}
|
|
352
376
|
return;
|
|
353
377
|
}
|
|
@@ -356,19 +380,98 @@ async function runSetupScript(packDir, instanceId, worktreeDir) {
|
|
|
356
380
|
const setupPath = join(packDir, "fixtures", instanceId, "setup.sh");
|
|
357
381
|
if (!existsSync(setupPath))
|
|
358
382
|
return { ok: true }; // No setup needed.
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
383
|
+
// Invoke sh/bash explicitly so the script runs without the execute bit.
|
|
384
|
+
// On Windows, use bash (Git Bash) and define python3 as a shell function
|
|
385
|
+
// that delegates to `python` — Python 3 on Windows ships as python.exe only.
|
|
386
|
+
let r;
|
|
387
|
+
if (process.platform === "win32") {
|
|
388
|
+
// Python 3 on Windows installs as python.exe only, and the WindowsApps stub
|
|
389
|
+
// for both `python` and `python3` appears first on Git Bash's PATH. We find
|
|
390
|
+
// the real interpreter via where.exe and use its absolute POSIX path directly.
|
|
391
|
+
const realPython = windowsRealPythonPosix();
|
|
392
|
+
// On Windows, `python3 -m venv` creates .venv/Scripts/activate, not .venv/bin/activate.
|
|
393
|
+
// Patch setup.sh to use the Windows path so sourcing works in Git Bash.
|
|
394
|
+
const original = readFileSync(setupPath, "utf-8");
|
|
395
|
+
const patched = original
|
|
396
|
+
.replace(/\bsource\s+\.venv\/bin\/activate\b/g, "source .venv/Scripts/activate")
|
|
397
|
+
.replace(/\. \.venv\/bin\/activate\b/g, ". .venv/Scripts/activate");
|
|
398
|
+
const tmpSetup = `${setupPath}.win.sh`;
|
|
399
|
+
try {
|
|
400
|
+
unlinkSync(tmpSetup);
|
|
401
|
+
}
|
|
402
|
+
catch {
|
|
403
|
+
/* ok */
|
|
404
|
+
}
|
|
405
|
+
writeFileSync(tmpSetup, patched, "utf-8");
|
|
406
|
+
const posixTmp = tmpSetup.replace(/\\/g, "/").replace(/^([A-Za-z]):/, (_, d) => `/${d.toLowerCase()}`);
|
|
407
|
+
// Only define python3 — pip must NOT be overridden because after venv activation
|
|
408
|
+
// the venv's pip.exe is on PATH and must be used (not system Python's pip).
|
|
409
|
+
const pyFn = realPython ? `python3() { "${realPython}" "$@"; }` : "";
|
|
410
|
+
r = spawnSync("bash", ["-c", `${pyFn}${pyFn ? "; " : ""}. "${posixTmp}"`], {
|
|
411
|
+
cwd: worktreeDir,
|
|
412
|
+
encoding: "utf-8",
|
|
413
|
+
});
|
|
414
|
+
try {
|
|
415
|
+
unlinkSync(tmpSetup);
|
|
416
|
+
}
|
|
417
|
+
catch {
|
|
418
|
+
/* best-effort */
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
else {
|
|
422
|
+
r = spawnSync("/bin/sh", [setupPath], { cwd: worktreeDir, encoding: "utf-8" });
|
|
423
|
+
}
|
|
364
424
|
if (r.status !== 0) {
|
|
365
|
-
return { ok: false, error: (r.stderr ?? "").slice(-500) };
|
|
425
|
+
return { ok: false, error: String(r.stderr ?? "").slice(-500) };
|
|
366
426
|
}
|
|
367
427
|
return { ok: true };
|
|
368
428
|
}
|
|
429
|
+
/** Returns the POSIX path to the real Python interpreter on Windows,
|
|
430
|
+
* skipping the WindowsApps stub which is a dead-end redirect. */
|
|
431
|
+
function windowsRealPythonPosix() {
|
|
432
|
+
try {
|
|
433
|
+
const out = spawnSync("where.exe", ["python"], { encoding: "utf-8" }).stdout ?? "";
|
|
434
|
+
for (const line of out.split(/\r?\n/)) {
|
|
435
|
+
const p = line.trim();
|
|
436
|
+
if (p && !p.includes("WindowsApps")) {
|
|
437
|
+
return p.replace(/\\/g, "/").replace(/^([A-Za-z]):/, (_, d) => `/${d.toLowerCase()}`);
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
catch {
|
|
442
|
+
/* fall through */
|
|
443
|
+
}
|
|
444
|
+
return "";
|
|
445
|
+
}
|
|
369
446
|
function defaultOhEntry() {
|
|
370
447
|
return join(process.cwd(), "dist", "main.js");
|
|
371
448
|
}
|
|
449
|
+
/** Wrap a SWE-bench problem statement with SWE-bench-style instructions:
|
|
450
|
+
* the working tree is in ./repo, only that subtree is committed/diffed,
|
|
451
|
+
* edit source files in place, don't create documentation/scratch files. */
|
|
452
|
+
function buildEvalPrompt(problemStatement) {
|
|
453
|
+
return [
|
|
454
|
+
"You are an autonomous software engineer fixing a bug in an open-source Python project.",
|
|
455
|
+
"",
|
|
456
|
+
"WORKING DIRECTORY",
|
|
457
|
+
"- The repository source is in `./repo/` (relative to your current directory).",
|
|
458
|
+
"- A `.venv` next to it has the project installed editably; do NOT recreate it.",
|
|
459
|
+
"- Run all bash commands with `cd repo && …` or use absolute paths under `./repo/`.",
|
|
460
|
+
"",
|
|
461
|
+
"WHAT TO DO",
|
|
462
|
+
"- Read the problem statement below, locate the relevant source files in `./repo/`, and edit them in place to fix the bug.",
|
|
463
|
+
"- Use the existing test suite to verify (run with `cd repo && python -m pytest <file_or_pattern>`).",
|
|
464
|
+
"- Only changes inside `./repo/` are scored; the orchestrator runs `git diff HEAD` from `./repo/` to extract your patch.",
|
|
465
|
+
"",
|
|
466
|
+
"WHAT NOT TO DO",
|
|
467
|
+
"- Do NOT create README/SUMMARY/GUIDE/PATCH/SOLUTION/COMPLETION files. Edit the source.",
|
|
468
|
+
"- Do NOT write standalone scratch scripts at the worktree root — only edit files under `./repo/`.",
|
|
469
|
+
"- Do NOT modify `.venv/`, generated `_version.py` files, or anything outside `./repo/`.",
|
|
470
|
+
"",
|
|
471
|
+
"PROBLEM STATEMENT",
|
|
472
|
+
problemStatement,
|
|
473
|
+
].join("\n");
|
|
474
|
+
}
|
|
372
475
|
function defaultRunArgs(opts) {
|
|
373
476
|
const args = [
|
|
374
477
|
opts.ohEntry,
|
|
package/dist/evals/run-writer.d.ts
CHANGED
|
@@ -26,6 +26,9 @@ export declare class RunWriter {
|
|
|
26
26
|
private readonly results;
|
|
27
27
|
constructor(runDir: string, header: RunHeader);
|
|
28
28
|
appendResult(result: EvalsResult): void;
|
|
29
|
+
/** Load a result that was written in a prior run into the in-memory array without
|
|
30
|
+
* re-writing it to disk (used by the resume path so finalize() includes all results). */
|
|
31
|
+
preloadResult(result: EvalsResult): void;
|
|
29
32
|
loadExistingResults(): EvalsResult[];
|
|
30
33
|
finalize(opts: {
|
|
31
34
|
partial: boolean;
|
package/dist/evals/run-writer.js
CHANGED
|
@@ -37,6 +37,11 @@ export class RunWriter {
|
|
|
37
37
|
writeFileSync(tmp, JSON.stringify(preds, null, 2));
|
|
38
38
|
renameSync(tmp, join(this.runDir, "predictions.json"));
|
|
39
39
|
}
|
|
40
|
+
/** Load a result that was written in a prior run into the in-memory array without
|
|
41
|
+
* re-writing it to disk (used by the resume path so finalize() includes all results). */
|
|
42
|
+
preloadResult(result) {
|
|
43
|
+
this.results.push(result);
|
|
44
|
+
}
|
|
40
45
|
loadExistingResults() {
|
|
41
46
|
const path = join(this.runDir, "results.jsonl");
|
|
42
47
|
if (!existsSync(path))
|
package/dist/evals/scorer.js
CHANGED
|
@@ -12,6 +12,33 @@
|
|
|
12
12
|
import { spawnSync } from "node:child_process";
|
|
13
13
|
import { existsSync, readFileSync } from "node:fs";
|
|
14
14
|
import { join } from "node:path";
|
|
15
|
+
/** Convert pytest junit-xml classname/name (+ optional file= attr) into the
|
|
16
|
+
* pytest-style id that SWE-bench uses: `path/to/file.py::[Class::]test_name`.
|
|
17
|
+
* Returns null if a sensible id can't be built. */
|
|
18
|
+
function pytestStyleId(cn, name, file) {
|
|
19
|
+
let fileNorm;
|
|
20
|
+
let classTail;
|
|
21
|
+
if (file) {
|
|
22
|
+
fileNorm = file.replace(/\\/g, "/");
|
|
23
|
+
const moduleFromFile = fileNorm.replace(/\.py$/, "").replace(/\//g, ".");
|
|
24
|
+
classTail = cn.startsWith(`${moduleFromFile}.`) ? cn.slice(moduleFromFile.length + 1) : "";
|
|
25
|
+
}
|
|
26
|
+
else {
|
|
27
|
+
// No `file=` attribute (older pytest / minimal junit-xml). Derive the
|
|
28
|
+
// path from classname: trailing PascalCase segments are class names,
|
|
29
|
+
// the rest is the dotted module path → file is module/path.py.
|
|
30
|
+
const parts = cn.split(".");
|
|
31
|
+
const classParts = [];
|
|
32
|
+
while (parts.length > 0 && /^[A-Z]/.test(parts[parts.length - 1] ?? "")) {
|
|
33
|
+
classParts.unshift(parts.pop());
|
|
34
|
+
}
|
|
35
|
+
if (parts.length === 0)
|
|
36
|
+
return null;
|
|
37
|
+
fileNorm = `${parts.join("/")}.py`;
|
|
38
|
+
classTail = classParts.join("::");
|
|
39
|
+
}
|
|
40
|
+
return classTail ? `${fileNorm}::${classTail}::${name}` : `${fileNorm}::${name}`;
|
|
41
|
+
}
|
|
15
42
|
/**
|
|
16
43
|
* Minimal junit-xml parser. Returns a map of "<classname>.<name>" → outcome.
|
|
17
44
|
*
|
|
@@ -27,17 +54,19 @@ export function parseJunitXml(xml) {
|
|
|
27
54
|
const inner = match[2] ?? "";
|
|
28
55
|
const cn = /classname="([^"]*)"/.exec(attrs)?.[1];
|
|
29
56
|
const name = /\bname="([^"]*)"/.exec(attrs)?.[1];
|
|
57
|
+
const file = /\bfile="([^"]*)"/.exec(attrs)?.[1];
|
|
30
58
|
if (cn && name) {
|
|
31
|
-
|
|
32
|
-
if (/<failure\b/.test(inner) || /<error\b/.test(inner))
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
59
|
+
let outcome = "pass";
|
|
60
|
+
if (/<failure\b/.test(inner) || /<error\b/.test(inner))
|
|
61
|
+
outcome = "fail";
|
|
62
|
+
else if (/<skipped\b/.test(inner))
|
|
63
|
+
outcome = "skip";
|
|
64
|
+
// Emit BOTH a dotted classname.name id (legacy) and pytest-style
|
|
65
|
+
// file::[Class::]name ids so SWE-bench-format expected IDs match.
|
|
66
|
+
out[`${cn}.${name}`] = outcome;
|
|
67
|
+
const ptid = pytestStyleId(cn, name, file);
|
|
68
|
+
if (ptid)
|
|
69
|
+
out[ptid] = outcome;
|
|
41
70
|
}
|
|
42
71
|
match = testcaseRe.exec(xml);
|
|
43
72
|
}
|
|
@@ -53,22 +82,28 @@ export async function scoreTask(args) {
|
|
|
53
82
|
const oracleSh = join(fixtureDir, "oracle.sh");
|
|
54
83
|
const oracleMjs = join(fixtureDir, "oracle.mjs");
|
|
55
84
|
if (existsSync(oracleSh)) {
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
WORKTREE_DIR: worktreeDir,
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
85
|
+
// Invoke /bin/sh explicitly so oracle.sh runs without the execute bit.
|
|
86
|
+
// Files committed from Windows or via writeFileSync default to mode 100644.
|
|
87
|
+
const r = process.platform === "win32"
|
|
88
|
+
? spawnSync(oracleSh, [], {
|
|
89
|
+
cwd: worktreeDir,
|
|
90
|
+
env: { ...process.env, INSTANCE_ID: task.instance_id, WORKTREE_DIR: worktreeDir, FIXTURE_DIR: fixtureDir },
|
|
91
|
+
timeout: testTimeoutMs,
|
|
92
|
+
shell: true,
|
|
93
|
+
})
|
|
94
|
+
: spawnSync("/bin/sh", [oracleSh], {
|
|
95
|
+
cwd: worktreeDir,
|
|
96
|
+
env: { ...process.env, INSTANCE_ID: task.instance_id, WORKTREE_DIR: worktreeDir, FIXTURE_DIR: fixtureDir },
|
|
97
|
+
timeout: testTimeoutMs,
|
|
98
|
+
});
|
|
99
|
+
// Oracle exit code is the pass/fail signal — do NOT set error_message for a clean
|
|
100
|
+
// non-zero exit (that means "test failed", not "scoring errored"). Only flag when
|
|
101
|
+
// the process itself failed to run (killed, spawn error, etc.).
|
|
67
102
|
return {
|
|
68
103
|
resolved: r.status === 0,
|
|
69
104
|
tests_status: EMPTY_TESTS_STATUS,
|
|
70
105
|
oracle_used: true,
|
|
71
|
-
error_message: r.status ===
|
|
106
|
+
error_message: r.status === null ? `oracle.sh did not exit cleanly: signal=${r.signal}` : undefined,
|
|
72
107
|
};
|
|
73
108
|
}
|
|
74
109
|
if (existsSync(oracleMjs)) {
|
|
@@ -86,16 +121,27 @@ export async function scoreTask(args) {
|
|
|
86
121
|
resolved: r.status === 0,
|
|
87
122
|
tests_status: EMPTY_TESTS_STATUS,
|
|
88
123
|
oracle_used: true,
|
|
89
|
-
error_message: r.status ===
|
|
124
|
+
error_message: r.status === null ? `oracle.mjs did not exit cleanly: signal=${r.signal}` : undefined,
|
|
90
125
|
};
|
|
91
126
|
}
|
|
92
127
|
// (2) Default test command.
|
|
93
|
-
|
|
128
|
+
// Run via bash so the venv is activated; cd into ./repo first if it exists
|
|
129
|
+
// (real SWE-bench packs put project source there). For synthetic packs
|
|
130
|
+
// without a repo/ subdir, run from the worktree root.
|
|
131
|
+
const hasRepo = existsSync(join(worktreeDir, "repo"));
|
|
132
|
+
const venvActivate = process.platform === "win32"
|
|
133
|
+
? "[ -f .venv/Scripts/activate ] && source .venv/Scripts/activate"
|
|
134
|
+
: "[ -f .venv/bin/activate ] && source .venv/bin/activate";
|
|
135
|
+
const cdRepo = hasRepo ? "cd repo && " : "";
|
|
136
|
+
const r = spawnSync("bash", ["-c", `${venvActivate}; ${cdRepo}${packDefaultTestCommand}`], {
|
|
94
137
|
cwd: worktreeDir,
|
|
95
|
-
shell: true,
|
|
96
138
|
timeout: testTimeoutMs,
|
|
97
139
|
});
|
|
98
|
-
|
|
140
|
+
// Test command writes junit-xml relative to its CWD. Prefer repo/ when it
|
|
141
|
+
// exists; fall back to worktree root for synthetic/legacy packs.
|
|
142
|
+
const xmlPathRepo = join(worktreeDir, "repo", ".oh-evals-results.xml");
|
|
143
|
+
const xmlPathRoot = join(worktreeDir, ".oh-evals-results.xml");
|
|
144
|
+
const xmlPath = existsSync(xmlPathRepo) ? xmlPathRepo : xmlPathRoot;
|
|
99
145
|
if (!existsSync(xmlPath)) {
|
|
100
146
|
return {
|
|
101
147
|
resolved: false,
|
package/dist/harness/config.d.ts
CHANGED
|
@@ -253,6 +253,15 @@ export type OhConfig = {
|
|
|
253
253
|
* call-site that already uses `safeEnv()` picks this up automatically.
|
|
254
254
|
*/
|
|
255
255
|
env?: Record<string, string>;
|
|
256
|
+
/**
|
|
257
|
+
* Per-skill visibility overrides. Keys are skill names (e.g. "my-skill" or
|
|
258
|
+
* "plugin:skill-name"). Values:
|
|
259
|
+
* "off" — hidden from model AND from the slash picker
|
|
260
|
+
* "user-invocable-only" — hidden from model, still shows in /skills + slash picker
|
|
261
|
+
* "name-only" — shown to model but description collapsed to name only
|
|
262
|
+
* Mirrors Claude Code's `skillOverrides` setting.
|
|
263
|
+
*/
|
|
264
|
+
skillOverrides?: Record<string, "off" | "user-invocable-only" | "name-only">;
|
|
256
265
|
};
|
|
257
266
|
/** Clear cached config (call after writes or to force re-read) */
|
|
258
267
|
export declare function invalidateConfigCache(): void;
|
package/dist/harness/cost.js
CHANGED
|
@@ -60,7 +60,18 @@ export const MODEL_PRICING = {
|
|
|
60
60
|
"qwen-turbo": [0.2, 0.6],
|
|
61
61
|
};
|
|
62
62
|
export function estimateCost(model, inputTokens, outputTokens) {
|
|
63
|
-
|
|
63
|
+
// Exact match first; otherwise prefix-match so dated model IDs like
|
|
64
|
+
// "claude-haiku-4-5-20251001" resolve to "claude-haiku-4-5".
|
|
65
|
+
let pricing = MODEL_PRICING[model];
|
|
66
|
+
if (!pricing) {
|
|
67
|
+
let bestKey = "";
|
|
68
|
+
for (const key of Object.keys(MODEL_PRICING)) {
|
|
69
|
+
if (model.startsWith(key) && key.length > bestKey.length)
|
|
70
|
+
bestKey = key;
|
|
71
|
+
}
|
|
72
|
+
if (bestKey)
|
|
73
|
+
pricing = MODEL_PRICING[bestKey];
|
|
74
|
+
}
|
|
64
75
|
if (!pricing)
|
|
65
76
|
return 0;
|
|
66
77
|
return (inputTokens / 1_000_000) * pricing[0] + (outputTokens / 1_000_000) * pricing[1];
|
|
package/dist/harness/plugins.d.ts
CHANGED
|
@@ -54,6 +54,8 @@ export type AgentTeamConfig = {
|
|
|
54
54
|
tools?: string[];
|
|
55
55
|
}>;
|
|
56
56
|
};
|
|
57
|
+
/** Register an extra plugin directory for this session (not persisted). */
|
|
58
|
+
export declare function addExtraPluginDir(dir: string): void;
|
|
57
59
|
/** Discover all available skills from bundled + project + global dirs + installed plugins */
|
|
58
60
|
export declare function discoverSkills(): SkillMetadata[];
|
|
59
61
|
/** Find a skill by name (case-insensitive) */
|
|
@@ -73,5 +75,5 @@ export declare function loadPluginManifest(dir: string): PluginManifest | null;
|
|
|
73
75
|
/** Discover plugins from node_modules */
|
|
74
76
|
export declare function discoverPlugins(): PluginManifest[];
|
|
75
77
|
/** Build a prompt listing available skills for the LLM */
|
|
76
|
-
export declare function skillsToPrompt(skills: SkillMetadata[]): string;
|
|
78
|
+
export declare function skillsToPrompt(skills: SkillMetadata[], overrides?: Record<string, "off" | "user-invocable-only" | "name-only">): string;
|
|
77
79
|
//# sourceMappingURL=plugins.d.ts.map
|
package/dist/harness/plugins.js
CHANGED
|
@@ -15,6 +15,13 @@ import { homedir } from "node:os";
|
|
|
15
15
|
import { dirname, join, relative } from "node:path";
|
|
16
16
|
import { fileURLToPath } from "node:url";
|
|
17
17
|
import { getInstalledPlugins } from "./marketplace.js";
|
|
18
|
+
/** Session-scoped extra plugin directories registered via --plugin-dir / --plugin-url. */
|
|
19
|
+
const extraPluginDirs = [];
|
|
20
|
+
/** Register an extra plugin directory for this session (not persisted). */
|
|
21
|
+
export function addExtraPluginDir(dir) {
|
|
22
|
+
if (!extraPluginDirs.includes(dir))
|
|
23
|
+
extraPluginDirs.push(dir);
|
|
24
|
+
}
|
|
18
25
|
const PROJECT_SKILLS_DIR = join(".oh", "skills");
|
|
19
26
|
const GLOBAL_SKILLS_DIR = join(homedir(), ".oh", "skills");
|
|
20
27
|
// Claude Code ecosystem mirror paths (Anthropic convention)
|
|
@@ -192,6 +199,17 @@ export function discoverSkills() {
|
|
|
192
199
|
catch {
|
|
193
200
|
/* marketplace module may not be loaded yet */
|
|
194
201
|
}
|
|
202
|
+
// Session-scoped extra plugin dirs (--plugin-dir / --plugin-url)
|
|
203
|
+
for (const dir of extraPluginDirs) {
|
|
204
|
+
const pluginSkillsDir = join(dir, "skills");
|
|
205
|
+
const pluginSkills = loadSkillsFromDir(pluginSkillsDir, "plugin");
|
|
206
|
+
const manifest = loadPluginManifest(dir);
|
|
207
|
+
const pluginName = manifest?.name ?? dir.split(/[/\\]/).pop() ?? "extra";
|
|
208
|
+
for (const skill of pluginSkills) {
|
|
209
|
+
skill.name = `${pluginName}:${skill.name}`;
|
|
210
|
+
}
|
|
211
|
+
skills.push(...pluginSkills);
|
|
212
|
+
}
|
|
195
213
|
// De-duplicate by name+filePath: if same skill appears in multiple paths (e.g. CC mirror), keep first.
|
|
196
214
|
const seen = new Set();
|
|
197
215
|
return skills.filter((s) => {
|
|
@@ -283,12 +301,21 @@ export function discoverPlugins() {
|
|
|
283
301
|
return plugins;
|
|
284
302
|
}
|
|
285
303
|
/** Build a prompt listing available skills for the LLM */
|
|
286
|
-
export function skillsToPrompt(skills) {
|
|
287
|
-
//
|
|
288
|
-
const visible = skills.filter((s) =>
|
|
304
|
+
export function skillsToPrompt(skills, overrides) {
|
|
305
|
+
// invokeModel:false hides from model; "off" and "user-invocable-only" overrides also hide from model.
|
|
306
|
+
const visible = skills.filter((s) => {
|
|
307
|
+
if (s.invokeModel === false)
|
|
308
|
+
return false;
|
|
309
|
+
const ov = overrides?.[s.name];
|
|
310
|
+
return ov !== "off" && ov !== "user-invocable-only";
|
|
311
|
+
});
|
|
289
312
|
if (visible.length === 0)
|
|
290
313
|
return "";
|
|
291
|
-
const lines = visible.map((s) =>
|
|
314
|
+
const lines = visible.map((s) => {
|
|
315
|
+
const desc = overrides?.[s.name] === "name-only" ? "" : `: ${s.description}`;
|
|
316
|
+
const trigger = overrides?.[s.name] === "name-only" ? "" : s.trigger ? ` (auto-trigger: "${s.trigger}")` : "";
|
|
317
|
+
return `- ${s.name}${desc}${trigger}`;
|
|
318
|
+
});
|
|
292
319
|
return `# Available Skills\nUse the Skill tool to invoke these:\n${lines.join("\n")}`;
|
|
293
320
|
}
|
|
294
321
|
//# sourceMappingURL=plugins.js.map
|
package/dist/main.js
CHANGED
|
@@ -21,7 +21,7 @@ import { emitHook, setHookDecisionObserver } from "./harness/hooks.js";
|
|
|
21
21
|
import { languageToPrompt } from "./harness/language.js";
|
|
22
22
|
import { loadActiveMemories, memoriesToPrompt, userProfileToPrompt } from "./harness/memory.js";
|
|
23
23
|
import { detectProject, projectContextToPrompt } from "./harness/onboarding.js";
|
|
24
|
-
import { discoverSkills, skillsToPrompt } from "./harness/plugins.js";
|
|
24
|
+
import { addExtraPluginDir, discoverSkills, skillsToPrompt } from "./harness/plugins.js";
|
|
25
25
|
import { createRulesFile, loadRules, loadRulesAsPrompt } from "./harness/rules.js";
|
|
26
26
|
import { listSessions } from "./harness/session.js";
|
|
27
27
|
import { connectedMcpServers, disconnectMcpClients, getMcpInstructions, loadMcpPrompts, loadMcpTools, parseMcpConfigFile, } from "./mcp/loader.js";
|
|
@@ -164,7 +164,7 @@ function buildSystemPrompt(model, opts = {}) {
|
|
|
164
164
|
parts.push(memoriesPrompt);
|
|
165
165
|
// Available skills (Level 0 — names + descriptions only)
|
|
166
166
|
const skills = discoverSkills();
|
|
167
|
-
const skillsPrompt = skillsToPrompt(skills);
|
|
167
|
+
const skillsPrompt = skillsToPrompt(skills, cfg?.skillOverrides);
|
|
168
168
|
if (skillsPrompt)
|
|
169
169
|
parts.push(skillsPrompt);
|
|
170
170
|
// MCP server instructions (sandboxed — treat as untrusted)
|
|
@@ -366,7 +366,12 @@ program
|
|
|
366
366
|
if (outputFormat === "stream-json") {
|
|
367
367
|
console.log(JSON.stringify({ type: "turnStart", turnNumber: 0 }));
|
|
368
368
|
}
|
|
369
|
-
|
|
369
|
+
// Track cumulative cost + turn count so stream-json mode can emit a final
|
|
370
|
+
// `result` event (consumed by `oh evals` and SDK callers).
|
|
371
|
+
let cumulativeCost = 0;
|
|
372
|
+
let turnsCompleted = 0;
|
|
373
|
+
let lastTurnReason = "ok";
|
|
374
|
+
for await (const event of query(prompt, { ...config, sessionId }, priorMessages)) {
|
|
370
375
|
if (event.type === "text_delta") {
|
|
371
376
|
fullOutput += event.content;
|
|
372
377
|
if (outputFormat === "text")
|
|
@@ -408,6 +413,7 @@ program
|
|
|
408
413
|
}
|
|
409
414
|
}
|
|
410
415
|
else if (event.type === "cost_update") {
|
|
416
|
+
cumulativeCost += event.cost;
|
|
411
417
|
if (outputFormat === "stream-json") {
|
|
412
418
|
console.log(JSON.stringify({
|
|
413
419
|
type: "cost_update",
|
|
@@ -419,6 +425,8 @@ program
|
|
|
419
425
|
}
|
|
420
426
|
}
|
|
421
427
|
else if (event.type === "turn_complete") {
|
|
428
|
+
turnsCompleted += 1;
|
|
429
|
+
lastTurnReason = event.reason;
|
|
422
430
|
if (outputFormat === "stream-json") {
|
|
423
431
|
console.log(JSON.stringify({ type: "turn_complete", reason: event.reason }));
|
|
424
432
|
}
|
|
@@ -431,6 +439,15 @@ program
|
|
|
431
439
|
}
|
|
432
440
|
}
|
|
433
441
|
}
|
|
442
|
+
if (outputFormat === "stream-json") {
|
|
443
|
+
console.log(JSON.stringify({
|
|
444
|
+
type: "result",
|
|
445
|
+
subtype: lastTurnReason,
|
|
446
|
+
total_cost_usd: cumulativeCost,
|
|
447
|
+
num_turns: turnsCompleted,
|
|
448
|
+
result: fullOutput,
|
|
449
|
+
}));
|
|
450
|
+
}
|
|
434
451
|
if (outputFormat === "json") {
|
|
435
452
|
console.log(JSON.stringify({ output: fullOutput, tools: toolResults }, null, 2));
|
|
436
453
|
}
|
|
@@ -632,7 +649,7 @@ program
|
|
|
632
649
|
permissionMode,
|
|
633
650
|
});
|
|
634
651
|
console.log(JSON.stringify({ id, type: "turnStart", turnNumber: turnIdx }));
|
|
635
|
-
for await (const event of query(prompt, config, conversation)) {
|
|
652
|
+
for await (const event of query(prompt, { ...config, sessionId }, conversation)) {
|
|
636
653
|
if (event.type === "text_delta") {
|
|
637
654
|
assistantText += event.content;
|
|
638
655
|
console.log(JSON.stringify({ id, type: "text", content: event.content }));
|
|
@@ -1535,6 +1552,74 @@ program
|
|
|
1535
1552
|
}, intervalMs);
|
|
1536
1553
|
process.stderr.write(`[schedule] Running every ${opts.interval} minutes. Ctrl+C to stop.\n`);
|
|
1537
1554
|
});
|
|
1555
|
+
// ── --plugin-dir / --plugin-url (session-scoped extra plugins) ──
|
|
1556
|
+
// Added as global options so they work with any subcommand (run, session, REPL).
|
|
1557
|
+
program
|
|
1558
|
+
.option("--plugin-dir <path>", "Load a plugin from a local directory for this session (not persisted)")
|
|
1559
|
+
.option("--plugin-url <url>", "Download a plugin .zip or .tar.gz from a URL and load it for this session");
|
|
1560
|
+
program.hook("preAction", async () => {
|
|
1561
|
+
const opts = program.opts();
|
|
1562
|
+
if (opts.pluginDir) {
|
|
1563
|
+
addExtraPluginDir(opts.pluginDir);
|
|
1564
|
+
}
|
|
1565
|
+
if (opts.pluginUrl) {
|
|
1566
|
+
const { get: httpsGet } = await import("node:https");
|
|
1567
|
+
const { createWriteStream, mkdirSync: fsMkdir, readdirSync: fsReaddir } = await import("node:fs");
|
|
1568
|
+
const { mkdtempSync } = await import("node:fs");
|
|
1569
|
+
const { tmpdir } = await import("node:os");
|
|
1570
|
+
const { execFileSync: execFile } = await import("node:child_process");
|
|
1571
|
+
const url = opts.pluginUrl;
|
|
1572
|
+
const tmp = mkdtempSync(join(tmpdir(), "oh-plugin-"));
|
|
1573
|
+
const isZip = url.endsWith(".zip");
|
|
1574
|
+
const archiveName = isZip ? "plugin.zip" : "plugin.tar.gz";
|
|
1575
|
+
const archivePath = join(tmp, archiveName);
|
|
1576
|
+
await new Promise((resolve, reject) => {
|
|
1577
|
+
function follow(u, depth = 0) {
|
|
1578
|
+
if (depth > 5) {
|
|
1579
|
+
reject(new Error("too many redirects"));
|
|
1580
|
+
return;
|
|
1581
|
+
}
|
|
1582
|
+
httpsGet(u, (res) => {
|
|
1583
|
+
if (res.statusCode === 301 || res.statusCode === 302) {
|
|
1584
|
+
follow(res.headers.location ?? u, depth + 1);
|
|
1585
|
+
}
|
|
1586
|
+
else if (res.statusCode !== 200) {
|
|
1587
|
+
reject(new Error(`HTTP ${res.statusCode} fetching plugin from ${u}`));
|
|
1588
|
+
}
|
|
1589
|
+
else {
|
|
1590
|
+
const out = createWriteStream(archivePath);
|
|
1591
|
+
res.pipe(out);
|
|
1592
|
+
out.on("finish", resolve);
|
|
1593
|
+
out.on("error", reject);
|
|
1594
|
+
}
|
|
1595
|
+
}).on("error", reject);
|
|
1596
|
+
}
|
|
1597
|
+
follow(url);
|
|
1598
|
+
});
|
|
1599
|
+
const extractDir = join(tmp, "plugin");
|
|
1600
|
+
fsMkdir(extractDir, { recursive: true });
|
|
1601
|
+
if (isZip) {
|
|
1602
|
+
execFile("unzip", ["-q", archivePath, "-d", extractDir]);
|
|
1603
|
+
}
|
|
1604
|
+
else {
|
|
1605
|
+
execFile("tar", ["-xzf", archivePath], { cwd: extractDir });
|
|
1606
|
+
}
|
|
1607
|
+
// If the archive produced a single top-level dir, step into it (common convention).
|
|
1608
|
+
const { statSync: fsStat } = await import("node:fs");
|
|
1609
|
+
const entries = fsReaddir(extractDir);
|
|
1610
|
+
const singleDir = entries.length === 1 &&
|
|
1611
|
+
(() => {
|
|
1612
|
+
try {
|
|
1613
|
+
return fsStat(join(extractDir, entries[0])).isDirectory();
|
|
1614
|
+
}
|
|
1615
|
+
catch {
|
|
1616
|
+
return false;
|
|
1617
|
+
}
|
|
1618
|
+
})();
|
|
1619
|
+
const pluginRoot = singleDir ? join(extractDir, entries[0]) : extractDir;
|
|
1620
|
+
addExtraPluginDir(pluginRoot);
|
|
1621
|
+
}
|
|
1622
|
+
});
|
|
1538
1623
|
program.parseAsync(process.argv).catch((err) => {
|
|
1539
1624
|
console.error(err instanceof Error ? err.message : String(err));
|
|
1540
1625
|
process.exitCode = 1;
|
package/dist/mcp/loader.d.ts
CHANGED
|
@@ -32,6 +32,8 @@ export declare function loadMcpTools(opts?: LoadMcpOptions): Promise<Tool[]>;
|
|
|
32
32
|
export declare function disconnectMcpClients(): void;
|
|
33
33
|
/** Names of connected MCP servers */
|
|
34
34
|
export declare function connectedMcpServers(): string[];
|
|
35
|
+
/** Tool count for a connected MCP server, or undefined if not connected. */
|
|
36
|
+
export declare function mcpServerToolCount(name: string): number | undefined;
|
|
35
37
|
export type McpPromptHandle = {
|
|
36
38
|
/** `<server>:<prompt>` qualified name — the slash command is `/<server>:<prompt>`. */
|
|
37
39
|
qualifiedName: string;
|
package/dist/mcp/loader.js
CHANGED
|
@@ -48,6 +48,7 @@ export function parseMcpConfigFile(path) {
|
|
|
48
48
|
return servers;
|
|
49
49
|
}
|
|
50
50
|
const connectedClients = [];
|
|
51
|
+
const serverToolCount = new Map();
|
|
51
52
|
let exitHandlerInstalled = false;
|
|
52
53
|
function installExitHandler() {
|
|
53
54
|
if (exitHandlerInstalled)
|
|
@@ -104,6 +105,7 @@ export async function loadMcpTools(opts = {}) {
|
|
|
104
105
|
}
|
|
105
106
|
const { client, defs, server } = result.value;
|
|
106
107
|
connectedClients.push(client);
|
|
108
|
+
serverToolCount.set(server.name, defs.length);
|
|
107
109
|
debug("mcp", "connected", { server: server.name, tools: defs.length, deferred: defs.length > DEFERRED_THRESHOLD });
|
|
108
110
|
if (defs.length > DEFERRED_THRESHOLD) {
|
|
109
111
|
for (const def of defs) {
|
|
@@ -129,11 +131,16 @@ export function disconnectMcpClients() {
|
|
|
129
131
|
}
|
|
130
132
|
}
|
|
131
133
|
connectedClients.length = 0;
|
|
134
|
+
serverToolCount.clear();
|
|
132
135
|
}
|
|
133
136
|
/** Names of connected MCP servers */
|
|
134
137
|
export function connectedMcpServers() {
|
|
135
138
|
return connectedClients.map((c) => c.name);
|
|
136
139
|
}
|
|
140
|
+
/** Tool count for a connected MCP server, or undefined if not connected. */
|
|
141
|
+
export function mcpServerToolCount(name) {
|
|
142
|
+
return serverToolCount.get(name);
|
|
143
|
+
}
|
|
137
144
|
/**
|
|
138
145
|
* Enumerate prompts on every already-connected MCP server. Servers that don't
|
|
139
146
|
* implement the `prompts/list` capability return an empty list (handled
|
|
package/dist/providers/anthropic.js
CHANGED
|
@@ -87,10 +87,11 @@ export class AnthropicProvider {
|
|
|
87
87
|
// Prompt caching: send system prompt as content blocks with cache_control.
|
|
88
88
|
// Anthropic caches matching prefixes — 90% cost reduction on repeat turns.
|
|
89
89
|
const systemBlocks = [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }];
|
|
90
|
-
// Scale max_tokens and thinking budget based on model
|
|
90
|
+
// Scale max_tokens and thinking budget based on model.
|
|
91
|
+
// Anthropic requires max_tokens > thinking.budget_tokens.
|
|
91
92
|
const isOpus = m.includes("opus");
|
|
92
|
-
const maxTokens = isOpus ?
|
|
93
|
-
const thinkingBudget = isOpus ?
|
|
93
|
+
const maxTokens = isOpus ? 32768 : 16384;
|
|
94
|
+
const thinkingBudget = isOpus ? 24576 : 8192;
|
|
94
95
|
const body = {
|
|
95
96
|
model: m,
|
|
96
97
|
max_tokens: maxTokens,
|
|
@@ -293,7 +294,18 @@ export class AnthropicProvider {
|
|
|
293
294
|
return createAssistantMessage(content, toolCalls.length ? toolCalls : undefined);
|
|
294
295
|
}
|
|
295
296
|
getModelInfo(id) {
|
|
296
|
-
|
|
297
|
+
// Exact match first; otherwise prefix-match so dated model IDs like
|
|
298
|
+
// "claude-haiku-4-5-20251001" resolve to "claude-haiku-4-5".
|
|
299
|
+
const models = this.listModels();
|
|
300
|
+
const exact = models.find((m) => m.id === id);
|
|
301
|
+
if (exact)
|
|
302
|
+
return exact;
|
|
303
|
+
let best;
|
|
304
|
+
for (const m of models) {
|
|
305
|
+
if (id.startsWith(m.id) && (!best || m.id.length > best.id.length))
|
|
306
|
+
best = m;
|
|
307
|
+
}
|
|
308
|
+
return best;
|
|
297
309
|
}
|
|
298
310
|
listModels() {
|
|
299
311
|
return [
|
package/dist/query/index.js
CHANGED
|
@@ -58,6 +58,7 @@ export async function* query(userMessage, config, existingMessages = []) {
|
|
|
58
58
|
gitCommitPerTool: config.gitCommitPerTool,
|
|
59
59
|
tracer: config.tracer,
|
|
60
60
|
parentSpanId: querySpanId,
|
|
61
|
+
sessionId: config.sessionId,
|
|
61
62
|
};
|
|
62
63
|
const estimateTokens = makeTokenEstimator(config.provider);
|
|
63
64
|
const contextManager = new ContextManager(undefined, config.model);
|
package/dist/query/types.d.ts
CHANGED
|
@@ -35,6 +35,8 @@ export type QueryConfig = {
|
|
|
35
35
|
permissionPromptTool?: string;
|
|
36
36
|
/** Optional session tracer. When set, query() emits `query` and `tool:<Name>` spans. */
|
|
37
37
|
tracer?: SessionTracer;
|
|
38
|
+
/** Session ID injected into Bash subprocess env as OH_SESSION_ID. */
|
|
39
|
+
sessionId?: string;
|
|
38
40
|
};
|
|
39
41
|
export type TransitionReason = "next_turn" | "retry_network" | "retry_prompt_too_long" | "retry_max_output_tokens";
|
|
40
42
|
export type QueryLoopState = {
|
package/dist/repl.js
CHANGED
|
package/dist/tools/BashTool/index.js
CHANGED
|
@@ -52,7 +52,7 @@ export const BashTool = {
|
|
|
52
52
|
const bgId = Date.now().toString(36) + Math.random().toString(36).slice(2, 6);
|
|
53
53
|
const proc = spawn(shell, shellArgs, {
|
|
54
54
|
cwd: context.workingDir,
|
|
55
|
-
env: safeEnv(),
|
|
55
|
+
env: safeEnv(context.sessionId ? { OH_SESSION_ID: context.sessionId } : undefined),
|
|
56
56
|
stdio: ["ignore", "pipe", "pipe"],
|
|
57
57
|
detached: false,
|
|
58
58
|
...extraSpawnOpts,
|
|
@@ -98,7 +98,7 @@ export const BashTool = {
|
|
|
98
98
|
let killed = false;
|
|
99
99
|
const proc = spawn(shell, shellArgs, {
|
|
100
100
|
cwd: context.workingDir,
|
|
101
|
-
env: safeEnv(),
|
|
101
|
+
env: safeEnv(context.sessionId ? { OH_SESSION_ID: context.sessionId } : undefined),
|
|
102
102
|
stdio: ["ignore", "pipe", "pipe"],
|
|
103
103
|
...extraSpawnOpts,
|
|
104
104
|
});
|
package/dist/utils/safe-env.js
CHANGED