@zhijiewang/openharness 2.38.0 → 2.40.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -0
- package/README.zh-CN.md +35 -0
- package/dist/commands/info.js +14 -15
- package/dist/evals/cli.d.ts +22 -0
- package/dist/evals/cli.js +214 -0
- package/dist/evals/index.d.ts +12 -0
- package/dist/evals/index.js +8 -0
- package/dist/evals/orchestrator.d.ts +64 -0
- package/dist/evals/orchestrator.js +391 -0
- package/dist/evals/pack-loader.d.ts +29 -0
- package/dist/evals/pack-loader.js +153 -0
- package/dist/evals/run-writer.d.ts +35 -0
- package/dist/evals/run-writer.js +94 -0
- package/dist/evals/scorer.d.ts +34 -0
- package/dist/evals/scorer.js +127 -0
- package/dist/evals/types.d.ts +74 -0
- package/dist/evals/types.js +10 -0
- package/dist/harness/sandbox.d.ts +34 -0
- package/dist/harness/sandbox.js +104 -0
- package/dist/harness/traces.d.ts +25 -0
- package/dist/harness/traces.js +168 -0
- package/dist/main.js +3 -0
- package/dist/tools/GrepTool/index.d.ts +4 -4
- package/package.json +1 -1
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* oh evals — run orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* Coordinates the full run lifecycle:
|
|
5
|
+
* - manages a concurrency pool of N parallel task workers
|
|
6
|
+
* - per task: extract repo tarball → setup.sh → spawn `oh run` subprocess
|
|
7
|
+
* → tee stdout to transcript file + parse stream-json → git diff →
|
|
8
|
+
* scoreTask → RunWriter.appendResult → cleanup worktree
|
|
9
|
+
* - aggregates total cost; halts scheduling when total >= max_cost_usd
|
|
10
|
+
* - resumability: skip instance_ids already in results.jsonl
|
|
11
|
+
* - cancellation: cancel() sets flag, SIGTERMs running subs, then SIGKILL
|
|
12
|
+
*
|
|
13
|
+
* Subprocess command (no --working-dir flag — we use spawn's cwd option):
|
|
14
|
+
* node dist/main.js run --bare --output-format stream-json
|
|
15
|
+
* --no-session-persistence --max-budget-usd <cap> --max-turns <n>
|
|
16
|
+
* --model <model> "<problem_statement>"
|
|
17
|
+
*/
|
|
18
|
+
import { execFileSync, spawn, spawnSync } from "node:child_process";
import { createWriteStream, existsSync, mkdirSync, rmSync as nodeRmSync, readFileSync, statSync } from "node:fs";
import { join } from "node:path";
import { isGitRepo, removeWorktree } from "../git/index.js";
import { RunWriter } from "./run-writer.js";
import { scoreTask } from "./scorer.js";
|
|
24
|
+
export class RunOrchestrator {
    /** Run configuration: pack, tasks, budgets, concurrency, callbacks. */
    opts;
    /** Persists per-task results and run artifacts to disk as the run proceeds. */
    writer;
    /** Per-task budget: explicit maxTaskCostUsd, else an even split of the run budget. */
    perTaskCap;
    /** Set by cancel(); workers stop picking up new tasks. */
    cancelled = false;
    /** Set when totalCost reaches maxCostUsd; scheduling halts. */
    halted = false;
    /** Aggregate USD cost across completed (and previously resumed) tasks. */
    totalCost = 0;
    /** Child processes currently executing — targets for cancel() signalling. */
    running = new Set();
    /** instance_ids already recorded in results.jsonl when resuming a run. */
    skipIds = new Set();
    /**
     * Prepare a run: derive the per-task cost cap, open the RunWriter with the
     * run header, and — when resuming — preload completed instance_ids and
     * their accumulated cost from the prior results.jsonl.
     */
    constructor(opts) {
        this.opts = opts;
        // `??` binds looser than `/`: use maxTaskCostUsd when given, otherwise
        // divide the run budget evenly across tasks (guarding against 0 tasks).
        this.perTaskCap = opts.maxTaskCostUsd ?? opts.maxCostUsd / Math.max(1, opts.tasks.length);
        const harnessVersion = readHarnessVersion();
        this.writer = new RunWriter(opts.runDir, {
            run_id: pathBaseName(opts.runDir),
            pack: opts.pack.name,
            pack_version: opts.pack.version,
            model: opts.model,
            harness_version: harnessVersion,
            max_cost_usd: opts.maxCostUsd,
            started_at: new Date().toISOString(),
        });
        if (opts.resumeFromRunId) {
            const prior = this.writer.loadExistingResults();
            for (const r of prior) {
                this.skipIds.add(r.instance_id);
                // Resumed cost counts toward this run's budget.
                this.totalCost += r.cost_usd;
            }
        }
    }
    /** Stop scheduling new tasks and SIGTERM every running subprocess. */
    cancel() {
        this.cancelled = true;
        for (const child of this.running) {
            try {
                child.kill("SIGTERM");
            }
            catch {
                // already exited
            }
        }
    }
    /**
     * Drive the whole run with a pool of `concurrency` workers pulling from a
     * shared queue, then finalize artifacts. The run is marked partial when it
     * was cancelled or halted by the cost cap.
     */
    async run() {
        const queue = this.opts.tasks.filter((t) => !this.skipIds.has(t.instance_id));
        let nextIndex = 0;
        const concurrency = Math.max(1, this.opts.concurrency);
        const worker = async () => {
            while (!this.cancelled && !this.halted) {
                // Budget gate: stop scheduling once aggregate cost hits the cap.
                if (this.totalCost >= this.opts.maxCostUsd) {
                    this.halted = true;
                    break;
                }
                // Single-threaded event loop makes this shared increment race-free.
                const idx = nextIndex++;
                if (idx >= queue.length)
                    break;
                const task = queue[idx];
                this.opts.onTaskStart?.(task);
                const result = await this.runOneTask(task);
                this.totalCost += result.cost_usd;
                this.writer.appendResult(result);
                this.opts.onTaskComplete?.(result);
            }
        };
        await Promise.all(Array.from({ length: concurrency }, worker));
        return this.writer.finalize({
            partial: this.cancelled || this.halted,
            finished_at: new Date().toISOString(),
        });
    }
    /**
     * Execute one task end-to-end and always return a result record:
     * extract fixture → setup.sh → spawn subprocess (stdout teed to the
     * transcript file) → parse stream-json → git diff → score. Outcome
     * precedence is timeout → budget_exceeded → nonzero exit → scored.
     * The worktree is cleaned up in the finally block regardless of outcome.
     */
    async runOneTask(task) {
        const startedAt = new Date();
        const start = Date.now();
        const taskWorktreeBase = join(this.opts.runDir, "worktrees", task.instance_id);
        mkdirSync(taskWorktreeBase, { recursive: true });
        let worktreePath = null;
        let usedGitWorktree = false;
        try {
            // 1. Extract fixture into the per-task worktree dir.
            worktreePath = taskWorktreeBase;
            await extractFixture(this.opts.packDir, task.instance_id, worktreePath);
            // 2. Run setup.sh (creates a base commit so we can git diff later).
            const setupOk = await runSetupScript(this.opts.packDir, task.instance_id, worktreePath);
            if (!setupOk.ok) {
                return makeResult({
                    task,
                    status: "skipped",
                    resolved: false,
                    cost_usd: 0,
                    turns_used: 0,
                    duration_ms: Date.now() - start,
                    model_patch: "",
                    tests_status: emptyTestsStatus(),
                    transcript_path: `transcripts/${task.instance_id}.jsonl`,
                    error_message: `setup.sh failed: ${setupOk.error}`,
                    startedAt,
                });
            }
            usedGitWorktree = isGitRepo(worktreePath);
            // 3. Spawn the subprocess.
            const { exec, args } = this.opts.subprocessArgvBuilder
                ? this.opts.subprocessArgvBuilder(task, {
                    worktreeDir: worktreePath,
                    perTaskCostCap: this.perTaskCap,
                    maxTurns: this.opts.maxTaskTurns,
                    model: this.opts.model,
                })
                : {
                    exec: process.execPath,
                    args: defaultRunArgs({
                        ohEntry: this.opts.ohEntry ?? defaultOhEntry(),
                        perTaskCostCap: this.perTaskCap,
                        maxTurns: this.opts.maxTaskTurns,
                        model: this.opts.model,
                        fallbackModel: this.opts.fallbackModel,
                        prompt: task.problem_statement,
                    }),
                };
            const transcriptPath = join(this.opts.runDir, "transcripts", `${task.instance_id}.jsonl`);
            const transcriptStream = createWriteStream(transcriptPath);
            const child = spawn(exec, args, {
                cwd: worktreePath,
                env: { ...process.env },
                stdio: ["ignore", "pipe", "pipe"],
            });
            this.running.add(child);
            // Tee stdout to transcript file + parser.
            let stdoutBuf = "";
            child.stdout?.on("data", (chunk) => {
                transcriptStream.write(chunk);
                stdoutBuf += chunk.toString("utf-8");
            });
            let stderrBuf = "";
            child.stderr?.on("data", (chunk) => {
                stderrBuf += chunk.toString("utf-8");
            });
            // 4. Race subprocess vs timeout.
            let timedOut = false;
            const timeoutHandle = setTimeout(() => {
                timedOut = true;
                try {
                    child.kill("SIGKILL");
                }
                catch {
                    /* already exited */
                }
            }, this.opts.taskTimeoutMs);
            // "error" fires when the process could not be spawned at all;
            // resolve(null) then routes through the exitCode !== 0 branch below.
            const exitCode = await new Promise((resolve) => {
                child.once("exit", (code) => resolve(code));
                child.once("error", () => resolve(null));
            });
            clearTimeout(timeoutHandle);
            this.running.delete(child);
            // Flush the transcript stream before proceeding to cleanup. On Windows,
            // unflushed write streams can keep file handles open, which races with
            // worktree rmSync in the finally block.
            await new Promise((resolve) => {
                transcriptStream.end(() => resolve());
            });
            // 5. Parse stream-json result event.
            const parsed = parseStreamJsonResult(stdoutBuf);
            if (timedOut) {
                return makeResult({
                    task,
                    status: "timeout",
                    resolved: false,
                    cost_usd: parsed.cost_usd,
                    turns_used: parsed.turns_used,
                    duration_ms: Date.now() - start,
                    model_patch: usedGitWorktree ? captureGitDiff(worktreePath) : "",
                    tests_status: emptyTestsStatus(),
                    transcript_path: `transcripts/${task.instance_id}.jsonl`,
                    error_message: `task exceeded ${this.opts.taskTimeoutMs}ms timeout`,
                    startedAt,
                });
            }
            if (parsed.exit_reason === "budget_exceeded") {
                return makeResult({
                    task,
                    status: "budget_exceeded",
                    resolved: false,
                    cost_usd: parsed.cost_usd,
                    turns_used: parsed.turns_used,
                    duration_ms: Date.now() - start,
                    model_patch: usedGitWorktree ? captureGitDiff(worktreePath) : "",
                    tests_status: emptyTestsStatus(),
                    transcript_path: `transcripts/${task.instance_id}.jsonl`,
                    startedAt,
                });
            }
            if (exitCode !== 0) {
                return makeResult({
                    task,
                    status: "error",
                    resolved: false,
                    cost_usd: parsed.cost_usd,
                    turns_used: parsed.turns_used,
                    duration_ms: Date.now() - start,
                    model_patch: usedGitWorktree ? captureGitDiff(worktreePath) : "",
                    tests_status: emptyTestsStatus(),
                    transcript_path: `transcripts/${task.instance_id}.jsonl`,
                    error_message: `subprocess exit ${exitCode}: ${stderrBuf.slice(-500)}`,
                    startedAt,
                });
            }
            // 6. Capture model_patch.
            const modelPatch = usedGitWorktree ? captureGitDiff(worktreePath) : "";
            // 7. Score.
            const score = await scoreTask({
                task,
                worktreeDir: worktreePath,
                fixtureDir: join(this.opts.packDir, "fixtures", task.instance_id),
                packDefaultTestCommand: this.opts.pack.default_test_command,
                testTimeoutMs: this.opts.taskTimeoutMs,
            });
            // A scoring error trumps resolution; otherwise resolved/failed by score.
            const status = score.error_message !== undefined ? "error" : score.resolved ? "resolved" : "failed";
            return makeResult({
                task,
                status,
                resolved: score.resolved,
                cost_usd: parsed.cost_usd,
                turns_used: parsed.turns_used,
                duration_ms: Date.now() - start,
                model_patch: modelPatch,
                tests_status: score.tests_status,
                transcript_path: `transcripts/${task.instance_id}.jsonl`,
                error_message: score.error_message,
                startedAt,
            });
        }
        finally {
            // Clean up worktree (best-effort; swallow errors so a leak doesn't stop a run).
            if (worktreePath && existsSync(worktreePath)) {
                try {
                    if (usedGitWorktree)
                        removeWorktree(worktreePath);
                    // Also remove the temp dir tree under runDir/worktrees/<id> regardless.
                    rmSyncIfExists(worktreePath);
                }
                catch {
                    /* swallow */
                }
            }
        }
    }
}
|
|
268
|
+
// ── helpers ──
|
|
269
|
+
/**
 * Best-effort recursive delete of a path. `force: true` already tolerates a
 * missing path; the try/catch additionally swallows everything else
 * (permissions, locked handles on Windows, ...), since cleanup is optional.
 */
function rmSyncIfExists(p) {
    try {
        nodeRmSync(p, { recursive: true, force: true });
    } catch { /* best-effort cleanup — ignore */ }
}
|
|
277
|
+
/**
 * Build a tests-status object with no recorded outcomes — used for results
 * produced before (or instead of) scoring: setup failures, timeouts,
 * budget stops, and subprocess errors. Fresh arrays on every call.
 */
function emptyTestsStatus() {
    const emptyBucket = () => ({ success: [], failure: [] });
    return {
        FAIL_TO_PASS: emptyBucket(),
        PASS_TO_PASS: emptyBucket(),
    };
}
|
|
283
|
+
/**
 * Assemble a result record from the per-task outcome fields. Timestamps are
 * normalised to ISO-8601: `startedAt` is the Date captured when the task
 * began; `finished_at` is stamped here, at result-creation time.
 */
function makeResult(args) {
    const { task, startedAt, ...fields } = args;
    return {
        instance_id: task.instance_id,
        status: fields.status,
        resolved: fields.resolved,
        cost_usd: fields.cost_usd,
        turns_used: fields.turns_used,
        duration_ms: fields.duration_ms,
        model_patch: fields.model_patch,
        tests_status: fields.tests_status,
        transcript_path: fields.transcript_path,
        error_message: fields.error_message,
        started_at: startedAt.toISOString(),
        finished_at: new Date().toISOString(),
    };
}
|
|
299
|
+
/**
 * Scan a subprocess's stream-json stdout for the final "result" event.
 * Searches from the last line backwards (the result event is emitted last);
 * non-JSON lines are skipped. Falls back to a zero-cost "ok" record when no
 * result event exists (e.g. the subprocess died before emitting one).
 */
function parseStreamJsonResult(stdout) {
    const candidates = stdout
        .split("\n")
        .filter((line) => line.trim().length > 0)
        .reverse();
    for (const line of candidates) {
        let evt;
        try {
            evt = JSON.parse(line);
        }
        catch {
            continue; // not JSON — interleaved log output etc.
        }
        if (evt.type === "result") {
            return {
                cost_usd: Number(evt.total_cost_usd ?? 0),
                turns_used: Number(evt.num_turns ?? 0),
                exit_reason: String(evt.subtype ?? "ok"),
                final_message: String(evt.result ?? ""),
            };
        }
    }
    return { cost_usd: 0, turns_used: 0, exit_reason: "ok", final_message: "" };
}
|
|
319
|
+
/**
 * Return `git diff HEAD` for the given worktree, or "" when a diff cannot be
 * produced (not a repo, no commits yet, or git itself unavailable).
 */
function captureGitDiff(worktreeDir) {
    const gitArgs = ["-C", worktreeDir, "diff", "HEAD"];
    try {
        return execFileSync("git", gitArgs, { encoding: "utf-8" });
    }
    catch {
        return ""; // no usable diff — treat as an empty patch
    }
}
|
|
327
|
+
/**
 * Extract a task's repo fixture tarball into `dest`.
 *
 * A missing or zero-byte `repo.tar.zst` signals test mode (synthetic packs):
 * the fixture's setup.sh is responsible for initialising the directory, so we
 * return without extracting. Otherwise the tarball is unpacked via the system
 * `tar` (Linux/macOS/Win10+ all ship one) with zstd decompression —
 * --use-compress-program is available on tar 1.31+.
 *
 * @param packDir    pack root containing fixtures/<instanceId>/
 * @param instanceId task instance id (fixture directory name)
 * @param dest       directory to extract into (must already exist)
 */
async function extractFixture(packDir, instanceId, dest) {
    const tarPath = join(packDir, "fixtures", instanceId, "repo.tar.zst");
    // stat() instead of readFileSync(): we only need the size, and fixture
    // tarballs can be large — no reason to pull all the bytes into memory.
    if (!existsSync(tarPath) || statSync(tarPath).size === 0) {
        return;
    }
    execFileSync("tar", ["--use-compress-program=zstd -d", "-xf", tarPath, "-C", dest], {
        stdio: ["ignore", "pipe", "pipe"],
    });
}
|
|
340
|
+
/**
 * Run a fixture's setup.sh (if present) inside the task worktree.
 *
 * @returns { ok: true } when there is no setup script or it exits 0;
 *          otherwise { ok: false, error } with the tail (last 500 chars) of
 *          stderr — or the spawn error message when the script could not be
 *          started at all (in that case spawnSync reports status null and a
 *          null stderr, which previously produced an empty error string).
 */
async function runSetupScript(packDir, instanceId, worktreeDir) {
    const setupPath = join(packDir, "fixtures", instanceId, "setup.sh");
    if (!existsSync(setupPath))
        return { ok: true }; // No setup needed.
    const r = spawnSync(setupPath, [], {
        cwd: worktreeDir,
        shell: true, // works for both .sh on POSIX and bash-as-shell on Windows
        encoding: "utf-8",
    });
    if (r.status !== 0) {
        // r.error is set when the process could not be spawned at all.
        const detail = r.error ? r.error.message : (r.stderr ?? "");
        return { ok: false, error: detail.slice(-500) };
    }
    return { ok: true };
}
|
|
354
|
+
/** Default path to the oh CLI entry point: <cwd>/dist/main.js. */
function defaultOhEntry() {
    const distDir = join(process.cwd(), "dist");
    return join(distDir, "main.js");
}
|
|
357
|
+
/**
 * Build the argv for the default `oh run` subprocess invocation:
 *   <ohEntry> run --bare --output-format stream-json --no-session-persistence
 *   --max-budget-usd <cap> --max-turns <n> --model <model>
 *   [--fallback-model <m>] "<prompt>"
 * The problem statement is always the single, final positional argument.
 */
function defaultRunArgs(opts) {
    const args = [opts.ohEntry, "run", "--bare"];
    args.push("--output-format", "stream-json");
    args.push("--no-session-persistence");
    args.push("--max-budget-usd", String(opts.perTaskCostCap));
    args.push("--max-turns", String(opts.maxTurns));
    args.push("--model", opts.model);
    if (opts.fallbackModel) {
        args.push("--fallback-model", opts.fallbackModel);
    }
    args.push(opts.prompt);
    return args;
}
|
|
377
|
+
/**
 * Read the harness's own version from <cwd>/package.json for run metadata.
 * Any failure (missing file, bad JSON, absent version field) yields "0.0.0"
 * rather than aborting the run.
 */
function readHarnessVersion() {
    const fallback = "0.0.0";
    try {
        const raw = readFileSync(join(process.cwd(), "package.json"), "utf-8");
        return JSON.parse(raw).version ?? fallback;
    }
    catch {
        return fallback;
    }
}
|
|
387
|
+
/**
 * Last path segment of `p`, accepting both "/" and "\\" separators (runDir
 * may come from either POSIX or Windows callers). Returns "" for an empty
 * string or a path ending in a separator.
 */
function pathBaseName(p) {
    const segments = p.split(/[\\/]/);
    const last = segments[segments.length - 1];
    return last ?? "";
}
|
|
391
|
+
//# sourceMappingURL=orchestrator.js.map
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
 * oh evals — pack loader. Loads, validates, and resolves fixture paths
 * for eval packs from disk.
 *
 * Bundled packs live under `data/evals/packs/<name>/`. User-installed packs
 * live under `~/.oh/evals/packs/<name>/`. Bundled packs win precedence on
 * name collision (so users can't shadow `swe-bench-lite-mini` accidentally).
 */
import type { EvalsPack, EvalsTask } from "./types.js";
/**
 * Validate a pack directory's structure: pack.json required fields,
 * instances.jsonl entries, and each instance's fixture files
 * (repo.tar.zst + setup.sh). Problems are accumulated and all reported.
 */
export declare function validatePack(packDir: string): {
    ok: true;
} | {
    ok: false;
    errors: string[];
};
/** Load a pack's manifest and task list; throws when validation fails. */
export declare function loadPack(packDir: string): {
    pack: EvalsPack;
    tasks: EvalsTask[];
};
/** Path of the fixture directory for one instance within a pack. */
export declare function resolveFixturePath(packDir: string, instanceId: string): string;
/** Returns names of packs found in bundled and user directories. Bundled wins on collision. */
export declare function listAvailablePacks(): string[];
/**
 * Resolve a pack by name to its on-disk directory. Bundled packs (under
 * the package's data/evals/packs/) win precedence over user packs
 * (~/.oh/evals/packs/).
 */
export declare function resolvePackDir(packName: string): string | null;
//# sourceMappingURL=pack-loader.d.ts.map
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* oh evals — pack loader. Loads, validates, and resolves fixture paths
|
|
3
|
+
* for eval packs from disk.
|
|
4
|
+
*
|
|
5
|
+
* Bundled packs live under `data/evals/packs/<name>/`. User-installed packs
|
|
6
|
+
* live under `~/.oh/evals/packs/<name>/`. Bundled packs win precedence on
|
|
7
|
+
* name collision (so users can't shadow `swe-bench-lite-mini` accidentally).
|
|
8
|
+
*/
|
|
9
|
+
import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
|
10
|
+
import { homedir } from "node:os";
|
|
11
|
+
import { dirname, join } from "node:path";
|
|
12
|
+
import { fileURLToPath } from "node:url";
|
|
13
|
+
// Top-level fields every pack.json manifest must declare.
const REQUIRED_PACK_FIELDS = [
    "name",
    "version",
    "description",
    "language",
    "runner_requirements",
    "default_test_command",
    "instance_count",
];
// Fields every instances.jsonl entry (one task per line) must carry.
const REQUIRED_TASK_FIELDS = [
    "instance_id",
    "repo",
    "base_commit",
    "problem_statement",
    "FAIL_TO_PASS",
    "PASS_TO_PASS",
];
|
|
30
|
+
/**
 * Validate an on-disk eval pack: pack.json presence and required fields,
 * instances.jsonl parseability and required per-task fields, and the
 * existence of each task's fixture files (repo.tar.zst + setup.sh).
 * Problems are accumulated so all are reported together where possible;
 * structural failures (missing/unparseable pack.json) return immediately.
 */
export function validatePack(packDir) {
    const errors = [];
    // -- pack.json ----------------------------------------------------------
    const packJsonPath = join(packDir, "pack.json");
    if (!existsSync(packJsonPath)) {
        errors.push(`missing pack.json at ${packJsonPath}`);
        return { ok: false, errors };
    }
    let pack;
    try {
        pack = JSON.parse(readFileSync(packJsonPath, "utf-8"));
    }
    catch (err) {
        errors.push(`failed to parse pack.json: ${err.message}`);
        return { ok: false, errors };
    }
    if (typeof pack !== "object" || pack === null) {
        errors.push(`pack.json is not an object`);
        return { ok: false, errors };
    }
    for (const field of REQUIRED_PACK_FIELDS) {
        if (!(field in pack)) {
            errors.push(`pack.json missing required field: ${field}`);
        }
    }
    // -- instances.jsonl ----------------------------------------------------
    const instancesPath = join(packDir, "instances.jsonl");
    if (!existsSync(instancesPath)) {
        errors.push(`missing instances.jsonl at ${instancesPath}`);
        return { ok: false, errors }; // errors is necessarily non-empty here
    }
    const lines = readFileSync(instancesPath, "utf-8")
        .split("\n")
        .filter((l) => l.trim().length > 0);
    lines.forEach((line, idx) => {
        const lineNo = idx + 1;
        let task;
        try {
            task = JSON.parse(line);
        }
        catch (err) {
            errors.push(`instances.jsonl:${lineNo} parse error: ${err.message}`);
            return;
        }
        if (typeof task !== "object" || task === null) {
            errors.push(`instances.jsonl:${lineNo} not an object`);
            return;
        }
        for (const field of REQUIRED_TASK_FIELDS) {
            if (!(field in task)) {
                errors.push(`instances.jsonl:${lineNo} missing required field: ${field}`);
            }
        }
        // -- fixture files for this instance --------------------------------
        const instanceId = task.instance_id;
        if (typeof instanceId !== "string")
            return; // already reported as a missing/invalid field above
        const fixtureDir = join(packDir, "fixtures", instanceId);
        if (!existsSync(fixtureDir)) {
            errors.push(`fixture dir missing for ${instanceId} at ${fixtureDir}`);
            return;
        }
        if (!existsSync(join(fixtureDir, "repo.tar.zst"))) {
            errors.push(`fixture missing repo.tar.zst for ${instanceId}`);
        }
        if (!existsSync(join(fixtureDir, "setup.sh"))) {
            errors.push(`fixture missing setup.sh for ${instanceId}`);
        }
    });
    return errors.length === 0 ? { ok: true } : { ok: false, errors };
}
|
|
99
|
+
/**
 * Load a validated pack: parse pack.json into the manifest and
 * instances.jsonl (one JSON object per non-blank line) into the task list.
 * Throws with the full list of validation errors for a malformed pack.
 */
export function loadPack(packDir) {
    const validation = validatePack(packDir);
    if (!validation.ok) {
        throw new Error(`pack at ${packDir} failed validation:\n - ${validation.errors.join("\n - ")}`);
    }
    const packJson = readFileSync(join(packDir, "pack.json"), "utf-8");
    const tasks = readFileSync(join(packDir, "instances.jsonl"), "utf-8")
        .split("\n")
        .filter((line) => line.trim().length > 0)
        .map((line) => JSON.parse(line));
    return { pack: JSON.parse(packJson), tasks };
}
|
|
111
|
+
/** Path of the fixture directory for `instanceId` inside `packDir`. */
export function resolveFixturePath(packDir, instanceId) {
    const fixturesRoot = join(packDir, "fixtures");
    return join(fixturesRoot, instanceId);
}
|
|
114
|
+
/** Returns names of packs found in bundled and user directories. Bundled wins on collision. */
export function listAvailablePacks() {
    const names = [];
    const seen = new Set();
    for (const root of packSearchRoots()) {
        if (!existsSync(root))
            continue;
        for (const entry of readdirSync(root)) {
            if (seen.has(entry))
                continue; // an earlier (bundled) root already claimed this name
            const candidate = join(root, entry);
            // A pack is a directory containing a pack.json manifest.
            if (statSync(candidate).isDirectory() && existsSync(join(candidate, "pack.json"))) {
                seen.add(entry);
                names.push(entry);
            }
        }
    }
    return names;
}
|
|
131
|
+
/**
 * Resolve a pack by name to its on-disk directory. Bundled packs (under
 * the package's data/evals/packs/) win precedence over user packs
 * (~/.oh/evals/packs/).
 */
export function resolvePackDir(packName) {
    const match = packSearchRoots()
        .map((root) => join(root, packName))
        .find((dir) => existsSync(join(dir, "pack.json")));
    return match ?? null;
}
|
|
144
|
+
/**
 * Ordered list of directories searched for packs; order encodes precedence.
 * Index 0 is the bundled packs dir, resolved relative to this module so the
 * same ../../data/evals/packs hop works from src/evals/ in dev and from
 * dist/evals/ in the published build. Index 1 is the per-user dir
 * ~/.oh/evals/packs.
 */
function packSearchRoots() {
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    return [
        join(moduleDir, "..", "..", "data", "evals", "packs"),
        join(homedir(), ".oh", "evals", "packs"),
    ];
}
|
|
153
|
+
//# sourceMappingURL=pack-loader.js.map
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
 * oh evals — run writer.
 *
 * Streams per-task results to disk atomically:
 * - results.jsonl : append-only, one EvalsResult per line
 * - predictions.json: array, rewritten on each append, SWE-bench-submittable
 * - results.json : merged + aggregates, written ONLY by finalize()
 *
 * Crash-safety: results.jsonl + predictions.json are valid up to the last
 * successful append. `oh evals run --resume <run_id>` reads results.jsonl
 * to determine completed instance_ids.
 */
import type { EvalsResult, RunArtifacts } from "./types.js";
/** Run-level metadata captured once, at run start. */
export type RunHeader = {
    run_id: string;
    pack: string;
    pack_version: string;
    model: string;
    harness_version: string;
    max_cost_usd: number;
    started_at: string;
};
export declare class RunWriter {
    private readonly runDir;
    private readonly header;
    private readonly results;
    constructor(runDir: string, header: RunHeader);
    /** Append one result to results.jsonl and rewrite predictions.json. */
    appendResult(result: EvalsResult): void;
    /** Read back results.jsonl from a prior run (used by --resume). */
    loadExistingResults(): EvalsResult[];
    /** Write the merged results.json with aggregates; called once at run end. */
    finalize(opts: {
        partial: boolean;
        finished_at: string;
    }): RunArtifacts;
}
//# sourceMappingURL=run-writer.d.ts.map
|