@alis-build/harness-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,623 @@
1
+ #!/usr/bin/env node
2
+ import { C as gradeReport, O as trajectoryToOtlp, S as resolveGradeOptions, a as toProtoInstances, b as formatGradingConsole, i as toAgentTrace, n as buildEvalRunEnvelopeFromFiles, o as toTrajectory, w as loadSuiteReport, x as gradingReportPassed, y as formatReport } from "../build-DsVJ_UeU.js";
3
+ import { t as runSuite, u as getAdapter } from "../suite-chj0j22j.js";
4
+ import { i as loadGradingConfig, t as loadSuite } from "../loader-BCnFJ8rm.js";
5
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
6
+ import { dirname, join } from "node:path";
7
+ import { fileURLToPath } from "node:url";
8
+ //#region src/cli/args.ts
9
/**
 * Options that are only ever consulted as on/off switches by this CLI.
 * They must not swallow the token that follows them, otherwise
 * `run --quiet suite.yaml` would lose its positional argument.
 */
const BOOLEAN_FLAGS = new Set([
  "help",
  "h",
  "quiet",
  "verbose",
  "color",
  "no-color",
  "no-transcript",
  "include-raw-stream-events",
]);

/**
 * Split raw CLI arguments into a command, positional arguments and options.
 *
 * Rules:
 * - The first argument is the command unless it starts with `-`.
 * - `--` ends option parsing; everything after it is positional.
 * - `--key=value` sets `options.key` to `"value"`.
 * - `--key value` / `-k value` consume the next token as the value unless it
 *   starts with `-`, in which case the option is set to `true`.
 * - Known boolean flags only consume a following literal `true` / `false`.
 *
 * @param {string[]} argv - Arguments without the node binary and script path.
 * @returns {{command: string | undefined, positional: string[], options: Record<string, string | true>}}
 */
function parseArgs(argv) {
  const positional = [];
  const options = {};
  const args = [...argv];
  let command;
  if (args.length > 0 && !args[0].startsWith("-")) command = args.shift();

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === "--") {
      positional.push(...args.slice(i + 1));
      break;
    }

    const isLong = arg.startsWith("--");
    const isShort = !isLong && arg.startsWith("-") && arg.length === 2;
    if (!isLong && !isShort) {
      positional.push(arg);
      continue;
    }

    const name = arg.slice(isLong ? 2 : 1);

    // Inline form: --key=value (an empty key is left to the generic path).
    const eq = isLong ? name.indexOf("=") : -1;
    if (eq > 0) {
      options[name.slice(0, eq)] = name.slice(eq + 1);
      continue;
    }

    const next = args[i + 1];
    const nextIsValue = Boolean(next) && !next.startsWith("-");
    const consumesNext =
      nextIsValue && (!BOOLEAN_FLAGS.has(name) || next === "true" || next === "false");
    if (consumesNext) {
      options[name] = next;
      i++;
    } else {
      options[name] = true;
    }
  }

  return { command, positional, options };
}
43
/**
 * Read an option that was given an explicit string value.
 *
 * @param {Record<string, unknown>} options - Parsed option map.
 * @param {string} name - Option name without leading dashes.
 * @returns {string | undefined} The value, or `undefined` when the option is
 *   absent or was passed as a bare flag.
 */
function getOption(options, name) {
  const value = options[name];
  if (typeof value !== "string") return undefined;
  return value;
}
47
/**
 * Read an option as a base-10 integer.
 *
 * @param {Record<string, unknown>} options - Parsed option map.
 * @param {string} name - Option name without leading dashes.
 * @param {number} defaultValue - Returned when the option is absent, is a
 *   bare flag, or does not start with a parsable integer.
 * @returns {number} The parsed integer or `defaultValue`.
 */
function getOptionInt(options, name, defaultValue) {
  const raw = getOption(options, name);
  if (raw === undefined) return defaultValue;
  const parsed = Number.parseInt(raw, 10);
  return Number.isFinite(parsed) ? parsed : defaultValue;
}
54
/**
 * Tell whether a switch-style option is enabled.
 *
 * @param {Record<string, unknown>} options - Parsed option map.
 * @param {string} name - Option name without leading dashes.
 * @returns {boolean} `true` for a bare flag or the literal string `"true"`.
 */
function hasOption(options, name) {
  const value = options[name];
  if (value === true) return true;
  return value === "true";
}
58
+ //#endregion
59
+ //#region src/cli/commands/envelope.ts
60
+ /**
61
+ * `harness-eval envelope` — build EvalRunEnvelope and interchange projections.
62
+ */
63
/** Output shapes accepted by `--projection`. */
const PROJECTIONS = /* @__PURE__ */ new Set(["envelope", "trajectory", "instances", "agent-trace"]);

/**
 * Validate a `--projection` value.
 *
 * @param {string | undefined} value - Raw option value.
 * @returns {string | undefined} `"envelope"` when no value was given, the
 *   value itself when it is a known projection, otherwise `undefined`.
 */
function parseEnvelopeProjection(value) {
  if (value === undefined) return "envelope";
  return PROJECTIONS.has(value) ? value : undefined;
}
73
/**
 * Render an envelope in the requested projection as newline-terminated text.
 *
 * - `"trajectory"`: one JSON document per line (rows from `toTrajectory`).
 * - `"instances"` / `"agent-trace"`: pretty-printed JSON of the projection.
 * - anything else: pretty-printed JSON of the envelope itself.
 *
 * @param {object} envelope - The envelope to serialize.
 * @param {string} projection - Projection name.
 * @returns {string} Serialized text ending in a newline.
 */
function serializeEnvelopeProjection(envelope, projection) {
  if (projection === "trajectory") {
    const rows = toTrajectory(envelope).map((row) => JSON.stringify(row));
    return `${rows.join("\n")}\n`;
  }

  let payload = envelope;
  if (projection === "instances") payload = toProtoInstances(envelope);
  else if (projection === "agent-trace") payload = toAgentTrace(envelope);

  return `${JSON.stringify(payload, null, 2)}\n`;
}
81
/**
 * Best-effort lookup of the framework version from the nearest `package.json`.
 *
 * The search starts in `startDir` and walks towards the filesystem root,
 * returning the first manifest that carries a string `version`. Walking up
 * (instead of using a fixed `../../../package.json`) keeps the lookup correct
 * regardless of how deep this module ends up after bundling.
 * NOTE(review): the fixed depth matched the `src/cli/commands/` source layout;
 * this bundle imports its chunks from `../`, which suggests it lives at a
 * different depth — confirm against the published package layout.
 *
 * @param {string} [startDir] - Directory to start from; defaults to the
 *   directory containing this module.
 * @returns {Promise<string | undefined>} The version, or `undefined` when no
 *   readable manifest with a version exists on the way up.
 */
async function readFrameworkVersion(startDir = dirname(fileURLToPath(import.meta.url))) {
  let dir = startDir;
  for (;;) {
    try {
      const manifest = JSON.parse(await readFile(join(dir, "package.json"), "utf8"));
      if (typeof manifest?.version === "string") return manifest.version;
    } catch {
      // No readable manifest at this level; the version is optional metadata,
      // so keep looking further up instead of failing the command.
    }
    const parent = dirname(dir);
    // dirname() of the filesystem root is the root itself: nothing left to try.
    if (parent === dir) return undefined;
    dir = parent;
  }
}
89
/**
 * Run `harness-eval envelope`: build an envelope from a report file and emit
 * it (or one of its projections) to `--output` or stdout.
 *
 * Exit codes: `0` when `envelope.summary.behavioralPass` is truthy, `1` when
 * it is not, `2` for usage errors and for any failure while building,
 * serializing or writing the envelope.
 *
 * @param {{positional: string[], options: Record<string, string | true>}} args
 * @returns {Promise<number>} Process exit code.
 */
async function envelopeCommand(args) {
  const reportPath = args.positional[0];
  if (!reportPath) {
    console.error("usage: harness-eval envelope <report.json> [--output path] [--grading path] [--suite path] [--projection envelope|trajectory|instances|agent-trace] [--include-raw-stream-events] [--no-transcript]");
    return 2;
  }

  const outputPath = getOption(args.options, "output");
  const gradingPath = getOption(args.options, "grading");
  const suitePath = getOption(args.options, "suite");
  const projection = parseEnvelopeProjection(getOption(args.options, "projection"));
  if (!projection) {
    console.error("invalid --projection; expected envelope, trajectory, instances, or agent-trace");
    return 2;
  }

  // Building, serializing and writing share one guard so that every failure
  // (including an unwritable --output path) is reported as exit code 2
  // instead of escaping as an unhandled rejection.
  try {
    const frameworkVersion = await readFrameworkVersion();
    const envelope = await buildEvalRunEnvelopeFromFiles(reportPath, {
      gradingPath,
      suitePath,
      includeTranscript: !hasOption(args.options, "no-transcript"),
      includeRawStreamEvents: hasOption(args.options, "include-raw-stream-events"),
      harness: { frameworkVersion },
    });
    const serialized = serializeEnvelopeProjection(envelope, projection);
    if (outputPath) await writeFile(outputPath, serialized, "utf8");
    else process.stdout.write(serialized);
    return envelope.summary.behavioralPass ? 0 : 1;
  } catch (err) {
    console.error(err instanceof Error ? err.message : String(err));
    return 2;
  }
}
122
+ //#endregion
123
+ //#region src/cli/commands/format.ts
124
/**
 * `harness-eval format` command: re-render a previously saved report.
 *
 * Exit codes: `0` when every cell in the report passed, `1` otherwise, `2`
 * for usage errors or when the report / baseline cannot be read, parsed, or
 * does not contain a `cells` array.
 *
 * @param {{positional: string[], options: Record<string, string | true>}} args
 * @returns {Promise<number>} Process exit code.
 */
async function formatCommand(args) {
  const reportPath = args.positional[0];
  if (!reportPath) {
    console.error("usage: harness-eval format <report.json> [options]");
    return 2;
  }

  const format = getOption(args.options, "format") ?? "console";
  const baselinePath = getOption(args.options, "baseline");

  let report;
  let baseline;
  try {
    report = JSON.parse(await readFile(reportPath, "utf8"));
    // The baseline is guarded exactly like the report: a missing or malformed
    // file is a user error (exit 2), not a crash.
    if (baselinePath) baseline = JSON.parse(await readFile(baselinePath, "utf8"));
  } catch (err) {
    console.error(err instanceof Error ? err.message : String(err));
    return 2;
  }

  // The exit code below iterates report.cells, so reject other shapes up front.
  if (!Array.isArray(report?.cells)) {
    console.error(`invalid report: ${reportPath} has no "cells" array`);
    return 2;
  }

  const formatted = formatReport(report, {
    // Unknown format names fall back to the console renderer.
    format: format === "markdown" || format === "json" ? format : "console",
    baseline,
    color: format === "console",
  });
  process.stdout.write(formatted);
  if (!formatted.endsWith("\n")) process.stdout.write("\n");
  return report.cells.every((c) => c.passed) ? 0 : 1;
}
153
+ //#endregion
154
+ //#region src/cli/progress.ts
155
// ANSI SGR escape sequences used to decorate progress output.
const RESET = "\u001B[0m";
const DIM = "\u001B[2m";
const RED = "\u001B[31m";
const GREEN = "\u001B[32m";
const YELLOW = "\u001B[33m";
160
/**
 * Decide how progress should be reported.
 *
 * An explicit, recognized `--progress` value wins; otherwise `--quiet` is
 * checked before `--verbose`, and `"default"` is used when neither is set.
 *
 * @param {Record<string, string | true>} options - Parsed option map.
 * @returns {"json" | "quiet" | "verbose" | "default"}
 */
function resolveProgressMode(options) {
  const requested = getOption(options, "progress");
  switch (requested) {
    case "json":
    case "quiet":
    case "verbose":
    case "default":
      return requested;
    default:
      break;
  }
  if (hasOption(options, "quiet")) return "quiet";
  return hasOption(options, "verbose") ? "verbose" : "default";
}
167
/**
 * Whether to emit ANSI colors on the progress stream (stderr by default).
 *
 * Precedence: `--no-color`, then `--color`, then a non-empty `NO_COLOR`
 * environment variable, then a `FORCE_COLOR` other than `"0"`, and finally
 * whether the stream reports itself as a TTY.
 *
 * @param {Record<string, string | true>} options - Parsed option map.
 * @param {object} [stream] - Stream the progress output goes to.
 * @returns {boolean}
 */
function resolveProgressColor(options, stream = process.stderr) {
  if (hasOption(options, "no-color")) return false;
  if (hasOption(options, "color")) return true;

  const { NO_COLOR: noColor, FORCE_COLOR: forceColor } = process.env;
  if (noColor !== undefined && noColor !== "") return false;
  if (forceColor !== undefined && forceColor !== "0") return true;

  return "isTTY" in stream && stream.isTTY === true;
}
175
/** Check mark for a successful item, green when `color` is truthy. */
function okMark(color) {
  if (!color) return "✓";
  return `${GREEN}✓${RESET}`;
}
178
/** Cross mark for a failed item, red when `color` is truthy. */
function failMark(color) {
  if (!color) return "✗";
  return `${RED}✗${RESET}`;
}
181
/** Per-repetition success word (`ok`), green when `color` is truthy. */
function okStatus(color) {
  if (!color) return "ok";
  return `${GREEN}ok${RESET}`;
}
184
/** Per-repetition failure word (`FAIL`), red when `color` is truthy. */
function failStatus(color) {
  if (!color) return "FAIL";
  return `${RED}FAIL${RESET}`;
}
187
/** Per-cell success label (`PASS`), green when `color` is truthy. */
function passLabel(color) {
  if (!color) return "PASS";
  return `${GREEN}PASS${RESET}`;
}
190
/** Per-cell failure label (`FAIL`), red when `color` is truthy. */
function failLabel(color) {
  if (!color) return "FAIL";
  return `${RED}FAIL${RESET}`;
}
193
/**
 * Build the progress callback used while a suite is running.
 *
 * The returned function accepts events whose `kind` is `suite-start`,
 * `rep-complete`, `cell-complete` or `suite-complete`; other kinds are
 * ignored. Output depends on `options.mode`:
 * - `"quiet"`: one `.` / `x` per repetition and a final newline,
 * - `"json"`: one JSON document per event,
 * - anything else: human-readable lines (with extra detail for `"verbose"`).
 *
 * @param {{mode: string, stream?: {write: Function}, color?: boolean, maxConcurrent?: number}} options
 * @returns {(event: object) => void}
 */
function createRunProgressHandler(options) {
  const out = options.stream ?? process.stderr;
  const { mode } = options;
  const useColor = options.color ?? false;
  const isQuiet = mode === "quiet";
  const isJson = mode === "json";

  // Running totals; reset whenever a new suite starts.
  let totalReps = 0;
  let done = 0;
  let elapsedMs = 0;

  const onSuiteStart = (event) => {
    totalReps = event.totalReps;
    done = 0;
    elapsedMs = 0;
    if (isQuiet) return;
    if (isJson) {
      writeJson(out, {
        kind: "suite-start",
        totalReps: event.totalReps,
        maxConcurrent: options.maxConcurrent,
      });
      return;
    }
    const suffix = options.maxConcurrent === undefined ? "" : ` (max-concurrent ${options.maxConcurrent})`;
    out.write(`Running ${totalReps} repetitions${suffix}...\n\n`);
  };

  const onRepComplete = (event) => {
    // Counters advance in every mode so later events stay consistent.
    done++;
    elapsedMs += event.durationMs;
    if (isQuiet) {
      let dot;
      if (event.ok) dot = useColor ? `${GREEN}.${RESET}` : ".";
      else dot = useColor ? `${RED}x${RESET}` : "x";
      out.write(dot);
      return;
    }
    if (isJson) {
      writeJson(out, {
        kind: "rep-complete",
        index: done,
        total: totalReps,
        caseId: event.caseId,
        cellLabel: event.cellLabel,
        repIndex: event.repIndex,
        ok: event.ok,
        durationMs: event.durationMs,
        toolCallCount: event.toolCallCount,
        errorMessage: event.errorMessage,
      });
      return;
    }

    const eta = formatEta(elapsedMs, done, totalReps);
    const icon = event.ok ? okMark(useColor) : failMark(useColor);
    const status = event.ok ? okStatus(useColor) : failStatus(useColor);
    let text = `${icon} [${done}/${totalReps}] ${event.caseId} @ ${event.cellLabel} #${event.repIndex} ${status} ${formatDuration(event.durationMs)}`;
    if (eta) {
      text += useColor ? ` ${DIM}(${eta})${RESET}` : ` (${eta})`;
    }
    if (!event.ok && event.errorMessage) {
      const reason = truncate(event.errorMessage, 80);
      text += useColor ? ` ${YELLOW}— ${reason}${RESET}` : ` — ${reason}`;
    }
    if (mode === "verbose") {
      if (event.toolCallCount !== undefined) text += ` tools=${event.toolCallCount}`;
      const summary = formatAssertionSummary(event.assertionResults, useColor);
      if (summary) text += ` ${summary}`;
    }
    out.write(`${text}\n`);
  };

  const onCellComplete = (event) => {
    if (isQuiet) return;
    if (isJson) {
      const { report } = event;
      writeJson(out, {
        kind: "cell-complete",
        caseId: report.caseId,
        cellLabel: report.cell.label,
        passed: report.passed,
        adapterErrors: report.adapterErrors,
        assertionStats: report.assertionStats.map((s) => ({
          description: s.description,
          passRate: s.passRate,
          meetsThreshold: s.meetsThreshold,
        })),
      });
      return;
    }
    out.write(`${formatCellSummary(event.report, useColor)}\n`);
  };

  const onSuiteComplete = (event) => {
    if (isQuiet) {
      // Terminate the row of dots.
      out.write("\n");
      return;
    }
    const { cells } = event.report;
    if (isJson) {
      writeJson(out, {
        kind: "suite-complete",
        durationMs: event.report.durationMs,
        cellsTotal: cells.length,
        cellsPassed: cells.filter((c) => c.passed).length,
      });
      return;
    }
    const okReps = cells.reduce((n, c) => n + c.repetitions.filter((r) => r.error === null).length, 0);
    const totalRun = cells.reduce((n, c) => n + c.repetitions.length, 0);
    const adapterErrors = cells.reduce((n, c) => n + c.adapterErrors, 0);
    const errorNote = adapterErrors > 0 ? `, ${adapterErrors} adapter error(s)` : "";
    out.write(`\nFinished in ${formatDuration(event.report.durationMs)} (${okReps}/${totalRun} reps ok${errorNote})\n\n`);
  };

  const handlers = new Map([
    ["suite-start", onSuiteStart],
    ["rep-complete", onRepComplete],
    ["cell-complete", onCellComplete],
    ["suite-complete", onSuiteComplete],
  ]);

  return (event) => {
    const handle = handlers.get(event.kind);
    if (handle) handle(event);
  };
}
298
/**
 * Build the progress callback used while a report is being graded.
 *
 * The returned function accepts events whose `kind` is `grade-start`,
 * `grade-complete` or `grade-done`; other kinds are ignored. Output depends
 * on `options.mode`:
 * - `"quiet"`: one `.` / `x` per graded repetition and a final newline,
 * - `"json"`: one JSON document per event (no start event when there is
 *   nothing to grade),
 * - anything else: human-readable lines, silent when there is nothing to grade.
 *
 * @param {{mode: string, stream?: {write: Function}, color?: boolean, maxConcurrent?: number}} options
 * @returns {(event: object) => void}
 */
function createGradeProgressHandler(options) {
  const out = options.stream ?? process.stderr;
  const { mode } = options;
  const useColor = options.color ?? false;
  const isQuiet = mode === "quiet";
  const isJson = mode === "json";

  // Running totals; reset whenever a new grading pass starts.
  let total = 0;
  let done = 0;
  let elapsedMs = 0;

  const onStart = (event) => {
    total = event.total;
    done = 0;
    elapsedMs = 0;
    if (isQuiet || total === 0) return;
    if (isJson) {
      writeJson(out, {
        kind: "grade-start",
        total: event.total,
        maxConcurrent: options.maxConcurrent,
      });
      return;
    }
    const suffix = options.maxConcurrent === undefined ? "" : ` (max-concurrent ${options.maxConcurrent})`;
    out.write(`Grading ${total} repetition(s)${suffix}...\n\n`);
  };

  const onComplete = (event) => {
    // Counters advance in every mode so later events stay consistent.
    done++;
    elapsedMs += event.durationMs;
    if (isQuiet) {
      const allPassed = event.failed === 0 && !event.graderError;
      let dot;
      if (allPassed) dot = useColor ? `${GREEN}.${RESET}` : ".";
      else dot = useColor ? `${RED}x${RESET}` : "x";
      out.write(dot);
      return;
    }
    if (isJson) {
      writeJson(out, {
        kind: "grade-complete",
        index: done,
        total,
        caseId: event.caseId,
        cellLabel: event.cellLabel,
        repetitionIndex: event.repetitionIndex,
        passed: event.passed,
        failed: event.failed,
        durationMs: event.durationMs,
        graderError: event.graderError,
      });
      return;
    }

    const eta = formatEta(elapsedMs, done, total);
    const ok = event.failed === 0 && !event.graderError;
    const icon = ok ? okMark(useColor) : failMark(useColor);
    const status = ok ? okStatus(useColor) : failStatus(useColor);
    let text = `${icon} [${done}/${total}] ${event.caseId} @ ${event.cellLabel} #${event.repetitionIndex} ${status} ${formatDuration(event.durationMs)}`;
    text += ` expectations ${event.passed}/${event.passed + event.failed}`;
    if (eta) {
      text += useColor ? ` ${DIM}(${eta})${RESET}` : ` (${eta})`;
    }
    if (event.graderError) {
      const reason = truncate(event.graderError, 80);
      text += useColor ? ` ${YELLOW}— ${reason}${RESET}` : ` — ${reason}`;
    }
    if (mode === "verbose" && event.failed && event.failed > 0) {
      text += useColor ? ` ${YELLOW}see grading output${RESET}` : " see grading output";
    }
    out.write(`${text}\n`);
  };

  const onDone = (event) => {
    if (isQuiet) {
      // Terminate the row of dots.
      out.write("\n");
      return;
    }
    if (isJson) {
      writeJson(out, {
        kind: "grade-done",
        durationMs: event.durationMs,
        totalExpectations: event.totalExpectations,
        passedExpectations: event.passedExpectations,
      });
      return;
    }
    if (total === 0) return;
    out.write(`\nGraded in ${formatDuration(event.durationMs)} (${event.passedExpectations}/${event.totalExpectations} expectations passed)\n\n`);
  };

  const handlers = new Map([
    ["grade-start", onStart],
    ["grade-complete", onComplete],
    ["grade-done", onDone],
  ]);

  return (event) => {
    const handle = handlers.get(event.kind);
    if (handle) handle(event);
  };
}
378
/**
 * Write `value` to `stream` as a single line of JSON.
 *
 * @param {{write: Function}} stream - Destination stream.
 * @param {unknown} value - Value to serialize with `JSON.stringify`.
 */
function writeJson(stream, value) {
  const line = JSON.stringify(value);
  stream.write(`${line}\n`);
}
381
/**
 * Format a duration in milliseconds for humans.
 *
 * - below one second: `"<ms>ms"`
 * - below one minute: seconds with one decimal, e.g. `"1.5s"`
 * - below one hour: `"<m>m <s>s"`
 * - otherwise: `"<h>h <m>m"`
 *
 * The value is rounded to whole seconds *before* it is split into units, so
 * a carry always propagates (119.6 s is `"2m 0s"`, never `"1m 60s"`).
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string}
 */
function formatDuration(ms) {
  if (ms < 1e3) return `${ms}ms`;
  const sec = ms / 1e3;
  // 59.95 s and above would be printed as "60.0s" by toFixed(1); hand those
  // values to the minute branch instead.
  if (sec < 59.95) return `${sec.toFixed(1)}s`;
  const totalSec = Math.round(sec);
  const min = Math.floor(totalSec / 60);
  if (min < 60) return `${min}m ${totalSec % 60}s`;
  return `${Math.floor(min / 60)}h ${min % 60}m`;
}
390
/**
 * Estimate the remaining time from the average duration of finished items.
 *
 * @param {number} totalDurationMs - Summed duration of finished items.
 * @param {number} completed - Number of finished items.
 * @param {number} total - Number of items overall.
 * @returns {string | undefined} `"~<duration> remaining"`, or `undefined`
 *   when nothing has finished yet or nothing is left.
 */
function formatEta(totalDurationMs, completed, total) {
  if (completed === 0) return undefined;
  if (completed >= total) return undefined;
  const averageMs = totalDurationMs / completed;
  const remainingMs = Math.round((total - completed) * averageMs);
  return `~${formatDuration(remainingMs)} remaining`;
}
396
/**
 * Shorten `text` to at most `max` UTF-16 code units, ending in `…` when
 * something was cut off.
 *
 * @param {string} text - Text to shorten.
 * @param {number} max - Maximum length of the result, ellipsis included.
 * @returns {string} `text` unchanged when it fits, otherwise a prefix plus
 *   `…`; an empty string when `max` leaves no room for any output.
 */
function truncate(text, max) {
  if (text.length <= max) return text;
  // A negative slice end would count from the end of the string and return
  // almost all of it, so handle "no room at all" explicitly.
  if (max < 1) return "";
  let end = max - 1;
  // Never cut between the two halves of a surrogate pair: drop a trailing
  // high surrogate so the output stays well-formed.
  if (end > 0) {
    const last = text.charCodeAt(end - 1);
    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  }
  return `${text.slice(0, end)}…`;
}
400
/**
 * Render assertion results as a comma-separated list of marked descriptions.
 *
 * @param {Array<{passed: boolean, description: string}> | undefined | null} results
 * @param {boolean} [color=false] - Whether the marks are colorized.
 * @returns {string} Empty string when there are no results.
 */
function formatAssertionSummary(results, color = false) {
  if (!results || results.length === 0) return "";
  const describe = (result) => {
    const mark = result.passed ? okMark(color) : failMark(color);
    return `${mark} ${result.description}`;
  };
  return results.map(describe).join(", ");
}
404
/**
 * Render the one-line summary printed when a cell finishes.
 *
 * Layout: mark, case id, cell label, PASS/FAIL label, an optional
 * adapter-error note, then per-assertion pass counts joined by ` · `.
 *
 * @param {object} cell - Cell report; reads `passed`, `caseId`, `cell.label`,
 *   `adapterErrors` and `assertionStats`.
 * @param {boolean} color - Whether to colorize marks, labels and the note.
 * @returns {string}
 */
function formatCellSummary(cell, color) {
  const mark = cell.passed ? okMark(color) : failMark(color);
  const status = cell.passed ? passLabel(color) : failLabel(color);

  const parts = cell.assertionStats.map((s) => {
    const pct = (s.passRate * 100).toFixed(0);
    return `${s.description} ${s.passedCount}/${s.evaluatedCount} (${pct}%)`;
  });

  let crash = "";
  if (cell.adapterErrors > 0) {
    crash = color
      ? ` ${YELLOW}[${cell.adapterErrors} adapter errors]${RESET}`
      : ` [${cell.adapterErrors} adapter errors]`;
  }

  const stats = parts.length > 0 ? ` ${parts.join(" · ")}` : "";
  return `${mark} ${cell.caseId} @ ${cell.cell.label} ${status}${crash}${stats}`;
}
415
+ //#endregion
416
+ //#region src/cli/commands/grade.ts
417
+ /**
418
+ * `harness-eval grade` — LLM outcome grading on a suite report.
419
+ */
420
/**
 * Read an option as a base-10 integer without a default.
 *
 * @param {Record<string, unknown>} options - Parsed option map.
 * @param {string} name - Option name without leading dashes.
 * @returns {number | undefined} The parsed integer, or `undefined` when the
 *   option is absent, is a bare flag, or is not parsable.
 */
function optionalOptionInt(options, name) {
  const raw = getOption(options, name);
  if (raw === undefined) return undefined;
  const parsed = Number.parseInt(raw, 10);
  if (!Number.isFinite(parsed)) return undefined;
  return parsed;
}
426
/**
 * Run `harness-eval grade`: grade a saved suite report and print the result.
 *
 * Exit codes: `0` when the grading report passed, `1` when it did not, `2`
 * for usage errors, for failures while loading the config / report or
 * resolving options, and when grading produced no results.
 *
 * @param {{positional: string[], options: Record<string, string | true>}} args
 * @returns {Promise<number>} Process exit code.
 */
async function gradeCommand(args) {
  const reportPath = args.positional[0];
  if (!reportPath) {
    console.error("usage: harness-eval grade <report.json> [--config grading.yaml] [--expectations path] [--output path] [--model id] [--timeout-ms N] [--max-concurrent N]");
    return 2;
  }

  // Shared failure path for the loading steps below.
  const fail = (err) => {
    console.error(err instanceof Error ? err.message : String(err));
    return 2;
  };

  const { options } = args;
  const configPath = getOption(options, "config");
  const expectationsPath = getOption(options, "expectations");
  const outputPath = getOption(options, "output");
  const model = getOption(options, "model");
  const binary = getOption(options, "binary");
  const timeoutMs = optionalOptionInt(options, "timeout-ms");
  // Only forward a concurrency override when the flag carries a value.
  let maxConcurrent;
  if (getOption(options, "max-concurrent")) maxConcurrent = getOptionInt(options, "max-concurrent", 2);
  const format = getOption(options, "format") ?? "console";
  const progressMode = resolveProgressMode(options);
  const useProgressColor = progressMode !== "json" && resolveProgressColor(options);

  let fileConfig;
  if (configPath) {
    try {
      fileConfig = await loadGradingConfig(configPath);
    } catch (err) {
      return fail(err);
    }
  }

  let report;
  try {
    report = await loadSuiteReport(reportPath);
  } catch (err) {
    return fail(err);
  }

  let gradeOptions;
  try {
    const overrides = {
      sourceReport: reportPath,
      expectationsPath,
      model,
      binary,
      timeoutMs,
      maxConcurrent,
    };
    gradeOptions = resolveGradeOptions(fileConfig, overrides, configPath);
  } catch (err) {
    return fail(err);
  }

  const onProgress = createGradeProgressHandler({
    mode: progressMode,
    maxConcurrent: gradeOptions.maxConcurrent ?? 2,
    color: useProgressColor,
  });
  const grading = await gradeReport(report, { ...gradeOptions, onProgress });

  if (outputPath) await writeFile(outputPath, JSON.stringify(grading, null, 2), "utf8");

  if (format === "json") {
    process.stdout.write(JSON.stringify(grading, null, 2));
    process.stdout.write("\n");
  } else {
    const formatted = formatGradingConsole(grading, format === "console");
    process.stdout.write(formatted);
    if (!formatted.endsWith("\n")) process.stdout.write("\n");
  }

  // An empty grading run is reported as an error rather than a pass.
  if (grading.results.length === 0) return 2;
  return gradingReportPassed(grading) ? 0 : 1;
}
491
+ //#endregion
492
+ //#region src/cli/commands/otel-output.ts
493
+ /**
494
+ * Write OTLP JSON artifacts from a suite report.
495
+ */
496
/**
 * Make a value safe to embed in a file name: every run of characters other
 * than ASCII letters, digits, `.`, `_` and `-` becomes a single `_`.
 *
 * @param {string} value
 * @returns {string}
 */
function safeFilePart(value) {
  const unsafeRun = /[^a-zA-Z0-9._-]+/g;
  return value.replace(unsafeRun, "_");
}
499
/**
 * Write one OTLP JSON file per repetition that has an adapter result.
 *
 * Files: `{caseId}__{cellLabel}__rep{N}.otlp.json` (ids and labels are passed
 * through `safeFilePart`). The output directory is created if needed. Cells
 * whose case id is not present in the suite are skipped.
 *
 * @param {{cases: Array<{id: string, prompt: unknown}>}} suite - Suite the report was produced from.
 * @param {{cells: Array<object>}} report - Suite report.
 * @param {string} outputDir - Directory to write into.
 * @returns {Promise<number>} Number of files written.
 */
async function writeOtelArtifacts(suite, report, outputDir) {
  await mkdir(outputDir, { recursive: true });

  // Index the cases once instead of scanning suite.cases for every cell.
  // The first case wins on duplicate ids, matching a linear search.
  const casesById = new Map();
  for (const testCase of suite.cases) {
    if (!casesById.has(testCase.id)) casesById.set(testCase.id, testCase);
  }

  let written = 0;
  for (const cellReport of report.cells) {
    const testCase = casesById.get(cellReport.caseId);
    if (!testCase) continue;
    for (const rep of cellReport.repetitions) {
      // Repetitions without an adapter result have nothing to export.
      if (!rep.adapterResult) continue;
      const otlp = trajectoryToOtlp(rep.adapterResult.view, { prompt: testCase.prompt });
      const fileName = `${safeFilePart(cellReport.caseId)}__${safeFilePart(cellReport.cell.label)}__rep${rep.repetitionIndex}.otlp.json`;
      await writeFile(join(outputDir, fileName), JSON.stringify(otlp, null, 2), "utf8");
      written++;
    }
  }
  return written;
}
519
+ //#endregion
520
+ //#region src/cli/commands/run.ts
521
/**
 * `harness-eval run` command: execute a suite and print its report.
 *
 * Everything that can be checked cheaply (suite file, baseline file, adapter
 * lookup) is resolved *before* the suite runs, so a bad `--baseline` path
 * fails immediately instead of after the whole run has completed.
 *
 * Exit codes: `0` when every cell passed, `1` otherwise, `2` for usage
 * errors or when the suite, baseline or adapter cannot be loaded.
 *
 * @param {{positional: string[], options: Record<string, string | true>}} args
 * @returns {Promise<number>} Process exit code.
 */
async function runCommand(args) {
  const suitePath = args.positional[0];
  if (!suitePath) {
    console.error("usage: harness-eval run <suite.yaml> [options]");
    return 2;
  }

  const format = getOption(args.options, "format") ?? "console";
  const outputPath = getOption(args.options, "output");
  const otelOutputDir = getOption(args.options, "otel-output");
  const baselinePath = getOption(args.options, "baseline");
  const maxConcurrent = getOptionInt(args.options, "max-concurrent", 4);
  const adapterId = getOption(args.options, "adapter");
  const progressMode = resolveProgressMode(args.options);
  const useProgressColor = progressMode !== "json" && resolveProgressColor(args.options);

  let suite;
  let baseline;
  let adapter;
  try {
    suite = await loadSuite(suitePath);
    if (baselinePath) baseline = JSON.parse(await readFile(baselinePath, "utf8"));
    // Precedence: --adapter, then the suite's own adapter, then "claude-code".
    adapter = getAdapter(adapterId ?? suite.adapter ?? "claude-code");
  } catch (err) {
    console.error(err instanceof Error ? err.message : String(err));
    return 2;
  }

  const onProgress = createRunProgressHandler({
    mode: progressMode,
    maxConcurrent,
    color: useProgressColor,
  });
  const report = await runSuite(suite, {
    adapter,
    maxConcurrent,
    onProgress,
  });

  if (outputPath) await writeFile(outputPath, JSON.stringify(report, null, 2), "utf8");
  if (otelOutputDir) {
    const count = await writeOtelArtifacts(suite, report, otelOutputDir);
    process.stderr.write(`otel: wrote ${count} trace file(s) to ${otelOutputDir}\n`);
  }

  const formatted = formatReport(report, {
    // Unknown format names fall back to the console renderer.
    format: format === "markdown" || format === "json" ? format : "console",
    baseline,
    color: format === "console",
  });
  process.stdout.write(formatted);
  if (!formatted.endsWith("\n")) process.stdout.write("\n");
  return report.cells.every((c) => c.passed) ? 0 : 1;
}
575
+ //#endregion
576
+ //#region src/cli/main.ts
577
+ /**
578
+ * CLI entry point.
579
+ */
580
/** Help text printed for `--help` and for missing or unknown commands. */
const USAGE = `harness-eval — harness-level eval framework

Usage:
harness-eval run <suite.yaml> [--max-concurrent N] [--baseline path] [--output path] [--otel-output dir] [--format console|markdown|json] [--adapter id] [--quiet] [--verbose] [--progress default|quiet|verbose|json]
harness-eval grade <report.json> [--config grading.yaml] [--expectations path] [--output path] [--model id] [--timeout-ms N] [--max-concurrent N] [--format console|json] [--quiet] [--verbose] [--progress default|quiet|verbose|json]
harness-eval envelope <report.json> [--output path] [--grading path] [--suite path] [--projection envelope|trajectory|instances|agent-trace] [--include-raw-stream-events] [--no-transcript]
harness-eval format <report.json> [--format console|markdown|json] [--baseline path]
harness-eval --help

Progress (run & grade):
default one line per repetition + per-cell summary (default)
--quiet colored dots (. = ok, x = fail)
--verbose per-rep details (tool counts, assertion summary)
--progress json newline-delimited JSON events on stderr
--no-color disable ANSI colors on progress output
--color force ANSI colors on progress output
`;

/**
 * Parse the arguments and dispatch to the matching sub-command.
 *
 * @param {string[]} argv - Arguments without the node binary and script path.
 * @returns {Promise<number>} Exit code: `0` after printing help, `2` when the
 *   command is missing or unknown, otherwise whatever the sub-command returns.
 */
async function main(argv) {
  const parsed = parseArgs(argv);

  const wantsHelp = parsed.options.help || parsed.command === "help" || parsed.options.h;
  if (wantsHelp) {
    process.stdout.write(USAGE);
    return 0;
  }

  if (parsed.command === undefined) {
    console.error(USAGE);
    return 2;
  }

  // A Map (not a plain object) so names such as "constructor" never resolve.
  const commands = new Map([
    ["run", runCommand],
    ["grade", gradeCommand],
    ["envelope", envelopeCommand],
    ["format", formatCommand],
  ]);
  const handler = commands.get(parsed.command);
  if (!handler) {
    console.error(`unknown command: ${parsed.command}\n\n${USAGE}`);
    return 2;
  }
  return await handler(parsed);
}
616
+ //#endregion
617
+ //#region src/cli/bin.ts
618
// Run the CLI and turn its result into the process exit code.
// Sub-commands return 1 to mean "the evaluation did not pass"; an uncaught
// rejection would also make Node exit with 1, so unexpected failures are
// mapped to 2 here to keep the two outcomes distinguishable for callers.
let code;
try {
  code = await main(process.argv.slice(2));
} catch (err) {
  console.error(err instanceof Error ? err.stack ?? err.message : String(err));
  code = 2;
}
process.exit(code);
620
+ //#endregion
621
+ export {};
622
+
623
+ //# sourceMappingURL=bin.js.map