@alis-build/harness-eval 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -10
- package/dist/adapters/claude-code/index.d.ts +2 -2
- package/dist/adapters/claude-code/index.js +2 -1
- package/dist/adapters/codex/index.d.ts +68 -0
- package/dist/adapters/codex/index.js +3 -0
- package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
- package/dist/claude-code-C_7hxC8z.js.map +1 -0
- package/dist/cli/bin.js +204 -127
- package/dist/cli/bin.js.map +1 -1
- package/dist/codex-0cHO2te9.js +496 -0
- package/dist/codex-0cHO2te9.js.map +1 -0
- package/dist/config/loader.d.ts +2 -2
- package/dist/config/loader.js +2 -2
- package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
- package/dist/index.d.ts +397 -153
- package/dist/index.js +125 -5
- package/dist/index.js.map +1 -0
- package/dist/loader-B1WmGGzf.d.ts +107 -0
- package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
- package/dist/loader-DnQ6Jt0i.js.map +1 -0
- package/dist/reporter-Biy-5-9M.js +2216 -0
- package/dist/reporter-Biy-5-9M.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
- package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
- package/dist/suite-BcP64nlb.js.map +1 -0
- package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
- package/dist/types-Bac8_Ixb.js +246 -0
- package/dist/types-Bac8_Ixb.js.map +1 -0
- package/dist/types-Bu8uOZZN.d.ts +77 -0
- package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
- package/package.json +7 -2
- package/schemas/eval-interchange-instances.schema.json +196 -0
- package/schemas/eval-interchange.schema.json +65 -52
- package/schemas/eval-run-envelope.schema.json +182 -425
- package/dist/build-DsVJ_UeU.js +0 -1396
- package/dist/build-DsVJ_UeU.js.map +0 -1
- package/dist/claude-code-ycT0JQZF.js.map +0 -1
- package/dist/loader-BCnFJ8rm.js.map +0 -1
- package/dist/loader-DTvoVfN0.d.ts +0 -33
- package/dist/suite-chj0j22j.js.map +0 -1
- package/schemas/eval-interchange-agent-trace.schema.json +0 -322
- package/schemas/eval-interchange-proto-instance.schema.json +0 -106
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claude-code-C_7hxC8z.js","names":[],"sources":["../src/parsers/stream-json.ts","../src/adapters/claude-code/flags.ts","../src/adapters/claude-code/process.ts","../src/adapters/claude-code/index.ts"],"sourcesContent":["/**\n * Line-buffered NDJSON parser for Claude Code's `--output-format stream-json`.\n *\n * Claude Code emits one JSON object per line on stdout. The parser:\n * - buffers across chunk boundaries (a single JSON line may arrive in two reads)\n * - skips empty lines (defensive — shouldn't occur, but harmless if it does)\n * - emits a discriminated `ParseResult` per line so callers can decide whether\n * a malformed line should abort the run or just be logged.\n *\n * Why a generator (and not a Transform stream)?\n * The eval adapter consumes events sequentially and synchronously updates a\n * builder. Async iteration is the simplest interface for that pattern and\n * composes cleanly with `for await` in the adapter. A Transform would force\n * the builder into event-handler style.\n */\n\nimport type { Readable } from \"node:stream\";\nimport type { StreamEvent } from \"../types/stream\";\n\n/**\n * Result of attempting to parse a single line.\n *\n * Successful parses yield `{ ok: true }` with the typed event and the raw line\n * (kept for diagnostics and OTel `events.attributes.raw`). 
Failed parses yield\n * `{ ok: false }` with the parse error and the raw line — callers can log,\n * skip, or fail the run as they see fit.\n */\nexport type ParseResult =\n | { ok: true; event: StreamEvent; rawLine: string }\n | { ok: false; error: Error; rawLine: string };\n\n/**\n * Parse a readable stream of NDJSON into a sequence of typed stream-json events.\n *\n * @example\n * const child = spawn(\"claude\", [\"-p\", prompt, \"--output-format\", \"stream-json\", \"--verbose\"]);\n * for await (const result of parseStreamJson(child.stdout)) {\n * if (result.ok) builder.consume(result.event);\n * else console.warn(\"malformed stream line:\", result.rawLine, result.error);\n * }\n */\nexport async function* parseStreamJson(\n stream: Readable,\n): AsyncGenerator<ParseResult, void, void> {\n let buffer = \"\";\n // The Node child_process stdout is a binary stream by default. Setting the\n // encoding here means `for await (const chunk of stream)` yields strings.\n stream.setEncoding(\"utf8\");\n\n for await (const chunk of stream) {\n buffer += chunk as string;\n\n // Drain every complete line currently in the buffer before reading more.\n // Multiple JSON objects can arrive in one chunk (e.g. when the harness\n // emits a burst of events at session start).\n let newlineIdx: number;\n while ((newlineIdx = buffer.indexOf(\"\\n\")) !== -1) {\n const line = buffer.slice(0, newlineIdx).trim();\n buffer = buffer.slice(newlineIdx + 1);\n if (line.length === 0) continue;\n yield tryParseLine(line);\n }\n }\n\n // Flush any trailing content that arrived without a final newline. Stream-json\n // typically ends with a newline-terminated `result` event, but a killed\n // process may not flush, so we still try to emit what we have.\n const trailing = buffer.trim();\n if (trailing.length > 0) {\n yield tryParseLine(trailing);\n }\n}\n\n/**\n * Parse a single line. 
Extracted as a helper so the generator stays readable.\n *\n * Note: we do not validate the event structure beyond `JSON.parse`. Runtime\n * validation (e.g. zod) is overkill here — the schema is stable enough at\n * runtime, and the TrajectoryBuilder is tolerant of missing fields. Adding\n * validation would be premature.\n */\nfunction tryParseLine(line: string): ParseResult {\n try {\n const event = JSON.parse(line) as StreamEvent;\n return { ok: true, event, rawLine: line };\n } catch (err) {\n return {\n ok: false,\n error: err instanceof Error ? err : new Error(String(err)),\n rawLine: line,\n };\n }\n}\n","/**\n * Build CLI args for Claude Code judge subprocesses (JSON output, not stream-json).\n *\n * Shared flag assembly for harness runs (`buildArgs`) and LLM grading judges\n * (`buildJudgeArgs`).\n */\n\nimport type { ClaudeCodeAdapterConfig, ClaudeCodeOptions } from \"./types\";\n\n/** Append repeated `--flag value` pairs for array config fields. */\nfunction pushRepeatableFlag(args: string[], flag: string, values?: string[]): void {\n if (!values) return;\n for (const value of values) {\n args.push(flag, value);\n }\n}\n\n/**\n * Append an optional CLI flag. Boolean `true` emits the flag alone; other\n * scalars emit `--flag value`.\n */\nfunction pushOptionalFlag(\n args: string[],\n flag: string,\n value: string | number | boolean | undefined,\n): void {\n if (value === undefined) return;\n if (typeof value === \"boolean\") {\n if (value) args.push(flag);\n return;\n }\n args.push(flag, String(value));\n}\n\n/** Append Claude Code CLI flags shared by harness runs and grading judges. 
*/\nexport function appendClaudeCodeFlags(\n args: string[],\n config: ClaudeCodeOptions & { model?: string },\n): void {\n pushRepeatableFlag(args, \"--plugin-dir\", config.pluginDirs);\n pushRepeatableFlag(args, \"--plugin-url\", config.pluginUrls);\n pushRepeatableFlag(args, \"--add-dir\", config.addDirs);\n\n pushOptionalFlag(args, \"--mcp-config\", config.mcpConfig);\n pushOptionalFlag(args, \"--model\", config.model);\n pushOptionalFlag(args, \"--permission-mode\", config.permissionMode);\n pushOptionalFlag(args, \"--effort\", config.effort);\n pushOptionalFlag(args, \"--agent\", config.agent);\n pushOptionalFlag(args, \"--fallback-model\", config.fallbackModel);\n pushOptionalFlag(args, \"--tools\", config.tools);\n pushOptionalFlag(args, \"--settings\", config.settings);\n pushOptionalFlag(args, \"--setting-sources\", config.settingSources);\n pushOptionalFlag(args, \"--max-turns\", config.maxTurns);\n pushOptionalFlag(args, \"--max-budget-usd\", config.maxBudgetUsd);\n pushOptionalFlag(args, \"--system-prompt\", config.systemPrompt);\n pushOptionalFlag(args, \"--system-prompt-file\", config.systemPromptFile);\n pushOptionalFlag(args, \"--append-system-prompt\", config.appendSystemPrompt);\n pushOptionalFlag(\n args,\n \"--append-system-prompt-file\",\n config.appendSystemPromptFile,\n );\n pushOptionalFlag(args, \"--debug\", config.debug);\n pushOptionalFlag(args, \"--debug-file\", config.debugFile);\n\n if (config.allowedTools && config.allowedTools.length > 0) {\n args.push(\"--allowedTools\", config.allowedTools.join(\",\"));\n }\n\n if (config.disallowedTools && config.disallowedTools.length > 0) {\n args.push(\"--disallowedTools\", config.disallowedTools.join(\",\"));\n }\n\n pushOptionalFlag(args, \"--strict-mcp-config\", config.strictMcpConfig);\n pushOptionalFlag(args, \"--include-hook-events\", config.includeHookEvents);\n pushOptionalFlag(args, \"--no-session-persistence\", config.noSessionPersistence);\n pushOptionalFlag(args, 
\"--disable-slash-commands\", config.disableSlashCommands);\n pushOptionalFlag(args, \"--bare\", config.bare);\n pushOptionalFlag(args, \"--safe-mode\", config.safeMode);\n pushOptionalFlag(\n args,\n \"--allow-dangerously-skip-permissions\",\n config.allowDangerouslySkipPermissions,\n );\n pushOptionalFlag(\n args,\n \"--dangerously-skip-permissions\",\n config.dangerouslySkipPermissions,\n );\n}\n\n/**\n * Build the argument vector for spawning `claude`.\n *\n * Order matters only for flags that take values — value flags must come\n * after their flag name. Everything else is order-independent.\n */\nexport function buildArgs(config: ClaudeCodeAdapterConfig): string[] {\n const args: string[] = [\n \"-p\",\n config.prompt,\n \"--output-format\",\n \"stream-json\",\n \"--verbose\",\n ];\n\n appendClaudeCodeFlags(args, config);\n\n return args;\n}\n\n/**\n * Build args for an LLM judge subprocess (`--output-format json`).\n *\n * Defaults permission mode to `bypassPermissions` so the judge does not\n * block on tool permission prompts during single-shot JSON grading.\n */\nexport function buildJudgeArgs(\n prompt: string,\n config: ClaudeCodeOptions & { model?: string } = {},\n): string[] {\n const args: string[] = [\"-p\", prompt, \"--output-format\", \"json\"];\n const permissionMode = config.permissionMode ?? \"bypassPermissions\";\n appendClaudeCodeFlags(args, {\n ...config,\n permissionMode,\n });\n return args;\n}\n","/**\n * Process management for the Claude Code adapter.\n *\n * This module owns spawning, timeout, abort signal handling, and process-tree\n * teardown. The orchestrator (`index.ts`) consumes the returned handle —\n * reading stdout and waiting for completion — but doesn't worry about how\n * the process gets killed or how its config gets isolated.\n *\n * Why a separate module? Process management is the one part of the adapter\n * with real I/O complexity (process groups, signal escalation, temp-dir\n * lifecycle, env merging). 
Isolating it makes the orchestrator easy to read\n * and lets us swap the spawning logic if we later need to, e.g., wrap claude\n * in a sandbox runner.\n */\n\nimport { spawn, type ChildProcess } from \"node:child_process\";\nimport { mkdtemp, rm } from \"node:fs/promises\";\nimport { tmpdir } from \"node:os\";\nimport { join } from \"node:path\";\nimport type { Readable } from \"node:stream\";\n\nimport { buildArgs } from \"./flags\";\nimport type { ClaudeCodeAdapterConfig } from \"./types\";\n\n/** Default hard timeout per run. Tunable via config.timeoutMs. */\nconst DEFAULT_TIMEOUT_MS = 5 * 60 * 1000;\n\n/**\n * Grace period between SIGTERM and SIGKILL. Most processes shut down cleanly\n * within a few seconds; this gives them that chance while preventing CI from\n * hanging indefinitely on a stuck child.\n */\nconst KILL_GRACE_MS = 5_000;\n\n/**\n * Handle to a spawned `claude` process. The orchestrator drives it:\n * - Read `stdout` (typically via parseStreamJson).\n * - Await `done` to learn the exit state.\n * - Await `stderrCollected` for diagnostic stderr.\n * - Check `timedOut()` after exit to distinguish kill-by-timeout from\n * normal termination.\n * - Call `cleanup()` after all of the above to remove the temp config dir.\n */\nexport interface SpawnedClaude {\n stdout: Readable;\n done: Promise<{ exitCode: number | null; signal: NodeJS.Signals | null }>;\n stderrCollected: Promise<string>;\n timedOut: () => boolean;\n cleanup: () => Promise<void>;\n}\n\n/**\n * Spawn `claude` in headless mode with isolated config and a process-group\n * lifecycle. See {@link SpawnedClaude} for how to consume the result.\n *\n * **Kill sequence:** timeout and abort both follow the same two-step path:\n * `SIGTERM` to the process group, then `SIGKILL` after {@link KILL_GRACE_MS}\n * if the group is still alive. 
This avoids leaving MCP/tool subprocesses\n * running while still giving claude a chance to flush stream-json output.\n *\n * @param config - Adapter options; `timeoutMs`, `signal`, and `isolateConfig`\n * control lifecycle and config isolation.\n */\nexport async function spawnClaude(\n config: ClaudeCodeAdapterConfig,\n): Promise<SpawnedClaude> {\n const binary = config.binary ?? \"claude\";\n const args = buildArgs(config);\n\n const isolateConfig = config.isolateConfig !== false;\n\n // Isolated runs use a fresh temp dir so plugins/settings don't leak between\n // reps. Non-isolated runs inherit the caller's Claude login and plugins.\n const tempConfigDir = isolateConfig\n ? await mkdtemp(join(tmpdir(), \"harness-eval-\"))\n : null;\n\n const env: Record<string, string | undefined> = {\n ...process.env,\n ...config.env,\n };\n if (tempConfigDir) {\n // Override after ...env so callers can't accidentally un-isolate.\n env.CLAUDE_CONFIG_DIR = tempConfigDir;\n }\n\n const child = spawn(binary, args, {\n cwd: config.cwd ?? process.cwd(),\n env,\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n // detached: true means the child becomes the leader of its own process\n // group. We exploit this to kill the entire group (including any MCP\n // server subprocesses and tool processes) on timeout/abort.\n detached: true,\n });\n\n\n // `timedOut` is set only by the hard timeout timer, not by abort — callers\n // use it to distinguish \"ran too long\" from user cancellation or normal exit.\n let timedOut = false;\n let killEscalation: NodeJS.Timeout | null = null;\n const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;\n\n /**\n * Arm (or re-arm) the SIGKILL fallback. 
Each SIGTERM attempt gets its own\n * grace window so a slow shutdown doesn't leave orphaned MCP servers.\n */\n const scheduleKillEscalation = () => {\n if (killEscalation) clearTimeout(killEscalation);\n killEscalation = setTimeout(\n () => killTree(child, \"SIGKILL\"),\n KILL_GRACE_MS,\n );\n };\n\n const timeoutTimer = setTimeout(() => {\n timedOut = true;\n killTree(child, \"SIGTERM\");\n scheduleKillEscalation();\n }, timeoutMs);\n\n // AbortSignal cancellation mirrors timeout kills but does not flip `timedOut`.\n const onAbort = () => {\n killTree(child, \"SIGTERM\");\n scheduleKillEscalation();\n };\n config.signal?.addEventListener(\"abort\", onAbort, { once: true });\n\n\n // Drain stderr eagerly so the OS-level buffer never fills and stalls the\n // child (Node child processes will block on a full pipe).\n const stderrChunks: string[] = [];\n child.stderr?.setEncoding(\"utf8\");\n child.stderr?.on(\"data\", (chunk: string) => {\n stderrChunks.push(chunk);\n });\n\n const stderrCollected = new Promise<string>((resolve) => {\n const finalize = () => resolve(stderrChunks.join(\"\"));\n child.stderr?.on(\"end\", finalize);\n // Errors during stderr capture shouldn't fail the whole run; we just\n // return what we've buffered so far.\n child.stderr?.on(\"error\", finalize);\n });\n\n\n // Resolve once the process exits or fails to spawn. 
Guard against double\n // settlement because both `close` and `error` can fire in edge cases.\n const done = new Promise<{\n exitCode: number | null;\n signal: NodeJS.Signals | null;\n }>((resolve) => {\n let settled = false;\n const finalize = (\n exitCode: number | null,\n signal: NodeJS.Signals | null,\n ) => {\n if (settled) return;\n settled = true;\n // Tear down timers/listeners so a late timeout cannot SIGKILL a reused PID.\n clearTimeout(timeoutTimer);\n if (killEscalation) clearTimeout(killEscalation);\n config.signal?.removeEventListener(\"abort\", onAbort);\n resolve({ exitCode, signal });\n };\n\n child.on(\"close\", (code, signal) => finalize(code, signal));\n // ENOENT and other spawn failures emit `error` — `close` may not follow.\n child.on(\"error\", () => finalize(null, null));\n });\n\n\n const cleanup = async () => {\n if (!tempConfigDir) return;\n try {\n await rm(tempConfigDir, { recursive: true, force: true });\n } catch {\n // Best-effort. A leftover temp dir is annoying but not catastrophic;\n // we don't want to fail the run for it.\n }\n };\n\n // stdout is guaranteed non-null because we passed `stdio: [..., \"pipe\", ...]`.\n // The `!` is safe; the alternative would be a redundant runtime check that\n // could never fire.\n return {\n stdout: child.stdout!,\n done,\n stderrCollected,\n timedOut: () => timedOut,\n cleanup,\n };\n}\n\n/**\n * Kill the child's process group, then fall back to the bare PID if the\n * group is already gone. This catches MCP server subprocesses and tool\n * processes spawned by claude.\n *\n * **Signal escalation:** callers typically invoke this first with `SIGTERM`,\n * then again with `SIGKILL` after {@link KILL_GRACE_MS}. The group kill is\n * essential — a bare `child.kill()` would leave MCP servers running.\n *\n * **Platform edge case:** when the group leader exits first, `kill(-pid)`\n * throws `ESRCH`. 
The single-PID fallback covers that without failing the\n * adapter run.\n *\n * @param child - Spawned process handle from {@link spawn}.\n * @param signal - POSIX signal to deliver (`SIGTERM` or `SIGKILL` in practice).\n */\nfunction killTree(child: ChildProcess, signal: NodeJS.Signals): void {\n if (child.pid === undefined) return;\n try {\n // Negative PID targets the entire process group (requires detached spawn).\n process.kill(-child.pid, signal);\n } catch {\n try {\n // Group already reaped — try the leader PID directly.\n child.kill(signal);\n } catch {\n // Process fully gone; nothing to do.\n }\n }\n}\n","/**\n * Claude Code adapter — public API.\n */\n\nimport { parseStreamJson } from \"../../parsers/stream-json\";\nimport { TrajectoryBuilder } from \"../../trajectory/builder\";\nimport type { StreamEvent } from \"../../types/stream\";\n\nimport { AdapterError } from \"../types\";\nimport { spawnClaude } from \"./process\";\nimport type {\n AdapterDiagnostics,\n ClaudeCodeAdapterConfig,\n ClaudeCodeAdapterResult,\n ParseErrorRecord,\n} from \"./types\";\nimport type { HarnessAdapter } from \"../types\";\n\nexport { AdapterError } from \"../types\";\nexport type {\n AdapterDiagnostics,\n AdapterResult,\n ClaudeCodeAdapterConfig,\n ClaudeCodeAdapterResult,\n ClaudeCodeOptions,\n ParseErrorRecord,\n PermissionMode,\n} from \"./types\";\n\n/**\n * Run Claude Code in headless mode and return a trajectory.\n */\nexport async function runClaudeCode(\n config: ClaudeCodeAdapterConfig,\n): Promise<ClaudeCodeAdapterResult> {\n const startTs = Date.now();\n const spawned = await spawnClaude(config);\n\n const builder = new TrajectoryBuilder();\n const rawEvents: StreamEvent[] = [];\n const parseErrors: ParseErrorRecord[] = [];\n\n try {\n for await (const result of parseStreamJson(spawned.stdout)) {\n if (result.ok) {\n builder.consume(result.event);\n rawEvents.push(result.event);\n } else {\n parseErrors.push({\n line: result.rawLine,\n error: 
result.error.message,\n });\n }\n }\n\n const [{ exitCode, signal }, stderr] = await Promise.all([\n spawned.done,\n spawned.stderrCollected,\n ]);\n\n const diagnostics: AdapterDiagnostics = {\n exitCode,\n signal,\n stderr,\n parseErrors,\n timedOut: spawned.timedOut(),\n durationMs: Date.now() - startTs,\n };\n\n let view;\n try {\n view = builder.build();\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n throw new AdapterError(\n `harness produced no usable trajectory: ${message}`,\n diagnostics,\n );\n }\n\n return { view, diagnostics, rawEvents };\n } finally {\n await spawned.cleanup();\n }\n}\n\n/** Registered {@link HarnessAdapter} for Claude Code headless runs. */\nexport const claudeCodeAdapter: HarnessAdapter<ClaudeCodeAdapterConfig> = {\n id: \"claude-code\",\n run: runClaudeCode,\n};\n"],"mappings":";;;;;;;;;;;;;;;;;AAyCA,gBAAuB,gBACrB,QACyC;CACzC,IAAI,SAAS;CAGb,OAAO,YAAY,MAAM;CAEzB,WAAW,MAAM,SAAS,QAAQ;EAChC,UAAU;EAKV,IAAI;EACJ,QAAQ,aAAa,OAAO,QAAQ,IAAI,OAAO,IAAI;GACjD,MAAM,OAAO,OAAO,MAAM,GAAG,UAAU,CAAC,CAAC,KAAK;GAC9C,SAAS,OAAO,MAAM,aAAa,CAAC;GACpC,IAAI,KAAK,WAAW,GAAG;GACvB,MAAM,aAAa,IAAI;EACzB;CACF;CAKA,MAAM,WAAW,OAAO,KAAK;CAC7B,IAAI,SAAS,SAAS,GACpB,MAAM,aAAa,QAAQ;AAE/B;;;;;;;;;AAUA,SAAS,aAAa,MAA2B;CAC/C,IAAI;EAEF,OAAO;GAAE,IAAI;GAAM,OADL,KAAK,MAAM,IACF;GAAG,SAAS;EAAK;CAC1C,SAAS,KAAK;EACZ,OAAO;GACL,IAAI;GACJ,OAAO,eAAe,QAAQ,MAAM,IAAI,MAAM,OAAO,GAAG,CAAC;GACzD,SAAS;EACX;CACF;AACF;;;;AClFA,SAAS,mBAAmB,MAAgB,MAAc,QAAyB;CACjF,IAAI,CAAC,QAAQ;CACb,KAAK,MAAM,SAAS,QAClB,KAAK,KAAK,MAAM,KAAK;AAEzB;;;;;AAMA,SAAS,iBACP,MACA,MACA,OACM;CACN,IAAI,UAAU,KAAA,GAAW;CACzB,IAAI,OAAO,UAAU,WAAW;EAC9B,IAAI,OAAO,KAAK,KAAK,IAAI;EACzB;CACF;CACA,KAAK,KAAK,MAAM,OAAO,KAAK,CAAC;AAC/B;;AAGA,SAAgB,sBACd,MACA,QACM;CACN,mBAAmB,MAAM,gBAAgB,OAAO,UAAU;CAC1D,mBAAmB,MAAM,gBAAgB,OAAO,UAAU;CAC1D,mBAAmB,MAAM,aAAa,OAAO,OAAO;CAEpD,iBAAiB,MAAM,gBAAgB,OAAO,SAAS;CACvD,iBAAiB,MAAM,WAAW,OAAO,KAAK;CAC9C,iBAAiB,MAAM,qBAAqB,OAAO,cAAc;CACjE,iBAAiB,MAAM,YAAY,OAAO,MAAM;
CAChD,iBAAiB,MAAM,WAAW,OAAO,KAAK;CAC9C,iBAAiB,MAAM,oBAAoB,OAAO,aAAa;CAC/D,iBAAiB,MAAM,WAAW,OAAO,KAAK;CAC9C,iBAAiB,MAAM,cAAc,OAAO,QAAQ;CACpD,iBAAiB,MAAM,qBAAqB,OAAO,cAAc;CACjE,iBAAiB,MAAM,eAAe,OAAO,QAAQ;CACrD,iBAAiB,MAAM,oBAAoB,OAAO,YAAY;CAC9D,iBAAiB,MAAM,mBAAmB,OAAO,YAAY;CAC7D,iBAAiB,MAAM,wBAAwB,OAAO,gBAAgB;CACtE,iBAAiB,MAAM,0BAA0B,OAAO,kBAAkB;CAC1E,iBACE,MACA,+BACA,OAAO,sBACT;CACA,iBAAiB,MAAM,WAAW,OAAO,KAAK;CAC9C,iBAAiB,MAAM,gBAAgB,OAAO,SAAS;CAEvD,IAAI,OAAO,gBAAgB,OAAO,aAAa,SAAS,GACtD,KAAK,KAAK,kBAAkB,OAAO,aAAa,KAAK,GAAG,CAAC;CAG3D,IAAI,OAAO,mBAAmB,OAAO,gBAAgB,SAAS,GAC5D,KAAK,KAAK,qBAAqB,OAAO,gBAAgB,KAAK,GAAG,CAAC;CAGjE,iBAAiB,MAAM,uBAAuB,OAAO,eAAe;CACpE,iBAAiB,MAAM,yBAAyB,OAAO,iBAAiB;CACxE,iBAAiB,MAAM,4BAA4B,OAAO,oBAAoB;CAC9E,iBAAiB,MAAM,4BAA4B,OAAO,oBAAoB;CAC9E,iBAAiB,MAAM,UAAU,OAAO,IAAI;CAC5C,iBAAiB,MAAM,eAAe,OAAO,QAAQ;CACrD,iBACE,MACA,wCACA,OAAO,+BACT;CACA,iBACE,MACA,kCACA,OAAO,0BACT;AACF;;;;;;;AAQA,SAAgB,UAAU,QAA2C;CACnE,MAAM,OAAiB;EACrB;EACA,OAAO;EACP;EACA;EACA;CACF;CAEA,sBAAsB,MAAM,MAAM;CAElC,OAAO;AACT;;;;;;;AAQA,SAAgB,eACd,QACA,SAAiD,CAAC,GACxC;CACV,MAAM,OAAiB;EAAC;EAAM;EAAQ;EAAmB;CAAM;CAC/D,MAAM,iBAAiB,OAAO,kBAAkB;CAChD,sBAAsB,MAAM;EAC1B,GAAG;EACH;CACF,CAAC;CACD,OAAO;AACT;;;;;;;;;;;;;;;;;;ACvGA,MAAM,qBAAqB,MAAS;;;;;;AAOpC,MAAM,gBAAgB;;;;;;;;;;;;;AA+BtB,eAAsB,YACpB,QACwB;CACxB,MAAM,SAAS,OAAO,UAAU;CAChC,MAAM,OAAO,UAAU,MAAM;CAM7B,MAAM,gBAJgB,OAAO,kBAAkB,QAK3C,MAAM,QAAQ,KAAK,OAAO,GAAG,eAAe,CAAC,IAC7C;CAEJ,MAAM,MAA0C;EAC9C,GAAG,QAAQ;EACX,GAAG,OAAO;CACZ;CACA,IAAI,eAEF,IAAI,oBAAoB;CAG1B,MAAM,QAAQ,MAAM,QAAQ,MAAM;EAChC,KAAK,OAAO,OAAO,QAAQ,IAAI;EAC/B;EACA,OAAO;GAAC;GAAU;GAAQ;EAAM;EAIhC,UAAU;CACZ,CAAC;CAKD,IAAI,WAAW;CACf,IAAI,iBAAwC;CAC5C,MAAM,YAAY,OAAO,aAAa;;;;;CAMtC,MAAM,+BAA+B;EACnC,IAAI,gBAAgB,aAAa,cAAc;EAC/C,iBAAiB,iBACT,SAAS,OAAO,SAAS,GAC/B,aACF;CACF;CAEA,MAAM,eAAe,iBAAiB;EACpC,WAAW;EACX,SAAS,OAAO,SAAS;EACzB,uBAAuB;CACzB,GAAG,SAAS;CAGZ,MAAM,gBAAgB;EACpB,SAAS,OAAO,SAAS;EACzB,uBAAuB;CACzB;CACA,OAAO,QAAQ,iBAAiB,SAAS,SAAS,EAAE,MAAM,KAAK,CAAC;CAKhE,MAAM,eAAyB,CAAC;C
AChC,MAAM,QAAQ,YAAY,MAAM;CAChC,MAAM,QAAQ,GAAG,SAAS,UAAkB;EAC1C,aAAa,KAAK,KAAK;CACzB,CAAC;CAED,MAAM,kBAAkB,IAAI,SAAiB,YAAY;EACvD,MAAM,iBAAiB,QAAQ,aAAa,KAAK,EAAE,CAAC;EACpD,MAAM,QAAQ,GAAG,OAAO,QAAQ;EAGhC,MAAM,QAAQ,GAAG,SAAS,QAAQ;CACpC,CAAC;CAKD,MAAM,OAAO,IAAI,SAGb,YAAY;EACd,IAAI,UAAU;EACd,MAAM,YACJ,UACA,WACG;GACH,IAAI,SAAS;GACb,UAAU;GAEV,aAAa,YAAY;GACzB,IAAI,gBAAgB,aAAa,cAAc;GAC/C,OAAO,QAAQ,oBAAoB,SAAS,OAAO;GACnD,QAAQ;IAAE;IAAU;GAAO,CAAC;EAC9B;EAEA,MAAM,GAAG,UAAU,MAAM,WAAW,SAAS,MAAM,MAAM,CAAC;EAE1D,MAAM,GAAG,eAAe,SAAS,MAAM,IAAI,CAAC;CAC9C,CAAC;CAGD,MAAM,UAAU,YAAY;EAC1B,IAAI,CAAC,eAAe;EACpB,IAAI;GACF,MAAM,GAAG,eAAe;IAAE,WAAW;IAAM,OAAO;GAAK,CAAC;EAC1D,QAAQ,CAGR;CACF;CAKA,OAAO;EACL,QAAQ,MAAM;EACd;EACA;EACA,gBAAgB;EAChB;CACF;AACF;;;;;;;;;;;;;;;;;AAkBA,SAAS,SAAS,OAAqB,QAA8B;CACnE,IAAI,MAAM,QAAQ,KAAA,GAAW;CAC7B,IAAI;EAEF,QAAQ,KAAK,CAAC,MAAM,KAAK,MAAM;CACjC,QAAQ;EACN,IAAI;GAEF,MAAM,KAAK,MAAM;EACnB,QAAQ,CAER;CACF;AACF;;;;;;;;;;;;;;AC/LA,eAAsB,cACpB,QACkC;CAClC,MAAM,UAAU,KAAK,IAAI;CACzB,MAAM,UAAU,MAAM,YAAY,MAAM;CAExC,MAAM,UAAU,IAAI,kBAAkB;CACtC,MAAM,YAA2B,CAAC;CAClC,MAAM,cAAkC,CAAC;CAEzC,IAAI;EACF,WAAW,MAAM,UAAU,gBAAgB,QAAQ,MAAM,GACvD,IAAI,OAAO,IAAI;GACb,QAAQ,QAAQ,OAAO,KAAK;GAC5B,UAAU,KAAK,OAAO,KAAK;EAC7B,OACE,YAAY,KAAK;GACf,MAAM,OAAO;GACb,OAAO,OAAO,MAAM;EACtB,CAAC;EAIL,MAAM,CAAC,EAAE,UAAU,UAAU,UAAU,MAAM,QAAQ,IAAI,CACvD,QAAQ,MACR,QAAQ,eACV,CAAC;EAED,MAAM,cAAkC;GACtC;GACA;GACA;GACA;GACA,UAAU,QAAQ,SAAS;GAC3B,YAAY,KAAK,IAAI,IAAI;EAC3B;EAEA,IAAI;EACJ,IAAI;GACF,OAAO,QAAQ,MAAM;EACvB,SAAS,KAAK;GAEZ,MAAM,IAAI,aACR,0CAFc,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,KAG7D,WACF;EACF;EAEA,OAAO;GAAE;GAAM;GAAa;EAAU;CACxC,UAAU;EACR,MAAM,QAAQ,QAAQ;CACxB;AACF;;AAGA,MAAa,oBAA6D;CACxE,IAAI;CACJ,KAAK;AACP"}
|
package/dist/cli/bin.js
CHANGED
|
@@ -1,129 +1,19 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import {
|
|
3
|
-
import { t as runSuite, u as getAdapter } from "../suite-
|
|
4
|
-
import { i as loadGradingConfig, t as loadSuite } from "../loader-
|
|
2
|
+
import { F as loadSuiteReport, M as gradingReportPassed, N as resolveGradeOptions, P as gradeReport, a as envelopeCommand, c as getOptionInt, i as runPipeline, j as formatGradingConsole, l as hasOption, o as parseEnvelopeProjection, p as suiteDirectoryFromPath, r as trajectoryToOtlp, s as getOption, t as formatReport, u as parseArgs } from "../reporter-Biy-5-9M.js";
|
|
3
|
+
import { t as runSuite, u as getAdapter } from "../suite-BcP64nlb.js";
|
|
4
|
+
import { i as loadGradingConfig, o as loadSuiteDocument, t as loadSuite } from "../loader-DnQ6Jt0i.js";
|
|
5
5
|
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
6
|
-
import { dirname, join } from "node:path";
|
|
6
|
+
import { dirname, isAbsolute, join } from "node:path";
|
|
7
7
|
import { fileURLToPath } from "node:url";
|
|
8
|
-
//#region src/cli/args.ts
|
|
9
|
-
function parseArgs(argv) {
|
|
10
|
-
const positional = [];
|
|
11
|
-
const options = {};
|
|
12
|
-
let command;
|
|
13
|
-
const args = [...argv];
|
|
14
|
-
if (args.length > 0 && !args[0].startsWith("-")) command = args.shift();
|
|
15
|
-
for (let i = 0; i < args.length; i++) {
|
|
16
|
-
const arg = args[i];
|
|
17
|
-
if (arg === "--") {
|
|
18
|
-
positional.push(...args.slice(i + 1));
|
|
19
|
-
break;
|
|
20
|
-
}
|
|
21
|
-
if (arg.startsWith("--")) {
|
|
22
|
-
const key = arg.slice(2);
|
|
23
|
-
const next = args[i + 1];
|
|
24
|
-
if (next && !next.startsWith("-")) {
|
|
25
|
-
options[key] = next;
|
|
26
|
-
i++;
|
|
27
|
-
} else options[key] = true;
|
|
28
|
-
} else if (arg.startsWith("-") && arg.length === 2) {
|
|
29
|
-
const key = arg.slice(1);
|
|
30
|
-
const next = args[i + 1];
|
|
31
|
-
if (next && !next.startsWith("-")) {
|
|
32
|
-
options[key] = next;
|
|
33
|
-
i++;
|
|
34
|
-
} else options[key] = true;
|
|
35
|
-
} else positional.push(arg);
|
|
36
|
-
}
|
|
37
|
-
return {
|
|
38
|
-
command,
|
|
39
|
-
positional,
|
|
40
|
-
options
|
|
41
|
-
};
|
|
42
|
-
}
|
|
43
|
-
function getOption(options, name) {
|
|
44
|
-
const v = options[name];
|
|
45
|
-
return typeof v === "string" ? v : void 0;
|
|
46
|
-
}
|
|
47
|
-
function getOptionInt(options, name, defaultValue) {
|
|
48
|
-
const v = getOption(options, name);
|
|
49
|
-
if (v === void 0) return defaultValue;
|
|
50
|
-
const n = Number.parseInt(v, 10);
|
|
51
|
-
if (!Number.isFinite(n)) return defaultValue;
|
|
52
|
-
return n;
|
|
53
|
-
}
|
|
54
|
-
function hasOption(options, name) {
|
|
55
|
-
const v = options[name];
|
|
56
|
-
return v === true || typeof v === "string" && v === "true";
|
|
57
|
-
}
|
|
58
|
-
//#endregion
|
|
59
|
-
//#region src/cli/commands/envelope.ts
|
|
60
|
-
/**
|
|
61
|
-
* `harness-eval envelope` — build EvalRunEnvelope and interchange projections.
|
|
62
|
-
*/
|
|
63
|
-
const PROJECTIONS = /* @__PURE__ */ new Set([
|
|
64
|
-
"envelope",
|
|
65
|
-
"trajectory",
|
|
66
|
-
"instances",
|
|
67
|
-
"agent-trace"
|
|
68
|
-
]);
|
|
69
|
-
function parseEnvelopeProjection(value) {
|
|
70
|
-
if (value === void 0) return "envelope";
|
|
71
|
-
if (PROJECTIONS.has(value)) return value;
|
|
72
|
-
}
|
|
73
|
-
function serializeEnvelopeProjection(envelope, projection) {
|
|
74
|
-
switch (projection) {
|
|
75
|
-
case "trajectory": return `${toTrajectory(envelope).map((row) => JSON.stringify(row)).join("\n")}\n`;
|
|
76
|
-
case "instances": return `${JSON.stringify(toProtoInstances(envelope), null, 2)}\n`;
|
|
77
|
-
case "agent-trace": return `${JSON.stringify(toAgentTrace(envelope), null, 2)}\n`;
|
|
78
|
-
default: return `${JSON.stringify(envelope, null, 2)}\n`;
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
async function readFrameworkVersion() {
|
|
82
|
-
try {
|
|
83
|
-
const text = await readFile(join(dirname(fileURLToPath(import.meta.url)), "../../../package.json"), "utf8");
|
|
84
|
-
return JSON.parse(text).version;
|
|
85
|
-
} catch {
|
|
86
|
-
return;
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
async function envelopeCommand(args) {
|
|
90
|
-
const reportPath = args.positional[0];
|
|
91
|
-
if (!reportPath) {
|
|
92
|
-
console.error("usage: harness-eval envelope <report.json> [--output path] [--grading path] [--suite path] [--projection envelope|trajectory|instances|agent-trace] [--include-raw-stream-events] [--no-transcript]");
|
|
93
|
-
return 2;
|
|
94
|
-
}
|
|
95
|
-
const outputPath = getOption(args.options, "output");
|
|
96
|
-
const gradingPath = getOption(args.options, "grading");
|
|
97
|
-
const suitePath = getOption(args.options, "suite");
|
|
98
|
-
const projection = parseEnvelopeProjection(getOption(args.options, "projection"));
|
|
99
|
-
if (!projection) {
|
|
100
|
-
console.error("invalid --projection; expected envelope, trajectory, instances, or agent-trace");
|
|
101
|
-
return 2;
|
|
102
|
-
}
|
|
103
|
-
let envelope;
|
|
104
|
-
try {
|
|
105
|
-
const frameworkVersion = await readFrameworkVersion();
|
|
106
|
-
envelope = await buildEvalRunEnvelopeFromFiles(reportPath, {
|
|
107
|
-
gradingPath,
|
|
108
|
-
suitePath,
|
|
109
|
-
includeTranscript: !hasOption(args.options, "no-transcript"),
|
|
110
|
-
includeRawStreamEvents: hasOption(args.options, "include-raw-stream-events"),
|
|
111
|
-
harness: { frameworkVersion }
|
|
112
|
-
});
|
|
113
|
-
} catch (err) {
|
|
114
|
-
console.error(err instanceof Error ? err.message : String(err));
|
|
115
|
-
return 2;
|
|
116
|
-
}
|
|
117
|
-
const serialized = serializeEnvelopeProjection(envelope, projection);
|
|
118
|
-
if (outputPath) await writeFile(outputPath, serialized, "utf8");
|
|
119
|
-
else process.stdout.write(serialized);
|
|
120
|
-
return envelope.summary.behavioralPass ? 0 : 1;
|
|
121
|
-
}
|
|
122
|
-
//#endregion
|
|
123
8
|
//#region src/cli/commands/format.ts
|
|
124
9
|
/**
|
|
125
10
|
* `harness-eval format` command.
|
|
126
11
|
*/
|
|
12
|
+
/**
|
|
13
|
+
* Execute `harness-eval format`: re-render a saved report JSON.
|
|
14
|
+
*
|
|
15
|
+
* @returns 0 when all cells pass, 1 otherwise, 2 on load errors.
|
|
16
|
+
*/
|
|
127
17
|
async function formatCommand(args) {
|
|
128
18
|
const reportPath = args.positional[0];
|
|
129
19
|
if (!reportPath) {
|
|
@@ -152,11 +42,17 @@ async function formatCommand(args) {
|
|
|
152
42
|
}
|
|
153
43
|
//#endregion
|
|
154
44
|
//#region src/cli/progress.ts
|
|
45
|
+
/**
 * ANSI SGR escape sequences used to colorize progress output.
 * Callers only emit them when color is enabled (see {@link resolveProgressColor}).
 */
const GREEN = "\x1B[32m",
	RED = "\x1B[31m",
	YELLOW = "\x1B[33m",
	DIM = "\x1B[2m",
	RESET = "\x1B[0m";
|
|
51
|
+
/**
|
|
52
|
+
* Resolve progress mode from `--progress`, `--quiet`, or `--verbose` flags.
|
|
53
|
+
*
|
|
54
|
+
* Explicit `--progress` wins; otherwise `--quiet` / `--verbose` map to modes.
|
|
55
|
+
*/
|
|
160
56
|
function resolveProgressMode(options) {
|
|
161
57
|
const progress = getOption(options, "progress");
|
|
162
58
|
if (progress === "json" || progress === "quiet" || progress === "verbose" || progress === "default") return progress;
|
|
@@ -164,7 +60,12 @@ function resolveProgressMode(options) {
|
|
|
164
60
|
if (hasOption(options, "verbose")) return "verbose";
|
|
165
61
|
return "default";
|
|
166
62
|
}
|
|
167
|
-
/**
|
|
63
|
+
/**
|
|
64
|
+
* Whether to emit ANSI colors on the progress stream (stderr).
|
|
65
|
+
*
|
|
66
|
+
* Precedence: `--no-color` → off; `--color` → on; `NO_COLOR` env → off;
|
|
67
|
+
* `FORCE_COLOR` (non-zero) → on; otherwise TTY detection on `stream`.
|
|
68
|
+
*/
|
|
168
69
|
function resolveProgressColor(options, stream = process.stderr) {
|
|
169
70
|
if (hasOption(options, "no-color")) return false;
|
|
170
71
|
if (hasOption(options, "color")) return true;
|
|
@@ -172,24 +73,35 @@ function resolveProgressColor(options, stream = process.stderr) {
|
|
|
172
73
|
if (process.env.FORCE_COLOR !== void 0 && process.env.FORCE_COLOR !== "0") return true;
|
|
173
74
|
return "isTTY" in stream && stream.isTTY === true;
|
|
174
75
|
}
|
|
76
|
+
/** Success glyph (`✓`); wrapped in green SGR codes when `color` is truthy. */
function okMark(color) {
	if (!color) return "✓";
	return `${GREEN}✓${RESET}`;
}
|
|
80
|
+
/** Failure glyph (`✗`); wrapped in red SGR codes when `color` is truthy. */
function failMark(color) {
	if (!color) return "✗";
	return `${RED}✗${RESET}`;
}
|
|
84
|
+
/** Lowercase `ok` status word; green when `color` is truthy. */
function okStatus(color) {
	if (!color) return "ok";
	return `${GREEN}ok${RESET}`;
}
|
|
88
|
+
/** Uppercase `FAIL` status word; red when `color` is truthy. */
function failStatus(color) {
	if (!color) return "FAIL";
	return `${RED}FAIL${RESET}`;
}
|
|
92
|
+
/** Uppercase `PASS` label used by {@link formatCellSummary}; green when `color` is truthy. */
function passLabel(color) {
	if (!color) return "PASS";
	return `${GREEN}PASS${RESET}`;
}
|
|
96
|
+
/** Uppercase `FAIL` label used by {@link formatCellSummary}; red when `color` is truthy. */
function failLabel(color) {
	if (!color) return "FAIL";
	return `${RED}FAIL${RESET}`;
}
|
|
100
|
+
/**
|
|
101
|
+
* Build a {@link ProgressCallback} for suite runs.
|
|
102
|
+
*
|
|
103
|
+
* Writes to `options.stream` (default stderr). JSON mode emits one event per line.
|
|
104
|
+
*/
|
|
193
105
|
function createRunProgressHandler(options) {
|
|
194
106
|
const stream = options.stream ?? process.stderr;
|
|
195
107
|
const mode = options.mode;
|
|
@@ -295,6 +207,7 @@ function createRunProgressHandler(options) {
|
|
|
295
207
|
}
|
|
296
208
|
};
|
|
297
209
|
}
|
|
210
|
+
/** Build a progress handler for outcome grading ({@link GradeProgressEvent}). */
|
|
298
211
|
function createGradeProgressHandler(options) {
|
|
299
212
|
const stream = options.stream ?? process.stderr;
|
|
300
213
|
const mode = options.mode;
|
|
@@ -375,9 +288,16 @@ function createGradeProgressHandler(options) {
|
|
|
375
288
|
}
|
|
376
289
|
};
|
|
377
290
|
}
|
|
291
|
+
/**
|
|
292
|
+
* Write one NDJSON progress event line to the progress stream.
|
|
293
|
+
*
|
|
294
|
+
* JSON mode keeps stdout clean for machine-readable reports while still
|
|
295
|
+
* exposing structured progress for CI log parsers.
|
|
296
|
+
*/
|
|
378
297
|
function writeJson(stream, value) {
|
|
379
298
|
stream.write(`${JSON.stringify(value)}\n`);
|
|
380
299
|
}
|
|
300
|
+
/** Format milliseconds as a human-readable duration string. */
|
|
381
301
|
function formatDuration(ms) {
|
|
382
302
|
if (ms < 1e3) return `${ms}ms`;
|
|
383
303
|
const sec = ms / 1e3;
|
|
@@ -387,20 +307,33 @@ function formatDuration(ms) {
|
|
|
387
307
|
if (min < 60) return `${min}m ${remSec}s`;
|
|
388
308
|
return `${Math.floor(min / 60)}h ${min % 60}m`;
|
|
389
309
|
}
|
|
310
|
+
/**
 * Estimate the time left from the mean duration of the repetitions finished so far.
 *
 * @param totalDurationMs Summed duration of the completed repetitions, in milliseconds.
 * @param completed       Number of repetitions finished.
 * @param total           Number of repetitions planned.
 * @returns `~<duration> remaining`, or `undefined` when nothing has completed yet
 *          or when `completed` has reached `total`.
 */
function formatEta(totalDurationMs, completed, total) {
	// No sample to average yet, or nothing left to estimate.
	if (completed === 0 || completed >= total) return void 0;
	const meanMs = totalDurationMs / completed;
	const pendingReps = total - completed;
	const etaMs = Math.round(pendingReps * meanMs);
	return `~${formatDuration(etaMs)} remaining`;
}
|
|
322
|
+
/**
 * Shorten `text` to at most `max` UTF-16 code units for single-line progress rows.
 *
 * Text that already fits is returned unchanged. Otherwise the result is the
 * leading part of `text` followed by a Unicode ellipsis (`…`).
 *
 * @param text Text to shorten.
 * @param max  Maximum length of the result; values `<= 0` yield an empty string.
 * @returns The original or shortened text.
 */
function truncate(text, max) {
	if (text.length <= max) return text;
	// A non-positive budget would turn `slice(0, max - 1)` into a negative-index
	// slice that returns almost the whole string; nothing fits, so return nothing.
	if (max <= 0) return "";
	let end = max - 1;
	// Do not cut between the two halves of a surrogate pair: drop a dangling
	// high surrogate so the output never contains a broken character.
	const last = text.charCodeAt(end - 1);
	if (end > 0 && last >= 55296 && last <= 56319) end -= 1;
	return `${text.slice(0, end)}…`;
}
|
|
327
|
+
/**
 * Render assertion results as a compact one-line summary.
 *
 * @param results Assertion results, each read for `passed` and `description`; may be absent or empty.
 * @param color   Whether the pass/fail glyphs are colorized (defaults to `false`).
 * @returns `✓ description` / `✗ description` fragments joined by `", "`, or `""` when there are no results.
 */
function formatAssertionSummary(results, color = false) {
	if (!results || results.length === 0) return "";
	const renderOne = (result) => {
		const glyph = result.passed ? okMark(color) : failMark(color);
		return `${glyph} ${result.description}`;
	};
	return results.map(renderOne).join(", ");
}
|
|
336
|
+
/** One-line summary when a matrix cell finishes (used in default progress mode). */
|
|
404
337
|
function formatCellSummary(cell, color) {
|
|
405
338
|
const mark = cell.passed ? okMark(color) : failMark(color);
|
|
406
339
|
const status = cell.passed ? passLabel(color) : failLabel(color);
|
|
@@ -417,19 +350,26 @@ function formatCellSummary(cell, color) {
|
|
|
417
350
|
/**
|
|
418
351
|
* `harness-eval grade` — LLM outcome grading on a suite report.
|
|
419
352
|
*/
|
|
353
|
+
/**
 * Parse an optional integer CLI flag.
 *
 * Only a plain base-10 integer (optional sign, digits, surrounding whitespace
 * allowed) is accepted. `Number.parseInt` alone would silently accept values
 * such as `"30s"` (→ 30) or `"1.5"` (→ 1), so the text is validated first.
 *
 * @param options Parsed CLI options, as consumed by `getOption`.
 * @param name    Flag name without leading dashes.
 * @returns The parsed integer, or `undefined` when the flag is absent or not a valid integer.
 */
function optionalOptionInt(options, name) {
	const raw = getOption(options, name);
	if (raw === void 0) return void 0;
	const text = String(raw).trim();
	if (!/^[+-]?\d+$/.test(text)) return void 0;
	const n = Number.parseInt(text, 10);
	return Number.isFinite(n) ? n : void 0;
}
|
|
360
|
+
/**
|
|
361
|
+
* Execute `harness-eval grade`: LLM outcome grading on a suite report JSON.
|
|
362
|
+
*
|
|
363
|
+
* @returns 0 when all expectations pass, 1 on failure, 2 on usage/load errors or no reps graded.
|
|
364
|
+
*/
|
|
426
365
|
async function gradeCommand(args) {
|
|
427
366
|
const reportPath = args.positional[0];
|
|
428
367
|
if (!reportPath) {
|
|
429
|
-
console.error("usage: harness-eval grade <report.json> [--config grading.yaml] [--expectations path] [--output path] [--model id] [--timeout-ms N] [--max-concurrent N]");
|
|
368
|
+
console.error("usage: harness-eval grade <report.json> [--config grading.yaml] [--suite suite.yaml] [--expectations path] [--output path] [--model id] [--timeout-ms N] [--max-concurrent N]");
|
|
430
369
|
return 2;
|
|
431
370
|
}
|
|
432
371
|
const configPath = getOption(args.options, "config");
|
|
372
|
+
const suitePath = getOption(args.options, "suite");
|
|
433
373
|
const expectationsPath = getOption(args.options, "expectations");
|
|
434
374
|
const outputPath = getOption(args.options, "output");
|
|
435
375
|
const model = getOption(args.options, "model");
|
|
@@ -440,8 +380,13 @@ async function gradeCommand(args) {
|
|
|
440
380
|
const progressMode = resolveProgressMode(args.options);
|
|
441
381
|
const useProgressColor = progressMode !== "json" && resolveProgressColor(args.options);
|
|
442
382
|
let fileConfig;
|
|
443
|
-
|
|
444
|
-
|
|
383
|
+
const gradingConfigPath = configPath ?? suitePath;
|
|
384
|
+
if (configPath && suitePath) {
|
|
385
|
+
console.error("grade: use only one of --config or --suite");
|
|
386
|
+
return 2;
|
|
387
|
+
}
|
|
388
|
+
if (gradingConfigPath) try {
|
|
389
|
+
fileConfig = await loadGradingConfig(gradingConfigPath);
|
|
445
390
|
} catch (err) {
|
|
446
391
|
console.error(err instanceof Error ? err.message : String(err));
|
|
447
392
|
return 2;
|
|
@@ -462,7 +407,7 @@ async function gradeCommand(args) {
|
|
|
462
407
|
binary,
|
|
463
408
|
timeoutMs,
|
|
464
409
|
maxConcurrent
|
|
465
|
-
}, configPath);
|
|
410
|
+
}, configPath ?? suitePath);
|
|
466
411
|
} catch (err) {
|
|
467
412
|
console.error(err instanceof Error ? err.message : String(err));
|
|
468
413
|
return 2;
|
|
@@ -489,10 +434,125 @@ async function gradeCommand(args) {
|
|
|
489
434
|
return gradingReportPassed(grading) ? 0 : 1;
|
|
490
435
|
}
|
|
491
436
|
//#endregion
|
|
437
|
+
//#region src/cli/commands/pipeline.ts
|
|
438
|
+
/**
|
|
439
|
+
* `harness-eval pipeline` — orchestrate run → grade → envelope from suite.yaml.
|
|
440
|
+
*/
|
|
441
|
+
/**
 * Read this package's version for envelope provenance (best-effort).
 *
 * Walks upward from the directory containing this module and returns the
 * `version` of the nearest `package.json` that declares one as a string.
 * A fixed relative path is not used because the module's depth below the
 * package root differs between the source tree and a bundled build. The walk
 * is bounded to four directories so an unrelated `package.json` far above the
 * install location is not picked up.
 *
 * @returns The version string, or `undefined` when no suitable `package.json` is found or readable.
 */
async function readFrameworkVersion() {
	let dir;
	try {
		dir = dirname(fileURLToPath(import.meta.url));
	} catch {
		return;
	}
	for (let depth = 0; depth < 4; depth++) {
		try {
			const pkg = JSON.parse(await readFile(join(dir, "package.json"), "utf8"));
			if (pkg && typeof pkg.version === "string") return pkg.version;
		} catch {
			// Missing or unparsable package.json at this level: keep walking up.
		}
		const parent = dirname(dir);
		if (parent === dir) break;
		dir = parent;
	}
	return;
}
|
|
450
|
+
/**
 * Resolve a CLI path override against the suite directory.
 *
 * @param value    Path given on the command line; may be absent.
 * @param suiteDir Directory that relative paths are joined onto.
 * @returns `undefined` for a missing/empty value; the value unchanged when it is
 *          absolute or starts with `~/`; otherwise `join(suiteDir, value)`.
 */
function resolveOverridePath(value, suiteDir) {
	if (!value) return void 0;
	if (isAbsolute(value)) return value;
	if (value.startsWith("~/")) return value;
	return join(suiteDir, value);
}
|
|
455
|
+
/**
 * Execute `harness-eval pipeline`: load the suite document and hand its
 * `pipeline` block, plus any CLI overrides, to `runPipeline`.
 *
 * Path overrides (`--output`, `--report`, `--grading`, `--grading-output`,
 * `--envelope-output`) are resolved relative to the suite directory unless
 * absolute or starting with `~/`.
 *
 * @param args Parsed CLI arguments (`positional[0]` is the suite path, `options` the flags).
 * @returns The exit code reported by `runPipeline`, or 2 on usage, load, or pipeline errors.
 */
async function pipelineCommand(args) {
	const suitePath = args.positional[0];
	if (!suitePath) {
		console.error("usage: harness-eval pipeline <suite.yaml|dir> [--steps run,grade,envelope] [--output path] [--grading path] [--grading-output path] [--envelope-output path] [--report path] [--projection envelope|trajectory|instances] [--max-concurrent N] [--progress default|quiet|verbose|json]");
		return 2;
	}
	let doc;
	try {
		doc = await loadSuiteDocument(suitePath);
	} catch (err) {
		console.error(err instanceof Error ? err.message : String(err));
		return 2;
	}
	if (!doc.pipeline) {
		console.error("suite.yaml has no pipeline block; use run, grade, and envelope commands separately");
		return 2;
	}
	const suiteDir = suiteDirectoryFromPath(doc.suitePath);
	const steps = getOption(args.options, "steps");
	const maxConcurrent = getOptionInt(args.options, "max-concurrent", 4);
	const progressMode = resolveProgressMode(args.options);
	const useProgressColor = progressMode !== "json" && resolveProgressColor(args.options);
	// Only parse the projection when the flag was actually given, so that any
	// default the parser may return for a missing value is never recorded as an
	// explicit override of the suite's own envelope settings.
	const projectionFlag = getOption(args.options, "projection");
	const projection = projectionFlag ? parseEnvelopeProjection(projectionFlag) : void 0;
	if (projectionFlag && !projection) {
		console.error("invalid --projection; expected envelope, trajectory, or instances");
		return 2;
	}
	const overrides = {};
	const runOutput = getOption(args.options, "output");
	// NOTE(review): maxConcurrent is placed into the run override only when
	// --output is present; it is also passed to runPipeline directly below.
	// Confirm against runPipeline that this asymmetry is intended.
	if (runOutput) overrides.run = {
		output: resolveOverridePath(runOutput, suiteDir),
		maxConcurrent
	};
	const reportOverride = getOption(args.options, "report");
	if (reportOverride) {
		// A single --report feeds both the grade step input and the envelope step.
		const reportPath = resolveOverridePath(reportOverride, suiteDir);
		overrides.grade = {
			...overrides.grade,
			input: reportPath
		};
		overrides.envelope = {
			...overrides.envelope,
			report: reportPath
		};
	}
	const gradingOutput = getOption(args.options, "grading-output");
	if (gradingOutput) overrides.grade = {
		...overrides.grade,
		output: resolveOverridePath(gradingOutput, suiteDir)
	};
	const gradingInput = getOption(args.options, "grading");
	if (gradingInput) overrides.envelope = {
		...overrides.envelope,
		grading: resolveOverridePath(gradingInput, suiteDir)
	};
	const envelopeOutput = getOption(args.options, "envelope-output");
	if (envelopeOutput) overrides.envelope = {
		...overrides.envelope,
		output: resolveOverridePath(envelopeOutput, suiteDir)
	};
	if (projection) overrides.envelope = {
		...overrides.envelope,
		projection
	};
	if (doc.pipeline.grade && !doc.judge) {
		console.error("pipeline grade step requires inline judge: block in suite.yaml");
		return 2;
	}
	const frameworkVersion = await readFrameworkVersion();
	try {
		const result = await runPipeline(doc, {
			steps,
			maxConcurrent,
			overrides,
			frameworkVersion,
			onRunProgress: createRunProgressHandler({
				mode: progressMode,
				maxConcurrent,
				color: useProgressColor
			}),
			onGradeProgress: createGradeProgressHandler({
				mode: progressMode,
				maxConcurrent,
				color: useProgressColor
			})
		});
		return result.exitCode;
	} catch (err) {
		console.error(err instanceof Error ? err.message : String(err));
		return 2;
	}
}
|
|
550
|
+
//#endregion
|
|
492
551
|
//#region src/cli/commands/otel-output.ts
|
|
493
552
|
/**
|
|
494
553
|
* Write OTLP JSON artifacts from a suite report.
|
|
495
554
|
*/
|
|
555
|
+
/**
 * Make a label safe to embed in a filename.
 *
 * @param value Label text.
 * @returns `value` with every run of characters outside `a-z A-Z 0-9 . _ -` collapsed to a single `_`.
 */
function safeFilePart(value) {
	const unsafeRun = /[^a-zA-Z0-9._-]+/g;
	return value.replace(unsafeRun, "_");
}
|
|
@@ -521,6 +581,11 @@ async function writeOtelArtifacts(suite, report, outputDir) {
|
|
|
521
581
|
/**
|
|
522
582
|
* `harness-eval run` command.
|
|
523
583
|
*/
|
|
584
|
+
/**
|
|
585
|
+
* Execute `harness-eval run`: load suite, run repetitions, format report.
|
|
586
|
+
*
|
|
587
|
+
* @returns 0 when all cells pass thresholds, 1 on assertion failure, 2 on usage/load errors.
|
|
588
|
+
*/
|
|
524
589
|
async function runCommand(args) {
|
|
525
590
|
const suitePath = args.positional[0];
|
|
526
591
|
if (!suitePath) {
|
|
@@ -575,14 +640,17 @@ async function runCommand(args) {
|
|
|
575
640
|
//#endregion
|
|
576
641
|
//#region src/cli/main.ts
|
|
577
642
|
/**
|
|
578
|
-
* CLI entry point
|
|
643
|
+
* CLI entry point — dispatches subcommands and prints usage on `--help`.
|
|
644
|
+
*
|
|
645
|
+
* Exit codes: 0 success, 1 eval/grade failure, 2 usage or load errors.
|
|
579
646
|
*/
|
|
580
647
|
const USAGE = `harness-eval — harness-level eval framework
|
|
581
648
|
|
|
582
649
|
Usage:
|
|
583
650
|
harness-eval run <suite.yaml> [--max-concurrent N] [--baseline path] [--output path] [--otel-output dir] [--format console|markdown|json] [--adapter id] [--quiet] [--verbose] [--progress default|quiet|verbose|json]
|
|
584
|
-
harness-eval grade <report.json> [--config grading.yaml] [--expectations path] [--output path] [--model id] [--timeout-ms N] [--max-concurrent N] [--format console|json] [--quiet] [--verbose] [--progress default|quiet|verbose|json]
|
|
585
|
-
harness-eval envelope <report.json> [--output path] [--grading path] [--suite path] [--projection envelope|trajectory|instances
|
|
651
|
+
harness-eval grade <report.json> [--config grading.yaml] [--suite suite.yaml] [--expectations path] [--output path] [--model id] [--timeout-ms N] [--max-concurrent N] [--format console|json] [--quiet] [--verbose] [--progress default|quiet|verbose|json]
|
|
652
|
+
harness-eval envelope <report.json> [--output path] [--grading path] [--suite path] [--projection envelope|trajectory|instances] [--include-raw-stream-events] [--no-transcript]
|
|
653
|
+
harness-eval pipeline <suite.yaml|dir> [--steps run,grade,envelope] [--output path] [--grading path] [--grading-output path] [--envelope-output path] [--report path] [--projection envelope|trajectory|instances] [--max-concurrent N] [--progress default|quiet|verbose|json]
|
|
586
654
|
harness-eval format <report.json> [--format console|markdown|json] [--baseline path]
|
|
587
655
|
harness-eval --help
|
|
588
656
|
|
|
@@ -594,6 +662,11 @@ Usage:
|
|
|
594
662
|
--no-color disable ANSI colors on progress output
|
|
595
663
|
--color force ANSI colors on progress output
|
|
596
664
|
`;
|
|
665
|
+
/**
|
|
666
|
+
* Route argv to the appropriate subcommand handler.
|
|
667
|
+
*
|
|
668
|
+
* @returns Process exit code (0 = success, 1 = eval failure, 2 = usage error).
|
|
669
|
+
*/
|
|
597
670
|
async function main(argv) {
|
|
598
671
|
const parsed = parseArgs(argv);
|
|
599
672
|
if (parsed.options.help || parsed.command === "help" || parsed.options.h) {
|
|
@@ -604,6 +677,7 @@ async function main(argv) {
|
|
|
604
677
|
case "run": return await runCommand(parsed);
|
|
605
678
|
case "grade": return await gradeCommand(parsed);
|
|
606
679
|
case "envelope": return await envelopeCommand(parsed);
|
|
680
|
+
case "pipeline": return await pipelineCommand(parsed);
|
|
607
681
|
case "format": return await formatCommand(parsed);
|
|
608
682
|
case void 0:
|
|
609
683
|
console.error(USAGE);
|
|
@@ -615,6 +689,9 @@ async function main(argv) {
|
|
|
615
689
|
}
|
|
616
690
|
//#endregion
|
|
617
691
|
//#region src/cli/bin.ts
|
|
692
|
+
/**
 * CLI executable entry point: run {@link main} on the user-supplied arguments
 * (everything after the node binary and script path) and exit with its code.
 */
const cliArgs = process.argv.slice(2);
const code = await main(cliArgs);
process.exit(code);
|
|
620
697
|
//#endregion
|