@zhijiewang/openharness 2.38.0 → 2.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,94 @@
1
+ /**
2
+ * oh evals — run writer.
3
+ *
4
+ * Streams per-task results to disk atomically:
5
+ * - results.jsonl : append-only, one EvalsResult per line
6
+ * - predictions.json: array, rewritten on each append, SWE-bench-submittable
7
+ * - results.json : merged + aggregates, written ONLY by finalize()
8
+ *
9
+ * Crash-safety: results.jsonl + predictions.json are valid up to the last
10
+ * successful append. `oh evals run --resume <run_id>` reads results.jsonl
11
+ * to determine completed instance_ids.
12
+ */
13
+ import { appendFileSync, existsSync, mkdirSync, readFileSync, renameSync, writeFileSync } from "node:fs";
14
+ import { join } from "node:path";
15
/**
 * Writes the on-disk artifacts of one eval run into `runDir`:
 *   - results.jsonl    one JSON-encoded result per line, appended per task
 *   - predictions.json array of { instance_id, model_patch, model_name_or_path },
 *                      rewritten (tmp file + rename) after every append
 *   - results.json     header fields + aggregates + results, written by finalize()
 *
 * NOTE(review): `results` only holds what was appended through THIS instance.
 * predictions.json and results.json are built from it, so a resumed run that
 * needs earlier results in those files must seed `results` itself (e.g. from
 * loadExistingResults()) — confirm against the caller.
 */
export class RunWriter {
    /** Directory that receives every artifact of this run. */
    runDir;
    /** Run-level metadata copied into predictions.json and results.json. */
    header;
    /** Results appended through this instance, in append order. */
    results = [];
    /** Set once results.jsonl is known to end on a line boundary. */
    tailVerified = false;
    /**
     * @param runDir directory for the run; created (with `transcripts/`) if missing
     * @param header run metadata (run_id, pack, pack_version, model, ...)
     */
    constructor(runDir, header) {
        this.runDir = runDir;
        this.header = header;
        mkdirSync(runDir, { recursive: true });
        mkdirSync(join(runDir, "transcripts"), { recursive: true });
    }
    /**
     * Persist one task result: append it to results.jsonl, then rewrite
     * predictions.json from every result appended so far.
     */
    appendResult(result) {
        const jsonlPath = join(this.runDir, "results.jsonl");
        let record = `${JSON.stringify(result)}\n`;
        if (!this.tailVerified) {
            // A previous process may have died mid-append and left a torn line
            // without a trailing newline. Start on a fresh line so the new
            // record is not fused onto that fragment. Checked once per instance.
            if (existsSync(jsonlPath)) {
                const existing = readFileSync(jsonlPath, "utf-8");
                if (existing.length > 0 && !existing.endsWith("\n"))
                    record = `\n${record}`;
            }
            this.tailVerified = true;
        }
        appendFileSync(jsonlPath, record);
        // Only track the result in memory once it is on disk, so a failed write
        // cannot leave memory ahead of results.jsonl.
        this.results.push(result);
        // predictions.json — write a sibling tmp file, then rename over the target
        // so readers never observe a half-written array.
        const predictions = this.results.map((r) => ({
            instance_id: r.instance_id,
            model_patch: r.model_patch,
            model_name_or_path: this.header.model,
        }));
        const tmpPath = join(this.runDir, "predictions.json.tmp");
        writeFileSync(tmpPath, JSON.stringify(predictions, null, 2));
        renameSync(tmpPath, join(this.runDir, "predictions.json"));
    }
    /**
     * Read back every result recorded in results.jsonl.
     *
     * Lines that are not valid JSON are skipped rather than thrown on: the only
     * writer appends whole JSON lines, so an unparseable line is a write that
     * was interrupted. Skipping it means the affected task is simply not
     * reported as completed.
     *
     * @returns parsed results in file order; `[]` when the file does not exist
     */
    loadExistingResults() {
        const jsonlPath = join(this.runDir, "results.jsonl");
        if (!existsSync(jsonlPath))
            return [];
        const parsed = [];
        for (const line of readFileSync(jsonlPath, "utf-8").split("\n")) {
            if (line.trim().length === 0)
                continue;
            try {
                parsed.push(JSON.parse(line));
            }
            catch {
                // Torn line from an interrupted append — ignore it.
            }
        }
        return parsed;
    }
    /**
     * Aggregate the appended results and write results.json (tmp file + rename).
     *
     * pass_rate = resolved / (resolved + failed + error + timeout); results with
     * status budget_exceeded or skipped are excluded from the denominator, and
     * the rate is 0 when the denominator is 0.
     *
     * @param opts.finished_at value stored as `finished_at`
     * @param opts.partial     value stored as `partial`
     * @returns the object that was written to results.json
     */
    finalize(opts) {
        const counts = {
            resolved: 0,
            failed: 0,
            error: 0,
            timeout: 0,
            budget_exceeded: 0,
            skipped: 0,
        };
        let totalCost = 0;
        let totalDuration = 0;
        for (const r of this.results) {
            counts[r.status]++;
            totalCost += r.cost_usd;
            totalDuration += r.duration_ms;
        }
        const attempted = counts.resolved + counts.failed + counts.error + counts.timeout;
        const passRate = attempted === 0 ? 0 : counts.resolved / attempted;
        const artifacts = {
            run_id: this.header.run_id,
            pack: this.header.pack,
            pack_version: this.header.pack_version,
            model: this.header.model,
            harness_version: this.header.harness_version,
            started_at: this.header.started_at,
            finished_at: opts.finished_at,
            total_cost_usd: totalCost,
            max_cost_usd: this.header.max_cost_usd,
            total_duration_ms: totalDuration,
            resolved: counts.resolved,
            failed: counts.failed,
            error: counts.error,
            timeout: counts.timeout,
            budget_exceeded: counts.budget_exceeded,
            skipped: counts.skipped,
            pass_rate: passRate,
            partial: opts.partial,
            results: [...this.results],
        };
        const tmpPath = join(this.runDir, "results.json.tmp");
        writeFileSync(tmpPath, JSON.stringify(artifacts, null, 2));
        renameSync(tmpPath, join(this.runDir, "results.json"));
        return artifacts;
    }
}
94
+ //# sourceMappingURL=run-writer.js.map
@@ -0,0 +1,34 @@
1
+ /**
2
+ * oh evals — scorer.
3
+ *
4
+ * After the agent runs, we score the task by:
5
+ * (1) Running an oracle script (oracle.sh / oracle.mjs) if one exists in
6
+ * the fixture dir — exit 0 = pass.
7
+ * (2) Else running the pack's default test command and parsing the
8
+ * junit-xml output for FAIL_TO_PASS / PASS_TO_PASS test IDs.
9
+ *
10
+ * Test ID convention matches SWE-bench: "<classname>.<name>".
11
+ */
12
+ import type { EvalsTask, TestsStatus } from "./types.js";
13
+ export type TestOutcome = "pass" | "fail" | "skip";
14
+ /**
15
+ * Minimal junit-xml parser. Returns a map of "<classname>.<name>" → outcome.
16
+ *
17
+ * We don't take a full XML parser dependency; pytest's junit-xml is
18
+ * well-formed and simple enough to extract testcase elements with regex.
19
+ */
20
+ export declare function parseJunitXml(xml: string): Record<string, TestOutcome>;
21
+ export type ScoreResult = {
22
+ resolved: boolean;
23
+ tests_status: TestsStatus;
24
+ oracle_used: boolean;
25
+ error_message?: string;
26
+ };
27
+ export declare function scoreTask(args: {
28
+ task: EvalsTask;
29
+ worktreeDir: string;
30
+ fixtureDir: string;
31
+ packDefaultTestCommand: string;
32
+ testTimeoutMs: number;
33
+ }): Promise<ScoreResult>;
34
+ //# sourceMappingURL=scorer.d.ts.map
@@ -0,0 +1,127 @@
1
+ /**
2
+ * oh evals — scorer.
3
+ *
4
+ * After the agent runs, we score the task by:
5
+ * (1) Running an oracle script (oracle.sh / oracle.mjs) if one exists in
6
+ * the fixture dir — exit 0 = pass.
7
+ * (2) Else running the pack's default test command and parsing the
8
+ * junit-xml output for FAIL_TO_PASS / PASS_TO_PASS test IDs.
9
+ *
10
+ * Test ID convention matches SWE-bench: "<classname>.<name>".
11
+ */
12
+ import { spawnSync } from "node:child_process";
13
+ import { existsSync, readFileSync } from "node:fs";
14
+ import { join } from "node:path";
15
/** The five predefined XML entities. */
const XML_NAMED_ENTITIES = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
]);
/** Matches one `<testcase …/>` or `<testcase …>…</testcase>` element. */
const TESTCASE_RE = /<testcase\b([^>]*?)(?:\/>|>([\s\S]*?)<\/testcase>)/g;
/** Attribute extractors; each accepts a double- or single-quoted value. */
const CLASSNAME_ATTR_RE = /(?:^|\s)classname\s*=\s*(?:"([^"]*)"|'([^']*)')/;
const NAME_ATTR_RE = /(?:^|\s)name\s*=\s*(?:"([^"]*)"|'([^']*)')/;
/**
 * Replace predefined (`&lt;`) and numeric (`&#60;`, `&#x3c;`) character
 * references with the characters they stand for. Unknown or out-of-range
 * references are left untouched.
 */
function decodeXmlEntities(text) {
    return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|[A-Za-z]+);/g, (whole, body) => {
        if (body[0] !== "#")
            return XML_NAMED_ENTITIES.get(body) ?? whole;
        const code = body[1] === "x" ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
        return Number.isInteger(code) && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    });
}
/** Read and entity-decode one attribute value; `undefined` when it is absent. */
function readXmlAttr(attrs, re) {
    const m = re.exec(attrs);
    if (m === null)
        return undefined;
    return decodeXmlEntities(m[1] ?? m[2] ?? "");
}
/**
 * Minimal junit-xml parser. Returns a map of "<classname>.<name>" → outcome.
 *
 * We don't take a full XML parser dependency; testcase elements are extracted
 * with a regex. Attribute values are entity-decoded, so a test serialized as
 * `name="test_cmp[a&lt;b]"` is keyed as `test_cmp[a<b]`.
 *
 * Outcome rules, in order: a `<failure>` or `<error>` child → "fail"; a
 * `<skipped>` child → "skip"; otherwise "pass". Testcases lacking a non-empty
 * classname or name are ignored. If an id occurs more than once, the last
 * occurrence wins.
 *
 * @param xml junit-xml document text
 * @returns outcome per test id
 */
export function parseJunitXml(xml) {
    const out = {};
    for (const match of xml.matchAll(TESTCASE_RE)) {
        const attrs = match[1];
        const inner = match[2] ?? "";
        const cn = readXmlAttr(attrs, CLASSNAME_ATTR_RE);
        const name = readXmlAttr(attrs, NAME_ATTR_RE);
        if (!cn || !name)
            continue;
        const id = `${cn}.${name}`;
        if (/<failure\b/.test(inner) || /<error\b/.test(inner)) {
            out[id] = "fail";
        }
        else if (/<skipped\b/.test(inner)) {
            out[id] = "skip";
        }
        else {
            out[id] = "pass";
        }
    }
    return out;
}
46
/** Template for a result with no per-test data. Always clone before returning it. */
const EMPTY_TESTS_STATUS = {
    FAIL_TO_PASS: { success: [], failure: [] },
    PASS_TO_PASS: { success: [], failure: [] },
};
/**
 * Describe why a spawnSync call did not succeed: the last 500 characters of
 * stderr, preceded by the spawn-level error message (e.g. the command could
 * not be started, or it hit the timeout) when spawnSync reported one.
 */
function describeSpawnFailure(r) {
    const stderrTail = r.stderr?.toString().slice(-500) ?? "";
    if (!r.error)
        return stderrTail;
    return stderrTail.length > 0 ? `${r.error.message}\n${stderrTail}` : r.error.message;
}
/**
 * Run an oracle program in the worktree and turn its exit status into a
 * ScoreResult. INSTANCE_ID, WORKTREE_DIR and FIXTURE_DIR are added to the
 * inherited environment. Exit status 0 means resolved.
 */
function runOracle(command, argv, args, extraOptions) {
    const { task, worktreeDir, fixtureDir, testTimeoutMs } = args;
    const r = spawnSync(command, argv, {
        cwd: worktreeDir,
        env: {
            ...process.env,
            INSTANCE_ID: task.instance_id,
            WORKTREE_DIR: worktreeDir,
            FIXTURE_DIR: fixtureDir,
        },
        timeout: testTimeoutMs,
        ...extraOptions,
    });
    const passed = r.status === 0;
    return {
        resolved: passed,
        // Fresh copy: callers may mutate the returned arrays.
        tests_status: structuredClone(EMPTY_TESTS_STATUS),
        oracle_used: true,
        error_message: passed ? undefined : describeSpawnFailure(r),
    };
}
/** Split test ids into those whose recorded outcome is "pass" and all others. */
function partitionByOutcome(ids, outcomes) {
    const bucket = { success: [], failure: [] };
    for (const id of ids) {
        // A missing id (test never ran) counts as a failure.
        (outcomes[id] === "pass" ? bucket.success : bucket.failure).push(id);
    }
    return bucket;
}
/**
 * Score one task after the agent has run.
 *
 * Order of precedence:
 *   1. `oracle.sh` in the fixture dir — executed directly (through a shell on win32).
 *   2. `oracle.mjs` in the fixture dir — executed with the current Node binary.
 *   3. The pack's default test command, run through a shell in the worktree;
 *      `.oh-evals-results.xml` in the worktree is then parsed and every
 *      FAIL_TO_PASS / PASS_TO_PASS id must have outcome "pass".
 *
 * @param args.task                   task being scored (instance_id, FAIL_TO_PASS, PASS_TO_PASS)
 * @param args.worktreeDir            working directory for every spawned process
 * @param args.fixtureDir             directory searched for oracle scripts
 * @param args.packDefaultTestCommand shell command used when no oracle exists
 * @param args.testTimeoutMs          timeout passed to each spawned process
 * @returns resolved flag, per-test status, whether an oracle was used, and an
 *          error message when the oracle failed or no junit-xml was produced
 */
export async function scoreTask(args) {
    const { task, worktreeDir, fixtureDir, packDefaultTestCommand, testTimeoutMs } = args;
    // (1) Oracle escape hatch.
    const oracleSh = join(fixtureDir, "oracle.sh");
    if (existsSync(oracleSh)) {
        return runOracle(oracleSh, [], args, { shell: process.platform === "win32" });
    }
    const oracleMjs = join(fixtureDir, "oracle.mjs");
    if (existsSync(oracleMjs)) {
        return runOracle(process.execPath, [oracleMjs], args, {});
    }
    // (2) Default test command.
    const r = spawnSync(packDefaultTestCommand, {
        cwd: worktreeDir,
        shell: true,
        timeout: testTimeoutMs,
    });
    const xmlPath = join(worktreeDir, ".oh-evals-results.xml");
    if (!existsSync(xmlPath)) {
        return {
            resolved: false,
            tests_status: structuredClone(EMPTY_TESTS_STATUS),
            oracle_used: false,
            error_message: `junit-xml not produced at ${xmlPath} (test command exit ${r.status}). stderr: ${describeSpawnFailure(r)}`,
        };
    }
    const outcomes = parseJunitXml(readFileSync(xmlPath, "utf-8"));
    const tests_status = {
        FAIL_TO_PASS: partitionByOutcome(task.FAIL_TO_PASS, outcomes),
        PASS_TO_PASS: partitionByOutcome(task.PASS_TO_PASS, outcomes),
    };
    const resolved = tests_status.FAIL_TO_PASS.failure.length === 0 && tests_status.PASS_TO_PASS.failure.length === 0;
    return { resolved, tests_status, oracle_used: false };
}
127
+ //# sourceMappingURL=scorer.js.map
@@ -0,0 +1,74 @@
1
+ /**
2
+ * oh evals — type definitions for the eval harness.
3
+ *
4
+ * Schema mirrors SWE-bench's evaluation contract so packs of cherry-picked
5
+ * SWE-bench Lite instances drop in unmodified. Our `EvalsResult` is a
6
+ * superset of SWE-bench's `results.json` per-instance shape, with cost,
7
+ * turns, duration, and transcript-path enrichments.
8
+ */
9
+ export type EvalsTask = {
10
+ instance_id: string;
11
+ repo: string;
12
+ base_commit: string;
13
+ problem_statement: string;
14
+ FAIL_TO_PASS: string[];
15
+ PASS_TO_PASS: string[];
16
+ hints_text?: string;
17
+ };
18
+ export type EvalsPack = {
19
+ name: string;
20
+ version: string;
21
+ description: string;
22
+ language: "python" | "javascript" | "typescript" | "polyglot";
23
+ runner_requirements: string[];
24
+ default_test_command: string;
25
+ instance_count: number;
26
+ compatible_with?: string;
27
+ };
28
+ export type EvalsStatus = "resolved" | "failed" | "error" | "timeout" | "budget_exceeded" | "skipped";
29
+ export type TestsStatus = {
30
+ FAIL_TO_PASS: {
31
+ success: string[];
32
+ failure: string[];
33
+ };
34
+ PASS_TO_PASS: {
35
+ success: string[];
36
+ failure: string[];
37
+ };
38
+ };
39
+ export type EvalsResult = {
40
+ instance_id: string;
41
+ status: EvalsStatus;
42
+ resolved: boolean;
43
+ cost_usd: number;
44
+ turns_used: number;
45
+ duration_ms: number;
46
+ model_patch: string;
47
+ tests_status: TestsStatus;
48
+ transcript_path: string;
49
+ error_message?: string;
50
+ started_at: string;
51
+ finished_at: string;
52
+ };
53
+ export type RunArtifacts = {
54
+ run_id: string;
55
+ pack: string;
56
+ pack_version: string;
57
+ model: string;
58
+ harness_version: string;
59
+ started_at: string;
60
+ finished_at: string;
61
+ total_cost_usd: number;
62
+ max_cost_usd: number;
63
+ total_duration_ms: number;
64
+ resolved: number;
65
+ failed: number;
66
+ error: number;
67
+ timeout: number;
68
+ budget_exceeded: number;
69
+ skipped: number;
70
+ pass_rate: number;
71
+ partial: boolean;
72
+ results: EvalsResult[];
73
+ };
74
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1,10 @@
1
+ /**
2
+ * oh evals — type definitions for the eval harness.
3
+ *
4
+ * Schema mirrors SWE-bench's evaluation contract so packs of cherry-picked
5
+ * SWE-bench Lite instances drop in unmodified. Our `EvalsResult` is a
6
+ * superset of SWE-bench's `results.json` per-instance shape, with cost,
7
+ * turns, duration, and transcript-path enrichments.
8
+ */
9
+ export {};
10
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Sandbox — filesystem and network restrictions for tool execution.
3
+ *
4
+ * Limits what tools can access:
5
+ * - File tools: only write to allowed paths
6
+ * - Web tools: only access allowed domains
7
+ * - Bash: restricted commands (no curl/wget by default)
8
+ *
9
+ * Reduces permission prompts while maintaining security.
10
+ */
11
+ export type SandboxConfig = {
12
+ enabled: boolean;
13
+ /** Paths tools can write to (glob-style, relative to cwd) */
14
+ allowedPaths: string[];
15
+ /** Domains WebFetch/WebSearch can access */
16
+ allowedDomains: string[];
17
+ /** Block all network access */
18
+ blockNetwork: boolean;
19
+ /** Commands blocked in Bash (default: curl, wget) */
20
+ blockedCommands: string[];
21
+ };
22
+ /** Get the current sandbox config */
23
+ export declare function getSandboxConfig(): SandboxConfig;
24
+ /** Reset cached config */
25
+ export declare function invalidateSandboxCache(): void;
26
+ /** Check if a file path is allowed for writing */
27
+ export declare function isPathAllowed(filePath: string): boolean;
28
+ /** Check if a domain is allowed for network access */
29
+ export declare function isDomainAllowed(url: string): boolean;
30
+ /** Check if a bash command is allowed */
31
+ export declare function isCommandAllowed(command: string): boolean;
32
+ /** Get a human-readable sandbox status */
33
+ export declare function sandboxStatus(): string;
34
+ //# sourceMappingURL=sandbox.d.ts.map
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Sandbox — filesystem and network restrictions for tool execution.
3
+ *
4
+ * Limits what tools can access:
5
+ * - File tools: only write to allowed paths
6
+ * - Web tools: only access allowed domains
7
+ * - Bash: restricted commands (no curl/wget by default)
8
+ *
9
+ * Reduces permission prompts while maintaining security.
10
+ */
11
+ import { relative, resolve } from "node:path";
12
+ import { readOhConfig } from "./config.js";
13
/** Built-in defaults. Treated as a read-only template; never handed out directly. */
const DEFAULT_SANDBOX = {
    enabled: false,
    allowedPaths: ["."], // current directory
    allowedDomains: [], // empty = all allowed
    blockNetwork: false,
    blockedCommands: ["curl", "wget"],
};
// ── Sandbox Manager ──
/** Cached effective config; null until first use or after invalidation. */
let _config = null;
/**
 * Build an effective config: a fresh copy of the defaults (arrays included)
 * with the user's overrides applied on top. Overrides whose value is
 * `undefined` or `null` are ignored so they cannot erase a default that the
 * rest of this module iterates over.
 */
function mergeSandboxConfig(overrides) {
    const merged = {
        ...DEFAULT_SANDBOX,
        allowedPaths: [...DEFAULT_SANDBOX.allowedPaths],
        allowedDomains: [...DEFAULT_SANDBOX.allowedDomains],
        blockedCommands: [...DEFAULT_SANDBOX.blockedCommands],
    };
    for (const [key, value] of Object.entries(overrides ?? {})) {
        if (value !== undefined && value !== null)
            merged[key] = value;
    }
    return merged;
}
/**
 * Get the current sandbox config.
 *
 * The first call reads the `sandbox` section of the oh config and merges it
 * over the defaults; the result is cached until invalidateSandboxCache().
 */
export function getSandboxConfig() {
    if (_config)
        return _config;
    _config = mergeSandboxConfig(readOhConfig()?.sandbox);
    return _config;
}
/** Reset cached config so the next getSandboxConfig() re-reads it. */
export function invalidateSandboxCache() {
    _config = null;
}
42
/**
 * Decide whether a `path.relative(base, target)` result points outside `base`.
 *
 * A result escapes when its first segment is exactly `..`, or when it is an
 * absolute path — which is what `relative` yields on Windows when the two
 * paths share no common root (different drive letters or UNC shares).
 * Names that merely begin with two dots (`..cache/x`) stay inside.
 *
 * @param rel      result of `path.relative`
 * @param platform platform whose separator rules apply (defaults to the host)
 */
function escapesBaseDir(rel, platform = process.platform) {
    const isWindows = platform === "win32";
    const separator = isWindows ? "\\" : "/";
    if (rel === ".." || rel.startsWith(`..${separator}`))
        return true;
    if (rel.startsWith("/"))
        return true;
    // Drive-qualified ("D:\…") or UNC ("\\server\…") result on Windows.
    return isWindows && /^(?:[A-Za-z]:|\\\\)/.test(rel);
}
/**
 * Check if a file path is allowed for writing.
 *
 * Always true while the sandbox is disabled. Otherwise the path (resolved
 * against the current working directory) must be one of the configured
 * `allowedPaths` directories or lie beneath one.
 *
 * NOTE(review): the comparison is purely lexical; symlinks are not resolved.
 */
export function isPathAllowed(filePath) {
    const config = getSandboxConfig();
    if (!config.enabled)
        return true;
    const target = resolve(filePath);
    const cwd = process.cwd();
    return config.allowedPaths.some((allowed) => !escapesBaseDir(relative(resolve(cwd, allowed), target)));
}
58
/**
 * Test a hostname against an allowlist, case-insensitively.
 *
 *   - `example.com`   matches example.com and every subdomain of it
 *   - `*.example.com` / `.example.com` match subdomains only
 *   - blank entries match nothing
 *
 * A single trailing root dot on the hostname (`example.com.`) is ignored.
 */
function hostMatchesAllowlist(hostname, allowedDomains) {
    const host = hostname.toLowerCase().replace(/\.$/, "");
    return allowedDomains.some((entry) => {
        const raw = String(entry).trim().toLowerCase();
        const subdomainsOnly = raw.startsWith("*.") || raw.startsWith(".");
        const domain = raw.replace(/^\*?\./, "");
        if (domain.length === 0)
            return false;
        if (host.endsWith(`.${domain}`))
            return true;
        return !subdomainsOnly && host === domain;
    });
}
/**
 * Check if a domain is allowed for network access.
 *
 * Returns true while the sandbox is disabled, false when `blockNetwork` is
 * set, and true when `allowedDomains` is empty (no restriction). Otherwise the
 * URL's hostname must match the allowlist; a string that does not parse as a
 * URL is rejected.
 */
export function isDomainAllowed(url) {
    const config = getSandboxConfig();
    if (!config.enabled)
        return true;
    if (config.blockNetwork)
        return false;
    if (config.allowedDomains.length === 0)
        return true;
    let hostname;
    try {
        hostname = new URL(url).hostname;
    }
    catch {
        return false;
    }
    return hostMatchesAllowlist(hostname, config.allowedDomains);
}
75
/**
 * List the program names a shell command line would start, lower-cased.
 *
 * The line is split at `;`, `|`, `&`, newlines, `(` and backticks (which also
 * covers `&&`, `||` and `$(`), leading `VAR=value` assignments are skipped,
 * and any directory prefix is dropped so `/usr/bin/curl` is reported as `curl`.
 *
 * This is a lexical best-effort scan, not a shell parser: separators inside
 * quotes are still treated as separators, and programs launched through a
 * wrapper (`sudo`, `env`, `xargs`, `bash -c "…"`) are not seen.
 */
function invokedCommandNames(command) {
    const names = [];
    for (const segment of command.split(/[;|&\n(`]/)) {
        const words = segment.trim().split(/\s+/).filter((w) => w.length > 0);
        const program = words.find((w) => !/^[A-Za-z_][A-Za-z0-9_]*=/.test(w));
        if (program === undefined)
            continue;
        const base = program.split(/[\\/]/).pop() ?? "";
        if (base.length > 0)
            names.push(base.toLowerCase());
    }
    return names;
}
/**
 * Check if a bash command is allowed.
 *
 * Always true while the sandbox is disabled. Otherwise false as soon as any
 * program started by the command line (see invokedCommandNames) is listed in
 * `blockedCommands`; the comparison is case-insensitive.
 */
export function isCommandAllowed(command) {
    const config = getSandboxConfig();
    if (!config.enabled)
        return true;
    const blocked = new Set(config.blockedCommands.map((name) => String(name).toLowerCase()));
    return !invokedCommandNames(command).some((name) => blocked.has(name));
}
83
/**
 * Get a human-readable sandbox status.
 *
 * Returns the single line "Sandbox: disabled" when the sandbox is off.
 * Otherwise returns a newline-joined report: allowed paths, one network line
 * (blocked / allowed domains / unrestricted), and the blocked commands when
 * there are any.
 */
export function sandboxStatus() {
    const { enabled, allowedPaths, allowedDomains, blockNetwork, blockedCommands } = getSandboxConfig();
    if (!enabled)
        return "Sandbox: disabled";
    const pathList = allowedPaths.join(", ") || "none";
    let networkLine;
    if (blockNetwork) {
        networkLine = " Network: blocked";
    }
    else if (allowedDomains.length > 0) {
        networkLine = ` Allowed domains: ${allowedDomains.join(", ")}`;
    }
    else {
        networkLine = " Network: unrestricted";
    }
    const report = ["Sandbox: enabled", ` Allowed paths: ${pathList}`, networkLine];
    if (blockedCommands.length > 0) {
        report.push(` Blocked commands: ${blockedCommands.join(", ")}`);
    }
    return report.join("\n");
}
104
+ //# sourceMappingURL=sandbox.js.map
@@ -83,6 +83,31 @@ export declare function loadTrace(sessionId: string): TraceSpan[];
83
83
  export declare function listTracedSessions(): string[];
84
84
  /** Format trace for display */
85
85
  export declare function formatTrace(spans: TraceSpan[]): string;
86
+ /**
87
+ * Render spans as a flame-graph (icicle-graph really — top-down by depth).
88
+ * Each span gets one row: indent by tree depth, then a bar of `█` characters
89
+ * positioned along a wall-time axis sized to `width` columns. Bars start at
90
+ * the column corresponding to the span's `startTime` relative to the trace's
91
+ * minimum startTime, and span as many columns as their `durationMs` requires
92
+ * (minimum 1 column so even sub-millisecond spans are visible).
93
+ *
94
+ * Total trace duration sets the time-axis scale: a 5-second trace and a
95
+ * 50-second trace both fit the same `width`, so the same view works at any
96
+ * scale without scrolling. Per-span ms label appears to the right of the bar;
97
+ * span name appears at the left, indented by parent depth.
98
+ *
99
+ * Errored spans (status: "error") render in red; others use a stable
100
+ * per-name color so the same tool keeps the same color across the trace.
101
+ *
102
+ * The bottom row is a time ruler with ticks at 0ms, 25%, 50%, 75%, 100%.
103
+ *
104
+ * @param spans the spans to render — typically `loadTrace(sessionId)`
105
+ * @param width target width in columns (defaults to terminal width or 100)
106
+ * @param opts.color emit ANSI color codes (defaults to true; set false for tests)
107
+ */
108
+ export declare function formatFlameGraph(spans: TraceSpan[], width?: number, opts?: {
109
+ color?: boolean;
110
+ }): string;
86
111
  /** Export trace in OpenTelemetry-compatible format */
87
112
  export declare function exportTraceOTLP(sessionId: string, spans: TraceSpan[]): object;
88
113
  //# sourceMappingURL=traces.d.ts.map
@@ -220,6 +220,174 @@ export function formatTrace(spans) {
220
220
  lines.push(`Total: ${spans.length} spans, ${totalMs}ms, ${errors} errors`);
221
221
  return lines.join("\n");
222
222
  }
223
// ── Flame-graph rendering ──
/**
 * ANSI 256-color palette for span bars. A span's color is picked by hashing
 * its name (see colorForSpan), so an entry is not tied to a particular span name.
 */
const FLAME_COLORS = [
    "\x1b[38;5;202m", // orange
    "\x1b[38;5;39m", // light blue
    "\x1b[38;5;208m", // bright orange
    "\x1b[38;5;105m", // purple
    "\x1b[38;5;118m", // green
    "\x1b[38;5;226m", // yellow
    "\x1b[38;5;213m", // pink
    "\x1b[38;5;245m", // grey
];
const ANSI_RESET = "\x1b[0m";
const ANSI_DIM = "\x1b[2m";
const ANSI_RED = "\x1b[38;5;196m";
/** Map a span name to a palette entry; the same name always yields the same color. */
function colorForSpan(name) {
    // 31-multiplier string hash kept in unsigned 32-bit range.
    let hash = 0;
    for (let i = 0; i < name.length; i++)
        hash = (hash * 31 + name.charCodeAt(i)) >>> 0;
    return FLAME_COLORS[hash % FLAME_COLORS.length];
}
/**
 * Render spans as a flame-graph (icicle-graph really — top-down by depth).
 * Each span gets one row: the name indented by tree depth, then a bar of `█`
 * characters positioned along a wall-time axis, then the span's duration.
 * A bar starts at the column matching the span's `startTime` relative to the
 * earliest `startTime` in the trace and is as wide as its `durationMs`
 * requires — always at least 1 column, including for a span that starts at
 * the very end of the trace.
 *
 * The whole trace duration is scaled onto the bar canvas, so short and long
 * traces fit the same `width`. The canvas is `width` minus the name and
 * duration columns, but never narrower than 20 columns.
 *
 * Errored spans (status: "error") render in red; others use a stable
 * per-name color so the same name keeps the same color across the trace.
 *
 * After the rows come a time ruler (2, 3 or 5 evenly spaced ticks depending
 * on canvas width), a per-name breakdown of the 10 names with the largest
 * summed duration, and a one-line total.
 *
 * @param spans the spans to render — typically `loadTrace(sessionId)`
 * @param width target width in columns (defaults to terminal width or 100)
 * @param opts.color emit ANSI color codes (defaults to true; set false for tests)
 * @returns the rendered text, or "No trace spans recorded." for an empty list
 */
export function formatFlameGraph(spans, width = process.stdout.columns || 100, opts = { color: true }) {
    if (spans.length === 0)
        return "No trace spans recorded.";
    const useColor = opts.color !== false;
    const paint = (style, text) => (useColor ? `${style}${text}${ANSI_RESET}` : text);
    // Trace bounds — every other timestamp is relative to minStart.
    let minStart = Infinity;
    let maxEnd = 0;
    for (const s of spans) {
        if (s.startTime < minStart)
            minStart = s.startTime;
        if (s.endTime > maxEnd)
            maxEnd = s.endTime;
    }
    // Fall back to 1ms so the scale below never divides by zero.
    const totalMs = maxEnd > minStart ? maxEnd - minStart : 1;
    // Layout: name column gets up to 30 chars; ms label gets up to 10; the rest
    // is the bar canvas, with a floor of 20 columns.
    const NAME_WIDTH = 30;
    const MS_WIDTH = 10;
    const PADDING = 3; // spaces between sections
    const barWidth = Math.max(20, width - NAME_WIDTH - MS_WIDTH - PADDING);
    // Depth = number of ancestors reachable through parentSpanId. The walk is
    // memoized and capped at 50 hops so a malformed trace whose parent
    // references form a cycle cannot recurse forever.
    const byId = new Map(spans.map((s) => [s.spanId, s]));
    const depthOf = new Map();
    function depth(span, hops = 0) {
        if (hops > 50)
            return hops;
        if (depthOf.has(span.spanId))
            return depthOf.get(span.spanId);
        let d = 0;
        if (span.parentSpanId) {
            const parent = byId.get(span.parentSpanId);
            if (parent)
                d = depth(parent, hops + 1) + 1;
        }
        depthOf.set(span.spanId, d);
        return d;
    }
    for (const s of spans)
        depth(s);
    // Sort by start time, ties broken by depth (parents before children).
    const sorted = [...spans].sort((a, b) => a.startTime - b.startTime || depthOf.get(a.spanId) - depthOf.get(b.spanId));
    const lines = [];
    for (const span of sorted) {
        const d = depthOf.get(span.spanId);
        // Clamp to the last canvas column: a span starting exactly at the end
        // of the trace would otherwise land one column past the canvas and
        // draw nothing.
        const offset = Math.min(Math.floor(((span.startTime - minStart) / totalMs) * barWidth), barWidth - 1);
        const length = Math.max(1, Math.floor((span.durationMs / totalMs) * barWidth));
        const indent = " ".repeat(Math.min(d, 4)); // visual cap at 4 indent levels
        const name = `${indent}${span.name}`.padEnd(NAME_WIDTH).slice(0, NAME_WIDTH);
        const bar = " ".repeat(offset) + "█".repeat(Math.min(length, barWidth - offset));
        const paddedBar = bar.padEnd(barWidth);
        const color = span.status === "error" ? ANSI_RED : colorForSpan(span.name);
        const msLabel = `${span.durationMs}ms`.padStart(MS_WIDTH);
        lines.push(`${name} ${paint(color, paddedBar)} ${paint(ANSI_DIM, msLabel)}`);
    }
    // Time ruler: fewer ticks on narrow canvases so labels do not collide.
    const tickCount = barWidth >= 50 ? 5 : barWidth >= 30 ? 3 : 2;
    const tickValues = [];
    for (let i = 0; i < tickCount; i++)
        tickValues.push(`${Math.round(totalMs * (i / (tickCount - 1)))}ms`);
    const rulerLine = " ".repeat(NAME_WIDTH + 3) + buildTimeRuler(barWidth, tickValues);
    lines.push("");
    lines.push(paint(ANSI_DIM, rulerLine));
    // Per-name summary: count + total ms, descending by total ms. A Map keeps
    // first-seen order for ties and is safe for arbitrary span names.
    const summary = new Map();
    for (const s of spans) {
        const entry = summary.get(s.name) ?? { count: 0, totalMs: 0 };
        entry.count++;
        entry.totalMs += s.durationMs;
        summary.set(s.name, entry);
    }
    const ranked = [...summary.entries()].sort((a, b) => b[1].totalMs - a[1].totalMs);
    lines.push("");
    lines.push(paint(ANSI_DIM, "Span breakdown (top by total time):"));
    for (const [name, { count, totalMs: tms }] of ranked.slice(0, 10)) {
        const pct = totalMs > 0 ? Math.round((tms / totalMs) * 100) : 0;
        lines.push(` ${paint(colorForSpan(name), "█")} ${name.padEnd(28)} ${count.toString().padStart(4)}× ${tms.toString().padStart(6)}ms ${pct}%`);
    }
    const errors = spans.filter((s) => s.status === "error").length;
    lines.push("");
    lines.push(paint(ANSI_DIM, `${spans.length} spans, ${totalMs}ms total${errors > 0 ? `, ${errors} error(s)` : ""}`));
    return lines.join("\n");
}
/**
 * Build a time ruler line of exactly `width` columns with N tick labels
 * distributed evenly. The last label is right-aligned to the canvas edge.
 * Every earlier label starts at its proportional column and is truncated so
 * it ends before the next tick's column (or, for the second-to-last label,
 * before the last label) with one blank column in between.
 *
 * @param width ruler width in columns
 * @param ticks tick labels, left to right
 * @returns the ruler, or "" when there are no ticks or `width` is not positive
 */
function buildTimeRuler(width, ticks) {
    if (ticks.length === 0 || width <= 0)
        return "";
    const buf = new Array(width).fill(" ");
    const lastIndex = ticks.length - 1;
    // Step 1: last tick, right-aligned. Its start column bounds the tick before it.
    const lastLabel = ticks[lastIndex];
    const lastStart = Math.max(0, width - lastLabel.length);
    for (let j = 0; j < lastLabel.length && lastStart + j < width; j++) {
        buf[lastStart + j] = lastLabel[j];
    }
    // Step 2: earlier ticks, left to right, each clipped to its own slot.
    for (let i = 0; i < lastIndex; i++) {
        const start = Math.round((i / lastIndex) * (width - 1));
        const nextStart = i === lastIndex - 1 ? lastStart : Math.round(((i + 1) / lastIndex) * (width - 1));
        const maxLen = Math.max(0, nextStart - 1 - start);
        const label = ticks[i].slice(0, maxLen);
        for (let j = 0; j < label.length; j++)
            buf[start + j] = label[j];
    }
    return buf.join("");
}
223
391
  /**
224
392
  * Coerce an arbitrary string (UUID with hyphens, "span-N", etc.) into a fixed-length
225
393
  * lowercase hex string suitable for OTLP. OTLP collectors (Jaeger, Tempo, OTel