@forwardimpact/libeval 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,213 @@
1
+ import { createWriteStream } from "node:fs";
2
+ import { mkdir } from "node:fs/promises";
3
+ import path from "node:path";
4
+ import { pipeline } from "node:stream/promises";
5
+ import { Readable } from "node:stream";
6
+
7
+ const API = "https://api.github.com";
8
+
9
/**
 * GitHub API client for trace-related operations: listing workflow runs
 * and downloading trace artifacts.
 */
export class TraceGitHub {
  /**
   * @param {object} deps
   * @param {string} deps.token - GitHub token
   * @param {string} deps.owner - Repository owner
   * @param {string} deps.repo - Repository name
   */
  constructor({ token, owner, repo }) {
    this.token = token;
    this.owner = owner;
    this.repo = repo;
  }

  /**
   * List recent workflow runs, optionally filtered by workflow name.
   *
   * The name filter is applied locally after the API call, so fewer than
   * `limit` runs may be returned.
   *
   * @param {object} [opts]
   * @param {string} [opts.pattern] - Case-insensitive regular expression source matched against the workflow name (default: "agent")
   * @param {number} [opts.limit=50] - Max runs to request from the GitHub API (sent as per_page, capped at 100)
   * @param {string} [opts.lookback="7d"] - How far back to search (e.g. "7d", "24h", "2w")
   * @returns {Promise<object[]>} Array of {workflow, runId, status, conclusion, createdAt, branch, url}
   */
  async listRuns(opts = {}) {
    const { pattern = "agent", limit = 50, lookback = "7d" } = opts;
    const cutoff = parseLookback(lookback);

    const params = new URLSearchParams({
      per_page: String(Math.min(limit, 100)),
    });
    // An unparseable lookback yields no cutoff, so no date filter is sent.
    if (cutoff) {
      params.set("created", `>=${cutoff}`);
    }

    const url = `${API}/repos/${this.owner}/${this.repo}/actions/runs?${params}`;
    const data = await this.#get(url);
    const runs = data.workflow_runs ?? [];

    // eslint-disable-next-line security/detect-non-literal-regexp -- pattern is caller-controlled, not untrusted input
    const re = new RegExp(pattern, "i");
    return runs
      // A missing name is treated as empty rather than as the text "null".
      .filter((r) => re.test(r.name ?? ""))
      .map((r) => ({
        workflow: r.name,
        runId: r.id,
        status: r.status,
        conclusion: r.conclusion,
        createdAt: r.created_at,
        branch: r.head_branch,
        url: r.html_url,
      }));
  }

  /**
   * Download a trace artifact from a workflow run and extract it.
   *
   * Tries artifact names in order: combined-trace, agent-trace (or only
   * `opts.name` when given). The artifact zip is downloaded into the output
   * directory and extracted there with the `unzip` command-line tool.
   *
   * @param {number|string} runId
   * @param {object} [opts]
   * @param {string} [opts.dir] - Output directory (default: /tmp/trace-<runId>)
   * @param {string} [opts.name] - Specific artifact name to download
   * @returns {Promise<{dir: string, artifact: string, files: string[]}>}
   * @throws {Error} When no matching artifact exists or the download fails
   */
  async downloadTrace(runId, opts = {}) {
    const dir = opts.dir ?? `/tmp/trace-${runId}`;
    await mkdir(dir, { recursive: true });

    // List artifacts for this run.
    const url = `${API}/repos/${this.owner}/${this.repo}/actions/runs/${runId}/artifacts`;
    const data = await this.#get(url);
    const artifacts = data.artifacts ?? [];

    // Find the trace artifact, honouring the preference order.
    const preferredNames = opts.name
      ? [opts.name]
      : ["combined-trace", "agent-trace"];
    let artifact = null;
    for (const name of preferredNames) {
      artifact = artifacts.find((a) => a.name === name);
      if (artifact) break;
    }

    if (!artifact) {
      const available = artifacts.map((a) => a.name).join(", ");
      throw new Error(
        `No trace artifact found for run ${runId}. Available: ${available || "none"}`,
      );
    }

    // Download the zip.
    const zipPath = path.join(dir, `${artifact.name}.zip`);
    const downloadUrl = `${API}/repos/${this.owner}/${this.repo}/actions/artifacts/${artifact.id}/zip`;
    const response = await fetch(downloadUrl, {
      headers: this.#headers(),
      redirect: "follow",
    });
    if (!response.ok) {
      throw new Error(
        `Failed to download artifact: ${response.status} ${response.statusText}`,
      );
    }

    // Stream to disk then extract.
    await pipeline(Readable.fromWeb(response.body), createWriteStream(zipPath));

    // Pass arguments as an array so no shell is involved: a double-quoted
    // string in a shell command still expands $(...) and backticks, which
    // would let a crafted directory or artifact name execute commands.
    const { execFileSync } = await import("node:child_process");
    execFileSync("unzip", ["-o", "-q", zipPath, "-d", dir]);

    // List extracted files (the downloaded zip itself is excluded).
    const { readdirSync } = await import("node:fs");
    const files = readdirSync(dir).filter((f) => !f.endsWith(".zip"));

    return { dir, artifact: artifact.name, files };
  }

  /**
   * GET a GitHub API URL and parse the JSON body.
   *
   * @param {string} url
   * @returns {Promise<object>}
   * @throws {Error} When the response status is not OK
   */
  async #get(url) {
    const response = await fetch(url, { headers: this.#headers() });
    if (!response.ok) {
      throw new Error(`GitHub API: ${response.status} ${response.statusText}`);
    }
    return response.json();
  }

  /**
   * Request headers sent with every GitHub API call.
   *
   * @returns {Record<string, string>}
   */
  #headers() {
    return {
      Authorization: `Bearer ${this.token}`,
      Accept: "application/vnd.github+json",
      "X-GitHub-Api-Version": "2022-11-28",
    };
  }
}
152
+
153
/**
 * Parse a lookback duration string into an ISO date string.
 * Supports: Nd (days), Nh (hours), Nw (weeks).
 *
 * Anything that cannot be turned into a valid date (a non-string value, an
 * unknown unit, or a count so large that the result is outside the range of
 * Date) yields null instead of throwing.
 *
 * @param {string} lookback
 * @returns {string|null} ISO date string or null if unparseable
 */
function parseLookback(lookback) {
  if (typeof lookback !== "string") return null;

  const match = /^(\d+)([dhw])$/.exec(lookback);
  if (!match) return null;

  const [, val, unit] = match;
  const ms = { d: 86400000, h: 3600000, w: 604800000 }[unit];
  const cutoff = new Date(Date.now() - Number.parseInt(val, 10) * ms);

  // toISOString() throws RangeError on an Invalid Date, so check first.
  if (Number.isNaN(cutoff.getTime())) return null;
  return cutoff.toISOString();
}
166
+
167
/**
 * Parse a GitHub repository URL or "owner/repo" string.
 *
 * Accepted forms:
 *   - SSH:   git@github.com:owner/repo.git
 *   - HTTPS: https://github.com/owner/repo (optional ".git", trailing slash,
 *            or further path segments, which are ignored)
 *   - Plain: owner/repo (optional ".git" or trailing slash)
 *
 * @param {string} remote - Git remote URL or owner/repo string
 * @returns {{owner: string, repo: string}}
 * @throws {Error} When the value matches none of the accepted forms
 */
export function parseGitRemote(remote) {
  const text = String(remote).trim();

  // One pattern covers both SSH ("github.com:") and HTTPS ("github.com/").
  // The repo is limited to a single path segment so a trailing slash or
  // extra path (e.g. "/tree/main") never ends up in the repository name.
  const hosted = text.match(
    /github\.com[:/]([^/]+)\/([^/]+?)(?:\.git)?(?:\/.*)?$/,
  );
  if (hosted) return { owner: hosted[1], repo: hosted[2] };

  // Plain owner/repo format (no github.com prefix).
  const simple = text.match(/^([^/:@]+)\/([^/]+?)(?:\.git)?\/?$/);
  if (simple) return { owner: simple[1], repo: simple[2] };

  throw new Error(`Cannot parse GitHub remote: ${remote}`);
}
187
+
188
/**
 * Build a TraceGitHub client. The token is read through libconfig's "eval"
 * script config; the repository comes from `opts.repo` when supplied and
 * otherwise from the URL of the local git remote named "origin".
 *
 * @param {object} [opts]
 * @param {string} [opts.repo] - "owner/repo" override (default: detect from git remote)
 * @returns {Promise<TraceGitHub>}
 */
export async function createTraceGitHub(opts = {}) {
  const libconfig = await import("@forwardimpact/libconfig");
  const config = await libconfig.createScriptConfig("eval");
  const token = config.ghToken();

  // Only shell out to git when the caller did not name a repository.
  let remote = opts.repo;
  if (!remote) {
    const childProcess = await import("node:child_process");
    const output = childProcess.execSync("git remote get-url origin", {
      encoding: "utf8",
    });
    remote = output.trim();
  }

  const { owner, repo } = parseGitRemote(remote);
  return new TraceGitHub({ token, owner, repo });
}
@@ -0,0 +1,346 @@
1
/**
 * Query engine for structured trace documents produced by TraceCollector.
 *
 * Loads a structured JSON trace into memory and provides methods for
 * paging, searching, filtering, and summarizing turns — the operations
 * agents need to analyze large traces efficiently.
 */
export class TraceQuery {
  /**
   * @param {object} trace - Structured trace document (output of TraceCollector.toJSON())
   */
  constructor(trace) {
    this.trace = trace;
    this.metadata = trace.metadata ?? {};
    this.turns = trace.turns ?? [];
    this.summary = trace.summary ?? {};
  }

  /**
   * High-level overview: metadata, summary, turn count, and tool frequency.
   * @returns {object}
   */
  overview() {
    return {
      metadata: this.metadata,
      summary: this.summary,
      turnCount: this.turns.length,
      tools: this.toolFrequency(),
    };
  }

  /** @returns {number} Total number of turns in the trace. */
  count() {
    return this.turns.length;
  }

  /**
   * Return turns in range [from, to) (zero-indexed positions in the turn list).
   * @param {number} from
   * @param {number} to
   * @returns {object[]}
   */
  batch(from, to) {
    return this.turns.slice(from, to);
  }

  /**
   * First N turns.
   * @param {number} [n=10]
   * @returns {object[]}
   */
  head(n = 10) {
    return this.turns.slice(0, n);
  }

  /**
   * Last N turns. A count of zero (or anything not greater than zero)
   * yields an empty array.
   * @param {number} [n=10]
   * @returns {object[]}
   */
  tail(n = 10) {
    // slice(-0) is slice(0), which would return every turn.
    if (!(n > 0)) return [];
    return this.turns.slice(-n);
  }

  /**
   * Search all turn content for a regex pattern. Returns matching turns
   * together with short descriptions of where the pattern matched.
   *
   * Searches: assistant text blocks, tool_use names and stringified input,
   * and tool_result content.
   *
   * @param {string} pattern - Regex pattern (case-insensitive)
   * @param {object} [opts]
   * @param {number} [opts.context=0] - Number of surrounding turns (by turn index) to include
   * @param {number} [opts.limit=50] - Max results
   * @returns {object[]} Array of {turn, matches, context?}
   */
  search(pattern, opts = {}) {
    const { context = 0, limit = 50 } = opts;
    if (limit <= 0) return [];

    // Deliberately no "g" flag: a global regex keeps its position in
    // lastIndex between calls, so reusing it across turns can skip matches
    // that occur earlier in the next string than the previous match ended.
    // eslint-disable-next-line security/detect-non-literal-regexp -- pattern is caller-controlled, not untrusted input
    const re = new RegExp(pattern, "i");
    const hits = [];

    for (const turn of this.turns) {
      const matches = matchTurn(turn, re);
      if (matches.length === 0) continue;

      const entry = { turn, matches };
      if (context > 0) {
        const idx = turn.index;
        entry.context = this.turns.filter(
          (t) =>
            t.index !== idx &&
            t.index >= idx - context &&
            t.index <= idx + context,
        );
      }
      hits.push(entry);
      if (hits.length >= limit) break;
    }
    return hits;
  }

  /**
   * Tool usage frequency, sorted descending.
   * @returns {Array<{tool: string, count: number}>}
   */
  toolFrequency() {
    // A Map (not a plain object) so tool names such as "constructor" or
    // "__proto__" cannot collide with inherited object properties.
    const counts = new Map();
    for (const turn of this.turns) {
      if (turn.role !== "assistant") continue;
      for (const block of turn.content) {
        if (block.type !== "tool_use") continue;
        const tool = String(block.name);
        counts.set(tool, (counts.get(tool) ?? 0) + 1);
      }
    }
    return [...counts]
      .map(([tool, count]) => ({ tool, count }))
      .sort((a, b) => b.count - a.count);
  }

  /**
   * Filter turns involving a specific tool (both the tool_use and its result).
   * A tool_result turn is included when its toolUseId was seen on an earlier
   * matching tool_use block.
   * @param {string} name - Tool name
   * @returns {object[]}
   */
  tool(name) {
    const toolUseIds = new Set();
    const results = [];

    for (const turn of this.turns) {
      if (turn.role === "assistant") {
        const uses = turn.content.filter(
          (b) => b.type === "tool_use" && b.name === name,
        );
        if (uses.length === 0) continue;
        results.push(turn);
        for (const b of uses) {
          if (b.toolUseId) toolUseIds.add(b.toolUseId);
        }
      } else if (
        turn.role === "tool_result" &&
        toolUseIds.has(turn.toolUseId)
      ) {
        results.push(turn);
      }
    }
    return results;
  }

  /**
   * All error turns (tool results with isError=true).
   * @returns {object[]}
   */
  errors() {
    return this.turns.filter(
      (t) => t.role === "tool_result" && t.isError === true,
    );
  }

  /**
   * Extract just the reasoning text from assistant turns. Text blocks of a
   * turn are joined with newlines; turns without text blocks are omitted.
   * @param {object} [opts]
   * @param {number} [opts.from] - Start turn index
   * @param {number} [opts.to] - End turn index (exclusive)
   * @returns {Array<{index: number, text: string}>}
   */
  reasoning(opts = {}) {
    const { from, to } = opts;
    const results = [];
    for (const turn of this.turns) {
      if (turn.role !== "assistant") continue;
      if (from !== undefined && turn.index < from) continue;
      if (to !== undefined && turn.index >= to) continue;
      const texts = turn.content
        .filter((b) => b.type === "text")
        .map((b) => b.text);
      if (texts.length > 0) {
        results.push({ index: turn.index, text: texts.join("\n") });
      }
    }
    return results;
  }

  /**
   * Compact one-line-per-assistant-turn timeline showing tool names,
   * token usage, and a reasoning snippet (first 80 characters of the
   * turn's text). Thinking-only turns are skipped entirely, since they
   * carry no tool calls or text.
   * @returns {string[]}
   */
  timeline() {
    const lines = [];
    for (const turn of this.turns) {
      if (turn.role !== "assistant") continue;

      const tools = turn.content
        .filter((b) => b.type === "tool_use")
        .map((b) => b.name);

      const textBlocks = turn.content
        .filter((b) => b.type === "text")
        .map((b) => b.text);

      const hasThinking = turn.content.some((b) => b.type === "thinking");

      // Skip thinking-only turns (no user-visible content).
      if (hasThinking && tools.length === 0 && textBlocks.length === 0) {
        continue;
      }

      const snippet = textBlocks.join(" ").slice(0, 80).replace(/\n/g, " ");

      const input = turn.usage?.inputTokens ?? 0;
      const output = turn.usage?.outputTokens ?? 0;
      const cacheRead = turn.usage?.cacheReadInputTokens ?? 0;

      const toolStr = tools.length > 0 ? tools.join(", ") : "(text only)";
      // Cache reads are counted as part of the displayed input total.
      const tokenStr = `in:${fmtK(input + cacheRead)} out:${fmtK(output)}`;

      lines.push(
        `[${turn.index}] ${toolStr.padEnd(30)} ${tokenStr.padEnd(18)} ${snippet}`,
      );
    }
    return lines;
  }

  /**
   * Token usage per assistant turn, plus totals. Cost and duration in the
   * totals are taken from the trace summary, not computed here. Assistant
   * turns without a usage record are left out.
   * @returns {object} {totals, perTurn}
   */
  stats() {
    let totalInput = 0;
    let totalOutput = 0;
    let totalCacheRead = 0;
    let totalCacheCreate = 0;
    const perTurn = [];

    for (const turn of this.turns) {
      if (turn.role !== "assistant" || !turn.usage) continue;
      const u = turn.usage;
      const row = {
        index: turn.index,
        inputTokens: u.inputTokens ?? 0,
        outputTokens: u.outputTokens ?? 0,
        cacheReadInputTokens: u.cacheReadInputTokens ?? 0,
        cacheCreationInputTokens: u.cacheCreationInputTokens ?? 0,
      };
      totalInput += row.inputTokens;
      totalOutput += row.outputTokens;
      totalCacheRead += row.cacheReadInputTokens;
      totalCacheCreate += row.cacheCreationInputTokens;
      perTurn.push(row);
    }

    return {
      totals: {
        inputTokens: totalInput,
        outputTokens: totalOutput,
        cacheReadInputTokens: totalCacheRead,
        cacheCreationInputTokens: totalCacheCreate,
        totalCostUsd: this.summary.totalCostUsd ?? 0,
        durationMs: this.summary.durationMs ?? 0,
      },
      perTurn,
    };
  }
}
271
+
272
/**
 * Search a single turn for regex matches. Returns array of match descriptions.
 *
 * For assistant turns, text blocks, tool_use names, and the JSON form of
 * tool_use input are checked; for tool_result turns, the content is checked.
 * Other roles never match.
 *
 * The regex may carry the "g" or "y" flag: its lastIndex is reset around
 * every probe so a previous match cannot cause a later string to be skipped.
 *
 * @param {object} turn
 * @param {RegExp} re
 * @returns {string[]}
 */
function matchTurn(turn, re) {
  // Test from the start of the string and leave no position behind.
  const probe = (text) => {
    re.lastIndex = 0;
    const found = re.test(text);
    re.lastIndex = 0;
    return found;
  };
  // Build "<label>: <excerpt>" and clear any position the excerpt helper left.
  const describe = (label, text) => {
    const line = `${label}: ${excerptAround(text, re)}`;
    re.lastIndex = 0;
    return line;
  };

  const matches = [];
  if (turn.role === "assistant") {
    for (const block of turn.content ?? []) {
      if (
        block.type === "text" &&
        typeof block.text === "string" &&
        probe(block.text)
      ) {
        matches.push(describe("text", block.text));
      }
      if (block.type === "tool_use") {
        if (probe(block.name)) {
          matches.push(`tool_name: ${block.name}`);
        }
        // JSON.stringify(undefined) is undefined, not a string.
        const inputStr = JSON.stringify(block.input) ?? "";
        if (probe(inputStr)) {
          matches.push(describe(`tool_input(${block.name})`, inputStr));
        }
      }
    }
  } else if (turn.role === "tool_result") {
    const raw = turn.content ?? "";
    // Structured (non-string) content is searched in its JSON form.
    const content = typeof raw === "string" ? raw : (JSON.stringify(raw) ?? "");
    if (probe(content)) {
      matches.push(describe("result", content));
    }
  }
  return matches;
}
309
+
310
/**
 * Extract a short excerpt around the first regex match in text: up to 40
 * characters on each side of the match, with "..." marking a truncated side.
 * When the pattern does not match, the first 100 characters are returned.
 *
 * The regex's lastIndex is reset before and after the lookup, so a global
 * or sticky regex is always searched from the start and is handed back to
 * the caller without a leftover position.
 *
 * @param {string} text
 * @param {RegExp} re
 * @returns {string}
 */
function excerptAround(text, re) {
  re.lastIndex = 0;
  const m = re.exec(text);
  // exec() on a /g or /y regex advances lastIndex; a stale value would make
  // the caller's next test() start mid-string and miss earlier matches.
  re.lastIndex = 0;
  if (!m) return text.slice(0, 100);

  const start = Math.max(0, m.index - 40);
  const end = Math.min(text.length, m.index + m[0].length + 40);
  const prefix = start > 0 ? "..." : "";
  const suffix = end < text.length ? "..." : "";
  return `${prefix}${text.slice(start, end)}${suffix}`;
}
327
+
328
/**
 * Render a token count compactly: values below 1000 are printed as-is,
 * anything else is divided by 1000 and shown with one decimal and a "K".
 * @param {number} n
 * @returns {string}
 */
function fmtK(n) {
  const isSmall = n < 1000;
  return isSmall ? String(n) : `${(n / 1000).toFixed(1)}K`;
}
337
+
338
/**
 * Load a structured trace into a TraceQuery.
 *
 * Accepts either the JSON text of a trace or an already-parsed trace object.
 *
 * @param {string|object} json - JSON string, or a parsed trace document
 * @returns {TraceQuery}
 * @throws {SyntaxError} When a string argument is not valid JSON
 * @throws {TypeError} When the (parsed) value is not an object
 */
export function createTraceQuery(json) {
  const trace = typeof json === "string" ? JSON.parse(json) : json;
  // JSON such as "null" or "42" parses fine but is not a trace document;
  // fail here with a clear message instead of on a property access later.
  if (trace === null || typeof trace !== "object") {
    throw new TypeError(
      `Trace must be an object, got ${trace === null ? "null" : typeof trace}`,
    );
  }
  return new TraceQuery(trace);
}