@forwardimpact/libeval 0.1.63 → 0.1.65

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/redaction.js CHANGED
@@ -3,6 +3,14 @@
3
3
  * the trace artifact. Composes two layers: an env-var value allowlist and a
4
4
  * set of credential-shape regexes. Both run on every primitive string.
5
5
  *
6
+ * Coverage includes encoded credential forms, not only raw bytes: the env
7
+ * layer matches each allowlisted secret both raw and in its **standard
8
+ * base64** form at any byte offset within the encoded plaintext, and the
9
+ * pattern layer covers the git `extraheader` basic-auth wrapper. Boundary:
10
+ * **standard base64 only** — URL-safe base64, hex, and percent-encoding are
11
+ * not covered — and the **trace-write sink only**; content an agent authors
12
+ * into a wiki commit is never passed through this redactor.
13
+ *
6
14
  * Stateless after construction: `env` is captured once so in-process
7
15
  * `process.env` writes (e.g. agent-runner.js LIBEVAL_SKILL, commands/run.js
8
16
  * LIBEVAL_AGENT_PROFILE) cannot smuggle a value past the redactor.
@@ -52,15 +60,55 @@ const ENV_PLACEHOLDER = (name) => `[REDACTED:env:${name}]`;
52
60
  const PATTERN_PLACEHOLDER = (kind) => `[REDACTED:pattern:${kind}]`;
53
61
 
54
62
  /**
55
- * Build a frozen { name value } snapshot of the requested env vars.
56
- * Empty strings are skipped a leaked empty env var would otherwise
57
- * cause every empty string in the trace to be replaced.
63
+ * Minimum secret byte length for encoded-form matching. At 9 bytes the
64
+ * shortest offset core is exactly 8 chars; below 9 it drops under 8 — too
65
+ * short to be a sound needle against ordinary base64 trace content (margin of
66
+ * safety, false positives). Every DEFAULT_ENV_ALLOWLIST value (token, key,
67
+ * password) far exceeds it.
68
+ */
69
+ const MIN_ENCODED_SECRET_BYTES = 9;
70
+
71
+ // Leading base64 chars contaminated by the k filler bytes, per alignment.
72
+ const ENCODED_LEAD_STRIP = [0, 2, 3];
73
+
74
/**
 * Compute the standard-base64 search needles for `secret`, one for each of
 * the three positions (0, 1 or 2 bytes in) the secret can occupy inside a
 * 3-byte base64 group.
 *
 * For each alignment the secret is prefixed with that many NUL filler bytes,
 * base64-encoded, and stripped of `=` padding. The leading characters that mix
 * filler bits with secret bits (ENCODED_LEAD_STRIP[k]) and the last four
 * characters (which can mix secret bits with whatever follows the secret) are
 * then cut off. What remains depends only on the secret's own bytes, so it is
 * a substring of the base64 of any plaintext holding `secret` at that
 * alignment.
 *
 * Secrets shorter than MIN_ENCODED_SECRET_BYTES (measured in UTF-8 bytes)
 * yield no needles at all.
 *
 * @param {string} secret - Raw secret value.
 * @returns {string[]} Three padding-free cores (alignment 0, 1, 2), or [].
 */
function encodedNeedles(secret) {
  const byteCount = Buffer.byteLength(secret, "utf8");
  if (byteCount < MIN_ENCODED_SECRET_BYTES) return [];

  return [0, 1, 2].map((alignment) => {
    const filler = "\0".repeat(alignment);
    const unpadded = Buffer.from(filler + secret, "utf8")
      .toString("base64")
      .replace(/=+$/, "");
    // Drop the filler-contaminated head and the neighbour-dependent tail.
    const head = ENCODED_LEAD_STRIP[alignment];
    const tail = unpadded.length - 4;
    return unpadded.slice(head, tail);
  });
}
98
+
99
/**
 * Build a deeply frozen { name → { secret, needles } } snapshot of the
 * requested env vars.
 *
 * Only non-empty string values are captured: an empty string would otherwise
 * make every empty string in the trace a redaction match. `needles` is
 * whatever `encodedNeedles(secret)` returns for the value (its precomputed
 * standard-base64 cores).
 *
 * `Object.freeze` is shallow, so each entry and its `needles` array are frozen
 * individually; freezing only the outer map would leave the secrets and
 * needles replaceable after construction.
 *
 * @param {Record<string, unknown>} env - Environment map to read from.
 * @param {Iterable<string>} allowlist - Names of the env vars to capture.
 * @returns {Readonly<Record<string, Readonly<{secret: string, needles: ReadonlyArray<string>}>>>}
 */
function snapshotEnv(env, allowlist) {
  const snap = {};
  for (const name of allowlist) {
    const v = env[name];
    if (typeof v === "string" && v.length > 0) {
      snap[name] = Object.freeze({
        secret: v,
        needles: Object.freeze(encodedNeedles(v)),
      });
    }
  }
  return Object.freeze(snap);
}
@@ -81,7 +129,7 @@ function walk(value, redactString) {
81
129
  export class Redactor {
82
130
  /**
83
131
  * @param {object} deps
84
- * @param {Readonly<Record<string, string>>} deps.envSnapshot - Frozen { name → secret } map captured at construction time.
132
+ * @param {Readonly<Record<string, {secret: string, needles: string[]}>>} deps.envSnapshot - Frozen { name → { secret, needles } } map captured at construction time; `needles` are the precomputed standard-base64 cores of `secret`.
85
133
  * @param {ReadonlyArray<{kind: string, regex: RegExp}>} deps.patterns - Credential-shape regexes; each match becomes `[REDACTED:pattern:KIND]`.
86
134
  * @param {boolean} deps.enabled - When false, `redactValue` returns its input by reference.
87
135
  */
@@ -109,10 +157,21 @@ export class Redactor {
109
157
  */
110
158
  #redactString(s) {
111
159
  let out = s;
112
- for (const [name, secret] of Object.entries(this.envSnapshot)) {
160
+ for (const [name, { secret, needles }] of Object.entries(
161
+ this.envSnapshot,
162
+ )) {
113
163
  if (out.includes(secret)) {
114
164
  out = out.split(secret).join(ENV_PLACEHOLDER(name));
115
165
  }
166
+ // Standard-base64 form at any byte offset. Order among the three needles
167
+ // is irrelevant: once a region is replaced by the placeholder (which
168
+ // shares no base64 run with any needle) those bytes are gone, so a later
169
+ // needle cannot re-match them. The floor keeps every needle ≥ 8 chars.
170
+ for (const needle of needles) {
171
+ if (out.includes(needle)) {
172
+ out = out.split(needle).join(ENV_PLACEHOLDER(name));
173
+ }
174
+ }
116
175
  }
117
176
  for (const { kind, regex } of this.patterns) {
118
177
  out = out.replace(regex, PATTERN_PLACEHOLDER(kind));
@@ -171,6 +171,7 @@ export class TraceCollector {
171
171
  index: this.turnIndex++,
172
172
  role: "assistant",
173
173
  source,
174
+ messageId: message.id ?? null,
174
175
  content,
175
176
  usage,
176
177
  });
@@ -235,7 +236,7 @@ export class TraceCollector {
235
236
  durationMs: prev.durationMs + (event.duration_ms ?? 0),
236
237
  numTurns: prev.numTurns + (event.num_turns ?? 0),
237
238
  tokenUsage: sumTokenUsage(prev.tokenUsage, normalizeUsage(event.usage)),
238
- modelUsage: event.modelUsage ?? prev.modelUsage,
239
+ modelUsage: mergeModelUsage(prev.modelUsage, event.modelUsage),
239
240
  };
240
241
  }
241
242
 
@@ -245,7 +246,7 @@ export class TraceCollector {
245
246
  */
246
247
  toJSON() {
247
248
  return {
248
- version: "1.1.0",
249
+ version: "1.2.0",
249
250
  metadata: this.metadata ?? {
250
251
  timestamp: this.now(),
251
252
  sessionId: null,
@@ -363,6 +364,61 @@ function sumTokenUsage(a, b) {
363
364
  };
364
365
  }
365
366
 
367
+ /**
368
+ * Per-model fields that sum additively across result events — token counts,
369
+ * per-model cost, and request counters. Every other per-model field (e.g. a
370
+ * context-window size) is carried first-seen, never summed.
371
+ */
372
+ const ADDITIVE_MODEL_FIELDS = [
373
+ "inputTokens",
374
+ "outputTokens",
375
+ "cacheReadInputTokens",
376
+ "cacheCreationInputTokens",
377
+ "costUSD",
378
+ "webSearchRequests",
379
+ ];
380
+
381
/**
 * Combine the per-model usage maps of two result events.
 *
 * When only one side is present it is returned as-is (by reference); when
 * neither is, the result is null. Otherwise a new map is built over the union
 * of model names, and each model's entry is produced by `mergeOneModel`, with
 * an empty object standing in for a model missing on one side.
 *
 * @param {object|null} prevMU - Usage accumulated so far.
 * @param {object|null} nextMU - Usage reported by the new event.
 * @returns {object|null}
 */
function mergeModelUsage(prevMU, nextMU) {
  if (!prevMU) return nextMU ?? null;
  if (!nextMU) return prevMU;

  // prev's models first, then any model only the new event mentions.
  const modelNames = [
    ...new Set(Object.keys(prevMU).concat(Object.keys(nextMU))),
  ];
  return modelNames.reduce((acc, modelName) => {
    const before = prevMU[modelName] ?? {};
    const after = nextMU[modelName] ?? {};
    acc[modelName] = mergeOneModel(before, after);
    return acc;
  }, {});
}
402
+
403
/**
 * Merge the usage records of a single model.
 *
 * Fields listed in ADDITIVE_MODEL_FIELDS are summed when either side has
 * them, with a missing or nullish value counting as 0. Every other field
 * keeps the value from `a` when `a` has it, and falls back to `b` otherwise.
 * Neither input is modified.
 *
 * @param {object} a - First-seen (prev) per-model usage.
 * @param {object} b - Next per-model usage.
 * @returns {object} New merged record: `a`'s keys first, then `b`-only keys.
 */
function mergeOneModel(a, b) {
  const combined = Object.assign({}, a, b);

  // Non-additive fields: first-seen wins, so put a's values back over b's.
  Object.keys(a).forEach((key) => {
    if (!ADDITIVE_MODEL_FIELDS.includes(key)) combined[key] = a[key];
  });

  // Additive fields: sum both sides.
  ADDITIVE_MODEL_FIELDS.forEach((key) => {
    if (key in a || key in b) {
      combined[key] = (a[key] ?? 0) + (b[key] ?? 0);
    }
  });

  return combined;
}
421
+
366
422
  /**
367
423
  * Format milliseconds into a human-readable duration.
368
424
  * @param {number} ms - Duration in milliseconds
@@ -28,13 +28,28 @@ export class TraceGitHub {
28
28
  }
29
29
 
30
30
  /**
31
- * List recent workflow runs, optionally filtered by name pattern.
31
+ * List recent workflow runs, optionally filtered by name pattern and by the
32
+ * participant whose trace lane a run carries.
33
+ *
34
+ * Without `participant`, behaviour is unchanged: the workflow-name pattern is
35
+ * the only filter. With `participant`, each name-matched run is resolved
36
+ * against its trace lane (see {@link runMatchesParticipant}) and annotated
37
+ * with a `match` field:
38
+ * - `"confirmed"` — the participant's lane is present in the run's
39
+ * artifacts (matrix artifact name, or a member filename in the shared
40
+ * dispatch artifact).
41
+ * - `"unconfirmed-pending-artifacts"` — the run's workflow mints trace
42
+ * artifacts but none exist yet (still running, or completed-but-not-yet
43
+ * uploaded); reported as a candidate, never silently dropped.
44
+ * Runs that have artifacts but no matching lane are omitted. Participant
45
+ * identity is read from artifact/file *names* only, never from trace content.
32
46
  *
33
47
  * @param {object} [opts]
34
48
  * @param {string} [opts.pattern] - Case-insensitive regex to match workflow name (default: "kata|agent" — covers `Kata: Shift`, `Kata: Dispatch`, and any `agent`-named workflow)
35
49
  * @param {number} [opts.limit=50] - Max runs to return from GitHub API
36
50
  * @param {string} [opts.lookback="7d"] - How far back to search (e.g. "7d", "24h", "2w")
37
- * @returns {Promise<object[]>} Array of {workflow, runId, status, conclusion, createdAt, branch, url}
51
+ * @param {string} [opts.participant] - Participant name; when set, filter/annotate runs by trace lane
52
+ * @returns {Promise<object[]>} Array of {workflow, runId, status, conclusion, createdAt, branch, url[, match]}
38
53
  */
39
54
  async listRuns(opts = {}) {
40
55
  const { pattern = "kata|agent", limit = 50, lookback = "7d" } = opts;
@@ -52,7 +67,7 @@ export class TraceGitHub {
52
67
  const runs = data.workflow_runs ?? [];
53
68
 
54
69
  const re = new RegExp(pattern, "i");
55
- return runs
70
+ const matched = runs
56
71
  .filter((r) => re.test(r.name))
57
72
  .map((r) => ({
58
73
  workflow: r.name,
@@ -63,6 +78,133 @@ export class TraceGitHub {
63
78
  branch: r.head_branch,
64
79
  url: r.html_url,
65
80
  }));
81
+
82
+ if (!opts.participant) return matched;
83
+
84
+ const out = [];
85
+ for (const run of matched) {
86
+ const verdict = await this.runMatchesParticipant(
87
+ run.runId,
88
+ opts.participant,
89
+ );
90
+ if (verdict === "omit") continue;
91
+ out.push({ ...run, match: verdict });
92
+ }
93
+ return out;
94
+ }
95
+
96
+ /**
97
+ * Decide whether a run carries a participant's trace lane.
98
+ *
99
+ * Matrix hosts name the participant in an artifact name
100
+ * (`trace--<participant>`); dispatch hosts name it in a member filename
101
+ * (`trace--<case>--<participant>.<role>.ndjson`) inside one shared `trace--*`
102
+ * artifact. The GitHub artifacts API exposes only artifact-level metadata, so
103
+ * a matrix lane confirms from the inventory alone, while a dispatch lane
104
+ * requires downloading the shared artifact and listing its extracted member
105
+ * filenames — names only, never trace content.
106
+ *
107
+ * A run whose trace artifacts are absent (still running, or
108
+ * completed-but-not-yet-uploaded) is a candidate, not a drop.
109
+ *
110
+ * @param {number|string} runId
111
+ * @param {string} participant
112
+ * @returns {Promise<"confirmed"|"unconfirmed-pending-artifacts"|"omit">}
113
+ */
114
+ async runMatchesParticipant(runId, participant) {
115
+ const url = `${API}/repos/${this.owner}/${this.repo}/actions/runs/${runId}/artifacts`;
116
+ const data = await this.#get(url);
117
+ const artifacts = data.artifacts ?? [];
118
+ const traceArtifacts = artifacts.filter((a) =>
119
+ a.name.startsWith("trace--"),
120
+ );
121
+
122
+ // No trace artifacts yet: a candidate the matcher must report, not drop —
123
+ // the lane may upload when the host completes.
124
+ if (traceArtifacts.length === 0) return "unconfirmed-pending-artifacts";
125
+
126
+ // Matrix host: the participant is an artifact name. No download.
127
+ if (
128
+ participantInNames(
129
+ traceArtifacts.map((a) => a.name),
130
+ participant,
131
+ )
132
+ ) {
133
+ return "confirmed";
134
+ }
135
+
136
+ // Dispatch host: one shared artifact whose members name the participant.
137
+ // Download and list member filenames (names only).
138
+ for (const artifact of traceArtifacts) {
139
+ const { files } = await this.downloadTrace(runId, {
140
+ name: artifact.name,
141
+ });
142
+ if (participantInNames(files, participant)) return "confirmed";
143
+ }
144
+ return "omit";
145
+ }
146
+
147
+ /**
148
+ * Resolve a participant's lane trace path for a known run in one keyed
149
+ * lookup — no run enumeration, no trace-content inspection.
150
+ *
151
+ * Matrix host: the artifact name carries the participant (no download).
152
+ * Dispatch host: download the shared `trace--*` artifact and return the
153
+ * extracted member file whose name carries the participant.
154
+ *
155
+ * @param {number|string} runId
156
+ * @param {string} participant
157
+ * @param {object} [opts]
158
+ * @param {string} [opts.dir] - Output directory for a downloaded dispatch artifact
159
+ * @returns {Promise<{runId: (number|string), participant: string, host: "matrix"|"dispatch", artifact: string, path: string}>}
160
+ * @throws {Error} when the run has no trace artifacts, or none carries the participant's lane.
161
+ */
162
+ async findByKey(runId, participant, opts = {}) {
163
+ const url = `${API}/repos/${this.owner}/${this.repo}/actions/runs/${runId}/artifacts`;
164
+ const data = await this.#get(url);
165
+ const artifacts = data.artifacts ?? [];
166
+ const traceArtifacts = artifacts.filter((a) =>
167
+ a.name.startsWith("trace--"),
168
+ );
169
+ if (traceArtifacts.length === 0) {
170
+ throw new Error(`No trace artifacts for run ${runId}`);
171
+ }
172
+
173
+ // Matrix host: the artifact name carries the participant. No download.
174
+ const matrix = traceArtifacts.find((a) =>
175
+ participantInNames([a.name], participant),
176
+ );
177
+ if (matrix) {
178
+ return {
179
+ runId,
180
+ participant,
181
+ host: "matrix",
182
+ artifact: matrix.name,
183
+ path: matrix.name,
184
+ };
185
+ }
186
+
187
+ // Dispatch host: download the shared artifact and match a member filename.
188
+ for (const artifact of traceArtifacts) {
189
+ const { dir, files } = await this.downloadTrace(runId, {
190
+ name: artifact.name,
191
+ dir: opts.dir,
192
+ });
193
+ const member = files.find((f) => participantInNames([f], participant));
194
+ if (member) {
195
+ return {
196
+ runId,
197
+ participant,
198
+ host: "dispatch",
199
+ artifact: artifact.name,
200
+ path: path.join(dir, member),
201
+ };
202
+ }
203
+ }
204
+
205
+ throw new Error(
206
+ `No trace lane for participant "${participant}" in run ${runId}`,
207
+ );
66
208
  }
67
209
 
68
210
  /**
@@ -151,6 +293,36 @@ export class TraceGitHub {
151
293
  }
152
294
  }
153
295
 
296
/**
 * Report whether any name in `names` carries `participant`'s trace lane.
 *
 * Only names starting with `trace--` are considered, and two shapes match:
 *  - matrix artifact name `trace--<participant>`: everything after the prefix
 *    equals the participant exactly;
 *  - dispatch member filename `trace--<case>--<participant>.<role>.ndjson`:
 *    the text after the first `--` following the prefix, cut at its first `.`
 *    (or taken whole when there is none), equals the participant exactly.
 *
 * Comparison is whole-segment equality, so `release` does not match
 * `release-engineer` or the reverse. Because the segment is cut at the first
 * `.` only, a participant name that itself contains `.` never matches the
 * dispatch shape, and any further `--` stays part of the segment.
 *
 * @param {string[]} names - Artifact names or extracted member filenames.
 * @param {string} participant - Participant name to look for.
 * @returns {boolean}
 */
export function participantInNames(names, participant) {
  const prefix = "trace--";
  for (const name of names) {
    if (!name.startsWith(prefix)) continue;

    const remainder = name.slice(prefix.length);
    // Matrix shape: the remainder is the participant.
    if (remainder === participant) return true;

    // Dispatch shape: skip `<case>--`, then read up to the first dot.
    const caseEnd = remainder.indexOf("--");
    if (caseEnd < 0) continue;
    const [segment] = remainder.slice(caseEnd + 2).split(".");
    if (segment === participant) return true;
  }
  return false;
}
325
+
154
326
  /**
155
327
  * Pick the trace artifact to download from a workflow run's artifact list.
156
328
  *
@@ -0,0 +1,101 @@
1
+ /**
2
+ * Multi-file orchestrator for cross-trace `fit-trace` verbs.
3
+ *
4
+ * Two functions centralise the load-tag-concat (`runOver`) and
5
+ * aggregate-and-sort (`aggregate`) policies so every cross-trace verb shares
6
+ * one source-attribution rule. `compareTwo` derives per-side identity from
7
+ * each input's basename and threads it into `TraceQuery.compare()`.
8
+ *
9
+ * `load` is injected (the exported `loadTrace` from `commands/trace.js`) so
10
+ * this module stays IO-policy-free and unit-testable with a stub.
11
+ */
12
+ import { basename } from "node:path";
13
+
14
/**
 * Run `query` over every file and return all emitted records in one flat
 * array, ordered by file and then by record within the file.
 *
 * Each file is turned into a query target by `load(file)`. When more than one
 * file is supplied, every record is shallow-copied with a `source` field set
 * to the file's basename; with a single file (or none) the records are passed
 * through untouched.
 *
 * @param {string[]} files
 * @param {(tq: object) => object[]} query
 * @param {(file: string) => object} load
 * @returns {object[]}
 */
export function runOver(files, query, load) {
  const tagWithSource = files.length > 1;
  return files.flatMap((file) => {
    const source = basename(file);
    return Array.from(query(load(file)), (record) =>
      tagWithSource ? { ...record, source } : record,
    );
  });
}
35
+
36
/**
 * Run `query` over every file, group the emitted records by `key(record)`,
 * and return one record per group sorted by `count`, highest first.
 *
 * The first record seen for a key is shallow-copied and becomes the group's
 * record; each later record with the same key adds its own `count` to it.
 * When more than one file is supplied, each returned record also gets a
 * `sources` array: the sorted basenames of the files that contributed to the
 * group.
 *
 * @param {string[]} files
 * @param {(tq: object) => Array<{count: number}>} query
 * @param {(record: object) => string} key
 * @param {(file: string) => object} load
 * @returns {object[]}
 */
export function aggregate(files, query, key, load) {
  const groups = new Map();

  files.forEach((file) => {
    const source = basename(file);
    for (const record of query(load(file))) {
      const groupKey = key(record);
      let group = groups.get(groupKey);
      if (group === undefined) {
        group = { record: { ...record }, sources: new Set() };
        groups.set(groupKey, group);
      } else {
        group.record.count += record.count;
      }
      group.sources.add(source);
    }
  });

  const withSources = files.length > 1;
  const rows = [];
  for (const { record, sources } of groups.values()) {
    rows.push(withSources ? { ...record, sources: [...sources].sort() } : record);
  }
  return rows.sort((left, right) => right.count - left.count);
}
68
+
69
/**
 * Compare two trace files.
 *
 * Both files are loaded with `load` (first `a`, then `b`). Each side's
 * identity is then derived from its file path by `parseIdentity`, and the
 * result of `loadedA.compare(loadedB, { aIdentity, bIdentity })` is returned.
 *
 * @param {string} a
 * @param {string} b
 * @param {(file: string) => object} load
 * @returns {object}
 */
export function compareTwo(a, b, load) {
  const [left, right] = [a, b].map((file) => load(file));
  const identities = {
    aIdentity: parseIdentity(a),
    bIdentity: parseIdentity(b),
  };
  return left.compare(right, identities);
}
86
+
87
/**
 * Derive `{caseName, participant}` from a trace file path.
 *
 * Only the basename is inspected. When it has the shape
 * `trace--<case>--<participant>.<role>.ndjson`, the case is the text up to
 * the first `--` after the prefix and the participant is what lies between
 * that `--` and the final `.<role>.ndjson`. For any other basename,
 * `caseName` is the basename with a trailing `.ndjson` removed (if present)
 * and `participant` is null.
 *
 * @param {string} file
 * @returns {{caseName: string, participant: string|null}}
 */
export function parseIdentity(file) {
  const name = basename(file);
  const parts = /^trace--(.+?)--(.+?)\.[^.]+\.ndjson$/.exec(name);
  if (parts === null) {
    return { caseName: name.replace(/\.ndjson$/, ""), participant: null };
  }
  const [, caseName, participant] = parts;
  return { caseName, participant };
}