npm - @forwardimpact/libeval - Versions diffs - 0.1.63 → 0.1.65 - Mend

@forwardimpact/libeval 0.1.63 → 0.1.65

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/src/redaction.js CHANGED

@@ -3,6 +3,14 @@
  * the trace artifact. Composes two layers: an env-var value allowlist and a
  * set of credential-shape regexes. Both run on every primitive string.
  *
+ * Coverage includes encoded credential forms, not only raw bytes: the env
+ * layer matches each allowlisted secret both raw and in its **standard
+ * base64** form at any byte offset within the encoded plaintext, and the
+ * pattern layer covers the git `extraheader` basic-auth wrapper. Boundary:
+ * **standard base64 only** — URL-safe base64, hex, and percent-encoding are
+ * not covered — and the **trace-write sink only**; content an agent authors
+ * into a wiki commit is never passed through this redactor.
+ *
  * Stateless after construction: `env` is captured once so in-process
  * `process.env` writes (e.g. agent-runner.js LIBEVAL_SKILL, commands/run.js
  * LIBEVAL_AGENT_PROFILE) cannot smuggle a value past the redactor.
@@ -52,15 +60,55 @@ const ENV_PLACEHOLDER = (name) => `[REDACTED:env:${name}]`;
 const PATTERN_PLACEHOLDER = (kind) => `[REDACTED:pattern:${kind}]`;
 /**
- * Build a frozen { name → value } snapshot of the requested env vars.
- * Empty strings are skipped — a leaked empty env var would otherwise
- * cause every empty string in the trace to be replaced.
+ * Minimum secret byte length for encoded-form matching. At 9 bytes the
+ * shortest offset core is exactly 8 chars; below 9 it drops under 8 — too
+ * short to be a sound needle against ordinary base64 trace content (margin of
+ * safety, false positives). Every DEFAULT_ENV_ALLOWLIST value (token, key,
+ * password) far exceeds it.
+ */
+const MIN_ENCODED_SECRET_BYTES = 9;
+// Leading base64 chars contaminated by the k filler bytes, per alignment.
+const ENCODED_LEAD_STRIP = [0, 2, 3];
+/**
+ * The three offset-invariant standard-base64 core substrings of `secret`, one
+ * per byte alignment (k = 0/1/2). base64 maps disjoint 3-byte groups to 4 chars
+ * independently, so the chars covering a secret's interior groups depend only
+ * on the secret's bytes — never on the bytes surrounding it. Only the partial
+ * groups at each edge are neighbour-dependent; stripping them leaves a core
+ * that appears in the base64 of any plaintext placing `secret` at that
+ * alignment. Padding lives only in the final partial group, which is stripped,
+ * so each core is padding-free and one needle matches padded and unpadded
+ * haystack content. Returns [] below MIN_ENCODED_SECRET_BYTES.
+ * @param {string} secret
+ * @returns {string[]}
+ */
+function encodedNeedles(secret) {
+  if (Buffer.byteLength(secret, "utf8") < MIN_ENCODED_SECRET_BYTES) return [];
+  const needles = [];
+  for (let k = 0; k < 3; k++) {
+    const enc = Buffer.from("\0".repeat(k) + secret, "utf8")
+      .toString("base64")
+      .replace(/=+$/, "");
+    needles.push(enc.slice(ENCODED_LEAD_STRIP[k], enc.length - 4));
+  }
+  return needles;
+}
+/**
+ * Build a frozen { name → { secret, needles } } snapshot of the requested env
+ * vars. Empty strings are skipped — a leaked empty env var would otherwise
+ * cause every empty string in the trace to be replaced. `needles` are the
+ * precomputed standard-base64 cores (empty for sub-floor secrets).
  */
 function snapshotEnv(env, allowlist) {
   const snap = {};
   for (const name of allowlist) {
     const v = env[name];
-    if (typeof v === "string" && v.length > 0) snap[name] = v;
+    if (typeof v === "string" && v.length > 0) {
+      snap[name] = { secret: v, needles: encodedNeedles(v) };
+    }
   }
   return Object.freeze(snap);
 }
@@ -81,7 +129,7 @@ function walk(value, redactString) {
 export class Redactor {
   /**
    * @param {object} deps
-   * @param {Readonly<Record<string, string>>} deps.envSnapshot - Frozen { name → secret } map captured at construction time.
+   * @param {Readonly<Record<string, {secret: string, needles: string[]}>>} deps.envSnapshot - Frozen { name → { secret, needles } } map captured at construction time; `needles` are the precomputed standard-base64 cores of `secret`.
    * @param {ReadonlyArray<{kind: string, regex: RegExp}>} deps.patterns - Credential-shape regexes; each match becomes `[REDACTED:pattern:KIND]`.
    * @param {boolean} deps.enabled - When false, `redactValue` returns its input by reference.
    */
@@ -109,10 +157,21 @@ export class Redactor {
    */
   #redactString(s) {
     let out = s;
-    for (const [name, secret] of Object.entries(this.envSnapshot)) {
+    for (const [name, { secret, needles }] of Object.entries(
+      this.envSnapshot,
+    )) {
       if (out.includes(secret)) {
         out = out.split(secret).join(ENV_PLACEHOLDER(name));
       }
+      // Standard-base64 form at any byte offset. Order among the three needles
+      // is irrelevant: once a region is replaced by the placeholder (which
+      // shares no base64 run with any needle) those bytes are gone, so a later
+      // needle cannot re-match them. The floor keeps every needle ≥ 8 chars.
+      for (const needle of needles) {
+        if (out.includes(needle)) {
+          out = out.split(needle).join(ENV_PLACEHOLDER(name));
+        }
+      }
     }
     for (const { kind, regex } of this.patterns) {
       out = out.replace(regex, PATTERN_PLACEHOLDER(kind));

package/src/trace-collector.js CHANGED

@@ -171,6 +171,7 @@ export class TraceCollector {
       index: this.turnIndex++,
       role: "assistant",
       source,
+      messageId: message.id ?? null,
       content,
       usage,
     });
@@ -235,7 +236,7 @@ export class TraceCollector {
       durationMs: prev.durationMs + (event.duration_ms ?? 0),
       numTurns: prev.numTurns + (event.num_turns ?? 0),
       tokenUsage: sumTokenUsage(prev.tokenUsage, normalizeUsage(event.usage)),
-      modelUsage: event.modelUsage ?? prev.modelUsage,
+      modelUsage: mergeModelUsage(prev.modelUsage, event.modelUsage),
     };
   }
@@ -245,7 +246,7 @@ export class TraceCollector {
    */
   toJSON() {
     return {
-      version: "1.1.0",
+      version: "1.2.0",
       metadata: this.metadata ?? {
         timestamp: this.now(),
         sessionId: null,
@@ -363,6 +364,61 @@ function sumTokenUsage(a, b) {
   };
 }
+/**
+ * Per-model fields that sum additively across result events — token counts,
+ * per-model cost, and request counters. Every other per-model field (e.g. a
+ * context-window size) is carried first-seen, never summed.
+ */
+const ADDITIVE_MODEL_FIELDS = [
+  "inputTokens",
+  "outputTokens",
+  "cacheReadInputTokens",
+  "cacheCreationInputTokens",
+  "costUSD",
+  "webSearchRequests",
+];
+/**
+ * Merge two per-model usage maps across result events. Additive fields
+ * (token counts, cost, request counters) sum; non-additive fields are carried
+ * from the first event that set them (prev wins). Either side may be null.
+ * @param {object|null} prevMU
+ * @param {object|null} nextMU
+ * @returns {object|null}
+ */
+function mergeModelUsage(prevMU, nextMU) {
+  if (!prevMU) return nextMU ?? null;
+  if (!nextMU) return prevMU;
+  const merged = {};
+  for (const model of new Set([
+    ...Object.keys(prevMU),
+    ...Object.keys(nextMU),
+  ])) {
+    merged[model] = mergeOneModel(prevMU[model] ?? {}, nextMU[model] ?? {});
+  }
+  return merged;
+}
+/**
+ * Merge one model's usage: additive fields sum, others carry first-seen (a).
+ * @param {object} a - First-seen (prev) per-model usage.
+ * @param {object} b - Next per-model usage.
+ * @returns {object}
+ */
+function mergeOneModel(a, b) {
+  const entry = { ...a, ...b };
+  for (const field of ADDITIVE_MODEL_FIELDS) {
+    if (field in a || field in b) {
+      entry[field] = (a[field] ?? 0) + (b[field] ?? 0);
+    }
+  }
+  for (const field of Object.keys(a)) {
+    if (!ADDITIVE_MODEL_FIELDS.includes(field)) entry[field] = a[field];
+  }
+  return entry;
+}
 /**
  * Format milliseconds into a human-readable duration.
  * @param {number} ms - Duration in milliseconds

package/src/trace-github.js CHANGED

@@ -28,13 +28,28 @@ export class TraceGitHub {
   }
   /**
-   * List recent workflow runs, optionally filtered by name pattern.
+   * List recent workflow runs, optionally filtered by name pattern and by the
+   * participant whose trace lane a run carries.
+   *
+   * Without `participant`, behaviour is unchanged: the workflow-name pattern is
+   * the only filter. With `participant`, each name-matched run is resolved
+   * against its trace lane (see {@link runMatchesParticipant}) and annotated
+   * with a `match` field:
+   *   - `"confirmed"` — the participant's lane is present in the run's
+   *     artifacts (matrix artifact name, or a member filename in the shared
+   *     dispatch artifact).
+   *   - `"unconfirmed-pending-artifacts"` — the run's workflow mints trace
+   *     artifacts but none exist yet (still running, or completed-but-not-yet
+   *     uploaded); reported as a candidate, never silently dropped.
+   * Runs that have artifacts but no matching lane are omitted. Participant
+   * identity is read from artifact/file *names* only, never from trace content.
    *
    * @param {object} [opts]
    * @param {string} [opts.pattern] - Case-insensitive regex to match workflow name (default: "kata|agent" — covers `Kata: Shift`, `Kata: Dispatch`, and any `agent`-named workflow)
    * @param {number} [opts.limit=50] - Max runs to return from GitHub API
    * @param {string} [opts.lookback="7d"] - How far back to search (e.g. "7d", "24h", "2w")
-   * @returns {Promise<object[]>} Array of {workflow, runId, status, conclusion, createdAt, branch, url}
+   * @param {string} [opts.participant] - Participant name; when set, filter/annotate runs by trace lane
+   * @returns {Promise<object[]>} Array of {workflow, runId, status, conclusion, createdAt, branch, url[, match]}
    */
   async listRuns(opts = {}) {
     const { pattern = "kata|agent", limit = 50, lookback = "7d" } = opts;
@@ -52,7 +67,7 @@ export class TraceGitHub {
     const runs = data.workflow_runs ?? [];
     const re = new RegExp(pattern, "i");
-    return runs
+    const matched = runs
       .filter((r) => re.test(r.name))
       .map((r) => ({
         workflow: r.name,
@@ -63,6 +78,133 @@ export class TraceGitHub {
         branch: r.head_branch,
         url: r.html_url,
       }));
+    if (!opts.participant) return matched;
+    const out = [];
+    for (const run of matched) {
+      const verdict = await this.runMatchesParticipant(
+        run.runId,
+        opts.participant,
+      );
+      if (verdict === "omit") continue;
+      out.push({ ...run, match: verdict });
+    }
+    return out;
+  }
+  /**
+   * Decide whether a run carries a participant's trace lane.
+   *
+   * Matrix hosts name the participant in an artifact name
+   * (`trace--<participant>`); dispatch hosts name it in a member filename
+   * (`trace--<case>--<participant>.<role>.ndjson`) inside one shared `trace--*`
+   * artifact. The GitHub artifacts API exposes only artifact-level metadata, so
+   * a matrix lane confirms from the inventory alone, while a dispatch lane
+   * requires downloading the shared artifact and listing its extracted member
+   * filenames — names only, never trace content.
+   *
+   * A run whose trace artifacts are absent (still running, or
+   * completed-but-not-yet-uploaded) is a candidate, not a drop.
+   *
+   * @param {number|string} runId
+   * @param {string} participant
+   * @returns {Promise<"confirmed"|"unconfirmed-pending-artifacts"|"omit">}
+   */
+  async runMatchesParticipant(runId, participant) {
+    const url = `${API}/repos/${this.owner}/${this.repo}/actions/runs/${runId}/artifacts`;
+    const data = await this.#get(url);
+    const artifacts = data.artifacts ?? [];
+    const traceArtifacts = artifacts.filter((a) =>
+      a.name.startsWith("trace--"),
+    );
+    // No trace artifacts yet: a candidate the matcher must report, not drop —
+    // the lane may upload when the host completes.
+    if (traceArtifacts.length === 0) return "unconfirmed-pending-artifacts";
+    // Matrix host: the participant is an artifact name. No download.
+    if (
+      participantInNames(
+        traceArtifacts.map((a) => a.name),
+        participant,
+      )
+    ) {
+      return "confirmed";
+    }
+    // Dispatch host: one shared artifact whose members name the participant.
+    // Download and list member filenames (names only).
+    for (const artifact of traceArtifacts) {
+      const { files } = await this.downloadTrace(runId, {
+        name: artifact.name,
+      });
+      if (participantInNames(files, participant)) return "confirmed";
+    }
+    return "omit";
+  }
+  /**
+   * Resolve a participant's lane trace path for a known run in one keyed
+   * lookup — no run enumeration, no trace-content inspection.
+   *
+   * Matrix host: the artifact name carries the participant (no download).
+   * Dispatch host: download the shared `trace--*` artifact and return the
+   * extracted member file whose name carries the participant.
+   *
+   * @param {number|string} runId
+   * @param {string} participant
+   * @param {object} [opts]
+   * @param {string} [opts.dir] - Output directory for a downloaded dispatch artifact
+   * @returns {Promise<{runId: (number|string), participant: string, host: "matrix"|"dispatch", artifact: string, path: string}>}
+   * @throws {Error} when the run has no trace artifacts, or none carries the participant's lane.
+   */
+  async findByKey(runId, participant, opts = {}) {
+    const url = `${API}/repos/${this.owner}/${this.repo}/actions/runs/${runId}/artifacts`;
+    const data = await this.#get(url);
+    const artifacts = data.artifacts ?? [];
+    const traceArtifacts = artifacts.filter((a) =>
+      a.name.startsWith("trace--"),
+    );
+    if (traceArtifacts.length === 0) {
+      throw new Error(`No trace artifacts for run ${runId}`);
+    }
+    // Matrix host: the artifact name carries the participant. No download.
+    const matrix = traceArtifacts.find((a) =>
+      participantInNames([a.name], participant),
+    );
+    if (matrix) {
+      return {
+        runId,
+        participant,
+        host: "matrix",
+        artifact: matrix.name,
+        path: matrix.name,
+      };
+    }
+    // Dispatch host: download the shared artifact and match a member filename.
+    for (const artifact of traceArtifacts) {
+      const { dir, files } = await this.downloadTrace(runId, {
+        name: artifact.name,
+        dir: opts.dir,
+      });
+      const member = files.find((f) => participantInNames([f], participant));
+      if (member) {
+        return {
+          runId,
+          participant,
+          host: "dispatch",
+          artifact: artifact.name,
+          path: path.join(dir, member),
+        };
+      }
+    }
+    throw new Error(
+      `No trace lane for participant "${participant}" in run ${runId}`,
+    );
   }
   /**
@@ -151,6 +293,36 @@ export class TraceGitHub {
   }
 }
+/**
+ * Test whether a participant's trace lane is present in a list of names.
+ *
+ * Matches the two trace-naming shapes by *name* only (never by content):
+ *   - matrix artifact name: `trace--<participant>`
+ *   - dispatch member filename: `trace--<case>--<participant>.<role>.ndjson`
+ *
+ * The participant segment is delimited by `--` and ends at the next `--`, `.`,
+ * or end-of-string, so a substring like `release` does not match
+ * `release-engineer` and vice versa.
+ *
+ * @param {string[]} names - Artifact names or extracted member filenames.
+ * @param {string} participant - Participant name to look for.
+ * @returns {boolean}
+ */
+export function participantInNames(names, participant) {
+  return names.some((name) => {
+    if (!name.startsWith("trace--")) return false;
+    const rest = name.slice("trace--".length);
+    // Matrix: `<participant>` is the whole remainder (artifact name).
+    if (rest === participant) return true;
+    // Dispatch: `<case>--<participant>.<role>.ndjson`.
+    const sep = rest.indexOf("--");
+    if (sep === -1) return false;
+    const afterCase = rest.slice(sep + 2);
+    const participantSegment = afterCase.split(".")[0];
+    return participantSegment === participant;
+  });
+}
 /**
  * Pick the trace artifact to download from a workflow run's artifact list.
  *

package/src/trace-multi.js ADDED

@@ -0,0 +1,101 @@
+/**
+ * Multi-file orchestrator for cross-trace `fit-trace` verbs.
+ *
+ * Two functions centralise the load-tag-concat (`runOver`) and
+ * aggregate-and-sort (`aggregate`) policies so every cross-trace verb shares
+ * one source-attribution rule. `compareTwo` derives per-side identity from
+ * each input's basename and threads it into `TraceQuery.compare()`.
+ *
+ * `load` is injected (the exported `loadTrace` from `commands/trace.js`) so
+ * this module stays IO-policy-free and unit-testable with a stub.
+ */
+import { basename } from "node:path";
+/**
+ * Load each file → `TraceQuery`, run `query(tq)`, tag each emitted record with
+ * `source: <basename>` only when more than one file is supplied. Records are
+ * concatenated in file-then-record order.
+ * @param {string[]} files
+ * @param {(tq: object) => object[]} query
+ * @param {(file: string) => object} load
+ * @returns {object[]}
+ */
+export function runOver(files, query, load) {
+  const multi = files.length > 1;
+  const out = [];
+  for (const file of files) {
+    const source = basename(file);
+    const records = query(load(file));
+    for (const record of records) {
+      out.push(multi ? { ...record, source } : record);
+    }
+  }
+  return out;
+}
+/**
+ * Merge per-file record arrays by `key(record)`, summing each record's
+ * existing `count` field (not occurrence count), and frequency-sort by
+ * `count desc`. Merged records carry `sources: string[]` only when more than
+ * one file is supplied.
+ * @param {string[]} files
+ * @param {(tq: object) => Array<{count: number}>} query
+ * @param {(record: object) => string} key
+ * @param {(file: string) => object} load
+ * @returns {object[]}
+ */
+export function aggregate(files, query, key, load) {
+  const multi = files.length > 1;
+  const merged = new Map();
+  for (const file of files) {
+    const source = basename(file);
+    for (const record of query(load(file))) {
+      const k = key(record);
+      if (!merged.has(k)) {
+        merged.set(k, { record: { ...record }, sources: new Set() });
+      } else {
+        merged.get(k).record.count += record.count;
+      }
+      merged.get(k).sources.add(source);
+    }
+  }
+  return [...merged.values()]
+    .map(({ record, sources }) =>
+      multi ? { ...record, sources: [...sources].sort() } : record,
+    )
+    .sort((a, b) => b.count - a.count);
+}
+/**
+ * Load two files, derive each side's `{caseName, participant}` from its
+ * basename via the `split` convention, and thread them into
+ * `a.compare(b, {aIdentity, bIdentity})`.
+ * @param {string} a
+ * @param {string} b
+ * @param {(file: string) => object} load
+ * @returns {object}
+ */
+export function compareTwo(a, b, load) {
+  const qa = load(a);
+  const qb = load(b);
+  return qa.compare(qb, {
+    aIdentity: parseIdentity(a),
+    bIdentity: parseIdentity(b),
+  });
+}
+/**
+ * Parse `trace--<case>--<participant>.<role>.ndjson` into `{caseName,
+ * participant}`. On no match, `caseName` is the basename minus its final
+ * `.ndjson` extension only and `participant` is null.
+ * @param {string} file
+ * @returns {{caseName: string, participant: string|null}}
+ */
+export function parseIdentity(file) {
+  const name = basename(file);
+  const match = name.match(/^trace--(.+?)--(.+?)\.[^.]+\.ndjson$/);
+  if (match) {
+    return { caseName: match[1], participant: match[2] };
+  }
+  return { caseName: name.replace(/\.ndjson$/, ""), participant: null };
+}