npm - @forwardimpact/libeval - Versions diffs - 0.1.63 → 0.1.65 - Mend

@forwardimpact/libeval 0.1.63 → 0.1.65

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17)

package/src/trace-query.js CHANGED Viewed

@@ -1,3 +1,13 @@
+import {
+  ZERO_USAGE,
+  bucketUsageByTool,
+  carriedPerTurn,
+  computeDivergence,
+  isPreChangeDoc,
+  perMessageUsage,
+  reconcileBucketsToTotals,
+} from "./trace-usage.js";
 /**
  * Query engine for structured trace documents produced by TraceCollector.
  *
@@ -33,6 +43,11 @@ export class TraceQuery {
       metadata: this.metadata,
       summary: this.summary,
       turnCount: this.turns.length,
+      resultEventTurns: this.summary.numTurns ?? null,
+      turnPopulations: {
+        turnCount: "rendered-trace-turns",
+        resultEventTurns: "result-event-turns",
+      },
       tools: this.toolFrequency(),
       taskPrompt,
     };
@@ -277,59 +292,216 @@ export class TraceQuery {
   }
   /**
-   * Token usage and cost breakdown per assistant turn, plus totals.
+   * Token usage and cost breakdown, accounted once per API message, plus
+   * totals that name their population.
+   *
+   * A structured document collected before this change (version < 1.2.0)
+   * carries no message identity, so it reports its carried last-wins summary
+   * labeled as such — corrected figures come from re-running the NDJSON source.
    *
-   * Token totals prefer the summary's result-event usage — the SDK's
-   * authoritative ledger, accumulated across every result event in the
-   * trace — over per-turn sums, whose stream-time snapshots double-count
-   * re-emitted messages. Traces without a result event (truncated or
-   * in-flight) fall back to the per-turn sums.
+   * Otherwise: when the trace carries result events, totals are the SDK's
+   * accumulated result-event sums (authoritative); the per-message sums are
+   * compared against them and any divergence on input/cacheRead/cacheCreation
+   * is surfaced, never silently absorbed. A trace with no result event
+   * (truncated or in-flight) falls back to the per-message sums, with output
+   * flagged as a streaming-snapshot lower bound and cost/duration/turns
+   * reported as unavailable rather than a silent 0.
+   *
+   * Every branch returns the same four keys:
+   * - `totals`: token counters plus a `population` label
+   *   (`"result-event-sum"`, `"per-message-fallback"`, or — through
+   *   `#carriedDocumentStats()` — `"carried-document-summary"`).
+   * - `perTurn`: the rows built by `perMessageUsage` (or by `carriedPerTurn`
+   *   for pre-change documents).
+   * - `modelUsage`: `summary.modelUsage`, or `null` when absent.
+   * - `divergence`: the `computeDivergence` result when `summary.tokenUsage`
+   *   exists, otherwise `null`.
+   *
+   * In the result-event branch a cost, duration, or turn count missing from
+   * the summary still defaults to `0`; only the fallback branch reports them
+   * as `null`.
    * @returns {object}
    */
   stats() {
-    const { perTurn, totals: turnTotals } = perTurnUsage(this.turns);
-    const tokenTotals = this.summary.tokenUsage ?? turnTotals;
+    if (isPreChangeDoc(this.trace.version)) {
+      return this.#carriedDocumentStats();
+    }
+    const { perMessage, totals: perMessageTotals } = perMessageUsage(
+      this.turns,
+    );
+    const re = this.summary.tokenUsage;
+    if (re) {
+      return {
+        totals: {
+          inputTokens: re.inputTokens ?? 0,
+          outputTokens: re.outputTokens ?? 0,
+          cacheReadInputTokens: re.cacheReadInputTokens ?? 0,
+          cacheCreationInputTokens: re.cacheCreationInputTokens ?? 0,
+          totalCostUsd: this.summary.totalCostUsd ?? 0,
+          durationMs: this.summary.durationMs ?? 0,
+          durationLabel: "cumulative invocation time",
+          resultEventTurns: this.summary.numTurns ?? 0,
+          population: "result-event-sum",
+          resultEventsPresent: true,
+        },
+        perTurn: perMessage,
+        modelUsage: this.summary.modelUsage ?? null,
+        divergence: computeDivergence(perMessageTotals, re),
+      };
+    }
+    return {
+      totals: {
+        ...perMessageTotals,
+        outputIsStreamingSnapshot: true,
+        totalCostUsd: null,
+        durationMs: null,
+        resultEventTurns: null,
+        population: "per-message-fallback",
+        resultEventsPresent: false,
+      },
+      perTurn: perMessage,
+      modelUsage: this.summary.modelUsage ?? null,
+      divergence: null,
+    };
+  }
+  /**
+   * Stats for a pre-change structured document: report the carried last-wins
+   * summary and per-stream-event breakdown, each labeled, without claiming
+   * result-event parity (the document lacks the message identity it needs).
+   *
+   * Token counters are read from `summary.tokenUsage`, falling back to
+   * `ZERO_USAGE` when the summary has none; each counter, the cost, and the
+   * duration default to `0` when missing. `totals.population` is always
+   * `"carried-document-summary"` and `divergence` is always `null`.
+   * @returns {object} `{totals, perTurn, modelUsage, divergence}`, with
+   *   `perTurn` built by `carriedPerTurn(this.turns)`.
+   */
+  #carriedDocumentStats() {
+    const re = this.summary.tokenUsage ?? ZERO_USAGE;
     return {
       totals: {
-        ...tokenTotals,
+        inputTokens: re.inputTokens ?? 0,
+        outputTokens: re.outputTokens ?? 0,
+        cacheReadInputTokens: re.cacheReadInputTokens ?? 0,
+        cacheCreationInputTokens: re.cacheCreationInputTokens ?? 0,
         totalCostUsd: this.summary.totalCostUsd ?? 0,
         durationMs: this.summary.durationMs ?? 0,
+        population: "carried-document-summary",
       },
-      perTurn,
+      perTurn: carriedPerTurn(this.turns),
+      modelUsage: this.summary.modelUsage ?? null,
+      divergence: null,
     };
   }
-}
-/**
- * Sum per-turn assistant usage and build the per-turn breakdown rows.
- * @param {object[]} turns
- * @returns {{perTurn: object[], totals: object}}
- */
-function perTurnUsage(turns) {
-  const totals = {
-    inputTokens: 0,
-    outputTokens: 0,
-    cacheReadInputTokens: 0,
-    cacheCreationInputTokens: 0,
-  };
-  const perTurn = [];
+  /**
+   * One record per `tool_use` block, each paired with its `tool_result`
+   * (joined by `toolUseId`) or `result: null` for orphaned calls.
+   *
+   * Records come out in the insertion order of `collectToolUseBlocks`. When
+   * several `tool_result` turns share a `toolUseId`, the last one wins. A
+   * result turn with no `content` is reported as `content: null`, and one with
+   * no `isError` as `isError: false`; result turns without a `toolUseId` are
+   * ignored.
+   * @returns {Array<{turnIndex: number, name: string, toolUseId: string, input: object, result: {content: *, isError: boolean}|null}>}
+   */
+  toolCalls() {
+    const blocks = collectToolUseBlocks(this.turns);
+    const results = new Map();
+    for (const turn of this.turns) {
+      if (turn.role === "tool_result" && turn.toolUseId) {
+        results.set(turn.toolUseId, {
+          content: turn.content ?? null,
+          isError: turn.isError ?? false,
+        });
+      }
+    }
+    return [...blocks.entries()].map(([toolUseId, b]) => ({
+      turnIndex: b.turnIndex,
+      name: b.name,
+      toolUseId,
+      input: b.input,
+      result: results.get(toolUseId) ?? null,
+    }));
+  }
-  for (const turn of turns) {
-    if (turn.role !== "assistant" || !turn.usage) continue;
-    const row = {
-      index: turn.index,
-      inputTokens: turn.usage.inputTokens ?? 0,
-      outputTokens: turn.usage.outputTokens ?? 0,
-      cacheReadInputTokens: turn.usage.cacheReadInputTokens ?? 0,
-      cacheCreationInputTokens: turn.usage.cacheCreationInputTokens ?? 0,
-    };
-    totals.inputTokens += row.inputTokens;
-    totals.outputTokens += row.outputTokens;
-    totals.cacheReadInputTokens += row.cacheReadInputTokens;
-    totals.cacheCreationInputTokens += row.cacheCreationInputTokens;
-    perTurn.push(row);
+  /**
+   * One record per `Bash` `tool_use` block, carrying its command text.
+   *
+   * Blocks come from `collectToolUseBlocks(this.turns, "Bash")` and keep that
+   * order. A block with no `input.command` is treated as the empty string.
+   * `re` is compiled with `new RegExp(re)` before any block is visited, so an
+   * invalid pattern throws instead of returning a partial list; omitting `re`
+   * disables filtering.
+   * @param {string} [re] - Optional regex source tested against `input.command`.
+   * @returns {Array<{turnIndex: number, toolUseId: string, command: string}>}
+   */
+  commands(re) {
+    const filter = re === undefined ? null : new RegExp(re);
+    const out = [];
+    for (const [toolUseId, b] of collectToolUseBlocks(this.turns, "Bash")) {
+      const command = b.input?.command ?? "";
+      if (filter && !filter.test(command)) continue;
+      out.push({ turnIndex: b.turnIndex, toolUseId, command });
+    }
+    return out;
+  }
+  /**
+   * Distinct `file_path` arguments across `Read`/`Edit`/`Write` tool calls,
+   * frequency-sorted (count desc, path asc tiebreak).
+   *
+   * Counts come from `collectFilePaths(this.turns)`; the prefix filter is
+   * applied before sorting, and the path tiebreak uses `localeCompare`.
+   * @param {string} [prefix] - Optional `startsWith` filter.
+   * @returns {Array<{path: string, count: number}>}
+   */
+  paths(prefix) {
+    return [...collectFilePaths(this.turns).entries()]
+      .filter(([path]) => prefix === undefined || path.startsWith(prefix))
+      .map(([path, count]) => ({ path, count }))
+      .sort((a, b) => b.count - a.count || a.path.localeCompare(b.path));
+  }
+  /**
+   * Side-by-side comparison of this trace against another peer `TraceQuery`.
+   * Identity (case name, participant) comes from the caller — the trace
+   * carries no filename.
+   *
+   * `toolDelta` and `pathDelta` hold one row per name seen on either side,
+   * with a missing side counted as `0` and `diff` computed as `b - a`. Rows
+   * are ordered by absolute `diff` descending, then by name via
+   * `localeCompare`. The second argument may be omitted entirely; an omitted
+   * identity falls back to the default declared by `sideSummary`.
+   * @param {TraceQuery} other
+   * @param {{aIdentity: {caseName: string, participant: string|null}, bIdentity: {caseName: string, participant: string|null}}} identities
+   * @returns {{a: object, b: object, toolDelta: Array, pathDelta: Array}}
+   */
+  compare(other, { aIdentity, bIdentity } = {}) {
+    const a = sideSummary(this, aIdentity);
+    const b = sideSummary(other, bIdentity);
+    const toolNames = [
+      ...new Set([...a.toolFreq.keys(), ...b.toolFreq.keys()]),
+    ];
+    const toolDelta = toolNames
+      .map((tool) => {
+        const av = a.toolFreq.get(tool) ?? 0;
+        const bv = b.toolFreq.get(tool) ?? 0;
+        return { tool, a: av, b: bv, diff: bv - av };
+      })
+      .sort(
+        (x, y) =>
+          Math.abs(y.diff) - Math.abs(x.diff) || x.tool.localeCompare(y.tool),
+      );
+    const pathNames = [
+      ...new Set([...a.pathFreq.keys(), ...b.pathFreq.keys()]),
+    ];
+    const pathDelta = pathNames
+      .map((path) => {
+        const av = a.pathFreq.get(path) ?? 0;
+        const bv = b.pathFreq.get(path) ?? 0;
+        return { path, a: av, b: bv, diff: bv - av };
+      })
+      .sort(
+        (x, y) =>
+          Math.abs(y.diff) - Math.abs(x.diff) || x.path.localeCompare(y.path),
+      );
+    return { a: a.surface, b: b.surface, toolDelta, pathDelta };
+  }
+  /**
+   * Per-tool token attribution: each `tool_use` block gets an equal share of
+   * its host turn's usage; assistant turns with no `tool_use` block contribute
+   * full usage to the `(no-tool)` bucket. Per-bucket sums are scaled onto
+   * `stats().totals` — the authoritative population (result-event sums when the
+   * trace carries them, the per-message fallback otherwise) — so the buckets
+   * answer "of the reported total, what share did each tool drive" rather than
+   * a separate per-turn re-count that drifts from the headline figure. The
+   * largest bucket absorbs the rounding residual on each axis, so the input,
+   * output, and `costShare` columns each sum to the corresponding `totals`
+   * value (and `1.0`) exactly (criterion-6 invariant).
+   *
+   * NOTE(review): this method only wires the pieces together — bucketing is
+   * done by `bucketUsageByTool` and the scaling/residual handling by
+   * `reconcileBucketsToTotals`, both imported from `./trace-usage.js`. Verify
+   * the attribution rules above against those helpers.
+   * @returns {{perTool: Array<{tool: string, turns: number, inputTokens: number, outputTokens: number, costShare: number}>, totals: object}}
+   */
+  statsByTool() {
+    const { buckets, bucketTurns } = bucketUsageByTool(this.turns);
+    const totals = this.stats().totals;
+    const perTool = reconcileBucketsToTotals(buckets, bucketTurns, totals);
+    return { perTool, totals };
+  }
+  /**
+   * Totals-only view — `stats().totals` with no per-turn array.
+   *
+   * Calls `stats()` on every invocation and wraps its `totals` in a fresh
+   * object; `perTurn`, `modelUsage`, and `divergence` are dropped.
+   * @returns {{totals: object}}
+   */
+  statsSummary() {
+    return { totals: this.stats().totals };
   }
-  return { perTurn, totals };
 }
 /**
@@ -364,6 +536,31 @@ function matchesToolName(turn, toolName) {
   );
 }
+/**
+ * Collect every assistant `tool_use` block keyed by `toolUseId`, optionally
+ * filtered by tool name. The shared join-key source feeding `toolCalls()`,
+ * `commands()`, and `collectToolUseIds()`. Insertion order follows turn order.
+ *
+ * Only turns with `role === "assistant"` are scanned, and a block without a
+ * `toolUseId` is skipped. If a `toolUseId` repeats, the later block replaces
+ * the stored entry. Each entry records the host turn's `index` as `turnIndex`.
+ * NOTE(review): assumes every assistant turn carries an iterable `content` —
+ * confirm against the collector that produces the turns.
+ * @param {object[]} turns
+ * @param {string} [name] - Optional tool-name filter.
+ * @returns {Map<string, {turnIndex: number, name: string, input: object}>}
+ */
+function collectToolUseBlocks(turns, name) {
+  const blocks = new Map();
+  for (const turn of turns) {
+    if (turn.role !== "assistant") continue;
+    for (const b of turn.content) {
+      if (b.type !== "tool_use" || !b.toolUseId) continue;
+      if (name !== undefined && b.name !== name) continue;
+      blocks.set(b.toolUseId, {
+        turnIndex: turn.index,
+        name: b.name,
+        input: b.input,
+      });
+    }
+  }
+  return blocks;
+}
 /**
  * Collect all toolUseIds for a given tool name from assistant turns.
+ * Delegates to `collectToolUseBlocks(turns, name)` and returns its keys, so
+ * the same skip rules apply (assistant turns only, blocks need a `toolUseId`).
  * @param {object[]} turns
@@ -371,16 +568,68 @@ function matchesToolName(turn, toolName) {
  * @returns {Set<string>}
  */
 function collectToolUseIds(turns, name) {
-  const ids = new Set();
+  return new Set(collectToolUseBlocks(turns, name).keys());
+}
+/**
+ * Tool names (`Read`, `Edit`, `Write`) whose `tool_use` input is inspected for
+ * a `file_path` argument by `collectFilePaths`.
+ */
+const PATH_TOOLS = new Set(["Read", "Edit", "Write"]);
+/**
+ * Frequency map of distinct `file_path` arguments across `Read`/`Edit`/`Write`
+ * tool calls, in first-seen insertion order.
+ *
+ * Only `tool_use` blocks on assistant turns whose name is in `PATH_TOOLS` are
+ * counted. A block whose `input.file_path` is missing or not a string is
+ * skipped. Unlike `collectToolUseBlocks`, a block does not need a `toolUseId`
+ * to be counted, and every occurrence adds one to its path's count.
+ * @param {object[]} turns
+ * @returns {Map<string, number>}
+ */
+function collectFilePaths(turns) {
+  const counts = new Map();
   for (const turn of turns) {
     if (turn.role !== "assistant") continue;
-    for (const b of turn.content) {
-      if (b.type === "tool_use" && b.name === name && b.toolUseId) {
-        ids.add(b.toolUseId);
-      }
+    for (const block of turn.content) {
+      if (block.type !== "tool_use" || !PATH_TOOLS.has(block.name)) continue;
+      const p = block.input?.file_path;
+      if (typeof p !== "string") continue;
+      counts.set(p, (counts.get(p) ?? 0) + 1);
     }
   }
-  return ids;
+  return counts;
+}
+/**
+ * Build the per-side comparison surface plus the tool/path frequency maps
+ * the delta computation consumes. Empty traces emit a `(empty)` marker.
+ *
+ * "Empty" means `query.turns.length === 0`; the marker is stored as
+ * `surface.metadata.marker` and is absent otherwise. `surface.tools` and
+ * `surface.paths` are the map keys in default `sort()` order. `surface.cost`
+ * is `query.stats().totals.totalCostUsd`, which `stats()` reports as `null`
+ * for a trace without result events. An omitted `identity` defaults to case
+ * name `"(unknown)"` with a `null` participant.
+ * @param {TraceQuery} query
+ * @param {{caseName: string, participant: string|null}} [identity]
+ * @returns {{surface: object, toolFreq: Map<string, number>, pathFreq: Map<string, number>}}
+ */
+function sideSummary(
+  query,
+  identity = { caseName: "(unknown)", participant: null },
+) {
+  const toolFreq = new Map(query.toolFrequency().map((t) => [t.tool, t.count]));
+  const pathFreq = collectFilePaths(query.turns);
+  const isEmpty = query.turns.length === 0;
+  const metadata = {
+    caseName: identity.caseName,
+    participant: identity.participant ?? null,
+  };
+  if (isEmpty) metadata.marker = "(empty)";
+  const tools = [...toolFreq.keys()].sort();
+  const paths = [...pathFreq.keys()].sort();
+  return {
+    surface: {
+      metadata,
+      turnCount: query.turns.length,
+      tools,
+      paths,
+      pathCount: paths.length,
+      cost: query.stats().totals.totalCostUsd,
+    },
+    toolFreq,
+    pathFreq,
+  };
 }
 /**

package/src/trace-render.js ADDED Viewed

@@ -0,0 +1,211 @@
+/**
+ * Text renderers for `fit-trace` query output.
+ *
+ * One named export per renderable verb. Each renderer accepts the query result
+ * plus `{multi, signatures}` and returns a string. `multi` controls
+ * source-attribution prefixing (`grep -H` convention); record-per-line
+ * renderers prepend `<basename>:`, block renderers emit `# <basename>` headers.
+ *
+ * Internal module — imported by `commands/trace.js` and tests by relative
+ * path, never re-exported from `src/index.js`.
+ */
+/**
+ * Collapse a value to a single-line, grep-friendly string.
+ *
+ * Strings are used as-is; any other value is serialized with
+ * `JSON.stringify`, with `null` and `undefined` both serialized as `null`.
+ * Each run of carriage returns, newlines, or tabs is then replaced by one
+ * space and the result is trimmed.
+ *
+ * NOTE(review): `JSON.stringify` yields `undefined` for functions and
+ * symbols, which would make the `.replace` call throw — confirm callers only
+ * pass JSON-serializable values.
+ * @param {*} value
+ * @returns {string}
+ */
+function oneLine(value) {
+  const str = typeof value === "string" ? value : JSON.stringify(value ?? null);
+  return str.replace(/[\r\n\t]+/g, " ").trim();
+}
+/**
+ * Group records by their `source` field (multi-file path), preserving order.
+ *
+ * Groups appear in the order their source is first seen, and records keep
+ * their relative order inside a group. Records with a `null` or missing
+ * `source` are grouped together under the empty-string key.
+ * @param {object[]} records
+ * @returns {Map<string, object[]>}
+ */
+function groupBySource(records) {
+  const groups = new Map();
+  for (const record of records) {
+    const key = record.source ?? "";
+    if (!groups.has(key)) groups.set(key, []);
+    groups.get(key).push(record);
+  }
+  return groups;
+}
+/**
+ * Render record-per-line output, prefixing each line with `<source>:` when
+ * multi-file. `lineOf` maps one record to its text line.
+ *
+ * The prefix is added only when `multi` is truthy and the record has a truthy
+ * `source`; there is no space between the colon and the line text. Lines are
+ * joined with `\n` and no trailing newline is appended.
+ * @param {object[]} records
+ * @param {(record: object) => string} lineOf
+ * @param {{multi: boolean}} opts
+ * @returns {string}
+ */
+function renderLines(records, lineOf, { multi }) {
+  return records
+    .map((r) => (multi && r.source ? `${r.source}:${lineOf(r)}` : lineOf(r)))
+    .join("\n");
+}
+/**
+ * Render a block per source. `blockOf` maps one record to a multi-line string;
+ * multi-file output separates groups with `# <source>` headers.
+ *
+ * Without `multi`, blocks are joined with `\n` in input order and no header is
+ * written. With `multi`, records are regrouped by `groupBySource`, so records
+ * of one source are emitted together even if they were not adjacent in the
+ * input; records lacking a `source` land under a bare `# ` header.
+ * @param {object[]} records
+ * @param {(record: object) => string} blockOf
+ * @param {{multi: boolean}} opts
+ * @returns {string}
+ */
+function renderBlocks(records, blockOf, { multi }) {
+  if (!multi) return records.map(blockOf).join("\n");
+  const out = [];
+  for (const [source, group] of groupBySource(records)) {
+    out.push(`# ${source}`);
+    out.push(...group.map(blockOf));
+  }
+  return out.join("\n");
+}
+/**
+ * Render tool-call records as three-line blocks:
+ * `[turnIdx] <Tool> <toolUseId>` / `  in:` / `  out:`.
+ *
+ * `in:` and `out:` values are flattened with `oneLine`. A record with no
+ * `result` prints `(no result)`; the result's `isError` flag is not rendered.
+ * Multi-file grouping follows `renderBlocks`.
+ * @param {object[]} records - Records shaped like `TraceQuery#toolCalls()` output.
+ * @param {{multi?: boolean}} [opts]
+ * @returns {string}
+ */
+export function renderToolCalls(records, opts = {}) {
+  return renderBlocks(
+    records,
+    (r) => {
+      const head = `[${r.turnIndex}] ${r.name} ${r.toolUseId}`;
+      const input = `  in: ${oneLine(r.input)}`;
+      const out = `  out: ${
+        r.result ? oneLine(r.result.content) : "(no result)"
+      }`;
+      return [head, input, out].join("\n");
+    },
+    opts,
+  );
+}
+/**
+ * Render command records as `[turnIdx] <command>`, one per line.
+ *
+ * The command text goes through `oneLine`, so embedded newlines and tabs are
+ * collapsed to single spaces (not escaped). Multi-file prefixing follows
+ * `renderLines`.
+ * @param {object[]} records - Records with `turnIndex` and `command`.
+ * @param {{multi?: boolean}} [opts]
+ * @returns {string}
+ */
+export function renderCommands(records, opts = {}) {
+  return renderLines(
+    records,
+    (r) => `[${r.turnIndex}] ${oneLine(r.command)}`,
+    opts,
+  );
+}
+/**
+ * Render path records as `<count>\t<path>`, one per line.
+ *
+ * Records are emitted in the order given — this renderer does not sort, so
+ * frequency ordering must come from the caller. Multi-file prefixing follows
+ * `renderLines`.
+ * @param {Array<{path: string, count: number}>} records
+ * @param {{multi?: boolean}} [opts]
+ * @returns {string}
+ */
+export function renderPaths(records, opts = {}) {
+  return renderLines(records, (r) => `${r.count}\t${r.path}`, opts);
+}
+/**
+ * Metadata header, per-row metrics, then Tool and Path delta tables.
+ *
+ * Output layout, with sections separated by a blank line:
+ * 1. `A:` and `B:` lines — `<caseName> / <participant>`, where a `null` or
+ *    undefined participant prints `(none)` and a `marker`, if set, is
+ *    appended after a space.
+ * 2. `turns`, `tools`, `paths`, `cost` rows as `label | A | B`. `tools` prints
+ *    the length of each side's `tools` array; `cost` is interpolated as-is.
+ * 3. `Tool | A | B | Δ` header followed by one row per `toolDelta` entry.
+ * 4. `Path | A | B | Δ` header followed by one row per `pathDelta` entry.
+ * Delta rows are printed in the order supplied; nothing is sorted here.
+ * @param {{a: object, b: object, toolDelta: object[], pathDelta: object[]}} result
+ * @returns {string}
+ */
+export function renderCompare(result) {
+  const { a, b, toolDelta, pathDelta } = result;
+  const part = (p) => (p == null ? "(none)" : p);
+  const lines = [];
+  lines.push(
+    `A: ${a.metadata.caseName} / ${part(a.metadata.participant)}${
+      a.metadata.marker ? ` ${a.metadata.marker}` : ""
+    }`,
+  );
+  lines.push(
+    `B: ${b.metadata.caseName} / ${part(b.metadata.participant)}${
+      b.metadata.marker ? ` ${b.metadata.marker}` : ""
+    }`,
+  );
+  lines.push("");
+  lines.push(`turns    | ${a.turnCount} | ${b.turnCount}`);
+  lines.push(`tools    | ${a.tools.length} | ${b.tools.length}`);
+  lines.push(`paths    | ${a.pathCount} | ${b.pathCount}`);
+  lines.push(`cost     | ${a.cost} | ${b.cost}`);
+  lines.push("");
+  lines.push("Tool | A | B | Δ");
+  for (const d of toolDelta) {
+    lines.push(`${d.tool} | ${d.a} | ${d.b} | ${d.diff}`);
+  }
+  lines.push("");
+  lines.push("Path | A | B | Δ");
+  for (const d of pathDelta) {
+    lines.push(`${d.path} | ${d.a} | ${d.b} | ${d.diff}`);
+  }
+  return lines.join("\n");
+}
+/**
+ * Render per-tool attribution as a `Tool | Turns | In | Out | Share` table.
+ *
+ * Rows follow the order of `result.perTool`; this renderer does not sort, so
+ * a share-descending table relies on the caller supplying it that way. Token
+ * counts are rounded with `Math.round` and `costShare` is printed with four
+ * decimal places. Only `result.perTool` is read.
+ * @param {{perTool: Array<{tool: string, turns: number, inputTokens: number, outputTokens: number, costShare: number}>}} result
+ * @returns {string}
+ */
+export function renderStatsByTool(result) {
+  const lines = ["Tool | Turns | In | Out | Share"];
+  for (const b of result.perTool) {
+    lines.push(
+      `${b.tool} | ${b.turns} | ${Math.round(b.inputTokens)} | ${Math.round(
+        b.outputTokens,
+      )} | ${b.costShare.toFixed(4)}`,
+    );
+  }
+  return lines.join("\n");
+}
+/**
+ * Render the totals block only, as six `key: value` lines.
+ *
+ * Prints the four token counters, `totalCostUsd`, and `durationMs` from
+ * `result.totals`, in that order. Values are interpolated as-is, so a `null`
+ * cost or duration prints the text `null`. Other `totals` fields (for example
+ * `population`) are not rendered.
+ * @param {{totals: object}} result
+ * @returns {string}
+ */
+export function renderStatsSummary(result) {
+  const t = result.totals;
+  return [
+    `inputTokens: ${t.inputTokens}`,
+    `outputTokens: ${t.outputTokens}`,
+    `cacheReadInputTokens: ${t.cacheReadInputTokens}`,
+    `cacheCreationInputTokens: ${t.cacheCreationInputTokens}`,
+    `totalCostUsd: ${t.totalCostUsd}`,
+    `durationMs: ${t.durationMs}`,
+  ].join("\n");
+}
+/**
+ * Render search hits as `[turnIdx] <excerpt>`, one line per match.
+ *
+ * Each entry of `hit.matches` becomes its own line, flattened with `oneLine`;
+ * a hit with no `matches` emits nothing. In multi-file mode the line is
+ * prefixed with `<source>:` (see `multiPrefix`), directly before the bracket.
+ * The index is read from `hit.turn?.index`.
+ * @param {object[]} records - Hits with `turn`, `matches`, and optional `source`.
+ * @param {{multi?: boolean}} [opts]
+ * @returns {string}
+ */
+export function renderSearch(records, opts = {}) {
+  const lines = [];
+  for (const hit of records) {
+    const idx = hit.turn?.index;
+    const prefix = multiPrefix(hit, opts);
+    for (const match of hit.matches ?? []) {
+      lines.push(`${prefix}[${idx}] ${oneLine(match)}`);
+    }
+  }
+  return lines.join("\n");
+}
+/**
+ * Source prefix for a multi-file record (search/default), or "".
+ *
+ * Returns `<source>:` only when `multi` is truthy and the record has a truthy
+ * `source`; otherwise the empty string.
+ * @param {object} record
+ * @param {{multi?: boolean}} opts
+ * @returns {string}
+ */
+function multiPrefix(record, { multi }) {
+  return multi && record.source ? `${record.source}:` : "";
+}
+/**
+ * Default renderer for every other renderable verb: one record per block,
+ * fields rendered as `key: value` lines (no JSON braces or quotes, so the
+ * default output is grep/awk-friendly and does not parse as JSON). Nested
+ * values are collapsed to a single grep-friendly line. Multi-file output
+ * separates source groups with `# <source>` headers (`renderBlocks`
+ * convention).
+ *
+ * A non-array `result` is treated as a single record. Grouping uses each
+ * record's `source` as passed in; the field is then removed by `stripSource`
+ * before the record is rendered, so it never appears as a `source:` line.
+ * @param {object[]|object} result
+ * @param {{multi: boolean}} opts
+ * @returns {string}
+ */
+export function renderDefault(result, opts = {}) {
+  const records = Array.isArray(result) ? result : [result];
+  return renderBlocks(records, (r) => recordBlock(stripSource(r)), opts);
+}
+/**
+ * Render one record as `key: value` lines. Scalars render verbatim; objects
+ * and arrays collapse to a single line via `oneLine`. A non-object record
+ * (string/number) renders as its own single line.
+ *
+ * A `null`, `undefined`, or array record is also passed whole to `oneLine`.
+ * Field values that are `null` or `undefined` are printed through `String()`,
+ * i.e. as the text `null` or `undefined`. Scalar string values are not
+ * flattened, so a string field containing newlines spans several lines.
+ * @param {*} record
+ * @returns {string}
+ */
+function recordBlock(record) {
+  if (record == null || typeof record !== "object" || Array.isArray(record)) {
+    return oneLine(record);
+  }
+  return Object.entries(record)
+    .map(([key, value]) => {
+      const scalar = value == null || typeof value !== "object";
+      return `${key}: ${scalar ? String(value) : oneLine(value)}`;
+    })
+    .join("\n");
+}
+/**
+ * Drop the orchestrator-injected `source` field before textifying.
+ *
+ * Plain-object records come back as a shallow copy without `source`; the
+ * input is not mutated. `null`, `undefined`, non-objects, and arrays are
+ * returned unchanged.
+ * @param {*} record
+ * @returns {*}
+ */
+function stripSource(record) {
+  if (record == null || typeof record !== "object" || Array.isArray(record)) {
+    return record;
+  }
+  const { source, ...rest } = record;
+  return rest;
+}