npm - @slowdini/slow-powers-opencode - Versions diffs - 0.2.0 → 0.3.0 - Mend

@slowdini/slow-powers-opencode 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts CHANGED Viewed

@@ -6,6 +6,7 @@ import {
   findByDescription,
   listSubagents,
   parseTranscript,
+  parseTranscriptFull,
 } from "./claude-code-transcript";
 const FIXTURE_ROOT = join(tmpdir(), `claude-code-adapter-test-${process.pid}`);
@@ -193,6 +194,227 @@ describe("parseTranscript", () => {
   });
 });
+describe("parseTranscriptFull", () => {
+  const usage = (output: number) => ({
+    input_tokens: 100,
+    cache_creation_input_tokens: 50,
+    cache_read_input_tokens: 200,
+    output_tokens: output,
+  });
+  test("sums usage across unique message ids, deduping repeated ids", () => {
+    // One API response spans multiple jsonl lines (one per content block) and
+    // repeats the same message.id + usage on each — it must be counted once.
+    const path = join(FIXTURE_ROOT, "full-dedup.jsonl");
+    writeFileSync(
+      path,
+      jsonl([
+        {
+          type: "user",
+          timestamp: "2026-06-04T10:00:00.000Z",
+          message: { role: "user", content: "go" },
+        },
+        {
+          type: "assistant",
+          timestamp: "2026-06-04T10:00:05.000Z",
+          message: {
+            id: "msg_aaa",
+            role: "assistant",
+            usage: usage(10),
+            content: [{ type: "text", text: "first block" }],
+          },
+        },
+        {
+          type: "assistant",
+          timestamp: "2026-06-04T10:00:06.000Z",
+          message: {
+            id: "msg_aaa",
+            role: "assistant",
+            usage: usage(10),
+            content: [
+              {
+                type: "tool_use",
+                id: "toolu_1",
+                name: "Bash",
+                input: { command: "ls" },
+              },
+            ],
+          },
+        },
+        {
+          type: "assistant",
+          timestamp: "2026-06-04T10:01:00.000Z",
+          message: {
+            id: "msg_bbb",
+            role: "assistant",
+            usage: usage(40),
+            content: [{ type: "text", text: "done" }],
+          },
+        },
+      ]),
+    );
+    const full = parseTranscriptFull(path);
+    // msg_aaa counted once (100+50+200+10) + msg_bbb (100+50+200+40) = 750
+    expect(full.total_tokens).toBe(750);
+  });
+  test("returns null total_tokens when no usage objects present", () => {
+    const path = join(FIXTURE_ROOT, "full-no-usage.jsonl");
+    writeFileSync(
+      path,
+      jsonl([
+        {
+          type: "assistant",
+          message: {
+            role: "assistant",
+            content: [{ type: "text", text: "hi" }],
+          },
+        },
+      ]),
+    );
+    expect(parseTranscriptFull(path).total_tokens).toBeNull();
+  });
+  test("derives duration_ms from first and last line timestamps", () => {
+    const path = join(FIXTURE_ROOT, "full-duration.jsonl");
+    writeFileSync(
+      path,
+      jsonl([
+        {
+          type: "user",
+          timestamp: "2026-06-04T10:00:00.000Z",
+          message: { role: "user", content: "go" },
+        },
+        {
+          type: "assistant",
+          timestamp: "2026-06-04T10:02:30.500Z",
+          message: {
+            id: "msg_x",
+            role: "assistant",
+            content: [{ type: "text", text: "done" }],
+          },
+        },
+      ]),
+    );
+    expect(parseTranscriptFull(path).duration_ms).toBe(150_500);
+  });
+  test("returns null duration_ms with fewer than two timestamps", () => {
+    const path = join(FIXTURE_ROOT, "full-one-ts.jsonl");
+    writeFileSync(
+      path,
+      jsonl([
+        {
+          type: "assistant",
+          timestamp: "2026-06-04T10:00:00.000Z",
+          message: { role: "assistant", content: [] },
+        },
+        { type: "assistant", message: { role: "assistant", content: [] } },
+      ]),
+    );
+    expect(parseTranscriptFull(path).duration_ms).toBeNull();
+  });
+  test("final_text is the concatenated text of the last assistant message", () => {
+    const path = join(FIXTURE_ROOT, "full-final-text.jsonl");
+    writeFileSync(
+      path,
+      jsonl([
+        {
+          type: "assistant",
+          message: {
+            id: "msg_1",
+            role: "assistant",
+            content: [{ type: "text", text: "intermediate" }],
+          },
+        },
+        {
+          type: "assistant",
+          message: {
+            id: "msg_2",
+            role: "assistant",
+            content: [
+              { type: "text", text: "All tests pass." },
+              {
+                type: "tool_use",
+                id: "toolu_z",
+                name: "Bash",
+                input: { command: "true" },
+              },
+              { type: "text", text: "Wrapping up." },
+            ],
+          },
+        },
+        {
+          type: "user",
+          message: {
+            role: "user",
+            content: [
+              { type: "tool_result", tool_use_id: "toolu_z", content: "ok" },
+            ],
+          },
+        },
+      ]),
+    );
+    expect(parseTranscriptFull(path).final_text).toBe(
+      "All tests pass.\nWrapping up.",
+    );
+  });
+  test("final_text is null when no assistant text exists", () => {
+    const path = join(FIXTURE_ROOT, "full-no-text.jsonl");
+    writeFileSync(
+      path,
+      jsonl([{ type: "user", message: { role: "user", content: "hi" } }]),
+    );
+    expect(parseTranscriptFull(path).final_text).toBeNull();
+  });
+  test("tool_invocations matches parseTranscript output", () => {
+    const path = join(FIXTURE_ROOT, "full-invocations.jsonl");
+    writeFileSync(
+      path,
+      jsonl([
+        {
+          type: "assistant",
+          timestamp: "2026-06-04T10:00:00.000Z",
+          message: {
+            id: "msg_1",
+            role: "assistant",
+            usage: usage(5),
+            content: [
+              {
+                type: "tool_use",
+                id: "toolu_q",
+                name: "Read",
+                input: { file_path: "/tmp/a" },
+              },
+            ],
+          },
+        },
+        {
+          type: "user",
+          timestamp: "2026-06-04T10:00:02.000Z",
+          message: {
+            role: "user",
+            content: [
+              {
+                type: "tool_result",
+                tool_use_id: "toolu_q",
+                content: "contents",
+              },
+            ],
+          },
+        },
+      ]),
+    );
+    expect(parseTranscriptFull(path).tool_invocations).toEqual(
+      parseTranscript(path),
+    );
+  });
+});
 describe("listSubagents / findByDescription", () => {
   test("matches subagents by meta description", () => {
     const dir = join(FIXTURE_ROOT, "subagents");

package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts CHANGED Viewed

@@ -15,12 +15,31 @@ type ToolResultBlock = {
   content: string | unknown[];
 };
-type ContentBlock = ToolUseBlock | ToolResultBlock | { type: string };
+type TextBlock = {
+  type: "text"; // literal tag; parseTranscriptFull's text filter narrows on it
+  text: string; // the text that parseTranscriptFull collects into final_text
+};
+type ContentBlock =
+  | ToolUseBlock
+  | ToolResultBlock
+  | TextBlock
+  | { type: string }; // catch-all for block kinds that have no dedicated shape here
+type UsageRecord = { // every counter is optional; parseTranscriptFull counts a missing one as 0
+  input_tokens?: number;
+  output_tokens?: number;
+  cache_creation_input_tokens?: number;
+  cache_read_input_tokens?: number;
+};
 type TranscriptRecord = {
   type: "user" | "assistant" | string;
+  timestamp?: string;
   message?: {
+    id?: string;
     role?: string;
+    usage?: UsageRecord;
     content?: string | ContentBlock[];
   };
 };
@@ -47,21 +66,25 @@ function stringifyResult(content: ToolResultBlock["content"]): string {
   return JSON.stringify(content);
 }
-export function parseTranscript(jsonlPath: string): ToolInvocation[] {
+function readRecords(jsonlPath: string): TranscriptRecord[] {
   const raw = readFileSync(jsonlPath, "utf8");
-  const lines = raw.split("\n").filter((l) => l.length > 0);
-  const invocations: ToolInvocation[] = [];
-  const indexById = new Map<string, number>();
-  for (const line of lines) {
-    let record: TranscriptRecord;
+  const records: TranscriptRecord[] = [];
+  for (const line of raw.split("\n")) {
+    if (line.length === 0) continue;
     try {
-      record = JSON.parse(line) as TranscriptRecord;
+      records.push(JSON.parse(line) as TranscriptRecord);
     } catch {
-      continue;
+      // skip malformed lines
     }
+  }
+  return records;
+}
+function extractInvocations(records: TranscriptRecord[]): ToolInvocation[] {
+  const invocations: ToolInvocation[] = [];
+  const indexById = new Map<string, number>();
+  for (const record of records) {
     const blocks = flattenContent(record.message?.content);
     if (record.type === "assistant") {
@@ -93,6 +116,79 @@ export function parseTranscript(jsonlPath: string): ToolInvocation[] {
   return invocations;
 }
+/**
+ * Read the transcript at `jsonlPath` and return the tool invocations found
+ * in it. Lines that are empty or not valid JSON are dropped by `readRecords`
+ * before extraction.
+ */
+export function parseTranscript(jsonlPath: string): ToolInvocation[] {
+  const records = readRecords(jsonlPath);
+  return extractInvocations(records);
+}
+export type TranscriptSummary = {
+  tool_invocations: ToolInvocation[]; // built by the same extraction parseTranscript uses
+  /**
+   * Sum of usage across unique API responses. One response spans multiple
+   * jsonl lines (one per content block) and repeats the same `message.id` +
+   * `usage` on each, so totals are deduped by `message.id`. Includes cache
+   * creation/read tokens — a different accounting than the harness's task
+   * completion event. `null` when no assistant record carries both an `id`
+   * and a `usage` object.
+   */
+  total_tokens: number | null;
+  /**
+   * Wall clock between the first and last line timestamps, taken in file
+   * order. `null` when fewer than two records have a parseable `timestamp`.
+   */
+  duration_ms: number | null;
+  /**
+   * Concatenated text blocks of the last assistant message, joined with
+   * newlines. `null` when no assistant record contains a text block.
+   */
+  final_text: string | null;
+};
+/**
+ * Parse a transcript once and summarize it.
+ *
+ * Besides the tool invocations, this derives the token total (deduped by
+ * `message.id`), the wall-clock span between the first and last parseable
+ * timestamps, and the text of the last assistant message that produced text.
+ *
+ * @param jsonlPath - Path to the transcript `.jsonl` file.
+ * @returns The summary; each derived field is `null` when the transcript
+ *   carries no data for it.
+ */
+export function parseTranscriptFull(jsonlPath: string): TranscriptSummary {
+  const records = readRecords(jsonlPath);
+  const usageById = new Map<string, UsageRecord>();
+  let firstTs: number | null = null;
+  let lastTs: number | null = null;
+  let timestampCount = 0;
+  // Text blocks of the most recent assistant message that produced text, plus
+  // that message's id. One API response is written as several jsonl lines
+  // (one per content block) sharing a `message.id`, so text from consecutive
+  // lines of the same message must be accumulated rather than overwritten.
+  let finalTexts: string[] = [];
+  let finalTextId: string | undefined;
+  for (const record of records) {
+    if (record.timestamp) {
+      const ts = Date.parse(record.timestamp);
+      // Unparseable timestamps are ignored instead of poisoning the duration.
+      if (!Number.isNaN(ts)) {
+        if (firstTs === null) firstTs = ts;
+        lastTs = ts;
+        timestampCount++;
+      }
+    }
+    if (record.type !== "assistant") continue;
+    const { id, usage } = record.message ?? {};
+    // Keyed by id so a response repeated across lines is counted once.
+    if (id && usage) usageById.set(id, usage);
+    const texts = flattenContent(record.message?.content)
+      .filter((b): b is TextBlock => b.type === "text")
+      .map((b) => b.text);
+    if (texts.length === 0) continue;
+    if (id && id === finalTextId) {
+      // Another line of the message already being collected.
+      finalTexts.push(...texts);
+    } else {
+      // A different (or id-less) message: start over with its text.
+      finalTexts = texts;
+      finalTextId = id;
+    }
+  }
+  let totalTokens: number | null = null;
+  if (usageById.size > 0) {
+    totalTokens = 0;
+    for (const usage of usageById.values()) {
+      totalTokens +=
+        (usage.input_tokens ?? 0) +
+        (usage.output_tokens ?? 0) +
+        (usage.cache_creation_input_tokens ?? 0) +
+        (usage.cache_read_input_tokens ?? 0);
+    }
+  }
+  return {
+    tool_invocations: extractInvocations(records),
+    total_tokens: totalTokens,
+    // A single timestamp gives no interval, so at least two are required.
+    duration_ms:
+      timestampCount >= 2 && firstTs !== null && lastTs !== null
+        ? lastTs - firstTs
+        : null,
+    final_text: finalTexts.length > 0 ? finalTexts.join("\n") : null,
+  };
+}
 export type SubagentMeta = {
   agentType?: string;
   description?: string;

package/skills/evaluating-skills/runner/aggregate.test.ts CHANGED Viewed

@@ -186,6 +186,226 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
     ).toBe(true);
   });
+  test("surfaces live-source reads as validity_warnings", () => {
+    const root = join(FIXTURE_ROOT, "agg-live-reads");
+    const skillDir = join(root, "skill-dir");
+    const skillSub = join(skillDir, "mr-review");
+    mkdirSync(skillSub, { recursive: true });
+    writeFileSync(
+      join(skillSub, "SKILL.md"),
+      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    );
+    const cwd = join(root, "work");
+    const iterationDir = join(
+      cwd,
+      "skills-workspace",
+      "mr-review",
+      "iteration-1",
+    );
+    mkdirSync(iterationDir, { recursive: true });
+    writeJson(join(iterationDir, "conditions.json"), {
+      mode: "revision",
+      conditions: [
+        { name: "old_skill", skill_path: join(skillSub, "SKILL.md") },
+        { name: "new_skill", skill_path: join(skillSub, "SKILL.md") },
+      ],
+      timestamp: new Date().toISOString(),
+      harness: "claude-code",
+    });
+    for (const cond of ["old_skill", "new_skill"]) {
+      const condDir = join(iterationDir, "eval-e1", cond);
+      mkdirSync(condDir, { recursive: true });
+      writeJson(join(condDir, "grading.json"), {
+        assertion_results: [],
+        summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
+      });
+      writeJson(join(condDir, "timing.json"), {
+        total_tokens: 100,
+        duration_ms: 1,
+      });
+    }
+    writeJson(join(iterationDir, "stray-writes.json"), {
+      generated: new Date().toISOString(),
+      iteration: 1,
+      totals: { violations: 0, warnings: 0, live_source_reads: 1 },
+      runs: [
+        {
+          eval_id: "e1",
+          condition: "old_skill",
+          violations: [],
+          warnings: [],
+          live_source_reads: [
+            {
+              tool: "Read",
+              path: join(skillSub, "SKILL.md"),
+              ordinal: 0,
+              reason: "x",
+            },
+          ],
+        },
+      ],
+    });
+    const res = Bun.spawnSync(
+      [
+        "bun",
+        "run",
+        AGGREGATE_TS,
+        "--skill-dir",
+        skillDir,
+        "--skill",
+        "mr-review",
+        "--iteration",
+        "1",
+      ],
+      { cwd, stdout: "pipe", stderr: "pipe" },
+    );
+    expect(res.exitCode).toBe(0);
+    const benchmark = JSON.parse(
+      readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
+    ) as { validity_warnings: string[] };
+    expect(
+      benchmark.validity_warnings.some(
+        (w) => w.includes("e1/old_skill") && /live skill source/i.test(w),
+      ),
+    ).toBe(true);
+  });
+  test("warns when timing sources are mixed across the compared runs", () => {
+    const root = join(FIXTURE_ROOT, "agg-mixed-timing");
+    const skillDir = join(root, "skill-dir");
+    const skillSub = join(skillDir, "mr-review");
+    mkdirSync(skillSub, { recursive: true });
+    writeFileSync(
+      join(skillSub, "SKILL.md"),
+      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    );
+    const cwd = join(root, "work");
+    const iterationDir = join(
+      cwd,
+      "skills-workspace",
+      "mr-review",
+      "iteration-1",
+    );
+    mkdirSync(iterationDir, { recursive: true });
+    writeJson(join(iterationDir, "conditions.json"), {
+      mode: "new-skill",
+      conditions: [
+        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
+        { name: "without_skill", skill_path: null },
+      ],
+      timestamp: new Date().toISOString(),
+      harness: "claude-code",
+    });
+    // One arm has agent-captured completion-event timing (no source field, the
+    // pre-provenance shape); the other was backfilled from the transcript.
+    const mkCond = (cond: string, timing: unknown) => {
+      const condDir = join(iterationDir, "eval-e1", cond);
+      mkdirSync(condDir, { recursive: true });
+      writeJson(join(condDir, "grading.json"), {
+        assertion_results: [],
+        summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
+      });
+      writeJson(join(condDir, "timing.json"), timing);
+    };
+    mkCond("with_skill", { total_tokens: 5000, duration_ms: 1000 });
+    mkCond("without_skill", {
+      total_tokens: 90000,
+      duration_ms: 1200,
+      source: "transcript",
+    });
+    const res = Bun.spawnSync(
+      [
+        "bun",
+        "run",
+        AGGREGATE_TS,
+        "--skill-dir",
+        skillDir,
+        "--skill",
+        "mr-review",
+        "--iteration",
+        "1",
+      ],
+      { cwd, stdout: "pipe", stderr: "pipe" },
+    );
+    expect(res.exitCode).toBe(0);
+    const benchmark = JSON.parse(
+      readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
+    ) as { validity_warnings: string[] };
+    expect(
+      benchmark.validity_warnings.some(
+        (w) => w.includes("timing source") && w.includes("transcript"),
+      ),
+    ).toBe(true);
+  });
+  test("does not warn when all timing comes from one source", () => {
+    const root = join(FIXTURE_ROOT, "agg-same-timing");
+    const skillDir = join(root, "skill-dir");
+    const skillSub = join(skillDir, "mr-review");
+    mkdirSync(skillSub, { recursive: true });
+    writeFileSync(
+      join(skillSub, "SKILL.md"),
+      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    );
+    const cwd = join(root, "work");
+    const iterationDir = join(
+      cwd,
+      "skills-workspace",
+      "mr-review",
+      "iteration-1",
+    );
+    mkdirSync(iterationDir, { recursive: true });
+    writeJson(join(iterationDir, "conditions.json"), {
+      mode: "new-skill",
+      conditions: [
+        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
+        { name: "without_skill", skill_path: null },
+      ],
+      timestamp: new Date().toISOString(),
+      harness: "claude-code",
+    });
+    for (const cond of ["with_skill", "without_skill"]) {
+      const condDir = join(iterationDir, "eval-e1", cond);
+      mkdirSync(condDir, { recursive: true });
+      writeJson(join(condDir, "grading.json"), {
+        assertion_results: [],
+        summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
+      });
+      writeJson(join(condDir, "timing.json"), {
+        total_tokens: 100,
+        duration_ms: 1,
+        source: "transcript",
+      });
+    }
+    const res = Bun.spawnSync(
+      [
+        "bun",
+        "run",
+        AGGREGATE_TS,
+        "--skill-dir",
+        skillDir,
+        "--skill",
+        "mr-review",
+        "--iteration",
+        "1",
+      ],
+      { cwd, stdout: "pipe", stderr: "pipe" },
+    );
+    expect(res.exitCode).toBe(0);
+    const benchmark = JSON.parse(
+      readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
+    ) as { validity_warnings: string[] };
+    expect(
+      benchmark.validity_warnings.some((w) => w.includes("timing source")),
+    ).toBe(false);
+  });
   test("surfaces plugin-shadow findings as validity_warnings", () => {
     const root = join(FIXTURE_ROOT, "agg-shadow");
     const skillDir = join(root, "skill-dir");

package/skills/evaluating-skills/runner/aggregate.ts CHANGED Viewed

@@ -94,6 +94,11 @@ for (const c of conditions.conditions) {
 }
 let missingGradings = 0;
+// Timing provenance across all runs in the comparison. "completion-event"
+// (the agent-captured default, also assumed when `source` is absent) and
+// "transcript" (record-runs backfill, includes cache accounting) measure
+// different things — a delta mixing them is comparing two metrics.
+const timingSources = new Set<string>();
 for (const evalDir of evalDirs) {
   for (const cond of conditionNames) {
     const condDir = join(iterationDir, evalDir, cond);
@@ -116,6 +121,11 @@ for (const evalDir of evalDirs) {
         byCondition[cond].tokens.push(timing.total_tokens);
       if (typeof timing.duration_ms === "number")
         byCondition[cond].durations.push(timing.duration_ms);
+      if (
+        typeof timing.total_tokens === "number" ||
+        typeof timing.duration_ms === "number"
+      )
+        timingSources.add(timing.source ?? "completion-event");
     }
   }
 }
@@ -168,6 +178,11 @@ const delta = {
 };
 const validityWarnings: string[] = [];
+if (timingSources.size > 1) {
+  validityWarnings.push(
+    `runs mix timing sources (${[...timingSources].sort().join(", ")}) — transcript-derived totals include cache accounting, so the token/duration delta compares two different metrics. Re-record one side or read the delta as a rough signal only.`,
+  );
+}
 for (const cond of conditionNames) {
   const s = runSummary[cond];
   if (s.skill_invocation_rate != null && s.skill_invocation_rate < 1) {
@@ -188,6 +203,7 @@ if (existsSync(strayPath)) {
         eval_id: string;
         condition: string;
         violations?: unknown[];
+        live_source_reads?: unknown[];
       }>;
     };
     for (const r of stray.runs ?? []) {
@@ -196,6 +212,11 @@ if (existsSync(strayPath)) {
         validityWarnings.push(
           `${r.eval_id}/${r.condition} wrote ${n} file(s) outside its outputs dir — data point may be tainted (see stray-writes.json).`,
         );
+      const reads = r.live_source_reads?.length ?? 0;
+      if (reads > 0)
+        validityWarnings.push(
+          `${r.eval_id}/${r.condition} read the live skill source ${reads} time(s) instead of its staged copy — the arm may be contaminated (staged-slug resolution race; see stray-writes.json).`,
+        );
     }
   } catch {
     // ignore a malformed report rather than failing aggregation