npm - @slowdini/slow-powers-opencode - Versions diffs - 0.1.5 → 0.3.0 - Mend

@slowdini/slow-powers-opencode 0.1.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

package/skills/evaluating-skills/runner/aggregate.test.ts CHANGED Viewed

@@ -186,6 +186,226 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
     ).toBe(true);
   });
+  test("surfaces live-source reads as validity_warnings", () => {
+    const root = join(FIXTURE_ROOT, "agg-live-reads");
+    const skillDir = join(root, "skill-dir");
+    const skillSub = join(skillDir, "mr-review");
+    mkdirSync(skillSub, { recursive: true });
+    writeFileSync(
+      join(skillSub, "SKILL.md"),
+      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    );
+    const cwd = join(root, "work");
+    const iterationDir = join(
+      cwd,
+      "skills-workspace",
+      "mr-review",
+      "iteration-1",
+    );
+    mkdirSync(iterationDir, { recursive: true });
+    writeJson(join(iterationDir, "conditions.json"), {
+      mode: "revision",
+      conditions: [
+        { name: "old_skill", skill_path: join(skillSub, "SKILL.md") },
+        { name: "new_skill", skill_path: join(skillSub, "SKILL.md") },
+      ],
+      timestamp: new Date().toISOString(),
+      harness: "claude-code",
+    });
+    for (const cond of ["old_skill", "new_skill"]) {
+      const condDir = join(iterationDir, "eval-e1", cond);
+      mkdirSync(condDir, { recursive: true });
+      writeJson(join(condDir, "grading.json"), {
+        assertion_results: [],
+        summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
+      });
+      writeJson(join(condDir, "timing.json"), {
+        total_tokens: 100,
+        duration_ms: 1,
+      });
+    }
+    writeJson(join(iterationDir, "stray-writes.json"), {
+      generated: new Date().toISOString(),
+      iteration: 1,
+      totals: { violations: 0, warnings: 0, live_source_reads: 1 },
+      runs: [
+        {
+          eval_id: "e1",
+          condition: "old_skill",
+          violations: [],
+          warnings: [],
+          live_source_reads: [
+            {
+              tool: "Read",
+              path: join(skillSub, "SKILL.md"),
+              ordinal: 0,
+              reason: "x",
+            },
+          ],
+        },
+      ],
+    });
+    const res = Bun.spawnSync(
+      [
+        "bun",
+        "run",
+        AGGREGATE_TS,
+        "--skill-dir",
+        skillDir,
+        "--skill",
+        "mr-review",
+        "--iteration",
+        "1",
+      ],
+      { cwd, stdout: "pipe", stderr: "pipe" },
+    );
+    expect(res.exitCode).toBe(0);
+    const benchmark = JSON.parse(
+      readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
+    ) as { validity_warnings: string[] };
+    expect(
+      benchmark.validity_warnings.some(
+        (w) => w.includes("e1/old_skill") && /live skill source/i.test(w),
+      ),
+    ).toBe(true);
+  });
+  test("warns when timing sources are mixed across the compared runs", () => {
+    const root = join(FIXTURE_ROOT, "agg-mixed-timing");
+    const skillDir = join(root, "skill-dir");
+    const skillSub = join(skillDir, "mr-review");
+    mkdirSync(skillSub, { recursive: true });
+    writeFileSync(
+      join(skillSub, "SKILL.md"),
+      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    );
+    const cwd = join(root, "work");
+    const iterationDir = join(
+      cwd,
+      "skills-workspace",
+      "mr-review",
+      "iteration-1",
+    );
+    mkdirSync(iterationDir, { recursive: true });
+    writeJson(join(iterationDir, "conditions.json"), {
+      mode: "new-skill",
+      conditions: [
+        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
+        { name: "without_skill", skill_path: null },
+      ],
+      timestamp: new Date().toISOString(),
+      harness: "claude-code",
+    });
+    // One arm has agent-captured completion-event timing (no source field, the
+    // pre-provenance shape); the other was backfilled from the transcript.
+    const mkCond = (cond: string, timing: unknown) => {
+      const condDir = join(iterationDir, "eval-e1", cond);
+      mkdirSync(condDir, { recursive: true });
+      writeJson(join(condDir, "grading.json"), {
+        assertion_results: [],
+        summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
+      });
+      writeJson(join(condDir, "timing.json"), timing);
+    };
+    mkCond("with_skill", { total_tokens: 5000, duration_ms: 1000 });
+    mkCond("without_skill", {
+      total_tokens: 90000,
+      duration_ms: 1200,
+      source: "transcript",
+    });
+    const res = Bun.spawnSync(
+      [
+        "bun",
+        "run",
+        AGGREGATE_TS,
+        "--skill-dir",
+        skillDir,
+        "--skill",
+        "mr-review",
+        "--iteration",
+        "1",
+      ],
+      { cwd, stdout: "pipe", stderr: "pipe" },
+    );
+    expect(res.exitCode).toBe(0);
+    const benchmark = JSON.parse(
+      readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
+    ) as { validity_warnings: string[] };
+    expect(
+      benchmark.validity_warnings.some(
+        (w) => w.includes("timing source") && w.includes("transcript"),
+      ),
+    ).toBe(true);
+  });
+  test("does not warn when all timing comes from one source", () => {
+    const root = join(FIXTURE_ROOT, "agg-same-timing");
+    const skillDir = join(root, "skill-dir");
+    const skillSub = join(skillDir, "mr-review");
+    mkdirSync(skillSub, { recursive: true });
+    writeFileSync(
+      join(skillSub, "SKILL.md"),
+      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    );
+    const cwd = join(root, "work");
+    const iterationDir = join(
+      cwd,
+      "skills-workspace",
+      "mr-review",
+      "iteration-1",
+    );
+    mkdirSync(iterationDir, { recursive: true });
+    writeJson(join(iterationDir, "conditions.json"), {
+      mode: "new-skill",
+      conditions: [
+        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
+        { name: "without_skill", skill_path: null },
+      ],
+      timestamp: new Date().toISOString(),
+      harness: "claude-code",
+    });
+    for (const cond of ["with_skill", "without_skill"]) {
+      const condDir = join(iterationDir, "eval-e1", cond);
+      mkdirSync(condDir, { recursive: true });
+      writeJson(join(condDir, "grading.json"), {
+        assertion_results: [],
+        summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
+      });
+      writeJson(join(condDir, "timing.json"), {
+        total_tokens: 100,
+        duration_ms: 1,
+        source: "transcript",
+      });
+    }
+    const res = Bun.spawnSync(
+      [
+        "bun",
+        "run",
+        AGGREGATE_TS,
+        "--skill-dir",
+        skillDir,
+        "--skill",
+        "mr-review",
+        "--iteration",
+        "1",
+      ],
+      { cwd, stdout: "pipe", stderr: "pipe" },
+    );
+    expect(res.exitCode).toBe(0);
+    const benchmark = JSON.parse(
+      readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
+    ) as { validity_warnings: string[] };
+    expect(
+      benchmark.validity_warnings.some((w) => w.includes("timing source")),
+    ).toBe(false);
+  });
   test("surfaces plugin-shadow findings as validity_warnings", () => {
     const root = join(FIXTURE_ROOT, "agg-shadow");
     const skillDir = join(root, "skill-dir");

package/skills/evaluating-skills/runner/aggregate.ts CHANGED Viewed

@@ -94,6 +94,11 @@ for (const c of conditions.conditions) {
 }
 let missingGradings = 0;
+// Timing provenance across all runs in the comparison. "completion-event"
+// (the agent-captured default, also assumed when `source` is absent) and
+// "transcript" (record-runs backfill, includes cache accounting) measure
+// different things — a delta mixing them is comparing two metrics.
+const timingSources = new Set<string>();
 for (const evalDir of evalDirs) {
   for (const cond of conditionNames) {
     const condDir = join(iterationDir, evalDir, cond);
@@ -116,6 +121,11 @@ for (const evalDir of evalDirs) {
         byCondition[cond].tokens.push(timing.total_tokens);
       if (typeof timing.duration_ms === "number")
         byCondition[cond].durations.push(timing.duration_ms);
+      if (
+        typeof timing.total_tokens === "number" ||
+        typeof timing.duration_ms === "number"
+      )
+        timingSources.add(timing.source ?? "completion-event");
     }
   }
 }
@@ -168,6 +178,11 @@ const delta = {
 };
 const validityWarnings: string[] = [];
+if (timingSources.size > 1) {
+  validityWarnings.push(
+    `runs mix timing sources (${[...timingSources].sort().join(", ")}) — transcript-derived totals include cache accounting, so the token/duration delta compares two different metrics. Re-record one side or read the delta as a rough signal only.`,
+  );
+}
 for (const cond of conditionNames) {
   const s = runSummary[cond];
   if (s.skill_invocation_rate != null && s.skill_invocation_rate < 1) {
@@ -188,6 +203,7 @@ if (existsSync(strayPath)) {
         eval_id: string;
         condition: string;
         violations?: unknown[];
+        live_source_reads?: unknown[];
       }>;
     };
     for (const r of stray.runs ?? []) {
@@ -196,6 +212,11 @@ if (existsSync(strayPath)) {
         validityWarnings.push(
           `${r.eval_id}/${r.condition} wrote ${n} file(s) outside its outputs dir — data point may be tainted (see stray-writes.json).`,
         );
+      const reads = r.live_source_reads?.length ?? 0;
+      if (reads > 0)
+        validityWarnings.push(
+          `${r.eval_id}/${r.condition} read the live skill source ${reads} time(s) instead of its staged copy — the arm may be contaminated (staged-slug resolution race; see stray-writes.json).`,
+        );
     }
   } catch {
     // ignore a malformed report rather than failing aggregation

package/skills/evaluating-skills/runner/detect-stray-writes.test.ts CHANGED Viewed

@@ -1,9 +1,21 @@
-import { describe, expect, test } from "bun:test";
+import { afterAll, beforeAll, describe, expect, test } from "bun:test";
+import {
+  mkdirSync,
+  readFileSync,
+  realpathSync,
+  rmSync,
+  writeFileSync,
+} from "node:fs";
+import { tmpdir } from "node:os";
 import { join } from "node:path";
-import { detectStrayWrites } from "./detect-stray-writes";
+import {
+  detectLiveSourceReads,
+  detectStrayWrites,
+} from "./detect-stray-writes";
 const OUTPUTS = "/work/iteration-1/eval-x/with_skill/outputs";
 const REPO = "/work/repo";
+const LIVE_SKILL = join(REPO, "skills", "mr-review");
 describe("detectStrayWrites", () => {
   test("a Write inside the outputs dir is clean", () => {
@@ -87,6 +99,32 @@ describe("detectStrayWrites", () => {
     expect(findings.warnings).toHaveLength(0);
   });
+  test("git worktree add is a warning (working tree outside the sandbox)", () => {
+    const findings = detectStrayWrites(
+      [
+        {
+          name: "Bash",
+          args: { command: "git worktree add ../wt -b scratch" },
+          ordinal: 0,
+        },
+      ],
+      OUTPUTS,
+      REPO,
+    );
+    expect(findings.warnings).toHaveLength(1);
+    expect(findings.warnings[0].reason).toMatch(/worktree/i);
+  });
+  test("creating a path under .claude is a warning", () => {
+    const findings = detectStrayWrites(
+      [{ name: "Bash", args: { command: "mkdir -p .claude/foo" }, ordinal: 0 }],
+      OUTPUTS,
+      REPO,
+    );
+    expect(findings.warnings).toHaveLength(1);
+    expect(findings.warnings[0].reason).toMatch(/\.claude/i);
+  });
   test("read-only tools are never flagged", () => {
     const findings = detectStrayWrites(
       [
@@ -101,3 +139,258 @@ describe("detectStrayWrites", () => {
     expect(findings.warnings).toHaveLength(0);
   });
 });
+describe("detectLiveSourceReads", () => {
+  test("a Read of the live SKILL.md is flagged", () => {
+    const findings = detectLiveSourceReads(
+      [
+        {
+          name: "Read",
+          args: { file_path: join(LIVE_SKILL, "SKILL.md") },
+          ordinal: 1,
+        },
+      ],
+      LIVE_SKILL,
+      REPO,
+    );
+    expect(findings).toHaveLength(1);
+    expect(findings[0]).toMatchObject({
+      tool: "Read",
+      path: join(LIVE_SKILL, "SKILL.md"),
+      ordinal: 1,
+    });
+    expect(findings[0].reason).toMatch(/live skill source/i);
+  });
+  test("a Read of a staged eval copy is not flagged", () => {
+    const findings = detectLiveSourceReads(
+      [
+        {
+          name: "Read",
+          args: {
+            file_path: join(
+              REPO,
+              ".claude/skills/slow-powers-eval-1-old_skill__mr-review/SKILL.md",
+            ),
+          },
+          ordinal: 0,
+        },
+      ],
+      LIVE_SKILL,
+      REPO,
+    );
+    expect(findings).toHaveLength(0);
+  });
+  test("a relative Read path resolving under the live dir is flagged", () => {
+    const findings = detectLiveSourceReads(
+      [
+        {
+          name: "Read",
+          args: { file_path: "skills/mr-review/SKILL.md" },
+          ordinal: 0,
+        },
+      ],
+      LIVE_SKILL,
+      REPO,
+    );
+    expect(findings).toHaveLength(1);
+  });
+  test("a Grep scoped to the live dir is flagged", () => {
+    const findings = detectLiveSourceReads(
+      [{ name: "Grep", args: { pattern: "x", path: LIVE_SKILL }, ordinal: 2 }],
+      LIVE_SKILL,
+      REPO,
+    );
+    expect(findings).toHaveLength(1);
+    expect(findings[0].tool).toBe("Grep");
+  });
+  test("a Bash command referencing the live dir relatively is flagged", () => {
+    const findings = detectLiveSourceReads(
+      [
+        {
+          name: "Bash",
+          args: { command: "cat skills/mr-review/SKILL.md" },
+          ordinal: 3,
+        },
+      ],
+      LIVE_SKILL,
+      REPO,
+    );
+    expect(findings).toHaveLength(1);
+    expect(findings[0].tool).toBe("Bash");
+    expect(findings[0].command).toBe("cat skills/mr-review/SKILL.md");
+  });
+  test("a Bash command referencing the live dir absolutely is flagged", () => {
+    const findings = detectLiveSourceReads(
+      [
+        {
+          name: "Bash",
+          args: { command: `grep -r trigger ${LIVE_SKILL}/` },
+          ordinal: 0,
+        },
+      ],
+      LIVE_SKILL,
+      REPO,
+    );
+    expect(findings).toHaveLength(1);
+  });
+  test("a Bash command referencing a staged copy under .claude/skills is not flagged", () => {
+    // --stage-name can stage under the skill's natural name; that path contains
+    // `skills/<name>` but lives under `.claude/`, so it must not match.
+    const findings = detectLiveSourceReads(
+      [
+        {
+          name: "Bash",
+          args: { command: "cat .claude/skills/mr-review/SKILL.md" },
+          ordinal: 0,
+        },
+      ],
+      LIVE_SKILL,
+      REPO,
+    );
+    expect(findings).toHaveLength(0);
+  });
+  test("unrelated reads and commands are not flagged", () => {
+    const findings = detectLiveSourceReads(
+      [
+        {
+          name: "Read",
+          args: { file_path: join(OUTPUTS, "x.md") },
+          ordinal: 0,
+        },
+        { name: "Bash", args: { command: "ls skills-workspace" }, ordinal: 1 },
+        {
+          name: "Write",
+          args: { file_path: join(LIVE_SKILL, "SKILL.md") },
+          ordinal: 2,
+        },
+      ],
+      LIVE_SKILL,
+      REPO,
+    );
+    // Write tools are detectStrayWrites' jurisdiction — this check is reads only.
+    expect(findings).toHaveLength(0);
+  });
+});
+describe("detect-stray-writes CLI", () => {
+  // realpath: the spawned CLI sees its cwd resolved (macOS /var → /private/var),
+  // so fixture paths must match that form for prefix checks to line up.
+  const FIXTURE_ROOT = join(
+    realpathSync(tmpdir()),
+    `slow-powers-detect-stray-test-${process.pid}`,
+  );
+  const SCRIPT = join(import.meta.dir, "detect-stray-writes.ts");
+  beforeAll(() => {
+    mkdirSync(FIXTURE_ROOT, { recursive: true });
+  });
+  afterAll(() => {
+    rmSync(FIXTURE_ROOT, { recursive: true, force: true });
+  });
+  test("reports live-source reads per run in stray-writes.json", () => {
+    const root = join(FIXTURE_ROOT, "cli-live-reads");
+    const skillDir = join(root, "skill-dir");
+    const skillSub = join(skillDir, "mr-review");
+    mkdirSync(skillSub, { recursive: true });
+    writeFileSync(
+      join(skillSub, "SKILL.md"),
+      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
+    );
+    const cwd = join(root, "work");
+    const iterationDir = join(
+      cwd,
+      "skills-workspace",
+      "mr-review",
+      "iteration-1",
+    );
+    const condDir = join(iterationDir, "eval-e1", "old_skill");
+    mkdirSync(condDir, { recursive: true });
+    writeFileSync(
+      join(iterationDir, "conditions.json"),
+      `${JSON.stringify({
+        mode: "revision",
+        conditions: [
+          { name: "old_skill", skill_path: join(skillSub, "SKILL.md") },
+          { name: "new_skill", skill_path: join(skillSub, "SKILL.md") },
+        ],
+        timestamp: new Date().toISOString(),
+        harness: "claude-code",
+      })}\n`,
+    );
+    writeFileSync(
+      join(condDir, "run.json"),
+      `${JSON.stringify({
+        eval_id: "e1",
+        condition: "old_skill",
+        skill_path: join(skillSub, "SKILL.md"),
+        prompt: "do the task",
+        files: [],
+        final_message: "done",
+        tool_invocations: [
+          {
+            name: "Read",
+            args: { file_path: join(skillSub, "SKILL.md") },
+            ordinal: 0,
+          },
+          {
+            name: "Write",
+            args: { file_path: join(condDir, "outputs", "answer.md") },
+            ordinal: 1,
+          },
+        ],
+      })}\n`,
+    );
+    const res = Bun.spawnSync(
+      [
+        "bun",
+        "run",
+        SCRIPT,
+        "--skill-dir",
+        skillDir,
+        "--skill",
+        "mr-review",
+        "--iteration",
+        "1",
+      ],
+      { cwd, stdout: "pipe", stderr: "pipe" },
+    );
+    expect(res.exitCode).toBe(0);
+    const report = JSON.parse(
+      readFileSync(join(iterationDir, "stray-writes.json"), "utf8"),
+    ) as {
+      totals: {
+        violations: number;
+        warnings: number;
+        live_source_reads: number;
+      };
+      runs: Array<{
+        eval_id: string;
+        condition: string;
+        live_source_reads: Array<{ tool: string; path?: string }>;
+      }>;
+    };
+    expect(report.totals.live_source_reads).toBe(1);
+    expect(report.totals.violations).toBe(0);
+    expect(report.runs).toHaveLength(1);
+    expect(report.runs[0]).toMatchObject({
+      eval_id: "e1",
+      condition: "old_skill",
+    });
+    expect(report.runs[0].live_source_reads[0]).toMatchObject({
+      tool: "Read",
+      path: join(skillSub, "SKILL.md"),
+    });
+  });
+});