npm - @slowdini/slow-powers-opencode - Versions diffs - 0.3.0 → 0.4.0 - Mend

@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

package/skills/evaluating-skills/runner/workspace-teardown.test.ts DELETED Viewed

@@ -1,227 +0,0 @@
-import { afterAll, beforeAll, describe, expect, test } from "bun:test";
-import { existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
-import { tmpdir } from "node:os";
-import { join } from "node:path";
-import {
-  cleanupWorkspace,
-  PROMOTED_MARKER,
-  SNAPSHOT_META,
-} from "./workspace-teardown";
-const FIXTURE_ROOT = join(
-  tmpdir(),
-  `slow-powers-workspace-teardown-test-${process.pid}`,
-);
-beforeAll(() => {
-  mkdirSync(FIXTURE_ROOT, { recursive: true });
-});
-afterAll(() => {
-  rmSync(FIXTURE_ROOT, { recursive: true, force: true });
-});
-let caseSeq = 0;
-function freshWorkspace(): string {
-  caseSeq += 1;
-  const workspaceRoot = join(
-    FIXTURE_ROOT,
-    `case-${caseSeq}`,
-    "skills-workspace",
-  );
-  mkdirSync(workspaceRoot, { recursive: true });
-  return workspaceRoot;
-}
-function writeJson(path: string, value: unknown) {
-  mkdirSync(join(path, ".."), { recursive: true });
-  writeFileSync(path, `${JSON.stringify(value, null, 2)}\n`);
-}
-/** Build an iteration dir; `opts` controls which artifacts it carries. */
-function makeIteration(
-  workspaceRoot: string,
-  skill: string,
-  iteration: string,
-  opts: {
-    promoted?: boolean;
-    benchmark?: boolean;
-    runRecord?: boolean;
-    grading?: boolean;
-    scaffoldingOnly?: boolean;
-  },
-): string {
-  const dir = join(workspaceRoot, skill, iteration);
-  mkdirSync(dir, { recursive: true });
-  if (opts.scaffoldingOnly) {
-    writeFileSync(join(dir, "dispatch.json"), "[]\n");
-  }
-  if (opts.benchmark) {
-    writeJson(join(dir, "benchmark.json"), { delta: { pass_rate: 0.5 } });
-  }
-  if (opts.runRecord) {
-    writeJson(join(dir, "eval-e1", "with_skill", "run.json"), {
-      eval_id: "e1",
-    });
-  }
-  if (opts.grading) {
-    writeJson(join(dir, "eval-e1", "with_skill", "grading.json"), {
-      summary: { pass_rate: 1 },
-    });
-  }
-  if (opts.promoted) {
-    writeJson(join(dir, PROMOTED_MARKER), {
-      promoted_at: "2026-06-04T00:00:00.000Z",
-      baseline_dir: "/somewhere/evals/baseline",
-      commit: "abc1234",
-    });
-  }
-  return dir;
-}
-function makeSnapshot(
-  workspaceRoot: string,
-  skill: string,
-  label: string,
-  source: "ref" | "working-tree" | null,
-): string {
-  const dir = join(workspaceRoot, skill, "snapshots", label);
-  mkdirSync(dir, { recursive: true });
-  writeFileSync(join(dir, "SKILL.md"), "snapshot body\n");
-  if (source !== null) {
-    writeJson(
-      join(dir, SNAPSHOT_META),
-      source === "ref" ? { source, ref: "HEAD~1" } : { source },
-    );
-  }
-  return dir;
-}
-describe("cleanupWorkspace — iterations", () => {
-  test("removes a promoted iteration and prunes the emptied workspace", () => {
-    const ws = freshWorkspace();
-    const iter = makeIteration(ws, "mr-review", "iteration-1", {
-      promoted: true,
-      benchmark: true,
-      grading: true,
-    });
-    const summary = cleanupWorkspace(ws, "mr-review");
-    expect(existsSync(iter)).toBe(false);
-    expect(summary.removedIterations).toEqual(["iteration-1"]);
-    expect(summary.workspaceRemoved).toBe(true);
-    // Skill dir and the workspace root are pruned once empty.
-    expect(existsSync(join(ws, "mr-review"))).toBe(false);
-    expect(existsSync(ws)).toBe(false);
-  });
-  test("keeps an unpromoted iteration that holds a benchmark, and reports it", () => {
-    const ws = freshWorkspace();
-    const iter = makeIteration(ws, "mr-review", "iteration-1", {
-      benchmark: true,
-    });
-    const summary = cleanupWorkspace(ws, "mr-review");
-    expect(existsSync(iter)).toBe(true);
-    expect(summary.removedIterations).toEqual([]);
-    expect(summary.keptIterations.map((k) => k.iteration)).toEqual([
-      "iteration-1",
-    ]);
-    // Nothing was emptied, so the workspace stays.
-    expect(existsSync(ws)).toBe(true);
-  });
-  test("keeps an unpromoted iteration that holds only a run record", () => {
-    const ws = freshWorkspace();
-    const iter = makeIteration(ws, "mr-review", "iteration-1", {
-      runRecord: true,
-    });
-    const summary = cleanupWorkspace(ws, "mr-review");
-    expect(existsSync(iter)).toBe(true);
-    expect(summary.keptIterations.map((k) => k.iteration)).toEqual([
-      "iteration-1",
-    ]);
-  });
-  test("removes an unpromoted scaffolding-only iteration (no captured results)", () => {
-    const ws = freshWorkspace();
-    const iter = makeIteration(ws, "mr-review", "iteration-1", {
-      scaffoldingOnly: true,
-    });
-    const summary = cleanupWorkspace(ws, "mr-review");
-    expect(existsSync(iter)).toBe(false);
-    expect(summary.removedIterations).toEqual(["iteration-1"]);
-  });
-  test("mixed: promoted removed, unpromoted-with-results kept, skill dir NOT pruned", () => {
-    const ws = freshWorkspace();
-    const promoted = makeIteration(ws, "mr-review", "iteration-1", {
-      promoted: true,
-      benchmark: true,
-    });
-    const kept = makeIteration(ws, "mr-review", "iteration-2", {
-      benchmark: true,
-    });
-    const summary = cleanupWorkspace(ws, "mr-review");
-    expect(existsSync(promoted)).toBe(false);
-    expect(existsSync(kept)).toBe(true);
-    expect(summary.removedIterations).toEqual(["iteration-1"]);
-    expect(summary.keptIterations.map((k) => k.iteration)).toEqual([
-      "iteration-2",
-    ]);
-    expect(summary.workspaceRemoved).toBe(false);
-    expect(existsSync(join(ws, "mr-review"))).toBe(true);
-  });
-});
-describe("cleanupWorkspace — snapshots", () => {
-  test("removes ref snapshots, keeps working-tree and legacy (no-meta) snapshots", () => {
-    const ws = freshWorkspace();
-    const refSnap = makeSnapshot(ws, "mr-review", "old-ref", "ref");
-    const wtSnap = makeSnapshot(ws, "mr-review", "wt", "working-tree");
-    const legacySnap = makeSnapshot(ws, "mr-review", "legacy", null);
-    const summary = cleanupWorkspace(ws, "mr-review");
-    expect(existsSync(refSnap)).toBe(false);
-    expect(existsSync(wtSnap)).toBe(true);
-    expect(existsSync(legacySnap)).toBe(true);
-    expect(summary.removedSnapshots).toEqual(["old-ref"]);
-    expect(summary.keptSnapshots.sort()).toEqual(["legacy", "wt"]);
-  });
-});
-describe("cleanupWorkspace — safety", () => {
-  test("never touches another skill's workspace, and leaves the root intact", () => {
-    const ws = freshWorkspace();
-    makeIteration(ws, "mr-review", "iteration-1", { promoted: true });
-    const otherIter = makeIteration(ws, "other-skill", "iteration-1", {
-      benchmark: true,
-    });
-    cleanupWorkspace(ws, "mr-review");
-    expect(existsSync(join(ws, "mr-review"))).toBe(false);
-    expect(existsSync(otherIter)).toBe(true);
-    // Root survives because other-skill still lives there.
-    expect(existsSync(ws)).toBe(true);
-  });
-  test("returns an empty summary and does not throw when the skill has no workspace", () => {
-    const ws = freshWorkspace();
-    const summary = cleanupWorkspace(ws, "never-ran");
-    expect(summary.removedIterations).toEqual([]);
-    expect(summary.keptIterations).toEqual([]);
-    expect(summary.removedSnapshots).toEqual([]);
-    expect(summary.keptSnapshots).toEqual([]);
-    expect(summary.workspaceRemoved).toBe(false);
-  });
-});

package/skills/evaluating-skills/runner/workspace-teardown.ts DELETED Viewed

@@ -1,136 +0,0 @@
-import { existsSync, readdirSync, readFileSync, rmSync } from "node:fs";
-import { join } from "node:path";
-/**
- * Marker `promote-baseline` drops into an iteration dir once that iteration's
- * durable results (benchmark + gradings) are committed under the skill's
- * `evals/baseline/`. Teardown treats its presence as "safe to delete" — the
- * data now lives in version control.
- */
-export const PROMOTED_MARKER = ".promoted.json";
-/**
- * Provenance the `snapshot` command writes into each `snapshots/<label>/` dir,
- * recording whether it was materialized from a git ref (reproducible) or copied
- * from the working tree (not reproducible). Teardown only reclaims ref snapshots.
- */
-export const SNAPSHOT_META = ".snapshot-meta.json";
-export type WorkspaceCleanupSummary = {
-  /** Iteration dir names removed (promoted, or pure scaffolding). */
-  removedIterations: string[];
-  /** Iterations kept because they hold uncommitted results, with the reason. */
-  keptIterations: { iteration: string; reason: string }[];
-  /** Snapshot labels removed (reproducible from a git ref). */
-  removedSnapshots: string[];
-  /** Snapshot labels kept (working-tree or legacy, can't be regenerated). */
-  keptSnapshots: string[];
-  /** True when the skill's whole workspace subtree was removed. */
-  workspaceRemoved: boolean;
-};
-/** Remove `dir` only if it exists and is empty. */
-function pruneIfEmpty(dir: string): void {
-  if (existsSync(dir) && readdirSync(dir).length === 0) {
-    rmSync(dir, { recursive: true, force: true });
-  }
-}
-/**
- * An iteration carries "captured results" worth preserving if it reached the
- * point of producing an aggregate (`benchmark.json`) or any per-run record or
- * grading. Anything short of that (e.g. a `--dry-run` or a run staged but never
- * dispatched) is reproducible scaffolding.
- */
-function iterationHasResults(iterDir: string): boolean {
-  if (existsSync(join(iterDir, "benchmark.json"))) return true;
-  for (const entry of readdirSync(iterDir, { withFileTypes: true })) {
-    if (!entry.isDirectory() || !entry.name.startsWith("eval-")) continue;
-    const evalDir = join(iterDir, entry.name);
-    for (const cond of readdirSync(evalDir, { withFileTypes: true })) {
-      if (!cond.isDirectory()) continue;
-      const condDir = join(evalDir, cond.name);
-      if (existsSync(join(condDir, "run.json"))) return true;
-      if (existsSync(join(condDir, "grading.json"))) return true;
-    }
-  }
-  return false;
-}
-function snapshotSource(snapDir: string): string | null {
-  const metaPath = join(snapDir, SNAPSHOT_META);
-  if (!existsSync(metaPath)) return null;
-  try {
-    const meta = JSON.parse(readFileSync(metaPath, "utf8")) as {
-      source?: string;
-    };
-    return meta.source ?? null;
-  } catch {
-    return null;
-  }
-}
-/**
- * End-of-run cleanup of a skill's `skills-workspace/<skill>/` subtree, so a
- * finished eval leaves behind nothing that wasn't meant to be committed —
- * without ever destroying results the user hasn't moved into version control.
- *
- * Per iteration: promoted (marker present) → removed; unpromoted but holding
- * captured results → kept and reported; unpromoted scaffolding → removed. Per
- * snapshot: ref-sourced → removed; working-tree or legacy → kept. Empty parents
- * (`snapshots/`, the skill dir, the workspace root) are pruned, but a non-empty
- * one — e.g. another skill's artifacts — is never touched.
- */
-export function cleanupWorkspace(
-  workspaceRoot: string,
-  skillName: string,
-): WorkspaceCleanupSummary {
-  const summary: WorkspaceCleanupSummary = {
-    removedIterations: [],
-    keptIterations: [],
-    removedSnapshots: [],
-    keptSnapshots: [],
-    workspaceRemoved: false,
-  };
-  const skillDir = join(workspaceRoot, skillName);
-  if (!existsSync(skillDir)) return summary;
-  for (const entry of readdirSync(skillDir, { withFileTypes: true })) {
-    if (!entry.isDirectory() || !entry.name.startsWith("iteration-")) continue;
-    const iterDir = join(skillDir, entry.name);
-    if (existsSync(join(iterDir, PROMOTED_MARKER))) {
-      rmSync(iterDir, { recursive: true, force: true });
-      summary.removedIterations.push(entry.name);
-    } else if (iterationHasResults(iterDir)) {
-      summary.keptIterations.push({
-        iteration: entry.name,
-        reason: "uncommitted results — not promoted to evals/baseline/",
-      });
-    } else {
-      rmSync(iterDir, { recursive: true, force: true });
-      summary.removedIterations.push(entry.name);
-    }
-  }
-  const snapshotsDir = join(skillDir, "snapshots");
-  if (existsSync(snapshotsDir)) {
-    for (const entry of readdirSync(snapshotsDir, { withFileTypes: true })) {
-      if (!entry.isDirectory()) continue;
-      const snapDir = join(snapshotsDir, entry.name);
-      if (snapshotSource(snapDir) === "ref") {
-        rmSync(snapDir, { recursive: true, force: true });
-        summary.removedSnapshots.push(entry.name);
-      } else {
-        summary.keptSnapshots.push(entry.name);
-      }
-    }
-    pruneIfEmpty(snapshotsDir);
-  }
-  pruneIfEmpty(skillDir);
-  summary.workspaceRemoved = !existsSync(skillDir);
-  pruneIfEmpty(workspaceRoot);
-  return summary;
-}

package/skills/evaluating-skills/schema/evals.schema.json DELETED Viewed

@@ -1,105 +0,0 @@
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "$id": "https://slow-powers.dev/schemas/evals.schema.json",
-  "title": "Skill Evaluation Definition",
-  "description": "Defines a set of test cases for evaluating a skill. Lives at <skill>/evals/evals.json.",
-  "type": "object",
-  "required": ["skill_name", "evals"],
-  "additionalProperties": false,
-  "properties": {
-    "skill_name": {
-      "type": "string",
-      "description": "Name of the skill being evaluated. Should match the skill directory name."
-    },
-    "evals": {
-      "type": "array",
-      "minItems": 1,
-      "items": { "$ref": "#/definitions/eval" }
-    }
-  },
-  "definitions": {
-    "eval": {
-      "type": "object",
-      "required": ["id", "prompt", "expected_output"],
-      "additionalProperties": false,
-      "properties": {
-        "id": {
-          "type": "string",
-          "pattern": "^[a-z0-9][a-z0-9-]*$",
-          "description": "Stable kebab-case identifier. Used as directory name in the workspace tree."
-        },
-        "prompt": {
-          "type": "string",
-          "minLength": 1,
-          "description": "The user-facing message the subagent receives. Should read like a realistic user request."
-        },
-        "expected_output": {
-          "type": "string",
-          "minLength": 1,
-          "description": "Human-readable description of what a successful response looks like."
-        },
-        "files": {
-          "type": "array",
-          "items": { "type": "string" },
-          "description": "Fixture file paths relative to the skill's evals/ directory. Copied into the subagent's input directory before dispatch."
-        },
-        "skill_should_trigger": {
-          "type": "boolean",
-          "default": true,
-          "description": "Whether the skill-under-test is expected to fire on this eval. Defaults to true. Set false for negative evals where correct behavior is NOT invoking the skill (e.g. an over-trigger guard); such evals are excluded from the skill-invocation rate and its validity warning."
-        },
-        "assertions": {
-          "type": "array",
-          "items": { "$ref": "#/definitions/assertion" },
-          "description": "Pass/fail criteria, added after iteration 1 when you know what outputs look like."
-        }
-      }
-    },
-    "assertion": {
-      "oneOf": [
-        { "$ref": "#/definitions/transcriptCheck" },
-        { "$ref": "#/definitions/llmJudge" }
-      ]
-    },
-    "transcriptCheck": {
-      "type": "object",
-      "required": ["id", "type", "check"],
-      "additionalProperties": false,
-      "properties": {
-        "id": { "type": "string" },
-        "type": { "const": "transcript_check" },
-        "check": {
-          "type": "string",
-          "description": "Name of a transcript-check kind handled by the runner's grader (runner/grade.ts), e.g. tool_invocation_matches."
-        },
-        "pattern": {
-          "type": "string",
-          "description": "Regex (or substring) the check uses to match tool invocations."
-        },
-        "must_precede": {
-          "type": "string",
-          "enum": ["completion_claim", "any"],
-          "description": "Where in the run the matched invocation must occur. 'completion_claim' = before the final message. 'any' = anywhere in the run."
-        }
-      }
-    },
-    "llmJudge": {
-      "type": "object",
-      "required": ["id", "type", "rubric"],
-      "additionalProperties": false,
-      "properties": {
-        "id": { "type": "string" },
-        "type": { "const": "llm_judge" },
-        "rubric": {
-          "type": "string",
-          "minLength": 1,
-          "description": "The question the judge model answers. Should be answerable with PASS/FAIL + evidence."
-        },
-        "model": {
-          "type": "string",
-          "description": "Optional override. Defaults to whatever the harness operator configures for judge dispatches."
-        }
-      }
-    }
-  }
-}

package/skills/evaluating-skills/schema/grading.schema.json DELETED Viewed

@@ -1,84 +0,0 @@
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "$id": "https://slow-powers.dev/schemas/grading.schema.json",
-  "title": "Grading Result",
-  "description": "Output of grading one (eval, condition) pair. Lives at <workspace>/iteration-N/eval-<id>/<condition>/grading.json.",
-  "type": "object",
-  "required": ["assertion_results", "summary"],
-  "additionalProperties": false,
-  "properties": {
-    "assertion_results": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "required": ["id", "passed", "evidence"],
-        "additionalProperties": false,
-        "properties": {
-          "id": {
-            "type": "string",
-            "description": "Matches the assertion id in evals.json."
-          },
-          "passed": { "type": "boolean" },
-          "evidence": {
-            "type": "string",
-            "description": "Direct quote or specific reference from the run record. Vague summaries are not evidence."
-          },
-          "confidence": {
-            "type": "number",
-            "minimum": 0,
-            "maximum": 1,
-            "description": "Judge confidence. Low confidence (< 0.7) flags this result for human review. Always 1.0 for transcript_check results."
-          },
-          "grader": {
-            "type": "string",
-            "enum": ["transcript_check", "llm_judge"],
-            "description": "Which grader produced this result."
-          }
-        }
-      }
-    },
-    "summary": {
-      "type": "object",
-      "required": ["passed", "failed", "total", "pass_rate"],
-      "additionalProperties": false,
-      "properties": {
-        "passed": { "type": "integer", "minimum": 0 },
-        "failed": { "type": "integer", "minimum": 0 },
-        "total": { "type": "integer", "minimum": 0 },
-        "pass_rate": { "type": "number", "minimum": 0, "maximum": 1 }
-      }
-    },
-    "meta_results": {
-      "type": "array",
-      "description": "Framework-injected meta-assertions (e.g. skill-invocation check). Reserved id prefix: __ (double underscore). Tracked separately from substantive assertion_results so they do not pollute the skill effectiveness pass_rate.",
-      "items": {
-        "type": "object",
-        "required": ["id", "passed", "evidence"],
-        "additionalProperties": false,
-        "properties": {
-          "id": { "type": "string" },
-          "passed": { "type": "boolean" },
-          "evidence": { "type": "string" },
-          "confidence": { "type": "number", "minimum": 0, "maximum": 1 },
-          "grader": {
-            "type": "string",
-            "enum": ["transcript_check", "llm_judge"]
-          }
-        }
-      }
-    },
-    "meta_summary": {
-      "type": "object",
-      "additionalProperties": false,
-      "properties": {
-        "passed": { "type": "integer", "minimum": 0 },
-        "failed": { "type": "integer", "minimum": 0 },
-        "total": { "type": "integer", "minimum": 0 },
-        "skill_invoked": {
-          "description": "True when the skill-invocation meta-check passed; false when the judge found no evidence the skill influenced behavior; null when no skill was loaded for this run.",
-          "type": ["boolean", "null"]
-        }
-      }
-    }
-  }
-}

package/skills/evaluating-skills/schema/run-record.schema.json DELETED Viewed

@@ -1,80 +0,0 @@
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "$id": "https://slow-powers.dev/schemas/run-record.schema.json",
-  "title": "Portable Run Record",
-  "description": "Captures one subagent run. Harness-agnostic — each harness writes an adapter from its native transcript format to this shape. Downstream grading reads only this file.",
-  "type": "object",
-  "required": [
-    "eval_id",
-    "condition",
-    "skill_path",
-    "prompt",
-    "files",
-    "final_message",
-    "tool_invocations"
-  ],
-  "additionalProperties": false,
-  "properties": {
-    "eval_id": {
-      "type": "string",
-      "description": "Matches the eval's id in evals.json."
-    },
-    "condition": {
-      "type": "string",
-      "description": "Reserved names: with_skill, without_skill, old_skill, new_skill."
-    },
-    "skill_path": {
-      "type": ["string", "null"],
-      "description": "Absolute path to the SKILL.md the subagent could load, or null if no skill was provided (without_skill condition)."
-    },
-    "prompt": {
-      "type": "string",
-      "description": "The user prompt as dispatched to the subagent."
-    },
-    "files": {
-      "type": "array",
-      "items": { "type": "string" },
-      "description": "Fixture files the subagent had access to (absolute paths inside the run's workspace)."
-    },
-    "final_message": {
-      "type": "string",
-      "description": "The agent's final user-facing text output."
-    },
-    "tool_invocations": {
-      "type": "array",
-      "description": "Ordered list of tool calls during the run.",
-      "items": {
-        "type": "object",
-        "required": ["name", "ordinal"],
-        "additionalProperties": false,
-        "properties": {
-          "name": {
-            "type": "string",
-            "description": "Tool name as recorded by the harness (e.g. Bash, Read, run_command). Adapters should preserve original names."
-          },
-          "args": {
-            "description": "Tool arguments. Object for structured tools, string for raw command-style tools.",
-            "type": ["object", "string", "array", "null"]
-          },
-          "result": {
-            "description": "Tool output, if captured. Truncate long outputs to ~2KB.",
-            "type": ["string", "object", "null"]
-          },
-          "ordinal": {
-            "type": "integer",
-            "minimum": 0,
-            "description": "0-indexed position in the run. Used by must_precede checks."
-          }
-        }
-      }
-    },
-    "total_tokens": {
-      "type": ["integer", "null"],
-      "description": "From the harness's task completion event, or derived from the persisted transcript by record-runs (usage summed across unique message ids, including cache creation/read tokens — a different accounting than the completion event). Canonical timing lives in the sibling timing.json, whose `source` field records which origin produced it. May be null if neither source is available."
-    },
-    "duration_ms": {
-      "type": ["integer", "null"],
-      "description": "From the harness's task completion event, or derived from the persisted transcript by record-runs (wall clock between the first and last transcript timestamps). Canonical timing lives in the sibling timing.json. May be null if neither source is available."
-    }
-  }
-}