npm - @windyroad/tdd - Versions diffs - 0.4.4 → 0.4.5-preview.809 - Mend

@windyroad/tdd 0.4.4 → 0.4.5-preview.809

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/.claude-plugin/plugin.json +1 -1
package/agents/review-test.md +17 -27
package/lib/install-utils.mjs +99 -23
package/package.json +3 -2

package/.claude-plugin/plugin.json CHANGED

@@ -101,5 +101,5 @@
     }
   },
   "name": "wr-tdd",
-  "version": "0.4.4"
+  "version": "0.4.5"
 }

package/agents/review-test.md CHANGED

@@ -30,7 +30,7 @@ A **behavioural** test asserts what the target **does** when invoked: its tool-c
 A **structural** test asserts what the target's source **says**: that a string appears in `SKILL.md`, that a frontmatter field has a particular value, that a section heading is present.
-Behavioural is the default per ADR-052. Structural is permitted only with documented justification (Surface 1: env-var skip; Surface 2: in-file justification comment).
+Per ADR-052 (Option 1A — Behavioural-only, 2026-06-09 amendment) **behavioural is the only permitted kind**. STRUCTURAL is a **failing** classification: structural assertions on prose-document content (`SKILL.md`, `agent.md`, `*.proposed.md`, `*.accepted.md`, `*.superseded.md`, `RISK-POLICY.md`, `CLAUDE.md`, and similar prose contracts) are not permitted under any justification. There is no escape hatch — no env-var skip, no in-file justification comment. A test that cannot yet be expressed behaviourally because the harness lacks a primitive does NOT ship as structural; it BLOCKS on the relevant Layer B harness-gap ticket (P324 / P176 / P012-descendants) and ships only once the primitive lands.
 ## Detection method
@@ -39,13 +39,10 @@ Read the full test source. For each test case:
 1. Identify the assertion target (the `run` invocation, the `expect(...)`, the `assert ...`, the `Then` step).
 2. Trace the target back to its data source.
 3. Classify:
-   - **STRUCTURAL** — assertion's data source reduces to "string X appears in (or is absent from) prose document Y" where Y is `SKILL.md` / `agent.md` / `*.proposed.md` / `*.accepted.md` / `*.superseded.md` / `RISK-POLICY.md` / `CLAUDE.md` / similar prose contracts.
-   - **BEHAVIOURAL** — assertion observes target invocation outputs (stdout / stderr / return value / promise resolution), exit codes, written artefacts (final filesystem state), captured tool-calls (mock invocation parameters), or final state of an externally-observable system.
-   - **STRUCTURAL-PERMITTED** — assertion is structural BUT the target is one of ADR-005's preserved permitted exceptions: `hooks.json` content checks, file-existence / file-removed checks, hook-script safety-construct presence (e.g. `set -euo pipefail`).
+   - **STRUCTURAL** (failing) — assertion's data source reduces to "string X appears in (or is absent from) prose document Y" where Y is `SKILL.md` / `agent.md` / `*.proposed.md` / `*.accepted.md` / `*.superseded.md` / `RISK-POLICY.md` / `CLAUDE.md` / similar prose contracts.
+   - **BEHAVIOURAL** — assertion observes target invocation outputs (stdout / stderr / return value / promise resolution), exit codes, written artefacts (final filesystem state), captured tool-calls (mock invocation parameters), or final state of an externally-observable system. ADR-005's **preserved exceptions** also classify as BEHAVIOURAL, not failing-STRUCTURAL: `hooks.json` content checks, file-existence / file-removed checks, and hook-script safety-construct presence (e.g. `set -euo pipefail`) on executable bash under `hooks/`. These observe artefact / executable / filesystem state rather than prose-document content, so ADR-052's narrowing leaves them permitted (ADR-052 retains ADR-005's hook-testing exceptions).
-If the test file contains the comment `tdd-review: structural-permitted (justification: …)` (any case), treat ALL its structural assertions as STRUCTURAL-JUSTIFIED. Recognise both `# tdd-review: …` (bash / pytest / cucumber) and `// tdd-review: …` (vitest / jest / mocha).
-If a single test file mixes structural and behavioural test cases without a justification comment, the file-level verdict is MIXED. Per-test-case classification appears in the evidence array.
+If a single test file mixes structural and behavioural test cases, the file-level verdict is MIXED. Per-test-case classification appears in the evidence array.
 If the file's intent is genuinely unclear (e.g. test cases that read a config file but assert on the parsed result rather than the raw text), emit `verdict: "unclear"` rather than guessing. Populate evidence and suggestion fields so a reader can resolve the ambiguity.
@@ -56,7 +53,7 @@ You will be classifying test files written in bats, vitest, jest, mocha, cucumbe
 ### bats
 ```bash
-# STRUCTURAL — asserts SKILL.md prose
+# STRUCTURAL (failing) — asserts SKILL.md prose
 @test "skill cites P081" {
   run grep -F "P081" "$SKILL_MD"
   [ "$status" -eq 0 ]
@@ -70,7 +67,8 @@ You will be classifying test files written in bats, vitest, jest, mocha, cucumbe
   [[ "$output" == *"BLOCKED"* ]]
 }
-# STRUCTURAL-PERMITTED — hook safety-construct on executable bash
+# BEHAVIOURAL (ADR-005 preserved exception) — hook safety-construct on
+# executable bash; observes the executable artefact's state, not prose.
 @test "hook prologue sets euo pipefail" {
   run grep -nE '^set -[eo]+u?[eo]*' "$HOOK"
   [ "$status" -eq 0 ]
@@ -80,7 +78,7 @@ You will be classifying test files written in bats, vitest, jest, mocha, cucumbe
 ### vitest / jest / mocha
 ```js
-// STRUCTURAL — asserts SKILL.md prose
+// STRUCTURAL (failing) — asserts SKILL.md prose
 expect(readFileSync('SKILL.md', 'utf8')).toContain('Step 5');
 // BEHAVIOURAL — exercises the skill, asserts on result
@@ -93,7 +91,7 @@ expect(result.toolCalls).toMatchObject([
 ### cucumber / .feature
 ```gherkin
-# STRUCTURAL — Then-step that greps a doc
+# STRUCTURAL (failing) — Then-step that greps a doc
 Then the SKILL.md should contain "Step 4a Verification"
 # BEHAVIOURAL — Then-step asserting on captured world state
@@ -103,7 +101,7 @@ Then the skill should call AskUserQuestion with options ["amend", "supersede", "
 ### pytest
 ```python
-# STRUCTURAL — reads prose document
+# STRUCTURAL (failing) — reads prose document
 assert "Step 5" in open("SKILL.md").read()
 # BEHAVIOURAL — exercises target, asserts on artefact
@@ -113,10 +111,8 @@ assert result.artefact_state == expected_tree
 ### Cross-framework heuristics
-- **STRUCTURAL signals**: assertion data flow `read_file(prose_doc)` → `contains(...)`; `readFileSync` / `cat` / `grep -F` / `grep -nE` against a `*.md` / `*.proposed.md` / `agent.md` / `SKILL.md` path.
-- **BEHAVIOURAL signals**: subprocess invocation (`bash`, `node`, `python -m`); function call returning a captured tool-call sequence; assertions on `status` / `exit_code` / `stdout` / `stderr` / `output` / `artefact_state` / `result.toolCalls` / `world.lastOutput` / mock call counts.
-- **STRUCTURAL-PERMITTED signals**: target is `hooks.json` content; file-existence / removal checks (`[ -f ... ]` / `[ ! -f ... ]` / `existsSync` / `os.path.exists`); shebang / safety-construct prologue greps on executable bash files (paths under `hooks/` ending `.sh`).
-- **STRUCTURAL-JUSTIFIED signals**: in-file comment `tdd-review: structural-permitted (justification: …)` linking a P012-descendant ticket ID.
+- **STRUCTURAL signals** (failing): assertion data flow `read_file(prose_doc)` → `contains(...)`; `readFileSync` / `cat` / `grep -F` / `grep -nE` against a `*.md` / `*.proposed.md` / `agent.md` / `SKILL.md` path.
+- **BEHAVIOURAL signals**: subprocess invocation (`bash`, `node`, `python -m`); function call returning a captured tool-call sequence; assertions on `status` / `exit_code` / `stdout` / `stderr` / `output` / `artefact_state` / `result.toolCalls` / `world.lastOutput` / mock call counts. Also ADR-005's preserved exceptions: `hooks.json` content; file-existence / removal checks (`[ -f ... ]` / `[ ! -f ... ]` / `existsSync` / `os.path.exists`); shebang / safety-construct prologue greps on executable bash files (paths under `hooks/` ending `.sh`).
 ## Verdict shape
@@ -124,7 +120,7 @@ Emit your verdict as a JSON object inside a fenced code block at the end of your
 ```json
 {
-  "verdict": "structural" | "behavioural" | "mixed" | "structural-permitted" | "structural-justified" | "unclear",
+  "verdict": "structural" | "behavioural" | "mixed" | "unclear",
   "evidence": [
     { "test_name": "skill cites P081", "line": 12, "why": "asserts grep -F on SKILL.md prose" }
   ],
@@ -135,21 +131,15 @@ Emit your verdict as a JSON object inside a fenced code block at the end of your
 ### Field rules
-- **verdict** — one of the six enum values. The file-level verdict; per-test-case classifications belong in evidence.
+- **verdict** — one of the four enum values (`structural` / `behavioural` / `mixed` / `unclear`). The file-level verdict; per-test-case classifications belong in evidence. STRUCTURAL is a failing classification, not a permitted one.
 - **evidence** — array of `{test_name, line, why}` objects, one per non-trivial classification. For BEHAVIOURAL files this may be empty or omit per-case detail.
 - **suggestion** — a behavioural alternative the test author can adapt. Concrete (name a specific assertion shape, not "write better tests"). Empty string when verdict is BEHAVIOURAL.
-- **harness_gap** — the ticket ID (`P012` / `P081-followup` / a new `PNNN`) of the harness primitive whose absence forces the structural assertion. Per [ADR-026](../../../docs/decisions/026-agent-output-grounding.proposed.md) grounding rules, this MUST be either a specific ticket ID OR `null`. **Never emit free-text speculation** (e.g. `"a Skill-tool interceptor would help"`) without a ticket citation. If you can't cite a ticket, emit `null`.
+- **harness_gap** — the ticket ID (`P012` / `P324` / a new `PNNN`) of the harness primitive whose absence forces the structural assertion. Per [ADR-026](../../../docs/decisions/026-agent-output-grounding.proposed.md) grounding rules, this MUST be either a specific ticket ID OR `null`. **Never emit free-text speculation** (e.g. `"a Skill-tool interceptor would help"`) without a ticket citation. If you can't cite a ticket, emit `null`. When you emit `verdict: "structural"` with a non-null `harness_gap`, the test BLOCKS on that ticket — it does not ship as structural with a permission marker.
 ### When the file has no test cases
 If the file is empty or contains only setup/teardown, emit `verdict: "unclear"` with `evidence: []` and `suggestion: "File contains no test cases — add @test / it() / Scenario: / def test_..."`. Do not classify as structural-by-default.
-## Escape-hatch recognition
-When the file contains the comment `tdd-review: structural-permitted (justification: …)` (or `// tdd-review: …`), emit `verdict: "structural-justified"` and report the cited ticket in `harness_gap` (parse the ticket ID from the justification text). The agent does not second-guess the justification — surfacing the verdict is the job.
-If the justification text does NOT cite a ticket ID (e.g. the comment is `tdd-review: structural-permitted (justification: TODO)`), emit `verdict: "structural-justified"` with `harness_gap: null` AND populate `suggestion` with a reminder to link a specific ticket per ADR-052's grounding requirement. Do not auto-promote to STRUCTURAL — the comment is the operator's deviation approval; the agent's role is to surface the missing citation, not to override the deviation.
 ## Input handling
 You will be given a test file path (or paths). Read the full file before classifying. If the prompt names a target source-under-test, also read it briefly to ground the suggestion (e.g. "for skill X delegating via Skill tool: simulate invocation and assert the Skill-tool call carries the expected target + arguments"). Do not load broader package context — JTBD-001 60-second budget applies.
@@ -167,5 +157,5 @@ Per [ADR-013](../../../docs/decisions/013-structured-user-interaction-for-govern
 - You run as a mechanical / silent classification stage per the project CLAUDE.md (P132 inverse-P078 carve-out). You MUST NOT call `AskUserQuestion` even when classification is genuinely ambiguous; emit `verdict: "unclear"` and let the main agent escalate at retro time.
 - You classify across frameworks: bats, vitest, jest, mocha, cucumber/.feature, pytest. Recognise the shape of each.
 - You ground every `harness_gap` claim in a specific ticket ID per ADR-026, OR emit `null`. Free-text harness-gap speculation is forbidden.
-- You respect ADR-005's preserved permitted exceptions: `hooks.json` content checks, file-existence / file-removed checks, and hook-script safety-construct presence on executable bash. Classify these as STRUCTURAL-PERMITTED, not STRUCTURAL.
-- You respect the in-file justification comment as a per-file deviation approval (ADR-044 category 2). Surface it as STRUCTURAL-JUSTIFIED; do not override.
+- Behavioural is the only permitted kind (ADR-052 Option 1A). STRUCTURAL — structural assertions on prose-document content — is a failing classification; there is no escape hatch.
+- You respect ADR-005's preserved exceptions (`hooks.json` content checks; file-existence / file-removed checks; hook-script safety-construct presence on executable bash under `hooks/`) by classifying them as BEHAVIOURAL, not failing-STRUCTURAL — they observe artefact / executable / filesystem state, not prose-document content.

package/lib/install-utils.mjs CHANGED

@@ -7,10 +7,12 @@ import { execSync } from "node:child_process";
 const MARKETPLACE_REPO = "windyroad/agent-plugins";
 const MARKETPLACE_NAME = "windyroad";
+const CODEX_MARKETPLACE_PATH = ".";
+const CODEX_MARKETPLACE_NAME = "windyroad-local";
 let _dryRun = false;
-export { MARKETPLACE_REPO, MARKETPLACE_NAME };
+export { MARKETPLACE_REPO, MARKETPLACE_NAME, CODEX_MARKETPLACE_PATH, CODEX_MARKETPLACE_NAME };
 export function setDryRun(value) {
   _dryRun = value;
@@ -35,16 +37,34 @@ export function run(cmd, label) {
   }
 }
-export function checkPrerequisites() {
+function runtimesFor(runtime = "claude") {
+  if (runtime === "both") return ["claude", "codex"];
+  return [runtime];
+}
+export function checkPrerequisites({ runtime = "claude" } = {}) {
   if (_dryRun) return;
-  try {
-    execSync("claude --version", { stdio: "pipe" });
-  } catch {
-    console.error(
-      "Error: 'claude' CLI not found. Install Claude Code first:\n  https://docs.anthropic.com/en/docs/claude-code\n"
-    );
-    process.exit(1);
+  for (const currentRuntime of runtimesFor(runtime)) {
+    if (currentRuntime === "claude") {
+      try {
+        execSync("claude --version", { stdio: "pipe" });
+      } catch {
+        console.error(
+          "Error: 'claude' CLI not found. Install Claude Code first:\n  https://docs.anthropic.com/en/docs/claude-code\n"
+        );
+        process.exit(1);
+      }
+    } else if (currentRuntime === "codex") {
+      try {
+        execSync("codex --version", { stdio: "pipe" });
+      } catch {
+        console.error(
+          "Error: 'codex' CLI not found. Install Codex CLI first:\n  https://developers.openai.com/codex\n"
+        );
+        process.exit(1);
+      }
+    }
   }
 }
@@ -55,6 +75,13 @@ export function addMarketplace() {
   );
 }
+export function addCodexMarketplace() {
+  return run(
+    `codex plugin marketplace add ${CODEX_MARKETPLACE_PATH}`,
+    `Codex marketplace: ${CODEX_MARKETPLACE_NAME}`
+  );
+}
 export function installPlugin(pluginName, { scope = "project" } = {}) {
   return run(
     `claude plugin install ${pluginName}@${MARKETPLACE_NAME} --scope ${scope}`,
@@ -62,6 +89,13 @@ export function installPlugin(pluginName, { scope = "project" } = {}) {
   );
 }
+export function installCodexPlugin(pluginName) {
+  return run(
+    `codex plugin add ${pluginName}@${CODEX_MARKETPLACE_NAME}`,
+    pluginName
+  );
+}
 export function updatePlugin(pluginName, { scope = "project" } = {}) {
   return run(
     `claude plugin update "${pluginName}@${MARKETPLACE_NAME}" --scope ${scope}`,
@@ -69,18 +103,36 @@ export function updatePlugin(pluginName, { scope = "project" } = {}) {
   );
 }
+export function updateCodexMarketplace() {
+  return run(
+    `codex plugin marketplace add ${CODEX_MARKETPLACE_PATH}`,
+    `Codex marketplace: ${CODEX_MARKETPLACE_NAME}`
+  );
+}
 export function uninstallPlugin(pluginName) {
   return run(`claude plugin uninstall ${pluginName}`, `Removing ${pluginName}`);
 }
+export function uninstallCodexPlugin(pluginName) {
+  return run(`codex plugin remove ${pluginName}`, `Removing ${pluginName}`);
+}
 /**
  * Install a single package: marketplace add + plugin install.
  */
-export function installPackage(pluginName, { deps = [], scope = "project" } = {}) {
+export function installPackage(pluginName, { deps = [], scope = "project", runtime = "claude" } = {}) {
   console.log(`\nInstalling @windyroad/${pluginName.replace("wr-", "")} (${scope} scope)...\n`);
-  addMarketplace();
-  installPlugin(pluginName, { scope });
+  if (runtime === "claude" || runtime === "both") {
+    addMarketplace();
+    installPlugin(pluginName, { scope });
+  }
+  if (runtime === "codex" || runtime === "both") {
+    addCodexMarketplace();
+    installCodexPlugin(pluginName);
+  }
   if (deps.length > 0) {
     console.log(`\nNote: This plugin works best with:`);
@@ -90,34 +142,47 @@ export function installPackage(pluginName, { deps = [], scope = "project" } = {}
   }
   console.log(
-    `\nDone! Restart Claude Code to activate.\n`
+    `\nDone! Restart ${runtime === "codex" ? "Codex" : runtime === "both" ? "Claude Code and Codex" : "Claude Code"} to activate.\n`
   );
 }
 /**
  * Update a single package.
  */
-export function updatePackage(pluginName, { scope = "project" } = {}) {
+export function updatePackage(pluginName, { scope = "project", runtime = "claude" } = {}) {
   console.log(`\nUpdating @windyroad/${pluginName.replace("wr-", "")}...\n`);
-  run(
-    `claude plugin marketplace update ${MARKETPLACE_NAME}`,
-    "Updating marketplace"
-  );
-  updatePlugin(pluginName, { scope });
+  if (runtime === "claude" || runtime === "both") {
+    run(
+      `claude plugin marketplace update ${MARKETPLACE_NAME}`,
+      "Updating marketplace"
+    );
+    updatePlugin(pluginName, { scope });
+  }
-  console.log("\nDone! Restart Claude Code to apply updates.\n");
+  if (runtime === "codex" || runtime === "both") {
+    updateCodexMarketplace();
+    installCodexPlugin(pluginName);
+  }
+  console.log(`\nDone! Restart ${runtime === "codex" ? "Codex" : runtime === "both" ? "Claude Code and Codex" : "Claude Code"} to apply updates.\n`);
 }
 /**
  * Uninstall a single package.
  */
-export function uninstallPackage(pluginName) {
+export function uninstallPackage(pluginName, { runtime = "claude" } = {}) {
   console.log(`\nUninstalling @windyroad/${pluginName.replace("wr-", "")}...\n`);
-  uninstallPlugin(pluginName);
+  if (runtime === "claude" || runtime === "both") {
+    uninstallPlugin(pluginName);
+  }
+  if (runtime === "codex" || runtime === "both") {
+    uninstallCodexPlugin(pluginName);
+  }
-  console.log("\nDone. Restart Claude Code to apply changes.\n");
+  console.log(`\nDone. Restart ${runtime === "codex" ? "Codex" : runtime === "both" ? "Claude Code and Codex" : "Claude Code"} to apply changes.\n`);
 }
 /**
@@ -131,6 +196,7 @@ export function parseStandardArgs(argv) {
     update: args.includes("--update"),
     dryRun: args.includes("--dry-run"),
     scope: "project",
+    runtime: "claude",
   };
   const scopeIdx = args.indexOf("--scope");
   if (scopeIdx !== -1 && args[scopeIdx + 1]) {
@@ -142,5 +208,15 @@ export function parseStandardArgs(argv) {
       process.exit(1);
     }
   }
+  const runtimeIdx = args.indexOf("--runtime");
+  if (runtimeIdx !== -1 && args[runtimeIdx + 1]) {
+    const val = args[runtimeIdx + 1];
+    if (["claude", "codex", "both"].includes(val)) {
+      flags.runtime = val;
+    } else {
+      console.error("--runtime requires: claude, codex, or both");
+      process.exit(1);
+    }
+  }
   return flags;
 }

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@windyroad/tdd",
-  "version": "0.4.4",
+  "version": "0.4.5-preview.809",
   "description": "TDD state machine enforcement (Red-Green-Refactor cycle)",
   "bin": {
     "windyroad-tdd": "./bin/install.mjs"
@@ -24,6 +24,7 @@
     "hooks/",
     "skills/",
     ".claude-plugin/",
-    "lib/"
+    "lib/",
+    "!agents/eval/"
   ]
 }