@windyroad/tdd 0.4.4 → 0.4.5-preview.809

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -101,5 +101,5 @@
101
101
  }
102
102
  },
103
103
  "name": "wr-tdd",
104
- "version": "0.4.4"
104
+ "version": "0.4.5"
105
105
  }
@@ -30,7 +30,7 @@ A **behavioural** test asserts what the target **does** when invoked: its tool-c
30
30
 
31
31
  A **structural** test asserts what the target's source **says**: that a string appears in `SKILL.md`, that a frontmatter field has a particular value, that a section heading is present.
32
32
 
33
- Behavioural is the default per ADR-052. Structural is permitted only with documented justification (Surface 1: env-var skip; Surface 2: in-file justification comment).
33
+ Per ADR-052 (Option 1A — Behavioural-only, 2026-06-09 amendment) **behavioural is the only permitted kind**. STRUCTURAL is a **failing** classification: structural assertions on prose-document content (`SKILL.md`, `agent.md`, `*.proposed.md`, `*.accepted.md`, `*.superseded.md`, `RISK-POLICY.md`, `CLAUDE.md`, and similar prose contracts) are not permitted under any justification. There is no escape hatch — no env-var skip, no in-file justification comment. A test that cannot yet be expressed behaviourally because the harness lacks a primitive does NOT ship as structural; it BLOCKS on the relevant Layer B harness-gap ticket (P324 / P176 / P012-descendants) and ships only once the primitive lands.
34
34
 
35
35
  ## Detection method
36
36
 
@@ -39,13 +39,10 @@ Read the full test source. For each test case:
39
39
  1. Identify the assertion target (the `run` invocation, the `expect(...)`, the `assert ...`, the `Then` step).
40
40
  2. Trace the target back to its data source.
41
41
  3. Classify:
42
- - **STRUCTURAL** — assertion's data source reduces to "string X appears in (or is absent from) prose document Y" where Y is `SKILL.md` / `agent.md` / `*.proposed.md` / `*.accepted.md` / `*.superseded.md` / `RISK-POLICY.md` / `CLAUDE.md` / similar prose contracts.
43
- - **BEHAVIOURAL** — assertion observes target invocation outputs (stdout / stderr / return value / promise resolution), exit codes, written artefacts (final filesystem state), captured tool-calls (mock invocation parameters), or final state of an externally-observable system.
44
- - **STRUCTURAL-PERMITTED** — assertion is structural BUT the target is one of ADR-005's preserved permitted exceptions: `hooks.json` content checks, file-existence / file-removed checks, hook-script safety-construct presence (e.g. `set -euo pipefail`).
42
+ - **STRUCTURAL** (failing) — assertion's data source reduces to "string X appears in (or is absent from) prose document Y" where Y is `SKILL.md` / `agent.md` / `*.proposed.md` / `*.accepted.md` / `*.superseded.md` / `RISK-POLICY.md` / `CLAUDE.md` / similar prose contracts.
43
+ - **BEHAVIOURAL** — assertion observes target invocation outputs (stdout / stderr / return value / promise resolution), exit codes, written artefacts (final filesystem state), captured tool-calls (mock invocation parameters), or final state of an externally-observable system. ADR-005's **preserved exceptions** also classify as BEHAVIOURAL, not failing-STRUCTURAL: `hooks.json` content checks, file-existence / file-removed checks, and hook-script safety-construct presence (e.g. `set -euo pipefail`) on executable bash under `hooks/`. These observe artefact / executable / filesystem state rather than prose-document content, so ADR-052's narrowing leaves them permitted (ADR-052 retains ADR-005's hook-testing exceptions).
45
44
 
46
- If the test file contains the comment `tdd-review: structural-permitted (justification: …)` (any case), treat ALL its structural assertions as STRUCTURAL-JUSTIFIED. Recognise both `# tdd-review: …` (bash / pytest / cucumber) and `// tdd-review: …` (vitest / jest / mocha).
47
-
48
- If a single test file mixes structural and behavioural test cases without a justification comment, the file-level verdict is MIXED. Per-test-case classification appears in the evidence array.
45
+ If a single test file mixes structural and behavioural test cases, the file-level verdict is MIXED. Per-test-case classification appears in the evidence array.
49
46
 
50
47
  If the file's intent is genuinely unclear (e.g. test cases that read a config file but assert on the parsed result rather than the raw text), emit `verdict: "unclear"` rather than guessing. Populate evidence and suggestion fields so a reader can resolve the ambiguity.
51
48
 
@@ -56,7 +53,7 @@ You will be classifying test files written in bats, vitest, jest, mocha, cucumbe
56
53
  ### bats
57
54
 
58
55
  ```bash
59
- # STRUCTURAL — asserts SKILL.md prose
56
+ # STRUCTURAL (failing) — asserts SKILL.md prose
60
57
  @test "skill cites P081" {
61
58
  run grep -F "P081" "$SKILL_MD"
62
59
  [ "$status" -eq 0 ]
@@ -70,7 +67,8 @@ You will be classifying test files written in bats, vitest, jest, mocha, cucumbe
70
67
  [[ "$output" == *"BLOCKED"* ]]
71
68
  }
72
69
 
73
- # STRUCTURAL-PERMITTED — hook safety-construct on executable bash
70
+ # BEHAVIOURAL (ADR-005 preserved exception) — hook safety-construct on
71
+ # executable bash; observes the executable artefact's state, not prose.
74
72
  @test "hook prologue sets euo pipefail" {
75
73
  run grep -nE '^set -[eo]+u?[eo]*' "$HOOK"
76
74
  [ "$status" -eq 0 ]
@@ -80,7 +78,7 @@ You will be classifying test files written in bats, vitest, jest, mocha, cucumbe
80
78
  ### vitest / jest / mocha
81
79
 
82
80
  ```js
83
- // STRUCTURAL — asserts SKILL.md prose
81
+ // STRUCTURAL (failing) — asserts SKILL.md prose
84
82
  expect(readFileSync('SKILL.md', 'utf8')).toContain('Step 5');
85
83
 
86
84
  // BEHAVIOURAL — exercises the skill, asserts on result
@@ -93,7 +91,7 @@ expect(result.toolCalls).toMatchObject([
93
91
  ### cucumber / .feature
94
92
 
95
93
  ```gherkin
96
- # STRUCTURAL — Then-step that greps a doc
94
+ # STRUCTURAL (failing) — Then-step that greps a doc
97
95
  Then the SKILL.md should contain "Step 4a Verification"
98
96
 
99
97
  # BEHAVIOURAL — Then-step asserting on captured world state
@@ -103,7 +101,7 @@ Then the skill should call AskUserQuestion with options ["amend", "supersede", "
103
101
  ### pytest
104
102
 
105
103
  ```python
106
- # STRUCTURAL — reads prose document
104
+ # STRUCTURAL (failing) — reads prose document
107
105
  assert "Step 5" in open("SKILL.md").read()
108
106
 
109
107
  # BEHAVIOURAL — exercises target, asserts on artefact
@@ -113,10 +111,8 @@ assert result.artefact_state == expected_tree
113
111
 
114
112
  ### Cross-framework heuristics
115
113
 
116
- - **STRUCTURAL signals**: assertion data flow `read_file(prose_doc)` → `contains(...)`; `readFileSync` / `cat` / `grep -F` / `grep -nE` against a `*.md` / `*.proposed.md` / `agent.md` / `SKILL.md` path.
117
- - **BEHAVIOURAL signals**: subprocess invocation (`bash`, `node`, `python -m`); function call returning a captured tool-call sequence; assertions on `status` / `exit_code` / `stdout` / `stderr` / `output` / `artefact_state` / `result.toolCalls` / `world.lastOutput` / mock call counts.
118
- - **STRUCTURAL-PERMITTED signals**: target is `hooks.json` content; file-existence / removal checks (`[ -f ... ]` / `[ ! -f ... ]` / `existsSync` / `os.path.exists`); shebang / safety-construct prologue greps on executable bash files (paths under `hooks/` ending `.sh`).
119
- - **STRUCTURAL-JUSTIFIED signals**: in-file comment `tdd-review: structural-permitted (justification: …)` linking a P012-descendant ticket ID.
114
+ - **STRUCTURAL signals** (failing): assertion data flow `read_file(prose_doc)` → `contains(...)`; `readFileSync` / `cat` / `grep -F` / `grep -nE` against a `*.md` / `*.proposed.md` / `agent.md` / `SKILL.md` path.
115
+ - **BEHAVIOURAL signals**: subprocess invocation (`bash`, `node`, `python -m`); function call returning a captured tool-call sequence; assertions on `status` / `exit_code` / `stdout` / `stderr` / `output` / `artefact_state` / `result.toolCalls` / `world.lastOutput` / mock call counts. Also ADR-005's preserved exceptions: `hooks.json` content; file-existence / removal checks (`[ -f ... ]` / `[ ! -f ... ]` / `existsSync` / `os.path.exists`); shebang / safety-construct prologue greps on executable bash files (paths under `hooks/` ending `.sh`).
120
116
 
121
117
  ## Verdict shape
122
118
 
@@ -124,7 +120,7 @@ Emit your verdict as a JSON object inside a fenced code block at the end of your
124
120
 
125
121
  ```json
126
122
  {
127
- "verdict": "structural" | "behavioural" | "mixed" | "structural-permitted" | "structural-justified" | "unclear",
123
+ "verdict": "structural" | "behavioural" | "mixed" | "unclear",
128
124
  "evidence": [
129
125
  { "test_name": "skill cites P081", "line": 12, "why": "asserts grep -F on SKILL.md prose" }
130
126
  ],
@@ -135,21 +131,15 @@ Emit your verdict as a JSON object inside a fenced code block at the end of your
135
131
 
136
132
  ### Field rules
137
133
 
138
- - **verdict** — one of the six enum values. The file-level verdict; per-test-case classifications belong in evidence.
134
+ - **verdict** — one of the four enum values (`structural` / `behavioural` / `mixed` / `unclear`). The file-level verdict; per-test-case classifications belong in evidence. STRUCTURAL is a failing classification, not a permitted one.
139
135
  - **evidence** — array of `{test_name, line, why}` objects, one per non-trivial classification. For BEHAVIOURAL files this may be empty or omit per-case detail.
140
136
  - **suggestion** — a behavioural alternative the test author can adapt. Concrete (name a specific assertion shape, not "write better tests"). Empty string when verdict is BEHAVIOURAL.
141
- - **harness_gap** — the ticket ID (`P012` / `P081-followup` / a new `PNNN`) of the harness primitive whose absence forces the structural assertion. Per [ADR-026](../../../docs/decisions/026-agent-output-grounding.proposed.md) grounding rules, this MUST be either a specific ticket ID OR `null`. **Never emit free-text speculation** (e.g. `"a Skill-tool interceptor would help"`) without a ticket citation. If you can't cite a ticket, emit `null`.
137
+ - **harness_gap** — the ticket ID (`P012` / `P324` / a new `PNNN`) of the harness primitive whose absence forces the structural assertion. Per [ADR-026](../../../docs/decisions/026-agent-output-grounding.proposed.md) grounding rules, this MUST be either a specific ticket ID OR `null`. **Never emit free-text speculation** (e.g. `"a Skill-tool interceptor would help"`) without a ticket citation. If you can't cite a ticket, emit `null`. When you emit `verdict: "structural"` with a non-null `harness_gap`, the test BLOCKS on that ticket — it does not ship as structural with a permission marker.
142
138
 
143
139
  ### When the file has no test cases
144
140
 
145
141
  If the file is empty or contains only setup/teardown, emit `verdict: "unclear"` with `evidence: []` and `suggestion: "File contains no test cases — add @test / it() / Scenario: / def test_..."`. Do not classify as structural-by-default.
146
142
 
147
- ## Escape-hatch recognition
148
-
149
- When the file contains the comment `tdd-review: structural-permitted (justification: …)` (or `// tdd-review: …`), emit `verdict: "structural-justified"` and report the cited ticket in `harness_gap` (parse the ticket ID from the justification text). The agent does not second-guess the justification — surfacing the verdict is the job.
150
-
151
- If the justification text does NOT cite a ticket ID (e.g. the comment is `tdd-review: structural-permitted (justification: TODO)`), emit `verdict: "structural-justified"` with `harness_gap: null` AND populate `suggestion` with a reminder to link a specific ticket per ADR-052's grounding requirement. Do not auto-promote to STRUCTURAL — the comment is the operator's deviation approval; the agent's role is to surface the missing citation, not to override the deviation.
152
-
153
143
  ## Input handling
154
144
 
155
145
  You will be given a test file path (or paths). Read the full file before classifying. If the prompt names a target source-under-test, also read it briefly to ground the suggestion (e.g. "for skill X delegating via Skill tool: simulate invocation and assert the Skill-tool call carries the expected target + arguments"). Do not load broader package context — JTBD-001 60-second budget applies.
@@ -167,5 +157,5 @@ Per [ADR-013](../../../docs/decisions/013-structured-user-interaction-for-govern
167
157
  - You run as a mechanical / silent classification stage per the project CLAUDE.md (P132 inverse-P078 carve-out). You MUST NOT call `AskUserQuestion` even when classification is genuinely ambiguous; emit `verdict: "unclear"` and let the main agent escalate at retro time.
168
158
  - You classify across frameworks: bats, vitest, jest, mocha, cucumber/.feature, pytest. Recognise the shape of each.
169
159
  - You ground every `harness_gap` claim in a specific ticket ID per ADR-026, OR emit `null`. Free-text harness-gap speculation is forbidden.
170
- - You respect ADR-005's preserved permitted exceptions: `hooks.json` content checks, file-existence / file-removed checks, and hook-script safety-construct presence on executable bash. Classify these as STRUCTURAL-PERMITTED, not STRUCTURAL.
171
- - You respect the in-file justification comment as a per-file deviation approval (ADR-044 category 2). Surface it as STRUCTURAL-JUSTIFIED; do not override.
160
+ - Behavioural is the only permitted kind (ADR-052 Option 1A). STRUCTURAL — a structural assertion on prose-document content — is a failing classification; there is no escape hatch.
161
+ - You respect ADR-005's preserved exceptions (`hooks.json` content checks; file-existence / file-removed checks; hook-script safety-construct presence on executable bash under `hooks/`) by classifying them as BEHAVIOURAL, not failing-STRUCTURAL: they observe artefact / executable / filesystem state, not prose-document content.
@@ -7,10 +7,12 @@ import { execSync } from "node:child_process";
7
7
 
8
8
  const MARKETPLACE_REPO = "windyroad/agent-plugins";
9
9
  const MARKETPLACE_NAME = "windyroad";
10
+ const CODEX_MARKETPLACE_PATH = ".";
11
+ const CODEX_MARKETPLACE_NAME = "windyroad-local";
10
12
 
11
13
  let _dryRun = false;
12
14
 
13
- export { MARKETPLACE_REPO, MARKETPLACE_NAME };
15
+ export { MARKETPLACE_REPO, MARKETPLACE_NAME, CODEX_MARKETPLACE_PATH, CODEX_MARKETPLACE_NAME };
14
16
 
15
17
  export function setDryRun(value) {
16
18
  _dryRun = value;
@@ -35,16 +37,34 @@ export function run(cmd, label) {
35
37
  }
36
38
  }
37
39
 
38
- export function checkPrerequisites() {
40
+ function runtimesFor(runtime = "claude") {
41
+ if (runtime === "both") return ["claude", "codex"];
42
+ return [runtime];
43
+ }
44
+
45
+ export function checkPrerequisites({ runtime = "claude" } = {}) {
39
46
  if (_dryRun) return;
40
47
 
41
- try {
42
- execSync("claude --version", { stdio: "pipe" });
43
- } catch {
44
- console.error(
45
- "Error: 'claude' CLI not found. Install Claude Code first:\n https://docs.anthropic.com/en/docs/claude-code\n"
46
- );
47
- process.exit(1);
48
+ for (const currentRuntime of runtimesFor(runtime)) {
49
+ if (currentRuntime === "claude") {
50
+ try {
51
+ execSync("claude --version", { stdio: "pipe" });
52
+ } catch {
53
+ console.error(
54
+ "Error: 'claude' CLI not found. Install Claude Code first:\n https://docs.anthropic.com/en/docs/claude-code\n"
55
+ );
56
+ process.exit(1);
57
+ }
58
+ } else if (currentRuntime === "codex") {
59
+ try {
60
+ execSync("codex --version", { stdio: "pipe" });
61
+ } catch {
62
+ console.error(
63
+ "Error: 'codex' CLI not found. Install Codex CLI first:\n https://developers.openai.com/codex\n"
64
+ );
65
+ process.exit(1);
66
+ }
67
+ }
48
68
  }
49
69
  }
50
70
 
@@ -55,6 +75,13 @@ export function addMarketplace() {
55
75
  );
56
76
  }
57
77
 
78
+ export function addCodexMarketplace() {
79
+ return run(
80
+ `codex plugin marketplace add ${CODEX_MARKETPLACE_PATH}`,
81
+ `Codex marketplace: ${CODEX_MARKETPLACE_NAME}`
82
+ );
83
+ }
84
+
58
85
  export function installPlugin(pluginName, { scope = "project" } = {}) {
59
86
  return run(
60
87
  `claude plugin install ${pluginName}@${MARKETPLACE_NAME} --scope ${scope}`,
@@ -62,6 +89,13 @@ export function installPlugin(pluginName, { scope = "project" } = {}) {
62
89
  );
63
90
  }
64
91
 
92
+ export function installCodexPlugin(pluginName) {
93
+ return run(
94
+ `codex plugin add ${pluginName}@${CODEX_MARKETPLACE_NAME}`,
95
+ pluginName
96
+ );
97
+ }
98
+
65
99
  export function updatePlugin(pluginName, { scope = "project" } = {}) {
66
100
  return run(
67
101
  `claude plugin update "${pluginName}@${MARKETPLACE_NAME}" --scope ${scope}`,
@@ -69,18 +103,36 @@ export function updatePlugin(pluginName, { scope = "project" } = {}) {
69
103
  );
70
104
  }
71
105
 
106
+ export function updateCodexMarketplace() {
107
+ return run(
108
+ `codex plugin marketplace add ${CODEX_MARKETPLACE_PATH}`,
109
+ `Codex marketplace: ${CODEX_MARKETPLACE_NAME}`
110
+ );
111
+ }
112
+
72
113
  export function uninstallPlugin(pluginName) {
73
114
  return run(`claude plugin uninstall ${pluginName}`, `Removing ${pluginName}`);
74
115
  }
75
116
 
117
+ export function uninstallCodexPlugin(pluginName) {
118
+ return run(`codex plugin remove ${pluginName}`, `Removing ${pluginName}`);
119
+ }
120
+
76
121
  /**
77
122
  * Install a single package: marketplace add + plugin install.
78
123
  */
79
- export function installPackage(pluginName, { deps = [], scope = "project" } = {}) {
124
+ export function installPackage(pluginName, { deps = [], scope = "project", runtime = "claude" } = {}) {
80
125
  console.log(`\nInstalling @windyroad/${pluginName.replace("wr-", "")} (${scope} scope)...\n`);
81
126
 
82
- addMarketplace();
83
- installPlugin(pluginName, { scope });
127
+ if (runtime === "claude" || runtime === "both") {
128
+ addMarketplace();
129
+ installPlugin(pluginName, { scope });
130
+ }
131
+
132
+ if (runtime === "codex" || runtime === "both") {
133
+ addCodexMarketplace();
134
+ installCodexPlugin(pluginName);
135
+ }
84
136
 
85
137
  if (deps.length > 0) {
86
138
  console.log(`\nNote: This plugin works best with:`);
@@ -90,34 +142,47 @@ export function installPackage(pluginName, { deps = [], scope = "project" } = {}
90
142
  }
91
143
 
92
144
  console.log(
93
- `\nDone! Restart Claude Code to activate.\n`
145
+ `\nDone! Restart ${runtime === "codex" ? "Codex" : runtime === "both" ? "Claude Code and Codex" : "Claude Code"} to activate.\n`
94
146
  );
95
147
  }
96
148
 
97
149
  /**
98
150
  * Update a single package.
99
151
  */
100
- export function updatePackage(pluginName, { scope = "project" } = {}) {
152
+ export function updatePackage(pluginName, { scope = "project", runtime = "claude" } = {}) {
101
153
  console.log(`\nUpdating @windyroad/${pluginName.replace("wr-", "")}...\n`);
102
154
 
103
- run(
104
- `claude plugin marketplace update ${MARKETPLACE_NAME}`,
105
- "Updating marketplace"
106
- );
107
- updatePlugin(pluginName, { scope });
155
+ if (runtime === "claude" || runtime === "both") {
156
+ run(
157
+ `claude plugin marketplace update ${MARKETPLACE_NAME}`,
158
+ "Updating marketplace"
159
+ );
160
+ updatePlugin(pluginName, { scope });
161
+ }
108
162
 
109
- console.log("\nDone! Restart Claude Code to apply updates.\n");
163
+ if (runtime === "codex" || runtime === "both") {
164
+ updateCodexMarketplace();
165
+ installCodexPlugin(pluginName);
166
+ }
167
+
168
+ console.log(`\nDone! Restart ${runtime === "codex" ? "Codex" : runtime === "both" ? "Claude Code and Codex" : "Claude Code"} to apply updates.\n`);
110
169
  }
111
170
 
112
171
  /**
113
172
  * Uninstall a single package.
114
173
  */
115
- export function uninstallPackage(pluginName) {
174
+ export function uninstallPackage(pluginName, { runtime = "claude" } = {}) {
116
175
  console.log(`\nUninstalling @windyroad/${pluginName.replace("wr-", "")}...\n`);
117
176
 
118
- uninstallPlugin(pluginName);
177
+ if (runtime === "claude" || runtime === "both") {
178
+ uninstallPlugin(pluginName);
179
+ }
180
+
181
+ if (runtime === "codex" || runtime === "both") {
182
+ uninstallCodexPlugin(pluginName);
183
+ }
119
184
 
120
- console.log("\nDone. Restart Claude Code to apply changes.\n");
185
+ console.log(`\nDone. Restart ${runtime === "codex" ? "Codex" : runtime === "both" ? "Claude Code and Codex" : "Claude Code"} to apply changes.\n`);
121
186
  }
122
187
 
123
188
  /**
@@ -131,6 +196,7 @@ export function parseStandardArgs(argv) {
131
196
  update: args.includes("--update"),
132
197
  dryRun: args.includes("--dry-run"),
133
198
  scope: "project",
199
+ runtime: "claude",
134
200
  };
135
201
  const scopeIdx = args.indexOf("--scope");
136
202
  if (scopeIdx !== -1 && args[scopeIdx + 1]) {
@@ -142,5 +208,15 @@ export function parseStandardArgs(argv) {
142
208
  process.exit(1);
143
209
  }
144
210
  }
211
+ const runtimeIdx = args.indexOf("--runtime");
212
+ if (runtimeIdx !== -1 && args[runtimeIdx + 1]) {
213
+ const val = args[runtimeIdx + 1];
214
+ if (["claude", "codex", "both"].includes(val)) {
215
+ flags.runtime = val;
216
+ } else {
217
+ console.error("--runtime requires: claude, codex, or both");
218
+ process.exit(1);
219
+ }
220
+ }
145
221
  return flags;
146
222
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@windyroad/tdd",
3
- "version": "0.4.4",
3
+ "version": "0.4.5-preview.809",
4
4
  "description": "TDD state machine enforcement (Red-Green-Refactor cycle)",
5
5
  "bin": {
6
6
  "windyroad-tdd": "./bin/install.mjs"
@@ -24,6 +24,7 @@
24
24
  "hooks/",
25
25
  "skills/",
26
26
  ".claude-plugin/",
27
- "lib/"
27
+ "lib/",
28
+ "!agents/eval/"
28
29
  ]
29
30
  }