npm - codebyplan - Versions diffs - 1.13.55 → 1.13.56 - Mend

codebyplan 1.13.55 → 1.13.56

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/cli.js +64 -13
package/package.json +1 -1
package/templates/agents/cbp-e2e-maestro.md +97 -8
package/templates/agents/cbp-e2e-playwright.md +118 -15
package/templates/agents/cbp-verify-reviewer.md +8 -0
package/templates/context/testing/e2e.md +43 -2
package/templates/github-workflows/ci.yml +19 -14
package/templates/rules/e2e-mandatory.md +21 -0
package/templates/rules/two-tier-ci.md +14 -6

package/dist/cli.js CHANGED Viewed

@@ -39,7 +39,7 @@ var VERSION, PACKAGE_NAME;
 var init_version = __esm({
   "src/lib/version.ts"() {
     "use strict";
-    VERSION = "1.13.55";
+    VERSION = "1.13.56";
     PACKAGE_NAME = "codebyplan";
   }
 });
@@ -31701,6 +31701,16 @@ function detectPnpmVersionFromPackageJson(projectDir) {
     return "10";
   }
 }
+function detectStrictEnforcedFromCiJson(projectDir) {
+  try {
+    const ciJsonPath = path9.join(projectDir, ".codebyplan", "ci.json");
+    const raw = fs9.readFileSync(ciJsonPath, "utf-8");
+    const parsed = JSON.parse(raw);
+    return parsed.workflow?.strict_check_enforced === true;
+  } catch {
+    return false;
+  }
+}
 async function runScaffoldCiWorkflow(opts) {
   await Promise.resolve();
   const dryRun = opts?.dryRun ?? false;
@@ -31708,6 +31718,7 @@ async function runScaffoldCiWorkflow(opts) {
   const projectDir = path9.resolve(opts?.projectDir ?? process.cwd());
   const pnpmVersion = opts?.pnpmVersion ?? detectPnpmVersionFromPackageJson(projectDir);
   const nodeVersion = opts?.nodeVersion ?? "22";
+  const strictEnforced = opts?.strictEnforced ?? detectStrictEnforcedFromCiJson(projectDir);
   const templatesDir = opts?.templatesDir ?? resolveTemplatesDir();
   const templatePath = path9.join(templatesDir, "github-workflows", "ci.yml");
   if (!fs9.existsSync(templatePath)) {
@@ -31718,7 +31729,9 @@ async function runScaffoldCiWorkflow(opts) {
   const rawTemplate = fs9.readFileSync(templatePath, "utf-8");
   const renderedContent = substituteTokens(rawTemplate, {
     PNPM_VERSION: pnpmVersion,
-    NODE_VERSION: nodeVersion
+    NODE_VERSION: nodeVersion,
+    STRICT_NAME_SUFFIX: strictEnforced ? "" : " (report-only)",
+    STRICT_CONTINUE_ON_ERROR_LINE: strictEnforced ? "" : "\n    continue-on-error: true"
   });
   const targetPath = path9.join(projectDir, ".github", "workflows", "ci.yml");
   if (dryRun) {
@@ -31963,7 +31976,7 @@ function parseFlagsFromArgs2(args) {
 }
 function printHelp3() {
   process.stdout.write(
-    '\n  codebyplan ci\n\n  CI configuration management \u2014 detect platforms, scaffold workflow, enforce required check.\n\n  Subcommands:\n    init               Detect platforms and write/update .codebyplan/ci.json\n    scaffold-workflow  Write .github/workflows/ci.yml from the bundled template\n    enforce-check      Enforce the required CI status check on a GitHub branch\n    resolve <category>   Resolve the shell command for a CI check category (unit_test|typecheck|build|lint|e2e|audit)\n\n  Flags (all subcommands):\n    --dry-run            Preview the operation without writing any files\n    --force              Overwrite existing content that differs\n    --project-dir <p>    Target project root (default: current directory)\n    --json               Emit structured JSON to stdout\n\n  Flags (scaffold-workflow only):\n    --pnpm-version <v>   pnpm version for {{PNPM_VERSION}} token (default: auto-detected from package.json packageManager, falls back to "10")\n    --node-version <v>   Node.js version for {{NODE_VERSION}} token (default: "22")\n\n  Flags (enforce-check only):\n    --branch <b>         Branch to enforce (default: "main")\n    --check-name <n>     Status check name (default: "Lint + typecheck + test + build")\n\n  Flags (resolve only):\n    --platform <slug>    Platform slug to resolve against (e.g. next_js, nestjs, package)\n    --json               Emit the full CiResolveResult as JSON\n\n'
+    '\n  codebyplan ci\n\n  CI configuration management \u2014 detect platforms, scaffold workflow, enforce required check.\n\n  Subcommands:\n    init               Detect platforms and write/update .codebyplan/ci.json\n    scaffold-workflow  Write .github/workflows/ci.yml from the bundled template\n    enforce-check      Enforce the required CI status check on a GitHub branch\n    resolve <category>   Resolve the shell command for a CI check category (unit_test|typecheck|build|lint|e2e|audit)\n\n  Flags (all subcommands):\n    --dry-run            Preview the operation without writing any files\n    --force              Overwrite existing content that differs\n    --project-dir <p>    Target project root (default: current directory)\n    --json               Emit structured JSON to stdout\n\n  Flags (scaffold-workflow only):\n    --pnpm-version <v>   pnpm version for {{PNPM_VERSION}} token (default: auto-detected from package.json packageManager, falls back to "10")\n    --node-version <v>   Node.js version for {{NODE_VERSION}} token (default: "22")\n    --strict-enforced    Emit ci-strict as an enforced gate (continue-on-error:false, no\n                         report-only suffix). Default: read .codebyplan/ci.json\n                         workflow.strict_check_enforced (falls back to report-only)\n\n  Flags (enforce-check only):\n    --branch <b>         Branch to enforce (default: "main")\n    --check-name <n>     Status check name (default: "Lint + typecheck + test + build")\n\n  Flags (resolve only):\n    --platform <slug>    Platform slug to resolve against (e.g. next_js, nestjs, package)\n    --json               Emit the full CiResolveResult as JSON\n\n'
   );
 }
 function printInitResult(result) {
@@ -32062,6 +32075,7 @@ async function runCiCommand(args) {
   if (subcommand === "scaffold-workflow") {
     const pnpmVersion = flags["pnpm-version"];
     const nodeVersion = flags["node-version"];
+    const strictEnforced = flags["strict-enforced"] ? true : void 0;
     let result;
     try {
       result = await runScaffoldCiWorkflow({
@@ -32069,7 +32083,8 @@ async function runCiCommand(args) {
         force,
         projectDir,
         pnpmVersion,
-        nodeVersion
+        nodeVersion,
+        strictEnforced
       });
     } catch (err) {
       process.stderr.write(
@@ -36023,9 +36038,10 @@ function resolveTurboBin(projectRoot) {
   if (existsSync14(workspaceRootBin)) return workspaceRootBin;
   return TURBO_NOT_FOUND_SENTINEL;
 }
-function runTurboWithSummary(task, projectRoot, spawnFn) {
+function runTurboWithSummary(task, projectRoot, spawnFn, concurrency) {
   const turboBin = resolveTurboBin(projectRoot);
-  const command = `${turboBin} run ${task} --summarize`;
+  const concurrencyFlag = concurrency !== void 0 ? ` --concurrency=${concurrency}` : "";
+  const command = `${turboBin} run ${task}${concurrencyFlag} --summarize`;
   let spawnResult;
   try {
     spawnResult = spawnFn(command, { cwd: projectRoot });
@@ -36059,6 +36075,7 @@ function runCheck(opts) {
     spawnFn = defaultSpawnFn,
     updateBaseline: updateBaselineOpt = false,
     noBaseline = false,
+    concurrency,
     loadBaselineFn = loadBaseline,
     saveBaselineFn = saveBaseline
   } = opts;
@@ -36107,7 +36124,7 @@ function runCheck(opts) {
       spawnResult,
       failingPackages,
       command: lintCommand
-    } = runTurboWithSummary("lint", projectRoot, spawnFn);
+    } = runTurboWithSummary("lint", projectRoot, spawnFn, concurrency);
     currentFailing.lint = failingPackages;
     const newFailures = resolveNewFailures(
       "lint",
@@ -36132,7 +36149,7 @@ function runCheck(opts) {
       spawnResult,
       failingPackages,
       command: typecheckCommand
-    } = runTurboWithSummary("typecheck", projectRoot, spawnFn);
+    } = runTurboWithSummary("typecheck", projectRoot, spawnFn, concurrency);
     currentFailing.typecheck = failingPackages;
     const newFailures = resolveNewFailures(
       "typecheck",
@@ -36157,7 +36174,7 @@ function runCheck(opts) {
       spawnResult,
       failingPackages,
       command: testsCommand
-    } = runTurboWithSummary("test", projectRoot, spawnFn);
+    } = runTurboWithSummary("test", projectRoot, spawnFn, concurrency);
     currentFailing.tests = failingPackages;
     const newFailures = resolveNewFailures(
       "tests",
@@ -36358,9 +36375,24 @@ function parseCheckArgs(args) {
   let files;
   let updateBaseline = false;
   let noBaseline = false;
+  let concurrency;
   for (let i = 0; i < args.length; i++) {
     const arg = args[i];
-    if (arg === "--scope") {
+    if (arg === "--concurrency") {
+      const val = args[i + 1];
+      const parsed = val !== void 0 ? Number(val) : NaN;
+      if (Number.isInteger(parsed) && parsed > 0) {
+        concurrency = parsed;
+        i++;
+      } else {
+        process.stderr.write(
+          `check: --concurrency value must be a positive integer (got: '${val ?? ""}').
+`
+        );
+        process.exitCode = 1;
+        return null;
+      }
+    } else if (arg === "--scope") {
       const val = args[i + 1];
       if (val === "round" || val === "task" || val === "merged") {
         scope = val;
@@ -36392,7 +36424,7 @@ function parseCheckArgs(args) {
       }
     }
   }
-  return { scope, json, files, updateBaseline, noBaseline };
+  return { scope, json, files, updateBaseline, noBaseline, concurrency };
 }
 function emitTable(result) {
   const strict = result.no_baseline === true;
@@ -36465,13 +36497,14 @@ function runCheckCommand(args) {
   if (parsed === null) {
     return;
   }
-  const { scope, json, files, updateBaseline, noBaseline } = parsed;
+  const { scope, json, files, updateBaseline, noBaseline, concurrency } = parsed;
   const result = runCheck({
     scope,
     changedFiles: files,
     // NO-OP in whole-repo mode; notice emitted by runCheck
     updateBaseline,
-    noBaseline
+    noBaseline,
+    concurrency
   });
   if (json) {
     process.stdout.write(JSON.stringify(result, null, 2) + "\n");
@@ -41006,6 +41039,24 @@ function verifyRound(input) {
         }
       }
     }
+    if (output.status === "completed") {
+      const consoleErrors = output.console_errors;
+      if (Array.isArray(consoleErrors) && consoleErrors.length > 0) {
+        failed_checks.push({
+          check: "console_errors_reported",
+          framework,
+          detail: `Framework "${framework}" completed but reported ${consoleErrors.length} console/page error(s) \u2014 a completed run must be console-clean per rules/e2e-mandatory.md.`
+        });
+      }
+      const violations = output.a11y?.violations;
+      if (Array.isArray(violations) && violations.length > 0) {
+        failed_checks.push({
+          check: "a11y_violations_reported",
+          framework,
+          detail: `Framework "${framework}" completed but reported ${violations.length} accessibility violation(s) \u2014 fix in-scope or classify as category 'a11y' failures per rules/e2e-mandatory.md.`
+        });
+      }
+    }
   }
   return {
     pass: failed_checks.length === 0,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "codebyplan",
-  "version": "1.13.55",
+  "version": "1.13.56",
   "description": "CLI for CodeByPlan — AI-powered development planning and tracking",
   "type": "module",
   "bin": {

package/templates/agents/cbp-e2e-maestro.md CHANGED Viewed

@@ -132,7 +132,8 @@ One subdirectory per app module. Shared flows under `_shared/`. Probe under `_pr
 ## Spec-Writing Patterns
-**One flow per screen/feature.** Steps:
+**One flow per screen/feature.** A flow that only taps and asserts visibility is NOT done —
+prove behavior:
 ```yaml
 appId: ${APP_ID}
@@ -141,15 +142,97 @@ tags:
 ---
 - runFlow: _shared/login.yaml
 - assertVisible: "Dashboard"
-- takeScreenshot: "dashboard-loaded"
-- tapOn: "Create"
+- waitForAnimationToEnd
+- assertNoDefectsWithAI:       # AI visual-defect check — see AI Assertions below
+    optional: false
+- takeScreenshot: "dashboard-loaded"   # NEW states only — see Visual Baselines
+- tapOn:
+    text: "Create"
+    enabled: true              # state selector — waits for interactivity, catches broken gating
 - assertVisible: "New item"
-- takeScreenshot: "create-modal-open"
 ```
 Use text-based targeting first (`tapOn: "Button"`); use testID when ambiguous
-(`tapOn: { id: "btn" }`). For CRUD: create + verify visible; edit + verify updated;
-delete + confirm + verify removed.
+(`tapOn: { id: "btn" }`). `text`/`id` are REGEX by default — escape `$` and `[`; quote
+`YES`/`NO`/`ON`/`OFF` (unquoted they parse as YAML booleans).
+**Assertion depth requirements**:
+- **State selectors prove logic**: `enabled`, `checked`, `focused`, `selected` — e.g. assert
+  Submit is `enabled: false` before required fields are filled, `enabled: true` after.
+- **Data round-trips** via `copyTextFrom` + `assertTrue`: copy the value on screen A
+  (snapshot into `output.*` via `evalScript` before the next copy overwrites
+  `maestro.copiedText`), navigate, assert screen B shows the same value.
+- **Persistence proof** for create/edit flows — after the UI reports success, verify via
+  `runScript` `http.get` against the backend API (`json()` parse + `assertTrue` on the
+  field), or at minimum kill + relaunch and re-assert:
+  ```yaml
+  - killApp
+  - launchApp: { stopApp: false }
+  - assertVisible: ${output.createdTitle}
+  ```
+- For CRUD: create + verify (round-trip); edit + verify updated; delete + confirm + verify
+  removed.
+## Visual Baselines (assertScreenshot)
+Committed PNGs under `e2e/screenshots/maestro/` are BASELINES, not run artifacts.
+- **New state** (`git ls-files --error-unmatch <path>` exits non-zero): `waitForAnimationToEnd`,
+  then `takeScreenshot: "{flow}-{state}"` and `git add` the PNG (auto-new model).
+- **Existing baseline**: do NOT retake/overwrite. Assert against it:
+  ```yaml
+  - waitForAnimationToEnd
+  - assertScreenshot:
+      path: e2e/screenshots/maestro/{flow}-{state}.png
+      thresholdPercentage: 95
+  ```
+  On failure classify `visual_regression`: capture the live screen under a transient
+  diagnostic name (`{flow}-{state}-actual`, written to `--test-output-dir`), report it in
+  `screenshots[]`, and NEVER overwrite the committed baseline. The user accepts the change at
+  `/cbp-verify`; only then is the baseline re-captured and re-added.
+- `baseline_diff_pct` stays `null` (Maestro reports threshold pass/fail, not a percentage);
+  set `is_new` per git tracking as before.
+## AI Assertions (assertNoDefectsWithAI / assertWithAI)
+Maestro's AI commands screenshot the current screen and detect rendering defects (cut-off
+text, overlapping elements, mis-centered content). Run `assertNoDefectsWithAI` at every
+primary screen state; use `assertWithAI` for states that selectors can't express:
+```yaml
+- assertNoDefectsWithAI:
+    optional: false
+- assertWithAI:
+    assertion: The 6-digit verification input is visible with all six boxes empty.
+    optional: false
+```
+**Critical**: AI commands default to `optional: true` (warn-only — a detected defect does
+NOT fail the flow). ALWAYS set `optional: false`.
+**Auth preflight (Step 6.5.1 addition)**: AI commands require Maestro auth — a `maestro login`
+session or `MAESTRO_CLOUD_API_KEY` (a free account suffices; the legacy `MAESTRO_CLI_AI_KEY`
+BYO-key path is retired). Probe before authoring AI steps. When unavailable, ask the user once
+(provide key / skip AI), record `ai_checks: 'unavailable'` in the output, omit AI commands,
+and rely on `assertScreenshot` baselines — never let an AI step fail a run on a missing key.
+AI artifacts (`ai-report-*.html`, `ai-*.json`) land under `--test-output-dir`; reference them
+in `critical_issues[].reason` when a defect is found.
+## Anti-Patterns
+- `waitForAnimationToEnd` is NOT an assertion — it succeeds even on timeout; always pair it
+  with a real assert or screenshot.
+- Don't wrap whole flows in `retry` (hides product flakiness); bound `repeat` loops with
+  `times` + `while` together.
+- No `point:` coordinate taps — device-dependent; combine attribute + relational selectors instead.
+- Don't max out timeouts ("60s everywhere") — defaults catch performance regressions.
+- Platform limits: `back` is Android/Web only; airplane-mode commands are Android-only;
+  Android `inputText` is ASCII-only; system biometric/HealthKit dialogs need XCUITest.
 ## Screenshot Capture
@@ -161,7 +244,9 @@ Screenshots written to `e2e/screenshots/maestro/` (via `screenshotsDir` in `conf
 Committed path convention: `e2e/screenshots/maestro/{flow}-{state}.png` (repo root).
 This path is intentionally outside `apps/web/e2e/screenshots/` (which is gitignored).
-After the flow completes, `git add e2e/screenshots/maestro/` to track new PNGs.
+After the flow completes, `git add` each NEW PNG individually — never `git add` the whole
+directory (that silently stages drifted baselines; existing states are gated by
+`assertScreenshot`, see Visual Baselines).
 **`is_new` detection**: `git ls-files --error-unmatch <path>` exits non-zero → `is_new: true`.
@@ -186,9 +271,13 @@ Include this in the specialist output alongside `screenshots[]`.
 ## Run Command
 ```bash
-maestro test maestro/flows/{module}/{flow}.yaml --format=junit --output maestro/results.xml
+maestro test maestro/flows/{module}/{flow}.yaml --format junit --output maestro/results.xml \
+  --test-output-dir maestro/output
 ```
+`maestro/output/` holds transient diagnostics (AI reports, `-actual` regression captures) —
+gitignore it; committed baselines live only under `e2e/screenshots/maestro/`.
 ## pnpm Scripts
 ```json

package/templates/agents/cbp-e2e-playwright.md CHANGED Viewed

@@ -21,7 +21,7 @@ accordingly.
 ## Install
 ```bash
-pnpm add -D @playwright/test
+pnpm add -D @playwright/test @axe-core/playwright
 pnpm exec playwright install chromium
 # CI with system deps:
 pnpm exec playwright install --with-deps chromium
@@ -265,32 +265,123 @@ port from `.codebyplan/server.local.json` (worktree overlay, checked first) then
 `.codebyplan/server.json` (committed base). On mismatch ask which is correct, then propose
 an Edit to align them.
+## Quality Fixture (MANDATORY)
+`apps/{app}/e2e/fixtures.ts` — the single `test` source for ALL specs. It auto-enforces the
+console-clean mandate (an `{ auto: true }` fixture runs in every test with zero per-spec
+opt-in) and provides the axe builder. Create it if absent; when touching an existing spec
+that still imports from `@playwright/test`, migrate its import.
+```ts
+import { test as base, expect } from "@playwright/test";
+import AxeBuilder from "@axe-core/playwright";
+// Known, triaged errors only — every entry needs a comment linking its fix task.
+const ALLOWED_CONSOLE: RegExp[] = [];
+type QualityFixtures = {
+  consoleGuard: void;
+  makeAxeBuilder: () => AxeBuilder;
+};
+export const test = base.extend<QualityFixtures>({
+  consoleGuard: [
+    async ({ page, baseURL }, use) => {
+      const errors: string[] = [];
+      page.on("console", (msg) => {
+        if (msg.type() === "error" && !ALLOWED_CONSOLE.some((re) => re.test(msg.text())))
+          errors.push(`console.error: ${msg.text()}`);
+      });
+      page.on("pageerror", (err) => errors.push(`pageerror: ${err.message}`));
+      page.on("requestfailed", (req) => {
+        // Own-origin, non-aborted failures only (cancelled prefetches are noise)
+        if (baseURL && req.url().startsWith(baseURL) && req.failure()?.errorText !== "net::ERR_ABORTED")
+          errors.push(`requestfailed: ${req.method()} ${req.url()} — ${req.failure()?.errorText}`);
+      });
+      await use();
+      expect(errors, "console/page errors captured during test").toEqual([]);
+    },
+    { auto: true },
+  ],
+  makeAxeBuilder: async ({ page }, use) => {
+    await use(() => new AxeBuilder({ page }).withTags(["wcag2a", "wcag2aa", "wcag21a", "wcag21aa"]));
+  },
+});
+export { expect };
+```
+Collected errors from failing tests feed the `console_errors[]` output field (see Output
+Additions below).
 ## Spec-Writing Patterns
-**One spec file per page/flow.** Mandatory per spec:
+**One spec file per page/flow.** Specs import `{ test, expect }` from the quality fixture
+(`./fixtures` or relative path) — NEVER directly from `@playwright/test`.
-- Smoke test: loads, title correct, no console errors.
-- Primary user flow: main interaction.
+Mandatory per spec — a spec that only proves elements are visible is NOT done:
+- Smoke test: loads, title correct (the console guard fails it on any console/page error).
+- Primary user flow: main interaction **with a behavior proof** (below).
 - Visual regression: `toHaveScreenshot` at every primary state.
+- Structure: `toMatchAriaSnapshot` on the primary state — catches hierarchy/label/role
+  breakage without pixel fragility.
+- Accessibility: one axe scan per page state, zero violations.
+### Functional Proof (mutations)
-For forms: fill + submit + verify success; validation errors.
-For CRUD: create + verify; edit + verify; delete + confirm + verify.
+Every flow that mutates state MUST prove the mutation happened — asserting the optimistic UI
+is not proof:
 ```ts
-import { test, expect } from "@playwright/test";
+// 1. Prove the API call succeeded
+const resp = page.waitForResponse((r) => r.url().includes("/api/items") && r.request().method() === "POST");
+await page.getByRole("button", { name: "Create" }).click();
+expect((await resp).status()).toBeLessThan(400);
+// 2. Prove persistence — reload and re-assert (or poll the API for eventual consistency)
+await page.reload();
+await expect(page.getByRole("listitem").filter({ hasText: itemName })).toBeVisible();
+// await expect.poll(async () => (await page.request.get(`/api/items/${id}`)).status()).toBe(200);
+```
-test.describe("Home page", () => {
-  test.beforeEach(async ({ page }) => {
-    await page.goto("/");
-  });
+### Error-State Proof (forms / CRUD)
-  test("loads and shows heading", async ({ page }) => {
-    await expect(page.getByRole("heading", { level: 1 })).toBeVisible();
-    await expect(page).toHaveScreenshot("home-loaded.png", { maxDiffPixelRatio: 0.001 });
-  });
+At least one test per form/CRUD spec injects a failure and asserts the rendered error UI —
+error paths are where untested UIs break in production:
+```ts
+await page.route("**/api/items", (r) => r.fulfill({ status: 500 }));
+await page.getByRole("button", { name: "Create" }).click();
+await expect(page.getByRole("alert")).toContainText(/failed|went wrong/i);
+```
+### Permission / RLS Proof
+When the route is role-gated, include one denial test (lower-privilege storage state or
+seeded non-member): assert the explicit denial UI or redirect — a blank render is a bug,
+not a pass.
+### Accessibility Scan
+```ts
+test("a11y: dashboard has no WCAG A/AA violations", async ({ page, makeAxeBuilder }) => {
+  await page.goto("/dashboard");
+  const results = await makeAxeBuilder().analyze();
+  expect(results.violations).toEqual([]);
 });
 ```
+Known issues are excluded via `.disableRules([...])` with a comment linking the fix task —
+never by deleting the scan.
+### Anti-Patterns (reject in review)
+- `page.waitForTimeout(...)` — web-first assertions auto-retry; hard sleeps mask races.
+- `expect(await locator.isVisible()).toBe(true)` — one-shot, no retry; use `await expect(locator).toBeVisible()`.
+- `.nth(n)` / `.first()` positional selection — except the documented SCSS-module fallback.
+- In-spec env skips (`test.skip(!process.env.X, ...)`) — forbidden per `rules/e2e-mandatory.md`.
+- Visibility-only assertions after a mutation — see Functional Proof.
 ## Screenshot Capture
 **Baseline regression** (preferred):
@@ -332,6 +423,18 @@ when the playwright.config project/device emulation indicates a mobile viewport
 Include this in the specialist output alongside `screenshots[]`.
+## Output Additions (Playwright)
+Beyond the shared contract, ALWAYS report:
+- `console_errors[]` — every entry the console guard collected on failed tests
+  (`{test_name, type: 'console' | 'pageerror' | 'requestfailed', text}`). Empty array on a
+  clean run — never omit the field.
+- `a11y` — `{scanned_pages: string[], violations: [{rule, impact, page}]}` aggregated from
+  the axe scans. A `status: 'completed'` output with non-empty `violations` is inconsistent —
+  fix in-scope or classify the failures as category `a11y`; `codebyplan e2e verify-round`
+  hard-fails the inconsistency.
 ## Run Command
 ```bash

package/templates/agents/cbp-verify-reviewer.md CHANGED Viewed

@@ -202,6 +202,14 @@ The deterministic e2e gate (`codebyplan e2e verify-round`) and the unit/lint/typ
 here). If the diff touches an e2e-eligible UI surface, note it in `summary` so the orchestrator
 confirms its gate ran — but do not assert a build/test result this agent did not run.
+E2E verdict gates (refuse `READY` per `rules/e2e-mandatory.md`): a zero-assertion run
+(`passed === 0 && skipped > 0` on a touched path); an empty `e2e_gallery[]` when the round
+touched UI for an eligible framework (sole exception: `vscode-test`-only rounds with explicit
+`e2e_gallery: []`); a `status: 'completed'` e2e output carrying non-empty `console_errors[]`
+or `a11y.violations[]`. Treat a `{type: 'shallow_coverage'}` critical issue on a mutation
+surface as a real finding (visibility-only specs prove rendering, not behavior) — severity
+`medium` minimum, routed to a follow-up round.
 ### Phase 6: Build Findings, Verdict & Routing
 Assign severity by impact: `critical` (runtime error / data corruption / security), `high`

package/templates/context/testing/e2e.md CHANGED Viewed

@@ -54,7 +54,7 @@ output:
       - test_name: string
         error: string
         file: string
-        category: 'env' | 'auth' | 'access' | 'flake' | 'real' | 'visual_regression'
+        category: 'env' | 'auth' | 'access' | 'flake' | 'real' | 'visual_regression' | 'console_error' | 'a11y'
         classification_reason: string
   framework_configured: boolean
   preflight:
@@ -77,6 +77,17 @@ output:
       committed_path: string             # repo-relative; MUST be git-tracked after the run
       is_new: boolean                    # true => no prior baseline; auto-captured+committed this run
       baseline_diff_pct: number | null   # null for non-playwright frameworks
+  console_errors:                        # REQUIRED for playwright (empty array on a clean run);
+    - test_name: string                  # console_errors itself is null/omitted for frameworks without console capture
+      type: 'console' | 'pageerror' | 'requestfailed'
+      text: string
+  a11y:                                  # REQUIRED for playwright; null/omitted otherwise
+    scanned_pages: string[]
+    violations:
+      - rule: string                     # axe rule id (e.g. color-contrast)
+        impact: string                   # critical | serious | moderate | minor
+        page: string
+  ai_checks: 'ran' | 'unavailable' | null  # maestro only — AI assertion availability (see agent body)
   user_interactions: [{question, answer}]
   tech_stack_reconciliation:
     db_framework: string | null
@@ -177,12 +188,32 @@ For each failed test, assign exactly one category:
 | `auth` | Login-page redirect, 401 after credential submit, `invalid_grant`, `email_not_confirmed` | AskUserQuestion per Step 6.5.3 |
 | `access` | 403/404 on an accessible route, RLS denial text, missing seed data | AskUserQuestion: "Test failed with access error: `{error}`. Options: (1) fix + reply steps, (2) abort." |
 | `flake` | Timeout on first run, passes on immediate retry, network jitter | Retry up to 3 times before reclassifying to `real` |
-| `visual_regression` | `toHaveScreenshot` pixel-diff exceeded threshold | Do NOT retry. Include baseline + actual paths in `screenshots[]` with `baseline_diff_pct`. Do NOT auto-accept baselines. |
+| `visual_regression` | `toHaveScreenshot` / `assertScreenshot` diff exceeded threshold | Do NOT retry. Include baseline + actual paths in `screenshots[]` with `baseline_diff_pct`. Do NOT auto-accept baselines. |
+| `console_error` | Console guard collected console/page/request errors during the flow | App defect — fix in-scope or report; never allowlist without a linked fix task |
+| `a11y` | Axe scan reported WCAG A/AA violations | Do NOT retry. Report rule ids in `a11y.violations`; fix in-scope or surface at `/cbp-verify` |
 | `real` | Assertion failure on app behavior (wrong text, state, navigation) | Attempt fix (selector, timeout, assertion), max 3 attempts, then report |
 `env`, `auth`, `access` failures MUST NOT count toward `test_results.failed` until
 preflight passes — they block the run instead.
+## Functional Assertion Mandate
+Visibility-only specs are NOT sufficient coverage — they prove rendering, not behavior.
+Every spec/flow covering a mutation (create / edit / delete / submit) MUST include at
+least one behavior proof:
+- **network success proof** — response-status assertion on the mutating call
+  (`waitForResponse` in Playwright; `runScript` `http.*` in Maestro), AND/OR
+- **persistence proof** — reload / kill-and-relaunch / direct API re-read showing the
+  change survived, PLUS
+- **one error-state test per form/CRUD surface** — inject a failure (`page.route` 500 in
+  Playwright) and assert the rendered error UI.
+When a suite's assertions are entirely visibility/navigation-level, the specialist MUST
+report `critical_issues[]` entry `{type: 'shallow_coverage', ...}` — the run may pass, but
+the gap is flagged for the next round. `cbp-verify-reviewer` treats `shallow_coverage` on a
+mutation surface as a finding, not noise.
 ## Committed-Screenshot Mandate
 Every eligible e2e run MUST persist relevant screenshots to the framework's committed
@@ -215,6 +246,11 @@ classify as `visual_regression`. Do NOT auto-update. Surface as a blocking accep
 at `/cbp-verify` (round scope). The user must explicitly approve (`--update-snapshots`) or open a
 fix task. This relaxes the prior always-manual contract ONLY for new screens.
+The model applies to ALL screenshot-capable frameworks, not just Playwright: Maestro gates
+existing baselines with `assertScreenshot` against the committed PNG (the agent never
+retakes/overwrites an existing baseline; acceptance = re-capture + `git add` after user
+approval at `/cbp-verify`).
 ## Screenshot Collection Rule
 After every run, enumerate all committed PNGs and populate BOTH `screenshots[]` and
@@ -242,6 +278,11 @@ New-screen auto-capture (above) is the only exception to the always-manual contr
 - `tests_run === true`
 - `preflight.*.ok === true` for every required prerequisite
 - Every failure has `category` other than `env`, `auth`, or `access`
+- `console_errors[]` is empty and `a11y.violations[]` is empty (where the framework reports
+  them — Playwright always does). Non-empty values with `status: 'completed'` are
+  inconsistent and hard-fail `codebyplan e2e verify-round` (`console_errors_reported`,
+  `a11y_violations_reported`); either fix in-scope or return `status: 'failed'` with the
+  matching failure category.
 Otherwise return `status: 'failed'`.

package/templates/github-workflows/ci.yml CHANGED Viewed

@@ -17,9 +17,10 @@
 # Two jobs:
 #   ci         SOFT tier (authoritative required check) — the baseline-tolerant
 #              inner loop: lint, typecheck, test, build across the repo.
-#   ci-strict  HARDCORE tier (report-only) — whole-repo ABSOLUTE GREEN via
-#              `codebyplan check --scope merged --no-baseline`. Non-blocking for
-#              now; flip to a required check once the repo is absolute-green.
+#   ci-strict  HARDCORE tier — whole-repo ABSOLUTE GREEN via
+#              `codebyplan check --scope merged --no-baseline`. Report-only by
+#              default; set `workflow.strict_check_enforced: true` in
+#              `.codebyplan/ci.json` to make it a real gate (then enforce-check).
 name: CI
@@ -69,19 +70,21 @@ jobs:
       - name: Build
         run: pnpm turbo build
-  # ── HARDCORE strict tier (report-only) ──────────────────────────────────────
+  # ── HARDCORE strict tier ────────────────────────────────────────────────────
   # Whole-repo ABSOLUTE GREEN: `codebyplan check --scope merged --no-baseline`
   # ignores .check-baseline.json entirely, so ANY failing package (lint,
-  # typecheck, test) fails this job. This is the future checkpoint→main gate.
+  # typecheck, test) fails this job. This is the checkpoint→main gate.
   #
-  # report-only until apps/web baseline is burned down; flip to required after.
-  # `continue-on-error: true` keeps it non-blocking — the `ci` job above stays
-  # the authoritative required check. Do NOT wire this job as a branch-protection
-  # required check until the whole repo is absolute-green.
+  # report-only vs enforced is driven by `.codebyplan/ci.json`
+  # `workflow.strict_check_enforced` (scaffold-ci-workflow substitutes the
+  # tokens below): when false (default) the job name carries " (report-only)"
+  # and `continue-on-error: true` keeps it non-blocking; when true the suffix is
+  # dropped and `continue-on-error` is omitted (defaults to false), making it a real gate. Only flip the
+  # flag once the whole repo is absolute-green AND the job has run green in CI,
+  # then add it to branch protection via `codebyplan ci enforce-check`.
   ci-strict:
-    name: Strict whole-repo green (report-only)
-    runs-on: ubuntu-latest
-    continue-on-error: true
+    name: Strict whole-repo green{{STRICT_NAME_SUFFIX}}
+    runs-on: ubuntu-latest{{STRICT_CONTINUE_ON_ERROR_LINE}}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -112,10 +115,12 @@ jobs:
       # In the monorepo run the freshly-built bundle directly (the bin shim may
       # be missing because dist/cli.js did not exist at install time); in a
       # consumer repo that path is absent, so fall back to the installed bin.
+      # --concurrency=1 serializes turbo so the whole-repo matrix does not
+      # CPU-starve timing-sensitive test suites into flaky timeouts on the runner.
       - name: Strict check (no baseline)
         run: |
           if [ -f packages/codebyplan-package/dist/cli.js ]; then
-            node packages/codebyplan-package/dist/cli.js check --scope merged --no-baseline
+            node packages/codebyplan-package/dist/cli.js check --scope merged --no-baseline --concurrency=1
           else
-            pnpm exec codebyplan check --scope merged --no-baseline
+            pnpm exec codebyplan check --scope merged --no-baseline --concurrency=1
           fi

package/templates/rules/e2e-mandatory.md CHANGED Viewed

@@ -73,6 +73,27 @@ The sole exception is `vscode-test`: the committed dir may be empty when the ext
 has no visual output (behavior-only tests). Agents must still define the dir and report
 `e2e_gallery: []` explicitly — not omit the field.
+## Quality-Capture Mandates
+A green run that captured no quality signals is not evidence. Per framework:
+- **playwright**: every spec imports `test` from the shared quality fixture
+  (`e2e/fixtures.ts`) — console/pageerror guard auto-active in every test; one axe WCAG A/AA
+  scan per page state. The output MUST carry `console_errors[]` (empty on clean) and `a11y`
+  per `context/testing/e2e.md`.
+- **maestro**: existing committed screenshots are baselines — gate them with
+  `assertScreenshot` (never retake/overwrite); run `assertNoDefectsWithAI` with
+  `optional: false` at primary states when Maestro auth is available (the default
+  `optional: true` is warn-only and forbidden; record `ai_checks: 'unavailable'` when auth
+  is absent).
+- A `status: 'completed'` output carrying non-empty `console_errors[]` or
+  `a11y.violations[]` is inconsistent — `codebyplan e2e verify-round` hard-fails it
+  (`console_errors_reported`, `a11y_violations_reported`).
+Mutation flows MUST carry a behavior proof per `context/testing/e2e.md` § Functional
+Assertion Mandate (network success proof / persistence proof / error-state test);
+visibility-only suites are flagged `{type: 'shallow_coverage'}` in `critical_issues[]`.
 ## Cross-References
 - `context/testing/e2e.md` — Input/Output contract, pre-flight loop, failure classification,

package/templates/rules/two-tier-ci.md CHANGED Viewed

@@ -47,13 +47,21 @@ The branch model is **feat→main direct**; `.codebyplan/git.json` has `integrat
 IS the per-checkpoint feat branch. The hardcore tier runs against that feat branch's merged
 state before it lands on main; do not assume a staging/integration hop exists.
-## Report-Only Rollout
+## Strict-Tier Enforcement (report-only ⇄ enforced)
-The whole-repo hardcore CI **job** lands **report-only first** (`continue-on-error: true`) and is
-flipped to a required check ONLY after the `apps/web` baseline is burned down. Until then,
-`--scope merged --no-baseline` is advisory in CI — surfaced, not enforced — so a pre-existing
-`apps/web` red does not block a merge while the baseline is still being paid down. Locally,
-`cbp-verify` still runs and reports it.
+The whole-repo hardcore CI **job** (`ci-strict`) is config-driven via `.codebyplan/ci.json`
+`workflow.strict_check_enforced`, which `codebyplan ci scaffold-workflow` substitutes into the
+generated `.github/workflows/ci.yml`:
+- **`false` (default)** — report-only: the job carries the " (report-only)" name suffix and
+  `continue-on-error: true`, so `--scope merged --no-baseline` is advisory in CI — surfaced, not
+  enforced. A repo whose baseline is still red keeps merging while it pays the baseline down.
+- **`true`** — enforced: the suffix is dropped and `continue-on-error` is omitted (defaults to
+  `false`), making the job a real gate. Flip ONLY after the whole repo is absolute-green AND the
+  job has already run green in CI, then wire the check name `Strict whole-repo green` into branch
+  protection via `codebyplan ci enforce-check --check-name "Strict whole-repo green"`.
+Locally, `cbp-verify` runs and reports the same check regardless of the flag.
 ## Cross-References