codebyplan 1.13.55 → 1.13.56

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -39,7 +39,7 @@ var VERSION, PACKAGE_NAME;
39
39
  var init_version = __esm({
40
40
  "src/lib/version.ts"() {
41
41
  "use strict";
42
- VERSION = "1.13.55";
42
+ VERSION = "1.13.56";
43
43
  PACKAGE_NAME = "codebyplan";
44
44
  }
45
45
  });
@@ -31701,6 +31701,16 @@ function detectPnpmVersionFromPackageJson(projectDir) {
31701
31701
  return "10";
31702
31702
  }
31703
31703
  }
31704
+ function detectStrictEnforcedFromCiJson(projectDir) {
31705
+ try {
31706
+ const ciJsonPath = path9.join(projectDir, ".codebyplan", "ci.json");
31707
+ const raw = fs9.readFileSync(ciJsonPath, "utf-8");
31708
+ const parsed = JSON.parse(raw);
31709
+ return parsed.workflow?.strict_check_enforced === true;
31710
+ } catch {
31711
+ return false;
31712
+ }
31713
+ }
31704
31714
  async function runScaffoldCiWorkflow(opts) {
31705
31715
  await Promise.resolve();
31706
31716
  const dryRun = opts?.dryRun ?? false;
@@ -31708,6 +31718,7 @@ async function runScaffoldCiWorkflow(opts) {
31708
31718
  const projectDir = path9.resolve(opts?.projectDir ?? process.cwd());
31709
31719
  const pnpmVersion = opts?.pnpmVersion ?? detectPnpmVersionFromPackageJson(projectDir);
31710
31720
  const nodeVersion = opts?.nodeVersion ?? "22";
31721
+ const strictEnforced = opts?.strictEnforced ?? detectStrictEnforcedFromCiJson(projectDir);
31711
31722
  const templatesDir = opts?.templatesDir ?? resolveTemplatesDir();
31712
31723
  const templatePath = path9.join(templatesDir, "github-workflows", "ci.yml");
31713
31724
  if (!fs9.existsSync(templatePath)) {
@@ -31718,7 +31729,9 @@ async function runScaffoldCiWorkflow(opts) {
31718
31729
  const rawTemplate = fs9.readFileSync(templatePath, "utf-8");
31719
31730
  const renderedContent = substituteTokens(rawTemplate, {
31720
31731
  PNPM_VERSION: pnpmVersion,
31721
- NODE_VERSION: nodeVersion
31732
+ NODE_VERSION: nodeVersion,
31733
+ STRICT_NAME_SUFFIX: strictEnforced ? "" : " (report-only)",
31734
+ STRICT_CONTINUE_ON_ERROR_LINE: strictEnforced ? "" : "\n continue-on-error: true"
31722
31735
  });
31723
31736
  const targetPath = path9.join(projectDir, ".github", "workflows", "ci.yml");
31724
31737
  if (dryRun) {
@@ -31963,7 +31976,7 @@ function parseFlagsFromArgs2(args) {
31963
31976
  }
31964
31977
  function printHelp3() {
31965
31978
  process.stdout.write(
31966
- '\n codebyplan ci\n\n CI configuration management \u2014 detect platforms, scaffold workflow, enforce required check.\n\n Subcommands:\n init Detect platforms and write/update .codebyplan/ci.json\n scaffold-workflow Write .github/workflows/ci.yml from the bundled template\n enforce-check Enforce the required CI status check on a GitHub branch\n resolve <category> Resolve the shell command for a CI check category (unit_test|typecheck|build|lint|e2e|audit)\n\n Flags (all subcommands):\n --dry-run Preview the operation without writing any files\n --force Overwrite existing content that differs\n --project-dir <p> Target project root (default: current directory)\n --json Emit structured JSON to stdout\n\n Flags (scaffold-workflow only):\n --pnpm-version <v> pnpm version for {{PNPM_VERSION}} token (default: auto-detected from package.json packageManager, falls back to "10")\n --node-version <v> Node.js version for {{NODE_VERSION}} token (default: "22")\n\n Flags (enforce-check only):\n --branch <b> Branch to enforce (default: "main")\n --check-name <n> Status check name (default: "Lint + typecheck + test + build")\n\n Flags (resolve only):\n --platform <slug> Platform slug to resolve against (e.g. next_js, nestjs, package)\n --json Emit the full CiResolveResult as JSON\n\n'
31979
+ '\n codebyplan ci\n\n CI configuration management \u2014 detect platforms, scaffold workflow, enforce required check.\n\n Subcommands:\n init Detect platforms and write/update .codebyplan/ci.json\n scaffold-workflow Write .github/workflows/ci.yml from the bundled template\n enforce-check Enforce the required CI status check on a GitHub branch\n resolve <category> Resolve the shell command for a CI check category (unit_test|typecheck|build|lint|e2e|audit)\n\n Flags (all subcommands):\n --dry-run Preview the operation without writing any files\n --force Overwrite existing content that differs\n --project-dir <p> Target project root (default: current directory)\n --json Emit structured JSON to stdout\n\n Flags (scaffold-workflow only):\n --pnpm-version <v> pnpm version for {{PNPM_VERSION}} token (default: auto-detected from package.json packageManager, falls back to "10")\n --node-version <v> Node.js version for {{NODE_VERSION}} token (default: "22")\n --strict-enforced Emit ci-strict as an enforced gate (continue-on-error:false, no\n report-only suffix). Default: read .codebyplan/ci.json\n workflow.strict_check_enforced (falls back to report-only)\n\n Flags (enforce-check only):\n --branch <b> Branch to enforce (default: "main")\n --check-name <n> Status check name (default: "Lint + typecheck + test + build")\n\n Flags (resolve only):\n --platform <slug> Platform slug to resolve against (e.g. next_js, nestjs, package)\n --json Emit the full CiResolveResult as JSON\n\n'
31967
31980
  );
31968
31981
  }
31969
31982
  function printInitResult(result) {
@@ -32062,6 +32075,7 @@ async function runCiCommand(args) {
32062
32075
  if (subcommand === "scaffold-workflow") {
32063
32076
  const pnpmVersion = flags["pnpm-version"];
32064
32077
  const nodeVersion = flags["node-version"];
32078
+ const strictEnforced = flags["strict-enforced"] ? true : void 0;
32065
32079
  let result;
32066
32080
  try {
32067
32081
  result = await runScaffoldCiWorkflow({
@@ -32069,7 +32083,8 @@ async function runCiCommand(args) {
32069
32083
  force,
32070
32084
  projectDir,
32071
32085
  pnpmVersion,
32072
- nodeVersion
32086
+ nodeVersion,
32087
+ strictEnforced
32073
32088
  });
32074
32089
  } catch (err) {
32075
32090
  process.stderr.write(
@@ -36023,9 +36038,10 @@ function resolveTurboBin(projectRoot) {
36023
36038
  if (existsSync14(workspaceRootBin)) return workspaceRootBin;
36024
36039
  return TURBO_NOT_FOUND_SENTINEL;
36025
36040
  }
36026
- function runTurboWithSummary(task, projectRoot, spawnFn) {
36041
+ function runTurboWithSummary(task, projectRoot, spawnFn, concurrency) {
36027
36042
  const turboBin = resolveTurboBin(projectRoot);
36028
- const command = `${turboBin} run ${task} --summarize`;
36043
+ const concurrencyFlag = concurrency !== void 0 ? ` --concurrency=${concurrency}` : "";
36044
+ const command = `${turboBin} run ${task}${concurrencyFlag} --summarize`;
36029
36045
  let spawnResult;
36030
36046
  try {
36031
36047
  spawnResult = spawnFn(command, { cwd: projectRoot });
@@ -36059,6 +36075,7 @@ function runCheck(opts) {
36059
36075
  spawnFn = defaultSpawnFn,
36060
36076
  updateBaseline: updateBaselineOpt = false,
36061
36077
  noBaseline = false,
36078
+ concurrency,
36062
36079
  loadBaselineFn = loadBaseline,
36063
36080
  saveBaselineFn = saveBaseline
36064
36081
  } = opts;
@@ -36107,7 +36124,7 @@ function runCheck(opts) {
36107
36124
  spawnResult,
36108
36125
  failingPackages,
36109
36126
  command: lintCommand
36110
- } = runTurboWithSummary("lint", projectRoot, spawnFn);
36127
+ } = runTurboWithSummary("lint", projectRoot, spawnFn, concurrency);
36111
36128
  currentFailing.lint = failingPackages;
36112
36129
  const newFailures = resolveNewFailures(
36113
36130
  "lint",
@@ -36132,7 +36149,7 @@ function runCheck(opts) {
36132
36149
  spawnResult,
36133
36150
  failingPackages,
36134
36151
  command: typecheckCommand
36135
- } = runTurboWithSummary("typecheck", projectRoot, spawnFn);
36152
+ } = runTurboWithSummary("typecheck", projectRoot, spawnFn, concurrency);
36136
36153
  currentFailing.typecheck = failingPackages;
36137
36154
  const newFailures = resolveNewFailures(
36138
36155
  "typecheck",
@@ -36157,7 +36174,7 @@ function runCheck(opts) {
36157
36174
  spawnResult,
36158
36175
  failingPackages,
36159
36176
  command: testsCommand
36160
- } = runTurboWithSummary("test", projectRoot, spawnFn);
36177
+ } = runTurboWithSummary("test", projectRoot, spawnFn, concurrency);
36161
36178
  currentFailing.tests = failingPackages;
36162
36179
  const newFailures = resolveNewFailures(
36163
36180
  "tests",
@@ -36358,9 +36375,24 @@ function parseCheckArgs(args) {
36358
36375
  let files;
36359
36376
  let updateBaseline = false;
36360
36377
  let noBaseline = false;
36378
+ let concurrency;
36361
36379
  for (let i = 0; i < args.length; i++) {
36362
36380
  const arg = args[i];
36363
- if (arg === "--scope") {
36381
+ if (arg === "--concurrency") {
36382
+ const val = args[i + 1];
36383
+ const parsed = val !== void 0 ? Number(val) : NaN;
36384
+ if (Number.isInteger(parsed) && parsed > 0) {
36385
+ concurrency = parsed;
36386
+ i++;
36387
+ } else {
36388
+ process.stderr.write(
36389
+ `check: --concurrency value must be a positive integer (got: '${val ?? ""}').
36390
+ `
36391
+ );
36392
+ process.exitCode = 1;
36393
+ return null;
36394
+ }
36395
+ } else if (arg === "--scope") {
36364
36396
  const val = args[i + 1];
36365
36397
  if (val === "round" || val === "task" || val === "merged") {
36366
36398
  scope = val;
@@ -36392,7 +36424,7 @@ function parseCheckArgs(args) {
36392
36424
  }
36393
36425
  }
36394
36426
  }
36395
- return { scope, json, files, updateBaseline, noBaseline };
36427
+ return { scope, json, files, updateBaseline, noBaseline, concurrency };
36396
36428
  }
36397
36429
  function emitTable(result) {
36398
36430
  const strict = result.no_baseline === true;
@@ -36465,13 +36497,14 @@ function runCheckCommand(args) {
36465
36497
  if (parsed === null) {
36466
36498
  return;
36467
36499
  }
36468
- const { scope, json, files, updateBaseline, noBaseline } = parsed;
36500
+ const { scope, json, files, updateBaseline, noBaseline, concurrency } = parsed;
36469
36501
  const result = runCheck({
36470
36502
  scope,
36471
36503
  changedFiles: files,
36472
36504
  // NO-OP in whole-repo mode; notice emitted by runCheck
36473
36505
  updateBaseline,
36474
- noBaseline
36506
+ noBaseline,
36507
+ concurrency
36475
36508
  });
36476
36509
  if (json) {
36477
36510
  process.stdout.write(JSON.stringify(result, null, 2) + "\n");
@@ -41006,6 +41039,24 @@ function verifyRound(input) {
41006
41039
  }
41007
41040
  }
41008
41041
  }
41042
+ if (output.status === "completed") {
41043
+ const consoleErrors = output.console_errors;
41044
+ if (Array.isArray(consoleErrors) && consoleErrors.length > 0) {
41045
+ failed_checks.push({
41046
+ check: "console_errors_reported",
41047
+ framework,
41048
+ detail: `Framework "${framework}" completed but reported ${consoleErrors.length} console/page error(s) \u2014 a completed run must be console-clean per rules/e2e-mandatory.md.`
41049
+ });
41050
+ }
41051
+ const violations = output.a11y?.violations;
41052
+ if (Array.isArray(violations) && violations.length > 0) {
41053
+ failed_checks.push({
41054
+ check: "a11y_violations_reported",
41055
+ framework,
41056
+ detail: `Framework "${framework}" completed but reported ${violations.length} accessibility violation(s) \u2014 fix in-scope or classify as category 'a11y' failures per rules/e2e-mandatory.md.`
41057
+ });
41058
+ }
41059
+ }
41009
41060
  }
41010
41061
  return {
41011
41062
  pass: failed_checks.length === 0,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codebyplan",
3
- "version": "1.13.55",
3
+ "version": "1.13.56",
4
4
  "description": "CLI for CodeByPlan — AI-powered development planning and tracking",
5
5
  "type": "module",
6
6
  "bin": {
@@ -132,7 +132,8 @@ One subdirectory per app module. Shared flows under `_shared/`. Probe under `_pr
132
132
 
133
133
  ## Spec-Writing Patterns
134
134
 
135
- **One flow per screen/feature.** Steps:
135
+ **One flow per screen/feature.** A flow that only taps and asserts visibility is NOT done —
136
+ prove behavior:
136
137
 
137
138
  ```yaml
138
139
  appId: ${APP_ID}
@@ -141,15 +142,97 @@ tags:
141
142
  ---
142
143
  - runFlow: _shared/login.yaml
143
144
  - assertVisible: "Dashboard"
144
- - takeScreenshot: "dashboard-loaded"
145
- - tapOn: "Create"
145
+ - waitForAnimationToEnd
146
+ - assertNoDefectsWithAI: # AI visual-defect check — see AI Assertions below
147
+ optional: false
148
+ - takeScreenshot: "dashboard-loaded" # NEW states only — see Visual Baselines
149
+ - tapOn:
150
+ text: "Create"
151
+ enabled: true # state selector — waits for interactivity, catches broken gating
146
152
  - assertVisible: "New item"
147
- - takeScreenshot: "create-modal-open"
148
153
  ```
149
154
 
150
155
  Use text-based targeting first (`tapOn: "Button"`); use testID when ambiguous
151
- (`tapOn: { id: "btn" }`). For CRUD: create + verify visible; edit + verify updated;
152
- delete + confirm + verify removed.
156
+ (`tapOn: { id: "btn" }`). `text`/`id` are REGEX by default — escape `$` and `[`; quote
157
+ `YES`/`NO`/`ON`/`OFF` (unquoted they parse as YAML booleans).
158
+
159
+ **Assertion depth requirements**:
160
+
161
+ - **State selectors prove logic**: `enabled`, `checked`, `focused`, `selected` — e.g. assert
162
+ Submit is `enabled: false` before required fields are filled, `enabled: true` after.
163
+ - **Data round-trips** via `copyTextFrom` + `assertTrue`: copy the value on screen A
164
+ (snapshot into `output.*` via `evalScript` before the next copy overwrites
165
+ `maestro.copiedText`), navigate, assert screen B shows the same value.
166
+ - **Persistence proof** for create/edit flows — after the UI reports success, verify via
167
+ `runScript` `http.get` against the backend API (`json()` parse + `assertTrue` on the
168
+ field), or at minimum kill + relaunch and re-assert:
169
+
170
+ ```yaml
171
+ - killApp
172
+ - launchApp: { stopApp: false }
173
+ - assertVisible: ${output.createdTitle}
174
+ ```
175
+
176
+ - For CRUD: create + verify (round-trip); edit + verify updated; delete + confirm + verify
177
+ removed.
178
+
179
+ ## Visual Baselines (assertScreenshot)
180
+
181
+ Committed PNGs under `e2e/screenshots/maestro/` are BASELINES, not run artifacts.
182
+
183
+ - **New state** (`git ls-files --error-unmatch <path>` exits non-zero): `waitForAnimationToEnd`,
184
+ then `takeScreenshot: "{flow}-{state}"` and `git add` the PNG (auto-new model).
185
+ - **Existing baseline**: do NOT retake/overwrite. Assert against it:
186
+
187
+ ```yaml
188
+ - waitForAnimationToEnd
189
+ - assertScreenshot:
190
+ path: e2e/screenshots/maestro/{flow}-{state}.png
191
+ thresholdPercentage: 95
192
+ ```
193
+
194
+ On failure classify `visual_regression`: capture the live screen under a transient
195
+ diagnostic name (`{flow}-{state}-actual`, written to `--test-output-dir`), report it in
196
+ `screenshots[]`, and NEVER overwrite the committed baseline. The user accepts the change at
197
+ `/cbp-verify`; only then is the baseline re-captured and re-added.
198
+ - `baseline_diff_pct` stays `null` (Maestro reports threshold pass/fail, not a percentage);
199
+ set `is_new` per git tracking as before.
200
+
201
+ ## AI Assertions (assertNoDefectsWithAI / assertWithAI)
202
+
203
+ Maestro's AI commands screenshot the current screen and detect rendering defects (cut-off
204
+ text, overlapping elements, mis-centered content). Run `assertNoDefectsWithAI` at every
205
+ primary screen state; use `assertWithAI` for states selectors can't express:
206
+
207
+ ```yaml
208
+ - assertNoDefectsWithAI:
209
+ optional: false
210
+ - assertWithAI:
211
+ assertion: The 6-digit verification input is visible with all six boxes empty.
212
+ optional: false
213
+ ```
214
+
215
+ **Critical**: AI commands default to `optional: true` (warn-only — a detected defect does
216
+ NOT fail the flow). ALWAYS set `optional: false`.
217
+
218
+ **Auth preflight (Step 6.5.1 addition)**: AI commands require Maestro auth — a `maestro login`
219
+ session or `MAESTRO_CLOUD_API_KEY` (a free account suffices; the legacy `MAESTRO_CLI_AI_KEY`
220
+ BYO-key path is retired). Probe before authoring AI steps. When unavailable, ask the user once
221
+ (provide key / skip AI), record `ai_checks: 'unavailable'` in the output, omit AI commands,
222
+ and rely on `assertScreenshot` baselines — never let an AI step fail a run on a missing key.
223
+ AI artifacts (`ai-report-*.html`, `ai-*.json`) land under `--test-output-dir`; reference them
224
+ in `critical_issues[].reason` when a defect is found.
225
+
226
+ ## Anti-Patterns
227
+
228
+ - `waitForAnimationToEnd` is NOT an assertion — it succeeds even on timeout; always pair it
229
+ with a real assert or screenshot.
230
+ - Don't wrap whole flows in `retry` (hides product flakiness); bound `repeat` loops with
231
+ `times` + `while` together.
232
+ - No `point:` coordinate taps — device-dependent; combine attribute + relational selectors instead.
233
+ - Don't max out timeouts ("60s everywhere") — defaults catch performance regressions.
234
+ - Platform limits: `back` is Android/Web only; airplane-mode commands are Android-only;
235
+ Android `inputText` is ASCII-only; system biometric/HealthKit dialogs need XCUITest.
153
236
 
154
237
  ## Screenshot Capture
155
238
 
@@ -161,7 +244,9 @@ Screenshots written to `e2e/screenshots/maestro/` (via `screenshotsDir` in `conf
161
244
  Committed path convention: `e2e/screenshots/maestro/{flow}-{state}.png` (repo root).
162
245
  This path is intentionally outside `apps/web/e2e/screenshots/` (which is gitignored).
163
246
 
164
- After the flow completes, `git add e2e/screenshots/maestro/` to track new PNGs.
247
+ After the flow completes, `git add` each NEW PNG individually — never `git add` the whole
248
+ directory (that silently stages drifted baselines; existing states are gated by
249
+ `assertScreenshot`, see Visual Baselines).
165
250
 
166
251
  **`is_new` detection**: `git ls-files --error-unmatch <path>` exits non-zero → `is_new: true`.
167
252
 
@@ -186,9 +271,13 @@ Include this in the specialist output alongside `screenshots[]`.
186
271
  ## Run Command
187
272
 
188
273
  ```bash
189
- maestro test maestro/flows/{module}/{flow}.yaml --format=junit --output maestro/results.xml
274
+ maestro test maestro/flows/{module}/{flow}.yaml --format junit --output maestro/results.xml \
275
+ --test-output-dir maestro/output
190
276
  ```
191
277
 
278
+ `maestro/output/` holds transient diagnostics (AI reports, `-actual` regression captures) —
279
+ gitignore it; committed baselines live only under `e2e/screenshots/maestro/`.
280
+
192
281
  ## pnpm Scripts
193
282
 
194
283
  ```json
@@ -21,7 +21,7 @@ accordingly.
21
21
  ## Install
22
22
 
23
23
  ```bash
24
- pnpm add -D @playwright/test
24
+ pnpm add -D @playwright/test @axe-core/playwright
25
25
  pnpm exec playwright install chromium
26
26
  # CI with system deps:
27
27
  pnpm exec playwright install --with-deps chromium
@@ -265,32 +265,123 @@ port from `.codebyplan/server.local.json` (worktree overlay, checked first) then
265
265
  `.codebyplan/server.json` (committed base). On mismatch ask which is correct, then propose
266
266
  an Edit to align them.
267
267
 
268
+ ## Quality Fixture (MANDATORY)
269
+
270
+ `apps/{app}/e2e/fixtures.ts` — the single `test` source for ALL specs. It auto-enforces the
271
+ console-clean mandate (an `{ auto: true }` fixture runs in every test with zero per-spec
272
+ opt-in) and provides the axe builder. Create it if absent; when touching an existing spec
273
+ that still imports from `@playwright/test`, migrate its import.
274
+
275
+ ```ts
276
+ import { test as base, expect } from "@playwright/test";
277
+ import AxeBuilder from "@axe-core/playwright";
278
+
279
+ // Known, triaged errors only — every entry needs a comment linking its fix task.
280
+ const ALLOWED_CONSOLE: RegExp[] = [];
281
+
282
+ type QualityFixtures = {
283
+ consoleGuard: void;
284
+ makeAxeBuilder: () => AxeBuilder;
285
+ };
286
+
287
+ export const test = base.extend<QualityFixtures>({
288
+ consoleGuard: [
289
+ async ({ page, baseURL }, use) => {
290
+ const errors: string[] = [];
291
+ page.on("console", (msg) => {
292
+ if (msg.type() === "error" && !ALLOWED_CONSOLE.some((re) => re.test(msg.text())))
293
+ errors.push(`console.error: ${msg.text()}`);
294
+ });
295
+ page.on("pageerror", (err) => errors.push(`pageerror: ${err.message}`));
296
+ page.on("requestfailed", (req) => {
297
+ // Own-origin, non-aborted failures only (cancelled prefetches are noise)
298
+ if (baseURL && req.url().startsWith(baseURL) && req.failure()?.errorText !== "net::ERR_ABORTED")
299
+ errors.push(`requestfailed: ${req.method()} ${req.url()} — ${req.failure()?.errorText}`);
300
+ });
301
+ await use();
302
+ expect(errors, "console/page errors captured during test").toEqual([]);
303
+ },
304
+ { auto: true },
305
+ ],
306
+ makeAxeBuilder: async ({ page }, use) => {
307
+ await use(() => new AxeBuilder({ page }).withTags(["wcag2a", "wcag2aa", "wcag21a", "wcag21aa"]));
308
+ },
309
+ });
310
+ export { expect };
311
+ ```
312
+
313
+ Collected errors from failing tests feed the `console_errors[]` output field (see Output
314
+ Additions below).
315
+
268
316
  ## Spec-Writing Patterns
269
317
 
270
- **One spec file per page/flow.** Mandatory per spec:
318
+ **One spec file per page/flow.** Specs import `{ test, expect }` from the quality fixture
319
+ (`./fixtures` or relative path) — NEVER directly from `@playwright/test`.
271
320
 
272
- - Smoke test: loads, title correct, no console errors.
273
- - Primary user flow: main interaction.
321
+ Mandatory per spec — a spec that only proves elements are visible is NOT done:
322
+
323
+ - Smoke test: loads, title correct (the console guard fails it on any console/page error).
324
+ - Primary user flow: main interaction **with a behavior proof** (below).
274
325
  - Visual regression: `toHaveScreenshot` at every primary state.
326
+ - Structure: `toMatchAriaSnapshot` on the primary state — catches hierarchy/label/role
327
+ breakage without pixel fragility.
328
+ - Accessibility: one axe scan per page state, zero violations.
329
+
330
+ ### Functional Proof (mutations)
275
331
 
276
- For forms: fill + submit + verify success; validation errors.
277
- For CRUD: create + verify; edit + verify; delete + confirm + verify.
332
+ Every flow that mutates state MUST prove the mutation happened — asserting the optimistic UI
333
+ is not proof:
278
334
 
279
335
  ```ts
280
- import { test, expect } from "@playwright/test";
336
+ // 1. Prove the API call succeeded
337
+ const resp = page.waitForResponse((r) => r.url().includes("/api/items") && r.request().method() === "POST");
338
+ await page.getByRole("button", { name: "Create" }).click();
339
+ expect((await resp).status()).toBeLessThan(400);
340
+
341
+ // 2. Prove persistence — reload and re-assert (or poll the API for eventual consistency)
342
+ await page.reload();
343
+ await expect(page.getByRole("listitem").filter({ hasText: itemName })).toBeVisible();
344
+ // await expect.poll(async () => (await page.request.get(`/api/items/${id}`)).status()).toBe(200);
345
+ ```
281
346
 
282
- test.describe("Home page", () => {
283
- test.beforeEach(async ({ page }) => {
284
- await page.goto("/");
285
- });
347
+ ### Error-State Proof (forms / CRUD)
286
348
 
287
- test("loads and shows heading", async ({ page }) => {
288
- await expect(page.getByRole("heading", { level: 1 })).toBeVisible();
289
- await expect(page).toHaveScreenshot("home-loaded.png", { maxDiffPixelRatio: 0.001 });
290
- });
349
+ At least one test per form/CRUD spec injects a failure and asserts the rendered error UI —
350
+ error paths are where untested UIs break in production:
351
+
352
+ ```ts
353
+ await page.route("**/api/items", (r) => r.fulfill({ status: 500 }));
354
+ await page.getByRole("button", { name: "Create" }).click();
355
+ await expect(page.getByRole("alert")).toContainText(/failed|went wrong/i);
356
+ ```
357
+
358
+ ### Permission / RLS Proof
359
+
360
+ When the route is role-gated, include one denial test (lower-privilege storage state or
361
+ seeded non-member): assert the explicit denial UI or redirect — a blank render is a bug,
362
+ not a pass.
363
+
364
+ ### Accessibility Scan
365
+
366
+ ```ts
367
+ test("a11y: dashboard has no WCAG A/AA violations", async ({ page, makeAxeBuilder }) => {
368
+ await page.goto("/dashboard");
369
+ const results = await makeAxeBuilder().analyze();
370
+ expect(results.violations).toEqual([]);
291
371
  });
292
372
  ```
293
373
 
374
+ Known issues are excluded via `.disableRules([...])` with a comment linking the fix task —
375
+ never by deleting the scan.
376
+
377
+ ### Anti-Patterns (reject in review)
378
+
379
+ - `page.waitForTimeout(...)` — web-first assertions auto-retry; hard sleeps mask races.
380
+ - `expect(await locator.isVisible()).toBe(true)` — one-shot, no retry; use `await expect(locator).toBeVisible()`.
381
+ - `.nth(n)` / `.first()` positional selection — except the documented SCSS-module fallback.
382
+ - In-spec env skips (`test.skip(!process.env.X, ...)`) — forbidden per `rules/e2e-mandatory.md`.
383
+ - Visibility-only assertions after a mutation — see Functional Proof.
384
+
294
385
  ## Screenshot Capture
295
386
 
296
387
  **Baseline regression** (preferred):
@@ -332,6 +423,18 @@ when the playwright.config project/device emulation indicates a mobile viewport
332
423
 
333
424
  Include this in the specialist output alongside `screenshots[]`.
334
425
 
426
+ ## Output Additions (Playwright)
427
+
428
+ Beyond the shared contract, ALWAYS report:
429
+
430
+ - `console_errors[]` — every entry the console guard collected on failed tests
431
+ (`{test_name, type: 'console' | 'pageerror' | 'requestfailed', text}`). Empty array on a
432
+ clean run — never omit the field.
433
+ - `a11y` — `{scanned_pages: string[], violations: [{rule, impact, page}]}` aggregated from
434
+ the axe scans. A `status: 'completed'` output with non-empty `violations` is inconsistent —
435
+ fix in-scope or classify the failures as category `a11y`; `codebyplan e2e verify-round`
436
+ hard-fails the inconsistency.
437
+
335
438
  ## Run Command
336
439
 
337
440
  ```bash
@@ -202,6 +202,14 @@ The deterministic e2e gate (`codebyplan e2e verify-round`) and the unit/lint/typ
202
202
  here). If the diff touches an e2e-eligible UI surface, note it in `summary` so the orchestrator
203
203
  confirms its gate ran — but do not assert a build/test result this agent did not run.
204
204
 
205
+ E2E verdict gates (refuse `READY` per `rules/e2e-mandatory.md`): a zero-assertion run
206
+ (`passed === 0 && skipped > 0` on a touched path); an empty `e2e_gallery[]` when the round
207
+ touched UI for an eligible framework (sole exception: `vscode-test`-only rounds with explicit
208
+ `e2e_gallery: []`); a `status: 'completed'` e2e output carrying non-empty `console_errors[]`
209
+ or `a11y.violations[]`. Treat a `{type: 'shallow_coverage'}` critical issue on a mutation
210
+ surface as a real finding (visibility-only specs prove rendering, not behavior) — severity
211
+ `medium` minimum, routed to a follow-up round.
212
+
205
213
  ### Phase 6: Build Findings, Verdict & Routing
206
214
 
207
215
  Assign severity by impact: `critical` (runtime error / data corruption / security), `high`
@@ -54,7 +54,7 @@ output:
54
54
  - test_name: string
55
55
  error: string
56
56
  file: string
57
- category: 'env' | 'auth' | 'access' | 'flake' | 'real' | 'visual_regression'
57
+ category: 'env' | 'auth' | 'access' | 'flake' | 'real' | 'visual_regression' | 'console_error' | 'a11y'
58
58
  classification_reason: string
59
59
  framework_configured: boolean
60
60
  preflight:
@@ -77,6 +77,17 @@ output:
77
77
  committed_path: string # repo-relative; MUST be git-tracked after the run
78
78
  is_new: boolean # true => no prior baseline; auto-captured+committed this run
79
79
  baseline_diff_pct: number | null # null for non-playwright frameworks
80
+ console_errors: # REQUIRED for playwright (empty array on a clean run);
81
+ - test_name: string # null/omitted for frameworks without console capture
82
+ type: 'console' | 'pageerror' | 'requestfailed'
83
+ text: string
84
+ a11y: # REQUIRED for playwright; null/omitted otherwise
85
+ scanned_pages: string[]
86
+ violations:
87
+ - rule: string # axe rule id (e.g. color-contrast)
88
+ impact: string # critical | serious | moderate | minor
89
+ page: string
90
+ ai_checks: 'ran' | 'unavailable' | null # maestro only — AI assertion availability (see agent body)
80
91
  user_interactions: [{question, answer}]
81
92
  tech_stack_reconciliation:
82
93
  db_framework: string | null
@@ -177,12 +188,32 @@ For each failed test, assign exactly one category:
177
188
  | `auth` | Login-page redirect, 401 after credential submit, `invalid_grant`, `email_not_confirmed` | AskUserQuestion per Step 6.5.3 |
178
189
  | `access` | 403/404 on an accessible route, RLS denial text, missing seed data | AskUserQuestion: "Test failed with access error: `{error}`. Options: (1) fix + reply steps, (2) abort." |
179
190
  | `flake` | Timeout on first run, passes on immediate retry, network jitter | Retry up to 3 times before reclassifying to `real` |
180
- | `visual_regression` | `toHaveScreenshot` pixel-diff exceeded threshold | Do NOT retry. Include baseline + actual paths in `screenshots[]` with `baseline_diff_pct`. Do NOT auto-accept baselines. |
191
+ | `visual_regression` | `toHaveScreenshot` / `assertScreenshot` diff exceeded threshold | Do NOT retry. Include baseline + actual paths in `screenshots[]` with `baseline_diff_pct`. Do NOT auto-accept baselines. |
192
+ | `console_error` | Console guard collected console/page/request errors during the flow | App defect — fix in-scope or report; never allowlist without a linked fix task |
193
+ | `a11y` | Axe scan reported WCAG A/AA violations | Do NOT retry. Report rule ids in `a11y.violations`; fix in-scope or surface at `/cbp-verify` |
181
194
  | `real` | Assertion failure on app behavior (wrong text, state, navigation) | Attempt fix (selector, timeout, assertion), max 3 attempts, then report |
182
195
 
183
196
  `env`, `auth`, `access` failures MUST NOT count toward `test_results.failed` until
184
197
  preflight passes — they block the run instead.
185
198
 
199
+ ## Functional Assertion Mandate
200
+
201
+ Visibility-only specs are NOT sufficient coverage — they prove rendering, not behavior.
202
+ Every spec/flow covering a mutation (create / edit / delete / submit) MUST include at
203
+ least one behavior proof:
204
+
205
+ - **network success proof** — response-status assertion on the mutating call
206
+ (`waitForResponse` in Playwright; `runScript` `http.*` in Maestro), AND/OR
207
+ - **persistence proof** — reload / kill-and-relaunch / direct API re-read showing the
208
+ change survived, PLUS
209
+ - **one error-state test per form/CRUD surface** — inject a failure (`page.route` 500 in
210
+ Playwright) and assert the rendered error UI.
211
+
212
+ When a suite's assertions are entirely visibility/navigation-level, the specialist MUST
213
+ report `critical_issues[]` entry `{type: 'shallow_coverage', ...}` — the run may pass, but
214
+ the gap is flagged for the next round. `cbp-verify-reviewer` treats `shallow_coverage` on a
215
+ mutation surface as a finding, not noise.
216
+
186
217
  ## Committed-Screenshot Mandate
187
218
 
188
219
  Every eligible e2e run MUST persist relevant screenshots to the framework's committed
@@ -215,6 +246,11 @@ classify as `visual_regression`. Do NOT auto-update. Surface as a blocking accep
215
246
  at `/cbp-verify` (round scope). The user must explicitly approve (`--update-snapshots`) or open a
216
247
  fix task. This relaxes the prior always-manual contract ONLY for new screens.
217
248
 
249
+ The model applies to ALL screenshot-capable frameworks, not just Playwright: Maestro gates
250
+ existing baselines with `assertScreenshot` against the committed PNG (the agent never
251
+ retakes/overwrites an existing baseline; acceptance = re-capture + `git add` after user
252
+ approval at `/cbp-verify`).
253
+
218
254
  ## Screenshot Collection Rule
219
255
 
220
256
  After every run, enumerate all committed PNGs and populate BOTH `screenshots[]` and
@@ -242,6 +278,11 @@ New-screen auto-capture (above) is the only exception to the always-manual contr
242
278
  - `tests_run === true`
243
279
  - `preflight.*.ok === true` for every required prerequisite
244
280
  - Every failure has `category` other than `env`, `auth`, or `access`
281
+ - `console_errors[]` is empty and `a11y.violations[]` is empty (where the framework reports
282
+ them — Playwright always does). Non-empty values with `status: 'completed'` are
283
+ inconsistent and hard-fail `codebyplan e2e verify-round` (`console_errors_reported`,
284
+ `a11y_violations_reported`); either fix in-scope or return `status: 'failed'` with the
285
+ matching failure category.
245
286
 
246
287
  Otherwise return `status: 'failed'`.
247
288
 
@@ -17,9 +17,10 @@
17
17
  # Two jobs:
18
18
  # ci SOFT tier (authoritative required check) — the baseline-tolerant
19
19
  # inner loop: lint, typecheck, test, build across the repo.
20
- # ci-strict HARDCORE tier (report-only) — whole-repo ABSOLUTE GREEN via
21
- # `codebyplan check --scope merged --no-baseline`. Non-blocking for
22
- # now; flip to a required check once the repo is absolute-green.
20
+ # ci-strict HARDCORE tier — whole-repo ABSOLUTE GREEN via
21
+ # `codebyplan check --scope merged --no-baseline`. Report-only by
22
+ # default; set `workflow.strict_check_enforced: true` in
23
+ # `.codebyplan/ci.json` to make it a real gate (then run `codebyplan ci enforce-check`).
23
24
 
24
25
  name: CI
25
26
 
@@ -69,19 +70,21 @@ jobs:
69
70
  - name: Build
70
71
  run: pnpm turbo build
71
72
 
72
- # ── HARDCORE strict tier (report-only) ──────────────────────────────────────
73
+ # ── HARDCORE strict tier ────────────────────────────────────────────────────
73
74
  # Whole-repo ABSOLUTE GREEN: `codebyplan check --scope merged --no-baseline`
74
75
  # ignores .check-baseline.json entirely, so ANY failing package (lint,
75
- # typecheck, test) fails this job. This is the future checkpoint→main gate.
76
+ # typecheck, test) fails this job. This is the checkpoint→main gate.
76
77
  #
77
- # report-only until apps/web baseline is burned down; flip to required after.
78
- # `continue-on-error: true` keeps it non-blocking the `ci` job above stays
79
- # the authoritative required check. Do NOT wire this job as a branch-protection
80
- # required check until the whole repo is absolute-green.
78
+ # report-only vs enforced is driven by `.codebyplan/ci.json`
79
+ # `workflow.strict_check_enforced` (scaffold-ci-workflow substitutes the
80
+ # tokens below): when false (default) the job name carries " (report-only)"
81
+ # and `continue-on-error: true` keeps it non-blocking; when true the suffix is
82
+ # dropped and `continue-on-error` is omitted (defaults to false), making it a real gate. Only flip the
83
+ # flag once the whole repo is absolute-green AND the job has run green in CI,
84
+ # then add it to branch protection via `codebyplan ci enforce-check`.
81
85
  ci-strict:
82
- name: Strict whole-repo green (report-only)
83
- runs-on: ubuntu-latest
84
- continue-on-error: true
86
+ name: Strict whole-repo green{{STRICT_NAME_SUFFIX}}
87
+ runs-on: ubuntu-latest{{STRICT_CONTINUE_ON_ERROR_LINE}}
85
88
  steps:
86
89
  - name: Checkout
87
90
  uses: actions/checkout@v4
@@ -112,10 +115,12 @@ jobs:
112
115
  # In the monorepo run the freshly-built bundle directly (the bin shim may
113
116
  # be missing because dist/cli.js did not exist at install time); in a
114
117
  # consumer repo that path is absent, so fall back to the installed bin.
118
+ # --concurrency=1 serializes turbo so the whole-repo matrix does not
119
+ # CPU-starve timing-sensitive test suites into flaky timeouts on the runner.
115
120
  - name: Strict check (no baseline)
116
121
  run: |
117
122
  if [ -f packages/codebyplan-package/dist/cli.js ]; then
118
- node packages/codebyplan-package/dist/cli.js check --scope merged --no-baseline
123
+ node packages/codebyplan-package/dist/cli.js check --scope merged --no-baseline --concurrency=1
119
124
  else
120
- pnpm exec codebyplan check --scope merged --no-baseline
125
+ pnpm exec codebyplan check --scope merged --no-baseline --concurrency=1
121
126
  fi
@@ -73,6 +73,27 @@ The sole exception is `vscode-test`: the committed dir may be empty when the ext
73
73
  has no visual output (behavior-only tests). Agents must still define the dir and report
74
74
  `e2e_gallery: []` explicitly — not omit the field.
75
75
 
76
+ ## Quality-Capture Mandates
77
+
78
+ A green run that captured no quality signals is not evidence. Per framework:
79
+
80
+ - **playwright**: every spec imports `test` from the shared quality fixture
81
+ (`e2e/fixtures.ts`) — console/pageerror guard auto-active in every test; one axe WCAG A/AA
82
+ scan per page state. The output MUST carry `console_errors[]` (empty on clean) and `a11y`
83
+ per `context/testing/e2e.md`.
84
+ - **maestro**: existing committed screenshots are baselines — gate them with
85
+ `assertScreenshot` (never retake/overwrite); run `assertNoDefectsWithAI` with
86
+ `optional: false` at primary states when Maestro auth is available (the default
87
+ `optional: true` is warn-only and forbidden; record `ai_checks: 'unavailable'` when auth
88
+ is absent).
89
+ - A `status: 'completed'` output carrying non-empty `console_errors[]` or
90
+ `a11y.violations[]` is inconsistent — `codebyplan e2e verify-round` hard-fails it
91
+ (`console_errors_reported`, `a11y_violations_reported`).
92
+
93
+ Mutation flows MUST carry a behavior proof per `context/testing/e2e.md` § Functional
94
+ Assertion Mandate (network success proof / persistence proof / error-state test);
95
+ visibility-only suites are flagged `{type: 'shallow_coverage'}` in `critical_issues[]`.
96
+
76
97
  ## Cross-References
77
98
 
78
99
  - `context/testing/e2e.md` — Input/Output contract, pre-flight loop, failure classification,
@@ -47,13 +47,21 @@ The branch model is **feat→main direct**; `.codebyplan/git.json` has `integrat
47
47
  IS the per-checkpoint feat branch. The hardcore tier runs against that feat branch's merged
48
48
  state before it lands on main; do not assume a staging/integration hop exists.
49
49
 
50
- ## Report-Only Rollout
50
+ ## Strict-Tier Enforcement (report-only ⇄ enforced)
51
51
 
52
- The whole-repo hardcore CI **job** lands **report-only first** (`continue-on-error: true`) and is
53
- flipped to a required check ONLY after the `apps/web` baseline is burned down. Until then,
54
- `--scope merged --no-baseline` is advisory in CI — surfaced, not enforced — so a pre-existing
55
- `apps/web` red does not block a merge while the baseline is still being paid down. Locally,
56
- `cbp-verify` still runs and reports it.
52
+ The whole-repo hardcore CI **job** (`ci-strict`) is config-driven via `.codebyplan/ci.json`
53
+ `workflow.strict_check_enforced`, which `codebyplan ci scaffold-workflow` substitutes into the
54
+ generated `.github/workflows/ci.yml`:
55
+
56
+ - **`false` (default)** — report-only: the job carries the " (report-only)" name suffix and
57
+ `continue-on-error: true`, so `--scope merged --no-baseline` is advisory in CI — surfaced, not
58
+ enforced. A repo whose baseline is still red keeps merging while it pays the baseline down.
59
+ - **`true`** — enforced: the suffix is dropped and `continue-on-error` is omitted (defaults to
60
+ `false`), making the job a real gate. Flip ONLY after the whole repo is absolute-green AND the
61
+ job has already run green in CI, then wire the check name `Strict whole-repo green` into branch
62
+ protection via `codebyplan ci enforce-check --check-name "Strict whole-repo green"`.
63
+
64
+ Locally, `cbp-verify` runs and reports the same check regardless of the flag.
57
65
 
58
66
  ## Cross-References
59
67