@metasession.co/devaudit-cli 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -10
- package/dist/index.js +17 -5
- package/dist/index.js.map +1 -1
- package/package.json +9 -5
- package/scripts/upload-evidence.sh +225 -0
- package/sdlc/CLAUDE.md +73 -0
- package/sdlc/HOST_ADAPTER.md +127 -0
- package/sdlc/SKILLS.md +137 -0
- package/sdlc/STACK_ADAPTER.md +130 -0
- package/sdlc/ai-rules/INSTRUCTIONS-SDLC.md +172 -0
- package/sdlc/ai-rules/README.md +103 -0
- package/sdlc/ai-rules/SDLC_RULES.md +584 -0
- package/sdlc/ai-rules/claude/CLAUDE.md +192 -0
- package/sdlc/ai-rules/cursor/.cursorrules +167 -0
- package/sdlc/ai-rules/windsurf/.windsurfrules +167 -0
- package/sdlc/article.md +219 -0
- package/sdlc/files/_common/0-project-setup.md +410 -0
- package/sdlc/files/_common/1-plan-requirement.md +381 -0
- package/sdlc/files/_common/2-implement-and-test.md +276 -0
- package/sdlc/files/_common/3-compile-evidence.md +603 -0
- package/sdlc/files/_common/4-submit-for-review.md +362 -0
- package/sdlc/files/_common/5-deploy-main.md +251 -0
- package/sdlc/files/_common/Periodic_Security_Review_Schedule.md +169 -0
- package/sdlc/files/_common/README_TEMPLATE.md +441 -0
- package/sdlc/files/_common/Test_Architecture.md +461 -0
- package/sdlc/files/_common/Test_Plan_TEMPLATE.md +311 -0
- package/sdlc/files/_common/Test_Policy.md +277 -0
- package/sdlc/files/_common/Test_Strategy.md +359 -0
- package/sdlc/files/_common/github/ISSUE_TEMPLATE/bug.yml +75 -0
- package/sdlc/files/_common/github/ISSUE_TEMPLATE/config.yml +11 -0
- package/sdlc/files/_common/github/ISSUE_TEMPLATE/requirement.yml +75 -0
- package/sdlc/files/_common/github/ISSUE_TEMPLATE/task.yml +48 -0
- package/sdlc/files/_common/github/pull_request_template.md +69 -0
- package/sdlc/files/_common/implementing-an-sdlc-issue.md +413 -0
- package/sdlc/files/_common/scripts/derive-release-version.sh +40 -0
- package/sdlc/files/_common/scripts/derive-release-version.test.sh +98 -0
- package/sdlc/files/_common/scripts/submit-for-uat-review.sh +162 -0
- package/sdlc/files/_common/scripts/validate-commits.sh +83 -0
- package/sdlc/files/_common/scripts/validate-compliance-artifacts.sh +202 -0
- package/sdlc/files/_common/scripts/validate-compliance-artifacts.test.sh +202 -0
- package/sdlc/files/_common/skills/_schema/skill.schema.json +36 -0
- package/sdlc/files/_common/skills/e2e-test-engineer/SKILL.md +254 -0
- package/sdlc/files/_common/skills/e2e-test-engineer/references/bootstrap.md +244 -0
- package/sdlc/files/_common/skills/e2e-test-engineer/references/evidence.ts +40 -0
- package/sdlc/files/_common/skills/sdlc-implementer/SKILL.md +189 -0
- package/sdlc/files/_common/skills/sdlc-implementer/references/call-graph.md +64 -0
- package/sdlc/files/_common/skills/sdlc-implementer/references/change-request-loop.md +192 -0
- package/sdlc/files/_common/skills/sdlc-implementer/references/compliance-constraints.md +81 -0
- package/sdlc/files/ci/check-release-approval.yml.template +201 -0
- package/sdlc/files/ci/ci-status-fallback.yml.template +41 -0
- package/sdlc/files/ci/ci.yml.template +390 -0
- package/sdlc/files/ci/compliance-evidence.yml.template +161 -0
- package/sdlc/files/ci/compliance-validation.yml.template +34 -0
- package/sdlc/files/ci/post-deploy-prod.yml.template +159 -0
- package/sdlc/files/ci/python/ci.yml.template +335 -0
- package/sdlc/files/hosts/_schema/adapter.schema.json +103 -0
- package/sdlc/files/hosts/railway/adapter.json +32 -0
- package/sdlc/files/sdlc-config.example.json +74 -0
- package/sdlc/files/stacks/_schema/adapter.schema.json +151 -0
- package/sdlc/files/stacks/node/adapter.json +54 -0
- package/sdlc/files/stacks/node/hooks/.prettierrc.json +9 -0
- package/sdlc/files/stacks/node/hooks/commit-msg +7 -0
- package/sdlc/files/stacks/node/hooks/commitlint.config.mjs +64 -0
- package/sdlc/files/stacks/node/hooks/lint-staged.config.mjs +16 -0
- package/sdlc/files/stacks/node/hooks/pre-commit +13 -0
- package/sdlc/files/stacks/node/hooks/pre-push +15 -0
- package/sdlc/files/stacks/node/scripts/check-requirement-jsdoc.sh +54 -0
- package/sdlc/files/stacks/python/adapter.json +36 -0
- package/sdlc/files/stacks/python/hooks/.pre-commit-config.yaml +51 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://devaudit.metasession.co/sdlc/skills/skill.schema.json",
|
|
4
|
+
"title": "Claude Code Skill frontmatter",
|
|
5
|
+
"description": "Validates the YAML frontmatter at the top of every SKILL.md under sdlc/files/_common/skills/ or sdlc/files/stacks/<name>/skills/. The body of SKILL.md (everything after the frontmatter) is free-form Markdown — Claude reads it as the skill's instructions, and it is not validated here.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"additionalProperties": false,
|
|
8
|
+
"required": ["name", "description"],
|
|
9
|
+
"properties": {
|
|
10
|
+
"name": {
|
|
11
|
+
"type": "string",
|
|
12
|
+
"pattern": "^[a-z0-9][a-z0-9-]{1,63}$",
|
|
13
|
+
"description": "Lowercase identifier — matches the parent directory under skills/ and is how Claude refers to the skill when invoking it. 2–64 chars: lowercase letters, digits, and hyphens (kebab-case), starting with a letter or digit."
|
|
14
|
+
},
|
|
15
|
+
"description": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"minLength": 50,
|
|
18
|
+
"description": "What the skill does AND when Claude should invoke it. Both halves matter — without 'when to use' triggers, discovery doesn't fire. Aim for 100–500 characters."
|
|
19
|
+
},
|
|
20
|
+
"version": {
|
|
21
|
+
"type": "string",
|
|
22
|
+
"pattern": "^[0-9]+\\.[0-9]+\\.[0-9]+$",
|
|
23
|
+
"description": "Optional version in plain MAJOR.MINOR.PATCH semver form (pre-release and build-metadata suffixes are not accepted). Useful when a skill's expected behaviour changes incompatibly; consumers can pin to a known version."
|
|
24
|
+
},
|
|
25
|
+
"tags": {
|
|
26
|
+
"type": "array",
|
|
27
|
+
"items": { "type": "string", "minLength": 1 },
|
|
28
|
+
"uniqueItems": true,
|
|
29
|
+
"description": "Optional tags for grouping skills (e.g. 'testing', 'security', 'compliance'). Not consumed by Claude — useful for documentation tooling."
|
|
30
|
+
},
|
|
31
|
+
"license": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"description": "Optional SPDX licence identifier or 'proprietary'."
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: e2e-test-engineer
|
|
3
|
+
description: Maintain or bootstrap a project's end-to-end and visual regression test pack. Use when the user wants to add, update, or retire e2e or visual tests for a feature, ticket, issue, or PR — OR when no e2e suite exists yet and one needs setting up using best practices for the detected stack. Covers deriving the scenarios a change needs, matching the project's conventions, removing obsolete tests (only after confirmation), running the suite, and filing defects for failures or missed acceptance criteria. Trigger on phrases like "add e2e tests for [ticket]", "update the test pack", "what tests do we need for this issue", "are any tests obsolete", "run the e2e tests and file issues", "add visual regression coverage", "set up e2e tests for this project", or "bootstrap an e2e suite". Framework-agnostic (Playwright, Cypress, Selenium, etc.) and tracker-agnostic (GitHub, Jira, Linear, etc.). Do NOT use for unit, component, or API-only tests, or performance tests.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# E2E Test Engineer
|
|
7
|
+
|
|
8
|
+
Maintain or bootstrap a project's e2e and visual regression test suite. Given an issue (or ticket, or PR) plus the implementation, derive the scenarios the change actually needs, reconcile them with what's already there (adding what's missing, retiring what's obsolete), run the suite, and surface defects and missed acceptance criteria. When no suite exists yet, set one up using best practices for the detected tech stack before adding the change's tests as the first real tests in the new suite.
|
|
9
|
+
|
|
10
|
+
## Scope
|
|
11
|
+
|
|
12
|
+
**In scope**
|
|
13
|
+
- End-to-end tests (UI-driven, user-flow level) in any framework.
|
|
14
|
+
- Visual regression tests in any tool.
|
|
15
|
+
- Maintaining an existing test pack — adding, updating, retiring tests.
|
|
16
|
+
- Bootstrapping a new e2e suite when none exists, including framework selection, structure, configuration, a starter smoke test, and (optionally) CI integration.
|
|
17
|
+
|
|
18
|
+
**Out of scope**
|
|
19
|
+
- Unit, component, or API-only tests.
|
|
20
|
+
- Performance, load, or accessibility audits, unless the project's e2e pack already includes them — in which case follow its lead.
|
|
21
|
+
|
|
22
|
+
## The workflow
|
|
23
|
+
|
|
24
|
+
Six phases, plus a conditional Phase 1b that runs only when no e2e suite exists yet. Don't skip them and don't reorder — each one feeds the next. Communicate progress as you go; long silent phases feel like the skill has stalled.
|
|
25
|
+
|
|
26
|
+
### Phase 1 — Orient
|
|
27
|
+
|
|
28
|
+
Three things must be in hand before designing tests: the **test stack**, the **change**, and the **issue tracker**. Discover them in parallel where possible.
|
|
29
|
+
|
|
30
|
+
**Detect the test stack** from the repo:
|
|
31
|
+
|
|
32
|
+
- *E2E framework signals* — `playwright.config.*`, `cypress.config.*`, `wdio.conf.*`, `nightwatch.conf.*`, `codecept.conf.*`, `.testcaferc.*`. Failing that, check `package.json` dependencies for `@playwright/test`, `cypress`, `webdriverio`, `puppeteer`, `selenium-webdriver`, or Python equivalents (`pytest-playwright`, `selenium`, `splinter`).
|
|
33
|
+
- *Test location* — common patterns: `e2e/`, `tests/e2e/`, `cypress/e2e/`, `playwright/`, `test/e2e/`, `__tests__/e2e/`.
|
|
34
|
+
- *Visual regression tool* — Playwright or Cypress built-in snapshots, `cypress-image-snapshot`, `cypress-visual-regression`, `percy`, `applitools-eyes-*`, `chromatic`, `backstop.json`, `loki`, `reg-suit`. If none of these are present, the project doesn't do visual regression and you should only add it if the issue explicitly asks for it.
|
|
35
|
+
- *Run command* — search `package.json` scripts for `e2e`, `test:e2e`, `cy:run`, `pw:test`. Failing that, check `Makefile`, `justfile`, `tox.ini`, `pyproject.toml`, CI config. If still unclear, ask.
|
|
36
|
+
|
|
37
|
+
**Detect the issue tracker** so you can read the input issue now and file defect issues later:
|
|
38
|
+
|
|
39
|
+
- GitHub: git remote on `github.com` → use the `gh` CLI if installed, or a connected GitHub MCP.
|
|
40
|
+
- GitLab: `glab` CLI or a GitLab MCP.
|
|
41
|
+
- Jira: an Atlassian MCP, or `jira-cli` config.
|
|
42
|
+
- Linear: a Linear MCP.
|
|
43
|
+
- Azure DevOps: the `az boards` CLI or an ADO MCP.
|
|
44
|
+
- None of the above: ask the user where the source issue lives and where defects should go. Final fallback is a paste-ready markdown report.
|
|
45
|
+
|
|
46
|
+
**Take in the change.** The user typically gives you one of:
|
|
47
|
+
|
|
48
|
+
- An issue/ticket ID or URL → fetch its full text, description, comments, and acceptance criteria.
|
|
49
|
+
- A PR/MR URL or a branch name → fetch the description and the diff.
|
|
50
|
+
- A pasted description → use what you've been given; ask for the diff or branch if not provided.
|
|
51
|
+
|
|
52
|
+
Briefly summarise what you found (the stack, the tracker, the change in one or two sentences) and confirm before continuing. The user is far more responsive to corrections now than after you've written twelve test files.
|
|
53
|
+
|
|
54
|
+
**If no e2e suite was found, go to Phase 1b before continuing.** Otherwise skip to Phase 2.
|
|
55
|
+
|
|
56
|
+
### Phase 1b — Bootstrap (only when no suite exists)
|
|
57
|
+
|
|
58
|
+
This phase runs only if Phase 1 found no e2e framework, no test directory, and no relevant dependencies. Don't run it just because the existing suite is small or messy — that's a maintenance problem, not a bootstrap one.
|
|
59
|
+
|
|
60
|
+
The goal of bootstrap is a working, well-configured e2e suite with one passing smoke test, set up so that any tests added afterward (including the change's tests in Phase 5) feel native to the project. **Read `references/bootstrap.md`** before you start — it has the per-stack framework recommendations, official-installer commands, config best practices, and structure templates.
|
|
61
|
+
|
|
62
|
+
The bootstrap workflow:
|
|
63
|
+
|
|
64
|
+
1. **Gather extra context** beyond what Phase 1 found:
|
|
65
|
+
- Frontend framework (React, Vue, Angular, Svelte, Next.js, Nuxt, mobile, Electron, etc.) — from `package.json` deps or equivalent.
|
|
66
|
+
- Language — `tsconfig.json` for TypeScript, otherwise JS or whatever the project uses.
|
|
67
|
+
- Package manager — `pnpm-lock.yaml` → pnpm; `yarn.lock` → yarn; otherwise npm. Or pip/poetry/uv for Python; bundler for Ruby; etc.
|
|
68
|
+
- Dev server command and port — from `scripts` in `package.json` or the equivalent.
|
|
69
|
+
- CI system — `.github/workflows/`, `.gitlab-ci.yml`, `.circleci/`, `Jenkinsfile`, `azure-pipelines.yml`.
|
|
70
|
+
- Any existing unit/integration test framework — for matching style (assertions, file naming, runner conventions).
|
|
71
|
+
|
|
72
|
+
2. **Propose a framework choice with a one-line rationale, and get explicit confirmation.** This is a long-lived decision; do not install anything without a clear "yes". Use `references/bootstrap.md` for the recommendation matrix. Briefly mention the runner-up so the user can override if their team has a preference you couldn't infer.
|
|
73
|
+
|
|
74
|
+
3. **Decide whether to include visual regression now.** If the user asked for it, or the originating issue is visually significant, yes. If unsure, ask. Visual regression has its own tooling decision (built-in snapshots vs cloud service like Percy/Chromatic/Applitools) — `references/bootstrap.md` covers this.
|
|
75
|
+
|
|
76
|
+
4. **Install with the official installer** wherever one exists (e.g. `npm init playwright@latest`; for Cypress, `npm install --save-dev cypress` followed by `npx cypress open` to scaffold the config and example specs). Official installers set up sensible defaults the skill shouldn't try to second-guess.
|
|
77
|
+
|
|
78
|
+
5. **Lay out the directory structure** for the project's expected scale: a top-level test directory (`e2e/` or whatever the framework's installer chose), with subfolders for specs, fixtures, page objects (or fixture-based equivalent), helpers, and visual specs if applicable. Write the structure as a short tree in your reply so the user can see what was created.
|
|
79
|
+
|
|
80
|
+
6. **Configure for best practice** — base URL pointing at the project's dev server, retries on CI only, parallel workers, an HTML reporter, trace on first retry, `screenshot: 'only-on-failure'` (failure forensics — see *Evidence vs failure forensics* below for per-AC evidence capture), video retention on failure, and (if the framework supports it) auto-starting the dev server before the suite runs. Specifics per framework are in `references/bootstrap.md`.
|
|
81
|
+
|
|
82
|
+
7. **Establish the abstraction pattern** — write one Page Object Model (or one fixture, depending on framework idioms) as a worked example so the change's tests in Phase 5 have a template to follow.
|
|
83
|
+
|
|
84
|
+
8. **Write a smoke test** that proves the setup works end-to-end: load the home page, assert the title or a known stable element. Run it. It must pass before you continue.
|
|
85
|
+
|
|
86
|
+
9. **Wire up runner scripts** — at minimum `test:e2e` (headless), `test:e2e:ui` or `test:e2e:headed` (interactive), `test:e2e:debug`, and `test:e2e:update-snapshots` if visual regression is included.
|
|
87
|
+
|
|
88
|
+
10. **Offer a CI job** — write the YAML (or equivalent) for the project's CI system, but **do not commit it without confirmation**. Show it inline first.
|
|
89
|
+
|
|
90
|
+
11. **Write a short README** in the test directory explaining structure, how to run, how to add new tests, and how to update visual baselines. Future contributors (and the skill itself, on next invocation) will thank you.
|
|
91
|
+
|
|
92
|
+
After bootstrap, if there's a change to test, continue to Phase 2 as normal. If the user only wanted the suite set up with no specific change in mind, stop here with a final summary.
|
|
93
|
+
|
|
94
|
+
### Phase 2 — Understand the change
|
|
95
|
+
|
|
96
|
+
You cannot write the right tests without understanding what changed and why. Spend real time here.
|
|
97
|
+
|
|
98
|
+
1. **Read the issue end-to-end.** Capture: the user-facing goal, the explicit acceptance criteria (number them), any negative criteria ("should not allow X"), edge cases the description mentions, and any references to existing behaviour that must stay intact.
|
|
99
|
+
|
|
100
|
+
2. **Read the implementation.** From the diff or branch:
|
|
101
|
+
- Which files changed — routes, components, state, API contracts, styles?
|
|
102
|
+
- What's the user-facing entry point — what URL, what control, what flow leads here?
|
|
103
|
+
- What pre-conditions does the new behaviour assume — auth state, feature flag, seed data?
|
|
104
|
+
- What adjacent surfaces share code with the change and could regress as a side effect?
|
|
105
|
+
|
|
106
|
+
3. **Read the surrounding app** enough to know how a real user reaches the changed area. Look at routing, navigation, and the one or two most adjacent features.
|
|
107
|
+
|
|
108
|
+
4. **Write a short mental model** in your reply: *trigger → new behaviour → expected outcomes → likely side-effects*. If anything is ambiguous, ask the user before designing scenarios. Guesses here cascade into bad tests.
|
|
109
|
+
|
|
110
|
+
### Phase 3 — Design scenarios
|
|
111
|
+
|
|
112
|
+
The goal is the *minimum* set of scenarios that, if they all pass, would give a reasonable person high confidence the change works as intended and hasn't broken adjacent functionality. "Minimum" is load-bearing: e2e tests are slow to run and expensive to maintain, and a bloated suite gets ignored.
|
|
113
|
+
|
|
114
|
+
Derive scenarios from these sources, in this order:
|
|
115
|
+
|
|
116
|
+
1. **One scenario per acceptance criterion.** If an AC is compound ("user can filter by status AND see a count"), split it.
|
|
117
|
+
|
|
118
|
+
2. **One negative scenario per error path the change introduces.** Invalid input, unauthorised access, network failure — only if the change has explicit handling for it.
|
|
119
|
+
|
|
120
|
+
3. **Boundary scenarios** where the change has obvious boundaries: empty state, max length, zero results, single result, many results.
|
|
121
|
+
|
|
122
|
+
4. **Adjacent regression scenarios** — pick the one or two nearby flows most likely to break because they share code with the change. Don't try to re-test the whole app from this seat.
|
|
123
|
+
|
|
124
|
+
5. **Visual regression scenarios** (only if the project does visual regression): each visually-changed component or page state gets a snapshot at the breakpoints the project already covers. Add one or two adjacent surfaces that share styling.
|
|
125
|
+
|
|
126
|
+
Resist padding. A new endpoint doesn't need a test that re-verifies login if login is already covered. Match the project's existing depth — if it covers one happy path per feature, don't add six.
|
|
127
|
+
|
|
128
|
+
For each scenario, write a one-line description. Present the full grouped list to the user before writing any code: *"Here's the coverage I'd propose — anything to add or drop?"*
|
|
129
|
+
|
|
130
|
+
### Phase 4 — Reconcile with existing tests
|
|
131
|
+
|
|
132
|
+
For the area touched by the change, look at what's already there.
|
|
133
|
+
|
|
134
|
+
1. **Overlap** — find existing tests that already cover scenarios from Phase 3. Don't duplicate; either reuse or note the overlap and drop the duplicate from your add list.
|
|
135
|
+
|
|
136
|
+
2. **Obsolete** — a test is obsolete when:
|
|
137
|
+
- The behaviour it asserts has been intentionally removed or replaced.
|
|
138
|
+
- The selectors or routes it uses no longer exist and the new equivalents are covered elsewhere.
|
|
139
|
+
- It tests an old version of a flow that has been fully superseded.
|
|
140
|
+
|
|
141
|
+
A test is **not** obsolete just because it's failing. Failing tests are signals, not garbage. Be conservative — when in doubt, propose updating rather than deleting.
|
|
142
|
+
|
|
143
|
+
3. **Needs updating** — existing tests where the scenario is still valid but selectors, routes, or assertions have shifted.
|
|
144
|
+
|
|
145
|
+
Present three lists to the user:
|
|
146
|
+
- **To add** — new scenarios not already covered.
|
|
147
|
+
- **To update** — existing tests needing adjustment.
|
|
148
|
+
- **To delete** — genuinely obsolete tests, each with a one-line rationale.
|
|
149
|
+
|
|
150
|
+
**Do not delete anything without explicit confirmation.** Not even tests you're 95% sure about. The cost of asking is one sentence; the cost of deleting real coverage is real. Wait for a clear "yes, delete those" before removing anything.
|
|
151
|
+
|
|
152
|
+
### Phase 5 — Implement
|
|
153
|
+
|
|
154
|
+
Write the tests in the project's existing style.
|
|
155
|
+
|
|
156
|
+
- **Match the structure.** Same directory, same file-naming pattern, same test-ID or tag convention.
|
|
157
|
+
- **Reuse existing helpers.** Page Object Models, fixtures, custom commands, test-data factories — use them. Don't invent parallel infrastructure.
|
|
158
|
+
- **Match the assertion style.** If the codebase uses `expect(locator).toBeVisible()`, don't switch to `assert.isTrue(...)`.
|
|
159
|
+
- **Read 2–3 nearby tests before writing.** Fastest way to absorb conventions you wouldn't have noticed otherwise.
|
|
160
|
+
|
|
161
|
+
For **visual regression** specifically:
|
|
162
|
+
- New tests need baseline images. Generate them, but **do not auto-approve** — surface them for the user to verify before they're committed.
|
|
163
|
+
- Use the project's existing breakpoints, viewports, and element-masking conventions.
|
|
164
|
+
|
|
165
|
+
Do additions, updates, and (approved) deletions in the same change so the suite stays internally consistent.
|
|
166
|
+
|
|
167
|
+
### Phase 6 — Execute and report
|
|
168
|
+
|
|
169
|
+
Run the suite. Strategy:
|
|
170
|
+
|
|
171
|
+
1. **Run the new and updated tests first** in isolation if the framework supports filtering. Fast feedback on whether your tests themselves work.
|
|
172
|
+
2. **Then run the full suite** to catch regressions outside the changed area.
|
|
173
|
+
3. **For visual regression**, run the project's normal comparison mode against existing baselines.
|
|
174
|
+
|
|
175
|
+
Triage every failure into one of these buckets *before* taking any action:
|
|
176
|
+
|
|
177
|
+
- **Flake** — non-deterministic; passes on rerun. Rerun once. If it passes, note it. If it keeps flaking, flag it but don't file a noisy bug.
|
|
178
|
+
- **Test bug** — your test is wrong (bad selector, wrong assertion, timing). Fix the test; don't file anything.
|
|
179
|
+
- **Application defect** — the app does the wrong thing. File it.
|
|
180
|
+
- **Visual diff — intended** — the snapshot differs because the change deliberately altered the UI. Regenerate the baseline, but surface it for user approval before committing it.
|
|
181
|
+
- **Visual diff — unintended** — a snapshot changed somewhere the change shouldn't have affected. File it as a regression.
|
|
182
|
+
|
|
183
|
+
**Then check for missed requirements.** For each numbered acceptance criterion from Phase 2, confirm at least one *passing* test covers it. An AC with no passing test — because no test was written, or because the test fails — is a missed requirement. File it.
|
|
184
|
+
|
|
185
|
+
### Filing defects
|
|
186
|
+
|
|
187
|
+
Use whatever tracker integration you found in Phase 1: `gh issue create`, `glab issue create`, a Jira or Linear MCP tool, `az boards work-item create`. If nothing is available, produce a markdown report with each defect formatted ready to paste.
|
|
188
|
+
|
|
189
|
+
Each filed issue needs:
|
|
190
|
+
|
|
191
|
+
- **Title** — short, specific, describes the symptom not the cause. *"Filter by status shows zero results when status=pending"*, not *"Filter broken"*.
|
|
192
|
+
- **Steps to reproduce** — numbered, minimal, exact.
|
|
193
|
+
- **Expected vs actual** — on separate lines, no ambiguity.
|
|
194
|
+
- **Environment** — branch, commit SHA, test command, browser/viewport if relevant.
|
|
195
|
+
- **Evidence** — link or path to the failing test, error output, screenshot, trace.
|
|
196
|
+
- **Link back** to the originating issue/PR.
|
|
197
|
+
- **Severity** — your honest call: blocker, major, minor. Don't inflate.
|
|
198
|
+
|
|
199
|
+
Show the user the full set of issues you're about to file. Get confirmation. Then file them.
|
|
200
|
+
|
|
201
|
+
### Final report
|
|
202
|
+
|
|
203
|
+
Wrap up with a summary the user can drop into the PR or ticket:
|
|
204
|
+
|
|
205
|
+
- Tests added — count, with a list.
|
|
206
|
+
- Tests updated — count.
|
|
207
|
+
- Tests deleted — count, with rationale.
|
|
208
|
+
- Suite result — passing, failing, flaky.
|
|
209
|
+
- Defects filed — count, with links.
|
|
210
|
+
- Missed requirements — count, with links.
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## Evidence vs failure forensics
|
|
215
|
+
|
|
216
|
+
Playwright's auto-screenshot (`screenshot: 'only-on-failure'`) is for **failure forensics** — "what state was the page in when this test broke?" For a passing test it captures nothing at all, and even `screenshot: 'on'` only captures the post-test screen, which is useless as compliance evidence.
|
|
217
|
+
|
|
218
|
+
To prove an AC was actually verified, call `evidenceShot()` **immediately after the assertion that proves it**, before any further interaction:
|
|
219
|
+
|
|
220
|
+
```ts
|
|
221
|
+
import { evidenceShot } from './helpers/evidence';
|
|
222
|
+
|
|
223
|
+
test('AC1: edit dialog opens with fields pre-filled', async ({ page }) => {
|
|
224
|
+
await openEditDialog(page, item.id);
|
|
225
|
+
await expect(dialog.locator('#name')).toHaveValue(item.name);
|
|
226
|
+
await evidenceShot(page, 'REQ-037', 'AC1-edit-dialog-prefilled');
|
|
227
|
+
// ...rest of test
|
|
228
|
+
});
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
**Discipline:**
|
|
232
|
+
|
|
233
|
+
- Call `evidenceShot` **immediately after** the AC-proving assertion, before navigating, closing dialogs, or any further interaction.
|
|
234
|
+
- Slug as `AC<n>-<what-this-proves>` — the filename documents the claim.
|
|
235
|
+
- One screenshot per AC, not per test.
|
|
236
|
+
- Failure forensics stays untouched (`screenshot: 'only-on-failure'` + `trace: 'on-first-retry'`).
|
|
237
|
+
|
|
238
|
+
The helper is shipped automatically into `e2e/helpers/evidence.ts` by the SDLC sync (node-stack consumers). Output lands at `compliance/evidence/<REQ-ID>/screenshots/<slug>.png` — commit these PNGs as part of the evidence pack so reviewers can corroborate the test-plan AC mapping.
|
|
239
|
+
|
|
240
|
+
The canonical helper source lives at `references/evidence.ts` in this skill.
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Principles
|
|
245
|
+
|
|
246
|
+
**Match the project's existing depth.** If it tests one happy path per feature, don't add six scenarios per AC. If it tests exhaustively, match that. Right coverage is coverage consistent with what's already there.
|
|
247
|
+
|
|
248
|
+
**E2E is expensive.** Every test you add costs CI time and maintenance forever. Add what's needed for confidence in the change; resist adding more. The goal is a suite that stays trusted, not a suite that's maximal.
|
|
249
|
+
|
|
250
|
+
**Don't invent infrastructure.** If the project uses POMs, use POMs. If it uses fixtures, use fixtures. If something is genuinely missing that you need, ask the user before adding it.
|
|
251
|
+
|
|
252
|
+
**Confirm before destructive or public actions.** Deleting tests, approving new visual baselines, filing defects in a tracker — all need explicit user sign-off. The cost of confirming is a sentence; the cost of getting it wrong is real.
|
|
253
|
+
|
|
254
|
+
**Ambiguity is a question, not a guess.** If an AC is unclear or the implementation does something the issue doesn't describe, ask. Tests built on guesses about intent are worse than no tests — they encode and propagate the misunderstanding.
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# Bootstrap Reference
|
|
2
|
+
|
|
3
|
+
Detailed guidance for setting up an e2e suite from scratch. Read this when Phase 1b of `SKILL.md` is in play.
|
|
4
|
+
|
|
5
|
+
## Contents
|
|
6
|
+
1. Framework selection matrix (by tech stack)
|
|
7
|
+
2. Visual regression tool selection
|
|
8
|
+
3. Official installer commands
|
|
9
|
+
4. Best-practice configuration
|
|
10
|
+
5. Directory structure templates
|
|
11
|
+
6. CI snippets
|
|
12
|
+
7. Anti-patterns to avoid
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## 1. Framework selection matrix
|
|
17
|
+
|
|
18
|
+
Recommend a primary, mention the runner-up, get user confirmation before installing. These recommendations reflect current best practice for new projects; if the team has a strong preference, follow it.
|
|
19
|
+
|
|
20
|
+
### Web apps — JavaScript / TypeScript
|
|
21
|
+
|
|
22
|
+
| Stack | Primary | Runner-up | Rationale |
|
|
23
|
+
|---|---|---|---|
|
|
24
|
+
| React / Next.js / Remix | **Playwright** | Cypress | Fast, parallel by default, multi-browser, first-party TS, built-in visual regression, excellent trace viewer. |
|
|
25
|
+
| Vue / Nuxt | **Playwright** | Cypress | Same reasons. |
|
|
26
|
+
| Angular | **Playwright** | Cypress | Playwright has overtaken Cypress as the Angular community default; Cypress still common in established Angular shops. |
|
|
27
|
+
| Svelte / SvelteKit | **Playwright** | — | SvelteKit's official template ships Playwright. |
|
|
28
|
+
| Static sites / SSG | **Playwright** | Cypress | Either works; Playwright is simpler to set up. |
|
|
29
|
+
| Legacy multi-browser-grid needs | **WebdriverIO** | Selenium | Needed when integrating with existing Sauce Labs / BrowserStack grids or non-Chromium-family browsers beyond what Playwright covers. |
|
|
30
|
+
| Anything requiring real IE11 (rare) | **Selenium** | — | Only Selenium still targets IE. |
|
|
31
|
+
|
|
32
|
+
### Mobile
|
|
33
|
+
|
|
34
|
+
| Stack | Primary | Runner-up | Rationale |
|
|
35
|
+
|---|---|---|---|
|
|
36
|
+
| React Native | **Detox** | Appium | Native, fast, integrates with RN test infra. |
|
|
37
|
+
| Native iOS / Android cross-platform | **Appium** | Maestro | Industry standard; Maestro is gaining popularity for simpler flows. |
|
|
38
|
+
| Mobile web | **Playwright** (mobile emulation) | Appium (for real-device need) | Playwright covers viewport + UA emulation; Appium for real-device touch interactions. |
|
|
39
|
+
|
|
40
|
+
### Desktop
|
|
41
|
+
|
|
42
|
+
| Stack | Primary | Runner-up | Rationale |
|
|
43
|
+
|---|---|---|---|
|
|
44
|
+
| Electron | **Playwright** | WebdriverIO + Electron service | Playwright has first-party Electron support. |
|
|
45
|
+
| Tauri | **WebdriverIO** + Tauri driver | — | Official path. |
|
|
46
|
+
|
|
47
|
+
### Backend-rendered / non-JS web apps
|
|
48
|
+
|
|
49
|
+
| Stack | Primary | Runner-up | Rationale |
|
|
50
|
+
|---|---|---|---|
|
|
51
|
+
| Python (Django, Flask, FastAPI with templates) | **pytest-playwright** | Selenium + pytest | Same Playwright engine, idiomatic for Python test suites. |
|
|
52
|
+
| Ruby on Rails | **Capybara + Cuprite** | Capybara + Selenium | Cuprite uses CDP, faster than Selenium. |
|
|
53
|
+
| Java (Spring etc.) | **Playwright for Java** | Selenium + JUnit/TestNG | Playwright Java is mature; Selenium still dominant in enterprise Java. |
|
|
54
|
+
| .NET | **Playwright for .NET** | Selenium + NUnit/xUnit | Playwright .NET is well-supported. |
|
|
55
|
+
| PHP (Laravel, Symfony) | **Playwright via PHP** or **Pest browser plugin** | Codeception | Pest's browser plugin is gaining ground for Laravel. |
|
|
56
|
+
|
|
57
|
+
### When to question the default
|
|
58
|
+
|
|
59
|
+
- **Team already uses Cypress productively elsewhere** — stick with Cypress for consistency.
|
|
60
|
+
- **Heavy iframe / cross-origin needs** — Cypress historically struggled here; Playwright handles it. (Cypress has improved but Playwright is smoother.)
|
|
61
|
+
- **Component testing matters too** — Cypress has a unified e2e + component runner; with Playwright you'd add a separate component test setup.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## 2. Visual regression tool selection
|
|
66
|
+
|
|
67
|
+
Only add visual regression if the user asked for it or the originating issue is visually significant. Default off for greenfield bootstrap unless explicitly requested.
|
|
68
|
+
|
|
69
|
+
| Need | Recommendation | Notes |
|
|
70
|
+
|---|---|---|
|
|
71
|
+
| Default for Playwright | **`toHaveScreenshot()` built-in** | Zero extra deps; baselines stored in repo. Configure `threshold` and `maxDiffPixels`. |
|
|
72
|
+
| Default for Cypress | **`cypress-image-snapshot`** | Or `cypress-visual-regression`. Both store baselines in repo. |
|
|
73
|
+
| Default for WebdriverIO | **`@wdio/visual-service`** | Maintained, image-comparison-based. |
|
|
74
|
+
| Standalone visual testing (no e2e framework) | **BackstopJS** | Use only if not integrating with one of the e2e frameworks above. |
|
|
75
|
+
| Cross-team approval workflow needed | **Percy** or **Chromatic** | Cloud, web-based diff review. Chromatic is Storybook-friendly. Percy is generic. |
|
|
76
|
+
| Heavy use of AI-assisted diffing / dynamic content | **Applitools** | Commercial, ML-based comparison, handles dynamic content gracefully. Higher cost. |
|
|
77
|
+
|
|
78
|
+
Baseline strategy on first bootstrap: generate baselines locally, then have the user verify each one before committing. Never auto-approve baselines in CI on first run.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## 3. Official installer commands
|
|
83
|
+
|
|
84
|
+
Always prefer the official installer — it gives the framework's authors' recommended defaults, which the skill should not override without reason.
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Playwright (Node)
|
|
88
|
+
npm init playwright@latest
|
|
89
|
+
# Interactive: prompts for language, test directory, a GitHub Actions workflow, and browser installation.
|
|
90
|
+
|
|
91
|
+
# Playwright (Python)
|
|
92
|
+
pip install pytest-playwright
|
|
93
|
+
playwright install
|
|
94
|
+
|
|
95
|
+
# Cypress
|
|
96
|
+
npm install --save-dev cypress
|
|
97
|
+
npx cypress open # first run sets up structure
|
|
98
|
+
|
|
99
|
+
# WebdriverIO
|
|
100
|
+
npm init wdio@latest .
|
|
101
|
+
|
|
102
|
+
# Detox (React Native)
|
|
103
|
+
npm install --save-dev detox
|
|
104
|
+
detox init
|
|
105
|
+
|
|
106
|
+
# Appium
|
|
107
|
+
npm install --save-dev appium @wdio/cli
|
|
108
|
+
# Then run the WebdriverIO installer (npm init wdio@latest .) to generate the config.
|
|
109
|
+
|
|
110
|
+
# Selenium (Java, Maven)
|
|
111
|
+
# Add selenium-java + junit-jupiter to pom.xml; no CLI installer.
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Match the package manager: `npm` → `npm install`, `pnpm` → `pnpm add`, `yarn` → `yarn add`.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## 4. Best-practice configuration
|
|
119
|
+
|
|
120
|
+
### Playwright (`playwright.config.ts`)
|
|
121
|
+
|
|
122
|
+
Key settings to set up beyond the installer defaults:
|
|
123
|
+
|
|
124
|
+
- `baseURL` — pointing at the project's dev server (e.g. `http://localhost:3000`, or port `5173` / `4200`, depending on the framework).
|
|
125
|
+
- `retries: process.env.CI ? 2 : 0` — flake tolerance in CI only.
|
|
126
|
+
- `workers: process.env.CI ? 2 : undefined` — explicit CI worker count.
|
|
127
|
+
- `reporter: [['html'], ['list']]` — and add `['junit', ...]` if CI needs JUnit XML.
|
|
128
|
+
- `use.trace: 'on-first-retry'` — full trace on flake.
|
|
129
|
+
- `use.screenshot: 'only-on-failure'`.
|
|
130
|
+
- `use.video: 'retain-on-failure'`.
|
|
131
|
+
- `webServer` — auto-start the dev server before tests; saves CI config complexity.
|
|
132
|
+
- `projects` — at least Chromium; add Firefox/WebKit if cross-browser matters; add a mobile viewport project if relevant.
|
|
133
|
+
- `expect.toHaveScreenshot` — set `threshold` (default 0.2 is usually too lax for UI work; try 0.1) and `maxDiffPixels` for noise tolerance.
|
|
134
|
+
|
|
135
|
+
### Cypress (`cypress.config.ts`)
|
|
136
|
+
|
|
137
|
+
- `baseUrl`.
|
|
138
|
+
- `viewportWidth`, `viewportHeight` — set explicitly so visual diffs are stable.
|
|
139
|
+
- `retries: { runMode: 2, openMode: 0 }`.
|
|
140
|
+
- `video: false` (or `true` with `videoCompression`) — videos are large; opt in deliberately.
|
|
141
|
+
- `screenshotOnRunFailure: true`.
|
|
142
|
+
- For visual regression with `cypress-image-snapshot`: configure `failureThreshold` and `failureThresholdType: 'percent'`.
|
|
143
|
+
|
|
144
|
+
### Universal
|
|
145
|
+
|
|
146
|
+
- Pin framework versions in `package.json` (no `^` for the test runner) — flake from runner updates is real.
|
|
147
|
+
- Add the downloaded browser binaries to `.gitignore` if they are stored inside the repository but outside `node_modules`.
|
|
148
|
+
- Add baseline images to git if visual regression is set up.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## 5. Directory structure templates
|
|
153
|
+
|
|
154
|
+
### Playwright (TypeScript)
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
e2e/
|
|
158
|
+
├── tests/
|
|
159
|
+
│ ├── auth/
|
|
160
|
+
│ │ └── login.spec.ts
|
|
161
|
+
│ └── home.smoke.spec.ts
|
|
162
|
+
├── pages/
|
|
163
|
+
│ └── login.page.ts
|
|
164
|
+
├── fixtures/
|
|
165
|
+
│ ├── auth.fixture.ts
|
|
166
|
+
│ └── test-data.ts
|
|
167
|
+
├── helpers/
|
|
168
|
+
│ └── api-setup.ts
|
|
169
|
+
└── visual/
|
|
170
|
+
└── home.visual.spec.ts
|
|
171
|
+
playwright.config.ts
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Playwright idiom is fixtures over POMs. POMs work; fixtures are more native. Use whichever the user prefers, but be consistent.
|
|
175
|
+
|
|
176
|
+
### Cypress
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
cypress/
|
|
180
|
+
├── e2e/
|
|
181
|
+
│ ├── auth/
|
|
182
|
+
│ │ └── login.cy.ts
|
|
183
|
+
│ └── home.smoke.cy.ts
|
|
184
|
+
├── support/
|
|
185
|
+
│ ├── commands.ts # custom commands
|
|
186
|
+
│ ├── e2e.ts # global setup
|
|
187
|
+
│ └── pages/
|
|
188
|
+
│ └── login.page.ts
|
|
189
|
+
├── fixtures/
|
|
190
|
+
│ └── users.json
|
|
191
|
+
└── snapshots/ # visual baselines if using image-snapshot
|
|
192
|
+
cypress.config.ts
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Cypress idiom is custom commands (`cy.login(...)`) over POMs, but POMs are fine and increasingly common.
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## 6. CI snippets
|
|
200
|
+
|
|
201
|
+
Offer these as suggestions, get user confirmation before writing or committing.
|
|
202
|
+
|
|
203
|
+
### GitHub Actions — Playwright
|
|
204
|
+
|
|
205
|
+
```yaml
|
|
206
|
+
name: E2E
on: [push, pull_request]
jobs:
  e2e:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with: { node-version: 'lts/*' }
      - run: npm ci
      - run: npx playwright install --with-deps
      - run: npm run test:e2e
      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: playwright-report
          path: playwright-report/
          retention-days: 14
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### GitHub Actions — Cypress
|
|
227
|
+
|
|
228
|
+
Use the official `cypress-io/github-action` — handles caching and parallelisation. Equivalent shape; see Cypress docs for current syntax (it changes more often than other CI integrations).
|
|
229
|
+
|
|
230
|
+
### GitLab CI / CircleCI / Jenkins
|
|
231
|
+
|
|
232
|
+
Each framework's docs have reference pipelines; link the user to them rather than hand-writing from scratch. The shape is always: install deps → install browsers → start dev server (or rely on `webServer` config) → run tests → upload artifacts.
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## 7. Anti-patterns to avoid
|
|
237
|
+
|
|
238
|
+
- **Don't write your own framework wrapper.** Use the framework's idioms directly. Wrappers ossify and the framework upgrades pass them by.
|
|
239
|
+
- **Don't add every browser.** Start with one (Chromium); add others when the user has a real reason (cross-browser bugs reported, multi-browser SLA).
|
|
240
|
+
- **Don't enable video by default.** It's expensive in CI; opt in for the suites that need it.
|
|
241
|
+
- **Don't auto-approve visual baselines.** Bake in a manual approval step on first generation and on diff.
|
|
242
|
+
- **Don't put real credentials in fixtures.** Use environment variables, dotfiles (`.env.test`, gitignored), or per-environment auth state files.
|
|
243
|
+
- **Don't ignore the trace viewer / debug tools.** Playwright's trace viewer and Cypress's time-travel debugger are the reason these tools exist; lean on them for triage.
|
|
244
|
+
- **Don't skip the smoke test on bootstrap.** A passing smoke test is the proof the setup is real. Without it you're handing the user broken infrastructure.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import path from 'path';
|
|
2
|
+
import { type Page } from '@playwright/test';
|
|
3
|
+
|
|
4
|
+
/** Characters permitted in a path segment: ASCII letters, digits, `_` and `-` (at least one). */
const SLUG_RE = /^[A-Za-z0-9_-]+$/;

/**
 * Capture a screenshot that serves as evidence for a single acceptance
 * criterion (AC) and store it in the requirement's evidence pack.
 *
 * Invoke it directly after the assertion that proves the AC, while the
 * page still shows the asserted state — i.e. before navigating or
 * interacting any further. The resulting PNG is meant to be committed
 * with the evidence pack so reviewers can cross-check the test plan's
 * AC mapping.
 *
 * Destination, resolved against the current working directory:
 * `compliance/evidence/<reqId>/screenshots/<slug>.png`
 *
 * @param page - Playwright page to capture.
 * @param reqId - Requirement identifier (e.g. `REQ-037`); must match `SLUG_RE`.
 * @param slug - File name without extension; must match `SLUG_RE`.
 * @param opts - `fullPage` is forwarded to `page.screenshot` and defaults
 *   to `true` when omitted.
 * @returns A promise that resolves once `page.screenshot` has finished.
 * @throws Error (as a rejected promise) when `reqId` or `slug` contains
 *   characters outside `SLUG_RE`; `reqId` is validated first.
 *
 * @example
 *   await expect(dialog.locator('#name')).toHaveValue(item.name);
 *   await evidenceShot(page, 'REQ-037', 'AC1-edit-dialog-prefilled');
 */
export async function evidenceShot(
  page: Page,
  reqId: string,
  slug: string,
  opts: { fullPage?: boolean } = {},
): Promise<void> {
  // Both values end up as path segments, so anything outside the safe
  // character set is rejected before a path is built.
  const segments: ReadonlyArray<readonly [string, string]> = [
    ['reqId', reqId],
    ['slug', slug],
  ];
  for (const [label, value] of segments) {
    if (SLUG_RE.test(value)) {
      continue;
    }
    throw new Error(`evidenceShot: invalid ${label} "${value}" (must match ${SLUG_RE})`);
  }

  const screenshotsDir = path.join(process.cwd(), 'compliance/evidence', reqId, 'screenshots');
  const captureWholePage = opts.fullPage ?? true;

  await page.screenshot({
    path: path.join(screenshotsDir, `${slug}.png`),
    fullPage: captureWholePage,
  });
}
|