forgecraft-mcp 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/README.md +525 -525
  2. package/dist/artifacts/commit-hooks.d.ts +1 -1
  3. package/dist/artifacts/commit-hooks.d.ts.map +1 -1
  4. package/dist/artifacts/commit-hooks.js +2 -0
  5. package/dist/artifacts/commit-hooks.js.map +1 -1
  6. package/dist/cli/commands.d.ts +35 -1
  7. package/dist/cli/commands.d.ts.map +1 -1
  8. package/dist/cli/commands.js +109 -2
  9. package/dist/cli/commands.js.map +1 -1
  10. package/dist/cli/help.d.ts.map +1 -1
  11. package/dist/cli/help.js +51 -44
  12. package/dist/cli/help.js.map +1 -1
  13. package/dist/cli.d.ts.map +1 -1
  14. package/dist/cli.js +10 -1
  15. package/dist/cli.js.map +1 -1
  16. package/dist/registry/renderer-skeletons.js +92 -92
  17. package/dist/shared/gs-score-logger.js +6 -6
  18. package/dist/shared/result-utils.d.ts +27 -0
  19. package/dist/shared/result-utils.d.ts.map +1 -0
  20. package/dist/shared/result-utils.js +41 -0
  21. package/dist/shared/result-utils.js.map +1 -0
  22. package/dist/tools/add-module.js +123 -123
  23. package/dist/tools/advice-registry.js +18 -18
  24. package/dist/tools/check-cascade-report.js +64 -64
  25. package/dist/tools/close-cycle-helpers.d.ts +21 -2
  26. package/dist/tools/close-cycle-helpers.d.ts.map +1 -1
  27. package/dist/tools/close-cycle-helpers.js +66 -10
  28. package/dist/tools/close-cycle-helpers.js.map +1 -1
  29. package/dist/tools/close-cycle.d.ts +2 -2
  30. package/dist/tools/close-cycle.d.ts.map +1 -1
  31. package/dist/tools/close-cycle.js +1 -1
  32. package/dist/tools/close-cycle.js.map +1 -1
  33. package/dist/tools/configure-mcp.d.ts +3 -0
  34. package/dist/tools/configure-mcp.d.ts.map +1 -1
  35. package/dist/tools/configure-mcp.js +10 -0
  36. package/dist/tools/configure-mcp.js.map +1 -1
  37. package/dist/tools/consolidate-status.d.ts +81 -0
  38. package/dist/tools/consolidate-status.d.ts.map +1 -0
  39. package/dist/tools/consolidate-status.js +251 -0
  40. package/dist/tools/consolidate-status.js.map +1 -0
  41. package/dist/tools/forgecraft-dispatch.d.ts.map +1 -1
  42. package/dist/tools/forgecraft-dispatch.js +13 -0
  43. package/dist/tools/forgecraft-dispatch.js.map +1 -1
  44. package/dist/tools/forgecraft-router.d.ts +8 -0
  45. package/dist/tools/forgecraft-router.d.ts.map +1 -1
  46. package/dist/tools/forgecraft-router.js +21 -1
  47. package/dist/tools/forgecraft-router.js.map +1 -1
  48. package/dist/tools/forgecraft-schema-params.d.ts +13 -4
  49. package/dist/tools/forgecraft-schema-params.d.ts.map +1 -1
  50. package/dist/tools/forgecraft-schema-params.js +21 -0
  51. package/dist/tools/forgecraft-schema-params.js.map +1 -1
  52. package/dist/tools/forgecraft-schema.d.ts +14 -5
  53. package/dist/tools/forgecraft-schema.d.ts.map +1 -1
  54. package/dist/tools/forgecraft-schema.js +3 -0
  55. package/dist/tools/forgecraft-schema.js.map +1 -1
  56. package/dist/tools/gate-violations.d.ts +59 -0
  57. package/dist/tools/gate-violations.d.ts.map +1 -0
  58. package/dist/tools/gate-violations.js +152 -0
  59. package/dist/tools/gate-violations.js.map +1 -0
  60. package/dist/tools/generate-session-prompt.d.ts +3 -3
  61. package/dist/tools/generate-session-prompt.d.ts.map +1 -1
  62. package/dist/tools/generate-session-prompt.js +57 -15
  63. package/dist/tools/generate-session-prompt.js.map +1 -1
  64. package/dist/tools/refresh-output.js +14 -14
  65. package/dist/tools/roadmap-builder.d.ts.map +1 -1
  66. package/dist/tools/roadmap-builder.js +19 -9
  67. package/dist/tools/roadmap-builder.js.map +1 -1
  68. package/dist/tools/scaffold-spec-stubs.js +115 -115
  69. package/dist/tools/scaffold-templates.js +62 -62
  70. package/dist/tools/session-prompt-builders.d.ts.map +1 -1
  71. package/dist/tools/session-prompt-builders.js +34 -10
  72. package/dist/tools/session-prompt-builders.js.map +1 -1
  73. package/dist/tools/setup-artifact-writers.d.ts +30 -0
  74. package/dist/tools/setup-artifact-writers.d.ts.map +1 -1
  75. package/dist/tools/setup-artifact-writers.js +120 -8
  76. package/dist/tools/setup-artifact-writers.js.map +1 -1
  77. package/dist/tools/setup-phase1.d.ts +3 -0
  78. package/dist/tools/setup-phase1.d.ts.map +1 -1
  79. package/dist/tools/setup-phase1.js +79 -35
  80. package/dist/tools/setup-phase1.js.map +1 -1
  81. package/dist/tools/setup-phase2.d.ts +2 -0
  82. package/dist/tools/setup-phase2.d.ts.map +1 -1
  83. package/dist/tools/setup-phase2.js +10 -1
  84. package/dist/tools/setup-phase2.js.map +1 -1
  85. package/dist/tools/setup-project.d.ts +18 -0
  86. package/dist/tools/setup-project.d.ts.map +1 -1
  87. package/dist/tools/setup-project.js +77 -1
  88. package/dist/tools/setup-project.js.map +1 -1
  89. package/dist/tools/spec-parser-tags.d.ts +9 -0
  90. package/dist/tools/spec-parser-tags.d.ts.map +1 -1
  91. package/dist/tools/spec-parser-tags.js +92 -0
  92. package/dist/tools/spec-parser-tags.js.map +1 -1
  93. package/package.json +89 -86
  94. package/templates/analytics/instructions.yaml +37 -37
  95. package/templates/analytics/mcp-servers.yaml +11 -11
  96. package/templates/analytics/structure.yaml +25 -25
  97. package/templates/api/instructions.yaml +231 -231
  98. package/templates/api/mcp-servers.yaml +22 -13
  99. package/templates/api/nfr.yaml +23 -23
  100. package/templates/api/review.yaml +103 -103
  101. package/templates/api/structure.yaml +34 -34
  102. package/templates/api/verification.yaml +132 -132
  103. package/templates/cli/instructions.yaml +31 -31
  104. package/templates/cli/mcp-servers.yaml +11 -11
  105. package/templates/cli/review.yaml +53 -53
  106. package/templates/cli/structure.yaml +16 -16
  107. package/templates/data-lineage/instructions.yaml +28 -28
  108. package/templates/data-lineage/mcp-servers.yaml +22 -22
  109. package/templates/data-pipeline/instructions.yaml +84 -84
  110. package/templates/data-pipeline/mcp-servers.yaml +13 -13
  111. package/templates/data-pipeline/nfr.yaml +39 -39
  112. package/templates/data-pipeline/structure.yaml +23 -23
  113. package/templates/fintech/hooks.yaml +55 -55
  114. package/templates/fintech/instructions.yaml +112 -112
  115. package/templates/fintech/mcp-servers.yaml +13 -13
  116. package/templates/fintech/nfr.yaml +46 -46
  117. package/templates/fintech/playbook.yaml +210 -210
  118. package/templates/fintech/verification.yaml +239 -239
  119. package/templates/game/instructions.yaml +289 -289
  120. package/templates/game/mcp-servers.yaml +38 -38
  121. package/templates/game/nfr.yaml +64 -64
  122. package/templates/game/playbook.yaml +214 -214
  123. package/templates/game/review.yaml +97 -97
  124. package/templates/game/structure.yaml +67 -67
  125. package/templates/game/verification.yaml +174 -174
  126. package/templates/healthcare/instructions.yaml +42 -42
  127. package/templates/healthcare/mcp-servers.yaml +13 -13
  128. package/templates/healthcare/nfr.yaml +47 -47
  129. package/templates/hipaa/instructions.yaml +41 -41
  130. package/templates/hipaa/mcp-servers.yaml +13 -13
  131. package/templates/infra/instructions.yaml +104 -104
  132. package/templates/infra/mcp-servers.yaml +20 -20
  133. package/templates/infra/nfr.yaml +46 -46
  134. package/templates/infra/review.yaml +65 -65
  135. package/templates/infra/structure.yaml +25 -25
  136. package/templates/library/instructions.yaml +36 -36
  137. package/templates/library/mcp-servers.yaml +20 -20
  138. package/templates/library/review.yaml +56 -56
  139. package/templates/library/structure.yaml +19 -19
  140. package/templates/medallion-architecture/instructions.yaml +41 -41
  141. package/templates/medallion-architecture/mcp-servers.yaml +22 -22
  142. package/templates/ml/instructions.yaml +85 -85
  143. package/templates/ml/mcp-servers.yaml +11 -11
  144. package/templates/ml/nfr.yaml +39 -39
  145. package/templates/ml/structure.yaml +25 -25
  146. package/templates/ml/verification.yaml +156 -156
  147. package/templates/mobile/instructions.yaml +44 -44
  148. package/templates/mobile/mcp-servers.yaml +11 -11
  149. package/templates/mobile/nfr.yaml +49 -49
  150. package/templates/mobile/structure.yaml +27 -27
  151. package/templates/mobile/verification.yaml +121 -121
  152. package/templates/observability-xray/instructions.yaml +40 -40
  153. package/templates/observability-xray/mcp-servers.yaml +15 -15
  154. package/templates/realtime/instructions.yaml +42 -42
  155. package/templates/realtime/mcp-servers.yaml +13 -13
  156. package/templates/soc2/instructions.yaml +41 -41
  157. package/templates/soc2/mcp-servers.yaml +24 -24
  158. package/templates/social/instructions.yaml +43 -43
  159. package/templates/social/mcp-servers.yaml +24 -24
  160. package/templates/state-machine/instructions.yaml +42 -42
  161. package/templates/state-machine/mcp-servers.yaml +11 -11
  162. package/templates/tools-registry.yaml +164 -164
  163. package/templates/universal/hooks.yaml +723 -531
  164. package/templates/universal/instructions.yaml +1692 -1692
  165. package/templates/universal/mcp-servers.yaml +50 -50
  166. package/templates/universal/nfr.yaml +197 -197
  167. package/templates/universal/reference.yaml +326 -326
  168. package/templates/universal/review.yaml +204 -204
  169. package/templates/universal/skills.yaml +262 -262
  170. package/templates/universal/structure.yaml +67 -67
  171. package/templates/universal/verification.yaml +416 -416
  172. package/templates/web-react/hooks.yaml +44 -44
  173. package/templates/web-react/instructions.yaml +207 -207
  174. package/templates/web-react/mcp-servers.yaml +20 -20
  175. package/templates/web-react/nfr.yaml +27 -27
  176. package/templates/web-react/review.yaml +94 -94
  177. package/templates/web-react/structure.yaml +46 -46
  178. package/templates/web-react/verification.yaml +126 -126
  179. package/templates/web-static/instructions.yaml +115 -115
  180. package/templates/web-static/mcp-servers.yaml +20 -20
  181. package/templates/web3/instructions.yaml +44 -44
  182. package/templates/web3/mcp-servers.yaml +11 -11
  183. package/templates/web3/verification.yaml +159 -159
  184. package/templates/zero-trust/instructions.yaml +41 -41
  185. package/templates/zero-trust/mcp-servers.yaml +15 -15
@@ -1,416 +1,416 @@
1
- tag: UNIVERSAL
2
- section: verification
3
- title: "Contract-First Baseline Verification"
4
- description: >
5
- Applicable to every domain. Establishes type contracts, schema validation,
6
- unit-level assertions, and hardening gates as the completeness floor before
7
- any domain-specific verification is applied.
8
-
9
- Phases gate on release_phase: contract-definition + execution + evidence gates
10
- are always active. pre-release-hardening gates are blocking at pre-release.
11
- release-candidate gates are blocking at release-candidate.
12
- deployment-gates and post-deployment gates are blocking at production.
13
-
14
- S = 0.40 is achievable with the three base phases alone. Hardening phases
15
- raise the ceiling to 0.90 — domain strategies supply the remaining 0.10.
16
- cycle_model: |
17
- ForgeCraft verification maps to the GS paper's 4 loops + 1 independent cycle:
18
-
19
- LOOP 1 — INITIALIZATION (once per project)
20
- Runs: spec → architecture → constitution → ADRs → use cases
21
- Gate: derivability — stateless agent can build system from artifacts alone
22
- OWASP: N/A (no running service yet)
23
-
24
- LOOP 2 — INCREMENTAL / SHORT LOOP (per roadmap item, agile-style)
25
- Runs: partial spec → implement → verify → Status.md → human review
26
- Gate: tests pass + feature exercised at boundary + cascade complete
27
- OWASP ASVS Level 1: static checks enforced at every commit (execution.owasp-l1-static)
28
- Less deterministic — human in the loop guides direction at each iteration
29
-
30
- LOOP 3 — PRE-RELEASE (before environment promotion)
31
- Runs: mutation testing + DAST + load test + chaos resilience
32
- Gate: MSI ≥ 80%, zero HIGH DAST findings, p95 ≤ SLA
33
- OWASP ASVS Level 2: dynamic analysis, systematic verification (pre-release-hardening phase)
34
-
35
- LOOP 4 — HOTFIX (emergency patches)
36
- Runs: minimal fix → smoke tests → ADR + cascade immediately after stabilization
37
- Gate: smoke tests pass; ADR filed within 24h of stabilization
38
- OWASP ASVS Level 1 minimum; Level 2 if auth/data path affected
39
-
40
- LOOP 5 — DEPLOYMENT/HARDENING (independent, longer cadence)
41
- Runs: post-implementation, on its own schedule (weekly/monthly/per-major)
42
- Gate: pentest, full mutation, compatibility matrix, accessibility audit
43
- OWASP ASVS Level 3: penetration testing, BOLA/IDOR, session fixation
44
- Managed independently from feature loops — see deployment-gates and release-candidate phases
45
- uncertainty_levels:
46
- - deterministic
47
- completeness_ceiling: 0.90
48
-
49
- phases:
50
-
51
- - id: contract-definition
52
- title: "Define Contracts Before Code"
53
- rationale: >
54
- Every function, module boundary, and API surface must have a machine-checkable
55
- contract before implementation begins. This transforms implicit intent into
56
- explicit invariants the verify loop can check automatically.
57
- steps:
58
- - id: define-type-contracts
59
- instruction: >
60
- For every public function and class, define TypeScript types or Python type
61
- annotations for all parameters and return values. No `any`, no untyped
62
- function signatures. Run `tsc --noEmit` or `mypy` before proceeding.
63
- contract: >
64
- Zero type errors reported by the compiler. Every exported function has
65
- explicit parameter and return types.
66
- tools: ["tsc --noEmit", "mypy --strict"]
67
- expected_output: "Exit code 0 from type checker with zero errors"
68
- pass_criterion: "tsc --noEmit exits 0"
69
-
70
- - id: define-schema-contracts
71
- instruction: >
72
- For every external interface (HTTP request/response, config file, event payload,
73
- CLI flags), define a Zod schema (TypeScript) or Pydantic model (Python) colocated
74
- with the module that owns it. The schema must be the single source of truth —
75
- types must be derived from it, not maintained separately.
76
- contract: >
77
- All external interfaces have a named Zod schema or Pydantic model that
78
- is used at the entry point and is importable by tests.
79
- tools: ["zod", "pydantic"]
80
- expected_output: "Schema files present and exported from module barrel"
81
- pass_criterion: >
82
- grep -r 'z.object\|z.string\|BaseModel' src/ returns ≥1 result per external interface
83
-
84
- - id: define-error-contracts
85
- instruction: >
86
- For every module, define a custom error class that carries: message, operation name,
87
- and a timestamp. No bare `throw new Error(...)` in business logic. In tests, assert
88
- on the specific error class, not just the message string.
89
- contract: >
90
- Every module directory has an errors file. No bare `new Error()` in business logic
91
- files (src/ excluding test files).
92
- tools: ["grep", "eslint"]
93
- expected_output: "Custom error classes present; eslint no-throw-generic rule passes"
94
- pass_criterion: "grep -r 'throw new Error' src --include='*.ts' --exclude='*.test.ts' returns 0 matches"
95
-
96
- - id: execution
97
- title: "Run Type Checker + Unit Tests"
98
- rationale: >
99
- Contracts defined above are verified by automated tooling. This is the
100
- deterministic verify loop — no human judgment needed at this phase.
101
- steps:
102
- - id: run-type-check
103
- instruction: >
104
- Run the type checker in strict mode. Treat warnings as errors.
105
- All type errors must be resolved before running tests.
106
- contract: "Zero compiler errors in strict mode"
107
- tools: ["tsc --noEmit --strict", "mypy --strict"]
108
- expected_output: "Exit code 0"
109
- pass_criterion: "tsc --noEmit exits 0"
110
-
111
- - id: run-unit-tests
112
- instruction: >
113
- Run the full test suite. All tests must pass. Coverage must meet
114
- the project's coverage gate (≥80% line coverage overall, ≥90% on changed files).
115
- contract: "All tests pass; coverage gate met"
116
- tools: ["jest --runInBand --coverage", "pytest --cov"]
117
- expected_output: "Test results JSON + coverage report"
118
- pass_criterion: "Exit code 0; coverage ≥ 80% overall"
119
-
120
- - id: owasp-l1-static
121
- instruction: >
122
- Run static security analysis as part of the development loop.
123
- OWASP ASVS Level 1 checks that run at commit time:
124
- (1) Secrets detection — no credentials, tokens, or keys in source (pre-commit-secrets hook)
125
- (2) Dependency audit — no HIGH/CRITICAL CVEs in direct dependencies (audit command from forgecraft.yaml tools.audit)
126
- (3) Injection pattern scan — grep for string-concatenated SQL, shell exec with user input, eval() patterns
127
- (4) Hardcoded config scan — no URLs, IPs, ports, credentials in non-config files
128
- These run automatically via pre-commit hooks (see .claude/hooks/). No manual action required
129
- unless a finding is reported.
130
- contract: >
131
- Zero secrets detected. Zero HIGH/CRITICAL CVEs in dependencies.
132
- Zero string-concatenated SQL queries. Zero eval() with dynamic input.
133
- tools: ["pre-commit-secrets.sh", "forgecraft.yaml tools.audit", "pre-commit-prod-quality.sh"]
134
- expected_output: "Hook output: PASS or list of violations with file:line"
135
- pass_criterion: "All pre-commit hooks exit 0 on staged files"
136
- owasp_asvs_level: 1
137
-
138
- - id: evidence
139
- title: "Persist and Interpret Results"
140
- rationale: >
141
- Verification evidence must be persisted as artifacts so the verify loop
142
- can compare pass/fail state across iterations. Without persisted evidence,
143
- regressions go undetected.
144
- steps:
145
- - id: persist-coverage-report
146
- instruction: >
147
- Save the coverage report to coverage/ in the project root. If CI is
148
- configured, upload as a build artifact. Record the overall line coverage
149
- percentage in the session log.
150
- contract: "coverage/lcov.info or coverage/coverage.json exists after test run"
151
- tools: ["jest --coverage", "pytest-cov", "lcov"]
152
- expected_output: "coverage/ directory with lcov.info or JSON summary"
153
- pass_criterion: "coverage/ directory non-empty after test run"
154
-
155
- - id: record-type-error-count
156
- instruction: >
157
- Record the number of type errors before and after each verify loop pass.
158
- If the count does not decrease across passes, the fix prompt is not effective —
159
- escalate to human review.
160
- contract: "Type error count is 0 at end of final pass"
161
- tools: ["tsc --noEmit 2>&1 | grep 'error TS'"]
162
- expected_output: "Integer error count per pass"
163
- pass_criterion: "Error count = 0"
164
-
165
- - id: pre-release-hardening
166
- title: "Pre-Release Hardening Gates"
167
- rationale: >
168
- Before promoting to pre-release (beta/RC-candidate), the implementation
169
- must survive adversarial conditions it will encounter in production:
170
- mutant code that slips through tests, external attack vectors, load spikes,
171
- and infrastructure failures. These gates are advisory during development
172
- but blocking at release_phase = pre-release.
173
- release_phase_gate: pre-release
174
- steps:
175
- - id: mutation-testing
176
- instruction: >
177
- Run mutation testing on changed files using Stryker (JS/TS) or mutmut (Python).
178
- Mutation score on changed files must be ≥ 80%. A low mutation score means
179
- tests pass but do not actually catch logic errors — the test suite is checking
180
- the wrong things.
181
- Configuration: stryker.config.json or .stryker.conf.json at project root.
182
- Target only files modified in this release branch to keep runtime reasonable.
183
- If forgecraft.yaml has tools.mutation configured, run that command directly.
184
- The pre-commit hook will check mutation score if the tool is configured.
185
- contract: >
186
- Mutation score ≥ 80% on changed files. Stryker/mutmut exits with surviving-mutant
187
- count below the 20% threshold for changed lines.
188
- tools: ["stryker run", "mutmut run && mutmut results"]
189
- expected_output: "Stryker HTML report or mutmut results table with mutation score"
190
- pass_criterion: "Mutation score on changed files ≥ 80%"
191
- requires_human_review: true
192
-
193
- - id: dast-scan
194
- instruction: >
195
- Run OWASP ZAP dynamic analysis against a running instance of the service.
196
- Use the baseline scan for APIs: `docker run -t owasp/zap2docker-stable
197
- zap-api-scan.py -t <openapi-spec-url> -f openapi`.
198
- All HIGH severity findings must be resolved before pre-release.
199
- MEDIUM findings must be triaged (accepted with documented rationale or fixed).
200
- contract: >
201
- Zero HIGH severity findings from ZAP API scan.
202
- Every MEDIUM finding has a documented disposition (fixed or accepted with ADR).
203
- tools: ["owasp/zap2docker-stable", "zap-api-scan.py", "zap-baseline.py"]
204
- expected_output: "ZAP HTML or JSON report with findings by severity"
205
- pass_criterion: "Zero HIGH findings; all MEDIUM findings triaged"
206
- requires_human_review: true
207
- owasp_asvs_level: 2
208
-
209
- - id: load-test
210
- instruction: >
211
- Run a load test at 2× peak expected traffic using k6, Locust, or Artillery.
212
- Peak is defined as the highest observed or projected request rate in the
213
- production traffic model. Test duration: minimum 10 minutes.
214
- Accept criteria: p95 latency ≤ SLA threshold; error rate < 1%.
215
- contract: >
216
- Under 2× peak load: p95 response time ≤ SLA; error rate < 1%;
217
- no memory leak detected (heap stable across test duration).
218
- tools: ["k6 run", "locust", "artillery run"]
219
- expected_output: "k6/Locust/Artillery HTML or JSON summary with p50/p95/p99 and error rate"
220
- pass_criterion: "p95 ≤ SLA; error rate < 1% sustained over test duration"
221
- requires_human_review: true
222
-
223
- - id: chaos-resilience
224
- instruction: >
225
- Inject one network failure and one dependency failure using Toxiproxy,
226
- Chaos Monkey, or AWS Fault Injection Simulator.
227
- The service must degrade gracefully: return 503 with Retry-After header
228
- (not 500 or silent hang) when a downstream is unavailable.
229
- Recovery time from injected failure must be ≤ 30 seconds.
230
- contract: >
231
- Service returns structured error response (not 500) when dependency fails.
232
- Recovers automatically within 30 seconds of dependency restoration.
233
- No data corruption or stuck transactions during failure window.
234
- tools: ["toxiproxy-cli", "chaos-monkey", "aws fis"]
235
- expected_output: "Chaos test report showing failure injection + recovery timeline"
236
- pass_criterion: "Graceful degradation confirmed; recovery time ≤ 30s"
237
- requires_human_review: true
238
-
239
- - id: release-candidate
240
- title: "Release Candidate Gates"
241
- rationale: >
242
- The release candidate is the last checkpoint before production. It must pass
243
- security penetration testing, accessibility audit (if UI), full compatibility
244
- matrix validation, and mutation testing at the overall (not just changed-files)
245
- threshold. Human sign-off is required on all findings.
246
- release_phase_gate: release-candidate
247
- steps:
248
- - id: penetration-testing
249
- instruction: >
250
- Perform OWASP Top 10 penetration testing. At minimum cover:
251
- injection (SQL, command, LDAP), broken authentication (session fixation,
252
- token leakage, brute-force), sensitive data exposure (TLS config, response
253
- headers, error messages), broken access control (BOLA/IDOR, privilege
254
- escalation, JWT algorithm confusion), security misconfiguration (default
255
- creds, verbose errors), and XXE/XSS if a UI is present.
256
- Use OWASP ZAP active scan + manual testing for BOLA/IDOR.
257
- contract: >
258
- Zero Critical or High CVSS ≥ 7.0 findings unresolved.
259
- All Medium findings (CVSS 4.0–6.9) have documented disposition.
260
- OWASP Top 10 categories fully covered in test plan.
261
- tools: ["owasp/zap2docker-stable", "burpsuite", "sqlmap", "nikto"]
262
- expected_output: "Penetration test report with CVSS scores, reproduction steps, and remediation status"
263
- pass_criterion: "Zero unresolved High/Critical; all Medium triaged with ADR or fix"
264
- requires_human_review: true
265
- owasp_asvs_level: 3
266
-
267
- - id: mutation-testing-full
268
- instruction: >
269
- Run mutation testing against the full codebase (not just changed files).
270
- Overall mutation score must be ≥ 80%. This validates that the entire
271
- test suite is specification-grade, not just the code touched in this release.
272
- contract: "Overall mutation score ≥ 80% across full codebase"
273
- tools: ["stryker run --all-files", "mutmut run"]
274
- expected_output: "Full mutation testing report with per-module scores"
275
- pass_criterion: "Overall mutation score ≥ 80%"
276
- requires_human_review: false
277
-
278
- - id: compatibility-matrix
279
- instruction: >
280
- Validate the software against all supported runtime versions specified in
281
- package.json engines field (Node.js), pyproject.toml (Python), or equivalent.
282
- Run the full test suite against each supported major version.
283
- If a database is used, validate against all supported DB major versions.
284
- contract: >
285
- Full test suite passes on every supported runtime version in the
286
- declared compatibility matrix. Zero version-specific failures unreported.
287
- tools: ["nvm use", "pyenv", "matrix CI strategy"]
288
- expected_output: "CI matrix run report showing pass/fail per runtime version"
289
- pass_criterion: "All tests pass on all declared runtime versions"
290
- requires_human_review: false
291
-
292
- - id: accessibility-audit
293
- instruction: >
294
- If the project has a user interface: run axe-core or Lighthouse accessibility
295
- audit against all primary user journeys. WCAG 2.1 AA compliance required.
296
- If no UI, mark this step as N/A with justification.
297
- contract: "Zero WCAG 2.1 AA violations on primary user journeys, or step marked N/A with justification"
298
- tools: ["axe-core", "lighthouse --only-categories=accessibility", "pa11y"]
299
- expected_output: "axe/Lighthouse accessibility report or N/A justification"
300
- pass_criterion: "Zero critical/serious axe violations; Lighthouse accessibility score ≥ 90"
301
- requires_human_review: true
302
-
303
- - id: deployment-gates
304
- title: "Deployment Readiness Gates"
305
- rationale: >
306
- Before production deployment, the operational configuration must be validated:
307
- canary or rolling deploy is configured, rollback threshold is defined and
308
- tested, smoke tests cover the critical path, and observability is confirmed
309
- active. These gates prevent deploying code that cannot be safely rolled back.
310
- release_phase_gate: production
311
- steps:
312
- - id: canary-config
313
- instruction: >
314
- Confirm deployment strategy is configured (canary, blue-green, or rolling).
315
- For canary: define the traffic split percentage and success threshold before
316
- promoting to 100%. For rolling: define the max-surge and max-unavailable
317
- settings. Document the rollback trigger condition (error rate threshold,
318
- latency threshold, or health check failure count).
319
- contract: >
320
- Deployment strategy config present (k8s deployment, railway.json,
321
- fly.toml, ECS service, or equivalent). Rollback trigger threshold
322
- documented in runbook or infra config.
323
- tools: ["kubectl", "fly", "railway", "aws ecs"]
324
- expected_output: "Deployment config file + documented rollback threshold"
325
- pass_criterion: "Deploy config present; rollback threshold defined and reachable in runbook"
326
- requires_human_review: true
327
-
328
- - id: smoke-tests
329
- instruction: >
330
- Define and run smoke tests covering the top 3–5 critical user journeys.
331
- Smoke tests must be runnable against a live environment in < 60 seconds.
332
- For web UIs: use Playwright — navigate to the live URL, wait for async
333
- data to load, take a full-page screenshot, assert critical elements are
334
- present and non-empty. Playwright catches rendering bugs, broken async
335
- fetches, blank pages, and layout collapses that API-only tests miss entirely.
336
- For APIs: use Hurl or Newman for machine-readable output.
337
- These same tests run automatically post-deploy.
338
- contract: >
339
- Smoke test suite exists, runs in < 60 seconds, covers the critical path.
340
- Web UIs: Playwright script with element assertions + screenshot artifact.
341
- APIs: Hurl/Newman with status code + response schema assertions.
342
- All smoke tests pass against staging before deploy to production.
343
- tools: ["playwright", "newman run", "hurl", "k6 run --vus 1"]
344
- expected_output: "Smoke test results with pass/fail per check, screenshot for web UIs, total duration"
345
- pass_criterion: "All checks pass; screenshot confirms page renders correctly; total duration < 60s"
346
- requires_human_review: false
347
-
348
- - id: observability-verified
349
- instruction: >
350
- Confirm that structured logging, metrics, and alerting are active in the
351
- target environment. Required: log aggregation (Datadog, Splunk, CloudWatch,
352
- or equivalent), error rate alert (fires within 5 minutes of >1% error rate),
353
- and latency alert (fires within 5 minutes of p95 > SLA). Run a synthetic
354
- error and confirm the alert fires.
355
- contract: >
356
- Log aggregation active; error rate alert configured and tested;
357
- latency alert configured and tested. Alert runbook linked in deploy notes.
358
- tools: ["datadog", "cloudwatch", "grafana", "pagerduty"]
359
- expected_output: "Screenshot or API response showing alerts active + test alert fired"
360
- pass_criterion: "Synthetic error triggers alert within 5 minutes"
361
- requires_human_review: true
362
-
363
- - id: post-deployment
364
- title: "Post-Deployment Health Verification"
365
- rationale: >
366
- After production deployment, synthetic probes and observability must confirm
367
- that the new version is serving traffic correctly. The deployment is not
368
- complete until post-deployment verification passes. If it fails, rollback
369
- is the default response — not patching in production.
370
- release_phase_gate: production
371
- steps:
372
- - id: synthetic-health-probes
373
- instruction: >
374
- Run the smoke test suite against the production environment within 5 minutes
375
- of deploy completion. For web UIs: Playwright navigates to the live URL,
376
- waits for async data, asserts critical elements, and saves a screenshot.
377
- The screenshot is the proof artifact — view it after every deploy.
378
- If any check fails, trigger rollback immediately.
379
- contract: >
380
- All smoke tests pass against production within 5 minutes of deployment.
381
- Web UIs: screenshot artifact saved and reviewed. API: response schema valid.
382
- Rollback is initiated automatically or within 10 minutes if any smoke test fails.
383
- tools: ["newman run", "hurl", "k6 run --vus 1"]
384
- expected_output: "Post-deploy smoke test results with timestamp"
385
- pass_criterion: "All smoke tests pass against production; timestamp within 5 min of deploy"
386
- requires_human_review: false
387
-
388
- - id: error-rate-monitoring
389
- instruction: >
390
- Monitor the error rate for 30 minutes after deployment. Error rate must stay
391
- below the rollback threshold. Compare p95 latency against pre-deploy baseline
392
- (from load test results). If error rate spikes above threshold or latency
393
- degrades > 20% vs baseline, initiate rollback.
394
- contract: >
395
- Error rate stays below rollback threshold for 30 minutes post-deploy.
396
- p95 latency does not degrade > 20% vs pre-deploy baseline.
397
- tools: ["datadog", "cloudwatch", "grafana"]
398
- expected_output: "30-minute error rate and latency time series with baseline comparison"
399
- pass_criterion: "Error rate below threshold; latency within 20% of baseline for 30 min"
400
- requires_human_review: true
401
-
402
- - id: incident-runbook-verified
403
- instruction: >
404
- Confirm that the incident runbook for this service is current and accessible.
405
- The runbook must cover: how to roll back this version, how to escalate,
406
- how to check the canary status, and how to interpret the health dashboard.
407
- Have one team member who was not involved in the deployment review and
408
- confirm the runbook is complete.
409
- contract: >
410
- Incident runbook exists at docs/runbooks/<service>.md or equivalent.
411
- Runbook covers rollback, escalation, canary status, and dashboard links.
412
- Independent review confirmed by a second team member.
413
- tools: ["docs/runbooks/"]
414
- expected_output: "Runbook link + peer review confirmation"
415
- pass_criterion: "Runbook current; independent reviewer confirms completeness"
416
- requires_human_review: true
1
+ tag: UNIVERSAL
2
+ section: verification
3
+ title: "Contract-First Baseline Verification"
4
+ description: >
5
+ Applicable to every domain. Establishes type contracts, schema validation,
6
+ unit-level assertions, and hardening gates as the completeness floor before
7
+ any domain-specific verification is applied.
8
+
9
+ Phases gate on release_phase: contract-definition + execution + evidence gates
10
+ are always active. pre-release-hardening gates are blocking at pre-release.
11
+ release-candidate gates are blocking at release-candidate.
12
+ deployment-gates and post-deployment gates are blocking at production.
13
+
14
+ S = 0.40 is achievable with the three base phases alone. Hardening phases
15
+ raise the ceiling to 0.90 — domain strategies supply the remaining 0.10.
16
+ cycle_model: |
17
+ ForgeCraft verification maps to the GS paper's 4 loops + 1 independent cycle:
18
+
19
+ LOOP 1 — INITIALIZATION (once per project)
20
+ Runs: spec → architecture → constitution → ADRs → use cases
21
+ Gate: derivability — stateless agent can build system from artifacts alone
22
+ OWASP: N/A (no running service yet)
23
+
24
+ LOOP 2 — INCREMENTAL / SHORT LOOP (per roadmap item, agile-style)
25
+ Runs: partial spec → implement → verify → Status.md → human review
26
+ Gate: tests pass + feature exercised at boundary + cascade complete
27
+ OWASP ASVS Level 1: static checks enforced at every commit (execution.owasp-l1-static)
28
+ Less deterministic — human in the loop guides direction at each iteration
29
+
30
+ LOOP 3 — PRE-RELEASE (before environment promotion)
31
+ Runs: mutation testing + DAST + load test + chaos resilience
32
+ Gate: MSI ≥ 80%, zero HIGH DAST findings, p95 ≤ SLA
33
+ OWASP ASVS Level 2: dynamic analysis, systematic verification (pre-release-hardening phase)
34
+
35
+ LOOP 4 — HOTFIX (emergency patches)
36
+ Runs: minimal fix → smoke tests → ADR + cascade immediately after stabilization
37
+ Gate: smoke tests pass; ADR filed within 24h of stabilization
38
+ OWASP ASVS Level 1 minimum; Level 2 if auth/data path affected
39
+
40
+ LOOP 5 — DEPLOYMENT/HARDENING (independent, longer cadence)
41
+ Runs: post-implementation, on its own schedule (weekly/monthly/per-major)
42
+ Gate: pentest, full mutation, compatibility matrix, accessibility audit
43
+ OWASP ASVS Level 3: penetration testing, BOLA/IDOR, session fixation
44
+ Managed independently from feature loops — see deployment-gates and release-candidate phases
45
+ uncertainty_levels:
46
+ - deterministic
47
+ completeness_ceiling: 0.90
48
+
49
+ phases:
50
+
51
+ - id: contract-definition
52
+ title: "Define Contracts Before Code"
53
+ rationale: >
54
+ Every function, module boundary, and API surface must have a machine-checkable
55
+ contract before implementation begins. This transforms implicit intent into
56
+ explicit invariants the verify loop can check automatically.
57
+ steps:
58
+ - id: define-type-contracts
59
+ instruction: >
60
+ For every public function and class, define TypeScript types or Python type
61
+ annotations for all parameters and return values. No `any`, no untyped
62
+ function signatures. Run `tsc --noEmit` or `mypy` before proceeding.
63
+ contract: >
64
+ Zero type errors reported by the compiler. Every exported function has
65
+ explicit parameter and return types.
66
+ tools: ["tsc --noEmit", "mypy --strict"]
67
+ expected_output: "Exit code 0 from type checker with zero errors"
68
+ pass_criterion: "tsc --noEmit exits 0"
69
+
70
+ - id: define-schema-contracts
71
+ instruction: >
72
+ For every external interface (HTTP request/response, config file, event payload,
73
+ CLI flags), define a Zod schema (TypeScript) or Pydantic model (Python) colocated
74
+ with the module that owns it. The schema must be the single source of truth —
75
+ types must be derived from it, not maintained separately.
76
+ contract: >
77
+ All external interfaces have a named Zod schema or Pydantic model that
78
+ is used at the entry point and is importable by tests.
79
+ tools: ["zod", "pydantic"]
80
+ expected_output: "Schema files present and exported from module barrel"
81
+ pass_criterion: >
82
+ grep -r 'z.object\|z.string\|BaseModel' src/ returns ≥1 result per external interface
83
+
84
+ - id: define-error-contracts
85
+ instruction: >
86
+ For every module, define a custom error class that carries: message, operation name,
87
+ and a timestamp. No bare `throw new Error(...)` in business logic. In tests, assert
88
+ on the specific error class, not just the message string.
89
+ contract: >
90
+ Every module directory has an errors file. No bare `new Error()` in business logic
91
+ files (src/ excluding test files).
92
+ tools: ["grep", "eslint"]
93
+ expected_output: "Custom error classes present; eslint no-throw-generic rule passes"
94
+ pass_criterion: "grep -r 'throw new Error' src --include='*.ts' --exclude='*.test.ts' returns 0 matches"
95
+
96
+ - id: execution
97
+ title: "Run Type Checker + Unit Tests"
98
+ rationale: >
99
+ Contracts defined above are verified by automated tooling. This is the
100
+ deterministic verify loop — no human judgment needed at this phase.
101
+ steps:
102
+ - id: run-type-check
103
+ instruction: >
104
+ Run the type checker in strict mode. Treat warnings as errors.
105
+ All type errors must be resolved before running tests.
106
+ contract: "Zero compiler errors in strict mode"
107
+ tools: ["tsc --noEmit --strict", "mypy --strict"]
108
+ expected_output: "Exit code 0"
109
+ pass_criterion: "tsc --noEmit exits 0"
110
+
111
+ - id: run-unit-tests
112
+ instruction: >
113
+ Run the full test suite. All tests must pass. Coverage must meet
114
+ the project's coverage gate (≥80% line coverage overall, ≥90% on changed files).
115
+ contract: "All tests pass; coverage gate met"
116
+ tools: ["jest --runInBand --coverage", "pytest --cov"]
117
+ expected_output: "Test results JSON + coverage report"
118
+ pass_criterion: "Exit code 0; coverage ≥ 80% overall"
119
+
120
+ - id: owasp-l1-static
121
+ instruction: >
122
+ Run static security analysis as part of the development loop.
123
+ OWASP ASVS Level 1 checks that run at commit time:
124
+ (1) Secrets detection — no credentials, tokens, or keys in source (pre-commit-secrets hook)
125
+ (2) Dependency audit — no HIGH/CRITICAL CVEs in direct dependencies (audit command from forgecraft.yaml tools.audit)
126
+ (3) Injection pattern scan — grep for string-concatenated SQL, shell exec with user input, eval() patterns
127
+ (4) Hardcoded config scan — no URLs, IPs, ports, credentials in non-config files
128
+ These run automatically via pre-commit hooks (see .claude/hooks/). No manual action required
129
+ unless a finding is reported.
130
+ contract: >
131
+ Zero secrets detected. Zero HIGH/CRITICAL CVEs in dependencies.
132
+ Zero string-concatenated SQL queries. Zero eval() with dynamic input.
133
+ tools: ["pre-commit-secrets.sh", "forgecraft.yaml tools.audit", "pre-commit-prod-quality.sh"]
134
+ expected_output: "Hook output: PASS or list of violations with file:line"
135
+ pass_criterion: "All pre-commit hooks exit 0 on staged files"
136
+ owasp_asvs_level: 1
137
+
138
+ - id: evidence
139
+ title: "Persist and Interpret Results"
140
+ rationale: >
141
+ Verification evidence must be persisted as artifacts so the verify loop
142
+ can compare pass/fail state across iterations. Without persisted evidence,
143
+ regressions go undetected.
144
+ steps:
145
+ - id: persist-coverage-report
146
+ instruction: >
147
+ Save the coverage report to coverage/ in the project root. If CI is
148
+ configured, upload as a build artifact. Record the overall line coverage
149
+ percentage in the session log.
150
+ contract: "coverage/lcov.info or coverage/coverage.json exists after test run"
151
+ tools: ["jest --coverage", "pytest-cov", "lcov"]
152
+ expected_output: "coverage/ directory with lcov.info or JSON summary"
153
+ pass_criterion: "coverage/ directory non-empty after test run"
154
+
155
+ - id: record-type-error-count
156
+ instruction: >
157
+ Record the number of type errors before and after each verify loop pass.
158
+ If the count does not decrease across passes, the fix prompt is not effective —
159
+ escalate to human review.
160
+ contract: "Type error count is 0 at end of final pass"
161
+ tools: ["tsc --noEmit 2>&1 | grep 'error TS'"]
162
+ expected_output: "Integer error count per pass"
163
+ pass_criterion: "Error count = 0"
164
+
165
+ - id: pre-release-hardening
166
+ title: "Pre-Release Hardening Gates"
167
+ rationale: >
168
+ Before promoting to pre-release (beta/RC-candidate), the implementation
169
+ must survive adversarial conditions it will encounter in production:
170
+ mutant code that slips through tests, external attack vectors, load spikes,
171
+ and infrastructure failures. These gates are advisory during development
172
+ but blocking at release_phase = pre-release.
173
+ release_phase_gate: pre-release
174
+ steps:
175
+ - id: mutation-testing
176
+ instruction: >
177
+ Run mutation testing on changed files using Stryker (JS/TS) or mutmut (Python).
178
+ Mutation score on changed files must be ≥ 80%. A low mutation score means
179
+ tests pass but do not actually catch logic errors — the test suite is checking
180
+ the wrong things.
181
+ Configuration: stryker.config.json or .stryker.conf.json at project root.
182
+ Target only files modified in this release branch to keep runtime reasonable.
183
+ If forgecraft.yaml has tools.mutation configured, run that command directly.
184
+ The pre-commit hook will check mutation score if the tool is configured.
185
+ contract: >
186
+ Mutation score ≥ 80% on changed files. Stryker/mutmut exits with surviving-mutant
187
+ count below the 20% threshold for changed lines.
188
+ tools: ["stryker run", "mutmut run && mutmut results"]
189
+ expected_output: "Stryker HTML report or mutmut results table with mutation score"
190
+ pass_criterion: "Mutation score on changed files ≥ 80%"
191
+ requires_human_review: true
192
+
193
+ - id: dast-scan
194
+ instruction: >
195
+ Run OWASP ZAP dynamic analysis against a running instance of the service.
196
+ Use the API scan for APIs: `docker run -t owasp/zap2docker-stable
197
+ zap-api-scan.py -t <openapi-spec-url> -f openapi`.
198
+ All HIGH severity findings must be resolved before pre-release.
199
+ MEDIUM findings must be triaged (accepted with documented rationale or fixed).
200
+ contract: >
201
+ Zero HIGH severity findings from ZAP API scan.
202
+ Every MEDIUM finding has a documented disposition (fixed or accepted with ADR).
203
+ tools: ["owasp/zap2docker-stable", "zap-api-scan.py", "zap-baseline.py"]
204
+ expected_output: "ZAP HTML or JSON report with findings by severity"
205
+ pass_criterion: "Zero HIGH findings; all MEDIUM findings triaged"
206
+ requires_human_review: true
207
+ owasp_asvs_level: 2
208
+
209
+ - id: load-test
210
+ instruction: >
211
+ Run a load test at 2× peak expected traffic using k6, Locust, or Artillery.
212
+ Peak is defined as the highest observed or projected request rate in the
213
+ production traffic model. Test duration: minimum 10 minutes.
214
+ Accept criteria: p95 latency ≤ SLA threshold; error rate < 1%.
215
+ contract: >
216
+ Under 2× peak load: p95 response time ≤ SLA; error rate < 1%;
217
+ no memory leak detected (heap stable across test duration).
218
+ tools: ["k6 run", "locust", "artillery run"]
219
+ expected_output: "k6/Locust/Artillery HTML or JSON summary with p50/p95/p99 and error rate"
220
+ pass_criterion: "p95 ≤ SLA; error rate < 1% sustained over test duration"
221
+ requires_human_review: true
222
+
223
+ - id: chaos-resilience
224
+ instruction: >
225
+ Inject one network failure and one dependency failure using Toxiproxy,
226
+ Chaos Monkey, or AWS Fault Injection Simulator.
227
+ The service must degrade gracefully: return 503 with Retry-After header
228
+ (not 500 or silent hang) when a downstream is unavailable.
229
+ Recovery time from injected failure must be ≤ 30 seconds.
230
+ contract: >
231
+ Service returns structured error response (not 500) when dependency fails.
232
+ Recovers automatically within 30 seconds of dependency restoration.
233
+ No data corruption or stuck transactions during failure window.
234
+ tools: ["toxiproxy-cli", "chaos-monkey", "aws fis"]
235
+ expected_output: "Chaos test report showing failure injection + recovery timeline"
236
+ pass_criterion: "Graceful degradation confirmed; recovery time ≤ 30s"
237
+ requires_human_review: true
238
+
239
+ - id: release-candidate
240
+ title: "Release Candidate Gates"
241
+ rationale: >
242
+ The release candidate is the last checkpoint before production. It must pass
243
+ security penetration testing, accessibility audit (if UI), full compatibility
244
+ matrix validation, and mutation testing at the overall (not just changed-files)
245
+ threshold. Human sign-off is required on all findings.
246
+ release_phase_gate: release-candidate
247
+ steps:
248
+ - id: penetration-testing
249
+ instruction: >
250
+ Perform OWASP Top 10 penetration testing. At minimum cover:
251
+ injection (SQL, command, LDAP), broken authentication (session fixation,
252
+ token leakage, brute-force), sensitive data exposure (TLS config, response
253
+ headers, error messages), broken access control (BOLA/IDOR, privilege
254
+ escalation, JWT algorithm confusion), security misconfiguration (default
255
+ creds, verbose errors), and XXE/XSS if a UI is present.
256
+ Use OWASP ZAP active scan + manual testing for BOLA/IDOR.
257
+ contract: >
258
+ Zero Critical or High CVSS ≥ 7.0 findings unresolved.
259
+ All Medium findings (CVSS 4.0–6.9) have documented disposition.
260
+ OWASP Top 10 categories fully covered in test plan.
261
+ tools: ["owasp/zap2docker-stable", "burpsuite", "sqlmap", "nikto"]
262
+ expected_output: "Penetration test report with CVSS scores, reproduction steps, and remediation status"
263
+ pass_criterion: "Zero unresolved High/Critical; all Medium triaged with ADR or fix"
264
+ requires_human_review: true
265
+ owasp_asvs_level: 3
266
+
267
+ - id: mutation-testing-full
268
+ instruction: >
269
+ Run mutation testing against the full codebase (not just changed files).
270
+ Overall mutation score must be ≥ 80%. This validates that the entire
271
+ test suite is specification-grade, not just the code touched in this release.
272
+ contract: "Overall mutation score ≥ 80% across full codebase"
273
+ tools: ["stryker run --all-files", "mutmut run"]
274
+ expected_output: "Full mutation testing report with per-module scores"
275
+ pass_criterion: "Overall mutation score ≥ 80%"
276
+ requires_human_review: false
277
+
278
+ - id: compatibility-matrix
279
+ instruction: >
280
+ Validate the software against all supported runtime versions specified in
281
+ package.json engines field (Node.js), pyproject.toml (Python), or equivalent.
282
+ Run the full test suite against each supported major version.
283
+ If a database is used, validate against all supported DB major versions.
284
+ contract: >
285
+ Full test suite passes on every supported runtime version in the
286
+ declared compatibility matrix. Zero version-specific failures unreported.
287
+ tools: ["nvm use", "pyenv", "matrix CI strategy"]
288
+ expected_output: "CI matrix run report showing pass/fail per runtime version"
289
+ pass_criterion: "All tests pass on all declared runtime versions"
290
+ requires_human_review: false
291
+
292
+ - id: accessibility-audit
293
+ instruction: >
294
+ If the project has a user interface: run axe-core or Lighthouse accessibility
295
+ audit against all primary user journeys. WCAG 2.1 AA compliance required.
296
+ If no UI, mark this step as N/A with justification.
297
+ contract: "Zero WCAG 2.1 AA violations on primary user journeys, or step marked N/A with justification"
298
+ tools: ["axe-core", "lighthouse --only-categories=accessibility", "pa11y"]
299
+ expected_output: "axe/Lighthouse accessibility report or N/A justification"
300
+ pass_criterion: "Zero critical/serious axe violations; Lighthouse accessibility score ≥ 90"
301
+ requires_human_review: true
302
+
303
+ - id: deployment-gates
304
+ title: "Deployment Readiness Gates"
305
+ rationale: >
306
+ Before production deployment, the operational configuration must be validated:
307
+ canary or rolling deploy is configured, rollback threshold is defined and
308
+ tested, smoke tests cover the critical path, and observability is confirmed
309
+ active. These gates prevent deploying code that cannot be safely rolled back.
310
+ release_phase_gate: production
311
+ steps:
312
+ - id: canary-config
313
+ instruction: >
314
+ Confirm deployment strategy is configured (canary, blue-green, or rolling).
315
+ For canary: define the traffic split percentage and success threshold before
316
+ promoting to 100%. For rolling: define the max-surge and max-unavailable
317
+ settings. Document the rollback trigger condition (error rate threshold,
318
+ latency threshold, or health check failure count).
319
+ contract: >
320
+ Deployment strategy config present (k8s deployment, railway.json,
321
+ fly.toml, ECS service, or equivalent). Rollback trigger threshold
322
+ documented in runbook or infra config.
323
+ tools: ["kubectl", "fly", "railway", "aws ecs"]
324
+ expected_output: "Deployment config file + documented rollback threshold"
325
+ pass_criterion: "Deploy config present; rollback threshold defined and reachable in runbook"
326
+ requires_human_review: true
327
+
328
+ - id: smoke-tests
329
+ instruction: >
330
+ Define and run smoke tests covering the top 3–5 critical user journeys.
331
+ Smoke tests must be runnable against a live environment in < 60 seconds.
332
+ For web UIs: use Playwright — navigate to the live URL, wait for async
333
+ data to load, take a full-page screenshot, assert critical elements are
334
+ present and non-empty. Playwright catches rendering bugs, broken async
335
+ fetches, blank pages, and layout collapses that API-only tests miss entirely.
336
+ For APIs: use Hurl or Newman for machine-readable output.
337
+ These same tests run automatically post-deploy.
338
+ contract: >
339
+ Smoke test suite exists, runs in < 60 seconds, covers the critical path.
340
+ Web UIs: Playwright script with element assertions + screenshot artifact.
341
+ APIs: Hurl/Newman with status code + response schema assertions.
342
+ All smoke tests pass against staging before deploy to production.
343
+ tools: ["playwright", "newman run", "hurl", "k6 run --vus 1"]
344
+ expected_output: "Smoke test results with pass/fail per check, screenshot for web UIs, total duration"
345
+ pass_criterion: "All checks pass; screenshot confirms page renders correctly; total duration < 60s"
346
+ requires_human_review: false
347
+
348
+ - id: observability-verified
349
+ instruction: >
350
+ Confirm that structured logging, metrics, and alerting are active in the
351
+ target environment. Required: log aggregation (Datadog, Splunk, CloudWatch,
352
+ or equivalent), error rate alert (fires within 5 minutes of >1% error rate),
353
+ and latency alert (fires within 5 minutes of p95 > SLA). Run a synthetic
354
+ error and confirm the alert fires.
355
+ contract: >
356
+ Log aggregation active; error rate alert configured and tested;
357
+ latency alert configured and tested. Alert runbook linked in deploy notes.
358
+ tools: ["datadog", "cloudwatch", "grafana", "pagerduty"]
359
+ expected_output: "Screenshot or API response showing alerts active + test alert fired"
360
+ pass_criterion: "Synthetic error triggers alert within 5 minutes"
361
+ requires_human_review: true
362
+
363
+ - id: post-deployment
364
+ title: "Post-Deployment Health Verification"
365
+ rationale: >
366
+ After production deployment, synthetic probes and observability must confirm
367
+ that the new version is serving traffic correctly. The deployment is not
368
+ complete until post-deployment verification passes. If it fails, rollback
369
+ is the default response — not patching in production.
370
+ release_phase_gate: production
371
+ steps:
372
+ - id: synthetic-health-probes
373
+ instruction: >
374
+ Run the smoke test suite against the production environment within 5 minutes
375
+ of deploy completion. For web UIs: Playwright navigates to the live URL,
376
+ waits for async data, asserts critical elements, and saves a screenshot.
377
+ The screenshot is the proof artifact — view it after every deploy.
378
+ If any check fails, trigger rollback immediately.
379
+ contract: >
380
+ All smoke tests pass against production within 5 minutes of deployment.
381
+ Web UIs: screenshot artifact saved and reviewed. API: response schema valid.
382
+ Rollback is initiated automatically or within 10 minutes if any smoke test fails.
383
+ tools: ["playwright", "newman run", "hurl", "k6 run --vus 1"]
384
+ expected_output: "Post-deploy smoke test results with timestamp"
385
+ pass_criterion: "All smoke tests pass against production; timestamp within 5 min of deploy"
386
+ requires_human_review: false
387
+
388
+ - id: error-rate-monitoring
389
+ instruction: >
390
+ Monitor the error rate for 30 minutes after deployment. Error rate must stay
391
+ below the rollback threshold. Compare p95 latency against pre-deploy baseline
392
+ (from load test results). If error rate spikes above threshold or latency
393
+ degrades > 20% vs baseline, initiate rollback.
394
+ contract: >
395
+ Error rate stays below rollback threshold for 30 minutes post-deploy.
396
+ p95 latency does not degrade > 20% vs pre-deploy baseline.
397
+ tools: ["datadog", "cloudwatch", "grafana"]
398
+ expected_output: "30-minute error rate and latency time series with baseline comparison"
399
+ pass_criterion: "Error rate below threshold; latency within 20% of baseline for 30 min"
400
+ requires_human_review: true
401
+
402
+ - id: incident-runbook-verified
403
+ instruction: >
404
+ Confirm that the incident runbook for this service is current and accessible.
405
+ The runbook must cover: how to roll back this version, how to escalate,
406
+ how to check the canary status, and how to interpret the health dashboard.
407
+ Have one team member who was not involved in the deployment review
408
+ the runbook and confirm it is complete.
409
+ contract: >
410
+ Incident runbook exists at docs/runbooks/<service>.md or equivalent.
411
+ Runbook covers rollback, escalation, canary status, and dashboard links.
412
+ Independent review confirmed by a second team member.
413
+ tools: ["docs/runbooks/"]
414
+ expected_output: "Runbook link + peer review confirmation"
415
+ pass_criterion: "Runbook current; independent reviewer confirms completeness"
416
+ requires_human_review: true