universal-dev-standards 5.4.0 → 5.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. package/bundled/ai/options/testing/integration-testing.ai.yaml +2 -2
  2. package/bundled/ai/options/testing/unit-testing.ai.yaml +2 -2
  3. package/bundled/ai/standards/adversarial-test.ai.yaml +277 -0
  4. package/bundled/ai/standards/audit-trail.ai.yaml +113 -0
  5. package/bundled/ai/standards/browser-compatibility-standards.ai.yaml +63 -0
  6. package/bundled/ai/standards/chaos-injection-tests.ai.yaml +91 -0
  7. package/bundled/ai/standards/container-image-standards.ai.yaml +88 -0
  8. package/bundled/ai/standards/container-security.ai.yaml +331 -0
  9. package/bundled/ai/standards/contract-testing-standards.ai.yaml +62 -0
  10. package/bundled/ai/standards/cost-budget-test.ai.yaml +96 -0
  11. package/bundled/ai/standards/cross-flow-regression.ai.yaml +61 -0
  12. package/bundled/ai/standards/data-contract.ai.yaml +110 -0
  13. package/bundled/ai/standards/data-migration-testing.ai.yaml +96 -0
  14. package/bundled/ai/standards/data-pipeline.ai.yaml +113 -0
  15. package/bundled/ai/standards/disaster-recovery-drill.ai.yaml +89 -0
  16. package/bundled/ai/standards/flaky-test-management.ai.yaml +89 -0
  17. package/bundled/ai/standards/flow-based-testing.ai.yaml +240 -0
  18. package/bundled/ai/standards/full-coverage-testing.ai.yaml +192 -0
  19. package/bundled/ai/standards/iac-design-principles.ai.yaml +83 -0
  20. package/bundled/ai/standards/incident-response.ai.yaml +107 -0
  21. package/bundled/ai/standards/license-compliance.ai.yaml +106 -0
  22. package/bundled/ai/standards/llm-output-validation.ai.yaml +269 -0
  23. package/bundled/ai/standards/mock-boundary.ai.yaml +250 -0
  24. package/bundled/ai/standards/mutation-testing.ai.yaml +192 -0
  25. package/bundled/ai/standards/pii-classification.ai.yaml +109 -0
  26. package/bundled/ai/standards/policy-as-code-testing.ai.yaml +227 -0
  27. package/bundled/ai/standards/prd-standards.ai.yaml +88 -0
  28. package/bundled/ai/standards/product-metrics-standards.ai.yaml +111 -0
  29. package/bundled/ai/standards/prompt-regression.ai.yaml +94 -0
  30. package/bundled/ai/standards/property-based-testing.ai.yaml +105 -0
  31. package/bundled/ai/standards/release-quality-manifest.ai.yaml +135 -0
  32. package/bundled/ai/standards/release-readiness-gate.ai.yaml +77 -0
  33. package/bundled/ai/standards/replay-test.ai.yaml +111 -0
  34. package/bundled/ai/standards/runbook.ai.yaml +104 -0
  35. package/bundled/ai/standards/sast-advanced.ai.yaml +135 -0
  36. package/bundled/ai/standards/schema-evolution.ai.yaml +111 -0
  37. package/bundled/ai/standards/secret-management-standards.ai.yaml +105 -0
  38. package/bundled/ai/standards/secure-op.ai.yaml +365 -0
  39. package/bundled/ai/standards/security-testing.ai.yaml +171 -0
  40. package/bundled/ai/standards/server-ops-security.ai.yaml +274 -0
  41. package/bundled/ai/standards/slo-sli.ai.yaml +97 -0
  42. package/bundled/ai/standards/smoke-test.ai.yaml +87 -0
  43. package/bundled/ai/standards/supply-chain-attestation.ai.yaml +109 -0
  44. package/bundled/ai/standards/test-completeness-dimensions.ai.yaml +52 -5
  45. package/bundled/ai/standards/testing.ai.yaml +20 -13
  46. package/bundled/ai/standards/user-story-mapping.ai.yaml +108 -0
  47. package/bundled/core/accessibility-standards.md +58 -0
  48. package/bundled/core/adversarial-test.md +212 -0
  49. package/bundled/core/branch-completion.md +4 -0
  50. package/bundled/core/browser-compatibility-standards.md +220 -0
  51. package/bundled/core/chaos-injection-tests.md +116 -0
  52. package/bundled/core/checkin-standards.md +1 -0
  53. package/bundled/core/container-security.md +521 -0
  54. package/bundled/core/contract-testing-standards.md +182 -0
  55. package/bundled/core/cost-budget-test.md +69 -0
  56. package/bundled/core/cross-flow-regression.md +190 -0
  57. package/bundled/core/data-migration-testing.md +110 -0
  58. package/bundled/core/disaster-recovery-drill.md +73 -0
  59. package/bundled/core/flaky-test-management.md +73 -0
  60. package/bundled/core/flow-based-testing.md +275 -0
  61. package/bundled/core/full-coverage-testing.md +183 -0
  62. package/bundled/core/llm-output-validation.md +178 -0
  63. package/bundled/core/mock-boundary.md +100 -0
  64. package/bundled/core/mutation-testing.md +97 -0
  65. package/bundled/core/performance-standards.md +65 -0
  66. package/bundled/core/policy-as-code-testing.md +188 -0
  67. package/bundled/core/prompt-regression.md +72 -0
  68. package/bundled/core/property-based-testing.md +73 -0
  69. package/bundled/core/release-quality-manifest.md +193 -0
  70. package/bundled/core/release-readiness-gate.md +184 -0
  71. package/bundled/core/replay-test.md +86 -0
  72. package/bundled/core/sast-advanced.md +300 -0
  73. package/bundled/core/secure-op.md +314 -0
  74. package/bundled/core/security-testing.md +87 -0
  75. package/bundled/core/server-ops-security.md +493 -0
  76. package/bundled/core/smoke-test.md +65 -0
  77. package/bundled/core/supply-chain-attestation.md +117 -0
  78. package/bundled/locales/zh-CN/CHANGELOG.md +3 -3
  79. package/bundled/locales/zh-CN/README.md +1 -1
  80. package/bundled/locales/zh-CN/skills/ai-instruction-standards/SKILL.md +5 -5
  81. package/bundled/locales/zh-TW/CHANGELOG.md +3 -3
  82. package/bundled/locales/zh-TW/README.md +1 -1
  83. package/bundled/locales/zh-TW/core/browser-compatibility-standards.md +11 -0
  84. package/bundled/locales/zh-TW/core/contract-testing-standards.md +11 -0
  85. package/bundled/locales/zh-TW/core/cross-flow-regression.md +11 -0
  86. package/bundled/locales/zh-TW/core/release-readiness-gate.md +11 -0
  87. package/bundled/locales/zh-TW/skills/ai-instruction-standards/SKILL.md +183 -79
  88. package/bundled/skills/README.md +4 -3
  89. package/bundled/skills/SKILL_NAMING.md +94 -0
  90. package/bundled/skills/ai-instruction-standards/SKILL.md +181 -88
  91. package/bundled/skills/atdd-assistant/SKILL.md +8 -0
  92. package/bundled/skills/bdd-assistant/SKILL.md +7 -0
  93. package/bundled/skills/checkin-assistant/SKILL.md +8 -0
  94. package/bundled/skills/code-review-assistant/SKILL.md +7 -0
  95. package/bundled/skills/journey-test-assistant/SKILL.md +203 -0
  96. package/bundled/skills/orchestrate/SKILL.md +167 -0
  97. package/bundled/skills/plan/SKILL.md +234 -0
  98. package/bundled/skills/pr-automation-assistant/SKILL.md +8 -0
  99. package/bundled/skills/push/SKILL.md +49 -2
  100. package/bundled/skills/{process-automation → skill-builder}/SKILL.md +1 -1
  101. package/bundled/skills/{forward-derivation → spec-derivation}/SKILL.md +1 -1
  102. package/bundled/skills/spec-driven-dev/SKILL.md +7 -0
  103. package/bundled/skills/sweep/SKILL.md +145 -0
  104. package/bundled/skills/tdd-assistant/SKILL.md +7 -0
  105. package/package.json +6 -6
  106. package/src/commands/check.js +43 -0
  107. package/src/commands/flow.js +8 -0
  108. package/src/commands/init.js +2 -1
  109. package/src/commands/start.js +14 -0
  110. package/src/commands/sweep.js +8 -0
  111. package/src/commands/update.js +10 -0
  112. package/src/commands/workflow.js +8 -0
  113. package/standards-registry.json +483 -5
  114. package/bundled/locales/zh-CN/skills/ac-coverage-assistant/SKILL.md +0 -190
  115. package/bundled/locales/zh-CN/skills/forward-derivation/SKILL.md +0 -71
  116. package/bundled/locales/zh-CN/skills/forward-derivation/guide.md +0 -130
  117. package/bundled/locales/zh-CN/skills/methodology-system/SKILL.md +0 -88
  118. package/bundled/locales/zh-CN/skills/methodology-system/create-methodology.md +0 -350
  119. package/bundled/locales/zh-CN/skills/methodology-system/guide.md +0 -131
  120. package/bundled/locales/zh-CN/skills/methodology-system/runtime.md +0 -279
  121. package/bundled/locales/zh-CN/skills/process-automation/SKILL.md +0 -143
  122. package/bundled/locales/zh-TW/skills/ac-coverage-assistant/SKILL.md +0 -195
  123. package/bundled/locales/zh-TW/skills/deploy-assistant/SKILL.md +0 -178
  124. package/bundled/locales/zh-TW/skills/forward-derivation/SKILL.md +0 -69
  125. package/bundled/locales/zh-TW/skills/forward-derivation/guide.md +0 -415
  126. package/bundled/locales/zh-TW/skills/methodology-system/SKILL.md +0 -86
  127. package/bundled/locales/zh-TW/skills/methodology-system/create-methodology.md +0 -350
  128. package/bundled/locales/zh-TW/skills/methodology-system/guide.md +0 -131
  129. package/bundled/locales/zh-TW/skills/methodology-system/runtime.md +0 -279
  130. package/bundled/locales/zh-TW/skills/process-automation/SKILL.md +0 -144
  131. package/bundled/skills/{ac-coverage-assistant → ac-coverage}/SKILL.md +0 -0
  132. package/bundled/skills/{methodology-system → dev-methodology}/SKILL.md +0 -0
  133. package/bundled/skills/{methodology-system → dev-methodology}/create-methodology.md +0 -0
  134. package/bundled/skills/{methodology-system → dev-methodology}/guide.md +0 -0
  135. package/bundled/skills/{methodology-system → dev-methodology}/integrated-flow.md +0 -0
  136. package/bundled/skills/{methodology-system → dev-methodology}/prerequisite-check.md +0 -0
  137. package/bundled/skills/{methodology-system → dev-methodology}/runtime.md +0 -0
  138. package/bundled/skills/{forward-derivation → spec-derivation}/guide.md +0 -0
package/bundled/ai/standards/product-metrics-standards.ai.yaml
@@ -0,0 +1,111 @@
+ # Product Metrics Standards - AI Optimized
+ # Source: XSPEC-069 Wave 4 Product Layer Pack
+
+ id: product-metrics-standards
+ title: Product Metrics Framework Standards
+ version: "1.0.0"
+ status: Active
+ tags: [product, metrics, kpi, aarrr, heart, north-star, analytics]
+ summary: |
+   Defines how teams select, structure, and govern product metrics. Covers
+   a framework selection matrix (AARRR for growth products, HEART for
+   experience products, custom North Star for platforms), North Star metric
+   criteria, a three-level metric hierarchy (North Star → L1 drivers →
+   L2 diagnostics), and an anti-vanity rule that rejects metrics decoupled
+   from revenue or retention impact. Designed to align teams around metrics
+   that drive meaningful product decisions rather than activity tracking.
+
+ requirements:
+   - id: REQ-001
+     title: Framework Selection Matrix
+     description: |
+       Teams MUST select a primary metrics framework appropriate to their
+       product type. Selection criteria: (1) Growth products (consumer apps,
+       marketplaces, viral products focused on user acquisition and monetization)
+       → use AARRR framework (Acquisition, Activation, Retention, Referral,
+       Revenue). (2) Experience products (productivity tools, B2B SaaS, apps
+       where user satisfaction and task completion drive retention) → use
+       HEART framework (Happiness, Engagement, Adoption, Retention, Task
+       Success). (3) Platform products (developer platforms, APIs, infrastructure
+       products with diverse use cases) → define a custom North Star metric
+       that reflects platform value delivered, supplemented by AARRR or HEART
+       components as applicable. Framework selection MUST be documented in the
+       PRD or product strategy document.
+     level: MUST
+     examples:
+       - "Consumer social app → AARRR; primary focus on D7 retention and referral coefficient"
+       - "B2B project management tool → HEART; primary focus on Task Success and Retention"
+       - "Developer API platform → custom North Star: 'API calls per active developer per week'"
+       - "Framework documented in product-metrics.md: 'We use HEART because...'"
+
+   - id: REQ-002
+     title: North Star Criteria
+     description: |
+       Every product MUST define exactly one North Star metric that satisfies
+       all four criteria: (1) Leading indicator — it predicts future business
+       health (revenue, retention) rather than measuring past outcomes.
+       (2) Measurable and trackable — it can be calculated from existing or
+       easily obtainable data with a defined measurement cadence (weekly
+       or monthly). (3) Actionable by the team — the product team has direct
+       levers to influence it through feature development and UX decisions.
+       (4) Explainable in one sentence — any team member can describe what
+       it measures and why it matters without needing additional context.
+       North Star MUST be reviewed and reconfirmed at each annual product
+       planning cycle.
+     level: MUST
+     examples:
+       - "Spotify: 'Time spent listening per user per week' (leading, measurable, actionable)"
+       - "Airbnb: 'Nights booked per month' (explainable, predicts revenue)"
+       - "Weak North Star: 'Total revenue' (lagging, not directly actionable by product team)"
+       - "Annual review: North Star unchanged but L1 driver metrics updated for new product area"
+
+   - id: REQ-003
+     title: Metric Hierarchy
+     description: |
+       Teams MUST structure metrics in a three-level hierarchy with a maximum
+       of three levels. Level 1 (North Star): one metric representing overall
+       product value delivered. Level 2 (L1 Driver Metrics): 3–5 metrics that
+       directly influence the North Star; each driver metric must have a
+       documented causal hypothesis linking it to the North Star. Level 3
+       (L2 Diagnostic Metrics): per-feature or per-team metrics that explain
+       movements in L1 drivers; maximum 3 diagnostics per driver. Metrics
+       beyond three levels of hierarchy are PROHIBITED — they indicate
+       measurement fragmentation rather than focus.
+     level: MUST
+     examples:
+       - "North Star: weekly active users completing core action"
+       - "L1 drivers: new user activation rate, 7-day retention, feature adoption breadth"
+       - "L2 diagnostic for activation: onboarding step completion rates (step 1/2/3)"
+       - "Prohibited: L4 sub-diagnostic metrics that obscure rather than explain"
+
+   - id: REQ-004
+     title: Anti-Vanity Rule
+     description: |
+       Teams MUST apply the anti-vanity test before adding any metric to the
+       official metrics dashboard. A metric fails the anti-vanity test if it
+       can increase while revenue and retention remain flat or decrease. Such
+       metrics MUST NOT appear in official product reviews or be used as
+       success criteria for features. Examples of vanity metrics that commonly
+       fail this test: total registered users (without active usage filter),
+       raw pageviews (without session quality filter), total API calls
+       (without unique active customer filter), press mentions, app store
+       downloads without activation. When a vanity metric is useful for
+       operational monitoring, it MUST be clearly labeled as "operational
+       indicator, not success metric."
+     level: MUST
+     examples:
+       - "Reject: 'Total signups this month' → replace with 'Signups who completed activation'"
+       - "Reject: 'Total pageviews' → replace with 'Sessions with ≥2 meaningful interactions'"
+       - "Allowed with label: 'Total API calls (operational indicator)' on infra dashboard"
+       - "Feature success metric: 'Users who used feature X and retained at D30' not 'feature clicks'"
+
+ anti_patterns:
+   - "Tracking vanity metrics (total signups, raw pageviews) as primary success indicators"
+   - "No North Star defined — teams optimize for different local metrics, creating misalignment"
+   - "Conflicting team metrics where one team's optimization harms another team's metric"
+   - "Metric hierarchy deeper than 3 levels, creating measurement complexity without insight"
+   - "Changing the North Star quarterly, preventing year-over-year trend analysis"
+
+ related_standards:
+   - prd-standards
+   - slo-sli
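The REQ-003 hierarchy is concrete enough to encode and lint in a product repo. A minimal TypeScript sketch — the `MetricHierarchy` shape, the metric names, and the checks in `validateHierarchy` are illustrative assumptions, not part of the standard:

```ts
// Hypothetical encoding of a REQ-003-compliant metric hierarchy.
// All names are illustrative; only the three-level shape and the
// 3–5 driver / ≤3 diagnostic limits come from the standard.
interface DriverMetric {
  name: string
  causalHypothesis: string // REQ-003: documented link to the North Star
  diagnostics: string[]    // L2 metrics; max 3 per driver
}

interface MetricHierarchy {
  northStar: string        // exactly one (REQ-002)
  drivers: DriverMetric[]  // 3–5 L1 metrics (REQ-003)
}

const hierarchy: MetricHierarchy = {
  northStar: "Weekly active users completing the core action",
  drivers: [
    {
      name: "New-user activation rate",
      causalHypothesis: "Activated users reach the core action weekly at a far higher rate",
      diagnostics: ["Onboarding step 1 completion", "Onboarding step 2 completion"],
    },
    {
      name: "7-day retention",
      causalHypothesis: "D7-retained users contribute most recurring core actions",
      diagnostics: ["D1→D7 cohort curve"],
    },
    {
      name: "Feature adoption breadth",
      causalHypothesis: "Multi-feature users churn less and complete more core actions",
      diagnostics: ["Features used per active user"],
    },
  ],
}

// Structural guard for the limits REQ-003 makes explicit.
function validateHierarchy(h: MetricHierarchy): string[] {
  const errors: string[] = []
  if (h.drivers.length < 3 || h.drivers.length > 5) errors.push("need 3–5 L1 drivers")
  for (const d of h.drivers) {
    if (!d.causalHypothesis) errors.push(`${d.name}: missing causal hypothesis`)
    if (d.diagnostics.length > 3) errors.push(`${d.name}: max 3 L2 diagnostics`)
  }
  return errors
}
```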
package/bundled/ai/standards/prompt-regression.ai.yaml
@@ -0,0 +1,94 @@
+ # SPDX-License-Identifier: MIT
+ name: Prompt Regression Standards
+ nameZh: Prompt 回歸測試標準
+ id: prompt-regression
+ version: "1.0.0"
+ category: testing
+ scope: ai-agent-systems
+ summary: >
+   Prevent unintended prompt changes from silently degrading AI agent behaviour.
+   Golden checksum tests detect modifications; snapshot diff confirms intent.
+
+ requirements:
+   - id: REQ-01
+     title: Prompt Versioning
+     titleZh: Prompt 版本化
+     level: MUST
+     description: >
+       Every AI agent MUST have a versioned prompt file (e.g. prompt.md with
+       frontmatter `version: "x.y.z"`). Changes require an explicit version bump.
+
+   - id: REQ-02
+     title: Golden Checksum Test
+     titleZh: 黃金校驗和測試
+     level: MUST
+     description: >
+       A CI-enforced checksum test MUST store the SHA-256 hash of each agent's
+       prompt file. A mismatch fails the build and requires updating the golden
+       value with a comment confirming intent ("intentional change for X").
+     implementation: |
+       const GOLDEN = { "planner": "abc123...", ... }
+       it("prompt has not changed unexpectedly", () => {
+         const actual = sha256(readFileSync("agents/planner/prompt.md"))
+         expect(actual).toBe(GOLDEN["planner"])
+       })
+
+   - id: REQ-03
+     title: Snapshot Diff
+     titleZh: 快照差異比對
+     level: SHOULD
+     description: >
+       On CI failure, the system SHOULD output a unified diff between the
+       recorded snapshot and current prompt content to aid review.
+
+   - id: REQ-04
+     title: Change Review Gate
+     titleZh: 變更審查閘門
+     level: MUST
+     description: >
+       Prompt changes MUST be reviewed as code changes. Checksum updates require
+       a comment in the test file explaining why the prompt changed (e.g.
+       "Updated system role to include new Guardian policy reference XSPEC-160").
+
+   - id: REQ-05
+     title: Coverage
+     titleZh: 覆蓋範圍
+     level: MUST
+     description: >
+       The golden checksum test MUST cover ALL production agent prompt files.
+       New agents added to the system MUST be added to the test within the same
+       PR that introduces the agent.
+
+ examples:
+   - name: "SHA-256 checksum test in Vitest"
+     code: |
+       import { createHash } from "crypto"
+       import { readFileSync } from "fs"
+       import { describe, it, expect } from "vitest"
+
+       const GOLDEN_CHECKSUMS: Record<string, string> = {
+         planner: "dd0d086d...",
+         guardian: "f56555...",
+       }
+
+       describe("Agent prompt regression", () => {
+         for (const [agent, expected] of Object.entries(GOLDEN_CHECKSUMS)) {
+           it(`agents/${agent}/prompt.md has not changed unexpectedly`, () => {
+             const content = readFileSync(`agents/${agent}/prompt.md`)
+             const actual = createHash("sha256").update(content).digest("hex")
+             expect(actual).toBe(expected)
+           })
+         }
+       })
+
+ anti_patterns:
+   - description: >
+       Skipping checksums for "stable" agents — all prompts require regression
+       coverage regardless of perceived stability.
+   - description: >
+       Updating checksums in bulk without per-agent comments explaining intent.
+
+ related_standards:
+   - llm-output-validation
+   - adversarial-test
+   - testing
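One way REQ-03's snapshot diff could sit on top of the checksum test — a sketch assuming the jsdiff (`diff`) npm package and a hypothetical `__snapshots__/` directory holding recorded prompt copies:

```ts
import { createHash } from "crypto"
import { readFileSync, existsSync } from "fs"
import { createTwoFilesPatch } from "diff" // jsdiff

// On checksum mismatch, print a unified diff between the recorded snapshot
// and the current prompt so the reviewer can confirm intent (REQ-03).
// The __snapshots__ layout is an assumption, not mandated by the standard.
function checkPrompt(agent: string, golden: string): void {
  const promptPath = `agents/${agent}/prompt.md`
  const current = readFileSync(promptPath, "utf-8")
  const actual = createHash("sha256").update(current).digest("hex")
  if (actual === golden) return

  const snapshotPath = `__snapshots__/${agent}.prompt.md`
  if (existsSync(snapshotPath)) {
    const recorded = readFileSync(snapshotPath, "utf-8")
    console.error(createTwoFilesPatch("recorded", "current", recorded, current))
  }
  throw new Error(`${promptPath} changed: expected ${golden}, got ${actual}`)
}
```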
package/bundled/ai/standards/property-based-testing.ai.yaml
@@ -0,0 +1,105 @@
+ # SPDX-License-Identifier: MIT
+ name: Property-Based Testing Standards
+ nameZh: 屬性基礎測試標準
+ id: property-based-testing
+ version: "1.0.0"
+ category: testing
+ scope: correctness-validation
+ summary: >
+   Property-based testing generates hundreds of random inputs to verify that
+   invariants (properties) hold for all valid inputs, not just the examples
+   a developer thought to write. Especially valuable for pure functions,
+   parsers, and security-critical logic.
+
+ requirements:
+   - id: REQ-01
+     title: Property Identification
+     titleZh: 屬性識別
+     level: MUST
+     description: >
+       For each module under property-based test, developers MUST explicitly
+       identify the invariants to test. Examples: idempotency, monotonicity,
+       round-trip (encode/decode), boundary clamping, determinism.
+
+   - id: REQ-02
+     title: Generator Coverage
+     titleZh: 生成器覆蓋
+     level: MUST
+     description: >
+       Property tests MUST use appropriate generators: bounded integers for
+       array indices, valid enum values for type-constrained fields,
+       arbitrary strings for fuzz targets. Generators MUST cover the full
+       valid input space, not just a curated subset.
+
+   - id: REQ-03
+     title: Shrinking
+     titleZh: 最小化反例
+     level: SHOULD
+     description: >
+       When a property test fails, the framework SHOULD shrink the
+       counterexample to its minimal form. Libraries like fast-check (JS/TS)
+       and Hypothesis (Python) do this automatically.
+
+   - id: REQ-04
+     title: Run Count
+     titleZh: 執行次數
+     level: MUST
+     description: >
+       Property tests MUST run at least 100 samples per property in CI.
+       For security-critical functions (hash chains, token validation,
+       policy evaluation), run at least 1000 samples.
+
+   - id: REQ-05
+     title: Seed Persistence
+     titleZh: 種子持久化
+     level: SHOULD
+     description: >
+       When a property test fails, the failing seed SHOULD be saved and
+       re-run in CI until the fix is confirmed. This ensures regressions
+       are caught even after the random seed changes.
+
+ examples:
+   - name: "fast-check boundary-clamping property for scoring"
+     code: |
+       import fc from "fast-check"
+       import { describe, it } from "vitest"
+       // scoreReviewable is the project's scoring function under test
+
+       describe("scoreReviewable boundary clamping", () => {
+         it("score is always in [0, 100]", () => {
+           fc.assert(
+             fc.property(
+               fc.record({
+                 target_env: fc.constantFrom("prod", "staging", "dev"),
+                 command_type: fc.constantFrom("query", "mutate", "exec", "delete"),
+                 reversible: fc.boolean(),
+               }),
+               ({ target_env, command_type, reversible }) => {
+                 const result = scoreReviewable({
+                   session_id: "prop-001",
+                   source_agent: "operator",
+                   intent: "test",
+                   plan: [{ command: "ls", command_type, reversible }],
+                   target_env,
+                   reversible,
+                 })
+                 return result.score >= 0 && result.score <= 100
+               }
+             ),
+             { numRuns: 1000 }
+           )
+         })
+       })
+
+ anti_patterns:
+   - description: >
+       Using fc.anything() for domain-specific inputs — unguided random
+       strings will mostly generate invalid inputs that get rejected early.
+       Use constrained generators (fc.constantFrom, fc.integer with bounds).
+   - description: >
+       Setting numRuns: 10 — too few samples to find edge cases. Use at
+       least 100 for regular code and 1000 for security-critical functions.
+
+ related_standards:
+   - mutation-testing
+   - testing
+   - adversarial-test
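REQ-01's round-trip invariant and REQ-05's seed persistence compose naturally in fast-check. A sketch with a hypothetical base64 codec standing in for the module under test:

```ts
import fc from "fast-check"
import { describe, it } from "vitest"

// Hypothetical codec under test — any encode/decode pair works the same way.
const encode = (s: string) => Buffer.from(s, "utf-8").toString("base64")
const decode = (b: string) => Buffer.from(b, "base64").toString("utf-8")

describe("codec round-trip", () => {
  it("decode(encode(x)) === x for all strings", () => {
    fc.assert(
      fc.property(fc.string(), (s) => decode(encode(s)) === s),
      {
        numRuns: 100, // REQ-04 minimum for non-security-critical code
        // REQ-05: once a failure is found, pin its reported seed and path
        // here until the fix is confirmed, e.g.
        // seed: 1746403200, path: "25:2:0",
      }
    )
  })
})
```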
package/bundled/ai/standards/release-quality-manifest.ai.yaml
@@ -0,0 +1,135 @@
+ # Release Quality Manifest Standards - AI Optimized
+ # Source: core/release-quality-manifest.md
+
+ id: release-quality-manifest
+ meta:
+   version: "1.0.0"
+   updated: "2026-05-05"
+   source: core/release-quality-manifest.md
+   description: Automated per-release Quality Manifest that aggregates all quality gate results into a single machine-readable artifact
+
+ requirements:
+   REQ-1:
+     id: REQ-RQM-001
+     title: Machine-Readable Format
+     rule: >
+       Every release MUST produce a Quality Manifest in YAML or JSON format that
+       aggregates the results of all defined quality gates. The manifest MUST be
+       committed to source control or attached to the release artifact.
+     rationale: >
+       Machine-readable manifests enable automated release gates and customer audits;
+       prose-only release notes cannot be parsed by downstream tooling.
+
+   REQ-2:
+     id: REQ-RQM-002
+     title: Gate Coverage
+     rule: >
+       The Quality Manifest MUST include at minimum: unit test coverage %, mutation
+       score %, SCA CVE counts (critical/high), SAST finding counts (high/medium),
+       E2E pass rate %, container CVE scan status, image signature status, SBOM
+       presence, and (if applicable) LLM hallucination rate and prompt injection
+       resistance score.
+     rationale: >
+       Partial manifests create false confidence; a complete manifest proves end-to-end
+       quality rather than cherry-picked metrics.
+
+   REQ-3:
+     id: REQ-RQM-003
+     title: Pass/Warn/Fail Status per Gate
+     rule: >
+       Each gate entry MUST carry a status field: "pass" (meets target), "warn"
+       (within acceptable deviation from target), or "fail" (blocks release).
+       The manifest MUST have an overall status field derived from the worst gate.
+     rationale: >
+       Binary pass/fail per gate plus an aggregate status enables release go/no-go
+       automation without human judgment on individual metrics.
+
+   REQ-4:
+     id: REQ-RQM-004
+     title: Automated Generation in CI
+     rule: >
+       The Quality Manifest MUST be generated automatically by CI (not manually
+       authored). Each gate's value MUST be extracted from the corresponding tool
+       output (vitest coverage JSON, stryker JSON, trivy SARIF, etc.).
+     rationale: >
+       Manually authored manifests are unreliable; CI-generated manifests are the
+       only form of evidence that meets audit requirements.
+
+   REQ-5:
+     id: REQ-RQM-005
+     title: Customer-Facing Summary
+     rule: >
+       A human-readable summary of the Quality Manifest (e.g., Markdown table)
+       MUST be generated alongside the machine-readable format and included in
+       the release notes or documentation.
+     rationale: >
+       Customers and auditors need a scannable summary; the machine-readable format
+       alone does not satisfy human review requirements.
+
+ manifest_schema:
+   release: "string — semver tag e.g. v1.2.0"
+   generated_at: "ISO 8601 timestamp"
+   commit: "git SHA"
+   gates:
+     unit_coverage:
+       actual: "percentage string e.g. '73%'"
+       target: "threshold e.g. '80%'"
+       status: "pass | warn | fail"
+     mutation_score:
+       actual: "percentage string"
+       target: "threshold"
+       status: "pass | warn | fail"
+     sca_critical_cve:
+       actual: "integer"
+       target: "0"
+       status: "pass | fail"
+     sca_high_cve:
+       actual: "integer"
+       target: "0"
+       status: "pass | warn | fail"
+     sast_high:
+       actual: "integer"
+       target: "0"
+       status: "pass | warn | fail"
+     e2e_pass_rate:
+       actual: "percentage string"
+       target: "threshold"
+       status: "pass | warn | fail"
+     container_cve_critical:
+       actual: "integer"
+       target: "0"
+       status: "pass | fail"
+     image_signed:
+       actual: "boolean"
+       target: "true"
+       status: "pass | fail"
+     sbom_present:
+       actual: "boolean"
+       target: "true"
+       status: "pass | fail"
+   overall: "PASS | WARN | FAIL"
+
+ generation_guidance: >
+   Extract coverage from vitest --coverage --reporter=json (summary.total.lines.pct).
+   Extract mutation score from stryker's mutation-testing-report.json (metrics.mutationScore).
+   Extract CVE counts from trivy JSON output (Results[].Vulnerabilities filtered by Severity).
+   Extract SAST from CodeQL SARIF (runs[].results filtered by level=error).
+   Combine into manifest YAML via a CI shell script or Node.js release script.
+
+ anti_patterns:
+   - description: >
+       Generating the manifest only after all gates have passed — the gates
+       should consume the manifest's values, not run before the manifest exists.
+   - description: >
+       Hardcoding metric values in the manifest generation script — all values
+       MUST be extracted from tool outputs to remain accurate.
+   - description: >
+       Using 'warn' status for critical security gates (sca_critical_cve,
+       container_cve_critical) — critical security gates are binary pass/fail.
+
+ related_standards:
+   - testing
+   - security-testing
+   - supply-chain-attestation
+   - verification-evidence
+   - deployment-standards
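A sketch of the REQ-RQM-004 generation step for two of the gates, assuming vitest's coverage-summary JSON and Stryker's report at common default paths (paths, targets, and warn bands are assumptions; a real script would cover every gate in the schema above):

```ts
import { readFileSync, writeFileSync } from "fs"

type Status = "pass" | "warn" | "fail"

// Derive a gate status from actual vs. target, with an optional warn band.
function gate(actual: number, target: number, warnBand = 0): Status {
  if (actual >= target) return "pass"
  if (actual >= target - warnBand) return "warn"
  return "fail"
}

// Paths and thresholds below are common defaults, not mandated by the standard.
const coverage = JSON.parse(readFileSync("coverage/coverage-summary.json", "utf-8"))
const mutation = JSON.parse(readFileSync("reports/mutation/mutation-testing-report.json", "utf-8"))
const coveragePct = coverage.total.lines.pct
const mutationScore = mutation.metrics?.mutationScore ?? 0

const gates = {
  unit_coverage: { actual: coveragePct, target: 80, status: gate(coveragePct, 80, 5) },
  mutation_score: { actual: mutationScore, target: 60, status: gate(mutationScore, 60, 5) },
}

// REQ-RQM-003: the overall status is derived from the worst individual gate.
const order: Status[] = ["pass", "warn", "fail"]
const overall = Object.values(gates)
  .map((g) => g.status)
  .reduce((worst, s) => (order.indexOf(s) > order.indexOf(worst) ? s : worst), "pass" as Status)

writeFileSync(
  "release-quality-manifest.json",
  JSON.stringify({ generated_at: new Date().toISOString(), gates, overall: overall.toUpperCase() }, null, 2)
)
```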
package/bundled/ai/standards/release-readiness-gate.ai.yaml
@@ -0,0 +1,77 @@
+ # Release Readiness Gate Standards - AI Optimized
+ # Source: core/release-readiness-gate.md
+
+ id: release-readiness-gate
+ meta:
+   version: "1.0.0"
+   updated: "2026-05-05"
+   source: core/release-readiness-gate.md
+   description: Single aggregated release gate covering 16 quality dimensions with tiered sign-off template and RQM integration
+
+ requirements:
+   REQ-1:
+     id: REQ-RRG-001
+     title: 16-Dimension Coverage
+     rule: >
+       Every production release MUST evaluate all 16 quality dimensions defined in
+       core/release-readiness-gate.md. Tier-1 dimensions block release if FAIL.
+       Tier-2 dimensions require documented rationale if WARN. Tier-3 dimensions
+       require rationale if N/A.
+     rationale: >
+       Without explicit multi-dimension coverage, teams pass individual gate checks
+       but ship with unverified quality dimensions, creating systematic blind spots.
+
+   REQ-2:
+     id: REQ-RRG-002
+     title: Release Readiness Sign-off
+     rule: >
+       A Release Readiness Sign-off document MUST be created from the template in
+       core/release-readiness-gate.md for every release tag. It MUST be stored at
+       .release-readiness/<version>.md. The Overall Decision field MUST be explicitly
+       set to GO or NO-GO by a named release owner.
+     rationale: >
+       Anonymous or implicit GO decisions remove accountability; the sign-off creates
+       a named, dated, auditable record of the go/no-go decision and its evidence.
+
+   REQ-3:
+     id: REQ-RRG-003
+     title: Tier-1 Hard Block
+     rule: >
+       ANY Tier-1 dimension at FAIL status MUST block production deployment.
+       Tier-1 dimensions are: Security (Dim 2), DB Migration (Dim 5), Operational
+       Readiness (Dim 7), Rollback/DR (Dim 13), Production Smoke (Dim 14).
+     rationale: >
+       Tier-1 dimensions represent existential risks: security vulnerabilities,
+       broken rollback, misconfigured monitoring. No business justification
+       overrides a Tier-1 FAIL.
+
+   REQ-4:
+     id: REQ-RRG-004
+     title: RQM Alignment
+     rule: >
+       The machine-readable Release Quality Manifest (release-quality-manifest.md)
+       MUST include entries for all automated dimensions (a11y_critical, contract_drift,
+       cross_flow_cuj_pass_rate, browser_tier1_pass_rate, capacity_headroom_cpu_pct,
+       smoke_pass_rate, flow_gate_report). The RQM overall field MUST be PASS or WARN
+       (never FAIL) before deployment.
+     rationale: >
+       Human sign-off and machine manifest are complementary; the manifest enables
+       automated enforcement while the sign-off provides human accountability.
+
+   REQ-5:
+     id: REQ-RRG-005
+     title: Incremental Collection
+     rule: >
+       Release Readiness Sign-off evidence MUST be collected incrementally throughout
+       the release cycle (Gate 0 at PRD, Gate 3 pre-UAT, Gate 4 post-UAT). Creating
+       the sign-off on the day of deployment is an anti-pattern.
+     rationale: >
+       Last-minute sign-offs are rubber stamps; evidence collected late cannot
+       be acted upon without delaying the release.
+
+ quick_reference:
+   tier_1_dimensions: "Security, DB Migration, Operational Readiness, Rollback/DR, Production Smoke"
+   tier_2_dimensions: "Performance, a11y, Cross-flow Regression, i18n, Docs, Feature Flags, Multi-Gate Flow"
+   tier_3_dimensions: "Contract Testing, Browser Compat, Capacity, Compliance/Privacy"
+   sign_off_location: ".release-readiness/<version>.md"
+   rqm_integration: "flow_gate_report.json → release-quality-manifest.yaml field flow_gate_report"
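A minimal CI guard for REQ-RRG-002 and REQ-RRG-004 — a sketch assuming a JSON variant of the manifest and a `RELEASE_TAG` environment variable, neither of which is mandated by the standard:

```ts
import { existsSync, readFileSync } from "fs"

// Block deployment unless (1) a named sign-off exists for this release tag
// (REQ-RRG-002) and (2) the RQM overall status is PASS or WARN (REQ-RRG-004).
const version = process.env.RELEASE_TAG ?? "v0.0.0" // assumption: tag passed via env
const signOffPath = `.release-readiness/${version}.md`

if (!existsSync(signOffPath)) {
  console.error(`Release blocked: missing sign-off at ${signOffPath}`)
  process.exit(1)
}

const manifest = JSON.parse(readFileSync("release-quality-manifest.json", "utf-8"))
if (manifest.overall === "FAIL") {
  console.error("Release blocked: Quality Manifest overall status is FAIL")
  process.exit(1)
}
console.log(`RQM overall ${manifest.overall}; sign-off present — gate satisfied`)
```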
package/bundled/ai/standards/replay-test.ai.yaml
@@ -0,0 +1,111 @@
+ # SPDX-License-Identifier: MIT
+ name: Replay Test Standards
+ nameZh: 回放測試標準
+ id: replay-test
+ version: "1.0.0"
+ category: testing
+ scope: ai-agent-systems
+ summary: >
+   Golden fixture recording and deterministic replay for AI agent pipelines.
+   Enables customer bug reproduction, verdict regression detection, and
+   on-site incident investigation without requiring a live LLM.
+
+ requirements:
+   - id: REQ-01
+     title: Golden Fixture Format
+     titleZh: 黃金 fixture 格式
+     level: MUST
+     description: >
+       Each replay fixture MUST be a JSON file containing: (1) the exact input
+       that triggered the behaviour, (2) the expected output (decision/verdict),
+       (3) metadata (date recorded, source — customer report / CI regression /
+       incident, description). Fixtures MUST be deterministic (same input always
+       produces same output for pure-function components).
+
+   - id: REQ-02
+     title: Replay Test Suite
+     titleZh: 回放測試套件
+     level: MUST
+     description: >
+       A dedicated replay test file MUST load each fixture and assert that
+       re-running the component under test produces the recorded expected output.
+       For AI components with LLM dependencies, replay MUST mock the LLM layer
+       and test only the deterministic logic (scoring, routing, policy evaluation).
+
+   - id: REQ-03
+     title: Bug Regression Capture
+     titleZh: Bug 回歸捕捉
+     level: MUST
+     description: >
+       When a production bug is reported, a fixture MUST be created from the
+       failing input within the same PR that fixes the bug. The fixture prevents
+       the bug from being reintroduced silently.
+
+   - id: REQ-04
+     title: Fixture Coverage
+     titleZh: Fixture 覆蓋
+     level: SHOULD
+     description: >
+       The fixture set SHOULD include at least one representative for each
+       decision outcome (e.g. ALLOW / REQUIRE_HITL / DENY for Guardian).
+       Edge cases reported by customers or from red-team exercises SHOULD be
+       added as separate fixtures.
+
+   - id: REQ-05
+     title: Fixture Naming Convention
+     titleZh: Fixture 命名規範
+     level: MUST
+     description: >
+       Fixture files MUST follow the pattern
+       `<component>-<outcome>-<short-description>.json`,
+       e.g. `guardian-deny-prod-drop-table.json`,
+       `guardian-allow-dev-npm-install.json`.
+
+ examples:
+   - name: "Guardian replay fixture file"
+     code: |
+       {
+         "meta": {
+           "recorded": "2026-05-05",
+           "source": "red-team-exercise",
+           "description": "DROP TABLE in prod should DENY"
+         },
+         "input": {
+           "session_id": "replay-001",
+           "source_agent": "operator",
+           "intent": "Clean up test data",
+           "plan": [{"command": "DROP TABLE users;", "command_type": "mutate", "target_resource": "db_schema", "reversible": false}],
+           "target_env": "prod",
+           "reversible": false
+         },
+         "expected": {
+           "decision": "DENY"
+         }
+       }
+
+   - name: "Replay test loading fixtures"
+     code: |
+       import { readdirSync, readFileSync } from 'fs'
+       import { join } from 'path'
+       import { describe, it, expect } from 'vitest'
+       // scoreReviewable and deriveDecision are the project functions under test
+
+       const FIXTURE_DIR = 'src/guardian/__fixtures__'
+       const fixtures = readdirSync(FIXTURE_DIR)
+         .filter(f => f.endsWith('.json'))
+         .map(f => JSON.parse(readFileSync(join(FIXTURE_DIR, f), 'utf-8')))
+
+       describe('Guardian fixture replay', () => {
+         for (const { meta, input, expected } of fixtures) {
+           it(meta.description, () => {
+             const result = scoreReviewable(input)
+             const decision = deriveDecision(result.score)
+             expect(decision).toBe(expected.decision)
+           })
+         }
+       })
+
+ anti_patterns:
+   - description: >
+       Fixtures without metadata fields — without source and date, it's
+       impossible to know why a fixture exists or when it was added.
+   - description: >
+       Creating fixtures only for the happy path — the most valuable fixtures
+       are customer-reported failures and red-team findings.
+
+ related_standards:
+   - adversarial-test
+   - testing
+   - verification-evidence
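A hypothetical capture helper showing how a failing production input could be persisted under the REQ-05 naming convention within the bug-fix PR (REQ-03); the helper name and directory are assumptions, not part of the standard:

```ts
import { writeFileSync } from "fs"
import { join } from "path"

// Metadata per REQ-01: date, source, and description are mandatory.
interface FixtureMeta {
  recorded: string // ISO date
  source: "customer-report" | "ci-regression" | "incident" | "red-team-exercise"
  description: string
}

// Writes <component>-<outcome>-<slug>.json (REQ-05) and returns the path.
function recordFixture(
  dir: string,
  component: string,
  outcome: string,
  slug: string,
  input: unknown,
  expected: unknown,
  meta: FixtureMeta
): string {
  const file = join(dir, `${component}-${outcome.toLowerCase()}-${slug}.json`)
  writeFileSync(file, JSON.stringify({ meta, input, expected }, null, 2))
  return file
}

// Usage, mirroring the fixture example above:
recordFixture(
  "src/guardian/__fixtures__",
  "guardian",
  "DENY",
  "prod-drop-table",
  { target_env: "prod" /* full Reviewable input elided */ },
  { decision: "DENY" },
  { recorded: "2026-05-05", source: "red-team-exercise", description: "DROP TABLE in prod should DENY" }
)
```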