universal-dev-standards 5.4.0 → 5.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled/ai/options/testing/integration-testing.ai.yaml +2 -2
- package/bundled/ai/options/testing/unit-testing.ai.yaml +2 -2
- package/bundled/ai/standards/adversarial-test.ai.yaml +277 -0
- package/bundled/ai/standards/audit-trail.ai.yaml +113 -0
- package/bundled/ai/standards/browser-compatibility-standards.ai.yaml +63 -0
- package/bundled/ai/standards/chaos-injection-tests.ai.yaml +91 -0
- package/bundled/ai/standards/container-image-standards.ai.yaml +88 -0
- package/bundled/ai/standards/container-security.ai.yaml +331 -0
- package/bundled/ai/standards/contract-testing-standards.ai.yaml +62 -0
- package/bundled/ai/standards/cost-budget-test.ai.yaml +96 -0
- package/bundled/ai/standards/cross-flow-regression.ai.yaml +61 -0
- package/bundled/ai/standards/data-contract.ai.yaml +110 -0
- package/bundled/ai/standards/data-migration-testing.ai.yaml +96 -0
- package/bundled/ai/standards/data-pipeline.ai.yaml +113 -0
- package/bundled/ai/standards/disaster-recovery-drill.ai.yaml +89 -0
- package/bundled/ai/standards/flaky-test-management.ai.yaml +89 -0
- package/bundled/ai/standards/flow-based-testing.ai.yaml +240 -0
- package/bundled/ai/standards/full-coverage-testing.ai.yaml +192 -0
- package/bundled/ai/standards/iac-design-principles.ai.yaml +83 -0
- package/bundled/ai/standards/incident-response.ai.yaml +107 -0
- package/bundled/ai/standards/license-compliance.ai.yaml +106 -0
- package/bundled/ai/standards/llm-output-validation.ai.yaml +269 -0
- package/bundled/ai/standards/mock-boundary.ai.yaml +250 -0
- package/bundled/ai/standards/mutation-testing.ai.yaml +192 -0
- package/bundled/ai/standards/pii-classification.ai.yaml +109 -0
- package/bundled/ai/standards/policy-as-code-testing.ai.yaml +227 -0
- package/bundled/ai/standards/prd-standards.ai.yaml +88 -0
- package/bundled/ai/standards/product-metrics-standards.ai.yaml +111 -0
- package/bundled/ai/standards/prompt-regression.ai.yaml +94 -0
- package/bundled/ai/standards/property-based-testing.ai.yaml +105 -0
- package/bundled/ai/standards/release-quality-manifest.ai.yaml +135 -0
- package/bundled/ai/standards/release-readiness-gate.ai.yaml +77 -0
- package/bundled/ai/standards/replay-test.ai.yaml +111 -0
- package/bundled/ai/standards/runbook.ai.yaml +104 -0
- package/bundled/ai/standards/sast-advanced.ai.yaml +135 -0
- package/bundled/ai/standards/schema-evolution.ai.yaml +111 -0
- package/bundled/ai/standards/secret-management-standards.ai.yaml +105 -0
- package/bundled/ai/standards/secure-op.ai.yaml +365 -0
- package/bundled/ai/standards/security-testing.ai.yaml +171 -0
- package/bundled/ai/standards/server-ops-security.ai.yaml +274 -0
- package/bundled/ai/standards/slo-sli.ai.yaml +97 -0
- package/bundled/ai/standards/smoke-test.ai.yaml +87 -0
- package/bundled/ai/standards/supply-chain-attestation.ai.yaml +109 -0
- package/bundled/ai/standards/test-completeness-dimensions.ai.yaml +52 -5
- package/bundled/ai/standards/testing.ai.yaml +20 -13
- package/bundled/ai/standards/user-story-mapping.ai.yaml +108 -0
- package/bundled/core/accessibility-standards.md +58 -0
- package/bundled/core/adversarial-test.md +212 -0
- package/bundled/core/branch-completion.md +4 -0
- package/bundled/core/browser-compatibility-standards.md +220 -0
- package/bundled/core/chaos-injection-tests.md +116 -0
- package/bundled/core/checkin-standards.md +1 -0
- package/bundled/core/container-security.md +521 -0
- package/bundled/core/contract-testing-standards.md +182 -0
- package/bundled/core/cost-budget-test.md +69 -0
- package/bundled/core/cross-flow-regression.md +190 -0
- package/bundled/core/data-migration-testing.md +110 -0
- package/bundled/core/disaster-recovery-drill.md +73 -0
- package/bundled/core/flaky-test-management.md +73 -0
- package/bundled/core/flow-based-testing.md +275 -0
- package/bundled/core/full-coverage-testing.md +183 -0
- package/bundled/core/llm-output-validation.md +178 -0
- package/bundled/core/mock-boundary.md +100 -0
- package/bundled/core/mutation-testing.md +97 -0
- package/bundled/core/performance-standards.md +65 -0
- package/bundled/core/policy-as-code-testing.md +188 -0
- package/bundled/core/prompt-regression.md +72 -0
- package/bundled/core/property-based-testing.md +73 -0
- package/bundled/core/release-quality-manifest.md +193 -0
- package/bundled/core/release-readiness-gate.md +184 -0
- package/bundled/core/replay-test.md +86 -0
- package/bundled/core/sast-advanced.md +300 -0
- package/bundled/core/secure-op.md +314 -0
- package/bundled/core/security-testing.md +87 -0
- package/bundled/core/server-ops-security.md +493 -0
- package/bundled/core/smoke-test.md +65 -0
- package/bundled/core/supply-chain-attestation.md +117 -0
- package/bundled/locales/zh-CN/CHANGELOG.md +3 -3
- package/bundled/locales/zh-CN/README.md +1 -1
- package/bundled/locales/zh-CN/skills/ai-instruction-standards/SKILL.md +5 -5
- package/bundled/locales/zh-TW/CHANGELOG.md +3 -3
- package/bundled/locales/zh-TW/README.md +1 -1
- package/bundled/locales/zh-TW/core/browser-compatibility-standards.md +11 -0
- package/bundled/locales/zh-TW/core/contract-testing-standards.md +11 -0
- package/bundled/locales/zh-TW/core/cross-flow-regression.md +11 -0
- package/bundled/locales/zh-TW/core/release-readiness-gate.md +11 -0
- package/bundled/locales/zh-TW/skills/ai-instruction-standards/SKILL.md +183 -79
- package/bundled/skills/README.md +4 -3
- package/bundled/skills/SKILL_NAMING.md +94 -0
- package/bundled/skills/ai-instruction-standards/SKILL.md +181 -88
- package/bundled/skills/atdd-assistant/SKILL.md +8 -0
- package/bundled/skills/bdd-assistant/SKILL.md +7 -0
- package/bundled/skills/checkin-assistant/SKILL.md +8 -0
- package/bundled/skills/code-review-assistant/SKILL.md +7 -0
- package/bundled/skills/journey-test-assistant/SKILL.md +203 -0
- package/bundled/skills/orchestrate/SKILL.md +167 -0
- package/bundled/skills/plan/SKILL.md +234 -0
- package/bundled/skills/pr-automation-assistant/SKILL.md +8 -0
- package/bundled/skills/push/SKILL.md +49 -2
- package/bundled/skills/{process-automation → skill-builder}/SKILL.md +1 -1
- package/bundled/skills/{forward-derivation → spec-derivation}/SKILL.md +1 -1
- package/bundled/skills/spec-driven-dev/SKILL.md +7 -0
- package/bundled/skills/sweep/SKILL.md +145 -0
- package/bundled/skills/tdd-assistant/SKILL.md +7 -0
- package/package.json +6 -6
- package/src/commands/check.js +43 -0
- package/src/commands/flow.js +8 -0
- package/src/commands/init.js +2 -1
- package/src/commands/start.js +14 -0
- package/src/commands/sweep.js +8 -0
- package/src/commands/update.js +10 -0
- package/src/commands/workflow.js +8 -0
- package/standards-registry.json +483 -5
- package/bundled/locales/zh-CN/skills/ac-coverage-assistant/SKILL.md +0 -190
- package/bundled/locales/zh-CN/skills/forward-derivation/SKILL.md +0 -71
- package/bundled/locales/zh-CN/skills/forward-derivation/guide.md +0 -130
- package/bundled/locales/zh-CN/skills/methodology-system/SKILL.md +0 -88
- package/bundled/locales/zh-CN/skills/methodology-system/create-methodology.md +0 -350
- package/bundled/locales/zh-CN/skills/methodology-system/guide.md +0 -131
- package/bundled/locales/zh-CN/skills/methodology-system/runtime.md +0 -279
- package/bundled/locales/zh-CN/skills/process-automation/SKILL.md +0 -143
- package/bundled/locales/zh-TW/skills/ac-coverage-assistant/SKILL.md +0 -195
- package/bundled/locales/zh-TW/skills/deploy-assistant/SKILL.md +0 -178
- package/bundled/locales/zh-TW/skills/forward-derivation/SKILL.md +0 -69
- package/bundled/locales/zh-TW/skills/forward-derivation/guide.md +0 -415
- package/bundled/locales/zh-TW/skills/methodology-system/SKILL.md +0 -86
- package/bundled/locales/zh-TW/skills/methodology-system/create-methodology.md +0 -350
- package/bundled/locales/zh-TW/skills/methodology-system/guide.md +0 -131
- package/bundled/locales/zh-TW/skills/methodology-system/runtime.md +0 -279
- package/bundled/locales/zh-TW/skills/process-automation/SKILL.md +0 -144
- /package/bundled/skills/{ac-coverage-assistant → ac-coverage}/SKILL.md +0 -0
- /package/bundled/skills/{methodology-system → dev-methodology}/SKILL.md +0 -0
- /package/bundled/skills/{methodology-system → dev-methodology}/create-methodology.md +0 -0
- /package/bundled/skills/{methodology-system → dev-methodology}/guide.md +0 -0
- /package/bundled/skills/{methodology-system → dev-methodology}/integrated-flow.md +0 -0
- /package/bundled/skills/{methodology-system → dev-methodology}/prerequisite-check.md +0 -0
- /package/bundled/skills/{methodology-system → dev-methodology}/runtime.md +0 -0
- /package/bundled/skills/{forward-derivation → spec-derivation}/guide.md +0 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# Product Metrics Standards - AI Optimized
|
|
2
|
+
# Source: XSPEC-069 Wave 4 Product Layer Pack
|
|
3
|
+
|
|
4
|
+
id: product-metrics-standards
|
|
5
|
+
title: Product Metrics Framework Standards
|
|
6
|
+
version: "1.0.0"
|
|
7
|
+
status: Active
|
|
8
|
+
tags: [product, metrics, kpi, aarrr, heart, north-star, analytics]
|
|
9
|
+
summary: |
|
|
10
|
+
Defines how teams select, structure, and govern product metrics. Covers
|
|
11
|
+
a framework selection matrix (AARRR for growth products, HEART for
|
|
12
|
+
experience products, custom North Star for platforms), North Star metric
|
|
13
|
+
criteria, a three-level metric hierarchy (North Star → L1 drivers →
|
|
14
|
+
L2 diagnostics), and an anti-vanity rule that rejects metrics decoupled
|
|
15
|
+
from revenue or retention impact. Designed to align teams around metrics
|
|
16
|
+
that drive meaningful product decisions rather than activity tracking.
|
|
17
|
+
|
|
18
|
+
requirements:
|
|
19
|
+
- id: REQ-001
|
|
20
|
+
title: Framework Selection Matrix
|
|
21
|
+
description: |
|
|
22
|
+
Teams MUST select a primary metrics framework appropriate to their
|
|
23
|
+
product type. Selection criteria: (1) Growth products (consumer apps,
|
|
24
|
+
marketplaces, viral products focused on user acquisition and monetization)
|
|
25
|
+
→ use AARRR framework (Acquisition, Activation, Retention, Referral,
|
|
26
|
+
Revenue). (2) Experience products (productivity tools, B2B SaaS, apps
|
|
27
|
+
where user satisfaction and task completion drive retention) → use
|
|
28
|
+
HEART framework (Happiness, Engagement, Adoption, Retention, Task
|
|
29
|
+
Success). (3) Platform products (developer platforms, APIs, infrastructure
|
|
30
|
+
products with diverse use cases) → define a custom North Star metric
|
|
31
|
+
that reflects platform value delivered, supplemented by AARRR or HEART
|
|
32
|
+
components as applicable. Framework selection MUST be documented in the
|
|
33
|
+
PRD or product strategy document.
|
|
34
|
+
level: MUST
|
|
35
|
+
examples:
|
|
36
|
+
- "Consumer social app → AARRR; primary focus on D7 retention and referral coefficient"
|
|
37
|
+
- "B2B project management tool → HEART; primary focus on Task Success and Retention"
|
|
38
|
+
- "Developer API platform → custom North Star: 'API calls per active developer per week'"
|
|
39
|
+
- "Framework documented in product-metrics.md: 'We use HEART because...'"
|
|
40
|
+
|
|
41
|
+
- id: REQ-002
|
|
42
|
+
title: North Star Criteria
|
|
43
|
+
description: |
|
|
44
|
+
Every product MUST define exactly one North Star metric that satisfies
|
|
45
|
+
all four criteria: (1) Leading indicator — it predicts future business
|
|
46
|
+
health (revenue, retention) rather than measuring past outcomes.
|
|
47
|
+
(2) Measurable and trackable — it can be calculated from existing or
|
|
48
|
+
easily obtainable data with a defined measurement cadence (weekly
|
|
49
|
+
or monthly). (3) Actionable by the team — the product team has direct
|
|
50
|
+
levers to influence it through feature development and UX decisions.
|
|
51
|
+
(4) Explainable in one sentence — any team member can describe what
|
|
52
|
+
it measures and why it matters without needing additional context.
|
|
53
|
+
North Star MUST be reviewed and reconfirmed at each annual product
|
|
54
|
+
planning cycle.
|
|
55
|
+
level: MUST
|
|
56
|
+
examples:
|
|
57
|
+
- "Spotify: 'Time spent listening per user per week' (leading, measurable, actionable)"
|
|
58
|
+
- "Airbnb: 'Nights booked per month' (explainable, predicts revenue)"
|
|
59
|
+
- "Weak North Star: 'Total revenue' (lagging, not directly actionable by product team)"
|
|
60
|
+
- "Annual review: North Star unchanged but L1 driver metrics updated for new product area"
|
|
61
|
+
|
|
62
|
+
- id: REQ-003
|
|
63
|
+
title: Metric Hierarchy
|
|
64
|
+
description: |
|
|
65
|
+
Teams MUST structure metrics in a hierarchy with a maximum
|
|
66
|
+
of three levels. Level 1 (North Star): one metric representing overall
|
|
67
|
+
product value delivered. Level 2 (L1 Driver Metrics): 3–5 metrics that
|
|
68
|
+
directly influence the North Star; each driver metric must have a
|
|
69
|
+
documented causal hypothesis linking it to the North Star. Level 3
|
|
70
|
+
(L2 Diagnostic Metrics): per-feature or per-team metrics that explain
|
|
71
|
+
movements in L1 drivers; maximum 3 diagnostics per driver. Metrics
|
|
72
|
+
beyond three levels of hierarchy are PROHIBITED — they indicate
|
|
73
|
+
measurement fragmentation rather than focus.
|
|
74
|
+
level: MUST
|
|
75
|
+
examples:
|
|
76
|
+
- "North Star: weekly active users completing core action"
|
|
77
|
+
- "L1 drivers: new user activation rate, 7-day retention, feature adoption breadth"
|
|
78
|
+
- "L2 diagnostic for activation: onboarding step completion rates (step 1/2/3)"
|
|
79
|
+
- "Prohibited: a fourth level of sub-diagnostic metrics that obscure rather than explain"
|
|
80
|
+
|
|
81
|
+
- id: REQ-004
|
|
82
|
+
title: Anti-Vanity Rule
|
|
83
|
+
description: |
|
|
84
|
+
Teams MUST apply the anti-vanity test before adding any metric to the
|
|
85
|
+
official metrics dashboard. A metric fails the anti-vanity test if it
|
|
86
|
+
can increase while revenue and retention remain flat or decrease. Such
|
|
87
|
+
metrics MUST NOT appear in official product reviews or be used as
|
|
88
|
+
success criteria for features. Examples of vanity metrics that commonly
|
|
89
|
+
fail this test: total registered users (without active usage filter),
|
|
90
|
+
raw pageviews (without session quality filter), total API calls
|
|
91
|
+
(without unique active customer filter), press mentions, app store
|
|
92
|
+
downloads without activation. When a vanity metric is useful for
|
|
93
|
+
operational monitoring, it MUST be clearly labeled as "operational
|
|
94
|
+
indicator, not success metric."
|
|
95
|
+
level: MUST
|
|
96
|
+
examples:
|
|
97
|
+
- "Reject: 'Total signups this month' → replace with 'Signups who completed activation'"
|
|
98
|
+
- "Reject: 'Total pageviews' → replace with 'Sessions with ≥2 meaningful interactions'"
|
|
99
|
+
- "Allowed with label: 'Total API calls (operational indicator)' on infra dashboard"
|
|
100
|
+
- "Feature success metric: 'Users who used feature X and retained at D30' not 'feature clicks'"
|
|
101
|
+
|
|
102
|
+
anti_patterns:
|
|
103
|
+
- "Tracking vanity metrics (total signups, raw pageviews) as primary success indicators"
|
|
104
|
+
- "No North Star defined — teams optimize for different local metrics, creating misalignment"
|
|
105
|
+
- "Conflicting team metrics where one team's optimization harms another team's metric"
|
|
106
|
+
- "Metric hierarchy deeper than 3 levels, creating measurement complexity without insight"
|
|
107
|
+
- "Changing the North Star quarterly, preventing year-over-year trend analysis"
|
|
108
|
+
|
|
109
|
+
related_standards:
|
|
110
|
+
- prd-standards
|
|
111
|
+
- slo-sli
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
name: Prompt Regression Standards
|
|
3
|
+
nameZh: Prompt 回歸測試標準
|
|
4
|
+
id: prompt-regression
|
|
5
|
+
version: "1.0.0"
|
|
6
|
+
category: testing
|
|
7
|
+
scope: ai-agent-systems
|
|
8
|
+
summary: >
|
|
9
|
+
Prevent unintended prompt changes from silently degrading AI agent behaviour.
|
|
10
|
+
Golden checksum tests detect modifications; snapshot diff confirms intent.
|
|
11
|
+
|
|
12
|
+
requirements:
|
|
13
|
+
- id: REQ-01
|
|
14
|
+
title: Prompt Versioning
|
|
15
|
+
titleZh: Prompt 版本化
|
|
16
|
+
level: MUST
|
|
17
|
+
description: >
|
|
18
|
+
Every AI agent MUST have a versioned prompt file (e.g. prompt.md with
|
|
19
|
+
frontmatter `version: "x.y.z"`). Changes require explicit version bump.
|
|
20
|
+
|
|
21
|
+
- id: REQ-02
|
|
22
|
+
title: Golden Checksum Test
|
|
23
|
+
titleZh: 黃金校驗和測試
|
|
24
|
+
level: MUST
|
|
25
|
+
description: >
|
|
26
|
+
A CI-enforced checksum test MUST store the SHA-256 hash of each agent's
|
|
27
|
+
prompt file. A mismatch fails the build and requires updating the golden
|
|
28
|
+
value with a comment confirming intent ("intentional change for X").
|
|
29
|
+
implementation: |
|
|
30
|
+
const GOLDEN = { "planner": "abc123...", ... }
|
|
31
|
+
it("prompt has not changed unexpectedly", () => {
|
|
32
|
+
const actual = sha256(readFileSync("agents/planner/prompt.md"))
|
|
33
|
+
expect(actual).toBe(GOLDEN["planner"])
|
|
34
|
+
})
|
|
35
|
+
|
|
36
|
+
- id: REQ-03
|
|
37
|
+
title: Snapshot Diff
|
|
38
|
+
titleZh: 快照差異比對
|
|
39
|
+
level: SHOULD
|
|
40
|
+
description: >
|
|
41
|
+
On CI failure, the system SHOULD output a unified diff between the
|
|
42
|
+
recorded snapshot and current prompt content to aid review.
|
|
43
|
+
|
|
44
|
+
- id: REQ-04
|
|
45
|
+
title: Change Review Gate
|
|
46
|
+
titleZh: 變更審查閘門
|
|
47
|
+
level: MUST
|
|
48
|
+
description: >
|
|
49
|
+
Prompt changes MUST be reviewed as code changes. Checksum updates require
|
|
50
|
+
a comment in the test file explaining why the prompt changed (e.g.
|
|
51
|
+
"Updated system role to include new Guardian policy reference XSPEC-160").
|
|
52
|
+
|
|
53
|
+
- id: REQ-05
|
|
54
|
+
title: Coverage
|
|
55
|
+
titleZh: 覆蓋範圍
|
|
56
|
+
level: MUST
|
|
57
|
+
description: >
|
|
58
|
+
The golden checksum test MUST cover ALL production agent prompt files.
|
|
59
|
+
New agents added to the system MUST be added to the test within the same
|
|
60
|
+
PR that introduces the agent.
|
|
61
|
+
|
|
62
|
+
examples:
|
|
63
|
+
- name: "SHA-256 checksum test in Vitest"
|
|
64
|
+
code: |
|
|
65
|
+
import { createHash } from "crypto"
|
|
66
|
+
import { readFileSync } from "fs"
|
|
67
|
+
import { describe, it, expect } from "vitest"
|
|
68
|
+
|
|
69
|
+
const GOLDEN_CHECKSUMS: Record<string, string> = {
|
|
70
|
+
planner: "dd0d086d...",
|
|
71
|
+
guardian: "f56555...",
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
describe("Agent prompt regression", () => {
|
|
75
|
+
for (const [agent, expected] of Object.entries(GOLDEN_CHECKSUMS)) {
|
|
76
|
+
it(`agents/${agent}/prompt.md has not changed unexpectedly`, () => {
|
|
77
|
+
const content = readFileSync(`agents/${agent}/prompt.md`)
|
|
78
|
+
const actual = createHash("sha256").update(content).digest("hex")
|
|
79
|
+
expect(actual).toBe(expected)
|
|
80
|
+
})
|
|
81
|
+
}
|
|
82
|
+
})
|
|
83
|
+
|
|
84
|
+
anti_patterns:
|
|
85
|
+
- description: >
|
|
86
|
+
Skipping checksums for "stable" agents — all prompts require regression
|
|
87
|
+
coverage regardless of perceived stability.
|
|
88
|
+
- description: >
|
|
89
|
+
Updating checksums in bulk without per-agent comments explaining intent.
|
|
90
|
+
|
|
91
|
+
related_standards:
|
|
92
|
+
- llm-output-validation
|
|
93
|
+
- adversarial-test
|
|
94
|
+
- testing
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
name: Property-Based Testing Standards
|
|
3
|
+
nameZh: 屬性基礎測試標準
|
|
4
|
+
id: property-based-testing
|
|
5
|
+
version: "1.0.0"
|
|
6
|
+
category: testing
|
|
7
|
+
scope: correctness-validation
|
|
8
|
+
summary: >
|
|
9
|
+
Property-based testing generates hundreds of random inputs to verify that
|
|
10
|
+
invariants (properties) hold for all valid inputs, not just the examples
|
|
11
|
+
a developer thought to write. Especially valuable for pure functions,
|
|
12
|
+
parsers, and security-critical logic.
|
|
13
|
+
|
|
14
|
+
requirements:
|
|
15
|
+
- id: REQ-01
|
|
16
|
+
title: Property Identification
|
|
17
|
+
titleZh: 屬性識別
|
|
18
|
+
level: MUST
|
|
19
|
+
description: >
|
|
20
|
+
For each module under property-based test, developers MUST explicitly
|
|
21
|
+
identify the invariants to test. Examples: idempotency, monotonicity,
|
|
22
|
+
round-trip (encode/decode), boundary clamping, determinism.
|
|
23
|
+
|
|
24
|
+
- id: REQ-02
|
|
25
|
+
title: Generator Coverage
|
|
26
|
+
titleZh: 生成器覆蓋
|
|
27
|
+
level: MUST
|
|
28
|
+
description: >
|
|
29
|
+
Property tests MUST use appropriate generators: bounded integers for
|
|
30
|
+
array indices, valid enum values for type-constrained fields,
|
|
31
|
+
arbitrary strings for fuzz targets. Generators MUST cover the full
|
|
32
|
+
valid input space, not just a curated subset.
|
|
33
|
+
|
|
34
|
+
- id: REQ-03
|
|
35
|
+
title: Shrinking
|
|
36
|
+
titleZh: 最小化反例
|
|
37
|
+
level: SHOULD
|
|
38
|
+
description: >
|
|
39
|
+
When a property test fails, the framework SHOULD shrink the
|
|
40
|
+
counterexample to its minimal form. Libraries like fast-check (JS/TS)
|
|
41
|
+
and Hypothesis (Python) do this automatically.
|
|
42
|
+
|
|
43
|
+
- id: REQ-04
|
|
44
|
+
title: Run Count
|
|
45
|
+
titleZh: 執行次數
|
|
46
|
+
level: MUST
|
|
47
|
+
description: >
|
|
48
|
+
Property tests MUST run at least 100 samples per property in CI.
|
|
49
|
+
For security-critical functions (hash chains, token validation,
|
|
50
|
+
policy evaluation), run at least 1000 samples.
|
|
51
|
+
|
|
52
|
+
- id: REQ-05
|
|
53
|
+
title: Seed Persistence
|
|
54
|
+
titleZh: 種子持久化
|
|
55
|
+
level: SHOULD
|
|
56
|
+
description: >
|
|
57
|
+
When a property test fails, the failing seed SHOULD be saved and
|
|
58
|
+
re-run in CI until the fix is confirmed. This ensures regressions
|
|
59
|
+
are caught even after the random seed changes.
|
|
60
|
+
|
|
61
|
+
examples:
|
|
62
|
+
- name: "fast-check range-bound property for score clamping"
|
|
63
|
+
code: |
|
|
64
|
+
import fc from "fast-check"
|
|
65
|
+
import { describe, it } from "vitest"
|
|
66
|
+
|
|
67
|
+
describe("scoreReviewable idempotency", () => {
|
|
68
|
+
it("score is always in [0, 100]", () => {
|
|
69
|
+
fc.assert(
|
|
70
|
+
fc.property(
|
|
71
|
+
fc.record({
|
|
72
|
+
target_env: fc.constantFrom("prod", "staging", "dev"),
|
|
73
|
+
command_type: fc.constantFrom("query", "mutate", "exec", "delete"),
|
|
74
|
+
reversible: fc.boolean(),
|
|
75
|
+
}),
|
|
76
|
+
({ target_env, command_type, reversible }) => {
|
|
77
|
+
const result = scoreReviewable({
|
|
78
|
+
session_id: "prop-001",
|
|
79
|
+
source_agent: "operator",
|
|
80
|
+
intent: "test",
|
|
81
|
+
plan: [{ command: "ls", command_type, reversible }],
|
|
82
|
+
target_env,
|
|
83
|
+
reversible,
|
|
84
|
+
})
|
|
85
|
+
return result.score >= 0 && result.score <= 100
|
|
86
|
+
}
|
|
87
|
+
),
|
|
88
|
+
{ numRuns: 1000 }
|
|
89
|
+
)
|
|
90
|
+
})
|
|
91
|
+
})
|
|
92
|
+
|
|
93
|
+
anti_patterns:
|
|
94
|
+
- description: >
|
|
95
|
+
Using fc.anything() for domain-specific inputs — unguided random
|
|
96
|
+
values will mostly be invalid inputs that get rejected early.
|
|
97
|
+
Use constrained generators (fc.constantFrom, fc.integer with bounds).
|
|
98
|
+
- description: >
|
|
99
|
+
Setting numRuns: 10 — too few samples to find edge cases. Use at
|
|
100
|
+
least 100 for regular code and 1000 for security-critical functions.
|
|
101
|
+
|
|
102
|
+
related_standards:
|
|
103
|
+
- mutation-testing
|
|
104
|
+
- testing
|
|
105
|
+
- adversarial-test
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Release Quality Manifest Standards - AI Optimized
|
|
2
|
+
# Source: core/release-quality-manifest.md
|
|
3
|
+
|
|
4
|
+
id: release-quality-manifest
|
|
5
|
+
meta:
|
|
6
|
+
version: "1.0.0"
|
|
7
|
+
updated: "2026-05-05"
|
|
8
|
+
source: core/release-quality-manifest.md
|
|
9
|
+
description: Automated per-release Quality Manifest that aggregates all quality gate results into a single machine-readable artifact
|
|
10
|
+
|
|
11
|
+
requirements:
|
|
12
|
+
REQ-1:
|
|
13
|
+
id: REQ-RQM-001
|
|
14
|
+
title: Machine-Readable Format
|
|
15
|
+
rule: >
|
|
16
|
+
Every release MUST produce a Quality Manifest in YAML or JSON format that
|
|
17
|
+
aggregates the results of all defined quality gates. The manifest MUST be
|
|
18
|
+
committed to source control or attached to the release artifact.
|
|
19
|
+
rationale: >
|
|
20
|
+
Machine-readable manifests enable automated release gates and customer audits;
|
|
21
|
+
prose-only release notes cannot be parsed by downstream tooling.
|
|
22
|
+
|
|
23
|
+
REQ-2:
|
|
24
|
+
id: REQ-RQM-002
|
|
25
|
+
title: Gate Coverage
|
|
26
|
+
rule: >
|
|
27
|
+
The Quality Manifest MUST include at minimum: unit test coverage %, mutation
|
|
28
|
+
score %, SCA CVE counts (critical/high), SAST finding counts (high/medium),
|
|
29
|
+
E2E pass rate %, container CVE scan status, image signature status, SBOM
|
|
30
|
+
presence, and (if applicable) LLM hallucination rate and prompt injection
|
|
31
|
+
resistance score.
|
|
32
|
+
rationale: >
|
|
33
|
+
Partial manifests create false confidence; a complete manifest proves end-to-end
|
|
34
|
+
quality rather than cherry-picked metrics.
|
|
35
|
+
|
|
36
|
+
REQ-3:
|
|
37
|
+
id: REQ-RQM-003
|
|
38
|
+
title: Pass/Warn/Fail Status per Gate
|
|
39
|
+
rule: >
|
|
40
|
+
Each gate entry MUST carry a status field: "pass" (meets target), "warn"
|
|
41
|
+
(within acceptable deviation from target), or "fail" (blocks release).
|
|
42
|
+
The manifest MUST have an overall status field derived from the worst gate.
|
|
43
|
+
rationale: >
|
|
44
|
+
A pass/warn/fail status per gate plus an aggregate status enables release go/no-go
|
|
45
|
+
automation without human judgment on individual metrics.
|
|
46
|
+
|
|
47
|
+
REQ-4:
|
|
48
|
+
id: REQ-RQM-004
|
|
49
|
+
title: Automated Generation in CI
|
|
50
|
+
rule: >
|
|
51
|
+
The Quality Manifest MUST be generated automatically by CI (not manually
|
|
52
|
+
authored). Each gate's value MUST be extracted from the corresponding tool
|
|
53
|
+
output (vitest coverage JSON, stryker JSON, trivy SARIF, etc.).
|
|
54
|
+
rationale: >
|
|
55
|
+
Manually authored manifests are unreliable; CI-generated manifests are the
|
|
56
|
+
only form of evidence that meets audit requirements.
|
|
57
|
+
|
|
58
|
+
REQ-5:
|
|
59
|
+
id: REQ-RQM-005
|
|
60
|
+
title: Customer-Facing Summary
|
|
61
|
+
rule: >
|
|
62
|
+
A human-readable summary of the Quality Manifest (e.g., Markdown table)
|
|
63
|
+
MUST be generated alongside the machine-readable format and included in
|
|
64
|
+
the release notes or documentation.
|
|
65
|
+
rationale: >
|
|
66
|
+
Customers and auditors need a scannable summary; the machine-readable format
|
|
67
|
+
alone does not satisfy human review requirements.
|
|
68
|
+
|
|
69
|
+
manifest_schema:
|
|
70
|
+
release: "string — semver tag e.g. v1.2.0"
|
|
71
|
+
generated_at: "ISO 8601 timestamp"
|
|
72
|
+
commit: "git SHA"
|
|
73
|
+
gates:
|
|
74
|
+
unit_coverage:
|
|
75
|
+
actual: "percentage string e.g. '73%'"
|
|
76
|
+
target: "threshold e.g. '80%'"
|
|
77
|
+
status: "pass | warn | fail"
|
|
78
|
+
mutation_score:
|
|
79
|
+
actual: "percentage string"
|
|
80
|
+
target: "threshold"
|
|
81
|
+
status: "pass | warn | fail"
|
|
82
|
+
sca_critical_cve:
|
|
83
|
+
actual: "integer"
|
|
84
|
+
target: "0"
|
|
85
|
+
status: "pass | fail"
|
|
86
|
+
sca_high_cve:
|
|
87
|
+
actual: "integer"
|
|
88
|
+
target: "0"
|
|
89
|
+
status: "pass | warn | fail"
|
|
90
|
+
sast_high:
|
|
91
|
+
actual: "integer"
|
|
92
|
+
target: "0"
|
|
93
|
+
status: "pass | warn | fail"
|
|
94
|
+
e2e_pass_rate:
|
|
95
|
+
actual: "percentage string"
|
|
96
|
+
target: "threshold"
|
|
97
|
+
status: "pass | warn | fail"
|
|
98
|
+
container_cve_critical:
|
|
99
|
+
actual: "integer"
|
|
100
|
+
target: "0"
|
|
101
|
+
status: "pass | fail"
|
|
102
|
+
image_signed:
|
|
103
|
+
actual: "boolean"
|
|
104
|
+
target: "true"
|
|
105
|
+
status: "pass | fail"
|
|
106
|
+
sbom_present:
|
|
107
|
+
actual: "boolean"
|
|
108
|
+
target: "true"
|
|
109
|
+
status: "pass | fail"
|
|
110
|
+
overall: "PASS | WARN | FAIL"
|
|
111
|
+
|
|
112
|
+
generation_guidance: >
|
|
113
|
+
Extract coverage from vitest --coverage --coverage.reporter=json-summary (coverage-summary.json, total.lines.pct).
|
|
114
|
+
Extract mutation score from stryker's mutation-testing-report.json (metrics.mutationScore).
|
|
115
|
+
Extract CVE counts from trivy JSON output (Results[].Vulnerabilities filtered by Severity).
|
|
116
|
+
Extract SAST from CodeQL SARIF (runs[].results filtered by level=error).
|
|
117
|
+
Combine into manifest YAML via a CI shell script or Node.js release script.
|
|
118
|
+
|
|
119
|
+
anti_patterns:
|
|
120
|
+
- description: >
|
|
121
|
+
Generating the manifest after all gates have passed — gates should use
|
|
122
|
+
the manifest values, not precede them.
|
|
123
|
+
- description: >
|
|
124
|
+
Hardcoding metric values in the manifest generation script — all values
|
|
125
|
+
MUST be extracted from tool outputs to remain accurate.
|
|
126
|
+
- description: >
|
|
127
|
+
Using 'warn' status for critical security gates (sca_critical_cve,
|
|
128
|
+
container_cve_critical) — critical security gates are binary pass/fail.
|
|
129
|
+
|
|
130
|
+
related_standards:
|
|
131
|
+
- testing
|
|
132
|
+
- security-testing
|
|
133
|
+
- supply-chain-attestation
|
|
134
|
+
- verification-evidence
|
|
135
|
+
- deployment-standards
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Release Readiness Gate Standards - AI Optimized
|
|
2
|
+
# Source: core/release-readiness-gate.md
|
|
3
|
+
|
|
4
|
+
id: release-readiness-gate
|
|
5
|
+
meta:
|
|
6
|
+
version: "1.0.0"
|
|
7
|
+
updated: "2026-05-05"
|
|
8
|
+
source: core/release-readiness-gate.md
|
|
9
|
+
description: Single aggregated release gate covering 16 quality dimensions with tiered sign-off template and RQM integration
|
|
10
|
+
|
|
11
|
+
requirements:
|
|
12
|
+
REQ-1:
|
|
13
|
+
id: REQ-RRG-001
|
|
14
|
+
title: 16-Dimension Coverage
|
|
15
|
+
rule: >
|
|
16
|
+
Every production release MUST evaluate all 16 quality dimensions defined in
|
|
17
|
+
core/release-readiness-gate.md. Tier-1 dimensions block release if FAIL.
|
|
18
|
+
Tier-2 dimensions require documented rationale if WARN. Tier-3 dimensions
|
|
19
|
+
require rationale if N/A.
|
|
20
|
+
rationale: >
|
|
21
|
+
Without explicit multi-dimension coverage, teams pass individual gate checks
|
|
22
|
+
but ship with unverified quality dimensions, creating systematic blind spots.
|
|
23
|
+
|
|
24
|
+
REQ-2:
|
|
25
|
+
id: REQ-RRG-002
|
|
26
|
+
title: Release Readiness Sign-off
|
|
27
|
+
rule: >
|
|
28
|
+
A Release Readiness Sign-off document MUST be created from the template in
|
|
29
|
+
core/release-readiness-gate.md for every release tag. It MUST be stored at
|
|
30
|
+
.release-readiness/<version>.md. The Overall Decision field MUST be explicitly
|
|
31
|
+
set to GO or NO-GO by a named release owner.
|
|
32
|
+
rationale: >
|
|
33
|
+
Anonymous or implicit GO decisions remove accountability; the sign-off creates
|
|
34
|
+
a named, dated, auditable record of the go/no-go decision and its evidence.
|
|
35
|
+
|
|
36
|
+
REQ-3:
|
|
37
|
+
id: REQ-RRG-003
|
|
38
|
+
title: Tier-1 Hard Block
|
|
39
|
+
rule: >
|
|
40
|
+
ANY Tier-1 dimension at FAIL status MUST block production deployment.
|
|
41
|
+
Tier-1 dimensions are: Security (Dim 2), DB Migration (Dim 5), Operational
|
|
42
|
+
Readiness (Dim 7), Rollback/DR (Dim 13), Production Smoke (Dim 14).
|
|
43
|
+
rationale: >
|
|
44
|
+
Tier-1 dimensions represent existential risks: security vulnerabilities,
|
|
45
|
+
broken rollback, misconfigured monitoring. No business justification
|
|
46
|
+
overrides a Tier-1 FAIL.
|
|
47
|
+
|
|
48
|
+
REQ-4:
|
|
49
|
+
id: REQ-RRG-004
|
|
50
|
+
title: RQM Alignment
|
|
51
|
+
rule: >
|
|
52
|
+
The machine-readable Release Quality Manifest (see release-quality-manifest.md)
|
|
53
|
+
MUST include entries for all automated dimensions (a11y_critical, contract_drift,
|
|
54
|
+
cross_flow_cuj_pass_rate, browser_tier1_pass_rate, capacity_headroom_cpu_pct,
|
|
55
|
+
smoke_pass_rate, flow_gate_report). The RQM overall field MUST be PASS or WARN
|
|
56
|
+
(never FAIL) before deployment.
|
|
57
|
+
rationale: >
|
|
58
|
+
Human sign-off and machine manifest are complementary; the manifest enables
|
|
59
|
+
automated enforcement while the sign-off provides human accountability.
|
|
60
|
+
|
|
61
|
+
REQ-5:
|
|
62
|
+
id: REQ-RRG-005
|
|
63
|
+
title: Incremental Collection
|
|
64
|
+
rule: >
|
|
65
|
+
Release Readiness Sign-off evidence MUST be collected incrementally throughout
|
|
66
|
+
the release cycle (Gate 0 at PRD, Gate 3 pre-UAT, Gate 4 post-UAT). Creating
|
|
67
|
+
the sign-off on the day of deployment is an anti-pattern.
|
|
68
|
+
rationale: >
|
|
69
|
+
Last-minute sign-offs are rubber stamps; evidence collected late cannot
|
|
70
|
+
be acted upon without delaying the release.
|
|
71
|
+
|
|
72
|
+
quick_reference:
|
|
73
|
+
tier_1_dimensions: "Security, DB Migration, Operational Readiness, Rollback/DR, Production Smoke"
|
|
74
|
+
tier_2_dimensions: "Performance, a11y, Cross-flow Regression, i18n, Docs, Feature Flags, Multi-Gate Flow"
|
|
75
|
+
tier_3_dimensions: "Contract Testing, Browser Compat, Capacity, Compliance/Privacy"
|
|
76
|
+
sign_off_location: ".release-readiness/<version>.md"
|
|
77
|
+
rqm_integration: "flow_gate_report.json → release-quality-manifest.yaml field flow_gate_report"
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
name: Replay Test Standards
|
|
3
|
+
nameZh: 回放測試標準
|
|
4
|
+
id: replay-test
|
|
5
|
+
version: "1.0.0"
|
|
6
|
+
category: testing
|
|
7
|
+
scope: ai-agent-systems
|
|
8
|
+
summary: >
|
|
9
|
+
Golden fixture recording and deterministic replay for AI agent pipelines.
|
|
10
|
+
Enables customer bug reproduction, verdict regression detection, and
|
|
11
|
+
on-site incident investigation without requiring a live LLM.
|
|
12
|
+
|
|
13
|
+
requirements:
|
|
14
|
+
- id: REQ-01
|
|
15
|
+
title: Golden Fixture Format
|
|
16
|
+
titleZh: 黃金 fixture 格式
|
|
17
|
+
level: MUST
|
|
18
|
+
description: >
|
|
19
|
+
Each replay fixture MUST be a JSON file containing: (1) the exact input
|
|
20
|
+
that triggered the behaviour, (2) the expected output (decision/verdict),
|
|
21
|
+
(3) metadata (date recorded, source — customer report / CI regression /
|
|
22
|
+
incident / red-team exercise, description). Fixtures MUST be deterministic (same input always
|
|
23
|
+
produces same output for pure-function components).
|
|
24
|
+
|
|
25
|
+
- id: REQ-02
|
|
26
|
+
title: Replay Test Suite
|
|
27
|
+
titleZh: 回放測試套件
|
|
28
|
+
level: MUST
|
|
29
|
+
description: >
|
|
30
|
+
A dedicated replay test file MUST load each fixture and assert that
|
|
31
|
+
re-running the component under test produces the recorded expected output.
|
|
32
|
+
For AI components with LLM dependencies, replay MUST mock the LLM layer
|
|
33
|
+
and test only the deterministic logic (scoring, routing, policy evaluation).
|
|
34
|
+
|
|
35
|
+
- id: REQ-03
|
|
36
|
+
title: Bug Regression Capture
|
|
37
|
+
titleZh: Bug 回歸捕捉
|
|
38
|
+
level: MUST
|
|
39
|
+
description: >
|
|
40
|
+
When a production bug is reported, a fixture MUST be created from the
|
|
41
|
+
failing input within the same PR that fixes the bug. The fixture prevents
|
|
42
|
+
the bug from being reintroduced silently.
|
|
43
|
+
|
|
44
|
+
- id: REQ-04
|
|
45
|
+
title: Fixture Coverage
|
|
46
|
+
titleZh: Fixture 覆蓋
|
|
47
|
+
level: SHOULD
|
|
48
|
+
description: >
|
|
49
|
+
The fixture set SHOULD include at least one representative for each
|
|
50
|
+
decision outcome (e.g. ALLOW / REQUIRE_HITL / DENY for Guardian).
|
|
51
|
+
Edge cases reported by customers or from red-team exercises SHOULD be
|
|
52
|
+
added as separate fixtures.
|
|
53
|
+
|
|
54
|
+
- id: REQ-05
|
|
55
|
+
title: Fixture Naming Convention
|
|
56
|
+
titleZh: Fixture 命名規範
|
|
57
|
+
level: MUST
|
|
58
|
+
description: >
|
|
59
|
+
Fixture files MUST follow the pattern:
|
|
60
|
+
`<component>-<outcome>-<short-description>.json`
|
|
61
|
+
e.g. `guardian-deny-prod-drop-table.json`,
|
|
62
|
+
`guardian-allow-dev-npm-install.json`
|
|
63
|
+
|
|
64
|
+
examples:
|
|
65
|
+
- name: "Guardian replay fixture file"
|
|
66
|
+
code: |
|
|
67
|
+
{
|
|
68
|
+
"meta": {
|
|
69
|
+
"recorded": "2026-05-05",
|
|
70
|
+
"source": "red-team-exercise",
|
|
71
|
+
"description": "DROP TABLE in prod should DENY"
|
|
72
|
+
},
|
|
73
|
+
"input": {
|
|
74
|
+
"session_id": "replay-001",
|
|
75
|
+
"source_agent": "operator",
|
|
76
|
+
"intent": "Clean up test data",
|
|
77
|
+
"plan": [{"command": "DROP TABLE users;", "command_type": "mutate", "target_resource": "db_schema", "reversible": false}],
|
|
78
|
+
"target_env": "prod",
|
|
79
|
+
"reversible": false
|
|
80
|
+
},
|
|
81
|
+
"expected": {
|
|
82
|
+
"decision": "DENY"
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
- name: "Replay test loading fixtures"
|
|
87
|
+
code: |
|
|
88
|
+
const fixtures = readdirSync('src/guardian/__fixtures__')
|
|
89
|
+
.filter(f => f.endsWith('.json'))
|
|
90
|
+
.map(f => JSON.parse(readFileSync(join('src/guardian/__fixtures__', f), 'utf-8')))
|
|
91
|
+
|
|
92
|
+
for (const { meta, input, expected } of fixtures) {
|
|
93
|
+
it(meta.description, () => {
|
|
94
|
+
const result = scoreReviewable(input)
|
|
95
|
+
const decision = deriveDecision(result.score)
|
|
96
|
+
expect(decision).toBe(expected.decision)
|
|
97
|
+
})
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
anti_patterns:
|
|
101
|
+
- description: >
|
|
102
|
+
Fixtures without metadata fields — without source and date, it's
|
|
103
|
+
impossible to know why a fixture exists or when it was added.
|
|
104
|
+
- description: >
|
|
105
|
+
Creating fixtures only for the happy path — the most valuable fixtures
|
|
106
|
+
are customer-reported failures and red-team findings.
|
|
107
|
+
|
|
108
|
+
related_standards:
|
|
109
|
+
- adversarial-test
|
|
110
|
+
- testing
|
|
111
|
+
- verification-evidence
|