universal-dev-standards 5.4.0 → 5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/bundled/ai/standards/adversarial-test.ai.yaml +277 -0
  2. package/bundled/ai/standards/audit-trail.ai.yaml +113 -0
  3. package/bundled/ai/standards/chaos-injection-tests.ai.yaml +91 -0
  4. package/bundled/ai/standards/container-image-standards.ai.yaml +88 -0
  5. package/bundled/ai/standards/container-security.ai.yaml +331 -0
  6. package/bundled/ai/standards/cost-budget-test.ai.yaml +96 -0
  7. package/bundled/ai/standards/data-contract.ai.yaml +110 -0
  8. package/bundled/ai/standards/data-migration-testing.ai.yaml +96 -0
  9. package/bundled/ai/standards/data-pipeline.ai.yaml +113 -0
  10. package/bundled/ai/standards/disaster-recovery-drill.ai.yaml +89 -0
  11. package/bundled/ai/standards/flaky-test-management.ai.yaml +89 -0
  12. package/bundled/ai/standards/flow-based-testing.ai.yaml +240 -0
  13. package/bundled/ai/standards/iac-design-principles.ai.yaml +83 -0
  14. package/bundled/ai/standards/incident-response.ai.yaml +107 -0
  15. package/bundled/ai/standards/license-compliance.ai.yaml +106 -0
  16. package/bundled/ai/standards/llm-output-validation.ai.yaml +269 -0
  17. package/bundled/ai/standards/mock-boundary.ai.yaml +250 -0
  18. package/bundled/ai/standards/mutation-testing.ai.yaml +192 -0
  19. package/bundled/ai/standards/pii-classification.ai.yaml +109 -0
  20. package/bundled/ai/standards/policy-as-code-testing.ai.yaml +227 -0
  21. package/bundled/ai/standards/prd-standards.ai.yaml +88 -0
  22. package/bundled/ai/standards/product-metrics-standards.ai.yaml +111 -0
  23. package/bundled/ai/standards/prompt-regression.ai.yaml +94 -0
  24. package/bundled/ai/standards/property-based-testing.ai.yaml +105 -0
  25. package/bundled/ai/standards/release-quality-manifest.ai.yaml +135 -0
  26. package/bundled/ai/standards/replay-test.ai.yaml +111 -0
  27. package/bundled/ai/standards/runbook.ai.yaml +104 -0
  28. package/bundled/ai/standards/sast-advanced.ai.yaml +135 -0
  29. package/bundled/ai/standards/schema-evolution.ai.yaml +111 -0
  30. package/bundled/ai/standards/secret-management-standards.ai.yaml +105 -0
  31. package/bundled/ai/standards/secure-op.ai.yaml +365 -0
  32. package/bundled/ai/standards/security-testing.ai.yaml +171 -0
  33. package/bundled/ai/standards/server-ops-security.ai.yaml +274 -0
  34. package/bundled/ai/standards/slo-sli.ai.yaml +97 -0
  35. package/bundled/ai/standards/smoke-test.ai.yaml +87 -0
  36. package/bundled/ai/standards/supply-chain-attestation.ai.yaml +109 -0
  37. package/bundled/ai/standards/test-completeness-dimensions.ai.yaml +52 -5
  38. package/bundled/ai/standards/user-story-mapping.ai.yaml +108 -0
  39. package/bundled/core/adversarial-test.md +212 -0
  40. package/bundled/core/chaos-injection-tests.md +116 -0
  41. package/bundled/core/container-security.md +521 -0
  42. package/bundled/core/cost-budget-test.md +69 -0
  43. package/bundled/core/data-migration-testing.md +110 -0
  44. package/bundled/core/disaster-recovery-drill.md +73 -0
  45. package/bundled/core/flaky-test-management.md +73 -0
  46. package/bundled/core/flow-based-testing.md +142 -0
  47. package/bundled/core/llm-output-validation.md +178 -0
  48. package/bundled/core/mock-boundary.md +100 -0
  49. package/bundled/core/mutation-testing.md +97 -0
  50. package/bundled/core/policy-as-code-testing.md +188 -0
  51. package/bundled/core/prompt-regression.md +72 -0
  52. package/bundled/core/property-based-testing.md +73 -0
  53. package/bundled/core/release-quality-manifest.md +147 -0
  54. package/bundled/core/replay-test.md +86 -0
  55. package/bundled/core/sast-advanced.md +300 -0
  56. package/bundled/core/secure-op.md +314 -0
  57. package/bundled/core/security-testing.md +87 -0
  58. package/bundled/core/server-ops-security.md +493 -0
  59. package/bundled/core/smoke-test.md +65 -0
  60. package/bundled/core/supply-chain-attestation.md +117 -0
  61. package/bundled/locales/zh-CN/CHANGELOG.md +3 -3
  62. package/bundled/locales/zh-CN/README.md +1 -1
  63. package/bundled/locales/zh-CN/skills/ai-instruction-standards/SKILL.md +5 -5
  64. package/bundled/locales/zh-TW/CHANGELOG.md +3 -3
  65. package/bundled/locales/zh-TW/README.md +1 -1
  66. package/bundled/locales/zh-TW/skills/ai-instruction-standards/SKILL.md +183 -79
  67. package/bundled/skills/README.md +4 -3
  68. package/bundled/skills/SKILL_NAMING.md +94 -0
  69. package/bundled/skills/ai-instruction-standards/SKILL.md +181 -88
  70. package/bundled/skills/atdd-assistant/SKILL.md +8 -0
  71. package/bundled/skills/bdd-assistant/SKILL.md +7 -0
  72. package/bundled/skills/checkin-assistant/SKILL.md +8 -0
  73. package/bundled/skills/code-review-assistant/SKILL.md +7 -0
  74. package/bundled/skills/journey-test-assistant/SKILL.md +203 -0
  75. package/bundled/skills/orchestrate/SKILL.md +167 -0
  76. package/bundled/skills/plan/SKILL.md +234 -0
  77. package/bundled/skills/pr-automation-assistant/SKILL.md +8 -0
  78. package/bundled/skills/push/SKILL.md +49 -2
  79. package/bundled/skills/{process-automation → skill-builder}/SKILL.md +1 -1
  80. package/bundled/skills/{forward-derivation → spec-derivation}/SKILL.md +1 -1
  81. package/bundled/skills/spec-driven-dev/SKILL.md +7 -0
  82. package/bundled/skills/sweep/SKILL.md +145 -0
  83. package/bundled/skills/tdd-assistant/SKILL.md +7 -0
  84. package/package.json +1 -1
  85. package/src/commands/flow.js +8 -0
  86. package/src/commands/start.js +14 -0
  87. package/src/commands/sweep.js +8 -0
  88. package/src/commands/workflow.js +8 -0
  89. package/standards-registry.json +426 -4
  90. package/bundled/locales/zh-CN/skills/ac-coverage-assistant/SKILL.md +0 -190
  91. package/bundled/locales/zh-CN/skills/forward-derivation/SKILL.md +0 -71
  92. package/bundled/locales/zh-CN/skills/forward-derivation/guide.md +0 -130
  93. package/bundled/locales/zh-CN/skills/methodology-system/SKILL.md +0 -88
  94. package/bundled/locales/zh-CN/skills/methodology-system/create-methodology.md +0 -350
  95. package/bundled/locales/zh-CN/skills/methodology-system/guide.md +0 -131
  96. package/bundled/locales/zh-CN/skills/methodology-system/runtime.md +0 -279
  97. package/bundled/locales/zh-CN/skills/process-automation/SKILL.md +0 -143
  98. package/bundled/locales/zh-TW/skills/ac-coverage-assistant/SKILL.md +0 -195
  99. package/bundled/locales/zh-TW/skills/deploy-assistant/SKILL.md +0 -178
  100. package/bundled/locales/zh-TW/skills/forward-derivation/SKILL.md +0 -69
  101. package/bundled/locales/zh-TW/skills/forward-derivation/guide.md +0 -415
  102. package/bundled/locales/zh-TW/skills/methodology-system/SKILL.md +0 -86
  103. package/bundled/locales/zh-TW/skills/methodology-system/create-methodology.md +0 -350
  104. package/bundled/locales/zh-TW/skills/methodology-system/guide.md +0 -131
  105. package/bundled/locales/zh-TW/skills/methodology-system/runtime.md +0 -279
  106. package/bundled/locales/zh-TW/skills/process-automation/SKILL.md +0 -144
  107. /package/bundled/skills/{ac-coverage-assistant → ac-coverage}/SKILL.md +0 -0
  108. /package/bundled/skills/{methodology-system → dev-methodology}/SKILL.md +0 -0
  109. /package/bundled/skills/{methodology-system → dev-methodology}/create-methodology.md +0 -0
  110. /package/bundled/skills/{methodology-system → dev-methodology}/guide.md +0 -0
  111. /package/bundled/skills/{methodology-system → dev-methodology}/integrated-flow.md +0 -0
  112. /package/bundled/skills/{methodology-system → dev-methodology}/prerequisite-check.md +0 -0
  113. /package/bundled/skills/{methodology-system → dev-methodology}/runtime.md +0 -0
  114. /package/bundled/skills/{forward-derivation → spec-derivation}/guide.md +0 -0
@@ -0,0 +1,227 @@
+ # Policy as Code Testing Standards - AI Optimized
+ # Source: core/policy-as-code-testing.md
+
+ id: policy-as-code-testing
+ meta:
+   version: "1.0.0"
+   updated: "2026-05-05"
+   source: core/policy-as-code-testing.md
+   description: >
+     Standards for unit testing Open Policy Agent (OPA) Rego policies and
+     other Policy as Code (PaC) engines. Ensures that AI agent authorization
+     policies are tested with the same rigor as application code.
+
+ # ─────────────────────────────────────────────────────────
+ # Core Concepts
+ # ─────────────────────────────────────────────────────────
+ core_concepts:
+   definition: >
+     Policy as Code (PaC) means security and authorization policies are expressed
+     as code (Rego, Cedar, CEL) rather than manual configuration. This enables
+     version control, code review, and automated testing of policies.
+
+   opa_test_framework:
+     overview: >
+       OPA's built-in test framework allows unit testing Rego policies with
+       `opa test`. Tests are Rego rules with names prefixed by `test_`.
+       Tests pass if they evaluate to `true`, fail if `false` or undefined.
+     run_command: "opa test <policy_directory> -v"
+     file_convention: "<policy_name>_test.rego in the same directory as the policy"
+
+   why_test_policies:
+     - reason: Policies encode security decisions — untested policies create silent security holes
+     - reason: Policy logic can have edge cases (reversible vs. irreversible, env-specific rules)
+     - reason: Policy changes must be validated against both allowed and denied cases
+     - reason: OPA Rego syntax errors are only caught at runtime without tests
+
+ # ─────────────────────────────────────────────────────────
+ # OPA Rego Test Structure
+ # ─────────────────────────────────────────────────────────
+ rego_test_structure:
+   file_naming: "<policy_module>_test.rego"
+   package_naming: "<policy_package>_test"
+
+   test_rule_format: |
+     # Each test is a Rego rule with `test_` prefix
+     # Test passes if rule body evaluates to true
+     test_<description> if {
+       <rule_under_test> with input as { <test_input> }
+     }
+
+     # Negative test (assert rule does NOT fire)
+     test_<description>_is_not_violated if {
+       not <rule_under_test> with input as { <test_input> }
+     }
+
+   required_test_categories:
+     - category: ALLOW cases
+       description: Inputs that must NOT trigger the policy violation
+       minimum: 2
+       example: |
+         test_safe_select_is_allowed if {
+           not data.my_pkg.has_violation with input as {
+             "plan": [{"command_type": "sql", "command": "SELECT * FROM t"}]
+           }
+         }
+
+     - category: DENY cases
+       description: Inputs that MUST trigger the policy violation
+       minimum: 3
+       example: |
+         test_drop_database_is_forbidden if {
+           data.my_pkg.has_forbidden_pattern with input as {
+             "plan": [{"command_type": "sql", "command": "DROP DATABASE prod"}]
+           }
+         }
+
+     - category: Boundary cases
+       description: Edge cases at the boundary of the policy condition
+       minimum: 1
+       example: |
+         # reversible=false triggers but reversible=true does not
+         test_irreversible_triggers if {
+           data.my_pkg.prod_violation with input as {
+             "target_env": "prod",
+             "plan": [{"reversible": false, "command": "DELETE FROM users"}]
+           }
+         }
+         test_reversible_does_not_trigger if {
+           not data.my_pkg.prod_violation with input as {
+             "target_env": "prod",
+             "plan": [{"reversible": true, "command": "SELECT * FROM users"}]
+           }
+         }
+
+     - category: Integration test (main policy)
+       description: Test the full policy chain via the main/root package
+       minimum: 2
+
+ # ─────────────────────────────────────────────────────────
+ # Policy Module Design Rules
+ # ─────────────────────────────────────────────────────────
+ policy_design_rules:
+   - rule: fail_closed_default
+     description: >
+       The root policy package MUST have `default allow = false`.
+       Any evaluation error or undefined result should deny, not allow.
+     example: |
+       default allow = false
+       allow if {
+         not data.my_pkg.forbidden.has_violation
+         not data.my_pkg.env.prod_violation
+       }
+
+   - rule: no_free_text_in_security_decisions
+     description: >
+       Policy rules MUST NOT parse user-controlled free-text fields (intent,
+       description, annotations) for security decisions. Only structured,
+       typed fields (command_type, reversible, target_env) should drive policy.
+     rationale: Free-text parsing creates prompt injection attack surface (OWASP LLM01)
+
+   - rule: set_not_array_for_violations
+     description: >
+       Use partial set rules (`violations[reason] if {...}`) to aggregate
+       violation reasons, not array rules. Arrays cannot be used with partial rules.
+     example: |
+       # CORRECT: partial set rule
+       violations[reason] if {
+         has_violation
+         reason := "VIOLATION_TYPE"
+       }
+       # INCORRECT: array.concat on sets causes type errors in OPA ≥ 0.40
+       # deny_reasons := array.concat(violations1, violations2) ← DO NOT USE
+
+   - rule: module_per_concern
+     description: >
+       Each policy concern should be a separate Rego module (file).
+       E.g., forbidden_patterns.rego / env_policy.rego / risk_gate.rego.
+       Main.rego aggregates all modules via data references.
+     benefit: Enables per-module testing and cleaner separation of concerns
+
+ # ─────────────────────────────────────────────────────────
+ # CI Integration
+ # ─────────────────────────────────────────────────────────
+ ci_integration:
+   github_actions_step: |
+     - name: Test OPA Rego Policies
+       run: |
+         docker run --rm \
+           -v "${{ github.workspace }}/src/guardian/policies:/policies:ro" \
+           openpolicyagent/opa:latest-static \
+           test /policies -v
+
+   npm_script: |
+     "test:policy": "docker run --rm -v \"$(pwd)/src/guardian/policies:/policies:ro\" openpolicyagent/opa:latest-static test /policies -v"
+
+   local_opa_binary: |
+     # If OPA binary is installed:
+     opa test src/guardian/policies/ -v
+
+ # ─────────────────────────────────────────────────────────
+ # Quality Gates
+ # ─────────────────────────────────────────────────────────
+ quality_gates:
+   - gate: OPA policy tests (CI)
+     threshold: "100% of Rego tests pass"
+     enforcement: Block merge
+     automated: true
+     note: Run via `opa test` on every PR that touches *.rego files
+
+   - gate: Policy coverage
+     threshold: "Each policy module has ≥ 2 ALLOW + ≥ 3 DENY test cases"
+     enforcement: Advisory (reviewer checklist)
+
+   - gate: Integration tests
+     threshold: "Root policy (main.rego) has tests for both allow and deny paths"
+     enforcement: Block merge
+
+ # ─────────────────────────────────────────────────────────
+ # Rules
+ # ─────────────────────────────────────────────────────────
+ rules:
+   - id: rego-unit-test-per-module
+     trigger: creating or modifying a Rego policy module
+     instruction: >
+       Every Rego module MUST have a corresponding _test.rego file with at minimum:
+       2 ALLOW cases, 3 DENY cases, and 1 boundary case.
+     priority: required
+
+   - id: fail-closed-default
+     trigger: creating a root OPA policy package
+     instruction: >
+       Root policy MUST include `default allow = false`.
+       Any undefined evaluation must result in DENY.
+     priority: required
+
+   - id: no-free-text-in-policy
+     trigger: writing Rego rules
+     instruction: >
+       Never parse intent, description, or annotation fields in Rego rules.
+       Use only structured fields: command_type, command, reversible, target_env,
+       target_resource, risk_score.
+     priority: required
+
+   - id: policy-test-on-rego-change
+     trigger: modifying any *.rego file
+     instruction: >
+       Re-run `opa test` on the entire policy directory after any change.
+       CI must block merge if OPA tests fail.
+     priority: required
+
+ anti_patterns:
+   - Using array.concat() on set-type violation rules (type error in OPA ≥ 0.40)
+   - Parsing intent/user-input fields in security policy logic
+   - Missing `default allow = false` in root policy
+   - Policy modules without corresponding _test.rego files
+   - Testing only DENY cases (no ALLOW cases means you can't tell if the policy is too restrictive)
+   - Running OPA tests only locally, not in CI
+
+ quick_reference:
+   policy_test_checklist: |
+     □ Each policy module has a _test.rego file
+     □ Tests cover: ALLOW cases (≥ 2), DENY cases (≥ 3), boundary cases (≥ 1)
+     □ main.rego / root policy tested via integration tests
+     □ `default allow = false` present in root policy
+     □ No free-text field parsing in Rego rules
+     □ `opa test <policies_dir> -v` passes locally
+     □ CI step: `opa test` runs on every PR touching *.rego
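The standard's embedded examples test individual Rego modules; the "Integration test (main policy)" category has no example of its own. Below is a minimal sketch of one way to cover it from a project's TypeScript test suite, assuming the `opa` binary is on PATH, the root package exposes `data.main.allow`, and the policies live in `src/guardian/policies` (the path used in the CI step above); adjust those names to your setup.

```typescript
// Illustrative only: integration-test the root policy's allow/deny paths by
// evaluating it with `opa eval` (assumes the opa binary is installed locally).
import { execFileSync } from "node:child_process"
import { describe, it, expect } from "vitest"

const POLICY_DIR = "src/guardian/policies" // same directory as the CI step above

function evalAllow(input: unknown): boolean {
  const out = execFileSync(
    "opa",
    ["eval", "--format", "json", "--data", POLICY_DIR, "--stdin-input", "data.main.allow"],
    { input: JSON.stringify(input), encoding: "utf8" }
  )
  const parsed = JSON.parse(out)
  // An undefined result comes back as an empty result set; treat it as deny (fail closed).
  return parsed.result?.[0]?.expressions?.[0]?.value === true
}

describe("root policy integration", () => {
  it("allows a reversible read-only plan", () => {
    expect(
      evalAllow({
        target_env: "staging",
        plan: [{ command_type: "sql", command: "SELECT 1", reversible: true }],
      })
    ).toBe(true)
  })

  it("denies an irreversible command against prod", () => {
    expect(
      evalAllow({
        target_env: "prod",
        plan: [{ command_type: "sql", command: "DROP DATABASE prod", reversible: false }],
      })
    ).toBe(false)
  })
})
```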
@@ -0,0 +1,88 @@
+ # PRD Standards - AI Optimized
+ # Source: XSPEC-069 Wave 4 Product Layer Pack
+
+ id: prd-standards
+ title: Product Requirements Document Standards
+ version: "1.0.0"
+ status: Active
+ tags: [product, prd, requirements, user-research, planning]
+ summary: |
+   Defines the structure, content requirements, and lifecycle governance for
+   Product Requirements Documents (PRDs). Covers five mandatory PRD sections
+   (Problem Statement, Target User/Persona, Success Metrics, Scope in/out,
+   Constraints), the bridge from PRD requirements to traceable user stories,
+   and the revision policy for changes after kickoff. Designed to ensure product
+   intent is clearly communicable to engineering, design, and stakeholders with
+   measurable success criteria.
+
+ requirements:
+   - id: REQ-001
+     title: PRD Five Sections
+     description: |
+       Every PRD MUST contain five sections in the following order:
+       (1) Problem Statement — describes the user pain point or opportunity
+       in observable terms; includes quantitative data where available (e.g.,
+       support ticket volume, user research findings, funnel drop-off rates).
+       (2) Target User / Persona — identifies who is affected; references
+       named personas with role, context, and goals; at minimum one primary
+       persona and one secondary persona.
+       (3) Success Metrics — defines 2–4 measurable outcomes that indicate
+       the problem is solved; each metric must include current baseline,
+       target value, and measurement method.
+       (4) Scope In / Out — explicitly lists what is included and excluded
+       from this PRD; out-of-scope items may reference future PRDs.
+       (5) Constraints — technical, regulatory, time, budget, or dependency
+       constraints that bound the solution space.
+     level: MUST
+     examples:
+       - "Problem: 34% of users abandon checkout at payment step (Mixpanel, Q1 2026)"
+       - "Primary persona: Mid-market SaaS buyer; Secondary: IT admin approver"
+       - "Success metric: Checkout completion rate ≥ 72% (baseline 66%) measured via Amplitude"
+       - "Out of scope: saved payment methods (deferred to PRD-2026-Q3-payments)"
+
+   - id: REQ-002
+     title: PRD to User Story Bridge
+     description: |
+       Each PRD requirement MUST be broken down into one or more user stories
+       following the INVEST criteria defined in requirement-engineering.ai.yaml.
+       Every user story derived from a PRD MUST be traceable to at least one
+       PRD success metric — stories that cannot be linked to a success metric
+       MUST be flagged for PM review before inclusion in the backlog. The
+       traceability link (PRD section ID → User Story ID → Success Metric) MUST
+       be maintained in the backlog tool or as a traceability matrix in the PRD.
+     level: MUST
+     examples:
+       - "PRD-REQ-003 → US-042 'As a buyer, I want to pay with Apple Pay' → metric: checkout rate"
+       - "Story without metric link flagged with label `needs-metric-trace` in Jira"
+       - "Traceability matrix table in PRD section 6: REQ ↔ Stories ↔ Metrics"
+       - "Stories use INVEST: Independent, Negotiable, Valuable, Estimable, Small, Testable"
+
+   - id: REQ-003
+     title: Revision Policy
+     description: |
+       PRD changes requested after the development kickoff meeting MUST follow
+       a formal revision process: (1) Proposed change documented with rationale
+       and impact assessment. (2) Stakeholder sign-off obtained from PM, Tech
+       Lead, and Design Lead. (3) Scope impact assessed — if change adds scope,
+       a corresponding item must be moved to out-of-scope or timeline adjusted.
+       (4) Version history updated in the PRD with date, author, change summary,
+       and approver. PRDs without version history that have been modified after
+       kickoff are considered non-compliant.
+     level: MUST
+     examples:
+       - "PRD v1.2 (2026-04-15, @alice): Added biometric auth requirement; approved by @bob, @carol"
+       - "Scope impact: biometric auth added → saved cards feature deferred to v2"
+       - "Change log table at PRD top: Version | Date | Author | Summary | Approvers"
+       - "Minor editorial changes (typos, formatting) exempt from sign-off requirement"
+
+ anti_patterns:
+   - "PRD without measurable success metrics (qualitative goals only, e.g., 'improve UX')"
+   - "Scope creep without change log: adding requirements mid-sprint without documented approval"
+   - "Solution-first PRD: describing implementation details before establishing user problem"
+   - "PRD with no explicit out-of-scope section, causing boundary disputes during development"
+   - "Success metrics defined after development starts, making them unverifiable"
+
+ related_standards:
+   - requirement-engineering
+   - user-story-mapping
+   - product-metrics-standards
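REQ-002 deliberately leaves the traceability mechanism open (backlog tool or a matrix in the PRD itself). A minimal sketch of the link as data, using the `needs-metric-trace` flag from the examples; the `TraceEntry` shape and field names are hypothetical, not prescribed by the standard.

```typescript
// Hypothetical shape for one row of a PRD traceability matrix (REQ-002).
interface TraceEntry {
  prdRequirement: string   // e.g. "PRD-REQ-003"
  userStory: string        // e.g. "US-042"
  successMetric?: string   // e.g. "checkout completion rate"
}

// Stories without a metric link must be flagged for PM review before entering
// the backlog; here the flag is modeled as the `needs-metric-trace` label.
function storiesNeedingMetricTrace(matrix: TraceEntry[]): string[] {
  return matrix
    .filter((entry) => !entry.successMetric)
    .map((entry) => `${entry.userStory}: add label "needs-metric-trace"`)
}

const matrix: TraceEntry[] = [
  { prdRequirement: "PRD-REQ-003", userStory: "US-042", successMetric: "checkout completion rate" },
  { prdRequirement: "PRD-REQ-004", userStory: "US-051" }, // no metric link yet
]

console.log(storiesNeedingMetricTrace(matrix))
// -> [ 'US-051: add label "needs-metric-trace"' ]
```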
@@ -0,0 +1,111 @@
+ # Product Metrics Standards - AI Optimized
+ # Source: XSPEC-069 Wave 4 Product Layer Pack
+
+ id: product-metrics-standards
+ title: Product Metrics Framework Standards
+ version: "1.0.0"
+ status: Active
+ tags: [product, metrics, kpi, aarrr, heart, north-star, analytics]
+ summary: |
+   Defines how teams select, structure, and govern product metrics. Covers
+   a framework selection matrix (AARRR for growth products, HEART for
+   experience products, custom North Star for platforms), North Star metric
+   criteria, a three-level metric hierarchy (North Star → L1 drivers →
+   L2 diagnostics), and an anti-vanity rule that rejects metrics decoupled
+   from revenue or retention impact. Designed to align teams around metrics
+   that drive meaningful product decisions rather than activity tracking.
+
+ requirements:
+   - id: REQ-001
+     title: Framework Selection Matrix
+     description: |
+       Teams MUST select a primary metrics framework appropriate to their
+       product type. Selection criteria: (1) Growth products (consumer apps,
+       marketplaces, viral products focused on user acquisition and monetization)
+       → use AARRR framework (Acquisition, Activation, Retention, Referral,
+       Revenue). (2) Experience products (productivity tools, B2B SaaS, apps
+       where user satisfaction and task completion drive retention) → use
+       HEART framework (Happiness, Engagement, Adoption, Retention, Task
+       Success). (3) Platform products (developer platforms, APIs, infrastructure
+       products with diverse use cases) → define a custom North Star metric
+       that reflects platform value delivered, supplemented by AARRR or HEART
+       components as applicable. Framework selection MUST be documented in the
+       PRD or product strategy document.
+     level: MUST
+     examples:
+       - "Consumer social app → AARRR; primary focus on D7 retention and referral coefficient"
+       - "B2B project management tool → HEART; primary focus on Task Success and Retention"
+       - "Developer API platform → custom North Star: 'API calls per active developer per week'"
+       - "Framework documented in product-metrics.md: 'We use HEART because...'"
+
+   - id: REQ-002
+     title: North Star Criteria
+     description: |
+       Every product MUST define exactly one North Star metric that satisfies
+       all four criteria: (1) Leading indicator — it predicts future business
+       health (revenue, retention) rather than measuring past outcomes.
+       (2) Measurable and trackable — it can be calculated from existing or
+       easily obtainable data with a defined measurement cadence (weekly
+       or monthly). (3) Actionable by the team — the product team has direct
+       levers to influence it through feature development and UX decisions.
+       (4) Explainable in one sentence — any team member can describe what
+       it measures and why it matters without needing additional context.
+       North Star MUST be reviewed and reconfirmed at each annual product
+       planning cycle.
+     level: MUST
+     examples:
+       - "Spotify: 'Time spent listening per user per week' (leading, measurable, actionable)"
+       - "Airbnb: 'Nights booked per month' (explainable, predicts revenue)"
+       - "Weak North Star: 'Total revenue' (lagging, not directly actionable by product team)"
+       - "Annual review: North Star unchanged but L1 driver metrics updated for new product area"
+
+   - id: REQ-003
+     title: Metric Hierarchy
+     description: |
+       Teams MUST structure metrics in a three-level hierarchy with a maximum
+       of three levels. Level 1 (North Star): one metric representing overall
+       product value delivered. Level 2 (L1 Driver Metrics): 3–5 metrics that
+       directly influence the North Star; each driver metric must have a
+       documented causal hypothesis linking it to the North Star. Level 3
+       (L2 Diagnostic Metrics): per-feature or per-team metrics that explain
+       movements in L1 drivers; maximum 3 diagnostics per driver. Metrics
+       beyond three levels of hierarchy are PROHIBITED — they indicate
+       measurement fragmentation rather than focus.
+     level: MUST
+     examples:
+       - "North Star: weekly active users completing core action"
+       - "L1 drivers: new user activation rate, 7-day retention, feature adoption breadth"
+       - "L2 diagnostic for activation: onboarding step completion rates (step 1/2/3)"
+       - "Prohibited: L4 sub-diagnostic metrics that obscure rather than explain"
+
+   - id: REQ-004
+     title: Anti-Vanity Rule
+     description: |
+       Teams MUST apply the anti-vanity test before adding any metric to the
+       official metrics dashboard. A metric fails the anti-vanity test if it
+       can increase while revenue and retention remain flat or decrease. Such
+       metrics MUST NOT appear in official product reviews or be used as
+       success criteria for features. Examples of vanity metrics that commonly
+       fail this test: total registered users (without active usage filter),
+       raw pageviews (without session quality filter), total API calls
+       (without unique active customer filter), press mentions, app store
+       downloads without activation. When a vanity metric is useful for
+       operational monitoring, it MUST be clearly labeled as "operational
+       indicator, not success metric."
+     level: MUST
+     examples:
+       - "Reject: 'Total signups this month' → replace with 'Signups who completed activation'"
+       - "Reject: 'Total pageviews' → replace with 'Sessions with ≥2 meaningful interactions'"
+       - "Allowed with label: 'Total API calls (operational indicator)' on infra dashboard"
+       - "Feature success metric: 'Users who used feature X and retained at D30' not 'feature clicks'"
+
+ anti_patterns:
+   - "Tracking vanity metrics (total signups, raw pageviews) as primary success indicators"
+   - "No North Star defined — teams optimize for different local metrics, creating misalignment"
+   - "Conflicting team metrics where one team's optimization harms another team's metric"
+   - "Metric hierarchy deeper than 3 levels, creating measurement complexity without insight"
+   - "Changing the North Star quarterly, preventing year-over-year trend analysis"
+
+ related_standards:
+   - prd-standards
+   - slo-sli
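The limits in REQ-003 (one North Star, 3–5 L1 drivers, at most 3 diagnostics per driver, each driver with a causal hypothesis) can be checked mechanically at review time. The interface and check below are a hypothetical sketch, not part of the standard.

```typescript
// Illustrative encoding of the three-level metric hierarchy from REQ-003.
interface MetricHierarchy {
  northStar: string
  drivers: {
    name: string
    causalHypothesis: string // required: why this driver moves the North Star
    diagnostics: string[]    // max 3 per driver
  }[]
}

// Returns a list of hierarchy violations for a reviewer to act on.
function validateHierarchy(h: MetricHierarchy): string[] {
  const issues: string[] = []
  if (h.drivers.length < 3 || h.drivers.length > 5) {
    issues.push(`expected 3–5 L1 drivers, got ${h.drivers.length}`)
  }
  for (const d of h.drivers) {
    if (!d.causalHypothesis) issues.push(`driver "${d.name}" is missing a causal hypothesis`)
    if (d.diagnostics.length > 3) issues.push(`driver "${d.name}" has more than 3 diagnostics`)
  }
  return issues
}
```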
@@ -0,0 +1,94 @@
+ # SPDX-License-Identifier: MIT
+ name: Prompt Regression Standards
+ nameZh: Prompt 回歸測試標準
+ id: prompt-regression
+ version: "1.0.0"
+ category: testing
+ scope: ai-agent-systems
+ summary: >
+   Prevent unintended prompt changes from silently degrading AI agent behaviour.
+   Golden checksum tests detect modifications; snapshot diff confirms intent.
+
+ requirements:
+   - id: REQ-01
+     title: Prompt Versioning
+     titleZh: Prompt 版本化
+     level: MUST
+     description: >
+       Every AI agent MUST have a versioned prompt file (e.g. prompt.md with
+       frontmatter `version: "x.y.z"`). Changes require explicit version bump.
+
+   - id: REQ-02
+     title: Golden Checksum Test
+     titleZh: 黃金校驗和測試
+     level: MUST
+     description: >
+       A CI-enforced checksum test MUST store the SHA-256 hash of each agent's
+       prompt file. A mismatch fails the build and requires updating the golden
+       value with a comment confirming intent ("intentional change for X").
+     implementation: |
+       const GOLDEN = { "planner": "abc123...", ... }
+       it("prompt has not changed unexpectedly", () => {
+         const actual = sha256(readFileSync("agents/planner/prompt.md"))
+         expect(actual).toBe(GOLDEN["planner"])
+       })
+
+   - id: REQ-03
+     title: Snapshot Diff
+     titleZh: 快照差異比對
+     level: SHOULD
+     description: >
+       On CI failure, the system SHOULD output a unified diff between the
+       recorded snapshot and current prompt content to aid review.
+
+   - id: REQ-04
+     title: Change Review Gate
+     titleZh: 變更審查閘門
+     level: MUST
+     description: >
+       Prompt changes MUST be reviewed as code changes. Checksum updates require
+       a comment in the test file explaining why the prompt changed (e.g.
+       "Updated system role to include new Guardian policy reference XSPEC-160").
+
+   - id: REQ-05
+     title: Coverage
+     titleZh: 覆蓋範圍
+     level: MUST
+     description: >
+       The golden checksum test MUST cover ALL production agent prompt files.
+       New agents added to the system MUST be added to the test within the same
+       PR that introduces the agent.
+
+ examples:
+   - name: "SHA-256 checksum test in Vitest"
+     code: |
+       import { createHash } from "crypto"
+       import { readFileSync } from "fs"
+       import { describe, it, expect } from "vitest"
+
+       const GOLDEN_CHECKSUMS: Record<string, string> = {
+         planner: "dd0d086d...",
+         guardian: "f56555...",
+       }
+
+       describe("Agent prompt regression", () => {
+         for (const [agent, expected] of Object.entries(GOLDEN_CHECKSUMS)) {
+           it(`agents/${agent}/prompt.md has not changed unexpectedly`, () => {
+             const content = readFileSync(`agents/${agent}/prompt.md`)
+             const actual = createHash("sha256").update(content).digest("hex")
+             expect(actual).toBe(expected)
+           })
+         }
+       })
+
+ anti_patterns:
+   - description: >
+       Skipping checksums for "stable" agents — all prompts require regression
+       coverage regardless of perceived stability.
+   - description: >
+       Updating checksums in bulk without per-agent comments explaining intent.
+
+ related_standards:
+   - llm-output-validation
+   - adversarial-test
+   - testing
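REQ-03 (Snapshot Diff) states the requirement without an example. One possible implementation sketch is shown below; it assumes the jsdiff (`diff`) package and a `snapshots/` directory, both illustrative choices rather than part of the standard.

```typescript
// One way to satisfy REQ-03: when the golden checksum no longer matches, print a
// unified diff between the last recorded snapshot and the current prompt.
// The "diff" (jsdiff) dependency and the snapshots/ layout are assumptions.
import { readFileSync } from "node:fs"
import { createTwoFilesPatch } from "diff"

export function promptDiff(agent: string): string {
  const snapshot = readFileSync(`snapshots/${agent}.prompt.md`, "utf8")
  const current = readFileSync(`agents/${agent}/prompt.md`, "utf8")
  return createTwoFilesPatch(
    `snapshots/${agent}.prompt.md`, // recorded snapshot
    `agents/${agent}/prompt.md`,    // current prompt
    snapshot,
    current
  )
}

// In the checksum test, emit the diff only when the assertion is about to fail:
//   if (actual !== expected) console.error(promptDiff(agent))
```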
@@ -0,0 +1,105 @@
+ # SPDX-License-Identifier: MIT
+ name: Property-Based Testing Standards
+ nameZh: 屬性基礎測試標準
+ id: property-based-testing
+ version: "1.0.0"
+ category: testing
+ scope: correctness-validation
+ summary: >
+   Property-based testing generates hundreds of random inputs to verify that
+   invariants (properties) hold for all valid inputs, not just the examples
+   a developer thought to write. Especially valuable for pure functions,
+   parsers, and security-critical logic.
+
+ requirements:
+   - id: REQ-01
+     title: Property Identification
+     titleZh: 屬性識別
+     level: MUST
+     description: >
+       For each module under property-based test, developers MUST explicitly
+       identify the invariants to test. Examples: idempotency, monotonicity,
+       round-trip (encode/decode), boundary clamping, determinism.
+
+   - id: REQ-02
+     title: Generator Coverage
+     titleZh: 生成器覆蓋
+     level: MUST
+     description: >
+       Property tests MUST use appropriate generators: bounded integers for
+       array indices, valid enum values for type-constrained fields,
+       arbitrary strings for fuzz targets. Generators MUST cover the full
+       valid input space, not just a curated subset.
+
+   - id: REQ-03
+     title: Shrinking
+     titleZh: 最小化反例
+     level: SHOULD
+     description: >
+       When a property test fails, the framework SHOULD shrink the
+       counterexample to its minimal form. Libraries like fast-check (JS/TS)
+       and Hypothesis (Python) do this automatically.
+
+   - id: REQ-04
+     title: Run Count
+     titleZh: 執行次數
+     level: MUST
+     description: >
+       Property tests MUST run at least 100 samples per property in CI.
+       For security-critical functions (hash chains, token validation,
+       policy evaluation), run at least 1000 samples.
+
+   - id: REQ-05
+     title: Seed Persistence
+     titleZh: 種子持久化
+     level: SHOULD
+     description: >
+       When a property test fails, the failing seed SHOULD be saved and
+       re-run in CI until the fix is confirmed. This ensures regressions
+       are caught even after the random seed changes.
+
+ examples:
+   - name: "fast-check idempotency property for score clamping"
+     code: |
+       import fc from "fast-check"
+       import { describe, it } from "vitest"
+
+       describe("scoreReviewable idempotency", () => {
+         it("score is always in [0, 100]", () => {
+           fc.assert(
+             fc.property(
+               fc.record({
+                 target_env: fc.constantFrom("prod", "staging", "dev"),
+                 command_type: fc.constantFrom("query", "mutate", "exec", "delete"),
+                 reversible: fc.boolean(),
+               }),
+               ({ target_env, command_type, reversible }) => {
+                 const result = scoreReviewable({
+                   session_id: "prop-001",
+                   source_agent: "operator",
+                   intent: "test",
+                   plan: [{ command: "ls", command_type, reversible }],
+                   target_env,
+                   reversible,
+                 })
+                 return result.score >= 0 && result.score <= 100
+               }
+             ),
+             { numRuns: 1000 }
+           )
+         })
+       })
+
+ anti_patterns:
+   - description: >
+       Using fc.anything() for domain-specific inputs — unguided random
+       strings will mostly generate invalid inputs that get rejected early.
+       Use constrained generators (fc.constantFrom, fc.integer with bounds).
+   - description: >
+       Setting numRuns: 10 — too few samples to find edge cases. Use at
+       least 100 for regular code and 1000 for security-critical functions.
+
+ related_standards:
+   - mutation-testing
+   - testing
+   - adversarial-test
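REQ-05 (Seed Persistence) has no example above. The sketch below shows one way to replay a saved failing case with fast-check; the `seed` and `path` values are placeholders for whatever the original CI failure reported, and the inline clamping function stands in for the real function under test.

```typescript
// Sketch of REQ-05: fast-check reports a seed and counterexample path on failure;
// pinning them in fc.assert replays that exact case until the fix is confirmed.
import fc from "fast-check"
import { it } from "vitest"

it("replays the previously failing case for score clamping", () => {
  fc.assert(
    fc.property(fc.integer({ min: -1000, max: 1000 }), (n) => {
      const clamped = Math.min(100, Math.max(0, n)) // stand-in for the function under test
      return clamped >= 0 && clamped <= 100
    }),
    {
      seed: 1746400000,   // seed printed in the original CI failure (placeholder)
      path: "25:2",       // counterexample path reported by fast-check (placeholder)
      endOnFailure: true, // stop at the replayed case instead of shrinking again
    }
  )
})
```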