universal-dev-standards 5.4.0 → 5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/bundled/ai/standards/adversarial-test.ai.yaml +277 -0
  2. package/bundled/ai/standards/audit-trail.ai.yaml +113 -0
  3. package/bundled/ai/standards/chaos-injection-tests.ai.yaml +91 -0
  4. package/bundled/ai/standards/container-image-standards.ai.yaml +88 -0
  5. package/bundled/ai/standards/container-security.ai.yaml +331 -0
  6. package/bundled/ai/standards/cost-budget-test.ai.yaml +96 -0
  7. package/bundled/ai/standards/data-contract.ai.yaml +110 -0
  8. package/bundled/ai/standards/data-migration-testing.ai.yaml +96 -0
  9. package/bundled/ai/standards/data-pipeline.ai.yaml +113 -0
  10. package/bundled/ai/standards/disaster-recovery-drill.ai.yaml +89 -0
  11. package/bundled/ai/standards/flaky-test-management.ai.yaml +89 -0
  12. package/bundled/ai/standards/flow-based-testing.ai.yaml +240 -0
  13. package/bundled/ai/standards/iac-design-principles.ai.yaml +83 -0
  14. package/bundled/ai/standards/incident-response.ai.yaml +107 -0
  15. package/bundled/ai/standards/license-compliance.ai.yaml +106 -0
  16. package/bundled/ai/standards/llm-output-validation.ai.yaml +269 -0
  17. package/bundled/ai/standards/mock-boundary.ai.yaml +250 -0
  18. package/bundled/ai/standards/mutation-testing.ai.yaml +192 -0
  19. package/bundled/ai/standards/pii-classification.ai.yaml +109 -0
  20. package/bundled/ai/standards/policy-as-code-testing.ai.yaml +227 -0
  21. package/bundled/ai/standards/prd-standards.ai.yaml +88 -0
  22. package/bundled/ai/standards/product-metrics-standards.ai.yaml +111 -0
  23. package/bundled/ai/standards/prompt-regression.ai.yaml +94 -0
  24. package/bundled/ai/standards/property-based-testing.ai.yaml +105 -0
  25. package/bundled/ai/standards/release-quality-manifest.ai.yaml +135 -0
  26. package/bundled/ai/standards/replay-test.ai.yaml +111 -0
  27. package/bundled/ai/standards/runbook.ai.yaml +104 -0
  28. package/bundled/ai/standards/sast-advanced.ai.yaml +135 -0
  29. package/bundled/ai/standards/schema-evolution.ai.yaml +111 -0
  30. package/bundled/ai/standards/secret-management-standards.ai.yaml +105 -0
  31. package/bundled/ai/standards/secure-op.ai.yaml +365 -0
  32. package/bundled/ai/standards/security-testing.ai.yaml +171 -0
  33. package/bundled/ai/standards/server-ops-security.ai.yaml +274 -0
  34. package/bundled/ai/standards/slo-sli.ai.yaml +97 -0
  35. package/bundled/ai/standards/smoke-test.ai.yaml +87 -0
  36. package/bundled/ai/standards/supply-chain-attestation.ai.yaml +109 -0
  37. package/bundled/ai/standards/test-completeness-dimensions.ai.yaml +52 -5
  38. package/bundled/ai/standards/user-story-mapping.ai.yaml +108 -0
  39. package/bundled/core/adversarial-test.md +212 -0
  40. package/bundled/core/chaos-injection-tests.md +116 -0
  41. package/bundled/core/container-security.md +521 -0
  42. package/bundled/core/cost-budget-test.md +69 -0
  43. package/bundled/core/data-migration-testing.md +110 -0
  44. package/bundled/core/disaster-recovery-drill.md +73 -0
  45. package/bundled/core/flaky-test-management.md +73 -0
  46. package/bundled/core/flow-based-testing.md +142 -0
  47. package/bundled/core/llm-output-validation.md +178 -0
  48. package/bundled/core/mock-boundary.md +100 -0
  49. package/bundled/core/mutation-testing.md +97 -0
  50. package/bundled/core/policy-as-code-testing.md +188 -0
  51. package/bundled/core/prompt-regression.md +72 -0
  52. package/bundled/core/property-based-testing.md +73 -0
  53. package/bundled/core/release-quality-manifest.md +147 -0
  54. package/bundled/core/replay-test.md +86 -0
  55. package/bundled/core/sast-advanced.md +300 -0
  56. package/bundled/core/secure-op.md +314 -0
  57. package/bundled/core/security-testing.md +87 -0
  58. package/bundled/core/server-ops-security.md +493 -0
  59. package/bundled/core/smoke-test.md +65 -0
  60. package/bundled/core/supply-chain-attestation.md +117 -0
  61. package/bundled/locales/zh-CN/CHANGELOG.md +3 -3
  62. package/bundled/locales/zh-CN/README.md +1 -1
  63. package/bundled/locales/zh-CN/skills/ai-instruction-standards/SKILL.md +5 -5
  64. package/bundled/locales/zh-TW/CHANGELOG.md +3 -3
  65. package/bundled/locales/zh-TW/README.md +1 -1
  66. package/bundled/locales/zh-TW/skills/ai-instruction-standards/SKILL.md +183 -79
  67. package/bundled/skills/README.md +4 -3
  68. package/bundled/skills/SKILL_NAMING.md +94 -0
  69. package/bundled/skills/ai-instruction-standards/SKILL.md +181 -88
  70. package/bundled/skills/atdd-assistant/SKILL.md +8 -0
  71. package/bundled/skills/bdd-assistant/SKILL.md +7 -0
  72. package/bundled/skills/checkin-assistant/SKILL.md +8 -0
  73. package/bundled/skills/code-review-assistant/SKILL.md +7 -0
  74. package/bundled/skills/journey-test-assistant/SKILL.md +203 -0
  75. package/bundled/skills/orchestrate/SKILL.md +167 -0
  76. package/bundled/skills/plan/SKILL.md +234 -0
  77. package/bundled/skills/pr-automation-assistant/SKILL.md +8 -0
  78. package/bundled/skills/push/SKILL.md +49 -2
  79. package/bundled/skills/{process-automation → skill-builder}/SKILL.md +1 -1
  80. package/bundled/skills/{forward-derivation → spec-derivation}/SKILL.md +1 -1
  81. package/bundled/skills/spec-driven-dev/SKILL.md +7 -0
  82. package/bundled/skills/sweep/SKILL.md +145 -0
  83. package/bundled/skills/tdd-assistant/SKILL.md +7 -0
  84. package/package.json +1 -1
  85. package/src/commands/flow.js +8 -0
  86. package/src/commands/start.js +14 -0
  87. package/src/commands/sweep.js +8 -0
  88. package/src/commands/workflow.js +8 -0
  89. package/standards-registry.json +426 -4
  90. package/bundled/locales/zh-CN/skills/ac-coverage-assistant/SKILL.md +0 -190
  91. package/bundled/locales/zh-CN/skills/forward-derivation/SKILL.md +0 -71
  92. package/bundled/locales/zh-CN/skills/forward-derivation/guide.md +0 -130
  93. package/bundled/locales/zh-CN/skills/methodology-system/SKILL.md +0 -88
  94. package/bundled/locales/zh-CN/skills/methodology-system/create-methodology.md +0 -350
  95. package/bundled/locales/zh-CN/skills/methodology-system/guide.md +0 -131
  96. package/bundled/locales/zh-CN/skills/methodology-system/runtime.md +0 -279
  97. package/bundled/locales/zh-CN/skills/process-automation/SKILL.md +0 -143
  98. package/bundled/locales/zh-TW/skills/ac-coverage-assistant/SKILL.md +0 -195
  99. package/bundled/locales/zh-TW/skills/deploy-assistant/SKILL.md +0 -178
  100. package/bundled/locales/zh-TW/skills/forward-derivation/SKILL.md +0 -69
  101. package/bundled/locales/zh-TW/skills/forward-derivation/guide.md +0 -415
  102. package/bundled/locales/zh-TW/skills/methodology-system/SKILL.md +0 -86
  103. package/bundled/locales/zh-TW/skills/methodology-system/create-methodology.md +0 -350
  104. package/bundled/locales/zh-TW/skills/methodology-system/guide.md +0 -131
  105. package/bundled/locales/zh-TW/skills/methodology-system/runtime.md +0 -279
  106. package/bundled/locales/zh-TW/skills/process-automation/SKILL.md +0 -144
  107. /package/bundled/skills/{ac-coverage-assistant → ac-coverage}/SKILL.md +0 -0
  108. /package/bundled/skills/{methodology-system → dev-methodology}/SKILL.md +0 -0
  109. /package/bundled/skills/{methodology-system → dev-methodology}/create-methodology.md +0 -0
  110. /package/bundled/skills/{methodology-system → dev-methodology}/guide.md +0 -0
  111. /package/bundled/skills/{methodology-system → dev-methodology}/integrated-flow.md +0 -0
  112. /package/bundled/skills/{methodology-system → dev-methodology}/prerequisite-check.md +0 -0
  113. /package/bundled/skills/{methodology-system → dev-methodology}/runtime.md +0 -0
  114. /package/bundled/skills/{forward-derivation → spec-derivation}/guide.md +0 -0
@@ -0,0 +1,227 @@
+ # Policy as Code Testing Standards - AI Optimized
+ # Source: core/policy-as-code-testing.md
+
+ id: policy-as-code-testing
+ meta:
+   version: "1.0.0"
+   updated: "2026-05-05"
+   source: core/policy-as-code-testing.md
+   description: >
+     Standards for unit testing Open Policy Agent (OPA) Rego policies and
+     other Policy as Code (PaC) engines. Ensures that AI agent authorization
+     policies are tested with the same rigor as application code.
+
+ # ─────────────────────────────────────────────────────────
+ # Core Concepts
+ # ─────────────────────────────────────────────────────────
+ core_concepts:
+   definition: >
+     Policy as Code (PaC) means security and authorization policies are expressed
+     as code (Rego, Cedar, CEL) rather than manual configuration. This enables
+     version control, code review, and automated testing of policies.
+
+   opa_test_framework:
+     overview: >
+       OPA's built-in test framework allows unit testing Rego policies with
+       `opa test`. Tests are Rego rules with names prefixed by `test_`.
+       Tests pass if they evaluate to `true`, fail if `false` or undefined.
+     run_command: "opa test <policy_directory> -v"
+     file_convention: "<policy_name>_test.rego in the same directory as the policy"
+
+   why_test_policies:
+     - reason: Policies encode security decisions — untested policies create silent security holes
+     - reason: Policy logic can have edge cases (reversible vs. irreversible, env-specific rules)
+     - reason: Policy changes must be validated against both allowed and denied cases
+     - reason: OPA Rego syntax errors are only caught at runtime without tests
+
+ # ─────────────────────────────────────────────────────────
+ # OPA Rego Test Structure
+ # ─────────────────────────────────────────────────────────
+ rego_test_structure:
+   file_naming: "<policy_module>_test.rego"
+   package_naming: "<policy_package>_test"
+
+   test_rule_format: |
+     # Each test is a Rego rule with `test_` prefix
+     # Test passes if rule body evaluates to true
+     test_<description> if {
+       <rule_under_test> with input as { <test_input> }
+     }
+
+     # Negative test (assert rule does NOT fire)
+     test_<description>_is_not_violated if {
+       not <rule_under_test> with input as { <test_input> }
+     }
+
+   required_test_categories:
+     - category: ALLOW cases
+       description: Inputs that must NOT trigger the policy violation
+       minimum: 2
+       example: |
+         test_safe_select_is_allowed if {
+           not data.my_pkg.has_violation with input as {
+             "plan": [{"command_type": "sql", "command": "SELECT * FROM t"}]
+           }
+         }
+
+     - category: DENY cases
+       description: Inputs that MUST trigger the policy violation
+       minimum: 3
+       example: |
+         test_drop_database_is_forbidden if {
+           data.my_pkg.has_forbidden_pattern with input as {
+             "plan": [{"command_type": "sql", "command": "DROP DATABASE prod"}]
+           }
+         }
+
+     - category: Boundary cases
+       description: Edge cases at the boundary of the policy condition
+       minimum: 1
+       example: |
+         # reversible=false triggers but reversible=true does not
+         test_irreversible_triggers if {
+           data.my_pkg.prod_violation with input as {
+             "target_env": "prod",
+             "plan": [{"reversible": false, "command": "DELETE FROM users"}]
+           }
+         }
+         test_reversible_does_not_trigger if {
+           not data.my_pkg.prod_violation with input as {
+             "target_env": "prod",
+             "plan": [{"reversible": true, "command": "SELECT * FROM users"}]
+           }
+         }
+
+     - category: Integration test (main policy)
+       description: Test the full policy chain via the main/root package
+       minimum: 2
+
+ # ─────────────────────────────────────────────────────────
+ # Policy Module Design Rules
+ # ─────────────────────────────────────────────────────────
+ policy_design_rules:
+   - rule: fail_closed_default
+     description: >
+       The root policy package MUST have `default allow = false`.
+       Any evaluation error or undefined result should deny, not allow.
+     example: |
+       default allow = false
+       allow if {
+         not data.my_pkg.forbidden.has_violation
+         not data.my_pkg.env.prod_violation
+       }
+
+   - rule: no_free_text_in_security_decisions
+     description: >
+       Policy rules MUST NOT parse user-controlled free-text fields (intent,
+       description, annotations) for security decisions. Only structured,
+       typed fields (command_type, reversible, target_env) should drive policy.
+     rationale: Free-text parsing creates prompt injection attack surface (OWASP LLM01)
+
+   - rule: set_not_array_for_violations
+     description: >
+       Use partial set rules (`violations[reason] if {...}`) to aggregate
+       violation reasons, not array rules. Arrays cannot be used with partial rules.
+     example: |
+       # CORRECT: partial set rule
+       violations[reason] if {
+         has_violation
+         reason := "VIOLATION_TYPE"
+       }
+       # INCORRECT: array.concat on sets causes type errors in OPA ≥ 0.40
+       # deny_reasons := array.concat(violations1, violations2) ← DO NOT USE
+
+   - rule: module_per_concern
+     description: >
+       Each policy concern should be a separate Rego module (file).
+       E.g., forbidden_patterns.rego / env_policy.rego / risk_gate.rego.
+       Main.rego aggregates all modules via data references.
+     benefit: Enables per-module testing and cleaner separation of concerns
+
+ # ─────────────────────────────────────────────────────────
+ # CI Integration
+ # ─────────────────────────────────────────────────────────
+ ci_integration:
+   github_actions_step: |
+     - name: Test OPA Rego Policies
+       run: |
+         docker run --rm \
+           -v "${{ github.workspace }}/src/guardian/policies:/policies:ro" \
+           openpolicyagent/opa:latest-static \
+           test /policies -v
+
+   npm_script: |
+     "test:policy": "docker run --rm -v \"$(pwd)/src/guardian/policies:/policies:ro\" openpolicyagent/opa:latest-static test /policies -v"
+
+   local_opa_binary: |
+     # If OPA binary is installed:
+     opa test src/guardian/policies/ -v
+
+ # ─────────────────────────────────────────────────────────
+ # Quality Gates
+ # ─────────────────────────────────────────────────────────
+ quality_gates:
+   - gate: OPA policy tests (CI)
+     threshold: "100% of Rego tests pass"
+     enforcement: Block merge
+     automated: true
+     note: Run via `opa test` on every PR that touches *.rego files
+
+   - gate: Policy coverage
+     threshold: "Each policy module has ≥ 2 ALLOW + ≥ 3 DENY test cases"
+     enforcement: Advisory (reviewer checklist)
+
+   - gate: Integration tests
+     threshold: "Root policy (main.rego) has tests for both allow and deny paths"
+     enforcement: Block merge
+
+ # ─────────────────────────────────────────────────────────
+ # Rules
+ # ─────────────────────────────────────────────────────────
+ rules:
+   - id: rego-unit-test-per-module
+     trigger: creating or modifying a Rego policy module
+     instruction: >
+       Every Rego module MUST have a corresponding _test.rego file with at minimum:
+       2 ALLOW cases, 3 DENY cases, and 1 boundary case.
+     priority: required
+
+   - id: fail-closed-default
+     trigger: creating a root OPA policy package
+     instruction: >
+       Root policy MUST include `default allow = false`.
+       Any undefined evaluation must result in DENY.
+     priority: required
+
+   - id: no-free-text-in-policy
+     trigger: writing Rego rules
+     instruction: >
+       Never parse intent, description, or annotation fields in Rego rules.
+       Use only structured fields: command_type, command, reversible, target_env,
+       target_resource, risk_score.
+     priority: required
+
+   - id: policy-test-on-rego-change
+     trigger: modifying any *.rego file
+     instruction: >
+       Re-run `opa test` on the entire policy directory after any change.
+       CI must block merge if OPA tests fail.
+     priority: required
+
+ anti_patterns:
+   - Using array.concat() on set-type violation rules (type error in OPA ≥ 0.40)
+   - Parsing intent/user-input fields in security policy logic
+   - Missing `default allow = false` in root policy
+   - Policy modules without corresponding _test.rego files
+   - Testing only DENY cases (no ALLOW cases means you can't tell if the policy is too restrictive)
+   - Running OPA tests only locally, not in CI
+
+ quick_reference:
+   policy_test_checklist: |
+     □ Each policy module has a _test.rego file
+     □ Tests cover: ALLOW cases (≥ 2), DENY cases (≥ 3), boundary cases (≥ 1)
+     □ main.rego / root policy tested via integration tests
+     □ `default allow = false` present in root policy
+     □ No free-text field parsing in Rego rules
+     □ `opa test <policies_dir> -v` passes locally
+     □ CI step: `opa test` runs on every PR touching *.rego
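The standard's embedded examples test individual Rego modules; the "Integration test (main policy)" category has no example of its own. Below is a minimal sketch of one way to cover it from a project's TypeScript test suite, assuming the `opa` binary is on PATH, the root package exposes `data.main.allow`, and the policies live in `src/guardian/policies` (the path used in the CI step above); adjust those names to your setup.

```typescript
// Illustrative only: integration-test the root policy's allow/deny paths by
// evaluating it with `opa eval` (assumes the opa binary is installed locally).
import { execFileSync } from "node:child_process"
import { describe, it, expect } from "vitest"

const POLICY_DIR = "src/guardian/policies" // same directory as the CI step above

function evalAllow(input: unknown): boolean {
  const out = execFileSync(
    "opa",
    ["eval", "--format", "json", "--data", POLICY_DIR, "--stdin-input", "data.main.allow"],
    { input: JSON.stringify(input), encoding: "utf8" }
  )
  const parsed = JSON.parse(out)
  // An undefined result comes back as an empty result set; treat it as deny (fail closed).
  return parsed.result?.[0]?.expressions?.[0]?.value === true
}

describe("root policy integration", () => {
  it("allows a reversible read-only plan", () => {
    expect(
      evalAllow({
        target_env: "staging",
        plan: [{ command_type: "sql", command: "SELECT 1", reversible: true }],
      })
    ).toBe(true)
  })

  it("denies an irreversible command against prod", () => {
    expect(
      evalAllow({
        target_env: "prod",
        plan: [{ command_type: "sql", command: "DROP DATABASE prod", reversible: false }],
      })
    ).toBe(false)
  })
})
```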
@@ -0,0 +1,88 @@
+ # PRD Standards - AI Optimized
+ # Source: XSPEC-069 Wave 4 Product Layer Pack
+
+ id: prd-standards
+ title: Product Requirements Document Standards
+ version: "1.0.0"
+ status: Active
+ tags: [product, prd, requirements, user-research, planning]
+ summary: |
+   Defines the structure, content requirements, and lifecycle governance for
+   Product Requirements Documents (PRDs). Covers five mandatory PRD sections
+   (Problem Statement, Target User/Persona, Success Metrics, Scope in/out,
+   Constraints), the bridge from PRD requirements to traceable user stories,
+   and the revision policy for changes after kickoff. Designed to ensure product
+   intent is clearly communicable to engineering, design, and stakeholders with
+   measurable success criteria.
+
+ requirements:
+   - id: REQ-001
+     title: PRD Five Sections
+     description: |
+       Every PRD MUST contain five sections in the following order:
+       (1) Problem Statement — describes the user pain point or opportunity
+       in observable terms; includes quantitative data where available (e.g.,
+       support ticket volume, user research findings, funnel drop-off rates).
+       (2) Target User / Persona — identifies who is affected; references
+       named personas with role, context, and goals; at minimum one primary
+       persona and one secondary persona.
+       (3) Success Metrics — defines 2–4 measurable outcomes that indicate
+       the problem is solved; each metric must include current baseline,
+       target value, and measurement method.
+       (4) Scope In / Out — explicitly lists what is included and excluded
+       from this PRD; out-of-scope items may reference future PRDs.
+       (5) Constraints — technical, regulatory, time, budget, or dependency
+       constraints that bound the solution space.
+     level: MUST
+     examples:
+       - "Problem: 34% of users abandon checkout at payment step (Mixpanel, Q1 2026)"
+       - "Primary persona: Mid-market SaaS buyer; Secondary: IT admin approver"
+       - "Success metric: Checkout completion rate ≥ 72% (baseline 66%) measured via Amplitude"
+       - "Out of scope: saved payment methods (deferred to PRD-2026-Q3-payments)"
+
+   - id: REQ-002
+     title: PRD to User Story Bridge
+     description: |
+       Each PRD requirement MUST be broken down into one or more user stories
+       following the INVEST criteria defined in requirement-engineering.ai.yaml.
+       Every user story derived from a PRD MUST be traceable to at least one
+       PRD success metric — stories that cannot be linked to a success metric
+       MUST be flagged for PM review before inclusion in the backlog. The
+       traceability link (PRD section ID → User Story ID → Success Metric) MUST
+       be maintained in the backlog tool or as a traceability matrix in the PRD.
+     level: MUST
+     examples:
+       - "PRD-REQ-003 → US-042 'As a buyer, I want to pay with Apple Pay' → metric: checkout rate"
+       - "Story without metric link flagged with label `needs-metric-trace` in Jira"
+       - "Traceability matrix table in PRD section 6: REQ ↔ Stories ↔ Metrics"
+       - "Stories use INVEST: Independent, Negotiable, Valuable, Estimable, Small, Testable"
+
+   - id: REQ-003
+     title: Revision Policy
+     description: |
+       PRD changes requested after the development kickoff meeting MUST follow
+       a formal revision process: (1) Proposed change documented with rationale
+       and impact assessment. (2) Stakeholder sign-off obtained from PM, Tech
+       Lead, and Design Lead. (3) Scope impact assessed — if change adds scope,
+       a corresponding item must be moved to out-of-scope or timeline adjusted.
+       (4) Version history updated in the PRD with date, author, change summary,
+       and approver. PRDs without version history that have been modified after
+       kickoff are considered non-compliant.
+     level: MUST
+     examples:
+       - "PRD v1.2 (2026-04-15, @alice): Added biometric auth requirement; approved by @bob, @carol"
+       - "Scope impact: biometric auth added → saved cards feature deferred to v2"
+       - "Change log table at PRD top: Version | Date | Author | Summary | Approvers"
+       - "Minor editorial changes (typos, formatting) exempt from sign-off requirement"
+
+ anti_patterns:
+   - "PRD without measurable success metrics (qualitative goals only, e.g., 'improve UX')"
+   - "Scope creep without change log: adding requirements mid-sprint without documented approval"
+   - "Solution-first PRD: describing implementation details before establishing user problem"
+   - "PRD with no explicit out-of-scope section, causing boundary disputes during development"
+   - "Success metrics defined after development starts, making them unverifiable"
+
+ related_standards:
+   - requirement-engineering
+   - user-story-mapping
+   - product-metrics-standards
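REQ-002 deliberately leaves the traceability mechanism open (backlog tool or a matrix in the PRD itself). A minimal sketch of the link as data, using the `needs-metric-trace` flag from the examples; the `TraceEntry` shape and field names are hypothetical, not prescribed by the standard.

```typescript
// Hypothetical shape for one row of a PRD traceability matrix (REQ-002).
interface TraceEntry {
  prdRequirement: string   // e.g. "PRD-REQ-003"
  userStory: string        // e.g. "US-042"
  successMetric?: string   // e.g. "checkout completion rate"
}

// Stories without a metric link must be flagged for PM review before entering
// the backlog; here the flag is modeled as the `needs-metric-trace` label.
function storiesNeedingMetricTrace(matrix: TraceEntry[]): string[] {
  return matrix
    .filter((entry) => !entry.successMetric)
    .map((entry) => `${entry.userStory}: add label "needs-metric-trace"`)
}

const matrix: TraceEntry[] = [
  { prdRequirement: "PRD-REQ-003", userStory: "US-042", successMetric: "checkout completion rate" },
  { prdRequirement: "PRD-REQ-004", userStory: "US-051" }, // no metric link yet
]

console.log(storiesNeedingMetricTrace(matrix))
// -> [ 'US-051: add label "needs-metric-trace"' ]
```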
@@ -0,0 +1,111 @@
+ # Product Metrics Standards - AI Optimized
+ # Source: XSPEC-069 Wave 4 Product Layer Pack
+
+ id: product-metrics-standards
+ title: Product Metrics Framework Standards
+ version: "1.0.0"
+ status: Active
+ tags: [product, metrics, kpi, aarrr, heart, north-star, analytics]
+ summary: |
+   Defines how teams select, structure, and govern product metrics. Covers
+   a framework selection matrix (AARRR for growth products, HEART for
+   experience products, custom North Star for platforms), North Star metric
+   criteria, a three-level metric hierarchy (North Star → L1 drivers →
+   L2 diagnostics), and an anti-vanity rule that rejects metrics decoupled
+   from revenue or retention impact. Designed to align teams around metrics
+   that drive meaningful product decisions rather than activity tracking.
+
+ requirements:
+   - id: REQ-001
+     title: Framework Selection Matrix
+     description: |
+       Teams MUST select a primary metrics framework appropriate to their
+       product type. Selection criteria: (1) Growth products (consumer apps,
+       marketplaces, viral products focused on user acquisition and monetization)
+       → use AARRR framework (Acquisition, Activation, Retention, Referral,
+       Revenue). (2) Experience products (productivity tools, B2B SaaS, apps
+       where user satisfaction and task completion drive retention) → use
+       HEART framework (Happiness, Engagement, Adoption, Retention, Task
+       Success). (3) Platform products (developer platforms, APIs, infrastructure
+       products with diverse use cases) → define a custom North Star metric
+       that reflects platform value delivered, supplemented by AARRR or HEART
+       components as applicable. Framework selection MUST be documented in the
+       PRD or product strategy document.
+     level: MUST
+     examples:
+       - "Consumer social app → AARRR; primary focus on D7 retention and referral coefficient"
+       - "B2B project management tool → HEART; primary focus on Task Success and Retention"
+       - "Developer API platform → custom North Star: 'API calls per active developer per week'"
+       - "Framework documented in product-metrics.md: 'We use HEART because...'"
+
+   - id: REQ-002
+     title: North Star Criteria
+     description: |
+       Every product MUST define exactly one North Star metric that satisfies
+       all four criteria: (1) Leading indicator — it predicts future business
+       health (revenue, retention) rather than measuring past outcomes.
+       (2) Measurable and trackable — it can be calculated from existing or
+       easily obtainable data with a defined measurement cadence (weekly
+       or monthly). (3) Actionable by the team — the product team has direct
+       levers to influence it through feature development and UX decisions.
+       (4) Explainable in one sentence — any team member can describe what
+       it measures and why it matters without needing additional context.
+       North Star MUST be reviewed and reconfirmed at each annual product
+       planning cycle.
+     level: MUST
+     examples:
+       - "Spotify: 'Time spent listening per user per week' (leading, measurable, actionable)"
+       - "Airbnb: 'Nights booked per month' (explainable, predicts revenue)"
+       - "Weak North Star: 'Total revenue' (lagging, not directly actionable by product team)"
+       - "Annual review: North Star unchanged but L1 driver metrics updated for new product area"
+
+   - id: REQ-003
+     title: Metric Hierarchy
+     description: |
+       Teams MUST structure metrics in a three-level hierarchy with a maximum
+       of three levels. Level 1 (North Star): one metric representing overall
+       product value delivered. Level 2 (L1 Driver Metrics): 3–5 metrics that
+       directly influence the North Star; each driver metric must have a
+       documented causal hypothesis linking it to the North Star. Level 3
+       (L2 Diagnostic Metrics): per-feature or per-team metrics that explain
+       movements in L1 drivers; maximum 3 diagnostics per driver. Metrics
+       beyond three levels of hierarchy are PROHIBITED — they indicate
+       measurement fragmentation rather than focus.
+     level: MUST
+     examples:
+       - "North Star: weekly active users completing core action"
+       - "L1 drivers: new user activation rate, 7-day retention, feature adoption breadth"
+       - "L2 diagnostic for activation: onboarding step completion rates (step 1/2/3)"
+       - "Prohibited: L4 sub-diagnostic metrics that obscure rather than explain"
+
+   - id: REQ-004
+     title: Anti-Vanity Rule
+     description: |
+       Teams MUST apply the anti-vanity test before adding any metric to the
+       official metrics dashboard. A metric fails the anti-vanity test if it
+       can increase while revenue and retention remain flat or decrease. Such
+       metrics MUST NOT appear in official product reviews or be used as
+       success criteria for features. Examples of vanity metrics that commonly
+       fail this test: total registered users (without active usage filter),
+       raw pageviews (without session quality filter), total API calls
+       (without unique active customer filter), press mentions, app store
+       downloads without activation. When a vanity metric is useful for
+       operational monitoring, it MUST be clearly labeled as "operational
+       indicator, not success metric."
+     level: MUST
+     examples:
+       - "Reject: 'Total signups this month' → replace with 'Signups who completed activation'"
+       - "Reject: 'Total pageviews' → replace with 'Sessions with ≥2 meaningful interactions'"
+       - "Allowed with label: 'Total API calls (operational indicator)' on infra dashboard"
+       - "Feature success metric: 'Users who used feature X and retained at D30' not 'feature clicks'"
+
+ anti_patterns:
+   - "Tracking vanity metrics (total signups, raw pageviews) as primary success indicators"
+   - "No North Star defined — teams optimize for different local metrics, creating misalignment"
+   - "Conflicting team metrics where one team's optimization harms another team's metric"
+   - "Metric hierarchy deeper than 3 levels, creating measurement complexity without insight"
+   - "Changing the North Star quarterly, preventing year-over-year trend analysis"
+
+ related_standards:
+   - prd-standards
+   - slo-sli
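The limits in REQ-003 (one North Star, 3–5 L1 drivers, at most 3 diagnostics per driver, each driver with a causal hypothesis) can be checked mechanically at review time. The interface and check below are a hypothetical sketch, not part of the standard.

```typescript
// Illustrative encoding of the three-level metric hierarchy from REQ-003.
interface MetricHierarchy {
  northStar: string
  drivers: {
    name: string
    causalHypothesis: string // required: why this driver moves the North Star
    diagnostics: string[]    // max 3 per driver
  }[]
}

// Returns a list of hierarchy violations for a reviewer to act on.
function validateHierarchy(h: MetricHierarchy): string[] {
  const issues: string[] = []
  if (h.drivers.length < 3 || h.drivers.length > 5) {
    issues.push(`expected 3–5 L1 drivers, got ${h.drivers.length}`)
  }
  for (const d of h.drivers) {
    if (!d.causalHypothesis) issues.push(`driver "${d.name}" is missing a causal hypothesis`)
    if (d.diagnostics.length > 3) issues.push(`driver "${d.name}" has more than 3 diagnostics`)
  }
  return issues
}
```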
@@ -0,0 +1,94 @@
+ # SPDX-License-Identifier: MIT
+ name: Prompt Regression Standards
+ nameZh: Prompt 回歸測試標準
+ id: prompt-regression
+ version: "1.0.0"
+ category: testing
+ scope: ai-agent-systems
+ summary: >
+   Prevent unintended prompt changes from silently degrading AI agent behaviour.
+   Golden checksum tests detect modifications; snapshot diff confirms intent.
+
+ requirements:
+   - id: REQ-01
+     title: Prompt Versioning
+     titleZh: Prompt 版本化
+     level: MUST
+     description: >
+       Every AI agent MUST have a versioned prompt file (e.g. prompt.md with
+       frontmatter `version: "x.y.z"`). Changes require explicit version bump.
+
+   - id: REQ-02
+     title: Golden Checksum Test
+     titleZh: 黃金校驗和測試
+     level: MUST
+     description: >
+       A CI-enforced checksum test MUST store the SHA-256 hash of each agent's
+       prompt file. A mismatch fails the build and requires updating the golden
+       value with a comment confirming intent ("intentional change for X").
+     implementation: |
+       const GOLDEN = { "planner": "abc123...", ... }
+       it("prompt has not changed unexpectedly", () => {
+         const actual = sha256(readFileSync("agents/planner/prompt.md"))
+         expect(actual).toBe(GOLDEN["planner"])
+       })
+
+   - id: REQ-03
+     title: Snapshot Diff
+     titleZh: 快照差異比對
+     level: SHOULD
+     description: >
+       On CI failure, the system SHOULD output a unified diff between the
+       recorded snapshot and current prompt content to aid review.
+
+   - id: REQ-04
+     title: Change Review Gate
+     titleZh: 變更審查閘門
+     level: MUST
+     description: >
+       Prompt changes MUST be reviewed as code changes. Checksum updates require
+       a comment in the test file explaining why the prompt changed (e.g.
+       "Updated system role to include new Guardian policy reference XSPEC-160").
+
+   - id: REQ-05
+     title: Coverage
+     titleZh: 覆蓋範圍
+     level: MUST
+     description: >
+       The golden checksum test MUST cover ALL production agent prompt files.
+       New agents added to the system MUST be added to the test within the same
+       PR that introduces the agent.
+
+ examples:
+   - name: "SHA-256 checksum test in Vitest"
+     code: |
+       import { createHash } from "crypto"
+       import { readFileSync } from "fs"
+       import { describe, it, expect } from "vitest"
+
+       const GOLDEN_CHECKSUMS: Record<string, string> = {
+         planner: "dd0d086d...",
+         guardian: "f56555...",
+       }
+
+       describe("Agent prompt regression", () => {
+         for (const [agent, expected] of Object.entries(GOLDEN_CHECKSUMS)) {
+           it(`agents/${agent}/prompt.md has not changed unexpectedly`, () => {
+             const content = readFileSync(`agents/${agent}/prompt.md`)
+             const actual = createHash("sha256").update(content).digest("hex")
+             expect(actual).toBe(expected)
+           })
+         }
+       })
+
+ anti_patterns:
+   - description: >
+       Skipping checksums for "stable" agents — all prompts require regression
+       coverage regardless of perceived stability.
+   - description: >
+       Updating checksums in bulk without per-agent comments explaining intent.
+
+ related_standards:
+   - llm-output-validation
+   - adversarial-test
+   - testing
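REQ-03 (Snapshot Diff) states the requirement without an example. One possible implementation sketch is shown below; it assumes the jsdiff (`diff`) package and a `snapshots/` directory, both illustrative choices rather than part of the standard.

```typescript
// One way to satisfy REQ-03: when the golden checksum no longer matches, print a
// unified diff between the last recorded snapshot and the current prompt.
// The "diff" (jsdiff) dependency and the snapshots/ layout are assumptions.
import { readFileSync } from "node:fs"
import { createTwoFilesPatch } from "diff"

export function promptDiff(agent: string): string {
  const snapshot = readFileSync(`snapshots/${agent}.prompt.md`, "utf8")
  const current = readFileSync(`agents/${agent}/prompt.md`, "utf8")
  return createTwoFilesPatch(
    `snapshots/${agent}.prompt.md`, // recorded snapshot
    `agents/${agent}/prompt.md`,    // current prompt
    snapshot,
    current
  )
}

// In the checksum test, emit the diff only when the assertion is about to fail:
//   if (actual !== expected) console.error(promptDiff(agent))
```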
@@ -0,0 +1,105 @@
+ # SPDX-License-Identifier: MIT
+ name: Property-Based Testing Standards
+ nameZh: 屬性基礎測試標準
+ id: property-based-testing
+ version: "1.0.0"
+ category: testing
+ scope: correctness-validation
+ summary: >
+   Property-based testing generates hundreds of random inputs to verify that
+   invariants (properties) hold for all valid inputs, not just the examples
+   a developer thought to write. Especially valuable for pure functions,
+   parsers, and security-critical logic.
+
+ requirements:
+   - id: REQ-01
+     title: Property Identification
+     titleZh: 屬性識別
+     level: MUST
+     description: >
+       For each module under property-based test, developers MUST explicitly
+       identify the invariants to test. Examples: idempotency, monotonicity,
+       round-trip (encode/decode), boundary clamping, determinism.
+
+   - id: REQ-02
+     title: Generator Coverage
+     titleZh: 生成器覆蓋
+     level: MUST
+     description: >
+       Property tests MUST use appropriate generators: bounded integers for
+       array indices, valid enum values for type-constrained fields,
+       arbitrary strings for fuzz targets. Generators MUST cover the full
+       valid input space, not just a curated subset.
+
+   - id: REQ-03
+     title: Shrinking
+     titleZh: 最小化反例
+     level: SHOULD
+     description: >
+       When a property test fails, the framework SHOULD shrink the
+       counterexample to its minimal form. Libraries like fast-check (JS/TS)
+       and Hypothesis (Python) do this automatically.
+
+   - id: REQ-04
+     title: Run Count
+     titleZh: 執行次數
+     level: MUST
+     description: >
+       Property tests MUST run at least 100 samples per property in CI.
+       For security-critical functions (hash chains, token validation,
+       policy evaluation), run at least 1000 samples.
+
+   - id: REQ-05
+     title: Seed Persistence
+     titleZh: 種子持久化
+     level: SHOULD
+     description: >
+       When a property test fails, the failing seed SHOULD be saved and
+       re-run in CI until the fix is confirmed. This ensures regressions
+       are caught even after the random seed changes.
+
+ examples:
+   - name: "fast-check idempotency property for score clamping"
+     code: |
+       import fc from "fast-check"
+       import { describe, it } from "vitest"
+
+       describe("scoreReviewable idempotency", () => {
+         it("score is always in [0, 100]", () => {
+           fc.assert(
+             fc.property(
+               fc.record({
+                 target_env: fc.constantFrom("prod", "staging", "dev"),
+                 command_type: fc.constantFrom("query", "mutate", "exec", "delete"),
+                 reversible: fc.boolean(),
+               }),
+               ({ target_env, command_type, reversible }) => {
+                 const result = scoreReviewable({
+                   session_id: "prop-001",
+                   source_agent: "operator",
+                   intent: "test",
+                   plan: [{ command: "ls", command_type, reversible }],
+                   target_env,
+                   reversible,
+                 })
+                 return result.score >= 0 && result.score <= 100
+               }
+             ),
+             { numRuns: 1000 }
+           )
+         })
+       })
+
+ anti_patterns:
+   - description: >
+       Using fc.anything() for domain-specific inputs — unguided random
+       strings will mostly generate invalid inputs that get rejected early.
+       Use constrained generators (fc.constantFrom, fc.integer with bounds).
+   - description: >
+       Setting numRuns: 10 — too few samples to find edge cases. Use at
+       least 100 for regular code and 1000 for security-critical functions.
+
+ related_standards:
+   - mutation-testing
+   - testing
+   - adversarial-test
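REQ-05 (Seed Persistence) has no example above. The sketch below shows one way to replay a saved failing case with fast-check; the `seed` and `path` values are placeholders for whatever the original CI failure reported, and the inline clamping function stands in for the real function under test.

```typescript
// Sketch of REQ-05: fast-check reports a seed and counterexample path on failure;
// pinning them in fc.assert replays that exact case until the fix is confirmed.
import fc from "fast-check"
import { it } from "vitest"

it("replays the previously failing case for score clamping", () => {
  fc.assert(
    fc.property(fc.integer({ min: -1000, max: 1000 }), (n) => {
      const clamped = Math.min(100, Math.max(0, n)) // stand-in for the function under test
      return clamped >= 0 && clamped <= 100
    }),
    {
      seed: 1746400000,   // seed printed in the original CI failure (placeholder)
      path: "25:2",       // counterexample path reported by fast-check (placeholder)
      endOnFailure: true, // stop at the replayed case instead of shrinking again
    }
  )
})
```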