npm - universal-dev-standards - Versions diffs - 5.4.0 → 5.6.0 - Mend

universal-dev-standards 5.4.0 → 5.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

package/bundled/ai/standards/llm-output-validation.ai.yaml ADDED Viewed

@@ -0,0 +1,269 @@
+# LLM Output Validation Standards - AI Optimized
+# Source: core/llm-output-validation.md
+id: llm-output-validation
+meta:
+  version: "1.0.0"
+  updated: "2026-05-05"
+  source: core/llm-output-validation.md
+  description: >
+    Standards for validating LLM and AI agent outputs to ensure schema
+    conformance, detect hallucinations, and assess response groundedness.
+    Covers structural validation, semantic validation, and test strategies.
+# ─────────────────────────────────────────────────────────
+# Core Concepts
+# ─────────────────────────────────────────────────────────
+core_concepts:
+  definition: >
+    LLM outputs are non-deterministic and may violate expected schema,
+    introduce hallucinated facts, or fail to stay grounded in provided context.
+    LLM output validation is the practice of systematically testing that
+    AI agent outputs meet structural and semantic quality standards.
+  validation_dimensions:
+    - dimension: Schema Conformance
+      definition: >
+        The output matches the declared JSON / type schema exactly —
+        required fields present, types correct, enums respected.
+      test_type: Contract Test
+      automated: true
+    - dimension: Hallucination Detection
+      definition: >
+        Factual claims in the output can be traced back to source material
+        (grounded) or are fabricated (hallucinated).
+      test_type: Semantic / NLI probe
+      automated: partial
+    - dimension: Grounded Answer Rate
+      definition: >
+        Ratio of responses that cite or derive from provided context
+        vs. responses that introduce unsupported facts.
+      metric: "grounded_answers / total_answers"
+      target: "≥ 0.95 for RAG-based agents"
+      automated: partial
+    - dimension: Refusal Evaluation
+      definition: >
+        Agent correctly refuses (or escalates) out-of-scope, harmful,
+        or ambiguous requests rather than hallucinating a plausible answer.
+      test_type: Adversarial probe
+      automated: true (via test corpus)
+    - dimension: Determinism / Stability
+      definition: >
+        With temperature=0, same prompt yields same output schema structure
+        (if not identical content).
+      test_type: Replay / Golden file
+      automated: true
+# ─────────────────────────────────────────────────────────
+# Contract Testing (Schema Conformance)
+# ─────────────────────────────────────────────────────────
+contract_testing:
+  definition: >
+    Contract tests verify that LLM/agent outputs satisfy a declared schema
+    (JSON Schema, Zod, Pydantic, etc.) before being consumed downstream.
+    They are the cheapest and most automated form of LLM output validation.
+  patterns:
+    - name: Schema-driven contract test
+      description: >
+        Maintain an output-schema.json per agent. Run fixture outputs through
+        a schema validator (Ajv / Pydantic / jsonschema) in unit tests.
+      example: |
+        // TypeScript with ajv
+        import Ajv from "ajv"
+        import schema from "./output-schema.json"
+        const ajv = new Ajv()
+        const validate = ajv.compile(schema)
+        const result = validate(agentOutput)
+        expect(result).toBe(true)
+    - name: Golden file test
+      description: >
+        Capture a real LLM response as a golden fixture (JSON).
+        Re-validate against the schema on every schema change.
+        If the schema is unchanged, diff fresh output against the fixture to catch regressions.
+    - name: Negative contract test
+      description: >
+        Provide outputs that violate the schema (missing required fields,
+        wrong types). Assert the validator correctly rejects them.
+        Proves the schema is strict enough to catch real failures.
+  per_agent_structure:
+    - "agents/<agent-name>/output-schema.json — JSON Schema definition"
+    - "agents/<agent-name>/__tests__/contract.test.ts — contract test suite"
+    - "agents/<agent-name>/__fixtures__/valid.json — valid output fixture"
+    - "agents/<agent-name>/__fixtures__/invalid-*.json — invalid fixtures"
+# ─────────────────────────────────────────────────────────
+# Hallucination Detection
+# ─────────────────────────────────────────────────────────
+hallucination_detection:
+  definition: >
+    Hallucination occurs when an LLM generates plausible-sounding but
+    factually incorrect or unsupported content.
+  detection_strategies:
+    - strategy: Grounding check
+      description: >
+        For RAG-based agents: compare named entities / key facts in response
+        against retrieved documents using NLI or embedding similarity.
+      tools: [DeepEval, Ragas, custom NLI probe]
+      complexity: high
+    - strategy: Schema field validation
+      description: >
+        Structured outputs (JSON) partially prevent hallucination by requiring
+        specific enumerations and reference IDs. If the LLM hallucinates a
+        field value, it either violates the schema (caught by contract test)
+        or produces a real-looking but wrong value (requires semantic check).
+      complexity: low
+    - strategy: Confidence calibration
+      description: >
+        Agent includes a confidence score or explicitly marks uncertain claims.
+        Downstream consumers reject or escalate low-confidence outputs.
+      requires: Prompt design to elicit uncertainty markers
+      complexity: medium
+  rate_targets:
+    structured_output_agents:
+      schema_conformance: "≥ 99%"
+      factual_hallucination_rate: "≤ 5%"
+    rag_agents:
+      grounded_answer_rate: "≥ 95%"
+    chat_agents:
+      refusal_on_out_of_scope: "≥ 90%"
+# ─────────────────────────────────────────────────────────
+# Prompt Regression & Stability
+# ─────────────────────────────────────────────────────────
+prompt_regression:
+  definition: >
+    When a prompt is changed, verify that the output schema structure
+    is preserved (even if content differs). A prompt change that silently
+    breaks downstream field expectations is a regression.
+  test_pattern: |
+    # 1. Before prompt change: capture golden output structure
+    # 2. Change prompt
+    # 3. Re-run with temperature=0 on same input fixture
+    # 4. Validate output against schema (contract test)
+    # 5. Diff field presence / enum values against golden
+  required_on:
+    - Any change to agents/*/prompt.md
+    - Model version upgrade (same prompt, new model)
+    - New required output field added to schema
+# ─────────────────────────────────────────────────────────
+# Tools & Libraries
+# ─────────────────────────────────────────────────────────
+tools:
+  schema_validation:
+    - name: Ajv (TypeScript/JavaScript)
+      package: "ajv"
+      strengths: [JSON Schema draft-07 support, fast, minimal]
+      use_case: Contract tests for agents with JSON output-schema.json
+    - name: Zod (TypeScript)
+      package: "zod"
+      strengths: [Type-safe, composable, coercion support]
+      use_case: Runtime validation when TypeScript types are available
+    - name: Pydantic (Python)
+      package: "pydantic"
+      strengths: [Automatic validation from type annotations]
+      use_case: Python-based agent contract tests
+  hallucination_evaluation:
+    - name: DeepEval
+      package: "deepeval"
+      features: [Hallucination metric, Contextual Precision, Faithfulness]
+      note: Requires LLM judge (adds cost)
+    - name: Ragas
+      package: "ragas"
+      features: [Answer Grounding, Context Recall, Faithfulness]
+      note: Designed for RAG pipelines
+# ─────────────────────────────────────────────────────────
+# Quality Gates
+# ─────────────────────────────────────────────────────────
+quality_gates:
+  - gate: Schema conformance (CI)
+    threshold: "100% of valid fixtures pass schema validation"
+    enforcement: Block merge
+    automated: true
+    note: Run in every PR via contract.test.ts
+  - gate: Negative schema rejection (CI)
+    threshold: "100% of invalid fixtures are rejected by schema validator"
+    enforcement: Block merge
+    automated: true
+    note: Proves schema is strict enough
+  - gate: Hallucination rate (pre-release)
+    threshold: "≤ 5% hallucinated facts on benchmark dataset"
+    enforcement: Advisory (escalate to human review if exceeded)
+    automated: partial
+    note: Run monthly or on model upgrade
+  - gate: Prompt regression (on prompt change)
+    threshold: "Schema conformance maintained after prompt change"
+    enforcement: Block merge
+    automated: true (via contract test re-run)
+# ─────────────────────────────────────────────────────────
+# Rules
+# ─────────────────────────────────────────────────────────
+rules:
+  - id: agent-must-have-output-schema
+    trigger: defining or modifying an AI agent
+    instruction: >
+      Every agent MUST have a declared output-schema.json.
+      Agents without a schema cannot be safely composed into pipelines.
+    priority: required
+  - id: contract-test-per-agent
+    trigger: defining or modifying an AI agent
+    instruction: >
+      Every agent MUST have a contract.test.ts with at least:
+      (1) one valid fixture that passes schema validation, and
+      (2) one invalid fixture that fails schema validation.
+    priority: required
+  - id: prompt-change-triggers-contract-rerun
+    trigger: modifying agents/*/prompt.md
+    instruction: >
+      Re-run the agent's contract tests with temperature=0 after any
+      prompt change. A schema violation in the golden fixture signals regression.
+    priority: required
+  - id: model-upgrade-triggers-contract-rerun
+    trigger: upgrading the LLM model version used by an agent
+    instruction: >
+      Run all agent contract tests against the new model.
+      Compare output structure, field presence, and enum values against golden.
+    priority: required
+anti_patterns:
+  - Defining agents without output-schema.json
+  - Only validating JSON.parse (syntax) without schema (structural) validation
+  - Writing contract tests with only valid fixtures (no negative cases)
+  - Accepting model upgrades without re-running contract tests
+  - Using high temperature (> 0.3) in contract test fixtures (non-deterministic)
+quick_reference:
+  contract_test_checklist: |
+    □ agents/<name>/output-schema.json exists
+    □ agents/<name>/__tests__/contract.test.ts exists
+    □ Valid fixture: all required fields, correct types, enum values
+    □ Invalid fixtures: missing required field, wrong type, invalid enum
+    □ Schema validator: Ajv (JSON Schema) or Zod (TypeScript)
+    □ CI: contract tests run on every PR
+    □ Prompt change: re-run contract test, compare against golden

package/bundled/ai/standards/mock-boundary.ai.yaml ADDED Viewed

@@ -0,0 +1,250 @@
+# Mock Boundary Standards - AI Optimized
+# Source: core/mock-boundary.md
+id: mock-boundary
+meta:
+  version: "1.0.0"
+  updated: "2026-05-04"
+  source: core/mock-boundary.md
+  description: >
+    Rules defining what can and cannot be mocked to prevent hollow tests —
+    tests that pass while the real system is broken.
+# ─────────────────────────────────────────────────────────
+# Core Problem
+# ─────────────────────────────────────────────────────────
+core_problem:
+  name: Hollow Test Anti-Pattern
+  description: >
+    Over-mocking replaces real business logic with test doubles, making the test suite
+    a specification of mock behavior rather than system behavior.
+    The tests pass in CI while the real system silently fails.
+  real_world_example: |
+    // SPEC-002.test.ts (VibeOps) — hollow test example
+    vi.mock('../../src/runner/agent-runner.js')      // Core dependency mocked
+    vi.mock('../../src/runner/guardian-hooks.js')     // Core dependency mocked
+    vi.mock('../../src/runner/prototyper.js')         // Core dependency mocked
+    vi.mock('../../src/runner/iteration-report.js')   // Core dependency mocked
+    vi.mock('../../src/memory/memory-store.js')       // Core dependency mocked
+    vi.mock('node:fs/promises', ...)                  // I/O mocked
+    // Result: runPipeline() runs but touches ZERO real code.
+    // All 8 agent calls are faked. The test proves nothing about pipeline logic.
+# ─────────────────────────────────────────────────────────
+# Allowed Mocks
+# ─────────────────────────────────────────────────────────
+allowed:
+  - category: External HTTP Services
+    description: Third-party APIs, LLM providers, payment gateways, email services
+    reason: Prevents flaky tests from external dependencies; enables response scenario control
+    examples:
+      - OpenAI / Anthropic / Grok API
+      - Stripe / payment processors
+      - SendGrid / email providers
+      - External OAuth providers
+    implementation: Mock the HTTP client or provider factory; never mock the internal caller
+  - category: Time Functions
+    description: Date.now(), new Date(), setTimeout, setInterval
+    reason: Makes tests deterministic and enables time-travel scenarios
+    examples:
+      - "vi.useFakeTimers()"
+      - "vi.setSystemTime(new Date('2026-01-01'))"
+    note: Always restore real timers after test with vi.useRealTimers()
+  - category: Environment Variables
+    description: process.env values
+    reason: Tests need different configurations without changing system state
+    implementation: Use vi.stubEnv() or process.env assignment in beforeEach/afterEach
+  - category: File System (unit tests only)
+    description: fs.readFile, fs.writeFile, fs.stat in UNIT tests
+    reason: Avoids slow I/O in fast unit tests
+    constraint: >
+      Integration tests, flow tests, and E2E tests MUST use real filesystem
+      or in-memory FS (memfs) — never vi.mock('node:fs/promises') at those levels
+  - category: Cross-Module Boundaries (with counterpart)
+    description: Calls to OTHER modules when testing THIS module's own logic
+    reason: Isolates the unit under test from its collaborators
+    constraint: >
+      A corresponding integration test MUST exist that exercises the real interaction.
+      Mock only when the real collaborator has its own test coverage.
+    example: |
+      // Mocking the DB layer in a service unit test is OK IF:
+      // 1. The DB layer has its own integration tests
+      // 2. The service test focuses on service logic (not DB behavior)
+# ─────────────────────────────────────────────────────────
+# Forbidden Mocks
+# ─────────────────────────────────────────────────────────
+forbidden:
+  - category: Own Module Core Logic
+    description: Mocking the module's OWN functions in the file that tests it
+    example: |
+      // ❌ Testing pipeline-runner.ts but mocking pipeline-runner itself
+      vi.mock('../../src/runner/pipeline-runner.js')
+      import { runPipeline } from '../../src/runner/pipeline-runner.js'
+      // runPipeline is now a no-op stub — the test proves nothing
+    violation_indicator: >
+      The mock import path resolves to the module under test itself,
+      or the mock replaces the primary export being tested.
+    fix: Remove the mock and let the real code run; mock only its external dependencies
+  - category: Database Layer in Integration/Flow Tests
+    description: Replacing DB calls with in-memory return values in integration or flow tests
+    reason: Masks query bugs, schema constraint violations, index issues, and migration errors
+    alternative: >
+      Use in-memory SQLite (better-sqlite3 / sql.js), test containers,
+      or a dedicated test schema — a real database with controlled data
+    example: |
+      // ❌ Forbidden in integration/flow tests
+      vi.mock('../../src/db/client.js', () => ({ query: vi.fn().mockResolvedValue([]) }))
+      // ✅ Correct: use real in-memory DB
+      import Database from 'better-sqlite3'
+      const testDb = new Database(':memory:')
+  - category: Core Framework Internals
+    description: Express/Fastify routing, ORM core (Drizzle/Prisma internals), auth middleware core
+    reason: Tests pass while real routing, query building, or auth enforcement is broken
+    example: |
+      // ❌ Forbidden
+      vi.mock('express', () => ({ Router: () => ({ get: vi.fn(), post: vi.fn() }) }))
+  - category: Security Controls
+    description: Auth token validators, permission checks, rate limiters, input sanitizers
+    reason: Mocking security controls makes tests useless for security validation
+    example: |
+      // ❌ Forbidden — this test proves nothing about auth
+      vi.mock('../../src/auth/middleware.js', () => ({
+        requireAuth: (req, res, next) => next()  // always passes
+      }))
+    fix: Use a real test user with a real valid token; test with real auth logic
+# ─────────────────────────────────────────────────────────
+# Hollow Test Detection Patterns
+# ─────────────────────────────────────────────────────────
+detection_patterns:
+  hollow_test_indicators:
+    - name: Mock Count Meets or Exceeds Import Count
+      check: "vi.mock() call count >= number of non-type imports in the test file"
+      severity: high
+      action: Review all assertions; verify at least one assertion is on actual output
+    - name: Assertions Only on Mock Calls
+      check: "All expect() statements use .toHaveBeenCalled() or .toHaveBeenCalledWith()"
+      severity: high
+      action: Add assertions on actual return values and system state changes
+    - name: More Mock Setup Than Assertions
+      check: "Lines of mock setup > lines of expect() assertions"
+      severity: medium
+      action: Consider if the test is testing behavior or just mock wiring
+    - name: Self-Referential Mock
+      check: "A vi.mock() path resolves to the same module being imported as the subject under test"
+      severity: critical
+      action: Remove the self-mock immediately; it makes the test a no-op
+  ai_generation_warning: >
+    AI tools (including this assistant) tend to generate hollow tests because:
+    1. Mocking makes tests compile and pass without requiring real infrastructure;
+    2. AI cannot know the full dependency graph at generation time.
+    When reviewing AI-generated tests, always apply the hollow test indicators above.
+# ─────────────────────────────────────────────────────────
+# Anti-Patterns
+# ─────────────────────────────────────────────────────────
+anti_patterns:
+  - name: Total Mock Isolation
+    description: Every import is mocked; test verifies only mock interaction counts
+    problem: Tests pass regardless of actual logic correctness
+    symptom: Deleting the implementation file doesn't break the test
+  - name: Mock the World
+    description: External + internal + database + filesystem all mocked in one test
+    problem: Test becomes a specification of mock behavior, not system behavior
+  - name: Mock Without Integration Counterpart
+    description: Cross-module mock with no corresponding integration test
+    problem: The interaction between modules is never actually exercised
+  - name: Security Mock Bypass
+    description: Auth/permission middleware replaced with always-pass stub
+    problem: Security regression cannot be detected
+  - name: Database Mock Cascade
+    description: DB mock returns hardcoded data, hiding query logic errors
+    problem: Schema migrations, wrong predicates, missing joins — all invisible
+# ─────────────────────────────────────────────────────────
+# Rules
+# ─────────────────────────────────────────────────────────
+rules:
+  - id: no-own-module-mock
+    trigger: writing any test for a module
+    instruction: Never vi.mock() a path that resolves to the module being tested
+    priority: required
+  - id: real-db-in-flow-tests
+    trigger: writing flow test, integration test, or E2E test
+    instruction: >
+      Use a real database (in-memory SQLite, test container, or test schema).
+      Never mock the DB layer in these test levels.
+    priority: required
+  - id: mock-needs-integration-counterpart
+    trigger: adding a vi.mock() for a cross-module dependency in a unit test
+    instruction: >
+      Ensure a corresponding integration test exercises the real interaction.
+      Note the counterpart test in a comment: "// integration: see src/__tests__/integration/..."
+    priority: required
+  - id: security-no-mock
+    trigger: test involves authentication, authorization, or rate limiting
+    instruction: >
+      Never mock security controls.
+      Create a real test user with a real token; exercise the real auth logic.
+    priority: required
+  - id: hollow-test-review
+    trigger: mock count in test file equals or exceeds non-type import count
+    instruction: >
+      Apply hollow test indicators checklist before submitting.
+      At least one assertion must verify an actual output value (not a mock call).
+    priority: required
+  - id: ai-generated-test-mock-review
+    trigger: tests are AI-generated
+    instruction: >
+      AI-generated tests frequently over-mock. Apply all detection_patterns checks.
+      If any hollow test indicator triggers, rewrite the mocking strategy.
+    priority: required
+# ─────────────────────────────────────────────────────────
+# Quick Reference
+# ─────────────────────────────────────────────────────────
+quick_reference:
+  allowed_mock_summary:
+    - "✅ External HTTP/LLM/payment APIs"
+    - "✅ Time functions (Date.now, setTimeout)"
+    - "✅ Environment variables"
+    - "✅ File system — in unit tests only"
+    - "✅ Cross-module boundaries — with integration test counterpart"
+  forbidden_mock_summary:
+    - "❌ Own module's core logic (self-referential mock)"
+    - "❌ Database layer in integration/flow/E2E tests"
+    - "❌ Core framework internals (Express/Fastify routing, ORM core, auth middleware core)"
+    - "❌ Security controls (auth middleware, permission checks)"
+  checklist: |
+    Before submitting test with mocks:
+    □ No vi.mock() path matches the module under test
+    □ DB layer not mocked in integration/flow/E2E tests
+    □ Security controls not mocked
+    □ Mock count < import count (or justified with comment)
+    □ At least one assertion on actual output value (not mock call)
+    □ Integration counterpart exists for each cross-module mock