npm - specweave - Versions diffs - 0.1.9 → 0.3.1 - Mend

specweave 0.1.9 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (293) hide show

package/src/skills/role-orchestrator/test-cases/test-5-feedback-loops.yaml DELETED Viewed

@@ -1,149 +0,0 @@
----
-name: "Feedback Loops - PM Agent Auto-Refinement"
-description: "Test automatic quality improvement through iterative refinement with feedback"
-skill: role-orchestrator
-priority: P1
-estimated_time: "5-7 minutes"
-input:
-  user_request: "Create requirements for user authentication feature"
-  config:
-    feedback_loops:
-      enabled: true
-      max_retries: 3
-      stop_on_improvement: true
-      quality_threshold: 0.80
-    validation:
-      use_llm_judge: true
-      combine_with_rules: true
-      judge_weight: 0.5
-  agent: "pm-agent"
-  task: "Define requirements for user authentication"
-expected_output:
-  # Attempt 1: Initial generation (below threshold)
-  attempt_1:
-    output: "Initial requirements spec"
-    validation:
-      rule_based_score: 1.00  # All structural rules pass
-      llm_judge_score: 0.60-0.70
-      combined_score: 0.70-0.85
-      threshold: 0.80
-    issues_found:
-      - severity: "major"
-        description: "Acceptance criteria not testable"
-        location: "Success Criteria section"
-      - severity: "major"
-        description: "Security requirements too vague"
-        location: "Security section"
-      - severity: "minor"
-        description: "No rate limiting specified"
-        location: "Requirements"
-    below_threshold: true
-    triggers_refinement: true
-  # Attempt 2: Refinement with feedback
-  attempt_2:
-    feedback_provided: |
-      Issues from previous attempt:
-      • MAJOR: Acceptance criteria not testable
-        Location: Success Criteria section
-        Suggestion: Use measurable metrics (e.g., "Login completes in <2s")
-      • MAJOR: Security requirements too vague
-        Location: Security section
-        Suggestion: Specify encryption, hashing, rate limiting
-      • MINOR: No rate limiting specified
-        Location: Requirements
-        Suggestion: Add brute-force protection (5 attempts, 15min lockout)
-    output: "Improved requirements spec with feedback incorporated"
-    validation:
-      rule_based_score: 1.00
-      llm_judge_score: 0.80-0.90
-      combined_score: 0.85-0.95
-      threshold: 0.80
-    improvements:
-      - "Acceptance criteria now measurable with metrics"
-      - "Security requirements detailed (bcrypt, JWT, rate limiting)"
-      - "Rate limiting specified (5 attempts, 15min lockout)"
-    above_threshold: true
-    stops_refinement: true
-  final_result:
-    total_attempts: 2
-    final_score: 0.85-0.95
-    met_threshold: true
-    output_quality: "good"
-    remaining_issues:
-      count: 0-1
-      severity: "minor"
-      examples:
-        - "Performance requirements could be more specific (p95/p99 targets)"
-validation:
-  # Attempt 1 validation
-  - "Initial attempt produces output"
-  - "Rule-based validation passes (1.00)"
-  - "LLM judge score < 0.80"
-  - "Combined score may or may not be < 0.80 depending on weights"
-  - "At least 2 major issues found"
-  - "Triggers refinement"
-  # Attempt 2 validation
-  - "Feedback generated from issues"
-  - "PM agent receives feedback prompt"
-  - "Second attempt incorporates feedback"
-  - "LLM judge score improves (>= 0.80)"
-  - "Combined score >= 0.80"
-  - "Stops after reaching threshold"
-  # Overall validation
-  - "Total attempts <= 3"
-  - "Final score >= 0.80"
-  - "Quality improvement measurable"
-  - "Progress shown to user"
-success_criteria:
-  - "Feedback loop triggers on low quality"
-  - "Agent successfully incorporates feedback"
-  - "Quality score improves significantly"
-  - "Stops when threshold met (doesn't waste retries)"
-  - "User sees progress updates"
-  - "Final output meets quality standards"
-workflow_steps:
-  1. "PM agent generates initial requirements"
-  2. "Validate with rule-based + LLM-as-judge"
-  3. "Detect score below threshold (0.80)"
-  4. "Generate feedback from issues"
-  5. "Reinvoke PM agent with feedback"
-  6. "Validate improved output"
-  7. "Detect score above threshold"
-  8. "Stop refinement, return final output"
-edge_cases:
-  max_retries_reached:
-    description: "If quality doesn't improve after 3 attempts"
-    expected: "Return best attempt with warning"
-  stop_on_improvement:
-    description: "If score improves but doesn't reach threshold"
-    config: "stop_on_improvement: true"
-    expected: "Stop early if improvement detected"
-  first_attempt_passes:
-    description: "If initial attempt >= threshold"
-    expected: "No refinement needed, accept immediately"
----

package/src/skills/skill-creator/test-cases/test-1-placeholder.yaml DELETED Viewed

@@ -1,12 +0,0 @@
----
-name: "Placeholder Test"
-description: "TODO: Add actual test case for this skill"
-input:
-  prompt: "Test prompt"
-expected_output:
-  type: "validation"
-validation:
-  - "Placeholder validation"
-success_criteria:
-  - "Test passes"
----

package/src/skills/skill-creator/test-cases/test-2-placeholder.yaml DELETED Viewed

@@ -1,12 +0,0 @@
----
-name: "Placeholder Test"
-description: "TODO: Add actual test case for this skill"
-input:
-  prompt: "Test prompt"
-expected_output:
-  type: "validation"
-validation:
-  - "Placeholder validation"
-success_criteria:
-  - "Test passes"
----

package/src/skills/skill-creator/test-cases/test-3-placeholder.yaml DELETED Viewed

@@ -1,12 +0,0 @@
----
-name: "Placeholder Test"
-description: "TODO: Add actual test case for this skill"
-input:
-  prompt: "Test prompt"
-expected_output:
-  type: "validation"
-validation:
-  - "Placeholder validation"
-success_criteria:
-  - "Test passes"
----

package/src/skills/skill-router/test-cases/test-1-basic-routing.yaml DELETED Viewed

@@ -1,33 +0,0 @@
----
-name: "Basic Request Routing"
-description: "Tests if skill-router correctly routes user request to appropriate skill"
-input:
-  prompt: "Plan a new feature for user authentication"
-  context:
-    available_skills:
-      - "increment-planner"
-      - "spec-author"
-      - "developer"
-  user_intent: "plan feature"
-expected_output:
-  type: "skill_routing"
-  routes_to: "increment-planner"
-  confidence: "high"
-  reason: "User request contains 'plan' and 'feature' keywords"
-  alternatives: []
-  actions:
-    - "Activate increment-planner skill"
-    - "Pass user request to increment-planner"
-    - "Log routing decision"
-validation:
-  - "Routes to correct skill (increment-planner)"
-  - "Confidence level is high (>0.9)"
-  - "No ambiguity detected"
-  - "Logs routing decision"
-  - "Activates target skill"
-success_criteria:
-  - "Correct skill selected"
-  - ">90% routing accuracy"
-  - "Fast routing (<100ms)"
-  - "Logging works"
----

package/src/skills/skill-router/test-cases/test-2-ambiguous-request.yaml DELETED Viewed

@@ -1,42 +0,0 @@
----
-name: "Ambiguous Request Handling"
-description: "Tests how skill-router handles requests that could match multiple skills"
-input:
-  prompt: "Create documentation for the API"
-  context:
-    available_skills:
-      - "spec-author"      # Could create API specification
-      - "architect"        # Could document API architecture
-      - "docs-updater"     # Could update API docs
-  user_intent: "create documentation"
-expected_output:
-  type: "clarification_required"
-  routes_to: null
-  confidence: "low"
-  reason: "Multiple skills match: spec-author, architect, docs-updater"
-  alternatives:
-    - skill: "spec-author"
-      match_score: 0.65
-      reason: "Can create API specifications"
-    - skill: "architect"
-      match_score: 0.60
-      reason: "Can document API architecture"
-    - skill: "docs-updater"
-      match_score: 0.55
-      reason: "Can update API documentation"
-  actions:
-    - "Ask user for clarification"
-    - "Present alternatives with descriptions"
-    - "Wait for user selection"
-validation:
-  - "Detects ambiguity (confidence <0.7)"
-  - "Lists all matching skills"
-  - "Provides match scores"
-  - "Asks user to clarify"
-  - "Does not make incorrect assumption"
-success_criteria:
-  - "Ambiguity detected"
-  - "User presented with options"
-  - "No forced routing when uncertain"
-  - "Helpful context for each option"
----

package/src/skills/skill-router/test-cases/test-3-nested-orchestration.yaml DELETED Viewed

@@ -1,50 +0,0 @@
----
-name: "Nested Skill Orchestration"
-description: "Tests skill-router's ability to handle one skill calling another skill"
-input:
-  prompt: "Implement the user authentication feature"
-  context:
-    available_skills:
-      - "developer"
-      - "context-loader"
-      - "spec-author"
-    current_skill: "developer"
-  workflow:
-    - step: 1
-      skill: "developer"
-      action: "Needs to load context from specification"
-      calls: "context-loader"
-    - step: 2
-      skill: "context-loader"
-      action: "Load authentication spec"
-      returns_to: "developer"
-    - step: 3
-      skill: "developer"
-      action: "Implement based on loaded context"
-expected_output:
-  type: "nested_routing"
-  primary_skill: "developer"
-  sub_skills:
-    - "context-loader"
-  orchestration:
-    - "developer activates"
-    - "developer calls context-loader"
-    - "context-loader loads spec"
-    - "context returns to developer"
-    - "developer implements"
-  contains:
-    - "Orchestrating skills: developer -> context-loader"
-    - "Context loaded successfully"
-    - "Returning to developer"
-validation:
-  - "Primary skill (developer) activated correctly"
-  - "Sub-skill (context-loader) called when needed"
-  - "Context passed between skills"
-  - "Control returns to primary skill"
-  - "Workflow completes successfully"
-success_criteria:
-  - "Nested skill calls work"
-  - "Context preserved across skills"
-  - "No infinite loops"
-  - "Proper error handling in chain"
----

package/src/skills/spec-driven-brainstorming/test-cases/TC-001-simple-idea-to-design.yaml DELETED Viewed

@@ -1,148 +0,0 @@
----
-name: "Simple Idea to Validated Design"
-description: "Transform a simple rough idea into a validated design ready for increment creation"
-skill: spec-driven-brainstorming
-priority: P1
-estimated_time: "10-15 minutes"
-input:
-  prompt: "I want to add user authentication to my app"
-  context:
-    existing_project: true
-    tech_stack: "Next.js 14, TypeScript"
-    existing_files:
-      - "package.json"
-      - "src/app/page.tsx"
-      - "src/components/"
-    database: "PostgreSQL"
-expected_output:
-  phase_1_understanding:
-    questions_asked:
-      - "What's your primary authentication goal?" # user login, API keys, SSO
-      - "Who are the users?" # internal team, public, B2B customers
-      - "Any compliance requirements?" # GDPR, HIPAA, SOC2
-    gathered_requirements:
-      purpose: "Secure user login for public SaaS application"
-      constraints:
-        - "Must work with existing PostgreSQL database"
-        - "Budget-friendly (prefer open-source)"
-      success_criteria:
-        - "Users can register and login securely"
-        - "Session management across devices"
-        - "Password reset functionality"
-  phase_2_tech_stack:
-    detected_stack:
-      framework: "Next.js 14"
-      language: "TypeScript"
-      database: "PostgreSQL"
-    skills_activated:
-      - nextjs
-      - nodejs-backend
-    recommendation: "Use NextAuth.js v5 (Auth.js) - native Next.js integration"
-  phase_3_exploration:
-    approaches_proposed: 3
-    approach_1:
-      name: "NextAuth.js with Credentials Provider"
-      architecture: "Session-based with JWT"
-      trade_offs:
-        pros:
-          - "Simple setup, well-documented"
-          - "Built-in session management"
-        cons:
-          - "Basic features only"
-          - "Custom UI needed"
-      complexity: "Low"
-    approach_2:
-      name: "Clerk Integration"
-      architecture: "Third-party auth SaaS"
-      trade_offs:
-        pros:
-          - "Pre-built UI components"
-          - "Advanced features (MFA, social login)"
-        cons:
-          - "Monthly cost ($25+)"
-          - "Vendor lock-in"
-      complexity: "Very Low"
-    approach_3:
-      name: "Custom JWT + Refresh Tokens"
-      architecture: "Stateless authentication"
-      trade_offs:
-        pros:
-          - "Full control, no dependencies"
-          - "Scalable, stateless"
-        cons:
-          - "More code to maintain"
-          - "Security risks if done wrong"
-      complexity: "High"
-    user_selection: "Approach 1" # NextAuth.js
-  phase_4_design_validation:
-    sections_presented:
-      - title: "Architecture Overview"
-        content: "NextAuth.js integration with PostgreSQL adapter, session-based auth"
-        validated: true
-      - title: "Components"
-        content: "Auth API routes (/api/auth/*), session provider, protected middleware"
-        validated: true
-      - title: "Data Flow"
-        content: "Login → credentials check → session creation → JWT cookie → protected routes"
-        validated: true
-      - title: "Error Handling"
-        content: "Invalid credentials, session expiry, token refresh logic"
-        validated: true
-      - title: "Testing Strategy"
-        content: "Unit tests (auth logic), integration tests (API routes), E2E (Playwright login flow)"
-        validated: true
-    design_complete: true
-  phase_5_specweave_handoff:
-    handoff_type: "Full Increment Creation"
-    skill_invoked: "increment-planner"
-    expected_increment:
-      directory: ".specweave/increments/0001-user-authentication/"
-      files:
-        - "spec.md"
-        - "plan.md"
-        - "tasks.md"
-        - "tests.md"
-        - "context-manifest.yaml"
-    agents_invoked:
-      - "pm" # Creates strategy docs
-      - "architect" # Creates architecture docs
-      - "qa-lead" # Creates test strategy
-validation:
-  - "Phase 1: At least 3 understanding questions asked"
-  - "Phase 2: Tech stack detected from package.json"
-  - "Phase 3: Exactly 3 architectural approaches proposed"
-  - "Phase 4: Design presented in <300 word sections"
-  - "Phase 5: increment-planner skill invoked"
-  - "All phases completed in sequence"
-  - "AskUserQuestion tool used for structured choices"
-  - "Open-ended questions used for validation"
-expected_errors: []
-success_criteria:
-  - "Validated design produced"
-  - "Ready for increment creation"
-  - "User confirmed design alignment"
-  - "No skipped phases (unless explicitly revisited)"
----

package/src/skills/spec-driven-brainstorming/test-cases/TC-002-complex-ultrathink-design.yaml DELETED Viewed

@@ -1,190 +0,0 @@
----
-name: "Complex Problem with Ultrathink Mode"
-description: "Handle a complex distributed systems problem using ultrathink for deep reasoning"
-skill: spec-driven-brainstorming
-priority: P2
-estimated_time: "20-30 minutes"
-input:
-  prompt: "I need a distributed task queue with exactly-once delivery guarantees for processing financial transactions"
-  context:
-    existing_project: false
-    complexity: "High"
-    domain: "Financial systems"
-    requirements:
-      - "Exactly-once delivery semantics"
-      - "High throughput (10K+ tasks/sec)"
-      - "Audit trail for compliance"
-      - "Fault tolerance and disaster recovery"
-expected_output:
-  phase_1_understanding:
-    questions_asked:
-      - "What types of financial transactions?" # payments, transfers, reconciliation
-      - "What's your scale and peak load?" # concurrent tasks, daily volume
-      - "Compliance requirements?" # SOC2, PCI-DSS, audit logs
-      - "Acceptable latency?" # real-time vs batch processing
-      - "Budget constraints?" # cloud costs, team size
-    gathered_requirements:
-      purpose: "Process payment transactions with guaranteed exactly-once delivery"
-      constraints:
-        - "Must maintain audit trail for 7 years"
-        - "Compliance: PCI-DSS, SOC2"
-        - "Peak load: 10K tasks/sec"
-        - "Latency: <500ms p99"
-      success_criteria:
-        - "Zero duplicate transactions"
-        - "99.99% uptime SLA"
-        - "Complete audit trail"
-  phase_2_tech_stack:
-    detected_stack: null # greenfield
-    stack_question_asked: true
-    options_presented:
-      - "Go + PostgreSQL" # strong concurrency, SQL for audit
-      - "Node.js + Redis + PostgreSQL" # JavaScript ecosystem, Redis for queue
-      - "Python + Celery + RabbitMQ + PostgreSQL" # mature queue system
-    user_selection: "Go + PostgreSQL"
-  phase_3_exploration:
-    ultrathink_suggested: true
-    ultrathink_reason: "Complex distributed systems problem with many trade-offs around consistency, fault tolerance, and exactly-once semantics"
-    thinking_tokens_used: 31999 # full ultrathink budget
-    approaches_proposed: 3
-    approach_1:
-      name: "Event Sourcing with Outbox Pattern"
-      architecture: "Write events to DB, background processor publishes to queue"
-      trade_offs:
-        pros:
-          - "Exactly-once delivery via DB transactions"
-          - "Complete audit trail built-in"
-          - "Easy to replay events"
-        cons:
-          - "Higher latency (dual writes)"
-          - "Complex event schema evolution"
-        consistency: "Strong (ACID transactions)"
-        complexity: "High"
-        cost: "Medium (DB I/O intensive)"
-    approach_2:
-      name: "Idempotent Task Queue with Distributed Locks"
-      architecture: "Redis queue + PostgreSQL state tracking + distributed locks"
-      trade_offs:
-        pros:
-          - "Lower latency (in-memory queue)"
-          - "Horizontal scaling easier"
-        cons:
-          - "Requires careful lock management"
-          - "Risk of phantom duplicate if lock expires"
-        consistency: "Eventual (with idempotency keys)"
-        complexity: "Very High"
-        cost: "High (Redis cluster + PostgreSQL)"
-    approach_3:
-      name: "Transactional Outbox with Change Data Capture (CDC)"
-      architecture: "Write to DB, CDC streams changes to queue"
-      trade_offs:
-        pros:
-          - "Best of both worlds: ACID + async processing"
-          - "At-least-once delivery, idempotent consumers for exactly-once effect"
-          - "Built-in audit trail"
-        cons:
-          - "Requires CDC infrastructure (Debezium, Kafka Connect)"
-          - "Operational complexity"
-        consistency: "Strong write, eventual propagation"
-        complexity: "High (infrastructure)"
-        cost: "High (Kafka/CDC infrastructure)"
-    user_selection: "Approach 1" # Event Sourcing with Outbox
-    edge_cases_analyzed:
-      - "Database connection failure during transaction"
-      - "Task processor crashes mid-processing"
-      - "Network partition between queue and DB"
-      - "Clock skew in distributed environment"
-      - "Idempotency key collision"
-  phase_4_design_validation:
-    sections_presented:
-      - title: "Architecture Overview (Ultrathink Analysis)"
-        content: "Event sourcing + outbox pattern ensures exactly-once via DB transactions. Analyzed 5 edge cases for consistency guarantees."
-        validated: true
-      - title: "Components (with Fault Tolerance)"
-        content: "Event store (PostgreSQL), outbox processor (Go workers), task queue (Redis Streams), dead letter queue for failures"
-        validated: true
-      - title: "Data Flow (Transactional Guarantees)"
-        content: "Transaction starts → Write event to event_store → Write to outbox → Commit (ACID) → Background worker polls outbox → Publishes to queue → Marks processed"
-        validated: true
-      - title: "Exactly-Once Semantics (Implementation)"
-        content: "Idempotency keys (UUID v4), transaction boundaries, at-least-once delivery + idempotent consumers = exactly-once effect"
-        validated: true
-      - title: "Error Handling & Recovery"
-        content: "Retry with exponential backoff, dead letter queue after 3 retries, circuit breaker, health checks"
-        validated: true
-      - title: "Audit Trail & Compliance"
-        content: "Event sourcing provides complete history, immutable event log, 7-year retention in cold storage (S3 Glacier)"
-        validated: true
-      - title: "Testing Strategy (Chaos Engineering)"
-        content: "Unit tests (logic), integration tests (DB transactions), E2E tests (full flow), chaos tests (network failures, DB crashes)"
-        validated: true
-      - title: "Performance & Scale Analysis"
-        content: "Throughput: 10K tasks/sec achievable with 5 workers, latency: p99 <500ms, bottleneck: DB writes (optimize with batching)"
-        validated: true
-    design_complete: true
-    complexity_justified: true
-  phase_5_specweave_handoff:
-    handoff_type: "Full Increment Creation with Architecture Focus"
-    skill_invoked: "increment-planner"
-    expected_increment:
-      directory: ".specweave/increments/0001-distributed-task-queue/"
-      files:
-        - "spec.md" # references strategy/distributed-systems/
-        - "plan.md" # references architecture/distributed-systems/
-        - "tasks.md"
-        - "tests.md" # includes chaos engineering tests
-        - "context-manifest.yaml"
-    agents_invoked:
-      - "pm" # Financial transaction requirements
-      - "architect" # Distributed systems design + ADRs
-      - "security" # PCI-DSS, audit compliance
-      - "qa-lead" # Chaos engineering test strategy
-validation:
-  - "Phase 1: Domain-specific questions asked (financial, compliance)"
-  - "Phase 2: Tech stack selection with 3 options"
-  - "Phase 3: Ultrathink mode suggested and used"
-  - "Phase 3: Edge cases explicitly analyzed"
-  - "Phase 3: Consistency models compared (strong vs eventual)"
-  - "Phase 4: Design sections include fault tolerance and recovery"
-  - "Phase 4: Performance analysis with specific numbers"
-  - "Phase 5: Security agent invoked due to compliance needs"
-  - "Exactly-once semantics clearly explained"
-expected_errors: []
-success_criteria:
-  - "Ultrathink mode activated for complex reasoning"
-  - "Edge cases thoroughly analyzed"
-  - "Consistency guarantees proven"
-  - "Compliance requirements addressed"
-  - "Performance targets validated"
-  - "Ready for production-grade implementation"
----