shipwright-cli 3.1.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/code-reviewer.md +2 -0
- package/.claude/agents/devops-engineer.md +2 -0
- package/.claude/agents/doc-fleet-agent.md +2 -0
- package/.claude/agents/pipeline-agent.md +2 -0
- package/.claude/agents/shell-script-specialist.md +2 -0
- package/.claude/agents/test-specialist.md +2 -0
- package/.claude/hooks/agent-crash-capture.sh +32 -0
- package/.claude/hooks/post-tool-use.sh +3 -2
- package/.claude/hooks/pre-tool-use.sh +35 -3
- package/README.md +22 -8
- package/claude-code/hooks/config-change.sh +18 -0
- package/claude-code/hooks/instructions-reloaded.sh +7 -0
- package/claude-code/hooks/worktree-create.sh +25 -0
- package/claude-code/hooks/worktree-remove.sh +20 -0
- package/config/code-constitution.json +130 -0
- package/config/defaults.json +25 -2
- package/config/policy.json +1 -1
- package/dashboard/middleware/auth.ts +134 -0
- package/dashboard/middleware/constants.ts +21 -0
- package/dashboard/public/index.html +8 -6
- package/dashboard/public/styles.css +176 -97
- package/dashboard/routes/auth.ts +38 -0
- package/dashboard/server.ts +117 -25
- package/dashboard/services/config.ts +26 -0
- package/dashboard/services/db.ts +118 -0
- package/dashboard/src/canvas/pixel-agent.ts +298 -0
- package/dashboard/src/canvas/pixel-sprites.ts +440 -0
- package/dashboard/src/canvas/shipyard-effects.ts +367 -0
- package/dashboard/src/canvas/shipyard-scene.ts +616 -0
- package/dashboard/src/canvas/submarine-layout.ts +267 -0
- package/dashboard/src/components/header.ts +8 -7
- package/dashboard/src/core/api.ts +5 -0
- package/dashboard/src/core/router.ts +1 -0
- package/dashboard/src/design/submarine-theme.ts +253 -0
- package/dashboard/src/main.ts +2 -0
- package/dashboard/src/types/api.ts +12 -1
- package/dashboard/src/views/activity.ts +2 -1
- package/dashboard/src/views/metrics.ts +69 -1
- package/dashboard/src/views/shipyard.ts +39 -0
- package/dashboard/types/index.ts +166 -0
- package/docs/plans/2026-02-28-compound-audit-and-shipyard-design.md +186 -0
- package/docs/plans/2026-02-28-skipper-shipwright-implementation-plan.md +1182 -0
- package/docs/plans/2026-02-28-skipper-shipwright-integration-design.md +531 -0
- package/docs/plans/2026-03-01-ai-powered-skill-injection-design.md +298 -0
- package/docs/plans/2026-03-01-ai-powered-skill-injection-plan.md +1109 -0
- package/docs/plans/2026-03-01-capabilities-cleanup-plan.md +658 -0
- package/docs/plans/2026-03-01-clean-architecture-plan.md +924 -0
- package/docs/plans/2026-03-01-compound-audit-cascade-design.md +191 -0
- package/docs/plans/2026-03-01-compound-audit-cascade-plan.md +921 -0
- package/docs/plans/2026-03-01-deep-integration-plan.md +851 -0
- package/docs/plans/2026-03-01-pipeline-audit-trail-design.md +145 -0
- package/docs/plans/2026-03-01-pipeline-audit-trail-plan.md +770 -0
- package/docs/plans/2026-03-01-refined-depths-brand-design.md +382 -0
- package/docs/plans/2026-03-01-refined-depths-implementation.md +599 -0
- package/docs/plans/2026-03-01-skipper-kernel-integration-design.md +203 -0
- package/docs/plans/2026-03-01-unified-platform-design.md +272 -0
- package/docs/plans/2026-03-07-claude-code-feature-integration-design.md +189 -0
- package/docs/plans/2026-03-07-claude-code-feature-integration-plan.md +1165 -0
- package/docs/research/BACKLOG_QUICK_REFERENCE.md +352 -0
- package/docs/research/CUTTING_EDGE_RESEARCH_2026.md +546 -0
- package/docs/research/RESEARCH_INDEX.md +439 -0
- package/docs/research/RESEARCH_SOURCES.md +440 -0
- package/docs/research/RESEARCH_SUMMARY.txt +275 -0
- package/docs/superpowers/specs/2026-03-10-pipeline-quality-revolution-design.md +341 -0
- package/package.json +2 -2
- package/scripts/lib/adaptive-model.sh +427 -0
- package/scripts/lib/adaptive-timeout.sh +316 -0
- package/scripts/lib/audit-trail.sh +309 -0
- package/scripts/lib/auto-recovery.sh +471 -0
- package/scripts/lib/bandit-selector.sh +431 -0
- package/scripts/lib/bootstrap.sh +104 -2
- package/scripts/lib/causal-graph.sh +455 -0
- package/scripts/lib/compat.sh +126 -0
- package/scripts/lib/compound-audit.sh +337 -0
- package/scripts/lib/constitutional.sh +454 -0
- package/scripts/lib/context-budget.sh +359 -0
- package/scripts/lib/convergence.sh +594 -0
- package/scripts/lib/cost-optimizer.sh +634 -0
- package/scripts/lib/daemon-adaptive.sh +14 -2
- package/scripts/lib/daemon-dispatch.sh +106 -17
- package/scripts/lib/daemon-failure.sh +34 -4
- package/scripts/lib/daemon-patrol.sh +25 -4
- package/scripts/lib/daemon-poll-github.sh +361 -0
- package/scripts/lib/daemon-poll-health.sh +299 -0
- package/scripts/lib/daemon-poll.sh +27 -611
- package/scripts/lib/daemon-state.sh +119 -66
- package/scripts/lib/daemon-triage.sh +10 -0
- package/scripts/lib/dod-scorecard.sh +442 -0
- package/scripts/lib/error-actionability.sh +300 -0
- package/scripts/lib/formal-spec.sh +461 -0
- package/scripts/lib/helpers.sh +180 -5
- package/scripts/lib/intent-analysis.sh +409 -0
- package/scripts/lib/loop-convergence.sh +350 -0
- package/scripts/lib/loop-iteration.sh +682 -0
- package/scripts/lib/loop-progress.sh +48 -0
- package/scripts/lib/loop-restart.sh +185 -0
- package/scripts/lib/memory-effectiveness.sh +506 -0
- package/scripts/lib/mutation-executor.sh +352 -0
- package/scripts/lib/outcome-feedback.sh +521 -0
- package/scripts/lib/pipeline-cli.sh +336 -0
- package/scripts/lib/pipeline-commands.sh +1216 -0
- package/scripts/lib/pipeline-detection.sh +101 -3
- package/scripts/lib/pipeline-execution.sh +897 -0
- package/scripts/lib/pipeline-github.sh +28 -3
- package/scripts/lib/pipeline-intelligence-compound.sh +431 -0
- package/scripts/lib/pipeline-intelligence-scoring.sh +407 -0
- package/scripts/lib/pipeline-intelligence-skip.sh +181 -0
- package/scripts/lib/pipeline-intelligence.sh +104 -1138
- package/scripts/lib/pipeline-quality-bash-compat.sh +182 -0
- package/scripts/lib/pipeline-quality-checks.sh +17 -711
- package/scripts/lib/pipeline-quality-gates.sh +563 -0
- package/scripts/lib/pipeline-stages-build.sh +730 -0
- package/scripts/lib/pipeline-stages-delivery.sh +965 -0
- package/scripts/lib/pipeline-stages-intake.sh +1133 -0
- package/scripts/lib/pipeline-stages-monitor.sh +407 -0
- package/scripts/lib/pipeline-stages-review.sh +1022 -0
- package/scripts/lib/pipeline-stages.sh +161 -2901
- package/scripts/lib/pipeline-state.sh +36 -5
- package/scripts/lib/pipeline-util.sh +487 -0
- package/scripts/lib/policy-learner.sh +438 -0
- package/scripts/lib/process-reward.sh +493 -0
- package/scripts/lib/project-detect.sh +649 -0
- package/scripts/lib/quality-profile.sh +334 -0
- package/scripts/lib/recruit-commands.sh +885 -0
- package/scripts/lib/recruit-learning.sh +739 -0
- package/scripts/lib/recruit-roles.sh +648 -0
- package/scripts/lib/reward-aggregator.sh +458 -0
- package/scripts/lib/rl-optimizer.sh +362 -0
- package/scripts/lib/root-cause.sh +427 -0
- package/scripts/lib/scope-enforcement.sh +445 -0
- package/scripts/lib/session-restart.sh +493 -0
- package/scripts/lib/skill-memory.sh +300 -0
- package/scripts/lib/skill-registry.sh +775 -0
- package/scripts/lib/spec-driven.sh +476 -0
- package/scripts/lib/test-helpers.sh +18 -7
- package/scripts/lib/test-holdout.sh +429 -0
- package/scripts/lib/test-optimizer.sh +511 -0
- package/scripts/shipwright-file-suggest.sh +45 -0
- package/scripts/skills/adversarial-quality.md +61 -0
- package/scripts/skills/api-design.md +44 -0
- package/scripts/skills/architecture-design.md +50 -0
- package/scripts/skills/brainstorming.md +43 -0
- package/scripts/skills/data-pipeline.md +44 -0
- package/scripts/skills/deploy-safety.md +64 -0
- package/scripts/skills/documentation.md +38 -0
- package/scripts/skills/frontend-design.md +45 -0
- package/scripts/skills/generated/.gitkeep +0 -0
- package/scripts/skills/generated/_refinements/.gitkeep +0 -0
- package/scripts/skills/generated/_refinements/adversarial-quality.patch.md +3 -0
- package/scripts/skills/generated/_refinements/architecture-design.patch.md +3 -0
- package/scripts/skills/generated/_refinements/brainstorming.patch.md +3 -0
- package/scripts/skills/generated/cli-version-management.md +29 -0
- package/scripts/skills/generated/collection-system-validation.md +99 -0
- package/scripts/skills/generated/large-scale-c-refactoring-coordination.md +97 -0
- package/scripts/skills/generated/pattern-matching-similarity-scoring.md +195 -0
- package/scripts/skills/generated/test-parallelization-detection.md +65 -0
- package/scripts/skills/observability.md +79 -0
- package/scripts/skills/performance.md +48 -0
- package/scripts/skills/pr-quality.md +49 -0
- package/scripts/skills/product-thinking.md +43 -0
- package/scripts/skills/security-audit.md +49 -0
- package/scripts/skills/systematic-debugging.md +40 -0
- package/scripts/skills/testing-strategy.md +47 -0
- package/scripts/skills/two-stage-review.md +52 -0
- package/scripts/skills/validation-thoroughness.md +55 -0
- package/scripts/sw +9 -3
- package/scripts/sw-activity.sh +9 -8
- package/scripts/sw-adaptive.sh +8 -7
- package/scripts/sw-adversarial.sh +2 -1
- package/scripts/sw-architecture-enforcer.sh +3 -1
- package/scripts/sw-auth.sh +12 -2
- package/scripts/sw-autonomous.sh +5 -1
- package/scripts/sw-changelog.sh +4 -1
- package/scripts/sw-checkpoint.sh +2 -1
- package/scripts/sw-ci.sh +15 -6
- package/scripts/sw-cleanup.sh +4 -26
- package/scripts/sw-code-review.sh +45 -20
- package/scripts/sw-connect.sh +2 -1
- package/scripts/sw-context.sh +2 -1
- package/scripts/sw-cost.sh +107 -5
- package/scripts/sw-daemon.sh +71 -11
- package/scripts/sw-dashboard.sh +3 -1
- package/scripts/sw-db.sh +71 -20
- package/scripts/sw-decide.sh +8 -2
- package/scripts/sw-decompose.sh +360 -17
- package/scripts/sw-deps.sh +4 -1
- package/scripts/sw-developer-simulation.sh +4 -1
- package/scripts/sw-discovery.sh +378 -5
- package/scripts/sw-doc-fleet.sh +4 -1
- package/scripts/sw-docs-agent.sh +3 -1
- package/scripts/sw-docs.sh +2 -1
- package/scripts/sw-doctor.sh +453 -2
- package/scripts/sw-dora.sh +4 -1
- package/scripts/sw-durable.sh +12 -7
- package/scripts/sw-e2e-orchestrator.sh +17 -16
- package/scripts/sw-eventbus.sh +13 -4
- package/scripts/sw-evidence.sh +364 -12
- package/scripts/sw-feedback.sh +550 -9
- package/scripts/sw-fix.sh +20 -1
- package/scripts/sw-fleet-discover.sh +6 -2
- package/scripts/sw-fleet-viz.sh +9 -4
- package/scripts/sw-fleet.sh +5 -1
- package/scripts/sw-github-app.sh +18 -4
- package/scripts/sw-github-checks.sh +3 -2
- package/scripts/sw-github-deploy.sh +3 -2
- package/scripts/sw-github-graphql.sh +18 -7
- package/scripts/sw-guild.sh +5 -1
- package/scripts/sw-heartbeat.sh +5 -30
- package/scripts/sw-hello.sh +67 -0
- package/scripts/sw-hygiene.sh +10 -3
- package/scripts/sw-incident.sh +273 -5
- package/scripts/sw-init.sh +18 -2
- package/scripts/sw-instrument.sh +10 -2
- package/scripts/sw-intelligence.sh +44 -7
- package/scripts/sw-jira.sh +5 -1
- package/scripts/sw-launchd.sh +2 -1
- package/scripts/sw-linear.sh +4 -1
- package/scripts/sw-logs.sh +4 -1
- package/scripts/sw-loop.sh +436 -1076
- package/scripts/sw-memory.sh +357 -3
- package/scripts/sw-mission-control.sh +6 -1
- package/scripts/sw-model-router.sh +483 -27
- package/scripts/sw-otel.sh +15 -4
- package/scripts/sw-oversight.sh +14 -5
- package/scripts/sw-patrol-meta.sh +334 -0
- package/scripts/sw-pipeline-composer.sh +7 -1
- package/scripts/sw-pipeline-vitals.sh +12 -6
- package/scripts/sw-pipeline.sh +54 -2653
- package/scripts/sw-pm.sh +16 -8
- package/scripts/sw-pr-lifecycle.sh +2 -1
- package/scripts/sw-predictive.sh +17 -5
- package/scripts/sw-prep.sh +185 -2
- package/scripts/sw-ps.sh +5 -25
- package/scripts/sw-public-dashboard.sh +17 -4
- package/scripts/sw-quality.sh +14 -6
- package/scripts/sw-reaper.sh +8 -25
- package/scripts/sw-recruit.sh +156 -2303
- package/scripts/sw-regression.sh +19 -12
- package/scripts/sw-release-manager.sh +3 -1
- package/scripts/sw-release.sh +4 -1
- package/scripts/sw-remote.sh +3 -1
- package/scripts/sw-replay.sh +7 -1
- package/scripts/sw-retro.sh +158 -1
- package/scripts/sw-review-rerun.sh +3 -1
- package/scripts/sw-scale.sh +14 -5
- package/scripts/sw-security-audit.sh +6 -1
- package/scripts/sw-self-optimize.sh +173 -6
- package/scripts/sw-session.sh +9 -3
- package/scripts/sw-setup.sh +3 -1
- package/scripts/sw-stall-detector.sh +406 -0
- package/scripts/sw-standup.sh +15 -7
- package/scripts/sw-status.sh +3 -1
- package/scripts/sw-strategic.sh +14 -6
- package/scripts/sw-stream.sh +13 -4
- package/scripts/sw-swarm.sh +20 -7
- package/scripts/sw-team-stages.sh +13 -6
- package/scripts/sw-templates.sh +7 -31
- package/scripts/sw-testgen.sh +17 -6
- package/scripts/sw-tmux-pipeline.sh +4 -1
- package/scripts/sw-tmux-role-color.sh +2 -0
- package/scripts/sw-tmux-status.sh +1 -1
- package/scripts/sw-tmux.sh +37 -1
- package/scripts/sw-trace.sh +3 -1
- package/scripts/sw-tracker-github.sh +3 -0
- package/scripts/sw-tracker-jira.sh +3 -0
- package/scripts/sw-tracker-linear.sh +3 -0
- package/scripts/sw-tracker.sh +3 -1
- package/scripts/sw-triage.sh +3 -2
- package/scripts/sw-upgrade.sh +3 -1
- package/scripts/sw-ux.sh +5 -2
- package/scripts/sw-webhook.sh +5 -2
- package/scripts/sw-widgets.sh +9 -4
- package/scripts/sw-worktree.sh +15 -3
- package/scripts/test-skill-injection.sh +1233 -0
- package/templates/pipelines/autonomous.json +27 -3
- package/templates/pipelines/cost-aware.json +34 -8
- package/templates/pipelines/deployed.json +12 -0
- package/templates/pipelines/enterprise.json +12 -0
- package/templates/pipelines/fast.json +6 -0
- package/templates/pipelines/full.json +27 -3
- package/templates/pipelines/hotfix.json +6 -0
- package/templates/pipelines/standard.json +12 -0
- package/templates/pipelines/tdd.json +12 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
================================================================================
|
|
2
|
+
AUTONOMOUS CODING SYSTEMS: CUTTING EDGE RESEARCH SUMMARY (APRIL 2026)
|
|
3
|
+
================================================================================
|
|
4
|
+
|
|
5
|
+
RESEARCH CONDUCTED:
|
|
6
|
+
- 50+ web searches across 8 major research areas
|
|
7
|
+
- 25+ arXiv papers and conference proceedings (NeurIPS 2024, POPL 2026, ICLR 2026)
|
|
8
|
+
- 15+ GitHub repositories (SWE-agent, DeepSeek-R1, Claude, Aider)
|
|
9
|
+
- 10+ industry reports (BCG Platinion, Anthropic, Meta, Google, OpenAI)
|
|
10
|
+
- 5+ competitive benchmarks (SWE-bench, Codeforces, AIME)
|
|
11
|
+
|
|
12
|
+
RESEARCH AREAS COVERED:
|
|
13
|
+
1. Autonomous Loop Patterns & Convergence Detection
|
|
14
|
+
2. Dark Factory / Lights-Out Delivery
|
|
15
|
+
3. Reinforcement Learning for Code Generation
|
|
16
|
+
4. Long-Context Agent Memory & Episodic Traces
|
|
17
|
+
5. Formal Verification & Specification-Driven Pipeline
|
|
18
|
+
6. Test Generation with Mutation Testing
|
|
19
|
+
7. Cost-Optimized Model Routing & Cascading
|
|
20
|
+
8. Self-Healing CI/CD & AIOps
|
|
21
|
+
9. Multi-Agent Orchestration & Coordination
|
|
22
|
+
10. Reasoning-First Code Generation (Extended/Adaptive Thinking)
|
|
23
|
+
|
|
24
|
+
================================================================================
|
|
25
|
+
KEY FINDINGS
|
|
26
|
+
================================================================================
|
|
27
|
+
|
|
28
|
+
DARK FACTORY ERA (SOTA):
|
|
29
|
+
- BCG Platinion: 3-5 engineers running fully autonomous factories
|
|
30
|
+
- Spotify: 650+ AI-generated PRs/month, 90% faster migrations
|
|
31
|
+
- OpenAI: 1M-line product in 5 months with 3 engineers, no manual code
|
|
32
|
+
- Two critical disciplines: (1) Harness Engineering, (2) Intent Thinking
|
|
33
|
+
|
|
34
|
+
REASONING MODELS (FRONTIER):
|
|
35
|
+
- OpenAI o1-pro: 200K context, 100K output tokens, $150/$600 per 1M input/output tokens
|
|
36
|
+
• 86% on AIME (vs 78% o1), 89th percentile Codeforces
|
|
37
|
+
- DeepSeek-R1: Pure RL, 2,029 Codeforces Elo (Candidate Master)
|
|
38
|
+
• 671B total parameters, ~37B active per token via Mixture of Experts
|
|
39
|
+
- Claude Opus 4.6: Adaptive thinking (replaces extended thinking)
|
|
40
|
+
• Dynamically allocates reasoning budget by task difficulty
|
|
41
|
+
|
|
42
|
+
MULTI-AGENT STATE (2026):
|
|
43
|
+
- 40% of enterprise apps are forecast to embed agentic AI by end of 2026 (up from <5% in 2025)
|
|
44
|
+
- Standard 3-role pattern: Planner, Worker, Judge
|
|
45
|
+
- Git worktrees → standard isolation mechanism
|
|
46
|
+
- Google DORA 2025: 20-30% faster workflows, but 9% bug rate climb
|
|
47
|
+
|
|
48
|
+
REINFORCEMENT LEARNING ADVANCES:
|
|
49
|
+
- Meta ACH: 9,095 mutants + 571 tests on 10,795 Android classes
|
|
50
|
+
- FunPRM: Functions as PRM steps → 15-20% better completion
|
|
51
|
+
- SecCoderX: Vulnerability reward model + secure code RL
|
|
52
|
+
- Policy learning is converging on a PPO-style pipeline: preference data → reward model → policy optimization
|
|
53
|
+
|
|
54
|
+
MEMORY SYSTEMS (2026):
|
|
55
|
+
- Mem0: Mature hybrid storage (Postgres episodic + semantic)
|
|
56
|
+
- EM-LLM: Bayesian surprise + graph refinement for event segmentation
|
|
57
|
+
- Active compression: Consolidate 10 episodes → semantic facts
|
|
58
|
+
- Multi-layer: Episodic (events), Semantic (facts), Working (context)
|
|
59
|
+
|
|
60
|
+
COST OPTIMIZATION:
|
|
61
|
+
- Google Speculative Cascades: 30-60% cost reduction via cascade routing
|
|
62
|
+
- Open-source cascading: 92% cost savings on benchmarks
|
|
63
|
+
- Unified routing + cascading: Theoretically optimal framework
|
|
64
|
+
- Haiku-first, escalate to Sonnet → Opus only on failure
|
|
65
|
+
|
|
66
|
+
SELF-HEALING CI/CD:
|
|
67
|
+
- Agentic SRE: Telemetry → reasoning → controlled automation (closed loop)
|
|
68
|
+
- 60% enterprise adoption of self-healing infrastructure (Gartner 2026)
|
|
69
|
+
- Pipeline Doctor pattern: Repair agent reads logs, commits fixes
|
|
70
|
+
- 67% MTTR drop with AIOps; 40-60% in high-performing orgs
|
|
71
|
+
|
|
72
|
+
BENCHMARKS & COMPETITIVE POSITION:
|
|
73
|
+
- SWE-bench Verified: DEPRECATED (training contamination discovered)
|
|
74
|
+
- SWE-bench Pro: 1,865 tasks across 41 repos (NEW STANDARD)
|
|
75
|
+
- Claude Code: 80.9% on SWE-bench (highest reported)
|
|
76
|
+
- Aider: 49.2% on SWE-bench Verified, 4.2x fewer tokens than Claude Code
|
|
77
|
+
- Cline: 500K+ downloads; VS Code integration; multi-model support
|
|
78
|
+
|
|
79
|
+
================================================================================
|
|
80
|
+
SHIPWRIGHT COMPETITIVE ANALYSIS
|
|
81
|
+
================================================================================
|
|
82
|
+
|
|
83
|
+
SHIPWRIGHT STRENGTHS (DIFFERENTIATED):
|
|
84
|
+
✓ RL Architecture (multi-signal rewards, bandit selection, policy learning)
|
|
85
|
+
✓ 12-Stage Pipeline with quality gates + evidence capture
|
|
86
|
+
✓ Multi-Agent Fleet (5+ specialized agents, worktree isolation)
|
|
87
|
+
✓ Cost Intelligence (budget tracking, model routing, DORA metrics)
|
|
88
|
+
✓ Memory System (cross-session learning, failure patterns)
|
|
89
|
+
✓ CI Integration (GitHub Actions, Checks API, Deployments API)
|
|
90
|
+
✓ Daemon + Auto-Scaling (worker pool, load balancing)
|
|
91
|
+
✓ 121+ Test Suites (80% script coverage)
|
|
92
|
+
|
|
93
|
+
SHIPWRIGHT GAPS (vs SOTA):
|
|
94
|
+
✗ Loop convergence detection (heuristic, not formal regime analysis)
|
|
95
|
+
✗ Intent Specification Engine (no intent → outcome transformation)
|
|
96
|
+
✗ Vulnerability-aware RL (security signals not in reward model)
|
|
97
|
+
✗ Episodic memory (pattern-based, not execution-trace-based)
|
|
98
|
+
✗ Formal verification (tests only, no Dafny/Lean integration)
|
|
99
|
+
✗ Mutation testing feedback (no mutant generation/killing loops)
|
|
100
|
+
✗ Speculative cascading (fixed model routing, no escalation)
|
|
101
|
+
✗ CI repair agent (no automated flaky test fixes)
|
|
102
|
+
✗ Explicit conflict resolution (no file-level locks, DAG scheduling)
|
|
103
|
+
✗ Active memory compression (unbounded context growth)
|
|
104
|
+
|
|
105
|
+
POSITIONING:
|
|
106
|
+
- SWE-agent: Single-agent, custom ACI, best repository navigation
|
|
107
|
+
- Claude Code: Highest SWE-bench score (80.9%), but single-threaded
|
|
108
|
+
- Aider: Most cost-efficient (4.2x fewer tokens), git-native
|
|
109
|
+
- GitHub Copilot Agent: Closing dark factory gap via Project Padawan
|
|
110
|
+
- Shipwright: Unique as PLATFORM for multi-agent factories + RL optimization
|
|
111
|
+
|
|
112
|
+
================================================================================
|
|
113
|
+
20-ITEM BACKLOG: RANKED BY IMPACT/EFFORT RATIO
|
|
114
|
+
================================================================================
|
|
115
|
+
|
|
116
|
+
TIER 1 (EXCEPTIONAL ROI - IMPLEMENT IMMEDIATELY):
|
|
117
|
+
|
|
118
|
+
#1 [Medium effort] Semantic trajectory analysis + convergence detection
|
|
119
|
+
→ 25-40% iteration waste reduction; early exit on stuck loops
|
|
120
|
+
Impact: Foundational for cost optimization
|
|
121
|
+
|
|
122
|
+
#2 [High effort] Intent Specification Engine (business → outcomes)
|
|
123
|
+
→ 40-60% design time reduction; enables 3-5 person factories
|
|
124
|
+
Impact: Strategic, SOTA dark factory capability
|
|
125
|
+
|
|
126
|
+
#3 [Medium effort] Vulnerability Reward Model + online RL hardening
|
|
127
|
+
→ 30-40% security issue reduction; compliance-ready
|
|
128
|
+
Impact: Security-hardened autonomous pipelines
|
|
129
|
+
|
|
130
|
+
#5 [Medium effort] Speculative Cascade Model Routing (Haiku → Sonnet → Opus)
|
|
131
|
+
→ 40-60% cost reduction on median tasks; same quality on hard
|
|
132
|
+
Impact: Immediate cost leverage; proven by Google
|
|
133
|
+
|
|
134
|
+
TIER 2 (HIGH ROI - NEXT PHASE):
|
|
135
|
+
|
|
136
|
+
#4 [High effort] Episodic Memory Layer (execution traces + case-based reasoning)
|
|
137
|
+
→ 20-35% faster solutions via episode analogy
|
|
138
|
+
Impact: Unlocks long-horizon learning; self-improvement loop
|
|
139
|
+
|
|
140
|
+
#6 [Medium effort] Mutation Testing Feedback Loop (validate test quality)
|
|
141
|
+
→ 30-40% better test effectiveness; catches subtle bugs
|
|
142
|
+
Impact: Quality improvement via mutation score feedback
|
|
143
|
+
|
|
144
|
+
#7 [High effort] CI Repair Agent (automatic fix for flaky tests, timeouts)
|
|
145
|
+
→ 50% fewer retries; faster merge times
|
|
146
|
+
Impact: Self-healing + resilience
|
|
147
|
+
|
|
148
|
+
#8 [Medium effort] LLM-as-a-Judge validation stage (secondary reviewer)
|
|
149
|
+
→ 10-15% fewer merge regressions
|
|
150
|
+
Impact: Quality gate beyond rule-based checks
|
|
151
|
+
|
|
152
|
+
TIER 3 (MEDIUM ROI - LONGER TERM):
|
|
153
|
+
|
|
154
|
+
#9 [Medium effort] Explicit File Conflict Detection + DAG Scheduling
|
|
155
|
+
→ Prevents merge failures; enables parallelism
|
|
156
|
+
Impact: Prevents silent errors in multi-agent workflows
|
|
157
|
+
|
|
158
|
+
#10 [Medium effort] Intelligent Reasoning Budget Allocation
|
|
159
|
+
→ 15-25% harder-task success; cheaper on easy tasks
|
|
160
|
+
Impact: Quality + cost optimization on reasoning models
|
|
161
|
+
|
|
162
|
+
#11 [Very high effort] Formal Verification Integration (Dafny/Lean stage)
|
|
163
|
+
→ 99.99% confidence on critical code paths
|
|
164
|
+
Impact: High stakes (crypto, payments); niche use case
|
|
165
|
+
|
|
166
|
+
#12 [High effort] Active Context Compression + Semantic Memory Layer
|
|
167
|
+
→ Unbounded context bloat fixed; 30% better compression
|
|
168
|
+
Impact: Solves long-session scalability
|
|
169
|
+
|
|
170
|
+
#13 [High effort] Multi-Pass Mutation Generation (LLM-based mutants)
|
|
171
|
+
→ Diversified test coverage; Meta-style compliance
|
|
172
|
+
Impact: Better mutation diversity than rule-based
|
|
173
|
+
|
|
174
|
+
#14 [High effort] Anomaly Detection + Predictive Repair (log analysis)
|
|
175
|
+
→ Earlier failure prevention; MTTR ↓ 40%
|
|
176
|
+
Impact: Proactive vs reactive
|
|
177
|
+
|
|
178
|
+
#15 [High effort] Cross-Repo Fleet Learning (pattern sharing)
|
|
179
|
+
→ 20% faster on new repo types
|
|
180
|
+
Impact: Leverages multi-repo data
|
|
181
|
+
|
|
182
|
+
TIER 4 (LOWER ROI - NICE TO HAVE):
|
|
183
|
+
|
|
184
|
+
#16-20: Quorum merge decisions, privacy mutations, DAG executor, symbol caching,
|
|
185
|
+
WebSocket real-time monitoring
|
|
186
|
+
Impact: Quality, compliance, observability improvements
|
|
187
|
+
|
|
188
|
+
================================================================================
|
|
189
|
+
IMPLEMENTATION ROADMAP (12 WEEKS)
|
|
190
|
+
================================================================================
|
|
191
|
+
|
|
192
|
+
PHASE 1: CONVERGENCE & COST (Weeks 1-4)
|
|
193
|
+
✓ #1 Semantic trajectory analysis for loop regimes
|
|
194
|
+
✓ #5 Speculative cascade routing
|
|
195
|
+
→ #2 Intent Specification Engine (research phase)
|
|
196
|
+
|
|
197
|
+
PHASE 2: SECURITY & TESTING (Weeks 5-8)
|
|
198
|
+
✓ #3 Vulnerability Reward Model
|
|
199
|
+
✓ #6 Mutation Testing Loop
|
|
200
|
+
✓ #13 Multi-Pass Mutation Generation
|
|
201
|
+
|
|
202
|
+
PHASE 3: MEMORY & SELF-HEALING (Weeks 9-12)
|
|
203
|
+
✓ #4 Episodic Memory Layer
|
|
204
|
+
✓ #7 CI Repair Agent
|
|
205
|
+
✓ #8 LLM-as-a-Judge
|
|
206
|
+
|
|
207
|
+
================================================================================
|
|
208
|
+
KEY SOURCES CITED
|
|
209
|
+
================================================================================
|
|
210
|
+
|
|
211
|
+
BENCHMARKS:
|
|
212
|
+
- SWE-bench: https://www.vals.ai/benchmarks/swebench
|
|
213
|
+
- SWE-bench Pro: https://scale.com/blog/swe-bench-pro
|
|
214
|
+
- Codeforces: https://codeforces.com/
|
|
215
|
+
|
|
216
|
+
PAPERS (2024-2026):
|
|
217
|
+
- SWE-agent NeurIPS 2024: https://arxiv.org/abs/2405.15793
|
|
218
|
+
- Geometric Dynamics of Agentic Loops: https://arxiv.org/abs/2512.10350
|
|
219
|
+
- DafnyPro POPL 2026: https://popl26.sigplan.org/
|
|
220
|
+
- FunPRM Process Rewards: https://arxiv.org/abs/2601.22249
|
|
221
|
+
- DeepSeek-R1 Architecture: https://arxiv.org/abs/2501.12948
|
|
222
|
+
- Active Context Compression: https://arxiv.org/abs/2601.07190
|
|
223
|
+
- Episodic Memory for LLMs: https://arxiv.org/abs/2407.09450
|
|
224
|
+
|
|
225
|
+
INDUSTRY REPORTS:
|
|
226
|
+
- BCG Platinion Dark Software Factory:
|
|
227
|
+
https://www.bcgplatinion.com/insights/the-dark-software-factory
|
|
228
|
+
- Anthropic 2026 Agentic Coding Trends Report:
|
|
229
|
+
https://resources.anthropic.com/hubfs/2026%20Agentic%20Coding%20Trends%20Report.pdf
|
|
230
|
+
- GitHub Copilot Agent Mode:
|
|
231
|
+
https://github.com/newsroom/press-releases/agent-mode
|
|
232
|
+
- Meta Engineering Blog (LLM-powered bug catchers):
|
|
233
|
+
https://engineering.fb.com/2025/02/05/security/
|
|
234
|
+
- Google Speculative Cascades:
|
|
235
|
+
https://research.google/blog/speculative-cascades-a-hybrid-approach-for-smarter-faster-llm-inference/
|
|
236
|
+
|
|
237
|
+
MODELS & SYSTEMS:
|
|
238
|
+
- Claude Opus 4.6: https://platform.claude.com
|
|
239
|
+
- OpenAI o1-pro: https://openai.com/index/introducing-openai-o1-preview/
|
|
240
|
+
- DeepSeek-R1: https://github.com/deepseek-ai/DeepSeek-R1
|
|
241
|
+
- SWE-agent: https://github.com/SWE-agent/SWE-agent
|
|
242
|
+
- Aider: https://github.com/paul-gauthier/aider
|
|
243
|
+
- Cline: https://github.com/cline/cline
|
|
244
|
+
|
|
245
|
+
================================================================================
|
|
246
|
+
CONCLUSION
|
|
247
|
+
================================================================================
|
|
248
|
+
|
|
249
|
+
Shipwright is positioned as the PLATFORM-GRADE autonomous software factory —
|
|
250
|
+
the right abstraction level between human intent and shipped code. The next wave
|
|
251
|
+
of differentiation comes from:
|
|
252
|
+
|
|
253
|
+
1. PREDICTIVE INTELLIGENCE (convergence detection, loop regimes)
|
|
254
|
+
→ Cost & time reduction for iteration-heavy tasks
|
|
255
|
+
|
|
256
|
+
2. LEARNING ACROSS EPISODES (episodic memory)
|
|
257
|
+
→ Faster solutions on similar problems via case-based analogy
|
|
258
|
+
|
|
259
|
+
3. FORMAL GUARANTEES (verification, formal specs)
|
|
260
|
+
→ Safety/compliance for critical code paths
|
|
261
|
+
|
|
262
|
+
4. SELF-HEALING (CI repair, automated fixes)
|
|
263
|
+
→ Resilience and reduced human intervention
|
|
264
|
+
|
|
265
|
+
The 20-item backlog reflects industry momentum and fills Shipwright's remaining
|
|
266
|
+
gaps. Implementation order prioritizes highest ROI (cost, learning, quality).
|
|
267
|
+
|
|
268
|
+
Expected outcomes over 12 weeks:
|
|
269
|
+
- 40-60% cost reduction via cascading + convergence detection
|
|
270
|
+
- 30-40% security improvement via vulnerability-aware RL
|
|
271
|
+
- 20-35% faster solutions via episodic memory
|
|
272
|
+
- 50% fewer CI retries via repair agent
|
|
273
|
+
- Positioned as SOTA platform for dark factory era
|
|
274
|
+
|
|
275
|
+
================================================================================
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
# Pipeline Quality Revolution — Design Spec
|
|
2
|
+
|
|
3
|
+
**Date**: 2026-03-10
|
|
4
|
+
**Status**: Approved
|
|
5
|
+
**Goal**: Close 6 quality gaps + AI-readiness foundation so autonomous agents deliver better than humans
|
|
6
|
+
|
|
7
|
+
## Problem
|
|
8
|
+
|
|
9
|
+
Shipwright's pipeline infrastructure works — it polls issues, spawns agents, runs builds, creates PRs. But the _quality_ of output is mediocre because:
|
|
10
|
+
|
|
11
|
+
1. Plans are optimistic checklists without adversarial thinking
|
|
12
|
+
2. Definition of Done is written by the same agent that implements the change (the fox guarding the henhouse)
|
|
13
|
+
3. Agents don't understand _why_ they're building something
|
|
14
|
+
4. No feedback loop from PR review quality or post-merge outcomes
|
|
15
|
+
5. Code review is advisory, not adversarial — agents rubber-stamp
|
|
16
|
+
6. No scope discipline — PRs balloon to 16K+ lines
|
|
17
|
+
|
|
18
|
+
And underneath all of this: repos aren't "AI-ready." The daemon has no project-specific quality standards.
|
|
19
|
+
|
|
20
|
+
## Architecture
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
PREP (AI-Readiness Foundation) ──────────────────────
|
|
24
|
+
│ Interactive quality profile dialogue
|
|
25
|
+
│ → .claude/quality-profile.json
|
|
26
|
+
│ → enriched CLAUDE.md
|
|
27
|
+
▼
|
|
28
|
+
INTAKE ──────────────────────────────────────────────
|
|
29
|
+
│ Intent analysis: WHO/WHAT/WHY/HOW/NOT
|
|
30
|
+
│ → .claude/pipeline-artifacts/acceptance-criteria.json
|
|
31
|
+
▼
|
|
32
|
+
PLAN ────────────────────────────────────────────────
|
|
33
|
+
│ Constrained by external acceptance criteria
|
|
34
|
+
│ Mandatory failure mode analysis section
|
|
35
|
+
│ → plan.md (with "Files to Modify" for scope tracking)
|
|
36
|
+
▼
|
|
37
|
+
BUILD ───────────────────────────────────────────────
|
|
38
|
+
│ "never_ship" rules injected every iteration
|
|
39
|
+
│ Scope tracking: planned vs actual files
|
|
40
|
+
│ Quality rules from learned patterns
|
|
41
|
+
▼
|
|
42
|
+
REVIEW ──────────────────────────────────────────────
|
|
43
|
+
│ Adversarial-by-default, must find 3+ issues
|
|
44
|
+
│ Bugs block (not just criticals)
|
|
45
|
+
│ Scope creep flagged from plan diff
|
|
46
|
+
▼
|
|
47
|
+
COMPOUND_QUALITY ────────────────────────────────────
|
|
48
|
+
│ Machine-verifiable DoD scorecard
|
|
49
|
+
│ Each acceptance criterion: PASS/FAIL with evidence
|
|
50
|
+
▼
|
|
51
|
+
PR ──────────────────────────────────────────────────
|
|
52
|
+
│ PR size gate (configurable, default 500 lines)
|
|
53
|
+
▼
|
|
54
|
+
POST-MERGE ──────────────────────────────────────────
|
|
55
|
+
Review comment capture → memory
|
|
56
|
+
Merge quality score tracking
|
|
57
|
+
Auto-generated quality rules from patterns
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Component Designs
|
|
61
|
+
|
|
62
|
+
### Component 1: Quality Profile (`quality-profile.json`)
|
|
63
|
+
|
|
64
|
+
The keystone. Every pipeline stage reads this to calibrate behavior to the project.
|
|
65
|
+
|
|
66
|
+
**Schema:**
|
|
67
|
+
|
|
68
|
+
```json
|
|
69
|
+
{
|
|
70
|
+
"version": 1,
|
|
71
|
+
"project_name": "string",
|
|
72
|
+
"generated_at": "ISO-8601",
|
|
73
|
+
"architecture": {
|
|
74
|
+
"pattern": "monolith|modular_monolith|microservices|serverless|library",
|
|
75
|
+
"layers": ["string"],
|
|
76
|
+
"dependency_direction": "inward|none",
|
|
77
|
+
"rules": ["string — architectural constraints"]
|
|
78
|
+
},
|
|
79
|
+
"testing": {
|
|
80
|
+
"philosophy": "tdd|test_after|coverage_target|manual",
|
|
81
|
+
"min_coverage_delta": 0,
|
|
82
|
+
"required_test_types": ["unit", "integration", "e2e"],
|
|
83
|
+
"test_cmd": "string",
|
|
84
|
+
"fast_test_cmd": "string"
|
|
85
|
+
},
|
|
86
|
+
"quality": {
|
|
87
|
+
"max_pr_lines": 500,
|
|
88
|
+
"max_files_per_pr": 15,
|
|
89
|
+
"never_ship": ["string — absolute rules"],
|
|
90
|
+
"always_require": ["string — positive requirements"],
|
|
91
|
+
"learned_rules": [
|
|
92
|
+
{
|
|
93
|
+
"rule": "string",
|
|
94
|
+
"source": "string — how this was learned",
|
|
95
|
+
"confidence": "number — 0.0 to 1.0",
|
|
96
|
+
"created_at": "ISO-8601",
|
|
97
|
+
"inject_at": ["plan", "build", "review"]
|
|
98
|
+
}
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
"review": {
|
|
102
|
+
"focus_areas": ["string"],
|
|
103
|
+
"blocking_severities": ["critical", "bug", "security"],
|
|
104
|
+
"min_issues_to_find": 3
|
|
105
|
+
},
|
|
106
|
+
"scope": {
|
|
107
|
+
"unplanned_files_block": false,
|
|
108
|
+
"decomposition_threshold_lines": 500
|
|
109
|
+
},
|
|
110
|
+
"deployment": {
|
|
111
|
+
"strategy": "direct|preview_then_production|staged_rollout",
|
|
112
|
+
"rollback_plan": "revert_commit|feature_flag|manual",
|
|
113
|
+
"monitoring_window_minutes": 30
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
**Generation**: `shipwright prep --interactive` runs a guided dialogue analyzing repo structure, configs, tests, CI, and asking 5-7 targeted questions. `shipwright prep --auto` infers from repo analysis with confidence scores.
|
|
119
|
+
|
|
120
|
+
**Location**: `.claude/quality-profile.json` (checked into repo, grows over time)
|
|
121
|
+
|
|
122
|
+
### Component 2: Intent Analysis (Intake Stage Enhancement)
|
|
123
|
+
|
|
124
|
+
**Trigger**: Runs in `stage_intake()` after issue metadata is fetched, before plan stage.
|
|
125
|
+
|
|
126
|
+
**Prompt template**:
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
Analyze this issue deeply before any implementation planning.
|
|
130
|
+
|
|
131
|
+
Issue: {title}
|
|
132
|
+
Body: {body}
|
|
133
|
+
Labels: {labels}
|
|
134
|
+
|
|
135
|
+
Project architecture: {quality_profile.architecture}
|
|
136
|
+
|
|
137
|
+
Produce a structured analysis:
|
|
138
|
+
|
|
139
|
+
1. WHO benefits? (end user / developer / ops / CI)
|
|
140
|
+
2. WHAT changes? (concrete before→after behavior, with examples)
|
|
141
|
+
3. WHY does this matter? (pain solved / capability unlocked)
|
|
142
|
+
4. HOW will we know it worked? (observable signals — specific, testable)
|
|
143
|
+
5. WHAT SHOULD WE NOT DO? (explicit out-of-scope boundaries)
|
|
144
|
+
6. ACCEPTANCE CRITERIA: 3-7 machine-verifiable criteria
|
|
145
|
+
|
|
146
|
+
Output JSON to acceptance-criteria.json matching this schema:
|
|
147
|
+
{schema}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Output**: `acceptance-criteria.json` saved to pipeline artifacts. Passed to plan stage as input constraint.
|
|
151
|
+
|
|
152
|
+
**Key design decision**: Intent analysis runs as a _separate Claude session_ from planning. The analyst defines "what success looks like." The planner figures out "how to get there."
|
|
153
|
+
|
|
154
|
+
### Component 3: Adversarial Plan Validation (Plan Stage Enhancement)
|
|
155
|
+
|
|
156
|
+
**Trigger**: After plan is generated, before plan validation gate.
|
|
157
|
+
|
|
158
|
+
**Injected into plan prompt**:
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
After your implementation plan, include a MANDATORY section:
|
|
162
|
+
|
|
163
|
+
## Failure Mode Analysis
|
|
164
|
+
For each major component or decision:
|
|
165
|
+
1. Runtime failures: What happens when dependencies are unavailable?
|
|
166
|
+
2. Concurrency risks: Race conditions, stale state, duplicate processing?
|
|
167
|
+
3. Scale risks: 10x data, slow external deps, memory pressure?
|
|
168
|
+
4. Rollback story: Can we revert safely without data loss?
|
|
169
|
+
|
|
170
|
+
Project architecture rules to consider:
|
|
171
|
+
{quality_profile.architecture.rules}
|
|
172
|
+
|
|
173
|
+
You MUST identify at least 3 concrete failure modes.
|
|
174
|
+
Address the most critical one in your implementation plan.
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
**Validation gate addition**: New rejection reason `missing_failure_analysis` — the plan is rejected if the failure mode section is empty, has fewer than 3 items, or contains only generic platitudes (detected by the absence of any project-specific references).
|
|
178
|
+
|
|
179
|
+
### Component 4: Scope Enforcement (Build + PR Stage Enhancement)
|
|
180
|
+
|
|
181
|
+
**A. Planned files tracking (build stage)**:
|
|
182
|
+
Extract "Files to Modify" from `plan.md` at build start. After each iteration, compare `git diff --name-only` against planned files. Log unplanned files to `scope-report.json`.
|
|
183
|
+
|
|
184
|
+
**B. PR size gate (PR stage)**:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
total_lines=$(git diff --stat origin/main...HEAD | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+')
|
|
188
|
+
max_lines=$(jq -r '.quality.max_pr_lines // 500' "$QUALITY_PROFILE")
|
|
189
|
+
if [[ "$total_lines" -gt "$max_lines" ]]; then
|
|
190
|
+
error "PR is ${total_lines} lines (max: ${max_lines}). Decompose into smaller PRs."
|
|
191
|
+
exit 1
|
|
192
|
+
fi
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
**C. Scope report injection into review**:
|
|
196
|
+
The review stage receives `scope-report.json`, which lists unplanned files. The reviewer must either justify each unplanned file or flag it as scope creep.
|
|
197
|
+
|
|
198
|
+
### Component 5: Adversarial Review (Review Stage Enhancement)
|
|
199
|
+
|
|
200
|
+
**New review prompt** (replaces current generic prompt):
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
You are a SKEPTICAL senior engineer reviewing code for production.
|
|
204
|
+
Your job is to FIND PROBLEMS, not confirm quality.
|
|
205
|
+
|
|
206
|
+
Project standards (from quality-profile.json):
|
|
207
|
+
- Never ship: {quality_profile.quality.never_ship}
|
|
208
|
+
- Always require: {quality_profile.quality.always_require}
|
|
209
|
+
- Focus areas: {quality_profile.review.focus_areas}
|
|
210
|
+
- Learned rules: {quality_profile.quality.learned_rules}
|
|
211
|
+
|
|
212
|
+
Definition of Done (from acceptance-criteria.json):
|
|
213
|
+
{acceptance_criteria}
|
|
214
|
+
|
|
215
|
+
Scope report (planned vs actual files):
|
|
216
|
+
{scope_report}
|
|
217
|
+
|
|
218
|
+
Rules:
|
|
219
|
+
1. Find at least {min_issues_to_find} issues. If truly zero issues exist,
|
|
220
|
+
write a paragraph explaining why this code is exceptional.
|
|
221
|
+
2. Rate each: Critical / Bug / Security / Warning / Suggestion
|
|
222
|
+
3. Check EVERY acceptance criterion — mark PASS/FAIL with evidence.
|
|
223
|
+
4. Flag every unplanned file — justify or mark as scope creep.
|
|
224
|
+
5. Check every "never_ship" rule — cite violations with line numbers.
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
**Blocking change**: Gate condition becomes `critical_count + bug_count + security_count > 0` (bugs now block).
|
|
228
|
+
|
|
229
|
+
### Component 6: Machine-Verifiable DoD Scorecard (Compound Quality Enhancement)
|
|
230
|
+
|
|
231
|
+
**Computed checks** (no LLM needed):
|
|
232
|
+
|
|
233
|
+
```json
|
|
234
|
+
{
|
|
235
|
+
"scorecard": {
|
|
236
|
+
"pr_size": { "status": "pass", "value": 247, "limit": 500 },
|
|
237
|
+
"test_count_delta": { "status": "pass", "value": 12, "baseline": 0 },
|
|
238
|
+
"coverage_delta": { "status": "pass", "value": 2.1, "min": 0 },
|
|
239
|
+
"lint_warnings_delta": { "status": "pass", "value": 0, "max": 0 },
|
|
240
|
+
"planned_files_coverage": {
|
|
241
|
+
"status": "pass",
|
|
242
|
+
"planned": 5,
|
|
243
|
+
"touched": 5,
|
|
244
|
+
"unplanned": 1
|
|
245
|
+
},
|
|
246
|
+
"never_ship_violations": { "status": "pass", "violations": [] },
|
|
247
|
+
"acceptance_criteria": [
|
|
248
|
+
{
|
|
249
|
+
"id": "ac-1",
|
|
250
|
+
"status": "pass",
|
|
251
|
+
"evidence": "GET /api/users returns 200 in test output"
|
|
252
|
+
},
|
|
253
|
+
{
|
|
254
|
+
"id": "ac-2",
|
|
255
|
+
"status": "fail",
|
|
256
|
+
"evidence": "No test for 401 response on invalid token"
|
|
257
|
+
}
|
|
258
|
+
]
|
|
259
|
+
},
|
|
260
|
+
"overall": "fail",
|
|
261
|
+
"blocking_failures": ["ac-2"]
|
|
262
|
+
}
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
Machine checks run first. LLM-based checks (adversarial, negative testing) run only if the machine checks pass. This ordering is faster and cheaper because changes that fail a machine check are rejected before any LLM call is made.
|
|
266
|
+
|
|
267
|
+
### Component 7: Outcome Feedback Loop (Post-Merge Enhancement)
|
|
268
|
+
|
|
269
|
+
**A. PR Review Capture** (new function in `sw-feedback.sh`):
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
capture_review_feedback() {
|
|
273
|
+
local pr_number="$1"
|
|
274
|
+
local reviews=$(gh api "repos/{owner}/{repo}/pulls/${pr_number}/reviews" --jq '.[].body')
|
|
275
|
+
local comments=$(gh api "repos/{owner}/{repo}/pulls/${pr_number}/comments" --jq '.[].body')
|
|
276
|
+
# Store in memory with type "review_feedback"
|
|
277
|
+
# Extract patterns for quality rule generation
|
|
278
|
+
}
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
**B. Merge Quality Score** (new function in `sw-feedback.sh`):
|
|
282
|
+
Track per-PR: `clean_merge` (+1), `changes_requested` (-1), `reverted` (-3), `regression` (-2). Rolling average → pipeline quality score in DORA dashboard.
|
|
283
|
+
|
|
284
|
+
**C. Quality Rule Auto-Generation** (new function in `sw-memory.sh`):
|
|
285
|
+
When the same review pattern appears 3+ times, generate a quality rule and add it to the `learned_rules` array in `quality-profile.json`. Rules are injected into plan, build, and review stages.
|
|
286
|
+
|
|
287
|
+
## Data Flow
|
|
288
|
+
|
|
289
|
+
```
|
|
290
|
+
quality-profile.json ──→ ALL STAGES (standards calibration)
|
|
291
|
+
│
|
|
292
|
+
├──→ intake: intent analysis prompt
|
|
293
|
+
├──→ plan: failure mode analysis prompt + acceptance criteria constraints
|
|
294
|
+
├──→ build: never_ship rules + learned quality rules
|
|
295
|
+
├──→ review: focus areas + blocking rules + scope report
|
|
296
|
+
├──→ compound_quality: machine check thresholds
|
|
297
|
+
└──→ pr: size limits
|
|
298
|
+
|
|
299
|
+
acceptance-criteria.json ──→ plan (constraints) → review (checklist) → compound_quality (scorecard)
|
|
300
|
+
scope-report.json ──→ review (scope creep detection) → pr (size gate)
|
|
301
|
+
dod-scorecard.json ──→ compound_quality output → pr gate
|
|
302
|
+
merge-quality.jsonl ──→ feedback → quality-profile.json (learned_rules)
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
## Testing Strategy
|
|
306
|
+
|
|
307
|
+
Each component has a corresponding test:
|
|
308
|
+
|
|
309
|
+
- `sw-quality-profile-test.sh` — profile generation, schema validation, merge with learned rules
|
|
310
|
+
- `sw-intent-analysis-test.sh` — acceptance criteria extraction, JSON schema compliance
|
|
311
|
+
- `sw-scope-enforcement-test.sh` — planned vs actual file tracking, PR size gate
|
|
312
|
+
- `sw-adversarial-review-test.sh` — minimum issue finding, blocking behavior, scope creep detection
|
|
313
|
+
- `sw-dod-scorecard-test.sh` — machine check computation, pass/fail logic
|
|
314
|
+
- `sw-outcome-feedback-test.sh` — review capture, quality score, rule auto-generation
|
|
315
|
+
|
|
316
|
+
Integration: `sw-pipeline-test.sh` gains tests for quality profile flow through all stages.
|
|
317
|
+
|
|
318
|
+
## Files to Create
|
|
319
|
+
|
|
320
|
+
| File | Purpose |
|
|
321
|
+
| ---------------------------------- | ----------------------------------------------------- |
|
|
322
|
+
| `scripts/lib/quality-profile.sh` | Profile loading, validation, merge with learned rules |
|
|
323
|
+
| `scripts/lib/intent-analysis.sh` | Issue intent analysis, acceptance criteria generation |
|
|
324
|
+
| `scripts/lib/scope-enforcement.sh` | Planned vs actual file tracking, PR size gate |
|
|
325
|
+
| `scripts/lib/dod-scorecard.sh` | Machine-verifiable DoD computation |
|
|
326
|
+
| `scripts/lib/outcome-feedback.sh` | Review capture, quality score, rule generation |
|
|
327
|
+
|
|
328
|
+
## Files to Modify
|
|
329
|
+
|
|
330
|
+
| File | Changes |
|
|
331
|
+
| ---------------------------------------------------------- | ----------------------------------------------------------- |
|
|
332
|
+
| `scripts/lib/pipeline-stages-intake.sh` | Add intent analysis step, generate acceptance-criteria.json |
|
|
333
|
+
| `scripts/lib/pipeline-stages-intake.sh` (plan section) | Inject failure mode analysis, consume acceptance criteria |
|
|
334
|
+
| `scripts/lib/pipeline-stages-build.sh` | Inject never_ship rules, scope tracking |
|
|
335
|
+
| `scripts/lib/pipeline-stages-review.sh` | New adversarial review prompt, bug-blocking, scope report |
|
|
336
|
+
| `scripts/lib/pipeline-stages-review.sh` (compound_quality) | Machine DoD scorecard before LLM checks |
|
|
337
|
+
| `scripts/lib/pipeline-stages-delivery.sh` | PR size gate |
|
|
338
|
+
| `scripts/sw-prep.sh` | Interactive quality profile generation |
|
|
339
|
+
| `scripts/sw-feedback.sh` | PR review capture, merge quality score |
|
|
340
|
+
| `scripts/sw-memory.sh` | Quality rule auto-generation from patterns |
|
|
341
|
+
| `scripts/sw-pipeline-test.sh` | Integration tests for quality profile flow |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "shipwright-cli",
|
|
3
|
-
"version": "3.1.0",
|
|
3
|
+
"version": "3.3.0",
|
|
4
4
|
"description": "Orchestrate autonomous Claude Code agent teams in tmux",
|
|
5
5
|
"bin": {
|
|
6
6
|
"shipwright": "scripts/sw",
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
"dashboard:test": "vitest run --config dashboard/vitest.config.ts",
|
|
37
37
|
"dashboard:test:watch": "vitest --config dashboard/vitest.config.ts",
|
|
38
38
|
"dashboard:test:coverage": "vitest run --config dashboard/vitest.config.ts --coverage",
|
|
39
|
-
"test": "bash scripts/sw-agi-roadmap-test.sh && bash scripts/sw-activity-test.sh && bash scripts/sw-adaptive-test.sh && bash scripts/sw-adversarial-test.sh && bash scripts/sw-architecture-enforcer-test.sh && bash scripts/sw-auth-test.sh && bash scripts/sw-autonomous-test.sh && bash scripts/sw-changelog-test.sh && bash scripts/sw-checkpoint-test.sh && bash scripts/sw-ci-test.sh && bash scripts/sw-cleanup-test.sh && bash scripts/sw-code-review-test.sh && bash scripts/sw-connect-test.sh && bash scripts/sw-context-test.sh && bash scripts/sw-cost-test.sh && bash scripts/sw-daemon-test.sh && bash scripts/sw-dashboard-test.sh && bash scripts/sw-db-test.sh && bash scripts/sw-decompose-test.sh && bash scripts/sw-decide-test.sh && bash scripts/sw-deps-test.sh && bash scripts/sw-developer-simulation-test.sh && bash scripts/sw-discovery-test.sh && bash scripts/sw-doc-fleet-test.sh && bash scripts/sw-docs-agent-test.sh && bash scripts/sw-docs-test.sh && bash scripts/sw-doctor-test.sh && bash scripts/sw-dora-test.sh && bash scripts/sw-durable-test.sh && bash scripts/sw-e2e-orchestrator-test.sh && bash scripts/sw-eventbus-test.sh && bash scripts/sw-feedback-test.sh && bash scripts/sw-fix-test.sh && bash scripts/sw-fleet-discover-test.sh && bash scripts/sw-fleet-test.sh && bash scripts/sw-fleet-viz-test.sh && bash scripts/sw-frontier-test.sh && bash scripts/sw-github-app-test.sh && bash scripts/sw-github-checks-test.sh && bash scripts/sw-github-deploy-test.sh && bash scripts/sw-github-graphql-test.sh && bash scripts/sw-guild-test.sh && bash scripts/sw-heartbeat-test.sh && bash scripts/sw-hygiene-test.sh && bash scripts/sw-incident-test.sh && bash scripts/sw-init-test.sh && bash scripts/sw-instrument-test.sh && bash scripts/sw-intelligence-test.sh && bash scripts/sw-jira-test.sh && bash scripts/sw-launchd-test.sh && bash scripts/sw-linear-test.sh && bash scripts/sw-logs-test.sh && bash scripts/sw-loop-test.sh && bash scripts/sw-memory-test.sh && bash 
scripts/sw-mission-control-test.sh && bash scripts/sw-model-router-test.sh && bash scripts/sw-otel-test.sh && bash scripts/sw-oversight-test.sh && bash scripts/sw-patrol-meta-test.sh && bash scripts/sw-pipeline-composer-test.sh && bash scripts/sw-pipeline-test.sh && bash scripts/sw-pipeline-vitals-test.sh && bash scripts/sw-pm-test.sh && bash scripts/sw-pr-lifecycle-test.sh && bash scripts/sw-predictive-test.sh && bash scripts/sw-prep-test.sh && bash scripts/sw-ps-test.sh && bash scripts/sw-public-dashboard-test.sh && bash scripts/sw-quality-test.sh && bash scripts/sw-reaper-test.sh && bash scripts/sw-recruit-test.sh && bash scripts/sw-regression-test.sh && bash scripts/sw-release-manager-test.sh && bash scripts/sw-release-test.sh && bash scripts/sw-remote-test.sh && bash scripts/sw-replay-test.sh && bash scripts/sw-retro-test.sh && bash scripts/sw-scale-test.sh && bash scripts/sw-security-audit-test.sh && bash scripts/sw-self-optimize-test.sh && bash scripts/sw-session-test.sh && bash scripts/sw-setup-test.sh && bash scripts/sw-standup-test.sh && bash scripts/sw-status-test.sh && bash scripts/sw-strategic-test.sh && bash scripts/sw-stream-test.sh && bash scripts/sw-swarm-test.sh && bash scripts/sw-team-stages-test.sh && bash scripts/sw-templates-test.sh && bash scripts/sw-testgen-test.sh && bash scripts/sw-tmux-pipeline-test.sh && bash scripts/sw-tmux-test.sh && bash scripts/sw-trace-test.sh && bash scripts/sw-tracker-test.sh && bash scripts/sw-triage-test.sh && bash scripts/sw-upgrade-test.sh && bash scripts/sw-ux-test.sh && bash scripts/sw-webhook-test.sh && bash scripts/sw-widgets-test.sh && bash scripts/sw-worktree-test.sh && bash scripts/sw-lib-compat-test.sh && bash scripts/sw-lib-helpers-test.sh && bash scripts/sw-lib-daemon-dispatch-test.sh && bash scripts/sw-lib-daemon-failure-test.sh && bash scripts/sw-lib-daemon-poll-test.sh && bash scripts/sw-lib-daemon-state-test.sh && bash scripts/sw-lib-daemon-triage-test.sh && bash 
scripts/sw-lib-pipeline-detection-test.sh && bash scripts/sw-lib-pipeline-intelligence-test.sh && bash scripts/sw-lib-pipeline-quality-checks-test.sh && bash scripts/sw-lib-pipeline-stages-test.sh && bash scripts/sw-lib-pipeline-state-test.sh && bash scripts/sw-adapters-test.sh && bash scripts/sw-evidence-test.sh && bash scripts/sw-review-rerun-test.sh && bash scripts/sw-tracker-providers-test.sh && bash scripts/sw-budget-chaos-test.sh && bash scripts/sw-autonomous-e2e-test.sh && bash scripts/sw-memory-discovery-e2e-test.sh && bash scripts/sw-policy-e2e-test.sh && bash scripts/sw-e2e-smoke-test.sh && bash scripts/sw-dashboard-e2e-test.sh",
|
|
39
|
+
"test": "bash scripts/sw-agi-roadmap-test.sh && bash scripts/sw-activity-test.sh && bash scripts/sw-adaptive-test.sh && bash scripts/sw-adversarial-test.sh && bash scripts/sw-architecture-enforcer-test.sh && bash scripts/sw-auth-test.sh && bash scripts/sw-autonomous-test.sh && bash scripts/sw-changelog-test.sh && bash scripts/sw-checkpoint-test.sh && bash scripts/sw-ci-test.sh && bash scripts/sw-cleanup-test.sh && bash scripts/sw-code-review-test.sh && bash scripts/sw-connect-test.sh && bash scripts/sw-context-budget-test.sh && bash scripts/sw-context-test.sh && bash scripts/sw-cost-test.sh && bash scripts/sw-daemon-test.sh && bash scripts/sw-dashboard-test.sh && bash scripts/sw-db-test.sh && bash scripts/sw-decompose-test.sh && bash scripts/sw-decide-test.sh && bash scripts/sw-deps-test.sh && bash scripts/sw-developer-simulation-test.sh && bash scripts/sw-discovery-test.sh && bash scripts/sw-doc-fleet-test.sh && bash scripts/sw-docs-agent-test.sh && bash scripts/sw-docs-test.sh && bash scripts/sw-doctor-test.sh && bash scripts/sw-dora-test.sh && bash scripts/sw-durable-test.sh && bash scripts/sw-e2e-orchestrator-test.sh && bash scripts/sw-eventbus-test.sh && bash scripts/sw-feedback-test.sh && bash scripts/sw-outcome-feedback-test.sh && bash scripts/sw-fix-test.sh && bash scripts/sw-fleet-discover-test.sh && bash scripts/sw-fleet-test.sh && bash scripts/sw-fleet-viz-test.sh && bash scripts/sw-frontier-test.sh && bash scripts/sw-github-app-test.sh && bash scripts/sw-github-checks-test.sh && bash scripts/sw-github-deploy-test.sh && bash scripts/sw-github-graphql-test.sh && bash scripts/sw-guild-test.sh && bash scripts/sw-heartbeat-test.sh && bash scripts/sw-hello-test.sh && bash scripts/sw-hygiene-test.sh && bash scripts/sw-incident-test.sh && bash scripts/sw-init-test.sh && bash scripts/sw-instrument-test.sh && bash scripts/sw-intelligence-test.sh && bash scripts/sw-jira-test.sh && bash scripts/sw-launchd-test.sh && bash scripts/sw-linear-test.sh && bash 
scripts/sw-logs-test.sh && bash scripts/sw-loop-test.sh && bash scripts/sw-memory-test.sh && bash scripts/sw-mission-control-test.sh && bash scripts/sw-model-router-test.sh && bash scripts/sw-otel-test.sh && bash scripts/sw-oversight-test.sh && bash scripts/sw-patrol-meta-test.sh && bash scripts/sw-pipeline-composer-test.sh && bash scripts/sw-pipeline-test.sh && bash scripts/sw-pipeline-vitals-test.sh && bash scripts/sw-pm-test.sh && bash scripts/sw-pr-lifecycle-test.sh && bash scripts/sw-predictive-test.sh && bash scripts/sw-prep-test.sh && bash scripts/sw-ps-test.sh && bash scripts/sw-public-dashboard-test.sh && bash scripts/sw-quality-profile-test.sh && bash scripts/sw-quality-test.sh && bash scripts/sw-reaper-test.sh && bash scripts/sw-recruit-test.sh && bash scripts/sw-regression-test.sh && bash scripts/sw-release-manager-test.sh && bash scripts/sw-release-test.sh && bash scripts/sw-root-cause-test.sh && bash scripts/sw-remote-test.sh && bash scripts/sw-replay-test.sh && bash scripts/sw-retro-test.sh && bash scripts/sw-scale-test.sh && bash scripts/sw-stall-detector-test.sh && bash scripts/sw-security-audit-test.sh && bash scripts/sw-self-optimize-test.sh && bash scripts/sw-session-test.sh && bash scripts/sw-setup-test.sh && bash scripts/sw-standup-test.sh && bash scripts/sw-status-test.sh && bash scripts/sw-strategic-test.sh && bash scripts/sw-stream-test.sh && bash scripts/sw-swarm-test.sh && bash scripts/sw-team-stages-test.sh && bash scripts/sw-templates-test.sh && bash scripts/sw-testgen-test.sh && bash scripts/sw-tmux-pipeline-test.sh && bash scripts/sw-tmux-test.sh && bash scripts/sw-trace-test.sh && bash scripts/sw-tracker-test.sh && bash scripts/sw-triage-test.sh && bash scripts/sw-upgrade-test.sh && bash scripts/sw-ux-test.sh && bash scripts/sw-webhook-test.sh && bash scripts/sw-widgets-test.sh && bash scripts/sw-worktree-test.sh && bash scripts/sw-lib-compat-test.sh && bash scripts/sw-lib-helpers-test.sh && bash 
scripts/sw-lib-error-actionability-test.sh && bash scripts/sw-lib-daemon-dispatch-test.sh && bash scripts/sw-lib-daemon-failure-test.sh && bash scripts/sw-lib-daemon-poll-test.sh && bash scripts/sw-lib-daemon-state-test.sh && bash scripts/sw-lib-daemon-triage-test.sh && bash scripts/sw-lib-daemon-patrol-test.sh && bash scripts/sw-lib-pipeline-detection-test.sh && bash scripts/sw-lib-pipeline-intelligence-test.sh && bash scripts/sw-lib-pipeline-quality-checks-test.sh && bash scripts/sw-lib-pipeline-stages-test.sh && bash scripts/sw-lib-pipeline-state-test.sh && bash scripts/sw-adapters-test.sh && bash scripts/sw-evidence-test.sh && bash scripts/sw-review-rerun-test.sh && bash scripts/sw-tracker-providers-test.sh && bash scripts/sw-budget-chaos-test.sh && bash scripts/sw-chaos-test.sh && bash scripts/sw-autonomous-e2e-test.sh && bash scripts/sw-memory-discovery-e2e-test.sh && bash scripts/sw-policy-e2e-test.sh && bash scripts/sw-e2e-smoke-test.sh && bash scripts/sw-dashboard-e2e-test.sh && bash scripts/sw-reward-aggregator-test.sh && bash scripts/sw-bandit-selector-test.sh && bash scripts/sw-policy-learner-test.sh && bash scripts/sw-autoresearch-e2e-test.sh",
|
|
40
40
|
"test:smoke": "bash scripts/sw-e2e-smoke-test.sh",
|
|
41
41
|
"test:integration": "bash scripts/sw-e2e-integration-test.sh && bash scripts/sw-e2e-system-test.sh && bash scripts/sw-server-api-test.sh && bash scripts/sw-integration-claude-test.sh",
|
|
42
42
|
"harness:evidence:capture": "bash scripts/sw-evidence.sh capture",
|