shipwright-cli 3.2.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (279) hide show
  1. package/.claude/agents/code-reviewer.md +2 -0
  2. package/.claude/agents/devops-engineer.md +2 -0
  3. package/.claude/agents/doc-fleet-agent.md +2 -0
  4. package/.claude/agents/pipeline-agent.md +2 -0
  5. package/.claude/agents/shell-script-specialist.md +2 -0
  6. package/.claude/agents/test-specialist.md +2 -0
  7. package/.claude/hooks/agent-crash-capture.sh +32 -0
  8. package/.claude/hooks/post-tool-use.sh +3 -2
  9. package/.claude/hooks/pre-tool-use.sh +35 -3
  10. package/README.md +4 -4
  11. package/claude-code/hooks/config-change.sh +18 -0
  12. package/claude-code/hooks/instructions-reloaded.sh +7 -0
  13. package/claude-code/hooks/worktree-create.sh +25 -0
  14. package/claude-code/hooks/worktree-remove.sh +20 -0
  15. package/config/code-constitution.json +130 -0
  16. package/dashboard/middleware/auth.ts +134 -0
  17. package/dashboard/middleware/constants.ts +21 -0
  18. package/dashboard/public/index.html +2 -6
  19. package/dashboard/public/styles.css +100 -97
  20. package/dashboard/routes/auth.ts +38 -0
  21. package/dashboard/server.ts +66 -25
  22. package/dashboard/services/config.ts +26 -0
  23. package/dashboard/services/db.ts +118 -0
  24. package/dashboard/src/canvas/pixel-agent.ts +298 -0
  25. package/dashboard/src/canvas/pixel-sprites.ts +440 -0
  26. package/dashboard/src/canvas/shipyard-effects.ts +367 -0
  27. package/dashboard/src/canvas/shipyard-scene.ts +616 -0
  28. package/dashboard/src/canvas/submarine-layout.ts +267 -0
  29. package/dashboard/src/components/header.ts +8 -7
  30. package/dashboard/src/core/router.ts +1 -0
  31. package/dashboard/src/design/submarine-theme.ts +253 -0
  32. package/dashboard/src/main.ts +2 -0
  33. package/dashboard/src/types/api.ts +2 -1
  34. package/dashboard/src/views/activity.ts +2 -1
  35. package/dashboard/src/views/shipyard.ts +39 -0
  36. package/dashboard/types/index.ts +166 -0
  37. package/docs/plans/2026-02-28-compound-audit-and-shipyard-design.md +186 -0
  38. package/docs/plans/2026-02-28-skipper-shipwright-implementation-plan.md +1182 -0
  39. package/docs/plans/2026-02-28-skipper-shipwright-integration-design.md +531 -0
  40. package/docs/plans/2026-03-01-ai-powered-skill-injection-design.md +298 -0
  41. package/docs/plans/2026-03-01-ai-powered-skill-injection-plan.md +1109 -0
  42. package/docs/plans/2026-03-01-capabilities-cleanup-plan.md +658 -0
  43. package/docs/plans/2026-03-01-clean-architecture-plan.md +924 -0
  44. package/docs/plans/2026-03-01-compound-audit-cascade-design.md +191 -0
  45. package/docs/plans/2026-03-01-compound-audit-cascade-plan.md +921 -0
  46. package/docs/plans/2026-03-01-deep-integration-plan.md +851 -0
  47. package/docs/plans/2026-03-01-pipeline-audit-trail-design.md +145 -0
  48. package/docs/plans/2026-03-01-pipeline-audit-trail-plan.md +770 -0
  49. package/docs/plans/2026-03-01-refined-depths-brand-design.md +382 -0
  50. package/docs/plans/2026-03-01-refined-depths-implementation.md +599 -0
  51. package/docs/plans/2026-03-01-skipper-kernel-integration-design.md +203 -0
  52. package/docs/plans/2026-03-01-unified-platform-design.md +272 -0
  53. package/docs/plans/2026-03-07-claude-code-feature-integration-design.md +189 -0
  54. package/docs/plans/2026-03-07-claude-code-feature-integration-plan.md +1165 -0
  55. package/docs/research/BACKLOG_QUICK_REFERENCE.md +352 -0
  56. package/docs/research/CUTTING_EDGE_RESEARCH_2026.md +546 -0
  57. package/docs/research/RESEARCH_INDEX.md +439 -0
  58. package/docs/research/RESEARCH_SOURCES.md +440 -0
  59. package/docs/research/RESEARCH_SUMMARY.txt +275 -0
  60. package/docs/superpowers/specs/2026-03-10-pipeline-quality-revolution-design.md +341 -0
  61. package/package.json +2 -2
  62. package/scripts/lib/adaptive-model.sh +427 -0
  63. package/scripts/lib/adaptive-timeout.sh +316 -0
  64. package/scripts/lib/audit-trail.sh +309 -0
  65. package/scripts/lib/auto-recovery.sh +471 -0
  66. package/scripts/lib/bandit-selector.sh +431 -0
  67. package/scripts/lib/bootstrap.sh +104 -2
  68. package/scripts/lib/causal-graph.sh +455 -0
  69. package/scripts/lib/compat.sh +126 -0
  70. package/scripts/lib/compound-audit.sh +337 -0
  71. package/scripts/lib/constitutional.sh +454 -0
  72. package/scripts/lib/context-budget.sh +359 -0
  73. package/scripts/lib/convergence.sh +594 -0
  74. package/scripts/lib/cost-optimizer.sh +634 -0
  75. package/scripts/lib/daemon-adaptive.sh +10 -0
  76. package/scripts/lib/daemon-dispatch.sh +106 -17
  77. package/scripts/lib/daemon-failure.sh +34 -4
  78. package/scripts/lib/daemon-patrol.sh +23 -2
  79. package/scripts/lib/daemon-poll-github.sh +361 -0
  80. package/scripts/lib/daemon-poll-health.sh +299 -0
  81. package/scripts/lib/daemon-poll.sh +27 -611
  82. package/scripts/lib/daemon-state.sh +112 -66
  83. package/scripts/lib/daemon-triage.sh +10 -0
  84. package/scripts/lib/dod-scorecard.sh +442 -0
  85. package/scripts/lib/error-actionability.sh +300 -0
  86. package/scripts/lib/formal-spec.sh +461 -0
  87. package/scripts/lib/helpers.sh +177 -4
  88. package/scripts/lib/intent-analysis.sh +409 -0
  89. package/scripts/lib/loop-convergence.sh +350 -0
  90. package/scripts/lib/loop-iteration.sh +682 -0
  91. package/scripts/lib/loop-progress.sh +48 -0
  92. package/scripts/lib/loop-restart.sh +185 -0
  93. package/scripts/lib/memory-effectiveness.sh +506 -0
  94. package/scripts/lib/mutation-executor.sh +352 -0
  95. package/scripts/lib/outcome-feedback.sh +521 -0
  96. package/scripts/lib/pipeline-cli.sh +336 -0
  97. package/scripts/lib/pipeline-commands.sh +1216 -0
  98. package/scripts/lib/pipeline-detection.sh +100 -2
  99. package/scripts/lib/pipeline-execution.sh +897 -0
  100. package/scripts/lib/pipeline-github.sh +28 -3
  101. package/scripts/lib/pipeline-intelligence-compound.sh +431 -0
  102. package/scripts/lib/pipeline-intelligence-scoring.sh +407 -0
  103. package/scripts/lib/pipeline-intelligence-skip.sh +181 -0
  104. package/scripts/lib/pipeline-intelligence.sh +100 -1136
  105. package/scripts/lib/pipeline-quality-bash-compat.sh +182 -0
  106. package/scripts/lib/pipeline-quality-checks.sh +17 -715
  107. package/scripts/lib/pipeline-quality-gates.sh +563 -0
  108. package/scripts/lib/pipeline-stages-build.sh +730 -0
  109. package/scripts/lib/pipeline-stages-delivery.sh +965 -0
  110. package/scripts/lib/pipeline-stages-intake.sh +1133 -0
  111. package/scripts/lib/pipeline-stages-monitor.sh +407 -0
  112. package/scripts/lib/pipeline-stages-review.sh +1022 -0
  113. package/scripts/lib/pipeline-stages.sh +59 -2929
  114. package/scripts/lib/pipeline-state.sh +36 -5
  115. package/scripts/lib/pipeline-util.sh +487 -0
  116. package/scripts/lib/policy-learner.sh +438 -0
  117. package/scripts/lib/process-reward.sh +493 -0
  118. package/scripts/lib/project-detect.sh +649 -0
  119. package/scripts/lib/quality-profile.sh +334 -0
  120. package/scripts/lib/recruit-commands.sh +885 -0
  121. package/scripts/lib/recruit-learning.sh +739 -0
  122. package/scripts/lib/recruit-roles.sh +648 -0
  123. package/scripts/lib/reward-aggregator.sh +458 -0
  124. package/scripts/lib/rl-optimizer.sh +362 -0
  125. package/scripts/lib/root-cause.sh +427 -0
  126. package/scripts/lib/scope-enforcement.sh +445 -0
  127. package/scripts/lib/session-restart.sh +493 -0
  128. package/scripts/lib/skill-memory.sh +300 -0
  129. package/scripts/lib/skill-registry.sh +775 -0
  130. package/scripts/lib/spec-driven.sh +476 -0
  131. package/scripts/lib/test-helpers.sh +18 -7
  132. package/scripts/lib/test-holdout.sh +429 -0
  133. package/scripts/lib/test-optimizer.sh +511 -0
  134. package/scripts/shipwright-file-suggest.sh +45 -0
  135. package/scripts/skills/adversarial-quality.md +61 -0
  136. package/scripts/skills/api-design.md +44 -0
  137. package/scripts/skills/architecture-design.md +50 -0
  138. package/scripts/skills/brainstorming.md +43 -0
  139. package/scripts/skills/data-pipeline.md +44 -0
  140. package/scripts/skills/deploy-safety.md +64 -0
  141. package/scripts/skills/documentation.md +38 -0
  142. package/scripts/skills/frontend-design.md +45 -0
  143. package/scripts/skills/generated/.gitkeep +0 -0
  144. package/scripts/skills/generated/_refinements/.gitkeep +0 -0
  145. package/scripts/skills/generated/_refinements/adversarial-quality.patch.md +3 -0
  146. package/scripts/skills/generated/_refinements/architecture-design.patch.md +3 -0
  147. package/scripts/skills/generated/_refinements/brainstorming.patch.md +3 -0
  148. package/scripts/skills/generated/cli-version-management.md +29 -0
  149. package/scripts/skills/generated/collection-system-validation.md +99 -0
  150. package/scripts/skills/generated/large-scale-c-refactoring-coordination.md +97 -0
  151. package/scripts/skills/generated/pattern-matching-similarity-scoring.md +195 -0
  152. package/scripts/skills/generated/test-parallelization-detection.md +65 -0
  153. package/scripts/skills/observability.md +79 -0
  154. package/scripts/skills/performance.md +48 -0
  155. package/scripts/skills/pr-quality.md +49 -0
  156. package/scripts/skills/product-thinking.md +43 -0
  157. package/scripts/skills/security-audit.md +49 -0
  158. package/scripts/skills/systematic-debugging.md +40 -0
  159. package/scripts/skills/testing-strategy.md +47 -0
  160. package/scripts/skills/two-stage-review.md +52 -0
  161. package/scripts/skills/validation-thoroughness.md +55 -0
  162. package/scripts/sw +9 -3
  163. package/scripts/sw-activity.sh +9 -2
  164. package/scripts/sw-adaptive.sh +2 -1
  165. package/scripts/sw-adversarial.sh +2 -1
  166. package/scripts/sw-architecture-enforcer.sh +3 -1
  167. package/scripts/sw-auth.sh +12 -2
  168. package/scripts/sw-autonomous.sh +5 -1
  169. package/scripts/sw-changelog.sh +4 -1
  170. package/scripts/sw-checkpoint.sh +2 -1
  171. package/scripts/sw-ci.sh +5 -1
  172. package/scripts/sw-cleanup.sh +4 -26
  173. package/scripts/sw-code-review.sh +10 -4
  174. package/scripts/sw-connect.sh +2 -1
  175. package/scripts/sw-context.sh +2 -1
  176. package/scripts/sw-cost.sh +48 -3
  177. package/scripts/sw-daemon.sh +66 -9
  178. package/scripts/sw-dashboard.sh +3 -1
  179. package/scripts/sw-db.sh +59 -16
  180. package/scripts/sw-decide.sh +8 -2
  181. package/scripts/sw-decompose.sh +360 -17
  182. package/scripts/sw-deps.sh +4 -1
  183. package/scripts/sw-developer-simulation.sh +4 -1
  184. package/scripts/sw-discovery.sh +325 -2
  185. package/scripts/sw-doc-fleet.sh +4 -1
  186. package/scripts/sw-docs-agent.sh +3 -1
  187. package/scripts/sw-docs.sh +2 -1
  188. package/scripts/sw-doctor.sh +453 -2
  189. package/scripts/sw-dora.sh +4 -1
  190. package/scripts/sw-durable.sh +4 -3
  191. package/scripts/sw-e2e-orchestrator.sh +17 -16
  192. package/scripts/sw-eventbus.sh +7 -1
  193. package/scripts/sw-evidence.sh +364 -12
  194. package/scripts/sw-feedback.sh +550 -9
  195. package/scripts/sw-fix.sh +20 -1
  196. package/scripts/sw-fleet-discover.sh +6 -2
  197. package/scripts/sw-fleet-viz.sh +4 -1
  198. package/scripts/sw-fleet.sh +5 -1
  199. package/scripts/sw-github-app.sh +16 -3
  200. package/scripts/sw-github-checks.sh +3 -2
  201. package/scripts/sw-github-deploy.sh +3 -2
  202. package/scripts/sw-github-graphql.sh +18 -7
  203. package/scripts/sw-guild.sh +5 -1
  204. package/scripts/sw-heartbeat.sh +5 -30
  205. package/scripts/sw-hello.sh +67 -0
  206. package/scripts/sw-hygiene.sh +6 -1
  207. package/scripts/sw-incident.sh +265 -1
  208. package/scripts/sw-init.sh +18 -2
  209. package/scripts/sw-instrument.sh +10 -2
  210. package/scripts/sw-intelligence.sh +42 -6
  211. package/scripts/sw-jira.sh +5 -1
  212. package/scripts/sw-launchd.sh +2 -1
  213. package/scripts/sw-linear.sh +4 -1
  214. package/scripts/sw-logs.sh +4 -1
  215. package/scripts/sw-loop.sh +432 -1128
  216. package/scripts/sw-memory.sh +356 -2
  217. package/scripts/sw-mission-control.sh +6 -1
  218. package/scripts/sw-model-router.sh +481 -26
  219. package/scripts/sw-otel.sh +13 -4
  220. package/scripts/sw-oversight.sh +14 -5
  221. package/scripts/sw-patrol-meta.sh +334 -0
  222. package/scripts/sw-pipeline-composer.sh +5 -1
  223. package/scripts/sw-pipeline-vitals.sh +2 -1
  224. package/scripts/sw-pipeline.sh +53 -2664
  225. package/scripts/sw-pm.sh +12 -5
  226. package/scripts/sw-pr-lifecycle.sh +2 -1
  227. package/scripts/sw-predictive.sh +7 -1
  228. package/scripts/sw-prep.sh +185 -2
  229. package/scripts/sw-ps.sh +5 -25
  230. package/scripts/sw-public-dashboard.sh +15 -3
  231. package/scripts/sw-quality.sh +2 -1
  232. package/scripts/sw-reaper.sh +8 -25
  233. package/scripts/sw-recruit.sh +156 -2303
  234. package/scripts/sw-regression.sh +19 -12
  235. package/scripts/sw-release-manager.sh +3 -1
  236. package/scripts/sw-release.sh +4 -1
  237. package/scripts/sw-remote.sh +3 -1
  238. package/scripts/sw-replay.sh +7 -1
  239. package/scripts/sw-retro.sh +158 -1
  240. package/scripts/sw-review-rerun.sh +3 -1
  241. package/scripts/sw-scale.sh +10 -3
  242. package/scripts/sw-security-audit.sh +6 -1
  243. package/scripts/sw-self-optimize.sh +6 -3
  244. package/scripts/sw-session.sh +9 -3
  245. package/scripts/sw-setup.sh +3 -1
  246. package/scripts/sw-stall-detector.sh +406 -0
  247. package/scripts/sw-standup.sh +15 -7
  248. package/scripts/sw-status.sh +3 -1
  249. package/scripts/sw-strategic.sh +4 -1
  250. package/scripts/sw-stream.sh +7 -1
  251. package/scripts/sw-swarm.sh +18 -6
  252. package/scripts/sw-team-stages.sh +13 -6
  253. package/scripts/sw-templates.sh +5 -29
  254. package/scripts/sw-testgen.sh +7 -1
  255. package/scripts/sw-tmux-pipeline.sh +4 -1
  256. package/scripts/sw-tmux-role-color.sh +2 -0
  257. package/scripts/sw-tmux-status.sh +1 -1
  258. package/scripts/sw-tmux.sh +3 -1
  259. package/scripts/sw-trace.sh +3 -1
  260. package/scripts/sw-tracker-github.sh +3 -0
  261. package/scripts/sw-tracker-jira.sh +3 -0
  262. package/scripts/sw-tracker-linear.sh +3 -0
  263. package/scripts/sw-tracker.sh +3 -1
  264. package/scripts/sw-triage.sh +2 -1
  265. package/scripts/sw-upgrade.sh +3 -1
  266. package/scripts/sw-ux.sh +5 -2
  267. package/scripts/sw-webhook.sh +3 -1
  268. package/scripts/sw-widgets.sh +3 -1
  269. package/scripts/sw-worktree.sh +15 -3
  270. package/scripts/test-skill-injection.sh +1233 -0
  271. package/templates/pipelines/autonomous.json +27 -3
  272. package/templates/pipelines/cost-aware.json +34 -8
  273. package/templates/pipelines/deployed.json +12 -0
  274. package/templates/pipelines/enterprise.json +12 -0
  275. package/templates/pipelines/fast.json +6 -0
  276. package/templates/pipelines/full.json +27 -3
  277. package/templates/pipelines/hotfix.json +6 -0
  278. package/templates/pipelines/standard.json +12 -0
  279. package/templates/pipelines/tdd.json +12 -0
@@ -0,0 +1,275 @@
1
+ ================================================================================
2
+ AUTONOMOUS CODING SYSTEMS: CUTTING EDGE RESEARCH SUMMARY (APRIL 2026)
3
+ ================================================================================
4
+
5
+ RESEARCH CONDUCTED:
6
+ - 50+ web searches across 8 major research areas
7
+ - 25+ arXiv papers and conference proceedings (NeurIPS 2024, POPL 2026, ICLR 2026)
8
+ - 15+ GitHub repositories (SWE-agent, DeepSeek-R1, Claude, Aider)
9
+ - 10+ industry reports (BCG Platinion, Anthropic, Meta, Google, OpenAI)
10
+ - 5+ competitive benchmarks (SWE-bench, Codeforces, AIME)
11
+
12
+ RESEARCH AREAS COVERED:
13
+ 1. Autonomous Loop Patterns & Convergence Detection
14
+ 2. Dark Factory / Lights-Out Delivery
15
+ 3. Reinforcement Learning for Code Generation
16
+ 4. Long-Context Agent Memory & Episodic Traces
17
+ 5. Formal Verification & Specification-Driven Pipeline
18
+ 6. Test Generation with Mutation Testing
19
+ 7. Cost-Optimized Model Routing & Cascading
20
+ 8. Self-Healing CI/CD & AIOps
21
+ 9. Multi-Agent Orchestration & Coordination
22
+ 10. Reasoning-First Code Generation (Extended/Adaptive Thinking)
23
+
24
+ ================================================================================
25
+ KEY FINDINGS
26
+ ================================================================================
27
+
28
+ DARK FACTORY ERA (SOTA):
29
+ - BCG Platinion: 3-5 engineers running fully autonomous factories
30
+ - Spotify: 650+ AI-generated PRs/month, 90% faster migrations
31
+ - OpenAI: 1M-line product in 5 months with 3 engineers, no manual code
32
+ - Two critical disciplines: (1) Harness Engineering, (2) Intent Thinking
33
+
34
+ REASONING MODELS (FRONTIERS):
35
+ - OpenAI o1-pro: 200K context, 100K output tokens, $150/$600 per 1M input/output tokens
36
+ • 86% on AIME (vs 78% o1), 89th percentile Codeforces
37
+ - DeepSeek-R1: Pure RL, 2,029 Codeforces Elo (Candidate Master)
38
+ • 671B capacity @ 37B inference cost via Mixture of Experts
39
+ - Claude Opus 4.6: Adaptive thinking (replaces extended thinking)
40
+ • Dynamically allocates reasoning budget by task difficulty
41
+
42
+ MULTI-AGENT STATE (2026):
43
+ - 40% of enterprise apps will have agentic AI (up from <5% in 2025)
44
+ - Standard 3-role pattern: Planner, Worker, Judge
45
+ - Git worktrees → standard isolation mechanism
46
+ - Google DORA 2025: 20-30% faster workflows, but a 9% climb in bug rate
47
+
48
+ REINFORCEMENT LEARNING ADVANCES:
49
+ - Meta ACH: 9,095 mutants + 571 tests on 10,795 Android classes
50
+ - FunPRM: Functions as PRM steps → 15-20% better completion
51
+ - SecCoderX: Vulnerability reward model + secure code RL
52
+ - Policy learning is converging on a PPO-style pipeline: preference data → reward model → policy optimization
53
+
54
+ MEMORY SYSTEMS (2026):
55
+ - Mem0: Mature hybrid storage (Postgres episodic + semantic)
56
+ - EM-LLM: Bayesian surprise + graph refinement for event segmentation
57
+ - Active compression: Consolidate 10 episodes → semantic facts
58
+ - Multi-layer: Episodic (events), Semantic (facts), Working (context)
59
+
60
+ COST OPTIMIZATION:
61
+ - Google Speculative Cascades: 30-60% cost reduction via cascade routing
62
+ - Open-source cascading: 92% cost savings on benchmarks
63
+ - Unified routing + cascading: Theoretically optimal framework
64
+ - Haiku-first, escalate to Sonnet → Opus only on failure
65
+
66
+ SELF-HEALING CI/CD:
67
+ - Agentic SRE: Telemetry → reasoning → controlled automation (closed loop)
68
+ - 60% enterprise adoption of self-healing infrastructure (Gartner 2026)
69
+ - Pipeline Doctor pattern: Repair agent reads logs, commits fixes
70
+ - 67% MTTR drop with AIOps; 40-60% in high-performing orgs
71
+
72
+ BENCHMARKS & COMPETITIVE POSITION:
73
+ - SWE-bench Verified: DEPRECATED (training contamination discovered)
74
+ - SWE-bench Pro: 1,865 tasks across 41 repos (NEW STANDARD)
75
+ - Claude Code: 80.9% on SWE-bench (highest reported)
76
+ - Aider: 49.2% SWE-Verified, 4.2x fewer tokens than Claude Code
77
+ - Cline: 500K+ downloads; VS Code integration; multi-model support
78
+
79
+ ================================================================================
80
+ SHIPWRIGHT COMPETITIVE ANALYSIS
81
+ ================================================================================
82
+
83
+ SHIPWRIGHT STRENGTHS (DIFFERENTIATED):
84
+ ✓ RL Architecture (multi-signal rewards, bandit selection, policy learning)
85
+ ✓ 12-Stage Pipeline with quality gates + evidence capture
86
+ ✓ Multi-Agent Fleet (5+ specialized agents, worktree isolation)
87
+ ✓ Cost Intelligence (budget tracking, model routing, DORA metrics)
88
+ ✓ Memory System (cross-session learning, failure patterns)
89
+ ✓ CI Integration (GitHub Actions, Checks API, Deployments API)
90
+ ✓ Daemon + Auto-Scaling (worker pool, load balancing)
91
+ ✓ 121+ Test Suites (80% script coverage)
92
+
93
+ SHIPWRIGHT GAPS (vs SOTA):
94
+ ✗ Loop convergence detection (heuristic, not formal regime analysis)
95
+ ✗ Intent Specification Engine (no intent → outcome transformation)
96
+ ✗ Vulnerability-aware RL (security signals not in reward model)
97
+ ✗ Episodic memory (pattern-based, not execution-trace-based)
98
+ ✗ Formal verification (tests only, no Dafny/Lean integration)
99
+ ✗ Mutation testing feedback (no mutant generation/killing loops)
100
+ ✗ Speculative cascading (fixed model routing, no escalation)
101
+ ✗ CI repair agent (no automated flaky test fixes)
102
+ ✗ Explicit conflict resolution (no file-level locks or DAG scheduling)
103
+ ✗ Active memory compression (unbounded context growth)
104
+
105
+ POSITIONING:
106
+ - SWE-agent: Single-agent, custom ACI, best repository navigation
107
+ - Claude Code: Highest SWE-bench score (80.9%), but single-threaded
108
+ - Aider: Most cost-efficient (4.2x fewer tokens), git-native
109
+ - GitHub Copilot Agent: Closing dark factory gap via Project Padawan
110
+ - Shipwright: Unique as PLATFORM for multi-agent factories + RL optimization
111
+
112
+ ================================================================================
113
+ 20-ITEM BACKLOG: RANKED BY IMPACT/EFFORT RATIO
114
+ ================================================================================
115
+
116
+ TIER 1 (EXCEPTIONAL ROI - IMPLEMENT IMMEDIATELY):
117
+
118
+ #1 [Medium effort] Semantic trajectory analysis + convergence detection
119
+ → 25-40% iteration waste reduction; early exit on stuck loops
120
+ Impact: Foundational for cost optimization
121
+
122
+ #2 [High effort] Intent Specification Engine (business → outcomes)
123
+ → 40-60% design time reduction; enables 3-5 person factories
124
+ Impact: Strategic, SOTA dark factory capability
125
+
126
+ #3 [Medium effort] Vulnerability Reward Model + online RL hardening
127
+ → 30-40% security issue reduction; compliance-ready
128
+ Impact: Security-hardened autonomous pipelines
129
+
130
+ #5 [Medium effort] Speculative Cascade Model Routing (Haiku → Sonnet → Opus)
131
+ → 40-60% cost reduction on median tasks; same quality on hard
132
+ Impact: Immediate cost leverage; proven by Google
133
+
134
+ TIER 2 (HIGH ROI - NEXT PHASE):
135
+
136
+ #4 [High effort] Episodic Memory Layer (execution traces + case-based reasoning)
137
+ → 20-35% faster solutions via episode analogy
138
+ Impact: Unlocks long-horizon learning; self-improvement loop
139
+
140
+ #6 [Medium effort] Mutation Testing Feedback Loop (validate test quality)
141
+ → 30-40% better test effectiveness; catches subtle bugs
142
+ Impact: Quality improvement via mutation score feedback
143
+
144
+ #7 [High effort] CI Repair Agent (automatic fix for flaky tests, timeouts)
145
+ → 50% fewer retries; faster merge times
146
+ Impact: Self-healing + resilience
147
+
148
+ #8 [Medium effort] LLM-as-a-Judge validation stage (secondary reviewer)
149
+ → 10-15% fewer merge regressions
150
+ Impact: Quality gate beyond rule-based checks
151
+
152
+ TIER 3 (MEDIUM ROI - LONGER TERM):
153
+
154
+ #9 [Medium effort] Explicit File Conflict Detection + DAG Scheduling
155
+ → Prevents merge failures; enables parallelism
156
+ Impact: Prevents silent errors in multi-agent workflows
157
+
158
+ #10 [Medium effort] Intelligent Reasoning Budget Allocation
159
+ → 15-25% higher success rate on harder tasks; cheaper on easy tasks
160
+ Impact: Quality + cost optimization on reasoning models
161
+
162
+ #11 [Very high effort] Formal Verification Integration (Dafny/Lean stage)
163
+ → 99.99% confidence on critical code paths
164
+ Impact: High stakes (crypto, payments); niche use case
165
+
166
+ #12 [High effort] Active Context Compression + Semantic Memory Layer
167
+ → Fixes unbounded context bloat; 30% better compression
168
+ Impact: Solves long-session scalability
169
+
170
+ #13 [High effort] Multi-Pass Mutation Generation (LLM-based mutants)
171
+ → Diversified test coverage; Meta-style compliance
172
+ Impact: Better mutation diversity than rule-based
173
+
174
+ #14 [High effort] Anomaly Detection + Predictive Repair (log analysis)
175
+ → Earlier failure prevention; MTTR ↓ 40%
176
+ Impact: Proactive vs reactive
177
+
178
+ #15 [High effort] Cross-Repo Fleet Learning (pattern sharing)
179
+ → 20% faster on new repo types
180
+ Impact: Leverages multi-repo data
181
+
182
+ TIER 4 (LOWER ROI - NICE TO HAVE):
183
+
184
+ #16-20: Quorum merge decisions, privacy mutations, DAG executor, symbol caching,
185
+ WebSocket real-time monitoring
186
+ Impact: Quality, compliance, observability improvements
187
+
188
+ ================================================================================
189
+ IMPLEMENTATION ROADMAP (12 WEEKS)
190
+ ================================================================================
191
+
192
+ PHASE 1: CONVERGENCE & COST (Weeks 1-4)
193
+ ✓ #1 Semantic trajectory analysis for loop regimes
194
+ ✓ #5 Speculative cascade routing
195
+ → #2 Intent Specification Engine (research phase)
196
+
197
+ PHASE 2: SECURITY & TESTING (Weeks 5-8)
198
+ ✓ #3 Vulnerability Reward Model
199
+ ✓ #6 Mutation Testing Loop
200
+ ✓ #13 Multi-Pass Mutation Generation
201
+
202
+ PHASE 3: MEMORY & SELF-HEALING (Weeks 9-12)
203
+ ✓ #4 Episodic Memory Layer
204
+ ✓ #7 CI Repair Agent
205
+ ✓ #8 LLM-as-a-Judge
206
+
207
+ ================================================================================
208
+ KEY SOURCES CITED
209
+ ================================================================================
210
+
211
+ BENCHMARKS:
212
+ - SWE-bench: https://www.vals.ai/benchmarks/swebench
213
+ - SWE-bench Pro: https://scale.com/blog/swe-bench-pro
214
+ - Codeforces: https://codeforces.com/
215
+
216
+ PAPERS (2024-2026):
217
+ - SWE-agent NeurIPS 2024: https://arxiv.org/abs/2405.15793
218
+ - Geometric Dynamics of Agentic Loops: https://arxiv.org/abs/2512.10350
219
+ - DafnyPro POPL 2026: https://popl26.sigplan.org/
220
+ - FunPRM Process Rewards: https://arxiv.org/abs/2601.22249
221
+ - DeepSeek-R1 Architecture: https://arxiv.org/abs/2501.12948
222
+ - Active Context Compression: https://arxiv.org/abs/2601.07190
223
+ - Episodic Memory for LLMs: https://arxiv.org/abs/2407.09450
224
+
225
+ INDUSTRY REPORTS:
226
+ - BCG Platinion Dark Software Factory:
227
+ https://www.bcgplatinion.com/insights/the-dark-software-factory
228
+ - Anthropic 2026 Agentic Coding Trends Report:
229
+ https://resources.anthropic.com/hubfs/2026%20Agentic%20Coding%20Trends%20Report.pdf
230
+ - GitHub Copilot Agent Mode:
231
+ https://github.com/newsroom/press-releases/agent-mode
232
+ - Meta Engineering Blog (LLM-powered bug catchers):
233
+ https://engineering.fb.com/2025/02/05/security/
234
+ - Google Speculative Cascades:
235
+ https://research.google/blog/speculative-cascades-a-hybrid-approach-for-smarter-faster-llm-inference/
236
+
237
+ MODELS & SYSTEMS:
238
+ - Claude Opus 4.6: https://platform.claude.com
239
+ - OpenAI o1 family (o1-preview announcement; o1-pro builds on it): https://openai.com/index/introducing-openai-o1-preview/
240
+ - DeepSeek-R1: https://github.com/deepseek-ai/DeepSeek-R1
241
+ - SWE-agent: https://github.com/SWE-agent/SWE-agent
242
+ - Aider: https://github.com/Aider-AI/aider
243
+ - Cline: https://github.com/cline/cline
244
+
245
+ ================================================================================
246
+ CONCLUSION
247
+ ================================================================================
248
+
249
+ Shipwright is positioned as the PLATFORM-GRADE autonomous software factory —
250
+ the right abstraction level between human intent and shipped code. The next wave
251
+ of differentiation comes from:
252
+
253
+ 1. PREDICTIVE INTELLIGENCE (convergence detection, loop regimes)
254
+ → Cost & time reduction for iteration-heavy tasks
255
+
256
+ 2. LEARNING ACROSS EPISODES (episodic memory)
257
+ → Faster solutions on similar problems via case-based analogy
258
+
259
+ 3. FORMAL GUARANTEES (verification, formal specs)
260
+ → Safety/compliance for critical code paths
261
+
262
+ 4. SELF-HEALING (CI repair, automated fixes)
263
+ → Resilience and reduced human intervention
264
+
265
+ The 20-item backlog reflects industry momentum and fills Shipwright's remaining
266
+ gaps. Implementation order prioritizes highest ROI (cost, learning, quality).
267
+
268
+ Expected outcomes over 12 weeks:
269
+ - 40-60% cost reduction via cascading + convergence detection
270
+ - 30-40% security improvement via vulnerability-aware RL
271
+ - 20-35% faster solutions via episodic memory
272
+ - 50% fewer CI retries via repair agent
273
+ - Positioned as SOTA platform for dark factory era
274
+
275
+ ================================================================================
@@ -0,0 +1,341 @@
1
+ # Pipeline Quality Revolution — Design Spec
2
+
3
+ **Date**: 2026-03-10
4
+ **Status**: Approved
5
+ **Goal**: Close 6 quality gaps and lay an AI-readiness foundation so that autonomous agents deliver higher-quality output than humans
6
+
7
+ ## Problem
8
+
9
+ Shipwright's pipeline infrastructure works — it polls issues, spawns agents, runs builds, creates PRs. But the _quality_ of output is mediocre because:
10
+
11
+ 1. Plans are optimistic checklists without adversarial thinking
12
+ 2. Definition of Done is written by the same agent that implements (fox/henhouse)
13
+ 3. Agents don't understand _why_ they're building something
14
+ 4. No feedback loop from PR review quality or post-merge outcomes
15
+ 5. Code review is advisory, not adversarial — agents rubber-stamp
16
+ 6. No scope discipline — PRs balloon to 16K+ lines
17
+
18
+ And underneath all of this: repos aren't "AI-ready." The daemon has no project-specific quality standards.
19
+
20
+ ## Architecture
21
+
22
+ ```
23
+ PREP (AI-Readiness Foundation) ──────────────────────
24
+ │ Interactive quality profile dialogue
25
+ │ → .claude/quality-profile.json
26
+ │ → enriched CLAUDE.md
27
+
28
+ INTAKE ──────────────────────────────────────────────
29
+ │ Intent analysis: WHO/WHAT/WHY/HOW/NOT
30
+ │ → .claude/pipeline-artifacts/acceptance-criteria.json
31
+
32
+ PLAN ────────────────────────────────────────────────
33
+ │ Constrained by external acceptance criteria
34
+ │ Mandatory failure mode analysis section
35
+ │ → plan.md (with "Files to Modify" for scope tracking)
36
+
37
+ BUILD ───────────────────────────────────────────────
38
+ │ "never_ship" rules injected every iteration
39
+ │ Scope tracking: planned vs actual files
40
+ │ Quality rules from learned patterns
41
+
42
+ REVIEW ──────────────────────────────────────────────
43
+ │ Adversarial-by-default, must find 3+ issues
44
+ │ Bugs block (not just criticals)
45
+ │ Scope creep flagged from plan diff
46
+
47
+ COMPOUND_QUALITY ────────────────────────────────────
48
+ │ Machine-verifiable DoD scorecard
49
+ │ Each acceptance criterion: PASS/FAIL with evidence
50
+
51
+ PR ──────────────────────────────────────────────────
52
+ │ PR size gate (configurable, default 500 lines)
53
+
54
+ POST-MERGE ──────────────────────────────────────────
55
+ Review comment capture → memory
56
+ Merge quality score tracking
57
+ Auto-generated quality rules from patterns
58
+ ```
59
+
60
+ ## Component Designs
61
+
62
+ ### Component 1: Quality Profile (`quality-profile.json`)
63
+
64
+ The keystone. Every pipeline stage reads this to calibrate behavior to the project.
65
+
66
+ **Schema:**
67
+
68
+ ```json
69
+ {
70
+ "version": 1,
71
+ "project_name": "string",
72
+ "generated_at": "ISO-8601",
73
+ "architecture": {
74
+ "pattern": "monolith|modular_monolith|microservices|serverless|library",
75
+ "layers": ["string"],
76
+ "dependency_direction": "inward|none",
77
+ "rules": ["string — architectural constraints"]
78
+ },
79
+ "testing": {
80
+ "philosophy": "tdd|test_after|coverage_target|manual",
81
+ "min_coverage_delta": 0,
82
+ "required_test_types": ["unit", "integration", "e2e"],
83
+ "test_cmd": "string",
84
+ "fast_test_cmd": "string"
85
+ },
86
+ "quality": {
87
+ "max_pr_lines": 500,
88
+ "max_files_per_pr": 15,
89
+ "never_ship": ["string — absolute rules"],
90
+ "always_require": ["string — positive requirements"],
91
+ "learned_rules": [
92
+ {
93
+ "rule": "string",
94
+ "source": "string — how this was learned",
95
+ "confidence": "number — 0.0 to 1.0",
96
+ "created_at": "ISO-8601",
97
+ "inject_at": ["plan", "build", "review"]
98
+ }
99
+ ]
100
+ },
101
+ "review": {
102
+ "focus_areas": ["string"],
103
+ "blocking_severities": ["critical", "bug", "security"],
104
+ "min_issues_to_find": 3
105
+ },
106
+ "scope": {
107
+ "unplanned_files_block": false,
108
+ "decomposition_threshold_lines": 500
109
+ },
110
+ "deployment": {
111
+ "strategy": "direct|preview_then_production|staged_rollout",
112
+ "rollback_plan": "revert_commit|feature_flag|manual",
113
+ "monitoring_window_minutes": 30
114
+ }
115
+ }
116
+ ```
117
+
118
+ **Generation**: `shipwright prep --interactive` runs a guided dialogue that analyzes repo structure, configs, tests, and CI, then asks 5-7 targeted questions. `shipwright prep --auto` skips the dialogue and infers the profile from repo analysis alone, attaching confidence scores to the inferred values.
119
+
120
+ **Location**: `.claude/quality-profile.json` (checked into repo, grows over time)
121
+
122
+ ### Component 2: Intent Analysis (Intake Stage Enhancement)
123
+
124
+ **Trigger**: Runs in `stage_intake()` after issue metadata is fetched, before plan stage.
125
+
126
+ **Prompt template**:
127
+
128
+ ```
129
+ Analyze this issue deeply before any implementation planning.
130
+
131
+ Issue: {title}
132
+ Body: {body}
133
+ Labels: {labels}
134
+
135
+ Project architecture: {quality_profile.architecture}
136
+
137
+ Produce a structured analysis:
138
+
139
+ 1. WHO benefits? (end user / developer / ops / CI)
140
+ 2. WHAT changes? (concrete before→after behavior, with examples)
141
+ 3. WHY does this matter? (pain solved / capability unlocked)
142
+ 4. HOW will we know it worked? (observable signals — specific, testable)
143
+ 5. WHAT SHOULD WE NOT DO? (explicit out-of-scope boundaries)
144
+ 6. ACCEPTANCE CRITERIA: 3-7 machine-verifiable criteria
145
+
146
+ Output JSON to acceptance-criteria.json matching this schema:
147
+ {schema}
148
+ ```
149
+
150
+ **Output**: `acceptance-criteria.json` saved to pipeline artifacts. Passed to plan stage as input constraint.
151
+
152
+ **Key design decision**: Intent analysis runs as a _separate Claude session_ from planning. The analyst defines "what success looks like." The planner figures out "how to get there."
153
+
154
+ ### Component 3: Adversarial Plan Validation (Plan Stage Enhancement)
155
+
156
+ **Trigger**: After plan is generated, before plan validation gate.
157
+
158
+ **Injected into plan prompt**:
159
+
160
+ ```
161
+ After your implementation plan, include a MANDATORY section:
162
+
163
+ ## Failure Mode Analysis
164
+ For each major component or decision:
165
+ 1. Runtime failures: What happens when dependencies are unavailable?
166
+ 2. Concurrency risks: Race conditions, stale state, duplicate processing?
167
+ 3. Scale risks: 10x data, slow external deps, memory pressure?
168
+ 4. Rollback story: Can we revert safely without data loss?
169
+
170
+ Project architecture rules to consider:
171
+ {quality_profile.architecture.rules}
172
+
173
+ You MUST identify at least 3 concrete failure modes.
174
+ Address the most critical one in your implementation plan.
175
+ ```
176
+
177
+ **Validation gate addition**: New rejection reason `missing_failure_analysis` — the plan is rejected if the failure mode section is empty, lists fewer than 3 failure modes, or contains only generic platitudes (detected by the absence of project-specific references such as file, module, or dependency names).
178
+
179
+ ### Component 4: Scope Enforcement (Build + PR Stage Enhancement)
180
+
181
+ **A. Planned files tracking (build stage)**:
182
+ Extract "Files to Modify" from `plan.md` at build start. After each iteration, compare `git diff --name-only` against planned files. Log unplanned files to `scope-report.json`.
183
+
184
+ **B. PR size gate (PR stage)**:
185
+
186
+ ```bash
187
+ total_lines=$(git diff --stat origin/main...HEAD | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+')
188
+ max_lines=$(jq -r '.quality.max_pr_lines // 500' "$QUALITY_PROFILE")
189
+ if [[ "$total_lines" -gt "$max_lines" ]]; then
190
+ error "PR is ${total_lines} lines (max: ${max_lines}). Decompose into smaller PRs."
191
+ exit 1
192
+ fi
193
+ ```
194
+
195
+ **C. Scope report injection into review**:
196
+ The review stage receives `scope-report.json` listing unplanned files. Reviewer must justify or flag each.
197
+
198
+ ### Component 5: Adversarial Review (Review Stage Enhancement)
199
+
200
+ **New review prompt** (replaces current generic prompt):
201
+
202
+ ```
203
+ You are a SKEPTICAL senior engineer reviewing code for production.
204
+ Your job is to FIND PROBLEMS, not confirm quality.
205
+
206
+ Project standards (from quality-profile.json):
207
+ - Never ship: {quality_profile.quality.never_ship}
208
+ - Always require: {quality_profile.quality.always_require}
209
+ - Focus areas: {quality_profile.review.focus_areas}
210
+ - Learned rules: {quality_profile.quality.learned_rules}
211
+
212
+ Definition of Done (from acceptance-criteria.json):
213
+ {acceptance_criteria}
214
+
215
+ Scope report (planned vs actual files):
216
+ {scope_report}
217
+
218
+ Rules:
219
+ 1. Find at least {min_issues_to_find} issues. If truly zero issues exist,
220
+ write a paragraph explaining why this code is exceptional.
221
+ 2. Rate each: Critical / Bug / Security / Warning / Suggestion
222
+ 3. Check EVERY acceptance criterion — mark PASS/FAIL with evidence.
223
+ 4. Flag every unplanned file — justify or mark as scope creep.
224
+ 5. Check every "never_ship" rule — cite violations with line numbers.
225
+ ```
226
+
227
+ **Blocking change**: Gate condition becomes `critical_count + bug_count + security_count > 0` (bugs now block).
228
+
229
+ ### Component 6: Machine-Verifiable DoD Scorecard (Compound Quality Enhancement)
230
+
231
+ **Computed checks** (no LLM needed):
232
+
233
+ ```json
234
+ {
235
+ "scorecard": {
236
+ "pr_size": { "status": "pass", "value": 247, "limit": 500 },
237
+ "test_count_delta": { "status": "pass", "value": 12, "baseline": 0 },
238
+ "coverage_delta": { "status": "pass", "value": 2.1, "min": 0 },
239
+ "lint_warnings_delta": { "status": "pass", "value": 0, "max": 0 },
240
+ "planned_files_coverage": {
241
+ "status": "pass",
242
+ "planned": 5,
243
+ "touched": 5,
244
+ "unplanned": 1
245
+ },
246
+ "never_ship_violations": { "status": "pass", "violations": [] },
247
+ "acceptance_criteria": [
248
+ {
249
+ "id": "ac-1",
250
+ "status": "pass",
251
+ "evidence": "GET /api/users returns 200 in test output"
252
+ },
253
+ {
254
+ "id": "ac-2",
255
+ "status": "fail",
256
+ "evidence": "No test for 401 response on invalid token"
257
+ }
258
+ ]
259
+ },
260
+ "overall": "fail",
261
+ "blocking_failures": ["ac-2"]
262
+ }
263
+ ```
264
+
265
+ Machine checks run first. LLM-based checks (adversarial, negative testing) run only if machine checks pass. This is faster and cheaper.
266
+
267
+ ### Component 7: Outcome Feedback Loop (Post-Merge Enhancement)
268
+
269
+ **A. PR Review Capture** (new function in `sw-feedback.sh`):
270
+
271
+ ```bash
272
+ capture_review_feedback() {
273
+ local pr_number="$1"
274
+ local reviews=$(gh api "repos/{owner}/{repo}/pulls/${pr_number}/reviews" --jq '.[].body')
275
+ local comments=$(gh api "repos/{owner}/{repo}/pulls/${pr_number}/comments" --jq '.[].body')
276
+ # Store in memory with type "review_feedback"
277
+ # Extract patterns for quality rule generation
278
+ }
279
+ ```
280
+
281
+ **B. Merge Quality Score** (new function in `sw-feedback.sh`):
282
+ Track per-PR: `clean_merge` (+1), `changes_requested` (-1), `reverted` (-3), `regression` (-2). Rolling average → pipeline quality score in DORA dashboard.
283
+
284
+ **C. Quality Rule Auto-Generation** (new function in `sw-memory.sh`):
285
+ When the same review pattern appears 3+ times, generate a quality rule and append it to the `quality.learned_rules` array in `quality-profile.json`. Each rule is injected into the stages listed in its `inject_at` field (plan, build, and/or review).
286
+
287
+ ## Data Flow
288
+
289
+ ```
290
+ quality-profile.json ──→ ALL STAGES (standards calibration)
291
+
292
+ ├──→ intake: intent analysis prompt
293
+ ├──→ plan: failure mode analysis prompt + acceptance criteria constraints
294
+ ├──→ build: never_ship rules + learned quality rules
295
+ ├──→ review: focus areas + blocking rules + scope report
296
+ ├──→ compound_quality: machine check thresholds
297
+ └──→ pr: size limits
298
+
299
+ acceptance-criteria.json ──→ plan (constraints) → review (checklist) → compound_quality (scorecard)
300
+ scope-report.json ──→ review (scope creep detection) → pr (size gate)
301
+ dod-scorecard.json ──→ compound_quality output → pr gate
302
+ merge-quality.jsonl ──→ feedback → quality-profile.json (learned_rules)
303
+ ```
304
+
305
+ ## Testing Strategy
306
+
307
+ The components are covered by the following test scripts:
308
+
309
+ - `sw-quality-profile-test.sh` — profile generation, schema validation, merge with learned rules
310
+ - `sw-intent-analysis-test.sh` — acceptance criteria extraction, JSON schema compliance
311
+ - `sw-scope-enforcement-test.sh` — planned vs actual file tracking, PR size gate
312
+ - `sw-adversarial-review-test.sh` — minimum issue finding, blocking behavior, scope creep detection
313
+ - `sw-dod-scorecard-test.sh` — machine check computation, pass/fail logic
314
+ - `sw-outcome-feedback-test.sh` — review capture, quality score, rule auto-generation
315
+
316
+ Integration: `sw-pipeline-test.sh` gains tests for quality profile flow through all stages.
317
+
318
+ ## Files to Create
319
+
320
+ | File | Purpose |
321
+ | ---------------------------------- | ----------------------------------------------------- |
322
+ | `scripts/lib/quality-profile.sh` | Profile loading, validation, merge with learned rules |
323
+ | `scripts/lib/intent-analysis.sh` | Issue intent analysis, acceptance criteria generation |
324
+ | `scripts/lib/scope-enforcement.sh` | Planned vs actual file tracking, PR size gate |
325
+ | `scripts/lib/dod-scorecard.sh` | Machine-verifiable DoD computation |
326
+ | `scripts/lib/outcome-feedback.sh` | Review capture, quality score, rule generation |
327
+
328
+ ## Files to Modify
329
+
330
+ | File | Changes |
331
+ | ---------------------------------------------------------- | ----------------------------------------------------------- |
332
+ | `scripts/lib/pipeline-stages-intake.sh` | Add intent analysis step, generate acceptance-criteria.json |
333
+ | `scripts/lib/pipeline-stages-intake.sh` (plan section) | Inject failure mode analysis, consume acceptance criteria |
334
+ | `scripts/lib/pipeline-stages-build.sh` | Inject never_ship rules, scope tracking |
335
+ | `scripts/lib/pipeline-stages-review.sh` | New adversarial review prompt, bug-blocking, scope report |
336
+ | `scripts/lib/pipeline-stages-review.sh` (compound_quality) | Machine DoD scorecard before LLM checks |
337
+ | `scripts/lib/pipeline-stages-delivery.sh` | PR size gate |
338
+ | `scripts/sw-prep.sh` | Interactive quality profile generation |
339
+ | `scripts/sw-feedback.sh` | PR review capture, merge quality score |
340
+ | `scripts/sw-memory.sh` | Quality rule auto-generation from patterns |
341
+ | `scripts/sw-pipeline-test.sh` | Integration tests for quality profile flow |
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "shipwright-cli",
3
- "version": "3.2.0",
3
+ "version": "3.3.0",
4
4
  "description": "Orchestrate autonomous Claude Code agent teams in tmux",
5
5
  "bin": {
6
6
  "shipwright": "scripts/sw",
@@ -36,7 +36,7 @@
36
36
  "dashboard:test": "vitest run --config dashboard/vitest.config.ts",
37
37
  "dashboard:test:watch": "vitest --config dashboard/vitest.config.ts",
38
38
  "dashboard:test:coverage": "vitest run --config dashboard/vitest.config.ts --coverage",
39
- "test": "bash scripts/sw-agi-roadmap-test.sh && bash scripts/sw-activity-test.sh && bash scripts/sw-adaptive-test.sh && bash scripts/sw-adversarial-test.sh && bash scripts/sw-architecture-enforcer-test.sh && bash scripts/sw-auth-test.sh && bash scripts/sw-autonomous-test.sh && bash scripts/sw-changelog-test.sh && bash scripts/sw-checkpoint-test.sh && bash scripts/sw-ci-test.sh && bash scripts/sw-cleanup-test.sh && bash scripts/sw-code-review-test.sh && bash scripts/sw-connect-test.sh && bash scripts/sw-context-test.sh && bash scripts/sw-cost-test.sh && bash scripts/sw-daemon-test.sh && bash scripts/sw-dashboard-test.sh && bash scripts/sw-db-test.sh && bash scripts/sw-decompose-test.sh && bash scripts/sw-decide-test.sh && bash scripts/sw-deps-test.sh && bash scripts/sw-developer-simulation-test.sh && bash scripts/sw-discovery-test.sh && bash scripts/sw-doc-fleet-test.sh && bash scripts/sw-docs-agent-test.sh && bash scripts/sw-docs-test.sh && bash scripts/sw-doctor-test.sh && bash scripts/sw-dora-test.sh && bash scripts/sw-durable-test.sh && bash scripts/sw-e2e-orchestrator-test.sh && bash scripts/sw-eventbus-test.sh && bash scripts/sw-feedback-test.sh && bash scripts/sw-fix-test.sh && bash scripts/sw-fleet-discover-test.sh && bash scripts/sw-fleet-test.sh && bash scripts/sw-fleet-viz-test.sh && bash scripts/sw-frontier-test.sh && bash scripts/sw-github-app-test.sh && bash scripts/sw-github-checks-test.sh && bash scripts/sw-github-deploy-test.sh && bash scripts/sw-github-graphql-test.sh && bash scripts/sw-guild-test.sh && bash scripts/sw-heartbeat-test.sh && bash scripts/sw-hygiene-test.sh && bash scripts/sw-incident-test.sh && bash scripts/sw-init-test.sh && bash scripts/sw-instrument-test.sh && bash scripts/sw-intelligence-test.sh && bash scripts/sw-jira-test.sh && bash scripts/sw-launchd-test.sh && bash scripts/sw-linear-test.sh && bash scripts/sw-logs-test.sh && bash scripts/sw-loop-test.sh && bash scripts/sw-memory-test.sh && bash 
scripts/sw-mission-control-test.sh && bash scripts/sw-model-router-test.sh && bash scripts/sw-otel-test.sh && bash scripts/sw-oversight-test.sh && bash scripts/sw-patrol-meta-test.sh && bash scripts/sw-pipeline-composer-test.sh && bash scripts/sw-pipeline-test.sh && bash scripts/sw-pipeline-vitals-test.sh && bash scripts/sw-pm-test.sh && bash scripts/sw-pr-lifecycle-test.sh && bash scripts/sw-predictive-test.sh && bash scripts/sw-prep-test.sh && bash scripts/sw-ps-test.sh && bash scripts/sw-public-dashboard-test.sh && bash scripts/sw-quality-test.sh && bash scripts/sw-reaper-test.sh && bash scripts/sw-recruit-test.sh && bash scripts/sw-regression-test.sh && bash scripts/sw-release-manager-test.sh && bash scripts/sw-release-test.sh && bash scripts/sw-remote-test.sh && bash scripts/sw-replay-test.sh && bash scripts/sw-retro-test.sh && bash scripts/sw-scale-test.sh && bash scripts/sw-security-audit-test.sh && bash scripts/sw-self-optimize-test.sh && bash scripts/sw-session-test.sh && bash scripts/sw-setup-test.sh && bash scripts/sw-standup-test.sh && bash scripts/sw-status-test.sh && bash scripts/sw-strategic-test.sh && bash scripts/sw-stream-test.sh && bash scripts/sw-swarm-test.sh && bash scripts/sw-team-stages-test.sh && bash scripts/sw-templates-test.sh && bash scripts/sw-testgen-test.sh && bash scripts/sw-tmux-pipeline-test.sh && bash scripts/sw-tmux-test.sh && bash scripts/sw-trace-test.sh && bash scripts/sw-tracker-test.sh && bash scripts/sw-triage-test.sh && bash scripts/sw-upgrade-test.sh && bash scripts/sw-ux-test.sh && bash scripts/sw-webhook-test.sh && bash scripts/sw-widgets-test.sh && bash scripts/sw-worktree-test.sh && bash scripts/sw-lib-compat-test.sh && bash scripts/sw-lib-helpers-test.sh && bash scripts/sw-lib-daemon-dispatch-test.sh && bash scripts/sw-lib-daemon-failure-test.sh && bash scripts/sw-lib-daemon-poll-test.sh && bash scripts/sw-lib-daemon-state-test.sh && bash scripts/sw-lib-daemon-triage-test.sh && bash 
scripts/sw-lib-pipeline-detection-test.sh && bash scripts/sw-lib-pipeline-intelligence-test.sh && bash scripts/sw-lib-pipeline-quality-checks-test.sh && bash scripts/sw-lib-pipeline-stages-test.sh && bash scripts/sw-lib-pipeline-state-test.sh && bash scripts/sw-adapters-test.sh && bash scripts/sw-evidence-test.sh && bash scripts/sw-review-rerun-test.sh && bash scripts/sw-tracker-providers-test.sh && bash scripts/sw-budget-chaos-test.sh && bash scripts/sw-autonomous-e2e-test.sh && bash scripts/sw-memory-discovery-e2e-test.sh && bash scripts/sw-policy-e2e-test.sh && bash scripts/sw-e2e-smoke-test.sh && bash scripts/sw-dashboard-e2e-test.sh",
39
+ "test": "bash scripts/sw-agi-roadmap-test.sh && bash scripts/sw-activity-test.sh && bash scripts/sw-adaptive-test.sh && bash scripts/sw-adversarial-test.sh && bash scripts/sw-architecture-enforcer-test.sh && bash scripts/sw-auth-test.sh && bash scripts/sw-autonomous-test.sh && bash scripts/sw-changelog-test.sh && bash scripts/sw-checkpoint-test.sh && bash scripts/sw-ci-test.sh && bash scripts/sw-cleanup-test.sh && bash scripts/sw-code-review-test.sh && bash scripts/sw-connect-test.sh && bash scripts/sw-context-budget-test.sh && bash scripts/sw-context-test.sh && bash scripts/sw-cost-test.sh && bash scripts/sw-daemon-test.sh && bash scripts/sw-dashboard-test.sh && bash scripts/sw-db-test.sh && bash scripts/sw-decompose-test.sh && bash scripts/sw-decide-test.sh && bash scripts/sw-deps-test.sh && bash scripts/sw-developer-simulation-test.sh && bash scripts/sw-discovery-test.sh && bash scripts/sw-doc-fleet-test.sh && bash scripts/sw-docs-agent-test.sh && bash scripts/sw-docs-test.sh && bash scripts/sw-doctor-test.sh && bash scripts/sw-dora-test.sh && bash scripts/sw-durable-test.sh && bash scripts/sw-e2e-orchestrator-test.sh && bash scripts/sw-eventbus-test.sh && bash scripts/sw-feedback-test.sh && bash scripts/sw-outcome-feedback-test.sh && bash scripts/sw-fix-test.sh && bash scripts/sw-fleet-discover-test.sh && bash scripts/sw-fleet-test.sh && bash scripts/sw-fleet-viz-test.sh && bash scripts/sw-frontier-test.sh && bash scripts/sw-github-app-test.sh && bash scripts/sw-github-checks-test.sh && bash scripts/sw-github-deploy-test.sh && bash scripts/sw-github-graphql-test.sh && bash scripts/sw-guild-test.sh && bash scripts/sw-heartbeat-test.sh && bash scripts/sw-hello-test.sh && bash scripts/sw-hygiene-test.sh && bash scripts/sw-incident-test.sh && bash scripts/sw-init-test.sh && bash scripts/sw-instrument-test.sh && bash scripts/sw-intelligence-test.sh && bash scripts/sw-jira-test.sh && bash scripts/sw-launchd-test.sh && bash scripts/sw-linear-test.sh && bash 
scripts/sw-logs-test.sh && bash scripts/sw-loop-test.sh && bash scripts/sw-memory-test.sh && bash scripts/sw-mission-control-test.sh && bash scripts/sw-model-router-test.sh && bash scripts/sw-otel-test.sh && bash scripts/sw-oversight-test.sh && bash scripts/sw-patrol-meta-test.sh && bash scripts/sw-pipeline-composer-test.sh && bash scripts/sw-pipeline-test.sh && bash scripts/sw-pipeline-vitals-test.sh && bash scripts/sw-pm-test.sh && bash scripts/sw-pr-lifecycle-test.sh && bash scripts/sw-predictive-test.sh && bash scripts/sw-prep-test.sh && bash scripts/sw-ps-test.sh && bash scripts/sw-public-dashboard-test.sh && bash scripts/sw-quality-profile-test.sh && bash scripts/sw-quality-test.sh && bash scripts/sw-reaper-test.sh && bash scripts/sw-recruit-test.sh && bash scripts/sw-regression-test.sh && bash scripts/sw-release-manager-test.sh && bash scripts/sw-release-test.sh && bash scripts/sw-root-cause-test.sh && bash scripts/sw-remote-test.sh && bash scripts/sw-replay-test.sh && bash scripts/sw-retro-test.sh && bash scripts/sw-scale-test.sh && bash scripts/sw-stall-detector-test.sh && bash scripts/sw-security-audit-test.sh && bash scripts/sw-self-optimize-test.sh && bash scripts/sw-session-test.sh && bash scripts/sw-setup-test.sh && bash scripts/sw-standup-test.sh && bash scripts/sw-status-test.sh && bash scripts/sw-strategic-test.sh && bash scripts/sw-stream-test.sh && bash scripts/sw-swarm-test.sh && bash scripts/sw-team-stages-test.sh && bash scripts/sw-templates-test.sh && bash scripts/sw-testgen-test.sh && bash scripts/sw-tmux-pipeline-test.sh && bash scripts/sw-tmux-test.sh && bash scripts/sw-trace-test.sh && bash scripts/sw-tracker-test.sh && bash scripts/sw-triage-test.sh && bash scripts/sw-upgrade-test.sh && bash scripts/sw-ux-test.sh && bash scripts/sw-webhook-test.sh && bash scripts/sw-widgets-test.sh && bash scripts/sw-worktree-test.sh && bash scripts/sw-lib-compat-test.sh && bash scripts/sw-lib-helpers-test.sh && bash 
scripts/sw-lib-error-actionability-test.sh && bash scripts/sw-lib-daemon-dispatch-test.sh && bash scripts/sw-lib-daemon-failure-test.sh && bash scripts/sw-lib-daemon-poll-test.sh && bash scripts/sw-lib-daemon-state-test.sh && bash scripts/sw-lib-daemon-triage-test.sh && bash scripts/sw-lib-daemon-patrol-test.sh && bash scripts/sw-lib-pipeline-detection-test.sh && bash scripts/sw-lib-pipeline-intelligence-test.sh && bash scripts/sw-lib-pipeline-quality-checks-test.sh && bash scripts/sw-lib-pipeline-stages-test.sh && bash scripts/sw-lib-pipeline-state-test.sh && bash scripts/sw-adapters-test.sh && bash scripts/sw-evidence-test.sh && bash scripts/sw-review-rerun-test.sh && bash scripts/sw-tracker-providers-test.sh && bash scripts/sw-budget-chaos-test.sh && bash scripts/sw-chaos-test.sh && bash scripts/sw-autonomous-e2e-test.sh && bash scripts/sw-memory-discovery-e2e-test.sh && bash scripts/sw-policy-e2e-test.sh && bash scripts/sw-e2e-smoke-test.sh && bash scripts/sw-dashboard-e2e-test.sh && bash scripts/sw-reward-aggregator-test.sh && bash scripts/sw-bandit-selector-test.sh && bash scripts/sw-policy-learner-test.sh && bash scripts/sw-autoresearch-e2e-test.sh",
40
40
  "test:smoke": "bash scripts/sw-e2e-smoke-test.sh",
41
41
  "test:integration": "bash scripts/sw-e2e-integration-test.sh && bash scripts/sw-e2e-system-test.sh && bash scripts/sw-server-api-test.sh && bash scripts/sw-integration-claude-test.sh",
42
42
  "harness:evidence:capture": "bash scripts/sw-evidence.sh capture",