opengstack 0.14.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +4 -4
- package/CLAUDE.md +127 -110
- package/README.md +10 -5
- package/SKILL.md +500 -70
- package/bin/opengstack.js +69 -69
- package/commands/autoplan.md +7 -9
- package/commands/benchmark.md +84 -91
- package/commands/browse.md +60 -64
- package/commands/canary.md +7 -9
- package/commands/careful.md +2 -2
- package/commands/codex.md +7 -9
- package/commands/connect-chrome.md +7 -9
- package/commands/cso.md +7 -9
- package/commands/design-consultation.md +7 -9
- package/commands/design-review.md +7 -9
- package/commands/design-shotgun.md +7 -9
- package/commands/document-release.md +7 -9
- package/commands/freeze.md +3 -3
- package/commands/guard.md +4 -4
- package/commands/investigate.md +7 -9
- package/commands/land-and-deploy.md +7 -9
- package/commands/office-hours.md +7 -9
- package/commands/{gstack-upgrade.md → opengstack-upgrade.md} +64 -65
- package/commands/plan-ceo-review.md +7 -9
- package/commands/plan-design-review.md +7 -9
- package/commands/plan-eng-review.md +7 -9
- package/commands/qa-only.md +7 -9
- package/commands/qa.md +7 -9
- package/commands/retro.md +7 -9
- package/commands/review.md +7 -9
- package/commands/setup-browser-cookies.md +22 -26
- package/commands/setup-deploy.md +7 -9
- package/commands/ship.md +7 -9
- package/commands/unfreeze.md +7 -7
- package/docs/designs/CHROME_VS_CHROMIUM_EXPLORATION.md +9 -9
- package/docs/designs/CONDUCTOR_CHROME_SIDEBAR_INTEGRATION.md +2 -2
- package/docs/designs/CONDUCTOR_SESSION_API.md +16 -16
- package/docs/designs/DESIGN_SHOTGUN.md +74 -74
- package/docs/designs/DESIGN_TOOLS_V1.md +111 -111
- package/docs/skills.md +483 -202
- package/package.json +42 -43
- package/scripts/analytics.ts +188 -0
- package/scripts/dev-skill.ts +83 -0
- package/scripts/discover-skills.ts +39 -0
- package/scripts/eval-compare.ts +97 -0
- package/scripts/eval-list.ts +117 -0
- package/scripts/eval-select.ts +86 -0
- package/scripts/eval-summary.ts +188 -0
- package/scripts/eval-watch.ts +172 -0
- package/scripts/gen-skill-docs.ts +473 -0
- package/scripts/resolvers/browse.ts +129 -0
- package/scripts/resolvers/codex-helpers.ts +133 -0
- package/scripts/resolvers/composition.ts +48 -0
- package/scripts/resolvers/confidence.ts +37 -0
- package/scripts/resolvers/constants.ts +50 -0
- package/scripts/resolvers/design.ts +950 -0
- package/scripts/resolvers/index.ts +59 -0
- package/scripts/resolvers/learnings.ts +96 -0
- package/scripts/resolvers/preamble.ts +505 -0
- package/scripts/resolvers/review.ts +884 -0
- package/scripts/resolvers/testing.ts +573 -0
- package/scripts/resolvers/types.ts +45 -0
- package/scripts/resolvers/utility.ts +421 -0
- package/scripts/skill-check.ts +190 -0
- package/scripts/cleanup.py +0 -100
- package/scripts/filter-skills.sh +0 -114
- package/scripts/filter_skills.py +0 -164
- package/scripts/install-commands.js +0 -45
- package/scripts/install-skills.js +0 -60
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
import type { TemplateContext } from './types';
|
|
2
|
+
|
|
3
|
+
/**
 * Builds the "Test Framework Bootstrap" prompt section (steps B2–B8) that is
 * spliced into command templates. The returned markdown instructs the agent to:
 * detect the project runtime/test framework, honor the
 * `.OpenGStack/no-test-bootstrap` opt-out marker, research and select a
 * framework, install and configure it, generate first real tests, set up CI,
 * write TESTING.md, update CLAUDE.md, and commit the result.
 *
 * @param _ctx - Template context; currently unused (underscore-prefixed), kept
 *   so this generator matches the shared resolver signature — TODO confirm
 *   against the other resolvers in this directory.
 * @returns A markdown prompt fragment (static text; no interpolation from ctx).
 */
export function generateTestBootstrap(_ctx: TemplateContext): string {
  // NOTE: everything below is a single template literal — it is prompt TEXT
  // consumed by the agent at runtime, not code executed by this script.
  return `## Test Framework Bootstrap

**Detect existing test framework and project runtime:**

\`\`\`bash
setopt +o nomatch 2>/dev/null || true # zsh compat
# Detect project runtime
[ -f Gemfile ] && echo "RUNTIME:ruby"
[ -f package.json ] && echo "RUNTIME:node"
[ -f requirements.txt ] || [ -f pyproject.toml ] && echo "RUNTIME:python"
[ -f go.mod ] && echo "RUNTIME:go"
[ -f Cargo.toml ] && echo "RUNTIME:rust"
[ -f composer.json ] && echo "RUNTIME:php"
[ -f mix.exs ] && echo "RUNTIME:elixir"
# Detect sub-frameworks
[ -f Gemfile ] && grep -q "rails" Gemfile 2>/dev/null && echo "FRAMEWORK:rails"
[ -f package.json ] && grep -q '"next"' package.json 2>/dev/null && echo "FRAMEWORK:nextjs"
# Check for existing test infrastructure
ls jest.config.* vitest.config.* playwright.config.* .rspec pytest.ini pyproject.toml phpunit.xml 2>/dev/null
ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null
# Check opt-out marker
[ -f .OpenGStack/no-test-bootstrap ] && echo "BOOTSTRAP_DECLINED"
\`\`\`

**If test framework detected** (config files or test directories found):
Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap."
Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns).
Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.**

**If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.**

**If NO runtime detected** (no config files found): Use AskUserQuestion:
"I couldn't detect your project's language. What runtime are you using?"
Options: A) Node.js/TypeScript B) Ruby/Rails C) Python D) Go E) Rust F) PHP G) Elixir H) This project doesn't need tests.
If user picks H → write \`.OpenGStack/no-test-bootstrap\` and continue without tests.

**If runtime detected but no test framework — bootstrap:**

### B2. Research best practices

Use WebSearch to find current best practices for the detected runtime:
- \`"[runtime] best test framework 2025 2026"\`
- \`"[framework A] vs [framework B] comparison"\`

If WebSearch is unavailable, use this built-in knowledge table:

| Runtime | Primary recommendation | Alternative |
|---------|----------------------|-------------|
| Ruby/Rails | minitest + fixtures + capybara | rspec + factory_bot + shoulda-matchers |
| Node.js | vitest + @testing-library | jest + @testing-library |
| Next.js | vitest + @testing-library/react + playwright | jest + cypress |
| Python | pytest + pytest-cov | unittest |
| Go | stdlib testing + testify | stdlib only |
| Rust | cargo test (built-in) + mockall | — |
| PHP | phpunit + mockery | pest |
| Elixir | ExUnit (built-in) + ex_machina | — |

### B3. Framework selection

Use AskUserQuestion:
"I detected this is a [Runtime/Framework] project with no test framework. I researched current best practices. Here are the options:
A) [Primary] — [rationale]. Includes: [packages]. Supports: unit, integration, smoke, e2e
B) [Alternative] — [rationale]. Includes: [packages]
C) Skip — don't set up testing right now
RECOMMENDATION: Choose A because [reason based on project context]"

If user picks C → write \`.OpenGStack/no-test-bootstrap\`. Tell user: "If you change your mind later, delete \`.OpenGStack/no-test-bootstrap\` and re-run." Continue without tests.

If multiple runtimes detected (monorepo) → ask which runtime to set up first, with option to do both sequentially.

### B4. Install and configure

1. Install the chosen packages (npm/bun/gem/pip/etc.)
2. Create minimal config file
3. Create directory structure (test/, spec/, etc.)
4. Create one example test matching the project's code to verify setup works

If package installation fails → debug once. If still failing → revert with \`git checkout -- package.json package-lock.json\` (or equivalent for the runtime). Warn user and continue without tests.

### B4.5. First real tests

Generate 3-5 real tests for existing code:

1. **Find recently changed files:** \`git log --since=30.days --name-only --format="" | sort | uniq -c | sort -rn | head -10\`
2. **Prioritize by risk:** Error handlers > business logic with conditionals > API endpoints > pure functions
3. **For each file:** Write one test that tests real behavior with meaningful assertions. Never \`expect(x).toBeDefined()\` — test what the code DOES.
4. Run each test. Passes → keep. Fails → fix once. Still fails → delete silently.
5. Generate at least 1 test, cap at 5.

Never import secrets, API keys, or credentials in test files. Use environment variables or test fixtures.

### B5. Verify

\`\`\`bash
# Run the full test suite to confirm everything works
{detected test command}
\`\`\`

If tests fail → debug once. If still failing → revert all bootstrap changes and warn user.

### B5.5. CI/CD pipeline

\`\`\`bash
# Check CI provider
ls -d .github/ 2>/dev/null && echo "CI:github"
ls .gitlab-ci.yml .circleci/ bitrise.yml 2>/dev/null
\`\`\`

If \`.github/\` exists (or no CI detected — default to GitHub Actions):
Create \`.github/workflows/test.yml\` with:
- \`runs-on: ubuntu-latest\`
- Appropriate setup action for the runtime (setup-node, setup-ruby, setup-python, etc.)
- The same test command verified in B5
- Trigger: push + pull_request

If non-GitHub CI detected → skip CI generation with note: "Detected {provider} — CI pipeline generation supports GitHub Actions only. Add test step to your existing pipeline manually."

### B6. Create TESTING.md

First check: If TESTING.md already exists → read it and update/append rather than overwriting. Never destroy existing content.

Write TESTING.md with:
- Philosophy: "100% test coverage is the key to great vibe coding. Tests let you move fast, trust your instincts, and ship with confidence — without them, vibe coding is just yolo coding. With tests, it's a superpower."
- Framework name and version
- How to run tests (the verified command from B5)
- Test layers: Unit tests (what, where, when), Integration tests, Smoke tests, E2E tests
- Conventions: file naming, assertion style, setup/teardown patterns

### B7. Update CLAUDE.md

First check: If CLAUDE.md already has a \`## Testing\` section → skip. Don't duplicate.

Append a \`## Testing\` section:
- Run command and test directory
- Reference to TESTING.md
- Test expectations:
- 100% test coverage is the goal — tests make vibe coding safe
- When writing new functions, write a corresponding test
- When fixing a bug, write a regression test
- When adding error handling, write a test that triggers the error
- When adding a conditional (if/else, switch), write tests for BOTH paths
- Never commit code that makes existing tests fail

### B8. Commit

\`\`\`bash
git status --porcelain
\`\`\`

Only commit if there are changes. Stage all bootstrap files (config, test directory, TESTING.md, CLAUDE.md, .github/workflows/test.yml if created):
\`git commit -m "chore: bootstrap test framework ({framework name})"\`

---`;
}
|
|
158
|
+
|
|
159
|
+
// ─── Test Coverage Audit ────────────────────────────────────
|
|
160
|
+
//
|
|
161
|
+
// Shared methodology for codepath tracing, ASCII diagrams, and test gap analysis.
|
|
162
|
+
// Three modes, three placeholders, one inner function:
|
|
163
|
+
//
|
|
164
|
+
// {{TEST_COVERAGE_AUDIT_PLAN}} → plan-eng-review: adds missing tests to the plan
|
|
165
|
+
// {{TEST_COVERAGE_AUDIT_SHIP}} → ship: auto-generates tests, coverage summary
|
|
166
|
+
// {{TEST_COVERAGE_AUDIT_REVIEW}} → review: generates tests via Fix-First (ASK)
|
|
167
|
+
//
|
|
168
|
+
// ┌────────────────────────────────────────────────┐
|
|
169
|
+
// │ generateTestCoverageAuditInner(mode) │
|
|
170
|
+
// │ │
|
|
171
|
+
// │ SHARED: framework detect, codepath trace, │
|
|
172
|
+
// │ ASCII diagram, quality rubric, E2E matrix, │
|
|
173
|
+
// │ regression rule │
|
|
174
|
+
// │ │
|
|
175
|
+
// │ plan: edit plan file, write artifact │
|
|
176
|
+
// │ ship: auto-generate tests, write artifact │
|
|
177
|
+
// │ review: Fix-First ASK, INFORMATIONAL gaps │
|
|
178
|
+
// └────────────────────────────────────────────────┘
|
|
179
|
+
|
|
180
|
+
// Which consuming command the coverage-audit prompt is generated for:
// 'plan' → plan-eng-review, 'ship' → ship, 'review' → review (see comment block above).
type CoverageAuditMode = 'plan' | 'ship' | 'review';
|
|
181
|
+
|
|
182
|
+
function generateTestCoverageAuditInner(mode: CoverageAuditMode): string {
|
|
183
|
+
const sections: string[] = [];
|
|
184
|
+
|
|
185
|
+
// ── Intro (mode-specific) ──
|
|
186
|
+
if (mode === 'ship') {
|
|
187
|
+
sections.push(`100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned.`);
|
|
188
|
+
} else if (mode === 'plan') {
|
|
189
|
+
sections.push(`100% coverage is the goal. Evaluate every codepath in the plan and ensure the plan includes tests for each one. If the plan is missing tests, add them — the plan should be complete enough that implementation includes full test coverage from the start.`);
|
|
190
|
+
} else {
|
|
191
|
+
sections.push(`100% coverage is the goal. Evaluate every codepath changed in the diff and identify test gaps. Gaps become INFORMATIONAL findings that follow the Fix-First flow.`);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// ── Test framework detection (shared) ──
|
|
195
|
+
sections.push(`
|
|
196
|
+
### Test Framework Detection
|
|
197
|
+
|
|
198
|
+
Before analyzing coverage, detect the project's test framework:
|
|
199
|
+
|
|
200
|
+
1. **Read CLAUDE.md** — look for a \`## Testing\` section with test command and framework name. If found, use that as the authoritative source.
|
|
201
|
+
2. **If CLAUDE.md has no testing section, auto-detect:**
|
|
202
|
+
|
|
203
|
+
\`\`\`bash
|
|
204
|
+
setopt +o nomatch 2>/dev/null || true # zsh compat
|
|
205
|
+
# Detect project runtime
|
|
206
|
+
[ -f Gemfile ] && echo "RUNTIME:ruby"
|
|
207
|
+
[ -f package.json ] && echo "RUNTIME:node"
|
|
208
|
+
[ -f requirements.txt ] || [ -f pyproject.toml ] && echo "RUNTIME:python"
|
|
209
|
+
[ -f go.mod ] && echo "RUNTIME:go"
|
|
210
|
+
[ -f Cargo.toml ] && echo "RUNTIME:rust"
|
|
211
|
+
# Check for existing test infrastructure
|
|
212
|
+
ls jest.config.* vitest.config.* playwright.config.* cypress.config.* .rspec pytest.ini phpunit.xml 2>/dev/null
|
|
213
|
+
ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null
|
|
214
|
+
\`\`\`
|
|
215
|
+
|
|
216
|
+
3. **If no framework detected:**${mode === 'ship' ? ' falls through to the Test Framework Bootstrap step (Step 2.5) which handles full setup.' : ' still produce the coverage diagram, but skip test generation.'}`);
|
|
217
|
+
|
|
218
|
+
// ── Before/after count (ship only) ──
|
|
219
|
+
if (mode === 'ship') {
|
|
220
|
+
sections.push(`
|
|
221
|
+
**0. Before/after test count:**
|
|
222
|
+
|
|
223
|
+
\`\`\`bash
|
|
224
|
+
# Count test files before any generation
|
|
225
|
+
find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec.*' | grep -v node_modules | wc -l
|
|
226
|
+
\`\`\`
|
|
227
|
+
|
|
228
|
+
Store this number for the PR body.`);
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
// ── Codepath tracing methodology (shared, with mode-specific source) ──
|
|
232
|
+
const traceSource = mode === 'plan'
|
|
233
|
+
? `**Step 1. Trace every codepath in the plan:**
|
|
234
|
+
|
|
235
|
+
Read the plan document. For each new feature, service, endpoint, or component described, trace how data will flow through the code — don't just list planned functions, actually follow the planned execution:`
|
|
236
|
+
: `**${mode === 'ship' ? '1' : 'Step 1'}. Trace every codepath changed** using \`git diff origin/<base>...HEAD\`:
|
|
237
|
+
|
|
238
|
+
Read every changed file. For each one, trace how data flows through the code — don't just list functions, actually follow the execution:`;
|
|
239
|
+
|
|
240
|
+
const traceStep1 = mode === 'plan'
|
|
241
|
+
? `1. **Read the plan.** For each planned component, understand what it does and how it connects to existing code.`
|
|
242
|
+
: `1. **Read the diff.** For each changed file, read the full file (not just the diff hunk) to understand context.`;
|
|
243
|
+
|
|
244
|
+
sections.push(`
|
|
245
|
+
${traceSource}
|
|
246
|
+
|
|
247
|
+
${traceStep1}
|
|
248
|
+
2. **Trace data flow.** Starting from each entry point (route handler, exported function, event listener, component render), follow the data through every branch:
|
|
249
|
+
- Where does input come from? (request params, props, database, API call)
|
|
250
|
+
- What transforms it? (validation, mapping, computation)
|
|
251
|
+
- Where does it go? (database write, API response, rendered output, side effect)
|
|
252
|
+
- What can go wrong at each step? (null/undefined, invalid input, network failure, empty collection)
|
|
253
|
+
3. **Diagram the execution.** For each changed file, draw an ASCII diagram showing:
|
|
254
|
+
- Every function/method that was added or modified
|
|
255
|
+
- Every conditional branch (if/else, switch, ternary, guard clause, early return)
|
|
256
|
+
- Every error path (try/catch, rescue, error boundary, fallback)
|
|
257
|
+
- Every call to another function (trace into it — does IT have untested branches?)
|
|
258
|
+
- Every edge: what happens with null input? Empty array? Invalid type?
|
|
259
|
+
|
|
260
|
+
This is the critical step — you're building a map of every line of code that can execute differently based on input. Every branch in this diagram needs a test.`);
|
|
261
|
+
|
|
262
|
+
// ── User flow coverage (shared) ──
|
|
263
|
+
sections.push(`
|
|
264
|
+
**${mode === 'ship' ? '2' : 'Step 2'}. Map user flows, interactions, and error states:**
|
|
265
|
+
|
|
266
|
+
Code coverage isn't enough — you need to cover how real users interact with the changed code. For each changed feature, think through:
|
|
267
|
+
|
|
268
|
+
- **User flows:** What sequence of actions does a user take that touches this code? Map the full journey (e.g., "user clicks 'Pay' → form validates → API call → success/failure screen"). Each step in the journey needs a test.
|
|
269
|
+
- **Interaction edge cases:** What happens when the user does something unexpected?
|
|
270
|
+
- Double-click/rapid resubmit
|
|
271
|
+
- Navigate away mid-operation (back button, close tab, click another link)
|
|
272
|
+
- Submit with stale data (page sat open for 30 minutes, session expired)
|
|
273
|
+
- Slow connection (API takes 10 seconds — what does the user see?)
|
|
274
|
+
- Concurrent actions (two tabs, same form)
|
|
275
|
+
- **Error states the user can see:** For every error the code handles, what does the user actually experience?
|
|
276
|
+
- Is there a clear error message or a silent failure?
|
|
277
|
+
- Can the user recover (retry, go back, fix input) or are they stuck?
|
|
278
|
+
- What happens with no network? With a 500 from the API? With invalid data from the server?
|
|
279
|
+
- **Empty/zero/boundary states:** What does the UI show with zero results? With 10,000 results? With a single character input? With maximum-length input?
|
|
280
|
+
|
|
281
|
+
Add these to your diagram alongside the code branches. A user flow with no test is just as much a gap as an untested if/else.`);
|
|
282
|
+
|
|
283
|
+
// ── Check branches against tests + quality rubric (shared) ──
|
|
284
|
+
sections.push(`
|
|
285
|
+
**${mode === 'ship' ? '3' : 'Step 3'}. Check each branch against existing tests:**
|
|
286
|
+
|
|
287
|
+
Go through your diagram branch by branch — both code paths AND user flows. For each one, search for a test that exercises it:
|
|
288
|
+
- Function \`processPayment()\` → look for \`billing.test.ts\`, \`billing.spec.ts\`, \`test/billing_test.rb\`
|
|
289
|
+
- An if/else → look for tests covering BOTH the true AND false path
|
|
290
|
+
- An error handler → look for a test that triggers that specific error condition
|
|
291
|
+
- A call to \`helperFn()\` that has its own branches → those branches need tests too
|
|
292
|
+
- A user flow → look for an integration or E2E test that walks through the journey
|
|
293
|
+
- An interaction edge case → look for a test that simulates the unexpected action
|
|
294
|
+
|
|
295
|
+
Quality scoring rubric:
|
|
296
|
+
- ★★★ Tests behavior with edge cases AND error paths
|
|
297
|
+
- ★★ Tests correct behavior, happy path only
|
|
298
|
+
- ★ Smoke test / existence check / trivial assertion (e.g., "it renders", "it doesn't throw")`);
|
|
299
|
+
|
|
300
|
+
// ── E2E test decision matrix (shared) ──
|
|
301
|
+
sections.push(`
|
|
302
|
+
### E2E Test Decision Matrix
|
|
303
|
+
|
|
304
|
+
When checking each branch, also determine whether a unit test or E2E/integration test is the right tool:
|
|
305
|
+
|
|
306
|
+
**RECOMMEND E2E (mark as [→E2E] in the diagram):**
|
|
307
|
+
- Common user flow spanning 3+ components/services (e.g., signup → verify email → first login)
|
|
308
|
+
- Integration point where mocking hides real failures (e.g., API → queue → worker → DB)
|
|
309
|
+
- Auth/payment/data-destruction flows — too important to trust unit tests alone
|
|
310
|
+
|
|
311
|
+
**RECOMMEND EVAL (mark as [→EVAL] in the diagram):**
|
|
312
|
+
- Critical LLM call that needs a quality eval (e.g., prompt change → test output still meets quality bar)
|
|
313
|
+
- Changes to prompt templates, system instructions, or tool definitions
|
|
314
|
+
|
|
315
|
+
**STICK WITH UNIT TESTS:**
|
|
316
|
+
- Pure function with clear inputs/outputs
|
|
317
|
+
- Internal helper with no side effects
|
|
318
|
+
- Edge case of a single function (null input, empty array)
|
|
319
|
+
- Obscure/rare flow that isn't customer-facing`);
|
|
320
|
+
|
|
321
|
+
// ── Regression rule (shared) ──
|
|
322
|
+
sections.push(`
|
|
323
|
+
### REGRESSION RULE (mandatory)
|
|
324
|
+
|
|
325
|
+
**IRON RULE:** When the coverage audit identifies a REGRESSION — code that previously worked but the diff broke — a regression test is ${mode === 'plan' ? 'added to the plan as a critical requirement' : 'written immediately'}. No AskUserQuestion. No skipping. Regressions are the highest-priority test because they prove something broke.
|
|
326
|
+
|
|
327
|
+
A regression is when:
|
|
328
|
+
- The diff modifies existing behavior (not new code)
|
|
329
|
+
- The existing test suite (if any) doesn't cover the changed path
|
|
330
|
+
- The change introduces a new failure mode for existing callers
|
|
331
|
+
|
|
332
|
+
When uncertain whether a change is a regression, err on the side of writing the test.${mode !== 'plan' ? '\n\nFormat: commit as `test: regression test for {what broke}`' : ''}`);
|
|
333
|
+
|
|
334
|
+
// ── ASCII coverage diagram (shared) ──
|
|
335
|
+
sections.push(`
|
|
336
|
+
**${mode === 'ship' ? '4' : 'Step 4'}. Output ASCII coverage diagram:**
|
|
337
|
+
|
|
338
|
+
Include BOTH code paths and user flows in the same diagram. Mark E2E-worthy and eval-worthy paths:
|
|
339
|
+
|
|
340
|
+
\`\`\`
|
|
341
|
+
CODE PATH COVERAGE
|
|
342
|
+
===========================
|
|
343
|
+
[+] src/services/billing.ts
|
|
344
|
+
│
|
|
345
|
+
├── processPayment()
|
|
346
|
+
│ ├── [★★★ TESTED] Happy path + card declined + timeout — billing.test.ts:42
|
|
347
|
+
│ ├── [GAP] Network timeout — NO TEST
|
|
348
|
+
│ └── [GAP] Invalid currency — NO TEST
|
|
349
|
+
│
|
|
350
|
+
└── refundPayment()
|
|
351
|
+
├── [★★ TESTED] Full refund — billing.test.ts:89
|
|
352
|
+
└── [★ TESTED] Partial refund (checks non-throw only) — billing.test.ts:101
|
|
353
|
+
|
|
354
|
+
USER FLOW COVERAGE
|
|
355
|
+
===========================
|
|
356
|
+
[+] Payment checkout flow
|
|
357
|
+
│
|
|
358
|
+
├── [★★★ TESTED] Complete purchase — checkout.e2e.ts:15
|
|
359
|
+
├── [GAP] [→E2E] Double-click submit — needs E2E, not just unit
|
|
360
|
+
├── [GAP] Navigate away during payment — unit test sufficient
|
|
361
|
+
└── [★ TESTED] Form validation errors (checks render only) — checkout.test.ts:40
|
|
362
|
+
|
|
363
|
+
[+] Error states
|
|
364
|
+
│
|
|
365
|
+
├── [★★ TESTED] Card declined message — billing.test.ts:58
|
|
366
|
+
├── [GAP] Network timeout UX (what does user see?) — NO TEST
|
|
367
|
+
└── [GAP] Empty cart submission — NO TEST
|
|
368
|
+
|
|
369
|
+
[+] LLM integration
|
|
370
|
+
│
|
|
371
|
+
└── [GAP] [→EVAL] Prompt template change — needs eval test
|
|
372
|
+
|
|
373
|
+
─────────────────────────────────
|
|
374
|
+
COVERAGE: 5/13 paths tested (38%)
|
|
375
|
+
Code paths: 3/5 (60%)
|
|
376
|
+
User flows: 2/8 (25%)
|
|
377
|
+
QUALITY: ★★★: 2 ★★: 2 ★: 1
|
|
378
|
+
GAPS: 8 paths need tests (2 need E2E, 1 needs eval)
|
|
379
|
+
─────────────────────────────────
|
|
380
|
+
\`\`\`
|
|
381
|
+
|
|
382
|
+
**Fast path:** All paths covered → "${mode === 'ship' ? 'Step 3.4' : mode === 'review' ? 'Step 4.75' : 'Test review'}: All new code paths have test coverage ✓" Continue.`);
|
|
383
|
+
|
|
384
|
+
// ── Mode-specific action section ──
|
|
385
|
+
if (mode === 'plan') {
|
|
386
|
+
sections.push(`
|
|
387
|
+
**Step 5. Add missing tests to the plan:**
|
|
388
|
+
|
|
389
|
+
For each GAP identified in the diagram, add a test requirement to the plan. Be specific:
|
|
390
|
+
- What test file to create (match existing naming conventions)
|
|
391
|
+
- What the test should assert (specific inputs → expected outputs/behavior)
|
|
392
|
+
- Whether it's a unit test, E2E test, or eval (use the decision matrix)
|
|
393
|
+
- For regressions: flag as **CRITICAL** and explain what broke
|
|
394
|
+
|
|
395
|
+
The plan should be complete enough that when implementation begins, every test is written alongside the feature code — not deferred to a follow-up.`);
|
|
396
|
+
|
|
397
|
+
// ── Test plan artifact (plan + ship) ──
|
|
398
|
+
sections.push(`
|
|
399
|
+
### Test Plan Artifact
|
|
400
|
+
|
|
401
|
+
After producing the coverage diagram, write a test plan artifact to the project directory so \`/qa\` and \`/qa-only\` can consume it as primary test input:
|
|
402
|
+
|
|
403
|
+
\`\`\`bash
|
|
404
|
+
eval "$(~/.claude/skills/opengstack/bin/opengstack-slug 2>/dev/null)" && mkdir -p ~/.opengstack/projects/$SLUG
|
|
405
|
+
USER=$(whoami)
|
|
406
|
+
DATETIME=$(date +%Y%m%d-%H%M%S)
|
|
407
|
+
\`\`\`
|
|
408
|
+
|
|
409
|
+
Write to \`~/.opengstack/projects/{slug}/{user}-{branch}-eng-review-test-plan-{datetime}.md\`:
|
|
410
|
+
|
|
411
|
+
\`\`\`markdown
|
|
412
|
+
# Test Plan
|
|
413
|
+
Generated by /plan-eng-review on {date}
|
|
414
|
+
Branch: {branch}
|
|
415
|
+
Repo: {owner/repo}
|
|
416
|
+
|
|
417
|
+
## Affected Pages/Routes
|
|
418
|
+
- {URL path} — {what to test and why}
|
|
419
|
+
|
|
420
|
+
## Key Interactions to Verify
|
|
421
|
+
- {interaction description} on {page}
|
|
422
|
+
|
|
423
|
+
## Edge Cases
|
|
424
|
+
- {edge case} on {page}
|
|
425
|
+
|
|
426
|
+
## Critical Paths
|
|
427
|
+
- {end-to-end flow that must work}
|
|
428
|
+
\`\`\`
|
|
429
|
+
|
|
430
|
+
This file is consumed by \`/qa\` and \`/qa-only\` as primary test input. Include only the information that helps a QA tester know **what to test and where** — not implementation details.`);
|
|
431
|
+
} else if (mode === 'ship') {
|
|
432
|
+
sections.push(`
|
|
433
|
+
**5. Generate tests for uncovered paths:**
|
|
434
|
+
|
|
435
|
+
If test framework detected (or bootstrapped in Step 2.5):
|
|
436
|
+
- Prioritize error handlers and edge cases first (happy paths are more likely already tested)
|
|
437
|
+
- Read 2-3 existing test files to match conventions exactly
|
|
438
|
+
- Generate unit tests. Mock all external dependencies (DB, API, Redis).
|
|
439
|
+
- For paths marked [→E2E]: generate integration/E2E tests using the project's E2E framework (Playwright, Cypress, Capybara, etc.)
|
|
440
|
+
- For paths marked [→EVAL]: generate eval tests using the project's eval framework, or flag for manual eval if none exists
|
|
441
|
+
- Write tests that exercise the specific uncovered path with real assertions
|
|
442
|
+
- Run each test. Passes → commit as \`test: coverage for {feature}\`
|
|
443
|
+
- Fails → fix once. Still fails → revert, note gap in diagram.
|
|
444
|
+
|
|
445
|
+
Caps: 30 code paths max, 20 tests generated max (code + user flow combined), 2-min per-test exploration cap.
|
|
446
|
+
|
|
447
|
+
If no test framework AND user declined bootstrap → diagram only, no generation. Note: "Test generation skipped — no test framework configured."
|
|
448
|
+
|
|
449
|
+
**Diff is test-only changes:** Skip Step 3.4 entirely: "No new application code paths to audit."
|
|
450
|
+
|
|
451
|
+
**6. After-count and coverage summary:**
|
|
452
|
+
|
|
453
|
+
\`\`\`bash
|
|
454
|
+
# Count test files after generation
|
|
455
|
+
find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec.*' | grep -v node_modules | wc -l
|
|
456
|
+
\`\`\`
|
|
457
|
+
|
|
458
|
+
For PR body: \`Tests: {before} → {after} (+{delta} new)\`
|
|
459
|
+
Coverage line: \`Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.\`
|
|
460
|
+
|
|
461
|
+
**7. Coverage gate:**
|
|
462
|
+
|
|
463
|
+
Before proceeding, check CLAUDE.md for a \`## Test Coverage\` section with \`Minimum:\` and \`Target:\` fields. If found, use those percentages. Otherwise use defaults: Minimum = 60%, Target = 80%.
|
|
464
|
+
|
|
465
|
+
Using the coverage percentage from the diagram in substep 4 (the \`COVERAGE: X/Y (Z%)\` line):
|
|
466
|
+
|
|
467
|
+
- **>= target:** Pass. "Coverage gate: PASS ({X}%)." Continue.
|
|
468
|
+
- **>= minimum, < target:** Use AskUserQuestion:
|
|
469
|
+
- "AI-assessed coverage is {X}%. {N} code paths are untested. Target is {target}%."
|
|
470
|
+
- RECOMMENDATION: Choose A because untested code paths are where production bugs hide.
|
|
471
|
+
- Options:
|
|
472
|
+
A) Generate more tests for remaining gaps (recommended)
|
|
473
|
+
B) Ship anyway — I accept the coverage risk
|
|
474
|
+
C) These paths don't need tests — mark as intentionally uncovered
|
|
475
|
+
- If A: Loop back to substep 5 (generate tests) targeting the remaining gaps. After second pass, if still below target, present AskUserQuestion again with updated numbers. Maximum 2 generation passes total.
|
|
476
|
+
- If B: Continue. Include in PR body: "Coverage gate: {X}% — user accepted risk."
|
|
477
|
+
- If C: Continue. Include in PR body: "Coverage gate: {X}% — {N} paths intentionally uncovered."
|
|
478
|
+
|
|
479
|
+
- **< minimum:** Use AskUserQuestion:
|
|
480
|
+
- "AI-assessed coverage is critically low ({X}%). {N} of {M} code paths have no tests. Minimum threshold is {minimum}%."
|
|
481
|
+
- RECOMMENDATION: Choose A because less than {minimum}% means more code is untested than tested.
|
|
482
|
+
- Options:
|
|
483
|
+
A) Generate tests for remaining gaps (recommended)
|
|
484
|
+
B) Override — ship with low coverage (I understand the risk)
|
|
485
|
+
- If A: Loop back to substep 5. Maximum 2 passes. If still below minimum after 2 passes, present the override choice again.
|
|
486
|
+
- If B: Continue. Include in PR body: "Coverage gate: OVERRIDDEN at {X}%."
|
|
487
|
+
|
|
488
|
+
**Coverage percentage undetermined:** If the coverage diagram doesn't produce a clear numeric percentage (ambiguous output, parse error), **skip the gate** with: "Coverage gate: could not determine percentage — skipping." Do not default to 0% or block.
|
|
489
|
+
|
|
490
|
+
**Test-only diffs:** Skip the gate (same as the existing fast-path).
|
|
491
|
+
|
|
492
|
+
**100% coverage:** "Coverage gate: PASS (100%)." Continue.`);
|
|
493
|
+
|
|
494
|
+
// ── Test plan artifact (ship mode) ──
|
|
495
|
+
sections.push(`
|
|
496
|
+
### Test Plan Artifact
|
|
497
|
+
|
|
498
|
+
After producing the coverage diagram, write a test plan artifact so \`/qa\` and \`/qa-only\` can consume it:
|
|
499
|
+
|
|
500
|
+
\`\`\`bash
|
|
501
|
+
eval "$(~/.claude/skills/opengstack/bin/opengstack-slug 2>/dev/null)" && mkdir -p ~/.opengstack/projects/$SLUG
|
|
502
|
+
USER=$(whoami)
|
|
503
|
+
DATETIME=$(date +%Y%m%d-%H%M%S)
|
|
504
|
+
\`\`\`
|
|
505
|
+
|
|
506
|
+
Write to \`~/.opengstack/projects/{slug}/{user}-{branch}-ship-test-plan-{datetime}.md\`:
|
|
507
|
+
|
|
508
|
+
\`\`\`markdown
|
|
509
|
+
# Test Plan
|
|
510
|
+
Generated by /ship on {date}
|
|
511
|
+
Branch: {branch}
|
|
512
|
+
Repo: {owner/repo}
|
|
513
|
+
|
|
514
|
+
## Affected Pages/Routes
|
|
515
|
+
- {URL path} — {what to test and why}
|
|
516
|
+
|
|
517
|
+
## Key Interactions to Verify
|
|
518
|
+
- {interaction description} on {page}
|
|
519
|
+
|
|
520
|
+
## Edge Cases
|
|
521
|
+
- {edge case} on {page}
|
|
522
|
+
|
|
523
|
+
## Critical Paths
|
|
524
|
+
- {end-to-end flow that must work}
|
|
525
|
+
\`\`\``);
|
|
526
|
+
} else {
|
|
527
|
+
// review mode
|
|
528
|
+
sections.push(`
|
|
529
|
+
**Step 5. Generate tests for gaps (Fix-First):**
|
|
530
|
+
|
|
531
|
+
If test framework is detected and gaps were identified:
|
|
532
|
+
- Classify each gap as AUTO-FIX or ASK per the Fix-First Heuristic:
|
|
533
|
+
- **AUTO-FIX:** Simple unit tests for pure functions, edge cases of existing tested functions
|
|
534
|
+
- **ASK:** E2E tests, tests requiring new test infrastructure, tests for ambiguous behavior
|
|
535
|
+
- For AUTO-FIX gaps: generate the test, run it, commit as \`test: coverage for {feature}\`
|
|
536
|
+
- For ASK gaps: include in the Fix-First batch question with the other review findings
|
|
537
|
+
- For paths marked [→E2E]: always ASK (E2E tests are higher-effort and need user confirmation)
|
|
538
|
+
- For paths marked [→EVAL]: always ASK (eval tests need user confirmation on quality criteria)
|
|
539
|
+
|
|
540
|
+
If no test framework detected → include gaps as INFORMATIONAL findings only, no generation.
|
|
541
|
+
|
|
542
|
+
**Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit."
|
|
543
|
+
|
|
544
|
+
### Coverage Warning
|
|
545
|
+
|
|
546
|
+
After producing the coverage diagram, check the coverage percentage. Read CLAUDE.md for a \`## Test Coverage\` section with a \`Minimum:\` field. If not found, use default: 60%.
|
|
547
|
+
|
|
548
|
+
If coverage is below the minimum threshold, output a prominent warning **before** the regular review findings:
|
|
549
|
+
|
|
550
|
+
\`\`\`
|
|
551
|
+
⚠️ COVERAGE WARNING: AI-assessed coverage is {X}%. {N} code paths untested.
|
|
552
|
+
Consider writing tests before running /ship.
|
|
553
|
+
\`\`\`
|
|
554
|
+
|
|
555
|
+
This is INFORMATIONAL — does not block /review. But it makes low coverage visible early so the developer can address it before reaching the /ship coverage gate.
|
|
556
|
+
|
|
557
|
+
If coverage percentage cannot be determined, skip the warning silently.`);
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
return sections.join('\n');
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
export function generateTestCoverageAuditPlan(_ctx: TemplateContext): string {
|
|
564
|
+
return generateTestCoverageAuditInner('plan');
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
export function generateTestCoverageAuditShip(_ctx: TemplateContext): string {
|
|
568
|
+
return generateTestCoverageAuditInner('ship');
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
export function generateTestCoverageAuditReview(_ctx: TemplateContext): string {
|
|
572
|
+
return generateTestCoverageAuditInner('review');
|
|
573
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/** Agent hosts that opengstack can render templates for. */
export type Host = 'claude' | 'codex' | 'factory';
|
|
2
|
+
|
|
3
|
+
/**
 * Filesystem locations a host exposes to the skill templates.
 * Values are inserted verbatim into generated shell/markdown text: for the
 * claude host they are literal `~/.claude/...` paths, while for codex and
 * factory they are `$OpenGStack_*` environment-variable references —
 * presumably expanded by the host's shell at runtime (TODO confirm).
 */
export interface HostPaths {
  /** Global install root of the opengstack skill for this host. */
  skillRoot: string;
  /** Project-local skill directory, relative to the repository root. */
  localSkillRoot: string;
  /** Directory holding the skill's helper executables (e.g. opengstack-slug). */
  binDir: string;
  /** Browse tool output directory; name suggests a built bundle — verify. */
  browseDir: string;
  /** Design tool output directory; name suggests a built bundle — verify. */
  designDir: string;
}
|
|
10
|
+
|
|
11
|
+
export const HOST_PATHS: Record<Host, HostPaths> = {
|
|
12
|
+
claude: {
|
|
13
|
+
skillRoot: '~/.claude/skills/opengstack',
|
|
14
|
+
localSkillRoot: '.claude/skills/opengstack',
|
|
15
|
+
binDir: '~/.claude/skills/opengstack/bin',
|
|
16
|
+
browseDir: '~/.claude/skills/opengstack/browse/dist',
|
|
17
|
+
designDir: '~/.claude/skills/opengstack/design/dist',
|
|
18
|
+
},
|
|
19
|
+
codex: {
|
|
20
|
+
skillRoot: '$OpenGStack_ROOT',
|
|
21
|
+
localSkillRoot: '.agents/skills/opengstack',
|
|
22
|
+
binDir: '$OpenGStack_BIN',
|
|
23
|
+
browseDir: '$OpenGStack_BROWSE',
|
|
24
|
+
designDir: '$OpenGStack_DESIGN',
|
|
25
|
+
},
|
|
26
|
+
factory: {
|
|
27
|
+
skillRoot: '$OpenGStack_ROOT',
|
|
28
|
+
localSkillRoot: '.factory/skills/opengstack',
|
|
29
|
+
binDir: '$OpenGStack_BIN',
|
|
30
|
+
browseDir: '$OpenGStack_BROWSE',
|
|
31
|
+
designDir: '$OpenGStack_DESIGN',
|
|
32
|
+
},
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
/** Per-skill data handed to every template resolver function. */
export interface TemplateContext {
  /** Name of the skill whose template is being rendered. */
  skillName: string;
  /** Path of the template file being processed — presumably source, not output; verify against caller. */
  tmplPath: string;
  /** Names of other skills this one benefits from, when declared. */
  benefitsFrom?: string[];
  /** Host the output is being generated for. */
  host: Host;
  /** Resolved filesystem paths for `host` (see HOST_PATHS). */
  paths: HostPaths;
  preambleTier?: number; // 1-4, controls which preamble sections are included
}
|
|
43
|
+
|
|
44
|
+
/**
 * Resolver function signature.
 * `args` is populated for parameterized placeholders like
 * `{{INVOKE_SKILL:name}}`; otherwise it may be omitted.
 */
export type ResolverFn = (ctx: TemplateContext, args?: string[]) => string;
|