opengstack 0.14.0 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/AGENTS.md +4 -4
  2. package/CLAUDE.md +127 -110
  3. package/README.md +10 -5
  4. package/SKILL.md +500 -70
  5. package/bin/opengstack.js +69 -69
  6. package/commands/autoplan.md +7 -9
  7. package/commands/benchmark.md +84 -91
  8. package/commands/browse.md +60 -64
  9. package/commands/canary.md +7 -9
  10. package/commands/careful.md +2 -2
  11. package/commands/codex.md +7 -9
  12. package/commands/connect-chrome.md +7 -9
  13. package/commands/cso.md +7 -9
  14. package/commands/design-consultation.md +7 -9
  15. package/commands/design-review.md +7 -9
  16. package/commands/design-shotgun.md +7 -9
  17. package/commands/document-release.md +7 -9
  18. package/commands/freeze.md +3 -3
  19. package/commands/guard.md +4 -4
  20. package/commands/investigate.md +7 -9
  21. package/commands/land-and-deploy.md +7 -9
  22. package/commands/office-hours.md +7 -9
  23. package/commands/{gstack-upgrade.md → opengstack-upgrade.md} +64 -65
  24. package/commands/plan-ceo-review.md +7 -9
  25. package/commands/plan-design-review.md +7 -9
  26. package/commands/plan-eng-review.md +7 -9
  27. package/commands/qa-only.md +7 -9
  28. package/commands/qa.md +7 -9
  29. package/commands/retro.md +7 -9
  30. package/commands/review.md +7 -9
  31. package/commands/setup-browser-cookies.md +22 -26
  32. package/commands/setup-deploy.md +7 -9
  33. package/commands/ship.md +7 -9
  34. package/commands/unfreeze.md +7 -7
  35. package/docs/designs/CHROME_VS_CHROMIUM_EXPLORATION.md +9 -9
  36. package/docs/designs/CONDUCTOR_CHROME_SIDEBAR_INTEGRATION.md +2 -2
  37. package/docs/designs/CONDUCTOR_SESSION_API.md +16 -16
  38. package/docs/designs/DESIGN_SHOTGUN.md +74 -74
  39. package/docs/designs/DESIGN_TOOLS_V1.md +111 -111
  40. package/docs/skills.md +483 -202
  41. package/package.json +42 -43
  42. package/scripts/analytics.ts +188 -0
  43. package/scripts/dev-skill.ts +83 -0
  44. package/scripts/discover-skills.ts +39 -0
  45. package/scripts/eval-compare.ts +97 -0
  46. package/scripts/eval-list.ts +117 -0
  47. package/scripts/eval-select.ts +86 -0
  48. package/scripts/eval-summary.ts +188 -0
  49. package/scripts/eval-watch.ts +172 -0
  50. package/scripts/gen-skill-docs.ts +473 -0
  51. package/scripts/resolvers/browse.ts +129 -0
  52. package/scripts/resolvers/codex-helpers.ts +133 -0
  53. package/scripts/resolvers/composition.ts +48 -0
  54. package/scripts/resolvers/confidence.ts +37 -0
  55. package/scripts/resolvers/constants.ts +50 -0
  56. package/scripts/resolvers/design.ts +950 -0
  57. package/scripts/resolvers/index.ts +59 -0
  58. package/scripts/resolvers/learnings.ts +96 -0
  59. package/scripts/resolvers/preamble.ts +505 -0
  60. package/scripts/resolvers/review.ts +884 -0
  61. package/scripts/resolvers/testing.ts +573 -0
  62. package/scripts/resolvers/types.ts +45 -0
  63. package/scripts/resolvers/utility.ts +421 -0
  64. package/scripts/skill-check.ts +190 -0
  65. package/scripts/cleanup.py +0 -100
  66. package/scripts/filter-skills.sh +0 -114
  67. package/scripts/filter_skills.py +0 -164
  68. package/scripts/install-commands.js +0 -45
  69. package/scripts/install-skills.js +0 -60
@@ -0,0 +1,573 @@
1
+ import type { TemplateContext } from './types';
2
+
/**
 * Resolver for the Test Framework Bootstrap section.
 *
 * Returns a static markdown block — the template literal contains no `${}`
 * interpolation, only `\`` escapes that become code fences in the output —
 * instructing the agent how to detect an existing test framework/runtime
 * and, if none is found, how to bootstrap one end-to-end
 * (research → selection → install → first tests → CI → docs → commit).
 *
 * @param _ctx - Unused (underscore-prefixed); present so this function
 *               matches the shared resolver signature taking a TemplateContext.
 * @returns Markdown instructions ending with a `---` separator.
 */
export function generateTestBootstrap(_ctx: TemplateContext): string {
  // Single static template literal; content lines are emitted verbatim.
  return `## Test Framework Bootstrap

**Detect existing test framework and project runtime:**

\`\`\`bash
setopt +o nomatch 2>/dev/null || true # zsh compat
# Detect project runtime
[ -f Gemfile ] && echo "RUNTIME:ruby"
[ -f package.json ] && echo "RUNTIME:node"
[ -f requirements.txt ] || [ -f pyproject.toml ] && echo "RUNTIME:python"
[ -f go.mod ] && echo "RUNTIME:go"
[ -f Cargo.toml ] && echo "RUNTIME:rust"
[ -f composer.json ] && echo "RUNTIME:php"
[ -f mix.exs ] && echo "RUNTIME:elixir"
# Detect sub-frameworks
[ -f Gemfile ] && grep -q "rails" Gemfile 2>/dev/null && echo "FRAMEWORK:rails"
[ -f package.json ] && grep -q '"next"' package.json 2>/dev/null && echo "FRAMEWORK:nextjs"
# Check for existing test infrastructure
ls jest.config.* vitest.config.* playwright.config.* .rspec pytest.ini pyproject.toml phpunit.xml 2>/dev/null
ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null
# Check opt-out marker
[ -f .OpenGStack/no-test-bootstrap ] && echo "BOOTSTRAP_DECLINED"
\`\`\`

**If test framework detected** (config files or test directories found):
Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap."
Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns).
Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.**

**If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.**

**If NO runtime detected** (no config files found): Use AskUserQuestion:
"I couldn't detect your project's language. What runtime are you using?"
Options: A) Node.js/TypeScript B) Ruby/Rails C) Python D) Go E) Rust F) PHP G) Elixir H) This project doesn't need tests.
If user picks H → write \`.OpenGStack/no-test-bootstrap\` and continue without tests.

**If runtime detected but no test framework — bootstrap:**

### B2. Research best practices

Use WebSearch to find current best practices for the detected runtime:
- \`"[runtime] best test framework 2025 2026"\`
- \`"[framework A] vs [framework B] comparison"\`

If WebSearch is unavailable, use this built-in knowledge table:

| Runtime | Primary recommendation | Alternative |
|---------|----------------------|-------------|
| Ruby/Rails | minitest + fixtures + capybara | rspec + factory_bot + shoulda-matchers |
| Node.js | vitest + @testing-library | jest + @testing-library |
| Next.js | vitest + @testing-library/react + playwright | jest + cypress |
| Python | pytest + pytest-cov | unittest |
| Go | stdlib testing + testify | stdlib only |
| Rust | cargo test (built-in) + mockall | — |
| PHP | phpunit + mockery | pest |
| Elixir | ExUnit (built-in) + ex_machina | — |

### B3. Framework selection

Use AskUserQuestion:
"I detected this is a [Runtime/Framework] project with no test framework. I researched current best practices. Here are the options:
A) [Primary] — [rationale]. Includes: [packages]. Supports: unit, integration, smoke, e2e
B) [Alternative] — [rationale]. Includes: [packages]
C) Skip — don't set up testing right now
RECOMMENDATION: Choose A because [reason based on project context]"

If user picks C → write \`.OpenGStack/no-test-bootstrap\`. Tell user: "If you change your mind later, delete \`.OpenGStack/no-test-bootstrap\` and re-run." Continue without tests.

If multiple runtimes detected (monorepo) → ask which runtime to set up first, with option to do both sequentially.

### B4. Install and configure

1. Install the chosen packages (npm/bun/gem/pip/etc.)
2. Create minimal config file
3. Create directory structure (test/, spec/, etc.)
4. Create one example test matching the project's code to verify setup works

If package installation fails → debug once. If still failing → revert with \`git checkout -- package.json package-lock.json\` (or equivalent for the runtime). Warn user and continue without tests.

### B4.5. First real tests

Generate 3-5 real tests for existing code:

1. **Find recently changed files:** \`git log --since=30.days --name-only --format="" | sort | uniq -c | sort -rn | head -10\`
2. **Prioritize by risk:** Error handlers > business logic with conditionals > API endpoints > pure functions
3. **For each file:** Write one test that tests real behavior with meaningful assertions. Never \`expect(x).toBeDefined()\` — test what the code DOES.
4. Run each test. Passes → keep. Fails → fix once. Still fails → delete silently.
5. Generate at least 1 test, cap at 5.

Never import secrets, API keys, or credentials in test files. Use environment variables or test fixtures.

### B5. Verify

\`\`\`bash
# Run the full test suite to confirm everything works
{detected test command}
\`\`\`

If tests fail → debug once. If still failing → revert all bootstrap changes and warn user.

### B5.5. CI/CD pipeline

\`\`\`bash
# Check CI provider
ls -d .github/ 2>/dev/null && echo "CI:github"
ls .gitlab-ci.yml .circleci/ bitrise.yml 2>/dev/null
\`\`\`

If \`.github/\` exists (or no CI detected — default to GitHub Actions):
Create \`.github/workflows/test.yml\` with:
- \`runs-on: ubuntu-latest\`
- Appropriate setup action for the runtime (setup-node, setup-ruby, setup-python, etc.)
- The same test command verified in B5
- Trigger: push + pull_request

If non-GitHub CI detected → skip CI generation with note: "Detected {provider} — CI pipeline generation supports GitHub Actions only. Add test step to your existing pipeline manually."

### B6. Create TESTING.md

First check: If TESTING.md already exists → read it and update/append rather than overwriting. Never destroy existing content.

Write TESTING.md with:
- Philosophy: "100% test coverage is the key to great vibe coding. Tests let you move fast, trust your instincts, and ship with confidence — without them, vibe coding is just yolo coding. With tests, it's a superpower."
- Framework name and version
- How to run tests (the verified command from B5)
- Test layers: Unit tests (what, where, when), Integration tests, Smoke tests, E2E tests
- Conventions: file naming, assertion style, setup/teardown patterns

### B7. Update CLAUDE.md

First check: If CLAUDE.md already has a \`## Testing\` section → skip. Don't duplicate.

Append a \`## Testing\` section:
- Run command and test directory
- Reference to TESTING.md
- Test expectations:
- 100% test coverage is the goal — tests make vibe coding safe
- When writing new functions, write a corresponding test
- When fixing a bug, write a regression test
- When adding error handling, write a test that triggers the error
- When adding a conditional (if/else, switch), write tests for BOTH paths
- Never commit code that makes existing tests fail

### B8. Commit

\`\`\`bash
git status --porcelain
\`\`\`

Only commit if there are changes. Stage all bootstrap files (config, test directory, TESTING.md, CLAUDE.md, .github/workflows/test.yml if created):
\`git commit -m "chore: bootstrap test framework ({framework name})"\`

---`;
}
158
+
// ─── Test Coverage Audit ────────────────────────────────────
//
// Shared methodology for codepath tracing, ASCII diagrams, and test gap analysis.
// Three modes, three placeholders, one inner function:
//
// {{TEST_COVERAGE_AUDIT_PLAN}}   → plan-eng-review: adds missing tests to the plan
// {{TEST_COVERAGE_AUDIT_SHIP}}   → ship: auto-generates tests, coverage summary
// {{TEST_COVERAGE_AUDIT_REVIEW}} → review: generates tests via Fix-First (ASK)
//
// ┌────────────────────────────────────────────────┐
// │ generateTestCoverageAuditInner(mode)           │
// │                                                │
// │ SHARED: framework detect, codepath trace,      │
// │ ASCII diagram, quality rubric, E2E matrix,     │
// │ regression rule                                │
// │                                                │
// │ plan:   edit plan file, write artifact         │
// │ ship:   auto-generate tests, write artifact    │
// │ review: Fix-First ASK, INFORMATIONAL gaps      │
// └────────────────────────────────────────────────┘

/** The three output variants generateTestCoverageAuditInner can emit. */
type CoverageAuditMode = 'plan' | 'ship' | 'review';
181
+
/**
 * Builds the Test Coverage Audit instructions, specialized per mode.
 *
 * The three exported wrappers (plan/ship/review) each call this with their
 * mode. Shared sections — framework detection, codepath tracing, user-flow
 * mapping, branch-vs-test check, E2E decision matrix, regression rule, and
 * the ASCII coverage diagram — are emitted for every mode. The intro and the
 * final action section differ per mode; ship additionally gets before/after
 * test counts, a coverage gate, and a test-plan artifact section.
 *
 * @param mode - Which command the output is embedded in.
 * @returns All emitted markdown sections joined with a single newline.
 */
function generateTestCoverageAuditInner(mode: CoverageAuditMode): string {
  // Sections are accumulated in order and joined once at the end.
  const sections: string[] = [];

  // ── Intro (mode-specific) ──
  if (mode === 'ship') {
    sections.push(`100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned.`);
  } else if (mode === 'plan') {
    sections.push(`100% coverage is the goal. Evaluate every codepath in the plan and ensure the plan includes tests for each one. If the plan is missing tests, add them — the plan should be complete enough that implementation includes full test coverage from the start.`);
  } else {
    sections.push(`100% coverage is the goal. Evaluate every codepath changed in the diff and identify test gaps. Gaps become INFORMATIONAL findings that follow the Fix-First flow.`);
  }

  // ── Test framework detection (shared) ──
  sections.push(`
### Test Framework Detection

Before analyzing coverage, detect the project's test framework:

1. **Read CLAUDE.md** — look for a \`## Testing\` section with test command and framework name. If found, use that as the authoritative source.
2. **If CLAUDE.md has no testing section, auto-detect:**

\`\`\`bash
setopt +o nomatch 2>/dev/null || true # zsh compat
# Detect project runtime
[ -f Gemfile ] && echo "RUNTIME:ruby"
[ -f package.json ] && echo "RUNTIME:node"
[ -f requirements.txt ] || [ -f pyproject.toml ] && echo "RUNTIME:python"
[ -f go.mod ] && echo "RUNTIME:go"
[ -f Cargo.toml ] && echo "RUNTIME:rust"
# Check for existing test infrastructure
ls jest.config.* vitest.config.* playwright.config.* cypress.config.* .rspec pytest.ini phpunit.xml 2>/dev/null
ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null
\`\`\`

3. **If no framework detected:**${mode === 'ship' ? ' falls through to the Test Framework Bootstrap step (Step 2.5) which handles full setup.' : ' still produce the coverage diagram, but skip test generation.'}`);

  // ── Before/after count (ship only) ──
  if (mode === 'ship') {
    sections.push(`
**0. Before/after test count:**

\`\`\`bash
# Count test files before any generation
find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec.*' | grep -v node_modules | wc -l
\`\`\`

Store this number for the PR body.`);
  }

  // ── Codepath tracing methodology (shared, with mode-specific source) ──
  // plan mode traces the plan document; ship/review trace the actual diff.
  const traceSource = mode === 'plan'
    ? `**Step 1. Trace every codepath in the plan:**

Read the plan document. For each new feature, service, endpoint, or component described, trace how data will flow through the code — don't just list planned functions, actually follow the planned execution:`
    : `**${mode === 'ship' ? '1' : 'Step 1'}. Trace every codepath changed** using \`git diff origin/<base>...HEAD\`:

Read every changed file. For each one, trace how data flows through the code — don't just list functions, actually follow the execution:`;

  const traceStep1 = mode === 'plan'
    ? `1. **Read the plan.** For each planned component, understand what it does and how it connects to existing code.`
    : `1. **Read the diff.** For each changed file, read the full file (not just the diff hunk) to understand context.`;

  sections.push(`
${traceSource}

${traceStep1}
2. **Trace data flow.** Starting from each entry point (route handler, exported function, event listener, component render), follow the data through every branch:
- Where does input come from? (request params, props, database, API call)
- What transforms it? (validation, mapping, computation)
- Where does it go? (database write, API response, rendered output, side effect)
- What can go wrong at each step? (null/undefined, invalid input, network failure, empty collection)
3. **Diagram the execution.** For each changed file, draw an ASCII diagram showing:
- Every function/method that was added or modified
- Every conditional branch (if/else, switch, ternary, guard clause, early return)
- Every error path (try/catch, rescue, error boundary, fallback)
- Every call to another function (trace into it — does IT have untested branches?)
- Every edge: what happens with null input? Empty array? Invalid type?

This is the critical step — you're building a map of every line of code that can execute differently based on input. Every branch in this diagram needs a test.`);

  // ── User flow coverage (shared) ──
  sections.push(`
**${mode === 'ship' ? '2' : 'Step 2'}. Map user flows, interactions, and error states:**

Code coverage isn't enough — you need to cover how real users interact with the changed code. For each changed feature, think through:

- **User flows:** What sequence of actions does a user take that touches this code? Map the full journey (e.g., "user clicks 'Pay' → form validates → API call → success/failure screen"). Each step in the journey needs a test.
- **Interaction edge cases:** What happens when the user does something unexpected?
- Double-click/rapid resubmit
- Navigate away mid-operation (back button, close tab, click another link)
- Submit with stale data (page sat open for 30 minutes, session expired)
- Slow connection (API takes 10 seconds — what does the user see?)
- Concurrent actions (two tabs, same form)
- **Error states the user can see:** For every error the code handles, what does the user actually experience?
- Is there a clear error message or a silent failure?
- Can the user recover (retry, go back, fix input) or are they stuck?
- What happens with no network? With a 500 from the API? With invalid data from the server?
- **Empty/zero/boundary states:** What does the UI show with zero results? With 10,000 results? With a single character input? With maximum-length input?

Add these to your diagram alongside the code branches. A user flow with no test is just as much a gap as an untested if/else.`);

  // ── Check branches against tests + quality rubric (shared) ──
  sections.push(`
**${mode === 'ship' ? '3' : 'Step 3'}. Check each branch against existing tests:**

Go through your diagram branch by branch — both code paths AND user flows. For each one, search for a test that exercises it:
- Function \`processPayment()\` → look for \`billing.test.ts\`, \`billing.spec.ts\`, \`test/billing_test.rb\`
- An if/else → look for tests covering BOTH the true AND false path
- An error handler → look for a test that triggers that specific error condition
- A call to \`helperFn()\` that has its own branches → those branches need tests too
- A user flow → look for an integration or E2E test that walks through the journey
- An interaction edge case → look for a test that simulates the unexpected action

Quality scoring rubric:
- ★★★ Tests behavior with edge cases AND error paths
- ★★ Tests correct behavior, happy path only
- ★ Smoke test / existence check / trivial assertion (e.g., "it renders", "it doesn't throw")`);

  // ── E2E test decision matrix (shared) ──
  sections.push(`
### E2E Test Decision Matrix

When checking each branch, also determine whether a unit test or E2E/integration test is the right tool:

**RECOMMEND E2E (mark as [→E2E] in the diagram):**
- Common user flow spanning 3+ components/services (e.g., signup → verify email → first login)
- Integration point where mocking hides real failures (e.g., API → queue → worker → DB)
- Auth/payment/data-destruction flows — too important to trust unit tests alone

**RECOMMEND EVAL (mark as [→EVAL] in the diagram):**
- Critical LLM call that needs a quality eval (e.g., prompt change → test output still meets quality bar)
- Changes to prompt templates, system instructions, or tool definitions

**STICK WITH UNIT TESTS:**
- Pure function with clear inputs/outputs
- Internal helper with no side effects
- Edge case of a single function (null input, empty array)
- Obscure/rare flow that isn't customer-facing`);

  // ── Regression rule (shared) ──
  sections.push(`
### REGRESSION RULE (mandatory)

**IRON RULE:** When the coverage audit identifies a REGRESSION — code that previously worked but the diff broke — a regression test is ${mode === 'plan' ? 'added to the plan as a critical requirement' : 'written immediately'}. No AskUserQuestion. No skipping. Regressions are the highest-priority test because they prove something broke.

A regression is when:
- The diff modifies existing behavior (not new code)
- The existing test suite (if any) doesn't cover the changed path
- The change introduces a new failure mode for existing callers

When uncertain whether a change is a regression, err on the side of writing the test.${mode !== 'plan' ? '\n\nFormat: commit as \`test: regression test for {what broke}\`' : ''}`);

  // ── ASCII coverage diagram (shared) ──
  sections.push(`
**${mode === 'ship' ? '4' : 'Step 4'}. Output ASCII coverage diagram:**

Include BOTH code paths and user flows in the same diagram. Mark E2E-worthy and eval-worthy paths:

\`\`\`
CODE PATH COVERAGE
===========================
[+] src/services/billing.ts

├── processPayment()
│ ├── [★★★ TESTED] Happy path + card declined + timeout — billing.test.ts:42
│ ├── [GAP] Network timeout — NO TEST
│ └── [GAP] Invalid currency — NO TEST

└── refundPayment()
├── [★★ TESTED] Full refund — billing.test.ts:89
└── [★ TESTED] Partial refund (checks non-throw only) — billing.test.ts:101

USER FLOW COVERAGE
===========================
[+] Payment checkout flow

├── [★★★ TESTED] Complete purchase — checkout.e2e.ts:15
├── [GAP] [→E2E] Double-click submit — needs E2E, not just unit
├── [GAP] Navigate away during payment — unit test sufficient
└── [★ TESTED] Form validation errors (checks render only) — checkout.test.ts:40

[+] Error states

├── [★★ TESTED] Card declined message — billing.test.ts:58
├── [GAP] Network timeout UX (what does user see?) — NO TEST
└── [GAP] Empty cart submission — NO TEST

[+] LLM integration

└── [GAP] [→EVAL] Prompt template change — needs eval test

─────────────────────────────────
COVERAGE: 5/13 paths tested (38%)
Code paths: 3/5 (60%)
User flows: 2/8 (25%)
QUALITY: ★★★: 2 ★★: 2 ★: 1
GAPS: 8 paths need tests (2 need E2E, 1 needs eval)
─────────────────────────────────
\`\`\`

**Fast path:** All paths covered → "${mode === 'ship' ? 'Step 3.4' : mode === 'review' ? 'Step 4.75' : 'Test review'}: All new code paths have test coverage ✓" Continue.`);

  // ── Mode-specific action section ──
  if (mode === 'plan') {
    sections.push(`
**Step 5. Add missing tests to the plan:**

For each GAP identified in the diagram, add a test requirement to the plan. Be specific:
- What test file to create (match existing naming conventions)
- What the test should assert (specific inputs → expected outputs/behavior)
- Whether it's a unit test, E2E test, or eval (use the decision matrix)
- For regressions: flag as **CRITICAL** and explain what broke

The plan should be complete enough that when implementation begins, every test is written alongside the feature code — not deferred to a follow-up.`);

    // ── Test plan artifact (plan + ship) ──
    sections.push(`
### Test Plan Artifact

After producing the coverage diagram, write a test plan artifact to the project directory so \`/qa\` and \`/qa-only\` can consume it as primary test input:

\`\`\`bash
eval "$(~/.claude/skills/opengstack/bin/opengstack-slug 2>/dev/null)" && mkdir -p ~/.opengstack/projects/$SLUG
USER=$(whoami)
DATETIME=$(date +%Y%m%d-%H%M%S)
\`\`\`

Write to \`~/.opengstack/projects/{slug}/{user}-{branch}-eng-review-test-plan-{datetime}.md\`:

\`\`\`markdown
# Test Plan
Generated by /plan-eng-review on {date}
Branch: {branch}
Repo: {owner/repo}

## Affected Pages/Routes
- {URL path} — {what to test and why}

## Key Interactions to Verify
- {interaction description} on {page}

## Edge Cases
- {edge case} on {page}

## Critical Paths
- {end-to-end flow that must work}
\`\`\`

This file is consumed by \`/qa\` and \`/qa-only\` as primary test input. Include only the information that helps a QA tester know **what to test and where** — not implementation details.`);
  } else if (mode === 'ship') {
    sections.push(`
**5. Generate tests for uncovered paths:**

If test framework detected (or bootstrapped in Step 2.5):
- Prioritize error handlers and edge cases first (happy paths are more likely already tested)
- Read 2-3 existing test files to match conventions exactly
- Generate unit tests. Mock all external dependencies (DB, API, Redis).
- For paths marked [→E2E]: generate integration/E2E tests using the project's E2E framework (Playwright, Cypress, Capybara, etc.)
- For paths marked [→EVAL]: generate eval tests using the project's eval framework, or flag for manual eval if none exists
- Write tests that exercise the specific uncovered path with real assertions
- Run each test. Passes → commit as \`test: coverage for {feature}\`
- Fails → fix once. Still fails → revert, note gap in diagram.

Caps: 30 code paths max, 20 tests generated max (code + user flow combined), 2-min per-test exploration cap.

If no test framework AND user declined bootstrap → diagram only, no generation. Note: "Test generation skipped — no test framework configured."

**Diff is test-only changes:** Skip Step 3.4 entirely: "No new application code paths to audit."

**6. After-count and coverage summary:**

\`\`\`bash
# Count test files after generation
find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec.*' | grep -v node_modules | wc -l
\`\`\`

For PR body: \`Tests: {before} → {after} (+{delta} new)\`
Coverage line: \`Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.\`

**7. Coverage gate:**

Before proceeding, check CLAUDE.md for a \`## Test Coverage\` section with \`Minimum:\` and \`Target:\` fields. If found, use those percentages. Otherwise use defaults: Minimum = 60%, Target = 80%.

Using the coverage percentage from the diagram in substep 4 (the \`COVERAGE: X/Y (Z%)\` line):

- **>= target:** Pass. "Coverage gate: PASS ({X}%)." Continue.
- **>= minimum, < target:** Use AskUserQuestion:
- "AI-assessed coverage is {X}%. {N} code paths are untested. Target is {target}%."
- RECOMMENDATION: Choose A because untested code paths are where production bugs hide.
- Options:
A) Generate more tests for remaining gaps (recommended)
B) Ship anyway — I accept the coverage risk
C) These paths don't need tests — mark as intentionally uncovered
- If A: Loop back to substep 5 (generate tests) targeting the remaining gaps. After second pass, if still below target, present AskUserQuestion again with updated numbers. Maximum 2 generation passes total.
- If B: Continue. Include in PR body: "Coverage gate: {X}% — user accepted risk."
- If C: Continue. Include in PR body: "Coverage gate: {X}% — {N} paths intentionally uncovered."

- **< minimum:** Use AskUserQuestion:
- "AI-assessed coverage is critically low ({X}%). {N} of {M} code paths have no tests. Minimum threshold is {minimum}%."
- RECOMMENDATION: Choose A because less than {minimum}% means more code is untested than tested.
- Options:
A) Generate tests for remaining gaps (recommended)
B) Override — ship with low coverage (I understand the risk)
- If A: Loop back to substep 5. Maximum 2 passes. If still below minimum after 2 passes, present the override choice again.
- If B: Continue. Include in PR body: "Coverage gate: OVERRIDDEN at {X}%."

**Coverage percentage undetermined:** If the coverage diagram doesn't produce a clear numeric percentage (ambiguous output, parse error), **skip the gate** with: "Coverage gate: could not determine percentage — skipping." Do not default to 0% or block.

**Test-only diffs:** Skip the gate (same as the existing fast-path).

**100% coverage:** "Coverage gate: PASS (100%)." Continue.`);

    // ── Test plan artifact (ship mode) ──
    sections.push(`
### Test Plan Artifact

After producing the coverage diagram, write a test plan artifact so \`/qa\` and \`/qa-only\` can consume it:

\`\`\`bash
eval "$(~/.claude/skills/opengstack/bin/opengstack-slug 2>/dev/null)" && mkdir -p ~/.opengstack/projects/$SLUG
USER=$(whoami)
DATETIME=$(date +%Y%m%d-%H%M%S)
\`\`\`

Write to \`~/.opengstack/projects/{slug}/{user}-{branch}-ship-test-plan-{datetime}.md\`:

\`\`\`markdown
# Test Plan
Generated by /ship on {date}
Branch: {branch}
Repo: {owner/repo}

## Affected Pages/Routes
- {URL path} — {what to test and why}

## Key Interactions to Verify
- {interaction description} on {page}

## Edge Cases
- {edge case} on {page}

## Critical Paths
- {end-to-end flow that must work}
\`\`\``);
  } else {
    // review mode
    sections.push(`
**Step 5. Generate tests for gaps (Fix-First):**

If test framework is detected and gaps were identified:
- Classify each gap as AUTO-FIX or ASK per the Fix-First Heuristic:
- **AUTO-FIX:** Simple unit tests for pure functions, edge cases of existing tested functions
- **ASK:** E2E tests, tests requiring new test infrastructure, tests for ambiguous behavior
- For AUTO-FIX gaps: generate the test, run it, commit as \`test: coverage for {feature}\`
- For ASK gaps: include in the Fix-First batch question with the other review findings
- For paths marked [→E2E]: always ASK (E2E tests are higher-effort and need user confirmation)
- For paths marked [→EVAL]: always ASK (eval tests need user confirmation on quality criteria)

If no test framework detected → include gaps as INFORMATIONAL findings only, no generation.

**Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit."

### Coverage Warning

After producing the coverage diagram, check the coverage percentage. Read CLAUDE.md for a \`## Test Coverage\` section with a \`Minimum:\` field. If not found, use default: 60%.

If coverage is below the minimum threshold, output a prominent warning **before** the regular review findings:

\`\`\`
⚠️ COVERAGE WARNING: AI-assessed coverage is {X}%. {N} code paths untested.
Consider writing tests before running /ship.
\`\`\`

This is INFORMATIONAL — does not block /review. But it makes low coverage visible early so the developer can address it before reaching the /ship coverage gate.

If coverage percentage cannot be determined, skip the warning silently.`);
  }

  return sections.join('\n');
}
562
+
/**
 * Resolver for the {{TEST_COVERAGE_AUDIT_PLAN}} placeholder (per the section
 * comment above: plan-eng-review — adds missing tests to the plan).
 *
 * @param _ctx - Unused; kept to match the shared resolver signature.
 */
export function generateTestCoverageAuditPlan(_ctx: TemplateContext): string {
  return generateTestCoverageAuditInner('plan');
}
566
+
/**
 * Resolver for the {{TEST_COVERAGE_AUDIT_SHIP}} placeholder (per the section
 * comment above: ship — auto-generates tests plus a coverage summary).
 *
 * @param _ctx - Unused; kept to match the shared resolver signature.
 */
export function generateTestCoverageAuditShip(_ctx: TemplateContext): string {
  return generateTestCoverageAuditInner('ship');
}
570
+
/**
 * Resolver for the {{TEST_COVERAGE_AUDIT_REVIEW}} placeholder (per the section
 * comment above: review — test gaps surface as INFORMATIONAL findings via the
 * Fix-First flow).
 *
 * @param _ctx - Unused; kept to match the shared resolver signature.
 */
export function generateTestCoverageAuditReview(_ctx: TemplateContext): string {
  return generateTestCoverageAuditInner('review');
}
@@ -0,0 +1,45 @@
/** Agent hosts the skill can be installed into (see HOST_PATHS for each host's layout). */
export type Host = 'claude' | 'codex' | 'factory';
2
+
/**
 * Filesystem locations for a host's opengstack assets. Values are plain path
 * strings; for non-claude hosts they can be env-var placeholder strings
 * (see HOST_PATHS).
 */
export interface HostPaths {
  skillRoot: string;      // globally installed skill root
  localSkillRoot: string; // project-local skill root (relative path)
  binDir: string;         // helper binaries directory
  browseDir: string;      // browse assets (claude path points at a dist/ directory)
  designDir: string;      // design assets (claude path points at a dist/ directory)
}
10
+
/**
 * Per-host install layout.
 *
 * - claude: concrete home-relative paths under ~/.claude/skills/opengstack.
 * - codex / factory: skillRoot/binDir/browseDir/designDir are literal
 *   "$OpenGStack_*" placeholder strings — presumably expanded by the host at
 *   runtime (TODO confirm against the consumers of these paths); only
 *   localSkillRoot differs between the two.
 */
export const HOST_PATHS: Record<Host, HostPaths> = {
  claude: {
    skillRoot: '~/.claude/skills/opengstack',
    localSkillRoot: '.claude/skills/opengstack',
    binDir: '~/.claude/skills/opengstack/bin',
    browseDir: '~/.claude/skills/opengstack/browse/dist',
    designDir: '~/.claude/skills/opengstack/design/dist',
  },
  codex: {
    skillRoot: '$OpenGStack_ROOT',
    localSkillRoot: '.agents/skills/opengstack',
    binDir: '$OpenGStack_BIN',
    browseDir: '$OpenGStack_BROWSE',
    designDir: '$OpenGStack_DESIGN',
  },
  factory: {
    skillRoot: '$OpenGStack_ROOT',
    localSkillRoot: '.factory/skills/opengstack',
    binDir: '$OpenGStack_BIN',
    browseDir: '$OpenGStack_BROWSE',
    designDir: '$OpenGStack_DESIGN',
  },
};
34
+
/** Context handed to every ResolverFn when a template is expanded. */
export interface TemplateContext {
  skillName: string;       // name of the skill being rendered
  tmplPath: string;        // path to the template file being expanded
  benefitsFrom?: string[]; // NOTE(review): looks like names of related skills — confirm semantics
  host: Host;              // target host environment
  paths: HostPaths;        // path layout for that host (see HOST_PATHS)
  preambleTier?: number;   // 1-4, controls which preamble sections are included
}
43
+
/**
 * Resolver function signature. args is populated for parameterized
 * placeholders like {{INVOKE_SKILL:name}}; it is omitted (optional) for
 * placeholders that take no parameters.
 */
export type ResolverFn = (ctx: TemplateContext, args?: string[]) => string;