forge-workflow 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/.claude/commands/dev.md +314 -0
  2. package/.claude/commands/plan.md +389 -0
  3. package/.claude/commands/premerge.md +179 -0
  4. package/.claude/commands/research.md +42 -0
  5. package/.claude/commands/review.md +442 -0
  6. package/.claude/commands/rollback.md +721 -0
  7. package/.claude/commands/ship.md +134 -0
  8. package/.claude/commands/sonarcloud.md +152 -0
  9. package/.claude/commands/status.md +77 -0
  10. package/.claude/commands/validate.md +237 -0
  11. package/.claude/commands/verify.md +221 -0
  12. package/.claude/rules/greptile-review-process.md +285 -0
  13. package/.claude/rules/workflow.md +105 -0
  14. package/.claude/scripts/greptile-resolve.sh +526 -0
  15. package/.claude/scripts/load-env.sh +32 -0
  16. package/.forge/hooks/check-tdd.js +240 -0
  17. package/.github/PLUGIN_TEMPLATE.json +32 -0
  18. package/.mcp.json.example +12 -0
  19. package/AGENTS.md +169 -0
  20. package/CLAUDE.md +99 -0
  21. package/LICENSE +21 -0
  22. package/README.md +414 -0
  23. package/bin/forge-cmd.js +313 -0
  24. package/bin/forge-validate.js +303 -0
  25. package/bin/forge.js +4228 -0
  26. package/docs/AGENT_INSTALL_PROMPT.md +342 -0
  27. package/docs/ENHANCED_ONBOARDING.md +602 -0
  28. package/docs/EXAMPLES.md +482 -0
  29. package/docs/GREPTILE_SETUP.md +400 -0
  30. package/docs/MANUAL_REVIEW_GUIDE.md +106 -0
  31. package/docs/ROADMAP.md +359 -0
  32. package/docs/SETUP.md +632 -0
  33. package/docs/TOOLCHAIN.md +849 -0
  34. package/docs/VALIDATION.md +363 -0
  35. package/docs/WORKFLOW.md +400 -0
  36. package/docs/planning/PROGRESS.md +396 -0
  37. package/docs/plans/.gitkeep +0 -0
  38. package/docs/plans/2026-02-27-forge-test-suite-v2-decisions.md +21 -0
  39. package/docs/plans/2026-02-27-forge-test-suite-v2-design.md +362 -0
  40. package/docs/plans/2026-02-27-forge-test-suite-v2-tasks.md +343 -0
  41. package/docs/plans/2026-03-02-superpowers-gaps-decisions.md +26 -0
  42. package/docs/plans/2026-03-02-superpowers-gaps-design.md +239 -0
  43. package/docs/plans/2026-03-02-superpowers-gaps-tasks.md +260 -0
  44. package/docs/plans/2026-03-04-agent-command-parity-design.md +163 -0
  45. package/docs/plans/2026-03-04-verify-worktree-cleanup-decisions.md +7 -0
  46. package/docs/plans/2026-03-04-verify-worktree-cleanup-design.md +165 -0
  47. package/docs/plans/2026-03-05-forge-uto-decisions.md +6 -0
  48. package/docs/plans/2026-03-05-forge-uto-design.md +116 -0
  49. package/docs/plans/2026-03-05-forge-uto-tasks.md +244 -0
  50. package/docs/plans/2026-03-10-command-creator-and-eval-decisions.md +52 -0
  51. package/docs/plans/2026-03-10-command-creator-and-eval-design.md +350 -0
  52. package/docs/plans/2026-03-10-command-creator-and-eval-tasks.md +426 -0
  53. package/docs/plans/2026-03-10-stale-workflow-refs-decisions.md +8 -0
  54. package/docs/plans/2026-03-10-stale-workflow-refs-design.md +80 -0
  55. package/docs/plans/2026-03-10-stale-workflow-refs-tasks.md +90 -0
  56. package/docs/plans/2026-03-14-beads-plan-context-decisions.md +9 -0
  57. package/docs/plans/2026-03-14-beads-plan-context-design.md +171 -0
  58. package/docs/plans/2026-03-14-beads-plan-context-tasks.md +160 -0
  59. package/docs/plans/2026-03-14-skill-eval-loop-decisions.md +33 -0
  60. package/docs/plans/2026-03-14-skill-eval-loop-design.md +118 -0
  61. package/docs/plans/2026-03-14-skill-eval-loop-results.md +78 -0
  62. package/docs/plans/2026-03-14-skill-eval-loop-tasks.md +160 -0
  63. package/docs/plans/2026-03-15-agent-command-parity-v2-decisions.md +11 -0
  64. package/docs/plans/2026-03-15-agent-command-parity-v2-design.md +145 -0
  65. package/docs/plans/2026-03-15-agent-command-parity-v2-tasks.md +211 -0
  66. package/docs/research/TEMPLATE.md +292 -0
  67. package/docs/research/advanced-testing.md +297 -0
  68. package/docs/research/agent-permissions.md +167 -0
  69. package/docs/research/dependency-chain.md +328 -0
  70. package/docs/research/forge-workflow-v2.md +550 -0
  71. package/docs/research/plugin-architecture.md +772 -0
  72. package/docs/research/pr4-cli-automation.md +326 -0
  73. package/docs/research/premerge-verify-restructure.md +205 -0
  74. package/docs/research/skills-restructure.md +508 -0
  75. package/docs/research/sonarcloud-perfection-plan.md +166 -0
  76. package/docs/research/sonarcloud-quality-gate.md +184 -0
  77. package/docs/research/superpowers-integration.md +403 -0
  78. package/docs/research/superpowers.md +319 -0
  79. package/docs/research/test-environment.md +519 -0
  80. package/install.sh +1062 -0
  81. package/lefthook.yml +39 -0
  82. package/lib/agents/README.md +198 -0
  83. package/lib/agents/claude.plugin.json +28 -0
  84. package/lib/agents/cline.plugin.json +22 -0
  85. package/lib/agents/codex.plugin.json +19 -0
  86. package/lib/agents/copilot.plugin.json +24 -0
  87. package/lib/agents/cursor.plugin.json +25 -0
  88. package/lib/agents/kilocode.plugin.json +22 -0
  89. package/lib/agents/opencode.plugin.json +20 -0
  90. package/lib/agents/roo.plugin.json +23 -0
  91. package/lib/agents-config.js +2112 -0
  92. package/lib/commands/dev.js +513 -0
  93. package/lib/commands/plan.js +696 -0
  94. package/lib/commands/recommend.js +119 -0
  95. package/lib/commands/ship.js +377 -0
  96. package/lib/commands/status.js +378 -0
  97. package/lib/commands/validate.js +602 -0
  98. package/lib/context-merge.js +359 -0
  99. package/lib/plugin-catalog.js +360 -0
  100. package/lib/plugin-manager.js +166 -0
  101. package/lib/plugin-recommender.js +141 -0
  102. package/lib/project-discovery.js +491 -0
  103. package/lib/setup.js +118 -0
  104. package/lib/workflow-profiles.js +203 -0
  105. package/package.json +115 -0
@@ -0,0 +1,292 @@
1
+ # Research: [Feature Name]
2
+
3
+ **Date**: YYYY-MM-DD
4
+ **Researcher**: Claude AI
5
+
6
+ ## Objective
7
+ [What we're trying to achieve - clear problem statement and goals]
8
+
9
+ ## Codebase Analysis
10
+
11
+ ### Existing Patterns
12
+ - **File**: `path/to/file.ts`
13
+ - **Pattern**: [Description of existing implementation]
14
+ - **Reusability**: [Yes/No + reasoning]
15
+ - **Lessons learned**: [What worked, what didn't]
16
+
17
+ ### Affected Modules
18
+ - **Module**: [name]
19
+ - **Changes needed**: [Description]
20
+ - **Impact**: [Low/Medium/High]
21
+ - **Dependencies**: [List any dependencies]
22
+
23
+ ### Test Infrastructure
24
+ - **Existing tests**: `path/to/tests/`
25
+ - **Test utilities**: [Available testing tools/helpers]
26
+ - **Coverage**: [Current state - percentage, gaps]
27
+ - **Test patterns**: [What patterns are used - unit, integration, E2E]
28
+
29
+ ## Web Research
30
+
31
+ ### Best Practices (parallel-web-search)
32
+ 1. **Source**: [URL]
33
+ - **Key insight**: [Summary of best practice]
34
+ - **Applicability**: [How it applies to our project]
35
+ - **Decision impact**: [What decision this influences]
36
+ - **Implementation notes**: [How to apply]
37
+
38
+ 2. **Source**: [URL]
39
+ - [...]
40
+
41
+ ### Known Issues (parallel-web-search)
42
+ 1. **Issue**: [Description]
43
+ - **Source**: [GitHub/SO/Blog URL]
44
+ - **Mitigation**: [How to avoid]
45
+ - **Decision impact**: [Changes to approach]
46
+ - **Frequency**: [How common is this issue]
47
+
48
+ 2. **Issue**: [...]
49
+
50
+ ### Library Documentation (Context7)
51
+ 1. **Library**: [name and version]
52
+ - **API**: [Relevant methods/patterns]
53
+ - **Compatibility**: [Version requirements, breaking changes]
54
+ - **Decision impact**: [Implementation details]
55
+ - **Example usage**: [Code snippet]
56
+
57
+ 2. **Library**: [...]
58
+
59
+ ### Case Studies
60
+ 1. **Source**: [URL]
61
+ - **Company/Project**: [Who implemented this]
62
+ - **Scale**: [Production scale, users, data volume]
63
+ - **Lessons**: [What they learned]
64
+ - **Applicability**: [How it relates to our use case]
65
+
66
+ 2. **Source**: [...]
67
+
68
+ ## Key Decisions & Reasoning
69
+
70
+ ### Decision 1: [Decision Title]
71
+ - **Decision**: [What we decided]
72
+ - **Reasoning**: [Why we chose this approach]
73
+ - **Evidence**: [Research that supports this - links to sources]
74
+ - **Alternatives considered**:
75
+ 1. [Alternative 1]: [Why rejected]
76
+ 2. [Alternative 2]: [Why rejected]
77
+ - **Trade-offs**: [What we're giving up, what we're gaining]
78
+ - **Risk**: [Low/Medium/High] - [Risk description]
79
+
80
+ ### Decision 2: [Decision Title]
81
+ - [...]
82
+
83
+ ## TDD Test Scenarios (Identified Upfront)
84
+
85
+ ### Unit Tests
86
+ 1. **Test**: [Scenario description]
87
+ - **File**: `test/path/test.ts`
88
+ - **Function under test**: `functionName()`
89
+ - **Assertions**: [What to verify]
90
+ - **Test data**: [Required test data/mocks]
91
+ - **Edge cases**: [List edge cases to cover]
92
+
93
+ 2. **Test**: [...]
94
+
95
+ ### Integration Tests
96
+ 1. **Test**: [Scenario description]
97
+ - **File**: `test/integration/test.ts`
98
+ - **Components**: [What components are being tested together]
99
+ - **Assertions**: [What to verify]
100
+ - **Test data**: [Database fixtures, API mocks]
101
+
102
+ 2. **Test**: [...]
103
+
104
+ ### E2E Tests
105
+ 1. **Test**: [User flow scenario]
106
+ - **File**: `test/e2e/test.ts`
107
+ - **User flow**: [Step-by-step user actions]
108
+ - **Assertions**: [What user should see/experience]
109
+ - **Test data**: [Complete test environment setup]
110
+
111
+ 2. **Test**: [...]
112
+
113
+ ## Security Analysis (OWASP Top 10 + Feature-Specific)
114
+
115
+ ### OWASP Top 10 Applicability
116
+
117
+ #### A01: Broken Access Control
118
+ - **Risk**: [High/Medium/Low]
119
+ - **Applicable**: [Yes/No]
120
+ - **Mitigation**: [How addressed - RLS policies, permission checks]
121
+ - **Tests**: [Security test scenarios]
122
+ - **Evidence**: [Links to security research]
123
+
124
+ #### A02: Cryptographic Failures
125
+ - **Risk**: [High/Medium/Low]
126
+ - **Applicable**: [Yes/No]
127
+ - **Mitigation**: [Encryption at rest/transit, key management]
128
+ - **Tests**: [Encryption tests]
129
+ - **Compliance**: [Data protection requirements]
130
+
131
+ #### A03: Injection
132
+ - **Risk**: [High/Medium/Low]
133
+ - **Applicable**: [Yes/No]
134
+ - **Mitigation**: [Parameterized queries, input validation, sanitization]
135
+ - **Tests**: [SQL injection tests, XSS tests, command injection tests]
136
+ - **Libraries**: [What libraries help prevent injection]
137
+
138
+ #### A04: Insecure Design
139
+ - **Risk**: [High/Medium/Low]
140
+ - **Threat model**: [Key threats identified]
141
+ - **Secure design patterns**: [Patterns used - zero trust, defense in depth]
142
+ - **Architecture review**: [Security considerations in design]
143
+ - **Tests**: [Security design validation]
144
+
145
+ #### A05: Security Misconfiguration
146
+ - **Risk**: [High/Medium/Low]
147
+ - **Configuration reviewed**: [Yes/No]
148
+ - **Security headers**: [CSP, HSTS, X-Frame-Options, etc.]
149
+ - **Error handling**: [No sensitive info in errors]
150
+ - **Default accounts**: [No default/test credentials]
151
+ - **Tests**: [Configuration security tests]
152
+
153
+ #### A06: Vulnerable Components
154
+ - **Risk**: [High/Medium/Low]
155
+ - **Dependencies scanned**: [Yes/No - tool used]
156
+ - **Known CVEs**: [Count and severity from scan]
157
+ - **Update plan**: [If vulnerabilities found]
158
+ - **Monitoring**: [Dependabot, Snyk, etc.]
159
+ - **Tests**: [Dependency security checks]
160
+
161
+ #### A07: Identification and Authentication Failures
162
+ - **Risk**: [High/Medium/Low]
163
+ - **Auth mechanism**: [OAuth2/JWT/Session/etc.]
164
+ - **Session management**: [Secure/reviewed - timeout, rotation]
165
+ - **Password policy**: [Requirements if applicable]
166
+ - **MFA**: [Required/Optional/Not applicable]
167
+ - **Brute force protection**: [Rate limiting, account lockout]
168
+ - **Tests**: [Authentication tests, session tests]
169
+
170
+ #### A08: Software and Data Integrity Failures
171
+ - **Risk**: [High/Medium/Low]
172
+ - **Integrity checks**: [Where implemented - signatures, checksums]
173
+ - **Code signing**: [Yes/No]
174
+ - **CI/CD security**: [Pipeline reviewed, secrets management]
175
+ - **Supply chain**: [Trusted sources, verification]
176
+ - **Tests**: [Integrity validation tests]
177
+
178
+ #### A09: Security Logging and Monitoring Failures
179
+ - **Risk**: [High/Medium/Low]
180
+ - **Security events logged**: [List what's tracked]
181
+ - **Audit trail**: [What's tracked for compliance]
182
+ - **No sensitive data**: [Verified - no passwords/tokens in logs]
183
+ - **Alerting**: [Security alerts configured]
184
+ - **Log retention**: [Duration and compliance]
185
+ - **Tests**: [Logging tests, no sensitive data tests]
186
+
187
+ #### A10: Server-Side Request Forgery (SSRF)
188
+ - **Risk**: [High/Medium/Low]
189
+ - **External requests**: [Where made in code]
190
+ - **URL validation**: [Whitelist/validation rules]
191
+ - **Network restrictions**: [Firewall rules, VPC]
192
+ - **Input sanitization**: [User-controlled URLs]
193
+ - **Tests**: [SSRF prevention tests]
194
+
195
+ ### Feature-Specific Security Risks
196
+
197
+ 1. **Risk**: [Specific risk for this feature]
198
+ - **Likelihood**: High/Medium/Low
199
+ - **Impact**: High/Medium/Low
200
+ - **Attack vector**: [How this could be exploited]
201
+ - **Mitigation**: [Specific solution]
202
+ - **Evidence**: [Research source showing this risk]
203
+ - **Tests**: [Security test scenarios]
204
+ - **Monitoring**: [How to detect attacks]
205
+
206
+ 2. **Risk**: [Next risk]
207
+ - [...]
208
+
209
+ ### Security Test Scenarios (TDD)
210
+
211
+ 1. **Test**: Unauthorized access attempt should fail
212
+ - **File**: `test/security/access-control.test.ts`
213
+ - **Scenario**: User tries to access another team's data
214
+ - **Expected**: 403 Forbidden, no data leak
215
+
216
+ 2. **Test**: SQL injection attempt should be blocked
217
+ - **File**: `test/security/injection.test.ts`
218
+ - **Scenario**: Malicious input in query parameter
219
+ - **Expected**: Input sanitized, no SQL execution, error logged
220
+
221
+ 3. **Test**: XSS attempt should be sanitized
222
+ - **File**: `test/security/xss.test.ts`
223
+ - **Scenario**: Script tag in user input
224
+ - **Expected**: HTML escaped, no script execution
225
+
226
+ 4. **Test**: [Additional security tests]
227
+ - [...]
228
+
229
+ ## Scope Assessment
230
+
231
+ - **Type**: Tactical / Strategic
232
+ - **Rationale**: [Why this classification]
233
+ - **OpenSpec needed**: Yes / No
234
+
235
+ - **Complexity**: Low / Medium / High
236
+ - **Rationale**: [Number of files, systems involved, dependencies]
237
+ - **Estimated effort**: [Without time, describe scope]
238
+
239
+ - **Parallel opportunity**: Yes / No
240
+ - **Rationale**: [Independent tracks available?]
241
+ - **Tracks**: [If yes, list potential parallel tracks]
242
+
243
+ - **Estimated files**: [Count]
244
+ - **New files**: [List]
245
+ - **Modified files**: [List]
246
+
247
+ - **Dependencies**:
248
+ - **Internal**: [Other features/modules]
249
+ - **External**: [Third-party libraries]
250
+ - **Blockers**: [Any blocking dependencies]
251
+
252
+ - **Security risk level**: Low / Medium / High / Critical
253
+ - **Rationale**: [Based on OWASP analysis]
254
+ - **Mitigation priority**: [When to address]
255
+
256
+ ## Next Steps
257
+
258
+ 1. **If Strategic**: Create OpenSpec proposal
259
+ - `openspec proposal create <feature-slug>`
260
+ - Write proposal.md, tasks.md, design.md
261
+ - Reference this research doc for evidence
262
+
263
+ 2. **Create Beads issue**:
264
+ - `bd create "<feature-name>"`
265
+ - Link to this research doc
266
+ - Link to OpenSpec if strategic
267
+
268
+ 3. **Create branch**:
269
+ - `git checkout -b feat/<feature-slug>`
270
+
271
+ 4. **Proceed to /plan**:
272
+ - Read this research doc
273
+ - Create formal implementation plan
274
+ - Wait for OpenSpec approval if strategic
275
+
276
+ ## Research Checklist
277
+
278
+ - [ ] Codebase exploration complete
279
+ - [ ] parallel-web-search web research complete (multiple sources)
280
+ - [ ] Context7 library documentation reviewed
281
+ - [ ] Case studies analyzed
282
+ - [ ] All key decisions documented with evidence
283
+ - [ ] TDD test scenarios identified upfront
284
+ - [ ] OWASP Top 10 analysis complete
285
+ - [ ] Feature-specific security risks identified
286
+ - [ ] Security test scenarios defined
287
+ - [ ] Scope assessment complete
288
+ - [ ] Next steps clear
289
+
290
+ ---
291
+
292
+ **Note**: This research document serves as the single source of truth for all architectural and implementation decisions. Reference it throughout the development lifecycle (in OpenSpec proposals, PR descriptions, code reviews, and documentation).
@@ -0,0 +1,297 @@
1
+ # Research: PR5 — Advanced Testing Expansion
2
+
3
+ **Date**: 2026-02-20
4
+ **Beads Issue**: forge-01p
5
+ **Status**: Research complete, ready for `/plan`
6
+
7
+ ---
8
+
9
+ ## Objective
10
+
11
+ Expand Forge's testing infrastructure with mutation testing (Stryker), performance benchmarks, extended OWASP security tests (A02, A07), and a test quality dashboard. Build on the foundation from PR3 (808 tests, 80% coverage thresholds, 6-platform CI matrix).
12
+
13
+ ---
14
+
15
+ ## Codebase Analysis
16
+
17
+ ### Current Test Infrastructure
18
+
19
+ | Category | Files | Tests | Location |
20
+ |----------|-------|-------|----------|
21
+ | Unit tests | 35+ | ~500 | `test/` |
22
+ | Edge cases | 12 | ~120 | `test-env/edge-cases/` |
23
+ | Validation helpers | 4+4 | ~52 | `test-env/validation/` |
24
+ | E2E tests | 5 | ~30 | `test/e2e/` |
25
+ | Integration | 1 | ~15 | `test/integration/` |
26
+ | Skills tests | 7 | ~50 | `packages/skills/test/` |
27
+ | CLI structure | 2 | ~10 | `test/cli/` |
28
+ | **Total** | **56** | **808** | — |
29
+
30
+ - **Framework**: Node.js built-in `node:test` + `node:assert/strict` (main), Bun test (skills)
31
+ - **Coverage**: c8 with 80% thresholds (lines, branches, functions, statements)
32
+ - **CI**: 6-platform matrix (ubuntu/macos/windows x Node 20/22) + coverage + E2E jobs
33
+ - **Skipped tests**: 36 instances of `test.skip()` — opportunity to fill gaps
34
+
35
+ ### Critical Gap: `bin/forge.js`
36
+
37
+ The main CLI file (4,407 lines) is **explicitly excluded from c8 coverage**. Only structural tests exist in `test/cli/forge.test.js` (10 tests verifying function existence). No direct execution, prompt handling, or integration tests.
38
+
39
+ ### Existing Security Tests
40
+
41
+ `test-env/edge-cases/security.test.js` covers:
42
+ - Shell injection prevention (`;`, `&&`, `|`, backticks)
43
+ - Path traversal attacks (`../`, `..\\`)
44
+ - Null byte injection
45
+ - Unicode smuggling attacks
46
+
47
+ **Not covered**: Cryptographic failures (OWASP A02), authentication failures (OWASP A07).
48
+
49
+ ---
50
+
51
+ ## Web Research
52
+
53
+ ### 1. Mutation Testing — Stryker
54
+
55
+ **Key findings from [Stryker Mutator docs](https://stryker-mutator.io/docs/stryker-js/guides/nodejs/) and [Sentry's experience](https://sentry.engineering/blog/js-mutation-testing-our-sdks):**
56
+
57
+ #### Configuration for Node.js + node:test
58
+
59
+ Stryker supports a `command` test runner (default) that runs any CLI command and bases results on exit codes. Since there's no dedicated `node:test` runner plugin, we use:
60
+
61
+ ```json
62
+ {
63
+ "testRunner": "command",
64
+ "commandRunner": { "command": "bun test" },
65
+ "mutate": ["lib/**/*.js", "bin/forge.js"],
66
+ "coverageAnalysis": "off",
67
+ "thresholds": { "high": 80, "low": 60, "break": 60 },
68
+ "reporters": ["clear-text", "html", "json"],
69
+ "tempDirName": ".stryker-tmp",
70
+ "cleanTempDir": true,
71
+ "incremental": true,
72
+ "incrementalFile": "stryker-report/stryker-incremental.json"
73
+ }
74
+ ```
75
+
76
+ **Important**: `coverageAnalysis: "off"` is required for the command runner (no per-test optimization). This means ALL tests run for EVERY mutant — expect longer runtimes.
77
+
78
+ #### Performance Considerations
79
+
80
+ Per [Sentry's blog post](https://sentry.engineering/blog/js-mutation-testing-our-sdks):
81
+ - Full mutation testing on large codebases takes 25-60+ minutes
82
+ - **Incremental mode** (`--incremental`) only mutates changed files — critical for CI
83
+ - Switching from Jest to Vitest cut their runtime from 60min to 25min
84
+ - Recommendation: Run full mutation testing nightly/weekly, incremental on PRs
85
+
86
+ #### Recommended Thresholds
87
+
88
+ Per [Stryker docs](https://stryker-mutator.io/docs/stryker-js/configuration/) and [community standards](https://github.com/stryker-mutator/stryker-net/issues/1779):
89
+ - `high: 80` (green) — excellent test quality
90
+ - `low: 60` (yellow) — acceptable but needs improvement
91
+ - `break: 60` (fail build) — minimum acceptable score
92
+ - **Our target**: 70%+ per roadmap, start with `break: 50` and increase iteratively
93
+
94
+ #### Scope Decision
95
+
96
+ Mutating `bin/forge.js` (4,407 lines) would create thousands of mutants and take very long with the command runner. **Recommendation**: Start with `lib/**/*.js` only (smaller, testable modules), add `bin/forge.js` later when it has better test coverage.
97
+
98
+ ### 2. Performance Benchmarking
99
+
100
+ **Key findings from [Medium - Node.js Benchmarks](https://medium.com/@Modexa/node-js-benchmarks-you-can-actually-trust-76dd35aa8ae1):**
101
+
102
+ #### Tools
103
+
104
+ | Tool | Use Case | Notes |
105
+ |------|----------|-------|
106
+ | `node:perf_hooks` | Built-in timing | `performance.now()`, `PerformanceObserver` |
107
+ | `tinybench` | Micro-benchmarks | Lightweight, modern, good for functions |
108
+ | `node --prof` | V8 profiling | CPU profiling, tick analysis |
109
+ | Custom harness | CLI benchmarks | Subprocess spawning + timing |
110
+
111
+ #### What to Benchmark
112
+
113
+ For a CLI tool like Forge:
114
+ 1. **Startup time**: `node bin/forge.js --help` (target: <500ms)
115
+ 2. **Agent detection**: `detectProjectType()` performance
116
+ 3. **Config generation**: AGENTS.md, CLAUDE.md generation speed
117
+ 4. **Package manager detection**: `detectPackageManager()` latency
118
+ 5. **File I/O**: Large project scanning (monorepo fixtures)
119
+
120
+ #### CI Integration
121
+
122
+ - Store benchmark results as JSON artifacts
123
+ - Compare against baselines using custom script
124
+ - Flag regressions >20% as warnings, >50% as failures
125
+ - **GitHub Actions**: Use `actions/upload-artifact` for benchmark reports
126
+
127
+ ### 3. OWASP Security Testing
128
+
129
+ **Key findings from [Node.js Security Best Practices](https://nodejs.org/en/learn/getting-started/security-best-practices):**
130
+
131
+ #### A02: Cryptographic Failures
132
+
133
+ Relevant to Forge:
134
+ - **API key handling**: `.env.local` files with `PARALLEL_API_KEY`, tokens
135
+ - **Token storage**: MCP server configurations with credentials
136
+ - **Path exposure**: Windows absolute paths leaking in generated files
137
+
138
+ Test scenarios:
139
+ 1. Verify API keys are never logged to stdout/stderr
140
+ 2. Verify `.env.local` is in `.gitignore`
141
+ 3. Verify generated configs don't embed plaintext secrets
142
+ 4. Verify token references use environment variables, not literals
143
+ 5. Verify no hardcoded credentials in source code
144
+
145
+ #### A07: Identification & Authentication Failures
146
+
147
+ Relevant to Forge:
148
+ - **GitHub CLI auth**: `gh auth status` validation
149
+ - **Git operations**: Push to protected branches
150
+ - **External service configs**: MCP server authentication
151
+
152
+ Test scenarios:
153
+ 1. Verify `gh auth status` is checked before operations requiring it
154
+ 2. Verify branch protection blocks unauthenticated pushes
155
+ 3. Verify MCP configs reference credential IDs, not inline secrets
156
+ 4. Verify setup warns when auth tokens are missing
157
+ 5. Verify no default/weak credentials in templates
158
+
159
+ ### 4. Test Quality Dashboard
160
+
161
+ **Key metrics to track:**
162
+
163
+ | Metric | Tool | Current | Target |
164
+ |--------|------|---------|--------|
165
+ | Test count | `bun test` output | 808 | Track growth |
166
+ | Code coverage | c8 | 80% threshold | >=80% maintained |
167
+ | Mutation score | Stryker | N/A | >=70% |
168
+ | ESLint warnings | ESLint | 0 | 0 maintained |
169
+ | Skipped tests | grep `test.skip` | 36 | Reduce to <10 |
170
+ | Test runtime | CI timing | ~12s | Track regressions |
171
+ | Flaky rate | CI history | ~0% | 0% |
172
+
173
+ **Implementation approach** (lightweight, CI-integrated):
174
+ - GitHub Actions job that generates a JSON summary after tests
175
+ - Badge updates in README (test count, coverage, mutation score)
176
+ - Artifact upload for trend tracking
177
+ - No external dashboard service needed — keep it in CI
178
+
179
+ ---
180
+
181
+ ## Key Decisions & Reasoning
182
+
183
+ ### D1: Use Stryker command runner (not Jest/Vitest runner)
184
+
185
+ **Decision**: Use `testRunner: "command"` with `bun test`
186
+ **Reasoning**: Project uses `node:test` framework, not Jest/Vitest. No Stryker plugin exists for `node:test`. Command runner works universally.
187
+ **Trade-off**: No per-test optimization (slower), but simpler setup and no framework migration needed.
188
+
189
+ ### D2: Start mutation testing on lib/ only
190
+
191
+ **Decision**: Mutate `lib/**/*.js` first, add `bin/forge.js` in a future PR
192
+ **Reasoning**: `bin/forge.js` is 4,407 lines with limited direct tests. Mutating it would create thousands of slow-to-test mutants. `lib/` modules are smaller and have better test coverage.
193
+ **Evidence**: Sentry's experience shows starting with well-tested modules gives actionable results faster.
194
+
195
+ ### D3: Use tinybench for performance benchmarks
196
+
197
+ **Decision**: `tinybench` for function-level benchmarks, subprocess spawning + `performance.now()` for CLI-level benchmarks
198
+ **Reasoning**: Zero dependencies for CLI timing, tinybench is lightweight (18KB) for micro-benchmarks. No need for heavy frameworks.
199
+
200
+ ### D4: Lightweight dashboard via CI artifacts + badges
201
+
202
+ **Decision**: Generate test quality JSON in CI, update README badges
203
+ **Reasoning**: No external service dependency. GitHub Actions artifacts provide history. Badges give at-a-glance status.
204
+ **Alternative rejected**: External dashboard tools (Grafana, Datadog) — overkill for this project size.
205
+
206
+ ### D5: Incremental mutation testing in CI
207
+
208
+ **Decision**: Run incremental Stryker on PRs, full run weekly
209
+ **Reasoning**: Full mutation testing takes 25-60+ minutes. Incremental mode only tests changed files, keeping PR checks fast.
210
+ **Evidence**: Standard practice per Stryker docs and Sentry's production experience.
211
+
212
+ ---
213
+
214
+ ## TDD Test Scenarios
215
+
216
+ ### Mutation Testing Tests (`test/mutation-config.test.js`)
217
+
218
+ 1. Stryker config file exists and is valid JSON
219
+ 2. Mutate patterns include `lib/**/*.js`
220
+ 3. Thresholds are set (high: 80, low: 60, break: 50)
221
+ 4. Incremental mode is enabled
222
+ 5. HTML reporter is configured for artifact upload
223
+ 6. `test:mutation` script exists in package.json
224
+ 7. Stryker report directory is in `.gitignore`
225
+
226
+ ### Performance Benchmark Tests (`test/benchmarks.test.js`)
227
+
228
+ 1. CLI startup completes in <2000ms (conservative CI-safe bound; local target remains <500ms)
229
+ 2. `detectPackageManager()` completes in <500ms
230
+ 3. Agent detection for standard project completes in <1000ms
231
+ 4. Benchmark results file is generated as valid JSON
232
+ 5. `test:benchmark` script exists in package.json
233
+
234
+ ### OWASP A02 Security Tests (`test-env/edge-cases/crypto-security.test.js`)
235
+
236
+ 1. API keys are never in generated output files
237
+ 2. `.env.local` pattern is in `.gitignore`
238
+ 3. Generated AGENTS.md doesn't contain plaintext tokens
239
+ 4. MCP config uses credential references, not inline secrets
240
+ 5. Source code has no hardcoded API keys (regex scan)
241
+ 6. Token environment variables use descriptive names
242
+
243
+ ### OWASP A07 Auth Tests (`test-env/edge-cases/auth-security.test.js`)
244
+
245
+ 1. Prerequisites check validates `gh auth status`
246
+ 2. Branch protection script blocks unauthenticated scenarios
247
+ 3. Setup flow warns on missing auth tokens
248
+ 4. No default credentials in any template file
249
+ 5. OAuth/token patterns reference env vars only
250
+
251
+ ### Test Dashboard Tests (`test/test-dashboard.test.js`)
252
+
253
+ 1. Dashboard generation script exists
254
+ 2. Output JSON has required metrics fields
255
+ 3. Badge URLs are valid shields.io format
256
+ 4. CI workflow includes dashboard generation step
257
+
258
+ ---
259
+
260
+ ## Security Analysis (OWASP Top 10)
261
+
262
+ | Risk | Relevance | Current Coverage | PR5 Action |
263
+ |------|-----------|-----------------|------------|
264
+ | A01: Broken Access Control | Medium | Branch protection tests | Maintain |
265
+ | **A02: Cryptographic Failures** | **High** | **None** | **Add 6+ tests** |
266
+ | A03: Injection | High | Shell injection tests | Maintain |
267
+ | A04: Insecure Design | Low | Architecture tests | N/A |
268
+ | A05: Security Misconfiguration | Medium | Config validation | Maintain |
269
+ | A06: Vulnerable Components | Medium | `npm audit` in CI | Maintain |
270
+ | **A07: Identification/Auth** | **Medium** | **Partial (gh auth)** | **Add 5+ tests** |
271
+ | A08: Software/Data Integrity | Low | Commitlint, CODEOWNERS | Maintain |
272
+ | A09: Logging/Monitoring | Low | N/A for CLI | N/A |
273
+ | A10: SSRF | Low | N/A for CLI | N/A |
274
+
275
+ ---
276
+
277
+ ## Scope Assessment
278
+
279
+ - **Classification**: Tactical (concrete testing improvements, no architecture changes)
280
+ - **Complexity**: Medium (4 parallel workstreams, each independent)
281
+ - **Timeline**: 2-3 days per roadmap
282
+ - **Parallelization**: All 4 deliverables can be developed independently
283
+ - **Risk**: Low (additive only, no breaking changes)
284
+
285
+ ---
286
+
287
+ ## Sources
288
+
289
+ - [Stryker Node.js Guide](https://stryker-mutator.io/docs/stryker-js/guides/nodejs/)
290
+ - [Stryker Configuration Reference](https://stryker-mutator.io/docs/stryker-js/configuration/)
291
+ - [Stryker Getting Started](https://stryker-mutator.io/docs/stryker-js/getting-started/)
292
+ - [Sentry: Mutation-testing our JavaScript SDKs](https://sentry.engineering/blog/js-mutation-testing-our-sdks) (Aug 2024)
293
+ - [Mutation Testing with Stryker - DEV Community](https://dev.to/lucaspereiradesouzat/mutation-testing-with-stryker-1p4a) (Dec 2025)
294
+ - [Introducing Mutation Testing in Vue.js with StrykerJS](https://medium.com/accor-digital-and-tech/introducing-mutation-testing-in-vue-js-with-strykerjs-e1083afe7326) (Nov 2025)
295
+ - [Node.js Benchmarks You Can Actually Trust](https://medium.com/@Modexa/node-js-benchmarks-you-can-actually-trust-76dd35aa8ae1) (Jan 2026)
296
+ - [Node.js Security Best Practices](https://nodejs.org/en/learn/getting-started/security-best-practices) (Official)
297
+ - [Stryker Dashboard](https://dashboard.stryker-mutator.io) — community mutation score hosting