forge-workflow 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/.claude/commands/dev.md +314 -0
  2. package/.claude/commands/plan.md +389 -0
  3. package/.claude/commands/premerge.md +179 -0
  4. package/.claude/commands/research.md +42 -0
  5. package/.claude/commands/review.md +442 -0
  6. package/.claude/commands/rollback.md +721 -0
  7. package/.claude/commands/ship.md +134 -0
  8. package/.claude/commands/sonarcloud.md +152 -0
  9. package/.claude/commands/status.md +77 -0
  10. package/.claude/commands/validate.md +237 -0
  11. package/.claude/commands/verify.md +221 -0
  12. package/.claude/rules/greptile-review-process.md +285 -0
  13. package/.claude/rules/workflow.md +105 -0
  14. package/.claude/scripts/greptile-resolve.sh +526 -0
  15. package/.claude/scripts/load-env.sh +32 -0
  16. package/.forge/hooks/check-tdd.js +240 -0
  17. package/.github/PLUGIN_TEMPLATE.json +32 -0
  18. package/.mcp.json.example +12 -0
  19. package/AGENTS.md +169 -0
  20. package/CLAUDE.md +99 -0
  21. package/LICENSE +21 -0
  22. package/README.md +414 -0
  23. package/bin/forge-cmd.js +313 -0
  24. package/bin/forge-validate.js +303 -0
  25. package/bin/forge.js +4228 -0
  26. package/docs/AGENT_INSTALL_PROMPT.md +342 -0
  27. package/docs/ENHANCED_ONBOARDING.md +602 -0
  28. package/docs/EXAMPLES.md +482 -0
  29. package/docs/GREPTILE_SETUP.md +400 -0
  30. package/docs/MANUAL_REVIEW_GUIDE.md +106 -0
  31. package/docs/ROADMAP.md +359 -0
  32. package/docs/SETUP.md +632 -0
  33. package/docs/TOOLCHAIN.md +849 -0
  34. package/docs/VALIDATION.md +363 -0
  35. package/docs/WORKFLOW.md +400 -0
  36. package/docs/planning/PROGRESS.md +396 -0
  37. package/docs/plans/.gitkeep +0 -0
  38. package/docs/plans/2026-02-27-forge-test-suite-v2-decisions.md +21 -0
  39. package/docs/plans/2026-02-27-forge-test-suite-v2-design.md +362 -0
  40. package/docs/plans/2026-02-27-forge-test-suite-v2-tasks.md +343 -0
  41. package/docs/plans/2026-03-02-superpowers-gaps-decisions.md +26 -0
  42. package/docs/plans/2026-03-02-superpowers-gaps-design.md +239 -0
  43. package/docs/plans/2026-03-02-superpowers-gaps-tasks.md +260 -0
  44. package/docs/plans/2026-03-04-agent-command-parity-design.md +163 -0
  45. package/docs/plans/2026-03-04-verify-worktree-cleanup-decisions.md +7 -0
  46. package/docs/plans/2026-03-04-verify-worktree-cleanup-design.md +165 -0
  47. package/docs/plans/2026-03-05-forge-uto-decisions.md +6 -0
  48. package/docs/plans/2026-03-05-forge-uto-design.md +116 -0
  49. package/docs/plans/2026-03-05-forge-uto-tasks.md +244 -0
  50. package/docs/plans/2026-03-10-command-creator-and-eval-decisions.md +52 -0
  51. package/docs/plans/2026-03-10-command-creator-and-eval-design.md +350 -0
  52. package/docs/plans/2026-03-10-command-creator-and-eval-tasks.md +426 -0
  53. package/docs/plans/2026-03-10-stale-workflow-refs-decisions.md +8 -0
  54. package/docs/plans/2026-03-10-stale-workflow-refs-design.md +80 -0
  55. package/docs/plans/2026-03-10-stale-workflow-refs-tasks.md +90 -0
  56. package/docs/plans/2026-03-14-beads-plan-context-decisions.md +9 -0
  57. package/docs/plans/2026-03-14-beads-plan-context-design.md +171 -0
  58. package/docs/plans/2026-03-14-beads-plan-context-tasks.md +160 -0
  59. package/docs/plans/2026-03-14-skill-eval-loop-decisions.md +33 -0
  60. package/docs/plans/2026-03-14-skill-eval-loop-design.md +118 -0
  61. package/docs/plans/2026-03-14-skill-eval-loop-results.md +78 -0
  62. package/docs/plans/2026-03-14-skill-eval-loop-tasks.md +160 -0
  63. package/docs/plans/2026-03-15-agent-command-parity-v2-decisions.md +11 -0
  64. package/docs/plans/2026-03-15-agent-command-parity-v2-design.md +145 -0
  65. package/docs/plans/2026-03-15-agent-command-parity-v2-tasks.md +211 -0
  66. package/docs/research/TEMPLATE.md +292 -0
  67. package/docs/research/advanced-testing.md +297 -0
  68. package/docs/research/agent-permissions.md +167 -0
  69. package/docs/research/dependency-chain.md +328 -0
  70. package/docs/research/forge-workflow-v2.md +550 -0
  71. package/docs/research/plugin-architecture.md +772 -0
  72. package/docs/research/pr4-cli-automation.md +326 -0
  73. package/docs/research/premerge-verify-restructure.md +205 -0
  74. package/docs/research/skills-restructure.md +508 -0
  75. package/docs/research/sonarcloud-perfection-plan.md +166 -0
  76. package/docs/research/sonarcloud-quality-gate.md +184 -0
  77. package/docs/research/superpowers-integration.md +403 -0
  78. package/docs/research/superpowers.md +319 -0
  79. package/docs/research/test-environment.md +519 -0
  80. package/install.sh +1062 -0
  81. package/lefthook.yml +39 -0
  82. package/lib/agents/README.md +198 -0
  83. package/lib/agents/claude.plugin.json +28 -0
  84. package/lib/agents/cline.plugin.json +22 -0
  85. package/lib/agents/codex.plugin.json +19 -0
  86. package/lib/agents/copilot.plugin.json +24 -0
  87. package/lib/agents/cursor.plugin.json +25 -0
  88. package/lib/agents/kilocode.plugin.json +22 -0
  89. package/lib/agents/opencode.plugin.json +20 -0
  90. package/lib/agents/roo.plugin.json +23 -0
  91. package/lib/agents-config.js +2112 -0
  92. package/lib/commands/dev.js +513 -0
  93. package/lib/commands/plan.js +696 -0
  94. package/lib/commands/recommend.js +119 -0
  95. package/lib/commands/ship.js +377 -0
  96. package/lib/commands/status.js +378 -0
  97. package/lib/commands/validate.js +602 -0
  98. package/lib/context-merge.js +359 -0
  99. package/lib/plugin-catalog.js +360 -0
  100. package/lib/plugin-manager.js +166 -0
  101. package/lib/plugin-recommender.js +141 -0
  102. package/lib/project-discovery.js +491 -0
  103. package/lib/setup.js +118 -0
  104. package/lib/workflow-profiles.js +203 -0
  105. package/package.json +115 -0
@@ -0,0 +1,350 @@
1
+ # Design: Command Creator & Eval
2
+
3
+ - **Slug**: command-creator-and-eval
4
+ - **Date**: 2026-03-10
5
+ - **Status**: Draft
6
+ - **Branch**: feat/command-creator-and-eval
7
+ - **Worktree**: .worktrees/command-creator-and-eval
8
+
9
+ ---
10
+
11
+ ## Purpose
12
+
13
+ Forge ships 11 slash commands (`.claude/commands/*.md`) and 6 skills (`skills/*/SKILL.md`). Currently there is:
14
+ - **No way to test commands** — HARD-GATE enforcement, dead references, cross-command contracts are unchecked
15
+ - **No way to test skills** — trigger accuracy and output quality are unmeasured
16
+ - **No automated sync** — 8 agents each need adapted command files, but changes to canonical commands require manual propagation
17
+ - **No improvement loop** — when a command or skill has issues, fixing is ad-hoc with no before/after measurement
18
+
19
+ Real bugs exist today: `/status` references `openspec list` (removed), `/rollback` still says 9 stages, `GEMINI.md` has `/merge` instead of `/premerge`. These are caught by humans reading files, not by any automated check.
20
+
21
+ **Goal**: Ship infrastructure to validate, sync, and iteratively improve both commands and skills — with a one-source-of-truth sync mechanism so one file change updates all agents.
22
+
23
+ ---
24
+
25
+ ## Success Criteria
26
+
27
+ ### PR-A: Static Command Validator + Sync Infrastructure
28
+ 1. `forge check-agents` CLI command exists and passes on clean repo
29
+ 2. Static validator catches: dead references (e.g., `openspec list`), stale stage names (`/check` vs `/validate`), missing HARD-GATE blocks, inconsistent stage counts
30
+ 3. Cross-command contract tests verify: /plan output matches /dev input expectations, /dev output matches /validate expectations, etc.
31
+ 4. `scripts/sync-commands.js` reads canonical `.claude/commands/*.md` → generates agent-specific adapter files for all 8 supported agents
32
+ 5. `forge check-agents --sync-check` verifies all agent files are in sync with canonical source
33
+ 6. Works on Windows (bash/Git Bash compatible)
34
+
35
+ ### PR-B: Command Improvement Loop (Scope B → C)
36
+ 1. Grader agent (adapted from skill-creator) evaluates command execution transcripts against expectations
37
+ 2. `run_eval.sh` runs a command in a disposable worktree, captures transcript, grades output
38
+ 3. At least 3 eval scenarios per command (happy path, error path, edge case)
39
+ 4. First targets: `/status` and `/validate` (simplest to eval — deterministic output)
40
+ 5. `improve_command.py` (Scope C): analyzes failures, proposes command rewrites, re-tests, compares before/after
41
+ 6. User approval gate before any command modification is applied
42
+
43
+ ### PR-C: Skill Optimization
44
+ 1. Eval loop runs on all 6 skills in `skills/` using installed skill-creator plugin patterns
45
+ 2. Trigger accuracy measured: does Claude invoke the skill when it should? Does it NOT invoke when it shouldn't?
46
+ 3. At least 5 test queries per skill (3 should-trigger, 2 should-not-trigger)
47
+ 4. Description improvement loop with train/test split (60/40) to prevent overfitting
48
+ 5. Before/after benchmark comparison for each skill
49
+
50
+ ---
51
+
52
+ ## Out of Scope
53
+
54
+ - Creating new commands or skills (only testing/improving existing ones)
55
+ - Cross-agent behavioral testing (testing if Cursor/OpenCode/etc. actually execute commands correctly — that's runtime testing, not config validation)
56
+ - Merging forge-2w3 or forge-ctc (separate work streams)
57
+ - Modifying the 7-stage workflow itself
58
+
59
+ ---
60
+
61
+ ## Dependencies
62
+
63
+ ```
64
+ forge-ctc (in_progress) ← stale ref cleanup, running in parallel session
65
+ ↓ blocks
66
+ forge-2w3 (in_progress) ← agent command parity (70+ adapter files)
67
+ ↓ blocks
68
+ forge-agr ← fix global CLAUDE.md
69
+
70
+ PR-A ← no blockers, can start now
71
+ ↓ enhances
72
+ forge-2w3 ← sync script makes adapter generation trivial
73
+
74
+ PR-B ← depends on PR-A (uses validator infrastructure)
75
+ PR-C ← no blockers, parallel with everything
76
+ ```
77
+
78
+ **Ship order**: PR-A and PR-C ship first (no deps). PR-B ships after PR-A. forge-2w3 uses PR-A's sync script when unblocked.
79
+
80
+ ---
81
+
82
+ ## Approach Selected
83
+
84
+ ### Architecture: One Source of Truth + Adapter Sync
85
+
86
+ **Canonical source**: `.claude/commands/*.md` (already exists, 11 files)
87
+
88
+ **Sync mechanism**: `scripts/sync-commands.js` — reads each canonical command and generates agent-specific files with correct frontmatter, extension, and directory path.
89
+
90
+ Why `.claude/commands/` stays canonical (not a new `commands/` dir):
91
+ - Already exists with full content
92
+ - Claude Code is the primary development agent
93
+ - Moving would break existing workflows for zero benefit
94
+
95
+ **Adapter transforms per agent**:
96
+
97
+ | Agent | Directory | Extension | Frontmatter Transform | Tier |
98
+ |-------|-----------|-----------|----------------------|------|
99
+ | Claude Code | `.claude/commands/` | `.md` | None (canonical) | 1 |
100
+ | Cursor | `.cursor/skills/<name>/` | `.md` | Strip all frontmatter (migrated from commands to skills in v2.4+) | 1 |
101
+ | Cline | `.clinerules/workflows/` | `.md` | Strip all frontmatter | 1 |
102
+ | OpenCode | `.opencode/commands/` | `.md` | Keep `description:` | 1 |
103
+ | GitHub Copilot | `.github/prompts/` | `.prompt.md` | Add `name:`, `description:`, `tools:` | 1 |
104
+ | Kilo Code | `.kilocode/workflows/` | `.md` | Keep `description:`, add `mode: code` | 2 |
105
+ | Roo Code | `.roo/commands/` | `.md` | Keep `description:`, add `mode: code` | 2 |
106
+ | Codex | `.codex/skills/<name>/` | `SKILL.md` | Single combined file (special case) | 2 |
107
+
108
+ **Tier 1** (5 agents): Full workflow — commands + PreToolUse hooks + MCP + subagents. Can enforce HARD-GATEs.
109
+ **Tier 2** (3 agents): Partial — commands + MCP + subagents, but no PreToolUse hooks. Commands work as prompts but can't enforce gates.
110
+
111
+ **Dropped agents** (with rationale):
112
+ - Antigravity (Google): Not in AGENTS.md, not actively maintained (PR #54)
113
+ - Windsurf: Dropped in PR #54
114
+ - Continue: CLI still in alpha, no hooks, no in-conversation subagents
115
+ - Blackbox AI: No custom command directory, no hooks, no third-party MCP support
116
+
117
+ ### Static Validator: grep-based, no AI runtime
118
+
119
+ Pattern checks (all regex/grep):
120
+ 1. **Dead references**: Scan for strings that reference removed features (`openspec`, `/merge`, `/check` as stage name)
121
+ 2. **HARD-GATE structure**: Every command that claims HARD-GATEs has matching open/close blocks
122
+ 3. **Stage count consistency**: All files agree on 7 stages
123
+ 4. **Cross-command contracts**: /plan mentions output files that /dev expects as input
124
+ 5. **Sync drift**: Compare canonical vs adapted files (content hash minus frontmatter)
125
+
126
+ ### Behavioral Eval: Adapted from skill-creator
127
+
128
+ Reuse from skill-creator plugin (90% compatible):
129
+ - **Grader agent** (`agents/grader.md`): evaluates transcripts against assertions
130
+ - **Schemas** (`references/schemas.md`): `evals.json`, `grading.json` adapted for commands
131
+ - **Viewer**: HTML report generation
132
+
133
+ New for commands:
134
+ - **Disposable worktree execution**: Each eval runs `claude -p "/command-name" --output-format stream-json` inside a temp worktree
135
+ - **HARD-GATE assertion type**: "Did the command stop when the gate condition was unmet?"
136
+ - **Contract assertions**: "Does /plan's output contain a task list file that /dev would find?"
137
+
138
+ ### Claude CLI Eval Execution (Research Findings)
139
+
140
+ **Invocation**: `claude -p "/command" --output-format stream-json --verbose --no-session-persistence --max-turns N`
141
+
142
+ **Stream-json output**: NDJSON (one JSON object per line). Key event types:
143
+ - `assistant` — complete message with `content[]` array (text + tool_use blocks)
144
+ - `stream_event` — incremental events (content_block_start/delta/stop)
145
+ - `result` — final result when agent finishes
146
+
147
+ **Detecting tool calls**: Parse `assistant` events → `content[].type == "tool_use"` → `name` + `input`
148
+
149
+ **Windows compatibility**: skill-creator uses `select.select()` on subprocess pipes, which only works on Unix (on Windows, `select` accepts sockets only). **Fix**: Use threading-based reader with `queue.Queue` for portable pipe reading.
150
+
151
+ **Critical env var**: Must strip `CLAUDECODE` from subprocess env to allow nested `claude -p` calls.
152
+
153
+ **Built-in worktree**: `claude --worktree <name>` creates disposable worktree automatically. Auto-cleaned if no changes. Alternative to manual `git worktree add`.
154
+
155
+ **Eval set format** (adapted from skill-creator):
156
+ ```json
157
+ [
158
+ {"command": "/status", "scenario": "clean_repo", "assertions": ["lists beads", "shows branch"]},
159
+ {"command": "/validate", "scenario": "failing_tests", "assertions": ["reports test failures", "does NOT declare success"]}
160
+ ]
161
+ ```
162
+
163
+ ### Improvement Loop (Scope C)
164
+
165
+ Adapted from skill-creator's `improve_description.py`:
166
+ - Analyze eval failures
167
+ - Use Claude with extended thinking to propose command rewrites
168
+ - Re-run evals on proposed rewrite
169
+ - Compare before/after scores
170
+ - **User approval gate** before applying any change (never auto-modify)
171
+
172
+ ---
173
+
174
+ ## Constraints
175
+
176
+ - **Windows compatible**: All scripts must work in Git Bash on Windows
177
+ - **No `select.select()` on pipes**: skill-creator's `run_eval.py` uses this (Unix-only) — our adaptation must read subprocess output via a threading-based reader (`queue.Queue`) with a timeout instead
178
+ - **User approval for modifications**: Scope C improvement loop NEVER auto-applies changes
179
+ - **Deterministic-first targets**: Start with commands that have measurable output (/status, /validate) before attempting subjective ones (/plan, /dev)
180
+ - **No new dependencies**: Use existing tools (bun, bash, gh CLI, claude CLI)
181
+
182
+ ---
183
+
184
+ ## Edge Cases
185
+
186
+ 1. **Command references skill that doesn't exist**: /plan references `parallel-web-search` — validator should verify the skill exists in `skills/`
187
+ 2. **Circular cross-command deps**: /review references /validate which references /dev — contract checker must handle cycles
188
+ 3. **Agent doesn't support all 11 commands**: Some agents may only get a subset — sync script reads agent capabilities from `lib/agents/*.plugin.json`
189
+ 4. **Frontmatter extraction fails**: Canonical command has non-standard frontmatter — sync script should error clearly, not silently produce broken files
190
+ 5. **Worktree already exists during eval**: Eval creates temp worktrees — must handle cleanup on failure and concurrent runs
191
+ 6. **Cursor skills migration**: Cursor v2.4+ uses `.cursor/skills/` not `.cursor/commands/` — sync script must generate skills directory structure, not flat files
192
+
193
+ ---
194
+
195
+ ## Ambiguity Policy
196
+
197
+ **One source of truth resolves ambiguity**: When there's a question about what a command should do, the canonical `.claude/commands/*.md` file is authoritative. All adapted files must match.
198
+
199
+ For implementation decisions:
200
+ - **Low-risk** (formatting, file organization): Make reasonable choice, document in commit message
201
+ - **Scope-changing** (new assertion types, changing which commands to target): Pause and ask user
202
+
203
+ ---
204
+
205
+ ## Beads Integration
206
+
207
+ ### New Issues to Create
208
+
209
+ | ID | Title | Type | Priority | Depends On |
210
+ |----|-------|------|----------|------------|
211
+ | forge-jfw | PR-A: Static command validator + sync infrastructure | feature | P1 | None |
212
+ | forge-agp | PR-B: Command behavioral eval + improvement loop | feature | P2 | forge-jfw |
213
+ | forge-1jx | PR-C: Skill optimization via eval loop | feature | P2 | None |
214
+
215
+ ### Existing Issues Affected
216
+
217
+ | ID | How Affected |
218
+ |----|-------------|
219
+ | forge-2w3 | PR-A's sync script replaces manual adapter creation (Tasks 3-11) |
220
+ | forge-30k | PR-A's static validator overlaps with doc link checker — may merge or share infra |
221
+ | forge-ctc | Must complete before forge-2w3 can use sync script |
222
+
223
+ ---
224
+
225
+ ## Technical Research
226
+
227
+ ### Confirmed Agent Command Formats (8 agents)
228
+
229
+ | Agent | Directory | Extension | Required Frontmatter | Optional Frontmatter | Source |
230
+ |-------|-----------|-----------|---------------------|---------------------|--------|
231
+ | Claude Code | `.claude/commands/` | `.md` | `description` | — | Official docs |
232
+ | Cursor | `.cursor/skills/<name>/` | `.md` | **None** (skills are plain markdown) | — | [cursor.com/docs/context/commands](https://cursor.com/docs/context/commands) |
233
+ | Cline | `.clinerules/workflows/` | `.md` | None | `description`, `author`, `version`, `globs`, `tags` | [docs.cline.bot/features/slash-commands/workflows](https://docs.cline.bot/features/slash-commands/workflows) |
234
+ | OpenCode | `.opencode/commands/` | `.md` | `description` | `agent`, `model`, `subtask` | [opencode.ai/docs/commands](https://opencode.ai/docs/commands/) |
235
+ | GitHub Copilot | `.github/prompts/` | `.prompt.md` | None strictly | `name`, `description`, `agent`, `tools`, `model` | [code.visualstudio.com/docs/copilot/customization/prompt-files](https://code.visualstudio.com/docs/copilot/customization/prompt-files) |
236
+ | Kilo Code | `.kilocode/workflows/` | `.md` | `description` | `arguments`, `mode`, `model` | [kilo.ai/docs/features/slash-commands/workflows](https://kilo.ai/docs/features/slash-commands/workflows) |
237
+ | Roo Code | `.roo/commands/` | `.md` | `description` | `argument-hint`, `mode` | [docs.roocode.com/features/slash-commands](https://docs.roocode.com/features/slash-commands) |
238
+ | Codex | `.codex/skills/<name>/` | `SKILL.md` | `name`, `description` | — | [developers.openai.com/codex/skills](https://developers.openai.com/codex/skills/) |
239
+
240
+ **Key findings:**
241
+ - Cursor migrated from `.cursor/commands/` to `.cursor/skills/` in v2.4+ (use `/migrate-to-skills`)
242
+ - Cursor is the only agent with NO frontmatter support — sync strips everything
243
+ - `description` is the universal common field across all agents that support frontmatter
244
+ - 8 agents total (dropped: Antigravity, Windsurf, Continue, Blackbox AI)
245
+ - No documented content length limits for any agent
246
+
247
+ ### Agent Capability Tiers (New Research)
248
+
249
+ Full capability matrix verified from official documentation (March 2026):
250
+
251
+ | Capability | Claude Code | Cursor | Cline | OpenCode | Copilot | Kilo Code | Roo Code | Codex |
252
+ |---|---|---|---|---|---|---|---|---|
253
+ | **Custom commands** | `.claude/commands/` | `.cursor/skills/` | `.clinerules/workflows/` | `.opencode/commands/` | `.github/prompts/` | `.kilocode/workflows/` | `.roo/commands/` | Skills (`SKILL.md`) |
254
+ | **PreToolUse hooks** | Yes | Yes (can deny) | Yes (can cancel) | Yes (plugin system) | Yes (`.github/hooks/`) | No (permission config only) | No (file-event hooks only) | No (approval modes only) |
255
+ | **Subagents** | Yes (full) | Yes (8 parallel) | Yes (read-only) | Yes (`@general`, `@explore`) | Yes (sequential IDE, parallel `/fleet`) | Yes (Orchestrator + 4 parallel) | Yes (Orchestrator, sequential) | Yes (worktree-based, experimental) |
256
+ | **MCP (3rd party)** | Yes | Yes | Yes + Marketplace | Yes (local + remote + OAuth) | Yes | Yes + Marketplace | Yes + Marketplace | Yes |
257
+ | **Always-on rules** | `.claude/rules/` | `.cursor/rules/*.mdc` | `.clinerules/*.md` | `AGENTS.md` | `.github/instructions/` | `.kilocode/rules/` + `AGENTS.md` | `.roo/rules/` | `AGENTS.md` |
258
+ | **Custom modes** | No | No | No | Yes (plan, build) | No | Yes (5 built-in + custom) | Yes (5 built-in + custom) | No |
259
+ | **Skills** | Yes | Yes | No | Yes (`SKILL.md`) | No | Yes (`SKILL.md`) | Yes | Yes (`SKILL.md`) |
260
+
261
+ **Tier 1 — Full workflow enforcement** (commands + hooks + MCP): Claude Code, Cursor, Cline, OpenCode, GitHub Copilot
262
+ **Tier 2 — Partial workflow** (commands + MCP, no hook enforcement): Kilo Code, Roo Code, Codex
263
+
264
+ **Sources**: cursor.com/docs, docs.cline.bot, opencode.ai/docs, docs.github.com, kilo.ai/docs, docs.roocode.com, developers.openai.com/codex
265
+
266
+ ### Existing Linting Tools (Critical Discovery)
267
+
268
+ | Tool | Scope | Install | Rules |
269
+ |------|-------|---------|-------|
270
+ | **agnix** | Multi-agent (10+ formats) | `npx agnix .` or `cargo install agnix-cli` | 231 rules from official specs. CLAUDE.md, AGENTS.md, SKILL.md, hooks, MCP, Cursor rules, Copilot prompts, Cline rules, Windsurf rules. Supports `--fix`, `--strict`, watch mode, JSON/SARIF output. |
271
+ | **cclint** | Claude Code only | `npx @carlrannaberg/cclint` | Agent frontmatter, command definitions, tool permissions, hooks, CLAUDE.md best practices. Custom Zod schemas. |
272
+
273
+ **Impact on PR-A**: Instead of building `forge check-agents` from scratch, evaluate using **agnix** as the base validator and adding Forge-specific checks on top (cross-command contracts, sync drift, dead Forge-specific references like `openspec`, stage count consistency).
274
+
275
+ ### DRY Check Results — Existing Reusable Code
276
+
277
+ | File | What It Does | Reuse For |
278
+ |------|-------------|-----------|
279
+ | `test/structural/command-files.test.js` | Validates command files (existence, truncation, HARD-GATE counts, balanced code blocks) | **Extend** with frontmatter validation, dead ref checks, sync drift |
280
+ | `lib/plugin-manager.js` | Loads/validates `lib/agents/*.plugin.json` with schema validation | Agent capability detection for sync script |
281
+ | `scripts/behavioral-judge.sh` | Frontmatter extraction via grep, `check-lock-sync` subcommand | Pattern for YAML parsing in sync script |
282
+ | `lib/agents-config.js` | Agent metadata generation | Template for sync script structure |
283
+ | `.github/workflows/detect-command-file-changes.yml` | CI trigger on `.claude/commands/**` changes | Trigger sync validation in CI |
284
+ | `.github/workflows/check-agentic-workflow-sync.yml` | MD ↔ LOCK.yml sync validation | Model for cross-file validation |
285
+
286
+ **DRY conclusion**: PR-A's static validator should extend `test/structural/command-files.test.js`, not create a new file. The sync script and cross-reference checker are genuinely new.
287
+
288
+ ### OWASP Top 10 Analysis
289
+
290
+ | Category | Risk | Applies? | Mitigation |
291
+ |----------|------|----------|------------|
292
+ | A01 Broken Access Control | Sync overwrites user customizations | Low | Warn before overwriting modified files; `--force` flag required |
293
+ | A02 Cryptographic Failures | — | N/A | — |
294
+ | A03 Injection | `run_eval.sh` passes command names to shell | **Medium** | Validate names against the anchored regex `^[a-z-]+$` (whole-string match, so trailing shell metacharacters are rejected); quote all variables |
295
+ | A04 Insecure Design | Improvement loop could propose bad content | Low | User approval gate; diff shown for review |
296
+ | A05 Security Misconfiguration | Sync generates agent permission configs | **Medium** | Follow existing deny/ask/allow patterns; never auto-allow dangerous ops |
297
+ | A06-A07 | — | N/A | — |
298
+ | A08 Data Integrity | Generated files committed to git | Low | Git provides integrity; sync includes content hash verification |
299
+ | A09 Logging | Eval transcripts contain full conversations | Low | Store in `.forge/eval-logs/` (gitignored); warn on env var detection |
300
+ | A10 SSRF | — | N/A | — |
301
+
302
+ ### TDD Test Scenarios
303
+
304
+ **PR-A Static Validator (extend `test/structural/command-files.test.js`):**
305
+ 1. Happy path: clean repo with all commands → all checks pass
306
+ 2. Dead reference: `/status` contains `openspec list` → test catches it
307
+ 3. Sync drift: generated Cursor adapter under `.cursor/skills/plan/` differs from canonical `.claude/commands/plan.md` → `sync-check` test flags it
308
+ 4. Missing HARD-GATE: command claims gate but has no closing block → test warns
309
+ 5. Stage count: all files agree on 7 stages → pass; file says 9 → fail
310
+ 6. Cross-command contract: /plan output mentions task file → /dev input expects same file → pass
311
+
312
+ **PR-B Behavioral Eval:**
313
+ 1. Happy path: `/status` in clean worktree → grader confirms expected output sections
314
+ 2. HARD-GATE enforcement: `/plan` on non-master branch → grader confirms it stopped
315
+ 3. Cross-command contract: /plan creates task file → /dev finds it
316
+ 4. Error path: `/validate` with failing tests → grader confirms it reports failures
317
+
318
+ **PR-C Skill Optimization:**
319
+ 1. Trigger accuracy: `parallel-web-search` triggers on "search for recent news about X"
320
+ 2. Non-trigger: `parallel-web-search` does NOT trigger on "read this file"
321
+ 3. Improvement: description rewrite improves trigger accuracy on test set (60/40 split)
322
+ 4. No regression: improved description doesn't trigger on previously-correct non-trigger queries
323
+
324
+ ---
325
+
326
+ ## Sources
327
+
328
+ - [Anthropic skill-creator plugin](https://github.com/anthropics/claude-plugins-official/tree/main/plugins/skill-creator)
329
+ - [Agent command parity design doc](docs/plans/2026-03-04-agent-command-parity-design.md)
330
+ - [Agent instructions sync research](docs/research/agent-instructions-sync.md)
331
+ - [skills.sh portable runner pattern](skills/)
332
+ - [agnix — multi-agent linter](https://github.com/agent-sh/agnix)
333
+ - [cclint — Claude Code linter](https://github.com/carlrannaberg/cclint)
334
+ - [OpenCode commands docs](https://opencode.ai/docs/commands/)
335
+ - [Cursor commands docs](https://cursor.com/docs/context/commands)
336
+ - [Cline workflows docs](https://docs.cline.bot/features/slash-commands/workflows)
337
+ - [Kilo Code workflows](https://kilo.ai/docs/features/slash-commands/workflows)
338
+ - [Kilo Code skills](https://kilo.ai/docs/customize/skills)
339
+ - [Roo Code slash commands](https://docs.roocode.com/features/slash-commands)
340
+ - [Roo Code hooks](https://github.com/RooCodeInc/Roo-Code/discussions/6147)
341
+ - [GitHub Copilot prompt files](https://code.visualstudio.com/docs/copilot/customization/prompt-files)
342
+ - [GitHub Copilot hooks](https://docs.github.com/en/copilot/concepts/agents/coding-agent/about-hooks)
343
+ - [Codex skills](https://developers.openai.com/codex/skills/)
344
+ - [Codex MCP](https://developers.openai.com/codex/mcp/)
345
+ - [Cursor skills (migrated from commands)](https://cursor.com/docs/context/commands)
346
+ - [Cursor hooks](https://cursor.com/docs/hooks)
347
+ - [Cline hooks](https://docs.cline.bot/features/hooks)
348
+ - [OpenCode plugins/hooks](https://opencode.ai/docs/plugins)
349
+ - [Claude Code CLI reference](https://code.claude.com/docs/en/cli-reference)
350
+ - [Claude Code headless mode](https://code.claude.com/docs/en/headless)