workspace-maxxing 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/.agents/skills/workspace-maxxing/.workspace-templates/CONTEXT.md +44 -0
  2. package/.agents/skills/workspace-maxxing/.workspace-templates/SYSTEM.md +44 -0
  3. package/.agents/skills/workspace-maxxing/.workspace-templates/references/anti-patterns.md +16 -0
  4. package/.agents/skills/workspace-maxxing/.workspace-templates/references/iron-laws.md +26 -0
  5. package/.agents/skills/workspace-maxxing/.workspace-templates/references/reporting-format.md +52 -0
  6. package/.agents/skills/workspace-maxxing/.workspace-templates/scripts/benchmark.ts +171 -0
  7. package/.agents/skills/workspace-maxxing/.workspace-templates/scripts/dispatch.ts +473 -0
  8. package/.agents/skills/workspace-maxxing/.workspace-templates/scripts/generate-tests.ts +158 -0
  9. package/.agents/skills/workspace-maxxing/.workspace-templates/scripts/install-tool.ts +82 -0
  10. package/.agents/skills/workspace-maxxing/.workspace-templates/scripts/iterate.ts +265 -0
  11. package/.agents/skills/workspace-maxxing/.workspace-templates/scripts/orchestrator.ts +539 -0
  12. package/.agents/skills/workspace-maxxing/.workspace-templates/scripts/scaffold.ts +282 -0
  13. package/.agents/skills/workspace-maxxing/.workspace-templates/scripts/validate.ts +452 -0
  14. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/architecture/SKILL.md +95 -0
  15. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/fixer/SKILL.md +109 -0
  16. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/iteration/SKILL.md +89 -0
  17. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/prompt-engineering/SKILL.md +87 -0
  18. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/research/SKILL.md +94 -0
  19. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/testing/SKILL.md +89 -0
  20. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/tooling/SKILL.md +87 -0
  21. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/validation/SKILL.md +103 -0
  22. package/.agents/skills/workspace-maxxing/.workspace-templates/skills/worker/SKILL.md +79 -0
  23. package/.agents/skills/workspace-maxxing/.workspace-templates/workspace/00-meta/CONTEXT.md +6 -0
  24. package/.agents/skills/workspace-maxxing/.workspace-templates/workspace/00-meta/execution-log.md +27 -0
  25. package/.agents/skills/workspace-maxxing/.workspace-templates/workspace/01-input/CONTEXT.md +29 -0
  26. package/.agents/skills/workspace-maxxing/.workspace-templates/workspace/02-process/CONTEXT.md +29 -0
  27. package/.agents/skills/workspace-maxxing/.workspace-templates/workspace/03-output/CONTEXT.md +29 -0
  28. package/.agents/skills/workspace-maxxing/.workspace-templates/workspace/README.md +14 -0
  29. package/.agents/skills/workspace-maxxing/SKILL.md +312 -0
  30. package/.agents/skills/workspace-maxxing/scripts/benchmark.ts +171 -0
  31. package/.agents/skills/workspace-maxxing/scripts/dispatch.ts +473 -0
  32. package/.agents/skills/workspace-maxxing/scripts/generate-tests.ts +158 -0
  33. package/.agents/skills/workspace-maxxing/scripts/install-tool.ts +82 -0
  34. package/.agents/skills/workspace-maxxing/scripts/iterate.ts +265 -0
  35. package/.agents/skills/workspace-maxxing/scripts/orchestrator.ts +539 -0
  36. package/.agents/skills/workspace-maxxing/scripts/scaffold.ts +282 -0
  37. package/.agents/skills/workspace-maxxing/scripts/validate.ts +452 -0
  38. package/README.md +144 -0
  39. package/dist/agent-creator.d.ts +9 -0
  40. package/dist/agent-creator.d.ts.map +1 -0
  41. package/dist/agent-creator.js +199 -0
  42. package/dist/agent-creator.js.map +1 -0
  43. package/dist/agent-iterator.d.ts +38 -0
  44. package/dist/agent-iterator.d.ts.map +1 -0
  45. package/dist/agent-iterator.js +327 -0
  46. package/dist/agent-iterator.js.map +1 -0
  47. package/dist/index.d.ts +3 -0
  48. package/dist/index.d.ts.map +1 -0
  49. package/dist/index.js +197 -0
  50. package/dist/index.js.map +1 -0
  51. package/dist/install.d.ts +18 -0
  52. package/dist/install.d.ts.map +1 -0
  53. package/dist/install.js +117 -0
  54. package/dist/install.js.map +1 -0
  55. package/dist/platforms/claude.d.ts +7 -0
  56. package/dist/platforms/claude.d.ts.map +1 -0
  57. package/dist/platforms/claude.js +70 -0
  58. package/dist/platforms/claude.js.map +1 -0
  59. package/dist/platforms/copilot.d.ts +7 -0
  60. package/dist/platforms/copilot.d.ts.map +1 -0
  61. package/dist/platforms/copilot.js +75 -0
  62. package/dist/platforms/copilot.js.map +1 -0
  63. package/dist/platforms/gemini.d.ts +7 -0
  64. package/dist/platforms/gemini.d.ts.map +1 -0
  65. package/dist/platforms/gemini.js +81 -0
  66. package/dist/platforms/gemini.js.map +1 -0
  67. package/dist/platforms/index.d.ts +8 -0
  68. package/dist/platforms/index.d.ts.map +1 -0
  69. package/dist/platforms/index.js +41 -0
  70. package/dist/platforms/index.js.map +1 -0
  71. package/dist/platforms/opencode.d.ts +7 -0
  72. package/dist/platforms/opencode.d.ts.map +1 -0
  73. package/dist/platforms/opencode.js +70 -0
  74. package/dist/platforms/opencode.js.map +1 -0
  75. package/dist/scripts/benchmark.d.ts +20 -0
  76. package/dist/scripts/benchmark.d.ts.map +1 -0
  77. package/dist/scripts/benchmark.js +170 -0
  78. package/dist/scripts/benchmark.js.map +1 -0
  79. package/dist/scripts/dispatch.d.ts +32 -0
  80. package/dist/scripts/dispatch.d.ts.map +1 -0
  81. package/dist/scripts/dispatch.js +386 -0
  82. package/dist/scripts/dispatch.js.map +1 -0
  83. package/dist/scripts/generate-tests.d.ts +11 -0
  84. package/dist/scripts/generate-tests.d.ts.map +1 -0
  85. package/dist/scripts/generate-tests.js +118 -0
  86. package/dist/scripts/generate-tests.js.map +1 -0
  87. package/dist/scripts/install-tool.d.ts +8 -0
  88. package/dist/scripts/install-tool.d.ts.map +1 -0
  89. package/dist/scripts/install-tool.js +98 -0
  90. package/dist/scripts/install-tool.js.map +1 -0
  91. package/dist/scripts/iterate.d.ts +44 -0
  92. package/dist/scripts/iterate.d.ts.map +1 -0
  93. package/dist/scripts/iterate.js +260 -0
  94. package/dist/scripts/iterate.js.map +1 -0
  95. package/dist/scripts/orchestrator.d.ts +40 -0
  96. package/dist/scripts/orchestrator.d.ts.map +1 -0
  97. package/dist/scripts/orchestrator.js +378 -0
  98. package/dist/scripts/orchestrator.js.map +1 -0
  99. package/dist/scripts/scaffold.d.ts +8 -0
  100. package/dist/scripts/scaffold.d.ts.map +1 -0
  101. package/dist/scripts/scaffold.js +279 -0
  102. package/dist/scripts/scaffold.js.map +1 -0
  103. package/dist/scripts/validate.d.ts +11 -0
  104. package/dist/scripts/validate.d.ts.map +1 -0
  105. package/dist/scripts/validate.js +472 -0
  106. package/dist/scripts/validate.js.map +1 -0
  107. package/docs/superpowers/plans/2026-04-07-autonomous-iteration-plan.md +1123 -0
  108. package/docs/superpowers/plans/2026-04-07-autonomous-iteration-sub-agent-batches.md +1923 -0
  109. package/docs/superpowers/plans/2026-04-07-autonomous-workflow-sub-skill-plan.md +1505 -0
  110. package/docs/superpowers/plans/2026-04-07-benchmarking-multi-agent-plan.md +854 -0
  111. package/docs/superpowers/plans/2026-04-07-workspace-builder-logic-plan.md +1426 -0
  112. package/docs/superpowers/plans/2026-04-07-workspace-maxxing-plan.md +1299 -0
  113. package/docs/superpowers/plans/2026-04-08-session-294c-subagent-invocation-plan.md +320 -0
  114. package/docs/superpowers/plans/2026-04-08-workflow-prompt-hardening-plan.md +1025 -0
  115. package/docs/superpowers/plans/2026-04-12-workspace-agent-creation-plan.md +992 -0
  116. package/docs/superpowers/specs/2026-04-07-autonomous-iteration-design.md +214 -0
  117. package/docs/superpowers/specs/2026-04-07-autonomous-iteration-sub-agent-batches-design.md +188 -0
  118. package/docs/superpowers/specs/2026-04-07-autonomous-workflow-sub-skill-design.md +137 -0
  119. package/docs/superpowers/specs/2026-04-07-benchmarking-multi-agent-design.md +105 -0
  120. package/docs/superpowers/specs/2026-04-07-workspace-builder-logic-design.md +179 -0
  121. package/docs/superpowers/specs/2026-04-07-workspace-maxxing-design.md +227 -0
  122. package/docs/superpowers/specs/2026-04-08-session-294c-subagent-invocation-design.md +265 -0
  123. package/docs/superpowers/specs/2026-04-08-workflow-prompt-hardening-design.md +146 -0
  124. package/docs/superpowers/specs/2026-04-12-workspace-agent-creation-design.md +239 -0
  125. package/jest.config.js +8 -0
  126. package/package.json +32 -0
  127. package/src/agent-creator.ts +180 -0
  128. package/src/agent-iterator.ts +397 -0
  129. package/src/index.ts +189 -0
  130. package/src/install.ts +105 -0
  131. package/src/platforms/claude.ts +40 -0
  132. package/src/platforms/copilot.ts +50 -0
  133. package/src/platforms/gemini.ts +55 -0
  134. package/src/platforms/index.ts +45 -0
  135. package/src/platforms/opencode.ts +41 -0
  136. package/src/scripts/benchmark.ts +171 -0
  137. package/src/scripts/dispatch.ts +473 -0
  138. package/src/scripts/generate-tests.ts +112 -0
  139. package/src/scripts/install-tool.ts +82 -0
  140. package/src/scripts/iterate.ts +271 -0
  141. package/src/scripts/orchestrator.ts +539 -0
  142. package/src/scripts/scaffold.ts +282 -0
  143. package/src/scripts/validate.ts +516 -0
  144. package/templates/.workspace-templates/CONTEXT.md +44 -0
  145. package/templates/.workspace-templates/SYSTEM.md +44 -0
  146. package/templates/.workspace-templates/references/anti-patterns.md +16 -0
  147. package/templates/.workspace-templates/references/iron-laws.md +26 -0
  148. package/templates/.workspace-templates/references/reporting-format.md +52 -0
  149. package/templates/.workspace-templates/scripts/benchmark.ts +171 -0
  150. package/templates/.workspace-templates/scripts/dispatch.ts +473 -0
  151. package/templates/.workspace-templates/scripts/generate-tests.ts +158 -0
  152. package/templates/.workspace-templates/scripts/install-tool.ts +82 -0
  153. package/templates/.workspace-templates/scripts/iterate.ts +265 -0
  154. package/templates/.workspace-templates/scripts/orchestrator.ts +539 -0
  155. package/templates/.workspace-templates/scripts/scaffold.ts +282 -0
  156. package/templates/.workspace-templates/scripts/validate.ts +452 -0
  157. package/templates/.workspace-templates/skills/architecture/SKILL.md +95 -0
  158. package/templates/.workspace-templates/skills/fixer/SKILL.md +109 -0
  159. package/templates/.workspace-templates/skills/iteration/SKILL.md +89 -0
  160. package/templates/.workspace-templates/skills/prompt-engineering/SKILL.md +87 -0
  161. package/templates/.workspace-templates/skills/research/SKILL.md +94 -0
  162. package/templates/.workspace-templates/skills/testing/SKILL.md +89 -0
  163. package/templates/.workspace-templates/skills/tooling/SKILL.md +87 -0
  164. package/templates/.workspace-templates/skills/validation/SKILL.md +103 -0
  165. package/templates/.workspace-templates/skills/worker/SKILL.md +79 -0
  166. package/templates/.workspace-templates/workspace/00-meta/CONTEXT.md +6 -0
  167. package/templates/.workspace-templates/workspace/00-meta/execution-log.md +27 -0
  168. package/templates/.workspace-templates/workspace/01-input/CONTEXT.md +29 -0
  169. package/templates/.workspace-templates/workspace/02-process/CONTEXT.md +29 -0
  170. package/templates/.workspace-templates/workspace/03-output/CONTEXT.md +29 -0
  171. package/templates/.workspace-templates/workspace/README.md +14 -0
  172. package/templates/SKILL.md +347 -0
  173. package/tests/benchmark.test.ts +158 -0
  174. package/tests/cli.test.ts +109 -0
  175. package/tests/dispatch-parallel.test.ts +124 -0
  176. package/tests/dispatch.test.ts +218 -0
  177. package/tests/fixer-skill.test.ts +203 -0
  178. package/tests/generate-tests.test.ts +101 -0
  179. package/tests/install-tool.test.ts +141 -0
  180. package/tests/install.test.ts +144 -0
  181. package/tests/integration.test.ts +324 -0
  182. package/tests/iterate.test.ts +219 -0
  183. package/tests/orchestrator.test.ts +710 -0
  184. package/tests/scaffold.test.ts +238 -0
  185. package/tests/templates-enhanced.test.ts +208 -0
  186. package/tests/templates.test.ts +219 -0
  187. package/tests/validate.test.ts +421 -0
  188. package/tests/validation-enhanced.test.ts +303 -0
  189. package/tests/worker-skill.test.ts +88 -0
  190. package/tsconfig.json +19 -0
  191. package/workspace/00-meta/CONTEXT.md +3 -0
  192. package/workspace/00-meta/execution-log.md +17 -0
  193. package/workspace/00-meta/tools.md +11 -0
  194. package/workspace/01-input/CONTEXT.md +27 -0
  195. package/workspace/CONTEXT.md +35 -0
  196. package/workspace/README.md +14 -0
  197. package/workspace/SYSTEM.md +36 -0
  198. package/workspace-maxxing-0.1.0.tgz +0 -0
@@ -0,0 +1,89 @@
1
+ ---
2
+ name: iteration
3
+ description: "Runs autonomous improvement loops with benchmark scoring. Use when the score has plateaued, deeper fixes are needed, or after testing identifies failure patterns."
4
+ triggers: ["run improvement loop", "iterate on workspace", "deeper fixes", "score plateau"]
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ Execute improvement loops until quality thresholds are met. Iteration applies systematic fixes when first-pass prompt improvements are not enough.
10
+
11
+ ## When to Use
12
+
13
+ - Score has plateaued across runs
14
+ - Testing finds repeated failure patterns
15
+ - Validation failures persist after prompt-engineering
16
+ - The condition-driven improvement loop requires deeper fixes
17
+ - Latest benchmark score is strictly between 80 and 85 (`80 < score < 85`)
18
+ - Score is 80 or lower (`score <= 80`) after prompt-engineering stops improving
19
+
20
+ ## When Not to Use
21
+
22
+ - For first-pass improvements (use prompt-engineering first)
23
+ - When workspace is new and untested (use testing first)
24
+ - When structural redesign is needed (use architecture)
25
+
26
+ ## The Iron Law
27
+
28
+ NO CLAIMING IMPROVEMENT WITHOUT RE-RUNNING BENCHMARK
29
+ NO SKIPPING FIX SUGGESTIONS
30
+ NO INFINITE ITERATION LOOPS
31
+ NO SKIPPING ESCALATION WHEN STUCK
32
+
33
+ ## The Process
34
+
35
+ 1. **Run iterate.ts** - Execute `node scripts/iterate.ts --workspace <path> --max-retries 3`.
36
+ 2. **Read benchmark results** - Parse score, fixSuggestions, and improvementPotential.
37
+ 3. **Identify improvement areas** - Prioritize changes with highest impact.
38
+ 4. **Apply fixes** - Address each suggestion systematically.
39
+ 5. **Re-run iteration** - Verify score movement.
40
+ 6. **Repeat until threshold** - Continue until score is 85 or higher (`score >= 85`), no improvement is possible, or the 3-attempt limit is reached.
41
+ 7. **Escalate if stuck** - If score remains below 85 after 3 attempts, stop iterating, report `status = failed`, and hand off for human follow-up.
42
+
43
+ ## Red Flags
44
+
45
+ - Improvement is claimed without fresh benchmark evidence
46
+ - Fix suggestions are ignored
47
+ - Loop runs beyond max retries
48
+ - Escalation is skipped despite stalled score
49
+
50
+ ## Anti-Rationalization Table
51
+
52
+ | Thought | Reality |
53
+ |---------|---------|
54
+ | "I will just run it again" | Re-running without fixes wastes cycles. |
55
+ | "The score improved by one point" | Marginal gains are not enough. Target is 85 or higher. |
56
+ | "I will keep iterating until it works" | Maximum 3 attempts, then escalate. |
57
+
58
+ ## Sub-Skill Dispatch
59
+
60
+ - `status = passed` (`score >= 85`) -> `nextSkill = none`.
61
+ - `status = failed` (`score < 85` after max retries) -> `nextSkill = none` and require human follow-up.
62
+ - `status = escalated` (critical blocker prevents safe continuation) -> `nextSkill = none`.
63
+
64
+ ## Report Format
65
+
66
+ ```json
67
+ {
68
+ "skill": "iteration",
69
+ "status": "passed",
70
+ "timestamp": "2026-04-08T00:00:00Z",
71
+ "findings": ["Resolved two repeated edge-case failures"],
72
+ "recommendations": ["Run final validation and testing before delivery"],
73
+ "metrics": {
74
+ "scoreBefore": 81,
75
+ "scoreAfter": 88,
76
+ "iterationsRun": 2
77
+ },
78
+ "nextSkill": "none"
79
+ }
80
+ ```
81
+
82
+ Allowed `status` values: `passed`, `failed`, `escalated`.
83
+
84
+ Allowed `nextSkill` values: `none`.
85
+
86
+ ## Integration
87
+
88
+ - Works after testing or prompt-engineering when quality is stuck.
89
+ - Hands final results back to validation and completion checks.
@@ -0,0 +1,87 @@
1
+ ---
2
+ name: prompt-engineering
3
+ description: "Improves CONTEXT.md and SYSTEM.md prompts for better agent behavior. Use when workspace score is 80 or lower, prompts need improvement, or after validation identifies content gaps."
4
+ triggers: ["improve prompts", "fix content gaps", "optimize prompts", "clarify instructions"]
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ Optimize workspace prompts for clarity, completeness, and agent guidance. Prompt engineering resolves content-level quality issues without structural redesign.
10
+
11
+ ## When to Use
12
+
13
+ - Score is 80 or lower in benchmark results (`score <= 80`)
14
+ - Validation identifies missing or weak content
15
+ - Prompts are vague or incomplete
16
+ - Agent behavior does not match expectations
17
+
18
+ ## When Not to Use
19
+
20
+ - For structural issues (use architecture or fixer)
21
+ - When workspace has no content yet (use worker)
22
+ - For dependency installation (use tooling)
23
+
24
+ ## The Iron Law
25
+
26
+ NO COSMETIC CHANGES WITHOUT FUNCTIONAL IMPROVEMENT
27
+ NO CHANGING PROMPTS WITHOUT RE-VALIDATING
28
+ NO REMOVING CONTENT WITHOUT REPLACEMENT
29
+ NO CLAIMING IMPROVEMENT WITHOUT SCORE CHECK
30
+
31
+ ## The Process
32
+
33
+ 1. **Identify weak prompts** - Read benchmark findings and validation failures.
34
+ 2. **Analyze current prompts** - Identify what is missing, vague, or contradictory.
35
+ 3. **Apply prompt patterns** - Use clear structure, examples, constraints, and output format guidance.
36
+ 4. **Update CONTEXT.md files** - Improve stage-specific instructions.
37
+ 5. **Update SYSTEM.md if needed** - Improve folder map, rules, and tool inventory guidance.
38
+ 6. **Re-run validation** - Verify improvements did not break compliance.
39
+ 7. **Re-run benchmark** - Confirm score movement.
40
+
41
+ ## Red Flags
42
+
43
+ - Cosmetic wording changes with no measurable improvement
44
+ - Prompt edits made without re-validation
45
+ - Content removed without replacement
46
+ - No before/after score comparison
47
+
48
+ ## Anti-Rationalization Table
49
+
50
+ | Thought | Reality |
51
+ |---------|---------|
52
+ | "This wording change is enough" | Wording changes must produce functional improvement. |
53
+ | "I will remove vague sections" | Removing sections creates gaps. Improve, do not delete. |
54
+ | "The score did not change, but it is better" | No score change means no proven improvement. Iterate again. |
55
+
56
+ ## Sub-Skill Dispatch
57
+
58
+ - `status = passed` (`scoreAfter > 80`) -> `nextSkill = testing`.
59
+ - `status = failed` (`scoreAfter <= 80` or no measurable improvement) -> `nextSkill = iteration`.
60
+ - `status = escalated` (requirements conflict or critical blocker) -> `nextSkill = none`.
61
+
62
+ ## Report Format
63
+
64
+ ```json
65
+ {
66
+ "skill": "prompt-engineering",
67
+ "status": "passed",
68
+ "timestamp": "2026-04-08T00:00:00Z",
69
+ "findings": ["Clarified output constraints in two stage prompts"],
70
+ "recommendations": ["Run testing to verify edge-case behavior"],
71
+ "metrics": {
72
+ "scoreBefore": 74,
73
+ "scoreAfter": 83,
74
+ "promptsUpdated": 3
75
+ },
76
+ "nextSkill": "testing"
77
+ }
78
+ ```
79
+
80
+ Allowed `status` values: `passed`, `failed`, `escalated`.
81
+
82
+ Allowed `nextSkill` values: `testing`, `iteration`, `none`.
83
+
84
+ ## Integration
85
+
86
+ - Consumes findings from validation and benchmark.
87
+ - Produces higher-quality prompt content for testing and iteration.
@@ -0,0 +1,94 @@
1
+ ---
2
+ name: research
3
+ description: "Investigates patterns, gathers context, and identifies best practices for workspace design. Use when starting a new workspace, researching workflow patterns, or before architecture planning."
4
+ triggers: ["research workflow", "gather context", "identify patterns", "best practices"]
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ Gather context and identify patterns before building. Research ensures the workspace design is informed by real requirements, not assumptions.
10
+
11
+ ## When to Use
12
+
13
+ - Phase 1 of the hybrid flow (always first)
14
+ - Before architecture planning
15
+ - When the user asks for a novel workflow type
16
+ - When existing patterns do not fit the use case
17
+
18
+ ## When Not to Use
19
+
20
+ - After architecture is already planned (use architecture sub-skill)
21
+ - When workspace structure already exists (use validation sub-skill)
22
+ - For simple file creation (use direct file operations)
23
+
24
+ ## The Iron Law
25
+
26
+ NO BUILD WITHOUT RESEARCH
27
+ NO GENERIC FINDINGS
28
+ NO SKIPPING INPUT/OUTPUT ANALYSIS
29
+ NO ASSUMPTIONS WITHOUT EVIDENCE
30
+
31
+ ## Scope Guardrails
32
+
33
+ - Research the workflow needed to produce outcomes, not the product implementation stack.
34
+ - Convert domain asks (for example, "football predictor") into stageable workflow responsibilities.
35
+ - Keep research outputs markdown-first and suitable for numbered folder CONTEXT contracts.
36
+ - Do not produce backend/frontend/data-model implementation plans in this phase.
37
+
38
+ ## The Process
39
+
40
+ 1. **Identify workflow type** - Determine what process is being automated as a file-structured markdown workflow.
41
+ 2. **Research similar patterns** - Review existing workspaces, docs, and best practices.
42
+ 3. **Identify key stages** - Define the natural workflow phases.
43
+ 4. **Determine inputs and outputs** - Capture what goes in and what markdown artifacts come out at each stage.
44
+ 5. **Identify tooling needs** - List tools commonly needed for this workflow.
45
+ 6. **Document findings** - Create a concise research summary for architecture.
46
+
47
+ ## Red Flags
48
+
49
+ - Research is too generic and not tied to the requested workflow
50
+ - Input and output analysis is missing
51
+ - Tooling assessment is missing
52
+ - Architecture starts before research findings are complete
53
+ - Findings drift into app architecture, model design, or runtime repository scaffolding
54
+
55
+ ## Anti-Rationalization Table
56
+
57
+ | Thought | Reality |
58
+ |---------|---------|
59
+ | "I already know this workflow type" | Knowledge is not research. Document findings for the next agent. |
60
+ | "Research is taking too long" | Research prevents wasted build time. Be thorough. |
61
+ | "I will figure it out while building" | Building without research produces generic, non-optimal workspaces. |
62
+ | "The user will clarify later" | Ask now. Ambiguous requirements produce ambiguous workspaces. |
63
+
64
+ ## Sub-Skill Dispatch
65
+
66
+ - `status = passed` -> `nextSkill = architecture`.
67
+ - `status = failed` (research incomplete but recoverable) -> `nextSkill = none` and request missing inputs before rerun.
68
+ - `status = escalated` (blocking ambiguity or conflicting constraints) -> `nextSkill = none`.
69
+
70
+ ## Report Format
71
+
72
+ ```json
73
+ {
74
+ "skill": "research",
75
+ "status": "passed",
76
+ "timestamp": "2026-04-08T00:00:00Z",
77
+ "findings": ["Identified three reusable workflow stage patterns"],
78
+ "recommendations": ["Use a three-stage layout with explicit input/output boundaries"],
79
+ "metrics": {
80
+ "patternsIdentified": 3,
81
+ "stagesIdentified": 3
82
+ },
83
+ "nextSkill": "architecture"
84
+ }
85
+ ```
86
+
87
+ Allowed `status` values: `passed`, `failed`, `escalated`.
88
+
89
+ Allowed `nextSkill` values: `architecture`, `none`.
90
+
91
+ ## Integration
92
+
93
+ - Feeds architecture with concrete findings and stage proposals.
94
+ - Reduces rework by grounding structure decisions in evidence.
@@ -0,0 +1,89 @@
1
+ ---
2
+ name: testing
3
+ description: "Generates and runs test cases, evaluates results, and identifies gaps. Use when testing workspace quality, generating test cases, or after prompt improvements."
4
+ triggers: ["generate test cases", "run tests", "test workspace", "evaluate quality"]
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ Verify workspace quality through systematic testing. Testing confirms outputs across sample, edge-case, and empty-input scenarios.
10
+
11
+ ## When to Use
12
+
13
+ - After prompt-engineering improvements
14
+ - When no tests exist for the workspace
15
+ - Before claiming delivery
16
+ - When benchmark score is strictly between 80 and 85 (`80 < score < 85`)
17
+ - When score is 85 or higher and final evidence is still required
18
+
19
+ ## When Not to Use
20
+
21
+ - Before workspace build is complete (run scaffold.ts first)
22
+ - For structural validation (use validation sub-skill)
23
+ - When applying direct fixes to failures (use fixer sub-skill)
24
+ - When benchmark score is 80 or lower, i.e. `score <= 80` (use prompt-engineering first)
25
+
26
+ ## The Iron Law
27
+
28
+ NO SKIPPING TEST GENERATION
29
+ NO IGNORING FAILED TESTS
30
+ NO CLAIMING QUALITY WITHOUT EVIDENCE
31
+ NO TESTING WITHOUT TEST CASES
32
+
33
+ ## The Process
34
+
35
+ 1. **Generate test cases** - Run `node scripts/generate-tests.ts --workspace <path> --output ./tests.json`.
36
+ 2. **Read test cases** - Parse generated test cases and expected outcomes.
37
+ 3. **Run generation tests** - Produce sample content each stage should output.
38
+ 4. **Run evaluation tests** - Review CONTEXT.md files against expected behavior.
39
+ 5. **Aggregate results** - Identify recurring patterns and quality gaps.
40
+ 6. **Document findings** - Create a pass/fail report per test case.
41
+
42
+ ## Red Flags
43
+
44
+ - Test generation is skipped
45
+ - Generation tests run without evaluation tests
46
+ - Failed test cases are ignored
47
+ - Failure patterns are undocumented
48
+
49
+ ## Anti-Rationalization Table
50
+
51
+ | Thought | Reality |
52
+ |---------|---------|
53
+ | "The workspace looks fine, no need to test" | Looks can deceive. Tests reveal behavior. |
54
+ | "One failed test is a fluke" | Failed tests are signals. Investigate each one. |
55
+ | "I will test after delivery" | Untested delivery is a gamble. Test first. |
56
+
57
+ ## Sub-Skill Dispatch
58
+
59
+ - `status = passed` (all required tests pass and `benchmarkScore >= 85`) -> `nextSkill = none`.
60
+ - `status = failed` (any required test fails or `benchmarkScore < 85`) -> `nextSkill = iteration`.
61
+ - `status = escalated` (testing cannot run reliably due to blockers) -> `nextSkill = none`.
62
+
63
+ ## Report Format
64
+
65
+ ```json
66
+ {
67
+ "skill": "testing",
68
+ "status": "failed",
69
+ "timestamp": "2026-04-08T00:00:00Z",
70
+ "findings": ["Two edge-case outputs failed acceptance checks"],
71
+ "recommendations": ["Run iteration to address repeated edge-case defects"],
72
+ "metrics": {
73
+ "benchmarkScore": 82,
74
+ "testCasesGenerated": 9,
75
+ "testCasesPassed": 7,
76
+ "testCasesFailed": 2
77
+ },
78
+ "nextSkill": "iteration"
79
+ }
80
+ ```
81
+
82
+ Allowed `status` values: `passed`, `failed`, `escalated`.
83
+
84
+ Allowed `nextSkill` values: `iteration`, `none`.
85
+
86
+ ## Integration
87
+
88
+ - Uses generate-tests.ts output as primary test input.
89
+ - Supplies pass/fail evidence for iteration and final verification.
@@ -0,0 +1,87 @@
1
+ ---
2
+ name: tooling
3
+ description: "Assesses, installs, and configures tools for the workspace. Use when tools are missing, tool inventory needs updating, or workspace requires specific dependencies."
4
+ triggers: ["install tools", "assess tooling", "update tool inventory", "configure dependencies"]
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ Ensure the workspace has the right tools installed and configured. Tooling manages the dependency layer of the workspace.
10
+
11
+ ## When to Use
12
+
13
+ - Tool inventory is empty or incomplete
14
+ - Workspace requires specific dependencies
15
+ - Architecture identifies missing tooling needs
16
+ - User requests specific tool installation
17
+
18
+ ## When Not to Use
19
+
20
+ - For non-tool structural changes (use architecture)
21
+ - For content quality improvements (use prompt-engineering)
22
+ - When no additional tools are needed
23
+
24
+ ## The Iron Law
25
+
26
+ NO INSTALLING TOOLS WITHOUT USER APPROVAL
27
+ NO SKIPPING TOOL INVENTORY UPDATES
28
+ NO INSTALLING UNNECESSARY TOOLS
29
+ NO SKIPPING VERIFICATION AFTER INSTALLATION
30
+
31
+ ## The Process
32
+
33
+ 1. **Scan current tools** - Read SYSTEM.md tool inventory.
34
+ 2. **Identify missing tools** - Compare requirements against installed tools.
35
+ 3. **Propose tools** - Provide recommended tools with justification.
36
+ 4. **Get approval** - Present the tool plan before installing.
37
+ 5. **Install tools** - Run `node scripts/install-tool.ts --tool <name> --manager <mgr> --workspace <path>`.
38
+ 6. **Update inventory** - Confirm SYSTEM.md or inventory section is updated.
39
+ 7. **Verify installation** - Confirm each installed tool is accessible.
40
+
41
+ ## Red Flags
42
+
43
+ - Tools installed without approval
44
+ - Inventory not updated after install
45
+ - Unnecessary tools installed
46
+ - Installation not verified
47
+
48
+ ## Anti-Rationalization Table
49
+
50
+ | Thought | Reality |
51
+ |---------|---------|
52
+ | "This tool might be useful" | "Might" is not enough. Every tool needs explicit justification. |
53
+ | "I will install now and tell the user later" | Approval must come before installation. |
54
+ | "The install probably worked" | Probably is not verified. Validate each install. |
55
+
56
+ ## Sub-Skill Dispatch
57
+
58
+ - `status = passed` (approved tooling installed and verified) -> `nextSkill = none`.
59
+ - `status = failed` (installation incomplete or verification failed) -> `nextSkill = none`.
60
+ - `status = escalated` (blocked by permissions, policy, or unresolved conflicts) -> `nextSkill = none`.
61
+
62
+ ## Report Format
63
+
64
+ ```json
65
+ {
66
+ "skill": "tooling",
67
+ "status": "passed",
68
+ "timestamp": "2026-04-08T00:00:00Z",
69
+ "findings": ["Installed two approved dependencies"],
70
+ "recommendations": ["Run validation to confirm inventory consistency"],
71
+ "metrics": {
72
+ "toolsInstalled": 2,
73
+ "toolsProposed": 2,
74
+ "toolsFailed": 0
75
+ },
76
+ "nextSkill": "none"
77
+ }
78
+ ```
79
+
80
+ Allowed `status` values: `passed`, `failed`, `escalated`.
81
+
82
+ Allowed `nextSkill` values: `none`.
83
+
84
+ ## Integration
85
+
86
+ - Consumes architecture and requirement signals to propose tools.
87
+ - Produces verified dependency state for downstream validation.
@@ -0,0 +1,103 @@
1
+ ---
2
+ name: validation
3
+ description: "Checks workspace ICM compliance and benchmarks batch outputs. Use when validating a workspace, checking compliance, running validation, benchmarking batch results, or after making changes to workspace structure."
4
+ triggers: ["validate batch", "check results", "run validation", "benchmark outputs", "check compliance"]
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ Ensure the workspace meets ICM standards and benchmark batch outputs through systematic validation. Validate both the workspace structure and the worker or fixer outputs before making any completion claim.
10
+
11
+ ## When to Use
12
+
13
+ - After workspace scaffolding
14
+ - After any structural change
15
+ - After worker batch completes
16
+ - After fixer applies fixes
17
+ - Before claiming delivery
18
+ - When score drops below threshold
19
+
20
+ ## When Not to Use
21
+
22
+ - Generating outputs (use worker sub-skill)
23
+ - Fixing failures (use fixer sub-skill)
24
+ - Researching patterns (use research sub-skill)
25
+
26
+ ## The Iron Law
27
+
28
+ NO SCORE INFLATION
29
+ NO SKIPPING FAILURES
30
+ NO VALIDATING WITHOUT BENCHMARK
31
+ NO PASSING WITHOUT EVIDENCE
32
+
33
+ ## The Process
34
+
35
+ 1. **Run validate.ts** - Execute `node scripts/validate.ts --workspace <path>`
36
+ 2. **Parse validation results** - Read exit code and output; collect structural findings
37
+ 3. **Check batch outputs** - For each test case in `.agents/iteration/batch-<N>/`, verify `output.md` and `report.json` exist
38
+ 4. **Run benchmark.ts** - Execute `node scripts/benchmark.ts --workspace <path>` to compute benchmark scoring
39
+ 5. **Aggregate scores** - Combine structural validation score and benchmark score into a single batch score
40
+ 6. **Generate findings** - List failures with concrete fix suggestions mapped to each failing test case
41
+ 7. **Write batch-report.json** - Structured JSON with `{skill, status, timestamp, batchId, findings, fixSuggestions, recommendations, metrics, nextSkill}` where `nextSkill` is one of `fixer`, `orchestrator`, or `none`
42
+
43
+ ## Batch-Level Validation
44
+
45
+ When validating a batch:
46
+ - Read all `report.json` files in `.agents/iteration/batch-<N>/`
47
+ - Verify each worker or fixer output matches its test case expectations
48
+ - Calculate per-test-case pass/fail status
49
+ - Calculate overall batch score using benchmark weights
50
+ - If score < threshold, dispatch fixer with findings
51
+
52
+ ## Red Flags
53
+
54
+ - Reporting inflated scores to force a pass
55
+ - Skipping failing findings because they look minor
56
+ - Running validation without benchmark evidence
57
+ - Returning a passing status without per-case verification
58
+
59
+ ## Anti-Rationalization Table
60
+
61
+ | Thought | Reality |
62
+ |---------|---------|
63
+ | "This workspace looks good enough" | Good enough is the enemy of excellent. Run validation. |
64
+ | "The score is close, I will round up" | Score inflation hides real problems. Report the true score. |
65
+ | "One failure does not matter" | Every failure matters. Report it and route it to fixer. |
66
+ | "I already validated this" | Validation is a snapshot. Re-validate after every change. |
67
+ | "The benchmark is too strict" | The benchmark is the standard. Meet it or escalate. |
68
+
69
+ ## Sub-Skill Dispatch
70
+
71
+ - If batch score < threshold -> fixer sub-skill (`nextSkill = fixer`)
72
+ - If batch score >= threshold -> orchestrator (batch complete, `nextSkill = orchestrator`)
73
+ - If critical failures (for example missing SYSTEM.md) -> escalate to human and set `nextSkill = none`
74
+
75
+ ## Report Format
76
+
77
+ ```json
78
+ {
79
+ "skill": "validation",
80
+ "status": "passed",
81
+ "timestamp": "2026-04-08T00:00:00Z",
82
+ "batchId": 1,
83
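+ "findings": ["All required files present", "1 of 8 test cases failed; 17 of 18 checks passed"],
84
+ "fixSuggestions": ["Compare the failing test case output.md against its expected result and correct the mismatch"],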
85
+ "recommendations": ["Proceed to next batch"],
86
+ "metrics": {
87
+ "score": 94,
88
+ "benchmarkScore": 92,
89
+ "itemsChecked": 18,
90
+ "itemsPassed": 17,
91
+ "testCasesPassed": 7,
92
+ "testCasesFailed": 1
93
+ },
94
+ "nextSkill": "orchestrator"
95
+ }
96
+ ```
97
+
98
+ Allowed `nextSkill` values: `fixer`, `orchestrator`, `none`.
99
+
100
+ ## Integration
101
+
102
+ - Consumes worker and fixer reports from `.agents/iteration/batch-<N>/`.
103
+ - Produces `batch-report.json` that drives fixer routing or orchestrator continuation.
@@ -0,0 +1,79 @@
1
+ ---
2
+ name: worker
3
+ description: "Executes a single test case against the workspace and produces output. Use when running test cases, executing workspace tasks, or processing stage-specific work."
4
+ triggers: ["run test case", "execute workspace task", "process stage", "generate output"]
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ Execute a single test case by reading the relevant workspace sections, performing the required work, and producing structured output. Each worker runs with fresh context - no assumptions about prior runs.
10
+
11
+ ## When to Use
12
+
13
+ - Dispatched by orchestrator as part of a batch
14
+ - User asks to run a specific test case
15
+ - User asks to execute a workspace stage task
16
+
17
+ ## When Not to Use
18
+
19
+ - Validating outputs (use validation sub-skill)
20
+ - Fixing failed outputs (use fixer sub-skill)
21
+ - Planning workspace structure (use architecture sub-skill)
22
+
23
+ ## The Iron Law
24
+
25
+ NO SKIPPING TEST CASE STEPS
26
+ NO MODIFYING WORKSPACE STRUCTURE
27
+ NO CLAIMING DONE WITHOUT OUTPUT
28
+ NO ASSUMING PRIOR CONTEXT
29
+
30
+ ## The Process
31
+
32
+ 1. **Read test case** - Load the test case JSON from `.agents/iteration/batch-<N>/<testCaseId>/` or orchestrator input
33
+ 2. **Load workspace context** - Read `SYSTEM.md` and relevant stage `CONTEXT.md` files
34
+ 3. **Execute the task** - Follow the test case's input and expected-output instructions exactly
35
+ 4. **Write output.md** - Human-readable output in `.agents/iteration/batch-<N>/<testCaseId>/output.md`
36
+ 5. **Write report.json** - Structured JSON with `{skill, status, timestamp, testCaseId, batchId, findings, recommendations, metrics, nextSkill}`
37
+ 6. **Dispatch validation** - Signal that output is ready for validation
38
+
39
+ ## External Runner Contract
40
+
41
+ - Worker execution in autonomous iteration is orchestrated via `--subagent-runner`.
42
+ - Direct worker dispatch must provide `--runner-command` with placeholders `{skill}`, `{workspace}`, `{batchId}`, `{testCaseId}`.
43
+ - Runner output must be JSON and include `report.json`-compatible fields:
44
+ - `skill`, `status`, `timestamp`, `findings`, `recommendations`, `metrics`, `nextSkill`
45
+ - Treat missing or invalid runner output as a failure; never substitute a simulated success.
46
+ - Use `.agents/iteration/runs/*.json` telemetry to debug command rendering and runner payload issues.
47
+
48
+ ## Anti-Rationalization Table
49
+
50
+ | Thought | Reality |
51
+ |---------|---------|
52
+ | "I already know what this stage does" | Read the CONTEXT.md. Assumptions cause failures. |
53
+ | "The output is good enough" | Good enough fails validation. Follow the test case exactly. |
54
+ | "I'll modify the workspace structure to make this easier" | Workers don't modify structure. That's the fixer's job. |
55
+ | "This test case is redundant" | Every test case exists for a reason. Execute it. |
56
+ | "I'll skip writing report.json" | Validation depends on report.json. It's mandatory. |
57
+
58
+ ## Sub-Skill Dispatch
59
+
60
+ - After output is complete -> validation sub-skill (`nextSkill = validation`)
61
+
62
+ ## Report Format
63
+
64
+ ```json
65
+ {
66
+ "skill": "worker",
67
+ "status": "passed|failed|escalated",
68
+ "timestamp": "2026-04-08T00:00:00Z",
69
+ "testCaseId": "tc-001",
70
+ "batchId": 1,
71
+ "findings": ["Output generated with required sections"],
72
+ "recommendations": ["Proceed to validation"],
73
+ "metrics": {
74
+ "executionTimeMs": 120,
75
+ "outputLength": 640
76
+ },
77
+ "nextSkill": "validation"
78
+ }
79
+ ```
@@ -0,0 +1,6 @@
1
+ # 00-meta CONTEXT.md
2
+
3
+ ## Workspace Metadata
4
+ - Created: [date]
5
+ - Purpose: [description]
6
+ - Status: active
@@ -0,0 +1,27 @@
1
+ # Execution Log
2
+
3
+ ## Stage Checklist
4
+
5
+ - [ ] 01-input
6
+ - [ ] 02-process
7
+ - [ ] 03-output
8
+
9
+ ## Rules
10
+
11
+ 1. Check stages in order from top to bottom.
12
+ 2. Do not mark a stage complete without required evidence.
13
+ 3. Add handoff notes before moving to the next stage.
14
+
15
+ ## Evidence Notes
16
+
17
+ ### 01-input
18
+ - Artifacts:
19
+ - Handoff Summary:
20
+
21
+ ### 02-process
22
+ - Artifacts:
23
+ - Handoff Summary:
24
+
25
+ ### 03-output
26
+ - Artifacts:
27
+ - Handoff Summary: