workspace-maxxing 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +73 -10
  3. package/dist/install.d.ts +1 -1
  4. package/dist/install.d.ts.map +1 -1
  5. package/dist/install.js +7 -1
  6. package/dist/install.js.map +1 -1
  7. package/package.json +1 -1
  8. package/src/install.ts +8 -1
  9. package/templates/SKILL.md +88 -0
  10. package/docs/superpowers/plans/2026-04-07-autonomous-iteration-plan.md +0 -1123
  11. package/docs/superpowers/plans/2026-04-07-autonomous-iteration-sub-agent-batches.md +0 -1923
  12. package/docs/superpowers/plans/2026-04-07-autonomous-workflow-sub-skill-plan.md +0 -1505
  13. package/docs/superpowers/plans/2026-04-07-benchmarking-multi-agent-plan.md +0 -854
  14. package/docs/superpowers/plans/2026-04-07-workspace-builder-logic-plan.md +0 -1426
  15. package/docs/superpowers/plans/2026-04-07-workspace-maxxing-plan.md +0 -1299
  16. package/docs/superpowers/plans/2026-04-08-session-294c-subagent-invocation-plan.md +0 -320
  17. package/docs/superpowers/plans/2026-04-08-workflow-prompt-hardening-plan.md +0 -1025
  18. package/docs/superpowers/plans/2026-04-12-workspace-agent-creation-plan.md +0 -992
  19. package/docs/superpowers/specs/2026-04-07-autonomous-iteration-design.md +0 -214
  20. package/docs/superpowers/specs/2026-04-07-autonomous-iteration-sub-agent-batches-design.md +0 -188
  21. package/docs/superpowers/specs/2026-04-07-autonomous-workflow-sub-skill-design.md +0 -137
  22. package/docs/superpowers/specs/2026-04-07-benchmarking-multi-agent-design.md +0 -105
  23. package/docs/superpowers/specs/2026-04-07-workspace-builder-logic-design.md +0 -179
  24. package/docs/superpowers/specs/2026-04-07-workspace-maxxing-design.md +0 -227
  25. package/docs/superpowers/specs/2026-04-08-session-294c-subagent-invocation-design.md +0 -265
  26. package/docs/superpowers/specs/2026-04-08-workflow-prompt-hardening-design.md +0 -146
  27. package/docs/superpowers/specs/2026-04-12-workspace-agent-creation-design.md +0 -239
@@ -1,1923 +0,0 @@
1
- # Autonomous Iteration with Sub-Agent Batches Implementation Plan
2
-
3
- > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
4
-
5
- **Goal:** Implement batched parallel sub-agent iteration with validator checkpoints and fix loops, plus rewrite all sub-skills with obra/superpowers patterns.
6
-
7
- **Architecture:** New `orchestrator.ts` coordinates the batch lifecycle. `dispatch.ts` extended for parallel invocation with batch IDs. Three new sub-skills (`worker`, `fixer`, enhanced `validation`). All existing sub-skills rewritten with YAML frontmatter, trigger phrases, anti-rationalization tables, and iron laws.
8
-
9
- **Tech Stack:** TypeScript, Node.js builtins (fs, path, child_process, os), Jest for testing.
10
-
11
- ---
12
-
13
- ### Task 1: orchestrator.ts — Core Types & Batch Splitting
14
-
15
- **Files:**
16
- - Create: `src/scripts/orchestrator.ts`
17
- - Test: `tests/orchestrator.test.ts`
18
-
19
- - [ ] **Step 1: Write failing test for batch splitting**
20
-
21
- ```typescript
22
- // tests/orchestrator.test.ts
23
- import * as fs from 'fs';
24
- import * as path from 'path';
25
- import * as os from 'os';
26
- import { splitIntoBatches, OrchestratorConfig } from '../src/scripts/orchestrator';
27
-
28
- describe('orchestrator', () => {
29
- let tempDir: string;
30
-
31
- beforeEach(() => {
32
- tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'orchestrator-test-'));
33
- });
34
-
35
- afterEach(() => {
36
- fs.rmSync(tempDir, { recursive: true, force: true });
37
- });
38
-
39
- describe('splitIntoBatches', () => {
40
- it('splits items into batches of specified size', () => {
41
- const items = ['tc-001', 'tc-002', 'tc-003', 'tc-004', 'tc-005'];
42
- const result = splitIntoBatches(items, 3);
43
-
44
- expect(result).toHaveLength(2);
45
- expect(result[0]).toEqual(['tc-001', 'tc-002', 'tc-003']);
46
- expect(result[1]).toEqual(['tc-004', 'tc-005']);
47
- });
48
-
49
- it('returns single batch when items fit', () => {
50
- const items = ['tc-001', 'tc-002'];
51
- const result = splitIntoBatches(items, 3);
52
-
53
- expect(result).toHaveLength(1);
54
- expect(result[0]).toEqual(['tc-001', 'tc-002']);
55
- });
56
-
57
- it('returns empty array for empty input', () => {
58
- const result = splitIntoBatches([], 3);
59
- expect(result).toEqual([]);
60
- });
61
-
62
- it('uses default batch size of 3 when not specified', () => {
63
- const items = ['a', 'b', 'c', 'd', 'e', 'f', 'g'];
64
- const result = splitIntoBatches(items);
65
-
66
- expect(result).toHaveLength(3);
67
- expect(result[0]).toHaveLength(3);
68
- expect(result[1]).toHaveLength(3);
69
- expect(result[2]).toHaveLength(1);
70
- });
71
- });
72
- });
73
- ```
74
-
75
- - [ ] **Step 2: Run test to verify it fails**
76
-
77
- Run: `npx jest tests/orchestrator.test.ts -t "splitIntoBatches" --verbose`
78
- Expected: FAIL with "Cannot find module"
79
-
80
- - [ ] **Step 3: Write orchestrator.ts with types and splitIntoBatches**
81
-
82
- ```typescript
83
- // src/scripts/orchestrator.ts
84
- import * as fs from 'fs';
85
- import * as path from 'path';
86
-
87
- export interface OrchestratorConfig {
88
- batchSize?: number;
89
- maxFixRetries?: number;
90
- scoreThreshold?: number;
91
- workerTimeout?: number;
92
- }
93
-
94
- export interface BatchReport {
95
- batchId: number;
96
- testCases: string[];
97
- score: number;
98
- status: 'passed' | 'failed' | 'partial' | 'escalated';
99
- findings: string[];
100
- timestamp: string;
101
- }
102
-
103
- export interface OrchestratorSummary {
104
- totalBatches: number;
105
- passedBatches: number;
106
- failedBatches: number;
107
- escalatedBatches: number;
108
- overallScore: number;
109
- batchReports: BatchReport[];
110
- timestamp: string;
111
- }
112
-
113
- export const DEFAULT_CONFIG: Required<OrchestratorConfig> = {
114
- batchSize: 3,
115
- maxFixRetries: 3,
116
- scoreThreshold: 85,
117
- workerTimeout: 300,
118
- };
119
-
120
- export function splitIntoBatches(items: string[], batchSize: number = DEFAULT_CONFIG.batchSize): string[][] {
121
- if (items.length === 0) return [];
122
-
123
- const batches: string[][] = [];
124
- for (let i = 0; i < items.length; i += batchSize) {
125
- batches.push(items.slice(i, i + batchSize));
126
- }
127
- return batches;
128
- }
129
-
130
- if (require.main === module) {
131
- const args = process.argv.slice(2);
132
- const parseArg = (flag: string): string | undefined => {
133
- const idx = args.indexOf(flag);
134
- return idx !== -1 ? args[idx + 1] : undefined;
135
- };
136
-
137
- const testCasesPath = parseArg('--test-cases');
138
- const batchSizeStr = parseArg('--batch-size');
139
- const batchSize = batchSizeStr ? parseInt(batchSizeStr, 10) : DEFAULT_CONFIG.batchSize;
140
-
141
- if (!testCasesPath) {
142
- console.error('Usage: node orchestrator.ts --test-cases <path> [--batch-size <n>]');
143
- process.exit(1);
144
- }
145
-
146
- const testCases = JSON.parse(fs.readFileSync(testCasesPath, 'utf-8'));
147
- const testCaseIds = testCases.testCases.map((tc: any, i: number) => `tc-${String(i + 1).padStart(3, '0')}`);
148
- const batches = splitIntoBatches(testCaseIds, batchSize);
149
-
150
- console.log(JSON.stringify({ batches, totalTestCases: testCaseIds.length, totalBatches: batches.length }, null, 2));
151
- }
152
- ```
153
-
154
- - [ ] **Step 4: Run test to verify it passes**
155
-
156
- Run: `npx jest tests/orchestrator.test.ts -t "splitIntoBatches" --verbose`
157
- Expected: PASS
158
-
159
- - [ ] **Step 5: Commit**
160
-
161
- ```bash
162
- git add src/scripts/orchestrator.ts tests/orchestrator.test.ts
163
- git commit -m "feat(orchestrator): add types and batch splitting"
164
- ```
165
-
166
- ---
167
-
168
- ### Task 2: orchestrator.ts — Batch Output Directory Management
169
-
170
- **Files:**
171
- - Modify: `src/scripts/orchestrator.ts`
172
- - Modify: `tests/orchestrator.test.ts`
173
-
174
- - [ ] **Step 1: Write failing test for batch directory creation**
175
-
176
- Add to `tests/orchestrator.test.ts`:
177
-
178
- ```typescript
179
- import { createBatchDirectory, getBatchDirectory } from '../src/scripts/orchestrator';
180
-
181
- describe('batch directory management', () => {
182
- it('creates batch directory structure', () => {
183
- const baseDir = path.join(tempDir, '.agents', 'iteration');
184
- const result = createBatchDirectory(baseDir, 1);
185
-
186
- expect(fs.existsSync(result)).toBe(true);
187
- expect(result).toContain('batch-01');
188
- });
189
-
190
- it('returns existing batch directory path', () => {
191
- const baseDir = path.join(tempDir, '.agents', 'iteration');
192
- fs.mkdirSync(path.join(baseDir, 'batch-02'), { recursive: true });
193
-
194
- const result = getBatchDirectory(baseDir, 2);
195
- expect(result).toContain('batch-02');
196
- });
197
- });
198
- ```
199
-
200
- - [ ] **Step 2: Run test to verify it fails**
201
-
202
- Run: `npx jest tests/orchestrator.test.ts -t "batch directory" --verbose`
203
- Expected: FAIL
204
-
205
- - [ ] **Step 3: Add batch directory functions to orchestrator.ts**
206
-
207
- Add to `src/scripts/orchestrator.ts`:
208
-
209
- ```typescript
210
- export function createBatchDirectory(baseDir: string, batchId: number): string {
211
- const batchDir = path.join(baseDir, `batch-${String(batchId).padStart(2, '0')}`);
212
- fs.mkdirSync(batchDir, { recursive: true });
213
- return batchDir;
214
- }
215
-
216
- export function getBatchDirectory(baseDir: string, batchId: number): string {
217
- return path.join(baseDir, `batch-${String(batchId).padStart(2, '0')}`);
218
- }
219
-
220
- export function createTestCaseDirectory(batchDir: string, testCaseId: string): string {
221
- const tcDir = path.join(batchDir, testCaseId);
222
- fs.mkdirSync(tcDir, { recursive: true });
223
- return tcDir;
224
- }
225
- ```
226
-
227
- - [ ] **Step 4: Run test to verify it passes**
228
-
229
- Run: `npx jest tests/orchestrator.test.ts -t "batch directory" --verbose`
230
- Expected: PASS
231
-
232
- - [ ] **Step 5: Commit**
233
-
234
- ```bash
235
- git add src/scripts/orchestrator.ts tests/orchestrator.test.ts
236
- git commit -m "feat(orchestrator): add batch directory management"
237
- ```
238
-
239
- ---
240
-
241
- ### Task 3: dispatch.ts — Parallel Dispatch & Batch ID Support
242
-
243
- **Files:**
244
- - Modify: `src/scripts/dispatch.ts`
245
- - Test: `tests/dispatch-parallel.test.ts`
246
-
247
- - [ ] **Step 1: Write failing test for parallel dispatch**
248
-
249
- ```typescript
250
- // tests/dispatch-parallel.test.ts
251
- import * as fs from 'fs';
252
- import * as path from 'path';
253
- import * as os from 'os';
254
- import { dispatchSkill, dispatchParallel, ParallelDispatchResult } from '../src/scripts/dispatch';
255
-
256
- jest.mock('child_process');
257
-
258
- describe('parallel dispatch', () => {
259
- let tempDir: string;
260
-
261
- beforeEach(() => {
262
- tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'dispatch-parallel-'));
263
- });
264
-
265
- afterEach(() => {
266
- fs.rmSync(tempDir, { recursive: true, force: true });
267
- jest.clearAllMocks();
268
- });
269
-
270
- it('dispatches multiple skills in parallel and aggregates results', () => {
271
- const skillsDir = path.join(tempDir, 'skills');
272
- fs.mkdirSync(path.join(skillsDir, 'worker'), { recursive: true });
273
- fs.writeFileSync(path.join(skillsDir, 'worker', 'SKILL.md'), '---\nname: worker\n---\n\nTest');
274
-
275
- const invocations = [
276
- { skill: 'worker', batchId: 1, testCaseId: 'tc-001' },
277
- { skill: 'worker', batchId: 1, testCaseId: 'tc-002' },
278
- ];
279
-
280
- const results = dispatchParallel(invocations, skillsDir);
281
-
282
- expect(results).toHaveLength(2);
283
- expect(results[0].status).toBe('passed');
284
- expect(results[1].status).toBe('passed');
285
- });
286
-
287
- it('includes batchId and testCaseId in results', () => {
288
- const skillsDir = path.join(tempDir, 'skills');
289
- fs.mkdirSync(path.join(skillsDir, 'validation'), { recursive: true });
290
- fs.writeFileSync(path.join(skillsDir, 'validation', 'SKILL.md'), '---\nname: validation\n---\n\nTest');
291
-
292
- const invocations = [
293
- { skill: 'validation', batchId: 2, testCaseId: 'tc-003' },
294
- ];
295
-
296
- const results = dispatchParallel(invocations, skillsDir);
297
-
298
- expect(results[0].batchId).toBe(2);
299
- expect(results[0].testCaseId).toBe('tc-003');
300
- });
301
-
302
- it('handles missing skill gracefully in parallel mode', () => {
303
- const skillsDir = path.join(tempDir, 'skills');
304
- fs.mkdirSync(skillsDir, { recursive: true });
305
-
306
- const invocations = [
307
- { skill: 'nonexistent', batchId: 1, testCaseId: 'tc-001' },
308
- ];
309
-
310
- const results = dispatchParallel(invocations, skillsDir);
311
-
312
- expect(results[0].status).toBe('failed');
313
- });
314
- });
315
- ```
316
-
317
- - [ ] **Step 2: Run test to verify it fails**
318
-
319
- Run: `npx jest tests/dispatch-parallel.test.ts --verbose`
320
- Expected: FAIL
321
-
322
- - [ ] **Step 3: Add parallel dispatch to dispatch.ts**
323
-
324
- Add to `src/scripts/dispatch.ts`:
325
-
326
- ```typescript
327
- import * as child_process from 'child_process';
328
-
329
- export interface ParallelInvocation {
330
- skill: string;
331
- batchId: number;
332
- testCaseId: string;
333
- }
334
-
335
- export interface ParallelDispatchResult extends DispatchReport {
336
- batchId: number;
337
- testCaseId: string;
338
- }
339
-
340
- export function dispatchParallel(
341
- invocations: ParallelInvocation[],
342
- skillsDir: string,
343
- ): ParallelDispatchResult[] {
344
- return invocations.map((inv) => {
345
- const report = dispatchSkill(inv.skill, skillsDir);
346
- return {
347
- ...report,
348
- batchId: inv.batchId,
349
- testCaseId: inv.testCaseId,
350
- };
351
- });
352
- }
353
- ```
354
-
355
- - [ ] **Step 4: Update dispatch.ts CLI to support --parallel and --batch-id**
356
-
357
- Modify the CLI section at the bottom of `src/scripts/dispatch.ts`:
358
-
359
- ```typescript
360
- if (require.main === module) {
361
- const args = process.argv.slice(2);
362
- const parseArg = (flag: string): string | undefined => {
363
- const idx = args.indexOf(flag);
364
- return idx !== -1 ? args[idx + 1] : undefined;
365
- };
366
-
367
- const skill = parseArg('--skill');
368
- const workspace = parseArg('--workspace');
369
- const batchId = parseArg('--batch-id');
370
- const parallel = args.includes('--parallel');
371
-
372
- if (!skill) {
373
- console.error('Usage: node dispatch.ts --skill <name> --workspace <path> [--batch-id <n>] [--parallel]');
374
- process.exit(1);
375
- }
376
-
377
- const skillsDir = workspace
378
- ? path.join(workspace, '.agents', 'skills', 'workspace-maxxing', 'skills')
379
- : path.join(process.cwd(), 'skills');
380
-
381
- if (parallel) {
382
- // Read the invocation list from the JSON file given by --invocations
383
- const invocationsPath = parseArg('--invocations');
384
- if (!invocationsPath) {
385
- console.error('--parallel requires --invocations <path>');
386
- process.exit(1);
387
- }
388
- const invocations = JSON.parse(fs.readFileSync(invocationsPath, 'utf-8'));
389
- const results = dispatchParallel(invocations, skillsDir);
390
- console.log(JSON.stringify(results, null, 2));
391
- } else {
392
- const result = dispatchSkill(skill, skillsDir);
393
- const output = batchId
394
- ? { ...result, batchId: parseInt(batchId, 10) }
395
- : result;
396
- console.log(JSON.stringify(output, null, 2));
397
- }
398
- }
399
- ```
400
-
401
- - [ ] **Step 5: Run test to verify it passes**
402
-
403
- Run: `npx jest tests/dispatch-parallel.test.ts --verbose`
404
- Expected: PASS
405
-
406
- - [ ] **Step 6: Commit**
407
-
408
- ```bash
409
- git add src/scripts/dispatch.ts tests/dispatch-parallel.test.ts
410
- git commit -m "feat(dispatch): add parallel dispatch and batch-id support"
411
- ```
412
-
413
- ---
414
-
415
- ### Task 4: orchestrator.ts — Full Batch Lifecycle (generate → dispatch → validate → fix)
416
-
417
- **Files:**
418
- - Modify: `src/scripts/orchestrator.ts`
419
- - Modify: `tests/orchestrator.test.ts`
420
-
421
- - [ ] **Step 1: Write failing test for batch lifecycle**
422
-
423
- Add to `tests/orchestrator.test.ts`:
424
-
425
- ```typescript
426
- import { runBatchLifecycle, BatchLifecycleResult } from '../src/scripts/orchestrator';
427
-
428
- jest.mock('../src/scripts/dispatch');
429
- jest.mock('../src/scripts/generate-tests');
430
- jest.mock('../src/scripts/validate');
431
- jest.mock('../src/scripts/benchmark');
432
-
433
- import * as dispatch from '../src/scripts/dispatch';
434
- import * as generateTests from '../src/scripts/generate-tests';
435
- import * as validate from '../src/scripts/validate';
436
- import * as benchmark from '../src/scripts/benchmark';
437
-
438
- describe('batch lifecycle', () => {
439
- it('runs full lifecycle: generate → dispatch → validate → complete', () => {
440
- const ws = createBasicWorkspace();
441
- const config = { batchSize: 2, maxFixRetries: 3, scoreThreshold: 85, workerTimeout: 300 };
442
-
443
- (generateTests.generateTestCases as jest.Mock).mockReturnValue({
444
- testCases: [
445
- { stage: '01-input', type: 'sample', input: 'test', expected: 'test' },
446
- { stage: '02-output', type: 'sample', input: 'test', expected: 'test' },
447
- ],
448
- });
449
-
450
- (dispatch.dispatchParallel as jest.Mock).mockReturnValue([
451
- { skill: 'worker', status: 'passed', batchId: 1, testCaseId: 'tc-001', timestamp: new Date().toISOString(), findings: [], recommendations: [], metrics: {}, nextSkill: 'validation' },
452
- { skill: 'worker', status: 'passed', batchId: 1, testCaseId: 'tc-002', timestamp: new Date().toISOString(), findings: [], recommendations: [], metrics: {}, nextSkill: 'validation' },
453
- ]);
454
-
455
- (benchmark.calculateBenchmark as jest.Mock).mockReturnValue({
456
- workspace: 'test',
457
- agent: 'test',
458
- timestamp: new Date().toISOString(),
459
- rawScore: 80,
460
- weightedScore: 90,
461
- stages: [],
462
- fixSuggestions: [],
463
- improvementPotential: false,
464
- });
465
-
466
- const result = runBatchLifecycle(ws, config);
467
-
468
- expect(result.totalBatches).toBe(1);
469
- expect(result.passedBatches).toBe(1);
470
- expect(result.overallScore).toBe(90);
471
- });
472
-
473
- it('triggers fix loop when batch score below threshold', () => {
474
- const ws = createBasicWorkspace();
475
- const config = { batchSize: 2, maxFixRetries: 3, scoreThreshold: 85, workerTimeout: 300 };
476
-
477
- (generateTests.generateTestCases as jest.Mock).mockReturnValue({
478
- testCases: [
479
- { stage: '01-input', type: 'sample', input: 'test', expected: 'test' },
480
- ],
481
- });
482
-
483
- (dispatch.dispatchParallel as jest.Mock).mockReturnValue([
484
- { skill: 'worker', status: 'failed', batchId: 1, testCaseId: 'tc-001', timestamp: new Date().toISOString(), findings: ['output missing'], recommendations: ['run worker'], metrics: {}, nextSkill: 'validation' },
485
- ]);
486
-
487
- (benchmark.calculateBenchmark as jest.Mock).mockReturnValue({
488
- workspace: 'test',
489
- agent: 'test',
490
- timestamp: new Date().toISOString(),
491
- rawScore: 30,
492
- weightedScore: 40,
493
- stages: [],
494
- fixSuggestions: ['Improve output'],
495
- improvementPotential: true,
496
- });
497
-
498
- const result = runBatchLifecycle(ws, config);
499
-
500
- expect(result.failedBatches).toBeGreaterThanOrEqual(0);
501
- });
502
- });
503
- ```
504
-
505
- - [ ] **Step 2: Run test to verify it fails**
506
-
507
- Run: `npx jest tests/orchestrator.test.ts -t "batch lifecycle" --verbose`
508
- Expected: FAIL
509
-
510
- - [ ] **Step 3: Add runBatchLifecycle to orchestrator.ts**
511
-
512
- Add to `src/scripts/orchestrator.ts`:
513
-
514
- ```typescript
515
- import { generateTestCases } from './generate-tests';
516
- import { dispatchParallel, ParallelInvocation, ParallelDispatchResult } from './dispatch';
517
- import { calculateBenchmark } from './benchmark';
518
-
519
- export interface BatchLifecycleResult {
520
- totalBatches: number;
521
- passedBatches: number;
522
- failedBatches: number;
523
- escalatedBatches: number;
524
- overallScore: number;
525
- batchReports: BatchReport[];
526
- timestamp: string;
527
- }
528
-
529
- export function runBatchLifecycle(
530
- workspacePath: string,
531
- config: OrchestratorConfig = {},
532
- ): BatchLifecycleResult {
533
- const resolvedConfig: Required<OrchestratorConfig> = {
534
- batchSize: config.batchSize ?? DEFAULT_CONFIG.batchSize,
535
- maxFixRetries: config.maxFixRetries ?? DEFAULT_CONFIG.maxFixRetries,
536
- scoreThreshold: config.scoreThreshold ?? DEFAULT_CONFIG.scoreThreshold,
537
- workerTimeout: config.workerTimeout ?? DEFAULT_CONFIG.workerTimeout,
538
- };
539
-
540
- const ws = path.resolve(workspacePath);
541
- const iterationDir = path.join(ws, '.agents', 'iteration');
542
- fs.mkdirSync(iterationDir, { recursive: true });
543
-
544
- // Phase 1: Generate test cases
545
- const testCasesResult = generateTestCases(ws);
546
- const testCaseIds = testCasesResult.testCases.map((_, i) => `tc-${String(i + 1).padStart(3, '0')}`);
547
-
548
- // Phase 2: Split into batches
549
- const batches = splitIntoBatches(testCaseIds, resolvedConfig.batchSize);
550
-
551
- // Phase 3: Process each batch
552
- const batchReports: BatchReport[] = [];
553
- let passedBatches = 0;
554
- let failedBatches = 0;
555
- let escalatedBatches = 0;
556
-
557
- for (let batchIdx = 0; batchIdx < batches.length; batchIdx++) {
558
- const batchId = batchIdx + 1;
559
- const batchDir = createBatchDirectory(iterationDir, batchId);
560
- const batchTestCases = batches[batchIdx];
561
-
562
- // Dispatch workers in parallel
563
- const invocations: ParallelInvocation[] = batchTestCases.map((tcId) => ({
564
- skill: 'worker',
565
- batchId,
566
- testCaseId: tcId,
567
- }));
568
-
569
- const workerResults = dispatchParallel(invocations, path.join(ws, '.agents', 'skills', 'workspace-maxxing', 'skills'));
570
-
571
- // Write worker outputs
572
- workerResults.forEach((result) => {
573
- const tcDir = createTestCaseDirectory(batchDir, result.testCaseId);
574
- fs.writeFileSync(
575
- path.join(tcDir, 'report.json'),
576
- JSON.stringify(result, null, 2),
577
- );
578
- });
579
-
580
- // Run benchmark for batch
581
- const benchmarkResult = calculateBenchmark(ws);
582
- const batchScore = benchmarkResult.weightedScore;
583
-
584
- // Determine batch status
585
- let batchStatus: BatchReport['status'] = 'passed';
586
- if (benchmarkResult.weightedScore < resolvedConfig.scoreThreshold) {
587
- // Fix loop
588
- const fixResults = runFixLoop(
589
- batchDir,
590
- workerResults,
591
- benchmarkResult.fixSuggestions,
592
- resolvedConfig.maxFixRetries,
593
- ws,
594
- );
595
-
596
- if (fixResults.status === 'escalated') {
597
- batchStatus = 'escalated';
598
- escalatedBatches++;
599
- } else if (fixResults.status === 'failed') {
600
- batchStatus = 'failed';
601
- failedBatches++;
602
- } else {
603
- batchStatus = 'passed';
604
- passedBatches++;
605
- }
606
-
607
- // Re-run benchmark after fixes
608
- const postFixBenchmark = calculateBenchmark(ws);
609
- batchReports.push({
610
- batchId,
611
- testCases: batchTestCases,
612
- score: postFixBenchmark.weightedScore,
613
- status: batchStatus,
614
- findings: fixResults.findings,
615
- timestamp: new Date().toISOString(),
616
- });
617
- } else {
618
- passedBatches++;
619
- batchReports.push({
620
- batchId,
621
- testCases: batchTestCases,
622
- score: batchScore,
623
- status: 'passed',
624
- findings: ['Batch passed threshold'],
625
- timestamp: new Date().toISOString(),
626
- });
627
- }
628
- }
629
-
630
- // Write summary
631
- const summary: BatchLifecycleResult = {
632
- totalBatches: batches.length,
633
- passedBatches,
634
- failedBatches,
635
- escalatedBatches,
636
- overallScore: batchReports.length > 0
637
- ? Math.round(batchReports.reduce((sum, r) => sum + r.score, 0) / batchReports.length)
638
- : 0,
639
- batchReports,
640
- timestamp: new Date().toISOString(),
641
- };
642
-
643
- fs.writeFileSync(
644
- path.join(iterationDir, 'summary.json'),
645
- JSON.stringify(summary, null, 2),
646
- );
647
-
648
- console.log(JSON.stringify(summary, null, 2));
649
- return summary;
650
- }
651
-
652
- interface FixLoopResult {
653
- status: 'passed' | 'failed' | 'escalated';
654
- findings: string[];
655
- }
656
-
657
- function runFixLoop(
658
- batchDir: string,
659
- workerResults: ParallelDispatchResult[],
660
- fixSuggestions: string[],
661
- maxRetries: number,
662
- workspacePath: string,
663
- ): FixLoopResult {
664
- const findings: string[] = [];
665
-
666
- for (let retry = 0; retry < maxRetries; retry++) {
667
- const failingResults = workerResults.filter((r) => r.status !== 'passed');
668
-
669
- if (failingResults.length === 0) {
670
- return { status: 'passed', findings };
671
- }
672
-
673
- // Dispatch fixers in parallel
674
- const fixInvocations: ParallelInvocation[] = failingResults.map((r) => ({
675
- skill: 'fixer',
676
- batchId: r.batchId,
677
- testCaseId: r.testCaseId,
678
- }));
679
-
680
- const fixResults = dispatchParallel(
681
- fixInvocations,
682
- path.join(workspacePath, '.agents', 'skills', 'workspace-maxxing', 'skills'),
683
- );
684
-
685
- findings.push(`Fix attempt ${retry + 1}: ${fixResults.length} fixes applied`);
686
-
687
- // Re-check benchmark
688
- const benchmarkResult = calculateBenchmark(workspacePath);
689
- if (benchmarkResult.weightedScore >= 85) {
690
- return { status: 'passed', findings };
691
- }
692
- }
693
-
694
- return { status: 'escalated', findings: [...findings, 'Max retries exhausted'] };
695
- }
696
- ```
697
-
698
- - [ ] **Step 4: Run test to verify it passes**
699
-
700
- Run: `npx jest tests/orchestrator.test.ts -t "batch lifecycle" --verbose`
701
- Expected: PASS
702
-
703
- - [ ] **Step 5: Commit**
704
-
705
- ```bash
706
- git add src/scripts/orchestrator.ts tests/orchestrator.test.ts
707
- git commit -m "feat(orchestrator): add full batch lifecycle with fix loop"
708
- ```
709
-
710
- ---
711
-
712
- ### Task 5: Worker Sub-Skill SKILL.md
713
-
714
- **Files:**
715
- - Create: `templates/.workspace-templates/skills/worker/SKILL.md`
716
- - Test: `tests/worker-skill.test.ts`
717
-
718
- - [ ] **Step 1: Write test for worker skill structure**
719
-
720
- ```typescript
721
- // tests/worker-skill.test.ts
722
- import * as fs from 'fs';
723
- import * as path from 'path';
724
-
725
- describe('worker sub-skill', () => {
726
- const skillPath = path.join(__dirname, '..', 'templates', '.workspace-templates', 'skills', 'worker', 'SKILL.md');
727
-
728
- it('exists', () => {
729
- expect(fs.existsSync(skillPath)).toBe(true);
730
- });
731
-
732
- it('has YAML frontmatter with name and triggers', () => {
733
- const content = fs.readFileSync(skillPath, 'utf-8');
734
- expect(content).toMatch(/^---/m);
735
- expect(content).toMatch(/name:\s*worker/);
736
- expect(content).toMatch(/triggers:/);
737
- });
738
-
739
- it('has Iron Law section', () => {
740
- const content = fs.readFileSync(skillPath, 'utf-8');
741
- expect(content.toLowerCase()).toContain('the iron law');
742
- });
743
-
744
- it('has Anti-Rationalization Table', () => {
745
- const content = fs.readFileSync(skillPath, 'utf-8');
746
- expect(content).toContain('| Thought | Reality |');
747
- });
748
-
749
- it('has Report Format section with JSON schema', () => {
750
- const content = fs.readFileSync(skillPath, 'utf-8');
751
- expect(content.toLowerCase()).toContain('report format');
752
- expect(content).toContain('"skill": "worker"');
753
- });
754
- });
755
- ```
756
-
757
- - [ ] **Step 2: Run test to verify it fails**
758
-
759
- Run: `npx jest tests/worker-skill.test.ts --verbose`
760
- Expected: FAIL
761
-
762
- - [ ] **Step 3: Create worker SKILL.md**
763
-
764
- ```markdown
765
- ---
766
- name: worker
767
- description: "Executes a single test case against the workspace and produces output. Use when running test cases, executing workspace tasks, or processing stage-specific work."
768
- triggers: ["run test case", "execute workspace task", "process stage", "generate output"]
769
- ---
770
-
771
- ## Overview
772
-
773
- Execute a single test case by reading the relevant workspace sections, performing the required work, and producing structured output. Each worker runs with fresh context — no assumptions about prior runs.
774
-
775
- ## When to Use
776
-
777
- - Dispatched by orchestrator as part of a batch
778
- - User asks to run a specific test case
779
- - User asks to execute a workspace stage task
780
-
781
- ## When Not to Use
782
-
783
- - Validating outputs (use validation sub-skill)
784
- - Fixing failed outputs (use fixer sub-skill)
785
- - Planning workspace structure (use architecture sub-skill)
786
-
787
- ## The Iron Law
788
-
789
- NO SKIPPING TEST CASE STEPS
790
- NO MODIFYING WORKSPACE STRUCTURE
791
- NO CLAIMING DONE WITHOUT OUTPUT
792
- NO ASSUMING PRIOR CONTEXT
793
-
794
- ## The Process
795
-
796
- 1. **Read test case** — Load the test case JSON from `.agents/iteration/batch-<NN>/<testCaseId>/` (batch ID zero-padded to two digits, e.g. `batch-01`) or orchestrator input
797
- 2. **Load workspace context** — Read `SYSTEM.md` and relevant stage `CONTEXT.md` files
798
- 3. **Execute the task** — Follow the test case input/expected instructions
799
- 4. **Write output.md** — Human-readable output in `.agents/iteration/batch-<NN>/<testCaseId>/output.md` (batch ID zero-padded to two digits, e.g. `batch-01`)
800
- 5. **Write report.json** — Structured JSON with `{testCaseId, status, output, findings}`
801
- 6. **Dispatch validation** — Signal that output is ready for validation
802
-
803
- ## Anti-Rationalization Table
804
-
805
- | Thought | Reality |
806
- |---------|---------|
807
- | "I already know what this stage does" | Read the CONTEXT.md. Assumptions cause failures. |
808
- | "The output is good enough" | Good enough fails validation. Follow the test case exactly. |
809
- | "I'll modify the workspace structure to make this easier" | Workers don't modify structure. That's the fixer's job. |
810
- | "This test case is redundant" | Every test case exists for a reason. Execute it. |
811
- | "I'll skip writing report.json" | Validation depends on report.json. It's mandatory. |
812
-
813
- ## Sub-Skill Dispatch
814
-
815
- - After output complete → validation sub-skill
816
-
817
- ## Report Format
818
-
819
- ```json
820
- {
821
- "skill": "worker",
822
- "status": "passed|failed|escalated",
823
- "timestamp": "<ISO-8601>",
824
- "testCaseId": "<id>",
825
- "batchId": <number>,
826
- "findings": ["<finding>"],
827
- "recommendations": ["<recommendation>"],
828
- "metrics": {
829
- "executionTimeMs": <number>,
830
- "outputLength": <number>
831
- },
832
- "nextSkill": "validation"
833
- }
834
- ```
835
- ```
836
-
837
- - [ ] **Step 4: Run test to verify it passes**
838
-
839
- Run: `npx jest tests/worker-skill.test.ts --verbose`
840
- Expected: PASS
841
-
842
- - [ ] **Step 5: Commit**
843
-
844
- ```bash
845
- git add templates/.workspace-templates/skills/worker/SKILL.md tests/worker-skill.test.ts
846
- git commit -m "feat(worker): add worker sub-skill with obra patterns"
847
- ```
848
-
849
- ---
850
-
851
- ### Task 6: Fixer Sub-Skill SKILL.md
852
-
853
- **Files:**
854
- - Create: `templates/.workspace-templates/skills/fixer/SKILL.md`
855
- - Test: `tests/fixer-skill.test.ts`
856
-
857
- - [ ] **Step 1: Write test for fixer skill structure**
858
-
859
- ```typescript
860
- // tests/fixer-skill.test.ts
861
- import * as fs from 'fs';
862
- import * as path from 'path';
863
-
864
- describe('fixer sub-skill', () => {
865
- const skillPath = path.join(__dirname, '..', 'templates', '.workspace-templates', 'skills', 'fixer', 'SKILL.md');
866
-
867
- it('exists', () => {
868
- expect(fs.existsSync(skillPath)).toBe(true);
869
- });
870
-
871
- it('has YAML frontmatter with name and triggers', () => {
872
- const content = fs.readFileSync(skillPath, 'utf-8');
873
- expect(content).toMatch(/^---/m);
874
- expect(content).toMatch(/name:\s*fixer/);
875
- expect(content).toMatch(/triggers:/);
876
- });
877
-
878
- it('has Iron Law section', () => {
879
- const content = fs.readFileSync(skillPath, 'utf-8');
880
- expect(content.toLowerCase()).toContain('the iron law');
881
- });
882
-
883
- it('has Anti-Rationalization Table', () => {
884
- const content = fs.readFileSync(skillPath, 'utf-8');
885
- expect(content).toContain('| Thought | Reality |');
886
- });
887
-
888
- it('has Report Format section with JSON schema', () => {
889
- const content = fs.readFileSync(skillPath, 'utf-8');
890
- expect(content.toLowerCase()).toContain('report format');
891
- expect(content).toContain('"skill": "fixer"');
892
- });
893
- });
894
- ```
895
-
896
- - [ ] **Step 2: Run test to verify it fails**
897
-
898
- Run: `npx jest tests/fixer-skill.test.ts -v`
899
- Expected: FAIL
900
-
901
- - [ ] **Step 3: Create fixer SKILL.md**
902
-
903
- ```markdown
904
- ---
905
- name: fixer
906
- description: "Applies targeted fixes to failing test case outputs. Use when fixing failed worker outputs, improving low-scoring results, or addressing validator findings."
907
- triggers: ["fix failing test", "improve output", "address validation failure", "apply targeted fix"]
908
- ---
909
-
910
- ## Overview
911
-
912
- Read validator findings and the original worker output, identify the root cause of each failure, apply the minimal fix needed, and hand the result to the validation sub-skill for re-validation. Each fixer runs with fresh context.
913
-
914
- ## When to Use
915
-
916
- - Dispatched by orchestrator fix loop
917
- - Validator identifies specific failures
918
- - Worker output is incomplete or incorrect
919
-
920
- ## When Not to Use
921
-
922
- - Generating new output from scratch (use worker sub-skill)
923
- - Validating outputs (use validation sub-skill)
924
- - Restructuring workspace (use architecture sub-skill)
925
-
926
- ## The Iron Law
927
-
928
- NO BLIND RETRIES
929
- NO COSMETIC FIXES
930
- NO FIXING WHAT ISN'T BROKEN
931
- NO CLAIMING FIX WITHOUT RE-VALIDATION
932
-
933
- ## The Process
934
-
935
- 1. **Read validator findings** — Load `batch-report.json` from batch directory
936
- 2. **Read original output** — Load `output.md` and `report.json` from `.agents/iteration/batch-<N>/<testCaseId>/`
937
- 3. **Identify root cause** — Map each finding to a specific issue in the output
938
- 4. **Apply minimal fix** — Change only what's needed to address the finding
939
- 5. **Update output.md** — Write the fixed output
940
- 6. **Update report.json** — Write updated report with fix details
941
- 7. **Dispatch validation** — Signal that fix is ready for re-validation
942
-
943
- ## Anti-Rationalization Table
944
-
945
- | Thought | Reality |
946
- |---------|---------|
947
- | "I'll just re-run the worker logic" | Blind retries don't fix root causes. Read the findings. |
948
- | "This looks better now" | Better is subjective. Does it pass the test case? |
949
- | "I'll fix other things while I'm here" | Fix only what the validator flagged. Scope creep wastes cycles. |
950
- | "The fix is obvious" | Obvious to whom? Follow the findings, not intuition. |
951
- | "I don't need to re-validate" | Unvalidated fixes are guesses. Always re-validate. |
952
-
953
- ## Sub-Skill Dispatch
954
-
955
- - After fix applied → validation sub-skill
956
-
957
- ## Report Format
958
-
959
- ```json
960
- {
961
- "skill": "fixer",
962
- "status": "passed|failed|escalated",
963
- "timestamp": "<ISO-8601>",
964
- "testCaseId": "<id>",
965
- "batchId": <number>,
966
- "findings": ["<finding>"],
967
- "fixesApplied": ["<fix description>"],
968
- "recommendations": ["<recommendation>"],
969
- "metrics": {
970
- "findingsAddressed": <number>,
971
- "fixesApplied": <number>
972
- },
973
- "nextSkill": "validation"
974
- }
975
- ```
976
- ```
977
-
978
- - [ ] **Step 4: Run test to verify it passes**
979
-
980
- Run: `npx jest tests/fixer-skill.test.ts -v`
981
- Expected: PASS
982
-
983
- - [ ] **Step 5: Commit**
984
-
985
- ```bash
986
- git add templates/.workspace-templates/skills/fixer/SKILL.md tests/fixer-skill.test.ts
987
- git commit -m "feat(fixer): add fixer sub-skill with obra patterns"
988
- ```
989
-
990
- ---
991
-
992
- ### Task 7: Enhanced Validation Sub-Skill SKILL.md
993
-
994
- **Files:**
995
- - Modify: `templates/.workspace-templates/skills/validation/SKILL.md`
996
- - Test: `tests/validation-enhanced.test.ts`
997
-
998
- - [ ] **Step 1: Write test for enhanced validation skill**
999
-
1000
- ```typescript
1001
- // tests/validation-enhanced.test.ts
1002
- import * as fs from 'fs';
1003
- import * as path from 'path';
1004
-
1005
- describe('enhanced validation sub-skill', () => {
1006
- const skillPath = path.join(__dirname, '..', 'templates', '.workspace-templates', 'skills', 'validation', 'SKILL.md');
1007
-
1008
- it('has YAML frontmatter with triggers', () => {
1009
- const content = fs.readFileSync(skillPath, 'utf-8');
1010
- expect(content).toMatch(/^---/m);
1011
- expect(content).toMatch(/name:\s*validation/);
1012
- expect(content).toMatch(/triggers:/);
1013
- });
1014
-
1015
- it('has Iron Law section', () => {
1016
- const content = fs.readFileSync(skillPath, 'utf-8');
1017
- expect(content.toLowerCase()).toContain('the iron law');
1018
- });
1019
-
1020
- it('has Anti-Rationalization Table', () => {
1021
- const content = fs.readFileSync(skillPath, 'utf-8');
1022
- expect(content).toContain('| Thought | Reality |');
1023
- });
1024
-
1025
- it('includes batch-level validation instructions', () => {
1026
- const content = fs.readFileSync(skillPath, 'utf-8');
1027
- expect(content.toLowerCase()).toContain('batch');
1028
- });
1029
-
1030
- it('has Report Format with benchmark scoring', () => {
1031
- const content = fs.readFileSync(skillPath, 'utf-8');
1032
- expect(content.toLowerCase()).toContain('report format');
1033
- expect(content.toLowerCase()).toContain('benchmark');
1034
- });
1035
- });
1036
- ```
1037
-
1038
- - [ ] **Step 2: Run test to verify it fails**
1039
-
1040
- Run: `npx jest tests/validation-enhanced.test.ts -v`
1041
- Expected: FAIL
1042
-
1043
- - [ ] **Step 3: Rewrite validation SKILL.md with obra patterns**
1044
-
1045
- ```markdown
1046
- ---
1047
- name: validation
1048
- description: "Checks workspace ICM compliance and benchmarks batch outputs. Use when validating a workspace, checking compliance, running validation, benchmarking batch results, or after making changes to workspace structure."
1049
- triggers: ["validate batch", "check results", "run validation", "benchmark outputs", "check compliance"]
1050
- ---
1051
-
1052
- ## Overview
1053
-
1054
- Ensure workspace meets ICM standards and benchmark batch outputs through systematic validation. Validates both workspace structure and worker/fixer outputs.
1055
-
1056
- ## When to Use
1057
-
1058
- - After workspace scaffolding
1059
- - After any structural change
1060
- - After worker batch completes
1061
- - After fixer applies fixes
1062
- - Before claiming delivery
1063
- - When score drops below threshold
1064
-
1065
- ## When Not to Use
1066
-
1067
- - Generating outputs (use worker sub-skill)
1068
- - Fixing failures (use fixer sub-skill)
1069
- - Researching patterns (use research sub-skill)
1070
-
1071
- ## The Iron Law
1072
-
1073
- NO SCORE INFLATION
1074
- NO SKIPPING FAILURES
1075
- NO VALIDATING WITHOUT BENCHMARK
1076
- NO PASSING WITHOUT EVIDENCE
1077
-
1078
- ## The Process
1079
-
1080
- 1. **Run validate.ts** — Execute `node scripts/validate.ts --workspace <path>`
1081
- 2. **Parse results** — Read exit code and output
1082
- 3. **Check batch outputs** — For each test case in batch, verify output.md and report.json exist
1083
- 4. **Run benchmark** — Execute `node scripts/benchmark.ts --workspace <path>`
1084
- 5. **Aggregate scores** — Combine workspace validation + benchmark scores
1085
- 6. **Generate findings** — List specific failures with fix suggestions
1086
- 7. **Write batch-report.json** — Structured report with per-test scores and overall batch score
1087
-
1088
- ## Batch-Level Validation
1089
-
1090
- When validating a batch:
1091
- - Read all `report.json` files in `.agents/iteration/batch-<N>/`
1092
- - Verify each worker output matches its test case expectations
1093
- - Calculate per-test-case pass/fail
1094
- - Calculate overall batch score using benchmark weights
1095
- - If batch score < threshold → recommend fixer sub-skill
1096
-
1097
- ## Anti-Rationalization Table
1098
-
1099
- | Thought | Reality |
1100
- |---------|---------|
1101
- | "This workspace looks good enough" | Good enough is the enemy of excellent. Run validation. |
1102
- | "The score is close, I'll round up" | Score inflation hides real problems. Report the true score. |
1103
- | "One failure doesn't matter" | Every failure matters. Report it. |
1104
- | "I already validated this" | Validation is a snapshot. Re-validate after every change. |
1105
- | "The benchmark is too strict" | The benchmark is the standard. Meet it or escalate. |
1106
-
1107
- ## Sub-Skill Dispatch
1108
-
1109
- - If batch score < threshold → fixer sub-skill
1110
- - If batch score >= threshold → orchestrator (batch complete)
1111
- - If critical failures occur (e.g., missing SYSTEM.md) → escalate to human
1112
-
1113
- ## Report Format
1114
-
1115
- ```json
1116
- {
1117
- "skill": "validation",
1118
- "status": "passed|failed|escalated",
1119
- "timestamp": "<ISO-8601>",
1120
- "batchId": <number>,
1121
- "findings": ["<finding>"],
1122
- "fixSuggestions": ["<suggestion>"],
1123
- "recommendations": ["<recommendation>"],
1124
- "metrics": {
1125
- "score": <0-100>,
1126
- "benchmarkScore": <0-100>,
1127
- "itemsChecked": <number>,
1128
- "itemsPassed": <number>,
1129
- "testCasesPassed": <number>,
1130
- "testCasesFailed": <number>
1131
- },
1132
- "nextSkill": "fixer|none"
1133
- }
1134
- ```
1135
- ```
1136
-
1137
- - [ ] **Step 4: Run test to verify it passes**
1138
-
1139
- Run: `npx jest tests/validation-enhanced.test.ts -v`
1140
- Expected: PASS
1141
-
1142
- - [ ] **Step 5: Commit**
1143
-
1144
- ```bash
1145
- git add templates/.workspace-templates/skills/validation/SKILL.md tests/validation-enhanced.test.ts
1146
- git commit -m "feat(validation): enhance with obra patterns and batch validation"
1147
- ```
1148
-
1149
- ---
1150
-
1151
- ### Task 8: Rewrite Remaining 6 Sub-Skills with obra/superpowers Patterns
1152
-
1153
- **Files:**
1154
- - Modify: `templates/.workspace-templates/skills/research/SKILL.md`
1155
- - Modify: `templates/.workspace-templates/skills/architecture/SKILL.md`
1156
- - Modify: `templates/.workspace-templates/skills/testing/SKILL.md`
1157
- - Modify: `templates/.workspace-templates/skills/prompt-engineering/SKILL.md`
1158
- - Modify: `templates/.workspace-templates/skills/iteration/SKILL.md`
1159
- - Modify: `templates/.workspace-templates/skills/tooling/SKILL.md`
1160
-
1161
- Each sub-skill gets this structure:
1162
- - YAML frontmatter: `name`, `description`, `triggers`
1163
- - Overview
1164
- - When to Use / When Not to Use
1165
- - The Iron Law (3-4 rules)
1166
- - The Process (numbered steps)
1167
- - Anti-Rationalization Table
1168
- - Sub-Skill Dispatch (if applicable)
1169
- - Report Format (JSON schema)
1170
-
1171
- - [ ] **Step 1: Rewrite research/SKILL.md**
1172
-
1173
- ```markdown
1174
- ---
1175
- name: research
1176
- description: "Investigates patterns, gathers context, and identifies best practices for workspace design. Use when starting a new workspace, researching workflow patterns, or before architecture planning."
1177
- triggers: ["research workflow", "gather context", "identify patterns", "best practices"]
1178
- ---
1179
-
1180
- ## Overview
1181
-
1182
- Gather context and identify patterns before building. Research ensures the workspace design is informed by real requirements, not assumptions.
1183
-
1184
- ## When to Use
1185
-
1186
- - Phase 1 of hybrid flow (always first)
1187
- - Before architecture planning
1188
- - When user asks for a novel workflow type
1189
- - When existing patterns don't fit the use case
1190
-
1191
- ## When Not to Use
1192
-
1193
- - After architecture is already planned (use architecture sub-skill)
1194
- - When workspace structure already exists (use validation sub-skill)
1195
- - For simple file creation (direct file operations)
1196
-
1197
- ## The Iron Law
1198
-
1199
- NO BUILD WITHOUT RESEARCH
1200
- NO GENERIC FINDINGS
1201
- NO SKIPPING INPUT/OUTPUT ANALYSIS
1202
- NO ASSUMPTIONS WITHOUT EVIDENCE
1203
-
1204
- ## The Process
1205
-
1206
- 1. **Identify workflow type** — What kind of process is being automated?
1207
- 2. **Research similar patterns** — Look at existing workspaces, documentation, best practices
1208
- 3. **Identify key stages** — What are the natural phases of this workflow?
1209
- 4. **Determine inputs/outputs** — What goes in, what comes out at each stage?
1210
- 5. **Identify tooling needs** — What tools are commonly used for this workflow?
1211
- 6. **Document findings** — Create a research summary for the architecture phase
1212
-
1213
- ## Anti-Rationalization Table
1214
-
1215
- | Thought | Reality |
1216
- |---------|---------|
1217
- | "I already know this workflow type" | Knowledge ≠ research. Document findings for the next agent. |
1218
- | "Research is taking too long" | Research prevents wasted build time. Be thorough. |
1219
- | "I'll figure it out while building" | Building without research produces generic, non-optimal workspaces. |
1220
- | "The user will clarify later" | Ask now. Ambiguous requirements produce ambiguous workspaces. |
1221
-
1222
- ## Sub-Skill Dispatch
1223
-
1224
- - Always dispatches to architecture sub-skill next
1225
- - If research is inconclusive → escalate to human for clarification
1226
-
1227
- ## Report Format
1228
-
1229
- ```json
1230
- {
1231
- "skill": "research",
1232
- "status": "passed|failed|escalated",
1233
- "timestamp": "<ISO-8601>",
1234
- "findings": ["<finding>"],
1235
- "recommendations": ["<recommendation>"],
1236
- "metrics": {
1237
- "patternsIdentified": <number>,
1238
- "stagesIdentified": <number>
1239
- },
1240
- "nextSkill": "architecture"
1241
- }
1242
- ```
1243
- ```
1244
-
1245
- - [ ] **Step 2: Rewrite architecture/SKILL.md**
1246
-
1247
- ```markdown
1248
- ---
1249
- name: architecture
1250
- description: "Designs workspace structure, plans folder layout, and creates the build plan. Use when planning workspace structure, designing folder hierarchy, or after research phase."
1251
- triggers: ["design workspace", "plan structure", "folder layout", "build plan"]
1252
- ---
1253
-
1254
- ## Overview
1255
-
1256
- Design the workspace structure based on research findings. Architecture translates research into a concrete, buildable plan.
1257
-
1258
- ## When to Use
1259
-
1260
- - Phase 2 of hybrid flow (after research)
1261
- - When research is complete and building is next
1262
- - When restructuring an existing workspace
1263
-
1264
- ## When Not to Use
1265
-
1266
- - Before research is complete (use research sub-skill)
1267
- - During building (use scaffold.ts directly)
1268
- - For minor structural tweaks (direct file operations)
1269
-
1270
- ## The Iron Law
1271
-
1272
- NO ARCHITECTURE WITHOUT RESEARCH
1273
- NO BUILDING WITHOUT APPROVED PLAN
1274
- NO SKIPPING USER APPROVAL
1275
- NO AMBIGUOUS STAGE DEFINITIONS
1276
-
1277
- ## The Process
1278
-
1279
- 1. **Review research findings** — Read the research sub-skill report
1280
- 2. **Define stage folders** — Determine numbered folder structure (01-xxx, 02-xxx, etc.)
1281
- 3. **Design routing table** — Plan CONTEXT.md routing for each stage
1282
- 4. **Define SYSTEM.md** — Plan folder map, rules, and tool inventory
1283
- 5. **Plan CONTEXT.md content** — Define what each stage's CONTEXT.md should contain
1284
- 6. **Create build plan** — Document the scaffold.ts command with all parameters
1285
- 7. **Get approval** — Present plan to user before building
1286
-
1287
- ## Anti-Rationalization Table
1288
-
1289
- | Thought | Reality |
1290
- |---------|---------|
1291
- | "I'll adjust the structure while building" | Structure changes mid-build are expensive. Plan first. |
1292
- | "This stage name is good enough" | Stage names affect routing. Be precise. |
1293
- | "The user will understand without approval" | Unapproved plans produce unwanted results. Always present the plan. |
1294
-
1295
- ## Sub-Skill Dispatch
1296
-
1297
- - Receives input from research sub-skill
1298
- - After approval → main skill runs scaffold.ts
1299
- - If architecture is unclear → escalate to human
1300
-
1301
- ## Report Format
1302
-
1303
- ```json
1304
- {
1305
- "skill": "architecture",
1306
- "status": "passed|failed|escalated",
1307
- "timestamp": "<ISO-8601>",
1308
- "findings": ["<finding>"],
1309
- "recommendations": ["<recommendation>"],
1310
- "metrics": {
1311
- "stagesPlanned": <number>,
1312
- "toolsIdentified": <number>
1313
- },
1314
- "nextSkill": "none"
1315
- }
1316
- ```
1317
- ```
1318
-
1319
- - [ ] **Step 3: Rewrite testing/SKILL.md**
1320
-
1321
- ```markdown
1322
- ---
1323
- name: testing
1324
- description: "Generates and runs test cases, evaluates results, and identifies gaps. Use when testing workspace quality, generating test cases, or after prompt improvements."
1325
- triggers: ["generate test cases", "run tests", "test workspace", "evaluate quality"]
1326
- ---
1327
-
1328
- ## Overview
1329
-
1330
- Verify workspace quality through systematic testing. Testing ensures the workspace produces correct outputs across sample, edge-case, and empty inputs.
1331
-
1332
- ## When to Use
1333
-
1334
- - After prompt-engineering improvements
1335
- - When no tests exist for the workspace
1336
- - Before claiming delivery
1337
- - When score is above 80 but quality is uncertain
1338
-
1339
- ## When Not to Use
1340
-
1341
- - Before workspace is built (use scaffold.ts first)
1342
- - For structural validation (use validation sub-skill)
1343
- - When fixing failures (use fixer sub-skill)
1344
-
1345
- ## The Iron Law
1346
-
1347
- NO SKIPPING TEST GENERATION
1348
- NO IGNORING FAILED TESTS
1349
- NO CLAIMING QUALITY WITHOUT EVIDENCE
1350
- NO TESTING WITHOUT TEST CASES
1351
-
1352
- ## The Process
1353
-
1354
- 1. **Generate test cases** — Run `node scripts/generate-tests.ts --workspace <path> --output ./tests.json`
1355
- 2. **Read test cases** — Parse the generated test cases
1356
- 3. **Run generation tests** — For each test case, create sample content the stage should produce
1357
- 4. **Run evaluation tests** — Review CONTEXT.md files against test cases
1358
- 5. **Aggregate results** — Identify patterns and gaps
1359
- 6. **Document findings** — Create test report with pass/fail per test case
1360
-
1361
- ## Anti-Rationalization Table
1362
-
1363
- | Thought | Reality |
1364
- |---------|---------|
1365
- | "The workspace looks fine, no need to test" | Looks deceive. Tests reveal. |
1366
- | "One failed test is a fluke" | Failed tests are signals. Investigate. |
1367
- | "I'll test after delivery" | Untested delivery is a gamble. Test first. |
1368
-
1369
- ## Sub-Skill Dispatch
1370
-
1371
- - Dispatched after prompt-engineering
1372
- - If tests fail → dispatch iteration for fixes
1373
- - If tests pass → no further sub-skill dispatch is needed (`nextSkill` is `none`)
1374
-
1375
- ## Report Format
1376
-
1377
- ```json
1378
- {
1379
- "skill": "testing",
1380
- "status": "passed|failed|escalated",
1381
- "timestamp": "<ISO-8601>",
1382
- "findings": ["<finding>"],
1383
- "recommendations": ["<recommendation>"],
1384
- "metrics": {
1385
- "testCasesGenerated": <number>,
1386
- "testCasesPassed": <number>,
1387
- "testCasesFailed": <number>
1388
- },
1389
- "nextSkill": "iteration|none"
1390
- }
1391
- ```
1392
- ```
1393
-
1394
- - [ ] **Step 4: Rewrite prompt-engineering/SKILL.md**
1395
-
1396
- ```markdown
1397
- ---
1398
- name: prompt-engineering
1399
- description: "Improves CONTEXT.md and SYSTEM.md prompts for better agent behavior. Use when workspace score is below 80, prompts need improvement, or after validation identifies content gaps."
1400
- triggers: ["improve prompts", "fix content gaps", "optimize prompts", "clarify instructions"]
1401
- ---
1402
-
1403
- ## Overview
1404
-
1405
- Optimize workspace prompts for clarity, completeness, and agent guidance. Prompt engineering fixes content-level issues without structural changes.
1406
-
1407
- ## When to Use
1408
-
1409
- - Score < 80 in benchmark results
1410
- - Validation identifies missing content
1411
- - Prompts are vague or incomplete
1412
- - Agent behavior doesn't match expectations
1413
-
1414
- ## When Not to Use
1415
-
1416
- - For structural issues (use fixer or architecture sub-skill)
1417
- - When workspace has no content yet (use worker sub-skill)
1418
- - For tool installation (use tooling sub-skill)
1419
-
1420
- ## The Iron Law
1421
-
1422
- NO COSMETIC CHANGES WITHOUT FUNCTIONAL IMPROVEMENT
1423
- NO CHANGING PROMPTS WITHOUT RE-VALIDATING
1424
- NO REMOVING CONTENT WITHOUT REPLACEMENT
1425
- NO CLAIMING IMPROVEMENT WITHOUT SCORE CHECK
1426
-
1427
- ## The Process
1428
-
1429
- 1. **Identify weak prompts** — Read benchmark findings and validation failures
1430
- 2. **Analyze current prompts** — What's missing, vague, or unclear?
1431
- 3. **Apply prompt patterns** — Use clear structure, examples, constraints, and output formats
1432
- 4. **Update CONTEXT.md files** — Improve stage-specific instructions
1433
- 5. **Update SYSTEM.md if needed** — Improve folder map, rules, or tool inventory
1434
- 6. **Re-run validation** — Verify improvements didn't break anything
1435
- 7. **Re-run benchmark** — Check if score improved
1436
-
1437
- ## Anti-Rationalization Table
1438
-
1439
- | Thought | Reality |
1440
- |---------|---------|
1441
- | "This wording change is enough" | Wording changes must produce functional improvement. |
1442
- | "I'll remove vague sections" | Removing creates gaps. Improve, don't delete. |
1443
- | "The score didn't change, but it's better" | If the score didn't change, it's not better. Try again. |
1444
-
1445
- ## Sub-Skill Dispatch
1446
-
1447
- - Dispatched when score < 80
1448
- - After improvements → dispatch testing to verify
1449
- - If score doesn't improve → dispatch iteration for deeper fixes
1450
-
1451
- ## Report Format
1452
-
1453
- ```json
1454
- {
1455
- "skill": "prompt-engineering",
1456
- "status": "passed|failed|escalated",
1457
- "timestamp": "<ISO-8601>",
1458
- "findings": ["<finding>"],
1459
- "recommendations": ["<recommendation>"],
1460
- "metrics": {
1461
- "scoreBefore": <number>,
1462
- "scoreAfter": <number>,
1463
- "promptsUpdated": <number>
1464
- },
1465
- "nextSkill": "testing|iteration|none"
1466
- }
1467
- ```
1468
- ```
1469
-
1470
- - [ ] **Step 5: Rewrite iteration/SKILL.md**
1471
-
1472
- ```markdown
1473
- ---
1474
- name: iteration
1475
- description: "Runs autonomous improvement loops with benchmark scoring. Use when the score has plateaued, when deeper fixes are needed, or after testing identifies patterns."
1476
- triggers: ["run improvement loop", "iterate on workspace", "deeper fixes", "score plateau"]
1477
- ---
1478
-
1479
- ## Overview
1480
-
1481
- Execute improvement loops until quality thresholds are met. Iteration applies systematic fixes when prompt-engineering isn't enough.
1482
-
1483
- ## When to Use
1484
-
1485
- - Score plateaued (no improvement between runs)
1486
- - Testing identified patterns requiring deeper fixes
1487
- - Validation failures persist after prompt-engineering
1488
- - As part of the condition-driven improvement loop
1489
-
1490
- ## When Not to Use
1491
-
1492
- - For first-pass improvements (use prompt-engineering first)
1493
- - When workspace is new and untested (use testing first)
1494
- - When structural changes are needed (use architecture sub-skill)
1495
-
1496
- ## The Iron Law
1497
-
1498
- NO CLAIMING IMPROVEMENT WITHOUT RE-RUNNING BENCHMARK
1499
- NO SKIPPING FIX SUGGESTIONS
1500
- NO INFINITE ITERATION LOOPS
1501
- NO SKIPPING ESCALATION WHEN STUCK
1502
-
1503
- ## The Process
1504
-
1505
- 1. **Run iterate.ts** — Execute `node scripts/iterate.ts --workspace <path> --max-retries 3`
1506
- 2. **Read benchmark results** — Parse the JSON output
1507
- 3. **Identify improvement areas** — Read fixSuggestions and improvementPotential
1508
- 4. **Apply fixes** — Address each suggestion systematically
1509
- 5. **Re-run iteration** — Check if score improved
1510
- 6. **Repeat until threshold** — Continue until score > 85 or no improvement possible
1511
- 7. **Escalate if stuck** — If score doesn't improve after 3 attempts, escalate to human
1512
-
1513
- ## Anti-Rationalization Table
1514
-
1515
- | Thought | Reality |
1516
- |---------|---------|
1517
- | "I'll just run it again" | Without applying fixes, re-running is wasted cycles. |
1518
- | "The score improved by 1 point" | Marginal improvements aren't meaningful. Target > 85. |
1519
- | "I'll keep iterating until it works" | Max 3 attempts. Then escalate. |
1520
-
1521
- ## Sub-Skill Dispatch
1522
-
1523
- - Dispatched when score plateaued
1524
- - After iteration → re-run validation and benchmark
1525
- - If score > 85 → workflow complete
1526
- - If stuck after 3 attempts → escalate to human
1527
-
1528
- ## Report Format
1529
-
1530
- ```json
1531
- {
1532
- "skill": "iteration",
1533
- "status": "passed|failed|escalated",
1534
- "timestamp": "<ISO-8601>",
1535
- "findings": ["<finding>"],
1536
- "recommendations": ["<recommendation>"],
1537
- "metrics": {
1538
- "scoreBefore": <number>,
1539
- "scoreAfter": <number>,
1540
- "iterationsRun": <number>
1541
- },
1542
- "nextSkill": "none"
1543
- }
1544
- ```
1545
- ```
1546
-
1547
- - [ ] **Step 6: Rewrite tooling/SKILL.md**
1548
-
1549
- ```markdown
1550
- ---
1551
- name: tooling
1552
- description: "Assesses, installs, and configures tools for the workspace. Use when tools are missing, tool inventory needs updating, or workspace requires specific dependencies."
1553
- triggers: ["install tools", "assess tooling", "update tool inventory", "configure dependencies"]
1554
- ---
1555
-
1556
- ## Overview
1557
-
1558
- Ensure workspace has the right tools installed and configured. Tooling manages the dependency layer of the workspace.
1559
-
1560
- ## When to Use
1561
-
1562
- - Tool inventory is empty or incomplete
1563
- - Workspace requires specific dependencies
1564
- - After architecture phase identifies tooling needs
1565
- - When user requests specific tool installation
1566
-
1567
- ## When Not to Use
1568
-
1569
- - For non-tool structural changes (use architecture sub-skill)
1570
- - For content improvements (use prompt-engineering sub-skill)
1571
- - When no tools are needed (skip tooling phase)
1572
-
1573
- ## The Iron Law
1574
-
1575
- NO INSTALLING TOOLS WITHOUT USER APPROVAL
1576
- NO SKIPPING TOOL INVENTORY UPDATES
1577
- NO INSTALLING UNNECESSARY TOOLS
1578
- NO SKIPPING VERIFICATION AFTER INSTALLATION
1579
-
1580
- ## The Process
1581
-
1582
- 1. **Scan current tools** — Read SYSTEM.md tool inventory
1583
- 2. **Identify missing tools** — Compare against workspace requirements
1584
- 3. **Propose tools** — List recommended tools with justifications
1585
- 4. **Get approval** — Present tool list to user for approval
1586
- 5. **Install tools** — Run `node scripts/install-tool.ts --tool <name> --manager <mgr> --workspace <path>`
1587
- 6. **Update inventory** — Verify tool inventory is updated
1588
- 7. **Verify installation** — Confirm tools are accessible
1589
-
1590
- ## Anti-Rationalization Table
1591
-
1592
- | Thought | Reality |
1593
- |---------|---------|
1594
- | "This tool might be useful" | Might is not enough. Justify each tool against workspace needs. |
1595
- | "I'll install it now and tell the user later" | User approval comes before installation. Always. |
1596
- | "The installation probably worked" | Probably is not verified. Check. |
1597
-
1598
- ## Sub-Skill Dispatch
1599
-
1600
- - Dispatched when tools are missing
1601
- - After installation → workflow continues to next phase
1602
- - If tool installation fails → escalate to human
1603
-
1604
- ## Report Format
1605
-
1606
- ```json
1607
- {
1608
- "skill": "tooling",
1609
- "status": "passed|failed|escalated",
1610
- "timestamp": "<ISO-8601>",
1611
- "findings": ["<finding>"],
1612
- "recommendations": ["<recommendation>"],
1613
- "metrics": {
1614
- "toolsInstalled": <number>,
1615
- "toolsProposed": <number>,
1616
- "toolsFailed": <number>
1617
- },
1618
- "nextSkill": "none"
1619
- }
1620
- ```
1621
- ```
1622
-
1623
- - [ ] **Step 7: Commit**
1624
-
1625
- ```bash
1626
- git add templates/.workspace-templates/skills/research/SKILL.md templates/.workspace-templates/skills/architecture/SKILL.md templates/.workspace-templates/skills/testing/SKILL.md templates/.workspace-templates/skills/prompt-engineering/SKILL.md templates/.workspace-templates/skills/iteration/SKILL.md templates/.workspace-templates/skills/tooling/SKILL.md
1627
- git commit -m "refactor(sub-skills): rewrite all 6 remaining sub-skills with obra patterns"
1628
- ```
1629
-
1630
- ---
1631
-
1632
- ### Task 9: Update Main SKILL.md with Autonomous Iteration Workflow
1633
-
1634
- **Files:**
1635
- - Modify: `templates/SKILL.md`
1636
-
1637
- - [ ] **Step 1: Rewrite templates/SKILL.md with new workflow section**
1638
-
1639
- Replace the entire file content with:
1640
-
1641
- ```markdown
1642
- ---
1643
- name: workspace-maxxing
1644
- description: "Autonomously creates, validates, and improves ICM-compliant workspaces using batched parallel sub-agents. Use when user asks to 'build a workspace', 'create a workflow', 'automate a process', 'improve this workspace', 'validate this workspace', 'iterate on this workspace', or 'run test cases'."
1645
- ---
1646
-
1647
- # Workspace-Maxxing Skill
1648
-
1649
- ## Overview
1650
-
1651
- Autonomous workflow system that creates, validates, and improves ICM-compliant workspaces through phased execution, batched parallel sub-agent iteration, and condition-driven improvement loops.
1652
-
1653
- ## When to Use
1654
-
1655
- - User asks to build, create, or automate a workflow
1656
- - User asks to improve, validate, or iterate on an existing workspace
1657
- - User asks for workspace architecture or structure design
1658
- - User asks to assess or install tools for a workspace
1659
- - User asks to run test cases against a workspace
1660
-
1661
- ## When Not to Use
1662
-
1663
- - Simple file creation or editing (use direct file operations)
1664
- - Questions about ICM methodology (answer directly)
1665
- - Non-workspace tasks (check for other applicable skills first)
1666
-
1667
- ## The Iron Law
1668
-
1669
- NO BUILD WITHOUT PLAN
1670
- NO PLAN WITHOUT RESEARCH
1671
- NO IMPROVEMENT WITHOUT VALIDATION
1672
- NO COMPLETION CLAIM WITHOUT VERIFICATION
1673
-
1674
- ## Hybrid Flow
1675
-
1676
- ```
1677
- Phase 1: RESEARCH (dispatch research sub-skill)
1678
-
1679
- Phase 2: ARCHITECTURE (dispatch architecture sub-skill)
1680
-
1681
- Phase 3: BUILD (use scaffold.ts script)
1682
-
1683
- Phase 4: VALIDATE (dispatch validation sub-skill)
1684
-
1685
- Phase 5: AUTONOMOUS ITERATION (use orchestrator.ts)
1686
- ├─ Generate test cases
1687
- ├─ Split into batches
1688
- ├─ Dispatch workers in parallel per batch
1689
- ├─ Validate batch results
1690
- ├─ If score < threshold → fix loop → re-validate
1691
- └─ Next batch or complete
1692
-
1693
- Phase 6: DELIVER
1694
- ```
1695
-
1696
- ## Autonomous Iteration Workflow
1697
-
1698
- The orchestrator manages batched parallel sub-agent execution:
1699
-
1700
- ```bash
1701
- node scripts/orchestrator.ts --workspace ./workspace --batch-size 3 --score-threshold 85
1702
- ```
1703
-
1704
- **Flow:**
1705
- 1. Generate test cases from workspace stages
1706
- 2. Split into batches (default: 3 test cases per batch)
1707
- 3. Dispatch worker sub-agents in parallel for each batch
1708
- 4. Validate batch outputs with benchmark scoring
1709
- 5. If batch score < threshold → dispatch fixer sub-agents → re-validate (max 3 retries)
1710
- 6. Move to next batch or write summary
1711
-
1712
- **Options:**
1713
- - `--batch-size <n>` — Test cases per batch (default: 3)
1714
- - `--score-threshold <n>` — Minimum batch score to pass (default: 85)
1715
- - `--max-fix-retries <n>` — Max fix attempts per batch (default: 3)
1716
- - `--worker-timeout <s>` — Worker timeout in seconds (default: 300)
1717
-
1718
- ## Sub-Skill Dispatch
1719
-
1720
- | Condition | Sub-Skill | Command |
1721
- |-----------|-----------|---------|
1722
- | Starting new workflow | `research` | `node scripts/dispatch.ts --skill research --workspace ./workspace` |
1723
- | After research complete | `architecture` | `node scripts/dispatch.ts --skill architecture --workspace ./workspace` |
1724
- | After architecture approved | (use scaffold.ts) | `node scripts/scaffold.ts --name "<name>" --stages "<stages>" --output ./workspace` |
1725
- | After building | `validation` | `node scripts/dispatch.ts --skill validation --workspace ./workspace` |
1726
- | Running autonomous iteration | (use orchestrator.ts) | `node scripts/orchestrator.ts --workspace ./workspace` |
1727
- | Worker execution | `worker` | `node scripts/dispatch.ts --skill worker --workspace ./workspace --batch-id <N>` |
1728
- | Fix loop | `fixer` | `node scripts/dispatch.ts --skill fixer --workspace ./workspace --batch-id <N>` |
1729
- | Score < 80 | `prompt-engineering` | `node scripts/dispatch.ts --skill prompt-engineering --workspace ./workspace` |
1730
- | No tests exist | `testing` | `node scripts/dispatch.ts --skill testing --workspace ./workspace` |
1731
- | Score plateaued | `iteration` | `node scripts/dispatch.ts --skill iteration --workspace ./workspace` |
1732
- | Tools missing | `tooling` | `node scripts/dispatch.ts --skill tooling --workspace ./workspace` |
1733
-
1734
- ## Available Scripts
1735
-
1736
- ### orchestrator.ts — Autonomous Batch Iteration
1737
-
1738
- Runs the full batched parallel sub-agent workflow.
1739
-
1740
- ```bash
1741
- node scripts/orchestrator.ts --workspace ./workspace --batch-size 3 --score-threshold 85
1742
- ```
1743
-
1744
- ### scaffold.ts — Generate ICM Workspace
1745
-
1746
- Creates a complete ICM workspace structure from a plan.
1747
-
1748
- ```bash
1749
- node scripts/scaffold.ts --name "research" --stages "01-research,02-analysis,03-report" --output ./workspace
1750
- ```
1751
-
1752
- ### validate.ts — Check ICM Compliance
1753
-
1754
- Validates a workspace against ICM rules.
1755
-
1756
- ```bash
1757
- node scripts/validate.ts --workspace ./workspace
1758
- ```
1759
-
1760
- ### install-tool.ts — Install Packages
1761
-
1762
- Installs a tool and updates the workspace inventory.
1763
-
1764
- ```bash
1765
- node scripts/install-tool.ts --tool "pdf-lib" --manager npm --workspace ./workspace
1766
- ```
1767
-
1768
- ### iterate.ts — Single-Workspace Iteration (legacy)
1769
-
1770
- Runs a 3-pass improvement loop. Use orchestrator.ts for batched parallel iteration.
1771
-
1772
- ```bash
1773
- node scripts/iterate.ts --workspace ./workspace --max-retries 3
1774
- ```
1775
-
1776
- ### generate-tests.ts — Generate Test Cases
1777
-
1778
- Creates test cases for each stage (sample, edge-case, empty).
1779
-
1780
- ```bash
1781
- node scripts/generate-tests.ts --workspace ./workspace --output ./tests.json
1782
- ```
1783
-
1784
- ### benchmark.ts — Weighted Benchmark Scoring
1785
-
1786
- Runs weighted benchmark scoring on a workspace.
1787
-
1788
- ```bash
1789
- node scripts/benchmark.ts --workspace ./workspace
1790
- ```
1791
-
1792
- ### dispatch.ts — Sub-Skill Dispatcher
1793
-
1794
- Loads and executes sub-skill workflows. Supports parallel dispatch.
1795
-
1796
- ```bash
1797
- node scripts/dispatch.ts --skill <name> --workspace ./workspace [--parallel --invocations <path>]
1798
- ```
1799
-
1800
- ## Anti-Rationalization Table
1801
-
1802
- | Thought | Reality |
1803
- |---------|---------|
1804
- | "This workspace looks good enough" | Good enough is the enemy of excellent. Run validation. |
1805
- | "I'll skip research and go straight to building" | Building without research produces generic, non-optimal workspaces. |
1806
- | "The user didn't ask for tests" | Autonomous workflows require self-verification. Tests are mandatory. |
1807
- | "I'll fix this later" | Later never comes. Fix it now or escalate. |
1808
- | "This sub-skill doesn't apply here" | If there's a 1% chance it applies, dispatch it. |
1809
- | "The score is fine" | Fine is not good. Target > 85. |
1810
- | "I already validated this" | Validation is a snapshot. Re-validate after every change. |
1811
- | "I'll do all phases at once" | Phases exist for a reason. Complete each before moving to the next. |
1812
-
1813
- ## Integration
1814
-
1815
- - Sub-skills live in `skills/` directory, loaded via dispatch.ts
1816
- - Shared references in `references/` directory (anti-patterns, reporting-format, iron-laws)
1817
- - All sub-skills return structured JSON reports
1818
- - Orchestrator manages batch lifecycle with fix loops
1819
- - Condition loop continues until score > 85 AND all validations pass
1820
- - Escalate to human if stuck after 3 iteration attempts
1821
-
1822
- ## ICM Rules
1823
- - Canonical sources: each fact lives in exactly one file
1824
- - One-way dependencies only: A → B, never B → A
1825
- - Selective loading: route to sections, not whole files
1826
- - Numbered folders for workflow stages
1827
-
1828
- ## Output Format
1829
- - workspace/ — the built workspace
1830
- - .agents/skills/<workspace-name>/ — installable skill
1831
- - USAGE.md — how to use this workspace in future sessions
1832
- - .agents/iteration/summary.json — autonomous iteration results
1833
- ```
1834
-
1835
- - [ ] **Step 2: Commit**
1836
-
1837
- ```bash
1838
- git add templates/SKILL.md
1839
- git commit -m "feat(SKILL.md): add autonomous iteration workflow section"
1840
- ```
1841
-
1842
- ---
1843
-
1844
- ### Task 10: Full Integration Test
1845
-
1846
- **Files:**
1847
- - Modify: `tests/integration.test.ts`
1848
-
1849
- - [ ] **Step 1: Add integration test for orchestrator batch lifecycle**
1850
-
1851
- Add to `tests/integration.test.ts`:
1852
-
1853
- ```typescript
1854
- describe('orchestrator integration', () => {
1855
- it('runs full batch lifecycle on a valid workspace', () => {
1856
- const ws = createBasicWorkspace();
1857
- const orchestratorPath = path.join(__dirname, '..', 'dist', 'scripts', 'orchestrator.js');
1858
-
1859
- const stdout = execSync(`node "${orchestratorPath}" --workspace "${ws}" --batch-size 2`, {
1860
- encoding: 'utf-8',
1861
- });
1862
-
1863
- const result = JSON.parse(stdout);
1864
- expect(result.totalBatches).toBeGreaterThan(0);
1865
- expect(result.batchReports).toBeDefined();
1866
- });
1867
-
1868
- it('writes summary.json to iteration directory', () => {
1869
- const ws = createBasicWorkspace();
1870
- const orchestratorPath = path.join(__dirname, '..', 'dist', 'scripts', 'orchestrator.js');
1871
-
1872
- execSync(`node "${orchestratorPath}" --workspace "${ws}" --batch-size 2`, {
1873
- encoding: 'utf-8',
1874
- });
1875
-
1876
- const summaryPath = path.join(ws, '.agents', 'iteration', 'summary.json');
1877
- expect(fs.existsSync(summaryPath)).toBe(true);
1878
-
1879
- const summary = JSON.parse(fs.readFileSync(summaryPath, 'utf-8'));
1880
- expect(summary.totalBatches).toBeDefined();
1881
- expect(summary.timestamp).toBeDefined();
1882
- });
1883
- });
1884
- ```
1885
-
1886
- - [ ] **Step 2: Build and run all tests**
1887
-
1888
- Run: `npm run build && npm test`
1889
- Expected: All tests pass
1890
-
1891
- - [ ] **Step 3: Commit**
1892
-
1893
- ```bash
1894
- git add tests/integration.test.ts
1895
- git commit -m "test(integration): add orchestrator batch lifecycle tests"
1896
- ```
1897
-
1898
- ---
1899
-
1900
- ### Task 11: Run Full Test Suite & Verify
1901
-
1902
- - [ ] **Step 1: Run full test suite**
1903
-
1904
- Run: `npm test`
1905
- Expected: All tests pass (the 114+ existing tests plus the newly added ones)
1906
-
1907
- - [ ] **Step 2: Build**
1908
-
1909
- Run: `npm run build`
1910
- Expected: Clean build, no errors
1911
-
1912
- - [ ] **Step 3: Verify all sub-skill files exist**
1913
-
1914
- Run: `ls templates/.workspace-templates/skills/*/SKILL.md`
1915
- Expected: All 9 sub-skills listed (research, architecture, validation, prompt-engineering, testing, iteration, tooling, worker, fixer)
1916
-
1917
- - [ ] **Step 4: Final commit if needed**
1918
-
1919
- ```bash
1920
- git status
1921
- git add -A
1922
- git commit -m "chore: final verification and cleanup"
1923
- ```