workspace-maxxing 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +73 -10
- package/dist/install.d.ts +1 -1
- package/dist/install.d.ts.map +1 -1
- package/dist/install.js +7 -1
- package/dist/install.js.map +1 -1
- package/package.json +1 -1
- package/src/install.ts +8 -1
- package/templates/SKILL.md +88 -0
- package/docs/superpowers/plans/2026-04-07-autonomous-iteration-plan.md +0 -1123
- package/docs/superpowers/plans/2026-04-07-autonomous-iteration-sub-agent-batches.md +0 -1923
- package/docs/superpowers/plans/2026-04-07-autonomous-workflow-sub-skill-plan.md +0 -1505
- package/docs/superpowers/plans/2026-04-07-benchmarking-multi-agent-plan.md +0 -854
- package/docs/superpowers/plans/2026-04-07-workspace-builder-logic-plan.md +0 -1426
- package/docs/superpowers/plans/2026-04-07-workspace-maxxing-plan.md +0 -1299
- package/docs/superpowers/plans/2026-04-08-session-294c-subagent-invocation-plan.md +0 -320
- package/docs/superpowers/plans/2026-04-08-workflow-prompt-hardening-plan.md +0 -1025
- package/docs/superpowers/plans/2026-04-12-workspace-agent-creation-plan.md +0 -992
- package/docs/superpowers/specs/2026-04-07-autonomous-iteration-design.md +0 -214
- package/docs/superpowers/specs/2026-04-07-autonomous-iteration-sub-agent-batches-design.md +0 -188
- package/docs/superpowers/specs/2026-04-07-autonomous-workflow-sub-skill-design.md +0 -137
- package/docs/superpowers/specs/2026-04-07-benchmarking-multi-agent-design.md +0 -105
- package/docs/superpowers/specs/2026-04-07-workspace-builder-logic-design.md +0 -179
- package/docs/superpowers/specs/2026-04-07-workspace-maxxing-design.md +0 -227
- package/docs/superpowers/specs/2026-04-08-session-294c-subagent-invocation-design.md +0 -265
- package/docs/superpowers/specs/2026-04-08-workflow-prompt-hardening-design.md +0 -146
- package/docs/superpowers/specs/2026-04-12-workspace-agent-creation-design.md +0 -239
|
@@ -1,1923 +0,0 @@
|
|
|
1
|
-
# Autonomous Iteration with Sub-Agent Batches Implementation Plan
|
|
2
|
-
|
|
3
|
-
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
|
4
|
-
|
|
5
|
-
**Goal:** Implement batched parallel sub-agent iteration with validator checkpoints and fix loops, plus rewrite all sub-skills with obra/superpowers patterns.
|
|
6
|
-
|
|
7
|
-
**Architecture:** A new `orchestrator.ts` coordinates the batch lifecycle. `dispatch.ts` is extended for parallel invocation with batch IDs. Three sub-skills are added or upgraded (`worker` and `fixer` are new; `validation` is enhanced). All existing sub-skills are rewritten with YAML frontmatter, trigger phrases, anti-rationalization tables, and iron laws.
|
|
8
|
-
|
|
9
|
-
**Tech Stack:** TypeScript, Node.js builtins (fs, path, child_process, os), Jest for testing.
|
|
10
|
-
|
|
11
|
-
---
|
|
12
|
-
|
|
13
|
-
### Task 1: orchestrator.ts — Core Types & Batch Splitting
|
|
14
|
-
|
|
15
|
-
**Files:**
|
|
16
|
-
- Create: `src/scripts/orchestrator.ts`
|
|
17
|
-
- Test: `tests/orchestrator.test.ts`
|
|
18
|
-
|
|
19
|
-
- [ ] **Step 1: Write failing test for batch splitting**
|
|
20
|
-
|
|
21
|
-
```typescript
|
|
22
|
-
// tests/orchestrator.test.ts
|
|
23
|
-
import * as fs from 'fs';
|
|
24
|
-
import * as path from 'path';
|
|
25
|
-
import * as os from 'os';
|
|
26
|
-
import { splitIntoBatches, OrchestratorConfig } from '../src/scripts/orchestrator';
|
|
27
|
-
|
|
28
|
-
describe('orchestrator', () => {
|
|
29
|
-
let tempDir: string;
|
|
30
|
-
|
|
31
|
-
beforeEach(() => {
|
|
32
|
-
tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'orchestrator-test-'));
|
|
33
|
-
});
|
|
34
|
-
|
|
35
|
-
afterEach(() => {
|
|
36
|
-
fs.rmSync(tempDir, { recursive: true, force: true });
|
|
37
|
-
});
|
|
38
|
-
|
|
39
|
-
describe('splitIntoBatches', () => {
|
|
40
|
-
it('splits items into batches of specified size', () => {
|
|
41
|
-
const items = ['tc-001', 'tc-002', 'tc-003', 'tc-004', 'tc-005'];
|
|
42
|
-
const result = splitIntoBatches(items, 3);
|
|
43
|
-
|
|
44
|
-
expect(result).toHaveLength(2);
|
|
45
|
-
expect(result[0]).toEqual(['tc-001', 'tc-002', 'tc-003']);
|
|
46
|
-
expect(result[1]).toEqual(['tc-004', 'tc-005']);
|
|
47
|
-
});
|
|
48
|
-
|
|
49
|
-
it('returns single batch when items fit', () => {
|
|
50
|
-
const items = ['tc-001', 'tc-002'];
|
|
51
|
-
const result = splitIntoBatches(items, 3);
|
|
52
|
-
|
|
53
|
-
expect(result).toHaveLength(1);
|
|
54
|
-
expect(result[0]).toEqual(['tc-001', 'tc-002']);
|
|
55
|
-
});
|
|
56
|
-
|
|
57
|
-
it('returns empty array for empty input', () => {
|
|
58
|
-
const result = splitIntoBatches([], 3);
|
|
59
|
-
expect(result).toEqual([]);
|
|
60
|
-
});
|
|
61
|
-
|
|
62
|
-
it('uses default batch size of 3 when not specified', () => {
|
|
63
|
-
const items = ['a', 'b', 'c', 'd', 'e', 'f', 'g'];
|
|
64
|
-
const result = splitIntoBatches(items);
|
|
65
|
-
|
|
66
|
-
expect(result).toHaveLength(3);
|
|
67
|
-
expect(result[0]).toHaveLength(3);
|
|
68
|
-
expect(result[1]).toHaveLength(3);
|
|
69
|
-
expect(result[2]).toHaveLength(1);
|
|
70
|
-
});
|
|
71
|
-
});
|
|
72
|
-
});
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
- [ ] **Step 2: Run test to verify it fails**
|
|
76
|
-
|
|
77
|
-
Run: `npx jest tests/orchestrator.test.ts -t "splitIntoBatches" --verbose`
|
|
78
|
-
Expected: FAIL with "Cannot find module"
|
|
79
|
-
|
|
80
|
-
- [ ] **Step 3: Write orchestrator.ts with types and splitIntoBatches**
|
|
81
|
-
|
|
82
|
-
```typescript
|
|
83
|
-
// src/scripts/orchestrator.ts
|
|
84
|
-
import * as fs from 'fs';
|
|
85
|
-
import * as path from 'path';
|
|
86
|
-
|
|
87
|
-
export interface OrchestratorConfig {
|
|
88
|
-
batchSize?: number;
|
|
89
|
-
maxFixRetries?: number;
|
|
90
|
-
scoreThreshold?: number;
|
|
91
|
-
workerTimeout?: number;
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
export interface BatchReport {
|
|
95
|
-
batchId: number;
|
|
96
|
-
testCases: string[];
|
|
97
|
-
score: number;
|
|
98
|
-
status: 'passed' | 'failed' | 'partial' | 'escalated';
|
|
99
|
-
findings: string[];
|
|
100
|
-
timestamp: string;
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
export interface OrchestratorSummary {
|
|
104
|
-
totalBatches: number;
|
|
105
|
-
passedBatches: number;
|
|
106
|
-
failedBatches: number;
|
|
107
|
-
escalatedBatches: number;
|
|
108
|
-
overallScore: number;
|
|
109
|
-
batchReports: BatchReport[];
|
|
110
|
-
timestamp: string;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
export const DEFAULT_CONFIG: Required<OrchestratorConfig> = {
|
|
114
|
-
batchSize: 3,
|
|
115
|
-
maxFixRetries: 3,
|
|
116
|
-
scoreThreshold: 85,
|
|
117
|
-
workerTimeout: 300,
|
|
118
|
-
};
|
|
119
|
-
|
|
120
|
-
export function splitIntoBatches(items: string[], batchSize: number = DEFAULT_CONFIG.batchSize): string[][] {
|
|
121
|
-
if (items.length === 0) return [];
|
|
122
|
-
|
|
123
|
-
const batches: string[][] = [];
|
|
124
|
-
for (let i = 0; i < items.length; i += batchSize) {
|
|
125
|
-
batches.push(items.slice(i, i + batchSize));
|
|
126
|
-
}
|
|
127
|
-
return batches;
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
if (require.main === module) {
|
|
131
|
-
const args = process.argv.slice(2);
|
|
132
|
-
const parseArg = (flag: string): string | undefined => {
|
|
133
|
-
const idx = args.indexOf(flag);
|
|
134
|
-
return idx !== -1 ? args[idx + 1] : undefined;
|
|
135
|
-
};
|
|
136
|
-
|
|
137
|
-
const testCasesPath = parseArg('--test-cases');
|
|
138
|
-
const batchSizeStr = parseArg('--batch-size');
|
|
139
|
-
const batchSize = batchSizeStr ? parseInt(batchSizeStr, 10) : DEFAULT_CONFIG.batchSize;
|
|
140
|
-
|
|
141
|
-
if (!testCasesPath) {
|
|
142
|
-
console.error('Usage: node orchestrator.ts --test-cases <path> [--batch-size <n>]');
|
|
143
|
-
process.exit(1);
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
const testCases = JSON.parse(fs.readFileSync(testCasesPath, 'utf-8'));
|
|
147
|
-
const testCaseIds = testCases.testCases.map((tc: any, i: number) => `tc-${String(i + 1).padStart(3, '0')}`);
|
|
148
|
-
const batches = splitIntoBatches(testCaseIds, batchSize);
|
|
149
|
-
|
|
150
|
-
console.log(JSON.stringify({ batches, totalTestCases: testCaseIds.length, totalBatches: batches.length }, null, 2));
|
|
151
|
-
}
|
|
152
|
-
```
|
|
153
|
-
|
|
154
|
-
- [ ] **Step 4: Run test to verify it passes**
|
|
155
|
-
|
|
156
|
-
Run: `npx jest tests/orchestrator.test.ts -t "splitIntoBatches" --verbose`
|
|
157
|
-
Expected: PASS
|
|
158
|
-
|
|
159
|
-
- [ ] **Step 5: Commit**
|
|
160
|
-
|
|
161
|
-
```bash
|
|
162
|
-
git add src/scripts/orchestrator.ts tests/orchestrator.test.ts
|
|
163
|
-
git commit -m "feat(orchestrator): add types and batch splitting"
|
|
164
|
-
```
|
|
165
|
-
|
|
166
|
-
---
|
|
167
|
-
|
|
168
|
-
### Task 2: orchestrator.ts — Batch Output Directory Management
|
|
169
|
-
|
|
170
|
-
**Files:**
|
|
171
|
-
- Modify: `src/scripts/orchestrator.ts`
|
|
172
|
-
- Modify: `tests/orchestrator.test.ts`
|
|
173
|
-
|
|
174
|
-
- [ ] **Step 1: Write failing test for batch directory creation**
|
|
175
|
-
|
|
176
|
-
Add to `tests/orchestrator.test.ts`:
|
|
177
|
-
|
|
178
|
-
```typescript
|
|
179
|
-
import { createBatchDirectory, getBatchDirectory } from '../src/scripts/orchestrator';
|
|
180
|
-
|
|
181
|
-
describe('batch directory management', () => {
|
|
182
|
-
it('creates batch directory structure', () => {
|
|
183
|
-
const baseDir = path.join(tempDir, '.agents', 'iteration');
|
|
184
|
-
const result = createBatchDirectory(baseDir, 1);
|
|
185
|
-
|
|
186
|
-
expect(fs.existsSync(result)).toBe(true);
|
|
187
|
-
expect(result).toContain('batch-01');
|
|
188
|
-
});
|
|
189
|
-
|
|
190
|
-
it('returns existing batch directory path', () => {
|
|
191
|
-
const baseDir = path.join(tempDir, '.agents', 'iteration');
|
|
192
|
-
fs.mkdirSync(path.join(baseDir, 'batch-02'), { recursive: true });
|
|
193
|
-
|
|
194
|
-
const result = getBatchDirectory(baseDir, 2);
|
|
195
|
-
expect(result).toContain('batch-02');
|
|
196
|
-
});
|
|
197
|
-
});
|
|
198
|
-
```
|
|
199
|
-
|
|
200
|
-
- [ ] **Step 2: Run test to verify it fails**
|
|
201
|
-
|
|
202
|
-
Run: `npx jest tests/orchestrator.test.ts -t "batch directory" --verbose`
|
|
203
|
-
Expected: FAIL
|
|
204
|
-
|
|
205
|
-
- [ ] **Step 3: Add batch directory functions to orchestrator.ts**
|
|
206
|
-
|
|
207
|
-
Add to `src/scripts/orchestrator.ts`:
|
|
208
|
-
|
|
209
|
-
```typescript
|
|
210
|
-
export function createBatchDirectory(baseDir: string, batchId: number): string {
|
|
211
|
-
const batchDir = path.join(baseDir, `batch-${String(batchId).padStart(2, '0')}`);
|
|
212
|
-
fs.mkdirSync(batchDir, { recursive: true });
|
|
213
|
-
return batchDir;
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
export function getBatchDirectory(baseDir: string, batchId: number): string {
|
|
217
|
-
return path.join(baseDir, `batch-${String(batchId).padStart(2, '0')}`);
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
export function createTestCaseDirectory(batchDir: string, testCaseId: string): string {
|
|
221
|
-
const tcDir = path.join(batchDir, testCaseId);
|
|
222
|
-
fs.mkdirSync(tcDir, { recursive: true });
|
|
223
|
-
return tcDir;
|
|
224
|
-
}
|
|
225
|
-
```
|
|
226
|
-
|
|
227
|
-
- [ ] **Step 4: Run test to verify it passes**
|
|
228
|
-
|
|
229
|
-
Run: `npx jest tests/orchestrator.test.ts -t "batch directory" --verbose`
|
|
230
|
-
Expected: PASS
|
|
231
|
-
|
|
232
|
-
- [ ] **Step 5: Commit**
|
|
233
|
-
|
|
234
|
-
```bash
|
|
235
|
-
git add src/scripts/orchestrator.ts tests/orchestrator.test.ts
|
|
236
|
-
git commit -m "feat(orchestrator): add batch directory management"
|
|
237
|
-
```
|
|
238
|
-
|
|
239
|
-
---
|
|
240
|
-
|
|
241
|
-
### Task 3: dispatch.ts — Parallel Dispatch & Batch ID Support
|
|
242
|
-
|
|
243
|
-
**Files:**
|
|
244
|
-
- Modify: `src/scripts/dispatch.ts`
|
|
245
|
-
- Test: `tests/dispatch-parallel.test.ts`
|
|
246
|
-
|
|
247
|
-
- [ ] **Step 1: Write failing test for parallel dispatch**
|
|
248
|
-
|
|
249
|
-
```typescript
|
|
250
|
-
// tests/dispatch-parallel.test.ts
|
|
251
|
-
import * as fs from 'fs';
|
|
252
|
-
import * as path from 'path';
|
|
253
|
-
import * as os from 'os';
|
|
254
|
-
import { dispatchSkill, dispatchParallel, ParallelDispatchResult } from '../src/scripts/dispatch';
|
|
255
|
-
|
|
256
|
-
jest.mock('child_process');
|
|
257
|
-
|
|
258
|
-
describe('parallel dispatch', () => {
|
|
259
|
-
let tempDir: string;
|
|
260
|
-
|
|
261
|
-
beforeEach(() => {
|
|
262
|
-
tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'dispatch-parallel-'));
|
|
263
|
-
});
|
|
264
|
-
|
|
265
|
-
afterEach(() => {
|
|
266
|
-
fs.rmSync(tempDir, { recursive: true, force: true });
|
|
267
|
-
jest.clearAllMocks();
|
|
268
|
-
});
|
|
269
|
-
|
|
270
|
-
it('dispatches multiple skills in parallel and aggregates results', () => {
|
|
271
|
-
const skillsDir = path.join(tempDir, 'skills');
|
|
272
|
-
fs.mkdirSync(path.join(skillsDir, 'worker'), { recursive: true });
|
|
273
|
-
fs.writeFileSync(path.join(skillsDir, 'worker', 'SKILL.md'), '---\nname: worker\n---\n\nTest');
|
|
274
|
-
|
|
275
|
-
const invocations = [
|
|
276
|
-
{ skill: 'worker', batchId: 1, testCaseId: 'tc-001' },
|
|
277
|
-
{ skill: 'worker', batchId: 1, testCaseId: 'tc-002' },
|
|
278
|
-
];
|
|
279
|
-
|
|
280
|
-
const results = dispatchParallel(invocations, skillsDir);
|
|
281
|
-
|
|
282
|
-
expect(results).toHaveLength(2);
|
|
283
|
-
expect(results[0].status).toBe('passed');
|
|
284
|
-
expect(results[1].status).toBe('passed');
|
|
285
|
-
});
|
|
286
|
-
|
|
287
|
-
it('includes batchId and testCaseId in results', () => {
|
|
288
|
-
const skillsDir = path.join(tempDir, 'skills');
|
|
289
|
-
fs.mkdirSync(path.join(skillsDir, 'validation'), { recursive: true });
|
|
290
|
-
fs.writeFileSync(path.join(skillsDir, 'validation', 'SKILL.md'), '---\nname: validation\n---\n\nTest');
|
|
291
|
-
|
|
292
|
-
const invocations = [
|
|
293
|
-
{ skill: 'validation', batchId: 2, testCaseId: 'tc-003' },
|
|
294
|
-
];
|
|
295
|
-
|
|
296
|
-
const results = dispatchParallel(invocations, skillsDir);
|
|
297
|
-
|
|
298
|
-
expect(results[0].batchId).toBe(2);
|
|
299
|
-
expect(results[0].testCaseId).toBe('tc-003');
|
|
300
|
-
});
|
|
301
|
-
|
|
302
|
-
it('handles missing skill gracefully in parallel mode', () => {
|
|
303
|
-
const skillsDir = path.join(tempDir, 'skills');
|
|
304
|
-
fs.mkdirSync(skillsDir, { recursive: true });
|
|
305
|
-
|
|
306
|
-
const invocations = [
|
|
307
|
-
{ skill: 'nonexistent', batchId: 1, testCaseId: 'tc-001' },
|
|
308
|
-
];
|
|
309
|
-
|
|
310
|
-
const results = dispatchParallel(invocations, skillsDir);
|
|
311
|
-
|
|
312
|
-
expect(results[0].status).toBe('failed');
|
|
313
|
-
});
|
|
314
|
-
});
|
|
315
|
-
```
|
|
316
|
-
|
|
317
|
-
- [ ] **Step 2: Run test to verify it fails**
|
|
318
|
-
|
|
319
|
-
Run: `npx jest tests/dispatch-parallel.test.ts --verbose`
|
|
320
|
-
Expected: FAIL
|
|
321
|
-
|
|
322
|
-
- [ ] **Step 3: Add parallel dispatch to dispatch.ts**
|
|
323
|
-
|
|
324
|
-
Add to `src/scripts/dispatch.ts`:
|
|
325
|
-
|
|
326
|
-
```typescript
|
|
327
|
-
import * as child_process from 'child_process';
|
|
328
|
-
|
|
329
|
-
export interface ParallelInvocation {
|
|
330
|
-
skill: string;
|
|
331
|
-
batchId: number;
|
|
332
|
-
testCaseId: string;
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
export interface ParallelDispatchResult extends DispatchReport {
|
|
336
|
-
batchId: number;
|
|
337
|
-
testCaseId: string;
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
export function dispatchParallel(
|
|
341
|
-
invocations: ParallelInvocation[],
|
|
342
|
-
skillsDir: string,
|
|
343
|
-
): ParallelDispatchResult[] {
|
|
344
|
-
return invocations.map((inv) => {
|
|
345
|
-
const report = dispatchSkill(inv.skill, skillsDir);
|
|
346
|
-
return {
|
|
347
|
-
...report,
|
|
348
|
-
batchId: inv.batchId,
|
|
349
|
-
testCaseId: inv.testCaseId,
|
|
350
|
-
};
|
|
351
|
-
});
|
|
352
|
-
}
|
|
353
|
-
```
|
|
354
|
-
|
|
355
|
-
- [ ] **Step 4: Update dispatch.ts CLI to support --parallel and --batch-id**
|
|
356
|
-
|
|
357
|
-
Modify the CLI section at the bottom of `src/scripts/dispatch.ts`:
|
|
358
|
-
|
|
359
|
-
```typescript
|
|
360
|
-
if (require.main === module) {
|
|
361
|
-
const args = process.argv.slice(2);
|
|
362
|
-
const parseArg = (flag: string): string | undefined => {
|
|
363
|
-
const idx = args.indexOf(flag);
|
|
364
|
-
return idx !== -1 ? args[idx + 1] : undefined;
|
|
365
|
-
};
|
|
366
|
-
|
|
367
|
-
const skill = parseArg('--skill');
|
|
368
|
-
const workspace = parseArg('--workspace');
|
|
369
|
-
const batchId = parseArg('--batch-id');
|
|
370
|
-
const parallel = args.includes('--parallel');
|
|
371
|
-
|
|
372
|
-
if (!skill) {
|
|
373
|
-
console.error('Usage: node dispatch.ts --skill <name> --workspace <path> [--batch-id <n>] [--parallel]');
|
|
374
|
-
process.exit(1);
|
|
375
|
-
}
|
|
376
|
-
|
|
377
|
-
const skillsDir = workspace
|
|
378
|
-
? path.join(workspace, '.agents', 'skills', 'workspace-maxxing', 'skills')
|
|
379
|
-
: path.join(process.cwd(), 'skills');
|
|
380
|
-
|
|
381
|
-
if (parallel) {
|
|
382
|
-
// Read the invocation list from the JSON file given by --invocations
|
|
383
|
-
const invocationsPath = parseArg('--invocations');
|
|
384
|
-
if (!invocationsPath) {
|
|
385
|
-
console.error('--parallel requires --invocations <path>');
|
|
386
|
-
process.exit(1);
|
|
387
|
-
}
|
|
388
|
-
const invocations = JSON.parse(fs.readFileSync(invocationsPath, 'utf-8'));
|
|
389
|
-
const results = dispatchParallel(invocations, skillsDir);
|
|
390
|
-
console.log(JSON.stringify(results, null, 2));
|
|
391
|
-
} else {
|
|
392
|
-
const result = dispatchSkill(skill, skillsDir);
|
|
393
|
-
const output = batchId
|
|
394
|
-
? { ...result, batchId: parseInt(batchId, 10) }
|
|
395
|
-
: result;
|
|
396
|
-
console.log(JSON.stringify(output, null, 2));
|
|
397
|
-
}
|
|
398
|
-
}
|
|
399
|
-
```
|
|
400
|
-
|
|
401
|
-
- [ ] **Step 5: Run test to verify it passes**
|
|
402
|
-
|
|
403
|
-
Run: `npx jest tests/dispatch-parallel.test.ts --verbose`
|
|
404
|
-
Expected: PASS
|
|
405
|
-
|
|
406
|
-
- [ ] **Step 6: Commit**
|
|
407
|
-
|
|
408
|
-
```bash
|
|
409
|
-
git add src/scripts/dispatch.ts tests/dispatch-parallel.test.ts
|
|
410
|
-
git commit -m "feat(dispatch): add parallel dispatch and batch-id support"
|
|
411
|
-
```
|
|
412
|
-
|
|
413
|
-
---
|
|
414
|
-
|
|
415
|
-
### Task 4: orchestrator.ts — Full Batch Lifecycle (generate → dispatch → validate → fix)
|
|
416
|
-
|
|
417
|
-
**Files:**
|
|
418
|
-
- Modify: `src/scripts/orchestrator.ts`
|
|
419
|
-
- Modify: `tests/orchestrator.test.ts`
|
|
420
|
-
|
|
421
|
-
- [ ] **Step 1: Write failing test for batch lifecycle**
|
|
422
|
-
|
|
423
|
-
Add to `tests/orchestrator.test.ts`:
|
|
424
|
-
|
|
425
|
-
```typescript
|
|
426
|
-
import { runBatchLifecycle, BatchLifecycleResult } from '../src/scripts/orchestrator';
|
|
427
|
-
|
|
428
|
-
jest.mock('../src/scripts/dispatch');
|
|
429
|
-
jest.mock('../src/scripts/generate-tests');
|
|
430
|
-
jest.mock('../src/scripts/validate');
|
|
431
|
-
jest.mock('../src/scripts/benchmark');
|
|
432
|
-
|
|
433
|
-
import * as dispatch from '../src/scripts/dispatch';
|
|
434
|
-
import * as generateTests from '../src/scripts/generate-tests';
|
|
435
|
-
import * as validate from '../src/scripts/validate';
|
|
436
|
-
import * as benchmark from '../src/scripts/benchmark';
|
|
437
|
-
|
|
438
|
-
describe('batch lifecycle', () => {
|
|
439
|
-
it('runs full lifecycle: generate → dispatch → validate → complete', () => {
|
|
440
|
-
const ws = createBasicWorkspace();
|
|
441
|
-
const config = { batchSize: 2, maxFixRetries: 3, scoreThreshold: 85, workerTimeout: 300 };
|
|
442
|
-
|
|
443
|
-
(generateTests.generateTestCases as jest.Mock).mockReturnValue({
|
|
444
|
-
testCases: [
|
|
445
|
-
{ stage: '01-input', type: 'sample', input: 'test', expected: 'test' },
|
|
446
|
-
{ stage: '02-output', type: 'sample', input: 'test', expected: 'test' },
|
|
447
|
-
],
|
|
448
|
-
});
|
|
449
|
-
|
|
450
|
-
(dispatch.dispatchParallel as jest.Mock).mockReturnValue([
|
|
451
|
-
{ skill: 'worker', status: 'passed', batchId: 1, testCaseId: 'tc-001', timestamp: new Date().toISOString(), findings: [], recommendations: [], metrics: {}, nextSkill: 'validation' },
|
|
452
|
-
{ skill: 'worker', status: 'passed', batchId: 1, testCaseId: 'tc-002', timestamp: new Date().toISOString(), findings: [], recommendations: [], metrics: {}, nextSkill: 'validation' },
|
|
453
|
-
]);
|
|
454
|
-
|
|
455
|
-
(benchmark.calculateBenchmark as jest.Mock).mockReturnValue({
|
|
456
|
-
workspace: 'test',
|
|
457
|
-
agent: 'test',
|
|
458
|
-
timestamp: new Date().toISOString(),
|
|
459
|
-
rawScore: 80,
|
|
460
|
-
weightedScore: 90,
|
|
461
|
-
stages: [],
|
|
462
|
-
fixSuggestions: [],
|
|
463
|
-
improvementPotential: false,
|
|
464
|
-
});
|
|
465
|
-
|
|
466
|
-
const result = runBatchLifecycle(ws, config);
|
|
467
|
-
|
|
468
|
-
expect(result.totalBatches).toBe(1);
|
|
469
|
-
expect(result.passedBatches).toBe(1);
|
|
470
|
-
expect(result.overallScore).toBe(90);
|
|
471
|
-
});
|
|
472
|
-
|
|
473
|
-
it('triggers fix loop when batch score below threshold', () => {
|
|
474
|
-
const ws = createBasicWorkspace();
|
|
475
|
-
const config = { batchSize: 2, maxFixRetries: 3, scoreThreshold: 85, workerTimeout: 300 };
|
|
476
|
-
|
|
477
|
-
(generateTests.generateTestCases as jest.Mock).mockReturnValue({
|
|
478
|
-
testCases: [
|
|
479
|
-
{ stage: '01-input', type: 'sample', input: 'test', expected: 'test' },
|
|
480
|
-
],
|
|
481
|
-
});
|
|
482
|
-
|
|
483
|
-
(dispatch.dispatchParallel as jest.Mock).mockReturnValue([
|
|
484
|
-
{ skill: 'worker', status: 'failed', batchId: 1, testCaseId: 'tc-001', timestamp: new Date().toISOString(), findings: ['output missing'], recommendations: ['run worker'], metrics: {}, nextSkill: 'validation' },
|
|
485
|
-
]);
|
|
486
|
-
|
|
487
|
-
(benchmark.calculateBenchmark as jest.Mock).mockReturnValue({
|
|
488
|
-
workspace: 'test',
|
|
489
|
-
agent: 'test',
|
|
490
|
-
timestamp: new Date().toISOString(),
|
|
491
|
-
rawScore: 30,
|
|
492
|
-
weightedScore: 40,
|
|
493
|
-
stages: [],
|
|
494
|
-
fixSuggestions: ['Improve output'],
|
|
495
|
-
improvementPotential: true,
|
|
496
|
-
});
|
|
497
|
-
|
|
498
|
-
const result = runBatchLifecycle(ws, config);
|
|
499
|
-
|
|
500
|
-
expect(result.failedBatches).toBeGreaterThanOrEqual(0);
|
|
501
|
-
});
|
|
502
|
-
});
|
|
503
|
-
```
|
|
504
|
-
|
|
505
|
-
- [ ] **Step 2: Run test to verify it fails**
|
|
506
|
-
|
|
507
|
-
Run: `npx jest tests/orchestrator.test.ts -t "batch lifecycle" --verbose`
|
|
508
|
-
Expected: FAIL
|
|
509
|
-
|
|
510
|
-
- [ ] **Step 3: Add runBatchLifecycle to orchestrator.ts**
|
|
511
|
-
|
|
512
|
-
Add to `src/scripts/orchestrator.ts`:
|
|
513
|
-
|
|
514
|
-
```typescript
|
|
515
|
-
import { generateTestCases } from './generate-tests';
|
|
516
|
-
import { dispatchParallel, ParallelInvocation, ParallelDispatchResult } from './dispatch';
|
|
517
|
-
import { calculateBenchmark } from './benchmark';
|
|
518
|
-
|
|
519
|
-
export interface BatchLifecycleResult {
|
|
520
|
-
totalBatches: number;
|
|
521
|
-
passedBatches: number;
|
|
522
|
-
failedBatches: number;
|
|
523
|
-
escalatedBatches: number;
|
|
524
|
-
overallScore: number;
|
|
525
|
-
batchReports: BatchReport[];
|
|
526
|
-
timestamp: string;
|
|
527
|
-
}
|
|
528
|
-
|
|
529
|
-
export function runBatchLifecycle(
|
|
530
|
-
workspacePath: string,
|
|
531
|
-
config: OrchestratorConfig = {},
|
|
532
|
-
): BatchLifecycleResult {
|
|
533
|
-
const resolvedConfig: Required<OrchestratorConfig> = {
|
|
534
|
-
batchSize: config.batchSize ?? DEFAULT_CONFIG.batchSize,
|
|
535
|
-
maxFixRetries: config.maxFixRetries ?? DEFAULT_CONFIG.maxFixRetries,
|
|
536
|
-
scoreThreshold: config.scoreThreshold ?? DEFAULT_CONFIG.scoreThreshold,
|
|
537
|
-
workerTimeout: config.workerTimeout ?? DEFAULT_CONFIG.workerTimeout,
|
|
538
|
-
};
|
|
539
|
-
|
|
540
|
-
const ws = path.resolve(workspacePath);
|
|
541
|
-
const iterationDir = path.join(ws, '.agents', 'iteration');
|
|
542
|
-
fs.mkdirSync(iterationDir, { recursive: true });
|
|
543
|
-
|
|
544
|
-
// Phase 1: Generate test cases
|
|
545
|
-
const testCasesResult = generateTestCases(ws);
|
|
546
|
-
const testCaseIds = testCasesResult.testCases.map((_, i) => `tc-${String(i + 1).padStart(3, '0')}`);
|
|
547
|
-
|
|
548
|
-
// Phase 2: Split into batches
|
|
549
|
-
const batches = splitIntoBatches(testCaseIds, resolvedConfig.batchSize);
|
|
550
|
-
|
|
551
|
-
// Phase 3: Process each batch
|
|
552
|
-
const batchReports: BatchReport[] = [];
|
|
553
|
-
let passedBatches = 0;
|
|
554
|
-
let failedBatches = 0;
|
|
555
|
-
let escalatedBatches = 0;
|
|
556
|
-
|
|
557
|
-
for (let batchIdx = 0; batchIdx < batches.length; batchIdx++) {
|
|
558
|
-
const batchId = batchIdx + 1;
|
|
559
|
-
const batchDir = createBatchDirectory(iterationDir, batchId);
|
|
560
|
-
const batchTestCases = batches[batchIdx];
|
|
561
|
-
|
|
562
|
-
// Dispatch workers in parallel
|
|
563
|
-
const invocations: ParallelInvocation[] = batchTestCases.map((tcId) => ({
|
|
564
|
-
skill: 'worker',
|
|
565
|
-
batchId,
|
|
566
|
-
testCaseId: tcId,
|
|
567
|
-
}));
|
|
568
|
-
|
|
569
|
-
const workerResults = dispatchParallel(invocations, path.join(ws, '.agents', 'skills', 'workspace-maxxing', 'skills'));
|
|
570
|
-
|
|
571
|
-
// Write worker outputs
|
|
572
|
-
workerResults.forEach((result) => {
|
|
573
|
-
const tcDir = createTestCaseDirectory(batchDir, result.testCaseId);
|
|
574
|
-
fs.writeFileSync(
|
|
575
|
-
path.join(tcDir, 'report.json'),
|
|
576
|
-
JSON.stringify(result, null, 2),
|
|
577
|
-
);
|
|
578
|
-
});
|
|
579
|
-
|
|
580
|
-
// Run benchmark for batch
|
|
581
|
-
const benchmarkResult = calculateBenchmark(ws);
|
|
582
|
-
const batchScore = benchmarkResult.weightedScore;
|
|
583
|
-
|
|
584
|
-
// Determine batch status
|
|
585
|
-
let batchStatus: BatchReport['status'] = 'passed';
|
|
586
|
-
if (benchmarkResult.weightedScore < resolvedConfig.scoreThreshold) {
|
|
587
|
-
// Fix loop
|
|
588
|
-
const fixResults = runFixLoop(
|
|
589
|
-
batchDir,
|
|
590
|
-
workerResults,
|
|
591
|
-
benchmarkResult.fixSuggestions,
|
|
592
|
-
resolvedConfig.maxFixRetries,
|
|
593
|
-
ws,
|
|
594
|
-
);
|
|
595
|
-
|
|
596
|
-
if (fixResults.status === 'escalated') {
|
|
597
|
-
batchStatus = 'escalated';
|
|
598
|
-
escalatedBatches++;
|
|
599
|
-
} else if (fixResults.status === 'failed') {
|
|
600
|
-
batchStatus = 'failed';
|
|
601
|
-
failedBatches++;
|
|
602
|
-
} else {
|
|
603
|
-
batchStatus = 'passed';
|
|
604
|
-
passedBatches++;
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
// Re-run benchmark after fixes
|
|
608
|
-
const postFixBenchmark = calculateBenchmark(ws);
|
|
609
|
-
batchReports.push({
|
|
610
|
-
batchId,
|
|
611
|
-
testCases: batchTestCases,
|
|
612
|
-
score: postFixBenchmark.weightedScore,
|
|
613
|
-
status: batchStatus,
|
|
614
|
-
findings: fixResults.findings,
|
|
615
|
-
timestamp: new Date().toISOString(),
|
|
616
|
-
});
|
|
617
|
-
} else {
|
|
618
|
-
passedBatches++;
|
|
619
|
-
batchReports.push({
|
|
620
|
-
batchId,
|
|
621
|
-
testCases: batchTestCases,
|
|
622
|
-
score: batchScore,
|
|
623
|
-
status: 'passed',
|
|
624
|
-
findings: ['Batch passed threshold'],
|
|
625
|
-
timestamp: new Date().toISOString(),
|
|
626
|
-
});
|
|
627
|
-
}
|
|
628
|
-
}
|
|
629
|
-
|
|
630
|
-
// Write summary
|
|
631
|
-
const summary: BatchLifecycleResult = {
|
|
632
|
-
totalBatches: batches.length,
|
|
633
|
-
passedBatches,
|
|
634
|
-
failedBatches,
|
|
635
|
-
escalatedBatches,
|
|
636
|
-
overallScore: batchReports.length > 0
|
|
637
|
-
? Math.round(batchReports.reduce((sum, r) => sum + r.score, 0) / batchReports.length)
|
|
638
|
-
: 0,
|
|
639
|
-
batchReports,
|
|
640
|
-
timestamp: new Date().toISOString(),
|
|
641
|
-
};
|
|
642
|
-
|
|
643
|
-
fs.writeFileSync(
|
|
644
|
-
path.join(iterationDir, 'summary.json'),
|
|
645
|
-
JSON.stringify(summary, null, 2),
|
|
646
|
-
);
|
|
647
|
-
|
|
648
|
-
console.log(JSON.stringify(summary, null, 2));
|
|
649
|
-
return summary;
|
|
650
|
-
}
|
|
651
|
-
|
|
652
|
-
interface FixLoopResult {
|
|
653
|
-
status: 'passed' | 'failed' | 'escalated';
|
|
654
|
-
findings: string[];
|
|
655
|
-
}
|
|
656
|
-
|
|
657
|
-
function runFixLoop(
|
|
658
|
-
batchDir: string,
|
|
659
|
-
workerResults: ParallelDispatchResult[],
|
|
660
|
-
fixSuggestions: string[],
|
|
661
|
-
maxRetries: number,
|
|
662
|
-
workspacePath: string,
|
|
663
|
-
): FixLoopResult {
|
|
664
|
-
const findings: string[] = [];
|
|
665
|
-
|
|
666
|
-
for (let retry = 0; retry < maxRetries; retry++) {
|
|
667
|
-
const failingResults = workerResults.filter((r) => r.status !== 'passed');
|
|
668
|
-
|
|
669
|
-
if (failingResults.length === 0) {
|
|
670
|
-
return { status: 'passed', findings };
|
|
671
|
-
}
|
|
672
|
-
|
|
673
|
-
// Dispatch fixers in parallel
|
|
674
|
-
const fixInvocations: ParallelInvocation[] = failingResults.map((r) => ({
|
|
675
|
-
skill: 'fixer',
|
|
676
|
-
batchId: r.batchId,
|
|
677
|
-
testCaseId: r.testCaseId,
|
|
678
|
-
}));
|
|
679
|
-
|
|
680
|
-
const fixResults = dispatchParallel(
|
|
681
|
-
fixInvocations,
|
|
682
|
-
path.join(workspacePath, '.agents', 'skills', 'workspace-maxxing', 'skills'),
|
|
683
|
-
);
|
|
684
|
-
|
|
685
|
-
findings.push(`Fix attempt ${retry + 1}: ${fixResults.length} fixes applied`);
|
|
686
|
-
|
|
687
|
-
// Re-check benchmark
|
|
688
|
-
const benchmarkResult = calculateBenchmark(workspacePath);
|
|
689
|
-
if (benchmarkResult.weightedScore >= 85) {
|
|
690
|
-
return { status: 'passed', findings };
|
|
691
|
-
}
|
|
692
|
-
}
|
|
693
|
-
|
|
694
|
-
return { status: 'escalated', findings: [...findings, 'Max retries exhausted'] };
|
|
695
|
-
}
|
|
696
|
-
```
|
|
697
|
-
|
|
698
|
-
- [ ] **Step 4: Run test to verify it passes**
|
|
699
|
-
|
|
700
|
-
Run: `npx jest tests/orchestrator.test.ts -t "batch lifecycle" --verbose`
|
|
701
|
-
Expected: PASS
|
|
702
|
-
|
|
703
|
-
- [ ] **Step 5: Commit**
|
|
704
|
-
|
|
705
|
-
```bash
|
|
706
|
-
git add src/scripts/orchestrator.ts tests/orchestrator.test.ts
|
|
707
|
-
git commit -m "feat(orchestrator): add full batch lifecycle with fix loop"
|
|
708
|
-
```
|
|
709
|
-
|
|
710
|
-
---
|
|
711
|
-
|
|
712
|
-
### Task 5: Worker Sub-Skill SKILL.md
|
|
713
|
-
|
|
714
|
-
**Files:**
|
|
715
|
-
- Create: `templates/.workspace-templates/skills/worker/SKILL.md`
|
|
716
|
-
- Test: `tests/worker-skill.test.ts`
|
|
717
|
-
|
|
718
|
-
- [ ] **Step 1: Write test for worker skill structure**
|
|
719
|
-
|
|
720
|
-
```typescript
|
|
721
|
-
// tests/worker-skill.test.ts
|
|
722
|
-
import * as fs from 'fs';
|
|
723
|
-
import * as path from 'path';
|
|
724
|
-
|
|
725
|
-
describe('worker sub-skill', () => {
|
|
726
|
-
const skillPath = path.join(__dirname, '..', 'templates', '.workspace-templates', 'skills', 'worker', 'SKILL.md');
|
|
727
|
-
|
|
728
|
-
it('exists', () => {
|
|
729
|
-
expect(fs.existsSync(skillPath)).toBe(true);
|
|
730
|
-
});
|
|
731
|
-
|
|
732
|
-
it('has YAML frontmatter with name and triggers', () => {
|
|
733
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
734
|
-
expect(content).toMatch(/^---/m);
|
|
735
|
-
expect(content).toMatch(/name:\s*worker/);
|
|
736
|
-
expect(content).toMatch(/triggers:/);
|
|
737
|
-
});
|
|
738
|
-
|
|
739
|
-
it('has Iron Law section', () => {
|
|
740
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
741
|
-
expect(content.toLowerCase()).toContain('the iron law');
|
|
742
|
-
});
|
|
743
|
-
|
|
744
|
-
it('has Anti-Rationalization Table', () => {
|
|
745
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
746
|
-
expect(content).toContain('| Thought | Reality |');
|
|
747
|
-
});
|
|
748
|
-
|
|
749
|
-
it('has Report Format section with JSON schema', () => {
|
|
750
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
751
|
-
expect(content.toLowerCase()).toContain('report format');
|
|
752
|
-
expect(content).toContain('"skill": "worker"');
|
|
753
|
-
});
|
|
754
|
-
});
|
|
755
|
-
```
|
|
756
|
-
|
|
757
|
-
- [ ] **Step 2: Run test to verify it fails**
|
|
758
|
-
|
|
759
|
-
Run: `npx jest tests/worker-skill.test.ts --verbose`
|
|
760
|
-
Expected: FAIL
|
|
761
|
-
|
|
762
|
-
- [ ] **Step 3: Create worker SKILL.md**
|
|
763
|
-
|
|
764
|
-
```markdown
|
|
765
|
-
---
|
|
766
|
-
name: worker
|
|
767
|
-
description: "Executes a single test case against the workspace and produces output. Use when running test cases, executing workspace tasks, or processing stage-specific work."
|
|
768
|
-
triggers: ["run test case", "execute workspace task", "process stage", "generate output"]
|
|
769
|
-
---
|
|
770
|
-
|
|
771
|
-
## Overview
|
|
772
|
-
|
|
773
|
-
Execute a single test case by reading the relevant workspace sections, performing the required work, and producing structured output. Each worker runs with fresh context — no assumptions about prior runs.
|
|
774
|
-
|
|
775
|
-
## When to Use
|
|
776
|
-
|
|
777
|
-
- Dispatched by orchestrator as part of a batch
|
|
778
|
-
- User asks to run a specific test case
|
|
779
|
-
- User asks to execute a workspace stage task
|
|
780
|
-
|
|
781
|
-
## When Not to Use
|
|
782
|
-
|
|
783
|
-
- Validating outputs (use validation sub-skill)
|
|
784
|
-
- Fixing failed outputs (use fixer sub-skill)
|
|
785
|
-
- Planning workspace structure (use architecture sub-skill)
|
|
786
|
-
|
|
787
|
-
## The Iron Law
|
|
788
|
-
|
|
789
|
-
NO SKIPPING TEST CASE STEPS
|
|
790
|
-
NO MODIFYING WORKSPACE STRUCTURE
|
|
791
|
-
NO CLAIMING DONE WITHOUT OUTPUT
|
|
792
|
-
NO ASSUMING PRIOR CONTEXT
|
|
793
|
-
|
|
794
|
-
## The Process
|
|
795
|
-
|
|
796
|
-
1. **Read test case** — Load the test case JSON from `.agents/iteration/batch-<N>/<testCaseId>/` or orchestrator input
|
|
797
|
-
2. **Load workspace context** — Read `SYSTEM.md` and relevant stage `CONTEXT.md` files
|
|
798
|
-
3. **Execute the task** — Follow the test case input/expected instructions
|
|
799
|
-
4. **Write output.md** — Human-readable output in `.agents/iteration/batch-<N>/<testCaseId>/output.md`
|
|
800
|
-
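5. **Write report.json** — Structured JSON in `.agents/iteration/batch-<N>/<testCaseId>/report.json` following the Report Format below (`skill`, `status`, `timestamp`, `testCaseId`, `batchId`, `findings`, `recommendations`, `metrics`, `nextSkill`)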
|
|
801
|
-
6. **Dispatch validation** — Signal that output is ready for validation
|
|
802
|
-
|
|
803
|
-
## Anti-Rationalization Table
|
|
804
|
-
|
|
805
|
-
| Thought | Reality |
|
|
806
|
-
|---------|---------|
|
|
807
|
-
| "I already know what this stage does" | Read the CONTEXT.md. Assumptions cause failures. |
|
|
808
|
-
| "The output is good enough" | Good enough fails validation. Follow the test case exactly. |
|
|
809
|
-
| "I'll modify the workspace structure to make this easier" | Workers don't modify structure. That's the architecture sub-skill's job. |
|
|
810
|
-
| "This test case is redundant" | Every test case exists for a reason. Execute it. |
|
|
811
|
-
| "I'll skip writing report.json" | Validation depends on report.json. It's mandatory. |
|
|
812
|
-
|
|
813
|
-
## Sub-Skill Dispatch
|
|
814
|
-
|
|
815
|
-
- After output complete → validation sub-skill
|
|
816
|
-
|
|
817
|
-
## Report Format
|
|
818
|
-
|
|
819
|
-
```json
|
|
820
|
-
{
|
|
821
|
-
"skill": "worker",
|
|
822
|
-
"status": "passed|failed|escalated",
|
|
823
|
-
"timestamp": "<ISO-8601>",
|
|
824
|
-
"testCaseId": "<id>",
|
|
825
|
-
"batchId": <number>,
|
|
826
|
-
"findings": ["<finding>"],
|
|
827
|
-
"recommendations": ["<recommendation>"],
|
|
828
|
-
"metrics": {
|
|
829
|
-
"executionTimeMs": <number>,
|
|
830
|
-
"outputLength": <number>
|
|
831
|
-
},
|
|
832
|
-
"nextSkill": "validation"
|
|
833
|
-
}
|
|
834
|
-
```
|
|
835
|
-
```
|
|
836
|
-
|
|
837
|
-
- [ ] **Step 4: Run test to verify it passes**
|
|
838
|
-
|
|
839
|
-
Run: `npx jest tests/worker-skill.test.ts --verbose`
|
|
840
|
-
Expected: PASS
|
|
841
|
-
|
|
842
|
-
- [ ] **Step 5: Commit**
|
|
843
|
-
|
|
844
|
-
```bash
|
|
845
|
-
git add templates/.workspace-templates/skills/worker/SKILL.md tests/worker-skill.test.ts
|
|
846
|
-
git commit -m "feat(worker): add worker sub-skill with obra patterns"
|
|
847
|
-
```
|
|
848
|
-
|
|
849
|
-
---
|
|
850
|
-
|
|
851
|
-
### Task 6: Fixer Sub-Skill SKILL.md
|
|
852
|
-
|
|
853
|
-
**Files:**
|
|
854
|
-
- Create: `templates/.workspace-templates/skills/fixer/SKILL.md`
|
|
855
|
-
- Test: `tests/fixer-skill.test.ts`
|
|
856
|
-
|
|
857
|
-
- [ ] **Step 1: Write test for fixer skill structure**
|
|
858
|
-
|
|
859
|
-
```typescript
|
|
860
|
-
// tests/fixer-skill.test.ts
|
|
861
|
-
import * as fs from 'fs';
|
|
862
|
-
import * as path from 'path';
|
|
863
|
-
|
|
864
|
-
describe('fixer sub-skill', () => {
|
|
865
|
-
const skillPath = path.join(__dirname, '..', 'templates', '.workspace-templates', 'skills', 'fixer', 'SKILL.md');
|
|
866
|
-
|
|
867
|
-
it('exists', () => {
|
|
868
|
-
expect(fs.existsSync(skillPath)).toBe(true);
|
|
869
|
-
});
|
|
870
|
-
|
|
871
|
-
it('has YAML frontmatter with name and triggers', () => {
|
|
872
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
873
|
-
expect(content).toMatch(/^---/m);
|
|
874
|
-
expect(content).toMatch(/name:\s*fixer/);
|
|
875
|
-
expect(content).toMatch(/triggers:/);
|
|
876
|
-
});
|
|
877
|
-
|
|
878
|
-
it('has Iron Law section', () => {
|
|
879
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
880
|
-
expect(content.toLowerCase()).toContain('the iron law');
|
|
881
|
-
});
|
|
882
|
-
|
|
883
|
-
it('has Anti-Rationalization Table', () => {
|
|
884
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
885
|
-
expect(content).toContain('| Thought | Reality |');
|
|
886
|
-
});
|
|
887
|
-
|
|
888
|
-
it('has Report Format section with JSON schema', () => {
|
|
889
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
890
|
-
expect(content.toLowerCase()).toContain('report format');
|
|
891
|
-
expect(content).toContain('"skill": "fixer"');
|
|
892
|
-
});
|
|
893
|
-
});
|
|
894
|
-
```
|
|
895
|
-
|
|
896
|
-
- [ ] **Step 2: Run test to verify it fails**
|
|
897
|
-
|
|
898
|
-
Run: `npx jest tests/fixer-skill.test.ts --verbose`
|
|
899
|
-
Expected: FAIL
|
|
900
|
-
|
|
901
|
-
- [ ] **Step 3: Create fixer SKILL.md**
|
|
902
|
-
|
|
903
|
-
```markdown
|
|
904
|
-
---
|
|
905
|
-
name: fixer
|
|
906
|
-
description: "Applies targeted fixes to failing test case outputs. Use when fixing failed worker outputs, improving low-scoring results, or addressing validator findings."
|
|
907
|
-
triggers: ["fix failing test", "improve output", "address validation failure", "apply targeted fix"]
|
|
908
|
-
---
|
|
909
|
-
|
|
910
|
-
## Overview
|
|
911
|
-
|
|
912
|
-
Read validator findings and original worker output, identify the root cause of failure, apply the minimal fix needed, and re-validate. Each fixer runs with fresh context.
|
|
913
|
-
|
|
914
|
-
## When to Use
|
|
915
|
-
|
|
916
|
-
- Dispatched by orchestrator fix loop
|
|
917
|
-
- Validator identifies specific failures
|
|
918
|
-
- Worker output is incomplete or incorrect
|
|
919
|
-
|
|
920
|
-
## When Not to Use
|
|
921
|
-
|
|
922
|
-
- Generating new output from scratch (use worker sub-skill)
|
|
923
|
-
- Validating outputs (use validation sub-skill)
|
|
924
|
-
- Restructuring workspace (use architecture sub-skill)
|
|
925
|
-
|
|
926
|
-
## The Iron Law
|
|
927
|
-
|
|
928
|
-
NO BLIND RETRIES
|
|
929
|
-
NO COSMETIC FIXES
|
|
930
|
-
NO FIXING WHAT ISN'T BROKEN
|
|
931
|
-
NO CLAIMING FIX WITHOUT RE-VALIDATION
|
|
932
|
-
|
|
933
|
-
## The Process
|
|
934
|
-
|
|
935
|
-
1. **Read validator findings** — Load `batch-report.json` from batch directory
|
|
936
|
-
2. **Read original output** — Load `output.md` and `report.json` from `.agents/iteration/batch-<N>/<testCaseId>/`
|
|
937
|
-
3. **Identify root cause** — Map each finding to a specific issue in the output
|
|
938
|
-
4. **Apply minimal fix** — Change only what's needed to address the finding
|
|
939
|
-
5. **Update output.md** — Write the fixed output
|
|
940
|
-
6. **Update report.json** — Write updated report with fix details
|
|
941
|
-
7. **Dispatch validation** — Signal that fix is ready for re-validation
|
|
942
|
-
|
|
943
|
-
## Anti-Rationalization Table
|
|
944
|
-
|
|
945
|
-
| Thought | Reality |
|
|
946
|
-
|---------|---------|
|
|
947
|
-
| "I'll just re-run the worker logic" | Blind retries don't fix root causes. Read the findings. |
|
|
948
|
-
| "This looks better now" | Better is subjective. Does it pass the test case? |
|
|
949
|
-
| "I'll fix other things while I'm here" | Fix only what the validator flagged. Scope creep wastes cycles. |
|
|
950
|
-
| "The fix is obvious" | Obvious to whom? Follow the findings, not intuition. |
|
|
951
|
-
| "I don't need to re-validate" | Unvalidated fixes are guesses. Always re-validate. |
|
|
952
|
-
|
|
953
|
-
## Sub-Skill Dispatch
|
|
954
|
-
|
|
955
|
-
- After fix applied → validation sub-skill
|
|
956
|
-
|
|
957
|
-
## Report Format
|
|
958
|
-
|
|
959
|
-
```json
|
|
960
|
-
{
|
|
961
|
-
"skill": "fixer",
|
|
962
|
-
"status": "passed|failed|escalated",
|
|
963
|
-
"timestamp": "<ISO-8601>",
|
|
964
|
-
"testCaseId": "<id>",
|
|
965
|
-
"batchId": <number>,
|
|
966
|
-
"findings": ["<finding>"],
|
|
967
|
-
"fixesApplied": ["<fix description>"],
|
|
968
|
-
"recommendations": ["<recommendation>"],
|
|
969
|
-
"metrics": {
|
|
970
|
-
"findingsAddressed": <number>,
|
|
971
|
-
"fixesApplied": <number>
|
|
972
|
-
},
|
|
973
|
-
"nextSkill": "validation"
|
|
974
|
-
}
|
|
975
|
-
```
|
|
976
|
-
```
|
|
977
|
-
|
|
978
|
-
- [ ] **Step 4: Run test to verify it passes**
|
|
979
|
-
|
|
980
|
-
Run: `npx jest tests/fixer-skill.test.ts --verbose`
|
|
981
|
-
Expected: PASS
|
|
982
|
-
|
|
983
|
-
- [ ] **Step 5: Commit**
|
|
984
|
-
|
|
985
|
-
```bash
|
|
986
|
-
git add templates/.workspace-templates/skills/fixer/SKILL.md tests/fixer-skill.test.ts
|
|
987
|
-
git commit -m "feat(fixer): add fixer sub-skill with obra patterns"
|
|
988
|
-
```
|
|
989
|
-
|
|
990
|
-
---
|
|
991
|
-
|
|
992
|
-
### Task 7: Enhanced Validation Sub-Skill SKILL.md
|
|
993
|
-
|
|
994
|
-
**Files:**
|
|
995
|
-
- Modify: `templates/.workspace-templates/skills/validation/SKILL.md`
|
|
996
|
-
- Test: `tests/validation-enhanced.test.ts`
|
|
997
|
-
|
|
998
|
-
- [ ] **Step 1: Write test for enhanced validation skill**
|
|
999
|
-
|
|
1000
|
-
```typescript
|
|
1001
|
-
// tests/validation-enhanced.test.ts
|
|
1002
|
-
import * as fs from 'fs';
|
|
1003
|
-
import * as path from 'path';
|
|
1004
|
-
|
|
1005
|
-
describe('enhanced validation sub-skill', () => {
|
|
1006
|
-
const skillPath = path.join(__dirname, '..', 'templates', '.workspace-templates', 'skills', 'validation', 'SKILL.md');
|
|
1007
|
-
|
|
1008
|
-
it('has YAML frontmatter with triggers', () => {
|
|
1009
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
1010
|
-
expect(content).toMatch(/^---/m);
|
|
1011
|
-
expect(content).toMatch(/name:\s*validation/);
|
|
1012
|
-
expect(content).toMatch(/triggers:/);
|
|
1013
|
-
});
|
|
1014
|
-
|
|
1015
|
-
it('has Iron Law section', () => {
|
|
1016
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
1017
|
-
expect(content.toLowerCase()).toContain('the iron law');
|
|
1018
|
-
});
|
|
1019
|
-
|
|
1020
|
-
it('has Anti-Rationalization Table', () => {
|
|
1021
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
1022
|
-
expect(content).toContain('| Thought | Reality |');
|
|
1023
|
-
});
|
|
1024
|
-
|
|
1025
|
-
it('includes batch-level validation instructions', () => {
|
|
1026
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
1027
|
-
expect(content.toLowerCase()).toContain('batch');
|
|
1028
|
-
});
|
|
1029
|
-
|
|
1030
|
-
it('has Report Format with benchmark scoring', () => {
|
|
1031
|
-
const content = fs.readFileSync(skillPath, 'utf-8');
|
|
1032
|
-
expect(content.toLowerCase()).toContain('report format');
|
|
1033
|
-
expect(content.toLowerCase()).toContain('benchmark');
|
|
1034
|
-
});
|
|
1035
|
-
});
|
|
1036
|
-
```
|
|
1037
|
-
|
|
1038
|
-
- [ ] **Step 2: Run test to verify it fails**
|
|
1039
|
-
|
|
1040
|
-
Run: `npx jest tests/validation-enhanced.test.ts --verbose`
|
|
1041
|
-
Expected: FAIL
|
|
1042
|
-
|
|
1043
|
-
- [ ] **Step 3: Rewrite validation SKILL.md with obra patterns**
|
|
1044
|
-
|
|
1045
|
-
```markdown
|
|
1046
|
-
---
|
|
1047
|
-
name: validation
|
|
1048
|
-
description: "Checks workspace ICM compliance and benchmarks batch outputs. Use when validating a workspace, checking compliance, running validation, benchmarking batch results, or after making changes to workspace structure."
|
|
1049
|
-
triggers: ["validate batch", "check results", "run validation", "benchmark outputs", "check compliance"]
|
|
1050
|
-
---
|
|
1051
|
-
|
|
1052
|
-
## Overview
|
|
1053
|
-
|
|
1054
|
-
Ensure workspace meets ICM standards and benchmark batch outputs through systematic validation. Validates both workspace structure and worker/fixer outputs.
|
|
1055
|
-
|
|
1056
|
-
## When to Use
|
|
1057
|
-
|
|
1058
|
-
- After workspace scaffolding
|
|
1059
|
-
- After any structural change
|
|
1060
|
-
- After worker batch completes
|
|
1061
|
-
- After fixer applies fixes
|
|
1062
|
-
- Before claiming delivery
|
|
1063
|
-
- When score drops below threshold
|
|
1064
|
-
|
|
1065
|
-
## When Not to Use
|
|
1066
|
-
|
|
1067
|
-
- Generating outputs (use worker sub-skill)
|
|
1068
|
-
- Fixing failures (use fixer sub-skill)
|
|
1069
|
-
- Researching patterns (use research sub-skill)
|
|
1070
|
-
|
|
1071
|
-
## The Iron Law
|
|
1072
|
-
|
|
1073
|
-
NO SCORE INFLATION
|
|
1074
|
-
NO SKIPPING FAILURES
|
|
1075
|
-
NO VALIDATING WITHOUT BENCHMARK
|
|
1076
|
-
NO PASSING WITHOUT EVIDENCE
|
|
1077
|
-
|
|
1078
|
-
## The Process
|
|
1079
|
-
|
|
1080
|
-
1. **Run validate.ts** — Execute `node scripts/validate.ts --workspace <path>`
|
|
1081
|
-
2. **Parse results** — Read exit code and output
|
|
1082
|
-
3. **Check batch outputs** — For each test case in batch, verify output.md and report.json exist
|
|
1083
|
-
4. **Run benchmark** — Execute `node scripts/benchmark.ts --workspace <path>`
|
|
1084
|
-
5. **Aggregate scores** — Combine workspace validation + benchmark scores
|
|
1085
|
-
6. **Generate findings** — List specific failures with fix suggestions
|
|
1086
|
-
7. **Write batch-report.json** — Structured report with per-test scores and overall batch score
|
|
1087
|
-
|
|
1088
|
-
## Batch-Level Validation
|
|
1089
|
-
|
|
1090
|
-
When validating a batch:
|
|
1091
|
-
- Read all `report.json` files in `.agents/iteration/batch-<N>/`
|
|
1092
|
-
- Verify each worker output matches its test case expectations
|
|
1093
|
-
- Calculate per-test-case pass/fail
|
|
1094
|
-
- Calculate overall batch score using benchmark weights
|
|
1095
|
-
- If batch score < threshold → recommend fixer sub-skill
|
|
1096
|
-
|
|
1097
|
-
## Anti-Rationalization Table
|
|
1098
|
-
|
|
1099
|
-
| Thought | Reality |
|
|
1100
|
-
|---------|---------|
|
|
1101
|
-
| "This workspace looks good enough" | Good enough is the enemy of excellent. Run validation. |
|
|
1102
|
-
| "The score is close, I'll round up" | Score inflation hides real problems. Report the true score. |
|
|
1103
|
-
| "One failure doesn't matter" | Every failure matters. Report it. |
|
|
1104
|
-
| "I already validated this" | Validation is a snapshot. Re-validate after every change. |
|
|
1105
|
-
| "The benchmark is too strict" | The benchmark is the standard. Meet it or escalate. |
|
|
1106
|
-
|
|
1107
|
-
## Sub-Skill Dispatch
|
|
1108
|
-
|
|
1109
|
-
- If batch score < threshold → fixer sub-skill
|
|
1110
|
-
- If batch score >= threshold → orchestrator (batch complete)
|
|
1111
|
-
- If critical failures (missing SYSTEM.md) → escalate to human
|
|
1112
|
-
|
|
1113
|
-
## Report Format
|
|
1114
|
-
|
|
1115
|
-
```json
|
|
1116
|
-
{
|
|
1117
|
-
"skill": "validation",
|
|
1118
|
-
"status": "passed|failed|escalated",
|
|
1119
|
-
"timestamp": "<ISO-8601>",
|
|
1120
|
-
"batchId": <number>,
|
|
1121
|
-
"findings": ["<finding>"],
|
|
1122
|
-
"fixSuggestions": ["<suggestion>"],
|
|
1123
|
-
"recommendations": ["<recommendation>"],
|
|
1124
|
-
"metrics": {
|
|
1125
|
-
"score": <0-100>,
|
|
1126
|
-
"benchmarkScore": <0-100>,
|
|
1127
|
-
"itemsChecked": <number>,
|
|
1128
|
-
"itemsPassed": <number>,
|
|
1129
|
-
"testCasesPassed": <number>,
|
|
1130
|
-
"testCasesFailed": <number>
|
|
1131
|
-
},
|
|
1132
|
-
"nextSkill": "fixer|none"
|
|
1133
|
-
}
|
|
1134
|
-
```
|
|
1135
|
-
```
|
|
1136
|
-
|
|
1137
|
-
- [ ] **Step 4: Run test to verify it passes**
|
|
1138
|
-
|
|
1139
|
-
Run: `npx jest tests/validation-enhanced.test.ts --verbose`
|
|
1140
|
-
Expected: PASS
|
|
1141
|
-
|
|
1142
|
-
- [ ] **Step 5: Commit**
|
|
1143
|
-
|
|
1144
|
-
```bash
|
|
1145
|
-
git add templates/.workspace-templates/skills/validation/SKILL.md tests/validation-enhanced.test.ts
|
|
1146
|
-
git commit -m "feat(validation): enhance with obra patterns and batch validation"
|
|
1147
|
-
```
|
|
1148
|
-
|
|
1149
|
-
---
|
|
1150
|
-
|
|
1151
|
-
### Task 8: Rewrite Remaining 6 Sub-Skills with obra/superpowers Patterns
|
|
1152
|
-
|
|
1153
|
-
**Files:**
|
|
1154
|
-
- Modify: `templates/.workspace-templates/skills/research/SKILL.md`
|
|
1155
|
-
- Modify: `templates/.workspace-templates/skills/architecture/SKILL.md`
|
|
1156
|
-
- Modify: `templates/.workspace-templates/skills/testing/SKILL.md`
|
|
1157
|
-
- Modify: `templates/.workspace-templates/skills/prompt-engineering/SKILL.md`
|
|
1158
|
-
- Modify: `templates/.workspace-templates/skills/iteration/SKILL.md`
|
|
1159
|
-
- Modify: `templates/.workspace-templates/skills/tooling/SKILL.md`
|
|
1160
|
-
|
|
1161
|
-
Each sub-skill gets this structure:
|
|
1162
|
-
- YAML frontmatter: `name`, `description`, `triggers`
|
|
1163
|
-
- Overview
|
|
1164
|
-
- When to Use / When Not to Use
|
|
1165
|
-
- The Iron Law (3-4 rules)
|
|
1166
|
-
- The Process (numbered steps)
|
|
1167
|
-
- Anti-Rationalization Table
|
|
1168
|
-
- Sub-Skill Dispatch (if applicable)
|
|
1169
|
-
- Report Format (JSON schema)
|
|
1170
|
-
|
|
1171
|
-
- [ ] **Step 1: Rewrite research/SKILL.md**
|
|
1172
|
-
|
|
1173
|
-
```markdown
|
|
1174
|
-
---
|
|
1175
|
-
name: research
|
|
1176
|
-
description: "Investigates patterns, gathers context, and identifies best practices for workspace design. Use when starting a new workspace, researching workflow patterns, or before architecture planning."
|
|
1177
|
-
triggers: ["research workflow", "gather context", "identify patterns", "best practices"]
|
|
1178
|
-
---
|
|
1179
|
-
|
|
1180
|
-
## Overview
|
|
1181
|
-
|
|
1182
|
-
Gather context and identify patterns before building. Research ensures the workspace design is informed by real requirements, not assumptions.
|
|
1183
|
-
|
|
1184
|
-
## When to Use
|
|
1185
|
-
|
|
1186
|
-
- Phase 1 of hybrid flow (always first)
|
|
1187
|
-
- Before architecture planning
|
|
1188
|
-
- When user asks for a novel workflow type
|
|
1189
|
-
- When existing patterns don't fit the use case
|
|
1190
|
-
|
|
1191
|
-
## When Not to Use
|
|
1192
|
-
|
|
1193
|
-
- After architecture is already planned (use architecture sub-skill)
|
|
1194
|
-
- When workspace structure already exists (use validation sub-skill)
|
|
1195
|
-
- For simple file creation (direct file operations)
|
|
1196
|
-
|
|
1197
|
-
## The Iron Law
|
|
1198
|
-
|
|
1199
|
-
NO BUILD WITHOUT RESEARCH
|
|
1200
|
-
NO GENERIC FINDINGS
|
|
1201
|
-
NO SKIPPING INPUT/OUTPUT ANALYSIS
|
|
1202
|
-
NO ASSUMPTIONS WITHOUT EVIDENCE
|
|
1203
|
-
|
|
1204
|
-
## The Process
|
|
1205
|
-
|
|
1206
|
-
1. **Identify workflow type** — What kind of process is being automated?
|
|
1207
|
-
2. **Research similar patterns** — Look at existing workspaces, documentation, best practices
|
|
1208
|
-
3. **Identify key stages** — What are the natural phases of this workflow?
|
|
1209
|
-
4. **Determine inputs/outputs** — What goes in, what comes out at each stage?
|
|
1210
|
-
5. **Identify tooling needs** — What tools are commonly used for this workflow?
|
|
1211
|
-
6. **Document findings** — Create a research summary for the architecture phase
|
|
1212
|
-
|
|
1213
|
-
## Anti-Rationalization Table
|
|
1214
|
-
|
|
1215
|
-
| Thought | Reality |
|
|
1216
|
-
|---------|---------|
|
|
1217
|
-
| "I already know this workflow type" | Knowledge ≠ research. Document findings for the next agent. |
|
|
1218
|
-
| "Research is taking too long" | Research prevents wasted build time. Be thorough. |
|
|
1219
|
-
| "I'll figure it out while building" | Building without research produces generic, non-optimal workspaces. |
|
|
1220
|
-
| "The user will clarify later" | Ask now. Ambiguous requirements produce ambiguous workspaces. |
|
|
1221
|
-
|
|
1222
|
-
## Sub-Skill Dispatch
|
|
1223
|
-
|
|
1224
|
-
- Always dispatches to architecture sub-skill next
|
|
1225
|
-
- If research is inconclusive → escalate to human for clarification
|
|
1226
|
-
|
|
1227
|
-
## Report Format
|
|
1228
|
-
|
|
1229
|
-
```json
|
|
1230
|
-
{
|
|
1231
|
-
"skill": "research",
|
|
1232
|
-
"status": "passed|failed|escalated",
|
|
1233
|
-
"timestamp": "<ISO-8601>",
|
|
1234
|
-
"findings": ["<finding>"],
|
|
1235
|
-
"recommendations": ["<recommendation>"],
|
|
1236
|
-
"metrics": {
|
|
1237
|
-
"patternsIdentified": <number>,
|
|
1238
|
-
"stagesIdentified": <number>
|
|
1239
|
-
},
|
|
1240
|
-
"nextSkill": "architecture"
|
|
1241
|
-
}
|
|
1242
|
-
```
|
|
1243
|
-
```
|
|
1244
|
-
|
|
1245
|
-
- [ ] **Step 2: Rewrite architecture/SKILL.md**
|
|
1246
|
-
|
|
1247
|
-
```markdown
|
|
1248
|
-
---
|
|
1249
|
-
name: architecture
|
|
1250
|
-
description: "Designs workspace structure, plans folder layout, and creates the build plan. Use when planning workspace structure, designing folder hierarchy, or after research phase."
|
|
1251
|
-
triggers: ["design workspace", "plan structure", "folder layout", "build plan"]
|
|
1252
|
-
---
|
|
1253
|
-
|
|
1254
|
-
## Overview
|
|
1255
|
-
|
|
1256
|
-
Design the workspace structure based on research findings. Architecture translates research into a concrete, buildable plan.
|
|
1257
|
-
|
|
1258
|
-
## When to Use
|
|
1259
|
-
|
|
1260
|
-
- Phase 2 of hybrid flow (after research)
|
|
1261
|
-
- When research is complete and building is next
|
|
1262
|
-
- When restructuring an existing workspace
|
|
1263
|
-
|
|
1264
|
-
## When Not to Use
|
|
1265
|
-
|
|
1266
|
-
- Before research is complete (use research sub-skill)
|
|
1267
|
-
- During building (use scaffold.ts directly)
|
|
1268
|
-
- For minor structural tweaks (direct file operations)
|
|
1269
|
-
|
|
1270
|
-
## The Iron Law
|
|
1271
|
-
|
|
1272
|
-
NO ARCHITECTURE WITHOUT RESEARCH
|
|
1273
|
-
NO BUILDING WITHOUT APPROVED PLAN
|
|
1274
|
-
NO SKIPPING USER APPROVAL
|
|
1275
|
-
NO AMBIGUOUS STAGE DEFINITIONS
|
|
1276
|
-
|
|
1277
|
-
## The Process
|
|
1278
|
-
|
|
1279
|
-
1. **Review research findings** — Read the research sub-skill report
|
|
1280
|
-
2. **Define stage folders** — Determine numbered folder structure (01-xxx, 02-xxx, etc.)
|
|
1281
|
-
3. **Design routing table** — Plan CONTEXT.md routing for each stage
|
|
1282
|
-
4. **Define SYSTEM.md** — Plan folder map, rules, and tool inventory
|
|
1283
|
-
5. **Plan CONTEXT.md content** — Define what each stage's CONTEXT.md should contain
|
|
1284
|
-
6. **Create build plan** — Document the scaffold.ts command with all parameters
|
|
1285
|
-
7. **Get approval** — Present plan to user before building
|
|
1286
|
-
|
|
1287
|
-
## Anti-Rationalization Table
|
|
1288
|
-
|
|
1289
|
-
| Thought | Reality |
|
|
1290
|
-
|---------|---------|
|
|
1291
|
-
| "I'll adjust the structure while building" | Structure changes mid-build are expensive. Plan first. |
|
|
1292
|
-
| "This stage name is good enough" | Stage names affect routing. Be precise. |
|
|
1293
|
-
| "The user will understand without approval" | Unapproved plans produce unwanted results. Always present the plan. |
|
|
1294
|
-
|
|
1295
|
-
## Sub-Skill Dispatch
|
|
1296
|
-
|
|
1297
|
-
- Receives input from research sub-skill
|
|
1298
|
-
- After approval → main skill runs scaffold.ts
|
|
1299
|
-
- If architecture is unclear → escalate to human
|
|
1300
|
-
|
|
1301
|
-
## Report Format
|
|
1302
|
-
|
|
1303
|
-
```json
|
|
1304
|
-
{
|
|
1305
|
-
"skill": "architecture",
|
|
1306
|
-
"status": "passed|failed|escalated",
|
|
1307
|
-
"timestamp": "<ISO-8601>",
|
|
1308
|
-
"findings": ["<finding>"],
|
|
1309
|
-
"recommendations": ["<recommendation>"],
|
|
1310
|
-
"metrics": {
|
|
1311
|
-
"stagesPlanned": <number>,
|
|
1312
|
-
"toolsIdentified": <number>
|
|
1313
|
-
},
|
|
1314
|
-
"nextSkill": "none"
|
|
1315
|
-
}
|
|
1316
|
-
```
|
|
1317
|
-
```
|
|
1318
|
-
|
|
1319
|
-
- [ ] **Step 3: Rewrite testing/SKILL.md**
|
|
1320
|
-
|
|
1321
|
-
```markdown
|
|
1322
|
-
---
|
|
1323
|
-
name: testing
|
|
1324
|
-
description: "Generates and runs test cases, evaluates results, and identifies gaps. Use when testing workspace quality, generating test cases, or after prompt improvements."
|
|
1325
|
-
triggers: ["generate test cases", "run tests", "test workspace", "evaluate quality"]
|
|
1326
|
-
---
|
|
1327
|
-
|
|
1328
|
-
## Overview
|
|
1329
|
-
|
|
1330
|
-
Verify workspace quality through systematic testing. Testing ensures the workspace produces correct outputs across sample, edge-case, and empty inputs.
|
|
1331
|
-
|
|
1332
|
-
## When to Use
|
|
1333
|
-
|
|
1334
|
-
- After prompt-engineering improvements
|
|
1335
|
-
- When no tests exist for the workspace
|
|
1336
|
-
- Before claiming delivery
|
|
1337
|
-
- When score is above 80 but quality is uncertain
|
|
1338
|
-
|
|
1339
|
-
## When Not to Use
|
|
1340
|
-
|
|
1341
|
-
- Before workspace is built (use scaffold.ts first)
|
|
1342
|
-
- For structural validation (use validation sub-skill)
|
|
1343
|
-
- When fixing failures (use fixer sub-skill)
|
|
1344
|
-
|
|
1345
|
-
## The Iron Law
|
|
1346
|
-
|
|
1347
|
-
NO SKIPPING TEST GENERATION
|
|
1348
|
-
NO IGNORING FAILED TESTS
|
|
1349
|
-
NO CLAIMING QUALITY WITHOUT EVIDENCE
|
|
1350
|
-
NO TESTING WITHOUT TEST CASES
|
|
1351
|
-
|
|
1352
|
-
## The Process
|
|
1353
|
-
|
|
1354
|
-
1. **Generate test cases** — Run `node scripts/generate-tests.ts --workspace <path> --output ./tests.json`
|
|
1355
|
-
2. **Read test cases** — Parse the generated test cases
|
|
1356
|
-
3. **Run generation tests** — For each test case, create sample content the stage should produce
|
|
1357
|
-
4. **Run evaluation tests** — Review CONTEXT.md files against test cases
|
|
1358
|
-
5. **Aggregate results** — Identify patterns and gaps
|
|
1359
|
-
6. **Document findings** — Create test report with pass/fail per test case
|
|
1360
|
-
|
|
1361
|
-
## Anti-Rationalization Table
|
|
1362
|
-
|
|
1363
|
-
| Thought | Reality |
|
|
1364
|
-
|---------|---------|
|
|
1365
|
-
| "The workspace looks fine, no need to test" | Looks deceive. Tests reveal. |
|
|
1366
|
-
| "One failed test is a fluke" | Failed tests are signals. Investigate. |
|
|
1367
|
-
| "I'll test after delivery" | Untested delivery is a gamble. Test first. |
|
|
1368
|
-
|
|
1369
|
-
## Sub-Skill Dispatch
|
|
1370
|
-
|
|
1371
|
-
- Dispatched after prompt-engineering
|
|
1372
|
-
- If tests fail → dispatch iteration for fixes
|
|
1373
|
-
- If tests pass → workflow is nearly complete
|
|
1374
|
-
|
|
1375
|
-
## Report Format
|
|
1376
|
-
|
|
1377
|
-
```json
|
|
1378
|
-
{
|
|
1379
|
-
"skill": "testing",
|
|
1380
|
-
"status": "passed|failed|escalated",
|
|
1381
|
-
"timestamp": "<ISO-8601>",
|
|
1382
|
-
"findings": ["<finding>"],
|
|
1383
|
-
"recommendations": ["<recommendation>"],
|
|
1384
|
-
"metrics": {
|
|
1385
|
-
"testCasesGenerated": <number>,
|
|
1386
|
-
"testCasesPassed": <number>,
|
|
1387
|
-
"testCasesFailed": <number>
|
|
1388
|
-
},
|
|
1389
|
-
"nextSkill": "iteration|none"
|
|
1390
|
-
}
|
|
1391
|
-
```
|
|
1392
|
-
```
|
|
1393
|
-
|
|
1394
|
-
- [ ] **Step 4: Rewrite prompt-engineering/SKILL.md**
|
|
1395
|
-
|
|
1396
|
-
```markdown
|
|
1397
|
-
---
|
|
1398
|
-
name: prompt-engineering
|
|
1399
|
-
description: "Improves CONTEXT.md and SYSTEM.md prompts for better agent behavior. Use when workspace score is below 80, prompts need improvement, or after validation identifies content gaps."
|
|
1400
|
-
triggers: ["improve prompts", "fix content gaps", "optimize prompts", "clarify instructions"]
|
|
1401
|
-
---
|
|
1402
|
-
|
|
1403
|
-
## Overview
|
|
1404
|
-
|
|
1405
|
-
Optimize workspace prompts for clarity, completeness, and agent guidance. Prompt engineering fixes content-level issues without structural changes.
|
|
1406
|
-
|
|
1407
|
-
## When to Use
|
|
1408
|
-
|
|
1409
|
-
- Score < 80 in benchmark results
|
|
1410
|
-
- Validation identifies missing content
|
|
1411
|
-
- Prompts are vague or incomplete
|
|
1412
|
-
- Agent behavior doesn't match expectations
|
|
1413
|
-
|
|
1414
|
-
## When Not to Use
|
|
1415
|
-
|
|
1416
|
-
- For structural issues (use fixer or architecture sub-skill)
|
|
1417
|
-
- When workspace has no content yet (use worker sub-skill)
|
|
1418
|
-
- For tool installation (use tooling sub-skill)
|
|
1419
|
-
|
|
1420
|
-
## The Iron Law
|
|
1421
|
-
|
|
1422
|
-
NO COSMETIC CHANGES WITHOUT FUNCTIONAL IMPROVEMENT
|
|
1423
|
-
NO CHANGING PROMPTS WITHOUT RE-VALIDATING
|
|
1424
|
-
NO REMOVING CONTENT WITHOUT REPLACEMENT
|
|
1425
|
-
NO CLAIMING IMPROVEMENT WITHOUT SCORE CHECK
|
|
1426
|
-
|
|
1427
|
-
## The Process
|
|
1428
|
-
|
|
1429
|
-
1. **Identify weak prompts** — Read benchmark findings and validation failures
|
|
1430
|
-
2. **Analyze current prompts** — What's missing, vague, or unclear?
|
|
1431
|
-
3. **Apply prompt patterns** — Use clear structure, examples, constraints, and output formats
|
|
1432
|
-
4. **Update CONTEXT.md files** — Improve stage-specific instructions
|
|
1433
|
-
5. **Update SYSTEM.md if needed** — Improve folder map, rules, or tool inventory
|
|
1434
|
-
6. **Re-run validation** — Verify improvements didn't break anything
|
|
1435
|
-
7. **Re-run benchmark** — Check if score improved
|
|
1436
|
-
|
|
1437
|
-
## Anti-Rationalization Table
|
|
1438
|
-
|
|
1439
|
-
| Thought | Reality |
|
|
1440
|
-
|---------|---------|
|
|
1441
|
-
| "This wording change is enough" | Wording changes must produce functional improvement. |
|
|
1442
|
-
| "I'll remove vague sections" | Removing creates gaps. Improve, don't delete. |
|
|
1443
|
-
| "The score didn't change, but it's better" | If the score didn't change, it's not better. Try again. |
|
|
1444
|
-
|
|
1445
|
-
## Sub-Skill Dispatch
|
|
1446
|
-
|
|
1447
|
-
- Dispatched when score < 80
|
|
1448
|
-
- After improvements → dispatch testing to verify
|
|
1449
|
-
- If score doesn't improve → dispatch iteration for deeper fixes
|
|
1450
|
-
|
|
1451
|
-
## Report Format
|
|
1452
|
-
|
|
1453
|
-
```json
|
|
1454
|
-
{
|
|
1455
|
-
"skill": "prompt-engineering",
|
|
1456
|
-
"status": "passed|failed|escalated",
|
|
1457
|
-
"timestamp": "<ISO-8601>",
|
|
1458
|
-
"findings": ["<finding>"],
|
|
1459
|
-
"recommendations": ["<recommendation>"],
|
|
1460
|
-
"metrics": {
|
|
1461
|
-
"scoreBefore": <number>,
|
|
1462
|
-
"scoreAfter": <number>,
|
|
1463
|
-
"promptsUpdated": <number>
|
|
1464
|
-
},
|
|
1465
|
-
"nextSkill": "testing|iteration|none"
|
|
1466
|
-
}
|
|
1467
|
-
```
|
|
1468
|
-
```
|
|
1469
|
-
|
|
1470
|
-
- [ ] **Step 5: Rewrite iteration/SKILL.md**
|
|
1471
|
-
|
|
1472
|
-
```markdown
|
|
1473
|
-
---
|
|
1474
|
-
name: iteration
|
|
1475
|
-
description: "Runs autonomous improvement loops with benchmark scoring. Use when score plateaued, deeper fixes needed, or after testing identifies patterns."
|
|
1476
|
-
triggers: ["run improvement loop", "iterate on workspace", "deeper fixes", "score plateau"]
|
|
1477
|
-
---
|
|
1478
|
-
|
|
1479
|
-
## Overview
|
|
1480
|
-
|
|
1481
|
-
Execute improvement loops until quality thresholds are met. Iteration applies systematic fixes when prompt-engineering isn't enough.
|
|
1482
|
-
|
|
1483
|
-
## When to Use
|
|
1484
|
-
|
|
1485
|
-
- Score plateaued (no improvement between runs)
|
|
1486
|
-
- Testing identified patterns requiring deeper fixes
|
|
1487
|
-
- Validation failures persist after prompt-engineering
|
|
1488
|
-
- As part of the condition-driven improvement loop
|
|
1489
|
-
|
|
1490
|
-
## When Not to Use
|
|
1491
|
-
|
|
1492
|
-
- For first-pass improvements (use prompt-engineering first)
|
|
1493
|
-
- When workspace is new and untested (use testing first)
|
|
1494
|
-
- When structural changes are needed (use architecture sub-skill)
|
|
1495
|
-
|
|
1496
|
-
## The Iron Law
|
|
1497
|
-
|
|
1498
|
-
NO CLAIMING IMPROVEMENT WITHOUT RE-RUNNING BENCHMARK
|
|
1499
|
-
NO SKIPPING FIX SUGGESTIONS
|
|
1500
|
-
NO INFINITE ITERATION LOOPS
|
|
1501
|
-
NO SKIPPING ESCALATION WHEN STUCK
|
|
1502
|
-
|
|
1503
|
-
## The Process
|
|
1504
|
-
|
|
1505
|
-
1. **Run iterate.ts** — Execute `node scripts/iterate.ts --workspace <path> --max-retries 3`
|
|
1506
|
-
2. **Read benchmark results** — Parse the JSON output
|
|
1507
|
-
3. **Identify improvement areas** — Read fixSuggestions and improvementPotential
|
|
1508
|
-
4. **Apply fixes** — Address each suggestion systematically
|
|
1509
|
-
5. **Re-run iteration** — Check if score improved
|
|
1510
|
-
6. **Repeat until threshold** — Continue until score > 85 or no further improvement is possible, up to a maximum of 3 attempts
|
|
1511
|
-
7. **Escalate if stuck** — If score doesn't improve after 3 attempts, escalate to human
|
|
1512
|
-
|
|
1513
|
-
## Anti-Rationalization Table
|
|
1514
|
-
|
|
1515
|
-
| Thought | Reality |
|
|
1516
|
-
|---------|---------|
|
|
1517
|
-
| "I'll just run it again" | Without applying fixes, re-running is wasted cycles. |
|
|
1518
|
-
| "The score improved by 1 point" | Marginal improvements aren't meaningful. Target > 85. |
|
|
1519
|
-
| "I'll keep iterating until it works" | Max 3 attempts. Then escalate. |
|
|
1520
|
-
|
|
1521
|
-
## Sub-Skill Dispatch
|
|
1522
|
-
|
|
1523
|
-
- Dispatched when score plateaued
|
|
1524
|
-
- After iteration → re-run validation and benchmark
|
|
1525
|
-
- If score > 85 → workflow complete
|
|
1526
|
-
- If stuck after 3 attempts → escalate to human
|
|
1527
|
-
|
|
1528
|
-
## Report Format
|
|
1529
|
-
|
|
1530
|
-
```json
|
|
1531
|
-
{
|
|
1532
|
-
"skill": "iteration",
|
|
1533
|
-
"status": "passed|failed|escalated",
|
|
1534
|
-
"timestamp": "<ISO-8601>",
|
|
1535
|
-
"findings": ["<finding>"],
|
|
1536
|
-
"recommendations": ["<recommendation>"],
|
|
1537
|
-
"metrics": {
|
|
1538
|
-
"scoreBefore": <number>,
|
|
1539
|
-
"scoreAfter": <number>,
|
|
1540
|
-
"iterationsRun": <number>
|
|
1541
|
-
},
|
|
1542
|
-
"nextSkill": "none"
|
|
1543
|
-
}
|
|
1544
|
-
```
|
|
1545
|
-
```
|
|
1546
|
-
|
|
1547
|
-
- [ ] **Step 6: Rewrite tooling/SKILL.md**
|
|
1548
|
-
|
|
1549
|
-
```markdown
|
|
1550
|
-
---
|
|
1551
|
-
name: tooling
|
|
1552
|
-
description: "Assesses, installs, and configures tools for the workspace. Use when tools are missing, tool inventory needs updating, or workspace requires specific dependencies."
|
|
1553
|
-
triggers: ["install tools", "assess tooling", "update tool inventory", "configure dependencies"]
|
|
1554
|
-
---
|
|
1555
|
-
|
|
1556
|
-
## Overview
|
|
1557
|
-
|
|
1558
|
-
Ensure workspace has the right tools installed and configured. Tooling manages the dependency layer of the workspace.
|
|
1559
|
-
|
|
1560
|
-
## When to Use
|
|
1561
|
-
|
|
1562
|
-
- Tool inventory is empty or incomplete
|
|
1563
|
-
- Workspace requires specific dependencies
|
|
1564
|
-
- After architecture phase identifies tooling needs
|
|
1565
|
-
- When user requests specific tool installation
|
|
1566
|
-
|
|
1567
|
-
## When Not to Use
|
|
1568
|
-
|
|
1569
|
-
- For non-tool structural changes (use architecture sub-skill)
|
|
1570
|
-
- For content improvements (use prompt-engineering sub-skill)
|
|
1571
|
-
- When no tools are needed (skip tooling phase)
|
|
1572
|
-
|
|
1573
|
-
## The Iron Law
|
|
1574
|
-
|
|
1575
|
-
NO INSTALLING TOOLS WITHOUT USER APPROVAL
|
|
1576
|
-
NO SKIPPING TOOL INVENTORY UPDATES
|
|
1577
|
-
NO INSTALLING UNNECESSARY TOOLS
|
|
1578
|
-
NO SKIPPING VERIFICATION AFTER INSTALLATION
|
|
1579
|
-
|
|
1580
|
-
## The Process
|
|
1581
|
-
|
|
1582
|
-
1. **Scan current tools** — Read SYSTEM.md tool inventory
|
|
1583
|
-
2. **Identify missing tools** — Compare against workspace requirements
|
|
1584
|
-
3. **Propose tools** — List recommended tools with justifications
|
|
1585
|
-
4. **Get approval** — Present tool list to user for approval
|
|
1586
|
-
5. **Install tools** — Run `node scripts/install-tool.ts --tool <name> --manager <mgr> --workspace <path>`
|
|
1587
|
-
6. **Update inventory** — Confirm each installed tool was added to the SYSTEM.md tool inventory
|
|
1588
|
-
7. **Verify installation** — Confirm tools are accessible
|
|
1589
|
-
|
|
1590
|
-
## Anti-Rationalization Table
|
|
1591
|
-
|
|
1592
|
-
| Thought | Reality |
|
|
1593
|
-
|---------|---------|
|
|
1594
|
-
| "This tool might be useful" | Might is not enough. Justify each tool against workspace needs. |
|
|
1595
|
-
| "I'll install it now and tell the user later" | User approval comes before installation. Always. |
|
|
1596
|
-
| "The installation probably worked" | Probably is not verified. Check. |
|
|
1597
|
-
|
|
1598
|
-
## Sub-Skill Dispatch
|
|
1599
|
-
|
|
1600
|
-
- Dispatched when tools are missing
|
|
1601
|
-
- After installation → workflow continues to next phase
|
|
1602
|
-
- If tool installation fails → escalate to human
|
|
1603
|
-
|
|
1604
|
-
## Report Format
|
|
1605
|
-
|
|
1606
|
-
```json
|
|
1607
|
-
{
|
|
1608
|
-
"skill": "tooling",
|
|
1609
|
-
"status": "passed|failed|escalated",
|
|
1610
|
-
"timestamp": "<ISO-8601>",
|
|
1611
|
-
"findings": ["<finding>"],
|
|
1612
|
-
"recommendations": ["<recommendation>"],
|
|
1613
|
-
"metrics": {
|
|
1614
|
-
"toolsInstalled": <number>,
|
|
1615
|
-
"toolsProposed": <number>,
|
|
1616
|
-
"toolsFailed": <number>
|
|
1617
|
-
},
|
|
1618
|
-
"nextSkill": "none"
|
|
1619
|
-
}
|
|
1620
|
-
```
|
|
1621
|
-
```
|
|
1622
|
-
|
|
1623
|
-
- [ ] **Step 7: Commit**
|
|
1624
|
-
|
|
1625
|
-
```bash
|
|
1626
|
-
git add templates/.workspace-templates/skills/research/SKILL.md templates/.workspace-templates/skills/architecture/SKILL.md templates/.workspace-templates/skills/testing/SKILL.md templates/.workspace-templates/skills/prompt-engineering/SKILL.md templates/.workspace-templates/skills/iteration/SKILL.md templates/.workspace-templates/skills/tooling/SKILL.md
|
|
1627
|
-
git commit -m "refactor(sub-skills): rewrite all 6 remaining sub-skills with obra patterns"
|
|
1628
|
-
```
|
|
1629
|
-
|
|
1630
|
-
---
|
|
1631
|
-
|
|
1632
|
-
### Task 9: Update Main SKILL.md with Autonomous Iteration Workflow
|
|
1633
|
-
|
|
1634
|
-
**Files:**
|
|
1635
|
-
- Modify: `templates/SKILL.md`
|
|
1636
|
-
|
|
1637
|
-
- [ ] **Step 1: Rewrite templates/SKILL.md with new workflow section**
|
|
1638
|
-
|
|
1639
|
-
Replace the entire file content with:
|
|
1640
|
-
|
|
1641
|
-
```markdown
|
|
1642
|
-
---
|
|
1643
|
-
name: workspace-maxxing
|
|
1644
|
-
description: "Autonomously creates, validates, and improves ICM-compliant workspaces using batched parallel sub-agents. Use when user asks to 'build a workspace', 'create a workflow', 'automate a process', 'improve this workspace', 'validate this workspace', 'iterate on this workspace', or 'run test cases'."
|
|
1645
|
-
---
|
|
1646
|
-
|
|
1647
|
-
# Workspace-Maxxing Skill
|
|
1648
|
-
|
|
1649
|
-
## Overview
|
|
1650
|
-
|
|
1651
|
-
Autonomous workflow system that creates, validates, and improves ICM-compliant workspaces through phased execution, batched parallel sub-agent iteration, and condition-driven improvement loops.
|
|
1652
|
-
|
|
1653
|
-
## When to Use
|
|
1654
|
-
|
|
1655
|
-
- User asks to build, create, or automate a workflow
|
|
1656
|
-
- User asks to improve, validate, or iterate on an existing workspace
|
|
1657
|
-
- User asks for workspace architecture or structure design
|
|
1658
|
-
- User asks to assess or install tools for a workspace
|
|
1659
|
-
- User asks to run test cases against a workspace
|
|
1660
|
-
|
|
1661
|
-
## When Not to Use
|
|
1662
|
-
|
|
1663
|
-
- Simple file creation or editing (use direct file operations)
|
|
1664
|
-
- Questions about ICM methodology (answer directly)
|
|
1665
|
-
- Non-workspace tasks (check for other applicable skills first)
|
|
1666
|
-
|
|
1667
|
-
## The Iron Law
|
|
1668
|
-
|
|
1669
|
-
NO BUILD WITHOUT PLAN
|
|
1670
|
-
NO PLAN WITHOUT RESEARCH
|
|
1671
|
-
NO IMPROVEMENT WITHOUT VALIDATION
|
|
1672
|
-
NO COMPLETION CLAIM WITHOUT VERIFICATION
|
|
1673
|
-
|
|
1674
|
-
## Hybrid Flow
|
|
1675
|
-
|
|
1676
|
-
```
|
|
1677
|
-
Phase 1: RESEARCH (dispatch research sub-skill)
|
|
1678
|
-
↓
|
|
1679
|
-
Phase 2: ARCHITECTURE (dispatch architecture sub-skill)
|
|
1680
|
-
↓
|
|
1681
|
-
Phase 3: BUILD (use scaffold.ts script)
|
|
1682
|
-
↓
|
|
1683
|
-
Phase 4: VALIDATE (dispatch validation sub-skill)
|
|
1684
|
-
↓
|
|
1685
|
-
Phase 5: AUTONOMOUS ITERATION (use orchestrator.ts)
|
|
1686
|
-
├─ Generate test cases
|
|
1687
|
-
├─ Split into batches
|
|
1688
|
-
├─ Dispatch workers in parallel per batch
|
|
1689
|
-
├─ Validate batch results
|
|
1690
|
-
├─ If score < threshold → fix loop → re-validate
|
|
1691
|
-
└─ Next batch or complete
|
|
1692
|
-
↓
|
|
1693
|
-
Phase 6: DELIVER
|
|
1694
|
-
```
|
|
1695
|
-
|
|
1696
|
-
## Autonomous Iteration Workflow
|
|
1697
|
-
|
|
1698
|
-
The orchestrator manages batched parallel sub-agent execution:
|
|
1699
|
-
|
|
1700
|
-
```bash
|
|
1701
|
-
node scripts/orchestrator.ts --workspace ./workspace --batch-size 3 --score-threshold 85
|
|
1702
|
-
```
|
|
1703
|
-
|
|
1704
|
-
**Flow:**
|
|
1705
|
-
1. Generate test cases from workspace stages
|
|
1706
|
-
2. Split into batches (default 3 per batch)
|
|
1707
|
-
3. Dispatch worker sub-agents in parallel for each batch
|
|
1708
|
-
4. Validate batch outputs with benchmark scoring
|
|
1709
|
-
5. If batch score < threshold → dispatch fixer sub-agents → re-validate (max 3 retries)
|
|
1710
|
-
6. Move to next batch or write summary
|
|
1711
|
-
|
|
1712
|
-
**Options:**
|
|
1713
|
-
- `--batch-size <n>` — Test cases per batch (default: 3)
|
|
1714
|
-
- `--score-threshold <n>` — Minimum batch score to pass (default: 85)
|
|
1715
|
-
- `--max-fix-retries <n>` — Max fix attempts per batch (default: 3)
|
|
1716
|
-
- `--worker-timeout <s>` — Worker timeout in seconds (default: 300)
|
|
1717
|
-
|
|
1718
|
-
## Sub-Skill Dispatch
|
|
1719
|
-
|
|
1720
|
-
| Condition | Sub-Skill | Command |
|
|
1721
|
-
|-----------|-----------|---------|
|
|
1722
|
-
| Starting new workflow | `research` | `node scripts/dispatch.ts --skill research --workspace ./workspace` |
|
|
1723
|
-
| After research complete | `architecture` | `node scripts/dispatch.ts --skill architecture --workspace ./workspace` |
|
|
1724
|
-
| After architecture approved | (use scaffold.ts) | `node scripts/scaffold.ts --name "<name>" --stages "<stages>" --output ./workspace` |
|
|
1725
|
-
| After building | `validation` | `node scripts/dispatch.ts --skill validation --workspace ./workspace` |
|
|
1726
|
-
| Running autonomous iteration | (use orchestrator.ts) | `node scripts/orchestrator.ts --workspace ./workspace` |
|
|
1727
|
-
| Worker execution | `worker` | `node scripts/dispatch.ts --skill worker --workspace ./workspace --batch-id <N>` |
|
|
1728
|
-
| Fix loop | `fixer` | `node scripts/dispatch.ts --skill fixer --workspace ./workspace --batch-id <N>` |
|
|
1729
|
-
| Score < 80 | `prompt-engineering` | `node scripts/dispatch.ts --skill prompt-engineering --workspace ./workspace` |
|
|
1730
|
-
| No tests exist | `testing` | `node scripts/dispatch.ts --skill testing --workspace ./workspace` |
|
|
1731
|
-
| Score plateaued | `iteration` | `node scripts/dispatch.ts --skill iteration --workspace ./workspace` |
|
|
1732
|
-
| Tools missing | `tooling` | `node scripts/dispatch.ts --skill tooling --workspace ./workspace` |
|
|
1733
|
-
|
|
1734
|
-
## Available Scripts
|
|
1735
|
-
|
|
1736
|
-
### orchestrator.ts — Autonomous Batch Iteration
|
|
1737
|
-
|
|
1738
|
-
Runs the full batched parallel sub-agent workflow.
|
|
1739
|
-
|
|
1740
|
-
```bash
|
|
1741
|
-
node scripts/orchestrator.ts --workspace ./workspace --batch-size 3 --score-threshold 85
|
|
1742
|
-
```
|
|
1743
|
-
|
|
1744
|
-
### scaffold.ts — Generate ICM Workspace
|
|
1745
|
-
|
|
1746
|
-
Creates a complete ICM workspace structure from a plan.
|
|
1747
|
-
|
|
1748
|
-
```bash
|
|
1749
|
-
node scripts/scaffold.ts --name "research" --stages "01-research,02-analysis,03-report" --output ./workspace
|
|
1750
|
-
```
|
|
1751
|
-
|
|
1752
|
-
### validate.ts — Check ICM Compliance
|
|
1753
|
-
|
|
1754
|
-
Validates a workspace against ICM rules.
|
|
1755
|
-
|
|
1756
|
-
```bash
|
|
1757
|
-
node scripts/validate.ts --workspace ./workspace
|
|
1758
|
-
```
|
|
1759
|
-
|
|
1760
|
-
### install-tool.ts — Install Packages
|
|
1761
|
-
|
|
1762
|
-
Installs a tool and updates the workspace inventory.
|
|
1763
|
-
|
|
1764
|
-
```bash
|
|
1765
|
-
node scripts/install-tool.ts --tool "pdf-lib" --manager npm --workspace ./workspace
|
|
1766
|
-
```
|
|
1767
|
-
|
|
1768
|
-
### iterate.ts — Single-Workspace Iteration (legacy)
|
|
1769
|
-
|
|
1770
|
-
Runs a 3-pass improvement loop. Use orchestrator.ts for batched parallel iteration.
|
|
1771
|
-
|
|
1772
|
-
```bash
|
|
1773
|
-
node scripts/iterate.ts --workspace ./workspace --max-retries 3
|
|
1774
|
-
```
|
|
1775
|
-
|
|
1776
|
-
### generate-tests.ts — Generate Test Cases
|
|
1777
|
-
|
|
1778
|
-
Creates test cases for each stage (sample, edge-case, empty).
|
|
1779
|
-
|
|
1780
|
-
```bash
|
|
1781
|
-
node scripts/generate-tests.ts --workspace ./workspace --output ./tests.json
|
|
1782
|
-
```
|
|
1783
|
-
|
|
1784
|
-
### benchmark.ts — Weighted Benchmark Scoring
|
|
1785
|
-
|
|
1786
|
-
Runs weighted benchmark scoring on a workspace.
|
|
1787
|
-
|
|
1788
|
-
```bash
|
|
1789
|
-
node scripts/benchmark.ts --workspace ./workspace
|
|
1790
|
-
```
|
|
1791
|
-
|
|
1792
|
-
### dispatch.ts — Sub-Skill Dispatcher
|
|
1793
|
-
|
|
1794
|
-
Loads and executes sub-skill workflows. Supports parallel dispatch.
|
|
1795
|
-
|
|
1796
|
-
```bash
|
|
1797
|
-
node scripts/dispatch.ts --skill <name> --workspace ./workspace [--parallel --invocations <path>]
|
|
1798
|
-
```
|
|
1799
|
-
|
|
1800
|
-
## Anti-Rationalization Table
|
|
1801
|
-
|
|
1802
|
-
| Thought | Reality |
|
|
1803
|
-
|---------|---------|
|
|
1804
|
-
| "This workspace looks good enough" | Good enough is the enemy of excellent. Run validation. |
|
|
1805
|
-
| "I'll skip research and go straight to building" | Building without research produces generic, non-optimal workspaces. |
|
|
1806
|
-
| "The user didn't ask for tests" | Autonomous workflows require self-verification. Tests are mandatory. |
|
|
1807
|
-
| "I'll fix this later" | Later never comes. Fix it now or escalate. |
|
|
1808
|
-
| "This sub-skill doesn't apply here" | If there's a 1% chance it applies, dispatch it. |
|
|
1809
|
-
| "The score is fine" | Fine is not good. Target > 85. |
|
|
1810
|
-
| "I already validated this" | Validation is a snapshot. Re-validate after every change. |
|
|
1811
|
-
| "I'll do all phases at once" | Phases exist for a reason. Complete each before moving to the next. |
|
|
1812
|
-
|
|
1813
|
-
## Integration
|
|
1814
|
-
|
|
1815
|
-
- Sub-skills live in `skills/` directory, loaded via dispatch.ts
|
|
1816
|
-
- Shared references in `references/` directory (anti-patterns, reporting-format, iron-laws)
|
|
1817
|
-
- All sub-skills return structured JSON reports
|
|
1818
|
-
- Orchestrator manages batch lifecycle with fix loops
|
|
1819
|
-
- Condition loop continues until score > 85 AND all validations pass
|
|
1820
|
-
- Escalate to human if stuck after 3 iteration attempts
|
|
1821
|
-
|
|
1822
|
-
## ICM Rules
|
|
1823
|
-
- Canonical sources: each fact lives in exactly one file
|
|
1824
|
-
- One-way dependencies only: A → B, never B → A
|
|
1825
|
-
- Selective loading: route to sections, not whole files
|
|
1826
|
-
- Numbered folders for workflow stages
|
|
1827
|
-
|
|
1828
|
-
## Output Format
|
|
1829
|
-
- workspace/ — the built workspace
|
|
1830
|
-
- .agents/skills/<workspace-name>/ — installable skill
|
|
1831
|
-
- USAGE.md — how to use this workspace in future sessions
|
|
1832
|
-
- .agents/iteration/summary.json — autonomous iteration results
|
|
1833
|
-
```
|
|
1834
|
-
|
|
1835
|
-
- [ ] **Step 2: Commit**
|
|
1836
|
-
|
|
1837
|
-
```bash
|
|
1838
|
-
git add templates/SKILL.md
|
|
1839
|
-
git commit -m "feat(SKILL.md): add autonomous iteration workflow section"
|
|
1840
|
-
```
|
|
1841
|
-
|
|
1842
|
-
---
|
|
1843
|
-
|
|
1844
|
-
### Task 10: Full Integration Test
|
|
1845
|
-
|
|
1846
|
-
**Files:**
|
|
1847
|
-
- Modify: `tests/integration.test.ts`
|
|
1848
|
-
|
|
1849
|
-
- [ ] **Step 1: Add integration test for orchestrator batch lifecycle**
|
|
1850
|
-
|
|
1851
|
-
Add to `tests/integration.test.ts`:
|
|
1852
|
-
|
|
1853
|
-
```typescript
|
|
1854
|
-
describe('orchestrator integration', () => {
|
|
1855
|
-
it('runs full batch lifecycle on a valid workspace', () => {
|
|
1856
|
-
const ws = createBasicWorkspace();
|
|
1857
|
-
const orchestratorPath = path.join(__dirname, '..', 'dist', 'scripts', 'orchestrator.js');
|
|
1858
|
-
|
|
1859
|
-
    const stdout = execSync(`node "${orchestratorPath}" --workspace "${ws}" --batch-size 2`, {
|
|
1860
|
-
encoding: 'utf-8',
|
|
1861
|
-
});
|
|
1862
|
-
|
|
1863
|
-
const result = JSON.parse(stdout);
|
|
1864
|
-
expect(result.totalBatches).toBeGreaterThan(0);
|
|
1865
|
-
expect(result.batchReports).toBeDefined();
|
|
1866
|
-
});
|
|
1867
|
-
|
|
1868
|
-
it('writes summary.json to iteration directory', () => {
|
|
1869
|
-
const ws = createBasicWorkspace();
|
|
1870
|
-
const orchestratorPath = path.join(__dirname, '..', 'dist', 'scripts', 'orchestrator.js');
|
|
1871
|
-
|
|
1872
|
-
execSync(`node "${orchestratorPath}" --workspace "${ws}" --batch-size 2`, {
|
|
1873
|
-
encoding: 'utf-8',
|
|
1874
|
-
});
|
|
1875
|
-
|
|
1876
|
-
const summaryPath = path.join(ws, '.agents', 'iteration', 'summary.json');
|
|
1877
|
-
expect(fs.existsSync(summaryPath)).toBe(true);
|
|
1878
|
-
|
|
1879
|
-
const summary = JSON.parse(fs.readFileSync(summaryPath, 'utf-8'));
|
|
1880
|
-
expect(summary.totalBatches).toBeDefined();
|
|
1881
|
-
expect(summary.timestamp).toBeDefined();
|
|
1882
|
-
});
|
|
1883
|
-
});
|
|
1884
|
-
```
|
|
1885
|
-
|
|
1886
|
-
- [ ] **Step 2: Build and run all tests**
|
|
1887
|
-
|
|
1888
|
-
Run: `npm run build && npm test`
|
|
1889
|
-
Expected: All tests pass
|
|
1890
|
-
|
|
1891
|
-
- [ ] **Step 3: Commit**
|
|
1892
|
-
|
|
1893
|
-
```bash
|
|
1894
|
-
git add tests/integration.test.ts
|
|
1895
|
-
git commit -m "test(integration): add orchestrator batch lifecycle tests"
|
|
1896
|
-
```
|
|
1897
|
-
|
|
1898
|
-
---
|
|
1899
|
-
|
|
1900
|
-
### Task 11: Run Full Test Suite & Verify
|
|
1901
|
-
|
|
1902
|
-
- [ ] **Step 1: Run full test suite**
|
|
1903
|
-
|
|
1904
|
-
Run: `npm test`
|
|
1905
|
-
Expected: All tests pass (114+ existing + new tests)
|
|
1906
|
-
|
|
1907
|
-
- [ ] **Step 2: Build**
|
|
1908
|
-
|
|
1909
|
-
Run: `npm run build`
|
|
1910
|
-
Expected: Clean build, no errors
|
|
1911
|
-
|
|
1912
|
-
- [ ] **Step 3: Verify all sub-skill files exist**
|
|
1913
|
-
|
|
1914
|
-
Run: `ls templates/.workspace-templates/skills/*/SKILL.md`
|
|
1915
|
-
Expected: All 9 sub-skills listed (research, architecture, validation, prompt-engineering, testing, iteration, tooling, worker, fixer)
|
|
1916
|
-
|
|
1917
|
-
- [ ] **Step 4: Final commit if needed**
|
|
1918
|
-
|
|
1919
|
-
```bash
|
|
1920
|
-
git status
|
|
1921
|
-
git add -A
|
|
1922
|
-
git commit -m "chore: final verification and cleanup"
|
|
1923
|
-
```
|