deepflow 0.1.83 → 0.1.84
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install.js +21 -7
- package/package.json +1 -1
- package/src/agents/reasoner.md +1 -0
- package/src/commands/df/auto-cycle.md +244 -11
- package/src/commands/df/auto.md +5 -0
- package/src/commands/df/consolidate.md +8 -0
- package/src/commands/df/debate.md +6 -0
- package/src/commands/df/discover.md +6 -0
- package/src/commands/df/execute.md +265 -1
- package/src/commands/df/note.md +5 -0
- package/src/commands/df/plan.md +89 -15
- package/src/commands/df/resume.md +11 -5
- package/src/commands/df/spec.md +5 -0
- package/src/commands/df/update.md +5 -0
- package/src/commands/df/verify.md +8 -2
- package/src/skills/atomic-commits/SKILL.md +3 -2
- package/src/skills/browse-fetch/SKILL.md +2 -0
- package/src/skills/browse-verify/SKILL.md +1 -0
- package/src/skills/code-completeness/SKILL.md +1 -0
- package/src/skills/gap-discovery/SKILL.md +1 -0
- package/templates/explore-agent.md +68 -3
package/bin/install.js
CHANGED
|
@@ -259,14 +259,19 @@ async function configureHooks(claudeDir) {
|
|
|
259
259
|
|
|
260
260
|
// Configure statusline
|
|
261
261
|
if (settings.statusLine) {
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
262
|
+
if (process.stdin.isTTY) {
|
|
263
|
+
const answer = await ask(
|
|
264
|
+
` ${c.yellow}!${c.reset} Existing statusLine found. Replace with deepflow? [y/N] `
|
|
265
|
+
);
|
|
266
|
+
if (answer.toLowerCase() === 'y') {
|
|
267
|
+
settings.statusLine = { type: 'command', command: statuslineCmd };
|
|
268
|
+
log('Statusline configured');
|
|
269
|
+
} else {
|
|
270
|
+
console.log(` ${c.yellow}!${c.reset} Skipped statusline configuration`);
|
|
271
|
+
}
|
|
268
272
|
} else {
|
|
269
|
-
|
|
273
|
+
// Non-interactive (e.g. Claude Code bash tool) — skip prompt, keep existing
|
|
274
|
+
console.log(` ${c.yellow}!${c.reset} Existing statusLine found — kept (non-interactive mode)`);
|
|
270
275
|
}
|
|
271
276
|
} else {
|
|
272
277
|
settings.statusLine = { type: 'command', command: statuslineCmd };
|
|
@@ -407,6 +412,11 @@ function ask(question) {
|
|
|
407
412
|
}
|
|
408
413
|
|
|
409
414
|
async function askInstallLevel(prompt) {
|
|
415
|
+
if (!process.stdin.isTTY) {
|
|
416
|
+
// Non-interactive — default to global
|
|
417
|
+
console.log(`${c.dim}Non-interactive mode — defaulting to global install${c.reset}`);
|
|
418
|
+
return 'global';
|
|
419
|
+
}
|
|
410
420
|
console.log(prompt);
|
|
411
421
|
console.log('');
|
|
412
422
|
console.log(` ${c.cyan}1${c.reset}) Global ${c.dim}(~/.claude/ - available in all projects)${c.reset}`);
|
|
@@ -455,6 +465,10 @@ async function uninstall() {
|
|
|
455
465
|
const CLAUDE_DIR = level === 'global' ? GLOBAL_DIR : PROJECT_DIR;
|
|
456
466
|
const levelLabel = level === 'global' ? 'global' : 'project';
|
|
457
467
|
|
|
468
|
+
if (!process.stdin.isTTY) {
|
|
469
|
+
console.log('Uninstall requires interactive mode. Run from a terminal.');
|
|
470
|
+
return;
|
|
471
|
+
}
|
|
458
472
|
const confirm = await ask(`Remove ${levelLabel} installation from ${CLAUDE_DIR}? [y/N] `);
|
|
459
473
|
if (confirm.toLowerCase() !== 'y') {
|
|
460
474
|
console.log('Cancelled.');
|
package/package.json
CHANGED
package/src/agents/reasoner.md
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:auto-cycle
|
|
3
|
+
description: Execute one task from PLAN.md with ratchet health checks and state tracking for autonomous mode
|
|
4
|
+
---
|
|
5
|
+
|
|
1
6
|
# /df:auto-cycle — Single Cycle of Auto Mode
|
|
2
7
|
|
|
3
8
|
## Purpose
|
|
@@ -22,6 +27,10 @@ Load: PLAN.md (required)
|
|
|
22
27
|
Load: .deepflow/auto-memory.yaml (optional — cross-cycle state, ignore if missing)
|
|
23
28
|
```
|
|
24
29
|
|
|
30
|
+
Shell injection (use output directly — no manual file reads needed):
|
|
31
|
+
- `` !`cat PLAN.md 2>/dev/null || echo 'NOT_FOUND'` ``
|
|
32
|
+
- `` !`cat .deepflow/auto-memory.yaml 2>/dev/null || echo 'NOT_FOUND'` ``
|
|
33
|
+
|
|
25
34
|
**auto-memory.yaml full schema:**
|
|
26
35
|
|
|
27
36
|
```yaml
|
|
@@ -36,13 +45,37 @@ consecutive_reverts: # written by circuit breaker (step 3.5)
|
|
|
36
45
|
T2: 2
|
|
37
46
|
probe_learnings:
|
|
38
47
|
- { spike: T1, probe: "streaming", insight: "discovered hidden dependency on fs.watch" }
|
|
48
|
+
optimize_state: # present only when an optimize task is active or was completed
|
|
49
|
+
task_id: "T{n}"
|
|
50
|
+
metric_command: "{shell command}"
|
|
51
|
+
target: {number}
|
|
52
|
+
direction: "higher|lower"
|
|
53
|
+
baseline: null # float; set on first measure
|
|
54
|
+
current_best: null # best metric value seen
|
|
55
|
+
best_commit: null # short commit hash of best value
|
|
56
|
+
cycles_run: 0
|
|
57
|
+
cycles_without_improvement: 0
|
|
58
|
+
consecutive_reverts: 0 # optimize-specific revert counter (separate from global)
|
|
59
|
+
probe_scale: 0 # 0=no probes yet, 2/4/6
|
|
60
|
+
max_cycles: {number}
|
|
61
|
+
history: [] # [{cycle, value, delta_pct, kept: bool, commit}]
|
|
62
|
+
failed_hypotheses: [] # ["{description}"] — written to experiments/ on completion
|
|
39
63
|
```
|
|
40
64
|
|
|
41
65
|
Each section is optional. Missing keys are treated as empty. The file is created on first write if absent.
|
|
42
66
|
|
|
43
67
|
### 2. PICK NEXT TASK
|
|
44
68
|
|
|
45
|
-
|
|
69
|
+
**Optimize-active override:** Before scanning PLAN.md, check `auto-memory.yaml` for `optimize_state.task_id`. If present and the corresponding task is still `[ ]` in PLAN.md, resume that task immediately — skip the normal `[ ]` scan. This ensures optimize tasks survive context exhaustion and resume across cycles.
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
If optimize_state.task_id exists in auto-memory.yaml:
|
|
73
|
+
→ Look up that task_id in PLAN.md
|
|
74
|
+
→ If the task is still [ ] → select it (override normal scan)
|
|
75
|
+
→ If the task is [x] → clear optimize_state.task_id and fall through to normal scan
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Otherwise, scan PLAN.md for the first `[ ]` task where all "Blocked by:" dependencies are `[x]`:
|
|
46
79
|
|
|
47
80
|
```
|
|
48
81
|
For each [ ] task in PLAN.md (top to bottom):
|
|
@@ -85,7 +118,7 @@ This handles worktree creation, agent spawning, ratchet health checks, and commi
|
|
|
85
118
|
|
|
86
119
|
After `/df:execute` returns, record the task result in `.deepflow/auto-memory.yaml`:
|
|
87
120
|
|
|
88
|
-
**On success (ratchet passed):**
|
|
121
|
+
**On success (ratchet passed — non-optimize task):**
|
|
89
122
|
|
|
90
123
|
```yaml
|
|
91
124
|
# Set task_results[task_id] = success entry
|
|
@@ -93,7 +126,7 @@ task_results:
|
|
|
93
126
|
{task_id}: { status: success, commit: {short_hash}, cycle: {cycle_number} }
|
|
94
127
|
```
|
|
95
128
|
|
|
96
|
-
**On revert (ratchet failed):**
|
|
129
|
+
**On revert (ratchet failed — non-optimize task):**
|
|
97
130
|
|
|
98
131
|
```yaml
|
|
99
132
|
# Set task_results[task_id] = reverted entry
|
|
@@ -105,6 +138,20 @@ revert_history:
|
|
|
105
138
|
- { task: {task_id}, cycle: {cycle_number}, reason: "{ratchet failure summary}" }
|
|
106
139
|
```
|
|
107
140
|
|
|
141
|
+
**On optimize cycle result** (task has `Optimize:` block — execute.md section 5.9 handles the inner cycle; auto-cycle only updates the outer state here):
|
|
142
|
+
|
|
143
|
+
After each optimize cycle reported by `/df:execute`:
|
|
144
|
+
|
|
145
|
+
```yaml
|
|
146
|
+
# Merge updated optimize_state written by execute into auto-memory.yaml
|
|
147
|
+
# execute already persists optimize_state after each cycle (5.9.5) — confirm it was written
|
|
148
|
+
# cycles_run is incremented by execute — echo its value here (do not increment it again); it feeds the report summary
|
|
149
|
+
optimize_state:
|
|
150
|
+
cycles_run: {N} # echoed from execute's optimize_state
|
|
151
|
+
current_best: {value}
|
|
152
|
+
history: [...] # full history from execute's optimize_state
|
|
153
|
+
```
|
|
154
|
+
|
|
108
155
|
Read the current file first (create if missing), merge the new values, and write back. Preserve all existing keys.
|
|
109
156
|
|
|
110
157
|
### 3.6. CIRCUIT BREAKER
|
|
@@ -126,7 +173,7 @@ What does NOT count as a failure:
|
|
|
126
173
|
- L5 ⚠ (passed on retry): treated as pass, resets counter
|
|
127
174
|
```
|
|
128
175
|
|
|
129
|
-
**On revert (ratchet failed — any of L0 ✗, L1 ✗, L2 ✗, L4 ✗, L5 ✗, or L5 ✗ flaky):**
|
|
176
|
+
**On revert (ratchet failed — any of L0 ✗, L1 ✗, L2 ✗, L4 ✗, L5 ✗, or L5 ✗ flaky — non-optimize task):**
|
|
130
177
|
|
|
131
178
|
```
|
|
132
179
|
1. Read .deepflow/auto-memory.yaml (create if missing)
|
|
@@ -141,12 +188,37 @@ What does NOT count as a failure:
|
|
|
141
188
|
→ Continue to step 4 (UPDATE REPORT) as normal
|
|
142
189
|
```
|
|
143
190
|
|
|
144
|
-
**On success (ratchet passed — including L5 — no frontend or L5 ⚠ pass-on-retry):**
|
|
191
|
+
**On success (ratchet passed — including L5 — no frontend or L5 ⚠ pass-on-retry — non-optimize task):**
|
|
145
192
|
|
|
146
193
|
```
|
|
147
194
|
1. Reset consecutive_reverts[task_id] to 0 in .deepflow/auto-memory.yaml
|
|
148
195
|
```
|
|
149
196
|
|
|
197
|
+
**Optimize stop conditions** (task has `Optimize:` block — checked after every optimize cycle result from execute):
|
|
198
|
+
|
|
199
|
+
Execute (section 5.9.3) handles the inner-cycle circuit breaker inside the optimize loop. At the auto-cycle level, watch for these terminal outcomes reported by `/df:execute`:
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
1. "target reached: {value}"
|
|
203
|
+
→ Mark task [x] (execute already did this — confirm)
|
|
204
|
+
→ Write optimize completion (step 3.7)
|
|
205
|
+
→ Report: "Optimize complete: target reached — {value} (target: {target})"
|
|
206
|
+
→ Continue to step 4
|
|
207
|
+
|
|
208
|
+
2. "max cycles reached, best: {current_best}"
|
|
209
|
+
→ Mark task [x] (execute already did this — confirm)
|
|
210
|
+
→ Write optimize completion (step 3.7)
|
|
211
|
+
→ Report: "Optimize complete: max cycles reached — best: {current_best} (target: {target})"
|
|
212
|
+
→ Continue to step 4
|
|
213
|
+
|
|
214
|
+
3. "circuit breaker: 3 consecutive reverts"
|
|
215
|
+
→ Task stays [ ] — do NOT mark [x]
|
|
216
|
+
→ Write optimize failure to experiments/ (step 3.7)
|
|
217
|
+
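→ Clear only optimize_state.task_id so the next cycle does not auto-resume the task; keep the rest of optimize_state for diagnosis (task stays [ ] for manual intervention)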
|
|
218
|
+
→ Report: "Circuit breaker tripped (optimize): T{n} halted after 3 consecutive reverts. Resolve manually."
|
|
219
|
+
→ Halt (exit without scheduling next cycle)
|
|
220
|
+
```
|
|
221
|
+
|
|
150
222
|
**auto-memory.yaml schema for the circuit breaker:**
|
|
151
223
|
|
|
152
224
|
```yaml
|
|
@@ -161,6 +233,43 @@ consecutive_reverts:
|
|
|
161
233
|
circuit_breaker_threshold: 3 # halt after this many consecutive reverts on the same task
|
|
162
234
|
```
|
|
163
235
|
|
|
236
|
+
### 3.7. OPTIMIZE COMPLETION
|
|
237
|
+
|
|
238
|
+
When an optimize task reaches a terminal stop condition (target reached, max cycles, or circuit breaker):
|
|
239
|
+
|
|
240
|
+
**On target reached or max cycles (task [x]):**
|
|
241
|
+
|
|
242
|
+
```
|
|
243
|
+
1. Read optimize_state.failed_hypotheses from .deepflow/auto-memory.yaml
|
|
244
|
+
2. For each failed hypothesis, write to .deepflow/experiments/:
|
|
245
|
+
File: {spec}--optimize-{task_id}--{slug}--failed.md
|
|
246
|
+
Content:
|
|
247
|
+
# Failed Hypothesis: {description}
|
|
248
|
+
Task: {task_id} Spec: {spec_name} Cycle: {cycle_N}
|
|
249
|
+
Metric before: {value_before} Metric after: {value_after}
|
|
250
|
+
Reason: {why it was reverted}
|
|
251
|
+
3. Write a summary experiment file for the optimize run:
|
|
252
|
+
File: {spec}--optimize-{task_id}--summary--{status}.md
|
|
253
|
+
Content:
|
|
254
|
+
# Optimize Summary: {task_id}
|
|
255
|
+
Metric: {metric_command} Target: {target} Direction: {direction}
|
|
256
|
+
Baseline: {baseline} Best achieved: {current_best} Final: {final_value}
|
|
257
|
+
Cycles run: {cycles_run} Status: {reached|max_cycles}
|
|
258
|
+
History (all cycles):
|
|
259
|
+
| Cycle | Value | Delta | Kept | Commit |
|
|
260
|
+
...
|
|
261
|
+
4. Clear optimize_state from .deepflow/auto-memory.yaml (set to null or remove key)
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
**On circuit breaker halt:**
|
|
265
|
+
|
|
266
|
+
```
|
|
267
|
+
1. Write failed_hypotheses to .deepflow/experiments/ (same as above)
|
|
268
|
+
2. Write summary experiment file with status: circuit_breaker
|
|
269
|
+
3. Preserve optimize_state in auto-memory.yaml (do NOT clear — enables human diagnosis)
|
|
270
|
+
Add note: "halted: circuit_breaker — requires manual intervention"
|
|
271
|
+
```
|
|
272
|
+
|
|
164
273
|
### 4. UPDATE REPORT
|
|
165
274
|
|
|
166
275
|
Write a comprehensive report to `.deepflow/auto-report.md` after every cycle. The file is appended each cycle — never overwritten. Each cycle adds its row to the per-cycle log table and updates the running summary counts.
|
|
@@ -181,13 +290,16 @@ _Last updated: {YYYY-MM-DDTHH:MM:SSZ}_
|
|
|
181
290
|
| Total cycles run | {N} |
|
|
182
291
|
| Tasks committed | {N} |
|
|
183
292
|
| Tasks reverted | {N} |
|
|
293
|
+
| Optimize cycles run | {N} | ← present only when optimize tasks exist in PLAN.md
|
|
294
|
+
| Optimize best value | {value} / {target} | ← present only when optimize tasks exist
|
|
184
295
|
|
|
185
296
|
## Cycle Log
|
|
186
297
|
|
|
187
|
-
| Cycle | Task | Status | Commit / Revert | Delta | Reason | Timestamp |
|
|
188
|
-
|
|
189
|
-
| 1 | T1 | passed | abc1234 | tests: 24→24, build: ok | — | 2025-01-15T10:00:00Z |
|
|
190
|
-
| 2 | T2 | failed | reverted | tests: 24→22 (−2) | tests failed: 2 of 24 | 2025-01-15T10:05:00Z |
|
|
298
|
+
| Cycle | Task | Status | Commit / Revert | Delta | Metric Delta | Reason | Timestamp |
|
|
299
|
+
|-------|------|--------|-----------------|-------|--------------|--------|-----------|
|
|
300
|
+
| 1 | T1 | passed | abc1234 | tests: 24→24, build: ok | — | — | 2025-01-15T10:00:00Z |
|
|
301
|
+
| 2 | T2 | failed | reverted | tests: 24→22 (−2) | — | tests failed: 2 of 24 | 2025-01-15T10:05:00Z |
|
|
302
|
+
| 3 | T3 | optimize | def789 | tests: 24→24, build: ok | 72.3→74.1 (+2.5%) | — | 2025-01-15T10:10:00Z |
|
|
191
303
|
|
|
192
304
|
## Probe Results
|
|
193
305
|
|
|
@@ -196,6 +308,20 @@ _(empty until a probe/spike task runs)_
|
|
|
196
308
|
| Probe | Metric | Winner | Loser | Notes |
|
|
197
309
|
|-------|--------|--------|-------|-------|
|
|
198
310
|
|
|
311
|
+
## Optimize Runs
|
|
312
|
+
|
|
313
|
+
_(empty until an optimize task completes)_
|
|
314
|
+
|
|
315
|
+
| Task | Metric | Baseline | Best | Target | Cycles | Status |
|
|
316
|
+
|------|--------|----------|------|--------|--------|--------|
|
|
317
|
+
|
|
318
|
+
## Secondary Metric Warnings
|
|
319
|
+
|
|
320
|
+
_(empty until a secondary metric regresses >5%)_
|
|
321
|
+
|
|
322
|
+
| Cycle | Task | Secondary Metric | Before | After | Delta | Severity |
|
|
323
|
+
|-------|------|-----------------|--------|-------|-------|----------|
|
|
324
|
+
|
|
199
325
|
## Health Score
|
|
200
326
|
|
|
201
327
|
| Check | Status |
|
|
@@ -203,6 +329,7 @@ _(empty until a probe/spike task runs)_
|
|
|
203
329
|
| Tests passed | {N} / {total} |
|
|
204
330
|
| Build status | passing / failing |
|
|
205
331
|
| Ratchet | green / red |
|
|
332
|
+
| Optimize status | in_progress / reached / max_cycles / circuit_breaker / — | ← present only when optimize tasks exist
|
|
206
333
|
|
|
207
334
|
## Reverted Tasks
|
|
208
335
|
|
|
@@ -217,14 +344,15 @@ _(tasks that were reverted with their failure reasons)_
|
|
|
217
344
|
**Cycle Log — append one row:**
|
|
218
345
|
|
|
219
346
|
```
|
|
220
|
-
| {cycle_number} | {task_id} | {status} | {commit_hash or "reverted"} | {delta} | {reason or "—"} | {YYYY-MM-DDTHH:MM:SSZ} |
|
|
347
|
+
| {cycle_number} | {task_id} | {status} | {commit_hash or "reverted"} | {delta} | {metric_delta} | {reason or "—"} | {YYYY-MM-DDTHH:MM:SSZ} |
|
|
221
348
|
```
|
|
222
349
|
|
|
223
350
|
- `cycle_number`: total number of cycles executed so far (count existing data rows in the Cycle Log + 1)
|
|
224
351
|
- `task_id`: task ID from PLAN.md, or `BOOTSTRAP` for bootstrap cycles
|
|
225
|
-
- `status`: `passed` (ratchet passed), `failed` (ratchet failed, reverted),
|
|
352
|
+
- `status`: `passed` (ratchet passed), `failed` (ratchet failed, reverted), `skipped` (task was already done), or `optimize` (optimize cycle — one inner cycle of an Optimize task)
|
|
226
353
|
- `commit_hash`: short hash from the commit, or `reverted` if ratchet failed
|
|
227
354
|
- `delta`: ratchet metric change from this cycle. Format: `tests: {before}→{after}, build: ok/fail`. Include coverage delta if available (e.g., `cov: 80%→82% (+2%)`). On revert, show the regression that triggered it (e.g., `tests: 24→22 (−2)`)
|
|
355
|
+
- `metric_delta`: for optimize cycles, show `{old}→{new} ({+/-pct}%)`. For non-optimize cycles, use `—`.
|
|
228
356
|
- `reason`: failure reason from ratchet output (e.g., `"tests failed: 2 of 24"`), or `—` if passed
|
|
229
357
|
|
|
230
358
|
**Summary table — recalculate from Cycle Log rows:**
|
|
@@ -232,9 +360,31 @@ _(tasks that were reverted with their failure reasons)_
|
|
|
232
360
|
- `Total cycles run`: count of all data rows in the Cycle Log
|
|
233
361
|
- `Tasks committed`: count of rows where Status = `passed`
|
|
234
362
|
- `Tasks reverted`: count of rows where Status = `failed`
|
|
363
|
+
- `Optimize cycles run`: count of rows where Status = `optimize` (omit row if no optimize tasks in PLAN.md)
|
|
364
|
+
- `Optimize best value`: `{current_best} / {target}` from `optimize_state` in auto-memory.yaml (omit row if no optimize tasks)
|
|
235
365
|
|
|
236
366
|
**Last updated timestamp:** always overwrite the `_Last updated:` line with the current timestamp.
|
|
237
367
|
|
|
368
|
+
**Optimize Runs table — update on optimize terminal events:**
|
|
369
|
+
|
|
370
|
+
When an optimize stop condition is reached (target reached, max cycles, circuit breaker), append or update the row for the optimize task:
|
|
371
|
+
|
|
372
|
+
```
|
|
373
|
+
| {task_id} | {metric_command} | {baseline} | {current_best} | {target} | {cycles_run} | {reached|max_cycles|circuit_breaker} |
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
If the task is still in progress, do not add a row yet (it will be added when the terminal event fires).
|
|
377
|
+
|
|
378
|
+
**Secondary Metric Warnings table — append on regression >5%:**
|
|
379
|
+
|
|
380
|
+
After each optimize cycle, `/df:execute` section 5.9.2 step j measures secondary metrics. If a regression exceeds the threshold, auto-cycle reads the warning from execute's output and appends to the table:
|
|
381
|
+
|
|
382
|
+
```
|
|
383
|
+
| {cycle_number} | {task_id} | {secondary_metric_command} | {before} | {after} | {+/-pct}% | WARNING |
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
The severity is always `WARNING` (no auto-revert — human decision required). These rows are informational only.
|
|
387
|
+
|
|
238
388
|
#### 4.3 Probe results (when applicable)
|
|
239
389
|
|
|
240
390
|
If the executed task was a probe/spike (task description contains "probe" or "spike"), append a row to the Probe Results table:
|
|
@@ -254,6 +404,13 @@ Read the ratchet output from the last `/df:execute` result and populate:
|
|
|
254
404
|
- `Tests passed`: e.g., `22 / 24` (from ratchet summary line)
|
|
255
405
|
- `Build status`: `passing` if exit code 0, `failing` if build error
|
|
256
406
|
- `Ratchet`: `green` if ratchet passed, `red` if ratchet failed
|
|
407
|
+
- `Optimize status`: read from `optimize_state` in auto-memory.yaml:
|
|
408
|
+
- `in_progress` if `optimize_state.task_id` present and task still `[ ]`
|
|
409
|
+
- `reached` if stop condition was "target reached"
|
|
410
|
+
- `max_cycles` if stop condition was "max cycles"
|
|
411
|
+
- `circuit_breaker` if halted by circuit breaker
|
|
412
|
+
- `—` if no optimize task is active or was ever run
|
|
413
|
+
- Omit this row entirely if PLAN.md contains no `[OPTIMIZE]` tasks
|
|
257
414
|
|
|
258
415
|
Replace the entire Health Score section content with the latest values each cycle.
|
|
259
416
|
|
|
@@ -301,6 +458,11 @@ pending_count = number of [ ] tasks
|
|
|
301
458
|
| Auto-memory updated after every cycle | `task_results`, `revert_history`, and `consecutive_reverts` in `.deepflow/auto-memory.yaml` are written after each EXECUTE result |
|
|
302
459
|
| Cross-cycle state read at cycle start | LOAD STATE reads the full `auto-memory.yaml` schema; prior task outcomes and probe learnings are available to the cycle |
|
|
303
460
|
| Circuit breaker halts the loop | After N consecutive reverts on the same task (default 3, configurable via `circuit_breaker_threshold` in `.deepflow/config.yaml`), the loop is stopped and the reason is reported |
|
|
461
|
+
| One optimize task at a time | Only one `[OPTIMIZE]` task runs at a time — auto-cycle defers other optimize tasks until the active one reaches a terminal stop condition |
|
|
462
|
+
| Optimize tasks resume across context windows | `optimize_state.task_id` in `auto-memory.yaml` overrides the normal `[ ]` scan; the same task is picked every cycle until a stop condition fires |
|
|
463
|
+
| Optimize circuit breaker halts AND preserves state | When optimize hits 3 consecutive reverts: task stays `[ ]`, `optimize_state` is preserved in `auto-memory.yaml` (not cleared), loop halts |
|
|
464
|
+
| Secondary metric regression is advisory only | >5% regression generates WARNING in `auto-report.md` Secondary Metric Warnings table — never triggers auto-revert |
|
|
465
|
+
| Optimize completion writes experiments | Failed hypotheses and run summary are written to `.deepflow/experiments/` when a terminal stop condition fires |
|
|
304
466
|
|
|
305
467
|
## Example
|
|
306
468
|
|
|
@@ -383,6 +545,77 @@ Circuit breaker tripped: T3 failed 3 consecutive times. Reason: 2 tests regresse
|
|
|
383
545
|
Loop halted. Resolve T3 manually, then run /df:auto-cycle to resume.
|
|
384
546
|
```
|
|
385
547
|
|
|
548
|
+
### Optimize Cycle (in progress — task resumes from optimize_state)
|
|
549
|
+
|
|
550
|
+
```
|
|
551
|
+
/df:auto-cycle
|
|
552
|
+
|
|
553
|
+
Loading PLAN.md... 4 tasks total, 2 done, 2 pending
|
|
554
|
+
Loading auto-memory.yaml... optimize_state.task_id = T3
|
|
555
|
+
|
|
556
|
+
Optimize-active override: T3 still [ ] — resuming optimize task
|
|
557
|
+
optimize_state: cycles_run=4, current_best=74.1, target=85.0, direction=higher
|
|
558
|
+
|
|
559
|
+
Running: /df:execute T3
|
|
560
|
+
⟳ T3 cycle 5: 74.1 → 75.8 (+2.3%) — kept [best: 75.8, target: 85.0]
|
|
561
|
+
|
|
562
|
+
Updated .deepflow/auto-memory.yaml:
|
|
563
|
+
optimize_state.cycles_run = 5
|
|
564
|
+
optimize_state.current_best = 75.8
|
|
565
|
+
|
|
566
|
+
Updated .deepflow/auto-report.md:
|
|
567
|
+
Summary: cycles=5, committed=2, reverted=0, optimize_cycles=5, optimize_best=75.8/85.0
|
|
568
|
+
Cycle Log row: | 5 | T3 | optimize | abc1234 | tests: 24→24, build: ok | 74.1→75.8 (+2.3%) | — | 2025-01-15T10:15:00Z |
|
|
569
|
+
Health: tests 24/24, build passing, ratchet green, optimize in_progress
|
|
570
|
+
|
|
571
|
+
Cycle complete. 2 tasks remaining.
|
|
572
|
+
```
|
|
573
|
+
|
|
574
|
+
### Optimize Complete (target reached)
|
|
575
|
+
|
|
576
|
+
```
|
|
577
|
+
/df:auto-cycle
|
|
578
|
+
|
|
579
|
+
Loading PLAN.md... 4 tasks total, 2 done, 2 pending
|
|
580
|
+
Loading auto-memory.yaml... optimize_state.task_id = T3
|
|
581
|
+
|
|
582
|
+
Optimize-active override: T3 still [ ] — resuming optimize task
|
|
583
|
+
optimize_state: cycles_run=12, current_best=84.9, target=85.0, direction=higher
|
|
584
|
+
|
|
585
|
+
Running: /df:execute T3
|
|
586
|
+
⟳ T3 cycle 13: 84.9 → 85.3 (+0.5%) — kept [best: 85.3, target: 85.0]
|
|
587
|
+
Target reached: 85.3 >= 85.0 — marking T3 [x]
|
|
588
|
+
|
|
589
|
+
Optimize completion:
|
|
590
|
+
Writing 3 failed hypotheses to .deepflow/experiments/
|
|
591
|
+
Writing summary: specs--optimize-T3--summary--reached.md
|
|
592
|
+
Clearing optimize_state from auto-memory.yaml
|
|
593
|
+
|
|
594
|
+
Updated .deepflow/auto-report.md:
|
|
595
|
+
Summary: cycles=13, committed=3, reverted=0, optimize_cycles=13, optimize_best=85.3/85.0
|
|
596
|
+
Cycle Log row: | 13 | T3 | optimize | def456 | tests: 24→24, build: ok | 84.9→85.3 (+0.5%) | — | 2025-01-15T10:45:00Z |
|
|
597
|
+
Optimize Runs row: | T3 | coverage_cmd | 72.3 | 85.3 | 85.0 | 13 | reached |
|
|
598
|
+
Health: tests 24/24, build passing, ratchet green, optimize reached
|
|
599
|
+
|
|
600
|
+
Cycle complete. 1 task remaining.
|
|
601
|
+
```
|
|
602
|
+
|
|
603
|
+
### Optimize Secondary Metric Warning
|
|
604
|
+
|
|
605
|
+
```
|
|
606
|
+
/df:auto-cycle
|
|
607
|
+
|
|
608
|
+
Running: /df:execute T3
|
|
609
|
+
⟳ T3 cycle 8: 80.1 → 81.4 (+1.6%) — kept [best: 81.4, target: 85.0]
|
|
610
|
+
WARNING: secondary metric 'lint_errors' regressed: 2 → 5 (+150%) — exceeds 5% threshold
|
|
611
|
+
|
|
612
|
+
Updated .deepflow/auto-report.md:
|
|
613
|
+
Secondary Metric Warnings row: | 8 | T3 | lint_errors | 2 | 5 | +150% | WARNING |
|
|
614
|
+
(No auto-revert — human decision required)
|
|
615
|
+
|
|
616
|
+
Cycle complete. 2 tasks remaining.
|
|
617
|
+
```
|
|
618
|
+
|
|
386
619
|
### All Tasks Blocked
|
|
387
620
|
|
|
388
621
|
```
|
package/src/commands/df/auto.md
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:auto
|
|
3
|
+
description: Set up and launch fully autonomous execution with plan generation and ratchet snapshots
|
|
4
|
+
---
|
|
5
|
+
|
|
1
6
|
# /df:auto — Autonomous Mode Setup
|
|
2
7
|
|
|
3
8
|
Set up and launch fully autonomous execution. Runs `/df:plan` if no PLAN.md exists, takes a ratchet snapshot, then starts `/loop 1m /df:auto-cycle`.
|
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:consolidate
|
|
3
|
+
description: Remove duplicates and superseded entries from decisions file, promote stale provisionals
|
|
4
|
+
---
|
|
5
|
+
|
|
1
6
|
# /df:consolidate — Consolidate Decisions
|
|
2
7
|
|
|
3
8
|
## Purpose
|
|
@@ -15,6 +20,9 @@ Remove duplicates, superseded entries, and promote stale provisionals. Keep deci
|
|
|
15
20
|
### 1. LOAD
|
|
16
21
|
Read `.deepflow/decisions.md`. If missing or empty, report and exit.
|
|
17
22
|
|
|
23
|
+
Shell injection (use output directly — no manual file reads needed):
|
|
24
|
+
- `` !`cat .deepflow/decisions.md 2>/dev/null || echo 'NOT_FOUND'` ``
|
|
25
|
+
|
|
18
26
|
### 2. ANALYZE
|
|
19
27
|
Model-driven analysis (not regex):
|
|
20
28
|
- Identify duplicate decisions (same meaning, different wording)
|
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:execute
|
|
3
|
+
description: Execute tasks from PLAN.md with agent spawning, ratchet health checks, and worktree management
|
|
4
|
+
---
|
|
5
|
+
|
|
1
6
|
# /df:execute — Execute Tasks from Plan
|
|
2
7
|
|
|
3
8
|
## Orchestrator Role
|
|
@@ -51,6 +56,10 @@ checkpoint exists → Prompt: "Resume? (y/n)"
|
|
|
51
56
|
else → Start fresh
|
|
52
57
|
```
|
|
53
58
|
|
|
59
|
+
Shell injection (use output directly — no manual file reads needed):
|
|
60
|
+
- `` !`cat .deepflow/checkpoint.json 2>/dev/null || echo 'NOT_FOUND'` ``
|
|
61
|
+
- `` !`git diff --quiet && echo 'CLEAN' || echo 'DIRTY'` ``
|
|
62
|
+
|
|
54
63
|
### 1.5. CREATE WORKTREE
|
|
55
64
|
|
|
56
65
|
Require clean HEAD (`git diff --quiet`). Derive SPEC_NAME from `specs/doing-*.md`.
|
|
@@ -92,6 +101,10 @@ Load: PLAN.md (required), specs/doing-*.md, .deepflow/config.yaml
|
|
|
92
101
|
If missing: "No PLAN.md found. Run /df:plan first."
|
|
93
102
|
```
|
|
94
103
|
|
|
104
|
+
Shell injection (use output directly — no manual file reads needed):
|
|
105
|
+
- `` !`cat .deepflow/checkpoint.json 2>/dev/null || echo 'NOT_FOUND'` ``
|
|
106
|
+
- `` !`git diff --quiet && echo 'CLEAN' || echo 'DIRTY'` ``
|
|
107
|
+
|
|
95
108
|
### 2.5. REGISTER NATIVE TASKS
|
|
96
109
|
|
|
97
110
|
For each `[ ]` task in PLAN.md: `TaskCreate(subject: "{task_id}: {description}", activeForm: "{gerund}", description: full block)`. Store task_id → native ID mapping. Set dependencies via `TaskUpdate(addBlockedBy: [...])`. On `--continue`: only register remaining `[ ]` items.
|
|
@@ -123,6 +136,8 @@ Before spawning, check `Files:` lists of all ready tasks. If two+ ready tasks sh
|
|
|
123
136
|
|
|
124
137
|
**≥2 [SPIKE] tasks for same problem:** Follow Parallel Spike Probes (section 5.7).
|
|
125
138
|
|
|
139
|
+
**[OPTIMIZE] tasks:** Follow Optimize Cycle (section 5.9). Only ONE optimize task runs at a time — defer others until the active one completes.
|
|
140
|
+
|
|
126
141
|
### 5.5. RATCHET CHECK
|
|
127
142
|
|
|
128
143
|
After each agent completes, run health checks in the worktree.
|
|
@@ -144,6 +159,28 @@ Run Build → Test → Typecheck → Lint (stop on first failure).
|
|
|
144
159
|
Compare `git diff HEAD~1 --name-only` against Impact callers/duplicates list.
|
|
145
160
|
File listed but not modified → **advisory warning**: "Impact gap: {file} listed as {caller|duplicate} but not modified — verify manually". Not auto-revert (callers sometimes don't need changes), but flags the risk.
|
|
146
161
|
|
|
162
|
+
**Metric gate (Optimize tasks only):**
|
|
163
|
+
|
|
164
|
+
After ratchet passes, if the current task has an `Optimize:` block, run the metric gate:
|
|
165
|
+
|
|
166
|
+
1. Run the `metric` shell command in the worktree: `cd ${WORKTREE_PATH} && eval "${metric_command}"`
|
|
167
|
+
2. Parse output as float. Non-numeric output → cycle failure (revert, log "metric parse error: {raw output}")
|
|
168
|
+
3. Compare against previous measurement using `direction`:
|
|
169
|
+
- `direction: higher` → new value must be > previous + (previous × min_improvement_threshold)
|
|
170
|
+
- `direction: lower` → new value must be < previous - (previous × min_improvement_threshold)
|
|
171
|
+
4. Both ratchet AND metric improvement required → keep commit
|
|
172
|
+
5. Ratchet passes but metric did not improve → revert (log "ratchet passed but metric stagnant/regressed: {old} → {new}")
|
|
173
|
+
6. Run each `secondary_metrics` command, parse as float. If regression > `regression_threshold` (default 5%) compared to baseline: append WARNING to `.deepflow/auto-report.md`: `"WARNING: {name} regressed {delta}% ({baseline_val} → {new_val}) at cycle {N}"`. Do NOT auto-revert.
|
|
174
|
+
|
|
175
|
+
**Output Truncation:**
|
|
176
|
+
|
|
177
|
+
After ratchet checks complete, truncate command output for context efficiency:
|
|
178
|
+
|
|
179
|
+
- **Success (all checks passed):** Suppress output entirely — do not include build/test/lint output in reports
|
|
180
|
+
- **Build failure:** Include last 15 lines of build error only
|
|
181
|
+
- **Test failure:** Include failed test name(s) + last 20 lines of test output
|
|
182
|
+
- **Typecheck/lint failure:** Include error count + first 5 errors only
|
|
183
|
+
|
|
147
184
|
**Evaluate:** All pass + no violations → commit stands. Any failure → attempt partial salvage before reverting:
|
|
148
185
|
|
|
149
186
|
**Partial salvage protocol:**
|
|
@@ -166,7 +203,8 @@ Trigger: ≥2 [SPIKE] tasks with same "Blocked by:" target or identical hypothes
|
|
|
166
203
|
4. **Ratchet:** Per notification, run standard ratchet (5.5) in probe worktree. Record: ratchet_passed, regressions, coverage_delta, files_changed, commit
|
|
167
204
|
5. **Select winner** (after ALL complete, no LLM judge):
|
|
168
205
|
- Disqualify any with regressions
|
|
169
|
-
- Rank: fewer regressions > higher coverage_delta > fewer files_changed > first to complete
|
|
206
|
+
- **Standard spikes**: Rank: fewer regressions > higher coverage_delta > fewer files_changed > first to complete
|
|
207
|
+
- **Optimize probes**: Rank: best metric improvement (absolute delta toward target) > fewer regressions > fewer files_changed
|
|
170
208
|
- No passes → reset all to pending for retry with debugger
|
|
171
209
|
6. **Preserve all worktrees.** Losers: rename branch + `-failed` suffix. Record in checkpoint.json under `"spike_probes"`
|
|
172
210
|
7. **Log ALL probe outcomes** to `.deepflow/auto-memory.yaml` (main tree):
|
|
@@ -200,6 +238,141 @@ Trigger: ≥2 [SPIKE] tasks with same "Blocked by:" target or identical hypothes
|
|
|
200
238
|
Create file if missing. Preserve existing keys when merging. Log BOTH winners and losers — downstream tasks need to know what was chosen, not just what failed.
|
|
201
239
|
8. **Promote winner:** Cherry-pick into shared worktree. Winner → `[x] [PROBE_WINNER]`, losers → `[~] [PROBE_FAILED]`. Resume standard loop.
|
|
202
240
|
|
|
241
|
+
#### 5.7.1. PROBE DIVERSITY ENFORCEMENT (Optimize Probes)
|
|
242
|
+
|
|
243
|
+
When spawning probes for optimize plateau resolution, enforce diversity roles:
|
|
244
|
+
|
|
245
|
+
**Role definitions:**
|
|
246
|
+
- **contextualizada**: Builds on the best approach so far — refines, extends, or combines what worked. Prompt includes: "Build on the best result so far: {best_approach_summary}. Refine or extend it."
|
|
247
|
+
- **contraditoria**: Tries the opposite of the current best. Prompt includes: "The best approach so far is {best_approach_summary}. Try the OPPOSITE direction — if it cached, don't cache; if it optimized hot path, optimize cold path; etc."
|
|
248
|
+
- **ingenua**: No prior context — naive fresh attempt. Prompt includes: "Ignore all prior attempts. Approach this from scratch with no assumptions about what works."
|
|
249
|
+
|
|
250
|
+
**Auto-scaling by probe round:**
|
|
251
|
+
|
|
252
|
+
| Probe round | Count | Required roles |
|
|
253
|
+
|-------------|-------|----------------|
|
|
254
|
+
| 1st plateau | 2 | 1 contraditoria + 1 ingenua |
|
|
255
|
+
| 2nd plateau | 4 | 1 contextualizada + 2 contraditoria + 1 ingenua |
|
|
256
|
+
| 3rd+ plateau | 6 | 2 contextualizada + 2 contraditoria + 2 ingenua |
|
|
257
|
+
|
|
258
|
+
**Rules:**
|
|
259
|
+
- Every probe set MUST include ≥1 contraditoria and ≥1 ingenua (minimum diversity)
|
|
260
|
+
- contextualizada only added from round 2+ (needs prior data to build on)
|
|
261
|
+
- Each probe prompt includes its role label and role-specific instruction
|
|
262
|
+
- Probe scale persists in `optimize_state.probe_scale` in `auto-memory.yaml`
|
|
263
|
+
|
|
264
|
+
### 5.9. OPTIMIZE CYCLE
|
|
265
|
+
|
|
266
|
+
Trigger: task has `Optimize:` block in PLAN.md. Runs instead of standard single-agent spawn.
|
|
267
|
+
|
|
268
|
+
**Optimize is a distinct execution mode** — one optimize task at a time, spanning N cycles until a stop condition.
|
|
269
|
+
|
|
270
|
+
#### 5.9.1. INITIALIZATION
|
|
271
|
+
|
|
272
|
+
1. Parse `Optimize:` block from PLAN.md task: `metric`, `target`, `direction`, `max_cycles`, `secondary_metrics`
|
|
273
|
+
2. Load or initialize `optimize_state` from `.deepflow/auto-memory.yaml`:
|
|
274
|
+
```yaml
|
|
275
|
+
optimize_state:
|
|
276
|
+
task_id: "T{n}"
|
|
277
|
+
metric_command: "{shell command}"
|
|
278
|
+
target: {number}
|
|
279
|
+
direction: "higher|lower"
|
|
280
|
+
baseline: null # set on first measure
|
|
281
|
+
current_best: null # best metric value seen
|
|
282
|
+
best_commit: null # commit hash of best value
|
|
283
|
+
cycles_run: 0
|
|
284
|
+
cycles_without_improvement: 0
|
|
285
|
+
consecutive_reverts: 0
|
|
286
|
+
probe_scale: 0 # 0=no probes yet, 2/4/6
|
|
287
|
+
max_cycles: {number}
|
|
288
|
+
history: [] # [{cycle, value, delta, kept, commit}]
|
|
289
|
+
failed_hypotheses: [] # ["{description}"]
|
|
290
|
+
```
|
|
291
|
+
3. **Measure baseline**: `cd ${WORKTREE_PATH} && eval "${metric_command}"` → parse float → store as `baseline` and `current_best`
|
|
292
|
+
4. Measure each secondary metric → store as `secondary_baselines`
|
|
293
|
+
5. Check if target already met (`direction: higher` → baseline >= target; `lower` → baseline <= target). If met → mark task `[x]`, log "target already met: {baseline}", done.
|
|
294
|
+
|
|
295
|
+
#### 5.9.2. CYCLE LOOP
|
|
296
|
+
|
|
297
|
+
Each cycle = one agent spawn + measure + keep/revert decision.
|
|
298
|
+
|
|
299
|
+
```
|
|
300
|
+
REPEAT:
|
|
301
|
+
1. Check stop conditions (5.9.3) → if triggered, exit loop
|
|
302
|
+
2. Spawn ONE optimize agent (section 6, Optimize Task prompt) with run_in_background=true
|
|
303
|
+
3. STOP. End turn. Wait for notification.
|
|
304
|
+
4. On notification:
|
|
305
|
+
a. Run ratchet check (section 5.5) — build/test/lint must pass
|
|
306
|
+
b. If ratchet fails → git revert HEAD --no-edit, increment consecutive_reverts, log failed hypothesis, go to step 1
|
|
307
|
+
c. Run metric gate (section 5.5 metric gate) — measure new value
|
|
308
|
+
d. If metric parse error → git revert HEAD --no-edit, increment consecutive_reverts, log "metric parse error", go to step 1
|
|
309
|
+
e. Compute improvement:
|
|
310
|
+
- direction: higher → improvement = (new - current_best) / |current_best| × 100
|
|
311
|
+
- direction: lower → improvement = (current_best - new) / |current_best| × 100
|
|
312
|
+
- current_best == 0 → use absolute delta
|
|
313
|
+
f. If improvement >= min_improvement_threshold (default 1%):
|
|
314
|
+
→ KEEP: update current_best, best_commit, reset cycles_without_improvement=0, reset consecutive_reverts=0
|
|
315
|
+
g. If improvement < min_improvement_threshold:
|
|
316
|
+
→ REVERT: git revert HEAD --no-edit, increment cycles_without_improvement
|
|
317
|
+
h. Increment cycles_run
|
|
318
|
+
i. Append to history: {cycle, value, delta_pct, kept: bool, commit}
|
|
319
|
+
j. Measure secondary metrics, check regression (WARNING only, no revert)
|
|
320
|
+
k. Persist optimize_state to auto-memory.yaml
|
|
321
|
+
l. Report: "⟳ T{n} cycle {N}: {old} → {new} ({+/-delta}%) — {kept|reverted} [best: {current_best}, target: {target}]"
|
|
322
|
+
m. Check context %. If ≥50% → checkpoint and exit (auto-cycle resumes).
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
#### 5.9.3. STOP CONDITIONS
|
|
326
|
+
|
|
327
|
+
| Condition | Detection | Action |
|
|
328
|
+
|-----------|-----------|--------|
|
|
329
|
+
| **Target reached** | `direction: higher` → value >= target; `lower` → value <= target | Mark task `[x]`, log "target reached: {value}" |
|
|
330
|
+
| **Max cycles** | `cycles_run >= max_cycles` | Mark task `[x]` with note: "max cycles reached, best: {current_best}". If final value worse than baseline → `git reset --hard {best_commit}`, log "reverted to best-known" |
|
|
331
|
+
| **Plateau** | `cycles_without_improvement >= 3` | Pause normal cycle → launch probes (5.9.4) |
|
|
332
|
+
| **Circuit breaker** | `consecutive_reverts >= 3` | Halt, task stays `[ ]`, log "circuit breaker: 3 consecutive reverts". Requires human intervention. |
|
|
333
|
+
|
|
334
|
+
On **max cycles** with final value worse than baseline:
|
|
335
|
+
1. `git reset --hard {best_commit}` in worktree
|
|
336
|
+
2. Log: "final value {current} worse than baseline {baseline}, reverted to best-known commit {best_commit} (value: {current_best})"
|
|
337
|
+
|
|
338
|
+
#### 5.9.4. PLATEAU → PROBE LAUNCH
|
|
339
|
+
|
|
340
|
+
When plateau detected (3 cycles without ≥1% improvement):
|
|
341
|
+
|
|
342
|
+
1. Pause normal optimize cycle
|
|
343
|
+
2. Determine probe count from `probe_scale` (section 5.7.1 auto-scaling table): 0→2, 2→4, 4→6, 6→6 (stays at 6 for every 3rd+ plateau)
|
|
344
|
+
3. Update `probe_scale` in optimize_state
|
|
345
|
+
4. Record `BASELINE=$(git rev-parse HEAD)` in shared worktree
|
|
346
|
+
5. Create sub-worktrees per probe: `git worktree add -b df/{spec}--opt-probe-{N} .deepflow/worktrees/{spec}/opt-probe-{N} ${BASELINE}`
|
|
347
|
+
6. Spawn ALL probes in ONE message using Optimize Probe prompt (section 6), each with its diversity role
|
|
348
|
+
7. End turn. Wait for all notifications.
|
|
349
|
+
8. Per notification: run ratchet + metric measurement in probe worktree
|
|
350
|
+
9. Select winner (section 5.7 step 5, optimize ranking): best metric improvement toward target
|
|
351
|
+
10. Winner → cherry-pick into shared worktree, update current_best and best_commit (the cherry-picked commit), reset cycles_without_improvement=0
|
|
352
|
+
11. Losers → rename branch with `-failed` suffix, preserve worktrees
|
|
353
|
+
12. Log all probe outcomes to `auto-memory.yaml` under `spike_insights` (reuse existing format)
|
|
354
|
+
13. Log probe learnings: winning approach summary + each loser's failure reason
|
|
355
|
+
14. Resume normal optimize cycle (section 5.9.2) from its step 1
|
|
356
|
+
|
|
357
|
+
#### 5.9.5. STATE PERSISTENCE (auto-memory.yaml)
|
|
358
|
+
|
|
359
|
+
After every cycle, write `optimize_state` to `.deepflow/auto-memory.yaml` (main tree). This ensures:
|
|
360
|
+
- Context exhaustion at 50% → auto-cycle resumes with full history
|
|
361
|
+
- Failed hypotheses carry forward (agents won't repeat approaches)
|
|
362
|
+
- Probe scale persists across context windows
|
|
363
|
+
|
|
364
|
+
Also append cycle results to `.deepflow/auto-report.md`:
|
|
365
|
+
```
|
|
366
|
+
## Optimize: T{n} — {metric_name}
|
|
367
|
+
| Cycle | Value | Delta | Kept | Commit |
|
|
368
|
+
|-------|-------|-------|------|--------|
|
|
369
|
+
| 1 | 72.3 | — | baseline | abc123 |
|
|
370
|
+
| 2 | 74.1 | +2.5% | ✓ | def456 |
|
|
371
|
+
| 3 | 73.8 | -0.4% | ✗ | (reverted) |
|
|
372
|
+
...
|
|
373
|
+
Best: {current_best} | Target: {target} | Status: {in_progress|reached|max_cycles|circuit_breaker}
|
|
374
|
+
```
|
|
375
|
+
|
|
203
376
|
---
|
|
204
377
|
|
|
205
378
|
### 6. PER-TASK (agent prompt)
|
|
@@ -296,6 +469,91 @@ Implement minimal spike to validate hypothesis.
|
|
|
296
469
|
Commit as spike({spec}): {description}
|
|
297
470
|
```
|
|
298
471
|
|
|
472
|
+
**Optimize Task** (spawn with `Agent(model="opus", subagent_type="general-purpose")`):
|
|
473
|
+
|
|
474
|
+
One agent per cycle. Agent makes ONE atomic change to improve the metric.
|
|
475
|
+
|
|
476
|
+
```
|
|
477
|
+
--- START (high attention zone) ---
|
|
478
|
+
|
|
479
|
+
{task_id} [OPTIMIZE]: Improve {metric_name} — cycle {N}/{max_cycles}
|
|
480
|
+
Files: {target files} Spec: {spec_name}
|
|
481
|
+
|
|
482
|
+
Current metric: {current_value} (baseline: {baseline}, best: {current_best})
|
|
483
|
+
Target: {target} ({direction})
|
|
484
|
+
Improvement needed: {delta_to_target} ({direction})
|
|
485
|
+
|
|
486
|
+
CONSTRAINT: Make exactly ONE atomic change. Do not refactor broadly.
|
|
487
|
+
The metric is measured by: {metric_command}
|
|
488
|
+
You succeed if the metric moves toward {target} after your change.
|
|
489
|
+
|
|
490
|
+
--- MIDDLE (navigable data zone) ---
|
|
491
|
+
|
|
492
|
+
Attempt history (last 5 cycles):
|
|
493
|
+
{For each recent history entry:}
|
|
494
|
+
- Cycle {N}: {value} ({+/-delta}%) — {kept|reverted} — "{one-line description of what was tried}"
|
|
495
|
+
{Omit if cycle 1.}
|
|
496
|
+
|
|
497
|
+
DO NOT repeat these failed approaches:
|
|
498
|
+
{For each failed_hypothesis in optimize_state:}
|
|
499
|
+
- "{hypothesis description}"
|
|
500
|
+
{Omit if no failed hypotheses.}
|
|
501
|
+
|
|
502
|
+
{Impact block from PLAN.md if present}
|
|
503
|
+
|
|
504
|
+
{Dependency context if present}
|
|
505
|
+
|
|
506
|
+
Steps:
|
|
507
|
+
1. Analyze the metric command to understand what's being measured
|
|
508
|
+
2. Read the target files and identify ONE specific improvement
|
|
509
|
+
3. Implement the change (ONE atomic modification)
|
|
510
|
+
4. Commit as feat({spec}): optimize {metric_name} — {what you changed}
|
|
511
|
+
|
|
512
|
+
--- END (high attention zone) ---
|
|
513
|
+
|
|
514
|
+
{Spike/probe learnings if any}
|
|
515
|
+
|
|
516
|
+
Your ONLY job is to make ONE atomic change and commit. Orchestrator measures the metric after.
|
|
517
|
+
Do NOT run the metric command yourself. Do NOT make multiple changes.
|
|
518
|
+
STOP after committing. Do NOT merge branches, rename spec files, remove worktrees, or run git checkout on main.
|
|
519
|
+
```
|
|
520
|
+
|
|
521
|
+
**Optimize Probe Task** (spawn with `Agent(model="opus", subagent_type="general-purpose")`):
|
|
522
|
+
|
|
523
|
+
Used during plateau resolution. Each probe has a diversity role.
|
|
524
|
+
|
|
525
|
+
```
|
|
526
|
+
--- START (high attention zone) ---
|
|
527
|
+
|
|
528
|
+
{task_id} [OPTIMIZE PROBE]: {metric_name} — probe {probe_id} ({role_label})
|
|
529
|
+
Files: {target files} Spec: {spec_name}
|
|
530
|
+
|
|
531
|
+
Current metric: {current_value} (baseline: {baseline}, best: {current_best})
|
|
532
|
+
Target: {target} ({direction})
|
|
533
|
+
|
|
534
|
+
Role: {role_label}
|
|
535
|
+
{role_instruction — one of:}
|
|
536
|
+
contextualizada: "Build on the best approach so far: {best_approach_summary}. Refine, extend, or combine what worked."
|
|
537
|
+
contraditoria: "The best approach so far was: {best_approach_summary}. Try the OPPOSITE — if it optimized X, try Y instead. Challenge the current direction."
|
|
538
|
+
ingenua: "Ignore all prior attempts. Approach this metric from scratch with no assumptions about what has or hasn't worked."
|
|
539
|
+
|
|
540
|
+
--- MIDDLE (navigable data zone) ---
|
|
541
|
+
|
|
542
|
+
Full attempt history:
|
|
543
|
+
{ALL history entries from optimize_state}
|
|
544
|
+
- Cycle {N}: {value} ({+/-delta}%) — {kept|reverted}
|
|
545
|
+
|
|
546
|
+
All failed approaches (DO NOT repeat):
|
|
547
|
+
{ALL failed_hypotheses}
|
|
548
|
+
- "{hypothesis description}"
|
|
549
|
+
|
|
550
|
+
--- END (high attention zone) ---
|
|
551
|
+
|
|
552
|
+
Make ONE atomic change that moves the metric toward {target}.
|
|
553
|
+
Commit as feat({spec}): optimize probe {probe_id} — {what you changed}
|
|
554
|
+
STOP after committing.
|
|
555
|
+
```
|
|
556
|
+
|
|
299
557
|
### 8. COMPLETE SPECS
|
|
300
558
|
|
|
301
559
|
When all tasks done for a `doing-*` spec:
|
|
@@ -370,6 +628,12 @@ When task fails ratchet and is reverted:
|
|
|
370
628
|
| All probe worktrees preserved | Losers renamed `-failed`; never deleted |
|
|
371
629
|
| Machine-selected winner | Regressions > coverage > files changed; no LLM judge |
|
|
372
630
|
| External APIs → chub first | Skip if unavailable |
|
|
631
|
+
| 1 optimize task at a time | Inherently sequential — no parallel optimize tasks |
|
|
632
|
+
| Optimize = atomic changes only | One modification per cycle for diagnosability |
|
|
633
|
+
| Ratchet + metric = both required | Optimize keeps commit only if ratchet passes AND metric improves |
|
|
634
|
+
| Plateau → probes, not more cycles | 3 cycles without ≥1% improvement triggers probe launch |
|
|
635
|
+
| Circuit breaker = 3 consecutive reverts | Halts optimize loop, requires human intervention |
|
|
636
|
+
| Optimize probes need diversity | Every probe set: ≥1 contraditoria + ≥1 ingenua minimum |
|
|
373
637
|
|
|
374
638
|
## Example
|
|
375
639
|
|
package/src/commands/df/note.md
CHANGED
package/src/commands/df/plan.md
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:plan
|
|
3
|
+
description: Compare specs against codebase and past experiments, generate prioritized tasks
|
|
4
|
+
---
|
|
5
|
+
|
|
1
6
|
# /df:plan — Generate Task Plan from Specs
|
|
2
7
|
|
|
3
8
|
## Purpose
|
|
@@ -32,6 +37,10 @@ Load: specs/*.md (exclude doing-*/done-*), PLAN.md (if exists), .deepflow/config
|
|
|
32
37
|
Determine source_dir from config or default to src/
|
|
33
38
|
```
|
|
34
39
|
|
|
40
|
+
Shell injection (use output directly — no manual file reads needed):
|
|
41
|
+
- `` !`ls specs/*.md 2>/dev/null || echo 'NOT_FOUND'` ``
|
|
42
|
+
- `` !`cat PLAN.md 2>/dev/null || echo 'NOT_FOUND'` ``
|
|
43
|
+
|
|
35
44
|
Run `validateSpec` on each spec. Hard failures → skip + error. Advisory → include in output.
|
|
36
45
|
No new specs → report counts, suggest `/df:execute`.
|
|
37
46
|
|
|
@@ -58,20 +67,7 @@ Full implementation tasks BLOCKED until spike validates. See `templates/experime
|
|
|
58
67
|
|
|
59
68
|
Identify code style, patterns (error handling, API structure), integration points. Include in task descriptions.
|
|
60
69
|
|
|
61
|
-
### 4.
|
|
62
|
-
|
|
63
|
-
Follow `templates/explore-agent.md` for spawn rules and scope.
|
|
64
|
-
|
|
65
|
-
| File Count | Agents |
|
|
66
|
-
|------------|--------|
|
|
67
|
-
| <20 | 3-5 |
|
|
68
|
-
| 20-100 | 10-15 |
|
|
69
|
-
| 100-500 | 25-40 |
|
|
70
|
-
| 500+ | 50-100 (cap) |
|
|
71
|
-
|
|
72
|
-
Use `code-completeness` skill to search for: implementations matching spec requirements, TODOs/FIXMEs/HACKs, stubs, skipped tests.
|
|
73
|
-
|
|
74
|
-
### 4.5. IMPACT ANALYSIS (per planned file)
|
|
70
|
+
### 4. IMPACT ANALYSIS (per planned file)
|
|
75
71
|
|
|
76
72
|
For each file in a task's "Files:" list, find the full blast radius.
|
|
77
73
|
|
|
@@ -99,6 +95,16 @@ For each file in a task's "Files:" list, find the full blast radius.
|
|
|
99
95
|
Files outside original "Files:" → add with `(impact — verify/update)`.
|
|
100
96
|
Skip for spike tasks.
|
|
101
97
|
|
|
98
|
+
### 4.5. TARGETED EXPLORATION
|
|
99
|
+
|
|
100
|
+
Follow `templates/explore-agent.md` for spawn rules and scope. Explore agents cover **what LSP did not reveal**: conventions, dead code, implicit patterns.
|
|
101
|
+
|
|
102
|
+
| Finding Type | Agents |
|
|
103
|
+
|--------------|--------|
|
|
104
|
+
| Post-LSP gaps | 3-5 |
|
|
105
|
+
|
|
106
|
+
Use `code-completeness` skill to search for: implementations matching spec requirements, TODOs/FIXMEs/HACKs, stubs, skipped tests.
|
|
107
|
+
|
|
102
108
|
### 4.6. CROSS-TASK FILE CONFLICT DETECTION
|
|
103
109
|
|
|
104
110
|
After all tasks have their `Files:` lists, detect overlaps that require sequential execution.
|
|
@@ -133,6 +139,17 @@ Spawn `Task(subagent_type="reasoner", model="opus")`. Map each requirement to DO
|
|
|
133
139
|
|
|
134
140
|
Priority: Dependencies → Impact → Risk
|
|
135
141
|
|
|
142
|
+
#### Metric AC Detection
|
|
143
|
+
|
|
144
|
+
While comparing requirements, scan each spec AC for the pattern `{metric} {operator} {number}[unit]`:
|
|
145
|
+
|
|
146
|
+
- **Pattern examples**: `coverage > 85%`, `latency < 200ms`, `p99_latency <= 150ms`, `bundle_size < 500kb`
|
|
147
|
+
- **Operators**: `>`, `<`, `>=`, `<=`, `==`
|
|
148
|
+
- **Number**: float or integer, optional unit suffix (%, ms, kb, mb, s, etc.)
|
|
149
|
+
- **On match**: flag the AC as a **metric AC** and generate an `Optimize:` task (see section 6.5)
|
|
150
|
+
- **Non-match**: treat as standard functional AC → standard implementation task
|
|
151
|
+
- **Ambiguous ACs** (qualitative terms like "fast", "small", "improved"): flag as spec gap, request numeric threshold before planning
|
|
152
|
+
|
|
136
153
|
### 5.5. CLASSIFY MODEL + EFFORT PER TASK
|
|
137
154
|
|
|
138
155
|
For each task, assign `Model:` and `Effort:` based on the routing matrix:
|
|
@@ -148,6 +165,7 @@ For each task, assign `Model:` and `Effort:` based on the routing matrix:
|
|
|
148
165
|
| Bug fix (clear repro) | `sonnet` | `medium` | Diagnosis done, just apply fix |
|
|
149
166
|
| Bug fix (unclear cause) | `sonnet` | `high` | Needs reasoning to find root cause |
|
|
150
167
|
| Spike / validation | `sonnet` | `high` | Scoped but needs reasoning to validate hypothesis |
|
|
168
|
+
| Optimize (metric AC) | `opus` | `high` | Multi-cycle, ambiguous — best strategy changes per iteration |
|
|
151
169
|
| Feature work (well-specced) | `sonnet` | `medium` | Clear ACs reduce thinking overhead |
|
|
152
170
|
| Feature work (ambiguous ACs) | `opus` | `medium` | Needs intelligence but effort can be moderate with good specs |
|
|
153
171
|
| Refactor (>5 files, many callers) | `opus` | `medium` | Blast radius needs intelligence, patterns are repetitive |
|
|
@@ -224,6 +242,40 @@ Before output, verify: ≥2 opposing probes, ≥1 naive, all independent.
|
|
|
224
242
|
- Success criteria: DB queries drop ≥80% with zero cache infrastructure
|
|
225
243
|
```
|
|
226
244
|
|
|
245
|
+
### 6.5. GENERATE OPTIMIZE TASKS (FROM METRIC ACs)
|
|
246
|
+
|
|
247
|
+
For each metric AC detected in section 5, generate an `Optimize:` task using this format:
|
|
248
|
+
|
|
249
|
+
**Optimize Task Format:**
|
|
250
|
+
```markdown
|
|
251
|
+
- [ ] **T{n}** [OPTIMIZE]: Improve {metric_name} to {target}
|
|
252
|
+
- Type: optimize
|
|
253
|
+
- Files: {primary files likely to affect the metric}
|
|
254
|
+
- Optimize:
|
|
255
|
+
metric: "{shell command that outputs a single number}"
|
|
256
|
+
target: {number}
|
|
257
|
+
direction: higher|lower
|
|
258
|
+
max_cycles: {number, default 20}
|
|
259
|
+
secondary_metrics:
|
|
260
|
+
- metric: "{shell command}"
|
|
261
|
+
name: "{label}"
|
|
262
|
+
regression_threshold: 5%
|
|
263
|
+
- Model: opus
|
|
264
|
+
- Effort: high
|
|
265
|
+
- Blocked by: {spike T{n} if applicable, else none}
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
**Field rules:**
|
|
269
|
+
- `metric`: a shell command returning a single scalar float/integer (e.g., `npx jest --coverage --json | jq '.coverageMap | .. | .pct? | numbers' | awk '{sum+=$1;n++} END{print sum/n}'`). Must be deterministic and side-effect free.
|
|
270
|
+
- `target`: the numeric threshold extracted from the AC (strip unit suffix for the value; note unit in task description)
|
|
271
|
+
- `direction`: `higher` if operator is `>` or `>=`; `lower` if `<` or `<=`; `higher` by convention for `==`
|
|
272
|
+
- `max_cycles`: from spec if stated; default 20
|
|
273
|
+
- `secondary_metrics`: other metrics from the same spec that could regress (e.g., build time, bundle size, test count). Omit if none.
|
|
274
|
+
|
|
275
|
+
**Model/Effort**: always `opus` / `high` (see routing matrix).
|
|
276
|
+
|
|
277
|
+
**Blocking**: if a spike exists for the same area, block the optimize task on the spike passing.
|
|
278
|
+
|
|
227
279
|
### 7. VALIDATE HYPOTHESES
|
|
228
280
|
|
|
229
281
|
Unfamiliar APIs or performance-critical → prototype in scratchpad. Fails → write `--failed.md`. Skip for known patterns.
|
|
@@ -250,7 +302,7 @@ Report: `✓ Plan generated — {n} specs, {n} tasks. Run /df:execute`
|
|
|
250
302
|
|
|
251
303
|
| Agent | Model | Base | Scale |
|
|
252
304
|
|-------|-------|------|-------|
|
|
253
|
-
| Explore | haiku |
|
|
305
|
+
| Explore | haiku | 3-5 | none |
|
|
254
306
|
| Reasoner | opus | 5 | +1 per 2 specs |
|
|
255
307
|
|
|
256
308
|
Always use `Task` tool with explicit `subagent_type` and `model`.
|
|
@@ -280,3 +332,25 @@ Always use `Task` tool with explicit `subagent_type` and `model`.
|
|
|
280
332
|
- Model: opus
|
|
281
333
|
- Blocked by: T1, T2
|
|
282
334
|
```
|
|
335
|
+
|
|
336
|
+
**Optimize task example** (from spec AC: `coverage > 85%`):
|
|
337
|
+
|
|
338
|
+
```markdown
|
|
339
|
+
### doing-quality
|
|
340
|
+
|
|
341
|
+
- [ ] **T1** [OPTIMIZE]: Improve test coverage to >85%
|
|
342
|
+
- Type: optimize
|
|
343
|
+
- Files: src/
|
|
344
|
+
- Optimize:
|
|
345
|
+
metric: "npx jest --coverage --json 2>/dev/null | jq '[.. | .pct? | numbers] | add / length'"
|
|
346
|
+
target: 85
|
|
347
|
+
direction: higher
|
|
348
|
+
max_cycles: 20
|
|
349
|
+
secondary_metrics:
|
|
350
|
+
- metric: "npx jest --json 2>/dev/null | jq '.testResults | length'"
|
|
351
|
+
name: test_count
|
|
352
|
+
regression_threshold: 5%
|
|
353
|
+
- Model: opus
|
|
354
|
+
- Effort: high
|
|
355
|
+
- Blocked by: none
|
|
356
|
+
```
|
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:resume
|
|
3
|
+
description: Synthesize project state into a briefing covering what happened, current decisions, and next steps
|
|
4
|
+
allowed-tools: [Read, Grep, Glob, Bash]
|
|
5
|
+
---
|
|
6
|
+
|
|
1
7
|
# /df:resume — Session Continuity Briefing
|
|
2
8
|
|
|
3
9
|
## Orchestrator Role
|
|
@@ -28,11 +34,11 @@ Read these sources in parallel (all reads, no writes):
|
|
|
28
34
|
|
|
29
35
|
| Source | Command/Path | Purpose |
|
|
30
36
|
|--------|-------------|---------|
|
|
31
|
-
| Git timeline |
|
|
32
|
-
| Decisions |
|
|
33
|
-
| Plan |
|
|
34
|
-
| Spec headers |
|
|
35
|
-
| Experiments |
|
|
37
|
+
| Git timeline | `!`git log --oneline -20`` | What changed and when |
|
|
38
|
+
| Decisions | `!`cat .deepflow/decisions.md 2>/dev/null \|\| echo 'NOT_FOUND'`` | Current [APPROACH], [PROVISIONAL], [ASSUMPTION] entries |
|
|
39
|
+
| Plan | `!`cat PLAN.md 2>/dev/null \|\| echo 'NOT_FOUND'`` | Task status (checked vs unchecked) |
|
|
40
|
+
| Spec headers | `!`head -20 specs/doing-*.md 2>/dev/null \|\| echo 'NOT_FOUND'`` | What features are in-flight |
|
|
41
|
+
| Experiments | `!`ls .deepflow/experiments/ 2>/dev/null \|\| echo 'NOT_FOUND'`` | Validated and failed approaches |
|
|
36
42
|
|
|
37
43
|
**Token budget:** Read only what's needed — ~2500 tokens total across all sources.
|
|
38
44
|
|
package/src/commands/df/spec.md
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:verify
|
|
3
|
+
description: Check that implemented code satisfies spec requirements and acceptance criteria through machine-verifiable checks
|
|
4
|
+
context: fork
|
|
5
|
+
---
|
|
6
|
+
|
|
1
7
|
# /df:verify — Verify Specs Satisfied
|
|
2
8
|
|
|
3
9
|
## Purpose
|
|
@@ -25,7 +31,7 @@ specs/
|
|
|
25
31
|
|
|
26
32
|
### 1. LOAD CONTEXT
|
|
27
33
|
|
|
28
|
-
Load:
|
|
34
|
+
Load: `!`ls specs/doing-*.md 2>/dev/null || echo 'NOT_FOUND'``, `!`cat PLAN.md 2>/dev/null || echo 'NOT_FOUND'``, source code. Load `specs/done-*.md` only if `--re-verify`.
|
|
29
35
|
|
|
30
36
|
**Readiness check:** For each `doing-*` spec, check PLAN.md:
|
|
31
37
|
- All tasks `[x]` → ready (proceed)
|
|
@@ -35,7 +41,7 @@ If no `doing-*` specs found: report counts, suggest `/df:execute`.
|
|
|
35
41
|
|
|
36
42
|
### 1.5. DETECT PROJECT COMMANDS
|
|
37
43
|
|
|
38
|
-
**Config override always wins.** If
|
|
44
|
+
**Config override always wins.** If `!`cat .deepflow/config.yaml 2>/dev/null || echo 'NOT_FOUND'`` has `quality.test_command` or `quality.build_command`, use those.
|
|
39
45
|
|
|
40
46
|
**Auto-detection (first match wins):**
|
|
41
47
|
|
|
@@ -45,8 +45,9 @@ Task: T1
|
|
|
45
45
|
1. Implement task completely
|
|
46
46
|
2. Verify it works (tests, types, lint)
|
|
47
47
|
3. Stage specific files (`git add {files}`, not `-A`)
|
|
48
|
-
4.
|
|
49
|
-
5.
|
|
48
|
+
4. Read staged changes: !`git diff --cached --stat 2>/dev/null || echo 'NOT_FOUND'`
|
|
49
|
+
5. Commit with proper format
|
|
50
|
+
6. Return hash
|
|
50
51
|
|
|
51
52
|
## Pre-Commit
|
|
52
53
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: browse-fetch
|
|
3
3
|
description: Fetches live web content using headless Chromium via Playwright. Use when you need to read documentation, articles, or any public URL that requires JavaScript rendering. Falls back to WebFetch for simple HTML pages.
|
|
4
|
+
context: fork
|
|
5
|
+
allowed-tools: [Bash, WebFetch, WebSearch, Read]
|
|
4
6
|
---
|
|
5
7
|
|
|
6
8
|
# Browse-Fetch
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: browse-verify
|
|
3
3
|
description: Verifies UI acceptance criteria by launching a headless browser, extracting the accessibility tree, and evaluating structured assertions deterministically. Use when a spec has browser-based ACs that need automated verification after implementation.
|
|
4
|
+
context: fork
|
|
4
5
|
---
|
|
5
6
|
|
|
6
7
|
# Browse-Verify
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: code-completeness
|
|
3
3
|
description: Finds incomplete code in codebase. Use when analyzing for TODOs, stubs, placeholders, skipped tests, or missing implementations. Helps compare specs against actual code state.
|
|
4
|
+
allowed-tools: [Read, Grep, Glob]
|
|
4
5
|
---
|
|
5
6
|
|
|
6
7
|
# Code Completeness
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: gap-discovery
|
|
3
3
|
description: Discovers requirement gaps during ideation. Use when user describes features, planning specs, or requirements seem incomplete. Asks clarifying questions about scope, constraints, edge cases, success criteria.
|
|
4
|
+
allowed-tools: [AskUserQuestion, Read]
|
|
4
5
|
---
|
|
5
6
|
|
|
6
7
|
# Gap Discovery
|
|
@@ -12,21 +12,86 @@ Task(subagent_type="Explore", model="haiku", prompt="Find: ...")
|
|
|
12
12
|
# Returns final message only; blocks until all complete; no late notifications
|
|
13
13
|
```
|
|
14
14
|
|
|
15
|
+
## Search Protocol
|
|
16
|
+
|
|
17
|
+
Exploration follows three named phases:
|
|
18
|
+
|
|
19
|
+
### DIVERSIFY
|
|
20
|
+
- **Goal**: Find ALL potential matches across the codebase quickly
|
|
21
|
+
- **Method**: Launch 5–8 parallel tool calls in a single message
|
|
22
|
+
- **Tools**: Glob (broad patterns), Grep (regex searches), Read (file content verification)
|
|
23
|
+
- **Result**: Narrow down to 2–5 candidate files
|
|
24
|
+
|
|
25
|
+
Example: Search for "config" + "settings" + "env" patterns in parallel, not sequentially.
|
|
26
|
+
|
|
27
|
+
### CONVERGE
|
|
28
|
+
- **Goal**: Validate matches against the search criteria
|
|
29
|
+
- **Method**: Read only the matched files; extract relevant line ranges
|
|
30
|
+
- **Result**: Eliminate false positives, confirm relevance
|
|
31
|
+
|
|
32
|
+
### EARLY STOP
|
|
33
|
+
- **Goal**: Avoid wasting tokens on exhaustive searches
|
|
34
|
+
- **Rule**: Stop as soon as **>= 2 relevant files found** that answer the question
|
|
35
|
+
- **Exception**: If searching for a single unique thing (e.g., "the entry point file"), find just 1
|
|
36
|
+
|
|
15
37
|
## Prompt Structure
|
|
16
38
|
|
|
17
39
|
```
|
|
18
40
|
Find: [specific question]
|
|
19
41
|
|
|
20
42
|
Return ONLY:
|
|
21
|
-
-
|
|
22
|
-
- One-line description per file
|
|
43
|
+
- filepath:startLine-endLine -- why relevant
|
|
23
44
|
- Integration points (if asked)
|
|
24
45
|
|
|
25
|
-
DO NOT: read/summarize specs, make recommendations, propose solutions, generate tables.
|
|
46
|
+
DO NOT: read/summarize specs, make recommendations, propose solutions, generate tables, narrate your search process.
|
|
26
47
|
|
|
27
48
|
Max response: 500 tokens (configurable via .deepflow/config.yaml explore.max_tokens)
|
|
28
49
|
```
|
|
29
50
|
|
|
51
|
+
## Examples
|
|
52
|
+
|
|
53
|
+
### GOOD: Parallel search (2 turns total)
|
|
54
|
+
|
|
55
|
+
**Turn 1 (DIVERSIFY):**
|
|
56
|
+
```
|
|
57
|
+
- Glob: "src/**/*.ts" pattern="config" (search in all TS files)
|
|
58
|
+
- Glob: "src/**/*.js" pattern="config" (search in all JS files)
|
|
59
|
+
- Grep: pattern="export.*config", type="ts" (find exports)
|
|
60
|
+
- Grep: pattern="interface.*Config", type="ts" (find type definitions)
|
|
61
|
+
- Grep: pattern="class.*Settings", type="ts" (alternative pattern)
|
|
62
|
+
- Read: src/index.ts (verify entry point structure)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Turn 2 (CONVERGE):**
|
|
66
|
+
Return only confirmed matches:
|
|
67
|
+
```
|
|
68
|
+
src/config/app.ts:1-45 -- main config export with environment settings
|
|
69
|
+
src/config/types.ts:10-30 -- Config interface definition
|
|
70
|
+
src/utils/settings.ts:1-20 -- Settings helper functions
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### DO NOT: Sequential search (antipattern, 5+ turns)
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
Turn 1: Glob for config files
|
|
77
|
+
Turn 2: Read the first file
|
|
78
|
+
Turn 3: Grep for config patterns
|
|
79
|
+
Turn 4: Read results
|
|
80
|
+
Turn 5: Another Grep search
|
|
81
|
+
... (narrating each step)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
This pattern wastes tokens and breaks context efficiency.
|
|
85
|
+
|
|
86
|
+
## Fallback
|
|
87
|
+
|
|
88
|
+
Search dependency directories **only when not found in app code**:
|
|
89
|
+
- `node_modules/` — npm packages
|
|
90
|
+
- `vendor/` — vendored dependencies
|
|
91
|
+
- `site-packages/` — Python packages
|
|
92
|
+
|
|
93
|
+
Fallback instruction: "Check node_modules/ only if target not found in src/ or lib/"
|
|
94
|
+
|
|
30
95
|
## Scope Restrictions
|
|
31
96
|
|
|
32
97
|
MUST only report factual findings: files found, patterns/conventions, integration points.
|