deepflow 0.1.103 → 0.1.104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/bin/install-dynamic-hooks.test.js +461 -0
  2. package/bin/install.js +150 -250
  3. package/bin/lineage-ingest.js +70 -0
  4. package/hooks/df-check-update.js +1 -0
  5. package/hooks/df-command-usage.js +18 -0
  6. package/hooks/df-dashboard-push.js +1 -0
  7. package/hooks/df-execution-history.js +1 -0
  8. package/hooks/df-explore-protocol.js +83 -0
  9. package/hooks/df-explore-protocol.test.js +228 -0
  10. package/hooks/df-hook-event-tags.test.js +127 -0
  11. package/hooks/df-invariant-check.js +1 -0
  12. package/hooks/df-quota-logger.js +1 -0
  13. package/hooks/df-snapshot-guard.js +1 -0
  14. package/hooks/df-spec-lint.js +58 -1
  15. package/hooks/df-spec-lint.test.js +412 -0
  16. package/hooks/df-statusline.js +1 -0
  17. package/hooks/df-subagent-registry.js +1 -0
  18. package/hooks/df-tool-usage.js +13 -3
  19. package/hooks/df-worktree-guard.js +1 -0
  20. package/package.json +1 -1
  21. package/src/commands/df/debate.md +1 -1
  22. package/src/commands/df/eval.md +117 -0
  23. package/src/commands/df/execute.md +1 -1
  24. package/src/commands/df/fix.md +104 -0
  25. package/src/eval/git-memory.js +159 -0
  26. package/src/eval/git-memory.test.js +439 -0
  27. package/src/eval/hypothesis.js +80 -0
  28. package/src/eval/hypothesis.test.js +169 -0
  29. package/src/eval/loop.js +378 -0
  30. package/src/eval/loop.test.js +306 -0
  31. package/src/eval/metric-collector.js +163 -0
  32. package/src/eval/metric-collector.test.js +369 -0
  33. package/src/eval/metric-pivot.js +119 -0
  34. package/src/eval/metric-pivot.test.js +350 -0
  35. package/src/eval/mutator-prompt.js +106 -0
  36. package/src/eval/mutator-prompt.test.js +180 -0
  37. package/templates/config-template.yaml +5 -0
  38. package/templates/eval-fixture-template/config.yaml +39 -0
  39. package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
  40. package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
  41. package/templates/eval-fixture-template/fixture/package.json +12 -0
  42. package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
  43. package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
  44. package/templates/eval-fixture-template/fixture/src/config.js +40 -0
  45. package/templates/eval-fixture-template/fixture/src/index.js +19 -0
  46. package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
  47. package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
  48. package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
  49. package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
  50. package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
  51. package/templates/eval-fixture-template/hypotheses.md +14 -0
  52. package/templates/eval-fixture-template/spec.md +34 -0
  53. package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
  54. package/templates/eval-fixture-template/tests/guard.test.js +108 -0
  55. package/templates/eval-fixture-template.test.js +318 -0
  56. package/templates/explore-agent.md +5 -74
  57. package/templates/explore-protocol.md +44 -0
  58. package/templates/spec-template.md +4 -0
@@ -0,0 +1,412 @@
1
+ /**
2
+ * Tests for hooks/df-spec-lint.js
3
+ *
4
+ * Validates that computeLayer, validateSpec, and extractSection correctly
5
+ * handle YAML frontmatter (including derives-from fields) without
6
+ * misinterpreting frontmatter lines as section headers.
7
+ *
8
+ * Uses Node.js built-in node:test to avoid adding dependencies.
9
+ */
10
+
11
+ 'use strict';
12
+
13
+ const { test, describe } = require('node:test');
14
+ const assert = require('node:assert/strict');
15
+ const fs = require('fs');
16
+ const path = require('path');
17
+ const os = require('os');
18
+
19
+ const { computeLayer, validateSpec, extractSection, parseFrontmatter } = require('./df-spec-lint');
20
+
21
+ // ---------------------------------------------------------------------------
22
+ // Helpers
23
+ // ---------------------------------------------------------------------------
24
+
25
+ /** Minimal L0 spec (just Objective) */
26
+ function minimalSpec(objective = 'Build the thing') {
27
+ return `## Objective\n${objective}\n`;
28
+ }
29
+
30
+ /** Full L3 spec with all required sections */
31
+ function fullSpec() {
32
+ return [
33
+ '## Objective',
34
+ 'Build the thing',
35
+ '',
36
+ '## Requirements',
37
+ '- REQ-1: Do something',
38
+ '',
39
+ '## Constraints',
40
+ 'Must be fast',
41
+ '',
42
+ '## Out of Scope',
43
+ 'Not doing X',
44
+ '',
45
+ '## Acceptance Criteria',
46
+ '- [ ] REQ-1 works',
47
+ '',
48
+ '## Technical Notes',
49
+ 'Use module Y',
50
+ ].join('\n');
51
+ }
52
+
53
+ /** Wrap content with YAML frontmatter */
54
+ function withFrontmatter(body, fields = {}) {
55
+ const yamlLines = Object.entries(fields).map(([k, v]) => `${k}: ${v}`);
56
+ return ['---', ...yamlLines, '---', '', body].join('\n');
57
+ }
58
+
59
+ // ---------------------------------------------------------------------------
60
+ // computeLayer — frontmatter handling
61
+ // ---------------------------------------------------------------------------
62
+
63
+ describe('computeLayer', () => {
64
+ test('returns L0 for spec with only Objective', () => {
65
+ assert.equal(computeLayer(minimalSpec()), 0);
66
+ });
67
+
68
+ test('returns L0 when frontmatter with derives-from precedes Objective', () => {
69
+ const content = withFrontmatter(minimalSpec(), {
70
+ 'derives-from': 'done-auth',
71
+ });
72
+ assert.equal(computeLayer(content), 0);
73
+ });
74
+
75
+ test('returns L3 for full spec with derives-from frontmatter', () => {
76
+ const content = withFrontmatter(fullSpec(), {
77
+ 'derives-from': 'done-auth',
78
+ name: 'spec-lineage',
79
+ });
80
+ assert.equal(computeLayer(content), 3);
81
+ });
82
+
83
+ test('frontmatter --- lines are not counted as section headers', () => {
84
+ // If --- were mistaken for headers, layer computation would break.
85
+ // Verify that a spec with frontmatter computes same layer as without.
86
+ const bare = fullSpec();
87
+ const wrapped = withFrontmatter(fullSpec(), {
88
+ 'derives-from': 'done-auth',
89
+ });
90
+ assert.equal(computeLayer(bare), computeLayer(wrapped));
91
+ });
92
+
93
+ test('derives-from value is not mistaken for a section name', () => {
94
+ // derives-from: done-auth — should not create a phantom header
95
+ const content = withFrontmatter(minimalSpec(), {
96
+ 'derives-from': 'done-auth',
97
+ });
98
+ // Still L0, not some higher layer from phantom headers
99
+ assert.equal(computeLayer(content), 0);
100
+ });
101
+
102
+ test('returns -1 when frontmatter exists but no Objective section', () => {
103
+ const content = withFrontmatter('Just some text, no headings.', {
104
+ 'derives-from': 'done-auth',
105
+ });
106
+ assert.equal(computeLayer(content), -1);
107
+ });
108
+ });
109
+
110
+ // ---------------------------------------------------------------------------
111
+ // validateSpec — frontmatter handling
112
+ // ---------------------------------------------------------------------------
113
+
114
+ describe('validateSpec with frontmatter', () => {
115
+ test('full spec with derives-from frontmatter produces no hard errors', () => {
116
+ const content = withFrontmatter(fullSpec(), {
117
+ 'derives-from': 'done-auth',
118
+ });
119
+ const result = validateSpec(content);
120
+ assert.deepEqual(result.hard, []);
121
+ });
122
+
123
+ test('layer is correctly reported when frontmatter is present', () => {
124
+ const content = withFrontmatter(fullSpec(), {
125
+ 'derives-from': 'done-auth',
126
+ });
127
+ const result = validateSpec(content);
128
+ assert.equal(result.layer, 3);
129
+ });
130
+
131
+ test('L0 spec with frontmatter reports missing sections as advisory only', () => {
132
+ const content = withFrontmatter(minimalSpec(), {
133
+ 'derives-from': 'done-auth',
134
+ });
135
+ const result = validateSpec(content);
136
+ // L0 only requires Objective — everything else is advisory
137
+ assert.deepEqual(result.hard, []);
138
+ assert.ok(result.advisory.length > 0, 'should have advisory warnings for missing sections');
139
+ });
140
+
141
+ test('frontmatter --- delimiters do not appear in hard or advisory messages', () => {
142
+ const content = withFrontmatter(fullSpec(), {
143
+ 'derives-from': 'done-auth',
144
+ });
145
+ const result = validateSpec(content);
146
+ const allMessages = [...result.hard, ...result.advisory];
147
+ for (const msg of allMessages) {
148
+ assert.ok(!msg.includes('---'), `Unexpected --- in message: ${msg}`);
149
+ }
150
+ });
151
+ });
152
+
153
+ // ---------------------------------------------------------------------------
154
+ // extractSection — frontmatter handling
155
+ // ---------------------------------------------------------------------------
156
+
157
+ describe('extractSection with frontmatter', () => {
158
+ test('extracts Objective section when frontmatter is present', () => {
159
+ const content = withFrontmatter(minimalSpec('Build the thing'), {
160
+ 'derives-from': 'done-auth',
161
+ });
162
+ const section = extractSection(content, 'Objective');
163
+ assert.ok(section !== null, 'Objective section should be found');
164
+ assert.ok(section.includes('Build the thing'));
165
+ });
166
+
167
+ test('extracts Requirements section with frontmatter', () => {
168
+ const content = withFrontmatter(fullSpec(), {
169
+ 'derives-from': 'done-auth',
170
+ });
171
+ const section = extractSection(content, 'Requirements');
172
+ assert.ok(section !== null);
173
+ assert.ok(section.includes('REQ-1'));
174
+ });
175
+
176
+ test('frontmatter content does not leak into extracted sections', () => {
177
+ const content = withFrontmatter(fullSpec(), {
178
+ 'derives-from': 'done-auth',
179
+ description: 'A spec about things',
180
+ });
181
+ const objective = extractSection(content, 'Objective');
182
+ assert.ok(objective !== null);
183
+ assert.ok(!objective.includes('derives-from'));
184
+ assert.ok(!objective.includes('done-auth'));
185
+ assert.ok(!objective.includes('description'));
186
+ });
187
+
188
+ test('returns null for non-existent section even with frontmatter', () => {
189
+ const content = withFrontmatter(minimalSpec(), {
190
+ 'derives-from': 'done-auth',
191
+ });
192
+ const section = extractSection(content, 'Nonexistent');
193
+ assert.equal(section, null);
194
+ });
195
+
196
+ test('extracts section using alias when frontmatter is present', () => {
197
+ const content = withFrontmatter(
198
+ '## Goal\nDo the thing\n\n## Requirements\n- REQ-1: stuff\n',
199
+ { 'derives-from': 'done-auth' }
200
+ );
201
+ // 'goal' is an alias for 'Objective'
202
+ const section = extractSection(content, 'Objective');
203
+ assert.ok(section !== null, 'Should find section via alias "Goal"');
204
+ assert.ok(section.includes('Do the thing'));
205
+ });
206
+ });
207
+
208
+ // ---------------------------------------------------------------------------
209
+ // Edge cases — frontmatter-like patterns inside body
210
+ // ---------------------------------------------------------------------------
211
+
212
+ describe('frontmatter edge cases', () => {
213
+ test('--- inside spec body (e.g. horizontal rule) does not break computeLayer', () => {
214
+ const content = [
215
+ '---',
216
+ 'derives-from: done-auth',
217
+ '---',
218
+ '',
219
+ '## Objective',
220
+ 'Build it',
221
+ '',
222
+ '---',
223
+ '',
224
+ '## Requirements',
225
+ '- REQ-1: Something',
226
+ ].join('\n');
227
+ // Should at least be L1 (has Objective + Requirements)
228
+ assert.ok(computeLayer(content) >= 1);
229
+ });
230
+
231
+ test('multiple derives-from fields in frontmatter do not affect layer', () => {
232
+ const content = withFrontmatter(fullSpec(), {
233
+ 'derives-from': 'done-auth, done-payments',
234
+ });
235
+ assert.equal(computeLayer(content), 3);
236
+ });
237
+ });
238
+
239
+ // ---------------------------------------------------------------------------
240
+ // parseFrontmatter — direct unit tests
241
+ // ---------------------------------------------------------------------------
242
+
243
+ describe('parseFrontmatter', () => {
244
+ test('parses key-value pairs and returns body without frontmatter', () => {
245
+ const content = [
246
+ '---',
247
+ 'derives-from: done-auth',
248
+ 'name: spec-lineage',
249
+ '---',
250
+ '',
251
+ '## Objective',
252
+ 'Build it',
253
+ ].join('\n');
254
+ const { frontmatter, body } = parseFrontmatter(content);
255
+ assert.equal(frontmatter['derives-from'], 'done-auth');
256
+ assert.equal(frontmatter['name'], 'spec-lineage');
257
+ assert.ok(body.includes('## Objective'));
258
+ assert.ok(body.includes('Build it'));
259
+ });
260
+
261
+ test('returns empty frontmatter and full body when no --- opener', () => {
262
+ const content = '## Objective\nBuild it\n';
263
+ const { frontmatter, body } = parseFrontmatter(content);
264
+ assert.deepEqual(frontmatter, {});
265
+ assert.equal(body, content);
266
+ });
267
+
268
+ test('returns empty frontmatter when opening --- exists but no closing ---', () => {
269
+ const content = '---\nderives-from: done-auth\n## Objective\nBuild it\n';
270
+ const { frontmatter, body } = parseFrontmatter(content);
271
+ assert.deepEqual(frontmatter, {});
272
+ assert.equal(body, content);
273
+ });
274
+
275
+ test('handles empty frontmatter block (--- immediately followed by ---)', () => {
276
+ const content = ['---', '---', '', '## Objective', 'Build it'].join('\n');
277
+ const { frontmatter, body } = parseFrontmatter(content);
278
+ assert.deepEqual(frontmatter, {});
279
+ assert.ok(body.includes('## Objective'));
280
+ });
281
+
282
+ test('trims whitespace from keys and values', () => {
283
+ const content = [
284
+ '---',
285
+ ' derives-from : done-auth ',
286
+ '---',
287
+ '',
288
+ '## Objective',
289
+ 'Build it',
290
+ ].join('\n');
291
+ const { frontmatter } = parseFrontmatter(content);
292
+ assert.equal(frontmatter['derives-from'], 'done-auth');
293
+ });
294
+
295
+ test('handles empty string input', () => {
296
+ const { frontmatter, body } = parseFrontmatter('');
297
+ assert.deepEqual(frontmatter, {});
298
+ assert.equal(body, '');
299
+ });
300
+
301
+ test('body does not include frontmatter delimiters', () => {
302
+ const content = withFrontmatter('## Objective\nBuild it', {
303
+ 'derives-from': 'done-auth',
304
+ });
305
+ const { body } = parseFrontmatter(content);
306
+ // Body should not start with ---
307
+ assert.ok(!body.trimStart().startsWith('---'));
308
+ });
309
+
310
+ test('handles value containing colons', () => {
311
+ const content = [
312
+ '---',
313
+ 'description: a spec: with colons: inside',
314
+ '---',
315
+ '',
316
+ 'body',
317
+ ].join('\n');
318
+ const { frontmatter } = parseFrontmatter(content);
319
+ assert.equal(frontmatter['description'], 'a spec: with colons: inside');
320
+ });
321
+ });
322
+
323
+ // ---------------------------------------------------------------------------
324
+ // derives-from validation in validateSpec
325
+ // ---------------------------------------------------------------------------
326
+
327
+ describe('derives-from validation', () => {
328
+ test('spec without derives-from produces no derives-from advisory', () => {
329
+ const content = fullSpec();
330
+ const result = validateSpec(content);
331
+ const derivesAdvisory = result.advisory.filter((m) => m.includes('derives-from'));
332
+ assert.equal(derivesAdvisory.length, 0);
333
+ });
334
+
335
+ test('derives-from with no specsDir skips reference check (no warning)', () => {
336
+ const content = withFrontmatter(fullSpec(), {
337
+ 'derives-from': 'nonexistent-spec',
338
+ });
339
+ // No specsDir passed — cannot verify, should not warn
340
+ const result = validateSpec(content);
341
+ const derivesAdvisory = result.advisory.filter((m) => m.includes('derives-from'));
342
+ assert.equal(derivesAdvisory.length, 0);
343
+ });
344
+
345
+ test('derives-from referencing missing spec emits advisory warning, not hard error', () => {
346
+ const content = withFrontmatter(fullSpec(), {
347
+ 'derives-from': 'nonexistent-spec',
348
+ });
349
+ // Not a temp dir: an existing directory (../templates) with no file matching the referenced spec name
350
+ const tmpDir = path.join(__dirname, '..', 'templates');
351
+ const result = validateSpec(content, { specsDir: tmpDir });
352
+ // A missing reference must surface in advisory only, never in hard
353
+ const derivesHard = result.hard.filter((m) => m.includes('derives-from'));
354
+ assert.equal(derivesHard.length, 0, 'missing derives-from reference must not be a hard error');
355
+ const derivesAdvisory = result.advisory.filter((m) => m.includes('derives-from'));
356
+ assert.ok(derivesAdvisory.length > 0, 'should emit advisory warning for missing reference');
357
+ });
358
+
359
+ test('advisory message includes the referenced spec name', () => {
360
+ const content = withFrontmatter(fullSpec(), {
361
+ 'derives-from': 'phantom-spec',
362
+ });
363
+ const tmpDir = path.join(__dirname, '..', 'templates');
364
+ const result = validateSpec(content, { specsDir: tmpDir });
365
+ const derivesAdvisory = result.advisory.filter((m) => m.includes('derives-from'));
366
+ assert.ok(
367
+ derivesAdvisory.some((m) => m.includes('phantom-spec')),
368
+ 'advisory should mention the referenced spec name'
369
+ );
370
+ });
371
+
372
+ test('derives-from referencing existing spec file produces no advisory', () => {
373
+ // Create a temp specs dir with a matching file
374
+ const tmpDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'spec-lint-test-'));
375
+ try {
376
+ fs.writeFileSync(path.join(tmpDir, 'done-auth.md'), '## Objective\nAuth\n');
377
+ const content = withFrontmatter(fullSpec(), {
378
+ 'derives-from': 'done-auth',
379
+ });
380
+ const result = validateSpec(content, { specsDir: tmpDir });
381
+ const derivesAdvisory = result.advisory.filter((m) => m.includes('derives-from'));
382
+ assert.equal(derivesAdvisory.length, 0, 'should not warn when reference exists');
383
+ } finally {
384
+ fs.rmSync(tmpDir, { recursive: true, force: true });
385
+ }
386
+ });
387
+
388
+ test('derives-from resolves done- prefixed files', () => {
389
+ // Reference "auth" but file is "done-auth.md" — should resolve
390
+ const tmpDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'spec-lint-test-'));
391
+ try {
392
+ fs.writeFileSync(path.join(tmpDir, 'done-auth.md'), '## Objective\nAuth\n');
393
+ const content = withFrontmatter(fullSpec(), {
394
+ 'derives-from': 'auth',
395
+ });
396
+ const result = validateSpec(content, { specsDir: tmpDir });
397
+ const derivesAdvisory = result.advisory.filter((m) => m.includes('derives-from'));
398
+ assert.equal(derivesAdvisory.length, 0, 'should resolve done- prefixed file');
399
+ } finally {
400
+ fs.rmSync(tmpDir, { recursive: true, force: true });
401
+ }
402
+ });
403
+
404
+ test('spec layer and hard errors are unaffected by derives-from presence', () => {
405
+ const withDerives = withFrontmatter(fullSpec(), { 'derives-from': 'done-auth' });
406
+ const without = fullSpec();
407
+ const resultWith = validateSpec(withDerives);
408
+ const resultWithout = validateSpec(without);
409
+ assert.equal(resultWith.layer, resultWithout.layer);
410
+ assert.deepEqual(resultWith.hard, resultWithout.hard);
411
+ });
412
+ });
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env node
2
+ // @hook-event: statusLine
2
3
  /**
3
4
  * deepflow statusline for Claude Code
4
5
  * Displays: update | model | project | context usage
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env node
2
+ // @hook-event: SubagentStop
2
3
  'use strict';
3
4
  const fs = require('fs');
4
5
  const path = require('path');
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env node
2
+ // @hook-event: PostToolUse
2
3
  /**
3
4
  * deepflow tool usage logger
4
5
  * Logs every PostToolUse event to ~/.claude/tool-usage.jsonl for token instrumentation.
@@ -67,13 +68,22 @@ process.stdin.on('end', () => {
67
68
  activeCommand = JSON.parse(markerRaw).command || null;
68
69
  } catch (_e) { /* no marker or unreadable — null */ }
69
70
 
71
+ // Extract a compact tool_input summary per tool type
72
+ const ti = data.tool_input || {};
73
+ let inputSummary = null;
74
+ if (toolName === 'Bash') inputSummary = ti.command || null;
75
+ else if (toolName === 'LSP') inputSummary = `${ti.operation || '?'}:${(ti.filePath || '').split('/').pop()}:${ti.line || '?'}`;
76
+ else if (toolName === 'Read') inputSummary = (ti.file_path || '').split('/').pop() + (ti.offset ? `:${ti.offset}-${ti.offset + (ti.limit || 0)}` : '');
77
+ else if (toolName === 'Grep') inputSummary = ti.pattern || null;
78
+ else if (toolName === 'Glob') inputSummary = ti.pattern || null;
79
+ else if (toolName === 'Agent') inputSummary = `${ti.subagent_type || '?'}/${ti.model || '?'}`;
80
+ else if (toolName === 'Edit' || toolName === 'Write') inputSummary = (ti.file_path || '').split('/').pop();
81
+
70
82
  const record = {
71
83
  timestamp: new Date().toISOString(),
72
84
  session_id: data.session_id || null,
73
85
  tool_name: toolName,
74
- command: (toolName === 'Bash' && data.tool_input && data.tool_input.command != null)
75
- ? data.tool_input.command
76
- : null,
86
+ input: inputSummary,
77
87
  output_size_est_tokens: Math.ceil(JSON.stringify(toolResponse).length / 4),
78
88
  project: cwd ? path.basename(cwd) : null,
79
89
  phase: inferPhase(cwd),
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env node
2
+ // @hook-event: PostToolUse
2
3
  /**
3
4
  * deepflow worktree guard
4
5
  * PostToolUse hook: blocks Write/Edit to main-branch files when a df/* worktree exists.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "deepflow",
3
- "version": "0.1.103",
3
+ "version": "0.1.104",
4
4
  "description": "Doing reveals what thinking can't predict — spec-driven iterative development for Claude Code",
5
5
  "keywords": [
6
6
  "claude",
@@ -30,7 +30,7 @@ Coordinate reasoner agents to debate a problem from multiple perspectives, then
30
30
  Summarize conversation context in ~200 words: core problem, requirements, constraints, user priorities. Passed to each perspective agent.
31
31
 
32
32
  ### 2. GATHER CODEBASE CONTEXT
33
- Glob/Grep/Read relevant files (up to 5-6, focus on core logic). Produce ~300 word codebase summary: what exists, key interfaces, current limitations, dependencies. Passed to every agent.
33
+ Glob/Grep to locate relevant files (up to 5-6, focus on core logic). For each file, prefer LSP documentSymbol to understand its structure, then Read with offset/limit on the relevant ranges only (never read full files). Produce ~300 word codebase summary: what exists, key interfaces, current limitations, dependencies. Passed to every agent.
34
34
 
35
35
  ### 3. SPAWN PERSPECTIVES
36
36
 
@@ -0,0 +1,117 @@
1
+ ---
2
+ name: df:eval
3
+ description: Evaluate a skill or command against a benchmark suite, or scaffold a new benchmark directory
4
+ allowed-tools: [Read, Bash, Write, Glob, Grep]
5
+ ---
6
+
7
+ # /df:eval — Skill Evaluation
8
+
9
+ Run a benchmark suite against a skill/command, or scaffold a new benchmark directory.
10
+
11
+ ## Usage
12
+
13
+ ```
14
+ /df:eval --scaffold benchmarks/<name>/ # Create benchmark directory structure
15
+ /df:eval benchmarks/<name>/ # Run benchmark suite (reads hypotheses.md)
16
+ /df:eval benchmarks/<name>/ --hypothesis "reduce token use" # Override hypothesis explicitly
17
+ ```
18
+
19
+ ## Subcommands
20
+
21
+ ### `--scaffold <target-dir>`
22
+
23
+ Creates a benchmark directory from the fixture template at `templates/eval-fixture-template/`.
24
+
25
+ **What gets created:**
26
+
27
+ ```
28
+ <target-dir>/
29
+ fixture/ # Minimal repo fixture (hooks, specs, src, package.json)
30
+ tests/ # Behavior and guard test files
31
+ spec.md # Benchmark objective and acceptance criteria
32
+ config.yaml # Benchmark configuration (skill under test, thresholds)
33
+ hypotheses.md # Hypotheses to validate
34
+ ```
35
+
36
+ **Steps:**
37
+
38
+ 1. Validate `<target-dir>` argument is provided; abort with usage hint if missing.
39
+ 2. Check `<target-dir>` does not already exist; abort with error if it does.
40
+ 3. Copy `templates/eval-fixture-template/` recursively to `<target-dir>`.
41
+ 4. Confirm with summary:
42
+
43
+ ```
44
+ Created benchmark scaffold at <target-dir>/
45
+ fixture/ - minimal repo fixture
46
+ tests/ - behavior.test.js, guard.test.js
47
+ spec.md - edit to define benchmark objective
48
+ config.yaml - edit to set skill under test and thresholds
49
+ hypotheses.md - edit to define hypotheses
50
+
51
+ Next: edit spec.md and config.yaml, then run /df:eval <target-dir>/
52
+ ```
53
+
54
+ **Implementation:**
55
+
56
+ ```bash
57
+ # Parse --scaffold flag and target dir from $ARGUMENTS, trimming surrounding whitespace
58
+ # e.g. /df:eval --scaffold benchmarks/my-bench/
59
+ ARGS="$ARGUMENTS"
60
+ TARGET=$(printf '%s' "$ARGS" | sed -e 's/--scaffold[[:space:]]*//' -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
61
+ TEMPLATE="templates/eval-fixture-template"
62
+
63
+ if [ -z "$TARGET" ]; then
64
+ echo "Error: target directory required. Usage: /df:eval --scaffold benchmarks/<name>/"
65
+ exit 1
66
+ fi
67
+
68
+ if [ -e "$TARGET" ]; then
69
+ echo "Error: $TARGET already exists."
70
+ exit 1
71
+ fi
72
+
73
+ mkdir -p "$(dirname "$TARGET")" && cp -r "$TEMPLATE/" "$TARGET" || exit 1
74
+ echo "Created benchmark scaffold at $TARGET"
75
+ ```
76
+
77
+ ### `--hypothesis <text>`
78
+
79
+ Overrides the mutation hypothesis for the eval session. Without this flag the
80
+ loop reads `{benchDir}/hypotheses.md` and uses the first list item it finds.
81
+
82
+ **Hypothesis resolution order:**
83
+
84
+ 1. `--hypothesis "<text>"` flag value — used as-is.
85
+ 2. `{benchDir}/hypotheses.md` first list item (ordered or unordered markdown list).
86
+ 3. Error if neither source is available.
87
+
88
+ **Module:** `src/eval/hypothesis.js` — `loadHypothesis({ flag, benchDir })`
89
+
90
+ ---
91
+
92
+ ## Main Eval Loop (T9 — implemented)
93
+
94
+ Running `/df:eval benchmarks/<name>/` without `--scaffold` runs the Karpathy loop:
95
+
96
+ 1. Load `benchmarks/<name>/config.yaml` — skill under test, thresholds, iteration count
97
+ 2. Resolve hypothesis via `--hypothesis` flag or `benchmarks/<name>/hypotheses.md` (first list item)
98
+ 3. Create a worktree-isolated branch for the session (`eval/<skill>/<timestamp>`)
99
+ 4. **Loop** (runs until interrupted with Ctrl+C, or until the limit given by `--loop N` is reached):
100
+ a. Mutate skill file via agent prompt built from current content + history
101
+ b. Commit experiment (`status:pending`)
102
+ c. Run guard check (build + test commands from config)
103
+ - Guard fail → `git revert`, log `status:guard_fail`, next iteration
104
+ d. Collect metrics from `.deepflow/` JSONL files
105
+ e. Compare target metric against baseline
106
+ - Improved → log `status:kept`, update baseline
107
+ - Regression → `git revert`, log `status:reverted`
108
+ f. Record secondary metrics in commit message (never influence keep/revert)
109
+
110
+ **Implementation:** `src/eval/loop.js` (`runEvalLoop`), `src/eval/hypothesis.js` (`loadHypothesis`)
111
+
112
+ ## Rules
113
+
114
+ - `--scaffold` never overwrites an existing directory
115
+ - Template is always copied from `templates/eval-fixture-template/`
116
+ - Main eval loop is non-deterministic by design — it samples skill behavior across N runs
117
+ - No LLM judges another LLM — only objective metrics (file diffs, test results, token counts) are used
@@ -376,7 +376,7 @@ Success criteria: {ACs from spec relevant to this task}
376
376
  {TASK_DETAIL if available, else inline block:}
377
377
  Impact: Callers: {file} ({why}) | Duplicates: [active→consolidate] [dead→DELETE] | Data flow: {consumers}
378
378
  Prior tasks: {dep_id}: {summary}
379
- Steps: 1. chub search/get for APIs 2. LSP findReferences, add unlisted callers 3. Read all Impact files 4. Implement 5. Commit
379
+ Steps: 1. chub search/get for APIs 2. LSP findReferences, add unlisted callers 3. LSP documentSymbol on Impact files → Read with offset/limit on relevant ranges only (never read full files) 4. Implement 5. Commit
380
380
  --- END ---
381
381
  Duplicates: [active]→consolidate [dead]→DELETE. ONLY job: code+commit. No merge/rename/checkout.
382
382
  Last line of your response MUST be: TASK_STATUS:pass (if successful) or TASK_STATUS:fail (if failed) or TASK_STATUS:revert (if reverted)