deepflow 0.1.103 → 0.1.104
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install-dynamic-hooks.test.js +461 -0
- package/bin/install.js +150 -250
- package/bin/lineage-ingest.js +70 -0
- package/hooks/df-check-update.js +1 -0
- package/hooks/df-command-usage.js +18 -0
- package/hooks/df-dashboard-push.js +1 -0
- package/hooks/df-execution-history.js +1 -0
- package/hooks/df-explore-protocol.js +83 -0
- package/hooks/df-explore-protocol.test.js +228 -0
- package/hooks/df-hook-event-tags.test.js +127 -0
- package/hooks/df-invariant-check.js +1 -0
- package/hooks/df-quota-logger.js +1 -0
- package/hooks/df-snapshot-guard.js +1 -0
- package/hooks/df-spec-lint.js +58 -1
- package/hooks/df-spec-lint.test.js +412 -0
- package/hooks/df-statusline.js +1 -0
- package/hooks/df-subagent-registry.js +1 -0
- package/hooks/df-tool-usage.js +13 -3
- package/hooks/df-worktree-guard.js +1 -0
- package/package.json +1 -1
- package/src/commands/df/debate.md +1 -1
- package/src/commands/df/eval.md +117 -0
- package/src/commands/df/execute.md +1 -1
- package/src/commands/df/fix.md +104 -0
- package/src/eval/git-memory.js +159 -0
- package/src/eval/git-memory.test.js +439 -0
- package/src/eval/hypothesis.js +80 -0
- package/src/eval/hypothesis.test.js +169 -0
- package/src/eval/loop.js +378 -0
- package/src/eval/loop.test.js +306 -0
- package/src/eval/metric-collector.js +163 -0
- package/src/eval/metric-collector.test.js +369 -0
- package/src/eval/metric-pivot.js +119 -0
- package/src/eval/metric-pivot.test.js +350 -0
- package/src/eval/mutator-prompt.js +106 -0
- package/src/eval/mutator-prompt.test.js +180 -0
- package/templates/config-template.yaml +5 -0
- package/templates/eval-fixture-template/config.yaml +39 -0
- package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
- package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
- package/templates/eval-fixture-template/fixture/package.json +12 -0
- package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
- package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
- package/templates/eval-fixture-template/fixture/src/config.js +40 -0
- package/templates/eval-fixture-template/fixture/src/index.js +19 -0
- package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
- package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
- package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
- package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
- package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
- package/templates/eval-fixture-template/hypotheses.md +14 -0
- package/templates/eval-fixture-template/spec.md +34 -0
- package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
- package/templates/eval-fixture-template/tests/guard.test.js +108 -0
- package/templates/eval-fixture-template.test.js +318 -0
- package/templates/explore-agent.md +5 -74
- package/templates/explore-protocol.md +44 -0
- package/templates/spec-template.md +4 -0
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for hooks/df-spec-lint.js
|
|
3
|
+
*
|
|
4
|
+
* Validates that computeLayer, validateSpec, and extractSection correctly
|
|
5
|
+
* handle YAML frontmatter (including derives-from fields) without
|
|
6
|
+
* misinterpreting frontmatter lines as section headers.
|
|
7
|
+
*
|
|
8
|
+
* Uses Node.js built-in node:test to avoid adding dependencies.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
'use strict';
|
|
12
|
+
|
|
13
|
+
const { test, describe } = require('node:test');
|
|
14
|
+
const assert = require('node:assert/strict');
|
|
15
|
+
const fs = require('fs');
|
|
16
|
+
const path = require('path');
|
|
17
|
+
const os = require('os');
|
|
18
|
+
|
|
19
|
+
const { computeLayer, validateSpec, extractSection, parseFrontmatter } = require('./df-spec-lint');
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Helpers
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
/**
 * Build the smallest spec body used in these tests: an `## Objective`
 * heading followed by a single line of text (layer L0).
 *
 * @param {string} [objective='Build the thing'] - Text placed under the heading.
 * @returns {string} Markdown for the spec, terminated by a newline.
 */
function minimalSpec(objective = 'Build the thing') {
  const heading = '## Objective\n';
  return heading.concat(objective, '\n');
}
|
|
29
|
+
|
|
30
|
+
/**
 * Build a complete (layer L3) spec body containing every section the
 * tests rely on, each as a heading plus one line of content.
 *
 * @returns {string} Markdown with sections separated by a blank line and
 *   no trailing newline.
 */
function fullSpec() {
  // [heading line, content line] for each section, in document order.
  const sections = [
    ['## Objective', 'Build the thing'],
    ['## Requirements', '- REQ-1: Do something'],
    ['## Constraints', 'Must be fast'],
    ['## Out of Scope', 'Not doing X'],
    ['## Acceptance Criteria', '- [ ] REQ-1 works'],
    ['## Technical Notes', 'Use module Y'],
  ];
  return sections.map((pair) => pair.join('\n')).join('\n\n');
}
|
|
52
|
+
|
|
53
|
+
/**
 * Prefix a spec body with a `---`-delimited frontmatter block.
 *
 * Each entry of `fields` is rendered as a `key: value` line (no quoting or
 * escaping is applied), followed by the closing delimiter, one blank line,
 * and then the body.
 *
 * @param {string} body - Markdown placed after the frontmatter.
 * @param {Object<string, *>} [fields={}] - Frontmatter entries, emitted in
 *   insertion order.
 * @returns {string} The combined document.
 */
function withFrontmatter(body, fields = {}) {
  const doc = ['---'];
  for (const [key, value] of Object.entries(fields)) {
    doc.push(`${key}: ${value}`);
  }
  doc.push('---', '', body);
  return doc.join('\n');
}
|
|
58
|
+
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
// computeLayer — frontmatter handling
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
describe('computeLayer', () => {
  // Frontmatter shared by most cases below: a single lineage pointer.
  const lineage = { 'derives-from': 'done-auth' };

  test('returns L0 for spec with only Objective', () => {
    const layer = computeLayer(minimalSpec());
    assert.equal(layer, 0);
  });

  test('returns L0 when frontmatter with derives-from precedes Objective', () => {
    const layer = computeLayer(withFrontmatter(minimalSpec(), lineage));
    assert.equal(layer, 0);
  });

  test('returns L3 for full spec with derives-from frontmatter', () => {
    const fields = { ...lineage, name: 'spec-lineage' };
    const layer = computeLayer(withFrontmatter(fullSpec(), fields));
    assert.equal(layer, 3);
  });

  test('frontmatter --- lines are not counted as section headers', () => {
    // A delimiter mistaken for a header would shift the computed layer, so
    // the same body must score identically with and without frontmatter.
    const bareLayer = computeLayer(fullSpec());
    const wrappedLayer = computeLayer(withFrontmatter(fullSpec(), lineage));
    assert.equal(bareLayer, wrappedLayer);
  });

  test('derives-from value is not mistaken for a section name', () => {
    // "done-auth" must not surface as a phantom header that raises the
    // layer above L0.
    const layer = computeLayer(withFrontmatter(minimalSpec(), lineage));
    assert.equal(layer, 0);
  });

  test('returns -1 when frontmatter exists but no Objective section', () => {
    const headless = withFrontmatter('Just some text, no headings.', lineage);
    assert.equal(computeLayer(headless), -1);
  });
});
|
|
109
|
+
|
|
110
|
+
// ---------------------------------------------------------------------------
|
|
111
|
+
// validateSpec — frontmatter handling
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
|
|
114
|
+
describe('validateSpec with frontmatter', () => {
  // Every case validates a body wrapped in the same one-field frontmatter.
  const lint = (body) =>
    validateSpec(withFrontmatter(body, { 'derives-from': 'done-auth' }));

  test('full spec with derives-from frontmatter produces no hard errors', () => {
    const { hard } = lint(fullSpec());
    assert.deepEqual(hard, []);
  });

  test('layer is correctly reported when frontmatter is present', () => {
    const { layer } = lint(fullSpec());
    assert.equal(layer, 3);
  });

  test('L0 spec with frontmatter reports missing sections as advisory only', () => {
    const { hard, advisory } = lint(minimalSpec());
    // At L0 only Objective is mandatory; the other sections may only warn.
    assert.deepEqual(hard, []);
    assert.ok(advisory.length > 0, 'should have advisory warnings for missing sections');
  });

  test('frontmatter --- delimiters do not appear in hard or advisory messages', () => {
    const { hard, advisory } = lint(fullSpec());
    [...hard, ...advisory].forEach((msg) => {
      assert.ok(!msg.includes('---'), `Unexpected --- in message: ${msg}`);
    });
  });
});
|
|
152
|
+
|
|
153
|
+
// ---------------------------------------------------------------------------
|
|
154
|
+
// extractSection — frontmatter handling
|
|
155
|
+
// ---------------------------------------------------------------------------
|
|
156
|
+
|
|
157
|
+
describe('extractSection with frontmatter', () => {
  // Frontmatter used by every case unless a test adds more fields.
  const lineage = { 'derives-from': 'done-auth' };

  test('extracts Objective section when frontmatter is present', () => {
    const doc = withFrontmatter(minimalSpec('Build the thing'), lineage);
    const section = extractSection(doc, 'Objective');
    assert.ok(section !== null, 'Objective section should be found');
    assert.ok(section.includes('Build the thing'));
  });

  test('extracts Requirements section with frontmatter', () => {
    const section = extractSection(withFrontmatter(fullSpec(), lineage), 'Requirements');
    assert.ok(section !== null);
    assert.ok(section.includes('REQ-1'));
  });

  test('frontmatter content does not leak into extracted sections', () => {
    const doc = withFrontmatter(fullSpec(), {
      ...lineage,
      description: 'A spec about things',
    });
    const objective = extractSection(doc, 'Objective');
    assert.ok(objective !== null);
    // Neither frontmatter keys nor values may show up in the section text.
    for (const leaked of ['derives-from', 'done-auth', 'description']) {
      assert.ok(!objective.includes(leaked));
    }
  });

  test('returns null for non-existent section even with frontmatter', () => {
    const doc = withFrontmatter(minimalSpec(), lineage);
    assert.equal(extractSection(doc, 'Nonexistent'), null);
  });

  test('extracts section using alias when frontmatter is present', () => {
    const body = '## Goal\nDo the thing\n\n## Requirements\n- REQ-1: stuff\n';
    // The document says "Goal", but the lookup asks for its canonical name.
    const section = extractSection(withFrontmatter(body, lineage), 'Objective');
    assert.ok(section !== null, 'Should find section via alias "Goal"');
    assert.ok(section.includes('Do the thing'));
  });
});
|
|
207
|
+
|
|
208
|
+
// ---------------------------------------------------------------------------
|
|
209
|
+
// Edge cases — frontmatter-like patterns inside body
|
|
210
|
+
// ---------------------------------------------------------------------------
|
|
211
|
+
|
|
212
|
+
describe('frontmatter edge cases', () => {
  test('--- inside spec body (e.g. horizontal rule) does not break computeLayer', () => {
    // The body carries its own `---` line between the two sections.
    const body = '## Objective\nBuild it\n\n---\n\n## Requirements\n- REQ-1: Something';
    const doc = withFrontmatter(body, { 'derives-from': 'done-auth' });
    // Objective + Requirements are both present, so the layer is at least L1.
    const layer = computeLayer(doc);
    assert.ok(layer >= 1);
  });

  test('multiple derives-from fields in frontmatter do not affect layer', () => {
    const doc = withFrontmatter(fullSpec(), {
      'derives-from': 'done-auth, done-payments',
    });
    const layer = computeLayer(doc);
    assert.equal(layer, 3);
  });
});
|
|
238
|
+
|
|
239
|
+
// ---------------------------------------------------------------------------
|
|
240
|
+
// parseFrontmatter — direct unit tests
|
|
241
|
+
// ---------------------------------------------------------------------------
|
|
242
|
+
|
|
243
|
+
describe('parseFrontmatter', () => {
  test('parses key-value pairs and returns body without frontmatter', () => {
    const doc = '---\nderives-from: done-auth\nname: spec-lineage\n---\n\n## Objective\nBuild it';
    const parsed = parseFrontmatter(doc);
    assert.equal(parsed.frontmatter['derives-from'], 'done-auth');
    assert.equal(parsed.frontmatter['name'], 'spec-lineage');
    assert.ok(parsed.body.includes('## Objective'));
    assert.ok(parsed.body.includes('Build it'));
  });

  test('returns empty frontmatter and full body when no --- opener', () => {
    const doc = '## Objective\nBuild it\n';
    const parsed = parseFrontmatter(doc);
    assert.deepEqual(parsed.frontmatter, {});
    assert.equal(parsed.body, doc);
  });

  test('returns empty frontmatter when opening --- exists but no closing ---', () => {
    // An unterminated block is expected to come back untouched as body.
    const doc = '---\nderives-from: done-auth\n## Objective\nBuild it\n';
    const parsed = parseFrontmatter(doc);
    assert.deepEqual(parsed.frontmatter, {});
    assert.equal(parsed.body, doc);
  });

  test('handles empty frontmatter block (--- immediately followed by ---)', () => {
    const doc = '---\n---\n\n## Objective\nBuild it';
    const parsed = parseFrontmatter(doc);
    assert.deepEqual(parsed.frontmatter, {});
    assert.ok(parsed.body.includes('## Objective'));
  });

  test('trims whitespace from keys and values', () => {
    const doc = '---\n derives-from : done-auth \n---\n\n## Objective\nBuild it';
    const parsed = parseFrontmatter(doc);
    assert.equal(parsed.frontmatter['derives-from'], 'done-auth');
  });

  test('handles empty string input', () => {
    const parsed = parseFrontmatter('');
    assert.deepEqual(parsed.frontmatter, {});
    assert.equal(parsed.body, '');
  });

  test('body does not include frontmatter delimiters', () => {
    const doc = withFrontmatter('## Objective\nBuild it', {
      'derives-from': 'done-auth',
    });
    const parsed = parseFrontmatter(doc);
    // Ignoring leading whitespace, the body must not open with a delimiter.
    const opensWithDelimiter = parsed.body.trimStart().startsWith('---');
    assert.ok(!opensWithDelimiter);
  });

  test('handles value containing colons', () => {
    // Everything after the first colon is expected to be kept as the value.
    const doc = '---\ndescription: a spec: with colons: inside\n---\n\nbody';
    const parsed = parseFrontmatter(doc);
    assert.equal(parsed.frontmatter['description'], 'a spec: with colons: inside');
  });
});
|
|
322
|
+
|
|
323
|
+
// ---------------------------------------------------------------------------
|
|
324
|
+
// derives-from validation in validateSpec
|
|
325
|
+
// ---------------------------------------------------------------------------
|
|
326
|
+
|
|
327
|
+
describe('derives-from validation', () => {
  /**
   * Run `fn` against a fresh temporary specs directory seeded with `files`
   * (file name → content). The directory is removed afterwards, even when
   * an assertion inside `fn` throws.
   *
   * Using a private temp directory keeps these tests independent of the
   * repository layout: nothing outside the test decides which spec files
   * are present.
   *
   * @param {Object<string, string>} files - Files to create in the directory.
   * @param {(specsDir: string) => void} fn - Test body receiving the directory path.
   */
  function withSpecsDir(files, fn) {
    const specsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'spec-lint-test-'));
    try {
      for (const [name, text] of Object.entries(files)) {
        fs.writeFileSync(path.join(specsDir, name), text);
      }
      fn(specsDir);
    } finally {
      fs.rmSync(specsDir, { recursive: true, force: true });
    }
  }

  /** Keep only the messages that mention the derives-from field. */
  const aboutDerives = (messages) => messages.filter((m) => m.includes('derives-from'));

  test('spec without derives-from produces no derives-from advisory', () => {
    const result = validateSpec(fullSpec());
    assert.equal(aboutDerives(result.advisory).length, 0);
  });

  test('derives-from with no specsDir skips reference check (no warning)', () => {
    const content = withFrontmatter(fullSpec(), {
      'derives-from': 'nonexistent-spec',
    });
    // No specsDir passed — cannot verify, should not warn
    const result = validateSpec(content);
    assert.equal(aboutDerives(result.advisory).length, 0);
  });

  test('derives-from referencing missing spec emits advisory warning, not hard error', () => {
    const content = withFrontmatter(fullSpec(), {
      'derives-from': 'nonexistent-spec',
    });
    // An existing but empty specs directory: the reference cannot resolve.
    withSpecsDir({}, (specsDir) => {
      const result = validateSpec(content, { specsDir });
      // Should be advisory, not hard
      assert.equal(
        aboutDerives(result.hard).length,
        0,
        'missing derives-from reference must not be a hard error'
      );
      assert.ok(
        aboutDerives(result.advisory).length > 0,
        'should emit advisory warning for missing reference'
      );
    });
  });

  test('advisory message includes the referenced spec name', () => {
    const content = withFrontmatter(fullSpec(), {
      'derives-from': 'phantom-spec',
    });
    withSpecsDir({}, (specsDir) => {
      const result = validateSpec(content, { specsDir });
      assert.ok(
        aboutDerives(result.advisory).some((m) => m.includes('phantom-spec')),
        'advisory should mention the referenced spec name'
      );
    });
  });

  test('derives-from referencing existing spec file produces no advisory', () => {
    const content = withFrontmatter(fullSpec(), {
      'derives-from': 'done-auth',
    });
    // The specs dir contains a file whose name matches the reference exactly.
    withSpecsDir({ 'done-auth.md': '## Objective\nAuth\n' }, (specsDir) => {
      const result = validateSpec(content, { specsDir });
      assert.equal(
        aboutDerives(result.advisory).length,
        0,
        'should not warn when reference exists'
      );
    });
  });

  test('derives-from resolves done- prefixed files', () => {
    // Reference "auth" but file is "done-auth.md" — should resolve
    const content = withFrontmatter(fullSpec(), {
      'derives-from': 'auth',
    });
    withSpecsDir({ 'done-auth.md': '## Objective\nAuth\n' }, (specsDir) => {
      const result = validateSpec(content, { specsDir });
      assert.equal(
        aboutDerives(result.advisory).length,
        0,
        'should resolve done- prefixed file'
      );
    });
  });

  test('spec layer and hard errors are unaffected by derives-from presence', () => {
    const resultWith = validateSpec(
      withFrontmatter(fullSpec(), { 'derives-from': 'done-auth' })
    );
    const resultWithout = validateSpec(fullSpec());
    assert.equal(resultWith.layer, resultWithout.layer);
    assert.deepEqual(resultWith.hard, resultWithout.hard);
  });
});
|
package/hooks/df-statusline.js
CHANGED
package/hooks/df-tool-usage.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
// @hook-event: PostToolUse
|
|
2
3
|
/**
|
|
3
4
|
* deepflow tool usage logger
|
|
4
5
|
* Logs every PostToolUse event to ~/.claude/tool-usage.jsonl for token instrumentation.
|
|
@@ -67,13 +68,22 @@ process.stdin.on('end', () => {
|
|
|
67
68
|
activeCommand = JSON.parse(markerRaw).command || null;
|
|
68
69
|
} catch (_e) { /* no marker or unreadable — null */ }
|
|
69
70
|
|
|
71
|
+
// Extract a compact tool_input summary per tool type
|
|
72
|
+
const ti = data.tool_input || {};
|
|
73
|
+
let inputSummary = null;
|
|
74
|
+
if (toolName === 'Bash') inputSummary = ti.command || null;
|
|
75
|
+
else if (toolName === 'LSP') inputSummary = `${ti.operation || '?'}:${(ti.filePath || '').split('/').pop()}:${ti.line || '?'}`;
|
|
76
|
+
else if (toolName === 'Read') inputSummary = (ti.file_path || '').split('/').pop() + (ti.offset ? `:${ti.offset}-${ti.offset + (ti.limit || 0)}` : '');
|
|
77
|
+
else if (toolName === 'Grep') inputSummary = ti.pattern || null;
|
|
78
|
+
else if (toolName === 'Glob') inputSummary = ti.pattern || null;
|
|
79
|
+
else if (toolName === 'Agent') inputSummary = `${ti.subagent_type || '?'}/${ti.model || '?'}`;
|
|
80
|
+
else if (toolName === 'Edit' || toolName === 'Write') inputSummary = (ti.file_path || '').split('/').pop();
|
|
81
|
+
|
|
70
82
|
const record = {
|
|
71
83
|
timestamp: new Date().toISOString(),
|
|
72
84
|
session_id: data.session_id || null,
|
|
73
85
|
tool_name: toolName,
|
|
74
|
-
|
|
75
|
-
? data.tool_input.command
|
|
76
|
-
: null,
|
|
86
|
+
input: inputSummary,
|
|
77
87
|
output_size_est_tokens: Math.ceil(JSON.stringify(toolResponse).length / 4),
|
|
78
88
|
project: cwd ? path.basename(cwd) : null,
|
|
79
89
|
phase: inferPhase(cwd),
|
package/package.json
CHANGED
|
@@ -30,7 +30,7 @@ Coordinate reasoner agents to debate a problem from multiple perspectives, then
|
|
|
30
30
|
Summarize conversation context in ~200 words: core problem, requirements, constraints, user priorities. Passed to each perspective agent.
|
|
31
31
|
|
|
32
32
|
### 2. GATHER CODEBASE CONTEXT
|
|
33
|
-
Glob/Grep
|
|
33
|
+
Prefer LSP documentSymbol to understand file structure, then Read with offset/limit on relevant ranges only (never read full files). Glob/Grep to locate files (up to 5-6, focus on core logic). Produce ~300 word codebase summary: what exists, key interfaces, current limitations, dependencies. Passed to every agent.
|
|
34
34
|
|
|
35
35
|
### 3. SPAWN PERSPECTIVES
|
|
36
36
|
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:eval
|
|
3
|
+
description: Evaluate a skill or command against a benchmark suite, or scaffold a new benchmark directory
|
|
4
|
+
allowed-tools: [Read, Bash, Write, Glob, Grep]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# /df:eval — Skill Evaluation
|
|
8
|
+
|
|
9
|
+
Run a benchmark suite against a skill/command, or scaffold a new benchmark directory.
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
/df:eval --scaffold benchmarks/<name>/ # Create benchmark directory structure
|
|
15
|
+
/df:eval benchmarks/<name>/ # Run benchmark suite (reads hypotheses.md)
|
|
16
|
+
/df:eval benchmarks/<name>/ --hypothesis "reduce token use" # Override hypothesis explicitly
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Subcommands
|
|
20
|
+
|
|
21
|
+
### `--scaffold <target-dir>`
|
|
22
|
+
|
|
23
|
+
Creates a benchmark directory from the fixture template at `templates/eval-fixture-template/`.
|
|
24
|
+
|
|
25
|
+
**What gets created:**
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
<target-dir>/
|
|
29
|
+
fixture/ # Minimal repo fixture (hooks, specs, src, package.json)
|
|
30
|
+
tests/ # Behavior and guard test files
|
|
31
|
+
spec.md # Benchmark objective and acceptance criteria
|
|
32
|
+
config.yaml # Benchmark configuration (skill under test, thresholds)
|
|
33
|
+
hypotheses.md # Hypotheses to validate
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Steps:**
|
|
37
|
+
|
|
38
|
+
1. Validate `<target-dir>` argument is provided; abort with usage hint if missing.
|
|
39
|
+
2. Check `<target-dir>` does not already exist; abort with error if it does.
|
|
40
|
+
3. Copy `templates/eval-fixture-template/` recursively to `<target-dir>`.
|
|
41
|
+
4. Confirm with summary:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
Created benchmark scaffold at <target-dir>/
|
|
45
|
+
fixture/ - minimal repo fixture
|
|
46
|
+
tests/ - behavior.test.js, guard.test.js
|
|
47
|
+
spec.md - edit to define benchmark objective
|
|
48
|
+
config.yaml - edit to set skill under test and thresholds
|
|
49
|
+
hypotheses.md - edit to define hypotheses
|
|
50
|
+
|
|
51
|
+
Next: edit spec.md and config.yaml, then run /df:eval <target-dir>/
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
**Implementation:**
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Parse --scaffold flag and target dir from $ARGUMENTS
# e.g. /df:eval --scaffold benchmarks/my-bench/
ARGS="$ARGUMENTS"
# Drop the flag, then trim surrounding whitespace so that a bare
# "--scaffold " is detected as a missing target. printf is used instead of
# echo so arguments that look like echo options are passed through verbatim.
TARGET=$(printf '%s' "$ARGS" | sed -e 's/--scaffold[[:space:]]*//' -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
TEMPLATE="templates/eval-fixture-template"

if [ -z "$TARGET" ]; then
  echo "Error: target directory required. Usage: /df:eval --scaffold benchmarks/<name>/"
  exit 1
fi

# -e rather than -d: also refuse when a regular file or symlink occupies the
# path. The trailing slash is stripped for the check so a file named like the
# target is still detected.
if [ -e "${TARGET%/}" ]; then
  echo "Error: $TARGET already exists."
  exit 1
fi

if [ ! -d "$TEMPLATE" ]; then
  echo "Error: template directory $TEMPLATE not found."
  exit 1
fi

# Create the target together with any missing parents (e.g. benchmarks/),
# then copy the template's contents, dotfiles included. "src/." into an
# existing directory behaves the same with GNU and BSD cp.
mkdir -p "$TARGET" || exit 1
cp -R "$TEMPLATE/." "$TARGET/" || exit 1
echo "Created benchmark scaffold at $TARGET"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### `--hypothesis <text>`
|
|
78
|
+
|
|
79
|
+
Overrides the mutation hypothesis for the eval session. Without this flag the
|
|
80
|
+
loop reads `{benchDir}/hypotheses.md` and uses the first list item it finds.
|
|
81
|
+
|
|
82
|
+
**Hypothesis resolution order:**
|
|
83
|
+
|
|
84
|
+
1. `--hypothesis "<text>"` flag value — used as-is.
|
|
85
|
+
2. `{benchDir}/hypotheses.md` first list item (ordered or unordered markdown list).
|
|
86
|
+
3. Error if neither source is available.
|
|
87
|
+
|
|
88
|
+
**Module:** `src/eval/hypothesis.js` — `loadHypothesis({ flag, benchDir })`
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Main Eval Loop (T9 — implemented)
|
|
93
|
+
|
|
94
|
+
Running `/df:eval benchmarks/<name>/` without `--scaffold` runs the Karpathy loop (mutate → guard → measure → keep or revert):
|
|
95
|
+
|
|
96
|
+
1. Load `benchmarks/<name>/config.yaml` — skill under test, thresholds, iteration count
|
|
97
|
+
2. Resolve hypothesis via `--hypothesis` flag or `benchmarks/<name>/hypotheses.md` (first list item)
|
|
98
|
+
3. Create a worktree-isolated branch for the session (`eval/<skill>/<timestamp>`)
|
|
99
|
+
4. **Loop** (until Ctrl+C or `--loop N`):
|
|
100
|
+
a. Mutate skill file via agent prompt built from current content + history
|
|
101
|
+
b. Commit experiment (`status:pending`)
|
|
102
|
+
c. Run guard check (build + test commands from config)
|
|
103
|
+
- Guard fail → `git revert`, log `status:guard_fail`, next iteration
|
|
104
|
+
d. Collect metrics from `.deepflow/` JSONL files
|
|
105
|
+
e. Compare target metric against baseline
|
|
106
|
+
- Improved → log `status:kept`, update baseline
|
|
107
|
+
- Regression → `git revert`, log `status:reverted`
|
|
108
|
+
f. Record secondary metrics in commit message (never influence keep/revert)
|
|
109
|
+
|
|
110
|
+
**Implementation:** `src/eval/loop.js` (`runEvalLoop`), `src/eval/hypothesis.js` (`loadHypothesis`)
|
|
111
|
+
|
|
112
|
+
## Rules
|
|
113
|
+
|
|
114
|
+
- `--scaffold` never overwrites an existing directory
|
|
115
|
+
- Template is always copied from `templates/eval-fixture-template/`
|
|
116
|
+
- Main eval loop is non-deterministic by design — it samples skill behavior across N runs
|
|
117
|
+
- No LLM judges another LLM — only objective metrics (file diffs, test results, token counts) are used
|
|
@@ -376,7 +376,7 @@ Success criteria: {ACs from spec relevant to this task}
|
|
|
376
376
|
{TASK_DETAIL if available, else inline block:}
|
|
377
377
|
Impact: Callers: {file} ({why}) | Duplicates: [active→consolidate] [dead→DELETE] | Data flow: {consumers}
|
|
378
378
|
Prior tasks: {dep_id}: {summary}
|
|
379
|
-
Steps: 1. chub search/get for APIs 2. LSP findReferences, add unlisted callers 3.
|
|
379
|
+
Steps: 1. chub search/get for APIs 2. LSP findReferences, add unlisted callers 3. LSP documentSymbol on Impact files → Read with offset/limit on relevant ranges only (never read full files) 4. Implement 5. Commit
|
|
380
380
|
--- END ---
|
|
381
381
|
Duplicates: [active]→consolidate [dead]→DELETE. ONLY job: code+commit. No merge/rename/checkout.
|
|
382
382
|
Last line of your response MUST be: TASK_STATUS:pass (if successful) or TASK_STATUS:fail (if failed) or TASK_STATUS:revert (if reverted)
|