deepflow 0.1.103 → 0.1.104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/bin/install-dynamic-hooks.test.js +461 -0
  2. package/bin/install.js +150 -250
  3. package/bin/lineage-ingest.js +70 -0
  4. package/hooks/df-check-update.js +1 -0
  5. package/hooks/df-command-usage.js +18 -0
  6. package/hooks/df-dashboard-push.js +1 -0
  7. package/hooks/df-execution-history.js +1 -0
  8. package/hooks/df-explore-protocol.js +83 -0
  9. package/hooks/df-explore-protocol.test.js +228 -0
  10. package/hooks/df-hook-event-tags.test.js +127 -0
  11. package/hooks/df-invariant-check.js +1 -0
  12. package/hooks/df-quota-logger.js +1 -0
  13. package/hooks/df-snapshot-guard.js +1 -0
  14. package/hooks/df-spec-lint.js +58 -1
  15. package/hooks/df-spec-lint.test.js +412 -0
  16. package/hooks/df-statusline.js +1 -0
  17. package/hooks/df-subagent-registry.js +1 -0
  18. package/hooks/df-tool-usage.js +13 -3
  19. package/hooks/df-worktree-guard.js +1 -0
  20. package/package.json +1 -1
  21. package/src/commands/df/debate.md +1 -1
  22. package/src/commands/df/eval.md +117 -0
  23. package/src/commands/df/execute.md +1 -1
  24. package/src/commands/df/fix.md +104 -0
  25. package/src/eval/git-memory.js +159 -0
  26. package/src/eval/git-memory.test.js +439 -0
  27. package/src/eval/hypothesis.js +80 -0
  28. package/src/eval/hypothesis.test.js +169 -0
  29. package/src/eval/loop.js +378 -0
  30. package/src/eval/loop.test.js +306 -0
  31. package/src/eval/metric-collector.js +163 -0
  32. package/src/eval/metric-collector.test.js +369 -0
  33. package/src/eval/metric-pivot.js +119 -0
  34. package/src/eval/metric-pivot.test.js +350 -0
  35. package/src/eval/mutator-prompt.js +106 -0
  36. package/src/eval/mutator-prompt.test.js +180 -0
  37. package/templates/config-template.yaml +5 -0
  38. package/templates/eval-fixture-template/config.yaml +39 -0
  39. package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
  40. package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
  41. package/templates/eval-fixture-template/fixture/package.json +12 -0
  42. package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
  43. package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
  44. package/templates/eval-fixture-template/fixture/src/config.js +40 -0
  45. package/templates/eval-fixture-template/fixture/src/index.js +19 -0
  46. package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
  47. package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
  48. package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
  49. package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
  50. package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
  51. package/templates/eval-fixture-template/hypotheses.md +14 -0
  52. package/templates/eval-fixture-template/spec.md +34 -0
  53. package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
  54. package/templates/eval-fixture-template/tests/guard.test.js +108 -0
  55. package/templates/eval-fixture-template.test.js +318 -0
  56. package/templates/explore-agent.md +5 -74
  57. package/templates/explore-protocol.md +44 -0
  58. package/templates/spec-template.md +4 -0
@@ -0,0 +1,350 @@
1
+ 'use strict';
2
+
3
+ const { describe, it, before, after } = require('node:test');
4
+ const assert = require('node:assert');
5
+ const fs = require('fs');
6
+ const path = require('path');
7
+ const os = require('os');
8
+ const { execSync } = require('child_process');
9
+
10
+ const {
11
+ parseSecondaries,
12
+ surfaceCandidates,
13
+ formatCandidates,
14
+ } = require('./metric-pivot.js');
15
+
16
+ const { commitExperiment, revertExperiment } = require('./git-memory.js');
17
+
18
/**
 * Build a throwaway git repository for a test to commit experiments into.
 *
 * A fresh directory is created under the OS temp dir, initialised as a git
 * repository, given a committer identity via `git config`, and seeded with a
 * README that is committed as the initial commit.
 *
 * @returns {string} Path of the newly created repository directory.
 */
function createTempRepo() {
  const repoPath = fs.mkdtempSync(path.join(os.tmpdir(), 'metric-pivot-test-'));

  // Every git invocation runs inside the new repo with its output captured.
  const run = (command) => execSync(command, { cwd: repoPath, stdio: 'pipe' });

  run('git init');
  run('git config user.email "test@test.com"');
  run('git config user.name "Test"');

  fs.writeFileSync(path.join(repoPath, 'README.md'), '# test repo\n');
  run('git add -A && git commit -m "initial commit"');

  return repoPath;
}
30
+
31
/**
 * Remove a repository directory and everything beneath it.
 *
 * `force: true` means a path that no longer exists is ignored rather than
 * raising an error.
 *
 * @param {string} dir Directory to delete.
 */
function cleanupRepo(dir) {
  const removalOptions = { recursive: true, force: true };
  fs.rmSync(dir, removalOptions);
}
34
+
35
+ // --- parseSecondaries ---
36
+
37
describe('parseSecondaries', () => {
  // Each row is [test title, raw input handed to parseSecondaries, expected result].
  const cases = [
    ['parses single key=value pair', 'accuracy=98%', { accuracy: '98%' }],
    [
      'parses multiple key=value pairs',
      'accuracy=98% latency=4.2s mem=50MB',
      { accuracy: '98%', latency: '4.2s', mem: '50MB' },
    ],
    ['returns empty object for null input', null, {}],
    ['returns empty object for undefined input', undefined, {}],
    ['returns empty object for empty string', '', {}],
    ['skips tokens without equals sign', 'good=1 badtoken good2=2', { good: '1', good2: '2' }],
    // token "key=a=b" => key="a=b" (the first "=" is the split point)
    ['handles values containing equals signs', 'formula=x=y+z', { formula: 'x=y+z' }],
    ['handles extra whitespace between tokens', ' a=1 b=2 ', { a: '1', b: '2' }],
    ['skips tokens with empty key (e.g. "=value")', '=nokey valid=yes', { valid: 'yes' }],
  ];

  for (const [title, input, expected] of cases) {
    it(title, () => {
      assert.deepStrictEqual(parseSecondaries(input), expected);
    });
  }
});
85
+
86
+ // --- surfaceCandidates (integration with temp git repo) ---
87
+
88
describe('surfaceCandidates', () => {
  describe('AC-14: finds reverted experiments with positive delta on new target', () => {
    const SKILL = 'browse-fetch';
    let cwd;

    // Query the fixture repo for candidates against the given new target.
    const candidatesFor = (newTarget) => surfaceCandidates({ cwd, skillName: SKILL, newTarget });

    // Look a candidate up by the hypothesis text it was committed with.
    const byHypothesis = (list, text) => list.find(({ hypothesis }) => hypothesis === text);

    before(() => {
      cwd = createTempRepo();

      // One row per experiment: a file is written first (presumably so that
      // commitExperiment has a change to commit), then the experiment is recorded.
      const experiments = [
        {
          // Experiment 1: reverted, primary target=latency, positive delta
          file: 'e1.txt',
          content: '1',
          meta: {
            hypothesis: 'cache responses',
            target: 'latency', value: '4.2s', delta: '+16', status: 'reverted',
            secondaries: 'accuracy=98%',
          },
        },
        {
          // Experiment 2: reverted, primary target=latency, negative delta => excluded
          file: 'e2.txt',
          content: '2',
          meta: {
            hypothesis: 'bad cache',
            target: 'latency', value: '10s', delta: '-5', status: 'reverted',
            secondaries: '',
          },
        },
        {
          // Experiment 3: kept (pass), primary target=latency, positive delta
          // => excluded (not reverted)
          file: 'e3.txt',
          content: '3',
          meta: {
            hypothesis: 'kept experiment',
            target: 'latency', value: '3s', delta: '+20', status: 'pass',
            secondaries: '',
          },
        },
        {
          // Experiment 4: reverted, primary target=speed, but latency in secondaries
          file: 'e4.txt',
          content: '4',
          meta: {
            hypothesis: 'speed tweak with latency side-effect',
            target: 'speed', value: '150', delta: '+10', status: 'reverted',
            secondaries: 'latency=3.1s mem=40MB',
          },
        },
      ];

      for (const { file, content, meta } of experiments) {
        fs.writeFileSync(path.join(cwd, file), content);
        commitExperiment({ cwd, skillName: SKILL, ...meta });
      }
    });

    after(() => {
      cleanupRepo(cwd);
    });

    it('finds reverted experiment with positive delta on primary target', () => {
      const found = candidatesFor('latency').find(
        ({ candidateSource }) => candidateSource === 'primary',
      );

      assert.ok(found, 'Should find a primary candidate');
      assert.strictEqual(found.hypothesis, 'cache responses');
      assert.strictEqual(found.candidateValue, '4.2s');
      assert.strictEqual(found.candidateDelta, 16);
    });

    it('excludes reverted experiments with negative delta on primary target', () => {
      const match = byHypothesis(candidatesFor('latency'), 'bad cache');
      assert.strictEqual(match, undefined, 'Should not include negative delta experiment');
    });

    it('excludes non-reverted (kept) experiments', () => {
      const match = byHypothesis(candidatesFor('latency'), 'kept experiment');
      assert.strictEqual(match, undefined, 'Should not include kept (pass) experiments');
    });

    it('finds candidates from secondary metrics', () => {
      const found = candidatesFor('latency').find(
        ({ candidateSource }) => candidateSource === 'secondary',
      );

      assert.ok(found, 'Should find a secondary candidate');
      assert.strictEqual(found.hypothesis, 'speed tweak with latency side-effect');
      assert.strictEqual(found.candidateValue, '3.1s');
      assert.strictEqual(found.candidateDelta, null, 'Secondary candidates have no delta');
    });

    it('returns empty array when no candidates match new target', () => {
      assert.deepStrictEqual(candidatesFor('nonexistent-metric'), []);
    });
  });

  describe('edge cases', () => {
    let cwd;

    before(() => {
      cwd = createTempRepo();
    });

    after(() => {
      cleanupRepo(cwd);
    });

    it('returns empty array when no experiments exist', () => {
      const query = { cwd, skillName: 'no-experiments', newTarget: 'latency' };
      assert.deepStrictEqual(surfaceCandidates(query), []);
    });

    it('returns empty array for unknown skill name', () => {
      // Record an experiment under a different skill than the one queried below.
      fs.writeFileSync(path.join(cwd, 'other.txt'), 'other');
      commitExperiment({
        cwd,
        skillName: 'other-skill',
        hypothesis: 'h',
        target: 'metric',
        value: '1',
        delta: '+5',
        status: 'reverted',
        secondaries: '',
      });

      const query = { cwd, skillName: 'nonexistent-skill', newTarget: 'metric' };
      assert.deepStrictEqual(surfaceCandidates(query), []);
    });
  });

  describe('reverted experiments with zero delta excluded', () => {
    let cwd;

    before(() => {
      cwd = createTempRepo();

      fs.writeFileSync(path.join(cwd, 'z.txt'), 'z');
      commitExperiment({
        cwd,
        skillName: 'zero-test',
        hypothesis: 'zero delta',
        target: 'metric',
        value: '50',
        delta: '0',
        status: 'reverted',
        secondaries: '',
      });
    });

    after(() => {
      cleanupRepo(cwd);
    });

    it('excludes reverted experiments with zero delta (not positive)', () => {
      const query = { cwd, skillName: 'zero-test', newTarget: 'metric' };
      assert.deepStrictEqual(
        surfaceCandidates(query),
        [],
        'Zero delta is not positive, should be excluded',
      );
    });
  });
});
242
+
243
+ // --- formatCandidates ---
244
+
245
describe('formatCandidates', () => {
  // Assert that every listed fragment occurs somewhere in the formatted text.
  const expectFragments = (text, fragments) => {
    for (const fragment of fragments) {
      assert.ok(text.includes(fragment));
    }
  };

  it('returns "no candidates" message for empty list', () => {
    assert.strictEqual(
      formatCandidates([], 'latency'),
      'No reverted experiments found with positive delta on target="latency".',
    );
  });

  it('formats candidate with primary delta', () => {
    const primary = {
      hash: 'abc1234',
      skillName: 'browse-fetch',
      hypothesis: 'cache responses',
      target: 'latency',
      value: '4.2s',
      delta: 16,
      status: 'reverted',
      secondaries: 'accuracy=98%',
      candidateValue: '4.2s',
      candidateDelta: 16,
      candidateSource: 'primary',
    };

    expectFragments(formatCandidates([primary], 'latency'), [
      'Reverted experiments with positive signal on "latency"',
      '[abc1234] browse-fetch: cache responses',
      'delta=+16% (primary)',
      'original target: latency=4.2s delta=16%',
    ]);
  });

  it('formats candidate from secondary (no delta)', () => {
    const secondary = {
      hash: 'def5678',
      skillName: 'browse-fetch',
      hypothesis: 'speed tweak',
      target: 'speed',
      value: '150',
      delta: 10,
      status: 'reverted',
      secondaries: 'latency=3.1s',
      candidateValue: '3.1s',
      candidateDelta: null,
      candidateSource: 'secondary',
    };

    expectFragments(formatCandidates([secondary], 'latency'), [
      '[def5678] browse-fetch: speed tweak',
      'value=3.1s (secondary',
      'no delta available',
      'original target: speed=150 delta=10%',
    ]);
  });

  it('formats multiple candidates', () => {
    const first = {
      hash: 'aaa',
      skillName: 'skill-a',
      hypothesis: 'h1',
      target: 'latency',
      value: '4s',
      delta: 10,
      status: 'reverted',
      candidateValue: '4s',
      candidateDelta: 10,
      candidateSource: 'primary',
    };
    const second = {
      hash: 'bbb',
      skillName: 'skill-a',
      hypothesis: 'h2',
      target: 'speed',
      value: '200',
      delta: 5,
      status: 'reverted',
      candidateValue: '3s',
      candidateDelta: null,
      candidateSource: 'secondary',
    };

    expectFragments(formatCandidates([first, second], 'latency'), ['[aaa]', '[bbb]', 'h1', 'h2']);
  });

  it('does not have trailing newline', () => {
    const only = {
      hash: 'x',
      skillName: 's',
      hypothesis: 'h',
      target: 't',
      value: '1',
      delta: 5,
      status: 'reverted',
      candidateValue: '1',
      candidateDelta: 5,
      candidateSource: 'primary',
    };

    const text = formatCandidates([only], 't');
    assert.ok(!text.endsWith('\n'), 'Output should not end with newline (trimEnd applied)');
  });
});
@@ -0,0 +1,106 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Mutator prompt builder for the skill-eval Karpathy loop.
5
+ *
6
+ * Layout follows the attention U-curve pattern:
7
+ * START zone: ## Skill + ## Hypothesis (highest attention — task + context)
8
+ * MIDDLE zone: ## History (lower attention — truncated experiment log)
9
+ * END zone: ## Instructions (highest attention — action directive)
10
+ *
11
+ * Full file replacement: the mutator is instructed to output the COMPLETE new
12
+ * skill file, not a diff or partial edit.
13
+ */
14
+
15
const CHARS_PER_TOKEN = 4; // rough estimate: 1 token ≈ 4 characters

/**
 * Truncate experiment history to fit within maxTokens, keeping the most recent
 * experiments (the `history` array is assumed to be ordered newest → oldest).
 *
 * The token budget is converted to a character budget via CHARS_PER_TOKEN.
 * Entries are taken from the front of the array until the next one would
 * overflow; each entry is charged its length plus 2 for the blank-line
 * separator. The "omitted" note appended when entries are dropped is not
 * charged against the budget.
 *
 * When not even the newest entry fits together with its separator allowance:
 *   - it is returned whole if it fits the budget on its own;
 *   - otherwise it is cut to exactly `budget` characters, ending in '...';
 *   - if the budget cannot hold more than the ellipsis, only as much of '...'
 *     as fits is returned (an empty string for a zero or negative budget).
 *
 * @param {string[]} history   Array of experiment strings, most recent first.
 * @param {number}   maxTokens Token budget for the history section.
 * @returns {string} Concatenated history that fits within budget, or the
 *   placeholder '(no experiments yet)' when there is no history.
 */
function truncateHistory(history, maxTokens) {
  if (!history || history.length === 0) return '(no experiments yet)';

  const budget = maxTokens * CHARS_PER_TOKEN;
  const kept = [];
  let used = 0;

  for (const entry of history) {
    const entryChars = entry.length + 2; // +2 for the separating newlines
    if (used + entryChars > budget) break;
    kept.push(entry);
    used += entryChars;
  }

  if (kept.length === 0) {
    const ellipsis = '...';
    const newest = history[0];

    // The separator allowance can reject a lone entry that actually fits.
    if (newest.length <= budget) return newest;

    // A budget no larger than the ellipsis would make `budget - 3` zero or
    // negative, and slice() treats a negative end as an offset from the END
    // of the string, returning almost the whole oversized entry.
    if (budget <= ellipsis.length) return ellipsis.slice(0, Math.max(0, budget));

    // Even the single most-recent entry exceeds budget: truncate it hard.
    return newest.slice(0, budget - ellipsis.length) + ellipsis;
  }

  const dropped = history.length - kept.length;
  const suffix = dropped > 0 ? `\n\n_(${dropped} older experiment(s) omitted to fit token budget)_` : '';
  return kept.join('\n\n') + suffix;
}
50
+
51
/**
 * Build the mutator prompt.
 *
 * The prompt is assembled as four markdown sections joined by newlines, in
 * this order: `## Skill`, `## Hypothesis`, `## History`, `## Instructions`.
 * Trailing whitespace is stripped from the skill content and the hypothesis;
 * the history is reduced to the token budget by truncateHistory().
 *
 * @param {object}   options
 * @param {string}   options.skillContent              Full text of the current skill file.
 * @param {string}   options.hypothesis                The current mutation hypothesis.
 * @param {string[]} [options.history=[]]              Experiment history strings, most recent first.
 * @param {number}   [options.maxHistoryTokens=4000]   Token budget for history section.
 * @returns {string} The complete prompt string to send to the mutator LLM.
 * @throws {Error} If skillContent or hypothesis is missing, not a string, or
 *   empty. This includes calling the function with no argument at all.
 */
function buildMutatorPrompt({
  skillContent,
  hypothesis,
  history = [],
  maxHistoryTokens = 4000,
} = {}) {
  // The `= {}` default lets a bare buildMutatorPrompt() reach the validation
  // below instead of failing with a destructuring TypeError.
  if (typeof skillContent !== 'string' || skillContent.length === 0) {
    throw new Error('skillContent must be a non-empty string');
  }
  if (typeof hypothesis !== 'string' || hypothesis.length === 0) {
    throw new Error('hypothesis must be a non-empty string');
  }

  const historyText = truncateHistory(history, maxHistoryTokens);

  return [
    '## Skill',
    '',
    skillContent.trimEnd(),
    '',
    '## Hypothesis',
    '',
    hypothesis.trimEnd(),
    '',
    '## History',
    '',
    historyText,
    '',
    '## Instructions',
    '',
    'You are mutating the skill file above to test the hypothesis.',
    '',
    'Rules:',
    '- Output the COMPLETE replacement skill file — no diffs, no partial edits.',
    '- Do not change the YAML frontmatter `name` or `description` fields.',
    '- Apply exactly ONE focused change that directly tests the hypothesis.',
    '- If the history shows this hypothesis already failed, try a different angle.',
    '- If the history shows a best-known state, you may backtrack to it and try a',
    ' smaller variation.',
    '',
    'Respond with only the new skill file content, starting with the YAML front matter.',
    'No prose, no explanation, no code fences.',
  ].join('\n');
}
105
+
106
+ module.exports = { buildMutatorPrompt };
@@ -0,0 +1,180 @@
1
+ 'use strict';
2
+
3
+ const { describe, it } = require('node:test');
4
+ const assert = require('node:assert/strict');
5
+ const { buildMutatorPrompt } = require('./mutator-prompt.js');
6
+
7
+ const DEFAULTS = { skillContent: '---\nname: test\n---\nSome skill', hypothesis: 'Try X' };
8
+
9
describe('buildMutatorPrompt', () => {
  // Text found after `opener` and before the next `closer` in the prompt.
  const between = (text, opener, closer) => text.split(opener)[1].split(closer)[0];

  // --- AC-8: Section order ---

  it('emits sections in order: ## Skill, ## Hypothesis, ## History, ## Instructions', () => {
    const prompt = buildMutatorPrompt(DEFAULTS);
    const headings = Array.from(prompt.matchAll(/^## \w+/gm), (match) => match[0]);
    assert.deepStrictEqual(headings, ['## Skill', '## Hypothesis', '## History', '## Instructions']);
  });

  it('## Skill appears before ## Hypothesis which appears before ## History which appears before ## Instructions', () => {
    const prompt = buildMutatorPrompt({ ...DEFAULTS, history: ['exp1'] });
    const position = (heading) => prompt.indexOf(heading);

    assert.ok(position('## Skill') < position('## Hypothesis'), 'Skill before Hypothesis');
    assert.ok(position('## Hypothesis') < position('## History'), 'Hypothesis before History');
    assert.ok(position('## History') < position('## Instructions'), 'History before Instructions');
  });

  // --- Skill content placement ---

  it('places skillContent after ## Skill heading', () => {
    const [, rest] = buildMutatorPrompt(DEFAULTS).split('## Skill\n\n');
    assert.ok(rest.startsWith(DEFAULTS.skillContent));
  });

  it('places hypothesis after ## Hypothesis heading', () => {
    const [, rest] = buildMutatorPrompt(DEFAULTS).split('## Hypothesis\n\n');
    assert.ok(rest.startsWith(DEFAULTS.hypothesis));
  });

  // --- Empty history ---

  it('shows "(no experiments yet)" when history is empty', () => {
    assert.ok(buildMutatorPrompt(DEFAULTS).includes('(no experiments yet)'));
  });

  it('shows "(no experiments yet)" when history is not provided', () => {
    const prompt = buildMutatorPrompt({ skillContent: 'skill', hypothesis: 'hyp' });
    assert.ok(prompt.includes('(no experiments yet)'));
  });

  // --- AC-9: History truncation ---

  it('includes all history entries when within token budget', () => {
    const prompt = buildMutatorPrompt({
      ...DEFAULTS,
      history: ['entry-one', 'entry-two'],
      maxHistoryTokens: 4000,
    });
    assert.ok(prompt.includes('entry-one'));
    assert.ok(prompt.includes('entry-two'));
    assert.ok(!prompt.includes('omitted'));
  });

  it('truncates older entries when history exceeds maxHistoryTokens budget', () => {
    // Budget: 5 tokens * 4 chars/token = 20 chars; each entry costs length + 2.
    // 'aaaa' (6) + 'bbbb' (6) = 12 fits in 20; adding the 16-char entry (18)
    // would reach 30 > 20, so it is dropped.
    const prompt = buildMutatorPrompt({
      ...DEFAULTS,
      history: ['aaaa', 'bbbb', 'cccccccccccccccc'],
      maxHistoryTokens: 5,
    });
    assert.ok(prompt.includes('aaaa'), 'newest entry kept');
    assert.ok(prompt.includes('bbbb'), 'second entry kept');
    assert.ok(!prompt.includes('cccccccccccccccc'), 'oldest entry dropped');
    assert.ok(prompt.includes('1 older experiment(s) omitted'));
  });

  it('keeps entries newest-first and drops from the end', () => {
    // Budget: 3 tokens * 4 = 12 chars. 'AAAA' (6) + 'BBBB' (6) = 12 fits
    // exactly, so 'CCCC' is dropped.
    const prompt = buildMutatorPrompt({
      ...DEFAULTS,
      history: ['AAAA', 'BBBB', 'CCCC'],
      maxHistoryTokens: 3,
    });
    assert.ok(prompt.includes('AAAA'));
    assert.ok(prompt.includes('BBBB'));
    assert.ok(!prompt.includes('CCCC'));
    assert.ok(prompt.includes('1 older experiment(s) omitted'));
  });

  it('reports correct count of dropped entries', () => {
    // Budget: 2 tokens * 4 = 8 chars. 'AA' and 'BB' each cost 2 + 2 = 4, so
    // together they use exactly 8 and both fit; 'CC' and 'DD' are dropped.
    const prompt = buildMutatorPrompt({
      ...DEFAULTS,
      history: ['AA', 'BB', 'CC', 'DD'],
      maxHistoryTokens: 2,
    });
    assert.ok(prompt.includes('2 older experiment(s) omitted'));
  });

  // --- Single entry exceeding budget: hard truncation ---

  it('hard-truncates a single entry that exceeds the entire budget', () => {
    // Budget: 2 tokens * 4 = 8 chars, far below the 200-char entry.
    const oversized = 'X'.repeat(200);
    const prompt = buildMutatorPrompt({ ...DEFAULTS, history: [oversized], maxHistoryTokens: 2 });

    assert.ok(prompt.includes('...'), 'should end with ellipsis');
    assert.ok(!prompt.includes('X'.repeat(200)), 'full entry should not appear');
    // budget - 3 = 5 X's followed by the ellipsis
    assert.ok(prompt.includes('XXXXX...'));
  });

  it('hard-truncated single entry length equals budget chars', () => {
    const oversized = 'Y'.repeat(500);
    const maxTokens = 10; // budget = 40 chars
    const prompt = buildMutatorPrompt({ ...DEFAULTS, history: [oversized], maxHistoryTokens: maxTokens });

    // (40 - 3) = 37 Y's + '...' = 40 chars in total
    const historySection = between(prompt, '## History\n\n', '\n\n## Instructions');
    assert.equal(historySection.length, 40);
    assert.ok(historySection.endsWith('...'));
  });

  // --- Full file replacement instructions ---

  const instructionChecks = [
    ['contains full file replacement instruction', 'COMPLETE replacement skill file'],
    ['instructs no diffs or partial edits', 'no diffs, no partial edits'],
    ['instructs to respond with only skill file content', 'Respond with only the new skill file content'],
  ];
  for (const [title, fragment] of instructionChecks) {
    it(title, () => {
      assert.ok(buildMutatorPrompt(DEFAULTS).includes(fragment));
    });
  }

  // --- Default maxHistoryTokens is 4000 ---

  it('defaults maxHistoryTokens to 4000 (fits ~16000 chars of history)', () => {
    // Default budget: 4000 tokens = 16000 chars. Three entries costing
    // 5000 + 2 = 5002 chars each total 15006, which fits in 16000 but would
    // not fit in a much smaller budget such as 1000.
    const entry = 'Z'.repeat(5000);
    const prompt = buildMutatorPrompt({ ...DEFAULTS, history: [entry, entry, entry] });

    const occurrences = (prompt.match(/Z{5000}/g) || []).length;
    assert.equal(occurrences, 3, 'all three entries should be included with default 4000 token budget');
    assert.ok(!prompt.includes('omitted'));
  });

  // --- Input validation ---

  const invalidInputs = [
    ['throws when skillContent is missing', { hypothesis: 'hyp' }, /skillContent must be a non-empty string/],
    [
      'throws when skillContent is empty string',
      { skillContent: '', hypothesis: 'hyp' },
      /skillContent must be a non-empty string/,
    ],
    ['throws when hypothesis is missing', { skillContent: 'skill' }, /hypothesis must be a non-empty string/],
    [
      'throws when hypothesis is empty string',
      { skillContent: 'skill', hypothesis: '' },
      /hypothesis must be a non-empty string/,
    ],
  ];
  for (const [title, options, expectedError] of invalidInputs) {
    it(title, () => {
      assert.throws(() => buildMutatorPrompt(options), expectedError);
    });
  }

  // --- Edge cases ---

  it('trims trailing whitespace from skillContent and hypothesis', () => {
    const prompt = buildMutatorPrompt({ skillContent: 'skill \n\n', hypothesis: 'hyp \n' });

    // Each section body should come back without its trailing whitespace.
    assert.equal(between(prompt, '## Skill\n\n', '\n\n## Hypothesis'), 'skill');
    assert.equal(between(prompt, '## Hypothesis\n\n', '\n\n## History'), 'hyp');
  });
});
@@ -129,6 +129,11 @@ dashboard_url: ""
129
129
  # Default: 3334 (3333 is reserved for local mode)
130
130
  dashboard_port: 3334
131
131
 
132
+ eval:
133
+ # Max tokens of experiment history passed to the mutator prompt during skill evaluation
134
+ max_history_tokens: 4000
135
+ # Future: rework metric (fix count) as secondary signal — requires spec-lineage tracking (not in v1)
136
+
132
137
  # Recommended .gitignore entries
133
138
  # Add these entries to your .gitignore to exclude instrumentation artifacts
134
139
  gitignore_entries:
@@ -0,0 +1,39 @@
1
+ # Benchmark configuration for /df:eval
2
+
3
+ benchmark:
4
+ name: "{benchmark-name}"
5
+ skill: "skills/{skill-name}/SKILL.md" # path to skill being evaluated
6
+ description: "[What this benchmark measures]"
7
+
8
+ # Metric definitions
9
+ metrics:
10
+ # Target metric — the one numeric value that decides keep/revert
11
+ target: cache_ratio
12
+
13
+ # Guard — binary pass/fail. Failure auto-reverts before any metric check.
14
+ guard_command: "node tests/guard.test.js"
15
+
16
+ # Secondary metrics — logged in commit message, inform mutator, never decide
17
+ secondary:
18
+ - total_tokens
19
+ - wall_time
20
+ - context_burn
21
+
22
+ # Iteration settings
23
+ loop:
24
+ # Default iteration cap. Override with --loop N flag. 0 = unlimited (Ctrl+C to stop)
25
+ default_iterations: 0
26
+
27
+ # History window passed to mutator prompt (~4000 tokens ≈ 15 experiments)
28
+ max_history_tokens: 4000
29
+
30
+ # Fixture execution
31
+ fixture:
32
+ # Command to run inside the fixture dir during each eval iteration
33
+ run_command: "node scripts/run-task.js"
34
+
35
+ # Timeout per iteration in seconds
36
+ timeout: 300
37
+
38
+ # deepflow token-history location relative to fixture dir
39
+ token_history_path: ".deepflow/token-history.jsonl"