deepflow 0.1.102 → 0.1.104
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install-dynamic-hooks.test.js +461 -0
- package/bin/install.js +150 -204
- package/bin/install.test.js +214 -0
- package/bin/lineage-ingest.js +70 -0
- package/hooks/df-check-update.js +1 -0
- package/hooks/df-command-usage.js +305 -0
- package/hooks/df-command-usage.test.js +1019 -0
- package/hooks/df-dashboard-push.js +1 -0
- package/hooks/df-execution-history.js +1 -0
- package/hooks/df-explore-protocol.js +83 -0
- package/hooks/df-explore-protocol.test.js +228 -0
- package/hooks/df-hook-event-tags.test.js +127 -0
- package/hooks/df-invariant-check.js +1 -0
- package/hooks/df-quota-logger.js +1 -0
- package/hooks/df-snapshot-guard.js +1 -0
- package/hooks/df-spec-lint.js +58 -1
- package/hooks/df-spec-lint.test.js +412 -0
- package/hooks/df-statusline.js +1 -0
- package/hooks/df-subagent-registry.js +34 -14
- package/hooks/df-tool-usage.js +21 -3
- package/hooks/df-tool-usage.test.js +200 -0
- package/hooks/df-worktree-guard.js +1 -0
- package/package.json +1 -1
- package/src/commands/df/debate.md +1 -1
- package/src/commands/df/eval.md +117 -0
- package/src/commands/df/execute.md +1 -1
- package/src/commands/df/fix.md +104 -0
- package/src/eval/git-memory.js +159 -0
- package/src/eval/git-memory.test.js +439 -0
- package/src/eval/hypothesis.js +80 -0
- package/src/eval/hypothesis.test.js +169 -0
- package/src/eval/loop.js +378 -0
- package/src/eval/loop.test.js +306 -0
- package/src/eval/metric-collector.js +163 -0
- package/src/eval/metric-collector.test.js +369 -0
- package/src/eval/metric-pivot.js +119 -0
- package/src/eval/metric-pivot.test.js +350 -0
- package/src/eval/mutator-prompt.js +106 -0
- package/src/eval/mutator-prompt.test.js +180 -0
- package/templates/config-template.yaml +5 -0
- package/templates/eval-fixture-template/config.yaml +39 -0
- package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
- package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
- package/templates/eval-fixture-template/fixture/package.json +12 -0
- package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
- package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
- package/templates/eval-fixture-template/fixture/src/config.js +40 -0
- package/templates/eval-fixture-template/fixture/src/index.js +19 -0
- package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
- package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
- package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
- package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
- package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
- package/templates/eval-fixture-template/hypotheses.md +14 -0
- package/templates/eval-fixture-template/spec.md +34 -0
- package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
- package/templates/eval-fixture-template/tests/guard.test.js +108 -0
- package/templates/eval-fixture-template.test.js +318 -0
- package/templates/explore-agent.md +5 -74
- package/templates/explore-protocol.md +44 -0
- package/templates/spec-template.md +4 -0
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { describe, it, before, after } = require('node:test');
|
|
4
|
+
const assert = require('node:assert');
|
|
5
|
+
const fs = require('fs');
|
|
6
|
+
const path = require('path');
|
|
7
|
+
const os = require('os');
|
|
8
|
+
const { execSync } = require('child_process');
|
|
9
|
+
|
|
10
|
+
const {
|
|
11
|
+
commitExperiment,
|
|
12
|
+
revertExperiment,
|
|
13
|
+
queryExperiments,
|
|
14
|
+
getExperimentHistory,
|
|
15
|
+
formatCommitMessage,
|
|
16
|
+
parseExperimentLine,
|
|
17
|
+
} = require('./git-memory.js');
|
|
18
|
+
|
|
19
|
+
/**
 * Creates a throwaway git repository containing a single initial commit.
 *
 * The repository gets a repo-local identity and has commit signing switched
 * off, so commits made in it do not depend on the developer's global git
 * configuration. If any setup step fails, the partially created directory is
 * removed before the error is rethrown.
 *
 * @returns {string} Path of the new repository. The caller is responsible for
 *   deleting it once the test is done.
 * @throws {Error} Whatever `execSync` / `fs` raised while setting the repo up.
 */
function createTempRepo() {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'git-memory-test-'));
  const run = (command) => execSync(command, { cwd: dir, stdio: 'pipe' });

  try {
    run('git init');
    run('git config user.email "test@test.com"');
    run('git config user.name "Test"');
    // A global commit.gpgsign=true would otherwise make every commit here try to sign
    run('git config commit.gpgsign false');

    // Initial commit so we have a HEAD
    fs.writeFileSync(path.join(dir, 'README.md'), '# test repo\n');
    run('git add -A && git commit -m "initial commit"');
  } catch (err) {
    // Do not leave a half-initialised repo behind in the OS temp directory
    fs.rmSync(dir, { recursive: true, force: true });
    throw err;
  }

  return dir;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Deletes a temporary repository directory together with everything inside it.
 *
 * @param {string} dir - Directory to remove
 */
function cleanupRepo(dir) {
  // recursive: remove the whole tree; force: passed through to fs.rmSync as-is
  const removalOptions = { recursive: true, force: true };
  fs.rmSync(dir, removalOptions);
}
|
|
37
|
+
|
|
38
|
+
// --- Unit tests for pure functions ---
|
|
39
|
+
|
|
40
|
+
// Checks the commit subject line that formatCommitMessage builds from experiment fields.
describe('formatCommitMessage', () => {
  it('formats a complete experiment commit message', () => {
    const fields = {
      skillName: 'browse-fetch',
      hypothesis: 'reduce timeout to 5s',
      target: 'latency',
      value: '4.2s',
      delta: '-16',
      status: 'pass',
      secondaries: 'accuracy=98%',
    };
    const expected =
      'experiment(browse-fetch): reduce timeout to 5s | latency=4.2s delta=-16% pass | accuracy=98%';

    assert.strictEqual(formatCommitMessage(fields), expected);
  });

  it('handles missing secondaries as empty string', () => {
    const fields = {
      skillName: 'skill-a',
      hypothesis: 'test hyp',
      target: 'speed',
      value: '10',
      delta: '+5',
      status: 'fail',
      secondaries: undefined,
    };
    const msg = formatCommitMessage(fields);

    // Nothing should follow the final separator when secondaries are absent
    assert.ok(msg.endsWith('| '), `Expected message to end with "| " but got: ${msg}`);
  });

  it('handles numeric value and delta', () => {
    const fields = {
      skillName: 'x',
      hypothesis: 'h',
      target: 't',
      value: 42,
      delta: -3.5,
      status: 'pass',
      secondaries: null,
    };
    const msg = formatCommitMessage(fields);

    for (const fragment of ['t=42', 'delta=-3.5%']) {
      assert.ok(msg.includes(fragment));
    }
  });
});
|
|
84
|
+
|
|
85
|
+
// Checks that parseExperimentLine turns a commit subject back into structured fields.
describe('parseExperimentLine', () => {
  it('parses a well-formed experiment subject', () => {
    const expected = {
      hash: 'abc123',
      skillName: 'browse-fetch',
      hypothesis: 'reduce timeout',
      target: 'latency',
      value: '4.2s',
      delta: -16,
      status: 'pass',
      secondaries: 'accuracy=98%',
    };
    const parsed = parseExperimentLine(
      'abc123',
      'experiment(browse-fetch): reduce timeout | latency=4.2s delta=-16% pass | accuracy=98%'
    );

    assert.deepStrictEqual(parsed, expected);
  });

  it('returns null for non-experiment commits', () => {
    for (const subject of ['feat(core): add feature', 'random text']) {
      assert.strictEqual(parseExperimentLine('abc', subject), null);
    }
  });

  it('returns null for malformed metrics section', () => {
    const parsed = parseExperimentLine('abc', 'experiment(x): hyp | bad-metrics | sec');
    assert.strictEqual(parsed, null);
  });

  it('parses positive delta', () => {
    const parsed = parseExperimentLine('def', 'experiment(s): h | metric=100 delta=+12.5% pass | ');
    assert.strictEqual(parsed.delta, 12.5);
  });

  it('handles empty secondaries', () => {
    const parsed = parseExperimentLine('ghi', 'experiment(s): h | m=1 delta=0% inconclusive | ');
    assert.strictEqual(parsed.secondaries, '');
    assert.strictEqual(parsed.status, 'inconclusive');
  });
});
|
|
124
|
+
|
|
125
|
+
// --- Integration tests with temp git repos ---
|
|
126
|
+
|
|
127
|
+
// Integration tests: commitExperiment is exercised against a real temporary git repository.
describe('commitExperiment', () => {
  let cwd;

  // Runs a git command inside the temp repo and returns its trimmed stdout.
  const git = (command) => execSync(command, { cwd, stdio: 'pipe' }).toString().trim();

  before(() => {
    cwd = createTempRepo();
  });

  after(() => {
    cleanupRepo(cwd);
  });

  it('creates a commit with correctly formatted message (AC-10)', () => {
    // Give the experiment something to commit
    fs.writeFileSync(path.join(cwd, 'experiment1.txt'), 'trial 1\n');

    const experiment = {
      skillName: 'browse-fetch',
      hypothesis: 'reduce timeout to 5s',
      target: 'latency',
      value: '4.2s',
      delta: '-16',
      status: 'pass',
      secondaries: 'accuracy=98%',
    };
    const hash = commitExperiment({ cwd, ...experiment });

    assert.ok(typeof hash === 'string');
    assert.ok(hash.length >= 7, `Hash should be at least 7 chars, got: ${hash}`);

    // The subject of the newest commit must follow the experiment format
    const expectedSubject =
      'experiment(browse-fetch): reduce timeout to 5s | latency=4.2s delta=-16% pass | accuracy=98%';
    assert.strictEqual(git('git log -1 --format=%s'), expectedSubject);
  });

  it('stages all changes before committing', () => {
    // The file is untracked on purpose: commitExperiment is expected to stage it
    fs.writeFileSync(path.join(cwd, 'untracked.txt'), 'new file\n');

    const experiment = {
      skillName: 'test-skill',
      hypothesis: 'auto-stage',
      target: 'coverage',
      value: '80',
      delta: '+2',
      status: 'pass',
      secondaries: '',
    };
    commitExperiment({ cwd, ...experiment });

    // List the files touched by HEAD and look for the new one
    const committedFiles = git('git diff-tree --no-commit-id --name-only -r HEAD');
    assert.ok(committedFiles.includes('untracked.txt'));
  });
});
|
|
185
|
+
|
|
186
|
+
// Integration tests: revertExperiment must undo an experiment via a revert commit.
describe('revertExperiment (AC-7)', () => {
  let cwd;

  // Runs a git command inside the temp repo and returns its raw stdout as a string.
  const gitStdout = (command) => execSync(command, { cwd, stdio: 'pipe' }).toString();

  // Subject line of the commit currently at HEAD.
  const headSubject = () => gitStdout('git log -1 --format=%s').trim();

  before(() => {
    cwd = createTempRepo();
  });

  after(() => {
    cleanupRepo(cwd);
  });

  it('uses git revert (not reset) — preserves experiment commit in history', () => {
    // Record an experiment commit that will then be reverted
    fs.writeFileSync(path.join(cwd, 'exp.txt'), 'experiment data\n');
    const experiment = {
      skillName: 'skill-a',
      hypothesis: 'bad idea',
      target: 'speed',
      value: '10',
      delta: '-5',
      status: 'fail',
      secondaries: '',
    };
    commitExperiment({ cwd, ...experiment });
    const experimentSubject = headSubject();

    const revertHash = revertExperiment({ cwd });
    assert.ok(typeof revertHash === 'string');

    // HEAD is now expected to be a commit whose subject starts with "Revert"
    const revertSubject = headSubject();
    assert.ok(revertSubject.startsWith('Revert'), `Expected revert commit to start with "Revert", got: ${revertSubject}`);

    // The experiment commit must still be listed in the log (history not rewritten)
    const fullLog = gitStdout('git log --format=%s');
    assert.ok(fullLog.includes(experimentSubject), 'Original experiment commit should remain in history after revert');
  });

  it('reverted file content matches pre-experiment state', () => {
    const statePath = path.join(cwd, 'state.txt');

    // Baseline commit that the revert should bring the file back to
    fs.writeFileSync(statePath, 'before\n');
    gitStdout('git add -A && git commit -m "baseline"');

    // Experiment overwrites the file, then gets reverted
    fs.writeFileSync(statePath, 'after experiment\n');
    const experiment = {
      skillName: 's',
      hypothesis: 'h',
      target: 't',
      value: '1',
      delta: '0',
      status: 'fail',
      secondaries: '',
    };
    commitExperiment({ cwd, ...experiment });

    revertExperiment({ cwd });

    assert.strictEqual(fs.readFileSync(statePath, 'utf-8'), 'before\n');
  });
});
|
|
249
|
+
|
|
250
|
+
// Integration tests: queryExperiments reads experiment commits back out of git history.
describe('queryExperiments', () => {
  let cwd;

  before(() => {
    cwd = createTempRepo();

    // Three experiment commits, in this order, spread over two skills
    const fixtures = [
      {
        file: 'a.txt',
        content: '1',
        experiment: {
          skillName: 'skill-a', hypothesis: 'hyp-a1',
          target: 'speed', value: '100', delta: '+10', status: 'pass', secondaries: 'mem=50MB',
        },
      },
      {
        file: 'b.txt',
        content: '2',
        experiment: {
          skillName: 'skill-b', hypothesis: 'hyp-b1',
          target: 'accuracy', value: '95', delta: '-2', status: 'fail', secondaries: '',
        },
      },
      {
        file: 'c.txt',
        content: '3',
        experiment: {
          skillName: 'skill-a', hypothesis: 'hyp-a2',
          target: 'speed', value: '120', delta: '+20', status: 'pass', secondaries: '',
        },
      },
    ];

    for (const { file, content, experiment } of fixtures) {
      fs.writeFileSync(path.join(cwd, file), content);
      commitExperiment({ cwd, ...experiment });
    }
  });

  after(() => {
    cleanupRepo(cwd);
  });

  it('returns all experiments when no skillName filter', () => {
    assert.strictEqual(queryExperiments({ cwd }).length, 3);
  });

  it('filters by skillName', () => {
    const results = queryExperiments({ cwd, skillName: 'skill-a' });
    assert.strictEqual(results.length, 2);
    results.forEach((entry) => {
      assert.strictEqual(entry.skillName, 'skill-a');
    });
  });

  it('returns empty array for unknown skill', () => {
    assert.deepStrictEqual(queryExperiments({ cwd, skillName: 'nonexistent' }), []);
  });

  it('parses commit messages into structured objects with correct fields', () => {
    const results = queryExperiments({ cwd, skillName: 'skill-a' });
    // git log returns newest first
    const newest = results[0];
    const expectedFields = {
      hypothesis: 'hyp-a2',
      target: 'speed',
      value: '120',
      delta: 20,
      status: 'pass',
    };
    for (const [field, expected] of Object.entries(expectedFields)) {
      assert.strictEqual(newest[field], expected);
    }
  });

  it('returns empty array in repo with no experiment commits', () => {
    const emptyRepo = createTempRepo();
    try {
      assert.deepStrictEqual(queryExperiments({ cwd: emptyRepo, skillName: 'anything' }), []);
    } finally {
      cleanupRepo(emptyRepo);
    }
  });
});
|
|
319
|
+
|
|
320
|
+
// Integration tests: getExperimentHistory renders experiment commits as text lines.
describe('getExperimentHistory', () => {
  let cwd;

  // Fetches the history for the shared repo and splits it into individual lines.
  const historyLines = (options) => getExperimentHistory({ cwd, ...options }).split('\n');

  before(() => {
    cwd = createTempRepo();

    // Five "perf" experiments; odd-numbered trials pass, even-numbered ones fail
    for (const trial of [1, 2, 3, 4, 5]) {
      const passed = trial % 2 !== 0;
      fs.writeFileSync(path.join(cwd, `file${trial}.txt`), `${trial}`);
      commitExperiment({
        cwd,
        skillName: 'perf',
        hypothesis: `trial-${trial}`,
        target: 'throughput',
        value: `${trial * 100}`,
        delta: `+${trial}`,
        status: passed ? 'pass' : 'fail',
        secondaries: '',
      });
    }
  });

  after(() => {
    cleanupRepo(cwd);
  });

  it('returns formatted multi-line string', () => {
    const lines = historyLines({ skillName: 'perf' });
    assert.strictEqual(lines.length, 5);
    // Every line is expected to be bracket-prefixed and to mention the skill
    lines.forEach((line) => {
      assert.ok(line.includes('perf:'), `Line should contain "perf:": ${line}`);
      assert.ok(line.startsWith('['), `Line should start with "[": ${line}`);
    });
  });

  it('respects maxEntries', () => {
    const lines = historyLines({ skillName: 'perf', maxEntries: 2 });
    assert.strictEqual(lines.length, 2);
  });

  it('defaults maxEntries to 20', () => {
    // Only 5 experiments exist, so all of them fit under the default cap (5 < 20)
    const lines = historyLines({ skillName: 'perf' });
    assert.strictEqual(lines.length, 5);
  });

  it('returns "(no experiment history)" when no experiments match', () => {
    assert.strictEqual(
      getExperimentHistory({ cwd, skillName: 'nonexistent' }),
      '(no experiment history)'
    );
  });

  it('returns "(no experiment history)" for empty repo', () => {
    const emptyRepo = createTempRepo();
    try {
      assert.strictEqual(getExperimentHistory({ cwd: emptyRepo }), '(no experiment history)');
    } finally {
      cleanupRepo(emptyRepo);
    }
  });
});
|
|
379
|
+
|
|
380
|
+
// End-to-end check: what commitExperiment writes, queryExperiments must read back unchanged.
describe('round-trip: commit → query → verify', () => {
  let cwd;

  before(() => {
    cwd = createTempRepo();
  });

  after(() => {
    cleanupRepo(cwd);
  });

  it('parsed fields match original input', () => {
    const input = {
      skillName: 'round-trip-skill',
      hypothesis: 'cache improves latency',
      target: 'p99',
      value: '42ms',
      delta: '-33',
      status: 'pass',
      secondaries: 'p50=12ms cpu=65%',
    };

    fs.writeFileSync(path.join(cwd, 'rt.txt'), 'round-trip test\n');
    const commitHash = commitExperiment({ cwd, ...input });

    const experiments = queryExperiments({ cwd, skillName: 'round-trip-skill' });
    assert.strictEqual(experiments.length, 1);
    const [parsed] = experiments;

    assert.strictEqual(parsed.skillName, input.skillName);
    assert.strictEqual(parsed.hypothesis, input.hypothesis);
    assert.strictEqual(parsed.target, input.target);
    assert.strictEqual(parsed.value, input.value);
    // delta goes in as a string and is compared against its numeric value
    assert.strictEqual(parsed.delta, parseFloat(input.delta));
    assert.strictEqual(parsed.status, input.status);
    assert.strictEqual(parsed.secondaries, input.secondaries);

    // One hash may be the abbreviated form of the other, so accept a prefix match either way
    const queryHashExtendsCommitHash = parsed.hash.startsWith(commitHash);
    const commitHashExtendsShortHash = commitHash.startsWith(parsed.hash.slice(0, 7));
    assert.ok(
      queryHashExtendsCommitHash || commitHashExtendsShortHash,
      `Hashes should be related: commit=${commitHash}, query=${parsed.hash}`
    );
  });

  it('round-trip with numeric delta preserves sign', () => {
    fs.writeFileSync(path.join(cwd, 'rt2.txt'), 'test\n');
    const experiment = {
      skillName: 'sign-test',
      hypothesis: 'negative delta',
      target: 'errors',
      value: '3',
      delta: '-50',
      status: 'pass',
      secondaries: '',
    };
    commitExperiment({ cwd, ...experiment });

    const matches = queryExperiments({ cwd, skillName: 'sign-test' });
    const result = matches[0];
    assert.strictEqual(result.delta, -50);
  });
});
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Hypothesis loading for df:eval.
|
|
5
|
+
*
|
|
6
|
+
* AC-11: Loop accepts --hypothesis flag; without it, reads hypotheses.md from
|
|
7
|
+
* benchmark dir and returns the next unused hypothesis.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const fs = require('fs');
|
|
11
|
+
const path = require('path');
|
|
12
|
+
|
|
13
|
+
/**
 * Extract hypotheses from the text of a hypotheses.md file.
 *
 * A line counts as a hypothesis when it begins (at column 0) with an ordered
 * marker such as `1.` or an unordered marker (`-` or `*`), followed by
 * whitespace and some text. All other lines are skipped.
 *
 * @param {string} content - Raw file content
 * @returns {string[]} - Hypothesis strings in file order, trimmed, never empty
 */
function parseHypothesesFile(content) {
  const listItemPattern = /^(?:\d+\.|[-*])\s+(.+)/;
  const hypotheses = [];

  for (const rawLine of content.split('\n')) {
    const found = listItemPattern.exec(rawLine);
    if (found === null) {
      continue;
    }

    // The captured text can be whitespace only (e.g. "-  "), so re-check after trimming
    const text = found[1].trim();
    if (text.length > 0) {
      hypotheses.push(text);
    }
  }

  return hypotheses;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Resolve the hypothesis to use for an eval session.
 *
 * Resolution order:
 * 1. If `flag` is a string with non-whitespace content, return it trimmed.
 * 2. Otherwise read `{benchDir}/hypotheses.md` and return its first list item.
 *
 * "Next unused" is kept simple for now: the first list item is always
 * returned. Iteration tracking (marking items as used) is left to the loop's
 * git-memory history.
 *
 * @param {object} opts
 * @param {string} [opts.flag] - Value of --hypothesis CLI flag (may be undefined)
 * @param {string} opts.benchDir - Path to the benchmark directory
 * @returns {string} - The hypothesis string to use
 * @throws {Error} - If hypotheses.md cannot be read (the underlying fs error is
 *   attached as `cause`) or if it contains no list items
 */
function loadHypothesis({ flag, benchDir }) {
  // 1. CLI flag takes priority
  if (typeof flag === 'string') {
    const trimmedFlag = flag.trim();
    if (trimmedFlag.length > 0) {
      return trimmedFlag;
    }
  }

  // 2. Fall back to hypotheses.md
  const hypothesesPath = path.join(benchDir, 'hypotheses.md');

  let content;
  try {
    content = fs.readFileSync(hypothesesPath, 'utf8');
  } catch (err) {
    // Keep the original fs error reachable so callers can inspect e.g. err.cause.code
    throw new Error(
      `No --hypothesis flag provided and could not read ${hypothesesPath}: ${err.message}`,
      { cause: err }
    );
  }

  const hypotheses = parseHypothesesFile(content);

  if (hypotheses.length === 0) {
    throw new Error(
      `No hypotheses found in ${hypothesesPath}. Add list items (- ... or 1. ...) to define hypotheses.`
    );
  }

  // Return the first hypothesis (loop history tracks which were attempted)
  return hypotheses[0];
}
|
|
76
|
+
|
|
77
|
+
module.exports = {
|
|
78
|
+
loadHypothesis,
|
|
79
|
+
parseHypothesesFile,
|
|
80
|
+
};
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { describe, it, before, after } = require('node:test');
|
|
4
|
+
const assert = require('node:assert');
|
|
5
|
+
const fs = require('fs');
|
|
6
|
+
const path = require('path');
|
|
7
|
+
const os = require('os');
|
|
8
|
+
|
|
9
|
+
const { loadHypothesis, parseHypothesesFile } = require('./hypothesis.js');
|
|
10
|
+
|
|
11
|
+
// --- parseHypothesesFile ---
|
|
12
|
+
|
|
13
|
+
// Table-driven checks of parseHypothesesFile: each case is one markdown input and its expected list.
describe('parseHypothesesFile', () => {
  const cases = [
    {
      title: 'parses ordered list items (1. ...)',
      content: '1. First hypothesis\n2. Second hypothesis\n3. Third one\n',
      expected: ['First hypothesis', 'Second hypothesis', 'Third one'],
    },
    {
      title: 'parses unordered list items with dashes (- ...)',
      content: '- Dash one\n- Dash two\n',
      expected: ['Dash one', 'Dash two'],
    },
    {
      title: 'parses unordered list items with asterisks (* ...)',
      content: '* Star one\n* Star two\n',
      expected: ['Star one', 'Star two'],
    },
    {
      title: 'handles mixed ordered and unordered items',
      content: '1. Ordered first\n- Dash second\n* Star third\n2. Ordered fourth\n',
      expected: ['Ordered first', 'Dash second', 'Star third', 'Ordered fourth'],
    },
    {
      title: 'returns empty array for empty content',
      content: '',
      expected: [],
    },
    {
      title: 'returns empty array when content has no list items',
      content: '# Hypotheses\n\nSome paragraph text.\nAnother line.\n',
      expected: [],
    },
    {
      title: 'ignores non-list lines interspersed with list items',
      content: '# Title\n\n1. Real item\nNot a list item\n- Another real item\n',
      expected: ['Real item', 'Another real item'],
    },
    {
      title: 'trims whitespace from parsed items',
      content: '1.   Lots of spaces   \n-   Also spaced  \n',
      expected: ['Lots of spaces', 'Also spaced'],
    },
  ];

  for (const { title, content, expected } of cases) {
    it(title, () => {
      assert.deepStrictEqual(parseHypothesesFile(content), expected);
    });
  }
});
|
|
69
|
+
|
|
70
|
+
// --- loadHypothesis ---
|
|
71
|
+
|
|
72
|
+
// Tests for loadHypothesis: flag precedence, fallback to hypotheses.md, and error cases.
describe('loadHypothesis', () => {
  let tmpDir;

  /**
   * Runs `fn` against a fresh benchmark directory containing a hypotheses.md
   * file with the given content. The directory is removed in `finally`, so a
   * failing assertion inside `fn` no longer leaves it behind in the OS temp dir.
   *
   * @param {string} prefix - Prefix for the temp directory name
   * @param {string} hypothesesContent - Content written to hypotheses.md
   * @param {(benchDir: string) => void} fn - Test body receiving the directory path
   */
  function withBenchDir(prefix, hypothesesContent, fn) {
    const benchDir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
    try {
      fs.writeFileSync(path.join(benchDir, 'hypotheses.md'), hypothesesContent);
      fn(benchDir);
    } finally {
      fs.rmSync(benchDir, { recursive: true, force: true });
    }
  }

  before(() => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hypothesis-test-'));
  });

  after(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  });

  it('returns flag when provided (AC-11)', () => {
    const result = loadHypothesis({ flag: 'my hypothesis', benchDir: tmpDir });
    assert.strictEqual(result, 'my hypothesis');
  });

  it('trims the flag value', () => {
    const result = loadHypothesis({ flag: '  padded  ', benchDir: tmpDir });
    assert.strictEqual(result, 'padded');
  });

  it('reads hypotheses.md when no flag is provided', () => {
    withBenchDir('hyp-read-', '1. First from file\n2. Second from file\n', (benchDir) => {
      const result = loadHypothesis({ benchDir });
      assert.strictEqual(result, 'First from file');
    });
  });

  it('ignores empty-string flag and falls back to file', () => {
    withBenchDir('hyp-empty-', '- Fallback hypothesis\n', (benchDir) => {
      const result = loadHypothesis({ flag: '', benchDir });
      assert.strictEqual(result, 'Fallback hypothesis');
    });
  });

  it('ignores whitespace-only flag and falls back to file', () => {
    withBenchDir('hyp-ws-', '- WS fallback\n', (benchDir) => {
      const result = loadHypothesis({ flag: '   ', benchDir });
      assert.strictEqual(result, 'WS fallback');
    });
  });

  it('throws when neither flag nor file available', () => {
    // This sub-directory is never created, so reading hypotheses.md from it must fail
    const missingDir = path.join(tmpDir, 'nonexistent');
    assert.throws(
      () => loadHypothesis({ benchDir: missingDir }),
      (err) => {
        assert.ok(err instanceof Error);
        assert.ok(err.message.includes('No --hypothesis flag provided'));
        assert.ok(err.message.includes('hypotheses.md'));
        return true;
      }
    );
  });

  it('throws when file exists but contains no list items', () => {
    withBenchDir('hyp-nolist-', '# Just a heading\n\nSome text but no list items.\n', (benchDir) => {
      assert.throws(
        () => loadHypothesis({ benchDir }),
        (err) => {
          assert.ok(err instanceof Error);
          assert.ok(err.message.includes('No hypotheses found'));
          return true;
        }
      );
    });
  });

  it('throws when file is empty', () => {
    withBenchDir('hyp-empty-file-', '', (benchDir) => {
      assert.throws(
        () => loadHypothesis({ benchDir }),
        (err) => {
          assert.ok(err instanceof Error);
          assert.ok(err.message.includes('No hypotheses found'));
          return true;
        }
      );
    });
  });
});
|