deepflow 0.1.103 → 0.1.104
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install-dynamic-hooks.test.js +461 -0
- package/bin/install.js +150 -250
- package/bin/lineage-ingest.js +70 -0
- package/hooks/df-check-update.js +1 -0
- package/hooks/df-command-usage.js +18 -0
- package/hooks/df-dashboard-push.js +1 -0
- package/hooks/df-execution-history.js +1 -0
- package/hooks/df-explore-protocol.js +83 -0
- package/hooks/df-explore-protocol.test.js +228 -0
- package/hooks/df-hook-event-tags.test.js +127 -0
- package/hooks/df-invariant-check.js +1 -0
- package/hooks/df-quota-logger.js +1 -0
- package/hooks/df-snapshot-guard.js +1 -0
- package/hooks/df-spec-lint.js +58 -1
- package/hooks/df-spec-lint.test.js +412 -0
- package/hooks/df-statusline.js +1 -0
- package/hooks/df-subagent-registry.js +1 -0
- package/hooks/df-tool-usage.js +13 -3
- package/hooks/df-worktree-guard.js +1 -0
- package/package.json +1 -1
- package/src/commands/df/debate.md +1 -1
- package/src/commands/df/eval.md +117 -0
- package/src/commands/df/execute.md +1 -1
- package/src/commands/df/fix.md +104 -0
- package/src/eval/git-memory.js +159 -0
- package/src/eval/git-memory.test.js +439 -0
- package/src/eval/hypothesis.js +80 -0
- package/src/eval/hypothesis.test.js +169 -0
- package/src/eval/loop.js +378 -0
- package/src/eval/loop.test.js +306 -0
- package/src/eval/metric-collector.js +163 -0
- package/src/eval/metric-collector.test.js +369 -0
- package/src/eval/metric-pivot.js +119 -0
- package/src/eval/metric-pivot.test.js +350 -0
- package/src/eval/mutator-prompt.js +106 -0
- package/src/eval/mutator-prompt.test.js +180 -0
- package/templates/config-template.yaml +5 -0
- package/templates/eval-fixture-template/config.yaml +39 -0
- package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
- package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
- package/templates/eval-fixture-template/fixture/package.json +12 -0
- package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
- package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
- package/templates/eval-fixture-template/fixture/src/config.js +40 -0
- package/templates/eval-fixture-template/fixture/src/index.js +19 -0
- package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
- package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
- package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
- package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
- package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
- package/templates/eval-fixture-template/hypotheses.md +14 -0
- package/templates/eval-fixture-template/spec.md +34 -0
- package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
- package/templates/eval-fixture-template/tests/guard.test.js +108 -0
- package/templates/eval-fixture-template.test.js +318 -0
- package/templates/explore-agent.md +5 -74
- package/templates/explore-protocol.md +44 -0
- package/templates/spec-template.md +4 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Hypothesis loading for df:eval.
|
|
5
|
+
*
|
|
6
|
+
* AC-11: Loop accepts --hypothesis flag; without it, reads hypotheses.md from
|
|
7
|
+
* benchmark dir and returns the next unused hypothesis.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const fs = require('fs');
|
|
11
|
+
const path = require('path');
|
|
12
|
+
|
|
13
|
+
/**
 * Parse markdown list items from hypotheses.md content.
 * Recognises both ordered (1. ...) and unordered (- ... / * ...) list items
 * whose marker is the first character of a line; all other lines are ignored.
 *
 * A leading byte-order mark is stripped and LF, CRLF and bare-CR line endings
 * are all accepted, so files saved by Windows editors do not silently lose
 * their first (or every subsequent) hypothesis.
 *
 * @param {string} content - Raw file content
 * @returns {string[]} - Array of hypothesis strings (trimmed, non-empty)
 */
function parseHypothesesFile(content) {
  // Marker (digits + "." or a single "-" / "*"), whitespace, then the item text.
  const listItemPattern = /^(?:\d+\.|[-*])\s+(.+)/;

  return content
    // A BOM would sit in front of the first marker and defeat the `^` anchor.
    .replace(/^\uFEFF/, '')
    .split(/\r\n?|\n/)
    .map((line) => line.match(listItemPattern))
    .filter(Boolean)
    .map((m) => m[1].trim())
    .filter((h) => h.length > 0);
}
|
|
28
|
+
|
|
29
|
+
/**
 * Load the active hypothesis for an eval session.
 *
 * Resolution order:
 *   1. If `flag` is a non-empty string → return it (trimmed).
 *   2. Otherwise read `{benchDir}/hypotheses.md` and return the first entry.
 *      If the file cannot be read or contains no list items, throw an error.
 *
 * "Next unused" is kept simple for now: always return the first list item.
 * Iteration tracking (marking items as used) is left to the loop's git-memory
 * history, which records which hypotheses were already attempted.
 *
 * @param {object} opts
 * @param {string} [opts.flag] - Value of --hypothesis CLI flag (may be undefined)
 * @param {string} opts.benchDir - Path to the benchmark directory
 * @returns {string} - The hypothesis string to use
 * @throws {Error} - If no hypothesis can be resolved. When the file could not
 *   be read, the underlying filesystem error is available as `error.cause`.
 */
function loadHypothesis({ flag, benchDir }) {
  // 1. CLI flag takes priority (blank / whitespace-only values are ignored)
  if (typeof flag === 'string') {
    const trimmedFlag = flag.trim();
    if (trimmedFlag.length > 0) {
      return trimmedFlag;
    }
  }

  // 2. Fall back to hypotheses.md
  const hypothesesPath = path.join(benchDir, 'hypotheses.md');

  let content;
  try {
    content = fs.readFileSync(hypothesesPath, 'utf8');
  } catch (err) {
    // Keep the original error attached so callers can still inspect its
    // code (ENOENT, EACCES, ...) and stack.
    throw new Error(
      `No --hypothesis flag provided and could not read ${hypothesesPath}: ${err.message}`,
      { cause: err }
    );
  }

  const hypotheses = parseHypothesesFile(content);

  if (hypotheses.length === 0) {
    throw new Error(
      `No hypotheses found in ${hypothesesPath}. Add list items (- ... or 1. ...) to define hypotheses.`
    );
  }

  // Return the first hypothesis (loop history tracks which were attempted)
  return hypotheses[0];
}
|
|
76
|
+
|
|
77
|
+
module.exports = {
|
|
78
|
+
loadHypothesis,
|
|
79
|
+
parseHypothesesFile,
|
|
80
|
+
};
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { describe, it, before, after } = require('node:test');
|
|
4
|
+
const assert = require('node:assert');
|
|
5
|
+
const fs = require('fs');
|
|
6
|
+
const path = require('path');
|
|
7
|
+
const os = require('os');
|
|
8
|
+
|
|
9
|
+
const { loadHypothesis, parseHypothesesFile } = require('./hypothesis.js');
|
|
10
|
+
|
|
11
|
+
// --- parseHypothesesFile ---
|
|
12
|
+
|
|
13
|
+
describe('parseHypothesesFile', () => {
  // Table of scenarios: [test title, raw markdown input, expected hypotheses].
  const scenarios = [
    [
      'parses ordered list items (1. ...)',
      '1. First hypothesis\n2. Second hypothesis\n3. Third one\n',
      ['First hypothesis', 'Second hypothesis', 'Third one'],
    ],
    [
      'parses unordered list items with dashes (- ...)',
      '- Dash one\n- Dash two\n',
      ['Dash one', 'Dash two'],
    ],
    [
      'parses unordered list items with asterisks (* ...)',
      '* Star one\n* Star two\n',
      ['Star one', 'Star two'],
    ],
    [
      'handles mixed ordered and unordered items',
      '1. Ordered first\n- Dash second\n* Star third\n2. Ordered fourth\n',
      ['Ordered first', 'Dash second', 'Star third', 'Ordered fourth'],
    ],
    ['returns empty array for empty content', '', []],
    [
      'returns empty array when content has no list items',
      '# Hypotheses\n\nSome paragraph text.\nAnother line.\n',
      [],
    ],
    [
      'ignores non-list lines interspersed with list items',
      '# Title\n\n1. Real item\nNot a list item\n- Another real item\n',
      ['Real item', 'Another real item'],
    ],
    [
      'trims whitespace from parsed items',
      '1. Lots of spaces \n- Also spaced \n',
      ['Lots of spaces', 'Also spaced'],
    ],
  ];

  // Register one test per scenario; each only compares parser output to the
  // expected array.
  for (const [title, input, expected] of scenarios) {
    it(title, () => {
      assert.deepStrictEqual(parseHypothesesFile(input), expected);
    });
  }
});
|
|
69
|
+
|
|
70
|
+
// --- loadHypothesis ---
|
|
71
|
+
|
|
72
|
+
describe('loadHypothesis', () => {
  // Suite-wide scratch directory. Every per-test benchmark dir is created
  // inside it, so the single `after` hook cleans everything up even when an
  // assertion fails part-way through a test.
  let tmpDir;

  before(() => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hypothesis-test-'));
  });

  after(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  });

  /**
   * Create a fresh benchmark directory containing a hypotheses.md file.
   *
   * @param {string} prefix - Directory name prefix (a unique suffix is appended)
   * @param {string} content - Content written to hypotheses.md
   * @returns {string} Path of the created benchmark directory
   */
  function makeBenchDir(prefix, content) {
    const benchDir = fs.mkdtempSync(path.join(tmpDir, prefix));
    fs.writeFileSync(path.join(benchDir, 'hypotheses.md'), content);
    return benchDir;
  }

  it('returns flag when provided (AC-11)', () => {
    const result = loadHypothesis({ flag: 'my hypothesis', benchDir: tmpDir });
    assert.strictEqual(result, 'my hypothesis');
  });

  it('trims the flag value', () => {
    const result = loadHypothesis({ flag: ' padded ', benchDir: tmpDir });
    assert.strictEqual(result, 'padded');
  });

  it('reads hypotheses.md when no flag is provided', () => {
    const benchDir = makeBenchDir('hyp-read-', '1. First from file\n2. Second from file\n');
    const result = loadHypothesis({ benchDir });
    assert.strictEqual(result, 'First from file');
  });

  it('ignores empty-string flag and falls back to file', () => {
    const benchDir = makeBenchDir('hyp-empty-', '- Fallback hypothesis\n');
    const result = loadHypothesis({ flag: '', benchDir });
    assert.strictEqual(result, 'Fallback hypothesis');
  });

  it('ignores whitespace-only flag and falls back to file', () => {
    const benchDir = makeBenchDir('hyp-ws-', '- WS fallback\n');
    const result = loadHypothesis({ flag: ' ', benchDir });
    assert.strictEqual(result, 'WS fallback');
  });

  it('throws when neither flag nor file available', () => {
    const missingDir = path.join(tmpDir, 'nonexistent');
    assert.throws(
      () => loadHypothesis({ benchDir: missingDir }),
      (err) => {
        assert.ok(err instanceof Error);
        assert.ok(err.message.includes('No --hypothesis flag provided'));
        assert.ok(err.message.includes('hypotheses.md'));
        return true;
      }
    );
  });

  it('throws when file exists but contains no list items', () => {
    const benchDir = makeBenchDir(
      'hyp-nolist-',
      '# Just a heading\n\nSome text but no list items.\n'
    );
    assert.throws(
      () => loadHypothesis({ benchDir }),
      (err) => {
        assert.ok(err instanceof Error);
        assert.ok(err.message.includes('No hypotheses found'));
        return true;
      }
    );
  });

  it('throws when file is empty', () => {
    const benchDir = makeBenchDir('hyp-empty-file-', '');
    assert.throws(
      () => loadHypothesis({ benchDir }),
      (err) => {
        assert.ok(err instanceof Error);
        assert.ok(err.message.includes('No hypotheses found'));
        return true;
      }
    );
  });
});
|
package/src/eval/loop.js
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Karpathy loop orchestrator for df:eval.
|
|
5
|
+
*
|
|
6
|
+
* Implements the core eval loop: mutate → commit → guard → measure → keep/revert.
|
|
7
|
+
* Worktree-isolated, git-as-memory, single target metric decides.
|
|
8
|
+
*
|
|
9
|
+
* AC-1: Guard failure auto-reverts before metric comparison (status:guard_fail)
|
|
10
|
+
* AC-2: Target improvement keeps; regression reverts (status:kept / status:reverted)
|
|
11
|
+
* AC-3: Secondary metrics in commit message, never decide
|
|
12
|
+
* AC-6: Runs indefinitely until Ctrl+C; --loop N caps at N iterations
|
|
13
|
+
* AC-7: Reverts via git revert (not reset)
|
|
14
|
+
* AC-12: All experiments on worktree-isolated branch
|
|
15
|
+
* AC-13: Commit before verify for clean rollback
|
|
16
|
+
* AC-15: Loop terminates on Ctrl+C or --loop N
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
const fs = require('fs');
|
|
20
|
+
const path = require('path');
|
|
21
|
+
const { execFileSync, execSync } = require('child_process');
|
|
22
|
+
const { buildMutatorPrompt } = require('./mutator-prompt');
|
|
23
|
+
const { collectMetrics } = require('./metric-collector');
|
|
24
|
+
const { commitExperiment, revertExperiment, getExperimentHistory } = require('./git-memory');
|
|
25
|
+
|
|
26
|
+
/**
 * Create a worktree-isolated branch for the eval session.
 * AC-12: All experiments on worktree-isolated branch.
 *
 * The branch is named `eval/{skillName}/{timestamp}` and the worktree is
 * placed at `{repoRoot}/.deepflow/worktrees/eval-{skillName}-{timestamp}`.
 *
 * @param {string} repoRoot - Root of the main git repo
 * @param {string} skillName - Skill being evaluated (used in branch name)
 * @returns {{ branch: string, worktreePath: string }}
 * @throws {Error} - If the `git worktree add` command fails
 */
function createEvalWorktree(repoRoot, skillName) {
  const timestamp = Date.now();
  const branch = `eval/${skillName}/${timestamp}`;
  const worktreeBase = path.join(repoRoot, '.deepflow', 'worktrees');

  // Ensure worktree base exists
  fs.mkdirSync(worktreeBase, { recursive: true });

  const worktreePath = path.join(worktreeBase, `eval-${skillName}-${timestamp}`);

  // Create a new branch starting at the current HEAD, checked out in its own
  // worktree. Arguments are passed as an array (no shell), so quotes, `$` or
  // backticks in the repo path or skill name cannot alter the command.
  execFileSync('git', ['worktree', 'add', '-b', branch, worktreePath, 'HEAD'], {
    cwd: repoRoot,
    stdio: 'pipe',
  });

  return { branch, worktreePath };
}
|
|
52
|
+
|
|
53
|
+
/**
 * Remove an eval worktree (forced). The branch it was created for is left
 * untouched.
 *
 * This is best-effort cleanup: any failure of the git command (worktree
 * already gone, not a worktree, git unavailable) is deliberately ignored.
 *
 * @param {string} repoRoot - Directory the git command is run from
 * @param {string} worktreePath - Path of the worktree to remove
 * @returns {void}
 */
function removeEvalWorktree(repoRoot, worktreePath) {
  try {
    // Argument array instead of a shell string: the path is passed verbatim,
    // whatever characters it contains.
    execFileSync('git', ['worktree', 'remove', worktreePath, '--force'], {
      cwd: repoRoot,
      stdio: 'pipe',
    });
  } catch (_) {
    // best-effort cleanup
  }
}
|
|
69
|
+
|
|
70
|
+
/**
 * Run the guard check (build + test commands from config).
 * AC-1, AC-5: Guard = fixture tests via configured test command.
 *
 * `build_command` and `test_command` (whichever are set) are chained with
 * `&&` and run through the shell in `cwd`, with a two minute timeout. The
 * guard passes when the chain exits successfully.
 *
 * On failure the returned output contains everything the commands printed,
 * stdout as well as stderr, because many test runners report failures on
 * stdout. If nothing was captured, the error message is returned instead.
 *
 * @param {string} cwd - Working directory to run commands in
 * @param {object} [config={}] - Config with build_command / test_command
 * @returns {{ passed: boolean, output: string }}
 */
function runGuardCheck(cwd, config = {}) {
  const commands = [config.build_command, config.test_command].filter(Boolean);

  if (commands.length === 0) {
    return { passed: true, output: '(no guard commands configured)' };
  }

  try {
    const output = execSync(commands.join(' && '), {
      cwd,
      stdio: 'pipe',
      timeout: 120_000, // 2 minute timeout for guard
      // Raised above Node's default so a verbose but passing build/test run
      // is not killed (and reported as a guard failure) for its output size.
      maxBuffer: 64 * 1024 * 1024,
    }).toString();
    return { passed: true, output };
  } catch (err) {
    const captured = [err.stdout, err.stderr]
      .map((chunk) => (chunk == null ? '' : chunk.toString().trim()))
      .filter((text) => text.length > 0)
      .join('\n');
    return { passed: false, output: captured || err.message };
  }
}
|
|
99
|
+
|
|
100
|
+
/**
 * Compare a target metric between baseline and current.
 * Returns delta percentage and whether it improved.
 *
 * For metrics where "lower is better" (total_tokens, wall_time, context_burn),
 * improvement = current < baseline.
 * For every other metric (e.g. cache_ratio) "higher is better" is assumed:
 * improvement = current > baseline.
 *
 * The delta is the signed percentage change relative to |baseline|, rounded
 * to two decimals. With a zero baseline a relative change is undefined, so
 * the delta is reported as 0 (no change), +100 (increase) or -100 (decrease).
 *
 * @param {string} metricName
 * @param {number} baseline
 * @param {number} current
 * @returns {{ delta: number, improved: boolean }}
 */
function compareMetric(metricName, baseline, current) {
  // "Lower is better" metrics
  const lowerIsBetter = ['total_tokens', 'wall_time', 'context_burn'];

  let delta;
  if (baseline !== 0) {
    delta = ((current - baseline) / Math.abs(baseline)) * 100;
  } else if (current === 0) {
    delta = 0;
  } else {
    // Zero baseline: keep the sign of the change so a drop is not reported
    // as a +100% increase.
    delta = current > 0 ? 100 : -100;
  }

  const improved = lowerIsBetter.includes(metricName)
    ? current < baseline
    : current > baseline;

  return { delta: Math.round(delta * 100) / 100, improved };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Format secondary metrics for the commit message.
 * AC-3: Secondary metrics in commit message but never trigger keep/revert.
 *
 * Produces space-separated `name=value` pairs in the order given by
 * `secondaryMetrics`. The target metric and metrics whose value is null or
 * undefined are left out. Returns an empty string when there is nothing to
 * report, including when no metrics object is available.
 *
 * @param {object} metrics - Full metrics object
 * @param {string} targetMetric - Primary metric name (excluded from secondaries)
 * @param {string[]} secondaryMetrics - List of secondary metric names
 * @returns {string}
 */
function formatSecondaries(metrics, targetMetric, secondaryMetrics) {
  if (!secondaryMetrics || secondaryMetrics.length === 0) return '';
  // No metrics collected: nothing to report rather than a crash while
  // building the commit message.
  if (metrics == null) return '';

  return secondaryMetrics
    .filter((m) => m !== targetMetric && metrics[m] != null)
    .map((m) => `${m}=${metrics[m]}`)
    .join(' ');
}
|
|
146
|
+
|
|
147
|
+
/**
 * Extract the skill name from a skill file path.
 * e.g. "skills/atomic-commits/SKILL.md" → "atomic-commits"
 *
 * Both "/" and "\" are treated as separators regardless of platform, and
 * empty segments (leading, trailing or doubled separators) are ignored.
 * When the path has no directory in front of a SKILL.md segment, the name of
 * the last segment without its extension is used instead.
 *
 * @param {string} skillPath
 * @returns {string}
 */
function extractSkillName(skillPath) {
  const segments = skillPath.replace(/\\/g, '/').split('/').filter(Boolean);

  // Try to find the directory name before SKILL.md
  const skillIdx = segments.findIndex((segment) => /^SKILL\.md$/i.test(segment));
  if (skillIdx > 0) return segments[skillIdx - 1];

  // Fallback: use filename without extension. Taken from the normalised
  // segments so backslash-separated paths behave the same on every platform.
  const fileName = segments.length > 0 ? segments[segments.length - 1] : '';
  return path.posix.basename(fileName, path.posix.extname(fileName));
}
|
|
162
|
+
|
|
163
|
+
/**
 * Run the Karpathy eval loop.
 *
 * Each iteration: build the mutator prompt from the current skill file and
 * experiment history → ask `mutateSkill` for new skill content → write and
 * commit it (status "pending") → run the guard → on guard failure revert and
 * record "guard_fail"; otherwise collect metrics, compare the target metric
 * with the baseline and either keep (baseline becomes the new metrics) or
 * revert, recording "kept" / "reverted".
 *
 * The loop stops after `maxIterations` iterations or once SIGINT (Ctrl+C) has
 * been received. The worktree and its branch are left in place when the
 * function returns.
 *
 * @param {object} options
 * @param {string} options.repoRoot - Git repo root
 * @param {string} options.skillPath - Path to skill file (relative to repo root)
 * @param {string} options.benchDir - Path to benchmark directory
 * @param {string} options.target - Primary metric name (e.g. "cache_ratio")
 * @param {string} options.hypothesis - Mutation hypothesis
 * @param {number} [options.maxIterations=Infinity] - --loop N cap (AC-6, AC-15)
 * @param {string[]} [options.secondaryMetrics=[]] - Secondary metric names (AC-3)
 * @param {object} [options.config={}] - Project config (build_command, test_command)
 * @param {Function} options.mutateSkill - Async function that receives prompt and returns new skill content (string)
 * @param {Function} [options.onIteration] - Callback per iteration for logging
 * @returns {Promise<{ iterations: number, kept: number, reverted: number, guardFails: number, branch: string }>}
 * @throws {TypeError} - If `mutateSkill` is not a function
 */
async function runEvalLoop({
  repoRoot,
  skillPath,
  benchDir,
  target,
  hypothesis,
  maxIterations = Infinity,
  secondaryMetrics = [],
  config = {},
  mutateSkill,
  onIteration,
}) {
  // Fail fast, before a worktree is created: without a mutator every
  // iteration would only report a mutator error, and with the default
  // unlimited iteration count the loop would never end.
  if (typeof mutateSkill !== 'function') {
    throw new TypeError('runEvalLoop requires options.mutateSkill to be a function');
  }

  const skillName = extractSkillName(skillPath);
  const absoluteSkillPath = path.isAbsolute(skillPath)
    ? skillPath
    : path.join(repoRoot, skillPath);

  // AC-12: Create worktree-isolated branch
  const { branch, worktreePath } = createEvalWorktree(repoRoot, skillName);

  // Location of the skill file inside the worktree checkout
  const worktreeSkillPath = path.join(
    worktreePath,
    path.relative(repoRoot, absoluteSkillPath)
  );

  const deepflowDir = path.join(worktreePath, '.deepflow');

  const stats = { iterations: 0, kept: 0, reverted: 0, guardFails: 0, branch };

  // Fields shared by every experiment commit of this session
  const experiment = { cwd: worktreePath, skillName, hypothesis, target };

  // Track abort signal for Ctrl+C (AC-6, AC-15)
  let aborted = false;
  const abortHandler = () => { aborted = true; };
  process.on('SIGINT', abortHandler);

  try {
    // Collect baseline metrics before the loop starts
    let baselineMetrics = await collectMetrics(deepflowDir);

    // AC-6: Loop until Ctrl+C or --loop N reached
    while (!aborted && stats.iterations < maxIterations) {
      // Give the event loop a turn so a pending SIGINT can reach
      // abortHandler. Everything below is synchronous or promise-based, so
      // without this a fast-failing iteration could starve signal delivery
      // and make the loop impossible to interrupt.
      await new Promise((resolve) => setImmediate(resolve));
      if (aborted) break;

      stats.iterations++;
      const iterNum = stats.iterations;

      // --- Step 1: Build mutator prompt (T7) ---
      const currentSkillContent = fs.readFileSync(worktreeSkillPath, 'utf8');
      const historyStr = getExperimentHistory({ cwd: worktreePath, skillName });
      const historyEntries = historyStr === '(no experiment history)'
        ? []
        : historyStr.split('\n');

      const prompt = buildMutatorPrompt({
        skillContent: currentSkillContent,
        hypothesis,
        history: historyEntries,
      });

      // --- Step 2: Spawn agent to mutate skill file (full replacement) ---
      let newSkillContent;
      try {
        newSkillContent = await mutateSkill(prompt);
        if (typeof newSkillContent !== 'string') {
          // Treated like any other mutator failure instead of crashing the
          // session in writeFileSync below.
          throw new TypeError('mutateSkill must resolve to the new skill content as a string');
        }
      } catch (err) {
        // Mutator failure — log and continue to next iteration
        if (onIteration) {
          onIteration({ iteration: iterNum, status: 'mutator_error', error: err.message });
        }
        continue;
      }

      // Write mutated skill file
      fs.writeFileSync(worktreeSkillPath, newSkillContent, 'utf8');

      // --- Step 3: Commit experiment BEFORE verify (AC-13) ---
      // Placeholder values; the outcome is recorded by a follow-up commit.
      const experimentHash = commitExperiment({
        ...experiment,
        value: 'pending',
        delta: '0',
        status: 'pending',
        secondaries: '',
      });

      // --- Step 4: Run guard check ---
      const guardResult = runGuardCheck(worktreePath, config);

      // --- Step 5: Guard fail → revert, log guard_fail, next iteration (AC-1) ---
      if (!guardResult.passed) {
        revertExperiment({ cwd: worktreePath });
        stats.guardFails++;

        // The experiment commit has been reverted, so the guard_fail outcome
        // is recorded as its own entry for git-as-memory.
        commitExperiment({
          ...experiment,
          value: 'N/A',
          delta: '0',
          status: 'guard_fail',
          secondaries: '',
        });

        if (onIteration) {
          onIteration({
            iteration: iterNum,
            status: 'guard_fail',
            guardOutput: guardResult.output,
            hash: experimentHash,
          });
        }
        continue;
      }

      // --- Step 6: Collect metrics (T6) (AC-16) ---
      // NOTE(review): the window is a fixed "last two minutes" approximation,
      // not the actual start of this iteration — confirm this is intended.
      const startTs = Date.now() - 120_000; // approximate window
      const endTs = Date.now();
      const currentMetrics = await collectMetrics(deepflowDir, startTs, endTs);

      // --- Step 7: Compare target metric (AC-2) ---
      const baselineValue = baselineMetrics[target] || 0;
      const currentValue = currentMetrics[target] || 0;
      const { delta, improved } = compareMetric(target, baselineValue, currentValue);

      // AC-3: Format secondary metrics (never decide)
      const secondariesStr = formatSecondaries(currentMetrics, target, secondaryMetrics);

      let status;
      if (improved) {
        // Target improved → keep (AC-2: status:kept)
        status = 'kept';
        stats.kept++;

        // Update baseline to the new best
        baselineMetrics = currentMetrics;
      } else {
        // Target regression → revert (AC-2: status:reverted, AC-7: git revert)
        status = 'reverted';
        stats.reverted++;

        revertExperiment({ cwd: worktreePath });
      }

      // Record the outcome (kept marker or reverted result) in history
      commitExperiment({
        ...experiment,
        value: currentValue,
        delta: delta.toString(),
        status,
        secondaries: secondariesStr,
      });

      if (onIteration) {
        onIteration({
          iteration: iterNum,
          status,
          target,
          value: currentValue,
          delta,
          secondaries: secondariesStr,
          hash: experimentHash,
        });
      }
    }
  } finally {
    // Clean up SIGINT handler
    process.removeListener('SIGINT', abortHandler);
  }

  return stats;
}
|
|
368
|
+
|
|
369
|
+
module.exports = {
|
|
370
|
+
runEvalLoop,
|
|
371
|
+
// exported for testing / composition
|
|
372
|
+
createEvalWorktree,
|
|
373
|
+
removeEvalWorktree,
|
|
374
|
+
runGuardCheck,
|
|
375
|
+
compareMetric,
|
|
376
|
+
formatSecondaries,
|
|
377
|
+
extractSkillName,
|
|
378
|
+
};
|