deepflow 0.1.103 → 0.1.104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/bin/install-dynamic-hooks.test.js +461 -0
  2. package/bin/install.js +150 -250
  3. package/bin/lineage-ingest.js +70 -0
  4. package/hooks/df-check-update.js +1 -0
  5. package/hooks/df-command-usage.js +18 -0
  6. package/hooks/df-dashboard-push.js +1 -0
  7. package/hooks/df-execution-history.js +1 -0
  8. package/hooks/df-explore-protocol.js +83 -0
  9. package/hooks/df-explore-protocol.test.js +228 -0
  10. package/hooks/df-hook-event-tags.test.js +127 -0
  11. package/hooks/df-invariant-check.js +1 -0
  12. package/hooks/df-quota-logger.js +1 -0
  13. package/hooks/df-snapshot-guard.js +1 -0
  14. package/hooks/df-spec-lint.js +58 -1
  15. package/hooks/df-spec-lint.test.js +412 -0
  16. package/hooks/df-statusline.js +1 -0
  17. package/hooks/df-subagent-registry.js +1 -0
  18. package/hooks/df-tool-usage.js +13 -3
  19. package/hooks/df-worktree-guard.js +1 -0
  20. package/package.json +1 -1
  21. package/src/commands/df/debate.md +1 -1
  22. package/src/commands/df/eval.md +117 -0
  23. package/src/commands/df/execute.md +1 -1
  24. package/src/commands/df/fix.md +104 -0
  25. package/src/eval/git-memory.js +159 -0
  26. package/src/eval/git-memory.test.js +439 -0
  27. package/src/eval/hypothesis.js +80 -0
  28. package/src/eval/hypothesis.test.js +169 -0
  29. package/src/eval/loop.js +378 -0
  30. package/src/eval/loop.test.js +306 -0
  31. package/src/eval/metric-collector.js +163 -0
  32. package/src/eval/metric-collector.test.js +369 -0
  33. package/src/eval/metric-pivot.js +119 -0
  34. package/src/eval/metric-pivot.test.js +350 -0
  35. package/src/eval/mutator-prompt.js +106 -0
  36. package/src/eval/mutator-prompt.test.js +180 -0
  37. package/templates/config-template.yaml +5 -0
  38. package/templates/eval-fixture-template/config.yaml +39 -0
  39. package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
  40. package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
  41. package/templates/eval-fixture-template/fixture/package.json +12 -0
  42. package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
  43. package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
  44. package/templates/eval-fixture-template/fixture/src/config.js +40 -0
  45. package/templates/eval-fixture-template/fixture/src/index.js +19 -0
  46. package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
  47. package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
  48. package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
  49. package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
  50. package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
  51. package/templates/eval-fixture-template/hypotheses.md +14 -0
  52. package/templates/eval-fixture-template/spec.md +34 -0
  53. package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
  54. package/templates/eval-fixture-template/tests/guard.test.js +108 -0
  55. package/templates/eval-fixture-template.test.js +318 -0
  56. package/templates/explore-agent.md +5 -74
  57. package/templates/explore-protocol.md +44 -0
  58. package/templates/spec-template.md +4 -0
@@ -0,0 +1,80 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Hypothesis loading for df:eval.
5
+ *
6
+ * AC-11: Loop accepts --hypothesis flag; without it, reads hypotheses.md from
7
+ * benchmark dir and returns the next unused hypothesis.
8
+ */
9
+
10
+ const fs = require('fs');
11
+ const path = require('path');
12
+
13
/**
 * Parse markdown list items from hypotheses.md content.
 *
 * Recognises ordered (`1. ...`) and unordered (`- ...` / `* ...`) list items
 * whose marker starts in the first column. Every other line (headings,
 * paragraphs, indented text) is ignored.
 *
 * @param {string} content - Raw file content
 * @returns {string[]} - Hypothesis strings in file order (trimmed, non-empty)
 */
function parseHypothesesFile(content) {
  // A UTF-8 byte-order mark would sit in front of the first list marker and
  // stop the `^` anchor from matching, silently dropping the first hypothesis.
  const text = content.startsWith('\uFEFF') ? content.slice(1) : content;

  return text
    .split('\n')
    .map((line) => line.match(/^(?:\d+\.|[-*])\s+(.+)/))
    .filter(Boolean)
    .map((m) => m[1].trim())
    // "-   " matches with a whitespace-only capture; drop it after trimming.
    .filter((h) => h.length > 0);
}
28
+
29
/**
 * Load the active hypothesis for an eval session.
 *
 * Resolution order:
 * 1. If `flag` is a non-blank string, return it trimmed.
 * 2. Otherwise read `{benchDir}/hypotheses.md` and return its first list item.
 *
 * "Next unused" is kept simple: the first list item is always returned. This
 * function does not mark items as used.
 *
 * @param {object} opts
 * @param {string} [opts.flag] - Value of --hypothesis CLI flag (may be undefined)
 * @param {string} opts.benchDir - Path to the benchmark directory
 * @returns {string} - The hypothesis string to use
 * @throws {Error} - If the file cannot be read (the underlying fs error is
 *   attached as `cause`) or if it contains no list items
 */
function loadHypothesis({ flag, benchDir }) {
  // 1. CLI flag takes priority
  if (typeof flag === 'string' && flag.trim().length > 0) {
    return flag.trim();
  }

  // 2. Fall back to hypotheses.md
  const hypothesesPath = path.join(benchDir, 'hypotheses.md');

  let content;
  try {
    content = fs.readFileSync(hypothesesPath, 'utf8');
  } catch (err) {
    // Keep the original fs error reachable (code, errno, stack) for callers
    // that need to tell ENOENT from EACCES and similar.
    throw new Error(
      `No --hypothesis flag provided and could not read ${hypothesesPath}: ${err.message}`,
      { cause: err }
    );
  }

  const hypotheses = parseHypothesesFile(content);

  if (hypotheses.length === 0) {
    throw new Error(
      `No hypotheses found in ${hypothesesPath}. Add list items (- ... or 1. ...) to define hypotheses.`
    );
  }

  // Always the first entry; see the note on "next unused" above.
  return hypotheses[0];
}
76
+
77
// Public API of the hypothesis module. Both functions are exported; the
// accompanying test file imports loadHypothesis and parseHypothesesFile.
+ module.exports = {
78
+ loadHypothesis,
79
+ parseHypothesesFile,
80
+ };
@@ -0,0 +1,169 @@
1
+ 'use strict';
2
+
3
+ const { describe, it, before, after } = require('node:test');
4
+ const assert = require('node:assert');
5
+ const fs = require('fs');
6
+ const path = require('path');
7
+ const os = require('os');
8
+
9
+ const { loadHypothesis, parseHypothesesFile } = require('./hypothesis.js');
10
+
11
+ // --- parseHypothesesFile ---
12
+
13
describe('parseHypothesesFile', () => {
  // Each row: [test title, raw file content, expected parsed hypotheses].
  const cases = [
    [
      'parses ordered list items (1. ...)',
      '1. First hypothesis\n2. Second hypothesis\n3. Third one\n',
      ['First hypothesis', 'Second hypothesis', 'Third one'],
    ],
    [
      'parses unordered list items with dashes (- ...)',
      '- Dash one\n- Dash two\n',
      ['Dash one', 'Dash two'],
    ],
    [
      'parses unordered list items with asterisks (* ...)',
      '* Star one\n* Star two\n',
      ['Star one', 'Star two'],
    ],
    [
      'handles mixed ordered and unordered items',
      '1. Ordered first\n- Dash second\n* Star third\n2. Ordered fourth\n',
      ['Ordered first', 'Dash second', 'Star third', 'Ordered fourth'],
    ],
    ['returns empty array for empty content', '', []],
    [
      'returns empty array when content has no list items',
      '# Hypotheses\n\nSome paragraph text.\nAnother line.\n',
      [],
    ],
    [
      'ignores non-list lines interspersed with list items',
      '# Title\n\n1. Real item\nNot a list item\n- Another real item\n',
      ['Real item', 'Another real item'],
    ],
    [
      'trims whitespace from parsed items',
      '1. Lots of spaces \n- Also spaced \n',
      ['Lots of spaces', 'Also spaced'],
    ],
  ];

  // Register one test per row, in the order listed above.
  for (const [title, content, expected] of cases) {
    it(title, () => {
      assert.deepStrictEqual(parseHypothesesFile(content), expected);
    });
  }
});
69
+
70
+ // --- loadHypothesis ---
71
+
72
describe('loadHypothesis', () => {
  // Root scratch directory. Every per-test benchmark dir is created beneath it
  // so the single `after` hook removes them even when an assertion fails
  // part-way through a test.
  let tmpDir;

  before(() => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hypothesis-test-'));
  });

  after(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  });

  /**
   * Create a fresh benchmark directory under tmpDir containing a
   * hypotheses.md file with the given content.
   *
   * @param {string} prefix - mkdtemp prefix for the directory name
   * @param {string} content - Content written to hypotheses.md
   * @returns {string} - Absolute path of the new benchmark directory
   */
  function makeBenchDir(prefix, content) {
    const benchDir = fs.mkdtempSync(path.join(tmpDir, prefix));
    fs.writeFileSync(path.join(benchDir, 'hypotheses.md'), content);
    return benchDir;
  }

  it('returns flag when provided (AC-11)', () => {
    const result = loadHypothesis({ flag: 'my hypothesis', benchDir: tmpDir });
    assert.strictEqual(result, 'my hypothesis');
  });

  it('trims the flag value', () => {
    const result = loadHypothesis({ flag: ' padded ', benchDir: tmpDir });
    assert.strictEqual(result, 'padded');
  });

  it('reads hypotheses.md when no flag is provided', () => {
    const benchDir = makeBenchDir('hyp-read-', '1. First from file\n2. Second from file\n');
    const result = loadHypothesis({ benchDir });
    assert.strictEqual(result, 'First from file');
  });

  it('ignores empty-string flag and falls back to file', () => {
    const benchDir = makeBenchDir('hyp-empty-', '- Fallback hypothesis\n');
    const result = loadHypothesis({ flag: '', benchDir });
    assert.strictEqual(result, 'Fallback hypothesis');
  });

  it('ignores whitespace-only flag and falls back to file', () => {
    const benchDir = makeBenchDir('hyp-ws-', '- WS fallback\n');
    const result = loadHypothesis({ flag: ' ', benchDir });
    assert.strictEqual(result, 'WS fallback');
  });

  it('throws when neither flag nor file available', () => {
    const missingDir = path.join(tmpDir, 'nonexistent');
    assert.throws(
      () => loadHypothesis({ benchDir: missingDir }),
      (err) => {
        assert.ok(err instanceof Error);
        assert.ok(err.message.includes('No --hypothesis flag provided'));
        assert.ok(err.message.includes('hypotheses.md'));
        return true;
      }
    );
  });

  it('throws when file exists but contains no list items', () => {
    const benchDir = makeBenchDir(
      'hyp-nolist-',
      '# Just a heading\n\nSome text but no list items.\n'
    );
    assert.throws(
      () => loadHypothesis({ benchDir }),
      (err) => {
        assert.ok(err instanceof Error);
        assert.ok(err.message.includes('No hypotheses found'));
        return true;
      }
    );
  });

  it('throws when file is empty', () => {
    const benchDir = makeBenchDir('hyp-empty-file-', '');
    assert.throws(
      () => loadHypothesis({ benchDir }),
      (err) => {
        assert.ok(err instanceof Error);
        assert.ok(err.message.includes('No hypotheses found'));
        return true;
      }
    );
  });
});
@@ -0,0 +1,378 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Karpathy loop orchestrator for df:eval.
5
+ *
6
+ * Implements the core eval loop: mutate → commit → guard → measure → keep/revert.
7
+ * Worktree-isolated, git-as-memory, single target metric decides.
8
+ *
9
+ * AC-1: Guard failure auto-reverts before metric comparison (status:guard_fail)
10
+ * AC-2: Target improvement keeps; regression reverts (status:kept / status:reverted)
11
+ * AC-3: Secondary metrics in commit message, never decide
12
+ * AC-6: Runs indefinitely until Ctrl+C; --loop N caps at N iterations
13
+ * AC-7: Reverts via git revert (not reset)
14
+ * AC-12: All experiments on worktree-isolated branch
15
+ * AC-13: Commit before verify for clean rollback
16
+ * AC-15: Loop terminates on Ctrl+C or --loop N
17
+ */
18
+
19
const fs = require('fs');
const path = require('path');
const { execFileSync, execSync } = require('child_process');
const { buildMutatorPrompt } = require('./mutator-prompt');
const { collectMetrics } = require('./metric-collector');
const { commitExperiment, revertExperiment, getExperimentHistory } = require('./git-memory');
25
+
26
/**
 * Create a worktree-isolated branch for the eval session.
 * AC-12: All experiments on worktree-isolated branch.
 *
 * The branch is named `eval/<skillName>/<timestamp>` and starts at the
 * current HEAD. The worktree is checked out at
 * `<repoRoot>/.deepflow/worktrees/eval-<skillName>-<timestamp>`.
 *
 * @param {string} repoRoot - Root of the main git repo
 * @param {string} skillName - Skill being evaluated (used in branch name)
 * @returns {{ branch: string, worktreePath: string }}
 * @throws {Error} - If the directory cannot be created or git fails
 */
function createEvalWorktree(repoRoot, skillName) {
  const timestamp = Date.now();
  const branch = `eval/${skillName}/${timestamp}`;
  const worktreeBase = path.join(repoRoot, '.deepflow', 'worktrees');

  // Ensure worktree base exists
  fs.mkdirSync(worktreeBase, { recursive: true });

  const worktreePath = path.join(worktreeBase, `eval-${skillName}-${timestamp}`);

  // Create a new branch at the current HEAD and check it out in the worktree.
  // Arguments are passed as a vector (no shell) so quotes, `$` or backticks in
  // the repo path or skill name cannot break or inject into the command.
  execFileSync('git', ['worktree', 'add', '-b', branch, worktreePath, 'HEAD'], {
    cwd: repoRoot,
    stdio: 'pipe',
  });

  return { branch, worktreePath };
}
52
+
53
/**
 * Remove an eval worktree (best effort).
 *
 * Runs `git worktree remove --force` for the given path. The branch that was
 * checked out in the worktree is NOT deleted. Any failure (unknown worktree,
 * git unavailable, ...) is swallowed because this is cleanup.
 *
 * @param {string} repoRoot - Root of the main git repo (used as cwd)
 * @param {string} worktreePath - Path of the worktree to remove
 * @returns {void}
 */
function removeEvalWorktree(repoRoot, worktreePath) {
  try {
    // Argument vector instead of a shell string: the path is passed verbatim,
    // whatever characters it contains.
    execFileSync('git', ['worktree', 'remove', '--force', worktreePath], {
      cwd: repoRoot,
      stdio: 'pipe',
    });
  } catch (_) {
    // best-effort cleanup
  }
}
69
+
70
/**
 * Run the guard check (build + test commands from config).
 * AC-1, AC-5: Guard = fixture tests via configured test command.
 *
 * The configured commands are chained with `&&` and run through the shell with
 * a 2 minute timeout. A non-zero exit, a timeout, or a spawn failure all count
 * as a failed guard.
 *
 * @param {string} cwd - Working directory to run commands in
 * @param {object} config - Config with build_command / test_command
 * @returns {{ passed: boolean, output: string }} - On success `output` is the
 *   command's stdout. On failure it is the captured stdout and stderr, plus
 *   the error message when nothing was captured or the process did not exit
 *   normally.
 */
function runGuardCheck(cwd, config) {
  const commands = [config.build_command, config.test_command].filter(Boolean);

  if (commands.length === 0) {
    return { passed: true, output: '(no guard commands configured)' };
  }

  try {
    const output = execSync(commands.join(' && '), {
      cwd,
      stdio: 'pipe',
      encoding: 'utf8',
      timeout: 120_000, // 2 minute timeout for guard
      // The default buffer (1 MiB) makes a verbose but passing test run fail
      // with ENOBUFS, which would be misreported as a guard failure.
      maxBuffer: 64 * 1024 * 1024,
    });
    return { passed: true, output };
  } catch (err) {
    // Test runners commonly report failures on stdout, so keep both streams.
    const captured = [err.stdout, err.stderr]
      .map((chunk) => (chunk == null ? '' : String(chunk).trim()))
      .filter((chunk) => chunk.length > 0);

    // No exit status means the process was killed (e.g. timeout) or never
    // started; only the error message explains that.
    if (captured.length === 0 || err.status == null) {
      captured.push(err.message);
    }

    return { passed: false, output: captured.join('\n') };
  }
}
99
+
100
/**
 * Compare a target metric between baseline and current.
 * Returns delta percentage and whether it improved.
 *
 * For metrics where "lower is better" (total_tokens, wall_time, context_burn),
 * improvement = current < baseline. Every other metric name is treated as
 * "higher is better" (e.g. cache_ratio): improvement = current > baseline.
 * Equal values never count as an improvement.
 *
 * @param {string} metricName
 * @param {number} baseline
 * @param {number} current
 * @returns {{ delta: number, improved: boolean }} - `delta` is the percentage
 *   change relative to |baseline|, rounded to two decimals
 */
function compareMetric(metricName, baseline, current) {
  let delta;
  if (baseline !== 0) {
    delta = ((current - baseline) / Math.abs(baseline)) * 100;
  } else if (current === 0) {
    delta = 0;
  } else {
    // A relative change from zero is undefined; report a full-scale change
    // whose sign follows the direction of the move.
    delta = current < 0 ? -100 : 100;
  }

  // "Lower is better" metrics
  const lowerIsBetter = ['total_tokens', 'wall_time', 'context_burn'];

  const improved = lowerIsBetter.includes(metricName)
    ? current < baseline
    : current > baseline;

  return { delta: Math.round(delta * 100) / 100, improved };
}
128
+
129
/**
 * Render secondary metrics as space-separated `name=value` pairs for the
 * commit message.
 * AC-3: Secondary metrics in commit message but never trigger keep/revert.
 *
 * The target metric and any metric whose value is null/undefined are skipped.
 * The order of `secondaryMetrics` is preserved.
 *
 * @param {object} metrics - Full metrics object
 * @param {string} targetMetric - Primary metric name (excluded from secondaries)
 * @param {string[]} secondaryMetrics - List of secondary metric names
 * @returns {string} - Formatted pairs, or '' when there is nothing to report
 */
function formatSecondaries(metrics, targetMetric, secondaryMetrics) {
  const nothingRequested = !secondaryMetrics || secondaryMetrics.length === 0;
  if (nothingRequested) {
    return '';
  }

  const pairs = secondaryMetrics.reduce((acc, name) => {
    const isReportable = name !== targetMetric && metrics[name] != null;
    if (isReportable) {
      acc.push(`${name}=${metrics[name]}`);
    }
    return acc;
  }, []);

  return pairs.join(' ');
}
146
+
147
/**
 * Extract the skill name from a skill file path.
 * e.g. "skills/atomic-commits/SKILL.md" → "atomic-commits"
 *
 * Both `/` and `\` are accepted as separators regardless of platform. When the
 * path has no SKILL.md segment with a non-empty parent directory, the file
 * name without its extension is used instead.
 *
 * @param {string} skillPath
 * @returns {string}
 */
function extractSkillName(skillPath) {
  const normalized = skillPath.replace(/\\/g, '/');
  const parts = normalized.split('/');

  // Try to find the directory name before SKILL.md
  const skillIdx = parts.findIndex((p) => /^SKILL\.md$/i.test(p));
  // An empty parent segment ("/SKILL.md", "a//SKILL.md") is not a usable name.
  if (skillIdx > 0 && parts[skillIdx - 1] !== '') return parts[skillIdx - 1];

  // Fallback: file name without extension. Use the normalised path so that
  // backslash-separated input is split correctly on POSIX hosts too.
  return path.basename(normalized, path.extname(normalized));
}
162
+
163
/**
 * Run the Karpathy eval loop.
 *
 * Each iteration: build the mutator prompt, obtain a full replacement of the
 * skill file from `mutateSkill`, commit it, run the guard, collect metrics,
 * then keep or revert based on the target metric alone. Outcomes are recorded
 * as follow-up experiment commits in the eval worktree. The worktree and its
 * branch are left in place when the loop ends.
 *
 * @param {object} options
 * @param {string} options.repoRoot - Git repo root
 * @param {string} options.skillPath - Path to skill file (absolute, or relative to repo root)
 * @param {string} options.benchDir - Path to benchmark directory (not read by this function)
 * @param {string} options.target - Primary metric name (e.g. "cache_ratio")
 * @param {string} options.hypothesis - Mutation hypothesis
 * @param {number} [options.maxIterations=Infinity] - --loop N cap (AC-6, AC-15)
 * @param {string[]} [options.secondaryMetrics=[]] - Secondary metric names (AC-3)
 * @param {object} [options.config={}] - Project config (build_command, test_command)
 * @param {Function} options.mutateSkill - Function (may be async) that receives the prompt and returns new skill content
 * @param {Function} [options.onIteration] - Callback per iteration for logging
 * @returns {Promise<{ iterations: number, kept: number, reverted: number, guardFails: number, branch: string }>}
 * @throws {TypeError} - If `mutateSkill` is not a function (checked before any worktree is created)
 */
async function runEvalLoop({
  repoRoot,
  skillPath,
  benchDir,
  target,
  hypothesis,
  maxIterations = Infinity,
  secondaryMetrics = [],
  config = {},
  mutateSkill,
  onIteration,
}) {
  // Fail fast: without a mutator every iteration would only log
  // `mutator_error`, after a worktree and branch had already been created.
  if (typeof mutateSkill !== 'function') {
    throw new TypeError('runEvalLoop requires options.mutateSkill to be a function');
  }

  const skillName = extractSkillName(skillPath);
  const absoluteSkillPath = path.isAbsolute(skillPath)
    ? skillPath
    : path.join(repoRoot, skillPath);

  // AC-12: Create worktree-isolated branch
  const { branch, worktreePath } = createEvalWorktree(repoRoot, skillName);

  // Same file, addressed inside the worktree checkout.
  const worktreeSkillPath = path.join(
    worktreePath,
    path.relative(repoRoot, absoluteSkillPath)
  );

  const deepflowDir = path.join(worktreePath, '.deepflow');

  const stats = { iterations: 0, kept: 0, reverted: 0, guardFails: 0, branch };

  // Every experiment record shares these fields; callers supply the outcome.
  const recordExperiment = (outcome) =>
    commitExperiment({ cwd: worktreePath, skillName, hypothesis, target, ...outcome });

  // Signal handlers only run when control returns to the event loop. Awaiting
  // an already-settled promise does not do that, so yield explicitly.
  const yieldToEventLoop = () => new Promise((resolve) => { setImmediate(resolve); });

  // Track abort signal for Ctrl+C (AC-6, AC-15)
  let aborted = false;
  const abortHandler = () => { aborted = true; };
  process.on('SIGINT', abortHandler);

  try {
    // Collect baseline metrics before the loop starts
    let baselineMetrics = await collectMetrics(deepflowDir);

    // AC-6: Loop until Ctrl+C or --loop N reached
    while (stats.iterations < maxIterations) {
      // Give a pending SIGINT the chance to be delivered before starting the
      // next iteration. Otherwise a mutator that keeps failing immediately
      // would spin without Ctrl+C ever being observed.
      await yieldToEventLoop();
      if (aborted) break;

      stats.iterations++;
      const iterNum = stats.iterations;

      // --- Step 1: Build mutator prompt (T7) ---
      const currentSkillContent = fs.readFileSync(worktreeSkillPath, 'utf8');
      const historyStr = getExperimentHistory({ cwd: worktreePath, skillName });
      const historyEntries = historyStr === '(no experiment history)'
        ? []
        : historyStr.split('\n');

      const prompt = buildMutatorPrompt({
        skillContent: currentSkillContent,
        hypothesis,
        history: historyEntries,
      });

      // --- Step 2: Spawn agent to mutate skill file (full replacement) ---
      let newSkillContent;
      try {
        newSkillContent = await mutateSkill(prompt);
      } catch (err) {
        // Mutator failure — log and continue to next iteration
        if (onIteration) {
          onIteration({ iteration: iterNum, status: 'mutator_error', error: err.message });
        }
        continue;
      }

      // Write mutated skill file
      fs.writeFileSync(worktreeSkillPath, newSkillContent, 'utf8');

      // --- Step 3: Commit experiment BEFORE verify (AC-13) ---
      // Placeholder values; the real outcome is recorded in a follow-up commit.
      const experimentHash = recordExperiment({
        value: 'pending',
        delta: '0',
        status: 'pending',
        secondaries: '',
      });

      // --- Step 4: Run guard check ---
      const guardResult = runGuardCheck(worktreePath, config);

      // --- Step 5: Guard fail → revert, log guard_fail, next iteration (AC-1) ---
      if (!guardResult.passed) {
        revertExperiment({ cwd: worktreePath });
        stats.guardFails++;

        // Log a guard_fail experiment for git-as-memory
        recordExperiment({
          value: 'N/A',
          delta: '0',
          status: 'guard_fail',
          secondaries: '',
        });

        if (onIteration) {
          onIteration({
            iteration: iterNum,
            status: 'guard_fail',
            guardOutput: guardResult.output,
            hash: experimentHash,
          });
        }
        continue;
      }

      // --- Step 6: Collect metrics (T6) (AC-16) ---
      const startTs = Date.now() - 120_000; // approximate window
      const endTs = Date.now();
      const currentMetrics = await collectMetrics(deepflowDir, startTs, endTs);

      // --- Step 7: Compare target metric (AC-2) ---
      // NOTE(review): a missing or falsy target metric is coerced to 0 here.
      // For a lower-is-better target that 0 compares as an improvement.
      // Confirm collectMetrics always reports the target.
      const baselineValue = baselineMetrics[target] || 0;
      const currentValue = currentMetrics[target] || 0;
      const { delta, improved } = compareMetric(target, baselineValue, currentValue);

      // AC-3: Format secondary metrics (never decide)
      const secondariesStr = formatSecondaries(currentMetrics, target, secondaryMetrics);

      let status;
      if (improved) {
        // Target improved → keep (AC-2: status:kept)
        status = 'kept';
        stats.kept++;

        // Update baseline to the new best
        baselineMetrics = currentMetrics;

        // The experiment commit is already in place; record a kept marker
        recordExperiment({
          value: currentValue,
          delta: delta.toString(),
          status: 'kept',
          secondaries: secondariesStr,
        });
      } else {
        // Target regression → revert (AC-2: status:reverted, AC-7: git revert)
        status = 'reverted';
        stats.reverted++;

        revertExperiment({ cwd: worktreePath });

        // Record the reverted experiment result
        recordExperiment({
          value: currentValue,
          delta: delta.toString(),
          status: 'reverted',
          secondaries: secondariesStr,
        });
      }

      if (onIteration) {
        onIteration({
          iteration: iterNum,
          status,
          target,
          value: currentValue,
          delta,
          secondaries: secondariesStr,
          hash: experimentHash,
        });
      }
    }
  } finally {
    // Clean up SIGINT handler
    process.removeListener('SIGINT', abortHandler);
  }

  return stats;
}
368
+
369
// Module exports: runEvalLoop is the loop entry point; the remaining helpers
// are exposed alongside it.
+ module.exports = {
370
+ runEvalLoop,
371
+ // exported for testing / composition
372
+ createEvalWorktree,
373
+ removeEvalWorktree,
374
+ runGuardCheck,
375
+ compareMetric,
376
+ formatSecondaries,
377
+ extractSkillName,
378
+ };