deepflow 0.1.102 → 0.1.104
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install-dynamic-hooks.test.js +461 -0
- package/bin/install.js +150 -204
- package/bin/install.test.js +214 -0
- package/bin/lineage-ingest.js +70 -0
- package/hooks/df-check-update.js +1 -0
- package/hooks/df-command-usage.js +305 -0
- package/hooks/df-command-usage.test.js +1019 -0
- package/hooks/df-dashboard-push.js +1 -0
- package/hooks/df-execution-history.js +1 -0
- package/hooks/df-explore-protocol.js +83 -0
- package/hooks/df-explore-protocol.test.js +228 -0
- package/hooks/df-hook-event-tags.test.js +127 -0
- package/hooks/df-invariant-check.js +1 -0
- package/hooks/df-quota-logger.js +1 -0
- package/hooks/df-snapshot-guard.js +1 -0
- package/hooks/df-spec-lint.js +58 -1
- package/hooks/df-spec-lint.test.js +412 -0
- package/hooks/df-statusline.js +1 -0
- package/hooks/df-subagent-registry.js +34 -14
- package/hooks/df-tool-usage.js +21 -3
- package/hooks/df-tool-usage.test.js +200 -0
- package/hooks/df-worktree-guard.js +1 -0
- package/package.json +1 -1
- package/src/commands/df/debate.md +1 -1
- package/src/commands/df/eval.md +117 -0
- package/src/commands/df/execute.md +1 -1
- package/src/commands/df/fix.md +104 -0
- package/src/eval/git-memory.js +159 -0
- package/src/eval/git-memory.test.js +439 -0
- package/src/eval/hypothesis.js +80 -0
- package/src/eval/hypothesis.test.js +169 -0
- package/src/eval/loop.js +378 -0
- package/src/eval/loop.test.js +306 -0
- package/src/eval/metric-collector.js +163 -0
- package/src/eval/metric-collector.test.js +369 -0
- package/src/eval/metric-pivot.js +119 -0
- package/src/eval/metric-pivot.test.js +350 -0
- package/src/eval/mutator-prompt.js +106 -0
- package/src/eval/mutator-prompt.test.js +180 -0
- package/templates/config-template.yaml +5 -0
- package/templates/eval-fixture-template/config.yaml +39 -0
- package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
- package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
- package/templates/eval-fixture-template/fixture/package.json +12 -0
- package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
- package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
- package/templates/eval-fixture-template/fixture/src/config.js +40 -0
- package/templates/eval-fixture-template/fixture/src/index.js +19 -0
- package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
- package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
- package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
- package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
- package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
- package/templates/eval-fixture-template/hypotheses.md +14 -0
- package/templates/eval-fixture-template/spec.md +34 -0
- package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
- package/templates/eval-fixture-template/tests/guard.test.js +108 -0
- package/templates/eval-fixture-template.test.js +318 -0
- package/templates/explore-agent.md +5 -74
- package/templates/explore-protocol.md +44 -0
- package/templates/spec-template.md +4 -0
package/src/eval/loop.js
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Karpathy loop orchestrator for df:eval.
|
|
5
|
+
*
|
|
6
|
+
* Implements the core eval loop: mutate → commit → guard → measure → keep/revert.
|
|
7
|
+
* Worktree-isolated, git-as-memory, single target metric decides.
|
|
8
|
+
*
|
|
9
|
+
* AC-1: Guard failure auto-reverts before metric comparison (status:guard_fail)
|
|
10
|
+
* AC-2: Target improvement keeps; regression reverts (status:kept / status:reverted)
|
|
11
|
+
* AC-3: Secondary metrics in commit message, never decide
|
|
12
|
+
* AC-6: Runs indefinitely until Ctrl+C; --loop N caps at N iterations
|
|
13
|
+
* AC-7: Reverts via git revert (not reset)
|
|
14
|
+
* AC-12: All experiments on worktree-isolated branch
|
|
15
|
+
* AC-13: Commit before verify for clean rollback
|
|
16
|
+
* AC-15: Loop terminates on Ctrl+C or --loop N
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
const fs = require('fs');
|
|
20
|
+
const path = require('path');
|
|
21
|
+
const { execSync } = require('child_process');
|
|
22
|
+
const { buildMutatorPrompt } = require('./mutator-prompt');
|
|
23
|
+
const { collectMetrics } = require('./metric-collector');
|
|
24
|
+
const { commitExperiment, revertExperiment, getExperimentHistory } = require('./git-memory');
|
|
25
|
+
|
|
26
|
+
/**
 * Create a dedicated git worktree and branch for one eval session.
 * AC-12: All experiments on worktree-isolated branch.
 *
 * The branch is named `eval/<skillName>/<timestamp>` and starts at the current
 * HEAD of `repoRoot` (a regular branch sharing history, not an orphan). The
 * worktree is checked out at
 * `<repoRoot>/.deepflow/worktrees/eval-<skillName>-<timestamp>`.
 *
 * @param {string} repoRoot - Root of the main git repo
 * @param {string} skillName - Skill being evaluated (used in branch name)
 * @returns {{ branch: string, worktreePath: string }}
 * @throws {Error} If the `git worktree add` command exits non-zero
 */
function createEvalWorktree(repoRoot, skillName) {
  // Required locally so the shell-free spawn helper is available regardless
  // of what the module header imports from child_process.
  const { execFileSync } = require('child_process');

  const timestamp = Date.now();
  const branch = `eval/${skillName}/${timestamp}`;
  const worktreeBase = path.join(repoRoot, '.deepflow', 'worktrees');

  // Ensure worktree base exists
  fs.mkdirSync(worktreeBase, { recursive: true });

  const worktreePath = path.join(worktreeBase, `eval-${skillName}-${timestamp}`);

  // Create the branch from the current HEAD and check it out in the new
  // worktree. Arguments are passed as an argv array instead of being
  // interpolated into a shell string, so quotes, `$` or backticks in the skill
  // name or repo path cannot break (or inject into) the command.
  execFileSync('git', ['worktree', 'add', '-b', branch, worktreePath, 'HEAD'], {
    cwd: repoRoot,
    stdio: 'pipe',
  });

  return { branch, worktreePath };
}
|
|
52
|
+
|
|
53
|
+
/**
 * Remove an eval worktree (best-effort).
 *
 * Runs `git worktree remove --force` for `worktreePath` from `repoRoot`.
 * Only the worktree is removed; the eval branch is left in place. Any failure
 * (for example the worktree no longer exists) is deliberately swallowed so
 * that cleanup never masks the caller's real result.
 *
 * @param {string} repoRoot - Root of the main git repo (used as cwd)
 * @param {string} worktreePath - Path of the worktree to remove
 * @returns {void}
 */
function removeEvalWorktree(repoRoot, worktreePath) {
  // Required locally so the shell-free spawn helper is available regardless
  // of what the module header imports from child_process.
  const { execFileSync } = require('child_process');

  try {
    // argv form: the path is never parsed by a shell, so special characters
    // in it cannot alter the command.
    execFileSync('git', ['worktree', 'remove', '--force', worktreePath], {
      cwd: repoRoot,
      stdio: 'pipe',
    });
  } catch (_) {
    // best-effort cleanup
  }
}
|
|
69
|
+
|
|
70
|
+
/**
 * Run the guard check (build + test commands from config).
 * AC-1, AC-5: Guard = fixture tests via configured test command.
 *
 * The configured commands are chained with `&&` (build first, then test) and
 * run through the shell in `cwd`. The guard passes only if the whole chain
 * exits with status 0 within the 2 minute timeout.
 *
 * On failure, `output` contains the captured stdout followed by stderr (many
 * test runners report failures on stdout), falling back to the error message
 * when nothing was captured.
 *
 * @param {string} cwd - Working directory to run commands in
 * @param {object} config - Config with build_command / test_command
 * @returns {{ passed: boolean, output: string }}
 */
function runGuardCheck(cwd, config) {
  const commands = [];
  if (config.build_command) commands.push(config.build_command);
  if (config.test_command) commands.push(config.test_command);

  if (commands.length === 0) {
    return { passed: true, output: '(no guard commands configured)' };
  }

  const fullCommand = commands.join(' && ');
  try {
    const output = execSync(fullCommand, {
      cwd,
      stdio: 'pipe',
      timeout: 120_000, // 2 minute timeout for guard
      // Raised above the default so a verbose but passing build/test run is
      // not reported as a guard failure just because it printed a lot.
      maxBuffer: 64 * 1024 * 1024,
    }).toString();
    return { passed: true, output };
  } catch (err) {
    // Keep both streams: stdout usually carries the test report, stderr the
    // crash/diagnostic text. Empty streams are dropped.
    const captured = [err.stdout, err.stderr]
      .map((chunk) => (chunk ? chunk.toString() : ''))
      .filter((text) => text.trim() !== '')
      .join('\n');
    return { passed: false, output: captured || err.message };
  }
}
|
|
99
|
+
|
|
100
|
+
/**
 * Compare a target metric between baseline and current.
 * Returns delta percentage and whether it improved.
 *
 * For metrics where "higher is better" (cache_ratio and any metric not listed
 * below), improvement = current > baseline.
 * For metrics where "lower is better" (total_tokens, wall_time, context_burn),
 * improvement = current < baseline.
 *
 * `delta` is the signed relative change in percent, rounded to two decimals.
 * A relative change from a zero baseline is undefined, so it is reported as
 * +100 / -100 following the direction of the change (0 when both are zero).
 * Equal values are never an improvement.
 *
 * @param {string} metricName
 * @param {number} baseline
 * @param {number} current
 * @returns {{ delta: number, improved: boolean }}
 */
function compareMetric(metricName, baseline, current) {
  // "Lower is better" metrics
  const lowerIsBetter = ['total_tokens', 'wall_time', 'context_burn'];

  let delta;
  if (baseline !== 0) {
    delta = ((current - baseline) / Math.abs(baseline)) * 100;
  } else if (current > 0) {
    // Guard against zero baseline: cap at +100%.
    delta = 100;
  } else if (current < 0) {
    // Same cap, but keep the sign so the delta matches the direction.
    delta = -100;
  } else {
    delta = 0;
  }

  const improved = lowerIsBetter.includes(metricName)
    ? current < baseline
    : current > baseline;

  return { delta: Math.round(delta * 100) / 100, improved };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Render the secondary metrics as a single `name=value name=value` string
 * for inclusion in the experiment commit message.
 * AC-3: secondaries are informational only and never drive keep/revert.
 *
 * The target metric is left out even if it is listed as a secondary, and so
 * is any metric whose value is null or undefined in `metrics`.
 *
 * @param {object} metrics - Full metrics object
 * @param {string} targetMetric - Primary metric name (excluded from secondaries)
 * @param {string[]} secondaryMetrics - List of secondary metric names
 * @returns {string} Space-separated pairs, or '' when there is nothing to show
 */
function formatSecondaries(metrics, targetMetric, secondaryMetrics) {
  const nothingRequested = !secondaryMetrics || secondaryMetrics.length === 0;
  if (nothingRequested) {
    return '';
  }

  const pairs = secondaryMetrics.reduce((collected, name) => {
    if (name !== targetMetric && metrics[name] != null) {
      collected.push(`${name}=${metrics[name]}`);
    }
    return collected;
  }, []);

  return pairs.join(' ');
}
|
|
146
|
+
|
|
147
|
+
/**
 * Extract the skill name from a skill file path.
 * e.g. "skills/atomic-commits/SKILL.md" → "atomic-commits"
 *
 * Both `/` and `\` are accepted as separators on every platform. When the
 * path contains a SKILL.md segment (case-insensitive) with a non-empty parent
 * directory, that directory name is the skill name; otherwise the file name
 * without its extension is used.
 *
 * @param {string} skillPath
 * @returns {string}
 */
function extractSkillName(skillPath) {
  // Normalize once so the SKILL.md lookup and the fallback agree on what a
  // separator is, independent of the host platform.
  const normalized = skillPath.replace(/\\/g, '/');
  const parts = normalized.split('/');

  // Try to find the directory name before SKILL.md
  const skillIdx = parts.findIndex((p) => /^SKILL\.md$/i.test(p));
  if (skillIdx > 0 && parts[skillIdx - 1] !== '') return parts[skillIdx - 1];

  // Fallback: use filename without extension. path.posix is used because the
  // string is already normalized to forward slashes.
  return path.posix.basename(normalized, path.posix.extname(normalized));
}
|
|
162
|
+
|
|
163
|
+
/**
 * Run the Karpathy eval loop.
 *
 * Per iteration: build the mutator prompt → ask `mutateSkill` for a full
 * replacement of the skill file → write and commit it → run the guard →
 * collect metrics → keep or revert based on the single target metric.
 * The loop stops on SIGINT (Ctrl+C) or after `maxIterations` iterations;
 * iterations that end in a mutator error still count towards that cap.
 *
 * The eval worktree/branch created for the session is not removed by this
 * function; the branch name is returned in the stats.
 *
 * @param {object} options
 * @param {string} options.repoRoot - Git repo root
 * @param {string} options.skillPath - Path to skill file (relative to repo root)
 * @param {string} options.benchDir - Path to benchmark directory (accepted for interface compatibility; not read by this function)
 * @param {string} options.target - Primary metric name (e.g. "cache_ratio")
 * @param {string} options.hypothesis - Mutation hypothesis
 * @param {number} [options.maxIterations=Infinity] - --loop N cap (AC-6, AC-15)
 * @param {string[]} [options.secondaryMetrics=[]] - Secondary metric names (AC-3)
 * @param {object} [options.config={}] - Project config (build_command, test_command)
 * @param {Function} options.mutateSkill - Async function that receives prompt and returns new skill content (required)
 * @param {Function} [options.onIteration] - Callback per iteration for logging
 * @returns {Promise<{ iterations: number, kept: number, reverted: number, guardFails: number, branch: string }>}
 * @throws {TypeError} If `options.mutateSkill` is not a function
 */
async function runEvalLoop({
  repoRoot,
  skillPath,
  benchDir,
  target,
  hypothesis,
  maxIterations = Infinity,
  secondaryMetrics = [],
  config = {},
  mutateSkill,
  onIteration,
}) {
  // Fail fast, before any worktree/branch is created. Without a mutator every
  // iteration would end in a swallowed "mutator_error", which with the default
  // maxIterations = Infinity never terminates.
  if (typeof mutateSkill !== 'function') {
    throw new TypeError('runEvalLoop: options.mutateSkill must be a function');
  }

  const skillName = extractSkillName(skillPath);
  const absoluteSkillPath = path.isAbsolute(skillPath)
    ? skillPath
    : path.join(repoRoot, skillPath);

  // AC-12: Create worktree-isolated branch
  const { branch, worktreePath } = createEvalWorktree(repoRoot, skillName);

  // Same repo-relative location of the skill file, but inside the worktree.
  const worktreeSkillPath = path.join(
    worktreePath,
    path.relative(repoRoot, absoluteSkillPath)
  );

  const deepflowDir = path.join(worktreePath, '.deepflow');

  const stats = { iterations: 0, kept: 0, reverted: 0, guardFails: 0, branch };

  // Track abort signal for Ctrl+C (AC-6, AC-15). The flag is only checked
  // between iterations, so the iteration in progress always completes.
  let aborted = false;
  const abortHandler = () => { aborted = true; };
  process.on('SIGINT', abortHandler);

  try {
    // Collect baseline metrics before the loop starts
    let baselineMetrics = await collectMetrics(deepflowDir);

    // AC-6: Loop until Ctrl+C or --loop N reached
    while (!aborted && stats.iterations < maxIterations) {
      stats.iterations++;
      const iterNum = stats.iterations;

      // --- Step 1: Build mutator prompt (T7) ---
      const currentSkillContent = fs.readFileSync(worktreeSkillPath, 'utf8');
      const historyStr = getExperimentHistory({ cwd: worktreePath, skillName });
      const historyEntries = historyStr === '(no experiment history)'
        ? []
        : historyStr.split('\n');

      const prompt = buildMutatorPrompt({
        skillContent: currentSkillContent,
        hypothesis,
        history: historyEntries,
      });

      // --- Step 2: Spawn agent to mutate skill file (full replacement) ---
      let newSkillContent;
      try {
        newSkillContent = await mutateSkill(prompt);
      } catch (err) {
        // Mutator failure — log and continue to next iteration
        if (onIteration) {
          onIteration({ iteration: iterNum, status: 'mutator_error', error: err.message });
        }
        continue;
      }

      // Write mutated skill file
      fs.writeFileSync(worktreeSkillPath, newSkillContent, 'utf8');

      // --- Step 3: Commit experiment BEFORE verify (AC-13) ---
      // Committed with placeholder values; the measured result is recorded by
      // a follow-up commit below.
      const experimentHash = commitExperiment({
        cwd: worktreePath,
        skillName,
        hypothesis,
        target,
        value: 'pending',
        delta: '0',
        status: 'pending',
        secondaries: '',
      });

      // --- Step 4: Run guard check ---
      const guardResult = runGuardCheck(worktreePath, config);

      // --- Step 5: Guard fail → revert, log guard_fail, next iteration (AC-1) ---
      if (!guardResult.passed) {
        revertExperiment({ cwd: worktreePath });
        stats.guardFails++;

        // Log a guard_fail experiment for git-as-memory
        commitExperiment({
          cwd: worktreePath,
          skillName,
          hypothesis,
          target,
          value: 'N/A',
          delta: '0',
          status: 'guard_fail',
          secondaries: '',
        });

        if (onIteration) {
          onIteration({
            iteration: iterNum,
            status: 'guard_fail',
            guardOutput: guardResult.output,
            hash: experimentHash,
          });
        }
        continue;
      }

      // --- Step 6: Collect metrics (T6) (AC-16) ---
      // NOTE(review): the window is a fixed 120 s look-back from "now", not the
      // actual start of this iteration — confirm that collectMetrics cannot
      // pick up data produced by a previous iteration.
      const startTs = Date.now() - 120_000; // approximate window
      const endTs = Date.now();
      const currentMetrics = await collectMetrics(deepflowDir, startTs, endTs);

      // --- Step 7: Compare target metric (AC-2) ---
      // A missing (or otherwise falsy) target value is treated as 0.
      const baselineValue = baselineMetrics[target] || 0;
      const currentValue = currentMetrics[target] || 0;
      const { delta, improved } = compareMetric(target, baselineValue, currentValue);

      // AC-3: Format secondary metrics (never decide)
      const secondariesStr = formatSecondaries(currentMetrics, target, secondaryMetrics);

      let status;
      if (improved) {
        // Target improved → keep (AC-2: status:kept)
        status = 'kept';
        stats.kept++;

        // Update baseline to the new best
        baselineMetrics = currentMetrics;

        // The experiment commit is already in place; record a kept marker
        commitExperiment({
          cwd: worktreePath,
          skillName,
          hypothesis,
          target,
          value: currentValue,
          delta: delta.toString(),
          status: 'kept',
          secondaries: secondariesStr,
        });
      } else {
        // Target regression (or no change) → revert (AC-2: status:reverted, AC-7: git revert)
        status = 'reverted';
        stats.reverted++;

        revertExperiment({ cwd: worktreePath });

        // Record the reverted experiment result
        commitExperiment({
          cwd: worktreePath,
          skillName,
          hypothesis,
          target,
          value: currentValue,
          delta: delta.toString(),
          status: 'reverted',
          secondaries: secondariesStr,
        });
      }

      if (onIteration) {
        onIteration({
          iteration: iterNum,
          status,
          target,
          value: currentValue,
          delta,
          secondaries: secondariesStr,
          hash: experimentHash,
        });
      }
    }
  } finally {
    // Clean up SIGINT handler
    process.removeListener('SIGINT', abortHandler);
  }

  return stats;
}
|
|
368
|
+
|
|
369
|
+
// Public surface of the eval loop module: `runEvalLoop` is the entry point,
// the remaining helpers are exposed so they can be tested and composed.
const loopApi = {
  runEvalLoop,
  // exported for testing / composition
  createEvalWorktree,
  removeEvalWorktree,
  runGuardCheck,
  compareMetric,
  formatSecondaries,
  extractSkillName,
};

module.exports = loopApi;
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { describe, it, before, after } = require('node:test');
|
|
4
|
+
const assert = require('node:assert');
|
|
5
|
+
const fs = require('fs');
|
|
6
|
+
const path = require('path');
|
|
7
|
+
const os = require('os');
|
|
8
|
+
const { execSync } = require('child_process');
|
|
9
|
+
|
|
10
|
+
const {
|
|
11
|
+
compareMetric,
|
|
12
|
+
formatSecondaries,
|
|
13
|
+
extractSkillName,
|
|
14
|
+
createEvalWorktree,
|
|
15
|
+
removeEvalWorktree,
|
|
16
|
+
runGuardCheck,
|
|
17
|
+
} = require('./loop.js');
|
|
18
|
+
|
|
19
|
+
// --- Helper: create a temporary git repo with initial commit ---
|
|
20
|
+
|
|
21
|
+
/**
 * Create a throwaway git repository in the OS temp directory containing a
 * single commit (a README), with a local test identity configured.
 *
 * @returns {string} Path of the new repository
 */
function createTempRepo() {
  const repoPath = fs.mkdtempSync(path.join(os.tmpdir(), 'loop-test-'));
  const inRepo = { cwd: repoPath, stdio: 'pipe' };

  const setupCommands = [
    'git init',
    'git config user.email "test@test.com"',
    'git config user.name "Test"',
  ];
  for (const command of setupCommands) {
    execSync(command, inRepo);
  }

  // Something to commit so HEAD exists for worktree creation.
  fs.writeFileSync(path.join(repoPath, 'README.md'), '# test repo\n');
  execSync('git add -A && git commit -m "initial commit"', inRepo);

  return repoPath;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Delete a temporary repository created for a test.
 *
 * Stale worktree records are pruned first (failures ignored) to avoid git
 * lock issues, then the directory is removed recursively.
 *
 * @param {string} dir - Repository directory to delete
 */
function cleanupRepo(dir) {
  const pruneWorktrees = () => execSync('git worktree prune', { cwd: dir, stdio: 'pipe' });

  // Remove any worktrees first to avoid git lock issues
  try {
    pruneWorktrees();
  } catch (_) {
    /* ignore */
  }

  const removal = { recursive: true, force: true };
  fs.rmSync(dir, removal);
}
|
|
38
|
+
|
|
39
|
+
// --- compareMetric ---
|
|
40
|
+
|
|
41
|
+
describe('compareMetric', () => {
  // Table-driven: each case lists the compareMetric arguments and the result
  // fields to assert, in the order they are checked.
  const suites = [
    {
      title: 'higher-is-better metrics (e.g. cache_ratio)',
      cases: [
        {
          title: 'reports improved when current > baseline',
          args: ['cache_ratio', 50, 75],
          expected: [['improved', true], ['delta', 50]], // (75-50)/50 * 100 = 50%
        },
        {
          title: 'reports not improved when current < baseline',
          args: ['cache_ratio', 80, 60],
          expected: [['improved', false], ['delta', -25]], // (60-80)/80 * 100 = -25%
        },
        {
          title: 'reports not improved when values are equal',
          args: ['cache_ratio', 50, 50],
          expected: [['improved', false], ['delta', 0]],
        },
      ],
    },
    {
      title: 'lower-is-better metrics (total_tokens, wall_time, context_burn)',
      cases: [
        {
          title: 'reports improved when current < baseline for total_tokens',
          args: ['total_tokens', 1000, 800],
          expected: [['improved', true], ['delta', -20]], // (800-1000)/1000 * 100 = -20%
        },
        {
          title: 'reports improved when current < baseline for wall_time',
          args: ['wall_time', 60, 45],
          expected: [['improved', true], ['delta', -25]],
        },
        {
          title: 'reports improved when current < baseline for context_burn',
          args: ['context_burn', 200, 150],
          expected: [['improved', true], ['delta', -25]],
        },
        {
          title: 'reports not improved when current > baseline for total_tokens',
          args: ['total_tokens', 1000, 1200],
          expected: [['improved', false], ['delta', 20]],
        },
        {
          title: 'reports not improved when values are equal for lower-is-better',
          args: ['total_tokens', 500, 500],
          expected: [['improved', false], ['delta', 0]],
        },
      ],
    },
    {
      title: 'zero baseline edge cases',
      cases: [
        {
          title: 'returns delta=0 when both baseline and current are zero',
          args: ['cache_ratio', 0, 0],
          expected: [['delta', 0], ['improved', false]],
        },
        {
          title: 'returns delta=100 when baseline is zero and current is nonzero (higher-is-better)',
          args: ['cache_ratio', 0, 50],
          expected: [['delta', 100], ['improved', true]],
        },
        {
          title: 'returns delta=100 when baseline is zero and current is nonzero (lower-is-better)',
          args: ['total_tokens', 0, 50],
          // For lower-is-better, current(50) > baseline(0) => not improved
          expected: [['delta', 100], ['improved', false]],
        },
      ],
    },
    {
      title: 'delta rounding',
      cases: [
        {
          title: 'rounds delta to two decimal places',
          args: ['cache_ratio', 3, 7],
          expected: [['delta', 133.33]], // (7 - 3) / 3 * 100 = 133.33333...
        },
      ],
    },
  ];

  for (const suite of suites) {
    describe(suite.title, () => {
      for (const testCase of suite.cases) {
        it(testCase.title, () => {
          const outcome = compareMetric(...testCase.args);
          for (const [field, value] of testCase.expected) {
            assert.strictEqual(outcome[field], value);
          }
        });
      }
    });
  }
});
|
|
123
|
+
|
|
124
|
+
// --- formatSecondaries ---
|
|
125
|
+
|
|
126
|
+
describe('formatSecondaries', () => {
  // All cases use cache_ratio as the target metric.
  const TARGET = 'cache_ratio';

  it('formats secondary metric names and values', () => {
    const formatted = formatSecondaries(
      { cache_ratio: 0.85, total_tokens: 1200, wall_time: 45 },
      TARGET,
      ['total_tokens', 'wall_time']
    );
    assert.strictEqual(formatted, 'total_tokens=1200 wall_time=45');
  });

  it('excludes the target metric from secondaries', () => {
    const formatted = formatSecondaries(
      { cache_ratio: 0.85, total_tokens: 1200 },
      TARGET,
      ['cache_ratio', 'total_tokens']
    );
    assert.strictEqual(formatted, 'total_tokens=1200');
  });

  it('returns empty string when secondaryMetrics is empty', () => {
    const formatted = formatSecondaries({ cache_ratio: 0.85 }, TARGET, []);
    assert.strictEqual(formatted, '');
  });

  it('returns empty string when secondaryMetrics is null/undefined', () => {
    const onlyTarget = { cache_ratio: 0.85 };
    for (const missing of [null, undefined]) {
      assert.strictEqual(formatSecondaries(onlyTarget, TARGET, missing), '');
    }
  });

  it('skips metrics not present in metrics object', () => {
    const formatted = formatSecondaries(
      { cache_ratio: 0.85 },
      TARGET,
      ['total_tokens', 'wall_time']
    );
    assert.strictEqual(formatted, '');
  });

  it('includes only metrics present in the metrics object', () => {
    const formatted = formatSecondaries(
      { cache_ratio: 0.85, total_tokens: 1200 },
      TARGET,
      ['total_tokens', 'missing_metric']
    );
    assert.strictEqual(formatted, 'total_tokens=1200');
  });
});
|
|
163
|
+
|
|
164
|
+
// --- extractSkillName ---
|
|
165
|
+
|
|
166
|
+
describe('extractSkillName', () => {
  // [test title, input path, expected skill name]
  const cases = [
    ['extracts skill name from path containing SKILL.md', 'skills/atomic-commits/SKILL.md', 'atomic-commits'],
    ['falls back to filename without extension', 'some/path/my-skill.md', 'my-skill'],
    ['handles Windows-style backslashes', 'skills\\browse-fetch\\SKILL.md', 'browse-fetch'],
  ];

  for (const [title, skillPath, expectedName] of cases) {
    it(title, () => {
      assert.strictEqual(extractSkillName(skillPath), expectedName);
    });
  }
});
|
|
179
|
+
|
|
180
|
+
// --- createEvalWorktree / removeEvalWorktree ---
|
|
181
|
+
|
|
182
|
+
describe('createEvalWorktree', () => {
  // Fresh temporary repository shared by the tests in this suite.
  let scratchRepo;

  before(() => {
    scratchRepo = createTempRepo();
  });

  after(() => {
    cleanupRepo(scratchRepo);
  });

  it('creates a worktree directory and returns branch and path', () => {
    const created = createEvalWorktree(scratchRepo, 'test-skill');
    const branch = created.branch;
    const worktreePath = created.worktreePath;

    const hasExpectedPrefix = branch.startsWith('eval/test-skill/');
    assert.ok(hasExpectedPrefix, `branch should start with eval/test-skill/, got: ${branch}`);
    assert.ok(fs.existsSync(worktreePath), 'worktree directory should exist');

    // A linked worktree carries a .git entry pointing back at the main repo.
    assert.ok(
      fs.existsSync(path.join(worktreePath, '.git')),
      'worktree should have .git file/dir'
    );

    // The checkout must contain the README from the initial commit.
    assert.ok(
      fs.existsSync(path.join(worktreePath, 'README.md')),
      'worktree should contain files from HEAD'
    );

    // Cleanup worktree
    removeEvalWorktree(scratchRepo, worktreePath);
  });
});
|
|
211
|
+
|
|
212
|
+
describe('removeEvalWorktree', () => {
  // Fresh temporary repository shared by the tests in this suite.
  let scratchRepo;

  before(() => {
    scratchRepo = createTempRepo();
  });

  after(() => {
    cleanupRepo(scratchRepo);
  });

  it('removes an existing worktree', () => {
    const created = createEvalWorktree(scratchRepo, 'remove-test');
    const target = created.worktreePath;

    assert.ok(fs.existsSync(target), 'worktree should exist before removal');

    removeEvalWorktree(scratchRepo, target);

    const stillThere = fs.existsSync(target);
    assert.ok(!stillThere, 'worktree directory should be removed');
  });

  it('does not throw when removing a non-existent worktree', () => {
    // Removal is best-effort, so a path git knows nothing about is fine.
    const missing = path.join(os.tmpdir(), 'nonexistent-worktree-12345');
    const attemptRemoval = () => {
      removeEvalWorktree(scratchRepo, missing);
    };
    assert.doesNotThrow(attemptRemoval);
  });
});
|
|
240
|
+
|
|
241
|
+
// --- runGuardCheck ---
|
|
242
|
+
|
|
243
|
+
describe('runGuardCheck', () => {
  // Empty scratch directory used as the working directory for guard commands.
  let workDir;

  // Run the guard in the scratch directory with the given config.
  const guard = (config) => runGuardCheck(workDir, config);

  before(() => {
    workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'guard-test-'));
  });

  after(() => {
    fs.rmSync(workDir, { recursive: true, force: true });
  });

  it('passes when build and test commands succeed', () => {
    const { passed, output } = guard({
      build_command: 'echo "build ok"',
      test_command: 'echo "test ok"',
    });
    assert.strictEqual(passed, true);
    assert.ok(output.includes('test ok'));
  });

  it('fails when build command fails', () => {
    const { passed } = guard({
      build_command: 'exit 1',
      test_command: 'echo "test ok"',
    });
    assert.strictEqual(passed, false);
  });

  it('fails when test command fails', () => {
    const { passed } = guard({
      build_command: 'echo "build ok"',
      test_command: 'exit 1',
    });
    assert.strictEqual(passed, false);
  });

  it('passes with no guard commands configured', () => {
    const { passed, output } = guard({});
    assert.strictEqual(passed, true);
    assert.ok(output.includes('no guard commands configured'));
  });

  it('passes with only build_command configured', () => {
    const { passed } = guard({ build_command: 'echo "build only"' });
    assert.strictEqual(passed, true);
  });

  it('passes with only test_command configured', () => {
    const { passed } = guard({ test_command: 'echo "test only"' });
    assert.strictEqual(passed, true);
  });

  it('returns error output on failure', () => {
    const { passed, output } = guard({
      test_command: 'echo "some error" >&2 && exit 1',
    });
    assert.strictEqual(passed, false);
    assert.ok(output.includes('some error'));
  });
});
|