deepflow 0.1.102 → 0.1.104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/bin/install-dynamic-hooks.test.js +461 -0
  2. package/bin/install.js +150 -204
  3. package/bin/install.test.js +214 -0
  4. package/bin/lineage-ingest.js +70 -0
  5. package/hooks/df-check-update.js +1 -0
  6. package/hooks/df-command-usage.js +305 -0
  7. package/hooks/df-command-usage.test.js +1019 -0
  8. package/hooks/df-dashboard-push.js +1 -0
  9. package/hooks/df-execution-history.js +1 -0
  10. package/hooks/df-explore-protocol.js +83 -0
  11. package/hooks/df-explore-protocol.test.js +228 -0
  12. package/hooks/df-hook-event-tags.test.js +127 -0
  13. package/hooks/df-invariant-check.js +1 -0
  14. package/hooks/df-quota-logger.js +1 -0
  15. package/hooks/df-snapshot-guard.js +1 -0
  16. package/hooks/df-spec-lint.js +58 -1
  17. package/hooks/df-spec-lint.test.js +412 -0
  18. package/hooks/df-statusline.js +1 -0
  19. package/hooks/df-subagent-registry.js +34 -14
  20. package/hooks/df-tool-usage.js +21 -3
  21. package/hooks/df-tool-usage.test.js +200 -0
  22. package/hooks/df-worktree-guard.js +1 -0
  23. package/package.json +1 -1
  24. package/src/commands/df/debate.md +1 -1
  25. package/src/commands/df/eval.md +117 -0
  26. package/src/commands/df/execute.md +1 -1
  27. package/src/commands/df/fix.md +104 -0
  28. package/src/eval/git-memory.js +159 -0
  29. package/src/eval/git-memory.test.js +439 -0
  30. package/src/eval/hypothesis.js +80 -0
  31. package/src/eval/hypothesis.test.js +169 -0
  32. package/src/eval/loop.js +378 -0
  33. package/src/eval/loop.test.js +306 -0
  34. package/src/eval/metric-collector.js +163 -0
  35. package/src/eval/metric-collector.test.js +369 -0
  36. package/src/eval/metric-pivot.js +119 -0
  37. package/src/eval/metric-pivot.test.js +350 -0
  38. package/src/eval/mutator-prompt.js +106 -0
  39. package/src/eval/mutator-prompt.test.js +180 -0
  40. package/templates/config-template.yaml +5 -0
  41. package/templates/eval-fixture-template/config.yaml +39 -0
  42. package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
  43. package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
  44. package/templates/eval-fixture-template/fixture/package.json +12 -0
  45. package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
  46. package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
  47. package/templates/eval-fixture-template/fixture/src/config.js +40 -0
  48. package/templates/eval-fixture-template/fixture/src/index.js +19 -0
  49. package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
  50. package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
  51. package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
  52. package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
  53. package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
  54. package/templates/eval-fixture-template/hypotheses.md +14 -0
  55. package/templates/eval-fixture-template/spec.md +34 -0
  56. package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
  57. package/templates/eval-fixture-template/tests/guard.test.js +108 -0
  58. package/templates/eval-fixture-template.test.js +318 -0
  59. package/templates/explore-agent.md +5 -74
  60. package/templates/explore-protocol.md +44 -0
  61. package/templates/spec-template.md +4 -0
@@ -0,0 +1,378 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Karpathy loop orchestrator for df:eval.
5
+ *
6
+ * Implements the core eval loop: mutate → commit → guard → measure → keep/revert.
7
+ * Worktree-isolated, git-as-memory, single target metric decides.
8
+ *
9
+ * AC-1: Guard failure auto-reverts before metric comparison (status:guard_fail)
10
+ * AC-2: Target improvement keeps; regression reverts (status:kept / status:reverted)
11
+ * AC-3: Secondary metrics in commit message, never decide
12
+ * AC-6: Runs indefinitely until Ctrl+C; --loop N caps at N iterations
13
+ * AC-7: Reverts via git revert (not reset)
14
+ * AC-12: All experiments on worktree-isolated branch
15
+ * AC-13: Commit before verify for clean rollback
16
+ * AC-15: Loop terminates on Ctrl+C or --loop N
17
+ */
18
+
const fs = require('fs');
const path = require('path');
const { execFileSync, execSync } = require('child_process');
const { buildMutatorPrompt } = require('./mutator-prompt');
const { collectMetrics } = require('./metric-collector');
const { commitExperiment, revertExperiment, getExperimentHistory } = require('./git-memory');
25
+
/**
 * Create a dedicated branch and git worktree for one eval session.
 * AC-12: All experiments on worktree-isolated branch.
 *
 * The branch is named `eval/<skillName>/<timestamp>` and starts at the repo's
 * current HEAD (an ordinary branch, not an orphan). The worktree is checked
 * out under `<repoRoot>/.deepflow/worktrees/eval-<skillName>-<timestamp>`.
 *
 * git is invoked with an argument vector instead of a shell command string,
 * so a repo path or skill name containing quotes, `$`, backticks or spaces is
 * passed to git literally rather than being interpreted by a shell.
 *
 * @param {string} repoRoot - Root of the main git repo
 * @param {string} skillName - Skill being evaluated (used in branch name)
 * @returns {{ branch: string, worktreePath: string }}
 * @throws {Error} When `git worktree add` exits non-zero or git cannot be run
 */
function createEvalWorktree(repoRoot, skillName) {
  const timestamp = Date.now();
  const branch = `eval/${skillName}/${timestamp}`;
  const worktreeBase = path.join(repoRoot, '.deepflow', 'worktrees');

  // Ensure worktree base exists
  fs.mkdirSync(worktreeBase, { recursive: true });

  const worktreePath = path.join(worktreeBase, `eval-${skillName}-${timestamp}`);

  // New branch from the current HEAD, checked out into its own worktree.
  execFileSync('git', ['worktree', 'add', '-b', branch, worktreePath, 'HEAD'], {
    cwd: repoRoot,
    stdio: 'pipe',
  });

  return { branch, worktreePath };
}
52
+
/**
 * Best-effort removal of an eval worktree.
 *
 * Runs `git worktree remove --force` for the given path. Only the worktree is
 * removed; the eval branch created alongside it is left untouched. Every
 * failure (unknown worktree, git unavailable, ...) is deliberately ignored:
 * this is cleanup code and must never mask the caller's real outcome.
 *
 * git is invoked with an argument vector so that paths containing quotes or
 * other shell metacharacters are handled literally.
 *
 * @param {string} repoRoot - Root of the main git repo
 * @param {string} worktreePath - Path of the worktree to remove
 * @returns {void}
 */
function removeEvalWorktree(repoRoot, worktreePath) {
  try {
    execFileSync('git', ['worktree', 'remove', '--force', worktreePath], {
      cwd: repoRoot,
      stdio: 'pipe',
    });
  } catch (_) {
    // best-effort cleanup
  }
}
69
+
/**
 * Run the guard check (build + test commands from config).
 * AC-1, AC-5: Guard = fixture tests via configured test command.
 *
 * The configured commands are chained with `&&` and run through the shell in
 * `cwd`, so the test command only runs when the build command succeeded.
 * With no commands configured the guard trivially passes.
 *
 * On failure the returned output contains the captured stderr followed by the
 * captured stdout (test runners commonly report failures on stdout); when
 * nothing was captured it falls back to the error message (e.g. on timeout).
 *
 * @param {string} cwd - Working directory to run commands in
 * @param {object} [config] - Config with build_command / test_command
 * @returns {{ passed: boolean, output: string }}
 */
function runGuardCheck(cwd, config) {
  const { build_command: buildCommand, test_command: testCommand } = config ?? {};
  const commands = [buildCommand, testCommand].filter(Boolean);

  if (commands.length === 0) {
    return { passed: true, output: '(no guard commands configured)' };
  }

  try {
    const output = execSync(commands.join(' && '), {
      cwd,
      stdio: 'pipe',
      timeout: 120_000, // 2 minute timeout for guard
      // The 1 MiB default would turn a verbose but passing run into a failure.
      maxBuffer: 64 * 1024 * 1024,
    }).toString();
    return { passed: true, output };
  } catch (err) {
    const captured = [err.stderr, err.stdout]
      .map((chunk) => (chunk ? chunk.toString() : ''))
      .filter((text) => text.length > 0)
      .join('\n');
    return { passed: false, output: captured || err.message };
  }
}
99
+
/**
 * Compare a target metric between baseline and current.
 * Returns the delta as a percentage of the baseline (rounded to two decimals)
 * and whether the metric improved.
 *
 * For metrics where "higher is better" (cache_ratio), improvement = current > baseline.
 * For metrics where "lower is better" (total_tokens, wall_time, context_burn),
 * improvement = current < baseline. Equal values never count as an improvement.
 *
 * A zero baseline has no meaningful percentage, so the delta is pinned to
 * 0 (no change), +100 (current above zero) or -100 (current below zero).
 *
 * @param {string} metricName
 * @param {number} baseline
 * @param {number} current
 * @returns {{ delta: number, improved: boolean }}
 */
function compareMetric(metricName, baseline, current) {
  let delta;
  if (baseline !== 0) {
    delta = ((current - baseline) / Math.abs(baseline)) * 100;
  } else if (current === 0) {
    delta = 0;
  } else {
    // Keep the sign of the change so a drop below zero is not reported as +100%.
    delta = current < 0 ? -100 : 100;
  }

  // "Lower is better" metrics
  const lowerIsBetter = ['total_tokens', 'wall_time', 'context_burn'];

  const improved = lowerIsBetter.includes(metricName)
    ? current < baseline
    : current > baseline;

  return { delta: Math.round(delta * 100) / 100, improved };
}
128
+
/**
 * Format secondary metrics for the commit message.
 * AC-3: Secondary metrics in commit message but never trigger keep/revert.
 *
 * Produces space-separated `name=value` pairs in the order given by
 * `secondaryMetrics`. The target metric is skipped, as is any metric that is
 * not an own property of `metrics` or whose value is null/undefined. The
 * own-property check keeps names like "toString" from picking up inherited
 * members.
 *
 * @param {object} metrics - Full metrics object
 * @param {string} targetMetric - Primary metric name (excluded from secondaries)
 * @param {string[]} secondaryMetrics - List of secondary metric names
 * @returns {string} Formatted pairs, or '' when there is nothing to report
 */
function formatSecondaries(metrics, targetMetric, secondaryMetrics) {
  if (!metrics || !secondaryMetrics || secondaryMetrics.length === 0) return '';

  const isReported = (name) =>
    name !== targetMetric &&
    Object.prototype.hasOwnProperty.call(metrics, name) &&
    metrics[name] != null;

  return secondaryMetrics
    .filter(isReported)
    .map((name) => `${name}=${metrics[name]}`)
    .join(' ');
}
146
+
/**
 * Extract the skill name from a skill file path.
 * e.g. "skills/atomic-commits/SKILL.md" → "atomic-commits"
 *
 * Backslashes are treated as path separators on every platform. Empty, "."
 * and ".." segments are ignored so they are never returned as a skill name.
 * When the path has no directory in front of a SKILL.md segment, the file
 * name without its extension is used instead.
 *
 * @param {string} skillPath
 * @returns {string}
 */
function extractSkillName(skillPath) {
  const slashed = skillPath.replace(/\\/g, '/');
  const parts = path.posix
    .normalize(slashed)
    .split('/')
    .filter((segment) => segment !== '' && segment !== '.' && segment !== '..');

  // Try to find the directory name before SKILL.md
  const skillIdx = parts.findIndex((p) => /^SKILL\.md$/i.test(p));
  if (skillIdx > 0) return parts[skillIdx - 1];

  // Fallback: filename without extension. Uses the slash-normalized path so
  // Windows-style input is split correctly on POSIX hosts as well.
  return path.posix.basename(slashed, path.posix.extname(slashed));
}
162
+
/**
 * Run the Karpathy eval loop: mutate → commit → guard → measure → keep/revert.
 *
 * Each iteration reads the skill file from the eval worktree, asks
 * `mutateSkill` for a full replacement, writes and commits it (AC-13), runs
 * the guard (AC-1) and then keeps or reverts the commit based solely on the
 * target metric (AC-2); secondary metrics are only recorded (AC-3). The loop
 * ends on SIGINT or after `maxIterations` iterations (AC-6, AC-15). The eval
 * worktree and branch are left in place when the function returns.
 *
 * @param {object} options
 * @param {string} options.repoRoot - Git repo root
 * @param {string} options.skillPath - Path to skill file (relative to repo root)
 * @param {string} options.benchDir - Path to benchmark directory (accepted for
 *   interface compatibility; not read by this function)
 * @param {string} options.target - Primary metric name (e.g. "cache_ratio")
 * @param {string} options.hypothesis - Mutation hypothesis
 * @param {number} [options.maxIterations=Infinity] - --loop N cap (AC-6, AC-15)
 * @param {string[]} [options.secondaryMetrics=[]] - Secondary metric names (AC-3)
 * @param {object} [options.config={}] - Project config (build_command, test_command)
 * @param {Function} options.mutateSkill - Async function that receives the
 *   prompt and returns the new skill content (string, Buffer or typed array)
 * @param {Function} [options.onIteration] - Callback per iteration for logging
 * @returns {Promise<{ iterations: number, kept: number, reverted: number, guardFails: number, branch: string }>}
 * @throws {TypeError} (as a rejection) when `mutateSkill` is not a function
 */
async function runEvalLoop({
  repoRoot,
  skillPath,
  benchDir,
  target,
  hypothesis,
  maxIterations = Infinity,
  secondaryMetrics = [],
  config = {},
  mutateSkill,
  onIteration,
}) {
  // Fail fast, before a worktree is created: without a mutator every
  // iteration would fail synchronously and the loop would spin without ever
  // returning to the event loop, so even Ctrl+C could not stop it.
  if (typeof mutateSkill !== 'function') {
    throw new TypeError('runEvalLoop: options.mutateSkill must be a function');
  }

  const skillName = extractSkillName(skillPath);
  const absoluteSkillPath = path.isAbsolute(skillPath)
    ? skillPath
    : path.join(repoRoot, skillPath);

  // AC-12: Create worktree-isolated branch
  const { branch, worktreePath } = createEvalWorktree(repoRoot, skillName);

  const worktreeSkillPath = path.join(
    worktreePath,
    path.relative(repoRoot, absoluteSkillPath)
  );

  const deepflowDir = path.join(worktreePath, '.deepflow');

  const stats = { iterations: 0, kept: 0, reverted: 0, guardFails: 0, branch };

  // Commit an experiment record on the eval branch (git-as-memory).
  const recordExperiment = ({ value, delta, status, secondaries }) =>
    commitExperiment({
      cwd: worktreePath,
      skillName,
      hypothesis,
      target,
      value,
      delta,
      status,
      secondaries,
    });

  // Forward an iteration event to the optional logging callback.
  const report = (event) => {
    if (onIteration) onIteration(event);
  };

  // Track abort signal for Ctrl+C (AC-6, AC-15)
  let aborted = false;
  const abortHandler = () => { aborted = true; };
  process.on('SIGINT', abortHandler);

  try {
    // Collect baseline metrics before the loop starts
    let baselineMetrics = await collectMetrics(deepflowDir);

    // AC-6: Loop until Ctrl+C or --loop N reached
    while (!aborted && stats.iterations < maxIterations) {
      stats.iterations++;
      const iterNum = stats.iterations;

      // --- Step 1: Build mutator prompt (T7) ---
      const currentSkillContent = fs.readFileSync(worktreeSkillPath, 'utf8');
      const historyStr = getExperimentHistory({ cwd: worktreePath, skillName });
      const historyEntries = historyStr === '(no experiment history)'
        ? []
        : historyStr.split('\n');

      const prompt = buildMutatorPrompt({
        skillContent: currentSkillContent,
        hypothesis,
        history: historyEntries,
      });

      // --- Step 2: Spawn agent to mutate skill file (full replacement) ---
      let newSkillContent;
      try {
        newSkillContent = await mutateSkill(prompt);
        // Anything writeFileSync cannot write is a mutator failure, not a
        // reason to abort the whole session.
        if (typeof newSkillContent !== 'string' && !ArrayBuffer.isView(newSkillContent)) {
          throw new TypeError(
            `mutateSkill must resolve to skill content, got ${typeof newSkillContent}`
          );
        }
      } catch (err) {
        // Mutator failure — log and continue to next iteration
        report({ iteration: iterNum, status: 'mutator_error', error: err.message });
        // Let the event loop turn so a pending SIGINT is seen even when the
        // mutator keeps failing immediately.
        await new Promise((resolve) => setImmediate(resolve));
        continue;
      }

      // Write mutated skill file
      fs.writeFileSync(worktreeSkillPath, newSkillContent, 'utf8');

      // --- Step 3: Commit experiment BEFORE verify (AC-13) ---
      // Placeholder values; the outcome is recorded in a follow-up commit.
      const experimentHash = recordExperiment({
        value: 'pending',
        delta: '0',
        status: 'pending',
        secondaries: '',
      });

      // --- Step 4: Run guard check ---
      const guardResult = runGuardCheck(worktreePath, config);

      // --- Step 5: Guard fail → revert, log guard_fail, next iteration (AC-1) ---
      if (!guardResult.passed) {
        revertExperiment({ cwd: worktreePath });
        stats.guardFails++;

        // The experiment commit has already been reverted, so its message
        // cannot be amended; record the guard_fail outcome as its own entry
        // for git-as-memory.
        recordExperiment({
          value: 'N/A',
          delta: '0',
          status: 'guard_fail',
          secondaries: '',
        });

        report({
          iteration: iterNum,
          status: 'guard_fail',
          guardOutput: guardResult.output,
          hash: experimentHash,
        });
        continue;
      }

      // --- Step 6: Collect metrics (T6) (AC-16) ---
      const startTs = Date.now() - 120_000; // approximate window
      const endTs = Date.now();
      const currentMetrics = await collectMetrics(deepflowDir, startTs, endTs);

      // --- Step 7: Compare target metric (AC-2) ---
      const baselineValue = baselineMetrics[target] || 0;
      const currentValue = currentMetrics[target] || 0;
      const { delta, improved } = compareMetric(target, baselineValue, currentValue);

      // AC-3: Format secondary metrics (never decide)
      const secondariesStr = formatSecondaries(currentMetrics, target, secondaryMetrics);

      let status;
      if (improved) {
        // Target improved → keep (AC-2: status:kept)
        status = 'kept';
        stats.kept++;

        // Update baseline to the new best
        baselineMetrics = currentMetrics;
      } else {
        // Target regression → revert (AC-2: status:reverted, AC-7: git revert)
        status = 'reverted';
        stats.reverted++;

        revertExperiment({ cwd: worktreePath });
      }

      // Record the measured outcome (kept marker or reverted result).
      recordExperiment({
        value: currentValue,
        delta: delta.toString(),
        status,
        secondaries: secondariesStr,
      });

      report({
        iteration: iterNum,
        status,
        target,
        value: currentValue,
        delta,
        secondaries: secondariesStr,
        hash: experimentHash,
      });
    }
  } finally {
    // Clean up SIGINT handler
    process.removeListener('SIGINT', abortHandler);
  }

  return stats;
}
368
+
369
+ module.exports = {
370
+ runEvalLoop,
371
+ // exported for testing / composition
372
+ createEvalWorktree,
373
+ removeEvalWorktree,
374
+ runGuardCheck,
375
+ compareMetric,
376
+ formatSecondaries,
377
+ extractSkillName,
378
+ };
@@ -0,0 +1,306 @@
1
+ 'use strict';
2
+
3
+ const { describe, it, before, after } = require('node:test');
4
+ const assert = require('node:assert');
5
+ const fs = require('fs');
6
+ const path = require('path');
7
+ const os = require('os');
8
+ const { execSync } = require('child_process');
9
+
10
+ const {
11
+ compareMetric,
12
+ formatSecondaries,
13
+ extractSkillName,
14
+ createEvalWorktree,
15
+ removeEvalWorktree,
16
+ runGuardCheck,
17
+ } = require('./loop.js');
18
+
// --- Helper: create a temporary git repo with initial commit ---

/**
 * Create a throwaway git repository under the OS temp directory containing a
 * single commit ("initial commit") with a README.md.
 *
 * Identity and signing are configured locally in the new repo so the commit
 * does not depend on the developer's global git configuration.
 *
 * @returns {string} Absolute path of the new repository
 */
function createTempRepo() {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'loop-test-'));
  const git = (args) => execSync(`git ${args}`, { cwd: dir, stdio: 'pipe' });

  git('init');
  git('config user.email "test@test.com"');
  git('config user.name "Test"');
  // A global commit.gpgsign=true would make the commit below prompt or fail.
  git('config commit.gpgsign false');

  fs.writeFileSync(path.join(dir, 'README.md'), '# test repo\n');
  git('add -A');
  git('commit -m "initial commit"');
  return dir;
}
30
+
/**
 * Delete a temporary repository created for a test.
 *
 * Asks git to prune worktree bookkeeping first (to avoid git lock issues);
 * a failure of that step is ignored. The directory is then removed
 * recursively.
 *
 * @param {string} dir - Repository directory to delete
 */
function cleanupRepo(dir) {
  const quietInRepo = { cwd: dir, stdio: 'pipe' };
  try {
    execSync('git worktree prune', quietInRepo);
  } catch (_) {
    /* ignore */
  }

  const removeOptions = { recursive: true, force: true };
  fs.rmSync(dir, removeOptions);
}
38
+
// --- compareMetric ---

describe('compareMetric', () => {
  // Registers one test per row; checks `improved` first, then `delta`.
  const checkImprovedThenDelta = (rows) => {
    for (const [title, metric, baseline, current, improved, delta] of rows) {
      it(title, () => {
        const outcome = compareMetric(metric, baseline, current);
        assert.strictEqual(outcome.improved, improved);
        assert.strictEqual(outcome.delta, delta);
      });
    }
  };

  // Registers one test per row; checks `delta` first, then `improved`.
  const checkDeltaThenImproved = (rows) => {
    for (const [title, metric, baseline, current, delta, improved] of rows) {
      it(title, () => {
        const outcome = compareMetric(metric, baseline, current);
        assert.strictEqual(outcome.delta, delta);
        assert.strictEqual(outcome.improved, improved);
      });
    }
  };

  describe('higher-is-better metrics (e.g. cache_ratio)', () => {
    checkImprovedThenDelta([
      // (75-50)/50 * 100 = 50%
      ['reports improved when current > baseline', 'cache_ratio', 50, 75, true, 50],
      // (60-80)/80 * 100 = -25%
      ['reports not improved when current < baseline', 'cache_ratio', 80, 60, false, -25],
      ['reports not improved when values are equal', 'cache_ratio', 50, 50, false, 0],
    ]);
  });

  describe('lower-is-better metrics (total_tokens, wall_time, context_burn)', () => {
    checkImprovedThenDelta([
      // (800-1000)/1000 * 100 = -20%
      ['reports improved when current < baseline for total_tokens', 'total_tokens', 1000, 800, true, -20],
      ['reports improved when current < baseline for wall_time', 'wall_time', 60, 45, true, -25],
      ['reports improved when current < baseline for context_burn', 'context_burn', 200, 150, true, -25],
      ['reports not improved when current > baseline for total_tokens', 'total_tokens', 1000, 1200, false, 20],
      ['reports not improved when values are equal for lower-is-better', 'total_tokens', 500, 500, false, 0],
    ]);
  });

  describe('zero baseline edge cases', () => {
    checkDeltaThenImproved([
      ['returns delta=0 when both baseline and current are zero', 'cache_ratio', 0, 0, 0, false],
      ['returns delta=100 when baseline is zero and current is nonzero (higher-is-better)', 'cache_ratio', 0, 50, 100, true],
      // For lower-is-better, current(50) > baseline(0) => not improved
      ['returns delta=100 when baseline is zero and current is nonzero (lower-is-better)', 'total_tokens', 0, 50, 100, false],
    ]);
  });

  describe('delta rounding', () => {
    it('rounds delta to two decimal places', () => {
      // (7 - 3) / 3 * 100 = 133.33333...
      const { delta } = compareMetric('cache_ratio', 3, 7);
      assert.strictEqual(delta, 133.33);
    });
  });
});
123
+
// --- formatSecondaries ---

describe('formatSecondaries', () => {
  const TARGET = 'cache_ratio';

  // Rows: [test title, metrics object, secondary metric names, expected string]
  const rows = [
    [
      'formats secondary metric names and values',
      { cache_ratio: 0.85, total_tokens: 1200, wall_time: 45 },
      ['total_tokens', 'wall_time'],
      'total_tokens=1200 wall_time=45',
    ],
    [
      'excludes the target metric from secondaries',
      { cache_ratio: 0.85, total_tokens: 1200 },
      ['cache_ratio', 'total_tokens'],
      'total_tokens=1200',
    ],
    [
      'returns empty string when secondaryMetrics is empty',
      { cache_ratio: 0.85 },
      [],
      '',
    ],
  ];

  // These rows are registered after the null/undefined test to keep the
  // original test order.
  const trailingRows = [
    [
      'skips metrics not present in metrics object',
      { cache_ratio: 0.85 },
      ['total_tokens', 'wall_time'],
      '',
    ],
    [
      'includes only metrics present in the metrics object',
      { cache_ratio: 0.85, total_tokens: 1200 },
      ['total_tokens', 'missing_metric'],
      'total_tokens=1200',
    ],
  ];

  const register = ([title, metrics, secondaries, expected]) => {
    it(title, () => {
      const formatted = formatSecondaries(metrics, TARGET, secondaries);
      assert.strictEqual(formatted, expected);
    });
  };

  rows.forEach(register);

  it('returns empty string when secondaryMetrics is null/undefined', () => {
    const metrics = { cache_ratio: 0.85 };
    for (const absent of [null, undefined]) {
      assert.strictEqual(formatSecondaries(metrics, TARGET, absent), '');
    }
  });

  trailingRows.forEach(register);
});
163
+
// --- extractSkillName ---

describe('extractSkillName', () => {
  // Rows: [test title, input path, expected skill name]
  const rows = [
    ['extracts skill name from path containing SKILL.md', 'skills/atomic-commits/SKILL.md', 'atomic-commits'],
    ['falls back to filename without extension', 'some/path/my-skill.md', 'my-skill'],
    ['handles Windows-style backslashes', 'skills\\browse-fetch\\SKILL.md', 'browse-fetch'],
  ];

  for (const [title, input, expected] of rows) {
    it(title, () => {
      assert.strictEqual(extractSkillName(input), expected);
    });
  }
});
179
+
// --- createEvalWorktree / removeEvalWorktree ---

describe('createEvalWorktree', () => {
  let repoDir;

  before(() => {
    repoDir = createTempRepo();
  });

  after(() => {
    cleanupRepo(repoDir);
  });

  it('creates a worktree directory and returns branch and path', () => {
    const created = createEvalWorktree(repoDir, 'test-skill');
    const { branch, worktreePath } = created;
    const existsInWorktree = (name) => fs.existsSync(path.join(worktreePath, name));

    assert.ok(branch.startsWith('eval/test-skill/'), `branch should start with eval/test-skill/, got: ${branch}`);
    assert.ok(fs.existsSync(worktreePath), 'worktree directory should exist');

    // A valid git worktree carries a .git entry (file or directory)
    assert.ok(existsInWorktree('.git'), 'worktree should have .git file/dir');

    // The README from the initial commit must have been checked out
    assert.ok(existsInWorktree('README.md'), 'worktree should contain files from HEAD');

    // Cleanup worktree
    removeEvalWorktree(repoDir, worktreePath);
  });
});
211
+
describe('removeEvalWorktree', () => {
  let repoDir;

  before(() => {
    repoDir = createTempRepo();
  });

  after(() => {
    cleanupRepo(repoDir);
  });

  it('removes an existing worktree', () => {
    const { worktreePath } = createEvalWorktree(repoDir, 'remove-test');
    const stillThere = () => fs.existsSync(worktreePath);

    assert.ok(stillThere(), 'worktree should exist before removal');

    removeEvalWorktree(repoDir, worktreePath);

    assert.ok(!stillThere(), 'worktree directory should be removed');
  });

  it('does not throw when removing a non-existent worktree', () => {
    const fakePath = path.join(os.tmpdir(), 'nonexistent-worktree-12345');
    const attemptRemoval = () => {
      removeEvalWorktree(repoDir, fakePath);
    };
    assert.doesNotThrow(attemptRemoval);
  });
});
240
+
// --- runGuardCheck ---

describe('runGuardCheck', () => {
  let tempDir;

  before(() => {
    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'guard-test-'));
  });

  after(() => {
    fs.rmSync(tempDir, { recursive: true, force: true });
  });

  // Rows: [test title, guard config, expected `passed`, substring expected in
  // `output` (or null when the output is not checked)]
  const rows = [
    [
      'passes when build and test commands succeed',
      { build_command: 'echo "build ok"', test_command: 'echo "test ok"' },
      true,
      'test ok',
    ],
    [
      'fails when build command fails',
      { build_command: 'exit 1', test_command: 'echo "test ok"' },
      false,
      null,
    ],
    [
      'fails when test command fails',
      { build_command: 'echo "build ok"', test_command: 'exit 1' },
      false,
      null,
    ],
    [
      'passes with no guard commands configured',
      {},
      true,
      'no guard commands configured',
    ],
    [
      'passes with only build_command configured',
      { build_command: 'echo "build only"' },
      true,
      null,
    ],
    [
      'passes with only test_command configured',
      { test_command: 'echo "test only"' },
      true,
      null,
    ],
    [
      'returns error output on failure',
      { test_command: 'echo "some error" >&2 && exit 1' },
      false,
      'some error',
    ],
  ];

  for (const [title, guardConfig, expectedPassed, expectedText] of rows) {
    it(title, () => {
      const result = runGuardCheck(tempDir, guardConfig);
      assert.strictEqual(result.passed, expectedPassed);
      if (expectedText !== null) {
        assert.ok(result.output.includes(expectedText));
      }
    });
  }
});