npm - deepflow - Versions diffs - 0.1.103 → 0.1.104 - Mend

deepflow 0.1.103 → 0.1.104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

package/bin/install-dynamic-hooks.test.js +461 -0
package/bin/install.js +150 -250
package/bin/lineage-ingest.js +70 -0
package/hooks/df-check-update.js +1 -0
package/hooks/df-command-usage.js +18 -0
package/hooks/df-dashboard-push.js +1 -0
package/hooks/df-execution-history.js +1 -0
package/hooks/df-explore-protocol.js +83 -0
package/hooks/df-explore-protocol.test.js +228 -0
package/hooks/df-hook-event-tags.test.js +127 -0
package/hooks/df-invariant-check.js +1 -0
package/hooks/df-quota-logger.js +1 -0
package/hooks/df-snapshot-guard.js +1 -0
package/hooks/df-spec-lint.js +58 -1
package/hooks/df-spec-lint.test.js +412 -0
package/hooks/df-statusline.js +1 -0
package/hooks/df-subagent-registry.js +1 -0
package/hooks/df-tool-usage.js +13 -3
package/hooks/df-worktree-guard.js +1 -0
package/package.json +1 -1
package/src/commands/df/debate.md +1 -1
package/src/commands/df/eval.md +117 -0
package/src/commands/df/execute.md +1 -1
package/src/commands/df/fix.md +104 -0
package/src/eval/git-memory.js +159 -0
package/src/eval/git-memory.test.js +439 -0
package/src/eval/hypothesis.js +80 -0
package/src/eval/hypothesis.test.js +169 -0
package/src/eval/loop.js +378 -0
package/src/eval/loop.test.js +306 -0
package/src/eval/metric-collector.js +163 -0
package/src/eval/metric-collector.test.js +369 -0
package/src/eval/metric-pivot.js +119 -0
package/src/eval/metric-pivot.test.js +350 -0
package/src/eval/mutator-prompt.js +106 -0
package/src/eval/mutator-prompt.test.js +180 -0
package/templates/config-template.yaml +5 -0
package/templates/eval-fixture-template/config.yaml +39 -0
package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
package/templates/eval-fixture-template/fixture/package.json +12 -0
package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
package/templates/eval-fixture-template/fixture/src/config.js +40 -0
package/templates/eval-fixture-template/fixture/src/index.js +19 -0
package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
package/templates/eval-fixture-template/hypotheses.md +14 -0
package/templates/eval-fixture-template/spec.md +34 -0
package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
package/templates/eval-fixture-template/tests/guard.test.js +108 -0
package/templates/eval-fixture-template.test.js +318 -0
package/templates/explore-agent.md +5 -74
package/templates/explore-protocol.md +44 -0
package/templates/spec-template.md +4 -0

package/src/eval/metric-pivot.test.js ADDED Viewed

@@ -0,0 +1,350 @@
+'use strict';
+const { describe, it, before, after } = require('node:test');
+const assert = require('node:assert');
+const fs = require('fs');
+const path = require('path');
+const os = require('os');
+const { execSync } = require('child_process');
+const {
+  parseSecondaries,
+  surfaceCandidates,
+  formatCandidates,
+} = require('./metric-pivot.js');
+const { commitExperiment, revertExperiment } = require('./git-memory.js');
+/**
+ * Build a throwaway git repository for a test to run against.
+ *
+ * A fresh directory is created under the OS temp dir, initialised with
+ * `git init`, given a repository-level user identity, and seeded with a
+ * single commit that contains README.md.
+ *
+ * @returns {string} Path of the newly created repository directory.
+ */
+function createTempRepo() {
+  const repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'metric-pivot-test-'));
+  // Every git invocation runs inside the new directory with piped stdio.
+  const git = (command) => execSync(command, { cwd: repoDir, stdio: 'pipe' });
+  git('git init');
+  git('git config user.email "test@test.com"');
+  git('git config user.name "Test"');
+  fs.writeFileSync(path.join(repoDir, 'README.md'), '# test repo\n');
+  git('git add -A && git commit -m "initial commit"');
+  return repoDir;
+}
+/**
+ * Delete a repository directory that was created for a test.
+ *
+ * @param {string} dir  Directory to remove, together with everything inside it.
+ */
+function cleanupRepo(dir) {
+  // recursive: remove the whole tree; force: a path that no longer exists is
+  // not reported as an error.
+  const removalOptions = { recursive: true, force: true };
+  fs.rmSync(dir, removalOptions);
+}
+// --- parseSecondaries ---
+describe('parseSecondaries', () => {
+  it('parses single key=value pair', () => {
+    const result = parseSecondaries('accuracy=98%');
+    assert.deepStrictEqual(result, { accuracy: '98%' });
+  });
+  it('parses multiple key=value pairs', () => {
+    const result = parseSecondaries('accuracy=98% latency=4.2s mem=50MB');
+    assert.deepStrictEqual(result, {
+      accuracy: '98%',
+      latency: '4.2s',
+      mem: '50MB',
+    });
+  });
+  it('returns empty object for null input', () => {
+    assert.deepStrictEqual(parseSecondaries(null), {});
+  });
+  it('returns empty object for undefined input', () => {
+    assert.deepStrictEqual(parseSecondaries(undefined), {});
+  });
+  it('returns empty object for empty string', () => {
+    assert.deepStrictEqual(parseSecondaries(''), {});
+  });
+  it('skips tokens without equals sign', () => {
+    const result = parseSecondaries('good=1 badtoken good2=2');
+    assert.deepStrictEqual(result, { good: '1', good2: '2' });
+  });
+  it('handles values containing equals signs', () => {
+    // token "key=a=b" => key="a=b" (first = is the split point)
+    const result = parseSecondaries('formula=x=y+z');
+    assert.deepStrictEqual(result, { formula: 'x=y+z' });
+  });
+  it('handles extra whitespace between tokens', () => {
+    const result = parseSecondaries('  a=1   b=2  ');
+    assert.deepStrictEqual(result, { a: '1', b: '2' });
+  });
+  it('skips tokens with empty key (e.g. "=value")', () => {
+    const result = parseSecondaries('=nokey valid=yes');
+    assert.deepStrictEqual(result, { valid: 'yes' });
+  });
+});
+// --- surfaceCandidates (integration with temp git repo) ---
+describe('surfaceCandidates', () => {
+  describe('AC-14: finds reverted experiments with positive delta on new target', () => {
+    let cwd;
+    before(() => {
+      cwd = createTempRepo();
+      // Experiment 1: reverted, primary target=latency, positive delta
+      fs.writeFileSync(path.join(cwd, 'e1.txt'), '1');
+      commitExperiment({
+        cwd, skillName: 'browse-fetch', hypothesis: 'cache responses',
+        target: 'latency', value: '4.2s', delta: '+16', status: 'reverted',
+        secondaries: 'accuracy=98%',
+      });
+      // Experiment 2: reverted, primary target=latency, negative delta => excluded
+      fs.writeFileSync(path.join(cwd, 'e2.txt'), '2');
+      commitExperiment({
+        cwd, skillName: 'browse-fetch', hypothesis: 'bad cache',
+        target: 'latency', value: '10s', delta: '-5', status: 'reverted',
+        secondaries: '',
+      });
+      // Experiment 3: kept (pass), primary target=latency, positive delta => excluded (not reverted)
+      fs.writeFileSync(path.join(cwd, 'e3.txt'), '3');
+      commitExperiment({
+        cwd, skillName: 'browse-fetch', hypothesis: 'kept experiment',
+        target: 'latency', value: '3s', delta: '+20', status: 'pass',
+        secondaries: '',
+      });
+      // Experiment 4: reverted, primary target=speed, but latency in secondaries
+      fs.writeFileSync(path.join(cwd, 'e4.txt'), '4');
+      commitExperiment({
+        cwd, skillName: 'browse-fetch', hypothesis: 'speed tweak with latency side-effect',
+        target: 'speed', value: '150', delta: '+10', status: 'reverted',
+        secondaries: 'latency=3.1s mem=40MB',
+      });
+    });
+    after(() => {
+      cleanupRepo(cwd);
+    });
+    it('finds reverted experiment with positive delta on primary target', () => {
+      const candidates = surfaceCandidates({
+        cwd, skillName: 'browse-fetch', newTarget: 'latency',
+      });
+      const primaryCandidate = candidates.find(c => c.candidateSource === 'primary');
+      assert.ok(primaryCandidate, 'Should find a primary candidate');
+      assert.strictEqual(primaryCandidate.hypothesis, 'cache responses');
+      assert.strictEqual(primaryCandidate.candidateValue, '4.2s');
+      assert.strictEqual(primaryCandidate.candidateDelta, 16);
+    });
+    it('excludes reverted experiments with negative delta on primary target', () => {
+      const candidates = surfaceCandidates({
+        cwd, skillName: 'browse-fetch', newTarget: 'latency',
+      });
+      const badCache = candidates.find(c => c.hypothesis === 'bad cache');
+      assert.strictEqual(badCache, undefined, 'Should not include negative delta experiment');
+    });
+    it('excludes non-reverted (kept) experiments', () => {
+      const candidates = surfaceCandidates({
+        cwd, skillName: 'browse-fetch', newTarget: 'latency',
+      });
+      const kept = candidates.find(c => c.hypothesis === 'kept experiment');
+      assert.strictEqual(kept, undefined, 'Should not include kept (pass) experiments');
+    });
+    it('finds candidates from secondary metrics', () => {
+      const candidates = surfaceCandidates({
+        cwd, skillName: 'browse-fetch', newTarget: 'latency',
+      });
+      const secondaryCandidate = candidates.find(c => c.candidateSource === 'secondary');
+      assert.ok(secondaryCandidate, 'Should find a secondary candidate');
+      assert.strictEqual(secondaryCandidate.hypothesis, 'speed tweak with latency side-effect');
+      assert.strictEqual(secondaryCandidate.candidateValue, '3.1s');
+      assert.strictEqual(secondaryCandidate.candidateDelta, null, 'Secondary candidates have no delta');
+    });
+    it('returns empty array when no candidates match new target', () => {
+      const candidates = surfaceCandidates({
+        cwd, skillName: 'browse-fetch', newTarget: 'nonexistent-metric',
+      });
+      assert.deepStrictEqual(candidates, []);
+    });
+  });
+  describe('edge cases', () => {
+    let cwd;
+    before(() => {
+      cwd = createTempRepo();
+    });
+    after(() => {
+      cleanupRepo(cwd);
+    });
+    it('returns empty array when no experiments exist', () => {
+      const candidates = surfaceCandidates({
+        cwd, skillName: 'no-experiments', newTarget: 'latency',
+      });
+      assert.deepStrictEqual(candidates, []);
+    });
+    it('returns empty array for unknown skill name', () => {
+      // Add an experiment for a different skill
+      fs.writeFileSync(path.join(cwd, 'other.txt'), 'other');
+      commitExperiment({
+        cwd, skillName: 'other-skill', hypothesis: 'h',
+        target: 'metric', value: '1', delta: '+5', status: 'reverted',
+        secondaries: '',
+      });
+      const candidates = surfaceCandidates({
+        cwd, skillName: 'nonexistent-skill', newTarget: 'metric',
+      });
+      assert.deepStrictEqual(candidates, []);
+    });
+  });
+  describe('reverted experiments with zero delta excluded', () => {
+    let cwd;
+    before(() => {
+      cwd = createTempRepo();
+      fs.writeFileSync(path.join(cwd, 'z.txt'), 'z');
+      commitExperiment({
+        cwd, skillName: 'zero-test', hypothesis: 'zero delta',
+        target: 'metric', value: '50', delta: '0', status: 'reverted',
+        secondaries: '',
+      });
+    });
+    after(() => {
+      cleanupRepo(cwd);
+    });
+    it('excludes reverted experiments with zero delta (not positive)', () => {
+      const candidates = surfaceCandidates({
+        cwd, skillName: 'zero-test', newTarget: 'metric',
+      });
+      assert.deepStrictEqual(candidates, [], 'Zero delta is not positive, should be excluded');
+    });
+  });
+});
+// --- formatCandidates ---
+describe('formatCandidates', () => {
+  it('returns "no candidates" message for empty list', () => {
+    const result = formatCandidates([], 'latency');
+    assert.strictEqual(result, 'No reverted experiments found with positive delta on target="latency".');
+  });
+  it('formats candidate with primary delta', () => {
+    const candidates = [{
+      hash: 'abc1234',
+      skillName: 'browse-fetch',
+      hypothesis: 'cache responses',
+      target: 'latency',
+      value: '4.2s',
+      delta: 16,
+      status: 'reverted',
+      secondaries: 'accuracy=98%',
+      candidateValue: '4.2s',
+      candidateDelta: 16,
+      candidateSource: 'primary',
+    }];
+    const result = formatCandidates(candidates, 'latency');
+    assert.ok(result.includes('Reverted experiments with positive signal on "latency"'));
+    assert.ok(result.includes('[abc1234] browse-fetch: cache responses'));
+    assert.ok(result.includes('delta=+16% (primary)'));
+    assert.ok(result.includes('original target: latency=4.2s delta=16%'));
+  });
+  it('formats candidate from secondary (no delta)', () => {
+    const candidates = [{
+      hash: 'def5678',
+      skillName: 'browse-fetch',
+      hypothesis: 'speed tweak',
+      target: 'speed',
+      value: '150',
+      delta: 10,
+      status: 'reverted',
+      secondaries: 'latency=3.1s',
+      candidateValue: '3.1s',
+      candidateDelta: null,
+      candidateSource: 'secondary',
+    }];
+    const result = formatCandidates(candidates, 'latency');
+    assert.ok(result.includes('[def5678] browse-fetch: speed tweak'));
+    assert.ok(result.includes('value=3.1s (secondary'));
+    assert.ok(result.includes('no delta available'));
+    assert.ok(result.includes('original target: speed=150 delta=10%'));
+  });
+  it('formats multiple candidates', () => {
+    const candidates = [
+      {
+        hash: 'aaa',
+        skillName: 'skill-a',
+        hypothesis: 'h1',
+        target: 'latency',
+        value: '4s',
+        delta: 10,
+        status: 'reverted',
+        candidateValue: '4s',
+        candidateDelta: 10,
+        candidateSource: 'primary',
+      },
+      {
+        hash: 'bbb',
+        skillName: 'skill-a',
+        hypothesis: 'h2',
+        target: 'speed',
+        value: '200',
+        delta: 5,
+        status: 'reverted',
+        candidateValue: '3s',
+        candidateDelta: null,
+        candidateSource: 'secondary',
+      },
+    ];
+    const result = formatCandidates(candidates, 'latency');
+    assert.ok(result.includes('[aaa]'));
+    assert.ok(result.includes('[bbb]'));
+    assert.ok(result.includes('h1'));
+    assert.ok(result.includes('h2'));
+  });
+  it('does not have trailing newline', () => {
+    const candidates = [{
+      hash: 'x',
+      skillName: 's',
+      hypothesis: 'h',
+      target: 't',
+      value: '1',
+      delta: 5,
+      status: 'reverted',
+      candidateValue: '1',
+      candidateDelta: 5,
+      candidateSource: 'primary',
+    }];
+    const result = formatCandidates(candidates, 't');
+    assert.ok(!result.endsWith('\n'), 'Output should not end with newline (trimEnd applied)');
+  });
+});

package/src/eval/mutator-prompt.js ADDED Viewed

@@ -0,0 +1,106 @@
+'use strict';
+/**
+ * Mutator prompt builder for the skill-eval Karpathy loop.
+ *
+ * Layout follows the attention U-curve pattern:
+ *   START zone: ## Skill + ## Hypothesis  (highest attention — task + context)
+ *   MIDDLE zone: ## History               (lower attention — truncated experiment log)
+ *   END zone:   ## Instructions           (highest attention — action directive)
+ *
+ * Full file replacement: the mutator is instructed to output the COMPLETE new
+ * skill file, not a diff or partial edit.
+ */
+const CHARS_PER_TOKEN = 4; // rough estimate: 1 token ≈ 4 characters
+/**
+ * Truncate experiment history to fit within maxTokens, keeping the most
+ * recent experiments (the `history` array is assumed to be ordered
+ * newest → oldest).
+ *
+ * Entries are taken from the front of the array until the next one would
+ * exceed the character budget (maxTokens * CHARS_PER_TOKEN). Each entry is
+ * charged its length plus 2 for the blank-line separator. When older entries
+ * are dropped, a note with their count is appended; the note itself is not
+ * charged against the budget.
+ *
+ * If not even the newest entry fits, only that entry is returned, cut down
+ * to (budget - 3) characters and terminated with '...'. For budgets below 3
+ * characters the result is just '...'.
+ *
+ * @param {string[]} history  Array of experiment strings, most recent first.
+ * @param {number}   maxTokens  Token budget for the history section.
+ * @returns {string}  Concatenated history that fits within budget, or
+ *   '(no experiments yet)' when there is no history.
+ */
+function truncateHistory(history, maxTokens) {
+  if (!history || history.length === 0) return '(no experiments yet)';
+  const budget = maxTokens * CHARS_PER_TOKEN;
+  const kept = [];
+  let used = 0;
+  for (const entry of history) {
+    const entryChars = entry.length + 2; // +2 for the separating newlines
+    if (used + entryChars > budget) break;
+    kept.push(entry);
+    used += entryChars;
+  }
+  if (kept.length === 0) {
+    // Even the single most-recent entry exceeds budget — truncate it hard.
+    // Clamp at 0: a negative end index makes slice() count from the END of
+    // the string, which would return almost the whole oversized entry.
+    const keepChars = Math.max(0, budget - 3);
+    return `${history[0].slice(0, keepChars)}...`;
+  }
+  const dropped = history.length - kept.length;
+  const suffix = dropped > 0 ? `\n\n_(${dropped} older experiment(s) omitted to fit token budget)_` : '';
+  return kept.join('\n\n') + suffix;
+}
+/**
+ * Assemble the prompt sent to the mutator LLM.
+ *
+ * The prompt consists of four markdown sections, in this order:
+ * `## Skill`, `## Hypothesis`, `## History`, `## Instructions`. Trailing
+ * whitespace is stripped from the skill text and the hypothesis, and the
+ * history is reduced by truncateHistory() to the given token budget.
+ *
+ * @param {object} options
+ * @param {string}   options.skillContent       Full text of the current skill file.
+ * @param {string}   options.hypothesis         The current mutation hypothesis.
+ * @param {string[]} [options.history=[]]       Experiment history strings, most recent first.
+ * @param {number}   [options.maxHistoryTokens=4000]  Token budget for history section.
+ * @returns {string}  The complete prompt string to send to the mutator LLM.
+ * @throws {Error}  When skillContent or hypothesis is not a non-empty string.
+ */
+function buildMutatorPrompt({
+  skillContent,
+  hypothesis,
+  history = [],
+  maxHistoryTokens = 4000,
+}) {
+  const isNonEmptyString = (value) => typeof value === 'string' && value.length !== 0;
+  if (!isNonEmptyString(skillContent)) {
+    throw new Error('skillContent must be a non-empty string');
+  }
+  if (!isNonEmptyString(hypothesis)) {
+    throw new Error('hypothesis must be a non-empty string');
+  }
+  const historyText = truncateHistory(history, maxHistoryTokens);
+  const instructionLines = [
+    'You are mutating the skill file above to test the hypothesis.',
+    '',
+    'Rules:',
+    '- Output the COMPLETE replacement skill file — no diffs, no partial edits.',
+    '- Do not change the YAML frontmatter `name` or `description` fields.',
+    '- Apply exactly ONE focused change that directly tests the hypothesis.',
+    '- If the history shows this hypothesis already failed, try a different angle.',
+    '- If the history shows a best-known state, you may backtrack to it and try a',
+    '  smaller variation.',
+    '',
+    'Respond with only the new skill file content, starting with the YAML front matter.',
+    'No prose, no explanation, no code fences.',
+  ];
+  // Heading/body pairs; each heading is followed by a blank line, and
+  // consecutive sections are separated by a blank line as well.
+  const sections = [
+    ['## Skill', skillContent.trimEnd()],
+    ['## Hypothesis', hypothesis.trimEnd()],
+    ['## History', historyText],
+    ['## Instructions', instructionLines.join('\n')],
+  ];
+  return sections
+    .map(([heading, body]) => `${heading}\n\n${body}`)
+    .join('\n\n');
+}
+module.exports = { buildMutatorPrompt };

package/src/eval/mutator-prompt.test.js ADDED Viewed

@@ -0,0 +1,180 @@
+'use strict';
+const { describe, it } = require('node:test');
+const assert = require('node:assert/strict');
+const { buildMutatorPrompt } = require('./mutator-prompt.js');
+const DEFAULTS = { skillContent: '---\nname: test\n---\nSome skill', hypothesis: 'Try X' };
+describe('buildMutatorPrompt', () => {
+  // --- AC-8: Section order ---
+  it('emits sections in order: ## Skill, ## Hypothesis, ## History, ## Instructions', () => {
+    const result = buildMutatorPrompt(DEFAULTS);
+    const headings = [...result.matchAll(/^## \w+/gm)].map(m => m[0]);
+    assert.deepStrictEqual(headings, ['## Skill', '## Hypothesis', '## History', '## Instructions']);
+  });
+  it('## Skill appears before ## Hypothesis which appears before ## History which appears before ## Instructions', () => {
+    const result = buildMutatorPrompt({ ...DEFAULTS, history: ['exp1'] });
+    const skillIdx = result.indexOf('## Skill');
+    const hypIdx = result.indexOf('## Hypothesis');
+    const histIdx = result.indexOf('## History');
+    const instrIdx = result.indexOf('## Instructions');
+    assert.ok(skillIdx < hypIdx, 'Skill before Hypothesis');
+    assert.ok(hypIdx < histIdx, 'Hypothesis before History');
+    assert.ok(histIdx < instrIdx, 'History before Instructions');
+  });
+  // --- Skill content placement ---
+  it('places skillContent after ## Skill heading', () => {
+    const result = buildMutatorPrompt(DEFAULTS);
+    const afterSkill = result.split('## Skill\n\n')[1];
+    assert.ok(afterSkill.startsWith(DEFAULTS.skillContent));
+  });
+  it('places hypothesis after ## Hypothesis heading', () => {
+    const result = buildMutatorPrompt(DEFAULTS);
+    const afterHyp = result.split('## Hypothesis\n\n')[1];
+    assert.ok(afterHyp.startsWith(DEFAULTS.hypothesis));
+  });
+  // --- Empty history ---
+  it('shows "(no experiments yet)" when history is empty', () => {
+    const result = buildMutatorPrompt(DEFAULTS);
+    assert.ok(result.includes('(no experiments yet)'));
+  });
+  it('shows "(no experiments yet)" when history is not provided', () => {
+    const result = buildMutatorPrompt({ skillContent: 'skill', hypothesis: 'hyp' });
+    assert.ok(result.includes('(no experiments yet)'));
+  });
+  // --- AC-9: History truncation ---
+  it('includes all history entries when within token budget', () => {
+    const history = ['entry-one', 'entry-two'];
+    const result = buildMutatorPrompt({ ...DEFAULTS, history, maxHistoryTokens: 4000 });
+    assert.ok(result.includes('entry-one'));
+    assert.ok(result.includes('entry-two'));
+    assert.ok(!result.includes('omitted'));
+  });
+  it('truncates older entries when history exceeds maxHistoryTokens budget', () => {
+    // budget = 5 tokens * 4 chars/token = 20 chars
+    // Each entry gets length + 2 for separating newlines
+    const history = ['aaaa', 'bbbb', 'cccccccccccccccc'];
+    // entry 'aaaa' = 4+2=6 chars, 'bbbb' = 4+2=6 chars => 12 chars fits in 20
+    // 'cccc...' = 16+2=18 chars => 12+18=30 > 20 => dropped
+    const result = buildMutatorPrompt({ ...DEFAULTS, history, maxHistoryTokens: 5 });
+    assert.ok(result.includes('aaaa'), 'newest entry kept');
+    assert.ok(result.includes('bbbb'), 'second entry kept');
+    assert.ok(!result.includes('cccccccccccccccc'), 'oldest entry dropped');
+    assert.ok(result.includes('1 older experiment(s) omitted'));
+  });
+  it('keeps entries newest-first and drops from the end', () => {
+    // budget = 3 tokens * 4 = 12 chars total
+    const history = ['AAAA', 'BBBB', 'CCCC'];
+    // 'AAAA' = 4+2=6, 'BBBB' = 4+2=6 => 12, fits exactly. 'CCCC' dropped.
+    const result = buildMutatorPrompt({ ...DEFAULTS, history, maxHistoryTokens: 3 });
+    assert.ok(result.includes('AAAA'));
+    assert.ok(result.includes('BBBB'));
+    assert.ok(!result.includes('CCCC'));
+    assert.ok(result.includes('1 older experiment(s) omitted'));
+  });
+  it('reports correct count of dropped entries', () => {
+    // budget = 2 tokens * 4 = 8 chars
+    // 'AA' = 2+2=4 chars, fits. 'BB' = 2+2=4 chars => 4+4=8, equal to the budget, so it still fits.
+    // 'CC' would bring the total to 12 > 8, so 'CC' and 'DD' are dropped.
+    const history = ['AA', 'BB', 'CC', 'DD'];
+    const result = buildMutatorPrompt({ ...DEFAULTS, history, maxHistoryTokens: 2 });
+    assert.ok(result.includes('2 older experiment(s) omitted'));
+  });
+  // --- Single entry exceeding budget: hard truncation ---
+  it('hard-truncates a single entry that exceeds the entire budget', () => {
+    const longEntry = 'X'.repeat(200);
+    // budget = 2 tokens * 4 = 8 chars. Entry is 200 chars, exceeds budget.
+    const result = buildMutatorPrompt({ ...DEFAULTS, history: [longEntry], maxHistoryTokens: 2 });
+    // Hard truncated to budget-3 chars + '...'
+    assert.ok(result.includes('...'), 'should end with ellipsis');
+    assert.ok(!result.includes('X'.repeat(200)), 'full entry should not appear');
+    // The truncated text should be budget - 3 = 5 chars of X + '...'
+    assert.ok(result.includes('XXXXX...'));
+  });
+  it('hard-truncated single entry length equals budget chars', () => {
+    const longEntry = 'Y'.repeat(500);
+    const maxTokens = 10; // budget = 40 chars
+    const result = buildMutatorPrompt({ ...DEFAULTS, history: [longEntry], maxHistoryTokens: maxTokens });
+    // Truncated to (40 - 3) = 37 Y's + '...' = 40 chars total
+    const historySection = result.split('## History\n\n')[1].split('\n\n## Instructions')[0];
+    assert.equal(historySection.length, 40);
+    assert.ok(historySection.endsWith('...'));
+  });
+  // --- Full file replacement instructions ---
+  it('contains full file replacement instruction', () => {
+    const result = buildMutatorPrompt(DEFAULTS);
+    assert.ok(result.includes('COMPLETE replacement skill file'));
+  });
+  it('instructs no diffs or partial edits', () => {
+    const result = buildMutatorPrompt(DEFAULTS);
+    assert.ok(result.includes('no diffs, no partial edits'));
+  });
+  it('instructs to respond with only skill file content', () => {
+    const result = buildMutatorPrompt(DEFAULTS);
+    assert.ok(result.includes('Respond with only the new skill file content'));
+  });
+  // --- Default maxHistoryTokens is 4000 ---
+  it('defaults maxHistoryTokens to 4000 (fits ~16000 chars of history)', () => {
+    // With default 4000 tokens = 16000 chars budget
+    // Create history that fits in 16000 but would not fit in, say, 1000
+    const entry = 'Z'.repeat(5000); // 5000 + 2 = 5002 chars each
+    const history = [entry, entry, entry]; // 3 * 5002 = 15006 chars, fits in 16000
+    const result = buildMutatorPrompt({ ...DEFAULTS, history });
+    // All three should be present with the default budget
+    const count = (result.match(/Z{5000}/g) || []).length;
+    assert.equal(count, 3, 'all three entries should be included with default 4000 token budget');
+    assert.ok(!result.includes('omitted'));
+  });
+  // --- Input validation ---
+  it('throws when skillContent is missing', () => {
+    assert.throws(() => buildMutatorPrompt({ hypothesis: 'hyp' }), /skillContent must be a non-empty string/);
+  });
+  it('throws when skillContent is empty string', () => {
+    assert.throws(() => buildMutatorPrompt({ skillContent: '', hypothesis: 'hyp' }), /skillContent must be a non-empty string/);
+  });
+  it('throws when hypothesis is missing', () => {
+    assert.throws(() => buildMutatorPrompt({ skillContent: 'skill' }), /hypothesis must be a non-empty string/);
+  });
+  it('throws when hypothesis is empty string', () => {
+    assert.throws(() => buildMutatorPrompt({ skillContent: 'skill', hypothesis: '' }), /hypothesis must be a non-empty string/);
+  });
+  // --- Edge cases ---
+  it('trims trailing whitespace from skillContent and hypothesis', () => {
+    const result = buildMutatorPrompt({ skillContent: 'skill  \n\n', hypothesis: 'hyp  \n' });
+    // After ## Skill, should have trimmed content
+    const skillSection = result.split('## Skill\n\n')[1].split('\n\n## Hypothesis')[0];
+    assert.equal(skillSection, 'skill');
+    const hypSection = result.split('## Hypothesis\n\n')[1].split('\n\n## History')[0];
+    assert.equal(hypSection, 'hyp');
+  });
+});

package/templates/config-template.yaml CHANGED Viewed

@@ -129,6 +129,11 @@ dashboard_url: ""
 # Default: 3334  (3333 is reserved for local mode)
 dashboard_port: 3334
+eval:
+  # Max tokens of experiment history passed to the mutator prompt during skill evaluation
+  max_history_tokens: 4000
+  # Future: rework metric (fix count) as secondary signal — requires spec-lineage tracking (not in v1)
 # Recommended .gitignore entries
 # Add these entries to your .gitignore to exclude instrumentation artifacts
 gitignore_entries:

package/templates/eval-fixture-template/config.yaml ADDED Viewed

@@ -0,0 +1,39 @@
+# Benchmark configuration for /df:eval
+benchmark:
+  name: "{benchmark-name}"
+  skill: "skills/{skill-name}/SKILL.md"   # path to skill being evaluated
+  description: "[What this benchmark measures]"
+# Metric definitions
+metrics:
+  # Target metric — the one numeric value that decides keep/revert
+  target: cache_ratio
+  # Guard — binary pass/fail. Failure auto-reverts before any metric check.
+  guard_command: "node tests/guard.test.js"
+  # Secondary metrics — logged in commit message, inform mutator, never decide
+  secondary:
+    - total_tokens
+    - wall_time
+    - context_burn
+# Iteration settings
+loop:
+  # Default iteration cap. Override with --loop N flag. 0 = unlimited (Ctrl+C to stop)
+  default_iterations: 0
+  # History window passed to mutator prompt (~4000 tokens ≈ 15 experiments)
+  max_history_tokens: 4000
+# Fixture execution
+fixture:
+  # Command to run inside the fixture dir during each eval iteration
+  run_command: "node scripts/run-task.js"
+  # Timeout per iteration in seconds
+  timeout: 300
+  # deepflow token-history location relative to fixture dir
+  token_history_path: ".deepflow/token-history.jsonl"