deepflow 0.1.102 → 0.1.104
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install-dynamic-hooks.test.js +461 -0
- package/bin/install.js +150 -204
- package/bin/install.test.js +214 -0
- package/bin/lineage-ingest.js +70 -0
- package/hooks/df-check-update.js +1 -0
- package/hooks/df-command-usage.js +305 -0
- package/hooks/df-command-usage.test.js +1019 -0
- package/hooks/df-dashboard-push.js +1 -0
- package/hooks/df-execution-history.js +1 -0
- package/hooks/df-explore-protocol.js +83 -0
- package/hooks/df-explore-protocol.test.js +228 -0
- package/hooks/df-hook-event-tags.test.js +127 -0
- package/hooks/df-invariant-check.js +1 -0
- package/hooks/df-quota-logger.js +1 -0
- package/hooks/df-snapshot-guard.js +1 -0
- package/hooks/df-spec-lint.js +58 -1
- package/hooks/df-spec-lint.test.js +412 -0
- package/hooks/df-statusline.js +1 -0
- package/hooks/df-subagent-registry.js +34 -14
- package/hooks/df-tool-usage.js +21 -3
- package/hooks/df-tool-usage.test.js +200 -0
- package/hooks/df-worktree-guard.js +1 -0
- package/package.json +1 -1
- package/src/commands/df/debate.md +1 -1
- package/src/commands/df/eval.md +117 -0
- package/src/commands/df/execute.md +1 -1
- package/src/commands/df/fix.md +104 -0
- package/src/eval/git-memory.js +159 -0
- package/src/eval/git-memory.test.js +439 -0
- package/src/eval/hypothesis.js +80 -0
- package/src/eval/hypothesis.test.js +169 -0
- package/src/eval/loop.js +378 -0
- package/src/eval/loop.test.js +306 -0
- package/src/eval/metric-collector.js +163 -0
- package/src/eval/metric-collector.test.js +369 -0
- package/src/eval/metric-pivot.js +119 -0
- package/src/eval/metric-pivot.test.js +350 -0
- package/src/eval/mutator-prompt.js +106 -0
- package/src/eval/mutator-prompt.test.js +180 -0
- package/templates/config-template.yaml +5 -0
- package/templates/eval-fixture-template/config.yaml +39 -0
- package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
- package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
- package/templates/eval-fixture-template/fixture/package.json +12 -0
- package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
- package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
- package/templates/eval-fixture-template/fixture/src/config.js +40 -0
- package/templates/eval-fixture-template/fixture/src/index.js +19 -0
- package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
- package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
- package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
- package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
- package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
- package/templates/eval-fixture-template/hypotheses.md +14 -0
- package/templates/eval-fixture-template/spec.md +34 -0
- package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
- package/templates/eval-fixture-template/tests/guard.test.js +108 -0
- package/templates/eval-fixture-template.test.js +318 -0
- package/templates/explore-agent.md +5 -74
- package/templates/explore-protocol.md +44 -0
- package/templates/spec-template.md +4 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Metric collector for df:eval.
|
|
5
|
+
*
|
|
6
|
+
* Reads `.deepflow/token-history.jsonl` and `~/.claude/tool-usage.jsonl`
|
|
7
|
+
* from existing hook outputs — no new instrumentation hooks installed (AC-17).
|
|
8
|
+
*
|
|
9
|
+
* Metric source mapping (from spec doing-skill-eval.md):
|
|
10
|
+
* cache_ratio = cache_read_input_tokens / input_tokens
|
|
11
|
+
* total_tokens = sum of (input_tokens + cache_creation_input_tokens + cache_read_input_tokens + output_tokens) per entry
|
|
12
|
+
* wall_time = endTimestamp - startTimestamp (ms)
|
|
13
|
+
* context_burn = max used_percentage across entries
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
const fs = require('fs');
|
|
17
|
+
const path = require('path');
|
|
18
|
+
const os = require('os');
|
|
19
|
+
const readline = require('readline');
|
|
20
|
+
|
|
21
|
+
/**
 * Parse a JSONL file, yielding one parsed object per line.
 * Lines that are empty or fail to parse are silently skipped.
 *
 * Note on missing files: `fs.createReadStream` reports open errors
 * asynchronously on the stream (never as a synchronous throw for an
 * existing-path check), so a nonexistent file surfaces as an async
 * rejection (e.g. code ENOENT) from the read loop below. The sibling
 * test suite pins this rejection behavior.
 *
 * @param {string} filePath
 * @returns {Promise<object[]>} parsed entries, in file order
 * @throws rejects with the underlying fs error (e.g. ENOENT) when the
 *         file cannot be opened
 */
async function readJsonl(filePath) {
  const entries = [];

  // A previous revision wrapped createReadStream in try/catch to return []
  // for missing files; that catch was dead code because open errors are
  // delivered on the stream's 'error' event, not thrown synchronously.
  const stream = fs.createReadStream(filePath, { encoding: 'utf8' });
  const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });

  for await (const line of rl) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    try {
      entries.push(JSON.parse(trimmed));
    } catch (_) {
      // skip malformed lines
    }
  }

  return entries;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Keep only entries whose `timestamp` field parses to a time inside the
 * inclusive window [startTimestamp, endTimestamp].
 *
 * A null/undefined bound leaves that side of the window open; entries
 * with a missing or unparseable timestamp are always dropped.
 *
 * @param {object[]} entries
 * @param {number|null} startTimestamp — ms since epoch (inclusive)
 * @param {number|null} endTimestamp — ms since epoch (inclusive)
 * @returns {object[]} entries inside the window, original order preserved
 */
function filterByRange(entries, startTimestamp, endTimestamp) {
  const kept = [];
  for (const entry of entries) {
    const when = new Date(entry.timestamp).getTime();
    // Unparseable timestamps yield NaN and are excluded outright.
    if (Number.isNaN(when)) continue;
    const afterStart = startTimestamp == null || when >= startTimestamp;
    const beforeEnd = endTimestamp == null || when <= endTimestamp;
    if (afterStart && beforeEnd) {
      kept.push(entry);
    }
  }
  return kept;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Collect evaluation metrics from existing hook output files.
 *
 * AC-16: reads `.deepflow/token-history.jsonl` from the fixture's execution
 * to compute cache_ratio (cache_read / input_tokens) and total_tokens.
 * AC-17: metrics sourced from existing hook outputs; no new hooks installed.
 *
 * @param {string} deepflowDir — path to the fixture's `.deepflow/` directory
 * @param {number|null} startTimestamp — ms since epoch; open bound if null
 * @param {number|null} endTimestamp — ms since epoch; open bound if null
 * @returns {Promise<{
 *   cache_ratio: number,
 *   total_tokens: number,
 *   wall_time: number,
 *   context_burn: number,
 *   entry_count: number
 * }>}
 */
async function collectMetrics(deepflowDir, startTimestamp = null, endTimestamp = null) {
  const historyPath = path.join(deepflowDir, 'token-history.jsonl');
  const raw = await readJsonl(historyPath);
  const entries = filterByRange(raw, startTimestamp, endTimestamp);

  // Single pass: accumulate per-field token sums and the peak
  // context-window utilisation. Missing/non-numeric fields count as 0.
  let inputTotal = 0;
  let cacheReadTotal = 0;
  let cacheCreationTotal = 0;
  let outputTotal = 0;
  let peakUsedPct = 0;

  for (const entry of entries) {
    inputTotal += Number(entry.input_tokens) || 0;
    cacheReadTotal += Number(entry.cache_read_input_tokens) || 0;
    cacheCreationTotal += Number(entry.cache_creation_input_tokens) || 0;
    outputTotal += Number(entry.output_tokens) || 0;
    peakUsedPct = Math.max(peakUsedPct, Number(entry.used_percentage) || 0);
  }

  // cache_ratio: fraction of input tokens served from cache; 0 when no input
  // tokens were recorded (guards against division by zero).
  const cacheRatio = inputTotal > 0 ? cacheReadTotal / inputTotal : 0;

  // wall_time: caller-supplied timestamp delta; 0 unless both bounds given.
  const wallTime =
    startTimestamp != null && endTimestamp != null ? endTimestamp - startTimestamp : 0;

  return {
    cache_ratio: cacheRatio,
    // total_tokens: everything consumed — fresh input, cache creation,
    // cache reads, and output.
    total_tokens: inputTotal + cacheCreationTotal + cacheReadTotal + outputTotal,
    wall_time: wallTime,
    // context_burn: peak context-window utilisation across entries.
    context_burn: peakUsedPct,
    entry_count: entries.length,
  };
}
|
|
142
|
+
|
|
143
|
+
/**
 * Read tool-usage entries from `~/.claude/tool-usage.jsonl`, restricted to
 * the inclusive [startTimestamp, endTimestamp] window.
 * Provided for orchestrator use; secondary metric source (REQ-10).
 *
 * @param {number|null} startTimestamp — ms since epoch; open bound if null
 * @param {number|null} endTimestamp — ms since epoch; open bound if null
 * @returns {Promise<object[]>}
 */
async function readToolUsage(startTimestamp = null, endTimestamp = null) {
  const usagePath = path.join(os.homedir(), '.claude', 'tool-usage.jsonl');
  const parsed = await readJsonl(usagePath);
  return filterByRange(parsed, startTimestamp, endTimestamp);
}
|
|
156
|
+
|
|
157
|
+
// Public API: collectMetrics and readToolUsage are the primary entry
// points; readJsonl and filterByRange are exposed only so tests and the
// orchestrator can compose them directly.
module.exports = {
  collectMetrics,
  readToolUsage,
  // exported for testing / orchestrator composition
  readJsonl,
  filterByRange,
};
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Tests for src/eval/metric-collector.js — T6: wave-2 unit tests
|
|
5
|
+
*
|
|
6
|
+
* Validates metric computation from JSONL token-history data:
|
|
7
|
+
* - cache_ratio = cache_read_input_tokens / input_tokens (AC-16)
|
|
8
|
+
* - total_tokens sums all token fields
|
|
9
|
+
* - context_burn picks max used_percentage
|
|
10
|
+
* - wall_time is timestamp delta
|
|
11
|
+
* - filterByRange filters by ISO timestamps
|
|
12
|
+
* - readJsonl parses multi-line JSONL
|
|
13
|
+
* - Edge cases: empty file, single entry, no entries in range
|
|
14
|
+
*
|
|
15
|
+
* Uses Node.js built-in node:test to avoid adding dependencies.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
const { test, describe, beforeEach, afterEach } = require('node:test');
|
|
19
|
+
const assert = require('node:assert/strict');
|
|
20
|
+
const fs = require('node:fs');
|
|
21
|
+
const path = require('node:path');
|
|
22
|
+
const os = require('os');
|
|
23
|
+
|
|
24
|
+
const {
|
|
25
|
+
collectMetrics,
|
|
26
|
+
readJsonl,
|
|
27
|
+
filterByRange,
|
|
28
|
+
} = require('./metric-collector');
|
|
29
|
+
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Helpers
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
/** Create a fresh, uniquely-named temp directory for a single test. */
function makeTmpDir() {
  const prefix = path.join(os.tmpdir(), 'metric-collector-test-');
  return fs.mkdtempSync(prefix);
}
|
|
37
|
+
|
|
38
|
+
/**
 * Recursively delete a directory tree.
 *
 * Safe to call on a path that does not exist: `force: true` makes
 * fs.rmSync a no-op for missing paths, so the previous existsSync
 * pre-check was redundant (and a TOCTOU race between check and delete).
 *
 * @param {string} dir — path to remove
 */
function rmrf(dir) {
  fs.rmSync(dir, { recursive: true, force: true });
}
|
|
43
|
+
|
|
44
|
+
/**
 * Serialize an array of objects as JSONL at filePath — one JSON document
 * per line with a trailing newline — creating parent directories first.
 *
 * @param {string} filePath
 * @param {object[]} entries
 */
function writeJsonl(filePath, entries) {
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  const lines = entries.map((entry) => JSON.stringify(entry));
  fs.writeFileSync(filePath, `${lines.join('\n')}\n`, 'utf8');
}
|
|
52
|
+
|
|
53
|
+
/**
 * Build a token-history entry with sensible defaults; any field can be
 * overridden (or added) per test via `overrides`.
 *
 * @param {object} [overrides]
 * @returns {object}
 */
function makeEntry(overrides = {}) {
  const defaults = {
    timestamp: '2026-03-25T10:00:00.000Z',
    input_tokens: 1000,
    cache_read_input_tokens: 500,
    cache_creation_input_tokens: 200,
    output_tokens: 300,
    used_percentage: 50,
  };
  return Object.assign({}, defaults, overrides);
}
|
|
67
|
+
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
// readJsonl
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
// readJsonl contract: valid lines parse in order, blank/malformed lines are
// skipped silently, and a missing file surfaces as an async ENOENT rejection
// (createReadStream defers open errors to the stream — never a sync throw).
describe('readJsonl', () => {
  let tmpDir;

  beforeEach(() => {
    tmpDir = makeTmpDir();
  });

  afterEach(() => {
    rmrf(tmpDir);
  });

  test('parses multi-line JSONL correctly', async () => {
    const filePath = path.join(tmpDir, 'data.jsonl');
    const entries = [
      { a: 1, b: 'hello' },
      { a: 2, b: 'world' },
      { a: 3, b: 'foo' },
    ];
    writeJsonl(filePath, entries);

    const result = await readJsonl(filePath);
    assert.equal(result.length, 3);
    assert.deepEqual(result[0], { a: 1, b: 'hello' });
    assert.deepEqual(result[1], { a: 2, b: 'world' });
    assert.deepEqual(result[2], { a: 3, b: 'foo' });
  });

  test('returns empty array for empty file', async () => {
    const filePath = path.join(tmpDir, 'empty.jsonl');
    fs.writeFileSync(filePath, '', 'utf8');

    const result = await readJsonl(filePath);
    assert.deepEqual(result, []);
  });

  test('skips blank lines and malformed JSON', async () => {
    const filePath = path.join(tmpDir, 'messy.jsonl');
    // Mix of valid JSON, empty lines, junk, and whitespace-only lines:
    // only the two valid documents should survive.
    const content = [
      '{"valid": true}',
      '',
      'not-json',
      '   ',
      '{"also": "valid"}',
    ].join('\n');
    fs.writeFileSync(filePath, content, 'utf8');

    const result = await readJsonl(filePath);
    assert.equal(result.length, 2);
    assert.deepEqual(result[0], { valid: true });
    assert.deepEqual(result[1], { also: 'valid' });
  });

  test('rejects for non-existent file', async () => {
    const filePath = path.join(tmpDir, 'nope.jsonl');
    await assert.rejects(() => readJsonl(filePath), { code: 'ENOENT' });
  });
});
|
|
129
|
+
|
|
130
|
+
// ---------------------------------------------------------------------------
|
|
131
|
+
// filterByRange
|
|
132
|
+
// ---------------------------------------------------------------------------
|
|
133
|
+
|
|
134
|
+
// filterByRange contract: bounds are inclusive, a null bound is open on that
// side, and entries without a parseable timestamp are always excluded.
describe('filterByRange', () => {
  // Four hourly entries spanning 10:00–13:00 UTC on one day.
  const entries = [
    { timestamp: '2026-03-25T10:00:00.000Z', val: 1 },
    { timestamp: '2026-03-25T11:00:00.000Z', val: 2 },
    { timestamp: '2026-03-25T12:00:00.000Z', val: 3 },
    { timestamp: '2026-03-25T13:00:00.000Z', val: 4 },
  ];

  const t10 = new Date('2026-03-25T10:00:00.000Z').getTime();
  const t11 = new Date('2026-03-25T11:00:00.000Z').getTime();
  const t12 = new Date('2026-03-25T12:00:00.000Z').getTime();
  const t13 = new Date('2026-03-25T13:00:00.000Z').getTime();

  test('filters entries within inclusive range', () => {
    const result = filterByRange(entries, t11, t12);
    assert.equal(result.length, 2);
    assert.equal(result[0].val, 2);
    assert.equal(result[1].val, 3);
  });

  test('open start bound returns entries up to end', () => {
    const result = filterByRange(entries, null, t11);
    assert.equal(result.length, 2);
    assert.equal(result[0].val, 1);
    assert.equal(result[1].val, 2);
  });

  test('open end bound returns entries from start onward', () => {
    const result = filterByRange(entries, t12, null);
    assert.equal(result.length, 2);
    assert.equal(result[0].val, 3);
    assert.equal(result[1].val, 4);
  });

  test('both bounds null returns all entries', () => {
    const result = filterByRange(entries, null, null);
    assert.equal(result.length, 4);
  });

  test('no entries in range returns empty array', () => {
    const futureStart = new Date('2030-01-01T00:00:00.000Z').getTime();
    const futureEnd = new Date('2030-12-31T00:00:00.000Z').getTime();
    const result = filterByRange(entries, futureStart, futureEnd);
    assert.deepEqual(result, []);
  });

  test('entries without valid timestamp are excluded', () => {
    const bad = [
      { val: 1 },
      { timestamp: 'not-a-date', val: 2 },
      { timestamp: '2026-03-25T10:00:00.000Z', val: 3 },
    ];
    const result = filterByRange(bad, null, null);
    assert.equal(result.length, 1);
    assert.equal(result[0].val, 3);
  });

  test('exact boundary timestamps are inclusive', () => {
    const result = filterByRange(entries, t10, t13);
    assert.equal(result.length, 4);
  });
});
|
|
196
|
+
|
|
197
|
+
// ---------------------------------------------------------------------------
|
|
198
|
+
// collectMetrics
|
|
199
|
+
// ---------------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
// collectMetrics contract: metric formulas (cache_ratio, total_tokens,
// context_burn, wall_time), range filtering via entry_count, and zero-safe
// edge cases (empty file, empty range, missing fields, zero denominator).
describe('collectMetrics', () => {
  let tmpDir;

  beforeEach(() => {
    tmpDir = makeTmpDir();
  });

  afterEach(() => {
    rmrf(tmpDir);
  });

  test('computes correct cache_ratio from known data (AC-16)', async () => {
    // Two entries: input_tokens = 1000+2000 = 3000, cache_read = 500+1500 = 2000
    // cache_ratio = 2000 / 3000 = 0.6667
    const entries = [
      makeEntry({ input_tokens: 1000, cache_read_input_tokens: 500 }),
      makeEntry({ input_tokens: 2000, cache_read_input_tokens: 1500 }),
    ];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const result = await collectMetrics(tmpDir);
    // Float comparison with tolerance — never exact equality on ratios.
    assert.ok(Math.abs(result.cache_ratio - 2000 / 3000) < 1e-10,
      `Expected cache_ratio ~0.6667, got ${result.cache_ratio}`);
  });

  test('total_tokens sums all token fields correctly', async () => {
    const entries = [
      makeEntry({
        input_tokens: 100,
        cache_read_input_tokens: 200,
        cache_creation_input_tokens: 300,
        output_tokens: 400,
      }),
      makeEntry({
        input_tokens: 50,
        cache_read_input_tokens: 60,
        cache_creation_input_tokens: 70,
        output_tokens: 80,
      }),
    ];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const result = await collectMetrics(tmpDir);
    // total = (100+200+300+400) + (50+60+70+80) = 1000 + 260 = 1260
    assert.equal(result.total_tokens, 1260);
  });

  test('context_burn picks max used_percentage', async () => {
    const entries = [
      makeEntry({ used_percentage: 25 }),
      makeEntry({ used_percentage: 75 }),
      makeEntry({ used_percentage: 50 }),
    ];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const result = await collectMetrics(tmpDir);
    assert.equal(result.context_burn, 75);
  });

  test('wall_time is timestamp delta in ms', async () => {
    const entries = [makeEntry()];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const start = 1000;
    const end = 6000;
    const result = await collectMetrics(tmpDir, start, end);
    assert.equal(result.wall_time, 5000);
  });

  test('wall_time is 0 when start or end is null', async () => {
    const entries = [makeEntry()];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const result1 = await collectMetrics(tmpDir, null, 6000);
    assert.equal(result1.wall_time, 0);

    const result2 = await collectMetrics(tmpDir, 1000, null);
    assert.equal(result2.wall_time, 0);

    const result3 = await collectMetrics(tmpDir);
    assert.equal(result3.wall_time, 0);
  });

  test('entry_count reflects filtered entries', async () => {
    const t1 = '2026-03-25T10:00:00.000Z';
    const t2 = '2026-03-25T11:00:00.000Z';
    const t3 = '2026-03-25T12:00:00.000Z';
    const entries = [
      makeEntry({ timestamp: t1 }),
      makeEntry({ timestamp: t2 }),
      makeEntry({ timestamp: t3 }),
    ];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    // Degenerate window [t2, t2] — both bounds inclusive, so exactly one
    // entry should survive.
    const start = new Date(t2).getTime();
    const end = new Date(t2).getTime();
    const result = await collectMetrics(tmpDir, start, end);
    assert.equal(result.entry_count, 1);
  });

  test('empty file returns zero metrics', async () => {
    fs.writeFileSync(path.join(tmpDir, 'token-history.jsonl'), '', 'utf8');

    const result = await collectMetrics(tmpDir);
    assert.equal(result.cache_ratio, 0);
    assert.equal(result.total_tokens, 0);
    assert.equal(result.wall_time, 0);
    assert.equal(result.context_burn, 0);
    assert.equal(result.entry_count, 0);
  });

  test('single entry computes metrics correctly', async () => {
    const entries = [
      makeEntry({
        input_tokens: 800,
        cache_read_input_tokens: 600,
        cache_creation_input_tokens: 100,
        output_tokens: 200,
        used_percentage: 42,
      }),
    ];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const result = await collectMetrics(tmpDir);
    assert.equal(result.cache_ratio, 600 / 800);
    assert.equal(result.total_tokens, 800 + 600 + 100 + 200);
    assert.equal(result.context_burn, 42);
    assert.equal(result.entry_count, 1);
  });

  test('no entries in range returns zero metrics', async () => {
    const entries = [
      makeEntry({ timestamp: '2026-03-25T10:00:00.000Z' }),
    ];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const futureStart = new Date('2030-01-01T00:00:00.000Z').getTime();
    const futureEnd = new Date('2030-12-31T00:00:00.000Z').getTime();
    const result = await collectMetrics(tmpDir, futureStart, futureEnd);
    assert.equal(result.cache_ratio, 0);
    assert.equal(result.total_tokens, 0);
    assert.equal(result.context_burn, 0);
    assert.equal(result.entry_count, 0);
  });

  test('handles missing token fields gracefully (treated as 0)', async () => {
    const entries = [
      { timestamp: '2026-03-25T10:00:00.000Z', input_tokens: 500 },
    ];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const result = await collectMetrics(tmpDir);
    // cache_read=0, cache_creation=0, output=0
    assert.equal(result.cache_ratio, 0);
    assert.equal(result.total_tokens, 500);
    assert.equal(result.context_burn, 0);
    assert.equal(result.entry_count, 1);
  });

  test('division by zero: cache_ratio is 0 when input_tokens is 0', async () => {
    const entries = [
      makeEntry({ input_tokens: 0, cache_read_input_tokens: 0 }),
    ];
    writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);

    const result = await collectMetrics(tmpDir);
    assert.equal(result.cache_ratio, 0);
  });
});
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { queryExperiments } = require('./git-memory');
|
|
4
|
+
|
|
5
|
+
/**
 * Parse a secondaries string ("key=value key=value ...", space-separated,
 * as produced by formatSecondaries) into a plain key/value map.
 *
 * Tokens without '=' or with an empty key are ignored; a falsy input
 * yields an empty map. Values may themselves contain '=' — only the
 * first '=' in a token splits key from value.
 *
 * @param {string} secondaries
 * @returns {Object<string, string>}
 */
function parseSecondaries(secondaries) {
  const parsed = {};
  if (!secondaries) return parsed;
  for (const token of secondaries.split(/\s+/)) {
    const sep = token.indexOf('=');
    if (sep < 0) continue;
    const key = token.slice(0, sep);
    if (!key) continue;
    parsed[key] = token.slice(sep + 1);
  }
  return parsed;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Surface previously-reverted experiments that showed a positive signal on
 * newTarget (AC-14: after a --target pivot, parsed experiment history is
 * scanned and promising reverted experiments are resurfaced as candidates).
 *
 * A candidate is an experiment where status === 'reverted' and either:
 *  a) newTarget was the experiment's primary target with delta > 0
 *     (candidateSource 'primary'), or
 *  b) newTarget was recorded among the secondary metrics
 *     (candidateSource 'secondary') — included with candidateDelta null,
 *     since no baseline exists to compute a delta; callers should review
 *     the raw value.
 *
 * @param {object} opts
 * @param {string} opts.cwd - Working directory (git repo root)
 * @param {string} opts.skillName - Skill being evaluated
 * @param {string} opts.newTarget - The new primary metric after a pivot
 * @returns {Array<{hash, skillName, hypothesis, target, value, delta, status, secondaries, candidateValue, candidateDelta, candidateSource}>}
 */
function surfaceCandidates({ cwd, skillName, newTarget }) {
  const experiments = queryExperiments({ cwd, skillName });
  const reverted = experiments.filter((exp) => exp.status === 'reverted');

  // Small factory so both branches build candidates the same way.
  const asCandidate = (exp, value, delta, source) => ({
    ...exp,
    candidateValue: value,
    candidateDelta: delta,
    candidateSource: source,
  });

  const candidates = [];
  for (const exp of reverted) {
    if (exp.target === newTarget) {
      // Case A: newTarget was the primary metric for this experiment.
      if (exp.delta > 0) {
        candidates.push(asCandidate(exp, exp.value, exp.delta, 'primary'));
      }
    } else {
      // Case B: newTarget appears as a secondary metric.
      const secondaryMap = parseSecondaries(exp.secondaries);
      if (Object.prototype.hasOwnProperty.call(secondaryMap, newTarget)) {
        // delta unknown for secondaries — only raw value available.
        candidates.push(asCandidate(exp, secondaryMap[newTarget], null, 'secondary'));
      }
    }
  }

  return candidates;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Render surfaced candidates as human-readable text for stdout display.
 *
 * @param {Array} candidates - Result of surfaceCandidates()
 * @param {string} newTarget - The new primary metric name
 * @returns {string} multi-line report, or a one-line "none found" message
 */
function formatCandidates(candidates, newTarget) {
  if (candidates.length === 0) {
    return `No reverted experiments found with positive delta on target="${newTarget}".`;
  }

  const out = [
    `Reverted experiments with positive signal on "${newTarget}" (candidates for retry):`,
    '',
  ];

  for (const candidate of candidates) {
    // Primary candidates carry a computed delta; secondaries only a raw value.
    const hasDelta = candidate.candidateDelta !== null;
    const deltaStr = hasDelta
      ? `delta=+${candidate.candidateDelta}% (primary)`
      : `value=${candidate.candidateValue} (secondary — no delta available)`;

    out.push(`  [${candidate.hash}] ${candidate.skillName}: ${candidate.hypothesis}`);
    out.push(`    ${newTarget}=${candidate.candidateValue} ${deltaStr}`);
    out.push(`    original target: ${candidate.target}=${candidate.value} delta=${candidate.delta}%`);
    out.push('');
  }

  return out.join('\n').trimEnd();
}
|
|
113
|
+
|
|
114
|
+
// Public API: surfaceCandidates and formatCandidates implement the AC-14
// post-pivot retry-candidate flow; parseSecondaries is exposed for tests.
module.exports = {
  surfaceCandidates,
  formatCandidates,
  // exported for testing
  parseSecondaries,
};
|