deepflow 0.1.102 → 0.1.104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/bin/install-dynamic-hooks.test.js +461 -0
  2. package/bin/install.js +150 -204
  3. package/bin/install.test.js +214 -0
  4. package/bin/lineage-ingest.js +70 -0
  5. package/hooks/df-check-update.js +1 -0
  6. package/hooks/df-command-usage.js +305 -0
  7. package/hooks/df-command-usage.test.js +1019 -0
  8. package/hooks/df-dashboard-push.js +1 -0
  9. package/hooks/df-execution-history.js +1 -0
  10. package/hooks/df-explore-protocol.js +83 -0
  11. package/hooks/df-explore-protocol.test.js +228 -0
  12. package/hooks/df-hook-event-tags.test.js +127 -0
  13. package/hooks/df-invariant-check.js +1 -0
  14. package/hooks/df-quota-logger.js +1 -0
  15. package/hooks/df-snapshot-guard.js +1 -0
  16. package/hooks/df-spec-lint.js +58 -1
  17. package/hooks/df-spec-lint.test.js +412 -0
  18. package/hooks/df-statusline.js +1 -0
  19. package/hooks/df-subagent-registry.js +34 -14
  20. package/hooks/df-tool-usage.js +21 -3
  21. package/hooks/df-tool-usage.test.js +200 -0
  22. package/hooks/df-worktree-guard.js +1 -0
  23. package/package.json +1 -1
  24. package/src/commands/df/debate.md +1 -1
  25. package/src/commands/df/eval.md +117 -0
  26. package/src/commands/df/execute.md +1 -1
  27. package/src/commands/df/fix.md +104 -0
  28. package/src/eval/git-memory.js +159 -0
  29. package/src/eval/git-memory.test.js +439 -0
  30. package/src/eval/hypothesis.js +80 -0
  31. package/src/eval/hypothesis.test.js +169 -0
  32. package/src/eval/loop.js +378 -0
  33. package/src/eval/loop.test.js +306 -0
  34. package/src/eval/metric-collector.js +163 -0
  35. package/src/eval/metric-collector.test.js +369 -0
  36. package/src/eval/metric-pivot.js +119 -0
  37. package/src/eval/metric-pivot.test.js +350 -0
  38. package/src/eval/mutator-prompt.js +106 -0
  39. package/src/eval/mutator-prompt.test.js +180 -0
  40. package/templates/config-template.yaml +5 -0
  41. package/templates/eval-fixture-template/config.yaml +39 -0
  42. package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
  43. package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
  44. package/templates/eval-fixture-template/fixture/package.json +12 -0
  45. package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
  46. package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
  47. package/templates/eval-fixture-template/fixture/src/config.js +40 -0
  48. package/templates/eval-fixture-template/fixture/src/index.js +19 -0
  49. package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
  50. package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
  51. package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
  52. package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
  53. package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
  54. package/templates/eval-fixture-template/hypotheses.md +14 -0
  55. package/templates/eval-fixture-template/spec.md +34 -0
  56. package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
  57. package/templates/eval-fixture-template/tests/guard.test.js +108 -0
  58. package/templates/eval-fixture-template.test.js +318 -0
  59. package/templates/explore-agent.md +5 -74
  60. package/templates/explore-protocol.md +44 -0
  61. package/templates/spec-template.md +4 -0
@@ -0,0 +1,163 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Metric collector for df:eval.
5
+ *
6
+ * Reads `.deepflow/token-history.jsonl` and `~/.claude/tool-usage.jsonl`
7
+ * from existing hook outputs — no new instrumentation hooks installed (AC-17).
8
+ *
9
+ * Metric source mapping (from spec doing-skill-eval.md):
10
+ * cache_ratio = cache_read_input_tokens / input_tokens
11
+ * total_tokens = sum of (input_tokens + cache_creation_input_tokens + cache_read_input_tokens + output_tokens) per entry
12
+ * wall_time = endTimestamp - startTimestamp (ms)
13
+ * context_burn = max used_percentage across entries
14
+ */
15
+
16
+ const fs = require('fs');
17
+ const path = require('path');
18
+ const os = require('os');
19
+ const readline = require('readline');
20
+
21
+ /**
22
+ * Parse a JSONL file, yielding one parsed object per line.
23
+ * Lines that are empty or fail to parse are silently skipped.
24
+ *
25
+ * @param {string} filePath
26
+ * @returns {Promise<object[]>}
27
+ */
28
+ async function readJsonl(filePath) {
29
+ const entries = [];
30
+ let stream;
31
+ try {
32
+ stream = fs.createReadStream(filePath, { encoding: 'utf8' });
33
+ } catch (_) {
34
+ return entries;
35
+ }
36
+
37
+ const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
38
+
39
+ for await (const line of rl) {
40
+ const trimmed = line.trim();
41
+ if (!trimmed) continue;
42
+ try {
43
+ entries.push(JSON.parse(trimmed));
44
+ } catch (_) {
45
+ // skip malformed lines
46
+ }
47
+ }
48
+
49
+ return entries;
50
+ }
51
+
52
+ /**
53
+ * Filter entries whose `timestamp` field falls within [startTimestamp, endTimestamp].
54
+ * If startTimestamp or endTimestamp is null/undefined, that bound is open.
55
+ *
56
+ * @param {object[]} entries
57
+ * @param {number|null} startTimestamp — ms since epoch (inclusive)
58
+ * @param {number|null} endTimestamp — ms since epoch (inclusive)
59
+ * @returns {object[]}
60
+ */
61
+ function filterByRange(entries, startTimestamp, endTimestamp) {
62
+ return entries.filter((e) => {
63
+ const ts = new Date(e.timestamp).getTime();
64
+ if (isNaN(ts)) return false;
65
+ if (startTimestamp != null && ts < startTimestamp) return false;
66
+ if (endTimestamp != null && ts > endTimestamp) return false;
67
+ return true;
68
+ });
69
+ }
70
+
71
+ /**
72
+ * Collect evaluation metrics from existing hook output files.
73
+ *
74
+ * AC-16: reads `.deepflow/token-history.jsonl` from the fixture's execution
75
+ * to compute cache_ratio (cache_read / input_tokens) and total_tokens.
76
+ * AC-17: metrics sourced from existing hook outputs; no new hooks installed.
77
+ *
78
+ * @param {string} deepflowDir — path to the fixture's `.deepflow/` directory
79
+ * @param {number|null} startTimestamp — ms since epoch; open bound if null
80
+ * @param {number|null} endTimestamp — ms since epoch; open bound if null
81
+ * @returns {Promise<{
82
+ * cache_ratio: number,
83
+ * total_tokens: number,
84
+ * wall_time: number,
85
+ * context_burn: number,
86
+ * entry_count: number
87
+ * }>}
88
+ */
89
+ async function collectMetrics(deepflowDir, startTimestamp = null, endTimestamp = null) {
90
+ const tokenHistoryPath = path.join(deepflowDir, 'token-history.jsonl');
91
+
92
+ const allEntries = await readJsonl(tokenHistoryPath);
93
+ const entries = filterByRange(allEntries, startTimestamp, endTimestamp);
94
+
95
+ let sumInputTokens = 0;
96
+ let sumCacheRead = 0;
97
+ let sumCacheCreation = 0;
98
+ let sumOutputTokens = 0;
99
+ let maxUsedPercentage = 0;
100
+
101
+ for (const e of entries) {
102
+ const inputTokens = Number(e.input_tokens) || 0;
103
+ const cacheRead = Number(e.cache_read_input_tokens) || 0;
104
+ const cacheCreation = Number(e.cache_creation_input_tokens) || 0;
105
+ const outputTokens = Number(e.output_tokens) || 0;
106
+ const usedPct = Number(e.used_percentage) || 0;
107
+
108
+ sumInputTokens += inputTokens;
109
+ sumCacheRead += cacheRead;
110
+ sumCacheCreation += cacheCreation;
111
+ sumOutputTokens += outputTokens;
112
+
113
+ if (usedPct > maxUsedPercentage) {
114
+ maxUsedPercentage = usedPct;
115
+ }
116
+ }
117
+
118
+ // cache_ratio: fraction of input tokens served from cache.
119
+ // Use sumInputTokens as denominator; guard against division-by-zero.
120
+ const cache_ratio = sumInputTokens > 0 ? sumCacheRead / sumInputTokens : 0;
121
+
122
+ // total_tokens: all tokens consumed — input (fresh + cache-creation + cache-read) + output.
123
+ const total_tokens = sumInputTokens + sumCacheCreation + sumCacheRead + sumOutputTokens;
124
+
125
+ // wall_time: caller-supplied timestamp delta in ms.
126
+ const wall_time =
127
+ startTimestamp != null && endTimestamp != null
128
+ ? endTimestamp - startTimestamp
129
+ : 0;
130
+
131
+ // context_burn: peak context window utilisation across entries.
132
+ const context_burn = maxUsedPercentage;
133
+
134
+ return {
135
+ cache_ratio,
136
+ total_tokens,
137
+ wall_time,
138
+ context_burn,
139
+ entry_count: entries.length,
140
+ };
141
+ }
142
+
143
+ /**
144
+ * Read tool-usage entries from `~/.claude/tool-usage.jsonl` filtered by range.
145
+ * Provided for orchestrator use; secondary metric source (REQ-10).
146
+ *
147
+ * @param {number|null} startTimestamp
148
+ * @param {number|null} endTimestamp
149
+ * @returns {Promise<object[]>}
150
+ */
151
+ async function readToolUsage(startTimestamp = null, endTimestamp = null) {
152
+ const toolUsagePath = path.join(os.homedir(), '.claude', 'tool-usage.jsonl');
153
+ const allEntries = await readJsonl(toolUsagePath);
154
+ return filterByRange(allEntries, startTimestamp, endTimestamp);
155
+ }
156
+
157
// Public API. collectMetrics and readToolUsage are the primary entry points;
// readJsonl and filterByRange are building blocks exposed for unit tests and
// orchestrator composition.
module.exports = {
  collectMetrics,
  readToolUsage,
  // exported for testing / orchestrator composition
  readJsonl,
  filterByRange,
};
@@ -0,0 +1,369 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Tests for src/eval/metric-collector.js — T6: wave-2 unit tests
5
+ *
6
+ * Validates metric computation from JSONL token-history data:
7
+ * - cache_ratio = cache_read_input_tokens / input_tokens (AC-16)
8
+ * - total_tokens sums all token fields
9
+ * - context_burn picks max used_percentage
10
+ * - wall_time is timestamp delta
11
+ * - filterByRange filters by ISO timestamps
12
+ * - readJsonl parses multi-line JSONL
13
+ * - Edge cases: empty file, single entry, no entries in range
14
+ *
15
+ * Uses Node.js built-in node:test to avoid adding dependencies.
16
+ */
17
+
18
+ const { test, describe, beforeEach, afterEach } = require('node:test');
19
+ const assert = require('node:assert/strict');
20
+ const fs = require('node:fs');
21
+ const path = require('node:path');
22
+ const os = require('os');
23
+
24
+ const {
25
+ collectMetrics,
26
+ readJsonl,
27
+ filterByRange,
28
+ } = require('./metric-collector');
29
+
30
+ // ---------------------------------------------------------------------------
31
+ // Helpers
32
+ // ---------------------------------------------------------------------------
33
+
34
+ function makeTmpDir() {
35
+ return fs.mkdtempSync(path.join(os.tmpdir(), 'metric-collector-test-'));
36
+ }
37
+
38
+ function rmrf(dir) {
39
+ if (fs.existsSync(dir)) {
40
+ fs.rmSync(dir, { recursive: true, force: true });
41
+ }
42
+ }
43
+
44
+ /**
45
+ * Write a JSONL file from an array of objects.
46
+ */
47
+ function writeJsonl(filePath, entries) {
48
+ const content = entries.map((e) => JSON.stringify(e)).join('\n') + '\n';
49
+ fs.mkdirSync(path.dirname(filePath), { recursive: true });
50
+ fs.writeFileSync(filePath, content, 'utf8');
51
+ }
52
+
53
+ /**
54
+ * Build a token-history entry with sensible defaults.
55
+ */
56
+ function makeEntry(overrides = {}) {
57
+ return {
58
+ timestamp: '2026-03-25T10:00:00.000Z',
59
+ input_tokens: 1000,
60
+ cache_read_input_tokens: 500,
61
+ cache_creation_input_tokens: 200,
62
+ output_tokens: 300,
63
+ used_percentage: 50,
64
+ ...overrides,
65
+ };
66
+ }
67
+
68
+ // ---------------------------------------------------------------------------
69
+ // readJsonl
70
+ // ---------------------------------------------------------------------------
71
+
72
+ describe('readJsonl', () => {
73
+ let tmpDir;
74
+
75
+ beforeEach(() => {
76
+ tmpDir = makeTmpDir();
77
+ });
78
+
79
+ afterEach(() => {
80
+ rmrf(tmpDir);
81
+ });
82
+
83
+ test('parses multi-line JSONL correctly', async () => {
84
+ const filePath = path.join(tmpDir, 'data.jsonl');
85
+ const entries = [
86
+ { a: 1, b: 'hello' },
87
+ { a: 2, b: 'world' },
88
+ { a: 3, b: 'foo' },
89
+ ];
90
+ writeJsonl(filePath, entries);
91
+
92
+ const result = await readJsonl(filePath);
93
+ assert.equal(result.length, 3);
94
+ assert.deepEqual(result[0], { a: 1, b: 'hello' });
95
+ assert.deepEqual(result[1], { a: 2, b: 'world' });
96
+ assert.deepEqual(result[2], { a: 3, b: 'foo' });
97
+ });
98
+
99
+ test('returns empty array for empty file', async () => {
100
+ const filePath = path.join(tmpDir, 'empty.jsonl');
101
+ fs.writeFileSync(filePath, '', 'utf8');
102
+
103
+ const result = await readJsonl(filePath);
104
+ assert.deepEqual(result, []);
105
+ });
106
+
107
+ test('skips blank lines and malformed JSON', async () => {
108
+ const filePath = path.join(tmpDir, 'messy.jsonl');
109
+ const content = [
110
+ '{"valid": true}',
111
+ '',
112
+ 'not-json',
113
+ ' ',
114
+ '{"also": "valid"}',
115
+ ].join('\n');
116
+ fs.writeFileSync(filePath, content, 'utf8');
117
+
118
+ const result = await readJsonl(filePath);
119
+ assert.equal(result.length, 2);
120
+ assert.deepEqual(result[0], { valid: true });
121
+ assert.deepEqual(result[1], { also: 'valid' });
122
+ });
123
+
124
+ test('rejects for non-existent file', async () => {
125
+ const filePath = path.join(tmpDir, 'nope.jsonl');
126
+ await assert.rejects(() => readJsonl(filePath), { code: 'ENOENT' });
127
+ });
128
+ });
129
+
130
+ // ---------------------------------------------------------------------------
131
+ // filterByRange
132
+ // ---------------------------------------------------------------------------
133
+
134
+ describe('filterByRange', () => {
135
+ const entries = [
136
+ { timestamp: '2026-03-25T10:00:00.000Z', val: 1 },
137
+ { timestamp: '2026-03-25T11:00:00.000Z', val: 2 },
138
+ { timestamp: '2026-03-25T12:00:00.000Z', val: 3 },
139
+ { timestamp: '2026-03-25T13:00:00.000Z', val: 4 },
140
+ ];
141
+
142
+ const t10 = new Date('2026-03-25T10:00:00.000Z').getTime();
143
+ const t11 = new Date('2026-03-25T11:00:00.000Z').getTime();
144
+ const t12 = new Date('2026-03-25T12:00:00.000Z').getTime();
145
+ const t13 = new Date('2026-03-25T13:00:00.000Z').getTime();
146
+
147
+ test('filters entries within inclusive range', () => {
148
+ const result = filterByRange(entries, t11, t12);
149
+ assert.equal(result.length, 2);
150
+ assert.equal(result[0].val, 2);
151
+ assert.equal(result[1].val, 3);
152
+ });
153
+
154
+ test('open start bound returns entries up to end', () => {
155
+ const result = filterByRange(entries, null, t11);
156
+ assert.equal(result.length, 2);
157
+ assert.equal(result[0].val, 1);
158
+ assert.equal(result[1].val, 2);
159
+ });
160
+
161
+ test('open end bound returns entries from start onward', () => {
162
+ const result = filterByRange(entries, t12, null);
163
+ assert.equal(result.length, 2);
164
+ assert.equal(result[0].val, 3);
165
+ assert.equal(result[1].val, 4);
166
+ });
167
+
168
+ test('both bounds null returns all entries', () => {
169
+ const result = filterByRange(entries, null, null);
170
+ assert.equal(result.length, 4);
171
+ });
172
+
173
+ test('no entries in range returns empty array', () => {
174
+ const futureStart = new Date('2030-01-01T00:00:00.000Z').getTime();
175
+ const futureEnd = new Date('2030-12-31T00:00:00.000Z').getTime();
176
+ const result = filterByRange(entries, futureStart, futureEnd);
177
+ assert.deepEqual(result, []);
178
+ });
179
+
180
+ test('entries without valid timestamp are excluded', () => {
181
+ const bad = [
182
+ { val: 1 },
183
+ { timestamp: 'not-a-date', val: 2 },
184
+ { timestamp: '2026-03-25T10:00:00.000Z', val: 3 },
185
+ ];
186
+ const result = filterByRange(bad, null, null);
187
+ assert.equal(result.length, 1);
188
+ assert.equal(result[0].val, 3);
189
+ });
190
+
191
+ test('exact boundary timestamps are inclusive', () => {
192
+ const result = filterByRange(entries, t10, t13);
193
+ assert.equal(result.length, 4);
194
+ });
195
+ });
196
+
197
+ // ---------------------------------------------------------------------------
198
+ // collectMetrics
199
+ // ---------------------------------------------------------------------------
200
+
201
+ describe('collectMetrics', () => {
202
+ let tmpDir;
203
+
204
+ beforeEach(() => {
205
+ tmpDir = makeTmpDir();
206
+ });
207
+
208
+ afterEach(() => {
209
+ rmrf(tmpDir);
210
+ });
211
+
212
+ test('computes correct cache_ratio from known data (AC-16)', async () => {
213
+ // Two entries: input_tokens = 1000+2000 = 3000, cache_read = 500+1500 = 2000
214
+ // cache_ratio = 2000 / 3000 = 0.6667
215
+ const entries = [
216
+ makeEntry({ input_tokens: 1000, cache_read_input_tokens: 500 }),
217
+ makeEntry({ input_tokens: 2000, cache_read_input_tokens: 1500 }),
218
+ ];
219
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
220
+
221
+ const result = await collectMetrics(tmpDir);
222
+ assert.ok(Math.abs(result.cache_ratio - 2000 / 3000) < 1e-10,
223
+ `Expected cache_ratio ~0.6667, got ${result.cache_ratio}`);
224
+ });
225
+
226
+ test('total_tokens sums all token fields correctly', async () => {
227
+ const entries = [
228
+ makeEntry({
229
+ input_tokens: 100,
230
+ cache_read_input_tokens: 200,
231
+ cache_creation_input_tokens: 300,
232
+ output_tokens: 400,
233
+ }),
234
+ makeEntry({
235
+ input_tokens: 50,
236
+ cache_read_input_tokens: 60,
237
+ cache_creation_input_tokens: 70,
238
+ output_tokens: 80,
239
+ }),
240
+ ];
241
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
242
+
243
+ const result = await collectMetrics(tmpDir);
244
+ // total = (100+200+300+400) + (50+60+70+80) = 1000 + 260 = 1260
245
+ assert.equal(result.total_tokens, 1260);
246
+ });
247
+
248
+ test('context_burn picks max used_percentage', async () => {
249
+ const entries = [
250
+ makeEntry({ used_percentage: 25 }),
251
+ makeEntry({ used_percentage: 75 }),
252
+ makeEntry({ used_percentage: 50 }),
253
+ ];
254
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
255
+
256
+ const result = await collectMetrics(tmpDir);
257
+ assert.equal(result.context_burn, 75);
258
+ });
259
+
260
+ test('wall_time is timestamp delta in ms', async () => {
261
+ const entries = [makeEntry()];
262
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
263
+
264
+ const start = 1000;
265
+ const end = 6000;
266
+ const result = await collectMetrics(tmpDir, start, end);
267
+ assert.equal(result.wall_time, 5000);
268
+ });
269
+
270
+ test('wall_time is 0 when start or end is null', async () => {
271
+ const entries = [makeEntry()];
272
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
273
+
274
+ const result1 = await collectMetrics(tmpDir, null, 6000);
275
+ assert.equal(result1.wall_time, 0);
276
+
277
+ const result2 = await collectMetrics(tmpDir, 1000, null);
278
+ assert.equal(result2.wall_time, 0);
279
+
280
+ const result3 = await collectMetrics(tmpDir);
281
+ assert.equal(result3.wall_time, 0);
282
+ });
283
+
284
+ test('entry_count reflects filtered entries', async () => {
285
+ const t1 = '2026-03-25T10:00:00.000Z';
286
+ const t2 = '2026-03-25T11:00:00.000Z';
287
+ const t3 = '2026-03-25T12:00:00.000Z';
288
+ const entries = [
289
+ makeEntry({ timestamp: t1 }),
290
+ makeEntry({ timestamp: t2 }),
291
+ makeEntry({ timestamp: t3 }),
292
+ ];
293
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
294
+
295
+ const start = new Date(t2).getTime();
296
+ const end = new Date(t2).getTime();
297
+ const result = await collectMetrics(tmpDir, start, end);
298
+ assert.equal(result.entry_count, 1);
299
+ });
300
+
301
+ test('empty file returns zero metrics', async () => {
302
+ fs.writeFileSync(path.join(tmpDir, 'token-history.jsonl'), '', 'utf8');
303
+
304
+ const result = await collectMetrics(tmpDir);
305
+ assert.equal(result.cache_ratio, 0);
306
+ assert.equal(result.total_tokens, 0);
307
+ assert.equal(result.wall_time, 0);
308
+ assert.equal(result.context_burn, 0);
309
+ assert.equal(result.entry_count, 0);
310
+ });
311
+
312
+ test('single entry computes metrics correctly', async () => {
313
+ const entries = [
314
+ makeEntry({
315
+ input_tokens: 800,
316
+ cache_read_input_tokens: 600,
317
+ cache_creation_input_tokens: 100,
318
+ output_tokens: 200,
319
+ used_percentage: 42,
320
+ }),
321
+ ];
322
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
323
+
324
+ const result = await collectMetrics(tmpDir);
325
+ assert.equal(result.cache_ratio, 600 / 800);
326
+ assert.equal(result.total_tokens, 800 + 600 + 100 + 200);
327
+ assert.equal(result.context_burn, 42);
328
+ assert.equal(result.entry_count, 1);
329
+ });
330
+
331
+ test('no entries in range returns zero metrics', async () => {
332
+ const entries = [
333
+ makeEntry({ timestamp: '2026-03-25T10:00:00.000Z' }),
334
+ ];
335
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
336
+
337
+ const futureStart = new Date('2030-01-01T00:00:00.000Z').getTime();
338
+ const futureEnd = new Date('2030-12-31T00:00:00.000Z').getTime();
339
+ const result = await collectMetrics(tmpDir, futureStart, futureEnd);
340
+ assert.equal(result.cache_ratio, 0);
341
+ assert.equal(result.total_tokens, 0);
342
+ assert.equal(result.context_burn, 0);
343
+ assert.equal(result.entry_count, 0);
344
+ });
345
+
346
+ test('handles missing token fields gracefully (treated as 0)', async () => {
347
+ const entries = [
348
+ { timestamp: '2026-03-25T10:00:00.000Z', input_tokens: 500 },
349
+ ];
350
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
351
+
352
+ const result = await collectMetrics(tmpDir);
353
+ // cache_read=0, cache_creation=0, output=0
354
+ assert.equal(result.cache_ratio, 0);
355
+ assert.equal(result.total_tokens, 500);
356
+ assert.equal(result.context_burn, 0);
357
+ assert.equal(result.entry_count, 1);
358
+ });
359
+
360
+ test('division by zero: cache_ratio is 0 when input_tokens is 0', async () => {
361
+ const entries = [
362
+ makeEntry({ input_tokens: 0, cache_read_input_tokens: 0 }),
363
+ ];
364
+ writeJsonl(path.join(tmpDir, 'token-history.jsonl'), entries);
365
+
366
+ const result = await collectMetrics(tmpDir);
367
+ assert.equal(result.cache_ratio, 0);
368
+ });
369
+ });
@@ -0,0 +1,119 @@
1
+ 'use strict';
2
+
3
+ const { queryExperiments } = require('./git-memory');
4
+
5
+ /**
6
+ * Parses a secondaries string into a key/value map.
7
+ * Secondaries format: "key=value key=value ..." (space-separated, produced by formatSecondaries)
8
+ *
9
+ * @param {string} secondaries
10
+ * @returns {Object<string, string>}
11
+ */
12
+ function parseSecondaries(secondaries) {
13
+ if (!secondaries) return {};
14
+ const result = {};
15
+ for (const token of secondaries.split(/\s+/)) {
16
+ const eqIdx = token.indexOf('=');
17
+ if (eqIdx === -1) continue;
18
+ const key = token.slice(0, eqIdx);
19
+ const val = token.slice(eqIdx + 1);
20
+ if (key) result[key] = val;
21
+ }
22
+ return result;
23
+ }
24
+
25
+ /**
26
+ * Surfaces previously-reverted experiments that had a positive delta on newTarget.
27
+ *
28
+ * A candidate is an experiment where:
29
+ * - status === 'reverted' (the experiment was rolled back)
30
+ * - The newTarget metric was either:
31
+ * a) The primary target and had delta > 0, OR
32
+ * b) Recorded as a secondary metric (value parsed from secondaries string)
33
+ * — in this case the experiment is included as a candidate since we cannot
34
+ * compute a delta without a baseline; callers should review the raw value.
35
+ *
36
+ * AC-14: After --target pivot, git log --grep="experiment:" is parsed and
37
+ * previously-reverted experiments with positive delta on new target are
38
+ * surfaced as candidates.
39
+ *
40
+ * @param {object} opts
41
+ * @param {string} opts.cwd - Working directory (git repo root)
42
+ * @param {string} opts.skillName - Skill being evaluated
43
+ * @param {string} opts.newTarget - The new primary metric after a pivot
44
+ * @returns {Array<{hash, skillName, hypothesis, target, value, delta, status, secondaries, candidateValue, candidateDelta}>}
45
+ */
46
+ function surfaceCandidates({ cwd, skillName, newTarget }) {
47
+ const experiments = queryExperiments({ cwd, skillName });
48
+
49
+ const candidates = [];
50
+
51
+ for (const exp of experiments) {
52
+ if (exp.status !== 'reverted') continue;
53
+
54
+ // Case A: newTarget was the primary metric for this experiment
55
+ if (exp.target === newTarget) {
56
+ if (exp.delta > 0) {
57
+ candidates.push({
58
+ ...exp,
59
+ candidateValue: exp.value,
60
+ candidateDelta: exp.delta,
61
+ candidateSource: 'primary',
62
+ });
63
+ }
64
+ continue;
65
+ }
66
+
67
+ // Case B: newTarget appears as a secondary metric
68
+ const secondaryMap = parseSecondaries(exp.secondaries);
69
+ if (Object.prototype.hasOwnProperty.call(secondaryMap, newTarget)) {
70
+ candidates.push({
71
+ ...exp,
72
+ candidateValue: secondaryMap[newTarget],
73
+ candidateDelta: null, // delta unknown for secondaries — only raw value available
74
+ candidateSource: 'secondary',
75
+ });
76
+ }
77
+ }
78
+
79
+ return candidates;
80
+ }
81
+
82
+ /**
83
+ * Formats surfaced candidates for stdout display.
84
+ *
85
+ * @param {Array} candidates - Result of surfaceCandidates()
86
+ * @param {string} newTarget - The new primary metric name
87
+ * @returns {string}
88
+ */
89
+ function formatCandidates(candidates, newTarget) {
90
+ if (candidates.length === 0) {
91
+ return `No reverted experiments found with positive delta on target="${newTarget}".`;
92
+ }
93
+
94
+ const lines = [
95
+ `Reverted experiments with positive signal on "${newTarget}" (candidates for retry):`,
96
+ '',
97
+ ];
98
+
99
+ for (const c of candidates) {
100
+ const deltaStr =
101
+ c.candidateDelta !== null
102
+ ? `delta=+${c.candidateDelta}% (primary)`
103
+ : `value=${c.candidateValue} (secondary — no delta available)`;
104
+
105
+ lines.push(` [${c.hash}] ${c.skillName}: ${c.hypothesis}`);
106
+ lines.push(` ${newTarget}=${c.candidateValue} ${deltaStr}`);
107
+ lines.push(` original target: ${c.target}=${c.value} delta=${c.delta}%`);
108
+ lines.push('');
109
+ }
110
+
111
+ return lines.join('\n').trimEnd();
112
+ }
113
+
114
// Public API: surfaceCandidates + formatCandidates implement the AC-14
// pivot-candidate flow; parseSecondaries is exposed for unit tests.
module.exports = {
  surfaceCandidates,
  formatCandidates,
  // exported for testing
  parseSecondaries,
};
+ };