deepflow 0.1.102 → 0.1.104
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install-dynamic-hooks.test.js +461 -0
- package/bin/install.js +150 -204
- package/bin/install.test.js +214 -0
- package/bin/lineage-ingest.js +70 -0
- package/hooks/df-check-update.js +1 -0
- package/hooks/df-command-usage.js +305 -0
- package/hooks/df-command-usage.test.js +1019 -0
- package/hooks/df-dashboard-push.js +1 -0
- package/hooks/df-execution-history.js +1 -0
- package/hooks/df-explore-protocol.js +83 -0
- package/hooks/df-explore-protocol.test.js +228 -0
- package/hooks/df-hook-event-tags.test.js +127 -0
- package/hooks/df-invariant-check.js +1 -0
- package/hooks/df-quota-logger.js +1 -0
- package/hooks/df-snapshot-guard.js +1 -0
- package/hooks/df-spec-lint.js +58 -1
- package/hooks/df-spec-lint.test.js +412 -0
- package/hooks/df-statusline.js +1 -0
- package/hooks/df-subagent-registry.js +34 -14
- package/hooks/df-tool-usage.js +21 -3
- package/hooks/df-tool-usage.test.js +200 -0
- package/hooks/df-worktree-guard.js +1 -0
- package/package.json +1 -1
- package/src/commands/df/debate.md +1 -1
- package/src/commands/df/eval.md +117 -0
- package/src/commands/df/execute.md +1 -1
- package/src/commands/df/fix.md +104 -0
- package/src/eval/git-memory.js +159 -0
- package/src/eval/git-memory.test.js +439 -0
- package/src/eval/hypothesis.js +80 -0
- package/src/eval/hypothesis.test.js +169 -0
- package/src/eval/loop.js +378 -0
- package/src/eval/loop.test.js +306 -0
- package/src/eval/metric-collector.js +163 -0
- package/src/eval/metric-collector.test.js +369 -0
- package/src/eval/metric-pivot.js +119 -0
- package/src/eval/metric-pivot.test.js +350 -0
- package/src/eval/mutator-prompt.js +106 -0
- package/src/eval/mutator-prompt.test.js +180 -0
- package/templates/config-template.yaml +5 -0
- package/templates/eval-fixture-template/config.yaml +39 -0
- package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
- package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
- package/templates/eval-fixture-template/fixture/package.json +12 -0
- package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
- package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
- package/templates/eval-fixture-template/fixture/src/config.js +40 -0
- package/templates/eval-fixture-template/fixture/src/index.js +19 -0
- package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
- package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
- package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
- package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
- package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
- package/templates/eval-fixture-template/hypotheses.md +14 -0
- package/templates/eval-fixture-template/spec.md +34 -0
- package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
- package/templates/eval-fixture-template/tests/guard.test.js +108 -0
- package/templates/eval-fixture-template.test.js +318 -0
- package/templates/explore-agent.md +5 -74
- package/templates/explore-protocol.md +44 -0
- package/templates/spec-template.md +4 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
/**
 * Tests for hooks/df-tool-usage.js — T3: active_command field in tool-usage records
 *
 * Validates that the tool usage hook reads .deepflow/active-command.json marker
 * and includes the active_command field in tool-usage.jsonl records.
 *
 * Uses Node.js built-in node:test to avoid adding dependencies.
 */

'use strict';

const { test, describe, beforeEach, afterEach } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('node:fs');
const path = require('node:path');
// node: prefix for consistency with every other core-module require above.
const os = require('node:os');
const { execFileSync } = require('node:child_process');

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

// Absolute path to the hook under test (lives next to this test file).
const HOOK_PATH = path.resolve(__dirname, 'df-tool-usage.js');
|
|
24
|
+
|
|
25
|
+
/**
 * Create a fresh temporary directory for one test case.
 * mkdtempSync appends a unique suffix to the given prefix.
 */
function makeTmpDir() {
  const prefix = path.join(os.tmpdir(), 'df-tool-usage-test-');
  return fs.mkdtempSync(prefix);
}
|
|
28
|
+
|
|
29
|
+
/**
 * Recursively delete a directory (or file).
 *
 * fs.rmSync with `force: true` already ignores missing paths, so the
 * previous fs.existsSync pre-check was redundant and introduced a
 * check-then-act (TOCTOU) race.
 *
 * @param {string} dir - Path to remove.
 */
function rmrf(dir) {
  fs.rmSync(dir, { recursive: true, force: true });
}
|
|
34
|
+
|
|
35
|
+
/**
 * Run the tool usage hook as a child process with JSON piped to stdin.
 * Overrides HOME so tool-usage.jsonl goes to a temp location.
 * Returns { stdout, stderr, code }.
 */
function runHook(input, { cwd, env: extraEnv } = {}) {
  const payload = typeof input === 'string' ? input : JSON.stringify(input);
  const workDir = cwd || os.tmpdir();
  // Redirect HOME so the hook writes its log under the temp dir,
  // regardless of what extraEnv carries.
  const childEnv = { ...process.env, ...extraEnv, HOME: workDir };
  const options = {
    input: payload,
    cwd: workDir,
    encoding: 'utf8',
    timeout: 5000,
    env: childEnv,
  };
  try {
    const stdout = execFileSync(process.execPath, [HOOK_PATH], options);
    return { stdout, stderr: '', code: 0 };
  } catch (err) {
    const stdout = err.stdout || '';
    const stderr = err.stderr || '';
    const code = err.status ?? 1;
    return { stdout, stderr, code };
  }
}
|
|
66
|
+
|
|
67
|
+
/**
 * Build a minimal PostToolUse event payload.
 */
function makeToolInput(cwd) {
  const payload = {
    session_id: 'tool-test-session',
    tool_name: 'Read',
    tool_input: { file_path: '/tmp/test.js' },
    tool_response: { content: 'file contents here' },
    cwd,
  };
  return payload;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Read the last record from tool-usage.jsonl.
 * Returns null when the log file has not been created yet.
 */
function readLastToolRecord(homeDir) {
  const logPath = path.join(homeDir, '.claude', 'tool-usage.jsonl');
  if (!fs.existsSync(logPath)) {
    return null;
  }
  const raw = fs.readFileSync(logPath, 'utf8');
  const lines = raw.trim().split('\n');
  return JSON.parse(lines.at(-1));
}
|
|
89
|
+
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
// T3: active_command field in tool-usage records
|
|
92
|
+
// ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
// End-to-end suite: each case runs the real hook binary (via runHook) against
// a throwaway HOME/cwd and inspects the last tool-usage.jsonl record.
describe('T3 — tool-usage active_command field', () => {
  let tmpDir;
  let deepflowDir;

  beforeEach(() => {
    tmpDir = makeTmpDir();
    deepflowDir = path.join(tmpDir, '.deepflow');
    fs.mkdirSync(deepflowDir, { recursive: true });
    // Create .claude dir for tool-usage.jsonl output
    fs.mkdirSync(path.join(tmpDir, '.claude'), { recursive: true });
  });

  afterEach(() => {
    rmrf(tmpDir);
  });

  test('active_command is set when active-command.json marker exists', () => {
    // Happy path: valid marker with a command field.
    fs.writeFileSync(
      path.join(deepflowDir, 'active-command.json'),
      JSON.stringify({ command: 'df:plan', started_at: new Date().toISOString() })
    );

    const input = makeToolInput(tmpDir);
    const { code } = runHook(input, { cwd: tmpDir });
    assert.equal(code, 0, 'Hook should exit successfully');

    const record = readLastToolRecord(tmpDir);
    assert.ok(record, 'Tool usage record should exist');
    assert.equal(record.active_command, 'df:plan', 'active_command should match marker');
  });

  test('active_command is null when no marker file exists', () => {
    const input = makeToolInput(tmpDir);
    const { code } = runHook(input, { cwd: tmpDir });
    assert.equal(code, 0);

    const record = readLastToolRecord(tmpDir);
    assert.ok(record, 'Tool usage record should exist');
    assert.equal(record.active_command, null, 'active_command should be null without marker');
  });

  test('active_command is null when marker file contains corrupt JSON', () => {
    // Hook must treat an unparsable marker the same as a missing one.
    fs.writeFileSync(
      path.join(deepflowDir, 'active-command.json'),
      '{{corrupt json'
    );

    const input = makeToolInput(tmpDir);
    const { code } = runHook(input, { cwd: tmpDir });
    assert.equal(code, 0, 'Hook should not crash on corrupt marker');

    const record = readLastToolRecord(tmpDir);
    assert.ok(record);
    assert.equal(record.active_command, null, 'active_command should be null for corrupt marker');
  });

  test('active_command is null when marker has no command field', () => {
    fs.writeFileSync(
      path.join(deepflowDir, 'active-command.json'),
      JSON.stringify({ other_field: 'value' })
    );

    const input = makeToolInput(tmpDir);
    const { code } = runHook(input, { cwd: tmpDir });
    assert.equal(code, 0);

    const record = readLastToolRecord(tmpDir);
    assert.ok(record);
    assert.equal(record.active_command, null, 'active_command should be null when command field missing');
  });

  test('active_command field always present in tool-usage record schema', () => {
    // Schema check: the key must exist even when its value is null.
    const input = makeToolInput(tmpDir);
    runHook(input, { cwd: tmpDir });

    const record = readLastToolRecord(tmpDir);
    assert.ok(record);
    assert.ok('active_command' in record, 'active_command key must always be present');
    // Verify other expected fields
    assert.ok('timestamp' in record);
    assert.ok('session_id' in record);
    assert.ok('tool_name' in record);
    assert.ok('phase' in record);
  });

  test('active_command reads df:execute correctly', () => {
    fs.writeFileSync(
      path.join(deepflowDir, 'active-command.json'),
      JSON.stringify({ command: 'df:execute' })
    );

    const input = makeToolInput(tmpDir);
    runHook(input, { cwd: tmpDir });

    const record = readLastToolRecord(tmpDir);
    assert.ok(record);
    assert.equal(record.active_command, 'df:execute');
  });

  test('hook exits 0 even when marker is unreadable (permissions)', () => {
    // Write marker then make the deepflow dir inaccessible won't work on all OS.
    // Instead, set cwd to a non-existent path to trigger the fallback.
    const input = makeToolInput('/nonexistent/path/that/does/not/exist');
    const { code } = runHook(input, { cwd: tmpDir });
    assert.equal(code, 0, 'Hook should always exit 0');
  });
});
|
package/package.json
CHANGED
|
@@ -30,7 +30,7 @@ Coordinate reasoner agents to debate a problem from multiple perspectives, then
|
|
|
30
30
|
Summarize conversation context in ~200 words: core problem, requirements, constraints, user priorities. Passed to each perspective agent.
|
|
31
31
|
|
|
32
32
|
### 2. GATHER CODEBASE CONTEXT
|
|
33
|
-
Glob/Grep
|
|
33
|
+
Prefer LSP documentSymbol to understand file structure, then Read with offset/limit on relevant ranges only (never read full files). Glob/Grep to locate files (up to 5-6, focus on core logic). Produce ~300 word codebase summary: what exists, key interfaces, current limitations, dependencies. Passed to every agent.
|
|
34
34
|
|
|
35
35
|
### 3. SPAWN PERSPECTIVES
|
|
36
36
|
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:eval
|
|
3
|
+
description: Evaluate a skill or command against a benchmark suite, or scaffold a new benchmark directory
|
|
4
|
+
allowed-tools: [Read, Bash, Write, Glob, Grep]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# /df:eval — Skill Evaluation
|
|
8
|
+
|
|
9
|
+
Run a benchmark suite against a skill/command, or scaffold a new benchmark directory.
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
/df:eval --scaffold benchmarks/<name>/ # Create benchmark directory structure
|
|
15
|
+
/df:eval benchmarks/<name>/ # Run benchmark suite (reads hypotheses.md)
|
|
16
|
+
/df:eval benchmarks/<name>/ --hypothesis "reduce token use" # Override hypothesis explicitly
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Subcommands
|
|
20
|
+
|
|
21
|
+
### `--scaffold <target-dir>`
|
|
22
|
+
|
|
23
|
+
Creates a benchmark directory from the fixture template at `templates/eval-fixture-template/`.
|
|
24
|
+
|
|
25
|
+
**What gets created:**
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
<target-dir>/
|
|
29
|
+
fixture/ # Minimal repo fixture (hooks, specs, src, package.json)
|
|
30
|
+
tests/ # Behavior and guard test files
|
|
31
|
+
spec.md # Benchmark objective and acceptance criteria
|
|
32
|
+
config.yaml # Benchmark configuration (skill under test, thresholds)
|
|
33
|
+
hypotheses.md # Hypotheses to validate
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Steps:**
|
|
37
|
+
|
|
38
|
+
1. Validate `<target-dir>` argument is provided; abort with usage hint if missing.
|
|
39
|
+
2. Check `<target-dir>` does not already exist; abort with error if it does.
|
|
40
|
+
3. Copy `templates/eval-fixture-template/` recursively to `<target-dir>`.
|
|
41
|
+
4. Confirm with summary:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
Created benchmark scaffold at <target-dir>/
|
|
45
|
+
fixture/ - minimal repo fixture
|
|
46
|
+
tests/ - behavior.test.js, guard.test.js
|
|
47
|
+
spec.md - edit to define benchmark objective
|
|
48
|
+
config.yaml - edit to set skill under test and thresholds
|
|
49
|
+
hypotheses.md - edit to define hypotheses
|
|
50
|
+
|
|
51
|
+
Next: edit spec.md and config.yaml, then run /df:eval <target-dir>/
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
**Implementation:**
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Parse --scaffold flag and target dir from $ARGUMENTS
|
|
58
|
+
# e.g. /df:eval --scaffold benchmarks/my-bench/
|
|
59
|
+
ARGS="$ARGUMENTS"
|
|
60
|
+
TARGET=$(echo "$ARGS" | sed 's/--scaffold[[:space:]]*//')
|
|
61
|
+
TEMPLATE="templates/eval-fixture-template"
|
|
62
|
+
|
|
63
|
+
if [ -z "$TARGET" ]; then
|
|
64
|
+
echo "Error: target directory required. Usage: /df:eval --scaffold benchmarks/<name>/"
|
|
65
|
+
exit 1
|
|
66
|
+
fi
|
|
67
|
+
|
|
68
|
+
if [ -d "$TARGET" ]; then
|
|
69
|
+
echo "Error: $TARGET already exists."
|
|
70
|
+
exit 1
|
|
71
|
+
fi
|
|
72
|
+
|
|
73
|
+
cp -r "$TEMPLATE/" "$TARGET"
|
|
74
|
+
echo "Created benchmark scaffold at $TARGET"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### `--hypothesis <text>`
|
|
78
|
+
|
|
79
|
+
Overrides the mutation hypothesis for the eval session. Without this flag the
|
|
80
|
+
loop reads `{benchDir}/hypotheses.md` and uses the first list item it finds.
|
|
81
|
+
|
|
82
|
+
**Hypothesis resolution order:**
|
|
83
|
+
|
|
84
|
+
1. `--hypothesis "<text>"` flag value — used as-is.
|
|
85
|
+
2. `{benchDir}/hypotheses.md` first list item (ordered or unordered markdown list).
|
|
86
|
+
3. Error if neither source is available.
|
|
87
|
+
|
|
88
|
+
**Module:** `src/eval/hypothesis.js` — `loadHypothesis({ flag, benchDir })`
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Main Eval Loop (T9 — implemented)
|
|
93
|
+
|
|
94
|
+
Running `/df:eval benchmarks/<name>/` without `--scaffold` runs the Karpathy loop:
|
|
95
|
+
|
|
96
|
+
1. Load `benchmarks/<name>/config.yaml` — skill under test, thresholds, iteration count
|
|
97
|
+
2. Resolve hypothesis via `--hypothesis` flag or `benchmarks/<name>/hypotheses.md` (first list item)
|
|
98
|
+
3. Create a worktree-isolated branch for the session (`eval/<skill>/<timestamp>`)
|
|
99
|
+
4. **Loop** (until Ctrl+C or `--loop N`):
|
|
100
|
+
a. Mutate skill file via agent prompt built from current content + history
|
|
101
|
+
b. Commit experiment (`status:pending`)
|
|
102
|
+
c. Run guard check (build + test commands from config)
|
|
103
|
+
- Guard fail → `git revert`, log `status:guard_fail`, next iteration
|
|
104
|
+
d. Collect metrics from `.deepflow/` JSONL files
|
|
105
|
+
e. Compare target metric against baseline
|
|
106
|
+
- Improved → log `status:kept`, update baseline
|
|
107
|
+
- Regression → `git revert`, log `status:reverted`
|
|
108
|
+
f. Record secondary metrics in commit message (never influence keep/revert)
|
|
109
|
+
|
|
110
|
+
**Implementation:** `src/eval/loop.js` (`runEvalLoop`), `src/eval/hypothesis.js` (`loadHypothesis`)
|
|
111
|
+
|
|
112
|
+
## Rules
|
|
113
|
+
|
|
114
|
+
- `--scaffold` never overwrites an existing directory
|
|
115
|
+
- Template is always copied from `templates/eval-fixture-template/`
|
|
116
|
+
- Main eval loop is non-deterministic by design — it samples skill behavior across N runs
|
|
117
|
+
- No LLM judges another LLM — only objective metrics (file diffs, test results, token counts) are used
|
|
@@ -376,7 +376,7 @@ Success criteria: {ACs from spec relevant to this task}
|
|
|
376
376
|
{TASK_DETAIL if available, else inline block:}
|
|
377
377
|
Impact: Callers: {file} ({why}) | Duplicates: [active→consolidate] [dead→DELETE] | Data flow: {consumers}
|
|
378
378
|
Prior tasks: {dep_id}: {summary}
|
|
379
|
-
Steps: 1. chub search/get for APIs 2. LSP findReferences, add unlisted callers 3.
|
|
379
|
+
Steps: 1. chub search/get for APIs 2. LSP findReferences, add unlisted callers 3. LSP documentSymbol on Impact files → Read with offset/limit on relevant ranges only (never read full files) 4. Implement 5. Commit
|
|
380
380
|
--- END ---
|
|
381
381
|
Duplicates: [active]→consolidate [dead]→DELETE. ONLY job: code+commit. No merge/rename/checkout.
|
|
382
382
|
Last line of your response MUST be: TASK_STATUS:pass (if successful) or TASK_STATUS:fail (if failed) or TASK_STATUS:revert (if reverted)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: df:fix
|
|
3
|
+
description: Create a new spec derived from a completed spec to address issues, regressions, or unmet acceptance criteria
|
|
4
|
+
allowed-tools: [Read, Write, AskUserQuestion]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# /df:fix {done-spec-name} — Create Fix Spec from Completed Spec
|
|
8
|
+
|
|
9
|
+
## Purpose
|
|
10
|
+
|
|
11
|
+
Creates a new spec file pre-populated with lineage from a `done-*` spec. Use when a completed feature has regressions, unmet ACs, or needs targeted fixes without reopening the original spec.
|
|
12
|
+
|
|
13
|
+
## Behavior
|
|
14
|
+
|
|
15
|
+
### 1. VALIDATE ARGUMENT
|
|
16
|
+
|
|
17
|
+
The command receives one argument: `{done-spec-name}` (e.g., `done-auth`).
|
|
18
|
+
|
|
19
|
+
If no argument is provided, ask:
|
|
20
|
+
```
|
|
21
|
+
Which completed spec needs a fix? (e.g., done-auth)
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### 2. READ PARENT SPEC
|
|
25
|
+
|
|
26
|
+
Read `specs/{done-spec-name}.md`.
|
|
27
|
+
|
|
28
|
+
If the file does not exist, show:
|
|
29
|
+
```
|
|
30
|
+
Error: specs/{done-spec-name}.md not found.
|
|
31
|
+
Make sure the spec exists and uses the done-* prefix.
|
|
32
|
+
```
|
|
33
|
+
Then stop.
|
|
34
|
+
|
|
35
|
+
Extract from the parent spec:
|
|
36
|
+
- **Objective** (from `## Objective` section)
|
|
37
|
+
- **Acceptance Criteria** (all items from `## Acceptance Criteria` section)
|
|
38
|
+
|
|
39
|
+
### 3. ASK WHAT NEEDS FIXING
|
|
40
|
+
|
|
41
|
+
Use `AskUserQuestion` to ask (max 4 questions per call):
|
|
42
|
+
|
|
43
|
+
1. What is broken or not working as expected?
|
|
44
|
+
2. Which acceptance criteria from the original spec are failing or incomplete? (show the extracted ACs as reference)
|
|
45
|
+
3. Any new constraints or scope boundaries for this fix?
|
|
46
|
+
|
|
47
|
+
### 4. CREATE FIX SPEC
|
|
48
|
+
|
|
49
|
+
Determine a short name for the fix spec. Default: `fix-{done-spec-name-without-done-prefix}` (e.g., `fix-auth`).
|
|
50
|
+
|
|
51
|
+
Create `specs/{fix-name}.md`:
|
|
52
|
+
|
|
53
|
+
```markdown
|
|
54
|
+
---
|
|
55
|
+
derives-from: {done-spec-name}
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
# Fix: {Title derived from done-spec-name}
|
|
59
|
+
|
|
60
|
+
## Objective
|
|
61
|
+
|
|
62
|
+
Fix issues in `{done-spec-name}`: {one sentence summarizing what needs to be fixed, from user input}
|
|
63
|
+
|
|
64
|
+
## Requirements
|
|
65
|
+
|
|
66
|
+
- **REQ-1**: [Requirement based on user-described issue]
|
|
67
|
+
|
|
68
|
+
## Constraints
|
|
69
|
+
|
|
70
|
+
- Scope limited to fixing regressions/gaps from `{done-spec-name}`
|
|
71
|
+
- Must not break passing ACs from the parent spec
|
|
72
|
+
|
|
73
|
+
## Acceptance Criteria
|
|
74
|
+
|
|
75
|
+
<!-- Failing ACs carried over from parent spec -->
|
|
76
|
+
{carried-over failing ACs as unchecked items}
|
|
77
|
+
|
|
78
|
+
<!-- New ACs for this fix -->
|
|
79
|
+
- [ ] {new criterion from user input, if any}
|
|
80
|
+
|
|
81
|
+
## Technical Notes
|
|
82
|
+
|
|
83
|
+
Parent spec: `specs/{done-spec-name}.md`
|
|
84
|
+
Parent objective: {parent objective text}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 5. CONFIRM
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
Created specs/{fix-name}.md
|
|
91
|
+
|
|
92
|
+
derives-from: {done-spec-name}
|
|
93
|
+
Carried over {N} ACs from parent spec
|
|
94
|
+
|
|
95
|
+
Next: Run /df:plan {fix-name} to generate fix tasks
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Rules
|
|
99
|
+
|
|
100
|
+
- Always set `derives-from` in the frontmatter — this is the lineage anchor
|
|
101
|
+
- Carry over only ACs that are failing or unverified; do not duplicate passing ones
|
|
102
|
+
- Keep fix specs narrowly scoped — no scope creep beyond the stated issue
|
|
103
|
+
- Do not reopen or modify the parent `done-*` spec
|
|
104
|
+
- Fix spec name must start with `fix-` by default; user may override
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { execSync } = require('child_process');
|
|
4
|
+
|
|
5
|
+
/**
 * Formats the commit message for an experiment commit.
 * Format: experiment({skillName}): {hypothesis} | {target}={value} delta={delta}% {status} | {secondaries}
 */
function formatCommitMessage({ skillName, hypothesis, target, value, delta, status, secondaries }) {
  const tail = secondaries != null ? String(secondaries) : '';
  const metrics = `${target}=${value} delta=${delta}% ${status}`;
  return `experiment(${skillName}): ${hypothesis} | ${metrics} | ${tail}`;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Commits all staged/unstaged changes as an experiment commit.
 * AC-10, AC-13: Each experiment gets exactly one commit before verification.
 *
 * @param {object} opts
 * @param {string} opts.cwd - Working directory (git repo root)
 * @param {string} opts.skillName - Skill being evaluated
 * @param {string} opts.hypothesis - Short hypothesis string
 * @param {string} opts.target - Primary metric name
 * @param {string|number} opts.value - Primary metric value
 * @param {string|number} opts.delta - Delta percentage (numeric, sign included)
 * @param {string} opts.status - "pass" | "fail" | "inconclusive"
 * @param {string} [opts.secondaries] - Secondary metrics string
 * @returns {string} The commit hash (short)
 */
function commitExperiment({ cwd, skillName, hypothesis, target, value, delta, status, secondaries }) {
  const message = formatCommitMessage({ skillName, hypothesis, target, value, delta, status, secondaries });

  // Pass the message as a raw argv entry instead of interpolating it into a
  // shell string: JSON.stringify double-quotes the text, but `$`, backticks
  // and `\` inside double quotes are still expanded by the shell, so a
  // hypothesis containing them would corrupt (or inject into) the command.
  const { execFileSync } = require('child_process');

  // Stage all changes so the commit captures the experiment state
  execSync('git add -A', { cwd, stdio: 'pipe' });
  execFileSync('git', ['commit', '-m', message], { cwd, stdio: 'pipe' });

  const hash = execSync('git rev-parse --short HEAD', { cwd, stdio: 'pipe' }).toString().trim();
  return hash;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Reverts the HEAD commit using `git revert --no-edit`.
 * AC-7: Keeps failed experiment in history (no reset/amend).
 *
 * @param {object} opts
 * @param {string} opts.cwd - Working directory (git repo root)
 * @returns {string} The revert commit hash (short)
 */
function revertExperiment({ cwd }) {
  const run = (cmd) => execSync(cmd, { cwd, stdio: 'pipe' });
  run('git revert HEAD --no-edit');
  return run('git rev-parse --short HEAD').toString().trim();
}
|
|
53
|
+
|
|
54
|
+
/**
 * Parses a single commit subject line into a structured experiment record.
 * Returns null if the line does not match the experiment format.
 *
 * @param {string} hash
 * @param {string} subject
 * @returns {object|null}
 */
function parseExperimentLine(hash, subject) {
  // Subject layout: experiment({skillName}): {hypothesis} | {metrics} | {secondaries}
  const OUTER = /^experiment\(([^)]+)\):\s*(.+?)\s*\|\s*(.+?)\s*\|\s*(.*)$/;
  const outer = OUTER.exec(subject);
  if (outer === null) return null;

  const skillName = outer[1];
  const hypothesis = outer[2];
  const metricsPart = outer[3];
  const secondaries = outer[4];

  // Metrics layout: {target}={value} delta={delta}% {status}
  const METRICS = /^(\S+)=(\S+)\s+delta=([-+]?[\d.]+)%\s+(\S+)$/;
  const metrics = METRICS.exec(metricsPart);
  if (metrics === null) return null;

  return {
    hash,
    skillName,
    hypothesis,
    target: metrics[1],
    value: metrics[2],
    delta: parseFloat(metrics[3]),
    status: metrics[4],
    secondaries: secondaries.trim(),
  };
}
|
|
86
|
+
|
|
87
|
+
/**
 * Queries git log for experiment commits matching a given skill.
 * AC-10: Uses `git log --grep` to retrieve complete experiment history.
 *
 * @param {object} opts
 * @param {string} opts.cwd - Working directory (git repo root)
 * @param {string} [opts.skillName] - If provided, filters by experiment({skillName}):
 * @returns {Array<{hash, skillName, hypothesis, target, value, delta, status, secondaries}>}
 */
function queryExperiments({ cwd, skillName }) {
  const grepPattern = skillName
    ? `experiment(${skillName}):`
    : 'experiment(';

  // Pass the pattern as a raw argv entry instead of interpolating it into a
  // shell string — a skillName containing quotes, `$` or backticks would
  // otherwise be interpreted by the shell before git ever saw it.
  const { execFileSync } = require('child_process');

  let output;
  try {
    output = execFileSync(
      'git',
      ['log', `--grep=${grepPattern}`, '--format=%H %s'],
      { cwd, stdio: 'pipe' }
    ).toString().trim();
  } catch {
    // Not a git repo / git missing: treat as "no history" rather than crash.
    return [];
  }

  if (!output) return [];

  const results = [];
  for (const line of output.split('\n')) {
    // Each line is "<hash> <subject>"; skip anything without a subject.
    const spaceIdx = line.indexOf(' ');
    if (spaceIdx === -1) continue;
    const hash = line.slice(0, spaceIdx);
    const subject = line.slice(spaceIdx + 1);
    const parsed = parseExperimentLine(hash, subject);
    if (parsed) results.push(parsed);
  }
  return results;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Returns a formatted history string suitable for inclusion in a mutator prompt.
 * AC-10: Provides the complete experiment history for a skill.
 *
 * @param {object} opts
 * @param {string} opts.cwd - Working directory (git repo root)
 * @param {string} [opts.skillName] - Filter by skill name
 * @param {number} [opts.maxEntries=20] - Maximum number of entries to return
 * @returns {string} Formatted history or "(no experiment history)" if empty
 */
function getExperimentHistory({ cwd, skillName, maxEntries = 20 }) {
  const entries = queryExperiments({ cwd, skillName }).slice(0, maxEntries);

  if (entries.length === 0) {
    return '(no experiment history)';
  }

  return entries
    .map((e) => {
      const tail = e.secondaries ? ` | ${e.secondaries}` : '';
      return `[${e.hash}] ${e.skillName}: ${e.hypothesis} => ${e.target}=${e.value} delta=${e.delta}% ${e.status}${tail}`;
    })
    .join('\n');
}
|
|
150
|
+
|
|
151
|
+
// Public API of the git-backed experiment memory. formatCommitMessage and
// parseExperimentLine are internal helpers, exported only for the test suite.
module.exports = {
  commitExperiment,
  revertExperiment,
  queryExperiments,
  getExperimentHistory,
  // exported for testing
  formatCommitMessage,
  parseExperimentLine,
};
|