deepflow 0.1.103 → 0.1.104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/bin/install-dynamic-hooks.test.js +461 -0
  2. package/bin/install.js +150 -250
  3. package/bin/lineage-ingest.js +70 -0
  4. package/hooks/df-check-update.js +1 -0
  5. package/hooks/df-command-usage.js +18 -0
  6. package/hooks/df-dashboard-push.js +1 -0
  7. package/hooks/df-execution-history.js +1 -0
  8. package/hooks/df-explore-protocol.js +83 -0
  9. package/hooks/df-explore-protocol.test.js +228 -0
  10. package/hooks/df-hook-event-tags.test.js +127 -0
  11. package/hooks/df-invariant-check.js +1 -0
  12. package/hooks/df-quota-logger.js +1 -0
  13. package/hooks/df-snapshot-guard.js +1 -0
  14. package/hooks/df-spec-lint.js +58 -1
  15. package/hooks/df-spec-lint.test.js +412 -0
  16. package/hooks/df-statusline.js +1 -0
  17. package/hooks/df-subagent-registry.js +1 -0
  18. package/hooks/df-tool-usage.js +13 -3
  19. package/hooks/df-worktree-guard.js +1 -0
  20. package/package.json +1 -1
  21. package/src/commands/df/debate.md +1 -1
  22. package/src/commands/df/eval.md +117 -0
  23. package/src/commands/df/execute.md +1 -1
  24. package/src/commands/df/fix.md +104 -0
  25. package/src/eval/git-memory.js +159 -0
  26. package/src/eval/git-memory.test.js +439 -0
  27. package/src/eval/hypothesis.js +80 -0
  28. package/src/eval/hypothesis.test.js +169 -0
  29. package/src/eval/loop.js +378 -0
  30. package/src/eval/loop.test.js +306 -0
  31. package/src/eval/metric-collector.js +163 -0
  32. package/src/eval/metric-collector.test.js +369 -0
  33. package/src/eval/metric-pivot.js +119 -0
  34. package/src/eval/metric-pivot.test.js +350 -0
  35. package/src/eval/mutator-prompt.js +106 -0
  36. package/src/eval/mutator-prompt.test.js +180 -0
  37. package/templates/config-template.yaml +5 -0
  38. package/templates/eval-fixture-template/config.yaml +39 -0
  39. package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
  40. package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
  41. package/templates/eval-fixture-template/fixture/package.json +12 -0
  42. package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
  43. package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
  44. package/templates/eval-fixture-template/fixture/src/config.js +40 -0
  45. package/templates/eval-fixture-template/fixture/src/index.js +19 -0
  46. package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
  47. package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
  48. package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
  49. package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
  50. package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
  51. package/templates/eval-fixture-template/hypotheses.md +14 -0
  52. package/templates/eval-fixture-template/spec.md +34 -0
  53. package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
  54. package/templates/eval-fixture-template/tests/guard.test.js +108 -0
  55. package/templates/eval-fixture-template.test.js +318 -0
  56. package/templates/explore-agent.md +5 -74
  57. package/templates/explore-protocol.md +44 -0
  58. package/templates/spec-template.md +4 -0
@@ -0,0 +1,5 @@
1
+ # Architectural Decisions
2
+
3
+ [APPROACH] Use stub task-runner — real fixture populates this with actual skill invocations.
4
+ [APPROACH] Output artifacts in `output/{task-id}/result.json` — simple, inspectable, no external deps.
5
+ [PROVISIONAL] Minimal YAML parser in config.js — replace if fixture needs full YAML support.
@@ -0,0 +1,28 @@
1
#!/usr/bin/env node
/**
 * Fixture invariant hook
 *
 * Mirrors the structure of the real deepflow invariant hook.
 * Checks that no files outside allowed paths were modified.
 *
 * Exit 0 = pass, exit 1 = block with message.
 */

// The hook payload is a JSON document in argv[2]. A malformed payload used
// to throw an uncaught SyntaxError (non-zero exit => spurious block), so
// parse defensively and treat unparseable input as an empty event.
let input = {};
try {
  input = JSON.parse(process.argv[2] || '{}');
} catch (err) {
  console.error(`[invariant] Ignoring malformed hook input: ${err.message}`);
}

const toolName = input.tool_name || '';
const toolInput = input.tool_input || {};

// Paths that are read-only in the fixture.
// NOTE(review): the startsWith check only matches fixture-relative paths;
// an absolute path to the same file would slip through — confirm callers
// always pass relative paths.
const PROTECTED = ['specs/', '.deepflow/config.yaml', 'hooks/', 'package.json'];

if (['Write', 'Edit', 'MultiEdit'].includes(toolName)) {
  const filePath = toolInput.file_path || toolInput.path || '';
  const violation = PROTECTED.find((p) => filePath.startsWith(p));

  if (violation) {
    console.error(`[invariant] Blocked write to protected path: ${filePath}`);
    process.exit(1);
  }
}

process.exit(0);
@@ -0,0 +1,12 @@
1
+ {
2
+ "name": "eval-fixture",
3
+ "version": "1.0.0",
4
+ "description": "Minimal deepflow-like codebase for skill evaluation",
5
+ "main": "src/index.js",
6
+ "scripts": {
7
+ "build": "node scripts/build.js",
8
+ "test": "node tests/run.js",
9
+ "lint": "node scripts/lint.js"
10
+ },
11
+ "dependencies": {}
12
+ }
@@ -0,0 +1,18 @@
1
+ # example-task
2
+
3
+ ## Objective
4
+
5
+ Create output artifacts for two tasks to demonstrate the pipeline works end-to-end.
6
+
7
+ ## T1: Create greeting artifact
8
+
9
+ Write `output/T1/result.json` with `{ "status": "complete", "message": "hello" }`.
10
+
11
+ ## T2: Create summary artifact
12
+
13
+ Write `output/T2/result.json` with `{ "status": "complete", "items": 2 }`.
14
+
15
+ ## Acceptance Criteria
16
+
17
+ - [ ] `output/T1/result.json` exists and `status === "complete"`
18
+ - [ ] `output/T2/result.json` exists and `status === "complete"`
@@ -0,0 +1,18 @@
1
+ ---
2
+ name: df:example
3
+ description: Example command to demonstrate skill behavior in the fixture
4
+ allowed-tools: [Read, Edit, Bash]
5
+ ---
6
+
7
+ # df:example
8
+
9
+ Demonstrates a minimal skill invocation that the eval loop can measure.
10
+
11
+ ## Steps
12
+
13
+ 1. Read the active spec from `specs/doing-*.md`
14
+ 2. List tasks in the spec
15
+ 3. For each task, create an artifact in `output/{task-id}/result.json`
16
+ 4. Report completion
17
+
18
+ !`cat specs/doing-*.md 2>/dev/null || echo 'No active spec'`
@@ -0,0 +1,40 @@
1
/**
 * Configuration loader
 *
 * Reads .deepflow/config.yaml and returns a validated config object.
 * Merges project-level overrides over defaults.
 */

const fs = require('fs');
const path = require('path');

// Built-in defaults; any key may be overridden via .deepflow/config.yaml.
const DEFAULTS = {
  build_command: 'node scripts/build.js',
  test_command: 'node tests/run.js',
  dev_command: 'node src/index.js',
  dev_port: 3000,
  max_consecutive_reverts: 3,
};

/**
 * Load the merged configuration for a project.
 *
 * @param {string} [root=process.cwd()] - Project root containing `.deepflow/`.
 * @returns {object} DEFAULTS shallow-merged with overrides from config.yaml;
 *   a copy of DEFAULTS when no config file exists.
 */
function loadConfig(root = process.cwd()) {
  const configPath = path.join(root, '.deepflow', 'config.yaml');

  if (!fs.existsSync(configPath)) {
    return { ...DEFAULTS };
  }

  // Minimal YAML parser for key: value lines (no nested blocks needed here)
  const raw = fs.readFileSync(configPath, 'utf8');
  const overrides = {};

  for (const line of raw.split('\n')) {
    const match = line.match(/^(\w+):\s*"?([^"#\n]+)"?\s*$/);
    if (match) {
      const key = match[1].trim();
      const value = match[2].trim();
      // Coerce purely-numeric values so an override keeps the same type as
      // its default (e.g. dev_port stays a number, not the string "3000").
      overrides[key] = /^-?\d+(\.\d+)?$/.test(value) ? Number(value) : value;
    }
  }

  return { ...DEFAULTS, ...overrides };
}

module.exports = { loadConfig, DEFAULTS };
@@ -0,0 +1,19 @@
1
/**
 * eval-fixture entry point
 *
 * Simulates a small deepflow-like project that a skill will be asked to modify.
 * The skill under evaluation receives a task referencing this codebase.
 */

const { loadConfig } = require('./config');
const { runPipeline } = require('./pipeline');

// Load the project config, drive the pipeline with it, and surface any
// rejection as a non-zero exit so callers see the failure.
(async () => {
  await runPipeline(loadConfig());
})().catch((err) => {
  console.error('Fatal:', err.message);
  process.exit(1);
});
@@ -0,0 +1,40 @@
1
+ /**
2
+ * Execution pipeline
3
+ *
4
+ * Orchestrates the plan → execute → verify cycle for a single spec.
5
+ * This is the core loop a skill is expected to drive.
6
+ */
7
+
8
+ const { loadSpec } = require('./spec-loader');
9
+ const { applyTask } = require('./task-runner');
10
+ const { verifyOutput } = require('./verifier');
11
+
12
/**
 * Run the plan → execute → verify cycle for the active spec.
 *
 * @param {object} config - Project config; `specs_dir` selects the spec directory.
 * @returns {Promise<object>} `{ status: 'noop' }` when no spec is active,
 *   `{ status: 'fail', task, error }` on the first failing task, otherwise
 *   `{ status: 'pass'|'fail', verification }` from output verification.
 */
async function runPipeline(config) {
  const activeSpec = loadSpec(config.specs_dir || 'specs');

  // No doing-*.md spec means there is nothing to execute.
  if (!activeSpec) {
    console.log('No active spec found — nothing to do.');
    return { status: 'noop' };
  }

  console.log(`Running pipeline for spec: ${activeSpec.name}`);

  const taskResults = [];

  for (const task of activeSpec.tasks || []) {
    console.log(` Task: ${task.id} — ${task.description}`);

    const outcome = await applyTask(task, config);
    taskResults.push(outcome);

    // Stop at the first failing task so later tasks never run on a broken base.
    if (outcome.status === 'fail') {
      console.error(` Task ${task.id} failed: ${outcome.error}`);
      return { status: 'fail', task: task.id, error: outcome.error };
    }
  }

  const verification = await verifyOutput(taskResults, config);
  return { status: verification.pass ? 'pass' : 'fail', verification };
}
39
+
40
+ module.exports = { runPipeline };
@@ -0,0 +1,32 @@
1
+ ---
2
+ name: example-skill
3
+ description: Skeleton skill for eval fixture — replace with the real skill content
4
+ allowed-tools: [Read, Edit, Bash, Write]
5
+ ---
6
+
7
+ # Example Skill
8
+
9
+ This is the skill file that the eval loop will mutate each iteration.
10
+ Replace this entire file with the real skill you want to evaluate.
11
+
12
+ ## Context Loading
13
+
14
+ !`cat specs/doing-*.md 2>/dev/null || echo 'NOT_FOUND'`
15
+ !`cat .deepflow/decisions.md 2>/dev/null || echo 'NOT_FOUND'`
16
+
17
+ ## Task
18
+
19
+ Apply the changes described in the active spec, one task at a time.
20
+
21
+ ## Steps
22
+
23
+ 1. Read the active spec to understand the task list
24
+ 2. For each task marked incomplete, implement the required change
25
+ 3. Verify each change is minimal and targeted — no scope creep
26
+ 4. Confirm output artifacts exist in `output/{task-id}/result.json`
27
+
28
+ ## Invariants
29
+
30
+ - Never modify files outside the task's stated scope
31
+ - Artifact must have `status: "complete"` field
32
+ - Do not create files in `specs/` or `.deepflow/`
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Spec loader
3
+ *
4
+ * Finds the active spec in the specs/ directory.
5
+ * Active spec = first file matching `doing-*.md`.
6
+ */
7
+
8
+ const fs = require('fs');
9
+ const path = require('path');
10
+
11
/**
 * Find and parse the active spec (first `doing-*.md` file in specsDir).
 *
 * @param {string} [specsDir='specs'] - Directory to scan for spec files.
 * @returns {object|null} Parsed spec from parseSpec, or null when the
 *   directory or an active spec file does not exist.
 */
function loadSpec(specsDir = 'specs') {
  if (!fs.existsSync(specsDir)) return null;

  const candidate = fs
    .readdirSync(specsDir)
    .find((entry) => entry.startsWith('doing-') && entry.endsWith('.md'));

  if (!candidate) return null;

  // Spec name is the filename with the doing- prefix and .md suffix stripped.
  const specName = candidate.replace(/^doing-/, '').replace(/\.md$/, '');
  const body = fs.readFileSync(path.join(specsDir, candidate), 'utf8');
  return parseSpec(specName, body);
}
22
+
23
/**
 * Parse spec markdown into a structured spec object.
 *
 * Task headings look like "## T3: description"; each one becomes a task
 * with id `T3` and the trimmed description.
 *
 * @param {string} name - Spec name (filename without prefix/suffix).
 * @param {string} content - Raw markdown body of the spec.
 * @returns {{name: string, content: string, tasks: Array<{id: string, description: string}>}}
 */
function parseSpec(name, content) {
  const headingPattern = /^##\s+T(\d+):\s+(.+)$/gm;

  const tasks = [...content.matchAll(headingPattern)].map((m) => ({
    id: `T${m[1]}`,
    description: m[2].trim(),
  }));

  return { name, content, tasks };
}
34
+
35
+ module.exports = { loadSpec, parseSpec };
@@ -0,0 +1,32 @@
1
+ /**
2
+ * Task runner
3
+ *
4
+ * Executes a single task from the spec. In the real deepflow system this
5
+ * dispatches to Claude via the CLI. In the fixture it runs a stub that
6
+ * creates the expected output files so guard tests can verify them.
7
+ */
8
+
9
+ const fs = require('fs');
10
+ const path = require('path');
11
+
12
/**
 * Execute a single spec task (stub implementation).
 *
 * Writes `output/<task.id>/result.json` relative to the current working
 * directory so guard tests can verify the artifact afterwards.
 *
 * @param {{id: string, description: string}} task - Task from the parsed spec.
 * @param {object} config - Project config (unused by the stub; kept for interface parity).
 * @returns {Promise<{status: string, task: string, artifact: object}>}
 */
async function applyTask(task, config) {
  const artifactDir = path.join('output', task.id);
  const artifactPath = path.join(artifactDir, 'result.json');

  fs.mkdirSync(artifactDir, { recursive: true });

  // Stub: simulate task execution by writing the expected artifact shape.
  const artifact = {
    task: task.id,
    description: task.description,
    timestamp: new Date().toISOString(),
    status: 'complete',
  };

  fs.writeFileSync(artifactPath, JSON.stringify(artifact, null, 2));

  return { status: 'pass', task: task.id, artifact };
}
31
+
32
+ module.exports = { applyTask };
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Output verifier
3
+ *
4
+ * Checks that task artifacts meet acceptance criteria.
5
+ * Mirrors the L0–L4 verification levels from /df:verify.
6
+ */
7
+
8
+ const fs = require('fs');
9
+ const path = require('path');
10
+
11
/**
 * Verify that each passing task result has a matching, well-formed artifact.
 *
 * For every result: a failed task records a failed status check; a passing
 * task must have `output/<task>/result.json` present, parseable, and with
 * `status === "complete"`.
 *
 * @param {Array<{status: string, task: string}>} results - Per-task results from the runner.
 * @param {object} config - Project config (currently unused; kept for interface parity).
 * @returns {Promise<{pass: boolean, checks: Array<{check: string, pass: boolean}>}>}
 */
async function verifyOutput(results, config) {
  const checks = [];

  for (const result of results) {
    if (result.status !== 'pass') {
      checks.push({ check: `task-${result.task}-status`, pass: false });
      continue;
    }

    const artifactPath = path.join('output', result.task, 'result.json');
    const exists = fs.existsSync(artifactPath);
    checks.push({ check: `task-${result.task}-artifact`, pass: exists });

    if (exists) {
      // A corrupt artifact used to throw out of the whole verification;
      // treat unparseable JSON as a failed "complete" check instead.
      let complete = false;
      try {
        const data = JSON.parse(fs.readFileSync(artifactPath, 'utf8'));
        complete = data.status === 'complete';
      } catch (e) {
        complete = false;
      }
      checks.push({ check: `task-${result.task}-complete`, pass: complete });
    }
  }

  const allPassed = checks.every((c) => c.pass);
  return { pass: allPassed, checks };
}
36
+
37
+ module.exports = { verifyOutput };
@@ -0,0 +1,14 @@
1
+ # Hypotheses
2
+
3
+ Each line is one hypothesis for the eval loop. `/df:eval` picks the next unused
4
+ hypothesis from this file when `--hypothesis` flag is not supplied.
5
+
6
+ Format: one hypothesis per line, plain English. Be specific about what to change and why.
7
+
8
+ ---
9
+
10
+ Add explicit cache-priming instructions at the top of the skill prompt to front-load repeated context reads.
11
+ Reduce the number of tool calls by batching related file reads into a single step.
12
+ Reorder instructions to place the most frequently accessed context at the START zone of the prompt (attention U-curve).
13
+ Shorten the skill preamble to reduce input tokens on every invocation.
14
+ Replace prose descriptions of steps with numbered list format to improve instruction clarity and reduce re-reads.
@@ -0,0 +1,34 @@
1
+ # {benchmark-name}
2
+
3
+ ## Objective
4
+
5
+ [One sentence: what skill behavior this benchmark evaluates]
6
+
7
+ ## Target Metric
8
+
9
+ - **Primary (target)**: `cache_ratio` — cache_read_input_tokens / input_tokens (higher = better)
10
+ - **Secondary**: `total_tokens`, `wall_time`, `context_burn`
11
+ - **Guard**: fixture tests pass (binary — failure auto-reverts before any metric check)
12
+
13
+ ## Skill Under Evaluation
14
+
15
+ - **Skill path**: `skills/{skill-name}/SKILL.md`
16
+ - **First hypothesis**: [Your opening hypothesis about what to change and why]
17
+
18
+ ## Fixture Design
19
+
20
+ The `fixture/` directory contains a 12-file deepflow-like skeleton codebase. It is intentionally small but realistic — enough to exercise real skill behavior (file reads, edits, spec lookups, git operations) without taking more than a few minutes per iteration.
21
+
22
+ The `tests/` directory holds guard tests. These MUST cover the behavior you care about so the optimizer cannot game the target metric by breaking real functionality.
23
+
24
+ ## Constraints
25
+
26
+ - One change per iteration (atomic causality)
27
+ - Loop runs until Ctrl+C or `--loop N` cap
28
+ - No LLM judges — only mechanical metrics decide keep/revert
29
+
30
+ ## Acceptance Criteria
31
+
32
+ - [ ] Guard tests in `tests/` pass on the unmodified fixture
33
+ - [ ] Fixture exercises the skill's primary code path
34
+ - [ ] Hypotheses file (`hypotheses.md`) has at least 3 entries to seed the loop
@@ -0,0 +1,69 @@
1
#!/usr/bin/env node
/**
 * Behavior tests for eval fixture
 *
 * Verifies the functional content of the artifacts a skill run produced —
 * not just that files exist. Intended to run after skill execution completes.
 *
 * Run: node tests/behavior.test.js
 * Exit 0 = all pass, exit 1 = one or more failed
 */

const fs = require('fs');
const path = require('path');

const FIXTURE_DIR = path.join(__dirname, '..', 'fixture');

let passed = 0;
let failed = 0;

// Record one named check: print a PASS/FAIL line and bump the counter.
function assert(name, condition, detail = '') {
  if (!condition) {
    console.error(` FAIL ${name}${detail ? ': ' + detail : ''}`);
    failed++;
    return;
  }
  console.log(` PASS ${name}`);
  passed++;
}

// Validate output/<taskId>/result.json: it must exist, parse as JSON, and
// carry status "complete"; then run any artifact-specific extra checks.
function assertArtifact(taskId, extraChecks = () => {}) {
  const artifactPath = path.join(FIXTURE_DIR, 'output', taskId, 'result.json');

  if (!fs.existsSync(artifactPath)) {
    assert(`output/${taskId}/result.json exists`, false);
    return;
  }
  assert(`output/${taskId}/result.json exists`, true);

  let data;
  try {
    data = JSON.parse(fs.readFileSync(artifactPath, 'utf8'));
  } catch (e) {
    assert(`output/${taskId}/result.json is valid JSON`, false, e.message);
    return;
  }

  assert(`output/${taskId} status === "complete"`, data.status === 'complete');
  extraChecks(data);
}

// ---------------------------------------------------------------------------
// Task output tests — verify skill produced the expected artifacts
// ---------------------------------------------------------------------------

console.log('\n[behavior] Task output checks');

assertArtifact('T1', (data) => {
  assert('T1 has message field', typeof data.message === 'string');
});

assertArtifact('T2', (data) => {
  assert('T2 has items field', data.items !== undefined);
});

// ---------------------------------------------------------------------------
// Summary
// ---------------------------------------------------------------------------

console.log(`\n[behavior] ${passed} passed, ${failed} failed`);
process.exit(failed > 0 ? 1 : 0);
@@ -0,0 +1,108 @@
1
#!/usr/bin/env node
/**
 * Guard tests for eval fixture
 *
 * These tests constitute the binary guard check in the eval loop.
 * ALL tests must pass for the iteration to proceed to metric collection.
 * A failing guard causes immediate git revert and logs status:guard_fail.
 *
 * Run: node tests/guard.test.js
 * Exit 0 = all pass, exit 1 = one or more failed
 */

const fs = require('fs');
const path = require('path');

const FIXTURE_DIR = path.join(__dirname, '..', 'fixture');

let passed = 0;
let failed = 0;

// Record one named check: print a PASS/FAIL line and bump the counter.
function assert(name, condition, detail = '') {
  if (condition) {
    console.log(` PASS ${name}`);
    passed++;
  } else {
    console.error(` FAIL ${name}${detail ? ': ' + detail : ''}`);
    failed++;
  }
}

// Read a file as UTF-8, returning '' when unreadable. Missing or unreadable
// files previously threw out of the script before the summary printed; an
// empty string makes the content checks below FAIL and be counted instead.
function readTextSafe(filePath) {
  try {
    return fs.readFileSync(filePath, 'utf8');
  } catch (e) {
    return '';
  }
}

// ---------------------------------------------------------------------------
// Structural tests — fixture files must exist before skill evaluation runs
// ---------------------------------------------------------------------------

console.log('\n[guard] Structural integrity checks');

assert(
  'fixture/package.json exists',
  fs.existsSync(path.join(FIXTURE_DIR, 'package.json'))
);

assert(
  'fixture/src/index.js exists',
  fs.existsSync(path.join(FIXTURE_DIR, 'src', 'index.js'))
);

assert(
  'fixture/src/skills/example-skill/SKILL.md exists',
  fs.existsSync(path.join(FIXTURE_DIR, 'src', 'skills', 'example-skill', 'SKILL.md'))
);

// Guard the readdir: a missing specs/ directory used to throw and abort the
// run before the summary was printed.
const specsDir = path.join(FIXTURE_DIR, 'specs');
assert(
  'fixture/specs/ contains at least one doing-*.md',
  fs.existsSync(specsDir) &&
    fs.readdirSync(specsDir).some(
      (f) => f.startsWith('doing-') && f.endsWith('.md')
    )
);

assert(
  'fixture/.deepflow/decisions.md exists',
  fs.existsSync(path.join(FIXTURE_DIR, '.deepflow', 'decisions.md'))
);

// ---------------------------------------------------------------------------
// Content tests — critical fields must be present in key files
// ---------------------------------------------------------------------------

console.log('\n[guard] Content validity checks');

const skillPath = path.join(FIXTURE_DIR, 'src', 'skills', 'example-skill', 'SKILL.md');
const skillContent = readTextSafe(skillPath);

assert(
  'SKILL.md has YAML frontmatter',
  skillContent.startsWith('---')
);

assert(
  'SKILL.md has allowed-tools',
  skillContent.includes('allowed-tools')
);

// Parse defensively: invalid JSON in package.json registers as failed
// checks rather than an uncaught exception.
let pkg = {};
try {
  pkg = JSON.parse(readTextSafe(path.join(FIXTURE_DIR, 'package.json')));
} catch (e) {
  // fall through with {} so the script checks below FAIL and are counted
}

assert(
  'package.json has test script',
  typeof pkg.scripts?.test === 'string'
);

assert(
  'package.json has build script',
  typeof pkg.scripts?.build === 'string'
);

// ---------------------------------------------------------------------------
// Summary
// ---------------------------------------------------------------------------

console.log(`\n[guard] ${passed} passed, ${failed} failed`);

if (failed > 0) {
  console.error('[guard] GUARD FAILED — iteration will be reverted');
  process.exit(1);
}

console.log('[guard] All guard checks passed');
process.exit(0);