deepflow 0.1.103 → 0.1.104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/bin/install-dynamic-hooks.test.js +461 -0
  2. package/bin/install.js +150 -250
  3. package/bin/lineage-ingest.js +70 -0
  4. package/hooks/df-check-update.js +1 -0
  5. package/hooks/df-command-usage.js +18 -0
  6. package/hooks/df-dashboard-push.js +1 -0
  7. package/hooks/df-execution-history.js +1 -0
  8. package/hooks/df-explore-protocol.js +83 -0
  9. package/hooks/df-explore-protocol.test.js +228 -0
  10. package/hooks/df-hook-event-tags.test.js +127 -0
  11. package/hooks/df-invariant-check.js +1 -0
  12. package/hooks/df-quota-logger.js +1 -0
  13. package/hooks/df-snapshot-guard.js +1 -0
  14. package/hooks/df-spec-lint.js +58 -1
  15. package/hooks/df-spec-lint.test.js +412 -0
  16. package/hooks/df-statusline.js +1 -0
  17. package/hooks/df-subagent-registry.js +1 -0
  18. package/hooks/df-tool-usage.js +13 -3
  19. package/hooks/df-worktree-guard.js +1 -0
  20. package/package.json +1 -1
  21. package/src/commands/df/debate.md +1 -1
  22. package/src/commands/df/eval.md +117 -0
  23. package/src/commands/df/execute.md +1 -1
  24. package/src/commands/df/fix.md +104 -0
  25. package/src/eval/git-memory.js +159 -0
  26. package/src/eval/git-memory.test.js +439 -0
  27. package/src/eval/hypothesis.js +80 -0
  28. package/src/eval/hypothesis.test.js +169 -0
  29. package/src/eval/loop.js +378 -0
  30. package/src/eval/loop.test.js +306 -0
  31. package/src/eval/metric-collector.js +163 -0
  32. package/src/eval/metric-collector.test.js +369 -0
  33. package/src/eval/metric-pivot.js +119 -0
  34. package/src/eval/metric-pivot.test.js +350 -0
  35. package/src/eval/mutator-prompt.js +106 -0
  36. package/src/eval/mutator-prompt.test.js +180 -0
  37. package/templates/config-template.yaml +5 -0
  38. package/templates/eval-fixture-template/config.yaml +39 -0
  39. package/templates/eval-fixture-template/fixture/.deepflow/decisions.md +5 -0
  40. package/templates/eval-fixture-template/fixture/hooks/invariant.js +28 -0
  41. package/templates/eval-fixture-template/fixture/package.json +12 -0
  42. package/templates/eval-fixture-template/fixture/specs/doing-example-task.md +18 -0
  43. package/templates/eval-fixture-template/fixture/src/commands/df/example.md +18 -0
  44. package/templates/eval-fixture-template/fixture/src/config.js +40 -0
  45. package/templates/eval-fixture-template/fixture/src/index.js +19 -0
  46. package/templates/eval-fixture-template/fixture/src/pipeline.js +40 -0
  47. package/templates/eval-fixture-template/fixture/src/skills/example-skill/SKILL.md +32 -0
  48. package/templates/eval-fixture-template/fixture/src/spec-loader.js +35 -0
  49. package/templates/eval-fixture-template/fixture/src/task-runner.js +32 -0
  50. package/templates/eval-fixture-template/fixture/src/verifier.js +37 -0
  51. package/templates/eval-fixture-template/hypotheses.md +14 -0
  52. package/templates/eval-fixture-template/spec.md +34 -0
  53. package/templates/eval-fixture-template/tests/behavior.test.js +69 -0
  54. package/templates/eval-fixture-template/tests/guard.test.js +108 -0
  55. package/templates/eval-fixture-template.test.js +318 -0
  56. package/templates/explore-agent.md +5 -74
  57. package/templates/explore-protocol.md +44 -0
  58. package/templates/spec-template.md +4 -0
@@ -0,0 +1,5 @@
1
+ # Architectural Decisions
2
+
3
+ [APPROACH] Use stub task-runner — real fixture populates this with actual skill invocations.
4
+ [APPROACH] Output artifacts in `output/{task-id}/result.json` — simple, inspectable, no external deps.
5
+ [PROVISIONAL] Minimal YAML parser in config.js — replace if fixture needs full YAML support.
@@ -0,0 +1,28 @@
1
#!/usr/bin/env node
/**
 * Fixture invariant hook
 *
 * Mirrors the structure of the real deepflow invariant hook.
 * Checks that no files outside allowed paths were modified.
 *
 * Exit 0 = pass, exit 1 = block with message.
 */

// The hook payload is a JSON document in argv[2]. A malformed payload used
// to throw an uncaught SyntaxError (non-zero exit => spurious block), so
// parse defensively and treat unparseable input as an empty event.
let input = {};
try {
  input = JSON.parse(process.argv[2] || '{}');
} catch (err) {
  console.error(`[invariant] Ignoring malformed hook input: ${err.message}`);
}

const toolName = input.tool_name || '';
const toolInput = input.tool_input || {};

// Paths that are read-only in the fixture.
// NOTE(review): the startsWith check only matches fixture-relative paths;
// an absolute path to the same file would slip through — confirm callers
// always pass relative paths.
const PROTECTED = ['specs/', '.deepflow/config.yaml', 'hooks/', 'package.json'];

if (['Write', 'Edit', 'MultiEdit'].includes(toolName)) {
  const filePath = toolInput.file_path || toolInput.path || '';
  const violation = PROTECTED.find((p) => filePath.startsWith(p));

  if (violation) {
    console.error(`[invariant] Blocked write to protected path: ${filePath}`);
    process.exit(1);
  }
}

process.exit(0);
@@ -0,0 +1,12 @@
1
+ {
2
+ "name": "eval-fixture",
3
+ "version": "1.0.0",
4
+ "description": "Minimal deepflow-like codebase for skill evaluation",
5
+ "main": "src/index.js",
6
+ "scripts": {
7
+ "build": "node scripts/build.js",
8
+ "test": "node tests/run.js",
9
+ "lint": "node scripts/lint.js"
10
+ },
11
+ "dependencies": {}
12
+ }
@@ -0,0 +1,18 @@
1
+ # example-task
2
+
3
+ ## Objective
4
+
5
+ Create output artifacts for two tasks to demonstrate the pipeline works end-to-end.
6
+
7
+ ## T1: Create greeting artifact
8
+
9
+ Write `output/T1/result.json` with `{ "status": "complete", "message": "hello" }`.
10
+
11
+ ## T2: Create summary artifact
12
+
13
+ Write `output/T2/result.json` with `{ "status": "complete", "items": 2 }`.
14
+
15
+ ## Acceptance Criteria
16
+
17
+ - [ ] `output/T1/result.json` exists and `status === "complete"`
18
+ - [ ] `output/T2/result.json` exists and `status === "complete"`
@@ -0,0 +1,18 @@
1
+ ---
2
+ name: df:example
3
+ description: Example command to demonstrate skill behavior in the fixture
4
+ allowed-tools: [Read, Edit, Bash]
5
+ ---
6
+
7
+ # df:example
8
+
9
+ Demonstrates a minimal skill invocation that the eval loop can measure.
10
+
11
+ ## Steps
12
+
13
+ 1. Read the active spec from `specs/doing-*.md`
14
+ 2. List tasks in the spec
15
+ 3. For each task, create an artifact in `output/{task-id}/result.json`
16
+ 4. Report completion
17
+
18
+ !`cat specs/doing-*.md 2>/dev/null || echo 'No active spec'`
@@ -0,0 +1,40 @@
1
/**
 * Configuration loader
 *
 * Reads .deepflow/config.yaml and returns a validated config object.
 * Merges project-level overrides over defaults.
 */

const fs = require('fs');
const path = require('path');

// Built-in defaults; any key may be overridden via .deepflow/config.yaml.
const DEFAULTS = {
  build_command: 'node scripts/build.js',
  test_command: 'node tests/run.js',
  dev_command: 'node src/index.js',
  dev_port: 3000,
  max_consecutive_reverts: 3,
};

/**
 * Load the merged configuration for a project.
 *
 * @param {string} [root=process.cwd()] - Project root containing `.deepflow/`.
 * @returns {object} DEFAULTS shallow-merged with overrides from config.yaml;
 *   a copy of DEFAULTS when no config file exists.
 */
function loadConfig(root = process.cwd()) {
  const configPath = path.join(root, '.deepflow', 'config.yaml');

  if (!fs.existsSync(configPath)) {
    return { ...DEFAULTS };
  }

  // Minimal YAML parser for key: value lines (no nested blocks needed here)
  const raw = fs.readFileSync(configPath, 'utf8');
  const overrides = {};

  for (const line of raw.split('\n')) {
    const match = line.match(/^(\w+):\s*"?([^"#\n]+)"?\s*$/);
    if (match) {
      const key = match[1].trim();
      const value = match[2].trim();
      // Coerce purely-numeric values so an override keeps the same type as
      // its default (e.g. dev_port stays a number, not the string "3000").
      overrides[key] = /^-?\d+(\.\d+)?$/.test(value) ? Number(value) : value;
    }
  }

  return { ...DEFAULTS, ...overrides };
}

module.exports = { loadConfig, DEFAULTS };
@@ -0,0 +1,19 @@
1
/**
 * eval-fixture entry point
 *
 * Simulates a small deepflow-like project that a skill will be asked to modify.
 * The skill under evaluation receives a task referencing this codebase.
 */

const { loadConfig } = require('./config');
const { runPipeline } = require('./pipeline');

// Load the project config, drive the pipeline with it, and surface any
// rejection as a non-zero exit so callers see the failure.
(async () => {
  await runPipeline(loadConfig());
})().catch((err) => {
  console.error('Fatal:', err.message);
  process.exit(1);
});
@@ -0,0 +1,40 @@
1
+ /**
2
+ * Execution pipeline
3
+ *
4
+ * Orchestrates the plan → execute → verify cycle for a single spec.
5
+ * This is the core loop a skill is expected to drive.
6
+ */
7
+
8
+ const { loadSpec } = require('./spec-loader');
9
+ const { applyTask } = require('./task-runner');
10
+ const { verifyOutput } = require('./verifier');
11
+
12
/**
 * Run the plan → execute → verify cycle for the active spec.
 *
 * @param {object} config - Project config; `specs_dir` selects the spec directory.
 * @returns {Promise<object>} `{ status: 'noop' }` when no spec is active,
 *   `{ status: 'fail', task, error }` on the first failing task, otherwise
 *   `{ status: 'pass'|'fail', verification }` from output verification.
 */
async function runPipeline(config) {
  const activeSpec = loadSpec(config.specs_dir || 'specs');

  // No doing-*.md spec means there is nothing to execute.
  if (!activeSpec) {
    console.log('No active spec found — nothing to do.');
    return { status: 'noop' };
  }

  console.log(`Running pipeline for spec: ${activeSpec.name}`);

  const taskResults = [];

  for (const task of activeSpec.tasks || []) {
    console.log(` Task: ${task.id} — ${task.description}`);

    const outcome = await applyTask(task, config);
    taskResults.push(outcome);

    // Stop at the first failing task so later tasks never run on a broken base.
    if (outcome.status === 'fail') {
      console.error(` Task ${task.id} failed: ${outcome.error}`);
      return { status: 'fail', task: task.id, error: outcome.error };
    }
  }

  const verification = await verifyOutput(taskResults, config);
  return { status: verification.pass ? 'pass' : 'fail', verification };
}
39
+
40
+ module.exports = { runPipeline };
@@ -0,0 +1,32 @@
1
+ ---
2
+ name: example-skill
3
+ description: Skeleton skill for eval fixture — replace with the real skill content
4
+ allowed-tools: [Read, Edit, Bash, Write]
5
+ ---
6
+
7
+ # Example Skill
8
+
9
+ This is the skill file that the eval loop will mutate each iteration.
10
+ Replace this entire file with the real skill you want to evaluate.
11
+
12
+ ## Context Loading
13
+
14
+ !`cat specs/doing-*.md 2>/dev/null || echo 'NOT_FOUND'`
15
+ !`cat .deepflow/decisions.md 2>/dev/null || echo 'NOT_FOUND'`
16
+
17
+ ## Task
18
+
19
+ Apply the changes described in the active spec, one task at a time.
20
+
21
+ ## Steps
22
+
23
+ 1. Read the active spec to understand the task list
24
+ 2. For each task marked incomplete, implement the required change
25
+ 3. Verify each change is minimal and targeted — no scope creep
26
+ 4. Confirm output artifacts exist in `output/{task-id}/result.json`
27
+
28
+ ## Invariants
29
+
30
+ - Never modify files outside the task's stated scope
31
+ - Artifact must have `status: "complete"` field
32
+ - Do not create files in `specs/` or `.deepflow/`
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Spec loader
3
+ *
4
+ * Finds the active spec in the specs/ directory.
5
+ * Active spec = first file matching `doing-*.md`.
6
+ */
7
+
8
+ const fs = require('fs');
9
+ const path = require('path');
10
+
11
/**
 * Find and parse the active spec (first `doing-*.md` file in specsDir).
 *
 * @param {string} [specsDir='specs'] - Directory to scan for spec files.
 * @returns {object|null} Parsed spec from parseSpec, or null when the
 *   directory or an active spec file does not exist.
 */
function loadSpec(specsDir = 'specs') {
  if (!fs.existsSync(specsDir)) return null;

  const candidate = fs
    .readdirSync(specsDir)
    .find((entry) => entry.startsWith('doing-') && entry.endsWith('.md'));

  if (!candidate) return null;

  // Spec name is the filename with the doing- prefix and .md suffix stripped.
  const specName = candidate.replace(/^doing-/, '').replace(/\.md$/, '');
  const body = fs.readFileSync(path.join(specsDir, candidate), 'utf8');
  return parseSpec(specName, body);
}
22
+
23
/**
 * Parse spec markdown into a structured spec object.
 *
 * Task headings look like "## T3: description"; each one becomes a task
 * with id `T3` and the trimmed description.
 *
 * @param {string} name - Spec name (filename without prefix/suffix).
 * @param {string} content - Raw markdown body of the spec.
 * @returns {{name: string, content: string, tasks: Array<{id: string, description: string}>}}
 */
function parseSpec(name, content) {
  const headingPattern = /^##\s+T(\d+):\s+(.+)$/gm;

  const tasks = [...content.matchAll(headingPattern)].map((m) => ({
    id: `T${m[1]}`,
    description: m[2].trim(),
  }));

  return { name, content, tasks };
}
34
+
35
+ module.exports = { loadSpec, parseSpec };
@@ -0,0 +1,32 @@
1
+ /**
2
+ * Task runner
3
+ *
4
+ * Executes a single task from the spec. In the real deepflow system this
5
+ * dispatches to Claude via the CLI. In the fixture it runs a stub that
6
+ * creates the expected output files so guard tests can verify them.
7
+ */
8
+
9
+ const fs = require('fs');
10
+ const path = require('path');
11
+
12
/**
 * Execute a single spec task (stub implementation).
 *
 * Writes `output/<task.id>/result.json` relative to the current working
 * directory so guard tests can verify the artifact afterwards.
 *
 * @param {{id: string, description: string}} task - Task from the parsed spec.
 * @param {object} config - Project config (unused by the stub; kept for interface parity).
 * @returns {Promise<{status: string, task: string, artifact: object}>}
 */
async function applyTask(task, config) {
  const artifactDir = path.join('output', task.id);
  const artifactPath = path.join(artifactDir, 'result.json');

  fs.mkdirSync(artifactDir, { recursive: true });

  // Stub: simulate task execution by writing the expected artifact shape.
  const artifact = {
    task: task.id,
    description: task.description,
    timestamp: new Date().toISOString(),
    status: 'complete',
  };

  fs.writeFileSync(artifactPath, JSON.stringify(artifact, null, 2));

  return { status: 'pass', task: task.id, artifact };
}
31
+
32
+ module.exports = { applyTask };
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Output verifier
3
+ *
4
+ * Checks that task artifacts meet acceptance criteria.
5
+ * Mirrors the L0–L4 verification levels from /df:verify.
6
+ */
7
+
8
+ const fs = require('fs');
9
+ const path = require('path');
10
+
11
/**
 * Verify that each passing task result has a matching, well-formed artifact.
 *
 * For every result: a failed task records a failed status check; a passing
 * task must have `output/<task>/result.json` present, parseable, and with
 * `status === "complete"`.
 *
 * @param {Array<{status: string, task: string}>} results - Per-task results from the runner.
 * @param {object} config - Project config (currently unused; kept for interface parity).
 * @returns {Promise<{pass: boolean, checks: Array<{check: string, pass: boolean}>}>}
 */
async function verifyOutput(results, config) {
  const checks = [];

  for (const result of results) {
    if (result.status !== 'pass') {
      checks.push({ check: `task-${result.task}-status`, pass: false });
      continue;
    }

    const artifactPath = path.join('output', result.task, 'result.json');
    const exists = fs.existsSync(artifactPath);
    checks.push({ check: `task-${result.task}-artifact`, pass: exists });

    if (exists) {
      // A corrupt artifact used to throw out of the whole verification;
      // treat unparseable JSON as a failed "complete" check instead.
      let complete = false;
      try {
        const data = JSON.parse(fs.readFileSync(artifactPath, 'utf8'));
        complete = data.status === 'complete';
      } catch (e) {
        complete = false;
      }
      checks.push({ check: `task-${result.task}-complete`, pass: complete });
    }
  }

  const allPassed = checks.every((c) => c.pass);
  return { pass: allPassed, checks };
}
36
+
37
+ module.exports = { verifyOutput };
@@ -0,0 +1,14 @@
1
+ # Hypotheses
2
+
3
+ Each line is one hypothesis for the eval loop. `/df:eval` picks the next unused
4
+ hypothesis from this file when `--hypothesis` flag is not supplied.
5
+
6
+ Format: one hypothesis per line, plain English. Be specific about what to change and why.
7
+
8
+ ---
9
+
10
+ Add explicit cache-priming instructions at the top of the skill prompt to front-load repeated context reads.
11
+ Reduce the number of tool calls by batching related file reads into a single step.
12
+ Reorder instructions to place the most frequently accessed context at the START zone of the prompt (attention U-curve).
13
+ Shorten the skill preamble to reduce input tokens on every invocation.
14
+ Replace prose descriptions of steps with numbered list format to improve instruction clarity and reduce re-reads.
@@ -0,0 +1,34 @@
1
+ # {benchmark-name}
2
+
3
+ ## Objective
4
+
5
+ [One sentence: what skill behavior this benchmark evaluates]
6
+
7
+ ## Target Metric
8
+
9
+ - **Primary (target)**: `cache_ratio` — cache_read_input_tokens / input_tokens (higher = better)
10
+ - **Secondary**: `total_tokens`, `wall_time`, `context_burn`
11
+ - **Guard**: fixture tests pass (binary — failure auto-reverts before any metric check)
12
+
13
+ ## Skill Under Evaluation
14
+
15
+ - **Skill path**: `skills/{skill-name}/SKILL.md`
16
+ - **First hypothesis**: [Your opening hypothesis about what to change and why]
17
+
18
+ ## Fixture Design
19
+
20
+ The `fixture/` directory contains a 12-file deepflow-like skeleton codebase. It is intentionally small but realistic — enough to exercise real skill behavior (file reads, edits, spec lookups, git operations) without taking more than a few minutes per iteration.
21
+
22
+ The `tests/` directory holds guard tests. These MUST cover the behavior you care about so the optimizer cannot game the target metric by breaking real functionality.
23
+
24
+ ## Constraints
25
+
26
+ - One change per iteration (atomic causality)
27
+ - Loop runs until Ctrl+C or `--loop N` cap
28
+ - No LLM judges — only mechanical metrics decide keep/revert
29
+
30
+ ## Acceptance Criteria
31
+
32
+ - [ ] Guard tests in `tests/` pass on the unmodified fixture
33
+ - [ ] Fixture exercises the skill's primary code path
34
+ - [ ] Hypotheses file (`hypotheses.md`) has at least 3 entries to seed the loop
@@ -0,0 +1,69 @@
1
#!/usr/bin/env node
/**
 * Behavior tests for eval fixture
 *
 * Verifies the functional content of the artifacts a skill run produced —
 * not just that files exist. Intended to run after skill execution completes.
 *
 * Run: node tests/behavior.test.js
 * Exit 0 = all pass, exit 1 = one or more failed
 */

const fs = require('fs');
const path = require('path');

const FIXTURE_DIR = path.join(__dirname, '..', 'fixture');

let passed = 0;
let failed = 0;

// Record one named check: print a PASS/FAIL line and bump the counter.
function assert(name, condition, detail = '') {
  if (!condition) {
    console.error(` FAIL ${name}${detail ? ': ' + detail : ''}`);
    failed++;
    return;
  }
  console.log(` PASS ${name}`);
  passed++;
}

// Validate output/<taskId>/result.json: it must exist, parse as JSON, and
// carry status "complete"; then run any artifact-specific extra checks.
function assertArtifact(taskId, extraChecks = () => {}) {
  const artifactPath = path.join(FIXTURE_DIR, 'output', taskId, 'result.json');

  if (!fs.existsSync(artifactPath)) {
    assert(`output/${taskId}/result.json exists`, false);
    return;
  }
  assert(`output/${taskId}/result.json exists`, true);

  let data;
  try {
    data = JSON.parse(fs.readFileSync(artifactPath, 'utf8'));
  } catch (e) {
    assert(`output/${taskId}/result.json is valid JSON`, false, e.message);
    return;
  }

  assert(`output/${taskId} status === "complete"`, data.status === 'complete');
  extraChecks(data);
}

// ---------------------------------------------------------------------------
// Task output tests — verify skill produced the expected artifacts
// ---------------------------------------------------------------------------

console.log('\n[behavior] Task output checks');

assertArtifact('T1', (data) => {
  assert('T1 has message field', typeof data.message === 'string');
});

assertArtifact('T2', (data) => {
  assert('T2 has items field', data.items !== undefined);
});

// ---------------------------------------------------------------------------
// Summary
// ---------------------------------------------------------------------------

console.log(`\n[behavior] ${passed} passed, ${failed} failed`);
process.exit(failed > 0 ? 1 : 0);
@@ -0,0 +1,108 @@
1
#!/usr/bin/env node
/**
 * Guard tests for eval fixture
 *
 * These tests constitute the binary guard check in the eval loop.
 * ALL tests must pass for the iteration to proceed to metric collection.
 * A failing guard causes immediate git revert and logs status:guard_fail.
 *
 * Run: node tests/guard.test.js
 * Exit 0 = all pass, exit 1 = one or more failed
 */

const fs = require('fs');
const path = require('path');

const FIXTURE_DIR = path.join(__dirname, '..', 'fixture');

let passed = 0;
let failed = 0;

// Record one named check: print a PASS/FAIL line and bump the counter.
function assert(name, condition, detail = '') {
  if (condition) {
    console.log(` PASS ${name}`);
    passed++;
  } else {
    console.error(` FAIL ${name}${detail ? ': ' + detail : ''}`);
    failed++;
  }
}

// Read a file as UTF-8, returning '' when unreadable. Missing or unreadable
// files previously threw out of the script before the summary printed; an
// empty string makes the content checks below FAIL and be counted instead.
function readTextSafe(filePath) {
  try {
    return fs.readFileSync(filePath, 'utf8');
  } catch (e) {
    return '';
  }
}

// ---------------------------------------------------------------------------
// Structural tests — fixture files must exist before skill evaluation runs
// ---------------------------------------------------------------------------

console.log('\n[guard] Structural integrity checks');

assert(
  'fixture/package.json exists',
  fs.existsSync(path.join(FIXTURE_DIR, 'package.json'))
);

assert(
  'fixture/src/index.js exists',
  fs.existsSync(path.join(FIXTURE_DIR, 'src', 'index.js'))
);

assert(
  'fixture/src/skills/example-skill/SKILL.md exists',
  fs.existsSync(path.join(FIXTURE_DIR, 'src', 'skills', 'example-skill', 'SKILL.md'))
);

// Guard the readdir: a missing specs/ directory used to throw and abort the
// run before the summary was printed.
const specsDir = path.join(FIXTURE_DIR, 'specs');
assert(
  'fixture/specs/ contains at least one doing-*.md',
  fs.existsSync(specsDir) &&
    fs.readdirSync(specsDir).some(
      (f) => f.startsWith('doing-') && f.endsWith('.md')
    )
);

assert(
  'fixture/.deepflow/decisions.md exists',
  fs.existsSync(path.join(FIXTURE_DIR, '.deepflow', 'decisions.md'))
);

// ---------------------------------------------------------------------------
// Content tests — critical fields must be present in key files
// ---------------------------------------------------------------------------

console.log('\n[guard] Content validity checks');

const skillPath = path.join(FIXTURE_DIR, 'src', 'skills', 'example-skill', 'SKILL.md');
const skillContent = readTextSafe(skillPath);

assert(
  'SKILL.md has YAML frontmatter',
  skillContent.startsWith('---')
);

assert(
  'SKILL.md has allowed-tools',
  skillContent.includes('allowed-tools')
);

// Parse defensively: invalid JSON in package.json registers as failed
// checks rather than an uncaught exception.
let pkg = {};
try {
  pkg = JSON.parse(readTextSafe(path.join(FIXTURE_DIR, 'package.json')));
} catch (e) {
  // fall through with {} so the script checks below FAIL and are counted
}

assert(
  'package.json has test script',
  typeof pkg.scripts?.test === 'string'
);

assert(
  'package.json has build script',
  typeof pkg.scripts?.build === 'string'
);

// ---------------------------------------------------------------------------
// Summary
// ---------------------------------------------------------------------------

console.log(`\n[guard] ${passed} passed, ${failed} failed`);

if (failed > 0) {
  console.error('[guard] GUARD FAILED — iteration will be reverted');
  process.exit(1);
}

console.log('[guard] All guard checks passed');
process.exit(0);