npm - devlyn-cli - Versions diffs - 1.15.0 → 2.1.0 - Mend

devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158) hide show

package/benchmark/auto-resolve/fixtures/test-repo/package.json ADDED Viewed

@@ -0,0 +1,22 @@
+{
+  "name": "bench-test-repo",
+  "version": "0.1.0",
+  "private": true,
+  "description": "Deterministic base Node project for devlyn-cli auto-resolve benchmarks. Every fixture starts from a fresh copy of this directory.",
+  "bin": {
+    "bench-cli": "bin/cli.js"
+  },
+  "scripts": {
+    "cli": "node bin/cli.js",
+    "start": "node server/index.js",
+    "test": "node --test tests/",
+    "lint:json": "node scripts/lint-json.js"
+  },
+  "dependencies": {
+    "express": "4.19.2"
+  },
+  "devDependencies": {},
+  "engines": {
+    "node": ">=18.0.0"
+  }
+}

package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js ADDED Viewed

@@ -0,0 +1,17 @@
+// Playwright config used only by browser-validate benchmark fixtures.
+// Runs against web/index.html served via `npx serve web` (fixture setup.sh
+// starts the server). Keep config minimal.
+module.exports = {
+  testDir: './tests/e2e',
+  timeout: 30_000,
+  use: {
+    baseURL: 'http://127.0.0.1:5173',
+    headless: true,
+  },
+  webServer: {
+    command: 'npx --yes serve -l 5173 web',
+    port: 5173,
+    reuseExistingServer: !process.env.CI,
+    timeout: 15_000,
+  },
+};

package/benchmark/auto-resolve/fixtures/test-repo/server/index.js ADDED Viewed

@@ -0,0 +1,37 @@
+// Tiny Express server used by backend-contract fixtures. Intentionally small.
+const express = require('express');
+const app = express();
+app.use(express.json());
+const items = [
+  { id: 1, name: 'alpha', qty: 3 },
+  { id: 2, name: 'beta', qty: 5 },
+];
+app.get('/health', (_req, res) => {
+  res.json({ status: 'ok' });
+});
+app.get('/items', (_req, res) => {
+  res.json({ items });
+});
+app.get('/items/:id', (req, res) => {
+  const id = Number(req.params.id);
+  const item = items.find((it) => it.id === id);
+  if (!item) {
+    res.status(404).json({ error: 'not_found', id });
+    return;
+  }
+  res.json({ item });
+});
+if (require.main === module) {
+  const port = Number(process.env.PORT) || 3000;
+  app.listen(port, () => {
+    console.log(`bench-test-repo server listening on :${port}`);
+  });
+}
+module.exports = { app };

package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js ADDED Viewed

@@ -0,0 +1,25 @@
+const { test } = require('node:test');
+const assert = require('node:assert');
+const { execFileSync } = require('node:child_process');
+const path = require('node:path');
+const CLI = path.join(__dirname, '..', 'bin', 'cli.js');
+function run(args) {
+  return execFileSync('node', [CLI, ...args], { encoding: 'utf8' });
+}
+test('hello default', () => {
+  const out = run(['hello']);
+  assert.match(out, /Hello, world!/);
+});
+test('hello with --name', () => {
+  const out = run(['hello', '--name', 'alice']);
+  assert.match(out, /Hello, alice!/);
+});
+test('version prints package version', () => {
+  const out = run(['version']);
+  assert.match(out, /\d+\.\d+\.\d+/);
+});

package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js ADDED Viewed

@@ -0,0 +1,58 @@
+const { test } = require('node:test');
+const assert = require('node:assert');
+const http = require('node:http');
+const { app } = require('../server');
+function startServer() {
+  return new Promise((resolve) => {
+    const server = http.createServer(app);
+    server.listen(0, () => resolve(server));
+  });
+}
+function get(server, path) {
+  return new Promise((resolve, reject) => {
+    const { port } = server.address();
+    http
+      .get(`http://127.0.0.1:${port}${path}`, (res) => {
+        let body = '';
+        res.on('data', (chunk) => (body += chunk));
+        res.on('end', () => resolve({ status: res.statusCode, body: JSON.parse(body) }));
+      })
+      .on('error', reject);
+  });
+}
+test('GET /health returns ok', async () => {
+  const server = await startServer();
+  try {
+    const { status, body } = await get(server, '/health');
+    assert.strictEqual(status, 200);
+    assert.deepStrictEqual(body, { status: 'ok' });
+  } finally {
+    server.close();
+  }
+});
+test('GET /items returns list', async () => {
+  const server = await startServer();
+  try {
+    const { status, body } = await get(server, '/items');
+    assert.strictEqual(status, 200);
+    assert.ok(Array.isArray(body.items));
+    assert.ok(body.items.length >= 2);
+  } finally {
+    server.close();
+  }
+});
+test('GET /items/:id returns 404 for missing', async () => {
+  const server = await startServer();
+  try {
+    const { status, body } = await get(server, '/items/99999');
+    assert.strictEqual(status, 404);
+    assert.strictEqual(body.error, 'not_found');
+  } finally {
+    server.close();
+  }
+});

package/benchmark/auto-resolve/fixtures/test-repo/web/index.html ADDED Viewed

@@ -0,0 +1,37 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <title>bench-test-repo</title>
+    <style>
+      body {
+        font-family: system-ui, sans-serif;
+        margin: 2rem;
+        max-width: 40rem;
+      }
+      button {
+        padding: 0.5rem 1rem;
+        font-size: 1rem;
+      }
+      #output {
+        margin-top: 1rem;
+        padding: 1rem;
+        border: 1px solid #ccc;
+        border-radius: 4px;
+        min-height: 2rem;
+      }
+    </style>
+  </head>
+  <body>
+    <h1>bench-test-repo</h1>
+    <p>Minimal page used by browser-validate benchmark fixtures.</p>
+    <button id="greet">Greet</button>
+    <div id="output" data-testid="output"></div>
+    <script>
+      // Clicking the #greet button writes a fixed greeting into #output.
+      // NOTE(review): the ids and the exact text are presumably what the
+      // browser-validate fixtures assert on; confirm before changing them.
+      document.getElementById('greet').addEventListener('click', () => {
+        const out = document.getElementById('output');
+        // Assigned via textContent, so the string is inserted as plain text.
+        out.textContent = 'Hello from bench-test-repo';
+      });
+    </script>
+  </body>
+</html>

package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py ADDED Viewed

@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+"""Build the iter-0033c pair-eligible manifest (Codex R0/R0.5 + R0-infra/R0.5-infra).
+Manifest captures the immutable Gate-3 input to iter-0033c-compare.py:
+  - which fixtures are pair-eligible (high-value ∪ L1≤L0 ∪ F9-if-iter-0033a-passed)
+  - what the Gate-3 threshold count is
+  - sha256 over the canonical document so any post-write tampering is detectable
+Hashing pattern is the pre-stamp form lifted from
+benchmark/auto-resolve/scripts/pair-plan-lint.py:81-91 — deep-copy the manifest,
+zero out `manifest_sha256`, serialize with `sort_keys=True, separators=(",",":"),
+ensure_ascii=False, allow_nan=False`, then sha256 the bytes.
+Inputs (all required):
+  --c1-summary <path>        iter-0033 (C1) summary.json (selection grounds; never a comparison baseline)
+  --f9-judge <path>          iter-0033a F9 judge.json (F9 inclusion proof)
+  --l1-rerun-summary <path>  L1 rerun summary at iter-0033c HEAD (fresh baseline)
+  --output <path>            destination .devlyn/manifests/iter-0033c-pair-eligible.json
+Selection rule (frozen pre-registration, iter-0033c §"Pair-eligible fixture set"):
+  high_value = {F2, F3, F4, F6, F7}
+  promoted_by_l1_le_l0 = {f ∈ C1 summary | solo_claude.score ≤ bare.score}
+  conditional_excluded = {F1, F5}    # promoted only if L1≤L0
+  reporting_only = {F8}              # excluded from Gate 3
+  pair_eligible = high_value ∪ promoted_by_l1_le_l0 ∪ {F9 if iter-0033a passed}
+                  − reporting_only
+                  − conditional_excluded that did not get promoted
+"""
+import argparse
+import copy
+import hashlib
+import json
+import subprocess
+import sys
+from pathlib import Path
+HIGH_VALUE = ["F2", "F3", "F4", "F6", "F7"]
+CONDITIONAL = ["F1", "F5"]
+REPORTING_ONLY = ["F8"]
+def file_sha256(path: Path) -> str:
+    return hashlib.sha256(path.read_bytes()).hexdigest()
+def canonical_manifest_sha256(manifest: dict) -> str:
+    """Pre-stamp hash per pair-plan-lint.py:81-91 — zero out the stamp, then sha256."""
+    pre = copy.deepcopy(manifest)
+    pre["manifest_sha256"] = ""
+    s = json.dumps(
+        pre,
+        sort_keys=True,
+        separators=(",", ":"),
+        ensure_ascii=False,
+        allow_nan=False,
+    )
+    return hashlib.sha256(s.encode("utf-8")).hexdigest()
+def fixture_short_id(full: str) -> str:
+    """'F3-backend-contract-risk' -> 'F3'. Pure prefix; matches existing convention."""
+    return full.split("-", 1)[0] if "-" in full else full
+def compute_promoted_l1_le_l0(c1_rows: list) -> list:
+    """Return short fixture IDs (e.g. 'F3') where solo_claude.score ≤ bare.score in C1."""
+    promoted = []
+    for row in c1_rows:
+        arms = row.get("arms", {})
+        solo = arms.get("solo_claude", {}).get("score")
+        bare = arms.get("bare", {}).get("score")
+        if solo is None or bare is None:
+            continue
+        if solo <= bare:
+            promoted.append(fixture_short_id(row["fixture"]))
+    return promoted
+def f9_passed(f9_judge: dict) -> bool:
+    """iter-0033a passed iff A score > B score AND A is not disqualified."""
+    a = f9_judge.get("a_score")
+    b = f9_judge.get("b_score")
+    dqs = f9_judge.get("disqualifiers") or {}
+    if a is None or b is None:
+        return False
+    return a > b and not bool(dqs.get("A", False))
+def head_sha() -> str:
+    try:
+        out = subprocess.check_output(
+            ["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL
+        )
+        return out.decode().strip()
+    except Exception:
+        return ""
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--c1-summary", required=True)
+    ap.add_argument("--f9-judge", required=True)
+    ap.add_argument("--l1-rerun-summary", required=True)
+    ap.add_argument("--output", required=True)
+    args = ap.parse_args()
+    c1_path = Path(args.c1_summary)
+    f9_path = Path(args.f9_judge)
+    l1_path = Path(args.l1_rerun_summary)
+    out_path = Path(args.output)
+    for p, label in [(c1_path, "c1-summary"), (f9_path, "f9-judge"), (l1_path, "l1-rerun-summary")]:
+        if not p.is_file():
+            print(f"error: {label} not found: {p}", file=sys.stderr)
+            return 2
+    c1 = json.loads(c1_path.read_text())
+    f9 = json.loads(f9_path.read_text())
+    promoted = compute_promoted_l1_le_l0(c1.get("rows", []))
+    f9_in = f9_passed(f9)
+    pair_eligible = list(HIGH_VALUE)  # frozen high-value list, ordered
+    for fx in promoted:
+        if fx not in pair_eligible and fx not in REPORTING_ONLY:
+            pair_eligible.append(fx)
+    if f9_in and "F9" not in pair_eligible:
+        pair_eligible.append("F9")
+    pair_eligible = [fx for fx in pair_eligible if fx not in REPORTING_ONLY]
+    conditional_promoted = [fx for fx in CONDITIONAL if fx in promoted]
+    conditional_excluded = [fx for fx in CONDITIONAL if fx not in promoted]
+    pair_eligible_sorted = sorted(pair_eligible, key=lambda s: (s[0], int(s[1:])))
+    gate3_total = len(pair_eligible_sorted)
+    gate3_threshold = (gate3_total + 1) // 2  # ≥50% — ceil(gate3_total / 2)
+    manifest = {
+        "schema_version": "1.0",
+        "iter": "0033c",
+        "head": head_sha(),
+        "sources": {
+            "c1_summary": {"path": str(c1_path), "sha256": file_sha256(c1_path)},
+            "f9_judge": {"path": str(f9_path), "sha256": file_sha256(f9_path)},
+            "l1_rerun_summary": {"path": str(l1_path), "sha256": file_sha256(l1_path)},
+        },
+        "selection_rule": {
+            "high_value": HIGH_VALUE,
+            "promoted_by_l1_le_l0": sorted(set(promoted)),
+            "f9_included": f9_in,
+            "f9_passed_iter_0033a": f9_in,
+            "reporting_only": REPORTING_ONLY,
+            "conditional_excluded": conditional_excluded,
+            "conditional_promoted": conditional_promoted,
+        },
+        "fixtures_pair_eligible": pair_eligible_sorted,
+        "gate3_threshold_count": gate3_threshold,
+        "gate3_total": gate3_total,
+        "manifest_sha256": "",
+    }
+    manifest["manifest_sha256"] = canonical_manifest_sha256(manifest)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(manifest, indent=2) + "\n")
+    print(f"[manifest] wrote {out_path}")
+    print(f"[manifest] pair-eligible: {pair_eligible_sorted} "
+          f"(gate3 ≥ {gate3_threshold} / {gate3_total})")
+    print(f"[manifest] sha256: {manifest['manifest_sha256']}")
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

package/benchmark/auto-resolve/scripts/check-f9-artifacts.py ADDED Viewed

@@ -0,0 +1,256 @@
+#!/usr/bin/env python3
+"""F9 variant/solo arm artifact + transcript fingerprint check.
+Out-of-band per Codex R0.5 §B (iter-0033a): expected.json.verification_commands
+apply to ALL arms (run-fixture.sh:472), so a `docs/specs/**` check there would
+punish bare. This script runs AFTER run-fixture.sh and asserts variant/solo
+arms produced the artifacts the 2-skill ideate→resolve chain should emit.
+Bare arm is exempt by construction.
+Usage:
+  check-f9-artifacts.py --result-dir <results/<run_id>/F9-e2e-ideate-to-resolve/<arm>>
+Exits:
+  0 — all checks pass (or bare arm — exempt).
+  1 — variant/solo arm but artifact contract violated.
+  2 — invalid invocation (missing args, missing dir).
+Emits a small JSON report at <result-dir>/check-f9-artifacts.json.
+"""
+import argparse
+import json
+import os
+import re
+import sys
+from pathlib import Path
+VARIANT_ARMS = {"variant", "solo_claude", "l2_gated", "l2_forced"}
+EXEMPT_ARMS = {"bare"}
+SPEC_DIR_GLOB = "docs/specs/*/spec.md"
+SPEC_EXPECTED_GLOB = "docs/specs/*/spec.expected.json"
+# Transcript fingerprint regexes (negative checks only — `claude -p`
+# transcript captures only the agent's final reply, not intermediate
+# tool calls; positive resolve invocation evidence lives in state).
+RE_AUTO_RESOLVE = re.compile(r"/devlyn:auto-resolve\b")
+RE_PREFLIGHT = re.compile(r"/devlyn:preflight\b")
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
+    p.add_argument("--result-dir", required=True,
+                   help="Path to results/<run_id>/<fixture>/<arm>/")
+    args = p.parse_args()
+    result_dir = Path(args.result_dir)
+    if not result_dir.is_dir():
+        print(f"error: result dir not found: {result_dir}", file=sys.stderr)
+        return 2
+    arm = result_dir.name
+    fixture = result_dir.parent.name
+    if fixture != "F9-e2e-ideate-to-resolve":
+        print(f"error: this script is F9-only (got fixture={fixture})", file=sys.stderr)
+        return 2
+    report = {
+        "fixture": fixture,
+        "arm": arm,
+        "checks": [],
+        "exempt": False,
+        "pass": True,
+    }
+    if arm in EXEMPT_ARMS:
+        report["exempt"] = True
+        report["checks"].append({"name": "arm-is-bare-exempt", "pass": True})
+        _write_report(result_dir, report)
+        return 0
+    if arm not in VARIANT_ARMS:
+        print(f"error: unknown arm '{arm}' (expected one of {VARIANT_ARMS | EXEMPT_ARMS})",
+              file=sys.stderr)
+        return 2
+    # The fixture's work-dir is referenced from result_dir/timing.json. The
+    # arm produced files inside that work-dir; we glob from there.
+    timing_path = result_dir / "timing.json"
+    work_dir: Path
+    if timing_path.is_file():
+        try:
+            timing = json.loads(timing_path.read_text())
+            work_dir = Path(timing.get("work_dir", ""))
+        except Exception:
+            work_dir = Path("")
+    else:
+        work_dir = Path("")
+    if not work_dir.is_dir():
+        report["checks"].append({
+            "name": "work-dir-resolvable",
+            "pass": False,
+            "reason": f"work_dir from timing.json not usable: {work_dir!r}",
+        })
+        report["pass"] = False
+        _write_report(result_dir, report)
+        return 1
+    # Check 1: docs/specs/<id>-<slug>/spec.md exists.
+    specs_root = work_dir / "docs" / "specs"
+    spec_md_files = list(specs_root.glob("*/spec.md")) if specs_root.is_dir() else []
+    spec_md_present = bool(spec_md_files)
+    report["checks"].append({
+        "name": "spec.md-exists-under-docs/specs",
+        "pass": spec_md_present,
+        "matched": [str(p.relative_to(work_dir)) for p in spec_md_files],
+    })
+    if not spec_md_present:
+        report["pass"] = False
+    # Check 2: spec.expected.json exists at the same dir.
+    spec_exp_files = list(specs_root.glob("*/spec.expected.json")) if specs_root.is_dir() else []
+    spec_exp_present = bool(spec_exp_files)
+    report["checks"].append({
+        "name": "spec.expected.json-exists-under-docs/specs",
+        "pass": spec_exp_present,
+        "matched": [str(p.relative_to(work_dir)) for p in spec_exp_files],
+    })
+    if not spec_exp_present:
+        report["pass"] = False
+    # Path-shape regression: the parent dir name should be `<id>-<slug>` shape.
+    # Both id and slug are kebab-case, so the dir must contain at least one
+    # hyphen. Bare `<id>/spec.md` (no hyphen) is the legacy shape we reject.
+    if spec_md_files:
+        bad_shapes = [p for p in spec_md_files if "-" not in p.parent.name]
+        report["checks"].append({
+            "name": "path-shape-id-slug",
+            "pass": not bad_shapes,
+            "non_conforming": [str(p.relative_to(work_dir)) for p in bad_shapes],
+        })
+        if bad_shapes:
+            report["pass"] = False
+    # Resolve invocation evidence — primary source is pipeline.state.json,
+    # NOT transcript.txt. `claude -p` only emits the agent's final reply to
+    # stdout; intermediate Skill / Agent / Bash tool calls do not appear in
+    # transcript.txt. Therefore "regex /devlyn:resolve --spec in transcript"
+    # is the wrong source. The authoritative evidence resolve actually ran
+    # in --spec mode is `state.mode == "spec"` plus `state.source.type ==
+    # "spec"` plus a populated `state.source.spec_path` pointing under
+    # `docs/specs/`. Per state-schema.md this is single-source-of-truth.
+    # Look for the archive first (preferred), then fall back to the live
+    # in-flight location. NEW resolve currently lands artifacts directly in
+    # `.devlyn/` and may skip the move-to-runs/ archive step (TODO: separate
+    # iter to fix archive); both locations carry the same authoritative
+    # state shape.
+    archived_paths = list(work_dir.glob(".devlyn/runs/*/pipeline.state.json"))
+    live_path = work_dir / ".devlyn" / "pipeline.state.json"
+    state_paths = archived_paths if archived_paths else (
+        [live_path] if live_path.is_file() else []
+    )
+    if not state_paths:
+        report["checks"].append({
+            "name": "pipeline.state.json-present",
+            "pass": False,
+            "reason": "neither .devlyn/runs/*/pipeline.state.json nor .devlyn/pipeline.state.json found in work_dir",
+        })
+        report["pass"] = False
+    else:
+        # Read the most recent run.
+        state_path = sorted(state_paths)[-1]
+        try:
+            state = json.loads(state_path.read_text())
+        except Exception as exc:
+            report["checks"].append({
+                "name": "pipeline.state.json-parses",
+                "pass": False,
+                "reason": f"{exc.__class__.__name__}: {exc}",
+            })
+            report["pass"] = False
+            state = None
+        if state is not None:
+            archived = "/runs/" in str(state_path)
+            report["checks"].append({
+                "name": "pipeline.state.json-present",
+                "pass": True,
+                "path": str(state_path.relative_to(work_dir)),
+                "archived_to_runs_dir": archived,
+            })
+            if not archived:
+                # Not a fail — note for harness developer that NEW resolve
+                # is skipping the archive step in this run.
+                report["checks"].append({
+                    "name": "archive-step-completed",
+                    "pass": True,
+                    "warning": "NEW resolve left artifacts in .devlyn/ instead of .devlyn/runs/<id>/ — archive step skipped (separate iter for harness fix)",
+                })
+            mode = state.get("mode")
+            src_type = (state.get("source") or {}).get("type")
+            spec_path = (state.get("source") or {}).get("spec_path") or ""
+            spec_under_specs = spec_path.startswith("docs/specs/") and spec_path.endswith("spec.md")
+            mode_ok = mode == "spec"
+            src_ok = src_type == "spec"
+            report["checks"].append({
+                "name": "state.mode-and-source-spec",
+                "pass": mode_ok and src_ok and spec_under_specs,
+                "mode": mode,
+                "source.type": src_type,
+                "source.spec_path": spec_path,
+            })
+            if not (mode_ok and src_ok and spec_under_specs):
+                report["pass"] = False
+    # Transcript fingerprint — negative checks only. transcript.txt records
+    # the agent's final reply; if the agent (or any subagent) had invoked
+    # /devlyn:auto-resolve or /devlyn:preflight, the prompt-following gate
+    # should still surface the name in the summary. Positive resolve
+    # evidence lives in state above; here we just rule out the deprecated
+    # 3-skill chain names.
+    transcript_path = result_dir / "transcript.txt"
+    if not transcript_path.is_file():
+        report["checks"].append({
+            "name": "transcript-readable",
+            "pass": False,
+            "reason": f"transcript.txt missing at {transcript_path}",
+        })
+        report["pass"] = False
+        _write_report(result_dir, report)
+        return 1
+    transcript = transcript_path.read_text(errors="replace")
+    auto_resolve_hits = RE_AUTO_RESOLVE.findall(transcript)
+    report["checks"].append({
+        "name": "transcript-no-auto-resolve",
+        "pass": len(auto_resolve_hits) == 0,
+        "count": len(auto_resolve_hits),
+    })
+    if auto_resolve_hits:
+        report["pass"] = False
+    preflight_hits = RE_PREFLIGHT.findall(transcript)
+    report["checks"].append({
+        "name": "transcript-no-preflight",
+        "pass": len(preflight_hits) == 0,
+        "count": len(preflight_hits),
+    })
+    if preflight_hits:
+        report["pass"] = False
+    _write_report(result_dir, report)
+    return 0 if report["pass"] else 1
+def _write_report(result_dir: Path, report: dict) -> None:
+    out_path = result_dir / "check-f9-artifacts.json"
+    out_path.write_text(json.dumps(report, indent=2) + "\n")
+if __name__ == "__main__":
+    sys.exit(main())