npm - @pugi/cli - Versions diffs - 0.1.0-beta.100 → 0.1.0-beta.101 - Mend

@pugi/cli 0.1.0-beta.100 → 0.1.0-beta.101

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/README.md +2 -0
package/dist/core/codegraph/parser.js +574 -47
package/dist/core/codegraph/queries/go.scm +57 -0
package/dist/core/codegraph/queries/javascript.scm +56 -0
package/dist/core/codegraph/queries/python.scm +55 -0
package/dist/core/codegraph/queries/rust.scm +63 -0
package/dist/core/codegraph/queries/typescript.scm +91 -0
package/dist/core/codegraph/reindex.js +218 -0
package/dist/core/codegraph/resolve-edges.js +107 -0
package/dist/core/codegraph/watcher.js +440 -0
package/dist/core/diagnostics/probes/sandbox.js +7 -12
package/dist/core/engine/prompts.js +32 -0
package/dist/core/eval/v1/ledger.js +83 -0
package/dist/core/eval/v1/runner.js +280 -0
package/dist/core/eval/v1/scoring.js +68 -0
package/dist/core/eval/v1/task-loader.js +191 -0
package/dist/core/eval/v1/types.js +14 -0
package/dist/core/eval/v1/verifier.js +176 -0
package/dist/core/eval/v1/yaml-parser.js +250 -0
package/dist/core/sandboxing/adapter.js +31 -17
package/dist/core/sandboxing/bubblewrap.js +209 -0
package/dist/core/sandboxing/index.js +32 -3
package/dist/core/sandboxing/policy.js +97 -0
package/dist/core/sandboxing/seatbelt.js +69 -21
package/dist/core/settings.js +31 -7
package/dist/runtime/cli.js +58 -0
package/dist/runtime/commands/eval-v1.js +266 -0
package/dist/runtime/commands/index-cmd.js +125 -19
package/dist/runtime/commands/servers-cli.js +182 -0
package/dist/runtime/version.js +1 -1
package/dist/tools/bash.js +187 -3
package/package.json +10 -3

package/dist/core/eval/v1/runner.js ADDED Viewed

@@ -0,0 +1,280 @@
+/**
+ * Runner for pugi-eval-v1.
+ *
+ * Per task: spawn a fresh tmp workspace, copy fixture files, invoke
+ * the `pugi <command>` subprocess with the brief, capture stdout +
+ * exit code + wall-clock, then run the verification checks.
+ *
+ * The runner is deliberately subprocess-based - mirrors the smoke
+ * harness pattern in `core/smoke/headless-driver.ts`. Validating the
+ * AS-PUBLISHED CLI is the whole point of a benchmark; bypassing
+ * `bin/run.js` would let us miss whole categories of regression
+ * (loader cost, env propagation, exit-code handling).
+ *
+ * Tests inject a `runner` callback that returns a fake `RunCapture`
+ * so the meta-spec can exercise scoring + ledger without a real
+ * engine.
+ */
+import { spawn } from 'node:child_process';
+import { existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync, readdirSync, statSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { dirname, join, resolve } from 'node:path';
+import { computePugiScore } from './scoring.js';
+import { runVerifications } from './verifier.js';
+/**
+ * Default executor: spawn `pugi <command> "<brief>" --json --print`
+ * inside the workspace. The `--print` flag forces non-interactive
+ * mode; `--json` produces the structured envelope the runner parses
+ * for `tokensUsed` etc.
+ */
+export const subprocessRunner = async (input) => {
+    const args = [
+        input.spec.command,
+        '--print',
+        '--json',
+        '--intensity',
+        input.spec.intensity,
+        '--max-turns',
+        String(input.spec.maxTurns),
+    ];
+    if (input.model) {
+        args.push('--model', input.model);
+    }
+    args.push(input.spec.brief);
+    const child = spawn(input.pugiBin, args, {
+        cwd: input.workspaceRoot,
+        env: input.env,
+        stdio: ['ignore', 'pipe', 'pipe'],
+    });
+    const start = Date.now();
+    let stdout = '';
+    let stderr = '';
+    child.stdout?.on('data', (chunk) => {
+        stdout += chunk.toString('utf8');
+    });
+    child.stderr?.on('data', (chunk) => {
+        stderr += chunk.toString('utf8');
+    });
+    let timedOut = false;
+    const timer = setTimeout(() => {
+        timedOut = true;
+        try {
+            child.kill('SIGTERM');
+        }
+        catch {
+            /* noop */
+        }
+        setTimeout(() => {
+            try {
+                child.kill('SIGKILL');
+            }
+            catch {
+                /* noop */
+            }
+        }, 5_000);
+    }, input.spec.timeoutMs);
+    const onAbort = () => {
+        try {
+            child.kill('SIGTERM');
+        }
+        catch {
+            /* noop */
+        }
+    };
+    input.signal?.addEventListener('abort', onAbort);
+    const exitCode = await new Promise((resolveExit) => {
+        child.on('exit', (code) => resolveExit(code ?? -1));
+        child.on('error', () => resolveExit(-1));
+    });
+    clearTimeout(timer);
+    input.signal?.removeEventListener('abort', onAbort);
+    const wallClockMs = Date.now() - start;
+    const parsed = parseEnvelope(stdout);
+    const budgetExhausted = parsed?.status === 'budget_exceeded' ||
+        parsed?.status === 'budget_exhausted';
+    const engineError = parsed?.status === 'engine_unavailable' ||
+        parsed?.status === 'failed';
+    return {
+        stdout,
+        stderr,
+        exitCode,
+        wallClockMs,
+        tokensUsed: parsed?.tokensUsed ?? 0,
+        turnsUsed: parsed?.turnsUsed ?? 0,
+        toolCallCount: parsed?.toolCallCount ?? 0,
+        timedOut,
+        budgetExhausted,
+        engineError,
+    };
+};
+/**
+ * Parse the last JSON envelope from stdout. Pugi `--json` emits one
+ * JSON object per invocation; the runner scans for the final `{...}`
+ * block so warning lines before it do not break parsing.
+ */
+function parseEnvelope(stdout) {
+    const trimmed = stdout.trim();
+    if (trimmed === '')
+        return null;
+    // Try the entire trimmed payload first (common case).
+    try {
+        return JSON.parse(trimmed);
+    }
+    catch {
+        /* fall through to line scan */
+    }
+    const lines = trimmed.split(/\r?\n/);
+    for (let i = lines.length - 1; i >= 0; i -= 1) {
+        const line = lines[i].trim();
+        if (!line.startsWith('{'))
+            continue;
+        try {
+            return JSON.parse(line);
+        }
+        catch {
+            continue;
+        }
+    }
+    return null;
+}
+function walkFiles(root, prefix, out) {
+    let entries;
+    try {
+        entries = readdirSync(root);
+    }
+    catch {
+        return;
+    }
+    for (const entry of entries) {
+        if (entry === '.pugi' || entry === 'node_modules' || entry === '.git') {
+            continue;
+        }
+        const abs = join(root, entry);
+        const rel = prefix === '' ? entry : `${prefix}/${entry}`;
+        let st;
+        try {
+            st = statSync(abs);
+        }
+        catch {
+            continue;
+        }
+        if (st.isDirectory()) {
+            walkFiles(abs, rel, out);
+        }
+        else if (st.isFile()) {
+            out.push(rel);
+        }
+    }
+}
+function classifyStatus(capture, verificationsAllPassed) {
+    if (capture.timedOut)
+        return 'timeout';
+    if (capture.budgetExhausted)
+        return 'budget_exhausted';
+    if (capture.engineError)
+        return 'engine_error';
+    if (capture.exitCode !== 0)
+        return 'fail';
+    return verificationsAllPassed ? 'pass' : 'fail';
+}
+export function prepareWorkspace(spec) {
+    const root = mkdtempSync(join(tmpdir(), `pugi-eval-v1-${spec.id}-`));
+    if (spec.fixture) {
+        for (const [relPath, body] of Object.entries(spec.fixture)) {
+            if (relPath.split(/[\\/]/).includes('..')) {
+                throw new Error(`eval-v1 task ${spec.id}: fixture path ${relPath} contains ..`);
+            }
+            const abs = resolve(root, relPath);
+            mkdirSync(dirname(abs), { recursive: true });
+            writeFileSync(abs, body, { mode: 0o644 });
+        }
+    }
+    const cleanup = () => {
+        try {
+            rmSync(root, { recursive: true, force: true });
+        }
+        catch {
+            /* swallow */
+        }
+    };
+    return { root, cleanup };
+}
+export async function runTaskWithCapture(spec, workspaceRoot, capture) {
+    const parsed = parseEnvelope(capture.stdout);
+    const finalText = parsed?.finalText ?? capture.stdout;
+    const verifications = runVerifications(spec.verification, {
+        workspaceRoot,
+        finalText,
+    });
+    const allPassed = verifications.every((v) => v.passed);
+    const status = classifyStatus(capture, allPassed);
+    const filesWritten = [];
+    walkFiles(workspaceRoot, '', filesWritten);
+    filesWritten.sort();
+    const base = {
+        taskId: spec.id,
+        status,
+        tokensUsed: capture.tokensUsed,
+        toolCallCount: capture.toolCallCount,
+        turnsUsed: capture.turnsUsed,
+        wallClockMs: capture.wallClockMs,
+        exitCode: capture.exitCode,
+        verifications,
+        finalText,
+        filesWritten,
+    };
+    const pugiScore = computePugiScore(base, spec);
+    return { ...base, pugiScore };
+}
+export async function runOneTask(spec, options) {
+    const ws = prepareWorkspace(spec);
+    try {
+        const capture = await options.runner({
+            spec,
+            workspaceRoot: ws.root,
+            pugiBin: options.pugiBin,
+            ...(options.model !== undefined ? { model: options.model } : {}),
+            env: options.env,
+            ...(options.signal !== undefined ? { signal: options.signal } : {}),
+        });
+        return await runTaskWithCapture(spec, ws.root, capture);
+    }
+    finally {
+        ws.cleanup();
+    }
+}
+export async function runHarness(input) {
+    const runner = input.options.runner ?? subprocessRunner;
+    const env = input.options.env ?? process.env;
+    const onlyFilter = input.options.only
+        ? new Set(input.options.only)
+        : null;
+    const out = [];
+    for (const spec of input.specs) {
+        if (onlyFilter && !onlyFilter.has(spec.id))
+            continue;
+        input.options.onTaskStart?.(spec);
+        const runOpts = {
+            pugiBin: input.options.pugiBin,
+            env,
+            runner,
+        };
+        if (input.options.model !== undefined) {
+            runOpts.model = input.options.model;
+        }
+        const result = await runOneTask(spec, runOpts);
+        out.push(result);
+        input.options.onTaskFinish?.(result);
+    }
+    if (onlyFilter && out.length === 0) {
+        throw new Error(`eval-v1: --task filter matched zero tasks (asked for ${[...onlyFilter].join(', ')})`);
+    }
+    // Verify path safety: workspace cleanup happened, no temp dirs
+    // leaked beyond tmpdir prefix.
+    if (!existsSync(tmpdir())) {
+        // pathological - tmpdir disappeared. Surface so CI fails loud.
+        throw new Error('eval-v1: tmpdir no longer exists post-run');
+    }
+    return out;
+}
+//# sourceMappingURL=runner.js.map

package/dist/core/eval/v1/scoring.js ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * pugi_score scoring formula for eval-v1.
+ *
+ * Per-task score (0..150, higher = better):
+ *
+ *   pugi_score = pass_rate                           * 100   // 0..100
+ *              + verification_completeness           *  50   // 0.. 50
+ *              - (tokens_used   / max_tokens)        *  30   // 0..-30
+ *              - (wall_clock_ms / timeout_ms)        *  20   // 0..-20
+ *
+ * Where:
+ *   - `pass_rate` is 1.0 if status is `pass`, 0 otherwise.
+ *   - `verification_completeness` is `passed_checks / total_checks`.
+ *   - The token and wall-clock ratios are each clamped to [0, 1], so
+ *     a run that overshoots its budget pays at most the full penalty
+ *     (this avoids runaway negative scores that would skew the
+ *     aggregate).
+ *
+ * Aggregate is the arithmetic mean across all per-task scores. Mean
+ * is defensible because every task contributes equally to the
+ * benchmark (we are not weighting by difficulty - eval-v2 may add
+ * weights once we have a baseline year of data).
+ */
+const PASS_WEIGHT = 100; // points granted when the task status is `pass`
+const VERIFICATION_WEIGHT = 50; // scaled by the fraction of verification checks that passed
+const TOKEN_PENALTY = 30; // largest deduction for tokens used relative to spec.maxTokens
+const WALL_PENALTY = 20; // largest deduction for wall-clock time relative to spec.timeoutMs
+function clamp01(n) {
+    if (!Number.isFinite(n))
+        return 1;
+    if (n < 0)
+        return 0;
+    if (n > 1)
+        return 1;
+    return n;
+}
+/**
+ * Compute the per-task pugi_score. Called by the runner before
+ * appending the result to the ledger.
+ */
+export function computePugiScore(result, spec) {
+    const passRate = result.status === 'pass' ? 1 : 0;
+    const totalChecks = Math.max(1, result.verifications.length);
+    const passedChecks = result.verifications.filter((v) => v.passed).length;
+    const completeness = passedChecks / totalChecks;
+    const tokenPenalty = clamp01(result.tokensUsed / spec.maxTokens);
+    const wallPenalty = clamp01(result.wallClockMs / spec.timeoutMs);
+    const score = passRate * PASS_WEIGHT +
+        completeness * VERIFICATION_WEIGHT -
+        tokenPenalty * TOKEN_PENALTY -
+        wallPenalty * WALL_PENALTY;
+    // Clamp к [0, 150] so a partial-completeness fail with non-zero
+    // penalties never produces a negative aggregate. The penalties are
+    // already clamped to [0, 1] individually; this final clamp protects
+    // the documented range invariant when verification_completeness is
+    // 0 AND budget penalties land.
+    const clamped = Math.max(0, Math.min(150, score));
+    return Math.round(clamped * 100) / 100;
+}
+/**
+ * Aggregate score across an entire harness run. Mean by design.
+ */
+export function aggregateScore(results) {
+    if (results.length === 0)
+        return 0;
+    const sum = results.reduce((acc, r) => acc + r.pugiScore, 0);
+    return Math.round((sum / results.length) * 100) / 100;
+}
+//# sourceMappingURL=scoring.js.map

package/dist/core/eval/v1/task-loader.js ADDED Viewed

@@ -0,0 +1,191 @@
+/**
+ * Task loader for pugi-eval-v1.
+ *
+ * Reads `<root>/eval/v1/tasks/<NN>-<slug>.task.yml`, parses via the
+ * embedded minimal YAML parser, validates via Zod, then asserts the
+ * filename matches the parsed `id` so a refactor cannot silently
+ * decouple the basename from the schema id.
+ *
+ * The loader also exposes `readManifest` and `diffManifest`, which
+ * together form the spec-side gate: task content is pinned via sha256
+ * and byte length, and any mismatch is reported (the frozen-benchmark
+ * invariant from backlog #120).
+ */
+import { createHash } from 'node:crypto';
+import { readdirSync, readFileSync } from 'node:fs';
+import { basename, join, resolve } from 'node:path';
+import { z } from 'zod';
+import { parseTaskYaml } from './yaml-parser.js';
+const TASK_ID_RE = /^\d{2}-[a-z][a-z0-9-]*$/; // two digits, a dash, then a lowercase slug
+const TASK_FILENAME_RE = /^(\d{2}-[a-z][a-z0-9-]*)\.task\.yml$/; // capture group 1 is the task id
+const verificationSchema = z.discriminatedUnion('kind', [
+    z
+        .object({ kind: z.literal('file_exists'), path: z.string().min(1) })
+        .strict(),
+    z
+        .object({
+        kind: z.literal('file_contains'),
+        path: z.string().min(1),
+        pattern: z.string().min(1),
+        mode: z.enum(['literal', 'regex']).optional(),
+    })
+        .strict(),
+    z
+        .object({
+        kind: z.literal('output_contains'),
+        pattern: z.string().min(1),
+        mode: z.enum(['literal', 'regex']).optional(),
+    })
+        .strict(),
+    z
+        .object({
+        kind: z.literal('command_exit_code'),
+        command: z.string().min(1),
+        expectedExitCode: z.number().int(),
+        timeoutMs: z.number().int().positive().optional(),
+    })
+        .strict(),
+]); // one variant per verification kind; the `kind` field selects the variant
+const taskSchema = z
+    .object({
+    id: z.string().regex(TASK_ID_RE, 'id must match <NN>-<slug>'),
+    difficulty: z.enum(['simple', 'medium', 'hard']),
+    intensity: z.enum(['quick', 'standard', 'deep', 'marathon']),
+    command: z.enum(['code', 'fix', 'explain', 'plan', 'build']),
+    brief: z.string().min(1),
+    fixture: z.record(z.string(), z.string()).optional(), // string-to-string map; presumably path -> file contents, confirm against the runner
+    verification: z.array(verificationSchema).min(1), // at least one check is required
+    maxTokens: z.number().int().positive(),
+    maxTurns: z.number().int().positive(),
+    timeoutMs: z.number().int().positive(),
+})
+    .strict(); // strict: unknown keys fail validation
+/**
+ * Default tasks directory relative to the @pugi/cli workspace root.
+ * Tests inject a different directory; production resolves it through
+ * `defaultTasksDir`.
+ */
+export function defaultTasksDir(packageRoot) {
+    return resolve(packageRoot, 'eval', 'v1', 'tasks');
+}
+export function defaultManifestPath(packageRoot) {
+    return resolve(packageRoot, 'eval', 'v1', 'manifest.json');
+}
+export function defaultLedgerPath(packageRoot) {
+    return resolve(packageRoot, 'eval', 'v1', 'results.tsv');
+}
+export function listTaskFiles(tasksDir) {
+    let entries;
+    try {
+        entries = readdirSync(tasksDir);
+    }
+    catch (err) {
+        throw new Error(`eval-v1 tasks directory not found at ${tasksDir}: ${err.message}`);
+    }
+    const files = entries.filter((e) => TASK_FILENAME_RE.test(e)).sort();
+    return files.map((f) => join(tasksDir, f));
+}
+export function loadTaskFile(path) {
+    const raw = readFileSync(path, 'utf8');
+    const filenameMatch = TASK_FILENAME_RE.exec(basename(path));
+    if (!filenameMatch) {
+        throw new Error(`eval-v1: filename ${basename(path)} does not match <NN>-<slug>.task.yml`);
+    }
+    const expectedId = filenameMatch[1];
+    let parsed;
+    try {
+        parsed = parseTaskYaml(raw);
+    }
+    catch (err) {
+        throw new Error(`eval-v1 task ${basename(path)} failed YAML parse: ${err.message}`);
+    }
+    const result = taskSchema.safeParse(parsed);
+    if (!result.success) {
+        throw new Error(`eval-v1 task ${basename(path)} failed schema validation: ${result.error.message}`);
+    }
+    if (result.data.id !== expectedId) {
+        throw new Error(`eval-v1 task ${basename(path)} id field ${result.data.id} does not match filename ${expectedId}`);
+    }
+    return { path, raw, spec: result.data };
+}
+export function loadAllTasks(tasksDir) {
+    const files = listTaskFiles(tasksDir);
+    const loaded = files.map((f) => loadTaskFile(f));
+    // Detect duplicate ids that survived (cannot happen given filename
+    // regex but defensive against future refactors).
+    const seen = new Set();
+    for (const entry of loaded) {
+        if (seen.has(entry.spec.id)) {
+            throw new Error(`eval-v1: duplicate task id ${entry.spec.id}`);
+        }
+        seen.add(entry.spec.id);
+    }
+    return loaded;
+}
+export function manifestEntryFor(path, raw) {
+    const filenameMatch = TASK_FILENAME_RE.exec(basename(path));
+    if (!filenameMatch) {
+        throw new Error(`cannot derive manifest entry: filename ${basename(path)} does not match`);
+    }
+    const id = filenameMatch[1];
+    const sha = createHash('sha256').update(raw).digest('hex');
+    return { id, sha256: sha, byteLength: Buffer.byteLength(raw, 'utf8') };
+}
+export function readManifest(manifestPath) {
+    const raw = readFileSync(manifestPath, 'utf8');
+    const parsed = JSON.parse(raw);
+    const schema = z
+        .object({
+        schemaVersion: z.literal(1),
+        generatedAt: z.string(),
+        entries: z
+            .array(z
+            .object({
+            id: z.string().regex(TASK_ID_RE),
+            sha256: z.string().regex(/^[0-9a-f]{64}$/),
+            byteLength: z.number().int().nonnegative(),
+        })
+            .strict())
+            .min(1),
+    })
+        .strict();
+    const result = schema.safeParse(parsed);
+    if (!result.success) {
+        throw new Error(`eval-v1 manifest ${manifestPath} invalid: ${result.error.message}`);
+    }
+    return result.data;
+}
+/**
+ * Compare the on-disk task files against the committed manifest. Used
+ * by the meta-spec to enforce the frozen-benchmark invariant.
+ */
+export function diffManifest(tasks, manifest) {
+    const reasons = [];
+    const computed = new Map();
+    for (const t of tasks) {
+        const entry = manifestEntryFor(t.path, t.raw);
+        computed.set(entry.id, entry);
+    }
+    const declared = new Map();
+    for (const e of manifest.entries)
+        declared.set(e.id, e);
+    for (const [id, entry] of computed) {
+        const decl = declared.get(id);
+        if (!decl) {
+            reasons.push(`task ${id} present on disk but missing from manifest`);
+            continue;
+        }
+        if (decl.sha256 !== entry.sha256) {
+            reasons.push(`task ${id} sha256 mismatch (disk=${entry.sha256.slice(0, 12)} manifest=${decl.sha256.slice(0, 12)})`);
+        }
+        if (decl.byteLength !== entry.byteLength) {
+            reasons.push(`task ${id} byteLength mismatch (disk=${entry.byteLength} manifest=${decl.byteLength})`);
+        }
+    }
+    for (const [id] of declared) {
+        if (!computed.has(id)) {
+            reasons.push(`task ${id} declared in manifest but missing on disk`);
+        }
+    }
+    return { ok: reasons.length === 0, reasons };
+}
+//# sourceMappingURL=task-loader.js.map

package/dist/core/eval/v1/types.js ADDED Viewed

@@ -0,0 +1,14 @@
+/**
+ * pugi-eval-v1 type definitions (backlog #120, Reviewer foundation).
+ *
+ * Frozen benchmark harness types. The shapes here are stable: every
+ * field added later must preserve backward compatibility with the v1
+ * `results.tsv` ledger columns and the v1 task YAML schema. Breaking
+ * changes ship as `eval-v2`.
+ *
+ * Why types live in `core/eval/v1/` and not next to the CLI command:
+ * the meta-spec, ledger, scoring, and verifier all consume them. CLI
+ * command modules stay thin wrappers per the project convention.
+ */
+export {};
+//# sourceMappingURL=types.js.map