npm - cclaw-cli - Versions diffs - 0.49.0 → 0.51.1 - Mend

cclaw-cli 0.49.0 → 0.51.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (183)

package/README.md +57 -84
package/dist/artifact-linter.d.ts +4 -0
package/dist/artifact-linter.js +24 -3
package/dist/cli.d.ts +1 -19
package/dist/cli.js +49 -491
package/dist/constants.d.ts +2 -13
package/dist/constants.js +1 -43
package/dist/content/closeout-guidance.d.ts +14 -0
package/dist/content/closeout-guidance.js +42 -0
package/dist/content/core-agents.js +55 -17
package/dist/content/decision-protocol.d.ts +12 -0
package/dist/content/decision-protocol.js +20 -0
package/dist/content/diff-command.d.ts +1 -2
package/dist/content/diff-command.js +8 -94
package/dist/content/examples.d.ts +4 -10
package/dist/content/examples.js +10 -20
package/dist/content/hook-events.js +2 -2
package/dist/content/hook-inline-snippets.d.ts +5 -2
package/dist/content/hook-inline-snippets.js +33 -1
package/dist/content/hook-manifest.d.ts +3 -4
package/dist/content/hook-manifest.js +11 -12
package/dist/content/hooks.js +44 -21
package/dist/content/ideate-command.d.ts +2 -0
package/dist/content/ideate-command.js +34 -25
package/dist/content/iron-laws.d.ts +5 -5
package/dist/content/iron-laws.js +5 -5
package/dist/content/language-policy.d.ts +2 -0
package/dist/content/language-policy.js +13 -0
package/dist/content/learnings.d.ts +3 -4
package/dist/content/learnings.js +26 -50
package/dist/content/meta-skill.js +33 -22
package/dist/content/next-command.js +41 -38
package/dist/content/node-hooks.js +17 -345
package/dist/content/opencode-plugin.js +5 -103
package/dist/content/research-playbooks.js +14 -14
package/dist/content/review-loop.d.ts +2 -0
package/dist/content/review-loop.js +8 -0
package/dist/content/session-hooks.js +15 -47
package/dist/content/skills.d.ts +0 -5
package/dist/content/skills.js +55 -128
package/dist/content/stage-common-guidance.d.ts +0 -1
package/dist/content/stage-common-guidance.js +17 -14
package/dist/content/stage-schema.d.ts +26 -1
package/dist/content/stage-schema.js +121 -40
package/dist/content/stages/_lint-metadata/index.js +9 -15
package/dist/content/stages/brainstorm.js +22 -43
package/dist/content/stages/design.js +37 -57
package/dist/content/stages/plan.js +22 -13
package/dist/content/stages/review.js +24 -27
package/dist/content/stages/scope.js +34 -46
package/dist/content/stages/ship.js +7 -4
package/dist/content/stages/spec.js +20 -9
package/dist/content/stages/tdd.js +64 -44
package/dist/content/start-command.js +13 -12
package/dist/content/status-command.d.ts +2 -7
package/dist/content/status-command.js +19 -146
package/dist/content/subagents.d.ts +0 -5
package/dist/content/subagents.js +51 -28
package/dist/content/templates.d.ts +1 -1
package/dist/content/templates.js +126 -135
package/dist/content/track-render-context.d.ts +17 -0
package/dist/content/track-render-context.js +44 -0
package/dist/content/tree-command.d.ts +1 -2
package/dist/content/tree-command.js +4 -87
package/dist/content/utility-skills.d.ts +2 -29
package/dist/content/utility-skills.js +2 -1534
package/dist/content/view-command.js +31 -11
package/dist/delegation.d.ts +1 -1
package/dist/delegation.js +5 -15
package/dist/doctor-registry.js +20 -21
package/dist/doctor.js +88 -344
package/dist/flow-state.d.ts +3 -0
package/dist/flow-state.js +2 -0
package/dist/harness-adapters.d.ts +1 -1
package/dist/harness-adapters.js +51 -58
package/dist/install.js +128 -358
package/dist/internal/advance-stage.js +3 -9
package/dist/internal/compound-readiness.d.ts +1 -1
package/dist/internal/compound-readiness.js +1 -1
package/dist/internal/tdd-loop-status.d.ts +1 -1
package/dist/internal/tdd-loop-status.js +1 -1
package/dist/knowledge-store.d.ts +16 -10
package/dist/knowledge-store.js +51 -15
package/dist/policy.js +16 -105
package/dist/run-archive.d.ts +4 -6
package/dist/run-archive.js +15 -20
package/dist/run-persistence.d.ts +2 -2
package/dist/run-persistence.js +3 -9
package/package.json +1 -2
package/dist/content/archive-command.d.ts +0 -2
package/dist/content/archive-command.js +0 -124
package/dist/content/compound-command.d.ts +0 -5
package/dist/content/compound-command.js +0 -193
package/dist/content/contexts.d.ts +0 -18
package/dist/content/contexts.js +0 -24
package/dist/content/contracts.d.ts +0 -2
package/dist/content/contracts.js +0 -51
package/dist/content/doctor-references.d.ts +0 -2
package/dist/content/doctor-references.js +0 -150
package/dist/content/eval-scaffold.d.ts +0 -15
package/dist/content/eval-scaffold.js +0 -370
package/dist/content/feature-command.d.ts +0 -2
package/dist/content/feature-command.js +0 -123
package/dist/content/flow-map.d.ts +0 -23
package/dist/content/flow-map.js +0 -134
package/dist/content/harness-doc.d.ts +0 -2
package/dist/content/harness-doc.js +0 -202
package/dist/content/harness-playbooks.d.ts +0 -24
package/dist/content/harness-playbooks.js +0 -393
package/dist/content/harness-tool-refs.d.ts +0 -20
package/dist/content/harness-tool-refs.js +0 -268
package/dist/content/ops-command.d.ts +0 -2
package/dist/content/ops-command.js +0 -71
package/dist/content/protocols.d.ts +0 -7
package/dist/content/protocols.js +0 -215
package/dist/content/retro-command.d.ts +0 -2
package/dist/content/retro-command.js +0 -165
package/dist/content/rewind-command.d.ts +0 -2
package/dist/content/rewind-command.js +0 -106
package/dist/content/tdd-log-command.d.ts +0 -2
package/dist/content/tdd-log-command.js +0 -85
package/dist/eval/agents/single-shot.d.ts +0 -27
package/dist/eval/agents/single-shot.js +0 -79
package/dist/eval/agents/with-tools.d.ts +0 -44
package/dist/eval/agents/with-tools.js +0 -261
package/dist/eval/agents/workflow.d.ts +0 -31
package/dist/eval/agents/workflow.js +0 -155
package/dist/eval/baseline.d.ts +0 -38
package/dist/eval/baseline.js +0 -282
package/dist/eval/config-loader.d.ts +0 -14
package/dist/eval/config-loader.js +0 -395
package/dist/eval/corpus.d.ts +0 -30
package/dist/eval/corpus.js +0 -330
package/dist/eval/cost-guard.d.ts +0 -102
package/dist/eval/cost-guard.js +0 -190
package/dist/eval/diff.d.ts +0 -64
package/dist/eval/diff.js +0 -323
package/dist/eval/llm-client.d.ts +0 -176
package/dist/eval/llm-client.js +0 -267
package/dist/eval/mode.d.ts +0 -28
package/dist/eval/mode.js +0 -61
package/dist/eval/progress.d.ts +0 -83
package/dist/eval/progress.js +0 -59
package/dist/eval/report.d.ts +0 -11
package/dist/eval/report.js +0 -181
package/dist/eval/rubric-loader.d.ts +0 -20
package/dist/eval/rubric-loader.js +0 -143
package/dist/eval/runner.d.ts +0 -81
package/dist/eval/runner.js +0 -746
package/dist/eval/runs.d.ts +0 -41
package/dist/eval/runs.js +0 -114
package/dist/eval/sandbox.d.ts +0 -38
package/dist/eval/sandbox.js +0 -137
package/dist/eval/tools/glob.d.ts +0 -2
package/dist/eval/tools/glob.js +0 -163
package/dist/eval/tools/grep.d.ts +0 -2
package/dist/eval/tools/grep.js +0 -152
package/dist/eval/tools/index.d.ts +0 -7
package/dist/eval/tools/index.js +0 -35
package/dist/eval/tools/read.d.ts +0 -2
package/dist/eval/tools/read.js +0 -122
package/dist/eval/tools/types.d.ts +0 -49
package/dist/eval/tools/types.js +0 -41
package/dist/eval/tools/write.d.ts +0 -2
package/dist/eval/tools/write.js +0 -92
package/dist/eval/types.d.ts +0 -561
package/dist/eval/types.js +0 -47
package/dist/eval/verifiers/judge.d.ts +0 -40
package/dist/eval/verifiers/judge.js +0 -256
package/dist/eval/verifiers/rules.d.ts +0 -24
package/dist/eval/verifiers/rules.js +0 -218
package/dist/eval/verifiers/structural.d.ts +0 -14
package/dist/eval/verifiers/structural.js +0 -171
package/dist/eval/verifiers/traceability.d.ts +0 -23
package/dist/eval/verifiers/traceability.js +0 -84
package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
package/dist/eval/verifiers/workflow-consistency.js +0 -225
package/dist/eval/workflow-corpus.d.ts +0 -7
package/dist/eval/workflow-corpus.js +0 -207
package/dist/feature-system.d.ts +0 -42
package/dist/feature-system.js +0 -432
package/dist/internal/knowledge-digest.d.ts +0 -7
package/dist/internal/knowledge-digest.js +0 -93

package/dist/eval/diff.d.ts DELETED

@@ -1,64 +0,0 @@
-import type { EvalReport } from "./types.js";
-export interface EvalDiffInput {
-    projectRoot: string;
-    /** Version string, filename, or "latest". */
-    old: string;
-    /** Version string, filename, or "latest". */
-    new: string;
-}
-export interface EvalDiffCaseEntry {
-    caseId: string;
-    stage: string;
-    /** Pass/fail transition: `same`, `regressed`, `recovered`, `added`, `removed`. */
-    transition: "same" | "regressed" | "recovered" | "added" | "removed";
-    previousPassed?: boolean;
-    currentPassed?: boolean;
-    durationDeltaMs?: number;
-    costDeltaUsd?: number;
-    verifierDeltas: EvalDiffVerifierEntry[];
-    stageDeltas?: EvalDiffStageEntry[];
-}
-export interface EvalDiffVerifierEntry {
-    verifierId: string;
-    kind: string;
-    transition: "same" | "regressed" | "recovered" | "added" | "removed" | "score-drop";
-    previousScore?: number;
-    currentScore?: number;
-    previousOk?: boolean;
-    currentOk?: boolean;
-}
-export interface EvalDiffStageEntry {
-    stage: string;
-    durationDeltaMs: number;
-    costDeltaUsd: number;
-    turnsDelta: number;
-    callsDelta: number;
-}
-export interface EvalDiffReport {
-    old: EvalDiffReportMeta;
-    new: EvalDiffReportMeta;
-    summaryDelta: {
-        totalCasesDelta: number;
-        passedDelta: number;
-        failedDelta: number;
-        skippedDelta: number;
-        totalCostUsdDelta: number;
-        totalDurationMsDelta: number;
-    };
-    cases: EvalDiffCaseEntry[];
-    /** True when any case regressed or any verifier dropped. */
-    regressed: boolean;
-}
-export interface EvalDiffReportMeta {
-    runId: string;
-    cclawVersion: string;
-    generatedAt: string;
-    mode: string;
-    model: string;
-    sourcePath: string;
-}
-export declare function resolveReportPath(projectRoot: string, selector: string): Promise<string>;
-export declare function diffReports(previous: EvalReport, current: EvalReport, prevPath: string, currPath: string): EvalDiffReport;
-export declare function runEvalDiff(input: EvalDiffInput): Promise<EvalDiffReport>;
-/** Render the diff as a terse human-readable Markdown block. */
-export declare function formatDiffMarkdown(diff: EvalDiffReport): string;

package/dist/eval/diff.js DELETED

@@ -1,323 +0,0 @@
-/**
- * `cclaw eval diff <old> <new>` — side-by-side report comparison.
- *
- * Loads two JSON reports under `.cclaw/evals/reports/` (by version tag or
- * explicit filename) and emits a compact human-readable + JSON diff:
- *
- *   - summary-level deltas (passed/failed/cost/duration)
- *   - per-case pass/fail transitions
- *   - per-verifier score drops (only the drops — new passes are noted in
- *     the summary line, not repeated per verifier)
- *   - Workflow-mode stage-level cost & duration deltas when both reports
- *     carry a `workflow` summary for the same case id
- *
- * The resolver accepts three shapes for the `<old>` / `<new>` arguments:
- *
- *   1. A bare version string (`0.26.0`) — matched against any report JSON
- *      whose `cclawVersion` field equals the string.
- *   2. A full or relative filename (`eval-2026-04-17T...-abc123.json`).
- *   3. The literal `latest` — picks the most recent report on disk by
- *      mtime.
- *
- * The diff is deterministic: sorted by case id, then verifier id. Missing
- * cases in one report show up as `added` or `removed` so callers can see
- * which corpus changes slipped in between versions.
- */
-import fs from "node:fs/promises";
-import path from "node:path";
-import { EVALS_ROOT } from "../constants.js";
-import { exists } from "../fs-utils.js";
-const SCORE_DROP_EPSILON = 0.0001;
-export async function resolveReportPath(projectRoot, selector) {
-    const dir = path.join(projectRoot, EVALS_ROOT, "reports");
-    if (!(await exists(dir))) {
-        throw new Error(`No reports directory at ${path.relative(projectRoot, dir)}. ` +
-            `Run \`cclaw eval\` at least once before comparing reports.`);
-    }
-    const trimmed = selector.trim();
-    if (trimmed.length === 0) {
-        throw new Error(`Empty report selector. Pass a version like "0.26.0" or "latest".`);
-    }
-    // 1. Explicit filename (absolute or relative).
-    const asPath = path.isAbsolute(trimmed) ? trimmed : path.join(dir, trimmed);
-    if (await exists(asPath))
-        return asPath;
-    if (trimmed.endsWith(".json") && (await exists(asPath)))
-        return asPath;
-    const entries = await fs.readdir(dir, { withFileTypes: true });
-    const jsonFiles = entries
-        .filter((e) => e.isFile() && e.name.endsWith(".json"))
-        .map((e) => path.join(dir, e.name));
-    if (jsonFiles.length === 0) {
-        throw new Error(`No JSON reports found under ${path.relative(projectRoot, dir)}.`);
-    }
-    if (trimmed === "latest") {
-        let latest = jsonFiles[0];
-        let latestMtime = (await fs.stat(latest)).mtimeMs;
-        for (const f of jsonFiles.slice(1)) {
-            const stat = await fs.stat(f);
-            if (stat.mtimeMs > latestMtime) {
-                latest = f;
-                latestMtime = stat.mtimeMs;
-            }
-        }
-        return latest;
-    }
-    // 3. Version match — pick most recent by mtime among matches.
-    const matches = [];
-    for (const file of jsonFiles) {
-        try {
-            const raw = await fs.readFile(file, "utf8");
-            const parsed = JSON.parse(raw);
-            if (parsed.cclawVersion === trimmed) {
-                const stat = await fs.stat(file);
-                matches.push({ file, mtimeMs: stat.mtimeMs });
-            }
-        }
-        catch {
-            continue;
-        }
-    }
-    if (matches.length === 0) {
-        throw new Error(`No report matched selector "${selector}". ` +
-            `Pass a filename under ${path.relative(projectRoot, dir)} or a cclawVersion present in one of the reports.`);
-    }
-    matches.sort((a, b) => b.mtimeMs - a.mtimeMs);
-    return matches[0].file;
-}
-async function loadReport(filePath) {
-    const raw = await fs.readFile(filePath, "utf8");
-    const parsed = JSON.parse(raw);
-    if (parsed.schemaVersion !== 1 || !Array.isArray(parsed.cases)) {
-        throw new Error(`File at ${filePath} is not a valid cclaw eval report (missing schemaVersion or cases).`);
-    }
-    return parsed;
-}
-function meta(report, sourcePath) {
-    return {
-        runId: report.runId,
-        cclawVersion: report.cclawVersion,
-        generatedAt: report.generatedAt,
-        mode: report.mode,
-        model: report.model,
-        sourcePath
-    };
-}
-function verifierMap(results) {
-    const out = new Map();
-    for (const v of results)
-        out.set(v.id, v);
-    return out;
-}
-function diffCase(caseId, previous, current) {
-    const stage = (current ?? previous).stage;
-    if (!previous) {
-        return {
-            caseId,
-            stage,
-            transition: "added",
-            currentPassed: current?.passed,
-            verifierDeltas: []
-        };
-    }
-    if (!current) {
-        return {
-            caseId,
-            stage,
-            transition: "removed",
-            previousPassed: previous.passed,
-            verifierDeltas: []
-        };
-    }
-    const transition = previous.passed === current.passed
-        ? "same"
-        : previous.passed && !current.passed
-            ? "regressed"
-            : "recovered";
-    const prevMap = verifierMap(previous.verifierResults);
-    const currMap = verifierMap(current.verifierResults);
-    const verifierDeltas = [];
-    const allIds = new Set([...prevMap.keys(), ...currMap.keys()]);
-    for (const id of [...allIds].sort((a, b) => a.localeCompare(b))) {
-        const p = prevMap.get(id);
-        const c = currMap.get(id);
-        const kind = (c ?? p).kind;
-        if (!p && c) {
-            verifierDeltas.push({
-                verifierId: id,
-                kind,
-                transition: "added",
-                currentOk: c.ok,
-                ...(c.score !== undefined ? { currentScore: c.score } : {})
-            });
-            continue;
-        }
-        if (p && !c) {
-            verifierDeltas.push({
-                verifierId: id,
-                kind,
-                transition: "removed",
-                previousOk: p.ok,
-                ...(p.score !== undefined ? { previousScore: p.score } : {})
-            });
-            continue;
-        }
-        if (!p || !c)
-            continue;
-        const okChanged = p.ok !== c.ok;
-        const scoreChanged = typeof p.score === "number" &&
-            typeof c.score === "number" &&
-            Math.abs(p.score - c.score) > SCORE_DROP_EPSILON;
-        if (!okChanged && !scoreChanged)
-            continue;
-        const entry = {
-            verifierId: id,
-            kind,
-            transition: okChanged
-                ? p.ok
-                    ? "regressed"
-                    : "recovered"
-                : typeof p.score === "number" &&
-                    typeof c.score === "number" &&
-                    c.score < p.score
-                    ? "score-drop"
-                    : "same",
-            previousOk: p.ok,
-            currentOk: c.ok
-        };
-        if (typeof p.score === "number")
-            entry.previousScore = p.score;
-        if (typeof c.score === "number")
-            entry.currentScore = c.score;
-        if (entry.transition !== "same")
-            verifierDeltas.push(entry);
-    }
-    const caseEntry = {
-        caseId,
-        stage,
-        transition,
-        previousPassed: previous.passed,
-        currentPassed: current.passed,
-        durationDeltaMs: current.durationMs - previous.durationMs,
-        verifierDeltas
-    };
-    const costDelta = (current.costUsd ?? 0) - (previous.costUsd ?? 0);
-    if (Math.abs(costDelta) > SCORE_DROP_EPSILON) {
-        caseEntry.costDeltaUsd = Number(costDelta.toFixed(6));
-    }
-    if (previous.workflow && current.workflow) {
-        const prevStages = new Map();
-        for (const s of previous.workflow.stages)
-            prevStages.set(s.stage, s);
-        const stageDeltas = [];
-        for (const curStage of current.workflow.stages) {
-            const prevStage = prevStages.get(curStage.stage);
-            if (!prevStage)
-                continue;
-            stageDeltas.push({
-                stage: curStage.stage,
-                durationDeltaMs: curStage.durationMs - prevStage.durationMs,
-                costDeltaUsd: Number((curStage.usageUsd - prevStage.usageUsd).toFixed(6)),
-                turnsDelta: curStage.toolUse.turns - prevStage.toolUse.turns,
-                callsDelta: curStage.toolUse.calls - prevStage.toolUse.calls
-            });
-        }
-        if (stageDeltas.length > 0)
-            caseEntry.stageDeltas = stageDeltas;
-    }
-    return caseEntry;
-}
-export function diffReports(previous, current, prevPath, currPath) {
-    const prevMap = new Map();
-    const currMap = new Map();
-    for (const c of previous.cases)
-        prevMap.set(c.caseId, c);
-    for (const c of current.cases)
-        currMap.set(c.caseId, c);
-    const allIds = new Set([...prevMap.keys(), ...currMap.keys()]);
-    const cases = [...allIds]
-        .sort((a, b) => a.localeCompare(b))
-        .map((id) => diffCase(id, prevMap.get(id), currMap.get(id)));
-    const regressed = cases.some((c) => c.transition === "regressed" ||
-        c.transition === "removed" ||
-        c.verifierDeltas.some((v) => v.transition === "regressed" || v.transition === "score-drop"));
-    return {
-        old: meta(previous, prevPath),
-        new: meta(current, currPath),
-        summaryDelta: {
-            totalCasesDelta: current.summary.totalCases - previous.summary.totalCases,
-            passedDelta: current.summary.passed - previous.summary.passed,
-            failedDelta: current.summary.failed - previous.summary.failed,
-            skippedDelta: current.summary.skipped - previous.summary.skipped,
-            totalCostUsdDelta: Number((current.summary.totalCostUsd - previous.summary.totalCostUsd).toFixed(6)),
-            totalDurationMsDelta: current.summary.totalDurationMs - previous.summary.totalDurationMs
-        },
-        cases,
-        regressed
-    };
-}
-export async function runEvalDiff(input) {
-    const [oldPath, newPath] = await Promise.all([
-        resolveReportPath(input.projectRoot, input.old),
-        resolveReportPath(input.projectRoot, input.new)
-    ]);
-    const [oldReport, newReport] = await Promise.all([
-        loadReport(oldPath),
-        loadReport(newPath)
-    ]);
-    return diffReports(oldReport, newReport, oldPath, newPath);
-}
-/** Render the diff as a terse human-readable Markdown block. */
-export function formatDiffMarkdown(diff) {
-    const lines = [];
-    lines.push(`# cclaw eval diff`);
-    lines.push(``);
-    lines.push(`- old: ${diff.old.cclawVersion} (${path.basename(diff.old.sourcePath)})`);
-    lines.push(`- new: ${diff.new.cclawVersion} (${path.basename(diff.new.sourcePath)})`);
-    lines.push(`- regressed: ${diff.regressed ? "yes" : "no"}`);
-    lines.push(``);
-    lines.push(`## Summary delta`);
-    lines.push(``);
-    const sd = diff.summaryDelta;
-    lines.push(`| metric | delta |`);
-    lines.push(`| --- | --- |`);
-    lines.push(`| total cases | ${sd.totalCasesDelta >= 0 ? "+" : ""}${sd.totalCasesDelta} |`);
-    lines.push(`| passed | ${sd.passedDelta >= 0 ? "+" : ""}${sd.passedDelta} |`);
-    lines.push(`| failed | ${sd.failedDelta >= 0 ? "+" : ""}${sd.failedDelta} |`);
-    lines.push(`| skipped | ${sd.skippedDelta >= 0 ? "+" : ""}${sd.skippedDelta} |`);
-    lines.push(`| cost (USD) | ${sd.totalCostUsdDelta >= 0 ? "+" : ""}${sd.totalCostUsdDelta.toFixed(4)} |`);
-    lines.push(`| duration (ms) | ${sd.totalDurationMsDelta >= 0 ? "+" : ""}${sd.totalDurationMsDelta} |`);
-    lines.push(``);
-    const noisyCases = diff.cases.filter((c) => c.transition !== "same" || c.verifierDeltas.length > 0);
-    if (noisyCases.length === 0) {
-        lines.push(`No case-level changes.`);
-        lines.push(``);
-        return `${lines.join("\n")}\n`;
-    }
-    lines.push(`## Case changes`);
-    lines.push(``);
-    lines.push(`| case id | stage | transition | prev | curr |`);
-    lines.push(`| --- | --- | --- | --- | --- |`);
-    for (const c of noisyCases) {
-        const prev = c.previousPassed === undefined ? "-" : c.previousPassed ? "pass" : "fail";
-        const curr = c.currentPassed === undefined ? "-" : c.currentPassed ? "pass" : "fail";
-        lines.push(`| ${c.caseId} | ${c.stage} | ${c.transition} | ${prev} | ${curr} |`);
-    }
-    lines.push(``);
-    const withVerifiers = noisyCases.filter((c) => c.verifierDeltas.length > 0);
-    if (withVerifiers.length > 0) {
-        lines.push(`## Verifier changes`);
-        lines.push(``);
-        lines.push(`| case id | verifier | kind | transition | prev score | curr score |`);
-        lines.push(`| --- | --- | --- | --- | --- | --- |`);
-        for (const c of withVerifiers) {
-            for (const v of c.verifierDeltas) {
-                const prev = v.previousScore !== undefined ? v.previousScore.toFixed(2) : "-";
-                const curr = v.currentScore !== undefined ? v.currentScore.toFixed(2) : "-";
-                lines.push(`| ${c.caseId} | ${v.verifierId} | ${v.kind} | ${v.transition} | ${prev} | ${curr} |`);
-            }
-        }
-        lines.push(``);
-    }
-    return `${lines.join("\n")}\n`;
-}

package/dist/eval/llm-client.d.ts DELETED

@@ -1,176 +0,0 @@
-import type { ClientOptions } from "openai";
-import type { ResolvedEvalConfig } from "./types.js";
-export interface ChatMessage {
-    role: "system" | "user" | "assistant" | "tool";
-    content: string;
-    name?: string;
-    toolCallId?: string;
-    /**
-     * OpenAI-style tool calls carried on a preceding assistant message.
-     * Populated by the with-tools loop so the wire transcript stays
-     * consistent (assistant message → tool responses).
-     */
-    toolCalls?: Array<{
-        id: string;
-        name: string;
-        arguments: string;
-    }>;
-}
-export interface ChatRequest {
-    model: string;
-    messages: ChatMessage[];
-    maxTokens?: number;
-    temperature?: number;
-    /** Per-call timeout override. Falls back to `config.timeoutMs`. */
-    timeoutMs?: number;
-    /**
-     * Ask the provider for a JSON-object response. The judge pipeline sets
-     * this; the agent-under-test usually leaves it unset.
-     */
-    responseFormatJson?: boolean;
-    /**
-     * Optional deterministic sampling seed. Providers that don't implement
-     * `seed` simply ignore it.
-     */
-    seed?: number;
-    /**
-     * Tool/function-calling definitions in OpenAI wire format. Populated only
-     * by agent/workflow modes. Ignored by the single-shot path.
-     */
-    tools?: unknown[];
-    toolChoice?: "auto" | "none";
-}
-export interface ChatUsage {
-    promptTokens: number;
-    completionTokens: number;
-    totalTokens: number;
-}
-export interface ChatResponse {
-    content: string;
-    toolCalls?: Array<{
-        id: string;
-        name: string;
-        arguments: string;
-    }>;
-    usage: ChatUsage;
-    finishReason: "stop" | "length" | "tool_calls" | "content_filter";
-    model: string;
-    attempts: number;
-}
-/** Base class so callers can `catch (err) { if (err instanceof EvalLlmError) ... }`. */
-export declare class EvalLlmError extends Error {
-    readonly retryable: boolean;
-    readonly status?: number;
-    constructor(message: string, opts: {
-        retryable: boolean;
-        status?: number;
-        cause?: unknown;
-    });
-}
-export declare class EvalLlmAuthError extends EvalLlmError {
-    constructor(cause: unknown);
-}
-export declare class EvalLlmConfigError extends EvalLlmError {
-    constructor(message: string, cause?: unknown);
-}
-export declare class EvalLlmTimeoutError extends EvalLlmError {
-    constructor(timeoutMs: number);
-}
-export declare class EvalLlmRateLimitedError extends EvalLlmError {
-    constructor(cause: unknown);
-}
-export declare class EvalLlmTransportError extends EvalLlmError {
-    constructor(cause: unknown, status?: number);
-}
-export declare class EvalLlmInvalidResponseError extends EvalLlmError {
-    constructor(message: string, details?: Record<string, unknown>);
-}
-export declare class EvalLlmNotConfiguredError extends EvalLlmError {
-    constructor();
-}
-/** Lightweight client abstraction shared across eval runners. */
-export interface EvalLlmClient {
-    chat(request: ChatRequest): Promise<ChatResponse>;
-}
-/**
- * Deprecated shim preserved so older wiring keeps compiling. Prefer
- * `EvalLlmNotConfiguredError` for the "caller forgot to provide an API
- * key" case.
- */
-export declare class EvalLlmNotWiredError extends EvalLlmNotConfiguredError {
-}
-/** `createEvalClient` options — mostly for tests to inject a fake transport. */
-export interface CreateEvalClientOptions {
-    /** Inject an `openai` stand-in. Used by unit tests to avoid real HTTP. */
-    openaiFactory?: (opts: ClientOptions) => OpenAILike;
-    /**
-     * Override the default retry/backoff policy. Honored by the internal
-     * retry loop; transport errors still fall back to the defaults when
-     * unset.
-     */
-    retryPolicy?: RetryPolicy;
-    /** Deterministic sleep used by the retry loop. Defaults to `setTimeout`. */
-    sleep?: (ms: number) => Promise<void>;
-    /**
-     * Observer invoked when a chat() call is about to sleep before the next
-     * retry attempt. Use this to surface "we are retrying" status via the
-     * progress logger so long, silent backoff windows become visible.
-     */
-    onRetry?: (event: {
-        attempt: number;
-        maxAttempts: number;
-        waitMs: number;
-        error: EvalLlmError;
-    }) => void;
-}
-export interface RetryPolicy {
-    /** Max retries *on top of* the initial attempt. 0 = single attempt. */
-    maxRetries: number;
-    /** Initial backoff in ms. Doubles each retry (capped at `maxBackoffMs`). */
-    initialBackoffMs: number;
-    /** Upper bound for a single sleep between attempts. */
-    maxBackoffMs: number;
-}
-export declare const DEFAULT_RETRY_POLICY: RetryPolicy;
-/**
- * Minimal OpenAI-SDK surface we depend on, declared here so tests can
- * substitute a plain object without pulling the real SDK into the test
- * runtime.
- */
-export interface OpenAILike {
-    chat: {
-        completions: {
-            create(body: Record<string, unknown>, options: {
-                signal: AbortSignal;
-            }): Promise<OpenAILikeChatResponse>;
-        };
-    };
-}
-interface OpenAILikeChatResponse {
-    model?: string;
-    choices: Array<{
-        message?: {
-            content?: string | null;
-            tool_calls?: Array<{
-                id: string;
-                function: {
-                    name: string;
-                    arguments: string;
-                };
-            }>;
-        };
-        finish_reason?: string | null;
-    }>;
-    usage?: {
-        prompt_tokens?: number;
-        completion_tokens?: number;
-        total_tokens?: number;
-    };
-}
-/**
- * Build a real client pointed at the configured endpoint. Throws
- * `EvalLlmNotConfiguredError` at call time (not construction time) when no
- * API key is available, so CLI help and dry-run paths stay offline-safe.
- */
-export declare function createEvalClient(config: ResolvedEvalConfig, options?: CreateEvalClientOptions): EvalLlmClient;
-export {};