npm - cclaw-cli - Versions diffs - 0.25.0 → 0.26.0 - Mend

cclaw-cli 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/dist/cli.js +2 -1
package/dist/eval/agents/with-tools.d.ts +31 -0
package/dist/eval/agents/with-tools.js +255 -0
package/dist/eval/config-loader.js +34 -2
package/dist/eval/llm-client.d.ts +10 -0
package/dist/eval/llm-client.js +10 -1
package/dist/eval/report.js +19 -0
package/dist/eval/runner.js +50 -2
package/dist/eval/sandbox.d.ts +38 -0
package/dist/eval/sandbox.js +137 -0
package/dist/eval/tools/glob.d.ts +2 -0
package/dist/eval/tools/glob.js +163 -0
package/dist/eval/tools/grep.d.ts +2 -0
package/dist/eval/tools/grep.js +152 -0
package/dist/eval/tools/index.d.ts +7 -0
package/dist/eval/tools/index.js +35 -0
package/dist/eval/tools/read.d.ts +2 -0
package/dist/eval/tools/read.js +122 -0
package/dist/eval/tools/types.d.ts +49 -0
package/dist/eval/tools/types.js +41 -0
package/dist/eval/tools/write.d.ts +2 -0
package/dist/eval/tools/write.js +92 -0
package/dist/eval/types.d.ts +35 -0
package/package.json +1 -1

package/dist/eval/sandbox.js ADDED Viewed

@@ -0,0 +1,137 @@
+/**
+ * Per-case sandbox for the Tier B with-tools agent.
+ *
+ * Every case gets its own `os.tmpdir()/cclaw-eval-<uuid>/` directory. Any
+ * `contextFiles` the case declares are copied in relative to the project
+ * root, and every tool invocation resolves paths against the sandbox
+ * root with a defensive check that refuses absolute paths, `..` escapes,
+ * and symlinks whose real path lands outside the sandbox.
+ *
+ * Design notes:
+ *
+ * - The sandbox is intentionally tiny (one directory, no symlink
+ *   creation, no executable bits). We rely on `fs.realpath` on every
+ *   resolved path so hostile tool output that creates a symlink to
+ *   `/etc/passwd` and then tries to read it still trips the boundary
+ *   check.
+ * - Cleanup is handled by `dispose()`; callers (runner, tests) must
+ *   invoke it in a `try/finally` so leftover temp directories never
+ *   accumulate.
+ * - The sandbox does not preserve the project's directory structure
+ *   verbatim. Each entry in `contextFiles` is copied flat into
+ *   `sandboxRoot/<basename>` unless it contains path separators, in
+ *   which case the full relative layout is recreated. That keeps demo
+ *   cases portable while still letting richer cases place files under
+ *   subdirectories (e.g. `.cclaw/skills/brainstorming/SKILL.md`).
+ */
+import { randomUUID } from "node:crypto";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+/**
+ * Error raised when a requested path is rejected by the sandbox checks.
+ *
+ * `requestedPath` keeps the path exactly as the caller supplied it, so the
+ * refusal can be reported without parsing the message text.
+ */
+export class SandboxEscapeError extends Error {
+    /** The path as originally requested. */
+    requestedPath;
+    constructor(requestedPath, reason) {
+        const message = `Sandbox refused path "${requestedPath}": ${reason}.`;
+        super(message);
+        this.requestedPath = requestedPath;
+        this.name = "SandboxEscapeError";
+    }
+}
+/** Create and prep a fresh sandbox. Callers own cleanup via `dispose()`. */
+export async function createSandbox(options) {
+    const baseDir = options.baseDir ?? os.tmpdir();
+    const id = options.idOverride ?? randomUUID();
+    const root = path.join(baseDir, `cclaw-eval-${id}`);
+    await fs.mkdir(root, { recursive: true });
+    const realRoot = await fs.realpath(root);
+    if (options.contextFiles && options.contextFiles.length > 0) {
+        for (const rel of options.contextFiles) {
+            await copyContextFile(options.projectRoot, realRoot, rel);
+        }
+    }
+    async function resolveInside(requested, opts = {}) {
+        if (typeof requested !== "string" || requested.length === 0) {
+            throw new SandboxEscapeError(String(requested), "path must be a non-empty string");
+        }
+        if (path.isAbsolute(requested)) {
+            throw new SandboxEscapeError(requested, "absolute paths are not allowed");
+        }
+        if (requested.includes("\0")) {
+            throw new SandboxEscapeError(requested, "NUL byte in path");
+        }
+        const joined = path.resolve(realRoot, requested);
+        const relative = path.relative(realRoot, joined);
+        if (relative.startsWith("..") || path.isAbsolute(relative)) {
+            throw new SandboxEscapeError(requested, "resolves outside the sandbox");
+        }
+        let finalPath;
+        try {
+            finalPath = await fs.realpath(joined);
+        }
+        catch (err) {
+            if (!opts.allowMissing) {
+                throw new SandboxEscapeError(requested, `realpath failed: ${err.message}`);
+            }
+            const existingAncestor = await findExistingAncestor(joined, realRoot);
+            if (!existingAncestor) {
+                throw new SandboxEscapeError(requested, "no existing ancestor inside the sandbox");
+            }
+            const ancestorRel = path.relative(realRoot, existingAncestor.real);
+            if (ancestorRel.startsWith("..") || path.isAbsolute(ancestorRel)) {
+                throw new SandboxEscapeError(requested, "parent resolves outside the sandbox");
+            }
+            finalPath = path.join(existingAncestor.real, existingAncestor.trailing);
+        }
+        const finalRel = path.relative(realRoot, finalPath);
+        if (finalRel.startsWith("..") || path.isAbsolute(finalRel)) {
+            throw new SandboxEscapeError(requested, "realpath escapes the sandbox");
+        }
+        return finalPath;
+    }
+    return {
+        root: realRoot,
+        resolve: resolveInside,
+        async dispose() {
+            await fs.rm(realRoot, { recursive: true, force: true });
+        }
+    };
+}
+async function findExistingAncestor(target, stopAt) {
+    const segments = [];
+    let current = target;
+    while (true) {
+        try {
+            const real = await fs.realpath(current);
+            return { real, trailing: path.join(...segments.reverse()) };
+        }
+        catch {
+            const parent = path.dirname(current);
+            if (parent === current)
+                return undefined;
+            segments.push(path.basename(current));
+            if (path.relative(stopAt, parent).startsWith(".."))
+                return undefined;
+            current = parent;
+        }
+    }
+}
+async function copyContextFile(projectRoot, sandboxRoot, relPath) {
+    if (path.isAbsolute(relPath)) {
+        throw new Error(`context_files must be project-relative: ${relPath}`);
+    }
+    const src = path.resolve(projectRoot, relPath);
+    const srcReal = await fs.realpath(src);
+    const projectReal = await fs.realpath(projectRoot);
+    const inside = path.relative(projectReal, srcReal);
+    if (inside.startsWith("..") || path.isAbsolute(inside)) {
+        throw new Error(`context_files entry resolves outside the project: ${relPath}`);
+    }
+    const stat = await fs.stat(srcReal);
+    if (stat.isDirectory()) {
+        const dest = path.join(sandboxRoot, relPath);
+        await fs.mkdir(dest, { recursive: true });
+        await fs.cp(srcReal, dest, { recursive: true });
+        return;
+    }
+    const dest = path.join(sandboxRoot, relPath);
+    await fs.mkdir(path.dirname(dest), { recursive: true });
+    await fs.copyFile(srcReal, dest);
+}

package/dist/eval/tools/glob.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import { type SandboxTool } from "./types.js";
2	+ export declare const globTool: SandboxTool;

package/dist/eval/tools/glob.js ADDED Viewed

@@ -0,0 +1,163 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { SandboxEscapeError } from "../sandbox.js";
+import { parseArgs, requireString, truncatePayload } from "./types.js";
+// Model-facing description shipped in the tool descriptor.
+const DESCRIPTION = "List files inside the sandbox whose relative path matches a glob-style " +
+    "pattern. Supports `*` (any chars within a path segment) and `**` " +
+    "(any number of path segments). Returns matching paths, one per line.";
+// Upper bound on the number of paths included in one result.
+const MAX_MATCHES = 500;
+/**
+ * `glob` tool: lists regular files under the sandbox root whose relative
+ * path matches the given pattern.
+ *
+ * `invoke` never throws for bad input; every failure is reported as an
+ * `{ ok: false, name, error }` result. On success `content` holds the
+ * sorted matches joined by newlines (or "(no matches)"), capped at
+ * `MAX_MATCHES` entries and then passed through `truncatePayload`.
+ */
+export const globTool = {
+    descriptor: {
+        name: "glob",
+        description: DESCRIPTION,
+        parameters: {
+            type: "object",
+            additionalProperties: false,
+            required: ["pattern"],
+            properties: {
+                pattern: {
+                    type: "string",
+                    description: "Glob pattern, relative to the sandbox root."
+                }
+            }
+        }
+    },
+    async invoke(rawArgs, ctx) {
+        // NOTE(review): `parseArgs` / `requireString` live in ./types.js and
+        // are not visible here; a throw from either is treated as invalid input.
+        let args;
+        try {
+            args = parseArgs(rawArgs);
+        }
+        catch (err) {
+            return { ok: false, name: this.descriptor.name, error: err.message };
+        }
+        let pattern;
+        try {
+            pattern = requireString(args, "pattern");
+        }
+        catch (err) {
+            return { ok: false, name: this.descriptor.name, error: err.message };
+        }
+        if (pattern.includes("\0")) {
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: '"pattern" must not contain NUL bytes'
+            };
+        }
+        let regex;
+        try {
+            regex = globToRegExp(pattern);
+        }
+        catch (err) {
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: err.message
+            };
+        }
+        const matches = [];
+        try {
+            // The walk always starts at the sandbox root; the pattern only
+            // filters the paths that are found.
+            await walk(ctx.sandbox.root, "", matches, regex);
+        }
+        catch (err) {
+            // `walk` in this module swallows unreadable directories, so this
+            // branch is defensive.
+            if (err instanceof SandboxEscapeError) {
+                return {
+                    ok: false,
+                    name: this.descriptor.name,
+                    error: err.message,
+                    details: { deniedPath: pattern }
+                };
+            }
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: `walk failed: ${err.message}`
+            };
+        }
+        // Sort before capping so a truncated listing is deterministic.
+        matches.sort();
+        const capped = matches.slice(0, MAX_MATCHES);
+        const body = capped.length > 0
+            ? capped.join("\n") +
+                (matches.length > capped.length
+                    ? `\n…[truncated at ${MAX_MATCHES} matches]`
+                    : "")
+            : "(no matches)";
+        return {
+            ok: true,
+            name: this.descriptor.name,
+            content: truncatePayload(body, ctx.maxResultBytes),
+            details: {
+                pattern,
+                // `matches` counts what is returned; `totalMatches` what was found.
+                matches: capped.length,
+                totalMatches: matches.length,
+                truncated: matches.length > capped.length
+            }
+        };
+    }
+};
+async function walk(root, rel, acc, regex) {
+    const dir = path.join(root, rel);
+    let entries;
+    try {
+        entries = (await fs.readdir(dir, { withFileTypes: true }));
+    }
+    catch {
+        return;
+    }
+    for (const entry of entries) {
+        const childRel = rel ? path.join(rel, entry.name) : entry.name;
+        if (entry.isSymbolicLink())
+            continue;
+        if (entry.isDirectory()) {
+            await walk(root, childRel, acc, regex);
+            continue;
+        }
+        if (entry.isFile() && regex.test(childRel.replace(/\\/g, "/"))) {
+            acc.push(childRel);
+        }
+    }
+}
+/**
+ * Minimal glob → regex: `**` matches zero or more path segments, `*`
+ * matches anything except `/`, `?` matches a single non-slash char.
+ * Everything else is escaped. Intentionally narrower than full
+ * bash-style expansion so behavior is easy to reason about.
+ */
+function globToRegExp(pattern) {
+    const normalized = pattern.replace(/\\/g, "/");
+    let src = "^";
+    let i = 0;
+    while (i < normalized.length) {
+        const c = normalized[i];
+        if (c === "*") {
+            if (normalized[i + 1] === "*") {
+                if (normalized[i + 2] === "/") {
+                    src += "(?:.*/)?";
+                    i += 3;
+                }
+                else {
+                    src += ".*";
+                    i += 2;
+                }
+            }
+            else {
+                src += "[^/]*";
+                i += 1;
+            }
+        }
+        else if (c === "?") {
+            src += "[^/]";
+            i += 1;
+        }
+        else if ("+()|^$.{}[]\\".includes(c)) {
+            src += `\\${c}`;
+            i += 1;
+        }
+        else {
+            src += c;
+            i += 1;
+        }
+    }
+    src += "$";
+    return new RegExp(src);
+}

package/dist/eval/tools/grep.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import { type SandboxTool } from "./types.js";
2	+ export declare const grepTool: SandboxTool;

package/dist/eval/tools/grep.js ADDED Viewed

@@ -0,0 +1,152 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { SandboxEscapeError } from "../sandbox.js";
+import { parseArgs, requireString, optionalNumber, truncatePayload } from "./types.js";
+// Model-facing description shipped in the tool descriptor.
+const DESCRIPTION = "Search the sandbox for a regular expression. Returns matching lines in " +
+    "`path:line:text` form. Accepts optional `caseInsensitive` and a per-call " +
+    "`maxMatches` cap (default 100, hard max 500).";
+// Ceiling applied to any caller-supplied `maxMatches`.
+const HARD_MAX = 500;
+/**
+ * `grep` tool: searches every regular file under the sandbox root for a
+ * JavaScript regular expression, line by line.
+ *
+ * `invoke` never throws for bad input; every failure is reported as an
+ * `{ ok: false, name, error }` result. On success `content` holds the hits
+ * in `path:line:text` form joined by newlines (or "(no matches)"), passed
+ * through `truncatePayload`.
+ */
+export const grepTool = {
+    descriptor: {
+        name: "grep",
+        description: DESCRIPTION,
+        parameters: {
+            type: "object",
+            additionalProperties: false,
+            required: ["pattern"],
+            properties: {
+                pattern: {
+                    type: "string",
+                    description: "Regular expression compiled with JavaScript semantics."
+                },
+                caseInsensitive: {
+                    type: "boolean",
+                    description: "Match case-insensitively (default false)."
+                },
+                maxMatches: {
+                    type: "integer",
+                    minimum: 1,
+                    description: "Stop after N matches (default 100, hard max 500)."
+                }
+            }
+        }
+    },
+    async invoke(rawArgs, ctx) {
+        // NOTE(review): `parseArgs`, `requireString` and `optionalNumber` live
+        // in ./types.js and are not visible here; a throw is treated as invalid input.
+        let args;
+        try {
+            args = parseArgs(rawArgs);
+        }
+        catch (err) {
+            return { ok: false, name: this.descriptor.name, error: err.message };
+        }
+        let pattern;
+        try {
+            pattern = requireString(args, "pattern");
+        }
+        catch (err) {
+            return { ok: false, name: this.descriptor.name, error: err.message };
+        }
+        // Only a literal `true` enables case-insensitive matching.
+        const caseInsensitive = args.caseInsensitive === true;
+        let maxMatches;
+        try {
+            const raw = optionalNumber(args, "maxMatches");
+            // Clamp to [1, HARD_MAX]; fractional values are floored.
+            maxMatches = raw === undefined ? 100 : Math.min(HARD_MAX, Math.max(1, Math.floor(raw)));
+        }
+        catch (err) {
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: err.message
+            };
+        }
+        let regex;
+        try {
+            // No `g` flag, so `regex.test` keeps no state between lines.
+            regex = new RegExp(pattern, caseInsensitive ? "i" : "");
+        }
+        catch (err) {
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: `invalid regex: ${err.message}`
+            };
+        }
+        let filesScanned = 0;
+        const hits = [];
+        try {
+            // The visitor returns `false` once the cap is reached to ask the
+            // walk to stop, and `true` to continue.
+            await walk(ctx.sandbox.root, "", async (relPath, abs) => {
+                if (hits.length >= maxMatches)
+                    return false;
+                let content;
+                try {
+                    content = await fs.readFile(abs, "utf8");
+                }
+                catch {
+                    // Unreadable files are skipped and not counted as scanned.
+                    return true;
+                }
+                filesScanned += 1;
+                const lines = content.split(/\r?\n/);
+                for (let i = 0; i < lines.length; i += 1) {
+                    const line = lines[i];
+                    if (regex.test(line)) {
+                        // Line numbers in the output are 1-based.
+                        hits.push(`${relPath}:${i + 1}:${line}`);
+                        if (hits.length >= maxMatches)
+                            return false;
+                    }
+                }
+                return true;
+            });
+        }
+        catch (err) {
+            if (err instanceof SandboxEscapeError) {
+                return {
+                    ok: false,
+                    name: this.descriptor.name,
+                    error: err.message,
+                    details: { deniedPath: pattern }
+                };
+            }
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: `walk failed: ${err.message}`
+            };
+        }
+        const body = hits.length > 0 ? hits.join("\n") : "(no matches)";
+        return {
+            ok: true,
+            name: this.descriptor.name,
+            content: truncatePayload(body, ctx.maxResultBytes),
+            details: {
+                pattern,
+                caseInsensitive,
+                matches: hits.length,
+                filesScanned,
+                // Reported whenever the cap was reached, including the case
+                // where the last existing match landed exactly on the cap.
+                truncated: hits.length >= maxMatches
+            }
+        };
+    }
+};
+async function walk(root, rel, visit) {
+    const dir = path.join(root, rel);
+    let entries;
+    try {
+        entries = (await fs.readdir(dir, { withFileTypes: true }));
+    }
+    catch {
+        return;
+    }
+    for (const entry of entries) {
+        const childRel = rel ? path.join(rel, entry.name) : entry.name;
+        if (entry.isSymbolicLink())
+            continue;
+        if (entry.isDirectory()) {
+            await walk(root, childRel, visit);
+            continue;
+        }
+        if (entry.isFile()) {
+            const keepGoing = await visit(childRel.replace(/\\/g, "/"), path.join(root, childRel));
+            if (keepGoing === false)
+                return;
+        }
+    }
+}

package/dist/eval/tools/index.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+import type { SandboxTool } from "./types.js";
+export { SandboxTool, ToolResult, ToolContext, truncatePayload } from "./types.js";
+/** Default tool set used when no explicit tool list is passed. */
+export declare const BUILTIN_TOOLS: SandboxTool[];
+/** Build a lookup for the agent loop. */
+export declare function toolsByName(tools?: SandboxTool[]): Map<string, SandboxTool>;
+/** Shape a tool list for OpenAI-style `tools[]` in the chat request. */
+export declare function toolsForRequest(tools?: SandboxTool[]): unknown[];

package/dist/eval/tools/index.js ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * Registry of sandbox-confined tools used by the Tier B with-tools agent.
+ *
+ * The registry order defines the advertised schema order in the
+ * function-calling payload. Keeping it stable means judges reading
+ * generated traces can rely on predictable tool descriptions.
+ */
+import { globTool } from "./glob.js";
+import { grepTool } from "./grep.js";
+import { readTool } from "./read.js";
+import { writeTool } from "./write.js";
+export { truncatePayload } from "./types.js";
+/** Default tool set; the array order is the order advertised to the model. */
+export const BUILTIN_TOOLS = [readTool, writeTool, globTool, grepTool];
+/** Build a lookup for the agent loop. */
+export function toolsByName(tools = BUILTIN_TOOLS) {
+    const map = new Map();
+    for (const tool of tools) {
+        if (map.has(tool.descriptor.name)) {
+            throw new Error(`duplicate tool name: ${tool.descriptor.name}`);
+        }
+        map.set(tool.descriptor.name, tool);
+    }
+    return map;
+}
+/** Shape a tool list for OpenAI-style `tools[]` in the chat request. */
+export function toolsForRequest(tools = BUILTIN_TOOLS) {
+    return tools.map((tool) => ({
+        type: "function",
+        function: {
+            name: tool.descriptor.name,
+            description: tool.descriptor.description,
+            parameters: tool.descriptor.parameters
+        }
+    }));
+}

package/dist/eval/tools/read.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import { type SandboxTool } from "./types.js";
2	+ export declare const readTool: SandboxTool;

package/dist/eval/tools/read.js ADDED Viewed

@@ -0,0 +1,122 @@
+import fs from "node:fs/promises";
+import { SandboxEscapeError } from "../sandbox.js";
+import { parseArgs, requireString, optionalNumber, truncatePayload } from "./types.js";
+// Model-facing description shipped in the tool descriptor.
+const DESCRIPTION = "Read a UTF-8 text file from the sandbox. Returns the file contents. " +
+    "Supports optional 1-indexed `offset` and `limit` to read a slice.";
+/**
+ * `read_file` tool: returns the UTF-8 contents of one sandbox file,
+ * optionally restricted to a line range.
+ *
+ * `invoke` never throws for bad input; every failure is reported as an
+ * `{ ok: false, name, error }` result. On success `content` is the text
+ * (or the requested slice) passed through `truncatePayload`.
+ */
+export const readTool = {
+    descriptor: {
+        name: "read_file",
+        description: DESCRIPTION,
+        parameters: {
+            type: "object",
+            additionalProperties: false,
+            required: ["path"],
+            properties: {
+                path: {
+                    type: "string",
+                    description: "Path relative to the sandbox root."
+                },
+                offset: {
+                    type: "integer",
+                    minimum: 1,
+                    description: "1-indexed start line (inclusive)."
+                },
+                limit: {
+                    type: "integer",
+                    minimum: 1,
+                    description: "Maximum number of lines to return."
+                }
+            }
+        }
+    },
+    async invoke(rawArgs, ctx) {
+        // NOTE(review): `parseArgs`, `requireString` and `optionalNumber` live
+        // in ./types.js and are not visible here; a throw is treated as invalid input.
+        let args;
+        try {
+            args = parseArgs(rawArgs);
+        }
+        catch (err) {
+            return { ok: false, name: this.descriptor.name, error: err.message };
+        }
+        let relPath;
+        try {
+            relPath = requireString(args, "path");
+        }
+        catch (err) {
+            return { ok: false, name: this.descriptor.name, error: err.message };
+        }
+        let offset;
+        let limit;
+        try {
+            offset = optionalNumber(args, "offset");
+            limit = optionalNumber(args, "limit");
+        }
+        catch (err) {
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: err.message
+            };
+        }
+        // Both values must be positive integers when present.
+        if (offset !== undefined && (!Number.isInteger(offset) || offset < 1)) {
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: '"offset" must be a positive integer'
+            };
+        }
+        if (limit !== undefined && (!Number.isInteger(limit) || limit < 1)) {
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: '"limit" must be a positive integer'
+            };
+        }
+        let abs;
+        try {
+            // Called without `allowMissing`, so the path has to exist already.
+            abs = await ctx.sandbox.resolve(relPath);
+        }
+        catch (err) {
+            // `deniedPath` is only attached for sandbox refusals.
+            const denied = err instanceof SandboxEscapeError ? relPath : undefined;
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: err.message,
+                details: denied ? { deniedPath: denied } : undefined
+            };
+        }
+        let raw;
+        try {
+            raw = await fs.readFile(abs, "utf8");
+        }
+        catch (err) {
+            return {
+                ok: false,
+                name: this.descriptor.name,
+                error: `read failed: ${err.message}`,
+                details: { path: relPath }
+            };
+        }
+        let content = raw;
+        let effectiveLines;
+        if (offset !== undefined || limit !== undefined) {
+            // Lines are split on LF or CRLF and re-joined with LF, so a
+            // sliced result always uses "\n" line endings.
+            const lines = raw.split(/\r?\n/);
+            const start = Math.max(0, (offset ?? 1) - 1);
+            const end = limit !== undefined ? Math.min(lines.length, start + limit) : lines.length;
+            const slice = lines.slice(start, end);
+            content = slice.join("\n");
+            effectiveLines = slice.length;
+        }
+        const truncated = truncatePayload(content, ctx.maxResultBytes);
+        return {
+            ok: true,
+            name: this.descriptor.name,
+            content: truncated,
+            details: {
+                path: relPath,
+                // Size of the payload actually returned, after truncation.
+                bytes: Buffer.byteLength(truncated, "utf8"),
+                truncated: truncated !== content,
+                // `lines` is only present when a slice was requested.
+                ...(effectiveLines !== undefined ? { lines: effectiveLines } : {})
+            }
+        };
+    }
+};

package/dist/eval/tools/types.d.ts ADDED Viewed

@@ -0,0 +1,49 @@
+/**
+ * Shared types for Tier B sandbox-confined tools.
+ *
+ * Tools are plain async functions: they take validated arguments and a
+ * sandbox handle and return a structured result. The runner serializes
+ * results for the model as JSON; the `SandboxTool.invoke` wrapper keeps
+ * both the raw structured output (for tests/metrics) and the stringified
+ * model-facing payload.
+ */
+import type { Sandbox } from "../sandbox.js";
+/** Static metadata that advertises a tool to the model. */
+export interface ToolDescriptor {
+    /** Name the model calls (must match the function-calling schema). */
+    name: string;
+    /** Human-readable prompt shown to the model. */
+    description: string;
+    /** JSON schema shipped with the OpenAI-style `tools[]` array. */
+    parameters: Record<string, unknown>;
+}
+/** Per-invocation environment handed to every tool. */
+export interface ToolContext {
+    /** Sandbox handle the tool resolves paths against. */
+    sandbox: Sandbox;
+    /**
+     * Maximum bytes the tool may return in `content`. Results longer than
+     * this are truncated with a trailing marker so the model sees the
+     * cutoff.
+     */
+    maxResultBytes: number;
+}
+/** Result of a tool call that completed. */
+export interface ToolSuccess {
+    ok: true;
+    /** Name of the tool that produced the result. */
+    name: string;
+    /** Text payload returned to the model. */
+    content: string;
+    /** Optional structured metadata about the call. */
+    details?: Record<string, unknown>;
+}
+/** Result of a tool call that was rejected or failed. */
+export interface ToolFailure {
+    ok: false;
+    /** Name of the tool that produced the result. */
+    name: string;
+    /** Human-readable failure message. */
+    error: string;
+    /** Optional structured metadata about the failure. */
+    details?: Record<string, unknown>;
+}
+/** Outcome of a tool call; discriminate on `ok`. */
+export type ToolResult = ToolSuccess | ToolFailure;
+/** A tool the agent loop can advertise to the model and invoke by name. */
+export interface SandboxTool {
+    /** Metadata advertised to the model. */
+    descriptor: ToolDescriptor;
+    /**
+     * Run the tool.
+     *
+     * @param rawArgs unparsed argument string (presumably the JSON
+     *   arguments of the model's tool call; confirm against the caller)
+     * @param ctx sandbox handle and result-size limit for this call
+     */
+    invoke(rawArgs: string, ctx: ToolContext): Promise<ToolResult>;
+}
+/** Truncate a result payload to `maxBytes` with a visible cutoff marker. */
+export declare function truncatePayload(payload: string, maxBytes: number): string;
+/**
+ * Turn the raw argument string of a tool call into a key/value record.
+ * NOTE(review): implementation not visible here; presumably throws on
+ * malformed input — confirm in types.js.
+ */
+export declare function parseArgs(raw: string): Record<string, unknown>;
+/**
+ * Return `args[key]` as a string.
+ * NOTE(review): presumably throws when the key is missing or not a
+ * string — confirm in types.js.
+ */
+export declare function requireString(args: Record<string, unknown>, key: string): string;
+/**
+ * Return `args[key]` as a number, or `undefined`.
+ * NOTE(review): presumably `undefined` means the key is absent and a
+ * non-numeric value throws — confirm in types.js.
+ */
+export declare function optionalNumber(args: Record<string, unknown>, key: string): number | undefined;