npm - ashr-labs - Versions diffs - 0.2.0 → 0.4.0 - Mend

ashr-labs 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/cli.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+#!/usr/bin/env node
+/**
+ * ashr-labs — one command to set up automatic agent testing with Claude Code.
+ *
+ * Usage:
+ *   npx ashr-labs <api-key>
+ *   npx ashr-labs tp_abc123
+ *   ASHR_LABS_API_KEY=tp_... npx ashr-labs
+ */
+export {};

package/dist/cli.js ADDED Viewed

@@ -0,0 +1,396 @@
+#!/usr/bin/env node
+/**
+ * ashr-labs — one command to set up automatic agent testing with Claude Code.
+ *
+ * Usage:
+ *   npx ashr-labs <api-key>
+ *   npx ashr-labs tp_abc123
+ *   ASHR_LABS_API_KEY=tp_... npx ashr-labs
+ */
+import * as fs from "fs";
+import * as path from "path";
+// ANSI SGR escape sequences used to style terminal output.
+// ASHR_BLUE selects entry 69 of the 256-colour palette; RESET clears all styling.
+const ASHR_BLUE = "\x1b[38;5;69m";
+const DIM = "\x1b[2m";
+const BOLD = "\x1b[1m";
+const RESET = "\x1b[0m";
+const GREEN = "\x1b[32m";
+const YELLOW = "\x1b[33m";
+/**
+ * Write one line to stdout.
+ *
+ * @param {string} msg - Text to emit; a trailing newline is appended.
+ */
+function print(msg) {
+    const line = msg + "\n";
+    process.stdout.write(line);
+}
+/**
+ * Serialize the contents of the project's `.ashr.json` file.
+ *
+ * @param {object} config - Detected project settings (apiKeyEnvVar,
+ *   agentName, agentDescription, entrypoint, domain).
+ * @returns {string} Pretty-printed JSON (2-space indent) holding the schema
+ *   URL, the env-var name, the agent description and fixed eval defaults.
+ */
+function generateAshrConfig(config) {
+    const { apiKeyEnvVar, agentName, agentDescription, entrypoint, domain } = config;
+    const agent = {
+        name: agentName,
+        description: agentDescription,
+        entrypoint,
+        domain,
+    };
+    // Fixed evaluation defaults written into every new config.
+    const evalDefaults = { autoRun: true, scenarioCount: 5, maxWorkers: 3 };
+    const document = {
+        $schema: "https://rank.ashr.io/schemas/ashr.json",
+        apiKeyEnvVar,
+        agent,
+        eval: evalDefaults,
+    };
+    return JSON.stringify(document, null, 2);
+}
+/**
+ * Build the Markdown section that is appended to the project's CLAUDE.md.
+ *
+ * The section explains when and how to run the Ashr Labs eval and embeds a
+ * short code sample that uses the configured agent name, description and
+ * domain.
+ *
+ * NOTE(review): for `config.lang === "python"` only the import line, the
+ * client constructor and the `generate_dataset` name are Python-flavoured;
+ * the rest of the sample is still JavaScript syntax. The Python SDK surface
+ * is not visible from this file, so that part is left as it was — confirm
+ * and port.
+ *
+ * @param {object} config - Detected project settings (lang, apiKeyEnvVar,
+ *   agentName, agentDescription, domain).
+ * @returns {string} Markdown text starting with a blank line.
+ */
+function generateClaudeMdSection(config) {
+    const isTs = config.lang === "typescript";
+    const sdkImport = isTs
+        ? `import { AshrLabsClient, EvalRunner } from "ashr-labs";`
+        : `from ashr_labs import AshrLabsClient, EvalRunner`;
+    const clientCtor = isTs
+        ? `new AshrLabsClient(process.env.${config.apiKeyEnvVar}!)`
+        : `AshrLabsClient(os.environ["${config.apiKeyEnvVar}"])`;
+    // The TypeScript client declares generateDataset(); the snake_case name
+    // is kept only for the Python flavour of the sample.
+    const generateMethod = isTs ? "generateDataset" : "generate_dataset";
+    // Emit values as escaped, double-quoted literals so a quote, backslash or
+    // newline in package.json cannot break the generated snippet.
+    const quote = (value) => JSON.stringify(String(value));
+    const requestName = quote(`Eval: ${config.agentName}`);
+    const agentName = quote(config.agentName);
+    const agentDescription = quote(config.agentDescription);
+    const domain = quote(config.domain);
+    return `
+# Ashr Labs — Automated Agent Testing
+This project uses [Ashr Labs](https://rank.ashr.io) for continuous agent evaluation.
+Configuration is in \`.ashr.json\`. The API key is in the \`${config.apiKeyEnvVar}\` environment variable.
+## When to test
+Run \`/test-agent\` or proactively trigger a test whenever you:
+- Change agent logic, prompts, or system instructions
+- Add, modify, or remove tools the agent can call
+- Change how the agent handles edge cases
+- Before committing agent-related changes
+## How to test
+\`\`\`${config.lang}
+${sdkImport}
+const client = ${clientCtor};
+// Generate a test dataset from the agent config
+const [datasetId, source] = await client.${generateMethod}(
+  ${requestName},
+  {
+    agent: { name: ${agentName}, description: ${agentDescription} },
+    context: { domain: ${domain} },
+  }
+);
+// Run the agent against it
+const runner = new EvalRunner(source);
+const results = await runner.run(agent, { maxWorkers: 3 });
+const metrics = results.build().aggregate_metrics;
+// Deploy: await results.deploy(client, datasetId);
+\`\`\`
+## Interpreting results
+- \`tests_passed / total_tests\` — overall pass rate
+- \`average_similarity_score\` — how close agent responses match expected (0-1)
+- \`total_tool_call_divergence\` — count of wrong/missing tool calls
+- If pass rate drops after a change, investigate before committing
+`;
+}
+/**
+ * Build the `/test-agent` slash-command file (Markdown with front matter).
+ *
+ * The instructions name the SDK module, the API-key lookup and the
+ * dataset-generation call in the form that matches the project's language:
+ * `process.env.X` / `generateDataset()` for TypeScript and
+ * `os.environ["X"]` / `generate_dataset()` for Python.
+ *
+ * @param {object} config - Detected project settings (lang, agentName,
+ *   entrypoint, apiKeyEnvVar).
+ * @returns {string} Contents of `.claude/commands/test-agent.md`.
+ */
+function generateTestAgentCommand(config) {
+    const isTs = config.lang === "typescript";
+    const sdkModule = isTs ? "ashr-labs" : "ashr_labs";
+    // Python has no process.env; use the same accessor the CLAUDE.md sample uses.
+    const envLookup = isTs
+        ? `process.env.${config.apiKeyEnvVar}`
+        : `os.environ["${config.apiKeyEnvVar}"]`;
+    // The TypeScript client method is camelCase (generateDataset).
+    const generateCall = isTs ? "client.generateDataset()" : "client.generate_dataset()";
+    return `---
+description: Run the Ashr Labs eval suite against the agent and report results.
+---
+Run an automated evaluation of the "${config.agentName}" agent using the Ashr Labs SDK.
+## Steps
+1. Read \`.ashr.json\` for project configuration.
+2. Read the agent code at \`${config.entrypoint}\` to understand current behavior.
+3. Write a temporary eval script that:
+   - Imports \`AshrLabsClient\` and \`EvalRunner\` from \`${sdkModule}\`
+   - Creates a client using \`${envLookup}\`
+   - Calls \`${generateCall}\` with the agent config from \`.ashr.json\`
+   - Implements a lightweight Agent wrapper around the actual agent code in \`${config.entrypoint}\`
+   - Runs \`runner.run(agent, { maxWorkers: 3 })\`
+   - Prints aggregate metrics (pass rate, similarity, divergence)
+4. Run the eval script.
+5. Report results clearly:
+   - Total tests, passed, failed
+   - Average similarity score
+   - Any tool call mismatches (list them)
+6. If there are failures, read the agent code and suggest specific improvements.
+7. Clean up the temporary eval script.
+## Important
+- Use the \`${config.apiKeyEnvVar}\` env var for the API key — never hardcode it.
+- If the eval finds issues, propose fixes to \`${config.entrypoint}\` but ask before applying.
+- Compare results to previous runs if available.
+`;
+}
+/**
+ * Build the `/improve-agent` slash-command file (Markdown with front matter).
+ *
+ * Only `config.agentName` and `config.entrypoint` are interpolated; the rest
+ * of the text is fixed.
+ *
+ * @param {object} config - Detected project settings.
+ * @returns {string} Contents of `.claude/commands/improve-agent.md`.
+ */
+function generateImproveAgentCommand(config) {
+    return `---
+description: Run evals, find failures, and fix the agent automatically.
+---
+Continuously improve "${config.agentName}" by running evaluations and fixing issues.
+## Steps
+1. Run \`/test-agent\` first to get a baseline.
+2. For each failure or low-similarity result:
+   a. Read the failing scenario and expected behavior.
+   b. Read the agent code at \`${config.entrypoint}\`.
+   c. Identify why the agent produced the wrong output.
+   d. Apply a targeted fix (prompt change, logic change, tool handling).
+   e. Re-run \`/test-agent\` to verify the fix.
+3. Repeat until pass rate is above 80% or no more actionable failures.
+4. Summarize all changes made and the before/after metrics.
+## Rules
+- Make the smallest change that fixes each failure.
+- Don't refactor unrelated code.
+- If a test seems wrong (bad expected output), note it but don't skip it.
+- Deploy the final passing run with \`results.deploy(client, datasetId)\`.
+`;
+}
+/**
+ * Build the settings fragment that registers a `Stop` prompt hook.
+ *
+ * The hook's prompt mentions the agent name and entrypoint and asks for a
+ * one-sentence reminder to run `/test-agent`.
+ *
+ * @param {object} config - Detected project settings (agentName, entrypoint).
+ * @returns {{hooks: {Stop: object[]}}} Fragment to merge into settings.json.
+ */
+function generateHookSettings(config) {
+    const reminder = [
+        `If you modified agent code in this conversation (files related to "${config.agentName}" or ${config.entrypoint}), `,
+        `remind the user they can run /test-agent to verify the changes. `,
+        `Keep it to one sentence.`,
+    ].join("");
+    const promptHook = { type: "prompt", prompt: reminder };
+    // An empty matcher is passed through unchanged from the original fragment.
+    const stopEntry = { matcher: "", hooks: [promptHook] };
+    return { hooks: { Stop: [stopEntry] } };
+}
+// ──────────────────────────────────────────────────────────────
+// File writers (writeFile skips existing files; appendToFile and mergeJsonFile add to them)
+// ──────────────────────────────────────────────────────────────
+/**
+ * Create `filePath` with `content` unless the file already exists.
+ *
+ * Missing parent directories are created first. A status line ("create" or
+ * "skip") with the path relative to the working directory is printed.
+ *
+ * @param {string} filePath - Target path.
+ * @param {string} content - Text written as UTF-8.
+ * @returns {boolean} true when the file was written, false when skipped.
+ */
+function writeFile(filePath, content) {
+    const parent = path.dirname(filePath);
+    if (!fs.existsSync(parent)) {
+        fs.mkdirSync(parent, { recursive: true });
+    }
+    const alreadyThere = fs.existsSync(filePath);
+    if (!alreadyThere) {
+        fs.writeFileSync(filePath, content, "utf-8");
+    }
+    const label = alreadyThere ? `${YELLOW}skip${RESET}` : `${GREEN}create${RESET}`;
+    print(`  ${label}  ${path.relative(process.cwd(), filePath)}`);
+    return !alreadyThere;
+}
+/**
+ * Append `content` to `filePath` unless the file already contains `marker`.
+ *
+ * A missing file is created with `content` alone. When appending, a newline
+ * is inserted before `content`. A status line ("create", "append" or "skip")
+ * is printed.
+ *
+ * @param {string} filePath - Target path.
+ * @param {string} content - Text to add.
+ * @param {string} marker - Substring whose presence means "already added".
+ * @returns {boolean} false when skipped, true otherwise.
+ */
+function appendToFile(filePath, content, marker) {
+    const shown = () => path.relative(process.cwd(), filePath);
+    if (!fs.existsSync(filePath)) {
+        fs.writeFileSync(filePath, content, "utf-8");
+        print(`  ${GREEN}create${RESET}  ${shown()}`);
+        return true;
+    }
+    const current = fs.readFileSync(filePath, "utf-8");
+    if (current.includes(marker)) {
+        print(`  ${YELLOW}skip${RESET}  ${shown()}`);
+        return false;
+    }
+    fs.appendFileSync(filePath, "\n" + content, "utf-8");
+    print(`  ${GREEN}append${RESET}  ${shown()}`);
+    return true;
+}
+/**
+ * Merge the hook handlers in `newSettings.hooks` into the JSON file at
+ * `filePath`, creating the file (and its directory) when it is absent.
+ *
+ * Existing top-level keys and existing handlers are preserved. A handler that
+ * is already present (same JSON serialization) is not added again, so the
+ * merge is idempotent. A file that cannot be parsed, or whose root is not a
+ * JSON object, is left untouched instead of being replaced.
+ *
+ * @param {string} filePath - Settings file to update.
+ * @param {{hooks?: Record<string, object[]>}} newSettings - Fragment to merge;
+ *   only the `hooks` key is merged.
+ * @returns {boolean} true when the file was written, false when skipped.
+ */
+function mergeJsonFile(filePath, newSettings) {
+    const shown = path.relative(process.cwd(), filePath);
+    const isPlainObject = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
+    const fileExists = fs.existsSync(filePath);
+    let existing = {};
+    if (fileExists) {
+        try {
+            existing = JSON.parse(fs.readFileSync(filePath, "utf-8"));
+        }
+        catch (err) {
+            // Never clobber user settings we could not read.
+            print(`  ${YELLOW}skip${RESET}  ${shown} (invalid JSON: ${err.message})`);
+            return false;
+        }
+        if (!isPlainObject(existing)) {
+            print(`  ${YELLOW}skip${RESET}  ${shown} (root is not a JSON object)`);
+            return false;
+        }
+    }
+    const merged = { ...existing };
+    if (newSettings.hooks) {
+        const existingHooks = isPlainObject(existing.hooks) ? existing.hooks : {};
+        merged.hooks = { ...existingHooks };
+        for (const [event, handlers] of Object.entries(newSettings.hooks)) {
+            // A non-array value under an event name is treated as "no handlers".
+            const current = Array.isArray(existingHooks[event]) ? existingHooks[event] : [];
+            const seen = new Set(current.map((handler) => JSON.stringify(handler)));
+            const additions = handlers.filter((handler) => !seen.has(JSON.stringify(handler)));
+            merged.hooks[event] = [...current, ...additions];
+        }
+    }
+    // Key order is preserved by the spreads above, so equal serializations
+    // mean there is nothing new to write.
+    if (fileExists && JSON.stringify(merged) === JSON.stringify(existing)) {
+        print(`  ${YELLOW}skip${RESET}  ${shown}`);
+        return false;
+    }
+    const dir = path.dirname(filePath);
+    if (!fs.existsSync(dir)) {
+        fs.mkdirSync(dir, { recursive: true });
+    }
+    fs.writeFileSync(filePath, JSON.stringify(merged, null, 2) + "\n", "utf-8");
+    print(`  ${GREEN}${fileExists ? "update" : "create"}${RESET}  ${shown}`);
+    return true;
+}
+// ──────────────────────────────────────────────────────────────
+// Auto-detection
+// ──────────────────────────────────────────────────────────────
+/**
+ * Guess the project language from marker files in the working directory.
+ *
+ * `tsconfig.json` or `package.json` wins; otherwise any of `pyproject.toml`,
+ * `setup.py` or `requirements.txt` selects Python; with no markers the
+ * answer is TypeScript.
+ *
+ * @returns {"typescript" | "python"} Detected language.
+ */
+function detectLang() {
+    const anyExists = (names) => names.some((name) => fs.existsSync(name));
+    if (anyExists(["tsconfig.json", "package.json"])) {
+        return "typescript";
+    }
+    const looksLikePython = anyExists(["pyproject.toml", "setup.py", "requirements.txt"]);
+    return looksLikePython ? "python" : "typescript";
+}
+/**
+ * Pick the agent entrypoint: the first conventional path that exists in the
+ * working directory, or a language-specific default when none does.
+ *
+ * @param {string} lang - "typescript" selects the TS/JS candidates; any other
+ *   value selects the Python candidates.
+ * @returns {string} Relative path of the (possibly not yet existing) entrypoint.
+ */
+function detectEntrypoint(lang) {
+    const isTs = lang === "typescript";
+    const candidates = isTs
+        ? ["src/agent.ts", "src/index.ts", "agent.ts", "index.ts", "src/agent.js", "agent.js"]
+        : ["agent.py", "src/agent.py", "app/agent.py", "main.py", "src/main.py"];
+    const fallback = isTs ? "src/agent.ts" : "agent.py";
+    const found = candidates.find((candidate) => fs.existsSync(candidate));
+    return found ?? fallback;
+}
+/**
+ * Guess a name for the agent.
+ *
+ * Sources are tried in order: the `name` field of package.json (ignored when
+ * falsy or the literal string "undefined"), the first `name = "..."` line in
+ * pyproject.toml, and finally the working directory's base name. Read or
+ * parse failures fall through to the next source.
+ *
+ * @returns {string} Detected agent name.
+ */
+function detectAgentName() {
+    const fromPackageJson = () => {
+        if (!fs.existsSync("package.json")) {
+            return undefined;
+        }
+        try {
+            const { name } = JSON.parse(fs.readFileSync("package.json", "utf-8"));
+            return name && name !== "undefined" ? name : undefined;
+        }
+        catch {
+            return undefined; // unreadable or malformed: try the next source
+        }
+    };
+    const fromPyproject = () => {
+        if (!fs.existsSync("pyproject.toml")) {
+            return undefined;
+        }
+        try {
+            const found = /^name\s*=\s*"([^"]+)"/m.exec(fs.readFileSync("pyproject.toml", "utf-8"));
+            return found ? found[1] : undefined;
+        }
+        catch {
+            return undefined; // unreadable: fall back to the directory name
+        }
+    };
+    return fromPackageJson() ?? fromPyproject() ?? path.basename(process.cwd());
+}
+/**
+ * Guess a one-line description of the agent.
+ *
+ * Sources are tried in order: a truthy `description` in package.json, the
+ * first `description = "..."` line in pyproject.toml, then the fixed text
+ * "An AI agent". Read or parse failures fall through to the next source.
+ *
+ * @returns {string} Detected description.
+ */
+function detectDescription() {
+    const fromPackageJson = () => {
+        if (!fs.existsSync("package.json")) {
+            return undefined;
+        }
+        try {
+            const { description } = JSON.parse(fs.readFileSync("package.json", "utf-8"));
+            return description ? description : undefined;
+        }
+        catch {
+            return undefined; // unreadable or malformed: try the next source
+        }
+    };
+    const fromPyproject = () => {
+        if (!fs.existsSync("pyproject.toml")) {
+            return undefined;
+        }
+        try {
+            const found = /^description\s*=\s*"([^"]+)"/m.exec(fs.readFileSync("pyproject.toml", "utf-8"));
+            return found ? found[1] : undefined;
+        }
+        catch {
+            return undefined; // unreadable: use the generic description
+        }
+    };
+    return fromPackageJson() ?? fromPyproject() ?? "An AI agent";
+}
+// ──────────────────────────────────────────────────────────────
+// Main
+// ──────────────────────────────────────────────────────────────
+/**
+ * Report whether dotenv-style `content` already assigns the variable `name`.
+ * Only real assignments count (optionally prefixed with `export`); comments
+ * and longer variable names that merely contain `name` do not.
+ */
+function envFileDefines(content, name) {
+    return content.split(/\r?\n/).some((line) => {
+        const assignment = line.trim().replace(/^export\s+/, "");
+        const eq = assignment.indexOf("=");
+        return eq > 0 && assignment.slice(0, eq).trim() === name;
+    });
+}
+/**
+ * Append `line` to `filePath`, first terminating an unterminated last line
+ * of `existing` (the file's current text) so the two are not glued together.
+ */
+function appendLineTo(filePath, existing, line) {
+    const separator = existing.length > 0 && !existing.endsWith("\n") ? "\n" : "";
+    fs.appendFileSync(filePath, `${separator}${line}\n`);
+}
+/** Store `name=value` in `.env` unless the variable is already assigned there. */
+function saveApiKeyToEnv(name, value) {
+    const entry = `${name}=${value}`;
+    if (!fs.existsSync(".env")) {
+        // The file holds a secret: create it readable by the owner only.
+        fs.writeFileSync(".env", `${entry}\n`, { mode: 0o600 });
+        print(`  ${GREEN}create${RESET}  .env`);
+        return;
+    }
+    const current = fs.readFileSync(".env", "utf-8");
+    if (envFileDefines(current, name)) {
+        print(`  ${YELLOW}skip${RESET}  .env`);
+        return;
+    }
+    appendLineTo(".env", current, entry);
+    print(`  ${GREEN}append${RESET}  .env`);
+}
+/**
+ * Make sure `.gitignore` has a line that ignores `.env`, creating the file
+ * when it is missing so the stored API key is not committed by accident.
+ */
+function ensureEnvIgnored() {
+    // Whole-line patterns that match a top-level .env file; a substring test
+    // would wrongly accept entries such as ".env.local" or ".envrc".
+    const covering = new Set([".env", "/.env", ".env*", "*.env"]);
+    const exists = fs.existsSync(".gitignore");
+    const current = exists ? fs.readFileSync(".gitignore", "utf-8") : "";
+    if (current.split(/\r?\n/).some((line) => covering.has(line.trim()))) {
+        return;
+    }
+    appendLineTo(".gitignore", current, ".env");
+    print(`  ${GREEN}${exists ? "append" : "create"}${RESET}  .gitignore`);
+}
+/**
+ * CLI entry point: validate the API key, detect project settings and write
+ * the Ashr Labs / Claude Code scaffolding into the working directory.
+ *
+ * The key is the first positional argument starting with `tp_`, or else the
+ * ASHR_LABS_API_KEY environment variable. The process exits with status 1
+ * when no valid key is available and with status 0 after printing help or
+ * when `.ashr.json` already exists. A failed online validation is reported
+ * but does not stop the setup.
+ *
+ * @returns {Promise<void>}
+ */
+async function main() {
+    const args = process.argv.slice(2);
+    // Help
+    if (args.includes("--help") || args.includes("-h")) {
+        print(`\n${BOLD}ashr-labs${RESET} — set up automatic agent testing\n`);
+        print(`Usage: npx ashr-labs <api-key>\n`);
+        print(`  The API key is the only required argument. Everything else`);
+        print(`  is auto-detected from your project.\n`);
+        print(`  You can also set ASHR_LABS_API_KEY in your environment`);
+        print(`  and run ${DIM}npx ashr-labs${RESET} with no arguments.\n`);
+        print(`  Get your key at ${BOLD}https://app.ashr.io → API Keys${RESET}\n`);
+        process.exit(0);
+    }
+    // Find the API key: first positional arg that starts with tp_, or env var
+    // Skip "init" if someone passes it for backward compat
+    const positionalArgs = args.filter(a => a !== "init" && !a.startsWith("-"));
+    const apiKey = positionalArgs.find(a => a.startsWith("tp_"))
+        || process.env.ASHR_LABS_API_KEY
+        || "";
+    if (!apiKey) {
+        print(`\n${BOLD}${ASHR_BLUE}  ashr labs${RESET}\n`);
+        print(`  Usage: ${BOLD}npx ashr-labs <api-key>${RESET}\n`);
+        print(`  Get your key at https://app.ashr.io → API Keys\n`);
+        process.exit(1);
+    }
+    // Only reachable for a key taken from the environment variable.
+    if (!apiKey.startsWith("tp_")) {
+        print(`\n${YELLOW}  Invalid API key — must start with tp_${RESET}`);
+        print(`  Get one at https://app.ashr.io → API Keys\n`);
+        process.exit(1);
+    }
+    print("");
+    print(`${BOLD}${ASHR_BLUE}  ashr labs${RESET} ${DIM}— setting up agent testing${RESET}`);
+    print("");
+    // Already initialized?
+    if (fs.existsSync(".ashr.json")) {
+        print(`  ${YELLOW}.ashr.json already exists${RESET} — delete it first to re-initialize.\n`);
+        process.exit(0);
+    }
+    // Validate the key
+    print(`  ${DIM}Validating...${RESET}`);
+    let tenantName = "";
+    try {
+        const { AshrLabsClient } = await import("./client.js");
+        const client = new AshrLabsClient(apiKey);
+        const session = await client.init();
+        const tenant = session.tenant;
+        tenantName = (tenant.tenant_name || "");
+        print(`  ${GREEN}✓${RESET} ${tenantName}\n`);
+    }
+    catch (e) {
+        // Best effort: report the problem and carry on with the local setup.
+        print(`  ${YELLOW}✗ ${e?.message ?? e}${RESET}`);
+        print(`  ${DIM}Continuing anyway.${RESET}\n`);
+    }
+    // Auto-detect everything
+    const lang = detectLang();
+    const config = {
+        apiKeyEnvVar: "ASHR_LABS_API_KEY",
+        agentName: detectAgentName(),
+        agentDescription: detectDescription(),
+        entrypoint: detectEntrypoint(lang),
+        domain: "general",
+        lang,
+    };
+    // Write all files
+    writeFile(".ashr.json", generateAshrConfig(config));
+    // .env holds the key; .gitignore keeps it out of version control
+    saveApiKeyToEnv(config.apiKeyEnvVar, apiKey);
+    ensureEnvIgnored();
+    appendToFile("CLAUDE.md", generateClaudeMdSection(config), "# Ashr Labs");
+    writeFile(".claude/commands/test-agent.md", generateTestAgentCommand(config));
+    writeFile(".claude/commands/improve-agent.md", generateImproveAgentCommand(config));
+    mergeJsonFile(".claude/settings.json", generateHookSettings(config));
+    // Done
+    print(`\n${GREEN}  Done.${RESET} Open Claude Code and type ${BOLD}/test-agent${RESET}\n`);
+}
+main().catch((e) => {
+    console.error(e?.message ?? e);
+    process.exit(1);
+});

package/dist/client.d.ts CHANGED Viewed

@@ -13,20 +13,26 @@ export declare class AshrLabsClient {
     private _makeRequest;
     private _raiseForStatus;
     getDataset(datasetId: number, includeSignedUrls?: boolean, urlExpiresSeconds?: number): Promise<Record<string, unknown>>;
-    listDatasets(tenantId?: number | null, limit?: number, offset?: number, includeSignedUrls?: boolean, urlExpiresSeconds?: number): Promise<Record<string, unknown>>;
+    listDatasets(tenantId?: number | null, limit?: number, cursor?: number | null, includeSignedUrls?: boolean, urlExpiresSeconds?: number): Promise<Record<string, unknown>>;
     createRun(datasetId: number, result: Record<string, unknown>, tenantId?: number | null, runnerId?: number | null): Promise<Record<string, unknown>>;
     deleteRun(runId: number): Promise<Record<string, unknown>>;
     getRun(runId: number): Promise<Record<string, unknown>>;
-    listRuns(datasetId?: number | null, tenantId?: number | null, limit?: number, offset?: number): Promise<Record<string, unknown>>;
+    listRuns(datasetId?: number | null, tenantId?: number | null, limit?: number): Promise<Record<string, unknown>>;
     private static _validateConfigStructure;
     createRequest(requestName: string, request: Record<string, unknown>, requestInputSchema?: Record<string, unknown> | null, tenantId?: number | null, requestorId?: number | null): Promise<Record<string, unknown>>;
     getRequest(requestId: number): Promise<Record<string, unknown>>;
-    listRequests(tenantId?: number | null, status?: string | null, limit?: number, offset?: number): Promise<Record<string, unknown>>;
+    listRequests(tenantId?: number | null, status?: string | null, limit?: number, cursor?: number | null): Promise<Record<string, unknown>>;
     listApiKeys(includeInactive?: boolean): Promise<Record<string, unknown>[]>;
     revokeApiKey(apiKeyId: number): Promise<Record<string, unknown>>;
     init(): Promise<Record<string, unknown>>;
     healthCheck(): Promise<Record<string, unknown>>;
     waitForRequest(requestId: number, timeout?: number, pollInterval?: number): Promise<Record<string, unknown>>;
+    /**
+     * Fill in missing context fields so the backend has enough to generate.
+     * If use_case and scenario_context are both missing, synthesize them
+     * from agent name/description.
+     */
+    private static _enrichConfig;
     generateDataset(requestName: string, config: Record<string, unknown>, requestInputSchema?: Record<string, unknown> | null, timeout?: number, pollInterval?: number): Promise<[number, Record<string, unknown>]>;
     toString(): string;
 }

package/dist/client.js CHANGED Viewed

@@ -114,14 +114,16 @@ export class AshrLabsClient {
         });
         return response.dataset;
     }
-    async listDatasets(tenantId, limit = 50, offset = 0, includeSignedUrls = false, urlExpiresSeconds = 3600) {
-        return this._makeRequest("list_datasets", {
+    async listDatasets(tenantId, limit = 50, cursor, includeSignedUrls = false, urlExpiresSeconds = 3600) {
+        const params = {
             tenant_id: await this._resolveTenantId(tenantId),
             limit,
-            offset,
             include_signed_urls: includeSignedUrls,
             url_expires_seconds: urlExpiresSeconds,
-        });
+        };
+        if (cursor != null)
+            params.cursor = cursor;
+        return this._makeRequest("list_datasets", params);
     }
     // =========================================================================
     // Run Operations
@@ -144,11 +146,10 @@ export class AshrLabsClient {
         const response = await this._makeRequest("get_run", { run_id: runId });
         return response.run;
     }
-    async listRuns(datasetId, tenantId, limit = 50, offset = 0) {
+    async listRuns(datasetId, tenantId, limit = 50) {
         const params = {
             tenant_id: await this._resolveTenantId(tenantId),
             limit,
-            offset,
         };
         if (datasetId != null)
             params.dataset_id = datasetId;
@@ -210,14 +211,15 @@ export class AshrLabsClient {
         });
         return response.request;
     }
-    async listRequests(tenantId, status, limit = 50, offset = 0) {
+    async listRequests(tenantId, status, limit = 50, cursor) {
         const params = {
             tenant_id: await this._resolveTenantId(tenantId),
             limit,
-            offset,
         };
         if (status != null)
             params.status = status;
+        if (cursor != null)
+            params.cursor = cursor;
         return this._makeRequest("list_requests", params);
     }
     // =========================================================================
@@ -259,16 +261,63 @@ export class AshrLabsClient {
         }
         throw new Error(`Request ${requestId} did not complete within ${timeout}s`);
     }
+    /**
+     * Return a deep copy of `config` with generation defaults filled in.
+     *
+     * When the context has neither `use_case` nor `scenario_context`, they
+     * are derived from the agent's name and description (the domain is
+     * mentioned unless it is empty or "general"). A missing `test_config`
+     * is replaced by a default with 5 variations and all coverage flags on.
+     * The caller's object is not modified.
+     */
+    static _enrichConfig(config) {
+        const enriched = structuredClone(config);
+        const agent = (enriched.agent ?? {});
+        const context = (enriched.context ?? {});
+        const lacksUseCase = !context.use_case;
+        const lacksScenario = !context.scenario_context;
+        if (lacksUseCase && lacksScenario) {
+            const name = (agent.name ?? "");
+            const desc = (agent.description ?? "");
+            const domain = (context.domain ?? "");
+            if (desc) {
+                context.use_case = desc;
+            }
+            else if (name) {
+                context.use_case = `Testing the ${name} agent`;
+            }
+            if (name && desc) {
+                const domainClause = domain && domain !== "general"
+                    ? [`in the ${domain} domain`]
+                    : [];
+                context.scenario_context = [
+                    `A user interacting with ${name}`,
+                    ...domainClause,
+                    `— ${desc}`,
+                ].join(" ");
+            }
+            enriched.context = context;
+        }
+        // Default test_config if missing
+        enriched.test_config ||= {
+            num_variations: 5,
+            coverage: {
+                happy_path: true,
+                edge_cases: true,
+                error_handling: true,
+            },
+        };
+        return enriched;
+    }
     async generateDataset(requestName, config, requestInputSchema, timeout = 600, pollInterval = 5) {
-        const req = await this.createRequest(requestName, config, requestInputSchema);
+        const enriched = AshrLabsClient._enrichConfig(config);
+        const req = await this.createRequest(requestName, enriched, requestInputSchema);
         const requestId = req.id;
         await this.waitForRequest(requestId, timeout, pollInterval);
-        const resp = await this.listDatasets(undefined, 1);
+        // Find the dataset created by this request — check recent datasets
+        const resp = await this.listDatasets(undefined, 10, undefined, false);
         const datasets = resp.datasets;
-        if (!datasets || datasets.length === 0) {
-            throw new AshrLabsError("No datasets found after generation completed");
+        const match = datasets?.find((d) => d.request_id === requestId);
+        if (!match) {
+            throw new AshrLabsError(`No dataset found for request ${requestId}`);
         }
-        const datasetId = datasets[0].id;
+        const datasetId = match.id;
         const fullDs = await this.getDataset(datasetId, false);
         const source = (fullDs.dataset_source ?? {});
         return [datasetId, source];

package/dist/eval.d.ts CHANGED Viewed

@@ -1,8 +1,8 @@
 import { RunBuilder } from "./run-builder.js";
 import type { AshrLabsClient } from "./client.js";
 export interface Agent {
-    respond(message: string): Record<string, unknown> | Promise<Record<string, unknown>>;
-    reset(): void | Promise<void>;
+    respond(message: string, scenarioId?: string): Record<string, unknown> | Promise<Record<string, unknown>>;
+    reset(scenarioId?: string): void | Promise<void>;
 }
 export type OnScenarioCallback = (scenarioId: string, scenario: Record<string, unknown>) => void;
 export type OnActionCallback = (actionIndex: number, action: Record<string, unknown>) => void;
@@ -31,12 +31,12 @@ export declare class EvalRunner {
         };
     }): Promise<EvalRunner>;
     private _runScenario;
-    run(agent: Agent, options?: {
+    run(agent: Agent | (() => Agent), options?: {
         onScenario?: OnScenarioCallback;
         onAction?: OnActionCallback;
         maxWorkers?: number;
     }): Promise<RunBuilder>;
-    runAndDeploy(agent: Agent, client: AshrLabsClient, datasetId?: number, options?: {
+    runAndDeploy(agent: Agent | (() => Agent), client: AshrLabsClient, datasetId: number, options?: {
         onScenario?: OnScenarioCallback;
         onAction?: OnActionCallback;
         maxWorkers?: number;

package/dist/eval.js CHANGED Viewed

@@ -22,7 +22,7 @@ export class EvalRunner {
     async _runScenario(agent, runId, scenario, onScenario, onAction) {
         if (onScenario)
             onScenario(runId, scenario);
-        await agent.reset();
+        await agent.reset(runId);
         const test = new TestBuilder(runId);
         test.start();
         let agentText = "";
@@ -37,7 +37,7 @@ export class EvalRunner {
             if (actor === "user") {
                 test.addUserText(content, action.name ?? `user_action_${i}`, i);
                 try {
-                    const result = await agent.respond(content);
+                    const result = await agent.respond(content, runId);
                     agentText = (result.text ?? "");
                     agentTools = [...(result.tool_calls ?? [])];
                 }
@@ -106,36 +106,30 @@ export class EvalRunner {
             }
         }
         const maxWorkers = options?.maxWorkers ?? 1;
+        const resolvedAgent = typeof agent === "function" ? agent() : agent;
         if (maxWorkers <= 1) {
-            // Sequential — use the agent directly
             for (const [runId, scenario] of scenarios) {
-                const test = await this._runScenario(agent, runId, scenario, options?.onScenario, options?.onAction);
+                const test = await this._runScenario(resolvedAgent, runId, scenario, options?.onScenario, options?.onAction);
                 run._tests.push(test);
             }
         }
         else {
             // Parallel — run scenarios concurrently with concurrency limit.
-            // Each scenario needs its own agent instance since they maintain
-            // conversation state. The caller must provide an agent that supports
-            // structuredClone, or the agent's respond() must be stateless when
-            // used with maxWorkers > 1.
+            // The agent must key its conversation state on the scenarioId
+            // passed to respond(message, scenarioId) and reset(scenarioId).
+            // This allows a single agent instance (one API client) to handle
+            // multiple concurrent scenarios without cloning or extra clients.
             const results = new Array(scenarios.length).fill(null);
-            // Process in batches of maxWorkers
             for (let batchStart = 0; batchStart < scenarios.length; batchStart += maxWorkers) {
                 const batchEnd = Math.min(batchStart + maxWorkers, scenarios.length);
                 const batch = scenarios.slice(batchStart, batchEnd);
                 const promises = batch.map(async ([runId, scenario], batchIdx) => {
                     const idx = batchStart + batchIdx;
                     try {
-                        // Each parallel scenario gets a deep-copied agent
-                        const agentCopy = structuredClone(agent);
-                        // Restore prototype methods lost by structuredClone
-                        Object.setPrototypeOf(agentCopy, Object.getPrototypeOf(agent));
-                        const test = await this._runScenario(agentCopy, runId, scenario, options?.onScenario, options?.onAction);
+                        const test = await this._runScenario(resolvedAgent, runId, scenario, options?.onScenario, options?.onAction);
                         results[idx] = test;
                     }
                     catch {
-                        // Scenario raised — record as a failed test
                         const failed = new TestBuilder(runId);
                         failed.start();
                         failed.complete("failed");

package/package.json CHANGED Viewed

@@ -1,10 +1,13 @@
 {
   "name": "ashr-labs",
-  "version": "0.2.0",
-  "description": "TypeScript SDK for the Ashr Labs API",
+  "version": "0.4.0",
+  "description": "TypeScript SDK for the Ashr Labs API — agent testing & evaluation",
   "type": "module",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
+  "bin": {
+    "ashr-labs": "./dist/cli.js"
+  },
   "exports": {
     ".": {
       "import": "./dist/index.js",
@@ -33,5 +36,9 @@
   },
   "engines": {
     "node": ">=18.0.0"
+  },
+  "dependencies": {
+    "@anthropic-ai/sdk": "^0.78.0",
+    "tsx": "^4.21.0"
   }
 }