ashr-labs 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.d.ts ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * ashr-labs — one command to set up automatic agent testing with Claude Code.
4
+ *
5
+ * Usage:
6
+ * npx ashr-labs <api-key>
7
+ * npx ashr-labs tp_abc123
8
+ * ASHR_LABS_API_KEY=tp_... npx ashr-labs
9
+ */
10
+ export {};
package/dist/cli.js ADDED
@@ -0,0 +1,396 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * ashr-labs — one command to set up automatic agent testing with Claude Code.
4
+ *
5
+ * Usage:
6
+ * npx ashr-labs <api-key>
7
+ * npx ashr-labs tp_abc123
8
+ * ASHR_LABS_API_KEY=tp_... npx ashr-labs
9
+ */
10
+ import * as fs from "fs";
11
+ import * as path from "path";
12
// ANSI SGR escape sequences used to style the CLI output.
// RESET must be emitted after any of the others to restore default rendering.
const RESET = "\x1b[0m";
const BOLD = "\x1b[1m";
const DIM = "\x1b[2m";
const GREEN = "\x1b[32m";
const YELLOW = "\x1b[33m";
// 256-colour foreground, palette index 69.
const ASHR_BLUE = "\x1b[38;5;69m";
18
/**
 * Write one line to standard output.
 *
 * @param {string} msg - Text to emit; a newline is appended.
 */
function print(msg) {
  const line = msg + "\n";
  process.stdout.write(line);
}
21
/**
 * Render the contents of `.ashr.json` for the detected project.
 *
 * @param {object} config - Detected settings; reads `apiKeyEnvVar`, `agentName`,
 *   `agentDescription`, `entrypoint` and `domain`.
 * @returns {string} Pretty-printed JSON (2-space indent, no trailing newline).
 */
function generateAshrConfig(config) {
  const { apiKeyEnvVar, agentName, agentDescription, entrypoint, domain } = config;
  const agent = {
    name: agentName,
    description: agentDescription,
    entrypoint,
    domain,
  };
  // Fixed evaluation defaults written into every new project file.
  const evalDefaults = { autoRun: true, scenarioCount: 5, maxWorkers: 3 };
  const document = {
    $schema: "https://rank.ashr.io/schemas/ashr.json",
    apiKeyEnvVar,
    agent,
    eval: evalDefaults,
  };
  return JSON.stringify(document, null, 2);
}
38
/**
 * Build the "Ashr Labs" section that is appended to the project's CLAUDE.md.
 *
 * Fixes over the previous version:
 *  - Agent name/description/domain are emitted as escaped string literals, so a
 *    value containing `"` or a backslash no longer produces a broken snippet.
 *  - The TypeScript snippet calls `generateDataset`, which is the method the
 *    TypeScript client declares; Python keeps `generate_dataset`.
 *
 * NOTE(review): for `lang === "python"` the fenced example still uses
 * JavaScript-shaped statements (`const`, `await`, `new EvalRunner`). The Python
 * SDK surface is not visible from this file, so it is left as-is — confirm
 * against the Python SDK and replace with a real Python example.
 *
 * @param {object} config - Detected settings; reads `lang`, `apiKeyEnvVar`,
 *   `agentName`, `agentDescription` and `domain`.
 * @returns {string} Markdown text beginning and ending with a newline.
 */
function generateClaudeMdSection(config) {
  const isTs = config.lang === "typescript";
  const sdkImport = isTs
    ? `import { AshrLabsClient, EvalRunner } from "ashr-labs";`
    : `from ashr_labs import AshrLabsClient, EvalRunner`;
  const clientCtor = isTs
    ? `new AshrLabsClient(process.env.${config.apiKeyEnvVar}!)`
    : `AshrLabsClient(os.environ["${config.apiKeyEnvVar}"])`;
  const generateMethod = isTs ? "generateDataset" : "generate_dataset";
  // JSON string syntax is a valid double-quoted literal in the snippet and
  // escapes quotes/backslashes/newlines that would otherwise break it.
  const quote = (value) => JSON.stringify(String(value));
  return `
# Ashr Labs — Automated Agent Testing

This project uses [Ashr Labs](https://rank.ashr.io) for continuous agent evaluation.
Configuration is in \`.ashr.json\`. The API key is in the \`${config.apiKeyEnvVar}\` environment variable.

## When to test

Run \`/test-agent\` or proactively trigger a test whenever you:
- Change agent logic, prompts, or system instructions
- Add, modify, or remove tools the agent can call
- Change how the agent handles edge cases
- Before committing agent-related changes

## How to test

\`\`\`${config.lang}
${sdkImport}

const client = ${clientCtor};

// Generate a test dataset from the agent config
const [datasetId, source] = await client.${generateMethod}(
${quote(`Eval: ${config.agentName}`)},
{
agent: { name: ${quote(config.agentName)}, description: ${quote(config.agentDescription)} },
context: { domain: ${quote(config.domain)} },
}
);

// Run the agent against it
const runner = new EvalRunner(source);
const results = await runner.run(agent, { maxWorkers: 3 });
const metrics = results.build().aggregate_metrics;
// Deploy: await results.deploy(client, datasetId);
\`\`\`

## Interpreting results

- \`tests_passed / total_tests\` — overall pass rate
- \`average_similarity_score\` — how close agent responses match expected (0-1)
- \`total_tool_call_divergence\` — count of wrong/missing tool calls
- If pass rate drops after a change, investigate before committing
`;
}
90
/**
 * Build the `/test-agent` slash-command definition (markdown with front matter).
 *
 * The instructions are now language-aware: Python projects are told to read the
 * key from `os.environ[...]` and call `generate_dataset()`, TypeScript projects
 * from `process.env.…` and call `generateDataset()` (the method the TypeScript
 * client declares). Previously both languages received `process.env` and
 * `generate_dataset()`.
 *
 * @param {object} config - Detected settings; reads `lang`, `agentName`,
 *   `entrypoint` and `apiKeyEnvVar`.
 * @returns {string} Command file contents ending with a newline.
 */
function generateTestAgentCommand(config) {
  const isTs = config.lang === "typescript";
  const sdkModule = isTs ? "ashr-labs" : "ashr_labs";
  const envAccess = isTs
    ? `process.env.${config.apiKeyEnvVar}`
    : `os.environ["${config.apiKeyEnvVar}"]`;
  const generateCall = isTs ? "client.generateDataset()" : "client.generate_dataset()";
  return `---
description: Run the Ashr Labs eval suite against the agent and report results.
---

Run an automated evaluation of the "${config.agentName}" agent using the Ashr Labs SDK.

## Steps

1. Read \`.ashr.json\` for project configuration.
2. Read the agent code at \`${config.entrypoint}\` to understand current behavior.
3. Write a temporary eval script that:
- Imports \`AshrLabsClient\` and \`EvalRunner\` from \`${sdkModule}\`
- Creates a client using \`${envAccess}\`
- Calls \`${generateCall}\` with the agent config from \`.ashr.json\`
- Implements a lightweight Agent wrapper around the actual agent code in \`${config.entrypoint}\`
- Runs \`runner.run(agent, { maxWorkers: 3 })\`
- Prints aggregate metrics (pass rate, similarity, divergence)
4. Run the eval script.
5. Report results clearly:
- Total tests, passed, failed
- Average similarity score
- Any tool call mismatches (list them)
6. If there are failures, read the agent code and suggest specific improvements.
7. Clean up the temporary eval script.

## Important

- Use the \`${config.apiKeyEnvVar}\` env var for the API key — never hardcode it.
- If the eval finds issues, propose fixes to \`${config.entrypoint}\` but ask before applying.
- Compare results to previous runs if available.
`;
}
124
/**
 * Build the `/improve-agent` slash-command definition (markdown with front matter).
 *
 * @param {object} config - Detected settings; reads `agentName` and `entrypoint`.
 * @returns {string} Command file contents ending with a newline.
 */
function generateImproveAgentCommand(config) {
  const { agentName, entrypoint } = config;
  const lines = [
    "---",
    "description: Run evals, find failures, and fix the agent automatically.",
    "---",
    "",
    `Continuously improve "${agentName}" by running evaluations and fixing issues.`,
    "",
    "## Steps",
    "",
    "1. Run `/test-agent` first to get a baseline.",
    "2. For each failure or low-similarity result:",
    "a. Read the failing scenario and expected behavior.",
    `b. Read the agent code at \`${entrypoint}\`.`,
    "c. Identify why the agent produced the wrong output.",
    "d. Apply a targeted fix (prompt change, logic change, tool handling).",
    "e. Re-run `/test-agent` to verify the fix.",
    "3. Repeat until pass rate is above 80% or no more actionable failures.",
    "4. Summarize all changes made and the before/after metrics.",
    "",
    "## Rules",
    "",
    "- Make the smallest change that fixes each failure.",
    "- Don't refactor unrelated code.",
    "- If a test seems wrong (bad expected output), note it but don't skip it.",
    "- Deploy the final passing run with `results.deploy(client, datasetId)`.",
    "", // trailing newline
  ];
  return lines.join("\n");
}
151
/**
 * Build the hook configuration merged into `.claude/settings.json`: a single
 * `Stop` prompt hook reminding the user to run `/test-agent`.
 *
 * @param {object} config - Detected settings; reads `agentName` and `entrypoint`.
 * @returns {{hooks: {Stop: Array<object>}}} Settings fragment.
 */
function generateHookSettings(config) {
  const { agentName, entrypoint } = config;
  const reminder = [
    `If you modified agent code in this conversation (files related to "${agentName}" or ${entrypoint}), `,
    `remind the user they can run /test-agent to verify the changes. `,
    `Keep it to one sentence.`,
  ].join("");
  const stopHandler = {
    matcher: "",
    hooks: [{ type: "prompt", prompt: reminder }],
  };
  return { hooks: { Stop: [stopHandler] } };
}
170
+ // ──────────────────────────────────────────────────────────────
171
+ // File writers (existing files are skipped, appended to, or merged — the user is never prompted)
172
+ // ──────────────────────────────────────────────────────────────
173
/**
 * Create `filePath` with `content` unless something already exists there.
 *
 * The file is opened with the exclusive-create flag (`wx`), so the
 * "does it exist?" decision and the write are a single atomic step; the
 * previous existsSync-then-write sequence could overwrite a file created in
 * between. Parent directories are created as needed.
 *
 * @param {string} filePath - Destination path (relative paths resolve against cwd).
 * @param {string} content - UTF-8 text to write.
 * @returns {boolean} `true` if the file was created, `false` if it was skipped.
 * @throws {Error} Any filesystem error other than "already exists".
 */
function writeFile(filePath, content) {
  const shown = path.relative(process.cwd(), filePath);
  // recursive mkdir is a no-op when the directory already exists.
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  try {
    fs.writeFileSync(filePath, content, { encoding: "utf-8", flag: "wx" });
  } catch (err) {
    if (err?.code !== "EEXIST") {
      throw err;
    }
    print(` ${YELLOW}skip${RESET} ${shown}`);
    return false;
  }
  print(` ${GREEN}create${RESET} ${shown}`);
  return true;
}
186
/**
 * Add `content` to `filePath` once: create the file if it is missing, append
 * (preceded by a newline) if it exists without `marker`, and do nothing if
 * `marker` is already present.
 *
 * @param {string} filePath - File to create or extend.
 * @param {string} content - Text to write.
 * @param {string} marker - Substring whose presence means the content was added before.
 * @returns {boolean} `false` when skipped, `true` when the file was created or appended to.
 */
function appendToFile(filePath, content, marker) {
  const report = (colour, verb) =>
    print(` ${colour}${verb}${RESET} ${path.relative(process.cwd(), filePath)}`);

  if (!fs.existsSync(filePath)) {
    fs.writeFileSync(filePath, content, "utf-8");
    report(GREEN, "create");
    return true;
  }

  const current = fs.readFileSync(filePath, "utf-8");
  if (current.includes(marker)) {
    report(YELLOW, "skip");
    return false;
  }

  fs.appendFileSync(filePath, "\n" + content, "utf-8");
  report(GREEN, "append");
  return true;
}
202
/**
 * Merge `newSettings.hooks` into the JSON settings file at `filePath`,
 * creating the file (and its directory) when missing.
 *
 * Behaviour:
 *  - All existing top-level keys and hook events are preserved.
 *  - Handlers for each event are appended after the existing ones; a handler
 *    that is already present (same JSON serialisation) is not added again, so
 *    running the merge twice is idempotent.
 *  - If the existing file cannot be read as a JSON object, or its `hooks`
 *    structure is not object-of-arrays, the file is left untouched and the
 *    merge is skipped. The previous version silently replaced such a file,
 *    discarding the user's settings.
 *
 * @param {string} filePath - Settings file to create or update.
 * @param {{hooks?: Record<string, Array<object>>}} newSettings - Fragment to merge in.
 * @returns {boolean} `true` if the file was written, `false` if it was skipped.
 */
function mergeJsonFile(filePath, newSettings) {
  const shown = path.relative(process.cwd(), filePath);
  const isPlainObject = (v) => v !== null && typeof v === "object" && !Array.isArray(v);
  const skip = (reason) => {
    print(` ${YELLOW}skip${RESET} ${shown} (${reason})`);
    return false;
  };

  fs.mkdirSync(path.dirname(filePath), { recursive: true });

  const existed = fs.existsSync(filePath);
  let existing = {};
  if (existed) {
    let parsed;
    try {
      const raw = fs.readFileSync(filePath, "utf-8");
      // An empty file carries no settings to lose; treat it as `{}`.
      parsed = raw.trim() === "" ? {} : JSON.parse(raw);
    } catch {
      return skip("existing file is not valid JSON — left untouched");
    }
    if (!isPlainObject(parsed)) {
      return skip("existing file is not a JSON object — left untouched");
    }
    existing = parsed;
  }

  const merged = { ...existing };
  if (newSettings.hooks) {
    if (existing.hooks != null && !isPlainObject(existing.hooks)) {
      return skip("existing \"hooks\" is not an object — left untouched");
    }
    const existingHooks = existing.hooks ?? {};
    merged.hooks = { ...existingHooks };
    for (const [event, handlers] of Object.entries(newSettings.hooks)) {
      const current = existingHooks[event] ?? [];
      if (!Array.isArray(current)) {
        return skip(`existing hooks.${event} is not an array — left untouched`);
      }
      // Structural de-duplication keeps repeated runs from stacking copies.
      const seen = new Set(current.map((h) => JSON.stringify(h)));
      const additions = handlers.filter((h) => !seen.has(JSON.stringify(h)));
      merged.hooks[event] = [...current, ...additions];
    }
  }

  fs.writeFileSync(filePath, JSON.stringify(merged, null, 2) + "\n", "utf-8");
  print(` ${GREEN}${existed ? "update" : "create"}${RESET} ${shown}`);
  return true;
}
232
+ // ──────────────────────────────────────────────────────────────
233
+ // Auto-detection
234
+ // ──────────────────────────────────────────────────────────────
235
/**
 * Guess the project language from marker files in the current directory.
 * Node markers win over Python markers; with no markers the answer is
 * "typescript".
 *
 * @returns {"typescript"|"python"} Detected language.
 */
function detectLang() {
  const anyPresent = (names) => names.some((name) => fs.existsSync(name));
  if (anyPresent(["tsconfig.json", "package.json"])) {
    return "typescript";
  }
  return anyPresent(["pyproject.toml", "setup.py", "requirements.txt"])
    ? "python"
    : "typescript";
}
242
/**
 * Pick the agent entrypoint: the first conventional path that exists in the
 * current directory, or a language-specific default when none do.
 *
 * @param {"typescript"|"python"} lang - Project language; anything other than
 *   "typescript" is treated as Python.
 * @returns {string} Relative path of the (possibly not yet existing) entrypoint.
 */
function detectEntrypoint(lang) {
  const isTs = lang === "typescript";
  const fallback = isTs ? "src/agent.ts" : "agent.py";
  // Ordered by preference; the first existing file wins.
  const searchOrder = isTs
    ? ["src/agent.ts", "src/index.ts", "agent.ts", "index.ts", "src/agent.js", "agent.js"]
    : ["agent.py", "src/agent.py", "app/agent.py", "main.py", "src/main.py"];
  return searchOrder.find((candidate) => fs.existsSync(candidate)) ?? fallback;
}
252
/**
 * Derive a name for the agent: `name` from package.json, else the first
 * double-quoted `name = "..."` line in pyproject.toml, else the name of the
 * current directory. Unreadable or malformed manifests are ignored.
 *
 * @returns {string} Detected agent name.
 */
function detectAgentName() {
  // Each source: manifest file plus a function extracting the name from its text
  // (returning undefined when the manifest has no usable name).
  const sources = [
    ["package.json", (text) => {
      const pkg = JSON.parse(text);
      return pkg.name && pkg.name !== "undefined" ? pkg.name : undefined;
    }],
    ["pyproject.toml", (text) => {
      const hit = text.match(/^name\s*=\s*"([^"]+)"/m);
      return hit ? hit[1] : undefined;
    }],
  ];
  for (const [manifest, extract] of sources) {
    if (!fs.existsSync(manifest)) {
      continue;
    }
    try {
      const found = extract(fs.readFileSync(manifest, "utf-8"));
      if (found !== undefined) {
        return found;
      }
    } catch { /* ignore */ }
  }
  return path.basename(process.cwd());
}
275
/**
 * Derive a description for the agent: `description` from package.json, else
 * the first double-quoted `description = "..."` line in pyproject.toml, else
 * the generic text "An AI agent". Unreadable or malformed manifests are ignored.
 *
 * @returns {string} Detected description.
 */
function detectDescription() {
  const sources = [
    ["package.json", (text) => {
      const pkg = JSON.parse(text);
      return pkg.description ? pkg.description : undefined;
    }],
    ["pyproject.toml", (text) => {
      const hit = text.match(/^description\s*=\s*"([^"]+)"/m);
      return hit ? hit[1] : undefined;
    }],
  ];
  for (const [manifest, extract] of sources) {
    if (!fs.existsSync(manifest)) {
      continue;
    }
    try {
      const found = extract(fs.readFileSync(manifest, "utf-8"));
      if (found !== undefined) {
        return found;
      }
    } catch { /* ignore */ }
  }
  return "An AI agent";
}
295
+ // ──────────────────────────────────────────────────────────────
296
+ // Main
297
+ // ──────────────────────────────────────────────────────────────
298
/**
 * Ensure `.env` in the current directory defines `varName`.
 *
 * Creates the file (owner read/write only, since it holds a secret) when it is
 * missing. When it exists and does not mention `varName`, the entry is appended
 * on its own line — a newline is inserted first if the file does not end with
 * one, so the entry is never glued onto the previous variable.
 */
function ensureEnvEntry(varName, apiKey) {
  const envLine = `${varName}=${apiKey}\n`;
  if (!fs.existsSync(".env")) {
    fs.writeFileSync(".env", envLine, { mode: 0o600 });
    print(` ${GREEN}create${RESET} .env`);
    return;
  }
  const envContent = fs.readFileSync(".env", "utf-8");
  if (envContent.includes(varName)) {
    print(` ${YELLOW}skip${RESET} .env`);
    return;
  }
  const separator = envContent === "" || envContent.endsWith("\n") ? "" : "\n";
  fs.appendFileSync(".env", separator + envLine);
  print(` ${GREEN}append${RESET} .env`);
}

/**
 * Ensure `.gitignore` in the current directory ignores `.env`, creating the
 * file if necessary so the API key cannot be committed by accident.
 * Matching is per line, so entries such as `.env.example` do not count.
 */
function ensureEnvIgnored() {
  if (!fs.existsSync(".gitignore")) {
    fs.writeFileSync(".gitignore", ".env\n");
    print(` ${GREEN}create${RESET} .gitignore`);
    return;
  }
  const ignoringPatterns = new Set([".env", "/.env", ".env*"]);
  const alreadyIgnored = fs
    .readFileSync(".gitignore", "utf-8")
    .split(/\r?\n/)
    .some((line) => ignoringPatterns.has(line.trim()));
  if (!alreadyIgnored) {
    fs.appendFileSync(".gitignore", "\n.env\n");
    print(` ${GREEN}append${RESET} .gitignore`);
  }
}

/**
 * CLI entry point: validate the API key, auto-detect project settings and
 * write the Ashr Labs / Claude Code scaffolding into the current directory.
 *
 * Exits the process with status 0 for `--help` and for an already initialised
 * project, and with status 1 when no usable API key is supplied. A failed key
 * validation is reported but does not stop the setup.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  // Help
  if (args.includes("--help") || args.includes("-h")) {
    print(`\n${BOLD}ashr-labs${RESET} — set up automatic agent testing\n`);
    print(`Usage: npx ashr-labs <api-key>\n`);
    print(` The API key is the only required argument. Everything else`);
    print(` is auto-detected from your project.\n`);
    print(` You can also set ASHR_LABS_API_KEY in your environment`);
    print(` and run ${DIM}npx ashr-labs${RESET} with no arguments.\n`);
    print(` Get your key at ${BOLD}https://app.ashr.io → API Keys${RESET}\n`);
    process.exit(0);
  }
  // Find the API key: first positional arg that starts with tp_, or env var.
  // "init" is skipped for backward compatibility. As a last resort the first
  // positional argument is taken so a mistyped key gets the "Invalid API key"
  // message below instead of the generic usage text.
  const positionalArgs = args.filter((a) => a !== "init" && !a.startsWith("-"));
  const apiKey = positionalArgs.find((a) => a.startsWith("tp_"))
    || process.env.ASHR_LABS_API_KEY
    || positionalArgs[0]
    || "";
  if (!apiKey) {
    print(`\n${BOLD}${ASHR_BLUE} ashr labs${RESET}\n`);
    print(` Usage: ${BOLD}npx ashr-labs <api-key>${RESET}\n`);
    print(` Get your key at https://app.ashr.io → API Keys\n`);
    process.exit(1);
  }
  if (!apiKey.startsWith("tp_")) {
    print(`\n${YELLOW} Invalid API key — must start with tp_${RESET}`);
    print(` Get one at https://app.ashr.io → API Keys\n`);
    process.exit(1);
  }
  print("");
  print(`${BOLD}${ASHR_BLUE} ashr labs${RESET} ${DIM}— setting up agent testing${RESET}`);
  print("");
  // Already initialized?
  if (fs.existsSync(".ashr.json")) {
    print(` ${YELLOW}.ashr.json already exists${RESET} — delete it first to re-initialize.\n`);
    process.exit(0);
  }
  // Validate the key (best effort: a failure is reported, setup continues).
  print(` ${DIM}Validating...${RESET}`);
  try {
    const { AshrLabsClient } = await import("./client.js");
    const client = new AshrLabsClient(apiKey);
    const session = await client.init();
    // The session may lack tenant details; fall back to an empty name.
    const tenantName = session?.tenant?.tenant_name || "";
    print(` ${GREEN}✓${RESET} ${tenantName}\n`);
  }
  catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    print(` ${YELLOW}✗ ${reason}${RESET}`);
    print(` ${DIM}Continuing anyway.${RESET}\n`);
  }
  // Auto-detect everything
  const lang = detectLang();
  const config = {
    apiKeyEnvVar: "ASHR_LABS_API_KEY",
    agentName: detectAgentName(),
    agentDescription: detectDescription(),
    entrypoint: detectEntrypoint(lang),
    domain: "general",
    lang,
  };
  // Write all files
  writeFile(".ashr.json", generateAshrConfig(config));
  ensureEnvEntry(config.apiKeyEnvVar, apiKey);
  ensureEnvIgnored();
  appendToFile("CLAUDE.md", generateClaudeMdSection(config), "# Ashr Labs");
  writeFile(".claude/commands/test-agent.md", generateTestAgentCommand(config));
  writeFile(".claude/commands/improve-agent.md", generateImproveAgentCommand(config));
  mergeJsonFile(".claude/settings.json", generateHookSettings(config));
  // Done
  print(`\n${GREEN} Done.${RESET} Open Claude Code and type ${BOLD}/test-agent${RESET}\n`);
}
393
// Run the CLI; any rejection from main() is reported as a single line on
// stderr and turned into a non-zero exit status.
main().catch((failure) => {
  console.error(failure.message);
  process.exit(1);
});
package/dist/client.d.ts CHANGED
@@ -13,20 +13,26 @@ export declare class AshrLabsClient {
13
13
  private _makeRequest;
14
14
  private _raiseForStatus;
15
15
  getDataset(datasetId: number, includeSignedUrls?: boolean, urlExpiresSeconds?: number): Promise<Record<string, unknown>>;
16
- listDatasets(tenantId?: number | null, limit?: number, offset?: number, includeSignedUrls?: boolean, urlExpiresSeconds?: number): Promise<Record<string, unknown>>;
16
+ listDatasets(tenantId?: number | null, limit?: number, cursor?: number | null, includeSignedUrls?: boolean, urlExpiresSeconds?: number): Promise<Record<string, unknown>>;
17
17
  createRun(datasetId: number, result: Record<string, unknown>, tenantId?: number | null, runnerId?: number | null): Promise<Record<string, unknown>>;
18
18
  deleteRun(runId: number): Promise<Record<string, unknown>>;
19
19
  getRun(runId: number): Promise<Record<string, unknown>>;
20
- listRuns(datasetId?: number | null, tenantId?: number | null, limit?: number, offset?: number): Promise<Record<string, unknown>>;
20
+ listRuns(datasetId?: number | null, tenantId?: number | null, limit?: number): Promise<Record<string, unknown>>;
21
21
  private static _validateConfigStructure;
22
22
  createRequest(requestName: string, request: Record<string, unknown>, requestInputSchema?: Record<string, unknown> | null, tenantId?: number | null, requestorId?: number | null): Promise<Record<string, unknown>>;
23
23
  getRequest(requestId: number): Promise<Record<string, unknown>>;
24
- listRequests(tenantId?: number | null, status?: string | null, limit?: number, offset?: number): Promise<Record<string, unknown>>;
24
+ listRequests(tenantId?: number | null, status?: string | null, limit?: number, cursor?: number | null): Promise<Record<string, unknown>>;
25
25
  listApiKeys(includeInactive?: boolean): Promise<Record<string, unknown>[]>;
26
26
  revokeApiKey(apiKeyId: number): Promise<Record<string, unknown>>;
27
27
  init(): Promise<Record<string, unknown>>;
28
28
  healthCheck(): Promise<Record<string, unknown>>;
29
29
  waitForRequest(requestId: number, timeout?: number, pollInterval?: number): Promise<Record<string, unknown>>;
30
+ /**
31
+ * Fill in missing context fields so the backend has enough to generate.
32
+ * If use_case and scenario_context are both missing, synthesize them
33
+ * from agent name/description.
34
+ */
35
+ private static _enrichConfig;
30
36
  generateDataset(requestName: string, config: Record<string, unknown>, requestInputSchema?: Record<string, unknown> | null, timeout?: number, pollInterval?: number): Promise<[number, Record<string, unknown>]>;
31
37
  toString(): string;
32
38
  }
package/dist/client.js CHANGED
@@ -114,14 +114,16 @@ export class AshrLabsClient {
114
114
  });
115
115
  return response.dataset;
116
116
  }
117
- async listDatasets(tenantId, limit = 50, offset = 0, includeSignedUrls = false, urlExpiresSeconds = 3600) {
118
- return this._makeRequest("list_datasets", {
117
+ async listDatasets(tenantId, limit = 50, cursor, includeSignedUrls = false, urlExpiresSeconds = 3600) {
118
+ const params = {
119
119
  tenant_id: await this._resolveTenantId(tenantId),
120
120
  limit,
121
- offset,
122
121
  include_signed_urls: includeSignedUrls,
123
122
  url_expires_seconds: urlExpiresSeconds,
124
- });
123
+ };
124
+ if (cursor != null)
125
+ params.cursor = cursor;
126
+ return this._makeRequest("list_datasets", params);
125
127
  }
126
128
  // =========================================================================
127
129
  // Run Operations
@@ -144,11 +146,10 @@ export class AshrLabsClient {
144
146
  const response = await this._makeRequest("get_run", { run_id: runId });
145
147
  return response.run;
146
148
  }
147
- async listRuns(datasetId, tenantId, limit = 50, offset = 0) {
149
+ async listRuns(datasetId, tenantId, limit = 50) {
148
150
  const params = {
149
151
  tenant_id: await this._resolveTenantId(tenantId),
150
152
  limit,
151
- offset,
152
153
  };
153
154
  if (datasetId != null)
154
155
  params.dataset_id = datasetId;
@@ -210,14 +211,15 @@ export class AshrLabsClient {
210
211
  });
211
212
  return response.request;
212
213
  }
213
- async listRequests(tenantId, status, limit = 50, offset = 0) {
214
+ async listRequests(tenantId, status, limit = 50, cursor) {
214
215
  const params = {
215
216
  tenant_id: await this._resolveTenantId(tenantId),
216
217
  limit,
217
- offset,
218
218
  };
219
219
  if (status != null)
220
220
  params.status = status;
221
+ if (cursor != null)
222
+ params.cursor = cursor;
221
223
  return this._makeRequest("list_requests", params);
222
224
  }
223
225
  // =========================================================================
@@ -259,16 +261,63 @@ export class AshrLabsClient {
259
261
  }
260
262
  throw new Error(`Request ${requestId} did not complete within ${timeout}s`);
261
263
  }
264
+ /**
265
+ * Fill in missing context fields so the backend has enough to generate.
266
+ * If use_case and scenario_context are both missing, synthesize them
267
+ * from agent name/description.
268
+ */
269
+ static _enrichConfig(config) {
270
+ const out = structuredClone(config);
271
+ const agent = (out.agent ?? {});
272
+ const context = (out.context ?? {});
273
+ const hasUseCase = Boolean(context.use_case);
274
+ const hasScenario = Boolean(context.scenario_context);
275
+ if (!hasUseCase && !hasScenario) {
276
+ const name = (agent.name ?? "");
277
+ const desc = (agent.description ?? "");
278
+ const domain = (context.domain ?? "");
279
+ if (desc) {
280
+ context.use_case = desc;
281
+ }
282
+ else if (name) {
283
+ context.use_case = `Testing the ${name} agent`;
284
+ }
285
+ if (name && desc) {
286
+ const parts = [`A user interacting with ${name}`];
287
+ if (domain && domain !== "general") {
288
+ parts.push(`in the ${domain} domain`);
289
+ }
290
+ parts.push(`— ${desc}`);
291
+ context.scenario_context = parts.join(" ");
292
+ }
293
+ out.context = context;
294
+ }
295
+ // Default test_config if missing
296
+ if (!out.test_config) {
297
+ out.test_config = {
298
+ num_variations: 5,
299
+ coverage: {
300
+ happy_path: true,
301
+ edge_cases: true,
302
+ error_handling: true,
303
+ },
304
+ };
305
+ }
306
+ return out;
307
+ }
262
308
  async generateDataset(requestName, config, requestInputSchema, timeout = 600, pollInterval = 5) {
263
- const req = await this.createRequest(requestName, config, requestInputSchema);
309
+ const enriched = AshrLabsClient._enrichConfig(config);
310
+ const req = await this.createRequest(requestName, enriched, requestInputSchema);
264
311
  const requestId = req.id;
265
312
  await this.waitForRequest(requestId, timeout, pollInterval);
266
- const resp = await this.listDatasets(undefined, 1);
313
+ // Find the dataset created by this request — check recent datasets
314
+ const resp = await this.listDatasets(undefined, 10, undefined, false);
267
315
  const datasets = resp.datasets;
268
- if (!datasets || datasets.length === 0) {
269
- throw new AshrLabsError("No datasets found after generation completed");
316
+ const match = datasets?.find((d) => d.request_id === requestId);
317
+ if (!match) {
318
+ throw new AshrLabsError(`No dataset found for request ${requestId}`);
270
319
  }
271
- const datasetId = datasets[0].id;
320
+ const datasetId = match.id;
272
321
  const fullDs = await this.getDataset(datasetId, false);
273
322
  const source = (fullDs.dataset_source ?? {});
274
323
  return [datasetId, source];
package/dist/eval.d.ts CHANGED
@@ -1,8 +1,8 @@
1
1
  import { RunBuilder } from "./run-builder.js";
2
2
  import type { AshrLabsClient } from "./client.js";
3
3
  export interface Agent {
4
- respond(message: string): Record<string, unknown> | Promise<Record<string, unknown>>;
5
- reset(): void | Promise<void>;
4
+ respond(message: string, scenarioId?: string): Record<string, unknown> | Promise<Record<string, unknown>>;
5
+ reset(scenarioId?: string): void | Promise<void>;
6
6
  }
7
7
  export type OnScenarioCallback = (scenarioId: string, scenario: Record<string, unknown>) => void;
8
8
  export type OnActionCallback = (actionIndex: number, action: Record<string, unknown>) => void;
@@ -31,12 +31,12 @@ export declare class EvalRunner {
31
31
  };
32
32
  }): Promise<EvalRunner>;
33
33
  private _runScenario;
34
- run(agent: Agent, options?: {
34
+ run(agent: Agent | (() => Agent), options?: {
35
35
  onScenario?: OnScenarioCallback;
36
36
  onAction?: OnActionCallback;
37
37
  maxWorkers?: number;
38
38
  }): Promise<RunBuilder>;
39
- runAndDeploy(agent: Agent, client: AshrLabsClient, datasetId?: number, options?: {
39
+ runAndDeploy(agent: Agent | (() => Agent), client: AshrLabsClient, datasetId: number, options?: {
40
40
  onScenario?: OnScenarioCallback;
41
41
  onAction?: OnActionCallback;
42
42
  maxWorkers?: number;
package/dist/eval.js CHANGED
@@ -22,7 +22,7 @@ export class EvalRunner {
22
22
  async _runScenario(agent, runId, scenario, onScenario, onAction) {
23
23
  if (onScenario)
24
24
  onScenario(runId, scenario);
25
- await agent.reset();
25
+ await agent.reset(runId);
26
26
  const test = new TestBuilder(runId);
27
27
  test.start();
28
28
  let agentText = "";
@@ -37,7 +37,7 @@ export class EvalRunner {
37
37
  if (actor === "user") {
38
38
  test.addUserText(content, action.name ?? `user_action_${i}`, i);
39
39
  try {
40
- const result = await agent.respond(content);
40
+ const result = await agent.respond(content, runId);
41
41
  agentText = (result.text ?? "");
42
42
  agentTools = [...(result.tool_calls ?? [])];
43
43
  }
@@ -106,36 +106,30 @@ export class EvalRunner {
106
106
  }
107
107
  }
108
108
  const maxWorkers = options?.maxWorkers ?? 1;
109
+ const resolvedAgent = typeof agent === "function" ? agent() : agent;
109
110
  if (maxWorkers <= 1) {
110
- // Sequential — use the agent directly
111
111
  for (const [runId, scenario] of scenarios) {
112
- const test = await this._runScenario(agent, runId, scenario, options?.onScenario, options?.onAction);
112
+ const test = await this._runScenario(resolvedAgent, runId, scenario, options?.onScenario, options?.onAction);
113
113
  run._tests.push(test);
114
114
  }
115
115
  }
116
116
  else {
117
117
  // Parallel — run scenarios concurrently with concurrency limit.
118
- // Each scenario needs its own agent instance since they maintain
119
- // conversation state. The caller must provide an agent that supports
120
- // structuredClone, or the agent's respond() must be stateless when
121
- // used with maxWorkers > 1.
118
+ // The agent must key its conversation state on the scenarioId
119
+ // passed to respond(message, scenarioId) and reset(scenarioId).
120
+ // This allows a single agent instance (one API client) to handle
121
+ // multiple concurrent scenarios without cloning or extra clients.
122
122
  const results = new Array(scenarios.length).fill(null);
123
- // Process in batches of maxWorkers
124
123
  for (let batchStart = 0; batchStart < scenarios.length; batchStart += maxWorkers) {
125
124
  const batchEnd = Math.min(batchStart + maxWorkers, scenarios.length);
126
125
  const batch = scenarios.slice(batchStart, batchEnd);
127
126
  const promises = batch.map(async ([runId, scenario], batchIdx) => {
128
127
  const idx = batchStart + batchIdx;
129
128
  try {
130
- // Each parallel scenario gets a deep-copied agent
131
- const agentCopy = structuredClone(agent);
132
- // Restore prototype methods lost by structuredClone
133
- Object.setPrototypeOf(agentCopy, Object.getPrototypeOf(agent));
134
- const test = await this._runScenario(agentCopy, runId, scenario, options?.onScenario, options?.onAction);
129
+ const test = await this._runScenario(resolvedAgent, runId, scenario, options?.onScenario, options?.onAction);
135
130
  results[idx] = test;
136
131
  }
137
132
  catch {
138
- // Scenario raised — record as a failed test
139
133
  const failed = new TestBuilder(runId);
140
134
  failed.start();
141
135
  failed.complete("failed");
package/package.json CHANGED
@@ -1,10 +1,13 @@
1
1
  {
2
2
  "name": "ashr-labs",
3
- "version": "0.2.0",
4
- "description": "TypeScript SDK for the Ashr Labs API",
3
+ "version": "0.4.0",
4
+ "description": "TypeScript SDK for the Ashr Labs API — agent testing & evaluation",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
7
7
  "types": "./dist/index.d.ts",
8
+ "bin": {
9
+ "ashr-labs": "./dist/cli.js"
10
+ },
8
11
  "exports": {
9
12
  ".": {
10
13
  "import": "./dist/index.js",
@@ -33,5 +36,9 @@
33
36
  },
34
37
  "engines": {
35
38
  "node": ">=18.0.0"
39
+ },
40
+ "dependencies": {
41
+ "@anthropic-ai/sdk": "^0.78.0",
42
+ "tsx": "^4.21.0"
36
43
  }
37
44
  }