ashr-labs 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +10 -0
- package/dist/cli.js +396 -0
- package/dist/client.d.ts +9 -3
- package/dist/client.js +62 -13
- package/dist/eval.d.ts +4 -4
- package/dist/eval.js +9 -15
- package/package.json +9 -2
package/dist/cli.d.ts
ADDED
package/dist/cli.js
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* ashr-labs — one command to set up automatic agent testing with Claude Code.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* npx ashr-labs <api-key>
|
|
7
|
+
* npx ashr-labs tp_abc123
|
|
8
|
+
* ASHR_LABS_API_KEY=tp_... npx ashr-labs
|
|
9
|
+
*/
|
|
10
|
+
import * as fs from "fs";
|
|
11
|
+
import * as path from "path";
|
|
12
|
+
// ANSI SGR (Select Graphic Rendition) escape sequences used to style CLI output.
// Each constant is the ESC "[" <code> "m" sequence for the given code.
const sgr = (code) => `\x1b[${code}m`;
const ASHR_BLUE = sgr("38;5;69");
const DIM = sgr(2);
const BOLD = sgr(1);
const RESET = sgr(0);
const GREEN = sgr(32);
const YELLOW = sgr(33);
|
|
18
|
+
/**
 * Write one line to standard output.
 *
 * @param {string} msg - Text to emit; a newline is appended.
 */
function print(msg) {
  const line = msg + "\n";
  process.stdout.write(line);
}
|
|
21
|
+
/**
 * Render the contents of `.ashr.json` for a project.
 *
 * @param {object} config - Detected project settings. Reads `apiKeyEnvVar`,
 *   `agentName`, `agentDescription`, `entrypoint` and `domain`.
 * @returns {string} Pretty-printed JSON (2-space indent, no trailing newline).
 */
function generateAshrConfig(config) {
  const agent = {
    name: config.agentName,
    description: config.agentDescription,
    entrypoint: config.entrypoint,
    domain: config.domain,
  };

  // Fixed evaluation defaults written into every new config.
  const evalDefaults = {
    autoRun: true,
    scenarioCount: 5,
    maxWorkers: 3,
  };

  const document = {
    $schema: "https://rank.ashr.io/schemas/ashr.json",
    apiKeyEnvVar: config.apiKeyEnvVar,
    agent,
    eval: evalDefaults,
  };

  return JSON.stringify(document, null, 2);
}
|
|
38
|
+
/**
 * Render the "Ashr Labs" section that gets appended to the project's CLAUDE.md.
 *
 * The embedded example is emitted in the project's own language:
 *  - TypeScript uses the camelCase client method `generateDataset`.
 *  - Python gets syntactically valid Python (no `const` / `new` / `//`) and the
 *    `import os` its `os.environ[...]` lookup needs.
 * Agent name, description and domain are embedded through `JSON.stringify`, so
 * quotes or backslashes in them cannot break the generated string literals.
 *
 * @param {object} config - Detected project settings. Reads `lang`,
 *   `apiKeyEnvVar`, `agentName`, `agentDescription` and `domain`.
 * @returns {string} Markdown starting with a blank line and the
 *   `# Ashr Labs` heading, ending with a newline.
 */
function generateClaudeMdSection(config) {
  const isTs = config.lang === "typescript";
  const envVar = config.apiKeyEnvVar;

  // A JSON string literal is also a valid TS and Python string literal.
  const quote = (value) => JSON.stringify(String(value));
  const title = quote(`Eval: ${config.agentName}`);
  const name = quote(config.agentName);
  const description = quote(config.agentDescription);
  const domain = quote(config.domain);

  const tsSnippet = [
    `import { AshrLabsClient, EvalRunner } from "ashr-labs";`,
    ``,
    `const client = new AshrLabsClient(process.env.${envVar}!);`,
    ``,
    `// Generate a test dataset from the agent config`,
    `const [datasetId, source] = await client.generateDataset(`,
    `  ${title},`,
    `  {`,
    `    agent: { name: ${name}, description: ${description} },`,
    `    context: { domain: ${domain} },`,
    `  }`,
    `);`,
    ``,
    `// Run the agent against it`,
    `const runner = new EvalRunner(source);`,
    `const results = await runner.run(agent, { maxWorkers: 3 });`,
    `const metrics = results.build().aggregate_metrics;`,
    `// Deploy: await results.deploy(client, datasetId);`,
  ];

  // NOTE(review): the Python SDK is not visible from this file. The call names
  // mirror the original template; the synchronous style, `max_workers=` and
  // dict-style metrics access are assumptions — confirm against ashr_labs docs.
  const pySnippet = [
    `import os`,
    ``,
    `from ashr_labs import AshrLabsClient, EvalRunner`,
    ``,
    `client = AshrLabsClient(os.environ["${envVar}"])`,
    ``,
    `# Generate a test dataset from the agent config`,
    `dataset_id, source = client.generate_dataset(`,
    `    ${title},`,
    `    {`,
    `        "agent": {"name": ${name}, "description": ${description}},`,
    `        "context": {"domain": ${domain}},`,
    `    },`,
    `)`,
    ``,
    `# Run the agent against it`,
    `runner = EvalRunner(source)`,
    `results = runner.run(agent, max_workers=3)`,
    `metrics = results.build()["aggregate_metrics"]`,
    `# Deploy: results.deploy(client, dataset_id)`,
  ];

  const snippet = (isTs ? tsSnippet : pySnippet).join("\n");

  return `
# Ashr Labs — Automated Agent Testing

This project uses [Ashr Labs](https://rank.ashr.io) for continuous agent evaluation.
Configuration is in \`.ashr.json\`. The API key is in the \`${envVar}\` environment variable.

## When to test

Run \`/test-agent\` or proactively trigger a test whenever you:
- Change agent logic, prompts, or system instructions
- Add, modify, or remove tools the agent can call
- Change how the agent handles edge cases
- Before committing agent-related changes

## How to test

\`\`\`${config.lang}
${snippet}
\`\`\`

## Interpreting results

- \`tests_passed / total_tests\` — overall pass rate
- \`average_similarity_score\` — how close agent responses match expected (0-1)
- \`total_tool_call_divergence\` — count of wrong/missing tool calls
- If pass rate drops after a change, investigate before committing
`;
}
|
|
90
|
+
/**
 * Render the `/test-agent` Claude Code command file.
 *
 * The instructions are language-aware: the environment lookup, the
 * dataset-generation method name and the runner options are phrased for the
 * project's language instead of always using the JavaScript form.
 *
 * @param {object} config - Detected project settings. Reads `lang`,
 *   `agentName`, `entrypoint` and `apiKeyEnvVar`.
 * @returns {string} Markdown with YAML front matter, ending with a newline.
 */
function generateTestAgentCommand(config) {
  const isTs = config.lang === "typescript";
  const sdkModule = isTs ? "ashr-labs" : "ashr_labs";
  const envAccess = isTs
    ? `process.env.${config.apiKeyEnvVar}`
    : `os.environ["${config.apiKeyEnvVar}"]`;
  // The TS client exposes generateDataset(); the snake_case name is kept for Python.
  const generateCall = isTs ? "client.generateDataset()" : "client.generate_dataset()";
  // NOTE(review): the Python keyword name `max_workers` is an assumption — confirm.
  const runCall = isTs
    ? "runner.run(agent, { maxWorkers: 3 })"
    : "runner.run(agent, max_workers=3)";

  return `---
description: Run the Ashr Labs eval suite against the agent and report results.
---

Run an automated evaluation of the "${config.agentName}" agent using the Ashr Labs SDK.

## Steps

1. Read \`.ashr.json\` for project configuration.
2. Read the agent code at \`${config.entrypoint}\` to understand current behavior.
3. Write a temporary eval script that:
   - Imports \`AshrLabsClient\` and \`EvalRunner\` from \`${sdkModule}\`
   - Creates a client using \`${envAccess}\`
   - Calls \`${generateCall}\` with the agent config from \`.ashr.json\`
   - Implements a lightweight Agent wrapper around the actual agent code in \`${config.entrypoint}\`
   - Runs \`${runCall}\`
   - Prints aggregate metrics (pass rate, similarity, divergence)
4. Run the eval script.
5. Report results clearly:
   - Total tests, passed, failed
   - Average similarity score
   - Any tool call mismatches (list them)
6. If there are failures, read the agent code and suggest specific improvements.
7. Clean up the temporary eval script.

## Important

- Use the \`${config.apiKeyEnvVar}\` env var for the API key — never hardcode it.
- If the eval finds issues, propose fixes to \`${config.entrypoint}\` but ask before applying.
- Compare results to previous runs if available.
`;
}
|
|
124
|
+
/**
 * Render the `/improve-agent` Claude Code command file.
 *
 * @param {object} config - Detected project settings. Reads `agentName` and
 *   `entrypoint`.
 * @returns {string} Markdown with YAML front matter, ending with a newline.
 */
function generateImproveAgentCommand(config) {
  const frontMatter = [
    "---",
    "description: Run evals, find failures, and fix the agent automatically.",
    "---",
  ];

  const steps = [
    "## Steps",
    "",
    "1. Run `/test-agent` first to get a baseline.",
    "2. For each failure or low-similarity result:",
    "a. Read the failing scenario and expected behavior.",
    `b. Read the agent code at \`${config.entrypoint}\`.`,
    "c. Identify why the agent produced the wrong output.",
    "d. Apply a targeted fix (prompt change, logic change, tool handling).",
    "e. Re-run `/test-agent` to verify the fix.",
    "3. Repeat until pass rate is above 80% or no more actionable failures.",
    "4. Summarize all changes made and the before/after metrics.",
  ];

  const rules = [
    "## Rules",
    "",
    "- Make the smallest change that fixes each failure.",
    "- Don't refactor unrelated code.",
    "- If a test seems wrong (bad expected output), note it but don't skip it.",
    "- Deploy the final passing run with `results.deploy(client, datasetId)`.",
  ];

  const lines = [
    ...frontMatter,
    "",
    `Continuously improve "${config.agentName}" by running evaluations and fixing issues.`,
    "",
    ...steps,
    "",
    ...rules,
    "", // yields the trailing newline
  ];

  return lines.join("\n");
}
|
|
151
|
+
/**
 * Build the Claude Code settings fragment that registers a `Stop` hook
 * prompting a reminder to run `/test-agent` after agent code was edited.
 *
 * @param {object} config - Detected project settings. Reads `agentName` and
 *   `entrypoint`.
 * @returns {{hooks: {Stop: Array<object>}}} Settings object ready to be merged
 *   into `.claude/settings.json`.
 */
function generateHookSettings(config) {
  const prompt = [
    `If you modified agent code in this conversation (files related to "${config.agentName}" or ${config.entrypoint}), `,
    `remind the user they can run /test-agent to verify the changes. `,
    `Keep it to one sentence.`,
  ].join("");

  const reminderHook = { type: "prompt", prompt };
  const stopEntry = { matcher: "", hooks: [reminderHook] };

  return { hooks: { Stop: [stopEntry] } };
}
|
|
170
|
+
// ──────────────────────────────────────────────────────────────
|
|
171
|
+
// File writers (writeFile skips files that already exist; appendToFile and mergeJsonFile extend existing files)
|
|
172
|
+
// ──────────────────────────────────────────────────────────────
|
|
173
|
+
/**
 * Create a file unless something already exists at that path.
 *
 * Missing parent directories are created first. One status line
 * ("create" or "skip") is printed with the path relative to the cwd.
 *
 * @param {string} filePath - Destination path.
 * @param {string} content - Text written as UTF-8.
 * @returns {boolean} `true` when the file was written, `false` when it was
 *   left alone because it already existed.
 */
function writeFile(filePath, content) {
  const shown = () => path.relative(process.cwd(), filePath);

  const parent = path.dirname(filePath);
  if (!fs.existsSync(parent)) {
    fs.mkdirSync(parent, { recursive: true });
  }

  const alreadyThere = fs.existsSync(filePath);
  if (!alreadyThere) {
    fs.writeFileSync(filePath, content, "utf-8");
  }

  print(
    alreadyThere
      ? ` ${YELLOW}skip${RESET} ${shown()}`
      : ` ${GREEN}create${RESET} ${shown()}`,
  );
  return !alreadyThere;
}
|
|
186
|
+
/**
 * Append a section to a file once, creating the file when it is missing.
 *
 * If the file already contains `marker` the section is considered present and
 * nothing is written. Otherwise the content is appended after a newline. When
 * the file does not exist it is created with `content`, including any missing
 * parent directories (matching `writeFile` and `mergeJsonFile`).
 *
 * @param {string} filePath - Destination path.
 * @param {string} content - Text to append or write, as UTF-8.
 * @param {string} marker - Substring whose presence means "already added".
 * @returns {boolean} `true` when the file was created or appended to,
 *   `false` when it was skipped.
 */
function appendToFile(filePath, content, marker) {
  const shown = path.relative(process.cwd(), filePath);

  if (fs.existsSync(filePath)) {
    const existing = fs.readFileSync(filePath, "utf-8");
    if (existing.includes(marker)) {
      print(` ${YELLOW}skip${RESET} ${shown}`);
      return false;
    }
    // Leading newline keeps the section off the file's last existing line.
    fs.appendFileSync(filePath, `\n${content}`, "utf-8");
    print(` ${GREEN}append${RESET} ${shown}`);
    return true;
  }

  // A recursive mkdir is a no-op when the directory is already there.
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  fs.writeFileSync(filePath, content, "utf-8");
  print(` ${GREEN}create${RESET} ${shown}`);
  return true;
}
|
|
202
|
+
/**
 * Merge hook settings into a JSON settings file without losing user data.
 *
 * - All existing top-level keys and hook events are preserved.
 * - Handlers from `newSettings.hooks` are appended per event, skipping any
 *   handler that is already present (compared by JSON serialization), so
 *   re-running the setup does not pile up duplicates.
 * - A file that exists but does not contain a JSON object is left untouched
 *   instead of being replaced; an empty file is treated as `{}`.
 *
 * @param {string} filePath - Path of the JSON file to create or update.
 * @param {{hooks?: Object<string, Array<object>>}} newSettings - Settings to
 *   merge; only the `hooks` key is merged.
 * @returns {boolean} `true` when the file was written, `false` when it was
 *   skipped (unparsable content or nothing new to add).
 */
function mergeJsonFile(filePath, newSettings) {
  const shown = path.relative(process.cwd(), filePath);
  const isPlainObject = (value) =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  const dir = path.dirname(filePath);
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir, { recursive: true });
  }

  const existed = fs.existsSync(filePath);
  let existing = {};
  if (existed) {
    const raw = fs.readFileSync(filePath, "utf-8");
    if (raw.trim() !== "") {
      let parsed;
      try {
        parsed = JSON.parse(raw);
      } catch {
        parsed = undefined;
      }
      if (!isPlainObject(parsed)) {
        // Overwriting here would silently destroy the user's settings.
        print(` ${YELLOW}skip${RESET} ${shown} (not a valid JSON object — left untouched)`);
        return false;
      }
      existing = parsed;
    }
  }

  const merged = { ...existing };
  let changed = !existed;

  if (newSettings.hooks) {
    const existingHooks = isPlainObject(existing.hooks) ? existing.hooks : {};
    merged.hooks = { ...existingHooks };
    for (const [event, handlers] of Object.entries(newSettings.hooks)) {
      const current = Array.isArray(existingHooks[event]) ? existingHooks[event] : [];
      const seen = new Set(current.map((handler) => JSON.stringify(handler)));
      const combined = [...current];
      for (const handler of handlers) {
        const key = JSON.stringify(handler);
        if (!seen.has(key)) {
          seen.add(key);
          combined.push(handler);
          changed = true;
        }
      }
      merged.hooks[event] = combined;
    }
  }

  if (!changed) {
    print(` ${YELLOW}skip${RESET} ${shown}`);
    return false;
  }

  fs.writeFileSync(filePath, JSON.stringify(merged, null, 2) + "\n", "utf-8");
  print(` ${GREEN}${existed ? "update" : "create"}${RESET} ${shown}`);
  return true;
}
|
|
232
|
+
// ──────────────────────────────────────────────────────────────
|
|
233
|
+
// Auto-detection
|
|
234
|
+
// ──────────────────────────────────────────────────────────────
|
|
235
|
+
/**
 * Guess the project's language from marker files in the current directory.
 *
 * Node/TypeScript markers are checked first, then Python markers; with no
 * markers at all the answer is "typescript".
 *
 * @returns {"typescript"|"python"} Detected language.
 */
function detectLang() {
  const anyExists = (...names) => names.some((name) => fs.existsSync(name));

  if (anyExists("tsconfig.json", "package.json")) {
    return "typescript";
  }
  return anyExists("pyproject.toml", "setup.py", "requirements.txt")
    ? "python"
    : "typescript";
}
|
|
242
|
+
/**
 * Pick the most likely agent entrypoint for the given language.
 *
 * Returns the first conventional path that exists relative to the current
 * directory, or a language-specific default when none of them exists.
 *
 * @param {string} lang - "typescript" selects the TS/JS candidates; any
 *   other value selects the Python candidates.
 * @returns {string} Relative path of the entrypoint.
 */
function detectEntrypoint(lang) {
  const isTs = lang === "typescript";
  const fallback = isTs ? "src/agent.ts" : "agent.py";

  let candidates;
  if (isTs) {
    candidates = ["src/agent.ts", "src/index.ts", "agent.ts", "index.ts", "src/agent.js", "agent.js"];
  } else {
    candidates = ["agent.py", "src/agent.py", "app/agent.py", "main.py", "src/main.py"];
  }

  const found = candidates.find((candidate) => fs.existsSync(candidate));
  return found ?? fallback;
}
|
|
252
|
+
/**
 * Derive a name for the agent from project metadata in the current directory.
 *
 * Sources, in priority order: the `name` field of package.json, the first
 * double-quoted `name = "..."` line in pyproject.toml, and finally the name of
 * the current working directory. Unreadable or unparsable files are ignored.
 *
 * @returns {string} Detected agent name.
 */
function detectAgentName() {
  // 1) package.json "name"
  try {
    if (fs.existsSync("package.json")) {
      const { name } = JSON.parse(fs.readFileSync("package.json", "utf-8"));
      if (name && name !== "undefined") {
        return name;
      }
    }
  } catch {
    /* best effort: fall through to the next source */
  }

  // 2) pyproject.toml name = "..."
  try {
    if (fs.existsSync("pyproject.toml")) {
      const found = /^name\s*=\s*"([^"]+)"/m.exec(fs.readFileSync("pyproject.toml", "utf-8"));
      if (found) {
        return found[1];
      }
    }
  } catch {
    /* best effort: fall through to the directory name */
  }

  // 3) directory name
  return path.basename(process.cwd());
}
|
|
275
|
+
/**
 * Derive a description for the agent from project metadata in the current
 * directory.
 *
 * Sources, in priority order: the `description` field of package.json, the
 * first double-quoted `description = "..."` line in pyproject.toml, and
 * finally the generic text "An AI agent". Unreadable or unparsable files are
 * ignored.
 *
 * @returns {string} Detected description.
 */
function detectDescription() {
  // 1) package.json "description"
  try {
    if (fs.existsSync("package.json")) {
      const { description } = JSON.parse(fs.readFileSync("package.json", "utf-8"));
      if (description) {
        return description;
      }
    }
  } catch {
    /* best effort: fall through to the next source */
  }

  // 2) pyproject.toml description = "..."
  try {
    if (fs.existsSync("pyproject.toml")) {
      const found = /^description\s*=\s*"([^"]+)"/m.exec(fs.readFileSync("pyproject.toml", "utf-8"));
      if (found) {
        return found[1];
      }
    }
  } catch {
    /* best effort: fall through to the default */
  }

  return "An AI agent";
}
|
|
295
|
+
// ──────────────────────────────────────────────────────────────
|
|
296
|
+
// Main
|
|
297
|
+
// ──────────────────────────────────────────────────────────────
|
|
298
|
+
/**
 * Make sure `.env` defines `envVar`, creating the file when necessary.
 *
 * An existing definition (optionally prefixed with `export`) is never changed.
 * When appending, a newline is inserted first if the file does not already end
 * with one, so the new entry cannot be glued onto the previous line.
 *
 * @param {string} envVar - Name of the environment variable.
 * @param {string} apiKey - Value to store.
 */
function upsertEnvKey(envVar, apiKey) {
  const envLine = `${envVar}=${apiKey}\n`;

  if (!fs.existsSync(".env")) {
    // The file holds a secret: restrict it to the current user where supported.
    fs.writeFileSync(".env", envLine, { encoding: "utf-8", mode: 0o600 });
    print(` ${GREEN}create${RESET} .env`);
    return;
  }

  const envContent = fs.readFileSync(".env", "utf-8");
  const definesVar = (line) => {
    const eq = line.indexOf("=");
    if (eq === -1) {
      return false;
    }
    return line.slice(0, eq).trim().replace(/^export\s+/, "") === envVar;
  };

  if (envContent.split(/\r?\n/).some(definesVar)) {
    print(` ${YELLOW}skip${RESET} .env`);
    return;
  }

  const separator = envContent === "" || envContent.endsWith("\n") ? "" : "\n";
  fs.appendFileSync(".env", separator + envLine, "utf-8");
  print(` ${GREEN}append${RESET} .env`);
}

/**
 * Make sure `.env` is git-ignored, so the stored API key is not committed.
 *
 * Entries are compared line by line; a line such as `.env.example` does not
 * count as ignoring `.env`. A missing `.gitignore` is created.
 */
function ensureEnvIgnored() {
  if (!fs.existsSync(".gitignore")) {
    fs.writeFileSync(".gitignore", ".env\n", "utf-8");
    print(` ${GREEN}create${RESET} .gitignore`);
    return;
  }

  const gi = fs.readFileSync(".gitignore", "utf-8");
  const covering = new Set([".env", "/.env", "**/.env", ".env*", "*.env"]);
  const alreadyIgnored = gi.split(/\r?\n/).some((line) => covering.has(line.trim()));
  if (alreadyIgnored) {
    return;
  }

  const separator = gi === "" || gi.endsWith("\n") ? "" : "\n";
  fs.appendFileSync(".gitignore", `${separator}.env\n`, "utf-8");
  print(` ${GREEN}append${RESET} .gitignore`);
}

/**
 * CLI entry point: validate the API key, auto-detect the project and write
 * the Ashr Labs / Claude Code configuration files into the current directory.
 *
 * The key is taken from the first positional argument starting with `tp_`,
 * falling back to the `ASHR_LABS_API_KEY` environment variable. The process
 * exits with status 1 when no valid key is available and with status 0 for
 * `--help` or when `.ashr.json` already exists. A failed online validation is
 * reported but does not stop the setup.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);

  // Help
  if (args.includes("--help") || args.includes("-h")) {
    print(`\n${BOLD}ashr-labs${RESET} — set up automatic agent testing\n`);
    print(`Usage: npx ashr-labs <api-key>\n`);
    print(` The API key is the only required argument. Everything else`);
    print(` is auto-detected from your project.\n`);
    print(` You can also set ASHR_LABS_API_KEY in your environment`);
    print(` and run ${DIM}npx ashr-labs${RESET} with no arguments.\n`);
    print(` Get your key at ${BOLD}https://app.ashr.io → API Keys${RESET}\n`);
    process.exit(0);
  }

  // Find the API key: first positional arg that starts with tp_, or env var.
  // "init" is skipped so the older `npx ashr-labs init <key>` form keeps working.
  const positionalArgs = args.filter((a) => a !== "init" && !a.startsWith("-"));
  // `||` (not `??`) on purpose: an empty env var must count as "not set".
  const apiKey = positionalArgs.find((a) => a.startsWith("tp_"))
    || process.env.ASHR_LABS_API_KEY
    || "";

  if (!apiKey) {
    print(`\n${BOLD}${ASHR_BLUE} ashr labs${RESET}\n`);
    print(` Usage: ${BOLD}npx ashr-labs <api-key>${RESET}\n`);
    print(` Get your key at https://app.ashr.io → API Keys\n`);
    process.exit(1);
  }
  if (!apiKey.startsWith("tp_")) {
    print(`\n${YELLOW} Invalid API key — must start with tp_${RESET}`);
    print(` Get one at https://app.ashr.io → API Keys\n`);
    process.exit(1);
  }

  print("");
  print(`${BOLD}${ASHR_BLUE} ashr labs${RESET} ${DIM}— setting up agent testing${RESET}`);
  print("");

  // Already initialized?
  if (fs.existsSync(".ashr.json")) {
    print(` ${YELLOW}.ashr.json already exists${RESET} — delete it first to re-initialize.\n`);
    process.exit(0);
  }

  // Validate the key (best effort: a failure is reported, setup continues).
  print(` ${DIM}Validating...${RESET}`);
  try {
    const { AshrLabsClient } = await import("./client.js");
    const client = new AshrLabsClient(apiKey);
    const session = await client.init();
    // Optional chaining: a session without tenant info is not a validation failure.
    const tenantName = session?.tenant?.tenant_name || "";
    print(` ${GREEN}✓${RESET} ${tenantName}\n`);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    print(` ${YELLOW}✗ ${reason}${RESET}`);
    print(` ${DIM}Continuing anyway.${RESET}\n`);
  }

  // Auto-detect everything
  const lang = detectLang();
  const config = {
    apiKeyEnvVar: "ASHR_LABS_API_KEY",
    agentName: detectAgentName(),
    agentDescription: detectDescription(),
    entrypoint: detectEntrypoint(lang),
    domain: "general",
    lang,
  };

  // Write all files
  writeFile(".ashr.json", generateAshrConfig(config));
  upsertEnvKey(config.apiKeyEnvVar, apiKey);
  ensureEnvIgnored();
  appendToFile("CLAUDE.md", generateClaudeMdSection(config), "# Ashr Labs");
  writeFile(".claude/commands/test-agent.md", generateTestAgentCommand(config));
  writeFile(".claude/commands/improve-agent.md", generateImproveAgentCommand(config));
  mergeJsonFile(".claude/settings.json", generateHookSettings(config));

  // Done
  print(`\n${GREEN} Done.${RESET} Open Claude Code and type ${BOLD}/test-agent${RESET}\n`);
}
|
|
393
|
+
// Run the CLI. Any failure that escapes main() is reported on stderr and
// turned into a non-zero exit status. Rejections that are not Error instances
// (including null/undefined) are stringified instead of printing "undefined"
// or throwing inside the handler.
main().catch((e) => {
  console.error(e instanceof Error ? e.message : String(e));
  process.exit(1);
});
|
package/dist/client.d.ts
CHANGED
|
@@ -13,20 +13,26 @@ export declare class AshrLabsClient {
|
|
|
13
13
|
private _makeRequest;
|
|
14
14
|
private _raiseForStatus;
|
|
15
15
|
getDataset(datasetId: number, includeSignedUrls?: boolean, urlExpiresSeconds?: number): Promise<Record<string, unknown>>;
|
|
16
|
-
listDatasets(tenantId?: number | null, limit?: number,
|
|
16
|
+
listDatasets(tenantId?: number | null, limit?: number, cursor?: number | null, includeSignedUrls?: boolean, urlExpiresSeconds?: number): Promise<Record<string, unknown>>;
|
|
17
17
|
createRun(datasetId: number, result: Record<string, unknown>, tenantId?: number | null, runnerId?: number | null): Promise<Record<string, unknown>>;
|
|
18
18
|
deleteRun(runId: number): Promise<Record<string, unknown>>;
|
|
19
19
|
getRun(runId: number): Promise<Record<string, unknown>>;
|
|
20
|
-
listRuns(datasetId?: number | null, tenantId?: number | null, limit?: number
|
|
20
|
+
listRuns(datasetId?: number | null, tenantId?: number | null, limit?: number): Promise<Record<string, unknown>>;
|
|
21
21
|
private static _validateConfigStructure;
|
|
22
22
|
createRequest(requestName: string, request: Record<string, unknown>, requestInputSchema?: Record<string, unknown> | null, tenantId?: number | null, requestorId?: number | null): Promise<Record<string, unknown>>;
|
|
23
23
|
getRequest(requestId: number): Promise<Record<string, unknown>>;
|
|
24
|
-
listRequests(tenantId?: number | null, status?: string | null, limit?: number,
|
|
24
|
+
listRequests(tenantId?: number | null, status?: string | null, limit?: number, cursor?: number | null): Promise<Record<string, unknown>>;
|
|
25
25
|
listApiKeys(includeInactive?: boolean): Promise<Record<string, unknown>[]>;
|
|
26
26
|
revokeApiKey(apiKeyId: number): Promise<Record<string, unknown>>;
|
|
27
27
|
init(): Promise<Record<string, unknown>>;
|
|
28
28
|
healthCheck(): Promise<Record<string, unknown>>;
|
|
29
29
|
waitForRequest(requestId: number, timeout?: number, pollInterval?: number): Promise<Record<string, unknown>>;
|
|
30
|
+
/**
|
|
31
|
+
* Fill in missing context fields so the backend has enough to generate.
|
|
32
|
+
* If use_case and scenario_context are both missing, synthesize them
|
|
33
|
+
* from agent name/description.
|
|
34
|
+
*/
|
|
35
|
+
private static _enrichConfig;
|
|
30
36
|
generateDataset(requestName: string, config: Record<string, unknown>, requestInputSchema?: Record<string, unknown> | null, timeout?: number, pollInterval?: number): Promise<[number, Record<string, unknown>]>;
|
|
31
37
|
toString(): string;
|
|
32
38
|
}
|
package/dist/client.js
CHANGED
|
@@ -114,14 +114,16 @@ export class AshrLabsClient {
|
|
|
114
114
|
});
|
|
115
115
|
return response.dataset;
|
|
116
116
|
}
|
|
117
|
-
async listDatasets(tenantId, limit = 50,
|
|
118
|
-
|
|
117
|
+
async listDatasets(tenantId, limit = 50, cursor, includeSignedUrls = false, urlExpiresSeconds = 3600) {
|
|
118
|
+
const params = {
|
|
119
119
|
tenant_id: await this._resolveTenantId(tenantId),
|
|
120
120
|
limit,
|
|
121
|
-
offset,
|
|
122
121
|
include_signed_urls: includeSignedUrls,
|
|
123
122
|
url_expires_seconds: urlExpiresSeconds,
|
|
124
|
-
}
|
|
123
|
+
};
|
|
124
|
+
if (cursor != null)
|
|
125
|
+
params.cursor = cursor;
|
|
126
|
+
return this._makeRequest("list_datasets", params);
|
|
125
127
|
}
|
|
126
128
|
// =========================================================================
|
|
127
129
|
// Run Operations
|
|
@@ -144,11 +146,10 @@ export class AshrLabsClient {
|
|
|
144
146
|
const response = await this._makeRequest("get_run", { run_id: runId });
|
|
145
147
|
return response.run;
|
|
146
148
|
}
|
|
147
|
-
async listRuns(datasetId, tenantId, limit = 50
|
|
149
|
+
async listRuns(datasetId, tenantId, limit = 50) {
|
|
148
150
|
const params = {
|
|
149
151
|
tenant_id: await this._resolveTenantId(tenantId),
|
|
150
152
|
limit,
|
|
151
|
-
offset,
|
|
152
153
|
};
|
|
153
154
|
if (datasetId != null)
|
|
154
155
|
params.dataset_id = datasetId;
|
|
@@ -210,14 +211,15 @@ export class AshrLabsClient {
|
|
|
210
211
|
});
|
|
211
212
|
return response.request;
|
|
212
213
|
}
|
|
213
|
-
async listRequests(tenantId, status, limit = 50,
|
|
214
|
+
async listRequests(tenantId, status, limit = 50, cursor) {
|
|
214
215
|
const params = {
|
|
215
216
|
tenant_id: await this._resolveTenantId(tenantId),
|
|
216
217
|
limit,
|
|
217
|
-
offset,
|
|
218
218
|
};
|
|
219
219
|
if (status != null)
|
|
220
220
|
params.status = status;
|
|
221
|
+
if (cursor != null)
|
|
222
|
+
params.cursor = cursor;
|
|
221
223
|
return this._makeRequest("list_requests", params);
|
|
222
224
|
}
|
|
223
225
|
// =========================================================================
|
|
@@ -259,16 +261,63 @@ export class AshrLabsClient {
|
|
|
259
261
|
}
|
|
260
262
|
throw new Error(`Request ${requestId} did not complete within ${timeout}s`);
|
|
261
263
|
}
|
|
264
|
+
/**
|
|
265
|
+
* Fill in missing context fields so the backend has enough to generate.
|
|
266
|
+
* If use_case and scenario_context are both missing, synthesize them
|
|
267
|
+
* from agent name/description.
|
|
268
|
+
*/
|
|
269
|
+
static _enrichConfig(config) {
|
|
270
|
+
const out = structuredClone(config);
|
|
271
|
+
const agent = (out.agent ?? {});
|
|
272
|
+
const context = (out.context ?? {});
|
|
273
|
+
const hasUseCase = Boolean(context.use_case);
|
|
274
|
+
const hasScenario = Boolean(context.scenario_context);
|
|
275
|
+
if (!hasUseCase && !hasScenario) {
|
|
276
|
+
const name = (agent.name ?? "");
|
|
277
|
+
const desc = (agent.description ?? "");
|
|
278
|
+
const domain = (context.domain ?? "");
|
|
279
|
+
if (desc) {
|
|
280
|
+
context.use_case = desc;
|
|
281
|
+
}
|
|
282
|
+
else if (name) {
|
|
283
|
+
context.use_case = `Testing the ${name} agent`;
|
|
284
|
+
}
|
|
285
|
+
if (name && desc) {
|
|
286
|
+
const parts = [`A user interacting with ${name}`];
|
|
287
|
+
if (domain && domain !== "general") {
|
|
288
|
+
parts.push(`in the ${domain} domain`);
|
|
289
|
+
}
|
|
290
|
+
parts.push(`— ${desc}`);
|
|
291
|
+
context.scenario_context = parts.join(" ");
|
|
292
|
+
}
|
|
293
|
+
out.context = context;
|
|
294
|
+
}
|
|
295
|
+
// Default test_config if missing
|
|
296
|
+
if (!out.test_config) {
|
|
297
|
+
out.test_config = {
|
|
298
|
+
num_variations: 5,
|
|
299
|
+
coverage: {
|
|
300
|
+
happy_path: true,
|
|
301
|
+
edge_cases: true,
|
|
302
|
+
error_handling: true,
|
|
303
|
+
},
|
|
304
|
+
};
|
|
305
|
+
}
|
|
306
|
+
return out;
|
|
307
|
+
}
|
|
262
308
|
async generateDataset(requestName, config, requestInputSchema, timeout = 600, pollInterval = 5) {
|
|
263
|
-
const
|
|
309
|
+
const enriched = AshrLabsClient._enrichConfig(config);
|
|
310
|
+
const req = await this.createRequest(requestName, enriched, requestInputSchema);
|
|
264
311
|
const requestId = req.id;
|
|
265
312
|
await this.waitForRequest(requestId, timeout, pollInterval);
|
|
266
|
-
|
|
313
|
+
// Find the dataset created by this request — check recent datasets
|
|
314
|
+
const resp = await this.listDatasets(undefined, 10, undefined, false);
|
|
267
315
|
const datasets = resp.datasets;
|
|
268
|
-
|
|
269
|
-
|
|
316
|
+
const match = datasets?.find((d) => d.request_id === requestId);
|
|
317
|
+
if (!match) {
|
|
318
|
+
throw new AshrLabsError(`No dataset found for request ${requestId}`);
|
|
270
319
|
}
|
|
271
|
-
const datasetId =
|
|
320
|
+
const datasetId = match.id;
|
|
272
321
|
const fullDs = await this.getDataset(datasetId, false);
|
|
273
322
|
const source = (fullDs.dataset_source ?? {});
|
|
274
323
|
return [datasetId, source];
|
package/dist/eval.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { RunBuilder } from "./run-builder.js";
|
|
2
2
|
import type { AshrLabsClient } from "./client.js";
|
|
3
3
|
export interface Agent {
|
|
4
|
-
respond(message: string): Record<string, unknown> | Promise<Record<string, unknown>>;
|
|
5
|
-
reset(): void | Promise<void>;
|
|
4
|
+
respond(message: string, scenarioId?: string): Record<string, unknown> | Promise<Record<string, unknown>>;
|
|
5
|
+
reset(scenarioId?: string): void | Promise<void>;
|
|
6
6
|
}
|
|
7
7
|
export type OnScenarioCallback = (scenarioId: string, scenario: Record<string, unknown>) => void;
|
|
8
8
|
export type OnActionCallback = (actionIndex: number, action: Record<string, unknown>) => void;
|
|
@@ -31,12 +31,12 @@ export declare class EvalRunner {
|
|
|
31
31
|
};
|
|
32
32
|
}): Promise<EvalRunner>;
|
|
33
33
|
private _runScenario;
|
|
34
|
-
run(agent: Agent, options?: {
|
|
34
|
+
run(agent: Agent | (() => Agent), options?: {
|
|
35
35
|
onScenario?: OnScenarioCallback;
|
|
36
36
|
onAction?: OnActionCallback;
|
|
37
37
|
maxWorkers?: number;
|
|
38
38
|
}): Promise<RunBuilder>;
|
|
39
|
-
runAndDeploy(agent: Agent, client: AshrLabsClient, datasetId
|
|
39
|
+
runAndDeploy(agent: Agent | (() => Agent), client: AshrLabsClient, datasetId: number, options?: {
|
|
40
40
|
onScenario?: OnScenarioCallback;
|
|
41
41
|
onAction?: OnActionCallback;
|
|
42
42
|
maxWorkers?: number;
|
package/dist/eval.js
CHANGED
|
@@ -22,7 +22,7 @@ export class EvalRunner {
|
|
|
22
22
|
async _runScenario(agent, runId, scenario, onScenario, onAction) {
|
|
23
23
|
if (onScenario)
|
|
24
24
|
onScenario(runId, scenario);
|
|
25
|
-
await agent.reset();
|
|
25
|
+
await agent.reset(runId);
|
|
26
26
|
const test = new TestBuilder(runId);
|
|
27
27
|
test.start();
|
|
28
28
|
let agentText = "";
|
|
@@ -37,7 +37,7 @@ export class EvalRunner {
|
|
|
37
37
|
if (actor === "user") {
|
|
38
38
|
test.addUserText(content, action.name ?? `user_action_${i}`, i);
|
|
39
39
|
try {
|
|
40
|
-
const result = await agent.respond(content);
|
|
40
|
+
const result = await agent.respond(content, runId);
|
|
41
41
|
agentText = (result.text ?? "");
|
|
42
42
|
agentTools = [...(result.tool_calls ?? [])];
|
|
43
43
|
}
|
|
@@ -106,36 +106,30 @@ export class EvalRunner {
|
|
|
106
106
|
}
|
|
107
107
|
}
|
|
108
108
|
const maxWorkers = options?.maxWorkers ?? 1;
|
|
109
|
+
const resolvedAgent = typeof agent === "function" ? agent() : agent;
|
|
109
110
|
if (maxWorkers <= 1) {
|
|
110
|
-
// Sequential — use the agent directly
|
|
111
111
|
for (const [runId, scenario] of scenarios) {
|
|
112
|
-
const test = await this._runScenario(
|
|
112
|
+
const test = await this._runScenario(resolvedAgent, runId, scenario, options?.onScenario, options?.onAction);
|
|
113
113
|
run._tests.push(test);
|
|
114
114
|
}
|
|
115
115
|
}
|
|
116
116
|
else {
|
|
117
117
|
// Parallel — run scenarios concurrently with concurrency limit.
|
|
118
|
-
//
|
|
119
|
-
//
|
|
120
|
-
//
|
|
121
|
-
//
|
|
118
|
+
// The agent must key its conversation state on the scenarioId
|
|
119
|
+
// passed to respond(message, scenarioId) and reset(scenarioId).
|
|
120
|
+
// This allows a single agent instance (one API client) to handle
|
|
121
|
+
// multiple concurrent scenarios without cloning or extra clients.
|
|
122
122
|
const results = new Array(scenarios.length).fill(null);
|
|
123
|
-
// Process in batches of maxWorkers
|
|
124
123
|
for (let batchStart = 0; batchStart < scenarios.length; batchStart += maxWorkers) {
|
|
125
124
|
const batchEnd = Math.min(batchStart + maxWorkers, scenarios.length);
|
|
126
125
|
const batch = scenarios.slice(batchStart, batchEnd);
|
|
127
126
|
const promises = batch.map(async ([runId, scenario], batchIdx) => {
|
|
128
127
|
const idx = batchStart + batchIdx;
|
|
129
128
|
try {
|
|
130
|
-
|
|
131
|
-
const agentCopy = structuredClone(agent);
|
|
132
|
-
// Restore prototype methods lost by structuredClone
|
|
133
|
-
Object.setPrototypeOf(agentCopy, Object.getPrototypeOf(agent));
|
|
134
|
-
const test = await this._runScenario(agentCopy, runId, scenario, options?.onScenario, options?.onAction);
|
|
129
|
+
const test = await this._runScenario(resolvedAgent, runId, scenario, options?.onScenario, options?.onAction);
|
|
135
130
|
results[idx] = test;
|
|
136
131
|
}
|
|
137
132
|
catch {
|
|
138
|
-
// Scenario raised — record as a failed test
|
|
139
133
|
const failed = new TestBuilder(runId);
|
|
140
134
|
failed.start();
|
|
141
135
|
failed.complete("failed");
|
package/package.json
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ashr-labs",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "TypeScript SDK for the Ashr Labs API",
|
|
3
|
+
"version": "0.4.0",
|
|
4
|
+
"description": "TypeScript SDK for the Ashr Labs API — agent testing & evaluation",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
7
7
|
"types": "./dist/index.d.ts",
|
|
8
|
+
"bin": {
|
|
9
|
+
"ashr-labs": "./dist/cli.js"
|
|
10
|
+
},
|
|
8
11
|
"exports": {
|
|
9
12
|
".": {
|
|
10
13
|
"import": "./dist/index.js",
|
|
@@ -33,5 +36,9 @@
|
|
|
33
36
|
},
|
|
34
37
|
"engines": {
|
|
35
38
|
"node": ">=18.0.0"
|
|
39
|
+
},
|
|
40
|
+
"dependencies": {
|
|
41
|
+
"@anthropic-ai/sdk": "^0.78.0",
|
|
42
|
+
"tsx": "^4.21.0"
|
|
36
43
|
}
|
|
37
44
|
}
|