ashr-labs 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +245 -0
  2. package/dist/cli.js +139 -0
  3. package/package.json +1 -1
package/README.md ADDED
@@ -0,0 +1,245 @@
1
+ # Ashr Labs TypeScript SDK
2
+
3
+ A TypeScript client library for evaluating AI agents against Ashr Labs test datasets.
4
+
5
+ ## Documentation
6
+
7
+ - [Testing Your Agent](docs/testing-your-agent.md) — **start here**
8
+ - [Quick Start Guide](docs/quickstart.md)
9
+ - [Installation](docs/installation.md)
10
+ - [Authentication](docs/authentication.md)
11
+ - [API Reference](docs/api-reference.md)
12
+ - [Error Handling](docs/error-handling.md)
13
+ - [Examples](docs/examples.md)
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ npm install ashr-labs
19
+ ```
20
+
21
+ ## Quick Start
22
+
23
+ ```typescript
24
+ import { AshrLabsClient, EvalRunner } from "ashr-labs";
25
+
26
+ // Only need your API key — baseUrl and tenantId are automatic
27
+ const client = new AshrLabsClient("tp_your_api_key_here");
28
+
29
+ // Fetch a dataset and run your agent against it
30
+ const runner = await EvalRunner.fromDataset(client, 42);
31
+ const run = await runner.run(myAgent);
32
+
33
+ // Inspect results
34
+ const metrics = run.build().aggregate_metrics as Record<string, unknown>;
35
+ console.log(`Passed: ${metrics.tests_passed}/${metrics.total_tests}`);
36
+ console.log(`Avg similarity: ${metrics.average_similarity_score}`);
37
+
38
+ // Submit results
39
+ await run.deploy(client, 42);
40
+ ```
41
+
42
+ Your agent just needs two methods:
43
+
44
+ ```typescript
45
+ import type { Agent } from "ashr-labs";
46
+
47
+ const myAgent: Agent = {
48
+ async respond(message: string) {
49
+ // Call your LLM, return { text: "...", tool_calls: [...] }
50
+ return { text: "response", tool_calls: [] };
51
+ },
52
+
53
+ async reset() {
54
+ // Clear conversation history between scenarios
55
+ },
56
+ };
57
+ ```
58
+
59
+ See [Testing Your Agent](docs/testing-your-agent.md) for a full end-to-end guide.
60
+
61
+ ## Observability — Production Tracing
62
+
63
+ Trace your agent in production. Captures LLM calls, tool invocations, and events. **Never rejects** — if the backend is unreachable, errors are logged locally instead of being thrown.
64
+
65
+ ```typescript
66
+ // wrap() pattern — auto-end on completion, auto-capture errors
67
+ await client.trace("handle-ticket", { userId: "user_42" }).wrap(async (trace) => {
68
+ const gen = trace.generation("classify", { model: "claude-sonnet-4-6",
69
+ input: [{ role: "user", content: "help" }] });
70
+ const result = await callLlm(...);
71
+ gen.end({ output: result, usage: { input_tokens: 50, output_tokens: 12 } });
72
+
73
+ await trace.span("tool:search", { input: { q: "..." } }).wrap(async (s) => {
74
+ const data = await search(...);
75
+ s.end({ output: data });
76
+ });
77
+ });
78
+
79
+ // Analytics
80
+ const analytics = await client.getObservabilityAnalytics(7);
81
+ console.log(`Traces: ${analytics.overview.total_traces}`);
82
+ console.log(`Tool calls: ${analytics.overview.total_tool_calls}`);
83
+ ```
84
+
85
+ See [API Reference](docs/api-reference.md) for full Trace/Span/Generation docs.
86
+
87
+ ## VM Stream Logs
88
+
89
+ Attach virtual machine session logs to test results for browser-based or desktop-based agents:
90
+
91
+ ```typescript
92
+ const test = run.addTest("checkout_flow");
93
+ test.start();
94
+ // ... run agent, add tool calls and responses ...
95
+
96
+ // Kernel browser session (first-class support)
97
+ test.setKernelVm("kern_sess_abc123", {
98
+ durationMs: 15000,
99
+ logs: [
100
+ { ts: 0, type: "navigation", data: { url: "https://app.example.com" } },
101
+ { ts: 1200, type: "action", data: { action: "click", selector: "#login" } },
102
+ ],
103
+ replayId: "replay_abc123",
104
+ replayViewUrl: "https://www.kernel.sh/replays/replay_abc123",
105
+ stealth: true,
106
+ viewport: { width: 1920, height: 1080 },
107
+ });
108
+
109
+ // Or use the generic setVmStream() for any provider
110
+ test.setVmStream("browserbase", {
111
+ sessionId: "sess_abc123",
112
+ durationMs: 45000,
113
+ logs: [
114
+ { ts: 0, type: "navigation", data: { url: "https://app.example.com" } },
115
+ { ts: 1200, type: "action", data: { action: "click", selector: "#login" } },
116
+ ],
117
+ });
118
+ test.complete();
119
+ ```
120
+
121
+ ## Available Methods
122
+
123
+ All methods that accept `tenantId` auto-resolve it from your API key if omitted.
124
+
125
+ ### Datasets
126
+
127
+ | Method | Description |
128
+ |--------|-------------|
129
+ | `getDataset(datasetId, ...)` | Get a dataset by ID |
130
+ | `listDatasets(tenantId, limit, offset, ...)` | List datasets |
131
+
132
+ ### Runs
133
+
134
+ | Method | Description |
135
+ |--------|-------------|
136
+ | `createRun(datasetId, result, ...)` | Create a new test run |
137
+ | `getRun(runId)` | Get a run by ID |
138
+ | `listRuns(datasetId, tenantId, limit, offset)` | List runs |
139
+ | `deleteRun(runId)` | Delete a run |
140
+
141
+ ### EvalRunner
142
+
143
+ | Method | Description |
144
+ |--------|-------------|
145
+ | `EvalRunner.fromDataset(client, datasetId)` | Create a runner from a dataset |
146
+ | `runner.run(agent, { maxWorkers })` | Run agent against all scenarios, return `RunBuilder` |
147
+ | `runner.runAndDeploy(agent, client, datasetId, { maxWorkers })` | Run and submit in one call |
148
+
149
+ ### RunBuilder
150
+
151
+ | Method | Description |
152
+ |--------|-------------|
153
+ | `new RunBuilder()` | Create a new run builder |
154
+ | `run.start()` | Mark the run as started |
155
+ | `run.addTest(testId)` | Add a test and get a `TestBuilder` |
156
+ | `run.complete(status)` | Mark the run as completed |
157
+ | `run.build()` | Serialize to a result object |
158
+ | `run.deploy(client, datasetId)` | Build and submit via the API |
159
+
160
+ ### TestBuilder
161
+
162
+ | Method | Description |
163
+ |--------|-------------|
164
+ | `test.start()` | Mark the test as started |
165
+ | `test.addUserFile(filePath, description)` | Record a user file upload |
166
+ | `test.addUserText(text, description)` | Record a user text input |
167
+ | `test.addToolCall(expected, actual, matchStatus)` | Record an agent tool call |
168
+ | `test.addAgentResponse(expectedResponse, actualResponse, matchStatus)` | Record an agent response |
169
+ | `test.setVmStream(provider, opts)` | Attach VM session logs |
170
+ | `test.setKernelVm(sessionId, opts)` | Attach Kernel VM session (convenience) |
171
+ | `test.complete(status)` | Mark the test as completed |
172
+
173
+ ### Requests
174
+
175
+ | Method | Description |
176
+ |--------|-------------|
177
+ | `createRequest(requestName, request, ...)` | Create a new request |
178
+ | `getRequest(requestId)` | Get a request by ID |
179
+ | `listRequests(tenantId, status, limit, offset)` | List requests |
180
+
181
+ ### Observability
182
+
183
+ | Method | Description |
184
+ |--------|-------------|
185
+ | `client.trace(name, opts?)` | Start a production trace (returns `Trace`) |
186
+ | `trace.span(name, opts?)` / `trace.generation(name, opts?)` | Add spans or LLM calls |
187
+ | `trace.wrap(fn)` / `span.wrap(fn)` | Auto-end on completion, auto-capture errors |
188
+ | `await trace.end(opts?)` | Flush trace to backend (**never rejects**) |
189
+ | `listObservabilityTraces(opts?)` | List traces |
190
+ | `getObservabilityTrace(traceId)` | Get trace with full observation tree |
191
+ | `getObservabilityAnalytics(days?)` | Analytics: tokens, latency, errors, tool perf |
192
+ | `getObservabilityErrors(opts?)` | Traces with errors |
193
+ | `getObservabilityToolErrors(opts?)` | Traces with tool failures |
194
+
195
+ ### API Keys & Session
196
+
197
+ | Method | Description |
198
+ |--------|-------------|
199
+ | `init()` | Validate credentials and get user/tenant info |
200
+ | `listApiKeys(includeInactive)` | List API keys for your tenant |
201
+ | `revokeApiKey(apiKeyId)` | Revoke an API key |
202
+ | `healthCheck()` | Check if the API is reachable |
203
+
204
+ ## Error Handling
205
+
206
+ ```typescript
207
+ import { AshrLabsClient, NotFoundError, AuthenticationError } from "ashr-labs";
208
+
209
+ const client = new AshrLabsClient("tp_...");
210
+
211
+ try {
212
+ const dataset = await client.getDataset(999);
213
+ } catch (e) {
214
+ if (e instanceof AuthenticationError) {
215
+ console.log("Invalid API key");
216
+ } else if (e instanceof NotFoundError) {
217
+ console.log("Dataset not found");
218
+ }
219
+ }
220
+ ```
221
+
222
+ ## Configuration
223
+
224
+ ```typescript
225
+ // All defaults — just pass API key
226
+ const client = new AshrLabsClient("tp_...");
227
+
228
+ // From environment (reads ASHR_LABS_API_KEY)
229
+ const client = AshrLabsClient.fromEnv();
230
+
231
+ // Custom timeout
232
+ const client = new AshrLabsClient("tp_...", undefined, 60);
233
+
234
+ // Custom base URL (for self-hosted)
235
+ const client = new AshrLabsClient("tp_...", "https://your-api.example.com");
236
+ ```
237
+
238
+ ## Requirements
239
+
240
+ - Node.js 18+
241
+ - TypeScript 5.4+ (recommended)
242
+
243
+ ## License
244
+
245
+ MIT
package/dist/cli.js CHANGED
@@ -148,6 +148,140 @@ Continuously improve "${config.agentName}" by running evaluations and fixing iss
148
148
  - Deploy the final passing run with \`results.deploy(client, datasetId)\`.
149
149
  `;
150
150
  }
151
/**
 * Build the Codex/Cursor "test-agent" SKILL.md contents for this project.
 *
 * Fix: the original computed `isTs`/`sdkPkg` but then emitted Python-only
 * instructions (`_ashr_eval.py`, `os.environ[...]`, snake_case SDK calls)
 * even when `config.lang === "typescript"`, telling TypeScript users to
 * write a Python script importing the npm package. The generated text is
 * now language-aware; the Python-path output is unchanged.
 *
 * @param {{agentName: string, lang: string, entrypoint: string, apiKeyEnvVar: string}} config
 * @returns {string} SKILL.md contents (YAML front matter + markdown body)
 */
function generateCodexTestAgentSkill(config) {
  const isTs = config.lang === "typescript";
  const sdkPkg = isTs ? "ashr-labs" : "ashr_labs";
  // Language-specific surface (script name, env access, SDK method casing)
  // interpolated into the instructions below. TS names follow the README
  // (fromDataset / getRun / maxWorkers); Python names match the original text.
  const api = isTs
    ? {
        script: "_ashr_eval.ts",
        scriptNoun: "TypeScript",
        stdImports: "Needs no other imports — uses built-in timers for polling",
        env: `process.env.${config.apiKeyEnvVar}`,
        listDatasets: "client.listDatasets()",
        generateDataset: "client.generateDataset()",
        fromDataset: "EvalRunner.fromDataset(client, ID)",
        workers: "{ maxWorkers: 1 }",
        seqNote: "(sequential — avoids shared-state issues)",
        deploy: "run.deploy(client, ID)",
        getRun: "client.getRun(runId)",
        notDone: "is not null/undefined",
      }
    : {
        script: "_ashr_eval.py",
        scriptNoun: "Python",
        stdImports: "Imports `time`, `json`, `os`",
        env: `os.environ["${config.apiKeyEnvVar}"]`,
        listDatasets: "client.list_datasets()",
        generateDataset: "client.generate_dataset()",
        fromDataset: "EvalRunner.from_dataset(client, dataset_id=ID)",
        workers: "max_workers=1",
        seqNote: "(sequential — avoids deepcopy issues)",
        deploy: "run.deploy(client, dataset_id=ID)",
        getRun: "client.get_run(run_id)",
        notDone: "is not None",
      };
  return `---
name: test-agent
description: Run the Ashr Labs eval suite against the agent and report results. Use when the user wants to test, evaluate, or benchmark agent behavior.
---

Run an automated evaluation of the "${config.agentName}" agent using the Ashr Labs SDK.
Do this FULLY AUTONOMOUSLY — do not ask the user for input at any step.

## Dataset strategy

**Reuse existing datasets whenever possible.** Only generate a new dataset when:
- The agent's tools have changed (added, removed, or modified)
- The agent's domain or accepted inputs have changed
- You explicitly need fresh/different test scenarios

To find existing datasets, call \`${api.listDatasets}\` and pick the most recent
one for this agent. Store the dataset ID in \`.ashr.json\` under \`eval.datasetId\`
for future runs.

## Steps

1. Read \`.ashr.json\` for project configuration.
2. Read the agent code at \`${config.entrypoint}\` to understand current behavior.
3. Check if \`.ashr.json\` has \`eval.datasetId\` — if so, reuse that dataset.
   If not, call \`${api.listDatasets}\` to find an existing one for this agent.
   Only call \`${api.generateDataset}\` if no suitable dataset exists.
4. Write a SINGLE self-contained eval script (\`${api.script}\`) that does EVERYTHING
   in one process — no separate scripts, no separate polling commands:
   - Imports \`AshrLabsClient\`, \`EvalRunner\` from \`${sdkPkg}\`
   - ${api.stdImports}
   - Creates a client using \`${api.env}\`
   - Uses \`${api.fromDataset}\` to load the dataset
   - Implements a lightweight Agent wrapper around the actual agent code in \`${config.entrypoint}\`
   - Runs \`runner.run(agent)\` with \`${api.workers}\` ${api.seqNote}
   - Deploys results with \`${api.deploy}\`
   - **Polls for grading inline**: loops calling \`${api.getRun}\` every 20 seconds
     until \`aggregate_metrics.tests_passed\` ${api.notDone} (max 5 minutes)
   - Prints final graded results: passed, failed, per-test status, tool mismatches
5. Run the eval script with a long timeout (e.g. 10 minutes).
6. If a new dataset was generated, update \`.ashr.json\` with the new \`eval.datasetId\`.
7. Delete \`${api.script}\`.

## CRITICAL: Single command execution

The eval script MUST be a single file that runs as a single command. Do NOT write
separate scripts for polling or analysis. Do NOT use \`sleep\` in bash. Everything
happens inside the ${api.scriptNoun} script so it only requires ONE permission grant to run.

## Important

- Use the \`${config.apiKeyEnvVar}\` env var for the API key — never hardcode it.
- All grading is done server-side — do not compute scores locally.
- Do NOT ask the user for permission or input. Run everything automatically.
`;
}
209
/**
 * Build the Codex/Cursor "improve-agent" SKILL.md contents for this project.
 *
 * Fix: mirrors the fix in the test-agent generator — the original template
 * hardcoded Python-only instructions (`_ashr_eval.py`, snake_case SDK calls)
 * regardless of `config.lang`, which broke the generated skill for
 * TypeScript projects. The text is now language-aware; Python output is
 * unchanged.
 *
 * @param {{agentName: string, lang: string, entrypoint: string, apiKeyEnvVar: string}} config
 * @returns {string} SKILL.md contents (YAML front matter + markdown body)
 */
function generateCodexImproveAgentSkill(config) {
  const isTs = config.lang === "typescript";
  const sdkPkg = isTs ? "ashr-labs" : "ashr_labs";
  // Language-specific names interpolated into the instructions below.
  // TS names follow the README (fromDataset / getRun / maxWorkers).
  const api = isTs
    ? {
        script: "_ashr_eval.ts",
        scriptNoun: "TypeScript",
        fromDataset: "EvalRunner.fromDataset()",
        workers: "{ maxWorkers: 1 }",
        deploy: "run.deploy(client, ID)",
        getRun: "client.getRun(runId)",
        notDone: "is not null/undefined",
      }
    : {
        script: "_ashr_eval.py",
        scriptNoun: "Python",
        fromDataset: "EvalRunner.from_dataset()",
        workers: "max_workers=1",
        deploy: "run.deploy(client, dataset_id=ID)",
        getRun: "client.get_run(run_id)",
        notDone: "is not None",
      };
  return `---
name: improve-agent
description: Automatically run evals, analyze failures, fix the agent, and re-test until passing. Use when the user wants to improve agent quality or fix failing tests.
---

Continuously improve "${config.agentName}" by running evaluations, analyzing failures, and
applying fixes — ALL AUTOMATICALLY without asking for user input.

## Overview

This is an autonomous improvement loop. You will:
1. Run eval + wait for grading (single script, single command)
2. Analyze failures from the output
3. Fix the agent code
4. Re-run eval (same single script, single command)
5. Repeat until target pass rate is met

Do NOT ask the user for permission between iterations. Just run the loop.

## CRITICAL: Single-script execution

ALL eval + grading + analysis MUST happen in ONE ${api.scriptNoun} script (\`${api.script}\`) run as
ONE command. The script must:

1. Import the agent from \`${config.entrypoint}\` and \`AshrLabsClient\`, \`EvalRunner\` from \`${sdkPkg}\`
2. Load dataset from \`.ashr.json\` \`eval.datasetId\` using \`${api.fromDataset}\`
3. Run \`runner.run(agent)\` with \`${api.workers}\`
4. Deploy with \`${api.deploy}\`
5. Poll \`${api.getRun}\` every 20s until \`aggregate_metrics.tests_passed\` ${api.notDone} (max 5min)
6. Print graded results: pass/fail per test, tool mismatches with expected vs actual args
7. For each FAILED test, also fetch and print the dataset scenario actions so the failure
   context is visible (user messages, expected tool calls at each action index)

## Iteration loop

### After each eval run, read the output and analyze failures:

Common failure patterns:
- **Tool called at wrong step**: Agent calls the right tool but too early/late.
  Fix: adjust system prompt to be more/less eager about that tool.
- **Tool not called**: Agent didn't call an expected tool.
  Fix: strengthen prompt guidance about when to use that tool.
- **Extra unexpected tool call**: Agent called a tool it shouldn't have.
  Fix: add prompt constraints about when NOT to call that tool.
- **Wrong tool arguments**: Agent called the right tool with wrong args.
  Fix: improve tool descriptions or add examples in the system prompt.
- **Tool retry timing**: Agent auto-retries a failed tool instead of returning control
  (or vice versa). Fix: adjust the tool execution loop in the agent code.

### Apply fixes to \`${config.entrypoint}\`:
- **System prompt changes** fix most failures — adjust instructions about tool timing
- **Tool loop logic** changes fix retry/timing issues
- Make the SMALLEST change that addresses each failure. Do not refactor unrelated code.

### Re-run eval:
After editing \`${config.entrypoint}\`, run the SAME \`${api.script}\` script again (it imports
the agent fresh each time). Same single command, same timeout.

### Stop conditions:
- All tests pass or pass rate >= 80%: Stop. Print summary of changes + before/after metrics.
- 5 iterations reached: Stop. Summarize what's still failing.
- No improvement after 2 consecutive iterations: Stop. Explain blockers.

Clean up \`${api.script}\` when done.

## Rules

- Do NOT ask the user for input at any step. Run everything automatically.
- Do NOT write separate scripts for polling or analysis — everything in ONE script.
- Use \`${config.apiKeyEnvVar}\` env var for the API key.
- All grading is server-side. Never grade or score locally.
`;
}
151
285
  function generateHookSettings(config) {
152
286
  return {
153
287
  hooks: {
@@ -387,6 +521,11 @@ async function main() {
387
521
  writeFile(".claude/commands/test-agent.md", generateTestAgentCommand(config));
388
522
  writeFile(".claude/commands/improve-agent.md", generateImproveAgentCommand(config));
389
523
  mergeJsonFile(".claude/settings.json", generateHookSettings(config));
524
+ // Codex + Cursor skills (both scan .agents/skills/; Cursor also scans .cursor/skills/)
525
+ writeFile(".agents/skills/test-agent/SKILL.md", generateCodexTestAgentSkill(config));
526
+ writeFile(".agents/skills/improve-agent/SKILL.md", generateCodexImproveAgentSkill(config));
527
+ writeFile(".cursor/skills/test-agent/SKILL.md", generateCodexTestAgentSkill(config));
528
+ writeFile(".cursor/skills/improve-agent/SKILL.md", generateCodexImproveAgentSkill(config));
390
529
  // Done
391
530
  print(`\n${GREEN} Done.${RESET} Open Claude Code and type ${BOLD}/test-agent${RESET}\n`);
392
531
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ashr-labs",
3
- "version": "0.4.0",
3
+ "version": "0.4.2",
4
4
  "description": "TypeScript SDK for the Ashr Labs API — agent testing & evaluation",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",