ashr-labs 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/client.d.ts CHANGED
@@ -18,7 +18,116 @@ export declare class AshrLabsClient {
18
18
  deleteRun(runId: number): Promise<Record<string, unknown>>;
19
19
  getRun(runId: number): Promise<Record<string, unknown>>;
20
20
  listRuns(datasetId?: number | null, tenantId?: number | null, limit?: number): Promise<Record<string, unknown>>;
21
+ /**
22
+ * Start a new observability trace for a production agent interaction.
23
+ *
24
+ * Returns a `Trace` object. Add spans, generations, and events to it,
25
+ * then call `await trace.end()` to flush to the Ashr Labs backend.
26
+ *
27
+ * Requires the `observability` feature flag to be enabled.
28
+ *
29
+ * @example
30
+ * ```typescript
31
+ * const trace = client.trace("support-chat", { userId: "user_42" });
32
+ * const gen = trace.generation("respond", { model: "claude-sonnet-4-6", input: messages });
33
+ * gen.end({ output: reply, usage: { input_tokens: 100, output_tokens: 50 } });
34
+ * await trace.end({ output: { resolution: "resolved" } });
35
+ * ```
36
+ */
37
+ trace(name: string, opts?: {
38
+ userId?: string;
39
+ sessionId?: string;
40
+ metadata?: Record<string, unknown>;
41
+ tags?: string[];
42
+ }): import("./tracing.js").Trace;
43
+ /**
44
+ * List observability traces for the current tenant.
45
+ *
46
+ * Requires the `observability` feature flag to be enabled.
47
+ */
48
+ listObservabilityTraces(opts?: {
49
+ userId?: string;
50
+ sessionId?: string;
51
+ limit?: number;
52
+ page?: number;
53
+ }): Promise<Record<string, unknown>>;
54
+ /**
55
+ * Get a single observability trace with full span/generation detail.
56
+ *
57
+ * Requires the `observability` feature flag to be enabled.
58
+ */
59
+ getObservabilityTrace(traceId: string): Promise<Record<string, unknown>>;
60
+ /** Get observability analytics: overview, tool performance, model usage. */
61
+ getObservabilityAnalytics(days?: number): Promise<Record<string, unknown>>;
62
+ /** Get traces with errors. */
63
+ getObservabilityErrors(opts?: {
64
+ days?: number;
65
+ limit?: number;
66
+ page?: number;
67
+ }): Promise<Record<string, unknown>>;
68
+ /** Get traces with tool call failures. */
69
+ getObservabilityToolErrors(opts?: {
70
+ days?: number;
71
+ limit?: number;
72
+ page?: number;
73
+ }): Promise<Record<string, unknown>>;
21
74
  private static _validateConfigStructure;
75
+ /**
76
+ * Create a new dataset generation request.
77
+ *
78
+ * The `request` object describes your agent and what test scenarios to generate.
79
+ *
80
+ * **Required sections:**
81
+ * - `agent` — At least one of `name`, `description`, or `system_prompt`.
82
+ * Include `tools` here (with `name`, `description`, `parameters`) so the
83
+ * generator creates scenarios with tool call expectations.
84
+ * Include `accepted_inputs` specifying which input types your agent supports.
85
+ * Allowed input keys: `text`, `audio`, `file`, `image`, `video`, `conversation`.
86
+ * - `context` — At least one of `domain`, `use_case`, or `scenario_context`.
87
+ * - `generation_options` — Controls what assets to generate.
88
+ * Keys: `scenario_count`, `generate_audio`, `generate_files`, `generate_images`,
89
+ * `generate_videos`, `generate_simulations`.
90
+ *
91
+ * **Optional sections:** `test_config`, `metadata`
92
+ *
93
+ * @example
94
+ * ```typescript
95
+ * const req = await client.createRequest("Loan Agent Eval", {
96
+ * agent: {
97
+ * name: "QuickLend Loan Officer",
98
+ * description: "Helps applicants check credit and submit applications",
99
+ * system_prompt: "You are a professional loan officer.",
100
+ * tools: [
101
+ * {
102
+ * name: "check_credit_score",
103
+ * description: "Pull applicant credit score and history",
104
+ * parameters: {
105
+ * type: "object",
106
+ * required: ["applicant_id"],
107
+ * properties: { applicant_id: { type: "string" } },
108
+ * },
109
+ * },
110
+ * ],
111
+ * accepted_inputs: { text: true, audio: false, file: false, image: false, video: false },
112
+ * },
113
+ * context: {
114
+ * domain: "financial services",
115
+ * use_case: "Applicants inquiring about loan eligibility and rates",
116
+ * scenario_context: "A digital lending platform called QuickLend Financial",
117
+ * },
118
+ * generation_options: {
119
+ * scenario_count: 5,
120
+ * generate_audio: false,
121
+ * generate_files: false,
122
+ * generate_simulations: false,
123
+ * },
124
+ * test_config: {
125
+ * num_variations: 5,
126
+ * coverage: { happy_path: true, edge_cases: true, error_handling: true },
127
+ * },
128
+ * });
129
+ * ```
130
+ */
22
131
  createRequest(requestName: string, request: Record<string, unknown>, requestInputSchema?: Record<string, unknown> | null, tenantId?: number | null, requestorId?: number | null): Promise<Record<string, unknown>>;
23
132
  getRequest(requestId: number): Promise<Record<string, unknown>>;
24
133
  listRequests(tenantId?: number | null, status?: string | null, limit?: number, cursor?: number | null): Promise<Record<string, unknown>>;
@@ -33,6 +142,20 @@ export declare class AshrLabsClient {
33
142
  * from agent name/description.
34
143
  */
35
144
  private static _enrichConfig;
145
+ /**
146
+ * Generate a dataset: creates a request, waits for completion, and returns the dataset.
147
+ *
148
+ * **Prefer reusing existing datasets** with `EvalRunner.fromDataset()` instead of
149
+ * generating new ones each time. Only generate a new dataset when the agent's tools,
150
+ * inputs, or domain have changed.
151
+ *
152
+ * @param requestName - A name/title for the request.
153
+ * @param config - Generation config (same structure as `createRequest`).
154
+ * @param requestInputSchema - Optional JSON Schema. If omitted, tools are auto-populated from `config.agent.tools`.
155
+ * @param timeout - Max seconds to wait for generation (default 600).
156
+ * @param pollInterval - Seconds between status checks (default 5).
157
+ * @returns `[datasetId, datasetSource]` tuple.
158
+ */
36
159
  generateDataset(requestName: string, config: Record<string, unknown>, requestInputSchema?: Record<string, unknown> | null, timeout?: number, pollInterval?: number): Promise<[number, Record<string, unknown>]>;
37
160
  toString(): string;
38
161
  }
package/dist/client.js CHANGED
@@ -156,6 +156,76 @@ export class AshrLabsClient {
156
156
  return this._makeRequest("list_runs", params);
157
157
  }
158
158
  // =========================================================================
159
+ // Observability — Production Agent Tracing
160
+ // =========================================================================
161
+ /**
162
+ * Start a new observability trace for a production agent interaction.
163
+ *
164
+ * Returns a `Trace` object. Add spans, generations, and events to it,
165
+ * then call `await trace.end()` to flush to the Ashr Labs backend.
166
+ *
167
+ * Requires the `observability` feature flag to be enabled.
168
+ *
169
+ * @example
170
+ * ```typescript
171
+ * const trace = client.trace("support-chat", { userId: "user_42" });
172
+ * const gen = trace.generation("respond", { model: "claude-sonnet-4-6", input: messages });
173
+ * gen.end({ output: reply, usage: { input_tokens: 100, output_tokens: 50 } });
174
+ * await trace.end({ output: { resolution: "resolved" } });
175
+ * ```
176
+ */
177
+ trace(name, opts = {}) {
178
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
179
+ const { Trace } = require("./tracing.js");
180
+ return new Trace(this, name, opts);
181
+ }
182
+ /**
183
+ * List observability traces for the current tenant.
184
+ *
185
+ * Requires the `observability` feature flag to be enabled.
186
+ */
187
+ async listObservabilityTraces(opts = {}) {
188
+ const params = {
189
+ limit: opts.limit ?? 50,
190
+ page: opts.page ?? 1,
191
+ };
192
+ if (opts.userId != null)
193
+ params.user_id = opts.userId;
194
+ if (opts.sessionId != null)
195
+ params.session_id = opts.sessionId;
196
+ return this._makeRequest("list_observability_traces", params);
197
+ }
198
+ /**
199
+ * Get a single observability trace with full span/generation detail.
200
+ *
201
+ * Requires the `observability` feature flag to be enabled.
202
+ */
203
+ async getObservabilityTrace(traceId) {
204
+ return this._makeRequest("get_observability_trace", {
205
+ trace_id: traceId,
206
+ });
207
+ }
208
+ /** Get observability analytics: overview, tool performance, model usage. */
209
+ async getObservabilityAnalytics(days = 7) {
210
+ return this._makeRequest("get_observability_analytics", { days });
211
+ }
212
+ /** Get traces with errors. */
213
+ async getObservabilityErrors(opts = {}) {
214
+ return this._makeRequest("get_observability_errors", {
215
+ days: opts.days ?? 7,
216
+ limit: opts.limit ?? 50,
217
+ page: opts.page ?? 1,
218
+ });
219
+ }
220
+ /** Get traces with tool call failures. */
221
+ async getObservabilityToolErrors(opts = {}) {
222
+ return this._makeRequest("get_observability_tool_errors", {
223
+ days: opts.days ?? 7,
224
+ limit: opts.limit ?? 50,
225
+ page: opts.page ?? 1,
226
+ });
227
+ }
228
+ // =========================================================================
159
229
  // Request Operations
160
230
  // =========================================================================
161
231
  static _validateConfigStructure(config) {
@@ -185,6 +255,62 @@ export class AshrLabsClient {
185
255
  throw new ValidationError("config.context must include at least one of: domain, use_case, scenario_context");
186
256
  }
187
257
  }
258
+ /**
259
+ * Create a new dataset generation request.
260
+ *
261
+ * The `request` object describes your agent and what test scenarios to generate.
262
+ *
263
+ * **Required sections:**
264
+ * - `agent` — At least one of `name`, `description`, or `system_prompt`.
265
+ * Include `tools` here (with `name`, `description`, `parameters`) so the
266
+ * generator creates scenarios with tool call expectations.
267
+ * Include `accepted_inputs` specifying which input types your agent supports.
268
+ * Allowed input keys: `text`, `audio`, `file`, `image`, `video`, `conversation`.
269
+ * - `context` — At least one of `domain`, `use_case`, or `scenario_context`.
270
+ * - `generation_options` — Controls what assets to generate.
271
+ * Keys: `scenario_count`, `generate_audio`, `generate_files`, `generate_images`,
272
+ * `generate_videos`, `generate_simulations`.
273
+ *
274
+ * **Optional sections:** `test_config`, `metadata`
275
+ *
276
+ * @example
277
+ * ```typescript
278
+ * const req = await client.createRequest("Loan Agent Eval", {
279
+ * agent: {
280
+ * name: "QuickLend Loan Officer",
281
+ * description: "Helps applicants check credit and submit applications",
282
+ * system_prompt: "You are a professional loan officer.",
283
+ * tools: [
284
+ * {
285
+ * name: "check_credit_score",
286
+ * description: "Pull applicant credit score and history",
287
+ * parameters: {
288
+ * type: "object",
289
+ * required: ["applicant_id"],
290
+ * properties: { applicant_id: { type: "string" } },
291
+ * },
292
+ * },
293
+ * ],
294
+ * accepted_inputs: { text: true, audio: false, file: false, image: false, video: false },
295
+ * },
296
+ * context: {
297
+ * domain: "financial services",
298
+ * use_case: "Applicants inquiring about loan eligibility and rates",
299
+ * scenario_context: "A digital lending platform called QuickLend Financial",
300
+ * },
301
+ * generation_options: {
302
+ * scenario_count: 5,
303
+ * generate_audio: false,
304
+ * generate_files: false,
305
+ * generate_simulations: false,
306
+ * },
307
+ * test_config: {
308
+ * num_variations: 5,
309
+ * coverage: { happy_path: true, edge_cases: true, error_handling: true },
310
+ * },
311
+ * });
312
+ * ```
313
+ */
188
314
  async createRequest(requestName, request, requestInputSchema, tenantId, requestorId) {
189
315
  AshrLabsClient._validateConfigStructure(request);
190
316
  if (requestInputSchema == null) {
@@ -305,6 +431,20 @@ export class AshrLabsClient {
305
431
  }
306
432
  return out;
307
433
  }
434
+ /**
435
+ * Generate a dataset: creates a request, waits for completion, and returns the dataset.
436
+ *
437
+ * **Prefer reusing existing datasets** with `EvalRunner.fromDataset()` instead of
438
+ * generating new ones each time. Only generate a new dataset when the agent's tools,
439
+ * inputs, or domain have changed.
440
+ *
441
+ * @param requestName - A name/title for the request.
442
+ * @param config - Generation config (same structure as `createRequest`).
443
+ * @param requestInputSchema - Optional JSON Schema. If omitted, tools are auto-populated from `config.agent.tools`.
444
+ * @param timeout - Max seconds to wait for generation (default 600).
445
+ * @param pollInterval - Seconds between status checks (default 5).
446
+ * @returns `[datasetId, datasetSource]` tuple.
447
+ */
308
448
  async generateDataset(requestName, config, requestInputSchema, timeout = 600, pollInterval = 5) {
309
449
  const enriched = AshrLabsClient._enrichConfig(config);
310
450
  const req = await this.createRequest(requestName, enriched, requestInputSchema);
package/dist/index.d.ts CHANGED
@@ -1,6 +1,9 @@
1
1
  export { AshrLabsClient } from "./client.js";
2
2
  export { AshrLabsError, AuthenticationError, AuthorizationError, NotFoundError, ValidationError, RateLimitError, ServerError, } from "./exceptions.js";
3
- export type { User, Tenant, Session, Dataset, Run, Request, APIKey, ToolCall, ExpectedResponse, Action, Scenario, } from "./models.js";
3
+ export type { User, Tenant, Session, Dataset, Run, Request, APIKey, ToolCall, ExpectedResponse, Action, Scenario, ObservabilityObservation, ObservabilityTrace, VmLogEntry, VmStream, KernelViewport, KernelActionData, KernelEventData, KernelVmMetadata, KernelVmStream, } from "./models.js";
4
+ export { KERNEL_ACTION_TYPES, KERNEL_EVENT_TYPES, } from "./models.js";
5
+ export type { KernelActionType, KernelEventType, } from "./models.js";
6
+ export { Trace, Span, Generation } from "./tracing.js";
4
7
  export { RunBuilder, TestBuilder } from "./run-builder.js";
5
8
  export { stripMarkdown, tokenize, fuzzyStrMatch, extractToolArgs, compareToolArgs, textSimilarity, } from "./comparators.js";
6
9
  export { EvalRunner } from "./eval.js";
package/dist/index.js CHANGED
@@ -1,5 +1,7 @@
1
1
  export { AshrLabsClient } from "./client.js";
2
2
  export { AshrLabsError, AuthenticationError, AuthorizationError, NotFoundError, ValidationError, RateLimitError, ServerError, } from "./exceptions.js";
3
+ export { KERNEL_ACTION_TYPES, KERNEL_EVENT_TYPES, } from "./models.js";
4
+ export { Trace, Span, Generation } from "./tracing.js";
3
5
  export { RunBuilder, TestBuilder } from "./run-builder.js";
4
6
  export { stripMarkdown, tokenize, fuzzyStrMatch, extractToolArgs, compareToolArgs, textSimilarity, } from "./comparators.js";
5
7
  export { EvalRunner } from "./eval.js";
package/dist/models.d.ts CHANGED
@@ -47,6 +47,35 @@ export interface Run {
47
47
  runner?: number;
48
48
  result?: Record<string, unknown>;
49
49
  }
50
+ export interface ObservabilityObservation {
51
+ id?: string;
52
+ name?: string;
53
+ type?: string;
54
+ parent_observation_id?: string | null;
55
+ input?: unknown | null;
56
+ output?: unknown | null;
57
+ metadata?: Record<string, unknown> | null;
58
+ model?: string | null;
59
+ usage?: {
60
+ input_tokens?: number;
61
+ output_tokens?: number;
62
+ } | null;
63
+ level?: "DEBUG" | "DEFAULT" | "WARNING" | "ERROR" | null;
64
+ status_message?: string | null;
65
+ start_time?: string | null;
66
+ end_time?: string | null;
67
+ }
68
+ export interface ObservabilityTrace {
69
+ id?: string;
70
+ name?: string;
71
+ user_id?: string | null;
72
+ session_id?: string | null;
73
+ metadata?: Record<string, unknown> | null;
74
+ tags?: string[];
75
+ created_at?: string | null;
76
+ output?: unknown | null;
77
+ observations?: ObservabilityObservation[];
78
+ }
50
79
  export interface Request {
51
80
  id?: number;
52
81
  created_at?: string;
@@ -73,6 +102,76 @@ export interface RequestsListResponse extends ListResponse {
73
102
  export interface APIKeysListResponse extends ListResponse {
74
103
  api_keys: APIKey[];
75
104
  }
105
+ export interface VmLogEntry {
106
+ ts?: number;
107
+ type?: string;
108
+ data?: Record<string, unknown>;
109
+ }
110
+ export interface KernelViewport {
111
+ width?: number;
112
+ height?: number;
113
+ }
114
+ export declare const KERNEL_ACTION_TYPES: readonly ["click_mouse", "move_mouse", "drag_mouse", "type_text", "press_key", "scroll", "screenshot"];
115
+ export type KernelActionType = (typeof KERNEL_ACTION_TYPES)[number];
116
+ export declare const KERNEL_EVENT_TYPES: readonly ["navigation", "log", "error", "invocation_state", "console", "network"];
117
+ export type KernelEventType = (typeof KERNEL_EVENT_TYPES)[number];
118
+ /** Data payload for a Kernel computer control action. */
119
+ export interface KernelActionData {
120
+ x?: number;
121
+ y?: number;
122
+ button?: string;
123
+ click_type?: string;
124
+ num_clicks?: number;
125
+ smooth?: boolean;
126
+ path?: number[][];
127
+ text?: string;
128
+ delay?: number;
129
+ keys?: string[];
130
+ duration?: number;
131
+ hold_keys?: string[];
132
+ delta_x?: number;
133
+ delta_y?: number;
134
+ format?: string;
135
+ s3_key?: string;
136
+ duration_ms?: number;
137
+ }
138
+ /** Data payload for a Kernel event. */
139
+ export interface KernelEventData {
140
+ url?: string;
141
+ message?: string;
142
+ level?: string;
143
+ code?: string;
144
+ details?: Record<string, unknown>[];
145
+ method?: string;
146
+ status?: number;
147
+ invocation_id?: string;
148
+ action_name?: string;
149
+ status_reason?: string;
150
+ output?: string;
151
+ }
152
+ export interface KernelVmMetadata {
153
+ live_view_url?: string;
154
+ cdp_ws_url?: string;
155
+ replay_id?: string;
156
+ replay_view_url?: string;
157
+ headless?: boolean;
158
+ stealth?: boolean;
159
+ viewport?: KernelViewport;
160
+ }
161
+ export interface KernelVmStream {
162
+ provider: "kernel";
163
+ session_id?: string;
164
+ duration_ms?: number;
165
+ logs?: VmLogEntry[];
166
+ metadata?: KernelVmMetadata;
167
+ }
168
+ export interface VmStream {
169
+ provider: string;
170
+ session_id?: string;
171
+ duration_ms?: number;
172
+ logs?: VmLogEntry[];
173
+ metadata?: Record<string, unknown>;
174
+ }
76
175
  export interface ToolCall {
77
176
  name?: string;
78
177
  arguments_json?: string;
package/dist/models.js CHANGED
@@ -1 +1,20 @@
1
- export {};
1
+ // ---- Kernel action types (computer control API) ----
2
+ // Map to POST /browsers/{id}/computer/* endpoints.
3
+ export const KERNEL_ACTION_TYPES = [
4
+ "click_mouse", // {x, y, button?, click_type?, num_clicks?}
5
+ "move_mouse", // {x, y, duration_ms?, smooth?}
6
+ "drag_mouse", // {path: [[x,y],...], button?, smooth?, duration_ms?}
7
+ "type_text", // {text, delay?}
8
+ "press_key", // {keys: string[], duration?, hold_keys?}
9
+ "scroll", // {x, y, delta_x?, delta_y?}
10
+ "screenshot", // {format?} — result may include s3_key or base64
11
+ ];
12
+ // ---- Kernel event types (SSE streams + navigation) ----
13
+ export const KERNEL_EVENT_TYPES = [
14
+ "navigation", // {url} — page navigation
15
+ "log", // {message} — from GET /browsers/{id}/logs SSE
16
+ "error", // {code, message, details?} — ErrorEvent
17
+ "invocation_state", // {invocation_id, status, action_name, output?}
18
+ "console", // {level, message} — browser console output
19
+ "network", // {method, url, status} — HTTP request observed
20
+ ];
@@ -6,12 +6,43 @@ export declare class TestBuilder {
6
6
  private _completedAt;
7
7
  private _actionResults;
8
8
  private _nextActionIndex;
9
+ private _vmStream;
9
10
  constructor(testId: string);
11
+ /** The test ID (matches the scenario ID from the dataset). */
12
+ get test_id(): string;
10
13
  start(): this;
11
14
  addUserFile(filePath: string, description: string, actionIndex?: number): this;
12
15
  addUserText(text: string, description: string, actionIndex?: number): this;
13
16
  addToolCall(expected: Record<string, unknown>, actual: Record<string, unknown>, matchStatus: string, divergenceNotes?: string | null, actionIndex?: number): this;
14
17
  addAgentResponse(expectedResponse: Record<string, unknown>, actualResponse: Record<string, unknown>, matchStatus: string, semanticSimilarity?: number | null, divergenceNotes?: string | null, actionIndex?: number): this;
18
+ /**
19
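+ * Attach a VM session stream (provider plus optional session ID, duration, logs, and metadata) to this test, replacing any stream set earlier.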
20
+ */
21
+ setVmStream(provider: string, opts?: {
22
+ sessionId?: string;
23
+ durationMs?: number;
24
+ logs?: Record<string, unknown>[];
25
+ metadata?: Record<string, unknown>;
26
+ }): this;
27
+ /**
28
+ * Attach a Kernel browser session to this test.
29
+ * Metadata fields map to Kernel's browser API response
30
+ * (see https://www.kernel.sh/docs).
31
+ */
32
+ setKernelVm(sessionId: string, opts?: {
33
+ durationMs?: number;
34
+ logs?: Record<string, unknown>[];
35
+ liveViewUrl?: string;
36
+ cdpWsUrl?: string;
37
+ replayId?: string;
38
+ replayViewUrl?: string;
39
+ headless?: boolean;
40
+ stealth?: boolean;
41
+ viewport?: {
42
+ width: number;
43
+ height: number;
44
+ };
45
+ }): this;
15
46
  complete(status?: string): this;
16
47
  build(): Record<string, unknown>;
17
48
  private _resolveIndex;
@@ -22,6 +53,8 @@ export declare class RunBuilder {
22
53
  private _completedAt;
23
54
  /** @internal */
24
55
  _tests: TestBuilder[];
56
+ /** The list of tests in this run. Use this to attach VM streams after eval. */
57
+ get tests(): TestBuilder[];
25
58
  start(): this;
26
59
  addTest(testId: string): TestBuilder;
27
60
  complete(status?: string): this;
@@ -85,9 +85,14 @@ export class TestBuilder {
85
85
  _completedAt = null;
86
86
  _actionResults = [];
87
87
  _nextActionIndex = 0;
88
+ _vmStream = null;
88
89
  constructor(testId) {
89
90
  this._testId = testId;
90
91
  }
92
+ /** The test ID (matches the scenario ID from the dataset). */
93
+ get test_id() {
94
+ return this._testId;
95
+ }
91
96
  start() {
92
97
  this._status = "running";
93
98
  this._startedAt = now();
@@ -156,6 +161,50 @@ export class TestBuilder {
156
161
  this._actionResults.push(result);
157
162
  return this;
158
163
  }
164
+ /**
165
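+ * Attach a VM session stream (provider plus optional session ID, duration, logs, and metadata) to this test, replacing any stream set earlier.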
166
+ */
167
+ setVmStream(provider, opts) {
168
+ const vm = { provider };
169
+ if (opts?.sessionId != null)
170
+ vm.session_id = opts.sessionId;
171
+ if (opts?.durationMs != null)
172
+ vm.duration_ms = opts.durationMs;
173
+ if (opts?.logs != null)
174
+ vm.logs = opts.logs;
175
+ if (opts?.metadata != null)
176
+ vm.metadata = opts.metadata;
177
+ this._vmStream = vm;
178
+ return this;
179
+ }
180
+ /**
181
+ * Attach a Kernel browser session to this test.
182
+ * Metadata fields map to Kernel's browser API response
183
+ * (see https://www.kernel.sh/docs).
184
+ */
185
+ setKernelVm(sessionId, opts) {
186
+ const metadata = {};
187
+ if (opts?.liveViewUrl != null)
188
+ metadata.live_view_url = opts.liveViewUrl;
189
+ if (opts?.cdpWsUrl != null)
190
+ metadata.cdp_ws_url = opts.cdpWsUrl;
191
+ if (opts?.replayId != null)
192
+ metadata.replay_id = opts.replayId;
193
+ if (opts?.replayViewUrl != null)
194
+ metadata.replay_view_url = opts.replayViewUrl;
195
+ if (opts?.headless != null)
196
+ metadata.headless = opts.headless;
197
+ if (opts?.stealth != null)
198
+ metadata.stealth = opts.stealth;
199
+ if (opts?.viewport != null)
200
+ metadata.viewport = opts.viewport;
201
+ return this.setVmStream("kernel", {
202
+ sessionId,
203
+ durationMs: opts?.durationMs,
204
+ logs: opts?.logs,
205
+ metadata: Object.keys(metadata).length > 0 ? metadata : undefined,
206
+ });
207
+ }
159
208
  complete(status = "completed") {
160
209
  this._status = status;
161
210
  this._completedAt = now();
@@ -171,6 +220,8 @@ export class TestBuilder {
171
220
  result.started_at = this._startedAt;
172
221
  if (this._completedAt)
173
222
  result.completed_at = this._completedAt;
223
+ if (this._vmStream)
224
+ result.vm_stream = this._vmStream;
174
225
  return result;
175
226
  }
176
227
  _resolveIndex(explicit) {
@@ -189,6 +240,10 @@ export class RunBuilder {
189
240
  _completedAt = null;
190
241
  /** @internal */
191
242
  _tests = [];
243
+ /** The list of tests in this run. Use this to attach VM streams after eval. */
244
+ get tests() {
245
+ return this._tests;
246
+ }
192
247
  start() {
193
248
  this._status = "running";
194
249
  this._startedAt = now();
@@ -0,0 +1,160 @@
1
+ /**
2
+ * Production agent tracing for Ashr Labs Observability.
3
+ *
4
+ * **Production-safe:** tracing never throws or interferes with your agent.
5
+ * If the backend is unreachable, `trace.end()` resolves with an error object
6
+ * instead of rejecting.
7
+ *
8
+ * @example Manual instrumentation
9
+ * ```typescript
10
+ * const trace = client.trace("handle-ticket", { userId: "user_42" });
11
+ *
12
+ * const gen = trace.generation("classify", { model: "claude-sonnet-4-6", input: [...] });
13
+ * gen.end({ output: { intent: "reset" }, usage: { input_tokens: 50, output_tokens: 12 } });
14
+ *
15
+ * const tool = trace.span("tool:reset_password", { input: { user_id: "42" } });
16
+ * tool.end({ output: { success: true } });
17
+ *
18
+ * await trace.end({ output: { resolution: "password_reset" } });
19
+ * ```
20
+ *
21
+ * @example Using `wrap()` for automatic span lifecycle
22
+ * ```typescript
23
+ * const result = await trace.span("tool:search", { input: { q: "..." } }).wrap(async (span) => {
24
+ * const data = await search(...);
25
+ * span.end({ output: data });
26
+ * return data;
27
+ * });
28
+ * // If the callback throws, the span auto-ends with level="ERROR" and the error re-throws.
29
+ * ```
30
+ */
31
+ import type { AshrLabsClient } from "./client.js";
32
+ interface ObservationData {
33
+ id: string;
34
+ type: "span" | "generation" | "event";
35
+ name: string;
36
+ parent_observation_id: string | null;
37
+ start_time: string;
38
+ end_time?: string | null;
39
+ input?: unknown | null;
40
+ output?: unknown | null;
41
+ metadata?: Record<string, unknown> | null;
42
+ model?: string | null;
43
+ usage?: {
44
+ input_tokens?: number;
45
+ output_tokens?: number;
46
+ } | null;
47
+ level?: string | null;
48
+ status_message?: string | null;
49
+ }
50
+ export declare class Span {
51
+ readonly id: string;
52
+ protected _trace: Trace;
53
+ protected _data: ObservationData;
54
+ protected _ended: boolean;
55
+ constructor(trace: Trace, name: string, opts?: {
56
+ parentId?: string | null;
57
+ input?: unknown;
58
+ metadata?: Record<string, unknown>;
59
+ level?: string;
60
+ });
61
+ /** Create a child span nested under this span. */
62
+ span(name: string, opts?: {
63
+ input?: unknown;
64
+ metadata?: Record<string, unknown>;
65
+ }): Span;
66
+ /** Create a child generation nested under this span. */
67
+ generation(name: string, opts?: {
68
+ model?: string;
69
+ input?: unknown;
70
+ metadata?: Record<string, unknown>;
71
+ }): Generation;
72
+ /** Record a point-in-time event under this span. */
73
+ event(name: string, opts?: {
74
+ input?: unknown;
75
+ metadata?: Record<string, unknown>;
76
+ level?: string;
77
+ }): void;
78
+ /** Mark this span as complete. */
79
+ end(opts?: {
80
+ output?: unknown;
81
+ statusMessage?: string;
82
+ level?: string;
83
+ }): void;
84
+ /**
85
+ * Run a callback within this span's lifecycle.
86
+ * Auto-ends the span when the callback completes.
87
+ * If the callback throws, the span is ended with `level="ERROR"` and the error re-throws.
88
+ */
89
+ wrap<T>(fn: (span: this) => T | Promise<T>): Promise<T>;
90
+ }
91
+ export declare class Generation extends Span {
92
+ constructor(trace: Trace, name: string, opts?: {
93
+ parentId?: string | null;
94
+ model?: string;
95
+ input?: unknown;
96
+ metadata?: Record<string, unknown>;
97
+ });
98
+ /** Mark this generation as complete. */
99
+ end(opts?: {
100
+ output?: unknown;
101
+ usage?: {
102
+ input_tokens?: number;
103
+ output_tokens?: number;
104
+ };
105
+ statusMessage?: string;
106
+ level?: string;
107
+ }): void;
108
+ }
109
+ export declare class Trace {
110
+ /** @internal */
111
+ _observations: ObservationData[];
112
+ private _client;
113
+ private _name;
114
+ private _userId;
115
+ private _sessionId;
116
+ private _metadata;
117
+ private _tags;
118
+ private _traceId;
119
+ private _flushed;
120
+ constructor(client: AshrLabsClient, name: string, opts?: {
121
+ userId?: string;
122
+ sessionId?: string;
123
+ metadata?: Record<string, unknown>;
124
+ tags?: string[];
125
+ });
126
+ /** The server-assigned trace ID (available after `end()` resolves). */
127
+ get traceId(): string | null;
128
+ /** Create a top-level span in this trace. */
129
+ span(name: string, opts?: {
130
+ input?: unknown;
131
+ metadata?: Record<string, unknown>;
132
+ }): Span;
133
+ /** Create a top-level generation (LLM call) in this trace. */
134
+ generation(name: string, opts?: {
135
+ model?: string;
136
+ input?: unknown;
137
+ metadata?: Record<string, unknown>;
138
+ }): Generation;
139
+ /** Record a point-in-time event in this trace. */
140
+ event(name: string, opts?: {
141
+ input?: unknown;
142
+ metadata?: Record<string, unknown>;
143
+ level?: string;
144
+ }): void;
145
+ /**
146
+ * Run a callback within this trace's lifecycle.
147
+ * Auto-flushes the trace when the callback completes.
148
+ */
149
+ wrap<T>(fn: (trace: this) => T | Promise<T>): Promise<T>;
150
+ /**
151
+ * Flush the trace to the Ashr Labs backend.
152
+ *
153
+ * **Never rejects.** If the backend is unreachable, logs the error
154
+ * and resolves with `{ status: "error", message: "..." }`.
155
+ */
156
+ end(opts?: {
157
+ output?: unknown;
158
+ }): Promise<Record<string, unknown>>;
159
+ }
160
+ export {};
@@ -0,0 +1,229 @@
1
+ /**
2
+ * Production agent tracing for Ashr Labs Observability.
3
+ *
4
+ * **Production-safe:** tracing never throws or interferes with your agent.
5
+ * If the backend is unreachable, `trace.end()` resolves with an error object
6
+ * instead of rejecting.
7
+ *
8
+ * @example Manual instrumentation
9
+ * ```typescript
10
+ * const trace = client.trace("handle-ticket", { userId: "user_42" });
11
+ *
12
+ * const gen = trace.generation("classify", { model: "claude-sonnet-4-6", input: [...] });
13
+ * gen.end({ output: { intent: "reset" }, usage: { input_tokens: 50, output_tokens: 12 } });
14
+ *
15
+ * const tool = trace.span("tool:reset_password", { input: { user_id: "42" } });
16
+ * tool.end({ output: { success: true } });
17
+ *
18
+ * await trace.end({ output: { resolution: "password_reset" } });
19
+ * ```
20
+ *
21
+ * @example Using `wrap()` for automatic span lifecycle
22
+ * ```typescript
23
+ * const result = await trace.span("tool:search", { input: { q: "..." } }).wrap(async (span) => {
24
+ * const data = await search(...);
25
+ * span.end({ output: data });
26
+ * return data;
27
+ * });
28
+ * // If the callback throws, the span auto-ends with level="ERROR" and the error re-throws.
29
+ * ```
30
+ */
31
/**
 * Current wall-clock time as an ISO-8601 UTC string
 * (the output of `Date.prototype.toISOString`).
 */
function now() {
    const current = new Date();
    return current.toISOString();
}
34
/** Number of IDs handed out by `makeId()` so far in this module instance. */
let _counter = 0;
/**
 * Build an observation ID by concatenating three base-36 parts, in order:
 * the current epoch milliseconds, the incremented call counter, and up to
 * six characters taken from `Math.random()`.
 */
function makeId() {
    _counter += 1;
    const timePart = Date.now().toString(36);
    const sequencePart = _counter.toString(36);
    // slice(2, 8) drops the leading "0." and keeps at most six digits.
    const randomPart = Math.random().toString(36).slice(2, 8);
    return timePart + sequencePart + randomPart;
}
39
+ export class Span {
40
+ id;
41
+ _trace;
42
+ _data;
43
+ _ended = false;
44
+ constructor(trace, name, opts = {}) {
45
+ this.id = makeId();
46
+ this._trace = trace;
47
+ this._data = {
48
+ id: this.id,
49
+ type: "span",
50
+ name,
51
+ parent_observation_id: opts.parentId ?? null,
52
+ start_time: now(),
53
+ input: opts.input ?? null,
54
+ metadata: opts.metadata ?? null,
55
+ level: opts.level ?? null,
56
+ };
57
+ trace._observations.push(this._data);
58
+ }
59
+ /** Create a child span nested under this span. */
60
+ span(name, opts = {}) {
61
+ return new Span(this._trace, name, { ...opts, parentId: this.id });
62
+ }
63
+ /** Create a child generation nested under this span. */
64
+ generation(name, opts = {}) {
65
+ return new Generation(this._trace, name, { ...opts, parentId: this.id });
66
+ }
67
+ /** Record a point-in-time event under this span. */
68
+ event(name, opts = {}) {
69
+ this._trace._observations.push({
70
+ id: makeId(),
71
+ type: "event",
72
+ name,
73
+ parent_observation_id: this.id,
74
+ start_time: now(),
75
+ input: opts.input ?? null,
76
+ metadata: opts.metadata ?? null,
77
+ level: opts.level ?? null,
78
+ });
79
+ }
80
+ /** Mark this span as complete. */
81
+ end(opts = {}) {
82
+ this._data.end_time = now();
83
+ if (opts.output !== undefined)
84
+ this._data.output = opts.output;
85
+ if (opts.statusMessage !== undefined)
86
+ this._data.status_message = opts.statusMessage;
87
+ if (opts.level !== undefined)
88
+ this._data.level = opts.level;
89
+ this._ended = true;
90
+ }
91
+ /**
92
+ * Run a callback within this span's lifecycle.
93
+ * Auto-ends the span when the callback completes.
94
+ * If the callback throws, the span is ended with `level="ERROR"` and the error re-throws.
95
+ */
96
+ async wrap(fn) {
97
+ try {
98
+ const result = await fn(this);
99
+ if (!this._ended)
100
+ this.end();
101
+ return result;
102
+ }
103
+ catch (e) {
104
+ if (!this._ended) {
105
+ this.end({
106
+ statusMessage: e instanceof Error ? `${e.name}: ${e.message}` : String(e),
107
+ level: "ERROR",
108
+ });
109
+ }
110
+ throw e;
111
+ }
112
+ }
113
+ }
114
/**
 * A span that represents an LLM call: its record has `type: "generation"`
 * and may additionally carry a `model` name and token `usage`.
 */
export class Generation extends Span {
    /**
     * @param trace Owning trace (see `Span`).
     * @param name Observation name.
     * @param opts Same options as `Span`, plus an optional `model`;
     *   `model` is only recorded when it is truthy.
     */
    constructor(trace, name, opts = {}) {
        super(trace, name, opts);
        // The base constructor has already registered the record; retag it in place.
        this._data.type = "generation";
        if (opts.model)
            this._data.model = opts.model;
    }
    /**
     * Mark this generation as complete.
     *
     * Only the first call has an effect; a repeated call is ignored so it
     * cannot overwrite the recorded usage or outcome.
     *
     * @param opts Optional `output`, `usage`, `statusMessage` and `level`.
     */
    end(opts = {}) {
        if (this._ended)
            return;
        if (opts.usage !== undefined)
            this._data.usage = opts.usage;
        // End time, output, status message, level and the ended flag are
        // handled exactly as for a plain span.
        super.end(opts);
    }
}
135
+ export class Trace {
136
+ /** @internal */
137
+ _observations = [];
138
+ _client;
139
+ _name;
140
+ _userId;
141
+ _sessionId;
142
+ _metadata;
143
+ _tags;
144
+ _traceId = null;
145
+ _flushed = false;
146
+ constructor(client, name, opts = {}) {
147
+ this._client = client;
148
+ this._name = name;
149
+ this._userId = opts.userId ?? null;
150
+ this._sessionId = opts.sessionId ?? null;
151
+ this._metadata = opts.metadata ?? null;
152
+ this._tags = opts.tags ? [...opts.tags] : [];
153
+ }
154
+ /** The server-assigned trace ID (available after `end()` resolves). */
155
+ get traceId() {
156
+ return this._traceId;
157
+ }
158
+ /** Create a top-level span in this trace. */
159
+ span(name, opts = {}) {
160
+ return new Span(this, name, opts);
161
+ }
162
+ /** Create a top-level generation (LLM call) in this trace. */
163
+ generation(name, opts = {}) {
164
+ return new Generation(this, name, opts);
165
+ }
166
+ /** Record a point-in-time event in this trace. */
167
+ event(name, opts = {}) {
168
+ this._observations.push({
169
+ id: makeId(),
170
+ type: "event",
171
+ name,
172
+ parent_observation_id: null,
173
+ start_time: now(),
174
+ input: opts.input ?? null,
175
+ metadata: opts.metadata ?? null,
176
+ level: opts.level ?? null,
177
+ });
178
+ }
179
+ /**
180
+ * Run a callback within this trace's lifecycle.
181
+ * Auto-flushes the trace when the callback completes.
182
+ */
183
+ async wrap(fn) {
184
+ try {
185
+ const result = await fn(this);
186
+ if (!this._flushed)
187
+ await this.end();
188
+ return result;
189
+ }
190
+ catch (e) {
191
+ if (!this._flushed) {
192
+ await this.end({
193
+ output: { error: e instanceof Error ? `${e.name}: ${e.message}` : String(e) },
194
+ });
195
+ }
196
+ throw e;
197
+ }
198
+ }
199
+ /**
200
+ * Flush the trace to the Ashr Labs backend.
201
+ *
202
+ * **Never rejects.** If the backend is unreachable, logs the error
203
+ * and resolves with `{ status: "error", message: "..." }`.
204
+ */
205
+ async end(opts = {}) {
206
+ this._flushed = true;
207
+ const payload = {
208
+ trace: {
209
+ name: this._name,
210
+ user_id: this._userId,
211
+ session_id: this._sessionId,
212
+ metadata: this._metadata,
213
+ tags: this._tags,
214
+ observations: this._observations,
215
+ ...(opts.output !== undefined ? { output: opts.output } : {}),
216
+ },
217
+ };
218
+ try {
219
+ const response = await this._client._makeRequest("ingest_observability_trace", payload);
220
+ this._traceId = response.trace_id ?? null;
221
+ return response;
222
+ }
223
+ catch (e) {
224
+ const message = e instanceof Error ? e.message : String(e);
225
+ console.warn(`[ashr_labs] Failed to flush trace "${this._name}": ${message}`);
226
+ return { status: "error", message };
227
+ }
228
+ }
229
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ashr-labs",
3
- "version": "0.4.2",
3
+ "version": "0.4.3",
4
4
  "description": "TypeScript SDK for the Ashr Labs API — agent testing & evaluation",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",