ashr-labs 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/eval.d.ts +4 -4
- package/dist/eval.js +9 -15
- package/package.json +5 -1
package/dist/eval.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { RunBuilder } from "./run-builder.js";
|
|
2
2
|
import type { AshrLabsClient } from "./client.js";
|
|
3
3
|
export interface Agent {
|
|
4
|
-
respond(message: string): Record<string, unknown> | Promise<Record<string, unknown>>;
|
|
5
|
-
reset(): void | Promise<void>;
|
|
4
|
+
respond(message: string, scenarioId?: string): Record<string, unknown> | Promise<Record<string, unknown>>;
|
|
5
|
+
reset(scenarioId?: string): void | Promise<void>;
|
|
6
6
|
}
|
|
7
7
|
export type OnScenarioCallback = (scenarioId: string, scenario: Record<string, unknown>) => void;
|
|
8
8
|
export type OnActionCallback = (actionIndex: number, action: Record<string, unknown>) => void;
|
|
@@ -31,12 +31,12 @@ export declare class EvalRunner {
|
|
|
31
31
|
};
|
|
32
32
|
}): Promise<EvalRunner>;
|
|
33
33
|
private _runScenario;
|
|
34
|
-
run(agent: Agent, options?: {
|
|
34
|
+
run(agent: Agent | (() => Agent), options?: {
|
|
35
35
|
onScenario?: OnScenarioCallback;
|
|
36
36
|
onAction?: OnActionCallback;
|
|
37
37
|
maxWorkers?: number;
|
|
38
38
|
}): Promise<RunBuilder>;
|
|
39
|
-
runAndDeploy(agent: Agent, client: AshrLabsClient, datasetId?: number, options?: {
|
|
39
|
+
runAndDeploy(agent: Agent | (() => Agent), client: AshrLabsClient, datasetId?: number, options?: {
|
|
40
40
|
onScenario?: OnScenarioCallback;
|
|
41
41
|
onAction?: OnActionCallback;
|
|
42
42
|
maxWorkers?: number;
|
package/dist/eval.js
CHANGED
|
@@ -22,7 +22,7 @@ export class EvalRunner {
|
|
|
22
22
|
async _runScenario(agent, runId, scenario, onScenario, onAction) {
|
|
23
23
|
if (onScenario)
|
|
24
24
|
onScenario(runId, scenario);
|
|
25
|
-
await agent.reset();
|
|
25
|
+
await agent.reset(runId);
|
|
26
26
|
const test = new TestBuilder(runId);
|
|
27
27
|
test.start();
|
|
28
28
|
let agentText = "";
|
|
@@ -37,7 +37,7 @@ export class EvalRunner {
|
|
|
37
37
|
if (actor === "user") {
|
|
38
38
|
test.addUserText(content, action.name ?? `user_action_${i}`, i);
|
|
39
39
|
try {
|
|
40
|
-
const result = await agent.respond(content);
|
|
40
|
+
const result = await agent.respond(content, runId);
|
|
41
41
|
agentText = (result.text ?? "");
|
|
42
42
|
agentTools = [...(result.tool_calls ?? [])];
|
|
43
43
|
}
|
|
@@ -106,36 +106,30 @@ export class EvalRunner {
|
|
|
106
106
|
}
|
|
107
107
|
}
|
|
108
108
|
const maxWorkers = options?.maxWorkers ?? 1;
|
|
109
|
+
const resolvedAgent = typeof agent === "function" ? agent() : agent;
|
|
109
110
|
if (maxWorkers <= 1) {
|
|
110
|
-
// Sequential — use the agent directly
|
|
111
111
|
for (const [runId, scenario] of scenarios) {
|
|
112
|
-
const test = await this._runScenario(agent, runId, scenario, options?.onScenario, options?.onAction);
|
|
112
|
+
const test = await this._runScenario(resolvedAgent, runId, scenario, options?.onScenario, options?.onAction);
|
|
113
113
|
run._tests.push(test);
|
|
114
114
|
}
|
|
115
115
|
}
|
|
116
116
|
else {
|
|
117
117
|
// Parallel — run scenarios concurrently with concurrency limit.
|
|
118
|
-
//
|
|
119
|
-
//
|
|
120
|
-
//
|
|
121
|
-
//
|
|
118
|
+
// The agent must key its conversation state on the scenarioId
|
|
119
|
+
// passed to respond(message, scenarioId) and reset(scenarioId).
|
|
120
|
+
// This allows a single agent instance (one API client) to handle
|
|
121
|
+
// multiple concurrent scenarios without cloning or extra clients.
|
|
122
122
|
const results = new Array(scenarios.length).fill(null);
|
|
123
|
-
// Process in batches of maxWorkers
|
|
124
123
|
for (let batchStart = 0; batchStart < scenarios.length; batchStart += maxWorkers) {
|
|
125
124
|
const batchEnd = Math.min(batchStart + maxWorkers, scenarios.length);
|
|
126
125
|
const batch = scenarios.slice(batchStart, batchEnd);
|
|
127
126
|
const promises = batch.map(async ([runId, scenario], batchIdx) => {
|
|
128
127
|
const idx = batchStart + batchIdx;
|
|
129
128
|
try {
|
|
130
|
-
|
|
131
|
-
const agentCopy = structuredClone(agent);
|
|
132
|
-
// Restore prototype methods lost by structuredClone
|
|
133
|
-
Object.setPrototypeOf(agentCopy, Object.getPrototypeOf(agent));
|
|
134
|
-
const test = await this._runScenario(agentCopy, runId, scenario, options?.onScenario, options?.onAction);
|
|
129
|
+
const test = await this._runScenario(resolvedAgent, runId, scenario, options?.onScenario, options?.onAction);
|
|
135
130
|
results[idx] = test;
|
|
136
131
|
}
|
|
137
132
|
catch {
|
|
138
|
-
// Scenario raised — record as a failed test
|
|
139
133
|
const failed = new TestBuilder(runId);
|
|
140
134
|
failed.start();
|
|
141
135
|
failed.complete("failed");
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ashr-labs",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "TypeScript SDK for the Ashr Labs API",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -33,5 +33,9 @@
|
|
|
33
33
|
},
|
|
34
34
|
"engines": {
|
|
35
35
|
"node": ">=18.0.0"
|
|
36
|
+
},
|
|
37
|
+
"dependencies": {
|
|
38
|
+
"@anthropic-ai/sdk": "^0.78.0",
|
|
39
|
+
"tsx": "^4.21.0"
|
|
36
40
|
}
|
|
37
41
|
}
|