@arizeai/phoenix-client 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/client.d.ts +13 -1
- package/dist/esm/client.d.ts.map +1 -1
- package/dist/esm/client.js +4 -1
- package/dist/esm/client.js.map +1 -1
- package/dist/esm/experiments/instrumention.d.ts +18 -0
- package/dist/esm/experiments/instrumention.d.ts.map +1 -0
- package/dist/esm/experiments/instrumention.js +34 -0
- package/dist/esm/experiments/instrumention.js.map +1 -0
- package/dist/esm/experiments/runExperiment.d.ts +19 -16
- package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/runExperiment.js +209 -98
- package/dist/esm/experiments/runExperiment.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/experiments.d.ts +0 -1
- package/dist/esm/types/experiments.d.ts.map +1 -1
- package/dist/esm/utils/ensureString.d.ts +8 -0
- package/dist/esm/utils/ensureString.d.ts.map +1 -0
- package/dist/esm/utils/ensureString.js +14 -0
- package/dist/esm/utils/ensureString.js.map +1 -0
- package/dist/esm/utils/objectAsAttributes.d.ts +3 -0
- package/dist/esm/utils/objectAsAttributes.d.ts.map +1 -0
- package/dist/esm/utils/objectAsAttributes.js +4 -0
- package/dist/esm/utils/objectAsAttributes.js.map +1 -0
- package/dist/src/client.d.ts +13 -1
- package/dist/src/client.d.ts.map +1 -1
- package/dist/src/client.js +1 -1
- package/dist/src/client.js.map +1 -1
- package/dist/src/experiments/instrumention.d.ts +18 -0
- package/dist/src/experiments/instrumention.d.ts.map +1 -0
- package/dist/src/experiments/instrumention.js +38 -0
- package/dist/src/experiments/instrumention.js.map +1 -0
- package/dist/src/experiments/runExperiment.d.ts +19 -16
- package/dist/src/experiments/runExperiment.d.ts.map +1 -1
- package/dist/src/experiments/runExperiment.js +211 -102
- package/dist/src/experiments/runExperiment.js.map +1 -1
- package/dist/src/types/experiments.d.ts +0 -1
- package/dist/src/types/experiments.d.ts.map +1 -1
- package/dist/src/utils/ensureString.d.ts +8 -0
- package/dist/src/utils/ensureString.d.ts.map +1 -0
- package/dist/src/utils/ensureString.js +18 -0
- package/dist/src/utils/ensureString.js.map +1 -0
- package/dist/src/utils/objectAsAttributes.d.ts +3 -0
- package/dist/src/utils/objectAsAttributes.d.ts.map +1 -0
- package/dist/src/utils/objectAsAttributes.js +7 -0
- package/dist/src/utils/objectAsAttributes.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +9 -1
- package/src/client.ts +4 -1
- package/src/experiments/instrumention.ts +52 -0
- package/src/experiments/runExperiment.ts +246 -108
- package/src/types/experiments.ts +0 -1
- package/src/utils/ensureString.ts +14 -0
- package/src/utils/objectAsAttributes.ts +9 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arizeai/phoenix-client",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "A client for the Phoenix API",
|
|
5
5
|
"main": "dist/src/index.js",
|
|
6
6
|
"module": "dist/esm/index.js",
|
|
@@ -52,6 +52,14 @@
|
|
|
52
52
|
"vitest": "^2.1.9"
|
|
53
53
|
},
|
|
54
54
|
"dependencies": {
|
|
55
|
+
"@arizeai/openinference-semantic-conventions": "^1.1.0",
|
|
56
|
+
"@opentelemetry/api": "^1.9.0",
|
|
57
|
+
"@opentelemetry/core": "^1.25.1",
|
|
58
|
+
"@opentelemetry/instrumentation": "^0.57.2",
|
|
59
|
+
"@opentelemetry/exporter-trace-otlp-proto": "^0.57.2",
|
|
60
|
+
"@opentelemetry/resources": "^2.0.0",
|
|
61
|
+
"@opentelemetry/sdk-trace-base": "^1.30.1",
|
|
62
|
+
"@opentelemetry/sdk-trace-node": "^1.30.1",
|
|
55
63
|
"async": "^3.2.6",
|
|
56
64
|
"openapi-fetch": "^0.12.5",
|
|
57
65
|
"tiny-invariant": "^1.3.3",
|
package/src/client.ts
CHANGED
|
@@ -79,7 +79,10 @@ export const createClient = (
|
|
|
79
79
|
} = {}
|
|
80
80
|
) => {
|
|
81
81
|
const mergedOptions = getMergedOptions(config);
|
|
82
|
-
return
|
|
82
|
+
return {
|
|
83
|
+
...createOpenApiClient<pathsV1>(mergedOptions),
|
|
84
|
+
config: mergedOptions,
|
|
85
|
+
};
|
|
83
86
|
};
|
|
84
87
|
|
|
85
88
|
/**
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import { diag, DiagConsoleLogger, DiagLogLevel } from "@opentelemetry/api";
|
|
2
|
+
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
|
|
3
|
+
import { resourceFromAttributes } from "@opentelemetry/resources";
|
|
4
|
+
import { SimpleSpanProcessor } from "@opentelemetry/sdk-trace-base";
|
|
5
|
+
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
6
|
+
import { SEMRESATTRS_PROJECT_NAME } from "@arizeai/openinference-semantic-conventions";
|
|
7
|
+
import { HeadersOptions } from "openapi-fetch";
|
|
8
|
+
|
|
9
|
+
/**
 * Creates a provider that exports traces to Phoenix.
 *
 * The provider carries a resource whose project-name attribute is set to
 * `projectName`, and a single `SimpleSpanProcessor` wrapping an
 * `OTLPTraceExporter` pointed at `<baseUrl>/v1/traces`.
 *
 * Side effect: every call installs a `DiagConsoleLogger` at `ERROR` level on
 * the global OpenTelemetry `diag` API.
 *
 * @param params.projectName - The project name recorded on the provider's resource.
 * @param params.baseUrl - The base URL of Phoenix, without the `/v1/traces`
 *   path. Trailing slashes are stripped before the path is appended.
 * @param params.headers - Headers handed to the exporter. An array of
 *   `[name, value]` pairs is converted to an object; any other shape is
 *   passed through unchanged.
 * @returns The configured `NodeTracerProvider`.
 */
export function createProvider({
  projectName,
  baseUrl,
  headers,
}: {
  projectName: string;
  headers: HeadersOptions;
  /**
   * The base URL of the Phoenix. Doesn't include the /v1/traces path.
   */
  baseUrl: string;
}) {
  diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.ERROR);

  // A base URL such as "http://localhost:6006/" would otherwise produce
  // "http://localhost:6006//v1/traces", so drop trailing slashes first.
  const tracesUrl = `${baseUrl.replace(/\/+$/, "")}/v1/traces`;

  const provider = new NodeTracerProvider({
    resource: resourceFromAttributes({
      [SEMRESATTRS_PROJECT_NAME]: projectName,
    }),
    spanProcessors: [
      new SimpleSpanProcessor(
        new OTLPTraceExporter({
          url: tracesUrl,
          // The tuple-array form of headers is not an object map; convert it.
          headers: Array.isArray(headers)
            ? Object.fromEntries(headers)
            : headers,
        })
      ),
    ],
  });

  return provider;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Builds the provider used for dry runs.
 *
 * It is constructed from an empty configuration, so no span processor or
 * exporter is attached and no traces are exported.
 *
 * @returns A `NodeTracerProvider` created with an empty config object.
 */
export function createNoOpProvider() {
  return new NodeTracerProvider({});
}
|
|
@@ -7,7 +7,6 @@ import type {
|
|
|
7
7
|
Evaluator,
|
|
8
8
|
Experiment,
|
|
9
9
|
ExperimentEvaluationRun,
|
|
10
|
-
ExperimentParameters,
|
|
11
10
|
ExperimentRun,
|
|
12
11
|
ExperimentTask,
|
|
13
12
|
RanExperiment,
|
|
@@ -17,6 +16,16 @@ import { getDatasetBySelector } from "../utils/getDatasetBySelector";
|
|
|
17
16
|
import { pluralize } from "../utils/pluralize";
|
|
18
17
|
import { promisifyResult } from "../utils/promisifyResult";
|
|
19
18
|
import { AnnotatorKind } from "../types/annotations";
|
|
19
|
+
import { createProvider, createNoOpProvider } from "./instrumention";
|
|
20
|
+
import { SpanStatusCode, Tracer } from "@opentelemetry/api";
|
|
21
|
+
import {
|
|
22
|
+
MimeType,
|
|
23
|
+
OpenInferenceSpanKind,
|
|
24
|
+
SemanticConventions,
|
|
25
|
+
} from "@arizeai/openinference-semantic-conventions";
|
|
26
|
+
import { ensureString } from "../utils/ensureString";
|
|
27
|
+
import type { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
28
|
+
import { objectAsAttributes } from "../utils/objectAsAttributes";
|
|
20
29
|
|
|
21
30
|
/**
|
|
22
31
|
* Parameters for running an experiment.
|
|
@@ -49,10 +58,6 @@ export type RunExperimentParams = ClientFn & {
|
|
|
49
58
|
* The evaluators to use
|
|
50
59
|
*/
|
|
51
60
|
evaluators?: Evaluator[];
|
|
52
|
-
/**
|
|
53
|
-
* The project under which the experiment task traces are recorded
|
|
54
|
-
*/
|
|
55
|
-
projectName?: string;
|
|
56
61
|
/**
|
|
57
62
|
* The logger to use
|
|
58
63
|
*/
|
|
@@ -73,7 +78,23 @@ export type RunExperimentParams = ClientFn & {
|
|
|
73
78
|
};
|
|
74
79
|
|
|
75
80
|
/**
|
|
76
|
-
*
|
|
81
|
+
 * Runs an experiment using a given dataset of examples.
|
|
82
|
+
*
|
|
83
|
+
* An experiment is a user-defined task that runs on each example in a dataset. The results from
|
|
84
|
+
* each experiment can be evaluated using any number of evaluators to measure the behavior of the
|
|
85
|
+
* task. The experiment and evaluation results are stored in the Phoenix database for comparison
|
|
86
|
+
* and analysis.
|
|
87
|
+
*
|
|
88
|
+
* A `task` is either a sync or async function that returns a JSON serializable
|
|
89
|
+
* output. If the `task` is a function of one argument then that argument will be bound to the
|
|
90
|
+
* `input` field of the dataset example. Alternatively, the `task` can be a function of any
|
|
91
|
+
* combination of specific argument names that will be bound to special values:
|
|
92
|
+
*
|
|
93
|
+
* - `input`: The input field of the dataset example
|
|
94
|
+
* - `expected`: The expected or reference output of the dataset example
|
|
95
|
+
* - `reference`: An alias for `expected`
|
|
96
|
+
* - `metadata`: Metadata associated with the dataset example
|
|
97
|
+
* - `example`: The dataset `Example` object with all associated fields
|
|
77
98
|
*
|
|
78
99
|
* @example
|
|
79
100
|
* ```ts
|
|
@@ -83,27 +104,25 @@ export type RunExperimentParams = ClientFn & {
|
|
|
83
104
|
* dataset: "my-dataset",
|
|
84
105
|
* task: async (example) => example.input,
|
|
85
106
|
* evaluators: [
|
|
86
|
-
* asEvaluator("my-evaluator", "CODE", async (params) => params.output),
|
|
107
|
+
* asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
|
|
87
108
|
* ],
|
|
88
109
|
* });
|
|
89
110
|
* ```
|
|
90
|
-
*
|
|
91
|
-
* @experimental This feature is not complete, and will change in the future.
|
|
92
111
|
*/
|
|
93
112
|
export async function runExperiment({
|
|
94
|
-
experimentName
|
|
113
|
+
experimentName,
|
|
95
114
|
experimentDescription,
|
|
96
115
|
experimentMetadata,
|
|
97
116
|
client: _client,
|
|
98
117
|
dataset: _dataset,
|
|
99
118
|
task,
|
|
100
119
|
evaluators,
|
|
101
|
-
projectName = "default",
|
|
102
120
|
logger = console,
|
|
103
121
|
record = true,
|
|
104
122
|
concurrency = 5,
|
|
105
123
|
dryRun = false,
|
|
106
124
|
}: RunExperimentParams): Promise<RanExperiment> {
|
|
125
|
+
let provider: NodeTracerProvider | undefined;
|
|
107
126
|
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
108
127
|
const client = _client ?? createClient();
|
|
109
128
|
const dataset = await getDatasetBySelector({ dataset: _dataset, client });
|
|
@@ -114,19 +133,18 @@ export async function runExperiment({
|
|
|
114
133
|
? Math.max(dryRun, dataset.examples.length)
|
|
115
134
|
: dataset.examples.length;
|
|
116
135
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
nExamples,
|
|
121
|
-
};
|
|
136
|
+
let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
|
|
137
|
+
// initialize the tracer into scope
|
|
138
|
+
let taskTracer: Tracer;
|
|
122
139
|
let experiment: Experiment;
|
|
123
140
|
if (isDryRun) {
|
|
124
141
|
experiment = {
|
|
125
|
-
id:
|
|
142
|
+
id: localId(),
|
|
126
143
|
datasetId: dataset.id,
|
|
127
144
|
datasetVersionId: dataset.versionId,
|
|
128
145
|
projectName,
|
|
129
146
|
};
|
|
147
|
+
taskTracer = createNoOpProvider().getTracer("no-op");
|
|
130
148
|
} else {
|
|
131
149
|
const experimentResponse = await client
|
|
132
150
|
.POST("/v1/datasets/{dataset_id}/experiments", {
|
|
@@ -144,14 +162,26 @@ export async function runExperiment({
|
|
|
144
162
|
})
|
|
145
163
|
.then((res) => res.data?.data);
|
|
146
164
|
invariant(experimentResponse, `Failed to create experiment`);
|
|
165
|
+
projectName = experimentResponse.project_name ?? projectName;
|
|
147
166
|
experiment = {
|
|
148
167
|
id: experimentResponse.id,
|
|
149
168
|
datasetId: dataset.id,
|
|
150
169
|
datasetVersionId: dataset.versionId,
|
|
151
170
|
projectName,
|
|
152
171
|
};
|
|
172
|
+
// Initialize the tracer, now that we have a project name
|
|
173
|
+
const baseUrl = client.config.baseUrl;
|
|
174
|
+
invariant(
|
|
175
|
+
baseUrl,
|
|
176
|
+
"Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
|
|
177
|
+
);
|
|
178
|
+
provider = createProvider({
|
|
179
|
+
projectName,
|
|
180
|
+
baseUrl,
|
|
181
|
+
headers: client.config.headers ?? {},
|
|
182
|
+
});
|
|
183
|
+
taskTracer = provider.getTracer(projectName);
|
|
153
184
|
}
|
|
154
|
-
|
|
155
185
|
if (!record) {
|
|
156
186
|
logger.info(
|
|
157
187
|
`🔧 Running experiment in readonly mode. Results will not be recorded.`
|
|
@@ -180,15 +210,20 @@ export async function runExperiment({
|
|
|
180
210
|
concurrency,
|
|
181
211
|
isDryRun,
|
|
182
212
|
nExamples,
|
|
213
|
+
tracer: taskTracer,
|
|
183
214
|
});
|
|
184
215
|
logger.info(`✅ Task runs completed`);
|
|
185
216
|
|
|
186
217
|
const ranExperiment: RanExperiment = {
|
|
187
218
|
...experiment,
|
|
188
|
-
params: experimentParams,
|
|
189
219
|
runs,
|
|
190
220
|
};
|
|
191
221
|
|
|
222
|
+
  // Shut down the task provider before evaluation so that pending task spans are exported
|
|
223
|
+
if (provider) {
|
|
224
|
+
await provider.shutdown?.();
|
|
225
|
+
}
|
|
226
|
+
|
|
192
227
|
const { evaluationRuns } = await evaluateExperiment({
|
|
193
228
|
experiment: ranExperiment,
|
|
194
229
|
evaluators: evaluators ?? [],
|
|
@@ -217,6 +252,7 @@ function runTask({
|
|
|
217
252
|
concurrency = 5,
|
|
218
253
|
isDryRun,
|
|
219
254
|
nExamples,
|
|
255
|
+
tracer,
|
|
220
256
|
}: {
|
|
221
257
|
/** The client to use */
|
|
222
258
|
client: PhoenixClient;
|
|
@@ -236,61 +272,88 @@ function runTask({
|
|
|
236
272
|
isDryRun: boolean;
|
|
237
273
|
/** The number of examples to run */
|
|
238
274
|
nExamples: number;
|
|
275
|
+
  /** Tracer instance that will be used to create spans from task calls */
|
|
276
|
+
tracer: Tracer;
|
|
239
277
|
}) {
|
|
240
278
|
logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
|
|
241
279
|
const run = async (example: Example) => {
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
280
|
+
return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
|
|
281
|
+
logger.info(
|
|
282
|
+
`🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`
|
|
283
|
+
);
|
|
284
|
+
const traceId = span.spanContext().traceId;
|
|
285
|
+
const thisRun: ExperimentRun = {
|
|
286
|
+
id: localId(), // initialized with local id, will be replaced with server-assigned id when dry run is false
|
|
287
|
+
traceId,
|
|
288
|
+
experimentId,
|
|
289
|
+
datasetExampleId: example.id,
|
|
290
|
+
startTime: new Date(),
|
|
291
|
+
endTime: new Date(), // will get replaced with actual end time
|
|
292
|
+
output: null,
|
|
293
|
+
error: null,
|
|
294
|
+
};
|
|
295
|
+
try {
|
|
296
|
+
const taskOutput = await promisifyResult(task(example));
|
|
297
|
+
thisRun.output =
|
|
298
|
+
typeof taskOutput === "string"
|
|
299
|
+
? taskOutput
|
|
300
|
+
: JSON.stringify(taskOutput);
|
|
301
|
+
} catch (error) {
|
|
302
|
+
thisRun.error =
|
|
303
|
+
error instanceof Error ? error.message : "Unknown error";
|
|
304
|
+
span.setStatus({ code: SpanStatusCode.ERROR });
|
|
305
|
+
}
|
|
306
|
+
thisRun.endTime = new Date();
|
|
307
|
+
if (!isDryRun) {
|
|
308
|
+
// Log the run to the server
|
|
309
|
+
const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
|
|
310
|
+
params: {
|
|
311
|
+
path: {
|
|
312
|
+
experiment_id: experimentId,
|
|
313
|
+
},
|
|
273
314
|
},
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
315
|
+
body: {
|
|
316
|
+
dataset_example_id: example.id,
|
|
317
|
+
output: thisRun.output,
|
|
318
|
+
repetition_number: 0,
|
|
319
|
+
start_time: thisRun.startTime.toISOString(),
|
|
320
|
+
end_time: thisRun.endTime.toISOString(),
|
|
321
|
+
trace_id: thisRun.traceId,
|
|
322
|
+
error: thisRun.error,
|
|
323
|
+
},
|
|
324
|
+
});
|
|
325
|
+
// replace the local run id with the server-assigned id
|
|
326
|
+
thisRun.id = res.data?.data.id ?? thisRun.id;
|
|
327
|
+
const inputMimeType =
|
|
328
|
+
typeof example.input === "string" ? MimeType.TEXT : MimeType.JSON;
|
|
329
|
+
const outputMimeType =
|
|
330
|
+
typeof thisRun.output === "string" ? MimeType.TEXT : MimeType.JSON;
|
|
331
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
332
|
+
span.setAttributes({
|
|
333
|
+
[SemanticConventions.OPENINFERENCE_SPAN_KIND]:
|
|
334
|
+
OpenInferenceSpanKind.CHAIN,
|
|
335
|
+
[SemanticConventions.INPUT_MIME_TYPE]: inputMimeType,
|
|
336
|
+
[SemanticConventions.INPUT_VALUE]: ensureString(example.input),
|
|
337
|
+
[SemanticConventions.OUTPUT_MIME_TYPE]: outputMimeType,
|
|
338
|
+
[SemanticConventions.OUTPUT_VALUE]: ensureString(thisRun.output),
|
|
339
|
+
});
|
|
340
|
+
}
|
|
341
|
+
span?.end();
|
|
342
|
+
onComplete(thisRun);
|
|
343
|
+
return thisRun;
|
|
344
|
+
});
|
|
290
345
|
};
|
|
291
346
|
const q = queue(run, concurrency);
|
|
292
347
|
const examplesToUse = dataset.examples.slice(0, nExamples);
|
|
293
|
-
examplesToUse.forEach((example) =>
|
|
348
|
+
examplesToUse.forEach((example) =>
|
|
349
|
+
q.push(example, (err) => {
|
|
350
|
+
if (err) {
|
|
351
|
+
logger.error(
|
|
352
|
+
`Error running task "${task.name}" on example "${example.id}": ${err}`
|
|
353
|
+
);
|
|
354
|
+
}
|
|
355
|
+
})
|
|
356
|
+
);
|
|
294
357
|
return q.drain();
|
|
295
358
|
}
|
|
296
359
|
|
|
@@ -328,11 +391,29 @@ export async function evaluateExperiment({
|
|
|
328
391
|
dryRun?: boolean | number;
|
|
329
392
|
}): Promise<RanExperiment> {
|
|
330
393
|
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
394
|
+
const client = _client ?? createClient();
|
|
395
|
+
const baseUrl = client.config.baseUrl;
|
|
396
|
+
invariant(
|
|
397
|
+
baseUrl,
|
|
398
|
+
"Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
|
|
399
|
+
);
|
|
400
|
+
let provider: NodeTracerProvider;
|
|
401
|
+
if (!isDryRun) {
|
|
402
|
+
provider = createProvider({
|
|
403
|
+
projectName: "evaluators",
|
|
404
|
+
baseUrl,
|
|
405
|
+
headers: client.config.headers ?? {},
|
|
406
|
+
});
|
|
407
|
+
} else {
|
|
408
|
+
provider = createNoOpProvider();
|
|
409
|
+
}
|
|
410
|
+
const tracer = isDryRun
|
|
411
|
+
? provider.getTracer("no-op")
|
|
412
|
+
: provider.getTracer("evaluators");
|
|
331
413
|
const nRuns =
|
|
332
414
|
typeof dryRun === "number"
|
|
333
415
|
? Math.max(dryRun, Object.keys(experiment.runs).length)
|
|
334
416
|
: Object.keys(experiment.runs).length;
|
|
335
|
-
const client = _client ?? createClient();
|
|
336
417
|
const dataset = await getDatasetBySelector({
|
|
337
418
|
dataset: experiment.datasetId,
|
|
338
419
|
client,
|
|
@@ -345,14 +426,12 @@ export async function evaluateExperiment({
|
|
|
345
426
|
invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
|
|
346
427
|
|
|
347
428
|
const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
|
|
348
|
-
|
|
349
429
|
if (evaluators?.length === 0) {
|
|
350
430
|
return {
|
|
351
431
|
...experiment,
|
|
352
432
|
evaluationRuns: [],
|
|
353
433
|
};
|
|
354
434
|
}
|
|
355
|
-
|
|
356
435
|
logger.info(
|
|
357
436
|
`🧠 Evaluating experiment "${experiment.id}" with ${evaluators?.length ?? 0} ${pluralize(
|
|
358
437
|
"evaluator",
|
|
@@ -381,40 +460,91 @@ export async function evaluateExperiment({
|
|
|
381
460
|
);
|
|
382
461
|
const evaluatorsQueue = queue(
|
|
383
462
|
async (evaluatorAndRun: { evaluator: Evaluator; run: ExperimentRun }) => {
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
463
|
+
return tracer.startActiveSpan(
|
|
464
|
+
`Evaluation: ${evaluatorAndRun.evaluator.name}`,
|
|
465
|
+
async (span) => {
|
|
466
|
+
const evalResult = await runEvaluator({
|
|
467
|
+
evaluator: evaluatorAndRun.evaluator,
|
|
468
|
+
run: evaluatorAndRun.run,
|
|
469
|
+
exampleCache: examplesById,
|
|
470
|
+
onComplete: onEvaluationComplete,
|
|
471
|
+
logger,
|
|
472
|
+
});
|
|
473
|
+
span.setAttributes({
|
|
474
|
+
[SemanticConventions.OPENINFERENCE_SPAN_KIND]:
|
|
475
|
+
OpenInferenceSpanKind.EVALUATOR,
|
|
476
|
+
[SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
|
|
477
|
+
[SemanticConventions.INPUT_VALUE]: JSON.stringify({
|
|
478
|
+
input: examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
|
|
479
|
+
output: evaluatorAndRun.run.output,
|
|
480
|
+
expected:
|
|
481
|
+
examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
|
|
482
|
+
metadata:
|
|
483
|
+
examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
|
|
484
|
+
}),
|
|
485
|
+
[SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
|
|
486
|
+
[SemanticConventions.OUTPUT_VALUE]: ensureString(evalResult.result),
|
|
487
|
+
});
|
|
488
|
+
if (evalResult.error) {
|
|
489
|
+
span.setStatus({
|
|
490
|
+
code: SpanStatusCode.ERROR,
|
|
491
|
+
message: evalResult.error,
|
|
492
|
+
});
|
|
493
|
+
} else {
|
|
494
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
495
|
+
}
|
|
496
|
+
if (evalResult.result) {
|
|
497
|
+
span.setAttributes(objectAsAttributes(evalResult.result));
|
|
498
|
+
}
|
|
499
|
+
evalResult.traceId = span.spanContext().traceId;
|
|
500
|
+
if (!isDryRun) {
|
|
501
|
+
// Log the evaluation to the server
|
|
502
|
+
// We log this without awaiting (e.g. best effort)
|
|
503
|
+
client.POST("/v1/experiment_evaluations", {
|
|
504
|
+
body: {
|
|
505
|
+
experiment_run_id: evaluatorAndRun.run.id,
|
|
506
|
+
name: evaluatorAndRun.evaluator.name,
|
|
507
|
+
annotator_kind: evaluatorAndRun.evaluator.kind,
|
|
508
|
+
start_time: evalResult.startTime.toISOString(),
|
|
509
|
+
end_time: evalResult.endTime.toISOString(),
|
|
510
|
+
result: {
|
|
511
|
+
...evalResult.result,
|
|
512
|
+
},
|
|
513
|
+
error: evalResult.error,
|
|
514
|
+
trace_id: evalResult.traceId,
|
|
515
|
+
},
|
|
516
|
+
});
|
|
517
|
+
}
|
|
518
|
+
span.end();
|
|
519
|
+
return evalResult;
|
|
520
|
+
}
|
|
521
|
+
);
|
|
409
522
|
},
|
|
410
523
|
concurrency
|
|
411
524
|
);
|
|
525
|
+
if (!evaluatorsAndRuns.length) {
|
|
526
|
+
logger.info(`⛔ No evaluators to run`);
|
|
527
|
+
return {
|
|
528
|
+
...experiment,
|
|
529
|
+
evaluationRuns: [],
|
|
530
|
+
};
|
|
531
|
+
}
|
|
412
532
|
evaluatorsAndRuns.forEach((evaluatorAndRun) =>
|
|
413
|
-
evaluatorsQueue.push(evaluatorAndRun)
|
|
533
|
+
evaluatorsQueue.push(evaluatorAndRun, (err) => {
|
|
534
|
+
if (err) {
|
|
535
|
+
logger.error(
|
|
536
|
+
`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`
|
|
537
|
+
);
|
|
538
|
+
}
|
|
539
|
+
})
|
|
414
540
|
);
|
|
415
541
|
await evaluatorsQueue.drain();
|
|
416
542
|
logger.info(`✅ Evaluation runs completed`);
|
|
417
543
|
|
|
544
|
+
if (provider) {
|
|
545
|
+
await provider.shutdown?.();
|
|
546
|
+
}
|
|
547
|
+
|
|
418
548
|
return {
|
|
419
549
|
...experiment,
|
|
420
550
|
evaluationRuns: Object.values(evaluationRuns),
|
|
@@ -431,25 +561,30 @@ async function runEvaluator({
|
|
|
431
561
|
run,
|
|
432
562
|
exampleCache,
|
|
433
563
|
onComplete,
|
|
564
|
+
logger,
|
|
434
565
|
}: {
|
|
435
566
|
evaluator: Evaluator;
|
|
436
567
|
run: ExperimentRun;
|
|
437
568
|
exampleCache: Record<string, Example>;
|
|
569
|
+
logger: Logger;
|
|
438
570
|
onComplete: (run: ExperimentEvaluationRun) => void;
|
|
439
571
|
}) {
|
|
440
572
|
const example = exampleCache[run.datasetExampleId];
|
|
441
573
|
invariant(example, `Example "${run.datasetExampleId}" not found`);
|
|
442
574
|
const evaluate = async () => {
|
|
575
|
+
logger.info(
|
|
576
|
+
`🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`
|
|
577
|
+
);
|
|
443
578
|
const thisEval: ExperimentEvaluationRun = {
|
|
444
|
-
id:
|
|
445
|
-
traceId: null,
|
|
579
|
+
id: localId(),
|
|
580
|
+
traceId: null,
|
|
446
581
|
experimentRunId: run.id,
|
|
447
582
|
startTime: new Date(),
|
|
448
583
|
endTime: new Date(), // will get replaced with actual end time
|
|
449
584
|
name: evaluator.name,
|
|
450
585
|
result: null,
|
|
451
586
|
error: null,
|
|
452
|
-
annotatorKind:
|
|
587
|
+
annotatorKind: evaluator.kind,
|
|
453
588
|
};
|
|
454
589
|
try {
|
|
455
590
|
const result = await evaluator.evaluate({
|
|
@@ -459,8 +594,14 @@ async function runEvaluator({
|
|
|
459
594
|
metadata: example.metadata,
|
|
460
595
|
});
|
|
461
596
|
thisEval.result = result;
|
|
597
|
+
logger.info(
|
|
598
|
+
`✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`
|
|
599
|
+
);
|
|
462
600
|
} catch (error) {
|
|
463
601
|
thisEval.error = error instanceof Error ? error.message : "Unknown error";
|
|
602
|
+
logger.error(
|
|
603
|
+
`❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`
|
|
604
|
+
);
|
|
464
605
|
}
|
|
465
606
|
thisEval.endTime = new Date();
|
|
466
607
|
onComplete(thisEval);
|
|
@@ -495,17 +636,14 @@ export function asEvaluator({
|
|
|
495
636
|
};
|
|
496
637
|
}
|
|
497
638
|
|
|
498
|
-
let
|
|
639
|
+
let _localIdIndex = 1000;
|
|
499
640
|
|
|
500
641
|
/**
|
|
501
|
-
* Generate a
|
|
642
|
+
* Generate a local id.
|
|
502
643
|
*
|
|
503
|
-
* @
|
|
504
|
-
* @returns A unique id.
|
|
644
|
+
* @returns A semi-unique id.
|
|
505
645
|
*/
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
return _id.toString();
|
|
510
|
-
})();
|
|
646
|
+
function localId(): string {
  // Advance the module-level counter before formatting, so the id reflects
  // the incremented value.
  _localIdIndex += 1;
  return "local_" + String(_localIdIndex);
}
|
package/src/types/experiments.ts
CHANGED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { safelyStringifyJSON } from "./safelyStringifyJSON";
|
|
2
|
+
|
|
3
|
+
/**
 * Coerces a value to a string.
 *
 * A value that is already a string is returned as is. Any other value is
 * passed to `safelyStringifyJSON`, and the `json` field of its result is
 * returned; when that is nullish, an empty string is returned instead.
 *
 * @param value - The value to ensure is a string.
 * @returns The value as a string.
 */
export function ensureString(value: unknown): string {
  return typeof value === "string"
    ? value
    : (safelyStringifyJSON(value)?.json ?? "");
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { AttributeValue } from "@opentelemetry/api";
|
|
2
|
+
|
|
3
|
+
/**
 * Builds a span-attribute map from an object by dropping nullish entries.
 *
 * Only `null` and `undefined` values are removed. Every other value is kept
 * unchanged and is asserted, not checked, to be an `AttributeValue`.
 *
 * @param obj - The object whose own enumerable string-keyed entries are copied.
 * @returns A new object holding each entry of `obj` whose value is neither
 *   `null` nor `undefined`.
 */
export function objectAsAttributes<T extends Record<string, unknown>>(
  obj: T
): Record<string, AttributeValue> {
  return Object.fromEntries(
    // Loose `!= null` rejects both null and undefined, neither of which is
    // a usable attribute value.
    Object.entries(obj).filter(([, value]) => value != null)
  ) as Record<string, AttributeValue>;
}
|