@arizeai/phoenix-client 1.0.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -5
- package/dist/esm/__generated__/api/v1.d.ts +1809 -295
- package/dist/esm/__generated__/api/v1.d.ts.map +1 -1
- package/dist/esm/experiments/runExperiment.d.ts +48 -13
- package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/runExperiment.js +139 -36
- package/dist/esm/experiments/runExperiment.js.map +1 -1
- package/dist/esm/schemas/llm/constants.d.ts +1 -1
- package/dist/esm/schemas/llm/converters.d.ts +4 -4
- package/dist/esm/schemas/llm/openai/converters.d.ts +1 -1
- package/dist/esm/schemas/llm/phoenixPrompt/converters.d.ts +8 -8
- package/dist/esm/schemas/llm/phoenixPrompt/messagePartSchemas.d.ts +5 -5
- package/dist/esm/schemas/llm/phoenixPrompt/messageSchemas.d.ts +8 -8
- package/dist/esm/schemas/llm/schemas.d.ts +4 -4
- package/dist/esm/spans/addSpanAnnotation.d.ts +39 -0
- package/dist/esm/spans/addSpanAnnotation.d.ts.map +1 -0
- package/dist/esm/spans/addSpanAnnotation.js +44 -0
- package/dist/esm/spans/addSpanAnnotation.js.map +1 -0
- package/dist/esm/spans/index.d.ts +3 -0
- package/dist/esm/spans/index.d.ts.map +1 -0
- package/dist/esm/spans/index.js +3 -0
- package/dist/esm/spans/index.js.map +1 -0
- package/dist/esm/spans/logSpanAnnotations.d.ts +51 -0
- package/dist/esm/spans/logSpanAnnotations.d.ts.map +1 -0
- package/dist/esm/spans/logSpanAnnotations.js +53 -0
- package/dist/esm/spans/logSpanAnnotations.js.map +1 -0
- package/dist/esm/spans/types.d.ts +43 -0
- package/dist/esm/spans/types.d.ts.map +1 -0
- package/dist/esm/spans/types.js +18 -0
- package/dist/esm/spans/types.js.map +1 -0
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/annotations.d.ts +2 -1
- package/dist/esm/types/annotations.d.ts.map +1 -1
- package/dist/esm/types/experiments.d.ts +1 -7
- package/dist/esm/types/experiments.d.ts.map +1 -1
- package/dist/esm/utils/formatPromptMessages.d.ts +3 -1
- package/dist/esm/utils/formatPromptMessages.d.ts.map +1 -1
- package/dist/esm/utils/getPromptBySelector.d.ts.map +1 -1
- package/dist/src/__generated__/api/v1.d.ts +1809 -295
- package/dist/src/__generated__/api/v1.d.ts.map +1 -1
- package/dist/src/experiments/runExperiment.d.ts +48 -13
- package/dist/src/experiments/runExperiment.d.ts.map +1 -1
- package/dist/src/experiments/runExperiment.js +138 -36
- package/dist/src/experiments/runExperiment.js.map +1 -1
- package/dist/src/schemas/llm/constants.d.ts +1 -1
- package/dist/src/schemas/llm/converters.d.ts +4 -4
- package/dist/src/schemas/llm/openai/converters.d.ts +1 -1
- package/dist/src/schemas/llm/phoenixPrompt/converters.d.ts +8 -8
- package/dist/src/schemas/llm/phoenixPrompt/messagePartSchemas.d.ts +5 -5
- package/dist/src/schemas/llm/phoenixPrompt/messageSchemas.d.ts +8 -8
- package/dist/src/schemas/llm/schemas.d.ts +4 -4
- package/dist/src/spans/addSpanAnnotation.d.ts +39 -0
- package/dist/src/spans/addSpanAnnotation.d.ts.map +1 -0
- package/dist/src/spans/addSpanAnnotation.js +59 -0
- package/dist/src/spans/addSpanAnnotation.js.map +1 -0
- package/dist/src/spans/index.d.ts +3 -0
- package/dist/src/spans/index.d.ts.map +1 -0
- package/dist/src/spans/index.js +19 -0
- package/dist/src/spans/index.js.map +1 -0
- package/dist/src/spans/logSpanAnnotations.d.ts +51 -0
- package/dist/src/spans/logSpanAnnotations.d.ts.map +1 -0
- package/dist/src/spans/logSpanAnnotations.js +68 -0
- package/dist/src/spans/logSpanAnnotations.js.map +1 -0
- package/dist/src/spans/types.d.ts +43 -0
- package/dist/src/spans/types.d.ts.map +1 -0
- package/dist/src/spans/types.js +22 -0
- package/dist/src/spans/types.js.map +1 -0
- package/dist/src/types/annotations.d.ts +2 -1
- package/dist/src/types/annotations.d.ts.map +1 -1
- package/dist/src/types/experiments.d.ts +1 -7
- package/dist/src/types/experiments.d.ts.map +1 -1
- package/dist/src/utils/formatPromptMessages.d.ts +3 -1
- package/dist/src/utils/formatPromptMessages.d.ts.map +1 -1
- package/dist/src/utils/getPromptBySelector.d.ts.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +8 -2
- package/src/__generated__/api/v1.ts +1809 -295
- package/src/experiments/runExperiment.ts +211 -74
- package/src/spans/addSpanAnnotation.ts +59 -0
- package/src/spans/index.ts +2 -0
- package/src/spans/logSpanAnnotations.ts +71 -0
- package/src/spans/types.ts +60 -0
- package/src/types/annotations.ts +4 -1
- package/src/types/experiments.ts +1 -7
|
@@ -1,5 +1,8 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { queue } from "async";
|
|
2
|
+
import invariant from "tiny-invariant";
|
|
2
3
|
import { createClient, type PhoenixClient } from "../client";
|
|
4
|
+
import { ClientFn } from "../types/core";
|
|
5
|
+
import { Dataset, Example } from "../types/datasets";
|
|
3
6
|
import type {
|
|
4
7
|
Evaluator,
|
|
5
8
|
Experiment,
|
|
@@ -9,18 +12,16 @@ import type {
|
|
|
9
12
|
ExperimentTask,
|
|
10
13
|
RanExperiment,
|
|
11
14
|
} from "../types/experiments";
|
|
12
|
-
import { promisifyResult } from "../utils/promisifyResult";
|
|
13
|
-
import invariant from "tiny-invariant";
|
|
14
|
-
import { pluralize } from "../utils/pluralize";
|
|
15
|
-
import { ClientFn } from "../types/core";
|
|
16
|
-
import { getDatasetBySelector } from "../utils/getDatasetBySelector";
|
|
17
15
|
import { type Logger } from "../types/logger";
|
|
16
|
+
import { getDatasetBySelector } from "../utils/getDatasetBySelector";
|
|
17
|
+
import { pluralize } from "../utils/pluralize";
|
|
18
|
+
import { promisifyResult } from "../utils/promisifyResult";
|
|
19
|
+
import { AnnotatorKind } from "../types/annotations";
|
|
18
20
|
|
|
19
21
|
/**
|
|
20
22
|
* Parameters for running an experiment.
|
|
21
23
|
*
|
|
22
24
|
* @experimental This feature is not complete, and will change in the future.
|
|
23
|
-
* @deprecated This function will be un-marked as deprecated once the experimental feature flag is removed.
|
|
24
25
|
*/
|
|
25
26
|
export type RunExperimentParams = ClientFn & {
|
|
26
27
|
/**
|
|
@@ -28,6 +29,14 @@ export type RunExperimentParams = ClientFn & {
|
|
|
28
29
|
* Defaults to the dataset name + a timestamp
|
|
29
30
|
*/
|
|
30
31
|
experimentName?: string;
|
|
32
|
+
/**
|
|
33
|
+
* The description of the experiment
|
|
34
|
+
*/
|
|
35
|
+
experimentDescription?: string;
|
|
36
|
+
/**
|
|
37
|
+
* Experiment metadata
|
|
38
|
+
*/
|
|
39
|
+
experimentMetadata?: Record<string, unknown>;
|
|
31
40
|
/**
|
|
32
41
|
* The dataset to run the experiment on
|
|
33
42
|
*/
|
|
@@ -40,10 +49,6 @@ export type RunExperimentParams = ClientFn & {
|
|
|
40
49
|
* The evaluators to use
|
|
41
50
|
*/
|
|
42
51
|
evaluators?: Evaluator[];
|
|
43
|
-
/**
|
|
44
|
-
* The number of repetitions to run
|
|
45
|
-
*/
|
|
46
|
-
repetitions?: number;
|
|
47
52
|
/**
|
|
48
53
|
* The project under which the experiment task traces are recorded
|
|
49
54
|
*/
|
|
@@ -56,43 +61,96 @@ export type RunExperimentParams = ClientFn & {
|
|
|
56
61
|
* Whether to record the experiment results
|
|
57
62
|
*/
|
|
58
63
|
record?: boolean;
|
|
64
|
+
/**
|
|
65
|
+
* The number of dataset examples to run in parallel
|
|
66
|
+
*/
|
|
67
|
+
concurrency?: number;
|
|
68
|
+
/**
|
|
69
|
+
* Whether or not to run the experiment as a dry run. If a number is provided, n examples will be run.
|
|
70
|
+
* @default false
|
|
71
|
+
*/
|
|
72
|
+
dryRun?: number | boolean;
|
|
59
73
|
};
|
|
60
74
|
|
|
61
75
|
/**
|
|
62
76
|
* Run an experiment.
|
|
63
77
|
*
|
|
78
|
+
* @example
|
|
79
|
+
* ```ts
|
|
80
|
+
* import { asEvaluator, runExperiment } from "@arizeai/phoenix-client/experiments";
|
|
81
|
+
*
|
|
82
|
+
* const experiment = await runExperiment({
|
|
83
|
+
* dataset: "my-dataset",
|
|
84
|
+
* task: async (example) => example.input,
|
|
85
|
+
* evaluators: [
|
|
86
|
+
* asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
|
|
87
|
+
* ],
|
|
88
|
+
* });
|
|
89
|
+
* ```
|
|
90
|
+
*
|
|
64
91
|
* @experimental This feature is not complete, and will change in the future.
|
|
65
|
-
* @deprecated This function will be un-marked as deprecated once the experimental feature flag is removed.
|
|
66
92
|
*/
|
|
67
93
|
export async function runExperiment({
|
|
68
94
|
experimentName: _experimentName,
|
|
95
|
+
experimentDescription,
|
|
96
|
+
experimentMetadata,
|
|
69
97
|
client: _client,
|
|
70
98
|
dataset: _dataset,
|
|
71
99
|
task,
|
|
72
100
|
evaluators,
|
|
73
|
-
repetitions = 1,
|
|
74
101
|
projectName = "default",
|
|
75
102
|
logger = console,
|
|
76
103
|
record = true,
|
|
104
|
+
concurrency = 5,
|
|
105
|
+
dryRun = false,
|
|
77
106
|
}: RunExperimentParams): Promise<RanExperiment> {
|
|
107
|
+
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
78
108
|
const client = _client ?? createClient();
|
|
79
109
|
const dataset = await getDatasetBySelector({ dataset: _dataset, client });
|
|
80
110
|
invariant(dataset, `Dataset not found`);
|
|
81
111
|
invariant(dataset.examples.length > 0, `Dataset has no examples`);
|
|
112
|
+
const nExamples =
|
|
113
|
+
typeof dryRun === "number"
|
|
114
|
+
? Math.max(dryRun, dataset.examples.length)
|
|
115
|
+
: dataset.examples.length;
|
|
116
|
+
|
|
82
117
|
const experimentName =
|
|
83
118
|
_experimentName ?? `${dataset.name}-${new Date().toISOString()}`;
|
|
84
119
|
const experimentParams: ExperimentParameters = {
|
|
85
|
-
|
|
86
|
-
// TODO: Make configurable?
|
|
87
|
-
nExamples: dataset.examples.length,
|
|
88
|
-
};
|
|
89
|
-
const experiment: Experiment = {
|
|
90
|
-
id: id(),
|
|
91
|
-
datasetId: dataset.id,
|
|
92
|
-
datasetVersionId: dataset.versionId,
|
|
93
|
-
repetitions,
|
|
94
|
-
projectName,
|
|
120
|
+
nExamples,
|
|
95
121
|
};
|
|
122
|
+
let experiment: Experiment;
|
|
123
|
+
if (isDryRun) {
|
|
124
|
+
experiment = {
|
|
125
|
+
id: id(),
|
|
126
|
+
datasetId: dataset.id,
|
|
127
|
+
datasetVersionId: dataset.versionId,
|
|
128
|
+
projectName,
|
|
129
|
+
};
|
|
130
|
+
} else {
|
|
131
|
+
const experimentResponse = await client
|
|
132
|
+
.POST("/v1/datasets/{dataset_id}/experiments", {
|
|
133
|
+
params: {
|
|
134
|
+
path: {
|
|
135
|
+
dataset_id: dataset.id,
|
|
136
|
+
},
|
|
137
|
+
},
|
|
138
|
+
body: {
|
|
139
|
+
name: experimentName,
|
|
140
|
+
description: experimentDescription,
|
|
141
|
+
metadata: experimentMetadata,
|
|
142
|
+
project_name: projectName,
|
|
143
|
+
},
|
|
144
|
+
})
|
|
145
|
+
.then((res) => res.data?.data);
|
|
146
|
+
invariant(experimentResponse, `Failed to create experiment`);
|
|
147
|
+
experiment = {
|
|
148
|
+
id: experimentResponse.id,
|
|
149
|
+
datasetId: dataset.id,
|
|
150
|
+
datasetVersionId: dataset.versionId,
|
|
151
|
+
projectName,
|
|
152
|
+
};
|
|
153
|
+
}
|
|
96
154
|
|
|
97
155
|
if (!record) {
|
|
98
156
|
logger.info(
|
|
@@ -104,30 +162,25 @@ export async function runExperiment({
|
|
|
104
162
|
`🧪 Starting experiment "${experimentName}" on dataset "${dataset.id}" with task "${task.name}" and ${evaluators?.length ?? 0} ${pluralize(
|
|
105
163
|
"evaluator",
|
|
106
164
|
evaluators?.length ?? 0
|
|
107
|
-
)}`
|
|
108
|
-
);
|
|
109
|
-
|
|
110
|
-
logger.info(
|
|
111
|
-
`🔁 Running ${repetitions} ${pluralize("repetition", repetitions)} of task "${task.name}"`
|
|
165
|
+
)} and ${concurrency} concurrent runs`
|
|
112
166
|
);
|
|
113
167
|
|
|
114
168
|
// Run task against the selected dataset examples
|
|
115
169
|
type ExperimentRunId = string;
|
|
116
170
|
const runs: Record<ExperimentRunId, ExperimentRun> = {};
|
|
117
|
-
await
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
);
|
|
171
|
+
await runTask({
|
|
172
|
+
client,
|
|
173
|
+
experimentId: experiment.id,
|
|
174
|
+
task,
|
|
175
|
+
dataset,
|
|
176
|
+
logger,
|
|
177
|
+
onComplete: (run) => {
|
|
178
|
+
runs[run.id] = run;
|
|
179
|
+
},
|
|
180
|
+
concurrency,
|
|
181
|
+
isDryRun,
|
|
182
|
+
nExamples,
|
|
183
|
+
});
|
|
131
184
|
logger.info(`✅ Task runs completed`);
|
|
132
185
|
|
|
133
186
|
const ranExperiment: RanExperiment = {
|
|
@@ -141,6 +194,8 @@ export async function runExperiment({
|
|
|
141
194
|
evaluators: evaluators ?? [],
|
|
142
195
|
client,
|
|
143
196
|
logger,
|
|
197
|
+
concurrency,
|
|
198
|
+
dryRun,
|
|
144
199
|
});
|
|
145
200
|
ranExperiment.evaluationRuns = evaluationRuns;
|
|
146
201
|
|
|
@@ -150,40 +205,49 @@ export async function runExperiment({
|
|
|
150
205
|
}
|
|
151
206
|
|
|
152
207
|
/**
|
|
153
|
-
* Run a task against
|
|
208
|
+
* Run a task against n examples in a dataset.
|
|
154
209
|
*/
|
|
155
210
|
function runTask({
|
|
211
|
+
client,
|
|
156
212
|
experimentId,
|
|
157
213
|
task,
|
|
158
214
|
dataset,
|
|
159
|
-
repetition,
|
|
160
215
|
onComplete,
|
|
161
216
|
logger,
|
|
217
|
+
concurrency = 5,
|
|
218
|
+
isDryRun,
|
|
219
|
+
nExamples,
|
|
162
220
|
}: {
|
|
221
|
+
/** The client to use */
|
|
222
|
+
client: PhoenixClient;
|
|
163
223
|
/** The id of the experiment */
|
|
164
224
|
experimentId: string;
|
|
165
225
|
/** The task to run */
|
|
166
226
|
task: ExperimentTask;
|
|
167
227
|
/** The dataset to run the task on */
|
|
168
228
|
dataset: Dataset;
|
|
169
|
-
/** The repetition number */
|
|
170
|
-
repetition: number;
|
|
171
229
|
/** A callback to call when the task is complete */
|
|
172
230
|
onComplete: (run: ExperimentRun) => void;
|
|
173
231
|
/** The logger to use */
|
|
174
232
|
logger: Logger;
|
|
233
|
+
/** The number of examples to run in parallel */
|
|
234
|
+
concurrency: number;
|
|
235
|
+
/** Whether to run the task as a dry run */
|
|
236
|
+
isDryRun: boolean;
|
|
237
|
+
/** The number of examples to run */
|
|
238
|
+
nExamples: number;
|
|
175
239
|
}) {
|
|
176
|
-
logger.info(
|
|
177
|
-
`🔧 (${repetition}) Running task "${task.name}" on dataset "${dataset.id}"`
|
|
178
|
-
);
|
|
240
|
+
logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
|
|
179
241
|
const run = async (example: Example) => {
|
|
242
|
+
logger.info(
|
|
243
|
+
`🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`
|
|
244
|
+
);
|
|
180
245
|
const thisRun: ExperimentRun = {
|
|
181
246
|
id: id(),
|
|
182
|
-
traceId:
|
|
247
|
+
traceId: null, // TODO: fill this in once we trace experiments
|
|
183
248
|
experimentId,
|
|
184
249
|
datasetExampleId: example.id,
|
|
185
250
|
startTime: new Date(),
|
|
186
|
-
repetitionNumber: repetition,
|
|
187
251
|
endTime: new Date(), // will get replaced with actual end time
|
|
188
252
|
output: null,
|
|
189
253
|
error: null,
|
|
@@ -199,22 +263,49 @@ function runTask({
|
|
|
199
263
|
thisRun.error = error instanceof Error ? error.message : "Unknown error";
|
|
200
264
|
}
|
|
201
265
|
thisRun.endTime = new Date();
|
|
266
|
+
if (!isDryRun) {
|
|
267
|
+
// Log the run to the server
|
|
268
|
+
// We await the response so the server-assigned run id can replace the local one
|
|
269
|
+
const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
|
|
270
|
+
params: {
|
|
271
|
+
path: {
|
|
272
|
+
experiment_id: experimentId,
|
|
273
|
+
},
|
|
274
|
+
},
|
|
275
|
+
body: {
|
|
276
|
+
dataset_example_id: example.id,
|
|
277
|
+
output: thisRun.output,
|
|
278
|
+
repetition_number: 0,
|
|
279
|
+
start_time: thisRun.startTime.toISOString(),
|
|
280
|
+
end_time: thisRun.endTime.toISOString(),
|
|
281
|
+
trace_id: thisRun.traceId,
|
|
282
|
+
error: thisRun.error,
|
|
283
|
+
},
|
|
284
|
+
});
|
|
285
|
+
// replace the local run id with the server-assigned id
|
|
286
|
+
thisRun.id = res.data?.data.id ?? thisRun.id;
|
|
287
|
+
}
|
|
202
288
|
onComplete(thisRun);
|
|
289
|
+
return thisRun;
|
|
203
290
|
};
|
|
204
|
-
|
|
291
|
+
const q = queue(run, concurrency);
|
|
292
|
+
const examplesToUse = dataset.examples.slice(0, nExamples);
|
|
293
|
+
examplesToUse.forEach((example) => q.push(example));
|
|
294
|
+
return q.drain();
|
|
205
295
|
}
|
|
206
296
|
|
|
207
297
|
/**
|
|
208
298
|
* Evaluate an experiment.
|
|
209
299
|
*
|
|
210
300
|
* @experimental This feature is not complete, and will change in the future.
|
|
211
|
-
* @deprecated This function will be un-marked as deprecated once the experimental feature flag is removed.
|
|
212
301
|
*/
|
|
213
302
|
export async function evaluateExperiment({
|
|
214
303
|
experiment,
|
|
215
304
|
evaluators,
|
|
216
305
|
client: _client,
|
|
217
306
|
logger,
|
|
307
|
+
concurrency = 5,
|
|
308
|
+
dryRun = false,
|
|
218
309
|
}: {
|
|
219
310
|
/**
|
|
220
311
|
* The experiment to evaluate
|
|
@@ -227,7 +318,20 @@ export async function evaluateExperiment({
|
|
|
227
318
|
client?: PhoenixClient;
|
|
228
319
|
/** The logger to use */
|
|
229
320
|
logger: Logger;
|
|
321
|
+
/** The number of evaluators to run in parallel */
|
|
322
|
+
concurrency: number;
|
|
323
|
+
/**
|
|
324
|
+
* Whether to run the evaluation as a dry run
|
|
325
|
+
* If a number is provided, the evaluation will be run for the first n runs
|
|
326
|
+
* @default false
|
|
327
|
+
* */
|
|
328
|
+
dryRun?: boolean | number;
|
|
230
329
|
}): Promise<RanExperiment> {
|
|
330
|
+
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
331
|
+
const nRuns =
|
|
332
|
+
typeof dryRun === "number"
|
|
333
|
+
? Math.max(dryRun, Object.keys(experiment.runs).length)
|
|
334
|
+
: Object.keys(experiment.runs).length;
|
|
231
335
|
const client = _client ?? createClient();
|
|
232
336
|
const dataset = await getDatasetBySelector({
|
|
233
337
|
dataset: experiment.datasetId,
|
|
@@ -240,6 +344,8 @@ export async function evaluateExperiment({
|
|
|
240
344
|
);
|
|
241
345
|
invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
|
|
242
346
|
|
|
347
|
+
const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
|
|
348
|
+
|
|
243
349
|
if (evaluators?.length === 0) {
|
|
244
350
|
return {
|
|
245
351
|
...experiment,
|
|
@@ -266,21 +372,47 @@ export async function evaluateExperiment({
|
|
|
266
372
|
};
|
|
267
373
|
|
|
268
374
|
// Run evaluators against all runs
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
run,
|
|
276
|
-
exampleCache: examplesById,
|
|
277
|
-
onComplete: onEvaluationComplete,
|
|
278
|
-
})
|
|
279
|
-
)
|
|
280
|
-
)
|
|
281
|
-
)
|
|
375
|
+
// Flat list of evaluator + run tuples
|
|
376
|
+
const evaluatorsAndRuns = evaluators.flatMap((evaluator) =>
|
|
377
|
+
runsToEvaluate.map((run) => ({
|
|
378
|
+
evaluator,
|
|
379
|
+
run,
|
|
380
|
+
}))
|
|
282
381
|
);
|
|
283
|
-
|
|
382
|
+
const evaluatorsQueue = queue(
|
|
383
|
+
async (evaluatorAndRun: { evaluator: Evaluator; run: ExperimentRun }) => {
|
|
384
|
+
const evalResult = await runEvaluator({
|
|
385
|
+
evaluator: evaluatorAndRun.evaluator,
|
|
386
|
+
run: evaluatorAndRun.run,
|
|
387
|
+
exampleCache: examplesById,
|
|
388
|
+
onComplete: onEvaluationComplete,
|
|
389
|
+
});
|
|
390
|
+
if (!isDryRun) {
|
|
391
|
+
logger.info(`📝 Logging evaluation ${evalResult.id}`);
|
|
392
|
+
// Log the evaluation to the server
|
|
393
|
+
// We log this without awaiting (i.e. best effort)
|
|
394
|
+
client.POST("/v1/experiment_evaluations", {
|
|
395
|
+
body: {
|
|
396
|
+
experiment_run_id: evaluatorAndRun.run.id,
|
|
397
|
+
name: evaluatorAndRun.evaluator.name,
|
|
398
|
+
annotator_kind: evaluatorAndRun.evaluator.kind,
|
|
399
|
+
start_time: evalResult.startTime.toISOString(),
|
|
400
|
+
end_time: evalResult.endTime.toISOString(),
|
|
401
|
+
result: {
|
|
402
|
+
...evalResult.result,
|
|
403
|
+
},
|
|
404
|
+
error: evalResult.error,
|
|
405
|
+
trace_id: evalResult.traceId,
|
|
406
|
+
},
|
|
407
|
+
});
|
|
408
|
+
}
|
|
409
|
+
},
|
|
410
|
+
concurrency
|
|
411
|
+
);
|
|
412
|
+
evaluatorsAndRuns.forEach((evaluatorAndRun) =>
|
|
413
|
+
evaluatorsQueue.push(evaluatorAndRun)
|
|
414
|
+
);
|
|
415
|
+
await evaluatorsQueue.drain();
|
|
284
416
|
logger.info(`✅ Evaluation runs completed`);
|
|
285
417
|
|
|
286
418
|
return {
|
|
@@ -293,7 +425,6 @@ export async function evaluateExperiment({
|
|
|
293
425
|
* Run an evaluator against a run.
|
|
294
426
|
*
|
|
295
427
|
* @experimental This feature is not complete, and will change in the future.
|
|
296
|
-
* @deprecated This function will be un-marked as deprecated once the experimental feature flag is removed.
|
|
297
428
|
*/
|
|
298
429
|
async function runEvaluator({
|
|
299
430
|
evaluator,
|
|
@@ -311,7 +442,7 @@ async function runEvaluator({
|
|
|
311
442
|
const evaluate = async () => {
|
|
312
443
|
const thisEval: ExperimentEvaluationRun = {
|
|
313
444
|
id: id(),
|
|
314
|
-
traceId:
|
|
445
|
+
traceId: null, // TODO: fill this in once we trace experiments
|
|
315
446
|
experimentRunId: run.id,
|
|
316
447
|
startTime: new Date(),
|
|
317
448
|
endTime: new Date(), // will get replaced with actual end time
|
|
@@ -333,6 +464,7 @@ async function runEvaluator({
|
|
|
333
464
|
}
|
|
334
465
|
thisEval.endTime = new Date();
|
|
335
466
|
onComplete(thisEval);
|
|
467
|
+
return thisEval;
|
|
336
468
|
};
|
|
337
469
|
|
|
338
470
|
return evaluate();
|
|
@@ -342,18 +474,23 @@ async function runEvaluator({
|
|
|
342
474
|
* Wrap an evaluator function in an object with name and kind properties.
|
|
343
475
|
*
|
|
344
476
|
* @experimental This feature is not complete, and will change in the future.
|
|
345
|
-
* @deprecated This function will be un-marked as deprecated once the experimental feature flag is removed.
|
|
346
477
|
*
|
|
347
478
|
* @param name - The name of the evaluator.
|
|
348
479
|
* @param evaluate - The evaluator function.
|
|
349
480
|
* @returns The evaluator object.
|
|
350
481
|
*/
|
|
351
|
-
export function asEvaluator(
|
|
352
|
-
name
|
|
353
|
-
|
|
354
|
-
|
|
482
|
+
export function asEvaluator({
|
|
483
|
+
name,
|
|
484
|
+
kind,
|
|
485
|
+
evaluate,
|
|
486
|
+
}: {
|
|
487
|
+
name: string;
|
|
488
|
+
kind: AnnotatorKind;
|
|
489
|
+
evaluate: Evaluator["evaluate"];
|
|
490
|
+
}): Evaluator {
|
|
355
491
|
return {
|
|
356
492
|
name,
|
|
493
|
+
kind,
|
|
357
494
|
evaluate,
|
|
358
495
|
};
|
|
359
496
|
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { createClient } from "../client";
|
|
2
|
+
import { ClientFn } from "../types/core";
|
|
3
|
+
import { SpanAnnotation, toSpanAnnotationData } from "./types";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Parameters to add a span annotation
|
|
7
|
+
*/
|
|
8
|
+
interface AddSpanAnnotationParams extends ClientFn {
|
|
9
|
+
spanAnnotation: SpanAnnotation;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Add an annotation to a span.
|
|
14
|
+
*
|
|
15
|
+
* The annotation can be of type "LLM", "CODE", or "HUMAN" and can include a label, score, and metadata.
|
|
16
|
+
* If an identifier is provided and an annotation with that identifier already exists, it will be updated.
|
|
17
|
+
*
|
|
18
|
+
* @param params - The parameters to add a span annotation
|
|
19
|
+
* @returns The ID of the created or updated annotation
|
|
20
|
+
*
|
|
21
|
+
* @example
|
|
22
|
+
* ```ts
|
|
23
|
+
* const result = await addSpanAnnotation({
|
|
24
|
+
* spanAnnotation: {
|
|
25
|
+
* spanId: "123abc",
|
|
26
|
+
* name: "quality_score",
|
|
27
|
+
* label: "good",
|
|
28
|
+
* score: 0.95,
|
|
29
|
+
* annotatorKind: "LLM",
|
|
30
|
+
* identifier: "custom_id_123",
|
|
31
|
+
* metadata: {
|
|
32
|
+
* model: "gpt-4"
|
|
33
|
+
* }
|
|
34
|
+
* }
|
|
35
|
+
* });
|
|
36
|
+
* ```
|
|
37
|
+
*/
|
|
38
|
+
export async function addSpanAnnotation({
|
|
39
|
+
client: _client,
|
|
40
|
+
spanAnnotation,
|
|
41
|
+
}: AddSpanAnnotationParams): Promise<{ id: string }> {
|
|
42
|
+
const client = _client ?? createClient();
|
|
43
|
+
|
|
44
|
+
const { data, error } = await client.POST("/v1/span_annotations", {
|
|
45
|
+
body: {
|
|
46
|
+
data: [toSpanAnnotationData(spanAnnotation)],
|
|
47
|
+
},
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
if (error) {
|
|
51
|
+
throw new Error(`Failed to add span annotation: ${error}`);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
if (!data?.data?.[0]?.id) {
|
|
55
|
+
throw new Error("No annotation ID returned from server");
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
return data.data[0];
|
|
59
|
+
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { createClient } from "../client";
|
|
2
|
+
import { ClientFn } from "../types/core";
|
|
3
|
+
import { SpanAnnotation, toSpanAnnotationData } from "./types";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Parameters to log multiple span annotations
|
|
7
|
+
*/
|
|
8
|
+
interface LogSpanAnnotationsParams extends ClientFn {
|
|
9
|
+
/**
|
|
10
|
+
* The span annotations to log
|
|
11
|
+
*/
|
|
12
|
+
spanAnnotations: SpanAnnotation[];
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Log multiple span annotations in a single request.
|
|
17
|
+
*
|
|
18
|
+
* Each annotation can be of type "LLM", "CODE", or "HUMAN" and can include a label, score, and metadata.
|
|
19
|
+
* If an identifier is provided and an annotation with that identifier already exists, it will be updated.
|
|
20
|
+
*
|
|
21
|
+
* @param params - The parameters to log span annotations
|
|
22
|
+
* @returns The IDs of the created or updated annotations
|
|
23
|
+
*
|
|
24
|
+
* @example
|
|
25
|
+
* ```ts
|
|
26
|
+
* const results = await logSpanAnnotations({
|
|
27
|
+
* spanAnnotations: [
|
|
28
|
+
* {
|
|
29
|
+
* spanId: "123abc",
|
|
30
|
+
* name: "quality_score",
|
|
31
|
+
* label: "good",
|
|
32
|
+
* score: 0.95,
|
|
33
|
+
* annotatorKind: "LLM",
|
|
34
|
+
* identifier: "custom_id_123",
|
|
35
|
+
* metadata: {
|
|
36
|
+
* model: "gpt-4"
|
|
37
|
+
* }
|
|
38
|
+
* },
|
|
39
|
+
* {
|
|
40
|
+
* spanId: "456def",
|
|
41
|
+
* name: "sentiment",
|
|
42
|
+
* label: "positive",
|
|
43
|
+
* score: 0.8,
|
|
44
|
+
* annotatorKind: "CODE"
|
|
45
|
+
* }
|
|
46
|
+
* ]
|
|
47
|
+
* });
|
|
48
|
+
* ```
|
|
49
|
+
*/
|
|
50
|
+
export async function logSpanAnnotations({
|
|
51
|
+
client: _client,
|
|
52
|
+
spanAnnotations,
|
|
53
|
+
}: LogSpanAnnotationsParams): Promise<{ id: string }[]> {
|
|
54
|
+
const client = _client ?? createClient();
|
|
55
|
+
|
|
56
|
+
const { data, error } = await client.POST("/v1/span_annotations", {
|
|
57
|
+
body: {
|
|
58
|
+
data: spanAnnotations.map(toSpanAnnotationData),
|
|
59
|
+
},
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
if (error) {
|
|
63
|
+
throw new Error(`Failed to log span annotations: ${error}`);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
if (!data?.data?.length) {
|
|
67
|
+
throw new Error("No annotation IDs returned from server");
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
return data.data;
|
|
71
|
+
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { paths } from "../__generated__/api/v1";
|
|
2
|
+
|
|
3
|
+
type SpanAnnotationData =
|
|
4
|
+
paths["/v1/span_annotations"]["post"]["requestBody"]["content"]["application/json"]["data"][0];
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Parameters for a single span annotation
|
|
8
|
+
*/
|
|
9
|
+
export interface SpanAnnotation {
|
|
10
|
+
/**
|
|
11
|
+
* The OpenTelemetry Span ID (hex format without 0x prefix)
|
|
12
|
+
*/
|
|
13
|
+
spanId: string;
|
|
14
|
+
/**
|
|
15
|
+
* The name of the annotation
|
|
16
|
+
*/
|
|
17
|
+
name: string;
|
|
18
|
+
/**
|
|
19
|
+
* The label assigned by the annotation
|
|
20
|
+
*/
|
|
21
|
+
label?: string;
|
|
22
|
+
/**
|
|
23
|
+
* The score assigned by the annotation
|
|
24
|
+
*/
|
|
25
|
+
score?: number;
|
|
26
|
+
/**
|
|
27
|
+
* The identifier of the annotation. If provided, the annotation will be updated if it already exists.
|
|
28
|
+
*/
|
|
29
|
+
identifier?: string;
|
|
30
|
+
/**
|
|
31
|
+
* Metadata for the annotation
|
|
32
|
+
*/
|
|
33
|
+
metadata?: Record<string, unknown>;
|
|
34
|
+
/**
|
|
35
|
+
* The kind of annotator used for the annotation
|
|
36
|
+
* Can be "HUMAN", "LLM", or "CODE"
|
|
37
|
+
* @default "HUMAN"
|
|
38
|
+
*/
|
|
39
|
+
annotatorKind?: SpanAnnotationData["annotator_kind"];
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Convert a SpanAnnotation to the API format
|
|
44
|
+
*/
|
|
45
|
+
export function toSpanAnnotationData(
|
|
46
|
+
annotation: SpanAnnotation
|
|
47
|
+
): SpanAnnotationData {
|
|
48
|
+
return {
|
|
49
|
+
span_id: annotation.spanId,
|
|
50
|
+
name: annotation.name,
|
|
51
|
+
annotator_kind: annotation.annotatorKind ?? "HUMAN",
|
|
52
|
+
result: {
|
|
53
|
+
label: annotation.label ?? null,
|
|
54
|
+
score: annotation.score ?? null,
|
|
55
|
+
explanation: null,
|
|
56
|
+
},
|
|
57
|
+
metadata: annotation.metadata ?? null,
|
|
58
|
+
identifier: annotation.identifier ?? "",
|
|
59
|
+
};
|
|
60
|
+
}
|
package/src/types/annotations.ts
CHANGED