@arizeai/phoenix-client 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/datasets/getDatasetInfoByName.d.ts.map +1 -1
- package/dist/esm/datasets/getDatasetInfoByName.js +4 -2
- package/dist/esm/datasets/getDatasetInfoByName.js.map +1 -1
- package/dist/esm/schemas/llm/anthropic/converters.d.ts +14 -14
- package/dist/esm/schemas/llm/anthropic/messagePartSchemas.d.ts +4 -4
- package/dist/esm/schemas/llm/anthropic/messageSchemas.d.ts +12 -12
- package/dist/esm/schemas/llm/anthropic/toolCallSchemas.d.ts +4 -4
- package/dist/esm/schemas/llm/constants.d.ts +3 -3
- package/dist/esm/schemas/llm/converters.d.ts +12 -12
- package/dist/esm/schemas/llm/openai/converters.d.ts +3 -3
- package/dist/esm/schemas/llm/schemas.d.ts +12 -12
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/src/datasets/appendDatasetExamples.js +34 -45
- package/dist/src/datasets/appendDatasetExamples.js.map +1 -1
- package/dist/src/datasets/createDataset.js +25 -36
- package/dist/src/datasets/createDataset.js.map +1 -1
- package/dist/src/datasets/getDataset.js +7 -18
- package/dist/src/datasets/getDataset.js.map +1 -1
- package/dist/src/datasets/getDatasetExamples.js +25 -36
- package/dist/src/datasets/getDatasetExamples.js.map +1 -1
- package/dist/src/datasets/getDatasetInfo.js +22 -33
- package/dist/src/datasets/getDatasetInfo.js.map +1 -1
- package/dist/src/datasets/getDatasetInfoByName.d.ts.map +1 -1
- package/dist/src/datasets/getDatasetInfoByName.js +19 -28
- package/dist/src/datasets/getDatasetInfoByName.js.map +1 -1
- package/dist/src/datasets/listDatasets.js +6 -17
- package/dist/src/datasets/listDatasets.js.map +1 -1
- package/dist/src/experiments/getExperiment.js +13 -24
- package/dist/src/experiments/getExperiment.js.map +1 -1
- package/dist/src/experiments/getExperimentInfo.js +15 -26
- package/dist/src/experiments/getExperimentInfo.js.map +1 -1
- package/dist/src/experiments/getExperimentRuns.js +24 -35
- package/dist/src/experiments/getExperimentRuns.js.map +1 -1
- package/dist/src/experiments/runExperiment.js +280 -295
- package/dist/src/experiments/runExperiment.js.map +1 -1
- package/dist/src/prompts/createPrompt.js +14 -25
- package/dist/src/prompts/createPrompt.js.map +1 -1
- package/dist/src/prompts/getPrompt.js +4 -15
- package/dist/src/prompts/getPrompt.js.map +1 -1
- package/dist/src/schemas/llm/anthropic/converters.d.ts +14 -14
- package/dist/src/schemas/llm/anthropic/messagePartSchemas.d.ts +4 -4
- package/dist/src/schemas/llm/anthropic/messageSchemas.d.ts +12 -12
- package/dist/src/schemas/llm/anthropic/toolCallSchemas.d.ts +4 -4
- package/dist/src/schemas/llm/constants.d.ts +3 -3
- package/dist/src/schemas/llm/converters.d.ts +12 -12
- package/dist/src/schemas/llm/openai/converters.d.ts +3 -3
- package/dist/src/schemas/llm/schemas.d.ts +12 -12
- package/dist/src/spans/addSpanAnnotation.js +14 -25
- package/dist/src/spans/addSpanAnnotation.js.map +1 -1
- package/dist/src/spans/getSpanAnnotations.js +29 -40
- package/dist/src/spans/getSpanAnnotations.js.map +1 -1
- package/dist/src/spans/getSpans.js +29 -40
- package/dist/src/spans/getSpans.js.map +1 -1
- package/dist/src/spans/logSpanAnnotations.js +14 -25
- package/dist/src/spans/logSpanAnnotations.js.map +1 -1
- package/dist/src/utils/getPromptBySelector.js +37 -48
- package/dist/src/utils/getPromptBySelector.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +1 -1
- package/src/datasets/getDatasetInfoByName.ts +4 -2
|
@@ -1,13 +1,4 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
-
});
|
|
10
|
-
};
|
|
11
2
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
3
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
4
|
};
|
|
@@ -59,138 +50,136 @@ const urlUtils_1 = require("../utils/urlUtils");
|
|
|
59
50
|
* });
|
|
60
51
|
* ```
|
|
61
52
|
*/
|
|
62
|
-
function runExperiment(
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
dataset_id: dataset.id,
|
|
94
|
-
},
|
|
95
|
-
},
|
|
96
|
-
body: {
|
|
97
|
-
name: experimentName,
|
|
98
|
-
description: experimentDescription,
|
|
99
|
-
metadata: experimentMetadata,
|
|
100
|
-
project_name: projectName,
|
|
53
|
+
async function runExperiment({ experimentName, experimentDescription, experimentMetadata = {}, client: _client, dataset: DatasetSelector, task, evaluators, logger = console, record = true, concurrency = 5, dryRun = false, }) {
|
|
54
|
+
var _a, _b, _c, _d, _e;
|
|
55
|
+
let provider;
|
|
56
|
+
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
57
|
+
const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
|
|
58
|
+
const dataset = await (0, getDataset_1.getDataset)({ dataset: DatasetSelector, client });
|
|
59
|
+
(0, tiny_invariant_1.default)(dataset, `Dataset not found`);
|
|
60
|
+
(0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset has no examples`);
|
|
61
|
+
const nExamples = typeof dryRun === "number"
|
|
62
|
+
? Math.min(dryRun, dataset.examples.length)
|
|
63
|
+
: dataset.examples.length;
|
|
64
|
+
let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
|
|
65
|
+
// initialize the tracer into scope
|
|
66
|
+
let taskTracer;
|
|
67
|
+
let experiment;
|
|
68
|
+
if (isDryRun) {
|
|
69
|
+
experiment = {
|
|
70
|
+
id: localId(),
|
|
71
|
+
datasetId: dataset.id,
|
|
72
|
+
datasetVersionId: dataset.versionId,
|
|
73
|
+
projectName,
|
|
74
|
+
metadata: experimentMetadata,
|
|
75
|
+
};
|
|
76
|
+
taskTracer = (0, instrumention_1.createNoOpProvider)().getTracer("no-op");
|
|
77
|
+
}
|
|
78
|
+
else {
|
|
79
|
+
const experimentResponse = await client
|
|
80
|
+
.POST("/v1/datasets/{dataset_id}/experiments", {
|
|
81
|
+
params: {
|
|
82
|
+
path: {
|
|
83
|
+
dataset_id: dataset.id,
|
|
101
84
|
},
|
|
102
|
-
})
|
|
103
|
-
.then((res) => { var _a; return (_a = res.data) === null || _a === void 0 ? void 0 : _a.data; });
|
|
104
|
-
(0, tiny_invariant_1.default)(experimentResponse, `Failed to create experiment`);
|
|
105
|
-
projectName = (_b = experimentResponse.project_name) !== null && _b !== void 0 ? _b : projectName;
|
|
106
|
-
experiment = {
|
|
107
|
-
id: experimentResponse.id,
|
|
108
|
-
datasetId: experimentResponse.dataset_id,
|
|
109
|
-
datasetVersionId: experimentResponse.dataset_version_id,
|
|
110
|
-
projectName,
|
|
111
|
-
metadata: experimentResponse.metadata,
|
|
112
|
-
};
|
|
113
|
-
// Initialize the tracer, now that we have a project name
|
|
114
|
-
const baseUrl = client.config.baseUrl;
|
|
115
|
-
(0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
|
|
116
|
-
provider = (0, instrumention_1.createProvider)({
|
|
117
|
-
projectName,
|
|
118
|
-
baseUrl,
|
|
119
|
-
headers: (_c = client.config.headers) !== null && _c !== void 0 ? _c : {},
|
|
120
|
-
});
|
|
121
|
-
taskTracer = provider.getTracer(projectName);
|
|
122
|
-
}
|
|
123
|
-
if (!record) {
|
|
124
|
-
logger.info(`🔧 Running experiment in readonly mode. Results will not be recorded.`);
|
|
125
|
-
}
|
|
126
|
-
if (!isDryRun && client.config.baseUrl) {
|
|
127
|
-
const datasetUrl = (0, urlUtils_1.getDatasetUrl)({
|
|
128
|
-
baseUrl: client.config.baseUrl,
|
|
129
|
-
datasetId: dataset.id,
|
|
130
|
-
});
|
|
131
|
-
const datasetExperimentsUrl = (0, urlUtils_1.getDatasetExperimentsUrl)({
|
|
132
|
-
baseUrl: client.config.baseUrl,
|
|
133
|
-
datasetId: dataset.id,
|
|
134
|
-
});
|
|
135
|
-
const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
|
|
136
|
-
baseUrl: client.config.baseUrl,
|
|
137
|
-
datasetId: dataset.id,
|
|
138
|
-
experimentId: experiment.id,
|
|
139
|
-
});
|
|
140
|
-
logger.info(`📊 View dataset: ${datasetUrl}`);
|
|
141
|
-
logger.info(`📺 View dataset experiments: ${datasetExperimentsUrl}`);
|
|
142
|
-
logger.info(`🔗 View this experiment: ${experimentUrl}`);
|
|
143
|
-
}
|
|
144
|
-
logger.info(`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${(_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0} ${(0, pluralize_1.pluralize)("evaluator", (_e = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _e !== void 0 ? _e : 0)} and ${concurrency} concurrent runs`);
|
|
145
|
-
const runs = {};
|
|
146
|
-
yield runTaskWithExamples({
|
|
147
|
-
client,
|
|
148
|
-
experimentId: experiment.id,
|
|
149
|
-
task,
|
|
150
|
-
dataset,
|
|
151
|
-
logger,
|
|
152
|
-
onComplete: (run) => {
|
|
153
|
-
runs[run.id] = run;
|
|
154
85
|
},
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
86
|
+
body: {
|
|
87
|
+
name: experimentName,
|
|
88
|
+
description: experimentDescription,
|
|
89
|
+
metadata: experimentMetadata,
|
|
90
|
+
project_name: projectName,
|
|
91
|
+
},
|
|
92
|
+
})
|
|
93
|
+
.then((res) => { var _a; return (_a = res.data) === null || _a === void 0 ? void 0 : _a.data; });
|
|
94
|
+
(0, tiny_invariant_1.default)(experimentResponse, `Failed to create experiment`);
|
|
95
|
+
projectName = (_a = experimentResponse.project_name) !== null && _a !== void 0 ? _a : projectName;
|
|
96
|
+
experiment = {
|
|
97
|
+
id: experimentResponse.id,
|
|
98
|
+
datasetId: experimentResponse.dataset_id,
|
|
99
|
+
datasetVersionId: experimentResponse.dataset_version_id,
|
|
100
|
+
projectName,
|
|
101
|
+
metadata: experimentResponse.metadata,
|
|
102
|
+
};
|
|
103
|
+
// Initialize the tracer, now that we have a project name
|
|
104
|
+
const baseUrl = client.config.baseUrl;
|
|
105
|
+
(0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
|
|
106
|
+
provider = (0, instrumention_1.createProvider)({
|
|
107
|
+
projectName,
|
|
108
|
+
baseUrl,
|
|
109
|
+
headers: (_b = client.config.headers) !== null && _b !== void 0 ? _b : {},
|
|
159
110
|
});
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
const
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
client,
|
|
170
|
-
logger,
|
|
171
|
-
concurrency,
|
|
172
|
-
dryRun,
|
|
111
|
+
taskTracer = provider.getTracer(projectName);
|
|
112
|
+
}
|
|
113
|
+
if (!record) {
|
|
114
|
+
logger.info(`🔧 Running experiment in readonly mode. Results will not be recorded.`);
|
|
115
|
+
}
|
|
116
|
+
if (!isDryRun && client.config.baseUrl) {
|
|
117
|
+
const datasetUrl = (0, urlUtils_1.getDatasetUrl)({
|
|
118
|
+
baseUrl: client.config.baseUrl,
|
|
119
|
+
datasetId: dataset.id,
|
|
173
120
|
});
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
}
|
|
184
|
-
|
|
121
|
+
const datasetExperimentsUrl = (0, urlUtils_1.getDatasetExperimentsUrl)({
|
|
122
|
+
baseUrl: client.config.baseUrl,
|
|
123
|
+
datasetId: dataset.id,
|
|
124
|
+
});
|
|
125
|
+
const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
|
|
126
|
+
baseUrl: client.config.baseUrl,
|
|
127
|
+
datasetId: dataset.id,
|
|
128
|
+
experimentId: experiment.id,
|
|
129
|
+
});
|
|
130
|
+
logger.info(`📊 View dataset: ${datasetUrl}`);
|
|
131
|
+
logger.info(`📺 View dataset experiments: ${datasetExperimentsUrl}`);
|
|
132
|
+
logger.info(`🔗 View this experiment: ${experimentUrl}`);
|
|
133
|
+
}
|
|
134
|
+
logger.info(`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${(_c = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _c !== void 0 ? _c : 0} ${(0, pluralize_1.pluralize)("evaluator", (_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0)} and ${concurrency} concurrent runs`);
|
|
135
|
+
const runs = {};
|
|
136
|
+
await runTaskWithExamples({
|
|
137
|
+
client,
|
|
138
|
+
experimentId: experiment.id,
|
|
139
|
+
task,
|
|
140
|
+
dataset,
|
|
141
|
+
logger,
|
|
142
|
+
onComplete: (run) => {
|
|
143
|
+
runs[run.id] = run;
|
|
144
|
+
},
|
|
145
|
+
concurrency,
|
|
146
|
+
isDryRun,
|
|
147
|
+
nExamples,
|
|
148
|
+
tracer: taskTracer,
|
|
185
149
|
});
|
|
150
|
+
logger.info(`✅ Task runs completed`);
|
|
151
|
+
const ranExperiment = Object.assign(Object.assign({}, experiment), { runs });
|
|
152
|
+
// Shut down the provider so that the experiments run
|
|
153
|
+
if (provider) {
|
|
154
|
+
await ((_e = provider.shutdown) === null || _e === void 0 ? void 0 : _e.call(provider));
|
|
155
|
+
}
|
|
156
|
+
const { evaluationRuns } = await evaluateExperiment({
|
|
157
|
+
experiment: ranExperiment,
|
|
158
|
+
evaluators: evaluators !== null && evaluators !== void 0 ? evaluators : [],
|
|
159
|
+
client,
|
|
160
|
+
logger,
|
|
161
|
+
concurrency,
|
|
162
|
+
dryRun,
|
|
163
|
+
});
|
|
164
|
+
ranExperiment.evaluationRuns = evaluationRuns;
|
|
165
|
+
logger.info(`✅ Experiment ${experiment.id} completed`);
|
|
166
|
+
if (!isDryRun && client.config.baseUrl) {
|
|
167
|
+
const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
|
|
168
|
+
baseUrl: client.config.baseUrl,
|
|
169
|
+
datasetId: dataset.id,
|
|
170
|
+
experimentId: experiment.id,
|
|
171
|
+
});
|
|
172
|
+
logger.info(`🔍 View results: ${experimentUrl}`);
|
|
173
|
+
}
|
|
174
|
+
return ranExperiment;
|
|
186
175
|
}
|
|
187
176
|
/**
|
|
188
177
|
* Run a task against n examples in a dataset.
|
|
189
178
|
*/
|
|
190
179
|
function runTaskWithExamples({ client, experimentId, task, dataset, onComplete, logger, concurrency = 5, isDryRun, nExamples, tracer, }) {
|
|
191
180
|
logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
|
|
192
|
-
const run = (example) =>
|
|
193
|
-
return tracer.startActiveSpan(`Task: ${task.name}`, (span) =>
|
|
181
|
+
const run = async (example) => {
|
|
182
|
+
return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
|
|
194
183
|
var _a, _b;
|
|
195
184
|
logger.info(`🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`);
|
|
196
185
|
const traceId = span.spanContext().traceId;
|
|
@@ -205,7 +194,7 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
|
|
|
205
194
|
error: null,
|
|
206
195
|
};
|
|
207
196
|
try {
|
|
208
|
-
const taskOutput =
|
|
197
|
+
const taskOutput = await (0, promisifyResult_1.promisifyResult)(task(example));
|
|
209
198
|
thisRun.output = taskOutput;
|
|
210
199
|
}
|
|
211
200
|
catch (error) {
|
|
@@ -216,7 +205,7 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
|
|
|
216
205
|
thisRun.endTime = new Date();
|
|
217
206
|
if (!isDryRun) {
|
|
218
207
|
// Log the run to the server
|
|
219
|
-
const res =
|
|
208
|
+
const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
|
|
220
209
|
params: {
|
|
221
210
|
path: {
|
|
222
211
|
experiment_id: experimentId,
|
|
@@ -248,8 +237,8 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
|
|
|
248
237
|
span === null || span === void 0 ? void 0 : span.end();
|
|
249
238
|
onComplete(thisRun);
|
|
250
239
|
return thisRun;
|
|
251
|
-
})
|
|
252
|
-
}
|
|
240
|
+
});
|
|
241
|
+
};
|
|
253
242
|
const q = (0, async_1.queue)(run, concurrency);
|
|
254
243
|
const examplesToUse = dataset.examples.slice(0, nExamples);
|
|
255
244
|
examplesToUse.forEach((example) => q.push(example, (err) => {
|
|
@@ -264,179 +253,175 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
|
|
|
264
253
|
*
|
|
265
254
|
* @experimental This feature is not complete, and will change in the future.
|
|
266
255
|
*/
|
|
267
|
-
function evaluateExperiment(
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
headers: (_b = client.config.headers) !== null && _b !== void 0 ? _b : {},
|
|
280
|
-
});
|
|
281
|
-
}
|
|
282
|
-
else {
|
|
283
|
-
provider = (0, instrumention_1.createNoOpProvider)();
|
|
284
|
-
}
|
|
285
|
-
const tracer = isDryRun
|
|
286
|
-
? provider.getTracer("no-op")
|
|
287
|
-
: provider.getTracer("evaluators");
|
|
288
|
-
const nRuns = typeof dryRun === "number"
|
|
289
|
-
? Math.max(dryRun, Object.keys(experiment.runs).length)
|
|
290
|
-
: Object.keys(experiment.runs).length;
|
|
291
|
-
const dataset = yield (0, getDataset_1.getDataset)({
|
|
292
|
-
dataset: { datasetId: experiment.datasetId },
|
|
293
|
-
client,
|
|
256
|
+
async function evaluateExperiment({ experiment, evaluators, client: _client, logger = console, concurrency = 5, dryRun = false, }) {
|
|
257
|
+
var _a, _b, _c, _d;
|
|
258
|
+
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
259
|
+
const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
|
|
260
|
+
const baseUrl = client.config.baseUrl;
|
|
261
|
+
(0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
|
|
262
|
+
let provider;
|
|
263
|
+
if (!isDryRun) {
|
|
264
|
+
provider = (0, instrumention_1.createProvider)({
|
|
265
|
+
projectName: "evaluators",
|
|
266
|
+
baseUrl,
|
|
267
|
+
headers: (_a = client.config.headers) !== null && _a !== void 0 ? _a : {},
|
|
294
268
|
});
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
(0,
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
269
|
+
}
|
|
270
|
+
else {
|
|
271
|
+
provider = (0, instrumention_1.createNoOpProvider)();
|
|
272
|
+
}
|
|
273
|
+
const tracer = isDryRun
|
|
274
|
+
? provider.getTracer("no-op")
|
|
275
|
+
: provider.getTracer("evaluators");
|
|
276
|
+
const nRuns = typeof dryRun === "number"
|
|
277
|
+
? Math.max(dryRun, Object.keys(experiment.runs).length)
|
|
278
|
+
: Object.keys(experiment.runs).length;
|
|
279
|
+
const dataset = await (0, getDataset_1.getDataset)({
|
|
280
|
+
dataset: { datasetId: experiment.datasetId },
|
|
281
|
+
client,
|
|
282
|
+
});
|
|
283
|
+
(0, tiny_invariant_1.default)(dataset, `Dataset "${experiment.datasetId}" not found`);
|
|
284
|
+
(0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset "${experiment.datasetId}" has no examples`);
|
|
285
|
+
(0, tiny_invariant_1.default)(experiment.runs, `Experiment "${experiment.id}" has no runs`);
|
|
286
|
+
const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
|
|
287
|
+
if ((evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) === 0) {
|
|
288
|
+
return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
|
|
289
|
+
}
|
|
290
|
+
logger.info(`🧠 Evaluating experiment "${experiment.id}" with ${(_b = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _b !== void 0 ? _b : 0} ${(0, pluralize_1.pluralize)("evaluator", (_c = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _c !== void 0 ? _c : 0)}`);
|
|
291
|
+
if (!isDryRun && client.config.baseUrl) {
|
|
292
|
+
const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
|
|
293
|
+
baseUrl: client.config.baseUrl,
|
|
294
|
+
datasetId: experiment.datasetId,
|
|
295
|
+
experimentId: experiment.id,
|
|
296
|
+
});
|
|
297
|
+
logger.info(`🔗 View experiment evaluation: ${experimentUrl}`);
|
|
298
|
+
}
|
|
299
|
+
const evaluationRuns = {};
|
|
300
|
+
const examplesById = {};
|
|
301
|
+
for (const example of dataset.examples) {
|
|
302
|
+
examplesById[example.id] = example;
|
|
303
|
+
}
|
|
304
|
+
const onEvaluationComplete = (run) => {
|
|
305
|
+
evaluationRuns[run.id] = run;
|
|
306
|
+
};
|
|
307
|
+
// Run evaluators against all runs
|
|
308
|
+
// Flat list of evaluator + run tuples
|
|
309
|
+
const evaluatorsAndRuns = evaluators.flatMap((evaluator) => runsToEvaluate.map((run) => ({
|
|
310
|
+
evaluator,
|
|
311
|
+
run,
|
|
312
|
+
})));
|
|
313
|
+
const evaluatorsQueue = (0, async_1.queue)(async (evaluatorAndRun) => {
|
|
314
|
+
return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, async (span) => {
|
|
315
|
+
var _a, _b, _c;
|
|
316
|
+
const evalResult = await runEvaluator({
|
|
317
|
+
evaluator: evaluatorAndRun.evaluator,
|
|
318
|
+
run: evaluatorAndRun.run,
|
|
319
|
+
exampleCache: examplesById,
|
|
320
|
+
onComplete: onEvaluationComplete,
|
|
321
|
+
logger,
|
|
308
322
|
});
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
const evaluatorsQueue = (0, async_1.queue)((evaluatorAndRun) => __awaiter(this, void 0, void 0, function* () {
|
|
326
|
-
return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, (span) => __awaiter(this, void 0, void 0, function* () {
|
|
327
|
-
var _a, _b, _c;
|
|
328
|
-
const evalResult = yield runEvaluator({
|
|
329
|
-
evaluator: evaluatorAndRun.evaluator,
|
|
330
|
-
run: evaluatorAndRun.run,
|
|
331
|
-
exampleCache: examplesById,
|
|
332
|
-
onComplete: onEvaluationComplete,
|
|
333
|
-
logger,
|
|
323
|
+
span.setAttributes({
|
|
324
|
+
[openinference_semantic_conventions_1.SemanticConventions.OPENINFERENCE_SPAN_KIND]: openinference_semantic_conventions_1.OpenInferenceSpanKind.EVALUATOR,
|
|
325
|
+
[openinference_semantic_conventions_1.SemanticConventions.INPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
|
|
326
|
+
[openinference_semantic_conventions_1.SemanticConventions.INPUT_VALUE]: JSON.stringify({
|
|
327
|
+
input: (_a = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _a === void 0 ? void 0 : _a.input,
|
|
328
|
+
output: evaluatorAndRun.run.output,
|
|
329
|
+
expected: (_b = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _b === void 0 ? void 0 : _b.output,
|
|
330
|
+
metadata: (_c = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _c === void 0 ? void 0 : _c.metadata,
|
|
331
|
+
}),
|
|
332
|
+
[openinference_semantic_conventions_1.SemanticConventions.OUTPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
|
|
333
|
+
[openinference_semantic_conventions_1.SemanticConventions.OUTPUT_VALUE]: (0, ensureString_1.ensureString)(evalResult.result),
|
|
334
|
+
});
|
|
335
|
+
if (evalResult.error) {
|
|
336
|
+
span.setStatus({
|
|
337
|
+
code: api_1.SpanStatusCode.ERROR,
|
|
338
|
+
message: evalResult.error,
|
|
334
339
|
});
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
340
|
+
}
|
|
341
|
+
else {
|
|
342
|
+
span.setStatus({ code: api_1.SpanStatusCode.OK });
|
|
343
|
+
}
|
|
344
|
+
if (evalResult.result) {
|
|
345
|
+
span.setAttributes((0, objectAsAttributes_1.objectAsAttributes)(evalResult.result));
|
|
346
|
+
}
|
|
347
|
+
evalResult.traceId = span.spanContext().traceId;
|
|
348
|
+
if (!isDryRun) {
|
|
349
|
+
// Log the evaluation to the server
|
|
350
|
+
// We log this without awaiting (e.g. best effort)
|
|
351
|
+
client.POST("/v1/experiment_evaluations", {
|
|
352
|
+
body: {
|
|
353
|
+
experiment_run_id: evaluatorAndRun.run.id,
|
|
354
|
+
name: evaluatorAndRun.evaluator.name,
|
|
355
|
+
annotator_kind: evaluatorAndRun.evaluator.kind,
|
|
356
|
+
start_time: evalResult.startTime.toISOString(),
|
|
357
|
+
end_time: evalResult.endTime.toISOString(),
|
|
358
|
+
result: Object.assign({}, evalResult.result),
|
|
359
|
+
error: evalResult.error,
|
|
360
|
+
trace_id: evalResult.traceId,
|
|
361
|
+
},
|
|
346
362
|
});
|
|
347
|
-
if (evalResult.error) {
|
|
348
|
-
span.setStatus({
|
|
349
|
-
code: api_1.SpanStatusCode.ERROR,
|
|
350
|
-
message: evalResult.error,
|
|
351
|
-
});
|
|
352
|
-
}
|
|
353
|
-
else {
|
|
354
|
-
span.setStatus({ code: api_1.SpanStatusCode.OK });
|
|
355
|
-
}
|
|
356
|
-
if (evalResult.result) {
|
|
357
|
-
span.setAttributes((0, objectAsAttributes_1.objectAsAttributes)(evalResult.result));
|
|
358
|
-
}
|
|
359
|
-
evalResult.traceId = span.spanContext().traceId;
|
|
360
|
-
if (!isDryRun) {
|
|
361
|
-
// Log the evaluation to the server
|
|
362
|
-
// We log this without awaiting (e.g. best effort)
|
|
363
|
-
client.POST("/v1/experiment_evaluations", {
|
|
364
|
-
body: {
|
|
365
|
-
experiment_run_id: evaluatorAndRun.run.id,
|
|
366
|
-
name: evaluatorAndRun.evaluator.name,
|
|
367
|
-
annotator_kind: evaluatorAndRun.evaluator.kind,
|
|
368
|
-
start_time: evalResult.startTime.toISOString(),
|
|
369
|
-
end_time: evalResult.endTime.toISOString(),
|
|
370
|
-
result: Object.assign({}, evalResult.result),
|
|
371
|
-
error: evalResult.error,
|
|
372
|
-
trace_id: evalResult.traceId,
|
|
373
|
-
},
|
|
374
|
-
});
|
|
375
|
-
}
|
|
376
|
-
span.end();
|
|
377
|
-
return evalResult;
|
|
378
|
-
}));
|
|
379
|
-
}), concurrency);
|
|
380
|
-
if (!evaluatorsAndRuns.length) {
|
|
381
|
-
logger.info(`⛔ No evaluators to run`);
|
|
382
|
-
return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
|
|
383
|
-
}
|
|
384
|
-
evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => {
|
|
385
|
-
if (err) {
|
|
386
|
-
logger.error(`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`);
|
|
387
363
|
}
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
364
|
+
span.end();
|
|
365
|
+
return evalResult;
|
|
366
|
+
});
|
|
367
|
+
}, concurrency);
|
|
368
|
+
if (!evaluatorsAndRuns.length) {
|
|
369
|
+
logger.info(`⛔ No evaluators to run`);
|
|
370
|
+
return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
|
|
371
|
+
}
|
|
372
|
+
evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => {
|
|
373
|
+
if (err) {
|
|
374
|
+
logger.error(`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`);
|
|
393
375
|
}
|
|
394
|
-
|
|
395
|
-
|
|
376
|
+
}));
|
|
377
|
+
await evaluatorsQueue.drain();
|
|
378
|
+
logger.info(`✅ Evaluation runs completed`);
|
|
379
|
+
if (provider) {
|
|
380
|
+
await ((_d = provider.shutdown) === null || _d === void 0 ? void 0 : _d.call(provider));
|
|
381
|
+
}
|
|
382
|
+
return Object.assign(Object.assign({}, experiment), { evaluationRuns: Object.values(evaluationRuns) });
|
|
396
383
|
}
|
|
397
384
|
/**
|
|
398
385
|
* Run an evaluator against a run.
|
|
399
386
|
*
|
|
400
387
|
* @experimental This feature is not complete, and will change in the future.
|
|
401
388
|
*/
|
|
402
|
-
function runEvaluator(
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
return evaluate();
|
|
439
|
-
});
|
|
389
|
+
async function runEvaluator({ evaluator, run, exampleCache, onComplete, logger, }) {
|
|
390
|
+
const example = exampleCache[run.datasetExampleId];
|
|
391
|
+
(0, tiny_invariant_1.default)(example, `Example "${run.datasetExampleId}" not found`);
|
|
392
|
+
const evaluate = async () => {
|
|
393
|
+
var _a;
|
|
394
|
+
logger.info(`🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`);
|
|
395
|
+
const thisEval = {
|
|
396
|
+
id: localId(),
|
|
397
|
+
traceId: null,
|
|
398
|
+
experimentRunId: run.id,
|
|
399
|
+
startTime: new Date(),
|
|
400
|
+
endTime: new Date(), // will get replaced with actual end time
|
|
401
|
+
name: evaluator.name,
|
|
402
|
+
result: null,
|
|
403
|
+
error: null,
|
|
404
|
+
annotatorKind: evaluator.kind,
|
|
405
|
+
};
|
|
406
|
+
try {
|
|
407
|
+
const result = await evaluator.evaluate({
|
|
408
|
+
input: example.input,
|
|
409
|
+
output: (_a = run.output) !== null && _a !== void 0 ? _a : null,
|
|
410
|
+
expected: example.output,
|
|
411
|
+
metadata: example.metadata,
|
|
412
|
+
});
|
|
413
|
+
thisEval.result = result;
|
|
414
|
+
logger.info(`✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`);
|
|
415
|
+
}
|
|
416
|
+
catch (error) {
|
|
417
|
+
thisEval.error = error instanceof Error ? error.message : "Unknown error";
|
|
418
|
+
logger.error(`❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`);
|
|
419
|
+
}
|
|
420
|
+
thisEval.endTime = new Date();
|
|
421
|
+
onComplete(thisEval);
|
|
422
|
+
return thisEval;
|
|
423
|
+
};
|
|
424
|
+
return evaluate();
|
|
440
425
|
}
|
|
441
426
|
/**
|
|
442
427
|
* Wrap an evaluator function in an object with a name property.
|