@arizeai/phoenix-client 2.3.2 → 2.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/experiments/instrumention.d.ts.map +1 -1
- package/dist/esm/experiments/instrumention.js +11 -7
- package/dist/esm/experiments/instrumention.js.map +1 -1
- package/dist/esm/experiments/runExperiment.d.ts +14 -2
- package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/runExperiment.js +11 -3
- package/dist/esm/experiments/runExperiment.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/src/datasets/appendDatasetExamples.js +34 -45
- package/dist/src/datasets/appendDatasetExamples.js.map +1 -1
- package/dist/src/datasets/createDataset.js +25 -36
- package/dist/src/datasets/createDataset.js.map +1 -1
- package/dist/src/datasets/getDataset.js +7 -18
- package/dist/src/datasets/getDataset.js.map +1 -1
- package/dist/src/datasets/getDatasetExamples.js +25 -36
- package/dist/src/datasets/getDatasetExamples.js.map +1 -1
- package/dist/src/datasets/getDatasetInfo.js +22 -33
- package/dist/src/datasets/getDatasetInfo.js.map +1 -1
- package/dist/src/datasets/getDatasetInfoByName.js +21 -32
- package/dist/src/datasets/getDatasetInfoByName.js.map +1 -1
- package/dist/src/datasets/listDatasets.js +6 -17
- package/dist/src/datasets/listDatasets.js.map +1 -1
- package/dist/src/experiments/getExperiment.js +13 -24
- package/dist/src/experiments/getExperiment.js.map +1 -1
- package/dist/src/experiments/getExperimentInfo.js +15 -26
- package/dist/src/experiments/getExperimentInfo.js.map +1 -1
- package/dist/src/experiments/getExperimentRuns.js +24 -35
- package/dist/src/experiments/getExperimentRuns.js.map +1 -1
- package/dist/src/experiments/instrumention.d.ts.map +1 -1
- package/dist/src/experiments/instrumention.js +11 -7
- package/dist/src/experiments/instrumention.js.map +1 -1
- package/dist/src/experiments/runExperiment.d.ts +14 -2
- package/dist/src/experiments/runExperiment.d.ts.map +1 -1
- package/dist/src/experiments/runExperiment.js +286 -293
- package/dist/src/experiments/runExperiment.js.map +1 -1
- package/dist/src/prompts/createPrompt.js +14 -25
- package/dist/src/prompts/createPrompt.js.map +1 -1
- package/dist/src/prompts/getPrompt.js +4 -15
- package/dist/src/prompts/getPrompt.js.map +1 -1
- package/dist/src/spans/addSpanAnnotation.js +14 -25
- package/dist/src/spans/addSpanAnnotation.js.map +1 -1
- package/dist/src/spans/getSpanAnnotations.js +29 -40
- package/dist/src/spans/getSpanAnnotations.js.map +1 -1
- package/dist/src/spans/getSpans.js +29 -40
- package/dist/src/spans/getSpans.js.map +1 -1
- package/dist/src/spans/logSpanAnnotations.js +14 -25
- package/dist/src/spans/logSpanAnnotations.js.map +1 -1
- package/dist/src/utils/getPromptBySelector.js +37 -48
- package/dist/src/utils/getPromptBySelector.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +3 -2
- package/src/experiments/instrumention.ts +7 -5
- package/src/experiments/runExperiment.ts +23 -1
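
The largest diff, reproduced below, is package/dist/src/experiments/runExperiment.js. Two changes stand out. First, the compiled output drops the transpiled `__awaiter`/`yield` helpers in favor of native async/await; the uniform net shrinkage across the other dist files listed above is consistent with that helper block being removed from each of them. Second, `runExperiment` and `evaluateExperiment` gain a `setGlobalTracerProvider` option (default `true`) that controls whether the newly created OpenTelemetry provider is registered globally via `provider.register()`, and `evaluateExperiment` now clamps a numeric `dryRun` with `Math.min` instead of `Math.max`, so `dryRun: n` evaluates at most n runs. A usage sketch inferred from the destructured signatures in the compiled code follows; the subpath import, the experiment name, and the dataset id shown are assumptions for illustration, not documented API:

import { createClient } from "@arizeai/phoenix-client";
import { runExperiment } from "@arizeai/phoenix-client/experiments";

// The client resolves its base URL from its config or PHOENIX_HOST,
// per the invariant message visible in the diff below.
const client = createClient();

const ranExperiment = await runExperiment({
  client,
  experimentName: "prompt-variant-a",     // hypothetical name
  dataset: { datasetId: "RGF0YXNldDox" }, // selector shape passed to getDataset below; id is made up
  task: async (example) => example.input, // invoked once per dataset example
  evaluators: [],
  dryRun: 3,                      // number => run on at most 3 examples, nothing recorded
  setGlobalTracerProvider: false, // new in 2.3.4: skip provider.register()
});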
package/dist/src/experiments/runExperiment.js

@@ -1,13 +1,4 @@
 "use strict";
-var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
-    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
-    return new (P || (P = Promise))(function (resolve, reject) {
-        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
-        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
-        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
-        step((generator = generator.apply(thisArg, _arguments || [])).next());
-    });
-};
 var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
@@ -59,138 +50,141 @@ const urlUtils_1 = require("../utils/urlUtils");
  * });
  * ```
  */
-function runExperiment(
-… (30 lines not captured)
-                    dataset_id: dataset.id,
-                },
-            },
-            body: {
-                name: experimentName,
-                description: experimentDescription,
-                metadata: experimentMetadata,
-                project_name: projectName,
+async function runExperiment({ experimentName, experimentDescription, experimentMetadata = {}, client: _client, dataset: DatasetSelector, task, evaluators, logger = console, record = true, concurrency = 5, dryRun = false, setGlobalTracerProvider = true, }) {
+    var _a, _b, _c, _d, _e;
+    let provider;
+    const isDryRun = typeof dryRun === "number" || dryRun === true;
+    const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
+    const dataset = await (0, getDataset_1.getDataset)({ dataset: DatasetSelector, client });
+    (0, tiny_invariant_1.default)(dataset, `Dataset not found`);
+    (0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset has no examples`);
+    const nExamples = typeof dryRun === "number"
+        ? Math.min(dryRun, dataset.examples.length)
+        : dataset.examples.length;
+    let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
+    // initialize the tracer into scope
+    let taskTracer;
+    let experiment;
+    if (isDryRun) {
+        experiment = {
+            id: localId(),
+            datasetId: dataset.id,
+            datasetVersionId: dataset.versionId,
+            projectName,
+            metadata: experimentMetadata,
+        };
+        taskTracer = (0, instrumention_1.createNoOpProvider)().getTracer("no-op");
+    }
+    else {
+        const experimentResponse = await client
+            .POST("/v1/datasets/{dataset_id}/experiments", {
+            params: {
+                path: {
+                    dataset_id: dataset.id,
                 },
-        })
-            .then((res) => { var _a; return (_a = res.data) === null || _a === void 0 ? void 0 : _a.data; });
-        (0, tiny_invariant_1.default)(experimentResponse, `Failed to create experiment`);
-        projectName = (_b = experimentResponse.project_name) !== null && _b !== void 0 ? _b : projectName;
-        experiment = {
-            id: experimentResponse.id,
-            datasetId: experimentResponse.dataset_id,
-            datasetVersionId: experimentResponse.dataset_version_id,
-            projectName,
-            metadata: experimentResponse.metadata,
-        };
-        // Initialize the tracer, now that we have a project name
-        const baseUrl = client.config.baseUrl;
-        (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
-        provider = (0, instrumention_1.createProvider)({
-            projectName,
-            baseUrl,
-            headers: (_c = client.config.headers) !== null && _c !== void 0 ? _c : {},
-        });
-        taskTracer = provider.getTracer(projectName);
-    }
-    if (!record) {
-        logger.info(`🔧 Running experiment in readonly mode. Results will not be recorded.`);
-    }
-    if (!isDryRun && client.config.baseUrl) {
-        const datasetUrl = (0, urlUtils_1.getDatasetUrl)({
-            baseUrl: client.config.baseUrl,
-            datasetId: dataset.id,
-        });
-        const datasetExperimentsUrl = (0, urlUtils_1.getDatasetExperimentsUrl)({
-            baseUrl: client.config.baseUrl,
-            datasetId: dataset.id,
-        });
-        const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
-            baseUrl: client.config.baseUrl,
-            datasetId: dataset.id,
-            experimentId: experiment.id,
-        });
-        logger.info(`📊 View dataset: ${datasetUrl}`);
-        logger.info(`📺 View dataset experiments: ${datasetExperimentsUrl}`);
-        logger.info(`🔗 View this experiment: ${experimentUrl}`);
-    }
-    logger.info(`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${(_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0} ${(0, pluralize_1.pluralize)("evaluator", (_e = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _e !== void 0 ? _e : 0)} and ${concurrency} concurrent runs`);
-    const runs = {};
-    yield runTaskWithExamples({
-        client,
-        experimentId: experiment.id,
-        task,
-        dataset,
-        logger,
-        onComplete: (run) => {
-            runs[run.id] = run;
            },
-… (4 lines not captured)
+            body: {
+                name: experimentName,
+                description: experimentDescription,
+                metadata: experimentMetadata,
+                project_name: projectName,
+            },
+        })
+            .then((res) => { var _a; return (_a = res.data) === null || _a === void 0 ? void 0 : _a.data; });
+        (0, tiny_invariant_1.default)(experimentResponse, `Failed to create experiment`);
+        projectName = (_a = experimentResponse.project_name) !== null && _a !== void 0 ? _a : projectName;
+        experiment = {
+            id: experimentResponse.id,
+            datasetId: experimentResponse.dataset_id,
+            datasetVersionId: experimentResponse.dataset_version_id,
+            projectName,
+            metadata: experimentResponse.metadata,
+        };
+        // Initialize the tracer, now that we have a project name
+        const baseUrl = client.config.baseUrl;
+        (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
+        provider = (0, instrumention_1.createProvider)({
+            projectName,
+            baseUrl,
+            headers: (_b = client.config.headers) !== null && _b !== void 0 ? _b : {},
        });
-… (3 lines not captured)
-    if (provider) {
-        yield ((_f = provider.shutdown) === null || _f === void 0 ? void 0 : _f.call(provider));
+        // Register the provider
+        if (setGlobalTracerProvider) {
+            provider.register();
        }
-… (7 lines not captured)
+        taskTracer = provider.getTracer(projectName);
+    }
+    if (!record) {
+        logger.info(`🔧 Running experiment in readonly mode. Results will not be recorded.`);
+    }
+    if (!isDryRun && client.config.baseUrl) {
+        const datasetUrl = (0, urlUtils_1.getDatasetUrl)({
+            baseUrl: client.config.baseUrl,
+            datasetId: dataset.id,
        });
-… (9 lines not captured)
-    }
-… (1 line not captured)
+        const datasetExperimentsUrl = (0, urlUtils_1.getDatasetExperimentsUrl)({
+            baseUrl: client.config.baseUrl,
+            datasetId: dataset.id,
+        });
+        const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
+            baseUrl: client.config.baseUrl,
+            datasetId: dataset.id,
+            experimentId: experiment.id,
+        });
+        logger.info(`📊 View dataset: ${datasetUrl}`);
+        logger.info(`📺 View dataset experiments: ${datasetExperimentsUrl}`);
+        logger.info(`🔗 View this experiment: ${experimentUrl}`);
+    }
+    logger.info(`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${(_c = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _c !== void 0 ? _c : 0} ${(0, pluralize_1.pluralize)("evaluator", (_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0)} and ${concurrency} concurrent runs`);
+    const runs = {};
+    await runTaskWithExamples({
+        client,
+        experimentId: experiment.id,
+        task,
+        dataset,
+        logger,
+        onComplete: (run) => {
+            runs[run.id] = run;
+        },
+        concurrency,
+        isDryRun,
+        nExamples,
+        tracer: taskTracer,
    });
+    logger.info(`✅ Task runs completed`);
+    const ranExperiment = Object.assign(Object.assign({}, experiment), { runs });
+    // Shut down the provider so that the experiments run
+    if (provider) {
+        await ((_e = provider.shutdown) === null || _e === void 0 ? void 0 : _e.call(provider));
+    }
+    const { evaluationRuns } = await evaluateExperiment({
+        experiment: ranExperiment,
+        evaluators: evaluators !== null && evaluators !== void 0 ? evaluators : [],
+        client,
+        logger,
+        concurrency,
+        dryRun,
+        setGlobalTracerProvider,
+    });
+    ranExperiment.evaluationRuns = evaluationRuns;
+    logger.info(`✅ Experiment ${experiment.id} completed`);
+    if (!isDryRun && client.config.baseUrl) {
+        const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
+            baseUrl: client.config.baseUrl,
+            datasetId: dataset.id,
+            experimentId: experiment.id,
+        });
+        logger.info(`🔍 View results: ${experimentUrl}`);
+    }
+    return ranExperiment;
 }
 /**
  * Run a task against n examples in a dataset.
  */
 function runTaskWithExamples({ client, experimentId, task, dataset, onComplete, logger, concurrency = 5, isDryRun, nExamples, tracer, }) {
     logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
-    const run = (example) =>
-        return tracer.startActiveSpan(`Task: ${task.name}`, (span) =>
+    const run = async (example) => {
+        return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
             var _a, _b;
             logger.info(`🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`);
             const traceId = span.spanContext().traceId;
@@ -205,7 +199,7 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
                 error: null,
             };
             try {
-                const taskOutput =
+                const taskOutput = await (0, promisifyResult_1.promisifyResult)(task(example));
                 thisRun.output = taskOutput;
             }
             catch (error) {
@@ -216,7 +210,7 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
             thisRun.endTime = new Date();
             if (!isDryRun) {
                 // Log the run to the server
-                const res =
+                const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
                     params: {
                         path: {
                             experiment_id: experimentId,
@@ -248,8 +242,8 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
             span === null || span === void 0 ? void 0 : span.end();
             onComplete(thisRun);
             return thisRun;
-        })
-    }
+        });
+    };
     const q = (0, async_1.queue)(run, concurrency);
     const examplesToUse = dataset.examples.slice(0, nExamples);
     examplesToUse.forEach((example) => q.push(example, (err) => {
@@ -264,179 +258,178 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
  *
  * @experimental This feature is not complete, and will change in the future.
  */
-function evaluateExperiment(
-… (11 lines not captured)
-            headers: (_b = client.config.headers) !== null && _b !== void 0 ? _b : {},
-        });
-    }
-    else {
-        provider = (0, instrumention_1.createNoOpProvider)();
-    }
-    const tracer = isDryRun
-        ? provider.getTracer("no-op")
-        : provider.getTracer("evaluators");
-    const nRuns = typeof dryRun === "number"
-        ? Math.max(dryRun, Object.keys(experiment.runs).length)
-        : Object.keys(experiment.runs).length;
-    const dataset = yield (0, getDataset_1.getDataset)({
-        dataset: { datasetId: experiment.datasetId },
-        client,
+async function evaluateExperiment({ experiment, evaluators, client: _client, logger = console, concurrency = 5, dryRun = false, setGlobalTracerProvider = true, }) {
+    var _a, _b, _c, _d;
+    const isDryRun = typeof dryRun === "number" || dryRun === true;
+    const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
+    const baseUrl = client.config.baseUrl;
+    (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
+    let provider;
+    if (!isDryRun) {
+        provider = (0, instrumention_1.createProvider)({
+            projectName: "evaluators",
+            baseUrl,
+            headers: (_a = client.config.headers) !== null && _a !== void 0 ? _a : {},
        });
-    (
-… (1 line not captured)
-    (0, tiny_invariant_1.default)(experiment.runs, `Experiment "${experiment.id}" has no runs`);
-    const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
-    if ((evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) === 0) {
-        return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
+        if (setGlobalTracerProvider) {
+            provider.register();
        }
-… (6 lines not captured)
+    }
+    else {
+        provider = (0, instrumention_1.createNoOpProvider)();
+    }
+    const tracer = isDryRun
+        ? provider.getTracer("no-op")
+        : provider.getTracer("evaluators");
+    const nRuns = typeof dryRun === "number"
+        ? Math.min(dryRun, Object.keys(experiment.runs).length)
+        : Object.keys(experiment.runs).length;
+    const dataset = await (0, getDataset_1.getDataset)({
+        dataset: { datasetId: experiment.datasetId },
+        client,
+    });
+    (0, tiny_invariant_1.default)(dataset, `Dataset "${experiment.datasetId}" not found`);
+    (0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset "${experiment.datasetId}" has no examples`);
+    (0, tiny_invariant_1.default)(experiment.runs, `Experiment "${experiment.id}" has no runs`);
+    const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
+    if ((evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) === 0) {
+        return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
+    }
+    logger.info(`🧠 Evaluating experiment "${experiment.id}" with ${(_b = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _b !== void 0 ? _b : 0} ${(0, pluralize_1.pluralize)("evaluator", (_c = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _c !== void 0 ? _c : 0)}`);
+    if (!isDryRun && client.config.baseUrl) {
+        const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
+            baseUrl: client.config.baseUrl,
+            datasetId: experiment.datasetId,
+            experimentId: experiment.id,
+        });
+        logger.info(`🔗 View experiment evaluation: ${experimentUrl}`);
+    }
+    const evaluationRuns = {};
+    const examplesById = {};
+    for (const example of dataset.examples) {
+        examplesById[example.id] = example;
+    }
+    const onEvaluationComplete = (run) => {
+        evaluationRuns[run.id] = run;
+    };
+    // Run evaluators against all runs
+    // Flat list of evaluator + run tuples
+    const evaluatorsAndRuns = evaluators.flatMap((evaluator) => runsToEvaluate.map((run) => ({
+        evaluator,
+        run,
+    })));
+    const evaluatorsQueue = (0, async_1.queue)(async (evaluatorAndRun) => {
+        return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, async (span) => {
+            var _a, _b, _c;
+            const evalResult = await runEvaluator({
+                evaluator: evaluatorAndRun.evaluator,
+                run: evaluatorAndRun.run,
+                exampleCache: examplesById,
+                onComplete: onEvaluationComplete,
+                logger,
            });
-… (16 lines not captured)
-    const evaluatorsQueue = (0, async_1.queue)((evaluatorAndRun) => __awaiter(this, void 0, void 0, function* () {
-        return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, (span) => __awaiter(this, void 0, void 0, function* () {
-            var _a, _b, _c;
-            const evalResult = yield runEvaluator({
-                evaluator: evaluatorAndRun.evaluator,
-                run: evaluatorAndRun.run,
-                exampleCache: examplesById,
-                onComplete: onEvaluationComplete,
-                logger,
+            span.setAttributes({
+                [openinference_semantic_conventions_1.SemanticConventions.OPENINFERENCE_SPAN_KIND]: openinference_semantic_conventions_1.OpenInferenceSpanKind.EVALUATOR,
+                [openinference_semantic_conventions_1.SemanticConventions.INPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
+                [openinference_semantic_conventions_1.SemanticConventions.INPUT_VALUE]: JSON.stringify({
+                    input: (_a = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _a === void 0 ? void 0 : _a.input,
+                    output: evaluatorAndRun.run.output,
+                    expected: (_b = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _b === void 0 ? void 0 : _b.output,
+                    metadata: (_c = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _c === void 0 ? void 0 : _c.metadata,
+                }),
+                [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
+                [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_VALUE]: (0, ensureString_1.ensureString)(evalResult.result),
+            });
+            if (evalResult.error) {
+                span.setStatus({
+                    code: api_1.SpanStatusCode.ERROR,
+                    message: evalResult.error,
                });
-… (11 lines not captured)
+            }
+            else {
+                span.setStatus({ code: api_1.SpanStatusCode.OK });
+            }
+            if (evalResult.result) {
+                span.setAttributes((0, objectAsAttributes_1.objectAsAttributes)(evalResult.result));
+            }
+            evalResult.traceId = span.spanContext().traceId;
+            if (!isDryRun) {
+                // Log the evaluation to the server
+                // We log this without awaiting (e.g. best effort)
+                client.POST("/v1/experiment_evaluations", {
+                    body: {
+                        experiment_run_id: evaluatorAndRun.run.id,
+                        name: evaluatorAndRun.evaluator.name,
+                        annotator_kind: evaluatorAndRun.evaluator.kind,
+                        start_time: evalResult.startTime.toISOString(),
+                        end_time: evalResult.endTime.toISOString(),
+                        result: Object.assign({}, evalResult.result),
+                        error: evalResult.error,
+                        trace_id: evalResult.traceId,
+                    },
                });
-            if (evalResult.error) {
-                span.setStatus({
-                    code: api_1.SpanStatusCode.ERROR,
-                    message: evalResult.error,
-                });
-            }
-            else {
-                span.setStatus({ code: api_1.SpanStatusCode.OK });
-            }
-            if (evalResult.result) {
-                span.setAttributes((0, objectAsAttributes_1.objectAsAttributes)(evalResult.result));
-            }
-            evalResult.traceId = span.spanContext().traceId;
-            if (!isDryRun) {
-                // Log the evaluation to the server
-                // We log this without awaiting (e.g. best effort)
-                client.POST("/v1/experiment_evaluations", {
-                    body: {
-                        experiment_run_id: evaluatorAndRun.run.id,
-                        name: evaluatorAndRun.evaluator.name,
-                        annotator_kind: evaluatorAndRun.evaluator.kind,
-                        start_time: evalResult.startTime.toISOString(),
-                        end_time: evalResult.endTime.toISOString(),
-                        result: Object.assign({}, evalResult.result),
-                        error: evalResult.error,
-                        trace_id: evalResult.traceId,
-                    },
-                });
-            }
-            span.end();
-            return evalResult;
-        }));
-    }), concurrency);
-    if (!evaluatorsAndRuns.length) {
-        logger.info(`⛔ No evaluators to run`);
-        return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
-    }
-    evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => {
-        if (err) {
-            logger.error(`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`);
            }
-… (5 lines not captured)
+            span.end();
+            return evalResult;
+        });
+    }, concurrency);
+    if (!evaluatorsAndRuns.length) {
+        logger.info(`⛔ No evaluators to run`);
+        return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
+    }
+    evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => {
+        if (err) {
+            logger.error(`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`);
        }
-… (2 lines not captured)
+    }));
+    await evaluatorsQueue.drain();
+    logger.info(`✅ Evaluation runs completed`);
+    if (provider) {
+        await ((_d = provider.shutdown) === null || _d === void 0 ? void 0 : _d.call(provider));
+    }
+    return Object.assign(Object.assign({}, experiment), { evaluationRuns: Object.values(evaluationRuns) });
 }
 /**
  * Run an evaluator against a run.
  *
  * @experimental This feature is not complete, and will change in the future.
  */
-function runEvaluator(
-… (35 lines not captured)
-    return evaluate();
-});
+async function runEvaluator({ evaluator, run, exampleCache, onComplete, logger, }) {
+    const example = exampleCache[run.datasetExampleId];
+    (0, tiny_invariant_1.default)(example, `Example "${run.datasetExampleId}" not found`);
+    const evaluate = async () => {
+        var _a;
+        logger.info(`🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`);
+        const thisEval = {
+            id: localId(),
+            traceId: null,
+            experimentRunId: run.id,
+            startTime: new Date(),
+            endTime: new Date(), // will get replaced with actual end time
+            name: evaluator.name,
+            result: null,
+            error: null,
+            annotatorKind: evaluator.kind,
+        };
+        try {
+            const result = await evaluator.evaluate({
+                input: example.input,
+                output: (_a = run.output) !== null && _a !== void 0 ? _a : null,
+                expected: example.output,
+                metadata: example.metadata,
+            });
+            thisEval.result = result;
+            logger.info(`✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`);
+        }
+        catch (error) {
+            thisEval.error = error instanceof Error ? error.message : "Unknown error";
+            logger.error(`❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`);
+        }
+        thisEval.endTime = new Date();
+        onComplete(thisEval);
+        return thisEval;
+    };
+    return evaluate();
 }
 /**
  * Wrap an evaluator function in an object with a name property.