@arizeai/phoenix-client 2.3.2 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/dist/src/datasets/appendDatasetExamples.js +34 -45
  2. package/dist/src/datasets/appendDatasetExamples.js.map +1 -1
  3. package/dist/src/datasets/createDataset.js +25 -36
  4. package/dist/src/datasets/createDataset.js.map +1 -1
  5. package/dist/src/datasets/getDataset.js +7 -18
  6. package/dist/src/datasets/getDataset.js.map +1 -1
  7. package/dist/src/datasets/getDatasetExamples.js +25 -36
  8. package/dist/src/datasets/getDatasetExamples.js.map +1 -1
  9. package/dist/src/datasets/getDatasetInfo.js +22 -33
  10. package/dist/src/datasets/getDatasetInfo.js.map +1 -1
  11. package/dist/src/datasets/getDatasetInfoByName.js +21 -32
  12. package/dist/src/datasets/getDatasetInfoByName.js.map +1 -1
  13. package/dist/src/datasets/listDatasets.js +6 -17
  14. package/dist/src/datasets/listDatasets.js.map +1 -1
  15. package/dist/src/experiments/getExperiment.js +13 -24
  16. package/dist/src/experiments/getExperiment.js.map +1 -1
  17. package/dist/src/experiments/getExperimentInfo.js +15 -26
  18. package/dist/src/experiments/getExperimentInfo.js.map +1 -1
  19. package/dist/src/experiments/getExperimentRuns.js +24 -35
  20. package/dist/src/experiments/getExperimentRuns.js.map +1 -1
  21. package/dist/src/experiments/runExperiment.js +280 -295
  22. package/dist/src/experiments/runExperiment.js.map +1 -1
  23. package/dist/src/prompts/createPrompt.js +14 -25
  24. package/dist/src/prompts/createPrompt.js.map +1 -1
  25. package/dist/src/prompts/getPrompt.js +4 -15
  26. package/dist/src/prompts/getPrompt.js.map +1 -1
  27. package/dist/src/spans/addSpanAnnotation.js +14 -25
  28. package/dist/src/spans/addSpanAnnotation.js.map +1 -1
  29. package/dist/src/spans/getSpanAnnotations.js +29 -40
  30. package/dist/src/spans/getSpanAnnotations.js.map +1 -1
  31. package/dist/src/spans/getSpans.js +29 -40
  32. package/dist/src/spans/getSpans.js.map +1 -1
  33. package/dist/src/spans/logSpanAnnotations.js +14 -25
  34. package/dist/src/spans/logSpanAnnotations.js.map +1 -1
  35. package/dist/src/utils/getPromptBySelector.js +37 -48
  36. package/dist/src/utils/getPromptBySelector.js.map +1 -1
  37. package/dist/tsconfig.tsbuildinfo +1 -1
  38. package/package.json +1 -1
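
Judging from the hunks below, the change to runExperiment.js (and, from the matching churn across the other dist files, to the rest of the build output) is mechanical: 2.3.3 drops TypeScript's downleveled __awaiter/generator helper and emits native async/await instead, which is consistent with the compiler now targeting ES2017 or later, although the tsconfig settings themselves are not visible in this diff. A minimal sketch of that transform, using a hypothetical function rather than code from this package:

    // Hypothetical TypeScript source (not part of @arizeai/phoenix-client), for illustration only.
    async function fetchDatasetName(client: { GET(path: string): Promise<{ name: string }> }): Promise<string> {
      const res = await client.GET("/v1/datasets/example");
      return res.name;
    }
    // Compiled with "target": "ES2015" or lower, tsc rewrites the body through the __awaiter
    // helper and a generator function that yields at each await; that is the pattern deleted
    // throughout this diff. Compiled with "target": "ES2017" or later, the async/await keywords
    // survive unchanged, which is what the 2.3.3 dist output now ships.
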
@@ -1,13 +1,4 @@
  "use strict";
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
- return new (P || (P = Promise))(function (resolve, reject) {
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
- step((generator = generator.apply(thisArg, _arguments || [])).next());
- });
- };
  var __importDefault = (this && this.__importDefault) || function (mod) {
  return (mod && mod.__esModule) ? mod : { "default": mod };
  };
@@ -59,138 +50,136 @@ const urlUtils_1 = require("../utils/urlUtils");
  * });
  * ```
  */
- function runExperiment(_a) {
- return __awaiter(this, arguments, void 0, function* ({ experimentName, experimentDescription, experimentMetadata = {}, client: _client, dataset: DatasetSelector, task, evaluators, logger = console, record = true, concurrency = 5, dryRun = false, }) {
- var _b, _c, _d, _e, _f;
- let provider;
- const isDryRun = typeof dryRun === "number" || dryRun === true;
- const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
- const dataset = yield (0, getDataset_1.getDataset)({ dataset: DatasetSelector, client });
- (0, tiny_invariant_1.default)(dataset, `Dataset not found`);
- (0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset has no examples`);
- const nExamples = typeof dryRun === "number"
- ? Math.min(dryRun, dataset.examples.length)
- : dataset.examples.length;
- let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
- // initialize the tracer into scope
- let taskTracer;
- let experiment;
- if (isDryRun) {
- experiment = {
- id: localId(),
- datasetId: dataset.id,
- datasetVersionId: dataset.versionId,
- projectName,
- metadata: experimentMetadata,
- };
- taskTracer = (0, instrumention_1.createNoOpProvider)().getTracer("no-op");
- }
- else {
- const experimentResponse = yield client
- .POST("/v1/datasets/{dataset_id}/experiments", {
- params: {
- path: {
- dataset_id: dataset.id,
- },
- },
- body: {
- name: experimentName,
- description: experimentDescription,
- metadata: experimentMetadata,
- project_name: projectName,
+ async function runExperiment({ experimentName, experimentDescription, experimentMetadata = {}, client: _client, dataset: DatasetSelector, task, evaluators, logger = console, record = true, concurrency = 5, dryRun = false, }) {
+ var _a, _b, _c, _d, _e;
+ let provider;
+ const isDryRun = typeof dryRun === "number" || dryRun === true;
+ const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
+ const dataset = await (0, getDataset_1.getDataset)({ dataset: DatasetSelector, client });
+ (0, tiny_invariant_1.default)(dataset, `Dataset not found`);
+ (0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset has no examples`);
+ const nExamples = typeof dryRun === "number"
+ ? Math.min(dryRun, dataset.examples.length)
+ : dataset.examples.length;
+ let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
+ // initialize the tracer into scope
+ let taskTracer;
+ let experiment;
+ if (isDryRun) {
+ experiment = {
+ id: localId(),
+ datasetId: dataset.id,
+ datasetVersionId: dataset.versionId,
+ projectName,
+ metadata: experimentMetadata,
+ };
+ taskTracer = (0, instrumention_1.createNoOpProvider)().getTracer("no-op");
+ }
+ else {
+ const experimentResponse = await client
+ .POST("/v1/datasets/{dataset_id}/experiments", {
+ params: {
+ path: {
+ dataset_id: dataset.id,
  },
- })
- .then((res) => { var _a; return (_a = res.data) === null || _a === void 0 ? void 0 : _a.data; });
- (0, tiny_invariant_1.default)(experimentResponse, `Failed to create experiment`);
- projectName = (_b = experimentResponse.project_name) !== null && _b !== void 0 ? _b : projectName;
- experiment = {
- id: experimentResponse.id,
- datasetId: experimentResponse.dataset_id,
- datasetVersionId: experimentResponse.dataset_version_id,
- projectName,
- metadata: experimentResponse.metadata,
- };
- // Initialize the tracer, now that we have a project name
- const baseUrl = client.config.baseUrl;
- (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
- provider = (0, instrumention_1.createProvider)({
- projectName,
- baseUrl,
- headers: (_c = client.config.headers) !== null && _c !== void 0 ? _c : {},
- });
- taskTracer = provider.getTracer(projectName);
- }
- if (!record) {
- logger.info(`🔧 Running experiment in readonly mode. Results will not be recorded.`);
- }
- if (!isDryRun && client.config.baseUrl) {
- const datasetUrl = (0, urlUtils_1.getDatasetUrl)({
- baseUrl: client.config.baseUrl,
- datasetId: dataset.id,
- });
- const datasetExperimentsUrl = (0, urlUtils_1.getDatasetExperimentsUrl)({
- baseUrl: client.config.baseUrl,
- datasetId: dataset.id,
- });
- const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
- baseUrl: client.config.baseUrl,
- datasetId: dataset.id,
- experimentId: experiment.id,
- });
- logger.info(`📊 View dataset: ${datasetUrl}`);
- logger.info(`📺 View dataset experiments: ${datasetExperimentsUrl}`);
- logger.info(`🔗 View this experiment: ${experimentUrl}`);
- }
- logger.info(`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${(_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0} ${(0, pluralize_1.pluralize)("evaluator", (_e = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _e !== void 0 ? _e : 0)} and ${concurrency} concurrent runs`);
- const runs = {};
- yield runTaskWithExamples({
- client,
- experimentId: experiment.id,
- task,
- dataset,
- logger,
- onComplete: (run) => {
- runs[run.id] = run;
  },
- concurrency,
- isDryRun,
- nExamples,
- tracer: taskTracer,
+ body: {
+ name: experimentName,
+ description: experimentDescription,
+ metadata: experimentMetadata,
+ project_name: projectName,
+ },
+ })
+ .then((res) => { var _a; return (_a = res.data) === null || _a === void 0 ? void 0 : _a.data; });
+ (0, tiny_invariant_1.default)(experimentResponse, `Failed to create experiment`);
+ projectName = (_a = experimentResponse.project_name) !== null && _a !== void 0 ? _a : projectName;
+ experiment = {
+ id: experimentResponse.id,
+ datasetId: experimentResponse.dataset_id,
+ datasetVersionId: experimentResponse.dataset_version_id,
+ projectName,
+ metadata: experimentResponse.metadata,
+ };
+ // Initialize the tracer, now that we have a project name
+ const baseUrl = client.config.baseUrl;
+ (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
+ provider = (0, instrumention_1.createProvider)({
+ projectName,
+ baseUrl,
+ headers: (_b = client.config.headers) !== null && _b !== void 0 ? _b : {},
  });
- logger.info(`✅ Task runs completed`);
- const ranExperiment = Object.assign(Object.assign({}, experiment), { runs });
- // Shut down the provider so that the experiments run
- if (provider) {
- yield ((_f = provider.shutdown) === null || _f === void 0 ? void 0 : _f.call(provider));
- }
- const { evaluationRuns } = yield evaluateExperiment({
- experiment: ranExperiment,
- evaluators: evaluators !== null && evaluators !== void 0 ? evaluators : [],
- client,
- logger,
- concurrency,
- dryRun,
+ taskTracer = provider.getTracer(projectName);
+ }
+ if (!record) {
+ logger.info(`🔧 Running experiment in readonly mode. Results will not be recorded.`);
+ }
+ if (!isDryRun && client.config.baseUrl) {
+ const datasetUrl = (0, urlUtils_1.getDatasetUrl)({
+ baseUrl: client.config.baseUrl,
+ datasetId: dataset.id,
  });
- ranExperiment.evaluationRuns = evaluationRuns;
- logger.info(`✅ Experiment ${experiment.id} completed`);
- if (!isDryRun && client.config.baseUrl) {
- const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
- baseUrl: client.config.baseUrl,
- datasetId: dataset.id,
- experimentId: experiment.id,
- });
- logger.info(`🔍 View results: ${experimentUrl}`);
- }
- return ranExperiment;
+ const datasetExperimentsUrl = (0, urlUtils_1.getDatasetExperimentsUrl)({
+ baseUrl: client.config.baseUrl,
+ datasetId: dataset.id,
+ });
+ const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
+ baseUrl: client.config.baseUrl,
+ datasetId: dataset.id,
+ experimentId: experiment.id,
+ });
+ logger.info(`📊 View dataset: ${datasetUrl}`);
+ logger.info(`📺 View dataset experiments: ${datasetExperimentsUrl}`);
+ logger.info(`🔗 View this experiment: ${experimentUrl}`);
+ }
+ logger.info(`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${(_c = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _c !== void 0 ? _c : 0} ${(0, pluralize_1.pluralize)("evaluator", (_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0)} and ${concurrency} concurrent runs`);
+ const runs = {};
+ await runTaskWithExamples({
+ client,
+ experimentId: experiment.id,
+ task,
+ dataset,
+ logger,
+ onComplete: (run) => {
+ runs[run.id] = run;
+ },
+ concurrency,
+ isDryRun,
+ nExamples,
+ tracer: taskTracer,
  });
+ logger.info(`✅ Task runs completed`);
+ const ranExperiment = Object.assign(Object.assign({}, experiment), { runs });
+ // Shut down the provider so that the experiments run
+ if (provider) {
+ await ((_e = provider.shutdown) === null || _e === void 0 ? void 0 : _e.call(provider));
+ }
+ const { evaluationRuns } = await evaluateExperiment({
+ experiment: ranExperiment,
+ evaluators: evaluators !== null && evaluators !== void 0 ? evaluators : [],
+ client,
+ logger,
+ concurrency,
+ dryRun,
+ });
+ ranExperiment.evaluationRuns = evaluationRuns;
+ logger.info(`✅ Experiment ${experiment.id} completed`);
+ if (!isDryRun && client.config.baseUrl) {
+ const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
+ baseUrl: client.config.baseUrl,
+ datasetId: dataset.id,
+ experimentId: experiment.id,
+ });
+ logger.info(`🔍 View results: ${experimentUrl}`);
+ }
+ return ranExperiment;
  }
  /**
  * Run a task against n examples in a dataset.
  */
  function runTaskWithExamples({ client, experimentId, task, dataset, onComplete, logger, concurrency = 5, isDryRun, nExamples, tracer, }) {
  logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
- const run = (example) => __awaiter(this, void 0, void 0, function* () {
- return tracer.startActiveSpan(`Task: ${task.name}`, (span) => __awaiter(this, void 0, void 0, function* () {
+ const run = async (example) => {
+ return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
  var _a, _b;
  logger.info(`🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`);
  const traceId = span.spanContext().traceId;
@@ -205,7 +194,7 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
  error: null,
  };
  try {
- const taskOutput = yield (0, promisifyResult_1.promisifyResult)(task(example));
+ const taskOutput = await (0, promisifyResult_1.promisifyResult)(task(example));
  thisRun.output = taskOutput;
  }
  catch (error) {
@@ -216,7 +205,7 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
  thisRun.endTime = new Date();
  if (!isDryRun) {
  // Log the run to the server
- const res = yield client.POST("/v1/experiments/{experiment_id}/runs", {
+ const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
  params: {
  path: {
  experiment_id: experimentId,
@@ -248,8 +237,8 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
  span === null || span === void 0 ? void 0 : span.end();
  onComplete(thisRun);
  return thisRun;
- }));
- });
+ });
+ };
  const q = (0, async_1.queue)(run, concurrency);
  const examplesToUse = dataset.examples.slice(0, nExamples);
  examplesToUse.forEach((example) => q.push(example, (err) => {
@@ -264,179 +253,175 @@ function runTaskWithExamples({ client, experimentId, task, dataset, onComplete,
  *
  * @experimental This feature is not complete, and will change in the future.
  */
- function evaluateExperiment(_a) {
- return __awaiter(this, arguments, void 0, function* ({ experiment, evaluators, client: _client, logger = console, concurrency = 5, dryRun = false, }) {
- var _b, _c, _d, _e;
- const isDryRun = typeof dryRun === "number" || dryRun === true;
- const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
- const baseUrl = client.config.baseUrl;
- (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
- let provider;
- if (!isDryRun) {
- provider = (0, instrumention_1.createProvider)({
- projectName: "evaluators",
- baseUrl,
- headers: (_b = client.config.headers) !== null && _b !== void 0 ? _b : {},
- });
- }
- else {
- provider = (0, instrumention_1.createNoOpProvider)();
- }
- const tracer = isDryRun
- ? provider.getTracer("no-op")
- : provider.getTracer("evaluators");
- const nRuns = typeof dryRun === "number"
- ? Math.max(dryRun, Object.keys(experiment.runs).length)
- : Object.keys(experiment.runs).length;
- const dataset = yield (0, getDataset_1.getDataset)({
- dataset: { datasetId: experiment.datasetId },
- client,
+ async function evaluateExperiment({ experiment, evaluators, client: _client, logger = console, concurrency = 5, dryRun = false, }) {
+ var _a, _b, _c, _d;
+ const isDryRun = typeof dryRun === "number" || dryRun === true;
+ const client = _client !== null && _client !== void 0 ? _client : (0, client_1.createClient)();
+ const baseUrl = client.config.baseUrl;
+ (0, tiny_invariant_1.default)(baseUrl, "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client.");
+ let provider;
+ if (!isDryRun) {
+ provider = (0, instrumention_1.createProvider)({
+ projectName: "evaluators",
+ baseUrl,
+ headers: (_a = client.config.headers) !== null && _a !== void 0 ? _a : {},
  });
- (0, tiny_invariant_1.default)(dataset, `Dataset "${experiment.datasetId}" not found`);
- (0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset "${experiment.datasetId}" has no examples`);
- (0, tiny_invariant_1.default)(experiment.runs, `Experiment "${experiment.id}" has no runs`);
- const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
- if ((evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) === 0) {
- return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
- }
- logger.info(`🧠 Evaluating experiment "${experiment.id}" with ${(_c = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _c !== void 0 ? _c : 0} ${(0, pluralize_1.pluralize)("evaluator", (_d = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _d !== void 0 ? _d : 0)}`);
- if (!isDryRun && client.config.baseUrl) {
- const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
- baseUrl: client.config.baseUrl,
- datasetId: experiment.datasetId,
- experimentId: experiment.id,
+ }
+ else {
+ provider = (0, instrumention_1.createNoOpProvider)();
+ }
+ const tracer = isDryRun
+ ? provider.getTracer("no-op")
+ : provider.getTracer("evaluators");
+ const nRuns = typeof dryRun === "number"
+ ? Math.max(dryRun, Object.keys(experiment.runs).length)
+ : Object.keys(experiment.runs).length;
+ const dataset = await (0, getDataset_1.getDataset)({
+ dataset: { datasetId: experiment.datasetId },
+ client,
+ });
+ (0, tiny_invariant_1.default)(dataset, `Dataset "${experiment.datasetId}" not found`);
+ (0, tiny_invariant_1.default)(dataset.examples.length > 0, `Dataset "${experiment.datasetId}" has no examples`);
+ (0, tiny_invariant_1.default)(experiment.runs, `Experiment "${experiment.id}" has no runs`);
+ const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
+ if ((evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) === 0) {
+ return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
+ }
+ logger.info(`🧠 Evaluating experiment "${experiment.id}" with ${(_b = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _b !== void 0 ? _b : 0} ${(0, pluralize_1.pluralize)("evaluator", (_c = evaluators === null || evaluators === void 0 ? void 0 : evaluators.length) !== null && _c !== void 0 ? _c : 0)}`);
+ if (!isDryRun && client.config.baseUrl) {
+ const experimentUrl = (0, urlUtils_1.getExperimentUrl)({
+ baseUrl: client.config.baseUrl,
+ datasetId: experiment.datasetId,
+ experimentId: experiment.id,
+ });
+ logger.info(`🔗 View experiment evaluation: ${experimentUrl}`);
+ }
+ const evaluationRuns = {};
+ const examplesById = {};
+ for (const example of dataset.examples) {
+ examplesById[example.id] = example;
+ }
+ const onEvaluationComplete = (run) => {
+ evaluationRuns[run.id] = run;
+ };
+ // Run evaluators against all runs
+ // Flat list of evaluator + run tuples
+ const evaluatorsAndRuns = evaluators.flatMap((evaluator) => runsToEvaluate.map((run) => ({
+ evaluator,
+ run,
+ })));
+ const evaluatorsQueue = (0, async_1.queue)(async (evaluatorAndRun) => {
+ return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, async (span) => {
+ var _a, _b, _c;
+ const evalResult = await runEvaluator({
+ evaluator: evaluatorAndRun.evaluator,
+ run: evaluatorAndRun.run,
+ exampleCache: examplesById,
+ onComplete: onEvaluationComplete,
+ logger,
  });
- logger.info(`🔗 View experiment evaluation: ${experimentUrl}`);
- }
- const evaluationRuns = {};
- const examplesById = {};
- for (const example of dataset.examples) {
- examplesById[example.id] = example;
- }
- const onEvaluationComplete = (run) => {
- evaluationRuns[run.id] = run;
- };
- // Run evaluators against all runs
- // Flat list of evaluator + run tuples
- const evaluatorsAndRuns = evaluators.flatMap((evaluator) => runsToEvaluate.map((run) => ({
- evaluator,
- run,
- })));
- const evaluatorsQueue = (0, async_1.queue)((evaluatorAndRun) => __awaiter(this, void 0, void 0, function* () {
- return tracer.startActiveSpan(`Evaluation: ${evaluatorAndRun.evaluator.name}`, (span) => __awaiter(this, void 0, void 0, function* () {
- var _a, _b, _c;
- const evalResult = yield runEvaluator({
- evaluator: evaluatorAndRun.evaluator,
- run: evaluatorAndRun.run,
- exampleCache: examplesById,
- onComplete: onEvaluationComplete,
- logger,
+ span.setAttributes({
+ [openinference_semantic_conventions_1.SemanticConventions.OPENINFERENCE_SPAN_KIND]: openinference_semantic_conventions_1.OpenInferenceSpanKind.EVALUATOR,
+ [openinference_semantic_conventions_1.SemanticConventions.INPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
+ [openinference_semantic_conventions_1.SemanticConventions.INPUT_VALUE]: JSON.stringify({
+ input: (_a = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _a === void 0 ? void 0 : _a.input,
+ output: evaluatorAndRun.run.output,
+ expected: (_b = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _b === void 0 ? void 0 : _b.output,
+ metadata: (_c = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _c === void 0 ? void 0 : _c.metadata,
+ }),
+ [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
+ [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_VALUE]: (0, ensureString_1.ensureString)(evalResult.result),
+ });
+ if (evalResult.error) {
+ span.setStatus({
+ code: api_1.SpanStatusCode.ERROR,
+ message: evalResult.error,
  });
- span.setAttributes({
- [openinference_semantic_conventions_1.SemanticConventions.OPENINFERENCE_SPAN_KIND]: openinference_semantic_conventions_1.OpenInferenceSpanKind.EVALUATOR,
- [openinference_semantic_conventions_1.SemanticConventions.INPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
- [openinference_semantic_conventions_1.SemanticConventions.INPUT_VALUE]: JSON.stringify({
- input: (_a = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _a === void 0 ? void 0 : _a.input,
- output: evaluatorAndRun.run.output,
- expected: (_b = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _b === void 0 ? void 0 : _b.output,
- metadata: (_c = examplesById[evaluatorAndRun.run.datasetExampleId]) === null || _c === void 0 ? void 0 : _c.metadata,
- }),
- [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_MIME_TYPE]: openinference_semantic_conventions_1.MimeType.JSON,
- [openinference_semantic_conventions_1.SemanticConventions.OUTPUT_VALUE]: (0, ensureString_1.ensureString)(evalResult.result),
+ }
+ else {
+ span.setStatus({ code: api_1.SpanStatusCode.OK });
+ }
+ if (evalResult.result) {
+ span.setAttributes((0, objectAsAttributes_1.objectAsAttributes)(evalResult.result));
+ }
+ evalResult.traceId = span.spanContext().traceId;
+ if (!isDryRun) {
+ // Log the evaluation to the server
+ // We log this without awaiting (e.g. best effort)
+ client.POST("/v1/experiment_evaluations", {
+ body: {
+ experiment_run_id: evaluatorAndRun.run.id,
+ name: evaluatorAndRun.evaluator.name,
+ annotator_kind: evaluatorAndRun.evaluator.kind,
+ start_time: evalResult.startTime.toISOString(),
+ end_time: evalResult.endTime.toISOString(),
+ result: Object.assign({}, evalResult.result),
+ error: evalResult.error,
+ trace_id: evalResult.traceId,
+ },
  });
- if (evalResult.error) {
- span.setStatus({
- code: api_1.SpanStatusCode.ERROR,
- message: evalResult.error,
- });
- }
- else {
- span.setStatus({ code: api_1.SpanStatusCode.OK });
- }
- if (evalResult.result) {
- span.setAttributes((0, objectAsAttributes_1.objectAsAttributes)(evalResult.result));
- }
- evalResult.traceId = span.spanContext().traceId;
- if (!isDryRun) {
- // Log the evaluation to the server
- // We log this without awaiting (e.g. best effort)
- client.POST("/v1/experiment_evaluations", {
- body: {
- experiment_run_id: evaluatorAndRun.run.id,
- name: evaluatorAndRun.evaluator.name,
- annotator_kind: evaluatorAndRun.evaluator.kind,
- start_time: evalResult.startTime.toISOString(),
- end_time: evalResult.endTime.toISOString(),
- result: Object.assign({}, evalResult.result),
- error: evalResult.error,
- trace_id: evalResult.traceId,
- },
- });
- }
- span.end();
- return evalResult;
- }));
- }), concurrency);
- if (!evaluatorsAndRuns.length) {
- logger.info(`⛔ No evaluators to run`);
- return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
- }
- evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => {
- if (err) {
- logger.error(`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`);
  }
- }));
- yield evaluatorsQueue.drain();
- logger.info(`✅ Evaluation runs completed`);
- if (provider) {
- yield ((_e = provider.shutdown) === null || _e === void 0 ? void 0 : _e.call(provider));
+ span.end();
+ return evalResult;
+ });
+ }, concurrency);
+ if (!evaluatorsAndRuns.length) {
+ logger.info(`⛔ No evaluators to run`);
+ return Object.assign(Object.assign({}, experiment), { evaluationRuns: [] });
+ }
+ evaluatorsAndRuns.forEach((evaluatorAndRun) => evaluatorsQueue.push(evaluatorAndRun, (err) => {
+ if (err) {
+ logger.error(`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`);
  }
- return Object.assign(Object.assign({}, experiment), { evaluationRuns: Object.values(evaluationRuns) });
- });
+ }));
+ await evaluatorsQueue.drain();
+ logger.info(`✅ Evaluation runs completed`);
+ if (provider) {
+ await ((_d = provider.shutdown) === null || _d === void 0 ? void 0 : _d.call(provider));
+ }
+ return Object.assign(Object.assign({}, experiment), { evaluationRuns: Object.values(evaluationRuns) });
  }
  /**
  * Run an evaluator against a run.
  *
  * @experimental This feature is not complete, and will change in the future.
  */
- function runEvaluator(_a) {
- return __awaiter(this, arguments, void 0, function* ({ evaluator, run, exampleCache, onComplete, logger, }) {
- const example = exampleCache[run.datasetExampleId];
- (0, tiny_invariant_1.default)(example, `Example "${run.datasetExampleId}" not found`);
- const evaluate = () => __awaiter(this, void 0, void 0, function* () {
- var _a;
- logger.info(`🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`);
- const thisEval = {
- id: localId(),
- traceId: null,
- experimentRunId: run.id,
- startTime: new Date(),
- endTime: new Date(), // will get replaced with actual end time
- name: evaluator.name,
- result: null,
- error: null,
- annotatorKind: evaluator.kind,
- };
- try {
- const result = yield evaluator.evaluate({
- input: example.input,
- output: (_a = run.output) !== null && _a !== void 0 ? _a : null,
- expected: example.output,
- metadata: example.metadata,
- });
- thisEval.result = result;
- logger.info(`✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`);
- }
- catch (error) {
- thisEval.error = error instanceof Error ? error.message : "Unknown error";
- logger.error(`❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`);
- }
- thisEval.endTime = new Date();
- onComplete(thisEval);
- return thisEval;
- });
- return evaluate();
- });
+ async function runEvaluator({ evaluator, run, exampleCache, onComplete, logger, }) {
+ const example = exampleCache[run.datasetExampleId];
+ (0, tiny_invariant_1.default)(example, `Example "${run.datasetExampleId}" not found`);
+ const evaluate = async () => {
+ var _a;
+ logger.info(`🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`);
+ const thisEval = {
+ id: localId(),
+ traceId: null,
+ experimentRunId: run.id,
+ startTime: new Date(),
+ endTime: new Date(), // will get replaced with actual end time
+ name: evaluator.name,
+ result: null,
+ error: null,
+ annotatorKind: evaluator.kind,
+ };
+ try {
+ const result = await evaluator.evaluate({
+ input: example.input,
+ output: (_a = run.output) !== null && _a !== void 0 ? _a : null,
+ expected: example.output,
+ metadata: example.metadata,
+ });
+ thisEval.result = result;
+ logger.info(`✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`);
+ }
+ catch (error) {
+ thisEval.error = error instanceof Error ? error.message : "Unknown error";
+ logger.error(`❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`);
+ }
+ thisEval.endTime = new Date();
+ onComplete(thisEval);
+ return thisEval;
+ };
+ return evaluate();
  }
  /**
  * Wrap an evaluator function in an object with a name property.