@arizeai/phoenix-client 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/dist/esm/client.d.ts +13 -1
  2. package/dist/esm/client.d.ts.map +1 -1
  3. package/dist/esm/client.js +4 -1
  4. package/dist/esm/client.js.map +1 -1
  5. package/dist/esm/experiments/instrumention.d.ts +18 -0
  6. package/dist/esm/experiments/instrumention.d.ts.map +1 -0
  7. package/dist/esm/experiments/instrumention.js +34 -0
  8. package/dist/esm/experiments/instrumention.js.map +1 -0
  9. package/dist/esm/experiments/runExperiment.d.ts +19 -16
  10. package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
  11. package/dist/esm/experiments/runExperiment.js +209 -98
  12. package/dist/esm/experiments/runExperiment.js.map +1 -1
  13. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  14. package/dist/esm/types/experiments.d.ts +0 -1
  15. package/dist/esm/types/experiments.d.ts.map +1 -1
  16. package/dist/esm/utils/ensureString.d.ts +8 -0
  17. package/dist/esm/utils/ensureString.d.ts.map +1 -0
  18. package/dist/esm/utils/ensureString.js +14 -0
  19. package/dist/esm/utils/ensureString.js.map +1 -0
  20. package/dist/esm/utils/objectAsAttributes.d.ts +3 -0
  21. package/dist/esm/utils/objectAsAttributes.d.ts.map +1 -0
  22. package/dist/esm/utils/objectAsAttributes.js +4 -0
  23. package/dist/esm/utils/objectAsAttributes.js.map +1 -0
  24. package/dist/src/client.d.ts +13 -1
  25. package/dist/src/client.d.ts.map +1 -1
  26. package/dist/src/client.js +1 -1
  27. package/dist/src/client.js.map +1 -1
  28. package/dist/src/experiments/instrumention.d.ts +18 -0
  29. package/dist/src/experiments/instrumention.d.ts.map +1 -0
  30. package/dist/src/experiments/instrumention.js +38 -0
  31. package/dist/src/experiments/instrumention.js.map +1 -0
  32. package/dist/src/experiments/runExperiment.d.ts +19 -16
  33. package/dist/src/experiments/runExperiment.d.ts.map +1 -1
  34. package/dist/src/experiments/runExperiment.js +211 -102
  35. package/dist/src/experiments/runExperiment.js.map +1 -1
  36. package/dist/src/types/experiments.d.ts +0 -1
  37. package/dist/src/types/experiments.d.ts.map +1 -1
  38. package/dist/src/utils/ensureString.d.ts +8 -0
  39. package/dist/src/utils/ensureString.d.ts.map +1 -0
  40. package/dist/src/utils/ensureString.js +18 -0
  41. package/dist/src/utils/ensureString.js.map +1 -0
  42. package/dist/src/utils/objectAsAttributes.d.ts +3 -0
  43. package/dist/src/utils/objectAsAttributes.d.ts.map +1 -0
  44. package/dist/src/utils/objectAsAttributes.js +7 -0
  45. package/dist/src/utils/objectAsAttributes.js.map +1 -0
  46. package/dist/tsconfig.tsbuildinfo +1 -1
  47. package/package.json +9 -1
  48. package/src/client.ts +4 -1
  49. package/src/experiments/instrumention.ts +52 -0
  50. package/src/experiments/runExperiment.ts +246 -108
  51. package/src/types/experiments.ts +0 -1
  52. package/src/utils/ensureString.ts +14 -0
  53. package/src/utils/objectAsAttributes.ts +9 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arizeai/phoenix-client",
3
- "version": "1.2.0",
3
+ "version": "1.3.0",
4
4
  "description": "A client for the Phoenix API",
5
5
  "main": "dist/src/index.js",
6
6
  "module": "dist/esm/index.js",
@@ -52,6 +52,14 @@
52
52
  "vitest": "^2.1.9"
53
53
  },
54
54
  "dependencies": {
55
+ "@arizeai/openinference-semantic-conventions": "^1.1.0",
56
+ "@opentelemetry/api": "^1.9.0",
57
+ "@opentelemetry/core": "^1.25.1",
58
+ "@opentelemetry/instrumentation": "^0.57.2",
59
+ "@opentelemetry/exporter-trace-otlp-proto": "^0.57.2",
60
+ "@opentelemetry/resources": "^2.0.0",
61
+ "@opentelemetry/sdk-trace-base": "^1.30.1",
62
+ "@opentelemetry/sdk-trace-node": "^1.30.1",
55
63
  "async": "^3.2.6",
56
64
  "openapi-fetch": "^0.12.5",
57
65
  "tiny-invariant": "^1.3.3",
package/src/client.ts CHANGED
@@ -79,7 +79,10 @@ export const createClient = (
79
79
  } = {}
80
80
  ) => {
81
81
  const mergedOptions = getMergedOptions(config);
82
- return createOpenApiClient<pathsV1>(mergedOptions);
82
+ return {
83
+ ...createOpenApiClient<pathsV1>(mergedOptions),
84
+ config: mergedOptions,
85
+ };
83
86
  };
84
87
 
85
88
  /**
@@ -0,0 +1,52 @@
1
+ import { diag, DiagConsoleLogger, DiagLogLevel } from "@opentelemetry/api";
2
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
3
+ import { resourceFromAttributes } from "@opentelemetry/resources";
4
+ import { SimpleSpanProcessor } from "@opentelemetry/sdk-trace-base";
5
+ import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
6
+ import { SEMRESATTRS_PROJECT_NAME } from "@arizeai/openinference-semantic-conventions";
7
+ import { HeadersOptions } from "openapi-fetch";
8
+
9
+ /**
10
+ * Creates a provider that exports traces to Phoenix.
11
+ */
12
+ export function createProvider({
13
+ projectName,
14
+ baseUrl,
15
+ headers,
16
+ }: {
17
+ projectName: string;
18
+ headers: HeadersOptions;
19
+ /**
20
+ * The base URL of Phoenix, without the /v1/traces path.
21
+ */
22
+ baseUrl: string;
23
+ }) {
24
+ diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.ERROR);
25
+
26
+ const provider = new NodeTracerProvider({
27
+ resource: resourceFromAttributes({
28
+ [SEMRESATTRS_PROJECT_NAME]: projectName,
29
+ }),
30
+ spanProcessors: [
31
+ new SimpleSpanProcessor(
32
+ new OTLPTraceExporter({
33
+ url: `${baseUrl}/v1/traces`,
34
+ headers: Array.isArray(headers)
35
+ ? Object.fromEntries(headers)
36
+ : headers,
37
+ })
38
+ ),
39
+ ],
40
+ });
41
+
42
+ return provider;
43
+ }
44
+
45
+ /**
46
+ * For dry runs we create a provider that doesn't export traces.
47
+ */
48
+ export function createNoOpProvider() {
49
+ const provider = new NodeTracerProvider({});
50
+
51
+ return provider;
52
+ }
@@ -7,7 +7,6 @@ import type {
7
7
  Evaluator,
8
8
  Experiment,
9
9
  ExperimentEvaluationRun,
10
- ExperimentParameters,
11
10
  ExperimentRun,
12
11
  ExperimentTask,
13
12
  RanExperiment,
@@ -17,6 +16,16 @@ import { getDatasetBySelector } from "../utils/getDatasetBySelector";
17
16
  import { pluralize } from "../utils/pluralize";
18
17
  import { promisifyResult } from "../utils/promisifyResult";
19
18
  import { AnnotatorKind } from "../types/annotations";
19
+ import { createProvider, createNoOpProvider } from "./instrumention";
20
+ import { SpanStatusCode, Tracer } from "@opentelemetry/api";
21
+ import {
22
+ MimeType,
23
+ OpenInferenceSpanKind,
24
+ SemanticConventions,
25
+ } from "@arizeai/openinference-semantic-conventions";
26
+ import { ensureString } from "../utils/ensureString";
27
+ import type { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
28
+ import { objectAsAttributes } from "../utils/objectAsAttributes";
20
29
 
21
30
  /**
22
31
  * Parameters for running an experiment.
@@ -49,10 +58,6 @@ export type RunExperimentParams = ClientFn & {
49
58
  * The evaluators to use
50
59
  */
51
60
  evaluators?: Evaluator[];
52
- /**
53
- * The project under which the experiment task traces are recorded
54
- */
55
- projectName?: string;
56
61
  /**
57
62
  * The logger to use
58
63
  */
@@ -73,7 +78,23 @@ export type RunExperimentParams = ClientFn & {
73
78
  };
74
79
 
75
80
  /**
76
- * Run an experiment.
81
+ * Runs an experiment using a given dataset of examples.
82
+ *
83
+ * An experiment is a user-defined task that runs on each example in a dataset. The results from
84
+ * each experiment can be evaluated using any number of evaluators to measure the behavior of the
85
+ * task. The experiment and evaluation results are stored in the Phoenix database for comparison
86
+ * and analysis.
87
+ *
88
+ * A `task` is either a sync or async function that returns a JSON serializable
89
+ * output. If the `task` is a function of one argument then that argument will be bound to the
90
+ * `input` field of the dataset example. Alternatively, the `task` can be a function of any
91
+ * combination of specific argument names that will be bound to special values:
92
+ *
93
+ * - `input`: The input field of the dataset example
94
+ * - `expected`: The expected or reference output of the dataset example
95
+ * - `reference`: An alias for `expected`
96
+ * - `metadata`: Metadata associated with the dataset example
97
+ * - `example`: The dataset `Example` object with all associated fields
77
98
  *
78
99
  * @example
79
100
  * ```ts
@@ -83,27 +104,25 @@ export type RunExperimentParams = ClientFn & {
83
104
  * dataset: "my-dataset",
84
105
  * task: async (example) => example.input,
85
106
  * evaluators: [
86
- * asEvaluator("my-evaluator", "CODE", async (params) => params.output),
107
+ * asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
87
108
  * ],
88
109
  * });
89
110
  * ```
90
- *
91
- * @experimental This feature is not complete, and will change in the future.
92
111
  */
93
112
  export async function runExperiment({
94
- experimentName: _experimentName,
113
+ experimentName,
95
114
  experimentDescription,
96
115
  experimentMetadata,
97
116
  client: _client,
98
117
  dataset: _dataset,
99
118
  task,
100
119
  evaluators,
101
- projectName = "default",
102
120
  logger = console,
103
121
  record = true,
104
122
  concurrency = 5,
105
123
  dryRun = false,
106
124
  }: RunExperimentParams): Promise<RanExperiment> {
125
+ let provider: NodeTracerProvider | undefined;
107
126
  const isDryRun = typeof dryRun === "number" || dryRun === true;
108
127
  const client = _client ?? createClient();
109
128
  const dataset = await getDatasetBySelector({ dataset: _dataset, client });
@@ -114,19 +133,18 @@ export async function runExperiment({
114
133
  ? Math.max(dryRun, dataset.examples.length)
115
134
  : dataset.examples.length;
116
135
 
117
- const experimentName =
118
- _experimentName ?? `${dataset.name}-${new Date().toISOString()}`;
119
- const experimentParams: ExperimentParameters = {
120
- nExamples,
121
- };
136
+ let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
137
+ // initialize the tracer into scope
138
+ let taskTracer: Tracer;
122
139
  let experiment: Experiment;
123
140
  if (isDryRun) {
124
141
  experiment = {
125
- id: id(),
142
+ id: localId(),
126
143
  datasetId: dataset.id,
127
144
  datasetVersionId: dataset.versionId,
128
145
  projectName,
129
146
  };
147
+ taskTracer = createNoOpProvider().getTracer("no-op");
130
148
  } else {
131
149
  const experimentResponse = await client
132
150
  .POST("/v1/datasets/{dataset_id}/experiments", {
@@ -144,14 +162,26 @@ export async function runExperiment({
144
162
  })
145
163
  .then((res) => res.data?.data);
146
164
  invariant(experimentResponse, `Failed to create experiment`);
165
+ projectName = experimentResponse.project_name ?? projectName;
147
166
  experiment = {
148
167
  id: experimentResponse.id,
149
168
  datasetId: dataset.id,
150
169
  datasetVersionId: dataset.versionId,
151
170
  projectName,
152
171
  };
172
+ // Initialize the tracer, now that we have a project name
173
+ const baseUrl = client.config.baseUrl;
174
+ invariant(
175
+ baseUrl,
176
+ "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
177
+ );
178
+ provider = createProvider({
179
+ projectName,
180
+ baseUrl,
181
+ headers: client.config.headers ?? {},
182
+ });
183
+ taskTracer = provider.getTracer(projectName);
153
184
  }
154
-
155
185
  if (!record) {
156
186
  logger.info(
157
187
  `🔧 Running experiment in readonly mode. Results will not be recorded.`
@@ -180,15 +210,20 @@ export async function runExperiment({
180
210
  concurrency,
181
211
  isDryRun,
182
212
  nExamples,
213
+ tracer: taskTracer,
183
214
  });
184
215
  logger.info(`✅ Task runs completed`);
185
216
 
186
217
  const ranExperiment: RanExperiment = {
187
218
  ...experiment,
188
- params: experimentParams,
189
219
  runs,
190
220
  };
191
221
 
222
+ // Shut down the task tracer provider now that the task runs have completed
223
+ if (provider) {
224
+ await provider.shutdown?.();
225
+ }
226
+
192
227
  const { evaluationRuns } = await evaluateExperiment({
193
228
  experiment: ranExperiment,
194
229
  evaluators: evaluators ?? [],
@@ -217,6 +252,7 @@ function runTask({
217
252
  concurrency = 5,
218
253
  isDryRun,
219
254
  nExamples,
255
+ tracer,
220
256
  }: {
221
257
  /** The client to use */
222
258
  client: PhoenixClient;
@@ -236,61 +272,88 @@ function runTask({
236
272
  isDryRun: boolean;
237
273
  /** The number of examples to run */
238
274
  nExamples: number;
275
+ /** Tracer instance that will be used to create spans from task calls */
276
+ tracer: Tracer;
239
277
  }) {
240
278
  logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
241
279
  const run = async (example: Example) => {
242
- logger.info(
243
- `🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`
244
- );
245
- const thisRun: ExperimentRun = {
246
- id: id(),
247
- traceId: null, // TODO: fill this in once we trace experiments
248
- experimentId,
249
- datasetExampleId: example.id,
250
- startTime: new Date(),
251
- endTime: new Date(), // will get replaced with actual end time
252
- output: null,
253
- error: null,
254
- };
255
- try {
256
- const taskOutput = await promisifyResult(task(example));
257
- // TODO: why doesn't run output type match task output type?
258
- thisRun.output =
259
- typeof taskOutput === "string"
260
- ? taskOutput
261
- : JSON.stringify(taskOutput);
262
- } catch (error) {
263
- thisRun.error = error instanceof Error ? error.message : "Unknown error";
264
- }
265
- thisRun.endTime = new Date();
266
- if (!isDryRun) {
267
- // Log the run to the server
268
- // We log this without awaiting (e.g. best effort)
269
- const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
270
- params: {
271
- path: {
272
- experiment_id: experimentId,
280
+ return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
281
+ logger.info(
282
+ `🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`
283
+ );
284
+ const traceId = span.spanContext().traceId;
285
+ const thisRun: ExperimentRun = {
286
+ id: localId(), // initialized with local id, will be replaced with server-assigned id when dry run is false
287
+ traceId,
288
+ experimentId,
289
+ datasetExampleId: example.id,
290
+ startTime: new Date(),
291
+ endTime: new Date(), // will get replaced with actual end time
292
+ output: null,
293
+ error: null,
294
+ };
295
+ try {
296
+ const taskOutput = await promisifyResult(task(example));
297
+ thisRun.output =
298
+ typeof taskOutput === "string"
299
+ ? taskOutput
300
+ : JSON.stringify(taskOutput);
301
+ } catch (error) {
302
+ thisRun.error =
303
+ error instanceof Error ? error.message : "Unknown error";
304
+ span.setStatus({ code: SpanStatusCode.ERROR });
305
+ }
306
+ thisRun.endTime = new Date();
307
+ if (!isDryRun) {
308
+ // Log the run to the server
309
+ const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
310
+ params: {
311
+ path: {
312
+ experiment_id: experimentId,
313
+ },
273
314
  },
274
- },
275
- body: {
276
- dataset_example_id: example.id,
277
- output: thisRun.output,
278
- repetition_number: 0,
279
- start_time: thisRun.startTime.toISOString(),
280
- end_time: thisRun.endTime.toISOString(),
281
- trace_id: thisRun.traceId,
282
- error: thisRun.error,
283
- },
284
- });
285
- // replace the local run id with the server-assigned id
286
- thisRun.id = res.data?.data.id ?? thisRun.id;
287
- }
288
- onComplete(thisRun);
289
- return thisRun;
315
+ body: {
316
+ dataset_example_id: example.id,
317
+ output: thisRun.output,
318
+ repetition_number: 0,
319
+ start_time: thisRun.startTime.toISOString(),
320
+ end_time: thisRun.endTime.toISOString(),
321
+ trace_id: thisRun.traceId,
322
+ error: thisRun.error,
323
+ },
324
+ });
325
+ // replace the local run id with the server-assigned id
326
+ thisRun.id = res.data?.data.id ?? thisRun.id;
327
+ const inputMimeType =
328
+ typeof example.input === "string" ? MimeType.TEXT : MimeType.JSON;
329
+ const outputMimeType =
330
+ typeof thisRun.output === "string" ? MimeType.TEXT : MimeType.JSON;
331
+ span.setStatus({ code: SpanStatusCode.OK });
332
+ span.setAttributes({
333
+ [SemanticConventions.OPENINFERENCE_SPAN_KIND]:
334
+ OpenInferenceSpanKind.CHAIN,
335
+ [SemanticConventions.INPUT_MIME_TYPE]: inputMimeType,
336
+ [SemanticConventions.INPUT_VALUE]: ensureString(example.input),
337
+ [SemanticConventions.OUTPUT_MIME_TYPE]: outputMimeType,
338
+ [SemanticConventions.OUTPUT_VALUE]: ensureString(thisRun.output),
339
+ });
340
+ }
341
+ span?.end();
342
+ onComplete(thisRun);
343
+ return thisRun;
344
+ });
290
345
  };
291
346
  const q = queue(run, concurrency);
292
347
  const examplesToUse = dataset.examples.slice(0, nExamples);
293
- examplesToUse.forEach((example) => q.push(example));
348
+ examplesToUse.forEach((example) =>
349
+ q.push(example, (err) => {
350
+ if (err) {
351
+ logger.error(
352
+ `Error running task "${task.name}" on example "${example.id}": ${err}`
353
+ );
354
+ }
355
+ })
356
+ );
294
357
  return q.drain();
295
358
  }
296
359
 
@@ -328,11 +391,29 @@ export async function evaluateExperiment({
328
391
  dryRun?: boolean | number;
329
392
  }): Promise<RanExperiment> {
330
393
  const isDryRun = typeof dryRun === "number" || dryRun === true;
394
+ const client = _client ?? createClient();
395
+ const baseUrl = client.config.baseUrl;
396
+ invariant(
397
+ baseUrl,
398
+ "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
399
+ );
400
+ let provider: NodeTracerProvider;
401
+ if (!isDryRun) {
402
+ provider = createProvider({
403
+ projectName: "evaluators",
404
+ baseUrl,
405
+ headers: client.config.headers ?? {},
406
+ });
407
+ } else {
408
+ provider = createNoOpProvider();
409
+ }
410
+ const tracer = isDryRun
411
+ ? provider.getTracer("no-op")
412
+ : provider.getTracer("evaluators");
331
413
  const nRuns =
332
414
  typeof dryRun === "number"
333
415
  ? Math.max(dryRun, Object.keys(experiment.runs).length)
334
416
  : Object.keys(experiment.runs).length;
335
- const client = _client ?? createClient();
336
417
  const dataset = await getDatasetBySelector({
337
418
  dataset: experiment.datasetId,
338
419
  client,
@@ -345,14 +426,12 @@ export async function evaluateExperiment({
345
426
  invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
346
427
 
347
428
  const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
348
-
349
429
  if (evaluators?.length === 0) {
350
430
  return {
351
431
  ...experiment,
352
432
  evaluationRuns: [],
353
433
  };
354
434
  }
355
-
356
435
  logger.info(
357
436
  `🧠 Evaluating experiment "${experiment.id}" with ${evaluators?.length ?? 0} ${pluralize(
358
437
  "evaluator",
@@ -381,40 +460,91 @@ export async function evaluateExperiment({
381
460
  );
382
461
  const evaluatorsQueue = queue(
383
462
  async (evaluatorAndRun: { evaluator: Evaluator; run: ExperimentRun }) => {
384
- const evalResult = await runEvaluator({
385
- evaluator: evaluatorAndRun.evaluator,
386
- run: evaluatorAndRun.run,
387
- exampleCache: examplesById,
388
- onComplete: onEvaluationComplete,
389
- });
390
- if (!isDryRun) {
391
- logger.info(`📝 Logging evaluation ${evalResult.id}`);
392
- // Log the evaluation to the server
393
- // We log this without awaiting (e.g. best effort)
394
- client.POST("/v1/experiment_evaluations", {
395
- body: {
396
- experiment_run_id: evaluatorAndRun.run.id,
397
- name: evaluatorAndRun.evaluator.name,
398
- annotator_kind: evaluatorAndRun.evaluator.kind,
399
- start_time: evalResult.startTime.toISOString(),
400
- end_time: evalResult.endTime.toISOString(),
401
- result: {
402
- ...evalResult.result,
403
- },
404
- error: evalResult.error,
405
- trace_id: evalResult.traceId,
406
- },
407
- });
408
- }
463
+ return tracer.startActiveSpan(
464
+ `Evaluation: ${evaluatorAndRun.evaluator.name}`,
465
+ async (span) => {
466
+ const evalResult = await runEvaluator({
467
+ evaluator: evaluatorAndRun.evaluator,
468
+ run: evaluatorAndRun.run,
469
+ exampleCache: examplesById,
470
+ onComplete: onEvaluationComplete,
471
+ logger,
472
+ });
473
+ span.setAttributes({
474
+ [SemanticConventions.OPENINFERENCE_SPAN_KIND]:
475
+ OpenInferenceSpanKind.EVALUATOR,
476
+ [SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
477
+ [SemanticConventions.INPUT_VALUE]: JSON.stringify({
478
+ input: examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
479
+ output: evaluatorAndRun.run.output,
480
+ expected:
481
+ examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
482
+ metadata:
483
+ examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
484
+ }),
485
+ [SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
486
+ [SemanticConventions.OUTPUT_VALUE]: ensureString(evalResult.result),
487
+ });
488
+ if (evalResult.error) {
489
+ span.setStatus({
490
+ code: SpanStatusCode.ERROR,
491
+ message: evalResult.error,
492
+ });
493
+ } else {
494
+ span.setStatus({ code: SpanStatusCode.OK });
495
+ }
496
+ if (evalResult.result) {
497
+ span.setAttributes(objectAsAttributes(evalResult.result));
498
+ }
499
+ evalResult.traceId = span.spanContext().traceId;
500
+ if (!isDryRun) {
501
+ // Log the evaluation to the server
502
+ // We log this without awaiting (i.e. best effort)
503
+ client.POST("/v1/experiment_evaluations", {
504
+ body: {
505
+ experiment_run_id: evaluatorAndRun.run.id,
506
+ name: evaluatorAndRun.evaluator.name,
507
+ annotator_kind: evaluatorAndRun.evaluator.kind,
508
+ start_time: evalResult.startTime.toISOString(),
509
+ end_time: evalResult.endTime.toISOString(),
510
+ result: {
511
+ ...evalResult.result,
512
+ },
513
+ error: evalResult.error,
514
+ trace_id: evalResult.traceId,
515
+ },
516
+ });
517
+ }
518
+ span.end();
519
+ return evalResult;
520
+ }
521
+ );
409
522
  },
410
523
  concurrency
411
524
  );
525
+ if (!evaluatorsAndRuns.length) {
526
+ logger.info(`⛔ No evaluators to run`);
527
+ return {
528
+ ...experiment,
529
+ evaluationRuns: [],
530
+ };
531
+ }
412
532
  evaluatorsAndRuns.forEach((evaluatorAndRun) =>
413
- evaluatorsQueue.push(evaluatorAndRun)
533
+ evaluatorsQueue.push(evaluatorAndRun, (err) => {
534
+ if (err) {
535
+ logger.error(
536
+ `❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`
537
+ );
538
+ }
539
+ })
414
540
  );
415
541
  await evaluatorsQueue.drain();
416
542
  logger.info(`✅ Evaluation runs completed`);
417
543
 
544
+ if (provider) {
545
+ await provider.shutdown?.();
546
+ }
547
+
418
548
  return {
419
549
  ...experiment,
420
550
  evaluationRuns: Object.values(evaluationRuns),
@@ -431,25 +561,30 @@ async function runEvaluator({
431
561
  run,
432
562
  exampleCache,
433
563
  onComplete,
564
+ logger,
434
565
  }: {
435
566
  evaluator: Evaluator;
436
567
  run: ExperimentRun;
437
568
  exampleCache: Record<string, Example>;
569
+ logger: Logger;
438
570
  onComplete: (run: ExperimentEvaluationRun) => void;
439
571
  }) {
440
572
  const example = exampleCache[run.datasetExampleId];
441
573
  invariant(example, `Example "${run.datasetExampleId}" not found`);
442
574
  const evaluate = async () => {
575
+ logger.info(
576
+ `🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`
577
+ );
443
578
  const thisEval: ExperimentEvaluationRun = {
444
- id: id(),
445
- traceId: null, // TODO: fill this in once we trace experiments
579
+ id: localId(),
580
+ traceId: null,
446
581
  experimentRunId: run.id,
447
582
  startTime: new Date(),
448
583
  endTime: new Date(), // will get replaced with actual end time
449
584
  name: evaluator.name,
450
585
  result: null,
451
586
  error: null,
452
- annotatorKind: "LLM", // TODO: make configurable via evaluator def
587
+ annotatorKind: evaluator.kind,
453
588
  };
454
589
  try {
455
590
  const result = await evaluator.evaluate({
@@ -459,8 +594,14 @@ async function runEvaluator({
459
594
  metadata: example.metadata,
460
595
  });
461
596
  thisEval.result = result;
597
+ logger.info(
598
+ `✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`
599
+ );
462
600
  } catch (error) {
463
601
  thisEval.error = error instanceof Error ? error.message : "Unknown error";
602
+ logger.error(
603
+ `❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`
604
+ );
464
605
  }
465
606
  thisEval.endTime = new Date();
466
607
  onComplete(thisEval);
@@ -495,17 +636,14 @@ export function asEvaluator({
495
636
  };
496
637
  }
497
638
 
498
- let _id = 1000;
639
+ let _localIdIndex = 1000;
499
640
 
500
641
  /**
501
- * Generate a unique id.
642
+ * Generate a local id.
502
643
  *
503
- * @deprecated Use id generated by phoenix instead.
504
- * @returns A unique id.
644
+ * @returns A semi-unique id.
505
645
  */
506
- export function id(): string {
507
- return (() => {
508
- _id++;
509
- return _id.toString();
510
- })();
646
+ function localId(): string {
647
+ _localIdIndex++;
648
+ return `local_${_localIdIndex}`;
511
649
  }
@@ -15,7 +15,6 @@ export interface Experiment extends Node {
15
15
  }
16
16
 
17
17
  export interface RanExperiment extends Experiment {
18
- params: ExperimentParameters;
19
18
  runs: Record<string, ExperimentRun>;
20
19
  evaluationRuns?: ExperimentEvaluationRun[];
21
20
  }
@@ -0,0 +1,14 @@
1
+ import { safelyStringifyJSON } from "./safelyStringifyJSON";
2
+
3
+ /**
4
+ * Ensures that a value is a string.
5
+ * If the value is not a string, it will be converted to a string using `safelyStringifyJSON`.
6
+ * @param value - The value to ensure is a string.
7
+ * @returns The value as a string.
8
+ */
9
+ export function ensureString(value: unknown): string {
10
+ if (typeof value === "string") {
11
+ return value;
12
+ }
13
+ return safelyStringifyJSON(value)?.json ?? "";
14
+ }
@@ -0,0 +1,9 @@
1
+ import { AttributeValue } from "@opentelemetry/api";
2
+
3
+ export function objectAsAttributes<T extends Record<string, unknown>>(
4
+ obj: T
5
+ ): Record<string, AttributeValue> {
6
+ return Object.fromEntries(
7
+ Object.entries(obj).filter(([_, value]) => value !== null)
8
+ ) as Record<string, AttributeValue>;
9
+ }