@arizeai/phoenix-client 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. package/README.md +118 -0
  2. package/dist/esm/client.d.ts +13 -1
  3. package/dist/esm/client.d.ts.map +1 -1
  4. package/dist/esm/client.js +4 -1
  5. package/dist/esm/client.js.map +1 -1
  6. package/dist/esm/datasets/appendDatasetExamples.d.ts +21 -0
  7. package/dist/esm/datasets/appendDatasetExamples.d.ts.map +1 -0
  8. package/dist/esm/datasets/appendDatasetExamples.js +32 -0
  9. package/dist/esm/datasets/appendDatasetExamples.js.map +1 -0
  10. package/dist/esm/datasets/createDataset.d.ts +25 -0
  11. package/dist/esm/datasets/createDataset.d.ts.map +1 -0
  12. package/dist/esm/datasets/createDataset.js +34 -0
  13. package/dist/esm/datasets/createDataset.js.map +1 -0
  14. package/dist/esm/datasets/getDataset.d.ts +10 -0
  15. package/dist/esm/datasets/getDataset.d.ts.map +1 -0
  16. package/dist/esm/datasets/getDataset.js +18 -0
  17. package/dist/esm/datasets/getDataset.js.map +1 -0
  18. package/dist/esm/datasets/getDatasetExamples.d.ts +10 -0
  19. package/dist/esm/datasets/getDatasetExamples.d.ts.map +1 -0
  20. package/dist/esm/datasets/getDatasetExamples.js +25 -0
  21. package/dist/esm/datasets/getDatasetExamples.js.map +1 -0
  22. package/dist/esm/datasets/getDatasetInfo.d.ts +11 -0
  23. package/dist/esm/datasets/getDatasetInfo.d.ts.map +1 -0
  24. package/dist/esm/datasets/getDatasetInfo.js +25 -0
  25. package/dist/esm/datasets/getDatasetInfo.js.map +1 -0
  26. package/dist/esm/datasets/index.d.ts +7 -0
  27. package/dist/esm/datasets/index.d.ts.map +1 -0
  28. package/dist/esm/datasets/index.js +7 -0
  29. package/dist/esm/datasets/index.js.map +1 -0
  30. package/dist/esm/datasets/listDatasets.d.ts +23 -0
  31. package/dist/esm/datasets/listDatasets.d.ts.map +1 -0
  32. package/dist/esm/datasets/listDatasets.js +26 -0
  33. package/dist/esm/datasets/listDatasets.js.map +1 -0
  34. package/dist/esm/experiments/getExperiment.d.ts +14 -0
  35. package/dist/esm/experiments/getExperiment.d.ts.map +1 -0
  36. package/dist/esm/experiments/getExperiment.js +25 -0
  37. package/dist/esm/experiments/getExperiment.js.map +1 -0
  38. package/dist/esm/experiments/getExperimentInfo.d.ts +13 -0
  39. package/dist/esm/experiments/getExperimentInfo.d.ts.map +1 -0
  40. package/dist/esm/experiments/getExperimentInfo.js +24 -0
  41. package/dist/esm/experiments/getExperimentInfo.js.map +1 -0
  42. package/dist/esm/experiments/getExperimentRuns.d.ts +15 -0
  43. package/dist/esm/experiments/getExperimentRuns.d.ts.map +1 -0
  44. package/dist/esm/experiments/getExperimentRuns.js +33 -0
  45. package/dist/esm/experiments/getExperimentRuns.js.map +1 -0
  46. package/dist/esm/experiments/index.d.ts +3 -0
  47. package/dist/esm/experiments/index.d.ts.map +1 -1
  48. package/dist/esm/experiments/index.js +3 -0
  49. package/dist/esm/experiments/index.js.map +1 -1
  50. package/dist/esm/experiments/instrumention.d.ts +18 -0
  51. package/dist/esm/experiments/instrumention.d.ts.map +1 -0
  52. package/dist/esm/experiments/instrumention.js +34 -0
  53. package/dist/esm/experiments/instrumention.js.map +1 -0
  54. package/dist/esm/experiments/runExperiment.d.ts +24 -21
  55. package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
  56. package/dist/esm/experiments/runExperiment.js +221 -108
  57. package/dist/esm/experiments/runExperiment.js.map +1 -1
  58. package/dist/esm/schemas/llm/anthropic/converters.d.ts +28 -28
  59. package/dist/esm/schemas/llm/anthropic/messagePartSchemas.d.ts +8 -8
  60. package/dist/esm/schemas/llm/anthropic/messageSchemas.d.ts +24 -24
  61. package/dist/esm/schemas/llm/anthropic/toolCallSchemas.d.ts +8 -8
  62. package/dist/esm/schemas/llm/constants.d.ts +6 -6
  63. package/dist/esm/schemas/llm/converters.d.ts +24 -24
  64. package/dist/esm/schemas/llm/openai/converters.d.ts +6 -6
  65. package/dist/esm/schemas/llm/schemas.d.ts +22 -22
  66. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  67. package/dist/esm/types/datasets.d.ts +33 -8
  68. package/dist/esm/types/datasets.d.ts.map +1 -1
  69. package/dist/esm/types/experiments.d.ts +17 -4
  70. package/dist/esm/types/experiments.d.ts.map +1 -1
  71. package/dist/esm/utils/ensureString.d.ts +8 -0
  72. package/dist/esm/utils/ensureString.d.ts.map +1 -0
  73. package/dist/esm/utils/ensureString.js +14 -0
  74. package/dist/esm/utils/ensureString.js.map +1 -0
  75. package/dist/esm/utils/objectAsAttributes.d.ts +3 -0
  76. package/dist/esm/utils/objectAsAttributes.d.ts.map +1 -0
  77. package/dist/esm/utils/objectAsAttributes.js +4 -0
  78. package/dist/esm/utils/objectAsAttributes.js.map +1 -0
  79. package/dist/src/client.d.ts +13 -1
  80. package/dist/src/client.d.ts.map +1 -1
  81. package/dist/src/client.js +1 -1
  82. package/dist/src/client.js.map +1 -1
  83. package/dist/src/datasets/appendDatasetExamples.d.ts +21 -0
  84. package/dist/src/datasets/appendDatasetExamples.d.ts.map +1 -0
  85. package/dist/src/datasets/appendDatasetExamples.js +50 -0
  86. package/dist/src/datasets/appendDatasetExamples.js.map +1 -0
  87. package/dist/src/datasets/createDataset.d.ts +25 -0
  88. package/dist/src/datasets/createDataset.d.ts.map +1 -0
  89. package/dist/src/datasets/createDataset.js +52 -0
  90. package/dist/src/datasets/createDataset.js.map +1 -0
  91. package/dist/src/datasets/getDataset.d.ts +10 -0
  92. package/dist/src/datasets/getDataset.d.ts.map +1 -0
  93. package/dist/src/datasets/getDataset.js +29 -0
  94. package/dist/src/datasets/getDataset.js.map +1 -0
  95. package/dist/src/datasets/getDatasetExamples.d.ts +10 -0
  96. package/dist/src/datasets/getDatasetExamples.d.ts.map +1 -0
  97. package/dist/src/datasets/getDatasetExamples.js +40 -0
  98. package/dist/src/datasets/getDatasetExamples.js.map +1 -0
  99. package/dist/src/datasets/getDatasetInfo.d.ts +11 -0
  100. package/dist/src/datasets/getDatasetInfo.d.ts.map +1 -0
  101. package/dist/src/datasets/getDatasetInfo.js +43 -0
  102. package/dist/src/datasets/getDatasetInfo.js.map +1 -0
  103. package/dist/src/datasets/index.d.ts +7 -0
  104. package/dist/src/datasets/index.d.ts.map +1 -0
  105. package/dist/src/datasets/index.js +23 -0
  106. package/dist/src/datasets/index.js.map +1 -0
  107. package/dist/src/datasets/listDatasets.d.ts +23 -0
  108. package/dist/src/datasets/listDatasets.d.ts.map +1 -0
  109. package/dist/src/datasets/listDatasets.js +40 -0
  110. package/dist/src/datasets/listDatasets.js.map +1 -0
  111. package/dist/src/experiments/getExperiment.d.ts +14 -0
  112. package/dist/src/experiments/getExperiment.d.ts.map +1 -0
  113. package/dist/src/experiments/getExperiment.js +36 -0
  114. package/dist/src/experiments/getExperiment.js.map +1 -0
  115. package/dist/src/experiments/getExperimentInfo.d.ts +13 -0
  116. package/dist/src/experiments/getExperimentInfo.d.ts.map +1 -0
  117. package/dist/src/experiments/getExperimentInfo.js +41 -0
  118. package/dist/src/experiments/getExperimentInfo.js.map +1 -0
  119. package/dist/src/experiments/getExperimentRuns.d.ts +15 -0
  120. package/dist/src/experiments/getExperimentRuns.d.ts.map +1 -0
  121. package/dist/src/experiments/getExperimentRuns.js +50 -0
  122. package/dist/src/experiments/getExperimentRuns.js.map +1 -0
  123. package/dist/src/experiments/index.d.ts +3 -0
  124. package/dist/src/experiments/index.d.ts.map +1 -1
  125. package/dist/src/experiments/index.js +3 -0
  126. package/dist/src/experiments/index.js.map +1 -1
  127. package/dist/src/experiments/instrumention.d.ts +18 -0
  128. package/dist/src/experiments/instrumention.d.ts.map +1 -0
  129. package/dist/src/experiments/instrumention.js +38 -0
  130. package/dist/src/experiments/instrumention.js.map +1 -0
  131. package/dist/src/experiments/runExperiment.d.ts +24 -21
  132. package/dist/src/experiments/runExperiment.d.ts.map +1 -1
  133. package/dist/src/experiments/runExperiment.js +222 -111
  134. package/dist/src/experiments/runExperiment.js.map +1 -1
  135. package/dist/src/schemas/llm/anthropic/converters.d.ts +28 -28
  136. package/dist/src/schemas/llm/anthropic/messagePartSchemas.d.ts +8 -8
  137. package/dist/src/schemas/llm/anthropic/messageSchemas.d.ts +24 -24
  138. package/dist/src/schemas/llm/anthropic/toolCallSchemas.d.ts +8 -8
  139. package/dist/src/schemas/llm/constants.d.ts +6 -6
  140. package/dist/src/schemas/llm/converters.d.ts +24 -24
  141. package/dist/src/schemas/llm/openai/converters.d.ts +6 -6
  142. package/dist/src/schemas/llm/schemas.d.ts +22 -22
  143. package/dist/src/types/datasets.d.ts +33 -8
  144. package/dist/src/types/datasets.d.ts.map +1 -1
  145. package/dist/src/types/experiments.d.ts +17 -4
  146. package/dist/src/types/experiments.d.ts.map +1 -1
  147. package/dist/src/utils/ensureString.d.ts +8 -0
  148. package/dist/src/utils/ensureString.d.ts.map +1 -0
  149. package/dist/src/utils/ensureString.js +18 -0
  150. package/dist/src/utils/ensureString.js.map +1 -0
  151. package/dist/src/utils/objectAsAttributes.d.ts +3 -0
  152. package/dist/src/utils/objectAsAttributes.d.ts.map +1 -0
  153. package/dist/src/utils/objectAsAttributes.js +7 -0
  154. package/dist/src/utils/objectAsAttributes.js.map +1 -0
  155. package/dist/tsconfig.tsbuildinfo +1 -1
  156. package/package.json +13 -1
  157. package/src/client.ts +4 -1
  158. package/src/datasets/appendDatasetExamples.ts +55 -0
  159. package/src/datasets/createDataset.ts +60 -0
  160. package/src/datasets/getDataset.ts +27 -0
  161. package/src/datasets/getDatasetExamples.ts +34 -0
  162. package/src/datasets/getDatasetInfo.ts +34 -0
  163. package/src/datasets/index.ts +6 -0
  164. package/src/datasets/listDatasets.ts +37 -0
  165. package/src/experiments/getExperiment.ts +40 -0
  166. package/src/experiments/getExperimentInfo.ts +39 -0
  167. package/src/experiments/getExperimentRuns.ts +45 -0
  168. package/src/experiments/index.ts +3 -0
  169. package/src/experiments/instrumention.ts +52 -0
  170. package/src/experiments/runExperiment.ts +277 -133
  171. package/src/types/datasets.ts +35 -9
  172. package/src/types/experiments.ts +19 -4
  173. package/src/utils/ensureString.ts +14 -0
  174. package/src/utils/objectAsAttributes.ts +9 -0
  175. package/dist/esm/utils/getDatasetBySelector.d.ts +0 -25
  176. package/dist/esm/utils/getDatasetBySelector.d.ts.map +0 -1
  177. package/dist/esm/utils/getDatasetBySelector.js +0 -37
  178. package/dist/esm/utils/getDatasetBySelector.js.map +0 -1
  179. package/dist/src/utils/getDatasetBySelector.d.ts +0 -25
  180. package/dist/src/utils/getDatasetBySelector.d.ts.map +0 -1
  181. package/dist/src/utils/getDatasetBySelector.js +0 -47
  182. package/dist/src/utils/getDatasetBySelector.js.map +0 -1
  183. package/src/utils/getDatasetBySelector.ts +0 -55
@@ -2,21 +2,36 @@ import { queue } from "async";
2
2
  import invariant from "tiny-invariant";
3
3
  import { createClient, type PhoenixClient } from "../client";
4
4
  import { ClientFn } from "../types/core";
5
- import { Dataset, Example } from "../types/datasets";
5
+ import {
6
+ Dataset,
7
+ DatasetSelector,
8
+ Example,
9
+ ExampleWithId,
10
+ } from "../types/datasets";
6
11
  import type {
7
12
  Evaluator,
8
- Experiment,
13
+ ExperimentInfo,
9
14
  ExperimentEvaluationRun,
10
- ExperimentParameters,
11
15
  ExperimentRun,
16
+ ExperimentRunID,
12
17
  ExperimentTask,
13
18
  RanExperiment,
14
19
  } from "../types/experiments";
15
20
  import { type Logger } from "../types/logger";
16
- import { getDatasetBySelector } from "../utils/getDatasetBySelector";
21
+ import { getDataset } from "../datasets/getDataset";
17
22
  import { pluralize } from "../utils/pluralize";
18
23
  import { promisifyResult } from "../utils/promisifyResult";
19
24
  import { AnnotatorKind } from "../types/annotations";
25
+ import { createProvider, createNoOpProvider } from "./instrumention";
26
+ import { SpanStatusCode, Tracer } from "@opentelemetry/api";
27
+ import {
28
+ MimeType,
29
+ OpenInferenceSpanKind,
30
+ SemanticConventions,
31
+ } from "@arizeai/openinference-semantic-conventions";
32
+ import { ensureString } from "../utils/ensureString";
33
+ import type { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
34
+ import { objectAsAttributes } from "../utils/objectAsAttributes";
20
35
 
21
36
  /**
22
37
  * Parameters for running an experiment.
@@ -35,12 +50,13 @@ export type RunExperimentParams = ClientFn & {
35
50
  experimentDescription?: string;
36
51
  /**
37
52
  * Experiment metadata
53
+ * E.x. modelName
38
54
  */
39
55
  experimentMetadata?: Record<string, unknown>;
40
56
  /**
41
57
  * The dataset to run the experiment on
42
58
  */
43
- dataset: Dataset | string | Example[];
59
+ dataset: DatasetSelector;
44
60
  /**
45
61
  * The task to run
46
62
  */
@@ -49,10 +65,6 @@ export type RunExperimentParams = ClientFn & {
49
65
  * The evaluators to use
50
66
  */
51
67
  evaluators?: Evaluator[];
52
- /**
53
- * The project under which the experiment task traces are recorded
54
- */
55
- projectName?: string;
56
68
  /**
57
69
  * The logger to use
58
70
  */
@@ -73,7 +85,23 @@ export type RunExperimentParams = ClientFn & {
73
85
  };
74
86
 
75
87
  /**
76
- * Run an experiment.
88
+ * Runs an experiment using a given set of dataset of examples.
89
+ *
90
+ * An experiment is a user-defined task that runs on each example in a dataset. The results from
91
+ * each experiment can be evaluated using any number of evaluators to measure the behavior of the
92
+ * task. The experiment and evaluation results are stored in the Phoenix database for comparison
93
+ * and analysis.
94
+ *
95
+ * A `task` is either a sync or async function that returns a JSON serializable
96
+ * output. If the `task` is a function of one argument then that argument will be bound to the
97
+ * `input` field of the dataset example. Alternatively, the `task` can be a function of any
98
+ * combination of specific argument names that will be bound to special values:
99
+ *
100
+ * - `input`: The input field of the dataset example
101
+ * - `expected`: The expected or reference output of the dataset example
102
+ * - `reference`: An alias for `expected`
103
+ * - `metadata`: Metadata associated with the dataset example
104
+ * - `example`: The dataset `Example` object with all associated fields
77
105
  *
78
106
  * @example
79
107
  * ```ts
@@ -83,50 +111,48 @@ export type RunExperimentParams = ClientFn & {
83
111
  * dataset: "my-dataset",
84
112
  * task: async (example) => example.input,
85
113
  * evaluators: [
86
- * asEvaluator("my-evaluator", "CODE", async (params) => params.output),
114
+ * asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
87
115
  * ],
88
116
  * });
89
117
  * ```
90
- *
91
- * @experimental This feature is not complete, and will change in the future.
92
118
  */
93
119
  export async function runExperiment({
94
- experimentName: _experimentName,
120
+ experimentName,
95
121
  experimentDescription,
96
- experimentMetadata,
122
+ experimentMetadata = {},
97
123
  client: _client,
98
- dataset: _dataset,
124
+ dataset: DatasetSelector,
99
125
  task,
100
126
  evaluators,
101
- projectName = "default",
102
127
  logger = console,
103
128
  record = true,
104
129
  concurrency = 5,
105
130
  dryRun = false,
106
131
  }: RunExperimentParams): Promise<RanExperiment> {
132
+ let provider: NodeTracerProvider | undefined;
107
133
  const isDryRun = typeof dryRun === "number" || dryRun === true;
108
134
  const client = _client ?? createClient();
109
- const dataset = await getDatasetBySelector({ dataset: _dataset, client });
135
+ const dataset = await getDataset({ dataset: DatasetSelector, client });
110
136
  invariant(dataset, `Dataset not found`);
111
137
  invariant(dataset.examples.length > 0, `Dataset has no examples`);
112
138
  const nExamples =
113
139
  typeof dryRun === "number"
114
- ? Math.max(dryRun, dataset.examples.length)
140
+ ? Math.min(dryRun, dataset.examples.length)
115
141
  : dataset.examples.length;
116
142
 
117
- const experimentName =
118
- _experimentName ?? `${dataset.name}-${new Date().toISOString()}`;
119
- const experimentParams: ExperimentParameters = {
120
- nExamples,
121
- };
122
- let experiment: Experiment;
143
+ let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
144
+ // initialize the tracer into scope
145
+ let taskTracer: Tracer;
146
+ let experiment: ExperimentInfo;
123
147
  if (isDryRun) {
124
148
  experiment = {
125
- id: id(),
149
+ id: localId(),
126
150
  datasetId: dataset.id,
127
151
  datasetVersionId: dataset.versionId,
128
152
  projectName,
153
+ metadata: experimentMetadata,
129
154
  };
155
+ taskTracer = createNoOpProvider().getTracer("no-op");
130
156
  } else {
131
157
  const experimentResponse = await client
132
158
  .POST("/v1/datasets/{dataset_id}/experiments", {
@@ -144,14 +170,27 @@ export async function runExperiment({
144
170
  })
145
171
  .then((res) => res.data?.data);
146
172
  invariant(experimentResponse, `Failed to create experiment`);
173
+ projectName = experimentResponse.project_name ?? projectName;
147
174
  experiment = {
148
175
  id: experimentResponse.id,
149
- datasetId: dataset.id,
150
- datasetVersionId: dataset.versionId,
176
+ datasetId: experimentResponse.dataset_id,
177
+ datasetVersionId: experimentResponse.dataset_version_id,
151
178
  projectName,
179
+ metadata: experimentResponse.metadata,
152
180
  };
181
+ // Initialize the tracer, now that we have a project name
182
+ const baseUrl = client.config.baseUrl;
183
+ invariant(
184
+ baseUrl,
185
+ "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
186
+ );
187
+ provider = createProvider({
188
+ projectName,
189
+ baseUrl,
190
+ headers: client.config.headers ?? {},
191
+ });
192
+ taskTracer = provider.getTracer(projectName);
153
193
  }
154
-
155
194
  if (!record) {
156
195
  logger.info(
157
196
  `🔧 Running experiment in readonly mode. Results will not be recorded.`
@@ -159,16 +198,14 @@ export async function runExperiment({
159
198
  }
160
199
 
161
200
  logger.info(
162
- `🧪 Starting experiment "${experimentName}" on dataset "${dataset.id}" with task "${task.name}" and ${evaluators?.length ?? 0} ${pluralize(
201
+ `🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${evaluators?.length ?? 0} ${pluralize(
163
202
  "evaluator",
164
203
  evaluators?.length ?? 0
165
204
  )} and ${concurrency} concurrent runs`
166
205
  );
167
206
 
168
- // Run task against all examples, for each repetition
169
- type ExperimentRunId = string;
170
- const runs: Record<ExperimentRunId, ExperimentRun> = {};
171
- await runTask({
207
+ const runs: Record<ExperimentRunID, ExperimentRun> = {};
208
+ await runTaskWithExamples({
172
209
  client,
173
210
  experimentId: experiment.id,
174
211
  task,
@@ -180,15 +217,20 @@ export async function runExperiment({
180
217
  concurrency,
181
218
  isDryRun,
182
219
  nExamples,
220
+ tracer: taskTracer,
183
221
  });
184
222
  logger.info(`✅ Task runs completed`);
185
223
 
186
224
  const ranExperiment: RanExperiment = {
187
225
  ...experiment,
188
- params: experimentParams,
189
226
  runs,
190
227
  };
191
228
 
229
+ // Shut down the provider so that the experiments run
230
+ if (provider) {
231
+ await provider.shutdown?.();
232
+ }
233
+
192
234
  const { evaluationRuns } = await evaluateExperiment({
193
235
  experiment: ranExperiment,
194
236
  evaluators: evaluators ?? [],
@@ -207,7 +249,7 @@ export async function runExperiment({
207
249
  /**
208
250
  * Run a task against n examples in a dataset.
209
251
  */
210
- function runTask({
252
+ function runTaskWithExamples({
211
253
  client,
212
254
  experimentId,
213
255
  task,
@@ -217,6 +259,7 @@ function runTask({
217
259
  concurrency = 5,
218
260
  isDryRun,
219
261
  nExamples,
262
+ tracer,
220
263
  }: {
221
264
  /** The client to use */
222
265
  client: PhoenixClient;
@@ -236,61 +279,88 @@ function runTask({
236
279
  isDryRun: boolean;
237
280
  /** The number of examples to run */
238
281
  nExamples: number;
239
- }) {
282
+ /** TraceProvider instance that will be used to create spans from task calls */
283
+ tracer: Tracer;
284
+ }): Promise<void> {
240
285
  logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
241
- const run = async (example: Example) => {
242
- logger.info(
243
- `🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`
244
- );
245
- const thisRun: ExperimentRun = {
246
- id: id(),
247
- traceId: null, // TODO: fill this in once we trace experiments
248
- experimentId,
249
- datasetExampleId: example.id,
250
- startTime: new Date(),
251
- endTime: new Date(), // will get replaced with actual end time
252
- output: null,
253
- error: null,
254
- };
255
- try {
256
- const taskOutput = await promisifyResult(task(example));
257
- // TODO: why doesn't run output type match task output type?
258
- thisRun.output =
259
- typeof taskOutput === "string"
260
- ? taskOutput
261
- : JSON.stringify(taskOutput);
262
- } catch (error) {
263
- thisRun.error = error instanceof Error ? error.message : "Unknown error";
264
- }
265
- thisRun.endTime = new Date();
266
- if (!isDryRun) {
267
- // Log the run to the server
268
- // We log this without awaiting (e.g. best effort)
269
- const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
270
- params: {
271
- path: {
272
- experiment_id: experimentId,
286
+ const run = async (example: ExampleWithId) => {
287
+ return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
288
+ logger.info(
289
+ `🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`
290
+ );
291
+ const traceId = span.spanContext().traceId;
292
+ const thisRun: ExperimentRun = {
293
+ id: localId(), // initialized with local id, will be replaced with server-assigned id when dry run is false
294
+ traceId,
295
+ experimentId,
296
+ datasetExampleId: example.id,
297
+ startTime: new Date(),
298
+ endTime: new Date(), // will get replaced with actual end time
299
+ output: null,
300
+ error: null,
301
+ };
302
+ try {
303
+ const taskOutput = await promisifyResult(task(example));
304
+ thisRun.output =
305
+ typeof taskOutput === "string"
306
+ ? taskOutput
307
+ : JSON.stringify(taskOutput);
308
+ } catch (error) {
309
+ thisRun.error =
310
+ error instanceof Error ? error.message : "Unknown error";
311
+ span.setStatus({ code: SpanStatusCode.ERROR });
312
+ }
313
+ thisRun.endTime = new Date();
314
+ if (!isDryRun) {
315
+ // Log the run to the server
316
+ const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
317
+ params: {
318
+ path: {
319
+ experiment_id: experimentId,
320
+ },
273
321
  },
274
- },
275
- body: {
276
- dataset_example_id: example.id,
277
- output: thisRun.output,
278
- repetition_number: 0,
279
- start_time: thisRun.startTime.toISOString(),
280
- end_time: thisRun.endTime.toISOString(),
281
- trace_id: thisRun.traceId,
282
- error: thisRun.error,
283
- },
284
- });
285
- // replace the local run id with the server-assigned id
286
- thisRun.id = res.data?.data.id ?? thisRun.id;
287
- }
288
- onComplete(thisRun);
289
- return thisRun;
322
+ body: {
323
+ dataset_example_id: example.id,
324
+ output: thisRun.output,
325
+ repetition_number: 0,
326
+ start_time: thisRun.startTime.toISOString(),
327
+ end_time: thisRun.endTime.toISOString(),
328
+ trace_id: thisRun.traceId,
329
+ error: thisRun.error,
330
+ },
331
+ });
332
+ // replace the local run id with the server-assigned id
333
+ thisRun.id = res.data?.data.id ?? thisRun.id;
334
+ const inputMimeType =
335
+ typeof example.input === "string" ? MimeType.TEXT : MimeType.JSON;
336
+ const outputMimeType =
337
+ typeof thisRun.output === "string" ? MimeType.TEXT : MimeType.JSON;
338
+ span.setStatus({ code: SpanStatusCode.OK });
339
+ span.setAttributes({
340
+ [SemanticConventions.OPENINFERENCE_SPAN_KIND]:
341
+ OpenInferenceSpanKind.CHAIN,
342
+ [SemanticConventions.INPUT_MIME_TYPE]: inputMimeType,
343
+ [SemanticConventions.INPUT_VALUE]: ensureString(example.input),
344
+ [SemanticConventions.OUTPUT_MIME_TYPE]: outputMimeType,
345
+ [SemanticConventions.OUTPUT_VALUE]: ensureString(thisRun.output),
346
+ });
347
+ }
348
+ span?.end();
349
+ onComplete(thisRun);
350
+ return thisRun;
351
+ });
290
352
  };
291
353
  const q = queue(run, concurrency);
292
354
  const examplesToUse = dataset.examples.slice(0, nExamples);
293
- examplesToUse.forEach((example) => q.push(example));
355
+ examplesToUse.forEach((example) =>
356
+ q.push(example, (err) => {
357
+ if (err) {
358
+ logger.error(
359
+ `Error running task "${task.name}" on example "${example.id}": ${err}`
360
+ );
361
+ }
362
+ })
363
+ );
294
364
  return q.drain();
295
365
  }
296
366
 
@@ -303,13 +373,12 @@ export async function evaluateExperiment({
303
373
  experiment,
304
374
  evaluators,
305
375
  client: _client,
306
- logger,
376
+ logger = console,
307
377
  concurrency = 5,
308
378
  dryRun = false,
309
379
  }: {
310
380
  /**
311
381
  * The experiment to evaluate
312
- * @todo also accept Experiment, and attempt to fetch the runs from the server
313
382
  **/
314
383
  experiment: RanExperiment;
315
384
  /** The evaluators to use */
@@ -317,9 +386,9 @@ export async function evaluateExperiment({
317
386
  /** The client to use */
318
387
  client?: PhoenixClient;
319
388
  /** The logger to use */
320
- logger: Logger;
389
+ logger?: Logger;
321
390
  /** The number of evaluators to run in parallel */
322
- concurrency: number;
391
+ concurrency?: number;
323
392
  /**
324
393
  * Whether to run the evaluation as a dry run
325
394
  * If a number is provided, the evaluation will be run for the first n runs
@@ -328,13 +397,31 @@ export async function evaluateExperiment({
328
397
  dryRun?: boolean | number;
329
398
  }): Promise<RanExperiment> {
330
399
  const isDryRun = typeof dryRun === "number" || dryRun === true;
400
+ const client = _client ?? createClient();
401
+ const baseUrl = client.config.baseUrl;
402
+ invariant(
403
+ baseUrl,
404
+ "Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
405
+ );
406
+ let provider: NodeTracerProvider;
407
+ if (!isDryRun) {
408
+ provider = createProvider({
409
+ projectName: "evaluators",
410
+ baseUrl,
411
+ headers: client.config.headers ?? {},
412
+ });
413
+ } else {
414
+ provider = createNoOpProvider();
415
+ }
416
+ const tracer = isDryRun
417
+ ? provider.getTracer("no-op")
418
+ : provider.getTracer("evaluators");
331
419
  const nRuns =
332
420
  typeof dryRun === "number"
333
421
  ? Math.max(dryRun, Object.keys(experiment.runs).length)
334
422
  : Object.keys(experiment.runs).length;
335
- const client = _client ?? createClient();
336
- const dataset = await getDatasetBySelector({
337
- dataset: experiment.datasetId,
423
+ const dataset = await getDataset({
424
+ dataset: { datasetId: experiment.datasetId },
338
425
  client,
339
426
  });
340
427
  invariant(dataset, `Dataset "${experiment.datasetId}" not found`);
@@ -345,14 +432,12 @@ export async function evaluateExperiment({
345
432
  invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
346
433
 
347
434
  const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
348
-
349
435
  if (evaluators?.length === 0) {
350
436
  return {
351
437
  ...experiment,
352
438
  evaluationRuns: [],
353
439
  };
354
440
  }
355
-
356
441
  logger.info(
357
442
  `🧠 Evaluating experiment "${experiment.id}" with ${evaluators?.length ?? 0} ${pluralize(
358
443
  "evaluator",
@@ -381,40 +466,91 @@ export async function evaluateExperiment({
381
466
  );
382
467
  const evaluatorsQueue = queue(
383
468
  async (evaluatorAndRun: { evaluator: Evaluator; run: ExperimentRun }) => {
384
- const evalResult = await runEvaluator({
385
- evaluator: evaluatorAndRun.evaluator,
386
- run: evaluatorAndRun.run,
387
- exampleCache: examplesById,
388
- onComplete: onEvaluationComplete,
389
- });
390
- if (!isDryRun) {
391
- logger.info(`📝 Logging evaluation ${evalResult.id}`);
392
- // Log the evaluation to the server
393
- // We log this without awaiting (e.g. best effort)
394
- client.POST("/v1/experiment_evaluations", {
395
- body: {
396
- experiment_run_id: evaluatorAndRun.run.id,
397
- name: evaluatorAndRun.evaluator.name,
398
- annotator_kind: evaluatorAndRun.evaluator.kind,
399
- start_time: evalResult.startTime.toISOString(),
400
- end_time: evalResult.endTime.toISOString(),
401
- result: {
402
- ...evalResult.result,
403
- },
404
- error: evalResult.error,
405
- trace_id: evalResult.traceId,
406
- },
407
- });
408
- }
469
+ return tracer.startActiveSpan(
470
+ `Evaluation: ${evaluatorAndRun.evaluator.name}`,
471
+ async (span) => {
472
+ const evalResult = await runEvaluator({
473
+ evaluator: evaluatorAndRun.evaluator,
474
+ run: evaluatorAndRun.run,
475
+ exampleCache: examplesById,
476
+ onComplete: onEvaluationComplete,
477
+ logger,
478
+ });
479
+ span.setAttributes({
480
+ [SemanticConventions.OPENINFERENCE_SPAN_KIND]:
481
+ OpenInferenceSpanKind.EVALUATOR,
482
+ [SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
483
+ [SemanticConventions.INPUT_VALUE]: JSON.stringify({
484
+ input: examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
485
+ output: evaluatorAndRun.run.output,
486
+ expected:
487
+ examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
488
+ metadata:
489
+ examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
490
+ }),
491
+ [SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
492
+ [SemanticConventions.OUTPUT_VALUE]: ensureString(evalResult.result),
493
+ });
494
+ if (evalResult.error) {
495
+ span.setStatus({
496
+ code: SpanStatusCode.ERROR,
497
+ message: evalResult.error,
498
+ });
499
+ } else {
500
+ span.setStatus({ code: SpanStatusCode.OK });
501
+ }
502
+ if (evalResult.result) {
503
+ span.setAttributes(objectAsAttributes(evalResult.result));
504
+ }
505
+ evalResult.traceId = span.spanContext().traceId;
506
+ if (!isDryRun) {
507
+ // Log the evaluation to the server
508
+ // We log this without awaiting (e.g. best effort)
509
+ client.POST("/v1/experiment_evaluations", {
510
+ body: {
511
+ experiment_run_id: evaluatorAndRun.run.id,
512
+ name: evaluatorAndRun.evaluator.name,
513
+ annotator_kind: evaluatorAndRun.evaluator.kind,
514
+ start_time: evalResult.startTime.toISOString(),
515
+ end_time: evalResult.endTime.toISOString(),
516
+ result: {
517
+ ...evalResult.result,
518
+ },
519
+ error: evalResult.error,
520
+ trace_id: evalResult.traceId,
521
+ },
522
+ });
523
+ }
524
+ span.end();
525
+ return evalResult;
526
+ }
527
+ );
409
528
  },
410
529
  concurrency
411
530
  );
531
+ if (!evaluatorsAndRuns.length) {
532
+ logger.info(`⛔ No evaluators to run`);
533
+ return {
534
+ ...experiment,
535
+ evaluationRuns: [],
536
+ };
537
+ }
412
538
  evaluatorsAndRuns.forEach((evaluatorAndRun) =>
413
- evaluatorsQueue.push(evaluatorAndRun)
539
+ evaluatorsQueue.push(evaluatorAndRun, (err) => {
540
+ if (err) {
541
+ logger.error(
542
+ `❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`
543
+ );
544
+ }
545
+ })
414
546
  );
415
547
  await evaluatorsQueue.drain();
416
548
  logger.info(`✅ Evaluation runs completed`);
417
549
 
550
+ if (provider) {
551
+ await provider.shutdown?.();
552
+ }
553
+
418
554
  return {
419
555
  ...experiment,
420
556
  evaluationRuns: Object.values(evaluationRuns),
@@ -431,25 +567,30 @@ async function runEvaluator({
431
567
  run,
432
568
  exampleCache,
433
569
  onComplete,
570
+ logger,
434
571
  }: {
435
572
  evaluator: Evaluator;
436
573
  run: ExperimentRun;
437
574
  exampleCache: Record<string, Example>;
575
+ logger: Logger;
438
576
  onComplete: (run: ExperimentEvaluationRun) => void;
439
577
  }) {
440
578
  const example = exampleCache[run.datasetExampleId];
441
579
  invariant(example, `Example "${run.datasetExampleId}" not found`);
442
580
  const evaluate = async () => {
581
+ logger.info(
582
+ `🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`
583
+ );
443
584
  const thisEval: ExperimentEvaluationRun = {
444
- id: id(),
445
- traceId: null, // TODO: fill this in once we trace experiments
585
+ id: localId(),
586
+ traceId: null,
446
587
  experimentRunId: run.id,
447
588
  startTime: new Date(),
448
589
  endTime: new Date(), // will get replaced with actual end time
449
590
  name: evaluator.name,
450
591
  result: null,
451
592
  error: null,
452
- annotatorKind: "LLM", // TODO: make configurable via evaluator def
593
+ annotatorKind: evaluator.kind,
453
594
  };
454
595
  try {
455
596
  const result = await evaluator.evaluate({
@@ -459,8 +600,14 @@ async function runEvaluator({
459
600
  metadata: example.metadata,
460
601
  });
461
602
  thisEval.result = result;
603
+ logger.info(
604
+ `✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`
605
+ );
462
606
  } catch (error) {
463
607
  thisEval.error = error instanceof Error ? error.message : "Unknown error";
608
+ logger.error(
609
+ `❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`
610
+ );
464
611
  }
465
612
  thisEval.endTime = new Date();
466
613
  onComplete(thisEval);
@@ -495,17 +642,14 @@ export function asEvaluator({
495
642
  };
496
643
  }
497
644
 
498
- let _id = 1000;
645
+ let _localIdIndex = 1000;
499
646
 
500
647
  /**
501
- * Generate a unique id.
648
+ * Generate a local id.
502
649
  *
503
- * @deprecated Use id generated by phoenix instead.
504
- * @returns A unique id.
650
+ * @returns A semi-unique id.
505
651
  */
506
- export function id(): string {
507
- return (() => {
508
- _id++;
509
- return _id.toString();
510
- })();
652
+ function localId(): string {
653
+ _localIdIndex++;
654
+ return `local_${_localIdIndex}`;
511
655
  }