@arizeai/phoenix-client 1.2.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -0
- package/dist/esm/client.d.ts +13 -1
- package/dist/esm/client.d.ts.map +1 -1
- package/dist/esm/client.js +4 -1
- package/dist/esm/client.js.map +1 -1
- package/dist/esm/datasets/appendDatasetExamples.d.ts +21 -0
- package/dist/esm/datasets/appendDatasetExamples.d.ts.map +1 -0
- package/dist/esm/datasets/appendDatasetExamples.js +32 -0
- package/dist/esm/datasets/appendDatasetExamples.js.map +1 -0
- package/dist/esm/datasets/createDataset.d.ts +25 -0
- package/dist/esm/datasets/createDataset.d.ts.map +1 -0
- package/dist/esm/datasets/createDataset.js +34 -0
- package/dist/esm/datasets/createDataset.js.map +1 -0
- package/dist/esm/datasets/getDataset.d.ts +10 -0
- package/dist/esm/datasets/getDataset.d.ts.map +1 -0
- package/dist/esm/datasets/getDataset.js +18 -0
- package/dist/esm/datasets/getDataset.js.map +1 -0
- package/dist/esm/datasets/getDatasetExamples.d.ts +10 -0
- package/dist/esm/datasets/getDatasetExamples.d.ts.map +1 -0
- package/dist/esm/datasets/getDatasetExamples.js +25 -0
- package/dist/esm/datasets/getDatasetExamples.js.map +1 -0
- package/dist/esm/datasets/getDatasetInfo.d.ts +11 -0
- package/dist/esm/datasets/getDatasetInfo.d.ts.map +1 -0
- package/dist/esm/datasets/getDatasetInfo.js +25 -0
- package/dist/esm/datasets/getDatasetInfo.js.map +1 -0
- package/dist/esm/datasets/index.d.ts +7 -0
- package/dist/esm/datasets/index.d.ts.map +1 -0
- package/dist/esm/datasets/index.js +7 -0
- package/dist/esm/datasets/index.js.map +1 -0
- package/dist/esm/datasets/listDatasets.d.ts +23 -0
- package/dist/esm/datasets/listDatasets.d.ts.map +1 -0
- package/dist/esm/datasets/listDatasets.js +26 -0
- package/dist/esm/datasets/listDatasets.js.map +1 -0
- package/dist/esm/experiments/getExperiment.d.ts +14 -0
- package/dist/esm/experiments/getExperiment.d.ts.map +1 -0
- package/dist/esm/experiments/getExperiment.js +25 -0
- package/dist/esm/experiments/getExperiment.js.map +1 -0
- package/dist/esm/experiments/getExperimentInfo.d.ts +13 -0
- package/dist/esm/experiments/getExperimentInfo.d.ts.map +1 -0
- package/dist/esm/experiments/getExperimentInfo.js +24 -0
- package/dist/esm/experiments/getExperimentInfo.js.map +1 -0
- package/dist/esm/experiments/getExperimentRuns.d.ts +15 -0
- package/dist/esm/experiments/getExperimentRuns.d.ts.map +1 -0
- package/dist/esm/experiments/getExperimentRuns.js +33 -0
- package/dist/esm/experiments/getExperimentRuns.js.map +1 -0
- package/dist/esm/experiments/index.d.ts +3 -0
- package/dist/esm/experiments/index.d.ts.map +1 -1
- package/dist/esm/experiments/index.js +3 -0
- package/dist/esm/experiments/index.js.map +1 -1
- package/dist/esm/experiments/instrumention.d.ts +18 -0
- package/dist/esm/experiments/instrumention.d.ts.map +1 -0
- package/dist/esm/experiments/instrumention.js +34 -0
- package/dist/esm/experiments/instrumention.js.map +1 -0
- package/dist/esm/experiments/runExperiment.d.ts +24 -21
- package/dist/esm/experiments/runExperiment.d.ts.map +1 -1
- package/dist/esm/experiments/runExperiment.js +221 -108
- package/dist/esm/experiments/runExperiment.js.map +1 -1
- package/dist/esm/schemas/llm/anthropic/converters.d.ts +28 -28
- package/dist/esm/schemas/llm/anthropic/messagePartSchemas.d.ts +8 -8
- package/dist/esm/schemas/llm/anthropic/messageSchemas.d.ts +24 -24
- package/dist/esm/schemas/llm/anthropic/toolCallSchemas.d.ts +8 -8
- package/dist/esm/schemas/llm/constants.d.ts +6 -6
- package/dist/esm/schemas/llm/converters.d.ts +24 -24
- package/dist/esm/schemas/llm/openai/converters.d.ts +6 -6
- package/dist/esm/schemas/llm/schemas.d.ts +22 -22
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/datasets.d.ts +33 -8
- package/dist/esm/types/datasets.d.ts.map +1 -1
- package/dist/esm/types/experiments.d.ts +17 -4
- package/dist/esm/types/experiments.d.ts.map +1 -1
- package/dist/esm/utils/ensureString.d.ts +8 -0
- package/dist/esm/utils/ensureString.d.ts.map +1 -0
- package/dist/esm/utils/ensureString.js +14 -0
- package/dist/esm/utils/ensureString.js.map +1 -0
- package/dist/esm/utils/objectAsAttributes.d.ts +3 -0
- package/dist/esm/utils/objectAsAttributes.d.ts.map +1 -0
- package/dist/esm/utils/objectAsAttributes.js +4 -0
- package/dist/esm/utils/objectAsAttributes.js.map +1 -0
- package/dist/src/client.d.ts +13 -1
- package/dist/src/client.d.ts.map +1 -1
- package/dist/src/client.js +1 -1
- package/dist/src/client.js.map +1 -1
- package/dist/src/datasets/appendDatasetExamples.d.ts +21 -0
- package/dist/src/datasets/appendDatasetExamples.d.ts.map +1 -0
- package/dist/src/datasets/appendDatasetExamples.js +50 -0
- package/dist/src/datasets/appendDatasetExamples.js.map +1 -0
- package/dist/src/datasets/createDataset.d.ts +25 -0
- package/dist/src/datasets/createDataset.d.ts.map +1 -0
- package/dist/src/datasets/createDataset.js +52 -0
- package/dist/src/datasets/createDataset.js.map +1 -0
- package/dist/src/datasets/getDataset.d.ts +10 -0
- package/dist/src/datasets/getDataset.d.ts.map +1 -0
- package/dist/src/datasets/getDataset.js +29 -0
- package/dist/src/datasets/getDataset.js.map +1 -0
- package/dist/src/datasets/getDatasetExamples.d.ts +10 -0
- package/dist/src/datasets/getDatasetExamples.d.ts.map +1 -0
- package/dist/src/datasets/getDatasetExamples.js +40 -0
- package/dist/src/datasets/getDatasetExamples.js.map +1 -0
- package/dist/src/datasets/getDatasetInfo.d.ts +11 -0
- package/dist/src/datasets/getDatasetInfo.d.ts.map +1 -0
- package/dist/src/datasets/getDatasetInfo.js +43 -0
- package/dist/src/datasets/getDatasetInfo.js.map +1 -0
- package/dist/src/datasets/index.d.ts +7 -0
- package/dist/src/datasets/index.d.ts.map +1 -0
- package/dist/src/datasets/index.js +23 -0
- package/dist/src/datasets/index.js.map +1 -0
- package/dist/src/datasets/listDatasets.d.ts +23 -0
- package/dist/src/datasets/listDatasets.d.ts.map +1 -0
- package/dist/src/datasets/listDatasets.js +40 -0
- package/dist/src/datasets/listDatasets.js.map +1 -0
- package/dist/src/experiments/getExperiment.d.ts +14 -0
- package/dist/src/experiments/getExperiment.d.ts.map +1 -0
- package/dist/src/experiments/getExperiment.js +36 -0
- package/dist/src/experiments/getExperiment.js.map +1 -0
- package/dist/src/experiments/getExperimentInfo.d.ts +13 -0
- package/dist/src/experiments/getExperimentInfo.d.ts.map +1 -0
- package/dist/src/experiments/getExperimentInfo.js +41 -0
- package/dist/src/experiments/getExperimentInfo.js.map +1 -0
- package/dist/src/experiments/getExperimentRuns.d.ts +15 -0
- package/dist/src/experiments/getExperimentRuns.d.ts.map +1 -0
- package/dist/src/experiments/getExperimentRuns.js +50 -0
- package/dist/src/experiments/getExperimentRuns.js.map +1 -0
- package/dist/src/experiments/index.d.ts +3 -0
- package/dist/src/experiments/index.d.ts.map +1 -1
- package/dist/src/experiments/index.js +3 -0
- package/dist/src/experiments/index.js.map +1 -1
- package/dist/src/experiments/instrumention.d.ts +18 -0
- package/dist/src/experiments/instrumention.d.ts.map +1 -0
- package/dist/src/experiments/instrumention.js +38 -0
- package/dist/src/experiments/instrumention.js.map +1 -0
- package/dist/src/experiments/runExperiment.d.ts +24 -21
- package/dist/src/experiments/runExperiment.d.ts.map +1 -1
- package/dist/src/experiments/runExperiment.js +222 -111
- package/dist/src/experiments/runExperiment.js.map +1 -1
- package/dist/src/schemas/llm/anthropic/converters.d.ts +28 -28
- package/dist/src/schemas/llm/anthropic/messagePartSchemas.d.ts +8 -8
- package/dist/src/schemas/llm/anthropic/messageSchemas.d.ts +24 -24
- package/dist/src/schemas/llm/anthropic/toolCallSchemas.d.ts +8 -8
- package/dist/src/schemas/llm/constants.d.ts +6 -6
- package/dist/src/schemas/llm/converters.d.ts +24 -24
- package/dist/src/schemas/llm/openai/converters.d.ts +6 -6
- package/dist/src/schemas/llm/schemas.d.ts +22 -22
- package/dist/src/types/datasets.d.ts +33 -8
- package/dist/src/types/datasets.d.ts.map +1 -1
- package/dist/src/types/experiments.d.ts +17 -4
- package/dist/src/types/experiments.d.ts.map +1 -1
- package/dist/src/utils/ensureString.d.ts +8 -0
- package/dist/src/utils/ensureString.d.ts.map +1 -0
- package/dist/src/utils/ensureString.js +18 -0
- package/dist/src/utils/ensureString.js.map +1 -0
- package/dist/src/utils/objectAsAttributes.d.ts +3 -0
- package/dist/src/utils/objectAsAttributes.d.ts.map +1 -0
- package/dist/src/utils/objectAsAttributes.js +7 -0
- package/dist/src/utils/objectAsAttributes.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +13 -1
- package/src/client.ts +4 -1
- package/src/datasets/appendDatasetExamples.ts +55 -0
- package/src/datasets/createDataset.ts +60 -0
- package/src/datasets/getDataset.ts +27 -0
- package/src/datasets/getDatasetExamples.ts +34 -0
- package/src/datasets/getDatasetInfo.ts +34 -0
- package/src/datasets/index.ts +6 -0
- package/src/datasets/listDatasets.ts +37 -0
- package/src/experiments/getExperiment.ts +40 -0
- package/src/experiments/getExperimentInfo.ts +39 -0
- package/src/experiments/getExperimentRuns.ts +45 -0
- package/src/experiments/index.ts +3 -0
- package/src/experiments/instrumention.ts +52 -0
- package/src/experiments/runExperiment.ts +277 -133
- package/src/types/datasets.ts +35 -9
- package/src/types/experiments.ts +19 -4
- package/src/utils/ensureString.ts +14 -0
- package/src/utils/objectAsAttributes.ts +9 -0
- package/dist/esm/utils/getDatasetBySelector.d.ts +0 -25
- package/dist/esm/utils/getDatasetBySelector.d.ts.map +0 -1
- package/dist/esm/utils/getDatasetBySelector.js +0 -37
- package/dist/esm/utils/getDatasetBySelector.js.map +0 -1
- package/dist/src/utils/getDatasetBySelector.d.ts +0 -25
- package/dist/src/utils/getDatasetBySelector.d.ts.map +0 -1
- package/dist/src/utils/getDatasetBySelector.js +0 -47
- package/dist/src/utils/getDatasetBySelector.js.map +0 -1
- package/src/utils/getDatasetBySelector.ts +0 -55
|
@@ -2,21 +2,36 @@ import { queue } from "async";
|
|
|
2
2
|
import invariant from "tiny-invariant";
|
|
3
3
|
import { createClient, type PhoenixClient } from "../client";
|
|
4
4
|
import { ClientFn } from "../types/core";
|
|
5
|
-
import {
|
|
5
|
+
import {
|
|
6
|
+
Dataset,
|
|
7
|
+
DatasetSelector,
|
|
8
|
+
Example,
|
|
9
|
+
ExampleWithId,
|
|
10
|
+
} from "../types/datasets";
|
|
6
11
|
import type {
|
|
7
12
|
Evaluator,
|
|
8
|
-
|
|
13
|
+
ExperimentInfo,
|
|
9
14
|
ExperimentEvaluationRun,
|
|
10
|
-
ExperimentParameters,
|
|
11
15
|
ExperimentRun,
|
|
16
|
+
ExperimentRunID,
|
|
12
17
|
ExperimentTask,
|
|
13
18
|
RanExperiment,
|
|
14
19
|
} from "../types/experiments";
|
|
15
20
|
import { type Logger } from "../types/logger";
|
|
16
|
-
import {
|
|
21
|
+
import { getDataset } from "../datasets/getDataset";
|
|
17
22
|
import { pluralize } from "../utils/pluralize";
|
|
18
23
|
import { promisifyResult } from "../utils/promisifyResult";
|
|
19
24
|
import { AnnotatorKind } from "../types/annotations";
|
|
25
|
+
import { createProvider, createNoOpProvider } from "./instrumention";
|
|
26
|
+
import { SpanStatusCode, Tracer } from "@opentelemetry/api";
|
|
27
|
+
import {
|
|
28
|
+
MimeType,
|
|
29
|
+
OpenInferenceSpanKind,
|
|
30
|
+
SemanticConventions,
|
|
31
|
+
} from "@arizeai/openinference-semantic-conventions";
|
|
32
|
+
import { ensureString } from "../utils/ensureString";
|
|
33
|
+
import type { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
34
|
+
import { objectAsAttributes } from "../utils/objectAsAttributes";
|
|
20
35
|
|
|
21
36
|
/**
|
|
22
37
|
* Parameters for running an experiment.
|
|
@@ -35,12 +50,13 @@ export type RunExperimentParams = ClientFn & {
|
|
|
35
50
|
experimentDescription?: string;
|
|
36
51
|
/**
|
|
37
52
|
* Experiment metadata
|
|
53
|
+
* E.g. modelName
|
|
38
54
|
*/
|
|
39
55
|
experimentMetadata?: Record<string, unknown>;
|
|
40
56
|
/**
|
|
41
57
|
* The dataset to run the experiment on
|
|
42
58
|
*/
|
|
43
|
-
dataset:
|
|
59
|
+
dataset: DatasetSelector;
|
|
44
60
|
/**
|
|
45
61
|
* The task to run
|
|
46
62
|
*/
|
|
@@ -49,10 +65,6 @@ export type RunExperimentParams = ClientFn & {
|
|
|
49
65
|
* The evaluators to use
|
|
50
66
|
*/
|
|
51
67
|
evaluators?: Evaluator[];
|
|
52
|
-
/**
|
|
53
|
-
* The project under which the experiment task traces are recorded
|
|
54
|
-
*/
|
|
55
|
-
projectName?: string;
|
|
56
68
|
/**
|
|
57
69
|
* The logger to use
|
|
58
70
|
*/
|
|
@@ -73,7 +85,23 @@ export type RunExperimentParams = ClientFn & {
|
|
|
73
85
|
};
|
|
74
86
|
|
|
75
87
|
/**
|
|
76
|
-
*
|
|
88
|
+
* Runs an experiment using a given dataset of examples.
|
|
89
|
+
*
|
|
90
|
+
* An experiment is a user-defined task that runs on each example in a dataset. The results from
|
|
91
|
+
* each experiment can be evaluated using any number of evaluators to measure the behavior of the
|
|
92
|
+
* task. The experiment and evaluation results are stored in the Phoenix database for comparison
|
|
93
|
+
* and analysis.
|
|
94
|
+
*
|
|
95
|
+
* A `task` is either a sync or async function that returns a JSON serializable
|
|
96
|
+
* output. If the `task` is a function of one argument then that argument will be bound to the
|
|
97
|
+
* `input` field of the dataset example. Alternatively, the `task` can be a function of any
|
|
98
|
+
* combination of specific argument names that will be bound to special values:
|
|
99
|
+
*
|
|
100
|
+
* - `input`: The input field of the dataset example
|
|
101
|
+
* - `expected`: The expected or reference output of the dataset example
|
|
102
|
+
* - `reference`: An alias for `expected`
|
|
103
|
+
* - `metadata`: Metadata associated with the dataset example
|
|
104
|
+
* - `example`: The dataset `Example` object with all associated fields
|
|
77
105
|
*
|
|
78
106
|
* @example
|
|
79
107
|
* ```ts
|
|
@@ -83,50 +111,48 @@ export type RunExperimentParams = ClientFn & {
|
|
|
83
111
|
* dataset: "my-dataset",
|
|
84
112
|
* task: async (example) => example.input,
|
|
85
113
|
* evaluators: [
|
|
86
|
-
* asEvaluator("my-evaluator", "CODE", async (params) => params.output),
|
|
114
|
+
* asEvaluator({ name: "my-evaluator", kind: "CODE", evaluate: async (params) => params.output }),
|
|
87
115
|
* ],
|
|
88
116
|
* });
|
|
89
117
|
* ```
|
|
90
|
-
*
|
|
91
|
-
* @experimental This feature is not complete, and will change in the future.
|
|
92
118
|
*/
|
|
93
119
|
export async function runExperiment({
|
|
94
|
-
experimentName
|
|
120
|
+
experimentName,
|
|
95
121
|
experimentDescription,
|
|
96
|
-
experimentMetadata,
|
|
122
|
+
experimentMetadata = {},
|
|
97
123
|
client: _client,
|
|
98
|
-
dataset:
|
|
124
|
+
dataset: DatasetSelector,
|
|
99
125
|
task,
|
|
100
126
|
evaluators,
|
|
101
|
-
projectName = "default",
|
|
102
127
|
logger = console,
|
|
103
128
|
record = true,
|
|
104
129
|
concurrency = 5,
|
|
105
130
|
dryRun = false,
|
|
106
131
|
}: RunExperimentParams): Promise<RanExperiment> {
|
|
132
|
+
let provider: NodeTracerProvider | undefined;
|
|
107
133
|
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
108
134
|
const client = _client ?? createClient();
|
|
109
|
-
const dataset = await
|
|
135
|
+
const dataset = await getDataset({ dataset: DatasetSelector, client });
|
|
110
136
|
invariant(dataset, `Dataset not found`);
|
|
111
137
|
invariant(dataset.examples.length > 0, `Dataset has no examples`);
|
|
112
138
|
const nExamples =
|
|
113
139
|
typeof dryRun === "number"
|
|
114
|
-
? Math.
|
|
140
|
+
? Math.min(dryRun, dataset.examples.length)
|
|
115
141
|
: dataset.examples.length;
|
|
116
142
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
};
|
|
122
|
-
let experiment: Experiment;
|
|
143
|
+
let projectName = `${dataset.name}-exp-${new Date().toISOString()}`;
|
|
144
|
+
// initialize the tracer into scope
|
|
145
|
+
let taskTracer: Tracer;
|
|
146
|
+
let experiment: ExperimentInfo;
|
|
123
147
|
if (isDryRun) {
|
|
124
148
|
experiment = {
|
|
125
|
-
id:
|
|
149
|
+
id: localId(),
|
|
126
150
|
datasetId: dataset.id,
|
|
127
151
|
datasetVersionId: dataset.versionId,
|
|
128
152
|
projectName,
|
|
153
|
+
metadata: experimentMetadata,
|
|
129
154
|
};
|
|
155
|
+
taskTracer = createNoOpProvider().getTracer("no-op");
|
|
130
156
|
} else {
|
|
131
157
|
const experimentResponse = await client
|
|
132
158
|
.POST("/v1/datasets/{dataset_id}/experiments", {
|
|
@@ -144,14 +170,27 @@ export async function runExperiment({
|
|
|
144
170
|
})
|
|
145
171
|
.then((res) => res.data?.data);
|
|
146
172
|
invariant(experimentResponse, `Failed to create experiment`);
|
|
173
|
+
projectName = experimentResponse.project_name ?? projectName;
|
|
147
174
|
experiment = {
|
|
148
175
|
id: experimentResponse.id,
|
|
149
|
-
datasetId:
|
|
150
|
-
datasetVersionId:
|
|
176
|
+
datasetId: experimentResponse.dataset_id,
|
|
177
|
+
datasetVersionId: experimentResponse.dataset_version_id,
|
|
151
178
|
projectName,
|
|
179
|
+
metadata: experimentResponse.metadata,
|
|
152
180
|
};
|
|
181
|
+
// Initialize the tracer, now that we have a project name
|
|
182
|
+
const baseUrl = client.config.baseUrl;
|
|
183
|
+
invariant(
|
|
184
|
+
baseUrl,
|
|
185
|
+
"Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
|
|
186
|
+
);
|
|
187
|
+
provider = createProvider({
|
|
188
|
+
projectName,
|
|
189
|
+
baseUrl,
|
|
190
|
+
headers: client.config.headers ?? {},
|
|
191
|
+
});
|
|
192
|
+
taskTracer = provider.getTracer(projectName);
|
|
153
193
|
}
|
|
154
|
-
|
|
155
194
|
if (!record) {
|
|
156
195
|
logger.info(
|
|
157
196
|
`🔧 Running experiment in readonly mode. Results will not be recorded.`
|
|
@@ -159,16 +198,14 @@ export async function runExperiment({
|
|
|
159
198
|
}
|
|
160
199
|
|
|
161
200
|
logger.info(
|
|
162
|
-
`🧪 Starting experiment "${experimentName}" on dataset "${dataset.id}" with task "${task.name}" and ${evaluators?.length ?? 0} ${pluralize(
|
|
201
|
+
`🧪 Starting experiment "${experimentName || `<unnamed>`}" on dataset "${dataset.id}" with task "${task.name}" and ${evaluators?.length ?? 0} ${pluralize(
|
|
163
202
|
"evaluator",
|
|
164
203
|
evaluators?.length ?? 0
|
|
165
204
|
)} and ${concurrency} concurrent runs`
|
|
166
205
|
);
|
|
167
206
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
const runs: Record<ExperimentRunId, ExperimentRun> = {};
|
|
171
|
-
await runTask({
|
|
207
|
+
const runs: Record<ExperimentRunID, ExperimentRun> = {};
|
|
208
|
+
await runTaskWithExamples({
|
|
172
209
|
client,
|
|
173
210
|
experimentId: experiment.id,
|
|
174
211
|
task,
|
|
@@ -180,15 +217,20 @@ export async function runExperiment({
|
|
|
180
217
|
concurrency,
|
|
181
218
|
isDryRun,
|
|
182
219
|
nExamples,
|
|
220
|
+
tracer: taskTracer,
|
|
183
221
|
});
|
|
184
222
|
logger.info(`✅ Task runs completed`);
|
|
185
223
|
|
|
186
224
|
const ranExperiment: RanExperiment = {
|
|
187
225
|
...experiment,
|
|
188
|
-
params: experimentParams,
|
|
189
226
|
runs,
|
|
190
227
|
};
|
|
191
228
|
|
|
229
|
+
// Shut down the provider so that the experiments run
|
|
230
|
+
if (provider) {
|
|
231
|
+
await provider.shutdown?.();
|
|
232
|
+
}
|
|
233
|
+
|
|
192
234
|
const { evaluationRuns } = await evaluateExperiment({
|
|
193
235
|
experiment: ranExperiment,
|
|
194
236
|
evaluators: evaluators ?? [],
|
|
@@ -207,7 +249,7 @@ export async function runExperiment({
|
|
|
207
249
|
/**
|
|
208
250
|
* Run a task against n examples in a dataset.
|
|
209
251
|
*/
|
|
210
|
-
function
|
|
252
|
+
function runTaskWithExamples({
|
|
211
253
|
client,
|
|
212
254
|
experimentId,
|
|
213
255
|
task,
|
|
@@ -217,6 +259,7 @@ function runTask({
|
|
|
217
259
|
concurrency = 5,
|
|
218
260
|
isDryRun,
|
|
219
261
|
nExamples,
|
|
262
|
+
tracer,
|
|
220
263
|
}: {
|
|
221
264
|
/** The client to use */
|
|
222
265
|
client: PhoenixClient;
|
|
@@ -236,61 +279,88 @@ function runTask({
|
|
|
236
279
|
isDryRun: boolean;
|
|
237
280
|
/** The number of examples to run */
|
|
238
281
|
nExamples: number;
|
|
239
|
-
|
|
282
|
+
/** Tracer instance that will be used to create spans from task calls */
|
|
283
|
+
tracer: Tracer;
|
|
284
|
+
}): Promise<void> {
|
|
240
285
|
logger.info(`🔧 Running task "${task.name}" on dataset "${dataset.id}"`);
|
|
241
|
-
const run = async (example:
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
286
|
+
const run = async (example: ExampleWithId) => {
|
|
287
|
+
return tracer.startActiveSpan(`Task: ${task.name}`, async (span) => {
|
|
288
|
+
logger.info(
|
|
289
|
+
`🔧 Running task "${task.name}" on example "${example.id} of dataset "${dataset.id}"`
|
|
290
|
+
);
|
|
291
|
+
const traceId = span.spanContext().traceId;
|
|
292
|
+
const thisRun: ExperimentRun = {
|
|
293
|
+
id: localId(), // initialized with local id, will be replaced with server-assigned id when dry run is false
|
|
294
|
+
traceId,
|
|
295
|
+
experimentId,
|
|
296
|
+
datasetExampleId: example.id,
|
|
297
|
+
startTime: new Date(),
|
|
298
|
+
endTime: new Date(), // will get replaced with actual end time
|
|
299
|
+
output: null,
|
|
300
|
+
error: null,
|
|
301
|
+
};
|
|
302
|
+
try {
|
|
303
|
+
const taskOutput = await promisifyResult(task(example));
|
|
304
|
+
thisRun.output =
|
|
305
|
+
typeof taskOutput === "string"
|
|
306
|
+
? taskOutput
|
|
307
|
+
: JSON.stringify(taskOutput);
|
|
308
|
+
} catch (error) {
|
|
309
|
+
thisRun.error =
|
|
310
|
+
error instanceof Error ? error.message : "Unknown error";
|
|
311
|
+
span.setStatus({ code: SpanStatusCode.ERROR });
|
|
312
|
+
}
|
|
313
|
+
thisRun.endTime = new Date();
|
|
314
|
+
if (!isDryRun) {
|
|
315
|
+
// Log the run to the server
|
|
316
|
+
const res = await client.POST("/v1/experiments/{experiment_id}/runs", {
|
|
317
|
+
params: {
|
|
318
|
+
path: {
|
|
319
|
+
experiment_id: experimentId,
|
|
320
|
+
},
|
|
273
321
|
},
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
322
|
+
body: {
|
|
323
|
+
dataset_example_id: example.id,
|
|
324
|
+
output: thisRun.output,
|
|
325
|
+
repetition_number: 0,
|
|
326
|
+
start_time: thisRun.startTime.toISOString(),
|
|
327
|
+
end_time: thisRun.endTime.toISOString(),
|
|
328
|
+
trace_id: thisRun.traceId,
|
|
329
|
+
error: thisRun.error,
|
|
330
|
+
},
|
|
331
|
+
});
|
|
332
|
+
// replace the local run id with the server-assigned id
|
|
333
|
+
thisRun.id = res.data?.data.id ?? thisRun.id;
|
|
334
|
+
const inputMimeType =
|
|
335
|
+
typeof example.input === "string" ? MimeType.TEXT : MimeType.JSON;
|
|
336
|
+
const outputMimeType =
|
|
337
|
+
typeof thisRun.output === "string" ? MimeType.TEXT : MimeType.JSON;
|
|
338
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
339
|
+
span.setAttributes({
|
|
340
|
+
[SemanticConventions.OPENINFERENCE_SPAN_KIND]:
|
|
341
|
+
OpenInferenceSpanKind.CHAIN,
|
|
342
|
+
[SemanticConventions.INPUT_MIME_TYPE]: inputMimeType,
|
|
343
|
+
[SemanticConventions.INPUT_VALUE]: ensureString(example.input),
|
|
344
|
+
[SemanticConventions.OUTPUT_MIME_TYPE]: outputMimeType,
|
|
345
|
+
[SemanticConventions.OUTPUT_VALUE]: ensureString(thisRun.output),
|
|
346
|
+
});
|
|
347
|
+
}
|
|
348
|
+
span?.end();
|
|
349
|
+
onComplete(thisRun);
|
|
350
|
+
return thisRun;
|
|
351
|
+
});
|
|
290
352
|
};
|
|
291
353
|
const q = queue(run, concurrency);
|
|
292
354
|
const examplesToUse = dataset.examples.slice(0, nExamples);
|
|
293
|
-
examplesToUse.forEach((example) =>
|
|
355
|
+
examplesToUse.forEach((example) =>
|
|
356
|
+
q.push(example, (err) => {
|
|
357
|
+
if (err) {
|
|
358
|
+
logger.error(
|
|
359
|
+
`Error running task "${task.name}" on example "${example.id}": ${err}`
|
|
360
|
+
);
|
|
361
|
+
}
|
|
362
|
+
})
|
|
363
|
+
);
|
|
294
364
|
return q.drain();
|
|
295
365
|
}
|
|
296
366
|
|
|
@@ -303,13 +373,12 @@ export async function evaluateExperiment({
|
|
|
303
373
|
experiment,
|
|
304
374
|
evaluators,
|
|
305
375
|
client: _client,
|
|
306
|
-
logger,
|
|
376
|
+
logger = console,
|
|
307
377
|
concurrency = 5,
|
|
308
378
|
dryRun = false,
|
|
309
379
|
}: {
|
|
310
380
|
/**
|
|
311
381
|
* The experiment to evaluate
|
|
312
|
-
* @todo also accept Experiment, and attempt to fetch the runs from the server
|
|
313
382
|
**/
|
|
314
383
|
experiment: RanExperiment;
|
|
315
384
|
/** The evaluators to use */
|
|
@@ -317,9 +386,9 @@ export async function evaluateExperiment({
|
|
|
317
386
|
/** The client to use */
|
|
318
387
|
client?: PhoenixClient;
|
|
319
388
|
/** The logger to use */
|
|
320
|
-
logger
|
|
389
|
+
logger?: Logger;
|
|
321
390
|
/** The number of evaluators to run in parallel */
|
|
322
|
-
concurrency
|
|
391
|
+
concurrency?: number;
|
|
323
392
|
/**
|
|
324
393
|
* Whether to run the evaluation as a dry run
|
|
325
394
|
* If a number is provided, the evaluation will be run for the first n runs
|
|
@@ -328,13 +397,31 @@ export async function evaluateExperiment({
|
|
|
328
397
|
dryRun?: boolean | number;
|
|
329
398
|
}): Promise<RanExperiment> {
|
|
330
399
|
const isDryRun = typeof dryRun === "number" || dryRun === true;
|
|
400
|
+
const client = _client ?? createClient();
|
|
401
|
+
const baseUrl = client.config.baseUrl;
|
|
402
|
+
invariant(
|
|
403
|
+
baseUrl,
|
|
404
|
+
"Phoenix base URL not found. Please set PHOENIX_HOST or set baseUrl on the client."
|
|
405
|
+
);
|
|
406
|
+
let provider: NodeTracerProvider;
|
|
407
|
+
if (!isDryRun) {
|
|
408
|
+
provider = createProvider({
|
|
409
|
+
projectName: "evaluators",
|
|
410
|
+
baseUrl,
|
|
411
|
+
headers: client.config.headers ?? {},
|
|
412
|
+
});
|
|
413
|
+
} else {
|
|
414
|
+
provider = createNoOpProvider();
|
|
415
|
+
}
|
|
416
|
+
const tracer = isDryRun
|
|
417
|
+
? provider.getTracer("no-op")
|
|
418
|
+
: provider.getTracer("evaluators");
|
|
331
419
|
const nRuns =
|
|
332
420
|
typeof dryRun === "number"
|
|
333
421
|
? Math.max(dryRun, Object.keys(experiment.runs).length)
|
|
334
422
|
: Object.keys(experiment.runs).length;
|
|
335
|
-
const
|
|
336
|
-
|
|
337
|
-
dataset: experiment.datasetId,
|
|
423
|
+
const dataset = await getDataset({
|
|
424
|
+
dataset: { datasetId: experiment.datasetId },
|
|
338
425
|
client,
|
|
339
426
|
});
|
|
340
427
|
invariant(dataset, `Dataset "${experiment.datasetId}" not found`);
|
|
@@ -345,14 +432,12 @@ export async function evaluateExperiment({
|
|
|
345
432
|
invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);
|
|
346
433
|
|
|
347
434
|
const runsToEvaluate = Object.values(experiment.runs).slice(0, nRuns);
|
|
348
|
-
|
|
349
435
|
if (evaluators?.length === 0) {
|
|
350
436
|
return {
|
|
351
437
|
...experiment,
|
|
352
438
|
evaluationRuns: [],
|
|
353
439
|
};
|
|
354
440
|
}
|
|
355
|
-
|
|
356
441
|
logger.info(
|
|
357
442
|
`🧠 Evaluating experiment "${experiment.id}" with ${evaluators?.length ?? 0} ${pluralize(
|
|
358
443
|
"evaluator",
|
|
@@ -381,40 +466,91 @@ export async function evaluateExperiment({
|
|
|
381
466
|
);
|
|
382
467
|
const evaluatorsQueue = queue(
|
|
383
468
|
async (evaluatorAndRun: { evaluator: Evaluator; run: ExperimentRun }) => {
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
469
|
+
return tracer.startActiveSpan(
|
|
470
|
+
`Evaluation: ${evaluatorAndRun.evaluator.name}`,
|
|
471
|
+
async (span) => {
|
|
472
|
+
const evalResult = await runEvaluator({
|
|
473
|
+
evaluator: evaluatorAndRun.evaluator,
|
|
474
|
+
run: evaluatorAndRun.run,
|
|
475
|
+
exampleCache: examplesById,
|
|
476
|
+
onComplete: onEvaluationComplete,
|
|
477
|
+
logger,
|
|
478
|
+
});
|
|
479
|
+
span.setAttributes({
|
|
480
|
+
[SemanticConventions.OPENINFERENCE_SPAN_KIND]:
|
|
481
|
+
OpenInferenceSpanKind.EVALUATOR,
|
|
482
|
+
[SemanticConventions.INPUT_MIME_TYPE]: MimeType.JSON,
|
|
483
|
+
[SemanticConventions.INPUT_VALUE]: JSON.stringify({
|
|
484
|
+
input: examplesById[evaluatorAndRun.run.datasetExampleId]?.input,
|
|
485
|
+
output: evaluatorAndRun.run.output,
|
|
486
|
+
expected:
|
|
487
|
+
examplesById[evaluatorAndRun.run.datasetExampleId]?.output,
|
|
488
|
+
metadata:
|
|
489
|
+
examplesById[evaluatorAndRun.run.datasetExampleId]?.metadata,
|
|
490
|
+
}),
|
|
491
|
+
[SemanticConventions.OUTPUT_MIME_TYPE]: MimeType.JSON,
|
|
492
|
+
[SemanticConventions.OUTPUT_VALUE]: ensureString(evalResult.result),
|
|
493
|
+
});
|
|
494
|
+
if (evalResult.error) {
|
|
495
|
+
span.setStatus({
|
|
496
|
+
code: SpanStatusCode.ERROR,
|
|
497
|
+
message: evalResult.error,
|
|
498
|
+
});
|
|
499
|
+
} else {
|
|
500
|
+
span.setStatus({ code: SpanStatusCode.OK });
|
|
501
|
+
}
|
|
502
|
+
if (evalResult.result) {
|
|
503
|
+
span.setAttributes(objectAsAttributes(evalResult.result));
|
|
504
|
+
}
|
|
505
|
+
evalResult.traceId = span.spanContext().traceId;
|
|
506
|
+
if (!isDryRun) {
|
|
507
|
+
// Log the evaluation to the server
|
|
508
|
+
// We log this without awaiting (i.e. best effort)
|
|
509
|
+
client.POST("/v1/experiment_evaluations", {
|
|
510
|
+
body: {
|
|
511
|
+
experiment_run_id: evaluatorAndRun.run.id,
|
|
512
|
+
name: evaluatorAndRun.evaluator.name,
|
|
513
|
+
annotator_kind: evaluatorAndRun.evaluator.kind,
|
|
514
|
+
start_time: evalResult.startTime.toISOString(),
|
|
515
|
+
end_time: evalResult.endTime.toISOString(),
|
|
516
|
+
result: {
|
|
517
|
+
...evalResult.result,
|
|
518
|
+
},
|
|
519
|
+
error: evalResult.error,
|
|
520
|
+
trace_id: evalResult.traceId,
|
|
521
|
+
},
|
|
522
|
+
});
|
|
523
|
+
}
|
|
524
|
+
span.end();
|
|
525
|
+
return evalResult;
|
|
526
|
+
}
|
|
527
|
+
);
|
|
409
528
|
},
|
|
410
529
|
concurrency
|
|
411
530
|
);
|
|
531
|
+
if (!evaluatorsAndRuns.length) {
|
|
532
|
+
logger.info(`⛔ No evaluators to run`);
|
|
533
|
+
return {
|
|
534
|
+
...experiment,
|
|
535
|
+
evaluationRuns: [],
|
|
536
|
+
};
|
|
537
|
+
}
|
|
412
538
|
evaluatorsAndRuns.forEach((evaluatorAndRun) =>
|
|
413
|
-
evaluatorsQueue.push(evaluatorAndRun)
|
|
539
|
+
evaluatorsQueue.push(evaluatorAndRun, (err) => {
|
|
540
|
+
if (err) {
|
|
541
|
+
logger.error(
|
|
542
|
+
`❌ Error running evaluator "${evaluatorAndRun.evaluator.name}" on run "${evaluatorAndRun.run.id}": ${err}`
|
|
543
|
+
);
|
|
544
|
+
}
|
|
545
|
+
})
|
|
414
546
|
);
|
|
415
547
|
await evaluatorsQueue.drain();
|
|
416
548
|
logger.info(`✅ Evaluation runs completed`);
|
|
417
549
|
|
|
550
|
+
if (provider) {
|
|
551
|
+
await provider.shutdown?.();
|
|
552
|
+
}
|
|
553
|
+
|
|
418
554
|
return {
|
|
419
555
|
...experiment,
|
|
420
556
|
evaluationRuns: Object.values(evaluationRuns),
|
|
@@ -431,25 +567,30 @@ async function runEvaluator({
|
|
|
431
567
|
run,
|
|
432
568
|
exampleCache,
|
|
433
569
|
onComplete,
|
|
570
|
+
logger,
|
|
434
571
|
}: {
|
|
435
572
|
evaluator: Evaluator;
|
|
436
573
|
run: ExperimentRun;
|
|
437
574
|
exampleCache: Record<string, Example>;
|
|
575
|
+
logger: Logger;
|
|
438
576
|
onComplete: (run: ExperimentEvaluationRun) => void;
|
|
439
577
|
}) {
|
|
440
578
|
const example = exampleCache[run.datasetExampleId];
|
|
441
579
|
invariant(example, `Example "${run.datasetExampleId}" not found`);
|
|
442
580
|
const evaluate = async () => {
|
|
581
|
+
logger.info(
|
|
582
|
+
`🧠 Evaluating run "${run.id}" with evaluator "${evaluator.name}"`
|
|
583
|
+
);
|
|
443
584
|
const thisEval: ExperimentEvaluationRun = {
|
|
444
|
-
id:
|
|
445
|
-
traceId: null,
|
|
585
|
+
id: localId(),
|
|
586
|
+
traceId: null,
|
|
446
587
|
experimentRunId: run.id,
|
|
447
588
|
startTime: new Date(),
|
|
448
589
|
endTime: new Date(), // will get replaced with actual end time
|
|
449
590
|
name: evaluator.name,
|
|
450
591
|
result: null,
|
|
451
592
|
error: null,
|
|
452
|
-
annotatorKind:
|
|
593
|
+
annotatorKind: evaluator.kind,
|
|
453
594
|
};
|
|
454
595
|
try {
|
|
455
596
|
const result = await evaluator.evaluate({
|
|
@@ -459,8 +600,14 @@ async function runEvaluator({
|
|
|
459
600
|
metadata: example.metadata,
|
|
460
601
|
});
|
|
461
602
|
thisEval.result = result;
|
|
603
|
+
logger.info(
|
|
604
|
+
`✅ Evaluator "${evaluator.name}" on run "${run.id}" completed`
|
|
605
|
+
);
|
|
462
606
|
} catch (error) {
|
|
463
607
|
thisEval.error = error instanceof Error ? error.message : "Unknown error";
|
|
608
|
+
logger.error(
|
|
609
|
+
`❌ Evaluator "${evaluator.name}" on run "${run.id}" failed: ${thisEval.error}`
|
|
610
|
+
);
|
|
464
611
|
}
|
|
465
612
|
thisEval.endTime = new Date();
|
|
466
613
|
onComplete(thisEval);
|
|
@@ -495,17 +642,14 @@ export function asEvaluator({
|
|
|
495
642
|
};
|
|
496
643
|
}
|
|
497
644
|
|
|
498
|
-
let
|
|
645
|
+
let _localIdIndex = 1000;
|
|
499
646
|
|
|
500
647
|
/**
|
|
501
|
-
* Generate a
|
|
648
|
+
* Generate a local id.
|
|
502
649
|
*
|
|
503
|
-
* @
|
|
504
|
-
* @returns A unique id.
|
|
650
|
+
* @returns A semi-unique id.
|
|
505
651
|
*/
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
return _id.toString();
|
|
510
|
-
})();
|
|
652
|
+
function localId(): string {
|
|
653
|
+
_localIdIndex++;
|
|
654
|
+
return `local_${_localIdIndex}`;
|
|
511
655
|
}
|