@arizeai/phoenix-client 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -0
- package/dist/esm/__generated__/api/v1.d.ts +1253 -0
- package/dist/esm/__generated__/api/v1.d.ts.map +1 -0
- package/dist/esm/__generated__/api/v1.js +6 -0
- package/dist/esm/__generated__/api/v1.js.map +1 -0
- package/dist/esm/client.d.ts +36 -0
- package/dist/esm/client.d.ts.map +1 -0
- package/dist/esm/client.js +29 -0
- package/dist/esm/client.js.map +1 -0
- package/dist/esm/config.d.ts +14 -0
- package/dist/esm/config.d.ts.map +1 -0
- package/dist/esm/config.js +56 -0
- package/dist/esm/config.js.map +1 -0
- package/dist/esm/experiments/index.d.ts +2 -0
- package/dist/esm/experiments/index.d.ts.map +1 -0
- package/dist/esm/experiments/index.js +2 -0
- package/dist/esm/experiments/index.js.map +1 -0
- package/dist/esm/experiments/runExperiment.d.ts +58 -0
- package/dist/esm/experiments/runExperiment.d.ts.map +1 -0
- package/dist/esm/experiments/runExperiment.js +215 -0
- package/dist/esm/experiments/runExperiment.js.map +1 -0
- package/dist/esm/index.d.ts +3 -0
- package/dist/esm/index.d.ts.map +1 -0
- package/dist/esm/index.js +3 -0
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/package.json +1 -0
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -0
- package/dist/esm/types/annotations.d.ts +2 -0
- package/dist/esm/types/annotations.d.ts.map +1 -0
- package/dist/esm/types/annotations.js +2 -0
- package/dist/esm/types/annotations.js.map +1 -0
- package/dist/esm/types/core.d.ts +7 -0
- package/dist/esm/types/core.d.ts.map +1 -0
- package/dist/esm/types/core.js +2 -0
- package/dist/esm/types/core.js.map +1 -0
- package/dist/esm/types/datasets.d.ts +21 -0
- package/dist/esm/types/datasets.d.ts.map +1 -0
- package/dist/esm/types/datasets.js +2 -0
- package/dist/esm/types/datasets.js.map +1 -0
- package/dist/esm/types/experiments.d.ts +95 -0
- package/dist/esm/types/experiments.d.ts.map +1 -0
- package/dist/esm/types/experiments.js +2 -0
- package/dist/esm/types/experiments.js.map +1 -0
- package/dist/esm/utils/pluralize.d.ts +9 -0
- package/dist/esm/utils/pluralize.d.ts.map +1 -0
- package/dist/esm/utils/pluralize.js +11 -0
- package/dist/esm/utils/pluralize.js.map +1 -0
- package/dist/esm/utils/promisifyResult.d.ts +6 -0
- package/dist/esm/utils/promisifyResult.d.ts.map +1 -0
- package/dist/esm/utils/promisifyResult.js +11 -0
- package/dist/esm/utils/promisifyResult.js.map +1 -0
- package/dist/src/__generated__/api/v1.d.ts +1253 -0
- package/dist/src/__generated__/api/v1.d.ts.map +1 -0
- package/dist/src/__generated__/api/v1.js +7 -0
- package/dist/src/__generated__/api/v1.js.map +1 -0
- package/dist/src/client.d.ts +36 -0
- package/dist/src/client.d.ts.map +1 -0
- package/dist/src/client.js +33 -0
- package/dist/src/client.js.map +1 -0
- package/dist/src/config.d.ts +14 -0
- package/dist/src/config.d.ts.map +1 -0
- package/dist/src/config.js +65 -0
- package/dist/src/config.js.map +1 -0
- package/dist/src/experiments/index.d.ts +2 -0
- package/dist/src/experiments/index.d.ts.map +1 -0
- package/dist/src/experiments/index.js +18 -0
- package/dist/src/experiments/index.js.map +1 -0
- package/dist/src/experiments/runExperiment.d.ts +58 -0
- package/dist/src/experiments/runExperiment.d.ts.map +1 -0
- package/dist/src/experiments/runExperiment.js +227 -0
- package/dist/src/experiments/runExperiment.js.map +1 -0
- package/dist/src/index.d.ts +3 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +19 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/types/annotations.d.ts +2 -0
- package/dist/src/types/annotations.d.ts.map +1 -0
- package/dist/src/types/annotations.js +3 -0
- package/dist/src/types/annotations.js.map +1 -0
- package/dist/src/types/core.d.ts +7 -0
- package/dist/src/types/core.d.ts.map +1 -0
- package/dist/src/types/core.js +3 -0
- package/dist/src/types/core.js.map +1 -0
- package/dist/src/types/datasets.d.ts +21 -0
- package/dist/src/types/datasets.d.ts.map +1 -0
- package/dist/src/types/datasets.js +3 -0
- package/dist/src/types/datasets.js.map +1 -0
- package/dist/src/types/experiments.d.ts +95 -0
- package/dist/src/types/experiments.d.ts.map +1 -0
- package/dist/src/types/experiments.js +3 -0
- package/dist/src/types/experiments.js.map +1 -0
- package/dist/src/utils/pluralize.d.ts +9 -0
- package/dist/src/utils/pluralize.d.ts.map +1 -0
- package/dist/src/utils/pluralize.js +14 -0
- package/dist/src/utils/pluralize.js.map +1 -0
- package/dist/src/utils/promisifyResult.d.ts +6 -0
- package/dist/src/utils/promisifyResult.d.ts.map +1 -0
- package/dist/src/utils/promisifyResult.js +14 -0
- package/dist/src/utils/promisifyResult.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/package.json +44 -0
- package/src/__generated__/api/v1.d.ts +1253 -0
- package/src/__generated__/api/v1.ts +1253 -0
- package/src/client.ts +63 -0
- package/src/config.ts +68 -0
- package/src/experiments/index.ts +1 -0
- package/src/experiments/runExperiment.ts +367 -0
- package/src/index.ts +3 -0
- package/src/types/annotations.ts +1 -0
- package/src/types/core.ts +6 -0
- package/src/types/datasets.ts +22 -0
- package/src/types/experiments.ts +108 -0
- package/src/utils/pluralize.ts +10 -0
- package/src/utils/promisifyResult.ts +13 -0
package/src/client.ts
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import createOpenApiClient, { ClientOptions } from "openapi-fetch";
|
|
2
|
+
import type {
|
|
3
|
+
paths as oapiPathsV1,
|
|
4
|
+
components as oapiComponentsV1,
|
|
5
|
+
operations as oapiOperationsV1,
|
|
6
|
+
} from "./__generated__/api/v1.d.ts";
|
|
7
|
+
import {
|
|
8
|
+
defaultGetEnvironmentOptions,
|
|
9
|
+
makeDefaultClientOptions,
|
|
10
|
+
} from "./config";
|
|
11
|
+
|
|
12
|
+
// Local aliases for the generated OpenAPI v1 type namespaces.
type pathsV1 = oapiPathsV1;
type componentsV1 = oapiComponentsV1;
type operationsV1 = oapiOperationsV1;

/**
 * The generated API types, grouped by API version.
 *
 * `Types["V1"]` exposes the `paths`, `components` and `operations`
 * types imported from the generated v1 definitions.
 */
export type Types = {
  V1: {
    paths: pathsV1;
    components: componentsV1;
    operations: operationsV1;
  };
};
|
|
23
|
+
|
|
24
|
+
/**
 * Build the final client options by layering three sources, lowest
 * priority first: built-in defaults, environment-derived options, and
 * finally the explicitly supplied options.
 *
 * The merge is shallow, so a later layer's `headers` replaces an earlier
 * layer's `headers` wholesale; the two are not combined.
 *
 * @param params - The merge inputs.
 * @param params.options - Explicit options; these win over every other source.
 * @param params.getEnvironmentOptions - Supplies the environment-derived options.
 * @returns The merged client options.
 */
export const getMergedOptions = ({
  options = {},
  getEnvironmentOptions = defaultGetEnvironmentOptions,
}: {
  options?: Partial<ClientOptions>;
  getEnvironmentOptions?: () => Partial<ClientOptions>;
} = {}): ClientOptions => {
  // Listed from lowest to highest priority; later layers overwrite earlier ones.
  const layers: Partial<ClientOptions>[] = [
    makeDefaultClientOptions(),
    getEnvironmentOptions(),
    options,
  ];
  return layers.reduce<ClientOptions>(
    (merged, layer) => ({ ...merged, ...layer }),
    {}
  );
};
|
|
45
|
+
|
|
46
|
+
/**
 * Create a Phoenix client.
 *
 * The given configuration is resolved through `getMergedOptions` and the
 * result is handed to the OpenAPI Fetch client factory, typed against the
 * v1 paths.
 *
 * @param config - The configuration to use for the client.
 * @param config.options - The options to use for the client's OpenAPI Fetch wrapper.
 * @param config.getEnvironmentOptions - Supplies the environment-derived options used during merging.
 * @returns The Phoenix client.
 */
export const createClient = (
  config: {
    options?: Partial<ClientOptions>;
    getEnvironmentOptions?: () => Partial<ClientOptions>;
  } = {}
) => createOpenApiClient<pathsV1>(getMergedOptions(config));
|
|
62
|
+
|
|
63
|
+
/**
 * The type of the client returned by `createClient`.
 */
export type PhoenixClient = ReturnType<typeof createClient>;
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import type { ClientOptions } from "openapi-fetch";
|
|
2
|
+
import z from "zod";
|
|
3
|
+
|
|
4
|
+
/**
 * Schema for the Phoenix-related environment variables.
 *
 * Both variables are optional. `PHOENIX_CLIENT_HEADERS` is read as JSON text
 * and must decode to a record whose values are strings.
 */
const phoenixEnvironmentSchema = z.object({
  PHOENIX_HOST: z.string().optional(),
  PHOENIX_CLIENT_HEADERS: z
    .string()
    // decode the JSON text, then require a string-valued record
    .transform((raw) => z.record(z.string()).parse(JSON.parse(raw)))
    .optional(),
});
|
|
12
|
+
|
|
13
|
+
/** The parsed shape produced by `phoenixEnvironmentSchema`. */
type PhoenixEnvironment = z.infer<typeof phoenixEnvironmentSchema>;
|
|
14
|
+
|
|
15
|
+
/**
 * Parse environment variables from an opaque object into a PhoenixEnvironment object.
 *
 * Parsing is best-effort: any failure yields an empty object so that callers
 * fall back to their defaults instead of crashing.
 *
 * @param environment - The environment variables object-like structure to parse.
 * @returns The parsed PhoenixEnvironment object, or `{}` when parsing fails.
 */
const fromEnvironment = (environment: unknown): PhoenixEnvironment => {
  try {
    const parsed = phoenixEnvironmentSchema.safeParse(environment);
    return parsed.success ? parsed.data : {};
  } catch {
    // safeParse only reports validation issues; exceptions thrown inside the
    // schema's transforms (JSON.parse on malformed PHOENIX_CLIENT_HEADERS, or
    // the nested record parse) propagate out of it, so they are handled here.
    return {};
  }
};
|
|
24
|
+
|
|
25
|
+
/**
 * Translate a PhoenixEnvironment object into client options.
 *
 * `PHOENIX_HOST` becomes `baseUrl` and `PHOENIX_CLIENT_HEADERS` becomes
 * `headers`. A key is only present in the result when its value is defined,
 * so that merging the result over defaults never clobbers a default with
 * `undefined`.
 *
 * @param environment - The PhoenixEnvironment object to convert.
 * @returns The converted ClientOptions object.
 */
const phoenixEnvironmentToClientOptions = (
  environment: PhoenixEnvironment
): Partial<ClientOptions> => {
  const clientOptions: Partial<ClientOptions> = {};
  if (environment.PHOENIX_HOST !== undefined) {
    clientOptions.baseUrl = environment.PHOENIX_HOST;
  }
  if (environment.PHOENIX_CLIENT_HEADERS !== undefined) {
    clientOptions.headers = environment.PHOENIX_CLIENT_HEADERS;
  }
  return clientOptions;
};
|
|
45
|
+
|
|
46
|
+
/**
 * Read the client options from `process.env`.
 *
 * When `process` or `process.env` is not available as an object, an empty
 * options object is returned.
 *
 * @returns The environment options as a Partial<ClientOptions> object.
 */
export const defaultGetEnvironmentOptions = (): Partial<ClientOptions> => {
  // feature detect process and process.env
  const hasProcessEnv =
    typeof process === "object" && typeof process.env === "object";
  return hasProcessEnv
    ? phoenixEnvironmentToClientOptions(fromEnvironment(process.env))
    : {};
};
|
|
58
|
+
|
|
59
|
+
/**
 * Produce the built-in client defaults.
 *
 * A fresh object is returned on every call; the only default set is
 * `baseUrl`, pointing at `http://localhost:6006`.
 *
 * @returns The default client options as a Partial<ClientOptions> object.
 */
export const makeDefaultClientOptions = (): Partial<ClientOptions> => ({
  baseUrl: "http://localhost:6006",
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from "./runExperiment";
|
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
import { Dataset, Example } from "../types/datasets";
|
|
2
|
+
import { createClient, type PhoenixClient } from "../client";
|
|
3
|
+
import type {
|
|
4
|
+
Evaluator,
|
|
5
|
+
Experiment,
|
|
6
|
+
ExperimentEvaluationRun,
|
|
7
|
+
ExperimentParameters,
|
|
8
|
+
ExperimentRun,
|
|
9
|
+
ExperimentTask,
|
|
10
|
+
RanExperiment,
|
|
11
|
+
} from "../types/experiments";
|
|
12
|
+
import { promisifyResult } from "../utils/promisifyResult";
|
|
13
|
+
import invariant from "tiny-invariant";
|
|
14
|
+
import { pluralize } from "../utils/pluralize";
|
|
15
|
+
|
|
16
|
+
/**
 * The logging interface used while running experiments.
 * Each method receives a single, already formatted message.
 */
export type Logger = {
  info: (message: string) => void;
  error: (message: string) => void;
  log: (message: string) => void;
};
|
|
21
|
+
|
|
22
|
+
/**
 * The parameters accepted by `runExperiment`.
 */
export type RunExperimentParams = {
  /**
   * An optional name for the experiment.
   * Defaults to the dataset name + a timestamp
   */
  experimentName?: string;
  /**
   * The client used to talk to Phoenix.
   * When omitted, one is created with `createClient()`.
   */
  client?: PhoenixClient;
  /**
   * The dataset to run against: a dataset object, a dataset id to fetch,
   * or an array of examples.
   */
  dataset: Dataset | string | Example[];
  /** The task to execute for every example */
  task: ExperimentTask;
  /** The evaluators to run over the task runs, if any */
  evaluators?: Evaluator[];
  /** How many times the task is run over the dataset */
  repetitions?: number;
  /**
   * The project under which the experiment task traces are recorded
   */
  projectName?: string;
  /** The logger that receives progress messages */
  logger?: Logger;
};
|
|
39
|
+
|
|
40
|
+
/**
 * Run an experiment.
 *
 * Resolves the dataset, runs `task` against every example once per
 * repetition, then passes the collected runs to `evaluateExperiment`.
 *
 * Defaults: `repetitions` is 1, `projectName` is "default", `logger` is
 * `console`, the client comes from `createClient()`, and the experiment name
 * is the dataset name followed by an ISO timestamp.
 *
 * @returns The experiment together with its parameters, task runs and evaluation runs.
 */
export async function runExperiment({
  experimentName: _experimentName,
  client: _client,
  dataset: _dataset,
  task,
  evaluators,
  repetitions = 1,
  projectName = "default",
  logger = console,
}: RunExperimentParams): Promise<RanExperiment> {
  const client = _client ?? createClient();
  const dataset = await getDataset({ dataset: _dataset, client });
  invariant(dataset, `Dataset not found`);
  invariant(dataset.examples.length > 0, `Dataset has no examples`);

  const experimentName =
    _experimentName ?? `${dataset.name}-${new Date().toISOString()}`;
  const experiment: Experiment = {
    id: id(),
    datasetId: dataset.id,
    datasetVersionId: dataset.versionId,
    repetitions,
    projectName,
  };
  const evaluatorCount = evaluators?.length ?? 0;

  logger.info(
    `🧪 Starting experiment "${experimentName}" on dataset "${dataset.id}" with task "${task.name}" and ${evaluatorCount} ${pluralize("evaluator", evaluatorCount)}`
  );
  logger.info(
    `🔁 Running ${repetitions} ${pluralize("repetition", repetitions)} of task "${task.name}"`
  );

  // Run task against all examples, for each repetition.
  // Completed runs are collected by id as they finish.
  const runs: Record<string, ExperimentRun> = {};
  const collectRun = (run: ExperimentRun) => {
    runs[run.id] = run;
  };
  const pendingRepetitions: Promise<unknown>[] = [];
  for (let repetition = 1; repetition <= repetitions; repetition++) {
    pendingRepetitions.push(
      runTask({
        repetition,
        experimentId: experiment.id,
        task,
        dataset,
        logger,
        onComplete: collectRun,
      })
    );
  }
  await Promise.all(pendingRepetitions);
  logger.info(`✅ Task runs completed`);

  const ranExperiment: RanExperiment = {
    ...experiment,
    params: {
      nRepetitions: repetitions,
      // TODO: Make configurable?
      nExamples: dataset.examples.length,
    },
    runs,
  };

  const evaluated = await evaluateExperiment({
    experiment: ranExperiment,
    evaluators: evaluators ?? [],
    client,
    logger,
  });
  ranExperiment.evaluationRuns = evaluated.evaluationRuns;

  logger.info(`✅ Experiment ${experiment.id} completed`);

  return ranExperiment;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Run a task once against every example of a dataset.
 *
 * A failing task does not reject: the failure is recorded on the run's
 * `error` field and the run is still reported through `onComplete`.
 *
 * @returns A promise that settles once every example has been processed.
 */
function runTask({
  experimentId,
  task,
  dataset,
  repetition,
  onComplete,
  logger,
}: {
  /** The id of the experiment */
  experimentId: string;
  /** The task to run */
  task: ExperimentTask;
  /** The dataset to run the task on */
  dataset: Dataset;
  /** The repetition number */
  repetition: number;
  /** A callback to call when the task is complete */
  onComplete: (run: ExperimentRun) => void;
  /** The logger to use */
  logger: Logger;
}) {
  logger.info(
    `🔧 (${repetition}) Running task "${task.name}" on dataset "${dataset.id}"`
  );

  const runExample = async (example: Example) => {
    const record: ExperimentRun = {
      id: id(),
      traceId: id(),
      experimentId,
      datasetExampleId: example.id,
      startTime: new Date(),
      repetitionNumber: repetition,
      endTime: new Date(), // placeholder, overwritten once the task settles
      output: null,
      error: null,
    };
    try {
      // The task may return a plain value or a promise; await either.
      const taskOutput = await promisifyResult(task(example));
      // TODO: why doesn't run output type match task output type?
      record.output = JSON.stringify(taskOutput);
    } catch (error) {
      record.error = error instanceof Error ? error.message : "Unknown error";
    }
    record.endTime = new Date();
    onComplete(record);
  };

  const pending = dataset.examples.map(runExample);
  return Promise.all(pending);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Run every evaluator against every task run of an experiment.
 *
 * With an empty evaluator list there is nothing to do, so the experiment is
 * returned with empty `evaluationRuns` without creating a client or fetching
 * the dataset. Otherwise the dataset is fetched by the experiment's
 * `datasetId` so each run can be paired with its example.
 *
 * An evaluator that throws does not reject the whole evaluation; the failure
 * is recorded on that evaluation run's `error` field.
 *
 * @returns A copy of the experiment with `evaluationRuns` populated.
 * @throws If evaluators are present and the dataset cannot be resolved, has
 * no examples, the experiment has no runs, or a run references an example
 * that is not in the dataset.
 */
export async function evaluateExperiment({
  experiment,
  evaluators,
  client: _client,
  logger,
}: {
  /**
   * The experiment to evaluate
   * @todo also accept Experiment, and attempt to fetch the runs from the server
   **/
  experiment: RanExperiment;
  /** The evaluators to use */
  evaluators: Evaluator[];
  /** The client to use */
  client?: PhoenixClient;
  /** The logger to use */
  logger: Logger;
}): Promise<RanExperiment> {
  // Nothing to evaluate: return before any network round-trip is made.
  if (evaluators?.length === 0) {
    return {
      ...experiment,
      evaluationRuns: [],
    };
  }

  const client = _client ?? createClient();
  const dataset = await getDataset({ dataset: experiment.datasetId, client });
  invariant(dataset, `Dataset "${experiment.datasetId}" not found`);
  invariant(
    dataset.examples.length > 0,
    `Dataset "${experiment.datasetId}" has no examples`
  );
  invariant(experiment.runs, `Experiment "${experiment.id}" has no runs`);

  logger.info(
    `🧠 Evaluating experiment "${experiment.id}" with ${evaluators?.length ?? 0} ${pluralize(
      "evaluator",
      evaluators?.length ?? 0
    )}`
  );
  type EvaluationId = string;
  const evaluationRuns: Record<EvaluationId, ExperimentEvaluationRun> = {};

  // Index the examples by id so each run can look up its example directly.
  const examplesById: Record<string, Example> = {};
  for (const example of dataset.examples) {
    examplesById[example.id] = example;
  }

  const onEvaluationComplete = (run: ExperimentEvaluationRun) => {
    evaluationRuns[run.id] = run;
  };

  // Run evaluators against all runs
  await Promise.all(
    evaluators.map((evaluator) =>
      Promise.all(
        Object.values(experiment.runs).map((run) =>
          runEvaluator({
            evaluator,
            run,
            exampleCache: examplesById,
            onComplete: onEvaluationComplete,
          })
        )
      )
    )
  );

  logger.info(`✅ Evaluation runs completed`);

  return {
    ...experiment,
    evaluationRuns: Object.values(evaluationRuns),
  };
}
|
|
248
|
+
|
|
249
|
+
/**
 * Run a single evaluator against a single task run.
 *
 * The run's example is looked up in `exampleCache`. A throwing evaluator does
 * not reject: its message is stored on the evaluation run's `error` field and
 * the evaluation run is still reported through `onComplete`.
 *
 * @throws If the run's example is missing from `exampleCache`.
 */
async function runEvaluator({
  evaluator,
  run,
  exampleCache,
  onComplete,
}: {
  evaluator: Evaluator;
  run: ExperimentRun;
  exampleCache: Record<string, Example>;
  onComplete: (run: ExperimentEvaluationRun) => void;
}) {
  const example = exampleCache[run.datasetExampleId];
  invariant(example, `Example "${run.datasetExampleId}" not found`);

  const evaluation: ExperimentEvaluationRun = {
    id: id(),
    traceId: id(),
    experimentRunId: run.id,
    startTime: new Date(),
    endTime: new Date(), // placeholder, overwritten once the evaluator settles
    name: evaluator.name,
    result: null,
    error: null,
    annotatorKind: "LLM", // TODO: make configurable via evaluator def
  };

  try {
    evaluation.result = await evaluator.evaluate({
      input: example.input,
      output: run.output ?? null,
      expected: example.output,
      metadata: example.metadata,
    });
  } catch (error) {
    evaluation.error =
      error instanceof Error ? error.message : "Unknown error";
  }

  evaluation.endTime = new Date();
  onComplete(evaluation);
}
|
|
294
|
+
|
|
295
|
+
/**
 * Resolve the input into a dataset object.
 *
 * - A string is treated as a dataset id: the dataset and its examples are
 *   fetched through the client and combined.
 * - An array of examples is not supported yet and throws.
 * - A dataset object is returned as is.
 *
 * @param dataset - The dataset to get.
 * @returns The dataset.
 * @throws If the dataset or its examples cannot be fetched, or if an array of examples is passed.
 */
async function getDataset({
  dataset,
  client,
}: {
  dataset: Dataset | string | Example[];
  client: PhoenixClient;
}): Promise<Dataset> {
  if (Array.isArray(dataset)) {
    throw new Error("TODO: implement dataset creation from examples");
  }
  if (typeof dataset !== "string") {
    return dataset;
  }

  const datasetResult = await client.GET(`/v1/datasets/{id}`, {
    params: { path: { id: dataset } },
  });
  const datasetResponse = datasetResult.data?.data;
  invariant(datasetResponse, `Dataset ${dataset} not found`);

  const examplesResult = await client.GET(`/v1/datasets/{id}/examples`, {
    params: { path: { id: dataset } },
  });
  const examples = examplesResult.data?.data;
  invariant(examples, `Examples for dataset ${dataset} not found`);

  const resolved: Dataset = {
    ...datasetResponse,
    // Each example keeps its response fields and gains a Date-typed updatedAt.
    examples: examples.examples.map((example) => ({
      ...example,
      updatedAt: new Date(example.updated_at),
    })),
    versionId: examples.version_id,
  };
  return resolved;
}
|
|
336
|
+
|
|
337
|
+
/**
 * Pair an evaluator function with a name, producing an evaluator object.
 *
 * @param name - The name of the evaluator.
 * @param evaluate - The evaluator function.
 * @returns The evaluator object.
 */
export function asEvaluator(
  name: string,
  evaluate: Evaluator["evaluate"]
): Evaluator {
  const evaluator: Evaluator = { name, evaluate };
  return evaluator;
}
|
|
353
|
+
|
|
354
|
+
// Module-level counter backing id(); the first id handed out is "1001".
let _id = 1000;

/**
 * Generate an id by incrementing a module-level counter.
 * Ids are only unique within the current module instance.
 *
 * @deprecated Use id generated by phoenix instead.
 * @returns The next counter value, as a string.
 */
export function id(): string {
  _id += 1;
  return String(_id);
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/** Who produced an annotation: a human or an LLM. */
export type AnnotatorKind = "HUMAN" | "LLM";
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { Node } from "./core";
|
|
2
|
+
|
|
3
|
+
/**
 * An example is a record to feed into an AI task
 */
export interface Example extends Node {
  /** The id of the example */
  id: string;
  /** When the example was last updated */
  updatedAt: Date;
  /** The input handed to the task */
  input: Record<string, unknown>;
  /** The expected or reference output, if there is one */
  output: Record<string, unknown> | null;
  /** Metadata associated with the example */
  metadata: Record<string, unknown>;
}
|
|
13
|
+
|
|
14
|
+
/**
 * A dataset is a collection of examples for an AI task
 */
export interface Dataset extends Node {
  /** The id of the dataset */
  id: string;
  /** The name of the dataset */
  name: string;
  /** The id of the dataset version the examples belong to */
  versionId: string;
  /** The examples in the dataset */
  examples: Example[];
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { AnnotatorKind } from "./annotations";
|
|
2
|
+
import { Node } from "./core";
|
|
3
|
+
import { Example } from "./datasets";
|
|
4
|
+
|
|
5
|
+
/**
 * An experiment is a set of task runs on a dataset version
 */
export interface Experiment extends Node {
  /** The id of the dataset the experiment runs on */
  datasetId: string;
  /** The id of the dataset version the experiment runs on */
  datasetVersionId: string;
  /** How many times the task is run over the dataset */
  repetitions: number;
  /**
   * The project under which the experiment task traces are recorded
   */
  projectName: string;
}
|
|
17
|
+
|
|
18
|
+
/**
 * An experiment that has been run, carrying its parameters and results.
 */
export interface RanExperiment extends Experiment {
  /** The parameters the experiment was run with */
  params: ExperimentParameters;
  /** The task runs, keyed by run id */
  runs: Record<string, ExperimentRun>;
  /** The evaluation runs, when the experiment has been evaluated */
  evaluationRuns?: ExperimentEvaluationRun[];
}
|
|
23
|
+
|
|
24
|
+
/**
 * The result of running an experiment on a single example
 */
export interface ExperimentRun extends Node {
  /** When the run started */
  startTime: Date;
  /** When the run finished */
  endTime: Date;
  /**
   * What experiment the run belongs to
   */
  experimentId: string;
  /** The id of the dataset example the run was executed against */
  datasetExampleId: string;
  /** Which repetition of the experiment produced the run */
  repetitionNumber: number;
  /** The output of the task, if any */
  output?: string | Record<string, unknown> | null;
  /** The error message when the task failed, otherwise null */
  error: string | null;
  /** The trace id of the run, if any */
  traceId: string | null;
}
|
|
40
|
+
|
|
41
|
+
/**
 * The arguments handed to an evaluator for a single task run.
 */
export type EvaluatorParams = {
  /**
   * The input field of the Dataset Example
   */
  input: Example["input"];
  /**
   * The output of the task
   */
  output: TaskOutput;
  /**
   * The expected or reference output of the Dataset Example
   */
  expected?: Example["output"];
  /**
   * Metadata associated with the Dataset Example
   */
  metadata?: Record<string, unknown>;
};
|
|
59
|
+
|
|
60
|
+
/**
 * A named evaluation function.
 * `evaluate` may return its result directly or as a promise.
 */
export type Evaluator = {
  /** The name of the evaluator */
  name: string;
  /** Produces an evaluation result for one task run */
  evaluate: (
    args: EvaluatorParams
  ) => Promise<EvaluationResult> | EvaluationResult;
};
|
|
66
|
+
|
|
67
|
+
/**
 * The outcome an evaluator reports for a single task run.
 */
export type EvaluationResult = {
  /** A numeric score, or null when none is given */
  score: number | null;
  /** A label, or null when none is given */
  label: string | null;
  /** Metadata attached to the result */
  metadata: Record<string, unknown>;
  /** An explanation of the result, or null when none is given */
  explanation: string | null;
};
|
|
73
|
+
|
|
74
|
+
/**
 * The result of running one evaluator against one experiment run.
 */
export interface ExperimentEvaluationRun extends Node {
  /** The id of the experiment run that was evaluated */
  experimentRunId: string;
  /** When the evaluation started */
  startTime: Date;
  /** When the evaluation finished */
  endTime: Date;
  /**
   * The name of the evaluation
   */
  name: string;
  /** The kind of annotator that produced the evaluation */
  annotatorKind: AnnotatorKind;
  /** The error message when the evaluator failed, otherwise null */
  error: string | null;
  /** The evaluator's result, or null when none was produced */
  result: EvaluationResult | null;
  /**
   * The trace id of the evaluation
   * This is null if the trace is deleted or never recorded
   */
  traceId: string | null;
}
|
|
91
|
+
|
|
92
|
+
/** The value a task may produce for an example. */
export type TaskOutput = string | boolean | number | object | null;
|
|
93
|
+
|
|
94
|
+
/**
 * A task run against each example of a dataset.
 * It may return its output directly or as a promise.
 */
export type ExperimentTask = (
  example: Example
) => Promise<TaskOutput> | TaskOutput;
|
|
97
|
+
|
|
98
|
+
/**
 * The parameters an experiment was run with.
 */
export interface ExperimentParameters {
  /**
   * The number of examples to run the experiment on
   */
  nExamples: number;
  /**
   * The number of repetitions to run the experiment
   * e.g. the number of times to run the task
   */
  nRepetitions: number;
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Pick the singular or plural form of a word for a given count.
 *
 * The plural is formed by appending "s"; only a count of exactly 1 yields
 * the word unchanged.
 *
 * @param word - The word to pluralize.
 * @param count - The count of the word.
 * @returns The word itself when count is 1, otherwise the word with "s" appended.
 */
export function pluralize(word: string, count: number) {
  if (count === 1) {
    return word;
  }
  return `${word}s`;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Normalize a value that may or may not be a promise into a promise.
 *
 * A `Promise` instance is handed back untouched; any other value is wrapped
 * with `Promise.resolve`.
 */
export function promisifyResult<T>(result: T) {
  if (!(result instanceof Promise)) {
    return Promise.resolve(result) as T extends Promise<unknown>
      ? never
      : Promise<T>;
  }

  return result as T extends Promise<infer U> ? Promise<U> : never;
}
|