@arizeai/phoenix-evals 0.0.8 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/default_templates/HALLUCINATION_TEMPLATE.d.ts +1 -1
- package/dist/esm/default_templates/HALLUCINATION_TEMPLATE.js +2 -2
- package/dist/esm/default_templates/HALLUCINATION_TEMPLATE.js.map +1 -1
- package/dist/esm/llm/createClassificationEvaluator.d.ts +3 -0
- package/dist/esm/llm/createClassificationEvaluator.d.ts.map +1 -0
- package/dist/esm/llm/createClassificationEvaluator.js +10 -0
- package/dist/esm/llm/createClassificationEvaluator.js.map +1 -0
- package/dist/esm/llm/createClassifierFn.d.ts +6 -0
- package/dist/esm/llm/createClassifierFn.d.ts.map +1 -0
- package/dist/esm/llm/{createClassifier.js → createClassifierFn.js} +3 -3
- package/dist/esm/llm/createClassifierFn.js.map +1 -0
- package/dist/esm/llm/createDocumentRelevancyEvaluator.d.ts +8 -6
- package/dist/esm/llm/createDocumentRelevancyEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createDocumentRelevancyEvaluator.js +6 -5
- package/dist/esm/llm/createDocumentRelevancyEvaluator.js.map +1 -1
- package/dist/esm/llm/createHallucinationEvaluator.d.ts +7 -5
- package/dist/esm/llm/createHallucinationEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createHallucinationEvaluator.js +5 -4
- package/dist/esm/llm/createHallucinationEvaluator.js.map +1 -1
- package/dist/esm/llm/index.d.ts +2 -1
- package/dist/esm/llm/index.d.ts.map +1 -1
- package/dist/esm/llm/index.js +2 -1
- package/dist/esm/llm/index.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/evals.d.ts +45 -0
- package/dist/esm/types/evals.d.ts.map +1 -1
- package/dist/src/default_templates/HALLUCINATION_TEMPLATE.d.ts +1 -1
- package/dist/src/default_templates/HALLUCINATION_TEMPLATE.js +2 -2
- package/dist/src/default_templates/HALLUCINATION_TEMPLATE.js.map +1 -1
- package/dist/src/llm/createClassificationEvaluator.d.ts +3 -0
- package/dist/src/llm/createClassificationEvaluator.d.ts.map +1 -0
- package/dist/src/llm/createClassificationEvaluator.js +13 -0
- package/dist/src/llm/createClassificationEvaluator.js.map +1 -0
- package/dist/src/llm/createClassifierFn.d.ts +6 -0
- package/dist/src/llm/createClassifierFn.d.ts.map +1 -0
- package/dist/src/llm/{createClassifier.js → createClassifierFn.js} +4 -4
- package/dist/src/llm/createClassifierFn.js.map +1 -0
- package/dist/src/llm/createDocumentRelevancyEvaluator.d.ts +8 -6
- package/dist/src/llm/createDocumentRelevancyEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createDocumentRelevancyEvaluator.js +7 -6
- package/dist/src/llm/createDocumentRelevancyEvaluator.js.map +1 -1
- package/dist/src/llm/createHallucinationEvaluator.d.ts +7 -5
- package/dist/src/llm/createHallucinationEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createHallucinationEvaluator.js +6 -5
- package/dist/src/llm/createHallucinationEvaluator.js.map +1 -1
- package/dist/src/llm/index.d.ts +2 -1
- package/dist/src/llm/index.d.ts.map +1 -1
- package/dist/src/llm/index.js +2 -1
- package/dist/src/llm/index.js.map +1 -1
- package/dist/src/types/evals.d.ts +45 -0
- package/dist/src/types/evals.d.ts.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +2 -2
- package/src/default_templates/HALLUCINATION_TEMPLATE.ts +2 -2
- package/src/llm/createClassificationEvaluator.ts +13 -0
- package/src/llm/{createClassifier.ts → createClassifierFn.ts} +2 -2
- package/src/llm/createDocumentRelevancyEvaluator.ts +23 -15
- package/src/llm/createHallucinationEvaluator.ts +16 -8
- package/src/llm/index.ts +2 -1
- package/src/types/evals.ts +49 -0
- package/dist/esm/llm/createClassifier.d.ts +0 -6
- package/dist/esm/llm/createClassifier.d.ts.map +0 -1
- package/dist/esm/llm/createClassifier.js.map +0 -1
- package/dist/src/llm/createClassifier.d.ts +0 -6
- package/dist/src/llm/createClassifier.d.ts.map +0 -1
- package/dist/src/llm/createClassifier.js.map +0 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arizeai/phoenix-evals",
|
|
3
|
-
"version": "0.0.8",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "A library for running evaluations for AI use cases",
|
|
5
5
|
"main": "dist/src/index.js",
|
|
6
6
|
"module": "dist/esm/index.js",
|
|
@@ -50,7 +50,7 @@
|
|
|
50
50
|
"typedoc": "^0.27.9",
|
|
51
51
|
"typescript": "^5.8.2",
|
|
52
52
|
"vitest": "^2.1.9",
|
|
53
|
-
"@arizeai/phoenix-client": "
|
|
53
|
+
"@arizeai/phoenix-client": "3.0.0"
|
|
54
54
|
},
|
|
55
55
|
"engines": {
|
|
56
56
|
"node": ">=18"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { CreateClassificationEvaluatorArgs, Evaluator } from "../types/evals";
|
|
2
|
+
import { createClassifierFn } from "./createClassifierFn";
|
|
3
|
+
|
|
4
|
+
export function createClassificationEvaluator<
|
|
5
|
+
ExampleType extends Record<string, unknown>,
|
|
6
|
+
>(args: CreateClassificationEvaluatorArgs): Evaluator<ExampleType> {
|
|
7
|
+
return {
|
|
8
|
+
name: args.name,
|
|
9
|
+
source: "LLM",
|
|
10
|
+
optimizationDirection: args.optimizationDirection,
|
|
11
|
+
evaluate: createClassifierFn(args),
|
|
12
|
+
};
|
|
13
|
+
}
|
|
@@ -22,9 +22,9 @@ function choicesToLabels(
|
|
|
22
22
|
}
|
|
23
23
|
|
|
24
24
|
/**
|
|
25
|
-
* A function that serves as a factory that will output a classification evaluator
|
|
25
|
+
* A function that serves as a factory that will output a classification evaluator function
|
|
26
26
|
*/
|
|
27
|
-
export function createClassifier<ExampleType extends Record<string, unknown>>(
|
|
27
|
+
export function createClassifierFn<ExampleType extends Record<string, unknown>>(
|
|
28
28
|
args: CreateClassifierArgs
|
|
29
29
|
): EvaluatorFn<ExampleType> {
|
|
30
30
|
const { model, choices, promptTemplate, ...rest } = args;
|
|
@@ -1,14 +1,19 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { CreateClassifierArgs, EvaluatorFn } from "../types/evals";
|
|
1
|
+
import { CreateClassificationEvaluatorArgs, Evaluator } from "../types/evals";
|
|
3
2
|
import {
|
|
4
3
|
DOCUMENT_RELEVANCY_TEMPLATE,
|
|
5
4
|
DOCUMENT_RELEVANCY_CHOICES,
|
|
6
5
|
} from "../default_templates/DOCUMENT_RELEVANCY_TEMPLATE";
|
|
6
|
+
import { createClassificationEvaluator } from "./createClassificationEvaluator";
|
|
7
7
|
|
|
8
8
|
export interface DocumentRelevancyEvaluatorArgs
|
|
9
|
-
extends Omit<
|
|
10
|
-
|
|
11
|
-
|
|
9
|
+
extends Omit<
|
|
10
|
+
CreateClassificationEvaluatorArgs,
|
|
11
|
+
"promptTemplate" | "choices" | "optimizationDirection" | "name"
|
|
12
|
+
> {
|
|
13
|
+
optimizationDirection?: CreateClassificationEvaluatorArgs["optimizationDirection"];
|
|
14
|
+
name?: CreateClassificationEvaluatorArgs["name"];
|
|
15
|
+
choices?: CreateClassificationEvaluatorArgs["choices"];
|
|
16
|
+
promptTemplate?: CreateClassificationEvaluatorArgs["promptTemplate"];
|
|
12
17
|
}
|
|
13
18
|
|
|
14
19
|
/**
|
|
@@ -38,7 +43,7 @@ export type DocumentRelevancyExample = {
|
|
|
38
43
|
* @example
|
|
39
44
|
* ```ts
|
|
40
45
|
* const evaluator = createDocumentRelevancyEvaluator({ model: openai("gpt-4o-mini") });
|
|
41
|
-
* const result = await evaluator({
|
|
46
|
+
* const result = await evaluator.evaluate({
|
|
42
47
|
* input: "What is the capital of France?",
|
|
43
48
|
* documentText: "Paris is the capital and most populous city of France.",
|
|
44
49
|
* });
|
|
@@ -47,18 +52,21 @@ export type DocumentRelevancyExample = {
|
|
|
47
52
|
*/
|
|
48
53
|
export function createDocumentRelevancyEvaluator(
|
|
49
54
|
args: DocumentRelevancyEvaluatorArgs
|
|
50
|
-
):
|
|
55
|
+
): Evaluator<DocumentRelevancyExample> {
|
|
51
56
|
const {
|
|
52
57
|
choices = DOCUMENT_RELEVANCY_CHOICES,
|
|
53
58
|
promptTemplate = DOCUMENT_RELEVANCY_TEMPLATE,
|
|
59
|
+
optimizationDirection = "MAXIMIZE",
|
|
60
|
+
name = "document_relevancy",
|
|
54
61
|
...rest
|
|
55
62
|
} = args;
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
63
|
+
|
|
64
|
+
return createClassificationEvaluator<DocumentRelevancyExample>({
|
|
65
|
+
...args,
|
|
66
|
+
promptTemplate,
|
|
67
|
+
choices,
|
|
68
|
+
optimizationDirection,
|
|
69
|
+
name,
|
|
70
|
+
...rest,
|
|
71
|
+
});
|
|
64
72
|
}
|
|
@@ -1,14 +1,19 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { CreateClassifierArgs, EvaluatorFn } from "../types/evals";
|
|
1
|
+
import { CreateClassificationEvaluatorArgs, Evaluator } from "../types/evals";
|
|
3
2
|
import {
|
|
4
3
|
HALLUCINATION_TEMPLATE,
|
|
5
4
|
HALLUCINATION_CHOICES,
|
|
6
5
|
} from "../default_templates/HALLUCINATION_TEMPLATE";
|
|
6
|
+
import { createClassificationEvaluator } from "./createClassificationEvaluator";
|
|
7
7
|
|
|
8
8
|
export interface HallucinationEvaluatorArgs
|
|
9
|
-
extends Omit<
|
|
10
|
-
|
|
11
|
-
|
|
9
|
+
extends Omit<
|
|
10
|
+
CreateClassificationEvaluatorArgs,
|
|
11
|
+
"promptTemplate" | "choices" | "optimizationDirection" | "name"
|
|
12
|
+
> {
|
|
13
|
+
optimizationDirection?: CreateClassificationEvaluatorArgs["optimizationDirection"];
|
|
14
|
+
name?: CreateClassificationEvaluatorArgs["name"];
|
|
15
|
+
choices?: CreateClassificationEvaluatorArgs["choices"];
|
|
16
|
+
promptTemplate?: CreateClassificationEvaluatorArgs["promptTemplate"];
|
|
12
17
|
}
|
|
13
18
|
|
|
14
19
|
/**
|
|
@@ -28,17 +33,20 @@ export type HallucinationExample = {
|
|
|
28
33
|
*/
|
|
29
34
|
export function createHallucinationEvaluator(
|
|
30
35
|
args: HallucinationEvaluatorArgs
|
|
31
|
-
):
|
|
36
|
+
): Evaluator<HallucinationExample> {
|
|
32
37
|
const {
|
|
33
38
|
choices = HALLUCINATION_CHOICES,
|
|
34
39
|
promptTemplate = HALLUCINATION_TEMPLATE,
|
|
40
|
+
optimizationDirection = "MINIMIZE",
|
|
41
|
+
name = "hallucination",
|
|
35
42
|
...rest
|
|
36
43
|
} = args;
|
|
37
|
-
|
|
44
|
+
return createClassificationEvaluator<HallucinationExample>({
|
|
38
45
|
...args,
|
|
39
46
|
promptTemplate,
|
|
40
47
|
choices,
|
|
48
|
+
optimizationDirection,
|
|
49
|
+
name,
|
|
41
50
|
...rest,
|
|
42
51
|
});
|
|
43
|
-
return hallucinationEvaluatorFn;
|
|
44
52
|
}
|
package/src/llm/index.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
export * from "./generateClassification";
|
|
2
|
-
export * from "./createClassifier";
|
|
2
|
+
export * from "./createClassifierFn";
|
|
3
|
+
export * from "./createClassificationEvaluator";
|
|
3
4
|
export * from "./createHallucinationEvaluator";
|
|
4
5
|
export * from "./createDocumentRelevancyEvaluator";
|
package/src/types/evals.ts
CHANGED
|
@@ -78,6 +78,55 @@ export interface CreateClassifierArgs extends WithTelemetry {
|
|
|
78
78
|
promptTemplate: string;
|
|
79
79
|
}
|
|
80
80
|
|
|
81
|
+
export interface CreateClassificationEvaluatorArgs
|
|
82
|
+
extends CreateClassifierArgs {
|
|
83
|
+
/**
|
|
84
|
+
* The name of the metric that the evaluator produces
|
|
85
|
+
* E.x. "correctness"
|
|
86
|
+
*/
|
|
87
|
+
name: string;
|
|
88
|
+
/**
|
|
89
|
+
* If present, represents the direction in which you want the metric to be optimized
|
|
90
|
+
* E.x. "MAXIMIZE" means you want the number to be higher.
|
|
91
|
+
*/
|
|
92
|
+
optimizationDirection?: OptimizationDirection;
|
|
93
|
+
}
|
|
94
|
+
|
|
81
95
|
export type EvaluatorFn<ExampleType extends Record<string, unknown>> = (
|
|
82
96
|
args: ExampleType
|
|
83
97
|
) => Promise<EvaluationResult>;
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* The source of the evaluation
|
|
101
|
+
*/
|
|
102
|
+
type EvaluationSource = "LLM" | "CODE";
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* The direction to optimize the numeric evaluation score
|
|
106
|
+
* E.x. "MAXIMIZE" means that the higher the score, the better the evaluation
|
|
107
|
+
*/
|
|
108
|
+
type OptimizationDirection = "MAXIMIZE" | "MINIMIZE";
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* The Base Evaluator interface
|
|
112
|
+
* This is the interface that all evaluators must implement
|
|
113
|
+
*/
|
|
114
|
+
export interface Evaluator<ExampleType extends Record<string, unknown>> {
|
|
115
|
+
/**
|
|
116
|
+
* The name of the evaluator / the metric that it measures
|
|
117
|
+
*/
|
|
118
|
+
name: string;
|
|
119
|
+
/**
|
|
120
|
+
* The source of the evaluation. Also known as the "kind" of evaluator.
|
|
121
|
+
*/
|
|
122
|
+
source: EvaluationSource;
|
|
123
|
+
/**
|
|
124
|
+
* The direction to optimize the numeric evaluation score
|
|
125
|
+
* E.x. "MAXIMIZE" means that the higher the score, the better the evaluation
|
|
126
|
+
*/
|
|
127
|
+
optimizationDirection?: OptimizationDirection;
|
|
128
|
+
/**
|
|
129
|
+
* The function that evaluates the example
|
|
130
|
+
*/
|
|
131
|
+
evaluate: EvaluatorFn<ExampleType>;
|
|
132
|
+
}
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
import { CreateClassifierArgs, EvaluatorFn } from "../types/evals.js";
|
|
2
|
-
/**
|
|
3
|
-
* A function that serves as a factory that will output a classification evaluator
|
|
4
|
-
*/
|
|
5
|
-
export declare function createClassifier<ExampleType extends Record<string, unknown>>(args: CreateClassifierArgs): EvaluatorFn<ExampleType>;
|
|
6
|
-
//# sourceMappingURL=createClassifier.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"createClassifier.d.ts","sourceRoot":"","sources":["../../../src/llm/createClassifier.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,oBAAoB,EACpB,WAAW,EACZ,MAAM,gBAAgB,CAAC;AAkBxB;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAC1E,IAAI,EAAE,oBAAoB,GACzB,WAAW,CAAC,WAAW,CAAC,CA4B1B"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"createClassifier.js","sourceRoot":"","sources":["../../../src/llm/createClassifier.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,sBAAsB,EAAE,MAAM,0BAA0B,CAAC;AAClE,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAE7C;;;GAGG;AACH,SAAS,eAAe,CACtB,OAAiC;IAEjC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACpC,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,MAAM,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,MAA+B,CAAC;AACzC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAC9B,IAA0B;IAE1B,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,cAAc,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,CAAC;IAEzD,OAAO,KAAK,EAAE,IAAiB,EAA6B,EAAE;QAC5D,MAAM,iBAAiB,GAAG;YACxB,GAAG,IAAI;SACR,CAAC;QAEF,MAAM,MAAM,GAAG,cAAc,CAAC;YAC5B,QAAQ,EAAE,cAAc;YACxB,SAAS,EAAE,iBAAiB;SAC7B,CAAC,CAAC;QAEH,MAAM,cAAc,GAAG,MAAM,sBAAsB,CAAC;YAClD,KAAK;YACL,MAAM,EAAE,eAAe,CAAC,OAAO,CAAC;YAChC,MAAM;YACN,GAAG,IAAI;SACR,CAAC,CAAC;QAEH,mEAAmE;QACnE,MAAM,KAAK,GAAG,OAAO,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;QAE5C,OAAO;YACL,KAAK;YACL,GAAG,cAAc;SAClB,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC"}
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
import { CreateClassifierArgs, EvaluatorFn } from "../types/evals";
|
|
2
|
-
/**
|
|
3
|
-
* A function that serves as a factory that will output a classification evaluator
|
|
4
|
-
*/
|
|
5
|
-
export declare function createClassifier<ExampleType extends Record<string, unknown>>(args: CreateClassifierArgs): EvaluatorFn<ExampleType>;
|
|
6
|
-
//# sourceMappingURL=createClassifier.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"createClassifier.d.ts","sourceRoot":"","sources":["../../../src/llm/createClassifier.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,oBAAoB,EACpB,WAAW,EACZ,MAAM,gBAAgB,CAAC;AAkBxB;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAC1E,IAAI,EAAE,oBAAoB,GACzB,WAAW,CAAC,WAAW,CAAC,CA4B1B"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"createClassifier.js","sourceRoot":"","sources":["../../../src/llm/createClassifier.ts"],"names":[],"mappings":";;;;;;;;;;;;;AA0BA,4CA8BC;AAlDD,qEAAkE;AAClE,0CAA6C;AAE7C;;;GAGG;AACH,SAAS,eAAe,CACtB,OAAiC;IAEjC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACpC,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,MAAM,IAAI,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,MAA+B,CAAC;AACzC,CAAC;AAED;;GAEG;AACH,SAAgB,gBAAgB,CAC9B,IAA0B;IAE1B,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,cAAc,KAAc,IAAI,EAAb,IAAI,UAAK,IAAI,EAAlD,sCAA2C,CAAO,CAAC;IAEzD,OAAO,KAAK,EAAE,IAAiB,EAA6B,EAAE;QAC5D,MAAM,iBAAiB,qBAClB,IAAI,CACR,CAAC;QAEF,MAAM,MAAM,GAAG,IAAA,yBAAc,EAAC;YAC5B,QAAQ,EAAE,cAAc;YACxB,SAAS,EAAE,iBAAiB;SAC7B,CAAC,CAAC;QAEH,MAAM,cAAc,GAAG,MAAM,IAAA,+CAAsB,kBACjD,KAAK,EACL,MAAM,EAAE,eAAe,CAAC,OAAO,CAAC,EAChC,MAAM,IACH,IAAI,EACP,CAAC;QAEH,mEAAmE;QACnE,MAAM,KAAK,GAAG,OAAO,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;QAE5C,uBACE,KAAK,IACF,cAAc,EACjB;IACJ,CAAC,CAAC;AACJ,CAAC"}
|