@arizeai/phoenix-evals 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/README.md +30 -1
  2. package/dist/esm/llm/ClassificationEvaluator.d.ts +17 -0
  3. package/dist/esm/llm/ClassificationEvaluator.d.ts.map +1 -0
  4. package/dist/esm/llm/ClassificationEvaluator.js +33 -0
  5. package/dist/esm/llm/ClassificationEvaluator.js.map +1 -0
  6. package/dist/esm/llm/LLMEvaluator.d.ts +12 -0
  7. package/dist/esm/llm/LLMEvaluator.d.ts.map +1 -0
  8. package/dist/esm/llm/LLMEvaluator.js +13 -0
  9. package/dist/esm/llm/LLMEvaluator.js.map +1 -0
  10. package/dist/esm/llm/createClassificationEvaluator.d.ts +3 -2
  11. package/dist/esm/llm/createClassificationEvaluator.d.ts.map +1 -1
  12. package/dist/esm/llm/createClassificationEvaluator.js +2 -7
  13. package/dist/esm/llm/createClassificationEvaluator.js.map +1 -1
  14. package/dist/esm/llm/createClassifierFn.d.ts +1 -1
  15. package/dist/esm/llm/createClassifierFn.d.ts.map +1 -1
  16. package/dist/esm/llm/createClassifierFn.js.map +1 -1
  17. package/dist/esm/llm/createDocumentRelevancyEvaluator.d.ts +5 -4
  18. package/dist/esm/llm/createDocumentRelevancyEvaluator.d.ts.map +1 -1
  19. package/dist/esm/llm/createDocumentRelevancyEvaluator.js.map +1 -1
  20. package/dist/esm/llm/createHallucinationEvaluator.d.ts +5 -4
  21. package/dist/esm/llm/createHallucinationEvaluator.d.ts.map +1 -1
  22. package/dist/esm/llm/createHallucinationEvaluator.js.map +1 -1
  23. package/dist/esm/llm/generateClassification.d.ts +2 -2
  24. package/dist/esm/llm/generateClassification.d.ts.map +1 -1
  25. package/dist/esm/llm/generateClassification.js.map +1 -1
  26. package/dist/esm/template/getTemplateVariables.d.ts +12 -0
  27. package/dist/esm/template/getTemplateVariables.d.ts.map +1 -0
  28. package/dist/esm/template/getTemplateVariables.js +18 -0
  29. package/dist/esm/template/getTemplateVariables.js.map +1 -0
  30. package/dist/esm/template/index.d.ts +1 -0
  31. package/dist/esm/template/index.d.ts.map +1 -1
  32. package/dist/esm/template/index.js +1 -0
  33. package/dist/esm/template/index.js.map +1 -1
  34. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  35. package/dist/esm/types/evals.d.ts +13 -6
  36. package/dist/esm/types/evals.d.ts.map +1 -1
  37. package/dist/esm/types/otel.d.ts +2 -2
  38. package/dist/esm/types/otel.d.ts.map +1 -1
  39. package/dist/esm/types/prompts.d.ts +2 -16
  40. package/dist/esm/types/prompts.d.ts.map +1 -1
  41. package/dist/src/llm/ClassificationEvaluator.d.ts +17 -0
  42. package/dist/src/llm/ClassificationEvaluator.d.ts.map +1 -0
  43. package/dist/src/llm/ClassificationEvaluator.js +34 -0
  44. package/dist/src/llm/ClassificationEvaluator.js.map +1 -0
  45. package/dist/src/llm/LLMEvaluator.d.ts +12 -0
  46. package/dist/src/llm/LLMEvaluator.d.ts.map +1 -0
  47. package/dist/src/llm/LLMEvaluator.js +15 -0
  48. package/dist/src/llm/LLMEvaluator.js.map +1 -0
  49. package/dist/src/llm/createClassificationEvaluator.d.ts +3 -2
  50. package/dist/src/llm/createClassificationEvaluator.d.ts.map +1 -1
  51. package/dist/src/llm/createClassificationEvaluator.js +2 -7
  52. package/dist/src/llm/createClassificationEvaluator.js.map +1 -1
  53. package/dist/src/llm/createClassifierFn.d.ts +1 -1
  54. package/dist/src/llm/createClassifierFn.d.ts.map +1 -1
  55. package/dist/src/llm/createClassifierFn.js.map +1 -1
  56. package/dist/src/llm/createDocumentRelevancyEvaluator.d.ts +5 -4
  57. package/dist/src/llm/createDocumentRelevancyEvaluator.d.ts.map +1 -1
  58. package/dist/src/llm/createDocumentRelevancyEvaluator.js.map +1 -1
  59. package/dist/src/llm/createHallucinationEvaluator.d.ts +5 -4
  60. package/dist/src/llm/createHallucinationEvaluator.d.ts.map +1 -1
  61. package/dist/src/llm/createHallucinationEvaluator.js.map +1 -1
  62. package/dist/src/llm/generateClassification.d.ts +2 -2
  63. package/dist/src/llm/generateClassification.d.ts.map +1 -1
  64. package/dist/src/llm/generateClassification.js.map +1 -1
  65. package/dist/src/template/getTemplateVariables.d.ts +12 -0
  66. package/dist/src/template/getTemplateVariables.d.ts.map +1 -0
  67. package/dist/src/template/getTemplateVariables.js +24 -0
  68. package/dist/src/template/getTemplateVariables.js.map +1 -0
  69. package/dist/src/template/index.d.ts +1 -0
  70. package/dist/src/template/index.d.ts.map +1 -1
  71. package/dist/src/template/index.js +1 -0
  72. package/dist/src/template/index.js.map +1 -1
  73. package/dist/src/types/evals.d.ts +13 -6
  74. package/dist/src/types/evals.d.ts.map +1 -1
  75. package/dist/src/types/otel.d.ts +2 -2
  76. package/dist/src/types/otel.d.ts.map +1 -1
  77. package/dist/src/types/prompts.d.ts +2 -16
  78. package/dist/src/types/prompts.d.ts.map +1 -1
  79. package/dist/tsconfig.tsbuildinfo +1 -1
  80. package/package.json +3 -3
  81. package/src/llm/ClassificationEvaluator.ts +40 -0
  82. package/src/llm/LLMEvaluator.ts +22 -0
  83. package/src/llm/createClassificationEvaluator.ts +7 -10
  84. package/src/llm/createClassifierFn.ts +4 -4
  85. package/src/llm/createDocumentRelevancyEvaluator.ts +11 -7
  86. package/src/llm/createHallucinationEvaluator.ts +8 -7
  87. package/src/llm/generateClassification.ts +16 -14
  88. package/src/template/getTemplateVariables.ts +22 -0
  89. package/src/template/index.ts +1 -0
  90. package/src/types/evals.ts +18 -7
  91. package/src/types/otel.ts +2 -2
  92. package/src/types/prompts.ts +2 -17
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arizeai/phoenix-evals",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "description": "A library for running evaluations for AI use cases",
5
5
  "main": "dist/src/index.js",
6
6
  "module": "dist/esm/index.js",
@@ -50,7 +50,7 @@
50
50
  "typedoc": "^0.27.9",
51
51
  "typescript": "^5.8.2",
52
52
  "vitest": "^2.1.9",
53
- "@arizeai/phoenix-client": "3.0.0"
53
+ "@arizeai/phoenix-client": "4.2.0"
54
54
  },
55
55
  "engines": {
56
56
  "node": ">=18"
@@ -65,7 +65,7 @@
65
65
  "clean": "rimraf dist",
66
66
  "prebuild": "pnpm run clean",
67
67
  "build": "tsc --build tsconfig.json tsconfig.esm.json && tsc-alias -p tsconfig.esm.json",
68
- "postbuild": "echo '{\"type\": \"module\"}' > ./dist/esm/package.json && rimraf dist/test dist/examples",
68
+ "postbuild": "echo '{\"type\": \"module\"}' > ./dist/esm/package.json",
69
69
  "type:check": "tsc --noEmit",
70
70
  "test": "vitest --typecheck",
71
71
  "docs": "typedoc",
@@ -0,0 +1,40 @@
1
+ import {
2
+ CreateClassificationEvaluatorArgs,
3
+ EvaluatorFn,
4
+ Template,
5
+ } from "../types";
6
+ import { createClassifierFn } from "./createClassifierFn";
7
+ import { LLMEvaluator } from "./LLMEvaluator";
8
+ import { getTemplateVariables } from "../template";
9
+
10
+ /**
11
+ * An LLM evaluator that performs evaluation via classification
12
+ */
13
+ export class ClassificationEvaluator<
14
+ RecordType extends Record<string, unknown>,
15
+ > extends LLMEvaluator<RecordType> {
16
+ readonly evaluatorFn: EvaluatorFn<RecordType>;
17
+ readonly promptTemplate: Template;
18
+ private _promptTemplateVariables: string[] | undefined;
19
+ constructor(args: CreateClassificationEvaluatorArgs) {
20
+ super(args);
21
+ this.promptTemplate = args.promptTemplate;
22
+ this.evaluatorFn = createClassifierFn<RecordType>(args);
23
+ }
24
+ evaluate = (example: RecordType) => {
25
+ return this.evaluatorFn(example);
26
+ };
27
+ /**
28
+ * List out the prompt template variables needed to perform evaluation
29
+ */
30
+ get promptTemplateVariables(): string[] {
31
+ // Lazily compute the template variables on first access and reuse the cached result
32
+ if (!Array.isArray(this._promptTemplateVariables)) {
33
+ this._promptTemplateVariables = getTemplateVariables({
34
+ template: this.promptTemplate,
35
+ });
36
+ }
37
+ // Give a copy of the variables
38
+ return [...this._promptTemplateVariables];
39
+ }
40
+ }
@@ -0,0 +1,22 @@
1
+ import {
2
+ EvaluationResult,
3
+ Evaluator,
4
+ OptimizationDirection,
5
+ CreateEvaluatorArgs,
6
+ } from "../types";
7
+
8
+ /**
9
+ * Base class for llm evaluation metrics / scores
10
+ */
11
+ export abstract class LLMEvaluator<RecordType extends Record<string, unknown>>
12
+ implements Evaluator<RecordType>
13
+ {
14
+ readonly name: string;
15
+ readonly source = "LLM" as const;
16
+ readonly optimizationDirection?: OptimizationDirection;
17
+ constructor({ name, optimizationDirection }: CreateEvaluatorArgs) {
18
+ this.name = name;
19
+ this.optimizationDirection = optimizationDirection;
20
+ }
21
+ abstract evaluate(_example: RecordType): Promise<EvaluationResult>;
22
+ }
@@ -1,13 +1,10 @@
1
- import { CreateClassificationEvaluatorArgs, Evaluator } from "../types/evals";
2
- import { createClassifierFn } from "./createClassifierFn";
1
+ import { CreateClassificationEvaluatorArgs } from "../types/evals";
2
+ import { ClassificationEvaluator } from "./ClassificationEvaluator";
3
3
 
4
4
  export function createClassificationEvaluator<
5
- ExampleType extends Record<string, unknown>,
6
- >(args: CreateClassificationEvaluatorArgs): Evaluator<ExampleType> {
7
- return {
8
- name: args.name,
9
- source: "LLM",
10
- optimizationDirection: args.optimizationDirection,
11
- evaluate: createClassifierFn(args),
12
- };
5
+ RecordType extends Record<string, unknown>,
6
+ >(
7
+ args: CreateClassificationEvaluatorArgs
8
+ ): ClassificationEvaluator<RecordType> {
9
+ return new ClassificationEvaluator<RecordType>(args);
13
10
  }
@@ -24,12 +24,12 @@ function choicesToLabels(
24
24
  /**
25
25
  * A function that serves as a factory that will output a classification evaluator function
26
26
  */
27
- export function createClassifierFn<ExampleType extends Record<string, unknown>>(
28
- args: CreateClassifierArgs
29
- ): EvaluatorFn<ExampleType> {
27
+ export function createClassifierFn<
28
+ RecordToEvaluate extends Record<string, unknown>,
29
+ >(args: CreateClassifierArgs): EvaluatorFn<RecordToEvaluate> {
30
30
  const { model, choices, promptTemplate, ...rest } = args;
31
31
 
32
- return async (args: ExampleType): Promise<EvaluationResult> => {
32
+ return async (args: RecordToEvaluate): Promise<EvaluationResult> => {
33
33
  const templateVariables = {
34
34
  ...args,
35
35
  };
@@ -17,12 +17,13 @@ export interface DocumentRelevancyEvaluatorArgs
17
17
  }
18
18
 
19
19
  /**
20
- * An example to be evaluated by the document relevancy evaluator.
20
+ * A record to be evaluated by the document relevancy evaluator.
21
21
  */
22
- export type DocumentRelevancyExample = {
22
+ export interface DocumentRelevancyEvaluationRecord {
23
23
  input: string;
24
24
  documentText: string;
25
- };
25
+ [key: string]: unknown;
26
+ }
26
27
 
27
28
  /**
28
29
  * Creates a document relevancy evaluator function.
@@ -50,9 +51,12 @@ export type DocumentRelevancyExample = {
50
51
  * console.log(result.label); // "relevant" or "unrelated"
51
52
  * ```
52
53
  */
53
- export function createDocumentRelevancyEvaluator(
54
- args: DocumentRelevancyEvaluatorArgs
55
- ): Evaluator<DocumentRelevancyExample> {
54
+ export function createDocumentRelevancyEvaluator<
55
+ RecordType extends Record<
56
+ string,
57
+ unknown
58
+ > = DocumentRelevancyEvaluationRecord,
59
+ >(args: DocumentRelevancyEvaluatorArgs): Evaluator<RecordType> {
56
60
  const {
57
61
  choices = DOCUMENT_RELEVANCY_CHOICES,
58
62
  promptTemplate = DOCUMENT_RELEVANCY_TEMPLATE,
@@ -61,7 +65,7 @@ export function createDocumentRelevancyEvaluator(
61
65
  ...rest
62
66
  } = args;
63
67
 
64
- return createClassificationEvaluator<DocumentRelevancyExample>({
68
+ return createClassificationEvaluator<RecordType>({
65
69
  ...args,
66
70
  promptTemplate,
67
71
  choices,
@@ -1,9 +1,10 @@
1
- import { CreateClassificationEvaluatorArgs, Evaluator } from "../types/evals";
1
+ import { CreateClassificationEvaluatorArgs } from "../types/evals";
2
2
  import {
3
3
  HALLUCINATION_TEMPLATE,
4
4
  HALLUCINATION_CHOICES,
5
5
  } from "../default_templates/HALLUCINATION_TEMPLATE";
6
6
  import { createClassificationEvaluator } from "./createClassificationEvaluator";
7
+ import { ClassificationEvaluator } from "./ClassificationEvaluator";
7
8
 
8
9
  export interface HallucinationEvaluatorArgs
9
10
  extends Omit<
@@ -17,9 +18,9 @@ export interface HallucinationEvaluatorArgs
17
18
  }
18
19
 
19
20
  /**
20
- * An example to be evaluated by the hallucination evaluator.
21
+ * A record to be evaluated by the hallucination evaluator.
21
22
  */
22
- export type HallucinationExample = {
23
+ export type HallucinationEvaluationRecord = {
23
24
  input: string;
24
25
  output: string;
25
26
  reference?: string;
@@ -31,9 +32,9 @@ export type HallucinationExample = {
31
32
  * @param args - The arguments for creating the hallucination evaluator.
32
33
  * @returns A function that evaluates whether an answer is factual or hallucinated based on a query and reference text.
33
34
  */
34
- export function createHallucinationEvaluator(
35
- args: HallucinationEvaluatorArgs
36
- ): Evaluator<HallucinationExample> {
35
+ export function createHallucinationEvaluator<
36
+ RecordType extends Record<string, unknown> = HallucinationEvaluationRecord,
37
+ >(args: HallucinationEvaluatorArgs): ClassificationEvaluator<RecordType> {
37
38
  const {
38
39
  choices = HALLUCINATION_CHOICES,
39
40
  promptTemplate = HALLUCINATION_TEMPLATE,
@@ -41,7 +42,7 @@ export function createHallucinationEvaluator(
41
42
  name = "hallucination",
42
43
  ...rest
43
44
  } = args;
44
- return createClassificationEvaluator<HallucinationExample>({
45
+ return createClassificationEvaluator<RecordType>({
45
46
  ...args,
46
47
  promptTemplate,
47
48
  choices,
@@ -4,20 +4,22 @@ import type { WithPrompt } from "../types/prompts";
4
4
  import { generateObject } from "ai";
5
5
  import { z } from "zod";
6
6
  import { tracer } from "../telemetry";
7
- export interface ClassifyArgs extends WithLLM, WithPrompt, WithTelemetry {
8
- /**
9
- * The labels to classify the example into. E.x. ["correct", "incorrect"]
10
- */
11
- labels: [string, ...string[]];
12
- /**
13
- * The name of the schema for generating the label and explanation.
14
- */
15
- schemaName?: string;
16
- /**
17
- * The description of the schema for generating the label and explanation.
18
- */
19
- schemaDescription?: string;
20
- }
7
+ export type ClassifyArgs = WithLLM &
8
+ WithTelemetry &
9
+ WithPrompt & {
10
+ /**
11
+ * The labels to classify the example into. E.g. ["correct", "incorrect"]
12
+ */
13
+ labels: [string, ...string[]];
14
+ /**
15
+ * The name of the schema for generating the label and explanation.
16
+ */
17
+ schemaName?: string;
18
+ /**
19
+ * The description of the schema for generating the label and explanation.
20
+ */
21
+ schemaDescription?: string;
22
+ };
21
23
  /**
22
24
  * A function that leverages an llm to perform a classification
23
25
  */
@@ -0,0 +1,22 @@
1
+ import { Template } from "../types/templating";
2
+ import Mustache from "mustache";
3
+
4
+ type GetTemplateVariableArgs = {
5
+ template: Template;
6
+ };
7
+ /**
8
+ * Parse out the template variables of a prompt
9
+ * @param {GetTemplateVariableArgs} args
10
+ * @returns {string[]} a list of prompt template variables
11
+ */
12
+ export function getTemplateVariables(args: GetTemplateVariableArgs): string[] {
13
+ const { template } = args;
14
+ const templateSpans = Mustache.parse(template);
15
+ return templateSpans.reduce((acc, templateSpan) => {
16
+ const [spanType, value] = templateSpan;
17
+ if (spanType === "name" && typeof value === "string") {
18
+ acc = [...acc, value];
19
+ }
20
+ return acc;
21
+ }, [] as string[]);
22
+ }
@@ -1 +1,2 @@
1
1
  export * from "./applyTemplate";
2
+ export * from "./getTemplateVariables";
@@ -15,6 +15,7 @@ export interface WithLLM {
15
15
  model: LanguageModel;
16
16
  }
17
17
 
18
+ // eslint-disable-next-line @typescript-eslint/no-empty-object-type
18
19
  export interface LLMEvaluationArgs extends WithLLM {}
19
20
 
20
21
  /**
@@ -78,8 +79,7 @@ export interface CreateClassifierArgs extends WithTelemetry {
78
79
  promptTemplate: string;
79
80
  }
80
81
 
81
- export interface CreateClassificationEvaluatorArgs
82
- extends CreateClassifierArgs {
82
+ export interface CreateEvaluatorArgs {
83
83
  /**
84
84
  * The name of the metric that the evaluator produces
85
85
  * E.g. "correctness"
@@ -92,6 +92,10 @@ export interface CreateClassificationEvaluatorArgs
92
92
  optimizationDirection?: OptimizationDirection;
93
93
  }
94
94
 
95
+ export interface CreateClassificationEvaluatorArgs
96
+ extends CreateClassifierArgs,
97
+ CreateEvaluatorArgs {}
98
+
95
99
  export type EvaluatorFn<ExampleType extends Record<string, unknown>> = (
96
100
  args: ExampleType
97
101
  ) => Promise<EvaluationResult>;
@@ -99,19 +103,18 @@ export type EvaluatorFn<ExampleType extends Record<string, unknown>> = (
99
103
  /**
100
104
  * The source of the evaluation
101
105
  */
102
- type EvaluationSource = "LLM" | "CODE";
106
+ export type EvaluationSource = "LLM" | "CODE";
103
107
 
104
108
  /**
105
109
  * The direction to optimize the numeric evaluation score
106
110
  * E.g. "MAXIMIZE" means that the higher the score, the better the evaluation
107
111
  */
108
- type OptimizationDirection = "MAXIMIZE" | "MINIMIZE";
112
+ export type OptimizationDirection = "MAXIMIZE" | "MINIMIZE";
109
113
 
110
114
  /**
111
- * The Base Evaluator interface
112
- * This is the interface that all evaluators must implement
115
+ * The description of an evaluator
113
116
  */
114
- export interface Evaluator<ExampleType extends Record<string, unknown>> {
117
+ interface EvaluatorDescription {
115
118
  /**
116
119
  * The name of the evaluator / the metric that it measures
117
120
  */
@@ -125,6 +128,14 @@ export interface Evaluator<ExampleType extends Record<string, unknown>> {
125
128
  * E.x. "MAXIMIZE" means that the higher the score, the better the evaluation
126
129
  */
127
130
  optimizationDirection?: OptimizationDirection;
131
+ }
132
+
133
+ /**
134
+ * The Base Evaluator interface
135
+ * This is the interface that all evaluators must implement
136
+ */
137
+ export interface Evaluator<ExampleType extends Record<string, unknown>>
138
+ extends EvaluatorDescription {
128
139
  /**
129
140
  * The function that evaluates the example
130
141
  */
package/src/types/otel.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  import { Tracer } from "@opentelemetry/api";
2
2
 
3
- export interface WithTelemetry {
3
+ export type WithTelemetry = {
4
4
  telemetry?: {
5
5
  /**
6
6
  * Whether OpenTelemetry is enabled on the call.
@@ -14,4 +14,4 @@ export interface WithTelemetry {
14
14
  */
15
15
  tracer?: Tracer;
16
16
  };
17
- }
17
+ };
@@ -1,22 +1,7 @@
1
- import { ModelMessage } from "ai";
2
-
1
+ import type { Prompt } from "ai";
3
2
  /**
4
3
  * Prompt part of the AI function options for model generation.
5
4
  * It contains a system message, a simple text prompt, or a list of model messages.
6
5
  * Uses ModelMessage format compatible with AI SDK v5 generateObject function.
7
6
  */
8
- export interface WithPrompt {
9
- /**
10
- * System message to include in the prompt. Can be used with `prompt` or `messages`.
11
- */
12
- system?: string;
13
- /**
14
- * A simple text prompt. You can either use `prompt` or `messages` but not both.
15
- */
16
- prompt?: string;
17
- /**
18
- * A list of model messages. You can either use `prompt` or `messages` but not both.
19
- * Uses ModelMessage format for compatibility with AI SDK v5.
20
- */
21
- messages?: Array<ModelMessage>;
22
- }
7
+ export type WithPrompt = Prompt;