@arizeai/phoenix-evals 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ ---
2
+ title: "Classification"
3
+ description: "Classification helpers in @arizeai/phoenix-evals"
4
+ ---
5
+
6
+ Use the classification helpers when you want an LLM to choose from a fixed set of labels and return a structured explanation.
7
+
8
+ ## Create A Classifier Function
9
+
10
+ ```ts
11
+ import { openai } from "@ai-sdk/openai";
12
+ import { createClassifierFn } from "@arizeai/phoenix-evals";
13
+
14
+ const classify = createClassifierFn({
15
+ model: openai("gpt-4o-mini"),
16
+ choices: { relevant: 1, irrelevant: 0 },
17
+ promptTemplate:
18
+ "Question: {{input}}\nContext: {{context}}\nAnswer: {{output}}\nLabel as relevant or irrelevant.",
19
+ });
20
+
21
+ const result = await classify({
22
+ input: "What is Phoenix?",
23
+ context: "Phoenix is an AI observability platform.",
24
+ output: "Phoenix helps teams inspect traces and experiments.",
25
+ });
26
+ ```
27
+
28
+ ## Lower-Level API
29
+
30
+ Use `generateClassification` directly when you already have a rendered prompt and only need structured label generation.
31
+
32
+ <section className="hidden" data-agent-context="source-map" aria-label="Source map">
33
+ <h2>Source Map</h2>
34
+ <ul>
35
+ <li><code>src/llm/createClassifierFn.ts</code></li>
36
+ <li><code>src/llm/createClassificationEvaluator.ts</code></li>
37
+ <li><code>src/llm/generateClassification.ts</code></li>
38
+ <li><code>src/types/evals.ts</code></li>
39
+ </ul>
40
+ </section>
@@ -0,0 +1,86 @@
1
+ ---
2
+ title: "Create Evaluator"
3
+ description: "Build custom evaluators with @arizeai/phoenix-evals"
4
+ ---
5
+
6
+ Use `createEvaluator` when your evaluation logic is plain TypeScript and you want a reusable evaluator object with consistent metadata and telemetry.
7
+
8
+ <section className="hidden" data-agent-context="relevant-source-files" aria-label="Relevant source files">
9
+ <h2>Relevant Source Files</h2>
10
+ <ul>
11
+ <li><code>src/helpers/createEvaluator.ts</code></li>
12
+ </ul>
13
+ </section>
14
+
15
+ ## Example
16
+
17
+ ```ts
18
+ import { createEvaluator } from "@arizeai/phoenix-evals";
19
+
20
+ const exactMatch = createEvaluator(
21
+ ({ output, expected }) => ({
22
+ score: output === expected ? 1 : 0,
23
+ label: output === expected ? "match" : "mismatch",
24
+ }),
25
+ {
26
+ name: "exact-match",
27
+ kind: "CODE",
28
+ }
29
+ );
30
+
31
+ const result = await exactMatch.evaluate({
32
+ output: "Paris",
33
+ expected: "Paris",
34
+ });
35
+ ```
36
+
37
+ ## What You Get
38
+
39
+ - an evaluator name
40
+ - evaluator kind such as `CODE` or `LLM`
41
+ - optimization direction metadata
42
+ - optional OpenTelemetry spans around execution
43
+
44
+ ## When To Use Code Evaluators
45
+
46
+ Code evaluators are the right fit when the scoring logic should stay deterministic, cheap, and fully under your control:
47
+
48
+ - regex or exact-match checks
49
+ - JSON structure validation
50
+ - latency and cost thresholds
51
+ - post-processing checks on existing model output
52
+
53
+ ```ts
54
+ import { createEvaluator } from "@arizeai/phoenix-evals";
55
+
56
+ const lengthCheck = createEvaluator(
57
+ ({ output }) => {
58
+ const score = typeof output === "string" && output.length < 280 ? 1 : 0;
59
+ return {
60
+ score,
61
+ label: score ? "fits-limit" : "too-long",
62
+ };
63
+ },
64
+ {
65
+ name: "response-length",
66
+ kind: "CODE",
67
+ }
68
+ );
69
+ ```
70
+
71
+ ## Related Helpers
72
+
73
+ - `asEvaluatorFn`
74
+ - `toEvaluationResult`
75
+
76
+ <section className="hidden" data-agent-context="source-map" aria-label="Source map">
77
+ <h2>Source Map</h2>
78
+ <ul>
79
+ <li><code>src/helpers/createEvaluator.ts</code></li>
80
+ <li><code>src/helpers/asEvaluatorFn.ts</code></li>
81
+ <li><code>src/helpers/toEvaluationResult.ts</code></li>
82
+ <li><code>src/core/FunctionEvaluator.ts</code></li>
83
+ <li><code>src/core/EvaluatorBase.ts</code></li>
84
+ <li><code>src/types/evals.ts</code></li>
85
+ </ul>
86
+ </section>
@@ -0,0 +1,66 @@
1
+ ---
2
+ title: "LLM Evaluators"
3
+ description: "Use LLM-backed evaluators in @arizeai/phoenix-evals"
4
+ ---
5
+
6
+ The `llm` entrypoint provides reusable evaluator factories that call an AI SDK model and return structured evaluation results.
7
+
8
+ <section className="hidden" data-agent-context="relevant-source-files" aria-label="Relevant source files">
9
+ <h2>Relevant Source Files</h2>
10
+ <ul>
11
+ <li><code>src/llm/index.ts</code></li>
12
+ </ul>
13
+ </section>
14
+
15
+ ## Example
16
+
17
+ ```ts
18
+ import { openai } from "@ai-sdk/openai";
19
+ import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals";
20
+
21
+ const faithfulness = createFaithfulnessEvaluator({
22
+ model: openai("gpt-4o-mini"),
23
+ });
24
+
25
+ const result = await faithfulness.evaluate({
26
+ input: "What is the capital of France?",
27
+ context: "France is a country in Europe. Paris is its capital city.",
28
+ output: "The capital of France is Paris.",
29
+ });
30
+ ```
31
+
32
+ ## Built-In Evaluator Factories
33
+
34
+ - `createConcisenessEvaluator`
35
+ - `createCorrectnessEvaluator`
36
+ - `createDocumentRelevanceEvaluator`
37
+ - `createFaithfulnessEvaluator`
38
+ - `createRefusalEvaluator`
39
+ - `createClassificationEvaluator`
40
+ - `createToolSelectionEvaluator`
41
+ - `createToolInvocationEvaluator`
42
+ - `createToolResponseHandlingEvaluator`
43
+
44
+ ```ts
45
+ import { openai } from "@ai-sdk/openai";
46
+ import {
47
+ createCorrectnessEvaluator,
48
+ createRefusalEvaluator,
49
+ } from "@arizeai/phoenix-evals";
50
+
51
+ const model = openai("gpt-4o-mini");
52
+
53
+ const correctness = createCorrectnessEvaluator({ model });
54
+ const refusal = createRefusalEvaluator({ model });
55
+ ```
56
+
57
+ <section className="hidden" data-agent-context="source-map" aria-label="Source map">
58
+ <h2>Source Map</h2>
59
+ <ul>
60
+ <li><code>src/llm/createClassificationEvaluator.ts</code></li>
61
+ <li><code>src/llm/ClassificationEvaluator.ts</code></li>
62
+ <li><code>src/llm/LLMEvaluator.ts</code></li>
63
+ <li><code>src/llm/createFaithfulnessEvaluator.ts</code></li>
64
+ <li><code>src/types/evals.ts</code></li>
65
+ </ul>
66
+ </section>
@@ -0,0 +1,90 @@
1
+ ---
2
+ title: "Overview"
3
+ description: "Bundled docs for @arizeai/phoenix-evals"
4
+ ---
5
+
6
+ `@arizeai/phoenix-evals` provides evaluator building blocks for TypeScript workflows. It includes LLM-based evaluators, code-based evaluators, prompt templating helpers, and compatibility points for Phoenix experiments.
7
+
8
+ ## Install
9
+
10
+ `@arizeai/phoenix-evals` depends on model adapters from the AI SDK ecosystem. Install the package plus at least one provider adapter for the models you plan to use.
11
+
12
+ ```bash
13
+ npm install @arizeai/phoenix-evals
14
+ ```
15
+
16
+ ### Common Setups
17
+
18
+ ```bash
19
+ npm install @arizeai/phoenix-evals @ai-sdk/openai
20
+ ```
21
+
22
+ ```bash
23
+ npm install @arizeai/phoenix-evals @ai-sdk/google
24
+ ```
25
+
26
+ You can also pair it with Phoenix experiments:
27
+
28
+ ```bash
29
+ npm install @arizeai/phoenix-evals @arizeai/phoenix-client @ai-sdk/openai
30
+ ```
31
+
32
+ ### Runtime Expectations
33
+
34
+ - Node.js 18+
35
+ - an AI SDK provider package such as `@ai-sdk/openai`
36
+ - credentials required by your chosen provider
37
+
38
+ ## Minimal Example
39
+
40
+ ```ts
41
+ import { openai } from "@ai-sdk/openai";
42
+ import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals";
43
+
44
+ const faithfulness = createFaithfulnessEvaluator({
45
+ model: openai("gpt-4o-mini"),
46
+ });
47
+
48
+ const result = await faithfulness.evaluate({
49
+ input: "What is Phoenix?",
50
+ context: "Phoenix is an open-source AI observability platform from Arize.",
51
+ output: "Phoenix is an open-source AI observability platform from Arize.",
52
+ });
53
+ ```
54
+
55
+ ## Docs And Source In `node_modules`
56
+
57
+ After installation, a coding agent can inspect the installed package directly:
58
+
59
+ ```text
60
+ node_modules/@arizeai/phoenix-evals/docs/
61
+ node_modules/@arizeai/phoenix-evals/src/
62
+ ```
63
+
64
+ The bundled docs cover evaluator creation, LLM evaluators, templates, classification, and Phoenix integration.
65
+
66
+ ## Where To Start
67
+
68
+ - [Create evaluator](./create-evaluator) for custom and code-based evaluator flows
69
+ - [LLM evaluators](./llm-evaluators) and [Classification](./classification) for model-backed evaluation
70
+ - [Templates](./templates) and [Phoenix integration](./phoenix-integration) for prompt helpers and experiment wiring
71
+
72
+ ## Source Layout
73
+
74
+ - `src/index.ts` re-exports the package surface you usually import from `@arizeai/phoenix-evals`
75
+ - `src/llm/` contains classification helpers and built-in LLM evaluator factories
76
+ - `src/helpers/` contains `createEvaluator` and evaluation-result helpers
77
+ - `src/template/` contains `formatTemplate` and `getTemplateVariables`
78
+ - `src/types/` contains shared evaluator and prompt types
79
+
80
+ <section className="hidden" data-agent-context="source-map" aria-label="Source map">
81
+ <h2>Source Map</h2>
82
+ <ul>
83
+ <li><code>src/index.ts</code></li>
84
+ <li><code>src/llm/</code></li>
85
+ <li><code>src/helpers/</code></li>
86
+ <li><code>src/template/</code></li>
87
+ <li><code>src/core/</code></li>
88
+ <li><code>src/types/</code></li>
89
+ </ul>
90
+ </section>
@@ -0,0 +1,80 @@
1
+ ---
2
+ title: "Phoenix Integration"
3
+ description: "Use @arizeai/phoenix-evals with Phoenix experiments"
4
+ ---
5
+
6
+ `@arizeai/phoenix-evals` pairs with `@arizeai/phoenix-client` when you want to run evaluator-backed experiments and store both task traces and evaluation results in Phoenix.
7
+
8
+ <section className="hidden" data-agent-context="relevant-source-files" aria-label="Relevant source files">
9
+ <h2>Relevant Source Files</h2>
10
+ <ul>
11
+ <li><code>src/index.ts</code> for the root evaluator exports</li>
12
+ <li><code>companion package: @arizeai/phoenix-client/datasets</code></li>
13
+ <li><code>companion package: @arizeai/phoenix-client/experiments</code></li>
14
+ </ul>
15
+ </section>
16
+
17
+ ## Example
18
+
19
+ ```ts
20
+ import { openai } from "@ai-sdk/openai";
21
+ import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals";
22
+ import { createOrGetDataset } from "@arizeai/phoenix-client/datasets";
23
+ import {
24
+ asExperimentEvaluator,
25
+ runExperiment,
26
+ } from "@arizeai/phoenix-client/experiments";
27
+
28
+ await createOrGetDataset({
29
+ name: "support-eval",
30
+ examples: [
31
+ {
32
+ input: {
33
+ question: "Is Phoenix open source?",
34
+ context: "Phoenix is open source.",
35
+ },
36
+ output: {
37
+ answer: "Phoenix is open source.",
38
+ },
39
+ },
40
+ ],
41
+ });
42
+
43
+ const faithfulness = createFaithfulnessEvaluator({
44
+ model: openai("gpt-4o-mini"),
45
+ });
46
+
47
+ await runExperiment({
48
+ dataset: { datasetName: "support-eval" },
49
+ task: async ({ question, context }) =>
50
+ `${question} Answer using only this context: ${context}`,
51
+ evaluators: [
52
+ asExperimentEvaluator({
53
+ name: "faithfulness",
54
+ kind: "LLM",
55
+ evaluate: async ({ input, output }) =>
56
+ faithfulness.evaluate({
57
+ input: String(input.question ?? ""),
58
+ context: String(input.context ?? ""),
59
+ output: String(output ?? ""),
60
+ }),
61
+ }),
62
+ ],
63
+ });
64
+ ```
65
+
66
+ ## What Each Package Does
67
+
68
+ - `@arizeai/phoenix-evals` builds evaluator logic
69
+ - `@arizeai/phoenix-client` handles experiment execution and persistence
70
+ - combined usage produces evaluator traces and experiment results in Phoenix
71
+
72
+ <section className="hidden" data-agent-context="source-map" aria-label="Source map">
73
+ <h2>Source Map</h2>
74
+ <ul>
75
+ <li><code>src/index.ts</code></li>
76
+ <li><code>src/llm/</code></li>
77
+ <li><code>companion package: @arizeai/phoenix-client/datasets</code></li>
78
+ <li><code>companion package: @arizeai/phoenix-client/experiments</code></li>
79
+ </ul>
80
+ </section>
@@ -0,0 +1,52 @@
1
+ ---
2
+ title: "Templates"
3
+ description: "Template helpers in @arizeai/phoenix-evals"
4
+ ---
5
+
6
+ The template helpers make it easier to manage Mustache-style prompt templates separately from evaluator execution.
7
+
8
+ <section className="hidden" data-agent-context="relevant-source-files" aria-label="Relevant source files">
9
+ <h2>Relevant Source Files</h2>
10
+ <ul>
11
+ <li><code>src/template/applyTemplate.ts</code></li>
12
+ <li><code>src/template/getTemplateVariables.ts</code></li>
13
+ </ul>
14
+ </section>
15
+
16
+ ## Render A Template
17
+
18
+ ```ts
19
+ import { formatTemplate } from "@arizeai/phoenix-evals";
20
+
21
+ const prompt = formatTemplate({
22
+ template: [
23
+ {
24
+ role: "user",
25
+ content: "Rate the answer to {{question}}",
26
+ },
27
+ ],
28
+ variables: {
29
+ question: "What is retrieval-augmented generation?",
30
+ },
31
+ });
32
+ ```
33
+
34
+ ## Discover Variables
35
+
36
+ ```ts
37
+ import { getTemplateVariables } from "@arizeai/phoenix-evals";
38
+
39
+ const variables = getTemplateVariables({
40
+ template: "Answer {{question}} using {{context}}",
41
+ });
42
+ ```
43
+
44
+ <section className="hidden" data-agent-context="source-map" aria-label="Source map">
45
+ <h2>Source Map</h2>
46
+ <ul>
47
+ <li><code>src/template/applyTemplate.ts</code></li>
48
+ <li><code>src/template/getTemplateVariables.ts</code></li>
49
+ <li><code>src/template/createTemplateVariablesProxy.ts</code></li>
50
+ <li><code>src/types/templating.ts</code></li>
51
+ </ul>
52
+ </section>
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arizeai/phoenix-evals",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "A library for running evaluations for AI use cases",
5
5
  "keywords": [
6
6
  "arize",
@@ -19,8 +19,12 @@
19
19
  "type": "git",
20
20
  "url": "git+https://github.com/Arize-ai/phoenix.git"
21
21
  },
22
+ "directories": {
23
+ "doc": "./docs"
24
+ },
22
25
  "files": [
23
26
  "dist",
27
+ "docs",
24
28
  "src",
25
29
  "package.json"
26
30
  ],
@@ -72,7 +76,7 @@
72
76
  "nock": "^14.0.5",
73
77
  "tsx": "^4.19.3",
74
78
  "typedoc": "^0.28.17",
75
- "vitest": "^4.0.10"
79
+ "vitest": "^4.1.0"
76
80
  },
77
81
  "engines": {
78
82
  "node": ">=18"
@@ -2,6 +2,7 @@ import { withSpan } from "@arizeai/openinference-core";
2
2
 
3
3
  import type { EvaluatorBase } from "../core/EvaluatorBase";
4
4
  import { FunctionEvaluator } from "../core/FunctionEvaluator";
5
+ import { tracer as defaultTracer } from "../telemetry";
5
6
  import type {
6
7
  EvaluationKind,
7
8
  OptimizationDirection,
@@ -168,7 +169,7 @@ export function createEvaluator<
168
169
  // Add OpenTelemetry span wrapping if telemetry is enabled
169
170
  if (telemetry && telemetry.isEnabled) {
170
171
  evaluateFn = withSpan(evaluateFn, {
171
- tracer: telemetry.tracer,
172
+ tracer: telemetry.tracer ?? defaultTracer,
172
173
  name: evaluatorName,
173
174
  kind: "EVALUATOR",
174
175
  });
@@ -1,3 +1,24 @@
1
- import { trace } from "@opentelemetry/api";
1
+ import { trace, type Tracer } from "@opentelemetry/api";
2
2
 
3
- export const tracer = trace.getTracer("phoenix-evals");
3
+ const DEFAULT_TRACER_NAME = "phoenix-evals";
4
+
5
+ /**
6
+ * Returns a lazy tracer that resolves from `trace.getTracer()` on every call,
7
+ * so evaluator spans follow whichever provider is currently mounted as global.
8
+ *
9
+ * Cast to `Tracer` is necessary because `startActiveSpan` has multiple
10
+ * overload signatures that cannot be satisfied by a single implementation.
11
+ */
12
+ export function getTracer(name: string = DEFAULT_TRACER_NAME): Tracer {
13
+ return {
14
+ startSpan(spanName, options, context) {
15
+ return trace.getTracer(name).startSpan(spanName, options, context);
16
+ },
17
+ startActiveSpan(...args: unknown[]) {
18
+ const tracer = trace.getTracer(name);
19
+ return Reflect.apply(tracer.startActiveSpan, tracer, args);
20
+ },
21
+ } as Tracer;
22
+ }
23
+
24
+ export const tracer = getTracer();