@arizeai/phoenix-evals 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -14
- package/dist/esm/helpers/createEvaluator.d.ts.map +1 -1
- package/dist/esm/helpers/createEvaluator.js +2 -1
- package/dist/esm/helpers/createEvaluator.js.map +1 -1
- package/dist/esm/llm/ClassificationEvaluator.d.ts +1 -1
- package/dist/esm/llm/ClassificationEvaluator.d.ts.map +1 -1
- package/dist/esm/telemetry/index.d.ts +10 -1
- package/dist/esm/telemetry/index.d.ts.map +1 -1
- package/dist/esm/telemetry/index.js +20 -1
- package/dist/esm/telemetry/index.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/src/helpers/createEvaluator.d.ts.map +1 -1
- package/dist/src/helpers/createEvaluator.js +3 -1
- package/dist/src/helpers/createEvaluator.js.map +1 -1
- package/dist/src/telemetry/index.d.ts +10 -1
- package/dist/src/telemetry/index.d.ts.map +1 -1
- package/dist/src/telemetry/index.js +21 -1
- package/dist/src/telemetry/index.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/docs/classification.mdx +40 -0
- package/docs/create-evaluator.mdx +86 -0
- package/docs/llm-evaluators.mdx +66 -0
- package/docs/overview.mdx +90 -0
- package/docs/phoenix-integration.mdx +80 -0
- package/docs/templates.mdx +52 -0
- package/package.json +6 -2
- package/src/helpers/createEvaluator.ts +2 -1
- package/src/telemetry/index.ts +23 -2
package/README.md
CHANGED
|
@@ -91,25 +91,58 @@ See the complete example in [`examples/classifier_example.ts`](examples/classifi
|
|
|
91
91
|
|
|
92
92
|
The library includes several pre-built evaluators for common evaluation tasks. These evaluators come with optimized prompts and can be used directly with any AI SDK model.
|
|
93
93
|
|
|
94
|
+
All pre-built evaluators are available from the `@arizeai/phoenix-evals/llm` module:
|
|
95
|
+
|
|
96
|
+
| Evaluator | Function | Description |
|
|
97
|
+
| ---------------------- | ------------------------------------- | --------------------------------------------------------------------------------- |
|
|
98
|
+
| Faithfulness | `createFaithfulnessEvaluator` | Detects hallucinations — checks if the output is grounded in the provided context |
|
|
99
|
+
| Conciseness | `createConcisenessEvaluator` | Evaluates whether the response is appropriately concise |
|
|
100
|
+
| Correctness | `createCorrectnessEvaluator` | Checks if the output is factually correct given the input |
|
|
101
|
+
| Document Relevance | `createDocumentRelevanceEvaluator` | Measures how relevant a retrieved document is to the query |
|
|
102
|
+
| Refusal | `createRefusalEvaluator` | Detects whether the model refused to answer |
|
|
103
|
+
| Tool Invocation | `createToolInvocationEvaluator` | Evaluates whether the correct tool was invoked with the right arguments |
|
|
104
|
+
| Tool Selection | `createToolSelectionEvaluator` | Checks whether the right tool was selected for the task |
|
|
105
|
+
| Tool Response Handling | `createToolResponseHandlingEvaluator` | Evaluates how well the model uses a tool's response |
|
|
106
|
+
|
|
94
107
|
```typescript
|
|
95
|
-
import {
|
|
108
|
+
import {
|
|
109
|
+
createFaithfulnessEvaluator,
|
|
110
|
+
createConcisenessEvaluator,
|
|
111
|
+
createCorrectnessEvaluator,
|
|
112
|
+
createDocumentRelevanceEvaluator,
|
|
113
|
+
createRefusalEvaluator,
|
|
114
|
+
} from "@arizeai/phoenix-evals/llm";
|
|
96
115
|
import { openai } from "@ai-sdk/openai";
|
|
97
|
-
const model = openai("gpt-4o-mini");
|
|
98
116
|
|
|
99
|
-
|
|
100
|
-
const faithfulnessEvaluator = createFaithfulnessEvaluator({
|
|
101
|
-
model,
|
|
102
|
-
});
|
|
117
|
+
const model = openai("gpt-4o-mini");
|
|
103
118
|
|
|
104
|
-
//
|
|
105
|
-
const
|
|
119
|
+
// Faithfulness: checks if the output is grounded in the context
|
|
120
|
+
const faithfulnessEvaluator = createFaithfulnessEvaluator({ model });
|
|
121
|
+
const faithfulnessResult = await faithfulnessEvaluator.evaluate({
|
|
106
122
|
input: "What is the capital of France?",
|
|
107
123
|
context: "France is a country in Europe. Paris is its capital city.",
|
|
108
124
|
output: "The capital of France is London.",
|
|
109
125
|
});
|
|
110
|
-
|
|
111
|
-
console.log(result);
|
|
126
|
+
console.log(faithfulnessResult);
|
|
112
127
|
// Output: { label: "unfaithful", score: 0, explanation: "..." }
|
|
128
|
+
|
|
129
|
+
// Correctness: checks if the output is factually correct
|
|
130
|
+
const correctnessEvaluator = createCorrectnessEvaluator({ model });
|
|
131
|
+
const correctnessResult = await correctnessEvaluator.evaluate({
|
|
132
|
+
input: "What is the capital of France?",
|
|
133
|
+
output: "Paris is the capital of France.",
|
|
134
|
+
});
|
|
135
|
+
console.log(correctnessResult);
|
|
136
|
+
// Output: { label: "correct", score: 1, explanation: "..." }
|
|
137
|
+
|
|
138
|
+
// Document Relevance: checks if a retrieved document is relevant to the query
|
|
139
|
+
const relevanceEvaluator = createDocumentRelevanceEvaluator({ model });
|
|
140
|
+
const relevanceResult = await relevanceEvaluator.evaluate({
|
|
141
|
+
input: "What is the capital of France?",
|
|
142
|
+
documentText: "Paris is the capital of France and a major European city.",
|
|
143
|
+
});
|
|
144
|
+
console.log(relevanceResult);
|
|
145
|
+
// Output: { label: "relevant", score: 1, explanation: "..." }
|
|
113
146
|
```
|
|
114
147
|
|
|
115
148
|
### Data Mapping
|
|
@@ -117,10 +150,8 @@ console.log(result);
|
|
|
117
150
|
When your data structure doesn't match what an evaluator expects, use `bindEvaluator` to map your fields to the evaluator's expected input format:
|
|
118
151
|
|
|
119
152
|
```typescript
|
|
120
|
-
import {
|
|
121
|
-
|
|
122
|
-
createFaithfulnessEvaluator,
|
|
123
|
-
} from "@arizeai/phoenix-evals";
|
|
153
|
+
import { bindEvaluator } from "@arizeai/phoenix-evals";
|
|
154
|
+
import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals/llm";
|
|
124
155
|
import { openai } from "@ai-sdk/openai";
|
|
125
156
|
|
|
126
157
|
const model = openai("gpt-4o-mini");
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"createEvaluator.d.ts","sourceRoot":"","sources":["../../../src/helpers/createEvaluator.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"createEvaluator.d.ts","sourceRoot":"","sources":["../../../src/helpers/createEvaluator.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAG3D,OAAO,KAAK,EACV,cAAc,EACd,qBAAqB,EACrB,eAAe,EAChB,MAAM,UAAU,CAAC;AAIlB,KAAK,KAAK,GAAG,CAAC,GAAG,IAAI,EAAE,GAAG,EAAE,KAAK,GAAG,CAAC;AAMrC;;;;GAIG;AACH,MAAM,MAAM,sBAAsB,GAAG;IACnC;;;;;;;;;;OAUG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;;;;;;;;;;OAYG;IACH,IAAI,CAAC,EAAE,cAAc,CAAC;IACtB;;;;;;;;;;;;;;OAcG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IAC9C;;;;;;;;;;;;;;;OAeG;IACH,SAAS,CAAC,EAAE,eAAe,CAAC;CAC7B,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoEG;AACH,wBAAgB,eAAe,CAC7B,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EACpE,EAAE,SAAS,KAAK,GAAG,KAAK,EACxB,EAAE,EAAE,EAAE,EAAE,OAAO,CAAC,EAAE,sBAAsB,GAAG,aAAa,CAAC,UAAU,CAAC,CAyBrE"}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { withSpan } from "@arizeai/openinference-core";
|
|
2
2
|
import { FunctionEvaluator } from "../core/FunctionEvaluator.js";
|
|
3
|
+
import { tracer as defaultTracer } from "../telemetry/index.js";
|
|
3
4
|
import { asEvaluatorFn } from "./asEvaluatorFn.js";
|
|
4
5
|
function generateUniqueName() {
|
|
5
6
|
return `evaluator-${Math.random().toString(36).substring(2, 15)}`;
|
|
@@ -80,7 +81,7 @@ export function createEvaluator(fn, options) {
|
|
|
80
81
|
// Add OpenTelemetry span wrapping if telemetry is enabled
|
|
81
82
|
if (telemetry && telemetry.isEnabled) {
|
|
82
83
|
evaluateFn = withSpan(evaluateFn, {
|
|
83
|
-
tracer: telemetry.tracer,
|
|
84
|
+
tracer: telemetry.tracer ?? defaultTracer,
|
|
84
85
|
name: evaluatorName,
|
|
85
86
|
kind: "EVALUATOR",
|
|
86
87
|
});
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"createEvaluator.js","sourceRoot":"","sources":["../../../src/helpers/createEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,6BAA6B,CAAC;AAGvD,OAAO,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;
|
|
1
|
+
{"version":3,"file":"createEvaluator.js","sourceRoot":"","sources":["../../../src/helpers/createEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,6BAA6B,CAAC;AAGvD,OAAO,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAC9D,OAAO,EAAE,MAAM,IAAI,aAAa,EAAE,MAAM,cAAc,CAAC;AAMvD,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAKhD,SAAS,kBAAkB;IACzB,OAAO,aAAa,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;AACpE,CAAC;AAqED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoEG;AACH,MAAM,UAAU,eAAe,CAG7B,EAAM,EAAE,OAAgC;IACxC,MAAM,EACJ,IAAI,EACJ,IAAI,EACJ,qBAAqB,EACrB,SAAS,GAAG,EAAE,SAAS,EAAE,IAAI,EAAE,GAChC,GAAG,OAAO,IAAI,EAAE,CAAC;IAClB,MAAM,aAAa,GAAG,IAAI,IAAI,EAAE,CAAC,IAAI,IAAI,kBAAkB,EAAE,CAAC;IAC9D,IAAI,UAAU,GAAG,aAAa,CAAa,EAAE,CAAC,CAAC;IAE/C,0DAA0D;IAC1D,IAAI,SAAS,IAAI,SAAS,CAAC,SAAS,EAAE,CAAC;QACrC,UAAU,GAAG,QAAQ,CAAC,UAAU,EAAE;YAChC,MAAM,EAAE,SAAS,CAAC,MAAM,IAAI,aAAa;YACzC,IAAI,EAAE,aAAa;YACnB,IAAI,EAAE,WAAW;SAClB,CAAC,CAAC;IACL,CAAC;IACD,OAAO,IAAI,iBAAiB,CAAa;QACvC,UAAU;QACV,IAAI,EAAE,aAAa;QACnB,IAAI,EAAE,IAAI,IAAI,MAAM;QACpB,qBAAqB,EAAE,qBAAqB,IAAI,UAAU;QAC1D,SAAS;KACV,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -21,7 +21,7 @@ export declare class ClassificationEvaluator<RecordType extends Record<string, u
|
|
|
21
21
|
*/
|
|
22
22
|
readonly choices: ClassificationChoicesMap;
|
|
23
23
|
constructor(args: CreateClassificationEvaluatorArgs<RecordType>);
|
|
24
|
-
evaluate: (example: RecordType) => Promise<import("../
|
|
24
|
+
evaluate: (example: RecordType) => Promise<import("../index.js").EvaluationResult>;
|
|
25
25
|
/**
|
|
26
26
|
* List out the prompt template variables needed to perform evaluation
|
|
27
27
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ClassificationEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/ClassificationEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAGxC,OAAO,KAAK,EACV,wBAAwB,EACxB,iCAAiC,EACjC,WAAW,EACX,cAAc,EACd,kBAAkB,EACnB,MAAM,UAAU,CAAC;AAClB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAGnD,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAE9C;;GAEG;AACH,qBAAa,uBAAuB,CAAC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAC7E,SAAQ,YAAY,CAAC,UAAU,CAC/B,YAAW,kBAAkB;IAE7B,QAAQ,CAAC,WAAW,EAAE,WAAW,CAAC,UAAU,CAAC,CAAC;IAC9C,QAAQ,CAAC,cAAc,EAAE,cAAc,CAAC;IACxC;;OAEG;IACH,OAAO,CAAC,wBAAwB,CAAuB;IACvD;;OAEG;IACH,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC;IAC9B;;OAEG;IACH,QAAQ,CAAC,OAAO,EAAE,wBAAwB,CAAC;gBAE/B,IAAI,EAAE,iCAAiC,CAAC,UAAU,CAAC;IAS/D,QAAQ,GAAI,SAAS,UAAU,
|
|
1
|
+
{"version":3,"file":"ClassificationEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/ClassificationEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAGxC,OAAO,KAAK,EACV,wBAAwB,EACxB,iCAAiC,EACjC,WAAW,EACX,cAAc,EACd,kBAAkB,EACnB,MAAM,UAAU,CAAC;AAClB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAGnD,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAE9C;;GAEG;AACH,qBAAa,uBAAuB,CAAC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAC7E,SAAQ,YAAY,CAAC,UAAU,CAC/B,YAAW,kBAAkB;IAE7B,QAAQ,CAAC,WAAW,EAAE,WAAW,CAAC,UAAU,CAAC,CAAC;IAC9C,QAAQ,CAAC,cAAc,EAAE,cAAc,CAAC;IACxC;;OAEG;IACH,OAAO,CAAC,wBAAwB,CAAuB;IACvD;;OAEG;IACH,QAAQ,CAAC,KAAK,EAAE,aAAa,CAAC;IAC9B;;OAEG;IACH,QAAQ,CAAC,OAAO,EAAE,wBAAwB,CAAC;gBAE/B,IAAI,EAAE,iCAAiC,CAAC,UAAU,CAAC;IAS/D,QAAQ,GAAI,SAAS,UAAU,4CAM7B;IACF;;OAEG;IACH,IAAI,uBAAuB,IAAI,MAAM,EAAE,CAStC;IACD;;OAEG;IACH,gBAAgB,CACd,YAAY,EAAE,aAAa,CAAC,UAAU,CAAC,GACtC,uBAAuB,CAAC,UAAU,CAAC;CAMvC"}
|
|
@@ -1,2 +1,11 @@
|
|
|
1
|
-
|
|
1
|
+
import { type Tracer } from "@opentelemetry/api";
|
|
2
|
+
/**
|
|
3
|
+
* Returns a lazy tracer that resolves from `trace.getTracer()` on every call,
|
|
4
|
+
* so evaluator spans follow whichever provider is currently mounted as global.
|
|
5
|
+
*
|
|
6
|
+
* Cast to `Tracer` is necessary because `startActiveSpan` has multiple
|
|
7
|
+
* overload signatures that cannot be satisfied by a single implementation.
|
|
8
|
+
*/
|
|
9
|
+
export declare function getTracer(name?: string): Tracer;
|
|
10
|
+
export declare const tracer: Tracer;
|
|
2
11
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/telemetry/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/telemetry/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAS,KAAK,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAIxD;;;;;;GAMG;AACH,wBAAgB,SAAS,CAAC,IAAI,GAAE,MAA4B,GAAG,MAAM,CAUpE;AAED,eAAO,MAAM,MAAM,QAAc,CAAC"}
|
|
@@ -1,3 +1,22 @@
|
|
|
1
1
|
import { trace } from "@opentelemetry/api";
|
|
2
|
-
|
|
2
|
+
const DEFAULT_TRACER_NAME = "phoenix-evals";
|
|
3
|
+
/**
|
|
4
|
+
* Returns a lazy tracer that resolves from `trace.getTracer()` on every call,
|
|
5
|
+
* so evaluator spans follow whichever provider is currently mounted as global.
|
|
6
|
+
*
|
|
7
|
+
* Cast to `Tracer` is necessary because `startActiveSpan` has multiple
|
|
8
|
+
* overload signatures that cannot be satisfied by a single implementation.
|
|
9
|
+
*/
|
|
10
|
+
export function getTracer(name = DEFAULT_TRACER_NAME) {
|
|
11
|
+
return {
|
|
12
|
+
startSpan(spanName, options, context) {
|
|
13
|
+
return trace.getTracer(name).startSpan(spanName, options, context);
|
|
14
|
+
},
|
|
15
|
+
startActiveSpan(...args) {
|
|
16
|
+
const tracer = trace.getTracer(name);
|
|
17
|
+
return Reflect.apply(tracer.startActiveSpan, tracer, args);
|
|
18
|
+
},
|
|
19
|
+
};
|
|
20
|
+
}
|
|
21
|
+
export const tracer = getTracer();
|
|
3
22
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/telemetry/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/telemetry/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAe,MAAM,oBAAoB,CAAC;AAExD,MAAM,mBAAmB,GAAG,eAAe,CAAC;AAE5C;;;;;;GAMG;AACH,MAAM,UAAU,SAAS,CAAC,OAAe,mBAAmB;IAC1D,OAAO;QACL,SAAS,CAAC,QAAQ,EAAE,OAAO,EAAE,OAAO;YAClC,OAAO,KAAK,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,QAAQ,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;QACrE,CAAC;QACD,eAAe,CAAC,GAAG,IAAe;YAChC,MAAM,MAAM,GAAG,KAAK,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YACrC,OAAO,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,eAAe,EAAE,MAAM,EAAE,IAAI,CAAC,CAAC;QAC7D,CAAC;KACQ,CAAC;AACd,CAAC;AAED,MAAM,CAAC,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC"}
|