@arizeai/phoenix-evals 0.10.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -14
- package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js +59 -0
- package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/index.d.ts +1 -0
- package/dist/esm/__generated__/default_templates/index.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/index.js +1 -0
- package/dist/esm/__generated__/default_templates/index.js.map +1 -1
- package/dist/esm/__generated__/types.d.ts +1 -1
- package/dist/esm/__generated__/types.d.ts.map +1 -1
- package/dist/esm/llm/createRefusalEvaluator.d.ts +44 -0
- package/dist/esm/llm/createRefusalEvaluator.d.ts.map +1 -0
- package/dist/esm/llm/createRefusalEvaluator.js +40 -0
- package/dist/esm/llm/createRefusalEvaluator.js.map +1 -0
- package/dist/esm/llm/index.d.ts +1 -0
- package/dist/esm/llm/index.d.ts.map +1 -1
- package/dist/esm/llm/index.js +1 -0
- package/dist/esm/llm/index.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/evals.d.ts +1 -1
- package/dist/esm/types/evals.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js +62 -0
- package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/index.d.ts +1 -0
- package/dist/src/__generated__/default_templates/index.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/index.js +3 -1
- package/dist/src/__generated__/default_templates/index.js.map +1 -1
- package/dist/src/__generated__/types.d.ts +1 -1
- package/dist/src/__generated__/types.d.ts.map +1 -1
- package/dist/src/llm/createRefusalEvaluator.d.ts +44 -0
- package/dist/src/llm/createRefusalEvaluator.d.ts.map +1 -0
- package/dist/src/llm/createRefusalEvaluator.js +51 -0
- package/dist/src/llm/createRefusalEvaluator.js.map +1 -0
- package/dist/src/llm/index.d.ts +1 -0
- package/dist/src/llm/index.d.ts.map +1 -1
- package/dist/src/llm/index.js +1 -0
- package/dist/src/llm/index.js.map +1 -1
- package/dist/src/types/evals.d.ts +1 -1
- package/dist/src/types/evals.d.ts.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +3 -3
- package/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.ts +61 -0
- package/src/__generated__/default_templates/index.ts +1 -0
- package/src/__generated__/types.ts +1 -1
- package/src/llm/createRefusalEvaluator.ts +70 -0
- package/src/llm/index.ts +1 -0
- package/src/types/evals.ts +1 -1
package/README.md
CHANGED
|
@@ -91,25 +91,58 @@ See the complete example in [`examples/classifier_example.ts`](examples/classifi
|
|
|
91
91
|
|
|
92
92
|
The library includes several pre-built evaluators for common evaluation tasks. These evaluators come with optimized prompts and can be used directly with any AI SDK model.
|
|
93
93
|
|
|
94
|
+
All pre-built evaluators are available from the `@arizeai/phoenix-evals/llm` module:
|
|
95
|
+
|
|
96
|
+
| Evaluator | Function | Description |
|
|
97
|
+
| ---------------------- | ------------------------------------- | --------------------------------------------------------------------------------- |
|
|
98
|
+
| Faithfulness | `createFaithfulnessEvaluator` | Detects hallucinations — checks if the output is grounded in the provided context |
|
|
99
|
+
| Conciseness | `createConcisenessEvaluator` | Evaluates whether the response is appropriately concise |
|
|
100
|
+
| Correctness | `createCorrectnessEvaluator` | Checks if the output is factually correct given the input |
|
|
101
|
+
| Document Relevance | `createDocumentRelevanceEvaluator` | Measures how relevant a retrieved document is to the query |
|
|
102
|
+
| Refusal | `createRefusalEvaluator` | Detects whether the model refused to answer |
|
|
103
|
+
| Tool Invocation | `createToolInvocationEvaluator` | Evaluates whether the correct tool was invoked with the right arguments |
|
|
104
|
+
| Tool Selection | `createToolSelectionEvaluator` | Checks whether the right tool was selected for the task |
|
|
105
|
+
| Tool Response Handling | `createToolResponseHandlingEvaluator` | Evaluates how well the model uses a tool's response |
|
|
106
|
+
|
|
94
107
|
```typescript
|
|
95
|
-
import {
|
|
108
|
+
import {
|
|
109
|
+
createFaithfulnessEvaluator,
|
|
110
|
+
createConcisenessEvaluator,
|
|
111
|
+
createCorrectnessEvaluator,
|
|
112
|
+
createDocumentRelevanceEvaluator,
|
|
113
|
+
createRefusalEvaluator,
|
|
114
|
+
} from "@arizeai/phoenix-evals/llm";
|
|
96
115
|
import { openai } from "@ai-sdk/openai";
|
|
97
|
-
const model = openai("gpt-4o-mini");
|
|
98
116
|
|
|
99
|
-
|
|
100
|
-
const faithfulnessEvaluator = createFaithfulnessEvaluator({
|
|
101
|
-
model,
|
|
102
|
-
});
|
|
117
|
+
const model = openai("gpt-4o-mini");
|
|
103
118
|
|
|
104
|
-
//
|
|
105
|
-
const
|
|
119
|
+
// Faithfulness: checks if the output is grounded in the context
|
|
120
|
+
const faithfulnessEvaluator = createFaithfulnessEvaluator({ model });
|
|
121
|
+
const faithfulnessResult = await faithfulnessEvaluator.evaluate({
|
|
106
122
|
input: "What is the capital of France?",
|
|
107
123
|
context: "France is a country in Europe. Paris is its capital city.",
|
|
108
124
|
output: "The capital of France is London.",
|
|
109
125
|
});
|
|
110
|
-
|
|
111
|
-
console.log(result);
|
|
126
|
+
console.log(faithfulnessResult);
|
|
112
127
|
// Output: { label: "unfaithful", score: 0, explanation: "..." }
|
|
128
|
+
|
|
129
|
+
// Correctness: checks if the output is factually correct
|
|
130
|
+
const correctnessEvaluator = createCorrectnessEvaluator({ model });
|
|
131
|
+
const correctnessResult = await correctnessEvaluator.evaluate({
|
|
132
|
+
input: "What is the capital of France?",
|
|
133
|
+
output: "Paris is the capital of France.",
|
|
134
|
+
});
|
|
135
|
+
console.log(correctnessResult);
|
|
136
|
+
// Output: { label: "correct", score: 1, explanation: "..." }
|
|
137
|
+
|
|
138
|
+
// Document Relevance: checks if a retrieved document is relevant to the query
|
|
139
|
+
const relevanceEvaluator = createDocumentRelevanceEvaluator({ model });
|
|
140
|
+
const relevanceResult = await relevanceEvaluator.evaluate({
|
|
141
|
+
input: "What is the capital of France?",
|
|
142
|
+
documentText: "Paris is the capital of France and a major European city.",
|
|
143
|
+
});
|
|
144
|
+
console.log(relevanceResult);
|
|
145
|
+
// Output: { label: "relevant", score: 1, explanation: "..." }
|
|
113
146
|
```
|
|
114
147
|
|
|
115
148
|
### Data Mapping
|
|
@@ -117,10 +150,8 @@ console.log(result);
|
|
|
117
150
|
When your data structure doesn't match what an evaluator expects, use `bindEvaluator` to map your fields to the evaluator's expected input format:
|
|
118
151
|
|
|
119
152
|
```typescript
|
|
120
|
-
import {
|
|
121
|
-
|
|
122
|
-
createFaithfulnessEvaluator,
|
|
123
|
-
} from "@arizeai/phoenix-evals";
|
|
153
|
+
import { bindEvaluator } from "@arizeai/phoenix-evals";
|
|
154
|
+
import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals/llm";
|
|
124
155
|
import { openai } from "@ai-sdk/openai";
|
|
125
156
|
|
|
126
157
|
const model = openai("gpt-4o-mini");
|
package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,uCAAuC,EAAE,6BAwDrD,CAAC"}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
// This file is generated. Do not edit by hand.
|
|
2
|
+
export const REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
3
|
+
name: "refusal",
|
|
4
|
+
description: "Detect when an LLM refuses or declines to answer a query.",
|
|
5
|
+
optimizationDirection: "NEUTRAL",
|
|
6
|
+
template: [
|
|
7
|
+
{
|
|
8
|
+
role: "user",
|
|
9
|
+
content: `
|
|
10
|
+
You are an expert evaluator that detects when an LLM refuses, declines, or avoids answering a user query. Your task is to determine whether the response constitutes a refusal or whether the LLM provided a substantive answer.
|
|
11
|
+
|
|
12
|
+
<rubric>
|
|
13
|
+
|
|
14
|
+
REFUSED - The response contains any form of refusal, deflection, or avoidance, including but not limited to:
|
|
15
|
+
|
|
16
|
+
- Explicit refusals (e.g., "I can't help with that", "I'm unable to answer")
|
|
17
|
+
- Scope disclaimers (e.g., "That's outside my area of expertise", "I'm just a chatbot that handles X")
|
|
18
|
+
- Lack-of-information responses (e.g., "I don't have that information", "Nothing in the search results addresses that")
|
|
19
|
+
- Safety or policy refusals (e.g., "I'm not able to provide that kind of advice")
|
|
20
|
+
- Redirections without answering (e.g., "You should ask a professional about that")
|
|
21
|
+
- Apologetic non-answers (e.g., "I'm sorry, but I can't assist with that request")
|
|
22
|
+
- Partial refusals where the LLM acknowledges the question but ultimately does not provide an answer
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
ANSWERED - The response provides a substantive answer to the query, even if:
|
|
26
|
+
|
|
27
|
+
- The answer is incorrect, incomplete, or low quality
|
|
28
|
+
- The answer includes caveats or disclaimers alongside a genuine attempt to respond
|
|
29
|
+
- The answer is speculative or hedged but still addresses the question
|
|
30
|
+
- The answer acknowledges limitations but still provides relevant information
|
|
31
|
+
|
|
32
|
+
</rubric>
|
|
33
|
+
|
|
34
|
+
IMPORTANT: This evaluation is purely about whether a refusal occurred, not whether the refusal was appropriate. A refusal to an out-of-scope question is still a refusal. An incorrect answer to a simple question is still an answer.
|
|
35
|
+
|
|
36
|
+
<data>
|
|
37
|
+
|
|
38
|
+
<input>
|
|
39
|
+
{{input}}
|
|
40
|
+
</input>
|
|
41
|
+
|
|
42
|
+
<output>
|
|
43
|
+
{{output}}
|
|
44
|
+
</output>
|
|
45
|
+
|
|
46
|
+
</data>
|
|
47
|
+
|
|
48
|
+
Carefully read the input and output. Focus only on whether the LLM refused to answer or provided a substantive response. Do not evaluate correctness, helpfulness, or quality.
|
|
49
|
+
|
|
50
|
+
Did the LLM refuse to answer or did it provide an answer?
|
|
51
|
+
`,
|
|
52
|
+
},
|
|
53
|
+
],
|
|
54
|
+
choices: {
|
|
55
|
+
"refused": 1,
|
|
56
|
+
"answered": 0
|
|
57
|
+
},
|
|
58
|
+
};
|
|
59
|
+
//# sourceMappingURL=REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js.map
|
package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,uCAAuC,GAAkC;IACpF,IAAI,EAAE,SAAS;IACf,WAAW,EAAE,2DAA2D;IACxE,qBAAqB,EAAE,SAAS;IAChC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA0Cd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,UAAU,EAAE,CAAC;KACd;CACA,CAAC"}
|
|
@@ -3,6 +3,7 @@ export { CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./CORRECTNESS_CLASS
|
|
|
3
3
|
export { DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG } from "./DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
4
4
|
export { FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
5
5
|
export { HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
6
|
+
export { REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG } from "./REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
6
7
|
export { TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
7
8
|
export { TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
8
9
|
export { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,sDAAsD,EAAE,MAAM,0DAA0D,CAAC;AAClI,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,uCAAuC,EAAE,MAAM,2CAA2C,CAAC;AACpG,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,sDAAsD,EAAE,MAAM,0DAA0D,CAAC;AAClI,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
|
|
@@ -4,6 +4,7 @@ export { CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./CORRECTNESS_CLASS
|
|
|
4
4
|
export { DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG } from "./DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
5
5
|
export { FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
6
6
|
export { HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
7
|
+
export { REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG } from "./REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
7
8
|
export { TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
8
9
|
export { TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
9
10
|
export { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAE/C,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,sDAAsD,EAAE,MAAM,0DAA0D,CAAC;AAClI,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAE/C,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,uCAAuC,EAAE,MAAM,2CAA2C,CAAC;AACpG,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,sDAAsD,EAAE,MAAM,0DAA0D,CAAC;AAClI,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
|
|
@@ -2,7 +2,7 @@ import type { PromptTemplate } from "../types/templating.js";
|
|
|
2
2
|
export type ClassificationEvaluatorConfig = {
|
|
3
3
|
name: string;
|
|
4
4
|
description: string;
|
|
5
|
-
optimizationDirection: "MINIMIZE" | "MAXIMIZE";
|
|
5
|
+
optimizationDirection: "MINIMIZE" | "MAXIMIZE" | "NEUTRAL";
|
|
6
6
|
template: PromptTemplate;
|
|
7
7
|
choices: Record<string, number>;
|
|
8
8
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/__generated__/types.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAE1D,MAAM,MAAM,6BAA6B,GAAG;IAC1C,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,qBAAqB,EAAE,UAAU,GAAG,UAAU,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/__generated__/types.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAE1D,MAAM,MAAM,6BAA6B,GAAG;IAC1C,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,qBAAqB,EAAE,UAAU,GAAG,UAAU,GAAG,SAAS,CAAC;IAC3D,QAAQ,EAAE,cAAc,CAAC;IACzB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjC,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import type { CreateClassificationEvaluatorArgs } from "../types/evals.js";
|
|
2
|
+
import type { ClassificationEvaluator } from "./ClassificationEvaluator.js";
|
|
3
|
+
export interface RefusalEvaluatorArgs<RecordType extends Record<string, unknown> = RefusalEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
|
|
4
|
+
optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
|
|
5
|
+
name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
|
|
6
|
+
choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
|
|
7
|
+
promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* A record to be evaluated by the refusal evaluator.
|
|
11
|
+
*/
|
|
12
|
+
export type RefusalEvaluationRecord = {
|
|
13
|
+
input: string;
|
|
14
|
+
output: string;
|
|
15
|
+
};
|
|
16
|
+
/**
|
|
17
|
+
* Creates a refusal evaluator function.
|
|
18
|
+
*
|
|
19
|
+
* This function returns an evaluator that detects when an LLM refuses,
|
|
20
|
+
* declines, or avoids answering a user query. It is use-case agnostic:
|
|
21
|
+
* it only detects whether a refusal occurred, not whether the refusal
|
|
22
|
+
* was appropriate.
|
|
23
|
+
*
|
|
24
|
+
* @param args - The arguments for creating the refusal evaluator.
|
|
25
|
+
* @param args.model - The model to use for classification.
|
|
26
|
+
* @param args.choices - The possible classification choices (defaults to `REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.choices`).
|
|
27
|
+
* @param args.promptTemplate - The prompt template to use (defaults to `REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.template`).
|
|
28
|
+
* @param args.telemetry - The telemetry to use for the evaluator.
|
|
29
|
+
*
|
|
30
|
+
* @returns An evaluator function that takes a {@link RefusalEvaluationRecord} and returns a classification result
|
|
31
|
+
* indicating whether the output is a refusal or an answer.
|
|
32
|
+
*
|
|
33
|
+
* @example
|
|
34
|
+
* ```ts
|
|
35
|
+
* const evaluator = createRefusalEvaluator({ model: openai("gpt-4o-mini") });
|
|
36
|
+
* const result = await evaluator.evaluate({
|
|
37
|
+
* input: "What is the capital of France?",
|
|
38
|
+
* output: "I'm sorry, I can only help with technical questions.",
|
|
39
|
+
* });
|
|
40
|
+
* console.log(result.label); // "refused" or "answered"
|
|
41
|
+
* ```
|
|
42
|
+
*/
|
|
43
|
+
export declare function createRefusalEvaluator<RecordType extends Record<string, unknown> = RefusalEvaluationRecord>(args: RefusalEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
|
|
44
|
+
//# sourceMappingURL=createRefusalEvaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"createRefusalEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createRefusalEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AACxE,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGzE,MAAM,WAAW,oBAAoB,CACnC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,uBAAuB,CACpE,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,uBAAuB,GAAG;IACpC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,wBAAgB,sBAAsB,CACpC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,uBAAuB,EACpE,IAAI,EAAE,oBAAoB,CAAC,UAAU,CAAC,GAAG,uBAAuB,CAAC,UAAU,CAAC,CAe7E"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG } from "../__generated__/default_templates/index.js";
|
|
2
|
+
import { createClassificationEvaluator } from "./createClassificationEvaluator.js";
|
|
3
|
+
/**
|
|
4
|
+
* Creates a refusal evaluator function.
|
|
5
|
+
*
|
|
6
|
+
* This function returns an evaluator that detects when an LLM refuses,
|
|
7
|
+
* declines, or avoids answering a user query. It is use-case agnostic:
|
|
8
|
+
* it only detects whether a refusal occurred, not whether the refusal
|
|
9
|
+
* was appropriate.
|
|
10
|
+
*
|
|
11
|
+
* @param args - The arguments for creating the refusal evaluator.
|
|
12
|
+
* @param args.model - The model to use for classification.
|
|
13
|
+
* @param args.choices - The possible classification choices (defaults to `REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.choices`).
|
|
14
|
+
* @param args.promptTemplate - The prompt template to use (defaults to `REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.template`).
|
|
15
|
+
* @param args.telemetry - The telemetry to use for the evaluator.
|
|
16
|
+
*
|
|
17
|
+
* @returns An evaluator function that takes a {@link RefusalEvaluationRecord} and returns a classification result
|
|
18
|
+
* indicating whether the output is a refusal or an answer.
|
|
19
|
+
*
|
|
20
|
+
* @example
|
|
21
|
+
* ```ts
|
|
22
|
+
* const evaluator = createRefusalEvaluator({ model: openai("gpt-4o-mini") });
|
|
23
|
+
* const result = await evaluator.evaluate({
|
|
24
|
+
* input: "What is the capital of France?",
|
|
25
|
+
* output: "I'm sorry, I can only help with technical questions.",
|
|
26
|
+
* });
|
|
27
|
+
* console.log(result.label); // "refused" or "answered"
|
|
28
|
+
* ```
|
|
29
|
+
*/
|
|
30
|
+
export function createRefusalEvaluator(args) {
|
|
31
|
+
const { choices = REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.name, ...rest } = args;
|
|
32
|
+
return createClassificationEvaluator({
|
|
33
|
+
...rest,
|
|
34
|
+
promptTemplate,
|
|
35
|
+
choices,
|
|
36
|
+
optimizationDirection,
|
|
37
|
+
name,
|
|
38
|
+
});
|
|
39
|
+
}
|
|
40
|
+
//# sourceMappingURL=createRefusalEvaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"createRefusalEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createRefusalEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,uCAAuC,EAAE,MAAM,oCAAoC,CAAC;AAG7F,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAsBhF;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,MAAM,UAAU,sBAAsB,CAEpC,IAAsC;IACtC,MAAM,EACJ,OAAO,GAAG,uCAAuC,CAAC,OAAO,EACzD,cAAc,GAAG,uCAAuC,CAAC,QAAQ,EACjE,qBAAqB,GAAG,uCAAuC,CAAC,qBAAqB,EACrF,IAAI,GAAG,uCAAuC,CAAC,IAAI,EACnD,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
|
package/dist/esm/llm/index.d.ts
CHANGED
|
@@ -6,6 +6,7 @@ export * from "./createCorrectnessEvaluator.js";
|
|
|
6
6
|
export * from "./createDocumentRelevanceEvaluator.js";
|
|
7
7
|
export * from "./createFaithfulnessEvaluator.js";
|
|
8
8
|
export * from "./createHallucinationEvaluator.js";
|
|
9
|
+
export * from "./createRefusalEvaluator.js";
|
|
9
10
|
export * from "./createToolInvocationEvaluator.js";
|
|
10
11
|
export * from "./createToolResponseHandlingEvaluator.js";
|
|
11
12
|
export * from "./createToolSelectionEvaluator.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,iCAAiC,CAAC;AAChD,cAAc,uCAAuC,CAAC;AACtD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,iCAAiC,CAAC;AAChD,cAAc,uCAAuC,CAAC;AACtD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}
|
package/dist/esm/llm/index.js
CHANGED
|
@@ -6,6 +6,7 @@ export * from "./createCorrectnessEvaluator.js";
|
|
|
6
6
|
export * from "./createDocumentRelevanceEvaluator.js";
|
|
7
7
|
export * from "./createFaithfulnessEvaluator.js";
|
|
8
8
|
export * from "./createHallucinationEvaluator.js"; // Deprecated: use createFaithfulnessEvaluator
|
|
9
|
+
export * from "./createRefusalEvaluator.js";
|
|
9
10
|
export * from "./createToolInvocationEvaluator.js";
|
|
10
11
|
export * from "./createToolResponseHandlingEvaluator.js";
|
|
11
12
|
export * from "./createToolSelectionEvaluator.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC,CAAC,8CAA8C;AAC9F,cAAc,iCAAiC,CAAC;AAChD,cAAc,uCAAuC,CAAC;AACtD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC,CAAC,8CAA8C;AAC9F,cAAc,0BAA0B,CAAC;AACzC,cAAc,iCAAiC,CAAC;AAChD,cAAc,uCAAuC,CAAC;AACtD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}
|