@arizeai/phoenix-evals 0.6.5 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +49 -0
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js +10 -17
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +33 -0
- package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js +12 -22
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js +30 -0
- package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js +27 -0
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/index.d.ts +4 -0
- package/dist/esm/__generated__/default_templates/index.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/index.js +4 -0
- package/dist/esm/__generated__/default_templates/index.js.map +1 -1
- package/dist/esm/core/EvaluatorBase.d.ts.map +1 -1
- package/dist/esm/llm/createCorrectnessEvaluator.d.ts +42 -0
- package/dist/esm/llm/createCorrectnessEvaluator.d.ts.map +1 -0
- package/dist/esm/llm/createCorrectnessEvaluator.js +38 -0
- package/dist/esm/llm/createCorrectnessEvaluator.js.map +1 -0
- package/dist/esm/llm/createDocumentRelevanceEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createDocumentRelevanceEvaluator.js.map +1 -1
- package/dist/esm/llm/createFaithfulnessEvaluator.d.ts +24 -0
- package/dist/esm/llm/createFaithfulnessEvaluator.d.ts.map +1 -0
- package/dist/esm/llm/createFaithfulnessEvaluator.js +19 -0
- package/dist/esm/llm/createFaithfulnessEvaluator.js.map +1 -0
- package/dist/esm/llm/createHallucinationEvaluator.d.ts +12 -0
- package/dist/esm/llm/createHallucinationEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createHallucinationEvaluator.js +17 -0
- package/dist/esm/llm/createHallucinationEvaluator.js.map +1 -1
- package/dist/esm/llm/createToolInvocationEvaluator.d.ts +74 -0
- package/dist/esm/llm/createToolInvocationEvaluator.d.ts.map +1 -0
- package/dist/esm/llm/createToolInvocationEvaluator.js +60 -0
- package/dist/esm/llm/createToolInvocationEvaluator.js.map +1 -0
- package/dist/esm/llm/index.d.ts +7 -4
- package/dist/esm/llm/index.d.ts.map +1 -1
- package/dist/esm/llm/index.js +7 -4
- package/dist/esm/llm/index.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/evals.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +52 -0
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js +10 -17
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +36 -0
- package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js +12 -22
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js +33 -0
- package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js +30 -0
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/index.d.ts +4 -0
- package/dist/src/__generated__/default_templates/index.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/index.js +9 -1
- package/dist/src/__generated__/default_templates/index.js.map +1 -1
- package/dist/src/core/EvaluatorBase.d.ts.map +1 -1
- package/dist/src/llm/createCorrectnessEvaluator.d.ts +42 -0
- package/dist/src/llm/createCorrectnessEvaluator.d.ts.map +1 -0
- package/dist/src/llm/createCorrectnessEvaluator.js +49 -0
- package/dist/src/llm/createCorrectnessEvaluator.js.map +1 -0
- package/dist/src/llm/createDocumentRelevanceEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createDocumentRelevanceEvaluator.js.map +1 -1
- package/dist/src/llm/createFaithfulnessEvaluator.d.ts +24 -0
- package/dist/src/llm/createFaithfulnessEvaluator.d.ts.map +1 -0
- package/dist/src/llm/createFaithfulnessEvaluator.js +30 -0
- package/dist/src/llm/createFaithfulnessEvaluator.js.map +1 -0
- package/dist/src/llm/createHallucinationEvaluator.d.ts +12 -0
- package/dist/src/llm/createHallucinationEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createHallucinationEvaluator.js +17 -0
- package/dist/src/llm/createHallucinationEvaluator.js.map +1 -1
- package/dist/src/llm/createToolInvocationEvaluator.d.ts +74 -0
- package/dist/src/llm/createToolInvocationEvaluator.d.ts.map +1 -0
- package/dist/src/llm/createToolInvocationEvaluator.js +71 -0
- package/dist/src/llm/createToolInvocationEvaluator.js.map +1 -0
- package/dist/src/llm/index.d.ts +7 -4
- package/dist/src/llm/index.d.ts.map +1 -1
- package/dist/src/llm/index.js +7 -4
- package/dist/src/llm/index.js.map +1 -1
- package/dist/src/types/evals.d.ts.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +1 -1
- package/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +51 -0
- package/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts +10 -17
- package/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +35 -0
- package/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts +12 -22
- package/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts +32 -0
- package/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts +29 -0
- package/src/__generated__/default_templates/index.ts +4 -0
- package/src/core/EvaluatorBase.ts +3 -3
- package/src/llm/createCorrectnessEvaluator.ts +71 -0
- package/src/llm/createDocumentRelevanceEvaluator.ts +7 -11
- package/src/llm/createFaithfulnessEvaluator.ts +52 -0
- package/src/llm/createHallucinationEvaluator.ts +25 -3
- package/src/llm/createToolInvocationEvaluator.ts +103 -0
- package/src/llm/index.ts +7 -4
- package/src/types/evals.ts +5 -4
|
@@ -1,4 +1,10 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @deprecated This evaluator is maintained for backwards compatibility.
|
|
4
|
+
* Please use createFaithfulnessEvaluator instead, which uses updated terminology:
|
|
5
|
+
* - 'faithful'/'unfaithful' labels instead of 'factual'/'hallucinated'
|
|
6
|
+
* - Maximizes score (1.0=faithful) instead of minimizing it
|
|
7
|
+
*/
|
|
2
8
|
var __rest = (this && this.__rest) || function (s, e) {
|
|
3
9
|
var t = {};
|
|
4
10
|
for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)
|
|
@@ -15,12 +21,23 @@ exports.createHallucinationEvaluator = createHallucinationEvaluator;
|
|
|
15
21
|
const default_templates_1 = require("../__generated__/default_templates");
|
|
16
22
|
const createClassificationEvaluator_1 = require("./createClassificationEvaluator");
|
|
17
23
|
/**
|
|
24
|
+
* @deprecated Use createFaithfulnessEvaluator instead.
|
|
25
|
+
*
|
|
18
26
|
* Creates a function that evaluates whether an answer is factual or hallucinated based on a query and reference text.
|
|
19
27
|
*
|
|
28
|
+
* Note: This is deprecated. Please use createFaithfulnessEvaluator which:
|
|
29
|
+
* - Uses 'faithful'/'unfaithful' labels instead of 'factual'/'hallucinated'
|
|
30
|
+
* - Maximizes the score (1.0 for faithful, 0.0 for unfaithful)
|
|
31
|
+
*
|
|
20
32
|
* @param args - The arguments for creating the hallucination evaluator.
|
|
21
33
|
* @returns A function that evaluates whether an answer is factual or hallucinated based on a query and reference text.
|
|
22
34
|
*/
|
|
23
35
|
function createHallucinationEvaluator(args) {
|
|
36
|
+
// eslint-disable-next-line no-console
|
|
37
|
+
console.warn("createHallucinationEvaluator is deprecated and will be removed in a future version. " +
|
|
38
|
+
"Please use createFaithfulnessEvaluator instead. The new evaluator uses " +
|
|
39
|
+
"'faithful'/'unfaithful' labels and maximizes score (1.0=faithful) instead of " +
|
|
40
|
+
"minimizing it (0.0=factual).");
|
|
24
41
|
const { choices = default_templates_1.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = default_templates_1.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = default_templates_1.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = default_templates_1.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.name } = args, rest = __rest(args, ["choices", "promptTemplate", "optimizationDirection", "name"]);
|
|
25
42
|
return (0, createClassificationEvaluator_1.createClassificationEvaluator)(Object.assign(Object.assign({}, rest), { promptTemplate,
|
|
26
43
|
choices,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"createHallucinationEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createHallucinationEvaluator.ts"],"names":[],"mappings":";;;;;;;;;;;;;
|
|
1
|
+
{"version":3,"file":"createHallucinationEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createHallucinationEvaluator.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;;;;;;;;;;;AAyCH,oEA2BC;AAlED,0EAAmG;AAInG,mFAAgF;AAuBhF;;;;;;;;;;;GAWG;AACH,SAAgB,4BAA4B,CAG1C,IAA4C;IAE5C,sCAAsC;IACtC,OAAO,CAAC,IAAI,CACV,sFAAsF;QACpF,yEAAyE;QACzE,+EAA+E;QAC/E,8BAA8B,CACjC,CAAC;IAEF,MAAM,EACJ,OAAO,GAAG,iEAA6C,CAAC,OAAO,EAC/D,cAAc,GAAG,iEAA6C,CAAC,QAAQ,EACvE,qBAAqB,GAAG,iEAA6C,CAAC,qBAAqB,EAC3F,IAAI,GAAG,iEAA6C,CAAC,IAAI,KAEvD,IAAI,EADH,IAAI,UACL,IAAI,EANF,8DAML,CAAO,CAAC;IACT,OAAO,IAAA,6DAA6B,kCAC/B,IAAI,KACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI,IACJ,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { CreateClassificationEvaluatorArgs } from "../types/evals";
|
|
2
|
+
import { ClassificationEvaluator } from "./ClassificationEvaluator";
|
|
3
|
+
export interface ToolInvocationEvaluatorArgs<RecordType extends Record<string, unknown> = ToolInvocationEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
|
|
4
|
+
optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
|
|
5
|
+
name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
|
|
6
|
+
choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
|
|
7
|
+
promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* A record to be evaluated by the tool invocation evaluator.
|
|
11
|
+
*/
|
|
12
|
+
export type ToolInvocationEvaluationRecord = {
|
|
13
|
+
/**
|
|
14
|
+
* The input query or conversation context.
|
|
15
|
+
*/
|
|
16
|
+
input: string;
|
|
17
|
+
/**
|
|
18
|
+
* The available tool schemas, either as JSON schema or human-readable format.
|
|
19
|
+
*/
|
|
20
|
+
availableTools: string;
|
|
21
|
+
/**
|
|
22
|
+
* The tool invocation(s) made by the LLM, including arguments.
|
|
23
|
+
*/
|
|
24
|
+
toolSelection: string;
|
|
25
|
+
};
|
|
26
|
+
/**
|
|
27
|
+
* Creates a tool invocation evaluator function.
|
|
28
|
+
*
|
|
29
|
+
* This function returns an evaluator that determines whether a tool was invoked
|
|
30
|
+
* correctly with proper arguments, formatting, and safe content.
|
|
31
|
+
*
|
|
32
|
+
* The evaluator checks for:
|
|
33
|
+
* - Properly structured JSON (if applicable)
|
|
34
|
+
* - All required fields/parameters present
|
|
35
|
+
* - No hallucinated or nonexistent fields
|
|
36
|
+
* - Argument values matching user query and schema expectations
|
|
37
|
+
* - No unsafe content (e.g., PII) in arguments
|
|
38
|
+
*
|
|
39
|
+
* @param args - The arguments for creating the tool invocation evaluator.
|
|
40
|
+
* @param args.model - The model to use for classification.
|
|
41
|
+
* @param args.choices - The possible classification choices (defaults to correct/incorrect).
|
|
42
|
+
* @param args.promptTemplate - The prompt template to use (defaults to TOOL_INVOCATION_TEMPLATE).
|
|
43
|
+
* @param args.telemetry - The telemetry to use for the evaluator.
|
|
44
|
+
*
|
|
45
|
+
* @returns An evaluator function that takes a {@link ToolInvocationEvaluationRecord} and returns
|
|
46
|
+
* a classification result indicating whether the tool invocation is correct or incorrect.
|
|
47
|
+
*
|
|
48
|
+
* @example
|
|
49
|
+
* ```ts
|
|
50
|
+
* const evaluator = createToolInvocationEvaluator({ model: openai("gpt-4o-mini") });
|
|
51
|
+
*
|
|
52
|
+
* // Example with JSON schema format for available tools
|
|
53
|
+
* const result = await evaluator.evaluate({
|
|
54
|
+
* input: "User: Book a flight from NYC to LA for tomorrow",
|
|
55
|
+
* availableTools: JSON.stringify({
|
|
56
|
+
* name: "book_flight",
|
|
57
|
+
* description: "Book a flight between two cities",
|
|
58
|
+
* parameters: {
|
|
59
|
+
* type: "object",
|
|
60
|
+
* properties: {
|
|
61
|
+
* origin: { type: "string", description: "Departure city code" },
|
|
62
|
+
* destination: { type: "string", description: "Arrival city code" },
|
|
63
|
+
* date: { type: "string", description: "Flight date in YYYY-MM-DD" }
|
|
64
|
+
* },
|
|
65
|
+
* required: ["origin", "destination", "date"]
|
|
66
|
+
* }
|
|
67
|
+
* }),
|
|
68
|
+
* toolSelection: 'book_flight(origin="NYC", destination="LA", date="2024-01-15")'
|
|
69
|
+
* });
|
|
70
|
+
* console.log(result.label); // "correct" or "incorrect"
|
|
71
|
+
* ```
|
|
72
|
+
*/
|
|
73
|
+
export declare function createToolInvocationEvaluator<RecordType extends Record<string, unknown> = ToolInvocationEvaluationRecord>(args: ToolInvocationEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
|
|
74
|
+
//# sourceMappingURL=createToolInvocationEvaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"createToolInvocationEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createToolInvocationEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,2BAA2B,CAC1C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,8BAA8B,CAC3E,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,8BAA8B,GAAG;IAC3C;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB;;OAEG;IACH,aAAa,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8CG;AACH,wBAAgB,6BAA6B,CAC3C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,8BAA8B,EAE3E,IAAI,EAAE,2BAA2B,CAAC,UAAU,CAAC,GAC5C,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __rest = (this && this.__rest) || function (s, e) {
|
|
3
|
+
var t = {};
|
|
4
|
+
for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)
|
|
5
|
+
t[p] = s[p];
|
|
6
|
+
if (s != null && typeof Object.getOwnPropertySymbols === "function")
|
|
7
|
+
for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {
|
|
8
|
+
if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))
|
|
9
|
+
t[p[i]] = s[p[i]];
|
|
10
|
+
}
|
|
11
|
+
return t;
|
|
12
|
+
};
|
|
13
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
14
|
+
exports.createToolInvocationEvaluator = createToolInvocationEvaluator;
|
|
15
|
+
const default_templates_1 = require("../__generated__/default_templates");
|
|
16
|
+
const createClassificationEvaluator_1 = require("./createClassificationEvaluator");
|
|
17
|
+
/**
|
|
18
|
+
* Creates a tool invocation evaluator function.
|
|
19
|
+
*
|
|
20
|
+
* This function returns an evaluator that determines whether a tool was invoked
|
|
21
|
+
* correctly with proper arguments, formatting, and safe content.
|
|
22
|
+
*
|
|
23
|
+
* The evaluator checks for:
|
|
24
|
+
* - Properly structured JSON (if applicable)
|
|
25
|
+
* - All required fields/parameters present
|
|
26
|
+
* - No hallucinated or nonexistent fields
|
|
27
|
+
* - Argument values matching user query and schema expectations
|
|
28
|
+
* - No unsafe content (e.g., PII) in arguments
|
|
29
|
+
*
|
|
30
|
+
* @param args - The arguments for creating the tool invocation evaluator.
|
|
31
|
+
* @param args.model - The model to use for classification.
|
|
32
|
+
* @param args.choices - The possible classification choices (defaults to correct/incorrect).
|
|
33
|
+
* @param args.promptTemplate - The prompt template to use (defaults to TOOL_INVOCATION_TEMPLATE).
|
|
34
|
+
* @param args.telemetry - The telemetry to use for the evaluator.
|
|
35
|
+
*
|
|
36
|
+
* @returns An evaluator function that takes a {@link ToolInvocationEvaluationRecord} and returns
|
|
37
|
+
* a classification result indicating whether the tool invocation is correct or incorrect.
|
|
38
|
+
*
|
|
39
|
+
* @example
|
|
40
|
+
* ```ts
|
|
41
|
+
* const evaluator = createToolInvocationEvaluator({ model: openai("gpt-4o-mini") });
|
|
42
|
+
*
|
|
43
|
+
* // Example with JSON schema format for available tools
|
|
44
|
+
* const result = await evaluator.evaluate({
|
|
45
|
+
* input: "User: Book a flight from NYC to LA for tomorrow",
|
|
46
|
+
* availableTools: JSON.stringify({
|
|
47
|
+
* name: "book_flight",
|
|
48
|
+
* description: "Book a flight between two cities",
|
|
49
|
+
* parameters: {
|
|
50
|
+
* type: "object",
|
|
51
|
+
* properties: {
|
|
52
|
+
* origin: { type: "string", description: "Departure city code" },
|
|
53
|
+
* destination: { type: "string", description: "Arrival city code" },
|
|
54
|
+
* date: { type: "string", description: "Flight date in YYYY-MM-DD" }
|
|
55
|
+
* },
|
|
56
|
+
* required: ["origin", "destination", "date"]
|
|
57
|
+
* }
|
|
58
|
+
* }),
|
|
59
|
+
* toolSelection: 'book_flight(origin="NYC", destination="LA", date="2024-01-15")'
|
|
60
|
+
* });
|
|
61
|
+
* console.log(result.label); // "correct" or "incorrect"
|
|
62
|
+
* ```
|
|
63
|
+
*/
|
|
64
|
+
function createToolInvocationEvaluator(args) {
|
|
65
|
+
const { choices = default_templates_1.TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = default_templates_1.TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = default_templates_1.TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = default_templates_1.TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.name } = args, rest = __rest(args, ["choices", "promptTemplate", "optimizationDirection", "name"]);
|
|
66
|
+
return (0, createClassificationEvaluator_1.createClassificationEvaluator)(Object.assign(Object.assign({}, rest), { promptTemplate,
|
|
67
|
+
choices,
|
|
68
|
+
optimizationDirection,
|
|
69
|
+
name }));
|
|
70
|
+
}
|
|
71
|
+
//# sourceMappingURL=createToolInvocationEvaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"createToolInvocationEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createToolInvocationEvaluator.ts"],"names":[],"mappings":";;;;;;;;;;;;;AAmFA,sEAmBC;AAtGD,0EAAqG;AAIrG,mFAAgF;AAgChF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8CG;AACH,SAAgB,6BAA6B,CAG3C,IAA6C;IAE7C,MAAM,EACJ,OAAO,GAAG,mEAA+C,CAAC,OAAO,EACjE,cAAc,GAAG,mEAA+C,CAAC,QAAQ,EACzE,qBAAqB,GAAG,mEAA+C,CAAC,qBAAqB,EAC7F,IAAI,GAAG,mEAA+C,CAAC,IAAI,KAEzD,IAAI,EADH,IAAI,UACL,IAAI,EANF,8DAML,CAAO,CAAC;IACT,OAAO,IAAA,6DAA6B,kCAC/B,IAAI,KACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI,IACJ,CAAC;AACL,CAAC"}
|
package/dist/src/llm/index.d.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
|
-
export * from "./
|
|
2
|
-
export * from "./createClassifierFn";
|
|
1
|
+
export * from "./ClassificationEvaluator";
|
|
3
2
|
export * from "./createClassificationEvaluator";
|
|
4
|
-
export * from "./
|
|
3
|
+
export * from "./createClassifierFn";
|
|
4
|
+
export * from "./createCorrectnessEvaluator";
|
|
5
5
|
export * from "./createDocumentRelevanceEvaluator";
|
|
6
|
-
export * from "./
|
|
6
|
+
export * from "./createFaithfulnessEvaluator";
|
|
7
|
+
export * from "./createHallucinationEvaluator";
|
|
8
|
+
export * from "./createToolInvocationEvaluator";
|
|
9
|
+
export * from "./generateClassification";
|
|
7
10
|
export * from "./LLMEvaluator";
|
|
8
11
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,iCAAiC,CAAC;AAChD,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}
|
package/dist/src/llm/index.js
CHANGED
|
@@ -14,11 +14,14 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
14
14
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
15
|
};
|
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
-
__exportStar(require("./
|
|
18
|
-
__exportStar(require("./createClassifierFn"), exports);
|
|
17
|
+
__exportStar(require("./ClassificationEvaluator"), exports);
|
|
19
18
|
__exportStar(require("./createClassificationEvaluator"), exports);
|
|
20
|
-
__exportStar(require("./
|
|
19
|
+
__exportStar(require("./createClassifierFn"), exports);
|
|
20
|
+
__exportStar(require("./createCorrectnessEvaluator"), exports);
|
|
21
21
|
__exportStar(require("./createDocumentRelevanceEvaluator"), exports);
|
|
22
|
-
__exportStar(require("./
|
|
22
|
+
__exportStar(require("./createFaithfulnessEvaluator"), exports);
|
|
23
|
+
__exportStar(require("./createHallucinationEvaluator"), exports); // Deprecated: use createFaithfulnessEvaluator
|
|
24
|
+
__exportStar(require("./createToolInvocationEvaluator"), exports);
|
|
25
|
+
__exportStar(require("./generateClassification"), exports);
|
|
23
26
|
__exportStar(require("./LLMEvaluator"), exports);
|
|
24
27
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,4DAA0C;AAC1C,kEAAgD;AAChD,uDAAqC;AACrC,+DAA6C;AAC7C,qEAAmD;AACnD,gEAA8C;AAC9C,iEAA+C,CAAC,8CAA8C;AAC9F,kEAAgD;AAChD,2DAAyC;AACzC,iDAA+B"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evals.d.ts","sourceRoot":"","sources":["../../../src/types/evals.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,OAAO,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAEnC;;GAEG;AACH,MAAM,WAAW,aAAa,CAAC,UAAU,EAAE,SAAS;IAClD,MAAM,EAAE,UAAU,CAAC;IACnB,QAAQ,CAAC,EAAE,UAAU,CAAC;IACtB,KAAK,CAAC,EAAE,SAAS,CAAC;IAClB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,aAAa,CAAC;CACtB;AAGD,MAAM,WAAW,iBAAkB,SAAQ,OAAO;CAAG;AAErD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;CACf;AAED;;GAEG;AACH,MAAM,MAAM,wBAAwB,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;AAE9D;;GAEG;AACH,MAAM,WAAW,oBAAqB,SAAQ,aAAa;IAIzD,KAAK,EAAE,aAAa,CAAC;IACrB;;;OAGG;IACH,OAAO,EAAE,wBAAwB,CAAC;IAClC;;OAEG;IACH,cAAc,EAAE,cAAc,CAAC;CAChC;AAED,MAAM,WAAW,mBAAmB,CAClC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CACrE,SAAQ,aAAa;IACrB;;;OAGG;IACH,IAAI,EAAE,MAAM,CAAC;IACb;;OAEG;IACH,IAAI,EAAE,cAAc,CAAC;IACrB;;;OAGG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IAC9C;;OAEG;IACH,YAAY,CAAC,EAAE,aAAa,CAAC,WAAW,CAAC,CAAC;CAC3C;AAED,MAAM,MAAM,sBAAsB,CAAC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,IAC3E,IAAI,CAAC,mBAAmB,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC,CAAC;AAEhD,MAAM,WAAW,iCAAiC,CAChD,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,
|
|
1
|
+
{"version":3,"file":"evals.d.ts","sourceRoot":"","sources":["../../../src/types/evals.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,OAAO,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAEnC;;GAEG;AACH,MAAM,WAAW,aAAa,CAAC,UAAU,EAAE,SAAS;IAClD,MAAM,EAAE,UAAU,CAAC;IACnB,QAAQ,CAAC,EAAE,UAAU,CAAC;IACtB,KAAK,CAAC,EAAE,SAAS,CAAC;IAClB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,aAAa,CAAC;CACtB;AAGD,MAAM,WAAW,iBAAkB,SAAQ,OAAO;CAAG;AAErD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;CACf;AAED;;GAEG;AACH,MAAM,MAAM,wBAAwB,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;AAE9D;;GAEG;AACH,MAAM,WAAW,oBAAqB,SAAQ,aAAa;IAIzD,KAAK,EAAE,aAAa,CAAC;IACrB;;;OAGG;IACH,OAAO,EAAE,wBAAwB,CAAC;IAClC;;OAEG;IACH,cAAc,EAAE,cAAc,CAAC;CAChC;AAED,MAAM,WAAW,mBAAmB,CAClC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CACrE,SAAQ,aAAa;IACrB;;;OAGG;IACH,IAAI,EAAE,MAAM,CAAC;IACb;;OAEG;IACH,IAAI,EAAE,cAAc,CAAC;IACrB;;;OAGG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IAC9C;;OAEG;IACH,YAAY,CAAC,EAAE,aAAa,CAAC,WAAW,CAAC,CAAC;CAC3C;AAED,MAAM,MAAM,sBAAsB,CAAC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,IAC3E,IAAI,CAAC,mBAAmB,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC,CAAC;AAEhD,MAAM,WAAW,iCAAiC,CAChD,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAE1C,SAAQ,oBAAoB,EAAE,sBAAsB,CAAC,UAAU,CAAC;IAChE;;OAEG;IACH,cAAc,EAAE,cAAc,CAAC;CAChC;AAED,MAAM,MAAM,WAAW,CAAC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,IAAI,CACrE,IAAI,EAAE,WAAW,KACd,OAAO,CAAC,gBAAgB,CAAC,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,KAAK,GAAG,MAAM,CAAC;AAE5C;;;GAGG;AACH,MAAM,MAAM,qBAAqB,GAAG,UAAU,GAAG,UAAU,CAAC;AAE5D;;GAEG;AACH,UAAU,oBAAoB;IAC5B;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;IACb;;OAEG;IACH,IAAI,EAAE,cAAc,CAAC;IACrB;;;OAGG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;CAC/C;AAED;;;GAGG;AACH,MAAM,WAAW,kBAAkB,CACjC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAC3C,SAAQ,oBAAoB;IAC5B;;OAEG;IACH,QAAQ,EAAE,WAAW,CAAC,WAAW,CAAC,CAAC;CACpC"}
|