@arizeai/phoenix-evals 0.6.5 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +49 -0
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js +10 -17
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js +12 -22
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js +27 -0
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/index.d.ts +2 -0
- package/dist/esm/__generated__/default_templates/index.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/index.js +2 -0
- package/dist/esm/__generated__/default_templates/index.js.map +1 -1
- package/dist/esm/llm/createCorrectnessEvaluator.d.ts +42 -0
- package/dist/esm/llm/createCorrectnessEvaluator.d.ts.map +1 -0
- package/dist/esm/llm/createCorrectnessEvaluator.js +38 -0
- package/dist/esm/llm/createCorrectnessEvaluator.js.map +1 -0
- package/dist/esm/llm/index.d.ts +1 -0
- package/dist/esm/llm/index.d.ts.map +1 -1
- package/dist/esm/llm/index.js +1 -0
- package/dist/esm/llm/index.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +52 -0
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js +10 -17
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js +12 -22
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js +30 -0
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/index.d.ts +2 -0
- package/dist/src/__generated__/default_templates/index.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/index.js +5 -1
- package/dist/src/__generated__/default_templates/index.js.map +1 -1
- package/dist/src/llm/createCorrectnessEvaluator.d.ts +42 -0
- package/dist/src/llm/createCorrectnessEvaluator.d.ts.map +1 -0
- package/dist/src/llm/createCorrectnessEvaluator.js +49 -0
- package/dist/src/llm/createCorrectnessEvaluator.js.map +1 -0
- package/dist/src/llm/index.d.ts +1 -0
- package/dist/src/llm/index.d.ts.map +1 -1
- package/dist/src/llm/index.js +1 -0
- package/dist/src/llm/index.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +1 -1
- package/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +51 -0
- package/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts +10 -17
- package/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts +12 -22
- package/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts +29 -0
- package/src/__generated__/default_templates/index.ts +2 -0
- package/src/llm/createCorrectnessEvaluator.ts +71 -0
- package/src/llm/index.ts +1 -0
package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// This file is generated. Do not edit by hand.
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
|
+
exports.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
|
+
name: "correctness",
|
|
7
|
+
description: "Assess factual accuracy and completeness of model outputs.",
|
|
8
|
+
optimizationDirection: "MAXIMIZE",
|
|
9
|
+
template: [
|
|
10
|
+
{
|
|
11
|
+
role: "user",
|
|
12
|
+
content: `
|
|
13
|
+
You are an expert evaluator labeling model outputs for correctness. Your task is to assign a classification based on the following criteria:
|
|
14
|
+
|
|
15
|
+
<rubric>
|
|
16
|
+
CORRECT - The response:
|
|
17
|
+
- Provides accurate and complete information with no factual errors
|
|
18
|
+
- Addresses all parts of the question
|
|
19
|
+
- Is logically consistent with no contradictions
|
|
20
|
+
- Uses precise, domain-appropriate terminology
|
|
21
|
+
- Avoids ambiguous or misleading language
|
|
22
|
+
|
|
23
|
+
INCORRECT - The response contains any of:
|
|
24
|
+
- Factual errors or inaccuracies
|
|
25
|
+
- Incomplete or partial answers
|
|
26
|
+
- Misleading or ambiguous statements
|
|
27
|
+
- Incorrect terminology
|
|
28
|
+
- Logical inconsistencies
|
|
29
|
+
- Missing key information
|
|
30
|
+
</rubric>
|
|
31
|
+
|
|
32
|
+
<data>
|
|
33
|
+
<input>
|
|
34
|
+
{{input}}
|
|
35
|
+
</input>
|
|
36
|
+
<output>
|
|
37
|
+
{{output}}
|
|
38
|
+
</output>
|
|
39
|
+
</data>
|
|
40
|
+
|
|
41
|
+
Carefully read the input and output and check for factual accuracy and completeness. Focus on correctness of information rather than verboseness or style.
|
|
42
|
+
|
|
43
|
+
Is the output correct or incorrect?
|
|
44
|
+
`,
|
|
45
|
+
},
|
|
46
|
+
],
|
|
47
|
+
choices: {
|
|
48
|
+
"correct": 1,
|
|
49
|
+
"incorrect": 0
|
|
50
|
+
},
|
|
51
|
+
};
|
|
52
|
+
//# sourceMappingURL=CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map
|
package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,2CAA2C,GAAkC;IACxF,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE,4DAA4D;IACzE,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAgCd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,kDAAkD,EAAE,
|
|
1
|
+
{"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,kDAAkD,EAAE,6BA2BhE,CAAC"}
|
|
@@ -10,25 +10,18 @@ exports.DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
|
10
10
|
{
|
|
11
11
|
role: "user",
|
|
12
12
|
content: `
|
|
13
|
-
You are comparing a document to a question and trying to determine
|
|
14
|
-
if the document text contains information relevant to answering the
|
|
15
|
-
question. Here is the data:
|
|
13
|
+
You are comparing a document to a question and trying to determine if the document text contains information relevant to answering the question. Here is the data:
|
|
16
14
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
15
|
+
<data>
|
|
16
|
+
<question>
|
|
17
|
+
{{input}}
|
|
18
|
+
</question>
|
|
19
|
+
<document_text>
|
|
20
|
+
{{documentText}}
|
|
21
|
+
</document_text>
|
|
22
|
+
</data>
|
|
24
23
|
|
|
25
|
-
Compare the question above to the document text. You must determine
|
|
26
|
-
whether the document text contains information that can answer the
|
|
27
|
-
question. Please focus on whether the very specific question can be
|
|
28
|
-
answered by the information in the document text. Your response must be
|
|
29
|
-
either "relevant" or "unrelated". "unrelated" means that the document
|
|
30
|
-
text does not contain an answer to the question. "relevant" means the
|
|
31
|
-
document text contains an answer to the question.
|
|
24
|
+
Compare the question above to the document text. You must determine whether the document text contains information that can answer the question. Please focus on whether the very specific question can be answered by the information in the document text. Your response must be either "relevant" or "unrelated". "unrelated" means that the document text does not contain an answer to the question. "relevant" means the document text contains an answer to the question.
|
|
32
25
|
`,
|
|
33
26
|
},
|
|
34
27
|
],
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,kDAAkD,GAAkC;IAC/F,IAAI,EAAE,oBAAoB;IAC1B,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE
|
|
1
|
+
{"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,kDAAkD,GAAkC;IAC/F,IAAI,EAAE,oBAAoB;IAC1B,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;CAad;SACI;KACF;IACD,OAAO,EAAE;QACT,UAAU,EAAE,CAAC;QACb,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,6CAA6C,EAAE,
|
|
1
|
+
{"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,6CAA6C,EAAE,6BA8B3D,CAAC"}
|
package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js
CHANGED
|
@@ -10,29 +10,19 @@ exports.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
|
10
10
|
{
|
|
11
11
|
role: "user",
|
|
12
12
|
content: `
|
|
13
|
-
In this task, you will be presented with a query, some context and a response. The response
|
|
14
|
-
is generated to the question based on the context. The response may contain false
|
|
15
|
-
information. You must use the context to determine if the response to the question
|
|
16
|
-
contains false information, if the response is a hallucination of facts. Your objective is
|
|
17
|
-
to determine whether the response text contains factual information and is not a
|
|
18
|
-
hallucination. A 'hallucination' refers to a response that is not based on the context or
|
|
19
|
-
assumes information that is not available in the context. Your response should be a single
|
|
20
|
-
word: either 'factual' or 'hallucinated', and it should not include any other text or
|
|
21
|
-
characters. 'hallucinated' indicates that the response provides factually inaccurate
|
|
22
|
-
information to the query based on the context. 'factual' indicates that the response to
|
|
23
|
-
the question is correct relative to the context, and does not contain made up
|
|
24
|
-
information. Please read the query and context carefully before determining your
|
|
25
|
-
response.
|
|
13
|
+
In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is a hallucination of facts. Your objective is to determine whether the response text contains factual information and is not a hallucination. A 'hallucination' refers to a response that is not based on the context or assumes information that is not available in the context. Your response should be a single word: either 'factual' or 'hallucinated', and it should not include any other text or characters. 'hallucinated' indicates that the response provides factually inaccurate information to the query based on the context. 'factual' indicates that the response to the question is correct relative to the context, and does not contain made up information. Please read the query and context carefully before determining your response.
|
|
26
14
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
15
|
+
<data>
|
|
16
|
+
<query>
|
|
17
|
+
{{input}}
|
|
18
|
+
</query>
|
|
19
|
+
<context>
|
|
20
|
+
{{context}}
|
|
21
|
+
</context>
|
|
22
|
+
<response>
|
|
23
|
+
{{output}}
|
|
24
|
+
</response>
|
|
25
|
+
</data>
|
|
36
26
|
|
|
37
27
|
Is the response above factual or hallucinated based on the query and context?
|
|
38
28
|
`,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,6CAA6C,GAAkC;IAC1F,IAAI,EAAE,eAAe;IACrB,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE
|
|
1
|
+
{"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,6CAA6C,GAAkC;IAC1F,IAAI,EAAE,eAAe;IACrB,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;CAgBd;SACI;KACF;IACD,OAAO,EAAE;QACT,cAAc,EAAE,CAAC;QACjB,SAAS,EAAE,CAAC;KACb;CACA,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,8CAA8C,EAAE,6BAwB5D,CAAC"}
|
package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// This file is generated. Do not edit by hand.
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
|
+
exports.TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
|
+
name: "tool_selection",
|
|
7
|
+
description: "For determining if the correct tool was selected for a given context. Requires conversation context, a list of available tools, and the LLM's tool selections.",
|
|
8
|
+
optimizationDirection: "MAXIMIZE",
|
|
9
|
+
template: [
|
|
10
|
+
{
|
|
11
|
+
role: "user",
|
|
12
|
+
content: `
|
|
13
|
+
You are an impartial judge evaluating an LLM's tool-calling behavior, specifically whether the LLM selected the most appropriate tool or tools for the task.
|
|
14
|
+
Your task: Determine whether the LLM's tool selection was correct or incorrect based on: - The conversation context - The available tools - The LLM's tool invocation(s)
|
|
15
|
+
Criteria Return "correct" only when ALL of the following are true: - The LLM chose the best available tool for the user query OR correctly avoided tools if none were needed. - The tool name exists in the available tools list. - The tool is allowed and safe to call. - The LLM selected the correct number of tools for the task.
|
|
16
|
+
Return "incorrect" if ANY of the following are true: - The LLM used a hallucinated or nonexistent tool. - The LLM selected a tool when none was needed. - The LLM did not use a tool when one was required. - The LLM chose a suboptimal or irrelevant tool. - The LLM selected an unsafe or not-permitted tool. - The tool name does not appear in the available tools list.
|
|
17
|
+
Before providing your final judgment, explain your reasoning and consider: - What does the input context require? - Can this be answered without tools, or is a tool necessary? - If a tool was selected, does it exist in the available tools? - Does the selected tool's description match the user's needs? - Is the selection safe and appropriate? - Is there a better tool available that should have been chosen instead?
|
|
18
|
+
<data> <context> {{input}} </context>
|
|
19
|
+
<available_tools> {{availableTools}} </available_tools>
|
|
20
|
+
<tool_selection> {{toolSelection}} </tool_selection> </data>
|
|
21
|
+
Given the above data, is the tool selection correct or incorrect?
|
|
22
|
+
`,
|
|
23
|
+
},
|
|
24
|
+
],
|
|
25
|
+
choices: {
|
|
26
|
+
"correct": 1,
|
|
27
|
+
"incorrect": 0
|
|
28
|
+
},
|
|
29
|
+
};
|
|
30
|
+
//# sourceMappingURL=TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,8CAA8C,GAAkC;IAC3F,IAAI,EAAE,gBAAgB;IACtB,WAAW,EAAE,gKAAgK;IAC7K,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;CAUd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
export { CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG";
|
|
1
2
|
export { DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG } from "./DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG";
|
|
2
3
|
export { HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG";
|
|
4
|
+
export { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG";
|
|
3
5
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
|
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
// This file is generated. Do not edit by hand.
|
|
3
3
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
-
exports.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG = exports.DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
4
|
+
exports.TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG = exports.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG = exports.DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG = exports.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
|
+
var CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG_1 = require("./CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG");
|
|
6
|
+
Object.defineProperty(exports, "CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG", { enumerable: true, get: function () { return CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG_1.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG; } });
|
|
5
7
|
var DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG_1 = require("./DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG");
|
|
6
8
|
Object.defineProperty(exports, "DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG", { enumerable: true, get: function () { return DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG_1.DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG; } });
|
|
7
9
|
var HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG_1 = require("./HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG");
|
|
8
10
|
Object.defineProperty(exports, "HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG", { enumerable: true, get: function () { return HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG_1.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG; } });
|
|
11
|
+
var TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG_1 = require("./TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG");
|
|
12
|
+
Object.defineProperty(exports, "TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG", { enumerable: true, get: function () { return TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG_1.TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG; } });
|
|
9
13
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAE/C,2HAA0H;AAAjH,wLAAA,kDAAkD,OAAA;AAC3D,iHAAgH;AAAvG,8KAAA,6CAA6C,OAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAE/C,6GAA4G;AAAnG,0KAAA,2CAA2C,OAAA;AACpD,2HAA0H;AAAjH,wLAAA,kDAAkD,OAAA;AAC3D,iHAAgH;AAAvG,8KAAA,6CAA6C,OAAA;AACtD,mHAAkH;AAAzG,gLAAA,8CAA8C,OAAA"}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { CreateClassificationEvaluatorArgs } from "../types/evals";
|
|
2
|
+
import { ClassificationEvaluator } from "./ClassificationEvaluator";
|
|
3
|
+
export interface CorrectnessEvaluatorArgs<RecordType extends Record<string, unknown> = CorrectnessEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
|
|
4
|
+
optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
|
|
5
|
+
name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
|
|
6
|
+
choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
|
|
7
|
+
promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* A record to be evaluated by the correctness evaluator.
|
|
11
|
+
*/
|
|
12
|
+
export type CorrectnessEvaluationRecord = {
|
|
13
|
+
input: string;
|
|
14
|
+
output: string;
|
|
15
|
+
};
|
|
16
|
+
/**
|
|
17
|
+
* Creates a correctness evaluator function.
|
|
18
|
+
*
|
|
19
|
+
* This function returns an evaluator that determines whether a given output
|
|
20
|
+
* is factually accurate, complete, logically consistent, and uses precise terminology.
|
|
21
|
+
*
|
|
22
|
+
* @param args - The arguments for creating the correctness evaluator.
|
|
23
|
+
* @param args.model - The model to use for classification.
|
|
24
|
+
* @param args.choices - The possible classification choices (defaults to CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.choices).
|
|
25
|
+
* @param args.promptTemplate - The prompt template to use (defaults to CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.template).
|
|
26
|
+
* @param args.telemetry - The telemetry to use for the evaluator.
|
|
27
|
+
*
|
|
28
|
+
* @returns A {@link ClassificationEvaluator} whose `evaluate` method takes a {@link CorrectnessEvaluationRecord} and returns a classification result
|
|
29
|
+
* indicating whether the output is correct or incorrect.
|
|
30
|
+
*
|
|
31
|
+
* @example
|
|
32
|
+
* ```ts
|
|
33
|
+
* const evaluator = createCorrectnessEvaluator({ model: openai("gpt-4o-mini") });
|
|
34
|
+
* const result = await evaluator.evaluate({
|
|
35
|
+
* input: "What is the capital of France?",
|
|
36
|
+
* output: "Paris is the capital of France.",
|
|
37
|
+
* });
|
|
38
|
+
* console.log(result.label); // "correct" or "incorrect"
|
|
39
|
+
* ```
|
|
40
|
+
*/
|
|
41
|
+
export declare function createCorrectnessEvaluator<RecordType extends Record<string, unknown> = CorrectnessEvaluationRecord>(args: CorrectnessEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
|
|
42
|
+
//# sourceMappingURL=createCorrectnessEvaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"createCorrectnessEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createCorrectnessEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,wBAAwB,CACvC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,2BAA2B,CACxE,SAAQ,IAAI,CACV,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACD,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,2BAA2B,GAAG;IACxC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,0BAA0B,CACxC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,2BAA2B,EAExE,IAAI,EAAE,wBAAwB,CAAC,UAAU,CAAC,GACzC,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __rest = (this && this.__rest) || function (s, e) {
|
|
3
|
+
var t = {};
|
|
4
|
+
for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)
|
|
5
|
+
t[p] = s[p];
|
|
6
|
+
if (s != null && typeof Object.getOwnPropertySymbols === "function")
|
|
7
|
+
for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {
|
|
8
|
+
if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))
|
|
9
|
+
t[p[i]] = s[p[i]];
|
|
10
|
+
}
|
|
11
|
+
return t;
|
|
12
|
+
};
|
|
13
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
14
|
+
exports.createCorrectnessEvaluator = createCorrectnessEvaluator;
|
|
15
|
+
const default_templates_1 = require("../__generated__/default_templates");
|
|
16
|
+
const createClassificationEvaluator_1 = require("./createClassificationEvaluator");
|
|
17
|
+
/**
|
|
18
|
+
* Creates a correctness evaluator function.
|
|
19
|
+
*
|
|
20
|
+
* This function returns an evaluator that determines whether a given output
|
|
21
|
+
* is factually accurate, complete, logically consistent, and uses precise terminology.
|
|
22
|
+
*
|
|
23
|
+
* @param args - The arguments for creating the correctness evaluator.
|
|
24
|
+
* @param args.model - The model to use for classification.
|
|
25
|
+
* @param args.choices - The possible classification choices (defaults to CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.choices).
|
|
26
|
+
* @param args.promptTemplate - The prompt template to use (defaults to CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.template).
|
|
27
|
+
* @param args.telemetry - The telemetry to use for the evaluator.
|
|
28
|
+
*
|
|
29
|
+
* @returns An evaluator object whose `evaluate` method takes a {@link CorrectnessEvaluationRecord} and returns a classification result
|
|
30
|
+
* indicating whether the output is correct or incorrect.
|
|
31
|
+
*
|
|
32
|
+
* @example
|
|
33
|
+
* ```ts
|
|
34
|
+
* const evaluator = createCorrectnessEvaluator({ model: openai("gpt-4o-mini") });
|
|
35
|
+
* const result = await evaluator.evaluate({
|
|
36
|
+
* input: "What is the capital of France?",
|
|
37
|
+
* output: "Paris is the capital of France.",
|
|
38
|
+
* });
|
|
39
|
+
* console.log(result.label); // "correct" or "incorrect"
|
|
40
|
+
* ```
|
|
41
|
+
*/
|
|
42
|
+
function createCorrectnessEvaluator(args) {
|
|
43
|
+
const { choices = default_templates_1.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = default_templates_1.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = default_templates_1.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = default_templates_1.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.name } = args, rest = __rest(args, ["choices", "promptTemplate", "optimizationDirection", "name"]);
|
|
44
|
+
return (0, createClassificationEvaluator_1.createClassificationEvaluator)(Object.assign(Object.assign({}, rest), { promptTemplate,
|
|
45
|
+
choices,
|
|
46
|
+
optimizationDirection,
|
|
47
|
+
name }));
|
|
48
|
+
}
|
|
49
|
+
//# sourceMappingURL=createCorrectnessEvaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"createCorrectnessEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createCorrectnessEvaluator.ts"],"names":[],"mappings":";;;;;;;;;;;;;AAmDA,gEAmBC;AAtED,0EAAiG;AAIjG,mFAAgF;AAsBhF;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,SAAgB,0BAA0B,CAGxC,IAA0C;IAE1C,MAAM,EACJ,OAAO,GAAG,+DAA2C,CAAC,OAAO,EAC7D,cAAc,GAAG,+DAA2C,CAAC,QAAQ,EACrE,qBAAqB,GAAG,+DAA2C,CAAC,qBAAqB,EACzF,IAAI,GAAG,+DAA2C,CAAC,IAAI,KAErD,IAAI,EADH,IAAI,UACL,IAAI,EANF,8DAML,CAAO,CAAC;IACT,OAAO,IAAA,6DAA6B,kCAC/B,IAAI,KACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI,IACJ,CAAC;AACL,CAAC"}
|
package/dist/src/llm/index.d.ts
CHANGED
|
@@ -3,6 +3,7 @@ export * from "./createClassifierFn";
|
|
|
3
3
|
export * from "./createClassificationEvaluator";
|
|
4
4
|
export * from "./createHallucinationEvaluator";
|
|
5
5
|
export * from "./createDocumentRelevanceEvaluator";
|
|
6
|
+
export * from "./createCorrectnessEvaluator";
|
|
6
7
|
export * from "./ClassificationEvaluator";
|
|
7
8
|
export * from "./LLMEvaluator";
|
|
8
9
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,0BAA0B,CAAC;AACzC,cAAc,sBAAsB,CAAC;AACrC,cAAc,iCAAiC,CAAC;AAChD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,oCAAoC,CAAC;AACnD,cAAc,2BAA2B,CAAC;AAC1C,cAAc,gBAAgB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,0BAA0B,CAAC;AACzC,cAAc,sBAAsB,CAAC;AACrC,cAAc,iCAAiC,CAAC;AAChD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,oCAAoC,CAAC;AACnD,cAAc,8BAA8B,CAAC;AAC7C,cAAc,2BAA2B,CAAC;AAC1C,cAAc,gBAAgB,CAAC"}
|
package/dist/src/llm/index.js
CHANGED
|
@@ -19,6 +19,7 @@ __exportStar(require("./createClassifierFn"), exports);
|
|
|
19
19
|
__exportStar(require("./createClassificationEvaluator"), exports);
|
|
20
20
|
__exportStar(require("./createHallucinationEvaluator"), exports);
|
|
21
21
|
__exportStar(require("./createDocumentRelevanceEvaluator"), exports);
|
|
22
|
+
__exportStar(require("./createCorrectnessEvaluator"), exports);
|
|
22
23
|
__exportStar(require("./ClassificationEvaluator"), exports);
|
|
23
24
|
__exportStar(require("./LLMEvaluator"), exports);
|
|
24
25
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,2DAAyC;AACzC,uDAAqC;AACrC,kEAAgD;AAChD,iEAA+C;AAC/C,qEAAmD;AACnD,4DAA0C;AAC1C,iDAA+B"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,2DAAyC;AACzC,uDAAqC;AACrC,kEAAgD;AAChD,iEAA+C;AAC/C,qEAAmD;AACnD,+DAA6C;AAC7C,4DAA0C;AAC1C,iDAA+B"}
|