judgeval 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/common/logger.js +28 -24
- package/dist/cjs/common/logger.js.map +1 -1
- package/dist/cjs/common/tracer.js +80 -130
- package/dist/cjs/common/tracer.js.map +1 -1
- package/dist/cjs/constants.js +2 -1
- package/dist/cjs/constants.js.map +1 -1
- package/dist/cjs/data/datasets/eval-dataset-client.js +45 -0
- package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -1
- package/dist/cjs/e2etests/eval-operations.test.js +3 -3
- package/dist/cjs/exporters/otel-exporter.js +352 -0
- package/dist/cjs/exporters/otel-exporter.js.map +1 -0
- package/dist/cjs/judges/index.js +217 -0
- package/dist/cjs/judges/index.js.map +1 -0
- package/dist/cjs/run-evaluation.js +13 -13
- package/dist/cjs/run-evaluation.js.map +1 -1
- package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js +610 -0
- package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-correctness/index.js +19 -0
- package/dist/cjs/scorers/metrics/answer-correctness/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-correctness/prompts.js +175 -0
- package/dist/cjs/scorers/metrics/answer-correctness/prompts.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js +525 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/index.js +19 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js +179 -0
- package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
- package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js +524 -0
- package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
- package/dist/cjs/scorers/metrics/faithfulness/index.js +19 -0
- package/dist/cjs/scorers/metrics/faithfulness/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/faithfulness/prompts.js +232 -0
- package/dist/cjs/scorers/metrics/faithfulness/prompts.js.map +1 -0
- package/dist/cjs/scorers/metrics/hallucination/hallucination.js +390 -0
- package/dist/cjs/scorers/metrics/hallucination/hallucination.js.map +1 -0
- package/dist/cjs/scorers/metrics/hallucination/index.js +11 -0
- package/dist/cjs/scorers/metrics/hallucination/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/hallucination/prompts.js +106 -0
- package/dist/cjs/scorers/metrics/hallucination/prompts.js.map +1 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/index.js +19 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/index.js.map +1 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js +382 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js +124 -0
- package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
- package/dist/esm/common/logger.js +16 -11
- package/dist/esm/common/logger.js.map +1 -1
- package/dist/esm/common/tracer.js +78 -128
- package/dist/esm/common/tracer.js.map +1 -1
- package/dist/esm/constants.js +1 -0
- package/dist/esm/constants.js.map +1 -1
- package/dist/esm/data/datasets/eval-dataset-client.js +46 -1
- package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -1
- package/dist/esm/e2etests/eval-operations.test.js +3 -3
- package/dist/esm/exporters/otel-exporter.js +348 -0
- package/dist/esm/exporters/otel-exporter.js.map +1 -0
- package/dist/esm/judges/index.js +185 -0
- package/dist/esm/judges/index.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js +601 -0
- package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-correctness/index.js +3 -0
- package/dist/esm/scorers/metrics/answer-correctness/index.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-correctness/prompts.js +171 -0
- package/dist/esm/scorers/metrics/answer-correctness/prompts.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js +521 -0
- package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-relevancy/index.js +3 -0
- package/dist/esm/scorers/metrics/answer-relevancy/index.js.map +1 -0
- package/dist/esm/scorers/metrics/answer-relevancy/prompts.js +175 -0
- package/dist/esm/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
- package/dist/esm/scorers/metrics/faithfulness/faithfulness.js +520 -0
- package/dist/esm/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
- package/dist/esm/scorers/metrics/faithfulness/index.js +3 -0
- package/dist/esm/scorers/metrics/faithfulness/index.js.map +1 -0
- package/dist/esm/scorers/metrics/faithfulness/prompts.js +228 -0
- package/dist/esm/scorers/metrics/faithfulness/prompts.js.map +1 -0
- package/dist/esm/scorers/metrics/hallucination/hallucination.js +386 -0
- package/dist/esm/scorers/metrics/hallucination/hallucination.js.map +1 -0
- package/dist/esm/scorers/metrics/hallucination/index.js +3 -0
- package/dist/esm/scorers/metrics/hallucination/index.js.map +1 -0
- package/dist/esm/scorers/metrics/hallucination/prompts.js +102 -0
- package/dist/esm/scorers/metrics/hallucination/prompts.js.map +1 -0
- package/dist/esm/scorers/metrics/instruction-adherence/index.js +3 -0
- package/dist/esm/scorers/metrics/instruction-adherence/index.js.map +1 -0
- package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js +378 -0
- package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
- package/dist/esm/scorers/metrics/instruction-adherence/prompts.js +120 -0
- package/dist/esm/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
- package/dist/types/common/logger.d.ts +1 -1
- package/dist/types/constants.d.ts +1 -0
- package/dist/types/data/datasets/eval-dataset-client.d.ts +5 -0
- package/dist/types/exporters/otel-exporter.d.ts +16 -0
- package/dist/types/judges/index.d.ts +50 -0
- package/dist/types/scorers/metrics/answer-correctness/answer-correctness.d.ts +99 -0
- package/dist/types/scorers/metrics/answer-correctness/index.d.ts +2 -0
- package/dist/types/scorers/metrics/answer-correctness/prompts.d.ts +71 -0
- package/dist/types/scorers/metrics/answer-relevancy/answer-relevancy.d.ts +78 -0
- package/dist/types/scorers/metrics/answer-relevancy/index.d.ts +2 -0
- package/dist/types/scorers/metrics/answer-relevancy/prompts.d.ts +71 -0
- package/dist/types/scorers/metrics/faithfulness/faithfulness.d.ts +77 -0
- package/dist/types/scorers/metrics/faithfulness/index.d.ts +2 -0
- package/dist/types/scorers/metrics/faithfulness/prompts.d.ts +94 -0
- package/dist/types/scorers/metrics/hallucination/hallucination.d.ts +67 -0
- package/dist/types/scorers/metrics/hallucination/index.d.ts +3 -0
- package/dist/types/scorers/metrics/hallucination/prompts.d.ts +63 -0
- package/dist/types/scorers/metrics/instruction-adherence/index.d.ts +2 -0
- package/dist/types/scorers/metrics/instruction-adherence/instruction-adherence.d.ts +67 -0
- package/dist/types/scorers/metrics/instruction-adherence/prompts.d.ts +78 -0
- package/package.json +32 -14
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
/**
 * Validates one judge verdict about a single context: a `verdict` string
 * (the prompt asks the judge for 'yes' or 'no', but any string is accepted
 * here) and a `reason` string justifying it.
 */
const hallucinationVerdictShape = {
    verdict: z.string(),
    reason: z.string()
};
export const HallucinationVerdictSchema = z.object(hallucinationVerdictShape);
|
|
9
|
+
/**
 * Validates the judge's full verdict payload: an object whose `verdicts`
 * key holds a list of entries matching HallucinationVerdictSchema.
 */
export const VerdictsSchema = z.object({ verdicts: HallucinationVerdictSchema.array() });
|
|
15
|
+
/**
 * Validates the judge's summary payload: an object with a single string
 * `reason` key.
 */
export const ReasonSchema = z.object({ reason: z.string() });
|
|
21
|
+
/**
 * Render contexts as a 1-based numbered list, one context per line.
 *
 * A missing (null/undefined) list is treated as empty so that building a
 * prompt never fails with a TypeError; an empty list renders as ''.
 */
function formatNumberedContexts(contexts) {
    const items = contexts == null ? [] : contexts;
    return items.map((context, index) => `${index + 1}. ${context}`).join('\n');
}
/**
 * Prompt templates for the hallucination scorer.
 *
 * Both builders are pure: they return the prompt text and have no side effects.
 */
export class HallucinationTemplate {
    /**
     * Build the prompt asking the judge for one 'yes'/'no' verdict per context.
     *
     * @param actualOutput - The LLM response being checked.
     * @param contexts - Ground-truth context strings; null/undefined is treated as an empty list.
     * @returns The prompt text, ending with "JSON:" so the judge continues with a JSON object.
     */
    static generateVerdicts(actualOutput, contexts) {
        return `==== TASK INSTRUCTIONS ====
You will be provided with an \`actual output\` (the response of an LLM to a particular query) and \`contexts\` (ground truth contextual information from a knowledge base).
Your task is to take each context in contexts and determine whether the \`actual output\` factually agrees with the context.

Additional notes:
You should NOT use any prior knowledge you have in your decision making process; take each context at face value.
Since you will determine a verdict for EACH context, the number of 'verdicts' is EXACTLY EQUAL TO the number of contexts.
You should be lenient in your judgment when the actual output lacks detail with respect to the context segment; you should ONLY provide a 'no' answer if the context contradicts the actual output.

==== FORMATTING INSTRUCTIONS ====
You should return a JSON object with a key 'verdicts', which is a list of JSON objects. Each JSON object corresponds to a context in \`contexts\`, and should have 2 fields: 'verdict' and 'reason'.
The 'verdict' key should be EXACTLY one of 'yes' or 'no', representing whether the \`actual output\` factually agrees with the context segment.
The 'reason' is the justification for the verdict. If your verdict is 'no', try to provide a correction in the reason.

==== EXAMPLE ====
Example contexts: ["Einstein won the Nobel Prize for his discovery of the photoelectric effect.", "Einstein won the Nobel Prize in 1968."]
Example actual output: "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect."

Example:
{
"verdicts": [
{
"verdict": "yes",
"reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect."
},
{
"verdict": "no",
"reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969."
}
]
}

==== YOUR TURN ====
Contexts:
${formatNumberedContexts(contexts)}

Actual Output:
${actualOutput}

JSON:`;
    }
    /**
     * Build the prompt asking the judge for a single summary reason describing
     * whether the output contradicts the contexts.
     *
     * @param actualOutput - The LLM response being checked.
     * @param contexts - Ground-truth context strings; null/undefined is treated as an empty list.
     * @returns The prompt text, ending with "JSON:" so the judge continues with a JSON object.
     */
    static generateReason(actualOutput, contexts) {
        return `==== TASK INSTRUCTIONS ====
You will be provided with an \`actual output\` (the response of an LLM to a particular query) and \`contexts\` (ground truth contextual information from a knowledge base).
Your task is to analyze whether the actual output contains any hallucinations (factual inaccuracies) when compared to the provided contexts.

Please provide a clear and concise reason summarizing your analysis. Focus on any contradictions between the actual output and the contexts, or note if the output is factually consistent with the contexts.

==== FORMATTING INSTRUCTIONS ====
Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{
"reason": "The output contains factual inaccuracies because..."
}

Or if no hallucinations:
{
"reason": "The output is factually consistent with the provided contexts."
}

==== YOUR TURN ====
Contexts:
${formatNumberedContexts(contexts)}

Actual Output:
${actualOutput}

JSON:`;
    }
}
|
|
102
|
+
//# sourceMappingURL=prompts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/hallucination/prompts.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB;;GAEG;AACH,MAAM,CAAC,MAAM,0BAA0B,GAAG,CAAC,CAAC,MAAM,CAAC;IACjD,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE;IACnB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAIH;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACrC,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,0BAA0B,CAAC;CAC9C,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,CAAC,MAAM,CAAC;IACnC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,OAAO,qBAAqB;IAChC;;OAEG;IACH,MAAM,CAAC,gBAAgB,CAAC,YAAoB,EAAE,QAAkB;QAC9D,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAkCT,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;EAGvE,YAAY;;MAER,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,cAAc,CAAC,YAAoB,EAAE,QAAkB;QAC5D,OAAO;;;;;;;;;;;;;;;;;;;;EAoBT,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;EAGvE,YAAY;;MAER,CAAC;IACL,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/index.ts"],"names":[],"mappings":"AAAA,cAAc,4BAA4B,CAAC;AAC3C,cAAc,cAAc,CAAC"}
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
// Compiler-emitted async helper: drives a generator function as if it were an
// async function. Each yielded value is awaited through the promise
// constructor P (Promise when P is not supplied), and the resulting promise
// settles with the generator's return value or with the first uncaught error.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in a P promise unless it already is one.
    function adopt(value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) {
            resolve(value);
        });
    }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with a fulfilled value.
        function fulfilled(value) {
            try {
                step(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        }
        // Resume the generator by throwing a rejection reason into it.
        function rejected(value) {
            try {
                step(generator["throw"](value));
            }
            catch (e) {
                reject(e);
            }
        }
        // Settle on completion, otherwise wait for the yielded value.
        function step(result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                adopt(result.value).then(fulfilled, rejected);
            }
        }
        // Instantiate the generator with the caller's `this`/arguments and start it.
        generator = generator.apply(thisArg, _arguments || []);
        step(generator.next());
    });
};
|
|
10
|
+
import { JudgevalScorer } from '../../base-scorer.js';
|
|
11
|
+
import { APIScorer } from '../../../constants.js';
|
|
12
|
+
import { info } from '../../../common/logger.js';
|
|
13
|
+
import { InstructionAdherenceTemplate, InstructionsSchema, VerdictsSchema } from './prompts.js';
|
|
14
|
+
import { createJudge } from '../../../judges/index.js';
|
|
15
|
+
// Parameters every example must provide before it can be scored.
const required_params = ['input', 'actualOutput'];
/**
 * InstructionAdherenceScorer evaluates how well an LLM follows instructions:
 * it asks a judge model to extract the instructions contained in the input,
 * then asks it for one verdict per instruction against the actual output.
 *
 * The score is the average of the per-instruction verdict scores
 * (1 = followed, 0.5 = partially followed, 0 = not followed). When no
 * instructions are found the score is 1.
 */
export class InstructionAdherenceScorer extends JudgevalScorer {
    /**
     * Create a new InstructionAdherenceScorer
     *
     * @param threshold - Success threshold (default: 0.5); forced to 1 when strict_mode is set
     * @param model - Model to use for evaluation, resolved through createJudge (default: undefined)
     * @param include_reason - Whether to include a reason for the score (default: true)
     * @param async_mode - Whether scoreExample uses the async judge calls (default: false)
     * @param strict_mode - Whether to use strict mode (default: false)
     * @param verbose_mode - Whether to include verbose logs (default: false)
     */
    constructor(threshold = 0.5, model = undefined, include_reason = true, async_mode = false, strict_mode = false, verbose_mode = false) {
        super(APIScorer.INSTRUCTION_ADHERENCE, strict_mode ? 1 : threshold, undefined, include_reason, async_mode, strict_mode, verbose_mode);
        this._instructions = [];
        this._verdicts = [];
        const { judge, usingNativeModel } = createJudge(model);
        this.model = judge;
        this.using_native_model = usingNativeModel;
        this.evaluation_model = this.model.getModelName();
    }
    /**
     * Parse a raw judge response as JSON and return the value stored under
     * `key`, or [] when that value is falsy. No schema validation is applied.
     *
     * @throws Error "Failed to parse response: ..." when the text is not usable JSON
     */
    _parseUnvalidated(res, key) {
        try {
            const data = JSON.parse(res);
            return data[key] || [];
        }
        catch (error) {
            throw new Error(`Failed to parse response: ${error}`);
        }
    }
    /**
     * Parse a raw judge response as JSON, validate it against `schema` and
     * return the validated value stored under `key`.
     *
     * @throws SyntaxError when the text is not JSON
     * @throws Error "Invalid response format: ..." when validation fails
     */
    _parseValidated(res, schema, key) {
        const result = schema.safeParse(JSON.parse(res));
        if (result.success) {
            return result.data[key];
        }
        throw new Error(`Invalid response format: ${result.error}`);
    }
    /**
     * Log why the first, schema-validated attempt was rejected before retrying,
     * so the failure is not swallowed silently.
     */
    _logRetry(error) {
        const message = error instanceof Error ? error.message : String(error);
        info(`Judge response rejected (${message}); retrying once without schema validation`);
    }
    /**
     * Ask the judge asynchronously and extract the list stored under `key`.
     *
     * Native models are queried once and parsed without validation. Other
     * judges are queried and validated against `schema`; if that attempt fails
     * for any reason the judge is queried one more time and the second
     * response is parsed without validation.
     */
    _aRequestList(prompt, schema, key) {
        return __awaiter(this, void 0, void 0, function* () {
            if (this.using_native_model) {
                const res = yield this.model.aGenerate(prompt);
                return this._parseUnvalidated(res, key);
            }
            try {
                const res = yield this.model.aGenerate(prompt);
                return this._parseValidated(res, schema, key);
            }
            catch (error) {
                this._logRetry(error);
                const res = yield this.model.aGenerate(prompt);
                return this._parseUnvalidated(res, key);
            }
        });
    }
    /**
     * Synchronous counterpart of _aRequestList, using model.generate.
     */
    _requestList(prompt, schema, key) {
        if (this.using_native_model) {
            return this._parseUnvalidated(this.model.generate(prompt), key);
        }
        try {
            return this._parseValidated(this.model.generate(prompt), schema, key);
        }
        catch (error) {
            this._logRetry(error);
            return this._parseUnvalidated(this.model.generate(prompt), key);
        }
    }
    /**
     * Extract instructions from input text
     */
    _aGetInstructions(input) {
        return __awaiter(this, void 0, void 0, function* () {
            const prompt = InstructionAdherenceTemplate.getInstructions(input);
            return yield this._aRequestList(prompt, InstructionsSchema, 'instructions');
        });
    }
    /**
     * Extract instructions from input text (synchronous)
     */
    _getInstructions(input) {
        const prompt = InstructionAdherenceTemplate.getInstructions(input);
        return this._requestList(prompt, InstructionsSchema, 'instructions');
    }
    /**
     * Generate verdicts for each instruction. Returns [] without calling the
     * judge when there are no instructions.
     */
    _aGetVerdicts(instructions, actualOutput) {
        return __awaiter(this, void 0, void 0, function* () {
            if (instructions.length === 0) {
                return [];
            }
            const prompt = InstructionAdherenceTemplate.generateVerdicts(instructions, actualOutput);
            return yield this._aRequestList(prompt, VerdictsSchema, 'verdicts');
        });
    }
    /**
     * Generate verdicts for each instruction (synchronous). Returns [] without
     * calling the judge when there are no instructions.
     */
    _getVerdicts(instructions, actualOutput) {
        if (instructions.length === 0) {
            return [];
        }
        const prompt = InstructionAdherenceTemplate.generateVerdicts(instructions, actualOutput);
        return this._requestList(prompt, VerdictsSchema, 'verdicts');
    }
    /**
     * Calculate the instruction adherence score: the mean of the verdict
     * scores, or 1 when there are no verdicts.
     *
     * Verdicts parsed without schema validation may carry the score as a
     * numeric string or omit it. Numeric strings are converted so that `+=`
     * adds instead of concatenating; anything else is rejected rather than
     * silently producing NaN.
     *
     * @throws Error when a verdict has no finite numeric score
     */
    _computeScore() {
        if (this._verdicts.length === 0) {
            return 1;
        }
        let totalScore = 0;
        for (const verdict of this._verdicts) {
            const raw = verdict == null ? undefined : verdict.score;
            const score = typeof raw === 'string' && raw.trim() !== '' ? Number(raw) : raw;
            if (typeof score !== 'number' || !Number.isFinite(score)) {
                throw new Error(`Verdict has a non-numeric score: ${JSON.stringify(verdict)}`);
            }
            totalScore += score;
        }
        return totalScore / this._verdicts.length;
    }
    /**
     * Create verbose logs for debugging; returns null unless verbose_mode is set.
     */
    _createVerboseLogs() {
        if (!this.verbose_mode) {
            return null;
        }
        const steps = [
            `Instructions:\n${JSON.stringify(this._instructions, null, 2)}`,
            `Score: ${this.score}\nReason: ${this.reason || "No reason provided"}`
        ];
        return steps.join('\n\n');
    }
    /**
     * Check that the example provides every parameter in required_params.
     * A falsy value (including an empty string) counts as missing.
     *
     * @throws Error naming the first missing parameter
     */
    _checkExampleParams(example) {
        for (const param of required_params) {
            if (!example[param]) {
                throw new Error(`Example is missing required parameter: ${param}`);
            }
        }
    }
    /**
     * Turn the current instructions/verdicts into the scorer state and the
     * result object returned on success. Shared by the sync and async paths.
     */
    _buildSuccessResult() {
        // Add instructions and verdicts to additional metadata
        const additional_metadata = {
            instructions: this._instructions,
            verdicts: this._verdicts
        };
        this.score = this._computeScore();
        this.reason = this._verdicts.length > 0 ? JSON.stringify(this._verdicts) : 'No instructions found';
        this.success = this._successCheck();
        // Clear any error left over from a previous failed run of this instance.
        this.error = null;
        const verbose_logs = this._createVerboseLogs();
        info(`Scoring completed with score: ${this.score}`);
        return {
            name: this.type,
            threshold: this.threshold,
            success: this.success,
            score: this.score,
            reason: this.reason || "",
            strict_mode: this.strict_mode,
            evaluation_model: this.evaluation_model || null,
            error: null,
            evaluation_cost: null,
            verbose_logs: verbose_logs,
            additional_metadata: additional_metadata
        };
    }
    /**
     * Record a scoring failure on the instance and build the failed result
     * (success false, score 0). Shared by the sync and async paths.
     */
    _buildErrorResult(error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        this.error = errorMessage;
        return {
            name: this.type,
            threshold: this.threshold,
            success: false,
            score: 0,
            reason: `Error during scoring: ${errorMessage}`,
            strict_mode: this.strict_mode,
            evaluation_model: this.evaluation_model || null,
            error: errorMessage,
            evaluation_cost: null,
            verbose_logs: null,
            additional_metadata: {}
        };
    }
    /**
     * Score an example synchronously. Never throws: failures are reported
     * through the `error` field of the returned result.
     */
    syncScoreExample(example) {
        info("Starting example scoring (sync mode)");
        try {
            this._checkExampleParams(example);
            this._instructions = this._getInstructions(example.input);
            this._verdicts = this._getVerdicts(this._instructions, example.actualOutput);
            return this._buildSuccessResult();
        }
        catch (error) {
            return this._buildErrorResult(error);
        }
    }
    /**
     * Score an example asynchronously. Delegates to syncScoreExample when
     * async_mode is off. Failures are reported through the `error` field of
     * the resolved result rather than by rejecting.
     */
    scoreExample(example) {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.async_mode) {
                return this.syncScoreExample(example);
            }
            info("Starting example scoring (async mode)");
            try {
                this._checkExampleParams(example);
                this._instructions = yield this._aGetInstructions(example.input);
                this._verdicts = yield this._aGetVerdicts(this._instructions, example.actualOutput);
                return this._buildSuccessResult();
            }
            catch (error) {
                return this._buildErrorResult(error);
            }
        });
    }
    /**
     * Get the name of the scorer
     */
    get name() {
        return "Instruction Adherence";
    }
}
|
|
378
|
+
//# sourceMappingURL=instruction-adherence.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"instruction-adherence.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/instruction-adherence.ts"],"names":[],"mappings":";;;;;;;;;AAEA,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAO,IAAI,EAAe,MAAM,2BAA2B,CAAC;AACnE,OAAO,EACL,4BAA4B,EAE5B,kBAAkB,EAClB,cAAc,EACf,MAAM,cAAc,CAAC;AACtB,OAAO,EAAS,WAAW,EAAE,MAAM,0BAA0B,CAAC;AAE9D,sCAAsC;AACtC,MAAM,eAAe,GAAG,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;AAElD;;;;;GAKG;AACH,MAAM,OAAO,0BAA2B,SAAQ,cAAc;IAM5D;;;;;;;;;OASG;IACH,YACE,YAAoB,GAAG,EACvB,QAAoC,SAAS,EAC7C,iBAA0B,IAAI,EAC9B,aAAsB,KAAK,EAC3B,cAAuB,KAAK,EAC5B,eAAwB,KAAK;QAE7B,KAAK,CACH,SAAS,CAAC,qBAAqB,EAC/B,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,EAC3B,SAAS,EACT,cAAc,EACd,UAAU,EACV,WAAW,EACX,YAAY,CACb,CAAC;QA7BI,kBAAa,GAAa,EAAE,CAAC;QAC7B,cAAS,GAAkC,EAAE,CAAC;QA8BpD,MAAM,EAAE,KAAK,EAAE,gBAAgB,EAAE,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC;QACvD,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,kBAAkB,GAAG,gBAAgB,CAAC;QAC3C,IAAI,CAAC,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;IACpD,CAAC;IAED;;OAEG;IACW,iBAAiB,CAAC,KAAa;;YAC3C,MAAM,MAAM,GAAG,4BAA4B,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAEnE,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBAC/C,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;gBACjC,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;gBACxD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC;oBACH,oDAAoD;oBACpD,MAAM,yBAAyB,GAAG,CAAC,QAAgB,EAA8B,EAAE;wBACjF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;wBACpC,MAAM,MAAM,GAAG,kBAAkB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;wBACpD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;4BACnB,OAAO,MAAM,CAAC,IAAI,CAAC;wBACrB,CAAC;wBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;oBAC9D,CAAC,CAAC;oBAEF,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,OAAO,yBAAyB,CAAC,GAAG,CAAC,CAAC,YAAY,CAAC;gBA
CrD,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,IAAI,CAAC;wBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;wBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;oBACjC,CAAC;oBAAC,OAAO,UAAU,EAAE,CAAC;wBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;oBAC7D,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACK,gBAAgB,CAAC,KAAa;QACpC,MAAM,MAAM,GAAG,4BAA4B,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QAEnE,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACxC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;YACjC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,IAAI,CAAC;gBACH,oDAAoD;gBACpD,MAAM,yBAAyB,GAAG,CAAC,QAAgB,EAA8B,EAAE;oBACjF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;oBACpC,MAAM,MAAM,GAAG,kBAAkB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBACpD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;wBACnB,OAAO,MAAM,CAAC,IAAI,CAAC;oBACrB,CAAC;oBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC9D,CAAC,CAAC;gBAEF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,OAAO,yBAAyB,CAAC,GAAG,CAAC,CAAC,YAAY,CAAC;YACrD,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;gBACjC,CAAC;gBAAC,OAAO,UAAU,EAAE,CAAC;oBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACW,aAAa,CAAC,YAAsB,EAAE,YAAoB;;YACtE,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC9B,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,MAAM,MAAM,GAAG,4BAA4B,CAAC,gBAAgB,CAAC,YAAY,EAAE,YAAY,CAAC,CAAC;YAEzF,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBAC/C,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CA
AC,QAAQ,IAAI,EAAE,CAAC;gBAC7B,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;gBACxD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC;oBACH,oDAAoD;oBACpD,MAAM,qBAAqB,GAAG,CAAC,QAAgB,EAA+C,EAAE;wBAC9F,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;wBACpC,MAAM,MAAM,GAAG,cAAc,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;wBAChD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;4BACnB,OAAO,MAAM,CAAC,IAAI,CAAC;wBACrB,CAAC;wBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;oBAC9D,CAAC,CAAC;oBAEF,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,OAAO,qBAAqB,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBAC7C,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,IAAI,CAAC;wBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;wBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;oBAC7B,CAAC;oBAAC,OAAO,UAAU,EAAE,CAAC;wBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;oBAC7D,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACK,YAAY,CAAC,YAAsB,EAAE,YAAoB;QAC/D,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC9B,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,MAAM,GAAG,4BAA4B,CAAC,gBAAgB,CAAC,YAAY,EAAE,YAAY,CAAC,CAAC;QAEzF,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACxC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;YAC7B,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,IAAI,CAAC;gBACH,oDAAoD;gBACpD,MAAM,qBAAqB,GAAG,CAAC,QAAgB,EAA+C,EAAE;oBAC9F,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;oBACpC,MAAM,MAAM,GAAG,cAAc,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAChD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;wBACnB,OAAO,MAAM,CAAC,IAAI,CAAC;oBACrB,CAAC;oBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC9D,CAAC,CAAC;gBAEF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,OAAO,qBAAqB,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YAC7C,CAAC;YAAC,OAAO,KAAK,EA
AE,CAAC;gBACf,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC7B,CAAC;gBAAC,OAAO,UAAU,EAAE,CAAC;oBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa;QACnB,IAAI,IAAI,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAChC,OAAO,CAAC,CAAC;QACX,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YACrC,UAAU,IAAI,OAAO,CAAC,KAAK,CAAC;QAC9B,CAAC;QAED,OAAO,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC;IAC5C,CAAC;IAED;;OAEG;IACK,kBAAkB;QACxB,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,KAAK,GAAG;YACZ,kBAAkB,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE;YAC/D,UAAU,IAAI,CAAC,KAAK,aAAa,IAAI,CAAC,MAAM,IAAI,oBAAoB,EAAE;SACvE,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC5B,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,OAAgB;QAC1C,KAAK,MAAM,KAAK,IAAI,eAAe,EAAE,CAAC;YACpC,IAAI,KAAK,KAAK,OAAO,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;gBACxC,MAAM,IAAI,KAAK,CAAC,8CAA8C,CAAC,CAAC;YAClE,CAAC;iBAAM,IAAI,KAAK,KAAK,cAAc,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC;gBAC7D,MAAM,IAAI,KAAK,CAAC,qDAAqD,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,gBAAgB,CAAC,OAAgB;QAC/B,IAAI,CAAC,sCAAsC,CAAC,CAAC;QAE7C,IAAI,CAAC;YACH,4BAA4B;YAC5B,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;YAElC,kBAAkB;YAClB,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,KAAM,CAAC,CAAC;YAC3D,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,aAAa,EAAE,OAAO,CAAC,YAAsB,CAAC,CAAC;YAEvF,uDAAuD;YACvD,MAAM,mBAAmB,GAAG;gBAC1B,YAAY,EAAE,IAAI,CAAC,aAAa;gBAChC,QAAQ,EAAE,IAAI,CAAC,SAAS;aACzB,CAAC;YAEF,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;YAClC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,uBAAuB,CAAC;YACnG,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;YACpC,MAAM,YAAY,GAAG,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAE/C,IAAI,C
AAC,iCAAiC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;YAEpD,mDAAmD;YACnD,OAAO;gBACL,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,SAAS,EAAE,IAAI,CAAC,SAAS;gBACzB,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,EAAE;gBACzB,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;gBAC/C,KAAK,EAAE,IAAI;gBACX,eAAe,EAAE,IAAI;gBACrB,YAAY,EAAE,YAAY;gBAC1B,mBAAmB,EAAE,mBAAmB;aACzC,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,gBAAgB;YAChB,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,IAAI,CAAC,KAAK,GAAG,YAAY,CAAC;YAE1B,OAAO;gBACL,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,SAAS,EAAE,IAAI,CAAC,SAAS;gBACzB,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,CAAC;gBACR,MAAM,EAAE,yBAAyB,YAAY,EAAE;gBAC/C,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;gBAC/C,KAAK,EAAE,YAAY;gBACnB,eAAe,EAAE,IAAI;gBACrB,YAAY,EAAE,IAAI;gBAClB,mBAAmB,EAAE,EAAE;aACxB,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACG,YAAY,CAAC,OAAgB;;YACjC,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;gBACrB,OAAO,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YACxC,CAAC;YAED,IAAI,CAAC,uCAAuC,CAAC,CAAC;YAE9C,IAAI,CAAC;gBACH,4BAA4B;gBAC5B,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;gBAElC,kBAAkB;gBAClB,IAAI,CAAC,aAAa,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,OAAO,CAAC,KAAM,CAAC,CAAC;gBAClE,IAAI,CAAC,SAAS,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,aAAa,EAAE,OAAO,CAAC,YAAsB,CAAC,CAAC;gBAE9F,uDAAuD;gBACvD,MAAM,mBAAmB,GAAG;oBAC1B,YAAY,EAAE,IAAI,CAAC,aAAa;oBAChC,QAAQ,EAAE,IAAI,CAAC,SAAS;iBACzB,CAAC;gBAEF,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;gBAClC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,uBAAuB,CAAC;gBACnG,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;gBACpC,MAAM,YAAY,GAAG,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAE/C,IAAI,CAAC,iCAAiC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;gBAEpD,mDAAmD;gBACnD,OAAO;oBACL,IAAI,EAAE,IAAI,CAAC,IAAI;oBACf,SAAS,EAAE,IAAI,CAAC,SAAS;oBACzB,OAAO,EAAE,IAAI,CAAC,OAAO;oBACrB,KAAK,EAAE,IAAI,CAAC,KAAK;oBA
CjB,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,EAAE;oBACzB,WAAW,EAAE,IAAI,CAAC,WAAW;oBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;oBAC/C,KAAK,EAAE,IAAI;oBACX,eAAe,EAAE,IAAI;oBACrB,YAAY,EAAE,YAAY;oBAC1B,mBAAmB,EAAE,mBAAmB;iBACzC,CAAC;YACJ,CAAC;YAAC,OAAO,KAAU,EAAE,CAAC;gBACpB,gBAAgB;gBAChB,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAC5E,IAAI,CAAC,KAAK,GAAG,YAAY,CAAC;gBAE1B,OAAO;oBACL,IAAI,EAAE,IAAI,CAAC,IAAI;oBACf,SAAS,EAAE,IAAI,CAAC,SAAS;oBACzB,OAAO,EAAE,KAAK;oBACd,KAAK,EAAE,CAAC;oBACR,MAAM,EAAE,yBAAyB,YAAY,EAAE;oBAC/C,WAAW,EAAE,IAAI,CAAC,WAAW;oBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;oBAC/C,KAAK,EAAE,YAAY;oBACnB,eAAe,EAAE,IAAI;oBACrB,YAAY,EAAE,IAAI;oBAClB,mBAAmB,EAAE,EAAE;iBACxB,CAAC;YACJ,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACH,IAAI,IAAI;QACN,OAAO,uBAAuB,CAAC;IACjC,CAAC;CACF"}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
/**
|
|
3
|
+
* Schema for a single instruction adherence verdict
|
|
4
|
+
*/
|
|
5
|
+
export const VerdictSchema = z.object({
|
|
6
|
+
instruction: z.string(),
|
|
7
|
+
score: z.number(),
|
|
8
|
+
reason: z.string()
|
|
9
|
+
});
|
|
10
|
+
/**
|
|
11
|
+
* Schema for a list of verdicts
|
|
12
|
+
*/
|
|
13
|
+
export const VerdictsSchema = z.object({
|
|
14
|
+
verdicts: z.array(VerdictSchema)
|
|
15
|
+
});
|
|
16
|
+
/**
|
|
17
|
+
* Schema for a list of instructions
|
|
18
|
+
*/
|
|
19
|
+
export const InstructionsSchema = z.object({
|
|
20
|
+
instructions: z.array(z.string())
|
|
21
|
+
});
|
|
22
|
+
/**
|
|
23
|
+
* Templates for prompts used in the InstructionAdherenceScorer
|
|
24
|
+
*/
|
|
25
|
+
export class InstructionAdherenceTemplate {
|
|
26
|
+
/**
|
|
27
|
+
* Generate a prompt to extract instructions from input text
|
|
28
|
+
*/
|
|
29
|
+
static getInstructions(input) {
|
|
30
|
+
return `You will be presented with a piece of text. Your task is to break down the text and generate a list of the instructions contained within the text.
|
|
31
|
+
|
|
32
|
+
===== START OF EXAMPLES =====
|
|
33
|
+
Example 1:
|
|
34
|
+
Example text: Hello my name is John Doe. I like cars. Write two poems about the weather and create a joke. Also what is 5 + 5?
|
|
35
|
+
|
|
36
|
+
Output:
|
|
37
|
+
{
|
|
38
|
+
"instructions": ["Write two poem about the weather", "Create a joke", "What is 5 + 5?"]
|
|
39
|
+
}
|
|
40
|
+
===== END OF EXAMPLES =====
|
|
41
|
+
|
|
42
|
+
**
|
|
43
|
+
IMPORTANT: Please return your answer in valid JSON format, with the "instructions" key mapping to a list of strings. No words or explanation is needed.
|
|
44
|
+
**
|
|
45
|
+
|
|
46
|
+
==== START OF INPUT ====
|
|
47
|
+
Text:
|
|
48
|
+
${input}
|
|
49
|
+
==== END OF INPUT ====
|
|
50
|
+
|
|
51
|
+
==== YOUR ANSWER ====
|
|
52
|
+
JSON:`;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Generate a prompt to evaluate adherence to instructions
|
|
56
|
+
*/
|
|
57
|
+
static generateVerdicts(instructions, actualOutput) {
|
|
58
|
+
return `You will be presented with a list of instructions and a piece of text. For each instruction, determine if the instruction was completed in the text. There are 3 categories: either completed, partially completed, or not completed. The scores for these will be 1, 0.5, and 0 respectively.
|
|
59
|
+
Go through each instruction and provide score for each instruction as well as the reasoning for that score.
|
|
60
|
+
|
|
61
|
+
==== FORMATTING YOUR ANSWER ====
|
|
62
|
+
Please return your answer in JSON format, with a list of JSON objects with keys "instruction", "score", and "reason". No words or explanation beyond the output JSON is needed.
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
===== START OF EXAMPLES =====
|
|
66
|
+
Example 1:
|
|
67
|
+
instructions: ["Write two poems about the weather", "Create a joke", "What is 5 + 5?"]
|
|
68
|
+
output: Poem 1: The Sun's Embrace
|
|
69
|
+
The sun climbs high, a golden flame,
|
|
70
|
+
It whispers warmth, it calls my name.
|
|
71
|
+
The sky, a canvas, blue and clear,
|
|
72
|
+
A perfect day for cars, my dear.
|
|
73
|
+
|
|
74
|
+
The asphalt hums beneath the wheels,
|
|
75
|
+
A symphony of speed it feels.
|
|
76
|
+
The weather smiles, no clouds in sight,
|
|
77
|
+
A driver's joy, pure delight.
|
|
78
|
+
|
|
79
|
+
Poem 2: The Storm's Dance
|
|
80
|
+
A sunlit meadow, alive with whispers of wind, where daisies dance and hope begins again. Each petal holds a promise—bright, unbruised— a symphony of light that cannot be refused.
|
|
81
|
+
|
|
82
|
+
Joke
|
|
83
|
+
Why dont cars ever get cold in the winter?
|
|
84
|
+
Because they have radiators!
|
|
85
|
+
|
|
86
|
+
Math Answer
|
|
87
|
+
5 + 5 = 10
|
|
88
|
+
|
|
89
|
+
YOUR JSON OUTPUT:
|
|
90
|
+
{
|
|
91
|
+
"verdicts": [
|
|
92
|
+
{
|
|
93
|
+
"instruction": "Write two poem about the weather",
|
|
94
|
+
"score": 0.5,
|
|
95
|
+
"reason": "The output contained one poem about the weather, but the other poem was not about the weather."
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
"instruction": "Create a joke",
|
|
99
|
+
"score": 1,
|
|
100
|
+
"reason": "There was a joke created in the output."
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
"instruction": "What is 5 + 5?",
|
|
104
|
+
"score": 1,
|
|
105
|
+
"reason": "The answer to the math question was provided in the output."
|
|
106
|
+
}
|
|
107
|
+
]
|
|
108
|
+
}
|
|
109
|
+
===== END OF EXAMPLES =====
|
|
110
|
+
|
|
111
|
+
==== START OF INPUT ====
|
|
112
|
+
instructions: ${JSON.stringify(instructions)}
|
|
113
|
+
output: ${actualOutput}
|
|
114
|
+
==== END OF INPUT ====
|
|
115
|
+
|
|
116
|
+
==== YOUR ANSWER ====
|
|
117
|
+
JSON:`;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
//# sourceMappingURL=prompts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/prompts.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAG,CAAC,CAAC,MAAM,CAAC;IACpC,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE;IACvB,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE;IACjB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACrC,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,aAAa,CAAC;CACjC,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC;IACzC,YAAY,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;CAClC,CAAC,CAAC;AAYH;;GAEG;AACH,MAAM,OAAO,4BAA4B;IACvC;;OAEG;IACH,MAAM,CAAC,eAAe,CAAC,KAAa;QAClC,OAAO;;;;;;;;;;;;;;;;;;EAkBT,KAAK;;;;MAID,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,gBAAgB,CAAC,YAAsB,EAAE,YAAoB;QAClE,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBAsDK,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC;UAClC,YAAY;;;;MAIhB,CAAC;IACL,CAAC;CACF"}
|
|
@@ -8,7 +8,7 @@ export declare function debug(message: string, meta?: Record<string, any>): void
|
|
|
8
8
|
/**
|
|
9
9
|
* Log an info message (alias for info)
|
|
10
10
|
*/
|
|
11
|
-
export declare function log(message: string,
|
|
11
|
+
export declare function log(message: string, meta?: Record<string, any>): void;
|
|
12
12
|
/**
|
|
13
13
|
* Log an info message
|
|
14
14
|
*/
|
|
@@ -22,6 +22,7 @@ export declare enum APIScorer {
|
|
|
22
22
|
export declare const UNBOUNDED_SCORERS: Set<APIScorer>;
|
|
23
23
|
export declare const ROOT_API = "https://api.judgmentlabs.ai";
|
|
24
24
|
export declare const JUDGMENT_DATASETS_PUSH_API_URL = "https://api.judgmentlabs.ai/datasets/push/";
|
|
25
|
+
export declare const JUDGMENT_DATASETS_APPEND_API_URL = "https://api.judgmentlabs.ai/datasets/insert_examples/";
|
|
25
26
|
export declare const JUDGMENT_DATASETS_PULL_API_URL = "https://api.judgmentlabs.ai/datasets/pull/";
|
|
26
27
|
export declare const JUDGMENT_DATASETS_DELETE_API_URL = "https://api.judgmentlabs.ai/datasets/delete/";
|
|
27
28
|
export declare const JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = "https://api.judgmentlabs.ai/datasets/export_jsonl/";
|
|
@@ -34,6 +34,11 @@ export declare class EvalDatasetClient {
|
|
|
34
34
|
* @returns AxiosResponse containing the stream if successful.
|
|
35
35
|
*/
|
|
36
36
|
exportJsonl(alias: string, projectName: string): Promise<AxiosResponse>;
|
|
37
|
+
/**
|
|
38
|
+
* Appends examples to an existing dataset on the Judgment platform.
|
|
39
|
+
* @returns True if successful, false otherwise.
|
|
40
|
+
*/
|
|
41
|
+
append(alias: string, examples: Example[], projectName: string): Promise<boolean>;
|
|
37
42
|
private getAuthHeaders;
|
|
38
43
|
private handleApiError;
|
|
39
44
|
}
|