judgeval 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/dist/cjs/common/logger.js +28 -24
  2. package/dist/cjs/common/logger.js.map +1 -1
  3. package/dist/cjs/common/tracer.js +80 -130
  4. package/dist/cjs/common/tracer.js.map +1 -1
  5. package/dist/cjs/constants.js +2 -1
  6. package/dist/cjs/constants.js.map +1 -1
  7. package/dist/cjs/data/datasets/eval-dataset-client.js +45 -0
  8. package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -1
  9. package/dist/cjs/e2etests/eval-operations.test.js +3 -3
  10. package/dist/cjs/exporters/otel-exporter.js +352 -0
  11. package/dist/cjs/exporters/otel-exporter.js.map +1 -0
  12. package/dist/cjs/judges/index.js +217 -0
  13. package/dist/cjs/judges/index.js.map +1 -0
  14. package/dist/cjs/run-evaluation.js +13 -13
  15. package/dist/cjs/run-evaluation.js.map +1 -1
  16. package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js +610 -0
  17. package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
  18. package/dist/cjs/scorers/metrics/answer-correctness/index.js +19 -0
  19. package/dist/cjs/scorers/metrics/answer-correctness/index.js.map +1 -0
  20. package/dist/cjs/scorers/metrics/answer-correctness/prompts.js +175 -0
  21. package/dist/cjs/scorers/metrics/answer-correctness/prompts.js.map +1 -0
  22. package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js +525 -0
  23. package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
  24. package/dist/cjs/scorers/metrics/answer-relevancy/index.js +19 -0
  25. package/dist/cjs/scorers/metrics/answer-relevancy/index.js.map +1 -0
  26. package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js +179 -0
  27. package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
  28. package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js +524 -0
  29. package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
  30. package/dist/cjs/scorers/metrics/faithfulness/index.js +19 -0
  31. package/dist/cjs/scorers/metrics/faithfulness/index.js.map +1 -0
  32. package/dist/cjs/scorers/metrics/faithfulness/prompts.js +232 -0
  33. package/dist/cjs/scorers/metrics/faithfulness/prompts.js.map +1 -0
  34. package/dist/cjs/scorers/metrics/hallucination/hallucination.js +390 -0
  35. package/dist/cjs/scorers/metrics/hallucination/hallucination.js.map +1 -0
  36. package/dist/cjs/scorers/metrics/hallucination/index.js +11 -0
  37. package/dist/cjs/scorers/metrics/hallucination/index.js.map +1 -0
  38. package/dist/cjs/scorers/metrics/hallucination/prompts.js +106 -0
  39. package/dist/cjs/scorers/metrics/hallucination/prompts.js.map +1 -0
  40. package/dist/cjs/scorers/metrics/instruction-adherence/index.js +19 -0
  41. package/dist/cjs/scorers/metrics/instruction-adherence/index.js.map +1 -0
  42. package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js +382 -0
  43. package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
  44. package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js +124 -0
  45. package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
  46. package/dist/esm/common/logger.js +16 -11
  47. package/dist/esm/common/logger.js.map +1 -1
  48. package/dist/esm/common/tracer.js +78 -128
  49. package/dist/esm/common/tracer.js.map +1 -1
  50. package/dist/esm/constants.js +1 -0
  51. package/dist/esm/constants.js.map +1 -1
  52. package/dist/esm/data/datasets/eval-dataset-client.js +46 -1
  53. package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -1
  54. package/dist/esm/e2etests/eval-operations.test.js +3 -3
  55. package/dist/esm/exporters/otel-exporter.js +348 -0
  56. package/dist/esm/exporters/otel-exporter.js.map +1 -0
  57. package/dist/esm/judges/index.js +185 -0
  58. package/dist/esm/judges/index.js.map +1 -0
  59. package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js +601 -0
  60. package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
  61. package/dist/esm/scorers/metrics/answer-correctness/index.js +3 -0
  62. package/dist/esm/scorers/metrics/answer-correctness/index.js.map +1 -0
  63. package/dist/esm/scorers/metrics/answer-correctness/prompts.js +171 -0
  64. package/dist/esm/scorers/metrics/answer-correctness/prompts.js.map +1 -0
  65. package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js +521 -0
  66. package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
  67. package/dist/esm/scorers/metrics/answer-relevancy/index.js +3 -0
  68. package/dist/esm/scorers/metrics/answer-relevancy/index.js.map +1 -0
  69. package/dist/esm/scorers/metrics/answer-relevancy/prompts.js +175 -0
  70. package/dist/esm/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
  71. package/dist/esm/scorers/metrics/faithfulness/faithfulness.js +520 -0
  72. package/dist/esm/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
  73. package/dist/esm/scorers/metrics/faithfulness/index.js +3 -0
  74. package/dist/esm/scorers/metrics/faithfulness/index.js.map +1 -0
  75. package/dist/esm/scorers/metrics/faithfulness/prompts.js +228 -0
  76. package/dist/esm/scorers/metrics/faithfulness/prompts.js.map +1 -0
  77. package/dist/esm/scorers/metrics/hallucination/hallucination.js +386 -0
  78. package/dist/esm/scorers/metrics/hallucination/hallucination.js.map +1 -0
  79. package/dist/esm/scorers/metrics/hallucination/index.js +3 -0
  80. package/dist/esm/scorers/metrics/hallucination/index.js.map +1 -0
  81. package/dist/esm/scorers/metrics/hallucination/prompts.js +102 -0
  82. package/dist/esm/scorers/metrics/hallucination/prompts.js.map +1 -0
  83. package/dist/esm/scorers/metrics/instruction-adherence/index.js +3 -0
  84. package/dist/esm/scorers/metrics/instruction-adherence/index.js.map +1 -0
  85. package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js +378 -0
  86. package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
  87. package/dist/esm/scorers/metrics/instruction-adherence/prompts.js +120 -0
  88. package/dist/esm/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
  89. package/dist/types/common/logger.d.ts +1 -1
  90. package/dist/types/constants.d.ts +1 -0
  91. package/dist/types/data/datasets/eval-dataset-client.d.ts +5 -0
  92. package/dist/types/exporters/otel-exporter.d.ts +16 -0
  93. package/dist/types/judges/index.d.ts +50 -0
  94. package/dist/types/scorers/metrics/answer-correctness/answer-correctness.d.ts +99 -0
  95. package/dist/types/scorers/metrics/answer-correctness/index.d.ts +2 -0
  96. package/dist/types/scorers/metrics/answer-correctness/prompts.d.ts +71 -0
  97. package/dist/types/scorers/metrics/answer-relevancy/answer-relevancy.d.ts +78 -0
  98. package/dist/types/scorers/metrics/answer-relevancy/index.d.ts +2 -0
  99. package/dist/types/scorers/metrics/answer-relevancy/prompts.d.ts +71 -0
  100. package/dist/types/scorers/metrics/faithfulness/faithfulness.d.ts +77 -0
  101. package/dist/types/scorers/metrics/faithfulness/index.d.ts +2 -0
  102. package/dist/types/scorers/metrics/faithfulness/prompts.d.ts +94 -0
  103. package/dist/types/scorers/metrics/hallucination/hallucination.d.ts +67 -0
  104. package/dist/types/scorers/metrics/hallucination/index.d.ts +3 -0
  105. package/dist/types/scorers/metrics/hallucination/prompts.d.ts +63 -0
  106. package/dist/types/scorers/metrics/instruction-adherence/index.d.ts +2 -0
  107. package/dist/types/scorers/metrics/instruction-adherence/instruction-adherence.d.ts +67 -0
  108. package/dist/types/scorers/metrics/instruction-adherence/prompts.d.ts +78 -0
  109. package/package.json +32 -14
@@ -0,0 +1,106 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.HallucinationTemplate = exports.ReasonSchema = exports.VerdictsSchema = exports.HallucinationVerdictSchema = void 0;
4
+ const zod_1 = require("zod");
5
/**
 * Shape of a single verdict: a `verdict` string and the `reason` string
 * justifying it.
 */
const HallucinationVerdictSchema = zod_1.z.object({
    verdict: zod_1.z.string(),
    reason: zod_1.z.string()
});
exports.HallucinationVerdictSchema = HallucinationVerdictSchema;

/**
 * Shape of a verdicts response: an object whose `verdicts` key holds an
 * array of individual verdict objects.
 */
const VerdictsSchema = zod_1.z.object({
    verdicts: zod_1.z.array(HallucinationVerdictSchema)
});
exports.VerdictsSchema = VerdictsSchema;

/**
 * Shape of a reason response: an object with a single `reason` string.
 */
const ReasonSchema = zod_1.z.object({
    reason: zod_1.z.string()
});
exports.ReasonSchema = ReasonSchema;
24
/**
 * Prompt templates for the hallucination scorer.
 *
 * Both public methods are pure string builders: they interpolate the actual
 * output and the numbered list of contexts into a fixed prompt.
 */
class HallucinationTemplate {
    /**
     * Render contexts as a numbered list, one per line ("1. ...", "2. ...").
     *
     * Accepts an array, a single value (treated as a one-element list), or
     * null/undefined (treated as an empty list) so that a missing or scalar
     * `contexts` produces a well-formed prompt instead of a TypeError from
     * calling `.map` on a non-array.
     *
     * @param contexts - Array of contexts, a single context, or null/undefined
     * @returns The numbered contexts joined with newlines ('' when empty)
     */
    static _formatContexts(contexts) {
        let items;
        if (Array.isArray(contexts)) {
            items = contexts;
        }
        else if (contexts === undefined || contexts === null) {
            items = [];
        }
        else {
            items = [contexts];
        }
        return items.map((context, index) => `${index + 1}. ${context}`).join('\n');
    }
    /**
     * Generate a prompt asking for one yes/no verdict per context on whether
     * the actual output factually agrees with it.
     *
     * @param actualOutput - The LLM response being evaluated
     * @param contexts - Ground-truth contexts to compare against
     * @returns The complete prompt string, ending in "JSON:"
     */
    static generateVerdicts(actualOutput, contexts) {
        return `==== TASK INSTRUCTIONS ====
You will be provided with an \`actual output\` (the response of an LLM to a particular query) and \`contexts\` (ground truth contextual information from a knowledge base).
Your task is to take each context in contexts and determine whether the \`actual output\` factually agrees with the context.

Additional notes:
You should NOT use any prior knowledge you have in your decision making process; take each context at face value.
Since you will determine a verdict for EACH context, the number of 'verdicts' is EXACTLY EQUAL TO the number of contexts.
You should be lenient in your judgment when the actual output lacks detail with respect to the context segment; you should ONLY provide a 'no' answer if the context contradicts the actual output.

==== FORMATTING INSTRUCTIONS ====
You should return a JSON object with a key 'verdicts', which is a list of JSON objects. Each JSON object corresponds to a context in \`contexts\`, and should have 2 fields: 'verdict' and 'reason'.
The 'verdict' key should be EXACTLY one of 'yes' or 'no', representing whether the \`actual output\` factually agrees with the context segment.
The 'reason' is the justification for the verdict. If your verdict is 'no', try to provide a correction in the reason.

==== EXAMPLE ====
Example contexts: ["Einstein won the Nobel Prize for his discovery of the photoelectric effect.", "Einstein won the Nobel Prize in 1968."]
Example actual output: "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect."

Example:
{
"verdicts": [
{
"verdict": "yes",
"reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect."
},
{
"verdict": "no",
"reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969."
}
]
}

==== YOUR TURN ====
Contexts:
${HallucinationTemplate._formatContexts(contexts)}

Actual Output:
${actualOutput}

JSON:`;
    }
    /**
     * Generate a prompt asking for a single summarizing reason describing
     * whether the actual output contradicts the contexts.
     *
     * @param actualOutput - The LLM response being evaluated
     * @param contexts - Ground-truth contexts to compare against
     * @returns The complete prompt string, ending in "JSON:"
     */
    static generateReason(actualOutput, contexts) {
        return `==== TASK INSTRUCTIONS ====
You will be provided with an \`actual output\` (the response of an LLM to a particular query) and \`contexts\` (ground truth contextual information from a knowledge base).
Your task is to analyze whether the actual output contains any hallucinations (factual inaccuracies) when compared to the provided contexts.

Please provide a clear and concise reason summarizing your analysis. Focus on any contradictions between the actual output and the contexts, or note if the output is factually consistent with the contexts.

==== FORMATTING INSTRUCTIONS ====
Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{
"reason": "The output contains factual inaccuracies because..."
}

Or if no hallucinations:
{
"reason": "The output is factually consistent with the provided contexts."
}

==== YOUR TURN ====
Contexts:
${HallucinationTemplate._formatContexts(contexts)}

Actual Output:
${actualOutput}

JSON:`;
    }
}
105
+ exports.HallucinationTemplate = HallucinationTemplate;
106
+ //# sourceMappingURL=prompts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/hallucination/prompts.ts"],"names":[],"mappings":";;;AAAA,6BAAwB;AAExB;;GAEG;AACU,QAAA,0BAA0B,GAAG,OAAC,CAAC,MAAM,CAAC;IACjD,OAAO,EAAE,OAAC,CAAC,MAAM,EAAE;IACnB,MAAM,EAAE,OAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAIH;;GAEG;AACU,QAAA,cAAc,GAAG,OAAC,CAAC,MAAM,CAAC;IACrC,QAAQ,EAAE,OAAC,CAAC,KAAK,CAAC,kCAA0B,CAAC;CAC9C,CAAC,CAAC;AAEH;;GAEG;AACU,QAAA,YAAY,GAAG,OAAC,CAAC,MAAM,CAAC;IACnC,MAAM,EAAE,OAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAEH;;GAEG;AACH,MAAa,qBAAqB;IAChC;;OAEG;IACH,MAAM,CAAC,gBAAgB,CAAC,YAAoB,EAAE,QAAkB;QAC9D,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAkCT,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;EAGvE,YAAY;;MAER,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,cAAc,CAAC,YAAoB,EAAE,QAAkB;QAC5D,OAAO;;;;;;;;;;;;;;;;;;;;EAoBT,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;EAGvE,YAAY;;MAER,CAAC;IACL,CAAC;CACF;AA9ED,sDA8EC"}
@@ -0,0 +1,19 @@
1
+ "use strict";
2
/**
 * Re-export helper: makes property `key` of `source` visible on `target`
 * under the name `alias` (defaults to `key`).
 *
 * Reuses an implementation already present on `this` when there is one.
 * Otherwise, where Object.create exists, the property is defined through a
 * descriptor: the source's own descriptor is reused as-is only when it is a
 * plain data property that is neither writable nor configurable, or an
 * accessor on an `__esModule` module; in every other case a forwarding
 * getter is installed so later changes on `source` stay visible. Without
 * Object.create the value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        return function (target, source, key, alias) {
            if (alias === undefined) alias = key;
            target[alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !source.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, alias, descriptor);
    };
})();
13
/**
 * `export * from` helper: binds every enumerable property of module `m`
 * onto `exports`, skipping "default" and any name `exports` already owns.
 * Reuses an implementation already present on `this` when there is one.
 */
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    var hasOwn = Object.prototype.hasOwnProperty;
    for (var name in m) {
        if (name === "default") {
            continue;
        }
        if (hasOwn.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ __exportStar(require("./instruction-adherence.js"), exports);
18
+ __exportStar(require("./prompts.js"), exports);
19
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,6DAA2C;AAC3C,+CAA6B"}
@@ -0,0 +1,382 @@
1
+ "use strict";
2
/**
 * Down-levelled async/await driver: runs the generator function `generator`
 * (called with `thisArg` and `_arguments`) and returns a promise of type `P`
 * (Promise when `P` is falsy) for its return value.
 *
 * Each yielded value is adopted as a `P`; its fulfilment is fed back through
 * `next`, its rejection through `throw`. Anything thrown while advancing the
 * generator rejects the returned promise.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    var toPromise = function (value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) { resolve(value); });
    };
    return new P(function (resolve, reject) {
        var iterator = generator.apply(thisArg, _arguments || []);
        var advance = function (result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                toPromise(result.value).then(onFulfilled, onRejected);
            }
        };
        var onFulfilled = function (value) {
            try {
                advance(iterator.next(value));
            }
            catch (err) {
                reject(err);
            }
        };
        var onRejected = function (reason) {
            try {
                advance(iterator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        };
        advance(iterator.next());
    });
};
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.InstructionAdherenceScorer = void 0;
13
+ const base_scorer_js_1 = require("../../base-scorer.js");
14
+ const constants_js_1 = require("../../../constants.js");
15
+ const logger_js_1 = require("../../../common/logger.js");
16
+ const prompts_js_1 = require("./prompts.js");
17
+ const index_js_1 = require("../../../judges/index.js");
18
+ // Required parameters for this scorer
19
+ const required_params = ['input', 'actualOutput'];
20
/**
 * InstructionAdherenceScorer evaluates how well an LLM follows instructions
 * by extracting instructions from the input and checking if they are followed in the output.
 *
 * The score is the average of the per-instruction verdict scores
 * (1 = followed, 0.5 = partially followed, 0 = not followed). When no
 * verdicts are produced (e.g. no instructions were found) the score is 1.
 */
class InstructionAdherenceScorer extends base_scorer_js_1.JudgevalScorer {
    /**
     * Create a new InstructionAdherenceScorer
     *
     * @param threshold - Success threshold (default: 0.5); replaced by 1 when strict_mode is set
     * @param model - Model passed to createJudge to obtain the judge (default: undefined)
     * @param include_reason - Whether to include a reason for the score (default: true)
     * @param async_mode - Whether to use async mode (default: false)
     * @param strict_mode - Whether to use strict mode (default: false)
     * @param verbose_mode - Whether to include verbose logs (default: false)
     */
    constructor(threshold = 0.5, model = undefined, include_reason = true, async_mode = false, strict_mode = false, verbose_mode = false) {
        super(constants_js_1.APIScorer.INSTRUCTION_ADHERENCE, strict_mode ? 1 : threshold, undefined, include_reason, async_mode, strict_mode, verbose_mode);
        this._instructions = [];
        this._verdicts = [];
        const { judge, usingNativeModel } = (0, index_js_1.createJudge)(model);
        this.model = judge;
        this.using_native_model = usingNativeModel;
        this.evaluation_model = this.model.getModelName();
    }
    /**
     * Parse a model response as JSON and return the list stored under `key`
     * without schema validation.
     *
     * @param response - Raw model response text
     * @param key - Property holding the list ('instructions' or 'verdicts')
     * @returns The list, or [] when the property is missing/falsy
     * @throws Error when the text is not JSON or the property is not an array
     */
    _parseLooseList(response, key) {
        let value;
        try {
            value = JSON.parse(response)[key] || [];
        }
        catch (error) {
            throw new Error(`Failed to parse response: ${error}`);
        }
        // Callers read `.length` and iterate the result, so a truthy non-array
        // (e.g. a string) must be rejected rather than passed along.
        if (!Array.isArray(value)) {
            throw new Error(`Failed to parse response: "${key}" is not an array`);
        }
        return value;
    }
    /**
     * Parse a model response as JSON, validate it against `schema`, and
     * return the list stored under `key`.
     *
     * @throws Error when the text is not JSON or fails schema validation
     */
    _parseValidatedList(response, schema, key) {
        const result = schema.safeParse(JSON.parse(response));
        if (result.success) {
            return result.data[key];
        }
        throw new Error(`Invalid response format: ${result.error}`);
    }
    /**
     * Ask the model for a list (async).
     *
     * With a native model the response is parsed loosely. Otherwise the
     * response is schema-validated first; if generation, parsing or
     * validation fails, the model is queried once more and that response is
     * parsed loosely.
     */
    _aRequestList(prompt, schema, key) {
        return __awaiter(this, void 0, void 0, function* () {
            if (this.using_native_model) {
                const res = yield this.model.aGenerate(prompt);
                return this._parseLooseList(res, key);
            }
            try {
                const res = yield this.model.aGenerate(prompt);
                return this._parseValidatedList(res, schema, key);
            }
            catch (error) {
                const retry = yield this.model.aGenerate(prompt);
                return this._parseLooseList(retry, key);
            }
        });
    }
    /**
     * Ask the model for a list (synchronous); same strategy as _aRequestList.
     */
    _requestList(prompt, schema, key) {
        if (this.using_native_model) {
            return this._parseLooseList(this.model.generate(prompt), key);
        }
        try {
            return this._parseValidatedList(this.model.generate(prompt), schema, key);
        }
        catch (error) {
            return this._parseLooseList(this.model.generate(prompt), key);
        }
    }
    /**
     * Extract instructions from input text
     */
    _aGetInstructions(input) {
        return __awaiter(this, void 0, void 0, function* () {
            const prompt = prompts_js_1.InstructionAdherenceTemplate.getInstructions(input);
            return yield this._aRequestList(prompt, prompts_js_1.InstructionsSchema, 'instructions');
        });
    }
    /**
     * Extract instructions from input text (synchronous)
     */
    _getInstructions(input) {
        const prompt = prompts_js_1.InstructionAdherenceTemplate.getInstructions(input);
        return this._requestList(prompt, prompts_js_1.InstructionsSchema, 'instructions');
    }
    /**
     * Generate verdicts for each instruction; resolves to [] without
     * querying the model when there are no instructions.
     */
    _aGetVerdicts(instructions, actualOutput) {
        return __awaiter(this, void 0, void 0, function* () {
            if (instructions.length === 0) {
                return [];
            }
            const prompt = prompts_js_1.InstructionAdherenceTemplate.generateVerdicts(instructions, actualOutput);
            return yield this._aRequestList(prompt, prompts_js_1.VerdictsSchema, 'verdicts');
        });
    }
    /**
     * Generate verdicts for each instruction (synchronous); returns []
     * without querying the model when there are no instructions.
     */
    _getVerdicts(instructions, actualOutput) {
        if (instructions.length === 0) {
            return [];
        }
        const prompt = prompts_js_1.InstructionAdherenceTemplate.generateVerdicts(instructions, actualOutput);
        return this._requestList(prompt, prompts_js_1.VerdictsSchema, 'verdicts');
    }
    /**
     * Calculate the instruction adherence score: the mean of the verdict
     * scores, or 1 when there are no verdicts.
     *
     * Loosely parsed verdicts are not schema-validated, so a verdict may be
     * missing a usable score; such a verdict contributes 0 instead of turning
     * the whole average into NaN. Numeric strings are accepted.
     */
    _computeScore() {
        if (this._verdicts.length === 0) {
            return 1;
        }
        let totalScore = 0;
        for (const verdict of this._verdicts) {
            const raw = verdict === null || verdict === undefined ? undefined : verdict.score;
            const value = typeof raw === 'number' ? raw : (typeof raw === 'string' ? Number(raw) : NaN);
            totalScore += Number.isFinite(value) ? value : 0;
        }
        return totalScore / this._verdicts.length;
    }
    /**
     * Create verbose logs for debugging; null unless verbose_mode is set.
     */
    _createVerboseLogs() {
        if (!this.verbose_mode) {
            return null;
        }
        const steps = [
            `Instructions:\n${JSON.stringify(this._instructions, null, 2)}`,
            `Score: ${this.score}\nReason: ${this.reason || "No reason provided"}`
        ];
        return steps.join('\n\n');
    }
    /**
     * Check if example has required parameters
     *
     * @throws Error naming the first required parameter that is missing or falsy
     */
    _checkExampleParams(example) {
        for (const param of required_params) {
            if (!example[param]) {
                throw new Error(`Example is missing required parameter: ${param}`);
            }
        }
    }
    /**
     * Derive score, reason and success from the stored instructions and
     * verdicts and build the result object for a successful run.
     */
    _buildSuccessResult() {
        // Add instructions and verdicts to additional metadata
        const additional_metadata = {
            instructions: this._instructions,
            verdicts: this._verdicts
        };
        this.score = this._computeScore();
        this.reason = this._verdicts.length > 0 ? JSON.stringify(this._verdicts) : 'No instructions found';
        this.success = this._successCheck();
        const verbose_logs = this._createVerboseLogs();
        (0, logger_js_1.info)(`Scoring completed with score: ${this.score}`);
        // Ensure all fields match the ScorerData interface
        return {
            name: this.type,
            threshold: this.threshold,
            success: this.success,
            score: this.score,
            reason: this.reason || "",
            strict_mode: this.strict_mode,
            evaluation_model: this.evaluation_model || null,
            error: null,
            evaluation_cost: null,
            verbose_logs: verbose_logs,
            additional_metadata: additional_metadata
        };
    }
    /**
     * Record `error` on the scorer and build the failed result object
     * (success false, score 0).
     */
    _buildErrorResult(error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        this.error = errorMessage;
        return {
            name: this.type,
            threshold: this.threshold,
            success: false,
            score: 0,
            reason: `Error during scoring: ${errorMessage}`,
            strict_mode: this.strict_mode,
            evaluation_model: this.evaluation_model || null,
            error: errorMessage,
            evaluation_cost: null,
            verbose_logs: null,
            additional_metadata: {}
        };
    }
    /**
     * Score an example synchronously. Failures are reported through the
     * returned result's `error` field rather than thrown.
     */
    syncScoreExample(example) {
        (0, logger_js_1.info)("Starting example scoring (sync mode)");
        try {
            // Check required parameters
            this._checkExampleParams(example);
            // Process example
            this._instructions = this._getInstructions(example.input);
            this._verdicts = this._getVerdicts(this._instructions, example.actualOutput);
            return this._buildSuccessResult();
        }
        catch (error) {
            return this._buildErrorResult(error);
        }
    }
    /**
     * Score an example asynchronously. When async_mode is off this delegates
     * to syncScoreExample. Failures are reported through the resolved
     * result's `error` field rather than by rejecting.
     */
    scoreExample(example) {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.async_mode) {
                return this.syncScoreExample(example);
            }
            (0, logger_js_1.info)("Starting example scoring (async mode)");
            try {
                // Check required parameters
                this._checkExampleParams(example);
                // Process example
                this._instructions = yield this._aGetInstructions(example.input);
                this._verdicts = yield this._aGetVerdicts(this._instructions, example.actualOutput);
                return this._buildSuccessResult();
            }
            catch (error) {
                return this._buildErrorResult(error);
            }
        });
    }
    /**
     * Get the name of the scorer
     */
    get name() {
        return "Instruction Adherence";
    }
}
381
+ exports.InstructionAdherenceScorer = InstructionAdherenceScorer;
382
+ //# sourceMappingURL=instruction-adherence.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"instruction-adherence.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/instruction-adherence.ts"],"names":[],"mappings":";;;;;;;;;;;;AAEA,yDAAsD;AACtD,wDAAkD;AAClD,yDAAmE;AACnE,6CAKsB;AACtB,uDAA8D;AAE9D,sCAAsC;AACtC,MAAM,eAAe,GAAG,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;AAElD;;;;;GAKG;AACH,MAAa,0BAA2B,SAAQ,+BAAc;IAM5D;;;;;;;;;OASG;IACH,YACE,YAAoB,GAAG,EACvB,QAAoC,SAAS,EAC7C,iBAA0B,IAAI,EAC9B,aAAsB,KAAK,EAC3B,cAAuB,KAAK,EAC5B,eAAwB,KAAK;QAE7B,KAAK,CACH,wBAAS,CAAC,qBAAqB,EAC/B,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,EAC3B,SAAS,EACT,cAAc,EACd,UAAU,EACV,WAAW,EACX,YAAY,CACb,CAAC;QA7BI,kBAAa,GAAa,EAAE,CAAC;QAC7B,cAAS,GAAkC,EAAE,CAAC;QA8BpD,MAAM,EAAE,KAAK,EAAE,gBAAgB,EAAE,GAAG,IAAA,sBAAW,EAAC,KAAK,CAAC,CAAC;QACvD,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,kBAAkB,GAAG,gBAAgB,CAAC;QAC3C,IAAI,CAAC,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;IACpD,CAAC;IAED;;OAEG;IACW,iBAAiB,CAAC,KAAa;;YAC3C,MAAM,MAAM,GAAG,yCAA4B,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAEnE,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBAC/C,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;gBACjC,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;gBACxD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC;oBACH,oDAAoD;oBACpD,MAAM,yBAAyB,GAAG,CAAC,QAAgB,EAA8B,EAAE;wBACjF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;wBACpC,MAAM,MAAM,GAAG,+BAAkB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;wBACpD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;4BACnB,OAAO,MAAM,CAAC,IAAI,CAAC;wBACrB,CAAC;wBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;oBAC9D,CAAC,CAAC;oBAEF,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,OAAO,yBAAyB,CAAC,GAAG,CAAC,CAAC,YAAY,CAAC;gBACrD,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,IAAI,CAAC;wBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GA
AG,CAAC,CAAC;wBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;oBACjC,CAAC;oBAAC,OAAO,UAAU,EAAE,CAAC;wBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;oBAC7D,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACK,gBAAgB,CAAC,KAAa;QACpC,MAAM,MAAM,GAAG,yCAA4B,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QAEnE,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACxC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;YACjC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,IAAI,CAAC;gBACH,oDAAoD;gBACpD,MAAM,yBAAyB,GAAG,CAAC,QAAgB,EAA8B,EAAE;oBACjF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;oBACpC,MAAM,MAAM,GAAG,+BAAkB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBACpD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;wBACnB,OAAO,MAAM,CAAC,IAAI,CAAC;oBACrB,CAAC;oBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC9D,CAAC,CAAC;gBAEF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,OAAO,yBAAyB,CAAC,GAAG,CAAC,CAAC,YAAY,CAAC;YACrD,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;gBACjC,CAAC;gBAAC,OAAO,UAAU,EAAE,CAAC;oBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACW,aAAa,CAAC,YAAsB,EAAE,YAAoB;;YACtE,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC9B,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,MAAM,MAAM,GAAG,yCAA4B,CAAC,gBAAgB,CAAC,YAAY,EAAE,YAAY,CAAC,CAAC;YAEzF,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBAC/C,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC7B,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;gBACxD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC;oBA
CH,oDAAoD;oBACpD,MAAM,qBAAqB,GAAG,CAAC,QAAgB,EAA+C,EAAE;wBAC9F,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;wBACpC,MAAM,MAAM,GAAG,2BAAc,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;wBAChD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;4BACnB,OAAO,MAAM,CAAC,IAAI,CAAC;wBACrB,CAAC;wBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;oBAC9D,CAAC,CAAC;oBAEF,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,OAAO,qBAAqB,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBAC7C,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,IAAI,CAAC;wBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;wBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;oBAC7B,CAAC;oBAAC,OAAO,UAAU,EAAE,CAAC;wBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;oBAC7D,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACK,YAAY,CAAC,YAAsB,EAAE,YAAoB;QAC/D,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC9B,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,MAAM,GAAG,yCAA4B,CAAC,gBAAgB,CAAC,YAAY,EAAE,YAAY,CAAC,CAAC;QAEzF,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACxC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;YAC7B,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,IAAI,CAAC;gBACH,oDAAoD;gBACpD,MAAM,qBAAqB,GAAG,CAAC,QAAgB,EAA+C,EAAE;oBAC9F,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;oBACpC,MAAM,MAAM,GAAG,2BAAc,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAChD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;wBACnB,OAAO,MAAM,CAAC,IAAI,CAAC;oBACrB,CAAC;oBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC9D,CAAC,CAAC;gBAEF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,OAAO,qBAAqB,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YAC7C,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,
CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC7B,CAAC;gBAAC,OAAO,UAAU,EAAE,CAAC;oBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa;QACnB,IAAI,IAAI,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAChC,OAAO,CAAC,CAAC;QACX,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YACrC,UAAU,IAAI,OAAO,CAAC,KAAK,CAAC;QAC9B,CAAC;QAED,OAAO,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC;IAC5C,CAAC;IAED;;OAEG;IACK,kBAAkB;QACxB,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,KAAK,GAAG;YACZ,kBAAkB,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE;YAC/D,UAAU,IAAI,CAAC,KAAK,aAAa,IAAI,CAAC,MAAM,IAAI,oBAAoB,EAAE;SACvE,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC5B,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,OAAgB;QAC1C,KAAK,MAAM,KAAK,IAAI,eAAe,EAAE,CAAC;YACpC,IAAI,KAAK,KAAK,OAAO,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;gBACxC,MAAM,IAAI,KAAK,CAAC,8CAA8C,CAAC,CAAC;YAClE,CAAC;iBAAM,IAAI,KAAK,KAAK,cAAc,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC;gBAC7D,MAAM,IAAI,KAAK,CAAC,qDAAqD,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,gBAAgB,CAAC,OAAgB;QAC/B,IAAA,gBAAI,EAAC,sCAAsC,CAAC,CAAC;QAE7C,IAAI,CAAC;YACH,4BAA4B;YAC5B,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;YAElC,kBAAkB;YAClB,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,KAAM,CAAC,CAAC;YAC3D,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,aAAa,EAAE,OAAO,CAAC,YAAsB,CAAC,CAAC;YAEvF,uDAAuD;YACvD,MAAM,mBAAmB,GAAG;gBAC1B,YAAY,EAAE,IAAI,CAAC,aAAa;gBAChC,QAAQ,EAAE,IAAI,CAAC,SAAS;aACzB,CAAC;YAEF,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;YAClC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,uBAAuB,CAAC;YACnG,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;YACpC,MAAM,YAAY,GAAG,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAE/C,IAAA,gBAAI,EAAC,iCAAiC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;YAEpD,mDAAmD;YACnD,OAAO;gBACL,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,SAAS,EAAE,IAAI,CAAC,SAAS;gBACzB,OAAO,EAAE,IAAI,CAA
C,OAAO;gBACrB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,EAAE;gBACzB,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;gBAC/C,KAAK,EAAE,IAAI;gBACX,eAAe,EAAE,IAAI;gBACrB,YAAY,EAAE,YAAY;gBAC1B,mBAAmB,EAAE,mBAAmB;aACzC,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,gBAAgB;YAChB,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,IAAI,CAAC,KAAK,GAAG,YAAY,CAAC;YAE1B,OAAO;gBACL,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,SAAS,EAAE,IAAI,CAAC,SAAS;gBACzB,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,CAAC;gBACR,MAAM,EAAE,yBAAyB,YAAY,EAAE;gBAC/C,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;gBAC/C,KAAK,EAAE,YAAY;gBACnB,eAAe,EAAE,IAAI;gBACrB,YAAY,EAAE,IAAI;gBAClB,mBAAmB,EAAE,EAAE;aACxB,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACG,YAAY,CAAC,OAAgB;;YACjC,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;gBACrB,OAAO,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YACxC,CAAC;YAED,IAAA,gBAAI,EAAC,uCAAuC,CAAC,CAAC;YAE9C,IAAI,CAAC;gBACH,4BAA4B;gBAC5B,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;gBAElC,kBAAkB;gBAClB,IAAI,CAAC,aAAa,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,OAAO,CAAC,KAAM,CAAC,CAAC;gBAClE,IAAI,CAAC,SAAS,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,aAAa,EAAE,OAAO,CAAC,YAAsB,CAAC,CAAC;gBAE9F,uDAAuD;gBACvD,MAAM,mBAAmB,GAAG;oBAC1B,YAAY,EAAE,IAAI,CAAC,aAAa;oBAChC,QAAQ,EAAE,IAAI,CAAC,SAAS;iBACzB,CAAC;gBAEF,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;gBAClC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,uBAAuB,CAAC;gBACnG,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;gBACpC,MAAM,YAAY,GAAG,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAE/C,IAAA,gBAAI,EAAC,iCAAiC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;gBAEpD,mDAAmD;gBACnD,OAAO;oBACL,IAAI,EAAE,IAAI,CAAC,IAAI;oBACf,SAAS,EAAE,IAAI,CAAC,SAAS;oBACzB,OAAO,EAAE,IAAI,CAAC,OAAO;oBACrB,KAAK,EAAE,IAAI,CAAC,KAAK;oBACjB,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,EAAE;oBACzB,WAAW,EAAE,IAAI,CAAC,WAAW;oBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;oBAC/C,KAAK,EAAE,IAAI;o
BACX,eAAe,EAAE,IAAI;oBACrB,YAAY,EAAE,YAAY;oBAC1B,mBAAmB,EAAE,mBAAmB;iBACzC,CAAC;YACJ,CAAC;YAAC,OAAO,KAAU,EAAE,CAAC;gBACpB,gBAAgB;gBAChB,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAC5E,IAAI,CAAC,KAAK,GAAG,YAAY,CAAC;gBAE1B,OAAO;oBACL,IAAI,EAAE,IAAI,CAAC,IAAI;oBACf,SAAS,EAAE,IAAI,CAAC,SAAS;oBACzB,OAAO,EAAE,KAAK;oBACd,KAAK,EAAE,CAAC;oBACR,MAAM,EAAE,yBAAyB,YAAY,EAAE;oBAC/C,WAAW,EAAE,IAAI,CAAC,WAAW;oBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;oBAC/C,KAAK,EAAE,YAAY;oBACnB,eAAe,EAAE,IAAI;oBACrB,YAAY,EAAE,IAAI;oBAClB,mBAAmB,EAAE,EAAE;iBACxB,CAAC;YACJ,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACH,IAAI,IAAI;QACN,OAAO,uBAAuB,CAAC;IACjC,CAAC;CACF;AAnYD,gEAmYC"}
@@ -0,0 +1,124 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.InstructionAdherenceTemplate = exports.InstructionsSchema = exports.VerdictsSchema = exports.VerdictSchema = void 0;
4
+ const zod_1 = require("zod");
5
/**
 * One verdict: the instruction that was judged, the numeric score it
 * received, and the textual justification for that score.
 */
const verdictShape = {
    instruction: zod_1.z.string(),
    score: zod_1.z.number(),
    reason: zod_1.z.string(),
};
exports.VerdictSchema = zod_1.z.object(verdictShape);

/**
 * Envelope object holding every verdict under the "verdicts" key.
 */
const verdictList = zod_1.z.array(exports.VerdictSchema);
exports.VerdictsSchema = zod_1.z.object({ verdicts: verdictList });

/**
 * Envelope object holding the extracted instruction strings under the
 * "instructions" key.
 */
const instructionList = zod_1.z.array(zod_1.z.string());
exports.InstructionsSchema = zod_1.z.object({ instructions: instructionList });
25
/**
 * Prompt templates used by the InstructionAdherenceScorer.
 *
 * Both methods are pure string builders: they interpolate their arguments
 * into a fixed few-shot prompt and return the resulting text. The JSON shapes
 * requested by the prompts mirror the schemas exported from this module
 * (an object with an "instructions" key, and an object with a "verdicts" key).
 */
class InstructionAdherenceTemplate {
    /**
     * Build the prompt that asks the model to extract the list of
     * instructions contained in a piece of text.
     *
     * @param {string} input - Text to analyse; interpolated verbatim.
     * @returns {string} Prompt ending in "JSON:" so the model continues with
     *   an object of the form {"instructions": [...]}.
     */
    static getInstructions(input) {
        // The worked example must copy the instructions exactly as they appear
        // in the example text ("two poems"), otherwise the model is taught to
        // paraphrase or truncate instructions.
        return `You will be presented with a piece of text. Your task is to break down the text and generate a list of the instructions contained within the text.

===== START OF EXAMPLES =====
Example 1:
Example text: Hello my name is John Doe. I like cars. Write two poems about the weather and create a joke. Also what is 5 + 5?

Output:
{
"instructions": ["Write two poems about the weather", "Create a joke", "What is 5 + 5?"]
}
===== END OF EXAMPLES =====

**
IMPORTANT: Please return your answer in valid JSON format, with the "instructions" key mapping to a list of strings. No words or explanation is needed.
**

==== START OF INPUT ====
Text:
${input}
==== END OF INPUT ====

==== YOUR ANSWER ====
JSON:`;
    }

    /**
     * Build the prompt that asks the model to score how well a piece of
     * output follows each instruction (1 = completed, 0.5 = partially
     * completed, 0 = not completed).
     *
     * @param {string[]} instructions - Instructions to check; serialised with
     *   JSON.stringify before being placed in the prompt.
     * @param {string} actualOutput - Text being evaluated; interpolated verbatim.
     * @returns {string} Prompt ending in "JSON:" so the model continues with
     *   an object of the form {"verdicts": [{instruction, score, reason}, ...]}.
     */
    static generateVerdicts(instructions, actualOutput) {
        // The formatting section names the "verdicts" wrapper key explicitly so
        // that the written instructions agree with the example output below,
        // and each example verdict echoes its instruction verbatim.
        return `You will be presented with a list of instructions and a piece of text. For each instruction, determine if the instruction was completed in the text. There are 3 categories: either completed, partially completed, or not completed. The scores for these will be 1, 0.5, and 0 respectively.
Go through each instruction and provide score for each instruction as well as the reasoning for that score.

==== FORMATTING YOUR ANSWER ====
Please return your answer in JSON format, as a JSON object whose "verdicts" key maps to a list of JSON objects with keys "instruction", "score", and "reason". No words or explanation beyond the output JSON is needed.


===== START OF EXAMPLES =====
Example 1:
instructions: ["Write two poems about the weather", "Create a joke", "What is 5 + 5?"]
output: Poem 1: The Sun's Embrace
The sun climbs high, a golden flame,
It whispers warmth, it calls my name.
The sky, a canvas, blue and clear,
A perfect day for cars, my dear.

The asphalt hums beneath the wheels,
A symphony of speed it feels.
The weather smiles, no clouds in sight,
A driver's joy, pure delight.

Poem 2: The Storm's Dance
A sunlit meadow, alive with whispers of wind, where daisies dance and hope begins again. Each petal holds a promise—bright, unbruised— a symphony of light that cannot be refused.

Joke
Why dont cars ever get cold in the winter?
Because they have radiators!

Math Answer
5 + 5 = 10

YOUR JSON OUTPUT:
{
"verdicts": [
{
"instruction": "Write two poems about the weather",
"score": 0.5,
"reason": "The output contained one poem about the weather, but the other poem was not about the weather."
},
{
"instruction": "Create a joke",
"score": 1,
"reason": "There was a joke created in the output."
},
{
"instruction": "What is 5 + 5?",
"score": 1,
"reason": "The answer to the math question was provided in the output."
}
]
}
===== END OF EXAMPLES =====

==== START OF INPUT ====
instructions: ${JSON.stringify(instructions)}
output: ${actualOutput}
==== END OF INPUT ====

==== YOUR ANSWER ====
JSON:`;
    }
}
123
+ exports.InstructionAdherenceTemplate = InstructionAdherenceTemplate;
124
+ //# sourceMappingURL=prompts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/prompts.ts"],"names":[],"mappings":";;;AAAA,6BAAwB;AAExB;;GAEG;AACU,QAAA,aAAa,GAAG,OAAC,CAAC,MAAM,CAAC;IACpC,WAAW,EAAE,OAAC,CAAC,MAAM,EAAE;IACvB,KAAK,EAAE,OAAC,CAAC,MAAM,EAAE;IACjB,MAAM,EAAE,OAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAEH;;GAEG;AACU,QAAA,cAAc,GAAG,OAAC,CAAC,MAAM,CAAC;IACrC,QAAQ,EAAE,OAAC,CAAC,KAAK,CAAC,qBAAa,CAAC;CACjC,CAAC,CAAC;AAEH;;GAEG;AACU,QAAA,kBAAkB,GAAG,OAAC,CAAC,MAAM,CAAC;IACzC,YAAY,EAAE,OAAC,CAAC,KAAK,CAAC,OAAC,CAAC,MAAM,EAAE,CAAC;CAClC,CAAC,CAAC;AAYH;;GAEG;AACH,MAAa,4BAA4B;IACvC;;OAEG;IACH,MAAM,CAAC,eAAe,CAAC,KAAa;QAClC,OAAO;;;;;;;;;;;;;;;;;;EAkBT,KAAK;;;;MAID,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,gBAAgB,CAAC,YAAsB,EAAE,YAAoB;QAClE,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBAsDK,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC;UAClC,YAAY;;;;MAIhB,CAAC;IACL,CAAC;CACF;AA/FD,oEA+FC"}