judgeval 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/dist/cjs/common/logger.js +28 -24
  2. package/dist/cjs/common/logger.js.map +1 -1
  3. package/dist/cjs/common/tracer.js +80 -130
  4. package/dist/cjs/common/tracer.js.map +1 -1
  5. package/dist/cjs/constants.js +2 -1
  6. package/dist/cjs/constants.js.map +1 -1
  7. package/dist/cjs/data/datasets/eval-dataset-client.js +45 -0
  8. package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -1
  9. package/dist/cjs/e2etests/eval-operations.test.js +3 -3
  10. package/dist/cjs/exporters/otel-exporter.js +352 -0
  11. package/dist/cjs/exporters/otel-exporter.js.map +1 -0
  12. package/dist/cjs/judges/index.js +217 -0
  13. package/dist/cjs/judges/index.js.map +1 -0
  14. package/dist/cjs/run-evaluation.js +13 -13
  15. package/dist/cjs/run-evaluation.js.map +1 -1
  16. package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js +610 -0
  17. package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
  18. package/dist/cjs/scorers/metrics/answer-correctness/index.js +19 -0
  19. package/dist/cjs/scorers/metrics/answer-correctness/index.js.map +1 -0
  20. package/dist/cjs/scorers/metrics/answer-correctness/prompts.js +175 -0
  21. package/dist/cjs/scorers/metrics/answer-correctness/prompts.js.map +1 -0
  22. package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js +525 -0
  23. package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
  24. package/dist/cjs/scorers/metrics/answer-relevancy/index.js +19 -0
  25. package/dist/cjs/scorers/metrics/answer-relevancy/index.js.map +1 -0
  26. package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js +179 -0
  27. package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
  28. package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js +524 -0
  29. package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
  30. package/dist/cjs/scorers/metrics/faithfulness/index.js +19 -0
  31. package/dist/cjs/scorers/metrics/faithfulness/index.js.map +1 -0
  32. package/dist/cjs/scorers/metrics/faithfulness/prompts.js +232 -0
  33. package/dist/cjs/scorers/metrics/faithfulness/prompts.js.map +1 -0
  34. package/dist/cjs/scorers/metrics/hallucination/hallucination.js +390 -0
  35. package/dist/cjs/scorers/metrics/hallucination/hallucination.js.map +1 -0
  36. package/dist/cjs/scorers/metrics/hallucination/index.js +11 -0
  37. package/dist/cjs/scorers/metrics/hallucination/index.js.map +1 -0
  38. package/dist/cjs/scorers/metrics/hallucination/prompts.js +106 -0
  39. package/dist/cjs/scorers/metrics/hallucination/prompts.js.map +1 -0
  40. package/dist/cjs/scorers/metrics/instruction-adherence/index.js +19 -0
  41. package/dist/cjs/scorers/metrics/instruction-adherence/index.js.map +1 -0
  42. package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js +382 -0
  43. package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
  44. package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js +124 -0
  45. package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
  46. package/dist/esm/common/logger.js +16 -11
  47. package/dist/esm/common/logger.js.map +1 -1
  48. package/dist/esm/common/tracer.js +78 -128
  49. package/dist/esm/common/tracer.js.map +1 -1
  50. package/dist/esm/constants.js +1 -0
  51. package/dist/esm/constants.js.map +1 -1
  52. package/dist/esm/data/datasets/eval-dataset-client.js +46 -1
  53. package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -1
  54. package/dist/esm/e2etests/eval-operations.test.js +3 -3
  55. package/dist/esm/exporters/otel-exporter.js +348 -0
  56. package/dist/esm/exporters/otel-exporter.js.map +1 -0
  57. package/dist/esm/judges/index.js +185 -0
  58. package/dist/esm/judges/index.js.map +1 -0
  59. package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js +601 -0
  60. package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
  61. package/dist/esm/scorers/metrics/answer-correctness/index.js +3 -0
  62. package/dist/esm/scorers/metrics/answer-correctness/index.js.map +1 -0
  63. package/dist/esm/scorers/metrics/answer-correctness/prompts.js +171 -0
  64. package/dist/esm/scorers/metrics/answer-correctness/prompts.js.map +1 -0
  65. package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js +521 -0
  66. package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
  67. package/dist/esm/scorers/metrics/answer-relevancy/index.js +3 -0
  68. package/dist/esm/scorers/metrics/answer-relevancy/index.js.map +1 -0
  69. package/dist/esm/scorers/metrics/answer-relevancy/prompts.js +175 -0
  70. package/dist/esm/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
  71. package/dist/esm/scorers/metrics/faithfulness/faithfulness.js +520 -0
  72. package/dist/esm/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
  73. package/dist/esm/scorers/metrics/faithfulness/index.js +3 -0
  74. package/dist/esm/scorers/metrics/faithfulness/index.js.map +1 -0
  75. package/dist/esm/scorers/metrics/faithfulness/prompts.js +228 -0
  76. package/dist/esm/scorers/metrics/faithfulness/prompts.js.map +1 -0
  77. package/dist/esm/scorers/metrics/hallucination/hallucination.js +386 -0
  78. package/dist/esm/scorers/metrics/hallucination/hallucination.js.map +1 -0
  79. package/dist/esm/scorers/metrics/hallucination/index.js +3 -0
  80. package/dist/esm/scorers/metrics/hallucination/index.js.map +1 -0
  81. package/dist/esm/scorers/metrics/hallucination/prompts.js +102 -0
  82. package/dist/esm/scorers/metrics/hallucination/prompts.js.map +1 -0
  83. package/dist/esm/scorers/metrics/instruction-adherence/index.js +3 -0
  84. package/dist/esm/scorers/metrics/instruction-adherence/index.js.map +1 -0
  85. package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js +378 -0
  86. package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
  87. package/dist/esm/scorers/metrics/instruction-adherence/prompts.js +120 -0
  88. package/dist/esm/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
  89. package/dist/types/common/logger.d.ts +1 -1
  90. package/dist/types/constants.d.ts +1 -0
  91. package/dist/types/data/datasets/eval-dataset-client.d.ts +5 -0
  92. package/dist/types/exporters/otel-exporter.d.ts +16 -0
  93. package/dist/types/judges/index.d.ts +50 -0
  94. package/dist/types/scorers/metrics/answer-correctness/answer-correctness.d.ts +99 -0
  95. package/dist/types/scorers/metrics/answer-correctness/index.d.ts +2 -0
  96. package/dist/types/scorers/metrics/answer-correctness/prompts.d.ts +71 -0
  97. package/dist/types/scorers/metrics/answer-relevancy/answer-relevancy.d.ts +78 -0
  98. package/dist/types/scorers/metrics/answer-relevancy/index.d.ts +2 -0
  99. package/dist/types/scorers/metrics/answer-relevancy/prompts.d.ts +71 -0
  100. package/dist/types/scorers/metrics/faithfulness/faithfulness.d.ts +77 -0
  101. package/dist/types/scorers/metrics/faithfulness/index.d.ts +2 -0
  102. package/dist/types/scorers/metrics/faithfulness/prompts.d.ts +94 -0
  103. package/dist/types/scorers/metrics/hallucination/hallucination.d.ts +67 -0
  104. package/dist/types/scorers/metrics/hallucination/index.d.ts +3 -0
  105. package/dist/types/scorers/metrics/hallucination/prompts.d.ts +63 -0
  106. package/dist/types/scorers/metrics/instruction-adherence/index.d.ts +2 -0
  107. package/dist/types/scorers/metrics/instruction-adherence/instruction-adherence.d.ts +67 -0
  108. package/dist/types/scorers/metrics/instruction-adherence/prompts.d.ts +78 -0
  109. package/package.json +32 -14
@@ -0,0 +1,102 @@
1
+ import { z } from 'zod';
2
// Field validators for one verdict entry; both values are only required to be strings.
const hallucinationVerdictShape = {
    verdict: z.string(),
    reason: z.string(),
};
/**
 * Zod schema for a single hallucination verdict.
 *
 * The verdict prompt in this module asks the judge for 'yes' or 'no', but this
 * schema accepts any string for `verdict`; `reason` is the accompanying
 * justification text.
 */
export const HallucinationVerdictSchema = z.object(hallucinationVerdictShape);
9
// Array validator whose elements must each satisfy HallucinationVerdictSchema.
const hallucinationVerdictList = z.array(HallucinationVerdictSchema);
/**
 * Zod schema for the complete verdict payload: an object whose `verdicts`
 * property is an array of hallucination verdict entries.
 */
export const VerdictsSchema = z.object({ verdicts: hallucinationVerdictList });
15
// The reason payload carries a single string field.
const reasonShape = { reason: z.string() };
/**
 * Zod schema for the reason payload: an object with one string property, `reason`.
 */
export const ReasonSchema = z.object(reasonShape);
21
/**
 * Render the contexts as a 1-based numbered list, one entry per line
 * ("1. first\n2. second"). An empty array yields an empty string.
 */
function formatNumberedContexts(contexts) {
    const numbered = contexts.map((text, position) => `${position + 1}. ${text}`);
    return numbered.join('\n');
}
/**
 * Prompt builders for the hallucination scorer.
 *
 * Both methods are static and pure: they interpolate the caller's values into
 * a fixed prompt and return the resulting string.
 */
export class HallucinationTemplate {
    /**
     * Build the prompt that asks the judge for one 'yes'/'no' verdict per context.
     *
     * @param actualOutput - The LLM response under evaluation; inserted verbatim.
     * @param contexts - Array of context entries; each is listed on its own numbered line.
     * @returns The full prompt text, ending with "JSON:".
     */
    static generateVerdicts(actualOutput, contexts) {
        const contextBlock = formatNumberedContexts(contexts);
        return `==== TASK INSTRUCTIONS ====
You will be provided with an \`actual output\` (the response of an LLM to a particular query) and \`contexts\` (ground truth contextual information from a knowledge base).
Your task is to take each context in contexts and determine whether the \`actual output\` factually agrees with the context.

Additional notes:
You should NOT use any prior knowledge you have in your decision making process; take each context at face value.
Since you will determine a verdict for EACH context, the number of 'verdicts' is EXACTLY EQUAL TO the number of contexts.
You should be lenient in your judgment when the actual output lacks detail with respect to the context segment; you should ONLY provide a 'no' answer if the context contradicts the actual output.

==== FORMATTING INSTRUCTIONS ====
You should return a JSON object with a key 'verdicts', which is a list of JSON objects. Each JSON object corresponds to a context in \`contexts\`, and should have 2 fields: 'verdict' and 'reason'.
The 'verdict' key should be EXACTLY one of 'yes' or 'no', representing whether the \`actual output\` factually agrees with the context segment.
The 'reason' is the justification for the verdict. If your verdict is 'no', try to provide a correction in the reason.

==== EXAMPLE ====
Example contexts: ["Einstein won the Nobel Prize for his discovery of the photoelectric effect.", "Einstein won the Nobel Prize in 1968."]
Example actual output: "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect."

Example:
{
"verdicts": [
{
"verdict": "yes",
"reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect."
},
{
"verdict": "no",
"reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969."
}
]
}

==== YOUR TURN ====
Contexts:
${contextBlock}

Actual Output:
${actualOutput}

JSON:`;
    }
    /**
     * Build the prompt that asks the judge for a single summarizing reason.
     *
     * @param actualOutput - The LLM response under evaluation; inserted verbatim.
     * @param contexts - Array of context entries; each is listed on its own numbered line.
     * @returns The full prompt text, ending with "JSON:".
     */
    static generateReason(actualOutput, contexts) {
        const contextBlock = formatNumberedContexts(contexts);
        return `==== TASK INSTRUCTIONS ====
You will be provided with an \`actual output\` (the response of an LLM to a particular query) and \`contexts\` (ground truth contextual information from a knowledge base).
Your task is to analyze whether the actual output contains any hallucinations (factual inaccuracies) when compared to the provided contexts.

Please provide a clear and concise reason summarizing your analysis. Focus on any contradictions between the actual output and the contexts, or note if the output is factually consistent with the contexts.

==== FORMATTING INSTRUCTIONS ====
Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{
"reason": "The output contains factual inaccuracies because..."
}

Or if no hallucinations:
{
"reason": "The output is factually consistent with the provided contexts."
}

==== YOUR TURN ====
Contexts:
${contextBlock}

Actual Output:
${actualOutput}

JSON:`;
    }
}
102
+ //# sourceMappingURL=prompts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/hallucination/prompts.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB;;GAEG;AACH,MAAM,CAAC,MAAM,0BAA0B,GAAG,CAAC,CAAC,MAAM,CAAC;IACjD,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE;IACnB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAIH;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACrC,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,0BAA0B,CAAC;CAC9C,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,CAAC,MAAM,CAAC;IACnC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,OAAO,qBAAqB;IAChC;;OAEG;IACH,MAAM,CAAC,gBAAgB,CAAC,YAAoB,EAAE,QAAkB;QAC9D,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAkCT,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;EAGvE,YAAY;;MAER,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,cAAc,CAAC,YAAoB,EAAE,QAAkB;QAC5D,OAAO;;;;;;;;;;;;;;;;;;;;EAoBT,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,KAAK,EAAE,EAAE,CAAC,GAAG,KAAK,GAAG,CAAC,KAAK,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;EAGvE,YAAY;;MAER,CAAC;IACL,CAAC;CACF"}
@@ -0,0 +1,3 @@
1
+ export * from './instruction-adherence.js';
2
+ export * from './prompts.js';
3
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/index.ts"],"names":[],"mappings":"AAAA,cAAc,4BAA4B,CAAC;AAC3C,cAAc,cAAc,CAAC"}
@@ -0,0 +1,378 @@
1
// __awaiter(thisArg, _arguments, P, generator): runs a generator function to
// completion and returns a promise for its final return value.
//
// - An existing `this.__awaiter` is reused when one is already defined; otherwise
//   the function below is installed.
// - `P` is the promise constructor to use and falls back to the global `Promise`.
// - `generator` is invoked with `thisArg` as `this` and `_arguments || []` as its
//   argument list, then advanced one step at a time by `step`.
// - `step` resolves the outer promise once the generator reports `done`; otherwise
//   the yielded value is wrapped in a `P` (unless it already is one, see `adopt`)
//   and the generator is resumed with the settled value (`fulfilled`) or has the
//   rejection thrown into it (`rejected`).
// - Any exception escaping `generator.next` / `generator["throw"]` rejects the
//   outer promise.
//
// NOTE(review): this matches the helper the TypeScript compiler emits for
// down-levelled async functions, so it is presumably generated rather than
// hand-written — confirm before editing by hand.
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { JudgevalScorer } from '../../base-scorer.js';
11
+ import { APIScorer } from '../../../constants.js';
12
+ import { info } from '../../../common/logger.js';
13
+ import { InstructionAdherenceTemplate, InstructionsSchema, VerdictsSchema } from './prompts.js';
14
+ import { createJudge } from '../../../judges/index.js';
15
// Example fields that must be present (truthy) before this scorer can run.
const required_params = ['input', 'actualOutput'];
/**
 * Parse a raw judge response as JSON and return `data[key]`, or `[]` when that
 * property is falsy. No schema validation is applied.
 *
 * @throws Error "Failed to parse response: ..." when the text is not valid JSON
 *         or the property cannot be read from the parsed value.
 */
function parseLooseList(response, key) {
    try {
        const data = JSON.parse(response);
        return data[key] || [];
    }
    catch (error) {
        throw new Error(`Failed to parse response: ${error}`);
    }
}
/**
 * Parse a raw judge response as JSON, validate it with `schema.safeParse`, and
 * return the validated `data[key]`.
 *
 * @throws SyntaxError when the text is not valid JSON.
 * @throws Error "Invalid response format: ..." when validation fails.
 */
function parseValidatedList(response, schema, key) {
    const candidate = JSON.parse(response);
    const outcome = schema.safeParse(candidate);
    if (outcome.success) {
        return outcome.data[key];
    }
    throw new Error(`Invalid response format: ${outcome.error}`);
}
/**
 * Synchronously ask the scorer's judge for a list-valued JSON property.
 *
 * Native-model scorers parse the single response loosely. Otherwise the first
 * response is schema-validated; if that attempt fails for any reason (request,
 * JSON, or validation error), the model is asked a second time and that second
 * response is parsed loosely, without validation.
 */
function requestList(scorer, prompt, schema, key) {
    if (scorer.using_native_model) {
        return parseLooseList(scorer.model.generate(prompt), key);
    }
    try {
        return parseValidatedList(scorer.model.generate(prompt), schema, key);
    }
    catch (error) {
        return parseLooseList(scorer.model.generate(prompt), key);
    }
}
/**
 * Asynchronous counterpart of `requestList`, using `model.aGenerate`.
 * Request order and fallback behaviour are the same.
 */
function requestListAsync(scorer, prompt, schema, key) {
    return __awaiter(this, void 0, void 0, function* () {
        if (scorer.using_native_model) {
            const nativeResponse = yield scorer.model.aGenerate(prompt);
            return parseLooseList(nativeResponse, key);
        }
        try {
            const firstResponse = yield scorer.model.aGenerate(prompt);
            return parseValidatedList(firstResponse, schema, key);
        }
        catch (error) {
            const retryResponse = yield scorer.model.aGenerate(prompt);
            return parseLooseList(retryResponse, key);
        }
    });
}
/**
 * Derive score, reason and success from the scorer's current instructions and
 * verdicts, store them on the scorer, and return the result record.
 * May throw (e.g. on an invalid verdict score); callers convert that into an
 * error result.
 */
function buildSuccessResult(scorer) {
    const additional_metadata = {
        instructions: scorer._instructions,
        verdicts: scorer._verdicts
    };
    scorer.score = scorer._computeScore();
    scorer.reason = scorer._verdicts.length > 0 ? JSON.stringify(scorer._verdicts) : 'No instructions found';
    scorer.success = scorer._successCheck();
    const verbose_logs = scorer._createVerboseLogs();
    info(`Scoring completed with score: ${scorer.score}`);
    // Ensure all fields match the ScorerData interface
    return {
        name: scorer.type,
        threshold: scorer.threshold,
        success: scorer.success,
        score: scorer.score,
        reason: scorer.reason || "",
        strict_mode: scorer.strict_mode,
        evaluation_model: scorer.evaluation_model || null,
        error: null,
        evaluation_cost: null,
        verbose_logs: verbose_logs,
        additional_metadata: additional_metadata
    };
}
/**
 * Record a scoring failure on the scorer and return a failed result record
 * (score 0, success false) that carries the error message.
 */
function buildErrorResult(scorer, error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    // NOTE(review): `scorer.error` is never cleared by a later successful run on
    // the same instance — confirm whether JudgevalScorer._successCheck reads it.
    scorer.error = errorMessage;
    return {
        name: scorer.type,
        threshold: scorer.threshold,
        success: false,
        score: 0,
        reason: `Error during scoring: ${errorMessage}`,
        strict_mode: scorer.strict_mode,
        evaluation_model: scorer.evaluation_model || null,
        error: errorMessage,
        evaluation_cost: null,
        verbose_logs: null,
        additional_metadata: {}
    };
}
/**
 * InstructionAdherenceScorer evaluates how well an LLM follows instructions
 * by extracting instructions from the input and checking if they are followed in the output.
 *
 * The score is the average of the per-instruction verdict scores
 * (1 = followed, 0.5 = partially followed, 0 = not followed). When no
 * instructions are extracted the score is 1.
 */
export class InstructionAdherenceScorer extends JudgevalScorer {
    /**
     * Create a new InstructionAdherenceScorer
     *
     * @param threshold - Success threshold (default: 0.5); replaced by 1 when strict_mode is on
     * @param model - Model to use for evaluation; passed to createJudge (default: undefined)
     * @param include_reason - Whether to include a reason for the score (default: true)
     * @param async_mode - Whether to use async mode (default: false)
     * @param strict_mode - Whether to use strict mode (default: false)
     * @param verbose_mode - Whether to include verbose logs (default: false)
     */
    constructor(threshold = 0.5, model = undefined, include_reason = true, async_mode = false, strict_mode = false, verbose_mode = false) {
        super(APIScorer.INSTRUCTION_ADHERENCE, strict_mode ? 1 : threshold, undefined, include_reason, async_mode, strict_mode, verbose_mode);
        this._instructions = [];
        this._verdicts = [];
        const { judge, usingNativeModel } = createJudge(model);
        this.model = judge;
        this.using_native_model = usingNativeModel;
        this.evaluation_model = this.model.getModelName();
    }
    /**
     * Extract instructions from input text
     *
     * @returns Promise of the `instructions` list from the judge response
     */
    _aGetInstructions(input) {
        return __awaiter(this, void 0, void 0, function* () {
            const prompt = InstructionAdherenceTemplate.getInstructions(input);
            return yield requestListAsync(this, prompt, InstructionsSchema, 'instructions');
        });
    }
    /**
     * Extract instructions from input text (synchronous)
     *
     * @returns The `instructions` list from the judge response
     */
    _getInstructions(input) {
        const prompt = InstructionAdherenceTemplate.getInstructions(input);
        return requestList(this, prompt, InstructionsSchema, 'instructions');
    }
    /**
     * Generate verdicts for each instruction
     *
     * @returns Promise of the `verdicts` list; resolves to [] without calling
     *          the model when there are no instructions
     */
    _aGetVerdicts(instructions, actualOutput) {
        return __awaiter(this, void 0, void 0, function* () {
            if (instructions.length === 0) {
                return [];
            }
            const prompt = InstructionAdherenceTemplate.generateVerdicts(instructions, actualOutput);
            return yield requestListAsync(this, prompt, VerdictsSchema, 'verdicts');
        });
    }
    /**
     * Generate verdicts for each instruction (synchronous)
     *
     * @returns The `verdicts` list; [] without calling the model when there
     *          are no instructions
     */
    _getVerdicts(instructions, actualOutput) {
        if (instructions.length === 0) {
            return [];
        }
        const prompt = InstructionAdherenceTemplate.generateVerdicts(instructions, actualOutput);
        return requestList(this, prompt, VerdictsSchema, 'verdicts');
    }
    /**
     * Calculate the instruction adherence score
     *
     * @returns 1 when there are no verdicts, otherwise the mean verdict score
     * @throws Error when a verdict has no finite numeric score. Verdicts from
     *         the unvalidated parsing paths can lack one, which previously
     *         produced NaN (or string concatenation) instead of an error.
     */
    _computeScore() {
        if (this._verdicts.length === 0) {
            return 1;
        }
        let totalScore = 0;
        for (const verdict of this._verdicts) {
            const raw = verdict ? verdict.score : undefined;
            // Accept numeric strings such as "0.5"; anything else must already be a number.
            const value = typeof raw === 'string' && raw.trim() !== '' ? Number(raw) : raw;
            if (typeof value !== 'number' || !Number.isFinite(value)) {
                throw new Error(`Invalid verdict score: ${JSON.stringify(raw)}`);
            }
            totalScore += value;
        }
        return totalScore / this._verdicts.length;
    }
    /**
     * Create verbose logs for debugging
     *
     * @returns null unless verbose_mode is on; otherwise the extracted
     *          instructions followed by the current score and reason
     */
    _createVerboseLogs() {
        if (!this.verbose_mode) {
            return null;
        }
        const steps = [
            `Instructions:\n${JSON.stringify(this._instructions, null, 2)}`,
            `Score: ${this.score}\nReason: ${this.reason || "No reason provided"}`
        ];
        return steps.join('\n\n');
    }
    /**
     * Check if example has required parameters
     *
     * @throws Error naming the first field in required_params that is missing or falsy
     */
    _checkExampleParams(example) {
        for (const param of required_params) {
            if (!example[param]) {
                throw new Error(`Example is missing required parameter: ${param}`);
            }
        }
    }
    /**
     * Score an example synchronously
     *
     * Any failure while checking, prompting, parsing or scoring is caught and
     * returned as an error result instead of being thrown.
     */
    syncScoreExample(example) {
        info("Starting example scoring (sync mode)");
        try {
            this._checkExampleParams(example);
            this._instructions = this._getInstructions(example.input);
            this._verdicts = this._getVerdicts(this._instructions, example.actualOutput);
            return buildSuccessResult(this);
        }
        catch (error) {
            return buildErrorResult(this, error);
        }
    }
    /**
     * Score an example asynchronously
     *
     * Delegates to syncScoreExample when async_mode is off. Failures are
     * returned as an error result instead of rejecting.
     */
    scoreExample(example) {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.async_mode) {
                return this.syncScoreExample(example);
            }
            info("Starting example scoring (async mode)");
            try {
                this._checkExampleParams(example);
                this._instructions = yield this._aGetInstructions(example.input);
                this._verdicts = yield this._aGetVerdicts(this._instructions, example.actualOutput);
                return buildSuccessResult(this);
            }
            catch (error) {
                return buildErrorResult(this, error);
            }
        });
    }
    /**
     * Get the name of the scorer
     */
    get name() {
        return "Instruction Adherence";
    }
}
378
+ //# sourceMappingURL=instruction-adherence.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"instruction-adherence.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/instruction-adherence.ts"],"names":[],"mappings":";;;;;;;;;AAEA,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAO,IAAI,EAAe,MAAM,2BAA2B,CAAC;AACnE,OAAO,EACL,4BAA4B,EAE5B,kBAAkB,EAClB,cAAc,EACf,MAAM,cAAc,CAAC;AACtB,OAAO,EAAS,WAAW,EAAE,MAAM,0BAA0B,CAAC;AAE9D,sCAAsC;AACtC,MAAM,eAAe,GAAG,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;AAElD;;;;;GAKG;AACH,MAAM,OAAO,0BAA2B,SAAQ,cAAc;IAM5D;;;;;;;;;OASG;IACH,YACE,YAAoB,GAAG,EACvB,QAAoC,SAAS,EAC7C,iBAA0B,IAAI,EAC9B,aAAsB,KAAK,EAC3B,cAAuB,KAAK,EAC5B,eAAwB,KAAK;QAE7B,KAAK,CACH,SAAS,CAAC,qBAAqB,EAC/B,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,EAC3B,SAAS,EACT,cAAc,EACd,UAAU,EACV,WAAW,EACX,YAAY,CACb,CAAC;QA7BI,kBAAa,GAAa,EAAE,CAAC;QAC7B,cAAS,GAAkC,EAAE,CAAC;QA8BpD,MAAM,EAAE,KAAK,EAAE,gBAAgB,EAAE,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC;QACvD,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,kBAAkB,GAAG,gBAAgB,CAAC;QAC3C,IAAI,CAAC,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;IACpD,CAAC;IAED;;OAEG;IACW,iBAAiB,CAAC,KAAa;;YAC3C,MAAM,MAAM,GAAG,4BAA4B,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;YAEnE,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBAC/C,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;gBACjC,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;gBACxD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC;oBACH,oDAAoD;oBACpD,MAAM,yBAAyB,GAAG,CAAC,QAAgB,EAA8B,EAAE;wBACjF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;wBACpC,MAAM,MAAM,GAAG,kBAAkB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;wBACpD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;4BACnB,OAAO,MAAM,CAAC,IAAI,CAAC;wBACrB,CAAC;wBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;oBAC9D,CAAC,CAAC;oBAEF,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,OAAO,yBAAyB,CAAC,GAAG,CAAC,CAAC,YAAY,CAAC;g
BACrD,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,IAAI,CAAC;wBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;wBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;oBACjC,CAAC;oBAAC,OAAO,UAAU,EAAE,CAAC;wBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;oBAC7D,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACK,gBAAgB,CAAC,KAAa;QACpC,MAAM,MAAM,GAAG,4BAA4B,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QAEnE,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACxC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;YACjC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,IAAI,CAAC;gBACH,oDAAoD;gBACpD,MAAM,yBAAyB,GAAG,CAAC,QAAgB,EAA8B,EAAE;oBACjF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;oBACpC,MAAM,MAAM,GAAG,kBAAkB,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBACpD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;wBACnB,OAAO,MAAM,CAAC,IAAI,CAAC;oBACrB,CAAC;oBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC9D,CAAC,CAAC;gBAEF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,OAAO,yBAAyB,CAAC,GAAG,CAAC,CAAC,YAAY,CAAC;YACrD,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,YAAY,IAAI,EAAE,CAAC;gBACjC,CAAC;gBAAC,OAAO,UAAU,EAAE,CAAC;oBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACW,aAAa,CAAC,YAAsB,EAAE,YAAoB;;YACtE,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC9B,OAAO,EAAE,CAAC;YACZ,CAAC;YAED,MAAM,MAAM,GAAG,4BAA4B,CAAC,gBAAgB,CAAC,YAAY,EAAE,YAAY,CAAC,CAAC;YAEzF,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAC5B,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBAC/C,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,
CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC7B,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;gBACxD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,CAAC;oBACH,oDAAoD;oBACpD,MAAM,qBAAqB,GAAG,CAAC,QAAgB,EAA+C,EAAE;wBAC9F,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;wBACpC,MAAM,MAAM,GAAG,cAAc,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;wBAChD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;4BACnB,OAAO,MAAM,CAAC,IAAI,CAAC;wBACrB,CAAC;wBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;oBAC9D,CAAC,CAAC;oBAEF,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,OAAO,qBAAqB,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBAC7C,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAC/C,IAAI,CAAC;wBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;wBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;oBAC7B,CAAC;oBAAC,OAAO,UAAU,EAAE,CAAC;wBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;oBAC7D,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACK,YAAY,CAAC,YAAsB,EAAE,YAAoB;QAC/D,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC9B,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,MAAM,MAAM,GAAG,4BAA4B,CAAC,gBAAgB,CAAC,YAAY,EAAE,YAAY,CAAC,CAAC;QAEzF,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACxC,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;gBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;YAC7B,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,IAAI,KAAK,CAAC,6BAA6B,KAAK,EAAE,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,IAAI,CAAC;gBACH,oDAAoD;gBACpD,MAAM,qBAAqB,GAAG,CAAC,QAAgB,EAA+C,EAAE;oBAC9F,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;oBACpC,MAAM,MAAM,GAAG,cAAc,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;oBAChD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;wBACnB,OAAO,MAAM,CAAC,IAAI,CAAC;oBACrB,CAAC;oBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC9D,CAAC,CAAC;gBAEF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,OAAO,qBAAqB,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YAC7C,CAAC;YAAC,OAAO,KAAK,
EAAE,CAAC;gBACf,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBACxC,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;oBAC7B,OAAO,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC7B,CAAC;gBAAC,OAAO,UAAU,EAAE,CAAC;oBACpB,MAAM,IAAI,KAAK,CAAC,6BAA6B,UAAU,EAAE,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACK,aAAa;QACnB,IAAI,IAAI,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAChC,OAAO,CAAC,CAAC;QACX,CAAC;QAED,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YACrC,UAAU,IAAI,OAAO,CAAC,KAAK,CAAC;QAC9B,CAAC;QAED,OAAO,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC;IAC5C,CAAC;IAED;;OAEG;IACK,kBAAkB;QACxB,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,KAAK,GAAG;YACZ,kBAAkB,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE;YAC/D,UAAU,IAAI,CAAC,KAAK,aAAa,IAAI,CAAC,MAAM,IAAI,oBAAoB,EAAE;SACvE,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC5B,CAAC;IAED;;OAEG;IACK,mBAAmB,CAAC,OAAgB;QAC1C,KAAK,MAAM,KAAK,IAAI,eAAe,EAAE,CAAC;YACpC,IAAI,KAAK,KAAK,OAAO,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;gBACxC,MAAM,IAAI,KAAK,CAAC,8CAA8C,CAAC,CAAC;YAClE,CAAC;iBAAM,IAAI,KAAK,KAAK,cAAc,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC;gBAC7D,MAAM,IAAI,KAAK,CAAC,qDAAqD,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,gBAAgB,CAAC,OAAgB;QAC/B,IAAI,CAAC,sCAAsC,CAAC,CAAC;QAE7C,IAAI,CAAC;YACH,4BAA4B;YAC5B,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;YAElC,kBAAkB;YAClB,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,KAAM,CAAC,CAAC;YAC3D,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,aAAa,EAAE,OAAO,CAAC,YAAsB,CAAC,CAAC;YAEvF,uDAAuD;YACvD,MAAM,mBAAmB,GAAG;gBAC1B,YAAY,EAAE,IAAI,CAAC,aAAa;gBAChC,QAAQ,EAAE,IAAI,CAAC,SAAS;aACzB,CAAC;YAEF,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;YAClC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,uBAAuB,CAAC;YACnG,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;YACpC,MAAM,YAAY,GAAG,IAAI,CAAC,kBAAkB,EAAE,CAAC;YAE/C,IAAI
,CAAC,iCAAiC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;YAEpD,mDAAmD;YACnD,OAAO;gBACL,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,SAAS,EAAE,IAAI,CAAC,SAAS;gBACzB,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,EAAE;gBACzB,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;gBAC/C,KAAK,EAAE,IAAI;gBACX,eAAe,EAAE,IAAI;gBACrB,YAAY,EAAE,YAAY;gBAC1B,mBAAmB,EAAE,mBAAmB;aACzC,CAAC;QACJ,CAAC;QAAC,OAAO,KAAU,EAAE,CAAC;YACpB,gBAAgB;YAChB,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,IAAI,CAAC,KAAK,GAAG,YAAY,CAAC;YAE1B,OAAO;gBACL,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,SAAS,EAAE,IAAI,CAAC,SAAS;gBACzB,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,CAAC;gBACR,MAAM,EAAE,yBAAyB,YAAY,EAAE;gBAC/C,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;gBAC/C,KAAK,EAAE,YAAY;gBACnB,eAAe,EAAE,IAAI;gBACrB,YAAY,EAAE,IAAI;gBAClB,mBAAmB,EAAE,EAAE;aACxB,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACG,YAAY,CAAC,OAAgB;;YACjC,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;gBACrB,OAAO,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;YACxC,CAAC;YAED,IAAI,CAAC,uCAAuC,CAAC,CAAC;YAE9C,IAAI,CAAC;gBACH,4BAA4B;gBAC5B,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,CAAC;gBAElC,kBAAkB;gBAClB,IAAI,CAAC,aAAa,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,OAAO,CAAC,KAAM,CAAC,CAAC;gBAClE,IAAI,CAAC,SAAS,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,aAAa,EAAE,OAAO,CAAC,YAAsB,CAAC,CAAC;gBAE9F,uDAAuD;gBACvD,MAAM,mBAAmB,GAAG;oBAC1B,YAAY,EAAE,IAAI,CAAC,aAAa;oBAChC,QAAQ,EAAE,IAAI,CAAC,SAAS;iBACzB,CAAC;gBAEF,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;gBAClC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,uBAAuB,CAAC;gBACnG,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;gBACpC,MAAM,YAAY,GAAG,IAAI,CAAC,kBAAkB,EAAE,CAAC;gBAE/C,IAAI,CAAC,iCAAiC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;gBAEpD,mDAAmD;gBACnD,OAAO;oBACL,IAAI,EAAE,IAAI,CAAC,IAAI;oBACf,SAAS,EAAE,IAAI,CAAC,SAAS;oBACzB,OAAO,EAAE,IAAI,CAAC,OAAO;oBACrB,KAAK,EAAE,IAAI,CAAC,KAAK;o
BACjB,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,EAAE;oBACzB,WAAW,EAAE,IAAI,CAAC,WAAW;oBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;oBAC/C,KAAK,EAAE,IAAI;oBACX,eAAe,EAAE,IAAI;oBACrB,YAAY,EAAE,YAAY;oBAC1B,mBAAmB,EAAE,mBAAmB;iBACzC,CAAC;YACJ,CAAC;YAAC,OAAO,KAAU,EAAE,CAAC;gBACpB,gBAAgB;gBAChB,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAC5E,IAAI,CAAC,KAAK,GAAG,YAAY,CAAC;gBAE1B,OAAO;oBACL,IAAI,EAAE,IAAI,CAAC,IAAI;oBACf,SAAS,EAAE,IAAI,CAAC,SAAS;oBACzB,OAAO,EAAE,KAAK;oBACd,KAAK,EAAE,CAAC;oBACR,MAAM,EAAE,yBAAyB,YAAY,EAAE;oBAC/C,WAAW,EAAE,IAAI,CAAC,WAAW;oBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB,IAAI,IAAI;oBAC/C,KAAK,EAAE,YAAY;oBACnB,eAAe,EAAE,IAAI;oBACrB,YAAY,EAAE,IAAI;oBAClB,mBAAmB,EAAE,EAAE;iBACxB,CAAC;YACJ,CAAC;QACH,CAAC;KAAA;IAED;;OAEG;IACH,IAAI,IAAI;QACN,OAAO,uBAAuB,CAAC;IACjC,CAAC;CACF"}
@@ -0,0 +1,120 @@
1
+ import { z } from 'zod';
2
+ /**
3
+ * Schema for a single instruction adherence verdict: the instruction text, its numeric score, and the reason given for that score. The score is only required to be a number; no range is enforced by this schema.
4
+ */
5
+ export const VerdictSchema = z.object({
6
+ instruction: z.string(),
7
+ score: z.number(),
8
+ reason: z.string()
9
+ });
10
+ /**
11
+ * Schema for a list of verdicts: an object whose "verdicts" key holds an array of VerdictSchema entries.
12
+ */
13
+ export const VerdictsSchema = z.object({
14
+ verdicts: z.array(VerdictSchema)
15
+ });
16
+ /**
17
+ * Schema for a list of instructions: an object whose "instructions" key holds an array of strings.
18
+ */
19
+ export const InstructionsSchema = z.object({
20
+ instructions: z.array(z.string())
21
+ });
22
/**
 * Templates for prompts used in the InstructionAdherenceScorer.
 *
 * Both methods are pure string builders: they interpolate their arguments
 * into a fixed few-shot prompt and return the resulting text. Neither method
 * validates or escapes its arguments.
 */
export class InstructionAdherenceTemplate {
    /**
     * Generate a prompt to extract instructions from input text.
     *
     * The prompt asks for a JSON object whose "instructions" key maps to a
     * list of strings.
     *
     * @param {string} input Text to break down; inserted verbatim after "Text:".
     * @returns {string} The complete prompt, ending with "JSON:".
     */
    static getInstructions(input) {
        // The example output repeats the instruction exactly as it is worded in
        // the example text ("poems"), so the few-shot does not contradict itself.
        return `You will be presented with a piece of text. Your task is to break down the text and generate a list of the instructions contained within the text.

===== START OF EXAMPLES =====
Example 1:
Example text: Hello my name is John Doe. I like cars. Write two poems about the weather and create a joke. Also what is 5 + 5?

Output:
{
"instructions": ["Write two poems about the weather", "Create a joke", "What is 5 + 5?"]
}
===== END OF EXAMPLES =====

**
IMPORTANT: Please return your answer in valid JSON format, with the "instructions" key mapping to a list of strings. No words or explanation is needed.
**

==== START OF INPUT ====
Text:
${input}
==== END OF INPUT ====

==== YOUR ANSWER ====
JSON:`;
    }
    /**
     * Generate a prompt to evaluate adherence to instructions.
     *
     * The prompt asks for one verdict per instruction with a score of 1
     * (completed), 0.5 (partially completed) or 0 (not completed) plus a reason.
     *
     * @param {string[]} instructions Instructions to judge; serialized with JSON.stringify.
     * @param {string} actualOutput Text to judge against the instructions; inserted verbatim.
     * @returns {string} The complete prompt, ending with "JSON:".
     */
    static generateVerdicts(instructions, actualOutput) {
        // Each example verdict echoes its instruction verbatim from the example
        // instruction list ("poems"), matching how the input list is presented.
        return `You will be presented with a list of instructions and a piece of text. For each instruction, determine if the instruction was completed in the text. There are 3 categories: either completed, partially completed, or not completed. The scores for these will be 1, 0.5, and 0 respectively.
Go through each instruction and provide score for each instruction as well as the reasoning for that score.

==== FORMATTING YOUR ANSWER ====
Please return your answer in JSON format, with a list of JSON objects with keys "instruction", "score", and "reason". No words or explanation beyond the output JSON is needed.


===== START OF EXAMPLES =====
Example 1:
instructions: ["Write two poems about the weather", "Create a joke", "What is 5 + 5?"]
output: Poem 1: The Sun's Embrace
The sun climbs high, a golden flame,
It whispers warmth, it calls my name.
The sky, a canvas, blue and clear,
A perfect day for cars, my dear.

The asphalt hums beneath the wheels,
A symphony of speed it feels.
The weather smiles, no clouds in sight,
A driver's joy, pure delight.

Poem 2: The Storm's Dance
A sunlit meadow, alive with whispers of wind, where daisies dance and hope begins again. Each petal holds a promise—bright, unbruised— a symphony of light that cannot be refused.

Joke
Why dont cars ever get cold in the winter?
Because they have radiators!

Math Answer
5 + 5 = 10

YOUR JSON OUTPUT:
{
"verdicts": [
{
"instruction": "Write two poems about the weather",
"score": 0.5,
"reason": "The output contained one poem about the weather, but the other poem was not about the weather."
},
{
"instruction": "Create a joke",
"score": 1,
"reason": "There was a joke created in the output."
},
{
"instruction": "What is 5 + 5?",
"score": 1,
"reason": "The answer to the math question was provided in the output."
}
]
}
===== END OF EXAMPLES =====

==== START OF INPUT ====
instructions: ${JSON.stringify(instructions)}
output: ${actualOutput}
==== END OF INPUT ====

==== YOUR ANSWER ====
JSON:`;
    }
}
120
+ //# sourceMappingURL=prompts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../../../../src/scorers/metrics/instruction-adherence/prompts.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB;;GAEG;AACH,MAAM,CAAC,MAAM,aAAa,GAAG,CAAC,CAAC,MAAM,CAAC;IACpC,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE;IACvB,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE;IACjB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;CACnB,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACrC,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,aAAa,CAAC;CACjC,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC;IACzC,YAAY,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;CAClC,CAAC,CAAC;AAYH;;GAEG;AACH,MAAM,OAAO,4BAA4B;IACvC;;OAEG;IACH,MAAM,CAAC,eAAe,CAAC,KAAa;QAClC,OAAO;;;;;;;;;;;;;;;;;;EAkBT,KAAK;;;;MAID,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,gBAAgB,CAAC,YAAsB,EAAE,YAAoB;QAClE,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gBAsDK,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC;UAClC,YAAY;;;;MAIhB,CAAC;IACL,CAAC;CACF"}
@@ -8,7 +8,7 @@ export declare function debug(message: string, meta?: Record<string, any>): void
8
8
  /**
9
9
  * Log an info message (alias for info)
10
10
  */
11
- export declare function log(message: string, ...args: any[]): void;
11
+ export declare function log(message: string, meta?: Record<string, any>): void;
12
12
  /**
13
13
  * Log an info message
14
14
  */
@@ -22,6 +22,7 @@ export declare enum APIScorer {
22
22
  export declare const UNBOUNDED_SCORERS: Set<APIScorer>;
23
23
  export declare const ROOT_API = "https://api.judgmentlabs.ai";
24
24
  export declare const JUDGMENT_DATASETS_PUSH_API_URL = "https://api.judgmentlabs.ai/datasets/push/";
25
+ export declare const JUDGMENT_DATASETS_APPEND_API_URL = "https://api.judgmentlabs.ai/datasets/insert_examples/";
25
26
  export declare const JUDGMENT_DATASETS_PULL_API_URL = "https://api.judgmentlabs.ai/datasets/pull/";
26
27
  export declare const JUDGMENT_DATASETS_DELETE_API_URL = "https://api.judgmentlabs.ai/datasets/delete/";
27
28
  export declare const JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = "https://api.judgmentlabs.ai/datasets/export_jsonl/";
@@ -34,6 +34,11 @@ export declare class EvalDatasetClient {
34
34
  * @returns AxiosResponse containing the stream if successful.
35
35
  */
36
36
  exportJsonl(alias: string, projectName: string): Promise<AxiosResponse>;
37
+ /**
38
+ * Appends examples to an existing dataset on the Judgment platform.
39
+ * @returns True if successful, false otherwise.
40
+ */
41
+ append(alias: string, examples: Example[], projectName: string): Promise<boolean>;
37
42
  private getAuthHeaders;
38
43
  private handleApiError;
39
44
  }