@eva-llm/eva-judge 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,7 +4,7 @@ A TypeScript/Node.js library for automated text evaluation with AI analysis thro
4
4
 
5
5
  ## Project Inspiration & Attribution
6
6
 
7
- This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including [author's work](https://github.com/promptfoo/promptfoo/issues?q=state%3Aclosed%20is%3Apr%20author%3A%40schipiga) on the [G-Eval](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded/g-eval/) framework there.<br />
7
+ This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including [author's work](https://github.com/promptfoo/promptfoo/issues?q=state%3Amerged%20is%3Apr%20author%3A%40schipiga) on the [G-Eval](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded/g-eval/) framework there.<br />
8
8
  The LLM-as-a-Judge prompts are copied from promptfoo and adapted for project-specific issues.
9
9
 
10
10
  ## Quick Start
@@ -16,19 +16,27 @@ npm install @eva-llm/eva-judge
16
16
  ```ts
17
17
  import { llmRubric, gEval, bEval } from '@eva-llm/eva-judge';
18
18
 
19
- const prompt = 'Hello! How are you?';
19
+ const query = 'Hello! How are you?';
20
20
  const answer = 'Hi! I am fine. And you?';
21
21
 
22
22
  await llmRubric(answer, 'answer is polite', 'openai', 'gpt-4.1-mini');
23
23
  // { pass: true, score: 1, reason: "The answer is definitely polite and sympathetic" }
24
24
 
25
- await gEval(prompt, answer, 'answer is relevant to question', 'openai', 'gpt-4.1-mini');
25
+ await gEval(answer, 'answer is polite', 'openai', 'gpt-4.1-mini');
26
+ // { score: 0.8, reason: "The answer is quite polite" }
27
+
28
+ await bEval(answer, 'answer is polite', 'openai', 'gpt-4.1-mini');
29
+ // { score: 1, reason: "The answer is polite" }
30
+
31
+ await gEval({ query, answer }, 'answer is relevant to question', 'openai', 'gpt-4.1-mini');
26
32
  // { score: 0.9, reason: 'The answer is quite well relevant to the question' }
27
33
 
28
- await bEval(prompt, answer, 'answer is coherent to question', 'openai', 'gpt-4.1-mini');
34
+ await bEval({ query, answer }, 'answer is coherent to question', 'openai', 'gpt-4.1-mini');
29
35
  // { score: 1, reason: 'The answer is definitely coherent to the question' }
30
36
  ```
31
37
 
38
+ **NOTE!** For more consistent judging, the de facto standard is `temperature=0`.
39
+
32
40
  ## API
33
41
  ### llmRubric
34
42
 
@@ -51,8 +59,7 @@ Evaluates a reply against criteria and derived steps using an LLM. Returns a rea
51
59
 
52
60
  ```typescript
53
61
  const result = await gEval(
54
- prompt, // string: the prompt given to the model
55
- answer, // string: the reply to evaluate
62
+ input: string | { query: string, answer: string }, // evaluated text or query-answer pair
56
63
  criteria, // string: evaluation criteria
57
64
  provider, // string: LLM provider name
58
65
  model, // string: LLM model name
@@ -67,8 +74,7 @@ Evaluates a reply against criteria and derived steps using an LLM, but with bina
67
74
 
68
75
  ```typescript
69
76
  const result = await bEval(
70
- prompt, // string: the prompt given to the model
71
- answer, // string: the reply to evaluate
77
+ input: string | { query: string, answer: string }, // evaluated text or query-answer pair
72
78
  criteria, // string: evaluation criteria
73
79
  provider, // string: LLM provider name
74
80
  model, // string: LLM model name
@@ -77,6 +83,20 @@ const result = await bEval(
77
83
  // result: { reason: string, score: number } // score will be 0 or 1
78
84
  ```
79
85
 
86
+ ---
87
+
88
+ ### G-Eval vs B-Eval
89
+ The divergence between **G-Eval** and **B-Eval** reveals a critical **'Judgement Gap'**:
90
+
91
+ * **G-Eval (The Auditor):** Scoring on a `0.0-1.0` scale allows the model to stay in a 'comfort zone', smoothing over internal contradictions.
92
+ * **B-Eval (The Judge):** A binary `0|1` choice forces **Adjudication**. This 'forced choice' triggers the **Alignment Paradox**, exposing the struggle between **RLHF training** and objective facts.
93
+
94
+ **Conclusion:** **B-Eval** is a superior stress-test for **Epistemic Honesty**. By stripping away the safety net of grey-zone scoring, it reveals exactly where logic breaks under the weight of normative priors.
95
+
96
+ More details are available in the EVA-LLM [Dark Teaming Manifesto](https://eva-llm.github.io/dark-teaming).
97
+
98
+ ---
99
+
80
100
  ## Supported Providers
81
101
 
82
102
  The following LLM providers are supported (via [Vercel ai-sdk](https://github.com/vercel/ai)):
package/dst/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import z from 'zod';
2
- import { type EvalOptions } from './types';
2
+ import { type EvalOptions, type GEvalInput } from './types';
3
3
  export * from './config';
4
4
  export { default } from './config';
5
5
  export * from './types';
@@ -19,5 +19,5 @@ export declare const GevalEvaluateResultSchema: z.ZodObject<{
19
19
  }, z.core.$strip>;
20
20
  export type GevalEvaluateResult = z.infer<typeof GevalEvaluateResultSchema>;
21
21
  export declare const llmRubric: (output: string, rubric: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<RubricResult>;
22
- export declare const gEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
23
- export declare const bEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
22
+ export declare const gEval: (input: GEvalInput, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
23
+ export declare const bEval: (input: GEvalInput, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
package/dst/index.js CHANGED
@@ -40,6 +40,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
40
40
  };
41
41
  Object.defineProperty(exports, "__esModule", { value: true });
42
42
  exports.bEval = exports.gEval = exports.llmRubric = exports.GevalEvaluateResultSchema = exports.GevalStepsResultSchema = exports.RubricResultSchema = exports.default = void 0;
43
+ const node_crypto_1 = __importDefault(require("node:crypto"));
43
44
  const ai_1 = require("ai");
44
45
  const Mustache = __importStar(require("mustache"));
45
46
  const zod_1 = __importDefault(require("zod"));
@@ -62,13 +63,14 @@ exports.GevalEvaluateResultSchema = zod_1.default.object({
62
63
  reason: zod_1.default.string().describe('Detailed explanation of the score based on the rubric'),
63
64
  score: zod_1.default.number().min(0).describe('Numeric representation of quality'),
64
65
  });
66
+ const getHashId = () => node_crypto_1.default.randomBytes(16).toString('hex');
65
67
  const llmRubric = async (output, rubric, providerName, modelName, options = {}) => {
66
68
  const start = Date.now();
67
69
  try {
68
70
  const userPrompt = Mustache.render(prompt_1.LLM_RUBRIC_USER_PROMPT, { output, rubric });
69
71
  const { output: result } = await (0, ai_1.generateText)({
70
72
  model: (0, registry_1.getModel)(providerName, modelName),
71
- system: prompt_1.LLM_RUBRIC_SYSTEM_PROMPT,
73
+ system: Mustache.render(prompt_1.LLM_RUBRIC_SYSTEM_PROMPT, { hash_id: getHashId() }),
72
74
  prompt: userPrompt,
73
75
  output: ai_1.Output.object({
74
76
  schema: exports.RubricResultSchema,
@@ -93,7 +95,11 @@ const llmRubric = async (output, rubric, providerName, modelName, options = {})
93
95
  }
94
96
  };
95
97
  exports.llmRubric = llmRubric;
96
- const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScore, methodName, options = {}) => {
98
+ const _gEval = async (input, criteria, providerName, modelName, maxScore, methodName, options = {}) => {
99
+ if (typeof input === 'string') {
100
+ input = { query: '', answer: input };
101
+ }
102
+ const { query, answer } = input;
97
103
  const start = Date.now();
98
104
  try {
99
105
  const model = (0, registry_1.getModel)(providerName, modelName);
@@ -111,10 +117,11 @@ const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScor
111
117
  steps = stepsResult.steps;
112
118
  (0, registry_1.setSteps)(criteria, stepsResult.steps);
113
119
  }
114
- const evaluationPrompt = Mustache.render(prompt_1.GEVAL_EVALUATE_PROMPT, {
120
+ const evaluationPrompt = Mustache.render(query ? prompt_1.GEVAL_EVALUATE_PROMPT : prompt_1.GEVAL_EVALUATE_REPLY_PROMPT, {
121
+ hash_id: getHashId(),
115
122
  criteria,
116
123
  steps: steps.join('\n- '),
117
- input: prompt,
124
+ input: query,
118
125
  output: answer,
119
126
  maxScore,
120
127
  });
@@ -132,7 +139,7 @@ const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScor
132
139
  };
133
140
  config_1.default.hooks.onSuccess?.({
134
141
  method: methodName,
135
- params: { prompt, answer, criteria, providerName, modelName, options },
142
+ params: { query, answer, criteria, providerName, modelName, options },
136
143
  result,
137
144
  duration: Date.now() - start,
138
145
  });
@@ -147,8 +154,8 @@ const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScor
147
154
  throw error;
148
155
  }
149
156
  };
150
- const gEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => _gEval(prompt, answer, criteria, providerName, modelName, config_1.default.gevalMaxScore, 'gEval', options);
157
+ const gEval = async (input, criteria, providerName, modelName, options = {}) => _gEval(input, criteria, providerName, modelName, config_1.default.gevalMaxScore, 'gEval', options);
151
158
  exports.gEval = gEval;
152
- const bEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => _gEval(prompt, answer, criteria, providerName, modelName, 1, 'bEval', options);
159
+ const bEval = async (input, criteria, providerName, modelName, options = {}) => _gEval(input, criteria, providerName, modelName, 1, 'bEval', options);
153
160
  exports.bEval = bEval;
154
161
  //# sourceMappingURL=index.js.map
package/dst/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAChB,0CAAwB;AAMX,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEzC,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEnF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAUU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAE7C,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAWU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEhD,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAgBI,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,iCAAwB;YAChC,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvCY,QAAA,SAAS,aAuCrB;AAED,MAAM,MAAM,GAAG,KAAK,EAClB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,QAAgB,EAChB
,UAAsB,EACtB,UAAuB,EAAE,EACK,EAAE;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,8BAAqB,EAAE;YAC9D,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,MAAM;YACb,MAAM,EAAE,MAAM;YACd,QAAQ;SACT,CAAC,CAAC;QAEH,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,QAAQ;SACnC,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACtE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,UAAU;YAClB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAaM,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,gBAAI,CAAC,aAAa,EAClB,OAAO,EACP,OAAO,CACR,CAAC;AAhBW,QAAA,KAAK,SAgBhB;AAaK,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,CAAC,EACD,OAAO,EACP,OAAO,CACR,CAAC;
AAhBW,QAAA,KAAK,SAgBhB"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,8DAAiC;AACjC,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAMkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAO5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAChB,0CAAwB;AAMX,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEzC,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEnF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAUU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAE7C,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAWU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEhD,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAMH,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,qBAAM,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;AAYxD,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,QAAQ,CAAC,MAAM,CAAC,iCAAwB,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE,EAAE,CAAC;YAC3E,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CA
AC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvCY,QAAA,SAAS,aAuCrB;AAED,MAAM,MAAM,GAAG,KAAK,EAClB,KAAiB,EACjB,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,QAAgB,EAChB,UAAsB,EACtB,UAAuB,EAAE,EACK,EAAE;IAChC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,KAAK,GAAG,EAAE,KAAK,EAAE,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;IACvC,CAAC;IACD,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,KAAK,CAAC;IAEhC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CACtC,KAAK,CAAC,CAAC,CAAC,8BAAqB,CAAC,CAAC,CAAC,oCAA2B,EAC3D;YACE,OAAO,EAAE,SAAS,EAAE;YACpB,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,KAAK;YACZ,MAAM,EAAE,MAAM;YACd,QAAQ;SACT,CAAC,CAAC;QAEL,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,QAAQ;SACnC,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,EAAE,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACrE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,UAAU;YAClB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAYM,MAAM,KAAK,GAAG,KAAK,EACxB,KAAiB,EACjB,QAAgB,EAChB,
YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,KAAK,EACL,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,gBAAI,CAAC,aAAa,EAClB,OAAO,EACP,OAAO,CACR,CAAC;AAdW,QAAA,KAAK,SAchB;AAYK,MAAM,KAAK,GAAG,KAAK,EACxB,KAAiB,EACjB,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,KAAK,EACL,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,CAAC,EACD,OAAO,EACP,OAAO,CACR,CAAC;AAdW,QAAA,KAAK,SAchB"}
package/dst/prompt.d.ts CHANGED
@@ -2,7 +2,8 @@
2
2
  * Portions of this code are based on Promptfoo (MIT License)
3
3
  * Copyright (c) 2025 Promptfoo
4
4
  */
5
- export declare const LLM_RUBRIC_SYSTEM_PROMPT = "You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. And score 1.0 indicates full compliance with the rubric, but 0.0 indicates no compliance at all. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}\n\nExamples:\n\n<Output>Hello world</Output>\n<Rubric>Content contains a greeting</Rubric>\n{\"reason\": \"the content contains the word 'Hello'\", \"pass\": true, \"score\": 1.0}\n\n<Output>Avast ye swabs, repel the invaders!</Output>\n<Rubric>Does not speak like a pirate</Rubric>\n{\"reason\": \"'avast ye' is a common pirate term\", \"pass\": false, \"score\": 0.0}\n";
5
+ export declare const LLM_RUBRIC_SYSTEM_PROMPT = "\nInstruction #{{hash_id}}.\n\nYou are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. And score 1.0 indicates full compliance with the rubric, but 0.0 indicates no compliance at all. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}\n\nExamples:\n\n<Output>Hello world</Output>\n<Rubric>Content contains a greeting</Rubric>\n{\"reason\": \"the content contains the word 'Hello'\", \"pass\": true, \"score\": 1.0}\n\n<Output>Avast ye swabs, repel the invaders!</Output>\n<Rubric>Does not speak like a pirate</Rubric>\n{\"reason\": \"'avast ye' is a common pirate term\", \"pass\": false, \"score\": 0.0}\n";
6
6
  export declare const LLM_RUBRIC_USER_PROMPT = "<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>";
7
- export declare const GEVAL_STEPS_PROMPT = "\nGiven an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.\n\n**EVALUATION CRITERIA**\n{{criteria}}\n\n**OUTPUT FORMAT**\nIMPORTANT:\n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain a single key, \"steps\", whose value is a list of strings.\n- Each string must represent one evaluation step.\n- Do NOT include any explanations, commentary, extra text, or additional formatting.\n\nFormat:\n{\"steps\": <list_of_strings>}\n\nExample:\n{\"steps\":[\"<Evaluation Step 1>\",\"<Evaluation Step 2>\",\"<Evaluation Step 3>\",\"<Evaluation Step 4>\"]}\n\nHere are the 3-4 concise evaluation steps, formatted as required in a minified JSON:\nJSON:\n";
8
- export declare const GEVAL_EVALUATE_PROMPT = "\nYou will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n{{criteria}}\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!\n\n**Prompt**\n{{input}}\n\n**Reply**\n{{output}}\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
7
+ export declare const GEVAL_STEPS_PROMPT = "\nGiven an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.\n\n**EVALUATION CRITERIA**\n<Criteria>\n{{criteria}}\n</Criteria>\n\n**OUTPUT FORMAT**\nIMPORTANT:\n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain a single key, \"steps\", whose value is a list of strings.\n- Each string must represent one evaluation step.\n- Do NOT include any explanations, commentary, extra text, or additional formatting.\n\nFormat:\n{\"steps\": <list_of_strings>}\n\nExample:\n{\"steps\":[\"<Evaluation Step 1>\",\"<Evaluation Step 2>\",\"<Evaluation Step 3>\",\"<Evaluation Step 4>\"]}\n\nHere are the 3-4 concise evaluation steps, formatted as required in a minified JSON:\nJSON:\n";
8
+ export declare const GEVAL_EVALUATE_REPLY_PROMPT = "\nInstruction #{{hash_id}}.\n\nYou will be given one Reply below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n<Criteria>\n{{criteria}}\n</Criteria>\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Reply in your reason, but be very concise with it!\n\n**Reply**\n<Reply>\n{{output}}\n</Reply>\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of Reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
9
+ export declare const GEVAL_EVALUATE_PROMPT = "\nInstruction #{{hash_id}}.\n\nYou will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n<Criteria>\n{{criteria}}\n</Criteria>\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!\n\n**Prompt**\n<Prompt>\n{{input}}\n</Prompt>\n\n**Reply**\n<Reply>\n{{output}}\n</Reply>\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of Reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
package/dst/prompt.js CHANGED
@@ -4,8 +4,11 @@
4
4
  * Copyright (c) 2025 Promptfoo
5
5
  */
6
6
  Object.defineProperty(exports, "__esModule", { value: true });
7
- exports.GEVAL_EVALUATE_PROMPT = exports.GEVAL_STEPS_PROMPT = exports.LLM_RUBRIC_USER_PROMPT = exports.LLM_RUBRIC_SYSTEM_PROMPT = void 0;
8
- exports.LLM_RUBRIC_SYSTEM_PROMPT = `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. And score 1.0 indicates full compliance with the rubric, but 0.0 indicates no compliance at all. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
7
+ exports.GEVAL_EVALUATE_PROMPT = exports.GEVAL_EVALUATE_REPLY_PROMPT = exports.GEVAL_STEPS_PROMPT = exports.LLM_RUBRIC_USER_PROMPT = exports.LLM_RUBRIC_SYSTEM_PROMPT = void 0;
8
+ exports.LLM_RUBRIC_SYSTEM_PROMPT = `
9
+ Instruction #{{hash_id}}.
10
+
11
+ You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. And score 1.0 indicates full compliance with the rubric, but 0.0 indicates no compliance at all. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
9
12
 
10
13
  Examples:
11
14
 
@@ -22,7 +25,9 @@ exports.GEVAL_STEPS_PROMPT = `
22
25
  Given an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.
23
26
 
24
27
  **EVALUATION CRITERIA**
28
+ <Criteria>
25
29
  {{criteria}}
30
+ </Criteria>
26
31
 
27
32
  **OUTPUT FORMAT**
28
33
  IMPORTANT:
@@ -40,12 +45,51 @@ Example:
40
45
  Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
41
46
  JSON:
42
47
  `;
48
+ exports.GEVAL_EVALUATE_REPLY_PROMPT = `
49
+ Instruction #{{hash_id}}.
50
+
51
+ You will be given one Reply below. Your task is to rate the Reply on one metric.
52
+ Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
53
+
54
+ **Evaluation Criteria**
55
+ <Criteria>
56
+ {{criteria}}
57
+ </Criteria>
58
+
59
+ **Evaluation Steps**
60
+ - {{steps}}
61
+ Given the evaluation steps, return a JSON with two keys:
62
+ 1) a "score" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;
63
+ 2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Reply in your reason, but be very concise with it!
64
+
65
+ **Reply**
66
+ <Reply>
67
+ {{output}}
68
+ </Reply>
69
+
70
+ **OUTPUT FORMAT**
71
+ IMPORTANT:
72
+ - Return output ONLY as a minified JSON object (no code fences).
73
+ - The JSON object must contain exactly two keys: "score" and "reason".
74
+ - No additional words, explanations, or formatting are needed.
75
+ - Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
76
+
77
+ Example JSON:
78
+ {"score":0,"reason":"The text of Reply does not follow the evaluation criteria provided."}
79
+
80
+ Here is the final evaluation in the required minified JSON format:
81
+ JSON:
82
+ `;
43
83
  exports.GEVAL_EVALUATE_PROMPT = `
84
+ Instruction #{{hash_id}}.
85
+
44
86
  You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
45
87
  Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
46
88
 
47
89
  **Evaluation Criteria**
90
+ <Criteria>
48
91
  {{criteria}}
92
+ </Criteria>
49
93
 
50
94
  **Evaluation Steps**
51
95
  - {{steps}}
@@ -54,10 +98,14 @@ Given the evaluation steps, return a JSON with two keys:
54
98
  2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!
55
99
 
56
100
  **Prompt**
101
+ <Prompt>
57
102
  {{input}}
103
+ </Prompt>
58
104
 
59
105
  **Reply**
106
+ <Reply>
60
107
  {{output}}
108
+ </Reply>
61
109
 
62
110
  **OUTPUT FORMAT**
63
111
  IMPORTANT:
@@ -67,7 +115,7 @@ IMPORTANT:
67
115
  - Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
68
116
 
69
117
  Example JSON:
70
- {"score":0,"reason":"The text of reply does not follow the evaluation criteria provided."}
118
+ {"score":0,"reason":"The text of Reply does not follow the evaluation criteria provided."}
71
119
 
72
120
  Here is the final evaluation in the required minified JSON format:
73
121
  JSON:
package/dst/prompt.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";AAAA;;;GAGG;;;AAKU,QAAA,wBAAwB,GAAG;;;;;;;;;;;CAWvC,CAAC;AAKW,QAAA,sBAAsB,GAAG,kEAAkE,CAAC;AAK5F,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;;;;;;CAqBjC,CAAC;AAKW,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+BpC,CAAC"}
1
+ {"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";AAAA;;;GAGG;;;AAKU,QAAA,wBAAwB,GAAG;;;;;;;;;;;;;;CAcvC,CAAC;AAKW,QAAA,sBAAsB,GAAG,kEAAkE,CAAC;AAK5F,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;;;;;;;;CAuBjC,CAAC;AAKW,QAAA,2BAA2B,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAkC1C,CAAC;AAKW,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAuCpC,CAAC"}
package/dst/types.d.ts CHANGED
@@ -1,4 +1,8 @@
1
1
  export type EvalMethod = 'bEval' | 'gEval' | 'llmRubric';
2
+ export type GEvalInput = string | {
3
+ query: string;
4
+ answer: string;
5
+ };
2
6
  export interface IStepsCache {
3
7
  set(key: string, value: string[]): Promise<void>;
4
8
  get(key: string): Promise<string[] | undefined>;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@eva-llm/eva-judge",
3
- "version": "1.0.1",
3
+ "version": "1.0.3",
4
4
  "description": "LLM-as-a-Judge abstraction layer using ai-sdk and plugins",
5
5
  "main": "dst/index.js",
6
6
  "types": "dst/index.d.ts",
@@ -26,6 +26,7 @@
26
26
  "@types/node": "^25.5.0",
27
27
  "jest": "^30.3.0",
28
28
  "ts-jest": "^29.4.6",
29
+ "ts-node": "^10.9.2",
29
30
  "typescript": "^5.9.3"
30
31
  },
31
32
  "dependencies": {
@@ -46,6 +47,7 @@
46
47
  },
47
48
  "scripts": {
48
49
  "build": "tsc",
50
+ "example": "ts-node scripts/example.ts",
49
51
  "test": "jest"
50
52
  }
51
53
  }