@eva-llm/eva-judge 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -10
- package/dst/config.d.ts +1 -14
- package/dst/config.js.map +1 -1
- package/dst/index.d.ts +3 -3
- package/dst/index.js +10 -6
- package/dst/index.js.map +1 -1
- package/dst/prompt.d.ts +3 -2
- package/dst/prompt.js +43 -2
- package/dst/prompt.js.map +1 -1
- package/dst/types.d.ts +17 -0
- package/package.json +4 -2
package/README.md
CHANGED
|
@@ -14,21 +14,29 @@ npm install @eva-llm/eva-judge
|
|
|
14
14
|
```
|
|
15
15
|
|
|
16
16
|
```ts
|
|
17
|
-
import { llmRubric, gEval, bEval } from '@eva-llm/eva-judge'
|
|
17
|
+
import { llmRubric, gEval, bEval } from '@eva-llm/eva-judge';
|
|
18
18
|
|
|
19
|
-
const
|
|
19
|
+
const query = 'Hello! How are you?';
|
|
20
20
|
const answer = 'Hi! I am fine. And you?';
|
|
21
21
|
|
|
22
22
|
await llmRubric(answer, 'answer is polite', 'openai', 'gpt-4.1-mini');
|
|
23
23
|
// { pass: true, score: 1, reason: "The answer is definitely polite and sympathetic" }
|
|
24
24
|
|
|
25
|
-
await gEval(
|
|
25
|
+
await gEval(answer, 'answer is polite', 'openai', 'gpt-4.1-mini');
|
|
26
|
+
// { score: 0.8, reason: "The answer is quite polite" }
|
|
27
|
+
|
|
28
|
+
await bEval(answer, 'answer is polite', 'openai', 'gpt-4.1-mini');
|
|
29
|
+
// { score: 1, reason: "The answer is polite" }
|
|
30
|
+
|
|
31
|
+
await gEval({ query, answer }, 'answer is relevant to question', 'openai', 'gpt-4.1-mini');
|
|
26
32
|
// { score: 0.9, reason: 'The answer is quite well relevant to the question' }
|
|
27
33
|
|
|
28
|
-
await bEval(
|
|
34
|
+
await bEval({ query, answer }, 'answer is coherent to question', 'openai', 'gpt-4.1-mini');
|
|
29
35
|
// { score: 1, reason: 'The answer is definitely coherent to the question' }
|
|
30
36
|
```
|
|
31
37
|
|
|
38
|
+
**NOTE!** For more consistent judging, the de facto standard is `temperature=0`.
|
|
39
|
+
|
|
32
40
|
## API
|
|
33
41
|
### llmRubric
|
|
34
42
|
|
|
@@ -45,15 +53,13 @@ const result = await llmRubric(
|
|
|
45
53
|
// result: { reason: string, pass: boolean, score: number }
|
|
46
54
|
```
|
|
47
55
|
|
|
48
|
-
|
|
49
56
|
### gEval
|
|
50
57
|
|
|
51
58
|
Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score (0.0-1.0).
|
|
52
59
|
|
|
53
60
|
```typescript
|
|
54
61
|
const result = await gEval(
|
|
55
|
-
|
|
56
|
-
answer, // string: the reply to evaluate
|
|
62
|
+
input: string | { query: string, answer: string }, // evaluated text or query-answer pair
|
|
57
63
|
criteria, // string: evaluation criteria
|
|
58
64
|
provider, // string: LLM provider name
|
|
59
65
|
model, // string: LLM model name
|
|
@@ -68,8 +74,7 @@ Evaluates a reply against criteria and derived steps using an LLM, but with bina
|
|
|
68
74
|
|
|
69
75
|
```typescript
|
|
70
76
|
const result = await bEval(
|
|
71
|
-
|
|
72
|
-
answer, // string: the reply to evaluate
|
|
77
|
+
input: string | { query: string, answer: string }, // evaluated text or query-answer pair
|
|
73
78
|
criteria, // string: evaluation criteria
|
|
74
79
|
provider, // string: LLM provider name
|
|
75
80
|
model, // string: LLM model name
|
|
@@ -98,7 +103,7 @@ Specify the provider name and model name in `llmRubric`, `gEval`, or `bEval`.
|
|
|
98
103
|
> **Note:** Each provider integration is based on its respective ai-sdk package. Be sure to follow the provider's documentation for setup and authentication. Most providers require you to export an API key or token as an environment variable (e.g., `export OPENAI_API_KEY=...`).
|
|
99
104
|
|
|
100
105
|
## Enterprise
|
|
101
|
-
### Hooks
|
|
106
|
+
### LLM Judge Hooks
|
|
102
107
|
|
|
103
108
|
You can provide hooks to receive notifications about evaluation events (success or error) for logging, monitoring, or custom handling. Hooks can also be used to integrate with observability tools such as OpenTelemetry for tracing and metrics. Set these in the config:
|
|
104
109
|
|
|
@@ -115,6 +120,19 @@ Config.hooks = {
|
|
|
115
120
|
};
|
|
116
121
|
```
|
|
117
122
|
|
|
123
|
+
### Configuring
|
|
124
|
+
|
|
125
|
+
```ts
|
|
126
|
+
import Config from '@eva-llm/eva-judge';
|
|
127
|
+
|
|
128
|
+
Config.restartModelCache(500); // recreate the LRU model cache to hold up to 500 models (default 100), keyed by provider:model
|
|
129
|
+
Config.restartStepsCache(1000); // recreate the LRU steps cache to hold up to 1000 sets of evaluation steps (default 500), keyed by criteria
|
|
130
|
+
Config.enableModelCache();
|
|
131
|
+
Config.disableModelCache();
|
|
132
|
+
Config.enableStepsCache();
|
|
133
|
+
Config.disableStepsCache();
|
|
134
|
+
```
|
|
135
|
+
|
|
118
136
|
### G-Eval/B-Eval Evaluation Steps Persistent Storage
|
|
119
137
|
|
|
120
138
|
For advanced use, you can implement your own cache storage for evaluation steps (e.g., using Redis or another backend) by providing a custom cache via `setStepsCache()`:
|
package/dst/config.d.ts
CHANGED
|
@@ -1,19 +1,6 @@
|
|
|
1
1
|
import { LRUCache } from 'lru-cache';
|
|
2
2
|
import { type LanguageModel } from 'ai';
|
|
3
|
-
import { type
|
|
4
|
-
export interface EvaHooks {
|
|
5
|
-
onSuccess?: (data: {
|
|
6
|
-
method: EvalMethod;
|
|
7
|
-
params: any;
|
|
8
|
-
result: any;
|
|
9
|
-
duration: number;
|
|
10
|
-
}) => void;
|
|
11
|
-
onError?: (data: {
|
|
12
|
-
method: EvalMethod;
|
|
13
|
-
error: any;
|
|
14
|
-
duration: number;
|
|
15
|
-
}) => void;
|
|
16
|
-
}
|
|
3
|
+
import { type EvaHooks, type IStepsCache } from './types';
|
|
17
4
|
declare const _default: {
|
|
18
5
|
gevalMaxScore: number;
|
|
19
6
|
isModelCached: boolean;
|
package/dst/config.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;AASrC,MAAM,kBAAkB;IACd,KAAK,CAA6B;IAM1C,YAAY,IAAY;QACtB,IAAI,CAAC,KAAK,GAAG,IAAI,oBAAQ,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW,EAAE,KAAe;QACpC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAC7B,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;CACF;
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;AASrC,MAAM,kBAAkB;IACd,KAAK,CAA6B;IAM1C,YAAY,IAAY;QACtB,IAAI,CAAC,KAAK,GAAG,IAAI,oBAAQ,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW,EAAE,KAAe;QACpC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAC7B,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;CACF;AAOD,kBAAe;IAIb,aAAa,EAAE,EAAE;IAIjB,aAAa,EAAE,IAAI;IAInB,aAAa,EAAE,IAAI;IAInB,UAAU,EAAE,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;IAI7D,UAAU,EAAE,IAAI,kBAAkB,CAAC,GAAG,CAAgB;IAKtD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACvE,CAAC;IAKD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,kBAAkB,CAAC,IAAI,CAAgB,CAAC;IAChE,CAAC;IAKD,aAAa,CAAC,KAAkB;QAC9B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,KAAK,EAAE,EAAc;IAKrB,QAAQ,CAAC,KAAe;QACtB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF,CAAC"}
|
package/dst/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import z from 'zod';
|
|
2
|
-
import { type EvalOptions } from './types';
|
|
2
|
+
import { type EvalOptions, type GEvalInput } from './types';
|
|
3
3
|
export * from './config';
|
|
4
4
|
export { default } from './config';
|
|
5
5
|
export * from './types';
|
|
@@ -19,5 +19,5 @@ export declare const GevalEvaluateResultSchema: z.ZodObject<{
|
|
|
19
19
|
}, z.core.$strip>;
|
|
20
20
|
export type GevalEvaluateResult = z.infer<typeof GevalEvaluateResultSchema>;
|
|
21
21
|
export declare const llmRubric: (output: string, rubric: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<RubricResult>;
|
|
22
|
-
export declare const gEval: (
|
|
23
|
-
export declare const bEval: (
|
|
22
|
+
export declare const gEval: (input: GEvalInput, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
|
|
23
|
+
export declare const bEval: (input: GEvalInput, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
|
package/dst/index.js
CHANGED
|
@@ -93,7 +93,11 @@ const llmRubric = async (output, rubric, providerName, modelName, options = {})
|
|
|
93
93
|
}
|
|
94
94
|
};
|
|
95
95
|
exports.llmRubric = llmRubric;
|
|
96
|
-
const _gEval = async (
|
|
96
|
+
const _gEval = async (input, criteria, providerName, modelName, maxScore, methodName, options = {}) => {
|
|
97
|
+
if (typeof input === 'string') {
|
|
98
|
+
input = { query: '', answer: input };
|
|
99
|
+
}
|
|
100
|
+
const { query, answer } = input;
|
|
97
101
|
const start = Date.now();
|
|
98
102
|
try {
|
|
99
103
|
const model = (0, registry_1.getModel)(providerName, modelName);
|
|
@@ -111,10 +115,10 @@ const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScor
|
|
|
111
115
|
steps = stepsResult.steps;
|
|
112
116
|
(0, registry_1.setSteps)(criteria, stepsResult.steps);
|
|
113
117
|
}
|
|
114
|
-
const evaluationPrompt = Mustache.render(prompt_1.GEVAL_EVALUATE_PROMPT, {
|
|
118
|
+
const evaluationPrompt = Mustache.render(query ? prompt_1.GEVAL_EVALUATE_PROMPT : prompt_1.GEVAL_EVALUATE_REPLY_PROMPT, {
|
|
115
119
|
criteria,
|
|
116
120
|
steps: steps.join('\n- '),
|
|
117
|
-
input:
|
|
121
|
+
input: query,
|
|
118
122
|
output: answer,
|
|
119
123
|
maxScore,
|
|
120
124
|
});
|
|
@@ -132,7 +136,7 @@ const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScor
|
|
|
132
136
|
};
|
|
133
137
|
config_1.default.hooks.onSuccess?.({
|
|
134
138
|
method: methodName,
|
|
135
|
-
params: {
|
|
139
|
+
params: { query, answer, criteria, providerName, modelName, options },
|
|
136
140
|
result,
|
|
137
141
|
duration: Date.now() - start,
|
|
138
142
|
});
|
|
@@ -147,8 +151,8 @@ const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScor
|
|
|
147
151
|
throw error;
|
|
148
152
|
}
|
|
149
153
|
};
|
|
150
|
-
const gEval = async (
|
|
154
|
+
const gEval = async (input, criteria, providerName, modelName, options = {}) => _gEval(input, criteria, providerName, modelName, config_1.default.gevalMaxScore, 'gEval', options);
|
|
151
155
|
exports.gEval = gEval;
|
|
152
|
-
const bEval = async (
|
|
156
|
+
const bEval = async (input, criteria, providerName, modelName, options = {}) => _gEval(input, criteria, providerName, modelName, 1, 'bEval', options);
|
|
153
157
|
exports.bEval = bEval;
|
|
154
158
|
//# sourceMappingURL=index.js.map
|
package/dst/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAMkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAO5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAChB,0CAAwB;AAMX,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEzC,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEnF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAUU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAE7C,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAWU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEhD,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAgBI,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,iCAAwB;YAChC,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvCY,QAAA,SAAS,aAuCrB;AAED,MAAM,MAAM,GAAG,KAAK,EAClB,KAAiB,EACjB,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,QAAgB,EAChB,UAAsB,EAC
tB,UAAuB,EAAE,EACK,EAAE;IAChC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,KAAK,GAAG,EAAE,KAAK,EAAE,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;IACvC,CAAC;IACD,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,KAAK,CAAC;IAEhC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CACtC,KAAK,CAAC,CAAC,CAAC,8BAAqB,CAAC,CAAC,CAAC,oCAA2B,EAC3D;YACE,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,KAAK;YACZ,MAAM,EAAE,MAAM;YACd,QAAQ;SACT,CAAC,CAAC;QAEL,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,QAAQ;SACnC,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,EAAE,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACrE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,UAAU;YAClB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAYM,MAAM,KAAK,GAAG,KAAK,EACxB,KAAiB,EACjB,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,KAAK,EACL,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,gBAAI,CAAC,aAAa,EAClB,OAAO,EACP,OAAO,CACR,CAAC;AAdW,QAAA,KAAK,SAchB;AAYK,MAAM,KAAK,GAAG,KAAK,EAC
xB,KAAiB,EACjB,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,KAAK,EACL,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,CAAC,EACD,OAAO,EACP,OAAO,CACR,CAAC;AAdW,QAAA,KAAK,SAchB"}
|
package/dst/prompt.d.ts
CHANGED
|
@@ -4,5 +4,6 @@
|
|
|
4
4
|
*/
|
|
5
5
|
export declare const LLM_RUBRIC_SYSTEM_PROMPT = "You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. And score 1.0 indicates full compliance with the rubric, but 0.0 indicates no compliance at all. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}\n\nExamples:\n\n<Output>Hello world</Output>\n<Rubric>Content contains a greeting</Rubric>\n{\"reason\": \"the content contains the word 'Hello'\", \"pass\": true, \"score\": 1.0}\n\n<Output>Avast ye swabs, repel the invaders!</Output>\n<Rubric>Does not speak like a pirate</Rubric>\n{\"reason\": \"'avast ye' is a common pirate term\", \"pass\": false, \"score\": 0.0}\n";
|
|
6
6
|
export declare const LLM_RUBRIC_USER_PROMPT = "<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>";
|
|
7
|
-
export declare const GEVAL_STEPS_PROMPT = "\nGiven an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.\n\n**EVALUATION CRITERIA**\n{{criteria}}\n\n**OUTPUT FORMAT**\nIMPORTANT:\n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain a single key, \"steps\", whose value is a list of strings.\n- Each string must represent one evaluation step.\n- Do NOT include any explanations, commentary, extra text, or additional formatting.\n\nFormat:\n{\"steps\": <list_of_strings>}\n\nExample:\n{\"steps\":[\"<Evaluation Step 1>\",\"<Evaluation Step 2>\",\"<Evaluation Step 3>\",\"<Evaluation Step 4>\"]}\n\nHere are the 3-4 concise evaluation steps, formatted as required in a minified JSON:\nJSON:\n";
|
|
8
|
-
export declare const
|
|
7
|
+
export declare const GEVAL_STEPS_PROMPT = "\nGiven an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.\n\n**EVALUATION CRITERIA**\n<Criteria>\n{{criteria}}\n</Criteria>\n\n**OUTPUT FORMAT**\nIMPORTANT:\n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain a single key, \"steps\", whose value is a list of strings.\n- Each string must represent one evaluation step.\n- Do NOT include any explanations, commentary, extra text, or additional formatting.\n\nFormat:\n{\"steps\": <list_of_strings>}\n\nExample:\n{\"steps\":[\"<Evaluation Step 1>\",\"<Evaluation Step 2>\",\"<Evaluation Step 3>\",\"<Evaluation Step 4>\"]}\n\nHere are the 3-4 concise evaluation steps, formatted as required in a minified JSON:\nJSON:\n";
|
|
8
|
+
export declare const GEVAL_EVALUATE_REPLY_PROMPT = "\nYou will be given one Reply below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n<Criteria>\n{{criteria}}\n</Criteria>\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Reply in your reason, but be very concise with it!\n\n**Reply**\n<Reply>\n{{output}}\n</Reply>\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of Reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
|
|
9
|
+
export declare const GEVAL_EVALUATE_PROMPT = "\nYou will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n<Criteria>\n{{criteria}}\n</Criteria>\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!\n\n**Prompt**\n<Prompt>\n{{input}}\n</Prompt>\n\n**Reply**\n<Reply>\n{{output}}\n</Reply>\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of Reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
|
package/dst/prompt.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Copyright (c) 2025 Promptfoo
|
|
5
5
|
*/
|
|
6
6
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
-
exports.GEVAL_EVALUATE_PROMPT = exports.GEVAL_STEPS_PROMPT = exports.LLM_RUBRIC_USER_PROMPT = exports.LLM_RUBRIC_SYSTEM_PROMPT = void 0;
|
|
7
|
+
exports.GEVAL_EVALUATE_PROMPT = exports.GEVAL_EVALUATE_REPLY_PROMPT = exports.GEVAL_STEPS_PROMPT = exports.LLM_RUBRIC_USER_PROMPT = exports.LLM_RUBRIC_SYSTEM_PROMPT = void 0;
|
|
8
8
|
exports.LLM_RUBRIC_SYSTEM_PROMPT = `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. And score 1.0 indicates full compliance with the rubric, but 0.0 indicates no compliance at all. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
9
9
|
|
|
10
10
|
Examples:
|
|
@@ -22,7 +22,9 @@ exports.GEVAL_STEPS_PROMPT = `
|
|
|
22
22
|
Given an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.
|
|
23
23
|
|
|
24
24
|
**EVALUATION CRITERIA**
|
|
25
|
+
<Criteria>
|
|
25
26
|
{{criteria}}
|
|
27
|
+
</Criteria>
|
|
26
28
|
|
|
27
29
|
**OUTPUT FORMAT**
|
|
28
30
|
IMPORTANT:
|
|
@@ -40,12 +42,47 @@ Example:
|
|
|
40
42
|
Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
|
|
41
43
|
JSON:
|
|
42
44
|
`;
|
|
45
|
+
exports.GEVAL_EVALUATE_REPLY_PROMPT = `
|
|
46
|
+
You will be given one Reply below. Your task is to rate the Reply on one metric.
|
|
47
|
+
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
|
|
48
|
+
|
|
49
|
+
**Evaluation Criteria**
|
|
50
|
+
<Criteria>
|
|
51
|
+
{{criteria}}
|
|
52
|
+
</Criteria>
|
|
53
|
+
|
|
54
|
+
**Evaluation Steps**
|
|
55
|
+
- {{steps}}
|
|
56
|
+
Given the evaluation steps, return a JSON with two keys:
|
|
57
|
+
1) a "score" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;
|
|
58
|
+
2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Reply in your reason, but be very concise with it!
|
|
59
|
+
|
|
60
|
+
**Reply**
|
|
61
|
+
<Reply>
|
|
62
|
+
{{output}}
|
|
63
|
+
</Reply>
|
|
64
|
+
|
|
65
|
+
**OUTPUT FORMAT**
|
|
66
|
+
IMPORTANT:
|
|
67
|
+
- Return output ONLY as a minified JSON object (no code fences).
|
|
68
|
+
- The JSON object must contain exactly two keys: "score" and "reason".
|
|
69
|
+
- No additional words, explanations, or formatting are needed.
|
|
70
|
+
- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
|
|
71
|
+
|
|
72
|
+
Example JSON:
|
|
73
|
+
{"score":0,"reason":"The text of Reply does not follow the evaluation criteria provided."}
|
|
74
|
+
|
|
75
|
+
Here is the final evaluation in the required minified JSON format:
|
|
76
|
+
JSON:
|
|
77
|
+
`;
|
|
43
78
|
exports.GEVAL_EVALUATE_PROMPT = `
|
|
44
79
|
You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
|
|
45
80
|
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
|
|
46
81
|
|
|
47
82
|
**Evaluation Criteria**
|
|
83
|
+
<Criteria>
|
|
48
84
|
{{criteria}}
|
|
85
|
+
</Criteria>
|
|
49
86
|
|
|
50
87
|
**Evaluation Steps**
|
|
51
88
|
- {{steps}}
|
|
@@ -54,10 +91,14 @@ Given the evaluation steps, return a JSON with two keys:
|
|
|
54
91
|
2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!
|
|
55
92
|
|
|
56
93
|
**Prompt**
|
|
94
|
+
<Prompt>
|
|
57
95
|
{{input}}
|
|
96
|
+
</Prompt>
|
|
58
97
|
|
|
59
98
|
**Reply**
|
|
99
|
+
<Reply>
|
|
60
100
|
{{output}}
|
|
101
|
+
</Reply>
|
|
61
102
|
|
|
62
103
|
**OUTPUT FORMAT**
|
|
63
104
|
IMPORTANT:
|
|
@@ -67,7 +108,7 @@ IMPORTANT:
|
|
|
67
108
|
- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
|
|
68
109
|
|
|
69
110
|
Example JSON:
|
|
70
|
-
{"score":0,"reason":"The text of
|
|
111
|
+
{"score":0,"reason":"The text of Reply does not follow the evaluation criteria provided."}
|
|
71
112
|
|
|
72
113
|
Here is the final evaluation in the required minified JSON format:
|
|
73
114
|
JSON:
|
package/dst/prompt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";AAAA;;;GAGG;;;AAKU,QAAA,wBAAwB,GAAG;;;;;;;;;;;CAWvC,CAAC;AAKW,QAAA,sBAAsB,GAAG,kEAAkE,CAAC;AAK5F,QAAA,kBAAkB,GAAG
|
|
1
|
+
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";AAAA;;;GAGG;;;AAKU,QAAA,wBAAwB,GAAG;;;;;;;;;;;CAWvC,CAAC;AAKW,QAAA,sBAAsB,GAAG,kEAAkE,CAAC;AAK5F,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;;;;;;;;CAuBjC,CAAC;AAKW,QAAA,2BAA2B,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAgC1C,CAAC;AAKW,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAqCpC,CAAC"}
|
package/dst/types.d.ts
CHANGED
|
@@ -1,4 +1,8 @@
|
|
|
1
1
|
export type EvalMethod = 'bEval' | 'gEval' | 'llmRubric';
|
|
2
|
+
export type GEvalInput = string | {
|
|
3
|
+
query: string;
|
|
4
|
+
answer: string;
|
|
5
|
+
};
|
|
2
6
|
export interface IStepsCache {
|
|
3
7
|
set(key: string, value: string[]): Promise<void>;
|
|
4
8
|
get(key: string): Promise<string[] | undefined>;
|
|
@@ -7,3 +11,16 @@ export interface EvalOptions {
|
|
|
7
11
|
temperature?: number;
|
|
8
12
|
providerOptions?: Record<string, any>;
|
|
9
13
|
}
|
|
14
|
+
export interface EvaHooks {
|
|
15
|
+
onSuccess?: (data: {
|
|
16
|
+
method: EvalMethod;
|
|
17
|
+
params: any;
|
|
18
|
+
result: any;
|
|
19
|
+
duration: number;
|
|
20
|
+
}) => void;
|
|
21
|
+
onError?: (data: {
|
|
22
|
+
method: EvalMethod;
|
|
23
|
+
error: any;
|
|
24
|
+
duration: number;
|
|
25
|
+
}) => void;
|
|
26
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@eva-llm/eva-judge",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "LLM-as-a-Judge abstraction layer using ai-sdk and plugins",
|
|
5
5
|
"main": "dst/index.js",
|
|
6
6
|
"types": "dst/index.d.ts",
|
|
@@ -19,13 +19,14 @@
|
|
|
19
19
|
"bugs": {
|
|
20
20
|
"url": "https://github.com/eva-llm/eva-judge/issues"
|
|
21
21
|
},
|
|
22
|
-
"homepage": "https://
|
|
22
|
+
"homepage": "https://eva-llm.github.io/eva-judge",
|
|
23
23
|
"devDependencies": {
|
|
24
24
|
"@types/jest": "^30.0.0",
|
|
25
25
|
"@types/mustache": "^4.2.6",
|
|
26
26
|
"@types/node": "^25.5.0",
|
|
27
27
|
"jest": "^30.3.0",
|
|
28
28
|
"ts-jest": "^29.4.6",
|
|
29
|
+
"ts-node": "^10.9.2",
|
|
29
30
|
"typescript": "^5.9.3"
|
|
30
31
|
},
|
|
31
32
|
"dependencies": {
|
|
@@ -46,6 +47,7 @@
|
|
|
46
47
|
},
|
|
47
48
|
"scripts": {
|
|
48
49
|
"build": "tsc",
|
|
50
|
+
"example": "ts-node scripts/example.ts",
|
|
49
51
|
"test": "jest"
|
|
50
52
|
}
|
|
51
53
|
}
|