@eva-llm/eva-judge 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -2
- package/dst/config.d.ts +3 -2
- package/dst/config.js.map +1 -1
- package/dst/index.d.ts +1 -0
- package/dst/index.js +9 -6
- package/dst/index.js.map +1 -1
- package/dst/types.d.ts +1 -0
- package/dst/types.js +3 -0
- package/dst/types.js.map +1 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -33,7 +33,7 @@ pnpm test
|
|
|
33
33
|
Import and use the modules in your TypeScript/Node.js project:
|
|
34
34
|
|
|
35
35
|
```typescript
|
|
36
|
-
import { llmRubric, gEval } from '@eva-llm/eva-judge';
|
|
36
|
+
import { llmRubric, gEval, bEval } from '@eva-llm/eva-judge';
|
|
37
37
|
```
|
|
38
38
|
|
|
39
39
|
### llmRubric
|
|
@@ -51,9 +51,10 @@ const result = await llmRubric(
|
|
|
51
51
|
// result: { reason: string, pass: boolean, score: number }
|
|
52
52
|
```
|
|
53
53
|
|
|
54
|
+
|
|
54
55
|
### gEval
|
|
55
56
|
|
|
56
|
-
Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score.
|
|
57
|
+
Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score (0.0–1.0).
|
|
57
58
|
|
|
58
59
|
```typescript
|
|
59
60
|
const result = await gEval(
|
|
@@ -67,6 +68,22 @@ const result = await gEval(
|
|
|
67
68
|
// result: { reason: string, score: number }
|
|
68
69
|
```
|
|
69
70
|
|
|
71
|
+
### bEval (Binary G-Eval)
|
|
72
|
+
|
|
73
|
+
Evaluates a reply against criteria and derived steps using an LLM, but with binary scoring (0 or 1). Returns a reason and a normalized score (0 or 1).
|
|
74
|
+
|
|
75
|
+
```typescript
|
|
76
|
+
const result = await bEval(
|
|
77
|
+
prompt, // string: the prompt given to the model
|
|
78
|
+
answer, // string: the reply to evaluate
|
|
79
|
+
criteria, // string: evaluation criteria
|
|
80
|
+
provider, // string: LLM provider name
|
|
81
|
+
model, // string: LLM model name
|
|
82
|
+
options // optional: { temperature, providerOptions }
|
|
83
|
+
);
|
|
84
|
+
// result: { reason: string, score: number } // score will be 0 or 1
|
|
85
|
+
```
|
|
86
|
+
|
|
70
87
|
## Development
|
|
71
88
|
- Source code is in `src/`
|
|
72
89
|
- Tests are in `tests/`
|
package/dst/config.d.ts
CHANGED
|
@@ -1,18 +1,19 @@
|
|
|
1
1
|
import { LRUCache } from 'lru-cache';
|
|
2
2
|
import { type LanguageModel } from 'ai';
|
|
3
|
+
import { type EvalMethod } from './types';
|
|
3
4
|
export interface IStepsCache {
|
|
4
5
|
set(key: string, value: string[]): Promise<void>;
|
|
5
6
|
get(key: string): Promise<string[] | undefined>;
|
|
6
7
|
}
|
|
7
8
|
export interface EvaHooks {
|
|
8
9
|
onSuccess?: (data: {
|
|
9
|
-
method:
|
|
10
|
+
method: EvalMethod;
|
|
10
11
|
params: any;
|
|
11
12
|
result: any;
|
|
12
13
|
duration: number;
|
|
13
14
|
}) => void;
|
|
14
15
|
onError?: (data: {
|
|
15
|
-
method:
|
|
16
|
+
method: EvalMethod;
|
|
16
17
|
error: any;
|
|
17
18
|
duration: number;
|
|
18
19
|
}) => void;
|
package/dst/config.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;AA8BrC,MAAM,kBAAkB;IACd,KAAK,CAA6B;IAM1C,YAAY,IAAY;QACtB,IAAI,CAAC,KAAK,GAAG,IAAI,oBAAQ,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW,EAAE,KAAe;QACpC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAC7B,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;CACF;AAiCD,kBAAe;IAIb,aAAa,EAAE,EAAE;IAIjB,aAAa,EAAE,IAAI;IAInB,aAAa,EAAE,IAAI;IAInB,UAAU,EAAE,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;IAI7D,UAAU,EAAE,IAAI,kBAAkB,CAAC,GAAG,CAAgB;IAKtD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACvE,CAAC;IAKD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,kBAAkB,CAAC,IAAI,CAAgB,CAAC;IAChE,CAAC;IAKD,aAAa,CAAC,KAAkB;QAC9B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,KAAK,EAAE,EAAc;IAKrB,QAAQ,CAAC,KAAe;QACtB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF,CAAC"}
|
package/dst/index.d.ts
CHANGED
|
@@ -22,3 +22,4 @@ export declare const GevalEvaluateResultSchema: z.ZodObject<{
|
|
|
22
22
|
export type GevalEvaluateResult = z.infer<typeof GevalEvaluateResultSchema>;
|
|
23
23
|
export declare const llmRubric: (output: string, rubric: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<RubricResult>;
|
|
24
24
|
export declare const gEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
|
|
25
|
+
export declare const bEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
|
package/dst/index.js
CHANGED
|
@@ -39,7 +39,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
39
39
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
40
40
|
};
|
|
41
41
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
-
exports.gEval = exports.llmRubric = exports.GevalEvaluateResultSchema = exports.GevalStepsResultSchema = exports.RubricResultSchema = exports.default = void 0;
|
|
42
|
+
exports.bEval = exports.gEval = exports.llmRubric = exports.GevalEvaluateResultSchema = exports.GevalStepsResultSchema = exports.RubricResultSchema = exports.default = void 0;
|
|
43
43
|
const ai_1 = require("ai");
|
|
44
44
|
const Mustache = __importStar(require("mustache"));
|
|
45
45
|
const zod_1 = __importDefault(require("zod"));
|
|
@@ -92,7 +92,7 @@ const llmRubric = async (output, rubric, providerName, modelName, options = {})
|
|
|
92
92
|
}
|
|
93
93
|
};
|
|
94
94
|
exports.llmRubric = llmRubric;
|
|
95
|
-
const
|
|
95
|
+
const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScore, methodName, options = {}) => {
|
|
96
96
|
const start = Date.now();
|
|
97
97
|
try {
|
|
98
98
|
const model = (0, registry_1.getModel)(providerName, modelName);
|
|
@@ -115,7 +115,7 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
|
|
|
115
115
|
steps: steps.join('\n- '),
|
|
116
116
|
input: prompt,
|
|
117
117
|
output: answer,
|
|
118
|
-
maxScore
|
|
118
|
+
maxScore,
|
|
119
119
|
});
|
|
120
120
|
const { output: evalResult } = await (0, ai_1.generateText)({
|
|
121
121
|
model,
|
|
@@ -127,10 +127,10 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
|
|
|
127
127
|
});
|
|
128
128
|
const result = {
|
|
129
129
|
reason: evalResult.reason,
|
|
130
|
-
score: evalResult.score /
|
|
130
|
+
score: evalResult.score / maxScore,
|
|
131
131
|
};
|
|
132
132
|
config_1.default.hooks.onSuccess?.({
|
|
133
|
-
method:
|
|
133
|
+
method: methodName,
|
|
134
134
|
params: { prompt, answer, criteria, providerName, modelName, options },
|
|
135
135
|
result,
|
|
136
136
|
duration: Date.now() - start,
|
|
@@ -139,12 +139,15 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
|
|
|
139
139
|
}
|
|
140
140
|
catch (error) {
|
|
141
141
|
config_1.default.hooks.onError?.({
|
|
142
|
-
method:
|
|
142
|
+
method: methodName,
|
|
143
143
|
error,
|
|
144
144
|
duration: Date.now() - start,
|
|
145
145
|
});
|
|
146
146
|
throw error;
|
|
147
147
|
}
|
|
148
148
|
};
|
|
149
|
+
const gEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => _gEval(prompt, answer, criteria, providerName, modelName, config_1.default.gevalMaxScore, 'gEval', options);
|
|
149
150
|
exports.gEval = gEval;
|
|
151
|
+
const bEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => _gEval(prompt, answer, criteria, providerName, modelName, 1, 'bEval', options);
|
|
152
|
+
exports.bEval = bEval;
|
|
150
153
|
//# sourceMappingURL=index.js.map
|
package/dst/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAqBH,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEzC,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEnF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAUU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAE7C,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAWU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEhD,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAgBI,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,iCAAwB;YAChC,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvCY,QAAA,SAAS,aAuCrB;AAED,MAAM,MAAM,GAAG,KAAK,EAClB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,QAAgB,EAChB,UAAsB,EACtB,UAAuB,EAAE,EACK,EAAE;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,8BAAqB,EAAE;YAC9D,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,MAAM;YACb,MAAM,EAAE,MAAM;YACd,QAAQ;SACT,CAAC,CAAC;QAEH,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,QAAQ;SACnC,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACtE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,UAAU;YAClB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAaM,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,gBAAI,CAAC,aAAa,EAClB,OAAO,EACP,OAAO,CACR,CAAC;AAhBW,QAAA,KAAK,SAgBhB;AAaK,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,CAAC,EACD,OAAO,EACP,OAAO,CACR,CAAC;AAhBW,QAAA,KAAK,SAgBhB"}
|
package/dst/types.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export type EvalMethod = 'bEval' | 'gEval' | 'llmRubric';
|
package/dst/types.js
ADDED
package/dst/types.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|