@eva-llm/eva-judge 0.1.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -37
- package/dst/config.d.ts +1 -5
- package/dst/config.js.map +1 -1
- package/dst/index.d.ts +2 -4
- package/dst/index.js +1 -0
- package/dst/index.js.map +1 -1
- package/dst/prompt.d.ts +1 -1
- package/dst/prompt.js +1 -1
- package/dst/types.d.ts +8 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,41 +1,35 @@
|
|
|
1
|
-
# Project Inspiration & Attribution
|
|
2
|
-
|
|
3
|
-
This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including author's work on the G-Eval framework there. The LLM-as-a-Judge prompts are copied from promptfoo and adapted for project-specific issues.
|
|
4
|
-
|
|
5
1
|
# eva-judge
|
|
6
2
|
|
|
7
|
-
A TypeScript/Node.js
|
|
3
|
+
A TypeScript/Node.js library for automated text evaluation with AI analysis through **LLM-Rubric**, **G-Eval**, or **B-Eval** (Binary G-Eval).
|
|
8
4
|
|
|
9
|
-
##
|
|
10
|
-
- Configuration management for evaluation workflows
|
|
11
|
-
- Prompt handling and manipulation
|
|
12
|
-
- Registry for test cases and evaluation items
|
|
13
|
-
- Designed for integration with Jest and other test runners
|
|
5
|
+
## Project Inspiration & Attribution
|
|
14
6
|
|
|
15
|
-
|
|
7
|
+
This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including [the author's work](https://github.com/promptfoo/promptfoo/issues?q=state%3Aclosed%20is%3Apr%20author%3A%40schipiga) on the [G-Eval](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded/g-eval/) framework there.<br />
|
|
8
|
+
The LLM-as-a-Judge prompts are copied from promptfoo and adapted for project-specific issues.
|
|
16
9
|
|
|
17
|
-
|
|
10
|
+
## Quick Start
|
|
18
11
|
|
|
19
12
|
```bash
|
|
20
13
|
npm install @eva-llm/eva-judge
|
|
21
|
-
# or
|
|
22
|
-
pnpm add @eva-llm/eva-judge
|
|
23
14
|
```
|
|
24
15
|
|
|
25
|
-
|
|
16
|
+
```ts
|
|
17
|
+
import { llmRubric, gEval, bEval } from '@eva-llm/eva-judge'
|
|
26
18
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
```
|
|
19
|
+
const prompt = 'Hello! How are you?';
|
|
20
|
+
const answer = 'Hi! I am fine. And you?';
|
|
30
21
|
|
|
31
|
-
|
|
22
|
+
await llmRubric(answer, 'answer is polite', 'openai', 'gpt-4.1-mini');
|
|
23
|
+
// { pass: true, score: 1, reason: "The answer is definitely polite and sympathetic" }
|
|
32
24
|
|
|
33
|
-
|
|
25
|
+
await gEval(prompt, answer, 'answer is relevant to question', 'openai', 'gpt-4.1-mini');
|
|
26
|
+
// { score: 0.9, reason: 'The answer is quite relevant to the question' }
|
|
34
27
|
|
|
35
|
-
|
|
36
|
-
|
|
28
|
+
await bEval(prompt, answer, 'answer is coherent to question', 'openai', 'gpt-4.1-mini');
|
|
29
|
+
// { score: 1, reason: 'The answer is definitely coherent to the question' }
|
|
37
30
|
```
|
|
38
31
|
|
|
32
|
+
## API
|
|
39
33
|
### llmRubric
|
|
40
34
|
|
|
41
35
|
Evaluates an output against a rubric using an LLM. Returns a reason, pass/fail, and normalized score.
|
|
@@ -54,7 +48,7 @@ const result = await llmRubric(
|
|
|
54
48
|
|
|
55
49
|
### gEval
|
|
56
50
|
|
|
57
|
-
Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score (0.0
|
|
51
|
+
Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score (0.0-1.0).
|
|
58
52
|
|
|
59
53
|
```typescript
|
|
60
54
|
const result = await gEval(
|
|
@@ -84,17 +78,8 @@ const result = await bEval(
|
|
|
84
78
|
// result: { reason: string, score: number } // score will be 0 or 1
|
|
85
79
|
```
|
|
86
80
|
|
|
87
|
-
## Development
|
|
88
|
-
- Source code is in `src/`
|
|
89
|
-
- Tests are in `tests/`
|
|
90
|
-
- Uses TypeScript and Jest for testing
|
|
91
|
-
|
|
92
|
-
## License
|
|
93
|
-
MIT
|
|
94
|
-
|
|
95
81
|
## Supported Providers
|
|
96
82
|
|
|
97
|
-
|
|
98
83
|
The following LLM providers are supported (via [Vercel ai-sdk](https://github.com/vercel/ai)):
|
|
99
84
|
|
|
100
85
|
- OpenAI (`openai`)
|
|
@@ -108,11 +93,12 @@ The following LLM providers are supported (via [Vercel ai-sdk](https://github.co
|
|
|
108
93
|
- Perplexity (`perplexity`)
|
|
109
94
|
- xAI (`xai`)
|
|
110
95
|
|
|
111
|
-
Specify the provider name and model name in `llmRubric` or `
|
|
96
|
+
Specify the provider name and model name in `llmRubric`, `gEval`, or `bEval`.
|
|
112
97
|
|
|
113
98
|
> **Note:** Each provider integration is based on its respective ai-sdk package. Be sure to follow the provider's documentation for setup and authentication. Most providers require you to export an API key or token as an environment variable (e.g., `export OPENAI_API_KEY=...`).
|
|
114
99
|
|
|
115
|
-
##
|
|
100
|
+
## Enterprise
|
|
101
|
+
### Hooks
|
|
116
102
|
|
|
117
103
|
You can provide hooks to receive notifications about evaluation events (success or error) for logging, monitoring, or custom handling. Hooks can also be used to integrate with observability tools such as OpenTelemetry for tracing and metrics. Set these in the config:
|
|
118
104
|
|
|
@@ -129,12 +115,20 @@ Config.hooks = {
|
|
|
129
115
|
};
|
|
130
116
|
```
|
|
131
117
|
|
|
118
|
+
### G-Eval/B-Eval Evaluation Steps Persistent Storage
|
|
119
|
+
|
|
132
120
|
For advanced use, you can implement your own cache storage for evaluation steps (e.g., using Redis or another backend) by providing a custom cache via `setStepsCache()`:
|
|
133
121
|
|
|
134
122
|
```typescript
|
|
135
|
-
import Config from '@eva-llm/eva-judge';
|
|
123
|
+
import Config, { type IStepsCache } from '@eva-llm/eva-judge';
|
|
124
|
+
|
|
125
|
+
class RedisCache implements IStepsCache {
|
|
126
|
+
...
|
|
127
|
+
};
|
|
136
128
|
|
|
137
|
-
Config.setStepsCache(RedisCache);
|
|
129
|
+
Config.setStepsCache(new RedisCache());
|
|
138
130
|
```
|
|
139
131
|
|
|
140
|
-
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
MIT
|
package/dst/config.d.ts
CHANGED
|
@@ -1,10 +1,6 @@
|
|
|
1
1
|
import { LRUCache } from 'lru-cache';
|
|
2
2
|
import { type LanguageModel } from 'ai';
|
|
3
|
-
import { type EvalMethod } from './types';
|
|
4
|
-
export interface IStepsCache {
|
|
5
|
-
set(key: string, value: string[]): Promise<void>;
|
|
6
|
-
get(key: string): Promise<string[] | undefined>;
|
|
7
|
-
}
|
|
3
|
+
import { type EvalMethod, type IStepsCache } from './types';
|
|
8
4
|
export interface EvaHooks {
|
|
9
5
|
onSuccess?: (data: {
|
|
10
6
|
method: EvalMethod;
|
package/dst/config.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;AASrC,MAAM,kBAAkB;IACd,KAAK,CAA6B;IAM1C,YAAY,IAAY;QACtB,IAAI,CAAC,KAAK,GAAG,IAAI,oBAAQ,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW,EAAE,KAAe;QACpC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAC7B,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;CACF;AAiCD,kBAAe;IAIb,aAAa,EAAE,EAAE;IAIjB,aAAa,EAAE,IAAI;IAInB,aAAa,EAAE,IAAI;IAInB,UAAU,EAAE,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;IAI7D,UAAU,EAAE,IAAI,kBAAkB,CAAC,GAAG,CAAgB;IAKtD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACvE,CAAC;IAKD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,kBAAkB,CAAC,IAAI,CAAgB,CAAC;IAChE,CAAC;IAKD,aAAa,CAAC,KAAkB;QAC9B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,KAAK,EAAE,EAAc;IAKrB,QAAQ,CAAC,KAAe;QACtB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF,CAAC"}
|
package/dst/index.d.ts
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
import z from 'zod';
|
|
2
|
+
import { type EvalOptions } from './types';
|
|
2
3
|
export * from './config';
|
|
3
4
|
export { default } from './config';
|
|
4
|
-
export
|
|
5
|
-
temperature?: number;
|
|
6
|
-
providerOptions?: Record<string, any>;
|
|
7
|
-
}
|
|
5
|
+
export * from './types';
|
|
8
6
|
export declare const RubricResultSchema: z.ZodObject<{
|
|
9
7
|
reason: z.ZodString;
|
|
10
8
|
pass: z.ZodBoolean;
|
package/dst/index.js
CHANGED
|
@@ -49,6 +49,7 @@ const config_1 = __importDefault(require("./config"));
|
|
|
49
49
|
__exportStar(require("./config"), exports);
|
|
50
50
|
var config_2 = require("./config");
|
|
51
51
|
Object.defineProperty(exports, "default", { enumerable: true, get: function () { return __importDefault(config_2).default; } });
|
|
52
|
+
__exportStar(require("./types"), exports);
|
|
52
53
|
exports.RubricResultSchema = zod_1.default.object({
|
|
53
54
|
reason: zod_1.default.string().describe('Detailed explanation of the score based on the rubric'),
|
|
54
55
|
pass: zod_1.default.boolean().describe('Whether the output satisfies the minimum requirements'),
|
package/dst/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAChB,0CAAwB;AAMX,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEzC,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEnF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAUU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAE7C,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAWU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEhD,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAgBI,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,iCAAwB;YAChC,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvCY,QAAA,SAAS,aAuCrB;AAED,MAAM,MAAM,GAAG,KAAK,EAClB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,QAAgB,EAChB,U
AAsB,EACtB,UAAuB,EAAE,EACK,EAAE;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,8BAAqB,EAAE;YAC9D,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,MAAM;YACb,MAAM,EAAE,MAAM;YACd,QAAQ;SACT,CAAC,CAAC;QAEH,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,QAAQ;SACnC,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACtE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,UAAU;YAClB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAaM,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,gBAAI,CAAC,aAAa,EAClB,OAAO,EACP,OAAO,CACR,CAAC;AAhBW,QAAA,KAAK,SAgBhB;AAaK,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,CAAC,EACD,OAAO,EACP,OAAO,CACR,CAAC;AA
hBW,QAAA,KAAK,SAgBhB"}
|
package/dst/prompt.d.ts
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Portions of this code are based on Promptfoo (MIT License)
|
|
3
3
|
* Copyright (c) 2025 Promptfoo
|
|
4
4
|
*/
|
|
5
|
-
export declare const LLM_RUBRIC_SYSTEM_PROMPT = "You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}\n\nExamples:\n\n<Output>Hello world</Output>\n<Rubric>Content contains a greeting</Rubric>\n{\"reason\": \"the content contains the word 'Hello'\", \"pass\": true, \"score\": 1.0}\n\n<Output>Avast ye swabs, repel the invaders!</Output>\n<Rubric>Does not speak like a pirate</Rubric>\n{\"reason\": \"'avast ye' is a common pirate term\", \"pass\": false, \"score\": 0.0}\n";
|
|
5
|
+
export declare const LLM_RUBRIC_SYSTEM_PROMPT = "You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. And score 1.0 indicates full compliance with the rubric, but 0.0 indicates no compliance at all. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}\n\nExamples:\n\n<Output>Hello world</Output>\n<Rubric>Content contains a greeting</Rubric>\n{\"reason\": \"the content contains the word 'Hello'\", \"pass\": true, \"score\": 1.0}\n\n<Output>Avast ye swabs, repel the invaders!</Output>\n<Rubric>Does not speak like a pirate</Rubric>\n{\"reason\": \"'avast ye' is a common pirate term\", \"pass\": false, \"score\": 0.0}\n";
|
|
6
6
|
export declare const LLM_RUBRIC_USER_PROMPT = "<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>";
|
|
7
7
|
export declare const GEVAL_STEPS_PROMPT = "\nGiven an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.\n\n**EVALUATION CRITERIA**\n{{criteria}}\n\n**OUTPUT FORMAT**\nIMPORTANT:\n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain a single key, \"steps\", whose value is a list of strings.\n- Each string must represent one evaluation step.\n- Do NOT include any explanations, commentary, extra text, or additional formatting.\n\nFormat:\n{\"steps\": <list_of_strings>}\n\nExample:\n{\"steps\":[\"<Evaluation Step 1>\",\"<Evaluation Step 2>\",\"<Evaluation Step 3>\",\"<Evaluation Step 4>\"]}\n\nHere are the 3-4 concise evaluation steps, formatted as required in a minified JSON:\nJSON:\n";
|
|
8
8
|
export declare const GEVAL_EVALUATE_PROMPT = "\nYou will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n{{criteria}}\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!\n\n**Prompt**\n{{input}}\n\n**Reply**\n{{output}}\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
|
package/dst/prompt.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
*/
|
|
6
6
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
7
|
exports.GEVAL_EVALUATE_PROMPT = exports.GEVAL_STEPS_PROMPT = exports.LLM_RUBRIC_USER_PROMPT = exports.LLM_RUBRIC_SYSTEM_PROMPT = void 0;
|
|
8
|
-
exports.LLM_RUBRIC_SYSTEM_PROMPT = `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
8
|
+
exports.LLM_RUBRIC_SYSTEM_PROMPT = `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. And score 1.0 indicates full compliance with the rubric, but 0.0 indicates no compliance at all. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
9
9
|
|
|
10
10
|
Examples:
|
|
11
11
|
|
package/dst/types.d.ts
CHANGED
|
@@ -1 +1,9 @@
|
|
|
1
1
|
export type EvalMethod = 'bEval' | 'gEval' | 'llmRubric';
|
|
2
|
+
export interface IStepsCache {
|
|
3
|
+
set(key: string, value: string[]): Promise<void>;
|
|
4
|
+
get(key: string): Promise<string[] | undefined>;
|
|
5
|
+
}
|
|
6
|
+
export interface EvalOptions {
|
|
7
|
+
temperature?: number;
|
|
8
|
+
providerOptions?: Record<string, any>;
|
|
9
|
+
}
|