@eva-llm/eva-judge 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +26 -21
- package/dst/config.d.ts +3 -81
- package/dst/config.js +0 -67
- package/dst/config.js.map +1 -1
- package/dst/index.d.ts +1 -52
- package/dst/index.js +10 -46
- package/dst/index.js.map +1 -1
- package/dst/prompt.d.ts +3 -11
- package/dst/prompt.js +4 -12
- package/dst/prompt.js.map +1 -1
- package/dst/registry.d.ts +0 -17
- package/dst/registry.js +0 -25
- package/dst/registry.js.map +1 -1
- package/dst/types.d.ts +1 -0
- package/dst/types.js +3 -0
- package/dst/types.js.map +1 -0
- package/package.json +2 -2
package/LICENSE
CHANGED
package/README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# Project Inspiration & Attribution
|
|
2
2
|
|
|
3
|
-
This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including author's work on the G-Eval framework there. The LLM-as-a-
|
|
3
|
+
This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including the author's work on the G-Eval framework there. The LLM-as-a-Judge prompts are copied from promptfoo and adapted to project-specific needs.
|
|
4
4
|
|
|
5
5
|
# eva-judge
|
|
6
6
|
|
|
7
|
-
A TypeScript/Node.js package for evaluating and managing test cases, prompts, and registry logic for AI or code evaluation workflows.
|
|
7
|
+
A TypeScript/Node.js package for evaluating and managing test cases, prompts, and registry logic for AI or code evaluation workflows with LLM-Rubric or G-Eval.
|
|
8
8
|
|
|
9
9
|
## Features
|
|
10
10
|
- Configuration management for evaluation workflows
|
|
@@ -12,26 +12,14 @@ A TypeScript/Node.js package for evaluating and managing test cases, prompts, an
|
|
|
12
12
|
- Registry for test cases and evaluation items
|
|
13
13
|
- Designed for integration with Jest and other test runners
|
|
14
14
|
|
|
15
|
-
## Project Structure
|
|
16
|
-
- `src/` — Main source code
|
|
17
|
-
- `config.ts` — Configuration logic
|
|
18
|
-
- `prompt.ts` — Prompt utilities
|
|
19
|
-
- `registry.ts` — Registry management
|
|
20
|
-
- `index.ts` — Entry point
|
|
21
|
-
- `tests/` — Unit tests for all modules
|
|
22
|
-
|
|
23
15
|
## Getting Started
|
|
24
16
|
|
|
25
|
-
### Prerequisites
|
|
26
|
-
- Node.js (>= 16)
|
|
27
|
-
- pnpm (recommended) or npm/yarn
|
|
28
|
-
|
|
29
17
|
### Installation
|
|
30
18
|
|
|
31
|
-
Clone the repository and install dependencies:
|
|
32
|
-
|
|
33
19
|
```bash
|
|
34
|
-
|
|
20
|
+
npm install @eva-llm/eva-judge
|
|
21
|
+
# or
|
|
22
|
+
pnpm add @eva-llm/eva-judge
|
|
35
23
|
```
|
|
36
24
|
|
|
37
25
|
### Running Tests
|
|
@@ -45,7 +33,7 @@ pnpm test
|
|
|
45
33
|
Import and use the modules in your TypeScript/Node.js project:
|
|
46
34
|
|
|
47
35
|
```typescript
|
|
48
|
-
import { llmRubric, gEval } from 'eva-judge';
|
|
36
|
+
import { llmRubric, gEval, bEval } from '@eva-llm/eva-judge';
|
|
49
37
|
```
|
|
50
38
|
|
|
51
39
|
### llmRubric
|
|
@@ -63,9 +51,10 @@ const result = await llmRubric(
|
|
|
63
51
|
// result: { reason: string, pass: boolean, score: number }
|
|
64
52
|
```
|
|
65
53
|
|
|
54
|
+
|
|
66
55
|
### gEval
|
|
67
56
|
|
|
68
|
-
Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score.
|
|
57
|
+
Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score (0.0–1.0).
|
|
69
58
|
|
|
70
59
|
```typescript
|
|
71
60
|
const result = await gEval(
|
|
@@ -79,6 +68,22 @@ const result = await gEval(
|
|
|
79
68
|
// result: { reason: string, score: number }
|
|
80
69
|
```
|
|
81
70
|
|
|
71
|
+
### bEval (Binary G-Eval)
|
|
72
|
+
|
|
73
|
+
Evaluates a reply against criteria and derived steps using an LLM, in the same way as `gEval`, but with binary scoring. Returns a reason and a score of either 0 or 1.
|
|
74
|
+
|
|
75
|
+
```typescript
|
|
76
|
+
const result = await bEval(
|
|
77
|
+
prompt, // string: the prompt given to the model
|
|
78
|
+
answer, // string: the reply to evaluate
|
|
79
|
+
criteria, // string: evaluation criteria
|
|
80
|
+
provider, // string: LLM provider name
|
|
81
|
+
model, // string: LLM model name
|
|
82
|
+
options // optional: { temperature, providerOptions }
|
|
83
|
+
);
|
|
84
|
+
// result: { reason: string, score: number } // score will be 0 or 1
|
|
85
|
+
```
|
|
86
|
+
|
|
82
87
|
## Development
|
|
83
88
|
- Source code is in `src/`
|
|
84
89
|
- Tests are in `tests/`
|
|
@@ -112,7 +117,7 @@ Specify the provider name and model name in `llmRubric` or `gEval`.
|
|
|
112
117
|
You can provide hooks to receive notifications about evaluation events (success or error) for logging, monitoring, or custom handling. Hooks can also be used to integrate with observability tools such as OpenTelemetry for tracing and metrics. Set these in the config:
|
|
113
118
|
|
|
114
119
|
```typescript
|
|
115
|
-
import Config from 'eva-judge';
|
|
120
|
+
import Config from '@eva-llm/eva-judge';
|
|
116
121
|
|
|
117
122
|
Config.hooks = {
|
|
118
123
|
onSuccess: ({ method, params, result, duration }) => {
|
|
@@ -127,7 +132,7 @@ Config.hooks = {
|
|
|
127
132
|
For advanced use, you can implement your own cache storage for evaluation steps (e.g., using Redis or another backend) by providing a custom cache via `setStepsCache()`:
|
|
128
133
|
|
|
129
134
|
```typescript
|
|
130
|
-
import Config from 'eva-judge';
|
|
135
|
+
import Config from '@eva-llm/eva-judge';
|
|
131
136
|
|
|
132
137
|
Config.setStepsCache(RedisCache); // RedisCache must implement IStepsCache
|
|
133
138
|
```
|
package/dst/config.d.ts
CHANGED
|
@@ -1,115 +1,37 @@
|
|
|
1
1
|
import { LRUCache } from 'lru-cache';
|
|
2
2
|
import { type LanguageModel } from 'ai';
|
|
3
|
-
|
|
4
|
-
* Interface for a cache that stores evaluation steps.
|
|
5
|
-
* Implementations should provide asynchronous set/get methods for storing and retrieving
|
|
6
|
-
* arrays of strings, typically representing evaluation steps for a given key.
|
|
7
|
-
*/
|
|
3
|
+
import { type EvalMethod } from './types';
|
|
8
4
|
export interface IStepsCache {
|
|
9
|
-
/**
|
|
10
|
-
* Store an array of steps in the cache for a given key.
|
|
11
|
-
* @param key Unique identifier for the steps (e.g., criteria string).
|
|
12
|
-
* @param value Array of step strings to cache.
|
|
13
|
-
* @returns Promise that resolves when the value is set.
|
|
14
|
-
*/
|
|
15
5
|
set(key: string, value: string[]): Promise<void>;
|
|
16
|
-
/**
|
|
17
|
-
* Retrieve an array of steps from the cache for a given key.
|
|
18
|
-
* @param key Unique identifier for the steps (e.g., criteria string).
|
|
19
|
-
* @returns Promise resolving to the cached array of steps, or undefined if not found.
|
|
20
|
-
*/
|
|
21
6
|
get(key: string): Promise<string[] | undefined>;
|
|
22
7
|
}
|
|
23
|
-
/**
|
|
24
|
-
* Optional hooks for receiving notifications about evaluation events.
|
|
25
|
-
* Can be used to monitor or log success and error events for evaluation functions.
|
|
26
|
-
*/
|
|
27
8
|
export interface EvaHooks {
|
|
28
|
-
/**
|
|
29
|
-
* Called when an evaluation completes successfully.
|
|
30
|
-
* @param data Information about the evaluation, including method, params, result, and duration (ms).
|
|
31
|
-
*/
|
|
32
9
|
onSuccess?: (data: {
|
|
33
|
-
method:
|
|
10
|
+
method: EvalMethod;
|
|
34
11
|
params: any;
|
|
35
12
|
result: any;
|
|
36
13
|
duration: number;
|
|
37
14
|
}) => void;
|
|
38
|
-
/**
|
|
39
|
-
* Called when an evaluation throws an error.
|
|
40
|
-
* @param data Information about the error, including method, error object, and duration (ms).
|
|
41
|
-
*/
|
|
42
15
|
onError?: (data: {
|
|
43
|
-
method:
|
|
16
|
+
method: EvalMethod;
|
|
44
17
|
error: any;
|
|
45
18
|
duration: number;
|
|
46
19
|
}) => void;
|
|
47
20
|
}
|
|
48
|
-
/**
|
|
49
|
-
* Global configuration and cache management for evaluation operations.
|
|
50
|
-
* Provides options for enabling/disabling model and steps caching, and allows
|
|
51
|
-
* customization of cache implementations and event hooks.
|
|
52
|
-
*/
|
|
53
21
|
declare const _default: {
|
|
54
|
-
/**
|
|
55
|
-
* Maximum score for evaluation (used for normalization).
|
|
56
|
-
*/
|
|
57
22
|
gevalMaxScore: number;
|
|
58
|
-
/**
|
|
59
|
-
* Whether model caching is enabled (for LLM instances).
|
|
60
|
-
*/
|
|
61
23
|
isModelCached: boolean;
|
|
62
|
-
/**
|
|
63
|
-
* Whether steps caching is enabled (for evaluation steps).
|
|
64
|
-
*/
|
|
65
24
|
isStepsCached: boolean;
|
|
66
|
-
/**
|
|
67
|
-
* LRU cache for language model instances.
|
|
68
|
-
*/
|
|
69
25
|
modelCache: LRUCache<string, LanguageModel, unknown>;
|
|
70
|
-
/**
|
|
71
|
-
* Cache for evaluation steps (criteria → steps).
|
|
72
|
-
*/
|
|
73
26
|
stepsCache: IStepsCache;
|
|
74
|
-
/**
|
|
75
|
-
* Restart the model cache with a new maximum size.
|
|
76
|
-
* @param size The new cache size (default: 100).
|
|
77
|
-
*/
|
|
78
27
|
restartModelCache(size?: number): void;
|
|
79
|
-
/**
|
|
80
|
-
* Restart the steps cache with a new maximum size.
|
|
81
|
-
* @param size The new cache size (default: 500).
|
|
82
|
-
*/
|
|
83
28
|
restartStepsCache(size?: number): void;
|
|
84
|
-
/**
|
|
85
|
-
* Set a custom steps cache implementation.
|
|
86
|
-
* @param cache The new IStepsCache implementation to use.
|
|
87
|
-
*/
|
|
88
29
|
setStepsCache(cache: IStepsCache): void;
|
|
89
|
-
/**
|
|
90
|
-
* Enable model caching (LLM instances).
|
|
91
|
-
*/
|
|
92
30
|
enableModelCache(): void;
|
|
93
|
-
/**
|
|
94
|
-
* Disable model caching (LLM instances).
|
|
95
|
-
*/
|
|
96
31
|
disableModelCache(): void;
|
|
97
|
-
/**
|
|
98
|
-
* Enable steps caching (criteria → steps).
|
|
99
|
-
*/
|
|
100
32
|
enableStepsCache(): void;
|
|
101
|
-
/**
|
|
102
|
-
* Disable steps caching (criteria → steps).
|
|
103
|
-
*/
|
|
104
33
|
disableStepsCache(): void;
|
|
105
|
-
/**
|
|
106
|
-
* Hooks for evaluation events (success/error notifications).
|
|
107
|
-
*/
|
|
108
34
|
hooks: EvaHooks;
|
|
109
|
-
/**
|
|
110
|
-
* Set the hooks for evaluation events.
|
|
111
|
-
* @param hooks The hooks object implementing EvaHooks.
|
|
112
|
-
*/
|
|
113
35
|
setHooks(hooks: EvaHooks): void;
|
|
114
36
|
};
|
|
115
37
|
export default _default;
|
package/dst/config.js
CHANGED
|
@@ -1,113 +1,46 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
const lru_cache_1 = require("lru-cache");
|
|
4
|
-
/**
|
|
5
|
-
* In-memory implementation of IStepsCache using an LRU (Least Recently Used) cache.
|
|
6
|
-
* Useful for fast, ephemeral caching of evaluation steps during runtime.
|
|
7
|
-
*/
|
|
8
4
|
class StepsMemoryAdapter {
|
|
9
5
|
cache;
|
|
10
|
-
/**
|
|
11
|
-
* Construct a new StepsMemoryAdapter.
|
|
12
|
-
* @param size Maximum number of items to store in the cache.
|
|
13
|
-
*/
|
|
14
6
|
constructor(size) {
|
|
15
7
|
this.cache = new lru_cache_1.LRUCache({ max: size });
|
|
16
8
|
}
|
|
17
|
-
/**
|
|
18
|
-
* Store an array of steps in the cache for a given key.
|
|
19
|
-
* @inheritdoc
|
|
20
|
-
*/
|
|
21
9
|
async set(key, value) {
|
|
22
10
|
this.cache.set(key, value);
|
|
23
11
|
}
|
|
24
|
-
/**
|
|
25
|
-
* Retrieve an array of steps from the cache for a given key.
|
|
26
|
-
* @inheritdoc
|
|
27
|
-
*/
|
|
28
12
|
async get(key) {
|
|
29
13
|
return this.cache.get(key);
|
|
30
14
|
}
|
|
31
15
|
}
|
|
32
|
-
/**
|
|
33
|
-
* Global configuration and cache management for evaluation operations.
|
|
34
|
-
* Provides options for enabling/disabling model and steps caching, and allows
|
|
35
|
-
* customization of cache implementations and event hooks.
|
|
36
|
-
*/
|
|
37
16
|
exports.default = {
|
|
38
|
-
/**
|
|
39
|
-
* Maximum score for evaluation (used for normalization).
|
|
40
|
-
*/
|
|
41
17
|
gevalMaxScore: 10,
|
|
42
|
-
/**
|
|
43
|
-
* Whether model caching is enabled (for LLM instances).
|
|
44
|
-
*/
|
|
45
18
|
isModelCached: true,
|
|
46
|
-
/**
|
|
47
|
-
* Whether steps caching is enabled (for evaluation steps).
|
|
48
|
-
*/
|
|
49
19
|
isStepsCached: true,
|
|
50
|
-
/**
|
|
51
|
-
* LRU cache for language model instances.
|
|
52
|
-
*/
|
|
53
20
|
modelCache: new lru_cache_1.LRUCache({ max: 100 }),
|
|
54
|
-
/**
|
|
55
|
-
* Cache for evaluation steps (criteria → steps).
|
|
56
|
-
*/
|
|
57
21
|
stepsCache: new StepsMemoryAdapter(500),
|
|
58
|
-
/**
|
|
59
|
-
* Restart the model cache with a new maximum size.
|
|
60
|
-
* @param size The new cache size (default: 100).
|
|
61
|
-
*/
|
|
62
22
|
restartModelCache(size = 100) {
|
|
63
23
|
this.modelCache = new lru_cache_1.LRUCache({ max: size });
|
|
64
24
|
},
|
|
65
|
-
/**
|
|
66
|
-
* Restart the steps cache with a new maximum size.
|
|
67
|
-
* @param size The new cache size (default: 500).
|
|
68
|
-
*/
|
|
69
25
|
restartStepsCache(size = 500) {
|
|
70
26
|
this.stepsCache = new StepsMemoryAdapter(size);
|
|
71
27
|
},
|
|
72
|
-
/**
|
|
73
|
-
* Set a custom steps cache implementation.
|
|
74
|
-
* @param cache The new IStepsCache implementation to use.
|
|
75
|
-
*/
|
|
76
28
|
setStepsCache(cache) {
|
|
77
29
|
this.stepsCache = cache;
|
|
78
30
|
},
|
|
79
|
-
/**
|
|
80
|
-
* Enable model caching (LLM instances).
|
|
81
|
-
*/
|
|
82
31
|
enableModelCache() {
|
|
83
32
|
this.isModelCached = true;
|
|
84
33
|
},
|
|
85
|
-
/**
|
|
86
|
-
* Disable model caching (LLM instances).
|
|
87
|
-
*/
|
|
88
34
|
disableModelCache() {
|
|
89
35
|
this.isModelCached = false;
|
|
90
36
|
},
|
|
91
|
-
/**
|
|
92
|
-
* Enable steps caching (criteria → steps).
|
|
93
|
-
*/
|
|
94
37
|
enableStepsCache() {
|
|
95
38
|
this.isStepsCached = true;
|
|
96
39
|
},
|
|
97
|
-
/**
|
|
98
|
-
* Disable steps caching (criteria → steps).
|
|
99
|
-
*/
|
|
100
40
|
disableStepsCache() {
|
|
101
41
|
this.isStepsCached = false;
|
|
102
42
|
},
|
|
103
|
-
/**
|
|
104
|
-
* Hooks for evaluation events (success/error notifications).
|
|
105
|
-
*/
|
|
106
43
|
hooks: {},
|
|
107
|
-
/**
|
|
108
|
-
* Set the hooks for evaluation events.
|
|
109
|
-
* @param hooks The hooks object implementing EvaHooks.
|
|
110
|
-
*/
|
|
111
44
|
setHooks(hooks) {
|
|
112
45
|
this.hooks = hooks;
|
|
113
46
|
}
|
package/dst/config.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;AA8BrC,MAAM,kBAAkB;IACd,KAAK,CAA6B;IAM1C,YAAY,IAAY;QACtB,IAAI,CAAC,KAAK,GAAG,IAAI,oBAAQ,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW,EAAE,KAAe;QACpC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAC7B,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;CACF;AAiCD,kBAAe;IAIb,aAAa,EAAE,EAAE;IAIjB,aAAa,EAAE,IAAI;IAInB,aAAa,EAAE,IAAI;IAInB,UAAU,EAAE,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;IAI7D,UAAU,EAAE,IAAI,kBAAkB,CAAC,GAAG,CAAgB;IAKtD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACvE,CAAC;IAKD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,kBAAkB,CAAC,IAAI,CAAgB,CAAC;IAChE,CAAC;IAKD,aAAa,CAAC,KAAkB;QAC9B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,KAAK,EAAE,EAAc;IAKrB,QAAQ,CAAC,KAAe;QACtB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF,CAAC"}
|
package/dst/index.d.ts
CHANGED
|
@@ -1,76 +1,25 @@
|
|
|
1
1
|
import z from 'zod';
|
|
2
2
|
export * from './config';
|
|
3
3
|
export { default } from './config';
|
|
4
|
-
/**
|
|
5
|
-
* Options for evaluation functions.
|
|
6
|
-
* Allows customization of LLM generation parameters and provider-specific options.
|
|
7
|
-
*/
|
|
8
4
|
export interface EvalOptions {
|
|
9
|
-
/**
|
|
10
|
-
* Temperature for model generation (controls randomness).
|
|
11
|
-
*/
|
|
12
5
|
temperature?: number;
|
|
13
|
-
/**
|
|
14
|
-
* Additional provider-specific options (passed to the LLM provider).
|
|
15
|
-
*/
|
|
16
6
|
providerOptions?: Record<string, any>;
|
|
17
7
|
}
|
|
18
|
-
/**
|
|
19
|
-
* Zod schema for rubric result.
|
|
20
|
-
* Describes the structure of the result returned by rubric-based evaluation.
|
|
21
|
-
*/
|
|
22
8
|
export declare const RubricResultSchema: z.ZodObject<{
|
|
23
9
|
reason: z.ZodString;
|
|
24
10
|
pass: z.ZodBoolean;
|
|
25
11
|
score: z.ZodNumber;
|
|
26
12
|
}, z.core.$strip>;
|
|
27
|
-
/**
|
|
28
|
-
* Type for rubric result (inferred from RubricResultSchema).
|
|
29
|
-
*/
|
|
30
13
|
export type RubricResult = z.infer<typeof RubricResultSchema>;
|
|
31
|
-
/**
|
|
32
|
-
* Zod schema for evaluation steps result.
|
|
33
|
-
* Describes the structure of the result containing evaluation steps derived from criteria.
|
|
34
|
-
*/
|
|
35
14
|
export declare const GevalStepsResultSchema: z.ZodObject<{
|
|
36
15
|
steps: z.ZodArray<z.ZodString>;
|
|
37
16
|
}, z.core.$strip>;
|
|
38
|
-
/**
|
|
39
|
-
* Type for evaluation steps result (inferred from GevalStepsResultSchema).
|
|
40
|
-
*/
|
|
41
17
|
export type GevalStepsResult = z.infer<typeof GevalStepsResultSchema>;
|
|
42
|
-
/**
|
|
43
|
-
* Zod schema for evaluation result.
|
|
44
|
-
* Describes the structure of the result returned by the main evaluation function.
|
|
45
|
-
*/
|
|
46
18
|
export declare const GevalEvaluateResultSchema: z.ZodObject<{
|
|
47
19
|
reason: z.ZodString;
|
|
48
20
|
score: z.ZodNumber;
|
|
49
21
|
}, z.core.$strip>;
|
|
50
|
-
/**
|
|
51
|
-
* Type for evaluation result (inferred from GevalEvaluateResultSchema).
|
|
52
|
-
*/
|
|
53
22
|
export type GevalEvaluateResult = z.infer<typeof GevalEvaluateResultSchema>;
|
|
54
|
-
/**
|
|
55
|
-
* Evaluate output against a rubric using an LLM.
|
|
56
|
-
* Uses a system and user prompt to instruct the LLM to grade the output according to the rubric.
|
|
57
|
-
* @param output The output to grade.
|
|
58
|
-
* @param rubric The rubric to use for grading.
|
|
59
|
-
* @param providerName The provider name for the LLM.
|
|
60
|
-
* @param modelName The model name for the LLM.
|
|
61
|
-
* @param options Optional evaluation options (temperature, providerOptions, etc).
|
|
62
|
-
* @returns The rubric result (reason, pass, score).
|
|
63
|
-
*/
|
|
64
23
|
export declare const llmRubric: (output: string, rubric: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<RubricResult>;
|
|
65
|
-
/**
|
|
66
|
-
* Evaluate a reply against criteria and steps using an LLM.
|
|
67
|
-
* If steps for the criteria are not cached, generates them first, then evaluates the answer.
|
|
68
|
-
* @param prompt The prompt given to the model.
|
|
69
|
-
* @param answer The reply to evaluate.
|
|
70
|
-
* @param criteria The evaluation criteria (used to derive steps).
|
|
71
|
-
* @param providerName The provider name for the LLM.
|
|
72
|
-
* @param modelName The model name for the LLM.
|
|
73
|
-
* @param options Optional evaluation options (temperature, providerOptions, etc).
|
|
74
|
-
* @returns The evaluation result with normalized score (reason, score).
|
|
75
|
-
*/
|
|
76
24
|
export declare const gEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
|
|
25
|
+
export declare const bEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
|
package/dst/index.js
CHANGED
|
@@ -39,7 +39,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
39
39
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
40
40
|
};
|
|
41
41
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
-
exports.gEval = exports.llmRubric = exports.GevalEvaluateResultSchema = exports.GevalStepsResultSchema = exports.RubricResultSchema = exports.default = void 0;
|
|
42
|
+
exports.bEval = exports.gEval = exports.llmRubric = exports.GevalEvaluateResultSchema = exports.GevalStepsResultSchema = exports.RubricResultSchema = exports.default = void 0;
|
|
43
43
|
const ai_1 = require("ai");
|
|
44
44
|
const Mustache = __importStar(require("mustache"));
|
|
45
45
|
const zod_1 = __importDefault(require("zod"));
|
|
@@ -49,46 +49,18 @@ const config_1 = __importDefault(require("./config"));
|
|
|
49
49
|
__exportStar(require("./config"), exports);
|
|
50
50
|
var config_2 = require("./config");
|
|
51
51
|
Object.defineProperty(exports, "default", { enumerable: true, get: function () { return __importDefault(config_2).default; } });
|
|
52
|
-
/**
|
|
53
|
-
* Zod schema for rubric result.
|
|
54
|
-
* Describes the structure of the result returned by rubric-based evaluation.
|
|
55
|
-
*/
|
|
56
52
|
exports.RubricResultSchema = zod_1.default.object({
|
|
57
|
-
/** Detailed explanation of the score based on the rubric. */
|
|
58
53
|
reason: zod_1.default.string().describe('Detailed explanation of the score based on the rubric'),
|
|
59
|
-
/** Whether the output satisfies the minimum requirements. */
|
|
60
54
|
pass: zod_1.default.boolean().describe('Whether the output satisfies the minimum requirements'),
|
|
61
|
-
/** Numeric representation of quality (0-1). */
|
|
62
55
|
score: zod_1.default.number().min(0).max(1).describe('Numeric representation of quality'),
|
|
63
56
|
});
|
|
64
|
-
/**
|
|
65
|
-
* Zod schema for evaluation steps result.
|
|
66
|
-
* Describes the structure of the result containing evaluation steps derived from criteria.
|
|
67
|
-
*/
|
|
68
57
|
exports.GevalStepsResultSchema = zod_1.default.object({
|
|
69
|
-
/** List of concise evaluation steps derived from the criteria. */
|
|
70
58
|
steps: zod_1.default.array(zod_1.default.string()).describe('List of concise evaluation steps derived from the criteria'),
|
|
71
59
|
});
|
|
72
|
-
/**
|
|
73
|
-
* Zod schema for evaluation result.
|
|
74
|
-
* Describes the structure of the result returned by the main evaluation function.
|
|
75
|
-
*/
|
|
76
60
|
exports.GevalEvaluateResultSchema = zod_1.default.object({
|
|
77
|
-
/** Detailed explanation of the score based on the rubric. */
|
|
78
61
|
reason: zod_1.default.string().describe('Detailed explanation of the score based on the rubric'),
|
|
79
|
-
/** Numeric representation of quality (normalized score, 0-1). */
|
|
80
62
|
score: zod_1.default.number().min(0).describe('Numeric representation of quality'),
|
|
81
63
|
});
|
|
82
|
-
/**
|
|
83
|
-
* Evaluate output against a rubric using an LLM.
|
|
84
|
-
* Uses a system and user prompt to instruct the LLM to grade the output according to the rubric.
|
|
85
|
-
* @param output The output to grade.
|
|
86
|
-
* @param rubric The rubric to use for grading.
|
|
87
|
-
* @param providerName The provider name for the LLM.
|
|
88
|
-
* @param modelName The model name for the LLM.
|
|
89
|
-
* @param options Optional evaluation options (temperature, providerOptions, etc).
|
|
90
|
-
* @returns The rubric result (reason, pass, score).
|
|
91
|
-
*/
|
|
92
64
|
const llmRubric = async (output, rubric, providerName, modelName, options = {}) => {
|
|
93
65
|
const start = Date.now();
|
|
94
66
|
try {
|
|
@@ -120,18 +92,7 @@ const llmRubric = async (output, rubric, providerName, modelName, options = {})
|
|
|
120
92
|
}
|
|
121
93
|
};
|
|
122
94
|
exports.llmRubric = llmRubric;
|
|
123
|
-
|
|
124
|
-
* Evaluate a reply against criteria and steps using an LLM.
|
|
125
|
-
* If steps for the criteria are not cached, generates them first, then evaluates the answer.
|
|
126
|
-
* @param prompt The prompt given to the model.
|
|
127
|
-
* @param answer The reply to evaluate.
|
|
128
|
-
* @param criteria The evaluation criteria (used to derive steps).
|
|
129
|
-
* @param providerName The provider name for the LLM.
|
|
130
|
-
* @param modelName The model name for the LLM.
|
|
131
|
-
* @param options Optional evaluation options (temperature, providerOptions, etc).
|
|
132
|
-
* @returns The evaluation result with normalized score (reason, score).
|
|
133
|
-
*/
|
|
134
|
-
const gEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => {
|
|
95
|
+
const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScore, methodName, options = {}) => {
|
|
135
96
|
const start = Date.now();
|
|
136
97
|
try {
|
|
137
98
|
const model = (0, registry_1.getModel)(providerName, modelName);
|
|
@@ -147,14 +108,14 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
|
|
|
147
108
|
...options,
|
|
148
109
|
});
|
|
149
110
|
steps = stepsResult.steps;
|
|
150
|
-
(0, registry_1.setSteps)(criteria, stepsResult.steps);
|
|
111
|
+
(0, registry_1.setSteps)(criteria, stepsResult.steps);
|
|
151
112
|
}
|
|
152
113
|
const evaluationPrompt = Mustache.render(prompt_1.GEVAL_EVALUATE_PROMPT, {
|
|
153
114
|
criteria,
|
|
154
115
|
steps: steps.join('\n- '),
|
|
155
116
|
input: prompt,
|
|
156
117
|
output: answer,
|
|
157
|
-
maxScore
|
|
118
|
+
maxScore,
|
|
158
119
|
});
|
|
159
120
|
const { output: evalResult } = await (0, ai_1.generateText)({
|
|
160
121
|
model,
|
|
@@ -166,10 +127,10 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
|
|
|
166
127
|
});
|
|
167
128
|
const result = {
|
|
168
129
|
reason: evalResult.reason,
|
|
169
|
-
score: evalResult.score /
|
|
130
|
+
score: evalResult.score / maxScore,
|
|
170
131
|
};
|
|
171
132
|
config_1.default.hooks.onSuccess?.({
|
|
172
|
-
method:
|
|
133
|
+
method: methodName,
|
|
173
134
|
params: { prompt, answer, criteria, providerName, modelName, options },
|
|
174
135
|
result,
|
|
175
136
|
duration: Date.now() - start,
|
|
@@ -178,12 +139,15 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
|
|
|
178
139
|
}
|
|
179
140
|
catch (error) {
|
|
180
141
|
config_1.default.hooks.onError?.({
|
|
181
|
-
method:
|
|
142
|
+
method: methodName,
|
|
182
143
|
error,
|
|
183
144
|
duration: Date.now() - start,
|
|
184
145
|
});
|
|
185
146
|
throw error;
|
|
186
147
|
}
|
|
187
148
|
};
|
|
149
|
+
const gEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => _gEval(prompt, answer, criteria, providerName, modelName, config_1.default.gevalMaxScore, 'gEval', options);
|
|
188
150
|
exports.gEval = gEval;
|
|
151
|
+
const bEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => _gEval(prompt, answer, criteria, providerName, modelName, 1, 'bEval', options);
|
|
152
|
+
exports.bEval = bEval;
|
|
189
153
|
//# sourceMappingURL=index.js.map
|
package/dst/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAqBH,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEzC,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEnF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAUU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAE7C,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAWU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEhD,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAgBI,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,iCAAwB;YAChC,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvCY,QAAA,SAAS,aAuCrB;AAED,MAAM,MAAM,GAAG,KAAK,EAClB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,QAAgB,EAChB,UAAsB,EACtB,U
AAuB,EAAE,EACK,EAAE;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,8BAAqB,EAAE;YAC9D,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,MAAM;YACb,MAAM,EAAE,MAAM;YACd,QAAQ;SACT,CAAC,CAAC;QAEH,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,QAAQ;SACnC,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACtE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,UAAU;YAClB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAaM,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,gBAAI,CAAC,aAAa,EAClB,OAAO,EACP,OAAO,CACR,CAAC;AAhBW,QAAA,KAAK,SAgBhB;AAaK,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,CAAC,EACD,OAAO,EACP,OAAO,CACR,CAAC;AAhBW,QAAA,KAA
K,SAgBhB"}
|
package/dst/prompt.d.ts
CHANGED
|
@@ -1,16 +1,8 @@
|
|
|
1
|
-
|
|
2
|
-
*
|
|
1
|
+
/*!
|
|
2
|
+
* Portions of this code are based on Promptfoo (MIT License)
|
|
3
|
+
* Copyright (c) 2025 Promptfoo
|
|
3
4
|
*/
|
|
4
5
|
export declare const LLM_RUBRIC_SYSTEM_PROMPT = "You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}\n\nExamples:\n\n<Output>Hello world</Output>\n<Rubric>Content contains a greeting</Rubric>\n{\"reason\": \"the content contains the word 'Hello'\", \"pass\": true, \"score\": 1.0}\n\n<Output>Avast ye swabs, repel the invaders!</Output>\n<Rubric>Does not speak like a pirate</Rubric>\n{\"reason\": \"'avast ye' is a common pirate term\", \"pass\": false, \"score\": 0.0}\n";
|
|
5
|
-
/**
|
|
6
|
-
* User prompt template for rubric-based grading. Used to inject output and rubric into the prompt.
|
|
7
|
-
*/
|
|
8
6
|
export declare const LLM_RUBRIC_USER_PROMPT = "<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>";
|
|
9
|
-
/**
|
|
10
|
-
* System prompt for generating evaluation steps from criteria. Guides the LLM to output a minified JSON array of steps.
|
|
11
|
-
*/
|
|
12
7
|
export declare const GEVAL_STEPS_PROMPT = "\nGiven an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.\n\n**EVALUATION CRITERIA**\n{{criteria}}\n\n**OUTPUT FORMAT**\nIMPORTANT:\n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain a single key, \"steps\", whose value is a list of strings.\n- Each string must represent one evaluation step.\n- Do NOT include any explanations, commentary, extra text, or additional formatting.\n\nFormat:\n{\"steps\": <list_of_strings>}\n\nExample:\n{\"steps\":[\"<Evaluation Step 1>\",\"<Evaluation Step 2>\",\"<Evaluation Step 3>\",\"<Evaluation Step 4>\"]}\n\nHere are the 3-4 concise evaluation steps, formatted as required in a minified JSON:\nJSON:\n";
|
|
13
|
-
/**
|
|
14
|
-
* System prompt for evaluating a reply against criteria and steps. Guides the LLM to return a JSON with score and reason.
|
|
15
|
-
*/
|
|
16
8
|
export declare const GEVAL_EVALUATE_PROMPT = "\nYou will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n{{criteria}}\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!\n\n**Prompt**\n{{input}}\n\n**Reply**\n{{output}}\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
|
package/dst/prompt.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
/*!
|
|
3
|
+
* Portions of this code are based on Promptfoo (MIT License)
|
|
4
|
+
* Copyright (c) 2025 Promptfoo
|
|
5
|
+
*/
|
|
2
6
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
7
|
exports.GEVAL_EVALUATE_PROMPT = exports.GEVAL_STEPS_PROMPT = exports.LLM_RUBRIC_USER_PROMPT = exports.LLM_RUBRIC_SYSTEM_PROMPT = void 0;
|
|
4
|
-
/**
|
|
5
|
-
* System prompt for LLM rubric-based grading. Guides the LLM to grade output according to a rubric and respond with a JSON object.
|
|
6
|
-
*/
|
|
7
8
|
exports.LLM_RUBRIC_SYSTEM_PROMPT = `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
8
9
|
|
|
9
10
|
Examples:
|
|
@@ -16,13 +17,7 @@ Examples:
|
|
|
16
17
|
<Rubric>Does not speak like a pirate</Rubric>
|
|
17
18
|
{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
|
|
18
19
|
`;
|
|
19
|
-
/**
|
|
20
|
-
* User prompt template for rubric-based grading. Used to inject output and rubric into the prompt.
|
|
21
|
-
*/
|
|
22
20
|
exports.LLM_RUBRIC_USER_PROMPT = '<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>';
|
|
23
|
-
/**
|
|
24
|
-
* System prompt for generating evaluation steps from criteria. Guides the LLM to output a minified JSON array of steps.
|
|
25
|
-
*/
|
|
26
21
|
exports.GEVAL_STEPS_PROMPT = `
|
|
27
22
|
Given an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.
|
|
28
23
|
|
|
@@ -45,9 +40,6 @@ Example:
|
|
|
45
40
|
Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
|
|
46
41
|
JSON:
|
|
47
42
|
`;
|
|
48
|
-
/**
|
|
49
|
-
* System prompt for evaluating a reply against criteria and steps. Guides the LLM to return a JSON with score and reason.
|
|
50
|
-
*/
|
|
51
43
|
exports.GEVAL_EVALUATE_PROMPT = `
|
|
52
44
|
You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
|
|
53
45
|
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
|
package/dst/prompt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";;;
|
|
1
|
+
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";AAAA;;;GAGG;;;AAKU,QAAA,wBAAwB,GAAG;;;;;;;;;;;CAWvC,CAAC;AAKW,QAAA,sBAAsB,GAAG,kEAAkE,CAAC;AAK5F,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;;;;;;CAqBjC,CAAC;AAKW,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+BpC,CAAC"}
|
package/dst/registry.d.ts
CHANGED
|
@@ -1,21 +1,4 @@
|
|
|
1
1
|
import { type LanguageModel } from 'ai';
|
|
2
|
-
/**
|
|
3
|
-
* Get a language model instance from the provider and model name, using cache if enabled.
|
|
4
|
-
* @param providerName The provider name (e.g., 'openai').
|
|
5
|
-
* @param modelName The model name.
|
|
6
|
-
* @returns The language model instance.
|
|
7
|
-
*/
|
|
8
2
|
export declare const getModel: (providerName: string, modelName: string) => LanguageModel;
|
|
9
|
-
/**
|
|
10
|
-
* Get cached evaluation steps for a criteria, if caching is enabled.
|
|
11
|
-
* @param criteria The evaluation criteria string.
|
|
12
|
-
* @returns Promise resolving to the cached steps or undefined.
|
|
13
|
-
*/
|
|
14
3
|
export declare const getSteps: (criteria: string) => Promise<string[] | undefined>;
|
|
15
|
-
/**
|
|
16
|
-
* Set evaluation steps for a criteria in the cache, if caching is enabled.
|
|
17
|
-
* @param criteria The evaluation criteria string.
|
|
18
|
-
* @param steps The steps to cache.
|
|
19
|
-
* @returns Promise that resolves when the steps are set.
|
|
20
|
-
*/
|
|
21
4
|
export declare const setSteps: (criteria: string, steps: string[]) => Promise<void>;
|
package/dst/registry.js
CHANGED
|
@@ -49,9 +49,6 @@ const groq_1 = require("@ai-sdk/groq");
|
|
|
49
49
|
const perplexity_1 = require("@ai-sdk/perplexity");
|
|
50
50
|
const xai_1 = require("@ai-sdk/xai");
|
|
51
51
|
const config_1 = __importDefault(require("./config"));
|
|
52
|
-
/**
|
|
53
|
-
* Map of provider names to provider functions.
|
|
54
|
-
*/
|
|
55
52
|
const PROVIDERS = {
|
|
56
53
|
openai: openai_1.openai,
|
|
57
54
|
anthropic: anthropic_1.anthropic,
|
|
@@ -64,12 +61,6 @@ const PROVIDERS = {
|
|
|
64
61
|
perplexity: perplexity_1.perplexity,
|
|
65
62
|
xai: xai_1.xai,
|
|
66
63
|
};
|
|
67
|
-
/**
|
|
68
|
-
* Get a language model instance from the provider and model name, using cache if enabled.
|
|
69
|
-
* @param providerName The provider name (e.g., 'openai').
|
|
70
|
-
* @param modelName The model name.
|
|
71
|
-
* @returns The language model instance.
|
|
72
|
-
*/
|
|
73
64
|
const getModel = (providerName, modelName) => {
|
|
74
65
|
const cacheKey = `${providerName}:${modelName}`;
|
|
75
66
|
let model = config_1.default.isModelCached ? config_1.default.modelCache.get(cacheKey) : undefined;
|
|
@@ -86,29 +77,13 @@ const getModel = (providerName, modelName) => {
|
|
|
86
77
|
return model;
|
|
87
78
|
};
|
|
88
79
|
exports.getModel = getModel;
|
|
89
|
-
/**
|
|
90
|
-
* Compute the MD5 hash of a string.
|
|
91
|
-
* @param str The input string.
|
|
92
|
-
* @returns The MD5 hash as a hex string.
|
|
93
|
-
*/
|
|
94
80
|
const md5 = (str) => {
|
|
95
81
|
return crypto.createHash('md5').update(str).digest('hex');
|
|
96
82
|
};
|
|
97
|
-
/**
|
|
98
|
-
* Get cached evaluation steps for a criteria, if caching is enabled.
|
|
99
|
-
* @param criteria The evaluation criteria string.
|
|
100
|
-
* @returns Promise resolving to the cached steps or undefined.
|
|
101
|
-
*/
|
|
102
83
|
const getSteps = (criteria) => {
|
|
103
84
|
return config_1.default.isStepsCached ? config_1.default.stepsCache.get(md5(criteria)) : Promise.resolve(undefined);
|
|
104
85
|
};
|
|
105
86
|
exports.getSteps = getSteps;
|
|
106
|
-
/**
|
|
107
|
-
* Set evaluation steps for a criteria in the cache, if caching is enabled.
|
|
108
|
-
* @param criteria The evaluation criteria string.
|
|
109
|
-
* @param steps The steps to cache.
|
|
110
|
-
* @returns Promise that resolves when the steps are set.
|
|
111
|
-
*/
|
|
112
87
|
const setSteps = (criteria, steps) => {
|
|
113
88
|
if (config_1.default.isStepsCached) {
|
|
114
89
|
return config_1.default.stepsCache.set(md5(criteria), steps);
|
package/dst/registry.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"registry.js","sourceRoot":"","sources":["../src/registry.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,oDAAsC;AAEtC,2CAAwC;AACxC,iDAA8C;AAC9C,2CAAwC;AACxC,6CAA0C;AAC1C,2DAAiD;AACjD,yCAAsC;AACtC,+CAA4C;AAC5C,uCAAoC;AACpC,mDAAgD;AAChD,qCAAkC;AAElC,sDAA4B;
|
|
1
|
+
{"version":3,"file":"registry.js","sourceRoot":"","sources":["../src/registry.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,oDAAsC;AAEtC,2CAAwC;AACxC,iDAA8C;AAC9C,2CAAwC;AACxC,6CAA0C;AAC1C,2DAAiD;AACjD,yCAAsC;AACtC,+CAA4C;AAC5C,uCAAoC;AACpC,mDAAgD;AAChD,qCAAkC;AAElC,sDAA4B;AAM5B,MAAM,SAAS,GAA6B;IAC1C,MAAM,EAAN,eAAM;IACN,SAAS,EAAT,qBAAS;IACT,MAAM,EAAN,eAAM;IACN,OAAO,EAAP,iBAAO;IACP,OAAO,EAAP,wBAAO;IACP,KAAK,EAAL,aAAK;IACL,QAAQ,EAAR,mBAAQ;IACR,IAAI,EAAJ,WAAI;IACJ,UAAU,EAAV,uBAAU;IACV,GAAG,EAAH,SAAG;CACJ,CAAC;AAQK,MAAM,QAAQ,GAAG,CAAC,YAAoB,EAAE,SAAiB,EAAiB,EAAE;IACjF,MAAM,QAAQ,GAAG,GAAG,YAAY,IAAI,SAAS,EAAE,CAAC;IAEhD,IAAI,KAAK,GAAG,gBAAI,CAAC,aAAa,CAAC,CAAC,CAAC,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAE3E,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,QAAQ,GAAG,SAAS,CAAC,YAAY,CAAC,CAAC;QAEzC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,IAAI,KAAK,CAAC,sBAAsB,YAAY,2BAA2B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACpH,CAAC;QAED,KAAK,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC;QAE5B,IAAI,gBAAI,CAAC,aAAa,EAAE,CAAC;YACvB,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAED,OAAO,KAAM,CAAC;AAChB,CAAC,CAAA;AApBY,QAAA,QAAQ,YAoBpB;AAOD,MAAM,GAAG,GAAG,CAAC,GAAW,EAAU,EAAE;IAClC,OAAO,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC5D,CAAC,CAAA;AAOM,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAiC,EAAE;IAC1E,OAAO,gBAAI,CAAC,aAAa,CAAC,CAAC,CAAC,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;AAC9F,CAAC,CAAA;AAFY,QAAA,QAAQ,YAEpB;AAQM,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAE,KAAe,EAAiB,EAAE;IAC3E,IAAI,gBAAI,CAAC,aAAa,EAAE,CAAC;QACvB,OAAO,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,CAAC,CAAC;IACnD,CAAC;IAED,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC;AAC3B,CAAC,CAAA;AANY,QAAA,QAAQ,YAMpB"}
|
package/dst/types.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export type EvalMethod = 'bEval' | 'gEval' | 'llmRubric';
|
package/dst/types.js
ADDED
package/dst/types.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@eva-llm/eva-judge",
|
|
3
|
-
"version": "0.1.0",
|
|
4
|
-
"description": "LLM-as-a-
|
|
3
|
+
"version": "0.1.2",
|
|
4
|
+
"description": "LLM-as-a-Judge abstraction layer using ai-sdk and plugins",
|
|
5
5
|
"main": "dst/index.js",
|
|
6
6
|
"types": "dst/index.d.ts",
|
|
7
7
|
"engines": {
|