@eva-llm/eva-judge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +135 -0
- package/dst/config.d.ts +115 -0
- package/dst/config.js +115 -0
- package/dst/config.js.map +1 -0
- package/dst/index.d.ts +76 -0
- package/dst/index.js +189 -0
- package/dst/index.js.map +1 -0
- package/dst/prompt.d.ts +16 -0
- package/dst/prompt.js +83 -0
- package/dst/prompt.js.map +1 -0
- package/dst/registry.d.ts +21 -0
- package/dst/registry.js +119 -0
- package/dst/registry.js.map +1 -0
- package/package.json +51 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 [Your Name or Organization]
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Project Inspiration & Attribution
|
|
2
|
+
|
|
3
|
+
This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including the author's work on the G-Eval framework there. The LLM-as-a-judge prompts are copied from promptfoo and adapted for project-specific needs.
|
|
4
|
+
|
|
5
|
+
# eva-judge
|
|
6
|
+
|
|
7
|
+
A TypeScript/Node.js package for evaluating and managing test cases, prompts, and registry logic for AI or code evaluation workflows.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
- Configuration management for evaluation workflows
|
|
11
|
+
- Prompt handling and manipulation
|
|
12
|
+
- Registry for test cases and evaluation items
|
|
13
|
+
- Designed for integration with Jest and other test runners
|
|
14
|
+
|
|
15
|
+
## Project Structure
|
|
16
|
+
- `src/` — Main source code
|
|
17
|
+
- `config.ts` — Configuration logic
|
|
18
|
+
- `prompt.ts` — Prompt utilities
|
|
19
|
+
- `registry.ts` — Registry management
|
|
20
|
+
- `index.ts` — Entry point
|
|
21
|
+
- `tests/` — Unit tests for all modules
|
|
22
|
+
|
|
23
|
+
## Getting Started
|
|
24
|
+
|
|
25
|
+
### Prerequisites
|
|
26
|
+
- Node.js (>= 16)
|
|
27
|
+
- pnpm (recommended) or npm/yarn
|
|
28
|
+
|
|
29
|
+
### Installation
|
|
30
|
+
|
|
31
|
+
Clone the repository and install dependencies:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pnpm install
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Running Tests
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pnpm test
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
Import and use the modules in your TypeScript/Node.js project:
|
|
46
|
+
|
|
47
|
+
```typescript
|
|
48
|
+
import { llmRubric, gEval } from '@eva-llm/eva-judge';
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### llmRubric
|
|
52
|
+
|
|
53
|
+
Evaluates an output against a rubric using an LLM. Returns a reason, pass/fail, and normalized score.
|
|
54
|
+
|
|
55
|
+
```typescript
|
|
56
|
+
const result = await llmRubric(
|
|
57
|
+
output, // string: the output to grade
|
|
58
|
+
rubric, // string: the rubric to use
|
|
59
|
+
provider, // string: LLM provider name
|
|
60
|
+
model, // string: LLM model name
|
|
61
|
+
options // optional: { temperature, providerOptions }
|
|
62
|
+
);
|
|
63
|
+
// result: { reason: string, pass: boolean, score: number }
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### gEval
|
|
67
|
+
|
|
68
|
+
Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score.
|
|
69
|
+
|
|
70
|
+
```typescript
|
|
71
|
+
const result = await gEval(
|
|
72
|
+
prompt, // string: the prompt given to the model
|
|
73
|
+
answer, // string: the reply to evaluate
|
|
74
|
+
criteria, // string: evaluation criteria
|
|
75
|
+
provider, // string: LLM provider name
|
|
76
|
+
model, // string: LLM model name
|
|
77
|
+
options // optional: { temperature, providerOptions }
|
|
78
|
+
);
|
|
79
|
+
// result: { reason: string, score: number }
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Development
|
|
83
|
+
- Source code is in `src/`
|
|
84
|
+
- Tests are in `tests/`
|
|
85
|
+
- Uses TypeScript and Jest for testing
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
MIT
|
|
89
|
+
|
|
90
|
+
## Supported Providers
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
The following LLM providers are supported (via [Vercel ai-sdk](https://github.com/vercel/ai)):
|
|
94
|
+
|
|
95
|
+
- OpenAI (`openai`)
|
|
96
|
+
- Anthropic (`anthropic`)
|
|
97
|
+
- Google (`google`)
|
|
98
|
+
- Mistral (`mistral`)
|
|
99
|
+
- Amazon Bedrock (`bedrock`)
|
|
100
|
+
- Azure (`azure`)
|
|
101
|
+
- DeepSeek (`deepseek`)
|
|
102
|
+
- Groq (`groq`)
|
|
103
|
+
- Perplexity (`perplexity`)
|
|
104
|
+
- xAI (`xai`)
|
|
105
|
+
|
|
106
|
+
Specify the provider name and model name in `llmRubric` or `gEval`.
|
|
107
|
+
|
|
108
|
+
> **Note:** Each provider integration is based on its respective ai-sdk package. Be sure to follow the provider's documentation for setup and authentication. Most providers require you to export an API key or token as an environment variable (e.g., `export OPENAI_API_KEY=...`).
|
|
109
|
+
|
|
110
|
+
## Hooks
|
|
111
|
+
|
|
112
|
+
You can provide hooks to receive notifications about evaluation events (success or error) for logging, monitoring, or custom handling. Hooks can also be used to integrate with observability tools such as OpenTelemetry for tracing and metrics. Set these in the config:
|
|
113
|
+
|
|
114
|
+
```typescript
|
|
115
|
+
import Config from '@eva-llm/eva-judge';
|
|
116
|
+
|
|
117
|
+
Config.hooks = {
|
|
118
|
+
onSuccess: ({ method, params, result, duration }) => {
|
|
119
|
+
// handle successful evaluation
|
|
120
|
+
},
|
|
121
|
+
onError: ({ method, error, duration }) => {
|
|
122
|
+
// handle evaluation error
|
|
123
|
+
}
|
|
124
|
+
};
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
For advanced use, you can implement your own cache storage for evaluation steps (e.g., using Redis or another backend) by providing a custom cache via `setStepsCache()`:
|
|
128
|
+
|
|
129
|
+
```typescript
|
|
130
|
+
import Config from '@eva-llm/eva-judge';
|
|
131
|
+
|
|
132
|
+
Config.setStepsCache(redisCache); // redisCache: an object implementing IStepsCache (async get/set)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
See `src/config.ts` for more details on available hooks and configuration options.
|
package/dst/config.d.ts
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { LRUCache } from 'lru-cache';
|
|
2
|
+
import { type LanguageModel } from 'ai';
|
|
3
|
+
/**
|
|
4
|
+
* Interface for a cache that stores evaluation steps.
|
|
5
|
+
* Implementations should provide asynchronous set/get methods for storing and retrieving
|
|
6
|
+
* arrays of strings, typically representing evaluation steps for a given key.
|
|
7
|
+
*/
|
|
8
|
+
export interface IStepsCache {
|
|
9
|
+
/**
|
|
10
|
+
* Store an array of steps in the cache for a given key.
|
|
11
|
+
* @param key Unique identifier for the steps (e.g., criteria string).
|
|
12
|
+
* @param value Array of step strings to cache.
|
|
13
|
+
* @returns Promise that resolves when the value is set.
|
|
14
|
+
*/
|
|
15
|
+
set(key: string, value: string[]): Promise<void>;
|
|
16
|
+
/**
|
|
17
|
+
* Retrieve an array of steps from the cache for a given key.
|
|
18
|
+
* @param key Unique identifier for the steps (e.g., criteria string).
|
|
19
|
+
* @returns Promise resolving to the cached array of steps, or undefined if not found.
|
|
20
|
+
*/
|
|
21
|
+
get(key: string): Promise<string[] | undefined>;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Optional hooks for receiving notifications about evaluation events.
|
|
25
|
+
* Can be used to monitor or log success and error events for evaluation functions.
|
|
26
|
+
*/
|
|
27
|
+
export interface EvaHooks {
|
|
28
|
+
/**
|
|
29
|
+
* Called when an evaluation completes successfully.
|
|
30
|
+
* @param data Information about the evaluation, including method, params, result, and duration (ms).
|
|
31
|
+
*/
|
|
32
|
+
onSuccess?: (data: {
|
|
33
|
+
method: 'gEval' | 'llmRubric';
|
|
34
|
+
params: any;
|
|
35
|
+
result: any;
|
|
36
|
+
duration: number;
|
|
37
|
+
}) => void;
|
|
38
|
+
/**
|
|
39
|
+
* Called when an evaluation throws an error.
|
|
40
|
+
* @param data Information about the error, including method, error object, and duration (ms).
|
|
41
|
+
*/
|
|
42
|
+
onError?: (data: {
|
|
43
|
+
method: 'gEval' | 'llmRubric';
|
|
44
|
+
error: any;
|
|
45
|
+
duration: number;
|
|
46
|
+
}) => void;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Global configuration and cache management for evaluation operations.
|
|
50
|
+
* Provides options for enabling/disabling model and steps caching, and allows
|
|
51
|
+
* customization of cache implementations and event hooks.
|
|
52
|
+
*/
|
|
53
|
+
declare const _default: {
|
|
54
|
+
/**
|
|
55
|
+
* Maximum raw score for gEval; the raw score returned by the LLM is divided by this value to normalize it.
|
|
56
|
+
*/
|
|
57
|
+
gevalMaxScore: number;
|
|
58
|
+
/**
|
|
59
|
+
* Whether model caching is enabled (for LLM instances).
|
|
60
|
+
*/
|
|
61
|
+
isModelCached: boolean;
|
|
62
|
+
/**
|
|
63
|
+
* Whether steps caching is enabled (for evaluation steps).
|
|
64
|
+
*/
|
|
65
|
+
isStepsCached: boolean;
|
|
66
|
+
/**
|
|
67
|
+
* LRU cache for language model instances.
|
|
68
|
+
*/
|
|
69
|
+
modelCache: LRUCache<string, LanguageModel, unknown>;
|
|
70
|
+
/**
|
|
71
|
+
* Cache for evaluation steps (criteria → steps).
|
|
72
|
+
*/
|
|
73
|
+
stepsCache: IStepsCache;
|
|
74
|
+
/**
|
|
75
|
+
* Restart the model cache with a new maximum size.
|
|
76
|
+
* @param size The new cache size (default: 100).
|
|
77
|
+
*/
|
|
78
|
+
restartModelCache(size?: number): void;
|
|
79
|
+
/**
|
|
80
|
+
* Restart the steps cache with a new maximum size.
|
|
81
|
+
* @param size The new cache size (default: 500).
|
|
82
|
+
*/
|
|
83
|
+
restartStepsCache(size?: number): void;
|
|
84
|
+
/**
|
|
85
|
+
* Set a custom steps cache implementation.
|
|
86
|
+
* @param cache The new IStepsCache implementation to use.
|
|
87
|
+
*/
|
|
88
|
+
setStepsCache(cache: IStepsCache): void;
|
|
89
|
+
/**
|
|
90
|
+
* Enable model caching (LLM instances).
|
|
91
|
+
*/
|
|
92
|
+
enableModelCache(): void;
|
|
93
|
+
/**
|
|
94
|
+
* Disable model caching (LLM instances).
|
|
95
|
+
*/
|
|
96
|
+
disableModelCache(): void;
|
|
97
|
+
/**
|
|
98
|
+
* Enable steps caching (criteria → steps).
|
|
99
|
+
*/
|
|
100
|
+
enableStepsCache(): void;
|
|
101
|
+
/**
|
|
102
|
+
* Disable steps caching (criteria → steps).
|
|
103
|
+
*/
|
|
104
|
+
disableStepsCache(): void;
|
|
105
|
+
/**
|
|
106
|
+
* Hooks for evaluation events (success/error notifications).
|
|
107
|
+
*/
|
|
108
|
+
hooks: EvaHooks;
|
|
109
|
+
/**
|
|
110
|
+
* Set the hooks for evaluation events.
|
|
111
|
+
* @param hooks The hooks object implementing EvaHooks.
|
|
112
|
+
*/
|
|
113
|
+
setHooks(hooks: EvaHooks): void;
|
|
114
|
+
};
|
|
115
|
+
export default _default;
|
package/dst/config.js
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
const lru_cache_1 = require("lru-cache");
|
|
4
|
+
/**
 * LRU-backed, in-memory implementation of the IStepsCache contract.
 * Keeps at most `size` entries; both accessors are async so the adapter can be
 * swapped for a custom asynchronous cache without changing callers.
 */
class StepsMemoryAdapter {
    /** Underlying LRU store that maps a key to its array of steps. */
    cache;
    /**
     * Create an adapter with a bounded LRU store.
     * @param size Upper bound on the number of entries kept in the cache.
     */
    constructor(size) {
        const store = new lru_cache_1.LRUCache({ max: size });
        this.cache = store;
    }
    /**
     * Save the steps for `key`, replacing any previous entry.
     * @param key Identifier the steps are stored under.
     * @param value Array of step strings to store.
     * @returns Promise that resolves once the entry has been written.
     */
    async set(key, value) {
        this.cache.set(key, value);
    }
    /**
     * Look up the steps stored for `key`.
     * @param key Identifier the steps were stored under.
     * @returns Promise resolving to the stored steps, or undefined when absent.
     */
    async get(key) {
        const cached = this.cache.get(key);
        return cached;
    }
}
|
|
32
|
+
/**
|
|
33
|
+
* Global configuration and cache management for evaluation operations.
|
|
34
|
+
* Provides options for enabling/disabling model and steps caching, and allows
|
|
35
|
+
* customization of cache implementations and event hooks.
|
|
36
|
+
*/
|
|
37
|
+
exports.default = {
|
|
38
|
+
/**
|
|
39
|
+
* Maximum score for evaluation (used for normalization).
|
|
40
|
+
*/
|
|
41
|
+
gevalMaxScore: 10,
|
|
42
|
+
/**
|
|
43
|
+
* Whether model caching is enabled (for LLM instances).
|
|
44
|
+
*/
|
|
45
|
+
isModelCached: true,
|
|
46
|
+
/**
|
|
47
|
+
* Whether steps caching is enabled (for evaluation steps).
|
|
48
|
+
*/
|
|
49
|
+
isStepsCached: true,
|
|
50
|
+
/**
|
|
51
|
+
* LRU cache for language model instances.
|
|
52
|
+
*/
|
|
53
|
+
modelCache: new lru_cache_1.LRUCache({ max: 100 }),
|
|
54
|
+
/**
|
|
55
|
+
* Cache for evaluation steps (criteria → steps).
|
|
56
|
+
*/
|
|
57
|
+
stepsCache: new StepsMemoryAdapter(500),
|
|
58
|
+
/**
|
|
59
|
+
* Restart the model cache with a new maximum size.
|
|
60
|
+
* @param size The new cache size (default: 100).
|
|
61
|
+
*/
|
|
62
|
+
restartModelCache(size = 100) {
|
|
63
|
+
this.modelCache = new lru_cache_1.LRUCache({ max: size });
|
|
64
|
+
},
|
|
65
|
+
/**
|
|
66
|
+
* Restart the steps cache with a new maximum size.
|
|
67
|
+
* @param size The new cache size (default: 500).
|
|
68
|
+
*/
|
|
69
|
+
restartStepsCache(size = 500) {
|
|
70
|
+
this.stepsCache = new StepsMemoryAdapter(size);
|
|
71
|
+
},
|
|
72
|
+
/**
|
|
73
|
+
* Set a custom steps cache implementation.
|
|
74
|
+
* @param cache The new IStepsCache implementation to use.
|
|
75
|
+
*/
|
|
76
|
+
setStepsCache(cache) {
|
|
77
|
+
this.stepsCache = cache;
|
|
78
|
+
},
|
|
79
|
+
/**
|
|
80
|
+
* Enable model caching (LLM instances).
|
|
81
|
+
*/
|
|
82
|
+
enableModelCache() {
|
|
83
|
+
this.isModelCached = true;
|
|
84
|
+
},
|
|
85
|
+
/**
|
|
86
|
+
* Disable model caching (LLM instances).
|
|
87
|
+
*/
|
|
88
|
+
disableModelCache() {
|
|
89
|
+
this.isModelCached = false;
|
|
90
|
+
},
|
|
91
|
+
/**
|
|
92
|
+
* Enable steps caching (criteria → steps).
|
|
93
|
+
*/
|
|
94
|
+
enableStepsCache() {
|
|
95
|
+
this.isStepsCached = true;
|
|
96
|
+
},
|
|
97
|
+
/**
|
|
98
|
+
* Disable steps caching (criteria → steps).
|
|
99
|
+
*/
|
|
100
|
+
disableStepsCache() {
|
|
101
|
+
this.isStepsCached = false;
|
|
102
|
+
},
|
|
103
|
+
/**
|
|
104
|
+
* Hooks for evaluation events (success/error notifications).
|
|
105
|
+
*/
|
|
106
|
+
hooks: {},
|
|
107
|
+
/**
|
|
108
|
+
* Set the hooks for evaluation events.
|
|
109
|
+
* @param hooks The hooks object implementing EvaHooks.
|
|
110
|
+
*/
|
|
111
|
+
setHooks(hooks) {
|
|
112
|
+
this.hooks = hooks;
|
|
113
|
+
}
|
|
114
|
+
};
|
|
115
|
+
//# sourceMappingURL=config.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;AA2BrC;;;GAGG;AACH,MAAM,kBAAkB;IACd,KAAK,CAA6B;IAE1C;;;OAGG;IACH,YAAY,IAAY;QACtB,IAAI,CAAC,KAAK,GAAG,IAAI,oBAAQ,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,GAAG,CAAC,GAAW,EAAE,KAAe;QACpC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAC7B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,GAAG,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;CACF;AA8BD;;;;GAIG;AACH,kBAAe;IACb;;OAEG;IACH,aAAa,EAAE,EAAE;IACjB;;OAEG;IACH,aAAa,EAAE,IAAI;IACnB;;OAEG;IACH,aAAa,EAAE,IAAI;IACnB;;OAEG;IACH,UAAU,EAAE,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;IAC7D;;OAEG;IACH,UAAU,EAAE,IAAI,kBAAkB,CAAC,GAAG,CAAgB;IAEtD;;;OAGG;IACH,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACvE,CAAC;IAED;;;OAGG;IACH,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,kBAAkB,CAAC,IAAI,CAAgB,CAAC;IAChE,CAAC;IAED;;;OAGG;IACH,aAAa,CAAC,KAAkB;QAC9B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAED;;OAEG;IACH,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAED;;OAEG;IACH,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,KAAK,EAAE,EAAc;IAErB;;;OAGG;IACH,QAAQ,CAAC,KAAe;QACtB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF,CAAC"}
|
package/dst/index.d.ts
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import z from 'zod';
|
|
2
|
+
export * from './config';
|
|
3
|
+
export { default } from './config';
|
|
4
|
+
/**
|
|
5
|
+
* Options for evaluation functions.
|
|
6
|
+
* Allows customization of LLM generation parameters and provider-specific options.
|
|
7
|
+
*/
|
|
8
|
+
export interface EvalOptions {
|
|
9
|
+
/**
|
|
10
|
+
* Temperature for model generation (controls randomness).
|
|
11
|
+
*/
|
|
12
|
+
temperature?: number;
|
|
13
|
+
/**
|
|
14
|
+
* Additional provider-specific options (passed to the LLM provider).
|
|
15
|
+
*/
|
|
16
|
+
providerOptions?: Record<string, any>;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Zod schema for rubric result.
|
|
20
|
+
* Describes the structure of the result returned by rubric-based evaluation.
|
|
21
|
+
*/
|
|
22
|
+
export declare const RubricResultSchema: z.ZodObject<{
|
|
23
|
+
reason: z.ZodString;
|
|
24
|
+
pass: z.ZodBoolean;
|
|
25
|
+
score: z.ZodNumber;
|
|
26
|
+
}, z.core.$strip>;
|
|
27
|
+
/**
|
|
28
|
+
* Type for rubric result (inferred from RubricResultSchema).
|
|
29
|
+
*/
|
|
30
|
+
export type RubricResult = z.infer<typeof RubricResultSchema>;
|
|
31
|
+
/**
|
|
32
|
+
* Zod schema for evaluation steps result.
|
|
33
|
+
* Describes the structure of the result containing evaluation steps derived from criteria.
|
|
34
|
+
*/
|
|
35
|
+
export declare const GevalStepsResultSchema: z.ZodObject<{
|
|
36
|
+
steps: z.ZodArray<z.ZodString>;
|
|
37
|
+
}, z.core.$strip>;
|
|
38
|
+
/**
|
|
39
|
+
* Type for evaluation steps result (inferred from GevalStepsResultSchema).
|
|
40
|
+
*/
|
|
41
|
+
export type GevalStepsResult = z.infer<typeof GevalStepsResultSchema>;
|
|
42
|
+
/**
|
|
43
|
+
* Zod schema for evaluation result.
|
|
44
|
+
* Describes the structure of the result returned by the main evaluation function.
|
|
45
|
+
*/
|
|
46
|
+
export declare const GevalEvaluateResultSchema: z.ZodObject<{
|
|
47
|
+
reason: z.ZodString;
|
|
48
|
+
score: z.ZodNumber;
|
|
49
|
+
}, z.core.$strip>;
|
|
50
|
+
/**
|
|
51
|
+
* Type for evaluation result (inferred from GevalEvaluateResultSchema).
|
|
52
|
+
*/
|
|
53
|
+
export type GevalEvaluateResult = z.infer<typeof GevalEvaluateResultSchema>;
|
|
54
|
+
/**
|
|
55
|
+
* Evaluate output against a rubric using an LLM.
|
|
56
|
+
* Uses a system and user prompt to instruct the LLM to grade the output according to the rubric.
|
|
57
|
+
* @param output The output to grade.
|
|
58
|
+
* @param rubric The rubric to use for grading.
|
|
59
|
+
* @param providerName The provider name for the LLM.
|
|
60
|
+
* @param modelName The model name for the LLM.
|
|
61
|
+
* @param options Optional evaluation options (temperature, providerOptions, etc).
|
|
62
|
+
* @returns The rubric result (reason, pass, score).
|
|
63
|
+
*/
|
|
64
|
+
export declare const llmRubric: (output: string, rubric: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<RubricResult>;
|
|
65
|
+
/**
|
|
66
|
+
* Evaluate a reply against criteria and steps using an LLM.
|
|
67
|
+
* If steps for the criteria are not cached, generates them first, then evaluates the answer.
|
|
68
|
+
* @param prompt The prompt given to the model.
|
|
69
|
+
* @param answer The reply to evaluate.
|
|
70
|
+
* @param criteria The evaluation criteria (used to derive steps).
|
|
71
|
+
* @param providerName The provider name for the LLM.
|
|
72
|
+
* @param modelName The model name for the LLM.
|
|
73
|
+
* @param options Optional evaluation options (temperature, providerOptions, etc).
|
|
74
|
+
* @returns The evaluation result with normalized score (reason, score).
|
|
75
|
+
*/
|
|
76
|
+
export declare const gEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
|
package/dst/index.js
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
36
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
37
|
+
};
|
|
38
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
39
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
40
|
+
};
|
|
41
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
+
exports.gEval = exports.llmRubric = exports.GevalEvaluateResultSchema = exports.GevalStepsResultSchema = exports.RubricResultSchema = exports.default = void 0;
|
|
43
|
+
const ai_1 = require("ai");
|
|
44
|
+
const Mustache = __importStar(require("mustache"));
|
|
45
|
+
const zod_1 = __importDefault(require("zod"));
|
|
46
|
+
const prompt_1 = require("./prompt");
|
|
47
|
+
const registry_1 = require("./registry");
|
|
48
|
+
const config_1 = __importDefault(require("./config"));
|
|
49
|
+
__exportStar(require("./config"), exports);
|
|
50
|
+
var config_2 = require("./config");
|
|
51
|
+
Object.defineProperty(exports, "default", { enumerable: true, get: function () { return __importDefault(config_2).default; } });
|
|
52
|
+
/**
|
|
53
|
+
* Zod schema for rubric result.
|
|
54
|
+
* Describes the structure of the result returned by rubric-based evaluation.
|
|
55
|
+
*/
|
|
56
|
+
exports.RubricResultSchema = zod_1.default.object({
|
|
57
|
+
/** Detailed explanation of the score based on the rubric. */
|
|
58
|
+
reason: zod_1.default.string().describe('Detailed explanation of the score based on the rubric'),
|
|
59
|
+
/** Whether the output satisfies the minimum requirements. */
|
|
60
|
+
pass: zod_1.default.boolean().describe('Whether the output satisfies the minimum requirements'),
|
|
61
|
+
/** Numeric representation of quality (0-1). */
|
|
62
|
+
score: zod_1.default.number().min(0).max(1).describe('Numeric representation of quality'),
|
|
63
|
+
});
|
|
64
|
+
/**
|
|
65
|
+
* Zod schema for evaluation steps result.
|
|
66
|
+
* Describes the structure of the result containing evaluation steps derived from criteria.
|
|
67
|
+
*/
|
|
68
|
+
exports.GevalStepsResultSchema = zod_1.default.object({
|
|
69
|
+
/** List of concise evaluation steps derived from the criteria. */
|
|
70
|
+
steps: zod_1.default.array(zod_1.default.string()).describe('List of concise evaluation steps derived from the criteria'),
|
|
71
|
+
});
|
|
72
|
+
/**
|
|
73
|
+
* Zod schema for evaluation result.
|
|
74
|
+
* Describes the structure of the result returned by the main evaluation function.
|
|
75
|
+
*/
|
|
76
|
+
exports.GevalEvaluateResultSchema = zod_1.default.object({
|
|
77
|
+
/** Detailed explanation of the score based on the rubric. */
|
|
78
|
+
reason: zod_1.default.string().describe('Detailed explanation of the score based on the rubric'),
|
|
79
|
+
/** Numeric representation of quality: the raw score (0 to gevalMaxScore) returned by the LLM, which gEval divides by gevalMaxScore before returning. */
|
|
80
|
+
score: zod_1.default.number().min(0).describe('Numeric representation of quality'),
|
|
81
|
+
});
|
|
82
|
+
/**
 * Evaluate output against a rubric using an LLM.
 * Renders the output and rubric into the user prompt, sends it together with
 * the rubric system prompt, and asks for a structured result matching
 * RubricResultSchema.
 *
 * The values are inserted into the prompt verbatim: Mustache's default HTML
 * escaping is turned off, so characters such as `<`, `&`, `"` or `'` reach the
 * judge unchanged instead of as HTML entities.
 *
 * On success the `onSuccess` hook is called; on any failure (including an
 * exception thrown by `onSuccess` itself, which runs inside the same `try`)
 * the `onError` hook is called and the error is rethrown.
 *
 * @param output The output to grade.
 * @param rubric The rubric to use for grading.
 * @param providerName The provider name for the LLM.
 * @param modelName The model name for the LLM.
 * @param options Optional evaluation options (temperature, providerOptions, etc).
 *   They are spread last into the generateText call.
 * @returns The rubric result (reason, pass, score).
 * @throws Rethrows whatever error was caught, after reporting it to `onError`.
 */
const llmRubric = async (output, rubric, providerName, modelName, options = {}) => {
    const start = Date.now();
    try {
        // `{{name}}` tags are HTML-escaped by Mustache by default. The prompt is
        // plain text for an LLM, so render values as-is via an identity escape.
        const userPrompt = Mustache.render(prompt_1.LLM_RUBRIC_USER_PROMPT, { output, rubric }, undefined, { escape: (value) => String(value) });
        const { output: result } = await (0, ai_1.generateText)({
            model: (0, registry_1.getModel)(providerName, modelName),
            system: prompt_1.LLM_RUBRIC_SYSTEM_PROMPT,
            prompt: userPrompt,
            output: ai_1.Output.object({
                schema: exports.RubricResultSchema,
            }),
            ...options,
        });
        config_1.default.hooks.onSuccess?.({
            method: 'llmRubric',
            params: { output, rubric, providerName, modelName, options },
            result,
            duration: Date.now() - start,
        });
        return result;
    }
    catch (error) {
        config_1.default.hooks.onError?.({
            method: 'llmRubric',
            error,
            duration: Date.now() - start,
        });
        throw error;
    }
};
exports.llmRubric = llmRubric;
|
|
123
|
+
/**
 * Evaluate a reply against criteria and steps using an LLM.
 * If no steps are returned for the criteria by the steps lookup, they are
 * generated with a first LLM call and handed to the steps cache; the answer is
 * then graded with a second LLM call. The raw score is divided by
 * `Config.gevalMaxScore` before being returned.
 *
 * All values are inserted into the prompts verbatim: Mustache's default HTML
 * escaping is turned off, so characters such as `<`, `&`, `"` or `'` reach the
 * judge unchanged instead of as HTML entities.
 *
 * On success the `onSuccess` hook is called; on any failure (including an
 * exception thrown by `onSuccess` itself, which runs inside the same `try`)
 * the `onError` hook is called and the error is rethrown.
 *
 * @param prompt The prompt given to the model.
 * @param answer The reply to evaluate.
 * @param criteria The evaluation criteria (used to derive steps).
 * @param providerName The provider name for the LLM.
 * @param modelName The model name for the LLM.
 * @param options Optional evaluation options (temperature, providerOptions, etc).
 *   They are spread last into both generateText calls.
 * @returns The evaluation result with normalized score (reason, score).
 * @throws Rethrows whatever error was caught, after reporting it to `onError`.
 */
const gEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => {
    const start = Date.now();
    // `{{name}}` tags are HTML-escaped by Mustache by default. The prompts are
    // plain text for an LLM, so render values as-is via an identity escape.
    const renderConfig = { escape: (value) => String(value) };
    try {
        const model = (0, registry_1.getModel)(providerName, modelName);
        let steps = await (0, registry_1.getSteps)(criteria);
        if (!steps) {
            const stepsPrompt = Mustache.render(prompt_1.GEVAL_STEPS_PROMPT, { criteria }, undefined, renderConfig);
            const { output: stepsResult } = await (0, ai_1.generateText)({
                model,
                prompt: stepsPrompt,
                output: ai_1.Output.object({
                    schema: exports.GevalStepsResultSchema,
                }),
                ...options,
            });
            steps = stepsResult.steps;
            // Cache asynchronously, without awaiting. The write is best-effort:
            // a rejected cache write must not become an unhandled rejection or
            // fail an evaluation, so it is deliberately ignored here.
            Promise.resolve((0, registry_1.setSteps)(criteria, steps)).catch(() => { });
        }
        const evaluationPrompt = Mustache.render(prompt_1.GEVAL_EVALUATE_PROMPT, {
            criteria,
            steps: steps.join('\n- '),
            input: prompt,
            output: answer,
            maxScore: config_1.default.gevalMaxScore,
        }, undefined, renderConfig);
        const { output: evalResult } = await (0, ai_1.generateText)({
            model,
            prompt: evaluationPrompt,
            output: ai_1.Output.object({
                schema: exports.GevalEvaluateResultSchema,
            }),
            ...options,
        });
        // Normalize the raw score by the configured scale.
        const result = {
            reason: evalResult.reason,
            score: evalResult.score / config_1.default.gevalMaxScore,
        };
        config_1.default.hooks.onSuccess?.({
            method: 'gEval',
            params: { prompt, answer, criteria, providerName, modelName, options },
            result,
            duration: Date.now() - start,
        });
        return result;
    }
    catch (error) {
        config_1.default.hooks.onError?.({
            method: 'gEval',
            error,
            duration: Date.now() - start,
        });
        throw error;
    }
};
exports.gEval = gEval;
|
|
189
|
+
//# sourceMappingURL=index.js.map
|
package/dst/index.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAmBhB;;;GAGG;AACU,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IACzC,6DAA6D;IAC7D,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IACpF,6DAA6D;IAC7D,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IACnF,+CAA+C;IAC/C,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAQH;;;GAGG;AACU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAC7C,kEAAkE;IAClE,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAQH;;;GAGG;AACU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAChD,6DAA6D;IAC7D,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IACpF,iEAAiE;IACjE,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAOH;;;;;;;;;GASG;AACI,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,iCAAwB;YAChC,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAv
CY,QAAA,SAAS,aAuCrB;AAED;;;;;;;;;;GAUG;AACI,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,+CAA+C;QACxF,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,8BAAqB,EAAE;YAC9D,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,MAAM;YACb,MAAM,EAAE,MAAM;YACd,QAAQ,EAAE,gBAAI,CAAC,aAAa;SAC7B,CAAC,CAAC;QAEH,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,gBAAI,CAAC,aAAa;SAC7C,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,OAAO;YACf,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACtE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,OAAO;YACf,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvEY,QAAA,KAAK,SAuEjB"}
|
package/dst/prompt.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for LLM rubric-based grading. Guides the LLM to grade output according to a rubric and respond with a JSON object.
|
|
3
|
+
*/
|
|
4
|
+
export declare const LLM_RUBRIC_SYSTEM_PROMPT = "You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}\n\nExamples:\n\n<Output>Hello world</Output>\n<Rubric>Content contains a greeting</Rubric>\n{\"reason\": \"the content contains the word 'Hello'\", \"pass\": true, \"score\": 1.0}\n\n<Output>Avast ye swabs, repel the invaders!</Output>\n<Rubric>Does not speak like a pirate</Rubric>\n{\"reason\": \"'avast ye' is a common pirate term\", \"pass\": false, \"score\": 0.0}\n";
|
|
5
|
+
/**
|
|
6
|
+
* User prompt template for rubric-based grading. Used to inject output and rubric into the prompt.
|
|
7
|
+
*/
|
|
8
|
+
export declare const LLM_RUBRIC_USER_PROMPT = "<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>";
|
|
9
|
+
/**
|
|
10
|
+
* System prompt for generating evaluation steps from criteria. Guides the LLM to output a minified JSON object whose single "steps" key holds the list of steps.
|
|
11
|
+
*/
|
|
12
|
+
export declare const GEVAL_STEPS_PROMPT = "\nGiven an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.\n\n**EVALUATION CRITERIA**\n{{criteria}}\n\n**OUTPUT FORMAT**\nIMPORTANT:\n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain a single key, \"steps\", whose value is a list of strings.\n- Each string must represent one evaluation step.\n- Do NOT include any explanations, commentary, extra text, or additional formatting.\n\nFormat:\n{\"steps\": <list_of_strings>}\n\nExample:\n{\"steps\":[\"<Evaluation Step 1>\",\"<Evaluation Step 2>\",\"<Evaluation Step 3>\",\"<Evaluation Step 4>\"]}\n\nHere are the 3-4 concise evaluation steps, formatted as required in a minified JSON:\nJSON:\n";
|
|
13
|
+
/**
|
|
14
|
+
* System prompt for evaluating a reply against criteria and steps. Guides the LLM to return a JSON with score and reason.
|
|
15
|
+
*/
|
|
16
|
+
export declare const GEVAL_EVALUATE_PROMPT = "\nYou will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n{{criteria}}\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!\n\n**Prompt**\n{{input}}\n\n**Reply**\n{{output}}\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
|
package/dst/prompt.js
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.GEVAL_EVALUATE_PROMPT = exports.GEVAL_STEPS_PROMPT = exports.LLM_RUBRIC_USER_PROMPT = exports.LLM_RUBRIC_SYSTEM_PROMPT = void 0;
|
|
4
|
+
/**
|
|
5
|
+
* System prompt for LLM rubric-based grading. Guides the LLM to grade output according to a rubric and respond with a JSON object.
|
|
6
|
+
*/
|
|
7
|
+
exports.LLM_RUBRIC_SYSTEM_PROMPT = `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
|
|
8
|
+
|
|
9
|
+
Examples:
|
|
10
|
+
|
|
11
|
+
<Output>Hello world</Output>
|
|
12
|
+
<Rubric>Content contains a greeting</Rubric>
|
|
13
|
+
{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
|
|
14
|
+
|
|
15
|
+
<Output>Avast ye swabs, repel the invaders!</Output>
|
|
16
|
+
<Rubric>Does not speak like a pirate</Rubric>
|
|
17
|
+
{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
|
|
18
|
+
`;
|
|
19
|
+
/**
|
|
20
|
+
* User prompt template for rubric-based grading. Used to inject output and rubric into the prompt.
|
|
21
|
+
*/
|
|
22
|
+
exports.LLM_RUBRIC_USER_PROMPT = '<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>';
|
|
23
|
+
/**
|
|
24
|
+
* System prompt for generating evaluation steps from criteria. Guides the LLM to output a minified JSON object whose single "steps" key holds the list of steps.
|
|
25
|
+
*/
|
|
26
|
+
exports.GEVAL_STEPS_PROMPT = `
|
|
27
|
+
Given an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.
|
|
28
|
+
|
|
29
|
+
**EVALUATION CRITERIA**
|
|
30
|
+
{{criteria}}
|
|
31
|
+
|
|
32
|
+
**OUTPUT FORMAT**
|
|
33
|
+
IMPORTANT:
|
|
34
|
+
- Return output ONLY as a minified JSON object (no code fences).
|
|
35
|
+
- The JSON object must contain a single key, "steps", whose value is a list of strings.
|
|
36
|
+
- Each string must represent one evaluation step.
|
|
37
|
+
- Do NOT include any explanations, commentary, extra text, or additional formatting.
|
|
38
|
+
|
|
39
|
+
Format:
|
|
40
|
+
{"steps": <list_of_strings>}
|
|
41
|
+
|
|
42
|
+
Example:
|
|
43
|
+
{"steps":["<Evaluation Step 1>","<Evaluation Step 2>","<Evaluation Step 3>","<Evaluation Step 4>"]}
|
|
44
|
+
|
|
45
|
+
Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
|
|
46
|
+
JSON:
|
|
47
|
+
`;
|
|
48
|
+
/**
|
|
49
|
+
* System prompt for evaluating a reply against criteria and steps. Guides the LLM to return a JSON with score and reason.
|
|
50
|
+
*/
|
|
51
|
+
exports.GEVAL_EVALUATE_PROMPT = `
|
|
52
|
+
You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
|
|
53
|
+
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
|
|
54
|
+
|
|
55
|
+
**Evaluation Criteria**
|
|
56
|
+
{{criteria}}
|
|
57
|
+
|
|
58
|
+
**Evaluation Steps**
|
|
59
|
+
- {{steps}}
|
|
60
|
+
Given the evaluation steps, return a JSON with two keys:
|
|
61
|
+
1) a "score" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;
|
|
62
|
+
2) a "reason" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!
|
|
63
|
+
|
|
64
|
+
**Prompt**
|
|
65
|
+
{{input}}
|
|
66
|
+
|
|
67
|
+
**Reply**
|
|
68
|
+
{{output}}
|
|
69
|
+
|
|
70
|
+
**OUTPUT FORMAT**
|
|
71
|
+
IMPORTANT:
|
|
72
|
+
- Return output ONLY as a minified JSON object (no code fences).
|
|
73
|
+
- The JSON object must contain exactly two keys: "score" and "reason".
|
|
74
|
+
- No additional words, explanations, or formatting are needed.
|
|
75
|
+
- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.
|
|
76
|
+
|
|
77
|
+
Example JSON:
|
|
78
|
+
{"score":0,"reason":"The text of reply does not follow the evaluation criteria provided."}
|
|
79
|
+
|
|
80
|
+
Here is the final evaluation in the required minified JSON format:
|
|
81
|
+
JSON:
|
|
82
|
+
`;
|
|
83
|
+
//# sourceMappingURL=prompt.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";;;AAAA;;GAEG;AACU,QAAA,wBAAwB,GAAG;;;;;;;;;;;CAWvC,CAAC;AAEF;;GAEG;AACU,QAAA,sBAAsB,GAAG,kEAAkE,CAAC;AAEzG;;GAEG;AACU,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;;;;;;CAqBjC,CAAC;AAEF;;GAEG;AACU,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+BpC,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { type LanguageModel } from 'ai';
|
|
2
|
+
/**
|
|
3
|
+
* Get a language model instance from the provider and model name, using cache if enabled.
|
|
4
|
+
* @param providerName The provider name (e.g., 'openai').
|
|
5
|
+
* @param modelName The model name.
|
|
6
|
+
* @returns The language model instance.
|
|
7
|
+
*/
|
|
8
|
+
export declare const getModel: (providerName: string, modelName: string) => LanguageModel;
|
|
9
|
+
/**
|
|
10
|
+
* Get cached evaluation steps for a criteria, if caching is enabled.
|
|
11
|
+
* @param criteria The evaluation criteria string.
|
|
12
|
+
* @returns Promise resolving to the cached steps or undefined.
|
|
13
|
+
*/
|
|
14
|
+
export declare const getSteps: (criteria: string) => Promise<string[] | undefined>;
|
|
15
|
+
/**
|
|
16
|
+
* Set evaluation steps for a criteria in the cache, if caching is enabled.
|
|
17
|
+
* @param criteria The evaluation criteria string.
|
|
18
|
+
* @param steps The steps to cache.
|
|
19
|
+
* @returns Promise that resolves when the steps are set.
|
|
20
|
+
*/
|
|
21
|
+
export declare const setSteps: (criteria: string, steps: string[]) => Promise<void>;
|
package/dst/registry.js
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
36
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
37
|
+
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.setSteps = exports.getSteps = exports.getModel = void 0;
|
|
40
|
+
const crypto = __importStar(require("node:crypto"));
|
|
41
|
+
const openai_1 = require("@ai-sdk/openai");
|
|
42
|
+
const anthropic_1 = require("@ai-sdk/anthropic");
|
|
43
|
+
const google_1 = require("@ai-sdk/google");
|
|
44
|
+
const mistral_1 = require("@ai-sdk/mistral");
|
|
45
|
+
const amazon_bedrock_1 = require("@ai-sdk/amazon-bedrock");
|
|
46
|
+
const azure_1 = require("@ai-sdk/azure");
|
|
47
|
+
const deepseek_1 = require("@ai-sdk/deepseek");
|
|
48
|
+
const groq_1 = require("@ai-sdk/groq");
|
|
49
|
+
const perplexity_1 = require("@ai-sdk/perplexity");
|
|
50
|
+
const xai_1 = require("@ai-sdk/xai");
|
|
51
|
+
const config_1 = __importDefault(require("./config"));
|
|
52
|
+
/**
 * Lookup table from provider name to the provider function that is called with
 * a model name to build a model instance.
 *
 * The table has a null prototype so that only the keys listed here resolve
 * (names such as "constructor" or "toString" are not inherited), and it is
 * frozen because nothing in this module adds or replaces providers at runtime.
 */
const PROVIDERS = Object.freeze(Object.assign(Object.create(null), {
    openai: openai_1.openai,
    anthropic: anthropic_1.anthropic,
    google: google_1.google,
    mistral: mistral_1.mistral,
    bedrock: amazon_bedrock_1.bedrock,
    azure: azure_1.azure,
    deepseek: deepseek_1.deepseek,
    groq: groq_1.groq,
    perplexity: perplexity_1.perplexity,
    xai: xai_1.xai,
}));
|
|
67
|
+
/**
 * Resolve a language model instance for a provider/model pair.
 *
 * When `config.isModelCached` is truthy, instances are read from and written to
 * `config.modelCache` under the key `"<providerName>:<modelName>"`.
 *
 * @param providerName The provider name (e.g., 'openai').
 * @param modelName The model name.
 * @returns The language model instance.
 * @throws {Error} If `providerName` is not one of the registered provider names.
 */
const getModel = (providerName, modelName) => {
    const cacheKey = `${providerName}:${modelName}`;
    const cached = config_1.default.isModelCached ? config_1.default.modelCache.get(cacheKey) : undefined;
    if (cached) {
        return cached;
    }
    // Own-property check: a bare PROVIDERS[providerName] also resolves inherited
    // members such as "constructor" or "toString", which would then be invoked
    // as if they were providers instead of producing the "Unknown provider" error.
    const provider = Object.prototype.hasOwnProperty.call(PROVIDERS, providerName)
        ? PROVIDERS[providerName]
        : undefined;
    if (!provider) {
        throw new Error(`Unknown provider: "${providerName}". Available providers: ${Object.keys(PROVIDERS).join(', ')}`);
    }
    const model = provider(modelName);
    if (config_1.default.isModelCached) {
        config_1.default.modelCache.set(cacheKey, model);
    }
    return model;
};
|
|
88
|
+
exports.getModel = getModel;
|
|
89
|
+
/**
 * Hash a string with MD5 and return the digest in hexadecimal form.
 * @param str The string to hash.
 * @returns The hex-encoded MD5 digest of `str`.
 */
const md5 = (str) => {
    const hasher = crypto.createHash('md5');
    hasher.update(str);
    return hasher.digest('hex');
};
|
|
97
|
+
/**
 * Look up previously stored evaluation steps for a criteria string.
 *
 * The cache is keyed by the MD5 hex digest of `criteria`. When
 * `config.isStepsCached` is falsy the cache is not consulted at all.
 *
 * @param criteria The evaluation criteria string.
 * @returns Whatever `config.stepsCache.get` returns for the hashed key, or a
 *          promise resolved with `undefined` when step caching is disabled.
 */
const getSteps = (criteria) => {
    if (!config_1.default.isStepsCached) {
        return Promise.resolve(undefined);
    }
    return config_1.default.stepsCache.get(md5(criteria));
};
|
|
105
|
+
exports.getSteps = getSteps;
|
|
106
|
+
/**
 * Store evaluation steps for a criteria string.
 *
 * The cache is keyed by the MD5 hex digest of `criteria`. When
 * `config.isStepsCached` is falsy nothing is written.
 *
 * @param criteria The evaluation criteria string.
 * @param steps The steps to cache.
 * @returns Whatever `config.stepsCache.set` returns, or an already-resolved
 *          promise when step caching is disabled.
 */
const setSteps = (criteria, steps) => {
    return config_1.default.isStepsCached
        ? config_1.default.stepsCache.set(md5(criteria), steps)
        : Promise.resolve();
};
|
|
118
|
+
exports.setSteps = setSteps;
|
|
119
|
+
//# sourceMappingURL=registry.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"registry.js","sourceRoot":"","sources":["../src/registry.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,oDAAsC;AAEtC,2CAAwC;AACxC,iDAA8C;AAC9C,2CAAwC;AACxC,6CAA0C;AAC1C,2DAAiD;AACjD,yCAAsC;AACtC,+CAA4C;AAC5C,uCAAoC;AACpC,mDAAgD;AAChD,qCAAkC;AAElC,sDAA4B;AAG5B;;GAEG;AACH,MAAM,SAAS,GAA6B;IAC1C,MAAM,EAAN,eAAM;IACN,SAAS,EAAT,qBAAS;IACT,MAAM,EAAN,eAAM;IACN,OAAO,EAAP,iBAAO;IACP,OAAO,EAAP,wBAAO;IACP,KAAK,EAAL,aAAK;IACL,QAAQ,EAAR,mBAAQ;IACR,IAAI,EAAJ,WAAI;IACJ,UAAU,EAAV,uBAAU;IACV,GAAG,EAAH,SAAG;CACJ,CAAC;AAEF;;;;;GAKG;AACI,MAAM,QAAQ,GAAG,CAAC,YAAoB,EAAE,SAAiB,EAAiB,EAAE;IACjF,MAAM,QAAQ,GAAG,GAAG,YAAY,IAAI,SAAS,EAAE,CAAC;IAEhD,IAAI,KAAK,GAAG,gBAAI,CAAC,aAAa,CAAC,CAAC,CAAC,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAE3E,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,QAAQ,GAAG,SAAS,CAAC,YAAY,CAAC,CAAC;QAEzC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,IAAI,KAAK,CAAC,sBAAsB,YAAY,2BAA2B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACpH,CAAC;QAED,KAAK,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC;QAE5B,IAAI,gBAAI,CAAC,aAAa,EAAE,CAAC;YACvB,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAED,OAAO,KAAM,CAAC;AAChB,CAAC,CAAA;AApBY,QAAA,QAAQ,YAoBpB;AAED;;;;GAIG;AACH,MAAM,GAAG,GAAG,CAAC,GAAW,EAAU,EAAE;IAClC,OAAO,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC5D,CAAC,CAAA;AAED;;;;GAIG;AACI,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAiC,EAAE;IAC1E,OAAO,gBAAI,CAAC,aAAa,CAAC,CAAC,CAAC,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;AAC9F,CAAC,CAAA;AAFY,QAAA,QAAQ,YAEpB;AAED;;;;;GAKG;AACI,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAE,KAAe,EAAiB,EAAE;IAC3E,IAAI,gBAAI,CAAC,aAAa,EAAE,CAAC;QACvB,OAAO,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,CAAC,CAAC;IACnD,CAAC;IAED,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC;AAC3B,CAAC,CAAA;AANY,QAAA,QAAQ,YAMpB"}
|
package/package.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@eva-llm/eva-judge",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "LLM-as-a-judge abstraction layer using ai-sdk and plugins",
|
|
5
|
+
"main": "dst/index.js",
|
|
6
|
+
"types": "dst/index.d.ts",
|
|
7
|
+
"engines": {
|
|
8
|
+
"node": ">=22"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"/dst"
|
|
12
|
+
],
|
|
13
|
+
"repository": {
|
|
14
|
+
"type": "git",
|
|
15
|
+
"url": "git+https://github.com/eva-llm/eva-judge.git"
|
|
16
|
+
},
|
|
17
|
+
"author": "Sergei Chipiga",
|
|
18
|
+
"license": "MIT",
|
|
19
|
+
"bugs": {
|
|
20
|
+
"url": "https://github.com/eva-llm/eva-judge/issues"
|
|
21
|
+
},
|
|
22
|
+
"homepage": "https://github.com/eva-llm/eva-judge#readme",
|
|
23
|
+
"devDependencies": {
|
|
24
|
+
"@types/jest": "^30.0.0",
|
|
25
|
+
"@types/mustache": "^4.2.6",
|
|
26
|
+
"@types/node": "^25.5.0",
|
|
27
|
+
"jest": "^30.3.0",
|
|
28
|
+
"ts-jest": "^29.4.6",
|
|
29
|
+
"typescript": "^5.9.3"
|
|
30
|
+
},
|
|
31
|
+
"dependencies": {
|
|
32
|
+
"@ai-sdk/amazon-bedrock": "^4.0.82",
|
|
33
|
+
"@ai-sdk/anthropic": "^3.0.58",
|
|
34
|
+
"@ai-sdk/azure": "^3.0.48",
|
|
35
|
+
"@ai-sdk/deepseek": "^2.0.26",
|
|
36
|
+
"@ai-sdk/google": "^3.0.43",
|
|
37
|
+
"@ai-sdk/groq": "^3.0.31",
|
|
38
|
+
"@ai-sdk/mistral": "^3.0.24",
|
|
39
|
+
"@ai-sdk/openai": "^3.0.41",
|
|
40
|
+
"@ai-sdk/perplexity": "^3.0.25",
|
|
41
|
+
"@ai-sdk/xai": "^3.0.72",
|
|
42
|
+
"ai": "^6.0.116",
|
|
43
|
+
"lru-cache": "^11.2.7",
|
|
44
|
+
"mustache": "^4.2.0",
|
|
45
|
+
"zod": "^4.3.6"
|
|
46
|
+
},
|
|
47
|
+
"scripts": {
|
|
48
|
+
"build": "tsc",
|
|
49
|
+
"test": "jest"
|
|
50
|
+
}
|
|
51
|
+
}
|