@eva-llm/eva-judge 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE CHANGED
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2026 [Your Name or Organization]
3
+ Copyright (c) 2026 EVA-LLM
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
package/README.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # Project Inspiration & Attribution
2
2
 
3
- This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including author's work on the G-Eval framework there. The LLM-as-a-judge prompts are copied from promptfoo and adapted for project-specific issues.
3
+ This project is inspired by [promptfoo](https://github.com/promptfoo/promptfoo), including the author's work on the G-Eval framework there. The LLM-as-a-Judge prompts are copied from promptfoo and adapted for project-specific issues.
4
4
 
5
5
  # eva-judge
6
6
 
7
- A TypeScript/Node.js package for evaluating and managing test cases, prompts, and registry logic for AI or code evaluation workflows.
7
+ A TypeScript/Node.js package for evaluating and managing test cases, prompts, and registry logic for AI or code evaluation workflows using LLM-Rubric or G-Eval.
8
8
 
9
9
  ## Features
10
10
  - Configuration management for evaluation workflows
@@ -12,26 +12,14 @@ A TypeScript/Node.js package for evaluating and managing test cases, prompts, an
12
12
  - Registry for test cases and evaluation items
13
13
  - Designed for integration with Jest and other test runners
14
14
 
15
- ## Project Structure
16
- - `src/` — Main source code
17
- - `config.ts` — Configuration logic
18
- - `prompt.ts` — Prompt utilities
19
- - `registry.ts` — Registry management
20
- - `index.ts` — Entry point
21
- - `tests/` — Unit tests for all modules
22
-
23
15
  ## Getting Started
24
16
 
25
- ### Prerequisites
26
- - Node.js (>= 16)
27
- - pnpm (recommended) or npm/yarn
28
-
29
17
  ### Installation
30
18
 
31
- Clone the repository and install dependencies:
32
-
33
19
  ```bash
34
- pnpm install
20
+ npm install @eva-llm/eva-judge
21
+ # or
22
+ pnpm add @eva-llm/eva-judge
35
23
  ```
36
24
 
37
25
  ### Running Tests
@@ -45,7 +33,7 @@ pnpm test
45
33
  Import and use the modules in your TypeScript/Node.js project:
46
34
 
47
35
  ```typescript
48
- import { llmRubric, gEval } from 'eva-judge';
36
+ import { llmRubric, gEval, bEval } from '@eva-llm/eva-judge';
49
37
  ```
50
38
 
51
39
  ### llmRubric
@@ -63,9 +51,10 @@ const result = await llmRubric(
63
51
  // result: { reason: string, pass: boolean, score: number }
64
52
  ```
65
53
 
54
+
66
55
  ### gEval
67
56
 
68
- Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score.
57
+ Evaluates a reply against criteria and derived steps using an LLM. Returns a reason and normalized score (0.0–1.0).
69
58
 
70
59
  ```typescript
71
60
  const result = await gEval(
@@ -79,6 +68,22 @@ const result = await gEval(
79
68
  // result: { reason: string, score: number }
80
69
  ```
81
70
 
71
+ ### bEval (Binary G-Eval)
72
+
73
+ Evaluates a reply against criteria and derived steps using an LLM, like `gEval`, but with binary scoring. Returns a reason and a score that is either 0 or 1.
74
+
75
+ ```typescript
76
+ const result = await bEval(
77
+ prompt, // string: the prompt given to the model
78
+ answer, // string: the reply to evaluate
79
+ criteria, // string: evaluation criteria
80
+ provider, // string: LLM provider name
81
+ model, // string: LLM model name
82
+ options // optional: { temperature, providerOptions }
83
+ );
84
+ // result: { reason: string, score: number } // score will be 0 or 1
85
+ ```
86
+
82
87
  ## Development
83
88
  - Source code is in `src/`
84
89
  - Tests are in `tests/`
@@ -112,7 +117,7 @@ Specify the provider name and model name in `llmRubric` or `gEval`.
112
117
  You can provide hooks to receive notifications about evaluation events (success or error) for logging, monitoring, or custom handling. Hooks can also be used to integrate with observability tools such as OpenTelemetry for tracing and metrics. Set these in the config:
113
118
 
114
119
  ```typescript
115
- import Config from 'eva-judge';
120
+ import Config from '@eva-llm/eva-judge';
116
121
 
117
122
  Config.hooks = {
118
123
  onSuccess: ({ method, params, result, duration }) => {
@@ -127,7 +132,7 @@ Config.hooks = {
127
132
  For advanced use, you can implement your own cache storage for evaluation steps (e.g., using Redis or another backend) by providing a custom cache via `setStepsCache()`:
128
133
 
129
134
  ```typescript
130
- import Config from 'eva-judge';
135
+ import Config from '@eva-llm/eva-judge';
131
136
 
132
137
  Config.setStepsCache(RedisCache); // RedisCache must implement IStepsCache
133
138
  ```
package/dst/config.d.ts CHANGED
@@ -1,115 +1,37 @@
1
1
  import { LRUCache } from 'lru-cache';
2
2
  import { type LanguageModel } from 'ai';
3
- /**
4
- * Interface for a cache that stores evaluation steps.
5
- * Implementations should provide asynchronous set/get methods for storing and retrieving
6
- * arrays of strings, typically representing evaluation steps for a given key.
7
- */
3
+ import { type EvalMethod } from './types';
8
4
  export interface IStepsCache {
9
- /**
10
- * Store an array of steps in the cache for a given key.
11
- * @param key Unique identifier for the steps (e.g., criteria string).
12
- * @param value Array of step strings to cache.
13
- * @returns Promise that resolves when the value is set.
14
- */
15
5
  set(key: string, value: string[]): Promise<void>;
16
- /**
17
- * Retrieve an array of steps from the cache for a given key.
18
- * @param key Unique identifier for the steps (e.g., criteria string).
19
- * @returns Promise resolving to the cached array of steps, or undefined if not found.
20
- */
21
6
  get(key: string): Promise<string[] | undefined>;
22
7
  }
23
- /**
24
- * Optional hooks for receiving notifications about evaluation events.
25
- * Can be used to monitor or log success and error events for evaluation functions.
26
- */
27
8
  export interface EvaHooks {
28
- /**
29
- * Called when an evaluation completes successfully.
30
- * @param data Information about the evaluation, including method, params, result, and duration (ms).
31
- */
32
9
  onSuccess?: (data: {
33
- method: 'gEval' | 'llmRubric';
10
+ method: EvalMethod;
34
11
  params: any;
35
12
  result: any;
36
13
  duration: number;
37
14
  }) => void;
38
- /**
39
- * Called when an evaluation throws an error.
40
- * @param data Information about the error, including method, error object, and duration (ms).
41
- */
42
15
  onError?: (data: {
43
- method: 'gEval' | 'llmRubric';
16
+ method: EvalMethod;
44
17
  error: any;
45
18
  duration: number;
46
19
  }) => void;
47
20
  }
48
- /**
49
- * Global configuration and cache management for evaluation operations.
50
- * Provides options for enabling/disabling model and steps caching, and allows
51
- * customization of cache implementations and event hooks.
52
- */
53
21
  declare const _default: {
54
- /**
55
- * Maximum score for evaluation (used for normalization).
56
- */
57
22
  gevalMaxScore: number;
58
- /**
59
- * Whether model caching is enabled (for LLM instances).
60
- */
61
23
  isModelCached: boolean;
62
- /**
63
- * Whether steps caching is enabled (for evaluation steps).
64
- */
65
24
  isStepsCached: boolean;
66
- /**
67
- * LRU cache for language model instances.
68
- */
69
25
  modelCache: LRUCache<string, LanguageModel, unknown>;
70
- /**
71
- * Cache for evaluation steps (criteria → steps).
72
- */
73
26
  stepsCache: IStepsCache;
74
- /**
75
- * Restart the model cache with a new maximum size.
76
- * @param size The new cache size (default: 100).
77
- */
78
27
  restartModelCache(size?: number): void;
79
- /**
80
- * Restart the steps cache with a new maximum size.
81
- * @param size The new cache size (default: 500).
82
- */
83
28
  restartStepsCache(size?: number): void;
84
- /**
85
- * Set a custom steps cache implementation.
86
- * @param cache The new IStepsCache implementation to use.
87
- */
88
29
  setStepsCache(cache: IStepsCache): void;
89
- /**
90
- * Enable model caching (LLM instances).
91
- */
92
30
  enableModelCache(): void;
93
- /**
94
- * Disable model caching (LLM instances).
95
- */
96
31
  disableModelCache(): void;
97
- /**
98
- * Enable steps caching (criteria → steps).
99
- */
100
32
  enableStepsCache(): void;
101
- /**
102
- * Disable steps caching (criteria → steps).
103
- */
104
33
  disableStepsCache(): void;
105
- /**
106
- * Hooks for evaluation events (success/error notifications).
107
- */
108
34
  hooks: EvaHooks;
109
- /**
110
- * Set the hooks for evaluation events.
111
- * @param hooks The hooks object implementing EvaHooks.
112
- */
113
35
  setHooks(hooks: EvaHooks): void;
114
36
  };
115
37
  export default _default;
package/dst/config.js CHANGED
@@ -1,113 +1,46 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  const lru_cache_1 = require("lru-cache");
4
- /**
5
- * In-memory implementation of IStepsCache using an LRU (Least Recently Used) cache.
6
- * Useful for fast, ephemeral caching of evaluation steps during runtime.
7
- */
8
4
  class StepsMemoryAdapter {
9
5
  cache;
10
- /**
11
- * Construct a new StepsMemoryAdapter.
12
- * @param size Maximum number of items to store in the cache.
13
- */
14
6
  constructor(size) {
15
7
  this.cache = new lru_cache_1.LRUCache({ max: size });
16
8
  }
17
- /**
18
- * Store an array of steps in the cache for a given key.
19
- * @inheritdoc
20
- */
21
9
  async set(key, value) {
22
10
  this.cache.set(key, value);
23
11
  }
24
- /**
25
- * Retrieve an array of steps from the cache for a given key.
26
- * @inheritdoc
27
- */
28
12
  async get(key) {
29
13
  return this.cache.get(key);
30
14
  }
31
15
  }
32
- /**
33
- * Global configuration and cache management for evaluation operations.
34
- * Provides options for enabling/disabling model and steps caching, and allows
35
- * customization of cache implementations and event hooks.
36
- */
37
16
  exports.default = {
38
- /**
39
- * Maximum score for evaluation (used for normalization).
40
- */
41
17
  gevalMaxScore: 10,
42
- /**
43
- * Whether model caching is enabled (for LLM instances).
44
- */
45
18
  isModelCached: true,
46
- /**
47
- * Whether steps caching is enabled (for evaluation steps).
48
- */
49
19
  isStepsCached: true,
50
- /**
51
- * LRU cache for language model instances.
52
- */
53
20
  modelCache: new lru_cache_1.LRUCache({ max: 100 }),
54
- /**
55
- * Cache for evaluation steps (criteria → steps).
56
- */
57
21
  stepsCache: new StepsMemoryAdapter(500),
58
- /**
59
- * Restart the model cache with a new maximum size.
60
- * @param size The new cache size (default: 100).
61
- */
62
22
  restartModelCache(size = 100) {
63
23
  this.modelCache = new lru_cache_1.LRUCache({ max: size });
64
24
  },
65
- /**
66
- * Restart the steps cache with a new maximum size.
67
- * @param size The new cache size (default: 500).
68
- */
69
25
  restartStepsCache(size = 500) {
70
26
  this.stepsCache = new StepsMemoryAdapter(size);
71
27
  },
72
- /**
73
- * Set a custom steps cache implementation.
74
- * @param cache The new IStepsCache implementation to use.
75
- */
76
28
  setStepsCache(cache) {
77
29
  this.stepsCache = cache;
78
30
  },
79
- /**
80
- * Enable model caching (LLM instances).
81
- */
82
31
  enableModelCache() {
83
32
  this.isModelCached = true;
84
33
  },
85
- /**
86
- * Disable model caching (LLM instances).
87
- */
88
34
  disableModelCache() {
89
35
  this.isModelCached = false;
90
36
  },
91
- /**
92
- * Enable steps caching (criteria → steps).
93
- */
94
37
  enableStepsCache() {
95
38
  this.isStepsCached = true;
96
39
  },
97
- /**
98
- * Disable steps caching (criteria → steps).
99
- */
100
40
  disableStepsCache() {
101
41
  this.isStepsCached = false;
102
42
  },
103
- /**
104
- * Hooks for evaluation events (success/error notifications).
105
- */
106
43
  hooks: {},
107
- /**
108
- * Set the hooks for evaluation events.
109
- * @param hooks The hooks object implementing EvaHooks.
110
- */
111
44
  setHooks(hooks) {
112
45
  this.hooks = hooks;
113
46
  }
package/dst/config.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;AA2BrC;;;GAGG;AACH,MAAM,kBAAkB;IACd,KAAK,CAA6B;IAE1C;;;OAGG;IACH,YAAY,IAAY;QACtB,IAAI,CAAC,KAAK,GAAG,IAAI,oBAAQ,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,GAAG,CAAC,GAAW,EAAE,KAAe;QACpC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAC7B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,GAAG,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;CACF;AA8BD;;;;GAIG;AACH,kBAAe;IACb;;OAEG;IACH,aAAa,EAAE,EAAE;IACjB;;OAEG;IACH,aAAa,EAAE,IAAI;IACnB;;OAEG;IACH,aAAa,EAAE,IAAI;IACnB;;OAEG;IACH,UAAU,EAAE,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;IAC7D;;OAEG;IACH,UAAU,EAAE,IAAI,kBAAkB,CAAC,GAAG,CAAgB;IAEtD;;;OAGG;IACH,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACvE,CAAC;IAED;;;OAGG;IACH,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,kBAAkB,CAAC,IAAI,CAAgB,CAAC;IAChE,CAAC;IAED;;;OAGG;IACH,aAAa,CAAC,KAAkB;QAC9B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAED;;OAEG;IACH,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAED;;OAEG;IACH,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,KAAK,EAAE,EAAc;IAErB;;;OAGG;IACH,QAAQ,CAAC,KAAe;QACtB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF,CAAC"}
1
+ {"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":";;AAAA,yCAAqC;AA8BrC,MAAM,kBAAkB;IACd,KAAK,CAA6B;IAM1C,YAAY,IAAY;QACtB,IAAI,CAAC,KAAK,GAAG,IAAI,oBAAQ,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW,EAAE,KAAe;QACpC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAC7B,CAAC;IAMD,KAAK,CAAC,GAAG,CAAC,GAAW;QACnB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAC7B,CAAC;CACF;AAiCD,kBAAe;IAIb,aAAa,EAAE,EAAE;IAIjB,aAAa,EAAE,IAAI;IAInB,aAAa,EAAE,IAAI;IAInB,UAAU,EAAE,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;IAI7D,UAAU,EAAE,IAAI,kBAAkB,CAAC,GAAG,CAAgB;IAKtD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,oBAAQ,CAAwB,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;IACvE,CAAC;IAKD,iBAAiB,CAAC,OAAe,GAAG;QAClC,IAAI,CAAC,UAAU,GAAG,IAAI,kBAAkB,CAAC,IAAI,CAAgB,CAAC;IAChE,CAAC;IAKD,aAAa,CAAC,KAAkB;QAC9B,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;IAC1B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,gBAAgB;QACd,IAAI,CAAC,aAAa,GAAG,IAAI,CAAC;IAC5B,CAAC;IAID,iBAAiB;QACf,IAAI,CAAC,aAAa,GAAG,KAAK,CAAC;IAC7B,CAAC;IAID,KAAK,EAAE,EAAc;IAKrB,QAAQ,CAAC,KAAe;QACtB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACrB,CAAC;CACF,CAAC"}
package/dst/index.d.ts CHANGED
@@ -1,76 +1,25 @@
1
1
  import z from 'zod';
2
2
  export * from './config';
3
3
  export { default } from './config';
4
- /**
5
- * Options for evaluation functions.
6
- * Allows customization of LLM generation parameters and provider-specific options.
7
- */
8
4
  export interface EvalOptions {
9
- /**
10
- * Temperature for model generation (controls randomness).
11
- */
12
5
  temperature?: number;
13
- /**
14
- * Additional provider-specific options (passed to the LLM provider).
15
- */
16
6
  providerOptions?: Record<string, any>;
17
7
  }
18
- /**
19
- * Zod schema for rubric result.
20
- * Describes the structure of the result returned by rubric-based evaluation.
21
- */
22
8
  export declare const RubricResultSchema: z.ZodObject<{
23
9
  reason: z.ZodString;
24
10
  pass: z.ZodBoolean;
25
11
  score: z.ZodNumber;
26
12
  }, z.core.$strip>;
27
- /**
28
- * Type for rubric result (inferred from RubricResultSchema).
29
- */
30
13
  export type RubricResult = z.infer<typeof RubricResultSchema>;
31
- /**
32
- * Zod schema for evaluation steps result.
33
- * Describes the structure of the result containing evaluation steps derived from criteria.
34
- */
35
14
  export declare const GevalStepsResultSchema: z.ZodObject<{
36
15
  steps: z.ZodArray<z.ZodString>;
37
16
  }, z.core.$strip>;
38
- /**
39
- * Type for evaluation steps result (inferred from GevalStepsResultSchema).
40
- */
41
17
  export type GevalStepsResult = z.infer<typeof GevalStepsResultSchema>;
42
- /**
43
- * Zod schema for evaluation result.
44
- * Describes the structure of the result returned by the main evaluation function.
45
- */
46
18
  export declare const GevalEvaluateResultSchema: z.ZodObject<{
47
19
  reason: z.ZodString;
48
20
  score: z.ZodNumber;
49
21
  }, z.core.$strip>;
50
- /**
51
- * Type for evaluation result (inferred from GevalEvaluateResultSchema).
52
- */
53
22
  export type GevalEvaluateResult = z.infer<typeof GevalEvaluateResultSchema>;
54
- /**
55
- * Evaluate output against a rubric using an LLM.
56
- * Uses a system and user prompt to instruct the LLM to grade the output according to the rubric.
57
- * @param output The output to grade.
58
- * @param rubric The rubric to use for grading.
59
- * @param providerName The provider name for the LLM.
60
- * @param modelName The model name for the LLM.
61
- * @param options Optional evaluation options (temperature, providerOptions, etc).
62
- * @returns The rubric result (reason, pass, score).
63
- */
64
23
  export declare const llmRubric: (output: string, rubric: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<RubricResult>;
65
- /**
66
- * Evaluate a reply against criteria and steps using an LLM.
67
- * If steps for the criteria are not cached, generates them first, then evaluates the answer.
68
- * @param prompt The prompt given to the model.
69
- * @param answer The reply to evaluate.
70
- * @param criteria The evaluation criteria (used to derive steps).
71
- * @param providerName The provider name for the LLM.
72
- * @param modelName The model name for the LLM.
73
- * @param options Optional evaluation options (temperature, providerOptions, etc).
74
- * @returns The evaluation result with normalized score (reason, score).
75
- */
76
24
  export declare const gEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
25
+ export declare const bEval: (prompt: string, answer: string, criteria: string, providerName: string, modelName: string, options?: EvalOptions) => Promise<GevalEvaluateResult>;
package/dst/index.js CHANGED
@@ -39,7 +39,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
39
39
  return (mod && mod.__esModule) ? mod : { "default": mod };
40
40
  };
41
41
  Object.defineProperty(exports, "__esModule", { value: true });
42
- exports.gEval = exports.llmRubric = exports.GevalEvaluateResultSchema = exports.GevalStepsResultSchema = exports.RubricResultSchema = exports.default = void 0;
42
+ exports.bEval = exports.gEval = exports.llmRubric = exports.GevalEvaluateResultSchema = exports.GevalStepsResultSchema = exports.RubricResultSchema = exports.default = void 0;
43
43
  const ai_1 = require("ai");
44
44
  const Mustache = __importStar(require("mustache"));
45
45
  const zod_1 = __importDefault(require("zod"));
@@ -49,46 +49,18 @@ const config_1 = __importDefault(require("./config"));
49
49
  __exportStar(require("./config"), exports);
50
50
  var config_2 = require("./config");
51
51
  Object.defineProperty(exports, "default", { enumerable: true, get: function () { return __importDefault(config_2).default; } });
52
- /**
53
- * Zod schema for rubric result.
54
- * Describes the structure of the result returned by rubric-based evaluation.
55
- */
56
52
  exports.RubricResultSchema = zod_1.default.object({
57
- /** Detailed explanation of the score based on the rubric. */
58
53
  reason: zod_1.default.string().describe('Detailed explanation of the score based on the rubric'),
59
- /** Whether the output satisfies the minimum requirements. */
60
54
  pass: zod_1.default.boolean().describe('Whether the output satisfies the minimum requirements'),
61
- /** Numeric representation of quality (0-1). */
62
55
  score: zod_1.default.number().min(0).max(1).describe('Numeric representation of quality'),
63
56
  });
64
- /**
65
- * Zod schema for evaluation steps result.
66
- * Describes the structure of the result containing evaluation steps derived from criteria.
67
- */
68
57
  exports.GevalStepsResultSchema = zod_1.default.object({
69
- /** List of concise evaluation steps derived from the criteria. */
70
58
  steps: zod_1.default.array(zod_1.default.string()).describe('List of concise evaluation steps derived from the criteria'),
71
59
  });
72
- /**
73
- * Zod schema for evaluation result.
74
- * Describes the structure of the result returned by the main evaluation function.
75
- */
76
60
  exports.GevalEvaluateResultSchema = zod_1.default.object({
77
- /** Detailed explanation of the score based on the rubric. */
78
61
  reason: zod_1.default.string().describe('Detailed explanation of the score based on the rubric'),
79
- /** Numeric representation of quality (normalized score, 0-1). */
80
62
  score: zod_1.default.number().min(0).describe('Numeric representation of quality'),
81
63
  });
82
- /**
83
- * Evaluate output against a rubric using an LLM.
84
- * Uses a system and user prompt to instruct the LLM to grade the output according to the rubric.
85
- * @param output The output to grade.
86
- * @param rubric The rubric to use for grading.
87
- * @param providerName The provider name for the LLM.
88
- * @param modelName The model name for the LLM.
89
- * @param options Optional evaluation options (temperature, providerOptions, etc).
90
- * @returns The rubric result (reason, pass, score).
91
- */
92
64
  const llmRubric = async (output, rubric, providerName, modelName, options = {}) => {
93
65
  const start = Date.now();
94
66
  try {
@@ -120,18 +92,7 @@ const llmRubric = async (output, rubric, providerName, modelName, options = {})
120
92
  }
121
93
  };
122
94
  exports.llmRubric = llmRubric;
123
- /**
124
- * Evaluate a reply against criteria and steps using an LLM.
125
- * If steps for the criteria are not cached, generates them first, then evaluates the answer.
126
- * @param prompt The prompt given to the model.
127
- * @param answer The reply to evaluate.
128
- * @param criteria The evaluation criteria (used to derive steps).
129
- * @param providerName The provider name for the LLM.
130
- * @param modelName The model name for the LLM.
131
- * @param options Optional evaluation options (temperature, providerOptions, etc).
132
- * @returns The evaluation result with normalized score (reason, score).
133
- */
134
- const gEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => {
95
+ const _gEval = async (prompt, answer, criteria, providerName, modelName, maxScore, methodName, options = {}) => {
135
96
  const start = Date.now();
136
97
  try {
137
98
  const model = (0, registry_1.getModel)(providerName, modelName);
@@ -147,14 +108,14 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
147
108
  ...options,
148
109
  });
149
110
  steps = stepsResult.steps;
150
- (0, registry_1.setSteps)(criteria, stepsResult.steps); // NOTE: cache asynchronously, without awaiting
111
+ (0, registry_1.setSteps)(criteria, stepsResult.steps);
151
112
  }
152
113
  const evaluationPrompt = Mustache.render(prompt_1.GEVAL_EVALUATE_PROMPT, {
153
114
  criteria,
154
115
  steps: steps.join('\n- '),
155
116
  input: prompt,
156
117
  output: answer,
157
- maxScore: config_1.default.gevalMaxScore,
118
+ maxScore,
158
119
  });
159
120
  const { output: evalResult } = await (0, ai_1.generateText)({
160
121
  model,
@@ -166,10 +127,10 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
166
127
  });
167
128
  const result = {
168
129
  reason: evalResult.reason,
169
- score: evalResult.score / config_1.default.gevalMaxScore,
130
+ score: evalResult.score / maxScore,
170
131
  };
171
132
  config_1.default.hooks.onSuccess?.({
172
- method: 'gEval',
133
+ method: methodName,
173
134
  params: { prompt, answer, criteria, providerName, modelName, options },
174
135
  result,
175
136
  duration: Date.now() - start,
@@ -178,12 +139,15 @@ const gEval = async (prompt, answer, criteria, providerName, modelName, options
178
139
  }
179
140
  catch (error) {
180
141
  config_1.default.hooks.onError?.({
181
- method: 'gEval',
142
+ method: methodName,
182
143
  error,
183
144
  duration: Date.now() - start,
184
145
  });
185
146
  throw error;
186
147
  }
187
148
  };
149
+ const gEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => _gEval(prompt, answer, criteria, providerName, modelName, config_1.default.gevalMaxScore, 'gEval', options);
188
150
  exports.gEval = gEval;
151
+ const bEval = async (prompt, answer, criteria, providerName, modelName, options = {}) => _gEval(prompt, answer, criteria, providerName, modelName, 1, 'bEval', options);
152
+ exports.bEval = bEval;
189
153
  //# sourceMappingURL=index.js.map
package/dst/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAmBhB;;;GAGG;AACU,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IACzC,6DAA6D;IAC7D,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IACpF,6DAA6D;IAC7D,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IACnF,+CAA+C;IAC/C,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAQH;;;GAGG;AACU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAC7C,kEAAkE;IAClE,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAQH;;;GAGG;AACU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAChD,6DAA6D;IAC7D,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IACpF,iEAAiE;IACjE,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAOH;;;;;;;;;GASG;AACI,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,iCAAwB;YAChC,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;A
AvCY,QAAA,SAAS,aAuCrB;AAED;;;;;;;;;;GAUG;AACI,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,+CAA+C;QACxF,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,8BAAqB,EAAE;YAC9D,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,MAAM;YACb,MAAM,EAAE,MAAM;YACd,QAAQ,EAAE,gBAAI,CAAC,aAAa;SAC7B,CAAC,CAAC;QAEH,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,gBAAI,CAAC,aAAa;SAC7C,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,OAAO;YACf,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACtE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,OAAO;YACf,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvEY,QAAA,KAAK,SAuEjB"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,2BAA0C;AAC1C,mDAAqC;AACrC,8CAAoB;AAEpB,qCAKkB;AAClB,yCAA0D;AAC1D,sDAA4B;AAG5B,2CAAyB;AACzB,mCAAmC;AAA1B,kHAAA,OAAO,OAAA;AAqBH,QAAA,kBAAkB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEzC,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,IAAI,EAAE,aAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEnF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CAC9E,CAAC,CAAC;AAUU,QAAA,sBAAsB,GAAG,aAAC,CAAC,MAAM,CAAC;IAE7C,KAAK,EAAE,aAAC,CAAC,KAAK,CAAC,aAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,4DAA4D,CAAC;CAClG,CAAC,CAAC;AAWU,QAAA,yBAAyB,GAAG,aAAC,CAAC,MAAM,CAAC;IAEhD,MAAM,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IAEpF,KAAK,EAAE,aAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,mCAAmC,CAAC;CACvE,CAAC,CAAC;AAgBI,MAAM,SAAS,GAAG,KAAK,EAC5B,MAAc,EACd,MAAc,EACd,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACF,EAAE;IACzB,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC,+BAAsB,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAC;QAE/E,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAC5C,KAAK,EAAE,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC;YACxC,MAAM,EAAE,iCAAwB;YAChC,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,0BAAkB;aAC3B,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,WAAW;YACnB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YAC5D,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,WAAW;YACnB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAvCY,QAAA,SAAS,aAuCrB;AAED,MAAM,MAAM,GAAG,KAAK,EAClB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,QAAgB,EAChB,UAAsB,EACtB
,UAAuB,EAAE,EACK,EAAE;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAEzB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,mBAAQ,EAAC,YAAY,EAAE,SAAS,CAAC,CAAC;QAChD,IAAI,KAAK,GAAG,MAAM,IAAA,mBAAQ,EAAC,QAAQ,CAAC,CAAC;QAErC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,2BAAkB,EAAE,EAAE,QAAQ,EAAE,CAAC,CAAC;YAEtE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;gBACjD,KAAK;gBACL,MAAM,EAAE,WAAW;gBACnB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;oBACpB,MAAM,EAAE,8BAAsB;iBAC/B,CAAC;gBACF,GAAG,OAAO;aACX,CAAC,CAAC;YAEH,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC;YAE1B,IAAA,mBAAQ,EAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC;QACxC,CAAC;QAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC,8BAAqB,EAAE;YAC9D,QAAQ;YACR,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC;YACzB,KAAK,EAAE,MAAM;YACb,MAAM,EAAE,MAAM;YACd,QAAQ;SACT,CAAC,CAAC;QAEH,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,IAAA,iBAAY,EAAC;YAChD,KAAK;YACL,MAAM,EAAE,gBAAgB;YACxB,MAAM,EAAE,WAAM,CAAC,MAAM,CAAC;gBACpB,MAAM,EAAE,iCAAyB;aAClC,CAAC;YACF,GAAG,OAAO;SACX,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG;YACb,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,KAAK,EAAE,UAAU,CAAC,KAAK,GAAG,QAAQ;SACnC,CAAC;QAEF,gBAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACrB,MAAM,EAAE,UAAU;YAClB,MAAM,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,EAAE;YACtE,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAEf,gBAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,EAAE,UAAU;YAClB,KAAK;YACL,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;SAC7B,CAAC,CAAC;QAEH,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC,CAAA;AAaM,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,gBAAI,CAAC,aAAa,EAClB,OAAO,EACP,OAAO,CACR,CAAC;AAhBW,QAAA,KAAK,SAgBhB;AAaK,MAAM,KAAK,GAAG,KAAK,EACxB,MAAc,EACd,MAAc,EACd,QAAgB,EAChB,YAAoB,EACpB,SAAiB,EACjB,UAAuB,EAAE,EACK,EAAE,CAAC,MAAM,CACvC,MAAM,EACN,MAAM,EACN,QAAQ,EACR,YAAY,EACZ,SAAS,EACT,CAAC,EACD,OAAO,EACP,OAAO,CACR,CAAC;AAhBW,QAAA,K
AAK,SAgBhB"}
package/dst/prompt.d.ts CHANGED
@@ -1,16 +1,8 @@
1
- /**
2
- * System prompt for LLM rubric-based grading. Guides the LLM to grade output according to a rubric and respond with a JSON object.
1
+ /*!
2
+ * Portions of this code are based on Promptfoo (MIT License)
3
+ * Copyright (c) 2025 Promptfoo
3
4
  */
4
5
  export declare const LLM_RUBRIC_SYSTEM_PROMPT = "You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}\n\nExamples:\n\n<Output>Hello world</Output>\n<Rubric>Content contains a greeting</Rubric>\n{\"reason\": \"the content contains the word 'Hello'\", \"pass\": true, \"score\": 1.0}\n\n<Output>Avast ye swabs, repel the invaders!</Output>\n<Rubric>Does not speak like a pirate</Rubric>\n{\"reason\": \"'avast ye' is a common pirate term\", \"pass\": false, \"score\": 0.0}\n";
5
- /**
6
- * User prompt template for rubric-based grading. Used to inject output and rubric into the prompt.
7
- */
8
6
  export declare const LLM_RUBRIC_USER_PROMPT = "<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>";
9
- /**
10
- * System prompt for generating evaluation steps from criteria. Guides the LLM to output a minified JSON array of steps.
11
- */
12
7
  export declare const GEVAL_STEPS_PROMPT = "\nGiven an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.\n\n**EVALUATION CRITERIA**\n{{criteria}}\n\n**OUTPUT FORMAT**\nIMPORTANT:\n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain a single key, \"steps\", whose value is a list of strings.\n- Each string must represent one evaluation step.\n- Do NOT include any explanations, commentary, extra text, or additional formatting.\n\nFormat:\n{\"steps\": <list_of_strings>}\n\nExample:\n{\"steps\":[\"<Evaluation Step 1>\",\"<Evaluation Step 2>\",\"<Evaluation Step 3>\",\"<Evaluation Step 4>\"]}\n\nHere are the 3-4 concise evaluation steps, formatted as required in a minified JSON:\nJSON:\n";
13
- /**
14
- * System prompt for evaluating a reply against criteria and steps. Guides the LLM to return a JSON with score and reason.
15
- */
16
8
  export declare const GEVAL_EVALUATE_PROMPT = "\nYou will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.\nPlease make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.\n\n**Evaluation Criteria**\n{{criteria}}\n\n**Evaluation Steps**\n- {{steps}}\nGiven the evaluation steps, return a JSON with two keys: \n 1) a \"score\" key that MUST be an integer from 0 to {{maxScore}}, where {{maxScore}} indicates that the Evaluation Criteria is fully and clearly present in the Reply according to the Evaluation Steps, and 0 indicates the total absence of the Evaluation Criteria;\n 2) a \"reason\" key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from Prompt and Reply in your reason, but be very concise with it!\n\n**Prompt**\n{{input}}\n\n**Reply**\n{{output}}\n\n**OUTPUT FORMAT**\nIMPORTANT: \n- Return output ONLY as a minified JSON object (no code fences).\n- The JSON object must contain exactly two keys: \"score\" and \"reason\".\n- No additional words, explanations, or formatting are needed.\n- Absolutely no additional text, explanations, line breaks, or formatting outside the JSON object are allowed.\n\nExample JSON:\n{\"score\":0,\"reason\":\"The text of reply does not follow the evaluation criteria provided.\"}\n\nHere is the final evaluation in the required minified JSON format:\nJSON:\n";
package/dst/prompt.js CHANGED
@@ -1,9 +1,10 @@
1
1
  "use strict";
2
+ /*!
3
+ * Portions of this code are based on Promptfoo (MIT License)
4
+ * Copyright (c) 2025 Promptfoo
5
+ */
2
6
  Object.defineProperty(exports, "__esModule", { value: true });
3
7
  exports.GEVAL_EVALUATE_PROMPT = exports.GEVAL_STEPS_PROMPT = exports.LLM_RUBRIC_USER_PROMPT = exports.LLM_RUBRIC_SYSTEM_PROMPT = void 0;
4
- /**
5
- * System prompt for LLM rubric-based grading. Guides the LLM to grade output according to a rubric and respond with a JSON object.
6
- */
7
8
  exports.LLM_RUBRIC_SYSTEM_PROMPT = `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
8
9
 
9
10
  Examples:
@@ -16,13 +17,7 @@ Examples:
16
17
  <Rubric>Does not speak like a pirate</Rubric>
17
18
  {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
18
19
  `;
19
- /**
20
- * User prompt template for rubric-based grading. Used to inject output and rubric into the prompt.
21
- */
22
20
  exports.LLM_RUBRIC_USER_PROMPT = '<Output>\n{{output}}\n</Output>\n<Rubric>\n{{rubric}}\n</Rubric>';
23
- /**
24
- * System prompt for generating evaluation steps from criteria. Guides the LLM to output a minified JSON array of steps.
25
- */
26
21
  exports.GEVAL_STEPS_PROMPT = `
27
22
  Given an evaluation criteria which outlines how you should judge a piece of text, generate 3-4 concise evaluation steps applicable to any text based on the criteria below and designed to confirm the criteria.
28
23
 
@@ -45,9 +40,6 @@ Example:
45
40
  Here are the 3-4 concise evaluation steps, formatted as required in a minified JSON:
46
41
  JSON:
47
42
  `;
48
- /**
49
- * System prompt for evaluating a reply against criteria and steps. Guides the LLM to return a JSON with score and reason.
50
- */
51
43
  exports.GEVAL_EVALUATE_PROMPT = `
52
44
  You will be given one Reply for a Prompt below. Your task is to rate the Reply on one metric.
53
45
  Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
package/dst/prompt.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";;;AAAA;;GAEG;AACU,QAAA,wBAAwB,GAAG;;;;;;;;;;;CAWvC,CAAC;AAEF;;GAEG;AACU,QAAA,sBAAsB,GAAG,kEAAkE,CAAC;AAEzG;;GAEG;AACU,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;;;;;;CAqBjC,CAAC;AAEF;;GAEG;AACU,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+BpC,CAAC"}
1
+ {"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":";AAAA;;;GAGG;;;AAKU,QAAA,wBAAwB,GAAG;;;;;;;;;;;CAWvC,CAAC;AAKW,QAAA,sBAAsB,GAAG,kEAAkE,CAAC;AAK5F,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;;;;;;CAqBjC,CAAC;AAKW,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+BpC,CAAC"}
package/dst/registry.d.ts CHANGED
@@ -1,21 +1,4 @@
1
1
  import { type LanguageModel } from 'ai';
2
- /**
3
- * Get a language model instance from the provider and model name, using cache if enabled.
4
- * @param providerName The provider name (e.g., 'openai').
5
- * @param modelName The model name.
6
- * @returns The language model instance.
7
- */
8
2
  export declare const getModel: (providerName: string, modelName: string) => LanguageModel;
9
- /**
10
- * Get cached evaluation steps for a criteria, if caching is enabled.
11
- * @param criteria The evaluation criteria string.
12
- * @returns Promise resolving to the cached steps or undefined.
13
- */
14
3
  export declare const getSteps: (criteria: string) => Promise<string[] | undefined>;
15
- /**
16
- * Set evaluation steps for a criteria in the cache, if caching is enabled.
17
- * @param criteria The evaluation criteria string.
18
- * @param steps The steps to cache.
19
- * @returns Promise that resolves when the steps are set.
20
- */
21
4
  export declare const setSteps: (criteria: string, steps: string[]) => Promise<void>;
package/dst/registry.js CHANGED
@@ -49,9 +49,6 @@ const groq_1 = require("@ai-sdk/groq");
49
49
  const perplexity_1 = require("@ai-sdk/perplexity");
50
50
  const xai_1 = require("@ai-sdk/xai");
51
51
  const config_1 = __importDefault(require("./config"));
52
- /**
53
- * Map of provider names to provider functions.
54
- */
55
52
  const PROVIDERS = {
56
53
  openai: openai_1.openai,
57
54
  anthropic: anthropic_1.anthropic,
@@ -64,12 +61,6 @@ const PROVIDERS = {
64
61
  perplexity: perplexity_1.perplexity,
65
62
  xai: xai_1.xai,
66
63
  };
67
- /**
68
- * Get a language model instance from the provider and model name, using cache if enabled.
69
- * @param providerName The provider name (e.g., 'openai').
70
- * @param modelName The model name.
71
- * @returns The language model instance.
72
- */
73
64
  const getModel = (providerName, modelName) => {
74
65
  const cacheKey = `${providerName}:${modelName}`;
75
66
  let model = config_1.default.isModelCached ? config_1.default.modelCache.get(cacheKey) : undefined;
@@ -86,29 +77,13 @@ const getModel = (providerName, modelName) => {
86
77
  return model;
87
78
  };
88
79
  exports.getModel = getModel;
89
- /**
90
- * Compute the MD5 hash of a string.
91
- * @param str The input string.
92
- * @returns The MD5 hash as a hex string.
93
- */
94
80
  const md5 = (str) => {
95
81
  return crypto.createHash('md5').update(str).digest('hex');
96
82
  };
97
- /**
98
- * Get cached evaluation steps for a criteria, if caching is enabled.
99
- * @param criteria The evaluation criteria string.
100
- * @returns Promise resolving to the cached steps or undefined.
101
- */
102
83
  const getSteps = (criteria) => {
103
84
  return config_1.default.isStepsCached ? config_1.default.stepsCache.get(md5(criteria)) : Promise.resolve(undefined);
104
85
  };
105
86
  exports.getSteps = getSteps;
106
- /**
107
- * Set evaluation steps for a criteria in the cache, if caching is enabled.
108
- * @param criteria The evaluation criteria string.
109
- * @param steps The steps to cache.
110
- * @returns Promise that resolves when the steps are set.
111
- */
112
87
  const setSteps = (criteria, steps) => {
113
88
  if (config_1.default.isStepsCached) {
114
89
  return config_1.default.stepsCache.set(md5(criteria), steps);
@@ -1 +1 @@
1
- {"version":3,"file":"registry.js","sourceRoot":"","sources":["../src/registry.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,oDAAsC;AAEtC,2CAAwC;AACxC,iDAA8C;AAC9C,2CAAwC;AACxC,6CAA0C;AAC1C,2DAAiD;AACjD,yCAAsC;AACtC,+CAA4C;AAC5C,uCAAoC;AACpC,mDAAgD;AAChD,qCAAkC;AAElC,sDAA4B;AAG5B;;GAEG;AACH,MAAM,SAAS,GAA6B;IAC1C,MAAM,EAAN,eAAM;IACN,SAAS,EAAT,qBAAS;IACT,MAAM,EAAN,eAAM;IACN,OAAO,EAAP,iBAAO;IACP,OAAO,EAAP,wBAAO;IACP,KAAK,EAAL,aAAK;IACL,QAAQ,EAAR,mBAAQ;IACR,IAAI,EAAJ,WAAI;IACJ,UAAU,EAAV,uBAAU;IACV,GAAG,EAAH,SAAG;CACJ,CAAC;AAEF;;;;;GAKG;AACI,MAAM,QAAQ,GAAG,CAAC,YAAoB,EAAE,SAAiB,EAAiB,EAAE;IACjF,MAAM,QAAQ,GAAG,GAAG,YAAY,IAAI,SAAS,EAAE,CAAC;IAEhD,IAAI,KAAK,GAAG,gBAAI,CAAC,aAAa,CAAC,CAAC,CAAC,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAE3E,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,QAAQ,GAAG,SAAS,CAAC,YAAY,CAAC,CAAC;QAEzC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,IAAI,KAAK,CAAC,sBAAsB,YAAY,2BAA2B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACpH,CAAC;QAED,KAAK,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC;QAE5B,IAAI,gBAAI,CAAC,aAAa,EAAE,CAAC;YACvB,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAED,OAAO,KAAM,CAAC;AAChB,CAAC,CAAA;AApBY,QAAA,QAAQ,YAoBpB;AAED;;;;GAIG;AACH,MAAM,GAAG,GAAG,CAAC,GAAW,EAAU,EAAE;IAClC,OAAO,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC5D,CAAC,CAAA;AAED;;;;GAIG;AACI,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAiC,EAAE;IAC1E,OAAO,gBAAI,CAAC,aAAa,CAAC,CAAC,CAAC,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;AAC9F,CAAC,CAAA;AAFY,QAAA,QAAQ,YAEpB;AAED;;;;;GAKG;AACI,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAE,KAAe,EAAiB,EAAE;IAC3E,IAAI,gBAAI,CAAC,aAAa,EAAE,CAAC;QACvB,OAAO,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,CAAC,CAAC;IACnD,CAAC;IAED,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC;AAC3B,CAAC,CAAA;AANY,QAAA,QAAQ,YAMpB"}
1
+ {"version":3,"file":"registry.js","sourceRoot":"","sources":["../src/registry.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,oDAAsC;AAEtC,2CAAwC;AACxC,iDAA8C;AAC9C,2CAAwC;AACxC,6CAA0C;AAC1C,2DAAiD;AACjD,yCAAsC;AACtC,+CAA4C;AAC5C,uCAAoC;AACpC,mDAAgD;AAChD,qCAAkC;AAElC,sDAA4B;AAM5B,MAAM,SAAS,GAA6B;IAC1C,MAAM,EAAN,eAAM;IACN,SAAS,EAAT,qBAAS;IACT,MAAM,EAAN,eAAM;IACN,OAAO,EAAP,iBAAO;IACP,OAAO,EAAP,wBAAO;IACP,KAAK,EAAL,aAAK;IACL,QAAQ,EAAR,mBAAQ;IACR,IAAI,EAAJ,WAAI;IACJ,UAAU,EAAV,uBAAU;IACV,GAAG,EAAH,SAAG;CACJ,CAAC;AAQK,MAAM,QAAQ,GAAG,CAAC,YAAoB,EAAE,SAAiB,EAAiB,EAAE;IACjF,MAAM,QAAQ,GAAG,GAAG,YAAY,IAAI,SAAS,EAAE,CAAC;IAEhD,IAAI,KAAK,GAAG,gBAAI,CAAC,aAAa,CAAC,CAAC,CAAC,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAE3E,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,QAAQ,GAAG,SAAS,CAAC,YAAY,CAAC,CAAC;QAEzC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,IAAI,KAAK,CAAC,sBAAsB,YAAY,2BAA2B,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACpH,CAAC;QAED,KAAK,GAAG,QAAQ,CAAC,SAAS,CAAC,CAAC;QAE5B,IAAI,gBAAI,CAAC,aAAa,EAAE,CAAC;YACvB,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;IAED,OAAO,KAAM,CAAC;AAChB,CAAC,CAAA;AApBY,QAAA,QAAQ,YAoBpB;AAOD,MAAM,GAAG,GAAG,CAAC,GAAW,EAAU,EAAE;IAClC,OAAO,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC5D,CAAC,CAAA;AAOM,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAiC,EAAE;IAC1E,OAAO,gBAAI,CAAC,aAAa,CAAC,CAAC,CAAC,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;AAC9F,CAAC,CAAA;AAFY,QAAA,QAAQ,YAEpB;AAQM,MAAM,QAAQ,GAAG,CAAC,QAAgB,EAAE,KAAe,EAAiB,EAAE;IAC3E,IAAI,gBAAI,CAAC,aAAa,EAAE,CAAC;QACvB,OAAO,gBAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,KAAK,CAAC,CAAC;IACnD,CAAC;IAED,OAAO,OAAO,CAAC,OAAO,EAAE,CAAC;AAC3B,CAAC,CAAA;AANY,QAAA,QAAQ,YAMpB"}
package/dst/types.d.ts ADDED
@@ -0,0 +1 @@
1
+ export type EvalMethod = 'bEval' | 'gEval' | 'llmRubric';
package/dst/types.js ADDED
@@ -0,0 +1,3 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@eva-llm/eva-judge",
3
- "version": "0.1.0",
4
- "description": "LLM-as-a-judge abstraction layer using ai-sdk and plugins",
3
+ "version": "0.1.2",
4
+ "description": "LLM-as-a-Judge abstraction layer using ai-sdk and plugins",
5
5
  "main": "dst/index.js",
6
6
  "types": "dst/index.d.ts",
7
7
  "engines": {