praisonai 1.5.3 → 1.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/eval.d.ts +2 -0
- package/dist/cli/commands/eval.js +58 -3
- package/dist/eval/index.d.ts +1 -0
- package/dist/eval/index.js +16 -1
- package/dist/eval/judge.d.ts +275 -0
- package/dist/eval/judge.js +528 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +22 -8
- package/package.json +1 -1
|
@@ -45,12 +45,12 @@ const cli_spec_1 = require("../spec/cli-spec");
|
|
|
45
45
|
const errors_1 = require("../output/errors");
|
|
46
46
|
async function execute(args, options) {
|
|
47
47
|
const subcommand = args[0];
|
|
48
|
-
if (!subcommand || !['accuracy', 'performance', 'reliability'].includes(subcommand)) {
|
|
48
|
+
if (!subcommand || !['accuracy', 'performance', 'reliability', 'judge'].includes(subcommand)) {
|
|
49
49
|
if (options.json || options.output === 'json') {
|
|
50
|
-
(0, json_1.printError)(errors_1.ERROR_CODES.INVALID_ARGS, 'Please specify a subcommand: accuracy, performance, or
|
|
50
|
+
(0, json_1.printError)(errors_1.ERROR_CODES.INVALID_ARGS, 'Please specify a subcommand: accuracy, performance, reliability, or judge');
|
|
51
51
|
}
|
|
52
52
|
else {
|
|
53
|
-
await pretty.error('Please specify a subcommand: accuracy, performance, or
|
|
53
|
+
await pretty.error('Please specify a subcommand: accuracy, performance, reliability, or judge');
|
|
54
54
|
}
|
|
55
55
|
process.exit(cli_spec_1.EXIT_CODES.INVALID_ARGUMENTS);
|
|
56
56
|
}
|
|
@@ -72,6 +72,9 @@ async function execute(args, options) {
|
|
|
72
72
|
case 'reliability':
|
|
73
73
|
await runReliabilityEval(options, config, outputFormat);
|
|
74
74
|
break;
|
|
75
|
+
case 'judge':
|
|
76
|
+
await runJudgeEval(options, config, outputFormat);
|
|
77
|
+
break;
|
|
75
78
|
}
|
|
76
79
|
}
|
|
77
80
|
catch (error) {
|
|
@@ -245,3 +248,55 @@ function calculateSimilarity(str1, str2) {
|
|
|
245
248
|
const union = new Set([...words1, ...words2]);
|
|
246
249
|
return intersection.length / union.size;
|
|
247
250
|
}
|
|
251
|
+
/**
 * Handler for the `judge` subcommand: scores a piece of text with the
 * LLM-as-judge evaluator and reports the verdict.
 *
 * Reads the text to judge from `options.input`, plus the optional
 * `options.expected`, `options.criteria` and `options.threshold`
 * (7.0 when unset); the model comes from `config.model`. Emits a JSON
 * envelope when `outputFormat` is 'json', otherwise a human-readable report.
 *
 * @throws {Error} when `options.input` is missing or empty.
 */
async function runJudgeEval(options, config, outputFormat) {
    // Loaded on demand so the judge module is only required when this subcommand runs.
    const judgeModule = await Promise.resolve().then(() => __importStar(require('../../eval/judge')));
    const { input: judgedText, expected, criteria } = options;
    if (!judgedText) {
        throw new Error('--input is required for judge evaluation (the output to judge)');
    }
    const startedAt = Date.now();
    const threshold = options.threshold ?? 7.0;
    const judge = new judgeModule.Judge({ model: config.model, threshold, criteria });
    const verdict = await judge.run({ output: judgedText, expected, criteria });
    const elapsedMs = Date.now() - startedAt;
    if (outputFormat === 'json') {
        const payload = {
            type: 'judge',
            threshold,
            result: {
                score: verdict.score,
                passed: verdict.passed,
                reasoning: verdict.reasoning,
                suggestions: verdict.suggestions,
            }
        };
        (0, json_1.outputJson)((0, json_1.formatSuccess)(payload, { duration_ms: elapsedMs, model: config.model }));
        return;
    }
    await pretty.heading('LLM-as-Judge Evaluation Results');
    await pretty.keyValue({
        'Score': `${verdict.score.toFixed(1)}/10`,
        'Status': verdict.passed ? '✅ PASSED' : '❌ FAILED',
        'Threshold': threshold,
        'Reasoning': verdict.reasoning,
        'Duration': `${elapsedMs}ms`
    });
    // The "Suggestions" section is only printed when there is something to list.
    if (verdict.suggestions.length > 0) {
        await pretty.heading('Suggestions');
        for (const tip of verdict.suggestions) {
            console.log(`  • ${tip}`);
        }
    }
}
|
package/dist/eval/index.d.ts
CHANGED
|
@@ -61,3 +61,4 @@ export declare class EvalSuite {
|
|
|
61
61
|
}
|
|
62
62
|
export { Evaluator, createEvaluator, createDefaultEvaluator, relevanceCriterion, lengthCriterion, containsKeywordsCriterion, noHarmfulContentCriterion, type EvalCriteria, type EvalResult as BaseEvalResult, type EvalSummary, type EvaluatorConfig, } from './base';
|
|
63
63
|
export { EvalResults, createEvalResults, type TestResult, type AggregatedResults, type TrendPoint, } from './results';
|
|
64
|
+
export { Judge, AccuracyJudge, CriteriaJudge, RecipeJudge, addJudge, getJudge, listJudges, removeJudge, addOptimizationRule, getOptimizationRule, listOptimizationRules, removeOptimizationRule, parseJudgeResponse, type JudgeConfig, type JudgeCriteriaConfig, type JudgeResult, type JudgeRunOptions, type JudgeOptions, type JudgeProtocol, } from './judge';
|
package/dist/eval/index.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* Evaluation Framework - Accuracy, Performance, and Reliability evaluation
|
|
4
4
|
*/
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.createEvalResults = exports.EvalResults = exports.noHarmfulContentCriterion = exports.containsKeywordsCriterion = exports.lengthCriterion = exports.relevanceCriterion = exports.createDefaultEvaluator = exports.createEvaluator = exports.Evaluator = exports.EvalSuite = void 0;
|
|
6
|
+
exports.parseJudgeResponse = exports.removeOptimizationRule = exports.listOptimizationRules = exports.getOptimizationRule = exports.addOptimizationRule = exports.removeJudge = exports.listJudges = exports.getJudge = exports.addJudge = exports.RecipeJudge = exports.CriteriaJudge = exports.AccuracyJudge = exports.Judge = exports.createEvalResults = exports.EvalResults = exports.noHarmfulContentCriterion = exports.containsKeywordsCriterion = exports.lengthCriterion = exports.relevanceCriterion = exports.createDefaultEvaluator = exports.createEvaluator = exports.Evaluator = exports.EvalSuite = void 0;
|
|
7
7
|
exports.accuracyEval = accuracyEval;
|
|
8
8
|
exports.performanceEval = performanceEval;
|
|
9
9
|
exports.reliabilityEval = reliabilityEval;
|
|
@@ -168,3 +168,18 @@ Object.defineProperty(exports, "noHarmfulContentCriterion", { enumerable: true,
|
|
|
168
168
|
var results_1 = require("./results");
|
|
169
169
|
Object.defineProperty(exports, "EvalResults", { enumerable: true, get: function () { return results_1.EvalResults; } });
|
|
170
170
|
Object.defineProperty(exports, "createEvalResults", { enumerable: true, get: function () { return results_1.createEvalResults; } });
|
|
171
|
+
// Re-export Judge (LLM-as-Judge)
|
|
172
|
+
var judge_1 = require("./judge");
|
|
173
|
+
Object.defineProperty(exports, "Judge", { enumerable: true, get: function () { return judge_1.Judge; } });
|
|
174
|
+
Object.defineProperty(exports, "AccuracyJudge", { enumerable: true, get: function () { return judge_1.AccuracyJudge; } });
|
|
175
|
+
Object.defineProperty(exports, "CriteriaJudge", { enumerable: true, get: function () { return judge_1.CriteriaJudge; } });
|
|
176
|
+
Object.defineProperty(exports, "RecipeJudge", { enumerable: true, get: function () { return judge_1.RecipeJudge; } });
|
|
177
|
+
Object.defineProperty(exports, "addJudge", { enumerable: true, get: function () { return judge_1.addJudge; } });
|
|
178
|
+
Object.defineProperty(exports, "getJudge", { enumerable: true, get: function () { return judge_1.getJudge; } });
|
|
179
|
+
Object.defineProperty(exports, "listJudges", { enumerable: true, get: function () { return judge_1.listJudges; } });
|
|
180
|
+
Object.defineProperty(exports, "removeJudge", { enumerable: true, get: function () { return judge_1.removeJudge; } });
|
|
181
|
+
Object.defineProperty(exports, "addOptimizationRule", { enumerable: true, get: function () { return judge_1.addOptimizationRule; } });
|
|
182
|
+
Object.defineProperty(exports, "getOptimizationRule", { enumerable: true, get: function () { return judge_1.getOptimizationRule; } });
|
|
183
|
+
Object.defineProperty(exports, "listOptimizationRules", { enumerable: true, get: function () { return judge_1.listOptimizationRules; } });
|
|
184
|
+
Object.defineProperty(exports, "removeOptimizationRule", { enumerable: true, get: function () { return judge_1.removeOptimizationRule; } });
|
|
185
|
+
Object.defineProperty(exports, "parseJudgeResponse", { enumerable: true, get: function () { return judge_1.parseJudgeResponse; } });
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unified Judge class for LLM-as-judge evaluation.
|
|
3
|
+
*
|
|
4
|
+
* Provides a simple, unified API for evaluating agent outputs using LLM-as-judge.
|
|
5
|
+
* Follows PraisonAI naming conventions and engineering principles.
|
|
6
|
+
*
|
|
7
|
+
* DRY: Reuses existing provider infrastructure.
|
|
8
|
+
* Protocol-driven: Implements JudgeProtocol for extensibility.
|
|
9
|
+
* Zero performance impact: Lazy imports for LLM providers.
|
|
10
|
+
*
|
|
11
|
+
* @example
|
|
12
|
+
* ```typescript
|
|
13
|
+
* import { Judge } from 'praisonai';
|
|
14
|
+
* const result = await new Judge().run({ output: "4", expected: "4" });
|
|
15
|
+
* console.log(`Score: ${result.score}/10`);
|
|
16
|
+
* ```
|
|
17
|
+
*/
|
|
18
|
+
/**
|
|
19
|
+
* Configuration for Judge instances.
|
|
20
|
+
*/
|
|
21
|
+
export interface JudgeConfig {
|
|
22
|
+
/** LLM model to use for judging (default: gpt-4o-mini) */
|
|
23
|
+
model?: string;
|
|
24
|
+
/** Temperature for LLM calls (default: 0.1 for consistency) */
|
|
25
|
+
temperature?: number;
|
|
26
|
+
/** Maximum tokens for LLM response */
|
|
27
|
+
maxTokens?: number;
|
|
28
|
+
/** Score threshold for passing (default: 7.0) */
|
|
29
|
+
threshold?: number;
|
|
30
|
+
/** Optional custom criteria for evaluation */
|
|
31
|
+
criteria?: string;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Dynamic criteria configuration for domain-agnostic judging.
|
|
35
|
+
*
|
|
36
|
+
* Enables judges to evaluate ANY domain, not just agent outputs:
|
|
37
|
+
* - Water flow optimization
|
|
38
|
+
* - Data pipeline efficiency
|
|
39
|
+
* - Manufacturing quality
|
|
40
|
+
* - Recipe/workflow optimization
|
|
41
|
+
* - Any custom domain
|
|
42
|
+
*/
|
|
43
|
+
export interface JudgeCriteriaConfig {
|
|
44
|
+
/** Name of the criteria configuration */
|
|
45
|
+
name: string;
|
|
46
|
+
/** Description of what is being evaluated */
|
|
47
|
+
description: string;
|
|
48
|
+
/** Custom prompt template; the {output}, {input}, {input_text} and {expected} placeholders are substituted */
|
|
49
|
+
promptTemplate: string;
|
|
50
|
+
/** List of dimensions to score (e.g., ["efficiency", "safety"]) */
|
|
51
|
+
scoringDimensions: string[];
|
|
52
|
+
/** Score threshold for passing (default: 7.0) */
|
|
53
|
+
threshold?: number;
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* Result from a Judge evaluation.
|
|
57
|
+
*
|
|
58
|
+
* This is the unified result type for all LLM-as-judge evaluations.
|
|
59
|
+
*/
|
|
60
|
+
export interface JudgeResult {
|
|
61
|
+
/** Quality score (1-10) */
|
|
62
|
+
score: number;
|
|
63
|
+
/** Whether the evaluation passed (score >= threshold) */
|
|
64
|
+
passed: boolean;
|
|
65
|
+
/** Explanation for the score */
|
|
66
|
+
reasoning: string;
|
|
67
|
+
/** The output that was judged */
|
|
68
|
+
output: string;
|
|
69
|
+
/** Optional expected output */
|
|
70
|
+
expected?: string;
|
|
71
|
+
/** Optional criteria used for evaluation */
|
|
72
|
+
criteria?: string;
|
|
73
|
+
/** List of improvement suggestions */
|
|
74
|
+
suggestions: string[];
|
|
75
|
+
/** When judging occurred */
|
|
76
|
+
timestamp: number;
|
|
77
|
+
/** Additional metadata */
|
|
78
|
+
metadata?: Record<string, any>;
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* Options for Judge.run() method
|
|
82
|
+
*/
|
|
83
|
+
export interface JudgeRunOptions {
|
|
84
|
+
/** The output to judge (required if no agent) */
|
|
85
|
+
output?: string;
|
|
86
|
+
/** Optional expected output for accuracy evaluation */
|
|
87
|
+
expected?: string;
|
|
88
|
+
/** Optional criteria for criteria evaluation */
|
|
89
|
+
criteria?: string;
|
|
90
|
+
/** Optional input context */
|
|
91
|
+
input?: string;
|
|
92
|
+
/** Optional Agent to run and judge */
|
|
93
|
+
agent?: any;
|
|
94
|
+
/** Optional Agents to run and judge */
|
|
95
|
+
agents?: any;
|
|
96
|
+
/** Whether to print result summary */
|
|
97
|
+
printSummary?: boolean;
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
* Constructor options for Judge class
|
|
101
|
+
*/
|
|
102
|
+
export interface JudgeOptions {
|
|
103
|
+
/** LLM model to use */
|
|
104
|
+
model?: string;
|
|
105
|
+
/** Temperature for LLM calls */
|
|
106
|
+
temperature?: number;
|
|
107
|
+
/** Maximum tokens for response */
|
|
108
|
+
maxTokens?: number;
|
|
109
|
+
/** Score threshold for passing */
|
|
110
|
+
threshold?: number;
|
|
111
|
+
/** Custom criteria for evaluation */
|
|
112
|
+
criteria?: string;
|
|
113
|
+
/** Full JudgeConfig object */
|
|
114
|
+
config?: JudgeConfig;
|
|
115
|
+
/** Domain-agnostic criteria config */
|
|
116
|
+
criteriaConfig?: JudgeCriteriaConfig;
|
|
117
|
+
/** Session ID for trace isolation */
|
|
118
|
+
sessionId?: string;
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Protocol interface for Judge implementations
|
|
122
|
+
*/
|
|
123
|
+
export interface JudgeProtocol {
|
|
124
|
+
run(options: JudgeRunOptions): Promise<JudgeResult>;
|
|
125
|
+
runAsync(options: JudgeRunOptions): Promise<JudgeResult>;
|
|
126
|
+
}
|
|
127
|
+
/**
|
|
128
|
+
* Parse LLM response into JudgeResult.
|
|
129
|
+
*
|
|
130
|
+
* @param responseText - Raw LLM response
|
|
131
|
+
* @param output - Original output
|
|
132
|
+
* @param expected - Original expected output
|
|
133
|
+
* @param criteria - Original criteria
|
|
134
|
+
* @param threshold - Score threshold for passing
|
|
135
|
+
* @returns JudgeResult with score, passed, reasoning, suggestions
|
|
136
|
+
*/
|
|
137
|
+
export declare function parseJudgeResponse(responseText: string, output: string, expected: string | null, criteria: string | null, threshold: number): JudgeResult;
|
|
138
|
+
/**
|
|
139
|
+
* Unified LLM-as-judge for evaluating agent outputs.
|
|
140
|
+
*
|
|
141
|
+
* Provides a simple API for:
|
|
142
|
+
* - Accuracy evaluation (comparing output to expected)
|
|
143
|
+
* - Criteria evaluation (evaluating against custom criteria)
|
|
144
|
+
* - Custom evaluation (subclass for domain-specific judges)
|
|
145
|
+
*
|
|
146
|
+
* @example
|
|
147
|
+
* ```typescript
|
|
148
|
+
* // Simple accuracy check
|
|
149
|
+
* const result = await new Judge().run({ output: "4", expected: "4" });
|
|
150
|
+
*
|
|
151
|
+
* // Custom criteria
|
|
152
|
+
* const result = await new Judge({ criteria: "Response is helpful" }).run({ output: "Hello!" });
|
|
153
|
+
*
|
|
154
|
+
* // With agent
|
|
155
|
+
* const result = await new Judge().run({ agent: myAgent, input: "2+2", expected: "4" });
|
|
156
|
+
* ```
|
|
157
|
+
*/
|
|
158
|
+
export declare class Judge implements JudgeProtocol {
|
|
159
|
+
readonly model: string;
|
|
160
|
+
readonly temperature: number;
|
|
161
|
+
readonly maxTokens: number;
|
|
162
|
+
readonly threshold: number;
|
|
163
|
+
readonly criteria: string | null;
|
|
164
|
+
readonly criteriaConfig: JudgeCriteriaConfig | null;
|
|
165
|
+
readonly sessionId: string | null;
|
|
166
|
+
constructor(options?: JudgeOptions);
|
|
167
|
+
/**
|
|
168
|
+
* Build the appropriate prompt based on evaluation type.
|
|
169
|
+
*/
|
|
170
|
+
protected buildPrompt(output: string, expected: string | null, criteria: string | null, input: string): string;
|
|
171
|
+
/**
|
|
172
|
+
* Get LLM provider lazily.
|
|
173
|
+
*/
|
|
174
|
+
protected getProvider(): Promise<any>;
|
|
175
|
+
/**
|
|
176
|
+
* Get output from an Agent.
|
|
177
|
+
*/
|
|
178
|
+
protected getAgentOutput(agent: any, input: string): Promise<string>;
|
|
179
|
+
/**
|
|
180
|
+
* Get output from Agents (multi-agent).
|
|
181
|
+
*/
|
|
182
|
+
protected getAgentsOutput(agents: any, input: string): Promise<string>;
|
|
183
|
+
/**
|
|
184
|
+
* Judge an output.
|
|
185
|
+
*
|
|
186
|
+
* @param options - Evaluation options
|
|
187
|
+
* @returns JudgeResult with score, passed, reasoning, suggestions
|
|
188
|
+
*/
|
|
189
|
+
run(options: JudgeRunOptions): Promise<JudgeResult>;
|
|
190
|
+
/**
|
|
191
|
+
* Judge an output asynchronously (alias for run).
|
|
192
|
+
*/
|
|
193
|
+
runAsync(options: JudgeRunOptions): Promise<JudgeResult>;
|
|
194
|
+
/**
|
|
195
|
+
* Print a summary of the judge result.
|
|
196
|
+
*/
|
|
197
|
+
printSummary(result: JudgeResult): void;
|
|
198
|
+
}
|
|
199
|
+
/**
|
|
200
|
+
* Judge for accuracy evaluation (comparing output to expected).
|
|
201
|
+
*/
|
|
202
|
+
export declare class AccuracyJudge extends Judge {
|
|
203
|
+
}
|
|
204
|
+
/**
|
|
205
|
+
* Judge for criteria-based evaluation.
|
|
206
|
+
*/
|
|
207
|
+
export declare class CriteriaJudge extends Judge {
|
|
208
|
+
}
|
|
209
|
+
/**
|
|
210
|
+
* Judge for evaluating recipe/workflow execution traces.
|
|
211
|
+
*/
|
|
212
|
+
export declare class RecipeJudge extends Judge {
|
|
213
|
+
readonly mode: string;
|
|
214
|
+
constructor(options?: JudgeOptions & {
|
|
215
|
+
mode?: string;
|
|
216
|
+
});
|
|
217
|
+
protected buildPrompt(output: string, expected: string | null, criteria: string | null, input: string): string;
|
|
218
|
+
}
|
|
219
|
+
type JudgeConstructor = new (options?: JudgeOptions) => Judge;
|
|
220
|
+
/**
|
|
221
|
+
* Register a custom judge type.
|
|
222
|
+
*
|
|
223
|
+
* @param name - Name for the judge type
|
|
224
|
+
* @param judgeClass - Judge class to register
|
|
225
|
+
*/
|
|
226
|
+
export declare function addJudge(name: string, judgeClass: JudgeConstructor): void;
|
|
227
|
+
/**
|
|
228
|
+
* Get a registered judge type by name.
|
|
229
|
+
*
|
|
230
|
+
* @param name - Name of the judge type
|
|
231
|
+
* @returns Judge class or undefined if not found
|
|
232
|
+
*/
|
|
233
|
+
export declare function getJudge(name: string): JudgeConstructor | undefined;
|
|
234
|
+
/**
|
|
235
|
+
* List all registered judge types.
|
|
236
|
+
*
|
|
237
|
+
* @returns List of judge type names
|
|
238
|
+
*/
|
|
239
|
+
export declare function listJudges(): string[];
|
|
240
|
+
/**
|
|
241
|
+
* Remove a registered judge type.
|
|
242
|
+
*
|
|
243
|
+
* @param name - Name of the judge type to remove
|
|
244
|
+
* @returns True if removed, false if not found
|
|
245
|
+
*/
|
|
246
|
+
export declare function removeJudge(name: string): boolean;
|
|
247
|
+
type OptimizationRuleConstructor = new (...args: any[]) => any;
|
|
248
|
+
/**
|
|
249
|
+
* Register a custom optimization rule.
|
|
250
|
+
*
|
|
251
|
+
* @param name - Name for the rule
|
|
252
|
+
* @param ruleClass - Rule class implementing OptimizationRuleProtocol
|
|
253
|
+
*/
|
|
254
|
+
export declare function addOptimizationRule(name: string, ruleClass: OptimizationRuleConstructor): void;
|
|
255
|
+
/**
|
|
256
|
+
* Get a registered optimization rule by name.
|
|
257
|
+
*
|
|
258
|
+
* @param name - Name of the rule
|
|
259
|
+
* @returns Rule class or undefined if not found
|
|
260
|
+
*/
|
|
261
|
+
export declare function getOptimizationRule(name: string): OptimizationRuleConstructor | undefined;
|
|
262
|
+
/**
|
|
263
|
+
* List all registered optimization rules.
|
|
264
|
+
*
|
|
265
|
+
* @returns List of rule names
|
|
266
|
+
*/
|
|
267
|
+
export declare function listOptimizationRules(): string[];
|
|
268
|
+
/**
|
|
269
|
+
* Remove a registered optimization rule.
|
|
270
|
+
*
|
|
271
|
+
* @param name - Name of the rule to remove
|
|
272
|
+
* @returns True if removed, false if not found
|
|
273
|
+
*/
|
|
274
|
+
export declare function removeOptimizationRule(name: string): boolean;
|
|
275
|
+
export default Judge;
|
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Unified Judge class for LLM-as-judge evaluation.
|
|
4
|
+
*
|
|
5
|
+
* Provides a simple, unified API for evaluating agent outputs using LLM-as-judge.
|
|
6
|
+
* Follows PraisonAI naming conventions and engineering principles.
|
|
7
|
+
*
|
|
8
|
+
* DRY: Reuses existing provider infrastructure.
|
|
9
|
+
* Protocol-driven: Implements JudgeProtocol for extensibility.
|
|
10
|
+
* Zero performance impact: Lazy imports for LLM providers.
|
|
11
|
+
*
|
|
12
|
+
* @example
|
|
13
|
+
* ```typescript
|
|
14
|
+
* import { Judge } from 'praisonai';
|
|
15
|
+
* const result = await new Judge().run({ output: "4", expected: "4" });
|
|
16
|
+
* console.log(`Score: ${result.score}/10`);
|
|
17
|
+
* ```
|
|
18
|
+
*/
|
|
19
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
20
|
+
if (k2 === undefined) k2 = k;
|
|
21
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
22
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
23
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
24
|
+
}
|
|
25
|
+
Object.defineProperty(o, k2, desc);
|
|
26
|
+
}) : (function(o, m, k, k2) {
|
|
27
|
+
if (k2 === undefined) k2 = k;
|
|
28
|
+
o[k2] = m[k];
|
|
29
|
+
}));
|
|
30
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
31
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
32
|
+
}) : function(o, v) {
|
|
33
|
+
o["default"] = v;
|
|
34
|
+
});
|
|
35
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
36
|
+
var ownKeys = function(o) {
|
|
37
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
38
|
+
var ar = [];
|
|
39
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
40
|
+
return ar;
|
|
41
|
+
};
|
|
42
|
+
return ownKeys(o);
|
|
43
|
+
};
|
|
44
|
+
return function (mod) {
|
|
45
|
+
if (mod && mod.__esModule) return mod;
|
|
46
|
+
var result = {};
|
|
47
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
48
|
+
__setModuleDefault(result, mod);
|
|
49
|
+
return result;
|
|
50
|
+
};
|
|
51
|
+
})();
|
|
52
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
53
|
+
exports.RecipeJudge = exports.CriteriaJudge = exports.AccuracyJudge = exports.Judge = void 0;
|
|
54
|
+
exports.parseJudgeResponse = parseJudgeResponse;
|
|
55
|
+
exports.addJudge = addJudge;
|
|
56
|
+
exports.getJudge = getJudge;
|
|
57
|
+
exports.listJudges = listJudges;
|
|
58
|
+
exports.removeJudge = removeJudge;
|
|
59
|
+
exports.addOptimizationRule = addOptimizationRule;
|
|
60
|
+
exports.getOptimizationRule = getOptimizationRule;
|
|
61
|
+
exports.listOptimizationRules = listOptimizationRules;
|
|
62
|
+
exports.removeOptimizationRule = removeOptimizationRule;
|
|
63
|
+
// ============================================================================
|
|
64
|
+
// Prompt Templates
|
|
65
|
+
// ============================================================================
|
|
66
|
+
/** Default prompt for accuracy evaluation */
|
|
67
|
+
const ACCURACY_PROMPT = `You are an expert evaluator. Compare the actual output against the expected output.
|
|
68
|
+
|
|
69
|
+
INPUT: {input}
|
|
70
|
+
|
|
71
|
+
EXPECTED OUTPUT:
|
|
72
|
+
{expected}
|
|
73
|
+
|
|
74
|
+
ACTUAL OUTPUT:
|
|
75
|
+
{output}
|
|
76
|
+
|
|
77
|
+
Scoring Guidelines:
|
|
78
|
+
- 10: Perfect match in meaning and completeness
|
|
79
|
+
- 8-9: Very close, minor differences that don't affect correctness
|
|
80
|
+
- 6-7: Mostly correct but missing some details or has minor errors
|
|
81
|
+
- 4-5: Partially correct but significant issues
|
|
82
|
+
- 2-3: Mostly incorrect but shows some understanding
|
|
83
|
+
- 1: Completely wrong or irrelevant
|
|
84
|
+
|
|
85
|
+
Respond in this EXACT format:
|
|
86
|
+
SCORE: [number 1-10]
|
|
87
|
+
REASONING: [brief explanation]
|
|
88
|
+
SUGGESTIONS:
|
|
89
|
+
- [improvement suggestion 1]
|
|
90
|
+
- [improvement suggestion 2]
|
|
91
|
+
`;
|
|
92
|
+
/** Default prompt for criteria evaluation */
|
|
93
|
+
const CRITERIA_PROMPT = `You are an expert evaluator. Evaluate the output against the given criteria.
|
|
94
|
+
|
|
95
|
+
CRITERIA: {criteria}
|
|
96
|
+
|
|
97
|
+
OUTPUT TO EVALUATE:
|
|
98
|
+
{output}
|
|
99
|
+
|
|
100
|
+
Score the output from 1-10 based on how well it meets the criteria.
|
|
101
|
+
- 10: Perfectly meets all criteria
|
|
102
|
+
- 8-9: Meets criteria very well with minor issues
|
|
103
|
+
- 6-7: Meets most criteria but has some gaps
|
|
104
|
+
- 4-5: Partially meets criteria
|
|
105
|
+
- 2-3: Barely meets criteria
|
|
106
|
+
- 1: Does not meet criteria at all
|
|
107
|
+
|
|
108
|
+
Respond in this EXACT format:
|
|
109
|
+
SCORE: [number 1-10]
|
|
110
|
+
REASONING: [brief explanation]
|
|
111
|
+
SUGGESTIONS:
|
|
112
|
+
- [improvement suggestion 1]
|
|
113
|
+
- [improvement suggestion 2]
|
|
114
|
+
`;
|
|
115
|
+
/** Default prompt for general quality evaluation */
|
|
116
|
+
const GENERAL_PROMPT = `You are an expert evaluator for AI agent outputs. Your task is to grade the quality of an agent's response.
|
|
117
|
+
|
|
118
|
+
INPUT (what the agent was asked):
|
|
119
|
+
{input}
|
|
120
|
+
|
|
121
|
+
AGENT OUTPUT (what the agent responded):
|
|
122
|
+
{output}
|
|
123
|
+
|
|
124
|
+
GRADING CRITERIA:
|
|
125
|
+
- Accuracy: Is the response factually correct?
|
|
126
|
+
- Completeness: Does it fully address the input?
|
|
127
|
+
- Clarity: Is it well-written and easy to understand?
|
|
128
|
+
- Relevance: Does it stay on topic?
|
|
129
|
+
|
|
130
|
+
SCORING GUIDELINES:
|
|
131
|
+
- 10: Perfect - Excellent in all criteria
|
|
132
|
+
- 8-9: Very Good - Minor improvements possible
|
|
133
|
+
- 6-7: Good - Some issues but mostly correct
|
|
134
|
+
- 4-5: Fair - Significant issues
|
|
135
|
+
- 2-3: Poor - Major problems
|
|
136
|
+
- 1: Very Poor - Completely wrong or irrelevant
|
|
137
|
+
|
|
138
|
+
Respond in this EXACT format:
|
|
139
|
+
SCORE: [number 1-10]
|
|
140
|
+
REASONING: [brief explanation of the score]
|
|
141
|
+
SUGGESTIONS:
|
|
142
|
+
- [first suggestion for improvement]
|
|
143
|
+
- [second suggestion if applicable]
|
|
144
|
+
`;
|
|
145
|
+
/** Recipe evaluation prompt */
|
|
146
|
+
const RECIPE_PROMPT = `You are an expert evaluator for AI agent workflow recipes.
|
|
147
|
+
|
|
148
|
+
RECIPE OUTPUT TO EVALUATE:
|
|
149
|
+
{output}
|
|
150
|
+
|
|
151
|
+
EXPECTED BEHAVIOR:
|
|
152
|
+
{expected}
|
|
153
|
+
|
|
154
|
+
EVALUATION CRITERIA:
|
|
155
|
+
{criteria}
|
|
156
|
+
|
|
157
|
+
Evaluate the recipe execution on:
|
|
158
|
+
1. Task completion (1-10): Did agents complete their assigned tasks?
|
|
159
|
+
2. Context flow (1-10): Was context properly passed between agents?
|
|
160
|
+
3. Output quality (1-10): Is the final output useful and accurate?
|
|
161
|
+
|
|
162
|
+
Respond in this EXACT format:
|
|
163
|
+
SCORE: [average of above, 1-10]
|
|
164
|
+
REASONING: [brief explanation]
|
|
165
|
+
SUGGESTIONS:
|
|
166
|
+
- [improvement suggestion 1]
|
|
167
|
+
- [improvement suggestion 2]
|
|
168
|
+
`;
|
|
169
|
+
// ============================================================================
|
|
170
|
+
// Response Parsing
|
|
171
|
+
// ============================================================================
|
|
172
|
+
/**
 * Parse a raw LLM judge response into a JudgeResult.
 *
 * The response is scanned line by line for the markers the prompts request:
 * `SCORE:`, `REASONING:` and `SUGGESTIONS:` (followed by `- item` lines).
 * Anything that cannot be parsed keeps its default: score 5.0 and reasoning
 * 'Unable to parse response'.
 *
 * @param responseText - Raw LLM response (null/undefined is treated as empty)
 * @param output - Original output that was judged
 * @param expected - Original expected output, or null
 * @param criteria - Original criteria, or null
 * @param threshold - Score threshold for passing
 * @returns JudgeResult with score, passed, reasoning, suggestions
 */
function parseJudgeResponse(responseText, output, expected, criteria, threshold) {
    let score = 5.0;
    let reasoning = 'Unable to parse response';
    const suggestions = [];
    let inSuggestions = false;
    const lines = (responseText ?? '').trim().split('\n');
    for (const rawLine of lines) {
        const line = rawLine.trim();
        if (line.startsWith('SCORE:')) {
            // parseFloat never throws: an unparsable value comes back as NaN,
            // which would slip through the clamp below and make the score
            // (and the pass/fail comparison) meaningless. Keep the default instead.
            const parsed = Number.parseFloat(line.slice('SCORE:'.length));
            if (!Number.isNaN(parsed)) {
                // Clamp to the valid 1-10 range
                score = Math.max(1.0, Math.min(10.0, parsed));
            }
        }
        else if (line.startsWith('REASONING:')) {
            reasoning = line.slice('REASONING:'.length).trim();
        }
        else if (line.startsWith('SUGGESTIONS:')) {
            inSuggestions = true;
            // A suggestion may follow the marker on the same line.
            const rest = line.slice('SUGGESTIONS:'.length).trim();
            if (rest && rest.toLowerCase() !== 'none') {
                suggestions.push(rest);
            }
        }
        else if (inSuggestions && line.startsWith('-')) {
            const suggestion = line.replace(/^-\s*/, '').trim();
            if (suggestion && suggestion.toLowerCase() !== 'none') {
                suggestions.push(suggestion);
            }
        }
    }
    return {
        score,
        passed: score >= threshold,
        reasoning,
        output,
        expected: expected ?? undefined,
        criteria: criteria ?? undefined,
        suggestions,
        timestamp: Date.now(),
    };
}
|
|
229
|
+
// ============================================================================
|
|
230
|
+
// Judge Class
|
|
231
|
+
// ============================================================================
|
|
232
|
+
/**
 * Fill `{name}` placeholders in a prompt template in a single pass.
 *
 * A function replacer is used on purpose: with a string replacement,
 * `String.prototype.replace` interprets sequences such as `$&` or `$$`
 * inside the substituted value, which would corrupt outputs that contain
 * dollar signs. Doing all substitutions in one pass also guarantees that
 * text inserted for one placeholder is never re-scanned for another.
 *
 * @param template - Template text containing `{name}` placeholders
 * @param values - Map of placeholder name to replacement value
 * @returns The template with every known placeholder replaced; unknown
 *          `{...}` sequences are left untouched
 */
function fillPromptTemplate(template, values) {
    return template.replace(/\{(\w+)\}/g, (placeholder, key) => (Object.prototype.hasOwnProperty.call(values, key) ? String(values[key]) : placeholder));
}
/**
 * Unified LLM-as-judge for evaluating agent outputs.
 *
 * Provides a simple API for:
 * - Accuracy evaluation (comparing output to expected)
 * - Criteria evaluation (evaluating against custom criteria)
 * - Custom evaluation (subclass for domain-specific judges)
 *
 * @example
 * ```typescript
 * // Simple accuracy check
 * const result = await new Judge().run({ output: "4", expected: "4" });
 *
 * // Custom criteria
 * const result = await new Judge({ criteria: "Response is helpful" }).run({ output: "Hello!" });
 *
 * // With agent
 * const result = await new Judge().run({ agent: myAgent, input: "2+2", expected: "4" });
 * ```
 */
class Judge {
    /**
     * Create a judge.
     *
     * Explicit options win over `options.config`, which wins over the
     * built-in defaults. `options.criteriaConfig` supplies the last fallback
     * for `threshold` and `criteria` (via its `description`).
     *
     * @param options - Judge options (all optional)
     */
    constructor(options = {}) {
        // Use config if provided, otherwise use individual params
        const config = options.config;
        // Model falls back to the OPENAI_MODEL_NAME environment variable.
        this.model = options.model ?? config?.model ?? process.env.OPENAI_MODEL_NAME ?? 'gpt-4o-mini';
        this.temperature = options.temperature ?? config?.temperature ?? 0.1;
        this.maxTokens = options.maxTokens ?? config?.maxTokens ?? 500;
        this.threshold = options.threshold ?? config?.threshold ?? options.criteriaConfig?.threshold ?? 7.0;
        this.criteria = options.criteria ?? config?.criteria ?? options.criteriaConfig?.description ?? null;
        this.criteriaConfig = options.criteriaConfig ?? null;
        this.sessionId = options.sessionId ?? null;
    }
    /**
     * Build the appropriate prompt based on evaluation type.
     *
     * Selection order:
     * 1. `criteriaConfig.promptTemplate`, when configured
     *    (placeholders: `{output}`, `{input}`, `{input_text}`, `{expected}`)
     * 2. Accuracy prompt, when `expected` is provided
     * 3. Criteria prompt, when criteria are given here or on the instance
     * 4. General quality prompt
     *
     * Every occurrence of a placeholder is replaced, and substituted values
     * are inserted literally (no `$` pattern expansion, no re-scanning).
     *
     * @param output - Text being judged
     * @param expected - Expected output, or null/undefined when not applicable
     * @param criteria - Per-call criteria; falls back to the instance criteria
     * @param input - Original input; empty values render as "Not provided"
     * @returns The prompt text to send to the model
     */
    buildPrompt(output, expected, criteria, input) {
        const inputText = input || 'Not provided';
        // Use criteriaConfig custom prompt template if available
        if (this.criteriaConfig?.promptTemplate) {
            return fillPromptTemplate(this.criteriaConfig.promptTemplate, {
                output,
                input: inputText,
                input_text: inputText,
                expected: expected || 'Not specified',
            });
        }
        // Use instance criteria if not provided
        const effectiveCriteria = criteria ?? this.criteria;
        // `!= null` so that an omitted (undefined) expected value does not
        // select accuracy mode and render the text "undefined".
        if (expected != null) {
            // Accuracy mode
            return fillPromptTemplate(ACCURACY_PROMPT, { input: inputText, expected, output });
        }
        if (effectiveCriteria) {
            // Criteria mode
            return fillPromptTemplate(CRITERIA_PROMPT, { criteria: effectiveCriteria, output });
        }
        // Default: general quality evaluation
        return fillPromptTemplate(GENERAL_PROMPT, { input: inputText, output });
    }
    /**
     * Get LLM provider lazily.
     *
     * @returns Whatever `createProvider` returns for this judge's model
     */
    async getProvider() {
        // Lazy import to avoid performance impact
        const { createProvider } = await Promise.resolve().then(() => __importStar(require('../llm/providers')));
        return createProvider(this.model);
    }
    /**
     * Get output from an Agent.
     *
     * Prefers `agent.chat(input)`; otherwise uses `agent.start(input)` and
     * unwraps a `raw` property when the result is an object that has one.
     *
     * @param agent - Object exposing a `chat` or `start` method
     * @param input - Input passed to the agent
     * @returns The agent result coerced to a string
     * @throws Error when the agent has neither `chat` nor `start`
     */
    async getAgentOutput(agent, input) {
        if (typeof agent.chat === 'function') {
            return String(await agent.chat(input));
        }
        if (typeof agent.start === 'function') {
            const result = await agent.start(input);
            if (result && typeof result === 'object' && 'raw' in result) {
                return String(result.raw);
            }
            return String(result);
        }
        throw new Error("Agent must have 'chat' or 'start' method");
    }
    /**
     * Get output from Agents (multi-agent).
     *
     * @param agents - Object exposing a `start` method
     * @param input - Input passed to `agents.start`
     * @returns The result (or its `raw` property) coerced to a string
     * @throws Error when `agents` has no `start` method
     */
    async getAgentsOutput(agents, input) {
        if (typeof agents.start === 'function') {
            const result = await agents.start(input);
            if (result && typeof result === 'object' && 'raw' in result) {
                return String(result.raw);
            }
            return String(result);
        }
        throw new Error("Agents must have 'start' method");
    }
    /**
     * Judge an output.
     *
     * The text to judge is `options.output`, or the result of running
     * `options.agent` / `options.agents` on `options.input`. Errors raised
     * while running the agent propagate to the caller; errors raised while
     * obtaining the provider, generating, or parsing are captured and
     * returned as a failed result with score 0.
     *
     * @param options - Evaluation options (defaults to an empty object)
     * @returns JudgeResult with score, passed, reasoning, suggestions
     */
    async run(options = {}) {
        let output = options.output ?? '';
        // Get output from agent if provided
        if (options.agent) {
            output = await this.getAgentOutput(options.agent, options.input ?? '');
        }
        else if (options.agents) {
            output = await this.getAgentsOutput(options.agents, options.input ?? '');
        }
        if (!output) {
            // Nothing to evaluate: fail fast without calling the model.
            return {
                score: 0,
                passed: false,
                reasoning: 'No output provided to judge',
                output: '',
                expected: options.expected,
                criteria: options.criteria ?? this.criteria ?? undefined,
                suggestions: [],
                timestamp: Date.now(),
            };
        }
        try {
            const provider = await this.getProvider();
            const prompt = this.buildPrompt(output, options.expected ?? null, options.criteria ?? null, options.input ?? '');
            const response = await provider.generateText({
                messages: [{ role: 'user', content: prompt }],
                temperature: this.temperature,
                maxTokens: this.maxTokens,
            });
            const responseText = response.text ?? '';
            const result = parseJudgeResponse(responseText, output, options.expected ?? null, options.criteria ?? this.criteria, this.threshold);
            if (options.printSummary) {
                this.printSummary(result);
            }
            return result;
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            return {
                score: 0,
                passed: false,
                reasoning: `Evaluation error: ${errorMessage}`,
                output,
                expected: options.expected,
                criteria: options.criteria ?? this.criteria ?? undefined,
                suggestions: [],
                timestamp: Date.now(),
            };
        }
    }
    /**
     * Judge an output asynchronously (alias for run).
     *
     * @param options - Same options as `run`
     * @returns The promise returned by `run`
     */
    async runAsync(options) {
        return this.run(options);
    }
    /**
     * Print a summary of the judge result.
     *
     * @param result - Result object with `score`, `passed`, `reasoning`
     *                 and a `suggestions` array
     */
    printSummary(result) {
        const status = result.passed ? '✅ PASSED' : '❌ FAILED';
        console.log(`\n=== Judge Result ===`);
        console.log(`Score: ${result.score.toFixed(1)}/10`);
        console.log(`Status: ${status}`);
        console.log(`Reasoning: ${result.reasoning}`);
        if (result.suggestions.length > 0) {
            console.log(`Suggestions:`);
            result.suggestions.forEach(s => console.log(`  - ${s}`));
        }
    }
}
|
|
412
|
+
exports.Judge = Judge;
|
|
413
|
+
// ============================================================================
|
|
414
|
+
// Built-in Judge Types
|
|
415
|
+
// ============================================================================
|
|
416
|
+
/**
 * Accuracy-oriented judge: compares an output against an expected value.
 *
 * Adds no members of its own; all behavior is inherited from `Judge`.
 */
class AccuracyJudge extends Judge {}
|
|
421
|
+
exports.AccuracyJudge = AccuracyJudge;
|
|
422
|
+
/**
 * Criteria-oriented judge: evaluates an output against custom criteria.
 *
 * Adds no members of its own; all behavior is inherited from `Judge`.
 */
class CriteriaJudge extends Judge {}
|
|
427
|
+
exports.CriteriaJudge = CriteriaJudge;
|
|
428
|
+
/**
 * Judge for evaluating recipe/workflow execution traces.
 *
 * The criteria text is always derived from `options.mode` (default
 * `'context'`), and `maxTokens` defaults to 800 instead of the base default.
 */
class RecipeJudge extends Judge {
    /**
     * @param options - Judge options plus an optional `mode` string
     */
    constructor(options = {}) {
        const mode = options.mode ?? 'context';
        super({
            ...options,
            // Listed after the spread, so this replaces any `options.criteria`.
            criteria: `Recipe execution quality in ${mode} mode`,
            maxTokens: options.maxTokens ?? 800,
        });
        this.mode = mode;
    }
    /**
     * Build the recipe evaluation prompt from RECIPE_PROMPT.
     *
     * All `{output}`, `{expected}` and `{criteria}` placeholders are filled
     * in one pass with a function replacer, so `$` sequences in the values
     * are inserted literally and substituted text is never re-scanned for
     * further placeholders.
     *
     * @param output - Execution trace/output being judged
     * @param expected - Expected result; defaults to "Complete workflow execution"
     * @param criteria - Per-call criteria; falls back to the instance criteria
     * @param input - Unused by this prompt; kept for signature compatibility
     * @returns The prompt text to send to the model
     */
    buildPrompt(output, expected, criteria, input) {
        const values = {
            output,
            expected: expected ?? 'Complete workflow execution',
            criteria: criteria ?? this.criteria ?? `Recipe quality in ${this.mode} mode`,
        };
        return RECIPE_PROMPT.replace(/\{(output|expected|criteria)\}/g, (_placeholder, key) => String(values[key]));
    }
}
|
|
448
|
+
exports.RecipeJudge = RecipeJudge;
|
|
449
|
+
// Registry of judge classes keyed by lower-cased type name.
const JUDGE_REGISTRY = new Map([
    ['accuracy', AccuracyJudge],
    ['criteria', CriteriaJudge],
    ['recipe', RecipeJudge],
]);
/**
 * Normalize a judge type name to its registry key.
 *
 * @param name - Candidate judge type name
 * @returns Lower-cased name, or null when `name` is not a string
 */
function normalizeJudgeName(name) {
    return typeof name === 'string' ? name.toLowerCase() : null;
}
/**
 * Register a custom judge type.
 *
 * Names are case-insensitive; registering an existing name replaces it.
 *
 * @param name - Name for the judge type
 * @param judgeClass - Judge class to register
 * @throws TypeError when `name` is not a string
 */
function addJudge(name, judgeClass) {
    const key = normalizeJudgeName(name);
    if (key === null) {
        throw new TypeError('Judge name must be a string');
    }
    JUDGE_REGISTRY.set(key, judgeClass);
}
/**
 * Get a registered judge type by name.
 *
 * @param name - Name of the judge type (case-insensitive)
 * @returns Judge class or undefined if not found (including non-string names)
 */
function getJudge(name) {
    const key = normalizeJudgeName(name);
    return key === null ? undefined : JUDGE_REGISTRY.get(key);
}
/**
 * List all registered judge types.
 *
 * @returns List of judge type names
 */
function listJudges() {
    return Array.from(JUDGE_REGISTRY.keys());
}
/**
 * Remove a registered judge type.
 *
 * @param name - Name of the judge type to remove (case-insensitive)
 * @returns True if removed, false if not found (including non-string names)
 */
function removeJudge(name) {
    const key = normalizeJudgeName(name);
    return key === null ? false : JUDGE_REGISTRY.delete(key);
}
|
|
489
|
+
// Registry of optimization rule classes keyed by lower-cased rule name.
const OPTIMIZATION_RULE_REGISTRY = new Map();
/**
 * Normalize an optimization rule name to its registry key.
 *
 * @param name - Candidate rule name
 * @returns Lower-cased name, or null when `name` is not a string
 */
function normalizeOptimizationRuleName(name) {
    return typeof name === 'string' ? name.toLowerCase() : null;
}
/**
 * Register a custom optimization rule.
 *
 * Names are case-insensitive; registering an existing name replaces it.
 *
 * @param name - Name for the rule
 * @param ruleClass - Rule class implementing OptimizationRuleProtocol
 * @throws TypeError when `name` is not a string
 */
function addOptimizationRule(name, ruleClass) {
    const key = normalizeOptimizationRuleName(name);
    if (key === null) {
        throw new TypeError('Optimization rule name must be a string');
    }
    OPTIMIZATION_RULE_REGISTRY.set(key, ruleClass);
}
/**
 * Get a registered optimization rule by name.
 *
 * @param name - Name of the rule (case-insensitive)
 * @returns Rule class or undefined if not found (including non-string names)
 */
function getOptimizationRule(name) {
    const key = normalizeOptimizationRuleName(name);
    return key === null ? undefined : OPTIMIZATION_RULE_REGISTRY.get(key);
}
/**
 * List all registered optimization rules.
 *
 * @returns List of rule names
 */
function listOptimizationRules() {
    return Array.from(OPTIMIZATION_RULE_REGISTRY.keys());
}
/**
 * Remove a registered optimization rule.
 *
 * @param name - Name of the rule to remove (case-insensitive)
 * @returns True if removed, false if not found (including non-string names)
 */
function removeOptimizationRule(name) {
    const key = normalizeOptimizationRuleName(name);
    return key === null ? false : OPTIMIZATION_RULE_REGISTRY.delete(key);
}
|
|
525
|
+
// ============================================================================
|
|
526
|
+
// Exports
|
|
527
|
+
// ============================================================================
|
|
528
|
+
exports.default = Judge;
|
package/dist/index.d.ts
CHANGED
|
@@ -61,7 +61,7 @@ export * from './guardrails';
|
|
|
61
61
|
export { Handoff, handoff, handoffFilters, type HandoffConfig, type HandoffContext, type HandoffResult } from './agent/handoff';
|
|
62
62
|
export { RouterAgent, createRouter, routeConditions, type RouterConfig, type RouteConfig, type RouteContext } from './agent/router';
|
|
63
63
|
export { ContextAgent, createContextAgent, type ContextAgentConfig, type ContextMessage } from './agent/context';
|
|
64
|
-
export { accuracyEval, performanceEval, reliabilityEval, EvalSuite, Evaluator, createEvaluator, createDefaultEvaluator, EvalResults, createEvalResults, relevanceCriterion, lengthCriterion, containsKeywordsCriterion, noHarmfulContentCriterion, type EvalResult, type PerformanceResult, type AccuracyEvalConfig, type PerformanceEvalConfig, type ReliabilityEvalConfig, type EvalCriteria, type EvaluatorConfig, type TestResult, type AggregatedResults } from './eval';
|
|
64
|
+
export { accuracyEval, performanceEval, reliabilityEval, EvalSuite, Evaluator, createEvaluator, createDefaultEvaluator, EvalResults, createEvalResults, relevanceCriterion, lengthCriterion, containsKeywordsCriterion, noHarmfulContentCriterion, Judge, AccuracyJudge, CriteriaJudge, RecipeJudge, addJudge, getJudge, listJudges, removeJudge, addOptimizationRule, getOptimizationRule, listOptimizationRules, removeOptimizationRule, parseJudgeResponse, type EvalResult, type PerformanceResult, type AccuracyEvalConfig, type PerformanceEvalConfig, type ReliabilityEvalConfig, type EvalCriteria, type EvaluatorConfig, type TestResult, type AggregatedResults, type JudgeConfig, type JudgeCriteriaConfig, type JudgeResult, type JudgeRunOptions, type JudgeOptions, } from './eval';
|
|
65
65
|
export { SkillManager, createSkillManager, parseSkillFile, type Skill, type SkillMetadata, type SkillDiscoveryOptions } from './skills';
|
|
66
66
|
export { parseArgs, executeCommand, CLI_SPEC_VERSION } from './cli';
|
|
67
67
|
export { Memory, createMemory } from './memory/memory';
|
package/dist/index.js
CHANGED
|
@@ -54,14 +54,14 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
54
54
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
55
55
|
exports.tavilySearch = exports.codeExecution = exports.composeMiddleware = exports.createValidationMiddleware = exports.createTracingMiddleware = exports.createRetryMiddleware = exports.createRateLimitMiddleware = exports.createRedactionMiddleware = exports.createTimeoutMiddleware = exports.createLoggingMiddleware = exports.MissingEnvVarError = exports.MissingDependencyError = exports.ToolsRegistry = exports.resetToolsRegistry = exports.createToolsRegistry = exports.getToolsRegistry = exports.registerBuiltinTools = exports.tools = exports.createDelegator = exports.createSubagentTools = exports.createSubagentTool = exports.SubagentTool = exports.getTool = exports.registerTool = exports.getRegistry = exports.ToolRegistry = exports.tool = exports.FunctionTool = exports.createTool = exports.validateTool = exports.ToolValidationError = exports.BaseTool = exports.setDefaultDbAdapter = exports.getDefaultDbAdapter = exports.createDbAdapter = exports.db = exports.WorkflowStep = exports.repeatPattern = exports.Repeat = exports.loopPattern = exports.Loop = exports.repeat = exports.loop = exports.route = exports.parallel = exports.Workflow = exports.Router = exports.PraisonAIAgents = exports.Agents = exports.Agent = void 0;
|
|
56
56
|
exports.EvalSuite = exports.reliabilityEval = exports.performanceEval = exports.accuracyEval = exports.createContextAgent = exports.ContextAgent = exports.routeConditions = exports.createRouter = exports.RouterAgent = exports.handoffFilters = exports.handoff = exports.Handoff = exports.createRateLimitPolicy = exports.createApiKeyPolicy = exports.createMCPSecurity = exports.MCPSecurity = exports.createMCPSession = exports.createMCPServer = exports.MCPServer = exports.getMCPTools = exports.createMCPClient = exports.MCPClient = exports.registerLocalTool = exports.registerNpmTool = exports.createCustomTool = exports.registerCustomTool = exports.codeMode = exports.airweaveSearch = exports.bedrockBrowserFill = exports.bedrockBrowserClick = exports.bedrockBrowserNavigate = exports.bedrockCodeInterpreter = exports.valyuCompanyResearch = exports.valyuEconomicsSearch = exports.valyuSecSearch = exports.valyuPatentSearch = exports.valyuBioSearch = exports.valyuPaperSearch = exports.valyuFinanceSearch = exports.valyuWebSearch = exports.superagentVerify = exports.superagentRedact = exports.superagentGuard = exports.firecrawlCrawl = exports.firecrawlScrape = exports.parallelSearch = exports.perplexitySearch = exports.exaSearch = exports.tavilyCrawl = exports.tavilyExtract = void 0;
|
|
57
|
-
exports.
|
|
58
|
-
exports.
|
|
59
|
-
exports.
|
|
60
|
-
exports.
|
|
61
|
-
exports.
|
|
62
|
-
exports.
|
|
63
|
-
exports.
|
|
64
|
-
exports.createCLIApprovalPrompt = exports.createComputerUseAgent = exports.ComputerUseClient = exports.createComputerUse = exports.createPostgresTool = exports.NLPostgresClient = exports.createNLPostgres = exports.parseSlackMessage = exports.verifySlackSignature = exports.SlackBot = void 0;
|
|
57
|
+
exports.createValidationOperationHooks = exports.createLoggingOperationHooks = exports.createHooksManager = exports.HooksManager = exports.createDocsManager = exports.DocsManager = exports.createSafetyRules = exports.createRulesManager = exports.RulesManager = exports.createEncryptionHooks = exports.createValidationHooks = exports.createLoggingHooks = exports.createMemoryHooks = exports.MemoryHooks = exports.DEFAULT_POLICIES = exports.createLLMSummarizer = exports.createAutoMemory = exports.AutoMemory = exports.createFileMemory = exports.FileMemory = exports.createMemory = exports.Memory = exports.CLI_SPEC_VERSION = exports.executeCommand = exports.parseArgs = exports.parseSkillFile = exports.createSkillManager = exports.SkillManager = exports.parseJudgeResponse = exports.removeOptimizationRule = exports.listOptimizationRules = exports.getOptimizationRule = exports.addOptimizationRule = exports.removeJudge = exports.listJudges = exports.getJudge = exports.addJudge = exports.RecipeJudge = exports.CriteriaJudge = exports.AccuracyJudge = exports.Judge = exports.noHarmfulContentCriterion = exports.containsKeywordsCriterion = exports.lengthCriterion = exports.relevanceCriterion = exports.createEvalResults = exports.EvalResults = exports.createDefaultEvaluator = exports.createEvaluator = exports.Evaluator = void 0;
|
|
58
|
+
exports.createPlan = exports.TaskAgent = exports.PlanningAgent = exports.PlanStorage = exports.TodoItem = exports.TodoList = exports.PlanStep = exports.Plan = exports.createLLMGuardrail = exports.LLMGuardrail = exports.createPromptExpanderAgent = exports.PromptExpanderAgent = exports.createQueryRewriterAgent = exports.QueryRewriterAgent = exports.createDeepResearchAgent = exports.DeepResearchAgent = exports.createAudioAgent = exports.AudioAgent = exports.createImageAgent = exports.ImageAgent = exports.createAutoAgents = exports.AutoAgents = exports.createHTTPSink = exports.createConsoleSink = exports.createTelemetryIntegration = exports.TelemetryIntegration = exports.createPerformanceMonitor = exports.PerformanceMonitor = exports.createAgentTelemetry = exports.cleanupTelemetry = exports.disableTelemetry = exports.enableTelemetry = exports.getTelemetry = exports.AgentTelemetry = exports.TelemetryCollector = exports.createTimingWorkflowHooks = exports.createLoggingWorkflowHooks = exports.createWorkflowHooks = exports.WorkflowHooksExecutor = exports.DisplayTypes = exports.clearAllCallbacks = exports.getRegisteredDisplayTypes = exports.hasApprovalCallback = exports.requestApproval = exports.executeCallback = exports.executeSyncCallback = exports.clearApprovalCallback = exports.registerApprovalCallback = exports.unregisterDisplayCallback = exports.registerDisplayCallback = void 0;
|
|
59
|
+
exports.BaseVoiceProvider = exports.createLangfuseObservability = exports.createMemoryObservability = exports.createConsoleObservability = exports.LangfuseObservabilityProvider = exports.MemoryObservabilityProvider = exports.ConsoleObservabilityProvider = exports.BaseObservabilityProvider = exports.createChromaStore = exports.ChromaVectorStore = exports.createQdrantStore = exports.QdrantVectorStore = exports.createWeaviateStore = exports.WeaviateVectorStore = exports.createPineconeStore = exports.PineconeVectorStore = exports.createMemoryVectorStore = exports.MemoryVectorStore = exports.BaseVectorStore = exports.createPostgresSessionStorage = exports.createMemoryPostgres = exports.createNeonPostgres = exports.PostgresSessionStorage = exports.MemoryPostgresAdapter = exports.NeonPostgresAdapter = exports.createMemoryRedis = exports.createUpstashRedis = exports.MemoryRedisAdapter = exports.UpstashRedisAdapter = exports.createSQLiteAdapter = exports.SQLiteAdapter = exports.validateWorkflowDefinition = exports.loadWorkflowFromFile = exports.createWorkflowFromYAML = exports.parseYAMLWorkflow = exports.createPubSub = exports.createEventBus = exports.AgentEvents = exports.AgentEventBus = exports.EventEmitterPubSub = exports.PubSub = exports.createFileCache = exports.createMemoryCache = exports.FileCache = exports.MemoryCache = exports.BaseCache = exports.createTaskAgent = exports.createPlanningAgent = exports.createPlanStorage = exports.createTodoList = void 0;
|
|
60
|
+
exports.COMMUNITY_PROVIDERS = exports.PROVIDER_ALIASES = exports.AISDK_PROVIDERS = exports.trace = exports.resetObservabilityAdapter = exports.getObservabilityAdapter = exports.setObservabilityAdapter = exports.clearAdapterCache = exports.createObservabilityAdapter = exports.createConsoleAdapter = exports.ConsoleObservabilityAdapter = exports.createMemoryAdapter = exports.MemoryObservabilityAdapter = exports.noopAdapter = exports.NoopObservabilityAdapter = exports.hasObservabilityToolEnvVar = exports.listObservabilityTools = exports.getObservabilityToolInfo = exports.OBSERVABILITY_TOOLS = exports.registerBuiltinProviders = exports.createProviderRegistry = exports.getDefaultRegistry = exports.listProviders = exports.hasProvider = exports.unregisterProvider = exports.registerProvider = exports.ProviderRegistry = exports.BaseProvider = exports.GoogleProvider = exports.AnthropicProvider = exports.OpenAIProvider = exports.getAvailableProviders = exports.isProviderAvailable = exports.parseModelString = exports.getDefaultProvider = exports.createProvider = exports.createGraphRAG = exports.GraphRAG = exports.GraphStore = exports.createLLMReranker = exports.createCrossEncoderReranker = exports.createCohereReranker = exports.LLMReranker = exports.CrossEncoderReranker = exports.CohereReranker = exports.BaseReranker = exports.createElevenLabsVoice = exports.createOpenAIVoice = exports.ElevenLabsVoiceProvider = exports.OpenAIVoiceProvider = void 0;
|
|
61
|
+
exports.FlowDisplay = exports.createFileCheckpointStorage = exports.FileCheckpointStorage = exports.MemoryCheckpointStorage = exports.createCheckpointManager = exports.CheckpointManager = exports.createFileJobStorage = exports.FileJobStorage = exports.MemoryJobStorage = exports.createJobQueue = exports.JobQueue = exports.cronExpressions = exports.createScheduler = exports.Scheduler = exports.MODE_POLICIES = exports.cliApprovalPrompt = exports.createAutonomyManager = exports.AutonomyManager = exports.DEFAULT_BLOCKED_PATHS = exports.DEFAULT_BLOCKED_COMMANDS = exports.CommandValidator = exports.sandboxExec = exports.createSandboxExecutor = exports.SandboxExecutor = exports.createDiffViewer = exports.DiffViewer = exports.createGitManager = exports.GitManager = exports.DEFAULT_IGNORE_PATTERNS = exports.getRepoTree = exports.createRepoMap = exports.RepoMap = exports.createHistoryManager = exports.HistoryManager = exports.createStatusDisplay = exports.StatusDisplay = exports.createInteractiveTUI = exports.InteractiveTUI = exports.MODEL_PRICING = exports.formatCost = exports.estimateTokens = exports.createCostTracker = exports.CostTracker = exports.isSlashCommand = exports.executeSlashCommand = exports.parseSlashCommand = exports.registerCommand = exports.createSlashCommandHandler = exports.SlashCommandHandler = exports.ADAPTERS = void 0;
|
|
62
|
+
exports.isDataUrl = exports.isUrl = exports.uint8ArrayToBase64 = exports.base64ToUint8Array = exports.toMessageContent = exports.createMultimodalMessage = exports.createTextPart = exports.createPdfPart = exports.createFilePart = exports.createImagePart = exports.getAICacheStats = exports.clearAICache = exports.applyMiddleware = exports.wrapModel = exports.createAILoggingMiddleware = exports.createCachingMiddleware = exports.resolveModelAlias = exports.hasModelAlias = exports.listModelAliases = exports.MODEL_ALIASES = exports.parseModel = exports.getModel = exports.createModel = exports.functionToTool = exports.createToolSet = exports.defineTool = exports.aiEmbedMany = exports.aiEmbed = exports.aiGenerateImage = exports.aiStreamObject = exports.aiGenerateObject = exports.aiStreamText = exports.aiGenerateText = exports.getQuickContext = exports.createFastContext = exports.FastContext = exports.triggerN8NWebhook = exports.createN8NIntegration = exports.N8NIntegration = exports.externalAgentAsTool = exports.createExternalAgent = exports.getExternalAgentRegistry = exports.GenericExternalAgent = exports.AiderAgent = exports.CodexCliAgent = exports.GeminiCliAgent = exports.ClaudeCodeAgent = exports.BaseExternalAgent = exports.renderWorkflow = exports.createFlowDisplay = void 0;
|
|
63
|
+
exports.getTelemetrySettings = exports.configureTelemetry = exports.autoEnableDevTools = exports.createDevToolsMiddleware = exports.getDevToolsUrl = exports.getDevToolsState = exports.isDevToolsEnabled = exports.disableDevTools = exports.enableDevTools = exports.TRANSCRIPTION_MODELS = exports.SPEECH_MODELS = exports.transcribe = exports.generateSpeech = exports.createDangerousPatternChecker = exports.isDangerous = exports.DANGEROUS_PATTERNS = exports.ToolApprovalTimeoutError = exports.ToolApprovalDeniedError = exports.withApproval = exports.setApprovalManager = exports.getApprovalManager = exports.ApprovalManager = exports.pipeUIMessageStreamToResponse = exports.toUIMessageStreamResponse = exports.createApprovalResponse = exports.getToolsNeedingApproval = exports.hasPendingApprovals = exports.createSystemMessage = exports.createTextMessage = exports.safeValidateUIMessages = exports.validateUIMessages = exports.convertToUIMessages = exports.convertToModelMessages = exports.stopWhen = exports.stopWhenNoToolCalls = exports.stopAfterSteps = exports.AgentLoop = exports.createAgentLoop = exports.createPagesHandler = exports.createRouteHandler = exports.createNestHandler = exports.createFastifyHandler = exports.createHonoHandler = exports.createExpressHandler = exports.createHttpHandler = exports.mcpToolsToAITools = exports.closeAllMCPClients = exports.closeMCPClient = exports.getMCPClient = exports.createMCP = void 0;
|
|
64
|
+
exports.createCLIApprovalPrompt = exports.createComputerUseAgent = exports.ComputerUseClient = exports.createComputerUse = exports.createPostgresTool = exports.NLPostgresClient = exports.createNLPostgres = exports.parseSlackMessage = exports.verifySlackSignature = exports.SlackBot = exports.createSlackBot = exports.createTelemetrySettings = exports.clearEvents = exports.getEvents = exports.recordEvent = exports.createTelemetryMiddleware = exports.withSpan = exports.createAISpan = exports.getTracer = exports.initOpenTelemetry = exports.isTelemetryEnabled = exports.disableAITelemetry = exports.enableAITelemetry = void 0;
|
|
65
65
|
// ============================================================================
|
|
66
66
|
// CORE API - The main classes users should use
|
|
67
67
|
// ============================================================================
|
|
@@ -230,6 +230,20 @@ Object.defineProperty(exports, "relevanceCriterion", { enumerable: true, get: fu
|
|
|
230
230
|
Object.defineProperty(exports, "lengthCriterion", { enumerable: true, get: function () { return eval_1.lengthCriterion; } });
|
|
231
231
|
Object.defineProperty(exports, "containsKeywordsCriterion", { enumerable: true, get: function () { return eval_1.containsKeywordsCriterion; } });
|
|
232
232
|
Object.defineProperty(exports, "noHarmfulContentCriterion", { enumerable: true, get: function () { return eval_1.noHarmfulContentCriterion; } });
|
|
233
|
+
// LLM-as-Judge
|
|
234
|
+
Object.defineProperty(exports, "Judge", { enumerable: true, get: function () { return eval_1.Judge; } });
|
|
235
|
+
Object.defineProperty(exports, "AccuracyJudge", { enumerable: true, get: function () { return eval_1.AccuracyJudge; } });
|
|
236
|
+
Object.defineProperty(exports, "CriteriaJudge", { enumerable: true, get: function () { return eval_1.CriteriaJudge; } });
|
|
237
|
+
Object.defineProperty(exports, "RecipeJudge", { enumerable: true, get: function () { return eval_1.RecipeJudge; } });
|
|
238
|
+
Object.defineProperty(exports, "addJudge", { enumerable: true, get: function () { return eval_1.addJudge; } });
|
|
239
|
+
Object.defineProperty(exports, "getJudge", { enumerable: true, get: function () { return eval_1.getJudge; } });
|
|
240
|
+
Object.defineProperty(exports, "listJudges", { enumerable: true, get: function () { return eval_1.listJudges; } });
|
|
241
|
+
Object.defineProperty(exports, "removeJudge", { enumerable: true, get: function () { return eval_1.removeJudge; } });
|
|
242
|
+
Object.defineProperty(exports, "addOptimizationRule", { enumerable: true, get: function () { return eval_1.addOptimizationRule; } });
|
|
243
|
+
Object.defineProperty(exports, "getOptimizationRule", { enumerable: true, get: function () { return eval_1.getOptimizationRule; } });
|
|
244
|
+
Object.defineProperty(exports, "listOptimizationRules", { enumerable: true, get: function () { return eval_1.listOptimizationRules; } });
|
|
245
|
+
Object.defineProperty(exports, "removeOptimizationRule", { enumerable: true, get: function () { return eval_1.removeOptimizationRule; } });
|
|
246
|
+
Object.defineProperty(exports, "parseJudgeResponse", { enumerable: true, get: function () { return eval_1.parseJudgeResponse; } });
|
|
233
247
|
// Note: Observability exports are at the bottom of this file with the full 14+ integrations
|
|
234
248
|
// Export skills
|
|
235
249
|
var skills_1 = require("./skills");
|