praisonai 1.5.3 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,275 @@
1
+ /**
2
+ * Unified Judge class for LLM-as-judge evaluation.
3
+ *
4
+ * Provides a simple, unified API for evaluating agent outputs using LLM-as-judge.
5
+ * Follows PraisonAI naming conventions and engineering principles.
6
+ *
7
+ * DRY: Reuses existing provider infrastructure.
8
+ * Protocol-driven: Implements JudgeProtocol for extensibility.
9
+ * Zero performance impact: Lazy imports for LLM providers.
10
+ *
11
+ * @example
12
+ * ```typescript
13
+ * import { Judge } from 'praisonai';
14
+ * const result = await new Judge().run({ output: "4", expected: "4" });
15
+ * console.log(`Score: ${result.score}/10`);
16
+ * ```
17
+ */
18
+ /**
19
+ * Configuration for Judge instances. Every field is optional.
20
+ */
21
+ export interface JudgeConfig {
22
+ /** LLM model to use for judging (default: gpt-4o-mini) */
23
+ model?: string;
24
+ /** Temperature for LLM calls (default: 0.1 for consistency) */
25
+ temperature?: number;
26
+ /** Maximum number of tokens allowed in the LLM response */
27
+ maxTokens?: number;
28
+ /** Minimum score counted as a pass, i.e. passed = score >= threshold (default: 7.0) */
29
+ threshold?: number;
30
+ /** Optional custom criteria text to evaluate the output against */
31
+ criteria?: string;
32
+ }
33
+ /**
34
+ * Dynamic criteria configuration for domain-agnostic judging (all fields except threshold are required).
35
+ *
36
+ * Enables judges to evaluate ANY domain, not just agent outputs:
37
+ * - Water flow optimization
38
+ * - Data pipeline efficiency
39
+ * - Manufacturing quality
40
+ * - Recipe/workflow optimization
41
+ * - Any custom domain
42
+ */
43
+ export interface JudgeCriteriaConfig {
44
+ /** Name of the criteria configuration */
45
+ name: string;
46
+ /** Description of what is being evaluated */
47
+ description: string;
48
+ /** Custom prompt template containing an {output} placeholder for the text being judged */
49
+ promptTemplate: string;
50
+ /** Names of the dimensions to score (e.g., ["efficiency", "safety"]) */
51
+ scoringDimensions: string[];
52
+ /** Minimum score counted as a pass (default: 7.0) */
53
+ threshold?: number;
54
+ }
55
+ /**
56
+ * Result from a Judge evaluation.
57
+ *
58
+ * This is the unified result type for all LLM-as-judge evaluations.
59
+ */
60
+ export interface JudgeResult {
61
+ /** Quality score (1-10) */
62
+ score: number;
63
+ /** Whether the evaluation passed (score >= threshold) */
64
+ passed: boolean;
65
+ /** Explanation for the score */
66
+ reasoning: string;
67
+ /** The output that was judged */
68
+ output: string;
69
+ /** Expected output, present only when one was supplied */
70
+ expected?: string;
71
+ /** Criteria used for evaluation, present only when criteria were supplied */
72
+ criteria?: string;
73
+ /** List of improvement suggestions */
74
+ suggestions: string[];
75
+ /** When judging occurred (numeric timestamp; unit is not stated in this declaration, confirm against the implementation) */
76
+ timestamp: number;
77
+ /** Additional free-form metadata (values are untyped) */
78
+ metadata?: Record<string, any>;
79
+ }
80
+ /**
81
+ * Options accepted by Judge.run() and Judge.runAsync().
82
+ */
83
+ export interface JudgeRunOptions {
84
+ /** The output to judge (required if no agent) */
85
+ output?: string;
86
+ /** Optional expected output for accuracy evaluation */
87
+ expected?: string;
88
+ /** Optional criteria for criteria evaluation */
89
+ criteria?: string;
90
+ /** Optional input context */
91
+ input?: string;
92
+ /** Optional Agent to run and judge (typed as any, so its shape is not checked here) */
93
+ agent?: any;
94
+ /** Optional Agents (multi-agent) to run and judge (typed as any, so its shape is not checked here) */
95
+ agents?: any;
96
+ /** Whether to print result summary */
97
+ printSummary?: boolean;
98
+ }
99
+ /**
100
+ * Constructor options for the Judge class. Every field is optional.
101
+ */
102
+ export interface JudgeOptions {
103
+ /** LLM model to use */
104
+ model?: string;
105
+ /** Temperature for LLM calls */
106
+ temperature?: number;
107
+ /** Maximum tokens for response */
108
+ maxTokens?: number;
109
+ /** Score threshold for passing */
110
+ threshold?: number;
111
+ /** Custom criteria for evaluation */
112
+ criteria?: string;
113
+ /** Full JudgeConfig object (precedence versus the individual fields above is not stated in this declaration) */
114
+ config?: JudgeConfig;
115
+ /** Domain-agnostic criteria config */
116
+ criteriaConfig?: JudgeCriteriaConfig;
117
+ /** Session ID for trace isolation */
118
+ sessionId?: string;
119
+ }
120
+ /**
121
+ * Protocol interface for Judge implementations: both methods take JudgeRunOptions and resolve to a JudgeResult.
122
+ */
123
+ export interface JudgeProtocol {
124
+ run(options: JudgeRunOptions): Promise<JudgeResult>;
125
+ runAsync(options: JudgeRunOptions): Promise<JudgeResult>;
126
+ }
127
+ /**
128
+ * Parse LLM response into JudgeResult.
129
+ *
130
+ * @param responseText - Raw LLM response text
131
+ * @param output - Original output that was judged
132
+ * @param expected - Original expected output, or null when none was given
133
+ * @param criteria - Original criteria, or null when none was given
134
+ * @param threshold - Score threshold for passing
135
+ * @returns JudgeResult with score, passed, reasoning, suggestions
136
+ */
137
+ export declare function parseJudgeResponse(responseText: string, output: string, expected: string | null, criteria: string | null, threshold: number): JudgeResult;
138
+ /**
139
+ * Unified LLM-as-judge for evaluating agent outputs.
140
+ *
141
+ * Provides a simple API for:
142
+ * - Accuracy evaluation (comparing output to expected)
143
+ * - Criteria evaluation (evaluating against custom criteria)
144
+ * - Custom evaluation (subclass for domain-specific judges)
145
+ *
146
+ * @example
147
+ * ```typescript
148
+ * // Simple accuracy check
149
+ * const result = await new Judge().run({ output: "4", expected: "4" });
150
+ *
151
+ * // Custom criteria
152
+ * const result = await new Judge({ criteria: "Response is helpful" }).run({ output: "Hello!" });
153
+ *
154
+ * // With agent
155
+ * const result = await new Judge().run({ agent: myAgent, input: "2+2", expected: "4" });
156
+ * ```
157
+ */
158
+ export declare class Judge implements JudgeProtocol {
159
+ readonly model: string;
160
+ readonly temperature: number;
161
+ readonly maxTokens: number;
162
+ readonly threshold: number;
163
+ readonly criteria: string | null;
164
+ readonly criteriaConfig: JudgeCriteriaConfig | null;
165
+ readonly sessionId: string | null;
166
+ constructor(options?: JudgeOptions);
167
+ /**
168
+ * Build the appropriate prompt based on evaluation type (protected, so subclasses may override it).
169
+ */
170
+ protected buildPrompt(output: string, expected: string | null, criteria: string | null, input: string): string;
171
+ /**
172
+ * Get LLM provider lazily (the resolved provider is typed as any).
173
+ */
174
+ protected getProvider(): Promise<any>;
175
+ /**
176
+ * Get output from an Agent for the given input, resolved as a string.
177
+ */
178
+ protected getAgentOutput(agent: any, input: string): Promise<string>;
179
+ /**
180
+ * Get output from Agents (multi-agent) for the given input, resolved as a string.
181
+ */
182
+ protected getAgentsOutput(agents: any, input: string): Promise<string>;
183
+ /**
184
+ * Judge an output.
185
+ *
186
+ * @param options - Evaluation options (output, expected, criteria, input, agent, agents, printSummary)
187
+ * @returns JudgeResult with score, passed, reasoning, suggestions
188
+ */
189
+ run(options: JudgeRunOptions): Promise<JudgeResult>;
190
+ /**
191
+ * Judge an output asynchronously (alias for run).
192
+ */
193
+ runAsync(options: JudgeRunOptions): Promise<JudgeResult>;
194
+ /**
195
+ * Print a summary of the given judge result; returns nothing.
196
+ */
197
+ printSummary(result: JudgeResult): void;
198
+ }
199
+ /**
200
+ * Judge for accuracy evaluation (comparing output to expected). Declares no members beyond those inherited from Judge.
201
+ */
202
+ export declare class AccuracyJudge extends Judge {
203
+ }
204
+ /**
205
+ * Judge for criteria-based evaluation. Declares no members beyond those inherited from Judge.
206
+ */
207
+ export declare class CriteriaJudge extends Judge {
208
+ }
209
+ /**
210
+ * Judge for evaluating recipe/workflow execution traces. Adds a readonly mode and overrides buildPrompt.
211
+ */
212
+ export declare class RecipeJudge extends Judge {
213
+ readonly mode: string;
214
+ constructor(options?: JudgeOptions & {
215
+ mode?: string;
216
+ });
217
+ protected buildPrompt(output: string, expected: string | null, criteria: string | null, input: string): string;
218
+ }
219
+ type JudgeConstructor = new (options?: JudgeOptions) => Judge; // Constructor signature accepted by addJudge and returned by getJudge.
220
+ /**
221
+ * Register a custom judge type.
222
+ *
223
+ * @param name - Name to register the judge type under
224
+ * @param judgeClass - Judge class to register (constructible with optional JudgeOptions)
225
+ */
226
+ export declare function addJudge(name: string, judgeClass: JudgeConstructor): void;
227
+ /**
228
+ * Get a registered judge type by name.
229
+ *
230
+ * @param name - Name of the judge type to look up
231
+ * @returns Judge class (constructor), or undefined if not found
232
+ */
233
+ export declare function getJudge(name: string): JudgeConstructor | undefined;
234
+ /**
235
+ * List all registered judge types.
236
+ *
237
+ * @returns Array of registered judge type names
238
+ */
239
+ export declare function listJudges(): string[];
240
+ /**
241
+ * Remove a registered judge type.
242
+ *
243
+ * @param name - Name of the judge type to remove
244
+ * @returns true if removed, false if not found
245
+ */
246
+ export declare function removeJudge(name: string): boolean;
247
+ type OptimizationRuleConstructor = new (...args: any[]) => any; // Accepts any constructor; no rule interface is enforced by this type.
248
+ /**
249
+ * Register a custom optimization rule.
250
+ *
251
+ * @param name - Name to register the rule under
252
+ * @param ruleClass - Rule class, expected to implement OptimizationRuleProtocol (the declared type accepts any constructor)
253
+ */
254
+ export declare function addOptimizationRule(name: string, ruleClass: OptimizationRuleConstructor): void;
255
+ /**
256
+ * Get a registered optimization rule by name.
257
+ *
258
+ * @param name - Name of the rule to look up
259
+ * @returns Rule class (constructor), or undefined if not found
260
+ */
261
+ export declare function getOptimizationRule(name: string): OptimizationRuleConstructor | undefined;
262
+ /**
263
+ * List all registered optimization rules.
264
+ *
265
+ * @returns Array of registered rule names
266
+ */
267
+ export declare function listOptimizationRules(): string[];
268
+ /**
269
+ * Remove a registered optimization rule.
270
+ *
271
+ * @param name - Name of the rule to remove
272
+ * @returns true if removed, false if not found
273
+ */
274
+ export declare function removeOptimizationRule(name: string): boolean;
275
+ export default Judge;