praisonai 1.5.3 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -12
- package/dist/agent/index.d.ts +2 -2
- package/dist/agent/index.js +2 -1
- package/dist/agent/proxy.d.ts +11 -1
- package/dist/agent/proxy.js +16 -6
- package/dist/agent/simple.d.ts +28 -11
- package/dist/agent/simple.js +17 -14
- package/dist/agent/types.d.ts +7 -3
- package/dist/agent/types.js +6 -6
- package/dist/cli/commands/eval.d.ts +2 -0
- package/dist/cli/commands/eval.js +58 -3
- package/dist/cli/commands/flow.js +2 -2
- package/dist/cli/features/flow-display.d.ts +1 -1
- package/dist/cli/features/flow-display.js +2 -2
- package/dist/eval/index.d.ts +1 -0
- package/dist/eval/index.js +16 -1
- package/dist/eval/judge.d.ts +275 -0
- package/dist/eval/judge.js +528 -0
- package/dist/index.d.ts +8 -6
- package/dist/index.js +38 -12
- package/dist/llm/providers/registry.js +22 -9
- package/dist/os/agentos.d.ts +145 -0
- package/dist/os/agentos.js +268 -0
- package/dist/os/config.d.ts +65 -0
- package/dist/os/config.js +50 -0
- package/dist/os/index.d.ts +31 -0
- package/dist/os/index.js +37 -0
- package/dist/os/protocols.d.ts +55 -0
- package/dist/os/protocols.js +21 -0
- package/dist/workflows/index.d.ts +28 -8
- package/dist/workflows/index.js +29 -9
- package/dist/workflows/loop.js +1 -1
- package/dist/workflows/repeat.js +1 -1
- package/dist/workflows/yaml-parser.d.ts +2 -2
- package/dist/workflows/yaml-parser.js +1 -1
- package/package.json +2 -2
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
/**
 * Unified Judge class for LLM-as-judge evaluation.
 *
 * Provides a simple, unified API for evaluating agent outputs using LLM-as-judge.
 * Follows PraisonAI naming conventions and engineering principles.
 *
 * DRY: Reuses existing provider infrastructure.
 * Protocol-driven: Implements JudgeProtocol for extensibility.
 * Zero performance impact: Lazy imports for LLM providers.
 *
 * @example
 * ```typescript
 * import { Judge } from 'praisonai';
 * const result = await new Judge().run({ output: "4", expected: "4" });
 * console.log(`Score: ${result.score}/10`);
 * ```
 */
|
|
18
|
+
/**
 * Configuration for Judge instances.
 *
 * Every field is optional. The defaults quoted on each field are the ones
 * stated by the package's own documentation; the code that applies them is
 * not part of this declaration file.
 */
export interface JudgeConfig {
    /** LLM model to use for judging (default: gpt-4o-mini) */
    model?: string;
    /** Temperature for LLM calls (default: 0.1 for consistency) */
    temperature?: number;
    /** Maximum tokens for LLM response */
    maxTokens?: number;
    /** Score threshold for passing (default: 7.0) */
    threshold?: number;
    /** Optional custom criteria for evaluation */
    criteria?: string;
}
|
|
33
|
+
/**
 * Dynamic criteria configuration for domain-agnostic judging.
 *
 * Enables judges to evaluate ANY domain, not just agent outputs:
 * - Water flow optimization
 * - Data pipeline efficiency
 * - Manufacturing quality
 * - Recipe/workflow optimization
 * - Any custom domain
 *
 * All fields except `threshold` are required.
 */
export interface JudgeCriteriaConfig {
    /** Name of the criteria configuration */
    name: string;
    /** Description of what is being evaluated */
    description: string;
    /** Custom prompt template with {output} placeholder */
    promptTemplate: string;
    /** List of dimensions to score (e.g., ["efficiency", "safety"]) */
    scoringDimensions: string[];
    /** Score threshold for passing (default: 7.0) */
    threshold?: number;
}
|
|
55
|
+
/**
 * Result from a Judge evaluation.
 *
 * This is the unified result type for all LLM-as-judge evaluations.
 */
export interface JudgeResult {
    /** Quality score (1-10) */
    score: number;
    /** Whether the evaluation passed (score >= threshold) */
    passed: boolean;
    /** Explanation for the score */
    reasoning: string;
    /** The output that was judged */
    output: string;
    /** Optional expected output */
    expected?: string;
    /** Optional criteria used for evaluation */
    criteria?: string;
    /** List of improvement suggestions */
    suggestions: string[];
    /**
     * When judging occurred.
     * NOTE(review): the unit (e.g. milliseconds vs. seconds since the epoch)
     * is not stated in this declaration — confirm against the implementation.
     */
    timestamp: number;
    /** Additional metadata */
    metadata?: Record<string, any>;
}
|
|
80
|
+
/**
 * Options for the Judge `run()` method.
 *
 * Every field is optional at the type level; see the note on `output` for the
 * one documented requirement.
 */
export interface JudgeRunOptions {
    /** The output to judge (required if no agent) */
    output?: string;
    /** Optional expected output for accuracy evaluation */
    expected?: string;
    /** Optional criteria for criteria evaluation */
    criteria?: string;
    /** Optional input context */
    input?: string;
    /** Optional Agent to run and judge */
    agent?: any;
    /** Optional Agents to run and judge */
    agents?: any;
    /** Whether to print result summary */
    printSummary?: boolean;
}
|
|
99
|
+
/**
 * Constructor options for Judge class.
 *
 * `model`, `temperature`, `maxTokens`, `threshold` and `criteria` repeat the
 * fields of JudgeConfig, which can also be passed whole via `config`.
 * NOTE(review): which value wins when a field is given both directly and
 * inside `config` is not visible in this declaration file — confirm against
 * the implementation.
 */
export interface JudgeOptions {
    /** LLM model to use */
    model?: string;
    /** Temperature for LLM calls */
    temperature?: number;
    /** Maximum tokens for response */
    maxTokens?: number;
    /** Score threshold for passing */
    threshold?: number;
    /** Custom criteria for evaluation */
    criteria?: string;
    /** Full JudgeConfig object */
    config?: JudgeConfig;
    /** Domain-agnostic criteria config */
    criteriaConfig?: JudgeCriteriaConfig;
    /** Session ID for trace isolation */
    sessionId?: string;
}
|
|
120
|
+
/**
 * Protocol interface for Judge implementations.
 *
 * Both methods accept the same options object and resolve to a JudgeResult.
 */
export interface JudgeProtocol {
    /** Evaluate according to `options` and resolve to the verdict. */
    run(options: JudgeRunOptions): Promise<JudgeResult>;
    /** Same parameter and return types as `run`. */
    runAsync(options: JudgeRunOptions): Promise<JudgeResult>;
}
|
|
127
|
+
/**
 * Parse LLM response into JudgeResult.
 *
 * NOTE(review): the response format this parser expects is defined by the
 * implementation, which is not part of this declaration file.
 *
 * @param responseText - Raw LLM response
 * @param output - Original output
 * @param expected - Original expected output, or `null` when none was supplied
 * @param criteria - Original criteria, or `null` when none was supplied
 * @param threshold - Score threshold for passing
 * @returns JudgeResult with score, passed, reasoning, suggestions
 */
export declare function parseJudgeResponse(responseText: string, output: string, expected: string | null, criteria: string | null, threshold: number): JudgeResult;
|
|
138
|
+
/**
 * Unified LLM-as-judge for evaluating agent outputs.
 *
 * Provides a simple API for:
 * - Accuracy evaluation (comparing output to expected)
 * - Criteria evaluation (evaluating against custom criteria)
 * - Custom evaluation (subclass for domain-specific judges)
 *
 * The readonly fields below are non-optional even though the matching
 * JudgeOptions fields are optional.
 * NOTE(review): presumably the constructor fills in defaults (or `null`) —
 * confirm against the implementation.
 *
 * @example
 * ```typescript
 * // Simple accuracy check
 * const result = await new Judge().run({ output: "4", expected: "4" });
 *
 * // Custom criteria
 * const result = await new Judge({ criteria: "Response is helpful" }).run({ output: "Hello!" });
 *
 * // With agent
 * const result = await new Judge().run({ agent: myAgent, input: "2+2", expected: "4" });
 * ```
 */
export declare class Judge implements JudgeProtocol {
    /** LLM model used for judging. */
    readonly model: string;
    /** Temperature used for LLM calls. */
    readonly temperature: number;
    /** Maximum tokens for the LLM response. */
    readonly maxTokens: number;
    /** Score threshold for passing. */
    readonly threshold: number;
    /** Custom criteria, or `null`. */
    readonly criteria: string | null;
    /** Domain-agnostic criteria config, or `null`. */
    readonly criteriaConfig: JudgeCriteriaConfig | null;
    /** Session ID, or `null`. */
    readonly sessionId: string | null;
    /**
     * Create a judge.
     *
     * @param options - Optional settings; may be omitted entirely
     */
    constructor(options?: JudgeOptions);
    /**
     * Build the appropriate prompt based on evaluation type.
     *
     * Declared `protected`, so subclasses can redeclare it (RecipeJudge does).
     */
    protected buildPrompt(output: string, expected: string | null, criteria: string | null, input: string): string;
    /**
     * Get LLM provider lazily.
     */
    protected getProvider(): Promise<any>;
    /**
     * Get output from an Agent.
     */
    protected getAgentOutput(agent: any, input: string): Promise<string>;
    /**
     * Get output from Agents (multi-agent).
     */
    protected getAgentsOutput(agents: any, input: string): Promise<string>;
    /**
     * Judge an output.
     *
     * @param options - Evaluation options
     * @returns JudgeResult with score, passed, reasoning, suggestions
     */
    run(options: JudgeRunOptions): Promise<JudgeResult>;
    /**
     * Judge an output asynchronously (alias for run).
     */
    runAsync(options: JudgeRunOptions): Promise<JudgeResult>;
    /**
     * Print a summary of the judge result.
     */
    printSummary(result: JudgeResult): void;
}
|
|
199
|
+
/**
 * Judge for accuracy evaluation (comparing output to expected).
 *
 * Declares no members of its own; its typed surface is exactly that of Judge.
 */
export declare class AccuracyJudge extends Judge {
}
|
|
204
|
+
/**
 * Judge for criteria-based evaluation.
 *
 * Declares no members of its own; its typed surface is exactly that of Judge.
 */
export declare class CriteriaJudge extends Judge {
}
|
|
209
|
+
/**
 * Judge for evaluating recipe/workflow execution traces.
 */
export declare class RecipeJudge extends Judge {
    /**
     * Evaluation mode.
     * NOTE(review): the accepted values are not visible in this declaration
     * file — confirm against the implementation.
     */
    readonly mode: string;
    /**
     * Create a recipe judge.
     *
     * @param options - Every JudgeOptions field plus an optional `mode`
     */
    constructor(options?: JudgeOptions & {
        mode?: string;
    });
    /**
     * Redeclares the prompt builder with the same signature as the one on Judge.
     */
    protected buildPrompt(output: string, expected: string | null, criteria: string | null, input: string): string;
}
|
|
219
|
+
/** Constructor signature used for judge classes by addJudge (parameter) and getJudge (return value). */
type JudgeConstructor = new (options?: JudgeOptions) => Judge;
|
|
220
|
+
/**
 * Register a custom judge type.
 *
 * NOTE(review): whether registering an already-used name overwrites or is
 * rejected is not visible in this declaration file.
 *
 * @param name - Name for the judge type
 * @param judgeClass - Judge class to register
 */
export declare function addJudge(name: string, judgeClass: JudgeConstructor): void;
|
|
227
|
+
/**
 * Get a registered judge type by name.
 *
 * @param name - Name of the judge type
 * @returns Judge class or undefined if not found
 */
export declare function getJudge(name: string): JudgeConstructor | undefined;
|
|
234
|
+
/**
 * List all registered judge types.
 *
 * @returns List of judge type names
 */
export declare function listJudges(): string[];
|
|
240
|
+
/**
 * Remove a registered judge type.
 *
 * @param name - Name of the judge type to remove
 * @returns True if removed, false if not found
 */
export declare function removeJudge(name: string): boolean;
|
|
247
|
+
/** Constructor signature used for optimization-rule classes; it accepts any constructor arguments and yields an untyped (`any`) instance. */
type OptimizationRuleConstructor = new (...args: any[]) => any;
|
|
248
|
+
/**
 * Register a custom optimization rule.
 *
 * @param name - Name for the rule
 * @param ruleClass - Rule class implementing OptimizationRuleProtocol. That
 *   protocol is not declared in this file; the parameter type itself only
 *   requires a constructor.
 */
export declare function addOptimizationRule(name: string, ruleClass: OptimizationRuleConstructor): void;
|
|
255
|
+
/**
 * Get a registered optimization rule by name.
 *
 * @param name - Name of the rule
 * @returns Rule class or undefined if not found
 */
export declare function getOptimizationRule(name: string): OptimizationRuleConstructor | undefined;
|
|
262
|
+
/**
 * List all registered optimization rules.
 *
 * @returns List of rule names
 */
export declare function listOptimizationRules(): string[];
|
|
268
|
+
/**
 * Remove a registered optimization rule.
 *
 * @param name - Name of the rule to remove
 * @returns True if removed, false if not found
 */
export declare function removeOptimizationRule(name: string): boolean;
|
|
275
|
+
/** Judge is the module's default export, in addition to being exported by name on its class declaration. */
export default Judge;
|