praisonai 1.5.3 → 1.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,8 @@ export interface EvalOptions {
7
7
  iterations?: number;
8
8
  warmup?: number;
9
9
  'expected-tools'?: string;
10
+ criteria?: string;
11
+ threshold?: number;
10
12
  model?: string;
11
13
  verbose?: boolean;
12
14
  profile?: string;
@@ -45,12 +45,12 @@ const cli_spec_1 = require("../spec/cli-spec");
45
45
  const errors_1 = require("../output/errors");
46
46
  async function execute(args, options) {
47
47
  const subcommand = args[0];
48
- if (!subcommand || !['accuracy', 'performance', 'reliability'].includes(subcommand)) {
48
+ if (!subcommand || !['accuracy', 'performance', 'reliability', 'judge'].includes(subcommand)) {
49
49
  if (options.json || options.output === 'json') {
50
- (0, json_1.printError)(errors_1.ERROR_CODES.INVALID_ARGS, 'Please specify a subcommand: accuracy, performance, or reliability');
50
+ (0, json_1.printError)(errors_1.ERROR_CODES.INVALID_ARGS, 'Please specify a subcommand: accuracy, performance, reliability, or judge');
51
51
  }
52
52
  else {
53
- await pretty.error('Please specify a subcommand: accuracy, performance, or reliability');
53
+ await pretty.error('Please specify a subcommand: accuracy, performance, reliability, or judge');
54
54
  }
55
55
  process.exit(cli_spec_1.EXIT_CODES.INVALID_ARGUMENTS);
56
56
  }
@@ -72,6 +72,9 @@ async function execute(args, options) {
72
72
  case 'reliability':
73
73
  await runReliabilityEval(options, config, outputFormat);
74
74
  break;
75
+ case 'judge':
76
+ await runJudgeEval(options, config, outputFormat);
77
+ break;
75
78
  }
76
79
  }
77
80
  catch (error) {
@@ -245,3 +248,55 @@ function calculateSimilarity(str1, str2) {
245
248
  const union = new Set([...words1, ...words2]);
246
249
  return intersection.length / union.size;
247
250
  }
/**
 * Handle the `judge` subcommand: score a supplied text with the LLM-as-Judge
 * evaluator and report the verdict.
 *
 * Reads `options.input` (the text being judged), `options.expected`,
 * `options.criteria` and `options.threshold` (7.0 when nullish); the judge
 * model comes from `config.model`.
 *
 * @param options - Parsed CLI options for the eval command
 * @param config - Resolved configuration; only `model` is read here
 * @param outputFormat - `'json'` emits a JSON envelope, anything else pretty-prints
 * @throws {Error} When `options.input` is missing or empty
 */
async function runJudgeEval(options, config, outputFormat) {
    // Loaded on demand so the other subcommands never pay for the judge module.
    const judgeModule = await Promise.resolve().then(() => __importStar(require('../../eval/judge')));
    const JudgeClass = judgeModule.Judge;
    const textToJudge = options.input;
    if (!textToJudge) {
        throw new Error('--input is required for judge evaluation (the output to judge)');
    }
    const startedAt = Date.now();
    const threshold = options.threshold ?? 7.0;
    const evaluator = new JudgeClass({
        model: config.model,
        threshold,
        criteria: options.criteria,
    });
    const verdict = await evaluator.run({
        output: textToJudge,
        expected: options.expected,
        criteria: options.criteria,
    });
    const elapsedMs = Date.now() - startedAt;
    if (outputFormat === 'json') {
        // Machine-readable envelope: verdict fields plus timing/model metadata.
        (0, json_1.outputJson)((0, json_1.formatSuccess)({
            type: 'judge',
            threshold,
            result: {
                score: verdict.score,
                passed: verdict.passed,
                reasoning: verdict.reasoning,
                suggestions: verdict.suggestions,
            }
        }, { duration_ms: elapsedMs, model: config.model }));
        return;
    }
    await pretty.heading('LLM-as-Judge Evaluation Results');
    await pretty.keyValue({
        'Score': `${verdict.score.toFixed(1)}/10`,
        'Status': verdict.passed ? '✅ PASSED' : '❌ FAILED',
        'Threshold': threshold,
        'Reasoning': verdict.reasoning,
        'Duration': `${elapsedMs}ms`
    });
    if (verdict.suggestions.length > 0) {
        await pretty.heading('Suggestions');
        for (const tip of verdict.suggestions) {
            console.log(` • ${tip}`);
        }
    }
}
@@ -61,3 +61,4 @@ export declare class EvalSuite {
61
61
  }
62
62
  export { Evaluator, createEvaluator, createDefaultEvaluator, relevanceCriterion, lengthCriterion, containsKeywordsCriterion, noHarmfulContentCriterion, type EvalCriteria, type EvalResult as BaseEvalResult, type EvalSummary, type EvaluatorConfig, } from './base';
63
63
  export { EvalResults, createEvalResults, type TestResult, type AggregatedResults, type TrendPoint, } from './results';
64
+ export { Judge, AccuracyJudge, CriteriaJudge, RecipeJudge, addJudge, getJudge, listJudges, removeJudge, addOptimizationRule, getOptimizationRule, listOptimizationRules, removeOptimizationRule, parseJudgeResponse, type JudgeConfig, type JudgeCriteriaConfig, type JudgeResult, type JudgeRunOptions, type JudgeOptions, type JudgeProtocol, } from './judge';
@@ -3,7 +3,7 @@
3
3
  * Evaluation Framework - Accuracy, Performance, and Reliability evaluation
4
4
  */
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
- exports.createEvalResults = exports.EvalResults = exports.noHarmfulContentCriterion = exports.containsKeywordsCriterion = exports.lengthCriterion = exports.relevanceCriterion = exports.createDefaultEvaluator = exports.createEvaluator = exports.Evaluator = exports.EvalSuite = void 0;
6
+ exports.parseJudgeResponse = exports.removeOptimizationRule = exports.listOptimizationRules = exports.getOptimizationRule = exports.addOptimizationRule = exports.removeJudge = exports.listJudges = exports.getJudge = exports.addJudge = exports.RecipeJudge = exports.CriteriaJudge = exports.AccuracyJudge = exports.Judge = exports.createEvalResults = exports.EvalResults = exports.noHarmfulContentCriterion = exports.containsKeywordsCriterion = exports.lengthCriterion = exports.relevanceCriterion = exports.createDefaultEvaluator = exports.createEvaluator = exports.Evaluator = exports.EvalSuite = void 0;
7
7
  exports.accuracyEval = accuracyEval;
8
8
  exports.performanceEval = performanceEval;
9
9
  exports.reliabilityEval = reliabilityEval;
@@ -168,3 +168,18 @@ Object.defineProperty(exports, "noHarmfulContentCriterion", { enumerable: true,
168
168
  var results_1 = require("./results");
169
169
  Object.defineProperty(exports, "EvalResults", { enumerable: true, get: function () { return results_1.EvalResults; } });
170
170
  Object.defineProperty(exports, "createEvalResults", { enumerable: true, get: function () { return results_1.createEvalResults; } });
171
+ // Re-export Judge (LLM-as-Judge)
172
+ var judge_1 = require("./judge");
173
+ Object.defineProperty(exports, "Judge", { enumerable: true, get: function () { return judge_1.Judge; } });
174
+ Object.defineProperty(exports, "AccuracyJudge", { enumerable: true, get: function () { return judge_1.AccuracyJudge; } });
175
+ Object.defineProperty(exports, "CriteriaJudge", { enumerable: true, get: function () { return judge_1.CriteriaJudge; } });
176
+ Object.defineProperty(exports, "RecipeJudge", { enumerable: true, get: function () { return judge_1.RecipeJudge; } });
177
+ Object.defineProperty(exports, "addJudge", { enumerable: true, get: function () { return judge_1.addJudge; } });
178
+ Object.defineProperty(exports, "getJudge", { enumerable: true, get: function () { return judge_1.getJudge; } });
179
+ Object.defineProperty(exports, "listJudges", { enumerable: true, get: function () { return judge_1.listJudges; } });
180
+ Object.defineProperty(exports, "removeJudge", { enumerable: true, get: function () { return judge_1.removeJudge; } });
181
+ Object.defineProperty(exports, "addOptimizationRule", { enumerable: true, get: function () { return judge_1.addOptimizationRule; } });
182
+ Object.defineProperty(exports, "getOptimizationRule", { enumerable: true, get: function () { return judge_1.getOptimizationRule; } });
183
+ Object.defineProperty(exports, "listOptimizationRules", { enumerable: true, get: function () { return judge_1.listOptimizationRules; } });
184
+ Object.defineProperty(exports, "removeOptimizationRule", { enumerable: true, get: function () { return judge_1.removeOptimizationRule; } });
185
+ Object.defineProperty(exports, "parseJudgeResponse", { enumerable: true, get: function () { return judge_1.parseJudgeResponse; } });
@@ -0,0 +1,275 @@
1
+ /**
2
+ * Unified Judge class for LLM-as-judge evaluation.
3
+ *
4
+ * Provides a simple, unified API for evaluating agent outputs using LLM-as-judge.
5
+ * Follows PraisonAI naming conventions and engineering principles.
6
+ *
7
+ * DRY: Reuses existing provider infrastructure.
8
+ * Protocol-driven: Implements JudgeProtocol for extensibility.
9
+ * Zero performance impact: Lazy imports for LLM providers.
10
+ *
11
+ * @example
12
+ * ```typescript
13
+ * import { Judge } from 'praisonai';
14
+ * const result = await new Judge().run({ output: "4", expected: "4" });
15
+ * console.log(`Score: ${result.score}/10`);
16
+ * ```
17
+ */
18
+ /**
19
+ * Configuration for Judge instances. When passed as JudgeOptions.config, a field set directly on JudgeOptions takes precedence over the same field here.
20
+ */
21
+ export interface JudgeConfig {
22
+ /** LLM model to use for judging (default: the OPENAI_MODEL_NAME environment variable, else gpt-4o-mini) */
23
+ model?: string;
24
+ /** Temperature for LLM calls (default: 0.1 for consistency) */
25
+ temperature?: number;
26
+ /** Maximum tokens for LLM response (default: 500) */
27
+ maxTokens?: number;
28
+ /** Score threshold for passing (default: 7.0); a result passes when score >= threshold */
29
+ threshold?: number;
30
+ /** Optional custom criteria for evaluation; used when run() is not given its own criteria */
31
+ criteria?: string;
32
+ }
33
+ /**
34
+ * Dynamic criteria configuration for domain-agnostic judging.
35
+ *
36
+ * Enables judges to evaluate ANY domain, not just agent outputs:
37
+ * - Water flow optimization
38
+ * - Data pipeline efficiency
39
+ * - Manufacturing quality
40
+ * - Recipe/workflow optimization
41
+ * - Any custom domain
42
+ */
43
+ export interface JudgeCriteriaConfig {
44
+ /** Name of the criteria configuration */
45
+ name: string;
46
+ /** Description of what is being evaluated; becomes the Judge's criteria text when no explicit criteria is configured */
47
+ description: string;
48
+ /** Custom prompt template; when non-empty it replaces the built-in prompts. Placeholders filled by Judge: {output}, {input}, {input_text}, {expected} */
49
+ promptTemplate: string;
50
+ /** List of dimensions to score (e.g., ["efficiency", "safety"]). NOTE(review): not read by the base Judge prompt builder - confirm where it is consumed */
51
+ scoringDimensions: string[];
52
+ /** Score threshold for passing (default: 7.0); consulted only when neither JudgeOptions.threshold nor config.threshold is set */
53
+ threshold?: number;
54
+ }
55
+ /**
56
+ * Result from a Judge evaluation.
57
+ *
58
+ * This is the unified result type for all LLM-as-judge evaluations.
59
+ */
60
+ export interface JudgeResult {
61
+ /** Quality score (1-10 for a judged output; 0 when no output was provided or the evaluation raised an error) */
62
+ score: number;
63
+ /** Whether the evaluation passed (score >= threshold) */
64
+ passed: boolean;
65
+ /** Explanation for the score, or a description of the failure when the evaluation could not be completed */
66
+ reasoning: string;
67
+ /** The output that was judged */
68
+ output: string;
69
+ /** Optional expected output */
70
+ expected?: string;
71
+ /** Optional criteria used for evaluation */
72
+ criteria?: string;
73
+ /** List of improvement suggestions (empty when the judge gave none) */
74
+ suggestions: string[];
75
+ /** When judging occurred (value of Date.now(), i.e. milliseconds since the Unix epoch) */
76
+ timestamp: number;
77
+ /** Additional metadata */
78
+ metadata?: Record<string, any>;
79
+ }
80
+ /**
81
+ * Options for Judge.run() method
82
+ */
83
+ export interface JudgeRunOptions {
84
+ /** The output to judge (required if no agent); replaced by the agent's response when agent or agents is given */
85
+ output?: string;
86
+ /** Optional expected output for accuracy evaluation; when present the accuracy prompt is used */
87
+ expected?: string;
88
+ /** Optional criteria for criteria evaluation; takes precedence over the criteria configured on the Judge instance */
89
+ criteria?: string;
90
+ /** Optional input context; also the message sent to agent/agents (empty string when omitted) */
91
+ input?: string;
92
+ /** Optional Agent to run and judge; must expose a chat() or start() method */
93
+ agent?: any;
94
+ /** Optional Agents to run and judge; must expose a start() method. Used only when agent is not set */
95
+ agents?: any;
96
+ /** Whether to print result summary to the console after a successful evaluation */
97
+ printSummary?: boolean;
98
+ }
99
+ /**
100
+ * Constructor options for Judge class. Individual fields set here win over the same field in config, which in turn wins over criteriaConfig and the built-in defaults.
101
+ */
102
+ export interface JudgeOptions {
103
+ /** LLM model to use */
104
+ model?: string;
105
+ /** Temperature for LLM calls */
106
+ temperature?: number;
107
+ /** Maximum tokens for response */
108
+ maxTokens?: number;
109
+ /** Score threshold for passing */
110
+ threshold?: number;
111
+ /** Custom criteria for evaluation */
112
+ criteria?: string;
113
+ /** Full JudgeConfig object; supplies any of the fields above that are not set directly */
114
+ config?: JudgeConfig;
115
+ /** Domain-agnostic criteria config; supplies a custom prompt template plus fallback threshold and criteria text */
116
+ criteriaConfig?: JudgeCriteriaConfig;
117
+ /** Session ID for trace isolation; stored on the instance. NOTE(review): not read by the base Judge methods - confirm the consumer */
118
+ sessionId?: string;
119
+ }
120
+ /**
121
+ * Protocol interface for Judge implementations. Both methods accept the same options and resolve to a JudgeResult; in the built-in Judge, runAsync simply delegates to run.
122
+ */
123
+ export interface JudgeProtocol {
124
+ run(options: JudgeRunOptions): Promise<JudgeResult>;
125
+ runAsync(options: JudgeRunOptions): Promise<JudgeResult>;
126
+ }
127
+ /**
128
+ * Parse LLM response into JudgeResult. Recognizes lines beginning with `SCORE:`, `REASONING:` and `SUGGESTIONS:`, plus `-` bullet lines after the suggestions marker.
129
+ *
130
+ * @param responseText - Raw LLM response
131
+ * @param output - Original output (copied into the result unchanged)
132
+ * @param expected - Original expected output (null is reported as undefined in the result)
133
+ * @param criteria - Original criteria (null is reported as undefined in the result)
134
+ * @param threshold - Score threshold for passing (passed is score >= threshold)
135
+ * @returns JudgeResult with score, passed, reasoning, suggestions; score is 5.0 when no SCORE line is present and numeric scores are clamped to the 1-10 range
136
+ */
137
+ export declare function parseJudgeResponse(responseText: string, output: string, expected: string | null, criteria: string | null, threshold: number): JudgeResult;
138
+ /**
139
+ * Unified LLM-as-judge for evaluating agent outputs.
140
+ *
141
+ * Provides a simple API for:
142
+ * - Accuracy evaluation (comparing output to expected)
143
+ * - Criteria evaluation (evaluating against custom criteria)
144
+ * - Custom evaluation (subclass for domain-specific judges)
145
+ *
146
+ * @example
147
+ * ```typescript
148
+ * // Simple accuracy check
149
+ * const result = await new Judge().run({ output: "4", expected: "4" });
150
+ *
151
+ * // Custom criteria
152
+ * const result = await new Judge({ criteria: "Response is helpful" }).run({ output: "Hello!" });
153
+ *
154
+ * // With agent
155
+ * const result = await new Judge().run({ agent: myAgent, input: "2+2", expected: "4" });
156
+ * ```
157
+ */
158
+ export declare class Judge implements JudgeProtocol {
159
+ readonly model: string;
160
+ readonly temperature: number;
161
+ readonly maxTokens: number;
162
+ readonly threshold: number;
163
+ readonly criteria: string | null;
164
+ readonly criteriaConfig: JudgeCriteriaConfig | null;
165
+ readonly sessionId: string | null;
166
+ constructor(options?: JudgeOptions);
167
+ /**
168
+ * Build the appropriate prompt based on evaluation type.
169
+ */
170
+ protected buildPrompt(output: string, expected: string | null, criteria: string | null, input: string): string;
171
+ /**
172
+ * Get LLM provider lazily.
173
+ */
174
+ protected getProvider(): Promise<any>;
175
+ /**
176
+ * Get output from an Agent.
177
+ */
178
+ protected getAgentOutput(agent: any, input: string): Promise<string>;
179
+ /**
180
+ * Get output from Agents (multi-agent).
181
+ */
182
+ protected getAgentsOutput(agents: any, input: string): Promise<string>;
183
+ /**
184
+ * Judge an output.
185
+ *
186
+ * @param options - Evaluation options
187
+ * @returns JudgeResult with score, passed, reasoning, suggestions
188
+ */
189
+ run(options: JudgeRunOptions): Promise<JudgeResult>;
190
+ /**
191
+ * Judge an output asynchronously (alias for run).
192
+ */
193
+ runAsync(options: JudgeRunOptions): Promise<JudgeResult>;
194
+ /**
195
+ * Print a summary of the judge result.
196
+ */
197
+ printSummary(result: JudgeResult): void;
198
+ }
199
+ /**
200
+ * Judge for accuracy evaluation (comparing output to expected).
201
+ */
202
+ export declare class AccuracyJudge extends Judge {
203
+ }
204
+ /**
205
+ * Judge for criteria-based evaluation.
206
+ */
207
+ export declare class CriteriaJudge extends Judge {
208
+ }
209
+ /**
210
+ * Judge for evaluating recipe/workflow execution traces.
211
+ */
212
+ export declare class RecipeJudge extends Judge {
213
+ readonly mode: string;
214
+ constructor(options?: JudgeOptions & {
215
+ mode?: string;
216
+ });
217
+ protected buildPrompt(output: string, expected: string | null, criteria: string | null, input: string): string;
218
+ }
219
+ type JudgeConstructor = new (options?: JudgeOptions) => Judge;
220
+ /**
221
+ * Register a custom judge type.
222
+ *
223
+ * @param name - Name for the judge type
224
+ * @param judgeClass - Judge class to register
225
+ */
226
+ export declare function addJudge(name: string, judgeClass: JudgeConstructor): void;
227
+ /**
228
+ * Get a registered judge type by name.
229
+ *
230
+ * @param name - Name of the judge type
231
+ * @returns Judge class or undefined if not found
232
+ */
233
+ export declare function getJudge(name: string): JudgeConstructor | undefined;
234
+ /**
235
+ * List all registered judge types.
236
+ *
237
+ * @returns List of judge type names
238
+ */
239
+ export declare function listJudges(): string[];
240
+ /**
241
+ * Remove a registered judge type.
242
+ *
243
+ * @param name - Name of the judge type to remove
244
+ * @returns True if removed, false if not found
245
+ */
246
+ export declare function removeJudge(name: string): boolean;
247
+ type OptimizationRuleConstructor = new (...args: any[]) => any;
248
+ /**
249
+ * Register a custom optimization rule.
250
+ *
251
+ * @param name - Name for the rule
252
+ * @param ruleClass - Rule class implementing OptimizationRuleProtocol
253
+ */
254
+ export declare function addOptimizationRule(name: string, ruleClass: OptimizationRuleConstructor): void;
255
+ /**
256
+ * Get a registered optimization rule by name.
257
+ *
258
+ * @param name - Name of the rule
259
+ * @returns Rule class or undefined if not found
260
+ */
261
+ export declare function getOptimizationRule(name: string): OptimizationRuleConstructor | undefined;
262
+ /**
263
+ * List all registered optimization rules.
264
+ *
265
+ * @returns List of rule names
266
+ */
267
+ export declare function listOptimizationRules(): string[];
268
+ /**
269
+ * Remove a registered optimization rule.
270
+ *
271
+ * @param name - Name of the rule to remove
272
+ * @returns True if removed, false if not found
273
+ */
274
+ export declare function removeOptimizationRule(name: string): boolean;
275
+ export default Judge;
@@ -0,0 +1,528 @@
1
+ "use strict";
2
+ /**
3
+ * Unified Judge class for LLM-as-judge evaluation.
4
+ *
5
+ * Provides a simple, unified API for evaluating agent outputs using LLM-as-judge.
6
+ * Follows PraisonAI naming conventions and engineering principles.
7
+ *
8
+ * DRY: Reuses existing provider infrastructure.
9
+ * Protocol-driven: Implements JudgeProtocol for extensibility.
10
+ * Zero performance impact: Lazy imports for LLM providers.
11
+ *
12
+ * @example
13
+ * ```typescript
14
+ * import { Judge } from 'praisonai';
15
+ * const result = await new Judge().run({ output: "4", expected: "4" });
16
+ * console.log(`Score: ${result.score}/10`);
17
+ * ```
18
+ */
19
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
20
+ if (k2 === undefined) k2 = k;
21
+ var desc = Object.getOwnPropertyDescriptor(m, k);
22
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
23
+ desc = { enumerable: true, get: function() { return m[k]; } };
24
+ }
25
+ Object.defineProperty(o, k2, desc);
26
+ }) : (function(o, m, k, k2) {
27
+ if (k2 === undefined) k2 = k;
28
+ o[k2] = m[k];
29
+ }));
30
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
31
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
32
+ }) : function(o, v) {
33
+ o["default"] = v;
34
+ });
35
+ var __importStar = (this && this.__importStar) || (function () {
36
+ var ownKeys = function(o) {
37
+ ownKeys = Object.getOwnPropertyNames || function (o) {
38
+ var ar = [];
39
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
40
+ return ar;
41
+ };
42
+ return ownKeys(o);
43
+ };
44
+ return function (mod) {
45
+ if (mod && mod.__esModule) return mod;
46
+ var result = {};
47
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
48
+ __setModuleDefault(result, mod);
49
+ return result;
50
+ };
51
+ })();
52
+ Object.defineProperty(exports, "__esModule", { value: true });
53
+ exports.RecipeJudge = exports.CriteriaJudge = exports.AccuracyJudge = exports.Judge = void 0;
54
+ exports.parseJudgeResponse = parseJudgeResponse;
55
+ exports.addJudge = addJudge;
56
+ exports.getJudge = getJudge;
57
+ exports.listJudges = listJudges;
58
+ exports.removeJudge = removeJudge;
59
+ exports.addOptimizationRule = addOptimizationRule;
60
+ exports.getOptimizationRule = getOptimizationRule;
61
+ exports.listOptimizationRules = listOptimizationRules;
62
+ exports.removeOptimizationRule = removeOptimizationRule;
63
+ // ============================================================================
64
+ // Prompt Templates
65
+ // ============================================================================
66
+ /** Default prompt for accuracy evaluation */
67
+ const ACCURACY_PROMPT = `You are an expert evaluator. Compare the actual output against the expected output.
68
+
69
+ INPUT: {input}
70
+
71
+ EXPECTED OUTPUT:
72
+ {expected}
73
+
74
+ ACTUAL OUTPUT:
75
+ {output}
76
+
77
+ Scoring Guidelines:
78
+ - 10: Perfect match in meaning and completeness
79
+ - 8-9: Very close, minor differences that don't affect correctness
80
+ - 6-7: Mostly correct but missing some details or has minor errors
81
+ - 4-5: Partially correct but significant issues
82
+ - 2-3: Mostly incorrect but shows some understanding
83
+ - 1: Completely wrong or irrelevant
84
+
85
+ Respond in this EXACT format:
86
+ SCORE: [number 1-10]
87
+ REASONING: [brief explanation]
88
+ SUGGESTIONS:
89
+ - [improvement suggestion 1]
90
+ - [improvement suggestion 2]
91
+ `;
92
+ /** Default prompt for criteria evaluation */
93
+ const CRITERIA_PROMPT = `You are an expert evaluator. Evaluate the output against the given criteria.
94
+
95
+ CRITERIA: {criteria}
96
+
97
+ OUTPUT TO EVALUATE:
98
+ {output}
99
+
100
+ Score the output from 1-10 based on how well it meets the criteria.
101
+ - 10: Perfectly meets all criteria
102
+ - 8-9: Meets criteria very well with minor issues
103
+ - 6-7: Meets most criteria but has some gaps
104
+ - 4-5: Partially meets criteria
105
+ - 2-3: Barely meets criteria
106
+ - 1: Does not meet criteria at all
107
+
108
+ Respond in this EXACT format:
109
+ SCORE: [number 1-10]
110
+ REASONING: [brief explanation]
111
+ SUGGESTIONS:
112
+ - [improvement suggestion 1]
113
+ - [improvement suggestion 2]
114
+ `;
115
+ /** Default prompt for general quality evaluation */
116
+ const GENERAL_PROMPT = `You are an expert evaluator for AI agent outputs. Your task is to grade the quality of an agent's response.
117
+
118
+ INPUT (what the agent was asked):
119
+ {input}
120
+
121
+ AGENT OUTPUT (what the agent responded):
122
+ {output}
123
+
124
+ GRADING CRITERIA:
125
+ - Accuracy: Is the response factually correct?
126
+ - Completeness: Does it fully address the input?
127
+ - Clarity: Is it well-written and easy to understand?
128
+ - Relevance: Does it stay on topic?
129
+
130
+ SCORING GUIDELINES:
131
+ - 10: Perfect - Excellent in all criteria
132
+ - 8-9: Very Good - Minor improvements possible
133
+ - 6-7: Good - Some issues but mostly correct
134
+ - 4-5: Fair - Significant issues
135
+ - 2-3: Poor - Major problems
136
+ - 1: Very Poor - Completely wrong or irrelevant
137
+
138
+ Respond in this EXACT format:
139
+ SCORE: [number 1-10]
140
+ REASONING: [brief explanation of the score]
141
+ SUGGESTIONS:
142
+ - [first suggestion for improvement]
143
+ - [second suggestion if applicable]
144
+ `;
145
+ /** Recipe evaluation prompt */
146
+ const RECIPE_PROMPT = `You are an expert evaluator for AI agent workflow recipes.
147
+
148
+ RECIPE OUTPUT TO EVALUATE:
149
+ {output}
150
+
151
+ EXPECTED BEHAVIOR:
152
+ {expected}
153
+
154
+ EVALUATION CRITERIA:
155
+ {criteria}
156
+
157
+ Evaluate the recipe execution on:
158
+ 1. Task completion (1-10): Did agents complete their assigned tasks?
159
+ 2. Context flow (1-10): Was context properly passed between agents?
160
+ 3. Output quality (1-10): Is the final output useful and accurate?
161
+
162
+ Respond in this EXACT format:
163
+ SCORE: [average of above, 1-10]
164
+ REASONING: [brief explanation]
165
+ SUGGESTIONS:
166
+ - [improvement suggestion 1]
167
+ - [improvement suggestion 2]
168
+ `;
169
+ // ============================================================================
170
+ // Response Parsing
171
+ // ============================================================================
/**
 * Parse an LLM judge response into a JudgeResult.
 *
 * The response is scanned line by line for the markers the prompts ask for:
 * `SCORE:`, `REASONING:`, `SUGGESTIONS:` and, once the suggestions marker has
 * been seen, `-` bullet lines. Suggestions equal to "none" (any case) are
 * dropped.
 *
 * @param responseText - Raw LLM response (a nullish value is treated as empty)
 * @param output - Original output
 * @param expected - Original expected output
 * @param criteria - Original criteria
 * @param threshold - Score threshold for passing
 * @returns JudgeResult with score, passed, reasoning, suggestions. The score
 *   is clamped to 1-10 and stays at the 5.0 default when no numeric score can
 *   be read.
 */
function parseJudgeResponse(responseText, output, expected, criteria, threshold) {
    const SCORE_TAG = 'SCORE:';
    const REASONING_TAG = 'REASONING:';
    const SUGGESTIONS_TAG = 'SUGGESTIONS:';
    let score = 5.0;
    let reasoning = 'Unable to parse response';
    const suggestions = [];
    let inSuggestions = false;
    const lines = String(responseText ?? '').trim().split('\n');
    for (const rawLine of lines) {
        const line = rawLine.trim();
        if (line.startsWith(SCORE_TAG)) {
            // parseFloat never throws; it yields NaN for unparseable text, and NaN
            // would pass straight through Math.max/Math.min. Keep the current
            // score unless a real number was read.
            const parsed = Number.parseFloat(line.slice(SCORE_TAG.length));
            if (!Number.isNaN(parsed)) {
                // Clamp to valid range
                score = Math.max(1.0, Math.min(10.0, parsed));
            }
        }
        else if (line.startsWith(REASONING_TAG)) {
            reasoning = line.slice(REASONING_TAG.length).trim();
        }
        else if (line.startsWith(SUGGESTIONS_TAG)) {
            inSuggestions = true;
            // A suggestion may follow the marker on the same line.
            const rest = line.slice(SUGGESTIONS_TAG.length).trim();
            if (rest && rest.toLowerCase() !== 'none') {
                suggestions.push(rest);
            }
        }
        else if (inSuggestions && line.startsWith('-')) {
            const suggestion = line.replace(/^-\s*/, '').trim();
            if (suggestion && suggestion.toLowerCase() !== 'none') {
                suggestions.push(suggestion);
            }
        }
    }
    return {
        score,
        passed: score >= threshold,
        reasoning,
        output,
        expected: expected ?? undefined,
        criteria: criteria ?? undefined,
        suggestions,
        timestamp: Date.now(),
    };
}
229
+ // ============================================================================
230
+ // Judge Class
231
+ // ============================================================================
/**
 * Fill `{name}` placeholders in a prompt template in a single pass.
 *
 * Values are inserted literally: `$&`, `$1`, `$$` and similar sequences in a
 * value are NOT treated as replacement patterns, and placeholder-looking text
 * inside a value is never substituted again. Placeholders with no entry in
 * `values` are left untouched.
 *
 * @param template - Prompt template containing `{name}` placeholders
 * @param values - Map of placeholder name to replacement text
 * @returns The template with every known placeholder filled in
 */
function fillPromptTemplate(template, values) {
    return template.replace(/\{(\w+)\}/g, (placeholder, key) => (Object.prototype.hasOwnProperty.call(values, key) ? String(values[key]) : placeholder));
}
/**
 * Unified LLM-as-judge for evaluating agent outputs.
 *
 * Provides a simple API for:
 * - Accuracy evaluation (comparing output to expected)
 * - Criteria evaluation (evaluating against custom criteria)
 * - Custom evaluation (subclass for domain-specific judges)
 *
 * @example
 * ```typescript
 * // Simple accuracy check
 * const result = await new Judge().run({ output: "4", expected: "4" });
 *
 * // Custom criteria
 * const result = await new Judge({ criteria: "Response is helpful" }).run({ output: "Hello!" });
 *
 * // With agent
 * const result = await new Judge().run({ agent: myAgent, input: "2+2", expected: "4" });
 * ```
 */
class Judge {
    /**
     * @param options - Judge options. A field set directly on `options` wins
     *   over the same field in `options.config`; `criteriaConfig` supplies the
     *   fallback threshold and criteria text; built-in defaults come last.
     */
    constructor(options = {}) {
        // Use config if provided, otherwise use individual params
        const config = options.config;
        this.model = options.model ?? config?.model ?? process.env.OPENAI_MODEL_NAME ?? 'gpt-4o-mini';
        this.temperature = options.temperature ?? config?.temperature ?? 0.1;
        this.maxTokens = options.maxTokens ?? config?.maxTokens ?? 500;
        this.threshold = options.threshold ?? config?.threshold ?? options.criteriaConfig?.threshold ?? 7.0;
        this.criteria = options.criteria ?? config?.criteria ?? options.criteriaConfig?.description ?? null;
        this.criteriaConfig = options.criteriaConfig ?? null;
        this.sessionId = options.sessionId ?? null;
    }
    /**
     * Build the appropriate prompt based on evaluation type.
     *
     * Selection order: a custom `criteriaConfig.promptTemplate`, then the
     * accuracy prompt when `expected` is given, then the criteria prompt when
     * criteria are available, otherwise the general quality prompt.
     *
     * @param output - Text being judged
     * @param expected - Expected output, or null when not doing accuracy evaluation
     * @param criteria - Per-call criteria, or null to use the instance criteria
     * @param input - Original input; an empty value is rendered as "Not provided"
     * @returns The prompt to send to the LLM
     */
    buildPrompt(output, expected, criteria, input) {
        const inputText = input || 'Not provided';
        // Use criteriaConfig custom prompt template if available
        if (this.criteriaConfig?.promptTemplate) {
            return fillPromptTemplate(this.criteriaConfig.promptTemplate, {
                output,
                input: inputText,
                input_text: inputText,
                expected: expected || 'Not specified',
            });
        }
        // Use instance criteria if not provided
        const effectiveCriteria = criteria ?? this.criteria;
        if (expected != null) {
            // Accuracy mode
            return fillPromptTemplate(ACCURACY_PROMPT, { input: inputText, expected, output });
        }
        if (effectiveCriteria) {
            // Criteria mode
            return fillPromptTemplate(CRITERIA_PROMPT, { criteria: effectiveCriteria, output });
        }
        // Default: general quality evaluation
        return fillPromptTemplate(GENERAL_PROMPT, { input: inputText, output });
    }
    /**
     * Get LLM provider lazily.
     *
     * @returns Provider created for `this.model`
     */
    async getProvider() {
        // Lazy import to avoid performance impact
        const { createProvider } = await Promise.resolve().then(() => __importStar(require('../llm/providers')));
        return createProvider(this.model);
    }
    /**
     * Get output from an Agent.
     *
     * Prefers `agent.chat(input)`; otherwise calls `agent.start(input)` and
     * unwraps a `raw` property when the result is an object that has one.
     *
     * @throws {Error} When the agent has neither a `chat` nor a `start` method
     */
    async getAgentOutput(agent, input) {
        if (typeof agent.chat === 'function') {
            return String(await agent.chat(input));
        }
        if (typeof agent.start === 'function') {
            const result = await agent.start(input);
            if (result && typeof result === 'object' && 'raw' in result) {
                return String(result.raw);
            }
            return String(result);
        }
        throw new Error("Agent must have 'chat' or 'start' method");
    }
    /**
     * Get output from Agents (multi-agent).
     *
     * Calls `agents.start(input)` and unwraps a `raw` property when the result
     * is an object that has one.
     *
     * @throws {Error} When `agents` has no `start` method
     */
    async getAgentsOutput(agents, input) {
        if (typeof agents.start === 'function') {
            const result = await agents.start(input);
            if (result && typeof result === 'object' && 'raw' in result) {
                return String(result.raw);
            }
            return String(result);
        }
        throw new Error("Agents must have 'start' method");
    }
    /**
     * Judge an output.
     *
     * When `agent` (or, failing that, `agents`) is supplied, it is run with
     * `options.input` and its response replaces `options.output`. Errors thrown
     * while running the agent propagate to the caller. Errors from the provider
     * call are converted into a score-0 result whose reasoning carries the
     * message.
     *
     * @param options - Evaluation options
     * @returns JudgeResult with score, passed, reasoning, suggestions
     */
    async run(options) {
        let output = options.output ?? '';
        // Get output from agent if provided
        if (options.agent) {
            output = await this.getAgentOutput(options.agent, options.input ?? '');
        }
        else if (options.agents) {
            output = await this.getAgentsOutput(options.agents, options.input ?? '');
        }
        if (!output) {
            // Nothing to judge: report a failed result without calling the LLM.
            return {
                score: 0,
                passed: false,
                reasoning: 'No output provided to judge',
                output: '',
                expected: options.expected,
                criteria: options.criteria ?? this.criteria ?? undefined,
                suggestions: [],
                timestamp: Date.now(),
            };
        }
        try {
            const provider = await this.getProvider();
            const prompt = this.buildPrompt(output, options.expected ?? null, options.criteria ?? null, options.input ?? '');
            const response = await provider.generateText({
                messages: [{ role: 'user', content: prompt }],
                temperature: this.temperature,
                maxTokens: this.maxTokens,
            });
            const responseText = response.text ?? '';
            const result = parseJudgeResponse(responseText, output, options.expected ?? null, options.criteria ?? this.criteria, this.threshold);
            if (options.printSummary) {
                this.printSummary(result);
            }
            return result;
        }
        catch (error) {
            // Deliberate best-effort: surface the failure as a failed result.
            const errorMessage = error instanceof Error ? error.message : String(error);
            return {
                score: 0,
                passed: false,
                reasoning: `Evaluation error: ${errorMessage}`,
                output,
                expected: options.expected,
                criteria: options.criteria ?? this.criteria ?? undefined,
                suggestions: [],
                timestamp: Date.now(),
            };
        }
    }
    /**
     * Judge an output asynchronously (alias for run).
     */
    async runAsync(options) {
        return this.run(options);
    }
    /**
     * Print a summary of the judge result.
     *
     * @param result - Result to print to the console
     */
    printSummary(result) {
        const status = result.passed ? '✅ PASSED' : '❌ FAILED';
        console.log(`\n=== Judge Result ===`);
        console.log(`Score: ${result.score.toFixed(1)}/10`);
        console.log(`Status: ${status}`);
        console.log(`Reasoning: ${result.reasoning}`);
        if (result.suggestions.length > 0) {
            console.log(`Suggestions:`);
            result.suggestions.forEach(s => console.log(` - ${s}`));
        }
    }
}
412
+ exports.Judge = Judge;
413
+ // ============================================================================
414
+ // Built-in Judge Types
415
+ // ============================================================================
416
/**
 * Judge intended for accuracy evaluation (comparing an output with the
 * expected one).
 *
 * Declares no members of its own; all behaviour is inherited from `Judge`.
 */
class AccuracyJudge extends Judge {}
exports.AccuracyJudge = AccuracyJudge;
422
/**
 * Judge intended for criteria-based evaluation.
 *
 * Declares no members of its own; all behaviour is inherited from `Judge`.
 */
class CriteriaJudge extends Judge {}
exports.CriteriaJudge = CriteriaJudge;
428
/**
 * Judge for evaluating recipe/workflow execution traces.
 *
 * The criteria handed to the base class are always derived from the mode
 * ("Recipe execution quality in <mode> mode"); a `criteria` value present in
 * the constructor options is overridden by it.
 */
class RecipeJudge extends Judge {
    /**
     * @param options - Judge options, plus:
     *   `mode` (defaults to 'context') and `maxTokens` (defaults to 800).
     */
    constructor(options = {}) {
        // Resolved once so the criteria text and `this.mode` cannot disagree.
        const mode = options.mode ?? 'context';
        super({
            ...options,
            criteria: `Recipe execution quality in ${mode} mode`,
            maxTokens: options.maxTokens ?? 800,
        });
        this.mode = mode;
    }
    /**
     * Fill the `{output}`, `{expected}` and `{criteria}` placeholders of
     * RECIPE_PROMPT.
     *
     * @param output - Text being judged
     * @param expected - Expected result; falls back to 'Complete workflow execution'
     * @param criteria - Criteria text; falls back to `this.criteria`, then to a
     *   mode-based default
     * @param input - Accepted for signature compatibility; not used by this prompt
     * @returns The prompt with placeholders substituted
     */
    buildPrompt(output, expected, criteria, input) {
        const values = {
            output,
            expected: expected ?? 'Complete workflow execution',
            criteria: criteria ?? this.criteria ?? `Recipe quality in ${this.mode} mode`,
        };
        // One pass over the template with a function replacer:
        //  - a function's return value is inserted literally, so `$&`, `$'`,
        //    `$$` etc. inside the judged output are not treated as
        //    replacement patterns;
        //  - placeholders that merely appear inside an inserted value are
        //    never re-scanned, so output containing "{expected}" stays intact.
        return RECIPE_PROMPT.replace(/\{(output|expected|criteria)\}/g, (_match, key) => values[key]);
    }
}
exports.RecipeJudge = RecipeJudge;
449
/**
 * Registered judge classes, keyed by lower-cased type name.
 */
const JUDGE_REGISTRY = new Map([
    ['accuracy', AccuracyJudge],
    ['criteria', CriteriaJudge],
    ['recipe', RecipeJudge],
]);
/**
 * Convert a judge type name into its registry key.
 *
 * @param name - Candidate judge type name
 * @returns The lower-cased name, or undefined when `name` is not a string
 */
function judgeRegistryKey(name) {
    return typeof name === 'string' ? name.toLowerCase() : undefined;
}
/**
 * Register a custom judge type. An existing entry with the same
 * (case-insensitive) name is replaced.
 *
 * @param name - Name for the judge type
 * @param judgeClass - Judge class to register
 * @throws TypeError when `name` is not a string
 */
function addJudge(name, judgeClass) {
    const key = judgeRegistryKey(name);
    if (key === undefined) {
        // Same error type the bare `name.toLowerCase()` call produced, but
        // with a message that points at the actual mistake.
        throw new TypeError('Judge name must be a string');
    }
    JUDGE_REGISTRY.set(key, judgeClass);
}
/**
 * Get a registered judge type by name (case-insensitive).
 *
 * @param name - Name of the judge type
 * @returns Judge class or undefined if not found (including non-string names)
 */
function getJudge(name) {
    const key = judgeRegistryKey(name);
    return key === undefined ? undefined : JUDGE_REGISTRY.get(key);
}
/**
 * List all registered judge types.
 *
 * @returns List of judge type names
 */
function listJudges() {
    return Array.from(JUDGE_REGISTRY.keys());
}
/**
 * Remove a registered judge type (case-insensitive).
 *
 * @param name - Name of the judge type to remove
 * @returns True if removed, false if not found (including non-string names)
 */
function removeJudge(name) {
    const key = judgeRegistryKey(name);
    return key === undefined ? false : JUDGE_REGISTRY.delete(key);
}
489
/**
 * Registered optimization rule classes, keyed by lower-cased rule name.
 * Starts empty.
 */
const OPTIMIZATION_RULE_REGISTRY = new Map();
/**
 * Convert a rule name into its registry key.
 *
 * @param name - Candidate rule name
 * @returns The lower-cased name, or undefined when `name` is not a string
 */
function optimizationRuleKey(name) {
    return typeof name === 'string' ? name.toLowerCase() : undefined;
}
/**
 * Register a custom optimization rule. An existing entry with the same
 * (case-insensitive) name is replaced.
 *
 * @param name - Name for the rule
 * @param ruleClass - Rule class implementing OptimizationRuleProtocol
 * @throws TypeError when `name` is not a string
 */
function addOptimizationRule(name, ruleClass) {
    const key = optimizationRuleKey(name);
    if (key === undefined) {
        // Same error type the bare `name.toLowerCase()` call produced, but
        // with a message that points at the actual mistake.
        throw new TypeError('Optimization rule name must be a string');
    }
    OPTIMIZATION_RULE_REGISTRY.set(key, ruleClass);
}
/**
 * Get a registered optimization rule by name (case-insensitive).
 *
 * @param name - Name of the rule
 * @returns Rule class or undefined if not found (including non-string names)
 */
function getOptimizationRule(name) {
    const key = optimizationRuleKey(name);
    return key === undefined ? undefined : OPTIMIZATION_RULE_REGISTRY.get(key);
}
/**
 * List all registered optimization rules.
 *
 * @returns List of rule names
 */
function listOptimizationRules() {
    return Array.from(OPTIMIZATION_RULE_REGISTRY.keys());
}
/**
 * Remove a registered optimization rule (case-insensitive).
 *
 * @param name - Name of the rule to remove
 * @returns True if removed, false if not found (including non-string names)
 */
function removeOptimizationRule(name) {
    const key = optimizationRuleKey(name);
    return key === undefined ? false : OPTIMIZATION_RULE_REGISTRY.delete(key);
}
525
+ // ============================================================================
526
+ // Exports
527
+ // ============================================================================
528
+ exports.default = Judge;
package/dist/index.d.ts CHANGED
@@ -61,7 +61,7 @@ export * from './guardrails';
61
61
  export { Handoff, handoff, handoffFilters, type HandoffConfig, type HandoffContext, type HandoffResult } from './agent/handoff';
62
62
  export { RouterAgent, createRouter, routeConditions, type RouterConfig, type RouteConfig, type RouteContext } from './agent/router';
63
63
  export { ContextAgent, createContextAgent, type ContextAgentConfig, type ContextMessage } from './agent/context';
64
- export { accuracyEval, performanceEval, reliabilityEval, EvalSuite, Evaluator, createEvaluator, createDefaultEvaluator, EvalResults, createEvalResults, relevanceCriterion, lengthCriterion, containsKeywordsCriterion, noHarmfulContentCriterion, type EvalResult, type PerformanceResult, type AccuracyEvalConfig, type PerformanceEvalConfig, type ReliabilityEvalConfig, type EvalCriteria, type EvaluatorConfig, type TestResult, type AggregatedResults } from './eval';
64
+ export { accuracyEval, performanceEval, reliabilityEval, EvalSuite, Evaluator, createEvaluator, createDefaultEvaluator, EvalResults, createEvalResults, relevanceCriterion, lengthCriterion, containsKeywordsCriterion, noHarmfulContentCriterion, Judge, AccuracyJudge, CriteriaJudge, RecipeJudge, addJudge, getJudge, listJudges, removeJudge, addOptimizationRule, getOptimizationRule, listOptimizationRules, removeOptimizationRule, parseJudgeResponse, type EvalResult, type PerformanceResult, type AccuracyEvalConfig, type PerformanceEvalConfig, type ReliabilityEvalConfig, type EvalCriteria, type EvaluatorConfig, type TestResult, type AggregatedResults, type JudgeConfig, type JudgeCriteriaConfig, type JudgeResult, type JudgeRunOptions, type JudgeOptions, } from './eval';
65
65
  export { SkillManager, createSkillManager, parseSkillFile, type Skill, type SkillMetadata, type SkillDiscoveryOptions } from './skills';
66
66
  export { parseArgs, executeCommand, CLI_SPEC_VERSION } from './cli';
67
67
  export { Memory, createMemory } from './memory/memory';
package/dist/index.js CHANGED
@@ -54,14 +54,14 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
54
54
  Object.defineProperty(exports, "__esModule", { value: true });
55
55
  exports.tavilySearch = exports.codeExecution = exports.composeMiddleware = exports.createValidationMiddleware = exports.createTracingMiddleware = exports.createRetryMiddleware = exports.createRateLimitMiddleware = exports.createRedactionMiddleware = exports.createTimeoutMiddleware = exports.createLoggingMiddleware = exports.MissingEnvVarError = exports.MissingDependencyError = exports.ToolsRegistry = exports.resetToolsRegistry = exports.createToolsRegistry = exports.getToolsRegistry = exports.registerBuiltinTools = exports.tools = exports.createDelegator = exports.createSubagentTools = exports.createSubagentTool = exports.SubagentTool = exports.getTool = exports.registerTool = exports.getRegistry = exports.ToolRegistry = exports.tool = exports.FunctionTool = exports.createTool = exports.validateTool = exports.ToolValidationError = exports.BaseTool = exports.setDefaultDbAdapter = exports.getDefaultDbAdapter = exports.createDbAdapter = exports.db = exports.WorkflowStep = exports.repeatPattern = exports.Repeat = exports.loopPattern = exports.Loop = exports.repeat = exports.loop = exports.route = exports.parallel = exports.Workflow = exports.Router = exports.PraisonAIAgents = exports.Agents = exports.Agent = void 0;
56
56
  exports.EvalSuite = exports.reliabilityEval = exports.performanceEval = exports.accuracyEval = exports.createContextAgent = exports.ContextAgent = exports.routeConditions = exports.createRouter = exports.RouterAgent = exports.handoffFilters = exports.handoff = exports.Handoff = exports.createRateLimitPolicy = exports.createApiKeyPolicy = exports.createMCPSecurity = exports.MCPSecurity = exports.createMCPSession = exports.createMCPServer = exports.MCPServer = exports.getMCPTools = exports.createMCPClient = exports.MCPClient = exports.registerLocalTool = exports.registerNpmTool = exports.createCustomTool = exports.registerCustomTool = exports.codeMode = exports.airweaveSearch = exports.bedrockBrowserFill = exports.bedrockBrowserClick = exports.bedrockBrowserNavigate = exports.bedrockCodeInterpreter = exports.valyuCompanyResearch = exports.valyuEconomicsSearch = exports.valyuSecSearch = exports.valyuPatentSearch = exports.valyuBioSearch = exports.valyuPaperSearch = exports.valyuFinanceSearch = exports.valyuWebSearch = exports.superagentVerify = exports.superagentRedact = exports.superagentGuard = exports.firecrawlCrawl = exports.firecrawlScrape = exports.parallelSearch = exports.perplexitySearch = exports.exaSearch = exports.tavilyCrawl = exports.tavilyExtract = void 0;
57
- exports.createWorkflowHooks = exports.WorkflowHooksExecutor = exports.DisplayTypes = exports.clearAllCallbacks = exports.getRegisteredDisplayTypes = exports.hasApprovalCallback = exports.requestApproval = exports.executeCallback = exports.executeSyncCallback = exports.clearApprovalCallback = exports.registerApprovalCallback = exports.unregisterDisplayCallback = exports.registerDisplayCallback = exports.createValidationOperationHooks = exports.createLoggingOperationHooks = exports.createHooksManager = exports.HooksManager = exports.createDocsManager = exports.DocsManager = exports.createSafetyRules = exports.createRulesManager = exports.RulesManager = exports.createEncryptionHooks = exports.createValidationHooks = exports.createLoggingHooks = exports.createMemoryHooks = exports.MemoryHooks = exports.DEFAULT_POLICIES = exports.createLLMSummarizer = exports.createAutoMemory = exports.AutoMemory = exports.createFileMemory = exports.FileMemory = exports.createMemory = exports.Memory = exports.CLI_SPEC_VERSION = exports.executeCommand = exports.parseArgs = exports.parseSkillFile = exports.createSkillManager = exports.SkillManager = exports.noHarmfulContentCriterion = exports.containsKeywordsCriterion = exports.lengthCriterion = exports.relevanceCriterion = exports.createEvalResults = exports.EvalResults = exports.createDefaultEvaluator = exports.createEvaluator = exports.Evaluator = void 0;
58
- exports.AgentEvents = exports.AgentEventBus = exports.EventEmitterPubSub = exports.PubSub = exports.createFileCache = exports.createMemoryCache = exports.FileCache = exports.MemoryCache = exports.BaseCache = exports.createTaskAgent = exports.createPlanningAgent = exports.createPlanStorage = exports.createTodoList = exports.createPlan = exports.TaskAgent = exports.PlanningAgent = exports.PlanStorage = exports.TodoItem = exports.TodoList = exports.PlanStep = exports.Plan = exports.createLLMGuardrail = exports.LLMGuardrail = exports.createPromptExpanderAgent = exports.PromptExpanderAgent = exports.createQueryRewriterAgent = exports.QueryRewriterAgent = exports.createDeepResearchAgent = exports.DeepResearchAgent = exports.createAudioAgent = exports.AudioAgent = exports.createImageAgent = exports.ImageAgent = exports.createAutoAgents = exports.AutoAgents = exports.createHTTPSink = exports.createConsoleSink = exports.createTelemetryIntegration = exports.TelemetryIntegration = exports.createPerformanceMonitor = exports.PerformanceMonitor = exports.createAgentTelemetry = exports.cleanupTelemetry = exports.disableTelemetry = exports.enableTelemetry = exports.getTelemetry = exports.AgentTelemetry = exports.TelemetryCollector = exports.createTimingWorkflowHooks = exports.createLoggingWorkflowHooks = void 0;
59
- exports.GraphRAG = exports.GraphStore = exports.createLLMReranker = exports.createCrossEncoderReranker = exports.createCohereReranker = exports.LLMReranker = exports.CrossEncoderReranker = exports.CohereReranker = exports.BaseReranker = exports.createElevenLabsVoice = exports.createOpenAIVoice = exports.ElevenLabsVoiceProvider = exports.OpenAIVoiceProvider = exports.BaseVoiceProvider = exports.createLangfuseObservability = exports.createMemoryObservability = exports.createConsoleObservability = exports.LangfuseObservabilityProvider = exports.MemoryObservabilityProvider = exports.ConsoleObservabilityProvider = exports.BaseObservabilityProvider = exports.createChromaStore = exports.ChromaVectorStore = exports.createQdrantStore = exports.QdrantVectorStore = exports.createWeaviateStore = exports.WeaviateVectorStore = exports.createPineconeStore = exports.PineconeVectorStore = exports.createMemoryVectorStore = exports.MemoryVectorStore = exports.BaseVectorStore = exports.createPostgresSessionStorage = exports.createMemoryPostgres = exports.createNeonPostgres = exports.PostgresSessionStorage = exports.MemoryPostgresAdapter = exports.NeonPostgresAdapter = exports.createMemoryRedis = exports.createUpstashRedis = exports.MemoryRedisAdapter = exports.UpstashRedisAdapter = exports.createSQLiteAdapter = exports.SQLiteAdapter = exports.validateWorkflowDefinition = exports.loadWorkflowFromFile = exports.createWorkflowFromYAML = exports.parseYAMLWorkflow = exports.createPubSub = exports.createEventBus = void 0;
60
- exports.InteractiveTUI = exports.MODEL_PRICING = exports.formatCost = exports.estimateTokens = exports.createCostTracker = exports.CostTracker = exports.isSlashCommand = exports.executeSlashCommand = exports.parseSlashCommand = exports.registerCommand = exports.createSlashCommandHandler = exports.SlashCommandHandler = exports.ADAPTERS = exports.COMMUNITY_PROVIDERS = exports.PROVIDER_ALIASES = exports.AISDK_PROVIDERS = exports.trace = exports.resetObservabilityAdapter = exports.getObservabilityAdapter = exports.setObservabilityAdapter = exports.clearAdapterCache = exports.createObservabilityAdapter = exports.createConsoleAdapter = exports.ConsoleObservabilityAdapter = exports.createMemoryAdapter = exports.MemoryObservabilityAdapter = exports.noopAdapter = exports.NoopObservabilityAdapter = exports.hasObservabilityToolEnvVar = exports.listObservabilityTools = exports.getObservabilityToolInfo = exports.OBSERVABILITY_TOOLS = exports.registerBuiltinProviders = exports.createProviderRegistry = exports.getDefaultRegistry = exports.listProviders = exports.hasProvider = exports.unregisterProvider = exports.registerProvider = exports.ProviderRegistry = exports.BaseProvider = exports.GoogleProvider = exports.AnthropicProvider = exports.OpenAIProvider = exports.getAvailableProviders = exports.isProviderAvailable = exports.parseModelString = exports.getDefaultProvider = exports.createProvider = exports.createGraphRAG = void 0;
61
- exports.createN8NIntegration = exports.N8NIntegration = exports.externalAgentAsTool = exports.createExternalAgent = exports.getExternalAgentRegistry = exports.GenericExternalAgent = exports.AiderAgent = exports.CodexCliAgent = exports.GeminiCliAgent = exports.ClaudeCodeAgent = exports.BaseExternalAgent = exports.renderWorkflow = exports.createFlowDisplay = exports.FlowDisplay = exports.createFileCheckpointStorage = exports.FileCheckpointStorage = exports.MemoryCheckpointStorage = exports.createCheckpointManager = exports.CheckpointManager = exports.createFileJobStorage = exports.FileJobStorage = exports.MemoryJobStorage = exports.createJobQueue = exports.JobQueue = exports.cronExpressions = exports.createScheduler = exports.Scheduler = exports.MODE_POLICIES = exports.cliApprovalPrompt = exports.createAutonomyManager = exports.AutonomyManager = exports.DEFAULT_BLOCKED_PATHS = exports.DEFAULT_BLOCKED_COMMANDS = exports.CommandValidator = exports.sandboxExec = exports.createSandboxExecutor = exports.SandboxExecutor = exports.createDiffViewer = exports.DiffViewer = exports.createGitManager = exports.GitManager = exports.DEFAULT_IGNORE_PATTERNS = exports.getRepoTree = exports.createRepoMap = exports.RepoMap = exports.createHistoryManager = exports.HistoryManager = exports.createStatusDisplay = exports.StatusDisplay = exports.createInteractiveTUI = void 0;
62
- exports.createAgentLoop = exports.createPagesHandler = exports.createRouteHandler = exports.createNestHandler = exports.createFastifyHandler = exports.createHonoHandler = exports.createExpressHandler = exports.createHttpHandler = exports.mcpToolsToAITools = exports.closeAllMCPClients = exports.closeMCPClient = exports.getMCPClient = exports.createMCP = exports.isDataUrl = exports.isUrl = exports.uint8ArrayToBase64 = exports.base64ToUint8Array = exports.toMessageContent = exports.createMultimodalMessage = exports.createTextPart = exports.createPdfPart = exports.createFilePart = exports.createImagePart = exports.getAICacheStats = exports.clearAICache = exports.applyMiddleware = exports.wrapModel = exports.createAILoggingMiddleware = exports.createCachingMiddleware = exports.resolveModelAlias = exports.hasModelAlias = exports.listModelAliases = exports.MODEL_ALIASES = exports.parseModel = exports.getModel = exports.createModel = exports.functionToTool = exports.createToolSet = exports.defineTool = exports.aiEmbedMany = exports.aiEmbed = exports.aiGenerateImage = exports.aiStreamObject = exports.aiGenerateObject = exports.aiStreamText = exports.aiGenerateText = exports.getQuickContext = exports.createFastContext = exports.FastContext = exports.triggerN8NWebhook = void 0;
63
- exports.createSlackBot = exports.createTelemetrySettings = exports.clearEvents = exports.getEvents = exports.recordEvent = exports.createTelemetryMiddleware = exports.withSpan = exports.createAISpan = exports.getTracer = exports.initOpenTelemetry = exports.isTelemetryEnabled = exports.disableAITelemetry = exports.enableAITelemetry = exports.getTelemetrySettings = exports.configureTelemetry = exports.autoEnableDevTools = exports.createDevToolsMiddleware = exports.getDevToolsUrl = exports.getDevToolsState = exports.isDevToolsEnabled = exports.disableDevTools = exports.enableDevTools = exports.TRANSCRIPTION_MODELS = exports.SPEECH_MODELS = exports.transcribe = exports.generateSpeech = exports.createDangerousPatternChecker = exports.isDangerous = exports.DANGEROUS_PATTERNS = exports.ToolApprovalTimeoutError = exports.ToolApprovalDeniedError = exports.withApproval = exports.setApprovalManager = exports.getApprovalManager = exports.ApprovalManager = exports.pipeUIMessageStreamToResponse = exports.toUIMessageStreamResponse = exports.createApprovalResponse = exports.getToolsNeedingApproval = exports.hasPendingApprovals = exports.createSystemMessage = exports.createTextMessage = exports.safeValidateUIMessages = exports.validateUIMessages = exports.convertToUIMessages = exports.convertToModelMessages = exports.stopWhen = exports.stopWhenNoToolCalls = exports.stopAfterSteps = exports.AgentLoop = void 0;
64
- exports.createCLIApprovalPrompt = exports.createComputerUseAgent = exports.ComputerUseClient = exports.createComputerUse = exports.createPostgresTool = exports.NLPostgresClient = exports.createNLPostgres = exports.parseSlackMessage = exports.verifySlackSignature = exports.SlackBot = void 0;
57
+ exports.createValidationOperationHooks = exports.createLoggingOperationHooks = exports.createHooksManager = exports.HooksManager = exports.createDocsManager = exports.DocsManager = exports.createSafetyRules = exports.createRulesManager = exports.RulesManager = exports.createEncryptionHooks = exports.createValidationHooks = exports.createLoggingHooks = exports.createMemoryHooks = exports.MemoryHooks = exports.DEFAULT_POLICIES = exports.createLLMSummarizer = exports.createAutoMemory = exports.AutoMemory = exports.createFileMemory = exports.FileMemory = exports.createMemory = exports.Memory = exports.CLI_SPEC_VERSION = exports.executeCommand = exports.parseArgs = exports.parseSkillFile = exports.createSkillManager = exports.SkillManager = exports.parseJudgeResponse = exports.removeOptimizationRule = exports.listOptimizationRules = exports.getOptimizationRule = exports.addOptimizationRule = exports.removeJudge = exports.listJudges = exports.getJudge = exports.addJudge = exports.RecipeJudge = exports.CriteriaJudge = exports.AccuracyJudge = exports.Judge = exports.noHarmfulContentCriterion = exports.containsKeywordsCriterion = exports.lengthCriterion = exports.relevanceCriterion = exports.createEvalResults = exports.EvalResults = exports.createDefaultEvaluator = exports.createEvaluator = exports.Evaluator = void 0;
58
+ exports.createPlan = exports.TaskAgent = exports.PlanningAgent = exports.PlanStorage = exports.TodoItem = exports.TodoList = exports.PlanStep = exports.Plan = exports.createLLMGuardrail = exports.LLMGuardrail = exports.createPromptExpanderAgent = exports.PromptExpanderAgent = exports.createQueryRewriterAgent = exports.QueryRewriterAgent = exports.createDeepResearchAgent = exports.DeepResearchAgent = exports.createAudioAgent = exports.AudioAgent = exports.createImageAgent = exports.ImageAgent = exports.createAutoAgents = exports.AutoAgents = exports.createHTTPSink = exports.createConsoleSink = exports.createTelemetryIntegration = exports.TelemetryIntegration = exports.createPerformanceMonitor = exports.PerformanceMonitor = exports.createAgentTelemetry = exports.cleanupTelemetry = exports.disableTelemetry = exports.enableTelemetry = exports.getTelemetry = exports.AgentTelemetry = exports.TelemetryCollector = exports.createTimingWorkflowHooks = exports.createLoggingWorkflowHooks = exports.createWorkflowHooks = exports.WorkflowHooksExecutor = exports.DisplayTypes = exports.clearAllCallbacks = exports.getRegisteredDisplayTypes = exports.hasApprovalCallback = exports.requestApproval = exports.executeCallback = exports.executeSyncCallback = exports.clearApprovalCallback = exports.registerApprovalCallback = exports.unregisterDisplayCallback = exports.registerDisplayCallback = void 0;
59
+ exports.BaseVoiceProvider = exports.createLangfuseObservability = exports.createMemoryObservability = exports.createConsoleObservability = exports.LangfuseObservabilityProvider = exports.MemoryObservabilityProvider = exports.ConsoleObservabilityProvider = exports.BaseObservabilityProvider = exports.createChromaStore = exports.ChromaVectorStore = exports.createQdrantStore = exports.QdrantVectorStore = exports.createWeaviateStore = exports.WeaviateVectorStore = exports.createPineconeStore = exports.PineconeVectorStore = exports.createMemoryVectorStore = exports.MemoryVectorStore = exports.BaseVectorStore = exports.createPostgresSessionStorage = exports.createMemoryPostgres = exports.createNeonPostgres = exports.PostgresSessionStorage = exports.MemoryPostgresAdapter = exports.NeonPostgresAdapter = exports.createMemoryRedis = exports.createUpstashRedis = exports.MemoryRedisAdapter = exports.UpstashRedisAdapter = exports.createSQLiteAdapter = exports.SQLiteAdapter = exports.validateWorkflowDefinition = exports.loadWorkflowFromFile = exports.createWorkflowFromYAML = exports.parseYAMLWorkflow = exports.createPubSub = exports.createEventBus = exports.AgentEvents = exports.AgentEventBus = exports.EventEmitterPubSub = exports.PubSub = exports.createFileCache = exports.createMemoryCache = exports.FileCache = exports.MemoryCache = exports.BaseCache = exports.createTaskAgent = exports.createPlanningAgent = exports.createPlanStorage = exports.createTodoList = void 0;
60
+ exports.COMMUNITY_PROVIDERS = exports.PROVIDER_ALIASES = exports.AISDK_PROVIDERS = exports.trace = exports.resetObservabilityAdapter = exports.getObservabilityAdapter = exports.setObservabilityAdapter = exports.clearAdapterCache = exports.createObservabilityAdapter = exports.createConsoleAdapter = exports.ConsoleObservabilityAdapter = exports.createMemoryAdapter = exports.MemoryObservabilityAdapter = exports.noopAdapter = exports.NoopObservabilityAdapter = exports.hasObservabilityToolEnvVar = exports.listObservabilityTools = exports.getObservabilityToolInfo = exports.OBSERVABILITY_TOOLS = exports.registerBuiltinProviders = exports.createProviderRegistry = exports.getDefaultRegistry = exports.listProviders = exports.hasProvider = exports.unregisterProvider = exports.registerProvider = exports.ProviderRegistry = exports.BaseProvider = exports.GoogleProvider = exports.AnthropicProvider = exports.OpenAIProvider = exports.getAvailableProviders = exports.isProviderAvailable = exports.parseModelString = exports.getDefaultProvider = exports.createProvider = exports.createGraphRAG = exports.GraphRAG = exports.GraphStore = exports.createLLMReranker = exports.createCrossEncoderReranker = exports.createCohereReranker = exports.LLMReranker = exports.CrossEncoderReranker = exports.CohereReranker = exports.BaseReranker = exports.createElevenLabsVoice = exports.createOpenAIVoice = exports.ElevenLabsVoiceProvider = exports.OpenAIVoiceProvider = void 0;
61
+ exports.FlowDisplay = exports.createFileCheckpointStorage = exports.FileCheckpointStorage = exports.MemoryCheckpointStorage = exports.createCheckpointManager = exports.CheckpointManager = exports.createFileJobStorage = exports.FileJobStorage = exports.MemoryJobStorage = exports.createJobQueue = exports.JobQueue = exports.cronExpressions = exports.createScheduler = exports.Scheduler = exports.MODE_POLICIES = exports.cliApprovalPrompt = exports.createAutonomyManager = exports.AutonomyManager = exports.DEFAULT_BLOCKED_PATHS = exports.DEFAULT_BLOCKED_COMMANDS = exports.CommandValidator = exports.sandboxExec = exports.createSandboxExecutor = exports.SandboxExecutor = exports.createDiffViewer = exports.DiffViewer = exports.createGitManager = exports.GitManager = exports.DEFAULT_IGNORE_PATTERNS = exports.getRepoTree = exports.createRepoMap = exports.RepoMap = exports.createHistoryManager = exports.HistoryManager = exports.createStatusDisplay = exports.StatusDisplay = exports.createInteractiveTUI = exports.InteractiveTUI = exports.MODEL_PRICING = exports.formatCost = exports.estimateTokens = exports.createCostTracker = exports.CostTracker = exports.isSlashCommand = exports.executeSlashCommand = exports.parseSlashCommand = exports.registerCommand = exports.createSlashCommandHandler = exports.SlashCommandHandler = exports.ADAPTERS = void 0;
62
+ exports.isDataUrl = exports.isUrl = exports.uint8ArrayToBase64 = exports.base64ToUint8Array = exports.toMessageContent = exports.createMultimodalMessage = exports.createTextPart = exports.createPdfPart = exports.createFilePart = exports.createImagePart = exports.getAICacheStats = exports.clearAICache = exports.applyMiddleware = exports.wrapModel = exports.createAILoggingMiddleware = exports.createCachingMiddleware = exports.resolveModelAlias = exports.hasModelAlias = exports.listModelAliases = exports.MODEL_ALIASES = exports.parseModel = exports.getModel = exports.createModel = exports.functionToTool = exports.createToolSet = exports.defineTool = exports.aiEmbedMany = exports.aiEmbed = exports.aiGenerateImage = exports.aiStreamObject = exports.aiGenerateObject = exports.aiStreamText = exports.aiGenerateText = exports.getQuickContext = exports.createFastContext = exports.FastContext = exports.triggerN8NWebhook = exports.createN8NIntegration = exports.N8NIntegration = exports.externalAgentAsTool = exports.createExternalAgent = exports.getExternalAgentRegistry = exports.GenericExternalAgent = exports.AiderAgent = exports.CodexCliAgent = exports.GeminiCliAgent = exports.ClaudeCodeAgent = exports.BaseExternalAgent = exports.renderWorkflow = exports.createFlowDisplay = void 0;
63
+ exports.getTelemetrySettings = exports.configureTelemetry = exports.autoEnableDevTools = exports.createDevToolsMiddleware = exports.getDevToolsUrl = exports.getDevToolsState = exports.isDevToolsEnabled = exports.disableDevTools = exports.enableDevTools = exports.TRANSCRIPTION_MODELS = exports.SPEECH_MODELS = exports.transcribe = exports.generateSpeech = exports.createDangerousPatternChecker = exports.isDangerous = exports.DANGEROUS_PATTERNS = exports.ToolApprovalTimeoutError = exports.ToolApprovalDeniedError = exports.withApproval = exports.setApprovalManager = exports.getApprovalManager = exports.ApprovalManager = exports.pipeUIMessageStreamToResponse = exports.toUIMessageStreamResponse = exports.createApprovalResponse = exports.getToolsNeedingApproval = exports.hasPendingApprovals = exports.createSystemMessage = exports.createTextMessage = exports.safeValidateUIMessages = exports.validateUIMessages = exports.convertToUIMessages = exports.convertToModelMessages = exports.stopWhen = exports.stopWhenNoToolCalls = exports.stopAfterSteps = exports.AgentLoop = exports.createAgentLoop = exports.createPagesHandler = exports.createRouteHandler = exports.createNestHandler = exports.createFastifyHandler = exports.createHonoHandler = exports.createExpressHandler = exports.createHttpHandler = exports.mcpToolsToAITools = exports.closeAllMCPClients = exports.closeMCPClient = exports.getMCPClient = exports.createMCP = void 0;
64
+ exports.createCLIApprovalPrompt = exports.createComputerUseAgent = exports.ComputerUseClient = exports.createComputerUse = exports.createPostgresTool = exports.NLPostgresClient = exports.createNLPostgres = exports.parseSlackMessage = exports.verifySlackSignature = exports.SlackBot = exports.createSlackBot = exports.createTelemetrySettings = exports.clearEvents = exports.getEvents = exports.recordEvent = exports.createTelemetryMiddleware = exports.withSpan = exports.createAISpan = exports.getTracer = exports.initOpenTelemetry = exports.isTelemetryEnabled = exports.disableAITelemetry = exports.enableAITelemetry = void 0;
65
65
  // ============================================================================
66
66
  // CORE API - The main classes users should use
67
67
  // ============================================================================
@@ -230,6 +230,20 @@ Object.defineProperty(exports, "relevanceCriterion", { enumerable: true, get: fu
230
230
  Object.defineProperty(exports, "lengthCriterion", { enumerable: true, get: function () { return eval_1.lengthCriterion; } });
231
231
  Object.defineProperty(exports, "containsKeywordsCriterion", { enumerable: true, get: function () { return eval_1.containsKeywordsCriterion; } });
232
232
  Object.defineProperty(exports, "noHarmfulContentCriterion", { enumerable: true, get: function () { return eval_1.noHarmfulContentCriterion; } });
233
+ // LLM-as-Judge
234
+ Object.defineProperty(exports, "Judge", { enumerable: true, get: function () { return eval_1.Judge; } });
235
+ Object.defineProperty(exports, "AccuracyJudge", { enumerable: true, get: function () { return eval_1.AccuracyJudge; } });
236
+ Object.defineProperty(exports, "CriteriaJudge", { enumerable: true, get: function () { return eval_1.CriteriaJudge; } });
237
+ Object.defineProperty(exports, "RecipeJudge", { enumerable: true, get: function () { return eval_1.RecipeJudge; } });
238
+ Object.defineProperty(exports, "addJudge", { enumerable: true, get: function () { return eval_1.addJudge; } });
239
+ Object.defineProperty(exports, "getJudge", { enumerable: true, get: function () { return eval_1.getJudge; } });
240
+ Object.defineProperty(exports, "listJudges", { enumerable: true, get: function () { return eval_1.listJudges; } });
241
+ Object.defineProperty(exports, "removeJudge", { enumerable: true, get: function () { return eval_1.removeJudge; } });
242
+ Object.defineProperty(exports, "addOptimizationRule", { enumerable: true, get: function () { return eval_1.addOptimizationRule; } });
243
+ Object.defineProperty(exports, "getOptimizationRule", { enumerable: true, get: function () { return eval_1.getOptimizationRule; } });
244
+ Object.defineProperty(exports, "listOptimizationRules", { enumerable: true, get: function () { return eval_1.listOptimizationRules; } });
245
+ Object.defineProperty(exports, "removeOptimizationRule", { enumerable: true, get: function () { return eval_1.removeOptimizationRule; } });
246
+ Object.defineProperty(exports, "parseJudgeResponse", { enumerable: true, get: function () { return eval_1.parseJudgeResponse; } });
233
247
  // Note: Observability exports are at the bottom of this file with the full 14+ integrations
234
248
  // Export skills
235
249
  var skills_1 = require("./skills");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "praisonai",
3
- "version": "1.5.3",
3
+ "version": "1.5.4",
4
4
  "description": "PraisonAI TypeScript AI Agents Framework - Node.js, npm, and Javascript AI Agents Framework",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",