@juspay/neurolink 9.36.1 → 9.37.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/auth/errors.d.ts +1 -1
- package/dist/auth/middleware/AuthMiddleware.d.ts +1 -1
- package/dist/auth/providers/BaseAuthProvider.d.ts +1 -1
- package/dist/browser/neurolink.min.js +921 -423
- package/dist/cli/commands/evaluate.d.ts +48 -0
- package/dist/cli/commands/evaluate.js +955 -0
- package/dist/cli/parser.js +4 -1
- package/dist/evaluation/BatchEvaluator.d.ts +163 -0
- package/dist/evaluation/BatchEvaluator.js +267 -0
- package/dist/evaluation/EvaluationAggregator.d.ts +272 -0
- package/dist/evaluation/EvaluationAggregator.js +377 -0
- package/dist/evaluation/EvaluatorFactory.d.ts +113 -0
- package/dist/evaluation/EvaluatorFactory.js +280 -0
- package/dist/evaluation/EvaluatorRegistry.d.ts +160 -0
- package/dist/evaluation/EvaluatorRegistry.js +184 -0
- package/dist/evaluation/errors/EvaluationError.d.ts +189 -0
- package/dist/evaluation/errors/EvaluationError.js +206 -0
- package/dist/evaluation/errors/index.d.ts +4 -0
- package/dist/evaluation/errors/index.js +4 -0
- package/dist/evaluation/hooks/index.d.ts +6 -0
- package/dist/evaluation/hooks/index.js +6 -0
- package/dist/evaluation/hooks/langfuseAdapter.d.ts +99 -0
- package/dist/evaluation/hooks/langfuseAdapter.js +172 -0
- package/dist/evaluation/hooks/observabilityHooks.d.ts +129 -0
- package/dist/evaluation/hooks/observabilityHooks.js +181 -0
- package/dist/evaluation/index.d.ts +11 -2
- package/dist/evaluation/index.js +15 -0
- package/dist/evaluation/pipeline/evaluationPipeline.d.ts +114 -0
- package/dist/evaluation/pipeline/evaluationPipeline.js +381 -0
- package/dist/evaluation/pipeline/index.d.ts +8 -0
- package/dist/evaluation/pipeline/index.js +8 -0
- package/dist/evaluation/pipeline/pipelineBuilder.d.ts +126 -0
- package/dist/evaluation/pipeline/pipelineBuilder.js +260 -0
- package/dist/evaluation/pipeline/presets.d.ts +66 -0
- package/dist/evaluation/pipeline/presets.js +224 -0
- package/dist/evaluation/pipeline/strategies/batchStrategy.d.ts +99 -0
- package/dist/evaluation/pipeline/strategies/batchStrategy.js +238 -0
- package/dist/evaluation/pipeline/strategies/index.d.ts +6 -0
- package/dist/evaluation/pipeline/strategies/index.js +6 -0
- package/dist/evaluation/pipeline/strategies/samplingStrategy.d.ts +76 -0
- package/dist/evaluation/pipeline/strategies/samplingStrategy.js +238 -0
- package/dist/evaluation/reporting/index.d.ts +6 -0
- package/dist/evaluation/reporting/index.js +6 -0
- package/dist/evaluation/reporting/metricsCollector.d.ts +147 -0
- package/dist/evaluation/reporting/metricsCollector.js +285 -0
- package/dist/evaluation/reporting/reportGenerator.d.ts +90 -0
- package/dist/evaluation/reporting/reportGenerator.js +374 -0
- package/dist/evaluation/scorers/baseScorer.d.ts +83 -0
- package/dist/evaluation/scorers/baseScorer.js +232 -0
- package/dist/evaluation/scorers/customScorerUtils.d.ts +95 -0
- package/dist/evaluation/scorers/customScorerUtils.js +381 -0
- package/dist/evaluation/scorers/index.d.ts +10 -0
- package/dist/evaluation/scorers/index.js +16 -0
- package/dist/evaluation/scorers/llm/answerRelevancyScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/answerRelevancyScorer.js +99 -0
- package/dist/evaluation/scorers/llm/baseLLMScorer.d.ts +71 -0
- package/dist/evaluation/scorers/llm/baseLLMScorer.js +281 -0
- package/dist/evaluation/scorers/llm/biasDetectionScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/biasDetectionScorer.js +127 -0
- package/dist/evaluation/scorers/llm/contextPrecisionScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/contextPrecisionScorer.js +92 -0
- package/dist/evaluation/scorers/llm/contextRelevancyScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/contextRelevancyScorer.js +107 -0
- package/dist/evaluation/scorers/llm/faithfulnessScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/faithfulnessScorer.js +121 -0
- package/dist/evaluation/scorers/llm/hallucinationScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/hallucinationScorer.js +140 -0
- package/dist/evaluation/scorers/llm/index.d.ts +15 -0
- package/dist/evaluation/scorers/llm/index.js +16 -0
- package/dist/evaluation/scorers/llm/promptAlignmentScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/promptAlignmentScorer.js +106 -0
- package/dist/evaluation/scorers/llm/summarizationScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/summarizationScorer.js +114 -0
- package/dist/evaluation/scorers/llm/toneConsistencyScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/toneConsistencyScorer.js +106 -0
- package/dist/evaluation/scorers/llm/toxicityScorer.d.ts +12 -0
- package/dist/evaluation/scorers/llm/toxicityScorer.js +121 -0
- package/dist/evaluation/scorers/rule/baseRuleScorer.d.ts +77 -0
- package/dist/evaluation/scorers/rule/baseRuleScorer.js +233 -0
- package/dist/evaluation/scorers/rule/contentSimilarityScorer.d.ts +108 -0
- package/dist/evaluation/scorers/rule/contentSimilarityScorer.js +350 -0
- package/dist/evaluation/scorers/rule/formatScorer.d.ts +147 -0
- package/dist/evaluation/scorers/rule/formatScorer.js +470 -0
- package/dist/evaluation/scorers/rule/index.d.ts +9 -0
- package/dist/evaluation/scorers/rule/index.js +10 -0
- package/dist/evaluation/scorers/rule/keywordCoverageScorer.d.ts +83 -0
- package/dist/evaluation/scorers/rule/keywordCoverageScorer.js +347 -0
- package/dist/evaluation/scorers/rule/lengthScorer.d.ts +105 -0
- package/dist/evaluation/scorers/rule/lengthScorer.js +351 -0
- package/dist/evaluation/scorers/scorerBuilder.d.ts +161 -0
- package/dist/evaluation/scorers/scorerBuilder.js +420 -0
- package/dist/evaluation/scorers/scorerRegistry.d.ts +62 -0
- package/dist/evaluation/scorers/scorerRegistry.js +467 -0
- package/dist/index.d.ts +37 -25
- package/dist/index.js +65 -26
- package/dist/lib/auth/providers/BaseAuthProvider.d.ts +1 -1
- package/dist/lib/evaluation/BatchEvaluator.d.ts +163 -0
- package/dist/lib/evaluation/BatchEvaluator.js +268 -0
- package/dist/lib/evaluation/EvaluationAggregator.d.ts +272 -0
- package/dist/lib/evaluation/EvaluationAggregator.js +378 -0
- package/dist/lib/evaluation/EvaluatorFactory.d.ts +113 -0
- package/dist/lib/evaluation/EvaluatorFactory.js +281 -0
- package/dist/lib/evaluation/EvaluatorRegistry.d.ts +160 -0
- package/dist/lib/evaluation/EvaluatorRegistry.js +185 -0
- package/dist/lib/evaluation/errors/EvaluationError.d.ts +189 -0
- package/dist/lib/evaluation/errors/EvaluationError.js +207 -0
- package/dist/lib/evaluation/errors/index.d.ts +4 -0
- package/dist/lib/evaluation/errors/index.js +5 -0
- package/dist/lib/evaluation/hooks/index.d.ts +6 -0
- package/dist/lib/evaluation/hooks/index.js +7 -0
- package/dist/lib/evaluation/hooks/langfuseAdapter.d.ts +99 -0
- package/dist/lib/evaluation/hooks/langfuseAdapter.js +173 -0
- package/dist/lib/evaluation/hooks/observabilityHooks.d.ts +129 -0
- package/dist/lib/evaluation/hooks/observabilityHooks.js +182 -0
- package/dist/lib/evaluation/index.d.ts +11 -2
- package/dist/lib/evaluation/index.js +15 -0
- package/dist/lib/evaluation/pipeline/evaluationPipeline.d.ts +114 -0
- package/dist/lib/evaluation/pipeline/evaluationPipeline.js +382 -0
- package/dist/lib/evaluation/pipeline/index.d.ts +8 -0
- package/dist/lib/evaluation/pipeline/index.js +9 -0
- package/dist/lib/evaluation/pipeline/pipelineBuilder.d.ts +126 -0
- package/dist/lib/evaluation/pipeline/pipelineBuilder.js +261 -0
- package/dist/lib/evaluation/pipeline/presets.d.ts +66 -0
- package/dist/lib/evaluation/pipeline/presets.js +225 -0
- package/dist/lib/evaluation/pipeline/strategies/batchStrategy.d.ts +99 -0
- package/dist/lib/evaluation/pipeline/strategies/batchStrategy.js +239 -0
- package/dist/lib/evaluation/pipeline/strategies/index.d.ts +6 -0
- package/dist/lib/evaluation/pipeline/strategies/index.js +7 -0
- package/dist/lib/evaluation/pipeline/strategies/samplingStrategy.d.ts +76 -0
- package/dist/lib/evaluation/pipeline/strategies/samplingStrategy.js +239 -0
- package/dist/lib/evaluation/reporting/index.d.ts +6 -0
- package/dist/lib/evaluation/reporting/index.js +7 -0
- package/dist/lib/evaluation/reporting/metricsCollector.d.ts +147 -0
- package/dist/lib/evaluation/reporting/metricsCollector.js +286 -0
- package/dist/lib/evaluation/reporting/reportGenerator.d.ts +90 -0
- package/dist/lib/evaluation/reporting/reportGenerator.js +375 -0
- package/dist/lib/evaluation/scorers/baseScorer.d.ts +83 -0
- package/dist/lib/evaluation/scorers/baseScorer.js +233 -0
- package/dist/lib/evaluation/scorers/customScorerUtils.d.ts +95 -0
- package/dist/lib/evaluation/scorers/customScorerUtils.js +382 -0
- package/dist/lib/evaluation/scorers/index.d.ts +10 -0
- package/dist/lib/evaluation/scorers/index.js +17 -0
- package/dist/lib/evaluation/scorers/llm/answerRelevancyScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/answerRelevancyScorer.js +100 -0
- package/dist/lib/evaluation/scorers/llm/baseLLMScorer.d.ts +71 -0
- package/dist/lib/evaluation/scorers/llm/baseLLMScorer.js +282 -0
- package/dist/lib/evaluation/scorers/llm/biasDetectionScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/biasDetectionScorer.js +128 -0
- package/dist/lib/evaluation/scorers/llm/contextPrecisionScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/contextPrecisionScorer.js +93 -0
- package/dist/lib/evaluation/scorers/llm/contextRelevancyScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/contextRelevancyScorer.js +108 -0
- package/dist/lib/evaluation/scorers/llm/faithfulnessScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/faithfulnessScorer.js +122 -0
- package/dist/lib/evaluation/scorers/llm/hallucinationScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/hallucinationScorer.js +141 -0
- package/dist/lib/evaluation/scorers/llm/index.d.ts +15 -0
- package/dist/lib/evaluation/scorers/llm/index.js +17 -0
- package/dist/lib/evaluation/scorers/llm/promptAlignmentScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/promptAlignmentScorer.js +107 -0
- package/dist/lib/evaluation/scorers/llm/summarizationScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/summarizationScorer.js +115 -0
- package/dist/lib/evaluation/scorers/llm/toneConsistencyScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/toneConsistencyScorer.js +107 -0
- package/dist/lib/evaluation/scorers/llm/toxicityScorer.d.ts +12 -0
- package/dist/lib/evaluation/scorers/llm/toxicityScorer.js +122 -0
- package/dist/lib/evaluation/scorers/rule/baseRuleScorer.d.ts +77 -0
- package/dist/lib/evaluation/scorers/rule/baseRuleScorer.js +234 -0
- package/dist/lib/evaluation/scorers/rule/contentSimilarityScorer.d.ts +108 -0
- package/dist/lib/evaluation/scorers/rule/contentSimilarityScorer.js +351 -0
- package/dist/lib/evaluation/scorers/rule/formatScorer.d.ts +147 -0
- package/dist/lib/evaluation/scorers/rule/formatScorer.js +471 -0
- package/dist/lib/evaluation/scorers/rule/index.d.ts +9 -0
- package/dist/lib/evaluation/scorers/rule/index.js +11 -0
- package/dist/lib/evaluation/scorers/rule/keywordCoverageScorer.d.ts +83 -0
- package/dist/lib/evaluation/scorers/rule/keywordCoverageScorer.js +348 -0
- package/dist/lib/evaluation/scorers/rule/lengthScorer.d.ts +105 -0
- package/dist/lib/evaluation/scorers/rule/lengthScorer.js +352 -0
- package/dist/lib/evaluation/scorers/scorerBuilder.d.ts +161 -0
- package/dist/lib/evaluation/scorers/scorerBuilder.js +421 -0
- package/dist/lib/evaluation/scorers/scorerRegistry.d.ts +62 -0
- package/dist/lib/evaluation/scorers/scorerRegistry.js +468 -0
- package/dist/lib/index.d.ts +37 -25
- package/dist/lib/index.js +65 -26
- package/dist/lib/neurolink.d.ts +204 -0
- package/dist/lib/neurolink.js +296 -0
- package/dist/lib/types/index.d.ts +3 -1
- package/dist/lib/types/index.js +3 -2
- package/dist/lib/types/scorerTypes.d.ts +423 -0
- package/dist/lib/types/scorerTypes.js +6 -0
- package/dist/lib/utils/errorHandling.d.ts +20 -0
- package/dist/lib/utils/errorHandling.js +60 -0
- package/dist/neurolink.d.ts +204 -0
- package/dist/neurolink.js +296 -0
- package/dist/types/index.d.ts +3 -1
- package/dist/types/index.js +3 -2
- package/dist/types/scorerTypes.d.ts +423 -0
- package/dist/types/scorerTypes.js +5 -0
- package/dist/utils/errorHandling.d.ts +20 -0
- package/dist/utils/errorHandling.js +60 -0
- package/package.json +1 -1
|
@@ -0,0 +1,955 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* NeuroLink CLI Evaluate Command
|
|
4
|
+
*
|
|
5
|
+
* Evaluate AI responses using configured scorers and pipelines.
|
|
6
|
+
* Supports subcommands: run, score, report, presets, scorers (list-scorers)
|
|
7
|
+
*/
|
|
8
|
+
import chalk from "chalk";
|
|
9
|
+
import ora from "ora";
|
|
10
|
+
import fs from "node:fs";
|
|
11
|
+
import { EvaluationPipeline, getPreset, getPresetNames, PipelinePresets, } from "../../lib/evaluation/pipeline/index.js";
|
|
12
|
+
import { ScorerRegistry } from "../../lib/evaluation/scorers/index.js";
|
|
13
|
+
import { ReportGenerator, } from "../../lib/evaluation/reporting/reportGenerator.js";
|
|
14
|
+
import { logger } from "../../lib/utils/logger.js";
|
|
15
|
+
/**
 * Render a single scorer result as a console line.
 *
 * Produces " PASS <name>: <score>" (or FAIL in red); when `verbose` is set,
 * appends the scorer's reasoning and compute time on follow-up lines.
 *
 * @param {object} result - Scorer result ({ passed, scorerName, score, reasoning, computeTime }).
 * @param {boolean} verbose - Include reasoning and timing detail.
 * @returns {string} A (possibly multi-line) chalk-colored string.
 */
function formatScoreResult(result, verbose) {
    const statusLabel = result.passed ? chalk.green("PASS") : chalk.red("FAIL");
    const colorize = result.passed ? chalk.green : chalk.red;
    const lines = [
        ` ${statusLabel} ${chalk.cyan(result.scorerName)}: ${colorize(result.score.toFixed(2))}`,
    ];
    if (verbose) {
        lines.push(` ${chalk.gray(result.reasoning)}`);
        lines.push(` ${chalk.gray(`(${result.computeTime}ms)`)}`);
    }
    return lines.join("\n");
}
|
|
28
|
+
/**
 * Check whether `name` is a known pipeline preset key.
 *
 * Uses Reflect.has, which matches the semantics of the `in` operator exactly
 * (own properties plus the prototype chain).
 *
 * @param {string} name - Candidate preset name from the CLI.
 * @returns {boolean} True when PipelinePresets exposes that key.
 */
function isValidPreset(name) {
    return Reflect.has(PipelinePresets, name);
}
|
|
34
|
+
/**
 * Build the scorer input object from parsed CLI arguments.
 *
 * Context handling: an array is passed through untouched; a string is first
 * treated as a path — if the file exists and parses as a JSON array, that
 * array is used; if it parses as non-array JSON, the raw file text is wrapped
 * in a one-element array; on read/parse failure (or when no such file exists)
 * the string itself is wrapped in a one-element array. A falsy context yields
 * undefined.
 *
 * @param {object} argv - Parsed arguments ({ query, input, output, context, groundTruth }).
 * @returns {{query: string, response: string, context: string[]|undefined, groundTruth: *}}
 */
function createScorerInput(argv) {
    const resolveContext = (raw) => {
        if (!raw) {
            return undefined;
        }
        if (typeof raw !== "string") {
            // Already an array of context documents (yargs `type: "array"`).
            return raw;
        }
        if (!fs.existsSync(raw)) {
            return [raw];
        }
        try {
            const fileText = fs.readFileSync(raw, "utf-8");
            const decoded = JSON.parse(fileText);
            return Array.isArray(decoded) ? decoded : [fileText];
        }
        catch {
            // Unreadable or invalid JSON: fall back to the literal string.
            return [raw];
        }
    };
    return {
        query: argv.query ?? argv.input ?? "",
        response: argv.output ?? argv.input ?? "",
        context: resolveContext(argv.context),
        groundTruth: argv.groundTruth,
    };
}
|
|
68
|
+
/**
 * List-scorers subcommand - List all available scorers.
 *
 * Registers the built-in scorers, applies optional --category/--type filters,
 * then prints either raw JSON (--json) or a human-readable listing grouped by
 * category, with per-scorer detail when --detailed is given.
 */
const listScorersCommand = {
    command: "list-scorers",
    describe: "List all available scorers",
    builder: (yargs) => yargs
        .option("category", {
        type: "string",
        describe: "Filter by category (accuracy, relevancy, safety, quality, faithfulness)",
    })
        .option("type", {
        type: "string",
        describe: "Filter by type (llm, rule)",
        choices: ["llm", "rule"],
    })
        .option("detailed", {
        type: "boolean",
        describe: "Show detailed scorer information",
        default: false,
    })
        .option("json", {
        type: "boolean",
        describe: "Output as JSON",
        default: false,
    })
        .example("$0 evaluate list-scorers", "List all scorers")
        .example("$0 evaluate list-scorers --category safety", "List safety scorers")
        .example("$0 evaluate list-scorers --type rule --detailed", "List rule-based scorers with details"),
    handler: async (argv) => {
        const { category, type, json, detailed } = argv;
        // Ensure the registry is populated before listing.
        await ScorerRegistry.registerBuiltInScorers();
        let scorerList = ScorerRegistry.list();
        // Apply filters
        if (category) {
            scorerList = scorerList.filter((s) => s.category === category);
        }
        if (type) {
            scorerList = scorerList.filter((s) => s.type === type);
        }
        if (json) {
            // Machine-readable output: dump the (filtered) metadata verbatim.
            logger.always(JSON.stringify(scorerList, null, 2));
        }
        else {
            logger.always("");
            logger.always(chalk.bold("Available Scorers:"));
            logger.always(chalk.gray("-".repeat(60)));
            // Group by category
            const byCategory = new Map();
            for (const s of scorerList) {
                const cat = s.category;
                if (!byCategory.has(cat)) {
                    byCategory.set(cat, []);
                }
                const categoryList = byCategory.get(cat);
                if (categoryList) {
                    categoryList.push(s);
                }
            }
            // Print each category section; Map preserves first-seen order.
            for (const [cat, scorers] of byCategory) {
                logger.always("");
                logger.always(chalk.bold.underline(cat.toUpperCase()));
                for (const metadata of scorers) {
                    // Tag each scorer with its implementation kind (LLM-judged vs rule-based).
                    const typeIcon = metadata.type === "llm" ? "AI" : "Rule";
                    logger.always("");
                    logger.always(` ${chalk.cyan(metadata.id)} [${typeIcon}]`);
                    logger.always(` ${chalk.gray(metadata.description)}`);
                    if (detailed) {
                        // Show which scorer-input fields are mandatory vs optional.
                        logger.always(` Required: ${metadata.requiredInputs.join(", ") || "none"}`);
                        if (metadata.optionalInputs.length > 0) {
                            logger.always(` Optional: ${metadata.optionalInputs.join(", ")}`);
                        }
                    }
                }
            }
            logger.always("");
            logger.always(chalk.gray(`Total: ${scorerList.length} scorers`));
        }
    },
};
|
|
148
|
+
/**
 * Run-pipeline subcommand - Run evaluation using a predefined pipeline preset.
 *
 * Validates the preset name, optionally overrides the pass threshold, builds
 * the scorer input from --input/--query/--context, executes the pipeline, and
 * renders the result in text, table, or JSON form. Exits with status 1 on an
 * unknown preset or on pipeline failure.
 */
const runPipelineCommand = {
    command: "run-pipeline",
    describe: "Run evaluation using a predefined pipeline preset",
    builder: (yargs) => yargs
        .option("preset", {
        type: "string",
        describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
        alias: "p",
        demandOption: true,
    })
        .option("input", {
        type: "string",
        describe: "AI response text to evaluate",
        alias: "i",
        demandOption: true,
    })
        .option("query", {
        type: "string",
        describe: "Original user query",
        alias: "q",
    })
        .option("context", {
        type: "string",
        describe: "Path to context file (JSON format) or context string",
        alias: "c",
    })
        .option("threshold", {
        type: "number",
        describe: "Custom pass threshold (0-1)",
        alias: "t",
    })
        .option("format", {
        type: "string",
        describe: "Output format",
        choices: ["text", "json", "table"],
        default: "text",
    })
        .option("json", {
        type: "boolean",
        describe: "Output results as JSON (shorthand for --format json)",
        default: false,
    })
        .option("verbose", {
        type: "boolean",
        describe: "Show detailed reasoning and timing",
        alias: "v",
        default: false,
    })
        .example('$0 evaluate run-pipeline --preset quality --input "The capital of France is Paris."', "Run quality evaluation")
        .example('$0 evaluate run-pipeline --preset rag --input "Response" --query "Question" --context ./context.json', "Run RAG evaluation with context file"),
    handler: async (argv) => {
        const { preset, input, query, context, threshold, json, verbose, format } = argv;
        // --json wins over --format; suppress the spinner for machine output.
        const outputFormat = json ? "json" : format;
        const spinner = outputFormat === "json"
            ? null
            : ora(`Running ${preset} evaluation pipeline...`).start();
        try {
            if (!isValidPreset(preset)) {
                spinner?.fail(`Unknown pipeline preset: ${preset}`);
                logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
                process.exit(1);
            }
            // FIX: shallow-clone the preset before applying --threshold. The
            // previous code mutated the object returned by getPreset(), so a
            // custom threshold could leak into every later use of the shared
            // preset within the same process.
            const presetConfig = { ...getPreset(preset) };
            // Apply custom threshold if provided
            if (threshold !== undefined) {
                presetConfig.passThreshold = threshold;
            }
            const evaluationPipeline = new EvaluationPipeline(presetConfig);
            // NOTE: createScorerInput maps `input`->query and `output`->response,
            // so the user's --query becomes the scorer query and --input the response.
            const scorerInput = createScorerInput({
                input: query,
                output: input,
                context,
            });
            await evaluationPipeline.initialize();
            const result = await evaluationPipeline.execute(scorerInput);
            spinner?.stop();
            if (outputFormat === "json") {
                logger.always(JSON.stringify(result, null, 2));
            }
            else if (outputFormat === "table") {
                logger.always("");
                logger.always(chalk.bold(`Pipeline: ${preset}`));
                logger.always(chalk.gray("-".repeat(50)));
                // Table header
                logger.always(`${chalk.bold("Scorer".padEnd(25))} ${chalk.bold("Score".padEnd(10))} ${chalk.bold("Status")}`);
                logger.always(chalk.gray("-".repeat(50)));
                for (const score of result.scores) {
                    const status = score.passed ? chalk.green("PASS") : chalk.red("FAIL");
                    const scoreColor = score.passed ? chalk.green : chalk.red;
                    logger.always(`${score.scorerName.padEnd(25)} ${scoreColor(score.score.toFixed(2).padEnd(10))} ${status}`);
                }
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                logger.always(`${"Overall".padEnd(25)} ${overallColor(result.overallScore.toFixed(2).padEnd(10))} ${result.passed ? chalk.green("PASS") : chalk.red("FAIL")}`);
            }
            else {
                // Text format
                logger.always("");
                logger.always(chalk.bold(`Pipeline: ${preset} Evaluation Results`));
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                const overallIcon = result.passed ? "PASS" : "FAIL";
                logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
                logger.always("");
                logger.always(chalk.bold("Individual Scores:"));
                for (const score of result.scores) {
                    logger.always(formatScoreResult(score, verbose ?? false));
                }
                // Per-scorer execution errors are non-fatal; surface them as warnings.
                if (result.errors.length > 0) {
                    logger.always("");
                    logger.always(chalk.yellow("Errors:"));
                    for (const error of result.errors) {
                        logger.always(` ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
                    }
                }
                logger.always("");
                logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
            }
        }
        catch (error) {
            spinner?.fail("Pipeline evaluation failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
|
|
278
|
+
/**
 * Run subcommand - Execute evaluation pipeline (legacy support).
 *
 * Requires --input and --output. Pipeline selection, in priority order:
 * 1. --pipeline <preset>       (validated against the known presets)
 * 2. --scorer <id> [...]       (ad-hoc parallel pipeline from named scorers)
 * 3. fallback to the "quality" preset.
 * Prints JSON with --json, otherwise a human-readable summary. Exits with
 * status 1 on missing arguments, unknown preset, or pipeline failure.
 */
const runCommand = {
    command: "run",
    describe: "Run evaluation pipeline on a response",
    builder: (yargs) => yargs
        .option("input", {
        type: "string",
        describe: "Input query/question that was asked",
        alias: "i",
    })
        .option("output", {
        type: "string",
        describe: "Output/answer to evaluate",
        alias: "o",
    })
        .option("context", {
        type: "array",
        string: true,
        describe: "Context documents for RAG evaluation (can be used multiple times)",
        alias: "c",
    })
        .option("ground-truth", {
        type: "string",
        describe: "Expected/correct answer for accuracy evaluation",
        alias: "g",
    })
        .option("pipeline", {
        type: "string",
        describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
        alias: "p",
    })
        .option("scorer", {
        type: "array",
        string: true,
        describe: "Specific scorers to use (can be used multiple times)",
        alias: "s",
    })
        .option("json", {
        type: "boolean",
        describe: "Output results as JSON",
        default: false,
    })
        .option("verbose", {
        type: "boolean",
        describe: "Show detailed reasoning and timing",
        alias: "v",
        default: false,
    })
        .example('$0 evaluate run -i "What is the capital of France?" -o "Paris" -p quality', "Evaluate a response using the quality pipeline"),
    handler: async (argv) => {
        const { input, output, context, groundTruth, pipeline, scorer, json, verbose, } = argv;
        // NOTE(review): truthiness check means an explicitly empty --input/--output
        // string is also rejected — presumably intentional, verify if "" is valid.
        if (!input || !output) {
            logger.error(chalk.red("Error: Both --input and --output are required"));
            logger.always(chalk.gray("Use --help for usage information"));
            process.exit(1);
        }
        // No spinner in JSON mode so stdout stays machine-parseable.
        const spinner = json ? null : ora("Initializing evaluation...").start();
        try {
            const scorerInput = createScorerInput({
                input,
                output,
                context,
                groundTruth,
            });
            let evaluationPipeline;
            if (pipeline) {
                // Explicit preset takes precedence over --scorer.
                if (!isValidPreset(pipeline)) {
                    spinner?.fail(`Unknown pipeline preset: ${pipeline}`);
                    logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
                    process.exit(1);
                }
                const presetConfig = getPreset(pipeline);
                evaluationPipeline = new EvaluationPipeline(presetConfig);
            }
            else if (scorer && scorer.length > 0) {
                // Build an ad-hoc pipeline from the individually requested scorers.
                const pipelineConfig = {
                    name: "CLI Custom Pipeline",
                    description: "Custom pipeline from CLI scorer arguments",
                    scorers: scorer.map((s) => ({ id: s })),
                    executionMode: "parallel",
                };
                evaluationPipeline = new EvaluationPipeline(pipelineConfig);
            }
            else {
                // Default behavior: run the "quality" preset.
                const defaultPreset = getPreset("quality");
                evaluationPipeline = new EvaluationPipeline(defaultPreset);
            }
            if (spinner) {
                spinner.text = "Running evaluation...";
            }
            await evaluationPipeline.initialize();
            const result = await evaluationPipeline.execute(scorerInput);
            spinner?.stop();
            if (json) {
                logger.always(JSON.stringify(result, null, 2));
            }
            else {
                logger.always("");
                logger.always(chalk.bold("Evaluation Results"));
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                const overallIcon = result.passed ? "PASS" : "FAIL";
                logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
                logger.always("");
                logger.always(chalk.bold("Individual Scores:"));
                for (const score of result.scores) {
                    logger.always(formatScoreResult(score, verbose ?? false));
                }
                // Per-scorer failures are reported but do not abort the run.
                if (result.errors.length > 0) {
                    logger.always("");
                    logger.always(chalk.yellow("Errors:"));
                    for (const error of result.errors) {
                        logger.always(` ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
                    }
                }
                // Scorers skipped (e.g. unmet input requirements) only shown in verbose mode.
                if (result.skippedScorers.length > 0 && verbose) {
                    logger.always("");
                    logger.always(chalk.gray(`Skipped: ${result.skippedScorers.join(", ")}`));
                }
                logger.always("");
                logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
            }
        }
        catch (error) {
            spinner?.fail("Evaluation failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
|
|
411
|
+
/**
|
|
412
|
+
* Score subcommand - Score a single response with a specific scorer
|
|
413
|
+
*/
|
|
414
|
+
const scoreCommand = {
|
|
415
|
+
command: "score <scorer>",
|
|
416
|
+
describe: "Score a response using a single scorer",
|
|
417
|
+
builder: (yargs) => yargs
|
|
418
|
+
.positional("scorer", {
|
|
419
|
+
type: "string",
|
|
420
|
+
describe: "Scorer ID to use (e.g., hallucination, toxicity)",
|
|
421
|
+
demandOption: true,
|
|
422
|
+
})
|
|
423
|
+
.option("input", {
|
|
424
|
+
type: "string",
|
|
425
|
+
describe: "Input query/question that was asked",
|
|
426
|
+
alias: "i",
|
|
427
|
+
})
|
|
428
|
+
.option("output", {
|
|
429
|
+
type: "string",
|
|
430
|
+
describe: "Output/answer to evaluate",
|
|
431
|
+
alias: "o",
|
|
432
|
+
})
|
|
433
|
+
.option("context", {
|
|
434
|
+
type: "array",
|
|
435
|
+
string: true,
|
|
436
|
+
describe: "Context documents for evaluation",
|
|
437
|
+
alias: "c",
|
|
438
|
+
})
|
|
439
|
+
.option("ground-truth", {
|
|
440
|
+
type: "string",
|
|
441
|
+
describe: "Expected answer for comparison",
|
|
442
|
+
alias: "g",
|
|
443
|
+
})
|
|
444
|
+
.option("json", {
|
|
445
|
+
type: "boolean",
|
|
446
|
+
describe: "Output results as JSON",
|
|
447
|
+
default: false,
|
|
448
|
+
})
|
|
449
|
+
.option("verbose", {
|
|
450
|
+
type: "boolean",
|
|
451
|
+
describe: "Show detailed output",
|
|
452
|
+
alias: "v",
|
|
453
|
+
default: false,
|
|
454
|
+
})
|
|
455
|
+
.example('$0 evaluate score toxicity -o "This is a test response"', "Score a response for toxicity")
|
|
456
|
+
.example('$0 evaluate score hallucination -i "What is 2+2?" -o "2+2 equals 4" --json', "Score for hallucinations and output JSON"),
|
|
457
|
+
handler: async (argv) => {
|
|
458
|
+
const { scorer, input, output, context, groundTruth, json, verbose } = argv;
|
|
459
|
+
if (!output) {
|
|
460
|
+
logger.error(chalk.red("Error: --output is required"));
|
|
461
|
+
logger.always(chalk.gray("Use --help for usage information"));
|
|
462
|
+
process.exit(1);
|
|
463
|
+
}
|
|
464
|
+
const spinnerInstance = json
|
|
465
|
+
? null
|
|
466
|
+
: ora(`Loading scorer: ${scorer}...`).start();
|
|
467
|
+
try {
|
|
468
|
+
await ScorerRegistry.registerBuiltInScorers();
|
|
469
|
+
const scorerInstance = await ScorerRegistry.getScorer(scorer);
|
|
470
|
+
if (!scorerInstance) {
|
|
471
|
+
spinnerInstance?.fail(`Scorer not found: ${scorer}`);
|
|
472
|
+
const available = ScorerRegistry.list().map((s) => s.id);
|
|
473
|
+
logger.always(chalk.gray(`Available scorers: ${available.join(", ")}`));
|
|
474
|
+
process.exit(1);
|
|
475
|
+
}
|
|
476
|
+
if (spinnerInstance) {
|
|
477
|
+
spinnerInstance.text = "Running scorer...";
|
|
478
|
+
}
|
|
479
|
+
const scorerInput = createScorerInput({
|
|
480
|
+
input: input ?? "",
|
|
481
|
+
output,
|
|
482
|
+
context,
|
|
483
|
+
groundTruth,
|
|
484
|
+
});
|
|
485
|
+
const validation = scorerInstance.validateInput(scorerInput);
|
|
486
|
+
if (!validation.valid) {
|
|
487
|
+
spinnerInstance?.fail("Input validation failed");
|
|
488
|
+
for (const err of validation.errors) {
|
|
489
|
+
logger.always(chalk.red(` - ${err}`));
|
|
490
|
+
}
|
|
491
|
+
process.exit(1);
|
|
492
|
+
}
|
|
493
|
+
const result = await scorerInstance.score(scorerInput);
|
|
494
|
+
spinnerInstance?.stop();
|
|
495
|
+
if (json) {
|
|
496
|
+
logger.always(JSON.stringify(result, null, 2));
|
|
497
|
+
}
|
|
498
|
+
else {
|
|
499
|
+
logger.always("");
|
|
500
|
+
logger.always(chalk.bold(`${result.scorerName} Score: ${result.score.toFixed(2)}/10`));
|
|
501
|
+
logger.always(result.passed
|
|
502
|
+
? chalk.green(" Status: PASSED")
|
|
503
|
+
: chalk.red(" Status: FAILED"));
|
|
504
|
+
logger.always(` Threshold: ${result.threshold}`);
|
|
505
|
+
logger.always(` Time: ${result.computeTime}ms`);
|
|
506
|
+
if (verbose || !result.passed) {
|
|
507
|
+
logger.always("");
|
|
508
|
+
logger.always(chalk.gray("Reasoning:"));
|
|
509
|
+
logger.always(chalk.gray(` ${result.reasoning}`));
|
|
510
|
+
}
|
|
511
|
+
if (result.confidence !== undefined) {
|
|
512
|
+
logger.always("");
|
|
513
|
+
logger.always(chalk.gray(`Confidence: ${(result.confidence * 100).toFixed(1)}%`));
|
|
514
|
+
}
|
|
515
|
+
if (verbose && result.metadata) {
|
|
516
|
+
logger.always("");
|
|
517
|
+
logger.always(chalk.gray("Metadata:"));
|
|
518
|
+
logger.always(chalk.gray(JSON.stringify(result.metadata, null, 2)));
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
}
|
|
522
|
+
catch (error) {
|
|
523
|
+
spinnerInstance?.fail("Scoring failed");
|
|
524
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
525
|
+
logger.error(chalk.red(`Error: ${errorMessage}`));
|
|
526
|
+
process.exit(1);
|
|
527
|
+
}
|
|
528
|
+
},
|
|
529
|
+
};
|
|
530
|
+
/**
 * Report subcommand - Generate evaluation report
 *
 * Evaluates an input/output pair through a pipeline (a named preset via
 * --pipeline, an explicit --scorer list, or the "quality" preset as the
 * default) and renders the result with ReportGenerator as text, json,
 * markdown, or html — printed to stdout or saved via --output-file.
 */
const reportCommand = {
    command: "report",
    describe: "Generate an evaluation report",
    builder: (yargs) => yargs
        .option("input", {
        type: "string",
        describe: "Input query/question that was asked",
        alias: "i",
    })
        .option("output", {
        type: "string",
        describe: "Output/answer to evaluate",
        alias: "o",
    })
        .option("context", {
        type: "array",
        string: true,
        describe: "Context documents for evaluation",
        alias: "c",
    })
        .option("ground-truth", {
        type: "string",
        describe: "Expected answer for comparison",
        alias: "g",
    })
        .option("pipeline", {
        type: "string",
        describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
        alias: "p",
    })
        .option("scorer", {
        type: "array",
        string: true,
        describe: "Specific scorers to use",
        alias: "s",
    })
        .option("format", {
        type: "string",
        describe: "Report format (text, json, markdown, html)",
        choices: ["text", "json", "markdown", "html"],
        default: "text",
    })
        .option("output-file", {
        type: "string",
        describe: "Save report to file",
        alias: "f",
    })
        .option("verbose", {
        type: "boolean",
        describe: "Include detailed information in report",
        alias: "v",
        default: true,
    })
        .example('$0 evaluate report -i "Question" -o "Answer" -p quality --format markdown', "Generate markdown report")
        .example('$0 evaluate report -i "Question" -o "Answer" -p rag --format html -f report.html', "Generate HTML report and save to file"),
    handler: async (argv) => {
        const { input, output, context, groundTruth, pipeline, scorer, format, outputFile, verbose, } = argv;
        // Both sides of the evaluation are mandatory for a report.
        if (!input || !output) {
            logger.error(chalk.red("Error: Both --input and --output are required"));
            logger.always(chalk.gray("Use --help for usage information"));
            process.exit(1);
        }
        const spinnerInstance = ora("Running evaluation...").start();
        try {
            // FIX: previously an unknown --pipeline value was silently ignored
            // (falling back to --scorer or the "quality" preset) while the
            // report title still named the requested preset. Surface it.
            if (pipeline && !isValidPreset(pipeline)) {
                spinnerInstance.warn(chalk.yellow(`Unknown pipeline preset: ${pipeline}. Available: ${getPresetNames().join(", ")}. Falling back.`));
                spinnerInstance.start("Running evaluation...");
            }
            const scorerInput = createScorerInput({
                input,
                output,
                context,
                groundTruth,
            });
            // Pipeline selection priority: valid preset > explicit scorer list
            // > "quality" default.
            let evaluationPipeline;
            if (pipeline && isValidPreset(pipeline)) {
                evaluationPipeline = new EvaluationPipeline(getPreset(pipeline));
            }
            else if (scorer && scorer.length > 0) {
                const pipelineConfig = {
                    name: "CLI Custom Pipeline",
                    scorers: scorer.map((s) => ({ id: s })),
                    executionMode: "parallel",
                };
                evaluationPipeline = new EvaluationPipeline(pipelineConfig);
            }
            else {
                evaluationPipeline = new EvaluationPipeline(getPreset("quality"));
            }
            await evaluationPipeline.initialize();
            const result = await evaluationPipeline.execute(scorerInput);
            spinnerInstance.text = "Generating report...";
            const reportData = {
                title: `Evaluation Report - ${pipeline ?? "Custom Pipeline"}`,
                timestamp: Date.now(),
                result,
                customSections: [
                    {
                        title: "Input",
                        content: { query: input, responseLength: output.length },
                    },
                ],
            };
            // Belt-and-braces: yargs `choices` already restricts --format, but
            // guard again before handing it to the generator.
            const validFormats = ["text", "json", "markdown", "html"];
            const reportFormat = validFormats.includes(format)
                ? format
                : "text";
            const generator = new ReportGenerator({
                format: reportFormat,
                includeReasoning: verbose ?? true,
                includeMetadata: verbose ?? true,
                includeTiming: true,
            });
            const report = generator.generate(reportData);
            spinnerInstance.stop();
            if (outputFile) {
                // Lazy import keeps fs out of the hot path when printing to stdout.
                const fsPromises = await import("node:fs/promises");
                await fsPromises.writeFile(outputFile, report.content, "utf-8");
                logger.always(chalk.green(`Report saved to: ${outputFile}`));
            }
            else {
                logger.always(report.content);
            }
        }
        catch (error) {
            spinnerInstance.fail("Report generation failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
|
|
661
|
+
/**
 * Presets subcommand - List available pipeline presets
 *
 * Without a positional argument, prints a summary of every preset; with
 * one, prints that preset's full configuration. --json switches either
 * view to raw JSON output.
 */
const presetsCommand = {
    command: "presets [preset]",
    describe: "List available pipeline presets or show details of a specific preset",
    builder: (yargs) => yargs
        .positional("preset", {
        type: "string",
        describe: "Specific preset to show details for",
    })
        .option("json", {
        type: "boolean",
        describe: "Output as JSON",
        default: false,
    })
        .example("$0 evaluate presets", "List all available presets")
        .example("$0 evaluate presets rag", "Show details of the RAG preset"),
    handler: async (argv) => {
        const { preset, json } = argv;
        // Detail view for one named preset.
        const showPresetDetails = (name) => {
            const config = getPreset(name);
            if (json) {
                logger.always(JSON.stringify(config, null, 2));
                return;
            }
            logger.always("");
            logger.always(chalk.bold(`Preset: ${chalk.cyan(name)}`));
            logger.always(chalk.gray("-".repeat(50)));
            if (config.description) {
                logger.always(`Description: ${config.description}`);
            }
            logger.always(`Pass Threshold: ${config.passThreshold ?? 0.7}`);
            logger.always(`Execution Mode: ${config.executionMode ?? "parallel"}`);
            logger.always("");
            logger.always(chalk.bold("Scorers:"));
            for (const entry of config.scorers) {
                const weight = entry.config?.weight ?? 1.0;
                const threshold = entry.config?.threshold ?? "default";
                logger.always(` - ${chalk.cyan(entry.id)} (weight: ${weight}, threshold: ${threshold})`);
            }
            if (config.requiredScorers && config.requiredScorers.length > 0) {
                logger.always("");
                logger.always(chalk.bold("Required Scorers: ") +
                    config.requiredScorers.join(", "));
            }
            if (config.aggregation) {
                logger.always("");
                logger.always(chalk.bold("Aggregation: ") + config.aggregation.method);
            }
        };
        // Summary view covering every registered preset.
        const listAllPresets = () => {
            const names = getPresetNames();
            if (json) {
                const presetData = Object.fromEntries(names.filter(isValidPreset).map((p) => [p, getPreset(p)]));
                logger.always(JSON.stringify(presetData, null, 2));
                return;
            }
            logger.always("");
            logger.always(chalk.bold("Available Pipeline Presets:"));
            logger.always(chalk.gray("-".repeat(50)));
            for (const name of names) {
                if (!isValidPreset(name)) {
                    continue;
                }
                const config = getPreset(name);
                logger.always("");
                logger.always(` ${chalk.cyan(name)}`);
                if (config.description) {
                    logger.always(` ${chalk.gray(config.description)}`);
                }
                logger.always(` Scorers: ${config.scorers.map((s) => s.id).join(", ")}`);
            }
            logger.always("");
            logger.always(chalk.gray('Use "neurolink evaluate presets <name>" for more details'));
        };
        if (!preset) {
            listAllPresets();
            return;
        }
        // Reject unknown preset names up front with the list of valid ones.
        if (!isValidPreset(preset)) {
            logger.error(chalk.red(`Unknown preset: ${preset}`));
            logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
            process.exit(1);
        }
        showPresetDetails(preset);
    },
};
|
|
747
|
+
/**
 * Main evaluate command with subcommands
 *
 * Top-level `evaluate [subcommand]` yargs command. Registers the six
 * subcommands and also acts as a direct evaluator itself when invoked
 * with --input: it builds a pipeline from --scorers (or the "quality"
 * preset), runs it, and prints results as text, table, or JSON.
 */
export const evaluateCommand = {
    command: "evaluate [subcommand]",
    describe: "Evaluate AI responses using RAGAS-style scorers and pipelines",
    builder: (yargs) => yargs
        .command(listScorersCommand)
        .command(runPipelineCommand)
        .command(runCommand)
        .command(scoreCommand)
        .command(reportCommand)
        .command(presetsCommand)
        .option("input", {
        type: "string",
        describe: "AI response text to evaluate",
        alias: "i",
    })
        .option("query", {
        type: "string",
        describe: "Original user query",
        alias: "q",
    })
        .option("scorers", {
        type: "array",
        string: true,
        describe: "List of scorers to use for evaluation",
        alias: "s",
    })
        .option("context", {
        type: "string",
        describe: "Path to context file (JSON format)",
        alias: "c",
    })
        .option("threshold", {
        type: "number",
        describe: "Minimum score threshold for passing (0-1)",
        alias: "t",
    })
        .option("format", {
        type: "string",
        describe: "Output format",
        choices: ["text", "json", "table"],
        default: "text",
    })
        .option("json", {
        type: "boolean",
        describe: "Output results as JSON (shorthand for --format json)",
        default: false,
    })
        .option("verbose", {
        type: "boolean",
        describe: "Show detailed reasoning and timing",
        alias: "v",
        default: false,
    })
        .example('$0 evaluate --input "Response text" --query "User question" --scorers hallucination toxicity', "Evaluate with specific scorers")
        .example('$0 evaluate --input "Response" --query "Query" --context ./context.json --format json', "Evaluate with context file and JSON output")
        .example("$0 evaluate list-scorers", "List all available scorers")
        .example('$0 evaluate run-pipeline --preset quality --input "Response"', "Run quality pipeline evaluation"),
    handler: async (argv) => {
        const { input, query, scorers, context, threshold, json, verbose, format } = argv;
        // If no input provided and no subcommand executed, show help
        if (!input) {
            return;
        }
        // --json is a shorthand that overrides --format.
        const outputFormat = json ? "json" : format;
        // No spinner in JSON mode so stdout stays machine-parseable.
        const spinner = outputFormat === "json" ? null : ora("Running evaluation...").start();
        try {
            // Load context if provided
            // Resolution order: existing file parsed as JSON (array kept as-is,
            // any other JSON shape falls back to the raw file text as a single
            // element); unreadable/invalid JSON or a non-existent path treats
            // the --context value itself as a literal context string.
            let contextArray;
            if (context) {
                if (fs.existsSync(context)) {
                    try {
                        const content = fs.readFileSync(context, "utf-8");
                        const parsed = JSON.parse(content);
                        contextArray = Array.isArray(parsed) ? parsed : [content];
                    }
                    catch {
                        contextArray = [context];
                    }
                }
                else {
                    contextArray = [context];
                }
            }
            // NOTE(review): this handler builds the input literally as
            // { query, response, context } rather than via createScorerInput()
            // used by the sibling subcommands — presumably the same shape;
            // confirm against EvaluationPipeline.execute.
            const scorerInput = {
                query: query ?? "",
                response: input,
                context: contextArray,
            };
            // --scorers wins over the default "quality" preset; --threshold
            // applies to whichever pipeline is chosen (default 0.7).
            let evaluationPipeline;
            if (scorers && scorers.length > 0) {
                const pipelineConfig = {
                    name: "CLI Custom Pipeline",
                    description: "Custom pipeline from CLI scorer arguments",
                    scorers: scorers.map((s) => ({ id: s })),
                    executionMode: "parallel",
                    passThreshold: threshold ?? 0.7,
                };
                evaluationPipeline = new EvaluationPipeline(pipelineConfig);
            }
            else {
                const defaultPreset = getPreset("quality");
                if (threshold !== undefined) {
                    defaultPreset.passThreshold = threshold;
                }
                evaluationPipeline = new EvaluationPipeline(defaultPreset);
            }
            await evaluationPipeline.initialize();
            const result = await evaluationPipeline.execute(scorerInput);
            spinner?.stop();
            if (outputFormat === "json") {
                // Raw result object, pretty-printed.
                logger.always(JSON.stringify(result, null, 2));
            }
            else if (outputFormat === "table") {
                // Fixed-width table: scorer name (25), score (10), status.
                logger.always("");
                logger.always(chalk.bold("Evaluation Results"));
                logger.always(chalk.gray("-".repeat(50)));
                logger.always(`${chalk.bold("Scorer".padEnd(25))} ${chalk.bold("Score".padEnd(10))} ${chalk.bold("Status")}`);
                logger.always(chalk.gray("-".repeat(50)));
                for (const score of result.scores) {
                    const status = score.passed ? chalk.green("PASS") : chalk.red("FAIL");
                    const scoreColor = score.passed ? chalk.green : chalk.red;
                    logger.always(`${score.scorerName.padEnd(25)} ${scoreColor(score.score.toFixed(2).padEnd(10))} ${status}`);
                }
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                logger.always(`${"Overall".padEnd(25)} ${overallColor(result.overallScore.toFixed(2).padEnd(10))} ${result.passed ? chalk.green("PASS") : chalk.red("FAIL")}`);
                logger.always("");
                logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
            }
            else {
                // Default text format: overall verdict, per-scorer details, and
                // any per-scorer errors collected by the pipeline.
                logger.always("");
                logger.always(chalk.bold("Evaluation Results"));
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                const overallIcon = result.passed ? "PASS" : "FAIL";
                logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
                logger.always("");
                logger.always(chalk.bold("Individual Scores:"));
                for (const score of result.scores) {
                    logger.always(formatScoreResult(score, verbose ?? false));
                }
                if (result.errors.length > 0) {
                    logger.always("");
                    logger.always(chalk.yellow("Errors:"));
                    for (const error of result.errors) {
                        logger.always(` ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
                    }
                }
                logger.always("");
                logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
            }
        }
        catch (error) {
            // Any pipeline/scorer failure ends the process with a non-zero code.
            spinner?.fail("Evaluation failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
|
|
910
|
+
/**
 * Create evaluate command factory for CLICommandFactory
 *
 * Thin static facade: exposes the evaluate command module and two
 * convenience printers for the scorer registry and pipeline presets.
 */
export class EvaluateCommandFactory {
    /**
     * Create the evaluate command module
     */
    static createEvaluateCommand() {
        return evaluateCommand;
    }
    /**
     * List available scorers (utility method)
     */
    static async listScorers() {
        // Built-in scorers must be registered before the registry can list them.
        await ScorerRegistry.registerBuiltInScorers();
        logger.always(chalk.bold("Available Scorers:"));
        logger.always("");
        for (const scorerMeta of ScorerRegistry.list()) {
            logger.always(` ${chalk.cyan(scorerMeta.id)}`);
            logger.always(` ${chalk.gray(scorerMeta.description)}`);
            logger.always(` Type: ${scorerMeta.type}, Category: ${scorerMeta.category}`);
            logger.always("");
        }
    }
    /**
     * List available pipeline presets (utility method)
     */
    static listPipelines() {
        logger.always(chalk.bold("Available Pipeline Presets:"));
        logger.always("");
        for (const presetName of getPresetNames()) {
            // Skip any registry entry that is not a recognized preset.
            if (!isValidPreset(presetName)) {
                continue;
            }
            const config = getPreset(presetName);
            logger.always(` ${chalk.cyan(presetName)}`);
            if (config.description) {
                logger.always(` ${chalk.gray(config.description)}`);
            }
            logger.always(` Scorers: ${config.scorers.map((s) => s.id).join(", ")}`);
            logger.always("");
        }
    }
}
|
|
955
|
+
//# sourceMappingURL=evaluate.js.map
|